scraper-0.18.1/.cargo_vcs_info.json0000644000000001360000000000100125620ustar { "git": { "sha1": "90693404c2eb3bf6c534989b00847d34c9d67fa2" }, "path_in_vcs": "" }scraper-0.18.1/.editorconfig000064400000000000000000000003300072674642500140530ustar 00000000000000# EditorConfig root = true [*] end_of_line = lf charset = utf-8 trim_trailing_whitespace = true insert_final_newline = true indent_style = space indent_size = 4 [*.md] trim_trailing_whitespace = false scraper-0.18.1/.github/dependabot.yml000064400000000000000000000006730072674642500156000ustar 00000000000000# To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. # Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "cargo" directory: "/" schedule: interval: "weekly" scraper-0.18.1/.github/workflows/test.yml000064400000000000000000000021420072674642500165000ustar 00000000000000name: Tests on: pull_request: push: branches: - "master" jobs: format: name: Format code runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: components: rustfmt - run: cargo fmt -- --check clippy: name: Clippy check runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: components: clippy - uses: Swatinem/rust-cache@v2 - run: cargo clippy --all-targets --all-features -- --deny warnings test: name: Test code runs-on: ubuntu-latest strategy: matrix: rust_version: [stable, beta, nightly] fail-fast: false steps: - uses: actions/checkout@v3 - uses: dtolnay/rust-toolchain@master with: toolchain: ${{matrix.rust_version}} - uses: Swatinem/rust-cache@v2 with: key: ${{matrix.rust_version}} - run: cargo update - run: cargo test --all-features 
scraper-0.18.1/.gitignore000064400000000000000000000000250072674642500133670ustar 00000000000000target *.bk .idea scraper-0.18.1/Cargo.lock0000644000000407120000000000100105410ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "ahash" version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" dependencies = [ "cfg-if", "getrandom", "once_cell", "version_check", ] [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" [[package]] name = "byteorder" version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "cssparser" version = "0.31.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" dependencies = [ "cssparser-macros", "dtoa-short", "itoa", "phf 0.11.2", "smallvec", ] [[package]] name = "cssparser-macros" version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" dependencies = [ 
"quote", "syn 2.0.37", ] [[package]] name = "derive_more" version = "0.99.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" dependencies = [ "proc-macro2", "quote", "syn 1.0.109", ] [[package]] name = "dtoa" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" [[package]] name = "dtoa-short" version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbaceec3c6e4211c79e7b1800fb9680527106beb2f9c51904a3210c03a448c74" dependencies = [ "dtoa", ] [[package]] name = "ego-tree" version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591" [[package]] name = "equivalent" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "futf" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" dependencies = [ "mac", "new_debug_unreachable", ] [[package]] name = "fxhash" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" dependencies = [ "byteorder", ] [[package]] name = "getopts" version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" dependencies = [ "unicode-width", ] [[package]] name = "getrandom" version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ "cfg-if", "libc", "wasi", ] 
[[package]] name = "hashbrown" version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dfda62a12f55daeae5015f81b0baea145391cb4520f86c248fc615d72640d12" [[package]] name = "html5ever" version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" dependencies = [ "log", "mac", "markup5ever", "proc-macro2", "quote", "syn 1.0.109", ] [[package]] name = "indexmap" version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8adf3ddd720272c6ea8bf59463c04e0f93d0bbf7c5439b691bca2987e0270897" dependencies = [ "equivalent", "hashbrown", ] [[package]] name = "itoa" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" [[package]] name = "libc" version = "0.2.148" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" [[package]] name = "lock_api" version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" dependencies = [ "autocfg", "scopeguard", ] [[package]] name = "log" version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "mac" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "markup5ever" version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" dependencies = [ "log", "phf 0.10.1", "phf_codegen", "string_cache", "string_cache_codegen", "tendril", ] 
[[package]] name = "new_debug_unreachable" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" [[package]] name = "once_cell" version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "parking_lot" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", "parking_lot_core", ] [[package]] name = "parking_lot_core" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", "windows-targets", ] [[package]] name = "phf" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" dependencies = [ "phf_shared 0.10.0", ] [[package]] name = "phf" version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" dependencies = [ "phf_macros", "phf_shared 0.11.2", ] [[package]] name = "phf_codegen" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" dependencies = [ "phf_generator 0.10.0", "phf_shared 0.10.0", ] [[package]] name = "phf_generator" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" dependencies = [ "phf_shared 0.10.0", "rand", ] [[package]] name = "phf_generator" version = "0.11.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" dependencies = [ "phf_shared 0.11.2", "rand", ] [[package]] name = "phf_macros" version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b" dependencies = [ "phf_generator 0.11.2", "phf_shared 0.11.2", "proc-macro2", "quote", "syn 2.0.37", ] [[package]] name = "phf_shared" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" dependencies = [ "siphasher", ] [[package]] name = "phf_shared" version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" dependencies = [ "siphasher", ] [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "precomputed-hash" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" [[package]] name = "proc-macro2" version = "1.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" dependencies = [ "proc-macro2", ] [[package]] name = "rand" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", 
"rand_chacha", "rand_core", ] [[package]] name = "rand_chacha" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", "rand_core", ] [[package]] name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom", ] [[package]] name = "redox_syscall" version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" dependencies = [ "bitflags 1.3.2", ] [[package]] name = "scopeguard" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scraper" version = "0.18.1" dependencies = [ "ahash", "cssparser", "ego-tree", "getopts", "html5ever", "indexmap", "once_cell", "selectors", "tendril", ] [[package]] name = "selectors" version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" dependencies = [ "bitflags 2.4.0", "cssparser", "derive_more", "fxhash", "log", "new_debug_unreachable", "phf 0.10.1", "phf_codegen", "precomputed-hash", "servo_arc", "smallvec", ] [[package]] name = "serde" version = "1.0.188" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.188" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" dependencies = [ "proc-macro2", "quote", "syn 2.0.37", ] [[package]] name = "servo_arc" version = "0.3.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" dependencies = [ "stable_deref_trait", ] [[package]] name = "siphasher" version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" [[package]] name = "smallvec" version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" [[package]] name = "stable_deref_trait" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "string_cache" version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" dependencies = [ "new_debug_unreachable", "once_cell", "parking_lot", "phf_shared 0.10.0", "precomputed-hash", "serde", ] [[package]] name = "string_cache_codegen" version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" dependencies = [ "phf_generator 0.10.0", "phf_shared 0.10.0", "proc-macro2", "quote", ] [[package]] name = "syn" version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "syn" version = "2.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "tendril" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" dependencies = [ "futf", "mac", "utf-8", ] [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-width" version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" [[package]] name = "utf-8" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "windows-targets" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_i686_gnu" version = "0.48.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" scraper-0.18.1/Cargo.toml0000644000000026110000000000100105600ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. 
[package] edition = "2021" name = "scraper" version = "0.18.1" authors = ["June McEnroe "] description = "HTML parsing and querying with CSS selectors" readme = "README.md" keywords = [ "html", "css", "selector", "scraping", ] license = "ISC" repository = "https://github.com/causal-agent/scraper" [[bin]] name = "scraper" path = "src/main.rs" required-features = ["main"] [dependencies.ahash] version = "0.8" [dependencies.cssparser] version = "0.31.0" [dependencies.ego-tree] version = "0.6.2" [dependencies.getopts] version = "0.2.21" optional = true [dependencies.html5ever] version = "0.26" [dependencies.indexmap] version = "2.0.2" optional = true [dependencies.once_cell] version = "1.0" [dependencies.selectors] version = "0.25.0" [dependencies.tendril] version = "0.4.3" [features] atomic = [] default = [ "main", "errors", ] deterministic = ["indexmap"] errors = [] main = ["getopts"] scraper-0.18.1/Cargo.toml.orig000064400000000000000000000014410072674642500142710ustar 00000000000000[package] name = "scraper" version = "0.18.1" edition = "2021" description = "HTML parsing and querying with CSS selectors" keywords = ["html", "css", "selector", "scraping"] authors = ["June McEnroe "] license = "ISC" repository = "https://github.com/causal-agent/scraper" readme = "README.md" [dependencies] cssparser = "0.31.0" ego-tree = "0.6.2" html5ever = "0.26" selectors = "0.25.0" tendril = "0.4.3" ahash = "0.8" indexmap = { version = "2.0.2", optional = true } once_cell = "1.0" [dependencies.getopts] version = "0.2.21" optional = true [features] default = ["main", "errors"] deterministic = ["indexmap"] main = ["getopts"] atomic = [] errors = [] [[bin]] name = "scraper" path = "src/main.rs" required-features = ["main"] scraper-0.18.1/LICENSE000064400000000000000000000014660072674642500124160ustar 00000000000000Copyright © 2016, June McEnroe Copyright © 2017, Vivek Kushwaha Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby 
granted, provided that the above copyright notice and this permission notice appear in all copies. THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. scraper-0.18.1/README.md000064400000000000000000000074050072674642500126670ustar 00000000000000# scraper [![crates.io](https://img.shields.io/crates/v/scraper?color=dark-green)][crate] [![downloads](https://img.shields.io/crates/d/scraper)][crate] [![test](https://github.com/causal-agent/scraper/actions/workflows/test.yml/badge.svg)][tests] HTML parsing and querying with CSS selectors. `scraper` is on [Crates.io][crate] and [GitHub][github]. [crate]: https://crates.io/crates/scraper [github]: https://github.com/causal-agent/scraper [tests]: https://github.com/causal-agent/scraper/actions/workflows/test.yml Scraper provides an interface to Servo's `html5ever` and `selectors` crates, for browser-grade parsing and querying. ## Examples ### Parsing a document ```rust use scraper::Html; let html = r#" Hello, world!

Hello, world!

"#; let document = Html::parse_document(html); ``` ### Parsing a fragment ```rust use scraper::Html; let fragment = Html::parse_fragment("

<h1>Hello, <i>world!</i></h1>

"); ``` ### Parsing a selector ```rust use scraper::Selector; let selector = Selector::parse("h1.foo").unwrap(); ``` ### Selecting elements ```rust use scraper::{Html, Selector}; let html = r#"
<ul>
    <li>Foo</li>
    <li>Bar</li>
    <li>Baz</li>
</ul>
"#; let fragment = Html::parse_fragment(html); let selector = Selector::parse("li").unwrap(); for element in fragment.select(&selector) { assert_eq!("li", element.value().name()); } ``` ### Selecting descendent elements ```rust use scraper::{Html, Selector}; let html = r#"
<ul>
    <li>Foo</li>
    <li>Bar</li>
    <li>Baz</li>
</ul>
"#; let fragment = Html::parse_fragment(html); let ul_selector = Selector::parse("ul").unwrap(); let li_selector = Selector::parse("li").unwrap(); let ul = fragment.select(&ul_selector).next().unwrap(); for element in ul.select(&li_selector) { assert_eq!("li", element.value().name()); } ``` ### Accessing element attributes ```rust use scraper::{Html, Selector}; let fragment = Html::parse_fragment(r#""#); let selector = Selector::parse(r#"input[name="foo"]"#).unwrap(); let input = fragment.select(&selector).next().unwrap(); assert_eq!(Some("bar"), input.value().attr("value")); ``` ### Serializing HTML and inner HTML ```rust use scraper::{Html, Selector}; let fragment = Html::parse_fragment("

<h1>Hello, <i>world!</i></h1>"); let selector = Selector::parse("h1").unwrap(); let h1 = fragment.select(&selector).next().unwrap(); assert_eq!("<h1>Hello, <i>world!</i></h1>", h1.html()); assert_eq!("Hello, <i>world!</i>", h1.inner_html()); ``` ### Accessing descendent text ```rust use scraper::{Html, Selector}; let fragment = Html::parse_fragment("

<h1>Hello, <i>world!</i></h1>"); let selector = Selector::parse("h1").unwrap(); let h1 = fragment.select(&selector).next().unwrap(); let text = h1.text().collect::<Vec<_>>(); assert_eq!(vec!["Hello, ", "world!"], text); ``` ### Manipulating the DOM ```rust use html5ever::tree_builder::TreeSink; use scraper::{Html, Selector}; let html = "hello

REMOVE ME

"; let selector = Selector::parse(".hello").unwrap(); let mut document = Html::parse_document(html); let node_ids: Vec<_> = document.select(&selector).map(|x| x.id()).collect(); for id in node_ids { document.remove_from_parent(&id); } assert_eq!(document.html(), "hello"); ``` ## Contributing Please feel free to open pull requests. If you're planning on implementing something big (i.e. not fixing a typo, a small bug fix, minor refactor, etc) then please open an issue first. scraper-0.18.1/examples/document.rs000064400000000000000000000013270072674642500154070ustar 00000000000000extern crate scraper; use std::io::{self, Read, Write}; use scraper::{Html, Selector}; fn main() { let mut input = String::new(); let mut stdout = io::stdout(); let mut stdin = io::stdin(); write!(stdout, "CSS selector: ").unwrap(); stdout.flush().unwrap(); stdin.read_line(&mut input).unwrap(); let selector = Selector::parse(&input).unwrap(); writeln!(stdout, "HTML document:").unwrap(); stdout.flush().unwrap(); input.clear(); stdin.read_to_string(&mut input).unwrap(); let document = Html::parse_document(&input); println!("{:#?}", document); for node in document.select(&selector) { println!("{:?}", node.value()); } } scraper-0.18.1/examples/fragment.rs000064400000000000000000000013270072674642500153740ustar 00000000000000extern crate scraper; use std::io::{self, Read, Write}; use scraper::{Html, Selector}; fn main() { let mut input = String::new(); let mut stdout = io::stdout(); let mut stdin = io::stdin(); write!(stdout, "CSS selector: ").unwrap(); stdout.flush().unwrap(); stdin.read_line(&mut input).unwrap(); let selector = Selector::parse(&input).unwrap(); writeln!(stdout, "HTML fragment:").unwrap(); stdout.flush().unwrap(); input.clear(); stdin.read_to_string(&mut input).unwrap(); let fragment = Html::parse_fragment(&input); println!("{:#?}", fragment); for node in fragment.select(&selector) { println!("{:?}", node.value()); } } 
scraper-0.18.1/scraper.1000064400000000000000000000025500072674642500131250ustar 00000000000000.Dd October 29, 2018 .Dt SCRAPER 1 .Os . .Sh NAME .Nm scraper .Nd HTML querying with CSS selectors . .Sh SYNOPSIS .Nm .Op Fl HIcint .Op Fl a Ar attr .Op Fl d | f .Ar selector .Op Ar . .Sh DESCRIPTION The .Nm utility parses HTML and outputs elements matching CSS selectors. . .Pp The arguments are as follows: .Bl -tag -width Ds . .It Fl H , Fl \-html Output the HTML of the matching elements. This is the default. . .It Fl I , Fl \-inner\-html Output the inner HTML of the matching elements. . .It Fl a Ar attr , Fl \-attr Ar attr Output the value of the attribute .Ar attr of the matching elements. . .It Fl c , Fl \-classes Output the classes of the matching elements. . .It Fl d , Fl \-document Parse the input as HTML documents. This is the default. . .It Fl f , Fl \-fragment Parse the input as HTML fragments. . .It Fl i , Fl \-id Output the IDs of the matching elements. . .It Fl n , Fl \-name Output the names of the matching elements. . .It Fl t , Fl \-text Output the text of the matching elements. .El . .Sh EXIT STATUS The .Nm utility exits 0 on success, 1 if no elements match, and >1 if an error occurs. . .Sh AUTHORS .An June McEnroe Aq Mt june@causal.agency .An Vivek Kushwaha Aq Mt yoursvivek@gmail.com .Pp The .Nm utility relies heavily on code from the .Lk https://servo.org "Servo project" . scraper-0.18.1/src/element_ref/element.rs000064400000000000000000000135730072674642500164660ustar 00000000000000use html5ever::Namespace; use selectors::{ attr::{AttrSelectorOperation, CaseSensitivity, NamespaceConstraint}, matching, Element, OpaqueElement, }; use super::ElementRef; use crate::selector::{CssLocalName, CssString, NonTSPseudoClass, PseudoElement, Simple}; /// Note: will never match against non-tree-structure pseudo-classes. 
///
/// Implements the `selectors` crate's `Element` trait so that CSS selectors can be matched
/// against an `ElementRef`. Shadow DOM, `::part()` and pseudo-elements are not modelled, so the
/// corresponding hooks always answer "absent".
impl<'a> Element for ElementRef<'a> {
    type Impl = Simple;

    /// Returns an opaque identity handle built from the underlying node's value.
    fn opaque(&self) -> OpaqueElement {
        OpaqueElement::new(self.node.value())
    }

    /// Returns the parent node, provided it is itself an element.
    fn parent_element(&self) -> Option<Self> {
        self.parent().and_then(ElementRef::wrap)
    }

    /// Always `false`: shadow roots are not modelled.
    fn parent_node_is_shadow_root(&self) -> bool {
        false
    }

    /// Always `None`: shadow hosts are not modelled.
    fn containing_shadow_host(&self) -> Option<Self> {
        None
    }

    /// Always `false`: an `ElementRef` never stands for a pseudo-element.
    fn is_pseudo_element(&self) -> bool {
        false
    }

    /// Always `false`: `::part()` is not supported.
    fn is_part(&self, _name: &CssLocalName) -> bool {
        false
    }

    /// Two elements have the same type when their qualified names are equal.
    fn is_same_type(&self, other: &Self) -> bool {
        self.value().name == other.value().name
    }

    /// Always `None`: `exportparts` is not supported.
    fn imported_part(&self, _: &CssLocalName) -> Option<CssLocalName> {
        None
    }

    /// Returns the first element yielded by `prev_siblings()`, skipping non-element nodes.
    fn prev_sibling_element(&self) -> Option<Self> {
        self.prev_siblings()
            .find(|sibling| sibling.value().is_element())
            .map(ElementRef::new)
    }

    /// Returns the first element yielded by `next_siblings()`, skipping non-element nodes.
    fn next_sibling_element(&self) -> Option<Self> {
        self.next_siblings()
            .find(|sibling| sibling.value().is_element())
            .map(ElementRef::new)
    }

    /// Returns the first child that is an element, skipping non-element nodes.
    fn first_element_child(&self) -> Option<Self> {
        self.children()
            .find(|child| child.value().is_element())
            .map(ElementRef::new)
    }

    /// Returns `true` when the element's name is in the HTML namespace.
    fn is_html_element_in_html_document(&self) -> bool {
        // FIXME: Is there more to this?
        self.value().name.ns == ns!(html)
    }

    /// Compares the element's local name with `name`.
    fn has_local_name(&self, name: &CssLocalName) -> bool {
        self.value().name.local == name.0
    }

    /// Compares the element's namespace with `namespace`.
    fn has_namespace(&self, namespace: &Namespace) -> bool {
        &self.value().name.ns == namespace
    }

    /// Returns `true` if any attribute satisfies all three conditions: its namespace is allowed
    /// by `ns`, its local name equals `local_name`, and its value passes `operation`.
    fn attr_matches(
        &self,
        ns: &NamespaceConstraint<&Namespace>,
        local_name: &CssLocalName,
        operation: &AttrSelectorOperation<&CssString>,
    ) -> bool {
        self.value().attrs.iter().any(|(key, value)| {
            // Only a `Specific` constraint naming a different namespace rejects the attribute;
            // every other constraint lets it through to the name and value checks.
            !matches!(*ns, NamespaceConstraint::Specific(url) if *url != key.ns)
                && local_name.0 == key.local
                && operation.eval_str(value)
        })
    }

    /// Always `false`: non-tree-structural pseudo-classes never match.
    fn match_non_ts_pseudo_class(
        &self,
        _pc: &NonTSPseudoClass,
        _context: &mut matching::MatchingContext<'_, Self::Impl>,
    ) -> bool {
        false
    }

    /// Always `false`: pseudo-elements never match.
    fn match_pseudo_element(
        &self,
        _pe: &PseudoElement,
        _context: &mut matching::MatchingContext<'_, Self::Impl>,
    ) -> bool {
        false
    }

    /// Returns `true` when the element's name is exactly `link`.
    // NOTE(review): hyperlink elements such as `a` and `area` are not treated as links here;
    // confirm whether that is intended.
    fn is_link(&self) -> bool {
        self.value().name() == "link"
    }

    /// Returns `true` unconditionally.
    // NOTE(review): this reports every element as a slot, not only `<slot>`; confirm intent.
    fn is_html_slot_element(&self) -> bool {
        true
    }

    /// Returns `true` if the element has an `id` equal to `id` under `case_sensitivity`.
    fn has_id(&self, id: &CssLocalName, case_sensitivity: CaseSensitivity) -> bool {
        match self.value().id() {
            Some(val) => case_sensitivity.eq(id.0.as_bytes(), val.as_bytes()),
            None => false,
        }
    }

    /// Delegates to the element's own class lookup using the given case sensitivity.
    fn has_class(&self, name: &CssLocalName, case_sensitivity: CaseSensitivity) -> bool {
        self.value().has_class(&name.0, case_sensitivity)
    }

    /// Returns `true` when no child is an element or a text node; other node kinds are ignored.
    fn is_empty(&self) -> bool {
        !self
            .children()
            .any(|child| child.value().is_element() || child.value().is_text())
    }

    /// Returns `true` when the parent node is the document node.
    fn is_root(&self) -> bool {
        self.parent()
            .map_or(false, |parent| parent.value().is_document())
    }

    /// Intentionally a no-op: selector flags are not recorded.
    fn apply_selector_flags(&self, _flags: matching::ElementSelectorFlags) {}
}
#[cfg(test)] mod tests { use crate::html::Html; use crate::selector::{CssLocalName, Selector}; use selectors::attr::CaseSensitivity; use selectors::Element; #[test] fn test_has_id() { let html = ""; let fragment = Html::parse_fragment(html); let sel = Selector::parse("p").unwrap(); let element = fragment.select(&sel).next().unwrap(); assert!(element.has_id( &CssLocalName::from("link_id_456"), CaseSensitivity::CaseSensitive )); let html = "

hey there

"; let fragment = Html::parse_fragment(html); let element = fragment.select(&sel).next().unwrap(); assert!(!element.has_id( &CssLocalName::from("any_link_id"), CaseSensitivity::CaseSensitive )); } #[test] fn test_is_link() { let html = ""; let fragment = Html::parse_fragment(html); let sel = Selector::parse("link").unwrap(); let element = fragment.select(&sel).next().unwrap(); assert!(element.is_link()); let html = "

hey there

"; let fragment = Html::parse_fragment(html); let sel = Selector::parse("p").unwrap(); let element = fragment.select(&sel).next().unwrap(); assert!(!element.is_link()); } #[test] fn test_has_class() { let html = "

hey there

"; let fragment = Html::parse_fragment(html); let sel = Selector::parse("p").unwrap(); let element = fragment.select(&sel).next().unwrap(); assert!(element.has_class( &CssLocalName::from("my_class"), CaseSensitivity::CaseSensitive )); let html = "

hey there

"; let fragment = Html::parse_fragment(html); let sel = Selector::parse("p").unwrap(); let element = fragment.select(&sel).next().unwrap(); assert!(!element.has_class( &CssLocalName::from("my_class"), CaseSensitivity::CaseSensitive )); } } scraper-0.18.1/src/element_ref/mod.rs000064400000000000000000000107120072674642500156040ustar 00000000000000//! Element references. use std::ops::Deref; use ego_tree::iter::{Edge, Traverse}; use ego_tree::NodeRef; use html5ever::serialize::{serialize, SerializeOpts, TraversalScope}; use crate::node::Element; use crate::{Node, Selector}; /// Wrapper around a reference to an element node. /// /// This wrapper implements the `Element` trait from the `selectors` crate, which allows it to be /// matched against CSS selectors. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct ElementRef<'a> { node: NodeRef<'a, Node>, } impl<'a> ElementRef<'a> { fn new(node: NodeRef<'a, Node>) -> Self { ElementRef { node } } /// Wraps a `NodeRef` only if it references a `Node::Element`. pub fn wrap(node: NodeRef<'a, Node>) -> Option { if node.value().is_element() { Some(ElementRef::new(node)) } else { None } } /// Returns the `Element` referenced by `self`. pub fn value(&self) -> &'a Element { self.node.value().as_element().unwrap() } /// Returns an iterator over descendent elements matching a selector. pub fn select<'b>(&self, selector: &'b Selector) -> Select<'a, 'b> { let mut inner = self.traverse(); inner.next(); // Skip Edge::Open(self). Select { scope: *self, inner, selector, } } fn serialize(&self, traversal_scope: TraversalScope) -> String { let opts = SerializeOpts { scripting_enabled: false, // It's not clear what this does. traversal_scope, create_missing_parent: false, }; let mut buf = Vec::new(); serialize(&mut buf, self, opts).unwrap(); String::from_utf8(buf).unwrap() } /// Returns the HTML of this element. pub fn html(&self) -> String { self.serialize(TraversalScope::IncludeNode) } /// Returns the inner HTML of this element. 
pub fn inner_html(&self) -> String { self.serialize(TraversalScope::ChildrenOnly(None)) } /// Returns the value of an attribute. pub fn attr(&self, attr: &str) -> Option<&'a str> { self.value().attr(attr) } /// Returns an iterator over descendent text nodes. pub fn text(&self) -> Text<'a> { Text { inner: self.traverse(), } } } impl<'a> Deref for ElementRef<'a> { type Target = NodeRef<'a, Node>; fn deref(&self) -> &NodeRef<'a, Node> { &self.node } } /// Iterator over descendent elements matching a selector. #[derive(Debug, Clone)] pub struct Select<'a, 'b> { scope: ElementRef<'a>, inner: Traverse<'a, Node>, selector: &'b Selector, } impl<'a, 'b> Iterator for Select<'a, 'b> { type Item = ElementRef<'a>; fn next(&mut self) -> Option> { for edge in &mut self.inner { if let Edge::Open(node) = edge { if let Some(element) = ElementRef::wrap(node) { if self.selector.matches_with_scope(&element, Some(self.scope)) { return Some(element); } } } } None } } /// Iterator over descendent text nodes. #[derive(Debug, Clone)] pub struct Text<'a> { inner: Traverse<'a, Node>, } impl<'a> Iterator for Text<'a> { type Item = &'a str; fn next(&mut self) -> Option<&'a str> { for edge in &mut self.inner { if let Edge::Open(node) = edge { if let Node::Text(ref text) = node.value() { return Some(&**text); } } } None } } mod element; mod serializable; #[cfg(test)] mod tests { use crate::html::Html; use crate::selector::Selector; #[test] fn test_scope() { let html = r"
1 2 3
"; let fragment = Html::parse_fragment(html); let sel1 = Selector::parse("div > span").unwrap(); let sel2 = Selector::parse(":scope > b").unwrap(); let element1 = fragment.select(&sel1).next().unwrap(); let element2 = element1.select(&sel2).next().unwrap(); assert_eq!(element2.inner_html(), "3"); } } scraper-0.18.1/src/element_ref/serializable.rs000064400000000000000000000006240072674642500174740ustar 00000000000000use std::io::Error; use html5ever::serialize::{Serialize, Serializer, TraversalScope}; use crate::ElementRef; impl<'a> Serialize for ElementRef<'a> { fn serialize( &self, serializer: &mut S, traversal_scope: TraversalScope, ) -> Result<(), Error> { crate::node::serializable::serialize(**self, serializer, traversal_scope) } } scraper-0.18.1/src/error/utils.rs000064400000000000000000000056130072674642500150350ustar 00000000000000use cssparser::Token; pub(crate) fn render_token(token: &Token<'_>) -> String { // THIS TOOK FOREVER TO IMPLEMENT match token { // TODO: Give these guys some better names Token::Ident(ident) => format!("{}", ident.clone()), Token::AtKeyword(value) => format!("@{}", value.clone()), Token::Hash(name) | Token::IDHash(name) => format!("#{}", name.clone()), Token::QuotedString(value) => format!("\"{}\"", value.clone()), Token::Number { has_sign: signed, value: num, int_value: _, } | Token::Percentage { has_sign: signed, unit_value: num, int_value: _, } => render_number(*signed, *num, token), Token::Dimension { has_sign: signed, value: num, int_value: _, unit, } => format!("{}{}", render_int(*signed, *num), unit), Token::WhiteSpace(_) => String::from(" "), Token::Comment(comment) => format!("/* {} */", comment), Token::Function(name) => format!("{}()", name.clone()), Token::BadString(string) => format!("", string.clone()), Token::BadUrl(url) => format!("", url.clone()), // Single-character token sc_token => render_single_char_token(sc_token), } } fn render_single_char_token(token: &Token) -> String { String::from(match token { 
Token::Colon => ":", Token::Semicolon => ";", Token::Comma => ",", Token::IncludeMatch => "~=", Token::DashMatch => "|=", Token::PrefixMatch => "^=", Token::SuffixMatch => "$=", Token::SubstringMatch => "*=", Token::CDO => "", Token::ParenthesisBlock => "<(", Token::SquareBracketBlock => "<[", Token::CurlyBracketBlock => "<{", Token::CloseParenthesis => "<)", Token::CloseSquareBracket => "<]", Token::CloseCurlyBracket => "<}", other => panic!( "Token {:?} is not supposed to match as a single-character token!", other ), }) } fn render_number(signed: bool, num: f32, token: &Token) -> String { let num = render_int(signed, num); match token { Token::Number { .. } => num, Token::Percentage { .. } => format!("{}%", num), _ => panic!("render_number is not supposed to be called on a non-numerical token"), } } fn render_int(signed: bool, num: f32) -> String { if signed { render_int_signed(num) } else { render_int_unsigned(num) } } fn render_int_signed(num: f32) -> String { if num > 0.0 { format!("+{}", num) } else { format!("-{}", num) } } fn render_int_unsigned(num: f32) -> String { format!("{}", num) } scraper-0.18.1/src/error.rs000064400000000000000000000111570072674642500136750ustar 00000000000000//! Custom error types for diagnostics //! 
Includes re-exported error types from dependencies mod utils; use std::{error::Error, fmt::Display}; use cssparser::{BasicParseErrorKind, ParseErrorKind, Token}; use selectors::parser::SelectorParseErrorKind; /// Error type that is returned when calling `Selector::parse` #[derive(Debug, Clone)] pub enum SelectorErrorKind<'a> { /// A `Token` was not expected UnexpectedToken(Token<'a>), /// End-Of-Line was unexpected EndOfLine, /// `@` rule is invalid InvalidAtRule(String), /// The body of an `@` rule is invalid InvalidAtRuleBody, /// The qualified rule is invalid QualRuleInvalid, /// Expected a `::` for a pseudoelement ExpectedColonOnPseudoElement(Token<'a>), /// Expected an identity for a pseudoelement ExpectedIdentityOnPseudoElement(Token<'a>), /// A `SelectorParseErrorKind` error that isn't really supposed to happen did UnexpectedSelectorParseError(SelectorParseErrorKind<'a>), } impl<'a> From>> for SelectorErrorKind<'a> { fn from(original: cssparser::ParseError<'a, SelectorParseErrorKind<'a>>) -> Self { // NOTE: This could be improved, but I dont // exactly know how match original.kind { ParseErrorKind::Basic(err) => SelectorErrorKind::from(err), ParseErrorKind::Custom(err) => SelectorErrorKind::from(err), } } } impl<'a> From> for SelectorErrorKind<'a> { fn from(err: BasicParseErrorKind<'a>) -> Self { match err { BasicParseErrorKind::UnexpectedToken(token) => Self::UnexpectedToken(token), BasicParseErrorKind::EndOfInput => Self::EndOfLine, BasicParseErrorKind::AtRuleInvalid(rule) => Self::InvalidAtRule(rule.to_string()), BasicParseErrorKind::AtRuleBodyInvalid => Self::InvalidAtRuleBody, BasicParseErrorKind::QualifiedRuleInvalid => Self::QualRuleInvalid, } } } impl<'a> From> for SelectorErrorKind<'a> { fn from(err: SelectorParseErrorKind<'a>) -> Self { match err { SelectorParseErrorKind::PseudoElementExpectedColon(token) => { Self::ExpectedColonOnPseudoElement(token) } SelectorParseErrorKind::PseudoElementExpectedIdent(token) => { 
Self::ExpectedIdentityOnPseudoElement(token) } other => Self::UnexpectedSelectorParseError(other), } } } impl<'a> Display for SelectorErrorKind<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "{}", match self { Self::UnexpectedToken(token) => { format!("Token {:?} was not expected", utils::render_token(token)) } Self::EndOfLine => "Unexpected EOL".to_string(), Self::InvalidAtRule(rule) => format!("Invalid @-rule {:?}", rule), Self::InvalidAtRuleBody => "The body of an @-rule was invalid".to_string(), Self::QualRuleInvalid => "The qualified name was invalid".to_string(), Self::ExpectedColonOnPseudoElement(token) => format!( "Expected a ':' token for pseudoelement, got {:?} instead", utils::render_token(token) ), Self::ExpectedIdentityOnPseudoElement(token) => format!( "Expected identity for pseudoelement, got {:?} instead", utils::render_token(token) ), Self::UnexpectedSelectorParseError(err) => format!( "Unexpected error occurred. Please report this to the developer\n{:#?}", err ), } ) } } impl<'a> Error for SelectorErrorKind<'a> { fn description(&self) -> &str { match self { Self::UnexpectedToken(_) => "Token was not expected", Self::EndOfLine => "Unexpected EOL", Self::InvalidAtRule(_) => "Invalid @-rule", Self::InvalidAtRuleBody => "The body of an @-rule was invalid", Self::QualRuleInvalid => "The qualified name was invalid", Self::ExpectedColonOnPseudoElement(_) => "Missing colon character on pseudoelement", Self::ExpectedIdentityOnPseudoElement(_) => "Missing pseudoelement identity", Self::UnexpectedSelectorParseError(_) => "Unexpected error", } } } scraper-0.18.1/src/html/mod.rs000064400000000000000000000155630072674642500142740ustar 00000000000000//! HTML documents and fragments. 
#[cfg(feature = "errors")] use std::borrow::Cow; use ego_tree::iter::Nodes; use ego_tree::Tree; use html5ever::serialize::SerializeOpts; use html5ever::tree_builder::QuirksMode; use html5ever::QualName; use html5ever::{driver, serialize}; use tendril::TendrilSink; use crate::selector::Selector; use crate::{ElementRef, Node}; /// An HTML tree. /// /// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the /// `errors` field. The `tree` will still be populated as best as possible. /// /// Implements the `TreeSink` trait from the `html5ever` crate, which allows HTML to be parsed. #[derive(Debug, Clone, PartialEq, Eq)] pub struct Html { #[cfg(feature = "errors")] /// Parse errors. pub errors: Vec>, /// The quirks mode. pub quirks_mode: QuirksMode, /// The node tree. pub tree: Tree, } impl Html { /// Creates an empty HTML document. pub fn new_document() -> Self { Html { #[cfg(feature = "errors")] errors: Vec::new(), quirks_mode: QuirksMode::NoQuirks, tree: Tree::new(Node::Document), } } /// Creates an empty HTML fragment. pub fn new_fragment() -> Self { Html { #[cfg(feature = "errors")] errors: Vec::new(), quirks_mode: QuirksMode::NoQuirks, tree: Tree::new(Node::Fragment), } } /// Parses a string of HTML as a document. /// /// This is a convenience method for the following: /// /// ``` /// # extern crate html5ever; /// # extern crate scraper; /// # extern crate tendril; /// # fn main() { /// # let document = ""; /// use html5ever::driver::{self, ParseOpts}; /// use scraper::Html; /// use tendril::TendrilSink; /// /// let parser = driver::parse_document(Html::new_document(), ParseOpts::default()); /// let html = parser.one(document); /// # } /// ``` pub fn parse_document(document: &str) -> Self { let parser = driver::parse_document(Self::new_document(), Default::default()); parser.one(document) } /// Parses a string of HTML as a fragment. 
pub fn parse_fragment(fragment: &str) -> Self { let parser = driver::parse_fragment( Self::new_fragment(), Default::default(), QualName::new(None, ns!(html), local_name!("body")), Vec::new(), ); parser.one(fragment) } /// Returns an iterator over elements matching a selector. pub fn select<'a, 'b>(&'a self, selector: &'b Selector) -> Select<'a, 'b> { Select { inner: self.tree.nodes(), selector, } } /// Returns the root `` element. pub fn root_element(&self) -> ElementRef { let root_node = self .tree .root() .children() .find(|child| child.value().is_element()) .expect("html node missing"); ElementRef::wrap(root_node).unwrap() } /// Serialize entire document into HTML. pub fn html(&self) -> String { let opts = SerializeOpts { scripting_enabled: false, // It's not clear what this does. traversal_scope: html5ever::serialize::TraversalScope::IncludeNode, create_missing_parent: false, }; let mut buf = Vec::new(); serialize(&mut buf, self, opts).unwrap(); String::from_utf8(buf).unwrap() } } /// Iterator over elements matching a selector. 
#[derive(Debug)] pub struct Select<'a, 'b> { inner: Nodes<'a, Node>, selector: &'b Selector, } impl<'a, 'b> Iterator for Select<'a, 'b> { type Item = ElementRef<'a>; fn next(&mut self) -> Option> { for node in self.inner.by_ref() { if let Some(element) = ElementRef::wrap(node) { if element.parent().is_some() && self.selector.matches(&element) { return Some(element); } } } None } fn size_hint(&self) -> (usize, Option) { let (_lower, upper) = self.inner.size_hint(); (0, upper) } } impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> { fn next_back(&mut self) -> Option { for node in self.inner.by_ref().rev() { if let Some(element) = ElementRef::wrap(node) { if element.parent().is_some() && self.selector.matches(&element) { return Some(element); } } } None } } mod serializable; mod tree_sink; #[cfg(test)] mod tests { use super::Html; use super::Selector; #[test] fn root_element_fragment() { let html = Html::parse_fragment(r#"1"#); let root_ref = html.root_element(); let href = root_ref .select(&Selector::parse("a").unwrap()) .next() .unwrap(); assert_eq!(href.inner_html(), "1"); assert_eq!(href.value().attr("href").unwrap(), "http://github.com"); } #[test] fn root_element_document_doctype() { let html = Html::parse_document("\nabc"); let root_ref = html.root_element(); let title = root_ref .select(&Selector::parse("title").unwrap()) .next() .unwrap(); assert_eq!(title.inner_html(), "abc"); } #[test] fn root_element_document_comment() { let html = Html::parse_document("abc"); let root_ref = html.root_element(); let title = root_ref .select(&Selector::parse("title").unwrap()) .next() .unwrap(); assert_eq!(title.inner_html(), "abc"); } #[test] fn select_is_reversible() { let html = Html::parse_document("

element1

element2

element3

"); let selector = Selector::parse("p").unwrap(); let result: Vec<_> = html .select(&selector) .rev() .map(|e| e.inner_html()) .collect(); assert_eq!(result, vec!["element3", "element2", "element1"]); } #[test] fn select_has_a_size_hint() { let html = Html::parse_document("

element1

element2

element3

"); let selector = Selector::parse("p").unwrap(); let (lower, upper) = html.select(&selector).size_hint(); assert_eq!(lower, 0); assert_eq!(upper, Some(10)); } #[cfg(feature = "atomic")] #[test] fn html_is_send() { fn send_sync() {} send_sync::(); } } scraper-0.18.1/src/html/serializable.rs000064400000000000000000000013120072674642500161460ustar 00000000000000use std::io::Error; use html5ever::serialize::{Serialize, Serializer, TraversalScope}; use crate::Html; impl Serialize for Html { fn serialize( &self, serializer: &mut S, traversal_scope: TraversalScope, ) -> Result<(), Error> { crate::node::serializable::serialize(self.tree.root(), serializer, traversal_scope) } } #[cfg(test)] mod tests { use crate::Html; #[test] fn test_serialize() { let src = r#"

Hello world!

"#; let html = Html::parse_document(src); assert_eq!(html.html(), src); } } scraper-0.18.1/src/html/tree_sink.rs000064400000000000000000000202310072674642500154640ustar 00000000000000use super::Html; use crate::node::{Comment, Doctype, Element, Node, ProcessingInstruction, Text}; use crate::tendril_util::make as make_tendril; use ego_tree::NodeId; use html5ever::tendril::StrTendril; use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; use html5ever::Attribute; use html5ever::{ExpandedName, QualName}; use std::borrow::Cow; /// Note: does not support the `