adblock-0.8.12/.cargo_vcs_info.json0000644000000001360000000000100125230ustar { "git": { "sha1": "e01430a534b64163e16590290bbf4df6218d16aa" }, "path_in_vcs": "" }adblock-0.8.12/Cargo.lock0000644000001444330000000000100105070ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "adblock" version = "0.8.12" dependencies = [ "addr", "base64 0.13.0", "bitflags", "criterion", "cssparser", "csv", "futures", "idna 0.2.3", "itertools", "lifeguard", "memchr", "mock_instant", "once_cell", "percent-encoding", "regex", "reqwest", "rmp-serde", "seahash", "selectors", "serde", "serde_json", "sha2", "thiserror", "tokio", "url", ] [[package]] name = "addr" version = "0.15.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a93b8a41dbe230ad5087cc721f8d41611de654542180586b315d9f4cf6b72bef" dependencies = [ "psl", "psl-types", ] [[package]] name = "aho-corasick" version = "0.7.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4f55bd91a0978cbfd91c457a164bab8b4001c833b7f323132c0a4e1922dd44e" dependencies = [ "memchr", ] [[package]] name = "anes" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstyle" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "base64" version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" [[package]] name = "base64" version = "0.21.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" [[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "block-buffer" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" dependencies = [ "generic-array", ] [[package]] name = "bumpalo" version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" [[package]] name = "byteorder" version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db" [[package]] name = "cast" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ "libc", ] [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "ciborium" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f" dependencies = [ "ciborium-io", "ciborium-ll", "serde", ] [[package]] name = "ciborium-io" 
version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369" [[package]] name = "ciborium-ll" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b" dependencies = [ "ciborium-io", "half", ] [[package]] name = "clap" version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" dependencies = [ "anstyle", "clap_lex", ] [[package]] name = "clap_lex" version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "convert_case" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" [[package]] name = "core-foundation" version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" dependencies = [ "core-foundation-sys", "libc", ] [[package]] name = "core-foundation-sys" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" dependencies = [ "libc", ] [[package]] name = "criterion" version = "0.5.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" dependencies = [ "anes", "cast", "ciborium", "clap", "criterion-plot", "is-terminal", "itertools", "num-traits", "once_cell", "oorandom", "plotters", "rayon", "regex", "serde", "serde_derive", "serde_json", "tinytemplate", "walkdir", ] [[package]] name = "criterion-plot" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", "itertools", ] [[package]] name = "crossbeam-channel" version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" dependencies = [ "cfg-if", "crossbeam-utils", ] [[package]] name = "crossbeam-deque" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" dependencies = [ "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", "memoffset", "once_cell", "scopeguard", ] [[package]] name = "crossbeam-utils" version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" dependencies = [ "cfg-if", "once_cell", ] [[package]] name = "cssparser" version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1db8599a9761b371751fbf13e076fa03c6e1a78f8c5288e6ab9467f10a2322c1" dependencies = [ "cssparser-macros", "dtoa-short", "itoa 0.4.8", "matches", "phf", "proc-macro2", "quote", "smallvec", "syn 1.0.99", ] [[package]] 
name = "cssparser-macros" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfae75de57f2b2e85e8768c3ea840fd159c8f33e2b6522c7835b7abac81be16e" dependencies = [ "quote", "syn 1.0.99", ] [[package]] name = "csv" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" dependencies = [ "csv-core", "itoa 1.0.3", "ryu", "serde", ] [[package]] name = "csv-core" version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" dependencies = [ "memchr", ] [[package]] name = "derive_more" version = "0.99.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" dependencies = [ "convert_case", "proc-macro2", "quote", "rustc_version", "syn 1.0.99", ] [[package]] name = "digest" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" dependencies = [ "generic-array", ] [[package]] name = "dtoa" version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56899898ce76aaf4a0f24d914c97ea6ed976d42fec6ad33fcbb0a1103e07b2b0" [[package]] name = "dtoa-short" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bde03329ae10e79ede66c9ce4dc930aa8599043b0743008548680f25b91502d6" dependencies = [ "dtoa", ] [[package]] name = "either" version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" [[package]] name = "encoding_rs" version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b" 
dependencies = [ "cfg-if", ] [[package]] name = "equivalent" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "fnv" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "form_urlencoded" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" dependencies = [ "percent-encoding", ] [[package]] name = "futures" version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" dependencies = [ "futures-channel", "futures-core", "futures-executor", "futures-io", "futures-sink", "futures-task", "futures-util", ] [[package]] name = "futures-channel" version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", "futures-sink", ] [[package]] name = "futures-core" version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-executor" version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" dependencies = [ "futures-core", "futures-task", "futures-util", ] [[package]] name = "futures-io" version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" [[package]] name = "futures-macro" version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", "syn 2.0.61", ] [[package]] name = "futures-sink" version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-util" version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-channel", "futures-core", "futures-io", "futures-macro", "futures-sink", "futures-task", "memchr", "pin-project-lite", "pin-utils", "slab", ] [[package]] name = "fxhash" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" dependencies = [ "byteorder", ] [[package]] name = "generic-array" version = "0.14.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" dependencies = [ "typenum", "version_check", ] [[package]] name = "getrandom" version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" dependencies = [ "cfg-if", "libc", "wasi 0.9.0+wasi-snapshot-preview1", ] [[package]] name = "getrandom" version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f" dependencies = [ "cfg-if", "libc", "wasi 0.11.0+wasi-snapshot-preview1", ] [[package]] name = "h2" version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" dependencies = [ "bytes", "fnv", "futures-core", "futures-sink", "futures-util", "http", "indexmap", "slab", "tokio", "tokio-util", "tracing", ] [[package]] name = "half" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" [[package]] name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" [[package]] name = "hermit-abi" version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ "libc", ] [[package]] name = "hermit-abi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" [[package]] name = "http" version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ "bytes", "fnv", "itoa 1.0.3", ] [[package]] name = "http-body" version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", "http", "pin-project-lite", ] [[package]] name = "httparse" version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" [[package]] name = "httpdate" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" [[package]] name = "hyper" version = "0.14.27" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "ffb1cfd654a8219eaef89881fdb3bb3b1cdc5fa75ded05d6933b2b382e395468" dependencies = [ "bytes", "futures-channel", "futures-core", "futures-util", "h2", "http", "http-body", "httparse", "httpdate", "itoa 1.0.3", "pin-project-lite", "socket2", "tokio", "tower-service", "tracing", "want", ] [[package]] name = "hyper-rustls" version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" dependencies = [ "futures-util", "http", "hyper", "rustls", "tokio", "tokio-rustls", ] [[package]] name = "idna" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" dependencies = [ "matches", "unicode-bidi", "unicode-normalization", ] [[package]] name = "idna" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" dependencies = [ "unicode-bidi", "unicode-normalization", ] [[package]] name = "indexmap" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" dependencies = [ "equivalent", "hashbrown", ] [[package]] name = "ipnet" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "879d54834c8c76457ef4293a689b2a8c59b076067ad77b15efafbb05f92a592b" [[package]] name = "is-terminal" version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" dependencies = [ "hermit-abi 0.3.9", "libc", "windows-sys 0.52.0", ] [[package]] name = "itertools" version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8bf247779e67a9082a4790b45e71ac7cfd1321331a5c856a74a9faebdab78d0" dependencies = [ "either", ] 
[[package]] name = "itoa" version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" [[package]] name = "itoa" version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754" [[package]] name = "js-sys" version = "0.3.60" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" dependencies = [ "wasm-bindgen", ] [[package]] name = "libc" version = "0.2.150" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" [[package]] name = "lifeguard" version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89be94dbd775db37b46ca4f4bf5cf89adfb13ba197bfbcb69b2122848ee73c26" [[package]] name = "log" version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", ] [[package]] name = "matches" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" [[package]] name = "memchr" version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" [[package]] name = "memoffset" version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" dependencies = [ "autocfg", ] [[package]] name = "mime" version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] 
name = "mio" version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.48.0", ] [[package]] name = "mock_instant" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdcebb6db83796481097dedc7747809243cc81d9ed83e6a938b76d4ea0b249cf" [[package]] name = "nodrop" version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" [[package]] name = "num-traits" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", ] [[package]] name = "num_cpus" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ "hermit-abi 0.1.19", "libc", ] [[package]] name = "once_cell" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "oorandom" version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "opaque-debug" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" [[package]] name = "paste" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1de2e551fb905ac83f73f7aedf2f0cb4a0da7e35efa24a202a936269f1f18e1" [[package]] name = "percent-encoding" version = "2.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" [[package]] name = "phf" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" dependencies = [ "phf_macros", "phf_shared", "proc-macro-hack", ] [[package]] name = "phf_codegen" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" dependencies = [ "phf_generator", "phf_shared", ] [[package]] name = "phf_generator" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" dependencies = [ "phf_shared", "rand", ] [[package]] name = "phf_macros" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c" dependencies = [ "phf_generator", "phf_shared", "proc-macro-hack", "proc-macro2", "quote", "syn 1.0.99", ] [[package]] name = "phf_shared" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" dependencies = [ "siphasher", ] [[package]] name = "pin-project-lite" version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" [[package]] name = "pin-utils" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "plotters" version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2538b639e642295546c50fcd545198c9d64ee2a38620a628724a3b266d5fbf97" dependencies = [ 
"num-traits", "plotters-backend", "plotters-svg", "wasm-bindgen", "web-sys", ] [[package]] name = "plotters-backend" version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "193228616381fecdc1224c62e96946dfbc73ff4384fba576e052ff8c1bea8142" [[package]] name = "plotters-svg" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9a81d2759aae1dae668f783c308bc5c8ebd191ff4184aaa1b37f65a6ae5a56f" dependencies = [ "plotters-backend", ] [[package]] name = "ppv-lite86" version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" [[package]] name = "precomputed-hash" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" [[package]] name = "proc-macro-hack" version = "0.5.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" version = "1.0.82" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ad3d49ab951a01fbaafe34f2ec74122942fe18a3f9814c3268f1bb72042131b" dependencies = [ "unicode-ident", ] [[package]] name = "psl" version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07242622d9f4b9c1a6fe9c2691cf18ee7d34400a5eed2e1668c756bfaea93fb3" dependencies = [ "psl-types", ] [[package]] name = "psl-types" version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" [[package]] name = "quote" version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] [[package]] name = "rand" 
version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ "getrandom 0.1.16", "libc", "rand_chacha", "rand_core", "rand_hc", "rand_pcg", ] [[package]] name = "rand_chacha" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" dependencies = [ "ppv-lite86", "rand_core", ] [[package]] name = "rand_core" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" dependencies = [ "getrandom 0.1.16", ] [[package]] name = "rand_hc" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" dependencies = [ "rand_core", ] [[package]] name = "rand_pcg" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" dependencies = [ "rand_core", ] [[package]] name = "rayon" version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" dependencies = [ "autocfg", "crossbeam-deque", "either", "rayon-core", ] [[package]] name = "rayon-core" version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", "num_cpus", ] [[package]] name = "regex" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] [[package]] name = 
"regex-syntax" version = "0.6.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" [[package]] name = "reqwest" version = "0.11.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "046cd98826c46c2ac8ddecae268eb5c2e58628688a5fc7a2643704a73faba95b" dependencies = [ "base64 0.21.5", "bytes", "encoding_rs", "futures-core", "futures-util", "h2", "http", "http-body", "hyper", "hyper-rustls", "ipnet", "js-sys", "log", "mime", "once_cell", "percent-encoding", "pin-project-lite", "rustls", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", "system-configuration", "tokio", "tokio-rustls", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", "web-sys", "webpki-roots", "winreg", ] [[package]] name = "ring" version = "0.16.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" dependencies = [ "cc", "libc", "once_cell", "spin 0.5.2", "untrusted 0.7.1", "web-sys", "winapi", ] [[package]] name = "ring" version = "0.17.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb0205304757e5d899b9c2e448b867ffd03ae7f988002e47cd24954391394d0b" dependencies = [ "cc", "getrandom 0.2.11", "libc", "spin 0.9.8", "untrusted 0.9.0", "windows-sys 0.48.0", ] [[package]] name = "rmp" version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44519172358fd6d58656c86ab8e7fbc9e1490c3e8f14d35ed78ca0dd07403c9f" dependencies = [ "byteorder", "num-traits", "paste", ] [[package]] name = "rmp-serde" version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "723ecff9ad04f4ad92fe1c8ca6c20d2196d9286e9c60727c4cb5511629260e9d" dependencies = [ "byteorder", "rmp", "serde", ] [[package]] name = "rustc_version" version = "0.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ "semver", ] [[package]] name = "rustls" version = "0.21.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" dependencies = [ "log", "ring 0.17.5", "rustls-webpki", "sct", ] [[package]] name = "rustls-pemfile" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55" dependencies = [ "base64 0.13.0", ] [[package]] name = "rustls-webpki" version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ "ring 0.17.5", "untrusted 0.9.0", ] [[package]] name = "ryu" version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" [[package]] name = "same-file" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" dependencies = [ "winapi-util", ] [[package]] name = "scopeguard" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "sct" version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" dependencies = [ "ring 0.16.20", "untrusted 0.7.1", ] [[package]] name = "seahash" version = "3.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "58f57ca1d128a43733fd71d583e837b1f22239a37ebea09cde11d8d9a9080f47" [[package]] name = "selectors" version = "0.23.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "fdea87c686be721aab36607728047801ee21561bfdbd6bf0da7ace2536d5879f" dependencies = [ "bitflags", "cssparser", "derive_more", "fxhash", "log", "phf", "phf_codegen", "precomputed-hash", "servo_arc", "smallvec", ] [[package]] name = "semver" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" [[package]] name = "serde" version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", "syn 2.0.61", ] [[package]] name = "serde_json" version = "1.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" dependencies = [ "itoa 1.0.3", "ryu", "serde", ] [[package]] name = "serde_urlencoded" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", "itoa 1.0.3", "ryu", "serde", ] [[package]] name = "servo_arc" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432" dependencies = [ "nodrop", "stable_deref_trait", ] [[package]] name = "sha2" version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" dependencies = [ "block-buffer", "cfg-if", "cpufeatures", "digest", "opaque-debug", ] [[package]] name = 
"siphasher" version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "slab" version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" dependencies = [ "autocfg", ] [[package]] name = "smallvec" version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" [[package]] name = "socket2" version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" dependencies = [ "libc", "winapi", ] [[package]] name = "spin" version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "stable_deref_trait" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "syn" version = "1.0.99" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "syn" version = "2.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c993ed8ccba56ae856363b1845da7266a7cb78e1d146c8a32d54b45a8b831fc9" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "system-configuration" version = "0.5.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" dependencies = [ "bitflags", "core-foundation", "system-configuration-sys", ] [[package]] name = "system-configuration-sys" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" dependencies = [ "core-foundation-sys", "libc", ] [[package]] name = "thiserror" version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" dependencies = [ "proc-macro2", "quote", "syn 2.0.61", ] [[package]] name = "tinytemplate" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" dependencies = [ "serde", "serde_json", ] [[package]] name = "tinyvec" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" dependencies = [ "tinyvec_macros", ] [[package]] name = "tinyvec_macros" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" version = "1.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "597a12a59981d9e3c38d216785b0c37399f6e415e8d0712047620f189371b0bb" dependencies = [ "autocfg", "bytes", "libc", "memchr", "mio", "num_cpus", "pin-project-lite", "socket2", "windows-sys 0.42.0", ] [[package]] name = "tokio-rustls" version = "0.24.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ "rustls", "tokio", ] [[package]] name = "tokio-util" version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bb2e075f03b3d66d8d8785356224ba688d2906a371015e225beeb65ca92c740" dependencies = [ "bytes", "futures-core", "futures-sink", "pin-project-lite", "tokio", "tracing", ] [[package]] name = "tower-service" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307" dependencies = [ "cfg-if", "pin-project-lite", "tracing-core", ] [[package]] name = "tracing-core" version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7" dependencies = [ "once_cell", ] [[package]] name = "try-lock" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" [[package]] name = "typenum" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" [[package]] name = "unicode-bidi" version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" [[package]] name = "unicode-ident" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" [[package]] name = "unicode-normalization" version = "0.1.22" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" dependencies = [ "tinyvec", ] [[package]] name = "untrusted" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "untrusted" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" dependencies = [ "form_urlencoded", "idna 0.3.0", "percent-encoding", ] [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "walkdir" version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" dependencies = [ "same-file", "winapi", "winapi-util", ] [[package]] name = "want" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" dependencies = [ "log", "try-lock", ] [[package]] name = "wasi" version = "0.9.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" version = "0.2.83" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" dependencies = [ "cfg-if", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", "syn 1.0.99", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23639446165ca5a5de86ae1d8896b737ae80319560fbaa4c2887b7da6e7ebd7d" dependencies = [ "cfg-if", "js-sys", "wasm-bindgen", "web-sys", ] [[package]] name = "wasm-bindgen-macro" version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" dependencies = [ "quote", "wasm-bindgen-macro-support", ] [[package]] name = "wasm-bindgen-macro-support" version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" dependencies = [ "proc-macro2", "quote", "syn 1.0.99", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" [[package]] name = "web-sys" version = "0.3.60" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" dependencies = [ "js-sys", "wasm-bindgen", ] [[package]] name = "webpki-roots" version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc" [[package]] name = 
"winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" dependencies = [ "winapi", ] [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ "windows_aarch64_gnullvm 0.42.0", "windows_aarch64_msvc 0.42.0", "windows_i686_gnu 0.42.0", "windows_i686_msvc 0.42.0", "windows_x86_64_gnu 0.42.0", "windows_x86_64_gnullvm 0.42.0", "windows_x86_64_msvc 0.42.0", ] [[package]] name = "windows-sys" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ "windows-targets 0.48.5", ] [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ "windows-targets 0.52.5", ] [[package]] name = "windows-targets" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" 
dependencies = [ "windows_aarch64_gnullvm 0.48.5", "windows_aarch64_msvc 0.48.5", "windows_i686_gnu 0.48.5", "windows_i686_msvc 0.48.5", "windows_x86_64_gnu 0.48.5", "windows_x86_64_gnullvm 0.48.5", "windows_x86_64_msvc 0.48.5", ] [[package]] name = "windows-targets" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ "windows_aarch64_gnullvm 0.52.5", "windows_aarch64_msvc 0.52.5", "windows_i686_gnu 0.52.5", "windows_i686_gnullvm", "windows_i686_msvc 0.52.5", "windows_x86_64_gnu 0.52.5", "windows_x86_64_gnullvm 0.52.5", "windows_x86_64_msvc 0.52.5", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" version = "0.42.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" [[package]] name = "windows_i686_gnullvm" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" version 
= "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" [[package]] name = "winreg" version = "0.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" dependencies = [ "cfg-if", "windows-sys 0.48.0", ] adblock-0.8.12/Cargo.toml0000644000000057050000000000100105300ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. 
[package] edition = "2021" name = "adblock" version = "0.8.12" authors = [ "Anton Lazarev ", "Andrius Aucinas", ] exclude = [ "*.md", ".github/", ".gitignore", ".npmignore", "data/", "examples/", "js/", "package-lock.json", "package.json", "tests/", ] description = "Native Rust module for Adblock Plus syntax (e.g. EasyList, EasyPrivacy) filter parsing and matching." readme = "README.md" license = "MPL-2.0" repository = "https://github.com/brave/adblock-rust/" [lib] bench = false [[bench]] name = "bench_regex" harness = false [[bench]] name = "bench_matching" harness = false [[bench]] name = "bench_url" harness = false [[bench]] name = "bench_rules" harness = false [[bench]] name = "bench_redirect_performance" harness = false [dependencies.addr] version = "0.15" features = ["psl"] optional = true default-features = false [dependencies.base64] version = "0.13" [dependencies.bitflags] version = "1.3" [dependencies.cssparser] version = "0.28" optional = true [dependencies.idna] version = "0.2" [dependencies.itertools] version = "0.10" [dependencies.lifeguard] version = "^ 0.6.1" optional = true [dependencies.memchr] version = "2.4" [dependencies.once_cell] version = "1.8" [dependencies.percent-encoding] version = "2.1" [dependencies.regex] version = "1.5" [dependencies.rmp-serde] version = "0.15" [dependencies.seahash] version = "3" [dependencies.selectors] version = "0.23" optional = true [dependencies.serde] version = "1.0" features = [ "derive", "rc", ] [dependencies.serde_json] version = "1.0" optional = true [dependencies.thiserror] version = "1.0" [dependencies.url] version = "2.2" [dev-dependencies.criterion] version = "0.5" [dev-dependencies.csv] version = "1" [dev-dependencies.futures] version = "0.3" [dev-dependencies.mock_instant] version = "0.5" [dev-dependencies.reqwest] version = "0.11" features = ["rustls-tls"] default-features = false [dev-dependencies.serde_json] version = "1.0" [dev-dependencies.sha2] version = "0.9" [dev-dependencies.tokio] version = 
"1.24" features = ["rt-multi-thread"] [features] content-blocking = ["serde_json"] css-validation = [ "cssparser", "selectors", ] default = [ "embedded-domain-resolver", "full-regex-handling", "object-pooling", "unsync-regex-caching", ] embedded-domain-resolver = ["addr"] full-regex-handling = [] object-pooling = ["lifeguard"] regex-debug-info = [] resource-assembler = ["serde_json"] unsync-regex-caching = [] adblock-0.8.12/Cargo.toml.orig000064400000000000000000000050361046102023000142060ustar 00000000000000[package] name = "adblock" version = "0.8.12" authors = ["Anton Lazarev ", "Andrius Aucinas"] edition = "2021" description = "Native Rust module for Adblock Plus syntax (e.g. EasyList, EasyPrivacy) filter parsing and matching." repository = "https://github.com/brave/adblock-rust/" license = "MPL-2.0" readme = "README.md" exclude = [ "*.md", ".github/", ".gitignore", ".npmignore", "data/", "examples/", "js/", "package-lock.json", "package.json", "tests/", ] [dependencies] addr = { version = "0.15", default-features = false, features = ["psl"], optional = true } url = "2.2" percent-encoding = "2.1" once_cell = "1.8" regex = "1.5" bitflags = "1.3" itertools = "0.10" idna = "0.2" serde = { version = "1.0", features = ["derive", "rc"] } seahash = "3" # seahash 4 introduces a breaking hash algorithm change memchr = "2.4" base64 = "0.13" rmp-serde = "0.15" lifeguard = { version = "^ 0.6.1", optional = true } cssparser = { version = "0.28", optional = true } selectors = { version = "0.23", optional = true } serde_json = { version = "1.0", optional = true } thiserror = "1.0" [dev-dependencies] criterion = "0.5" csv = "1" mock_instant = { version = "0.5" } serde_json = "1.0" # By default, reqwest builds openssl from source, which fails on missing/incompatible system dependencies reqwest = { version = "0.11", features = ["rustls-tls"], default-features = false } futures = "0.3" tokio = { version = "1.24", features = ["rt-multi-thread"] } sha2 = "0.9" [lib] bench = false 
[[bench]] name = "bench_regex" harness = false [[bench]] name = "bench_matching" harness = false [[bench]] name = "bench_url" harness = false [[bench]] name = "bench_rules" harness = false [[bench]] name = "bench_redirect_performance" harness = false # Currently disabled, as cosmetic filter internals # are no longer part of the crate's public API #[[bench]] #name = "bench_cosmetic_matching" #harness = false [features] # If disabling default features, consider explicitly re-enabling the # "embedded-domain-resolver" feature. default = ["embedded-domain-resolver", "full-regex-handling", "object-pooling", "unsync-regex-caching"] full-regex-handling = [] object-pooling = ["lifeguard"] # disables `Send` and `Sync` on `Engine`. unsync-regex-caching = [] # disables `Send` and `Sync` on `Engine`. regex-debug-info = [] css-validation = ["cssparser", "selectors"] content-blocking = ["serde_json"] embedded-domain-resolver = ["addr"] # Requires setting an external domain resolver if disabled. resource-assembler = ["serde_json"] adblock-0.8.12/LICENSE000064400000000000000000000405261046102023000123270ustar 00000000000000Mozilla Public License Version 2.0 ================================== 1. Definitions -------------- 1.1. "Contributor" means each individual or legal entity that creates, contributes to the creation of, or owns Covered Software. 1.2. "Contributor Version" means the combination of the Contributions of others (if any) used by a Contributor and that particular Contributor's Contribution. 1.3. "Contribution" means Covered Software of a particular Contributor. 1.4. "Covered Software" means Source Code Form to which the initial Contributor has attached the notice in Exhibit A, the Executable Form of such Source Code Form, and Modifications of such Source Code Form, in each case including portions thereof. 1.5. 
"Incompatible With Secondary Licenses" means (a) that the initial Contributor has attached the notice described in Exhibit B to the Covered Software; or (b) that the Covered Software was made available under the terms of version 1.1 or earlier of the License, but not also under the terms of a Secondary License. 1.6. "Executable Form" means any form of the work other than Source Code Form. 1.7. "Larger Work" means a work that combines Covered Software with other material, in a separate file or files, that is not Covered Software. 1.8. "License" means this document. 1.9. "Licensable" means having the right to grant, to the maximum extent possible, whether at the time of the initial grant or subsequently, any and all of the rights conveyed by this License. 1.10. "Modifications" means any of the following: (a) any file in Source Code Form that results from an addition to, deletion from, or modification of the contents of Covered Software; or (b) any new file in Source Code Form that contains any Covered Software. 1.11. "Patent Claims" of a Contributor means any patent claim(s), including without limitation, method, process, and apparatus claims, in any patent Licensable by such Contributor that would be infringed, but for the grant of the License, by the making, using, selling, offering for sale, having made, import, or transfer of either its Contributions or its Contributor Version. 1.12. "Secondary License" means either the GNU General Public License, Version 2.0, the GNU Lesser General Public License, Version 2.1, the GNU Affero General Public License, Version 3.0, or any later versions of those licenses. 1.13. "Source Code Form" means the form of the work preferred for making modifications. 1.14. "You" (or "Your") means an individual or a legal entity exercising rights under this License. For legal entities, "You" includes any entity that controls, is controlled by, or is under common control with You. 
For purposes of this definition, "control" means (a) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (b) ownership of more than fifty percent (50%) of the outstanding shares or beneficial ownership of such entity. 2. License Grants and Conditions -------------------------------- 2.1. Grants Each Contributor hereby grants You a world-wide, royalty-free, non-exclusive license: (a) under intellectual property rights (other than patent or trademark) Licensable by such Contributor to use, reproduce, make available, modify, display, perform, distribute, and otherwise exploit its Contributions, either on an unmodified basis, with Modifications, or as part of a Larger Work; and (b) under Patent Claims of such Contributor to make, use, sell, offer for sale, have made, import, and otherwise transfer either its Contributions or its Contributor Version. 2.2. Effective Date The licenses granted in Section 2.1 with respect to any Contribution become effective for each Contribution on the date the Contributor first distributes such Contribution. 2.3. Limitations on Grant Scope The licenses granted in this Section 2 are the only rights granted under this License. No additional rights or licenses will be implied from the distribution or licensing of Covered Software under this License. Notwithstanding Section 2.1(b) above, no patent license is granted by a Contributor: (a) for any code that a Contributor has removed from Covered Software; or (b) for infringements caused by: (i) Your and any other third party's modifications of Covered Software, or (ii) the combination of its Contributions with other software (except as part of its Contributor Version); or (c) under Patent Claims infringed by Covered Software in the absence of its Contributions. 
This License does not grant any rights in the trademarks, service marks, or logos of any Contributor (except as may be necessary to comply with the notice requirements in Section 3.4). 2.4. Subsequent Licenses No Contributor makes additional grants as a result of Your choice to distribute the Covered Software under a subsequent version of this License (see Section 10.2) or under the terms of a Secondary License (if permitted under the terms of Section 3.3). 2.5. Representation Each Contributor represents that the Contributor believes its Contributions are its original creation(s) or it has sufficient rights to grant the rights to its Contributions conveyed by this License. 2.6. Fair Use This License is not intended to limit any rights You have under applicable copyright doctrines of fair use, fair dealing, or other equivalents. 2.7. Conditions Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in Section 2.1. 3. Responsibilities ------------------- 3.1. Distribution of Source Form All distribution of Covered Software in Source Code Form, including any Modifications that You create or to which You contribute, must be under the terms of this License. You must inform recipients that the Source Code Form of the Covered Software is governed by the terms of this License, and how they can obtain a copy of this License. You may not attempt to alter or restrict the recipients' rights in the Source Code Form. 3.2. 
Distribution of Executable Form If You distribute Covered Software in Executable Form then: (a) such Covered Software must also be made available in Source Code Form, as described in Section 3.1, and You must inform recipients of the Executable Form how they can obtain a copy of such Source Code Form by reasonable means in a timely manner, at a charge no more than the cost of distribution to the recipient; and (b) You may distribute such Executable Form under the terms of this License, or sublicense it under different terms, provided that the license for the Executable Form does not attempt to limit or alter the recipients' rights in the Source Code Form under this License. 3.3. Distribution of a Larger Work You may create and distribute a Larger Work under terms of Your choice, provided that You also comply with the requirements of this License for the Covered Software. If the Larger Work is a combination of Covered Software with a work governed by one or more Secondary Licenses, and the Covered Software is not Incompatible With Secondary Licenses, this License permits You to additionally distribute such Covered Software under the terms of such Secondary License(s), so that the recipient of the Larger Work may, at their option, further distribute the Covered Software under the terms of either this License or such Secondary License(s). 3.4. Notices You may not remove or alter the substance of any license notices (including copyright notices, patent notices, disclaimers of warranty, or limitations of liability) contained within the Source Code Form of the Covered Software, except that You may alter any license notices to the extent required to remedy known factual inaccuracies. 3.5. Application of Additional Terms You may choose to offer, and to charge a fee for, warranty, support, indemnity or liability obligations to one or more recipients of Covered Software. However, You may do so only on Your own behalf, and not on behalf of any Contributor. 
You must make it absolutely clear that any such warranty, support, indemnity, or liability obligation is offered by You alone, and You hereby agree to indemnify every Contributor for any liability incurred by such Contributor as a result of warranty, support, indemnity or liability terms You offer. You may include additional disclaimers of warranty and limitations of liability specific to any jurisdiction. 4. Inability to Comply Due to Statute or Regulation --------------------------------------------------- If it is impossible for You to comply with any of the terms of this License with respect to some or all of the Covered Software due to statute, judicial order, or regulation then You must: (a) comply with the terms of this License to the maximum extent possible; and (b) describe the limitations and the code they affect. Such description must be placed in a text file included with all distributions of the Covered Software under this License. Except to the extent prohibited by statute or regulation, such description must be sufficiently detailed for a recipient of ordinary skill to be able to understand it. 5. Termination -------------- 5.1. The rights granted under this License will terminate automatically if You fail to comply with any of its terms. However, if You become compliant, then the rights granted under this License from a particular Contributor are reinstated (a) provisionally, unless and until such Contributor explicitly and finally terminates Your grants, and (b) on an ongoing basis, if such Contributor fails to notify You of the non-compliance by some reasonable means prior to 60 days after You have come back into compliance. 
Moreover, Your grants from a particular Contributor are reinstated on an ongoing basis if such Contributor notifies You of the non-compliance by some reasonable means, this is the first time You have received notice of non-compliance with this License from such Contributor, and You become compliant prior to 30 days after Your receipt of the notice. 5.2. If You initiate litigation against any entity by asserting a patent infringement claim (excluding declaratory judgment actions, counter-claims, and cross-claims) alleging that a Contributor Version directly or indirectly infringes any patent, then the rights granted to You by any and all Contributors for the Covered Software under Section 2.1 of this License shall terminate. 5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user license agreements (excluding distributors and resellers) which have been validly granted by You or Your distributors under this License prior to termination shall survive termination. ************************************************************************ * * * 6. Disclaimer of Warranty * * ------------------------- * * * * Covered Software is provided under this License on an "as is" * * basis, without warranty of any kind, either expressed, implied, or * * statutory, including, without limitation, warranties that the * * Covered Software is free of defects, merchantable, fit for a * * particular purpose or non-infringing. The entire risk as to the * * quality and performance of the Covered Software is with You. * * Should any Covered Software prove defective in any respect, You * * (not any Contributor) assume the cost of any necessary servicing, * * repair, or correction. This disclaimer of warranty constitutes an * * essential part of this License. No use of any Covered Software is * * authorized under this License except under this disclaimer. 
* * * ************************************************************************ ************************************************************************ * * * 7. Limitation of Liability * * -------------------------- * * * * Under no circumstances and under no legal theory, whether tort * * (including negligence), contract, or otherwise, shall any * * Contributor, or anyone who distributes Covered Software as * * permitted above, be liable to You for any direct, indirect, * * special, incidental, or consequential damages of any character * * including, without limitation, damages for lost profits, loss of * * goodwill, work stoppage, computer failure or malfunction, or any * * and all other commercial damages or losses, even if such party * * shall have been informed of the possibility of such damages. This * * limitation of liability shall not apply to liability for death or * * personal injury resulting from such party's negligence to the * * extent applicable law prohibits such limitation. Some * * jurisdictions do not allow the exclusion or limitation of * * incidental or consequential damages, so this exclusion and * * limitation may not apply to You. * * * ************************************************************************ 8. Litigation ------------- Any litigation relating to this License may be brought only in the courts of a jurisdiction where the defendant maintains its principal place of business and such litigation shall be governed by laws of that jurisdiction, without reference to its conflict-of-law provisions. Nothing in this Section shall prevent a party's ability to bring cross-claims or counter-claims. 9. Miscellaneous ---------------- This License represents the complete agreement concerning the subject matter hereof. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable. 
Any law or regulation which provides that the language of a contract shall be construed against the drafter shall not be used to construe this License against a Contributor. 10. Versions of the License --------------------------- 10.1. New Versions Mozilla Foundation is the license steward. Except as provided in Section 10.3, no one other than the license steward has the right to modify or publish new versions of this License. Each version will be given a distinguishing version number. 10.2. Effect of New Versions You may distribute the Covered Software under the terms of the version of the License under which You originally received the Covered Software, or under the terms of any subsequent version published by the license steward. 10.3. Modified Versions If you create software not governed by this License, and you want to create a new license for such software, you may create and use a modified version of this License if you rename the license and remove any references to the name of the license steward (except to note that such modified license differs from this License). 10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses If You choose to distribute Source Code Form that is Incompatible With Secondary Licenses under the terms of this version of the License, the notice described in Exhibit B of this License must be attached. Exhibit A - Source Code Form License Notice ------------------------------------------- This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/. If it is not possible or desirable to put the notice in a particular file, then You may include the notice in a location (such as a LICENSE file in a relevant directory) where a recipient would be likely to look for such a notice. You may add additional accurate notices of copyright ownership. 
Exhibit B - "Incompatible With Secondary Licenses" Notice --------------------------------------------------------- This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0. adblock-0.8.12/README.md000064400000000000000000000067741046102023000126100ustar 00000000000000# adblock-rust [![crates.io](https://img.shields.io/crates/v/adblock.svg)](https://crates.io/crates/adblock) [![npmjs.com](https://img.shields.io/npm/v/adblock-rs.svg)](https://www.npmjs.com/package/adblock-rs) [![docs.rs](https://docs.rs/adblock/badge.svg)](https://docs.rs/adblock) ![Build Status](https://github.com/brave/adblock-rust/actions/workflows/ci.yml/badge.svg) [![License](https://img.shields.io/badge/License-MPL--2.0-blue)](LICENSE) ### _Putting you back in control of your browsing experience._ `adblock-rust` is the engine powering [Brave](https://brave.com)'s native adblocker, available as a library for anyone to use. It features: - Network blocking - Cosmetic filtering - Resource replacements - Hosts syntax - uBlock Origin syntax extensions - iOS content-blocking syntax conversion - Compiling to native code or WASM - Rust bindings ([crates](https://crates.io/crates/adblock)) - JS bindings ([npm](https://npmjs.com/adblock-rs)) - Community-maintained Python bindings ([pypi](https://pypi.org/project/adblock/)) - High performance! ## Getting started `adblock-rust` is used in several projects, including browsers, research tools, and proxies. It may be a good fit for yours, too! See [docs.rs](https://docs.rs/adblock) for detailed API documentation. Also check the [Rust example](./examples/example.rs) or the [NodeJS example](./js/example.js). ### Optional features The following `cargo` [features](https://doc.rust-lang.org/cargo/reference/features.html) can be used to tweak `adblock-rust` to best fit your use-case. 
#### CSS validation during rule parsing (`css-validation`) When parsing cosmetic filter rules, it's possible to include a built-in implementation of CSS validation (through the [selectors](https://crates.io/crates/selectors) and [cssparser](https://crates.io/crates/cssparser) crates) by enabling the `css-validation` feature. This will cause `adblock-rust` to reject cosmetic filter rules with invalid CSS syntax. #### Content blocking format translation (`content-blocking`) Enabling the `content-blocking` feature gives `adblock-rust` support for conversion of standard ABP-style rules into Apple's [content-blocking format](https://developer.apple.com/documentation/safariservices/creating_a_content_blocker), which can be exported for use on iOS and macOS platforms. #### External domain resolution (`embedded-domain-resolver`) By default, `adblock-rust` ships with a built-in domain resolution implementation (through the [addr](https://crates.io/crates/addr) crate) that will generally suffice for standalone use-cases. For more advanced use-cases, disabling the `embedded-domain-resolver` feature will allow `adblock-rust` to use an external domain resolution implementation instead. This is extremely useful to reduce binary bloat and improve consistency when embedding `adblock-rust` within a browser. #### Parsing resources from uBlock Origin's formats (`resource-assembler`) `adblock-rust` uses uBlock Origin-compatible resources for scriptlet injection and redirect rules. The `resource-assembler` feature allows `adblock-rust` to parse these resources directly from the file formats used by the uBlock Origin repository. #### Thread safety (`object-pooling`, `unsync-regex-caching`) The `object-pooling` and `unsync-regex-caching` features enable optimizations for rule matching speed and the amount of memory used by the engine. 
These features can be disabled to make the engine `Send + Sync`, although it is recommended to only access the engine on a single thread to maintain optimal performance. adblock-0.8.12/benches/bench_cosmetic_matching.rs000064400000000000000000000116141046102023000201120ustar 00000000000000#![cfg(any())] // This attribute disables the entire module use criterion::*; use adblock::cosmetic_filter_cache::CosmeticFilterCache; use adblock::lists::{parse_filters, FilterFormat}; #[path = "../tests/test_utils.rs"] mod test_utils; use test_utils::rules_from_lists; fn by_hostname(c: &mut Criterion) { let mut group = c.benchmark_group("cosmetic-hostname-match"); group.throughput(Throughput::Elements(1)); group.sample_size(20); group.bench_function("easylist", move |b| { let rules = rules_from_lists(&["data/easylist.to/easylist/easylist.txt"]); let (_, cosmetic_filters) = parse_filters(&rules, false, FilterFormat::Standard); let cfcache = CosmeticFilterCache::from_rules(cosmetic_filters); b.iter(|| cfcache.hostname_cosmetic_resources("google.com")) }); group.bench_function("many lists", move |b| { let rules = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", "data/easylist.to/easylistgermany/easylistgermany.txt", "data/uBlockOrigin/filters.txt", "data/uBlockOrigin/unbreak.txt", ]); let (_, cosmetic_filters) = parse_filters(&rules, false, FilterFormat::Standard); let cfcache = CosmeticFilterCache::from_rules(cosmetic_filters); b.iter(|| cfcache.hostname_cosmetic_resources("google.com")) }); group.bench_function("complex_hostname", move |b| { let rules = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", "data/easylist.to/easylistgermany/easylistgermany.txt", "data/uBlockOrigin/filters.txt", "data/uBlockOrigin/unbreak.txt", ]); let (_, cosmetic_filters) = parse_filters(&rules, false, FilterFormat::Standard); let cfcache = CosmeticFilterCache::from_rules(cosmetic_filters); b.iter(|| cfcache.hostname_cosmetic_resources("ads.serve.1.domain.google.com")) }); 
group.finish(); } fn by_classes_ids(c: &mut Criterion) { let mut group = c.benchmark_group("cosmetic-class-id-match"); group.throughput(Throughput::Elements(1)); group.sample_size(20); group.bench_function("easylist", move |b| { let rules = rules_from_lists(&["data/easylist.to/easylist/easylist.txt"]); let (_, cosmetic_filters) = parse_filters(&rules, false, FilterFormat::Standard); let cfcache = CosmeticFilterCache::from_rules(cosmetic_filters); let exceptions = Default::default(); b.iter(|| { cfcache.hidden_class_id_selectors( &["ad"], &["ad"], &exceptions, ) }) }); group.bench_function("many lists", move |b| { let rules = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", "data/easylist.to/easylistgermany/easylistgermany.txt", "data/uBlockOrigin/filters.txt", "data/uBlockOrigin/unbreak.txt", ]); let (_, cosmetic_filters) = parse_filters(&rules, false, FilterFormat::Standard); let cfcache = CosmeticFilterCache::from_rules(cosmetic_filters); let exceptions = Default::default(); b.iter(|| { cfcache.hidden_class_id_selectors( &["ad"], &["ad"], &exceptions, ) }) }); group.bench_function("many matching classes and ids", move |b| { let rules = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", "data/easylist.to/easylistgermany/easylistgermany.txt", "data/uBlockOrigin/filters.txt", "data/uBlockOrigin/unbreak.txt", ]); let (_, cosmetic_filters) = parse_filters(&rules, false, FilterFormat::Standard); let cfcache = CosmeticFilterCache::from_rules(cosmetic_filters); let exceptions = Default::default(); let class_list = [ "block-bg-advertisement-region-1", "photobox-adbox", "headerad-720", "rscontainer", "rail-article-sponsored", "fbPhotoSnowboxAds", "sidebar_ad_module", "ad-728x90_forum", "commercial-unit-desktop-rhs", "sponsored-editorial", "rr-300x600-ad", "adfoot", "lads", ]; let id_list = [ "footer-adspace", "adsponsored_links_box", "lsadvert-top", "mn", "col-right-ad", "view_ads_bottom_bg_middle", "ad_468x60", "rightAdColumn", "content", 
"rhs_block", "center_col", "header", "advertisingModule160x600", ]; b.iter(|| cfcache.hidden_class_id_selectors(&class_list, &id_list, &exceptions)) }); group.finish(); } criterion_group!(cosmetic_benches, by_hostname, by_classes_ids,); criterion_main!(cosmetic_benches); adblock-0.8.12/benches/bench_matching.rs000064400000000000000000000272351046102023000162320ustar 00000000000000use criterion::*; use serde::{Deserialize, Serialize}; use adblock::Engine; use adblock::blocker::{Blocker, BlockerOptions}; use adblock::request::Request; use adblock::resources::ResourceStorage; use adblock::url_parser::parse_url; #[path = "../tests/test_utils.rs"] mod test_utils; use test_utils::rules_from_lists; #[allow(non_snake_case)] #[derive(Serialize, Deserialize, Clone)] struct TestRequest { frameUrl: String, url: String, cpt: String, } impl From<&TestRequest> for Request { fn from(v: &TestRequest) -> Self { Request::new(&v.url, &v.frameUrl, &v.cpt).unwrap() } } fn load_requests() -> Vec { let requests_str = rules_from_lists(&["data/requests.json"]); let reqs: Vec = requests_str .into_iter() .map(|r| serde_json::from_str(&r)) .filter_map(Result::ok) .collect(); reqs } fn get_blocker(rules: impl IntoIterator>) -> Blocker { let (network_filters, _) = adblock::lists::parse_filters(rules, false, Default::default()); let blocker_options = BlockerOptions { enable_optimizations: true, }; Blocker::new(network_filters, &blocker_options) } fn bench_rule_matching(engine: &Engine, requests: &Vec) -> (u32, u32) { let mut matches = 0; let mut passes = 0; requests.iter().for_each(|r| { let res = engine.check_network_request(&r.into()); if res.matched { matches += 1; } else { passes += 1; } }); // println!("Got {} matches, {} passes, {} errors", matches, passes, errors); (matches, passes) } fn bench_matching_only(blocker: &Blocker, resources: &ResourceStorage, requests: &Vec) -> (u32, u32) { let mut matches = 0; let mut passes = 0; requests.iter().for_each(|parsed| { let check = 
blocker.check(&parsed, resources); if check.matched { matches += 1; } else { passes += 1; } }); // println!("Got {} matches, {} passes", matches, passes); (matches, passes) } fn bench_rule_matching_browserlike( blocker: &Engine, requests: &Vec<(String, String, String, String, bool)>, ) -> (u32, u32) { let mut matches = 0; let mut passes = 0; requests.iter().for_each( |(url, hostname, source_hostname, request_type, third_party)| { let check = blocker.check_network_request(&Request::preparsed( &url, &hostname, &source_hostname, &request_type, *third_party, )); if check.matched { matches += 1; } else { passes += 1; } }, ); // println!("Got {} matches, {} passes", matches, passes); (matches, passes) } fn rule_match(c: &mut Criterion) { let mut group = c.benchmark_group("rule-match"); let requests = load_requests(); let elep_req = requests.clone(); let el_req = requests.clone(); let slim_req = requests.clone(); let requests_len = requests.len() as u64; group.throughput(Throughput::Elements(requests_len)); group.sample_size(10); group.bench_function("el+ep", move |b| { let rules = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", "data/easylist.to/easylist/easyprivacy.txt", ]); let engine = Engine::from_rules(rules, Default::default()); b.iter(|| bench_rule_matching(&engine, &elep_req)) }); group.bench_function("easylist", move |b| { let rules = rules_from_lists(&["data/easylist.to/easylist/easylist.txt"]); let engine = Engine::from_rules(rules, Default::default()); b.iter(|| bench_rule_matching(&engine, &el_req)) }); group.bench_function("slimlist", move |b| { let rules = rules_from_lists(&["data/slim-list.txt"]); let engine = Engine::from_rules(rules, Default::default()); b.iter(|| bench_rule_matching(&engine, &slim_req)) }); group.finish(); } fn rule_match_parsed_el(c: &mut Criterion) { let mut group = c.benchmark_group("rule-match-parsed"); let rules = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", ]); let requests = load_requests(); let 
requests_parsed: Vec<_> = requests .into_iter() .map(|r| Request::new(&r.url, &r.frameUrl, &r.cpt)) .filter_map(Result::ok) .collect(); let requests_len = requests_parsed.len() as u64; let blocker = get_blocker(rules); let resources = ResourceStorage::default(); group.throughput(Throughput::Elements(requests_len)); group.sample_size(10); group.bench_function("easylist", move |b| { b.iter(|| bench_matching_only(&blocker, &resources, &requests_parsed)) }); group.finish(); } fn rule_match_parsed_elep_slimlist(c: &mut Criterion) { let mut group = c.benchmark_group("rule-match-parsed"); let full_rules = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", "data/easylist.to/easylist/easyprivacy.txt", ]); let blocker = get_blocker(full_rules); let resources = ResourceStorage::default(); let requests = load_requests(); let requests_parsed: Vec<_> = requests .into_iter() .map(|r| Request::new(&r.url, &r.frameUrl, &r.cpt)) .filter_map(Result::ok) .collect(); let requests_len = requests_parsed.len() as u64; let slim_rules = rules_from_lists(&["data/slim-list.txt"]); let slim_blocker = get_blocker(slim_rules); let requests_copy = load_requests(); let requests_parsed_copy: Vec<_> = requests_copy .into_iter() .map(|r| Request::new(&r.url, &r.frameUrl, &r.cpt)) .filter_map(Result::ok) .collect(); group.throughput(Throughput::Elements(requests_len)); group.sample_size(10); group.bench_function("el+ep", move |b| { b.iter(|| bench_matching_only(&blocker, &resources, &requests_parsed)) }); let resources = ResourceStorage::default(); group.bench_function("slimlist", move |b| { b.iter(|| bench_matching_only(&slim_blocker, &resources, &requests_parsed_copy)) }); group.finish(); } fn serialization(c: &mut Criterion) { let mut group = c.benchmark_group("blocker-serialization"); group.sample_size(20); group.bench_function("el+ep", move |b| { let full_rules = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", "data/easylist.to/easylist/easyprivacy.txt", ]); let engine = 
Engine::from_rules(full_rules, Default::default()); b.iter(|| assert!(engine.serialize_raw().unwrap().len() > 0)) }); group.bench_function("el", move |b| { let full_rules = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", ]); let engine = Engine::from_rules(full_rules, Default::default()); b.iter(|| assert!(engine.serialize_raw().unwrap().len() > 0)) }); group.bench_function("slimlist", move |b| { let full_rules = rules_from_lists(&["data/slim-list.txt"]); let engine = Engine::from_rules(full_rules, Default::default()); b.iter(|| assert!(engine.serialize_raw().unwrap().len() > 0)) }); group.finish(); } fn deserialization(c: &mut Criterion) { let mut group = c.benchmark_group("blocker-deserialization"); group.sample_size(20); group.bench_function("el+ep", move |b| { let full_rules = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", "data/easylist.to/easylist/easyprivacy.txt", ]); let engine = Engine::from_rules(full_rules, Default::default()); let serialized = engine.serialize_raw().unwrap(); b.iter(|| { let mut deserialized = Engine::default(); assert!(deserialized.deserialize(&serialized).is_ok()); }) }); group.bench_function("el", move |b| { let full_rules = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", ]); let engine = Engine::from_rules(full_rules, Default::default()); let serialized = engine.serialize_raw().unwrap(); b.iter(|| { let mut deserialized = Engine::default(); assert!(deserialized.deserialize(&serialized).is_ok()); }) }); group.bench_function("slimlist", move |b| { let full_rules = rules_from_lists(&["data/slim-list.txt"]); let engine = Engine::from_rules(full_rules, Default::default()); let serialized = engine.serialize_raw().unwrap(); b.iter(|| { let mut deserialized = Engine::default(); assert!(deserialized.deserialize(&serialized).is_ok()); }) }); group.finish(); } fn rule_match_browserlike_comparable(c: &mut Criterion) { let mut group = c.benchmark_group("rule-match-browserlike"); let requests = 
load_requests(); let requests_len = requests.len() as u64; group.throughput(Throughput::Elements(requests_len)); group.sample_size(20); fn requests_parsed( requests: &[TestRequest], ) -> Vec<(String, String, String, String, bool)> { requests .iter() .map(|r| { let url_norm = r.url.to_ascii_lowercase(); let source_url_norm = r.frameUrl.to_ascii_lowercase(); let maybe_parsed_url = parse_url(&url_norm); if maybe_parsed_url.is_none() { return Err("bad url"); } let parsed_url = maybe_parsed_url.unwrap(); let maybe_parsed_source = parse_url(&source_url_norm); if let Some(parsed_source) = maybe_parsed_source { Ok(( parsed_url.url.to_owned(), parsed_url.hostname().to_owned(), parsed_source.hostname().to_owned(), r.cpt.clone(), parsed_source.domain() != parsed_url.domain(), )) } else { Ok(( parsed_url.url.to_owned(), parsed_url.hostname().to_owned(), "".to_owned(), r.cpt.clone(), true, )) } }) .filter_map(Result::ok) .collect::>() } let elep_req = requests_parsed(&requests); let el_req = elep_req.clone(); let slim = elep_req.clone(); group.bench_function("el+ep", move |b| { let rules = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", "data/easylist.to/easylist/easyprivacy.txt", ]); let engine = Engine::from_rules_parametrised(rules, Default::default(), false, true); b.iter(|| bench_rule_matching_browserlike(&engine, &elep_req)) }); group.bench_function("el", move |b| { let rules = rules_from_lists(&["data/easylist.to/easylist/easylist.txt"]); let engine = Engine::from_rules_parametrised(rules, Default::default(), false, true); b.iter(|| bench_rule_matching_browserlike(&engine, &el_req)) }); group.bench_function("slimlist", move |b| { let rules = rules_from_lists(&["data/slim-list.txt"]); let engine = Engine::from_rules_parametrised(rules, Default::default(), false, true); b.iter(|| bench_rule_matching_browserlike(&engine, &slim)) }); group.finish(); } criterion_group!( benches, rule_match, rule_match_parsed_el, rule_match_parsed_elep_slimlist, 
rule_match_browserlike_comparable, serialization, deserialization ); criterion_main!(benches); adblock-0.8.12/benches/bench_redirect_performance.rs000064400000000000000000000214231046102023000206130ustar 00000000000000use criterion::*; use tokio::runtime::Runtime; use adblock::blocker::{Blocker, BlockerOptions}; use adblock::filters::network::{NetworkFilter, NetworkFilterMask}; use adblock::request::Request; use adblock::resources::ResourceStorage; const DEFAULT_LISTS_URL: &str = "https://raw.githubusercontent.com/brave/adblock-resources/master/filter_lists/list_catalog.json"; async fn get_all_filters() -> Vec { use futures::FutureExt; #[derive(serde::Serialize, serde::Deserialize)] struct ComponentDescriptor { sources: Vec, } #[derive(serde::Serialize, serde::Deserialize)] struct SourceDescriptor { url: String, } let default_components = reqwest::get(DEFAULT_LISTS_URL) .then(|resp| resp.expect("Could not get default filter listing").text()) .map(|text| { serde_json::from_str::>( &text.expect("Could not get default filter listing as text"), ) .expect("Could not parse default filter listing JSON") }) .await; let filters_fut: Vec<_> = default_components[0] .sources .iter() .map(|list| { reqwest::get(&list.url) .then(|resp| resp.expect("Could not request rules").text()) .map(|text| { text.expect("Could not get rules as text") .lines() .map(|s| s.to_owned()) .collect::>() }) }) .collect(); futures::future::join_all(filters_fut) .await .iter() .flatten() .cloned() .collect() } /// Gets all rules with redirects, and modifies them to apply to resources at `a{0-n}.com/bad.js` fn get_redirect_rules() -> Vec { let async_runtime = Runtime::new().expect("Could not start Tokio runtime"); let filters = async_runtime.block_on(get_all_filters()); let (network_filters, _) = adblock::lists::parse_filters(&filters, true, Default::default()); network_filters .into_iter() .filter(NetworkFilter::is_redirect) .filter(NetworkFilter::also_block_redirect) .filter(|rule| { 
rule.modifier_option.as_ref().unwrap() != "none" }) .enumerate() .map(|(index, mut rule)| { rule.mask.insert(NetworkFilterMask::IS_LEFT_ANCHOR); rule.mask.insert(NetworkFilterMask::IS_RIGHT_ANCHOR); rule.hostname = Some(format!("a{}.com/bad.js", index)); rule.filter = adblock::filters::network::FilterPart::Empty; rule.mask.remove(NetworkFilterMask::IS_HOSTNAME_ANCHOR); rule.mask.remove(NetworkFilterMask::IS_HOSTNAME_REGEX); rule.mask.remove(NetworkFilterMask::IS_REGEX); rule.mask.remove(NetworkFilterMask::IS_COMPLETE_REGEX); rule }) .collect() } /// Loads the supplied rules, and the test set of resources, into a Blocker fn get_preloaded_blocker(rules: Vec) -> Blocker { let blocker_options = BlockerOptions { enable_optimizations: true, }; let blocker = Blocker::new(rules, &blocker_options); blocker } fn build_resources_for_filters(#[allow(unused)] filters: &[NetworkFilter]) -> ResourceStorage { let mut resources = ResourceStorage::default(); #[cfg(feature = "resource-assembler")] { use std::path::Path; use adblock::resources::resource_assembler::assemble_web_accessible_resources; let mut resource_data = assemble_web_accessible_resources( Path::new("data/test/fake-uBO-files/web_accessible_resources"), Path::new("data/test/fake-uBO-files/redirect-resources.js"), ); #[allow(deprecated)] resource_data.append( &mut adblock::resources::resource_assembler::assemble_scriptlet_resources(Path::new( "data/test/fake-uBO-files/scriptlets.js", )), ); resource_data .into_iter() .for_each(|resource| { let _res = resources.add_resource(resource); }); } #[cfg(not(feature = "resource-assembler"))] { use adblock::resources::{Resource, ResourceType, MimeType}; filters .iter() .filter(|f| f.is_redirect()) .map(|f| { let mut redirect = f.modifier_option.as_ref().unwrap().as_str(); // strip priority, if present if let Some(i) = redirect.rfind(':') { redirect = &redirect[0..i]; } Resource { name: redirect.to_owned(), aliases: vec![], kind: 
ResourceType::Mime(MimeType::from_extension(&redirect)), content: base64::encode(redirect), dependencies: vec![], permission: Default::default(), } }) .for_each(|resource| { let _res = resources.add_resource(resource); }); } resources } /// Maps network filter rules into `Request`s that would trigger those rules pub fn build_custom_requests(rules: Vec) -> Vec { rules .iter() .map(|rule| { let raw_type = if rule.mask.contains(NetworkFilterMask::FROM_IMAGE) { "image" } else if rule.mask.contains(NetworkFilterMask::FROM_MEDIA) { "media" } else if rule.mask.contains(NetworkFilterMask::FROM_OBJECT) { "object" } else if rule.mask.contains(NetworkFilterMask::FROM_OTHER) { "other" } else if rule.mask.contains(NetworkFilterMask::FROM_PING) { "ping" } else if rule.mask.contains(NetworkFilterMask::FROM_SCRIPT) { "script" } else if rule.mask.contains(NetworkFilterMask::FROM_STYLESHEET) { "stylesheet" } else if rule.mask.contains(NetworkFilterMask::FROM_SUBDOCUMENT) { "subdocument" } else if rule.mask.contains(NetworkFilterMask::FROM_DOCUMENT) { "main_frame" } else if rule.mask.contains(NetworkFilterMask::FROM_XMLHTTPREQUEST) { "xhr" } else if rule.mask.contains(NetworkFilterMask::FROM_WEBSOCKET) { "websocket" } else if rule.mask.contains(NetworkFilterMask::FROM_FONT) { "font" } else { unreachable!() }; let rule_hostname = rule.hostname.clone().unwrap(); let url = format!("https://{}", rule_hostname.clone()); let domain = &rule_hostname[..rule_hostname.find('/').unwrap()]; let hostname = domain; let raw_line = rule.raw_line.clone().unwrap(); let source_hostname = if rule.opt_domains.is_some() { let domain_start = raw_line.rfind("domain=").unwrap() + "domain=".len(); let from_start = &raw_line[domain_start..]; let domain_end = from_start .find('|') .or_else(|| from_start.find(",")) .or_else(|| Some(from_start.len())) .unwrap() + domain_start; let source_hostname = &raw_line[domain_start..domain_end]; source_hostname } else if rule.mask.contains(NetworkFilterMask::THIRD_PARTY) { 
"always-third-party.com" } else { hostname }; let source_url = format!("https://{}", source_hostname); Request::new( &url, &source_url, raw_type, ).unwrap() }) .collect::>() } fn bench_fn(blocker: &Blocker, resources: &ResourceStorage, requests: &[Request]) { requests.iter().for_each(|request| { let block_result = blocker.check(&request, &resources); assert!(block_result.redirect.is_some(), "{:?}, {:?}", request, block_result); }); } fn redirect_performance(c: &mut Criterion) { let mut group = c.benchmark_group("redirect_performance"); let rules = get_redirect_rules(); let blocker = get_preloaded_blocker(rules.clone()); let resources = build_resources_for_filters(&rules); let requests = build_custom_requests(rules.clone()); let requests_len = requests.len() as u64; group.throughput(Throughput::Elements(requests_len)); group.sample_size(10); group.bench_function("without_alias_lookup", move |b| { b.iter(|| bench_fn(&blocker, &resources, &requests)) }); group.finish(); } criterion_group!(benches, redirect_performance,); criterion_main!(benches); adblock-0.8.12/benches/bench_regex.rs000064400000000000000000000044201046102023000155410ustar 00000000000000use criterion::*; use regex::{bytes::Regex as BytesRegex, Regex, RegexSet}; fn bench_simple_regexes(c: &mut Criterion) { let mut group = c.benchmark_group("regex"); let pattern = "?/static/adv/foobar/asd?q=1"; let rules = vec![ Regex::new(r"(?:[^\\w\\d\\._%-])/static/ad-").unwrap(), Regex::new(r"(?:[^\\w\\d\\._%-])/static/ad/.*").unwrap(), Regex::new(r"(?:[^\\w\\d\\._%-])/static/ads/.*").unwrap(), Regex::new(r"(?:[^\\w\\d\\._%-])/static/adv/.*").unwrap(), ]; group.bench_function("list", move |b| { b.iter(|| { for rule in rules.iter() { if rule.is_match(&pattern) { true; } else { false; } } }) }); group.finish(); } fn bench_joined_regex(c: &mut Criterion) { let mut group = c.benchmark_group("regex"); let pattern = "?/static/adv/foobar/asd?q=1"; let rule = 
Regex::new(r"(?:([^\\w\\d\\._%-])/static/ad-)|(?:([^\\w\\d\\._%-])/static/ad/.*)(?:([^\\w\\d\\._%-])/static/ads/.*)(?:([^\\w\\d\\._%-])/static/adv/.*)").unwrap(); group.bench_function("joined", move |b| b.iter(|| rule.is_match(&pattern))); group.finish(); } fn bench_joined_bytes_regex(c: &mut Criterion) { let mut group = c.benchmark_group("regex"); let pattern = "?/static/adv/foobar/asd?q=1"; let rule = BytesRegex::new(r"(?:([^\\w\\d\\._%-])/static/ad-)|(?:([^\\w\\d\\._%-])/static/ad/.*)(?:([^\\w\\d\\._%-])/static/ads/.*)(?:([^\\w\\d\\._%-])/static/adv/.*)").unwrap(); group.bench_function("u8", move |b| b.iter(|| rule.is_match(pattern.as_bytes()))); group.finish(); } fn bench_regex_set(c: &mut Criterion) { let mut group = c.benchmark_group("regex"); let pattern = "?/static/adv/foobar/asd?q=1"; let set = RegexSet::new(&[ r"(?:[^\\w\\d\\._%-])/static/ad-", r"(?:[^\\w\\d\\._%-])/static/ad/.*", r"(?:[^\\w\\d\\._%-])/static/ads/.*", r"(?:[^\\w\\d\\._%-])/static/adv/.*", ]) .unwrap(); group.bench_function("set", move |b| b.iter(|| set.is_match(&pattern))); group.finish(); } criterion_group!( benches, bench_simple_regexes, bench_joined_regex, bench_joined_bytes_regex, bench_regex_set ); criterion_main!(benches); adblock-0.8.12/benches/bench_rules.rs000064400000000000000000000057151046102023000155710ustar 00000000000000use criterion::*; use once_cell::sync::Lazy; use adblock::blocker::{Blocker, BlockerOptions}; #[path = "../tests/test_utils.rs"] mod test_utils; use test_utils::rules_from_lists; static DEFAULT_LISTS: Lazy> = Lazy::new(|| { rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", ]).collect() }); fn bench_string_hashing(filters: &Vec) -> adblock::utils::Hash { let mut dummy: adblock::utils::Hash = 0; for filter in filters { dummy = (dummy + adblock::utils::fast_hash(filter)) % 1000000000; } dummy } fn bench_string_tokenize(filters: &Vec) -> usize { let mut dummy: usize = 0; for filter in filters { dummy = (dummy + adblock::utils::tokenize(filter).len()) 
% 1000000000; } dummy } fn string_hashing(c: &mut Criterion) { let mut group = c.benchmark_group("string-hashing"); group.throughput(Throughput::Elements(1)); group.bench_function("hash", move |b| { b.iter(|| bench_string_hashing(&DEFAULT_LISTS)) }); group.finish(); } fn string_tokenize(c: &mut Criterion) { let mut group = c.benchmark_group("string-tokenize"); group.throughput(Throughput::Elements(1)); group.bench_function("tokenize", move |b| { b.iter(|| bench_string_tokenize(&DEFAULT_LISTS)) }); group.finish(); } fn bench_parsing_impl(lists: &Vec<&Vec>) -> usize { let mut dummy = 0; for list in lists { let (network_filters, _) = adblock::lists::parse_filters(*list, false, Default::default()); dummy = dummy + network_filters.len() % 1000000; } dummy } fn list_parse(c: &mut Criterion) { let mut group = c.benchmark_group("parse-filters"); group.throughput(Throughput::Elements(1)); group.sample_size(10); group.bench_function("network filters", |b| { b.iter(|| bench_parsing_impl(&vec![DEFAULT_LISTS.as_ref()])) }); group.bench_function("all filters", |b| { b.iter(|| bench_parsing_impl(&vec![DEFAULT_LISTS.as_ref()])) }); group.finish(); } fn get_blocker(rules: impl IntoIterator>) -> Blocker { let (network_filters, _) = adblock::lists::parse_filters(rules, false, Default::default()); println!("Got {} network filters", network_filters.len()); let blocker_options = BlockerOptions { enable_optimizations: true, }; Blocker::new(network_filters, &blocker_options) } fn blocker_new(c: &mut Criterion) { let mut group = c.benchmark_group("blocker_new"); group.throughput(Throughput::Elements(1)); group.sample_size(10); let rules: Vec<_> = rules_from_lists(&[ "data/easylist.to/easylist/easylist.txt", "data/easylist.to/easylist/easyprivacy.txt", ]).collect(); group.bench_function("el+ep", move |b| b.iter(|| get_blocker(&rules))); group.finish(); } criterion_group!( benches, blocker_new, list_parse, string_hashing, string_tokenize ); criterion_main!(benches); 
adblock-0.8.12/benches/bench_url.rs000064400000000000000000000051611046102023000152340ustar 00000000000000use criterion::*; use serde::{Deserialize, Serialize}; use adblock::request::Request; use adblock::url_parser::parse_url; #[path = "../tests/test_utils.rs"] mod test_utils; use test_utils::rules_from_lists; #[allow(non_snake_case)] #[derive(Serialize, Deserialize, Clone)] struct TestRequest { frameUrl: String, url: String, cpt: String, } fn load_requests() -> Vec { rules_from_lists(&["data/requests.json"]) .into_iter() .map(|r| serde_json::from_str(&r)) .filter_map(Result::ok) .collect::>() } fn request_parsing_throughput(c: &mut Criterion) { let mut group = c.benchmark_group("throughput-request"); let requests = load_requests(); let requests_len = requests.len() as u64; group.throughput(Throughput::Elements(requests_len)); group.sample_size(10); group.bench_function("create", move |b| { b.iter(|| { let mut successful = 0; requests.iter().for_each(|r| { let req: Result = Request::new(&r.url, &r.frameUrl, &r.cpt); if req.is_ok() { successful += 1; } }) }) }); group.finish(); } fn request_extract_hostname(c: &mut Criterion) { let mut group = c.benchmark_group("throughput-request"); let requests = load_requests(); let requests_len = requests.len() as u64; group.throughput(Throughput::Elements(requests_len)); group.sample_size(10); group.bench_function("hostname+domain extract", move |b| { b.iter(|| { let mut successful = 0; requests.iter().for_each(|r| { if parse_url(&r.url).is_some() { successful += 1; } if parse_url(&r.frameUrl).is_some() { successful += 1; } }); }) }); group.finish(); } fn request_new_throughput(c: &mut Criterion) { let mut group = c.benchmark_group("throughput-request"); let requests = load_requests(); let requests_len = requests.len() as u64; group.throughput(Throughput::Elements(requests_len)); group.sample_size(10); group.bench_function("new", move |b| { b.iter(|| { let mut successful = 0; requests.iter().for_each(|r| { Request::new(&r.url, 
&r.frameUrl, &r.cpt).ok(); successful += 1; }); }) }); group.finish(); } criterion_group!( benches, request_new_throughput, request_extract_hostname, request_parsing_throughput ); criterion_main!(benches); adblock-0.8.12/rust-toolchain.toml000064400000000000000000000000371046102023000151630ustar 00000000000000[toolchain] channel = "stable" adblock-0.8.12/src/blocker.rs000064400000000000000000002741101046102023000140760ustar 00000000000000//! Holds [`Blocker`], which handles all network-based adblocking queries. use memchr::{memchr as find_char, memrchr as find_char_reverse}; use once_cell::sync::Lazy; use std::ops::DerefMut; use serde::{Deserialize, Serialize}; use std::sync::Arc; use std::collections::{HashMap, HashSet}; use thiserror::Error; #[cfg(feature = "object-pooling")] use lifeguard::Pool; use crate::filters::network::{NetworkFilter, NetworkMatchable}; use crate::regex_manager::{RegexManager, RegexManagerDiscardPolicy}; use crate::request::Request; use crate::utils::{fast_hash, Hash}; use crate::optimizer; use crate::resources::ResourceStorage; use crate::utils; /// Options used when constructing a [`Blocker`]. pub struct BlockerOptions { pub enable_optimizations: bool, } /// Describes how a particular network request should be handled. #[derive(Debug, Serialize)] pub struct BlockerResult { /// Was a blocking filter matched for this request? pub matched: bool, /// Important is used to signal that a rule with the `important` option /// matched. An `important` match means that exceptions should not apply /// and no further checking is neccesary--the request should be blocked /// (empty body or cancelled). /// /// Brave Browser keeps multiple instances of [`Blocker`], so `important` /// here is used to correct behaviour between them: checking should stop /// instead of moving to the next instance iff an `important` rule matched. pub important: bool, /// Specifies what to load instead of the original request, rather than /// just blocking it outright. 
This can come from a filter with a `redirect` /// or `redirect-rule` option. If present, the field will contain the body /// of the redirect to be injected. /// /// Note that the presence of a redirect does _not_ imply that the request /// should be blocked. The `redirect-rule` option can produce a redirection /// that's only applied if another blocking filter matches a request. pub redirect: Option, /// `removeparam` may remove URL parameters. If the original request URL was /// modified at all, the new version will be here. This should be used /// as long as the request is not blocked. pub rewritten_url: Option, /// Contains a string representation of any matched exception rule. /// Effectively this means that there was a match, but the request should /// not be blocked. /// /// If debugging was _not_ enabled (see [`crate::FilterSet::new`]), this /// will only contain a constant `"NetworkFilter"` placeholder string. pub exception: Option, /// When `matched` is true, this contains a string representation of the /// matched blocking rule. /// /// If debugging was _not_ enabled (see [`crate::FilterSet::new`]), this /// will only contain a constant `"NetworkFilter"` placeholder string. pub filter: Option, } impl Default for BlockerResult { fn default() -> BlockerResult { BlockerResult { matched: false, important: false, redirect: None, rewritten_url: None, exception: None, filter: None, } } } /// Possible errors when adding a filter to a [`Blocker`]. 
#[derive(Debug, Error, PartialEq)]
pub enum BlockerError {
    /// Returned by [`Blocker::add_filter`] when the filter reports `is_badfilter()`;
    /// such filters cannot be added to an existing [`Blocker`] one at a time.
    #[error("$badfilter cannot be added (unsupported)")]
    BadFilterAddUnsupported,
    /// Returned by [`Blocker::add_filter`] when [`Blocker::filter_exists`] reports that the
    /// filter is already present.
    #[error("filter already exists")]
    FilterExists,
}
pub fn check(&self, request: &Request, resources: &ResourceStorage) -> BlockerResult { self.check_parameterised(request, resources, false, false) } #[cfg(feature = "unsync-regex-caching")] fn borrow_regex_manager(&self) -> std::cell::RefMut { #[allow(unused_mut)] let mut manager = self.regex_manager.borrow_mut(); #[cfg(not(target_arch = "wasm32"))] manager.update_time(); manager } #[cfg(not(feature = "unsync-regex-caching"))] fn borrow_regex_manager(&self) -> std::sync::MutexGuard { let mut manager = self.regex_manager.lock().unwrap(); manager.update_time(); manager } pub fn check_generic_hide(&self, hostname_request: &Request) -> bool { let mut regex_manager = self.borrow_regex_manager(); let mut request_tokens; #[cfg(feature = "object-pooling")] { request_tokens = self.pool.pool.new(); } #[cfg(not(feature = "object-pooling"))] { request_tokens = Vec::with_capacity(utils::TOKENS_BUFFER_SIZE); } hostname_request.get_tokens(&mut request_tokens); self.generic_hide .check( hostname_request, &request_tokens, &HashSet::new(), &mut regex_manager, ) .is_some() } pub fn check_parameterised( &self, request: &Request, resources: &ResourceStorage, matched_rule: bool, force_check_exceptions: bool, ) -> BlockerResult { let mut regex_manager = self.borrow_regex_manager(); if !request.is_supported { return BlockerResult::default(); } let mut request_tokens; #[cfg(feature = "object-pooling")] { request_tokens = self.pool.pool.new(); } #[cfg(not(feature = "object-pooling"))] { request_tokens = Vec::with_capacity(utils::TOKENS_BUFFER_SIZE); } request.get_tokens(&mut request_tokens); // Check the filters in the following order: // 1. $important (not subject to exceptions) // 2. redirection ($redirect=resource) // 3. normal filters - if no match by then // 4. 
exceptions - if any non-important match of forced // Always check important filters let important_filter = self.importants.check( request, &request_tokens, &NO_TAGS, &mut regex_manager, ); // only check the rest of the rules if not previously matched let filter = if important_filter.is_none() && !matched_rule { self.filters_tagged .check( request, &request_tokens, &self.tags_enabled, &mut regex_manager, ) .or_else(|| { self.filters.check( request, &request_tokens, &NO_TAGS, &mut regex_manager, ) }) } else { important_filter }; let exception = match filter.as_ref() { // if no other rule matches, only check exceptions if forced to None if matched_rule || force_check_exceptions => { self.exceptions.check( request, &request_tokens, &self.tags_enabled, &mut regex_manager, ) } None => None, // If matched an important filter, exceptions don't atter Some(f) if f.is_important() => None, Some(_) => { self.exceptions.check( request, &request_tokens, &self.tags_enabled, &mut regex_manager, ) } }; let redirect_filters = self.redirects.check_all( request, &request_tokens, &NO_TAGS, regex_manager.deref_mut(), ); // Extract the highest priority redirect directive. // 1. Exceptions - can bail immediately if found // 2. 
Find highest priority non-exception redirect let redirect_resource = { let mut exceptions = vec![]; for redirect_filter in redirect_filters.iter() { if redirect_filter.is_exception() { if let Some(redirect) = redirect_filter.modifier_option.as_ref() { exceptions.push(redirect); } } } let mut resource_and_priority = None; for redirect_filter in redirect_filters.iter() { if !redirect_filter.is_exception() { if let Some(redirect) = redirect_filter.modifier_option.as_ref() { if !exceptions.contains(&redirect) { // parse redirect + priority let (resource, priority) = if let Some(idx) = find_char_reverse(b':', redirect.as_bytes()) { let priority_str = &redirect[idx + 1..]; let resource = &redirect[..idx]; if let Ok(priority) = priority_str.parse::() { (resource, priority) } else { (&redirect[..], 0) } } else { (&redirect[..], 0) }; if let Some((_, p1)) = resource_and_priority { if priority > p1 { resource_and_priority = Some((resource, priority)); } } else { resource_and_priority = Some((resource, priority)); } } } } } resource_and_priority.map(|(r, _)| r) }; let redirect: Option = redirect_resource.and_then(|resource_name| { resources.get_redirect_resource(resource_name).or_else(|| { // It's acceptable to pass no redirection if no matching resource is loaded. // TODO - it may be useful to return a status flag to indicate that this occurred. 
#[cfg(test)] eprintln!("Matched rule with redirect option but did not find corresponding resource to send"); None }) }); let important = filter.is_some() && filter.as_ref().map(|f| f.is_important()).unwrap_or_else(|| false); let rewritten_url = if important { None } else { Self::apply_removeparam( &self.removeparam, request, &request_tokens, regex_manager.deref_mut(), ) }; // If something has already matched before but we don't know what, still return a match let matched = exception.is_none() && (filter.is_some() || matched_rule); BlockerResult { matched, important, redirect, rewritten_url, exception: exception.as_ref().map(|f| f.to_string()), // copy the exception filter: filter.as_ref().map(|f| f.to_string()), // copy the filter } } fn apply_removeparam( removeparam_filters: &NetworkFilterList, request: &Request, request_tokens: &[Hash], regex_manager: &mut RegexManager, ) -> Option { /// Represents an `&`-separated argument from a URL query parameter string enum QParam<'a> { /// Just a key, e.g. `...&key&...` KeyOnly(&'a str), /// Key-value pair separated by an equal sign, e.g. `...&key=value&...` KeyValue(&'a str, &'a str), } impl<'a> std::fmt::Display for QParam<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::KeyOnly(k) => write!(f, "{}", k), Self::KeyValue(k, v) => write!(f, "{}={}", k, v), } } } let url = &request.original_url; // Only check for removeparam if there's a query string in the request URL if let Some(i) = find_char(b'?', url.as_bytes()) { // String indexing safety: indices come from `.len()` or `find_char` on individual ASCII // characters (1 byte each), some plus 1. 
let params_start = i + 1; let hash_index = if let Some(j) = find_char(b'#', url[params_start..].as_bytes()) { params_start + j } else { url.len() }; let qparams = &url[params_start..hash_index]; let mut params: Vec<(QParam, bool)> = qparams .split('&') .map(|pair| { if let Some((k, v)) = pair.split_once('=') { QParam::KeyValue(k, v) } else { QParam::KeyOnly(pair) } }) .map(|param| (param, true)) .collect(); let filters = removeparam_filters.check_all(request, request_tokens, &NO_TAGS, regex_manager); let mut rewrite = false; for removeparam_filter in filters { if let Some(removeparam) = &removeparam_filter.modifier_option { params.iter_mut().for_each(|(param, include)| { if let QParam::KeyValue(k, v) = param { if !v.is_empty() && k == removeparam { *include = false; rewrite = true; } } }); } } if rewrite { let p = itertools::join(params.into_iter().filter(|(_, include)| *include).map(|(param, _)| param.to_string()), "&"); let new_param_str = if p.is_empty() { String::from("") } else { format!("?{}", p) }; Some(format!("{}{}{}", &url[0..i], new_param_str, &url[hash_index..])) } else { None } } else { None } } /// Given a "main_frame" or "subdocument" request, check if some content security policies /// should be injected in the page. 
pub fn get_csp_directives(&self, request: &Request) -> Option { use crate::request::RequestType; if request.request_type != RequestType::Document && request.request_type != RequestType::Subdocument { return None; } let mut request_tokens; let mut regex_manager = self.borrow_regex_manager(); #[cfg(feature = "object-pooling")] { request_tokens = self.pool.pool.new(); } #[cfg(not(feature = "object-pooling"))] { request_tokens = Vec::with_capacity(utils::TOKENS_BUFFER_SIZE); } request.get_tokens(&mut request_tokens); let filters = self.csp.check_all( request, &request_tokens, &self.tags_enabled, &mut regex_manager, ); if filters.is_empty() { return None; } let mut disabled_directives: HashSet<&str> = HashSet::new(); let mut enabled_directives: HashSet<&str> = HashSet::new(); for filter in filters { if filter.is_exception() { if filter.is_csp() { if let Some(csp_directive) = &filter.modifier_option { disabled_directives.insert(csp_directive); } else { // Exception filters with empty `csp` options will disable all CSP // injections for matching pages. 
return None } } } else if filter.is_csp() { if let Some(csp_directive) = &filter.modifier_option { enabled_directives.insert(csp_directive); } } } let mut remaining_directives = enabled_directives.difference(&disabled_directives); let mut merged = if let Some(directive) = remaining_directives.next() { String::from(*directive) } else { return None; }; remaining_directives.for_each(|directive| { merged.push(','); merged.push_str(directive); }); Some(merged) } pub fn new(network_filters: Vec, options: &BlockerOptions) -> Blocker { // Capacity of filter subsets estimated based on counts in EasyList and EasyPrivacy - if necessary // the Vectors will grow beyond the pre-set capacity, but it is more efficient to allocate all at once // $csp= let mut csp = Vec::with_capacity(200); // @@filter let mut exceptions = Vec::with_capacity(network_filters.len() / 8); // $important let mut importants = Vec::with_capacity(200); // $redirect, $redirect-rule let mut redirects = Vec::with_capacity(200); // $removeparam let mut removeparam = Vec::with_capacity(60); // $tag= let mut tagged_filters_all = Vec::with_capacity(200); // $badfilter let mut badfilters = Vec::with_capacity(100); // $generichide let mut generic_hide = Vec::with_capacity(4000); // All other filters let mut filters = Vec::with_capacity(network_filters.len()); // Injections // TODO: resource handling if !network_filters.is_empty() { for filter in network_filters.iter() { if filter.is_badfilter() { badfilters.push(filter); } } let badfilter_ids: HashSet = badfilters.iter().map(|f| f.get_id_without_badfilter()).collect(); for filter in network_filters { // skip any bad filters let filter_id = filter.get_id(); if badfilter_ids.contains(&filter_id) || filter.is_badfilter() { continue; } // Redirects are independent of blocking behavior. 
if filter.is_redirect() { redirects.push(filter.clone()); } if filter.is_csp() { csp.push(filter); } else if filter.is_removeparam() { removeparam.push(filter); } else if filter.is_generic_hide() { generic_hide.push(filter); } else if filter.is_exception() { exceptions.push(filter); } else if filter.is_important() { importants.push(filter); } else if filter.tag.is_some() && !filter.is_redirect() { // `tag` + `redirect` is unsupported for now. tagged_filters_all.push(filter); } else { if (filter.is_redirect() && filter.also_block_redirect()) || !filter.is_redirect() { filters.push(filter); } } } } tagged_filters_all.shrink_to_fit(); Blocker { csp: NetworkFilterList::new(csp, options.enable_optimizations), exceptions: NetworkFilterList::new(exceptions, options.enable_optimizations), importants: NetworkFilterList::new(importants, options.enable_optimizations), redirects: NetworkFilterList::new(redirects, options.enable_optimizations), // Don't optimize removeparam, since it can fuse filters without respecting distinct // queryparam values removeparam: NetworkFilterList::new(removeparam, false), filters_tagged: NetworkFilterList::new(Vec::new(), options.enable_optimizations), filters: NetworkFilterList::new(filters, options.enable_optimizations), generic_hide: NetworkFilterList::new(generic_hide, options.enable_optimizations), // Tags special case for enabling/disabling them dynamically tags_enabled: HashSet::new(), tagged_filters_all, // Options enable_optimizations: options.enable_optimizations, #[cfg(feature = "object-pooling")] pool: TokenPool::default(), regex_manager: Default::default(), } } /// If optimizations are enabled, the `Blocker` will be configured to automatically optimize /// its filters after batch updates. However, even if they are disabled, it is possible to /// manually call `optimize()`. It may be useful to have finer-grained control over /// optimization scheduling when frequently updating filters. 
pub fn optimize(&mut self) { self.csp.optimize(); self.exceptions.optimize(); self.importants.optimize(); self.redirects.optimize(); // note - don't optimize removeparam self.filters_tagged.optimize(); self.filters.optimize(); self.generic_hide.optimize(); } /// Has this exact filter already been added? Note that this is a best-effort method and may /// miss some filters, especially if optimizations are enabled. pub fn filter_exists(&self, filter: &NetworkFilter) -> bool { if filter.is_csp() { self.csp.filter_exists(filter) } else if filter.is_generic_hide() { self.generic_hide.filter_exists(filter) } else if filter.is_exception() { self.exceptions.filter_exists(filter) } else if filter.is_important() { self.importants.filter_exists(filter) } else if filter.is_redirect() { self.redirects.filter_exists(filter) } else if filter.is_removeparam() { self.removeparam.filter_exists(filter) } else if filter.tag.is_some() { self.tagged_filters_all.iter().any(|f| f.id == filter.id) } else { self.filters.filter_exists(filter) } } /// Add a single filter to this [`Blocker`]. /// /// Filter optimization is skipped when using this method. pub fn add_filter(&mut self, filter: NetworkFilter) -> Result<(), BlockerError> { // Redirects are independent of blocking behavior. 
if filter.is_redirect() { self.redirects.add_filter(filter.clone()); } if filter.is_badfilter() { Err(BlockerError::BadFilterAddUnsupported) } else if self.filter_exists(&filter) { Err(BlockerError::FilterExists) } else if filter.is_csp() { self.csp.add_filter(filter); Ok(()) } else if filter.is_generic_hide() { self.generic_hide.add_filter(filter); Ok(()) } else if filter.is_exception() { self.exceptions.add_filter(filter); Ok(()) } else if filter.is_important() { self.importants.add_filter(filter); Ok(()) } else if filter.is_removeparam() { self.removeparam.add_filter(filter); Ok(()) } else if filter.tag.is_some() && !filter.is_redirect() { // `tag` + `redirect` is unsupported self.tagged_filters_all.push(filter); let tags_enabled = self.tags_enabled().into_iter().collect::>(); self.tags_with_set(tags_enabled); Ok(()) } else if (filter.is_redirect() && filter.also_block_redirect()) || !filter.is_redirect() { self.filters.add_filter(filter); Ok(()) } else { Ok(()) } } pub fn use_tags(&mut self, tags: &[&str]) { let tag_set: HashSet = tags.iter().map(|&t| String::from(t)).collect(); self.tags_with_set(tag_set); } pub fn enable_tags(&mut self, tags: &[&str]) { let tag_set: HashSet = tags.iter().map(|&t| String::from(t)).collect::>() .union(&self.tags_enabled) .cloned() .collect(); self.tags_with_set(tag_set); } pub fn disable_tags(&mut self, tags: &[&str]) { let tag_set: HashSet = self.tags_enabled .difference(&tags.iter().map(|&t| String::from(t)).collect()) .cloned() .collect(); self.tags_with_set(tag_set); } fn tags_with_set(&mut self, tags_enabled: HashSet) { self.tags_enabled = tags_enabled; let filters: Vec = self.tagged_filters_all.iter() .filter(|n| n.tag.is_some() && self.tags_enabled.contains(n.tag.as_ref().unwrap())) .cloned() .collect(); self.filters_tagged = NetworkFilterList::new(filters, self.enable_optimizations); } pub fn tags_enabled(&self) -> Vec { self.tags_enabled.iter().cloned().collect() } pub fn set_regex_discard_policy( &self, 
new_discard_policy: RegexManagerDiscardPolicy ) { let mut regex_manager = self.borrow_regex_manager(); regex_manager.set_discard_policy(new_discard_policy); } #[cfg(feature = "regex-debug-info")] pub fn discard_regex(&self, regex_id: u64) { let mut regex_manager = self.borrow_regex_manager(); regex_manager.discard_regex(regex_id); } #[cfg(feature = "regex-debug-info")] pub fn get_regex_debug_info(&self) -> crate::regex_manager::RegexDebugInfo { let regex_manager = self.borrow_regex_manager(); regex_manager.get_debug_info() } } #[derive(Serialize, Deserialize, Default)] pub(crate) struct NetworkFilterList { #[serde(serialize_with = "crate::data_format::utils::stabilize_hashmap_serialization")] pub(crate) filter_map: HashMap>>, } impl NetworkFilterList { pub fn new(filters: Vec, optimize: bool) -> NetworkFilterList { // Compute tokens for all filters let filter_tokens: Vec<_> = filters .into_iter() .map(|filter| { let tokens = filter.get_tokens(); (Arc::new(filter), tokens) }) .collect(); // compute the tokens' frequency histogram let (total_number_of_tokens, tokens_histogram) = token_histogram(&filter_tokens); // Build a HashMap of tokens to Network Filters (held through Arc, Atomic Reference Counter) let mut filter_map = HashMap::with_capacity(filter_tokens.len()); { for (filter_pointer, multi_tokens) in filter_tokens { for tokens in multi_tokens { let mut best_token: Hash = 0; let mut min_count = total_number_of_tokens + 1; for token in tokens { match tokens_histogram.get(&token) { None => { min_count = 0; best_token = token } Some(&count) if count < min_count => { min_count = count; best_token = token } _ => {} } } insert_dup(&mut filter_map, best_token, Arc::clone(&filter_pointer)); } } } let mut self_ = NetworkFilterList { filter_map, }; if optimize { self_.optimize(); } else { self_.filter_map.shrink_to_fit(); } self_ } pub fn optimize(&mut self) { let mut optimized_map = HashMap::with_capacity(self.filter_map.len()); for (key, filters) in 
self.filter_map.drain() { let mut unoptimized: Vec = Vec::with_capacity(filters.len()); let mut unoptimizable: Vec> = Vec::with_capacity(filters.len()); for f in filters { match Arc::try_unwrap(f) { Ok(f) => unoptimized.push(f), Err(af) => unoptimizable.push(af) } } let mut optimized: Vec<_> = if unoptimized.len() > 1 { optimizer::optimize(unoptimized).into_iter().map(Arc::new).collect() } else { // nothing to optimize unoptimized.into_iter().map(Arc::new).collect() }; optimized.append(&mut unoptimizable); optimized.shrink_to_fit(); optimized_map.insert(key, optimized); } // won't mutate anymore, shrink to fit items optimized_map.shrink_to_fit(); self.filter_map = optimized_map; } pub fn add_filter(&mut self, filter: NetworkFilter) { let filter_tokens = filter.get_tokens(); let total_rules = vec_hashmap_len(&self.filter_map); let filter_pointer = Arc::new(filter); for tokens in filter_tokens { let mut best_token: Hash = 0; let mut min_count = total_rules + 1; for token in tokens { match self.filter_map.get(&token) { None => { min_count = 0; best_token = token } Some(filters) if filters.len() < min_count => { min_count = filters.len(); best_token = token } _ => {} } } insert_dup(&mut self.filter_map, best_token, Arc::clone(&filter_pointer)); } } /// This may not work if the list has been optimized. pub fn filter_exists(&self, filter: &NetworkFilter) -> bool { let mut tokens: Vec<_> = filter.get_tokens().into_iter().flatten().collect(); if tokens.is_empty() { tokens.push(0) } for token in tokens { if let Some(filters) = self.filter_map.get(&token) { for saved_filter in filters { if saved_filter.id == filter.id { return true; } } } } false } /// Returns the first found filter, if any, that matches the given request. The backing storage /// has a non-deterministic order, so this should be used for any category of filters where a /// match from each would be functionally equivalent. 
For example, if two different exception /// filters match a certain request, it doesn't matter _which_ one is matched - the request /// will be excepted either way. pub fn check( &self, request: &Request, request_tokens: &[Hash], active_tags: &HashSet, regex_manager: &mut RegexManager, ) -> Option<&NetworkFilter> { if self.filter_map.is_empty() { return None; } if let Some(source_hostname_hashes) = request.source_hostname_hashes.as_ref() { for token in source_hostname_hashes { if let Some(filter_bucket) = self.filter_map.get(token) { for filter in filter_bucket { // if matched, also needs to be tagged with an active tag (or not tagged at all) if filter.matches(request, regex_manager) && filter .tag .as_ref() .map(|t| active_tags.contains(t)) .unwrap_or(true) { return Some(filter); } } } } } for token in request_tokens { if let Some(filter_bucket) = self.filter_map.get(token) { for filter in filter_bucket { // if matched, also needs to be tagged with an active tag (or not tagged at all) if filter.matches(request, regex_manager) && filter.tag.as_ref().map(|t| active_tags.contains(t)).unwrap_or(true) { return Some(filter); } } } } None } /// Returns _all_ filters that match the given request. This should be used for any category of /// filters where a match from each may carry unique information. For example, if two different /// `$csp` filters match a certain request, they may each carry a distinct CSP directive, and /// each directive should be combined for the final result. 
/// Inserts a value into the `Vec` under the specified key in the `HashMap`. The entry will be
/// created if it does not exist. If it already exists, the value is inserted at the position
/// that keeps the `Vec` sorted.
///
/// A value that compares equal to an existing element (or is incomparable with the element
/// probed by the binary search) is dropped rather than stored twice.
fn insert_dup<K, V, H: std::hash::BuildHasher>(map: &mut HashMap<K, Vec<V>, H>, k: K, v: V)
where
    K: std::cmp::Ord + std::hash::Hash,
    V: PartialOrd,
{
    let entry = map.entry(k).or_default();

    match entry.binary_search_by(|f| f.partial_cmp(&v).unwrap_or(std::cmp::Ordering::Equal)) {
        Ok(_pos) => (), // Can occur if the exact same rule is inserted twice. No reason to add anything.
        Err(slot) => entry.insert(slot, v),
    }
}

/// Returns the total number of values stored across all buckets of `map`.
fn vec_hashmap_len<K: std::cmp::Eq + std::hash::Hash, V, H: std::hash::BuildHasher>(
    map: &HashMap<K, Vec<V>, H>,
) -> usize {
    map.values().map(Vec::len).sum()
}
assert_eq!(histogram.get(&fast_hash("www")), Some(&2));
        }

        // handle the different token set sizes
        {
            let tokens = vec![
                (0, vec![vec![111, 123, 132]]),
                (1, vec![vec![111], vec![123], vec![132]]),
                (2, vec![vec![111, 123], vec![132]]),
                (3, vec![vec![111, 111], vec![111]]),
            ];
            let (total_tokens, histogram) = token_histogram(&tokens);
            assert_eq!(total_tokens, 12);
            assert_eq!(histogram.get(&111), Some(&6));
            assert_eq!(histogram.get(&123), Some(&3));
            assert_eq!(histogram.get(&132), Some(&3));
            // include bad tokens
            assert_eq!(histogram.get(&fast_hash("http")), Some(&12));
            assert_eq!(histogram.get(&fast_hash("www")), Some(&12));
        }
    }

    #[test]
    fn network_filter_list_new_works() {
        {
            let filters = ["||foo.com"];
            let network_filters: Vec<_> = filters
                .into_iter()
                .map(|f| NetworkFilter::parse(&f, true, Default::default()))
                .filter_map(Result::ok)
                .collect();
            let filter_list = NetworkFilterList::new(network_filters, false);
            let maybe_matching_filter = filter_list.filter_map.get(&fast_hash("foo"));
            assert!(maybe_matching_filter.is_some(), "Expected filter not found");
        }
        // chooses least frequent token
        {
            let filters = ["||foo.com", "||bar.com/foo"];
            let network_filters: Vec<_> = filters
                .into_iter()
                .map(|f| NetworkFilter::parse(&f, true, Default::default()))
                .filter_map(Result::ok)
                .collect();
            let filter_list = NetworkFilterList::new(network_filters, false);
            assert_eq!(
                filter_list.filter_map.get(&fast_hash("bar")).unwrap().len(),
                1
            );
            assert_eq!(
                filter_list.filter_map.get(&fast_hash("foo")).unwrap().len(),
                1
            );
        }
        // chooses blacklisted token when no other choice
        {
            let filters = ["||foo.com", "||foo.com/bar", "||www"];
            let network_filters: Vec<_> = filters
                .into_iter()
                .map(|f| NetworkFilter::parse(&f, true, Default::default()))
                .filter_map(Result::ok)
                .collect();
            let filter_list = NetworkFilterList::new(network_filters, false);
            assert!(
                filter_list.filter_map.get(&fast_hash("www")).is_some(),
                "Filter matching {} not found",
                "www"
            );
            assert_eq!(
                filter_list.filter_map.get(&fast_hash("www")).unwrap().len(),
                1
            );
        }
        // uses domain as token when only one domain
        {
            let filters = ["||foo.com", "||foo.com$domain=bar.com"];
            let network_filters: Vec<_> = filters
                .into_iter()
                .map(|f| NetworkFilter::parse(&f, true, Default::default()))
                .filter_map(Result::ok)
                .collect();
            let filter_list = NetworkFilterList::new(network_filters, false);
            assert!(
                filter_list.filter_map.get(&fast_hash("bar.com")).is_some(),
                "Filter matching {} not found",
                "bar.com"
            );
            assert_eq!(
                filter_list
                    .filter_map
                    .get(&fast_hash("bar.com"))
                    .unwrap()
                    .len(),
                1
            );
        }
        // dispatches filter to multiple buckets per domain options if no token in main part
        {
            let filters = ["foo*$domain=bar.com|baz.com"];
            let network_filters: Vec<_> = filters
                .into_iter()
                .map(|f| NetworkFilter::parse(&f, true, Default::default()))
                .filter_map(Result::ok)
                .collect();
            let filter_list = NetworkFilterList::new(network_filters, false);
            assert_eq!(filter_list.filter_map.len(), 2);
            assert!(
                filter_list.filter_map.get(&fast_hash("bar.com")).is_some(),
                "Filter matching {} not found",
                "bar.com"
            );
            assert_eq!(
                filter_list
                    .filter_map
                    .get(&fast_hash("bar.com"))
                    .unwrap()
                    .len(),
                1
            );
            assert!(
                filter_list.filter_map.get(&fast_hash("baz.com")).is_some(),
                "Filter matching {} not found",
                "baz.com"
            );
            assert_eq!(
                filter_list
                    .filter_map
                    .get(&fast_hash("baz.com"))
                    .unwrap()
                    .len(),
                1
            );
        }
    }

    /// Builds an unoptimized `NetworkFilterList` from `filters` (silently skipping any that
    /// fail to parse) and asserts, for each request, whether `check` finds a matching filter.
    fn test_requests_filters(filters: impl IntoIterator<Item = impl AsRef<str>>, requests: &[(Request, bool)]) {
        let network_filters: Vec<_> = filters
            .into_iter()
            .map(|f| NetworkFilter::parse(f.as_ref(), true, Default::default()))
            .filter_map(Result::ok)
            .collect();
        let filter_list = NetworkFilterList::new(network_filters, false);
        let mut regex_manager = RegexManager::default();

        for (req, expected_result) in requests {
            let mut tokens = Vec::new();
            req.get_tokens(&mut tokens);
            let matched_rule = filter_list.check(req, &tokens, &HashSet::new(), &mut regex_manager);
            if *expected_result {
                assert!(matched_rule.is_some(), "Expected match for {}", req.url);
            } else {
                assert!(matched_rule.is_none(), "Expected no match for {}, matched with {}", req.url, matched_rule.unwrap().to_string());
            }
        }
    }

    #[test]
    fn network_filter_list_check_works_plain_filter() {
        // includes cases with fall back to 0 bucket (no tokens from a rule)
        let filters = [
            "foo",
            "-foo-",
            "&fo.o=+_-",
            "foo/bar/baz",
            "com/bar/baz",
            "https://bar.com/bar/baz",
        ];

        let url_results = [
            ("https://bar.com/foo", true),
            ("https://bar.com/baz/foo", true),
            ("https://bar.com/q=foo/baz", true),
            ("https://foo.com", true),
            ("https://bar.com/baz/42-foo-q", true),
            ("https://bar.com?baz=42&fo.o=+_-", true),
            ("https://bar.com/foo/bar/baz", true),
            ("https://bar.com/bar/baz", true),
        ];

        let request_expectations: Vec<_> = url_results
            .into_iter()
            .map(|(url, expected_result)| {
                let request = Request::new(url, "https://example.com", "other").unwrap();
                (request, expected_result)
            })
            .collect();

        test_requests_filters(&filters, &request_expectations);
    }

    #[test]
    fn network_filter_list_check_works_hostname_anchor() {
        let filters = [
            "||foo.com",
            "||bar.com/bar",
            "||coo.baz.",
            "||foo.bar.com^",
            "||foo.baz^",
        ];

        let url_results = [
            ("https://foo.com/bar", true),
            ("https://bar.com/bar", true),
            ("https://baz.com/bar", false),
            ("https://baz.foo.com/bar", true),
            ("https://coo.baz.com/bar", true),
            ("https://foo.bar.com/bar", true),
            ("https://foo.baz.com/bar", false),
            ("https://baz.com", false),
            ("https://foo-bar.baz.com/bar", false),
            ("https://foo.de", false),
            ("https://bar.foo.de", false),
        ];

        let request_expectations: Vec<_> = url_results
            .into_iter()
            .map(|(url, expected_result)| {
                let request = Request::new(url, "https://example.com", "other").unwrap();
                (request, expected_result)
            })
            .collect();

        test_requests_filters(&filters, &request_expectations);
    }

    #[test]
    fn network_filter_list_check_works_unicode() {
        let filters = [
            "||firstrowsports.li/frame/",
            "||fırstrowsports.eu/pu/",
            "||atđhe.net/pu/",
        ];

        let url_results = [
            ("https://firstrowsports.li/frame/bar", true),
            ("https://secondrowsports.li/frame/bar", false),
            ("https://fırstrowsports.eu/pu/foo", true),
            ("https://xn--frstrowsports-39b.eu/pu/foo", true),
            ("https://atđhe.net/pu/foo", true),
            ("https://xn--athe-1ua.net/pu/foo", true),
        ];

        let request_expectations: Vec<_> = url_results
            .into_iter()
            .map(|(url, expected_result)| {
                let request = Request::new(url, "https://example.com", "other").unwrap();
                (request, expected_result)
            }).collect();

        test_requests_filters(&filters, &request_expectations);
    }

    #[test]
    fn network_filter_list_check_works_regex_escaping() {
        let filters = [
            r#"/^https?:\/\/.*(bitly|bit)\.(com|ly)\/.*/$domain=123movies.com|1337x.to"#,
            r#"/\:\/\/data.*\.com\/[a-zA-Z0-9]{30,}/$third-party,xmlhttprequest"#
        ];

        // The requests are already paired with their expected outcome, so no conversion step
        // is needed before handing them to the helper.
        let request_expectations = [
            (
                Request::new("https://bit.ly/bar/", "http://123movies.com", "").unwrap(),
                true,
            ),
            (
                Request::new(
                    "https://data.foo.com/9VjjrjU9Or2aqkb8PDiqTBnULPgeI48WmYEHkYer",
                    "http://123movies.com",
                    "xmlhttprequest",
                )
                .unwrap(),
                true,
            ),
        ];

        test_requests_filters(&filters, &request_expectations);
    }
}

#[cfg(test)]
mod blocker_tests {

    use super::*;
    use crate::lists::parse_filters;
    use crate::resources::Resource;
    use crate::request::Request;
    use std::collections::HashSet;
    use std::iter::FromIterator;

    #[test]
    fn single_slash() {
        let filters = [
            "/|",
        ];

        let (network_filters, _) = parse_filters(filters, true, Default::default());

        let blocker_options = BlockerOptions {
            enable_optimizations: true,
        };

        let blocker = Blocker::new(network_filters, &blocker_options);

        let request = Request::new("https://example.com/test/", "https://example.com", "xmlhttprequest").unwrap();
        assert!(blocker.check(&request, &Default::default()).matched);

        let request = Request::new("https://example.com/test", "https://example.com", "xmlhttprequest").unwrap();
        assert!(!blocker.check(&request,
&Default::default()).matched); } fn test_requests_filters(filters: impl IntoIterator>, requests: &[(Request, bool)]) { let (network_filters, _) = parse_filters(filters, true, Default::default()); let blocker_options: BlockerOptions = BlockerOptions { enable_optimizations: false, // optimizations will reduce number of rules }; let blocker = Blocker::new(network_filters, &blocker_options); requests.iter().for_each(|(req, expected_result)| { let matched_rule = blocker.check(&req, &Default::default()); if *expected_result { assert!(matched_rule.matched, "Expected match for {}", req.url); } else { assert!(!matched_rule.matched, "Expected no match for {}, matched with {:?}", req.url, matched_rule.filter); } }); } #[test] fn redirect_blocking_exception() { let filters = [ "||imdb-video.media-imdb.com$media,redirect=noop-0.1s.mp3", "@@||imdb-video.media-imdb.com^$domain=imdb.com", ]; let request = Request::new("https://imdb-video.media-imdb.com/kBOeI88k1o23eNAi", "https://www.imdb.com/video/13", "media").unwrap(); let (network_filters, _) = parse_filters(&filters, true, Default::default()); let blocker_options: BlockerOptions = BlockerOptions { enable_optimizations: false, }; let blocker = Blocker::new(network_filters, &blocker_options); let mut resources = ResourceStorage::default(); resources.add_resource( Resource::simple("noop-0.1s.mp3", crate::resources::MimeType::AudioMp3, "mp3"), ).unwrap(); let matched_rule = blocker.check(&request, &resources); assert_eq!(matched_rule.matched, false); assert_eq!(matched_rule.important, false); assert_eq!(matched_rule.redirect, Some("data:audio/mp3;base64,bXAz".to_string())); assert_eq!(matched_rule.exception, Some("@@||imdb-video.media-imdb.com^$domain=imdb.com".to_string())); } #[test] fn redirect_exception() { let filters = [ "||imdb-video.media-imdb.com$media,redirect=noop-0.1s.mp3", "@@||imdb-video.media-imdb.com^$domain=imdb.com,redirect=noop-0.1s.mp3", ]; let request = 
Request::new("https://imdb-video.media-imdb.com/kBOeI88k1o23eNAi", "https://www.imdb.com/video/13", "media").unwrap(); let (network_filters, _) = parse_filters(&filters, true, Default::default()); let blocker_options: BlockerOptions = BlockerOptions { enable_optimizations: false, }; let blocker = Blocker::new(network_filters, &blocker_options); let mut resources = ResourceStorage::default(); resources.add_resource( Resource::simple("noop-0.1s.mp3", crate::resources::MimeType::AudioMp3, "mp3"), ).unwrap(); let matched_rule = blocker.check(&request, &resources); assert_eq!(matched_rule.matched, false); assert_eq!(matched_rule.important, false); assert_eq!(matched_rule.redirect, None); assert_eq!(matched_rule.exception, Some("@@||imdb-video.media-imdb.com^$domain=imdb.com,redirect=noop-0.1s.mp3".to_string())); } #[test] fn redirect_rule_redirection() { let filters = [ "||doubleclick.net^", "||www3.doubleclick.net^$xmlhttprequest,redirect-rule=noop.txt,domain=lineups.fun", ]; let request = Request::new("https://www3.doubleclick.net", "https://lineups.fun", "xhr").unwrap(); let (network_filters, _) = parse_filters(&filters, true, Default::default()); let blocker_options: BlockerOptions = BlockerOptions { enable_optimizations: false, }; let blocker = Blocker::new(network_filters, &blocker_options); let mut resources = ResourceStorage::default(); resources.add_resource(Resource::simple("noop.txt", crate::resources::MimeType::TextPlain, "noop")).unwrap(); let matched_rule = blocker.check(&request, &resources); assert_eq!(matched_rule.matched, true); assert_eq!(matched_rule.important, false); assert_eq!(matched_rule.redirect, Some("data:text/plain;base64,bm9vcA==".to_string())); assert_eq!(matched_rule.exception, None); } #[test] fn badfilter_does_not_match() { let filters = ["||foo.com$badfilter"]; let url_results = [ ( Request::new("https://foo.com", "https://bar.com", "image").unwrap(), false, ), ]; let request_expectations: Vec<_> = url_results .into_iter() 
.map(|(request, expected_result)| (request, expected_result)) .collect(); test_requests_filters(&filters, &request_expectations); } #[test] fn badfilter_cancels_with_same_id() { let filters = [ "||foo.com$domain=bar.com|foo.com,badfilter", "||foo.com$domain=foo.com|bar.com", ]; let url_results = [ ( Request::new("https://foo.com", "https://bar.com", "image").unwrap(), false, ), ]; let request_expectations: Vec<_> = url_results .into_iter() .map(|(request, expected_result)| (request, expected_result)) .collect(); test_requests_filters(&filters, &request_expectations); } #[test] fn badfilter_does_not_cancel_similar_filter() { let filters = [ "||foo.com$domain=bar.com|foo.com,badfilter", "||foo.com$domain=foo.com|bar.com,image", ]; let url_results = [ ( Request::new("https://foo.com", "https://bar.com", "image").unwrap(), true, ), ]; let request_expectations: Vec<_> = url_results .into_iter() .map(|(request, expected_result)| (request, expected_result)) .collect(); test_requests_filters(&filters, &request_expectations); } #[test] fn hostname_regex_filter_works() { let filters = [ "||alimc*.top^$domain=letv.com", "||aa*.top^$domain=letv.com", ]; let url_results = [ (Request::new("https://r.alimc1.top/test.js", "https://minisite.letv.com/", "script").unwrap(), true), (Request::new("https://www.baidu.com/test.js", "https://minisite.letv.com/", "script").unwrap(), false), (Request::new("https://r.aabb.top/test.js", "https://example.com/", "script").unwrap(), false), (Request::new("https://r.aabb.top/test.js", "https://minisite.letv.com/", "script").unwrap(), true), ]; let (network_filters, _) = parse_filters(&filters, true, Default::default()); let blocker_options = BlockerOptions { enable_optimizations: false, // optimizations will reduce number of rules }; let blocker = Blocker::new(network_filters, &blocker_options); let resources = ResourceStorage::default(); url_results.into_iter().for_each(|(req, expected_result)| { let matched_rule = blocker.check(&req, 
&resources); if expected_result { assert!(matched_rule.matched, "Expected match for {}", req.url); } else { assert!(!matched_rule.matched, "Expected no match for {}, matched with {:?}", req.url, matched_rule.filter); } }); } #[test] fn get_csp_directives() { let filters = [ "$csp=script-src 'self' * 'unsafe-inline',domain=thepiratebay.vip|pirateproxy.live|thehiddenbay.com|downloadpirate.com|thepiratebay10.org|kickass.vip|pirateproxy.app|ukpass.co|prox.icu|pirateproxy.life", "$csp=worker-src 'none',domain=pirateproxy.live|thehiddenbay.com|tpb.party|thepiratebay.org|thepiratebay.vip|thepiratebay10.org|flashx.cc|vidoza.co|vidoza.net", "||1337x.to^$csp=script-src 'self' 'unsafe-inline'", "@@^no-csp^$csp=script-src 'self' 'unsafe-inline'", "^duplicated-directive^$csp=worker-src 'none'", "@@^disable-all^$csp", "^first-party-only^$csp=script-src 'none',1p", ]; let (network_filters, _) = parse_filters(&filters, true, Default::default()); let blocker_options = BlockerOptions { enable_optimizations: false, }; let blocker = Blocker::new(network_filters, &blocker_options); { // No directives should be returned for requests that are not `document` or `subdocument` content types. 
assert_eq!(blocker.get_csp_directives(&Request::new("https://pirateproxy.live/static/custom_ads.js", "https://pirateproxy.live", "script").unwrap()), None); assert_eq!(blocker.get_csp_directives(&Request::new("https://pirateproxy.live/static/custom_ads.js", "https://pirateproxy.live", "image").unwrap()), None); assert_eq!(blocker.get_csp_directives(&Request::new("https://pirateproxy.live/static/custom_ads.js", "https://pirateproxy.live", "object").unwrap()), None); } { // A single directive should be returned if only one match is present in the engine, for both document and subdocument types assert_eq!(blocker.get_csp_directives(&Request::new("https://example.com", "https://vidoza.co", "document").unwrap()), Some(String::from("worker-src 'none'"))); assert_eq!(blocker.get_csp_directives(&Request::new("https://example.com", "https://vidoza.net", "subdocument").unwrap()), Some(String::from("worker-src 'none'"))); } { // Multiple merged directives should be returned if more than one match is present in the engine let possible_results = [ Some(String::from("script-src 'self' * 'unsafe-inline',worker-src 'none'")), Some(String::from("worker-src 'none',script-src 'self' * 'unsafe-inline'")), ]; assert!(possible_results.contains(&blocker.get_csp_directives(&Request::new("https://example.com", "https://pirateproxy.live", "document").unwrap()))); assert!(possible_results.contains(&blocker.get_csp_directives(&Request::new("https://example.com", "https://pirateproxy.live", "subdocument").unwrap()))); } { // A directive with an exception should not be returned assert_eq!(blocker.get_csp_directives(&Request::new("https://1337x.to", "https://1337x.to", "document").unwrap()), Some(String::from("script-src 'self' 'unsafe-inline'"))); assert_eq!(blocker.get_csp_directives(&Request::new("https://1337x.to/no-csp", "https://1337x.to", "subdocument").unwrap()), None); } { // Multiple identical directives should only appear in the output once 
assert_eq!(blocker.get_csp_directives(&Request::new("https://example.com/duplicated-directive", "https://flashx.cc", "document").unwrap()), Some(String::from("worker-src 'none'")));
            assert_eq!(blocker.get_csp_directives(&Request::new("https://example.com/duplicated-directive", "https://flashx.cc", "subdocument").unwrap()), Some(String::from("worker-src 'none'")));
        }
        {   // A CSP exception with no corresponding directive should disable all CSP injections for the page
            // NOTE(review): both assertions below use the `document` type; every other group in
            // this test pairs `document` with `subdocument` - confirm whether that was intended here.
            assert_eq!(blocker.get_csp_directives(&Request::new("https://1337x.to/duplicated-directive/disable-all", "https://thepiratebay10.org", "document").unwrap()), None);
            assert_eq!(blocker.get_csp_directives(&Request::new("https://1337x.to/duplicated-directive/disable-all", "https://thepiratebay10.org", "document").unwrap()), None);
        }
        {   // A CSP directive with a partyness modifier should only match where the modifier applies
            // Third-party request: the `1p` directive must not apply. The URL needs a valid
            // `https` scheme so that partyness is the only reason the filter can be rejected.
            assert_eq!(blocker.get_csp_directives(&Request::new("https://github.com/first-party-only", "https://example.com", "subdocument").unwrap()), None);
            // First-party request: the directive applies.
            assert_eq!(blocker.get_csp_directives(&Request::new("https://example.com/first-party-only", "https://example.com", "document").unwrap()), Some(String::from("script-src 'none'")));
        }
    }

    #[test]
    fn test_removeparam() {
        let filters = [
            "||example.com^$removeparam=test",
            "*$removeparam=fbclid",
            "/script.js$redirect-rule=noopjs",
            "^block^$important",
            "$removeparam=testCase,~xhr",
        ];

        let (network_filters, _) = parse_filters(&filters, true, Default::default());

        let blocker_options = BlockerOptions {
            enable_optimizations: true,
        };

        let blocker = Blocker::new(network_filters, &blocker_options);
        let mut resources = ResourceStorage::default();

        resources.add_resource(Resource::simple("noopjs", crate::resources::MimeType::ApplicationJavascript, "(() => {})()")).unwrap();

        let result = blocker.check(&Request::new("https://example.com?q=1&test=2#blue", "https://antonok.com", "xhr").unwrap(), &resources);
        assert_eq!(result.rewritten_url,
Some("https://example.com?q=1#blue".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?test=2&q=1#blue", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com?q=1#blue".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?test=2#blue", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com#blue".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?q=1#blue", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, None); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?q=1&test=2", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com?q=1".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?test=2&q=1", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com?q=1".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?test=2", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?test=2", "https://antonok.com", "image").unwrap(), &resources); assert_eq!(result.rewritten_url, None); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?q=1", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, None); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?q=fbclid", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, None); assert!(!result.matched); let result = 
blocker.check(&Request::new("https://example.com?fbclid=10938&q=1&test=2", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com?q=1".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://test.com?fbclid=10938&q=1&test=2", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://test.com?q=1&test=2".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?q1=1&q2=2&q3=3&test=2&q4=4&q5=5&fbclid=39", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com?q1=1&q2=2&q3=3&q4=4&q5=5".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?q1=1&q1=2&test=2&test=3", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com?q1=1&q1=2".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com/script.js?test=2#blue", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com/script.js#blue".into())); assert_eq!(result.redirect, Some("data:application/javascript;base64,KCgpID0+IHt9KSgp".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com/block/script.js?test=2", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, None); assert_eq!(result.redirect, Some("data:application/javascript;base64,KCgpID0+IHt9KSgp".into())); assert!(result.matched); let result = blocker.check(&Request::new("https://example.com/Path/?Test=ABC&testcase=AbC&testCase=aBc", "https://antonok.com", "xhr").unwrap(), &resources); assert_eq!(result.rewritten_url, None); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com/Path/?Test=ABC&testcase=AbC&testCase=aBc", "https://antonok.com", 
"image").unwrap(), &resources); assert_eq!(result.rewritten_url, None); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com/Path/?Test=ABC&testcase=AbC&testCase=aBc", "https://antonok.com", "subdocument").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com/Path/?Test=ABC&testcase=AbC".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com/Path/?Test=ABC&testcase=AbC&testCase=aBc", "https://antonok.com", "document").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com/Path/?Test=ABC&testcase=AbC".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?Test=ABC?123&test=3#&test=4#b", "https://antonok.com", "document").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com?Test=ABC?123#&test=4#b".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?Test=ABC&testCase=5", "https://antonok.com", "document").unwrap(), &resources); assert_eq!(result.rewritten_url, Some("https://example.com?Test=ABC".into())); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com?Test=ABC&testCase=5", "https://antonok.com", "image").unwrap(), &resources); assert_eq!(result.rewritten_url, None); assert!(!result.matched); } /// Tests ported from the previous query parameter stripping logic in brave-core #[test] fn removeparam_brave_core_tests() { let testcases = [ // (original url, expected url after filtering) ("https://example.com/?fbclid=1234", "https://example.com/"), ("https://example.com/?fbclid=1234&", "https://example.com/"), ("https://example.com/?&fbclid=1234", "https://example.com/"), ("https://example.com/?gclid=1234", "https://example.com/"), ("https://example.com/?fbclid=0&gclid=1&msclkid=a&mc_eid=a1", "https://example.com/"), ("https://example.com/?fbclid=&foo=1&bar=2&gclid=abc", 
"https://example.com/?fbclid=&foo=1&bar=2"), ("https://example.com/?fbclid=&foo=1&gclid=1234&bar=2", "https://example.com/?fbclid=&foo=1&bar=2"), ("http://u:p@example.com/path/file.html?foo=1&fbclid=abcd#fragment", "http://u:p@example.com/path/file.html?foo=1#fragment"), ("https://example.com/?__s=1234-abcd", "https://example.com/"), // Obscure edge cases that break most parsers: ("https://example.com/?fbclid&foo&&gclid=2&bar=&%20", "https://example.com/?fbclid&foo&&bar=&%20"), ("https://example.com/?fbclid=1&1==2&=msclkid&foo=bar&&a=b=c&", "https://example.com/?1==2&=msclkid&foo=bar&&a=b=c&"), ("https://example.com/?fbclid=1&=2&?foo=yes&bar=2+", "https://example.com/?=2&?foo=yes&bar=2+"), ("https://example.com/?fbclid=1&a+b+c=some%20thing&1%202=3+4", "https://example.com/?a+b+c=some%20thing&1%202=3+4"), // Conditional query parameter stripping /*("https://example.com/?mkt_tok=123&foo=bar", "https://example.com/?foo=bar"),*/ ]; let filters = [ "fbclid", "gclid", "msclkid", "mc_eid", "dclid", "oly_anon_id", "oly_enc_id", "_openstat", "vero_conv", "vero_id", "wickedid", "yclid", "__s", "rb_clickid", "s_cid", "ml_subscriber", "ml_subscriber_hash", "twclid", "gbraid", "wbraid", "_hsenc", "__hssc", "__hstc", "__hsfp", "hsCtaTracking", "oft_id", "oft_k", "oft_lk", "oft_d", "oft_c", "oft_ck", "oft_ids", "oft_sk", "ss_email_id", "bsft_uid", "bsft_clkid", "vgo_ee", "igshid", ].iter().map(|s| format!("*$removeparam={}", s)).collect::>(); let (network_filters, _) = parse_filters(&filters, true, Default::default()); let blocker_options = BlockerOptions { enable_optimizations: true, }; let blocker = Blocker::new(network_filters, &blocker_options); let resources = ResourceStorage::default(); for (original, expected) in testcases.into_iter() { let result = blocker.check(&Request::new(original, "https://example.net", "xhr").unwrap(), &resources); let expected = if original == expected { None } else { Some(expected.to_string()) }; assert_eq!(expected, result.rewritten_url, 
"Filtering parameters on {} failed", original); } } #[test] fn test_removeparam_same_tokens() { let filters = [ "$removeparam=example1_", "$removeparam=example1-", ]; let (network_filters, _) = parse_filters(&filters, true, Default::default()); let blocker_options = BlockerOptions { enable_optimizations: true, }; let blocker = Blocker::new(network_filters, &blocker_options); let result = blocker.check(&Request::new("https://example.com?example1_=1&example1-=2", "https://example.com", "xhr").unwrap(), &Default::default()); assert_eq!(result.rewritten_url, Some("https://example.com".into())); assert!(!result.matched); } #[test] fn test_redirect_priority() { let filters = [ ".txt^$redirect-rule=a", "||example.com^$redirect-rule=b:10", "/text$redirect-rule=c:20", "@@^excepta^$redirect-rule=a", "@@^exceptb10^$redirect-rule=b:10", "@@^exceptc20^$redirect-rule=c:20", ]; let (network_filters, _) = parse_filters(&filters, true, Default::default()); let blocker_options = BlockerOptions { enable_optimizations: true, }; let blocker = Blocker::new(network_filters, &blocker_options); let mut resources = ResourceStorage::default(); fn add_simple_resource(resources: &mut ResourceStorage, identifier: &str) -> Option { resources.add_resource(Resource::simple(identifier, crate::resources::MimeType::TextPlain, identifier)).unwrap(); Some(format!("data:text/plain;base64,{}", base64::encode(identifier))) } let a_redirect = add_simple_resource(&mut resources, "a"); let b_redirect = add_simple_resource(&mut resources, "b"); let c_redirect = add_simple_resource(&mut resources, "c"); let result = blocker.check(&Request::new("https://example.net/test", "https://example.com", "xmlhttprequest").unwrap(), &resources); assert_eq!(result.redirect, None); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.net/test.txt", "https://example.com", "xmlhttprequest").unwrap(), &resources); assert_eq!(result.redirect, a_redirect); assert!(!result.matched); let result = 
blocker.check(&Request::new("https://example.com/test.txt", "https://example.com", "xmlhttprequest").unwrap(), &resources); assert_eq!(result.redirect, b_redirect); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com/text.txt", "https://example.com", "xmlhttprequest").unwrap(), &resources); assert_eq!(result.redirect, c_redirect); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com/exceptc20/text.txt", "https://example.com", "xmlhttprequest").unwrap(), &resources); assert_eq!(result.redirect, b_redirect); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com/exceptb10/text.txt", "https://example.com", "xmlhttprequest").unwrap(), &resources); assert_eq!(result.redirect, c_redirect); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com/exceptc20/exceptb10/text.txt", "https://example.com", "xmlhttprequest").unwrap(), &resources); assert_eq!(result.redirect, a_redirect); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com/exceptc20/exceptb10/excepta/text.txt", "https://example.com", "xmlhttprequest").unwrap(), &resources); assert_eq!(result.redirect, None); assert!(!result.matched); let result = blocker.check(&Request::new("https://example.com/exceptc20/exceptb10/text", "https://example.com", "xmlhttprequest").unwrap(), &resources); assert_eq!(result.redirect, None); assert!(!result.matched); } #[test] fn tags_enable_works() { let filters = [ "adv$tag=stuff", "somelongpath/test$tag=stuff", "||brianbondy.com/$tag=brian", "||brave.com$tag=brian", ]; let url_results = [ ("http://example.com/advert.html", true), ("http://example.com/somelongpath/test/2.html", true), ("https://brianbondy.com/about", false), ("https://brave.com/about", false), ]; let request_expectations: Vec<_> = url_results .into_iter() .map(|(url, expected_result)| { let request = Request::new(url, "https://example.com", 
"other").unwrap(); (request, expected_result) }).collect(); let (network_filters, _) = parse_filters(&filters, true, Default::default()); let blocker_options: BlockerOptions = BlockerOptions { enable_optimizations: false, // optimizations will reduce number of rules }; let mut blocker = Blocker::new(network_filters, &blocker_options); let resources = Default::default(); blocker.enable_tags(&["stuff"]); assert_eq!(blocker.tags_enabled, HashSet::from_iter([String::from("stuff")].into_iter())); assert_eq!(vec_hashmap_len(&blocker.filters_tagged.filter_map), 2); request_expectations.into_iter().for_each(|(req, expected_result)| { let matched_rule = blocker.check(&req, &resources); if expected_result { assert!(matched_rule.matched, "Expected match for {}", req.url); } else { assert!(!matched_rule.matched, "Expected no match for {}, matched with {:?}", req.url, matched_rule.filter); } }); } #[test] fn tags_enable_adds_tags() { let filters = [ "adv$tag=stuff", "somelongpath/test$tag=stuff", "||brianbondy.com/$tag=brian", "||brave.com$tag=brian", ]; let url_results = [ ("http://example.com/advert.html", true), ("http://example.com/somelongpath/test/2.html", true), ("https://brianbondy.com/about", true), ("https://brave.com/about", true), ]; let request_expectations: Vec<_> = url_results .into_iter() .map(|(url, expected_result)| { let request = Request::new(url, "https://example.com", "other").unwrap(); (request, expected_result) }).collect(); let (network_filters, _) = parse_filters(&filters, true, Default::default()); let blocker_options: BlockerOptions = BlockerOptions { enable_optimizations: false, // optimizations will reduce number of rules }; let mut blocker = Blocker::new(network_filters, &blocker_options); let resources = Default::default(); blocker.enable_tags(&["stuff"]); blocker.enable_tags(&["brian"]); assert_eq!(blocker.tags_enabled, HashSet::from_iter([String::from("brian"), String::from("stuff")].into_iter())); 
assert_eq!(vec_hashmap_len(&blocker.filters_tagged.filter_map), 4); request_expectations.into_iter().for_each(|(req, expected_result)| { let matched_rule = blocker.check(&req, &resources); if expected_result { assert!(matched_rule.matched, "Expected match for {}", req.url); } else { assert!(!matched_rule.matched, "Expected no match for {}, matched with {:?}", req.url, matched_rule.filter); } }); } #[test] fn tags_disable_works() { let filters = [ "adv$tag=stuff", "somelongpath/test$tag=stuff", "||brianbondy.com/$tag=brian", "||brave.com$tag=brian", ]; let url_results = [ ("http://example.com/advert.html", false), ("http://example.com/somelongpath/test/2.html", false), ("https://brianbondy.com/about", true), ("https://brave.com/about", true), ]; let request_expectations: Vec<_> = url_results .into_iter() .map(|(url, expected_result)| { let request = Request::new(url, "https://example.com", "other").unwrap(); (request, expected_result) }).collect(); let (network_filters, _) = parse_filters(&filters, true, Default::default()); let blocker_options: BlockerOptions = BlockerOptions { enable_optimizations: false, // optimizations will reduce number of rules }; let mut blocker = Blocker::new(network_filters, &blocker_options); let resources = Default::default(); blocker.enable_tags(&["brian", "stuff"]); assert_eq!(blocker.tags_enabled, HashSet::from_iter([String::from("brian"), String::from("stuff")].into_iter())); assert_eq!(vec_hashmap_len(&blocker.filters_tagged.filter_map), 4); blocker.disable_tags(&["stuff"]); assert_eq!(blocker.tags_enabled, HashSet::from_iter([String::from("brian")].into_iter())); assert_eq!(vec_hashmap_len(&blocker.filters_tagged.filter_map), 2); request_expectations.into_iter().for_each(|(req, expected_result)| { let matched_rule = blocker.check(&req, &resources); if expected_result { assert!(matched_rule.matched, "Expected match for {}", req.url); } else { assert!(!matched_rule.matched, "Expected no match for {}, matched with {:?}", req.url, 
matched_rule.filter);
            }
        });
    }

    /// Adding a `$badfilter` rule through `Blocker::add_filter` must be rejected with
    /// `BlockerError::BadFilterAddUnsupported`.
    #[test]
    fn filter_add_badfilter_error() {
        let blocker_options: BlockerOptions = BlockerOptions {
            enable_optimizations: false,
        };

        // Start from an empty blocker so the only filter involved is the one added below.
        let mut blocker = Blocker::new(Vec::new(), &blocker_options);

        let filter = NetworkFilter::parse("adv$badfilter", true, Default::default()).unwrap();
        let added = blocker.add_filter(filter);
        assert!(added.is_err());
        assert_eq!(added.err().unwrap(), BlockerError::BadFilterAddUnsupported);
    }

    /// Checks how `Blocker::add_filter` treats a filter that is inserted twice, with and without
    /// optimizations enabled.
    // NOTE(review): this test is marked `#[ignore]`; the reason is not recorded here.
    #[test]
    #[ignore]
    fn filter_add_twice_handling_error() {
        {
            // Adding the same filter twice must fail when the engine is not optimised
            let blocker_options: BlockerOptions = BlockerOptions {
                enable_optimizations: false,
            };

            let mut blocker = Blocker::new(Vec::new(), &blocker_options);

            let filter = NetworkFilter::parse("adv", true, Default::default()).unwrap();
            blocker.add_filter(filter.clone()).unwrap();
            assert!(blocker.filter_exists(&filter), "Expected filter to be inserted");
            let added = blocker.add_filter(filter);
            assert!(added.is_err(), "Expected repeated insertion to fail");
            assert_eq!(added.err().unwrap(), BlockerError::FilterExists, "Expected specific error on repeated insertion fail");
        }
        {
            // Adding the same filter twice is allowed when the engine is optimised
            let blocker_options: BlockerOptions = BlockerOptions {
                enable_optimizations: true,
            };

            let mut blocker = Blocker::new(Vec::new(), &blocker_options);

            let filter = NetworkFilter::parse("adv", true, Default::default()).unwrap();
            blocker.add_filter(filter.clone()).unwrap();
            let added = blocker.add_filter(filter);
            assert!(added.is_ok());
        }
    }

    /// Tagged filters added through `add_filter` must only take effect for tags that are enabled.
    #[test]
    fn filter_add_tagged() {
        // Only the `brian` tag is enabled before the tagged filters are added, so the
        // `stuff`-tagged filters are expected not to match.
        let blocker_options: BlockerOptions = BlockerOptions {
            enable_optimizations: true,
        };

        let mut blocker = Blocker::new(Vec::new(), &blocker_options);
        let resources = Default::default();
        blocker.enable_tags(&["brian"]);

        blocker.add_filter(NetworkFilter::parse("adv$tag=stuff", true, Default::default()).unwrap()).unwrap();
blocker.add_filter(NetworkFilter::parse("somelongpath/test$tag=stuff", true, Default::default()).unwrap()).unwrap(); blocker.add_filter(NetworkFilter::parse("||brianbondy.com/$tag=brian", true, Default::default()).unwrap()).unwrap(); blocker.add_filter(NetworkFilter::parse("||brave.com$tag=brian", true, Default::default()).unwrap()).unwrap(); let url_results = [ ("http://example.com/advert.html", false), ("http://example.com/somelongpath/test/2.html", false), ("https://brianbondy.com/about", true), ("https://brave.com/about", true), ]; let request_expectations: Vec<_> = url_results .into_iter() .map(|(url, expected_result)| { let request = Request::new(url, "https://example.com", "other").unwrap(); (request, expected_result) }).collect(); request_expectations.into_iter().for_each(|(req, expected_result)| { let matched_rule = blocker.check(&req, &resources); if expected_result { assert!(matched_rule.matched, "Expected match for {}", req.url); } else { assert!(!matched_rule.matched, "Expected no match for {}, matched with {:?}", req.url, matched_rule.filter); } }); } #[test] fn exception_force_check() { let blocker_options: BlockerOptions = BlockerOptions { enable_optimizations: true, }; let mut blocker = Blocker::new(Vec::new(), &blocker_options); let resources = Default::default(); blocker.add_filter(NetworkFilter::parse("@@*ad_banner.png", true, Default::default()).unwrap()).unwrap(); let request = Request::new("http://example.com/ad_banner.png", "https://example.com", "other").unwrap(); let matched_rule = blocker.check_parameterised(&request, &resources, false, true); assert!(!matched_rule.matched); assert!(matched_rule.exception.is_some()); } #[test] fn generichide() { let blocker_options: BlockerOptions = BlockerOptions { enable_optimizations: true, }; let mut blocker = Blocker::new(Vec::new(), &blocker_options); blocker.add_filter(NetworkFilter::parse("@@||example.com$generichide", true, Default::default()).unwrap()).unwrap(); 
assert!(blocker.check_generic_hide(&Request::new("https://example.com", "https://example.com", "other").unwrap())); } } #[cfg(test)] mod placeholder_string_tests { /// If this changes, be sure to update the documentation for [`BlockerResult`] as well. #[test] fn test_constant_placeholder_string() { let mut filter_set = crate::lists::FilterSet::new(false); filter_set.add_filter("||example.com^", Default::default()).unwrap(); let engine = crate::Engine::from_filter_set(filter_set, true); let block = engine.check_network_request(&crate::request::Request::new("https://example.com", "https://example.com", "document").unwrap()); assert_eq!(block.filter, Some("NetworkFilter".to_string())); } } #[cfg(test)] mod legacy_rule_parsing_tests { use crate::test_utils::rules_from_lists; use crate::lists::{parse_filters, FilterFormat, ParseOptions}; use crate::blocker::{Blocker, BlockerOptions}; use crate::blocker::vec_hashmap_len; struct ListCounts { pub filters: usize, pub cosmetic_filters: usize, pub exceptions: usize, pub duplicates: usize, } impl std::ops::Add for ListCounts { type Output = ListCounts; fn add(self, other: ListCounts) -> Self::Output { ListCounts { filters: self.filters + other.filters, cosmetic_filters: self.cosmetic_filters + other.cosmetic_filters, exceptions: self.exceptions + other.exceptions, duplicates: 0, // Don't bother trying to calculate - lists could have cross-duplicated entries } } } // number of expected EasyList cosmetic rules from old engine is 31144, but is incorrect as it skips a few particularly long rules that are nevertheless valid // easyList = { 24478, 31144, 0, 5589 }; // not handling (and not including) filters with the following options: // - $popup // - $elemhide // difference from original counts caused by not handling document/subdocument options and possibly miscounting on the blocker side. 
// Printing all non-cosmetic, non-html, non-comment/-empty rules and ones with no unsupported options yields 29142 items
    // This engine also handles 3 rules that old one does not
    const EASY_LIST: ListCounts = ListCounts { filters: 24064, cosmetic_filters: 31163, exceptions: 5796, duplicates: 0 };
    // easyPrivacy = { 11817, 0, 0, 1020 };
    // differences in counts explained by hashset size underreporting as detailed in the next two cases
    const EASY_PRIVACY: ListCounts = ListCounts { filters: 11889, cosmetic_filters: 0, exceptions: 1021, duplicates: 2 };
    // ublockUnbreak = { 4, 8, 0, 94 };
    // differences in counts explained by client.hostAnchoredExceptionHashSet->GetSize() underreporting when compared to client.numHostAnchoredExceptionFilters
    const UBLOCK_UNBREAK: ListCounts = ListCounts { filters: 4, cosmetic_filters: 8, exceptions: 98, duplicates: 0 };
    // braveUnbreak = { 31, 0, 0, 4 };
    // differences in counts explained by client.hostAnchoredHashSet->GetSize() underreporting when compared to client.numHostAnchoredFilters
    const BRAVE_UNBREAK: ListCounts = ListCounts { filters: 32, cosmetic_filters: 0, exceptions: 4, duplicates: 0 };
    // disconnectSimpleMalware = { 2450, 0, 0, 0 };
    const DISCONNECT_SIMPLE_MALWARE: ListCounts = ListCounts { filters: 2450, cosmetic_filters: 0, exceptions: 0, duplicates: 0 };
    // spam404MainBlacklist = { 5629, 166, 0, 0 };
    const SPAM_404_MAIN_BLACKLIST: ListCounts = ListCounts { filters: 5629, cosmetic_filters: 166, exceptions: 0, duplicates: 0 };
    // Expected counts for the two hosts-format malware lists.
    const MALWARE_DOMAIN_LIST: ListCounts = ListCounts { filters: 1104, cosmetic_filters: 0, exceptions: 0, duplicates: 3 };
    const MALWARE_DOMAINS: ListCounts = ListCounts { filters: 26853, cosmetic_filters: 0, exceptions: 0, duplicates: 48 };

    /// Parses the given rule lists in `format` and checks the resulting filter counts against
    /// `expectation`.
    ///
    /// The parsed counts (network filters, exceptions among them, cosmetic filters) must match
    /// exactly. The counts inside a `Blocker` built from the network filters are only checked as
    /// lower bounds.
    // NOTE(review): the `IntoIterator` bound below appears to have lost its type arguments;
    // presumably `IntoIterator<Item = impl AsRef<str>>` - confirm against `rules_from_lists`.
    fn check_list_counts(rule_lists: impl IntoIterator>, format: FilterFormat, expectation: ListCounts) {
        let rules = rules_from_lists(rule_lists);

        let (network_filters, cosmetic_filters) = parse_filters(rules, true, ParseOptions { format, ..Default::default() });

        // `expectation.filters` excludes exceptions, so the total number of parsed network
        // filters is the sum of both.
        assert_eq!(
            (network_filters.len(),
            network_filters.iter().filter(|f| f.is_exception()).count(),
            cosmetic_filters.len()),
            (expectation.filters + expectation.exceptions,
            expectation.exceptions,
            expectation.cosmetic_filters),
            "Number of collected filters does not match expectation");

        let blocker_options = BlockerOptions {
            enable_optimizations: false,    // optimizations will reduce number of rules
        };

        let blocker = Blocker::new(network_filters, &blocker_options);

        // Some filters in the filter_map are pointed at by multiple tokens, increasing the total number of items
        assert!(vec_hashmap_len(&blocker.exceptions.filter_map) + vec_hashmap_len(&blocker.generic_hide.filter_map)
            >= expectation.exceptions, "Number of collected exceptions does not match expectation");

        // NOTE(review): `blocker.redirects.filter_map` is counted twice in the sum below. As the
        // check is a lower bound (`>=`), the repetition only loosens it. Presumably one of the two
        // terms was meant to be a different filter list - confirm against the fields of `Blocker`.
        assert!(vec_hashmap_len(&blocker.filters.filter_map) +
            vec_hashmap_len(&blocker.importants.filter_map) +
            vec_hashmap_len(&blocker.redirects.filter_map) +
            vec_hashmap_len(&blocker.redirects.filter_map) +
            vec_hashmap_len(&blocker.csp.filter_map) >=
            expectation.filters - expectation.duplicates, "Number of collected network filters does not match expectation");
    }

    // Each test below feeds one bundled list file through `check_list_counts` together with its
    // expected counts.
    #[test]
    fn parse_easylist() {
        check_list_counts(["./data/test/easylist.txt"], FilterFormat::Standard, EASY_LIST);
    }

    #[test]
    fn parse_easyprivacy() {
        check_list_counts(["./data/test/easyprivacy.txt"], FilterFormat::Standard, EASY_PRIVACY);
    }

    #[test]
    fn parse_ublock_unbreak() {
        check_list_counts(["./data/test/ublock-unbreak.txt"], FilterFormat::Standard, UBLOCK_UNBREAK);
    }

    #[test]
    fn parse_brave_unbreak() {
        check_list_counts(["./data/test/brave-unbreak.txt"], FilterFormat::Standard, BRAVE_UNBREAK);
    }

    #[test]
    fn parse_brave_disconnect_simple_malware() {
        check_list_counts(["./data/test/disconnect-simple-malware.txt"], FilterFormat::Standard, DISCONNECT_SIMPLE_MALWARE);
    }

    #[test]
    fn parse_spam404_main_blacklist() {
        check_list_counts(["./data/test/spam404-main-blacklist.txt"], FilterFormat::Standard, SPAM_404_MAIN_BLACKLIST);
    }

    #[test]
    fn parse_malware_domain_list()
{ check_list_counts(["./data/test/malwaredomainlist.txt"], FilterFormat::Hosts, MALWARE_DOMAIN_LIST); } #[test] fn parse_malware_domain_list_just_hosts() { check_list_counts(["./data/test/malwaredomainlist_justhosts.txt"], FilterFormat::Hosts, MALWARE_DOMAIN_LIST); } #[test] fn parse_malware_domains() { check_list_counts(["./data/test/malwaredomains.txt"], FilterFormat::Hosts, MALWARE_DOMAINS); } #[test] fn parse_multilist() { let expectation = EASY_LIST + EASY_PRIVACY + UBLOCK_UNBREAK + BRAVE_UNBREAK; check_list_counts( [ "./data/test/easylist.txt", "./data/test/easyprivacy.txt", "./data/test/ublock-unbreak.txt", "./data/test/brave-unbreak.txt", ], FilterFormat::Standard, expectation, ) } #[test] fn parse_malware_multilist() { let expectation = SPAM_404_MAIN_BLACKLIST + DISCONNECT_SIMPLE_MALWARE; check_list_counts( [ "./data/test/spam404-main-blacklist.txt", "./data/test/disconnect-simple-malware.txt", ], FilterFormat::Standard, expectation, ) } #[test] fn parse_hosts_formats() { let mut expectation = MALWARE_DOMAIN_LIST + MALWARE_DOMAINS; expectation.duplicates = 69; check_list_counts( [ "./data/test/malwaredomainlist.txt", "./data/test/malwaredomains.txt", ], FilterFormat::Hosts, expectation, ) } } adblock-0.8.12/src/content_blocking.rs000064400000000000000000001466511046102023000160070ustar 00000000000000//! Transforms filter rules into content blocking syntax used on iOS and MacOS. use crate::filters::cosmetic::CosmeticFilter; use crate::filters::network::{NetworkFilter, NetworkFilterMask}; use crate::lists::ParsedFilter; use memchr::{memchr as find_char, memmem}; use once_cell::sync::Lazy; use regex::Regex; use serde::{Deserialize, Serialize}; use std::collections::HashSet; use std::convert::{TryFrom, TryInto}; /// By default, ABP rules do not block top-level document requests. 
There's no way to express that /// in content blocking format, so instead it's approximated with a rule that applies an exception /// to any first-party requests that are document types. /// /// This rule should be added after all other network rules. pub fn ignore_previous_fp_documents() -> CbRule { let mut resource_type = HashSet::new(); resource_type.insert(CbResourceType::Document); CbRule { trigger: CbTrigger { url_filter: String::from(".*"), resource_type: Some(resource_type), load_type: vec![CbLoadType::FirstParty], ..CbTrigger::default() }, action: CbAction { typ: CbType::IgnorePreviousRules, selector: None, }, } } /// Rust representation of a single content blocking rule. /// /// This can be deserialized with `serde_json` directly into the correct format. #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] pub struct CbRule { pub action: CbAction, pub trigger: CbTrigger, } impl CbRule { /// If this returns false, the rule will not compile and should not be used. fn is_ascii(&self) -> bool { self.action.selector.iter().all(|s| s.is_ascii()) && self.trigger.url_filter.is_ascii() && self.trigger.if_domain.iter().flatten().all(|d| d.is_ascii()) && self.trigger.unless_domain.iter().flatten().all(|d| d.is_ascii()) && self.trigger.if_top_url.iter().flatten().all(|d| d.is_ascii()) && self.trigger.unless_top_url.iter().flatten().all(|d| d.is_ascii()) } } /// Corresponds to the `action` field of a Safari content blocking rule. #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] pub struct CbAction { #[serde(rename = "type")] pub typ: CbType, /// Specify a string that defines a selector list. This value is required when the action type /// is css-display-none. If it's not, the selector field is ignored by Safari. Use CSS /// identifiers as the individual selector values, separated by commas. Safari and WebKit /// supports all of its CSS selectors for Safari content-blocking rules. 
#[serde(default, skip_serializing_if = "Option::is_none")]
    pub selector: Option<String>,
}

/// Corresponds to the `action.type` field of a Safari content blocking rule.
///
/// Variant names are serialized in kebab-case (e.g. `CssDisplayNone` becomes `css-display-none`).
#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub enum CbType {
    /// Stops loading of the resource. If the resource was cached, the cache is ignored.
    Block,
    /// Strips cookies from the header before sending to the server. Only cookies otherwise
    /// acceptable to Safari's privacy policy can be blocked. Combining with ignore-previous-rules
    /// doesn't override the browser’s privacy settings.
    BlockCookies,
    /// Hides elements of the page based on a CSS selector. A selector field contains the selector
    /// list. Any matching element has its display property set to none, which hides it.
    CssDisplayNone,
    /// Ignores previously triggered actions.
    IgnorePreviousRules,
    /// Changes a URL from http to https. URLs with a specified (nondefault) port and links using
    /// other protocols are unaffected.
    MakeHttps,
}

/// Corresponds to possible entries in the `trigger.load-type` field of a Safari content blocking
/// rule.
#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub enum CbLoadType {
    FirstParty,
    ThirdParty,
}

/// Corresponds to possible entries in the `trigger.resource-type` field of a Safari content
/// blocking rule.
///
/// `Eq` and `Hash` are derived so that the values can be stored in a `HashSet`.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub enum CbResourceType {
    Document,
    Image,
    StyleSheet,
    Script,
    Font,
    Raw,
    SvgDocument,
    Media,
    Popup,
}

/// Corresponds to the `trigger` field of a Safari content blocking rule.
///
/// Field names are serialized in kebab-case (e.g. `url_filter` becomes `url-filter`). Optional
/// fields that are `None`, and an empty `load_type`, are left out of the serialized output.
#[derive(Clone, Debug, Default, PartialEq, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub struct CbTrigger {
    /// Specifies a pattern to match the URL against.
    pub url_filter: String,
    /// A Boolean value. The default value is false.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub url_filter_is_case_sensitive: Option<bool>,
    /// An array of strings matched to a URL's domain; limits action to a list of specific domains.
    /// Values must be lowercase ASCII, or punycode for non-ASCII. Add * in front to match domain
    /// and subdomains. Can't be used with unless-domain.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub if_domain: Option<Vec<String>>,
    /// An array of strings matched to a URL's domain; acts on any site except domains in a
    /// provided list. Values must be lowercase ASCII, or punycode for non-ASCII. Add * in front to
    /// match domain and subdomains. Can't be used with if-domain.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub unless_domain: Option<Vec<String>>,
    /// An array of strings representing the resource types (how the browser intends to use the
    /// resource) that the rule should match. If not specified, the rule matches all resource
    /// types. Valid values: document, image, style-sheet, script, font, raw (Any untyped load),
    /// svg-document, media, popup.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub resource_type: Option<HashSet<CbResourceType>>,
    /// An array of strings that can include one of two mutually exclusive values. If not
    /// specified, the rule matches all load types. first-party is triggered only if the resource
    /// has the same scheme, domain, and port as the main page resource. third-party is triggered
    /// if the resource is not from the same domain as the main page resource.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub load_type: Vec<CbLoadType>,
    /// An array of strings matched to the entire main document URL; limits the action to a
    /// specific list of URL patterns. Values must be lowercase ASCII, or punycode for non-ASCII.
    /// Can't be used with unless-top-url.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub if_top_url: Option<Vec<String>>,
    /// An array of strings matched to the entire main document URL; acts on any site except URL
    /// patterns in provided list. Values must be lowercase ASCII, or punycode for non-ASCII. Can't
    /// be used with if-top-url.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub unless_top_url: Option<Vec<String>>,
}

/// Possible failure reasons when attempting to convert an adblock rule into content filtering
/// syntax.
#[derive(Debug)]
pub enum CbRuleCreationFailure {
    /// Currently, only filter rules parsed in debug mode can be translated into equivalent content
    /// blocking syntax.
    NeedsDebugMode,
    /// Content blocking rules cannot have if-domain and unless-domain together at the same time.
    UnlessAndIfDomainTogetherUnsupported,
    /// A network filter rule with only the given content type flags was provided, and none of them
    /// are supported. If at least one supported content type is provided, no failure will occur
    /// and unsupported types will be silently dropped.
    NoSupportedNetworkOptions(NetworkFilterMask),
    /// Network rules with redirect options cannot be represented in content blocking syntax.
    NetworkRedirectUnsupported,
    /// Network rules with generichide options cannot be supported in content blocking syntax.
    NetworkGenerichideUnsupported,
    /// Network rules with badfilter options cannot be supported in content blocking syntax.
    NetworkBadFilterUnsupported,
    /// Network rules with csp options cannot be supported in content blocking syntax.
    NetworkCspUnsupported,
    /// Network rules with removeparam options cannot be supported in content blocking syntax.
    NetworkRemoveparamUnsupported,
    /// Content blocking syntax only supports a subset of regex features, namely:
    /// - Matching any character with “.”.
    /// - Matching ranges with the range syntax [a-b].
    /// - Quantifying expressions with “?”, “+” and “*”.
    /// - Groups with parenthesis.
    ///
    /// It may be possible to correctly convert some full-regex rules, but others use unsupported
    /// features (e.g. quantified repetition with {...}) that make conversion to content blocking
    /// syntax impossible.
FullRegexUnsupported, /// `Blocker`-internal `NetworkFilter`s can be represented in optimized form, but these cannot /// be currently converted into content blocking syntax. OptimizedRulesUnsupported, /// Cosmetic rules with entities (e.g. google.*) rather than hostnames cannot be represented in /// content blocking syntax. CosmeticEntitiesUnsupported, /// Cosmetic rules with custom action specification (i.e. `:style(...)`) cannot be represented /// in content blocking syntax. CosmeticActionRulesNotSupported, /// Cosmetic rules with scriptlet injections (i.e. `+js(...)`) cannot be represented in content /// blocking syntax. ScriptletInjectionsNotSupported, /// Valid content blocking rules can only include ASCII characters. RuleContainsNonASCII, /// `from` as a `domain` alias is not currently supported in content blocking syntax. FromNotSupported, } impl TryFrom for CbRuleEquivalent { type Error = CbRuleCreationFailure; fn try_from(v: ParsedFilter) -> Result { match v { ParsedFilter::Network(f) => f.try_into(), ParsedFilter::Cosmetic(f) => Ok(Self::SingleRule(f.try_into()?)), } } } fn non_empty(v: Vec) -> Option> { if v.len() > 0 { Some(v) } else { None } } /// Some adblock rules cannot be directly represented by a single content blocking rule. This enum /// serves as an intermediate conversion step that provides extra context on why one rule turned /// into multiple rules. /// /// The contained rules can be accessed using `IntoIterator`. pub enum CbRuleEquivalent { /// In most successful cases, an ABP rule can be converted into a single content blocking rule. SingleRule(CbRule), /// If a network rule has more than one specified resource type, one of those types is /// `Document`, and no load type is specified, then the rule should be split into two content /// blocking rules: the first has all original resource types except `Document`, and the second /// only specifies `Document` with a third-party load type. 
SplitDocument(CbRule, CbRule),
}

/// Iterates over the one or two rules held by a [`CbRuleEquivalent`], in order.
impl IntoIterator for CbRuleEquivalent {
    type Item = CbRule;
    type IntoIter = CbRuleEquivalentIterator;

    fn into_iter(self) -> Self::IntoIter {
        match self {
            // A single rule leaves the second slot empty.
            Self::SingleRule(r) => CbRuleEquivalentIterator {
                rules: [Some(r), None],
                index: 0,
            },
            Self::SplitDocument(r1, r2) => CbRuleEquivalentIterator {
                rules: [Some(r1), Some(r2)],
                index: 0,
            },
        }
    }
}

/// Returned by [`CbRuleEquivalent`]'s `IntoIterator` implementation.
pub struct CbRuleEquivalentIterator {
    /// Up to two rules; a slot becomes `None` once its rule has been yielded.
    rules: [Option<CbRule>; 2],
    /// Index of the next slot to yield.
    index: usize,
}

impl Iterator for CbRuleEquivalentIterator {
    type Item = CbRule;

    /// Moves the rule out of the current slot and advances. Yields `None` on reaching an empty
    /// slot or the end of the array.
    fn next(&mut self) -> Option<Self::Item> {
        if self.index >= self.rules.len() {
            return None;
        }
        let result = self.rules[self.index].take();
        self.index += 1;
        result
    }
}

/// Converts a network filter into one or two content blocking rules.
///
/// # Errors
///
/// Returns a [`CbRuleCreationFailure`] when:
/// - the filter carries no `raw_line` (`NeedsDebugMode`);
/// - it uses redirect, generichide, badfilter, csp, full-regex or removeparam features;
/// - its pattern is in the optimized `AnyOf` form;
/// - it has domain constraints but no `domain=` option text (`FromNotSupported`);
/// - it has both included and excluded domains;
/// - none of its resource-type flags is supported;
/// - the resulting rule contains non-ASCII text.
impl TryFrom<NetworkFilter> for CbRuleEquivalent {
    type Error = CbRuleCreationFailure;

    fn try_from(v: NetworkFilter) -> Result<Self, Self::Error> {
        // Characters that are backslash-escaped in the generated `url-filter`. `*` is deliberately
        // absent: it is turned into `.*` by `REPLACE_WILDCARDS` afterwards.
        static SPECIAL_CHARS: Lazy<Regex> =
            Lazy::new(|| Regex::new(r##"([.+?^${}()|\[\]\\])"##).unwrap());
        static REPLACE_WILDCARDS: Lazy<Regex> = Lazy::new(|| Regex::new(r##"\*"##).unwrap());
        // A `^` at the very end of the pattern is dropped from the output.
        static TRAILING_SEPARATOR: Lazy<Regex> = Lazy::new(|| Regex::new(r##"\^$"##).unwrap());
        if let Some(raw_line) = &v.raw_line {
            // Reject every rule kind that has no content blocking counterpart.
            if v.is_redirect() {
                return Err(CbRuleCreationFailure::NetworkRedirectUnsupported);
            }
            if v.mask.contains(NetworkFilterMask::GENERIC_HIDE) {
                return Err(CbRuleCreationFailure::NetworkGenerichideUnsupported);
            }
            if v.mask.contains(NetworkFilterMask::BAD_FILTER) {
                return Err(CbRuleCreationFailure::NetworkBadFilterUnsupported);
            }
            if v.is_csp() {
                return Err(CbRuleCreationFailure::NetworkCspUnsupported);
            }
            if v.mask.contains(NetworkFilterMask::IS_COMPLETE_REGEX) {
                return Err(CbRuleCreationFailure::FullRegexUnsupported);
            }
            if v.is_removeparam() {
                return Err(CbRuleCreationFailure::NetworkRemoveparamUnsupported);
            }

            // A rule that applies to both parties (or to neither) gets no load-type restriction.
            let load_type = if v
                .mask
                .contains(NetworkFilterMask::THIRD_PARTY | NetworkFilterMask::FIRST_PARTY)
            {
                vec![]
            } else if v.mask.contains(NetworkFilterMask::THIRD_PARTY) {
                vec![CbLoadType::ThirdParty]
            } else if v.mask.contains(NetworkFilterMask::FIRST_PARTY) {
                vec![CbLoadType::FirstParty]
            } else {
                vec![]
            };

            // Build the `url-filter` regex from the pattern and the optional hostname anchor.
            let url_filter = match (v.filter, v.hostname) {
                (crate::filters::network::FilterPart::AnyOf(_), _) => {
                    return Err(CbRuleCreationFailure::OptimizedRulesUnsupported)
                }
                (crate::filters::network::FilterPart::Simple(part), Some(hostname)) => {
                    let without_trailing_separator = TRAILING_SEPARATOR.replace_all(&part, "");
                    let escaped_special_chars =
                        SPECIAL_CHARS.replace_all(&without_trailing_separator, r##"\$1"##);
                    let with_fixed_wildcards =
                        REPLACE_WILDCARDS.replace_all(&escaped_special_chars, ".*");

                    // Prefix: any scheme, an optional `//`, optional subdomain labels, then the
                    // escaped hostname.
                    let mut url_filter = format!(
                        "^[^:]+:(//)?([^/]+\\.)?{}",
                        SPECIAL_CHARS.replace_all(&hostname, r##"\$1"##)
                    );

                    if v.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX) {
                        url_filter += ".*";
                    }

                    url_filter += &with_fixed_wildcards;

                    if v.mask.contains(NetworkFilterMask::IS_RIGHT_ANCHOR) {
                        url_filter += "$";
                    }

                    url_filter
                }
                (crate::filters::network::FilterPart::Simple(part), None) => {
                    let without_trailing_separator = TRAILING_SEPARATOR.replace_all(&part, "");
                    let escaped_special_chars =
                        SPECIAL_CHARS.replace_all(&without_trailing_separator, r##"\$1"##);
                    let with_fixed_wildcards =
                        REPLACE_WILDCARDS.replace_all(&escaped_special_chars, ".*");
                    let mut url_filter = if v.mask.contains(NetworkFilterMask::IS_LEFT_ANCHOR) {
                        format!("^{}", with_fixed_wildcards)
                    } else {
                        // Unanchored pattern: restrict the scheme only when the rule is limited to
                        // one of http, https or websocket.
                        let scheme_part = if v
                            .mask
                            .contains(NetworkFilterMask::FROM_HTTP | NetworkFilterMask::FROM_HTTPS)
                        {
                            ""
                        } else if v.mask.contains(NetworkFilterMask::FROM_HTTP) {
                            "^http://.*"
                        } else if v.mask.contains(NetworkFilterMask::FROM_HTTPS) {
                            "^https://.*"
                        } else if v.mask.contains(NetworkFilterMask::FROM_WEBSOCKET) {
                            "^wss?://.*"
                        } else {
                            unreachable!("Invalid scheme information");
                        };

                        format!("{}{}", scheme_part, with_fixed_wildcards)
                    };

                    if v.mask.contains(NetworkFilterMask::IS_RIGHT_ANCHOR) {
                        url_filter += "$";
                    }

                    url_filter
                }
                (crate::filters::network::FilterPart::Empty, Some(hostname)) => {
                    let escaped_special_chars = SPECIAL_CHARS.replace_all(&hostname, r##"\$1"##);
                    format!("^[^:]+:(//)?([^/]+\\.)?{}", escaped_special_chars)
                }
                (crate::filters::network::FilterPart::Empty, None) => {
                    // No pattern and no hostname: the filter matches on scheme alone.
                    if v.mask.contains(NetworkFilterMask::FROM_HTTP | NetworkFilterMask::FROM_HTTPS) {
                        "^https?://"
                    } else if v.mask.contains(NetworkFilterMask::FROM_HTTP) {
                        "^http://"
                    } else if v.mask.contains(NetworkFilterMask::FROM_HTTPS) {
                        "^https://"
                    } else if v.mask.contains(NetworkFilterMask::FROM_WEBSOCKET) {
                        "^wss?://"
                    } else {
                        unreachable!("Invalid scheme information");
                    }.to_string()
                }
            };

            // The domain lists are re-read from the raw rule text, not from `opt_domains` or
            // `opt_not_domains`.
            let (if_domain, unless_domain) = if v.opt_domains.is_some()
                || v.opt_not_domains.is_some()
            {
                let mut if_domain = vec![];
                let mut unless_domain = vec![];

                // Unwraps are okay here - any rules with opt_domains or opt_not_domains must have
                // an options section delimited by a '$' character, followed by a `domain=` option.
                // NOTE(review): the options are taken from the first '$' in the line; confirm that
                // this agrees with how the filter parser locates the options section.
                let opts = &raw_line[find_char(b'$', raw_line.as_bytes()).unwrap() + "$".len()..];
                let domain_start_index = if let Some(index) =
                    memmem::find(opts.as_bytes(), b"domain=")
                {
                    index
                } else {
                    // No literal `domain=` in the options, so the constraint was presumably
                    // written with the `from=` alias.
                    return Err(CbRuleCreationFailure::FromNotSupported);
                };
                let domains_start = &opts[domain_start_index + "domain=".len()..];
                // The domain list ends at the next ',' (the start of the next option), if any.
                let domains = if let Some(comma) = find_char(b',', domains_start.as_bytes()) {
                    &domains_start[..comma]
                } else {
                    domains_start
                }.split('|');

                domains.for_each(|domain| {
                    // A leading '~' marks an excluded domain.
                    let (collection, domain) = if let Some(domain_stripped) = domain.strip_prefix('~') {
                        (&mut unless_domain, domain_stripped)
                    } else {
                        (&mut if_domain, domain)
                    };

                    let lowercase = domain.to_lowercase();
                    let normalized_domain = if lowercase.is_ascii() {
                        lowercase
                    } else {
                        // The network filter has already parsed successfully, so this should be
                        // safe
                        idna::domain_to_ascii(&lowercase).unwrap()
                    };

                    // The `*` prefix makes the entry match the domain and its subdomains.
                    collection.push(format!("*{}", normalized_domain));
                });

                (non_empty(if_domain), non_empty(unless_domain))
            } else {
                (None, None)
            };

            if if_domain.is_some() && unless_domain.is_some() {
                return Err(CbRuleCreationFailure::UnlessAndIfDomainTogetherUnsupported);
            }

            // Exception rules become `ignore-previous-rules`; everything else blocks.
            let blocking_type = if v.mask.contains(NetworkFilterMask::IS_EXCEPTION) {
                CbType::IgnorePreviousRules
            } else {
                CbType::Block
            };

            // A rule that applies to all network types needs no `resource-type` restriction.
            let resource_type = if v.mask.contains(NetworkFilterMask::FROM_NETWORK_TYPES) {
                None
            } else {
                let mut types = HashSet::new();
                let mut unsupported_flags = NetworkFilterMask::empty();
                // Two-argument form: record the mapped resource type when the flag is set.
                // One-argument form: record the flag itself as unsupported when it is set.
                macro_rules! push_if_flag {
                    ($flag:ident, $target:ident) => {
                        if v.mask.contains(NetworkFilterMask::$flag) {
                            types.insert(CbResourceType::$target);
                        }
                    };
                    ($flag:ident) => {
                        if v.mask.contains(NetworkFilterMask::$flag) {
                            unsupported_flags |= NetworkFilterMask::$flag;
                        }
                    };
                }
                push_if_flag!(FROM_IMAGE, Image);
                push_if_flag!(FROM_MEDIA, Media);
                push_if_flag!(FROM_OBJECT);
                push_if_flag!(FROM_OTHER);
                push_if_flag!(FROM_PING);
                push_if_flag!(FROM_SCRIPT, Script);
                push_if_flag!(FROM_STYLESHEET, StyleSheet);
                push_if_flag!(FROM_SUBDOCUMENT, Document);
                push_if_flag!(FROM_WEBSOCKET);
                push_if_flag!(FROM_XMLHTTPREQUEST, Raw);
                push_if_flag!(FROM_FONT, Font);
                // TODO - Popup, Document when implemented

                // Fail only when nothing supported remains; otherwise the unsupported flags are
                // dropped.
                if !unsupported_flags.is_empty() && types.is_empty() {
                    return Err(CbRuleCreationFailure::NoSupportedNetworkOptions(
                        unsupported_flags,
                    ));
                }

                // NOTE(review): when none of the flags checked above is set, `types` is empty and
                // an empty `resource-type` list is emitted - confirm this case cannot occur or is
                // intended.
                Some(types)
            };

            let url_filter_is_case_sensitive = if v.mask.contains(NetworkFilterMask::MATCH_CASE) {
                Some(true)
            } else {
                None
            };

            let single_rule = CbRule {
                action: CbAction {
                    typ: blocking_type,
                    selector: None,
                },
                trigger: CbTrigger {
                    url_filter,
                    load_type,
                    if_domain,
                    unless_domain,
                    resource_type,
                    url_filter_is_case_sensitive,
                    ..Default::default()
                },
            };

            if !single_rule.is_ascii() {
                return Err(CbRuleCreationFailure::RuleContainsNonASCII);
            }

            // With several resource types including `Document` and no load type, emit two rules:
            // one without `Document`, and one for `Document` only, limited to third-party loads.
            if let Some(resource_types) = &single_rule.trigger.resource_type {
                if resource_types.len() > 1
                    && resource_types.contains(&CbResourceType::Document)
                    && single_rule.trigger.load_type.is_empty()
                {
                    let mut non_doc_types = resource_types.clone();
                    non_doc_types.remove(&CbResourceType::Document);
                    let rule_clone = single_rule.clone();
                    let non_doc_rule = CbRule {
                        trigger: CbTrigger {
                            resource_type: Some(non_doc_types),
                            ..rule_clone.trigger
                        },
                        ..rule_clone
                    };
                    let mut doc_type = HashSet::new();
                    doc_type.insert(CbResourceType::Document);
                    let just_doc_rule = CbRule {
                        trigger: CbTrigger {
                            resource_type: Some(doc_type),
                            load_type: vec![CbLoadType::ThirdParty],
                            ..single_rule.trigger
                        },
                        ..single_rule
                    };

                    return Ok(Self::SplitDocument(non_doc_rule, just_doc_rule));
                }
            }

            Ok(Self::SingleRule(single_rule))
        } else {
            Err(CbRuleCreationFailure::NeedsDebugMode)
        }
    }
}

/// Converts a cosmetic filter into a `css-display-none` content blocking rule.
///
/// # Errors
///
/// Returns a [`CbRuleCreationFailure`] when:
/// - the filter has a custom action or a scriptlet injection;
/// - it has no `raw_line` (`NeedsDebugMode`);
/// - it uses entity locations;
/// - it has both included and excluded hostnames;
/// - the resulting rule contains non-ASCII text.
impl TryFrom<CosmeticFilter> for CbRule {
    type Error = CbRuleCreationFailure;

    fn try_from(v: CosmeticFilter) -> Result<Self, Self::Error> {
        use crate::filters::cosmetic::{CosmeticFilterLocationType, CosmeticFilterMask};

        if v.action.is_some() {
            return Err(CbRuleCreationFailure::CosmeticActionRulesNotSupported);
        }
        if v.mask.contains(CosmeticFilterMask::SCRIPT_INJECT) {
            return Err(CbRuleCreationFailure::ScriptletInjectionsNotSupported);
        }

        if let Some(raw_line) = v.raw_line {
            let mut hostnames_vec = vec![];
            let mut not_hostnames_vec = vec![];

            let mut any_entities = false;

            // Unwrap is okay here - cosmetic rules must have a '#' character
            let sharp_index = find_char(b'#', raw_line.as_bytes()).unwrap();
            // Hostnames that fail IDNA conversion are skipped without reporting an error.
            // NOTE(review): if every hostname is skipped this way the rule presumably ends up
            // without any domain restriction - confirm that is acceptable.
            CosmeticFilter::locations_before_sharp(&raw_line, sharp_index).for_each(
                |(location_type, location)| match location_type {
                    CosmeticFilterLocationType::Entity => any_entities = true,
                    CosmeticFilterLocationType::NotEntity => any_entities = true,
                    CosmeticFilterLocationType::Hostname => {
                        if let Ok(encoded) = idna::domain_to_ascii(location) {
                            hostnames_vec.push(encoded);
                        }
                    }
                    CosmeticFilterLocationType::NotHostname => {
                        if let Ok(encoded) = idna::domain_to_ascii(location) {
                            not_hostnames_vec.push(encoded);
                        }
                    }
                },
            );

            if any_entities {
                return Err(CbRuleCreationFailure::CosmeticEntitiesUnsupported);
            }

            let hostnames_vec = non_empty(hostnames_vec);
            let not_hostnames_vec = non_empty(not_hostnames_vec);

            if hostnames_vec.is_some() && not_hostnames_vec.is_some() {
                return Err(CbRuleCreationFailure::UnlessAndIfDomainTogetherUnsupported);
            }

            // Which list becomes `unless-domain` and which `if-domain` depends on whether the
            // filter is an unhide rule.
            let (unless_domain, if_domain) =
if v.mask.contains(CosmeticFilterMask::UNHIDE) { (hostnames_vec, not_hostnames_vec) } else { (not_hostnames_vec, hostnames_vec) }; let rule = Self { action: CbAction { typ: CbType::CssDisplayNone, selector: Some(v.selector), }, trigger: CbTrigger { url_filter: ".*".to_string(), if_domain, unless_domain, ..Default::default() }, }; if !rule.is_ascii() { return Err(CbRuleCreationFailure::RuleContainsNonASCII); } Ok(rule) } else { Err(CbRuleCreationFailure::NeedsDebugMode) } } } #[cfg(test)] mod ab2cb_tests { use super::*; fn test_from_abp(abp_rule: &str, cb: &str) { let filter = crate::lists::parse_filter(abp_rule, true, Default::default()) .expect("Rule under test could not be parsed"); assert_eq!( CbRuleEquivalent::try_from(filter) .unwrap() .into_iter() .collect::>(), serde_json::from_str::>(cb) .expect("content blocking rule under test could not be deserialized") ); } #[test] fn ad_tests() { test_from_abp( "&ad_box_", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "&ad_box_" } }]"####, ); test_from_abp( "&ad_channel=", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "&ad_channel=" } }]"####, ); test_from_abp( "+advertorial.", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "\\+advertorial\\." } }]"####, ); test_from_abp( "&prvtof=*&poru=", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "&prvtof=.*&poru=" } }]"####, ); test_from_abp( "-ad-180x150px.", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "-ad-180x150px\\." } }]"####, ); test_from_abp( "://findnsave.*.*/api/groupon.json?", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "://findnsave\\..*\\..*/api/groupon\\.json\\?" 
} }]"####, ); test_from_abp( "|https://$script,third-party,domain=tamilrockers.ws", r####"[{ "action": { "type": "block" }, "trigger": { "if-domain": ["*tamilrockers.ws"], "load-type": ["third-party"], "resource-type": ["script"], "url-filter": "^https://" } }]"####, ); test_from_abp("||com/banners/$image,object,subdocument,domain=~pingdom.com|~thetvdb.com|~tooltrucks.com", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?com/banners/", "unless-domain": [ "*pingdom.com", "*thetvdb.com", "*tooltrucks.com" ], "resource-type": [ "image" ] } }, { "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?com/banners/", "unless-domain": [ "*pingdom.com", "*thetvdb.com", "*tooltrucks.com" ], "resource-type": [ "document" ], "load-type": [ "third-party" ] }, "action": { "type": "block" } }]"####); test_from_abp( "$image,third-party,xmlhttprequest,domain=rd.com", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^https?://", "if-domain": [ "*rd.com" ], "resource-type": [ "image", "raw" ], "load-type": [ "third-party" ] } }]"####, ); test_from_abp( "|https://r.i.ua^", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^https://r\\.i\\.ua" } }]"####, ); test_from_abp( "|ws://$domain=4shared.com", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^wss?://", "if-domain": [ "*4shared.com" ] } }]"####, ); } #[test] fn element_hiding_tests() { test_from_abp( "###A9AdsMiddleBoxTop", r####"[{ "action": { "type": "css-display-none", "selector": "#A9AdsMiddleBoxTop" }, "trigger": { "url-filter": ".*" } }]"####, ); test_from_abp( "thedailygreen.com#@##AD_banner", r####"[{ "action": { "type": "css-display-none", "selector": "#AD_banner" }, "trigger": { "url-filter": ".*", "unless-domain": [ "thedailygreen.com" ] } }]"####, ); test_from_abp( "sprouts.com,tbns.com.au#@##AdImage", r####"[{ "action": { "type": "css-display-none", "selector": "#AdImage" }, "trigger": { "url-filter": ".*", "unless-domain": 
[ "sprouts.com", "tbns.com.au" ] } }]"####, ); test_from_abp( r#"santander.co.uk#@#a[href^="http://ad-emea.doubleclick.net/"]"#, r####"[{ "action": { "type": "css-display-none", "selector": "a[href^=\"http://ad-emea.doubleclick.net/\"]" }, "trigger": { "url-filter": ".*", "unless-domain": [ "santander.co.uk" ] } }]"####, ); test_from_abp( "search.safefinder.com,search.snapdo.com###ABottomD", r####"[{ "action": { "type": "css-display-none", "selector": "#ABottomD" }, "trigger": { "url-filter": ".*", "if-domain": [ "search.safefinder.com", "search.snapdo.com" ] } }]"####, ); test_from_abp( r#"tweakguides.com###adbar > br + p[style="text-align: center"] + p[style="text-align: center"]"#, r####"[{ "action": { "type": "css-display-none", "selector": "#adbar > br + p[style=\"text-align: center\"] + p[style=\"text-align: center\"]" }, "trigger": { "url-filter": ".*", "if-domain": [ "tweakguides.com" ] } }]"####, ); } /* TODO - `$popup` is currently unsupported by NetworkFilter #[test] fn popup_tests() { test_from_abp("||admngronline.com^$popup,third-party", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^https?://admngronline\\.com(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x60\\x7B-\\x7F]|$)", "load-type": [ "third-party" ], "resource-type": [ "popup" ] } }]"####); test_from_abp("||bet365.com^*affiliate=$popup", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^https?://bet365\\.com(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x60\\x7B-\\x7F]|$).*affiliate=", "resource-type": [ "popup" ] } }]"####); } */ #[test] fn third_party() { test_from_abp( "||007-gateway.com^$third-party", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?007-gateway\\.com", "load-type": [ "third-party" ] } }]"####, ); test_from_abp( "||allestörungen.at^$third-party", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?xn--allestrungen-9ib\\.at", "load-type": [ 
"third-party" ] } }]"####, ); test_from_abp( "||anet*.tradedoubler.com^$third-party", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?anet.*\\.tradedoubler\\.com", "load-type": [ "third-party" ] } }]"####, ); test_from_abp("||doubleclick.net^$third-party,domain=3news.co.nz|92q.com|abc-7.com|addictinggames.com|allbusiness.com|allthingsd.com|bizjournals.com|bloomberg.com|bnn.ca|boom92houston.com|boom945.com|boomphilly.com|break.com|cbc.ca|cbs19.tv|cbs3springfield.com|cbsatlanta.com|cbslocal.com|complex.com|dailymail.co.uk|darkhorizons.com|doubleviking.com|euronews.com|extratv.com|fandango.com|fox19.com|fox5vegas.com|gorillanation.com|hawaiinewsnow.com|hellobeautiful.com|hiphopnc.com|hot1041stl.com|hothiphopdetroit.com|hotspotatl.com|hulu.com|imdb.com|indiatimes.com|indyhiphop.com|ipowerrichmond.com|joblo.com|kcra.com|kctv5.com|ketv.com|koat.com|koco.com|kolotv.com|kpho.com|kptv.com|ksat.com|ksbw.com|ksfy.com|ksl.com|kypost.com|kysdc.com|live5news.com|livestation.com|livestream.com|metro.us|metronews.ca|miamiherald.com|my9nj.com|myboom1029.com|mycolumbusmagic.com|mycolumbuspower.com|myfoxdetroit.com|myfoxorlando.com|myfoxphilly.com|myfoxphoenix.com|myfoxtampabay.com|nbcrightnow.com|neatorama.com|necn.com|neopets.com|news.com.au|news4jax.com|newsone.com|nintendoeverything.com|oldschoolcincy.com|own3d.tv|pagesuite-professional.co.uk|pandora.com|player.theplatform.com|ps3news.com|radio.com|radionowindy.com|rottentomatoes.com|sbsun.com|shacknews.com|sk-gaming.com|ted.com|thebeatdfw.com|theboxhouston.com|theglobeandmail.com|timesnow.tv|tv2.no|twitch.tv|universalsports.com|ustream.tv|wapt.com|washingtonpost.com|wate.com|wbaltv.com|wcvb.com|wdrb.com|wdsu.com|wflx.com|wfmz.com|wfsb.com|wgal.com|whdh.com|wired.com|wisn.com|wiznation.com|wlky.com|wlns.com|wlwt.com|wmur.com|wnem.com|wowt.com|wral.com|wsj.com|wsmv.com|wsvn.com|wtae.com|wthr.com|wxii12.com|wyff4.com|yahoo.com|youtube.com|zhiphopcleveland.com", r####"[{ "action": { "type": 
"block" }, "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?doubleclick\\.net", "load-type": [ "third-party" ], "if-domain": [ "*3news.co.nz", "*92q.com", "*abc-7.com", "*addictinggames.com", "*allbusiness.com", "*allthingsd.com", "*bizjournals.com", "*bloomberg.com", "*bnn.ca", "*boom92houston.com", "*boom945.com", "*boomphilly.com", "*break.com", "*cbc.ca", "*cbs19.tv", "*cbs3springfield.com", "*cbsatlanta.com", "*cbslocal.com", "*complex.com", "*dailymail.co.uk", "*darkhorizons.com", "*doubleviking.com", "*euronews.com", "*extratv.com", "*fandango.com", "*fox19.com", "*fox5vegas.com", "*gorillanation.com", "*hawaiinewsnow.com", "*hellobeautiful.com", "*hiphopnc.com", "*hot1041stl.com", "*hothiphopdetroit.com", "*hotspotatl.com", "*hulu.com", "*imdb.com", "*indiatimes.com", "*indyhiphop.com", "*ipowerrichmond.com", "*joblo.com", "*kcra.com", "*kctv5.com", "*ketv.com", "*koat.com", "*koco.com", "*kolotv.com", "*kpho.com", "*kptv.com", "*ksat.com", "*ksbw.com", "*ksfy.com", "*ksl.com", "*kypost.com", "*kysdc.com", "*live5news.com", "*livestation.com", "*livestream.com", "*metro.us", "*metronews.ca", "*miamiherald.com", "*my9nj.com", "*myboom1029.com", "*mycolumbusmagic.com", "*mycolumbuspower.com", "*myfoxdetroit.com", "*myfoxorlando.com", "*myfoxphilly.com", "*myfoxphoenix.com", "*myfoxtampabay.com", "*nbcrightnow.com", "*neatorama.com", "*necn.com", "*neopets.com", "*news.com.au", "*news4jax.com", "*newsone.com", "*nintendoeverything.com", "*oldschoolcincy.com", "*own3d.tv", "*pagesuite-professional.co.uk", "*pandora.com", "*player.theplatform.com", "*ps3news.com", "*radio.com", "*radionowindy.com", "*rottentomatoes.com", "*sbsun.com", "*shacknews.com", "*sk-gaming.com", "*ted.com", "*thebeatdfw.com", "*theboxhouston.com", "*theglobeandmail.com", "*timesnow.tv", "*tv2.no", "*twitch.tv", "*universalsports.com", "*ustream.tv", "*wapt.com", "*washingtonpost.com", "*wate.com", "*wbaltv.com", "*wcvb.com", "*wdrb.com", "*wdsu.com", "*wflx.com", "*wfmz.com", 
"*wfsb.com", "*wgal.com", "*whdh.com", "*wired.com", "*wisn.com", "*wiznation.com", "*wlky.com", "*wlns.com", "*wlwt.com", "*wmur.com", "*wnem.com", "*wowt.com", "*wral.com", "*wsj.com", "*wsmv.com", "*wsvn.com", "*wtae.com", "*wthr.com", "*wxii12.com", "*wyff4.com", "*yahoo.com", "*youtube.com", "*zhiphopcleveland.com" ] } }]"####); test_from_abp("||dt00.net^$third-party,domain=~marketgid.com|~marketgid.ru|~marketgid.ua|~mgid.com|~thechive.com", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?dt00\\.net", "load-type": [ "third-party" ], "unless-domain": [ "*marketgid.com", "*marketgid.ru", "*marketgid.ua", "*mgid.com", "*thechive.com" ] } }]"####); test_from_abp("||amazonaws.com/newscloud-production/*/backgrounds/$domain=crescent-news.com|daily-jeff.com|recordpub.com|state-journal.com|the-daily-record.com|the-review.com|times-gazette.com", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?amazonaws\\.com/newscloud-production/.*/backgrounds/", "if-domain": [ "*crescent-news.com", "*daily-jeff.com", "*recordpub.com", "*state-journal.com", "*the-daily-record.com", "*the-review.com", "*times-gazette.com" ] } }]"####); test_from_abp( "||d1noellhv8fksc.cloudfront.net^", r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?d1noellhv8fksc\\.cloudfront\\.net" } }]"####, ); } #[test] fn whitelist() { test_from_abp( "@@||google.com/recaptcha/$domain=mediafire.com", r####"[{ "action": { "type": "ignore-previous-rules" }, "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?google\\.com/recaptcha/", "if-domain": [ "*mediafire.com" ] } }]"####, ); test_from_abp( "@@||ad4.liverail.com/?compressed|$domain=majorleaguegaming.com|pbs.org|wikihow.com", r####"[{ "action": { "type": "ignore-previous-rules" }, "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?ad4\\.liverail\\.com/\\?compressed$", "if-domain": [ "*majorleaguegaming.com", "*pbs.org", "*wikihow.com" ] } 
}]"####, ); test_from_abp( "@@||googletagservices.com/tag/js/gpt.js$domain=allestoringen.nl|allestörungen.at", r####"[{ "action": { "type": "ignore-previous-rules" }, "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?googletagservices\\.com/tag/js/gpt\\.js", "if-domain": [ "*allestoringen.nl", "*xn--allestrungen-9ib.at" ] } }]"####, ); test_from_abp( "@@||advertising.autotrader.co.uk^$~third-party", r####"[{ "action": { "type": "ignore-previous-rules" }, "trigger": { "load-type": [ "first-party" ], "url-filter": "^[^:]+:(//)?([^/]+\\.)?advertising\\.autotrader\\.co\\.uk" } }]"####, ); test_from_abp( "@@||advertising.racingpost.com^$image,script,stylesheet,~third-party,xmlhttprequest", r####"[{ "action": { "type": "ignore-previous-rules" }, "trigger": { "load-type": [ "first-party" ], "url-filter": "^[^:]+:(//)?([^/]+\\.)?advertising\\.racingpost\\.com", "resource-type": [ "image", "style-sheet", "script", "raw" ] } }]"####, ); } #[test] fn test_ignore_previous_fp_documents() { assert_eq!( vec![ignore_previous_fp_documents()], serde_json::from_str::>( r####"[{ "trigger":{ "url-filter":".*", "resource-type":["document"], "load-type":["first-party"] }, "action":{"type":"ignore-previous-rules"} }]"#### ) .expect("content blocking rule under test could not be deserialized") ); } #[test] fn escape_literal_backslashes() { test_from_abp( r#"||gamer.no/?module=Tumedia\DFProxy\Modules^"#, r####"[{ "action": { "type": "block" }, "trigger": { "url-filter": "^[^:]+:(//)?([^/]+\\.)?gamer\\.no/\\?module=tumedia\\\\dfproxy\\\\modules" } }]"####, ); } } #[cfg(test)] mod filterset_tests { use crate::lists::{FilterSet, ParseOptions, RuleTypes}; const FILTER_LIST: &[&str] = &[ "||example.com^$script", "||test.net^$image,third-party", "/trackme.js^$script", "example.com##.ad-banner", "##.ad-640x480", "##p.sponsored", ]; #[test] fn convert_all_rules() -> Result<(), ()> { let mut set = FilterSet::new(true); set.add_filters(FILTER_LIST, Default::default()); let (cb_rules, used_rules) = 
set.into_content_blocking()?; assert_eq!(used_rules, FILTER_LIST); // All 6 rules plus `ignore_previous_fp_documents()` assert_eq!(cb_rules.len(), 7); Ok(()) } #[test] fn convert_network_only() -> Result<(), ()> { let parse_opts = ParseOptions { rule_types: RuleTypes::NetworkOnly, ..Default::default() }; let mut set = FilterSet::new(true); set.add_filters(FILTER_LIST, parse_opts); let (cb_rules, used_rules) = set.into_content_blocking()?; assert_eq!(used_rules, &FILTER_LIST[0..3]); // 3 network rules plus `ignore_previous_fp_documents()` assert_eq!(cb_rules.len(), 4); Ok(()) } #[test] fn convert_cosmetic_only() -> Result<(), ()> { let parse_opts = ParseOptions { rule_types: RuleTypes::CosmeticOnly, ..Default::default() }; let mut set = FilterSet::new(true); set.add_filters(FILTER_LIST, parse_opts); let (cb_rules, used_rules) = set.into_content_blocking()?; assert_eq!(used_rules, &FILTER_LIST[3..6]); // 3 cosmetic rules only assert_eq!(cb_rules.len(), 3); Ok(()) } #[test] fn ignore_unsupported_rules() -> Result<(), ()> { let mut set = FilterSet::new(true); set.add_filters(FILTER_LIST, Default::default()); set.add_filters([ // unicode characters "||rgmechanics.info/uploads/660х90_", "||insaattrendy.com/Upload/bükerbanner*.jpg", // from domain "/siropu/am/core.min.js$script,important,from=~audi-sport.net|~hifiwigwam.com", // leading zero-width space r#"​##a[href^="https://www.g2fame.com/"] > img"#, ], Default::default()); let (cb_rules, used_rules) = set.into_content_blocking()?; assert_eq!(used_rules, FILTER_LIST); // All 6 rules plus `ignore_previous_fp_documents()` assert_eq!(cb_rules.len(), 7); Ok(()) } #[test] fn punycode_if_domains() -> Result<(), ()> { let list = [ "smskaraborg.se,örnsköldsviksgymnasium.se,mojligheternashusab.se##.env-modal-dialog__backdrop", ]; let mut set = FilterSet::new(true); set.add_filters(&list, Default::default()); let (cb_rules, used_rules) = set.into_content_blocking()?; assert_eq!(used_rules, list); assert_eq!(cb_rules.len(), 1); 
assert!(cb_rules[0].trigger.if_domain.is_some()); assert_eq!(cb_rules[0].trigger.if_domain.as_ref().unwrap(), &["smskaraborg.se", "xn--rnskldsviksgymnasium-29be.se", "mojligheternashusab.se"]); Ok(()) } }
adblock-0.8.12/src/cosmetic_filter_cache.rs000064400000000000000000001304601046102023000167520ustar 00000000000000
//! Provides behavior related to cosmetic filtering - that is, modifying a page's contents after
//! it's been loaded into a browser. This is primarily used to hide or clean up unwanted page
//! elements that are served inline with the rest of the first-party content from a page, but can
//! also be used to inject JavaScript "scriptlets" that intercept and modify the behavior of
//! scripts on the page at runtime.
//!
//! The primary API exposed by this module is the `CosmeticFilterCache` struct, which stores
//! cosmetic filters and allows them to be queried efficiently at runtime for any which may be
//! relevant to a particular page.

use crate::filters::cosmetic::CosmeticFilter;
use crate::filters::cosmetic::CosmeticFilterMask;
use crate::resources::{PermissionMask, ResourceStorage};
use crate::utils::Hash;

use std::collections::{HashMap, HashSet};

use serde::{Deserialize, Serialize};

/// Contains cosmetic filter information intended to be used on a particular URL.
#[derive(Debug, PartialEq, Eq, Deserialize, Serialize)]
pub struct UrlSpecificResources {
    /// `hide_selectors` is a set of any CSS selector on the page that should be hidden, i.e.
    /// styled as `{ display: none !important; }`.
    pub hide_selectors: HashSet<String>,
    /// `style_selectors` is a map of CSS selectors on the page to respective non-hide style rules,
    /// i.e. any required styles other than `display: none`.
    pub style_selectors: HashMap<String, Vec<String>>,
    /// `remove_selectors` is a set of any CSS selector on the page that should be removed from the
    /// DOM.
    pub remove_selectors: HashSet<String>,
    /// `remove_attrs` is a map of CSS selectors on the page to respective HTML attributes that
    /// should be removed from matching elements.
    pub remove_attrs: HashMap<String, Vec<String>>,
    /// `remove_classes` is a map of CSS selectors on the page to respective CSS classes that
    /// should be removed from matching elements.
    pub remove_classes: HashMap<String, Vec<String>>,
    /// `exceptions` is a set of any class or id CSS selectors that should not have generic rules
    /// applied. In practice, these should be passed to `hidden_class_id_selectors` and not used
    /// otherwise.
    pub exceptions: HashSet<String>,
    /// `injected_script` is the Javascript code for any scriptlets that should be injected into
    /// the page.
    pub injected_script: String,
    /// `generichide` is set to true if there is a corresponding `$generichide` exception network
    /// filter. If so, the page should not query for additional generic rules using
    /// `hidden_class_id_selectors`.
    pub generichide: bool,
}

impl UrlSpecificResources {
    /// Returns a value with every collection empty, no injected script, and `generichide`
    /// disabled.
    pub fn empty() -> Self {
        Self {
            hide_selectors: HashSet::new(),
            style_selectors: HashMap::new(),
            remove_selectors: HashSet::new(),
            remove_attrs: HashMap::new(),
            remove_classes: HashMap::new(),
            exceptions: HashSet::new(),
            injected_script: String::new(),
            generichide: false,
        }
    }
}

/// The main engine driving cosmetic filtering.
///
/// There are two primary methods that should be considered when using this in a browser:
/// `hidden_class_id_selectors`, and `hostname_cosmetic_resources`.
///
/// Note that cosmetic filtering is imprecise and that this structure is intentionally designed for
/// efficient querying in the context of a browser, optimizing for low memory usage in the page
/// context and good performance. It is *not* designed to provide a 100% accurate report of what
/// will be blocked on any particular page, although when used correctly, all provided rules and
/// scriptlets should be safe to apply.
pub(crate) struct CosmeticFilterCache {
    /// Rules that are just the CSS class of an element to be hidden on all sites, e.g. `##.ad`.
    pub(crate) simple_class_rules: HashSet<String>,
    /// Rules that are just the CSS id of an element to be hidden on all sites, e.g. `###banner`.
    pub(crate) simple_id_rules: HashSet<String>,
    /// Rules that are the CSS selector of an element to be hidden on all sites, starting with a
    /// class, e.g. `##.ad image`.
    pub(crate) complex_class_rules: HashMap<String, Vec<String>>,
    /// Rules that are the CSS selector of an element to be hidden on all sites, starting with an
    /// id, e.g. `###banner > .text a`.
    pub(crate) complex_id_rules: HashMap<String, Vec<String>>,

    /// Rules that only apply to (or are excepted on) particular hostnames or entities.
    pub(crate) specific_rules: HostnameRuleDb,

    /// Rules that are the CSS selector of an element to be hidden on all sites that do not fit
    /// into any of the class or id buckets above, e.g. `##a[href="https://malware.com"]`
    pub(crate) misc_generic_selectors: HashSet<String>,
}

impl CosmeticFilterCache {
    /// Creates a cache containing no rules.
    pub fn new() -> Self {
        Self {
            simple_class_rules: HashSet::new(),
            simple_id_rules: HashSet::new(),
            complex_class_rules: HashMap::new(),
            complex_id_rules: HashMap::new(),

            specific_rules: HostnameRuleDb::default(),

            misc_generic_selectors: HashSet::new(),
        }
    }

    /// Creates a cache from `rules`, pre-sizing the generic rule containers from the number of
    /// rules before adding each one with `add_filter`.
    pub fn from_rules(rules: Vec<CosmeticFilter>) -> Self {
        let mut self_ = Self {
            simple_class_rules: HashSet::with_capacity(rules.len() / 2),
            simple_id_rules: HashSet::with_capacity(rules.len() / 2),
            complex_class_rules: HashMap::with_capacity(rules.len() / 2),
            complex_id_rules: HashMap::with_capacity(rules.len() / 2),

            specific_rules: HostnameRuleDb::default(),

            misc_generic_selectors: HashSet::with_capacity(rules.len() / 30),
        };

        for rule in rules {
            self_.add_filter(rule);
        }

        self_
    }

    /// Adds a single rule. A rule with a hostname constraint is stored in `specific_rules`; if
    /// it also yields a `hidden_generic_rule`, that derived rule is stored as generic too. Any
    /// other rule is stored as a generic rule.
    pub fn add_filter(&mut self, rule: CosmeticFilter) {
        if rule.has_hostname_constraint() {
            if let Some(generic_rule) = rule.hidden_generic_rule() {
                self.add_generic_filter(generic_rule);
            }
            self.specific_rules.store_rule(rule);
        } else {
            self.add_generic_filter(rule);
        }
    }

    /// Add a filter, assuming it has already been determined to be a generic rule
    fn add_generic_filter(&mut self, rule: CosmeticFilter) {
        let is_simple = rule.mask.contains(CosmeticFilterMask::IS_SIMPLE);

        if rule.mask.contains(CosmeticFilterMask::IS_CLASS_SELECTOR) {
            // A class rule without a key is dropped, exactly as an id rule without a key is.
            if let Some(key) = rule.key {
                if is_simple {
                    self.simple_class_rules.insert(key);
                } else {
                    self.complex_class_rules
                        .entry(key)
                        .or_default()
                        .push(rule.selector);
                }
            }
        } else if rule.mask.contains(CosmeticFilterMask::IS_ID_SELECTOR) {
            if let Some(key) = rule.key {
                if is_simple {
                    self.simple_id_rules.insert(key);
                } else {
                    self.complex_id_rules
                        .entry(key)
                        .or_default()
                        .push(rule.selector);
                }
            }
        } else {
            self.misc_generic_selectors.insert(rule.selector);
        }
    }

    /// Generic class/id rules are by far the most common type of cosmetic filtering rule, and they
    /// apply to all sites. Rather than injecting all of these rules onto every page, which would
    /// blow up memory usage, we only inject rules based on classes and ids that actually appear on
    /// the page (in practice, a `MutationObserver` is used to identify those elements). We can
    /// include rules like `.a-class div#ads > .advertisement`, keyed by the `.a-class` selector,
    /// since we know that this rule cannot possibly apply unless there is an `.a-class` element on
    /// the page.
    ///
    /// This method returns all of the generic CSS selectors of elements to hide (i.e. with a
    /// `display: none !important` CSS rule) that could possibly be or become relevant to the page
    /// given the new classes and ids that have appeared on the page. It guarantees that it will be
    /// safe to hide those elements on a particular page by taking into account the page's
    /// hostname-specific set of exception rules.
    ///
    /// The exceptions should be returned directly as they appear in the page's
    /// `UrlSpecificResources`. The exceptions, along with the set of already-seen classes and ids,
    /// must be cached externally as the cosmetic filtering subsystem here is designed to be
    /// stateless with regard to active page sessions.
    pub fn hidden_class_id_selectors(
        &self,
        classes: impl IntoIterator<Item = impl AsRef<str>>,
        ids: impl IntoIterator<Item = impl AsRef<str>>,
        exceptions: &HashSet<String>,
    ) -> Vec<String> {
        let mut selectors = vec![];

        for class in classes {
            let class = class.as_ref();
            if self.simple_class_rules.contains(class) {
                let selector = format!(".{}", class);
                if !exceptions.contains(&selector) {
                    selectors.push(selector);
                }
            }
            if let Some(bucket) = self.complex_class_rules.get(class) {
                selectors.extend(
                    bucket
                        .iter()
                        .filter(|sel| !exceptions.contains(*sel))
                        .cloned(),
                );
            }
        }

        for id in ids {
            let id = id.as_ref();
            if self.simple_id_rules.contains(id) {
                let selector = format!("#{}", id);
                if !exceptions.contains(&selector) {
                    selectors.push(selector);
                }
            }
            if let Some(bucket) = self.complex_id_rules.get(id) {
                selectors.extend(
                    bucket
                        .iter()
                        .filter(|sel| !exceptions.contains(*sel))
                        .cloned(),
                );
            }
        }

        selectors
    }

    /// Any rules that can't be handled by `hidden_class_id_selectors` are returned by
    /// `hostname_cosmetic_resources`. As soon as a page navigation is committed, this method
    /// should be queried to get the initial set of cosmetic filtering operations to apply to the
    /// page. This provides any rules specifying elements to hide by selectors that are too complex
    /// to be returned by `hidden_class_id_selectors` (i.e. not directly starting with a class or
    /// id selector, like `div[class*="Ads"]`), or any rule that is only applicable to a particular
    /// hostname or set of hostnames (like `example.com##.a-class`). The first category is always
    /// injected into every page, and makes up a relatively small number of rules in practice.
pub fn hostname_cosmetic_resources( &self, resources: &ResourceStorage, hostname: &str, generichide: bool, ) -> UrlSpecificResources { let domain_str = { let (start, end) = crate::url_parser::get_host_domain(hostname); &hostname[start..end] }; let (request_entities, request_hostnames) = hostname_domain_hashes(hostname, domain_str); let mut specific_hide_selectors = HashSet::new(); let mut style_selectors = HashMap::<_, Vec<_>>::new(); let mut remove_selectors = HashSet::new(); let mut remove_attrs = HashMap::<_, Vec<_>>::new(); let mut remove_classes = HashMap::<_, Vec<_>>::new(); let mut script_injections = HashMap::<&str, PermissionMask>::new(); let mut exceptions = HashSet::new(); let mut except_all_scripts = false; let hashes: Vec<&Hash> = request_entities.iter().chain(request_hostnames.iter()).collect(); fn populate_set(hash: &Hash, source_bin: &HostnameFilterBin, dest_set: &mut HashSet) { if let Some(s) = source_bin.get(hash) { s.iter().for_each(|s| { dest_set.insert(s.to_owned()); }); } } fn populate_map(hash: &Hash, source_bin: &HostnameFilterBin<(String, String)>, dest_map: &mut HashMap>) { if let Some(s) = source_bin.get(hash) { s.iter().for_each(|s| { dest_map.entry(s.0.to_owned()).and_modify(|v| v.push(s.1.to_owned())).or_insert_with(|| vec![s.1.to_owned()]); }); } } for hash in hashes.iter() { populate_set(hash, &self.specific_rules.hide, &mut specific_hide_selectors); populate_set(hash, &self.specific_rules.remove, &mut remove_selectors); // special behavior: `script_injections` doesn't have to own the strings yet, since the // scripts need to be fetched and templated later if let Some(s) = self.specific_rules.inject_script.get(hash) { s.iter().for_each(|(s, mask)| { script_injections.entry(s).and_modify(|entry| *entry |= *mask).or_insert(*mask); }); } populate_map(hash, &self.specific_rules.style, &mut style_selectors); populate_map(hash, &self.specific_rules.remove_attr, &mut remove_attrs); populate_map(hash, &self.specific_rules.remove_class, &mut 
remove_classes); } fn prune_set(hash: &Hash, source_bin: &HostnameFilterBin, dest_set: &mut HashSet) { if let Some(s) = source_bin.get(hash) { s.iter().for_each(|s| { dest_set.remove(s); }); } } fn prune_map(hash: &Hash, source_bin: &HostnameFilterBin<(String, String)>, dest_map: &mut HashMap>) { if let Some(s) = source_bin.get(hash) { s.iter().for_each(|s| { if let Some(v) = dest_map.get_mut(&s.0) { v.retain(|e| e != &s.1); if v.is_empty() { dest_map.remove(&s.0); } } }); } } for hash in hashes.iter() { // special behavior: unhide rules need to go in `exceptions` as well if let Some(s) = self.specific_rules.unhide.get(hash) { s.iter().for_each(|s| { specific_hide_selectors.remove(s); exceptions.insert(s.to_owned()); }); } prune_set(hash, &self.specific_rules.unremove, &mut remove_selectors); // same logic but not using prune_set since strings are unowned, (see above) if let Some(s) = self.specific_rules.uninject_script.get(hash) { for s in s { if s.is_empty() { except_all_scripts = true; script_injections.clear(); } if except_all_scripts { continue; } script_injections.remove(s.as_str()); } } prune_map(hash, &self.specific_rules.unstyle, &mut style_selectors); prune_map(hash, &self.specific_rules.unremove_attr, &mut remove_attrs); prune_map(hash, &self.specific_rules.unremove_class, &mut remove_classes); } let hide_selectors = if generichide { specific_hide_selectors } else { let mut hide_selectors = self .misc_generic_selectors .difference(&exceptions) .cloned() .collect::>(); specific_hide_selectors.into_iter().for_each(|sel| { hide_selectors.insert(sel); }); hide_selectors }; let mut injected_script = String::new(); script_injections.iter().for_each(|(s, mask)| { if let Ok(filled_template) = resources.get_scriptlet_resource(s, *mask) { injected_script += "try {\n"; injected_script += &filled_template; injected_script += "\n} catch ( e ) { }\n"; } }); UrlSpecificResources { hide_selectors, style_selectors, remove_selectors, remove_attrs, remove_classes, 
exceptions, injected_script, generichide, } } }

/// Storage for hostname-specific filter data, bucketed by hostname hash.
///
/// Each hostname-specific filter can be pointed to by several different hostnames, and each
/// hostname can correspond to several different filters. To effectively store and access those
/// filters by hostname, all the non-hostname information for a filter (the `T` payload) is stored
/// in a per-hostname "bucket": the map key is a hostname (or entity) hash, and the value is the
/// list of every payload that was inserted for that hash.
#[derive(Default)]
pub(crate) struct HostnameFilterBin<T>(pub HashMap<Hash, Vec<T>>);

impl<T> HostnameFilterBin<T> {
    /// Appends `filter` to the bucket for `token`, creating the bucket if it does not exist yet.
    pub fn insert(&mut self, token: &Hash, filter: T) {
        // The entry API needs a single hash lookup whether or not the bucket already exists.
        self.0.entry(*token).or_default().push(filter);
    }

    /// Returns the bucket stored for `token`, or `None` if nothing was inserted for it.
    fn get(&self, token: &Hash) -> Option<&Vec<T>> {
        self.0.get(token)
    }
}

/// Holds filter bins categorized by filter type.
#[derive(Default)]
pub(crate) struct HostnameRuleDb {
    /// Simple hostname-specific hide rules, e.g. `example.com##.ad`.
    ///
    /// The parameter is the rule's CSS selector.
    pub hide: HostnameFilterBin<String>,
    /// Simple hostname-specific hide exception rules, e.g. `example.com#@#.ad`.
    ///
    /// The parameter is the rule's CSS selector.
    pub unhide: HostnameFilterBin<String>,
    /// Hostname-specific rules with a scriptlet to inject along with any arguments, e.g.
    /// `example.com##+js(acis, Number.isNan)`.
    ///
    /// The parameter is the contents of the `+js(...)` syntax construct.
    pub inject_script: HostnameFilterBin<(String, PermissionMask)>,
    /// Hostname-specific rules to except a scriptlet to inject along with any arguments, e.g.
    /// `example.com#@#+js(acis, Number.isNan)`.
    ///
    /// The parameter is the contents of the `+js(...)` syntax construct.
    ///
    /// In practice, these rules are extremely rare in filter lists.
    pub uninject_script: HostnameFilterBin<String>,
    /// Simple hostname-specific rules with a remove action, e.g. `example.com##.ad:remove()`.
    ///
    /// The parameter is the rule's CSS selector.
    pub remove: HostnameFilterBin<String>,
    /// Simple hostname-specific remove action exception rules, e.g. `example.com#@#.ad:remove()`.
    ///
    /// The parameter is the rule's CSS selector.
    pub unremove: HostnameFilterBin<String>,
    /// Hostname-specific rules with a custom style for an element, e.g.
    /// `example.com##.ad:style(margin: 0)`.
    ///
    /// The parameters are the rule's selector and its additional style.
    pub style: HostnameFilterBin<(String, String)>,
    /// Hostname-specific exception rules for a custom style for an element, e.g.
    /// `example.com#@#.ad:style(margin: 0)`.
    ///
    /// The parameters are the rule's selector and its additional style.
    ///
    /// In practice, this kind of rule does not appear in filter lists, although it is not
    /// explicitly forbidden according to any syntax documentation.
    pub unstyle: HostnameFilterBin<(String, String)>,
    /// Simple hostname-specific rules with a remove attribute action, e.g.
    /// `example.com##.ad:remove-attr(style)`.
    ///
    /// The parameters are the rule's CSS selector and the attribute to remove.
    pub remove_attr: HostnameFilterBin<(String, String)>,
    /// Simple hostname-specific remove attribute action exception rules, e.g.
    /// `example.com#@#.ad:remove-attr(style)`.
    ///
    /// The parameters are the rule's CSS selector and the attribute to remove.
    pub unremove_attr: HostnameFilterBin<(String, String)>,
    /// Simple hostname-specific rules with a remove class action, e.g.
    /// `example.com##.ad:remove-class(overlay)`.
    ///
    /// The parameters are the rule's CSS selector and the class to remove.
    pub remove_class: HostnameFilterBin<(String, String)>,
    /// Simple hostname-specific remove class action exception rules, e.g.
    /// `example.com#@#.ad:remove-class(overlay)`.
    ///
    /// The parameters are the rule's CSS selector and the class to remove.
    pub unremove_class: HostnameFilterBin<(String, String)>,
}

impl HostnameRuleDb {
    /// Files `rule` into the bins matching its type, once per hostname/entity hash it lists.
    ///
    /// Hashes from `hostnames`/`entities` receive the rule as written; hashes from
    /// `not_hostnames`/`not_entities` receive the negated form of the rule (e.g. a hide rule is
    /// stored as an unhide rule). Rules combining a scriptlet injection with an action are
    /// dropped without being stored.
    pub fn store_rule(&mut self, rule: CosmeticFilter) {
        use crate::filters::cosmetic::CosmeticFilterAction;
        use SpecificFilterType::*;

        let unhide = rule.mask.contains(CosmeticFilterMask::UNHIDE);
        let script_inject = rule.mask.contains(CosmeticFilterMask::SCRIPT_INJECT);

        let selector = rule.selector;

        let kind = match (unhide, script_inject, rule.action) {
            (false, false, None) => Hide(selector),
            (true, false, None) => Unhide(selector),
            (false, true, None) => InjectScript((selector, rule.permission)),
            (true, true, None) => UninjectScript((selector, rule.permission)),
            (false, false, Some(CosmeticFilterAction::Style(s))) => Style((selector, s)),
            (true, false, Some(CosmeticFilterAction::Style(s))) => Unstyle((selector, s)),
            (false, false, Some(CosmeticFilterAction::Remove)) => Remove(selector),
            (true, false, Some(CosmeticFilterAction::Remove)) => Unremove(selector),
            (false, false, Some(CosmeticFilterAction::RemoveClass(c))) => RemoveClass((selector, c)),
            (true, false, Some(CosmeticFilterAction::RemoveClass(c))) => UnremoveClass((selector, c)),
            (false, false, Some(CosmeticFilterAction::RemoveAttr(a))) => RemoveAttr((selector, a)),
            (true, false, Some(CosmeticFilterAction::RemoveAttr(a))) => UnremoveAttr((selector, a)),
            (_, true, Some(_)) => return, // shouldn't be possible
        };

        let tokens_to_insert = rule
            .hostnames
            .unwrap_or_default()
            .into_iter()
            .chain(rule.entities.unwrap_or_default());
        for token in tokens_to_insert {
            self.store(&token, kind.clone());
        }

        let tokens_to_insert_negated = rule
            .not_hostnames
            .unwrap_or_default()
            .into_iter()
            .chain(rule.not_entities.unwrap_or_default());
        let negated = kind.negated();
        for token in tokens_to_insert_negated {
            self.store(&token, negated.clone());
        }
    }

    /// Inserts the payload of `kind` into the bin corresponding to its variant, under `token`.
    fn store(&mut self, token: &Hash, kind: SpecificFilterType) {
        use SpecificFilterType::*;

        match kind {
            Hide(s) => self.hide.insert(token, s),
            Unhide(s) => self.unhide.insert(token, s),
            InjectScript(s) => self.inject_script.insert(token, s),
            // Script exceptions are stored without their permission mask.
            UninjectScript((s, _)) => self.uninject_script.insert(token, s),
            Remove(s) => self.remove.insert(token, s),
            Unremove(s) => self.unremove.insert(token, s),
            Style(s) => self.style.insert(token, s),
            Unstyle(s) => self.unstyle.insert(token, s),
            RemoveAttr(s) => self.remove_attr.insert(token, s),
            UnremoveAttr(s) => self.unremove_attr.insert(token, s),
            RemoveClass(s) => self.remove_class.insert(token, s),
            UnremoveClass(s) => self.unremove_class.insert(token, s),
        }
    }
}

/// Exists to use common logic for binning filters correctly
#[derive(Clone)]
enum SpecificFilterType {
    Hide(String),
    Unhide(String),
    InjectScript((String, PermissionMask)),
    UninjectScript((String, PermissionMask)),
    Remove(String),
    Unremove(String),
    Style((String, String)),
    Unstyle((String, String)),
    RemoveAttr((String, String)),
    UnremoveAttr((String, String)),
    RemoveClass((String, String)),
    UnremoveClass((String, String)),
}

impl SpecificFilterType {
    /// Swaps a rule for its exception counterpart (and vice versa), keeping the payload.
    fn negated(self) -> Self {
        match self {
            Self::Hide(s) => Self::Unhide(s),
            Self::Unhide(s) => Self::Hide(s),
            Self::InjectScript(s) => Self::UninjectScript(s),
            Self::UninjectScript(s) => Self::InjectScript(s),
            Self::Remove(s) => Self::Unremove(s),
            Self::Unremove(s) => Self::Remove(s),
            Self::Style(s) => Self::Unstyle(s),
            Self::Unstyle(s) => Self::Style(s),
            Self::RemoveAttr(s) => Self::UnremoveAttr(s),
            Self::UnremoveAttr(s) => Self::RemoveAttr(s),
            Self::RemoveClass(s) => Self::UnremoveClass(s),
            Self::UnremoveClass(s) => Self::RemoveClass(s),
        }
    }
}

/// Computes the entity hashes and hostname hashes for a request, returned in that order.
fn hostname_domain_hashes(hostname: &str, domain: &str) -> (Vec<Hash>, Vec<Hash>) {
    let request_entities =
        crate::filters::cosmetic::get_entity_hashes_from_labels(hostname, domain);
    let request_hostnames =
        crate::filters::cosmetic::get_hostname_hashes_from_labels(hostname, domain);

    (request_entities, request_hostnames)
}

#[cfg(test)]
mod cosmetic_cache_tests {
    use super::*;
    use crate::resources::Resource;

    fn cache_from_rules(rules: Vec<&str>) -> CosmeticFilterCache {
        let
parsed_rules = rules .iter() .map(|r| CosmeticFilter::parse(r, false, Default::default()).unwrap()) .collect::>(); CosmeticFilterCache::from_rules(parsed_rules) } #[test] fn exceptions() { let cfcache = cache_from_rules(vec!["~example.com##.item", "sub.example.com#@#.item2"]); let resources = ResourceStorage::default(); let out = cfcache.hostname_cosmetic_resources(&resources, "test.com", false); let mut expected = UrlSpecificResources::empty(); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "example.com", false); expected.exceptions.insert(".item".into()); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "sub.example.com", false); expected.exceptions.insert(".item2".into()); assert_eq!(out, expected); } #[test] fn exceptions2() { let cfcache = cache_from_rules(vec!["example.com,~sub.example.com##.item"]); let resources = ResourceStorage::default(); let out = cfcache.hostname_cosmetic_resources(&resources, "test.com", false); let mut expected = UrlSpecificResources::empty(); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "example.com", false); expected.hide_selectors.insert(".item".to_owned()); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "sub.example.com", false); let mut expected = UrlSpecificResources::empty(); expected.exceptions.insert(".item".into()); assert_eq!(out, expected); } #[test] fn style_exceptions() { let cfcache = cache_from_rules(vec![ "example.com,~sub.example.com##.element:style(background: #fff)", "sub.test.example.com#@#.element:style(background: #fff)", "a1.sub.example.com##.element", "a2.sub.example.com##.element:style(background: #000)", "a3.example.com##.element:style(background: #000)", ]); let resources = ResourceStorage::default(); let out = cfcache.hostname_cosmetic_resources(&resources, "sub.example.com", false); let mut expected = UrlSpecificResources::empty(); assert_eq!(out, 
expected); let out = cfcache.hostname_cosmetic_resources(&resources, "sub.test.example.com", false); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "a1.sub.example.com", false); expected.hide_selectors.insert(".element".to_owned()); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "test.example.com", false); expected.hide_selectors.clear(); expected .style_selectors .insert(".element".to_owned(), vec!["background: #fff".to_owned()]); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "a2.sub.example.com", false); expected.style_selectors.clear(); expected .style_selectors .insert(".element".to_owned(), vec!["background: #000".to_owned()]); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "a3.example.com", false); expected.style_selectors.clear(); expected .style_selectors .insert(".element".to_owned(), vec!["background: #000".to_owned(), "background: #fff".to_owned()]); // order is non-deterministic if out != expected { expected .style_selectors .get_mut(".element") .unwrap() .reverse(); assert_eq!(out, expected); } } #[test] fn script_exceptions() { use crate::resources::{MimeType, ResourceType}; let cfcache = cache_from_rules(vec![ "example.com,~sub.example.com##+js(set-constant.js, atob, trueFunc)", "sub.test.example.com#@#+js(set-constant.js, atob, trueFunc)", "cosmetic.net##+js(nowebrtc.js)", "g.cosmetic.net##+js(window.open-defuser.js)", "c.g.cosmetic.net#@#+js(nowebrtc.js)", "d.g.cosmetic.net#@#+js()", ]); let resources = ResourceStorage::from_resources([ Resource { name: "set-constant.js".into(), aliases: vec![], kind: ResourceType::Template, content: base64::encode("set-constant.js, {{1}}, {{2}}"), dependencies: vec![], permission: Default::default(), }, Resource::simple("nowebrtc.js", MimeType::ApplicationJavascript, "nowebrtc.js"), Resource::simple("window.open-defuser.js", MimeType::ApplicationJavascript, 
"window.open-defuser.js"), ]); let out = cfcache.hostname_cosmetic_resources(&resources, "sub.example.com", false); let mut expected = UrlSpecificResources::empty(); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "sub.test.example.com", false); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "test.example.com", false); expected.injected_script = "try {\nset-constant.js, atob, trueFunc\n} catch ( e ) { }\n".to_owned(); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "cosmetic.net", false); expected.injected_script = "try {\nnowebrtc.js\n} catch ( e ) { }\n".to_owned(); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "g.cosmetic.net", false); expected.injected_script = "try {\nnowebrtc.js\n} catch ( e ) { }\ntry {\nwindow.open-defuser.js\n} catch ( e ) { }\n".to_owned(); // order is non-deterministic if out != expected { expected.injected_script = "try {\nwindow.open-defuser.js\n} catch ( e ) { }\ntry {\nnowebrtc.js\n} catch ( e ) { }\n".to_owned(); assert_eq!(out, expected); } let out = cfcache.hostname_cosmetic_resources(&resources, "c.g.cosmetic.net", false); expected.injected_script = "try {\nwindow.open-defuser.js\n} catch ( e ) { }\n".to_owned(); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "d.g.cosmetic.net", false); expected.injected_script = "".to_owned(); assert_eq!(out, expected); } #[test] fn remove_exceptions() { let cfcache = cache_from_rules(vec![ "example.com,~sub.example.com##.element:remove()", "sub.test.example.com#@#.element:remove()", "a1.sub.example.com##.element", "a2.sub.example.com##.element:remove()", "a3.example.com##.element:remove()", ]); let resources = ResourceStorage::default(); let out = cfcache.hostname_cosmetic_resources(&resources, "sub.example.com", false); let mut expected = UrlSpecificResources::empty(); assert_eq!(out, expected); let out = 
cfcache.hostname_cosmetic_resources(&resources, "sub.test.example.com", false); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "a1.sub.example.com", false); expected.hide_selectors.insert(".element".to_owned()); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "test.example.com", false); expected.hide_selectors.clear(); expected.remove_selectors.insert(".element".to_owned()); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "a2.sub.example.com", false); expected.remove_selectors.clear(); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "a3.example.com", false); expected.remove_selectors.clear(); expected.remove_selectors.insert(".element".to_owned()); assert_eq!(out, expected); } #[test] fn remove_attr_exceptions() { let cfcache = cache_from_rules(vec![ "example.com,~sub.example.com##.element:remove-attr(style)", "sub.test.example.com#@#.element:remove-attr(style)", "a1.sub.example.com##.element", "a2.sub.example.com##.element:remove-attr(src)", "a3.example.com##.element:remove-attr(src)", ]); let resources = ResourceStorage::default(); let out = cfcache.hostname_cosmetic_resources(&resources, "sub.example.com", false); let mut expected = UrlSpecificResources::empty(); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "sub.test.example.com", false); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "a1.sub.example.com", false); expected.hide_selectors.insert(".element".to_owned()); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "test.example.com", false); expected.hide_selectors.clear(); expected .remove_attrs .insert(".element".to_owned(), vec!["style".to_owned()]); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "a2.sub.example.com", false); expected.remove_attrs.clear(); 
expected .remove_attrs .insert(".element".to_owned(), vec!["src".to_owned()]); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "a3.example.com", false); expected.remove_attrs.clear(); expected .remove_attrs .insert(".element".to_owned(), vec!["src".to_owned(), "style".to_owned()]); // order is non-deterministic if out != expected { expected .remove_attrs .get_mut(".element") .unwrap() .reverse(); assert_eq!(out, expected); } } #[test] fn remove_class_exceptions() { let cfcache = cache_from_rules(vec![ "example.com,~sub.example.com##.element:remove-class(overlay)", "sub.test.example.com#@#.element:remove-class(overlay)", "a1.sub.example.com##.element", "a2.sub.example.com##.element:remove-class(banner)", "a3.example.com##.element:remove-class(banner)", ]); let resources = ResourceStorage::default(); let out = cfcache.hostname_cosmetic_resources(&resources, "sub.example.com", false); let mut expected = UrlSpecificResources::empty(); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "sub.test.example.com", false); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "a1.sub.example.com", false); expected.hide_selectors.insert(".element".to_owned()); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "test.example.com", false); expected.hide_selectors.clear(); expected .remove_classes .insert(".element".to_owned(), vec!["overlay".to_owned()]); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "a2.sub.example.com", false); expected.remove_classes.clear(); expected .remove_classes .insert(".element".to_owned(), vec!["banner".to_owned()]); assert_eq!(out, expected); let out = cfcache.hostname_cosmetic_resources(&resources, "a3.example.com", false); expected.remove_classes.clear(); expected .remove_classes .insert(".element".to_owned(), vec!["banner".to_owned(), "overlay".to_owned()]); // order is 
non-deterministic if out != expected { expected .remove_classes .get_mut(".element") .unwrap() .reverse(); assert_eq!(out, expected); } } /// Avoid impossible type inference for type parameter `impl AsRef` const EMPTY: &[&str] = &[]; #[test] fn matching_hidden_class_id_selectors() { let rules = [ "##.a-class", "###simple-id", "##.a-class .with .children", "##.children .including #simple-id", "##a.a-class", ]; let cfcache = CosmeticFilterCache::from_rules( rules .iter() .map(|r| CosmeticFilter::parse(r, false, Default::default()).unwrap()) .collect::>(), ); let out = cfcache.hidden_class_id_selectors(["with"], EMPTY, &HashSet::default()); assert_eq!(out, Vec::::new()); let out = cfcache.hidden_class_id_selectors(EMPTY, ["with"], &HashSet::default()); assert_eq!(out, Vec::::new()); let out = cfcache.hidden_class_id_selectors(EMPTY, ["a-class"], &HashSet::default()); assert_eq!(out, Vec::::new()); let out = cfcache.hidden_class_id_selectors(["simple-id"], EMPTY, &HashSet::default()); assert_eq!(out, Vec::::new()); let out = cfcache.hidden_class_id_selectors(["a-class"], EMPTY, &HashSet::default()); assert_eq!(out, [".a-class", ".a-class .with .children"]); let out = cfcache.hidden_class_id_selectors( ["children", "a-class"], EMPTY, &HashSet::default(), ); assert_eq!( out, [ ".children .including #simple-id", ".a-class", ".a-class .with .children", ] ); let out = cfcache.hidden_class_id_selectors(EMPTY, ["simple-id"], &HashSet::default()); assert_eq!(out, ["#simple-id"]); let out = cfcache.hidden_class_id_selectors( ["children", "a-class"], ["simple-id"], &HashSet::default(), ); assert_eq!( out, [ ".children .including #simple-id", ".a-class", ".a-class .with .children", "#simple-id", ] ); } #[test] fn class_id_exceptions() { let rules = vec![ "##.a-class", "###simple-id", "##.a-class .with .children", "##.children .including #simple-id", "##a.a-class", "example.*#@#.a-class", "~test.com###test-element", ]; let cfcache = CosmeticFilterCache::from_rules( rules .iter() 
.map(|r| CosmeticFilter::parse(r, false, Default::default()).unwrap()) .collect::>(), ); let resources = ResourceStorage::default(); let exceptions = cfcache .hostname_cosmetic_resources(&resources, "example.co.uk", false) .exceptions; let out = cfcache.hidden_class_id_selectors(["a-class"], EMPTY, &exceptions); assert_eq!(out, [".a-class .with .children"]); let out = cfcache.hidden_class_id_selectors( ["children", "a-class"], ["simple-id"], &exceptions, ); assert_eq!( out, [ ".children .including #simple-id", ".a-class .with .children", "#simple-id", ] ); let out = cfcache.hidden_class_id_selectors(EMPTY, ["test-element"], &exceptions); assert_eq!(out, ["#test-element"]); let exceptions = cfcache .hostname_cosmetic_resources(&resources, "a1.test.com", false) .exceptions; let out = cfcache.hidden_class_id_selectors(["a-class"], EMPTY, &exceptions); assert_eq!(out, [".a-class", ".a-class .with .children"]); let out = cfcache.hidden_class_id_selectors( ["children", "a-class"], ["simple-id"], &exceptions, ); assert_eq!( out, [ ".children .including #simple-id", ".a-class", ".a-class .with .children", "#simple-id", ] ); let out = cfcache.hidden_class_id_selectors(EMPTY, ["test-element"], &exceptions); assert_eq!(out, Vec::::new()); } #[test] fn misc_generic_exceptions() { let rules = vec![ "##a[href=\"bad.com\"]", "##div > p", "##a[href=\"notbad.com\"]", "example.com#@#div > p", "~example.com##a[href=\"notbad.com\"]", ]; let cfcache = CosmeticFilterCache::from_rules( rules .iter() .map(|r| CosmeticFilter::parse(r, false, Default::default()).unwrap()) .collect::>(), ); let resources = ResourceStorage::default(); let hide_selectors = cfcache .hostname_cosmetic_resources(&resources, "test.com", false) .hide_selectors; let mut expected_hides = HashSet::new(); expected_hides.insert("a[href=\"bad.com\"]".to_owned()); expected_hides.insert("div > p".to_owned()); expected_hides.insert("a[href=\"notbad.com\"]".to_owned()); assert_eq!(hide_selectors, expected_hides); let 
hide_selectors = cfcache .hostname_cosmetic_resources(&resources, "example.com", false) .hide_selectors; let mut expected_hides = HashSet::new(); expected_hides.insert("a[href=\"bad.com\"]".to_owned()); assert_eq!(hide_selectors, expected_hides); } #[test] fn apply_to_tld() { use crate::resources::ResourceType; // toolforge.org and github.io are examples of TLDs with multiple segments. These rules // should still be parsed correctly and applied on corresponding subdomains. let rules = vec![ "toolforge.org##+js(abort-on-property-read, noAdBlockers)", "github.io##div.adToBlock", ]; let cfcache = CosmeticFilterCache::from_rules( rules .iter() .map(|r| CosmeticFilter::parse(r, false, Default::default()).unwrap()) .collect::>(), ); let resources = ResourceStorage::from_resources([ Resource { name: "abort-on-property-read.js".into(), aliases: vec!["aopr".to_string()], kind: ResourceType::Template, content: base64::encode("abort-on-property-read.js, {{1}}"), dependencies: vec![], permission: Default::default(), } ]); let injected_script = cfcache .hostname_cosmetic_resources(&resources, "antonok.toolforge.org", false) .injected_script; assert_eq!( injected_script, "try {\nabort-on-property-read.js, noAdBlockers\n} catch ( e ) { }\n" ); let hide_selectors = cfcache .hostname_cosmetic_resources(&resources, "antonok.github.io", false) .hide_selectors; let mut expected_hides = HashSet::new(); expected_hides.insert("div.adToBlock".to_owned()); assert_eq!(hide_selectors, expected_hides); } } adblock-0.8.12/src/data_format/mod.rs000064400000000000000000000077561046102023000155270ustar 00000000000000//! Allows serialization of the adblock engine into a compact binary format, as well as subsequent //! rapid deserialization back into an engine. //! //! In order to support multiple format versions simultaneously, this module wraps around different //! serialization/deserialization implementations and can automatically dispatch to the appropriate //! one. 
mod v0; pub(crate) mod utils; use crate::blocker::Blocker; use crate::cosmetic_filter_cache::CosmeticFilterCache; /// Newer formats start with this magic byte sequence. /// Calculated as the leading 4 bytes of `echo -n 'brave/adblock-rust' | sha512sum`. const ADBLOCK_RUST_DAT_MAGIC: [u8; 4] = [0xd1, 0xd9, 0x3a, 0xaf]; /// Provides structural aggregration of referenced adblock engine data to allow for allocation-free /// serialization. /// /// Note that this does not implement `Serialize` directly, as it is composed of parts which must /// be serialized independently. Instead, use the `serialize` method. pub(crate) enum SerializeFormat<'a> { V0(v0::SerializeFormat<'a>), } #[derive(Debug)] pub enum SerializationError { RmpSerdeError(rmp_serde::encode::Error), } impl From for SerializationError { fn from(e: rmp_serde::encode::Error) -> Self { Self::RmpSerdeError(e) } } impl<'a> SerializeFormat<'a> { pub(crate) fn build(blocker: &'a Blocker, cfc: &'a CosmeticFilterCache) -> Self { Self::V0(v0::SerializeFormat::from((blocker, cfc))) } pub(crate) fn serialize(&self) -> Result, SerializationError> { match self { Self::V0(v) => v.serialize(), } } } /// Structural representation of adblock engine data that can be built up from deserialization and /// used directly to construct new `Engine` components without unnecessary allocation. /// /// Note that this does not implement `Deserialize` directly, as it is composed of parts which must /// be deserialized independently. Instead, use the `deserialize` method. pub(crate) enum DeserializeFormat { V0(v0::DeserializeFormat), } #[derive(Debug)] pub enum DeserializationError { RmpSerdeError(rmp_serde::decode::Error), UnsupportedFormatVersion(u8), NoHeaderFound, /// Support for the legacy gzip-compressed data format was removed in version 0.8.0 of this /// crate. If you still need it for some reason, you can convert it using 0.7.x by /// deserializing and then reserializing it into the newer V0 format. 
LegacyFormatNoLongerSupported, } impl From for DeserializationError { fn from(e: rmp_serde::decode::Error) -> Self { Self::RmpSerdeError(e) } } impl DeserializeFormat { pub(crate) fn build(self) -> (Blocker, CosmeticFilterCache) { match self { Self::V0(v) => v.into(), } } pub(crate) fn deserialize(serialized: &[u8]) -> Result { /// adblock-rust's legacy DAT format has always used flate2 1.0.x, which has never changed /// the header sequence from these 10 bits when the GzEncoder is left uncustomized. const FLATE2_GZ_HEADER_BYTES: [u8; 10] = [31, 139, 8, 0, 0, 0, 0, 0, 0, 255]; if serialized.starts_with(&ADBLOCK_RUST_DAT_MAGIC) { let version = serialized[ADBLOCK_RUST_DAT_MAGIC.len()]; match version { 0 => Ok(Self::V0(v0::DeserializeFormat::deserialize(serialized)?)), v => Err(DeserializationError::UnsupportedFormatVersion(v)), } } else if serialized.starts_with(&FLATE2_GZ_HEADER_BYTES) { Err(DeserializationError::LegacyFormatNoLongerSupported) } else { Err(DeserializationError::NoHeaderFound) } } } #[cfg(test)] mod tests { use super::*; #[test] fn validate_magic_bytes() { use sha2::Digest; let mut hasher = sha2::Sha512::new(); hasher.update("brave/adblock-rust"); let result = hasher.finalize(); assert!(result.starts_with(&ADBLOCK_RUST_DAT_MAGIC)); } } adblock-0.8.12/src/data_format/utils.rs000064400000000000000000000017031046102023000160720ustar 00000000000000//! Common utilities associated with serialization and deserialization of the `Engine` data into //! binary formats. use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use serde::{Serialize, Serializer}; /// Forces a `HashSet` to be serialized with a stable ordering by temporarily representing it as a /// `BTreeSet`. 
pub fn stabilize_hashset_serialization(set: &HashSet, s: S) -> Result where S: Serializer, V: Ord + serde::Serialize, { let stabilized: BTreeSet<&V> = set.iter().collect(); stabilized.serialize(s) } /// Forces a `HashMap` to be serialized with a stable ordering by temporarily representing it as a /// `BTreeMap`. pub fn stabilize_hashmap_serialization( set: &HashMap, s: S, ) -> Result where S: Serializer, K: Ord + Serialize, V: Serialize, { let stabilized: BTreeMap<&K, &V> = set.iter().collect(); stabilized.serialize(s) } adblock-0.8.12/src/data_format/v0.rs000064400000000000000000000414211046102023000152600ustar 00000000000000//! Contains representations of data from the adblocking engine in a //! forwards-and-backwards-compatible format, as well as utilities for converting these to and from //! the actual `Engine` components. //! //! Any new fields should be added to the _end_ of both `SerializeFormat` and `DeserializeFormat`. use std::collections::{HashMap, HashSet}; use rmp_serde as rmps; use serde::{Deserialize, Serialize}; use crate::blocker::{Blocker, NetworkFilterList}; use crate::cosmetic_filter_cache::{CosmeticFilterCache, HostnameRuleDb}; use crate::filters::network::NetworkFilter; use crate::utils::Hash; use super::utils::{stabilize_hashmap_serialization, stabilize_hashset_serialization}; use super::{DeserializationError, SerializationError}; /// Each variant describes a single rule that is specific to a particular hostname. 
#[derive(Clone, Debug, Deserialize, Serialize)] enum LegacySpecificFilterType { Hide(String), Unhide(String), Style(String, String), UnhideStyle(String, String), ScriptInject(String), UnhideScriptInject(String), } #[derive(Deserialize, Serialize, Default)] pub(crate) struct LegacyHostnameRuleDb { #[serde(serialize_with = "crate::data_format::utils::stabilize_hashmap_serialization")] db: HashMap>, } impl From<&HostnameRuleDb> for LegacyHostnameRuleDb { fn from(v: &HostnameRuleDb) -> Self { let mut db = HashMap::>::new(); for (hash, bin) in v.hide.0.iter() { for f in bin { db.entry(*hash) .and_modify(|v| v.push(LegacySpecificFilterType::Hide(f.to_owned()))) .or_insert_with(|| vec![LegacySpecificFilterType::Hide(f.to_owned())]); } } for (hash, bin) in v.unhide.0.iter() { for f in bin { db.entry(*hash) .and_modify(|v| v.push(LegacySpecificFilterType::Unhide(f.to_owned()))) .or_insert_with(|| vec![LegacySpecificFilterType::Unhide(f.to_owned())]); } } for (hash, bin) in v.inject_script.0.iter() { for (f, _mask) in bin { db.entry(*hash) .and_modify(|v| v.push(LegacySpecificFilterType::ScriptInject(f.to_owned()))) .or_insert_with(|| vec![LegacySpecificFilterType::ScriptInject(f.to_owned())]); } } for (hash, bin) in v.uninject_script.0.iter() { for f in bin { db.entry(*hash) .and_modify(|v| v.push(LegacySpecificFilterType::UnhideScriptInject(f.to_owned()))) .or_insert_with(|| vec![LegacySpecificFilterType::UnhideScriptInject(f.to_owned())]); } } for (hash, bin) in v.style.0.iter() { for f in bin { db.entry(*hash) .and_modify(|v| v.push(LegacySpecificFilterType::Style(f.0.to_owned(), f.1.to_owned()))) .or_insert_with(|| vec![LegacySpecificFilterType::Style(f.0.to_owned(), f.1.to_owned())]); } } for (hash, bin) in v.unstyle.0.iter() { for f in bin { db.entry(*hash) .and_modify(|v| v.push(LegacySpecificFilterType::UnhideStyle(f.0.to_owned(), f.1.to_owned()))) .or_insert_with(|| vec![LegacySpecificFilterType::UnhideStyle(f.0.to_owned(), f.1.to_owned())]); } } 
LegacyHostnameRuleDb { db, } } } impl Into for LegacyHostnameRuleDb { fn into(self) -> HostnameRuleDb { use crate::cosmetic_filter_cache::HostnameFilterBin; let mut hide = HostnameFilterBin::default(); let mut unhide = HostnameFilterBin::default(); let mut style = HostnameFilterBin::default(); let mut unstyle = HostnameFilterBin::default(); let mut inject_script = HostnameFilterBin::default(); let mut uninject_script = HostnameFilterBin::default(); for (hash, bin) in self.db.into_iter() { for rule in bin.into_iter() { match rule { LegacySpecificFilterType::Hide(s) => hide.insert(&hash, s), LegacySpecificFilterType::Unhide(s) => unhide.insert(&hash, s), LegacySpecificFilterType::Style(s, st) => style.insert(&hash, (s, st)), LegacySpecificFilterType::UnhideStyle(s, st) => unstyle.insert(&hash, (s, st)), LegacySpecificFilterType::ScriptInject(s) => inject_script.insert(&hash, (s, Default::default())), LegacySpecificFilterType::UnhideScriptInject(s) => uninject_script.insert(&hash, s), } } } HostnameRuleDb { hide, unhide, inject_script, uninject_script, remove: HostnameFilterBin::default(), unremove: HostnameFilterBin::default(), style, unstyle, remove_attr: HostnameFilterBin::default(), unremove_attr: HostnameFilterBin::default(), remove_class: HostnameFilterBin::default(), unremove_class: HostnameFilterBin::default(), } } } #[derive(Serialize, Deserialize, Debug, PartialEq, Clone)] pub(crate) struct LegacyRedirectResource { pub content_type: String, pub data: String, } #[derive(Serialize, Deserialize, Debug, PartialEq, Default)] pub(crate) struct LegacyRedirectResourceStorage { #[serde(serialize_with = "crate::data_format::utils::stabilize_hashmap_serialization")] pub resources: HashMap, } #[derive(Clone, Deserialize, Serialize)] pub(crate) struct LegacyScriptletResource { scriptlet: String, } #[derive(Default, Deserialize, Serialize)] pub(crate) struct LegacyScriptletResourceStorage { #[serde(serialize_with = 
"crate::data_format::utils::stabilize_hashmap_serialization")] resources: HashMap, } /// `_bug` is no longer used, and is removed from future format versions. #[derive(Debug, Clone, Serialize)] struct NetworkFilterV0SerializeFmt<'a> { mask: &'a crate::filters::network::NetworkFilterMask, filter: &'a crate::filters::network::FilterPart, opt_domains: &'a Option>, opt_not_domains: &'a Option>, redirect: &'a Option, hostname: &'a Option, csp: &'a Option, _bug: Option, tag: &'a Option, raw_line: Option, id: &'a crate::utils::Hash, opt_domains_union: &'a Option, opt_not_domains_union: &'a Option, } /// Generic over `Borrow` because `tagged_filters_all` requires `&'a NetworkFilter` /// while `NetworkFilterList` requires `&'a Arc`. impl<'a, T> From<&'a T> for NetworkFilterV0SerializeFmt<'a> where T: std::borrow::Borrow, { fn from(v: &'a T) -> NetworkFilterV0SerializeFmt<'a> { let v = v.borrow(); NetworkFilterV0SerializeFmt { mask: &v.mask, filter: &v.filter, opt_domains: &v.opt_domains, opt_not_domains: &v.opt_not_domains, redirect: if v.is_redirect() { &v.modifier_option } else { &None }, hostname: &v.hostname, csp: if v.is_csp() { &v.modifier_option } else { &None }, _bug: None, tag: &v.tag, raw_line: v.raw_line.as_ref().map(|raw| *raw.clone()), id: &v.id, opt_domains_union: &v.opt_domains_union, opt_not_domains_union: &v.opt_not_domains_union, } } } /// Forces a `NetworkFilterList` to be serialized with the v0 filter format by converting to an /// intermediate representation that is constructed with `NetworkFilterV0Fmt` instead. 
fn serialize_v0_network_filter_list(list: &NetworkFilterList, s: S) -> Result where S: serde::Serializer, { #[derive(Serialize, Default)] struct NetworkFilterListV0SerializeFmt<'a> { #[serde(serialize_with = "crate::data_format::utils::stabilize_hashmap_serialization")] filter_map: HashMap>>, } let v0_list = NetworkFilterListV0SerializeFmt { filter_map: list .filter_map .iter() .map(|(k, v)| (*k, v.iter().map(|f| f.into()).collect())) .collect(), }; v0_list.serialize(s) } /// Forces a `NetworkFilter` slice to be serialized with the v0 filter format by converting to /// an intermediate representation that is constructed with `NetworkFilterV0Fmt` instead. fn serialize_v0_network_filter_vec(vec: &[NetworkFilter], s: S) -> Result where S: serde::Serializer, { let v0_vec: Vec<_> = vec.iter().map(NetworkFilterV0SerializeFmt::from).collect(); v0_vec.serialize(s) } /// Provides structural aggregration of referenced adblock engine data to allow for allocation-free /// serialization. #[derive(Serialize)] pub(crate) struct SerializeFormat<'a> { #[serde(serialize_with = "serialize_v0_network_filter_list")] csp: &'a NetworkFilterList, #[serde(serialize_with = "serialize_v0_network_filter_list")] exceptions: &'a NetworkFilterList, #[serde(serialize_with = "serialize_v0_network_filter_list")] importants: &'a NetworkFilterList, #[serde(serialize_with = "serialize_v0_network_filter_list")] redirects: &'a NetworkFilterList, #[serde(serialize_with = "serialize_v0_network_filter_list")] filters_tagged: &'a NetworkFilterList, #[serde(serialize_with = "serialize_v0_network_filter_list")] filters: &'a NetworkFilterList, #[serde(serialize_with = "serialize_v0_network_filter_list")] generic_hide: &'a NetworkFilterList, #[serde(serialize_with = "serialize_v0_network_filter_vec")] tagged_filters_all: &'a Vec, enable_optimizations: bool, resources: LegacyRedirectResourceStorage, #[serde(serialize_with = "stabilize_hashset_serialization")] simple_class_rules: &'a HashSet, 
#[serde(serialize_with = "stabilize_hashset_serialization")] simple_id_rules: &'a HashSet, #[serde(serialize_with = "stabilize_hashmap_serialization")] complex_class_rules: &'a HashMap>, #[serde(serialize_with = "stabilize_hashmap_serialization")] complex_id_rules: &'a HashMap>, specific_rules: LegacyHostnameRuleDb, #[serde(serialize_with = "stabilize_hashset_serialization")] misc_generic_selectors: &'a HashSet, scriptlets: LegacyScriptletResourceStorage, } impl<'a> SerializeFormat<'a> { pub fn serialize(&self) -> Result, SerializationError> { let mut output = super::ADBLOCK_RUST_DAT_MAGIC.to_vec(); output.push(0); rmps::encode::write(&mut output, &self)?; Ok(output) } } /// `_bug` is no longer used, and is cleaned up from future format versions. #[derive(Debug, Clone, Deserialize)] pub(crate) struct NetworkFilterV0DeserializeFmt { pub mask: crate::filters::network::NetworkFilterMask, pub filter: crate::filters::network::FilterPart, pub opt_domains: Option>, pub opt_not_domains: Option>, pub redirect: Option, pub hostname: Option, pub csp: Option, _bug: Option, pub tag: Option, pub raw_line: Option, pub id: crate::utils::Hash, pub opt_domains_union: Option, pub opt_not_domains_union: Option, } impl From for NetworkFilter { fn from(v: NetworkFilterV0DeserializeFmt) -> Self { Self { mask: v.mask, filter: v.filter, opt_domains: v.opt_domains, opt_not_domains: v.opt_not_domains, modifier_option: v.redirect.or(v.csp), hostname: v.hostname, tag: v.tag, raw_line: v.raw_line.map(Box::new), id: v.id, opt_domains_union: v.opt_domains_union, opt_not_domains_union: v.opt_not_domains_union, } } } #[derive(Debug, Deserialize, Default)] pub(crate) struct NetworkFilterListV0DeserializeFmt { pub filter_map: HashMap>, } impl From for NetworkFilterList { fn from(v: NetworkFilterListV0DeserializeFmt) -> Self { Self { filter_map: v .filter_map .into_iter() .map(|(k, v)| { ( k, v.into_iter() .map(|f| std::sync::Arc::new(f.into())) .collect(), ) }) .collect(), } } } /// Structural 
representation of adblock engine data that can be built up from deserialization and /// used directly to construct new `Engine` components without unnecessary allocation. #[derive(Deserialize)] pub(crate) struct DeserializeFormat { csp: NetworkFilterListV0DeserializeFmt, exceptions: NetworkFilterListV0DeserializeFmt, importants: NetworkFilterListV0DeserializeFmt, redirects: NetworkFilterListV0DeserializeFmt, filters_tagged: NetworkFilterListV0DeserializeFmt, filters: NetworkFilterListV0DeserializeFmt, generic_hide: NetworkFilterListV0DeserializeFmt, tagged_filters_all: Vec, enable_optimizations: bool, _resources: LegacyRedirectResourceStorage, simple_class_rules: HashSet, simple_id_rules: HashSet, complex_class_rules: HashMap>, complex_id_rules: HashMap>, specific_rules: LegacyHostnameRuleDb, misc_generic_selectors: HashSet, _scriptlets: LegacyScriptletResourceStorage, } impl DeserializeFormat { pub fn deserialize(serialized: &[u8]) -> Result { assert!(serialized.starts_with(&super::ADBLOCK_RUST_DAT_MAGIC)); assert!(serialized[super::ADBLOCK_RUST_DAT_MAGIC.len()] == 0); let format: Self = rmps::decode::from_read(&serialized[super::ADBLOCK_RUST_DAT_MAGIC.len() + 1..])?; Ok(format) } } impl<'a> From<(&'a Blocker, &'a CosmeticFilterCache)> for SerializeFormat<'a> { fn from(v: (&'a Blocker, &'a CosmeticFilterCache)) -> Self { let (blocker, cfc) = v; Self { csp: &blocker.csp, exceptions: &blocker.exceptions, importants: &blocker.importants, redirects: &blocker.redirects, filters_tagged: &blocker.filters_tagged, filters: &blocker.filters, generic_hide: &blocker.generic_hide, tagged_filters_all: &blocker.tagged_filters_all, enable_optimizations: blocker.enable_optimizations, resources: LegacyRedirectResourceStorage::default(), simple_class_rules: &cfc.simple_class_rules, simple_id_rules: &cfc.simple_id_rules, complex_class_rules: &cfc.complex_class_rules, complex_id_rules: &cfc.complex_id_rules, specific_rules: (&cfc.specific_rules).into(), misc_generic_selectors: 
&cfc.misc_generic_selectors, scriptlets: LegacyScriptletResourceStorage::default(), } } } impl From for (Blocker, CosmeticFilterCache) { fn from(v: DeserializeFormat) -> Self { ( Blocker { csp: v.csp.into(), exceptions: v.exceptions.into(), importants: v.importants.into(), redirects: v.redirects.into(), removeparam: NetworkFilterList::default(), filters_tagged: v.filters_tagged.into(), filters: v.filters.into(), generic_hide: v.generic_hide.into(), tags_enabled: Default::default(), tagged_filters_all: v.tagged_filters_all.into_iter().map(|f| f.into()).collect(), enable_optimizations: v.enable_optimizations, #[cfg(feature = "object-pooling")] pool: Default::default(), regex_manager: Default::default(), }, CosmeticFilterCache { simple_class_rules: v.simple_class_rules, simple_id_rules: v.simple_id_rules, complex_class_rules: v.complex_class_rules, complex_id_rules: v.complex_id_rules, specific_rules: v.specific_rules.into(), misc_generic_selectors: v.misc_generic_selectors, }, ) } } adblock-0.8.12/src/engine.rs000064400000000000000000001374251046102023000137310ustar 00000000000000//! The adblock [`Engine`] is the primary interface for adblocking. use crate::blocker::{Blocker, BlockerOptions, BlockerResult}; use crate::cosmetic_filter_cache::{CosmeticFilterCache, UrlSpecificResources}; use crate::lists::{FilterSet, ParseOptions}; use crate::regex_manager::RegexManagerDiscardPolicy; use crate::request::Request; use crate::resources::{Resource, ResourceStorage}; use std::collections::HashSet; /// Drives high-level blocking logic and is responsible for loading filter lists into an optimized /// format that can be queried efficiently. /// /// For performance optimization reasons, the [`Engine`] is not designed to have rules added or /// removed after its initial creation. Making changes to the rules loaded is accomplished by /// creating a new engine to replace it. 
/// /// ## Usage /// /// ### Initialization /// /// You'll first want to combine all of your filter lists in a [`FilterSet`], which will parse list /// header metadata. Once all lists have been composed together, you can call /// [`Engine::from_filter_set`] to start using them for blocking. /// /// You may also want to supply certain assets for `$redirect` filters and `##+js(...)` scriptlet /// injections. These are known as [`Resource`]s, and can be provided with /// [`Engine::use_resources`]. See the [`crate::resources`] module for more information. /// /// ### Network blocking /// /// Use the [`Engine::check_network_request`] method to determine how to handle a network request. /// /// If you _only_ need network blocking, consider using a [`Blocker`] directly. /// /// ### Cosmetic filtering /// /// Call [`Engine::url_cosmetic_resources`] to determine what actions should be taken to prepare a /// particular page before it starts loading. /// /// Once the page has been loaded, any new CSS classes or ids that appear on the page should be passed to /// [`Engine::hidden_class_id_selectors`] on an ongoing basis to determine additional elements that /// should be hidden dynamically. pub struct Engine { blocker: Blocker, cosmetic_cache: CosmeticFilterCache, resources: ResourceStorage, } impl Default for Engine { /// Equivalent to `Engine::new(true)`. fn default() -> Self { Self::new(true) } } impl Engine { /// Creates a new adblocking `Engine`. `Engine`s created without rules should generally only be /// used with deserialization. /// - `optimize` specifies whether or not to attempt to compress the internal representation by /// combining similar rules. 
pub fn new(optimize: bool) -> Self { let blocker_options = BlockerOptions { enable_optimizations: optimize, }; Self { blocker: Blocker::new(vec![], &blocker_options), cosmetic_cache: CosmeticFilterCache::new(), resources: ResourceStorage::default(), } } /// Loads rules in a single format, enabling optimizations and discarding debug information. pub fn from_rules(rules: impl IntoIterator>, opts: ParseOptions) -> Self { let mut filter_set = FilterSet::new(false); filter_set.add_filters(rules, opts); Self::from_filter_set(filter_set, true) } /// Loads rules, enabling optimizations and including debug information. pub fn from_rules_debug(rules: impl IntoIterator>, opts: ParseOptions) -> Self { Self::from_rules_parametrised(rules, opts, true, true) } pub fn from_rules_parametrised(filter_rules: impl IntoIterator>, opts: ParseOptions, debug: bool, optimize: bool) -> Self { let mut filter_set = FilterSet::new(debug); filter_set.add_filters(filter_rules, opts); Self::from_filter_set(filter_set, optimize) } /// Loads rules from the given `FilterSet`. It is recommended to use a `FilterSet` when adding /// rules from multiple sources. pub fn from_filter_set(set: FilterSet, optimize: bool) -> Self { let FilterSet { network_filters, cosmetic_filters, .. } = set; let blocker_options = BlockerOptions { enable_optimizations: optimize, }; Self { blocker: Blocker::new(network_filters, &blocker_options), cosmetic_cache: CosmeticFilterCache::from_rules(cosmetic_filters), resources: ResourceStorage::default(), } } /// Serializes the `Engine` into a binary format so that it can be quickly reloaded later. pub fn serialize_raw(&self) -> Result, crate::data_format::SerializationError> { use crate::data_format::SerializeFormat; let serialize_format = SerializeFormat::build(&self.blocker, &self.cosmetic_cache); serialize_format.serialize() } /// Deserialize the `Engine` from the binary format generated by `Engine::serialize_raw`. 
The /// method will automatically select the correct deserialization implementation. pub fn deserialize(&mut self, serialized: &[u8]) -> Result<(), crate::data_format::DeserializationError> { use crate::data_format::DeserializeFormat; let current_tags = self.blocker.tags_enabled(); let deserialize_format = DeserializeFormat::deserialize(serialized)?; let (blocker, cosmetic_cache) = deserialize_format.build(); self.blocker = blocker; self.blocker.use_tags(¤t_tags.iter().map(|s| &**s).collect::>()); self.cosmetic_cache = cosmetic_cache; Ok(()) } /// Check if a request for a network resource from `url`, of type `request_type`, initiated by /// `source_url`, should be blocked. pub fn check_network_request(&self, request: &Request) -> BlockerResult { self.blocker.check(request, &self.resources) } pub fn check_network_request_subset( &self, request: &Request, previously_matched_rule: bool, force_check_exceptions: bool, ) -> BlockerResult { self.blocker.check_parameterised(request, &self.resources, previously_matched_rule, force_check_exceptions) } /// Returns a string containing any additional CSP directives that should be added to this /// request's response. Only applies to document and subdocument requests. /// /// If multiple policies are present from different rules, they will be joined by commas. pub fn get_csp_directives( &self, request: &Request, ) -> Option { self.blocker.get_csp_directives(request) } /// Sets this engine's tags to be _only_ the ones provided in `tags`. /// /// Tags can be used to cheaply enable or disable network rules with a corresponding `$tag` /// option. pub fn use_tags(&mut self, tags: &[&str]) { self.blocker.use_tags(tags); } /// Sets this engine's tags to additionally include the ones provided in `tags`. /// /// Tags can be used to cheaply enable or disable network rules with a corresponding `$tag` /// option. 
pub fn enable_tags(&mut self, tags: &[&str]) { self.blocker.enable_tags(tags); } /// Sets this engine's tags to no longer include the ones provided in `tags`. /// /// Tags can be used to cheaply enable or disable network rules with a corresponding `$tag` /// option. pub fn disable_tags(&mut self, tags: &[&str]) { self.blocker.disable_tags(tags); } /// Checks if a given tag exists in this engine. /// /// Tags can be used to cheaply enable or disable network rules with a corresponding `$tag` /// option. pub fn tag_exists(&self, tag: &str) -> bool { self.blocker.tags_enabled().contains(&tag.to_owned()) } /// Sets this engine's resources to be _only_ the ones provided in `resources`. pub fn use_resources(&mut self, resources: impl IntoIterator) { self.resources = ResourceStorage::from_resources(resources); } /// Sets this engine's resources to additionally include `resource`. pub fn add_resource(&mut self, resource: Resource) -> Result<(), crate::resources::AddResourceError> { self.resources.add_resource(resource) } // Cosmetic filter functionality /// If any of the provided CSS classes or ids could cause a certain generic CSS hide rule /// (i.e. `{ display: none !important; }`) to be required, this method will return a list of /// CSS selectors corresponding to rules referencing those classes or ids, provided that the /// corresponding rules are not excepted. /// /// `exceptions` should be passed directly from `UrlSpecificResources`. pub fn hidden_class_id_selectors(&self, classes: impl IntoIterator>, ids: impl IntoIterator>, exceptions: &HashSet) -> Vec { self.cosmetic_cache.hidden_class_id_selectors(classes, ids, exceptions) } /// Returns a set of cosmetic filter resources required for a particular url. Once this has /// been called, all CSS ids and classes on a page should be passed to /// `hidden_class_id_selectors` to obtain any stylesheets consisting of generic rules (if the /// returned `generichide` value is false). 
pub fn url_cosmetic_resources(&self, url: &str) -> UrlSpecificResources { let request = if let Ok(request) = Request::new(url, url, "document") { request } else { return UrlSpecificResources::empty(); }; let generichide = self.blocker.check_generic_hide(&request); self.cosmetic_cache.hostname_cosmetic_resources(&self.resources, &request.hostname, generichide) } pub fn set_regex_discard_policy( &mut self, new_discard_policy: RegexManagerDiscardPolicy ) { self.blocker.set_regex_discard_policy(new_discard_policy); } #[cfg(feature = "regex-debug-info")] pub fn discard_regex(&mut self, regex_id: u64) { self.blocker.discard_regex(regex_id); } #[cfg(feature = "regex-debug-info")] pub fn get_regex_debug_info(&self) -> crate::regex_manager::RegexDebugInfo { self.blocker.get_regex_debug_info() } } /// Static assertions for `Engine: Send + Sync` traits. #[cfg(not(any(feature = "object-pooling", feature = "unsync-regex-caching")))] fn _assertions() { fn _assert_send() {} fn _assert_sync() {} _assert_send::(); _assert_sync::(); } #[cfg(test)] mod tests { use super::*; use crate::resources::MimeType; use crate::lists::FilterFormat; #[test] fn tags_enable_adds_tags() { let filters = [ "adv$tag=stuff", "somelongpath/test$tag=stuff", "||brianbondy.com/$tag=brian", "||brave.com$tag=brian", ]; let url_results = [ ("http://example.com/advert.html", true), ("http://example.com/somelongpath/test/2.html", true), ("https://brianbondy.com/about", true), ("https://brave.com/about", true), ]; let mut engine = Engine::from_rules(&filters, Default::default()); engine.enable_tags(&["stuff"]); engine.enable_tags(&["brian"]); url_results.into_iter().for_each(|(url, expected_result)| { let request = Request::new(&url, "", "").unwrap(); let matched_rule = engine.check_network_request(&request); if expected_result { assert!(matched_rule.matched, "Expected match for {}", url); } else { assert!(!matched_rule.matched, "Expected no match for {}, matched with {:?}", url, matched_rule.filter); } }); } 
#[test] fn tags_disable_works() { let filters = [ "adv$tag=stuff", "somelongpath/test$tag=stuff", "||brianbondy.com/$tag=brian", "||brave.com$tag=brian", ]; let url_results = [ ("http://example.com/advert.html", false), ("http://example.com/somelongpath/test/2.html", false), ("https://brianbondy.com/about", true), ("https://brave.com/about", true), ]; let mut engine = Engine::from_rules(&filters, Default::default()); engine.enable_tags(&["brian", "stuff"]); engine.disable_tags(&["stuff"]); url_results.into_iter().for_each(|(url, expected_result)| { let request = Request::new(&url, "", "").unwrap(); let matched_rule = engine.check_network_request(&request); if expected_result { assert!(matched_rule.matched, "Expected match for {}", url); } else { assert!(!matched_rule.matched, "Expected no match for {}, matched with {:?}", url, matched_rule.filter); } }); } #[test] fn exception_tags_inactive_by_default() { let filters = [ "adv", "||brianbondy.com/$tag=brian", "@@||brianbondy.com/$tag=brian", ]; let url_results = [ ("http://example.com/advert.html", true), ("https://brianbondy.com/about", false), ("https://brianbondy.com/advert", true), ]; let engine = Engine::from_rules(&filters, Default::default()); url_results.into_iter().for_each(|(url, expected_result)| { let request = Request::new(&url, "", "").unwrap(); let matched_rule = engine.check_network_request(&request); if expected_result { assert!(matched_rule.matched, "Expected match for {}", url); } else { assert!(!matched_rule.matched, "Expected no match for {}, matched with {:?}", url, matched_rule.filter); } }); } #[test] fn exception_tags_works() { let filters = [ "adv", "||brianbondy.com/$tag=brian", "@@||brianbondy.com/$tag=brian", ]; let url_results = [ ("http://example.com/advert.html", true), ("https://brianbondy.com/about", false), ("https://brianbondy.com/advert", false), ]; let mut engine = Engine::from_rules(&filters, Default::default()); engine.enable_tags(&["brian", "stuff"]); 
url_results.into_iter().for_each(|(url, expected_result)| { let request = Request::new(&url, "", "").unwrap(); let matched_rule = engine.check_network_request(&request); if expected_result { assert!(matched_rule.matched, "Expected match for {}", url); } else { assert!(!matched_rule.matched, "Expected no match for {}, matched with {:?}", url, matched_rule.filter); } }); } #[test] fn serialization_retains_tags() { let filters = [ "adv$tag=stuff", "somelongpath/test$tag=stuff", "||brianbondy.com/$tag=brian", "||brave.com$tag=brian", ]; let url_results = [ ("http://example.com/advert.html", true), ("http://example.com/somelongpath/test/2.html", true), ("https://brianbondy.com/about", false), ("https://brave.com/about", false), ]; let mut engine = Engine::from_rules(&filters, Default::default()); engine.enable_tags(&["stuff"]); engine.enable_tags(&["brian"]); let serialized = engine.serialize_raw().unwrap(); let mut deserialized_engine = Engine::default(); deserialized_engine.enable_tags(&["stuff"]); deserialized_engine.deserialize(&serialized).unwrap(); url_results.into_iter().for_each(|(url, expected_result)| { let request = Request::new(&url, "", "").unwrap(); let matched_rule = deserialized_engine.check_network_request(&request); if expected_result { assert!(matched_rule.matched, "Expected match for {}", url); } else { assert!(!matched_rule.matched, "Expected no match for {}, matched with {:?}", url, matched_rule.filter); } }); } #[test] fn deserialization_backwards_compatible_plain() { // deserialization_generate_simple(); // assert!(false); // converted from the legacy compressed format let serialized = [209, 217, 58, 175, 0, 220, 0, 17, 145, 128, 145, 128, 145, 128, 145, 128, 145, 128, 145, 129, 207, 202, 167, 36, 217, 43, 56, 97, 176, 145, 157, 145, 206, 0, 3, 31, 255, 129, 1, 169, 97, 100, 45, 98, 97, 110, 110, 101, 114, 192, 192, 192, 192, 192, 192, 192, 192, 207, 186, 136, 69, 13, 115, 187, 170, 226, 192, 192, 145, 128, 144, 195, 145, 128, 144, 144, 128, 128, 
145, 128, 144, 145, 128]; let mut deserialized_engine = Engine::default(); deserialized_engine.deserialize(&serialized).unwrap(); let url = "http://example.com/ad-banner.gif"; let request = Request::new(&url, "", "").unwrap(); let matched_rule = deserialized_engine.check_network_request(&request); assert!(matched_rule.matched, "Expected match for {}", url); } #[test] fn deserialization_backwards_compatible_tags() { // deserialization_generate_tags(); // assert!(false); // converted from the legacy compressed format let serialized = [209, 217, 58, 175, 0, 220, 0, 17, 145, 128, 145, 128, 145, 128, 145, 128, 145, 128, 145, 128, 145, 128, 145, 157, 145, 206, 0, 3, 31, 255, 129, 1, 169, 97, 100, 45, 98, 97, 110, 110, 101, 114, 192, 192, 192, 192, 192, 192, 163, 97, 98, 99, 192, 207, 126, 212, 53, 83, 113, 159, 143, 134, 192, 192, 195, 145, 128, 144, 144, 128, 128, 145, 128, 144, 145, 128]; let mut deserialized_engine = Engine::default(); deserialized_engine.enable_tags(&[]); deserialized_engine.deserialize(&serialized).unwrap(); let url = "http://example.com/ad-banner.gif"; let request = Request::new(&url, "", "").unwrap(); let matched_rule = deserialized_engine.check_network_request(&request); assert!(!matched_rule.matched, "Expected NO match for {}", url); deserialized_engine.enable_tags(&["abc"]); deserialized_engine.deserialize(&serialized).unwrap(); let url = "http://example.com/ad-banner.gif"; let request = Request::new(&url, "", "").unwrap(); let matched_rule = deserialized_engine.check_network_request(&request); assert!(matched_rule.matched, "Expected match for {}", url); } #[test] fn deserialization_generate_simple() { let mut engine = Engine::from_rules(&[ "ad-banner", ], Default::default()); let serialized = engine.serialize_raw().unwrap(); println!("Engine serialized: {:?}", serialized); engine.deserialize(&serialized).unwrap(); } #[test] fn deserialization_generate_tags() { let mut engine = Engine::from_rules(&[ "ad-banner$tag=abc", ], Default::default()); 
engine.use_tags(&["abc"]); let serialized = engine.serialize_raw().unwrap(); println!("Engine serialized: {:?}", serialized); engine.deserialize(&serialized).unwrap(); } #[test] fn deserialization_generate_resources() { let mut engine = Engine::from_rules(&[ "ad-banner$redirect=nooptext", ], Default::default()); engine.use_resources([ Resource::simple("nooptext", MimeType::TextPlain, ""), Resource::simple("noopcss", MimeType::TextCss, ""), ]); let serialized = engine.serialize_raw().unwrap(); println!("Engine serialized: {:?}", serialized); engine.deserialize(&serialized).unwrap(); } #[test] fn redirect_resource_insertion_works() { let mut engine = Engine::from_rules(&[ "ad-banner$redirect=nooptext", "script.js$redirect=noop.js", ], Default::default()); let script = r#" (function() { ; })(); "#; let mut resources = [ Resource::simple("nooptext", MimeType::TextPlain, ""), Resource::simple("noopjs", MimeType::ApplicationJavascript, script), ]; resources[1].aliases.push("noop.js".to_string()); engine.use_resources(resources); let url = "http://example.com/ad-banner.gif"; let request = Request::new(url, "", "").unwrap(); let matched_rule = engine.check_network_request(&request); assert!(matched_rule.matched, "Expected match for {}", url); assert_eq!(matched_rule.redirect, Some("data:text/plain;base64,".to_owned()), "Expected redirect to contain resource"); let url = "http://example.com/script.js"; let request = Request::new(url, "", "").unwrap(); let matched_rule = engine.check_network_request(&request); assert!(matched_rule.matched, "Expected match for {}", url); assert_eq!(matched_rule.redirect, Some(format!("data:application/javascript;base64,{}", base64::encode(format!("{}", script)))), "Expected redirect to contain resource"); } #[test] fn document() { let filters = [ "||example.com$document", "@@||sub.example.com$document", ]; let engine = Engine::from_rules_debug(&filters, Default::default()); 
assert!(engine.check_network_request(&Request::new("https://example.com", "https://example.com", "document").unwrap()).matched); assert!(!engine.check_network_request(&Request::new("https://example.com", "https://example.com", "script").unwrap()).matched); assert!(engine.check_network_request(&Request::new("https://sub.example.com", "https://sub.example.com", "document").unwrap()).exception.is_some()); } #[test] fn implicit_all() { { let engine = Engine::from_rules_debug(["||example.com^"], Default::default()); assert!(engine.check_network_request(&Request::new("https://example.com", "https://example.com", "document").unwrap()).matched); } { let engine = Engine::from_rules_debug(["||example.com^$first-party"], Default::default()); assert!(engine.check_network_request(&Request::new("https://example.com", "https://example.com", "document").unwrap()).matched); } { let engine = Engine::from_rules_debug(["||example.com^$script"], Default::default()); assert!(!engine.check_network_request(&Request::new("https://example.com", "https://example.com", "document").unwrap()).matched); } { let engine = Engine::from_rules_debug(["||example.com^$~script"], Default::default()); assert!(!engine.check_network_request(&Request::new("https://example.com", "https://example.com", "document").unwrap()).matched); } { let engine = Engine::from_rules_debug(["||example.com^$document", "@@||example.com^$generichide"], Default::default()); assert!(engine.check_network_request(&Request::new("https://example.com", "https://example.com", "document").unwrap()).matched); } { let engine = Engine::from_rules_debug(["example.com"], ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(engine.check_network_request(&Request::new("https://example.com", "https://example.com", "document").unwrap()).matched); } { let engine = Engine::from_rules_debug(["||example.com/path"], Default::default()); assert!(!engine.check_network_request(&Request::new("https://example.com/path", 
"https://example.com/path", "document").unwrap()).matched); } { let engine = Engine::from_rules_debug(["||example.com/path^"], Default::default()); assert!(!engine.check_network_request(&Request::new("https://example.com/path", "https://example.com/path", "document").unwrap()).matched); } } #[test] fn generichide() { let filters = [ "##.donotblock", "##a[href=\"generic.com\"]", "@@||example.com$generichide", "example.com##.block", "@@||example2.com/test.html$generichide", "example2.com##.block", ]; let url_results = [ ("https://example.com", vec![".block"], true), ("https://example.com/test.html", vec![".block"], true), ("https://example2.com", vec![".block", "a[href=\"generic.com\"]"], false), ("https://example2.com/test.html", vec![".block"], true), ]; let engine = Engine::from_rules(&filters, Default::default()); url_results.into_iter().for_each(|(url, expected_result, expected_generichide)| { let result = engine.url_cosmetic_resources(url); assert_eq!(result.hide_selectors, expected_result.iter().map(|s| s.to_string()).collect::>()); assert_eq!(result.generichide, expected_generichide); }); } #[test] fn important_redirect() { let mut filter_set = FilterSet::new(true); filter_set.add_filters([ "||addthis.com^$important,3p,domain=~missingkids.com|~missingkids.org|~sainsburys.jobs|~sitecore.com|~amd.com", "||addthis.com/*/addthis_widget.js$script,redirect=addthis.com/addthis_widget.js", ], Default::default()); let mut engine = Engine::from_filter_set(filter_set, false); engine.add_resource( Resource::simple("addthis.com/addthis_widget.js", MimeType::ApplicationJavascript, "window.addthis = undefined"), ).unwrap(); let request = Request::new("https://s7.addthis.com/js/250/addthis_widget.js?pub=resto", "https://www.rhmodern.com/catalog/product/product.jsp?productId=prod14970086&categoryId=cat7150028", "script").unwrap(); let result = engine.check_network_request(&request); assert!(result.redirect.is_some()); } #[test] fn check_match_case_regex_filtering() { { // 
match case without regex is discarded let engine = Engine::from_rules_debug(["ad.png$match-case"], Default::default()); let request = Request::new("https://example.com/ad.png", "https://example.com", "image").unwrap(); assert!(!engine.check_network_request(&request).matched); } { // /^https:\/\/[0-9a-z]{3,}\.[-a-z]{10,}\.(?:li[fv]e|top|xyz)\/[a-z]{8}\/\?utm_campaign=\w{40,}/$doc,match-case,domain=life|live|top|xyz let engine = Engine::from_rules_debug([r#"/^https:\/\/[0-9a-z]{3,}\.[-a-z]{10,}\.(?:li[fv]e|top|xyz)\/[a-z]{8}\/\?utm_campaign=\w{40,}/$doc,match-case,domain=life|live|top|xyz"#], Default::default()); let request = Request::new("https://www.exampleaaa.xyz/testtest/?utm_campaign=aaaaaaaaaabbbbbbbbbbccccccccccdddddddddd", "https://www.exampleaaa.xyz/testtest/?utm_campaign=aaaaaaaaaabbbbbbbbbbccccccccccdddddddddd", "document").unwrap(); assert!(engine.check_network_request(&request).matched); } // fails - because of non-supported look around operator in rust regex https://github.com/rust-lang/regex/issues/127#issuecomment-154713666 /*{ // /^https?:\/\/((?!www)[a-z]{3,}|\d{2})?\.?[-0-9a-z]{6,}\.[a-z]{2,6}\/(?:[a-z]{6,8}\/)?\/?\?u=[0-9a-z]{7}&o=[0-9a-z]{7}/$doc,frame,match-case,domain=buzz|com|de|fun|guru|info|life|live|mobi|online|pw|site|space|top|us|xyz let engine = Engine::from_rules_debug([r#"/^https?:\/\/((?!www)[a-z]{3,}|\d{2})?\.?[-0-9a-z]{6,}\.[a-z]{2,6}\/(?:[a-z]{6,8}\/)?\/?\?u=[0-9a-z]{7}&o=[0-9a-z]{7}/$doc,frame,match-case,domain=buzz|com|de|fun|guru|info|life|live|mobi|online|pw|site|space|top|us|xyz"#], Default::default()); let request = Request::new("https://example.com/aaaaaa/?u=aaaaaaa&o=bbbbbbb", "https://example.com/aaaaaa/?u=aaaaaaa&o=bbbbbbb", "document").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ // fails - because of non-supported look around operator in rust regex https://github.com/rust-lang/regex/issues/127#issuecomment-154713666 /*{ // 
/^https:\/\/(?:www\d\.)?[-a-z]{6,}\.(?:com|info|net|org)\/(?=[-_a-zA-Z]{0,42}\d)(?=[-_0-9a-z]{0,42}[A-Z])[-_0-9a-zA-Z]{43}\/\?cid=[-_0-9a-zA-Z]{16,36}(?:&qs\d=\S+)?&sid=[_0-9a-f]{1,32}$/$doc,match-case,domain=com|info|net|org let engine = Engine::from_rules_debug([r#"/^https:\/\/(?:www\d\.)?[-a-z]{6,}\.(?:com|info|net|org)\/(?=[-_a-zA-Z]{0,42}\d)(?=[-_0-9a-z]{0,42}[A-Z])[-_0-9a-zA-Z]{43}\/\?cid=[-_0-9a-zA-Z]{16,36}(?:&qs\d=\S+)?&sid=[_0-9a-f]{1,32}$/$doc,match-case,domain=com|info|net|org"#], Default::default()); let request = Request::new("https://www3.example.com/aaaaaaaaaabbbbbbbbbbccccccccccddddddddddAA5/?cid=aaaaaaaaaabbbbbb&qs5=\n&sid=a", "https://www3.example.com/aaaaaaaaaabbbbbbbbbbccccccccccddddddddddAA5/?cid=aaaaaaaaaabbbbbb&qs5=\n&sid=a", "document").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ // fails - because of non-supported look around operator in rust regex https://github.com/rust-lang/regex/issues/127#issuecomment-154713666 /*{ // /^https:\/\/(?:www\d\.)?[-a-z]{6,}\.(?:com|info|net|org)\/(?=[-_a-zA-Z]{0,42}\d)(?=[-_0-9a-z]{0,42}[A-Z])[-_0-9a-zA-Z]{43}\/\?sid=[_0-9a-f]{1,32}(?:&qs\d=\S+)?&cid=[-_0-9a-zA-Z]{16,36}$/$doc,match-case,domain=com|info|net|org let engine = Engine::from_rules_debug([r#"/^https:\/\/(?:www\d\.)?[-a-z]{6,}\.(?:com|info|net|org)\/(?=[-_a-zA-Z]{0,42}\d)(?=[-_0-9a-z]{0,42}[A-Z])[-_0-9a-zA-Z]{43}\/\?cid=[-_0-9a-zA-Z]{16,36}(?:&qs\d=\S+)?&sid=[_0-9a-f]{1,32}$/$doc,match-case,domain=com|info|net|org"#], Default::default()); let request = Request::new("https://www3.example.com/aaaaaaaaaabbbbbbbbbbccccccccccddddddddddAA5/?sid=1&qs1=\n&cid=aaaaaaaaaabbbbbb", "https://www3.example.com/aaaaaaaaaabbbbbbbbbbccccccccccddddddddddAA5/?sid=1&qs1=\n&cid=aaaaaaaaaabbbbbb", "document").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ { // /^http:\/\/[a-z]{5}\.[a-z]{5}\.com\/[a-z]{10}\.apk$/$doc,match-case,domain=com let engine = 
Engine::from_rules_debug([r#"/^http:\/\/[a-z]{5}\.[a-z]{5}\.com\/[a-z]{10}\.apk$/$doc,match-case,domain=com"#], Default::default()); let request = Request::new("http://abcde.abcde.com/aaaaabbbbb.apk", "http://abcde.abcde.com/aaaaabbbbb.apk", "document").unwrap(); assert!(engine.check_network_request(&request).matched); } // fails - because of non-supported look around operator in rust regex https://github.com/rust-lang/regex/issues/127#issuecomment-154713666 /*{ // /\/[A-Z]\/[-0-9a-z]{5,}\.com\/(?:[0-9a-f]{2}\/){3}[0-9a-f]{32}\.js$/$script,1p,match-case let engine = Engine::from_rules_debug([r#"/\/[A-Z]\/[-0-9a-z]{5,}\.com\/(?:[0-9a-f]{2}\/){3}[0-9a-f]{32}\.js$/$script,1p,match-case"#], Default::default()); let request = Request::new("/A/aaaaa.com/aa/bb/cc/aaaaaaaabbbbbbbbccccccccdddddddd.js", "/A/aaaaa.com/aa/bb/cc/aaaaaaaabbbbbbbbccccccccdddddddd.js", "script").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ // fails - because of non-supported look around operator in rust regex https://github.com/rust-lang/regex/issues/127#issuecomment-154713666 /*{ // /^https?:\/\/(?:[a-z]{2}\.)?[0-9a-z]{7,16}\.com\/[a-z](?=[a-z]{0,25}[0-9A-Z])[0-9a-zA-Z]{3,26}\/(?:[1-5]\d{4}|[3-9]\d{3})\??(?:_=\d+|v=\d)?$/$frame,script,xhr,popup,3p,match-case let engine = Engine::from_rules_debug([r#"/^https?:\/\/(?:[a-z]{2}\.)?[0-9a-z]{7,16}\.com\/[a-z](?=[a-z]{0,25}[0-9A-Z])[0-9a-zA-Z]{3,26}\/(?:[1-5]\d{4}|[3-9]\d{3})\??(?:_=\d+|v=\d)?$/$frame,script,xhr,popup,3p,match-case"#], Default::default()); let request = Request::new("https://aa.example.com/aAaaa/12222", "https://aa.example.net/aAaaa/12222", "frame").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ // fails - because of non-supported look around operator in rust regex https://github.com/rust-lang/regex/issues/127#issuecomment-154713666 /*{ // 
/^https?:\/\/(?:[a-z]{2}\.)?[0-9a-z]{7,16}\.website\/[a-z](?=[a-z]{0,25}[0-9A-Z])[0-9a-zA-Z]{3,26}\/(?:[1-5]\d{4}|[3-9]\d{3})\??(?:_=\d+|v=\d)?$/$frame,script,xhr,popup,3p,match-case let engine = Engine::from_rules_debug([r#"/^https?:\/\/(?:[a-z]{2}\.)?[0-9a-z]{7,16}\.website\/[a-z](?=[a-z]{0,25}[0-9A-Z])[0-9a-zA-Z]{3,26}\/(?:[1-5]\d{4}|[3-9]\d{3})\??(?:_=\d+|v=\d)?$/$frame,script,xhr,popup,3p,match-case"#], Default::default()); let request = Request::new("https://aa.example.website/aAaaa/12222", "https://aa.example.website/aAaaa/12222", "frame").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ // fails - because of non-supported look around operator in rust regex https://github.com/rust-lang/regex/issues/127#issuecomment-154713666 /*{ // /^https?:\/\/[a-z]{8,15}\.top(\/(?:\d{1,5}|0NaN|articles?|browse|index|movie|news|pages?|static|view|web|wiki)){1,4}(?:\.html|\/)$/$frame,3p,match-case let engine = Engine::from_rules_debug([r#"/^https?:\/\/[a-z]{8,15}\.top(\/(?:\d{1,5}|0NaN|articles?|browse|index|movie|news|pages?|static|view|web|wiki)){1,4}(?:\.html|\/)$/$frame,3p,match-case"#], Default::default()); let request = Request::new("https://examples.top/articles.html", "https://examples.top/articles.html", "frame").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ { // /^https?:\/\/[a-z]{8,15}\.top\/[a-z]{4,}\.json$/$xhr,3p,match-case let engine = Engine::from_rules_debug([r#"/^https?:\/\/[a-z]{8,15}\.top\/[a-z]{4,}\.json$/$xhr,3p,match-case"#], Default::default()); let request = Request::new("https://examples.top/abcd.json", "https://examples.com/abcd.json", "xhr").unwrap(); assert!(engine.check_network_request(&request).matched); } // fails - inferring unescaped `$` inside regex pattern /*{ // /^https?:\/\/[a-z]{8,15}\.top\/[-a-z]{4,}\.css\?aHR0c[\/0-9a-zA-Z]{33,}=?=?$/$css,3p,match-case let engine = 
Engine::from_rules_debug([r#"/^https?:\/\/[a-z]{8,15}\.top\/[-a-z]{4,}\.css\?aHR0c[\/0-9a-zA-Z]{33,}=?=?$/$css,3p,match-case"#], Default::default()); let request = Request::new("https://examples.top/abcd.css?aHR0c/aaaaaaaaaaAAAAAAAAAA000000000012==", "https://examples.com/abcd.css?aHR0c/aaaaaaaaaaAAAAAAAAAA000000000012==", "stylesheet").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ // fails - inferring unescaped `$` inside regex pattern /*{ // /^https?:\/\/[a-z]{8,15}\.top\/[a-z]{4,}\.png\?aHR0c[\/0-9a-zA-Z]{33,}=?=?$/$image,3p,match-case let engine = Engine::from_rules_debug([r#"/^https?:\/\/[a-z]{8,15}\.top\/[a-z]{4,}\.png\?aHR0c[\/0-9a-zA-Z]{33,}=?=?$/$image,3p,match-case"#], Default::default()); let request = Request::new("https://examples.top/abcd.png?aHR0c/aaaaaaaaaaAAAAAAAAAA000000000012==", "https://examples.com/abcd.png?aHR0c/aaaaaaaaaaAAAAAAAAAA000000000012==", "image").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ // fails - because of non-supported look around operator in rust regex https://github.com/rust-lang/regex/issues/127#issuecomment-154713666 /*{ // /^https?:\/\/[a-z]{8,15}\.xyz(\/(?:\d{1,5}|0NaN|articles?|browse|index|movie|news|pages?|static|view|web|wiki)){1,4}(?:\.html|\/)$/$frame,3p,match-case let engine = Engine::from_rules_debug([r#"/^https?:\/\/[a-z]{8,15}\.xyz(\/(?:\d{1,5}|0NaN|articles?|browse|index|movie|news|pages?|static|view|web|wiki)){1,4}(?:\.html|\/)$/$frame,3p,match-case"#], Default::default()); let request = Request::new("https://examples.xyz/articles.html", "https://examples.xyz/articles.html", "frame").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ { // /^https?:\/\/cdn\.[a-z]{4,6}\.xyz\/app\.js$/$script,3p,match-case let engine = Engine::from_rules_debug([r#"/^https?:\/\/cdn\.[a-z]{4,6}\.xyz\/app\.js$/$script,3p,match-case"#], Default::default()); let request = Request::new("https://cdn.abcde.xyz/app.js", "https://cdn.abcde.com/app.js", 
"script").unwrap(); assert!(engine.check_network_request(&request).matched); } // fails - because of non-supported look around operator in rust regex https://github.com/rust-lang/regex/issues/127#issuecomment-154713666 /*{ // /^https:\/\/a\.[-0-9a-z]{4,16}\.(?:club|com?|cyou|info|net|ru|site|top?|xxx|xyz)\/(?=[a-z]{0,6}[0-9A-Z])[0-9a-zA-Z]{7}\.js$/$script,match-case let engine = Engine::from_rules_debug([r#"/^https:\/\/a\.[-0-9a-z]{4,16}\.(?:club|com?|cyou|info|net|ru|site|top?|xxx|xyz)\/(?=[a-z]{0,6}[0-9A-Z])[0-9a-zA-Z]{7}\.js$/$script,match-case"#], Default::default()); let request = Request::new("https://a.abcd.club/aaaaaaA.js", "https://a.abcd.club/aaaaaaA.js", "script").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ { // /^https:\/\/cdn\.jsdelivr\.net\/npm\/[-a-z_]{4,22}@latest\/dist\/script\.min\.js$/$script,3p,match-case let engine = Engine::from_rules_debug([r#"/^https:\/\/cdn\.jsdelivr\.net\/npm\/[-a-z_]{4,22}@latest\/dist\/script\.min\.js$/$script,3p,match-case"#], Default::default()); let request = Request::new("https://cdn.jsdelivr.net/npm/abcd@latest/dist/script.min.js", "https://cdn.jsdelivr.com/npm/abcd@latest/dist/script.min.js", "script").unwrap(); assert!(engine.check_network_request(&request).matched); } // fails - inferring unescaped `$` inside regex pattern /*{ // /^https?:\/\/[-.0-9a-z]+\/script\.js$/$script,1p,strict3p,match-case let engine = Engine::from_rules_debug([r#"/^https?:\/\/[-.0-9a-z]+\/script\.js$/$script,1p,strict3p,match-case"#], Default::default()); let request = Request::new("https://www.example.com/script.js", "https://www.abc.com/script.js", "script").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ { let engine = Engine::from_rules_debug([r#"/tesT߶/$domain=example.com"#], Default::default()); let request = Request::new("https://example.com/tesT߶", "https://example.com", "script").unwrap(); assert!(engine.check_network_request(&request).matched); } // fails - punycoded domain /*{ 
let engine = Engine::from_rules_debug([r#"/tesT߶/$domain=example.com"#], Default::default()); let request = Request::new("https://example-tesT߶.com/tesT", "https://example.com", "script").unwrap(); assert!(engine.check_network_request(&request).matched); }*/ } #[test] fn scriptlet_permissions() { use crate::resources::{PermissionMask, ResourceType}; const UBO_PERM: PermissionMask = PermissionMask::from_bits(0b00000001); const BRAVE_PERM: PermissionMask = PermissionMask::from_bits(0b00000011); let resources = [ Resource::simple("refresh-defuser.js", MimeType::ApplicationJavascript, "refresh-defuser"), Resource { name: "trusted-set-cookie.js".to_string(), aliases: vec![], kind: ResourceType::Mime(MimeType::ApplicationJavascript), content: base64::encode("trusted-set-cookie"), dependencies: vec![], permission: UBO_PERM, }, Resource { name: "brave-fix.js".to_string(), aliases: vec![], kind: ResourceType::Mime(MimeType::ApplicationJavascript), content: base64::encode("brave-fix"), dependencies: vec![], permission: BRAVE_PERM, }, ]; let mut filter_set = FilterSet::new(false); filter_set.add_filters([ "sub1.example.com##+js(refresh-defuser)", "sub2.example.com##+js(trusted-set-cookie)", "sub3.example.com##+js(brave-fix)" ], Default::default()); filter_set.add_filters([ "sub4.example.com##+js(refresh-defuser)", "sub5.example.com##+js(trusted-set-cookie)", "sub6.example.com##+js(brave-fix)" ], ParseOptions { permissions: UBO_PERM, ..Default::default() }); filter_set.add_filters([ "sub7.example.com##+js(refresh-defuser)", "sub8.example.com##+js(trusted-set-cookie)", "sub9.example.com##+js(brave-fix)" ], ParseOptions { permissions: BRAVE_PERM, ..Default::default() }); let mut engine = Engine::from_filter_set(filter_set, true); engine.use_resources(resources); fn wrap_try(scriptlet_content: &str) -> String { format!("try {{\n{}\n}} catch ( e ) {{ }}\n", scriptlet_content) } assert_eq!(engine.url_cosmetic_resources("https://sub1.example.com").injected_script, 
wrap_try("refresh-defuser")); assert_eq!(engine.url_cosmetic_resources("https://sub2.example.com").injected_script, ""); assert_eq!(engine.url_cosmetic_resources("https://sub3.example.com").injected_script, ""); assert_eq!(engine.url_cosmetic_resources("https://sub4.example.com").injected_script, wrap_try("refresh-defuser")); assert_eq!(engine.url_cosmetic_resources("https://sub5.example.com").injected_script, wrap_try("trusted-set-cookie")); assert_eq!(engine.url_cosmetic_resources("https://sub6.example.com").injected_script, ""); assert_eq!(engine.url_cosmetic_resources("https://sub7.example.com").injected_script, wrap_try("refresh-defuser")); assert_eq!(engine.url_cosmetic_resources("https://sub8.example.com").injected_script, wrap_try("trusted-set-cookie")); assert_eq!(engine.url_cosmetic_resources("https://sub9.example.com").injected_script, wrap_try("brave-fix")); } #[test] fn quoted_scriptlet_args() { use crate::resources::{MimeType, ResourceType}; let resources = [ Resource { name: "trusted-set-local-storage-item.js".into(), aliases: vec![], kind: ResourceType::Mime(MimeType::ApplicationJavascript), content: base64::encode("function trustedSetLocalStorageItem(key = '', value = '') { setLocalStorageItemFn('local', true, key, value); }"), dependencies: vec![], permission: Default::default(), }, ]; let mut filter_set = FilterSet::new(false); filter_set.add_filters([ r#"dailymail.co.uk##+js(trusted-set-local-storage-item, mol.ads.cmp.tcf.cache, 
'{"getTCData":{"cmpId":27,"cmpVersion":3,"gdprApplies":true,"tcfPolicyVersion":2,"tcString":"CPyz5QAPyz5QAAbADCENC6CgAAAAAAAAAAwIAAASjAJINW4gCLMscGaQEIoEAIgjCQggUAAFAILRAQAODgp2VgE6MIkAAAUARABAhwAQAQCAAASABCAAJAAwQAAAiAQAAAAQCAAAMCAILACgAAAABANAhRCgAECQAyIAIpTAgKgSCAFsKAAADJCQCAKgMAKARGgEACIIARGAAACwMAgBICFggABMQbBAAMACAESoBoCTEwBACDQFgBkADLAGzAPsA_ACAAEFAIwASYAp8BaAFpAOqAfIBDoCJgEiAKRAXIAyMBk4DlAI_gSKEQEwBkADLAGzAPsA_ACAAEYAJMAU8A6oB8gEOgJEAUiAuQBkYDJwHKAR_AkU.f_gAAagAAAAA","eventStatus":"useractioncomplete","cmpStatus":"loaded","isServiceSpecific":true,"useNonStandardStacks":false,"publisherCC":"GB","purposeOneTreatment":false,"addtlConsent":"1~","acmVersion":2,"molGvlVersion":"186.gb.web","nrvString":"1~","nrvVersion":1,"repromptVersion":5},"getStoredRepromptVersion":5,"hasUserConsentedToAll":false,"hasUserDissentedToAll":true,"getConsentDegree":"no","getValidTCData":{"cmpId":27,"cmpVersion":3,"gdprApplies":true,"tcfPolicyVersion":2,"tcString":"CPyz5QAPyz5QAAbADCENC6CgAAAAAAAAAAwIAAASjAJINW4gCLMscGaQEIoEAIgjCQggUAAFAILRAQAODgp2VgE6MIkAAAUARABAhwAQAQCAAASABCAAJAAwQAAAiAQAAAAQCAAAMCAILACgAAAABANAhRCgAECQAyIAIpTAgKgSCAFsKAAADJCQCAKgMAKARGgEACIIARGAAACwMAgBICFggABMQbBAAMACAESoBoCTEwBACDQFgBkADLAGzAPsA_ACAAEFAIwASYAp8BaAFpAOqAfIBDoCJgEiAKRAXIAyMBk4DlAI_gSKEQEwBkADLAGzAPsA_ACAAEYAJMAU8A6oB8gEOgJEAUiAuQBkYDJwHKAR_AkU.f_gAAagAAAAA","listenerId":1,"eventStatus":"useractioncomplete","cmpStatus":"loaded","isServiceSpecific":true,"useNonStandardStacks":false,"publisherCC":"GB","purposeOneTreatment":false,"addtlConsent":"1~","acmVersion":2,"molGvlVersion":"186.gb.web","nrvString":"1~","nrvVersion":1,"repromptVersion":5}}')"#, // invalid - unclosed quoted arg r#"example.com##+js(trusted-set-local-storage-item, "test)"#, // invalid - closing quote does not surround the argument r#"example.com##+js(trusted-set-local-storage-item, "test"test, 3)"#, ], Default::default()); let mut engine = Engine::from_filter_set(filter_set, true); engine.use_resources(resources); 
assert_eq!(engine.url_cosmetic_resources("https://dailymail.co.uk").injected_script, r#"try { (function trustedSetLocalStorageItem(key = '', value = '') { setLocalStorageItemFn('local', true, key, value); })("mol.ads.cmp.tcf.cache", "{\"getTCData\":{\"cmpId\":27,\"cmpVersion\":3,\"gdprApplies\":true,\"tcfPolicyVersion\":2,\"tcString\":\"CPyz5QAPyz5QAAbADCENC6CgAAAAAAAAAAwIAAASjAJINW4gCLMscGaQEIoEAIgjCQggUAAFAILRAQAODgp2VgE6MIkAAAUARABAhwAQAQCAAASABCAAJAAwQAAAiAQAAAAQCAAAMCAILACgAAAABANAhRCgAECQAyIAIpTAgKgSCAFsKAAADJCQCAKgMAKARGgEACIIARGAAACwMAgBICFggABMQbBAAMACAESoBoCTEwBACDQFgBkADLAGzAPsA_ACAAEFAIwASYAp8BaAFpAOqAfIBDoCJgEiAKRAXIAyMBk4DlAI_gSKEQEwBkADLAGzAPsA_ACAAEYAJMAU8A6oB8gEOgJEAUiAuQBkYDJwHKAR_AkU.f_gAAagAAAAA\",\"eventStatus\":\"useractioncomplete\",\"cmpStatus\":\"loaded\",\"isServiceSpecific\":true,\"useNonStandardStacks\":false,\"publisherCC\":\"GB\",\"purposeOneTreatment\":false,\"addtlConsent\":\"1~\",\"acmVersion\":2,\"molGvlVersion\":\"186.gb.web\",\"nrvString\":\"1~\",\"nrvVersion\":1,\"repromptVersion\":5},\"getStoredRepromptVersion\":5,\"hasUserConsentedToAll\":false,\"hasUserDissentedToAll\":true,\"getConsentDegree\":\"no\",\"getValidTCData\":{\"cmpId\":27,\"cmpVersion\":3,\"gdprApplies\":true,\"tcfPolicyVersion\":2,\"tcString\":\"CPyz5QAPyz5QAAbADCENC6CgAAAAAAAAAAwIAAASjAJINW4gCLMscGaQEIoEAIgjCQggUAAFAILRAQAODgp2VgE6MIkAAAUARABAhwAQAQCAAASABCAAJAAwQAAAiAQAAAAQCAAAMCAILACgAAAABANAhRCgAECQAyIAIpTAgKgSCAFsKAAADJCQCAKgMAKARGgEACIIARGAAACwMAgBICFggABMQbBAAMACAESoBoCTEwBACDQFgBkADLAGzAPsA_ACAAEFAIwASYAp8BaAFpAOqAfIBDoCJgEiAKRAXIAyMBk4DlAI_gSKEQEwBkADLAGzAPsA_ACAAEYAJMAU8A6oB8gEOgJEAUiAuQBkYDJwHKAR_AkU.f_gAAagAAAAA\",\"listenerId\":1,\"eventStatus\":\"useractioncomplete\",\"cmpStatus\":\"loaded\",\"isServiceSpecific\":true,\"useNonStandardStacks\":false,\"publisherCC\":\"GB\",\"purposeOneTreatment\":false,\"addtlConsent\":\"1~\",\"acmVersion\":2,\"molGvlVersion\":\"186.gb.web\",\"nrvString\":\"1~\",\"nrvVersion\":1,\"repromptVersion\":5}}") } catch ( e ) 
{ } "#.to_owned()); assert_eq!(engine.url_cosmetic_resources("https://example.com").injected_script, ""); } } adblock-0.8.12/src/filters/cosmetic.rs000064400000000000000000002374161046102023000157430ustar 00000000000000//! Filters that take effect at a page-content level, including CSS selector-based filtering and //! content script injection. use memchr::{memchr as find_char, memmem, memrchr as find_char_reverse}; use once_cell::sync::Lazy; use regex::Regex; use serde::{Deserialize, Serialize}; use thiserror::Error; use crate::resources::PermissionMask; use crate::utils::Hash; use css_validation::{is_valid_css_style, validate_css_selector}; #[derive(Debug, Error, PartialEq)] pub enum CosmeticFilterError { #[error("punycode error")] PunycodeError, #[error("invalid action specifier")] InvalidActionSpecifier, #[error("unsupported syntax")] UnsupportedSyntax, #[error("missing sharp")] MissingSharp, #[error("invalid css style")] InvalidCssStyle, #[error("invalid css selector")] InvalidCssSelector, #[error("generic unhide")] GenericUnhide, #[error("generic script inject")] GenericScriptInject, #[error("generic action")] GenericAction, #[error("double negation")] DoubleNegation, #[error("empty rule")] EmptyRule, #[error("html filtering is unsupported")] HtmlFilteringUnsupported, #[error("scriptlet args could not be parsed")] InvalidScriptletArgs, #[error("location modifiers are unsupported")] LocationModifiersUnsupported, } /// Refer to #[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] pub enum CosmeticFilterAction { Remove, /// Argument is one or more CSS property declarations, separated by the standard ;. Some /// characters, strings, and values are forbidden. 
Style(String), RemoveAttr(String), RemoveClass(String), }

impl CosmeticFilterAction {
    /// Builds a `Style` action from the argument of a `:style(...)` operator.
    ///
    /// Returns `InvalidCssStyle` if `is_valid_css_style` rejects the argument.
    fn new_style(style: &str) -> Result<Self, CosmeticFilterError> {
        if !is_valid_css_style(style) {
            return Err(CosmeticFilterError::InvalidCssStyle);
        }
        Ok(Self::Style(style.to_string()))
    }

    /// Builds a `RemoveAttr` action from the argument of a `:remove-attr(...)` operator.
    ///
    /// Returns `UnsupportedSyntax` for regex or quoted arguments.
    fn new_remove_attr(attr: &str) -> Result<Self, CosmeticFilterError> {
        Self::forbid_regex_or_quoted_args(attr)?;
        Ok(Self::RemoveAttr(attr.to_string()))
    }

    /// Builds a `RemoveClass` action from the argument of a `:remove-class(...)` operator.
    ///
    /// Returns `UnsupportedSyntax` for regex or quoted arguments.
    fn new_remove_class(class: &str) -> Result<Self, CosmeticFilterError> {
        Self::forbid_regex_or_quoted_args(class)?;
        Ok(Self::RemoveClass(class.to_string()))
    }

    /// Regex and quoted args aren't supported yet: anything starting with `/`, `"` or `'` is
    /// rejected with `UnsupportedSyntax`.
    fn forbid_regex_or_quoted_args(arg: &str) -> Result<(), CosmeticFilterError> {
        match arg.chars().next() {
            Some('/') | Some('"') | Some('\'') => Err(CosmeticFilterError::UnsupportedSyntax),
            _ => Ok(()),
        }
    }
}

bitflags::bitflags! {
    /// Boolean flags for cosmetic filter rules.
    #[derive(Serialize, Deserialize)]
    pub struct CosmeticFilterMask: u8 {
        const UNHIDE = 1 << 0;
        const SCRIPT_INJECT = 1 << 1;
        const IS_UNICODE = 1 << 2;
        const IS_CLASS_SELECTOR = 1 << 3;
        const IS_ID_SELECTOR = 1 << 4;
        const IS_SIMPLE = 1 << 5;

        // Careful with checking for NONE - will always match
        const NONE = 0;
    }
}

/// Struct representing a parsed cosmetic filter rule.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CosmeticFilter {
    /// Sorted hashes of the `entity.*` locations the rule applies to, if any.
    pub entities: Option<Vec<Hash>>,
    /// Sorted hashes of the `hostname` locations the rule applies to, if any.
    pub hostnames: Option<Vec<Hash>>,
    /// Flags describing the kind of rule and selector.
    pub mask: CosmeticFilterMask,
    /// Sorted hashes of the `~entity.*` locations the rule is excluded from, if any.
    pub not_entities: Option<Vec<Hash>>,
    /// Sorted hashes of the `~hostname` locations the rule is excluded from, if any.
    pub not_hostnames: Option<Vec<Hash>>,
    /// The original rule text; only populated when the rule was parsed with `debug` set.
    pub raw_line: Option<Box<String>>,
    /// The CSS selector, or the scriptlet arguments when `SCRIPT_INJECT` is set.
    pub selector: String,
    /// For class and id selectors, the key derived from the selector without its leading
    /// `.` / `#`.
    pub key: Option<String>,
    /// Optional action (`:style(...)`, `:remove()`, ...) attached to the selector.
    pub action: Option<CosmeticFilterAction>,
    /// Permissions granted to the rule for accessing scriptlet resources.
    pub permission: PermissionMask,
}

/// The kind of a single comma separated location item found before the `##` separator.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CosmeticFilterLocationType {
    /// `entity.*`
    Entity,
    /// `~entity.*`
    NotEntity,
    /// `hostname`
    Hostname,
    /// `~hostname`
    NotHostname,
}

/// Contains hashes of all of the comma separated location items that were populated before the
/// hash separator in a cosmetic filter rule.
#[derive(Default)]
struct CosmeticFilterLocations {
    /// Locations of the form `entity.*`
    entities: Option<Vec<Hash>>,
    /// Locations of the form `~entity.*`
    not_entities: Option<Vec<Hash>>,
    /// Locations of the form `hostname`
    hostnames: Option<Vec<Hash>>,
    /// Locations of the form `~hostname`
    not_hostnames: Option<Vec<Hash>>,
}

impl CosmeticFilter {
    /// Iterates over the comma separated location items in `line[..sharp_index]`.
    ///
    /// Empty items are skipped. Each remaining item is yielded together with its kind, with the
    /// leading `~` (negation) and trailing `.*` (entity) markers stripped from the returned text.
    #[inline]
    pub fn locations_before_sharp(
        line: &str,
        sharp_index: usize,
    ) -> impl Iterator<Item = (CosmeticFilterLocationType, &str)> {
        line[..sharp_index]
            .split(',')
            .filter(|part| !part.is_empty())
            .map(|part| {
                let (negated, rest) = match part.strip_prefix('~') {
                    Some(rest) => (true, rest),
                    None => (false, part),
                };
                let (is_entity, location) = match rest.strip_suffix(".*") {
                    Some(entity) => (true, entity),
                    None => (false, rest),
                };
                let location_type = match (negated, is_entity) {
                    (true, true) => CosmeticFilterLocationType::NotEntity,
                    (true, false) => CosmeticFilterLocationType::NotHostname,
                    (false, true) => CosmeticFilterLocationType::Entity,
                    (false, false) => CosmeticFilterLocationType::Hostname,
                };
                (location_type, location)
            })
    }

    /// Parses the contents of a cosmetic filter rule up to the `##` or `#@#` separator.
    ///
    /// On success, returns hashes of all the comma separated location items that were populated in
    /// the rule.
    ///
    /// This should only be called if `sharp_index` is greater than 0. When it is 0 there is
    /// nothing before the separator, and all four location lists are `None`
    /// (i.e. `CosmeticFilterLocations::default()`).
#[inline] fn parse_before_sharp( line: &str, sharp_index: usize, mask: &mut CosmeticFilterMask, ) -> Result { let mut entities_vec = vec![]; let mut not_entities_vec = vec![]; let mut hostnames_vec = vec![]; let mut not_hostnames_vec = vec![]; if line.starts_with('[') { return Err(CosmeticFilterError::LocationModifiersUnsupported); } for (location_type, location) in Self::locations_before_sharp(line, sharp_index) { let mut hostname = String::new(); if location.is_ascii() { hostname.push_str(location); } else { *mask |= CosmeticFilterMask::IS_UNICODE; match idna::domain_to_ascii(location) { Ok(x) if !x.is_empty() => hostname.push_str(&x), _ => return Err(CosmeticFilterError::PunycodeError), } } let hash = crate::utils::fast_hash(&hostname); match location_type { CosmeticFilterLocationType::NotEntity => not_entities_vec.push(hash), CosmeticFilterLocationType::NotHostname => not_hostnames_vec.push(hash), CosmeticFilterLocationType::Entity => entities_vec.push(hash), CosmeticFilterLocationType::Hostname => hostnames_vec.push(hash), } } /// Sorts `vec` and wraps it in `Some` if it's not empty, or returns `None` if it is. #[inline] fn sorted_or_none(mut vec: Vec) -> Option> { if !vec.is_empty() { vec.sort(); Some(vec) } else { None } } let entities = sorted_or_none(entities_vec); let hostnames = sorted_or_none(hostnames_vec); let not_entities = sorted_or_none(not_entities_vec); let not_hostnames = sorted_or_none(not_hostnames_vec); Ok(CosmeticFilterLocations { entities, not_entities, hostnames, not_hostnames, }) } /// Parses the contents of a cosmetic filter rule following the `##` or `#@#` separator. /// /// On success, returns `selector` and `style` according to the rule. /// /// This should only be called if the rule part after the separator has been confirmed not to /// be a script injection rule using `+js()`. 
#[inline] fn parse_after_sharp_nonscript( after_sharp: &str, ) -> Result<(&str, Option), CosmeticFilterError> { if after_sharp.starts_with('^') { return Err(CosmeticFilterError::HtmlFilteringUnsupported); } const STYLE_TOKEN: &[u8] = b":style("; const REMOVE_ATTR_TOKEN: &[u8] = b":remove-attr("; const REMOVE_CLASS_TOKEN: &[u8] = b":remove-class("; const REMOVE_TOKEN: &str = ":remove()"; const PAIRS: &[(&[u8], fn(&str) -> Result)] = &[ (STYLE_TOKEN, CosmeticFilterAction::new_style), (REMOVE_ATTR_TOKEN, CosmeticFilterAction::new_remove_attr), (REMOVE_CLASS_TOKEN, CosmeticFilterAction::new_remove_class), ]; let action; let selector; 'init: { for (token, constructor) in PAIRS { if let Some(i) = memmem::find(after_sharp.as_bytes(), token) { if after_sharp.ends_with(')') { // indexing safe because of find and ends_with let arg = &after_sharp[i + token.len()..after_sharp.len() - 1]; action = Some(constructor(arg)?); selector = &after_sharp[..i]; break 'init; } else { return Err(CosmeticFilterError::InvalidActionSpecifier); } } } if let Some(before_suffix) = after_sharp.strip_suffix(REMOVE_TOKEN) { action = Some(CosmeticFilterAction::Remove); selector = before_suffix; break 'init; } else { action = None; selector = after_sharp; } } Ok((selector, action)) } /// Parse the rule in `line` into a `CosmeticFilter`. If `debug` is true, the original rule /// will be reported in the resulting `CosmeticFilter` struct as well. Use `permission` to /// manage the filter's access to scriptlet resources for `+js(...)` injections. 
pub fn parse(
        line: &str,
        debug: bool,
        permission: PermissionMask,
    ) -> Result<CosmeticFilter, CosmeticFilterError> {
        let mut mask = CosmeticFilterMask::NONE;

        // Locate the two `#` characters that delimit the separator (`##`, `#@#`, `#?#`, ...).
        let sharp_index =
            find_char(b'#', line.as_bytes()).ok_or(CosmeticFilterError::MissingSharp)?;
        let after_sharp_index = sharp_index + 1;
        let second_sharp_index = find_char(b'#', line[after_sharp_index..].as_bytes())
            .map(|i| i + after_sharp_index)
            .ok_or(CosmeticFilterError::UnsupportedSyntax)?;

        let mut translate_abp_syntax = false;

        // Consume filter options embedded in the `##` marker:
        let mut between_sharps = &line[after_sharp_index..second_sharp_index];
        if let Some(rest) = between_sharps.strip_prefix('@') {
            // Exception marker will always come first.
            if sharp_index == 0 {
                return Err(CosmeticFilterError::GenericUnhide);
            }
            mask |= CosmeticFilterMask::UNHIDE;
            between_sharps = rest;
        }
        if between_sharps.starts_with('%') {
            // AdGuard script injection syntax - not supported
            // `#%#` / `#@%#`
            return Err(CosmeticFilterError::UnsupportedSyntax);
        }
        if between_sharps.starts_with('$') {
            // AdGuard `:style` syntax - not supported for now
            // `#$?#` for CSS rules, `#@$?#` — for exceptions
            return Err(CosmeticFilterError::UnsupportedSyntax);
        }
        if let Some(rest) = between_sharps.strip_prefix('?') {
            // ABP/ADG extended CSS syntax:
            // - #?# — for element hiding, #@?# — for exceptions
            translate_abp_syntax = true;
            between_sharps = rest;
        }
        if !between_sharps.is_empty() {
            return Err(CosmeticFilterError::UnsupportedSyntax);
        }

        let suffix_start_index = second_sharp_index + 1;

        let CosmeticFilterLocations {
            entities,
            not_entities,
            hostnames,
            not_hostnames,
        } = if sharp_index > 0 {
            CosmeticFilter::parse_before_sharp(line, sharp_index, &mut mask)?
        } else {
            CosmeticFilterLocations::default()
        };

        let after_sharp = line[suffix_start_index..].trim();

        if after_sharp.is_empty() {
            return Err(CosmeticFilterError::EmptyRule);
        }

        let (selector, action) = if line.len() - suffix_start_index > 4
            && line[suffix_start_index..].starts_with("+js(")
            && line.ends_with(')')
        {
            // Script injection: the "selector" is the raw argument list inside `+js(...)`.
            if sharp_index == 0 {
                return Err(CosmeticFilterError::GenericScriptInject);
            }
            let args = &line[suffix_start_index + 4..line.len() - 1];
            if crate::resources::parse_scriptlet_args(args).is_none() {
                return Err(CosmeticFilterError::InvalidScriptletArgs);
            }
            mask |= CosmeticFilterMask::SCRIPT_INJECT;
            (String::from(args), None)
        } else {
            let (selector, action) = CosmeticFilter::parse_after_sharp_nonscript(after_sharp)?;
            let validated_selector = validate_css_selector(selector, translate_abp_syntax)
                .ok_or(CosmeticFilterError::InvalidCssSelector)?;
            // Actions are only allowed on rules with at least one location.
            if sharp_index == 0 && action.is_some() {
                return Err(CosmeticFilterError::GenericAction);
            }
            (validated_selector, action)
        };

        // An exception (`#@#`) combined with negated locations would be a double negation.
        if (not_entities.is_some() || not_hostnames.is_some())
            && mask.contains(CosmeticFilterMask::UNHIDE)
        {
            return Err(CosmeticFilterError::DoubleNegation);
        }

        if !selector.is_ascii() {
            mask |= CosmeticFilterMask::IS_UNICODE;
        }

        // Class and id selectors get a lookup key (the selector key minus its leading `.`/`#`).
        let selector_kind = if mask.contains(CosmeticFilterMask::SCRIPT_INJECT) {
            None
        } else if selector.starts_with('.') {
            Some(CosmeticFilterMask::IS_CLASS_SELECTOR)
        } else if selector.starts_with('#') {
            Some(CosmeticFilterMask::IS_ID_SELECTOR)
        } else {
            None
        };
        let key = match selector_kind {
            Some(kind_flag) => {
                let key = key_from_selector(&selector)?;
                mask |= kind_flag;
                // The selector is "simple" when it consists of nothing but its key.
                if key == selector {
                    mask |= CosmeticFilterMask::IS_SIMPLE;
                }
                Some(String::from(&key[1..]))
            }
            None => None,
        };

        Ok(CosmeticFilter {
            entities,
            hostnames,
            mask,
            not_entities,
            not_hostnames,
            raw_line: if debug {
                Some(Box::new(String::from(line)))
            } else {
                None
            },
            selector,
            key,
            action,
            permission,
        })
    }

    /// Any cosmetic filter rule that specifies (possibly negated) hostnames or entities has a
    /// hostname constraint.
    pub fn has_hostname_constraint(&self) -> bool {
        self.hostnames.is_some()
            || self.entities.is_some()
            || self.not_entities.is_some()
            || self.not_hostnames.is_some()
    }

    /// In general, adding a hostname or entity to a rule *increases* the number of situations in
    /// which it applies. However, if a specific rule only has negated hostnames or entities, it
    /// technically should apply to any hostname which does not match a negation.
    ///
    /// To account for this inconsistency, this method will generate and return the corresponding
    /// 'hidden' generic rule if one applies.
    ///
    /// Note that this behavior is not applied to script injections or rules with actions.
    pub fn hidden_generic_rule(&self) -> Option<CosmeticFilter> {
        let has_positive_location = self.hostnames.is_some() || self.entities.is_some();
        let has_negated_location = self.not_hostnames.is_some() || self.not_entities.is_some();
        let is_plain_selector_rule =
            self.action.is_none() && !self.mask.contains(CosmeticFilterMask::SCRIPT_INJECT);

        if has_positive_location || !has_negated_location || !is_plain_selector_rule {
            return None;
        }

        // Same rule, with the negated locations dropped.
        let mut generic_rule = self.clone();
        generic_rule.not_hostnames = None;
        generic_rule.not_entities = None;
        Some(generic_rule)
    }
}

/// Splits `hostname` around its public suffix, using `domain` (the registrable domain of
/// `hostname`) to find where the suffix starts.
///
/// Returns the part of `hostname` before the public suffix (without the separating dot) together
/// with the public suffix itself, e.g. `("www.example", "co.uk")` for `www.example.co.uk` with
/// domain `example.co.uk`. Returns `None` if `domain` contains no dot.
fn get_hostname_without_public_suffix<'a>(
    hostname: &'a str,
    domain: &str,
) -> Option<(&'a str, &'a str)> {
    // Everything after the first dot of the registrable domain is the public suffix.
    let index_of_dot = find_char(b'.', domain.as_bytes())?;
    let public_suffix_len = domain.len() - index_of_dot - 1;

    // `hostname` is expected to end with `.<public suffix>`. Use checked arithmetic and checked
    // slicing so that an inconsistent `hostname`/`domain` pair yields `None` instead of a panic.
    let prefix_end = hostname.len().checked_sub(public_suffix_len + 1)?;
    let suffix_start = hostname.len() - public_suffix_len;

    Some((hostname.get(..prefix_end)?, hostname.get(suffix_start..)?))
}

/// Given a hostname and the indices of an end position and the start of the domain, returns a
/// `Vec` of hashes of all subdomains the hostname falls under, ordered from least to most
/// specific.
///
/// Check the `label_hashing` tests for examples.
fn get_hashes_from_labels(hostname: &str, end: usize, start_of_domain: usize) -> Vec<Hash> {
    let mut hashes = vec![];
    if end == 0 {
        return hashes;
    }

    // Walk the dots to the left of `start_of_domain` from right to left; each one starts a
    // longer (more specific) label sequence ending at `end`.
    let mut dot_ptr = start_of_domain;
    while let Some(dot_index) = find_char_reverse(b'.', hostname[..dot_ptr].as_bytes()) {
        dot_ptr = dot_index;
        hashes.push(crate::utils::fast_hash(&hostname[dot_ptr + 1..end]));
    }

    // Finally, the full hostname up to `end`.
    hashes.push(crate::utils::fast_hash(&hostname[..end]));

    hashes
}

/// Returns a `Vec` of the hashes of all segments of `hostname` that may match an
/// entity-constrained rule.
///
/// The result is empty if `domain` has no public suffix (i.e. contains no dot).
pub fn get_entity_hashes_from_labels(hostname: &str, domain: &str) -> Vec<Hash> {
    match get_hostname_without_public_suffix(hostname, domain) {
        Some((hostname_without_public_suffix, public_suffix)) => {
            let mut hashes = get_hashes_from_labels(
                hostname_without_public_suffix,
                hostname_without_public_suffix.len(),
                hostname_without_public_suffix.len(),
            );
            hashes.push(crate::utils::fast_hash(public_suffix));
            hashes
        }
        None => vec![],
    }
}

/// Returns a `Vec` of the hashes of all segments of `hostname` that may match a
/// hostname-constrained rule.
pub fn get_hostname_hashes_from_labels(hostname: &str, domain: &str) -> Vec<Hash> {
    // `domain` is expected to be a suffix of `hostname`; saturate rather than underflow if a
    // caller ever passes a longer `domain`.
    let start_of_domain = hostname.len().saturating_sub(domain.len());
    get_hashes_from_labels(hostname, hostname.len(), start_of_domain)
}

/// Stand-in used when the `css-validation` feature is disabled: every selector and style is
/// accepted unchanged.
#[cfg(not(feature = "css-validation"))]
mod css_validation {
    /// Returns `selector` as-is, without any validation.
    pub fn validate_css_selector(selector: &str, _accept_abp_selectors: bool) -> Option<String> {
        Some(selector.to_string())
    }

    /// Accepts every style.
    pub fn is_valid_css_style(_style: &str) -> bool {
        true
    }
}

#[cfg(feature = "css-validation")]
mod css_validation {
    //! Methods for validating CSS selectors and style rules extracted from cosmetic filter rules.
    use core::fmt::{Result as FmtResult, Write};
    use cssparser::{CowRcStr, ParseError, Parser, ParserInput, SourceLocation, ToCss, Token};
    use selectors::parser::SelectorParseErrorKind;

    /// Returns a validated canonical CSS selector for the given input, or nothing if one can't be
    /// determined.
    ///
    /// For the majority of filters, this works by trivial regex matching. More complex filters are
    /// assembled into a mock stylesheet which is then parsed using `cssparser` and validated.
    ///
    /// In addition to normalizing formatting, this function will remove unsupported procedural
    /// selectors and convert others to canonical representations (i.e. `:-abp-has` -> `:has`).
    pub fn validate_css_selector(selector: &str, accept_abp_selectors: bool) -> Option<String> {
        use once_cell::sync::Lazy;
        use regex::Regex;
        static RE_SIMPLE_SELECTOR: Lazy<Regex> =
            Lazy::new(|| Regex::new(r"^[#.]?[A-Za-z_][\w-]*$").unwrap());

        // Fast path: a bare tag, class or id selector needs no further parsing.
        if RE_SIMPLE_SELECTOR.is_match(selector) {
            return Some(selector.to_string());
        }

        // Use `mock-stylesheet-marker` where uBO uses `color: red` since we have control over the
        // parsing logic within the block.
        let mock_stylesheet = format!("{}{{mock-stylesheet-marker}}", selector);
        let mut pi = ParserInput::new(&mock_stylesheet);
        let mut parser = Parser::new(&mut pi);
        let mut rule_list_parser = cssparser::RuleListParser::new_for_stylesheet(
            &mut parser,
            QualifiedRuleParserImpl {
                accept_abp_selectors,
            },
        );

        let prelude = rule_list_parser
            .next()
            .and_then(|r| r.ok())
            .map(|prelude| prelude.to_css_string());

        // The mock stylesheet must contain exactly one rule; anything after it means the
        // selector smuggled in extra content.
        if rule_list_parser.next().is_some() {
            return None;
        }

        prelude
    }

    /// Rule parser for the mock stylesheet built by `validate_css_selector`.
    struct QualifiedRuleParserImpl {
        accept_abp_selectors: bool,
    }

    impl<'i> cssparser::QualifiedRuleParser<'i> for QualifiedRuleParserImpl {
        type Prelude = selectors::SelectorList<SelectorImpl>;
        type QualifiedRule = selectors::SelectorList<SelectorImpl>;
        type Error = ();

        /// Parses the selector list in front of the block; any selector error is reported as a
        /// custom `()` parse error.
        fn parse_prelude<'t>(
            &mut self,
            input: &mut Parser<'i, 't>,
        ) -> Result<Self::Prelude, ParseError<'i, Self::Error>> {
            selectors::SelectorList::parse(
                &SelectorParseImpl {
                    accept_abp_selectors: self.accept_abp_selectors,
                },
                input,
            )
            .map_err(|_| ParseError {
                kind: cssparser::ParseErrorKind::Custom(()),
                location: SourceLocation { line: 0, column: 0 },
            })
        }

        /// Check that the block is exactly equal to "mock-stylesheet-marker", and return just the
        /// selector list as the prelude
        fn parse_block<'t>(
            &mut self,
            prelude: Self::Prelude,
            _start: &cssparser::ParserState,
            input: &mut Parser<'i, 't>,
        ) -> Result<Self::QualifiedRule, ParseError<'i, Self::Error>> {
            let err = Err(ParseError {
                kind: cssparser::ParseErrorKind::Custom(()),
                location: SourceLocation { line: 0, column: 0 },
            });
            match input.next() {
                Ok(Token::Ident(i)) if i.as_ref() == "mock-stylesheet-marker" => (),
                _ => return err,
            }
            // Nothing may follow the marker inside the block.
            if input.next().is_ok() {
                return err;
            }
            Ok(prelude)
        }
    }

    /// Default implementations for `AtRuleParser` parsing methods reject every at-rule. This is
    /// acceptable; at-rules should not be valid in cosmetic rules.
impl cssparser::AtRuleParser<'_> for QualifiedRuleParserImpl { type PreludeNoBlock = (); type PreludeBlock = (); /// unused; just to satisfy type checking type AtRule = selectors::SelectorList; type Error = (); } pub fn is_valid_css_style(style: &str) -> bool { if style.contains('\\') { return false; } if style.contains("url(") { return false; } if style.contains("/*") { return false; } true } struct SelectorParseImpl { accept_abp_selectors: bool, } fn nested_matching_close(arg: &Token) -> Option> { match arg { Token::Function(..) => Some(Token::CloseParenthesis), Token::ParenthesisBlock => Some(Token::CloseParenthesis), Token::CurlyBracketBlock => Some(Token::CloseCurlyBracket), Token::SquareBracketBlock => Some(Token::CloseSquareBracket), _ => None, } } /// Just convert the rest of the selector to a string fn to_css_nested<'i>( arguments: &mut Parser<'i, '_>, ) -> Result>> { let mut inner = String::new(); while let Ok(arg) = arguments.next_including_whitespace() { if arg.to_css(&mut inner).is_err() { return Err(arguments.new_custom_error(SelectorParseErrorKind::InvalidState)); }; if let Some(closing_token) = nested_matching_close(arg) { let nested = arguments.parse_nested_block(to_css_nested)?; inner.push_str(&nested); closing_token.to_css(&mut inner).map_err(|_| { arguments.new_custom_error(SelectorParseErrorKind::InvalidState) })?; } } Ok(inner) } impl<'i> selectors::parser::Parser<'i> for SelectorParseImpl { type Impl = SelectorImpl; type Error = SelectorParseErrorKind<'i>; fn parse_slotted(&self) -> bool { true } fn parse_part(&self) -> bool { true } fn parse_is_and_where(&self) -> bool { true } fn parse_host(&self) -> bool { true } fn parse_non_ts_pseudo_class( &self, _location: SourceLocation, name: CowRcStr<'i>, ) -> Result< ::NonTSPseudoClass, ParseError<'i, Self::Error>, > { Ok(NonTSPseudoClass::AnythingElse(name.to_string(), None)) } fn parse_non_ts_functional_pseudo_class<'t>( &self, name: CowRcStr<'i>, arguments: &mut Parser<'i, 't>, ) -> Result< 
::NonTSPseudoClass, ParseError<'i, Self::Error>, > { let canonical_name = match (self.accept_abp_selectors, name.as_ref()) { (true, "-abp-has") => Some("has"), _ => None, } .unwrap_or(name.as_ref()); match canonical_name { "-abp-contains" | "-abp-has" | "-abp-properties" | "has-text" | "if" | "if-not" | "matches-attr" | "matches-css" | "matches-css-after" | "matches-css-before" | "matches-media" | "matches-path" | "min-text-length" | "nth-ancestor" | "properties" | "subject" | "upward" | "remove" | "remove-attr" | "remove-class" | "watch-attr" | "xpath" => { return Err(arguments.new_custom_error( SelectorParseErrorKind::UnsupportedPseudoClassOrElement(name), )) } _ => (), } let inner_selector = to_css_nested(arguments)?; Ok(NonTSPseudoClass::AnythingElse( canonical_name.to_string(), Some(inner_selector), )) } fn parse_pseudo_element( &self, _location: SourceLocation, name: CowRcStr<'i>, ) -> Result< ::PseudoElement, ParseError<'i, Self::Error>, > { Ok(PseudoElement(name.to_string(), None)) } fn parse_functional_pseudo_element<'t>( &self, name: CowRcStr<'i>, arguments: &mut Parser<'i, 't>, ) -> Result< ::PseudoElement, ParseError<'i, Self::Error>, > { let inner_selector = to_css_nested(arguments)?; Ok(PseudoElement(name.to_string(), Some(inner_selector))) } } /// The `selectors` library requires an object that implements `SelectorImpl` to store data /// about a parsed selector. For performance, the actual content of parsed selectors is /// discarded as much as possible - it only matters whether the returned `Result` is `Ok` or /// `Err`. 
#[derive(Debug, Clone)]
struct SelectorImpl;

impl selectors::parser::SelectorImpl for SelectorImpl {
    type ExtraMatchingData = ();
    type AttrValue = CssString;
    type Identifier = CssIdent;
    type LocalName = CssString;
    type NamespaceUrl = DummyValue;
    type NamespacePrefix = DummyValue;
    type BorrowedNamespaceUrl = DummyValue;
    type BorrowedLocalName = CssString;

    type NonTSPseudoClass = NonTSPseudoClass;
    type PseudoElement = PseudoElement;
}

/// Serialized using `CssStringWriter`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
struct CssString(String);

impl ToCss for CssString {
    fn to_css<W: Write>(&self, dest: &mut W) -> core::fmt::Result {
        cssparser::CssStringWriter::new(dest).write_str(&self.0)
    }
}

impl<'a> From<&'a str> for CssString {
    fn from(s: &'a str) -> Self {
        CssString(s.to_string())
    }
}

/// Serialized using `serialize_identifier`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
struct CssIdent(String);

impl ToCss for CssIdent {
    fn to_css<W: Write>(&self, dest: &mut W) -> core::fmt::Result {
        cssparser::serialize_identifier(&self.0, dest)
    }
}

impl<'a> From<&'a str> for CssIdent {
    fn from(s: &'a str) -> Self {
        CssIdent(s.to_string())
    }
}

/// For performance, individual fields of parsed selectors are discarded. Instead, they are
/// parsed into a `DummyValue` with no fields.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
struct DummyValue(String);

impl ToCss for DummyValue {
    fn to_css<W: Write>(&self, dest: &mut W) -> core::fmt::Result {
        write!(dest, "{}", self.0)
    }
}

impl<'a> From<&'a str> for DummyValue {
    fn from(s: &'a str) -> Self {
        DummyValue(s.to_string())
    }
}

/// Dummy struct for non-tree-structural pseudo-classes.
#[derive(Clone, PartialEq, Eq)]
enum NonTSPseudoClass {
    /// Any native CSS pseudoclass that isn't a procedural operator. Second argument contains inner arguments, if present.
    AnythingElse(String, Option<String>),
}

impl selectors::parser::NonTSPseudoClass for NonTSPseudoClass {
    type Impl = SelectorImpl;

    fn is_active_or_hover(&self) -> bool {
        false
    }

    fn is_user_action_state(&self) -> bool {
        false
    }
}

impl ToCss for NonTSPseudoClass {
    /// Writes `:name` or `:name(args)`.
    fn to_css<W: Write>(&self, dest: &mut W) -> FmtResult {
        write!(dest, ":")?;
        match self {
            Self::AnythingElse(name, None) => write!(dest, "{}", name)?,
            Self::AnythingElse(name, Some(args)) => write!(dest, "{}({})", name, args)?,
        }
        Ok(())
    }
}

/// Dummy struct for pseudo-elements.
#[derive(Clone, PartialEq, Eq)]
struct PseudoElement(String, Option<String>);

impl selectors::parser::PseudoElement for PseudoElement {
    type Impl = SelectorImpl;

    fn valid_after_slotted(&self) -> bool {
        true
    }
}

impl ToCss for PseudoElement {
    /// Writes `::name` or `::name(args)`.
    fn to_css<W: Write>(&self, dest: &mut W) -> FmtResult {
        write!(dest, "::")?;
        match self {
            Self(name, None) => write!(dest, "{}", name)?,
            Self(name, Some(args)) => write!(dest, "{}({})", name, args)?,
        }
        Ok(())
    }
}

#[test]
fn bad_selector_inputs() {
    assert!(validate_css_selector(r#"rm -rf ./*"#, false).is_none());
    assert!(validate_css_selector(r#"javascript:alert("All pseudo-classes are valid")"#, false).is_some());
    assert!(validate_css_selector(r#"javascript:alert("But opening comments are still forbidden" /*)"#, false).is_none());
    assert!(validate_css_selector(r#"This is not a CSS selector."#, false).is_none());
    assert!(validate_css_selector(r#"./malware.sh"#, false).is_none());
    assert!(validate_css_selector(r#"https://safesite.ru"#, false).is_none());
    assert!(validate_css_selector(r#"(function(){var e=60;return String.fromCharCode(e.charCodeAt(0))})();"#, false).is_none());
    assert!(validate_css_selector(r#"#!/usr/bin/sh"#, false).is_none());
    assert!(validate_css_selector(r#"input,input/*"#, false).is_none());
    // Accept a closing comment within a string. It should still be impossible to create an
    // opening comment to match it.
    assert!(validate_css_selector(r#"input[x="*/{}*{background:url(https://hackvertor.co.uk/images/logo.gif)}"]"#, false).is_some());
}

#[test]
fn escaped_quote_in_tag_name() {
    assert_eq!(validate_css_selector(r#"head\""#, false), Some(r#"head\""#.to_string()));
}
}

static RE_PLAIN_SELECTOR: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[#.][\w\\-]+").unwrap());
static RE_PLAIN_SELECTOR_ESCAPED: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^[#.](?:\\[0-9A-Fa-f]+ |\\.|\w|-)+").unwrap());
static RE_ESCAPE_SEQUENCE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\\([0-9A-Fa-f]+ |.)").unwrap());

/// Returns the first token of a CSS selector.
///
/// This should only be called once `selector` has been verified to start with either a "#" or "."
/// character.
///
/// Returns `CosmeticFilterError::InvalidCssSelector` if no leading class/id token can be matched,
/// or if a hexadecimal escape does not decode to a valid Unicode scalar value.
fn key_from_selector(selector: &str) -> Result<String, CosmeticFilterError> {
    // If there are no escape characters in the selector, just take the first class or id token.
    let plain = RE_PLAIN_SELECTOR
        .find(selector)
        .ok_or(CosmeticFilterError::InvalidCssSelector)?
        .as_str();
    if find_char(b'\\', plain.as_bytes()).is_none() {
        return Ok(plain.into());
    }

    // Otherwise, the characters in the selector must be escaped.
    let escaped = RE_PLAIN_SELECTOR_ESCAPED
        .find(selector)
        .ok_or(CosmeticFilterError::InvalidCssSelector)?
        .as_str();

    let mut key = String::with_capacity(selector.len());
    let mut beginning = 0;
    for capture in RE_ESCAPE_SEQUENCE.captures_iter(escaped) {
        // Unwrap is safe because the 0th capture group is the match itself
        let location = capture.get(0).unwrap();
        // Copy the literal text between the previous escape and this one.
        key.push_str(&escaped[beginning..location.start()]);
        beginning = location.end();
        // Unwrap is safe because there is a capture group specified in the regex
        let capture = capture.get(1).unwrap().as_str();
        if capture.chars().count() == 1 {
            // Check number of unicode characters rather than byte length
            key.push_str(capture);
        } else {
            // Hex escape: the capture is the hex digits followed by one terminating space.
            // This u32 conversion can overflow
            let codepoint = u32::from_str_radix(&capture[..capture.len() - 1], 16)
                .map_err(|_| CosmeticFilterError::InvalidCssSelector)?;

            // Not all u32s are valid Unicode codepoints
            key.push(
                core::char::from_u32(codepoint).ok_or(CosmeticFilterError::InvalidCssSelector)?,
            );
        }
    }
    key.push_str(&escaped[beginning..]);
    Ok(key)
}

#[cfg(test)]
mod key_from_selector_tests {
    use super::key_from_selector;

    #[test]
    fn no_escapes() {
        assert_eq!(key_from_selector(r#"#selector"#).unwrap(), "#selector");
        assert_eq!(key_from_selector(r#"#ad-box[href="https://popads.net"]"#).unwrap(), "#ad-box");
        assert_eq!(key_from_selector(r#".p"#).unwrap(), ".p");
        assert_eq!(key_from_selector(r#".ad #ad.adblockblock"#).unwrap(), ".ad");
        assert_eq!(key_from_selector(r#"#container.contained"#).unwrap(), "#container");
    }

    #[test]
    fn escaped_characters() {
        assert_eq!(key_from_selector(r"#Meebo\:AdElement\.Root").unwrap(), "#Meebo:AdElement.Root");
        assert_eq!(key_from_selector(r"#\ Banner\ Ad\ -\ 590\ x\ 90").unwrap(), "# Banner Ad - 590 x 90");
        assert_eq!(key_from_selector(r"#\ rek").unwrap(), "# rek");
        assert_eq!(key_from_selector(r#"#\:rr .nH[role="main"] .mq:first-child"#).unwrap(), "#:rr");
        assert_eq!(key_from_selector(r#"#adspot-300x600\,300x250-pos-1"#).unwrap(), "#adspot-300x600,300x250-pos-1");
        assert_eq!(key_from_selector(r#"#adv_\'146\'"#).unwrap(), "#adv_\'146\'");
        assert_eq!(key_from_selector(r#"#oas-mpu-left\<\/div\>"#).unwrap(), "#oas-mpu-left</div>");
        assert_eq!(key_from_selector(r#".Trsp\(op\).Trsdu\(3s\)"#).unwrap(), ".Trsp(op)");
    }

    #[test]
    fn escape_codes() {
        assert_eq!(key_from_selector(r#"#\5f _mom_ad_12"#).unwrap(), "#__mom_ad_12");
        assert_eq!(key_from_selector(r#"#\5f _nq__hh[style="display:block!important"]"#).unwrap(), "#__nq__hh");
        assert_eq!(key_from_selector(r#"#\31 000-014-ros"#).unwrap(), "#1000-014-ros");
        assert_eq!(key_from_selector(r#"#\33 00X250ad"#).unwrap(), "#300X250ad");
        assert_eq!(key_from_selector(r#"#\5f _fixme"#).unwrap(), "#__fixme");
        assert_eq!(key_from_selector(r#"#\37 28ad"#).unwrap(), "#728ad");
    }

    #[test]
    fn bad_escapes() {
        assert!(key_from_selector(r#"#\5ffffffffff overflows"#).is_err());
        assert!(key_from_selector(r#"#\5fffffff is_too_large"#).is_err());
    }
}

#[cfg(test)]
mod parse_tests {
    use super::*;

    /// An easily modified summary of a `CosmeticFilter` rule to be used in tests.
#[derive(Debug, PartialEq)] struct CosmeticFilterBreakdown { entities: Option>, hostnames: Option>, not_entities: Option>, not_hostnames: Option>, selector: String, key: Option, action: Option, unhide: bool, script_inject: bool, is_unicode: bool, is_class_selector: bool, is_id_selector: bool, } impl From<&CosmeticFilter> for CosmeticFilterBreakdown { fn from(filter: &CosmeticFilter) -> CosmeticFilterBreakdown { CosmeticFilterBreakdown { entities: filter.entities.as_ref().cloned(), hostnames: filter.hostnames.as_ref().cloned(), not_entities: filter.not_entities.as_ref().cloned(), not_hostnames: filter.not_hostnames.as_ref().cloned(), selector: filter.selector.clone(), key: filter.key.as_ref().cloned(), action: filter.action.as_ref().cloned(), unhide: filter.mask.contains(CosmeticFilterMask::UNHIDE), script_inject: filter.mask.contains(CosmeticFilterMask::SCRIPT_INJECT), is_unicode: filter.mask.contains(CosmeticFilterMask::IS_UNICODE), is_class_selector: filter.mask.contains(CosmeticFilterMask::IS_CLASS_SELECTOR), is_id_selector: filter.mask.contains(CosmeticFilterMask::IS_ID_SELECTOR), } } } impl From for CosmeticFilterBreakdown { fn from(filter: CosmeticFilter) -> CosmeticFilterBreakdown { (&filter).into() } } impl Default for CosmeticFilterBreakdown { fn default() -> Self { CosmeticFilterBreakdown { entities: None, hostnames: None, not_entities: None, not_hostnames: None, selector: "".to_string(), key: None, action: None, unhide: false, script_inject: false, is_unicode: false, is_class_selector: false, is_id_selector: false, } } } fn parse_cf(rule: &str) -> Result { CosmeticFilter::parse(rule, false, Default::default()) } /// Asserts that `rule` parses into a `CosmeticFilter` equivalent to the summary provided by /// `expected`. 
fn check_parse_result(rule: &str, expected: CosmeticFilterBreakdown) {
    // Panics (failing the calling test) if `rule` does not parse at all.
    let actual = CosmeticFilterBreakdown::from(parse_cf(rule).unwrap());
    assert_eq!(expected, actual);
}

/// Plain tag, id, class and attribute selectors with no hostname constraints.
#[test]
fn simple_selectors() {
    let tag_and_class = CosmeticFilterBreakdown {
        selector: String::from("div.popup"),
        ..Default::default()
    };
    check_parse_result("##div.popup", tag_and_class);

    let id_only = CosmeticFilterBreakdown {
        selector: String::from("#selector"),
        is_id_selector: true,
        key: Some(String::from("selector")),
        ..Default::default()
    };
    check_parse_result("###selector", id_only);

    let class_only = CosmeticFilterBreakdown {
        selector: String::from(".selector"),
        is_class_selector: true,
        key: Some(String::from("selector")),
        ..Default::default()
    };
    check_parse_result("##.selector", class_only);

    let tag_with_attribute = CosmeticFilterBreakdown {
        selector: String::from("a[href=\"foo.com\"]"),
        ..Default::default()
    };
    check_parse_result("##a[href=\"foo.com\"]", tag_with_attribute);

    let bare_attribute = CosmeticFilterBreakdown {
        selector: String::from("[href=\"foo.com\"]"),
        ..Default::default()
    };
    check_parse_result("##[href=\"foo.com\"]", bare_attribute);
}

/// Produces a sorted vec of the hashes of all the given domains.
///
/// For convenience, the return value is wrapped in a `Some()` to be consumed by a
/// `CosmeticFilterBreakdown`.
fn sort_hash_domains(domains: Vec<&str>) -> Option<Vec<Hash>> {
    let mut hashes: Vec<_> = domains.iter().map(|d| crate::utils::fast_hash(d)).collect();
    hashes.sort();
    Some(hashes)
}

#[test]
fn hostnames() {
    check_parse_result(
        r#"u00p.com##div[class^="adv-box"]"#,
        CosmeticFilterBreakdown {
            selector: r#"div[class^="adv-box"]"#.to_string(),
            hostnames: sort_hash_domains(vec!["u00p.com"]),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"distractify.com##div[class*="AdInArticle"]"#,
        CosmeticFilterBreakdown {
            selector: r#"div[class*="AdInArticle"]"#.to_string(),
            hostnames: sort_hash_domains(vec!["distractify.com"]),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"soundtrackcollector.com,the-numbers.com##a[href^="http://affiliates.allposters.com/"]"#,
        CosmeticFilterBreakdown {
            selector: r#"a[href^="http://affiliates.allposters.com/"]"#.to_string(),
            hostnames: sort_hash_domains(vec!["soundtrackcollector.com", "the-numbers.com"]),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"thelocal.at,thelocal.ch,thelocal.de,thelocal.dk,thelocal.es,thelocal.fr,thelocal.it,thelocal.no,thelocal.se##div[class*="-widget"]"#,
        CosmeticFilterBreakdown {
            selector: r#"div[class*="-widget"]"#.to_string(),
            hostnames: sort_hash_domains(vec![
                "thelocal.at",
                "thelocal.ch",
                "thelocal.de",
                "thelocal.dk",
                "thelocal.es",
                "thelocal.fr",
                "thelocal.it",
                "thelocal.no",
                "thelocal.se",
            ]),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"base64decode.org,base64encode.org,beautifyjson.org,minifyjson.org,numgen.org,pdfmrg.com,pdfspl.com,prettifycss.com,pwdgen.org,strlength.com,strreverse.com,uglifyjs.net,urldecoder.org##div[class^="banner_"]"#,
        CosmeticFilterBreakdown {
            selector: r#"div[class^="banner_"]"#.to_string(),
            hostnames: sort_hash_domains(vec![
                "base64decode.org",
                "base64encode.org",
                "beautifyjson.org",
                "minifyjson.org",
                "numgen.org",
                "pdfmrg.com",
                "pdfspl.com",
                "prettifycss.com",
                "pwdgen.org",
                "strlength.com",
                "strreverse.com",
                "uglifyjs.net",
                "urldecoder.org",
            ]),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"adforum.com,alliednews.com,americustimesrecorder.com,andovertownsman.com,athensreview.com,batesvilleheraldtribune.com,bdtonline.com,channel24.pk,chickashanews.com,claremoreprogress.com,cleburnetimesreview.com,clintonherald.com,commercejournal.com,commercial-news.com,coopercrier.com,cordeledispatch.com,corsicanadailysun.com,crossville-chronicle.com,cullmantimes.com,dailyiowegian.com,dailyitem.com,daltondailycitizen.com,derrynews.com,duncanbanner.com,eagletribune.com,edmondsun.com,effinghamdailynews.com,enewscourier.com,enidnews.com,farmtalknewspaper.com,fayettetribune.com,flasharcade.com,flashgames247.com,flyergroup.com,foxsportsasia.com,gainesvilleregister.com,gloucestertimes.com,goshennews.com,greensburgdailynews.com,heraldbanner.com,heraldbulletin.com,hgazette.com,homemagonline.com,itemonline.com,jacksonvilleprogress.com,jerusalemonline.com,joplinglobe.com,journal-times.com,journalexpress.net,kexp.org,kokomotribune.com,lockportjournal.com,mankatofreepress.com,mcalesternews.com,mccrearyrecord.com,mcleansborotimesleader.com,meadvilletribune.com,meridianstar.com,mineralwellsindex.com,montgomery-herald.com,mooreamerican.com,moultrieobserver.com,muskogeephoenix.com,ncnewsonline.com,newburyportnews.com,newsaegis.com,newsandtribune.com,niagara-gazette.com,njeffersonnews.com,normantranscript.com,opposingviews.com,orangeleader.com,oskaloosa.com,ottumwacourier.com,outlookmoney.com,palestineherald.com,panews.com,paulsvalleydailydemocrat.com,pellachronicle.com,pharostribune.com,pressrepublican.com,pryordailytimes.com,randolphguide.com,record-eagle.com,register-herald.com,register-news.com,reporter.net,rockwallheraldbanner.com,roysecityheraldbanner.com,rushvillerepublican.com,salemnews.com,sentinel-echo.com,sharonherald.com,shelbyvilledailyunion.com,siteslike.com,standardmedia.co.ke,starbeacon.com,stwnewspress.com,suwanneedemocrat.com,tahlequahdailypress.com,theadanews.com,theawesomer.com,thedailystar.com,thelandonline.com,themoreheadnews.com,thesnaponline.com,tiftongazette.com,times-news.com,timesenterprise.com,timessentinel.com,timeswv.com,tonawanda-news.com,tribdem.com,tribstar.com,unionrecorder.com,valdostadailytimes.com,washtimesherald.com,waurikademocrat.com,wcoutlook.com,weatherforddemocrat.com,woodwardnews.net,wrestlinginc.com##div[style="width:300px; height:250px;"]"#,
        CosmeticFilterBreakdown {
            selector: r#"div[style="width:300px; height:250px;"]"#.to_string(),
            hostnames: sort_hash_domains(vec![
                "adforum.com", "alliednews.com", "americustimesrecorder.com",
                "andovertownsman.com", "athensreview.com", "batesvilleheraldtribune.com",
                "bdtonline.com", "channel24.pk", "chickashanews.com",
                "claremoreprogress.com", "cleburnetimesreview.com", "clintonherald.com",
                "commercejournal.com", "commercial-news.com", "coopercrier.com",
                "cordeledispatch.com", "corsicanadailysun.com", "crossville-chronicle.com",
                "cullmantimes.com", "dailyiowegian.com", "dailyitem.com",
                "daltondailycitizen.com", "derrynews.com", "duncanbanner.com",
                "eagletribune.com", "edmondsun.com", "effinghamdailynews.com",
                "enewscourier.com", "enidnews.com", "farmtalknewspaper.com",
                "fayettetribune.com", "flasharcade.com", "flashgames247.com",
                "flyergroup.com", "foxsportsasia.com", "gainesvilleregister.com",
                "gloucestertimes.com", "goshennews.com", "greensburgdailynews.com",
                "heraldbanner.com", "heraldbulletin.com", "hgazette.com",
                "homemagonline.com", "itemonline.com", "jacksonvilleprogress.com",
                "jerusalemonline.com", "joplinglobe.com", "journal-times.com",
                "journalexpress.net", "kexp.org", "kokomotribune.com",
                "lockportjournal.com", "mankatofreepress.com", "mcalesternews.com",
                "mccrearyrecord.com", "mcleansborotimesleader.com", "meadvilletribune.com",
                "meridianstar.com", "mineralwellsindex.com", "montgomery-herald.com",
                "mooreamerican.com", "moultrieobserver.com", "muskogeephoenix.com",
                "ncnewsonline.com", "newburyportnews.com", "newsaegis.com",
                "newsandtribune.com", "niagara-gazette.com", "njeffersonnews.com",
                "normantranscript.com", "opposingviews.com", "orangeleader.com",
                "oskaloosa.com", "ottumwacourier.com", "outlookmoney.com",
                "palestineherald.com", "panews.com", "paulsvalleydailydemocrat.com",
                "pellachronicle.com", "pharostribune.com", "pressrepublican.com",
                "pryordailytimes.com", "randolphguide.com", "record-eagle.com",
                "register-herald.com", "register-news.com", "reporter.net",
                "rockwallheraldbanner.com", "roysecityheraldbanner.com",
                "rushvillerepublican.com", "salemnews.com", "sentinel-echo.com",
                "sharonherald.com", "shelbyvilledailyunion.com", "siteslike.com",
                "standardmedia.co.ke", "starbeacon.com", "stwnewspress.com",
                "suwanneedemocrat.com", "tahlequahdailypress.com", "theadanews.com",
                "theawesomer.com", "thedailystar.com", "thelandonline.com",
                "themoreheadnews.com", "thesnaponline.com", "tiftongazette.com",
                "times-news.com", "timesenterprise.com", "timessentinel.com",
                "timeswv.com", "tonawanda-news.com", "tribdem.com",
                "tribstar.com", "unionrecorder.com", "valdostadailytimes.com",
                "washtimesherald.com", "waurikademocrat.com", "wcoutlook.com",
                "weatherforddemocrat.com", "woodwardnews.net", "wrestlinginc.com",
            ]),
            ..Default::default()
        },
    );
}

#[test]
fn href() {
    check_parse_result(
        r#"##a[href$="/vghd.shtml"]"#,
        CosmeticFilterBreakdown {
            selector: r#"a[href$="/vghd.shtml"]"#.to_string(),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"##a[href*=".adk2x.com/"]"#,
        CosmeticFilterBreakdown {
            selector: r#"a[href*=".adk2x.com/"]"#.to_string(),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"##a[href^="//40ceexln7929.com/"]"#,
        CosmeticFilterBreakdown {
            selector: r#"a[href^="//40ceexln7929.com/"]"#.to_string(),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"##a[href*=".trust.zone"]"#,
        CosmeticFilterBreakdown {
            selector: r#"a[href*=".trust.zone"]"#.to_string(),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"tf2maps.net##a[href="http://forums.tf2maps.net/payments.php"]"#,
        CosmeticFilterBreakdown {
            selector: r#"a[href="http://forums.tf2maps.net/payments.php"]"#.to_string(),
            hostnames: sort_hash_domains(vec!["tf2maps.net"]),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"rarbg.to,rarbg.unblockall.org,rarbgaccess.org,rarbgmirror.com,rarbgmirror.org,rarbgmirror.xyz,rarbgproxy.com,rarbgproxy.org,rarbgunblock.com##a[href][target="_blank"] > button"#,
        CosmeticFilterBreakdown {
            selector: r#"a[href][target="_blank"] > button"#.to_string(),
            hostnames: sort_hash_domains(vec![
                "rarbg.to",
                "rarbg.unblockall.org",
                "rarbgaccess.org",
                "rarbgmirror.com",
                "rarbgmirror.org",
                "rarbgmirror.xyz",
                "rarbgproxy.com",
                "rarbgproxy.org",
                "rarbgunblock.com",
            ]),
            ..Default::default()
        },
    );
}

#[test]
fn injected_scripts() {
    check_parse_result(
        r#"hentaifr.net,jeu.info,tuxboard.com,xstory-fr.com##+js(goyavelab-defuser.js)"#,
        CosmeticFilterBreakdown {
            selector: r#"goyavelab-defuser.js"#.to_string(),
            hostnames: sort_hash_domains(vec![
                "hentaifr.net",
                "jeu.info",
                "tuxboard.com",
                "xstory-fr.com",
            ]),
            script_inject: true,
            ..Default::default()
        },
    );
    check_parse_result(
        r#"haus-garten-test.de,sozialversicherung-kompetent.de##+js(set-constant.js, Object.keys, trueFunc)"#,
        CosmeticFilterBreakdown {
            selector: r#"set-constant.js, Object.keys, trueFunc"#.to_string(),
            hostnames: sort_hash_domains(vec![
                "haus-garten-test.de",
                "sozialversicherung-kompetent.de",
            ]),
            script_inject: true,
            ..Default::default()
        },
    );
    check_parse_result(
        r#"airliners.de,auszeit.bio,autorevue.at,clever-tanken.de,fanfiktion.de,finya.de,frag-mutti.de,frustfrei-lernen.de,fussballdaten.de,gameswelt.*,liga3-online.de,lz.de,mt.de,psychic.de,rimondo.com,spielen.de,weltfussball.at,weristdeinfreund.de##+js(abort-current-inline-script.js, Number.isNaN)"#,
        CosmeticFilterBreakdown {
            selector: r#"abort-current-inline-script.js, Number.isNaN"#.to_string(),
            hostnames: sort_hash_domains(vec![
                "airliners.de",
                "auszeit.bio",
                "autorevue.at",
                "clever-tanken.de",
                "fanfiktion.de",
                "finya.de",
                "frag-mutti.de",
                "frustfrei-lernen.de",
                "fussballdaten.de",
                "liga3-online.de",
                "lz.de",
                "mt.de",
                "psychic.de",
                "rimondo.com",
                "spielen.de",
                "weltfussball.at",
                "weristdeinfreund.de",
            ]),
            entities: sort_hash_domains(vec!["gameswelt"]),
            script_inject: true,
            ..Default::default()
        },
    );
    check_parse_result(
        r#"prad.de##+js(abort-on-property-read.js, document.cookie)"#,
        CosmeticFilterBreakdown {
            selector: r#"abort-on-property-read.js, document.cookie"#.to_string(),
            hostnames: sort_hash_domains(vec!["prad.de"]),
            script_inject: true,
            ..Default::default()
        },
    );
    check_parse_result(
        r#"computerbild.de##+js(abort-on-property-read.js, Date.prototype.toUTCString)"#,
        CosmeticFilterBreakdown {
            selector: r#"abort-on-property-read.js, Date.prototype.toUTCString"#.to_string(),
            hostnames: sort_hash_domains(vec!["computerbild.de"]),
            script_inject: true,
            ..Default::default()
        },
    );
    check_parse_result(
        r#"computerbild.de##+js(setTimeout-defuser.js, ())return)"#,
        CosmeticFilterBreakdown {
            selector: r#"setTimeout-defuser.js, ())return"#.to_string(),
            hostnames: sort_hash_domains(vec!["computerbild.de"]),
            script_inject: true,
            ..Default::default()
        },
    );
}

#[test]
fn entities() {
    check_parse_result(
        r#"monova.*##+js(nowebrtc.js)"#,
        CosmeticFilterBreakdown {
            selector: r#"nowebrtc.js"#.to_string(),
            entities: sort_hash_domains(vec!["monova"]),
            script_inject: true,
            ..Default::default()
        },
    );
    check_parse_result(
        r#"monova.*##tr.success.desktop"#,
        CosmeticFilterBreakdown {
            selector: r#"tr.success.desktop"#.to_string(),
            entities: sort_hash_domains(vec!["monova"]),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"monova.*#@#script + [class] > [class]:first-child"#,
        CosmeticFilterBreakdown {
            selector: r#"script + [class] > [class]:first-child"#.to_string(),
            entities: sort_hash_domains(vec!["monova"]),
            unhide: true,
            ..Default::default()
        },
    );
    check_parse_result(
        r#"adshort.im,adsrt.*#@#[id*="ScriptRoot"]"#,
        CosmeticFilterBreakdown {
            selector: r#"[id*="ScriptRoot"]"#.to_string(),
            hostnames: sort_hash_domains(vec!["adshort.im"]),
            entities: sort_hash_domains(vec!["adsrt"]),
            unhide: true,
            ..Default::default()
        },
    );
    check_parse_result(
        r#"downloadsource.*##.date:not(dt):style(display: block !important;)"#,
        CosmeticFilterBreakdown {
            selector: r#".date:not(dt)"#.to_string(),
            entities: sort_hash_domains(vec!["downloadsource"]),
            action: Some(CosmeticFilterAction::Style("display: block !important;".into())),
            is_class_selector: true,
            key: Some("date".to_string()),
            ..Default::default()
        },
    );
}

#[test]
fn styles() {
    check_parse_result(
        r#"chip.de##.video-wrapper > video[style]:style(display:block!important;padding-top:0!important;)"#,
        CosmeticFilterBreakdown {
            selector: r#".video-wrapper > video[style]"#.to_string(),
            hostnames: sort_hash_domains(vec!["chip.de"]),
            action: Some(CosmeticFilterAction::Style("display:block!important;padding-top:0!important;".into())),
            is_class_selector: true,
            key: Some("video-wrapper".to_string()),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"allmusic.com##.advertising.medium-rectangle:style(min-height: 1px !important;)"#,
        CosmeticFilterBreakdown {
            selector: r#".advertising.medium-rectangle"#.to_string(),
            hostnames: sort_hash_domains(vec!["allmusic.com"]),
            action: Some(CosmeticFilterAction::Style("min-height: 1px !important;".into())),
            is_class_selector: true,
            key: Some("advertising".to_string()),
            ..Default::default()
        },
    );
    #[cfg(feature = "css-validation")]
    check_parse_result(
        r#"quora.com##.signup_wall_prevent_scroll .SiteHeader,.signup_wall_prevent_scroll .LoggedOutFooter,.signup_wall_prevent_scroll .ContentWrapper:style(filter: none !important;)"#,
        CosmeticFilterBreakdown {
            selector: r#".signup_wall_prevent_scroll .SiteHeader, .signup_wall_prevent_scroll .LoggedOutFooter, .signup_wall_prevent_scroll .ContentWrapper"#.to_string(),
            hostnames: sort_hash_domains(vec!["quora.com"]),
            action: Some(CosmeticFilterAction::Style("filter: none !important;".into())),
            is_class_selector: true,
            key: Some("signup_wall_prevent_scroll".to_string()),
            ..Default::default()
        }
    );
    check_parse_result(
        r#"imdb.com##body#styleguide-v2:style(background-color: #e3e2dd !important; background-image: none !important;)"#,
        CosmeticFilterBreakdown {
            selector: r#"body#styleguide-v2"#.to_string(),
            hostnames: sort_hash_domains(vec!["imdb.com"]),
            action: Some(CosmeticFilterAction::Style("background-color: #e3e2dd !important; background-image: none !important;".into())),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"streamcloud.eu###login > div[style^="width"]:style(display: block !important)"#,
        CosmeticFilterBreakdown {
            selector: r#"#login > div[style^="width"]"#.to_string(),
            hostnames: sort_hash_domains(vec!["streamcloud.eu"]),
            action: Some(CosmeticFilterAction::Style("display: block !important".into())),
            is_id_selector: true,
            key: Some("login".to_string()),
            ..Default::default()
        },
    );
    check_parse_result(
        r#"moonbit.co.in,moondoge.co.in,moonliteco.in##[src^="//coinad.com/ads/"]:style(visibility: collapse !important)"#,
        CosmeticFilterBreakdown {
            selector: r#"[src^="//coinad.com/ads/"]"#.to_string(),
            hostnames: sort_hash_domains(vec![
                "moonbit.co.in",
                "moondoge.co.in",
                "moonliteco.in",
            ]),
            action: Some(CosmeticFilterAction::Style("visibility: collapse !important".into())),
            ..Default::default()
        },
    );
}

#[test]
fn unicode() {
    check_parse_result(
        "###неделя",
        CosmeticFilterBreakdown {
            selector: "#неделя".to_string(),
            is_unicode: true,
            is_id_selector: true,
            key: Some("неделя".to_string()),
            ..Default::default()
        },
    );
    check_parse_result(
        "неlloworlд.com#@##week",
        CosmeticFilterBreakdown {
            selector: "#week".to_string(),
            hostnames: sort_hash_domains(vec!["xn--lloworl-5ggb3f.com"]),
            is_unicode: true,
            is_id_selector: true,
            key: Some("week".to_string()),
            unhide: true,
            ..Default::default()
        }
    );
}

#[test]
#[cfg(feature = "css-validation")]
fn unsupported() {
    assert!(parse_cf("yandex.*##.serp-item:if(:scope > div.organic div.organic__subtitle:matches-css-after(content: /[Рр]еклама/))").is_err());
    assert!(parse_cf(r#"facebook.com,facebookcorewwwi.onion##.ego_column:if(a[href^="/campaign/landing"])"#).is_err());
    assert!(parse_cf(r#"readcomiconline.to##^script:has-text(this[atob)"#).is_err());
    assert!(parse_cf("twitter.com##article:has-text(/Promoted|Gesponsert|Реклама|Promocionado/):xpath(../..)").is_err());
    assert!(parse_cf("##").is_err());
    assert!(parse_cf("").is_err());

    // `:has` was previously limited to procedural filtering, but is now a native CSS feature.
    assert!(parse_cf(r#"thedailywtf.com##.article-body > div:has(a[href*="utm_medium"])"#).is_ok());
}

#[test]
fn hidden_generic() {
    let rule = parse_cf("##.selector").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("test.com##.selector").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("test.*##.selector").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("test.com,~a.test.com##.selector").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("test.*,~a.test.com##.selector").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("test.*,~a.test.*##.selector").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("test.com#@#.selector").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("~test.com##.selector").unwrap();
    assert_eq!(
        CosmeticFilterBreakdown::from(rule.hidden_generic_rule().unwrap()),
        parse_cf("##.selector").unwrap().into(),
    );

    let rule = parse_cf("~test.*##.selector").unwrap();
    assert_eq!(
        CosmeticFilterBreakdown::from(rule.hidden_generic_rule().unwrap()),
        parse_cf("##.selector").unwrap().into(),
    );

    let rule = parse_cf("~test.*,~a.test.*##.selector").unwrap();
    assert_eq!(
        CosmeticFilterBreakdown::from(rule.hidden_generic_rule().unwrap()),
        parse_cf("##.selector").unwrap().into(),
    );

    let rule = parse_cf("test.com##.selector:style(border-radius: 13px)").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("test.*##.selector:style(border-radius: 13px)").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("~test.com##.selector:style(border-radius: 13px)").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("~test.*##.selector:style(border-radius: 13px)").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("test.com#@#.selector:style(border-radius: 13px)").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("test.com##+js(nowebrtc.js)").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("test.*##+js(nowebrtc.js)").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("~test.com##+js(nowebrtc.js)").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("~test.*##+js(nowebrtc.js)").unwrap();
    assert!(rule.hidden_generic_rule().is_none());

    let rule = parse_cf("test.com#@#+js(nowebrtc.js)").unwrap();
    assert!(rule.hidden_generic_rule().is_none());
}
}

#[cfg(test)]
mod util_tests {
    use super::*;
    use crate::utils::fast_hash;

    #[test]
    fn label_hashing() {
        assert_eq!(get_hashes_from_labels("foo.bar.baz", 11, 11), vec![fast_hash("baz"), fast_hash("bar.baz"), fast_hash("foo.bar.baz")]);
        assert_eq!(get_hashes_from_labels("foo.bar.baz.com", 15, 8), vec![fast_hash("baz.com"), fast_hash("bar.baz.com"), fast_hash("foo.bar.baz.com")]);
        assert_eq!(get_hashes_from_labels("foo.bar.baz.com", 11, 11), vec![fast_hash("baz"), fast_hash("bar.baz"), fast_hash("foo.bar.baz")]);
        assert_eq!(get_hashes_from_labels("foo.bar.baz.com", 11, 8), vec![fast_hash("baz"), fast_hash("bar.baz"), fast_hash("foo.bar.baz")]);
    }

    #[test]
    fn without_public_suffix() {
        assert_eq!(get_hostname_without_public_suffix("", ""), None);
        assert_eq!(get_hostname_without_public_suffix("com", ""), None);
        assert_eq!(get_hostname_without_public_suffix("com", "com"), None);
        assert_eq!(get_hostname_without_public_suffix("foo.com", "foo.com"), Some(("foo", "com")));
        assert_eq!(get_hostname_without_public_suffix("foo.bar.com", "bar.com"), Some(("foo.bar", "com")));
        assert_eq!(get_hostname_without_public_suffix("test.github.io", "test.github.io"), Some(("test", "github.io")));
    }
}

#[cfg(test)]
mod matching_tests {
    use super::*;
    use crate::utils::bin_lookup;

    trait MatchByStr {
        fn matches(&self, request_entities: &[Hash], request_hostnames: &[Hash]) -> bool;
        fn matches_str(&self, hostname: &str, domain: &str) -> bool;
    }

    impl MatchByStr for CosmeticFilter {
        /// `hostname` and `domain` should be specified as, e.g. "subdomain.domain.com" and
        /// "domain.com", respectively. This function will panic if the specified `domain` is
        /// longer than the specified `hostname`.
        fn matches_str(&self, hostname: &str, domain: &str) -> bool {
            debug_assert!(hostname.len() >= domain.len());

            let request_entities = get_entity_hashes_from_labels(hostname, domain);

            let request_hostnames = get_hostname_hashes_from_labels(hostname, domain);

            self.matches(&request_entities[..], &request_hostnames[..])
        }

        /// Check whether this rule applies to content from the hostname and domain corresponding to
        /// the provided hash lists.
        ///
        /// See the `matches_str` test function for an example of how to convert hostnames and
        /// domains into the appropriate hash lists.
fn matches(&self, request_entities: &[Hash], request_hostnames: &[Hash]) -> bool { let has_hostname_constraint = self.has_hostname_constraint(); if !has_hostname_constraint { return true; } if request_entities.is_empty() && request_hostnames.is_empty() && has_hostname_constraint { return false; } if let Some(ref filter_not_hostnames) = self.not_hostnames { if request_hostnames .iter() .any(|hash| bin_lookup(filter_not_hostnames, *hash)) { return false; } } if let Some(ref filter_not_entities) = self.not_entities { if request_entities .iter() .any(|hash| bin_lookup(filter_not_entities, *hash)) { return false; } } if self.hostnames.is_some() || self.entities.is_some() { if let Some(ref filter_hostnames) = self.hostnames { if request_hostnames .iter() .any(|hash| bin_lookup(filter_hostnames, *hash)) { return true; } } if let Some(ref filter_entities) = self.entities { if request_entities .iter() .any(|hash| bin_lookup(filter_entities, *hash)) { return true; } } return false; } true } } fn parse_cf(rule: &str) -> Result { CosmeticFilter::parse(rule, false, Default::default()) } #[test] fn generic_filter() { let rule = parse_cf("##.selector").unwrap(); assert!(rule.matches_str("foo.com", "foo.com")); } #[test] fn single_domain() { let rule = parse_cf("foo.com##.selector").unwrap(); assert!(rule.matches_str("foo.com", "foo.com")); assert!(!rule.matches_str("bar.com", "bar.com")); } #[test] fn multiple_domains() { let rule = parse_cf("foo.com,test.com##.selector").unwrap(); assert!(rule.matches_str("foo.com", "foo.com")); assert!(rule.matches_str("test.com", "test.com")); assert!(!rule.matches_str("bar.com", "bar.com")); } #[test] fn subdomain() { let rule = parse_cf("foo.com,test.com##.selector").unwrap(); assert!(rule.matches_str("sub.foo.com", "foo.com")); assert!(rule.matches_str("sub.test.com", "test.com")); let rule = parse_cf("foo.com,sub.test.com##.selector").unwrap(); assert!(rule.matches_str("sub.test.com", "test.com")); assert!(!rule.matches_str("test.com", 
/// Entity rules (`name.*`) must match the named label under any public suffix, including
/// subdomains, but not when the label only appears in front of an unrelated domain.
#[test]
fn entity() {
    // (rule, [(hostname, domain, expected match)])
    let expectations: [(&str, &[(&str, &str, bool)]); 2] = [
        (
            "foo.com,sub.test.*##.selector",
            &[
                ("foo.com", "foo.com", true),
                ("bar.foo.com", "foo.com", true),
                ("sub.test.com", "test.com", true),
                ("sub.test.fr", "test.fr", true),
                ("sub.test.evil.biz", "evil.biz", false),
            ],
        ),
        (
            "foo.*##.selector",
            &[
                ("foo.co.uk", "foo.co.uk", true),
                ("bar.foo.co.uk", "foo.co.uk", true),
                ("baz.bar.foo.co.uk", "foo.co.uk", true),
                ("foo.evil.biz", "evil.biz", false),
            ],
        ),
    ];

    for &(raw_rule, cases) in &expectations {
        let rule = parse_cf(raw_rule).unwrap();
        for &(hostname, domain, expected) in cases {
            assert_eq!(
                rule.matches_str(hostname, domain),
                expected,
                "{} against {}",
                raw_rule,
                hostname
            );
        }
    }
}
/// A rule with any hostname constraint (positive or negated, hostname or entity) must not
/// match when the request provides neither a hostname nor a domain.
#[test]
fn no_hostname_provided() {
    let constrained_rules = [
        "domain.com##.selector",
        "domain.*##.selector",
        "~domain.*##.selector",
        "~domain.com##.selector",
    ];
    for &raw in &constrained_rules {
        let rule = parse_cf(raw).unwrap();
        assert!(!rule.matches_str("", ""), "{}", raw);
    }
}
/// `#?#` rules using `:-abp-has(...)` must be rewritten to the native `:has(...)` selector,
/// while the same pseudo-class behind a plain `##` separator is rejected.
#[test]
#[cfg(feature = "css-validation")]
fn abp_has_conversion() {
    // (raw rule, selector expected after conversion)
    let conversions = [
        (
            "imgur.com#?#div.Gallery-Sidebar-PostContainer:-abp-has(div.promoted-hover)",
            "div.Gallery-Sidebar-PostContainer:has(div.promoted-hover)",
        ),
        (
            r##"webtools.fineaty.com#?#div[class*=" hidden-"]:-abp-has(.adsbygoogle)"##,
            r#"div[class*=" hidden-"]:has(.adsbygoogle)"#,
        ),
        (
            r##"facebook.com,facebookcorewwwi.onion#?#._6y8t:-abp-has(a[href="/ads/about/?entry_product=ad_preferences"])"##,
            r#"._6y8t:has(a[href="/ads/about/?entry_product=ad_preferences"])"#,
        ),
        (
            r##"mtgarena.pro#?##root > div > div:-abp-has(> .vm-placement)"##,
            r#"#root > div > div:has(> .vm-placement)"#,
        ),
    ];
    for &(raw, expected_selector) in &conversions {
        let rule = parse_cf(raw).unwrap();
        assert_eq!(rule.selector, expected_selector);
    }

    // Error without `#?#`:
    let without_procedural_marker =
        r##"mtgarena.pro###root > div > div:-abp-has(> .vm-placement)"##;
    assert!(parse_cf(without_procedural_marker).is_err());
}
bitflags::bitflags! {
    /// Bit set describing everything about a network filter that can be expressed as a flag:
    /// the request types it applies to, protocol and party restrictions, the kind of pattern
    /// it holds, and which special behaviors (exception, CSP, redirect, ...) it carries.
    ///
    /// The mask derives `Serialize`/`Deserialize`, so the numeric value of each flag is part
    /// of any serialized form of the mask.
    // NOTE: bit positions are not declared in numerical order, and every one of the 32 bits
    // (0 through 31) is assigned below - adding another flag requires widening the `u32`.
    #[derive(Serialize, Deserialize)]
    pub struct NetworkFilterMask: u32 {
        // Request content types (bits 0-10).
        const FROM_IMAGE = 1; // 1 << 0;
        const FROM_MEDIA = 1 << 1;
        const FROM_OBJECT = 1 << 2;
        const FROM_OTHER = 1 << 3;
        const FROM_PING = 1 << 4;
        const FROM_SCRIPT = 1 << 5;
        const FROM_STYLESHEET = 1 << 6;
        const FROM_SUBDOCUMENT = 1 << 7;
        const FROM_WEBSOCKET = 1 << 8; // e.g.: ws, wss
        const FROM_XMLHTTPREQUEST = 1 << 9;
        const FROM_FONT = 1 << 10;
        // Protocol restrictions.
        const FROM_HTTP = 1 << 11;
        const FROM_HTTPS = 1 << 12;
        // Option flags.
        const IS_IMPORTANT = 1 << 13;
        const MATCH_CASE = 1 << 14;
        const IS_REMOVEPARAM = 1 << 15;
        const THIRD_PARTY = 1 << 16;
        const FIRST_PARTY = 1 << 17;
        const IS_REDIRECT = 1 << 26;
        const BAD_FILTER = 1 << 27;
        const GENERIC_HIDE = 1 << 30;

        // Full document rules are not implied by negated types.
        const FROM_DOCUMENT = 1 << 29;

        // Kind of pattern
        const IS_REGEX = 1 << 18;
        const IS_LEFT_ANCHOR = 1 << 19;
        const IS_RIGHT_ANCHOR = 1 << 20;
        const IS_HOSTNAME_ANCHOR = 1 << 21;
        const IS_EXCEPTION = 1 << 22;
        const IS_CSP = 1 << 23;
        const IS_COMPLETE_REGEX = 1 << 24;
        const IS_HOSTNAME_REGEX = 1 << 28;

        // Specifies that a redirect rule should also create a corresponding block rule.
        // This is used to avoid returning two separate rules from `NetworkFilter::parse`.
        const ALSO_BLOCK_REDIRECT = 1 << 31;

        // "Other" network request types
        const UNMATCHED = 1 << 25;

        // Includes all request types that are implied by any negated types.
        // (This is exactly the union of bits 0-10; `FROM_DOCUMENT` is deliberately absent.)
        const FROM_NETWORK_TYPES = Self::FROM_FONT.bits |
            Self::FROM_IMAGE.bits |
            Self::FROM_MEDIA.bits |
            Self::FROM_OBJECT.bits |
            Self::FROM_OTHER.bits |
            Self::FROM_PING.bits |
            Self::FROM_SCRIPT.bits |
            Self::FROM_STYLESHEET.bits |
            Self::FROM_SUBDOCUMENT.bits |
            Self::FROM_WEBSOCKET.bits |
            Self::FROM_XMLHTTPREQUEST.bits;

        // Includes all remaining types, not implied by any negated types.
        // TODO Could also include popup, inline-font, inline-script
        const FROM_ALL_TYPES = Self::FROM_NETWORK_TYPES.bits |
            Self::FROM_DOCUMENT.bits;

        // Unless filter specifies otherwise, all these options are set by default
        const DEFAULT_OPTIONS = Self::FROM_NETWORK_TYPES.bits |
            Self::FROM_HTTP.bits |
            Self::FROM_HTTPS.bits |
            Self::THIRD_PARTY.bits |
            Self::FIRST_PARTY.bits;

        // Careful with checking for NONE - will always match
        // (`contains(NONE)` is true for every mask because the empty set is a subset of any set).
        const NONE = 0;
    }
}
is_match(&self, pattern: &str) -> bool { match &self { CompiledRegex::MatchAll => true, // simple case for matching everything, e.g. for empty filter CompiledRegex::RegexParsingError(_e) => false, // no match if regex didn't even compile CompiledRegex::Compiled(r) => r.is_match(pattern), CompiledRegex::CompiledSet(r) => { // let matches: Vec<_> = r.matches(pattern).into_iter().collect(); // println!("Matching {} against RegexSet: {:?}", pattern, matches); r.is_match(pattern) } } } } impl fmt::Display for CompiledRegex { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match &self { CompiledRegex::MatchAll => write!(f, ".*"), // simple case for matching everything, e.g. for empty filter CompiledRegex::RegexParsingError(_e) => write!(f, "ERROR"), // no match if regex didn't even compile CompiledRegex::Compiled(r) => write!(f, "{}", r.as_str()), CompiledRegex::CompiledSet(r) => write!(f, "{}", r.patterns().join(" | ")), } } } #[derive(Debug, Clone, Serialize, Deserialize)] pub enum FilterPart { Empty, Simple(String), AnyOf(Vec), } impl FilterPart { pub fn string_view(&self) -> Option { match &self { FilterPart::Empty => None, FilterPart::Simple(s) => Some(s.clone()), FilterPart::AnyOf(s) => Some(s.join("|")), } } } #[derive(Clone, Copy)] enum NetworkFilterLeftAnchor { /// A `||` token, which represents a match to the start of a domain or subdomain segment. DoublePipe, /// A `|` token, which represents a match to the exact start of the URL. SinglePipe, } #[derive(Clone, Copy)] enum NetworkFilterRightAnchor { /// A `|` token, which represents a match to the exact end of the URL. SinglePipe, } /// Pattern for a network filter, describing what URLs to match against. #[derive(Clone)] struct NetworkFilterPattern { left_anchor: Option, pattern: String, right_anchor: Option, } /// Any option that appears on the right side of a network filter as initiated by a `$` character. 
/// A single option from the `$`-initiated right-hand side of a network filter.
///
/// Variants carrying a `bool` hold `true` when the option was written on its own and `false`
/// when it was negated with a prepended `~`.
#[derive(Clone)]
enum NetworkFilterOption {
    Domain(Vec<(bool, String)>),
    Badfilter,
    Important,
    MatchCase,
    ThirdParty(bool),
    FirstParty(bool),
    Tag(String),
    Redirect(String),
    RedirectRule(String),
    Csp(Option<String>),
    Removeparam(String),
    Generichide,
    Document,
    Image(bool),
    Media(bool),
    Object(bool),
    Other(bool),
    Ping(bool),
    Script(bool),
    Stylesheet(bool),
    Subdocument(bool),
    XmlHttpRequest(bool),
    Websocket(bool),
    Font(bool),
}

impl NetworkFilterOption {
    /// Returns `true` for options that select a request content type (`Document` included).
    pub fn is_content_type(&self) -> bool {
        // Spelled out exhaustively so that a newly added variant has to be classified here.
        match self {
            Self::Document
            | Self::Image(_)
            | Self::Media(_)
            | Self::Object(_)
            | Self::Other(_)
            | Self::Ping(_)
            | Self::Script(_)
            | Self::Stylesheet(_)
            | Self::Subdocument(_)
            | Self::XmlHttpRequest(_)
            | Self::Websocket(_)
            | Self::Font(_) => true,
            Self::Domain(_)
            | Self::Badfilter
            | Self::Important
            | Self::MatchCase
            | Self::ThirdParty(_)
            | Self::FirstParty(_)
            | Self::Tag(_)
            | Self::Redirect(_)
            | Self::RedirectRule(_)
            | Self::Csp(_)
            | Self::Removeparam(_)
            | Self::Generichide => false,
        }
    }

    /// Returns `true` for the two redirect flavors, `Redirect` and `RedirectRule`.
    pub fn is_redirection(&self) -> bool {
        matches!(self, Self::Redirect(_) | Self::RedirectRule(_))
    }
}
struct AbstractNetworkFilter { exception: bool, pattern: NetworkFilterPattern, options: Option>, } impl AbstractNetworkFilter { fn parse(line: &str) -> Result { let mut filter_index_start: usize = 0; let mut filter_index_end: usize = line.len(); let mut exception = false; if line.starts_with("@@") { filter_index_start += 2; exception = true; } let maybe_options_index: Option = find_char_reverse(b'$', line.as_bytes()); let mut options = None; if let Some(options_index) = maybe_options_index { filter_index_end = options_index; // slicing here is safe; the first byte after '$' will be a character boundary let raw_options = &line[filter_index_end + 1..]; options = Some(parse_filter_options(raw_options)?); } let left_anchor = if line[filter_index_start..].starts_with("||") { filter_index_start += 2; Some(NetworkFilterLeftAnchor::DoublePipe) } else if line[filter_index_start..].starts_with('|') { filter_index_start += 1; Some(NetworkFilterLeftAnchor::SinglePipe) } else { None }; let right_anchor = if filter_index_end > 0 && filter_index_end > filter_index_start && line[..filter_index_end].ends_with('|') { filter_index_end -= 1; Some(NetworkFilterRightAnchor::SinglePipe) } else { None }; let pattern = &line[filter_index_start..filter_index_end]; Ok(AbstractNetworkFilter { exception, pattern: NetworkFilterPattern { left_anchor, pattern: pattern.to_string(), right_anchor, }, options, }) } } fn parse_filter_options(raw_options: &str) -> Result, NetworkFilterError> { let mut result = vec![]; for raw_option in raw_options.split(',') { // Check for negation: ~option let negation = raw_option.starts_with('~'); let maybe_negated_option = raw_option.trim_start_matches('~'); // Check for options: option=value1|value2 let mut option_and_values = maybe_negated_option.splitn(2, '='); let (option, value) = ( option_and_values.next().unwrap(), option_and_values.next().unwrap_or_default(), ); result.push(match (option, negation) { ("domain", _) | ("from", _) => { let domains: Vec<(bool, 
String)> = value.split('|').map(|domain| { if let Some(negated_domain) = domain.strip_prefix('~') { (false, negated_domain.to_string()) } else { (true, domain.to_string()) } }) .filter(|(_, d)| !(d.starts_with('/') && d.ends_with('/'))) .collect(); if domains.is_empty() { return Err(NetworkFilterError::NoSupportedDomains); } NetworkFilterOption::Domain(domains) } ("badfilter", true) => return Err(NetworkFilterError::NegatedBadFilter), ("badfilter", false) => NetworkFilterOption::Badfilter, ("important", true) => return Err(NetworkFilterError::NegatedImportant), ("important", false) => NetworkFilterOption::Important, ("match-case", true) => return Err(NetworkFilterError::NegatedOptionMatchCase), ("match-case", false) => NetworkFilterOption::MatchCase, ("third-party", negated) | ("3p", negated) => NetworkFilterOption::ThirdParty(!negated), ("first-party", negated) | ("1p", negated) => NetworkFilterOption::FirstParty(!negated), ("tag", true) => return Err(NetworkFilterError::NegatedTag), ("tag", false) => NetworkFilterOption::Tag(String::from(value)), ("redirect", true) => return Err(NetworkFilterError::NegatedRedirection), ("redirect", false) => { // Ignore this filter if no redirection resource is specified if value.is_empty() { return Err(NetworkFilterError::EmptyRedirection); } NetworkFilterOption::Redirect(String::from(value)) } ("redirect-rule", true) => return Err(NetworkFilterError::NegatedRedirection), ("redirect-rule", false) => { if value.is_empty() { return Err(NetworkFilterError::EmptyRedirection); } NetworkFilterOption::RedirectRule(String::from(value)) } ("csp", _) => NetworkFilterOption::Csp(if !value.is_empty() { Some(String::from(value)) } else { None }), ("removeparam", true) => return Err(NetworkFilterError::NegatedRemoveparam), ("removeparam", false) => { if value.is_empty() { return Err(NetworkFilterError::EmptyRemoveparam); } if !VALID_PARAM.is_match(value) { return Err(NetworkFilterError::RemoveparamRegexUnsupported); } 
NetworkFilterOption::Removeparam(String::from(value)) } ("generichide", true) | ("ghide", true) => return Err(NetworkFilterError::NegatedGenericHide), ("generichide", false) | ("ghide", false) => NetworkFilterOption::Generichide, ("document", true) | ("doc", true) => return Err(NetworkFilterError::NegatedDocument), ("document", false) | ("doc", false) => NetworkFilterOption::Document, ("image", negated) => NetworkFilterOption::Image(!negated), ("media", negated) => NetworkFilterOption::Media(!negated), ("object", negated) | ("object-subrequest", negated) => NetworkFilterOption::Object(!negated), ("other", negated) => NetworkFilterOption::Other(!negated), ("ping", negated) | ("beacon", negated) => NetworkFilterOption::Ping(!negated), ("script", negated) => NetworkFilterOption::Script(!negated), ("stylesheet", negated) | ("css", negated) => NetworkFilterOption::Stylesheet(!negated), ("subdocument", negated) | ("frame", negated) => NetworkFilterOption::Subdocument(!negated), ("xmlhttprequest", negated) | ("xhr", negated) => NetworkFilterOption::XmlHttpRequest(!negated), ("websocket", negated) => NetworkFilterOption::Websocket(!negated), ("font", negated) => NetworkFilterOption::Font(!negated), (_, _) => return Err(NetworkFilterError::UnrecognisedOption), }); } Ok(result) } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct NetworkFilter { pub mask: NetworkFilterMask, pub filter: FilterPart, pub opt_domains: Option>, pub opt_not_domains: Option>, /// Used for `$redirect`, `$redirect-rule`, `$csp`, and `$removeparam` - only one of which is /// supported per-rule. pub modifier_option: Option, pub hostname: Option, pub(crate) tag: Option, pub raw_line: Option>, pub id: Hash, // All domain option values (their hashes) OR'ed together to quickly dismiss mis-matches pub opt_domains_union: Option, pub opt_not_domains_union: Option, } // TODO - restrict the API so that this is always true - i.e. 
lazy-calculate IDs from actual data, // prevent field access, and don't load the ID from the serialized format. /// The ID of a filter is assumed to be correctly calculated for the purposes of this /// implementation. impl PartialEq for NetworkFilter { fn eq(&self, other: &Self) -> bool { self.id == other.id } } /// Filters are sorted by ID to preserve a stable ordering of data in the serialized format. impl PartialOrd for NetworkFilter { fn partial_cmp(&self, other: &Self) -> Option { self.id.partial_cmp(&other.id) } } /// Ensure that no invalid option combinations were provided for a filter. fn validate_options(options: &[NetworkFilterOption]) -> Result<(), NetworkFilterError> { let mut has_csp = false; let mut has_content_type = false; let mut modifier_options = 0; for option in options { if matches!(option, NetworkFilterOption::Csp(..)) { has_csp = true; modifier_options += 1; } else if option.is_content_type() { has_content_type = true; } else if option.is_redirection() || matches!(option, NetworkFilterOption::Removeparam(..)) { modifier_options += 1; } } if has_csp && has_content_type { return Err(NetworkFilterError::CspWithContentType); } if modifier_options > 1 { return Err(NetworkFilterError::MultipleModifierOptions); } Ok(()) } impl NetworkFilter { pub fn parse(line: &str, debug: bool, _opts: ParseOptions) -> Result { let parsed = AbstractNetworkFilter::parse(line)?; // Represent options as a bitmask let mut mask: NetworkFilterMask = NetworkFilterMask::THIRD_PARTY | NetworkFilterMask::FIRST_PARTY | NetworkFilterMask::FROM_HTTPS | NetworkFilterMask::FROM_HTTP; // Temporary masks for positive (e.g.: $script) and negative (e.g.: $~script) // content type options. 
let mut cpt_mask_positive: NetworkFilterMask = NetworkFilterMask::NONE; let mut cpt_mask_negative: NetworkFilterMask = NetworkFilterMask::NONE; let mut hostname: Option = None; let mut opt_domains: Option> = None; let mut opt_not_domains: Option> = None; let mut opt_domains_union: Option = None; let mut opt_not_domains_union: Option = None; let mut modifier_option: Option = None; let mut tag: Option = None; if parsed.exception { mask.set(NetworkFilterMask::IS_EXCEPTION, true); } if let Some(options) = parsed.options { validate_options(&options)?; macro_rules! apply_content_type { ($content_type:ident, $enabled:ident) => { if $enabled { cpt_mask_positive.set(NetworkFilterMask::$content_type, true); } else { cpt_mask_negative.set(NetworkFilterMask::$content_type, true); } }; } options.into_iter().for_each(|option| { match option { NetworkFilterOption::Domain(mut domains) => { // Some rules have duplicate domain options - avoid including duplicates // Benchmarking doesn't indicate signficant performance degradation across the entire easylist domains.sort_unstable(); domains.dedup(); let mut opt_domains_array: Vec = vec![]; let mut opt_not_domains_array: Vec = vec![]; for (enabled, domain) in domains { let domain_hash = utils::fast_hash(&domain); if !enabled { opt_not_domains_array.push(domain_hash); } else { opt_domains_array.push(domain_hash); } } if !opt_domains_array.is_empty() { opt_domains_array.sort_unstable(); opt_domains_union = Some(opt_domains_array.iter().fold(0, |acc, x| acc | x)); opt_domains = Some(opt_domains_array); } if !opt_not_domains_array.is_empty() { opt_not_domains_array.sort_unstable(); opt_not_domains_union = Some(opt_not_domains_array.iter().fold(0, |acc, x| acc | x)); opt_not_domains = Some(opt_not_domains_array); } } NetworkFilterOption::Badfilter => mask.set(NetworkFilterMask::BAD_FILTER, true), NetworkFilterOption::Important => mask.set(NetworkFilterMask::IS_IMPORTANT, true), NetworkFilterOption::MatchCase => 
mask.set(NetworkFilterMask::MATCH_CASE, true), NetworkFilterOption::ThirdParty(false) | NetworkFilterOption::FirstParty(true) => mask.set(NetworkFilterMask::THIRD_PARTY, false), NetworkFilterOption::ThirdParty(true) | NetworkFilterOption::FirstParty(false) => mask.set(NetworkFilterMask::FIRST_PARTY, false), NetworkFilterOption::Tag(value) => tag = Some(value), NetworkFilterOption::Redirect(value) => { mask.set(NetworkFilterMask::IS_REDIRECT, true); mask.set(NetworkFilterMask::ALSO_BLOCK_REDIRECT, true); modifier_option = Some(value); } NetworkFilterOption::RedirectRule(value) => { mask.set(NetworkFilterMask::IS_REDIRECT, true); modifier_option = Some(value); } NetworkFilterOption::Removeparam(value) => { mask.set(NetworkFilterMask::IS_REMOVEPARAM, true); modifier_option = Some(value); } NetworkFilterOption::Csp(value) => { mask.set(NetworkFilterMask::IS_CSP, true); // CSP rules can never have content types, and should always match against // subdocument and document rules. Rules do not match against document // requests by default, so this must be explictly added. 
mask.set(NetworkFilterMask::FROM_DOCUMENT, true); modifier_option = value; } NetworkFilterOption::Generichide => mask.set(NetworkFilterMask::GENERIC_HIDE, true), NetworkFilterOption::Document => cpt_mask_positive.set(NetworkFilterMask::FROM_DOCUMENT, true), NetworkFilterOption::Image(enabled) => apply_content_type!(FROM_IMAGE, enabled), NetworkFilterOption::Media(enabled) => apply_content_type!(FROM_MEDIA, enabled), NetworkFilterOption::Object(enabled) => apply_content_type!(FROM_OBJECT, enabled), NetworkFilterOption::Other(enabled) => apply_content_type!(FROM_OTHER, enabled), NetworkFilterOption::Ping(enabled) => apply_content_type!(FROM_PING, enabled), NetworkFilterOption::Script(enabled) => apply_content_type!(FROM_SCRIPT, enabled), NetworkFilterOption::Stylesheet(enabled) => apply_content_type!(FROM_STYLESHEET, enabled), NetworkFilterOption::Subdocument(enabled) => apply_content_type!(FROM_SUBDOCUMENT, enabled), NetworkFilterOption::XmlHttpRequest(enabled) => apply_content_type!(FROM_XMLHTTPREQUEST, enabled), NetworkFilterOption::Websocket(enabled) => apply_content_type!(FROM_WEBSOCKET, enabled), NetworkFilterOption::Font(enabled) => apply_content_type!(FROM_FONT, enabled), } }); } mask |= cpt_mask_positive; // If any negated "network" types were set, then implicitly enable all network types. // The negated types will be applied later. // // This doesn't apply to removeparam filters. if !mask.contains(NetworkFilterMask::IS_REMOVEPARAM) && (cpt_mask_negative & NetworkFilterMask::FROM_NETWORK_TYPES) != NetworkFilterMask::NONE { mask |= NetworkFilterMask::FROM_NETWORK_TYPES; } // If no positive types were set, then the filter should apply to all network types. if (cpt_mask_positive & NetworkFilterMask::FROM_ALL_TYPES).is_empty() { // Removeparam is again a special case. 
if mask.contains(NetworkFilterMask::IS_REMOVEPARAM) { mask |= NetworkFilterMask::FROM_DOCUMENT | NetworkFilterMask::FROM_SUBDOCUMENT | NetworkFilterMask::FROM_XMLHTTPREQUEST; } else { mask |= NetworkFilterMask::FROM_NETWORK_TYPES; } } match parsed.pattern.left_anchor { Some(NetworkFilterLeftAnchor::DoublePipe) => mask.set(NetworkFilterMask::IS_HOSTNAME_ANCHOR, true), Some(NetworkFilterLeftAnchor::SinglePipe) => mask.set(NetworkFilterMask::IS_LEFT_ANCHOR, true), None => (), } // TODO these need to actually be handled differently than trailing `^`. let mut end_url_anchor = false; if let Some(NetworkFilterRightAnchor::SinglePipe) = parsed.pattern.right_anchor { mask.set(NetworkFilterMask::IS_RIGHT_ANCHOR, true); end_url_anchor = true; } let pattern = &parsed.pattern.pattern; let is_regex = check_is_regex(pattern); mask.set(NetworkFilterMask::IS_REGEX, is_regex); if pattern.starts_with('/') && pattern.ends_with('/') && pattern.len() > 1 { #[cfg(feature = "full-regex-handling")] { mask.set(NetworkFilterMask::IS_COMPLETE_REGEX, true); } #[cfg(not(feature = "full-regex-handling"))] { return Err(NetworkFilterError::FullRegexUnsupported); } } else { if !(mask & NetworkFilterMask::MATCH_CASE).is_empty() { return Err(NetworkFilterError::MatchCaseWithoutFullRegex); } } let (mut filter_index_start, mut filter_index_end) = (0, pattern.len()); if let Some(NetworkFilterLeftAnchor::DoublePipe) = parsed.pattern.left_anchor { if is_regex { // Split at the first '/', '*' or '^' character to get the hostname // and then the pattern. // TODO - this could be made more efficient if we could match between two // indices. Once again, we have to do more work than is really needed. static SEPARATOR: Lazy = Lazy::new(|| Regex::new("[/^*]").unwrap()); if let Some(first_separator) = SEPARATOR.find(pattern) { let first_separator_start = first_separator.start(); // NOTE: `first_separator` shall never be -1 here since `IS_REGEX` is true. 
// This means there must be at least an occurrence of `*` or `^` // somewhere. // If the first separator is a wildcard, included in in hostname if first_separator_start < pattern.len() && pattern[first_separator_start..=first_separator_start].starts_with('*') { mask.set(NetworkFilterMask::IS_HOSTNAME_REGEX, true); } hostname = Some(String::from(&pattern[..first_separator_start])); filter_index_start = first_separator_start; // If the only symbol remaining for the selector is '^' then ignore it // but set the filter as right anchored since there should not be any // other label on the right if filter_index_end - filter_index_start == 1 && pattern[filter_index_start..].starts_with('^') { mask.set(NetworkFilterMask::IS_REGEX, false); filter_index_start = filter_index_end; mask.set(NetworkFilterMask::IS_RIGHT_ANCHOR, true); } else { mask.set(NetworkFilterMask::IS_LEFT_ANCHOR, true); mask.set( NetworkFilterMask::IS_REGEX, check_is_regex(&pattern[filter_index_start..filter_index_end]), ); } } } else { // Look for next / let slash_index = find_char(b'/', pattern.as_bytes()); slash_index .map(|i| { hostname = Some(String::from( &pattern[..i], )); filter_index_start += i; mask.set(NetworkFilterMask::IS_LEFT_ANCHOR, true); }) .or_else(|| { hostname = Some(String::from(pattern)); filter_index_start = filter_index_end; None }); } } // Remove trailing '*' if filter_index_end > filter_index_start && pattern.ends_with('*') { filter_index_end -= 1; } // Remove leading '*' if the filter is not hostname anchored. 
if filter_index_end > filter_index_start && pattern[filter_index_start..].starts_with('*') { mask.set(NetworkFilterMask::IS_LEFT_ANCHOR, false); filter_index_start += 1; } // Transform filters on protocol (http, https, ws) if mask.contains(NetworkFilterMask::IS_LEFT_ANCHOR) { if filter_index_end == filter_index_start + 5 && pattern[filter_index_start..].starts_with("ws://") { mask.set(NetworkFilterMask::FROM_WEBSOCKET, true); mask.set(NetworkFilterMask::FROM_HTTP, false); mask.set(NetworkFilterMask::FROM_HTTPS, false); mask.set(NetworkFilterMask::IS_LEFT_ANCHOR, false); filter_index_start = filter_index_end; } else if filter_index_end == filter_index_start + 7 && pattern[filter_index_start..].starts_with("http://") { mask.set(NetworkFilterMask::FROM_HTTP, true); mask.set(NetworkFilterMask::FROM_HTTPS, false); mask.set(NetworkFilterMask::IS_LEFT_ANCHOR, false); filter_index_start = filter_index_end; } else if filter_index_end == filter_index_start + 8 && pattern[filter_index_start..].starts_with("https://") { mask.set(NetworkFilterMask::FROM_HTTPS, true); mask.set(NetworkFilterMask::FROM_HTTP, false); mask.set(NetworkFilterMask::IS_LEFT_ANCHOR, false); filter_index_start = filter_index_end; } else if filter_index_end == filter_index_start + 8 && pattern[filter_index_start..].starts_with("http*://") { mask.set(NetworkFilterMask::FROM_HTTPS, true); mask.set(NetworkFilterMask::FROM_HTTP, true); mask.set(NetworkFilterMask::IS_LEFT_ANCHOR, false); filter_index_start = filter_index_end; } } let filter: Option = if filter_index_end > filter_index_start { let filter_str = &pattern[filter_index_start..filter_index_end]; mask.set( NetworkFilterMask::IS_REGEX, check_is_regex(filter_str), ); if mask.contains(NetworkFilterMask::MATCH_CASE) { Some(String::from(filter_str)) } else { Some(filter_str.to_ascii_lowercase()) } } else { None }; // TODO: ignore hostname anchor is not hostname provided let hostname_decoded = hostname.map(|host| { let hostname_normalised = if 
mask.contains(NetworkFilterMask::IS_HOSTNAME_ANCHOR) { host.trim_start_matches("www.") } else { &host }; let lowercase = hostname_normalised.to_lowercase(); let hostname = if lowercase.is_ascii() { lowercase } else { idna::domain_to_ascii(&lowercase).map_err(|_| NetworkFilterError::PunycodeError)? }; Ok(hostname) }).transpose(); if mask.contains(NetworkFilterMask::GENERIC_HIDE) && !parsed.exception { return Err(NetworkFilterError::GenericHideWithoutException); } if mask.contains(NetworkFilterMask::IS_REMOVEPARAM) && parsed.exception { return Err(NetworkFilterError::RemoveparamWithException); } // uBlock Origin would block main document `https://example.com` requests with all of the // following filters: // - ||example.com // - ||example.com/ // - example.com // - https://example.com // However, it relies on checking the URL post-match against information from the matched // filter, which isn't saved in Brave unless running with filter lists compiled in "debug" // mode. Instead, we apply the implicit document matching more strictly, only for hostname // filters of the form `||example.com^`. 
if (cpt_mask_positive & NetworkFilterMask::FROM_ALL_TYPES).is_empty() && (cpt_mask_negative & NetworkFilterMask::FROM_ALL_TYPES).is_empty() && mask.contains(NetworkFilterMask::IS_HOSTNAME_ANCHOR) && mask.contains(NetworkFilterMask::IS_RIGHT_ANCHOR) && !end_url_anchor && !mask.contains(NetworkFilterMask::IS_REMOVEPARAM) { mask |= NetworkFilterMask::FROM_ALL_TYPES; } // Finally, apply any explicitly negated request types mask &= !cpt_mask_negative; Ok(NetworkFilter { filter: if let Some(simple_filter) = filter { FilterPart::Simple(simple_filter) } else { FilterPart::Empty }, hostname: hostname_decoded?, mask, opt_domains, opt_not_domains, tag, raw_line: if debug { Some(Box::new(String::from(line))) } else { None }, modifier_option, id: utils::fast_hash(line), opt_domains_union, opt_not_domains_union, }) } /// Given a hostname, produces an equivalent filter parsed from the form `"||hostname^"`, to /// emulate the behavior of hosts-style blocking. pub fn parse_hosts_style(hostname: &str, debug: bool) -> Result { // Make sure the hostname doesn't contain any invalid characters static INVALID_CHARS: Lazy = Lazy::new(|| Regex::new("[/^*!?$&(){}\\[\\]+=~`\\s|@,'\"><:;]").unwrap()); if INVALID_CHARS.is_match(hostname) { return Err(NetworkFilterError::FilterParseError); } // This shouldn't be used to block an entire TLD, and the hostname shouldn't end with a dot if find_char(b'.', hostname.as_bytes()).is_none() || (hostname.starts_with('.') && find_char(b'.', hostname[1..].as_bytes()).is_none()) || hostname.ends_with('.') { return Err(NetworkFilterError::FilterParseError); } // Normalize the hostname to punycode and parse it as a `||hostname^` rule. 
let normalized_host = hostname.to_lowercase(); let normalized_host = normalized_host.trim_start_matches("www."); let mut hostname = "||".to_string(); if normalized_host.is_ascii() { hostname.push_str(normalized_host); } else { hostname.push_str(&idna::domain_to_ascii(normalized_host).map_err(|_| NetworkFilterError::PunycodeError)?); } hostname.push('^'); NetworkFilter::parse(&hostname, debug, Default::default()) } pub fn get_id_without_badfilter(&self) -> Hash { let mut mask = self.mask; mask.set(NetworkFilterMask::BAD_FILTER, false); compute_filter_id( self.modifier_option.as_deref(), mask, self.filter.string_view().as_deref(), self.hostname.as_deref(), self.opt_domains.as_ref(), self.opt_not_domains.as_ref(), ) } pub fn get_id(&self) -> Hash { compute_filter_id( self.modifier_option.as_deref(), self.mask, self.filter.string_view().as_deref(), self.hostname.as_deref(), self.opt_domains.as_ref(), self.opt_not_domains.as_ref(), ) } pub fn get_tokens(&self) -> Vec> { let mut tokens: Vec = Vec::with_capacity(TOKENS_BUFFER_SIZE); // If there is only one domain and no domain negation, we also use this // domain as a token. 
if self.opt_domains.is_some() && self.opt_not_domains.is_none() && self.opt_domains.as_ref().map(|d| d.len()) == Some(1) { if let Some(domains) = self.opt_domains.as_ref() { if let Some(domain) = domains.first() { tokens.push(*domain) } } } // Get tokens from filter match &self.filter { FilterPart::Simple(f) => { if !self.is_complete_regex() { let skip_last_token = (self.is_plain() || self.is_regex()) && !self.is_right_anchor(); let skip_first_token = self.is_right_anchor(); let mut filter_tokens = utils::tokenize_filter(f, skip_first_token, skip_last_token); tokens.append(&mut filter_tokens); } } FilterPart::AnyOf(_) => (), // across AnyOf set of filters no single token is guaranteed to match to a request _ => (), } // Append tokens from hostname, if any if !self.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX) { if let Some(hostname) = self.hostname.as_ref() { let mut hostname_tokens = utils::tokenize(hostname); tokens.append(&mut hostname_tokens); } } if tokens.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) { if let Some(removeparam) = &self.modifier_option { if VALID_PARAM.is_match(removeparam) { let mut param_tokens = utils::tokenize(&removeparam.to_ascii_lowercase()); tokens.append(&mut param_tokens); } } } // If we got no tokens for the filter/hostname part, then we will dispatch // this filter in multiple buckets based on the domains option. 
if tokens.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none() { self.opt_domains .as_ref() .unwrap_or(&vec![]) .iter() .map(|&d| vec![d]) .collect() } else { // Add optional token for protocol if self.for_http() && !self.for_https() { tokens.push(utils::fast_hash("http")); } else if self.for_https() && !self.for_http() { tokens.push(utils::fast_hash("https")); } tokens.shrink_to_fit(); vec![tokens] } } pub fn is_exception(&self) -> bool { self.mask.contains(NetworkFilterMask::IS_EXCEPTION) } pub fn is_hostname_anchor(&self) -> bool { self.mask.contains(NetworkFilterMask::IS_HOSTNAME_ANCHOR) } pub fn is_right_anchor(&self) -> bool { self.mask.contains(NetworkFilterMask::IS_RIGHT_ANCHOR) } pub fn is_left_anchor(&self) -> bool { self.mask.contains(NetworkFilterMask::IS_LEFT_ANCHOR) } fn match_case(&self) -> bool { self.mask.contains(NetworkFilterMask::MATCH_CASE) } pub fn is_important(&self) -> bool { self.mask.contains(NetworkFilterMask::IS_IMPORTANT) } pub fn is_redirect(&self) -> bool { self.mask.contains(NetworkFilterMask::IS_REDIRECT) } pub fn is_removeparam(&self) -> bool { self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) } pub fn also_block_redirect(&self) -> bool { self.mask.contains(NetworkFilterMask::ALSO_BLOCK_REDIRECT) } pub fn is_badfilter(&self) -> bool { self.mask.contains(NetworkFilterMask::BAD_FILTER) } pub fn is_generic_hide(&self) -> bool { self.mask.contains(NetworkFilterMask::GENERIC_HIDE) } pub fn is_regex(&self) -> bool { self.mask.contains(NetworkFilterMask::IS_REGEX) } pub fn is_complete_regex(&self) -> bool { self.mask.contains(NetworkFilterMask::IS_COMPLETE_REGEX) } fn is_plain(&self) -> bool { !self.is_regex() } pub fn is_csp(&self) -> bool { self.mask.contains(NetworkFilterMask::IS_CSP) } fn third_party(&self) -> bool { self.mask.contains(NetworkFilterMask::THIRD_PARTY) } fn first_party(&self) -> bool { self.mask.contains(NetworkFilterMask::FIRST_PARTY) } fn for_http(&self) -> bool { 
self.mask.contains(NetworkFilterMask::FROM_HTTP) } fn for_https(&self) -> bool { self.mask.contains(NetworkFilterMask::FROM_HTTPS) } } impl fmt::Display for NetworkFilter { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { match self.raw_line.as_ref() { Some(r) => write!(f, "{}", r.clone()), None => write!(f, "NetworkFilter"), } } } pub trait NetworkMatchable { fn matches(&self, request: &request::Request, regex_manager: &mut RegexManager) -> bool; #[cfg(test)] fn matches_test(&self, request: &request::Request) -> bool; } impl NetworkMatchable for NetworkFilter { fn matches(&self, request: &request::Request, regex_manager: &mut RegexManager) -> bool { check_options(self, request) && check_pattern(self, request, regex_manager) } #[cfg(test)] fn matches_test(&self, request: &request::Request) -> bool { self.matches(request, &mut RegexManager::default()) } } // --------------------------------------------------------------------------- // Filter parsing // --------------------------------------------------------------------------- fn compute_filter_id( modifier_option: Option<&str>, mask: NetworkFilterMask, filter: Option<&str>, hostname: Option<&str>, opt_domains: Option<&Vec>, opt_not_domains: Option<&Vec>, ) -> Hash { let mut hash: Hash = (5408 * 33) ^ Hash::from(mask.bits); if let Some(s) = modifier_option { let chars = s.chars(); for c in chars { hash = hash.wrapping_mul(33) ^ (c as Hash); } }; if let Some(domains) = opt_domains { for d in domains { hash = hash.wrapping_mul(33) ^ d; } }; if let Some(domains) = opt_not_domains { for d in domains { hash = hash.wrapping_mul(33) ^ d; } } if let Some(s) = filter { let chars = s.chars(); for c in chars { hash = hash.wrapping_mul(33) ^ (c as Hash); } } if let Some(s) = hostname { let chars = s.chars(); for c in chars { hash = hash.wrapping_mul(33) ^ (c as Hash); } } hash } /// Compiles a filter pattern to a regex. This is only performed *lazily* for /// filters containing at least a * or ^ symbol. 
Because Regexes are expansive, /// we try to convert some patterns to plain filters. #[allow(clippy::trivial_regex)] pub fn compile_regex( filter: &FilterPart, is_right_anchor: bool, is_left_anchor: bool, is_complete_regex: bool, ) -> CompiledRegex { // Escape special regex characters: |.$+?{}()[]\ static SPECIAL_RE: Lazy = Lazy::new(|| Regex::new(r"([\|\.\$\+\?\{\}\(\)\[\]])").unwrap()); // * can match anything static WILDCARD_RE: Lazy = Lazy::new(|| Regex::new(r"\*").unwrap()); // ^ can match any separator or the end of the pattern static ANCHOR_RE: Lazy = Lazy::new(|| Regex::new(r"\^(.)").unwrap()); // ^ can match any separator or the end of the pattern static ANCHOR_RE_EOL: Lazy = Lazy::new(|| Regex::new(r"\^$").unwrap()); let filters: Vec = match filter { FilterPart::Empty => vec![], FilterPart::Simple(s) => vec![s.clone()], FilterPart::AnyOf(f) => f.clone(), }; let mut escaped_patterns = Vec::with_capacity(filters.len()); for filter_str in filters { // If any filter is empty, the entire set matches anything if filter_str.is_empty() { return CompiledRegex::MatchAll; } if is_complete_regex { // unescape unrecognised escaping sequences, otherwise a normal regex let unescaped = filter_str[1..filter_str.len() - 1] .replace("\\/", "/") .replace("\\:", ":"); escaped_patterns.push(unescaped); } else { let repl = SPECIAL_RE.replace_all(&filter_str, "\\$1"); let repl = WILDCARD_RE.replace_all(&repl, ".*"); // in adblock rules, '^' is a separator. // The separator character is anything but a letter, a digit, or one of the following: _ - . 
% let repl = ANCHOR_RE.replace_all(&repl, "(?:[^\\w\\d\\._%-])$1"); let repl = ANCHOR_RE_EOL.replace_all(&repl, "(?:[^\\w\\d\\._%-]|$)"); // Should match start or end of url let left_anchor = if is_left_anchor { "^" } else { "" }; let right_anchor = if is_right_anchor { "$" } else { "" }; let filter = format!("{}{}{}", left_anchor, repl, right_anchor); escaped_patterns.push(filter); } } if escaped_patterns.is_empty() { CompiledRegex::MatchAll } else if escaped_patterns.len() == 1 { let pattern = &escaped_patterns[0]; match Regex::new(pattern) { Ok(compiled) => CompiledRegex::Compiled(compiled), Err(e) => { // println!("Regex parsing failed ({:?})", e); CompiledRegex::RegexParsingError(e) } } } else { match RegexSet::new(escaped_patterns) { Ok(compiled) => CompiledRegex::CompiledSet(compiled), Err(e) => CompiledRegex::RegexParsingError(e), } } } /// Check if the sub-string contained between the indices start and end is a /// regex filter (it contains a '*' or '^' char). Here we are limited by the /// capability of javascript to check the presence of a pattern between two /// indices (same for Regex...). fn check_is_regex(filter: &str) -> bool { // TODO - we could use sticky regex here let start_index = find_char(b'*', filter.as_bytes()); let separator_index = find_char(b'^', filter.as_bytes()); start_index.is_some() || separator_index.is_some() } /// Handle hostname anchored filters, given 'hostname' from ||hostname and /// request's hostname, check if there is a match. This is tricky because /// filters authors rely and different assumption. We can have prefix of suffix /// matches of anchor. 
fn is_anchored_by_hostname(filter_hostname: &str, hostname: &str, wildcard_filter_hostname: bool) -> bool { let filter_hostname_len = filter_hostname.len(); // Corner-case, if `filterHostname` is empty, then it's a match if filter_hostname_len == 0 { return true; } let hostname_len = hostname.len(); if filter_hostname_len > hostname_len { // `filterHostname` cannot be longer than actual hostname false } else if filter_hostname_len == hostname_len { // If they have the same len(), they should be equal filter_hostname == hostname } else if let Some(match_index) = memmem::find(hostname.as_bytes(), filter_hostname.as_bytes()) { if match_index == 0 { // `filter_hostname` is a prefix of `hostname` and needs to match full a label. // // Examples (filter_hostname, hostname): // * (foo, foo.com) // * (sub.foo, sub.foo.com) wildcard_filter_hostname || filter_hostname.ends_with('.') || hostname[filter_hostname_len..].starts_with('.') } else if match_index == hostname_len - filter_hostname_len { // `filter_hostname` is a suffix of `hostname`. // // Examples (filter_hostname, hostname): // * (foo.com, sub.foo.com) // * (com, foo.com) filter_hostname.starts_with('.') || hostname[match_index - 1..].starts_with('.') } else { // `filter_hostname` is infix of `hostname` and needs match full labels (wildcard_filter_hostname || filter_hostname.ends_with('.') || hostname[filter_hostname_len..].starts_with('.')) && (filter_hostname.starts_with('.') || hostname[match_index - 1..].starts_with('.')) } } else { // No match false } } fn get_url_after_hostname<'a>(url: &'a str, hostname: &str) -> &'a str { let start = memmem::find(url.as_bytes(), hostname.as_bytes()).unwrap_or(url.len() - hostname.len()); &url[start + hostname.len()..] 
} // --------------------------------------------------------------------------- // Filter matching // --------------------------------------------------------------------------- // pattern fn check_pattern_plain_filter_filter(filter: &NetworkFilter, request: &request::Request) -> bool { let request_url = request.get_url(filter.match_case()); match &filter.filter { FilterPart::Empty => true, FilterPart::Simple(f) => memmem::find(request_url.as_bytes(), f.as_bytes()).is_some(), FilterPart::AnyOf(filters) => { for f in filters { if memmem::find(request_url.as_bytes(), f.as_bytes()).is_some() { return true; } } false } } } // pattern| fn check_pattern_right_anchor_filter(filter: &NetworkFilter, request: &request::Request) -> bool { let request_url = request.get_url(filter.match_case()); match &filter.filter { FilterPart::Empty => true, FilterPart::Simple(f) => request_url.ends_with(f), FilterPart::AnyOf(filters) => { for f in filters { if request_url.ends_with(f) { return true; } } false } } } // |pattern fn check_pattern_left_anchor_filter(filter: &NetworkFilter, request: &request::Request) -> bool { let request_url = request.get_url(filter.match_case()); match &filter.filter { FilterPart::Empty => true, FilterPart::Simple(f) => request_url.starts_with(f), FilterPart::AnyOf(filters) => { for f in filters { if request_url.starts_with(f) { return true; } } false } } } // |pattern| fn check_pattern_left_right_anchor_filter( filter: &NetworkFilter, request: &request::Request, ) -> bool { let request_url = request.get_url(filter.match_case()); match &filter.filter { FilterPart::Empty => true, FilterPart::Simple(f) => &request_url == f, FilterPart::AnyOf(filters) => { for f in filters { if &request_url == f { return true; } } false } } } // pattern*^ fn check_pattern_regex_filter_at( filter: &NetworkFilter, request: &request::Request, start_from: usize, regex_manager: &mut RegexManager, ) -> bool { let request_url = request.get_url(filter.match_case()); 
regex_manager.matches(filter, &request_url[start_from..]) } fn check_pattern_regex_filter( filter: &NetworkFilter, request: &request::Request, regex_manager: &mut RegexManager, ) -> bool { check_pattern_regex_filter_at(filter, request, 0, regex_manager) } // ||pattern*^ fn check_pattern_hostname_anchor_regex_filter( filter: &NetworkFilter, request: &request::Request, regex_manager: &mut RegexManager, ) -> bool { let request_url = request.get_url(filter.match_case()); filter .hostname .as_ref() .map(|hostname| { if is_anchored_by_hostname(hostname, &request.hostname, filter.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX)) { check_pattern_regex_filter_at( filter, request, memmem::find(request_url.as_bytes(), hostname.as_bytes()).unwrap_or_default() + hostname.len(), regex_manager, ) } else { false } }) .unwrap_or_else(|| unreachable!()) // no match if filter has no hostname - should be unreachable } // ||pattern| fn check_pattern_hostname_right_anchor_filter( filter: &NetworkFilter, request: &request::Request, ) -> bool { filter .hostname .as_ref() .map(|hostname| { if is_anchored_by_hostname(hostname, &request.hostname, filter.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX)) { match &filter.filter { // In this specific case it means that the specified hostname should match // at the end of the hostname of the request. This allows to prevent false // positive like ||foo.bar which would match https://foo.bar.baz where // ||foo.bar^ would not. 
FilterPart::Empty => { request.hostname.len() == hostname.len() // if lengths are equal, hostname equality is implied by anchoring check || request.hostname.ends_with(hostname) } _ => check_pattern_right_anchor_filter(filter, request), } } else { false } }) .unwrap_or_else(|| unreachable!()) // no match if filter has no hostname - should be unreachable } // |||pattern| fn check_pattern_hostname_left_right_anchor_filter( filter: &NetworkFilter, request: &request::Request, ) -> bool { // Since this is not a regex, the filter pattern must follow the hostname // with nothing in between. So we extract the part of the URL following // after hostname and will perform the matching on it. filter .hostname .as_ref() .map(|hostname| { if is_anchored_by_hostname(hostname, &request.hostname, filter.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX)) { let request_url = request.get_url(filter.match_case()); match &filter.filter { // if no filter, we have a match FilterPart::Empty => true, // Since it must follow immediatly after the hostname and be a suffix of // the URL, we conclude that filter must be equal to the part of the // url following the hostname. FilterPart::Simple(f) => get_url_after_hostname(&request_url, hostname) == f, FilterPart::AnyOf(filters) => { let url_after_hostname = get_url_after_hostname(&request_url, hostname); for f in filters { if url_after_hostname == f { return true; } } false } } } else { false } }) .unwrap_or_else(|| unreachable!()) // no match if filter has no hostname - should be unreachable } // ||pattern + left-anchor => This means that a plain pattern needs to appear // exactly after the hostname, with nothing in between. 
fn check_pattern_hostname_left_anchor_filter( filter: &NetworkFilter, request: &request::Request, ) -> bool { filter .hostname .as_ref() .map(|hostname| { if is_anchored_by_hostname(hostname, &request.hostname, filter.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX)) { let request_url = request.get_url(filter.match_case()); match &filter.filter { // if no filter, we have a match FilterPart::Empty => true, // Since this is not a regex, the filter pattern must follow the hostname // with nothing in between. So we extract the part of the URL following // after hostname and will perform the matching on it. FilterPart::Simple(f) => get_url_after_hostname(&request_url, hostname).starts_with(f), FilterPart::AnyOf(filters) => { let url_after_hostname = get_url_after_hostname(&request_url, hostname); for f in filters { if url_after_hostname.starts_with(f) { return true; } } false } } } else { false } }) .unwrap_or_else(|| unreachable!()) // no match if filter has no hostname - should be unreachable } // ||pattern fn check_pattern_hostname_anchor_filter( filter: &NetworkFilter, request: &request::Request, ) -> bool { filter .hostname .as_ref() .map(|hostname| { if is_anchored_by_hostname(hostname, &request.hostname, filter.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX)) { let request_url = request.get_url(filter.match_case()); match &filter.filter { // if no filter, we have a match FilterPart::Empty => true, // Filter hostname does not necessarily have to be a full, proper hostname, part of it can be lumped together with the URL FilterPart::Simple(f) => get_url_after_hostname(&request_url, hostname) .contains(f), FilterPart::AnyOf(filters) => { let url_after_hostname = get_url_after_hostname(&request_url, hostname); for f in filters { if url_after_hostname.contains(f) { return true; } } false } } } else { false } }) .unwrap_or_else(|| unreachable!()) // no match if filter has no hostname - should be unreachable } /// Efficiently checks if a certain network filter 
matches against a network /// request. fn check_pattern( filter: &NetworkFilter, request: &request::Request, regex_manager: &mut RegexManager, ) -> bool { if filter.is_hostname_anchor() { if filter.is_regex() { check_pattern_hostname_anchor_regex_filter(filter, request, regex_manager) } else if filter.is_right_anchor() && filter.is_left_anchor() { check_pattern_hostname_left_right_anchor_filter(filter, request) } else if filter.is_right_anchor() { check_pattern_hostname_right_anchor_filter(filter, request) } else if filter.is_left_anchor() { check_pattern_hostname_left_anchor_filter(filter, request) } else { check_pattern_hostname_anchor_filter(filter, request) } } else if filter.is_regex() || filter.is_complete_regex() { check_pattern_regex_filter(filter, request, regex_manager) } else if filter.is_left_anchor() && filter.is_right_anchor() { check_pattern_left_right_anchor_filter(filter, request) } else if filter.is_left_anchor() { check_pattern_left_anchor_filter(filter, request) } else if filter.is_right_anchor() { check_pattern_right_anchor_filter(filter, request) } else { check_pattern_plain_filter_filter(filter, request) } } pub fn check_cpt_allowed(filter: &NetworkFilter, cpt: &request::RequestType) -> bool { match NetworkFilterMask::from(cpt) { // TODO this is not ideal, but required to allow regexed exception rules without an // explicit `$document` option to apply uBO-style. // See also: https://github.com/uBlockOrigin/uBlock-issues/issues/1501 NetworkFilterMask::FROM_DOCUMENT => filter.mask.contains(NetworkFilterMask::FROM_DOCUMENT) || filter.is_exception(), mask => filter.mask.contains(mask), } } fn check_options(filter: &NetworkFilter, request: &request::Request) -> bool { // Bad filter never matches if filter.is_badfilter() { return false; } // We first discard requests based on type, protocol and party. This is really // cheap and should be done first. 
if !check_cpt_allowed(filter, &request.request_type) || (request.is_https && !filter.for_https()) || (request.is_http && !filter.for_http()) || (!filter.first_party() && !request.is_third_party) || (!filter.third_party() && request.is_third_party) { return false; } // Source URL must be among these domains to match if let Some(included_domains) = filter.opt_domains.as_ref() { if let Some(source_hashes) = request.source_hostname_hashes.as_ref() { // If the union of included domains is recorded if let Some(included_domains_union) = filter.opt_domains_union { // If there isn't any source hash that matches the union, there's no match at all if source_hashes.iter().all(|h| h & included_domains_union != *h) { return false } } if source_hashes.iter().all(|h| !utils::bin_lookup(included_domains, *h)) { return false } } } if let Some(excluded_domains) = filter.opt_not_domains.as_ref() { if let Some(source_hashes) = request.source_hostname_hashes.as_ref() { // If the union of excluded domains is recorded if let Some(excluded_domains_union) = filter.opt_not_domains_union { // If there's any source hash that matches the union, check the actual values if source_hashes.iter().any(|h| (h & excluded_domains_union == *h) && utils::bin_lookup(excluded_domains, *h)) { return false } } else if source_hashes.iter().any(|h| utils::bin_lookup(excluded_domains, *h)) { return false } } } true } #[cfg(test)] mod parse_tests { use super::*; #[derive(Debug, PartialEq)] struct NetworkFilterBreakdown { filter: Option, hostname: Option, opt_domains: Option>, opt_not_domains: Option>, modifier_option: Option, // filter type is_exception: bool, is_hostname_anchor: bool, is_right_anchor: bool, is_left_anchor: bool, is_regex: bool, is_csp: bool, is_plain: bool, is_important: bool, // Options first_party: bool, from_network_types: bool, from_font: bool, from_image: bool, from_media: bool, from_object: bool, from_other: bool, from_ping: bool, from_script: bool, from_stylesheet: bool, from_subdocument: 
bool, from_websocket: bool, from_xml_http_request: bool, from_document: bool, match_case: bool, third_party: bool, } impl From<&NetworkFilter> for NetworkFilterBreakdown { fn from(filter: &NetworkFilter) -> NetworkFilterBreakdown { NetworkFilterBreakdown { filter: filter.filter.string_view(), hostname: filter.hostname.as_ref().cloned(), opt_domains: filter.opt_domains.as_ref().cloned(), opt_not_domains: filter.opt_not_domains.as_ref().cloned(), modifier_option: filter.modifier_option.as_ref().cloned(), // filter type is_exception: filter.is_exception(), is_hostname_anchor: filter.is_hostname_anchor(), is_right_anchor: filter.is_right_anchor(), is_left_anchor: filter.is_left_anchor(), is_regex: filter.is_regex(), is_csp: filter.is_csp(), is_plain: filter.is_plain(), is_important: filter.is_important(), // Options first_party: filter.first_party(), from_network_types: filter.mask.contains(NetworkFilterMask::FROM_NETWORK_TYPES), from_font: filter.mask.contains(NetworkFilterMask::FROM_FONT), from_image: filter.mask.contains(NetworkFilterMask::FROM_IMAGE), from_media: filter.mask.contains(NetworkFilterMask::FROM_MEDIA), from_object: filter.mask.contains(NetworkFilterMask::FROM_OBJECT), from_other: filter.mask.contains(NetworkFilterMask::FROM_OTHER), from_ping: filter.mask.contains(NetworkFilterMask::FROM_PING), from_script: filter.mask.contains(NetworkFilterMask::FROM_SCRIPT), from_stylesheet: filter.mask.contains(NetworkFilterMask::FROM_STYLESHEET), from_subdocument: filter.mask.contains(NetworkFilterMask::FROM_SUBDOCUMENT), from_websocket: filter.mask.contains(NetworkFilterMask::FROM_WEBSOCKET), from_xml_http_request: filter.mask.contains(NetworkFilterMask::FROM_XMLHTTPREQUEST), from_document: filter.mask.contains(NetworkFilterMask::FROM_DOCUMENT), match_case: filter.match_case(), third_party: filter.third_party(), } } } fn default_network_filter_breakdown() -> NetworkFilterBreakdown { NetworkFilterBreakdown { filter: None, hostname: None, opt_domains: None, 
opt_not_domains: None, modifier_option: None, // filter type is_exception: false, is_hostname_anchor: false, is_right_anchor: false, is_left_anchor: false, is_regex: false, is_csp: false, is_plain: false, is_important: false, // Options first_party: true, from_network_types: true, from_font: true, from_image: true, from_media: true, from_object: true, from_other: true, from_ping: true, from_script: true, from_stylesheet: true, from_subdocument: true, from_websocket: true, from_xml_http_request: true, from_document: false, match_case: false, third_party: true, } } #[test] // pattern fn parses_plain_pattern() { { let filter = NetworkFilter::parse("ads", true, Default::default()).unwrap(); let mut defaults = default_network_filter_breakdown(); defaults.filter = Some(String::from("ads")); defaults.is_plain = true; assert_eq!(defaults, NetworkFilterBreakdown::from(&filter)) } { let filter = NetworkFilter::parse("/ads/foo-", true, Default::default()).unwrap(); let mut defaults = default_network_filter_breakdown(); defaults.filter = Some(String::from("/ads/foo-")); defaults.is_plain = true; assert_eq!(defaults, NetworkFilterBreakdown::from(&filter)) } { let filter = NetworkFilter::parse("/ads/foo-$important", true, Default::default()).unwrap(); let mut defaults = default_network_filter_breakdown(); defaults.filter = Some(String::from("/ads/foo-")); defaults.is_plain = true; defaults.is_important = true; assert_eq!(defaults, NetworkFilterBreakdown::from(&filter)) } { let filter = NetworkFilter::parse("foo.com/ads$important", true, Default::default()).unwrap(); let mut defaults = default_network_filter_breakdown(); defaults.filter = Some(String::from("foo.com/ads")); defaults.is_plain = true; defaults.is_important = true; assert_eq!(defaults, NetworkFilterBreakdown::from(&filter)) } } #[test] // ||pattern fn parses_hostname_anchor_pattern() { { let filter = NetworkFilter::parse("||foo.com", true, Default::default()).unwrap(); let mut defaults = 
#[test]
// ||pattern|
fn parses_hostname_right_anchor_pattern() {
    // Parses `rule` and flattens the result for comparison.
    let breakdown_of = |rule: &str| {
        let parsed = NetworkFilter::parse(rule, true, Default::default()).unwrap();
        NetworkFilterBreakdown::from(&parsed)
    };
    // Expectations shared by every rule below: hostname `foo.com`, hostname-anchored and
    // right-anchored.
    let anchored = || NetworkFilterBreakdown {
        hostname: Some(String::from("foo.com")),
        is_hostname_anchor: true,
        is_right_anchor: true,
        ..default_network_filter_breakdown()
    };

    assert_eq!(
        NetworkFilterBreakdown {
            filter: None,
            is_plain: true,
            ..anchored()
        },
        breakdown_of("||foo.com|")
    );
    assert_eq!(
        NetworkFilterBreakdown {
            filter: None,
            is_plain: true,
            is_important: true,
            ..anchored()
        },
        breakdown_of("||foo.com|$important")
    );
    assert_eq!(
        NetworkFilterBreakdown {
            filter: Some(String::from("/bar/baz")),
            is_plain: true,
            is_important: true,
            is_left_anchor: true,
            ..anchored()
        },
        breakdown_of("||foo.com/bar/baz|$important")
    );
    // The last rule is expected to be a regex rather than a plain pattern, so `is_plain`
    // keeps its default of `false`.
    assert_eq!(
        NetworkFilterBreakdown {
            filter: Some(String::from("^bar/*baz")),
            is_important: true,
            is_left_anchor: true,
            is_regex: true,
            ..anchored()
        },
        breakdown_of("||foo.com^bar/*baz|$important")
    );
}
#[test]
// ||regexp
fn parses_hostname_anchor_regex_pattern() {
    // (raw rule, expected pattern, expected `is_left_anchor`)
    let cases = [
        ("||foo.com*bar^", "bar^", false),
        ("||foo.com^bar*/baz^", "^bar*/baz^", true),
    ];

    for &(rule, pattern, left_anchor) in &cases {
        let parsed = NetworkFilter::parse(rule, true, Default::default()).unwrap();
        let expected = NetworkFilterBreakdown {
            hostname: Some(String::from("foo.com")),
            filter: Some(String::from(pattern)),
            is_hostname_anchor: true,
            is_left_anchor: left_anchor,
            is_regex: true,
            is_plain: false,
            ..default_network_filter_breakdown()
        };
        assert_eq!(expected, NetworkFilterBreakdown::from(&parsed));
    }
}
#[test]
// |regexp|
fn parses_hostname_left_right_anchor_regex_pattern() {
    // (raw rule, expected pattern); neither rule is expected to yield a hostname.
    let cases = [
        ("|foo.com*bar^|", "foo.com*bar^"),
        ("|foo.com^bar*/baz^|", "foo.com^bar*/baz^"),
    ];

    for &(rule, pattern) in &cases {
        let parsed = NetworkFilter::parse(rule, true, Default::default()).unwrap();
        let expected = NetworkFilterBreakdown {
            hostname: None,
            filter: Some(String::from(pattern)),
            is_left_anchor: true,
            is_right_anchor: true,
            is_regex: true,
            is_plain: false,
            ..default_network_filter_breakdown()
        };
        assert_eq!(expected, NetworkFilterBreakdown::from(&parsed));
    }
}
#[test]
fn accepts_any_content_type() {
    // Parses `rule` and flattens the result for comparison.
    let breakdown_of = |rule: &str| {
        let parsed = NetworkFilter::parse(rule, true, Default::default()).unwrap();
        NetworkFilterBreakdown::from(&parsed)
    };
    // Baseline shared by every rule below: a plain, hostname-anchored `foo.com` filter that
    // still has `from_network_types` set, because no content-type option is given.
    let hostname_anchored = || NetworkFilterBreakdown {
        from_network_types: true,
        hostname: Some(String::from("foo.com")),
        is_hostname_anchor: true,
        is_plain: true,
        ..default_network_filter_breakdown()
    };

    assert_eq!(hostname_anchored(), breakdown_of("||foo.com"));
    assert_eq!(
        NetworkFilterBreakdown {
            first_party: true,
            third_party: false,
            ..hostname_anchored()
        },
        breakdown_of("||foo.com$first-party")
    );
    assert_eq!(
        NetworkFilterBreakdown {
            first_party: false,
            third_party: true,
            ..hostname_anchored()
        },
        breakdown_of("||foo.com$third-party")
    );
    // This case used to be asserted twice, byte-for-byte; one copy is enough.
    assert_eq!(
        NetworkFilterBreakdown {
            opt_domains: Some(vec![utils::fast_hash("test.com")]),
            ..hostname_anchored()
        },
        breakdown_of("||foo.com$domain=test.com")
    );
}
#[test]
fn parses_domain() {
    let parse = |rule: &str| NetworkFilter::parse(rule, true, Default::default());
    // Expected value of a domain list: `None` when no names are given, otherwise the sorted
    // hashes of the names.
    let expect = |names: &[&str]| -> Option<Vec<_>> {
        if names.is_empty() {
            return None;
        }
        let mut hashes: Vec<_> = names.iter().map(|name| utils::fast_hash(name)).collect();
        hashes.sort_unstable();
        Some(hashes)
    };

    // (raw rule, names expected in `opt_domains`, names expected in `opt_not_domains`)
    let cases: [(&str, &[&str], &[&str]); 9] = [
        // parses domain
        ("||foo.com$domain=bar.com", &["bar.com"], &[]),
        ("||foo.com$domain=bar.com|baz.com", &["bar.com", "baz.com"], &[]),
        // parses ~domain
        ("||foo.com$domain=~bar.com", &[], &["bar.com"]),
        ("||foo.com$domain=~bar.com|~baz.com", &[], &["bar.com", "baz.com"]),
        // parses domain and ~domain
        ("||foo.com$domain=~bar.com|baz.com", &["baz.com"], &["bar.com"]),
        ("||foo.com$domain=bar.com|~baz.com", &["bar.com"], &["baz.com"]),
        ("||foo.com$domain=foo|~bar|baz", &["foo", "baz"], &["bar"]),
        // defaults to no constraint
        ("||foo.com", &[], &[]),
        // `from` is an alias for `domain`
        ("||foo.com$from=bar.com", &["bar.com"], &[]),
    ];

    for &(rule, included, excluded) in cases.iter() {
        let parsed = parse(rule).unwrap();
        assert_eq!(parsed.opt_domains, expect(included));
        assert_eq!(parsed.opt_not_domains, expect(excluded));
    }

    // A `domain` option holding only a regex-style entry is expected to be rejected.
    let regex_domain = parse(
        r"||video.twimg.com/ext_tw_video/*/*.m3u8$domain=/^i[a-z]*\.strmrdr[a-z]+\..*/",
    );
    assert_eq!(regex_domain.err(), Some(NetworkFilterError::NoSupportedDomains));
}
#[test]
fn parses_removeparam() {
    let parse = |rule: &str| NetworkFilter::parse(rule, true, Default::default());

    // Each of these rules is expected to fail to parse.
    let rejected = [
        "||foo.com^$removeparam",
        "$~removeparam=test",
        "@@||foo.com^$removeparam=test",
        "||foo.com^$removeparam=",
        "||foo.com^$removeparam=test,redirect=test",
        "||foo.com^$removeparam=test,removeparam=test2",
        "||foo.com^$removeparam=𝐔𝐍𝐈𝐂𝐎𝐃𝐄🧋",
    ];
    for &rule in &rejected {
        assert!(parse(rule).is_err());
    }

    // A regex value is rejected with a dedicated error variant.
    assert_eq!(
        parse("||foo.com^$removeparam=/abc.*/"),
        Err(NetworkFilterError::RemoveparamRegexUnsupported)
    );

    // A simple value is accepted and stored as the modifier option.
    let accepted = parse("||foo.com^$removeparam=test").unwrap();
    assert!(accepted.is_removeparam());
    assert_eq!(accepted.modifier_option, Some("test".into()));
}
#[test]
fn parses_first_party() {
    // (raw rule, expected `first_party()`)
    let cases = [
        // parses first-party
        ("||foo.com$first-party", true),
        ("@@||foo.com$first-party", true),
        ("@@||foo.com|$first-party", true),
        // parses ~first-party
        ("||foo.com$~first-party", false),
        ("||foo.com$first-party,~first-party", false),
        // defaults to true
        ("||foo.com", true),
    ];

    for &(rule, expected) in &cases {
        let parsed = NetworkFilter::parse(rule, true, Default::default()).unwrap();
        assert_eq!(parsed.first_party(), expected);
    }
}
#[test]
fn parses_generic_hide() {
    let parse = |rule: &str| NetworkFilter::parse(rule, true, Default::default());

    // `generichide` without the `@@` exception prefix is expected to be rejected.
    assert!(parse("||foo.com$generichide").is_err());

    // With the prefix, the rule parses as a generic-hide exception.
    for &rule in &["@@||foo.com$generichide", "@@||foo.com|$generichide"] {
        let parsed = parse(rule).unwrap();
        assert_eq!(parsed.is_exception(), true);
        assert_eq!(parsed.is_generic_hide(), true);
    }

    // A `domain` option alongside `generichide` is kept.
    let with_domain = parse("@@$generichide,domain=example.com").unwrap();
    assert_eq!(with_domain.is_generic_hide(), true);
    let flattened = NetworkFilterBreakdown::from(&with_domain);
    assert_eq!(flattened.opt_domains, Some(vec![utils::fast_hash("example.com")]));

    // A rule without the option is not a generic-hide rule.
    let plain = parse("||foo.com").unwrap();
    assert_eq!(plain.is_generic_hide(), false);
}
#[test]
fn handles_content_type_options() {
    // Content-type options under test, including the `object-subrequest` and `xhr` spellings
    // that `set_option` maps onto the same flags as `object` and `xmlhttprequest`.
    const OPTIONS: [&str; 13] = [
        "font",
        "image",
        "media",
        "object",
        "object-subrequest",
        "other",
        "ping",
        "script",
        "stylesheet",
        "subdocument",
        "websocket",
        "xmlhttprequest",
        "xhr",
    ];

    /// Sets every per-content-type flag of `breakdown` to `value`.
    fn set_all_options(breakdown: &mut NetworkFilterBreakdown, value: bool) {
        breakdown.from_font = value;
        breakdown.from_image = value;
        breakdown.from_media = value;
        breakdown.from_object = value;
        breakdown.from_other = value;
        breakdown.from_ping = value;
        breakdown.from_script = value;
        breakdown.from_stylesheet = value;
        breakdown.from_subdocument = value;
        breakdown.from_websocket = value;
        breakdown.from_xml_http_request = value;
    }

    /// Sets the single flag that corresponds to `option` to `value`.
    fn set_option(option: &str, breakdown: &mut NetworkFilterBreakdown, value: bool) {
        match option {
            "font" => breakdown.from_font = value,
            "image" => breakdown.from_image = value,
            "media" => breakdown.from_media = value,
            "object" | "object-subrequest" => breakdown.from_object = value,
            "other" => breakdown.from_other = value,
            "ping" => breakdown.from_ping = value,
            "script" => breakdown.from_script = value,
            "stylesheet" => breakdown.from_stylesheet = value,
            "subdocument" => breakdown.from_subdocument = value,
            "websocket" => breakdown.from_websocket = value,
            "xmlhttprequest" | "xhr" => breakdown.from_xml_http_request = value,
            _ => unreachable!(),
        }
    }

    /// Parses `rule` and flattens the result for comparison.
    fn breakdown_of(rule: &str) -> NetworkFilterBreakdown {
        let parsed = NetworkFilter::parse(rule, true, Default::default()).unwrap();
        NetworkFilterBreakdown::from(&parsed)
    }

    /// Expected breakdown of a plain `||foo.com` rule, with `from_network_types` as given and
    /// every per-content-type flag set to `all_types`.
    fn expected(from_network_types: bool, all_types: bool) -> NetworkFilterBreakdown {
        let mut breakdown = NetworkFilterBreakdown {
            hostname: Some(String::from("foo.com")),
            is_hostname_anchor: true,
            is_plain: true,
            from_network_types,
            ..default_network_filter_breakdown()
        };
        set_all_options(&mut breakdown, all_types);
        breakdown
    }

    // default - positive
    // This case does not depend on any option, so it is checked once rather than on every
    // iteration of the loop below.
    assert_eq!(expected(true, true), breakdown_of("||foo.com"));

    for option in OPTIONS.iter().copied() {
        // positive: only the named type is enabled
        {
            let mut want = expected(false, false);
            set_option(option, &mut want, true);
            assert_eq!(want, breakdown_of(&format!("||foo.com${}", option)));
        }
        // positive: the named type combined with `object`
        {
            let mut want = expected(false, false);
            set_option(option, &mut want, true);
            set_option("object", &mut want, true);
            assert_eq!(want, breakdown_of(&format!("||foo.com$object,{}", option)));
        }
        // positive: the named type combined with a `domain` option
        {
            let mut want = expected(false, false);
            want.opt_domains = Some(vec![utils::fast_hash("bar.com")]);
            set_option(option, &mut want, true);
            assert_eq!(
                want,
                breakdown_of(&format!("||foo.com$domain=bar.com,{}", option))
            );
        }

        // negative: every type except the named one is enabled
        {
            let mut want = expected(false, true);
            set_option(option, &mut want, false);
            assert_eq!(want, breakdown_of(&format!("||foo.com$~{}", option)));
        }
        // negative: the option followed by its negation is expected to equal the negated form
        {
            let mut want = expected(false, true);
            set_option(option, &mut want, false);
            assert_eq!(
                want,
                breakdown_of(&format!("||foo.com${},~{}", option, option))
            );
        }
    }
}
#[test]
fn parse_empty_host_anchor_exception() {
    // A hostname anchor followed directly by the options must still parse.
    let outcome = NetworkFilter::parse("@@||$domain=auth.wi-fi.ru", true, Default::default());
    assert!(outcome.is_ok());
    let parsed = outcome.unwrap();

    // The hostname is expected to be present but empty, not absent.
    let expected = NetworkFilterBreakdown {
        hostname: Some(String::from("")),
        is_hostname_anchor: true,
        is_exception: true,
        is_plain: true,
        from_network_types: true,
        opt_domains: Some(vec![utils::fast_hash("auth.wi-fi.ru")]),
        ..default_network_filter_breakdown()
    };
    assert_eq!(expected, NetworkFilterBreakdown::from(&parsed));
}
assert_eq!(is_anchored_by_hostname("bar", "foo.com", false), false); // ## prefix match // matches exact match assert_eq!(is_anchored_by_hostname("", "", false), true); assert_eq!(is_anchored_by_hostname("f", "f", false), true); assert_eq!(is_anchored_by_hostname("foo", "foo", false), true); assert_eq!(is_anchored_by_hostname("foo.com", "foo.com", false), true); assert_eq!(is_anchored_by_hostname(".com", ".com", false), true); assert_eq!(is_anchored_by_hostname("com.", "com.", false), true); // matches partial // Single label assert_eq!(is_anchored_by_hostname("foo", "foo.com", false), true); assert_eq!(is_anchored_by_hostname("foo.", "foo.com", false), true); assert_eq!(is_anchored_by_hostname(".foo", ".foo.com", false), true); assert_eq!(is_anchored_by_hostname(".foo.", ".foo.com", false), true); // Multiple labels assert_eq!(is_anchored_by_hostname("foo.com", "foo.com.", false), true); assert_eq!(is_anchored_by_hostname("foo.com.", "foo.com.", false), true); assert_eq!(is_anchored_by_hostname(".foo.com.", ".foo.com.", false), true); assert_eq!(is_anchored_by_hostname(".foo.com", ".foo.com", false), true); assert_eq!(is_anchored_by_hostname("foo.bar", "foo.bar.com", false), true); assert_eq!(is_anchored_by_hostname("foo.bar.", "foo.bar.com", false), true); // does not match partial prefix // Single label assert_eq!(is_anchored_by_hostname("foo", "foobar.com", false), false); assert_eq!(is_anchored_by_hostname("fo", "foo.com", false), false); assert_eq!(is_anchored_by_hostname(".foo", "foobar.com", false), false); // Multiple labels assert_eq!(is_anchored_by_hostname("foo.bar", "foo.barbaz.com", false), false); assert_eq!( is_anchored_by_hostname(".foo.bar", ".foo.barbaz.com", false), false ); // ## suffix match // matches partial // Single label assert_eq!(is_anchored_by_hostname("com", "foo.com", false), true); assert_eq!(is_anchored_by_hostname(".com", "foo.com", false), true); assert_eq!(is_anchored_by_hostname(".com.", "foo.com.", false), true); 
assert_eq!(is_anchored_by_hostname("com.", "foo.com.", false), true); // Multiple labels assert_eq!(is_anchored_by_hostname("foo.com.", ".foo.com.", false), true); assert_eq!(is_anchored_by_hostname("foo.com", ".foo.com", false), true); // does not match partial // Single label assert_eq!(is_anchored_by_hostname("om", "foo.com", false), false); assert_eq!(is_anchored_by_hostname("com", "foocom", false), false); // Multiple labels assert_eq!(is_anchored_by_hostname("foo.bar.com", "baz.bar.com", false), false); assert_eq!(is_anchored_by_hostname("fo.bar.com", "foo.bar.com", false), false); assert_eq!(is_anchored_by_hostname(".fo.bar.com", "foo.bar.com", false), false); assert_eq!(is_anchored_by_hostname("bar.com", "foobar.com", false), false); assert_eq!(is_anchored_by_hostname(".bar.com", "foobar.com", false), false); // ## infix match // matches partial assert_eq!(is_anchored_by_hostname("bar", "foo.bar.com", false), true); assert_eq!(is_anchored_by_hostname("bar.", "foo.bar.com", false), true); assert_eq!(is_anchored_by_hostname(".bar.", "foo.bar.com", false), true); } fn filter_match_url(filter: &str, url: &str, matching: bool) { let network_filter = NetworkFilter::parse(filter, true, Default::default()).unwrap(); let request = request::Request::new(url, "https://example.com", "other").unwrap(); assert!( network_filter.matches_test(&request) == matching, "Expected match={} for {} {:?} on {}", matching, filter, network_filter, url ); } fn hosts_filter_match_url(filter: &str, url: &str, matching: bool) { let network_filter = NetworkFilter::parse_hosts_style(filter, true).unwrap(); let request = request::Request::new(url, "https://example.com", "other").unwrap(); assert!( network_filter.matches_test(&request) == matching, "Expected match={} for {} {:?} on {}", matching, filter, network_filter, url ); } #[test] // pattern fn check_pattern_plain_filter_filter_works() { filter_match_url("foo", "https://bar.com/foo", true); filter_match_url("foo", 
"https://bar.com/baz/foo", true); filter_match_url("foo", "https://bar.com/q=foo/baz", true); filter_match_url("foo", "https://foo.com", true); filter_match_url("-foo-", "https://bar.com/baz/42-foo-q", true); filter_match_url("&fo.o=+_-", "https://bar.com?baz=42&fo.o=+_-", true); filter_match_url("foo/bar/baz", "https://bar.com/foo/bar/baz", true); filter_match_url("com/bar/baz", "https://bar.com/bar/baz", true); filter_match_url("https://bar.com/bar/baz", "https://bar.com/bar/baz", true); } #[test] // ||pattern fn check_pattern_hostname_anchor_filter_works() { filter_match_url("||foo.com", "https://foo.com/bar", true); filter_match_url("||foo.com/bar", "https://foo.com/bar", true); filter_match_url("||foo", "https://foo.com/bar", true); filter_match_url("||foo", "https://baz.foo.com/bar", true); filter_match_url("||foo", "https://foo.baz.com/bar", true); filter_match_url("||foo.baz", "https://foo.baz.com/bar", true); filter_match_url("||foo.baz.", "https://foo.baz.com/bar", true); filter_match_url("||foo.baz.com^", "https://foo.baz.com/bar", true); filter_match_url("||foo.baz^", "https://foo.baz.com/bar", false); filter_match_url("||foo", "https://baz.com", false); filter_match_url("||foo", "https://foo-bar.baz.com/bar", false); filter_match_url("||foo.com", "https://foo.de", false); filter_match_url("||foo.com", "https://bar.foo.de", false); filter_match_url("||s.foo.com", "https://substring.s.foo.com", true); filter_match_url("||s.foo.com", "https://substrings.foo.com", false); } #[test] fn check_hosts_style_works() { hosts_filter_match_url("foo.com", "https://foo.com/bar", true); hosts_filter_match_url("foo.foo.com", "https://foo.com/bar", false); hosts_filter_match_url("www.foo.com", "https://foo.com/bar", true); hosts_filter_match_url("com.foo", "https://foo.baz.com/bar", false); hosts_filter_match_url("foo.baz", "https://foo.baz.com/bar", false); hosts_filter_match_url("foo.baz.com", "https://foo.baz.com/bar", true); hosts_filter_match_url("foo.baz", 
"https://foo.baz.com/bar", false); hosts_filter_match_url("foo.com", "https://baz.com", false); hosts_filter_match_url("bar.baz.com", "https://foo-bar.baz.com/bar", false); hosts_filter_match_url("foo.com", "https://foo.de", false); hosts_filter_match_url("foo.com", "https://bar.foo.de", false); } #[test] // ||pattern| fn check_pattern_hostname_right_anchor_filter_works() { filter_match_url("||foo.com|", "https://foo.com", true); filter_match_url("||foo.com/bar|", "https://foo.com/bar", true); filter_match_url("||foo.com/bar|", "https://foo.com/bar/baz", false); filter_match_url("||foo.com/bar|", "https://foo.com/", false); filter_match_url("||bar.com/bar|", "https://foo.com/", false); } #[test] // pattern| fn check_pattern_right_anchor_filter_works() { filter_match_url("foo.com", "https://foo.com", true); filter_match_url("foo|", "https://bar.com/foo", true); filter_match_url("foo|", "https://bar.com/foo/", false); filter_match_url("foo|", "https://bar.com/foo/baz", false); } #[test] // |pattern fn check_pattern_left_anchor_filter_works() { filter_match_url("|http", "http://foo.com", true); filter_match_url("|http", "https://foo.com", true); filter_match_url("|https://", "https://foo.com", true); filter_match_url("https", "http://foo.com", false); } #[test] // |pattern| fn check_pattern_left_right_anchor_filter_works() { filter_match_url("|https://foo.com|", "https://foo.com", true); } #[test] // ||pattern + left-anchor fn check_pattern_hostname_left_anchor_filter_works() { filter_match_url("||foo.com^test", "https://foo.com/test", true); filter_match_url("||foo.com/test", "https://foo.com/test", true); filter_match_url("||foo.com^test", "https://foo.com/tes", false); filter_match_url("||foo.com/test", "https://foo.com/tes", false); filter_match_url("||foo.com^", "https://foo.com/test", true); filter_match_url("||foo.com/test*bar", "https://foo.com/testbar", true); filter_match_url("||foo.com^test*bar", "https://foo.com/testbar", true); } #[test] // 
||hostname^*/pattern fn check_pattern_hostname_anchor_regex_filter_works() { filter_match_url("||foo.com^*/bar", "https://foo.com/bar", false); filter_match_url("||com^*/bar", "https://foo.com/bar", false); filter_match_url("||foo^*/bar", "https://foo.com/bar", false); // @see https://github.com/cliqz-oss/adblocker/issues/29 filter_match_url("||foo.co^aaa/", "https://bar.foo.com/bbb/aaa/", false); filter_match_url("||foo.com^aaa/", "https://bar.foo.com/bbb/aaa/", false); filter_match_url("||com*^bar", "https://foo.com/bar", true); filter_match_url("||foo.com^bar", "https://foo.com/bar", true); filter_match_url("||com^bar", "https://foo.com/bar", true); filter_match_url("||foo*^bar", "https://foo.com/bar", true); filter_match_url("||foo*/bar", "https://foo.com/bar", true); filter_match_url("||foo*com/bar", "https://foo.com/bar", true); filter_match_url("||foo2*com/bar", "https://foo2.com/bar", true); filter_match_url("||foo*com*/bar", "https://foo.com/bar", true); filter_match_url("||foo*com*^bar", "https://foo.com/bar", true); filter_match_url("||*foo*com*^bar", "https://foo.com/bar", true); filter_match_url("||*/bar", "https://foo.com/bar", true); filter_match_url("||*^bar", "https://foo.com/bar", true); filter_match_url("||*com/bar", "https://foo.com/bar", true); filter_match_url("||*.com/bar", "https://foo.com/bar", true); filter_match_url("||*foo.com/bar", "https://foo.com/bar", true); filter_match_url("||*com/bar", "https://foo.com/bar", true); filter_match_url("||*com*/bar", "https://foo.com/bar", true); filter_match_url("||*com*^bar", "https://foo.com/bar", true); } #[test] fn check_pattern_hostname_anchor_regex_filter_works_realisitic() { filter_match_url("||vimeo.com^*?type=", "https://vimeo.com/ablincoln/fatal_attraction?type=pageview&target=%2F193641463", true); } #[test] fn check_pattern_hostname_left_right_anchor_regex_filter_works() { filter_match_url("||geo*.hltv.org^", "https://geo2.hltv.org/rekl13.php", true); filter_match_url( 
"||www*.swatchseries.to^", "https://www1.swatchseries.to/sw.js", true, ); filter_match_url("||imp*.tradedoubler.com^", "https://impde.tradedoubler.com/imp?type(js)g(22608602)a(1725113)epi(30148500144427100033372010772028)preurl(https://pixel.mathtag.com/event/js?mt_id=1160537&mt_adid=166882&mt_exem=&mt_excl=&v1=&v2=&v3=&s1=&s2=&s3=&mt_nsync=1&redirect=https%3A%2F%2Fad28.ad-srv.net%2Fc%2Fczqwm6dm6kagr2j%3Ftprde%3D)768489806", true); } #[test] fn check_pattern_exception_works() { { let filter = "@@||fastly.net/ad2/$image,script,xmlhttprequest"; let url = "https://0914.global.ssl.fastly.net/ad2/script/x.js?cb=1549980040838"; let network_filter = NetworkFilter::parse(filter, true, Default::default()).unwrap(); let request = request::Request::new( url, "https://www.gamespot.com/metro-exodus/", "script", ) .unwrap(); assert!( network_filter.matches_test(&request) == true, "Expected match for {} on {}", filter, url ); } { let filter = "@@||swatchseries.to/public/js/edit-show.js$script,domain=swatchseries.to"; let url = "https://www1.swatchseries.to/public/js/edit-show.js"; let network_filter = NetworkFilter::parse(filter, true, Default::default()).unwrap(); let request = request::Request::new( url, "https://www1.swatchseries.to/serie/roswell_new_mexico", "script", ) .unwrap(); assert!( network_filter.matches_test(&request) == true, "Expected match for {} on {}", filter, url ); } } #[test] fn check_pattern_match_case() { filter_match_url(r#"/BannerAd[0-9]/$match-case"#, "https://example.com/BannerAd0.gif", true); filter_match_url(r#"/BannerAd[0-9]/$match-case"#, "https://example.com/bannerad0.gif", false); } #[test] fn check_ws_vs_http_matching() { let network_filter = NetworkFilter::parse("|ws://$domain=4shared.com", true, Default::default()).unwrap(); assert!(network_filter.matches_test(&request::Request::new("ws://example.com", "https://4shared.com", "websocket").unwrap())); assert!(network_filter.matches_test(&request::Request::new("wss://example.com", 
"https://4shared.com", "websocket").unwrap())); assert!(!network_filter.matches_test(&request::Request::new("http://example.com", "https://4shared.com", "script").unwrap())); assert!(!network_filter.matches_test(&request::Request::new("https://example.com", "https://4shared.com", "script").unwrap())); // The `ws://` and `wss://` protocols should be used, rather than the resource type. assert!(network_filter.matches_test(&request::Request::new("ws://example.com", "https://4shared.com", "script").unwrap())); assert!(network_filter.matches_test(&request::Request::new("wss://example.com", "https://4shared.com", "script").unwrap())); assert!(!network_filter.matches_test(&request::Request::new("http://example.com", "https://4shared.com", "websocket").unwrap())); assert!(!network_filter.matches_test(&request::Request::new("https://example.com", "https://4shared.com", "websocket").unwrap())); } #[test] // options fn check_options_works() { // cpt test { let network_filter = NetworkFilter::parse("||foo$image", true, Default::default()).unwrap(); let request = request::Request::new("https://foo.com/bar", "", "image").unwrap(); assert_eq!(check_options(&network_filter, &request), true); } { let network_filter = NetworkFilter::parse("||foo$image", true, Default::default()).unwrap(); let request = request::Request::new("https://foo.com/bar", "", "script").unwrap(); assert_eq!(check_options(&network_filter, &request), false); } { let network_filter = NetworkFilter::parse("||foo$~image", true, Default::default()).unwrap(); let request = request::Request::new("https://foo.com/bar", "", "script").unwrap(); assert_eq!(check_options(&network_filter, &request), true); } // ~third-party { let network_filter = NetworkFilter::parse("||foo$~third-party", true, Default::default()).unwrap(); let request = request::Request::new("https://foo.com/bar", "http://baz.foo.com", "") .unwrap(); assert_eq!(check_options(&network_filter, &request), true); } { let network_filter = 
NetworkFilter::parse("||foo$~third-party", true, Default::default()).unwrap(); let request = request::Request::new("https://foo.com/bar", "http://baz.bar.com", "") .unwrap(); assert_eq!(check_options(&network_filter, &request), false); } // ~first-party { let network_filter = NetworkFilter::parse("||foo$~first-party", true, Default::default()).unwrap(); let request = request::Request::new("https://foo.com/bar", "http://baz.bar.com", "") .unwrap(); assert_eq!(check_options(&network_filter, &request), true); } { let network_filter = NetworkFilter::parse("||foo$~first-party", true, Default::default()).unwrap(); let request = request::Request::new("https://foo.com/bar", "http://baz.foo.com", "") .unwrap(); assert_eq!(check_options(&network_filter, &request), false); } // opt-domain { let network_filter = NetworkFilter::parse("||foo$domain=foo.com", true, Default::default()).unwrap(); let request = request::Request::new("https://foo.com/bar", "http://foo.com", "").unwrap(); assert_eq!(check_options(&network_filter, &request), true); } { let network_filter = NetworkFilter::parse("||foo$domain=foo.com", true, Default::default()).unwrap(); let request = request::Request::new("https://foo.com/bar", "http://bar.com", "").unwrap(); assert_eq!(check_options(&network_filter, &request), false); } // opt-not-domain { let network_filter = NetworkFilter::parse("||foo$domain=~bar.com", true, Default::default()).unwrap(); let request = request::Request::new("https://foo.com/bar", "http://foo.com", "").unwrap(); assert_eq!(check_options(&network_filter, &request), true); } { let network_filter = NetworkFilter::parse("||foo$domain=~bar.com", true, Default::default()).unwrap(); let request = request::Request::new("https://foo.com/bar", "http://bar.com", "").unwrap(); assert_eq!(check_options(&network_filter, &request), false); } } #[test] fn check_domain_option_subsetting_works() { { let network_filter = NetworkFilter::parse("adv$domain=example.com|~foo.example.com", true, 
Default::default()).unwrap(); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://example.com", "").unwrap()) == true); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://foo.example.com", "").unwrap()) == false); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://subfoo.foo.example.com", "").unwrap()) == false); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://bar.example.com", "").unwrap()) == true); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://anotherexample.com", "").unwrap()) == false); } { let network_filter = NetworkFilter::parse("adv$domain=~example.com|~foo.example.com", true, Default::default()).unwrap(); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://example.com", "").unwrap()) == false); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://foo.example.com", "").unwrap()) == false); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://subfoo.foo.example.com", "").unwrap()) == false); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://bar.example.com", "").unwrap()) == false); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://anotherexample.com", "").unwrap()) == true); } { let network_filter = NetworkFilter::parse("adv$domain=example.com|foo.example.com", true, Default::default()).unwrap(); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://example.com", "").unwrap()) == true); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://foo.example.com", "").unwrap()) == true); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", 
"http://subfoo.foo.example.com", "").unwrap()) == true); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://bar.example.com", "").unwrap()) == true); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://anotherexample.com", "").unwrap()) == false); } { let network_filter = NetworkFilter::parse("adv$domain=~example.com|foo.example.com", true, Default::default()).unwrap(); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://example.com", "").unwrap()) == false); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://foo.example.com", "").unwrap()) == false); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://subfoo.foo.example.com", "").unwrap()) == false); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://bar.example.com", "").unwrap()) == false); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://anotherexample.com", "").unwrap()) == false); } { let network_filter = NetworkFilter::parse("adv$domain=com|~foo.com", true, Default::default()).unwrap(); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://com", "").unwrap()) == true); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://foo.com", "").unwrap()) == false); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://subfoo.foo.com", "").unwrap()) == false); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://bar.com", "").unwrap()) == true); assert!(network_filter.matches_test(&request::Request::new("http://example.net/adv", "http://co.uk", "").unwrap()) == false); } } #[test] fn check_unicode_handled() { filter_match_url( "||firstrowsports.li/frame/", 
"https://firstrowsports.li/frame/bar", true, ); filter_match_url( "||fırstrowsports.eu/pu/", "https://fırstrowsports.eu/pu/foo", true, ); filter_match_url( "||fırstrowsports.eu/pu/", "https://xn--frstrowsports-39b.eu/pu/foo", true, ); filter_match_url("||atđhe.net/pu/", "https://atđhe.net/pu/foo", true); filter_match_url("||atđhe.net/pu/", "https://xn--athe-1ua.net/pu/foo", true); filter_match_url("foo", "https://example.com/Ѥ/foo", true); filter_match_url("Ѥ", "https://example.com/Ѥ/foo", true); } #[test] fn check_regex_escaping_handled() { // A few rules that are not correctly escaped for rust Regex { // regex escaping "\/" unrecognised let filter = r#"/^https?:\/\/.*(bitly|bit)\.(com|ly)\/.*/$domain=123movies.com|1337x.to"#; let network_filter = NetworkFilter::parse(filter, true, Default::default()).unwrap(); let url = "https://bit.ly/bar/"; let source = "http://123movies.com"; let request = request::Request::new(url, source, "").unwrap(); assert!( network_filter.matches_test(&request) == true, "Expected match for {} on {}", filter, url ); } { // regex escaping "\:" unrecognised let filter = r#"/\:\/\/data.*\.com\/[a-zA-Z0-9]{30,}/$third-party,xmlhttprequest"#; let network_filter = NetworkFilter::parse(filter, true, Default::default()).unwrap(); let url = "https://data.foo.com/9VjjrjU9Or2aqkb8PDiqTBnULPgeI48WmYEHkYer"; let source = "http://123movies.com"; let request = request::Request::new(url, source, "xmlhttprequest").unwrap(); assert!( network_filter.matches_test(&request) == true, "Expected match for {} on {}", filter, url ); } // { let filter = r#"/\.(accountant|bid|click|club|com|cricket|date|download|faith|link|loan|lol|men|online|party|racing|review|science|site|space|stream|top|trade|webcam|website|win|xyz|com)\/(([0-9]{2,9})(\.|\/)(css|\?)?)$/$script,stylesheet,third-party,xmlhttprequest"#; let network_filter = NetworkFilter::parse(filter, true, Default::default()).unwrap(); let url = "https://hello.club/123.css"; let source = "http://123movies.com"; 
let request = request::Request::new(url, source, "stylesheet").unwrap(); assert!( network_filter.matches_test(&request) == true, "Expected match for {} on {}", filter, url ); } } #[test] #[ignore] // Not going to handle lookaround regexes #[cfg(feature = "regex-debug-info")] fn check_lookaround_regex_handled() { { let filter = r#"/^https?:\/\/([0-9a-z\-]+\.)?(9anime|animeland|animenova|animeplus|animetoon|animewow|gamestorrent|goodanime|gogoanime|igg-games|kimcartoon|memecenter|readcomiconline|toonget|toonova|watchcartoononline)\.[a-z]{2,4}\/(?!([Ee]xternal|[Ii]mages|[Ss]cripts|[Uu]ploads|ac|ajax|assets|combined|content|cov|cover|(img\/bg)|(img\/icon)|inc|jwplayer|player|playlist-cat-rss|static|thumbs|wp-content|wp-includes)\/)(.*)/$image,other,script,~third-party,xmlhttprequest,domain=~animeland.hu"#; let network_filter = NetworkFilter::parse(filter, true, Default::default()).unwrap(); let url = "https://data.foo.com/9VjjrjU9Or2aqkb8PDiqTBnULPgeI48WmYEHkYer"; let source = "http://123movies.com"; let request = request::Request::new(url, source, "script").unwrap(); let mut regex_manager = RegexManager::default(); assert!(regex_manager.get_compiled_regex_count() == 0); assert!( network_filter.matches(&request, &mut regex_manager) == true, "Expected match for {} on {}", filter, url ); assert!(regex_manager.get_compiled_regex_count() == 1); } } #[test] fn check_empty_host_anchor_matches() { { let filter = "||$domain=auth.wi-fi.ru"; let network_filter = NetworkFilter::parse(filter, true, Default::default()).unwrap(); let url = "https://example.com/ad.js"; let source = "http://auth.wi-fi.ru"; let request = request::Request::new(url, source, "script").unwrap(); assert!( network_filter.matches_test(&request) == true, "Expected match for {} on {}", filter, url ); } { let filter = "@@||$domain=auth.wi-fi.ru"; let network_filter = NetworkFilter::parse(filter, true, Default::default()).unwrap(); let url = "https://example.com/ad.js"; let source = "http://auth.wi-fi.ru"; let 
request = request::Request::new(url, source, "script").unwrap(); assert!( network_filter.matches_test(&request) == true, "Expected match for {} on {}", filter, url ); } } #[test] fn check_url_path_regex_matches() { { let filter = "@@||www.google.com/aclk?*&adurl=$document,~third-party"; let network_filter = NetworkFilter::parse(filter, true, Default::default()).unwrap(); let url = "https://www.google.com/aclk?sa=l&ai=DChcSEwioqMfq5ovjAhVvte0KHXBYDKoYABAJGgJkZw&sig=AOD64_0IL5OYOIkZA7qWOBt0yRmKL4hKJw&ctype=5&q=&ved=0ahUKEwjQ88Hq5ovjAhXYiVwKHWAgB5gQww8IXg&adurl="; let source = "https://www.google.com/aclk?sa=l&ai=DChcSEwioqMfq5ovjAhVvte0KHXBYDKoYABAJGgJkZw&sig=AOD64_0IL5OYOIkZA7qWOBt0yRmKL4hKJw&ctype=5&q=&ved=0ahUKEwjQ88Hq5ovjAhXYiVwKHWAgB5gQww8IXg&adurl="; let request = request::Request::new(url, source, "document").unwrap(); assert!(!request.is_third_party); assert!( network_filter.matches_test(&request) == true, "Expected match for {} on {}", filter, url ); } { let filter = "@@||www.google.*/aclk?$first-party"; let network_filter = NetworkFilter::parse(filter, true, Default::default()).unwrap(); let url = "https://www.google.com/aclk?sa=l&ai=DChcSEwioqMfq5ovjAhVvte0KHXBYDKoYABAJGgJkZw&sig=AOD64_0IL5OYOIkZA7qWOBt0yRmKL4hKJw&ctype=5&q=&ved=0ahUKEwjQ88Hq5ovjAhXYiVwKHWAgB5gQww8IXg&adurl="; let source = "https://www.google.com/aclk?sa=l&ai=DChcSEwioqMfq5ovjAhVvte0KHXBYDKoYABAJGgJkZw&sig=AOD64_0IL5OYOIkZA7qWOBt0yRmKL4hKJw&ctype=5&q=&ved=0ahUKEwjQ88Hq5ovjAhXYiVwKHWAgB5gQww8IXg&adurl="; let request = request::Request::new(url, source, "main_frame").unwrap(); assert!(!request.is_third_party); assert!( network_filter.matches_test(&request) == true, "Expected match for {} on {}", filter, url ); } } #[test] fn check_get_url_after_hostname_handles_bad_input() { // The function requires the hostname to necessarily be there in the URL, // but should fail gracefully if that is not the case. 
// Graceful failure here is returning an empty string for the rest of the URL assert_eq!(get_url_after_hostname("https://www.google.com/ad", "google.com"), "/ad"); assert_eq!(get_url_after_hostname("https://www.google.com/?aclksa=l&ai=DChcSEwioqMfq5", "google.com"), "/?aclksa=l&ai=DChcSEwioqMfq5"); assert_eq!(get_url_after_hostname("https://www.google.com/?aclksa=l&ai=DChcSEwioqMfq5", "www.google.com"), "/?aclksa=l&ai=DChcSEwioqMfq5"); assert_eq!(get_url_after_hostname("https://www.youtube.com/?aclksa=l&ai=DChcSEwioqMfq5", "google.com"), ""); } } #[cfg(test)] mod hash_collision_tests { use super::*; use crate::test_utils; use crate::lists::parse_filters; use std::collections::HashMap; #[test] fn check_rule_ids_no_collisions() { let rules = test_utils::rules_from_lists([ "data/easylist.to/easylist/easylist.txt", "data/easylist.to/easylist/easyprivacy.txt", ]); let (network_filters, _) = parse_filters(rules, true, Default::default()); let mut filter_ids: HashMap = HashMap::new(); for filter in network_filters { let id = filter.get_id(); let rule = *filter.raw_line.unwrap_or_default(); let existing_rule = filter_ids.get(&id); assert!(existing_rule.is_none() || existing_rule.unwrap() == &rule, "ID {} for {} already present from {}", id, rule, existing_rule.unwrap()); filter_ids.insert(id, rule); } } } adblock-0.8.12/src/lib.rs000064400000000000000000000027471046102023000132300ustar 00000000000000//! `adblock-rust` is the engine powering Brave's native adblocker, available as a library for //! anyone to use. It features: //! //! - Network blocking //! - Cosmetic filtering //! - Resource replacements //! - Hosts syntax //! - uBlock Origin syntax extensions //! - iOS content-blocking syntax conversion //! - Compiling to native code or WASM //! - Rust bindings ([crates](https://crates.io/crates/adblock)) //! - JS bindings ([npm](https://npmjs.com/adblock-rs)) //! - Community-maintained Python bindings ([pypi](https://pypi.org/project/adblock/)) //! - High performance! //! 
//! Check the [`Engine`] documentation to get started with adblocking. // Own modules, currently everything is exposed, will need to limit pub mod blocker; #[cfg(feature = "content-blocking")] pub mod content_blocking; pub mod cosmetic_filter_cache; mod data_format; mod engine; pub mod filters; pub mod lists; mod optimizer; pub mod regex_manager; pub mod request; pub mod resources; pub mod url_parser; #[doc(hidden)] pub mod utils; #[doc(inline)] pub use engine::Engine; #[doc(inline)] pub use lists::FilterSet; #[cfg(test)] #[path = "../tests/test_utils.rs"] mod test_utils; #[cfg(test)] mod sync_tests { #[allow(unused)] fn static_assert_sync() { let _ = core::marker::PhantomData::::default(); } #[test] #[cfg(not(any(feature = "object-pooling", feature = "unsync-regex-caching")))] fn assert_engine_sync() { static_assert_sync::(); } } adblock-0.8.12/src/lists.rs000064400000000000000000001013331046102023000136070ustar 00000000000000//! Parsing functions and collections for handling with multiple filter rules. use std::convert::TryFrom; use crate::filters::network::{NetworkFilter, NetworkFilterError}; use crate::filters::cosmetic::{CosmeticFilter, CosmeticFilterError}; use crate::resources::PermissionMask; use itertools::{Either, Itertools}; use memchr::memchr as find_char; use serde::{Deserialize, Serialize}; use thiserror::Error; /// Specifies rule types to keep during parsing. #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub enum RuleTypes { All, NetworkOnly, CosmeticOnly, } impl Default for RuleTypes { fn default() -> Self { Self::All } } impl RuleTypes { pub fn loads_network_rules(&self) -> bool { matches!(self, Self::All | Self::NetworkOnly) } pub fn loads_cosmetic_rules(&self) -> bool { matches!(self, Self::All | Self::CosmeticOnly) } } /// Options for tweaking how a filter or list of filters is interpreted when parsing. 
It's /// recommended to use _struct update syntax_ with a `default()` "rest" value; adding new fields to /// this struct will not be considered a breaking change. /// /// ``` /// # use adblock::lists::{FilterFormat, ParseOptions}; /// let parse_options = ParseOptions { /// format: FilterFormat::Hosts, /// ..ParseOptions::default() /// }; /// ``` #[derive(Copy, Clone, Deserialize)] pub struct ParseOptions { /// Assume filters are in the given format when parsing. Defaults to `FilterFormat::Standard`. #[serde(default)] pub format: FilterFormat, /// Specifies rule types to keep during parsing. Defaults to `RuleTypes::All`. This can be used /// to reduce the memory impact of engines that will only be used for cosmetic filtering or /// network filtering, but not both. It can also be useful for iOS and macOS when exporting to /// content-blocking syntax, as these platforms limit the number of content blocking rules that /// can be loaded. #[serde(default)] pub rule_types: RuleTypes, /// Specifies permissions to use when parsing a given filter list. See [`PermissionMask`] for /// more info. #[serde(default)] pub permissions: PermissionMask, } impl Default for ParseOptions { fn default() -> Self { ParseOptions { format: FilterFormat::Standard, rule_types: RuleTypes::All, permissions: PermissionMask::default(), } } } /// Manages a set of rules to be added to an [`crate::Engine`]. /// /// To be able to efficiently handle special options like `$badfilter`, and to allow optimizations, /// all rules must be available when the `Engine` is first created. `FilterSet` allows assembling a /// compound list from multiple different sources before compiling the rules into an `Engine`. #[derive(Clone)] pub struct FilterSet { debug: bool, pub(crate) network_filters: Vec, pub(crate) cosmetic_filters: Vec, } /// Collects metadata for the list by reading just until the first non-comment line. 
pub fn read_list_metadata(list: &str) -> FilterListMetadata { let mut metadata = FilterListMetadata::default(); // uBO only searches within the first 1024 characters; the same optimization can be useful here let mut cutoff = list.len().min(1024); while !list.is_char_boundary(cutoff) { cutoff -= 1; } // String slice is safe here because `cutoff` is guaranteed to be a character boundary for line in list[0..cutoff].lines() { if line.starts_with('!') { metadata.try_add(line); } else if line.starts_with('[') { continue; } else { break; } } metadata } impl Default for FilterSet { /// Equivalent to `FilterSet::new(false)`, or `FilterSet::new(true)` when compiled in test /// configuration. fn default() -> Self { #[cfg(not(test))] let debug = false; #[cfg(test)] let debug = true; Self::new(debug) } } /// Corresponds to the `expires` field of `FilterListMetadata`. #[derive(Debug, PartialEq, Serialize)] pub enum ExpiresInterval { Hours(u16), Days(u8), } impl TryFrom<&str> for ExpiresInterval { type Error = (); fn try_from(v: &str) -> Result { const DAYS_MAX: u8 = 14; const HOURS_MAX: u16 = DAYS_MAX as u16 * 24; // Extract time amount and unit from str let mut v_split = v.split(' '); let amount = v_split.next().ok_or(())?; let unit = v_split.next().ok_or(())?; // str::parse:: accepts a leading plus sign, but we explicitly forbid it here if amount.starts_with('+') { return Err(()); } // Only accept values in the range [1, MAX] for values with a matching unit match unit { "hour" | "hours" => { let amount = amount.parse::().map_err(|_| ())?; if (1..=HOURS_MAX).contains(&amount) { return Ok(Self::Hours(amount)); } }, "day" | "days" => { let amount = amount.parse::().map_err(|_| ())?; if (1..=DAYS_MAX).contains(&amount) { return Ok(Self::Days(amount)) } } _ => () } Err(()) } } /// Includes information about any "special comments" as described by /// #[derive(Default, Serialize)] pub struct FilterListMetadata { /// `! 
Homepage: http://example.com` - This comment determines which webpage should be linked /// as filter list homepage. pub homepage: Option, /// `! Title: FooList` - This comment sets a fixed title for the filter list. If this comment /// is present, the user is no longer able to change the title. pub title: Option, /// `! Expires: 5 days` - This comment sets the update interval for the filter list. The value /// can be given in days (e.g. 5 days) or hours (e.g. 8 hours). Any value between 1 hour and 14 /// days is possible. Note that the update will not necessarily happen after this time /// interval. The actual update time is slightly randomized and depends on some additional /// factors to reduce server load. pub expires: Option, /// `! Redirect: http://example.com/list.txt` - This comment indicates that the filter list has /// moved to a new download address. Adblock Plus ignores any file content beyond that comment /// and immediately tries downloading from the new address. In case of success, the address of /// the filter list is updated in the settings. This comment is ignored if the new address is /// the same as the current address, meaning that it can be used to enforce the "canonical" /// address of the filter list. pub redirect: Option, } impl FilterListMetadata { /// Attempts to add a line of a filter list to this collection of metadata. Only comment lines /// with valid metadata content will be added. Previously added information will not be /// rewritten. fn try_add(&mut self, line: &str) { if let Some(kv) = line.strip_prefix("! 
") { if let Some((key, value)) = kv.split_once(": ") { match key { "Homepage" if self.homepage.is_none() => self.homepage = Some(value.to_string()), "Title" if self.title.is_none() => self.title = Some(value.to_string()), "Expires" if self.expires.is_none() => { if let Ok(expires) = ExpiresInterval::try_from(value) { self.expires = Some(expires); } } "Redirect" if self.redirect.is_none() => self.redirect = Some(value.to_string()), _ => (), } } } } } impl FilterSet { /// Creates a new `FilterSet`. `debug` specifies whether or not to save information about the /// original raw filter rules alongside the more compact internal representation. If enabled, /// this information will be passed to the corresponding `Engine`. pub fn new(debug: bool) -> Self { Self { debug, network_filters: Vec::new(), cosmetic_filters: Vec::new(), } } /// Adds the contents of an entire filter list to this `FilterSet`. Filters that cannot be /// parsed successfully are ignored. Returns any discovered metadata about the list of rules /// added. pub fn add_filter_list(&mut self, filter_list: &str, opts: ParseOptions) -> FilterListMetadata { self.add_filters(filter_list.lines(), opts) } /// Adds a collection of filter rules to this `FilterSet`. Filters that cannot be parsed /// successfully are ignored. Returns any discovered metadata about the list of rules added. pub fn add_filters(&mut self, filters: impl IntoIterator>, opts: ParseOptions) -> FilterListMetadata { let (metadata, mut parsed_network_filters, mut parsed_cosmetic_filters) = parse_filters_with_metadata(filters, self.debug, opts); self.network_filters.append(&mut parsed_network_filters); self.cosmetic_filters.append(&mut parsed_cosmetic_filters); metadata } /// Adds the string representation of a single filter rule to this `FilterSet`. pub fn add_filter(&mut self, filter: &str, opts: ParseOptions) -> Result<(), FilterParseError> { let filter_parsed = parse_filter(filter, self.debug, opts); match filter_parsed? 
{ ParsedFilter::Network(filter) => self.network_filters.push(filter), ParsedFilter::Cosmetic(filter) => self.cosmetic_filters.push(filter), } Ok(()) } /// Consumes this `FilterSet`, returning an equivalent list of content blocking rules and a /// corresponding new list containing the `String` representation of all filters that were /// successfully converted (as `FilterFormat::Standard` rules). /// /// The list of content blocking rules will be properly ordered to ensure correct behavior of /// `ignore-previous-rules`-typed rules. /// /// This function will fail if the `FilterSet` was not created in debug mode. #[cfg(feature = "content-blocking")] pub fn into_content_blocking(self) -> Result<(Vec, Vec), ()> { use crate::content_blocking; if !self.debug { return Err(()) } let mut ignore_previous_rules = vec![]; let mut other_rules = vec![]; let mut filters_used = vec![]; self.network_filters.into_iter().for_each(|filter| { let original_rule = *filter.raw_line.clone().expect("All rules should be in debug mode"); if let Ok(equivalent) = TryInto::::try_into(filter) { filters_used.push(original_rule); equivalent.into_iter().for_each(|cb_rule| { match &cb_rule.action.typ { content_blocking::CbType::IgnorePreviousRules => ignore_previous_rules.push(cb_rule), _ => other_rules.push(cb_rule), } }); } }); let add_fp_document_exception = !filters_used.is_empty(); self.cosmetic_filters.into_iter().for_each(|filter| { let original_rule = *filter.raw_line.clone().expect("All rules should be in debug mode"); if let Ok(cb_rule) = TryInto::::try_into(filter) { filters_used.push(original_rule); match &cb_rule.action.typ { content_blocking::CbType::IgnorePreviousRules => ignore_previous_rules.push(cb_rule), _ => other_rules.push(cb_rule), } } }); other_rules.append(&mut ignore_previous_rules); if add_fp_document_exception { other_rules.push(content_blocking::ignore_previous_fp_documents()); } Ok((other_rules, filters_used)) } } /// Denotes the format of a particular list resource, 
which affects how its rules should be parsed. #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub enum FilterFormat { /// Rules should be parsed in ABP/uBO-style format. Standard, /// Each line consists of an IP address (usually 127.0.0.1 or 0.0.0.0), some whitespace, and a /// single hostname. This syntax is normally used directly for HOSTS-based adblockers. These /// rules will be treated equivalently to `"||hostname^"` rules in `Standard` format; the IP /// addresses will not be used. /// /// Note that some sources provide a more raw format, where each line consists of just a /// hostname. This option will also accept that format. /// /// For this option, `!` is accepted as a comment character at the beginning of a line, and `#` /// is accepted as a comment character anywhere in a line. Hosts, } /// Default to parsing lists in `Standard` format. impl Default for FilterFormat { fn default() -> Self { Self::Standard } } /// Describes the type of a single filter. #[derive(Debug, PartialEq)] pub enum FilterType { /// A network filter, used for changing the behavior of network requests Network, /// A network filter, used for changing the behavior of fetched pages Cosmetic, /// Something else that isn't supported NotSupported, } /// Successful result of parsing a single filter rule pub enum ParsedFilter { Network(NetworkFilter), Cosmetic(CosmeticFilter), } impl From for ParsedFilter { fn from(v: NetworkFilter) -> Self { ParsedFilter::Network(v) } } impl From for ParsedFilter { fn from(v: CosmeticFilter) -> Self { ParsedFilter::Cosmetic(v) } } /// Unsuccessful result of parsing a single filter rule. 
#[derive(Debug, Error)] pub enum FilterParseError { #[error("network filter error: {0}")] Network(#[source] NetworkFilterError), #[error("cosmetic filter error: {0}")] Cosmetic(#[source] CosmeticFilterError), #[error("unsupported")] Unsupported, #[error("empty")] Empty, } impl From for FilterParseError { fn from(v: NetworkFilterError) -> Self { FilterParseError::Network(v) } } impl From for FilterParseError { fn from(v: CosmeticFilterError) -> Self { FilterParseError::Cosmetic(v) } } /// Parse a single filter rule pub fn parse_filter( line: &str, debug: bool, opts: ParseOptions, ) -> Result { let filter = line.trim(); if filter.is_empty() { return Err(FilterParseError::Empty); } match opts.format { FilterFormat::Standard => { match (detect_filter_type(filter), opts.rule_types) { (FilterType::Network, RuleTypes::All | RuleTypes::NetworkOnly) => NetworkFilter::parse(filter, debug, opts) .map(|f| f.into()) .map_err(|e| e.into()), (FilterType::Cosmetic, RuleTypes::All | RuleTypes::CosmeticOnly) => CosmeticFilter::parse(filter, debug, opts.permissions) .map(|f| f.into()) .map_err(|e| e.into()), _ => Err(FilterParseError::Unsupported), } } FilterFormat::Hosts => { // Hosts-style rules can only ever be network rules if !opts.rule_types.loads_network_rules() { return Err(FilterParseError::Unsupported); } if filter.starts_with('!') { return Err(FilterParseError::Unsupported); } // Discard contents after first `#` character let filter = if let Some(hash_loc) = find_char(b'#', filter.as_bytes()) { let filter = &filter[..hash_loc]; let filter = filter.trim(); if filter.is_empty() { return Err(FilterParseError::Unsupported); } filter } else { filter }; // Take the last of at most 2 whitespace separated fields let mut filter_parts = filter.split_whitespace(); let hostname = match (filter_parts.next(), filter_parts.next(), filter_parts.next()) { (None, None, None) => return Err(FilterParseError::Unsupported), (Some(hostname), None, None) => hostname, (Some(_ip), Some(hostname), 
None) => hostname, (Some(_), Some(_), Some(_)) => return Err(FilterParseError::Unsupported), _ => unreachable!(), }; // Matches in hosts lists are usually redirected to localhost. For that reason, some // lists include an entry for "localhost", which should be explicitly ignored when // performing request-level adblocking. if hostname == "localhost" { return Err(FilterParseError::Unsupported); } NetworkFilter::parse_hosts_style(hostname, debug) .map(|f| f.into()) .map_err(|e| e.into()) } } } /// Parse an entire list of filters, ignoring any errors pub fn parse_filters( list: impl IntoIterator>, debug: bool, opts: ParseOptions, ) -> (Vec, Vec) { let (_metadata, network_filters, cosmetic_filters) = parse_filters_with_metadata( list, debug, opts, ); (network_filters, cosmetic_filters) } /// Parse an entire list of filters, ignoring any errors pub fn parse_filters_with_metadata( list: impl IntoIterator>, debug: bool, opts: ParseOptions, ) -> (FilterListMetadata, Vec, Vec) { let mut metadata = FilterListMetadata::default(); let list_iter = list.into_iter(); let (network_filters, cosmetic_filters): (Vec<_>, Vec<_>) = list_iter .map(|line| { metadata.try_add(line.as_ref()); parse_filter(line.as_ref(), debug, opts) }) .filter_map(Result::ok) .partition_map(|filter| match filter { ParsedFilter::Network(f) => Either::Left(f), ParsedFilter::Cosmetic(f) => Either::Right(f), }); (metadata, network_filters, cosmetic_filters) } /// Given a single line, checks if this would likely be a cosmetic filter, a /// network filter or something that is not supported. This check is performed /// before calling a more specific parser to create an instance of /// `NetworkFilter` or `CosmeticFilter`. 
fn detect_filter_type(filter: &str) -> FilterType { // Ignore comments if filter.len() == 1 || filter.starts_with('!') || (filter.starts_with('#') && filter[1..].starts_with(char::is_whitespace)) || filter.starts_with("[Adblock") { return FilterType::NotSupported; } if filter.starts_with('|') || filter.starts_with("@@|") { return FilterType::Network; } // Check if filter is cosmetic if let Some(sharp_index) = find_char(b'#', filter.as_bytes()) { let after_sharp_index = sharp_index + 1; // Check the next few bytes for a second `#` // Indexing is safe here because it uses the filter's byte // representation and guards against short strings if find_char(b'#', &filter.as_bytes()[after_sharp_index..(after_sharp_index+4).min(filter.len())]).is_some() { return FilterType::Cosmetic; } } // Ignore Adguard cosmetics if filter.contains("$$") { return FilterType::NotSupported; } // Everything else is a network filter FilterType::Network } #[cfg(test)] mod tests { use super::*; #[test] fn parse_hosts_style() { { let input = "www.malware.com"; let result = parse_filter(input, true, ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(result.is_ok()); } { let input = "www.malware.com/virus.txt"; let result = parse_filter(input, true, ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(result.is_err()); } { let input = "127.0.0.1 www.malware.com"; let result = parse_filter(input, true, ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(result.is_ok()); } { let input = "127.0.0.1\t\twww.malware.com"; let result = parse_filter(input, true, ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(result.is_ok()); } { let input = "0.0.0.0 www.malware.com"; let result = parse_filter(input, true, ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(result.is_ok()); } { let input = "0.0.0.0 www.malware.com # replace after issue #289336 is addressed"; let result = 
parse_filter(input, true, ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(result.is_ok()); } { let input = "! Title: list.txt"; let result = parse_filter(input, true, ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(result.is_err()); } { let input = "127.0.0.1 localhost"; let result = parse_filter(input, true, ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(result.is_err()); } { let input = "127.0.0.1 com"; let result = parse_filter(input, true, ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(result.is_err()); } { let input = ".com"; let result = parse_filter(input, true, ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(result.is_err()); } { let input = "*.com"; let result = parse_filter(input, true, ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(result.is_err()); } { let input = "www."; let result = parse_filter(input, true, ParseOptions { format: FilterFormat::Hosts, ..Default::default() }); assert!(result.is_err()); } } #[test] fn adguard_cosmetic_detection() { { let input = r#"example.org$$script[data-src="banner"]"#; let result = parse_filter(input, true, Default::default()); assert!(result.is_err()); } { let input = "example.org##+js(set-local-storage-item, Test, $$remove$$)"; let result = parse_filter(input, true, Default::default()); assert!(result.is_ok()); } { let input = "[$app=org.example.app]example.com##.textad"; let result = parse_filter(input, true, Default::default()); assert!(result.is_err()); } { let input = r#"[$domain=/^i\[a-z\]*\.strmrdr\[a-z\]+\..*/]##+js(set-constant, adscfg.enabled, false)"#; let result = parse_filter(input, true, Default::default()); assert!(result.is_err()); } } #[test] fn parse_filter_failed_fuzz_1() { let input = "Ѥ"; let result = parse_filter(input, true, Default::default()); assert!(result.is_ok()); } #[test] fn parse_filter_failed_fuzz_2() { 
assert!(parse_filter(r#"###\\\00DB \008D"#, true, Default::default()).is_ok()); assert!(parse_filter(r#"###\Û"#, true, Default::default()).is_ok()); } #[test] fn parse_filter_failed_fuzz_3() { let input = "||$3p=/"; let result = parse_filter(input, true, Default::default()); assert!(result.is_ok()); } #[test] fn parse_filter_failed_fuzz_4() { // \\##+js(,\xdd\x8d let parsed = parse_filter( &String::from_utf8(vec![92, 35, 35, 43, 106, 115, 40, 44, 221, 141]).unwrap(), true, Default::default(), ); #[cfg(feature = "css-validation")] assert!(parsed.is_err()); #[cfg(not(feature = "css-validation"))] assert!(parsed.is_ok()); } #[test] #[cfg(feature = "css-validation")] fn parse_filter_opening_comment() { assert!(parse_filter( "##input,input/*", true, Default::default(), ).is_err()); } #[test] fn test_parse_expires_interval() { assert_eq!(ExpiresInterval::try_from("0 hour"), Err(())); assert_eq!(ExpiresInterval::try_from("0 hours"), Err(())); assert_eq!(ExpiresInterval::try_from("1 hour"), Ok(ExpiresInterval::Hours(1))); assert_eq!(ExpiresInterval::try_from("1 hours"), Ok(ExpiresInterval::Hours(1))); assert_eq!(ExpiresInterval::try_from("2 hours"), Ok(ExpiresInterval::Hours(2))); assert_eq!(ExpiresInterval::try_from("2 hour"), Ok(ExpiresInterval::Hours(2))); assert_eq!(ExpiresInterval::try_from("3.5 hours"), Err(())); assert_eq!(ExpiresInterval::try_from("336 hours"), Ok(ExpiresInterval::Hours(336))); assert_eq!(ExpiresInterval::try_from("337 hours"), Err(())); assert_eq!(ExpiresInterval::try_from("0 day"), Err(())); assert_eq!(ExpiresInterval::try_from("0 days"), Err(())); assert_eq!(ExpiresInterval::try_from("1 day"), Ok(ExpiresInterval::Days(1))); assert_eq!(ExpiresInterval::try_from("1 days"), Ok(ExpiresInterval::Days(1))); assert_eq!(ExpiresInterval::try_from("2 days"), Ok(ExpiresInterval::Days(2))); assert_eq!(ExpiresInterval::try_from("2 day"), Ok(ExpiresInterval::Days(2))); assert_eq!(ExpiresInterval::try_from("3.5 days"), Err(())); 
assert_eq!(ExpiresInterval::try_from("14 days"), Ok(ExpiresInterval::Days(14))); assert_eq!(ExpiresInterval::try_from("15 days"), Err(())); assert_eq!(ExpiresInterval::try_from("-5 hours"), Err(())); assert_eq!(ExpiresInterval::try_from("+5 hours"), Err(())); assert_eq!(ExpiresInterval::try_from("2 days (update frequency)"), Ok(ExpiresInterval::Days(2))); assert_eq!(ExpiresInterval::try_from("2 hours (update frequency)"), Ok(ExpiresInterval::Hours(2))); } #[test] fn test_parsing_list_metadata() { let list = [ "[Adblock Plus 2.0]", "! Title: 0131 Block List", "! Homepage: https://austinhuang.me/0131-block-list", "! Licence: https://creativecommons.org/licenses/by-sa/4.0/", "! Expires: 7 days", "! Version: 20220411", "", "! => https://austinhuang.me/0131-block-list/list.txt", ]; let mut filter_set = FilterSet::new(false); let metadata = filter_set.add_filters(list, ParseOptions::default()); assert_eq!(metadata.title, Some("0131 Block List".to_string())); assert_eq!(metadata.homepage, Some("https://austinhuang.me/0131-block-list".to_string())); assert_eq!(metadata.expires, Some(ExpiresInterval::Days(7))); assert_eq!(metadata.redirect, None); } #[test] /// Some lists are formatted in unusual ways. This example has a version string with /// non-numeric characters and an `Expires` field with extra information trailing afterwards. /// Valid fields should still be recognized and parsed accordingly. fn test_parsing_list_best_effort() { let list = [ "[Adblock Plus 2]", "!-----------------------------------", "! ABOUT", "!-----------------------------------", "! Version: 1.2.0.0", "! Title: ABPVN Advanced", "! Last modified: 09/03/2021", "! Expires: 7 days (update frequency)", "! 
Homepage: https://www.haopro.net/", ]; let mut filter_set = FilterSet::new(false); let metadata = filter_set.add_filters(list, ParseOptions::default()); assert_eq!(metadata.title, Some("ABPVN Advanced".to_string())); assert_eq!(metadata.homepage, Some("https://www.haopro.net/".to_string())); assert_eq!(metadata.expires, Some(ExpiresInterval::Days(7))); assert_eq!(metadata.redirect, None); } #[test] fn test_read_metadata() { { let list = r##"! Title: uBlock₀ filters – Annoyances ! Description: Filters optimized for uBlock Origin, to be used with Fanboy's ! and/or Adguard's "Annoyances" list(s) ! Expires: 4 days ! Last modified: %timestamp% ! License: https://github.com/uBlockOrigin/uAssets/blob/master/LICENSE ! Homepage: https://github.com/uBlockOrigin/uAssets ! Forums: https://github.com/uBlockOrigin/uAssets/issues"##; let metadata = read_list_metadata(&list); assert_eq!(metadata.title, Some("uBlock₀ filters – Annoyances".to_string())); assert_eq!(metadata.homepage, Some("https://github.com/uBlockOrigin/uAssets".to_string())); assert_eq!(metadata.expires, Some(ExpiresInterval::Days(4))); assert_eq!(metadata.redirect, None); } { let list = r##"[uBlock Origin] ! Title: PersianBlocker ! Description: سرانجام، یک لیست بهینه و گسترده برای مسدودسازی تبلیغ ها و ردیاب ها در سایت های پارسی زبان! ! Expires: 2 days ! Last modified: 2022-12-11 ! Homepage: https://github.com/MasterKia/PersianBlocker ! License: AGPLv3 (https://github.com/MasterKia/PersianBlocker/blob/main/LICENSE) ! مشکل/پیشنهاد: https://github.com/MasterKia/PersianBlocker/issues ! مشارکت: https://github.com/MasterKia/PersianBlocker/pulls ! لیستی برای برگرداندن آزادی کاربران، چون هر کاربر این آزادی را دارد که چه چیزی وارد مرورگرش می‌شود و چه چیزی وارد نمی‌شود !-------------------------v Experimental Generic Filters v-----------------------! ! applicationha.com, androidgozar.com, downloadkral.com, gold-team.org, iranecar.com, icoff.ee, koolakmag.ir, !! 
mybia4music.com, my-film.pw, pedal.ir, vgdl.ir, sakhamusic.ir /wp-admin/admin-ajax.php?postviews_id=$xhr "##; let metadata = read_list_metadata(&list); assert_eq!(metadata.title, Some("PersianBlocker".to_string())); assert_eq!(metadata.homepage, Some("https://github.com/MasterKia/PersianBlocker".to_string())); assert_eq!(metadata.expires, Some(ExpiresInterval::Days(2))); assert_eq!(metadata.redirect, None); } } #[test] fn parse_cosmetic_variants() { { let input = "example.com##.selector"; let result = parse_filter(input, true, Default::default()); assert!(matches!(result, Ok(ParsedFilter::Cosmetic(..)))); } { let input = "9gag.com#?#article:-abp-has(.promoted)"; let result = parse_filter(input, true, Default::default()); assert!(matches!(result, Ok(ParsedFilter::Cosmetic(..)))); } #[cfg(feature = "css-validation")] { let input = "sportowefakty.wp.pl#@?#body > [class]:not([id]):matches-css(position: fixed):matches-css(top: 0px)"; let result = parse_filter(input, true, Default::default()); assert!(matches!(result, Err(FilterParseError::Cosmetic(CosmeticFilterError::InvalidCssSelector)))); } { let input = r#"odkrywamyzakryte.com#%#//scriptlet("abort-on-property-read", "sc_adv_out")"#; let result = parse_filter(input, true, Default::default()); assert!(matches!(result, Err(FilterParseError::Cosmetic(CosmeticFilterError::UnsupportedSyntax)))); } { let input = "bikeradar.com,spiegel.de#@%#!function(){function b(){}function a(a){return{get:function(){return a},set:b}}function c(a)"; let result = parse_filter(input, true, Default::default()); assert!(matches!(result, Err(FilterParseError::Cosmetic(CosmeticFilterError::UnsupportedSyntax)))); } { let input = "nczas.com#$#.adsbygoogle { position: absolute!important; left: -3000px!important; }"; let result = parse_filter(input, true, Default::default()); assert!(matches!(result, Err(FilterParseError::Cosmetic(CosmeticFilterError::UnsupportedSyntax)))); } { let input = "kurnik.pl#@$#.adsbygoogle { height: 1px !important; width: 
1px !important; }"; let result = parse_filter(input, true, Default::default()); assert!(matches!(result, Err(FilterParseError::Cosmetic(CosmeticFilterError::UnsupportedSyntax)))); } } } adblock-0.8.12/src/optimizer.rs000064400000000000000000000417051046102023000145010ustar 00000000000000use crate::filters::network::{FilterPart, NetworkFilter, NetworkFilterMask}; use itertools::*; use std::collections::HashMap; trait Optimization { fn fusion(&self, filters: &[NetworkFilter]) -> NetworkFilter; fn group_by_criteria(&self, filter: &NetworkFilter) -> String; fn select(&self, filter: &NetworkFilter) -> bool; } /// Fuse `NetworkFilter`s together by applying optimizations sequentially. pub fn optimize(filters: Vec) -> Vec { let mut optimized: Vec = Vec::new(); /* let union_domain_group = UnionDomainGroup {}; let (mut fused, unfused) = apply_optimisation(&union_domain_group, filters); optimized.append(&mut fused); */ let simple_pattern_group = SimplePatternGroup {}; let (mut fused, mut unfused) = apply_optimisation(&simple_pattern_group, filters); optimized.append(&mut fused); // Append whatever is still left unfused optimized.append(&mut unfused); // Re-sort the list, now that the order has been perturbed optimized.sort_by_key(|f| f.id); optimized } fn apply_optimisation( optimization: &T, filters: Vec, ) -> (Vec, Vec) { let (positive, mut negative): (Vec, Vec) = filters.into_iter().partition_map(|f| { if optimization.select(&f) { Either::Left(f) } else { Either::Right(f) } }); let mut to_fuse: HashMap> = HashMap::with_capacity(positive.len()); positive .into_iter() .for_each(|f| insert_dup(&mut to_fuse, optimization.group_by_criteria(&f), f)); let mut fused = Vec::with_capacity(to_fuse.len()); for (_, group) in to_fuse { if group.len() > 1 { // println!("Fusing {} filters together", group.len()); fused.push(optimization.fusion(group.as_slice())); } else { group.into_iter().for_each(|f| negative.push(f)); } } fused.shrink_to_fit(); (fused, negative) } fn insert_dup(map: 
&mut HashMap>, k: K, v: V) where K: std::cmp::Ord + std::hash::Hash, { map.entry(k).or_insert_with(Vec::new).push(v) } struct SimplePatternGroup {} impl Optimization for SimplePatternGroup { // Group simple patterns, into a single filter fn fusion(&self, filters: &[NetworkFilter]) -> NetworkFilter { let base_filter = &filters[0]; // FIXME: can technically panic, if filters list is empty let mut filter = base_filter.clone(); // if any filter is empty (meaning matches anything), the entire combiation matches anything if filters .iter() .any(|f| matches!(f.filter, FilterPart::Empty)) { filter.filter = FilterPart::Empty } else { let mut flat_patterns: Vec = Vec::with_capacity(filters.len()); for f in filters { match &f.filter { FilterPart::Empty => (), FilterPart::Simple(s) => flat_patterns.push(s.clone()), FilterPart::AnyOf(s) => flat_patterns.extend_from_slice(s), } } if flat_patterns.is_empty() { filter.filter = FilterPart::Empty; } else if flat_patterns.len() == 1 { filter.filter = FilterPart::Simple(flat_patterns[0].clone()) } else { filter.filter = FilterPart::AnyOf(flat_patterns) } } let is_regex = filters.iter().any(NetworkFilter::is_regex); filter.mask.set(NetworkFilterMask::IS_REGEX, is_regex); let is_complete_regex = filters.iter().any(|f| f.is_complete_regex()); filter .mask .set(NetworkFilterMask::IS_COMPLETE_REGEX, is_complete_regex); if base_filter.raw_line.is_some() { filter.raw_line = Some(Box::new( filters .iter() .flat_map(|f| f.raw_line.clone()) .join(" <+> "), )) } filter } fn group_by_criteria(&self, filter: &NetworkFilter) -> String { format!("{:b}:{:?}", filter.mask, filter.is_complete_regex()) } fn select(&self, filter: &NetworkFilter) -> bool { filter.opt_domains.is_none() && filter.opt_not_domains.is_none() && !filter.is_hostname_anchor() && !filter.is_redirect() && !filter.is_csp() } } /* struct UnionDomainGroup {} impl Optimization for UnionDomainGroup { fn fusion(&self, filters: &[NetworkFilter]) -> NetworkFilter { let base_filter = 
&filters[0]; // FIXME: can technically panic, if filters list is empty let mut filter = base_filter.clone(); let mut domains = HashSet::new(); let mut not_domains = HashSet::new(); filters.iter().for_each(|f| { if let Some(opt_domains) = f.opt_domains.as_ref() { for d in opt_domains { domains.insert(d); } } if let Some(opt_not_domains) = f.opt_not_domains.as_ref() { for d in opt_not_domains { not_domains.insert(d); } } }); if !domains.is_empty() { let mut domains = domains.into_iter().cloned().collect::>(); domains.sort_unstable(); let opt_domains_union = Some(domains.iter().fold(0, |acc, x| acc | x)); filter.opt_domains = Some(domains); filter.opt_domains_union = opt_domains_union; } if !not_domains.is_empty() { let mut domains = not_domains.into_iter().cloned().collect::>(); domains.sort_unstable(); let opt_not_domains_union = Some(domains.iter().fold(0, |acc, x| acc | x)); filter.opt_not_domains = Some(domains); filter.opt_not_domains_union = opt_not_domains_union; } if base_filter.raw_line.is_some() { filter.raw_line = Some(Box::new( filters .iter() .flat_map(|f| f.raw_line.clone()) .join(" <+> "), )) } filter } fn group_by_criteria(&self, filter: &NetworkFilter) -> String { format!( "{:?}:{}:{:b}:{:?}", filter.hostname.as_ref(), filter.filter.string_view().unwrap_or_default(), filter.mask, filter.modifier_option.as_ref() ) } fn select(&self, filter: &NetworkFilter) -> bool { !filter.is_csp() && (filter.opt_domains.is_some() || filter.opt_not_domains.is_some()) } } */ #[cfg(test)] mod optimization_tests_pattern_group { use super::*; use crate::filters::network::CompiledRegex; use crate::filters::network::NetworkMatchable; use crate::lists; use crate::regex_manager::RegexManager; use crate::request::Request; use regex::RegexSet; fn check_regex_match(regex: &CompiledRegex, pattern: &str, matches: bool) { let is_match = regex.is_match(pattern); assert!( is_match == matches, "Expected {} match {} = {}", regex.to_string(), pattern, matches ); } fn check_match( 
regex_manager: &mut RegexManager, filter: &NetworkFilter, url_path: &str, matches: bool, ) { let is_match = filter.matches(&Request::new( ("https://example.com/".to_string() + url_path).as_str(), "https://google.com", "" ).unwrap(), regex_manager); assert!( is_match == matches, "Expected {} match {} = {}", filter.to_string(), url_path, matches ); } #[test] fn regex_set_works() { let regex_set = RegexSet::new(&[ r"/static/ad\.", "/static/ad-", "/static/ad/.*", "/static/ads/.*", "/static/adv/.*", ]); let fused_regex = CompiledRegex::CompiledSet(regex_set.unwrap()); assert!(matches!(fused_regex, CompiledRegex::CompiledSet(_))); check_regex_match(&fused_regex, "/static/ad.", true); check_regex_match(&fused_regex, "/static/ad-", true); check_regex_match(&fused_regex, "/static/ads-", false); check_regex_match(&fused_regex, "/static/ad/", true); check_regex_match(&fused_regex, "/static/ad", false); check_regex_match(&fused_regex, "/static/ad/foobar", true); check_regex_match(&fused_regex, "/static/ad/foobar/asd?q=1", true); check_regex_match(&fused_regex, "/static/ads/", true); check_regex_match(&fused_regex, "/static/ads", false); check_regex_match(&fused_regex, "/static/ads/foobar", true); check_regex_match(&fused_regex, "/static/ads/foobar/asd?q=1", true); check_regex_match(&fused_regex, "/static/adv/", true); check_regex_match(&fused_regex, "/static/adv", false); check_regex_match(&fused_regex, "/static/adv/foobar", true); check_regex_match(&fused_regex, "/static/adv/foobar/asd?q=1", true); } #[test] fn combines_simple_regex_patterns() { let rules = [ "/static/ad-", "/static/ad.", "/static/ad/*", "/static/ads/*", "/static/adv/*", ]; let (filters, _) = lists::parse_filters(&rules, true, Default::default()); let optimization = SimplePatternGroup {}; filters .iter() .for_each(|f| assert!(optimization.select(f), "Expected rule to be selected")); let fused = optimization.fusion(&filters); assert!(fused.is_regex() == false, "Expected rule to not be a regex"); assert_eq!( 
fused.to_string(), "/static/ad- <+> /static/ad. <+> /static/ad/* <+> /static/ads/* <+> /static/adv/*" ); let mut regex_manager = RegexManager::default(); check_match(&mut regex_manager, &fused, "/static/ad-", true); check_match(&mut regex_manager, &fused, "/static/ad.", true); check_match(&mut regex_manager, &fused, "/static/ad%", false); check_match(&mut regex_manager, &fused, "/static/ads-", false); check_match(&mut regex_manager, &fused, "/static/ad/", true); check_match(&mut regex_manager, &fused, "/static/ad", false); check_match(&mut regex_manager, &fused, "/static/ad/foobar", true); check_match( &mut regex_manager, &fused, "/static/ad/foobar/asd?q=1", true, ); check_match(&mut regex_manager, &fused, "/static/ads/", true); check_match(&mut regex_manager, &fused, "/static/ads", false); check_match(&mut regex_manager, &fused, "/static/ads/foobar", true); check_match( &mut regex_manager, &fused, "/static/ads/foobar/asd?q=1", true, ); check_match(&mut regex_manager, &fused, "/static/adv/", true); check_match(&mut regex_manager, &fused, "/static/adv", false); check_match(&mut regex_manager, &fused, "/static/adv/foobar", true); check_match( &mut regex_manager, &fused, "/static/adv/foobar/asd?q=1", true, ); } #[test] fn separates_pattern_by_grouping() { let rules = [ "/analytics-v1.", "/v1/pixel?", "/api/v1/stat?", "/analytics/v1/*$domain=~my.leadpages.net", "/v1/ads/*", ]; let (filters, _) = lists::parse_filters(&rules, true, Default::default()); let optimization = SimplePatternGroup {}; let (fused, skipped) = apply_optimisation(&optimization, filters); assert_eq!(fused.len(), 1); let filter = fused.get(0).unwrap(); assert_eq!( filter.to_string(), "/analytics-v1. <+> /v1/pixel? <+> /api/v1/stat? 
<+> /v1/ads/*" ); assert!(filter.matches_test( &Request::new( "https://example.com/v1/pixel?", "https://my.leadpages.net", "" ) .unwrap() )); assert_eq!(skipped.len(), 1); let filter = skipped.get(0).unwrap(); assert_eq!( filter.to_string(), "/analytics/v1/*$domain=~my.leadpages.net" ); assert!(filter.matches_test( &Request::new( "https://example.com/analytics/v1/foobar", "https://foo.leadpages.net", "" ) .unwrap() )) } } /* #[cfg(test)] mod optimization_tests_union_domain { use super::*; use crate::filters::network::NetworkMatchable; use crate::lists; use crate::request::Request; use crate::utils; #[test] fn merges_domains() { let rules = [ "/analytics-v1$domain=google.com", "/analytics-v1$domain=example.com", ]; let (filters, _) = lists::parse_filters(&rules, true, Default::default()); let optimization = UnionDomainGroup {}; let (fused, _) = apply_optimisation(&optimization, filters); assert_eq!(fused.len(), 1); let filter = fused.get(0).unwrap(); assert_eq!( filter.to_string(), "/analytics-v1$domain=google.com <+> /analytics-v1$domain=example.com" ); let expected_domains = vec![ utils::fast_hash("example.com"), utils::fast_hash("google.com"), ]; assert!(filter.opt_domains.is_some()); let filter_domains = filter.opt_domains.as_ref().unwrap(); for dom in expected_domains { assert!(filter_domains.contains(&dom)); } assert!( filter.matches_test( &Request::new( "https://example.com/analytics-v1/foobar", "https://google.com", "" ) .unwrap() ) == true ); assert!( filter.matches_test( &Request::new( "https://example.com/analytics-v1/foobar", "https://foo.leadpages.net", "" ) .unwrap() ) == false ); } #[test] fn skips_rules_with_no_domain() { let rules = [ "/analytics-v1$domain=google.com", "/analytics-v1$domain=example.com", "/analytics-v1", ]; let (filters, _) = lists::parse_filters(&rules, true, Default::default()); let optimization = UnionDomainGroup {}; let (_, skipped) = apply_optimisation(&optimization, filters); assert_eq!(skipped.len(), 1); let filter = 
skipped.get(0).unwrap(); assert_eq!(filter.to_string(), "/analytics-v1"); } #[test] fn optimises_domains() { let rules = [ "/analytics-v1$domain=google.com", "/analytics-v1$domain=example.com", "/analytics-v1$domain=exampleone.com|exampletwo.com", "/analytics-v1", ]; let (filters, _) = lists::parse_filters(&rules, true, Default::default()); let optimization = UnionDomainGroup {}; let (fused, skipped) = apply_optimisation(&optimization, filters); assert_eq!(fused.len(), 1); let filter = fused.get(0).unwrap(); assert_eq!( filter.to_string(), "/analytics-v1$domain=google.com <+> /analytics-v1$domain=example.com <+> /analytics-v1$domain=exampleone.com|exampletwo.com" ); assert_eq!(skipped.len(), 1); let skipped_filter = skipped.get(0).unwrap(); assert_eq!(skipped_filter.to_string(), "/analytics-v1"); assert!( filter.matches_test( &Request::new( "https://example.com/analytics-v1/foobar", "https://google.com", "" ) .unwrap() ) == true ); assert!( filter.matches_test( &Request::new( "https://example.com/analytics-v1/foobar", "https://example.com", "" ) .unwrap() ) == true ); assert!( filter.matches_test( &Request::new( "https://example.com/analytics-v1/foobar", "https://exampletwo.com", "" ) .unwrap() ) == true ); assert!( filter.matches_test( &Request::new( "https://example.com/analytics-v1/foobar", "https://foo.leadpages.net", "" ) .unwrap() ) == false ); } } */ adblock-0.8.12/src/regex_manager.rs000064400000000000000000000245251046102023000152640ustar 00000000000000//! Compiled regexes can take up large amounts of memory. To reduce the overall memory footprint of //! the [`crate::Engine`], infrequently used regexes can be discarded. The [`RegexManager`] is //! responsible for managing the storage of regexes used by filters. 
use crate::filters::network::{compile_regex, CompiledRegex, NetworkFilter};

use std::collections::HashMap;
use std::time::Duration;

#[cfg(test)]
#[cfg(not(target_arch = "wasm32"))]
use mock_instant::global::Instant;
#[cfg(not(test))]
#[cfg(not(target_arch = "wasm32"))]
use std::time::Instant;

/// Placeholder clock for `wasm32` targets. It carries no time value; `now()` simply returns the
/// unit struct so that the rest of this module compiles unchanged.
#[cfg(target_arch = "wasm32")]
#[derive(Clone, Copy)]
pub struct Instant;

#[cfg(target_arch = "wasm32")]
impl Instant {
    pub fn now() -> Self {
        Self
    }
}

/// `*const NetworkFilter` could technically leak across threads through `RegexDebugEntry::id`, but
/// it's disguised as a unique identifier and not intended to be dereferenced.
unsafe impl Send for RegexManager {}

const DEFAULT_CLEAN_UP_INTERVAL: Duration = Duration::from_secs(30);
const DEFAULT_DISCARD_UNUSED_TIME: Duration = Duration::from_secs(180);

/// Reports [`RegexManager`] metrics that may be useful for creating an optimized
/// [`RegexManagerDiscardPolicy`].
#[cfg(feature = "regex-debug-info")]
pub struct RegexDebugInfo {
    /// Information about each regex contained in the [`RegexManager`].
    pub regex_data: Vec<RegexDebugEntry>,
    /// Total count of compiled regexes.
    pub compiled_regex_count: usize,
}

/// Describes metrics about a single regex from the [`RegexManager`].
#[cfg(feature = "regex-debug-info")]
pub struct RegexDebugEntry {
    /// Id for this particular regex, which is constant and unique for its lifetime.
    ///
    /// Note that there are no guarantees about a particular id's constancy or uniqueness beyond
    /// the lifetime of a corresponding regex.
    pub id: u64,
    /// A string representation of this regex, if available. It may be `None` if the regex has been
    /// cleaned up to conserve memory.
    pub regex: Option<String>,
    /// When this regex was last used.
    pub last_used: Instant,
    /// How many times this regex has been used.
    pub usage_count: usize,
}

/// Internal bookkeeping for the regex belonging to a single filter.
struct RegexEntry {
    /// The compiled regex, or `None` once it has been discarded to save memory.
    regex: Option<CompiledRegex>,
    /// The manager's notion of "now" at the time of the most recent match.
    last_used: Instant,
    /// Number of times this entry has been matched against.
    usage_count: usize,
}

/// Used for customization of regex discarding behavior in the [`RegexManager`].
pub struct RegexManagerDiscardPolicy { /// The [`RegexManager`] will check for and cleanup unused filters on this interval. pub cleanup_interval: Duration, /// The [`RegexManager`] will discard a regex if it hasn't been used for this much time. pub discard_unused_time: Duration, } impl Default for RegexManagerDiscardPolicy { fn default() -> Self { Self { cleanup_interval: DEFAULT_CLEAN_UP_INTERVAL, discard_unused_time: DEFAULT_DISCARD_UNUSED_TIME, } } } type RandomState = std::hash::BuildHasherDefault; /// A manager that creates and stores all regular expressions used by filters. /// Rarely used entries are discarded to save memory. /// /// The [`RegexManager`] is not thread safe, so any access to it must be synchronized externally. pub struct RegexManager { map: HashMap<*const NetworkFilter, RegexEntry, RandomState>, compiled_regex_count: usize, now: Instant, #[cfg_attr(target_arch = "wasm32", allow(unused))] last_cleanup: Instant, discard_policy: RegexManagerDiscardPolicy, } impl Default for RegexManager { fn default() -> Self { Self { map: Default::default(), compiled_regex_count: 0, now: Instant::now(), last_cleanup: Instant::now(), discard_policy: Default::default(), } } } fn make_regexp(filter: &NetworkFilter) -> CompiledRegex { compile_regex( &filter.filter, filter.is_right_anchor(), filter.is_left_anchor(), filter.is_complete_regex(), ) } impl RegexManager { /// Check whether or not a regex network filter matches a certain URL pattern, using the /// [`RegexManager`]'s managed regex storage. 
pub fn matches(&mut self, filter: &NetworkFilter, pattern: &str) -> bool { if !filter.is_regex() && !filter.is_complete_regex() { return true; } let key = filter as *const NetworkFilter; use std::collections::hash_map::Entry; match self.map.entry(key) { Entry::Occupied(mut e) => { let v = e.get_mut(); v.usage_count += 1; v.last_used = self.now; if v.regex.is_none() { // A discarded entry, recreate it: v.regex = Some(make_regexp(filter)); self.compiled_regex_count += 1; } return v.regex.as_ref().unwrap().is_match(pattern); } Entry::Vacant(e) => { self.compiled_regex_count += 1; let new_entry = RegexEntry { regex: Some(make_regexp(filter)), last_used: self.now, usage_count: 1, }; return e .insert(new_entry) .regex .as_ref() .unwrap() .is_match(pattern); } }; } /// The [`RegexManager`] is just a struct and doesn't manage any worker threads, so this method /// must be called periodically to ensure that it can track usage patterns of regexes over /// time. This method will handle periodically discarding filters if necessary. #[cfg(not(target_arch = "wasm32"))] pub fn update_time(&mut self) { self.now = Instant::now(); if !self.discard_policy.cleanup_interval.is_zero() && self.now - self.last_cleanup >= self.discard_policy.cleanup_interval { self.last_cleanup = self.now; self.cleanup(); } } #[cfg(not(target_arch = "wasm32"))] pub(crate) fn cleanup(&mut self) { let now = self.now; for v in self.map.values_mut() { if now - v.last_used >= self.discard_policy.discard_unused_time { // Discard the regex to save memory. v.regex = None; } } } /// Customize the discard behavior of this [`RegexManager`]. pub fn set_discard_policy(&mut self, new_discard_policy: RegexManagerDiscardPolicy) { self.discard_policy = new_discard_policy; } /// Discard one regex, identified by its id from a [`RegexDebugEntry`]. 
#[cfg(feature = "regex-debug-info")] pub fn discard_regex(&mut self, regex_id: u64) { self.map .iter_mut() .filter(|(k, _)| **k as u64 == regex_id) .for_each(|(_, v)| { v.regex = None; }); } #[cfg(feature = "regex-debug-info")] pub(crate) fn get_debug_regex_data(&self) -> Vec { use itertools::Itertools; self.map .iter() .map(|(k, e)| RegexDebugEntry { id: *k as u64, regex: e.regex.as_ref().map(|x| x.to_string()), last_used: e.last_used, usage_count: e.usage_count, }) .collect_vec() } #[cfg(feature = "regex-debug-info")] pub(crate) fn get_compiled_regex_count(&self) -> usize { self.compiled_regex_count } /// Collect metrics that may be useful for creating an optimized [`RegexManagerDiscardPolicy`]. #[cfg(feature = "regex-debug-info")] pub fn get_debug_info(&self) -> RegexDebugInfo { RegexDebugInfo { regex_data: self.get_debug_regex_data(), compiled_regex_count: self.get_compiled_regex_count(), } } } #[cfg(all(test, feature = "regex-debug-info"))] mod tests { use super::*; use crate::filters::network::NetworkMatchable; use crate::request; use mock_instant::global::MockClock; fn make_filter(line: &str) -> NetworkFilter { NetworkFilter::parse(line, true, Default::default()).unwrap() } fn make_request(url: &str) -> request::Request { request::Request::new(url, "https://example.com", "other").unwrap() } fn get_active_regex_count(regex_manager: &RegexManager) -> usize { regex_manager .get_debug_regex_data() .iter() .filter(|x| x.regex.is_some()) .count() } #[test] fn simple_match() { let mut regex_manager = RegexManager::default(); regex_manager.update_time(); let filter = make_filter("||geo*.hltv.org^"); assert!(filter.matches(&make_request("https://geo2.hltv.org/"), &mut regex_manager)); assert_eq!(get_active_regex_count(®ex_manager), 1); assert_eq!(regex_manager.get_debug_regex_data().len(), 1); } #[test] fn discard_and_recreate() { let mut regex_manager = RegexManager::default(); regex_manager.update_time(); let filter = make_filter("||geo*.hltv.org^"); 
assert!(filter.matches(&make_request("https://geo2.hltv.org/"), &mut regex_manager)); assert_eq!(regex_manager.get_compiled_regex_count(), 1); assert_eq!(get_active_regex_count(®ex_manager), 1); MockClock::advance(DEFAULT_DISCARD_UNUSED_TIME - Duration::from_secs(1)); regex_manager.update_time(); // The entry shouldn't be discarded because was used during // last REGEX_MANAGER_DISCARD_TIME. assert_eq!(get_active_regex_count(®ex_manager), 1); // The entry is entry is outdated, but should be discarded only // in the next cleanup() call. The call was 2 sec ago and is throttled // now. MockClock::advance(DEFAULT_CLEAN_UP_INTERVAL - Duration::from_secs(1)); regex_manager.update_time(); assert_eq!(get_active_regex_count(®ex_manager), 1); MockClock::advance(Duration::from_secs(2)); regex_manager.update_time(); // The entry is now outdated & cleanup() should be called => discard. assert_eq!(get_active_regex_count(®ex_manager), 0); // The entry is recreated, get_compiled_regex_count() increased +1. assert!(filter.matches(&make_request("https://geo2.hltv.org/"), &mut regex_manager)); assert_eq!(regex_manager.get_compiled_regex_count(), 2); assert_eq!(get_active_regex_count(®ex_manager), 1); } } adblock-0.8.12/src/request.rs000064400000000000000000000312601046102023000141420ustar 00000000000000//! Contains structures needed to describe network requests. use std::borrow::Cow; use thiserror::Error; use crate::url_parser; use crate::utils; /// The type of resource requested from the URL endpoint. #[derive(Clone, PartialEq, Debug)] pub enum RequestType { Beacon, Csp, Document, Dtd, Fetch, Font, Image, Media, Object, Other, Ping, Script, Stylesheet, Subdocument, Websocket, Xlst, Xmlhttprequest, } /// Possible failure reasons when creating a [`Request`]. 
#[derive(Debug, Error, PartialEq)]
pub enum RequestError {
    #[error("hostname parsing failed")]
    HostnameParseError,
    #[error("source hostname parsing failed")]
    SourceHostnameParseError,
    #[error("invalid Unicode provided")]
    UnicodeDecodingError,
}

impl From<idna::Errors> for RequestError {
    fn from(_err: idna::Errors) -> RequestError {
        RequestError::UnicodeDecodingError
    }
}

impl From<url::ParseError> for RequestError {
    fn from(_err: url::ParseError) -> RequestError {
        RequestError::HostnameParseError
    }
}

/// Maps a raw content-type string onto a [`RequestType`].
///
/// Unrecognized strings fall back to [`RequestType::Other`]. Note that `"beacon"` is reported as
/// [`RequestType::Ping`].
fn cpt_match_type(cpt: &str) -> RequestType {
    match cpt {
        "beacon" => RequestType::Ping,
        "csp_report" => RequestType::Csp,
        "document" | "main_frame" => RequestType::Document,
        "font" => RequestType::Font,
        "image" | "imageset" => RequestType::Image,
        "media" => RequestType::Media,
        "object" | "object_subrequest" => RequestType::Object,
        "ping" => RequestType::Ping,
        "script" => RequestType::Script,
        "stylesheet" => RequestType::Stylesheet,
        "sub_frame" | "subdocument" => RequestType::Subdocument,
        "websocket" => RequestType::Websocket,
        "xhr" | "xmlhttprequest" => RequestType::Xmlhttprequest,
        "other" => RequestType::Other,
        "speculative" => RequestType::Other,
        "web_manifest" => RequestType::Other,
        "xbl" => RequestType::Other,
        "xml_dtd" => RequestType::Other,
        "xslt" => RequestType::Other,
        _ => RequestType::Other,
    }
}

/// A network [`Request`], used as an interface for network blocking in the [`crate::Engine`].
#[derive(Clone, Debug)] pub struct Request { pub request_type: RequestType, pub is_http: bool, pub is_https: bool, pub is_supported: bool, pub is_third_party: bool, pub url: String, pub hostname: String, pub source_hostname_hashes: Option>, pub(crate) original_url: String, } impl Request { pub(crate) fn get_url(&self, case_sensitive: bool) -> std::borrow::Cow { if case_sensitive { Cow::Borrowed(&self.url) } else { Cow::Owned(self.url.to_ascii_lowercase()) } } pub fn get_tokens(&self, token_buffer: &mut Vec) { token_buffer.clear(); utils::tokenize_pooled(&self.url.to_ascii_lowercase(), token_buffer); // Add zero token as a fallback to wildcard rule bucket token_buffer.push(0); } #[allow(clippy::too_many_arguments)] fn from_detailed_parameters( raw_type: &str, url: &str, schema: &str, hostname: &str, source_hostname: &str, third_party: bool, original_url: String, ) -> Request { let is_http: bool; let is_https: bool; let is_supported: bool; let request_type: RequestType; if schema.is_empty() { // no ':' was found is_https = true; is_http = false; is_supported = true; request_type = cpt_match_type(raw_type); } else { is_http = schema == "http"; is_https = !is_http && schema == "https"; let is_websocket = !is_http && !is_https && (schema == "ws" || schema == "wss"); is_supported = is_http || is_https || is_websocket; if is_websocket { request_type = RequestType::Websocket; } else { request_type = cpt_match_type(raw_type); } } let source_hostname_hashes = if !source_hostname.is_empty() { let mut hashes = Vec::with_capacity(4); hashes.push(utils::fast_hash(source_hostname)); for (i, c) in source_hostname.char_indices() { if c == '.' && i + 1 < source_hostname.len() { hashes.push(utils::fast_hash(&source_hostname[i + 1..])); } } Some(hashes) } else { None }; Request { request_type, url: url.to_owned(), hostname: hostname.to_owned(), source_hostname_hashes, is_third_party: third_party, is_http, is_https, is_supported, original_url, } } /// Construct a new [`Request`]. 
pub fn new( url: &str, source_url: &str, request_type: &str, ) -> Result { if let Some(parsed_url) = url_parser::parse_url(url) { if let Some(parsed_source) = url_parser::parse_url(source_url) { let source_domain = parsed_source.domain(); let third_party = source_domain != parsed_url.domain(); Ok(Request::from_detailed_parameters( request_type, &parsed_url.url, parsed_url.schema(), parsed_url.hostname(), parsed_source.hostname(), third_party, url.to_string(), )) } else { Ok(Request::from_detailed_parameters( request_type, &parsed_url.url, parsed_url.schema(), parsed_url.hostname(), "", true, url.to_string(), )) } } else { Err(RequestError::HostnameParseError) } } /// If you're building a [`Request`] in a context that already has access to parsed /// representations of the input URLs, you can use this constructor to avoid extra lookups from /// the public suffix list. Take care to pass data correctly. pub fn preparsed( url: &str, hostname: &str, source_hostname: &str, request_type: &str, third_party: bool, ) -> Request { let splitter = memchr::memchr(b':', url.as_bytes()).unwrap_or(0); let schema: &str = &url[..splitter]; Request::from_detailed_parameters( request_type, url, schema, hostname, source_hostname, third_party, url.to_string(), ) } } #[cfg(test)] mod tests { use super::*; fn build_request( raw_type: &str, url: &str, schema: &str, hostname: &str, domain: &str, source_hostname: &str, source_domain: &str, ) -> Request { let third_party = source_domain != domain; Request::from_detailed_parameters( raw_type, url, schema, hostname, source_hostname, third_party, url.to_string(), ) } #[test] fn new_works() { let simple_example = build_request( "document", "https://example.com/ad", "https", "example.com", "example.com", "example.com", "example.com", ); assert_eq!(simple_example.is_https, true); assert_eq!(simple_example.is_supported, true); assert_eq!(simple_example.is_third_party, false); assert_eq!(simple_example.request_type, RequestType::Document); assert_eq!( 
simple_example.source_hostname_hashes, Some(vec![ utils::fast_hash("example.com"), utils::fast_hash("com") ]), ); let unsupported_example = build_request( "document", "file://example.com/ad", "file", "example.com", "example.com", "example.com", "example.com", ); assert_eq!(unsupported_example.is_https, false); assert_eq!(unsupported_example.is_http, false); assert_eq!(unsupported_example.is_supported, false); let first_party = build_request( "document", "https://subdomain.example.com/ad", "https", "subdomain.example.com", "example.com", "example.com", "example.com", ); assert_eq!(first_party.is_https, true); assert_eq!(first_party.is_supported, true); assert_eq!(first_party.is_third_party, false); let third_party = build_request( "document", "https://subdomain.anotherexample.com/ad", "https", "subdomain.anotherexample.com", "anotherexample.com", "example.com", "example.com", ); assert_eq!(third_party.is_https, true); assert_eq!(third_party.is_supported, true); assert_eq!(third_party.is_third_party, true); let websocket = build_request( "document", "wss://subdomain.anotherexample.com/ad", "wss", "subdomain.anotherexample.com", "anotherexample.com", "example.com", "example.com", ); assert_eq!(websocket.is_https, false); assert_eq!(websocket.is_https, false); assert_eq!(websocket.is_supported, true); assert_eq!(websocket.is_third_party, true); assert_eq!(websocket.request_type, RequestType::Websocket); let assumed_https = build_request( "document", "//subdomain.anotherexample.com/ad", "", "subdomain.anotherexample.com", "anotherexample.com", "example.com", "example.com", ); assert_eq!(assumed_https.is_https, true); assert_eq!(assumed_https.is_http, false); assert_eq!(assumed_https.is_supported, true); } fn tokenize(tokens: &[&str], extra_tokens: &[utils::Hash]) -> Vec { let mut tokens: Vec<_> = tokens.into_iter().map(|t| utils::fast_hash(&t)).collect(); tokens.extend(extra_tokens); tokens } #[test] fn tokens_works() { let simple_example = build_request( "document", 
"https://subdomain.example.com/ad", "https", "subdomain.example.com", "example.com", "subdomain.example.com", "example.com", ); assert_eq!( simple_example .source_hostname_hashes .as_ref() .unwrap() .as_slice(), tokenize(&["subdomain.example.com", "example.com", "com",], &[]).as_slice() ); let mut tokens = Vec::new(); simple_example.get_tokens(&mut tokens); assert_eq!( tokens.as_slice(), tokenize(&["https", "subdomain", "example", "com", "ad"], &[0]).as_slice() ) } #[test] fn parses_urls() { let parsed = Request::new( "https://subdomain.example.com/ad", "https://example.com/", "document", ) .unwrap(); assert_eq!(parsed.is_https, true); assert_eq!(parsed.is_supported, true); assert_eq!(parsed.is_third_party, false); assert_eq!(parsed.request_type, RequestType::Document); // assert_eq!(parsed.domain, "example.com"); assert_eq!(parsed.hostname, "subdomain.example.com"); // assert_eq!(parsed.source_domain, "example.com"); assert_eq!( parsed.source_hostname_hashes, Some(vec![ utils::fast_hash("example.com"), utils::fast_hash("com") ]), ); // assert_eq!(parsed.source_hostname, "example.com"); let bad_url = Request::new( "subdomain.example.com/ad", "https://example.com/", "document", ); assert_eq!(bad_url.err(), Some(RequestError::HostnameParseError)); } #[test] fn fuzzing_errors() { { let parsed = Request::new("https://߶", "https://example.com", "other"); assert!(parsed.is_ok()); } { let parsed = Request::new(&format!( "https://{}", std::str::from_utf8(&[9, 9, 64]).unwrap() ), "https://example.com", "other"); assert!(parsed.is_err()); } } } adblock-0.8.12/src/resources/mod.rs000064400000000000000000000357271046102023000152570ustar 00000000000000//! In adblocking terms, [`Resource`]s are special placeholder scripts, images, //! video files, etc. that can be returned as drop-in replacements for harmful //! equivalents from remote servers. Resources also encompass scriptlets, which //! can be injected into pages to inhibit malicious behavior. //! //! 
If the `resource-assembler` feature is enabled, the #![cfg_attr(not(feature = "resource-assembler"), doc="`resource_assembler`")] #![cfg_attr(feature = "resource-assembler", doc="[`resource_assembler`]")] //! module will assist with the construction of [`Resource`]s directly from the uBlock Origin //! project. #[cfg(feature = "resource-assembler")] pub mod resource_assembler; mod resource_storage; #[doc(inline)] pub use resource_storage::{AddResourceError, ResourceStorage, ScriptletResourceError}; pub(crate) use resource_storage::parse_scriptlet_args; use memchr::memrchr as find_char_reverse; use serde::{Deserialize, Serialize}; /// Specifies a set of permissions required to inject a scriptlet resource. /// /// Permissions can be specified when parsing individual lists using [`crate::FilterSet`] in /// order to propagate the permission level to all filters contained in the list. /// /// In practice, permissions are used to limit the risk of third-party lists having access to /// powerful scriptlets like uBlock Origin's `trusted-set-cookie`, which has the ability to set /// arbitrary cookies to arbitrary values on visited sites. /// /// ### Example /// /// ``` /// # use adblock::Engine; /// # use adblock::lists::ParseOptions; /// # use adblock::resources::{MimeType, PermissionMask, Resource, ResourceType}; /// # let mut filter_set = adblock::lists::FilterSet::default(); /// # let untrusted_filters = vec![""]; /// # let trusted_filters = vec![""]; /// const COOKIE_ACCESS: PermissionMask = PermissionMask::from_bits(0b00000001); /// const LOCALSTORAGE_ACCESS: PermissionMask = PermissionMask::from_bits(0b00000010); /// /// // `untrusted_filters` will not be able to use privileged scriptlet injections. /// filter_set.add_filters( /// untrusted_filters, /// Default::default(), /// ); /// // `trusted_filters` will be able to inject scriptlets requiring `COOKIE_ACCESS` /// // permissions or `LOCALSTORAGE_ACCESS` permissions. 
/// filter_set.add_filters( /// trusted_filters, /// ParseOptions { /// permissions: COOKIE_ACCESS | LOCALSTORAGE_ACCESS, /// ..Default::default() /// }, /// ); /// /// let mut engine = Engine::from_filter_set(filter_set, true); /// // The `trusted-set-cookie` scriptlet cannot be injected without `COOKIE_ACCESS` /// // permission. /// engine.add_resource(Resource { /// name: "trusted-set-cookie.js".to_string(), /// aliases: vec![], /// kind: ResourceType::Mime(MimeType::ApplicationJavascript), /// content: base64::encode("document.cookie = '...';"), /// dependencies: vec![], /// permission: COOKIE_ACCESS, /// }); /// ``` #[derive(Serialize, Deserialize, Clone, Copy, Default)] #[repr(transparent)] #[serde(transparent)] pub struct PermissionMask(u8); impl std::fmt::Debug for PermissionMask { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "PermissionMask({:b})", self.0) } } impl core::ops::BitOr for PermissionMask { type Output = PermissionMask; fn bitor(self, rhs: PermissionMask) -> Self::Output { Self(self.0 | rhs.0) } } impl core::ops::BitOrAssign for PermissionMask { fn bitor_assign(&mut self, rhs: PermissionMask) { self.0 |= rhs.0; } } impl PermissionMask { /// Construct a new [`PermissionMask`] with the given bitmask. Use /// [`PermissionMask::default()`] instead if you don't want to restrict or grant any /// permissions. pub const fn from_bits(bits: u8) -> Self { Self(bits) } /// Can `filter_mask` authorize injecting a resource requiring `self` permissions? 
pub fn is_injectable_by(&self, filter_mask: PermissionMask) -> bool { // For any particular bit index, the scriptlet is injectable if: // (there is a requirement, AND the filter meets it) OR (there's no requirement) // in other words: // (self & filter_mask) | (!self) == 1 // (self | !self) & (filter_mask | !self) == 1 // filter_mask | !self == 1 // !(filter_mask | !self) == 0 // !filter_mask & self == 0 // which we can compare across *all* bits using bitwise operations, hence: !filter_mask.0 & self.0 == 0 } /// The default value for [`PermissionMask`] is one which provides no additional permissions. fn is_default(&self) -> bool { self.0 == 0 } } /// Struct representing a resource that can be used by an adblocking engine. #[derive(Serialize, Deserialize, Clone)] pub struct Resource { /// Represents the primary name of the resource, often a filename pub name: String, /// Represents secondary names that can be used to access the resource #[serde(default)] pub aliases: Vec, /// How to interpret the resource data within `content` pub kind: ResourceType, /// The resource data, encoded using standard base64 configuration pub content: String, /// Optionally contains the name of any dependencies used by this resource. Currently, this /// only applies to `application/javascript` and `fn/javascript` MIME types. /// /// Aliases should never be added to this list. It should only contain primary/canonical /// resource names. /// /// Currently ignored, but will be respected in a future release. Bundle any required /// dependencies inside the resource for now. #[serde(default, skip_serializing_if = "Vec::is_empty")] pub dependencies: Vec, /// Optionally defines permission levels required to use this resource for a scriptlet /// injection. See [`PermissionMask`] for more details. /// /// If there is any customized permission, this resource cannot be used for redirects. 
/// /// This field is similar to the `requiresTrust` field from uBlock Origin's scriptlet /// resources, except that it supports up to 8 different trust "domains". #[serde(default, skip_serializing_if = "PermissionMask::is_default")] pub permission: PermissionMask, } impl Resource { /// Convenience constructor for tests. Creates a new [`Resource`] with no aliases or /// dependencies. Content will be automatically base64-encoded by the constructor. #[cfg(test)] pub fn simple(name: &str, kind: MimeType, content: &str) -> Self { Self { name: name.to_string(), aliases: vec![], kind: ResourceType::Mime(kind), content: base64::encode(content), dependencies: vec![], permission: Default::default(), } } } /// Different ways that the data within the `content` field of a `Resource` can be interpreted. #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] #[serde(rename_all = "lowercase")] pub enum ResourceType { /// Interpret the data according to the MIME type represented by `type` Mime(MimeType), /// Interpret the data as a Javascript scriptlet template, with embedded template /// parameters in the form of `{{1}}`, `{{2}}`, etc. Note that `Mime(ApplicationJavascript)` /// can still be used as a templated resource, for compatibility purposes. Template, } impl ResourceType { /// Can resources of this type be used as network redirects? pub fn supports_redirect(&self) -> bool { !matches!(self, ResourceType::Template | ResourceType::Mime(MimeType::FnJavascript)) } /// Can resources of this type be used for scriptlet injections? pub fn supports_scriptlet_injection(&self) -> bool { matches!(self, ResourceType::Template | ResourceType::Mime(MimeType::ApplicationJavascript)) } } /// Acceptable MIME types for resources used by `$redirect` and `+js(...)` adblock rules. 
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] #[serde(into = "&str")] #[serde(from = "std::borrow::Cow<'static, str>")] pub enum MimeType { /// `"text/css"` TextCss, /// `"image/gif"` ImageGif, /// `"text/html"` TextHtml, /// `"application/javascript"` ApplicationJavascript, /// `"application/json"` ApplicationJson, /// `"audio/mp3"` AudioMp3, /// `"video/mp4"` VideoMp4, /// `"image/png"` ImagePng, /// `"text/plain"` TextPlain, /// `"text/xml"` TextXml, /// Custom MIME type invented for the uBlock Origin project. Represented by `"fn/javascript"`. /// Used to describe JavaScript functions that can be used as dependencies of other JavaScript /// resources. FnJavascript, /// Any other unhandled MIME type. Maps to `"application/octet-stream"` when re-serialized. Unknown, } impl MimeType { /// Infers a resource's MIME type according to the extension of its path pub fn from_extension(resource_path: &str) -> Self { if let Some(extension_index) = find_char_reverse(b'.', resource_path.as_bytes()) { match &resource_path[extension_index + 1..] { "css" => MimeType::TextCss, "gif" => MimeType::ImageGif, "html" => MimeType::TextHtml, "js" => MimeType::ApplicationJavascript, "json" => MimeType::ApplicationJson, "mp3" => MimeType::AudioMp3, "mp4" => MimeType::VideoMp4, "png" => MimeType::ImagePng, "txt" => MimeType::TextPlain, "xml" => MimeType::TextXml, _ => { #[cfg(test)] eprintln!("Unrecognized file extension on: {:?}", resource_path); MimeType::Unknown } } } else { MimeType::Unknown } } /// Should the MIME type decode as valid UTF8? pub fn is_textual(&self) -> bool { matches!( self, Self::ApplicationJavascript | Self::FnJavascript | Self::ApplicationJson | Self::TextCss | Self::TextPlain | Self::TextHtml | Self::TextXml ) } /// Can the MIME type have dependencies on other resources? 
pub fn supports_dependencies(&self) -> bool { matches!(self, Self::ApplicationJavascript | Self::FnJavascript) } } impl From<&str> for MimeType { fn from(v: &str) -> Self { match v { "text/css" => MimeType::TextCss, "image/gif" => MimeType::ImageGif, "text/html" => MimeType::TextHtml, "application/javascript" => MimeType::ApplicationJavascript, "application/json" => MimeType::ApplicationJson, "audio/mp3" => MimeType::AudioMp3, "video/mp4" => MimeType::VideoMp4, "image/png" => MimeType::ImagePng, "text/plain" => MimeType::TextPlain, "text/xml" => MimeType::TextXml, "fn/javascript" => MimeType::FnJavascript, _ => MimeType::Unknown, } } } impl From<&MimeType> for &str { fn from(v: &MimeType) -> Self { match v { MimeType::TextCss => "text/css", MimeType::ImageGif => "image/gif", MimeType::TextHtml => "text/html", MimeType::ApplicationJavascript => "application/javascript", MimeType::ApplicationJson => "application/json", MimeType::AudioMp3 => "audio/mp3", MimeType::VideoMp4 => "video/mp4", MimeType::ImagePng => "image/png", MimeType::TextPlain => "text/plain", MimeType::TextXml => "text/xml", MimeType::FnJavascript => "fn/javascript", MimeType::Unknown => "application/octet-stream", } } } // Required for `#[serde(from = "std::borrow::Cow<'static, str>")]` impl From> for MimeType { fn from(v: std::borrow::Cow<'static, str>) -> Self { v.as_ref().into() } } // Required for `#[serde(into = &str)]` impl From for &str { fn from(v: MimeType) -> Self { (&v).into() } } impl std::fmt::Display for MimeType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let s: &str = self.into(); write!(f, "{}", s) } } #[cfg(test)] mod permission_tests { use super::*; #[test] fn test_permissions() { { let resource = PermissionMask(0b00000000); assert!(resource.is_injectable_by(PermissionMask(0b00000000))); assert!(resource.is_injectable_by(PermissionMask(0b00000001))); assert!(resource.is_injectable_by(PermissionMask(0b00000010))); 
assert!(resource.is_injectable_by(PermissionMask(0b00000011))); assert!(resource.is_injectable_by(PermissionMask(0b10000000))); assert!(resource.is_injectable_by(PermissionMask(0b11111111))); } { let resource = PermissionMask(0b00000001); assert!(!resource.is_injectable_by(PermissionMask(0b00000000))); assert!(resource.is_injectable_by(PermissionMask(0b00000001))); assert!(!resource.is_injectable_by(PermissionMask(0b00000010))); assert!(resource.is_injectable_by(PermissionMask(0b00000011))); assert!(!resource.is_injectable_by(PermissionMask(0b10000000))); assert!(resource.is_injectable_by(PermissionMask(0b11111111))); } { let resource = PermissionMask(0b00000010); assert!(!resource.is_injectable_by(PermissionMask(0b00000000))); assert!(!resource.is_injectable_by(PermissionMask(0b00000001))); assert!(resource.is_injectable_by(PermissionMask(0b00000010))); assert!(resource.is_injectable_by(PermissionMask(0b00000011))); assert!(!resource.is_injectable_by(PermissionMask(0b10000000))); assert!(resource.is_injectable_by(PermissionMask(0b11111111))); } { let resource = PermissionMask(0b00000011); assert!(!resource.is_injectable_by(PermissionMask(0b00000000))); assert!(!resource.is_injectable_by(PermissionMask(0b00000001))); assert!(!resource.is_injectable_by(PermissionMask(0b00000010))); assert!(resource.is_injectable_by(PermissionMask(0b00000011))); assert!(!resource.is_injectable_by(PermissionMask(0b10000000))); assert!(resource.is_injectable_by(PermissionMask(0b11111111))); } { let resource = PermissionMask(0b10000011); assert!(!resource.is_injectable_by(PermissionMask(0b00000000))); assert!(!resource.is_injectable_by(PermissionMask(0b00000001))); assert!(!resource.is_injectable_by(PermissionMask(0b00000010))); assert!(!resource.is_injectable_by(PermissionMask(0b00000011))); assert!(!resource.is_injectable_by(PermissionMask(0b10000000))); assert!(resource.is_injectable_by(PermissionMask(0b11111111))); } } } 
adblock-0.8.12/src/resources/resource_assembler.rs000064400000000000000000000642341046102023000203570ustar 00000000000000
//! Contains methods useful for building [`Resource`] descriptors from resources directly from
//! files in the uBlock Origin repository.

use crate::resources::{MimeType, Resource, ResourceType};
use memchr::memmem;
use once_cell::sync::Lazy;
use regex::Regex;
use std::fs::File;
use std::io::Read;
use std::path::Path;

static TOP_COMMENT_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r#"^/\*[\S\s]+?\n\*/\s*"#).unwrap());
static NON_EMPTY_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\S"#).unwrap());

/// Represents a single entry of the `Map` from uBlock Origin's `redirect-resources.js`.
struct ResourceProperties {
    /// The name of a resource, corresponding to its path in the `web_accessible_resources`
    /// directory
    name: String,
    /// A list of optional additional names that can be used to reference the resource
    alias: Vec<String>,
    /// Either `"text"` or `"blob"`, but is currently unused in `adblock-rust`. Within uBlock
    /// Origin, it's used to prevent text files from being encoded in base64 in a data URL.
    #[allow(unused)]
    data: Option<String>,
}

/// The deserializable representation of the `alias` field of a resource's properties, which can
/// either be a single string or a list of strings.
#[derive(serde::Deserialize)]
#[serde(untagged)]
enum ResourceAliasField {
    SingleString(String),
    ListOfStrings(Vec<String>),
}

impl ResourceAliasField {
    /// Normalizes either form into a list of alias strings.
    fn to_vec(self) -> Vec<String> {
        match self {
            Self::SingleString(s) => vec![s],
            Self::ListOfStrings(l) => l,
        }
    }
}

/// Directly deserializable representation of a resource's properties from `redirect-resources.js`.
#[derive(serde::Deserialize)]
struct JsResourceProperties {
    #[serde(default)]
    alias: Option<ResourceAliasField>,
    #[serde(default)]
    data: Option<String>,
    // NOTE(review): the element type is assumed to be `String`; this file only checks whether
    // the field is present — confirm.
    #[serde(default)]
    params: Option<Vec<String>>,
}

/// Maps the name of the resource to its properties in a 2-element tuple.
type JsResourceEntry = (String, JsResourceProperties); const REDIRECTABLE_RESOURCES_DECLARATION: &str = "export default new Map(["; // ]); static MAP_END_RE: Lazy = Lazy::new(|| Regex::new(r#"^\s*\]\s*\)"#).unwrap()); static TRAILING_COMMA_RE: Lazy = Lazy::new(|| Regex::new(r#",([\],\}])"#).unwrap()); static UNQUOTED_FIELD_RE: Lazy = Lazy::new(|| Regex::new(r#"([\{,])([a-zA-Z][a-zA-Z0-9_]*):"#).unwrap()); // Avoid matching a starting `/*` inside a string static TRAILING_BLOCK_COMMENT_RE: Lazy = Lazy::new(|| Regex::new(r#"\s*/\*[^'"]*\*/\s*$"#).unwrap()); /// Reads data from a a file in the format of uBlock Origin's `redirect-resources.js` file to /// determine the files in the `web_accessible_resources` directory, as well as any of their /// aliases. /// /// This is read from the exported `Map`. fn read_redirectable_resource_mapping(mapfile_data: &str) -> Vec { // This isn't bulletproof, but it should handle the historical versions of the mapping // correctly, and having a strict JSON parser should catch any unexpected format changes. Plus, // it prevents dependending on a full JS engine. // Extract just the map. It's between REDIRECTABLE_RESOURCES_DECLARATION and MAP_END_RE. let mut map: String = mapfile_data .lines() .skip_while(|line| *line != REDIRECTABLE_RESOURCES_DECLARATION) .take_while(|line| !MAP_END_RE.is_match(line)) // Strip any trailing comments from each line. .map(|line| { if let Some(i) = memmem::find(line.as_bytes(), b"//") { &line[..i] } else { line } }) .map(|line| { TRAILING_BLOCK_COMMENT_RE.replace_all(line, "") }) // Remove all newlines from the entire string. .fold(String::new(), |s, line| s + &line); // Add back the final square brace that was omitted above as part of MAP_END_RE. map.push(']'); // Trim out the beginning `export default new Map(`. // Also, replace all single quote characters with double quotes. 
assert!(map.starts_with(REDIRECTABLE_RESOURCES_DECLARATION)); map = map[REDIRECTABLE_RESOURCES_DECLARATION.len() - 1..].replace('\'', "\""); // Remove all whitespace from the entire string. map.retain(|c| !c.is_whitespace()); // Replace all matches for `,]` or `,}` with `]` or `}`, respectively. map = TRAILING_COMMA_RE .replace_all(&map, |caps: ®ex::Captures| caps[1].to_string()) .to_string(); // Replace all property keys directly preceded by a `{` or a `,` and followed by a `:` with // double-quoted versions. map = UNQUOTED_FIELD_RE .replace_all(&map, |caps: ®ex::Captures| { format!("{}\"{}\":", &caps[1], &caps[2]) }) .to_string(); // It *should* be valid JSON now, so parse it with serde_json. let parsed: Vec = serde_json::from_str(&map).unwrap(); parsed .into_iter() .filter_map(|(name, props)| { // Ignore resources with params for now, since there's no support for them currently. if props.params.is_some() { None } else { Some(ResourceProperties { name, alias: props.alias.map(|a| a.to_vec()).unwrap_or_default(), data: props.data, }) } }) .collect() } /// Reads data from a file in the form of uBlock Origin's `scriptlets.js` file and produces /// templatable scriptlets for use in cosmetic filtering. 
fn read_template_resources(scriptlets_data: &str) -> Vec<Resource> {
    /// Assembles a single scriptlet `Resource` from the pieces accumulated by the line parser.
    fn build_scriptlet(
        name: &str,
        details: &std::collections::HashMap<&str, Vec<&str>>,
        script: &str,
    ) -> Resource {
        // A scriptlet that references its first argument placeholder is a template; anything
        // else is treated as plain JavaScript.
        let kind = if script.contains("{{1}}") {
            ResourceType::Template
        } else {
            ResourceType::Mime(MimeType::ApplicationJavascript)
        };
        Resource {
            name: name.to_owned(),
            aliases: details
                .get("alias")
                .map(|aliases| aliases.iter().map(|alias| alias.to_string()).collect())
                .unwrap_or_default(),
            kind,
            content: base64::encode(script),
            dependencies: vec![],
            permission: Default::default(),
        }
    }

    let mut resources = Vec::new();

    let uncommented = TOP_COMMENT_RE.replace_all(scriptlets_data, "");
    let mut name: Option<&str> = None;
    let mut details = std::collections::HashMap::<&str, Vec<&str>>::new();
    let mut script = String::new();

    for line in uncommented.lines() {
        // Skip preprocessor-style and ordinary comment lines.
        if line.starts_with('#') || line.starts_with("// ") || line == "//" {
            continue;
        }

        // Until a `/// <name>` line is seen, everything else is ignored.
        let current_name = match name {
            Some(current_name) => current_name,
            None => {
                if let Some(stripped) = line.strip_prefix("/// ") {
                    name = Some(stripped.trim());
                }
                continue;
            }
        };

        // Further `/// <prop> <value>` lines describe details such as aliases.
        if let Some(stripped) = line.strip_prefix("/// ") {
            let mut fields = stripped.split_whitespace();
            let prop = fields.next().expect("Detail line has property name");
            let value = fields.next().expect("Detail line has property value");
            details.entry(prop).or_default().push(value);
            continue;
        }

        if NON_EMPTY_LINE_RE.is_match(line) {
            script += line.trim();
            script.push('\n');
            continue;
        }

        // A blank line terminates the current scriptlet.
        resources.push(build_scriptlet(current_name, &details, &script));
        name = None;
        details.clear();
        script.clear();
    }

    // `str::lines` yields no trailing empty line, so a file that ends right after the last
    // scriptlet body would otherwise silently lose that scriptlet.
    if let Some(current_name) = name {
        if !script.is_empty() {
            resources.push(build_scriptlet(current_name, &details, &script));
        }
    }

    resources
}

/// Reads byte data from an arbitrary resource file, and assembles a `Resource` from it with the
/// provided `resource_info`.
fn build_resource_from_file_contents(
    resource_contents: &[u8],
    resource_info: &ResourceProperties,
) -> Resource {
    let mimetype = MimeType::from_extension(resource_info.name.as_str());

    // Textual resources must be valid UTF-8 and have carriage returns removed before encoding;
    // every other type is encoded byte-for-byte.
    let is_text = matches!(
        mimetype,
        MimeType::ApplicationJavascript | MimeType::TextHtml | MimeType::TextPlain
    );
    let content = if is_text {
        let text = std::str::from_utf8(resource_contents).unwrap();
        base64::encode(text.replace('\r', ""))
    } else {
        base64::encode(resource_contents)
    };

    Resource {
        name: resource_info.name.clone(),
        aliases: resource_info.alias.clone(),
        kind: ResourceType::Mime(mimetype),
        content,
        dependencies: vec![],
        permission: Default::default(),
    }
}

/// Produces a `Resource` from the `web_accessible_resource_dir` directory according to the
/// information in `resource_info`.
///
/// Panics if the named entry is not a regular file, or if it cannot be opened or read.
fn read_resource_from_web_accessible_dir(
    web_accessible_resource_dir: &Path,
    resource_info: &ResourceProperties,
) -> Resource {
    let resource_path = web_accessible_resource_dir.join(&resource_info.name);
    assert!(
        resource_path.is_file(),
        "Expected {:?} to be a file",
        resource_path
    );

    let mut resource_contents = Vec::new();
    File::open(resource_path)
        .expect("open resource file for reading")
        .read_to_end(&mut resource_contents)
        .expect("read resource file contents");

    build_resource_from_file_contents(&resource_contents, resource_info)
}

/// Builds a `Vec` of `Resource`s from the specified paths on the filesystem:
///
/// - `web_accessible_resource_dir`: A folder full of resource files
///
/// - `redirect_resources_path`: A file in the format of uBlock Origin's `redirect-resources.js`
///   containing an index of the resources in `web_accessible_resource_dir`
///
/// The resulting resources can be serialized into JSON using `serde_json`.
pub fn assemble_web_accessible_resources( web_accessible_resource_dir: &Path, redirect_resources_path: &Path, ) -> Vec { let mapfile_data = std::fs::read_to_string(redirect_resources_path).expect("read aliases path"); let resource_properties = read_redirectable_resource_mapping(&mapfile_data); resource_properties .iter() .map(|resource_info| { read_resource_from_web_accessible_dir(web_accessible_resource_dir, resource_info) }) .collect() } /// Parses the _old_ format of uBlock Origin templated scriptlet resources, prior to /// . /// /// The newer format is intended to be imported as an ES module, making line-based parsing even /// more complex and error-prone. Instead, it's recommended to transform them into [Resource]s /// using JS code. A short prelude containing an array of `[{{1}}, {{2}}, {{3}}, ...]` can be used /// to backport the newer scriptlet format into the older one; the new one will be directly /// supported in a future update. /// /// - `scriptlets_path`: A file in the format of uBlock Origin's `scriptlets.js` containing /// templatable scriptlet files for use in cosmetic filtering #[deprecated] pub fn assemble_scriptlet_resources(scriptlets_path: &Path) -> Vec { let scriptlets_data = std::fs::read_to_string(scriptlets_path).expect("read scriptlets path"); read_template_resources(&scriptlets_data) } #[cfg(test)] mod tests { use super::*; #[test] fn test_war_resource_assembly() { let web_accessible_resource_dir = Path::new("data/test/fake-uBO-files/web_accessible_resources"); let redirect_resources_path = Path::new("data/test/fake-uBO-files/redirect-resources.js"); let resources = assemble_web_accessible_resources(web_accessible_resource_dir, redirect_resources_path); let expected_resource_names = vec![ "1x1.gif", "2x2.png", "3x2.png", "32x32.png", "addthis_widget.js", "amazon_ads.js", "amazon_apstag.js", "ampproject_v0.js", "chartbeat.js", //"click-to-load.html" is ignored because it has a params field. 
"doubleclick_instream_ad_status.js", "empty", "fingerprint2.js", "fingerprint3.js", "google-analytics_analytics.js", "google-analytics_cx_api.js", "google-analytics_ga.js", "google-analytics_inpage_linkid.js", "google-ima.js", "googlesyndication_adsbygoogle.js", "googletagservices_gpt.js", "hd-main.js", "ligatus_angular-tag.js", "mxpnl_mixpanel.js", "monkeybroker.js", "noeval.js", "noeval-silent.js", "nobab.js", "nobab2.js", "nofab.js", "noop-0.1s.mp3", "noop-0.5s.mp3", "noop-1s.mp4", "noop.html", "noop.js", "noop.txt", "noop-vmap1.0.xml", "outbrain-widget.js", "popads.js", "popads-dummy.js", "prebid-ads.js", "scorecardresearch_beacon.js", "window.open-defuser.js", ]; for name in expected_resource_names { dbg!(&name); assert!( resources .iter() .find(|resource| { if let ResourceType::Mime(_) = resource.kind { resource.name == name } else { false } }) .is_some(), "{:?}", name ); } let serialized = serde_json::to_string(&resources).expect("serialize resources"); let reserialized: Vec = serde_json::from_str(&serialized).expect("deserialize resources"); assert_eq!(reserialized[0].name, "1x1.gif"); assert_eq!(reserialized[0].aliases, vec!["1x1-transparent.gif"]); assert_eq!(reserialized[0].kind, ResourceType::Mime(MimeType::ImageGif)); assert_eq!(reserialized[34].name, "noop.js"); assert_eq!( reserialized[34].aliases, vec!["noopjs", "abp-resource:blank-js"] ); assert_eq!( reserialized[34].kind, ResourceType::Mime(MimeType::ApplicationJavascript) ); let noopjs_contents = std::fs::read_to_string(Path::new( "data/test/fake-uBO-files/web_accessible_resources/noop.js", )) .unwrap() .replace('\r', ""); assert_eq!( std::str::from_utf8( &base64::decode(&reserialized[34].content).expect("decode base64 content") ) .expect("convert to utf8 string"), noopjs_contents, ); } #[test] fn test_scriptlet_resource_assembly2() { let scriptlets_path = Path::new("data/test/fake-uBO-files/scriptlets2.js"); #[allow(deprecated)] let resources = assemble_scriptlet_resources(scriptlets_path); let 
expected_resource_names = vec![ "abort-current-inline-script.js", "abort-on-property-read.js", "abort-on-property-write.js", "abort-on-stack-trace.js", "addEventListener-defuser.js", "addEventListener-logger.js", "json-prune.js", "nano-setInterval-booster.js", "nano-setTimeout-booster.js", "noeval-if.js", "no-fetch-if.js", "no-floc.js", "remove-attr.js", "remove-class.js", "no-requestAnimationFrame-if.js", "set-constant.js", "no-setInterval-if.js", "no-setTimeout-if.js", "webrtc-if.js", "window.name-defuser", "overlay-buster.js", "alert-buster.js", "gpt-defuser.js", "nowebrtc.js", "golem.de.js", "upmanager-defuser.js", "smartadserver.com.js", "adfly-defuser.js", "disable-newtab-links.js", "damoh-defuser.js", "twitch-videoad.js", "fingerprint2.js", "cookie-remover.js", ]; for name in expected_resource_names { assert!( resources .iter() .find(|resource| { match resource.kind { ResourceType::Template | ResourceType::Mime(MimeType::ApplicationJavascript) => { resource.name == name } _ => false, } }) .is_some(), "failed to find {}", name ); } let serialized = serde_json::to_string(&resources).expect("serialize resources"); let reserialized: Vec = serde_json::from_str(&serialized).expect("deserialize resources"); assert_eq!(reserialized[0].name, "abort-current-inline-script.js"); assert_eq!(reserialized[0].aliases, vec!["acis.js"]); assert_eq!(reserialized[0].kind, ResourceType::Template); assert_eq!(reserialized[17].name, "no-setTimeout-if.js"); assert_eq!( reserialized[17].aliases, vec!["nostif.js", "setTimeout-defuser.js"] ); assert_eq!(reserialized[17].kind, ResourceType::Template); assert_eq!(reserialized[20].name, "overlay-buster.js"); assert_eq!(reserialized[20].aliases, Vec::::new()); assert_eq!( reserialized[20].kind, ResourceType::Mime(MimeType::ApplicationJavascript) ); assert_eq!( std::str::from_utf8( &base64::decode(&reserialized[20].content).expect("decode base64 content") ).expect("convert to utf8 string"), "(function() {\nif ( window !== window.top ) 
{\nreturn;\n}\nvar tstart;\nvar ttl = 30000;\nvar delay = 0;\nvar delayStep = 50;\nvar buster = function() {\nvar docEl = document.documentElement,\nbodyEl = document.body,\nvw = Math.min(docEl.clientWidth, window.innerWidth),\nvh = Math.min(docEl.clientHeight, window.innerHeight),\ntol = Math.min(vw, vh) * 0.05,\nel = document.elementFromPoint(vw/2, vh/2),\nstyle, rect;\nfor (;;) {\nif ( el === null || el.parentNode === null || el === bodyEl ) {\nbreak;\n}\nstyle = window.getComputedStyle(el);\nif ( parseInt(style.zIndex, 10) >= 1000 || style.position === 'fixed' ) {\nrect = el.getBoundingClientRect();\nif ( rect.left <= tol && rect.top <= tol && (vw - rect.right) <= tol && (vh - rect.bottom) < tol ) {\nel.parentNode.removeChild(el);\ntstart = Date.now();\nel = document.elementFromPoint(vw/2, vh/2);\nbodyEl.style.setProperty('overflow', 'auto', 'important');\ndocEl.style.setProperty('overflow', 'auto', 'important');\ncontinue;\n}\n}\nel = el.parentNode;\n}\nif ( (Date.now() - tstart) < ttl ) {\ndelay = Math.min(delay + delayStep, 1000);\nsetTimeout(buster, delay);\n}\n};\nvar domReady = function(ev) {\nif ( ev ) {\ndocument.removeEventListener(ev.type, domReady);\n}\ntstart = Date.now();\nsetTimeout(buster, delay);\n};\nif ( document.readyState === 'loading' ) {\ndocument.addEventListener('DOMContentLoaded', domReady);\n} else {\ndomReady();\n}\n})();\n", ); assert_eq!(reserialized[6].name, "json-prune.js"); assert_eq!(reserialized[6].aliases, Vec::::new()); assert_eq!(reserialized[6].kind, ResourceType::Template); assert_eq!( std::str::from_utf8( &base64::decode(&reserialized[6].content).expect("decode base64 content") ).expect("convert to utf8 string"), "(function() {\nconst rawPrunePaths = '{{1}}';\nconst rawNeedlePaths = '{{2}}';\nconst prunePaths = rawPrunePaths !== '{{1}}' && rawPrunePaths !== ''\n? 
rawPrunePaths.split(/ +/)\n: [];\nlet needlePaths;\nlet log, reLogNeedle;\nif ( prunePaths.length !== 0 ) {\nneedlePaths = prunePaths.length !== 0 &&\nrawNeedlePaths !== '{{2}}' && rawNeedlePaths !== ''\n? rawNeedlePaths.split(/ +/)\n: [];\n} else {\nlog = console.log.bind(console);\nlet needle;\nif ( rawNeedlePaths === '' || rawNeedlePaths === '{{2}}' ) {\nneedle = '.?';\n} else if ( rawNeedlePaths.charAt(0) === '/' && rawNeedlePaths.slice(-1) === '/' ) {\nneedle = rawNeedlePaths.slice(1, -1);\n} else {\nneedle = rawNeedlePaths.replace(/[.*+?^${}()|[\\]\\\\]/g, '\\\\$&');\n}\nreLogNeedle = new RegExp(needle);\n}\nconst findOwner = function(root, path, prune = false) {\nlet owner = root;\nlet chain = path;\nfor (;;) {\nif ( typeof owner !== 'object' || owner === null ) {\nreturn false;\n}\nconst pos = chain.indexOf('.');\nif ( pos === -1 ) {\nif ( prune === false ) {\nreturn owner.hasOwnProperty(chain);\n}\nif ( chain === '*' ) {\nfor ( const key in owner ) {\nif ( owner.hasOwnProperty(key) === false ) { continue; }\ndelete owner[key];\n}\n} else if ( owner.hasOwnProperty(chain) ) {\ndelete owner[chain];\n}\nreturn true;\n}\nconst prop = chain.slice(0, pos);\nif (\nprop === '[]' && Array.isArray(owner) ||\nprop === '*' && owner instanceof Object\n) {\nconst next = chain.slice(pos + 1);\nlet found = false;\nfor ( const key of Object.keys(owner) ) {\nfound = findOwner(owner[key], next, prune) || found;\n}\nreturn found;\n}\nif ( owner.hasOwnProperty(prop) === false ) { return false; }\nowner = owner[prop];\nchain = chain.slice(pos + 1);\n}\n};\nconst mustProcess = function(root) {\nfor ( const needlePath of needlePaths ) {\nif ( findOwner(root, needlePath) === false ) {\nreturn false;\n}\n}\nreturn true;\n};\nconst pruner = function(o) {\nif ( log !== undefined ) {\nconst json = JSON.stringify(o, null, 2);\nif ( reLogNeedle.test(json) ) {\nlog('uBO:', location.hostname, json);\n}\nreturn o;\n}\nif ( mustProcess(o) === false ) { return o; }\nfor ( const path of 
prunePaths ) {\nfindOwner(o, path, true);\n}\nreturn o;\n};\nJSON.parse = new Proxy(JSON.parse, {\napply: function() {\nreturn pruner(Reflect.apply(...arguments));\n},\n});\nResponse.prototype.json = new Proxy(Response.prototype.json, {\napply: function() {\nreturn Reflect.apply(...arguments).then(o => pruner(o));\n},\n});\n})();\n", ); } #[test] fn test_scriptlet_resource_assembly() { let scriptlets_path = Path::new("data/test/fake-uBO-files/scriptlets.js"); #[allow(deprecated)] let resources = assemble_scriptlet_resources(scriptlets_path); let expected_resource_names = vec![ "abort-current-inline-script.js", "abort-on-property-read.js", "abort-on-property-write.js", "addEventListener-defuser.js", "addEventListener-logger.js", "json-prune.js", "nano-setInterval-booster.js", "nano-setTimeout-booster.js", "noeval-if.js", "remove-attr.js", "requestAnimationFrame-if.js", "set-constant.js", "setInterval-defuser.js", "no-setInterval-if.js", "setTimeout-defuser.js", "no-setTimeout-if.js", "webrtc-if.js", "window.name-defuser", "overlay-buster.js", "alert-buster.js", "gpt-defuser.js", "nowebrtc.js", "golem.de.js", "upmanager-defuser.js", "smartadserver.com.js", "adfly-defuser.js", "disable-newtab-links.js", "damoh-defuser.js", "twitch-videoad.js", "fingerprint2.js", "cookie-remover.js", ]; for name in expected_resource_names { assert!( resources .iter() .find(|resource| { match resource.kind { ResourceType::Template | ResourceType::Mime(MimeType::ApplicationJavascript) => { resource.name == name } _ => false, } }) .is_some(), "failed to find {}", name ); } let serialized = serde_json::to_string(&resources).expect("serialize resources"); let reserialized: Vec = serde_json::from_str(&serialized).expect("deserialize resources"); assert_eq!(reserialized[0].name, "abort-current-inline-script.js"); assert_eq!(reserialized[0].aliases, vec!["acis.js"]); assert_eq!(reserialized[0].kind, ResourceType::Template); assert_eq!(reserialized[18].name, "overlay-buster.js"); 
assert_eq!(reserialized[18].aliases, Vec::::new()); assert_eq!( reserialized[18].kind, ResourceType::Mime(MimeType::ApplicationJavascript) ); assert_eq!( std::str::from_utf8( &base64::decode(&reserialized[18].content).expect("decode base64 content") ).expect("convert to utf8 string"), "(function() {\nif ( window !== window.top ) {\nreturn;\n}\nvar tstart;\nvar ttl = 30000;\nvar delay = 0;\nvar delayStep = 50;\nvar buster = function() {\nvar docEl = document.documentElement,\nbodyEl = document.body,\nvw = Math.min(docEl.clientWidth, window.innerWidth),\nvh = Math.min(docEl.clientHeight, window.innerHeight),\ntol = Math.min(vw, vh) * 0.05,\nel = document.elementFromPoint(vw/2, vh/2),\nstyle, rect;\nfor (;;) {\nif ( el === null || el.parentNode === null || el === bodyEl ) {\nbreak;\n}\nstyle = window.getComputedStyle(el);\nif ( parseInt(style.zIndex, 10) >= 1000 || style.position === 'fixed' ) {\nrect = el.getBoundingClientRect();\nif ( rect.left <= tol && rect.top <= tol && (vw - rect.right) <= tol && (vh - rect.bottom) < tol ) {\nel.parentNode.removeChild(el);\ntstart = Date.now();\nel = document.elementFromPoint(vw/2, vh/2);\nbodyEl.style.setProperty('overflow', 'auto', 'important');\ndocEl.style.setProperty('overflow', 'auto', 'important');\ncontinue;\n}\n}\nel = el.parentNode;\n}\nif ( (Date.now() - tstart) < ttl ) {\ndelay = Math.min(delay + delayStep, 1000);\nsetTimeout(buster, delay);\n}\n};\nvar domReady = function(ev) {\nif ( ev ) {\ndocument.removeEventListener(ev.type, domReady);\n}\ntstart = Date.now();\nsetTimeout(buster, delay);\n};\nif ( document.readyState === 'loading' ) {\ndocument.addEventListener('DOMContentLoaded', domReady);\n} else {\ndomReady();\n}\n})();\n", ); } } adblock-0.8.12/src/resources/resource_storage.rs000064400000000000000000001026251046102023000200430ustar 00000000000000//! Storage and retrieval for redirect and scriptlet resources. 
use std::collections::HashMap; use once_cell::sync::Lazy; use regex::Regex; use thiserror::Error; use super::{MimeType, PermissionMask, Resource, ResourceType}; /// Unified resource storage for both redirects and scriptlets. #[derive(Default)] pub struct ResourceStorage { /// Stores each resource by its canonical name resources: HashMap, /// Stores mappings from aliases to their canonical resource names aliases: HashMap, } /// Formats `arg` such that it either is a JSON string, or is safe to insert within a JSON string, /// depending on `QUOTED`. /// /// Implementation modified from `json-rust` (MIT license). /// https://github.com/maciejhirsz/json-rust #[inline(always)] fn stringify_arg(arg: &str) -> String { const QU: u8 = b'"'; const BS: u8 = b'\\'; const BB: u8 = b'b'; const TT: u8 = b't'; const NN: u8 = b'n'; const FF: u8 = b'f'; const RR: u8 = b'r'; const UU: u8 = b'u'; const __: u8 = 0; // Look up table for characters that need escaping in a product string static ESCAPED: [u8; 256] = [ // 0 1 2 3 4 5 6 7 8 9 A B C D E F UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0 UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1 __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4 __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, 
__, // D __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F ]; #[inline(never)] fn write_string_complex(output: &mut Vec, string: &str, mut start: usize) { output.extend_from_slice(&string.as_bytes()[ .. start]); for (index, ch) in string.bytes().enumerate().skip(start) { let escape = ESCAPED[ch as usize]; if escape > 0 { output.extend_from_slice(&string.as_bytes()[start .. index]); output.extend_from_slice(&[b'\\', escape]); start = index + 1; } if escape == b'u' { output.extend_from_slice(format!("{:04x}", ch).as_bytes()); } } output.extend_from_slice(&string.as_bytes()[start ..]); } let mut output = Vec::with_capacity(arg.as_bytes().len() + 2); if QUOTED { output.push(b'"'); } 'process: { for (index, ch) in arg.bytes().enumerate() { if ESCAPED[ch as usize] > 0 { write_string_complex(&mut output, arg, index); break 'process; } } output.extend_from_slice(arg.as_bytes()); } if QUOTED { output.push(b'"'); } // unwrap safety: input is always valid UTF8; output processing only replaces some ASCII // characters with other valid ones return String::from_utf8(output).unwrap(); } impl ResourceStorage { /// Convenience constructor that allows building storage for many resources at once. Errors are /// silently consumed. pub fn from_resources(resources: impl IntoIterator) -> Self { let mut self_ = Self::default(); resources.into_iter().for_each(|resource| { self_ .add_resource(resource) .unwrap_or_else(|_e| { #[cfg(test)] eprintln!("Failed to add resource: {:?}", _e) }) }); self_ } /// Adds a resource to storage so that it can be retrieved later. 
pub fn add_resource(&mut self, resource: Resource) -> Result<(), AddResourceError> { if let ResourceType::Mime(content_type) = &resource.kind { if matches!(content_type, MimeType::FnJavascript) { return Err(AddResourceError::FnJavascriptNotSupported); } if !resource.dependencies.is_empty() && !content_type.supports_dependencies() { return Err(AddResourceError::ContentTypeDoesNotSupportDependencies); } // Ensure the resource contents are valid base64 (and utf8 if applicable) let decoded = base64::decode(&resource.content)?; if content_type.is_textual() { let _ = String::from_utf8(decoded)?; } } for ident in std::iter::once(&resource.name).chain(resource.aliases.iter()) { if self.resources.contains_key(ident) || self.aliases.contains_key(ident) { return Err(AddResourceError::NameAlreadyAdded); } } resource.aliases.iter().for_each(|alias| { self.aliases.insert(alias.clone(), resource.name.clone()); }); self.resources.insert(resource.name.clone(), resource); Ok(()) } /// Given the contents of a `+js(...)` filter part, return a scriptlet string appropriate for /// injection in a page. pub fn get_scriptlet_resource(&self, scriptlet_args: &str, filter_permission: PermissionMask) -> Result { // `unwrap` is safe because these are guaranteed valid at filter parsing. 
let scriptlet_args = parse_scriptlet_args(scriptlet_args).unwrap(); if scriptlet_args.is_empty() { return Err(ScriptletResourceError::MissingScriptletName); } let scriptlet_name = with_js_extension(scriptlet_args[0].as_ref()); let args = &scriptlet_args[1..]; if args.len() == 1 && args[0].starts_with('{') && args[0].ends_with('}') { return Err(ScriptletResourceError::ScriptletArgObjectSyntaxUnsupported); } let resource = self .get_internal_resource(&scriptlet_name) .ok_or(ScriptletResourceError::NoMatchingScriptlet)?; if !resource.permission.is_injectable_by(filter_permission) { return Err(ScriptletResourceError::InsufficientPermissions); } if !resource.kind.supports_scriptlet_injection() { return Err(ScriptletResourceError::ContentTypeNotInjectable); } let template = String::from_utf8(base64::decode(&resource.content)?)?; if template.starts_with("function") { // newer function-style resource: pass args using function call syntax use itertools::Itertools as _; Ok(format!("({})({})", template, args.iter().map(|arg| stringify_arg::(arg)).join(", "))) } else { // older template-style resource: replace first instances with args Ok(patch_template_scriptlet(template, args.iter().map(|arg| stringify_arg::(arg)))) } } /// Get a data-URL formatted resource appropriate for a `$redirect` response. pub fn get_redirect_resource(&self, resource_ident: &str) -> Option { let resource = self.get_internal_resource(resource_ident); resource.and_then(|resource| { if !resource.permission.is_default() { return None; } if !resource.kind.supports_redirect() { return None; } if let ResourceType::Mime(mime) = &resource.kind { Some(format!("data:{};base64,{}", mime, &resource.content)) } else { None } }) } /// Gets the resource associated with `resource_ident`, respecting aliases if necessary. 
fn get_internal_resource(&self, resource_ident: &str) -> Option<&Resource> { let resource = if let Some(resource) = self.resources.get(resource_ident) { Some(resource) } else if let Some(canonical_name) = self.aliases.get(resource_ident) { self.resources.get(canonical_name) } else { None }; resource } } /// Describes failure cases when preparing [`Resource`]s to be used for adblocking. #[derive(Debug, Error, PartialEq)] pub enum AddResourceError { #[error("invalid base64 content")] InvalidBase64Content, #[error("invalid utf-8 content")] InvalidUtf8Content, #[error("resource name already added")] NameAlreadyAdded, #[error("fn/javascript mime type is not yet supported")] FnJavascriptNotSupported, #[error("resource content type does not support dependencies")] ContentTypeDoesNotSupportDependencies, } impl From for AddResourceError { fn from(_: base64::DecodeError) -> Self { AddResourceError::InvalidBase64Content } } impl From for AddResourceError { fn from(_: std::string::FromUtf8Error) -> Self { AddResourceError::InvalidUtf8Content } } /// Describes failure cases when attempting to retrieve a resource for scriptlet injection. 
#[derive(Debug, Error, PartialEq)]
pub enum ScriptletResourceError {
    #[error("no scriptlet has the provided name")]
    NoMatchingScriptlet,
    #[error("no scriptlet name was provided")]
    MissingScriptletName,
    #[error("object syntax for scriptlet arguments is unsupported")]
    ScriptletArgObjectSyntaxUnsupported,
    #[error("scriptlet content was corrupted")]
    CorruptScriptletContent,
    #[error("resource content type cannot be used for a scriptlet injection")]
    ContentTypeNotInjectable,
    #[error("filter rule is not authorized to inject the intended scriptlet")]
    InsufficientPermissions,
}

impl From<base64::DecodeError> for ScriptletResourceError {
    fn from(_: base64::DecodeError) -> Self {
        Self::CorruptScriptletContent
    }
}

impl From<std::string::FromUtf8Error> for ScriptletResourceError {
    fn from(_: std::string::FromUtf8Error) -> Self {
        Self::CorruptScriptletContent
    }
}

/// Lazily compiled matchers for the placeholders `{{1}}` through `{{9}}`; index `n - 1` holds
/// the matcher for placeholder `{{n}}`.
static TEMPLATE_ARGUMENT_RE: [Lazy<Regex>; 9] = [
    Lazy::new(|| template_argument_regex(1)),
    Lazy::new(|| template_argument_regex(2)),
    Lazy::new(|| template_argument_regex(3)),
    Lazy::new(|| template_argument_regex(4)),
    Lazy::new(|| template_argument_regex(5)),
    Lazy::new(|| template_argument_regex(6)),
    Lazy::new(|| template_argument_regex(7)),
    Lazy::new(|| template_argument_regex(8)),
    Lazy::new(|| template_argument_regex(9)),
];

/// Builds a regex matching the literal placeholder `{{i}}`.
fn template_argument_regex(i: usize) -> Regex {
    Regex::new(&format!(r"\{{\{{{}\}}\}}", i)).unwrap()
}

/// Omit the 0th element of `args` (the scriptlet name) when calling this method.
///
/// The first occurrence of each placeholder `{{n}}` in `template` is replaced with the `n`th
/// argument. Arguments beyond the ninth are ignored.
fn patch_template_scriptlet(
    mut template: String,
    args: impl IntoIterator<Item = impl AsRef<str>>,
) -> String {
    // `zip` stops at the shorter side, so at most `TEMPLATE_ARGUMENT_RE.len()` args are used.
    for (placeholder_re, arg) in TEMPLATE_ARGUMENT_RE.iter().zip(args) {
        // `regex` treats `$` as a special character. Instead, `$$` is interpreted as a literal `$`
        // character.
        let replacement = arg.as_ref().replace('$', "$$");
        template = placeholder_re.replace(&template, replacement).into_owned();
    }
    template
}

/// Scriptlet injections must be JS resources. However, the `.js` extension may need to be added as
/// a canonicalization step, since it can be omitted in filter rules.
fn with_js_extension(scriptlet_name: &str) -> String {
    if scriptlet_name.ends_with(".js") {
        scriptlet_name.to_string()
    } else {
        format!("{}.js", scriptlet_name)
    }
}

/// Returns the index of the next unescaped separator, as well as a boolean indicating whether or
/// not the string must be postprocessed to normalize any separators along the way.
///
/// A separator counts as escaped when it is directly preceded by an odd number of backslashes.
/// The index is a byte offset into `s`; `None` means no unescaped separator exists.
///
/// # Panics
///
/// Panics if `separator` is a backslash.
fn index_next_unescaped_separator(s: &str, separator: char) -> (Option<usize>, bool) {
    assert!(separator != '\\');

    let mut search_start = 0;
    let mut needs_transform = false;

    // guaranteed to terminate:
    // - loop only proceeds if there is an odd number of escape characters
    // - search_start increases by at least 1 in that case
    // - s has finite length
    while search_start < s.len() {
        let rest = &s[search_start..];
        let i = match rest.find(separator) {
            Some(i) => i,
            // no match
            None => return (None, needs_transform),
        };

        // check how many escape characters there are before the matched separator
        let trailing_escapes = rest[..i]
            .bytes()
            .rev()
            .take_while(|&b| b == b'\\')
            .count();

        if trailing_escapes % 2 == 0 {
            // even number; all escape characters are literal backslashes
            return (Some(search_start + i), needs_transform);
        }

        // odd number; the last escape character is escaping this separator. Step over the whole
        // separator (it may be wider than one byte) so the next slice starts on a char boundary.
        search_start += i + separator.len_utf8();
        needs_transform = true;
    }

    // the search ran off the end of the string without finding an unescaped separator
    (None, needs_transform)
}

/// Replaces escaped instances of `separator` in `arg` with unescaped characters.
fn normalize_arg(arg: &str, separator: char) -> String { assert!(separator != '\\'); let mut output = String::with_capacity(arg.len()); let mut escaped = false; for i in arg.chars() { if i == '\\' { if escaped { escaped = false; output += "\\\\"; } else { escaped = true; } continue; } if escaped { if i != separator { output.push('\\'); } escaped = false; } output.push(i); } output } /// Parses the inner contents of a `+js(...)` operator of a cosmetic filter. /// /// Returns `None` if the contents are malformed. pub(crate) fn parse_scriptlet_args(mut args: &str) -> Option> { let mut args_vec = vec![]; if args.trim().is_empty() { return Some(args_vec); } // guaranteed to terminate: // - each branch of the `match` consumes at least 1 character from the beginning of `args` // - loop exits if `args` is empty loop { // n.b. `args.trim_start()` leaves an empty string if it's only whitespace if let Some(i) = args.find(|c: char| !c.is_whitespace()) { args = &args[i..]; } let (arg, needs_transform); match args.chars().next() { Some(qc) if qc == '"' || qc == '\'' || qc == '`' => { args = &args[1..]; let i; (i, needs_transform) = index_next_unescaped_separator(args, qc); if let Some(i) = i { arg = &args[..i]; args = &args[i+1..]; // consume whitespace following the quote if let Some(i) = args.find(|c: char| !c.is_whitespace()) { args = &args[i..]; } // consume comma separator if args.starts_with(',') { args = &args[1..]; } else if !args.is_empty() { // uBO pushes everything up to the next comma without escapes, but it's // very weird and probably not what the filter list author intended. // Treating it as an error for now. return None; } } else { // uBO pushes the entire argument, including the unmatched quote. Again, weird // and probably not intended. 
return None; } } Some(_) => { let i; (i, needs_transform) = index_next_unescaped_separator(args, ','); arg = args[..i.unwrap_or(args.len())].trim_end(); args = &args[i.map(|i| i + 1).unwrap_or(args.len())..]; } None => { // `args` is empty break; } } let arg = if needs_transform { normalize_arg(arg, ',') } else { arg.to_string() }; args_vec.push(arg); } Some(args_vec) } #[cfg(test)] mod arg_parsing_util_tests { use super::*; #[test] fn test_index_next_unescaped_separator() { assert_eq!(index_next_unescaped_separator(r#"``"#, '`'), (Some(0), false)); assert_eq!(index_next_unescaped_separator(r#"\``"#, '`'), (Some(2), true)); assert_eq!(index_next_unescaped_separator(r#"\\``"#, '`'), (Some(2), false)); assert_eq!(index_next_unescaped_separator(r#"\\\``"#, '`'), (Some(4), true)); assert_eq!(index_next_unescaped_separator(r#"\\\\``"#, '`'), (Some(4), false)); assert_eq!(index_next_unescaped_separator(r#"\`\\\``"#, '`'), (Some(6), true)); assert_eq!(index_next_unescaped_separator(r#"\\\`\``"#, '`'), (Some(6), true)); assert_eq!(index_next_unescaped_separator(r#"\\\`\\``"#, '`'), (Some(6), true)); assert_eq!(index_next_unescaped_separator(r#"\,test\,"#, ','), (None, true)) } #[test] fn test_normalize_arg() { assert_eq!(normalize_arg(r#"\`"#, '`'), r#"`"#); assert_eq!(normalize_arg(r#"\\\`"#, '`'), r#"\\`"#); assert_eq!(normalize_arg(r#"\`\\\`"#, '`'), r#"`\\`"#); assert_eq!(normalize_arg(r#"\\\`\`"#, '`'), r#"\\``"#); assert_eq!(normalize_arg(r#"\\\`\\`"#, '`'), r#"\\`\\`"#); } } #[cfg(test)] mod redirect_storage_tests { use super::*; #[test] fn get_resource_by_name() { let mut storage = ResourceStorage::default(); storage .add_resource( Resource::simple("name.js", MimeType::ApplicationJavascript, "resource data"), ) .unwrap(); assert_eq!( storage.get_redirect_resource("name.js"), Some(format!("data:application/javascript;base64,{}", base64::encode("resource data"))), ); } #[test] fn get_resource_by_alias() { let mut storage = ResourceStorage::default(); let mut r = 
Resource::simple("name.js", MimeType::ApplicationJavascript, "resource data"); r.aliases.push("alias.js".to_string()); storage .add_resource(r) .unwrap(); assert_eq!( storage.get_redirect_resource("alias.js"), Some(format!("data:application/javascript;base64,{}", base64::encode("resource data"))), ); } #[test] fn permissions() { let mut storage = ResourceStorage::default(); let mut r = Resource::simple("name.js", MimeType::ApplicationJavascript, "resource data"); r.aliases.push("alias.js".to_string()); r.permission = PermissionMask::from_bits(0b00000001); storage .add_resource(r) .unwrap(); assert_eq!( storage.get_redirect_resource("name.js"), None, ); assert_eq!( storage.get_redirect_resource("alias.js"), None, ); } } #[cfg(test)] mod scriptlet_storage_tests { use super::*; #[test] fn parse_argslist() { let args = parse_scriptlet_args("scriptlet, hello world, foobar").unwrap(); assert_eq!(args, vec!["scriptlet", "hello world", "foobar"]); } #[test] fn parse_argslist_noargs() { let args = parse_scriptlet_args("scriptlet").unwrap(); assert_eq!(args, vec!["scriptlet"]); } #[test] fn parse_argslist_empty() { let args = parse_scriptlet_args("").unwrap(); assert!(args.is_empty()); } #[test] fn parse_argslist_commas() { let args = parse_scriptlet_args("scriptletname, one\\, two\\, three, four").unwrap(); assert_eq!(args, vec!["scriptletname", "one, two, three", "four"]); } #[test] fn parse_argslist_badchars() { let args = parse_scriptlet_args( r##"scriptlet, "; window.location.href = bad.com; , '; alert("you're\, hacked"); , \u\r\l(bad.com) "##, ); assert_eq!(args, None); } #[test] fn parse_argslist_quoted() { let args = parse_scriptlet_args(r#"debug-scriptlet, 'test', '"test"', "test", "'test'", `test`, '`test`'"#).unwrap(); assert_eq!( args, vec![ r#"debug-scriptlet"#, r#"test"#, r#""test""#, r#"test"#, r#"'test'"#, r#"test"#, r#"`test`"#, ], ); let args = parse_scriptlet_args(r#"debug-scriptlet, 'test,test', '', "", ' ', ' test '"#).unwrap(); assert_eq!( args, vec![ 
r#"debug-scriptlet"#, r#"test,test"#, r#""#, r#""#, r#" "#, r#" test "#, ], ); let args = parse_scriptlet_args(r#"debug-scriptlet, test\,test, test\test, "test\test", 'test\test', "#).unwrap(); assert_eq!( args, vec![ r#"debug-scriptlet"#, r#"test,test"#, r#"test\test"#, r#"test\test"#, r#"test\test"#, r#""#, ], ); let args = parse_scriptlet_args(r#"debug-scriptlet, "test"#); assert_eq!(args, None); let args = parse_scriptlet_args(r#"debug-scriptlet, 'test'"test""#); assert_eq!(args, None); } #[test] fn parse_argslist_trailing_escaped_comma() { let args = parse_scriptlet_args(r#"remove-node-text, script, \,mr=function(r\,"#).unwrap(); assert_eq!(args, vec!["remove-node-text", "script", ",mr=function(r,"]); } #[test] fn get_patched_scriptlets() { let resources = ResourceStorage::from_resources([ Resource { name: "greet.js".to_string(), aliases: vec![], kind: ResourceType::Template, content: base64::encode("console.log('Hello {{1}}, my name is {{2}}')"), dependencies: vec![], permission: Default::default(), }, Resource { name: "alert.js".to_owned(), aliases: vec![], kind: ResourceType::Template, content: base64::encode("alert('{{1}}')"), dependencies: vec![], permission: Default::default(), }, Resource { name: "blocktimer.js".to_owned(), aliases: vec![], kind: ResourceType::Template, content: base64::encode("setTimeout(blockAds, {{1}})"), dependencies: vec![], permission: Default::default(), }, Resource { name: "null.js".to_owned(), aliases: vec![], kind: ResourceType::Template, content: base64::encode("(()=>{})()"), dependencies: vec![], permission: Default::default(), }, Resource { name: "set-local-storage-item.js".to_owned(), aliases: vec![], kind: ResourceType::Template, content: base64::encode(r#"{{1}} that dollar signs in {{2}} are untouched"#), dependencies: vec![], permission: Default::default(), }, ]); assert_eq!( resources.get_scriptlet_resource("greet, world, adblock-rust", Default::default()), Ok("console.log('Hello world, my name is 
adblock-rust')".into()) ); assert_eq!( resources.get_scriptlet_resource("alert, All systems are go!! ", Default::default()), Ok("alert('All systems are go!!')".into()) ); assert_eq!( resources.get_scriptlet_resource("alert, Uh oh\\, check the logs...", Default::default()), Ok("alert('Uh oh, check the logs...')".into()) ); assert_eq!( resources.get_scriptlet_resource(r#"alert, this has "quotes""#, Default::default()), Ok(r#"alert('this has \"quotes\"')"#.into()) ); assert_eq!( resources.get_scriptlet_resource("blocktimer, 3000", Default::default()), Ok("setTimeout(blockAds, 3000)".into()) ); assert_eq!(resources.get_scriptlet_resource("null", Default::default()), Ok("(()=>{})()".into())); assert_eq!( resources.get_scriptlet_resource("null, null", Default::default()), Ok("(()=>{})()".into()) ); assert_eq!( resources.get_scriptlet_resource("greet, everybody", Default::default()), Ok("console.log('Hello everybody, my name is {{2}}')".into()) ); assert_eq!( resources.get_scriptlet_resource("unit-testing", Default::default()), Err(ScriptletResourceError::NoMatchingScriptlet) ); assert_eq!( resources.get_scriptlet_resource("", Default::default()), Err(ScriptletResourceError::MissingScriptletName) ); assert_eq!( resources.get_scriptlet_resource("set-local-storage-item, Test, $remove$", Default::default()), Ok("Test that dollar signs in $remove$ are untouched".into()), ); } #[test] fn parse_template_file_format() { let resources = ResourceStorage::from_resources([ Resource { name: "abort-current-inline-script.js".into(), aliases: vec!["acis.js".into()], kind: ResourceType::Mime(MimeType::ApplicationJavascript), content: base64::encode("(function() {alert(\"hi\");})();"), dependencies: vec![], permission: Default::default(), }, Resource { name: "abort-on-property-read.js".into(), aliases: vec!["aopr.js".into()], kind: ResourceType::Template, content: base64::encode("(function() {confirm(\"Do you want to {{1}}?\");})();"), dependencies: vec![], permission: Default::default(), 
}, Resource { name: "googletagservices_gpt.js".into(), aliases: vec!["googletagservices.com/gpt.js".into(), "googletagservices-gpt".into()], kind: ResourceType::Template, content: base64::encode("function(a1 = '', a2 = '') {console.log(a1, a2)}"), dependencies: vec![], permission: Default::default(), }, ]); assert_eq!( resources.get_scriptlet_resource("aopr, code", Default::default()), Ok("(function() {confirm(\"Do you want to code?\");})();".to_owned()), ); assert_eq!( resources.get_scriptlet_resource("abort-on-property-read, write tests", Default::default()), Ok("(function() {confirm(\"Do you want to write tests?\");})();".to_owned()), ); assert_eq!( resources.get_scriptlet_resource("abort-on-property-read.js, block advertisements", Default::default()), Ok("(function() {confirm(\"Do you want to block advertisements?\");})();".to_owned()), ); assert_eq!( resources.get_scriptlet_resource("acis", Default::default()), Ok("(function() {alert(\"hi\");})();".to_owned()), ); assert_eq!( resources.get_scriptlet_resource("acis.js", Default::default()), Ok("(function() {alert(\"hi\");})();".to_owned()), ); assert_eq!( resources.get_scriptlet_resource("googletagservices_gpt.js", Default::default()), Ok("(function(a1 = '', a2 = '') {console.log(a1, a2)})()".to_owned()), ); assert_eq!( resources.get_scriptlet_resource("googletagservices_gpt, test1", Default::default()), Ok("(function(a1 = '', a2 = '') {console.log(a1, a2)})(\"test1\")".to_owned()), ); assert_eq!( resources.get_scriptlet_resource("googletagservices.com/gpt, test1, test2", Default::default()), Ok("(function(a1 = '', a2 = '') {console.log(a1, a2)})(\"test1\", \"test2\")".to_owned()), ); assert_eq!( resources.get_scriptlet_resource(r#"googletagservices.com/gpt.js, t"es't1, $te\st2$"#, Default::default()), Ok(r#"(function(a1 = '', a2 = '') {console.log(a1, a2)})("t\"es't1", "$te\\st2$")"#.to_owned()), ); // The alias does not have a `.js` extension, so it cannot be used for a scriptlet // injection (only as a 
redirect resource). assert_eq!( resources.get_scriptlet_resource(r#"googletagservices-gpt, t"es't1, te\st2"#, Default::default()), Err(ScriptletResourceError::NoMatchingScriptlet), ); // Object-style injection assert_eq!( resources.get_scriptlet_resource(r#"googletagservices.com/gpt, { "test": true }"#, Default::default()), Err(ScriptletResourceError::ScriptletArgObjectSyntaxUnsupported), ); } /// Currently, only 9 template arguments are supported - but reaching that limit should not /// cause a panic. #[test] fn patch_argslist_many_args() { let resources = ResourceStorage::from_resources([ Resource { name: "abort-current-script.js".into(), aliases: vec!["acs.js".into()], kind: ResourceType::Mime(MimeType::ApplicationJavascript), content: base64::encode("{{1}} {{2}} {{3}} {{4}} {{5}} {{6}} {{7}} {{8}} {{9}} {{10}} {{11}} {{12}}"), dependencies: vec![], permission: Default::default(), }, ]); let args = parse_scriptlet_args("acs, this, probably, is, going, to, break, brave, and, crash, it, instead, of, ignoring, it").unwrap(); assert_eq!(args, vec!["acs", "this", "probably", "is", "going", "to", "break", "brave", "and", "crash", "it", "instead", "of", "ignoring", "it"]); assert_eq!( resources.get_scriptlet_resource("acs, this, probably, is, going, to, break, brave, and, crash, it, instead, of, ignoring, it", Default::default()), Ok("this probably is going to break brave and crash {{10}} {{11}} {{12}}".to_string()), ); } #[test] fn permissions() { const PERM0: PermissionMask = PermissionMask::from_bits(0b00000001); const PERM1: PermissionMask = PermissionMask::from_bits(0b00000010); const PERM10: PermissionMask = PermissionMask::from_bits(0b00000011); let resources = ResourceStorage::from_resources([ Resource::simple("default-perms.js", MimeType::ApplicationJavascript, "default-perms"), Resource { name: "perm0.js".into(), aliases: vec!["0.js".to_string()], kind: ResourceType::Mime(MimeType::ApplicationJavascript), content: base64::encode("perm0"), dependencies: 
vec![], permission: PERM0, }, Resource { name: "perm1.js".into(), aliases: vec!["1.js".to_string()], kind: ResourceType::Mime(MimeType::ApplicationJavascript), content: base64::encode("perm1"), dependencies: vec![], permission: PERM1, }, Resource { name: "perm10.js".into(), aliases: vec!["10.js".to_string()], kind: ResourceType::Mime(MimeType::ApplicationJavascript), content: base64::encode("perm10"), dependencies: vec![], permission: PERM10, }, ]); fn test_perm(resources: &ResourceStorage, perm: PermissionMask, expect_ok: &[&str], expect_fail: &[&str]) { for ident in expect_ok { if ident.len() > 2 { assert_eq!( resources.get_scriptlet_resource(ident, perm), Ok(ident.to_string()), ); } else { assert_eq!( resources.get_scriptlet_resource(ident, perm), Ok(format!("perm{}", ident)), ); } } for ident in expect_fail { assert_eq!( resources.get_scriptlet_resource(ident, perm), Err(ScriptletResourceError::InsufficientPermissions), ); } } test_perm(&resources, Default::default(), &["default-perms"], &["perm0", "perm1", "perm10", "0", "1", "10"]); test_perm(&resources, PERM0, &["default-perms", "perm0", "0"], &["perm1", "perm10", "1", "10"]); test_perm(&resources, PERM1, &["default-perms", "perm1", "1"], &["perm0", "perm10", "0", "10"]); test_perm(&resources, PERM10, &["default-perms", "perm0", "perm1", "perm10", "0", "1", "10"], &[]); } } adblock-0.8.12/src/url_parser/mod.rs000064400000000000000000000122161046102023000154070ustar 00000000000000//! Simplified URL parsing infrastructure, including the domain resolver //! implementation if the `embedded-domain-resolver` feature is disabled. mod parser; // mod parser_regex; #[cfg(not(feature = "embedded-domain-resolver"))] static DOMAIN_RESOLVER: once_cell::sync::OnceCell> = once_cell::sync::OnceCell::new(); /// Sets the library's domain resolver implementation. /// /// If the `embedded-domain-resolver` feature is disabled and the library is /// used without this having been set, panics may occur! 
///
/// Will return the resolver if it has already been previously set.
#[cfg(not(feature = "embedded-domain-resolver"))]
pub fn set_domain_resolver(
    resolver: Box<dyn ResolvesDomain>,
) -> Result<(), Box<dyn ResolvesDomain>> {
    DOMAIN_RESOLVER.set(resolver)
}

/// Default `addr`-based domain resolution implementation used when the
/// `embedded-domain-resolver` feature is enabled.
#[cfg(feature = "embedded-domain-resolver")]
struct DefaultResolver;

#[cfg(feature = "embedded-domain-resolver")]
impl ResolvesDomain for DefaultResolver {
    fn get_host_domain(&self, host: &str) -> (usize, usize) {
        use addr::parser::DomainName;
        use addr::psl::List;

        if host.is_empty() {
            (0, 0)
        } else {
            match List.parse_domain_name(host) {
                // Unparseable hosts are treated as being their own domain.
                Err(_e) => (0, host.len()),
                Ok(domain) => {
                    let host_len = host.len();
                    // Fall back to the public suffix when there is no registrable root.
                    let domain_len = domain.root().unwrap_or_else(|| domain.suffix()).len();
                    // The domain is always the trailing `domain_len` bytes of the host.
                    (host_len - domain_len, host_len)
                }
            }
        }
    }
}

/// Required trait for any domain resolution implementation used with this
/// crate.
pub trait ResolvesDomain: Send + Sync {
    /// Return the start and end indices of the domain (eTLD+1) of the given hostname.
    ///
    /// If there isn't a valid domain, `(0, host.len())` should be returned.
    ///
    /// ```
    /// # use adblock::url_parser::ResolvesDomain;
    /// # /// I'd use DefaultResolver here, but I can't use private structs in doctests.
    /// # /// Enjoy this mock implementation instead :(
    /// # struct Resolver;
    /// # impl ResolvesDomain for Resolver {
    /// #     fn get_host_domain(&self, host: &str) -> (usize, usize) {
    /// #         match host {
    /// #             "api.m.example.com" => (6, 17),
    /// #             "a.b.co.uk" => (2, 9),
    /// #             _ => unreachable!()
    /// #         }
    /// #     }
    /// # }
    /// # let resolver = Resolver;
    /// let host = "api.m.example.com";
    /// let (start, end) = resolver.get_host_domain(host);
    /// assert_eq!(&host[start..end], "example.com");
    ///
    /// let host = "a.b.co.uk";
    /// let (start, end) = resolver.get_host_domain(host);
    /// assert_eq!(&host[start..end], "b.co.uk");
    /// ```
    fn get_host_domain(&self, host: &str) -> (usize, usize);
}

/// Parsed URL representation.
pub struct RequestUrl {
    pub url: String,
    // Byte offset in `url` where the scheme ends.
    schema_end: usize,
    // `(start, end)` byte offsets of the hostname within `url`.
    pub hostname_pos: (usize, usize),
    // `(start, end)` byte offsets of the domain, relative to the start of the hostname.
    domain: (usize, usize),
}

impl RequestUrl {
    /// The scheme portion of the URL, without the trailing `:`.
    pub fn schema(&self) -> &str {
        &self.url[..self.schema_end]
    }
    /// The hostname portion of the URL.
    pub fn hostname(&self) -> &str {
        &self.url[self.hostname_pos.0..self.hostname_pos.1]
    }
    /// The domain portion of the hostname, as reported by the domain resolver.
    pub fn domain(&self) -> &str {
        &self.url[self.hostname_pos.0 + self.domain.0..self.hostname_pos.0 + self.domain.1]
    }
}

/// Return the start and end indices of the domain of the given hostname.
///
/// # Panics
///
/// Panics if the `embedded-domain-resolver` feature is disabled and no resolver has been set.
pub(crate) fn get_host_domain(host: &str) -> (usize, usize) {
    #[cfg(not(feature = "embedded-domain-resolver"))]
    let domain_resolver = DOMAIN_RESOLVER.get().expect("An external domain resolver must be set when the `embedded-domain-resolver` feature is disabled.");

    #[cfg(feature = "embedded-domain-resolver")]
    let domain_resolver = DefaultResolver;

    domain_resolver.get_host_domain(host)
}

/// Return the string representation of the host (domain or IP address) for
/// this URL, if any together with the URL.
///
/// As part of hostname parsing, punycode decoding is used to convert URLs with
/// UTF characters to plain ASCII ones. Serialisation then contains this
/// decoded URL that is used for further matching.
pub fn parse_url(url: &str) -> Option<RequestUrl> {
    // Parse failures and URLs without a host both yield `None`.
    let parsed = parser::Hostname::parse(url).ok()?;
    let host = parsed.host_str()?;

    Some(RequestUrl {
        url: parsed.url_str().to_owned(),
        schema_end: parsed.scheme_end,
        hostname_pos: (parsed.host_start, parsed.host_end),
        domain: get_host_domain(host),
    })
}

#[cfg(all(test, feature = "embedded-domain-resolver"))]
mod embedded_domain_resolver_tests {
    use super::*;

    #[test]
    fn test_get_host_domain() {
        fn domain(host: &str) -> &str {
            let resolver = DefaultResolver;
            let (a, b) = resolver.get_host_domain(host);
            &host[a..b]
        }

        assert_eq!(domain("www.google.com"), "google.com");
        assert_eq!(domain("google.com."), "google.com.");
        assert_eq!(domain("a.b.co.uk"), "b.co.uk");
        assert_eq!(domain("foo.bar"), "foo.bar");
    }
}
adblock-0.8.12/src/url_parser/parser.rs000064400000000000000000000403041046102023000161230ustar 00000000000000// Copyright 2013-2016 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::error::Error; use std::fmt::{self, Formatter, Write}; use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS}; use std::ops::{Range, RangeFrom, RangeTo}; /// https://url.spec.whatwg.org/#fragment-percent-encode-set const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`'); /// https://url.spec.whatwg.org/#path-percent-encode-set const PATH: &AsciiSet = &FRAGMENT.add(b'#').add(b'?').add(b'{').add(b'}'); /// https://url.spec.whatwg.org/#userinfo-percent-encode-set pub(crate) const USERINFO: &AsciiSet = &PATH .add(b'/') .add(b':') .add(b';') .add(b'=') .add(b'@') .add(b'[') .add(b'\\') .add(b']') .add(b'^') .add(b'|'); #[derive(Clone)] pub(super) struct Hostname { serialization: String, // Components pub(super) scheme_end: usize, // Before ':' pub(super) host_start: usize, pub(super) host_end: usize, } impl Hostname { pub fn parse(input: &str) -> Result { Parser { serialization: String::with_capacity(input.len()), } .parse_url(input) } /// Equivalent to `url.host().is_some()`. /// /// # Examples /// /// ``` /// use url::Url; /// # use url::ParseError; /// /// # fn run() -> Result<(), ParseError> { /// let url = Url::parse("ftp://rms@example.com")?; /// assert!(url.has_host()); /// /// let url = Url::parse("unix:/run/foo.socket")?; /// assert!(!url.has_host()); /// /// let url = Url::parse("data:text/plain,Stuff")?; /// assert!(!url.has_host()); /// # Ok(()) /// # } /// # run().unwrap(); /// ``` fn has_host(&self) -> bool { self.host_end > self.host_start } /// Return the string representation of the host (domain or IP address) for this URL, if any. /// /// Non-ASCII domains are punycode-encoded per IDNA. /// IPv6 addresses are given between `[` and `]` brackets. /// /// Cannot-be-a-base URLs (typical of `data:` and `mailto:`) and some `file:` URLs /// don’t have a host. /// /// See also the `host` method. 
/// /// # Examples /// /// ``` /// use url::Url; /// # use url::ParseError; /// /// # fn run() -> Result<(), ParseError> { /// let url = Url::parse("https://127.0.0.1/index.html")?; /// assert_eq!(url.host_str(), Some("127.0.0.1")); /// /// let url = Url::parse("ftp://rms@example.com")?; /// assert_eq!(url.host_str(), Some("example.com")); /// /// let url = Url::parse("unix:/run/foo.socket")?; /// assert_eq!(url.host_str(), None); /// /// let url = Url::parse("data:text/plain,Stuff")?; /// assert_eq!(url.host_str(), None); /// # Ok(()) /// # } /// # run().unwrap(); /// ``` pub fn host_str(&self) -> Option<&str> { if self.has_host() { Some(self.slice(self.host_start..self.host_end)) } else { None } } pub fn url_str(&self) -> &str { &self.serialization } // Private helper methods: fn slice(&self, range: R) -> &str where R: RangeArg, { range.slice_of(&self.serialization) } } trait RangeArg { fn slice_of<'a>(&self, s: &'a str) -> &'a str; } impl RangeArg for Range { fn slice_of<'a>(&self, s: &'a str) -> &'a str { &s[self.start..self.end] } } impl RangeArg for RangeFrom { fn slice_of<'a>(&self, s: &'a str) -> &'a str { &s[self.start..] } } impl RangeArg for RangeTo { fn slice_of<'a>(&self, s: &'a str) -> &'a str { &s[..self.end] } } pub type ParseResult = Result; macro_rules! simple_enum_error { ($($name: ident => $description: expr,)+) => { /// Errors that can occur during parsing. #[derive(PartialEq, Eq, Clone, Copy, Debug)] pub enum ParseError { $( $name, )+ } impl Error for ParseError {} impl fmt::Display for ParseError { fn fmt(&self, fmt: &mut Formatter) -> fmt::Result { match *self { $( ParseError::$name => $description, )+ }.fmt(fmt) } } } } simple_enum_error! 
{ // EmptyHost => "empty host", IdnaError => "invalid international domain name", // InvalidPort => "invalid port number", // InvalidIpv4Address => "invalid IPv4 address", // InvalidIpv6Address => "invalid IPv6 address", // InvalidDomainCharacter => "invalid domain character", // HostParseError => "internal host parse error", RelativeUrlWithoutBase => "relative URL without a base", // RelativeUrlWithCannotBeABaseBase => "relative URL with a cannot-be-a-base base", // SetHostOnCannotBeABaseUrl => "a cannot-be-a-base URL doesn’t have a host to set", // Overflow => "URLs more than 4 GB are not supported", FileUrlNotSupported => "file URLs are not supported", ExpectedMoreChars => "Expected more characters", } impl From for ParseError { fn from(_: idna::Errors) -> ParseError { ParseError::IdnaError } } #[derive(Copy, Clone)] pub enum SchemeType { File, SpecialNotFile, NotSpecial, } impl SchemeType { pub fn is_special(self) -> bool { !matches!(self, SchemeType::NotSpecial) } pub fn from(s: &str) -> Self { match s { "http" | "https" | "ws" | "wss" | "ftp" | "gopher" => SchemeType::SpecialNotFile, "file" => SchemeType::File, _ => SchemeType::NotSpecial, } } } #[derive(Clone)] pub struct Input<'i> { chars: std::str::Chars<'i>, } impl<'i> Input<'i> { pub fn new(input: &'i str) -> Self { let input = input.trim_matches(c0_control_or_space); Input { chars: input.chars(), } } pub fn is_empty(&self) -> bool { self.clone().next().is_none() } fn starts_with(&self, p: P) -> bool { p.split_prefix(&mut self.clone()) } pub fn split_prefix(&self, p: P) -> Option { let mut remaining = self.clone(); if p.split_prefix(&mut remaining) { Some(remaining) } else { None } } fn count_matching bool>(&self, f: F) -> (u32, Self) { let mut count = 0; let mut remaining = self.clone(); loop { let mut input = remaining.clone(); if matches!(input.next(), Some(c) if f(c)) { remaining = input; count += 1; } else { return (count, remaining); } } } fn next_utf8(&mut self) -> Option<(char, &'i str)> { loop { 
let utf8 = self.chars.as_str(); match self.chars.next() { Some(c) => { if !matches!(c, '\t' | '\n' | '\r') { return Some((c, &utf8[..c.len_utf8()])); } } None => return None, } } } } pub trait Pattern { fn split_prefix(self, input: &mut Input) -> bool; } impl Pattern for char { fn split_prefix(self, input: &mut Input) -> bool { input.next() == Some(self) } } impl<'a> Pattern for &'a str { fn split_prefix(self, input: &mut Input) -> bool { for c in self.chars() { if input.next() != Some(c) { return false; } } true } } impl bool> Pattern for F { fn split_prefix(self, input: &mut Input) -> bool { input.next().map_or(false, self) } } impl<'i> Iterator for Input<'i> { type Item = char; fn next(&mut self) -> Option { self.chars.next() //by_ref().find(|&c| !matches!(c, '\t' | '\n' | '\r')) } } pub struct Parser { pub serialization: String, } impl Parser { /// https://url.spec.whatwg.org/#concept-basic-url-parser pub fn parse_url(mut self, input: &str) -> ParseResult { // println!("Parse {}", input); let input = Input::new(input); if let Ok(remaining) = self.parse_scheme(input.clone()) { return self.parse_with_scheme(remaining); } // No-scheme state Err(ParseError::RelativeUrlWithoutBase) } pub fn parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result, ()> { if input.is_empty() || !input.starts_with(|c: char| c.is_ascii_alphabetic()) { return Err(()); } debug_assert!(self.serialization.is_empty()); while let Some(c) = input.next() { match c { 'a'..='z' => self.serialization.push(c), 'A'..='Z' => self.serialization.push(c.to_ascii_lowercase()), '0'..='9' | '+' | '-' | '.' 
=> self.serialization.push(c), ':' => return Ok(input), _ => { self.serialization.clear(); return Err(()); } } } Err(()) } fn parse_with_scheme(mut self, input: Input) -> ParseResult { let scheme_end = self.serialization.len(); let scheme_type = SchemeType::from(&self.serialization); self.serialization.push(':'); match scheme_type { SchemeType::File => { // println!("Parse file - not supported"); Err(ParseError::FileUrlNotSupported) } SchemeType::SpecialNotFile => { // println!("Parse special, not file"); // special relative or authority state let (_, remaining) = input.count_matching(|c| matches!(c, '/' | '\\')); // special authority slashes state // println!("Parse after double slash {}", remaining.chars.as_str()); self.after_double_slash(remaining, scheme_type, scheme_end) } SchemeType::NotSpecial => { // println!("Parse non special {}", &self.serialization); self.parse_non_special(input, scheme_type, scheme_end) } } } /// Scheme other than file, http, https, ws, ws, ftp, gopher. fn parse_non_special( mut self, input: Input, scheme_type: SchemeType, scheme_end: usize, ) -> ParseResult { // path or authority state ( if let Some(input) = input.split_prefix("//") { return self.after_double_slash(input, scheme_type, scheme_end); } // Anarchist URL (no authority) let path_start = self.serialization.len(); let host_start = path_start; let host_end = path_start; self.serialization.push_str(input.chars.as_str()); let ser_remaining = self.serialization.as_mut_str().get_mut(host_end..); ser_remaining.map(|s| { s.make_ascii_lowercase(); &*s }); Ok(Hostname { serialization: self.serialization, scheme_end, host_start, host_end, }) } fn after_double_slash( mut self, input: Input, scheme_type: SchemeType, scheme_end: usize, ) -> ParseResult { self.serialization.push_str("//"); // authority state let (_username_end, remaining) = self.parse_userinfo(input, scheme_type)?; // host state let host_start = self.serialization.len(); let (host_end, remaining) = 
self.parse_host(remaining, scheme_type)?;
        self.serialization.push_str(remaining.chars.as_str());
        Ok(Hostname {
            serialization: self.serialization,
            scheme_end,
            host_start,
            host_end,
        })
    }

    /// Serializes the userinfo (`user[:password]@`) prefix of `input`, if any.
    ///
    /// The input is first scanned for the *last* `@` that appears before a
    /// `/`, `?` or `#` (or a `\` when the scheme is special).
    ///
    /// * No `@` found: nothing is written and `input` is returned untouched.
    /// * `@` is the very first character: nothing is written and the input
    ///   positioned just after that `@` is returned.
    /// * Otherwise every character before the last `@` is percent-encoded with
    ///   the `USERINFO` set and appended to `self.serialization`. The first `:`
    ///   separates username from password.
    ///
    /// Return (username_end, remaining)
    fn parse_userinfo<'i>(
        &mut self,
        mut input: Input<'i>,
        scheme_type: SchemeType,
    ) -> ParseResult<(usize, Input<'i>)> {
        // Pass 1: locate the last '@' and remember the input state right after it.
        let mut last_at = None;
        let mut remaining = input.clone();
        let mut char_count = 0;
        while let Some(c) = remaining.next() {
            match c {
                '@' => last_at = Some((char_count, remaining.clone())),
                '/' | '?' | '#' => break,
                '\\' if scheme_type.is_special() => break,
                _ => (),
            }
            char_count += 1;
        }
        let (mut userinfo_char_count, remaining) = match last_at {
            None => return Ok((self.serialization.len(), input)),
            Some((0, remaining)) => return Ok((self.serialization.len(), remaining)),
            Some(x) => x,
        };

        // Pass 2: re-read exactly the characters that precede the last '@'.
        let mut username_end = None;
        let mut has_password = false;
        let mut has_username = false;
        while userinfo_char_count > 0 {
            let (c, utf8_c) = input.next_utf8().ok_or(ParseError::ExpectedMoreChars)?;
            userinfo_char_count -= 1;
            if c == ':' && username_end.is_none() {
                // Start parsing password
                username_end = Some(self.serialization.len());
                // We don't add a colon if the password is empty
                if userinfo_char_count > 0 {
                    self.serialization.push(':');
                    has_password = true;
                }
            } else {
                if !has_password {
                    has_username = true;
                }
                self.serialization
                    .extend(utf8_percent_encode(utf8_c, USERINFO));
            }
        }
        let username_end = match username_end {
            Some(i) => i,
            None => self.serialization.len(),
        };
        // The separator is only emitted when some userinfo was actually written.
        if has_username || has_password {
            self.serialization.push('@');
        }
        Ok((username_end, remaining))
    }

    /// Appends the host portion of `input` to `self.serialization`.
    ///
    /// The host ends at the first `:` outside square brackets, at `/`, `?` or
    /// `#`, or at `\` when the scheme is special. An all-ASCII host is written
    /// verbatim; anything else goes through `idna::domain_to_ascii`, whose
    /// error is propagated to the caller.
    ///
    /// Returns `(host_end, remaining)`: the length of the serialization after
    /// the host was written, and the input positioned at the terminator.
    pub fn parse_host<'i>(
        &mut self,
        mut input: Input<'i>,
        scheme_type: SchemeType,
    ) -> ParseResult<(usize, Input<'i>)> {
        // Undo the Input abstraction here to avoid allocating in the common case
        // where the host part of the input does not contain any tab or newline
        let input_str = input.chars.as_str();
        let mut remaining = input.clone();
        let mut inside_square_brackets = false;
        let mut has_ignored_chars = false;
        let mut non_ignored_chars = 0;
        let mut bytes = 0;
        for c in input_str.chars() {
            match c {
                ':' if !inside_square_brackets => break,
                '\\' if scheme_type.is_special() => break,
                '/' | '?' | '#' => break,
                '\t' | '\n' | '\r' => {
                    has_ignored_chars = true;
                }
                '[' => {
                    inside_square_brackets = true;
                    non_ignored_chars += 1
                }
                ']' => {
                    inside_square_brackets = false;
                    non_ignored_chars += 1
                }
                _ => non_ignored_chars += 1,
            }
            remaining.next();
            bytes += c.len_utf8();
        }

        let replaced: String;
        let host_str;
        {
            // NOTE(review): this relies on `Input`'s iterator skipping the
            // tab/newline characters counted above - confirm against its definition.
            let host_input = input.by_ref().take(non_ignored_chars);
            if has_ignored_chars {
                replaced = host_input.collect();
                host_str = &*replaced
            } else {
                // Nothing to filter out: advance `input` and borrow the raw slice.
                for _ in host_input {}
                host_str = &input_str[..bytes]
            }
        }

        if host_str.is_ascii() {
            write!(&mut self.serialization, "{}", host_str).unwrap();
        } else {
            let encoded = idna::domain_to_ascii(host_str)?;
            write!(&mut self.serialization, "{}", encoded).unwrap();
        }

        let host_end = self.serialization.len();
        Ok((host_end, remaining))
    }
}

/// https://url.spec.whatwg.org/#c0-controls-and-space
#[inline]
fn c0_control_or_space(ch: char) -> bool {
    ch <= ' ' // U+0000 to U+0020
}
adblock-0.8.12/src/url_parser/parser_full.rs000064400000000000000000000003551046102023000171470ustar 00000000000000
use url::{Url};

/// Parses `url` with the `url` crate, returning `None` when parsing fails.
fn parse_url(url: &str) -> Option<Url> {
    url.parse::<Url>()
        .ok() // convert to Option
}

/// Returns the host of `url` as an owned string, or `None` when the URL does
/// not parse or has no host.
pub fn get_url_host(url: &str) -> Option<String> {
    parse_url(url)
        .and_then(|p| p.host_str().map(String::from))
}
adblock-0.8.12/src/url_parser/parser_regex.rs000064400000000000000000000072171046102023000173230ustar 00000000000000
use regex::Regex;

/// Locates the scheme and host of `url` with a regular expression.
///
/// Returns `(scheme_end, (host_start, host_end))` as byte offsets into `url`,
/// or `None` when the pattern does not match.
pub fn get_hostname_regex(url: &str) -> Option<(usize, (usize, usize))> {
    lazy_static! {
        // The `scheme` and `host` group names are looked up by name below.
        static ref HOSTNAME_REGEX_STR: &'static str = concat!(
            r"(?P<scheme>[a-z][a-z0-9+\-.]*)://",                       // Scheme
            r"(?:[a-z0-9\-._~%!$&'()*+,;=]+@)?",                        // User
            r"(?P<host>[\w\-.~%]+",                                     // Named host
            r"|\[[a-f0-9:.]+\]",                                        // IPv6 host
            r"|\[v[a-f0-9][a-z0-9\-._~%!$&'()*+,;=:]+\])",              // IPvFuture host
            // r"(?::[0-9]+)?",                                         // Port
            // r"(?:/[a-z0-9\-._~%!$&'()*+,;=:@]+)*/?",                 // Path
            // r"(?:\?[a-z0-9\-._~%!$&'()*+,;=:@/?]*)?",                // Query
            // r"(?:\#[a-z0-9\-._~%!$&'()*+,;=:@/?]*)?",                // Fragment
        );
        static ref HOST_REGEX: Regex = Regex::new(&HOSTNAME_REGEX_STR).unwrap();
    }

    let captures = HOST_REGEX.captures(url)?;
    let scheme_end = captures.name("scheme")?.end();
    let host = captures.name("host")?;
    Some((scheme_end, (host.start(), host.end())))
}

/// Extracts the host of `url`, converting a non-ASCII host to its ASCII form.
///
/// Returns `(url, scheme_end, (host_start, host_end))`. The returned URL equals
/// the input when the host is already ASCII; otherwise the host is replaced by
/// the output of `idna::uts46::to_ascii` and `host_end` is adjusted to the new
/// length. Everything before the host (scheme, `://`, optional userinfo) is
/// kept verbatim so the returned offsets always index the returned string.
/// `None` is returned when the regex does not match or the conversion fails.
pub fn get_url_host(url: &str) -> Option<(String, usize, (usize, usize))> {
    let decode_flags = idna::uts46::Flags {
        use_std3_ascii_rules: true,
        transitional_processing: true,
        verify_dns_length: true,
    };

    let (schema_end, (hostname_start, hostname_end)) = get_hostname_regex(url)?;
    let host = &url[hostname_start..hostname_end];
    if host.is_ascii() {
        return Some((url.to_owned(), schema_end, (hostname_start, hostname_end)));
    }

    let ascii_host = idna::uts46::to_ascii(host, decode_flags).ok()?;
    // Re-use the original prefix instead of rebuilding it as `scheme://`:
    // rebuilding would drop a `user@` part and leave `hostname_start`
    // pointing into the middle of the host.
    let normalised = format!(
        "{}{}{}",
        &url[..hostname_start],
        ascii_host,
        &url[hostname_end..]
    );
    Some((
        normalised,
        schema_end,
        (hostname_start, hostname_start + ascii_host.len()),
    ))
}

impl super::UrlParser for crate::request::Request {
    /// Builds a `RequestUrl` from `url`, or returns `None` when no host can be
    /// extracted from it.
    fn parse_url(url: &str) -> Option<super::RequestUrl> {
        let (url, schema_end, (host_start, host_end)) = get_url_host(url)?;
        // Compute the domain before `url` is moved into the struct.
        // NOTE(review): assumes `get_host_domain` returns an owned value - confirm.
        let domain = super::get_host_domain(&url[host_start..host_end]);
        Some(super::RequestUrl {
            url,
            schema_end,
            hostname_pos: (host_start, host_end),
            domain,
        })
    }
}

#[cfg(test)]
mod parse_tests {
    use super::*;

    #[test]
    // pattern
    fn parses_hostname() {
        assert_eq!(
            get_url_host("http://example.foo.edu.au"),
            Some(("http://example.foo.edu.au".to_owned(), 4, (7, 25)))
        );
        assert_eq!(
            get_url_host("http://example.foo.edu.sh"),
            Some(("http://example.foo.edu.sh".to_owned(), 4, (7, 25)))
        );
        assert_eq!(
            get_url_host("http://example.foo.nom.br"),
            Some(("http://example.foo.nom.br".to_owned(), 4, (7, 25)))
        );
        assert_eq!(
            get_url_host("http://example.foo.nom.br:80/"),
            Some(("http://example.foo.nom.br:80/".to_owned(), 4, (7, 25)))
        );
        assert_eq!(
            get_url_host("http://example.foo.nom.br:8080/hello?world=true"),
            Some((
                "http://example.foo.nom.br:8080/hello?world=true".to_owned(),
                4,
                (7, 25)
            ))
        );
        assert_eq!(
            get_url_host("http://example.foo.nom.br/hello#world"),
            Some((
                "http://example.foo.nom.br/hello#world".to_owned(),
                4,
                (7, 25)
            ))
        );
        assert_eq!(
            get_url_host("http://127.0.0.1:80"),
            Some(("http://127.0.0.1:80".to_owned(), 4, (7, 16)))
        );
        assert_eq!(
            get_url_host("http://[2001:470:20::2]"),
            Some(("http://[2001:470:20::2]".to_owned(), 4, (7, 23)))
        );
        assert_eq!(
            get_url_host("http://[2001:4860:4860::1:8888]"),
            Some(("http://[2001:4860:4860::1:8888]".to_owned(), 4, (7, 31)))
        );
    }
}
adblock-0.8.12/src/utils.rs000064400000000000000000000144061046102023000136150ustar 00000000000000
//! Common utilities used by the library. Some tests and benchmarks rely on this module having
//! public visibility.
#[cfg(target_pointer_width = "64")] use seahash::hash; #[cfg(target_pointer_width = "32")] use seahash::reference::hash; pub type Hash = u64; #[inline] pub fn fast_hash(input: &str) -> Hash { hash(input.as_bytes()) as Hash } #[inline] fn is_allowed_filter(ch: char) -> bool { ch.is_alphanumeric() || ch == '%' } pub(crate) const TOKENS_BUFFER_SIZE: usize = 128; pub(crate) const TOKENS_BUFFER_RESERVED: usize = 1; const TOKENS_MAX: usize = TOKENS_BUFFER_SIZE - TOKENS_BUFFER_RESERVED; fn fast_tokenizer_no_regex( pattern: &str, is_allowed_code: &dyn Fn(char) -> bool, skip_first_token: bool, skip_last_token: bool, tokens_buffer: &mut Vec, ) { // let mut tokens_buffer_index = 0; let mut inside: bool = false; let mut start = 0; let mut preceding_ch: Option = None; // Used to check if a '*' is not just before a token for (i, c) in pattern.char_indices() { if tokens_buffer.len() >= TOKENS_MAX { return; } if is_allowed_code(c) { if !inside { inside = true; start = i; } } else if inside { inside = false; // Should not be followed by '*' if (start != 0 || !skip_first_token) && i - start > 1 && c != '*' && preceding_ch != Some('*') { let hash = fast_hash(&pattern[start..i]); tokens_buffer.push(hash); } preceding_ch = Some(c); } else { preceding_ch = Some(c); } } if !skip_last_token && inside && pattern.len() - start > 1 && (preceding_ch != Some('*')) { let hash = fast_hash(&pattern[start..]); tokens_buffer.push(hash); } } pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut Vec) { fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer); } pub fn tokenize(pattern: &str) -> Vec { let mut tokens_buffer: Vec = Vec::with_capacity(TOKENS_BUFFER_SIZE); fast_tokenizer_no_regex( pattern, &is_allowed_filter, false, false, &mut tokens_buffer, ); tokens_buffer } pub(crate) fn tokenize_filter( pattern: &str, skip_first_token: bool, skip_last_token: bool, ) -> Vec { let mut tokens_buffer: Vec = Vec::with_capacity(TOKENS_BUFFER_SIZE); 
fast_tokenizer_no_regex( pattern, &is_allowed_filter, skip_first_token, skip_last_token, &mut tokens_buffer, ); tokens_buffer } pub(crate) fn bin_lookup(arr: &[T], elt: T) -> bool { arr.binary_search(&elt).is_ok() } #[cfg(test)] mod tests { use super::*; #[test] #[ignore] // won't match hard-coded values when using a different hash function fn fast_hash_matches_ts() { assert_eq!(fast_hash("hello world"), 4173747013); // cross-checked with the TS implementation assert_eq!(fast_hash("ello worl"), 2759317833); // cross-checked with the TS implementation assert_eq!(fast_hash(&"hello world"[1..10]), fast_hash("ello worl")); assert_eq!(fast_hash(&"hello world"[1..5]), fast_hash("ello")); } fn t(tokens: &[&str]) -> Vec { tokens.into_iter().map(|t| fast_hash(&t)).collect() } #[test] fn tokenize_filter_works() { assert_eq!( tokenize_filter("", false, false).as_slice(), t(&vec![]).as_slice() ); assert_eq!( tokenize_filter("", true, false).as_slice(), t(&vec![]).as_slice() ); assert_eq!( tokenize_filter("", false, true).as_slice(), t(&vec![]).as_slice() ); assert_eq!( tokenize_filter("", true, true).as_slice(), t(&vec![]).as_slice() ); assert_eq!( tokenize_filter("", false, false).as_slice(), t(&vec![]).as_slice() ); assert_eq!( tokenize_filter("foo/bar baz", false, false).as_slice(), t(&vec!["foo", "bar", "baz"]).as_slice() ); assert_eq!( tokenize_filter("foo/bar baz", true, false).as_slice(), t(&vec!["bar", "baz"]).as_slice() ); assert_eq!( tokenize_filter("foo/bar baz", true, true).as_slice(), t(&vec!["bar"]).as_slice() ); assert_eq!( tokenize_filter("foo/bar baz", false, true).as_slice(), t(&vec!["foo", "bar"]).as_slice() ); assert_eq!( tokenize_filter("foo////bar baz", false, true).as_slice(), t(&vec!["foo", "bar"]).as_slice() ); } #[test] fn tokenize_works() { assert_eq!(tokenize("").as_slice(), t(&vec![]).as_slice()); assert_eq!(tokenize("foo").as_slice(), t(&vec!["foo"]).as_slice()); assert_eq!( tokenize("foo/bar").as_slice(), t(&vec!["foo", "bar"]).as_slice() ); 
assert_eq!( tokenize("foo-bar").as_slice(), t(&vec!["foo", "bar"]).as_slice() ); assert_eq!( tokenize("foo.bar").as_slice(), t(&vec!["foo", "bar"]).as_slice() ); assert_eq!( tokenize("foo.barƬ").as_slice(), t(&vec!["foo", "barƬ"]).as_slice() ); // Tokens cannot be surrounded by * assert_eq!(tokenize("foo.barƬ*").as_slice(), t(&vec!["foo"]).as_slice()); assert_eq!( tokenize("*foo.barƬ").as_slice(), t(&vec!["barƬ"]).as_slice() ); assert_eq!(tokenize("*foo.barƬ*").as_slice(), t(&vec![]).as_slice()); } #[test] fn bin_lookup_works() { assert_eq!(bin_lookup(&[], 42), false); assert_eq!(bin_lookup(&[42], 42), true); assert_eq!(bin_lookup(&[1, 2, 3, 4, 42], 42), true); assert_eq!(bin_lookup(&[1, 2, 3, 4, 42], 1), true); assert_eq!(bin_lookup(&[1, 2, 3, 4, 42], 3), true); assert_eq!(bin_lookup(&[1, 2, 3, 4, 42], 43), false); assert_eq!(bin_lookup(&[1, 2, 3, 4, 42], 0), false); assert_eq!(bin_lookup(&[1, 2, 3, 4, 42], 5), false); } }