ruzstd-0.7.3/.cargo_vcs_info.json0000644000000001360000000000100123760ustar { "git": { "sha1": "6b371baa8b8656bb0a14056b71377df1f1f9e50c" }, "path_in_vcs": "" }ruzstd-0.7.3/.github/workflows/ci.yml000064400000000000000000000021231046102023000156770ustar 00000000000000on: [push, pull_request] name: CI jobs: tests: name: Check runs-on: ubuntu-latest steps: - name: Checkout sources uses: actions/checkout@v4 - name: Install stable toolchain uses: dtolnay/rust-toolchain@stable - name: Install cargo-hack uses: taiki-e/install-action@v2 with: tool: cargo-hack - run: cargo hack check --feature-powerset --exclude-features rustc-dep-of-std - run: cargo hack clippy --feature-powerset --exclude-features rustc-dep-of-std - run: cargo hack test --feature-powerset --exclude-features rustc-dep-of-std clippy-nightly: name: clippy nightly runs-on: ubuntu-latest steps: - name: Checkout sources uses: actions/checkout@v4 - name: Install nightly toolchain uses: dtolnay/rust-toolchain@nightly with: components: rustfmt, clippy - run: cargo +nightly fmt --all -- --check - run: cargo +nightly clippy --no-default-features -- -D warnings - run: cargo +nightly clippy -- -D warningsruzstd-0.7.3/.gitignore000064400000000000000000000001451046102023000131560ustar 00000000000000/target **/*.rs.bk Cargo.lock /local_corpus_files /orig-zstd fuzz_decodecorpus perf.data* fuzz/corpusruzstd-0.7.3/Cargo.lock0000644000000450110000000000100103520ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "aho-corasick" version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] [[package]] name = "anes" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstyle" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" [[package]] name = "autocfg" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" [[package]] name = "bumpalo" version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "cast" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "ciborium" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" dependencies = [ "ciborium-io", "ciborium-ll", "serde", ] [[package]] name = "ciborium-io" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" [[package]] name = "ciborium-ll" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" dependencies = [ 
"ciborium-io", "half", ] [[package]] name = "clap" version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" dependencies = [ "anstyle", "clap_lex", ] [[package]] name = "clap_lex" version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "compiler_builtins" version = "0.1.125" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd02a01d7bc069bed818e956600fe437ee222dd1d6ad92bfb9db87b43b71fd87" [[package]] name = "criterion" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" dependencies = [ "anes", "cast", "ciborium", "clap", "criterion-plot", "is-terminal", "itertools", "num-traits", "once_cell", "oorandom", "plotters", "rayon", "regex", "serde", "serde_derive", "serde_json", "tinytemplate", "walkdir", ] [[package]] name = "criterion-plot" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", "itertools", ] [[package]] name = "crossbeam-deque" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-utils" version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crunchy" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" [[package]] name = "either" version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2" [[package]] name = "getrandom" version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" dependencies = [ "cfg-if", "libc", "wasi", ] [[package]] name = "half" version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" dependencies = [ "cfg-if", "crunchy", ] [[package]] name = "hermit-abi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" [[package]] name = "is-terminal" version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" dependencies = [ "hermit-abi", "libc", "windows-sys", ] [[package]] name = "itertools" 
version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" dependencies = [ "either", ] [[package]] name = "itoa" version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "js-sys" version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] [[package]] name = "libc" version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "log" version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "memchr" version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" [[package]] name = "num-traits" version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" dependencies = [ "autocfg", ] [[package]] name = "once_cell" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "oorandom" version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "plotters" version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" dependencies = [ "num-traits", "plotters-backend", "plotters-svg", "wasm-bindgen", "web-sys", ] [[package]] name = "plotters-backend" version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" [[package]] name = "plotters-svg" version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" dependencies = [ "plotters-backend", ] [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] [[package]] name = "rand" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", "rand_chacha", "rand_core", ] [[package]] name = "rand_chacha" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", "rand_core", ] [[package]] name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom", ] [[package]] name = "rayon" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" dependencies = [ "either", "rayon-core", ] [[package]] name = "rayon-core" version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", "crossbeam-utils", ] [[package]] name = "regex" version = "1.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" dependencies = [ "aho-corasick", "memchr", "regex-automata", "regex-syntax", ] [[package]] name = "regex-automata" version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] [[package]] name = "regex-syntax" version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" [[package]] name = "rustc-std-workspace-alloc" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff66d57013a5686e1917ed6a025d54dd591fcda71a41fe07edf4d16726aefa86" [[package]] name = "rustc-std-workspace-core" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1956f5517128a2b6f23ab2dadf1a976f4f5b27962e7724c2bf3d45e539ec098c" [[package]] name = "ruzstd" version = "0.7.3" dependencies = [ "compiler_builtins", "criterion", "rand", "rustc-std-workspace-alloc", "rustc-std-workspace-core", "twox-hash", ] [[package]] name = "ryu" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" [[package]] name = "same-file" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" dependencies = [ "winapi-util", ] [[package]] name = "serde" version = "1.0.198" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9846a40c979031340571da2545a4e5b7c4163bdae79b301d5f86d03979451fcc" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.198" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e88edab869b01783ba905e7d0153f9fc1a6505a96e4ad3018011eedb838566d9" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "serde_json" version = "1.0.116" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e17db7126d17feb94eb3fad46bf1a96b034e8aacbc2e775fe81505f8b0b2813" dependencies = [ "itoa", "ryu", "serde", ] [[package]] name = "static_assertions" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "syn" version = "2.0.60" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "909518bc7b1c9b779f1bbf07f2929d35af9f0f37e47c6e9ef7f9dddc1e1821f3" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "tinytemplate" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" dependencies = [ "serde", "serde_json", ] [[package]] name = "twox-hash" version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ "cfg-if", "static_assertions", ] [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "walkdir" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" dependencies = [ "same-file", "winapi-util", ] [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", ] [[package]] name = "wasm-bindgen-macro-support" version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "web-sys" version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", ] [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" 
dependencies = [ "winapi", ] [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_gnullvm", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" [[package]] name = "windows_i686_gnullvm" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" ruzstd-0.7.3/Cargo.toml0000644000000037150000000000100104020ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. 
[package] edition = "2018" name = "ruzstd" version = "0.7.3" authors = ["Moritz Borcherding "] build = false exclude = [ "decodecorpus_files/*", "dict_tests/*", "fuzz_decodecorpus/*", ] autobins = false autoexamples = false autotests = false autobenches = false description = "A decoder for the zstd compression format" homepage = "https://github.com/KillingSpark/zstd-rs" readme = "Readme.md" keywords = [ "zstd", "zstandard", "decompression", ] categories = ["compression"] license = "MIT" repository = "https://github.com/KillingSpark/zstd-rs" [lib] name = "ruzstd" path = "src/lib.rs" [[bin]] name = "zstd" path = "src/bin/zstd.rs" required-features = ["std"] [[bin]] name = "zstd_stream" path = "src/bin/zstd_stream.rs" required-features = ["std"] [[bench]] name = "reversedbitreader_bench" path = "benches/reversedbitreader_bench.rs" harness = false [dependencies.alloc] version = "1.0.0" optional = true package = "rustc-std-workspace-alloc" [dependencies.compiler_builtins] version = "0.1.2" optional = true [dependencies.core] version = "1.0.0" optional = true package = "rustc-std-workspace-core" [dependencies.twox-hash] version = "1.6" optional = true default-features = false [dev-dependencies.criterion] version = "0.5" [dev-dependencies.rand] version = "0.8.5" features = ["small_rng"] [features] default = [ "hash", "std", ] hash = ["dep:twox-hash"] rustc-dep-of-std = [ "dep:compiler_builtins", "dep:core", "dep:alloc", ] std = [] ruzstd-0.7.3/Cargo.toml.orig000064400000000000000000000026261046102023000140630ustar 00000000000000[package] name = "ruzstd" version = "0.7.3" authors = ["Moritz Borcherding "] edition = "2018" license = "MIT" homepage = "https://github.com/KillingSpark/zstd-rs" repository = "https://github.com/KillingSpark/zstd-rs" description = "A decoder for the zstd compression format" exclude = ["decodecorpus_files/*", "dict_tests/*", "fuzz_decodecorpus/*"] readme = "Readme.md" keywords = ["zstd", "zstandard", "decompression"] categories = ["compression"] [dependencies] twox-hash = { version = "1.6", default-features = false, optional = true } # Internal feature, only used when building as part of libstd, not part of the # stable interface of this crate. compiler_builtins = { version = "0.1.2", optional = true } core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" } alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" } [dev-dependencies] criterion = "0.5" rand = { version = "0.8.5", features = ["small_rng"] } [features] default = ["hash", "std"] hash = ["dep:twox-hash"] std = [] # Internal feature, only used when building as part of libstd, not part of the # stable interface of this crate. 
rustc-dep-of-std = ["dep:compiler_builtins", "dep:core", "dep:alloc"] [[bench]] name = "reversedbitreader_bench" harness = false [[bin]] name = "zstd" required-features = ["std"] [[bin]] name = "zstd_stream" required-features = ["std"] ruzstd-0.7.3/Changelog.md # Changelog This document records the changes made between versions, starting with version 0.5.0 # After 0.5.0 * Make the hashing checksum optional (thanks to [@tamird](https://github.com/tamird)) * breaking change as the public API changes based on features * The FrameDecoder is now Send + Sync (RingBuffer impls these traits now) # After 0.6.0 * Small fix in the zstd binary, progress tracking was slightly off for skippable frames resulting in an error only when the last frame in a file was skippable * Small performance improvement by reorganizing code with `#[cold]` annotations * Documentation for `StreamDecoder` mentioning the limitations around multiple frames (https://github.com/Sorseg) * Documentation around skippable frames (https://github.com/Sorseg) * **Breaking** `StreamDecoder` API changes to get access to the inner parts (https://github.com/ifd3f) * Big internal documentation contribution (https://github.com/zleyyij) * Dropped derive_more as a dependency (https://github.com/xd009642) * Small improvement by removing the error cases from the reverse bitreader (and making sure invalid requests can't even happen) # After 0.7.0 * Fix for drain_to functions into limited targets (https://github.com/michaelkirk) # After 0.7.1 * Remove byteorder dependency (https://github.com/workingjubilee) * Preparations to become a std dependency (https://github.com/workingjubilee) # After 0.7.2 * Soundness fix in decoding::RingBuffer. The lengths of the different regions were sometimes calculated wrongly, resulting in reads of heap memory not belonging to that ringbuffer * Fixed by https://github.com/paolobarbolini * Affected versions: 0.7.0 up to and including 0.7.2 # After 0.7.3 ruzstd-0.7.3/LICENSE MIT License Copyright (c) 2019 Moritz Borcherding Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
ruzstd-0.7.3/Readme.md # Ruzstd (a pure rust zstd decoder) [![Released API docs](https://docs.rs/ruzstd/badge.svg)](https://docs.rs/ruzstd) [![CI](https://github.com/killingspark/zstd-rs/workflows/CI/badge.svg)](https://github.com/killingspark/zstd-rs/actions?query=workflow%3ACI) # What is this A feature-complete decoder for the zstd compression format as defined in: [This document](https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md). It is NOT a compressor. I don't plan on implementing that part either, at least not in the near future. (If someone is motivated enough I will of course accept a pull-request!) This crate might look like it is not active, this is because there isn't really anything to do anymore, unless a bug is found or a new API feature is requested. I will of course respond to and look into issues! # Current Status Feature complete on the decoder side. In terms of speed it is still behind the original C implementation which has a rust binding located [here](https://github.com/gyscos/zstd-rs). Actively maintained but no new features currently planned. If you have suggestions please open an issue and I'll consider it. ## Speed Measuring with the 'time' utility the original zstd and my decoder both decoding the same enwik9.zst file from a ramfs, my decoder is about 3.5 times slower. Enwik9 is highly compressible, for less compressible data (like a ubuntu installation .iso) my decoder comes close to only being 1.4 times slower. ## Can do: 1. Parse all files in /decodecorpus_files. These were generated with [decodecorpus](https://github.com/facebook/zstd/tree/dev/tests) by the original zstd developers 1. Decode all of them correctly into the output buffer 1. Decode all the decode_corpus files (1000+) I created locally 1. Calculate checksums 1. Act as a `zstd -c -d` drop-in replacement 1. Can be compiled in a no-std environment that provides alloc ## Cannot do This decoder is pretty much feature complete. If there are any wishes for new APIs or bug reports please file an issue, I will gladly take a look! ## Roadmap 1. More Performance optimizations (targets would be sequence_decoding and reverse_bitreader::get_bits. Those account for about 50% of the whole time used) ## Testing Tests take two forms. 1. Tests using well-formed files that have to decode correctly and are checked against their originals 1. Tests using malformed input that have been generated by the fuzzer. These don't have to decode (they are garbage) but they must not make the decoder panic ## Fuzzing Fuzzing has been done with cargo fuzz. Each time it crashes the decoder I fixed the issue and added the offending input as a test. It's checked into the repo in the fuzz/artifacts/fuzz_target_1 directory. Those get tested in the fuzz_regressions.rs test. At the time of writing the fuzzer was able to run for over 12 hours on the random input without finding new crashes. Obviously this doesn't mean there are no bugs but the common ones are probably fixed. Fuzzing has been done on 1. Random input with no initial corpus 2. The \*.zst in /fuzz_decodecorpus ### You wanna help fuzz? Use `cargo +nightly fuzz run decode` to run the fuzzer. It is seeded with files created with decodecorpus. If (when) the fuzzer finds a crash it will be saved to the artifacts dir by the fuzzer. Run `cargo test artifacts` to run the artifacts tests. This will tell you where the decoder panics exactly.
If you are able to fix the issue please feel free to do a pull request. If not please still submit the offending input and I will see how to fix it myself. # How can you use it? Additionally to the descriptions and the docs you can have a look at the zstd / zstd_streaming binaries. They showcase how this library can be used. ## Easy The easiest is to wrap the io::Read into a StreamingDecoder which itself implements io::Read. It will decode blocks as necessary to fulfill the read requests ```rust let mut f = File::open(path).unwrap(); let mut decoder = StreamingDecoder::new(&mut f).unwrap(); let mut result = Vec::new(); decoder.read_to_end(&mut result).unwrap(); ``` This might be a problem if you are accepting user provided data. Frames can be REALLY big when decoded. If this is the case you should either check how big the frame actually is or use the memory efficient approach described below. ## Memory efficient If memory is a concern you can decode frames partially. There are two ways to do this: #### Streaming decoder Use the StreamingDecoder and use a while loop to fill your buffer (see src/bin/zstd_stream.rs for an example). This is the recommended approach. #### Use the lower level FrameDecoder For an example see the src/bin/zstd.rs file. Basically you can decode the frame until either a given block count has been decoded or the decodebuffer has reached a certain size. Then you can collect no longer needed bytes from the buffer and do something with them, discard them and resume decoding the frame in a loop until the frame has been decoded completely. # Contributing Contributions will be published under the same MIT license as this project. Please make an entry in the Changelog.md file when you make a PR.ruzstd-0.7.3/benches/reversedbitreader_bench.rs000064400000000000000000000020661046102023000200070ustar 00000000000000use criterion::{black_box, criterion_group, criterion_main, Criterion}; use rand::{Rng, SeedableRng}; use ruzstd::decoding::bit_reader_reverse::BitReaderReversed; fn do_all_accesses(br: &mut BitReaderReversed, accesses: &[u8]) -> u64 { let mut sum = 0; for x in accesses { sum += br.get_bits(*x); } let _ = black_box(br); sum } fn criterion_benchmark(c: &mut Criterion) { const DATA_SIZE: usize = 1024 * 1024; let mut rng = rand::rngs::SmallRng::seed_from_u64(0xDEADBEEF); let mut rand_vec = vec![]; for _ in 0..DATA_SIZE { rand_vec.push(rng.gen()); } let mut access_vec = vec![]; let mut br = BitReaderReversed::new(&rand_vec); while br.bits_remaining() > 0 { let x = rng.gen_range(0..20); br.get_bits(x); access_vec.push(x); } c.bench_function("reversed bitreader", |b| { b.iter(|| { br.reset(&rand_vec); do_all_accesses(&mut br, &access_vec) }) }); } criterion_group!(benches, criterion_benchmark); criterion_main!(benches); ruzstd-0.7.3/optimizations.md000064400000000000000000000027711046102023000144300ustar 00000000000000# Optimizations This document tracks which optimizations have been done after the initial implementation passed corpus tests and a good amount of fuzzing. ## Introducing more unsafe code: These optimizations introduced more unsafe code. These should yield significant improvements, or else they are not really worth it. 
### Optimizing bitreader with byteorder which uses ptr::copy_nonoverlapping * Reverse bitreader_reversed::get_bits was identified by linux perf tool using about 36% of the whole time * Benchmark: decode enwik9 * Before: about 14.7 seconds * After: about 12.2 seconds with about 25% of the time used for get_bits() ### Optimizing decodebuffer::repeat with ptr::copy_nonoverlapping * decodebuffer::repeat was identified by linux perf tool using about 28% of the whole time * Benchmark: decode enwik9 * Before: about 9.9 seconds * After: about 9.4 seconds ### Use custom ringbuffer in the decodebuffer The decode buffer must be able to do two things efficiently * Collect bytes from the front * Copy bytes from the contents to the end The stdlib's VecDeque and Vec can each do one but not the other efficiently. So a custom implementation of a ringbuffer was written. ## Introducing NO additional unsafe code These are just nice to have ### Even better bitreaders Studying this material led to a big improvement in bitreader speed * https://fgiesen.wordpress.com/2018/02/19/reading-bits-in-far-too-many-ways-part-1/ * https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/ ruzstd-0.7.3/src/bin/zstd.rs extern crate ruzstd; use std::fs::File; use std::io::Read; use std::io::Seek; use std::io::SeekFrom; use std::io::Write; use ruzstd::frame::ReadFrameHeaderError; use ruzstd::frame_decoder::FrameDecoderError; struct StateTracker { bytes_used: u64, frames_used: usize, valid_checksums: usize, invalid_checksums: usize, file_pos: u64, file_size: u64, old_percentage: i8, } fn main() { let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect(); let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect(); file_paths.remove(0); if !flags.contains(&"-d".to_owned()) { eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag"); return; } if !flags.contains(&"-c".to_owned()) { eprintln!("This zstd implementation only supports output on the stdout. Please add a \"-c\" flag and pipe the output into a file"); return; } if flags.len() != 2 { eprintln!( "No flags other than -d and -c are currently implemented.
Flags used: {:?}", flags ); return; } let mut frame_dec = ruzstd::FrameDecoder::new(); for path in file_paths { eprintln!("File: {}", path); let mut f = File::open(path).unwrap(); let mut tracker = StateTracker { bytes_used: 0, frames_used: 0, valid_checksums: 0, invalid_checksums: 0, file_size: f.metadata().unwrap().len(), file_pos: 0, old_percentage: -1, }; let batch_size = 1024 * 1024 * 10; let mut result = vec![0; batch_size]; while tracker.file_pos < tracker.file_size { match frame_dec.reset(&mut f) { Err(FrameDecoderError::ReadFrameHeaderError(ReadFrameHeaderError::SkipFrame { magic_number: magic_num, length: skip_size, })) => { eprintln!("Found a skippable frame with magic number: {magic_num} and size: {skip_size}"); tracker.file_pos = f.stream_position().unwrap(); tracker.file_pos += skip_size as u64; f.seek(SeekFrom::Current(skip_size as i64)).unwrap(); continue; } other => other.unwrap(), } tracker.frames_used += 1; while !frame_dec.is_finished() { frame_dec .decode_blocks(&mut f, ruzstd::BlockDecodingStrategy::UptoBytes(batch_size)) .unwrap(); if frame_dec.can_collect() > batch_size { let x = frame_dec.read(result.as_mut_slice()).unwrap(); tracker.file_pos = f.stream_position().unwrap(); do_something(&result[..x], &mut tracker); } } // handle the last chunk of data while frame_dec.can_collect() > 0 { let x = frame_dec.read(result.as_mut_slice()).unwrap(); tracker.file_pos = f.stream_position().unwrap(); do_something(&result[..x], &mut tracker); } #[cfg(feature = "hash")] if let Some(chksum) = frame_dec.get_checksum_from_data() { if frame_dec.get_calculated_checksum().unwrap() != chksum { tracker.invalid_checksums += 1; eprintln!( "Checksum did not match in frame {}! From data: {}, calculated while decoding: {}", tracker.frames_used, chksum, frame_dec.get_calculated_checksum().unwrap() ); } else { tracker.valid_checksums += 1; } } } eprintln!( "\nDecoded frames: {} bytes: {}", tracker.frames_used, tracker.bytes_used ); if tracker.valid_checksums == 0 && tracker.invalid_checksums == 0 { eprintln!("No checksums to test"); } else { eprintln!( "{} of {} checksums are ok!", tracker.valid_checksums, tracker.valid_checksums + tracker.invalid_checksums, ); } } } fn do_something(data: &[u8], s: &mut StateTracker) { //Do something. Like writing it to a file or to stdout... std::io::stdout().write_all(data).unwrap(); s.bytes_used += data.len() as u64; let percentage = (s.file_pos * 100) / s.file_size; if percentage as i8 != s.old_percentage { eprint!("\r"); eprint!("{} % done", percentage); s.old_percentage = percentage as i8; } } ruzstd-0.7.3/src/bin/zstd_stream.rs000064400000000000000000000025601046102023000154350ustar 00000000000000extern crate ruzstd; use std::fs::File; use std::io::{Read, Write}; fn main() { let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect(); let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect(); file_paths.remove(0); if !flags.contains(&"-d".to_owned()) { eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag"); return; } if !flags.contains(&"-c".to_owned()) { eprintln!("This zstd implementation only supports output on the stdout. Please add a \"-c\" flag and pipe the output into a file"); return; } if flags.len() != 2 { eprintln!( "No flags other than -d and -c are currently implemented. 
Flags used: {:?}", flags ); return; } for path in file_paths { eprintln!("File: {}", path); let f = File::open(path).unwrap(); let mut buf_read = std::io::BufReader::new(f); let mut decoder = ruzstd::StreamingDecoder::new(&mut buf_read).unwrap(); let mut buf = [0u8; 1024 * 1024]; let mut stdout = std::io::stdout(); while !decoder.decoder.is_finished() || decoder.decoder.can_collect() > 0 { let bytes = decoder.read(&mut buf[..]).unwrap(); stdout.write_all(&buf[..bytes]).unwrap(); } } } ruzstd-0.7.3/src/blocks/block.rs //! Block header definitions. /// There are 4 different kinds of blocks, and the type of block influences the meaning of `Block_Size`. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum BlockType { /// An uncompressed block. Raw, /// A single byte, repeated `Block_Size` times (Run Length Encoding). RLE, /// A Zstandard compressed block. `Block_Size` is the length of the compressed data. Compressed, /// This is not a valid block, and this value should not be used. /// If this value is present, it should be considered corrupted data. Reserved, } impl core::fmt::Display for BlockType { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { match self { BlockType::Compressed => write!(f, "Compressed"), BlockType::Raw => write!(f, "Raw"), BlockType::RLE => write!(f, "RLE"), BlockType::Reserved => write!(f, "Reserved"), } } } /// A representation of a single block header. As well as containing a frame header, /// each Zstandard frame contains one or more blocks. pub struct BlockHeader { /// Whether this block is the last block in the frame. /// It may be followed by an optional `Content_Checksum` if it is. pub last_block: bool, pub block_type: BlockType, /// The size of the decompressed data. If the block type /// is [BlockType::Reserved] or [BlockType::Compressed], /// this value is set to zero and should not be referenced. pub decompressed_size: u32, /// The size of the block. If the block is [BlockType::RLE], /// this value will be 1. pub content_size: u32, } ruzstd-0.7.3/src/blocks/literals_section.rs //! Utilities and representations for the first half of a block, the literals section. //! It contains data that is then copied from by the sequences section. use super::super::decoding::bit_reader::{BitReader, GetBitsError}; /// A compressed block consists of two sections, a literals section, and a sequences section. /// /// This is the first of those two sections. A literal is just any arbitrary data, and it is copied by the sequences section pub struct LiteralsSection { /// - If this block is of type [LiteralsSectionType::Raw], then the data is `regenerated_bytes` /// bytes long, and it contains the raw literals data to be used during the second section, /// the sequences section. /// - If this block is of type [LiteralsSectionType::RLE], /// then the literal consists of a single byte repeated `regenerated_size` times. /// - For types [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless], /// then this is the size of the decompressed data.
pub compressed_size: Option<u32>, /// This value will be either 1 stream or 4 streams if the literal is of type /// [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless], and it /// is not used for RLE or uncompressed literals. pub num_streams: Option<u8>, /// The type of the literal section. pub ls_type: LiteralsSectionType, } /// The way in which a literal section is encoded. pub enum LiteralsSectionType { /// Literals are stored uncompressed. Raw, /// Literals consist of a single byte value repeated [LiteralsSection::regenerated_size] times. RLE, /// This is a standard Huffman-compressed block, starting with a Huffman tree description. /// In this mode, there are at least *2* different literals represented in the Huffman tree /// description. Compressed, /// This is a Huffman-compressed block, /// using the Huffman tree from the previous [LiteralsSectionType::Compressed] block /// in the sequence. If this mode is triggered without any previous Huffman-tables in the /// frame (or dictionary), it should be treated as data corruption. Treeless, } #[derive(Debug)] #[non_exhaustive] pub enum LiteralsSectionParseError { IllegalLiteralSectionType { got: u8 }, GetBitsError(GetBitsError), NotEnoughBytes { have: usize, need: u8 }, } #[cfg(feature = "std")] impl std::error::Error for LiteralsSectionParseError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { LiteralsSectionParseError::GetBitsError(source) => Some(source), _ => None, } } } impl core::fmt::Display for LiteralsSectionParseError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { LiteralsSectionParseError::IllegalLiteralSectionType { got } => { write!( f, "Illegal literalssectiontype. Is: {}, must be in: 0, 1, 2, 3", got ) } LiteralsSectionParseError::GetBitsError(e) => write!(f, "{:?}", e), LiteralsSectionParseError::NotEnoughBytes { have, need } => { write!( f, "Not enough bytes to parse the literals section header. Have: {}, Need: {}", have, need, ) } } } } impl From<GetBitsError> for LiteralsSectionParseError { fn from(val: GetBitsError) -> Self { Self::GetBitsError(val) } } impl core::fmt::Display for LiteralsSectionType { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { match self { LiteralsSectionType::Compressed => write!(f, "Compressed"), LiteralsSectionType::Raw => write!(f, "Raw"), LiteralsSectionType::RLE => write!(f, "RLE"), LiteralsSectionType::Treeless => write!(f, "Treeless"), } } } impl Default for LiteralsSection { fn default() -> Self { Self::new() } } impl LiteralsSection { /// Create a new [LiteralsSection]. pub fn new() -> LiteralsSection { LiteralsSection { regenerated_size: 0, compressed_size: None, num_streams: None, ls_type: LiteralsSectionType::Raw, } } /// Given the first byte of a header, determine the size of the whole header, from 1 to 5 bytes. pub fn header_bytes_needed(&self, first_byte: u8) -> Result<u8, LiteralsSectionParseError> { let ls_type: LiteralsSectionType = Self::section_type(first_byte)?; let size_format = (first_byte >> 2) & 0x3; match ls_type { LiteralsSectionType::RLE | LiteralsSectionType::Raw => { match size_format { 0 | 2 => { // size_format actually only uses one bit // regenerated_size uses 5 bits Ok(1) } 1 => { // size_format uses 2 bit // regenerated_size uses 12 bits Ok(2) } 3 => { // size_format uses 2 bit // regenerated_size uses 20 bits Ok(3) } _ => panic!( "This is a bug in the program.
There should only be values between 0..3" ), } } LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => { match size_format { 0 | 1 => { // Only differ in num_streams // both regenerated and compressed sizes use 10 bit Ok(3) } 2 => { // both regenerated and compressed sizes use 14 bit Ok(4) } 3 => { // both regenerated and compressed sizes use 18 bit Ok(5) } _ => panic!( "This is a bug in the program. There should only be values between 0..3" ), } } } } /// Parse the header into `self`, and returns the number of bytes read. pub fn parse_from_header(&mut self, raw: &[u8]) -> Result<u8, LiteralsSectionParseError> { let mut br: BitReader<'_> = BitReader::new(raw); let block_type = br.get_bits(2)? as u8; self.ls_type = Self::section_type(block_type)?; let size_format = br.get_bits(2)? as u8; let byte_needed = self.header_bytes_needed(raw[0])?; if raw.len() < byte_needed as usize { return Err(LiteralsSectionParseError::NotEnoughBytes { have: raw.len(), need: byte_needed, }); } match self.ls_type { LiteralsSectionType::RLE | LiteralsSectionType::Raw => { self.compressed_size = None; match size_format { 0 | 2 => { // size_format actually only uses one bit // regenerated_size uses 5 bits self.regenerated_size = u32::from(raw[0]) >> 3; Ok(1) } 1 => { // size_format uses 2 bit // regenerated_size uses 12 bits self.regenerated_size = (u32::from(raw[0]) >> 4) + (u32::from(raw[1]) << 4); Ok(2) } 3 => { // size_format uses 2 bit // regenerated_size uses 20 bits self.regenerated_size = (u32::from(raw[0]) >> 4) + (u32::from(raw[1]) << 4) + (u32::from(raw[2]) << 12); Ok(3) } _ => panic!( "This is a bug in the program. There should only be values between 0..3" ), } } LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => { match size_format { 0 => { self.num_streams = Some(1); } 1..=3 => { self.num_streams = Some(4); } _ => panic!( "This is a bug in the program. There should only be values between 0..3" ), }; match size_format { 0 | 1 => { // Differ in num_streams see above // both regenerated and compressed sizes use 10 bit // 4 from the first, six from the second byte self.regenerated_size = (u32::from(raw[0]) >> 4) + ((u32::from(raw[1]) & 0x3f) << 4); // 2 from the second, full last byte self.compressed_size = Some(u32::from(raw[1] >> 6) + (u32::from(raw[2]) << 2)); Ok(3) } 2 => { // both regenerated and compressed sizes use 14 bit // 4 from first, full second, 2 from the third byte self.regenerated_size = (u32::from(raw[0]) >> 4) + (u32::from(raw[1]) << 4) + ((u32::from(raw[2]) & 0x3) << 12); // 6 from the third, full last byte self.compressed_size = Some((u32::from(raw[2]) >> 2) + (u32::from(raw[3]) << 6)); Ok(4) } 3 => { // both regenerated and compressed sizes use 18 bit // 4 from first, full second, six from third byte self.regenerated_size = (u32::from(raw[0]) >> 4) + (u32::from(raw[1]) << 4) + ((u32::from(raw[2]) & 0x3F) << 12); // 2 from third, full fourth, full fifth byte self.compressed_size = Some( (u32::from(raw[2]) >> 6) + (u32::from(raw[3]) << 2) + (u32::from(raw[4]) << 10), ); Ok(5) } _ => panic!( "This is a bug in the program. There should only be values between 0..3" ), } } } } /// Given the first two bits of a header, determine the type of a header.
fn section_type(raw: u8) -> Result<LiteralsSectionType, LiteralsSectionParseError> { let t = raw & 0x3; match t { 0 => Ok(LiteralsSectionType::Raw), 1 => Ok(LiteralsSectionType::RLE), 2 => Ok(LiteralsSectionType::Compressed), 3 => Ok(LiteralsSectionType::Treeless), other => Err(LiteralsSectionParseError::IllegalLiteralSectionType { got: other }), } } } ruzstd-0.7.3/src/blocks/mod.rs //! In a Zstandard frame, there's a frame header, followed by one or more *blocks*. //! //! A block contains data, and a header describing how that data is encoded, as well //! as other misc metadata. //! //! pub mod block; pub mod literals_section; pub mod sequence_section; ruzstd-0.7.3/src/blocks/sequence_section.rs //! Utilities and representations for the second half of a block, the sequence section. //! This section copies literals from the literals section into the decompressed output. pub(crate) const MAX_LITERAL_LENGTH_CODE: u8 = 35; pub(crate) const MAX_MATCH_LENGTH_CODE: u8 = 52; pub(crate) const MAX_OFFSET_CODE: u8 = 31; pub struct SequencesHeader { pub num_sequences: u32, pub modes: Option<CompressionModes>, } /// A sequence represents potentially redundant data, and it can be broken up into 2 steps: /// - A copy step, where data is copied from the literals section to the decompressed output /// - A *match* copy step that copies data from within the previously decompressed output. /// /// #[derive(Clone, Copy)] pub struct Sequence { /// Literal length, or the number of bytes to be copied from the literals section /// in the copy step. pub ll: u32, /// The length of the match to make during the match copy step. pub ml: u32, /// How far back to go in the decompressed data to read from the match copy step. /// If this value is greater than 3, then the offset is `of -3`. If `of` is from 1-3, /// then it has special handling: /// /// The first 3 values define 3 different repeated offsets, with 1 referring to the most /// recent, 2 the second recent, and so on. When the current sequence has a literal length of 0, /// then the repeated offsets are shifted by 1. So an offset value of 1 refers to 2, 2 refers to 3, /// and 3 refers to the most recent offset minus one. If that value is equal to zero, the data /// is considered corrupted. pub of: u32, } impl core::fmt::Display for Sequence { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { write!(f, "LL: {}, ML: {}, OF: {}", self.ll, self.ml, self.of) } } /// This byte defines the compression mode of each symbol type #[derive(Copy, Clone)] pub struct CompressionModes(u8); /// The compression mode used for symbol compression pub enum ModeType { /// A predefined FSE distribution table is used, and no distribution table /// will be present. Predefined, /// The table consists of a single byte, which contains the symbol's value. RLE, /// Standard FSE compression, a distribution table will be present. This /// mode should not be used when only one symbol is present. FSECompressed, /// The table used in the previous compressed block with at least one sequence /// will be used again. If this is the first block, the table in the dictionary will /// be used.
Repeat, } impl CompressionModes { /// Deserialize a two bit mode value into a [ModeType] pub fn decode_mode(m: u8) -> ModeType { match m { 0 => ModeType::Predefined, 1 => ModeType::RLE, 2 => ModeType::FSECompressed, 3 => ModeType::Repeat, _ => panic!("This can never happen"), } } /// Read the compression mode of the literal lengths field. pub fn ll_mode(self) -> ModeType { Self::decode_mode(self.0 >> 6) } /// Read the compression mode of the offset value field. pub fn of_mode(self) -> ModeType { Self::decode_mode((self.0 >> 4) & 0x3) } /// Read the compression mode of the match lengths field. pub fn ml_mode(self) -> ModeType { Self::decode_mode((self.0 >> 2) & 0x3) } } impl Default for SequencesHeader { fn default() -> Self { Self::new() } } #[derive(Debug)] #[non_exhaustive] pub enum SequencesHeaderParseError { NotEnoughBytes { need_at_least: u8, got: usize }, } #[cfg(feature = "std")] impl std::error::Error for SequencesHeaderParseError {} impl core::fmt::Display for SequencesHeaderParseError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { SequencesHeaderParseError::NotEnoughBytes { need_at_least, got } => { write!( f, "source must have at least {} bytes to parse header; got {} bytes", need_at_least, got, ) } } } } impl SequencesHeader { /// Create a new [SequencesHeader]. pub fn new() -> SequencesHeader { SequencesHeader { num_sequences: 0, modes: None, } } /// Attempt to deserialize the provided buffer into `self`, returning the number of bytes read. pub fn parse_from_header(&mut self, source: &[u8]) -> Result<u8, SequencesHeaderParseError> { let mut bytes_read = 0; if source.is_empty() { return Err(SequencesHeaderParseError::NotEnoughBytes { need_at_least: 1, got: 0, }); } let source = match source[0] { 0 => { self.num_sequences = 0; return Ok(1); } 1..=127 => { if source.len() < 2 { return Err(SequencesHeaderParseError::NotEnoughBytes { need_at_least: 2, got: source.len(), }); } self.num_sequences = u32::from(source[0]); bytes_read += 1; &source[1..] } 128..=254 => { if source.len() < 3 { return Err(SequencesHeaderParseError::NotEnoughBytes { need_at_least: 3, got: source.len(), }); } self.num_sequences = ((u32::from(source[0]) - 128) << 8) + u32::from(source[1]); bytes_read += 2; &source[2..] } 255 => { if source.len() < 4 { return Err(SequencesHeaderParseError::NotEnoughBytes { need_at_least: 4, got: source.len(), }); } self.num_sequences = u32::from(source[1]) + (u32::from(source[2]) << 8) + 0x7F00; bytes_read += 3; &source[3..] } }; self.modes = Some(CompressionModes(source[0])); bytes_read += 1; Ok(bytes_read) } } ruzstd-0.7.3/src/decoding/bit_reader.rs /// Interact with a provided source at a bit level. pub struct BitReader<'s> { idx: usize, //index counts bits already read source: &'s [u8], } #[derive(Debug)] #[non_exhaustive] pub enum GetBitsError { TooManyBits { num_requested_bits: usize, limit: u8, }, NotEnoughRemainingBits { requested: usize, remaining: usize, }, } #[cfg(feature = "std")] impl std::error::Error for GetBitsError {} impl core::fmt::Display for GetBitsError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { GetBitsError::TooManyBits { num_requested_bits, limit, } => { write!( f, "Can't serve this request.
The reader is limited to {} bits, requested {} bits", limit, num_requested_bits, ) } GetBitsError::NotEnoughRemainingBits { requested, remaining, } => { write!( f, "Can\'t read {} bits, only have {} bits left", requested, remaining, ) } } } } impl<'s> BitReader<'s> { pub fn new(source: &'s [u8]) -> BitReader<'s> { BitReader { idx: 0, source } } pub fn bits_left(&self) -> usize { self.source.len() * 8 - self.idx } pub fn bits_read(&self) -> usize { self.idx } pub fn return_bits(&mut self, n: usize) { if n > self.idx { panic!("Can't return this many bits"); } self.idx -= n; } pub fn get_bits(&mut self, n: usize) -> Result<u64, GetBitsError> { if n > 64 { return Err(GetBitsError::TooManyBits { num_requested_bits: n, limit: 64, }); } if self.bits_left() < n { return Err(GetBitsError::NotEnoughRemainingBits { requested: n, remaining: self.bits_left(), }); } let old_idx = self.idx; let bits_left_in_current_byte = 8 - (self.idx % 8); let bits_not_needed_in_current_byte = 8 - bits_left_in_current_byte; //collect bits from the currently pointed to byte let mut value = u64::from(self.source[self.idx / 8] >> bits_not_needed_in_current_byte); if bits_left_in_current_byte >= n { //no need for fancy stuff //just mask all but the needed n bit value &= (1 << n) - 1; self.idx += n; } else { self.idx += bits_left_in_current_byte; //n spans over multiple bytes let full_bytes_needed = (n - bits_left_in_current_byte) / 8; let bits_in_last_byte_needed = n - bits_left_in_current_byte - full_bytes_needed * 8; assert!( bits_left_in_current_byte + full_bytes_needed * 8 + bits_in_last_byte_needed == n ); let mut bit_shift = bits_left_in_current_byte; //this many bits are already set in value assert!(self.idx % 8 == 0); //collect full bytes for _ in 0..full_bytes_needed { value |= u64::from(self.source[self.idx / 8]) << bit_shift; self.idx += 8; bit_shift += 8; } assert!(n - bit_shift == bits_in_last_byte_needed); if bits_in_last_byte_needed > 0 { let val_las_byte = u64::from(self.source[self.idx / 8]) & ((1 << bits_in_last_byte_needed) - 1); value |= val_las_byte << bit_shift; self.idx += bits_in_last_byte_needed; } } assert!(self.idx == old_idx + n); Ok(value) } pub fn reset(&mut self, new_source: &'s [u8]) { self.idx = 0; self.source = new_source; } } ruzstd-0.7.3/src/decoding/bit_reader_reverse.rs use core::convert::TryInto; pub use super::bit_reader::GetBitsError; use crate::io::Read; /// Zstandard encodes some types of data in a way that the data must be read /// back to front to decode it properly. `BitReaderReversed` provides a /// convenient interface to do that. pub struct BitReaderReversed<'s> { idx: isize, //index counts bits already read source: &'s [u8], /// The reader doesn't read directly from the source, /// it reads bits from here, and the container is /// "refilled" as it's emptied. bit_container: u64, bits_in_container: u8, } impl<'s> BitReaderReversed<'s> { /// How many bits are left to read by the reader.
pub fn bits_remaining(&self) -> isize { self.idx + self.bits_in_container as isize } pub fn new(source: &'s [u8]) -> BitReaderReversed<'s> { BitReaderReversed { idx: source.len() as isize * 8, source, bit_container: 0, bits_in_container: 0, } } /// We refill the container in full bytes, shifting the still unread portion to the left, and filling the lower bits with new data #[inline(always)] fn refill_container(&mut self) { let byte_idx = self.byte_idx() as usize; let retain_bytes = (self.bits_in_container + 7) / 8; let want_to_read_bits = 64 - (retain_bytes * 8); // if there are >= 8 byte left to read we go a fast path: // The slice is looking something like this |U..UCCCCCCCCR..R| Where U are some unread bytes, C are the bytes in the container, and R are already read bytes // What we do is, we shift the container by a few bytes to the left by just reading a u64 from the correct position, rereading the portion we did not yet return from the conainer. // Technically this would still work for positions lower than 8 but this guarantees that enough bytes are in the source and generally makes for less edge cases if byte_idx >= 8 { self.refill_fast(byte_idx, retain_bytes, want_to_read_bits) } else { // In the slow path we just read however many bytes we can self.refill_slow(byte_idx, want_to_read_bits) } } #[inline(always)] fn refill_fast(&mut self, byte_idx: usize, retain_bytes: u8, want_to_read_bits: u8) { let load_from_byte_idx = byte_idx - 7 + retain_bytes as usize; let tmp_bytes: [u8; 8] = (&self.source[load_from_byte_idx..][..8]) .try_into() .unwrap(); let refill = u64::from_le_bytes(tmp_bytes); self.bit_container = refill; self.bits_in_container += want_to_read_bits; self.idx -= want_to_read_bits as isize; } #[cold] fn refill_slow(&mut self, byte_idx: usize, want_to_read_bits: u8) { let can_read_bits = isize::min(want_to_read_bits as isize, self.idx); let can_read_bytes = can_read_bits / 8; let mut tmp_bytes = [0u8; 8]; let offset @ 1..=8 = can_read_bytes as usize else { unreachable!() }; let bits_read = offset * 8; let _ = (&self.source[byte_idx - (offset - 1)..]).read_exact(&mut tmp_bytes[0..offset]); self.bits_in_container += bits_read as u8; self.idx -= bits_read as isize; if offset < 8 { self.bit_container <<= bits_read; self.bit_container |= u64::from_le_bytes(tmp_bytes); } else { self.bit_container = u64::from_le_bytes(tmp_bytes); } } /// Next byte that should be read into the container /// Negative values mean that the source buffer as been read into the container completetly. fn byte_idx(&self) -> isize { (self.idx - 1) / 8 } /// Read `n` number of bits from the source. Will read at most 56 bits. /// If there are no more bits to be read from the source zero bits will be returned instead. 
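// A short usage sketch (not from the crate itself): the reversed reader serves
// bits starting from the most significant end of the last byte and works its
// way toward the front of the slice. Once the source is exhausted it keeps
// handing out zero bits and `bits_remaining()` turns negative, which the
// decoders below rely on to detect the end of a stream.
//
//     let mut br = BitReaderReversed::new(&[0b1110_0001]);
//     assert_eq!(br.get_bits(3), 0b111);
//     assert_eq!(br.get_bits(5), 0b0_0001);
//     assert_eq!(br.get_bits(4), 0); // past the end: zeros are returned
//     assert!(br.bits_remaining() < 0);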
#[inline(always)] pub fn get_bits(&mut self, n: u8) -> u64 { if n == 0 { return 0; } if self.bits_in_container >= n { return self.get_bits_unchecked(n); } self.get_bits_cold(n) } #[cold] fn get_bits_cold(&mut self, n: u8) -> u64 { let n = u8::min(n, 56); let signed_n = n as isize; if self.bits_remaining() <= 0 { self.idx -= signed_n; return 0; } if self.bits_remaining() < signed_n { let emulated_read_shift = signed_n - self.bits_remaining(); let v = self.get_bits(self.bits_remaining() as u8); debug_assert!(self.idx == 0); let value = v.wrapping_shl(emulated_read_shift as u32); self.idx -= emulated_read_shift; return value; } while (self.bits_in_container < n) && self.idx > 0 { self.refill_container(); } debug_assert!(self.bits_in_container >= n); //if we reach this point there are enough bits in the container self.get_bits_unchecked(n) } /// Same as calling get_bits three times but slightly more performant #[inline(always)] pub fn get_bits_triple(&mut self, n1: u8, n2: u8, n3: u8) -> (u64, u64, u64) { let sum = n1 as usize + n2 as usize + n3 as usize; if sum == 0 { return (0, 0, 0); } if sum > 56 { // try and get the values separately return (self.get_bits(n1), self.get_bits(n2), self.get_bits(n3)); } let sum = sum as u8; if self.bits_in_container >= sum { let v1 = if n1 == 0 { 0 } else { self.get_bits_unchecked(n1) }; let v2 = if n2 == 0 { 0 } else { self.get_bits_unchecked(n2) }; let v3 = if n3 == 0 { 0 } else { self.get_bits_unchecked(n3) }; return (v1, v2, v3); } self.get_bits_triple_cold(n1, n2, n3, sum) } #[cold] fn get_bits_triple_cold(&mut self, n1: u8, n2: u8, n3: u8, sum: u8) -> (u64, u64, u64) { let sum_signed = sum as isize; if self.bits_remaining() <= 0 { self.idx -= sum_signed; return (0, 0, 0); } if self.bits_remaining() < sum_signed { return (self.get_bits(n1), self.get_bits(n2), self.get_bits(n3)); } while (self.bits_in_container < sum) && self.idx > 0 { self.refill_container(); } debug_assert!(self.bits_in_container >= sum); //if we reach this point there are enough bits in the container let v1 = if n1 == 0 { 0 } else { self.get_bits_unchecked(n1) }; let v2 = if n2 == 0 { 0 } else { self.get_bits_unchecked(n2) }; let v3 = if n3 == 0 { 0 } else { self.get_bits_unchecked(n3) }; (v1, v2, v3) } #[inline(always)] fn get_bits_unchecked(&mut self, n: u8) -> u64 { let shift_by = self.bits_in_container - n; let mask = (1u64 << n) - 1u64; let value = self.bit_container >> shift_by; self.bits_in_container -= n; let value_masked = value & mask; debug_assert!(value_masked < (1 << n)); value_masked } pub fn reset(&mut self, new_source: &'s [u8]) { self.idx = new_source.len() as isize * 8; self.source = new_source; self.bit_container = 0; self.bits_in_container = 0; } } ruzstd-0.7.3/src/decoding/block_decoder.rs000064400000000000000000000456311046102023000166670ustar 00000000000000use super::super::blocks::block::BlockHeader; use super::super::blocks::block::BlockType; use super::super::blocks::literals_section::LiteralsSection; use super::super::blocks::literals_section::LiteralsSectionType; use super::super::blocks::sequence_section::SequencesHeader; use super::literals_section_decoder::{decode_literals, DecompressLiteralsError}; use super::sequence_execution::ExecuteSequencesError; use super::sequence_section_decoder::decode_sequences; use super::sequence_section_decoder::DecodeSequenceError; use crate::blocks::literals_section::LiteralsSectionParseError; use crate::blocks::sequence_section::SequencesHeaderParseError; use crate::decoding::scratch::DecoderScratch; use 
crate::decoding::sequence_execution::execute_sequences; use crate::io::{self, Read}; pub struct BlockDecoder { header_buffer: [u8; 3], internal_state: DecoderState, } enum DecoderState { ReadyToDecodeNextHeader, ReadyToDecodeNextBody, #[allow(dead_code)] Failed, //TODO put "self.internal_state = DecoderState::Failed;" everywhere an unresolvable error occurs } #[derive(Debug)] #[non_exhaustive] pub enum BlockHeaderReadError { ReadError(io::Error), FoundReservedBlock, BlockTypeError(BlockTypeError), BlockSizeError(BlockSizeError), } #[cfg(feature = "std")] impl std::error::Error for BlockHeaderReadError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { BlockHeaderReadError::ReadError(source) => Some(source), BlockHeaderReadError::BlockTypeError(source) => Some(source), BlockHeaderReadError::BlockSizeError(source) => Some(source), BlockHeaderReadError::FoundReservedBlock => None, } } } impl ::core::fmt::Display for BlockHeaderReadError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> ::core::fmt::Result { match self { BlockHeaderReadError::ReadError(_) => write!(f, "Error while reading the block header"), BlockHeaderReadError::FoundReservedBlock => write!( f, "Reserved block occured. This is considered corruption by the documentation" ), BlockHeaderReadError::BlockTypeError(e) => write!(f, "Error getting block type: {}", e), BlockHeaderReadError::BlockSizeError(e) => { write!(f, "Error getting block content size: {}", e) } } } } impl From for BlockHeaderReadError { fn from(val: io::Error) -> Self { Self::ReadError(val) } } impl From for BlockHeaderReadError { fn from(val: BlockTypeError) -> Self { Self::BlockTypeError(val) } } impl From for BlockHeaderReadError { fn from(val: BlockSizeError) -> Self { Self::BlockSizeError(val) } } #[derive(Debug)] #[non_exhaustive] pub enum BlockTypeError { InvalidBlocktypeNumber { num: u8 }, } #[cfg(feature = "std")] impl std::error::Error for BlockTypeError {} impl core::fmt::Display for BlockTypeError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { BlockTypeError::InvalidBlocktypeNumber { num } => { write!(f, "Invalid Blocktype number. Is: {} Should be one of: 0, 1, 2, 3 (3 is reserved though", num, ) } } } } #[derive(Debug)] #[non_exhaustive] pub enum BlockSizeError { BlockSizeTooLarge { size: u32 }, } #[cfg(feature = "std")] impl std::error::Error for BlockSizeError {} impl core::fmt::Display for BlockSizeError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { BlockSizeError::BlockSizeTooLarge { size } => { write!( f, "Blocksize was bigger than the absolute maximum {} (128kb). 
Is: {}", ABSOLUTE_MAXIMUM_BLOCK_SIZE, size, ) } } } } #[derive(Debug)] #[non_exhaustive] pub enum DecompressBlockError { BlockContentReadError(io::Error), MalformedSectionHeader { expected_len: usize, remaining_bytes: usize, }, DecompressLiteralsError(DecompressLiteralsError), LiteralsSectionParseError(LiteralsSectionParseError), SequencesHeaderParseError(SequencesHeaderParseError), DecodeSequenceError(DecodeSequenceError), ExecuteSequencesError(ExecuteSequencesError), } #[cfg(feature = "std")] impl std::error::Error for DecompressBlockError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { DecompressBlockError::BlockContentReadError(source) => Some(source), DecompressBlockError::DecompressLiteralsError(source) => Some(source), DecompressBlockError::LiteralsSectionParseError(source) => Some(source), DecompressBlockError::SequencesHeaderParseError(source) => Some(source), DecompressBlockError::DecodeSequenceError(source) => Some(source), DecompressBlockError::ExecuteSequencesError(source) => Some(source), _ => None, } } } impl core::fmt::Display for DecompressBlockError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { DecompressBlockError::BlockContentReadError(e) => { write!(f, "Error while reading the block content: {}", e) } DecompressBlockError::MalformedSectionHeader { expected_len, remaining_bytes, } => { write!(f, "Malformed section header. Says literals would be this long: {} but there are only {} bytes left", expected_len, remaining_bytes, ) } DecompressBlockError::DecompressLiteralsError(e) => write!(f, "{:?}", e), DecompressBlockError::LiteralsSectionParseError(e) => write!(f, "{:?}", e), DecompressBlockError::SequencesHeaderParseError(e) => write!(f, "{:?}", e), DecompressBlockError::DecodeSequenceError(e) => write!(f, "{:?}", e), DecompressBlockError::ExecuteSequencesError(e) => write!(f, "{:?}", e), } } } impl From for DecompressBlockError { fn from(val: io::Error) -> Self { Self::BlockContentReadError(val) } } impl From for DecompressBlockError { fn from(val: DecompressLiteralsError) -> Self { Self::DecompressLiteralsError(val) } } impl From for DecompressBlockError { fn from(val: LiteralsSectionParseError) -> Self { Self::LiteralsSectionParseError(val) } } impl From for DecompressBlockError { fn from(val: SequencesHeaderParseError) -> Self { Self::SequencesHeaderParseError(val) } } impl From for DecompressBlockError { fn from(val: DecodeSequenceError) -> Self { Self::DecodeSequenceError(val) } } impl From for DecompressBlockError { fn from(val: ExecuteSequencesError) -> Self { Self::ExecuteSequencesError(val) } } #[derive(Debug)] #[non_exhaustive] pub enum DecodeBlockContentError { DecoderStateIsFailed, ExpectedHeaderOfPreviousBlock, ReadError { step: BlockType, source: io::Error }, DecompressBlockError(DecompressBlockError), } #[cfg(feature = "std")] impl std::error::Error for DecodeBlockContentError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { DecodeBlockContentError::ReadError { step: _, source } => Some(source), DecodeBlockContentError::DecompressBlockError(source) => Some(source), _ => None, } } } impl core::fmt::Display for DecodeBlockContentError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { DecodeBlockContentError::DecoderStateIsFailed => { write!( f, "Can't decode next block if failed along the way. 
Results will be nonsense", ) } DecodeBlockContentError::ExpectedHeaderOfPreviousBlock => { write!(f, "Can't decode next block body, while expecting to decode the header of the previous block. Results will be nonsense", ) } DecodeBlockContentError::ReadError { step, source } => { write!(f, "Error while reading bytes for {}: {}", step, source,) } DecodeBlockContentError::DecompressBlockError(e) => write!(f, "{:?}", e), } } } impl From for DecodeBlockContentError { fn from(val: DecompressBlockError) -> Self { Self::DecompressBlockError(val) } } /// Create a new [BlockDecoder]. pub fn new() -> BlockDecoder { BlockDecoder { internal_state: DecoderState::ReadyToDecodeNextHeader, header_buffer: [0u8; 3], } } const ABSOLUTE_MAXIMUM_BLOCK_SIZE: u32 = 128 * 1024; impl BlockDecoder { pub fn decode_block_content( &mut self, header: &BlockHeader, workspace: &mut DecoderScratch, //reuse this as often as possible. Not only if the trees are reused but also reuse the allocations when building new trees mut source: impl Read, ) -> Result { match self.internal_state { DecoderState::ReadyToDecodeNextBody => { /* Happy :) */ } DecoderState::Failed => return Err(DecodeBlockContentError::DecoderStateIsFailed), DecoderState::ReadyToDecodeNextHeader => { return Err(DecodeBlockContentError::ExpectedHeaderOfPreviousBlock) } } let block_type = header.block_type; match block_type { BlockType::RLE => { const BATCH_SIZE: usize = 512; let mut buf = [0u8; BATCH_SIZE]; let full_reads = header.decompressed_size / BATCH_SIZE as u32; let single_read_size = header.decompressed_size % BATCH_SIZE as u32; source.read_exact(&mut buf[0..1]).map_err(|err| { DecodeBlockContentError::ReadError { step: block_type, source: err, } })?; self.internal_state = DecoderState::ReadyToDecodeNextHeader; for i in 1..BATCH_SIZE { buf[i] = buf[0]; } for _ in 0..full_reads { workspace.buffer.push(&buf[..]); } let smaller = &mut buf[..single_read_size as usize]; workspace.buffer.push(smaller); Ok(1) } BlockType::Raw => { const BATCH_SIZE: usize = 128 * 1024; let mut buf = [0u8; BATCH_SIZE]; let full_reads = header.decompressed_size / BATCH_SIZE as u32; let single_read_size = header.decompressed_size % BATCH_SIZE as u32; for _ in 0..full_reads { source.read_exact(&mut buf[..]).map_err(|err| { DecodeBlockContentError::ReadError { step: block_type, source: err, } })?; workspace.buffer.push(&buf[..]); } let smaller = &mut buf[..single_read_size as usize]; source .read_exact(smaller) .map_err(|err| DecodeBlockContentError::ReadError { step: block_type, source: err, })?; workspace.buffer.push(smaller); self.internal_state = DecoderState::ReadyToDecodeNextHeader; Ok(u64::from(header.decompressed_size)) } BlockType::Reserved => { panic!("How did you even get this. The decoder should error out if it detects a reserved-type block"); } BlockType::Compressed => { self.decompress_block(header, workspace, source)?; self.internal_state = DecoderState::ReadyToDecodeNextHeader; Ok(u64::from(header.content_size)) } } } fn decompress_block( &mut self, header: &BlockHeader, workspace: &mut DecoderScratch, //reuse this as often as possible. 
Not only if the trees are reused but also reuse the allocations when building new trees mut source: impl Read, ) -> Result<(), DecompressBlockError> { workspace .block_content_buffer .resize(header.content_size as usize, 0); source.read_exact(workspace.block_content_buffer.as_mut_slice())?; let raw = workspace.block_content_buffer.as_slice(); let mut section = LiteralsSection::new(); let bytes_in_literals_header = section.parse_from_header(raw)?; let raw = &raw[bytes_in_literals_header as usize..]; vprintln!( "Found {} literalssection with regenerated size: {}, and compressed size: {:?}", section.ls_type, section.regenerated_size, section.compressed_size ); let upper_limit_for_literals = match section.compressed_size { Some(x) => x as usize, None => match section.ls_type { LiteralsSectionType::RLE => 1, LiteralsSectionType::Raw => section.regenerated_size as usize, _ => panic!("Bug in this library"), }, }; if raw.len() < upper_limit_for_literals { return Err(DecompressBlockError::MalformedSectionHeader { expected_len: upper_limit_for_literals, remaining_bytes: raw.len(), }); } let raw_literals = &raw[..upper_limit_for_literals]; vprintln!("Slice for literals: {}", raw_literals.len()); workspace.literals_buffer.clear(); //all literals of the previous block must have been used in the sequence execution anyways. just be defensive here let bytes_used_in_literals_section = decode_literals( §ion, &mut workspace.huf, raw_literals, &mut workspace.literals_buffer, )?; assert!( section.regenerated_size == workspace.literals_buffer.len() as u32, "Wrong number of literals: {}, Should have been: {}", workspace.literals_buffer.len(), section.regenerated_size ); assert!(bytes_used_in_literals_section == upper_limit_for_literals as u32); let raw = &raw[upper_limit_for_literals..]; vprintln!("Slice for sequences with headers: {}", raw.len()); let mut seq_section = SequencesHeader::new(); let bytes_in_sequence_header = seq_section.parse_from_header(raw)?; let raw = &raw[bytes_in_sequence_header as usize..]; vprintln!( "Found sequencessection with sequences: {} and size: {}", seq_section.num_sequences, raw.len() ); assert!( u32::from(bytes_in_literals_header) + bytes_used_in_literals_section + u32::from(bytes_in_sequence_header) + raw.len() as u32 == header.content_size ); vprintln!("Slice for sequences: {}", raw.len()); if seq_section.num_sequences != 0 { decode_sequences( &seq_section, raw, &mut workspace.fse, &mut workspace.sequences, )?; vprintln!("Executing sequences"); execute_sequences(workspace)?; } else { workspace.buffer.push(&workspace.literals_buffer); workspace.sequences.clear(); } Ok(()) } pub fn read_block_header( &mut self, mut r: impl Read, ) -> Result<(BlockHeader, u8), BlockHeaderReadError> { //match self.internal_state { // DecoderState::ReadyToDecodeNextHeader => {/* Happy :) */}, // DecoderState::Failed => return Err(format!("Cant decode next block if failed along the way. Results will be nonsense")), // DecoderState::ReadyToDecodeNextBody => return Err(format!("Cant decode next block header, while expecting to decode the body of the previous block. 
Results will be nonsense")), //} r.read_exact(&mut self.header_buffer[0..3])?; let btype = self.block_type()?; if let BlockType::Reserved = btype { return Err(BlockHeaderReadError::FoundReservedBlock); } let block_size = self.block_content_size()?; let decompressed_size = match btype { BlockType::Raw => block_size, BlockType::RLE => block_size, BlockType::Reserved => 0, //should be caught above, this is an error state BlockType::Compressed => 0, //unknown but will be smaller than 128kb (or window_size if that is smaller than 128kb) }; let content_size = match btype { BlockType::Raw => block_size, BlockType::Compressed => block_size, BlockType::RLE => 1, BlockType::Reserved => 0, //should be caught above, this is an error state }; let last_block = self.is_last(); self.reset_buffer(); self.internal_state = DecoderState::ReadyToDecodeNextBody; //just return 3. Blockheaders always take 3 bytes Ok(( BlockHeader { last_block, block_type: btype, decompressed_size, content_size, }, 3, )) } fn reset_buffer(&mut self) { self.header_buffer[0] = 0; self.header_buffer[1] = 0; self.header_buffer[2] = 0; } fn is_last(&self) -> bool { self.header_buffer[0] & 0x1 == 1 } fn block_type(&self) -> Result { let t = (self.header_buffer[0] >> 1) & 0x3; match t { 0 => Ok(BlockType::Raw), 1 => Ok(BlockType::RLE), 2 => Ok(BlockType::Compressed), 3 => Ok(BlockType::Reserved), other => Err(BlockTypeError::InvalidBlocktypeNumber { num: other }), } } fn block_content_size(&self) -> Result { let val = self.block_content_size_unchecked(); if val > ABSOLUTE_MAXIMUM_BLOCK_SIZE { Err(BlockSizeError::BlockSizeTooLarge { size: val }) } else { Ok(val) } } fn block_content_size_unchecked(&self) -> u32 { u32::from(self.header_buffer[0] >> 3) //push out type and last_block flags. Retain 5 bit | (u32::from(self.header_buffer[1]) << 5) | (u32::from(self.header_buffer[2]) << 13) } } ruzstd-0.7.3/src/decoding/decodebuffer.rs000064400000000000000000000402621046102023000165200ustar 00000000000000use crate::io::{Error, Read, Write}; use alloc::vec::Vec; #[cfg(feature = "hash")] use core::hash::Hasher; use super::ringbuffer::RingBuffer; pub struct DecodeBuffer { buffer: RingBuffer, pub dict_content: Vec, pub window_size: usize, total_output_counter: u64, #[cfg(feature = "hash")] pub hash: twox_hash::XxHash64, } #[derive(Debug)] #[non_exhaustive] pub enum DecodeBufferError { NotEnoughBytesInDictionary { got: usize, need: usize }, OffsetTooBig { offset: usize, buf_len: usize }, } #[cfg(feature = "std")] impl std::error::Error for DecodeBufferError {} impl core::fmt::Display for DecodeBufferError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { DecodeBufferError::NotEnoughBytesInDictionary { got, need } => { write!( f, "Need {} bytes from the dictionary but it is only {} bytes long", need, got, ) } DecodeBufferError::OffsetTooBig { offset, buf_len } => { write!(f, "offset: {} bigger than buffer: {}", offset, buf_len,) } } } } impl Read for DecodeBuffer { fn read(&mut self, target: &mut [u8]) -> Result { let max_amount = self.can_drain_to_window_size().unwrap_or(0); let amount = max_amount.min(target.len()); let mut written = 0; self.drain_to(amount, |buf| { target[written..][..buf.len()].copy_from_slice(buf); written += buf.len(); (buf.len(), Ok(())) })?; Ok(amount) } } impl DecodeBuffer { pub fn new(window_size: usize) -> DecodeBuffer { DecodeBuffer { buffer: RingBuffer::new(), dict_content: Vec::new(), window_size, total_output_counter: 0, #[cfg(feature = "hash")] hash: 
twox_hash::XxHash64::with_seed(0), } } pub fn reset(&mut self, window_size: usize) { self.window_size = window_size; self.buffer.clear(); self.buffer.reserve(self.window_size); self.dict_content.clear(); self.total_output_counter = 0; #[cfg(feature = "hash")] { self.hash = twox_hash::XxHash64::with_seed(0); } } pub fn len(&self) -> usize { self.buffer.len() } pub fn is_empty(&self) -> bool { self.buffer.is_empty() } pub fn push(&mut self, data: &[u8]) { self.buffer.extend(data); self.total_output_counter += data.len() as u64; } pub fn repeat(&mut self, offset: usize, match_length: usize) -> Result<(), DecodeBufferError> { if offset > self.buffer.len() { self.repeat_from_dict(offset, match_length) } else { let buf_len = self.buffer.len(); let start_idx = buf_len - offset; let end_idx = start_idx + match_length; self.buffer.reserve(match_length); if end_idx > buf_len { // We need to copy in chunks. self.repeat_in_chunks(offset, match_length, start_idx); } else { // can just copy parts of the existing buffer // SAFETY: Requirements checked: // 1. start_idx + match_length must be <= self.buffer.len() // We know that: // 1. start_idx = self.buffer.len() - offset // 2. end_idx = start_idx + match_length // 3. end_idx <= self.buffer.len() // Thus follows: start_idx + match_length <= self.buffer.len() // // 2. explicitly reserved enough memory for the whole match_length unsafe { self.buffer .extend_from_within_unchecked(start_idx, match_length) }; } self.total_output_counter += match_length as u64; Ok(()) } } fn repeat_in_chunks(&mut self, offset: usize, match_length: usize, start_idx: usize) { // We have at max offset bytes in one chunk, the last one can be smaller let mut start_idx = start_idx; let mut copied_counter_left = match_length; // TODO this can be optimized further I think. // Each time we copy a chunk we have a repetiton of length 'offset', so we can copy offset * iteration many bytes from start_idx while copied_counter_left > 0 { let chunksize = usize::min(offset, copied_counter_left); // SAFETY: Requirements checked: // 1. start_idx + chunksize must be <= self.buffer.len() // We know that: // 1. start_idx starts at buffer.len() - offset // 2. chunksize <= offset (== offset for each iteration but the last, and match_length modulo offset in the last iteration) // 3. the buffer grows by offset many bytes each iteration but the last // 4. start_idx is increased by the same amount as the buffer grows each iteration // // Thus follows: start_idx + chunksize == self.buffer.len() in each iteration but the last, where match_length modulo offset == chunksize < offset // Meaning: start_idx + chunksize <= self.buffer.len() // // 2. 
explicitly reserved enough memory for the whole match_length unsafe { self.buffer .extend_from_within_unchecked(start_idx, chunksize) }; copied_counter_left -= chunksize; start_idx += chunksize; } } #[cold] fn repeat_from_dict( &mut self, offset: usize, match_length: usize, ) -> Result<(), DecodeBufferError> { if self.total_output_counter <= self.window_size as u64 { // at least part of that repeat is from the dictionary content let bytes_from_dict = offset - self.buffer.len(); if bytes_from_dict > self.dict_content.len() { return Err(DecodeBufferError::NotEnoughBytesInDictionary { got: self.dict_content.len(), need: bytes_from_dict, }); } if bytes_from_dict < match_length { let dict_slice = &self.dict_content[self.dict_content.len() - bytes_from_dict..]; self.buffer.extend(dict_slice); self.total_output_counter += bytes_from_dict as u64; return self.repeat(self.buffer.len(), match_length - bytes_from_dict); } else { let low = self.dict_content.len() - bytes_from_dict; let high = low + match_length; let dict_slice = &self.dict_content[low..high]; self.buffer.extend(dict_slice); } Ok(()) } else { Err(DecodeBufferError::OffsetTooBig { offset, buf_len: self.buffer.len(), }) } } /// Check if and how many bytes can currently be drawn from the buffer pub fn can_drain_to_window_size(&self) -> Option { if self.buffer.len() > self.window_size { Some(self.buffer.len() - self.window_size) } else { None } } //How many bytes can be drained if the window_size does not have to be maintained pub fn can_drain(&self) -> usize { self.buffer.len() } /// Drain as much as possible while retaining enough so that decoding si still possible with the required window_size /// At best call only if can_drain_to_window_size reports a 'high' number of bytes to reduce allocations pub fn drain_to_window_size(&mut self) -> Option> { //TODO investigate if it is possible to return the std::vec::Drain iterator directly without collecting here match self.can_drain_to_window_size() { None => None, Some(can_drain) => { let mut vec = Vec::with_capacity(can_drain); self.drain_to(can_drain, |buf| { vec.extend_from_slice(buf); (buf.len(), Ok(())) }) .ok()?; Some(vec) } } } pub fn drain_to_window_size_writer(&mut self, mut sink: impl Write) -> Result { match self.can_drain_to_window_size() { None => Ok(0), Some(can_drain) => self.drain_to(can_drain, |buf| write_all_bytes(&mut sink, buf)), } } /// drain the buffer completely pub fn drain(&mut self) -> Vec { let (slice1, slice2) = self.buffer.as_slices(); #[cfg(feature = "hash")] { self.hash.write(slice1); self.hash.write(slice2); } let mut vec = Vec::with_capacity(slice1.len() + slice2.len()); vec.extend_from_slice(slice1); vec.extend_from_slice(slice2); self.buffer.clear(); vec } pub fn drain_to_writer(&mut self, mut sink: impl Write) -> Result { let write_limit = self.buffer.len(); self.drain_to(write_limit, |buf| write_all_bytes(&mut sink, buf)) } pub fn read_all(&mut self, target: &mut [u8]) -> Result { let amount = self.buffer.len().min(target.len()); let mut written = 0; self.drain_to(amount, |buf| { target[written..][..buf.len()].copy_from_slice(buf); written += buf.len(); (buf.len(), Ok(())) })?; Ok(amount) } /// Semantics of write_bytes: /// Should dump as many of the provided bytes as possible to whatever sink until no bytes are left or an error is encountered /// Return how many bytes have actually been dumped to the sink. 
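// Hedged illustration (closure name is made up, not crate API) of the
// `write_bytes` contract described above, mirroring what `read_all` passes in:
// copy as much as possible, then report how many bytes were actually consumed
// together with an io::Result.
//
//     let mut written = 0;
//     let copy_into_target = |buf: &[u8]| {
//         target[written..][..buf.len()].copy_from_slice(buf);
//         written += buf.len();
//         (buf.len(), Ok(())) // (bytes dumped to the sink, no error)
//     };
//     // self.drain_to(amount, copy_into_target)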
fn drain_to( &mut self, amount: usize, mut write_bytes: impl FnMut(&[u8]) -> (usize, Result<(), Error>), ) -> Result { if amount == 0 { return Ok(0); } struct DrainGuard<'a> { buffer: &'a mut RingBuffer, amount: usize, } impl<'a> Drop for DrainGuard<'a> { fn drop(&mut self) { if self.amount != 0 { self.buffer.drop_first_n(self.amount); } } } let mut drain_guard = DrainGuard { buffer: &mut self.buffer, amount: 0, }; let (slice1, slice2) = drain_guard.buffer.as_slices(); let n1 = slice1.len().min(amount); let n2 = slice2.len().min(amount - n1); if n1 != 0 { let (written1, res1) = write_bytes(&slice1[..n1]); #[cfg(feature = "hash")] self.hash.write(&slice1[..written1]); drain_guard.amount += written1; // Apparently this is what clippy thinks is the best way of expressing this res1?; // Only if the first call to write_bytes was not a partial write we can continue with slice2 // Partial writes SHOULD never happen without res1 being an error, but lets just protect against it anyways. if written1 == n1 && n2 != 0 { let (written2, res2) = write_bytes(&slice2[..n2]); #[cfg(feature = "hash")] self.hash.write(&slice2[..written2]); drain_guard.amount += written2; // Apparently this is what clippy thinks is the best way of expressing this res2?; } } let amount_written = drain_guard.amount; // Make sure we don't accidentally drop `DrainGuard` earlier. drop(drain_guard); Ok(amount_written) } } /// Like Write::write_all but returns partial write length even on error fn write_all_bytes(mut sink: impl Write, buf: &[u8]) -> (usize, Result<(), Error>) { let mut written = 0; while written < buf.len() { match sink.write(&buf[written..]) { Ok(0) => return (written, Ok(())), Ok(w) => written += w, Err(e) => return (written, Err(e)), } } (written, Ok(())) } #[cfg(test)] mod tests { use super::DecodeBuffer; use crate::io::{Error, ErrorKind, Write}; extern crate std; use alloc::vec; use alloc::vec::Vec; #[test] fn short_writer() { struct ShortWriter { buf: Vec, write_len: usize, } impl Write for ShortWriter { fn write(&mut self, buf: &[u8]) -> std::result::Result { if buf.len() > self.write_len { self.buf.extend_from_slice(&buf[..self.write_len]); Ok(self.write_len) } else { self.buf.extend_from_slice(buf); Ok(buf.len()) } } fn flush(&mut self) -> std::result::Result<(), Error> { Ok(()) } } let mut short_writer = ShortWriter { buf: vec![], write_len: 10, }; let mut decode_buf = DecodeBuffer::new(100); decode_buf.push(b"0123456789"); decode_buf.repeat(10, 90).unwrap(); let repeats = 1000; for _ in 0..repeats { assert_eq!(decode_buf.len(), 100); decode_buf.repeat(10, 50).unwrap(); assert_eq!(decode_buf.len(), 150); decode_buf .drain_to_window_size_writer(&mut short_writer) .unwrap(); assert_eq!(decode_buf.len(), 100); } assert_eq!(short_writer.buf.len(), repeats * 50); decode_buf.drain_to_writer(&mut short_writer).unwrap(); assert_eq!(short_writer.buf.len(), repeats * 50 + 100); } #[test] fn wouldblock_writer() { struct WouldblockWriter { buf: Vec, last_blocked: usize, block_every: usize, } impl Write for WouldblockWriter { fn write(&mut self, buf: &[u8]) -> std::result::Result { if self.last_blocked < self.block_every { self.buf.extend_from_slice(buf); self.last_blocked += 1; Ok(buf.len()) } else { self.last_blocked = 0; Err(Error::from(ErrorKind::WouldBlock)) } } fn flush(&mut self) -> std::result::Result<(), Error> { Ok(()) } } let mut short_writer = WouldblockWriter { buf: vec![], last_blocked: 0, block_every: 5, }; let mut decode_buf = DecodeBuffer::new(100); decode_buf.push(b"0123456789"); decode_buf.repeat(10, 
90).unwrap(); let repeats = 1000; for _ in 0..repeats { assert_eq!(decode_buf.len(), 100); decode_buf.repeat(10, 50).unwrap(); assert_eq!(decode_buf.len(), 150); loop { match decode_buf.drain_to_window_size_writer(&mut short_writer) { Ok(written) => { if written == 0 { break; } } Err(e) => { if e.kind() == ErrorKind::WouldBlock { continue; } else { panic!("Unexpected error {:?}", e); } } } } assert_eq!(decode_buf.len(), 100); } assert_eq!(short_writer.buf.len(), repeats * 50); loop { match decode_buf.drain_to_writer(&mut short_writer) { Ok(written) => { if written == 0 { break; } } Err(e) => { if e.kind() == ErrorKind::WouldBlock { continue; } else { panic!("Unexpected error {:?}", e); } } } } assert_eq!(short_writer.buf.len(), repeats * 50 + 100); } } ruzstd-0.7.3/src/decoding/dictionary.rs000064400000000000000000000126711046102023000162530ustar 00000000000000use alloc::vec::Vec; use core::convert::TryInto; use crate::decoding::scratch::FSEScratch; use crate::decoding::scratch::HuffmanScratch; use crate::fse::FSETableError; use crate::huff0::HuffmanTableError; /// Zstandard includes support for "raw content" dictionaries, that store bytes optionally used /// during sequence execution. /// /// pub struct Dictionary { /// A 4 byte value used by decoders to check if they can use /// the correct dictionary. This value must not be zero. pub id: u32, /// A dictionary can contain an entropy table, either FSE or /// Huffman. pub fse: FSEScratch, /// A dictionary can contain an entropy table, either FSE or /// Huffman. pub huf: HuffmanScratch, /// The content of a dictionary acts as a "past" in front of data /// to compress or decompress, /// so it can be referenced in sequence commands. /// As long as the amount of data decoded from this frame is less than or /// equal to Window_Size, sequence commands may specify offsets longer than /// the total length of decoded output so far to reference back to the /// dictionary, even parts of the dictionary with offsets larger than Window_Size. /// After the total output has surpassed Window_Size however, /// this is no longer allowed and the dictionary is no longer accessible pub dict_content: Vec, /// The 3 most recent offsets are stored so that they can be used /// during sequence execution, see /// /// for more. 
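// Note (summarising `decode_dict` below, not upstream documentation): these
// three offsets are read from the 12 bytes that sit between the entropy table
// descriptions and the raw content. The overall layout handled below is:
// 4 byte magic number, 4 byte dictionary id (little endian), Huffman table
// description, offset/match-length/literal-length FSE table descriptions,
// 3 * 4 bytes of recent offsets, then the remaining bytes as `dict_content`.
//
//     // e.g. let dict = Dictionary::decode_dict(&raw)?;
//     // a frame can then compare `dict.id` with its own dictionary id field.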
pub offset_hist: [u32; 3], } #[derive(Debug)] #[non_exhaustive] pub enum DictionaryDecodeError { BadMagicNum { got: [u8; 4] }, FSETableError(FSETableError), HuffmanTableError(HuffmanTableError), } #[cfg(feature = "std")] impl std::error::Error for DictionaryDecodeError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { DictionaryDecodeError::FSETableError(source) => Some(source), DictionaryDecodeError::HuffmanTableError(source) => Some(source), _ => None, } } } impl core::fmt::Display for DictionaryDecodeError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { DictionaryDecodeError::BadMagicNum { got } => { write!( f, "Bad magic_num at start of the dictionary; Got: {:#04X?}, Expected: {:#04x?}", got, MAGIC_NUM, ) } DictionaryDecodeError::FSETableError(e) => write!(f, "{:?}", e), DictionaryDecodeError::HuffmanTableError(e) => write!(f, "{:?}", e), } } } impl From for DictionaryDecodeError { fn from(val: FSETableError) -> Self { Self::FSETableError(val) } } impl From for DictionaryDecodeError { fn from(val: HuffmanTableError) -> Self { Self::HuffmanTableError(val) } } /// This 4 byte (little endian) magic number refers to the start of a dictionary pub const MAGIC_NUM: [u8; 4] = [0x37, 0xA4, 0x30, 0xEC]; impl Dictionary { /// Parses the dictionary from `raw` and set the tables /// it returns the dict_id for checking with the frame's `dict_id`` pub fn decode_dict(raw: &[u8]) -> Result { let mut new_dict = Dictionary { id: 0, fse: FSEScratch::new(), huf: HuffmanScratch::new(), dict_content: Vec::new(), offset_hist: [2, 4, 8], }; let magic_num: [u8; 4] = raw[..4].try_into().expect("optimized away"); if magic_num != MAGIC_NUM { return Err(DictionaryDecodeError::BadMagicNum { got: magic_num }); } let dict_id = raw[4..8].try_into().expect("optimized away"); let dict_id = u32::from_le_bytes(dict_id); new_dict.id = dict_id; let raw_tables = &raw[8..]; let huf_size = new_dict.huf.table.build_decoder(raw_tables)?; let raw_tables = &raw_tables[huf_size as usize..]; let of_size = new_dict.fse.offsets.build_decoder( raw_tables, crate::decoding::sequence_section_decoder::OF_MAX_LOG, )?; let raw_tables = &raw_tables[of_size..]; let ml_size = new_dict.fse.match_lengths.build_decoder( raw_tables, crate::decoding::sequence_section_decoder::ML_MAX_LOG, )?; let raw_tables = &raw_tables[ml_size..]; let ll_size = new_dict.fse.literal_lengths.build_decoder( raw_tables, crate::decoding::sequence_section_decoder::LL_MAX_LOG, )?; let raw_tables = &raw_tables[ll_size..]; let offset1 = raw_tables[0..4].try_into().expect("optimized away"); let offset1 = u32::from_le_bytes(offset1); let offset2 = raw_tables[4..8].try_into().expect("optimized away"); let offset2 = u32::from_le_bytes(offset2); let offset3 = raw_tables[8..12].try_into().expect("optimized away"); let offset3 = u32::from_le_bytes(offset3); new_dict.offset_hist[0] = offset1; new_dict.offset_hist[1] = offset2; new_dict.offset_hist[2] = offset3; let raw_content = &raw_tables[12..]; new_dict.dict_content.extend(raw_content); Ok(new_dict) } } ruzstd-0.7.3/src/decoding/literals_section_decoder.rs000064400000000000000000000232441046102023000211340ustar 00000000000000//! This module contains the [decompress_literals] function, used to take a //! parsed literals header and a source and decompress it. 
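// A minimal, self-contained sketch (the helper name is hypothetical, not part
// of the upstream crate) of the four-stream layout handled by
// `decompress_literals` below: when a compressed literals section uses 4
// streams, a 6 byte jump table of three little-endian u16 values gives the
// sizes of the first three bitstreams; the fourth stream takes whatever bytes
// remain. The real code additionally rejects inputs where the jump table
// points past the available data.
#[allow(dead_code)]
fn four_stream_ranges(jump_table: &[u8; 6], total_len: usize) -> [(usize, usize); 4] {
    let jump1 = jump_table[0] as usize + ((jump_table[1] as usize) << 8);
    let jump2 = jump1 + jump_table[2] as usize + ((jump_table[3] as usize) << 8);
    let jump3 = jump2 + jump_table[4] as usize + ((jump_table[5] as usize) << 8);
    // (start, end) byte ranges of the four bitstreams, relative to the data
    // that follows the jump table.
    [(0, jump1), (jump1, jump2), (jump2, jump3), (jump3, total_len)]
}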
use super::super::blocks::literals_section::{LiteralsSection, LiteralsSectionType}; use super::bit_reader_reverse::{BitReaderReversed, GetBitsError}; use super::scratch::HuffmanScratch; use crate::huff0::{HuffmanDecoder, HuffmanDecoderError, HuffmanTableError}; use alloc::vec::Vec; #[derive(Debug)] #[non_exhaustive] pub enum DecompressLiteralsError { MissingCompressedSize, MissingNumStreams, GetBitsError(GetBitsError), HuffmanTableError(HuffmanTableError), HuffmanDecoderError(HuffmanDecoderError), UninitializedHuffmanTable, MissingBytesForJumpHeader { got: usize }, MissingBytesForLiterals { got: usize, needed: usize }, ExtraPadding { skipped_bits: i32 }, BitstreamReadMismatch { read_til: isize, expected: isize }, DecodedLiteralCountMismatch { decoded: usize, expected: usize }, } #[cfg(feature = "std")] impl std::error::Error for DecompressLiteralsError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { DecompressLiteralsError::GetBitsError(source) => Some(source), DecompressLiteralsError::HuffmanTableError(source) => Some(source), DecompressLiteralsError::HuffmanDecoderError(source) => Some(source), _ => None, } } } impl core::fmt::Display for DecompressLiteralsError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { DecompressLiteralsError::MissingCompressedSize => { write!(f, "compressed size was none even though it must be set to something for compressed literals", ) } DecompressLiteralsError::MissingNumStreams => { write!(f, "num_streams was none even though it must be set to something (1 or 4) for compressed literals", ) } DecompressLiteralsError::GetBitsError(e) => write!(f, "{:?}", e), DecompressLiteralsError::HuffmanTableError(e) => write!(f, "{:?}", e), DecompressLiteralsError::HuffmanDecoderError(e) => write!(f, "{:?}", e), DecompressLiteralsError::UninitializedHuffmanTable => { write!( f, "Tried to reuse huffman table but it was never initialized", ) } DecompressLiteralsError::MissingBytesForJumpHeader { got } => { write!(f, "Need 6 bytes to decode jump header, got {} bytes", got,) } DecompressLiteralsError::MissingBytesForLiterals { got, needed } => { write!( f, "Need at least {} bytes to decode literals. Have: {} bytes", needed, got, ) } DecompressLiteralsError::ExtraPadding { skipped_bits } => { write!(f, "Padding at the end of the sequence_section was more than a byte long: {} bits. Probably caused by data corruption", skipped_bits, ) } DecompressLiteralsError::BitstreamReadMismatch { read_til, expected } => { write!( f, "Bitstream was read till: {}, should have been: {}", read_til, expected, ) } DecompressLiteralsError::DecodedLiteralCountMismatch { decoded, expected } => { write!( f, "Did not decode enough literals: {}, Should have been: {}", decoded, expected, ) } } } } impl From for DecompressLiteralsError { fn from(val: HuffmanDecoderError) -> Self { Self::HuffmanDecoderError(val) } } impl From for DecompressLiteralsError { fn from(val: GetBitsError) -> Self { Self::GetBitsError(val) } } impl From for DecompressLiteralsError { fn from(val: HuffmanTableError) -> Self { Self::HuffmanTableError(val) } } /// Decode and decompress the provided literals section into `target`, returning the number of bytes read. 
pub fn decode_literals( section: &LiteralsSection, scratch: &mut HuffmanScratch, source: &[u8], target: &mut Vec, ) -> Result { match section.ls_type { LiteralsSectionType::Raw => { target.extend(&source[0..section.regenerated_size as usize]); Ok(section.regenerated_size) } LiteralsSectionType::RLE => { target.resize(target.len() + section.regenerated_size as usize, source[0]); Ok(1) } LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => { let bytes_read = decompress_literals(section, scratch, source, target)?; //return sum of used bytes Ok(bytes_read) } } } /// Decompress the provided literals section and source into the provided `target`. /// This function is used when the literals section is `Compressed` or `Treeless` /// /// Returns the number of bytes read. fn decompress_literals( section: &LiteralsSection, scratch: &mut HuffmanScratch, source: &[u8], target: &mut Vec, ) -> Result { use DecompressLiteralsError as err; let compressed_size = section.compressed_size.ok_or(err::MissingCompressedSize)? as usize; let num_streams = section.num_streams.ok_or(err::MissingNumStreams)?; target.reserve(section.regenerated_size as usize); let source = &source[0..compressed_size]; let mut bytes_read = 0; match section.ls_type { LiteralsSectionType::Compressed => { //read Huffman tree description bytes_read += scratch.table.build_decoder(source)?; vprintln!("Built huffman table using {} bytes", bytes_read); } LiteralsSectionType::Treeless => { if scratch.table.max_num_bits == 0 { return Err(err::UninitializedHuffmanTable); } } _ => { /* nothing to do, huffman tree has been provided by previous block */ } } let source = &source[bytes_read as usize..]; if num_streams == 4 { //build jumptable if source.len() < 6 { return Err(err::MissingBytesForJumpHeader { got: source.len() }); } let jump1 = source[0] as usize + ((source[1] as usize) << 8); let jump2 = jump1 + source[2] as usize + ((source[3] as usize) << 8); let jump3 = jump2 + source[4] as usize + ((source[5] as usize) << 8); bytes_read += 6; let source = &source[6..]; if source.len() < jump3 { return Err(err::MissingBytesForLiterals { got: source.len(), needed: jump3, }); } //decode 4 streams let stream1 = &source[..jump1]; let stream2 = &source[jump1..jump2]; let stream3 = &source[jump2..jump3]; let stream4 = &source[jump3..]; for stream in &[stream1, stream2, stream3, stream4] { let mut decoder = HuffmanDecoder::new(&scratch.table); let mut br = BitReaderReversed::new(stream); //skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found let mut skipped_bits = 0; loop { let val = br.get_bits(1); skipped_bits += 1; if val == 1 || skipped_bits > 8 { break; } } if skipped_bits > 8 { //if more than 7 bits are 0, this is not the correct end of the bitstream. 
Either a bug or corrupted data return Err(DecompressLiteralsError::ExtraPadding { skipped_bits }); } decoder.init_state(&mut br); while br.bits_remaining() > -(scratch.table.max_num_bits as isize) { target.push(decoder.decode_symbol()); decoder.next_state(&mut br); } if br.bits_remaining() != -(scratch.table.max_num_bits as isize) { return Err(DecompressLiteralsError::BitstreamReadMismatch { read_til: br.bits_remaining(), expected: -(scratch.table.max_num_bits as isize), }); } } bytes_read += source.len() as u32; } else { //just decode the one stream assert!(num_streams == 1); let mut decoder = HuffmanDecoder::new(&scratch.table); let mut br = BitReaderReversed::new(source); let mut skipped_bits = 0; loop { let val = br.get_bits(1); skipped_bits += 1; if val == 1 || skipped_bits > 8 { break; } } if skipped_bits > 8 { //if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data return Err(DecompressLiteralsError::ExtraPadding { skipped_bits }); } decoder.init_state(&mut br); while br.bits_remaining() > -(scratch.table.max_num_bits as isize) { target.push(decoder.decode_symbol()); decoder.next_state(&mut br); } bytes_read += source.len() as u32; } if target.len() != section.regenerated_size as usize { return Err(DecompressLiteralsError::DecodedLiteralCountMismatch { decoded: target.len(), expected: section.regenerated_size as usize, }); } Ok(bytes_read) } ruzstd-0.7.3/src/decoding/mod.rs000064400000000000000000000005571046102023000146650ustar 00000000000000//! Structures and utilities used for reading from data, decoding that data //! and storing the output. pub mod bit_reader; pub mod bit_reader_reverse; pub mod block_decoder; pub mod decodebuffer; pub mod dictionary; pub mod literals_section_decoder; mod ringbuffer; #[allow(dead_code)] pub mod scratch; pub mod sequence_execution; pub mod sequence_section_decoder; ruzstd-0.7.3/src/decoding/ringbuffer.rs000064400000000000000000000772471046102023000162510ustar 00000000000000use alloc::alloc::{alloc, dealloc}; use core::{alloc::Layout, ptr::NonNull, slice}; pub struct RingBuffer { // Safety invariants: // // 1. // a.`buf` must be a valid allocation of capacity `cap` // b. ...unless `cap=0`, in which case it is dangling // 2. If tail≥head // a. `head..tail` must contain initialized memory. // b. Else, `head..` and `..tail` must be initialized // 3. `head` and `tail` are in bounds (≥ 0 and < cap) // 4. `tail` is never `cap` except for a full buffer, and instead uses the value `0`. In other words, `tail` always points to the place // where the next element would go (if there is space) buf: NonNull, cap: usize, head: usize, tail: usize, } // SAFETY: RingBuffer does not hold any thread specific values -> it can be sent to another thread -> RingBuffer is Send unsafe impl Send for RingBuffer {} // SAFETY: Ringbuffer does not provide unsyncronized interior mutability which makes &RingBuffer Send -> RingBuffer is Sync unsafe impl Sync for RingBuffer {} impl RingBuffer { pub fn new() -> Self { RingBuffer { // SAFETY: Upholds invariant 1a as stated buf: NonNull::dangling(), cap: 0, // SAFETY: Upholds invariant 2-4 head: 0, tail: 0, } } /// Return the number of bytes in the buffer. pub fn len(&self) -> usize { let (x, y) = self.data_slice_lengths(); x + y } /// Return the amount of available space (in bytes) of the buffer. pub fn free(&self) -> usize { let (x, y) = self.free_slice_lengths(); (x + y).saturating_sub(1) } /// Empty the buffer and reset the head and tail. 
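// Illustrative note (not part of the upstream comments): logical element `i`
// lives at physical slot `(head + i) % cap`, and one slot is always kept free
// as a sentinel (see `free()` above subtracting 1) so that `head == tail`
// unambiguously means "empty" rather than "full".
//
//     // e.g. with cap = 8, head = 6 and len = 4 the occupied slots are
//     // 6, 7, 0, 1 and get(2) reads slot (6 + 2) % 8 == 0.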
pub fn clear(&mut self) { // SAFETY: Upholds invariant 2, trivially // SAFETY: Upholds invariant 3; 0 is always valid self.head = 0; self.tail = 0; } /// Whether the buffer is empty pub fn is_empty(&self) -> bool { self.head == self.tail } /// Ensure that there's space for `amount` elements in the buffer. pub fn reserve(&mut self, amount: usize) { let free = self.free(); if free >= amount { return; } self.reserve_amortized(amount - free); } #[inline(never)] #[cold] fn reserve_amortized(&mut self, amount: usize) { // SAFETY: if we were succesfully able to construct this layout when we allocated then it's also valid do so now let current_layout = unsafe { Layout::array::(self.cap).unwrap_unchecked() }; // Always have at least 1 unused element as the sentinel. let new_cap = usize::max( self.cap.next_power_of_two(), (self.cap + amount).next_power_of_two(), ) + 1; // Check that the capacity isn't bigger than isize::MAX, which is the max allowed by LLVM, or that // we are on a >= 64 bit system which will never allow that much memory to be allocated #[allow(clippy::assertions_on_constants)] { debug_assert!(usize::BITS >= 64 || new_cap < isize::MAX as usize); } let new_layout = Layout::array::(new_cap) .unwrap_or_else(|_| panic!("Could not create layout for u8 array of size {}", new_cap)); // alloc the new memory region and panic if alloc fails // TODO maybe rework this to generate an error? let new_buf = unsafe { let new_buf = alloc(new_layout); NonNull::new(new_buf).expect("Allocating new space for the ringbuffer failed") }; // If we had data before, copy it over to the newly alloced memory region if self.cap > 0 { let ((s1_ptr, s1_len), (s2_ptr, s2_len)) = self.data_slice_parts(); unsafe { // SAFETY: Upholds invariant 2, we end up populating (0..(lenâ‚ + lenâ‚‚)) new_buf.as_ptr().copy_from_nonoverlapping(s1_ptr, s1_len); new_buf .as_ptr() .add(s1_len) .copy_from_nonoverlapping(s2_ptr, s2_len); dealloc(self.buf.as_ptr(), current_layout); } // SAFETY: Upholds invariant 3, head is 0 and in bounds, tail is only ever `cap` if the buffer // is entirely full self.tail = s1_len + s2_len; self.head = 0; } // SAFETY: Upholds invariant 1: the buffer was just allocated correctly self.buf = new_buf; self.cap = new_cap; } #[allow(dead_code)] pub fn push_back(&mut self, byte: u8) { self.reserve(1); // SAFETY: Upholds invariant 2 by writing initialized memory unsafe { self.buf.as_ptr().add(self.tail).write(byte) }; // SAFETY: Upholds invariant 3 by wrapping `tail` around self.tail = (self.tail + 1) % self.cap; } /// Fetch the byte stored at the selected index from the buffer, returning it, or /// `None` if the index is out of bounds. #[allow(dead_code)] pub fn get(&self, idx: usize) -> Option { if idx < self.len() { // SAFETY: Establishes invariants on memory being initialized and the range being in-bounds // (Invariants 2 & 3) let idx = (self.head + idx) % self.cap; Some(unsafe { self.buf.as_ptr().add(idx).read() }) } else { None } } /// Append the provided data to the end of `self`. 
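// A small usage sketch (not from the crate's own docs or tests):
//
//     let mut rb = RingBuffer::new();
//     rb.extend(b"abc");                 // len == 3
//     rb.push_back(b'd');                // len == 4
//     assert_eq!(rb.get(0), Some(b'a'));
//     rb.drop_first_n(2);                // logically discards b"ab"
//     assert_eq!(rb.get(0), Some(b'c'));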
pub fn extend(&mut self, data: &[u8]) { let len = data.len(); let ptr = data.as_ptr(); if len == 0 { return; } self.reserve(len); debug_assert!(self.len() + len <= self.cap - 1); debug_assert!(self.free() >= len, "free: {} len: {}", self.free(), len); let ((f1_ptr, f1_len), (f2_ptr, f2_len)) = self.free_slice_parts(); debug_assert!(f1_len + f2_len >= len, "{} + {} < {}", f1_len, f2_len, len); let in_f1 = usize::min(len, f1_len); let in_f2 = len - in_f1; debug_assert!(in_f1 + in_f2 == len); unsafe { // SAFETY: `in_fâ‚ + in_fâ‚‚ = len`, so this writes `len` bytes total // upholding invariant 2 if in_f1 > 0 { f1_ptr.copy_from_nonoverlapping(ptr, in_f1); } if in_f2 > 0 { f2_ptr.copy_from_nonoverlapping(ptr.add(in_f1), in_f2); } } // SAFETY: Upholds invariant 3 by wrapping `tail` around. self.tail = (self.tail + len) % self.cap; } /// Advance head past `amount` elements, effectively removing /// them from the buffer. pub fn drop_first_n(&mut self, amount: usize) { debug_assert!(amount <= self.len()); let amount = usize::min(amount, self.len()); // SAFETY: we maintain invariant 2 here since this will always lead to a smaller buffer // for amount≤len self.head = (self.head + amount) % self.cap; } /// Return the size of the two contiguous occupied sections of memory used /// by the buffer. // SAFETY: other code relies on this pointing to initialized halves of the buffer only fn data_slice_lengths(&self) -> (usize, usize) { let len_after_head; let len_to_tail; // TODO can we do this branchless? if self.tail >= self.head { len_after_head = self.tail - self.head; len_to_tail = 0; } else { len_after_head = self.cap - self.head; len_to_tail = self.tail; } (len_after_head, len_to_tail) } // SAFETY: other code relies on this pointing to initialized halves of the buffer only /// Return pointers to the head and tail, and the length of each section. fn data_slice_parts(&self) -> ((*const u8, usize), (*const u8, usize)) { let (len_after_head, len_to_tail) = self.data_slice_lengths(); ( (unsafe { self.buf.as_ptr().add(self.head) }, len_after_head), (self.buf.as_ptr(), len_to_tail), ) } /// Return references to each part of the ring buffer. pub fn as_slices(&self) -> (&[u8], &[u8]) { let (s1, s2) = self.data_slice_parts(); unsafe { // SAFETY: relies on the behavior of data_slice_parts for producing initialized memory let s1 = slice::from_raw_parts(s1.0, s1.1); let s2 = slice::from_raw_parts(s2.0, s2.1); (s1, s2) } } // SAFETY: other code relies on this producing the lengths of free zones // at the beginning/end of the buffer. Everything else must be initialized /// Returns the size of the two unoccupied sections of memory used by the buffer. fn free_slice_lengths(&self) -> (usize, usize) { let len_to_head; let len_after_tail; // TODO can we do this branchless? if self.tail < self.head { len_after_tail = self.head - self.tail; len_to_head = 0; } else { len_after_tail = self.cap - self.tail; len_to_head = self.head; } (len_to_head, len_after_tail) } /// Returns mutable references to the available space and the size of that available space, /// for the two sections in the buffer. // SAFETY: Other code relies on this pointing to the free zones, data after the first and before the second must // be valid fn free_slice_parts(&self) -> ((*mut u8, usize), (*mut u8, usize)) { let (len_to_head, len_after_tail) = self.free_slice_lengths(); ( (unsafe { self.buf.as_ptr().add(self.tail) }, len_after_tail), (self.buf.as_ptr(), len_to_head), ) } /// Copies elements from the provided range to the end of the buffer. 
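// Sketch of the match-copy pattern this enables (values are made up;
// `DecodeBuffer::repeat` uses the unchecked variant further below with the
// same arithmetic): copying starts at `len - offset` and proceeds in
// offset-sized chunks, so each chunk may re-read bytes written by the
// previous one.
//
//     let mut rb = RingBuffer::new();
//     rb.extend(b"ab");
//     rb.extend_from_within(0, 2); // "abab"   (offset 2, first chunk)
//     rb.extend_from_within(2, 2); // "ababab" (offset 2, second chunk)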
#[allow(dead_code)] pub fn extend_from_within(&mut self, start: usize, len: usize) { if start + len > self.len() { panic!( "Calls to this functions must respect start ({}) + len ({}) <= self.len() ({})!", start, len, self.len() ); } self.reserve(len); // SAFETY: Requirements checked: // 1. explicitly checked above, resulting in a panic if it does not hold // 2. explicitly reserved enough memory unsafe { self.extend_from_within_unchecked(start, len) } } /// Copies data from the provided range to the end of the buffer, without /// first verifying that the unoccupied capacity is available. /// /// SAFETY: /// For this to be safe two requirements need to hold: /// 1. start + len <= self.len() so we do not copy uninitialised memory /// 2. More then len reserved space so we do not write out-of-bounds #[warn(unsafe_op_in_unsafe_fn)] pub unsafe fn extend_from_within_unchecked(&mut self, start: usize, len: usize) { debug_assert!(start + len <= self.len()); debug_assert!(self.free() >= len); if self.head < self.tail { // Continuous source section and possibly non continuous write section: // // H T // Read: ____XXXXSSSSXXXX________ // Write: ________________DDDD____ // // H: Head position (first readable byte) // T: Tail position (first writable byte) // X: Uninvolved bytes in the readable section // S: Source bytes, to be copied to D bytes // D: Destination bytes, going to be copied from S bytes // _: Uninvolved bytes in the writable section let after_tail = usize::min(len, self.cap - self.tail); let src = ( // SAFETY: `len <= isize::MAX` and fits the memory range of `buf` unsafe { self.buf.as_ptr().add(self.head + start) }.cast_const(), // Src length (see above diagram) self.tail - self.head - start, ); let dst = ( // SAFETY: `len <= isize::MAX` and fits the memory range of `buf` unsafe { self.buf.as_ptr().add(self.tail) }, // Dst length (see above diagram) self.cap - self.tail, ); // SAFETY: `src` points at initialized data, `dst` points to writable memory // and does not overlap `src`. unsafe { copy_bytes_overshooting(src, dst, after_tail) } if after_tail < len { // The write section was not continuous: // // H T // Read: ____XXXXSSSSXXXX__ // Write: DD______________DD // // H: Head position (first readable byte) // T: Tail position (first writable byte) // X: Uninvolved bytes in the readable section // S: Source bytes, to be copied to D bytes // D: Destination bytes, going to be copied from S bytes // _: Uninvolved bytes in the writable section let src = ( // SAFETY: we are still within the memory range of `buf` unsafe { src.0.add(after_tail) }, // Src length (see above diagram) src.1 - after_tail, ); let dst = ( self.buf.as_ptr(), // Dst length overflowing (see above diagram) self.head, ); // SAFETY: `src` points at initialized data, `dst` points to writable memory // and does not overlap `src`. 
unsafe { copy_bytes_overshooting(src, dst, len - after_tail) } } } else { if self.head + start > self.cap { // Continuous read section and destination section: // // T H // Read: XXSSSSXXXX____________XX // Write: __________DDDD__________ // // H: Head position (first readable byte) // T: Tail position (first writable byte) // X: Uninvolved bytes in the readable section // S: Source bytes, to be copied to D bytes // D: Destination bytes, going to be copied from S bytes // _: Uninvolved bytes in the writable section let start = (self.head + start) % self.cap; let src = ( // SAFETY: `len <= isize::MAX` and fits the memory range of `buf` unsafe { self.buf.as_ptr().add(start) }.cast_const(), // Src length (see above diagram) self.tail - start, ); let dst = ( // SAFETY: `len <= isize::MAX` and fits the memory range of `buf` unsafe { self.buf.as_ptr().add(self.tail) }, // Dst length (see above diagram) // Dst length (see above diagram) self.head - self.tail, ); // SAFETY: `src` points at initialized data, `dst` points to writable memory // and does not overlap `src`. unsafe { copy_bytes_overshooting(src, dst, len) } } else { // Possibly non continuous read section and continuous destination section: // // T H // Read: XXXX____________XXSSSSXX // Write: ____DDDD________________ // // H: Head position (first readable byte) // T: Tail position (first writable byte) // X: Uninvolved bytes in the readable section // S: Source bytes, to be copied to D bytes // D: Destination bytes, going to be copied from S bytes // _: Uninvolved bytes in the writable section let after_start = usize::min(len, self.cap - self.head - start); let src = ( // SAFETY: `len <= isize::MAX` and fits the memory range of `buf` unsafe { self.buf.as_ptr().add(self.head + start) }.cast_const(), // Src length - chunk 1 (see above diagram on the right) self.cap - self.head - start, ); let dst = ( // SAFETY: `len <= isize::MAX` and fits the memory range of `buf` unsafe { self.buf.as_ptr().add(self.tail) }, // Dst length (see above diagram) self.head - self.tail, ); // SAFETY: `src` points at initialized data, `dst` points to writable memory // and does not overlap `src`. unsafe { copy_bytes_overshooting(src, dst, after_start) } if after_start < len { // The read section was not continuous: // // T H // Read: SSXXXXXX____________XXSS // Write: ________DDDD____________ // // H: Head position (first readable byte) // T: Tail position (first writable byte) // X: Uninvolved bytes in the readable section // S: Source bytes, to be copied to D bytes // D: Destination bytes, going to be copied from S bytes // _: Uninvolved bytes in the writable section let src = ( self.buf.as_ptr().cast_const(), // Src length - chunk 2 (see above diagram on the left) self.tail, ); let dst = ( // SAFETY: we are still within the memory range of `buf` unsafe { dst.0.add(after_start) }, // Dst length (see above diagram) dst.1 - after_start, ); // SAFETY: `src` points at initialized data, `dst` points to writable memory // and does not overlap `src`. unsafe { copy_bytes_overshooting(src, dst, len - after_start) } } } } self.tail = (self.tail + len) % self.cap; } #[allow(dead_code)] /// This function is functionally the same as [RingBuffer::extend_from_within_unchecked], /// but it does not contain any branching operations. 
/// /// SAFETY: /// Needs start + len <= self.len() /// And at least len bytes of reserved space pub unsafe fn extend_from_within_unchecked_branchless(&mut self, start: usize, len: usize) { // data slices in raw parts let ((s1_ptr, s1_len), (s2_ptr, s2_len)) = self.data_slice_parts(); debug_assert!(len <= s1_len + s2_len, "{} > {} + {}", len, s1_len, s2_len); // calc the actually wanted slices in raw parts let start_in_s1 = usize::min(s1_len, start); let end_in_s1 = usize::min(s1_len, start + len); let m1_ptr = s1_ptr.add(start_in_s1); let m1_len = end_in_s1 - start_in_s1; debug_assert!(end_in_s1 <= s1_len); debug_assert!(start_in_s1 <= s1_len); let start_in_s2 = start.saturating_sub(s1_len); let end_in_s2 = start_in_s2 + (len - m1_len); let m2_ptr = s2_ptr.add(start_in_s2); let m2_len = end_in_s2 - start_in_s2; debug_assert!(start_in_s2 <= s2_len); debug_assert!(end_in_s2 <= s2_len); debug_assert_eq!(len, m1_len + m2_len); // the free slices, must hold: f1_len + f2_len >= m1_len + m2_len let ((f1_ptr, f1_len), (f2_ptr, f2_len)) = self.free_slice_parts(); debug_assert!(f1_len + f2_len >= m1_len + m2_len); // calc how many from where bytes go where let m1_in_f1 = usize::min(m1_len, f1_len); let m1_in_f2 = m1_len - m1_in_f1; let m2_in_f1 = usize::min(f1_len - m1_in_f1, m2_len); let m2_in_f2 = m2_len - m2_in_f1; debug_assert_eq!(m1_len, m1_in_f1 + m1_in_f2); debug_assert_eq!(m2_len, m2_in_f1 + m2_in_f2); debug_assert!(f1_len >= m1_in_f1 + m2_in_f1); debug_assert!(f2_len >= m1_in_f2 + m2_in_f2); debug_assert_eq!(len, m1_in_f1 + m2_in_f1 + m1_in_f2 + m2_in_f2); debug_assert!(self.buf.as_ptr().add(self.cap) > f1_ptr.add(m1_in_f1 + m2_in_f1)); debug_assert!(self.buf.as_ptr().add(self.cap) > f2_ptr.add(m1_in_f2 + m2_in_f2)); debug_assert!((m1_in_f2 > 0) ^ (m2_in_f1 > 0) || (m1_in_f2 == 0 && m2_in_f1 == 0)); copy_with_checks( m1_ptr, m2_ptr, f1_ptr, f2_ptr, m1_in_f1, m2_in_f1, m1_in_f2, m2_in_f2, ); self.tail = (self.tail + len) % self.cap; } } impl Drop for RingBuffer { fn drop(&mut self) { if self.cap == 0 { return; } // SAFETY: if we were successfully able to construct this layout when we allocated then it's also valid to do so now // Relies on / establishes invariant 1 let current_layout = unsafe { Layout::array::<u8>(self.cap).unwrap_unchecked() }; unsafe { dealloc(self.buf.as_ptr(), current_layout); } } } /// Similar to ptr::copy_nonoverlapping /// /// But it might overshoot the desired copy length if deemed useful /// /// src and dst specify the entire length they are eligible for reading/writing respectively /// in addition to the desired copy length. /// /// This function will then copy in chunks and might copy up to chunk size - 1 more bytes from src to dst /// if that operation does not read/write memory that does not belong to src/dst. /// /// The chunk size is not part of the contract and may change depending on the target platform.
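///
/// For example (illustrative, assuming an 8 byte chunk size; `src_ptr`/`dst_ptr` are placeholders):
///
/// ```text
/// src = (src_ptr, 12)  // up to 12 bytes may be read from src_ptr
/// dst = (dst_ptr, 10)  // up to 10 bytes may be written to dst_ptr
/// copy_at_least = 5    // the caller needs at least 5 bytes copied
/// // => a single 8 byte unaligned read + write is performed, copying 8 bytes
/// ```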
/// /// If that isn't possible we just fall back to ptr::copy_nonoverlapping #[inline(always)] unsafe fn copy_bytes_overshooting( src: (*const u8, usize), dst: (*mut u8, usize), copy_at_least: usize, ) { // By default use usize as the copy size #[cfg(all(not(target_feature = "sse2"), not(target_feature = "neon")))] type CopyType = usize; // Use u128 if we detect a simd feature #[cfg(target_feature = "neon")] type CopyType = u128; #[cfg(target_feature = "sse2")] type CopyType = u128; const COPY_AT_ONCE_SIZE: usize = core::mem::size_of::<CopyType>(); let min_buffer_size = usize::min(src.1, dst.1); // Can copy in just one read+write, very common case if min_buffer_size >= COPY_AT_ONCE_SIZE && copy_at_least <= COPY_AT_ONCE_SIZE { dst.0 .cast::<CopyType>() .write_unaligned(src.0.cast::<CopyType>().read_unaligned()) } else { let copy_multiple = copy_at_least.next_multiple_of(COPY_AT_ONCE_SIZE); // Can copy in multiple simple instructions if min_buffer_size >= copy_multiple { let mut src_ptr = src.0.cast::<CopyType>(); let src_ptr_end = src.0.add(copy_multiple).cast::<CopyType>(); let mut dst_ptr = dst.0.cast::<CopyType>(); while src_ptr < src_ptr_end { dst_ptr.write_unaligned(src_ptr.read_unaligned()); src_ptr = src_ptr.add(1); dst_ptr = dst_ptr.add(1); } } else { // Fall back to standard memcopy dst.0.copy_from_nonoverlapping(src.0, copy_at_least); } } debug_assert_eq!( slice::from_raw_parts(src.0, copy_at_least), slice::from_raw_parts(dst.0, copy_at_least) ); } #[allow(dead_code)] #[inline(always)] #[allow(clippy::too_many_arguments)] unsafe fn copy_without_checks( m1_ptr: *const u8, m2_ptr: *const u8, f1_ptr: *mut u8, f2_ptr: *mut u8, m1_in_f1: usize, m2_in_f1: usize, m1_in_f2: usize, m2_in_f2: usize, ) { f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); f1_ptr .add(m1_in_f1) .copy_from_nonoverlapping(m2_ptr, m2_in_f1); f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2); f2_ptr .add(m1_in_f2) .copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2); } #[allow(dead_code)] #[inline(always)] #[allow(clippy::too_many_arguments)] unsafe fn copy_with_checks( m1_ptr: *const u8, m2_ptr: *const u8, f1_ptr: *mut u8, f2_ptr: *mut u8, m1_in_f1: usize, m2_in_f1: usize, m1_in_f2: usize, m2_in_f2: usize, ) { if m1_in_f1 != 0 { f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); } if m2_in_f1 != 0 { f1_ptr .add(m1_in_f1) .copy_from_nonoverlapping(m2_ptr, m2_in_f1); } if m1_in_f2 != 0 { f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2); } if m2_in_f2 != 0 { f2_ptr .add(m1_in_f2) .copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2); } } #[allow(dead_code)] #[inline(always)] #[allow(clippy::too_many_arguments)] unsafe fn copy_with_nobranch_check( m1_ptr: *const u8, m2_ptr: *const u8, f1_ptr: *mut u8, f2_ptr: *mut u8, m1_in_f1: usize, m2_in_f1: usize, m1_in_f2: usize, m2_in_f2: usize, ) { let case = (m1_in_f1 > 0) as usize | (((m2_in_f1 > 0) as usize) << 1) | (((m1_in_f2 > 0) as usize) << 2) | (((m2_in_f2 > 0) as usize) << 3); match case { 0 => {} // one bit set 1 => { f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); } 2 => { f1_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f1); } 4 => { f2_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f2); } 8 => { f2_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f2); } // two bit set 3 => { f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); f1_ptr .add(m1_in_f1) .copy_from_nonoverlapping(m2_ptr, m2_in_f1); } 5 => { f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2); } 6 => core::hint::unreachable_unchecked(), 7 =>
core::hint::unreachable_unchecked(), 9 => { f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); f2_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f2); } 10 => { f1_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f1); f2_ptr.copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2); } 12 => { f2_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f2); f2_ptr .add(m1_in_f2) .copy_from_nonoverlapping(m2_ptr, m2_in_f2); } // three bit set 11 => { f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); f1_ptr .add(m1_in_f1) .copy_from_nonoverlapping(m2_ptr, m2_in_f1); f2_ptr.copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2); } 13 => { f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2); f2_ptr .add(m1_in_f2) .copy_from_nonoverlapping(m2_ptr, m2_in_f2); } 14 => core::hint::unreachable_unchecked(), 15 => core::hint::unreachable_unchecked(), _ => core::hint::unreachable_unchecked(), } } #[cfg(test)] mod tests { use super::RingBuffer; #[test] fn smoke() { let mut rb = RingBuffer::new(); rb.reserve(15); assert_eq!(17, rb.cap); rb.extend(b"0123456789"); assert_eq!(rb.len(), 10); assert_eq!(rb.as_slices().0, b"0123456789"); assert_eq!(rb.as_slices().1, b""); rb.drop_first_n(5); assert_eq!(rb.len(), 5); assert_eq!(rb.as_slices().0, b"56789"); assert_eq!(rb.as_slices().1, b""); rb.extend_from_within(2, 3); assert_eq!(rb.len(), 8); assert_eq!(rb.as_slices().0, b"56789789"); assert_eq!(rb.as_slices().1, b""); rb.extend_from_within(0, 3); assert_eq!(rb.len(), 11); assert_eq!(rb.as_slices().0, b"56789789567"); assert_eq!(rb.as_slices().1, b""); rb.extend_from_within(0, 2); assert_eq!(rb.len(), 13); assert_eq!(rb.as_slices().0, b"567897895675"); assert_eq!(rb.as_slices().1, b"6"); rb.drop_first_n(11); assert_eq!(rb.len(), 2); assert_eq!(rb.as_slices().0, b"5"); assert_eq!(rb.as_slices().1, b"6"); rb.extend(b"0123456789"); assert_eq!(rb.len(), 12); assert_eq!(rb.as_slices().0, b"5"); assert_eq!(rb.as_slices().1, b"60123456789"); rb.drop_first_n(11); assert_eq!(rb.len(), 1); assert_eq!(rb.as_slices().0, b"9"); assert_eq!(rb.as_slices().1, b""); rb.extend(b"0123456789"); assert_eq!(rb.len(), 11); assert_eq!(rb.as_slices().0, b"9012345"); assert_eq!(rb.as_slices().1, b"6789"); } #[test] fn edge_cases() { // Fill exactly, then empty then fill again let mut rb = RingBuffer::new(); rb.reserve(16); assert_eq!(17, rb.cap); rb.extend(b"0123456789012345"); assert_eq!(17, rb.cap); assert_eq!(16, rb.len()); assert_eq!(0, rb.free()); rb.drop_first_n(16); assert_eq!(0, rb.len()); assert_eq!(16, rb.free()); rb.extend(b"0123456789012345"); assert_eq!(16, rb.len()); assert_eq!(0, rb.free()); assert_eq!(17, rb.cap); assert_eq!(1, rb.as_slices().0.len()); assert_eq!(15, rb.as_slices().1.len()); rb.clear(); // data in both slices and then reserve rb.extend(b"0123456789012345"); rb.drop_first_n(8); rb.extend(b"67890123"); assert_eq!(16, rb.len()); assert_eq!(0, rb.free()); assert_eq!(17, rb.cap); assert_eq!(9, rb.as_slices().0.len()); assert_eq!(7, rb.as_slices().1.len()); rb.reserve(1); assert_eq!(16, rb.len()); assert_eq!(16, rb.free()); assert_eq!(33, rb.cap); assert_eq!(16, rb.as_slices().0.len()); assert_eq!(0, rb.as_slices().1.len()); rb.clear(); // fill exactly, then extend from within rb.extend(b"0123456789012345"); rb.extend_from_within(0, 16); assert_eq!(32, rb.len()); assert_eq!(0, rb.free()); assert_eq!(33, rb.cap); assert_eq!(32, rb.as_slices().0.len()); assert_eq!(0, rb.as_slices().1.len()); // extend from within cases let mut rb = RingBuffer::new(); rb.reserve(8); 
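// Fill the freshly reserved buffer, drop most of it, then copy from within so that the
// copied bytes wrap around the physical end of the allocation; the data therefore ends
// up split across the two slices asserted below.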
rb.extend(b"01234567"); rb.drop_first_n(5); rb.extend_from_within(0, 3); assert_eq!(4, rb.as_slices().0.len()); assert_eq!(2, rb.as_slices().1.len()); rb.drop_first_n(2); assert_eq!(2, rb.as_slices().0.len()); assert_eq!(2, rb.as_slices().1.len()); rb.extend_from_within(0, 4); assert_eq!(2, rb.as_slices().0.len()); assert_eq!(6, rb.as_slices().1.len()); rb.drop_first_n(2); assert_eq!(6, rb.as_slices().0.len()); assert_eq!(0, rb.as_slices().1.len()); rb.drop_first_n(2); assert_eq!(4, rb.as_slices().0.len()); assert_eq!(0, rb.as_slices().1.len()); rb.extend_from_within(0, 4); assert_eq!(7, rb.as_slices().0.len()); assert_eq!(1, rb.as_slices().1.len()); let mut rb = RingBuffer::new(); rb.reserve(8); rb.extend(b"11111111"); rb.drop_first_n(7); rb.extend(b"111"); assert_eq!(2, rb.as_slices().0.len()); assert_eq!(2, rb.as_slices().1.len()); rb.extend_from_within(0, 4); assert_eq!(b"11", rb.as_slices().0); assert_eq!(b"111111", rb.as_slices().1); } } ruzstd-0.7.3/src/decoding/scratch.rs000064400000000000000000000071161046102023000155330ustar 00000000000000//! Structures that wrap around various decoders to make decoding easier. use super::super::blocks::sequence_section::Sequence; use super::decodebuffer::DecodeBuffer; use crate::decoding::dictionary::Dictionary; use crate::fse::FSETable; use crate::huff0::HuffmanTable; use alloc::vec::Vec; use crate::blocks::sequence_section::{ MAX_LITERAL_LENGTH_CODE, MAX_MATCH_LENGTH_CODE, MAX_OFFSET_CODE, }; /// A block level decoding buffer. pub struct DecoderScratch { /// The decoder used for Huffman blocks. pub huf: HuffmanScratch, /// The decoder used for FSE blocks. pub fse: FSEScratch, pub buffer: DecodeBuffer, pub offset_hist: [u32; 3], pub literals_buffer: Vec, pub sequences: Vec, pub block_content_buffer: Vec, } impl DecoderScratch { pub fn new(window_size: usize) -> DecoderScratch { DecoderScratch { huf: HuffmanScratch { table: HuffmanTable::new(), }, fse: FSEScratch { offsets: FSETable::new(MAX_OFFSET_CODE), of_rle: None, literal_lengths: FSETable::new(MAX_LITERAL_LENGTH_CODE), ll_rle: None, match_lengths: FSETable::new(MAX_MATCH_LENGTH_CODE), ml_rle: None, }, buffer: DecodeBuffer::new(window_size), offset_hist: [1, 4, 8], block_content_buffer: Vec::new(), literals_buffer: Vec::new(), sequences: Vec::new(), } } pub fn reset(&mut self, window_size: usize) { self.offset_hist = [1, 4, 8]; self.literals_buffer.clear(); self.sequences.clear(); self.block_content_buffer.clear(); self.buffer.reset(window_size); self.fse.literal_lengths.reset(); self.fse.match_lengths.reset(); self.fse.offsets.reset(); self.fse.ll_rle = None; self.fse.ml_rle = None; self.fse.of_rle = None; self.huf.table.reset(); } pub fn init_from_dict(&mut self, dict: &Dictionary) { self.fse.reinit_from(&dict.fse); self.huf.table.reinit_from(&dict.huf.table); self.offset_hist = dict.offset_hist; self.buffer.dict_content.clear(); self.buffer .dict_content .extend_from_slice(&dict.dict_content); } } pub struct HuffmanScratch { pub table: HuffmanTable, } impl HuffmanScratch { pub fn new() -> HuffmanScratch { HuffmanScratch { table: HuffmanTable::new(), } } } impl Default for HuffmanScratch { fn default() -> Self { Self::new() } } pub struct FSEScratch { pub offsets: FSETable, pub of_rle: Option, pub literal_lengths: FSETable, pub ll_rle: Option, pub match_lengths: FSETable, pub ml_rle: Option, } impl FSEScratch { pub fn new() -> FSEScratch { FSEScratch { offsets: FSETable::new(MAX_OFFSET_CODE), of_rle: None, literal_lengths: FSETable::new(MAX_LITERAL_LENGTH_CODE), ll_rle: None, 
match_lengths: FSETable::new(MAX_MATCH_LENGTH_CODE), ml_rle: None, } } pub fn reinit_from(&mut self, other: &Self) { self.offsets.reinit_from(&other.offsets); self.literal_lengths.reinit_from(&other.literal_lengths); self.match_lengths.reinit_from(&other.match_lengths); self.of_rle = other.of_rle; self.ll_rle = other.ll_rle; self.ml_rle = other.ml_rle; } } impl Default for FSEScratch { fn default() -> Self { Self::new() } } ruzstd-0.7.3/src/decoding/sequence_execution.rs000064400000000000000000000114101046102023000177670ustar 00000000000000use super::{decodebuffer::DecodeBufferError, scratch::DecoderScratch}; #[derive(Debug)] #[non_exhaustive] pub enum ExecuteSequencesError { DecodebufferError(DecodeBufferError), NotEnoughBytesForSequence { wanted: usize, have: usize }, ZeroOffset, } impl core::fmt::Display for ExecuteSequencesError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { ExecuteSequencesError::DecodebufferError(e) => { write!(f, "{:?}", e) } ExecuteSequencesError::NotEnoughBytesForSequence { wanted, have } => { write!( f, "Sequence wants to copy up to byte {}. Bytes in literalsbuffer: {}", wanted, have ) } ExecuteSequencesError::ZeroOffset => { write!(f, "Illegal offset: 0 found") } } } } #[cfg(feature = "std")] impl std::error::Error for ExecuteSequencesError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { ExecuteSequencesError::DecodebufferError(source) => Some(source), _ => None, } } } impl From for ExecuteSequencesError { fn from(val: DecodeBufferError) -> Self { Self::DecodebufferError(val) } } /// Take the provided decoder and execute the sequences stored within pub fn execute_sequences(scratch: &mut DecoderScratch) -> Result<(), ExecuteSequencesError> { let mut literals_copy_counter = 0; let old_buffer_size = scratch.buffer.len(); let mut seq_sum = 0; for idx in 0..scratch.sequences.len() { let seq = scratch.sequences[idx]; if seq.ll > 0 { let high = literals_copy_counter + seq.ll as usize; if high > scratch.literals_buffer.len() { return Err(ExecuteSequencesError::NotEnoughBytesForSequence { wanted: high, have: scratch.literals_buffer.len(), }); } let literals = &scratch.literals_buffer[literals_copy_counter..high]; literals_copy_counter += seq.ll as usize; scratch.buffer.push(literals); } let actual_offset = do_offset_history(seq.of, seq.ll, &mut scratch.offset_hist); if actual_offset == 0 { return Err(ExecuteSequencesError::ZeroOffset); } if seq.ml > 0 { scratch .buffer .repeat(actual_offset as usize, seq.ml as usize)?; } seq_sum += seq.ml; seq_sum += seq.ll; } if literals_copy_counter < scratch.literals_buffer.len() { let rest_literals = &scratch.literals_buffer[literals_copy_counter..]; scratch.buffer.push(rest_literals); seq_sum += rest_literals.len() as u32; } let diff = scratch.buffer.len() - old_buffer_size; assert!( seq_sum as usize == diff, "Seq_sum: {} is different from the difference in buffersize: {}", seq_sum, diff ); Ok(()) } /// Update the most recently used offsets to reflect the provided offset value, and return the /// "actual" offset needed because offsets are not stored in a raw way, some transformations are needed /// before you get a functional number. 
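///
/// The `offset_value` passed in is the raw value decoded from the bitstream: for an offset
/// code `N` the sequence decoders in sequence_section_decoder.rs read `N` extra bits and
/// compute `(1 << N) + extra_bits`.
///
/// Two independent illustrative examples, each starting from the initial history `[1, 4, 8]`
/// used by this crate: an `offset_value` of 2 together with a non-zero literal length selects
/// the second most recent offset, so this returns 4 and the history becomes `[4, 1, 8]`;
/// an `offset_value` of 5 is a "new" offset and decodes to `5 - 3 = 2`, pushing the older
/// entries back so the history becomes `[2, 1, 4]`.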
fn do_offset_history(offset_value: u32, lit_len: u32, scratch: &mut [u32; 3]) -> u32 { let actual_offset = if lit_len > 0 { match offset_value { 1..=3 => scratch[offset_value as usize - 1], _ => { //new offset offset_value - 3 } } } else { match offset_value { 1..=2 => scratch[offset_value as usize], 3 => scratch[0] - 1, _ => { //new offset offset_value - 3 } } }; //update history if lit_len > 0 { match offset_value { 1 => { //nothing } 2 => { scratch[1] = scratch[0]; scratch[0] = actual_offset; } _ => { scratch[2] = scratch[1]; scratch[1] = scratch[0]; scratch[0] = actual_offset; } } } else { match offset_value { 1 => { scratch[1] = scratch[0]; scratch[0] = actual_offset; } 2 => { scratch[2] = scratch[1]; scratch[1] = scratch[0]; scratch[0] = actual_offset; } _ => { scratch[2] = scratch[1]; scratch[1] = scratch[0]; scratch[0] = actual_offset; } } } actual_offset } ruzstd-0.7.3/src/decoding/sequence_section_decoder.rs000064400000000000000000000460041046102023000211240ustar 00000000000000use super::super::blocks::sequence_section::ModeType; use super::super::blocks::sequence_section::Sequence; use super::super::blocks::sequence_section::SequencesHeader; use super::bit_reader_reverse::{BitReaderReversed, GetBitsError}; use super::scratch::FSEScratch; use crate::blocks::sequence_section::{ MAX_LITERAL_LENGTH_CODE, MAX_MATCH_LENGTH_CODE, MAX_OFFSET_CODE, }; use crate::fse::{FSEDecoder, FSEDecoderError, FSETableError}; use alloc::vec::Vec; #[derive(Debug)] #[non_exhaustive] pub enum DecodeSequenceError { GetBitsError(GetBitsError), FSEDecoderError(FSEDecoderError), FSETableError(FSETableError), ExtraPadding { skipped_bits: i32 }, UnsupportedOffset { offset_code: u8 }, ZeroOffset, NotEnoughBytesForNumSequences, ExtraBits { bits_remaining: isize }, MissingCompressionMode, MissingByteForRleLlTable, MissingByteForRleOfTable, MissingByteForRleMlTable, } #[cfg(feature = "std")] impl std::error::Error for DecodeSequenceError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { DecodeSequenceError::GetBitsError(source) => Some(source), DecodeSequenceError::FSEDecoderError(source) => Some(source), DecodeSequenceError::FSETableError(source) => Some(source), _ => None, } } } impl core::fmt::Display for DecodeSequenceError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { DecodeSequenceError::GetBitsError(e) => write!(f, "{:?}", e), DecodeSequenceError::FSEDecoderError(e) => write!(f, "{:?}", e), DecodeSequenceError::FSETableError(e) => write!(f, "{:?}", e), DecodeSequenceError::ExtraPadding { skipped_bits } => { write!(f, "Padding at the end of the sequence_section was more than a byte long: {} bits. Probably caused by data corruption", skipped_bits, ) } DecodeSequenceError::UnsupportedOffset { offset_code } => { write!( f, "Do not support offsets bigger than 1<<32; got: {}", offset_code, ) } DecodeSequenceError::ZeroOffset => write!( f, "Read an offset == 0. 
That is an illegal value for offsets" ), DecodeSequenceError::NotEnoughBytesForNumSequences => write!( f, "Bytestream did not contain enough bytes to decode num_sequences" ), DecodeSequenceError::ExtraBits { bits_remaining } => write!(f, "{}", bits_remaining), DecodeSequenceError::MissingCompressionMode => write!( f, "compression modes are none but they must be set to something" ), DecodeSequenceError::MissingByteForRleLlTable => { write!(f, "Need a byte to read for RLE ll table") } DecodeSequenceError::MissingByteForRleOfTable => { write!(f, "Need a byte to read for RLE of table") } DecodeSequenceError::MissingByteForRleMlTable => { write!(f, "Need a byte to read for RLE ml table") } } } } impl From for DecodeSequenceError { fn from(val: GetBitsError) -> Self { Self::GetBitsError(val) } } impl From for DecodeSequenceError { fn from(val: FSETableError) -> Self { Self::FSETableError(val) } } impl From for DecodeSequenceError { fn from(val: FSEDecoderError) -> Self { Self::FSEDecoderError(val) } } /// Decode the provided source as a series of sequences into the supplied `target`. pub fn decode_sequences( section: &SequencesHeader, source: &[u8], scratch: &mut FSEScratch, target: &mut Vec, ) -> Result<(), DecodeSequenceError> { let bytes_read = maybe_update_fse_tables(section, source, scratch)?; vprintln!("Updating tables used {} bytes", bytes_read); let bit_stream = &source[bytes_read..]; let mut br = BitReaderReversed::new(bit_stream); //skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found let mut skipped_bits = 0; loop { let val = br.get_bits(1); skipped_bits += 1; if val == 1 || skipped_bits > 8 { break; } } if skipped_bits > 8 { //if more than 7 bits are 0, this is not the correct end of the bitstream. 
Either a bug or corrupted data return Err(DecodeSequenceError::ExtraPadding { skipped_bits }); } if scratch.ll_rle.is_some() || scratch.ml_rle.is_some() || scratch.of_rle.is_some() { decode_sequences_with_rle(section, &mut br, scratch, target) } else { decode_sequences_without_rle(section, &mut br, scratch, target) } } fn decode_sequences_with_rle( section: &SequencesHeader, br: &mut BitReaderReversed<'_>, scratch: &FSEScratch, target: &mut Vec, ) -> Result<(), DecodeSequenceError> { let mut ll_dec = FSEDecoder::new(&scratch.literal_lengths); let mut ml_dec = FSEDecoder::new(&scratch.match_lengths); let mut of_dec = FSEDecoder::new(&scratch.offsets); if scratch.ll_rle.is_none() { ll_dec.init_state(br)?; } if scratch.of_rle.is_none() { of_dec.init_state(br)?; } if scratch.ml_rle.is_none() { ml_dec.init_state(br)?; } target.clear(); target.reserve(section.num_sequences as usize); for _seq_idx in 0..section.num_sequences { //get the codes from either the RLE byte or from the decoder let ll_code = if scratch.ll_rle.is_some() { scratch.ll_rle.unwrap() } else { ll_dec.decode_symbol() }; let ml_code = if scratch.ml_rle.is_some() { scratch.ml_rle.unwrap() } else { ml_dec.decode_symbol() }; let of_code = if scratch.of_rle.is_some() { scratch.of_rle.unwrap() } else { of_dec.decode_symbol() }; let (ll_value, ll_num_bits) = lookup_ll_code(ll_code); let (ml_value, ml_num_bits) = lookup_ml_code(ml_code); //println!("Sequence: {}", i); //println!("of stat: {}", of_dec.state); //println!("of Code: {}", of_code); //println!("ll stat: {}", ll_dec.state); //println!("ll bits: {}", ll_num_bits); //println!("ll Code: {}", ll_value); //println!("ml stat: {}", ml_dec.state); //println!("ml bits: {}", ml_num_bits); //println!("ml Code: {}", ml_value); //println!(""); if of_code > MAX_OFFSET_CODE { return Err(DecodeSequenceError::UnsupportedOffset { offset_code: of_code, }); } let (obits, ml_add, ll_add) = br.get_bits_triple(of_code, ml_num_bits, ll_num_bits); let offset = obits as u32 + (1u32 << of_code); if offset == 0 { return Err(DecodeSequenceError::ZeroOffset); } target.push(Sequence { ll: ll_value + ll_add as u32, ml: ml_value + ml_add as u32, of: offset, }); if target.len() < section.num_sequences as usize { //println!( // "Bits left: {} ({} bytes)", // br.bits_remaining(), // br.bits_remaining() / 8, //); if scratch.ll_rle.is_none() { ll_dec.update_state(br); } if scratch.ml_rle.is_none() { ml_dec.update_state(br); } if scratch.of_rle.is_none() { of_dec.update_state(br); } } if br.bits_remaining() < 0 { return Err(DecodeSequenceError::NotEnoughBytesForNumSequences); } } if br.bits_remaining() > 0 { Err(DecodeSequenceError::ExtraBits { bits_remaining: br.bits_remaining(), }) } else { Ok(()) } } fn decode_sequences_without_rle( section: &SequencesHeader, br: &mut BitReaderReversed<'_>, scratch: &FSEScratch, target: &mut Vec, ) -> Result<(), DecodeSequenceError> { let mut ll_dec = FSEDecoder::new(&scratch.literal_lengths); let mut ml_dec = FSEDecoder::new(&scratch.match_lengths); let mut of_dec = FSEDecoder::new(&scratch.offsets); ll_dec.init_state(br)?; of_dec.init_state(br)?; ml_dec.init_state(br)?; target.clear(); target.reserve(section.num_sequences as usize); for _seq_idx in 0..section.num_sequences { let ll_code = ll_dec.decode_symbol(); let ml_code = ml_dec.decode_symbol(); let of_code = of_dec.decode_symbol(); let (ll_value, ll_num_bits) = lookup_ll_code(ll_code); let (ml_value, ml_num_bits) = lookup_ml_code(ml_code); if of_code > MAX_OFFSET_CODE { return Err(DecodeSequenceError::UnsupportedOffset { 
offset_code: of_code, }); } let (obits, ml_add, ll_add) = br.get_bits_triple(of_code, ml_num_bits, ll_num_bits); let offset = obits as u32 + (1u32 << of_code); if offset == 0 { return Err(DecodeSequenceError::ZeroOffset); } target.push(Sequence { ll: ll_value + ll_add as u32, ml: ml_value + ml_add as u32, of: offset, }); if target.len() < section.num_sequences as usize { //println!( // "Bits left: {} ({} bytes)", // br.bits_remaining(), // br.bits_remaining() / 8, //); ll_dec.update_state(br); ml_dec.update_state(br); of_dec.update_state(br); } if br.bits_remaining() < 0 { return Err(DecodeSequenceError::NotEnoughBytesForNumSequences); } } if br.bits_remaining() > 0 { Err(DecodeSequenceError::ExtraBits { bits_remaining: br.bits_remaining(), }) } else { Ok(()) } } /// Look up the provided state value from a literal length table predefined /// by the Zstandard reference document. Returns a tuple of (value, number of bits). /// /// fn lookup_ll_code(code: u8) -> (u32, u8) { match code { 0..=15 => (u32::from(code), 0), 16 => (16, 1), 17 => (18, 1), 18 => (20, 1), 19 => (22, 1), 20 => (24, 2), 21 => (28, 2), 22 => (32, 3), 23 => (40, 3), 24 => (48, 4), 25 => (64, 6), 26 => (128, 7), 27 => (256, 8), 28 => (512, 9), 29 => (1024, 10), 30 => (2048, 11), 31 => (4096, 12), 32 => (8192, 13), 33 => (16384, 14), 34 => (32768, 15), 35 => (65536, 16), _ => unreachable!("Illegal literal length code was: {}", code), } } /// Look up the provided state value from a match length table predefined /// by the Zstandard reference document. Returns a tuple of (value, number of bits). /// /// fn lookup_ml_code(code: u8) -> (u32, u8) { match code { 0..=31 => (u32::from(code) + 3, 0), 32 => (35, 1), 33 => (37, 1), 34 => (39, 1), 35 => (41, 1), 36 => (43, 2), 37 => (47, 2), 38 => (51, 3), 39 => (59, 3), 40 => (67, 4), 41 => (83, 4), 42 => (99, 5), 43 => (131, 7), 44 => (259, 8), 45 => (515, 9), 46 => (1027, 10), 47 => (2051, 11), 48 => (4099, 12), 49 => (8195, 13), 50 => (16387, 14), 51 => (32771, 15), 52 => (65539, 16), _ => unreachable!("Illegal match length code was: {}", code), } } // This info is buried in the symbol compression mode table /// "The maximum allowed accuracy log for literals length and match length tables is 9" pub const LL_MAX_LOG: u8 = 9; /// "The maximum allowed accuracy log for literals length and match length tables is 9" pub const ML_MAX_LOG: u8 = 9; /// "The maximum accuracy log for the offset table is 8." 
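/// (an accuracy log of 8 corresponds to an FSE decoding table with `1 << 8` = 256 states)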
pub const OF_MAX_LOG: u8 = 8; fn maybe_update_fse_tables( section: &SequencesHeader, source: &[u8], scratch: &mut FSEScratch, ) -> Result { let modes = section .modes .ok_or(DecodeSequenceError::MissingCompressionMode)?; let mut bytes_read = 0; match modes.ll_mode() { ModeType::FSECompressed => { let bytes = scratch.literal_lengths.build_decoder(source, LL_MAX_LOG)?; bytes_read += bytes; vprintln!("Updating ll table"); vprintln!("Used bytes: {}", bytes); scratch.ll_rle = None; } ModeType::RLE => { vprintln!("Use RLE ll table"); if source.is_empty() { return Err(DecodeSequenceError::MissingByteForRleLlTable); } bytes_read += 1; if source[0] > MAX_LITERAL_LENGTH_CODE { return Err(DecodeSequenceError::MissingByteForRleMlTable); } scratch.ll_rle = Some(source[0]); } ModeType::Predefined => { vprintln!("Use predefined ll table"); scratch.literal_lengths.build_from_probabilities( LL_DEFAULT_ACC_LOG, &Vec::from(&LITERALS_LENGTH_DEFAULT_DISTRIBUTION[..]), )?; scratch.ll_rle = None; } ModeType::Repeat => { vprintln!("Repeat ll table"); /* Nothing to do */ } }; let of_source = &source[bytes_read..]; match modes.of_mode() { ModeType::FSECompressed => { let bytes = scratch.offsets.build_decoder(of_source, OF_MAX_LOG)?; vprintln!("Updating of table"); vprintln!("Used bytes: {}", bytes); bytes_read += bytes; scratch.of_rle = None; } ModeType::RLE => { vprintln!("Use RLE of table"); if of_source.is_empty() { return Err(DecodeSequenceError::MissingByteForRleOfTable); } bytes_read += 1; if of_source[0] > MAX_OFFSET_CODE { return Err(DecodeSequenceError::MissingByteForRleMlTable); } scratch.of_rle = Some(of_source[0]); } ModeType::Predefined => { vprintln!("Use predefined of table"); scratch.offsets.build_from_probabilities( OF_DEFAULT_ACC_LOG, &Vec::from(&OFFSET_DEFAULT_DISTRIBUTION[..]), )?; scratch.of_rle = None; } ModeType::Repeat => { vprintln!("Repeat of table"); /* Nothing to do */ } }; let ml_source = &source[bytes_read..]; match modes.ml_mode() { ModeType::FSECompressed => { let bytes = scratch.match_lengths.build_decoder(ml_source, ML_MAX_LOG)?; bytes_read += bytes; vprintln!("Updating ml table"); vprintln!("Used bytes: {}", bytes); scratch.ml_rle = None; } ModeType::RLE => { vprintln!("Use RLE ml table"); if ml_source.is_empty() { return Err(DecodeSequenceError::MissingByteForRleMlTable); } bytes_read += 1; if ml_source[0] > MAX_MATCH_LENGTH_CODE { return Err(DecodeSequenceError::MissingByteForRleMlTable); } scratch.ml_rle = Some(ml_source[0]); } ModeType::Predefined => { vprintln!("Use predefined ml table"); scratch.match_lengths.build_from_probabilities( ML_DEFAULT_ACC_LOG, &Vec::from(&MATCH_LENGTH_DEFAULT_DISTRIBUTION[..]), )?; scratch.ml_rle = None; } ModeType::Repeat => { vprintln!("Repeat ml table"); /* Nothing to do */ } }; Ok(bytes_read) } // The default Literal Length decoding table uses an accuracy logarithm of 6 bits. const LL_DEFAULT_ACC_LOG: u8 = 6; /// If [ModeType::Predefined] is selected for a symbol type, its FSE decoding /// table is generated using a predefined distribution table. /// /// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals-length const LITERALS_LENGTH_DEFAULT_DISTRIBUTION: [i32; 36] = [ 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, -1, -1, -1, -1, ]; // The default Match Length decoding table uses an accuracy logarithm of 6 bits. 
const ML_DEFAULT_ACC_LOG: u8 = 6; /// If [ModeType::Predefined] is selected for a symbol type, its FSE decoding /// table is generated using a predefined distribution table. /// /// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#match-length const MATCH_LENGTH_DEFAULT_DISTRIBUTION: [i32; 53] = [ 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, ]; // The default Match Length decoding table uses an accuracy logarithm of 5 bits. const OF_DEFAULT_ACC_LOG: u8 = 5; /// If [ModeType::Predefined] is selected for a symbol type, its FSE decoding /// table is generated using a predefined distribution table. /// /// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#match-length const OFFSET_DEFAULT_DISTRIBUTION: [i32; 29] = [ 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, ]; #[test] fn test_ll_default() { let mut table = crate::fse::FSETable::new(MAX_LITERAL_LENGTH_CODE); table .build_from_probabilities( LL_DEFAULT_ACC_LOG, &Vec::from(&LITERALS_LENGTH_DEFAULT_DISTRIBUTION[..]), ) .unwrap(); #[cfg(feature = "std")] for idx in 0..table.decode.len() { std::println!( "{:3}: {:3} {:3} {:3}", idx, table.decode[idx].symbol, table.decode[idx].num_bits, table.decode[idx].base_line ); } assert!(table.decode.len() == 64); //just test a few values. TODO test all values assert!(table.decode[0].symbol == 0); assert!(table.decode[0].num_bits == 4); assert!(table.decode[0].base_line == 0); assert!(table.decode[19].symbol == 27); assert!(table.decode[19].num_bits == 6); assert!(table.decode[19].base_line == 0); assert!(table.decode[39].symbol == 25); assert!(table.decode[39].num_bits == 4); assert!(table.decode[39].base_line == 16); assert!(table.decode[60].symbol == 35); assert!(table.decode[60].num_bits == 6); assert!(table.decode[60].base_line == 0); assert!(table.decode[59].symbol == 24); assert!(table.decode[59].num_bits == 5); assert!(table.decode[59].base_line == 32); } ruzstd-0.7.3/src/frame.rs000064400000000000000000000344251046102023000134250ustar 00000000000000use crate::io::{Error, Read}; use core::fmt; #[cfg(feature = "std")] use std::error::Error as StdError; /// This magic number is included at the start of a single Zstandard frame pub const MAGIC_NUM: u32 = 0xFD2F_B528; /// The minimum window size is defined as 1 KB pub const MIN_WINDOW_SIZE: u64 = 1024; /// The maximum window size is 3.75TB pub const MAX_WINDOW_SIZE: u64 = (1 << 41) + 7 * (1 << 38); /// Zstandard compressed data is made of one or more [Frame]s. Each frame is independent and can be /// decompressed independently of other frames. /// /// There are two frame formats defined by Zstandard: Zstandard frames and Skippable frames. /// Zstandard frames contain compressed data, while skippable frames contain custom user metadata. /// /// This structure contains the header of the frame. /// /// pub struct Frame { pub header: FrameHeader, } /// A frame header has a variable size, with a minimum of 2 bytes, and a maximum of 14 bytes. pub struct FrameHeader { pub descriptor: FrameDescriptor, /// The `Window_Descriptor` field contains the minimum size of a memory buffer needed to /// decompress the entire frame. /// /// This byte is not included in the frame header when the `Single_Segment_flag` is set. /// /// Bits 7-3 refer to the `Exponent`, where bits 2-0 refer to the `Mantissa`. 
/// /// To determine the size of a window, the following formula can be used: /// ```text /// windowLog = 10 + Exponent; /// windowBase = 1 << windowLog; /// windowAdd = (windowBase / 8) * Mantissa; /// Window_Size = windowBase + windowAdd; /// ``` /// window_descriptor: u8, /// The `Dictionary_ID` field contains the ID of the dictionary to be used to decode the frame. /// When this value is not present, it's up to the decoder to know which dictionary to use. dict_id: Option, /// The size of the original/uncompressed content. frame_content_size: u64, } /// The first byte is called the `Frame Header Descriptor`, and it describes what other fields /// are present. pub struct FrameDescriptor(u8); #[derive(Debug)] #[non_exhaustive] pub enum FrameDescriptorError { InvalidFrameContentSizeFlag { got: u8 }, } impl fmt::Display for FrameDescriptorError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::InvalidFrameContentSizeFlag { got } => write!( f, "Invalid Frame_Content_Size_Flag; Is: {}, Should be one of: 0, 1, 2, 3", got ), } } } #[cfg(feature = "std")] impl StdError for FrameDescriptorError {} impl FrameDescriptor { /// Read the `Frame_Content_Size_flag` from the frame header descriptor. /// /// This is a 2 bit flag, specifying if the `Frame_Content_Size` field is present /// within the header. It notates the number of bytes used by `Frame_Content_size` /// /// When this value is is 0, `FCS_Field_Size` depends on Single_Segment_flag. /// If the `Single_Segment_flag` field is set in the frame header descriptor, /// the size of the `Frame_Content_Size` field of the header is 1 byte. /// Otherwise, `FCS_Field_Size` is 0, and the `Frame_Content_Size` is not provided. /// /// | Flag Value (decimal) | Size of the `Frame_Content_Size` field in bytes | /// | -- | -- | /// | 0 | 0 or 1 (see above) | /// | 1 | 2 | /// | 2 | 4 | /// | 3 | 8 | pub fn frame_content_size_flag(&self) -> u8 { self.0 >> 6 } /// This bit is reserved for some future feature, a compliant decoder **must ensure** /// that this value is set to zero. pub fn reserved_flag(&self) -> bool { ((self.0 >> 3) & 0x1) == 1 } /// If this flag is set, data must be regenerated within a single continuous memory segment. /// /// In this case, the `Window_Descriptor` byte is skipped, but `Frame_Content_Size` is present. /// The decoder must allocate a memory segment equal to or larger than `Frame_Content_Size`. pub fn single_segment_flag(&self) -> bool { ((self.0 >> 5) & 0x1) == 1 } /// If this flag is set, a 32 bit `Content_Checksum` will be present at the end of the frame. pub fn content_checksum_flag(&self) -> bool { ((self.0 >> 2) & 0x1) == 1 } /// This is a two bit flag telling if a dictionary ID is provided within the header. It also /// specifies the size of this field /// /// | Value (Decimal) | `DID_Field_Size` (bytes) | /// | -- | -- | /// | 0 | 0 | /// | 1 | 1 | /// | 2 | 2 | /// | 3 | 4 | pub fn dict_id_flag(&self) -> u8 { self.0 & 0x3 } /// Read the size of the `Frame_Content_size` field from the frame header descriptor, returning /// the size in bytes. /// If this value is zero, then the `Frame_Content_Size` field is not present within the header. 
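///
/// For example (illustrative value): a frame header descriptor byte of `0b1000_0000` has a
/// `Frame_Content_Size_flag` of 2, so the `Frame_Content_Size` field occupies 4 bytes.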
pub fn frame_content_size_bytes(&self) -> Result { match self.frame_content_size_flag() { 0 => { if self.single_segment_flag() { Ok(1) } else { Ok(0) } } 1 => Ok(2), 2 => Ok(4), 3 => Ok(8), other => Err(FrameDescriptorError::InvalidFrameContentSizeFlag { got: other }), } } /// Read the size of the `Dictionary_ID` field from the frame header descriptor, returning the size in bytes. /// If this value is zero, then the dictionary id is not present within the header, /// and "It's up to the decoder to know which dictionary to use." pub fn dictionary_id_bytes(&self) -> Result { match self.dict_id_flag() { 0 => Ok(0), 1 => Ok(1), 2 => Ok(2), 3 => Ok(4), other => Err(FrameDescriptorError::InvalidFrameContentSizeFlag { got: other }), } } } #[derive(Debug)] #[non_exhaustive] pub enum FrameHeaderError { WindowTooBig { got: u64 }, WindowTooSmall { got: u64 }, FrameDescriptorError(FrameDescriptorError), DictIdTooSmall { got: usize, expected: usize }, MismatchedFrameSize { got: usize, expected: u8 }, FrameSizeIsZero, InvalidFrameSize { got: u8 }, } impl fmt::Display for FrameHeaderError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::WindowTooBig { got } => write!( f, "window_size bigger than allowed maximum. Is: {}, Should be lower than: {}", got, MAX_WINDOW_SIZE ), Self::WindowTooSmall { got } => write!( f, "window_size smaller than allowed minimum. Is: {}, Should be greater than: {}", got, MIN_WINDOW_SIZE ), Self::FrameDescriptorError(e) => write!(f, "{:?}", e), Self::DictIdTooSmall { got, expected } => write!( f, "Not enough bytes in dict_id. Is: {}, Should be: {}", got, expected ), Self::MismatchedFrameSize { got, expected } => write!( f, "frame_content_size does not have the right length. Is: {}, Should be: {}", got, expected ), Self::FrameSizeIsZero => write!(f, "frame_content_size was zero"), Self::InvalidFrameSize { got } => write!( f, "Invalid frame_content_size. Is: {}, Should be one of 1, 2, 4, 8 bytes", got ), } } } #[cfg(feature = "std")] impl StdError for FrameHeaderError { fn source(&self) -> Option<&(dyn StdError + 'static)> { match self { FrameHeaderError::FrameDescriptorError(source) => Some(source), _ => None, } } } impl From for FrameHeaderError { fn from(error: FrameDescriptorError) -> Self { Self::FrameDescriptorError(error) } } impl FrameHeader { /// Read the size of the window from the header, returning the size in bytes. pub fn window_size(&self) -> Result { if self.descriptor.single_segment_flag() { Ok(self.frame_content_size()) } else { let exp = self.window_descriptor >> 3; let mantissa = self.window_descriptor & 0x7; let window_log = 10 + u64::from(exp); let window_base = 1 << window_log; let window_add = (window_base / 8) * u64::from(mantissa); let window_size = window_base + window_add; if window_size >= MIN_WINDOW_SIZE { if window_size < MAX_WINDOW_SIZE { Ok(window_size) } else { Err(FrameHeaderError::WindowTooBig { got: window_size }) } } else { Err(FrameHeaderError::WindowTooSmall { got: window_size }) } } } /// The ID (if provided) of the dictionary required to decode this frame. pub fn dictionary_id(&self) -> Option { self.dict_id } /// Obtain the uncompressed size (in bytes) of the frame contents. 
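///
/// Returns 0 if the frame header did not include a `Frame_Content_Size` field.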
pub fn frame_content_size(&self) -> u64 { self.frame_content_size } } #[derive(Debug)] #[non_exhaustive] pub enum ReadFrameHeaderError { MagicNumberReadError(Error), BadMagicNumber(u32), FrameDescriptorReadError(Error), InvalidFrameDescriptor(FrameDescriptorError), WindowDescriptorReadError(Error), DictionaryIdReadError(Error), FrameContentSizeReadError(Error), SkipFrame { magic_number: u32, length: u32 }, } impl fmt::Display for ReadFrameHeaderError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::MagicNumberReadError(e) => write!(f, "Error while reading magic number: {}", e), Self::BadMagicNumber(e) => write!(f, "Read wrong magic number: 0x{:X}", e), Self::FrameDescriptorReadError(e) => { write!(f, "Error while reading frame descriptor: {}", e) } Self::InvalidFrameDescriptor(e) => write!(f, "{:?}", e), Self::WindowDescriptorReadError(e) => { write!(f, "Error while reading window descriptor: {}", e) } Self::DictionaryIdReadError(e) => write!(f, "Error while reading dictionary id: {}", e), Self::FrameContentSizeReadError(e) => { write!(f, "Error while reading frame content size: {}", e) } Self::SkipFrame { magic_number, length, } => write!( f, "SkippableFrame encountered with MagicNumber 0x{:X} and length {} bytes", magic_number, length ), } } } #[cfg(feature = "std")] impl StdError for ReadFrameHeaderError { fn source(&self) -> Option<&(dyn StdError + 'static)> { match self { ReadFrameHeaderError::MagicNumberReadError(source) => Some(source), ReadFrameHeaderError::FrameDescriptorReadError(source) => Some(source), ReadFrameHeaderError::InvalidFrameDescriptor(source) => Some(source), ReadFrameHeaderError::WindowDescriptorReadError(source) => Some(source), ReadFrameHeaderError::DictionaryIdReadError(source) => Some(source), ReadFrameHeaderError::FrameContentSizeReadError(source) => Some(source), _ => None, } } } impl From for ReadFrameHeaderError { fn from(error: FrameDescriptorError) -> Self { Self::InvalidFrameDescriptor(error) } } /// Read a single serialized frame from the reader and return a tuple containing the parsed frame and the number of bytes read. pub fn read_frame_header(mut r: impl Read) -> Result<(Frame, u8), ReadFrameHeaderError> { use ReadFrameHeaderError as err; let mut buf = [0u8; 4]; r.read_exact(&mut buf).map_err(err::MagicNumberReadError)?; let mut bytes_read = 4; let magic_num = u32::from_le_bytes(buf); // Skippable frames have a magic number in this interval if (0x184D2A50..=0x184D2A5F).contains(&magic_num) { r.read_exact(&mut buf) .map_err(err::FrameDescriptorReadError)?; let skip_size = u32::from_le_bytes(buf); return Err(ReadFrameHeaderError::SkipFrame { magic_number: magic_num, length: skip_size, }); } if magic_num != MAGIC_NUM { return Err(ReadFrameHeaderError::BadMagicNumber(magic_num)); } r.read_exact(&mut buf[0..1]) .map_err(err::FrameDescriptorReadError)?; let desc = FrameDescriptor(buf[0]); bytes_read += 1; let mut frame_header = FrameHeader { descriptor: FrameDescriptor(desc.0), dict_id: None, frame_content_size: 0, window_descriptor: 0, }; if !desc.single_segment_flag() { r.read_exact(&mut buf[0..1]) .map_err(err::WindowDescriptorReadError)?; frame_header.window_descriptor = buf[0]; bytes_read += 1; } let dict_id_len = desc.dictionary_id_bytes()? 
as usize; if dict_id_len != 0 { let buf = &mut buf[..dict_id_len]; r.read_exact(buf).map_err(err::DictionaryIdReadError)?; bytes_read += dict_id_len; let mut dict_id = 0u32; #[allow(clippy::needless_range_loop)] for i in 0..dict_id_len { dict_id += (buf[i] as u32) << (8 * i); } if dict_id != 0 { frame_header.dict_id = Some(dict_id); } } let fcs_len = desc.frame_content_size_bytes()? as usize; if fcs_len != 0 { let mut fcs_buf = [0u8; 8]; let fcs_buf = &mut fcs_buf[..fcs_len]; r.read_exact(fcs_buf) .map_err(err::FrameContentSizeReadError)?; bytes_read += fcs_len; let mut fcs = 0u64; #[allow(clippy::needless_range_loop)] for i in 0..fcs_len { fcs += (fcs_buf[i] as u64) << (8 * i); } if fcs_len == 2 { fcs += 256; } frame_header.frame_content_size = fcs; } let frame: Frame = Frame { header: frame_header, }; Ok((frame, bytes_read as u8)) } ruzstd-0.7.3/src/frame_decoder.rs000064400000000000000000000657521046102023000151210ustar 00000000000000//! Framedecoder is the man struct users interact with to decode zstd frames //! //! Zstandard compressed data is made of one or more [Frame]s. Each frame is independent and can be //! decompressed independently of other frames. This module contains structures //! and utilities that can be used to decode a frame. use super::frame; use crate::decoding::dictionary::Dictionary; use crate::decoding::scratch::DecoderScratch; use crate::decoding::{self, dictionary}; use crate::io::{Error, Read, Write}; use alloc::collections::BTreeMap; use alloc::vec::Vec; use core::convert::TryInto; #[cfg(feature = "std")] use std::error::Error as StdError; /// This implements a decoder for zstd frames. /// /// This decoder is able to decode frames only partially and gives control /// over how many bytes/blocks will be decoded at a time (so you don't have to decode a 10GB file into memory all at once). /// It reads bytes as needed from a provided source and can be read from to collect partial results. 
/// /// If you want to just read the whole frame with an `io::Read` without having to deal with manually calling [FrameDecoder::decode_blocks] /// you can use the provided StreamingDecoder with wraps this FrameDecoder /// /// Workflow is as follows: /// ``` /// use ruzstd::frame_decoder::BlockDecodingStrategy; /// /// # #[cfg(feature = "std")] /// use std::io::{Read, Write}; /// /// // no_std environments can use the crate's own Read traits /// # #[cfg(not(feature = "std"))] /// use ruzstd::io::{Read, Write}; /// /// fn decode_this(mut file: impl Read) { /// //Create a new decoder /// let mut frame_dec = ruzstd::FrameDecoder::new(); /// let mut result = Vec::new(); /// /// // Use reset or init to make the decoder ready to decode the frame from the io::Read /// frame_dec.reset(&mut file).unwrap(); /// /// // Loop until the frame has been decoded completely /// while !frame_dec.is_finished() { /// // decode (roughly) batch_size many bytes /// frame_dec.decode_blocks(&mut file, BlockDecodingStrategy::UptoBytes(1024)).unwrap(); /// /// // read from the decoder to collect bytes from the internal buffer /// let bytes_read = frame_dec.read(result.as_mut_slice()).unwrap(); /// /// // then do something with it /// do_something(&result[0..bytes_read]); /// } /// /// // handle the last chunk of data /// while frame_dec.can_collect() > 0 { /// let x = frame_dec.read(result.as_mut_slice()).unwrap(); /// /// do_something(&result[0..x]); /// } /// } /// /// fn do_something(data: &[u8]) { /// # #[cfg(feature = "std")] /// std::io::stdout().write_all(data).unwrap(); /// } /// ``` pub struct FrameDecoder { state: Option, dicts: BTreeMap, } struct FrameDecoderState { pub frame: frame::Frame, decoder_scratch: DecoderScratch, frame_finished: bool, block_counter: usize, bytes_read_counter: u64, check_sum: Option, using_dict: Option, } pub enum BlockDecodingStrategy { All, UptoBlocks(usize), UptoBytes(usize), } #[derive(Debug)] #[non_exhaustive] pub enum FrameDecoderError { ReadFrameHeaderError(frame::ReadFrameHeaderError), FrameHeaderError(frame::FrameHeaderError), WindowSizeTooBig { requested: u64 }, DictionaryDecodeError(dictionary::DictionaryDecodeError), FailedToReadBlockHeader(decoding::block_decoder::BlockHeaderReadError), FailedToReadBlockBody(decoding::block_decoder::DecodeBlockContentError), FailedToReadChecksum(Error), NotYetInitialized, FailedToInitialize(frame::FrameHeaderError), FailedToDrainDecodebuffer(Error), FailedToSkipFrame, TargetTooSmall, DictNotProvided { dict_id: u32 }, } #[cfg(feature = "std")] impl StdError for FrameDecoderError { fn source(&self) -> Option<&(dyn StdError + 'static)> { match self { FrameDecoderError::ReadFrameHeaderError(source) => Some(source), FrameDecoderError::FrameHeaderError(source) => Some(source), FrameDecoderError::DictionaryDecodeError(source) => Some(source), FrameDecoderError::FailedToReadBlockHeader(source) => Some(source), FrameDecoderError::FailedToReadBlockBody(source) => Some(source), FrameDecoderError::FailedToReadChecksum(source) => Some(source), FrameDecoderError::FailedToInitialize(source) => Some(source), FrameDecoderError::FailedToDrainDecodebuffer(source) => Some(source), _ => None, } } } impl core::fmt::Display for FrameDecoderError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> ::core::fmt::Result { match self { FrameDecoderError::ReadFrameHeaderError(e) => { write!(f, "{:?}", e) } FrameDecoderError::FrameHeaderError(e) => { write!(f, "{:?}", e) } FrameDecoderError::WindowSizeTooBig { requested } => { write!( f, "Specified window_size is 
too big; Requested: {}, Max: {}", requested, MAX_WINDOW_SIZE, ) } FrameDecoderError::DictionaryDecodeError(e) => { write!(f, "{:?}", e) } FrameDecoderError::FailedToReadBlockHeader(e) => { write!(f, "Failed to parse/decode block body: {}", e) } FrameDecoderError::FailedToReadBlockBody(e) => { write!(f, "Failed to parse block header: {}", e) } FrameDecoderError::FailedToReadChecksum(e) => { write!(f, "Failed to read checksum: {}", e) } FrameDecoderError::NotYetInitialized => { write!(f, "Decoder must initialized or reset before using it",) } FrameDecoderError::FailedToInitialize(e) => { write!(f, "Decoder encountered error while initializing: {}", e) } FrameDecoderError::FailedToDrainDecodebuffer(e) => { write!( f, "Decoder encountered error while draining the decodebuffer: {}", e, ) } FrameDecoderError::FailedToSkipFrame => { write!( f, "Failed to skip bytes for the length given in the frame header" ) } FrameDecoderError::TargetTooSmall => { write!(f, "Target must have at least as many bytes as the contentsize of the frame reports") } FrameDecoderError::DictNotProvided { dict_id } => { write!(f, "Frame header specified dictionary id 0x{:X} that wasnt provided by add_dict() or reset_with_dict()", dict_id) } } } } impl From for FrameDecoderError { fn from(val: dictionary::DictionaryDecodeError) -> Self { Self::DictionaryDecodeError(val) } } impl From for FrameDecoderError { fn from(val: decoding::block_decoder::BlockHeaderReadError) -> Self { Self::FailedToReadBlockHeader(val) } } impl From for FrameDecoderError { fn from(val: frame::FrameHeaderError) -> Self { Self::FrameHeaderError(val) } } impl From for FrameDecoderError { fn from(val: frame::ReadFrameHeaderError) -> Self { Self::ReadFrameHeaderError(val) } } const MAX_WINDOW_SIZE: u64 = 1024 * 1024 * 100; impl FrameDecoderState { pub fn new(source: impl Read) -> Result { let (frame, header_size) = frame::read_frame_header(source)?; let window_size = frame.header.window_size()?; Ok(FrameDecoderState { frame, frame_finished: false, block_counter: 0, decoder_scratch: DecoderScratch::new(window_size as usize), bytes_read_counter: u64::from(header_size), check_sum: None, using_dict: None, }) } pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> { let (frame, header_size) = frame::read_frame_header(source)?; let window_size = frame.header.window_size()?; if window_size > MAX_WINDOW_SIZE { return Err(FrameDecoderError::WindowSizeTooBig { requested: window_size, }); } self.frame = frame; self.frame_finished = false; self.block_counter = 0; self.decoder_scratch.reset(window_size as usize); self.bytes_read_counter = u64::from(header_size); self.check_sum = None; self.using_dict = None; Ok(()) } } impl Default for FrameDecoder { fn default() -> Self { Self::new() } } impl FrameDecoder { /// This will create a new decoder without allocating anything yet. /// init()/reset() will allocate all needed buffers if it is the first time this decoder is used /// else they just reset these buffers with not further allocations pub fn new() -> FrameDecoder { FrameDecoder { state: None, dicts: BTreeMap::new(), } } /// init() will allocate all needed buffers if it is the first time this decoder is used /// else they just reset these buffers with not further allocations /// /// Note that all bytes currently in the decodebuffer from any previous frame will be lost. 
Collect them with collect()/collect_to_writer() /// /// equivalent to reset() pub fn init(&mut self, source: impl Read) -> Result<(), FrameDecoderError> { self.reset(source) } /// reset() will allocate all needed buffers if it is the first time this decoder is used /// else they just reset these buffers with not further allocations /// /// Note that all bytes currently in the decodebuffer from any previous frame will be lost. Collect them with collect()/collect_to_writer() /// /// equivalent to init() pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> { use FrameDecoderError as err; let state = match &mut self.state { Some(s) => { s.reset(source)?; s } None => { self.state = Some(FrameDecoderState::new(source)?); self.state.as_mut().unwrap() } }; if let Some(dict_id) = state.frame.header.dictionary_id() { let dict = self .dicts .get(&dict_id) .ok_or(err::DictNotProvided { dict_id })?; state.decoder_scratch.init_from_dict(dict); state.using_dict = Some(dict_id); } Ok(()) } /// Add a dict to the FrameDecoder that can be used when needed. The FrameDecoder uses the appropriate one dynamically pub fn add_dict(&mut self, dict: Dictionary) -> Result<(), FrameDecoderError> { self.dicts.insert(dict.id, dict); Ok(()) } pub fn force_dict(&mut self, dict_id: u32) -> Result<(), FrameDecoderError> { use FrameDecoderError as err; let Some(state) = self.state.as_mut() else { return Err(err::NotYetInitialized); }; let dict = self .dicts .get(&dict_id) .ok_or(err::DictNotProvided { dict_id })?; state.decoder_scratch.init_from_dict(dict); state.using_dict = Some(dict_id); Ok(()) } /// Returns how many bytes the frame contains after decompression pub fn content_size(&self) -> u64 { match &self.state { None => 0, Some(s) => s.frame.header.frame_content_size(), } } /// Returns the checksum that was read from the data. Only available after all bytes have been read. It is the last 4 bytes of a zstd-frame pub fn get_checksum_from_data(&self) -> Option { let state = match &self.state { None => return None, Some(s) => s, }; state.check_sum } /// Returns the checksum that was calculated while decoding. /// Only a sensible value after all decoded bytes have been collected/read from the FrameDecoder #[cfg(feature = "hash")] pub fn get_calculated_checksum(&self) -> Option { use core::hash::Hasher; let state = match &self.state { None => return None, Some(s) => s, }; let cksum_64bit = state.decoder_scratch.buffer.hash.finish(); //truncate to lower 32bit because reasons... Some(cksum_64bit as u32) } /// Counter for how many bytes have been consumed while decoding the frame pub fn bytes_read_from_source(&self) -> u64 { let state = match &self.state { None => return 0, Some(s) => s, }; state.bytes_read_counter } /// Whether the current frames last block has been decoded yet /// If this returns true you can call the drain* functions to get all content /// (the read() function will drain automatically if this returns true) pub fn is_finished(&self) -> bool { let state = match &self.state { None => return true, Some(s) => s, }; if state.frame.header.descriptor.content_checksum_flag() { state.frame_finished && state.check_sum.is_some() } else { state.frame_finished } } /// Counter for how many blocks have already been decoded pub fn blocks_decoded(&self) -> usize { let state = match &self.state { None => return 0, Some(s) => s, }; state.block_counter } /// Decodes blocks from a reader. It requires that the framedecoder has been initialized first. 
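/// Calling it before [FrameDecoder::init] or [FrameDecoder::reset] results in a
/// [FrameDecoderError::NotYetInitialized] error.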
/// The Strategy influences how many blocks will be decoded before the function returns /// This is important if you want to manage memory consumption carefully. If you don't care /// about that you can just choose the strategy "All" and have all blocks of the frame decoded into the buffer pub fn decode_blocks( &mut self, mut source: impl Read, strat: BlockDecodingStrategy, ) -> Result { use FrameDecoderError as err; let state = self.state.as_mut().ok_or(err::NotYetInitialized)?; let mut block_dec = decoding::block_decoder::new(); let buffer_size_before = state.decoder_scratch.buffer.len(); let block_counter_before = state.block_counter; loop { vprintln!("################"); vprintln!("Next Block: {}", state.block_counter); vprintln!("################"); let (block_header, block_header_size) = block_dec .read_block_header(&mut source) .map_err(err::FailedToReadBlockHeader)?; state.bytes_read_counter += u64::from(block_header_size); vprintln!(); vprintln!( "Found {} block with size: {}, which will be of size: {}", block_header.block_type, block_header.content_size, block_header.decompressed_size ); let bytes_read_in_block_body = block_dec .decode_block_content(&block_header, &mut state.decoder_scratch, &mut source) .map_err(err::FailedToReadBlockBody)?; state.bytes_read_counter += bytes_read_in_block_body; state.block_counter += 1; vprintln!("Output: {}", state.decoder_scratch.buffer.len()); if block_header.last_block { state.frame_finished = true; if state.frame.header.descriptor.content_checksum_flag() { let mut chksum = [0u8; 4]; source .read_exact(&mut chksum) .map_err(err::FailedToReadChecksum)?; state.bytes_read_counter += 4; let chksum = u32::from_le_bytes(chksum); state.check_sum = Some(chksum); } break; } match strat { BlockDecodingStrategy::All => { /* keep going */ } BlockDecodingStrategy::UptoBlocks(n) => { if state.block_counter - block_counter_before >= n { break; } } BlockDecodingStrategy::UptoBytes(n) => { if state.decoder_scratch.buffer.len() - buffer_size_before >= n { break; } } } } Ok(state.frame_finished) } /// Collect bytes and retain window_size bytes while decoding is still going on. /// After decoding of the frame (is_finished() == true) has finished it will collect all remaining bytes pub fn collect(&mut self) -> Option> { let finished = self.is_finished(); let state = self.state.as_mut()?; if finished { Some(state.decoder_scratch.buffer.drain()) } else { state.decoder_scratch.buffer.drain_to_window_size() } } /// Collect bytes and retain window_size bytes while decoding is still going on. /// After decoding of the frame (is_finished() == true) has finished it will collect all remaining bytes pub fn collect_to_writer(&mut self, w: impl Write) -> Result { let finished = self.is_finished(); let state = match &mut self.state { None => return Ok(0), Some(s) => s, }; if finished { state.decoder_scratch.buffer.drain_to_writer(w) } else { state.decoder_scratch.buffer.drain_to_window_size_writer(w) } } /// How many bytes can currently be collected from the decodebuffer, while decoding is going on this will be lower than the actual decodbuffer size /// because window_size bytes need to be retained for decoding. 
/// After decoding of the frame (is_finished() == true) has finished it will report all remaining bytes pub fn can_collect(&self) -> usize { let finished = self.is_finished(); let state = match &self.state { None => return 0, Some(s) => s, }; if finished { state.decoder_scratch.buffer.can_drain() } else { state .decoder_scratch .buffer .can_drain_to_window_size() .unwrap_or(0) } } /// Decodes as many blocks as possible from the source slice and reads from the decodebuffer into the target slice /// The source slice may contain only parts of a frame but must contain at least one full block to make progress /// /// By all means use decode_blocks if you have a io.Reader available. This is just for compatibility with other decompressors /// which try to serve an old-style c api /// /// Returns (read, written), if read == 0 then the source did not contain a full block and further calls with the same /// input will not make any progress! /// /// Note that no kind of block can be bigger than 128kb. /// So to be safe use at least 128*1024 (max block content size) + 3 (block_header size) + 18 (max frame_header size) bytes as your source buffer /// /// You may call this function with an empty source after all bytes have been decoded. This is equivalent to just call decoder.read(&mut target) pub fn decode_from_to( &mut self, source: &[u8], target: &mut [u8], ) -> Result<(usize, usize), FrameDecoderError> { use FrameDecoderError as err; let bytes_read_at_start = match &self.state { Some(s) => s.bytes_read_counter, None => 0, }; if !self.is_finished() || self.state.is_none() { let mut mt_source = source; if self.state.is_none() { self.init(&mut mt_source)?; } //pseudo block to scope "state" so we can borrow self again after the block { let state = match &mut self.state { Some(s) => s, None => panic!("Bug in library"), }; let mut block_dec = decoding::block_decoder::new(); if state.frame.header.descriptor.content_checksum_flag() && state.frame_finished && state.check_sum.is_none() { //this block is needed if the checksum were the only 4 bytes that were not included in the last decode_from_to call for a frame if mt_source.len() >= 4 { let chksum = mt_source[..4].try_into().expect("optimized away"); state.bytes_read_counter += 4; let chksum = u32::from_le_bytes(chksum); state.check_sum = Some(chksum); } return Ok((4, 0)); } loop { //check if there are enough bytes for the next header if mt_source.len() < 3 { break; } let (block_header, block_header_size) = block_dec .read_block_header(&mut mt_source) .map_err(err::FailedToReadBlockHeader)?; // check the needed size for the block before updating counters. // If not enough bytes are in the source, the header will have to be read again, so act like we never read it in the first place if mt_source.len() < block_header.content_size as usize { break; } state.bytes_read_counter += u64::from(block_header_size); let bytes_read_in_block_body = block_dec .decode_block_content( &block_header, &mut state.decoder_scratch, &mut mt_source, ) .map_err(err::FailedToReadBlockBody)?; state.bytes_read_counter += bytes_read_in_block_body; state.block_counter += 1; if block_header.last_block { state.frame_finished = true; if state.frame.header.descriptor.content_checksum_flag() { //if there are enough bytes handle this here. 
Else the block at the start of this function will handle it at the next call if mt_source.len() >= 4 { let chksum = mt_source[..4].try_into().expect("optimized away"); state.bytes_read_counter += 4; let chksum = u32::from_le_bytes(chksum); state.check_sum = Some(chksum); } } break; } } } } let result_len = self.read(target).map_err(err::FailedToDrainDecodebuffer)?; let bytes_read_at_end = match &mut self.state { Some(s) => s.bytes_read_counter, None => panic!("Bug in library"), }; let read_len = bytes_read_at_end - bytes_read_at_start; Ok((read_len as usize, result_len)) } /// Decode multiple frames into the output slice. /// /// `input` must contain an exact number of frames. /// /// `output` must be large enough to hold the decompressed data. If you don't know /// how large the output will be, use [`FrameDecoder::decode_blocks`] instead. /// /// This calls [`FrameDecoder::init`], and all bytes currently in the decoder will be lost. /// /// Returns the number of bytes written to `output`. pub fn decode_all( &mut self, mut input: &[u8], mut output: &mut [u8], ) -> Result { let mut total_bytes_written = 0; while !input.is_empty() { match self.init(&mut input) { Ok(_) => {} Err(FrameDecoderError::ReadFrameHeaderError( frame::ReadFrameHeaderError::SkipFrame { length, .. }, )) => { input = input .get(length as usize..) .ok_or(FrameDecoderError::FailedToSkipFrame)?; continue; } Err(e) => return Err(e), }; loop { self.decode_blocks(&mut input, BlockDecodingStrategy::UptoBlocks(1))?; let bytes_written = self .read(output) .map_err(FrameDecoderError::FailedToDrainDecodebuffer)?; output = &mut output[bytes_written..]; total_bytes_written += bytes_written; if self.can_collect() != 0 { return Err(FrameDecoderError::TargetTooSmall); } if self.is_finished() { break; } } } Ok(total_bytes_written) } /// Decode multiple frames into the extra capacity of the output vector. /// /// `input` must contain an exact number of frames. /// /// `output` must have enough extra capacity to hold the decompressed data. /// This function will not reallocate or grow the vector. If you don't know /// how large the output will be, use [`FrameDecoder::decode_blocks`] instead. /// /// This calls [`FrameDecoder::init`], and all bytes currently in the decoder will be lost. /// /// The length of the output vector is updated to include the decompressed data. /// The length is not changed if an error occurs. pub fn decode_all_to_vec( &mut self, input: &[u8], output: &mut Vec, ) -> Result<(), FrameDecoderError> { let len = output.len(); let cap = output.capacity(); output.resize(cap, 0); match self.decode_all(input, &mut output[len..]) { Ok(bytes_written) => { let new_len = core::cmp::min(len + bytes_written, cap); // Sanitizes `bytes_written`. output.resize(new_len, 0); Ok(()) } Err(e) => { output.resize(len, 0); Err(e) } } } } /// Read bytes from the decode_buffer that are no longer needed. 
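// A rough usage sketch for the one-shot `decode_all_to_vec` API above (the
// compressed input is hypothetical; note that the output vector must already
// have enough spare capacity, since the call will not grow it):
//
//     use ruzstd::frame_decoder::FrameDecoder;
//
//     let compressed: &[u8] = todo!("one or more complete zstd frames");
//     let mut decoder = FrameDecoder::new();
//     let mut decompressed: Vec<u8> = Vec::with_capacity(16 * 1024 * 1024);
//     decoder.decode_all_to_vec(compressed, &mut decompressed).unwrap();
//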
While the frame is not yet finished /// this will retain window_size bytes, else it will drain it completely impl Read for FrameDecoder { fn read(&mut self, target: &mut [u8]) -> Result { let state = match &mut self.state { None => return Ok(0), Some(s) => s, }; if state.frame_finished { state.decoder_scratch.buffer.read_all(target) } else { state.decoder_scratch.buffer.read(target) } } } ruzstd-0.7.3/src/fse/fse_decoder.rs000064400000000000000000000413461046102023000153520ustar 00000000000000use crate::decoding::bit_reader::BitReader; use crate::decoding::bit_reader_reverse::{BitReaderReversed, GetBitsError}; use alloc::vec::Vec; /// FSE decoding involves a decoding table that describes the probabilities of /// all literals from 0 to the highest present one /// /// pub struct FSETable { /// The maximum symbol in the table (inclusive). Limits the probabilities length to max_symbol + 1. max_symbol: u8, /// The actual table containing the decoded symbol and the compression data /// connected to that symbol. pub decode: Vec, //used to decode symbols, and calculate the next state /// The size of the table is stored in logarithm base 2 format, /// with the **size of the table** being equal to `(1 << accuracy_log)`. /// This value is used so that the decoder knows how many bits to read from the bitstream. pub accuracy_log: u8, /// In this context, probability refers to the likelihood that a symbol occurs in the given data. /// Given this info, the encoder can assign shorter codes to symbols that appear more often, /// and longer codes that appear less often, then the decoder can use the probability /// to determine what code was assigned to what symbol. /// /// The probability of a single symbol is a value representing the proportion of times the symbol /// would fall within the data. /// /// If a symbol probability is set to `-1`, it means that the probability of a symbol /// occurring in the data is less than one. pub symbol_probabilities: Vec, //used while building the decode Vector /// The number of times each symbol occurs (The first entry being 0x0, the second being 0x1) and so on /// up until the highest possible symbol (255). symbol_counter: Vec, } #[derive(Debug)] #[non_exhaustive] pub enum FSETableError { AccLogIsZero, AccLogTooBig { got: u8, max: u8, }, GetBitsError(GetBitsError), ProbabilityCounterMismatch { got: u32, expected_sum: u32, symbol_probabilities: Vec, }, TooManySymbols { got: usize, }, } #[cfg(feature = "std")] impl std::error::Error for FSETableError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { FSETableError::GetBitsError(source) => Some(source), _ => None, } } } impl core::fmt::Display for FSETableError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { FSETableError::AccLogIsZero => write!(f, "Acclog must be at least 1"), FSETableError::AccLogTooBig { got, max } => { write!( f, "Found FSE acc_log: {0} bigger than allowed maximum in this case: {1}", got, max ) } FSETableError::GetBitsError(e) => write!(f, "{:?}", e), FSETableError::ProbabilityCounterMismatch { got, expected_sum, symbol_probabilities, } => { write!(f, "The counter ({}) exceeded the expected sum: {}. This means an error or corrupted data \n {:?}", got, expected_sum, symbol_probabilities, ) } FSETableError::TooManySymbols { got } => { write!( f, "There are too many symbols in this distribution: {}. 
Max: 256", got, ) } } } } impl From for FSETableError { fn from(val: GetBitsError) -> Self { Self::GetBitsError(val) } } pub struct FSEDecoder<'table> { /// An FSE state value represents an index in the FSE table. pub state: Entry, /// A reference to the table used for decoding. table: &'table FSETable, } #[derive(Debug)] #[non_exhaustive] pub enum FSEDecoderError { GetBitsError(GetBitsError), TableIsUninitialized, } #[cfg(feature = "std")] impl std::error::Error for FSEDecoderError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { FSEDecoderError::GetBitsError(source) => Some(source), _ => None, } } } impl core::fmt::Display for FSEDecoderError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { FSEDecoderError::GetBitsError(e) => write!(f, "{:?}", e), FSEDecoderError::TableIsUninitialized => { write!(f, "Tried to use an uninitialized table!") } } } } impl From for FSEDecoderError { fn from(val: GetBitsError) -> Self { Self::GetBitsError(val) } } /// A single entry in an FSE table. #[derive(Copy, Clone)] pub struct Entry { /// This value is used as an offset value, and it is added /// to a value read from the stream to determine the next state value. pub base_line: u32, /// How many bits should be read from the stream when decoding this entry. pub num_bits: u8, /// The byte that should be put in the decode output when encountering this state. pub symbol: u8, } /// This value is added to the first 4 bits of the stream to determine the /// `Accuracy_Log` const ACC_LOG_OFFSET: u8 = 5; fn highest_bit_set(x: u32) -> u32 { assert!(x > 0); u32::BITS - x.leading_zeros() } impl<'t> FSEDecoder<'t> { /// Initialize a new Finite State Entropy decoder. pub fn new(table: &'t FSETable) -> FSEDecoder<'t> { FSEDecoder { state: table.decode.first().copied().unwrap_or(Entry { base_line: 0, num_bits: 0, symbol: 0, }), table, } } /// Returns the byte associated with the symbol the internal cursor is pointing at. pub fn decode_symbol(&self) -> u8 { self.state.symbol } /// Initialize internal state and prepare for decoding. After this, `decode_symbol` can be called /// to read the first symbol and `update_state` can be called to prepare to read the next symbol. pub fn init_state(&mut self, bits: &mut BitReaderReversed<'_>) -> Result<(), FSEDecoderError> { if self.table.accuracy_log == 0 { return Err(FSEDecoderError::TableIsUninitialized); } self.state = self.table.decode[bits.get_bits(self.table.accuracy_log) as usize]; Ok(()) } /// Advance the internal state to decode the next symbol in the bitstream. pub fn update_state(&mut self, bits: &mut BitReaderReversed<'_>) { let num_bits = self.state.num_bits; let add = bits.get_bits(num_bits); let base_line = self.state.base_line; let new_state = base_line + add as u32; self.state = self.table.decode[new_state as usize]; //println!("Update: {}, {} -> {}", base_line, add, self.state); } } impl FSETable { /// Initialize a new empty Finite State Entropy decoding table. pub fn new(max_symbol: u8) -> FSETable { FSETable { max_symbol, symbol_probabilities: Vec::with_capacity(256), //will never be more than 256 symbols because u8 symbol_counter: Vec::with_capacity(256), //will never be more than 256 symbols because u8 decode: Vec::new(), //depending on acc_log. accuracy_log: 0, } } /// Reset `self` and update `self`'s state to mirror the provided table. 
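// A rough sketch of how the table and decoder types above fit together when
// decoding an FSE bitstream by hand (the probability distribution and the
// input bytes are made up for illustration; real callers read both from the
// compressed stream, and the stop condition depends on the context the
// stream is used in):
//
//     use ruzstd::decoding::bit_reader_reverse::BitReaderReversed;
//     use ruzstd::fse::{FSEDecoder, FSETable};
//
//     let mut table = FSETable::new(255);
//     // Accuracy log 5 -> a 32 entry table, shared by three symbols.
//     table.build_from_probabilities(5, &[20, 10, 2]).unwrap();
//
//     let encoded: &[u8] = todo!("an FSE encoded bitstream");
//     let mut bits = BitReaderReversed::new(encoded);
//     let mut decoder = FSEDecoder::new(&table);
//     decoder.init_state(&mut bits).unwrap();
//     let mut symbols = Vec::new();
//     while bits.bits_remaining() > 0 {
//         symbols.push(decoder.decode_symbol());
//         decoder.update_state(&mut bits);
//     }
//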
pub fn reinit_from(&mut self, other: &Self) { self.reset(); self.symbol_counter.extend_from_slice(&other.symbol_counter); self.symbol_probabilities .extend_from_slice(&other.symbol_probabilities); self.decode.extend_from_slice(&other.decode); self.accuracy_log = other.accuracy_log; } /// Empty the table and clear all internal state. pub fn reset(&mut self) { self.symbol_counter.clear(); self.symbol_probabilities.clear(); self.decode.clear(); self.accuracy_log = 0; } /// returns how many BYTEs (not bits) were read while building the decoder pub fn build_decoder(&mut self, source: &[u8], max_log: u8) -> Result { self.accuracy_log = 0; let bytes_read = self.read_probabilities(source, max_log)?; self.build_decoding_table()?; Ok(bytes_read) } /// Given the provided accuracy log, build a decoding table from that log. pub fn build_from_probabilities( &mut self, acc_log: u8, probs: &[i32], ) -> Result<(), FSETableError> { if acc_log == 0 { return Err(FSETableError::AccLogIsZero); } self.symbol_probabilities = probs.to_vec(); self.accuracy_log = acc_log; self.build_decoding_table() } /// Build the actual decoding table after probabilities have been read into the table. /// After this function is called, the decoding process can begin. fn build_decoding_table(&mut self) -> Result<(), FSETableError> { if self.symbol_probabilities.len() > self.max_symbol as usize + 1 { return Err(FSETableError::TooManySymbols { got: self.symbol_probabilities.len(), }); } self.decode.clear(); let table_size = 1 << self.accuracy_log; if self.decode.len() < table_size { self.decode.reserve(table_size - self.decode.len()); } //fill with dummy entries self.decode.resize( table_size, Entry { base_line: 0, num_bits: 0, symbol: 0, }, ); let mut negative_idx = table_size; //will point to the highest index with is already occupied by a negative-probability-symbol //first scan for all -1 probabilities and place them at the top of the table for symbol in 0..self.symbol_probabilities.len() { if self.symbol_probabilities[symbol] == -1 { negative_idx -= 1; let entry = &mut self.decode[negative_idx]; entry.symbol = symbol as u8; entry.base_line = 0; entry.num_bits = self.accuracy_log; } } //then place in a semi-random order all of the other symbols let mut position = 0; for idx in 0..self.symbol_probabilities.len() { let symbol = idx as u8; if self.symbol_probabilities[idx] <= 0 { continue; } //for each probability point the symbol gets on slot let prob = self.symbol_probabilities[idx]; for _ in 0..prob { let entry = &mut self.decode[position]; entry.symbol = symbol; position = next_position(position, table_size); while position >= negative_idx { position = next_position(position, table_size); //everything above negative_idx is already taken } } } // baselines and num_bits can only be calculated when all symbols have been spread self.symbol_counter.clear(); self.symbol_counter .resize(self.symbol_probabilities.len(), 0); for idx in 0..negative_idx { let entry = &mut self.decode[idx]; let symbol = entry.symbol; let prob = self.symbol_probabilities[symbol as usize]; let symbol_count = self.symbol_counter[symbol as usize]; let (bl, nb) = calc_baseline_and_numbits(table_size as u32, prob as u32, symbol_count); //println!("symbol: {:2}, table: {}, prob: {:3}, count: {:3}, bl: {:3}, nb: {:2}", symbol, table_size, prob, symbol_count, bl, nb); assert!(nb <= self.accuracy_log); self.symbol_counter[symbol as usize] += 1; entry.base_line = bl; entry.num_bits = nb; } Ok(()) } /// Read the accuracy log and the probability table from the source 
and return the number of bytes /// read. If the size of the table is larger than the provided `max_log`, return an error. fn read_probabilities(&mut self, source: &[u8], max_log: u8) -> Result { self.symbol_probabilities.clear(); //just clear, we will fill a probability for each entry anyways. No need to force new allocs here let mut br = BitReader::new(source); self.accuracy_log = ACC_LOG_OFFSET + (br.get_bits(4)? as u8); if self.accuracy_log > max_log { return Err(FSETableError::AccLogTooBig { got: self.accuracy_log, max: max_log, }); } if self.accuracy_log == 0 { return Err(FSETableError::AccLogIsZero); } let probability_sum = 1 << self.accuracy_log; let mut probability_counter = 0; while probability_counter < probability_sum { let max_remaining_value = probability_sum - probability_counter + 1; let bits_to_read = highest_bit_set(max_remaining_value); let unchecked_value = br.get_bits(bits_to_read as usize)? as u32; let low_threshold = ((1 << bits_to_read) - 1) - (max_remaining_value); let mask = (1 << (bits_to_read - 1)) - 1; let small_value = unchecked_value & mask; let value = if small_value < low_threshold { br.return_bits(1); small_value } else if unchecked_value > mask { unchecked_value - low_threshold } else { unchecked_value }; //println!("{}, {}, {}", self.symbol_probablilities.len(), unchecked_value, value); let prob = (value as i32) - 1; self.symbol_probabilities.push(prob); if prob != 0 { if prob > 0 { probability_counter += prob as u32; } else { // probability -1 counts as 1 assert!(prob == -1); probability_counter += 1; } } else { //fast skip further zero probabilities loop { let skip_amount = br.get_bits(2)? as usize; self.symbol_probabilities .resize(self.symbol_probabilities.len() + skip_amount, 0); if skip_amount != 3 { break; } } } } if probability_counter != probability_sum { return Err(FSETableError::ProbabilityCounterMismatch { got: probability_counter, expected_sum: probability_sum, symbol_probabilities: self.symbol_probabilities.clone(), }); } if self.symbol_probabilities.len() > self.max_symbol as usize + 1 { return Err(FSETableError::TooManySymbols { got: self.symbol_probabilities.len(), }); } let bytes_read = if br.bits_read() % 8 == 0 { br.bits_read() / 8 } else { (br.bits_read() / 8) + 1 }; Ok(bytes_read) } } //utility functions for building the decoding table from probabilities /// Calculate the position of the next entry of the table given the current /// position and size of the table. 
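/// For example, with `table_size = 16` the step size is `8 + 2 + 3 = 13`, so
/// starting from 0 the visited positions are 0, 13, 10, 7, 4, 1, 14, 11, 8,
/// 5, 2, 15, 12, 9, 6, 3; every slot is touched exactly once before the
/// sequence wraps back around to 0.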
fn next_position(mut p: usize, table_size: usize) -> usize { p += (table_size >> 1) + (table_size >> 3) + 3; p &= table_size - 1; p } fn calc_baseline_and_numbits( num_states_total: u32, num_states_symbol: u32, state_number: u32, ) -> (u32, u8) { let num_state_slices = if 1 << (highest_bit_set(num_states_symbol) - 1) == num_states_symbol { num_states_symbol } else { 1 << (highest_bit_set(num_states_symbol)) }; //always power of two let num_double_width_state_slices = num_state_slices - num_states_symbol; //leftovers to the power of two need to be distributed let num_single_width_state_slices = num_states_symbol - num_double_width_state_slices; //these will not receive a double width slice of states let slice_width = num_states_total / num_state_slices; //size of a single width slice of states let num_bits = highest_bit_set(slice_width) - 1; //number of bits needed to read for one slice if state_number < num_double_width_state_slices { let baseline = num_single_width_state_slices * slice_width + state_number * slice_width * 2; (baseline, num_bits as u8 + 1) } else { let index_shifted = state_number - num_double_width_state_slices; ((index_shifted * slice_width), num_bits as u8) } } ruzstd-0.7.3/src/fse/mod.rs000064400000000000000000000012411046102023000136550ustar 00000000000000//! FSE, short for Finite State Entropy, is an encoding technique //! that assigns shorter codes to symbols that appear more frequently in data, //! and longer codes to less frequent symbols. //! //! FSE works by mutating a state and using that state to index into a table. //! //! Zstandard uses two different kinds of entropy encoding: FSE, and Huffman coding. //! Huffman is used to compress literals, //! while FSE is used for all other symbols (literal length code, match length code, offset code). //! //! https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#fse //! //! mod fse_decoder; pub use fse_decoder::*; ruzstd-0.7.3/src/huff0/huff0_decoder.rs000064400000000000000000000507071046102023000160410ustar 00000000000000//! Utilities for decoding Huff0 encoded huffman data. use crate::decoding::bit_reader_reverse::{BitReaderReversed, GetBitsError}; use crate::fse::{FSEDecoder, FSEDecoderError, FSETable, FSETableError}; use alloc::vec::Vec; #[cfg(feature = "std")] use std::error::Error as StdError; pub struct HuffmanTable { decode: Vec, /// The weight of a symbol is the number of occurences in a table. /// This value is used in constructing a binary tree referred to as /// a huffman tree. weights: Vec, /// The maximum size in bits a prefix code in the encoded data can be. /// This value is used so that the decoder knows how many bits /// to read from the bitstream before checking the table. This /// value must be 11 or lower. pub max_num_bits: u8, bits: Vec, bit_ranks: Vec, rank_indexes: Vec, /// In some cases, the list of weights is compressed using FSE compression. 
fse_table: FSETable, } #[derive(Debug)] #[non_exhaustive] pub enum HuffmanTableError { GetBitsError(GetBitsError), FSEDecoderError(FSEDecoderError), FSETableError(FSETableError), SourceIsEmpty, NotEnoughBytesForWeights { got_bytes: usize, expected_bytes: u8, }, ExtraPadding { skipped_bits: i32, }, TooManyWeights { got: usize, }, MissingWeights, LeftoverIsNotAPowerOf2 { got: u32, }, NotEnoughBytesToDecompressWeights { have: usize, need: usize, }, FSETableUsedTooManyBytes { used: usize, available_bytes: u8, }, NotEnoughBytesInSource { got: usize, need: usize, }, WeightBiggerThanMaxNumBits { got: u8, }, MaxBitsTooHigh { got: u8, }, } #[cfg(feature = "std")] impl StdError for HuffmanTableError { fn source(&self) -> Option<&(dyn StdError + 'static)> { match self { HuffmanTableError::GetBitsError(source) => Some(source), HuffmanTableError::FSEDecoderError(source) => Some(source), HuffmanTableError::FSETableError(source) => Some(source), _ => None, } } } impl core::fmt::Display for HuffmanTableError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> ::core::fmt::Result { match self { HuffmanTableError::GetBitsError(e) => write!(f, "{:?}", e), HuffmanTableError::FSEDecoderError(e) => write!(f, "{:?}", e), HuffmanTableError::FSETableError(e) => write!(f, "{:?}", e), HuffmanTableError::SourceIsEmpty => write!(f, "Source needs to have at least one byte"), HuffmanTableError::NotEnoughBytesForWeights { got_bytes, expected_bytes, } => { write!(f, "Header says there should be {} bytes for the weights but there are only {} bytes in the stream", expected_bytes, got_bytes) } HuffmanTableError::ExtraPadding { skipped_bits } => { write!(f, "Padding at the end of the sequence_section was more than a byte long: {} bits. Probably caused by data corruption", skipped_bits, ) } HuffmanTableError::TooManyWeights { got } => { write!( f, "More than 255 weights decoded (got {} weights). Stream is probably corrupted", got, ) } HuffmanTableError::MissingWeights => { write!(f, "Can\'t build huffman table without any weights") } HuffmanTableError::LeftoverIsNotAPowerOf2 { got } => { write!(f, "Leftover must be power of two but is: {}", got) } HuffmanTableError::NotEnoughBytesToDecompressWeights { have, need } => { write!( f, "Not enough bytes in stream to decompress weights. Is: {}, Should be: {}", have, need, ) } HuffmanTableError::FSETableUsedTooManyBytes { used, available_bytes, } => { write!(f, "FSE table used more bytes: {} than were meant to be used for the whole stream of huffman weights ({})", used, available_bytes, ) } HuffmanTableError::NotEnoughBytesInSource { got, need } => { write!( f, "Source needs to have at least {} bytes, got: {}", need, got, ) } HuffmanTableError::WeightBiggerThanMaxNumBits { got } => { write!( f, "Cant have weight: {} bigger than max_num_bits: {}", got, MAX_MAX_NUM_BITS, ) } HuffmanTableError::MaxBitsTooHigh { got } => { write!( f, "max_bits derived from weights is: {} should be lower than: {}", got, MAX_MAX_NUM_BITS, ) } } } } impl From for HuffmanTableError { fn from(val: GetBitsError) -> Self { Self::GetBitsError(val) } } impl From for HuffmanTableError { fn from(val: FSEDecoderError) -> Self { Self::FSEDecoderError(val) } } impl From for HuffmanTableError { fn from(val: FSETableError) -> Self { Self::FSETableError(val) } } /// An interface around a huffman table used to decode data. pub struct HuffmanDecoder<'table> { table: &'table HuffmanTable, /// State is used to index into the table. 
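/// Concretely, `state` always holds the most recent `max_num_bits` bits read
/// from the stream. For instance, with `max_num_bits = 3` and a current entry
/// whose code is one bit long, reading a fresh `0` bit advances the state
/// from `0b101` to `0b010`: the consumed bit is shifted out at the top, the
/// value is masked back down to three bits, and the new bit is appended at
/// the bottom.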
pub state: u64, } #[derive(Debug)] #[non_exhaustive] pub enum HuffmanDecoderError { GetBitsError(GetBitsError), } impl core::fmt::Display for HuffmanDecoderError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { HuffmanDecoderError::GetBitsError(e) => write!(f, "{:?}", e), } } } #[cfg(feature = "std")] impl StdError for HuffmanDecoderError { fn source(&self) -> Option<&(dyn StdError + 'static)> { match self { HuffmanDecoderError::GetBitsError(source) => Some(source), } } } impl From for HuffmanDecoderError { fn from(val: GetBitsError) -> Self { Self::GetBitsError(val) } } /// A single entry in the table contains the decoded symbol/literal and the /// size of the prefix code. #[derive(Copy, Clone)] pub struct Entry { /// The byte that the prefix code replaces during encoding. symbol: u8, /// The number of bits the prefix code occupies. num_bits: u8, } /// The Zstandard specification limits the maximum length of a code to 11 bits. const MAX_MAX_NUM_BITS: u8 = 11; /// Assert that the provided value is greater than zero, and returns the /// 32 - the number of leading zeros fn highest_bit_set(x: u32) -> u32 { assert!(x > 0); u32::BITS - x.leading_zeros() } impl<'t> HuffmanDecoder<'t> { /// Create a new decoder with the provided table pub fn new(table: &'t HuffmanTable) -> HuffmanDecoder<'t> { HuffmanDecoder { table, state: 0 } } /// Re-initialize the decoder, using the new table if one is provided. /// This might used for treeless blocks, because they re-use the table from old /// data. pub fn reset(mut self, new_table: Option<&'t HuffmanTable>) { self.state = 0; if let Some(next_table) = new_table { self.table = next_table; } } /// Decode the symbol the internal state (cursor) is pointed at and return the /// decoded literal. pub fn decode_symbol(&mut self) -> u8 { self.table.decode[self.state as usize].symbol } /// Initialize internal state and prepare to decode data. Then, `decode_symbol` can be called /// to read the byte the internal cursor is pointing at, and `next_state` can be called to advance /// the cursor until the max number of bits has been read. pub fn init_state(&mut self, br: &mut BitReaderReversed<'_>) -> u8 { let num_bits = self.table.max_num_bits; let new_bits = br.get_bits(num_bits); self.state = new_bits; num_bits } /// Advance the internal cursor to the next symbol. After this, you can call `decode_symbol` /// to read from the new position. pub fn next_state(&mut self, br: &mut BitReaderReversed<'_>) -> u8 { // self.state stores a small section, or a window of the bit stream. The table can be indexed via this state, // telling you how many bits identify the current symbol. let num_bits = self.table.decode[self.state as usize].num_bits; // New bits are read from the stream let new_bits = br.get_bits(num_bits); // Shift and mask out the bits that identify the current symbol self.state <<= num_bits; self.state &= self.table.decode.len() as u64 - 1; // The new bits are appended at the end of the current state. self.state |= new_bits; num_bits } } impl Default for HuffmanTable { fn default() -> Self { Self::new() } } impl HuffmanTable { /// Create a new, empty table. pub fn new() -> HuffmanTable { HuffmanTable { decode: Vec::new(), weights: Vec::with_capacity(256), max_num_bits: 0, bits: Vec::with_capacity(256), bit_ranks: Vec::with_capacity(11), rank_indexes: Vec::with_capacity(11), fse_table: FSETable::new(100), } } /// Completely empty the table then repopulate as a replica /// of `other`. 
pub fn reinit_from(&mut self, other: &Self) { self.reset(); self.decode.extend_from_slice(&other.decode); self.weights.extend_from_slice(&other.weights); self.max_num_bits = other.max_num_bits; self.bits.extend_from_slice(&other.bits); self.rank_indexes.extend_from_slice(&other.rank_indexes); self.fse_table.reinit_from(&other.fse_table); } /// Completely empty the table of all data. pub fn reset(&mut self) { self.decode.clear(); self.weights.clear(); self.max_num_bits = 0; self.bits.clear(); self.bit_ranks.clear(); self.rank_indexes.clear(); self.fse_table.reset(); } /// Read from `source` and parse it into a huffman table. /// /// Returns the number of bytes read. pub fn build_decoder(&mut self, source: &[u8]) -> Result { self.decode.clear(); let bytes_used = self.read_weights(source)?; self.build_table_from_weights()?; Ok(bytes_used) } /// Read weights from the provided source. /// /// The huffman table is represented in the encoded data as a list of weights /// at the most basic level. After the header, weights are read, then the table /// can be built using that list of weights. /// /// Returns the number of bytes read. fn read_weights(&mut self, source: &[u8]) -> Result { use HuffmanTableError as err; if source.is_empty() { return Err(err::SourceIsEmpty); } let header = source[0]; let mut bits_read = 8; match header { // If the header byte is less than 128, the series of weights // is compressed using two interleaved FSE streams that share // a distribution table. 0..=127 => { let fse_stream = &source[1..]; if header as usize > fse_stream.len() { return Err(err::NotEnoughBytesForWeights { got_bytes: fse_stream.len(), expected_bytes: header, }); } //fse decompress weights let bytes_used_by_fse_header = self .fse_table .build_decoder(fse_stream, /*TODO find actual max*/ 100)?; if bytes_used_by_fse_header > header as usize { return Err(err::FSETableUsedTooManyBytes { used: bytes_used_by_fse_header, available_bytes: header, }); } vprintln!( "Building fse table for huffman weights used: {}", bytes_used_by_fse_header ); // Huffman headers are compressed using two interleaved // FSE bitstreams, where the first state (decoder) handles // even symbols, and the second handles odd symbols. let mut dec1 = FSEDecoder::new(&self.fse_table); let mut dec2 = FSEDecoder::new(&self.fse_table); let compressed_start = bytes_used_by_fse_header; let compressed_length = header as usize - bytes_used_by_fse_header; let compressed_weights = &fse_stream[compressed_start..]; if compressed_weights.len() < compressed_length { return Err(err::NotEnoughBytesToDecompressWeights { have: compressed_weights.len(), need: compressed_length, }); } let compressed_weights = &compressed_weights[..compressed_length]; let mut br = BitReaderReversed::new(compressed_weights); bits_read += (bytes_used_by_fse_header + compressed_length) * 8; //skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found let mut skipped_bits = 0; loop { let val = br.get_bits(1); skipped_bits += 1; if val == 1 || skipped_bits > 8 { break; } } if skipped_bits > 8 { //if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data return Err(err::ExtraPadding { skipped_bits }); } dec1.init_state(&mut br)?; dec2.init_state(&mut br)?; self.weights.clear(); // The two decoders take turns decoding a single symbol and updating their state. 
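// Concretely: weights[0], weights[2], ... come from dec1 while weights[1],
// weights[3], ... come from dec2. As soon as the bitstream is exhausted after
// one decoder's state update, the other decoder's current symbol is pushed as
// the final weight and the loop ends, since that last symbol does not need
// any further bits.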
loop { let w = dec1.decode_symbol(); self.weights.push(w); dec1.update_state(&mut br); if br.bits_remaining() <= -1 { //collect final states self.weights.push(dec2.decode_symbol()); break; } let w = dec2.decode_symbol(); self.weights.push(w); dec2.update_state(&mut br); if br.bits_remaining() <= -1 { //collect final states self.weights.push(dec1.decode_symbol()); break; } //maximum number of weights is 255 because we use u8 symbols and the last weight is inferred from the sum of all others if self.weights.len() > 255 { return Err(err::TooManyWeights { got: self.weights.len(), }); } } } // If the header byte is greater than or equal to 128, // weights are directly represented, where each weight is // encoded directly as a 4 bit field. The weights will // always be encoded with full bytes, meaning if there's // an odd number of weights, the last weight will still // occupy a full byte. _ => { // weights are directly encoded let weights_raw = &source[1..]; let num_weights = header - 127; self.weights.resize(num_weights as usize, 0); let bytes_needed = if num_weights % 2 == 0 { num_weights as usize / 2 } else { (num_weights as usize / 2) + 1 }; if weights_raw.len() < bytes_needed { return Err(err::NotEnoughBytesInSource { got: weights_raw.len(), need: bytes_needed, }); } for idx in 0..num_weights { if idx % 2 == 0 { self.weights[idx as usize] = weights_raw[idx as usize / 2] >> 4; } else { self.weights[idx as usize] = weights_raw[idx as usize / 2] & 0xF; } bits_read += 4; } } } let bytes_read = if bits_read % 8 == 0 { bits_read / 8 } else { (bits_read / 8) + 1 }; Ok(bytes_read as u32) } /// Once the weights have been read from the data, you can decode the weights /// into a table, and use that table to decode the actual compressed data. /// /// This function populates the rest of the table from the series of weights. 
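/// As a small worked example (the weights are chosen purely for illustration):
/// explicit weights `[2, 1, 1]` give `weight_sum = 2 + 1 + 1 = 4`, so
/// `max_bits = 3` and `left_over = 8 - 4 = 4`, which is a power of two and
/// implies a weight of 3 for the last, implicit symbol. The code length of a
/// symbol is `max_bits + 1 - weight`, i.e. 2, 3 and 3 bits for the explicit
/// symbols and 1 bit for the implicit one, and in the resulting 8 entry
/// decoding table they span 2, 1, 1 and 4 entries respectively.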
fn build_table_from_weights(&mut self) -> Result<(), HuffmanTableError> { use HuffmanTableError as err; self.bits.clear(); self.bits.resize(self.weights.len() + 1, 0); let mut weight_sum: u32 = 0; for w in &self.weights { if *w > MAX_MAX_NUM_BITS { return Err(err::WeightBiggerThanMaxNumBits { got: *w }); } weight_sum += if *w > 0 { 1_u32 << (*w - 1) } else { 0 }; } if weight_sum == 0 { return Err(err::MissingWeights); } let max_bits = highest_bit_set(weight_sum) as u8; let left_over = (1 << max_bits) - weight_sum; //left_over must be power of two if !left_over.is_power_of_two() { return Err(err::LeftoverIsNotAPowerOf2 { got: left_over }); } let last_weight = highest_bit_set(left_over) as u8; for symbol in 0..self.weights.len() { let bits = if self.weights[symbol] > 0 { max_bits + 1 - self.weights[symbol] } else { 0 }; self.bits[symbol] = bits; } self.bits[self.weights.len()] = max_bits + 1 - last_weight; self.max_num_bits = max_bits; if max_bits > MAX_MAX_NUM_BITS { return Err(err::MaxBitsTooHigh { got: max_bits }); } self.bit_ranks.clear(); self.bit_ranks.resize((max_bits + 1) as usize, 0); for num_bits in &self.bits { self.bit_ranks[(*num_bits) as usize] += 1; } //fill with dummy symbols self.decode.resize( 1 << self.max_num_bits, Entry { symbol: 0, num_bits: 0, }, ); //starting codes for each rank self.rank_indexes.clear(); self.rank_indexes.resize((max_bits + 1) as usize, 0); self.rank_indexes[max_bits as usize] = 0; for bits in (1..self.rank_indexes.len() as u8).rev() { self.rank_indexes[bits as usize - 1] = self.rank_indexes[bits as usize] + self.bit_ranks[bits as usize] as usize * (1 << (max_bits - bits)); } assert!( self.rank_indexes[0] == self.decode.len(), "rank_idx[0]: {} should be: {}", self.rank_indexes[0], self.decode.len() ); for symbol in 0..self.bits.len() { let bits_for_symbol = self.bits[symbol]; if bits_for_symbol != 0 { // allocate code for the symbol and set in the table // a code ignores all max_bits - bits[symbol] bits, so it gets // a range that spans all of those in the decoding table let base_idx = self.rank_indexes[bits_for_symbol as usize]; let len = 1 << (max_bits - bits_for_symbol); self.rank_indexes[bits_for_symbol as usize] += len; for idx in 0..len { self.decode[base_idx + idx].symbol = symbol as u8; self.decode[base_idx + idx].num_bits = bits_for_symbol; } } } Ok(()) } } ruzstd-0.7.3/src/huff0/mod.rs000064400000000000000000000005011046102023000141060ustar 00000000000000/// Huffman coding is a method of encoding where symbols are assigned a code, /// and more commonly used symbols get shorter codes, and less commonly /// used symbols get longer codes. Codes are prefix free, meaning no two codes /// will start with the same sequence of bits. mod huff0_decoder; pub use huff0_decoder::*; ruzstd-0.7.3/src/io.rs000064400000000000000000000002061046102023000127300ustar 00000000000000//! Re-exports of std values for when the std is available. #[cfg(feature = "std")] pub use std::io::{Error, ErrorKind, Read, Write}; ruzstd-0.7.3/src/io_nostd.rs000064400000000000000000000077311046102023000141510ustar 00000000000000//! 
Manual implementations of representations for `#![no_std]` use alloc::boxed::Box; #[non_exhaustive] #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)] pub enum ErrorKind { Interrupted, UnexpectedEof, WouldBlock, Other, } impl ErrorKind { fn as_str(&self) -> &'static str { use ErrorKind::*; match *self { Interrupted => "operation interrupted", UnexpectedEof => "unexpected end of file", WouldBlock => "operation would block", Other => "other error", } } } impl core::fmt::Display for ErrorKind { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.write_str(self.as_str()) } } pub struct Error { kind: ErrorKind, err: Option>, } impl alloc::fmt::Debug for Error { fn fmt(&self, f: &mut alloc::fmt::Formatter<'_>) -> Result<(), alloc::fmt::Error> { let mut s = f.debug_struct("Error"); s.field("kind", &self.kind); if let Some(err) = self.err.as_ref() { s.field("err", &alloc::format!("{err}")); } s.finish() } } impl Error { pub fn new(kind: ErrorKind, err: Box) -> Self { Self { kind, err: Some(err), } } pub fn from(kind: ErrorKind) -> Self { Self { kind, err: None } } pub fn kind(&self) -> ErrorKind { self.kind } pub fn get_ref(&self) -> Option<&(dyn core::fmt::Display + Send + Sync)> { self.err.as_ref().map(|e| e.as_ref()) } pub fn into_inner(self) -> Option> { self.err } } impl core::fmt::Display for Error { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.write_str(self.kind.as_str())?; if let Some(ref e) = self.err { e.fmt(f)?; } Ok(()) } } impl From for Error { fn from(value: ErrorKind) -> Self { Self::from(value) } } pub trait Read { fn read(&mut self, buf: &mut [u8]) -> Result; fn read_exact(&mut self, mut buf: &mut [u8]) -> Result<(), Error> { while !buf.is_empty() { match self.read(buf) { Ok(0) => break, Ok(n) => { let tmp = buf; buf = &mut tmp[n..]; } Err(ref e) if e.kind() == ErrorKind::Interrupted => {} Err(e) => return Err(e), } } if !buf.is_empty() { Err(Error::from(ErrorKind::UnexpectedEof)) } else { Ok(()) } } } impl Read for &[u8] { fn read(&mut self, buf: &mut [u8]) -> Result { let size = core::cmp::min(self.len(), buf.len()); let (to_copy, rest) = self.split_at(size); if size == 1 { buf[0] = to_copy[0]; } else { buf[..size].copy_from_slice(to_copy); } *self = rest; Ok(size) } } impl<'a, T> Read for &'a mut T where T: Read, { fn read(&mut self, buf: &mut [u8]) -> Result { (*self).read(buf) } } pub trait Write { fn write(&mut self, buf: &[u8]) -> Result; fn flush(&mut self) -> Result<(), Error>; } impl<'a, T> Write for &'a mut T where T: Write, { fn write(&mut self, buf: &[u8]) -> Result { (*self).write(buf) } fn flush(&mut self) -> Result<(), Error> { (*self).flush() } } impl Write for &mut [u8] { #[inline] fn write(&mut self, data: &[u8]) -> Result { let amt = core::cmp::min(data.len(), self.len()); let (a, b) = core::mem::take(self).split_at_mut(amt); a.copy_from_slice(&data[..amt]); *self = b; Ok(amt) } fn flush(&mut self) -> Result<(), Error> { Ok(()) } } ruzstd-0.7.3/src/lib.rs000064400000000000000000000014521046102023000130730ustar 00000000000000#![no_std] #![deny(trivial_casts, trivial_numeric_casts, rust_2018_idioms)] #[cfg(feature = "std")] extern crate std; #[cfg(not(feature = "rustc-dep-of-std"))] extern crate alloc; #[cfg(feature = "std")] pub const VERBOSE: bool = false; macro_rules! 
vprintln { ($($x:expr),*) => { #[cfg(feature = "std")] if crate::VERBOSE { std::println!($($x),*); } } } pub mod blocks; pub mod decoding; pub mod frame; pub mod frame_decoder; pub mod fse; pub mod huff0; pub mod streaming_decoder; mod tests; #[cfg(feature = "std")] pub mod io; #[cfg(not(feature = "std"))] pub mod io_nostd; #[cfg(not(feature = "std"))] pub use io_nostd as io; pub use frame_decoder::BlockDecodingStrategy; pub use frame_decoder::FrameDecoder; pub use streaming_decoder::StreamingDecoder; ruzstd-0.7.3/src/streaming_decoder.rs000064400000000000000000000120251046102023000160010ustar 00000000000000use core::borrow::BorrowMut; use crate::frame_decoder::{BlockDecodingStrategy, FrameDecoder, FrameDecoderError}; use crate::io::{Error, ErrorKind, Read}; /// High level Zstandard frame decoder that can be used to decompress a given Zstandard frame. /// /// This decoder implements `io::Read`, so you can interact with it by calling /// `io::Read::read_to_end` / `io::Read::read_exact` or passing this to another library / module as a source for the decoded content /// /// If you need more control over how decompression takes place, you can use /// the lower level [FrameDecoder], which allows for greater control over how /// decompression takes place but the implementor must call /// [FrameDecoder::decode_blocks] repeatedly to decode the entire frame. /// /// ## Caveat /// [StreamingDecoder] expects the underlying stream to only contain a single frame, /// yet the specification states that a single archive may contain multiple frames. /// /// To decode all the frames in a finite stream, the calling code needs to recreate /// the instance of the decoder and handle /// [crate::frame::ReadFrameHeaderError::SkipFrame] /// errors by skipping forward the `length` amount of bytes, see /// /// ```no_run /// // `read_to_end` is not implemented by the no_std implementation. /// #[cfg(feature = "std")] /// { /// use std::fs::File; /// use std::io::Read; /// use ruzstd::{StreamingDecoder}; /// /// // Read a Zstandard archive from the filesystem then decompress it into a vec. /// let mut f: File = todo!("Read a .zstd archive from somewhere"); /// let mut decoder = StreamingDecoder::new(f).unwrap(); /// let mut result = Vec::new(); /// Read::read_to_end(&mut decoder, &mut result).unwrap(); /// } /// ``` pub struct StreamingDecoder> { pub decoder: DEC, source: READ, } impl> StreamingDecoder { pub fn new_with_decoder( mut source: READ, mut decoder: DEC, ) -> Result, FrameDecoderError> { decoder.borrow_mut().init(&mut source)?; Ok(StreamingDecoder { decoder, source }) } } impl StreamingDecoder { pub fn new( mut source: READ, ) -> Result, FrameDecoderError> { let mut decoder = FrameDecoder::new(); decoder.init(&mut source)?; Ok(StreamingDecoder { decoder, source }) } } impl> StreamingDecoder { /// Gets a reference to the underlying reader. pub fn get_ref(&self) -> &READ { &self.source } /// Gets a mutable reference to the underlying reader. /// /// It is inadvisable to directly read from the underlying reader. pub fn get_mut(&mut self) -> &mut READ { &mut self.source } /// Destructures this object into the inner reader. pub fn into_inner(self) -> READ where READ: Sized, { self.source } /// Destructures this object into both the inner reader and [FrameDecoder]. pub fn into_parts(self) -> (READ, DEC) where READ: Sized, { (self.source, self.decoder) } /// Destructures this object into the inner [FrameDecoder]. 
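// A sketch of the multi-frame pattern described in the caveat above (std only,
// because it relies on `read_to_end`; the input bytes are hypothetical and
// errors are handled very coarsely):
//
//     use std::io::Read;
//     use ruzstd::StreamingDecoder;
//     use ruzstd::frame::ReadFrameHeaderError;
//     use ruzstd::frame_decoder::FrameDecoderError;
//
//     let mut input: &[u8] = todo!("a stream that may contain several frames");
//     let mut decompressed = Vec::new();
//     while !input.is_empty() {
//         match StreamingDecoder::new(&mut input) {
//             Ok(mut decoder) => {
//                 decoder.read_to_end(&mut decompressed).unwrap();
//             }
//             Err(FrameDecoderError::ReadFrameHeaderError(
//                 ReadFrameHeaderError::SkipFrame { length, .. },
//             )) => {
//                 // Skippable frame: jump over its payload and continue.
//                 input = &input[length as usize..];
//             }
//             Err(other) => panic!("decoding failed: {:?}", other),
//         }
//     }
//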
pub fn into_frame_decoder(self) -> DEC { self.decoder } } impl> Read for StreamingDecoder { fn read(&mut self, buf: &mut [u8]) -> Result { let decoder = self.decoder.borrow_mut(); if decoder.is_finished() && decoder.can_collect() == 0 { //No more bytes can ever be decoded return Ok(0); } // need to loop. The UpToBytes strategy doesn't take any effort to actually reach that limit. // The first few calls can result in just filling the decode buffer but these bytes can not be collected. // So we need to call this until we can actually collect enough bytes // TODO add BlockDecodingStrategy::UntilCollectable(usize) that pushes this logic into the decode_blocks function while decoder.can_collect() < buf.len() && !decoder.is_finished() { //More bytes can be decoded let additional_bytes_needed = buf.len() - decoder.can_collect(); match decoder.decode_blocks( &mut self.source, BlockDecodingStrategy::UptoBytes(additional_bytes_needed), ) { Ok(_) => { /*Nothing to do*/ } Err(e) => { let err; #[cfg(feature = "std")] { err = Error::new(ErrorKind::Other, e); } #[cfg(not(feature = "std"))] { err = Error::new(ErrorKind::Other, alloc::boxed::Box::new(e)); } return Err(err); } } } decoder.read(buf) } } ruzstd-0.7.3/src/tests/bit_reader.rs000064400000000000000000000040561046102023000155720ustar 00000000000000#[test] fn test_bitreader_reversed() { use crate::decoding::bit_reader_reverse::BitReaderReversed; let encoded: [u8; 16] = [ 0xC1, 0x41, 0x08, 0x00, 0x00, 0xEC, 0xC8, 0x96, 0x42, 0x79, 0xD4, 0xBC, 0xF7, 0x2C, 0xD5, 0x48, ]; //just the u128 in encoded let num_rev: u128 = 0x48_D5_2C_F7_BC_D4_79_42_96_C8_EC_00_00_08_41_C1; let mut br = BitReaderReversed::new(&encoded[..]); let mut accumulator = 0; let mut bits_read = 0; let mut x = 0; loop { x += 3; //semi random access pattern let mut num_bits = x % 16; if bits_read > 128 - num_bits { num_bits = 128 - bits_read; } let bits = br.get_bits(num_bits); bits_read += num_bits; accumulator |= u128::from(bits) << (128 - bits_read); if bits_read >= 128 { break; } } if accumulator != num_rev { panic!( "Bitreader failed somewhere. Accumulated bits: {:?}, Should be: {:?}", accumulator, num_rev ); } } #[test] fn test_bitreader_normal() { use crate::decoding::bit_reader::BitReader; let encoded: [u8; 16] = [ 0xC1, 0x41, 0x08, 0x00, 0x00, 0xEC, 0xC8, 0x96, 0x42, 0x79, 0xD4, 0xBC, 0xF7, 0x2C, 0xD5, 0x48, ]; //just the u128 in encoded let num: u128 = 0x48_D5_2C_F7_BC_D4_79_42_96_C8_EC_00_00_08_41_C1; let mut br = BitReader::new(&encoded[..]); let mut accumulator = 0; let mut bits_read = 0; let mut x = 0; loop { x += 3; //semi random access pattern let mut num_bits = x % 16; if bits_read > 128 - num_bits { num_bits = 128 - bits_read; } let bits = br.get_bits(num_bits).unwrap(); accumulator |= u128::from(bits) << bits_read; bits_read += num_bits; if bits_read >= 128 { break; } } if accumulator != num { panic!( "Bitreader failed somewhere. 
Accumulated bits: {:?}, Should be: {:?}", accumulator, num ); } } ruzstd-0.7.3/src/tests/decode_corpus.rs000064400000000000000000000143341046102023000163100ustar 00000000000000#[test] fn test_decode_corpus_files() { extern crate std; use crate::frame_decoder; use alloc::borrow::ToOwned; use alloc::string::{String, ToString}; use alloc::vec::Vec; use std::fs; use std::io::Read; use std::println; let mut success_counter = 0; let mut fail_counter_diff = 0; let mut fail_counter_size = 0; let mut fail_counter_bytes_read = 0; #[cfg_attr(not(feature = "hash"), allow(unused_mut))] let mut fail_counter_chksum = 0; let mut total_counter = 0; let mut failed: Vec = Vec::new(); let mut speeds = Vec::new(); let mut speeds_read = Vec::new(); let mut files: Vec<_> = fs::read_dir("./decodecorpus_files").unwrap().collect(); if fs::read_dir("./local_corpus_files").is_ok() { files.extend(fs::read_dir("./local_corpus_files").unwrap()); } files.sort_by_key(|x| match x { Err(_) => "".to_owned(), Ok(entry) => entry.path().to_str().unwrap().to_owned(), }); let mut frame_dec = frame_decoder::FrameDecoder::new(); for file in files { let f = file.unwrap(); let metadata = f.metadata().unwrap(); let file_size = metadata.len(); let p = String::from(f.path().to_str().unwrap()); if !p.ends_with(".zst") { continue; } println!("Trying file: {}", p); let mut content = fs::File::open(f.path()).unwrap(); frame_dec.reset(&mut content).unwrap(); let start_time = std::time::Instant::now(); /////DECODING frame_dec .decode_blocks(&mut content, frame_decoder::BlockDecodingStrategy::All) .unwrap(); let result = frame_dec.collect().unwrap(); let end_time = start_time.elapsed(); match frame_dec.get_checksum_from_data() { Some(chksum) => { #[cfg(feature = "hash")] if frame_dec.get_calculated_checksum().unwrap() != chksum { println!( "Checksum did not match! From data: {}, calculated while decoding: {}\n", chksum, frame_dec.get_calculated_checksum().unwrap() ); fail_counter_chksum += 1; failed.push(p.clone().to_string()); } else { println!("Checksums are ok!\n"); } #[cfg(not(feature = "hash"))] println!( "Checksum feature not enabled, skipping. 
From data: {}\n", chksum ); } None => println!("No checksums to test\n"), } let mut original_p = p.clone(); original_p.truncate(original_p.len() - 4); let original_f = fs::File::open(original_p).unwrap(); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); println!("Results for file: {}", p.clone()); let mut success = true; if original.len() != result.len() { println!( "Result has wrong length: {}, should be: {}", result.len(), original.len() ); success = false; fail_counter_size += 1; } if frame_dec.bytes_read_from_source() != file_size { println!( "Framedecoder counted wrong amount of bytes: {}, should be: {}", frame_dec.bytes_read_from_source(), file_size ); success = false; fail_counter_bytes_read += 1; } let mut counter = 0; let min = if original.len() < result.len() { original.len() } else { result.len() }; for idx in 0..min { if original[idx] != result[idx] { counter += 1; //println!( // "Original {} not equal to result {} at byte: {}", // original[idx], result[idx], idx, //); } } if counter > 0 { println!("Result differs in at least {} bytes from original", counter); success = false; fail_counter_diff += 1; } if success { success_counter += 1; } else { failed.push(p.clone().to_string()); } total_counter += 1; let dur = end_time.as_micros() as usize; let speed = result.len() / if dur == 0 { 1 } else { dur }; let speed_read = file_size as usize / if dur == 0 { 1 } else { dur }; println!("SPEED: {}", speed); println!("SPEED_read: {}", speed_read); speeds.push(speed); speeds_read.push(speed_read); } println!("###################"); println!("Summary:"); println!("###################"); println!( "Total: {}, Success: {}, WrongSize: {}, WrongBytecount: {}, WrongChecksum: {}, Diffs: {}", total_counter, success_counter, fail_counter_size, fail_counter_bytes_read, fail_counter_chksum, fail_counter_diff ); println!("Failed files: "); for f in &failed { println!("{}", f); } let speed_len = speeds.len(); let sum_speed: usize = speeds.into_iter().sum(); let avg_speed = sum_speed / speed_len; let avg_speed_bps = avg_speed * 1_000_000; if avg_speed_bps < 1000 { println!("Average speed: {} B/s", avg_speed_bps); } else if avg_speed_bps < 1_000_000 { println!("Average speed: {} KB/s", avg_speed_bps / 1000); } else { println!("Average speed: {} MB/s", avg_speed_bps / 1_000_000); } let speed_read_len = speeds_read.len(); let sum_speed_read: usize = speeds_read.into_iter().sum(); let avg_speed_read = sum_speed_read / speed_read_len; let avg_speed_read_bps = avg_speed_read * 1_000_000; if avg_speed_read_bps < 1000 { println!("Average speed reading: {} B/s", avg_speed_read_bps); } else if avg_speed_bps < 1_000_000 { println!("Average speed reading: {} KB/s", avg_speed_read_bps / 1000); } else { println!( "Average speed reading: {} MB/s", avg_speed_read_bps / 1_000_000 ); } assert!(failed.is_empty()); } ruzstd-0.7.3/src/tests/dict_test.rs000064400000000000000000000206051046102023000154520ustar 00000000000000#[test] fn test_dict_parsing() { use crate::decoding::dictionary::Dictionary; use alloc::vec; let mut raw = vec![0u8; 8]; // correct magic num raw[0] = 0x37; raw[1] = 0xA4; raw[2] = 0x30; raw[3] = 0xEC; //dict-id let dict_id = 0x47232101; raw[4] = 0x01; raw[5] = 0x21; raw[6] = 0x23; raw[7] = 0x47; // tables copied from ./dict_tests/dictionary let raw_tables = &[ 54, 16, 192, 155, 4, 0, 207, 59, 239, 121, 158, 116, 220, 93, 114, 229, 110, 41, 249, 95, 165, 255, 83, 202, 254, 68, 74, 159, 63, 161, 100, 151, 137, 21, 184, 183, 189, 100, 235, 209, 251, 174, 91, 75, 91, 185, 19, 39, 75, 
        146, 98, 177, 249, 14, 4, 35, 0, 0, 0, 40, 40, 20, 10, 12, 204, 37, 196, 1, 173, 122, 0,
        4, 0, 128, 1, 2, 2, 25, 32, 27, 27, 22, 24, 26, 18, 12, 12, 15, 16, 11, 69, 37, 225, 48,
        20, 12, 6, 2, 161, 80, 40, 20, 44, 137, 145, 204, 46, 0, 0, 0, 0, 0, 116, 253, 16, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ];
    raw.extend(&raw_tables[..]);

    //offset history 3,10,0x00ABCDEF
    raw.extend(vec![3, 0, 0, 0]);
    raw.extend(vec![10, 0, 0, 0]);
    raw.extend(vec![0xEF, 0xCD, 0xAB, 0]);

    //just some random bytes
    let raw_content = vec![
        1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 123, 3, 234, 23, 234, 34, 23, 234, 34, 34, 234, 234,
    ];
    raw.extend(&raw_content);

    let dict = Dictionary::decode_dict(&raw).unwrap();

    if dict.id != dict_id {
        panic!(
            "Dict-id did not get parsed correctly. Is: {}, Should be: {}",
            dict.id, dict_id
        );
    }
    if !dict.dict_content.eq(&raw_content) {
        panic!(
            "dict content did not get parsed correctly. Is: {:?}, Should be: {:?}",
            dict.dict_content, raw_content
        );
    }
    if !dict.offset_hist.eq(&[3, 10, 0x00ABCDEF]) {
        panic!(
            "offset history did not get parsed correctly. Is: {:?}, Should be: {:?}",
            dict.offset_hist,
            [3, 10, 0x00ABCDEF]
        );
    }

    // test magic num checking
    raw[0] = 1;
    raw[1] = 1;
    raw[2] = 1;
    raw[3] = 1;
    match Dictionary::decode_dict(&raw) {
        Ok(_) => panic!("The dict got decoded but the magic num was incorrect!"),
        Err(_) => { /* This is what should happen */ }
    }
}

#[test]
fn test_dict_decoding() {
    extern crate std;
    use crate::frame_decoder;
    use alloc::borrow::ToOwned;
    use alloc::string::{String, ToString};
    use alloc::vec::Vec;
    use std::fs;
    use std::io::Read;
    use std::println;

    let mut success_counter = 0;
    let mut fail_counter_diff = 0;
    let mut fail_counter_size = 0;
    let mut fail_counter_bytes_read = 0;
    let mut total_counter = 0;
    let mut failed: Vec<String> = Vec::new();

    let mut speeds = Vec::new();
    let mut speeds_read = Vec::new();

    let mut files: Vec<_> = fs::read_dir("./dict_tests/files").unwrap().collect();
    let dict = fs::File::open("./dict_tests/dictionary").unwrap();
    let dict: Vec<u8> = dict.bytes().map(|x| x.unwrap()).collect();

    files.sort_by_key(|x| match x {
        Err(_) => "".to_owned(),
        Ok(entry) => entry.path().to_str().unwrap().to_owned(),
    });

    let mut frame_dec = frame_decoder::FrameDecoder::new();
    let dict = crate::decoding::dictionary::Dictionary::decode_dict(&dict).unwrap();
    frame_dec.add_dict(dict).unwrap();

    for file in files {
        let f = file.unwrap();
        let metadata = f.metadata().unwrap();
        let file_size = metadata.len();

        let p = String::from(f.path().to_str().unwrap());
        if !p.ends_with(".zst") {
            continue;
        }
        println!("Trying file: {}", p);

        let mut content = fs::File::open(f.path()).unwrap();

        frame_dec.reset(&mut content).unwrap();

        let start_time = std::time::Instant::now();
        /////DECODING
        frame_dec
            .decode_blocks(&mut content, frame_decoder::BlockDecodingStrategy::All)
            .unwrap();
        let result = frame_dec.collect().unwrap();
        let end_time = start_time.elapsed();

        match frame_dec.get_checksum_from_data() {
            Some(chksum) => {
                #[cfg(feature = "hash")]
                if frame_dec.get_calculated_checksum().unwrap() != chksum {
                    println!(
                        "Checksum did not match! From data: {}, calculated while decoding: {}\n",
                        chksum,
                        frame_dec.get_calculated_checksum().unwrap()
                    );
                } else {
                    println!("Checksums are ok!\n");
                }
                #[cfg(not(feature = "hash"))]
                println!(
                    "Checksum feature not enabled, skipping. From data: {}\n",
                    chksum
                );
            }
            None => println!("No checksums to test\n"),
        }

        let mut original_p = p.clone();
        original_p.truncate(original_p.len() - 4);
        let original_f = fs::File::open(original_p).unwrap();
        let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();

        println!("Results for file: {}", p.clone());

        let mut success = true;

        if original.len() != result.len() {
            println!(
                "Result has wrong length: {}, should be: {}",
                result.len(),
                original.len()
            );
            success = false;
            fail_counter_size += 1;
        }

        if frame_dec.bytes_read_from_source() != file_size {
            println!(
                "Framedecoder counted wrong amount of bytes: {}, should be: {}",
                frame_dec.bytes_read_from_source(),
                file_size
            );
            success = false;
            fail_counter_bytes_read += 1;
        }

        let mut counter = 0;
        let min = if original.len() < result.len() {
            original.len()
        } else {
            result.len()
        };
        for idx in 0..min {
            if original[idx] != result[idx] {
                counter += 1;
                //println!(
                //    "Original {} not equal to result {} at byte: {}",
                //    original[idx], result[idx], idx,
                //);
            }
        }

        if counter > 0 {
            println!("Result differs in at least {} bytes from original", counter);
            success = false;
            fail_counter_diff += 1;
        }

        if success {
            success_counter += 1;
        } else {
            failed.push(p.clone().to_string());
        }
        total_counter += 1;

        let dur = end_time.as_micros() as usize;
        let speed = result.len() / if dur == 0 { 1 } else { dur };
        let speed_read = file_size as usize / if dur == 0 { 1 } else { dur };
        println!("SPEED: {}", speed);
        println!("SPEED_read: {}", speed_read);
        speeds.push(speed);
        speeds_read.push(speed_read);
    }

    println!("###################");
    println!("Summary:");
    println!("###################");
    println!(
        "Total: {}, Success: {}, WrongSize: {}, WrongBytecount: {}, Diffs: {}",
        total_counter, success_counter, fail_counter_size, fail_counter_bytes_read, fail_counter_diff
    );
    println!("Failed files: ");
    for f in &failed {
        println!("{}", f);
    }

    let speed_len = speeds.len();
    let sum_speed: usize = speeds.into_iter().sum();
    let avg_speed = sum_speed / speed_len;
    let avg_speed_bps = avg_speed * 1_000_000;
    if avg_speed_bps < 1000 {
        println!("Average speed: {} B/s", avg_speed_bps);
    } else if avg_speed_bps < 1_000_000 {
        println!("Average speed: {} KB/s", avg_speed_bps / 1000);
    } else {
        println!("Average speed: {} MB/s", avg_speed_bps / 1_000_000);
    }

    let speed_read_len = speeds_read.len();
    let sum_speed_read: usize = speeds_read.into_iter().sum();
    let avg_speed_read = sum_speed_read / speed_read_len;
    let avg_speed_read_bps = avg_speed_read * 1_000_000;
    if avg_speed_read_bps < 1000 {
        println!("Average speed reading: {} B/s", avg_speed_read_bps);
    } else if avg_speed_read_bps < 1_000_000 {
        println!("Average speed reading: {} KB/s", avg_speed_read_bps / 1000);
    } else {
        println!(
            "Average speed reading: {} MB/s",
            avg_speed_read_bps / 1_000_000
        );
    }

    assert!(failed.is_empty());
}
ruzstd-0.7.3/src/tests/fuzz_regressions.rs000064400000000000000000000014101046102023000171020ustar 00000000000000#[test]
fn test_all_artifacts() {
    extern crate std;
    use crate::frame_decoder;
    use std::borrow::ToOwned;
    use std::fs;
    use std::fs::File;
    let mut frame_dec = frame_decoder::FrameDecoder::new();

    for file in fs::read_dir("./fuzz/artifacts/decode").unwrap() {
        let file_name = file.unwrap().path();
        let fnstr = file_name.to_str().unwrap().to_owned();
        if !fnstr.contains("/crash-") {
            continue;
        }
        let mut f = File::open(file_name.clone()).unwrap();

        /* ignore errors. It just should never panic on invalid input */
        let _: Result<_, _> = frame_dec.reset(&mut f).and_then(|()| {
            frame_dec.decode_blocks(&mut f, frame_decoder::BlockDecodingStrategy::All)
        });
    }
}
ruzstd-0.7.3/src/tests/mod.rs000064400000000000000000000416361046102023000142520ustar 00000000000000#[cfg(test)]
use alloc::vec;
#[cfg(test)]
use alloc::vec::Vec;
#[cfg(test)]
extern crate std;

#[cfg(all(test, not(feature = "std")))]
impl crate::io_nostd::Read for std::fs::File {
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, crate::io_nostd::Error> {
        std::io::Read::read(self, buf).map_err(|e| {
            if e.get_ref().is_none() {
                crate::io_nostd::Error::from(crate::io_nostd::ErrorKind::Other)
            } else {
                crate::io_nostd::Error::new(
                    crate::io_nostd::ErrorKind::Other,
                    alloc::boxed::Box::new(e.into_inner().unwrap()),
                )
            }
        })
    }
}

#[cfg(all(test, feature = "std"))]
#[allow(dead_code)]
fn assure_error_impl() {
    // not a real test, just there to throw a compiler error if Error is not derived correctly
    use crate::frame_decoder::FrameDecoderError;
    let _err: &dyn std::error::Error = &FrameDecoderError::NotYetInitialized;
}

#[cfg(all(test, feature = "std"))]
#[allow(dead_code)]
fn assure_decoder_send_sync() {
    // not a real test, just there to throw a compiler error if FrameDecoder is not Send + Sync
    use crate::frame_decoder::FrameDecoder;
    let decoder = FrameDecoder::new();
    std::thread::spawn(move || {
        drop(decoder);
    });
}

#[test]
fn skippable_frame() {
    use crate::frame;
    let mut content = vec![];
    content.extend_from_slice(&0x184D2A50u32.to_le_bytes());
    content.extend_from_slice(&300u32.to_le_bytes());
    assert_eq!(8, content.len());
    let err = frame::read_frame_header(content.as_slice());
    assert!(matches!(
        err,
        Err(frame::ReadFrameHeaderError::SkipFrame {
            magic_number: 0x184D2A50u32,
            length: 300
        })
    ));

    content.clear();
    content.extend_from_slice(&0x184D2A5Fu32.to_le_bytes());
    content.extend_from_slice(&0xFFFFFFFFu32.to_le_bytes());
    assert_eq!(8, content.len());
    let err = frame::read_frame_header(content.as_slice());
    assert!(matches!(
        err,
        Err(frame::ReadFrameHeaderError::SkipFrame {
            magic_number: 0x184D2A5Fu32,
            length: 0xFFFFFFFF
        })
    ));
}

#[cfg(test)]
#[test]
fn test_frame_header_reading() {
    use crate::frame;
    use std::fs;
    let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();
    let (_frame, _) = frame::read_frame_header(&mut content).unwrap();
}

#[test]
fn test_block_header_reading() {
    use crate::decoding;
    use crate::frame;
    use std::fs;

    let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();
    let (_frame, _) = frame::read_frame_header(&mut content).unwrap();

    let mut block_dec = decoding::block_decoder::new();
    let block_header = block_dec.read_block_header(&mut content).unwrap();
    let _ = block_header; //TODO validate blockheader in a smart way
}

#[test]
fn test_frame_decoder() {
    use crate::frame_decoder;
    use std::fs;

    let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();

    struct NullWriter(());
    impl std::io::Write for NullWriter {
        fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
            Ok(buf.len())
        }
        fn flush(&mut self) -> Result<(), std::io::Error> {
            Ok(())
        }
    }
    let mut _null_target = NullWriter(());

    let mut frame_dec = frame_decoder::FrameDecoder::new();
    frame_dec.reset(&mut content).unwrap();

    frame_dec
        .decode_blocks(&mut content, frame_decoder::BlockDecodingStrategy::All)
        .unwrap();
}

#[test]
fn test_decode_from_to() {
    use crate::frame_decoder;
    use std::fs::File;
    use std::io::Read;

    let f = File::open("./decodecorpus_files/z000088.zst").unwrap();
    let mut frame_dec = frame_decoder::FrameDecoder::new();
    let content: Vec<u8> = f.bytes().map(|x| x.unwrap()).collect();
    let mut target = vec![0u8; 1024 * 1024];

    // first part
    let source1 = &content[..50 * 1024];
    let (read1, written1) = frame_dec
        .decode_from_to(source1, target.as_mut_slice())
        .unwrap();

    //second part explicitly without checksum
    let source2 = &content[read1..content.len() - 4];
    let (read2, written2) = frame_dec
        .decode_from_to(source2, &mut target[written1..])
        .unwrap();

    //must have decoded until checksum
    assert!(read1 + read2 == content.len() - 4);

    //insert checksum separately to test that this is handled correctly
    let chksum_source = &content[read1 + read2..];
    let (read3, written3) = frame_dec
        .decode_from_to(chksum_source, &mut target[written1 + written2..])
        .unwrap();

    //this must result in these values because just the checksum was processed
    assert!(read3 == 4);
    assert!(written3 == 0);

    let read = read1 + read2 + read3;
    let written = written1 + written2;
    let result = &target.as_slice()[..written];

    if read != content.len() {
        panic!(
            "Byte counter: {} was wrong. Should be: {}",
            read,
            content.len()
        );
    }

    match frame_dec.get_checksum_from_data() {
        Some(chksum) => {
            #[cfg(feature = "hash")]
            if frame_dec.get_calculated_checksum().unwrap() != chksum {
                std::println!(
                    "Checksum did not match! From data: {}, calculated while decoding: {}\n",
                    chksum,
                    frame_dec.get_calculated_checksum().unwrap()
                );
            } else {
                std::println!("Checksums are ok!\n");
            }
            #[cfg(not(feature = "hash"))]
            std::println!(
                "Checksum feature not enabled, skipping. From data: {}\n",
                chksum
            );
        }
        None => std::println!("No checksums to test\n"),
    }

    let original_f = File::open("./decodecorpus_files/z000088").unwrap();
    let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();

    if original.len() != result.len() {
        panic!(
            "Result has wrong length: {}, should be: {}",
            result.len(),
            original.len()
        );
    }

    let mut counter = 0;
    let min = if original.len() < result.len() {
        original.len()
    } else {
        result.len()
    };
    for idx in 0..min {
        if original[idx] != result[idx] {
            counter += 1;
            //std::println!(
            //    "Original {:3} not equal to result {:3} at byte: {}",
            //    original[idx], result[idx], idx,
            //);
        }
    }

    if counter > 0 {
        panic!("Result differs in at least {} bytes from original", counter);
    }
}

#[test]
fn test_specific_file() {
    use crate::frame_decoder;
    use std::fs;
    use std::io::Read;

    let path = "./decodecorpus_files/z000068.zst";
    let mut content = fs::File::open(path).unwrap();

    struct NullWriter(());
    impl std::io::Write for NullWriter {
        fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
            Ok(buf.len())
        }
        fn flush(&mut self) -> Result<(), std::io::Error> {
            Ok(())
        }
    }
    let mut _null_target = NullWriter(());

    let mut frame_dec = frame_decoder::FrameDecoder::new();
    frame_dec.reset(&mut content).unwrap();

    frame_dec
        .decode_blocks(&mut content, frame_decoder::BlockDecodingStrategy::All)
        .unwrap();
    let result = frame_dec.collect().unwrap();

    let original_f = fs::File::open("./decodecorpus_files/z000088").unwrap();
    let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();

    std::println!("Results for file: {}", path);

    if original.len() != result.len() {
        std::println!(
            "Result has wrong length: {}, should be: {}",
            result.len(),
            original.len()
        );
    }

    let mut counter = 0;
    let min = if original.len() < result.len() {
        original.len()
    } else {
        result.len()
    };
    for idx in 0..min {
        if original[idx] != result[idx] {
            counter += 1;
            //std::println!(
            //    "Original {:3} not equal to result {:3} at byte: {}",
            //    original[idx], result[idx], idx,
            //);
        }
    }

    if counter > 0 {
        std::println!("Result differs in at least {} bytes from original", counter);
    }
}

#[test]
#[cfg(feature = "std")]
fn test_streaming() {
    use std::fs;
    use std::io::Read;

    let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();
    let mut stream = crate::streaming_decoder::StreamingDecoder::new(&mut content).unwrap();
    let mut result = Vec::new();
    Read::read_to_end(&mut stream, &mut result).unwrap();

    let original_f = fs::File::open("./decodecorpus_files/z000088").unwrap();
    let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();

    if original.len() != result.len() {
        panic!(
            "Result has wrong length: {}, should be: {}",
            result.len(),
            original.len()
        );
    }

    let mut counter = 0;
    let min = if original.len() < result.len() {
        original.len()
    } else {
        result.len()
    };
    for idx in 0..min {
        if original[idx] != result[idx] {
            counter += 1;
            //std::println!(
            //    "Original {:3} not equal to result {:3} at byte: {}",
            //    original[idx], result[idx], idx,
            //);
        }
    }

    if counter > 0 {
        panic!("Result differs in at least {} bytes from original", counter);
    }

    // Test resetting to a new file while keeping the old decoder
    let mut content = fs::File::open("./decodecorpus_files/z000068.zst").unwrap();
    let mut stream = crate::streaming_decoder::StreamingDecoder::new_with_decoder(
        &mut content,
        stream.into_frame_decoder(),
    )
    .unwrap();
    let mut result = Vec::new();
    Read::read_to_end(&mut stream, &mut result).unwrap();

    let original_f = fs::File::open("./decodecorpus_files/z000068").unwrap();
    let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();

    std::println!("Results for file:");

    if original.len() != result.len() {
        panic!(
            "Result has wrong length: {}, should be: {}",
            result.len(),
            original.len()
        );
    }

    let mut counter = 0;
    let min = if original.len() < result.len() {
        original.len()
    } else {
        result.len()
    };
    for idx in 0..min {
        if original[idx] != result[idx] {
            counter += 1;
            //std::println!(
            //    "Original {:3} not equal to result {:3} at byte: {}",
            //    original[idx], result[idx], idx,
            //);
        }
    }

    if counter > 0 {
        panic!("Result differs in at least {} bytes from original", counter);
    }
}

#[test]
fn test_incremental_read() {
    use crate::frame_decoder::FrameDecoder;

    let mut unread_compressed_content =
        include_bytes!("../../decodecorpus_files/abc.txt.zst").as_slice();
    let mut frame_dec = FrameDecoder::new();
    frame_dec.reset(&mut unread_compressed_content).unwrap();

    let mut output = [0u8; 3];
    let (_, written) = frame_dec
        .decode_from_to(unread_compressed_content, &mut output)
        .unwrap();
    assert_eq!(written, 3);
    assert_eq!(output.map(char::from), ['a', 'b', 'c']);
    assert!(frame_dec.is_finished());

    let written = frame_dec.collect_to_writer(&mut &mut output[..]).unwrap();
    assert_eq!(written, 3);
    assert_eq!(output.map(char::from), ['d', 'e', 'f']);
}

#[test]
#[cfg(not(feature = "std"))]
fn test_streaming_no_std() {
    use crate::io::Read;

    let content = include_bytes!("../../decodecorpus_files/z000088.zst");
    let mut content = content.as_slice();
    let mut stream = crate::streaming_decoder::StreamingDecoder::new(&mut content).unwrap();

    let original = include_bytes!("../../decodecorpus_files/z000088");
    let mut result = vec![0; original.len()];
    Read::read_exact(&mut stream, &mut result).unwrap();

    if original.len() != result.len() {
        panic!(
            "Result has wrong length: {}, should be: {}",
            result.len(),
            original.len()
        );
    }

    let mut counter = 0;
    let min = if original.len() < result.len() {
        original.len()
    } else {
        result.len()
    };
    for idx in 0..min {
        if original[idx] != result[idx] {
            counter += 1;
            //std::println!(
            //    "Original {:3} not equal to result {:3} at byte: {}",
            //    original[idx], result[idx], idx,
            //);
        }
    }

    if counter > 0 {
        panic!("Result differs in at least {} bytes from original", counter);
    }

    // Test resetting to a new file while keeping the old decoder
    let content = include_bytes!("../../decodecorpus_files/z000068.zst");
    let mut content = content.as_slice();
    let mut stream = crate::streaming_decoder::StreamingDecoder::new_with_decoder(
        &mut content,
        stream.into_frame_decoder(),
    )
    .unwrap();

    let original = include_bytes!("../../decodecorpus_files/z000068");
    let mut result = vec![0; original.len()];
    Read::read_exact(&mut stream, &mut result).unwrap();

    std::println!("Results for file:");

    if original.len() != result.len() {
        panic!(
            "Result has wrong length: {}, should be: {}",
            result.len(),
            original.len()
        );
    }

    let mut counter = 0;
    let min = if original.len() < result.len() {
        original.len()
    } else {
        result.len()
    };
    for idx in 0..min {
        if original[idx] != result[idx] {
            counter += 1;
            //std::println!(
            //    "Original {:3} not equal to result {:3} at byte: {}",
            //    original[idx], result[idx], idx,
            //);
        }
    }

    if counter > 0 {
        panic!("Result differs in at least {} bytes from original", counter);
    }
}

#[test]
fn test_decode_all() {
    use crate::frame_decoder::{FrameDecoder, FrameDecoderError};

    let skip_frame = |input: &mut Vec<u8>, length: usize| {
        input.extend_from_slice(&0x184D2A50u32.to_le_bytes());
        input.extend_from_slice(&(length as u32).to_le_bytes());
        input.resize(input.len() + length, 0);
    };

    let mut original = Vec::new();
    let mut input = Vec::new();

    skip_frame(&mut input, 300);
    input.extend_from_slice(include_bytes!("../../decodecorpus_files/z000089.zst"));
    original.extend_from_slice(include_bytes!("../../decodecorpus_files/z000089"));
    skip_frame(&mut input, 400);
    input.extend_from_slice(include_bytes!("../../decodecorpus_files/z000090.zst"));
    original.extend_from_slice(include_bytes!("../../decodecorpus_files/z000090"));
    skip_frame(&mut input, 500);

    let mut decoder = FrameDecoder::new();

    // decode_all with correct buffers.
    let mut output = vec![0; original.len()];
    let result = decoder.decode_all(&input, &mut output).unwrap();
    assert_eq!(result, original.len());
    assert_eq!(output, original);

    // decode_all with smaller output length.
    let mut output = vec![0; original.len() - 1];
    let result = decoder.decode_all(&input, &mut output);
    assert!(
        matches!(result, Err(FrameDecoderError::TargetTooSmall)),
        "{:?}",
        result
    );

    // decode_all with larger output length.
    let mut output = vec![0; original.len() + 1];
    let result = decoder.decode_all(&input, &mut output).unwrap();
    assert_eq!(result, original.len());
    assert_eq!(&output[..result], original);

    // decode_all with truncated regular frame.
    let mut output = vec![0; original.len()];
    let result = decoder.decode_all(&input[..input.len() - 600], &mut output);
    assert!(
        matches!(result, Err(FrameDecoderError::FailedToReadBlockBody(_))),
        "{:?}",
        result
    );

    // decode_all with truncated skip frame.
    let mut output = vec![0; original.len()];
    let result = decoder.decode_all(&input[..input.len() - 1], &mut output);
    assert!(
        matches!(result, Err(FrameDecoderError::FailedToSkipFrame)),
        "{:?}",
        result
    );

    // decode_all_to_vec with correct output capacity.
    let mut output = Vec::new();
    output.reserve_exact(original.len());
    decoder.decode_all_to_vec(&input, &mut output).unwrap();
    assert_eq!(output, original);

    // decode_all_to_vec with smaller output capacity.
    let mut output = Vec::new();
    output.reserve_exact(original.len() - 1);
    let result = decoder.decode_all_to_vec(&input, &mut output);
    assert!(
        matches!(result, Err(FrameDecoderError::TargetTooSmall)),
        "{:?}",
        result
    );

    // decode_all_to_vec with larger output capacity.
    let mut output = Vec::new();
    output.reserve_exact(original.len() + 1);
    decoder.decode_all_to_vec(&input, &mut output).unwrap();
    assert_eq!(output, original);
}

pub mod bit_reader;
pub mod decode_corpus;
pub mod dict_test;
pub mod fuzz_regressions;
ruzstd-0.7.3/test_fixtures/abc.txt.zst000064400000000000000000000000471046102023000162040ustar 00000000000000(µ/ýXÑabcdefghijklmnopqrstuvwxyz\ƒ‰ú
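As context for the tests above: a minimal sketch of how the API they exercise might be driven from a downstream crate. This is an illustration only, not part of the crate's sources; the input path is a placeholder, the module path mirrors the `crate::streaming_decoder` path used in `test_streaming`, and it assumes the `std` feature is enabled so that `StreamingDecoder` implements `std::io::Read`. Note that the one-shot `decode_all_to_vec` path expects the output `Vec` to have sufficient capacity reserved up front, which is why `test_decode_all` calls `reserve_exact` before decoding.

```rust
// Hypothetical downstream usage sketch; not part of this crate.
use std::io::Read;

fn main() {
    // Streaming decode through std::io::Read, mirroring test_streaming.
    let mut file = std::fs::File::open("example.zst").unwrap(); // placeholder input file
    let mut stream = ruzstd::streaming_decoder::StreamingDecoder::new(&mut file).unwrap();

    // Drain the decoder like any other reader.
    let mut decompressed = Vec::new();
    stream.read_to_end(&mut decompressed).unwrap();
    println!("decoded {} bytes", decompressed.len());
}
```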