snap-1.1.1/.cargo_vcs_info.json

{
  "git": {
    "sha1": "742c9cdad8b70c7e982035bc2c6471f90f86fc0d"
  },
  "path_in_vcs": ""
}

snap-1.1.1/.github/workflows/ci.yml

name: ci
on:
  pull_request:
    branches:
    - master
  push:
    branches:
    - master
  schedule:
  - cron: '00 01 * * *'

# This section is needed to drop write-all permissions that are granted on
# the `schedule` event. By specifying any permission explicitly, all others
# are set to none. By using the principle of least privilege, the damage a
# compromised workflow can do (because of an injection or a compromised third
# party tool or action) is restricted. Currently the workflow doesn't need any
# additional permission except for pulling the code. Adding labels to issues,
# commenting on pull-requests, etc. may need additional permissions:
#
# Syntax for this section:
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#permissions
#
# Reference for how to assign permissions on a job-by-job basis:
# https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs
#
# Reference for available permissions that we can enable if needed:
# https://docs.github.com/en/actions/security-guides/automatic-token-authentication#permissions-for-the-github_token
permissions:
  # to fetch code (actions/checkout)
  contents: read

jobs:
  test:
    name: test
    env:
      # For some builds, we use cross to test on 32-bit and big-endian
      # systems.
      CARGO: cargo
      # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`.
      # Note that we only use cross on Linux, so setting a target on a
      # different OS will just use normal cargo.
      TARGET:
      # Bump this as appropriate. We pin to a version to make sure CI
      # continues to work as cross releases in the past have broken things
      # in subtle ways.
      CROSS_VERSION: v0.2.5
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        include:
        - build: pinned
          os: ubuntu-latest
          rust: 1.70.0
        - build: stable
          os: ubuntu-latest
          rust: stable
        - build: beta
          os: ubuntu-latest
          rust: beta
        - build: nightly
          os: ubuntu-latest
          rust: nightly
        - build: macos
          os: macos-latest
          rust: stable
        - build: win-msvc
          os: windows-latest
          rust: stable
        - build: win-gnu
          os: windows-latest
          rust: stable-x86_64-gnu
        - build: stable-x86
          os: ubuntu-latest
          rust: stable
          target: i686-unknown-linux-gnu
        - build: stable-aarch64
          os: ubuntu-latest
          rust: stable
          target: aarch64-unknown-linux-gnu
        - build: stable-powerpc64
          os: ubuntu-latest
          rust: stable
          target: powerpc64-unknown-linux-gnu
        - build: stable-s390x
          os: ubuntu-latest
          rust: stable
          target: s390x-unknown-linux-gnu
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Install Rust
      uses: dtolnay/rust-toolchain@master
      with:
        toolchain: ${{ matrix.rust }}
    - name: Use Cross
      if: matrix.os == 'ubuntu-latest' && matrix.target != ''
      run: |
        # In the past, new releases of 'cross' have broken CI. So for now, we
        # pin it. We also use their pre-compiled binary releases because cross
        # has over 100 dependencies and takes a bit to compile.
dir="$RUNNER_TEMP/cross-download" mkdir "$dir" echo "$dir" >> $GITHUB_PATH cd "$dir" curl -LO "https://github.com/cross-rs/cross/releases/download/$CROSS_VERSION/cross-x86_64-unknown-linux-musl.tar.gz" tar xf cross-x86_64-unknown-linux-musl.tar.gz echo "CARGO=cross" >> $GITHUB_ENV echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV - name: Show command used for Cargo run: | echo "cargo command is: ${{ env.CARGO }}" echo "target flag is: ${{ env.TARGET }}" - name: Build snap library run: ${{ env.CARGO }} build --verbose ${{ env.TARGET }} - name: Build docs run: ${{ env.CARGO }} doc --verbose ${{ env.TARGET }} - name: Run tests for snap run: ${{ env.CARGO }} test --verbose --all ${{ env.TARGET }} - name: Build szip CLI tool run: ${{ env.CARGO }} build --verbose --manifest-path szip/Cargo.toml ${{ env.TARGET }} - name: Compile benchmarks run: ${{ env.CARGO }} bench --manifest-path bench/Cargo.toml --verbose ${{ env.TARGET }} -- --test # Only worry about snappy-cpp when not using cross, since it's a pain # otherwise to deal with cross compiling with a C++ library. Similarly, # skip CPP testing on Windows. - name: Install snappy C++ library (Ubuntu) if: env.CARGO == 'cargo' && matrix.os == 'ubuntu-latest' run: sudo apt-get install libsnappy-dev - name: Install snappy C++ library (macOS) if: env.CARGO == 'cargo' && matrix.os == 'macos-latest' run: brew install snappy - name: Build snappy-cpp library if: env.CARGO == 'cargo' && matrix.os != 'windows-latest' run: cargo build --verbose --manifest-path snappy-cpp/Cargo.toml - name: Run tests with snappy-cpp if: env.CARGO == 'cargo' && matrix.os != 'windows-latest' && matrix.os != 'macos-latest' run: cargo test --verbose --manifest-path test/Cargo.toml --features cpp - name: Compile benchmarks with snappy-cpp if: env.CARGO == 'cargo' && matrix.os != 'windows-latest' run: cargo bench --manifest-path bench/Cargo.toml --verbose --features cpp -- --test rustfmt: runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: toolchain: stable components: rustfmt - name: Check formatting run: | cargo fmt --all -- --check - name: Check formatting of snappy-cpp run: | cargo fmt --manifest-path snappy-cpp/Cargo.toml -- --check snap-1.1.1/.github/workflows/release.yml000064400000000000000000000105161046102023000163300ustar 00000000000000# The way this works is a little weird. But basically, the create-release job # runs purely to initialize the GitHub release itself. Once done, the upload # URL of the release is saved as an artifact. # # The build-release job runs only once create-release is finished. It gets the # release upload URL by downloading the corresponding artifact. It then builds # the release executables for each supported platform and attaches them as # release assets to the previously created release. # # The key here is that we create the release only once. name: release on: push: # Enable when testing release infrastructure on a branch. # branches: # - ag/rollup tags: - 'szip-[0-9]+.[0-9]+.[0-9]+' jobs: create-release: name: create-release runs-on: ubuntu-latest # env: # Set to force version number, e.g., when no tag exists. # SZIP_VERSION: 4.5.6 steps: - name: Create artifacts directory run: mkdir artifacts - name: Get the release version from the tag if: env.SZIP_VERSION == '' run: | # Apparently, this is the right way to get a tag name. Really? 
        #
        # See: https://github.community/t5/GitHub-Actions/How-to-get-just-the-tag-name/m-p/32167/highlight/true#M1027
        echo "SZIP_VERSION=${GITHUB_REF#refs/tags/szip-}" >> $GITHUB_ENV
        echo "version is: ${{ env.SZIP_VERSION }}"
    - name: Create GitHub release
      id: release
      uses: actions/create-release@v1
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      with:
        tag_name: szip-${{ env.SZIP_VERSION }}
        release_name: szip ${{ env.SZIP_VERSION }}
    - name: Save release upload URL to artifact
      run: echo "${{ steps.release.outputs.upload_url }}" > artifacts/release-upload-url
    - name: Upload artifacts
      uses: actions/upload-artifact@v1
      with:
        name: artifacts
        path: artifacts

  build-release:
    name: build-release
    needs: ['create-release']
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        build: [linux, macos, windows]
        include:
        - build: linux
          os: ubuntu-latest
          rust: nightly
          target: x86_64-unknown-linux-musl
        - build: macos
          os: macos-latest
          rust: nightly
          target: x86_64-apple-darwin
        - build: windows
          os: windows-latest
          rust: nightly
          target: x86_64-pc-windows-msvc
    steps:
    - name: Checkout repository
      uses: actions/checkout@v3
    - name: Install Rust
      uses: dtolnay/rust-toolchain@master
      with:
        toolchain: ${{ matrix.rust }}
    - name: Set TARGET flags
      run: echo "TARGET_FLAGS=--target ${{ matrix.target }}" >> $GITHUB_ENV
    - name: Get release download URL
      uses: actions/download-artifact@v1
      with:
        name: artifacts
        path: artifacts
    - name: Set release upload URL
      shell: bash
      run: |
        release_upload_url="$(cat artifacts/release-upload-url)"
        echo "RELEASE_UPLOAD_URL=$release_upload_url" >> $GITHUB_ENV
        echo "release upload url: $RELEASE_UPLOAD_URL"
    - name: Build release binary
      run: cargo build --manifest-path szip/Cargo.toml --verbose --release ${{ env.TARGET_FLAGS }}
    - name: Strip release binary (linux and macos)
      if: matrix.build == 'linux' || matrix.build == 'macos'
      run: strip "target/${{ matrix.target }}/release/szip"
    - name: Build archive
      shell: bash
      run: |
        staging="szip-${{ matrix.target }}"
        mkdir "$staging"
        cp {szip/README.md,szip/COPYING} "$staging/"
        if [ "${{ matrix.build }}" = "windows" ]; then
          cp "target/${{ matrix.target }}/release/szip.exe" "$staging/"
          7z a "$staging.zip" "$staging"
          echo "ASSET=$staging.zip" >> $GITHUB_ENV
        else
          cp "target/${{ matrix.target }}/release/szip" "$staging/"
          tar czf "$staging.tar.gz" "$staging"
          echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV
        fi
    - name: Upload release archive
      uses: actions/upload-release-asset@v1.0.1
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      with:
        upload_url: ${{ env.RELEASE_UPLOAD_URL }}
        asset_path: ${{ env.ASSET }}
        asset_name: ${{ env.ASSET }}
        asset_content_type: application/octet-stream

snap-1.1.1/.gitignore

.*.swp
doc
tags
examples/ss10pusa.csv
build
target
Cargo.lock
scratch*
bench_large/huge
x

snap-1.1.1/COPYING

Copyright 2011, The Snappy-Rust Authors. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

   * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

   * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.
   * Neither the name of the copyright holder nor the names of its
     contributors may be used to endorse or promote products derived from
     this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

snap-1.1.1/Cargo.lock

# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "doc-comment"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"

[[package]]
name = "snap"
version = "1.1.1"
dependencies = [
 "doc-comment",
]

snap-1.1.1/Cargo.toml

# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
edition = "2018"
name = "snap"
version = "1.1.1"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
exclude = ["data/*"]
description = """
A pure Rust implementation of the Snappy compression algorithm. Includes
streaming compression and decompression.
"""
homepage = "https://github.com/BurntSushi/rust-snappy"
documentation = "https://docs.rs/snap"
readme = "README.md"
keywords = [
    "snappy",
    "compress",
    "compression",
    "decompress",
    "decompression",
]
license = "BSD-3-Clause"
repository = "https://github.com/BurntSushi/rust-snappy"

[profile.release]
debug = 2

[profile.test]
opt-level = 3

[lib]
bench = false

[dev-dependencies.doc-comment]
version = "0.3.1"

snap-1.1.1/Cargo.toml.orig

[package]
name = "snap"
version = "1.1.1"  #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = """
A pure Rust implementation of the Snappy compression algorithm. Includes
streaming compression and decompression.
""" documentation = "https://docs.rs/snap" homepage = "https://github.com/BurntSushi/rust-snappy" repository = "https://github.com/BurntSushi/rust-snappy" readme = "README.md" keywords = ["snappy", "compress", "compression", "decompress", "decompression"] license = "BSD-3-Clause" edition = "2018" exclude = ["data/*"] [workspace] exclude = ["snappy-cpp"] members = ["bench", "test", "szip"] [lib] bench = false [profile.release] debug = true [profile.test] opt-level = 3 [dev-dependencies] doc-comment = "0.3.1" snap-1.1.1/README.md000064400000000000000000000227371046102023000120570ustar 00000000000000snap ==== A pure Rust implementation of the [Snappy compression algorithm](https://google.github.io/snappy/). Includes streaming compression and decompression using the Snappy frame format. This implementation is ported from both the [reference C++ implementation](https://github.com/google/snappy) and the [Go implementation](https://github.com/golang/snappy). [![Build status](https://github.com/BurntSushi/rust-snappy/workflows/ci/badge.svg)](https://github.com/BurntSushi/rust-snappy/actions) [![](https://meritbadge.herokuapp.com/snap)](https://crates.io/crates/snap) Licensed under the BSD 3-Clause. ### Documentation https://docs.rs/snap ### Usage Add this to your `Cargo.toml`: ```toml [dependencies] snap = "1" ``` ### Example: compress data on `stdin` This program reads data from `stdin`, compresses it and emits it to `stdout`. This example can be found in `examples/compress.rs`: ```rust use std::io; fn main() { let stdin = io::stdin(); let stdout = io::stdout(); let mut rdr = stdin.lock(); // Wrap the stdout writer in a Snappy writer. let mut wtr = snap::write::FrameEncoder::new(stdout.lock()); io::copy(&mut rdr, &mut wtr).expect("I/O operation failed"); } ``` ### Example: decompress data on `stdin` This program reads data from `stdin`, decompresses it and emits it to `stdout`. This example can be found in `examples/decompress.rs`: ```rust use std::io; fn main() { let stdin = io::stdin(); let stdout = io::stdout(); // Wrap the stdin reader in a Snappy reader. let mut rdr = snap::read::FrameDecoder::new(stdin.lock()); let mut wtr = stdout.lock(); io::copy(&mut rdr, &mut wtr).expect("I/O operation failed"); } ``` ### Example: the szip tool `szip` is a tool with similar behavior as `gzip`, except it uses Snappy compression. It can be installed with Cargo: ``` $ cargo install szip ``` To compress a file, run `szip file`. To decompress a file, run `szip -d file.sz`. See `szip --help` for more details. ### Testing This crate is tested against the reference C++ implementation of Snappy. Currently, compression is byte-for-byte equivalent with the C++ implementation. This seems like a reasonable starting point, although it is not necessarily a goal to always maintain byte-for-byte equivalence. Tests against the reference C++ implementation can be run with `cargo test --features cpp`. Note that you will need to have the C++ Snappy library in your `LD_LIBRARY_PATH` (or equivalent). To run tests, you'll need to explicitly run the `test` crate: ``` $ cargo test --manifest-path test/Cargo.toml ``` To test that this library matches the output of the reference C++ library, use: ``` $ cargo test --manifest-path test/Cargo.toml --features cpp ``` Tests are in a separate crate because of the dependency on the C++ reference library. Namely, Cargo does not yet permit optional dev dependencies. ### Minimum Rust version policy This crate's minimum supported `rustc` version is `1.39.0`. 
The current policy is that the minimum Rust version required to use this
crate can be increased in minor version updates. For example, if `crate 1.0`
requires Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also
require Rust 1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a
newer minimum version of Rust.

In general, this crate will be conservative with respect to the minimum
supported version of Rust.

### Performance

The performance of this implementation should roughly match the performance
of the C++ implementation on x86_64. Below are the results of the
microbenchmarks (as defined in the C++ library):

```
group                         snappy/cpp/                            snappy/snap/
-----                         -----------                            ------------
compress/zflat00_html         1.00     94.5±0.62µs  1033.1 MB/sec    1.02     96.1±0.74µs  1016.2 MB/sec
compress/zflat01_urls         1.00   1182.3±8.89µs   566.3 MB/sec    1.04  1235.3±11.99µs   542.0 MB/sec
compress/zflat02_jpg          1.00      7.2±0.11µs    15.9 GB/sec    1.01      7.3±0.06µs    15.8 GB/sec
compress/zflat03_jpg_200      1.10    262.4±1.84ns   727.0 MB/sec    1.00    237.5±2.95ns   803.2 MB/sec
compress/zflat04_pdf          1.02     10.3±0.18µs     9.2 GB/sec    1.00     10.1±0.16µs     9.4 GB/sec
compress/zflat05_html4        1.00    399.2±5.36µs   978.4 MB/sec    1.01    404.0±2.46µs   966.8 MB/sec
compress/zflat06_txt1         1.00    397.3±2.61µs   365.1 MB/sec    1.00    398.5±3.06µs   364.0 MB/sec
compress/zflat07_txt2         1.00    352.8±3.20µs   338.4 MB/sec    1.01    355.2±5.01µs   336.1 MB/sec
compress/zflat08_txt3         1.01   1058.8±6.85µs   384.4 MB/sec    1.00   1051.8±6.74µs   386.9 MB/sec
compress/zflat09_txt4         1.00   1444.1±8.10µs   318.2 MB/sec    1.00  1450.0±13.36µs   316.9 MB/sec
compress/zflat10_pb           1.00     85.1±0.58µs  1328.6 MB/sec    1.02     87.0±0.90µs  1300.2 MB/sec
compress/zflat11_gaviota      1.07    311.9±4.27µs   563.5 MB/sec    1.00    291.9±1.86µs   602.3 MB/sec
decompress/uflat00_html       1.03     36.9±0.28µs     2.6 GB/sec    1.00     36.0±0.25µs     2.7 GB/sec
decompress/uflat01_urls       1.04    437.4±2.89µs  1530.7 MB/sec    1.00    419.9±3.10µs  1594.6 MB/sec
decompress/uflat02_jpg        1.00      4.6±0.05µs    24.9 GB/sec    1.00      4.6±0.03µs    25.0 GB/sec
decompress/uflat03_jpg_200    1.08    122.4±1.06ns  1558.6 MB/sec    1.00    112.8±1.35ns  1690.8 MB/sec
decompress/uflat04_pdf        1.00      5.7±0.05µs    16.8 GB/sec    1.10      6.2±0.07µs    15.3 GB/sec
decompress/uflat05_html4      1.01    164.1±1.71µs     2.3 GB/sec    1.00    162.6±2.16µs     2.3 GB/sec
decompress/uflat06_txt1       1.08    146.6±1.01µs   989.5 MB/sec    1.00    135.3±1.11µs  1072.0 MB/sec
decompress/uflat07_txt2       1.09    130.2±0.93µs   916.6 MB/sec    1.00    119.2±0.96µs  1001.8 MB/sec
decompress/uflat08_txt3       1.07    387.2±2.30µs  1051.0 MB/sec    1.00    361.9±6.29µs  1124.7 MB/sec
decompress/uflat09_txt4       1.09    536.1±3.47µs   857.2 MB/sec    1.00    494.0±5.05µs   930.2 MB/sec
decompress/uflat10_pb         1.00     32.5±0.19µs     3.4 GB/sec    1.05     34.0±0.48µs     3.2 GB/sec
decompress/uflat11_gaviota    1.00    142.1±2.05µs  1236.7 MB/sec    1.00    141.5±0.92µs  1242.3 MB/sec
```

Notes: These benchmarks were run with Snappy/C++ 1.1.8. Both the C++ and Rust
benchmarks were run with the same benchmark harness. Benchmarks were run on
an Intel i7-6900K.

Additionally, here are the benchmarks run on the same machine from the Go
implementation of Snappy (which has a hand rolled implementation in
Assembly).
Note that these were run using Go's microbenchmark tool, so the numbers may
not be directly comparable, but they should serve as a useful signpost:

```
Benchmark_UFlat0          25040     45180 ns/op   2266.49 MB/s
Benchmark_UFlat1           2648    451475 ns/op   1555.10 MB/s
Benchmark_UFlat2         229965      4788 ns/op  25709.01 MB/s
Benchmark_UFlat3       11355555       101 ns/op   1973.65 MB/s
Benchmark_UFlat4         196551      6055 ns/op  16912.64 MB/s
Benchmark_UFlat5           6016    189219 ns/op   2164.68 MB/s
Benchmark_UFlat6           6914    166371 ns/op    914.16 MB/s
Benchmark_UFlat7           8173    142506 ns/op    878.41 MB/s
Benchmark_UFlat8           2744    436424 ns/op    977.84 MB/s
Benchmark_UFlat9           1999    591141 ns/op    815.14 MB/s
Benchmark_UFlat10         28885     37291 ns/op   3180.04 MB/s
Benchmark_UFlat11          7308    163366 ns/op   1128.26 MB/s
Benchmark_ZFlat0          12902     91231 ns/op   1122.43 MB/s
Benchmark_ZFlat1            997   1200579 ns/op    584.79 MB/s
Benchmark_ZFlat2         136762      7832 ns/op  15716.53 MB/s
Benchmark_ZFlat3        4896124       245 ns/op    817.27 MB/s
Benchmark_ZFlat4         117643     10129 ns/op  10109.44 MB/s
Benchmark_ZFlat5           2934    394742 ns/op   1037.64 MB/s
Benchmark_ZFlat6           3008    382877 ns/op    397.23 MB/s
Benchmark_ZFlat7           3411    344916 ns/op    362.93 MB/s
Benchmark_ZFlat8            966   1057985 ns/op    403.36 MB/s
Benchmark_ZFlat9            854   1429024 ns/op    337.20 MB/s
Benchmark_ZFlat10         13861     83040 ns/op   1428.08 MB/s
Benchmark_ZFlat11          4070    293952 ns/op    627.04 MB/s
```

To run benchmarks, including the reference C++ implementation, do the
following:

```
$ cd bench
$ cargo bench --features cpp -- --save-baseline snappy
```

To compare them, as shown above, install
[`critcmp`](https://github.com/BurntSushi/critcmp) and run (assuming you
saved the baseline above under the name `snappy`):

```
$ critcmp snappy -g '.*?/(.*$)'
```

Finally, the Go benchmarks were run with the following command on commit
`ff6b7dc8`:

```
$ go test -cpu 1 -bench Flat -download
```

### Comparison with other Snappy crates

* `snappy` - These are bindings to the C++ library. No support for the
  Snappy frame format.
* `snappy_framed` - Implements the Snappy frame format on top of the
  `snappy` crate.
* `rsnappy` - Written in pure Rust, but lacks documentation and the Snappy
  frame format. Performance is unclear and tests appear incomplete.
* `snzip` - Was created and immediately yanked from crates.io.
snap-1.1.1/build.rs

use std::env;
use std::fs::File;
use std::io::{self, Write};
use std::path::{Path, PathBuf};

const CASTAGNOLI_POLY: u32 = 0x82f63b78;

type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;

fn main() {
    if let Err(err) = try_main() {
        panic!("{}", err);
    }
}

fn try_main() -> Result<()> {
    let out_dir = match env::var_os("OUT_DIR") {
        None => {
            return Err(From::from("OUT_DIR environment variable not defined"))
        }
        Some(out_dir) => PathBuf::from(out_dir),
    };
    write_tag_lookup_table(&out_dir)?;
    write_crc_tables(&out_dir)?;
    Ok(())
}

fn write_tag_lookup_table(out_dir: &Path) -> Result<()> {
    let out_path = out_dir.join("tag.rs");
    let mut out = io::BufWriter::new(File::create(out_path)?);

    writeln!(out, "pub const TAG_LOOKUP_TABLE: [u16; 256] = [")?;
    for b in 0u8..=255 {
        writeln!(out, "    {},", tag_entry(b))?;
    }
    writeln!(out, "];")?;
    Ok(())
}

fn tag_entry(b: u8) -> u16 {
    let b = b as u16;
    match b & 0b00000011 {
        0b00 => {
            let lit_len = (b >> 2) + 1;
            if lit_len <= 60 {
                lit_len
            } else {
                assert!(lit_len <= 64);
                (lit_len - 60) << 11
            }
        }
        0b01 => {
            let len = 4 + ((b >> 2) & 0b111);
            let offset = (b >> 5) & 0b111;
            (1 << 11) | (offset << 8) | len
        }
        0b10 => {
            let len = 1 + (b >> 2);
            (2 << 11) | len
        }
        0b11 => {
            let len = 1 + (b >> 2);
            (4 << 11) | len
        }
        _ => unreachable!(),
    }
}

fn write_crc_tables(out_dir: &Path) -> Result<()> {
    let out_path = out_dir.join("crc32_table.rs");
    let mut out = io::BufWriter::new(File::create(out_path)?);

    let table = make_table(CASTAGNOLI_POLY);
    let table16 = make_table16(CASTAGNOLI_POLY);

    writeln!(out, "pub const TABLE: [u32; 256] = [")?;
    for &x in table.iter() {
        writeln!(out, "    {},", x)?;
    }
    writeln!(out, "];\n")?;

    writeln!(out, "pub const TABLE16: [[u32; 256]; 16] = [")?;
    for table in table16.iter() {
        writeln!(out, "    [")?;
        for &x in table.iter() {
            writeln!(out, "        {},", x)?;
        }
        writeln!(out, "    ],")?;
    }
    writeln!(out, "];")?;

    out.flush()?;
    Ok(())
}

fn make_table16(poly: u32) -> [[u32; 256]; 16] {
    let mut tab = [[0; 256]; 16];
    tab[0] = make_table(poly);
    for i in 0..256 {
        let mut crc = tab[0][i];
        for j in 1..16 {
            crc = (crc >> 8) ^ tab[0][crc as u8 as usize];
            tab[j][i] = crc;
        }
    }
    tab
}

fn make_table(poly: u32) -> [u32; 256] {
    let mut tab = [0; 256];
    for i in 0u32..256u32 {
        let mut crc = i;
        for _ in 0..8 {
            if crc & 1 == 1 {
                crc = (crc >> 1) ^ poly;
            } else {
                crc >>= 1;
            }
        }
        tab[i as usize] = crc;
    }
    tab
}

snap-1.1.1/examples/compress-escaped.rs

use std::env;
use std::io::{self, Write};
use std::process;

fn main() {
    let bytes = match env::args().nth(1) {
        None => {
            writeln!(&mut io::stderr(), "Usage: compress-escaped string")
                .unwrap();
            process::exit(1);
        }
        Some(arg) => arg.into_bytes(),
    };
    let compressed = frame_press(&bytes);
    println!("{}", escape(&compressed));
    println!("{}", escape(&frame_depress(&compressed)));
}

fn frame_press(bytes: &[u8]) -> Vec<u8> {
    use snap::write;

    let mut wtr = write::FrameEncoder::new(vec![]);
    wtr.write_all(bytes).unwrap();
    wtr.into_inner().unwrap()
}

fn frame_depress(bytes: &[u8]) -> Vec<u8> {
    use snap::read;
    use std::io::Read;

    let mut buf = vec![];
    read::FrameDecoder::new(bytes).read_to_end(&mut buf).unwrap();
    buf
}

fn escape(bytes: &[u8]) -> String {
    use std::ascii::escape_default;
    bytes.iter().flat_map(|&b| escape_default(b)).map(|b| b as char).collect()
}

snap-1.1.1/examples/compress.rs

use snap;

use std::io;
fn main() {
    let stdin = io::stdin();
    let stdout = io::stdout();

    let mut rdr = stdin.lock();
    // Wrap the stdout writer in a Snappy writer.
    let mut wtr = snap::write::FrameEncoder::new(stdout.lock());
    io::copy(&mut rdr, &mut wtr).expect("I/O operation failed");
}

snap-1.1.1/examples/decompress.rs

use snap;

use std::io;

fn main() {
    let stdin = io::stdin();
    let stdout = io::stdout();

    // Wrap the stdin reader in a Snappy reader.
    let mut rdr = snap::read::FrameDecoder::new(stdin.lock());
    let mut wtr = stdout.lock();
    io::copy(&mut rdr, &mut wtr).expect("I/O operation failed");
}

snap-1.1.1/rustfmt.toml

max_width = 79
use_small_heuristics = "max"

snap-1.1.1/src/bytes.rs

use std::convert::TryInto;
use std::io;

/// Read a u16 in little endian format from the beginning of the given slice.
/// This panics if the slice has length less than 2.
pub fn read_u16_le(slice: &[u8]) -> u16 {
    u16::from_le_bytes(slice[..2].try_into().unwrap())
}

/// Read a u24 (returned as a u32 with the most significant 8 bits always set
/// to 0) in little endian format from the beginning of the given slice. This
/// panics if the slice has length less than 3.
pub fn read_u24_le(slice: &[u8]) -> u32 {
    slice[0] as u32 | (slice[1] as u32) << 8 | (slice[2] as u32) << 16
}

/// Read a u32 in little endian format from the beginning of the given slice.
/// This panics if the slice has length less than 4.
pub fn read_u32_le(slice: &[u8]) -> u32 {
    u32::from_le_bytes(slice[..4].try_into().unwrap())
}

/// Like read_u32_le, but from an io::Read implementation. If io::Read does
/// not yield at least 4 bytes, then this returns an unexpected EOF error.
pub fn io_read_u32_le<R: io::Read>(mut rdr: R) -> io::Result<u32> {
    let mut buf = [0; 4];
    rdr.read_exact(&mut buf)?;
    Ok(u32::from_le_bytes(buf))
}

/// Write a u16 in little endian format to the beginning of the given slice.
/// This panics if the slice has length less than 2.
pub fn write_u16_le(n: u16, slice: &mut [u8]) {
    assert!(slice.len() >= 2);
    let bytes = n.to_le_bytes();
    slice[0] = bytes[0];
    slice[1] = bytes[1];
}

/// Write a u24 (given as a u32 where the most significant 8 bits are ignored)
/// in little endian format to the beginning of the given slice. This panics
/// if the slice has length less than 3.
pub fn write_u24_le(n: u32, slice: &mut [u8]) {
    slice[0] = n as u8;
    slice[1] = (n >> 8) as u8;
    slice[2] = (n >> 16) as u8;
}

/// Write a u32 in little endian format to the beginning of the given slice.
/// This panics if the slice has length less than 4.
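///
/// A quick sketch of the expected byte order (illustrative values only, not
/// taken from the test suite):
///
/// ```text
/// let mut buf = [0u8; 4];
/// write_u32_le(0xDDCCBBAA, &mut buf);
/// assert_eq!(buf, [0xAA, 0xBB, 0xCC, 0xDD]);
/// ```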
pub fn write_u32_le(n: u32, slice: &mut [u8]) {
    assert!(slice.len() >= 4);
    let bytes = n.to_le_bytes();
    slice[0] = bytes[0];
    slice[1] = bytes[1];
    slice[2] = bytes[2];
    slice[3] = bytes[3];
}

/// https://developers.google.com/protocol-buffers/docs/encoding#varints
pub fn write_varu64(data: &mut [u8], mut n: u64) -> usize {
    let mut i = 0;
    while n >= 0b1000_0000 {
        data[i] = (n as u8) | 0b1000_0000;
        n >>= 7;
        i += 1;
    }
    data[i] = n as u8;
    i + 1
}

/// https://developers.google.com/protocol-buffers/docs/encoding#varints
pub fn read_varu64(data: &[u8]) -> (u64, usize) {
    let mut n: u64 = 0;
    let mut shift: u32 = 0;
    for (i, &b) in data.iter().enumerate() {
        if b < 0b1000_0000 {
            return match (b as u64).checked_shl(shift) {
                None => (0, 0),
                Some(b) => (n | b, i + 1),
            };
        }
        match ((b as u64) & 0b0111_1111).checked_shl(shift) {
            None => return (0, 0),
            Some(b) => n |= b,
        }
        shift += 7;
    }
    (0, 0)
}

/// Does an unaligned load of a little endian encoded u32.
///
/// This is unsafe because `data` must point to some memory of size at least 4.
pub unsafe fn loadu_u32_le(data: *const u8) -> u32 {
    loadu_u32_ne(data).to_le()
}

/// Does an unaligned load of a native endian encoded u32.
///
/// This is unsafe because `data` must point to some memory of size at least 4.
pub unsafe fn loadu_u32_ne(data: *const u8) -> u32 {
    (data as *const u32).read_unaligned()
}

/// Does an unaligned load of a little endian encoded u64.
///
/// This is unsafe because `data` must point to some memory of size at least 8.
pub unsafe fn loadu_u64_le(data: *const u8) -> u64 {
    loadu_u64_ne(data).to_le()
}

/// Does an unaligned load of a native endian encoded u64.
///
/// This is unsafe because `data` must point to some memory of size at least 8.
pub unsafe fn loadu_u64_ne(data: *const u8) -> u64 {
    (data as *const u64).read_unaligned()
}

snap-1.1.1/src/compress.rs

use std::fmt;
use std::ops::{Deref, DerefMut};
use std::ptr;

use crate::bytes;
use crate::error::{Error, Result};
use crate::{MAX_BLOCK_SIZE, MAX_INPUT_SIZE};

/// The total number of slots we permit for our hash table of 4 byte repeat
/// sequences.
const MAX_TABLE_SIZE: usize = 1 << 14;

/// The size of a small hash table. This is useful for reducing overhead when
/// compressing very small blocks of bytes.
const SMALL_TABLE_SIZE: usize = 1 << 10;

/// The total number of bytes that we always leave uncompressed at the end
/// of the buffer. This in particular affords us some wiggle room during
/// compression such that faster copy operations can be used.
const INPUT_MARGIN: usize = 16 - 1;

/// The minimum block size that we're willing to consider for compression.
/// Anything smaller than this gets emitted as a literal.
const MIN_NON_LITERAL_BLOCK_SIZE: usize = 1 + 1 + INPUT_MARGIN;

/// Nice names for the various Snappy tags.
enum Tag {
    Literal = 0b00,
    Copy1 = 0b01,
    Copy2 = 0b10,
    // Compression never actually emits a Copy4 operation and decompression
    // uses tricks so that we never explicitly do case analysis on the copy
    // operation type, therefore leading to the fact that we never use Copy4.
    #[allow(dead_code)]
    Copy4 = 0b11,
}

/// Returns the maximum compressed size given the uncompressed size.
///
/// If the uncompressed size exceeds the maximum allowable size then this
/// returns 0.
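///
/// As a worked example of the formula in the body below
/// (`32 + len + len / 6`, using integer division): an input of 65,536 bytes
/// has a maximum compressed size of `32 + 65536 + 10922 = 76490` bytes.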
pub fn max_compress_len(input_len: usize) -> usize {
    let input_len = input_len as u64;
    if input_len > MAX_INPUT_SIZE {
        return 0;
    }
    let max = 32 + input_len + (input_len / 6);
    if max > MAX_INPUT_SIZE {
        0
    } else {
        max as usize
    }
}

/// Encoder is a raw encoder for compressing bytes in the Snappy format.
///
/// This encoder does not use the Snappy frame format and simply compresses
/// the given bytes in one big Snappy block (that is, it has a single header).
///
/// Unless you explicitly need the low-level control, you should use
/// [`read::FrameEncoder`](../read/struct.FrameEncoder.html)
/// or
/// [`write::FrameEncoder`](../write/struct.FrameEncoder.html)
/// instead, which compresses to the Snappy frame format.
///
/// It is beneficial to reuse an Encoder when possible.
pub struct Encoder {
    small: [u16; SMALL_TABLE_SIZE],
    big: Vec<u16>,
}

impl fmt::Debug for Encoder {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Encoder(...)")
    }
}

impl Encoder {
    /// Return a new encoder that can be used for compressing bytes.
    pub fn new() -> Encoder {
        Encoder { small: [0; SMALL_TABLE_SIZE], big: vec![] }
    }

    /// Compresses all bytes in `input` into `output`.
    ///
    /// `input` can be any arbitrary sequence of bytes.
    ///
    /// `output` must be large enough to hold the maximum possible compressed
    /// size of `input`, which can be computed using `max_compress_len`.
    ///
    /// On success, this returns the number of bytes written to `output`.
    ///
    /// # Errors
    ///
    /// This method returns an error in the following circumstances:
    ///
    /// * The total number of bytes to compress exceeds `2^32 - 1`.
    /// * `output` has length less than `max_compress_len(input.len())`.
    pub fn compress(
        &mut self,
        mut input: &[u8],
        output: &mut [u8],
    ) -> Result<usize> {
        match max_compress_len(input.len()) {
            0 => {
                return Err(Error::TooBig {
                    given: input.len() as u64,
                    max: MAX_INPUT_SIZE,
                });
            }
            min if output.len() < min => {
                return Err(Error::BufferTooSmall {
                    given: output.len() as u64,
                    min: min as u64,
                });
            }
            _ => {}
        }
        // Handle an edge case specially.
        if input.is_empty() {
            // Encodes a varint of 0, denoting the total size of uncompressed
            // bytes.
            output[0] = 0;
            return Ok(1);
        }
        // Write the Snappy header, which is just the total number of
        // uncompressed bytes.
        let mut d = bytes::write_varu64(output, input.len() as u64);
        while !input.is_empty() {
            // Find the next block.
            let mut src = input;
            if src.len() > MAX_BLOCK_SIZE {
                src = &src[..MAX_BLOCK_SIZE as usize];
            }
            input = &input[src.len()..];

            // If the block is smallish, then don't waste time on it and just
            // emit a literal.
            let mut block = Block::new(src, output, d);
            if block.src.len() < MIN_NON_LITERAL_BLOCK_SIZE {
                let lit_end = block.src.len();
                unsafe {
                    // SAFETY: next_emit is zero (in bounds) and the end is
                    // the length of the block (in bounds).
                    block.emit_literal(lit_end);
                }
            } else {
                let table = self.block_table(block.src.len());
                block.compress(table);
            }
            d = block.d;
        }
        Ok(d)
    }

    /// Compresses all bytes in `input` into a freshly allocated `Vec<u8>`.
    ///
    /// This is just like the `compress` method, except it allocates a `Vec`
    /// with the right size for you. (This is intended to be a convenience
    /// method.)
    ///
    /// This method returns an error under the same circumstances that
    /// `compress` does.
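    ///
    /// A minimal usage sketch (this assumes the type is reachable as
    /// `snap::raw::Encoder`, i.e., via the crate's `raw` module):
    ///
    /// ```text
    /// let data = b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbb";
    /// let compressed = snap::raw::Encoder::new().compress_vec(data)?;
    /// assert!(compressed.len() < data.len());
    /// ```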
    pub fn compress_vec(&mut self, input: &[u8]) -> Result<Vec<u8>> {
        let mut buf = vec![0; max_compress_len(input.len())];
        let n = self.compress(input, &mut buf)?;
        buf.truncate(n);
        Ok(buf)
    }
}

struct Block<'s, 'd> {
    src: &'s [u8],
    s: usize,
    s_limit: usize,
    dst: &'d mut [u8],
    d: usize,
    next_emit: usize,
}

impl<'s, 'd> Block<'s, 'd> {
    #[inline(always)]
    fn new(src: &'s [u8], dst: &'d mut [u8], d: usize) -> Block<'s, 'd> {
        Block {
            src: src,
            s: 0,
            s_limit: src.len(),
            dst: dst,
            d: d,
            next_emit: 0,
        }
    }

    #[inline(always)]
    fn compress(&mut self, mut table: BlockTable<'_>) {
        debug_assert!(!table.is_empty());
        debug_assert!(self.src.len() >= MIN_NON_LITERAL_BLOCK_SIZE);

        self.s += 1;
        self.s_limit -= INPUT_MARGIN;
        let mut next_hash =
            table.hash(bytes::read_u32_le(&self.src[self.s..]));
        loop {
            let mut skip = 32;
            let mut candidate;
            let mut s_next = self.s;
            loop {
                self.s = s_next;
                let bytes_between_hash_lookups = skip >> 5;
                s_next = self.s + bytes_between_hash_lookups;
                skip += bytes_between_hash_lookups;
                if s_next > self.s_limit {
                    return self.done();
                }
                unsafe {
                    // SAFETY: next_hash is always computed by table.hash
                    // which is guaranteed to be in bounds.
                    candidate = *table.get_unchecked(next_hash) as usize;
                    *table.get_unchecked_mut(next_hash) = self.s as u16;

                    let srcp = self.src.as_ptr();
                    // SAFETY: s_next is guaranteed to be less than s_limit by
                    // the conditional above, which implies s_next is in
                    // bounds.
                    let x = bytes::loadu_u32_le(srcp.add(s_next));
                    next_hash = table.hash(x);
                    // SAFETY: self.s is always less than s_next, so it is
                    // also in bounds by the argument above.
                    //
                    // candidate is extracted from table, which is only ever
                    // set to valid positions in the block and is therefore
                    // also in bounds.
                    //
                    // We only need to compare cur/cand for equality, so we
                    // don't need to bother with endianness. cur corresponds
                    // to the bytes at the current position and cand
                    // corresponds to a potential match. If they're equal, we
                    // declare victory and move below to try and extend the
                    // match.
                    let cur = bytes::loadu_u32_ne(srcp.add(self.s));
                    let cand = bytes::loadu_u32_ne(srcp.add(candidate));
                    if cur == cand {
                        break;
                    }
                }
            }
            // While the above found a candidate for compression, before we
            // emit a copy operation for it, we need to make sure that we emit
            // any bytes between the last copy operation and this one as a
            // literal.
            let lit_end = self.s;
            unsafe {
                // SAFETY: next_emit is set to a previous value of self.s,
                // which is guaranteed to be less than s_limit (in bounds).
                // lit_end is set to the current value of self.s, also
                // guaranteed to be less than s_limit (in bounds).
                self.emit_literal(lit_end);
            }
            loop {
                // Look for more matching bytes starting at the position of
                // the candidate and the current src position. We increment
                // self.s and candidate by 4 since we already know the first 4
                // bytes match.
                let base = self.s;
                self.s += 4;
                unsafe {
                    // SAFETY: candidate is always set to a value from our
                    // hash table, which only contains positions in self.src
                    // that have been seen for this block that occurred before
                    // self.s.
                    self.extend_match(candidate + 4);
                }
                let (offset, len) = (base - candidate, self.s - base);
                self.emit_copy(offset, len);
                self.next_emit = self.s;
                if self.s >= self.s_limit {
                    return self.done();
                }
                // Update the hash table with the byte sequences
                // self.src[self.s - 1..self.s + 3] and
                // self.src[self.s..self.s + 4]. Instead of reading 4 bytes
                // twice, we read 8 bytes once.
                //
                // If we happen to get a hit on self.src[self.s..self.s + 4],
                // then continue this loop and extend the match.
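                //
                // As an illustration of that single 8 byte read (a sketch;
                // ranges are byte indices into self.src, inclusive):
                //
                //   x             = src[s-1 ..= s+6] as a little endian u64
                //   x as u32      = src[s-1 ..= s+2]  (hashed as prev_hash)
                //   (x >> 8) u32  = src[s   ..= s+3]  (hashed as cur_hash)
                //   (x >> 16) u32 = src[s+1 ..= s+4]  (next_hash on a miss)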
                unsafe {
                    let srcp = self.src.as_ptr();
                    // SAFETY: self.s can never exceed s_limit given by the
                    // conditional above and self.s is guaranteed to be
                    // non-zero and is therefore in bounds.
                    let x = bytes::loadu_u64_le(srcp.add(self.s - 1));
                    // The lower 4 bytes of x correspond to
                    // self.src[self.s - 1..self.s + 3].
                    let prev_hash = table.hash(x as u32);
                    // SAFETY: Hash values are guaranteed to be in bounds.
                    *table.get_unchecked_mut(prev_hash) = (self.s - 1) as u16;
                    // The lower 4 bytes of x>>8 correspond to
                    // self.src[self.s..self.s + 4].
                    let cur_hash = table.hash((x >> 8) as u32);
                    // SAFETY: Hash values are guaranteed to be in bounds.
                    candidate = *table.get_unchecked(cur_hash) as usize;
                    *table.get_unchecked_mut(cur_hash) = self.s as u16;

                    // SAFETY: candidate is set from table, which always
                    // contains valid positions in the current block.
                    let y = bytes::loadu_u32_le(srcp.add(candidate));
                    if (x >> 8) as u32 != y {
                        // If we didn't get a hit, update the next hash
                        // and move on. Our initial 8 byte read continues to
                        // pay off.
                        next_hash = table.hash((x >> 16) as u32);
                        self.s += 1;
                        break;
                    }
                }
            }
        }
    }

    /// Emits one or more copy operations with the given offset and length.
    /// offset must be in the range [1, 65535] and len must be in the range
    /// [4, 65535].
    #[inline(always)]
    fn emit_copy(&mut self, offset: usize, mut len: usize) {
        debug_assert!(1 <= offset && offset <= 65535);
        // Copy operations only allow lengths up to 64, but we'll allow bigger
        // lengths and emit as many operations as we need.
        //
        // N.B. Since our block size is 64KB, we never actually emit a copy 4
        // operation.
        debug_assert!(4 <= len && len <= 65535);

        // Emit copy 2 operations until we don't have to.
        // We check on 68 here and emit a shorter copy than 64 below because
        // it is cheaper to, e.g., encode a length 67 copy as a length 60
        // copy 2 followed by a length 7 copy 1 than to encode it as a length
        // 64 copy 2 followed by a length 3 copy 2. The key here is that a
        // copy 1 operation requires at least length 4 which forces a length 3
        // copy to use a copy 2 operation.
        while len >= 68 {
            self.emit_copy2(offset, 64);
            len -= 64;
        }
        if len > 64 {
            self.emit_copy2(offset, 60);
            len -= 60;
        }
        // If we can squeeze the last copy into a copy 1 operation, do it.
        if len <= 11 && offset <= 2047 {
            self.dst[self.d] = (((offset >> 8) as u8) << 5)
                | (((len - 4) as u8) << 2)
                | (Tag::Copy1 as u8);
            self.dst[self.d + 1] = offset as u8;
            self.d += 2;
        } else {
            self.emit_copy2(offset, len);
        }
    }

    /// Emits a "copy 2" operation with the given offset and length. The
    /// offset and length must be valid for a copy 2 operation. i.e., offset
    /// must be in the range [1, 65535] and len must be in the range [1, 64].
    #[inline(always)]
    fn emit_copy2(&mut self, offset: usize, len: usize) {
        debug_assert!(1 <= offset && offset <= 65535);
        debug_assert!(1 <= len && len <= 64);
        self.dst[self.d] = (((len - 1) as u8) << 2) | (Tag::Copy2 as u8);
        bytes::write_u16_le(offset as u16, &mut self.dst[self.d + 1..]);
        self.d += 3;
    }

    /// Attempts to extend a match from the current position in self.src with
    /// the candidate position given.
    ///
    /// This method uses unaligned loads and elides bounds checks, so the
    /// caller must guarantee that cand points to a valid location in self.src
    /// and is less than the current position in src.
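    ///
    /// As a worked example of the bit trick in the body (a sketch, not taken
    /// from the test suite): if `x ^ y` has its lowest set bit at bit 21,
    /// then `trailing_zeros()` is 21 and `21 / 8 == 2`, i.e., exactly the
    /// first two bytes of the two 8 byte loads matched.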
    #[inline(always)]
    unsafe fn extend_match(&mut self, mut cand: usize) {
        debug_assert!(cand < self.s);
        while self.s + 8 <= self.src.len() {
            let srcp = self.src.as_ptr();
            // SAFETY: The loop invariant guarantees that there is at least
            // 8 bytes to read at self.src + self.s. Since cand must be
            // guaranteed by the caller to be valid and less than self.s, it
            // also has enough room to read 8 bytes.
            //
            // TODO(ag): Despite my best efforts, I couldn't get this to
            // autovectorize with 128-bit loads. The logic after the loads
            // appears to be a little too clever...
            let x = bytes::loadu_u64_ne(srcp.add(self.s));
            let y = bytes::loadu_u64_ne(srcp.add(cand));
            if x == y {
                // If all 8 bytes are equal, move on...
                self.s += 8;
                cand += 8;
            } else {
                // Otherwise, find the last byte that was equal. We can do
                // this efficiently by interpreting x/y as little endian
                // numbers, which lets us use the number of trailing zeroes
                // as a proxy for the number of equivalent bits (after an
                // XOR).
                let z = x.to_le() ^ y.to_le();
                self.s += z.trailing_zeros() as usize / 8;
                return;
            }
        }
        // When we have fewer than 8 bytes left in the block, fall back to the
        // slow loop.
        while self.s < self.src.len() && self.src[self.s] == self.src[cand] {
            self.s += 1;
            cand += 1;
        }
    }

    /// Executes any cleanup when the current block has finished compressing.
    /// In particular, it emits any leftover bytes as a literal.
    #[inline(always)]
    fn done(&mut self) {
        if self.next_emit < self.src.len() {
            let lit_end = self.src.len();
            unsafe {
                // SAFETY: Both next_emit and lit_end are trivially in bounds
                // given the conditional and definition above.
                self.emit_literal(lit_end);
            }
        }
    }

    /// Emits a literal from self.src[self.next_emit..lit_end].
    ///
    /// This uses unaligned loads and elides bounds checks, so the caller must
    /// guarantee that self.src[self.next_emit..lit_end] is valid.
    #[inline(always)]
    unsafe fn emit_literal(&mut self, lit_end: usize) {
        let lit_start = self.next_emit;
        let len = lit_end - lit_start;
        let n = len.checked_sub(1).unwrap();
        if n <= 59 {
            self.dst[self.d] = ((n as u8) << 2) | (Tag::Literal as u8);
            self.d += 1;
            if len <= 16 && lit_start + 16 <= self.src.len() {
                // SAFETY: lit_start is equivalent to self.next_emit, which is
                // only set to self.s immediately following a copy emit. The
                // conditional above also ensures that there is at least 16
                // bytes of room in both src and dst.
                //
                // dst is big enough because the buffer is guaranteed to
                // be big enough to hold biggest possible compressed size plus
                // an extra 32 bytes, which exceeds the 16 byte copy here.
                let srcp = self.src.as_ptr().add(lit_start);
                let dstp = self.dst.as_mut_ptr().add(self.d);
                ptr::copy_nonoverlapping(srcp, dstp, 16);
                self.d += len;
                return;
            }
        } else if n < 256 {
            self.dst[self.d] = (60 << 2) | (Tag::Literal as u8);
            self.dst[self.d + 1] = n as u8;
            self.d += 2;
        } else {
            self.dst[self.d] = (61 << 2) | (Tag::Literal as u8);
            bytes::write_u16_le(n as u16, &mut self.dst[self.d + 1..]);
            self.d += 3;
        }
        // SAFETY: lit_start is equivalent to self.next_emit, which is only
        // set to self.s immediately following a copy, which implies that it
        // always points to valid bytes in self.src.
        //
        // We can't guarantee that there are at least len bytes though, which
        // must be guaranteed by the caller and is why this method is unsafe.
        let srcp = self.src.as_ptr().add(lit_start);
        let dstp = self.dst.as_mut_ptr().add(self.d);
        ptr::copy_nonoverlapping(srcp, dstp, len);
        self.d += len;
    }
}

/// `BlockTable` is a map from 4 byte sequences to positions of their most
/// recent occurrence in a block. In particular, this table lets us quickly
/// find candidates for compression.
///
/// We expose the `hash` method so that callers can be fastidious about the
/// number of times a hash is computed.
struct BlockTable<'a> {
    table: &'a mut [u16],
    /// The number of bits required to shift the hash such that the result
    /// is less than table.len().
    shift: u32,
}

impl Encoder {
    fn block_table(&mut self, block_size: usize) -> BlockTable<'_> {
        let mut shift: u32 = 32 - 8;
        let mut table_size = 256;
        while table_size < MAX_TABLE_SIZE && table_size < block_size {
            shift -= 1;
            table_size *= 2;
        }
        // If our block size is small, then use a small stack allocated table
        // instead of putting a bigger one on the heap. This particular
        // optimization is important if the caller is using Snappy to compress
        // many small blocks. (The memset savings alone is considerable.)
        let table: &mut [u16] = if table_size <= SMALL_TABLE_SIZE {
            &mut self.small[0..table_size]
        } else {
            if self.big.is_empty() {
                // Interestingly, using `self.big.resize` here led to some
                // very weird code getting generated that led to a large
                // slow down. Forcing the issue with a new vec seems to
                // fix it. ---AG
                self.big = vec![0; MAX_TABLE_SIZE];
            }
            &mut self.big[0..table_size]
        };
        for x in &mut *table {
            *x = 0;
        }
        BlockTable { table: table, shift: shift }
    }
}

impl<'a> BlockTable<'a> {
    #[inline(always)]
    fn hash(&self, x: u32) -> usize {
        (x.wrapping_mul(0x1E35A7BD) >> self.shift) as usize
    }
}

impl<'a> Deref for BlockTable<'a> {
    type Target = [u16];

    fn deref(&self) -> &[u16] {
        self.table
    }
}

impl<'a> DerefMut for BlockTable<'a> {
    fn deref_mut(&mut self) -> &mut [u16] {
        self.table
    }
}

snap-1.1.1/src/crc32.rs

use crate::bytes;
use crate::crc32_table::{TABLE, TABLE16};

/// Provides a simple API to generate "masked" CRC32C checksums specifically
/// for use in Snappy. When available, this will make use of SSE 4.2 to
/// compute checksums. Otherwise, it falls back to the only-marginally-slower
/// "slicing by 16" technique.
///
/// The main purpose of this type is to cache the CPU feature check and expose
/// a safe API.
#[derive(Clone, Copy, Debug)]
pub struct CheckSummer {
    sse42: bool,
}

impl CheckSummer {
    /// Create a new checksummer that can compute CRC32C checksums on
    /// arbitrary bytes.
    #[cfg(not(target_arch = "x86_64"))]
    pub fn new() -> CheckSummer {
        CheckSummer { sse42: false }
    }

    /// Create a new checksummer that can compute CRC32C checksums on
    /// arbitrary bytes.
    #[cfg(target_arch = "x86_64")]
    pub fn new() -> CheckSummer {
        CheckSummer { sse42: is_x86_feature_detected!("sse4.2") }
    }

    /// Returns the "masked" CRC32 checksum of `buf` using the Castagnoli
    /// polynomial. This "masked" checksum is defined by the Snappy frame
    /// format. Masking is supposed to make the checksum robust with respect
    /// to the data that contains the checksum itself.
    pub fn crc32c_masked(&self, buf: &[u8]) -> u32 {
        let sum = self.crc32c(buf);
        (sum.wrapping_shr(15) | sum.wrapping_shl(17)).wrapping_add(0xA282EAD8)
    }

    /// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
    #[cfg(not(target_arch = "x86_64"))]
    fn crc32c(&self, buf: &[u8]) -> u32 {
        crc32c_slice16(buf)
    }

    /// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
    #[cfg(target_arch = "x86_64")]
    fn crc32c(&self, buf: &[u8]) -> u32 {
        if self.sse42 {
            // SAFETY: When sse42 is true, we are guaranteed to be running on
            // a CPU that supports SSE 4.2.
            unsafe { crc32c_sse(buf) }
        } else {
            crc32c_slice16(buf)
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.2")]
unsafe fn crc32c_sse(buf: &[u8]) -> u32 {
    use std::arch::x86_64::*;

    let mut crc = !0u32;
    // SAFETY: This is safe since alignment is handled by align_to (oh how I
    // love you) and since 8 adjacent u8's are guaranteed to have the same
    // in-memory representation as u64 for all possible values.
    let (prefix, u64s, suffix) = buf.align_to::<u64>();
    for &b in prefix {
        // SAFETY: Safe since we have sse4.2 enabled.
        crc = _mm_crc32_u8(crc, b);
    }
    for &n in u64s {
        // SAFETY: Safe since we have sse4.2 enabled.
        crc = _mm_crc32_u64(crc as u64, n) as u32;
    }
    for &b in suffix {
        // SAFETY: Safe since we have sse4.2 enabled.
        crc = _mm_crc32_u8(crc, b);
    }
    !crc
}

/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
fn crc32c_slice16(mut buf: &[u8]) -> u32 {
    let mut crc: u32 = !0;
    while buf.len() >= 16 {
        crc ^= bytes::read_u32_le(buf);
        crc = TABLE16[0][buf[15] as usize]
            ^ TABLE16[1][buf[14] as usize]
            ^ TABLE16[2][buf[13] as usize]
            ^ TABLE16[3][buf[12] as usize]
            ^ TABLE16[4][buf[11] as usize]
            ^ TABLE16[5][buf[10] as usize]
            ^ TABLE16[6][buf[9] as usize]
            ^ TABLE16[7][buf[8] as usize]
            ^ TABLE16[8][buf[7] as usize]
            ^ TABLE16[9][buf[6] as usize]
            ^ TABLE16[10][buf[5] as usize]
            ^ TABLE16[11][buf[4] as usize]
            ^ TABLE16[12][(crc >> 24) as u8 as usize]
            ^ TABLE16[13][(crc >> 16) as u8 as usize]
            ^ TABLE16[14][(crc >> 8) as u8 as usize]
            ^ TABLE16[15][(crc) as u8 as usize];
        buf = &buf[16..];
    }
    for &b in buf {
        crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8);
    }
    !crc
}

snap-1.1.1/src/crc32_table.rs

// Generated by build.rs.
include!(concat!(env!("OUT_DIR"), "/crc32_table.rs"));

snap-1.1.1/src/decompress.rs

use std::ptr;

use crate::bytes;
use crate::error::{Error, Result};
use crate::tag;
use crate::MAX_INPUT_SIZE;

/// A lookup table for quickly computing the various attributes derived from
/// a tag byte.
const TAG_LOOKUP_TABLE: TagLookupTable = TagLookupTable(tag::TAG_LOOKUP_TABLE);

/// `WORD_MASK` is a map from the size of an integer in bytes to its
/// corresponding mask on a 32 bit integer. This is used when we need to read
/// an integer and we know there are at least 4 bytes to read from a buffer.
/// In this case, we can read a 32 bit little endian integer and mask out only
/// the bits we need. This in particular saves a branch.
const WORD_MASK: [usize; 5] = [0, 0xFF, 0xFFFF, 0xFFFFFF, 0xFFFFFFFF];

/// Returns the decompressed size (in bytes) of the compressed bytes given.
///
/// `input` must be a sequence of bytes returned by a conforming Snappy
/// compressor.
///
/// # Errors
///
/// This function returns an error in the following circumstances:
///
/// * An invalid Snappy header was seen.
/// * The total space required for decompression exceeds `2^32 - 1`.
pub fn decompress_len(input: &[u8]) -> Result<usize> {
    if input.is_empty() {
        return Ok(0);
    }
    Ok(Header::read(input)?.decompress_len)
}

/// Decoder is a raw decoder for decompressing bytes in the Snappy format.
///
/// This decoder does not use the Snappy frame format and simply decompresses
/// the given bytes as if it were returned from `Encoder`.
///
/// Unless you explicitly need the low-level control, you should use
/// [`read::FrameDecoder`](../read/struct.FrameDecoder.html)
/// instead, which decompresses the Snappy frame format.
#[derive(Clone, Debug, Default)]
pub struct Decoder {
    // Place holder for potential future fields.
    _dummy: (),
}

impl Decoder {
    /// Return a new decoder that can be used for decompressing bytes.
    pub fn new() -> Decoder {
        Decoder { _dummy: () }
    }

    /// Decompresses all bytes in `input` into `output`.
    ///
    /// `input` must be a sequence of bytes returned by a conforming Snappy
    /// compressor.
    ///
    /// The size of `output` must be large enough to hold all decompressed
    /// bytes from the `input`. The size required can be queried with the
    /// `decompress_len` function.
    ///
    /// On success, this returns the number of bytes written to `output`.
    ///
    /// # Errors
    ///
    /// This method returns an error in the following circumstances:
    ///
    /// * Invalid compressed Snappy data was seen.
    /// * The total space required for decompression exceeds `2^32 - 1`.
    /// * `output` has length less than `decompress_len(input)`.
    pub fn decompress(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> Result<usize> {
        if input.is_empty() {
            return Err(Error::Empty);
        }
        let hdr = Header::read(input)?;
        if hdr.decompress_len > output.len() {
            return Err(Error::BufferTooSmall {
                given: output.len() as u64,
                min: hdr.decompress_len as u64,
            });
        }
        let dst = &mut output[..hdr.decompress_len];
        let mut dec =
            Decompress { src: &input[hdr.len..], s: 0, dst: dst, d: 0 };
        dec.decompress()?;
        Ok(dec.dst.len())
    }

    /// Decompresses all bytes in `input` into a freshly allocated `Vec<u8>`.
    ///
    /// This is just like the `decompress` method, except it allocates a `Vec`
    /// with the right size for you. (This is intended to be a convenience
    /// method.)
    ///
    /// This method returns an error under the same circumstances that
    /// `decompress` does.
    pub fn decompress_vec(&mut self, input: &[u8]) -> Result<Vec<u8>> {
        let mut buf = vec![0; decompress_len(input)?];
        let n = self.decompress(input, &mut buf)?;
        buf.truncate(n);
        Ok(buf)
    }
}

/// Decompress is the state of the Snappy decompressor.
struct Decompress<'s, 'd> {
    /// The original compressed bytes not including the header.
    src: &'s [u8],
    /// The current position in the compressed bytes.
    s: usize,
    /// The output buffer to write the decompressed bytes.
    dst: &'d mut [u8],
    /// The current position in the decompressed buffer.
    d: usize,
}

impl<'s, 'd> Decompress<'s, 'd> {
    /// Decompresses snappy compressed bytes in `src` to `dst`.
    ///
    /// This assumes that the header has already been read and that `dst` is
    /// big enough to store all decompressed bytes.
    fn decompress(&mut self) -> Result<()> {
        while self.s < self.src.len() {
            let byte = self.src[self.s];
            self.s += 1;
            if byte & 0b000000_11 == 0 {
                let len = (byte >> 2) as usize + 1;
                self.read_literal(len)?;
            } else {
                self.read_copy(byte)?;
            }
        }
        if self.d != self.dst.len() {
            return Err(Error::HeaderMismatch {
                expected_len: self.dst.len() as u64,
                got_len: self.d as u64,
            });
        }
        Ok(())
    }

    /// Decompresses a literal from `src` starting at `s` to `dst` starting at
    /// `d` and returns the updated values of `s` and `d`. `s` should point to
    /// the byte immediately following the literal tag byte.
    ///
    /// `len` is the length of the literal if it's <=60. Otherwise, it's the
    /// length tag, indicating the number of bytes needed to read a little
    /// endian integer at `src[s..]`. i.e., `61 => 1 byte`, `62 => 2 bytes`,
    /// `63 => 3 bytes` and `64 => 4 bytes`.
    ///
    /// `len` must be <=64.
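    ///
    /// For example (a sketch of the tag encoding, not taken from the test
    /// suite): a literal tag byte of `0xF4` (`0b1111_0100`) yields
    /// `len = (0xF4 >> 2) + 1 = 62`, i.e., "read a 2 byte little endian
    /// length from `src[s..]`".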
i.e., `61 => 1 byte`, `62 => 2 bytes`, /// `63 => 3 bytes` and `64 => 4 bytes`. /// /// `len` must be <=64. #[inline(always)] fn read_literal(&mut self, len: usize) -> Result<()> { debug_assert!(len <= 64); let mut len = len as u64; // As an optimization for the common case, if the literal length is // <=16 and we have enough room in both `src` and `dst`, copy the // literal using unaligned loads and stores. // // We pick 16 bytes with the hope that it optimizes down to a 128 bit // load/store. if len <= 16 && self.s + 16 <= self.src.len() && self.d + 16 <= self.dst.len() { unsafe { // SAFETY: We know both src and dst have at least 16 bytes of // wiggle room after s/d, even if `len` is <16, so the copy is // safe. let srcp = self.src.as_ptr().add(self.s); let dstp = self.dst.as_mut_ptr().add(self.d); // Hopefully uses SIMD registers for 128 bit load/store. ptr::copy_nonoverlapping(srcp, dstp, 16); } self.d += len as usize; self.s += len as usize; return Ok(()); } // When the length is bigger than 60, it indicates that we need to read // an additional 1-4 bytes to get the real length of the literal. if len >= 61 { // If there aren't at least 4 bytes left to read then we know this // is corrupt because the literal must have length >=61. if self.s as u64 + 4 > self.src.len() as u64 { return Err(Error::Literal { len: 4, src_len: (self.src.len() - self.s) as u64, dst_len: (self.dst.len() - self.d) as u64, }); } // Since we know there are 4 bytes left to read, read a 32 bit LE // integer and mask away the bits we don't need. let byte_count = len as usize - 60; len = bytes::read_u32_le(&self.src[self.s..]) as u64; len = (len & (WORD_MASK[byte_count] as u64)) + 1; self.s += byte_count; } // If there's not enough buffer left to load or store this literal, // then the input is corrupt. // if self.s + len > self.src.len() || self.d + len > self.dst.len() { if ((self.src.len() - self.s) as u64) < len || ((self.dst.len() - self.d) as u64) < len { return Err(Error::Literal { len: len, src_len: (self.src.len() - self.s) as u64, dst_len: (self.dst.len() - self.d) as u64, }); } unsafe { // SAFETY: We've already checked the bounds, so we know this copy // is correct. let srcp = self.src.as_ptr().add(self.s); let dstp = self.dst.as_mut_ptr().add(self.d); ptr::copy_nonoverlapping(srcp, dstp, len as usize); } self.s += len as usize; self.d += len as usize; Ok(()) } /// Reads a copy from `src` and writes the decompressed bytes to `dst`. `s` /// should point to the byte immediately proceding the copy tag byte. #[inline(always)] fn read_copy(&mut self, tag_byte: u8) -> Result<()> { // Find the copy offset and len, then advance the input past the copy. // The rest of this function deals with reading/writing to output only. let entry = TAG_LOOKUP_TABLE.entry(tag_byte); let offset = entry.offset(self.src, self.s)?; let len = entry.len(); self.s += entry.num_tag_bytes(); // What we really care about here is whether `d == 0` or `d < offset`. // To save an extra branch, use `d < offset - 1` instead. If `d` is // `0`, then `offset.wrapping_sub(1)` will be usize::MAX which is also // the max value of `d`. if self.d <= offset.wrapping_sub(1) { return Err(Error::Offset { offset: offset as u64, dst_pos: self.d as u64, }); } // When all is said and done, dst is advanced to end. let end = self.d + len; // When the copy is small and the offset is at least 8 bytes away from // `d`, then we can decompress the copy with two 64 bit unaligned // loads/stores. 
    /// Reads a copy from `src` and writes the decompressed bytes to `dst`.
    /// `s` should point to the byte immediately following the copy tag byte.
    #[inline(always)]
    fn read_copy(&mut self, tag_byte: u8) -> Result<()> {
        // Find the copy offset and len, then advance the input past the
        // copy. The rest of this function deals with reading/writing to
        // output only.
        let entry = TAG_LOOKUP_TABLE.entry(tag_byte);
        let offset = entry.offset(self.src, self.s)?;
        let len = entry.len();
        self.s += entry.num_tag_bytes();

        // What we really care about here is whether `offset == 0` or
        // `d < offset`. To save an extra branch, use `d <= offset - 1`
        // instead. If `offset` is `0`, then `offset.wrapping_sub(1)` will be
        // usize::MAX, which is also the max value of `d`.
        if self.d <= offset.wrapping_sub(1) {
            return Err(Error::Offset {
                offset: offset as u64,
                dst_pos: self.d as u64,
            });
        }
        // When all is said and done, dst is advanced to end.
        let end = self.d + len;
        // When the copy is small and the offset is at least 8 bytes away
        // from `d`, then we can decompress the copy with two 64 bit
        // unaligned loads/stores.
        if offset >= 8 && len <= 16 && self.d + 16 <= self.dst.len() {
            unsafe {
                // SAFETY: We know dstp points to at least 16 bytes of memory
                // from the condition above, and we also know that dstp is
                // preceded by at least `offset` bytes from the offset check
                // above.
                //
                // We also know that srcp and dstp do not overlap within
                // either 8 byte copy, since `offset >= 8`, justifying the
                // use of copy_nonoverlapping.
                let dstp = self.dst.as_mut_ptr().add(self.d);
                let srcp = dstp.sub(offset);
                // We can't do a single 16 byte load/store because src/dst
                // may overlap with each other. Namely, the second copy here
                // may copy bytes written in the first copy!
                ptr::copy_nonoverlapping(srcp, dstp, 8);
                ptr::copy_nonoverlapping(srcp.add(8), dstp.add(8), 8);
            }
        // If we have some wiggle room, try to decompress the copy 16 bytes
        // at a time with 128 bit unaligned loads/stores. Remember, we can't
        // just do a memcpy because decompressing copies may require copying
        // overlapping memory.
        //
        // We need the extra wiggle room to make effective use of 128 bit
        // loads/stores. Even if the store ends up copying more data than we
        // need, we're careful to advance `d` by the correct amount at the
        // end.
        } else if end + 24 <= self.dst.len() {
            unsafe {
                // SAFETY: We know that dstp is preceded by at least `offset`
                // bytes from the offset check above.
                //
                // We don't know whether dstp overlaps with srcp, so we start
                // by copying from srcp to dstp until they no longer overlap.
                // The worst case is when dstp - srcp = 3 and the copy length
                // is 1. The first loop will issue these copy operations
                // before stopping:
                //
                //   [-3, 12] -> [0, 15]
                //   [-3, 12] -> [3, 18]
                //   [-3, 12] -> [9, 24]
                //
                // But the copy had length 1, so it was only supposed to
                // write to [0, 0]. The last copy wrote to [9, 24], which is
                // 24 extra bytes in dst *beyond* the end of the copy, which
                // is guaranteed by the conditional above.

                // Save the destination length here to avoid a reborrow UB
                // violation under the Tree Borrows model.
                let dest_len = self.dst.len();
                let mut dstp = self.dst.as_mut_ptr().add(self.d);
                let mut srcp = dstp.sub(offset);
                loop {
                    debug_assert!(dstp >= srcp);
                    let diff = (dstp as usize) - (srcp as usize);
                    if diff >= 16 {
                        break;
                    }
                    // srcp and dstp can overlap, so use ptr::copy.
                    debug_assert!(self.d + 16 <= dest_len);
                    ptr::copy(srcp, dstp, 16);
                    self.d += diff as usize;
                    dstp = dstp.add(diff);
                }
                while self.d < end {
                    ptr::copy_nonoverlapping(srcp, dstp, 16);
                    srcp = srcp.add(16);
                    dstp = dstp.add(16);
                    self.d += 16;
                }
                // At this point, `d` is likely wrong. We correct it before
                // returning. Its correct value is `end`.
            }
        } else {
            if end > self.dst.len() {
                return Err(Error::CopyWrite {
                    len: len as u64,
                    dst_len: (self.dst.len() - self.d) as u64,
                });
            }
            // Finally, the slow byte-by-byte case, which should only be used
            // for the last few bytes of decompression.
            while self.d != end {
                self.dst[self.d] = self.dst[self.d - offset];
                self.d += 1;
            }
        }
        self.d = end;
        Ok(())
    }
}
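// A worked example of the copy tags handled above (illustrative, per the
// Snappy format description): the tag byte 0b0000_1001 has low bits 01 (a
// copy with a 1 byte trailer), so its length is
// `4 + ((tag >> 2) & 0b111) = 6`, the high three bits of the tag supply bits
// 8-10 of the offset, and the byte following the tag supplies the low eight
// bits.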
/// Header represents the single varint that starts every Snappy compressed
/// block.
#[derive(Debug)]
struct Header {
    /// The length of the header in bytes (i.e., the varint).
    len: usize,
    /// The length of the original decompressed input in bytes.
    decompress_len: usize,
}

impl Header {
    /// Reads the varint header from the given input.
    ///
    /// If there was a problem reading the header then an error is returned.
    /// If a header is returned then it is guaranteed to be valid.
    #[inline(always)]
    fn read(input: &[u8]) -> Result<Header> {
        let (decompress_len, header_len) = bytes::read_varu64(input);
        if header_len == 0 {
            return Err(Error::Header);
        }
        if decompress_len > MAX_INPUT_SIZE {
            return Err(Error::TooBig {
                given: decompress_len as u64,
                max: MAX_INPUT_SIZE,
            });
        }
        Ok(Header { len: header_len, decompress_len: decompress_len as usize })
    }
}
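// A worked example of the varint header parsed above (illustrative): a block
// that begins with the bytes `0xFE 0x03` has a 2 byte header, since 0xFE has
// its continuation bit set, and promises `0x7E | (0x03 << 7) = 510`
// decompressed bytes.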
/// A lookup table for quickly computing the various attributes derived from
/// a tag byte. The attributes are most useful for the three "copy" tags
/// and include the length of the copy, part of the offset (for copy 1-byte
/// only) and the total number of bytes following the tag byte that encode
/// the other part of the offset (1 for copy 1, 2 for copy 2 and 4 for copy
/// 4).
///
/// More specifically, the keys of the table are u8s and the values are u16s.
/// The bits of the values are laid out as follows:
///
/// xxaa abbb xxcc cccc
///
/// Where `a` is the number of bytes, `b` are the three bits of the offset
/// for copy 1 (the other 8 bits are in the byte following the tag byte; for
/// copy 2 and copy 4, `b = 0`), and `c` is the length of the copy (max of
/// 64).
///
/// We could pack this in fewer bits, but the position of the three `b` bits
/// lines up with the most significant three bits in the total offset for
/// copy 1, which avoids an extra shift instruction.
///
/// In sum, this table is useful because it reduces branches and various
/// arithmetic operations.
struct TagLookupTable([u16; 256]);

impl TagLookupTable {
    /// Look up the tag entry given the tag `byte`.
    #[inline(always)]
    fn entry(&self, byte: u8) -> TagEntry {
        TagEntry(self.0[byte as usize] as usize)
    }
}

/// Represents a single entry in the tag lookup table.
///
/// See the documentation in `TagLookupTable` for the bit layout.
///
/// The type is a `usize` for convenience.
struct TagEntry(usize);

impl TagEntry {
    /// Return the total number of bytes following this tag byte required to
    /// encode the offset.
    fn num_tag_bytes(&self) -> usize {
        self.0 >> 11
    }

    /// Return the total copy length, capped at 255.
    fn len(&self) -> usize {
        self.0 & 0xFF
    }

    /// Return the copy offset corresponding to this copy operation. `s`
    /// should point to the position just after the tag byte that this entry
    /// was read from.
    ///
    /// This requires reading from the compressed input since the offset is
    /// encoded in bytes following the tag byte.
    fn offset(&self, src: &[u8], s: usize) -> Result<usize> {
        let num_tag_bytes = self.num_tag_bytes();
        let trailer =
            // It is critical for this case to come first, since it is the
            // fast path. We really hope that this case gets branch
            // predicted.
            if s + 4 <= src.len() {
                unsafe {
                    // SAFETY: The conditional above guarantees that
                    // src[s..s+4] is valid to read from.
                    let p = src.as_ptr().add(s);
                    // We use WORD_MASK here to mask out the bits we don't
                    // need. While we're guaranteed to read 4 valid bytes,
                    // not all of those bytes are necessarily part of the
                    // offset. This is the key optimization: we don't need
                    // to branch on num_tag_bytes.
                    bytes::loadu_u32_le(p) as usize & WORD_MASK[num_tag_bytes]
                }
            } else if num_tag_bytes == 1 {
                if s >= src.len() {
                    return Err(Error::CopyRead {
                        len: 1,
                        src_len: (src.len() - s) as u64,
                    });
                }
                src[s] as usize
            } else if num_tag_bytes == 2 {
                if s + 1 >= src.len() {
                    return Err(Error::CopyRead {
                        len: 2,
                        src_len: (src.len() - s) as u64,
                    });
                }
                bytes::read_u16_le(&src[s..]) as usize
            } else {
                return Err(Error::CopyRead {
                    len: num_tag_bytes as u64,
                    src_len: (src.len() - s) as u64,
                });
            };
        Ok((self.0 & 0b0000_0111_0000_0000) | trailer)
    }
}
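// For example, a copy-2 tag has `num_tag_bytes() == 2`, so the fast path in
// `offset` above masks the four bytes it loaded with `WORD_MASK[2]`, keeping
// exactly the two trailer bytes: trailer bytes `0x34 0x12` give an offset of
// `0x1234`. (Illustrative note: WORD_MASK is assumed to be the usual
// `[0, 0xFF, 0xFFFF, 0xFFFFFF, 0xFFFFFFFF]` table defined earlier in this
// file, outside this excerpt.)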
snap-1.1.1/src/error.rs000064400000000000000000000307301046102023000130560ustar 00000000000000
use std::fmt;
use std::io;
use std::result;

/// A convenient type alias for `Result<T, snap::Error>`.
pub type Result<T> = result::Result<T, Error>;

/// `IntoInnerError` occurs when consuming an encoder fails.
///
/// Consuming the encoder causes a flush to happen. If the flush fails, then
/// this error is returned, which contains both the original encoder and the
/// error that occurred.
///
/// The type parameter `W` is the unconsumed writer.
pub struct IntoInnerError<W> {
    wtr: W,
    err: io::Error,
}

impl<W> IntoInnerError<W> {
    pub(crate) fn new(wtr: W, err: io::Error) -> IntoInnerError<W> {
        IntoInnerError { wtr, err }
    }

    /// Returns the error which caused the call to `into_inner` to fail.
    ///
    /// This error was returned when attempting to flush the internal buffer.
    pub fn error(&self) -> &io::Error {
        &self.err
    }

    /// Returns the error which caused the call to `into_inner` to fail.
    ///
    /// This error was returned when attempting to flush the internal buffer.
    pub fn into_error(self) -> io::Error {
        self.err
    }

    /// Returns the underlying writer which generated the error.
    ///
    /// The returned value can be used for error recovery, such as
    /// re-inspecting the buffer.
    pub fn into_inner(self) -> W {
        self.wtr
    }
}

impl<W> std::error::Error for IntoInnerError<W> {}

impl<W> fmt::Display for IntoInnerError<W> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.err.fmt(f)
    }
}

impl<W> fmt::Debug for IntoInnerError<W> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.err.fmt(f)
    }
}

/// Error describes all the possible errors that may occur during Snappy
/// compression or decompression.
///
/// Note that it's unlikely that you'll need to care about the specific error
/// reported since all of them indicate corrupt Snappy data or a limitation
/// that cannot be worked around. Therefore,
/// `From<Error> for std::io::Error` is provided so that any Snappy errors
/// will be converted to a `std::io::Error` automatically when using `?`.
#[derive(Clone, Debug)]
pub enum Error {
    /// This error occurs when the given input is too big. This can happen
    /// during compression or decompression.
    TooBig {
        /// The size of the given input.
        given: u64,
        /// The maximum allowed size of an input buffer.
        max: u64,
    },
    /// This error occurs when the given buffer is too small to contain the
    /// maximum possible compressed bytes or the total number of decompressed
    /// bytes.
    BufferTooSmall {
        /// The size of the given output buffer.
        given: u64,
        /// The minimum size of the output buffer.
        min: u64,
    },
    /// This error occurs when trying to decompress a zero length buffer.
    Empty,
    /// This error occurs when an invalid header is found during
    /// decompression.
    Header,
    /// This error occurs when there is a mismatch between the number of
    /// decompressed bytes reported in the header and the number of
    /// actual decompressed bytes. In this error case, the number of actual
    /// decompressed bytes is always less than the number reported in the
    /// header.
    HeaderMismatch {
        /// The total number of decompressed bytes expected (i.e., the
        /// header value).
        expected_len: u64,
        /// The total number of actual decompressed bytes.
        got_len: u64,
    },
    /// This error occurs during decompression when there was a problem
    /// reading a literal.
    Literal {
        /// The expected length of the literal.
        len: u64,
        /// The number of remaining bytes in the compressed bytes.
        src_len: u64,
        /// The number of remaining slots in the decompression buffer.
        dst_len: u64,
    },
    /// This error occurs during decompression when there was a problem
    /// reading a copy.
    CopyRead {
        /// The expected length of the copy (as encoded in the compressed
        /// bytes).
        len: u64,
        /// The number of remaining bytes in the compressed bytes.
        src_len: u64,
    },
    /// This error occurs during decompression when there was a problem
    /// writing a copy to the decompression buffer.
    CopyWrite {
        /// The length of the copy (i.e., the total number of bytes to be
        /// produced by this copy in the decompression buffer).
        len: u64,
        /// The number of remaining bytes in the decompression buffer.
        dst_len: u64,
    },
    /// This error occurs during decompression when an invalid copy offset
    /// is found. An offset is invalid if it is zero or if it is out of
    /// bounds.
    Offset {
        /// The offset that was read.
        offset: u64,
        /// The current position in the decompression buffer. If the offset
        /// is non-zero, then the offset must be greater than this position.
        dst_pos: u64,
    },
    /// This error occurs when a stream header chunk type was expected but
    /// got a different chunk type.
    /// This error only occurs when reading a Snappy frame formatted stream.
    StreamHeader {
        /// The chunk type byte that was read.
        byte: u8,
    },
    /// This error occurs when the magic stream header bytes do not match
    /// what is expected.
    /// This error only occurs when reading a Snappy frame formatted stream.
    StreamHeaderMismatch {
        /// The bytes that were read.
        bytes: Vec<u8>,
    },
    /// This error occurs when an unsupported chunk type is seen.
    /// This error only occurs when reading a Snappy frame formatted stream.
    UnsupportedChunkType {
        /// The chunk type byte that was read.
        byte: u8,
    },
    /// This error occurs when trying to read a chunk with an unexpected or
    /// incorrect length when reading a Snappy frame formatted stream.
    /// This error only occurs when reading a Snappy frame formatted stream.
    UnsupportedChunkLength {
        /// The length of the chunk encountered.
        len: u64,
        /// True when this error occurred while reading the stream header.
        header: bool,
    },
    /// This error occurs when a checksum validity check fails.
    /// This error only occurs when reading a Snappy frame formatted stream.
    Checksum {
        /// The expected checksum read from the stream.
        expected: u32,
        /// The computed checksum.
        got: u32,
    },
}
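// The `From` impl below means `?` converts a Snappy error into a
// `std::io::Error` at any io boundary. A minimal sketch (illustrative only,
// not part of the upstream crate):
//
//     fn decompressed_size(block: &[u8]) -> std::io::Result<u64> {
//         // `decompress_len` returns `snap::Result<usize>`; `?` converts
//         // the error via `From<Error> for io::Error` automatically.
//         Ok(crate::decompress::decompress_len(block)? as u64)
//     }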
impl From<Error> for io::Error {
    fn from(err: Error) -> io::Error {
        io::Error::new(io::ErrorKind::Other, err)
    }
}

impl Eq for Error {}

impl PartialEq for Error {
    fn eq(&self, other: &Error) -> bool {
        use self::Error::*;
        match (self, other) {
            (
                &TooBig { given: given1, max: max1 },
                &TooBig { given: given2, max: max2 },
            ) => (given1, max1) == (given2, max2),
            (
                &BufferTooSmall { given: given1, min: min1 },
                &BufferTooSmall { given: given2, min: min2 },
            ) => (given1, min1) == (given2, min2),
            (&Empty, &Empty) | (&Header, &Header) => true,
            (
                &HeaderMismatch { expected_len: elen1, got_len: glen1 },
                &HeaderMismatch { expected_len: elen2, got_len: glen2 },
            ) => (elen1, glen1) == (elen2, glen2),
            (
                &Literal { len: len1, src_len: src_len1, dst_len: dst_len1 },
                &Literal { len: len2, src_len: src_len2, dst_len: dst_len2 },
            ) => (len1, src_len1, dst_len1) == (len2, src_len2, dst_len2),
            (
                &CopyRead { len: len1, src_len: src_len1 },
                &CopyRead { len: len2, src_len: src_len2 },
            ) => (len1, src_len1) == (len2, src_len2),
            (
                &CopyWrite { len: len1, dst_len: dst_len1 },
                &CopyWrite { len: len2, dst_len: dst_len2 },
            ) => (len1, dst_len1) == (len2, dst_len2),
            (
                &Offset { offset: offset1, dst_pos: dst_pos1 },
                &Offset { offset: offset2, dst_pos: dst_pos2 },
            ) => (offset1, dst_pos1) == (offset2, dst_pos2),
            (&StreamHeader { byte: byte1 }, &StreamHeader { byte: byte2 }) => {
                byte1 == byte2
            }
            (
                &StreamHeaderMismatch { bytes: ref bytes1 },
                &StreamHeaderMismatch { bytes: ref bytes2 },
            ) => bytes1 == bytes2,
            (
                &UnsupportedChunkType { byte: byte1 },
                &UnsupportedChunkType { byte: byte2 },
            ) => byte1 == byte2,
            (
                &UnsupportedChunkLength { len: len1, header: header1 },
                &UnsupportedChunkLength { len: len2, header: header2 },
            ) => (len1, header1) == (len2, header2),
            (
                &Checksum { expected: e1, got: g1 },
                &Checksum { expected: e2, got: g2 },
            ) => (e1, g1) == (e2, g2),
            _ => false,
        }
    }
}
impl std::error::Error for Error {}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            Error::TooBig { given, max } => write!(
                f,
                "snappy: input buffer (size = {}) is larger than \
                 allowed (size = {})",
                given, max
            ),
            Error::BufferTooSmall { given, min } => write!(
                f,
                "snappy: output buffer (size = {}) is smaller than \
                 required (size = {})",
                given, min
            ),
            Error::Empty => write!(f, "snappy: corrupt input (empty)"),
            Error::Header => {
                write!(f, "snappy: corrupt input (invalid header)")
            }
            Error::HeaderMismatch { expected_len, got_len } => write!(
                f,
                "snappy: corrupt input (header mismatch; expected \
                 {} decompressed bytes but got {})",
                expected_len, got_len
            ),
            Error::Literal { len, src_len, dst_len } => write!(
                f,
                "snappy: corrupt input (expected literal read of \
                 length {}; remaining src: {}; remaining dst: {})",
                len, src_len, dst_len
            ),
            Error::CopyRead { len, src_len } => write!(
                f,
                "snappy: corrupt input (expected copy read of \
                 length {}; remaining src: {})",
                len, src_len
            ),
            Error::CopyWrite { len, dst_len } => write!(
                f,
                "snappy: corrupt input (expected copy write of \
                 length {}; remaining dst: {})",
                len, dst_len
            ),
            Error::Offset { offset, dst_pos } => write!(
                f,
                "snappy: corrupt input (expected valid offset but \
                 got offset {}; dst position: {})",
                offset, dst_pos
            ),
            Error::StreamHeader { byte } => write!(
                f,
                "snappy: corrupt input (expected stream header but \
                 got unexpected chunk type byte {})",
                byte
            ),
            Error::StreamHeaderMismatch { ref bytes } => write!(
                f,
                "snappy: corrupt input (expected sNaPpY stream \
                 header but got {})",
                escape(&**bytes)
            ),
            Error::UnsupportedChunkType { byte } => write!(
                f,
                "snappy: corrupt input (unsupported chunk type: {})",
                byte
            ),
            Error::UnsupportedChunkLength { len, header: false } => write!(
                f,
                "snappy: corrupt input \
                 (unsupported chunk length: {})",
                len
            ),
            Error::UnsupportedChunkLength { len, header: true } => write!(
                f,
                "snappy: corrupt input \
                 (invalid stream header length: {})",
                len
            ),
            Error::Checksum { expected, got } => write!(
                f,
                "snappy: corrupt input (bad checksum; \
                 expected: {}, got: {})",
                expected, got
            ),
        }
    }
}

fn escape(bytes: &[u8]) -> String {
    use std::ascii::escape_default;
    bytes.iter().flat_map(|&b| escape_default(b)).map(|b| b as char).collect()
}
snap-1.1.1/src/frame.rs000064400000000000000000000076061046102023000130240ustar 00000000000000
use crate::bytes;
use crate::compress::{max_compress_len, Encoder};
use crate::crc32::CheckSummer;
use crate::error::Error;
use crate::MAX_BLOCK_SIZE;

/// The maximum chunk of compressed bytes that can be processed at one time.
///
/// This is computed via `max_compress_len(MAX_BLOCK_SIZE)`.
///
/// TODO(ag): Replace with const fn once they support nominal branching.
pub const MAX_COMPRESS_BLOCK_SIZE: usize = 76490;

/// The special magic string that starts any stream.
///
/// This may appear more than once in a stream in order to support easy
/// concatenation of files compressed in the Snappy frame format.
pub const STREAM_IDENTIFIER: &'static [u8] = b"\xFF\x06\x00\x00sNaPpY";

/// The body of the special stream identifier.
pub const STREAM_BODY: &'static [u8] = b"sNaPpY";

/// The length of a snappy chunk type (1 byte), packet length (3 bytes)
/// and CRC field (4 bytes). This is technically the chunk header _plus_
/// the CRC present in most chunks.
pub const CHUNK_HEADER_AND_CRC_SIZE: usize = 8;
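// Note that STREAM_IDENTIFIER is itself a well-formed chunk: a chunk type of
// 0xFF, a little endian length of 6 (`\x06\x00\x00`) and the 6 byte body
// "sNaPpY". (Illustrative note; see the framing format document linked from
// the read and write modules.)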
/// An enumeration describing each of the 4 main chunk types.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ChunkType {
    Stream = 0xFF,
    Compressed = 0x00,
    Uncompressed = 0x01,
    Padding = 0xFE,
}

impl ChunkType {
    /// Converts a byte to one of the four defined chunk types represented
    /// by a single byte. If the chunk type is reserved, then it is returned
    /// as an Err.
    pub fn from_u8(b: u8) -> Result<ChunkType, u8> {
        match b {
            0xFF => Ok(ChunkType::Stream),
            0x00 => Ok(ChunkType::Compressed),
            0x01 => Ok(ChunkType::Uncompressed),
            0xFE => Ok(ChunkType::Padding),
            b => Err(b),
        }
    }
}

/// Compress a single frame (or decide to pass it through uncompressed). This
/// will output a frame header in `dst_chunk_header`, and it will return a
/// slice pointing to the data to use in the frame. The `dst_chunk_header`
/// array must always have a size of 8 bytes.
///
/// If `always_use_dst` is set to false, the return value may point into
/// either `src` (for data we couldn't compress) or into `dst` (for data we
/// could compress). If `always_use_dst` is true, the data will always be in
/// `dst`. This is a bit weird, but because of Rust's ownership rules, it's
/// easiest for a single function to always be in charge of writing to `dst`.
pub fn compress_frame<'a>(
    enc: &mut Encoder,
    checksummer: CheckSummer,
    src: &'a [u8],
    dst_chunk_header: &mut [u8],
    dst: &'a mut [u8],
    always_use_dst: bool,
) -> Result<&'a [u8], Error> {
    // This is a purely internal function, with a bunch of preconditions.
    assert!(src.len() <= MAX_BLOCK_SIZE);
    assert!(dst.len() >= max_compress_len(MAX_BLOCK_SIZE));
    assert_eq!(dst_chunk_header.len(), CHUNK_HEADER_AND_CRC_SIZE);

    // Build a checksum of our _uncompressed_ data.
    let checksum = checksummer.crc32c_masked(src);

    // Compress the buffer. If compression sucked, throw it out and
    // write uncompressed bytes instead. Since our buffer is at most
    // MAX_BLOCK_SIZE and our dst buffer has size
    // max_compress_len(MAX_BLOCK_SIZE), we have enough space.
    let compress_len = enc.compress(src, dst)?;
    let (chunk_type, chunk_len) =
        // We add 4 to the chunk_len because of the checksum.
        if compress_len >= src.len() - (src.len() / 8) {
            (ChunkType::Uncompressed, 4 + src.len())
        } else {
            (ChunkType::Compressed, 4 + compress_len)
        };

    dst_chunk_header[0] = chunk_type as u8;
    bytes::write_u24_le(chunk_len as u32, &mut dst_chunk_header[1..]);
    bytes::write_u32_le(checksum, &mut dst_chunk_header[4..]);

    // Return the data to put in our frame.
    if chunk_type == ChunkType::Compressed {
        Ok(&dst[0..compress_len])
    } else if always_use_dst {
        dst[..src.len()].copy_from_slice(src);
        Ok(&dst[..src.len()])
    } else {
        Ok(src)
    }
}
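// The resulting frame layout for a block of `n` payload bytes is therefore:
//
//     [type: 1 byte][length = n + 4: 3 bytes LE][masked CRC32C: 4 bytes LE]
//     [payload: n bytes]
//
// where the checksum always covers the *uncompressed* data. (Illustrative
// summary of the code above.)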
snap-1.1.1/src/lib.rs000064400000000000000000000063771046102023000125030ustar 00000000000000
/*!
This crate provides an implementation of the
[Snappy compression format](https://github.com/google/snappy/blob/master/format_description.txt),
as well as the
[framing format](https://github.com/google/snappy/blob/master/framing_format.txt).
The goal of Snappy is to provide reasonable compression at high speed. On a
modern CPU, Snappy can compress data at about 300 MB/sec or more and can
decompress data at about 800 MB/sec or more.

# Install

To use this crate with [Cargo](https://doc.rust-lang.org/cargo/), simply add
it as a dependency to your `Cargo.toml`:

```ignore
[dependencies]
snap = "1"
```

# Overview

This crate provides two ways to use Snappy. The first way is through the
[`read::FrameDecoder`](read/struct.FrameDecoder.html)
and
[`write::FrameEncoder`](write/struct.FrameEncoder.html)
types, which implement the `std::io::Read` and `std::io::Write` traits with
the Snappy frame format. Unless you have a specific reason to the contrary,
you should only use the Snappy frame format. Specifically, the Snappy frame
format permits streaming compression or decompression.

The second way is through the
[`raw::Decoder`](raw/struct.Decoder.html)
and
[`raw::Encoder`](raw/struct.Encoder.html)
types. These types provide lower level control to the raw Snappy format, and
don't support a streaming interface directly. You should only use these types
if you know you specifically need the Snappy raw format.

Finally, the `Error` type in this crate provides an exhaustive list of error
conditions that are probably useless in most circumstances. Therefore,
`From<Error> for io::Error` is implemented in this crate, which will let you
automatically convert a Snappy error to an `std::io::Error` (when using `?`)
with an appropriate error message to display to an end user.

# Example: compress data on `stdin`

This program reads data from `stdin`, compresses it and emits it to `stdout`.
This example can be found in `examples/compress.rs`:

```no_run
use std::io;

fn main() {
    let stdin = io::stdin();
    let stdout = io::stdout();

    let mut rdr = stdin.lock();
    // Wrap the stdout writer in a Snappy writer.
    let mut wtr = snap::write::FrameEncoder::new(stdout.lock());
    io::copy(&mut rdr, &mut wtr).expect("I/O operation failed");
}
```

# Example: decompress data on `stdin`

This program reads data from `stdin`, decompresses it and emits it to
`stdout`. This example can be found in `examples/decompress.rs`:

```no_run
use std::io;

fn main() {
    let stdin = io::stdin();
    let stdout = io::stdout();

    // Wrap the stdin reader in a Snappy reader.
    let mut rdr = snap::read::FrameDecoder::new(stdin.lock());
    let mut wtr = stdout.lock();
    io::copy(&mut rdr, &mut wtr).expect("I/O operation failed");
}
```
*/

#![deny(missing_docs)]

#[cfg(test)]
doc_comment::doctest!("../README.md");

pub use crate::error::{Error, Result};

/// We don't permit compressing a block bigger than what can fit in a u32.
const MAX_INPUT_SIZE: u64 = std::u32::MAX as u64;

/// The maximum number of bytes that we process at once. A block is the unit
/// at which we scan for candidates for compression.
const MAX_BLOCK_SIZE: usize = 1 << 16;

mod bytes;
mod compress;
mod crc32;
mod crc32_table;
mod decompress;
mod error;
mod frame;
pub mod raw;
pub mod read;
mod tag;
pub mod write;
snap-1.1.1/src/raw.rs000064400000000000000000000013031046102023000125140ustar 00000000000000
/*!
This module provides a raw Snappy encoder and decoder.

A raw Snappy encoder/decoder can only compress/decompress a fixed amount of
data at a time. For this reason, this module is lower level and more
difficult to use than the higher level streaming readers and writers exposed
as part of the [`read`](../read/index.html) and [`write`](../write/index.html)
modules.

Generally, one only needs to use the raw format if some other source is
generating raw Snappy compressed data and you have no choice but to do the
same. Otherwise, the Snappy frame format should probably always be preferred.
*/

pub use crate::compress::{max_compress_len, Encoder};
pub use crate::decompress::{decompress_len, Decoder};
snap-1.1.1/src/read.rs000064400000000000000000000421111046102023000126440ustar 00000000000000
/*!
This module provides two `std::io::Read` implementations:

* [`read::FrameDecoder`](struct.FrameDecoder.html)
  wraps another `std::io::Read` implementation, and decompresses data encoded
  using the Snappy frame format. Use this if you have a compressed data
  source and wish to read it as uncompressed data.
* [`read::FrameEncoder`](struct.FrameEncoder.html)
  wraps another `std::io::Read` implementation, and compresses its data using
  the Snappy frame format. Use this if you have an uncompressed data source
  and wish to read it as compressed data.

Typically, `read::FrameDecoder` is the version that you'll want.
*/

use std::cmp;
use std::fmt;
use std::io;

use crate::bytes;
use crate::compress::Encoder;
use crate::crc32::CheckSummer;
use crate::decompress::{decompress_len, Decoder};
use crate::error::Error;
use crate::frame::{
    compress_frame, ChunkType, CHUNK_HEADER_AND_CRC_SIZE,
    MAX_COMPRESS_BLOCK_SIZE, STREAM_BODY, STREAM_IDENTIFIER,
};
use crate::MAX_BLOCK_SIZE;

/// The maximum size of a compressed block, including the header and stream
/// identifier, that can be emitted by FrameEncoder.
const MAX_READ_FRAME_ENCODER_BLOCK_SIZE: usize = STREAM_IDENTIFIER.len()
    + CHUNK_HEADER_AND_CRC_SIZE
    + MAX_COMPRESS_BLOCK_SIZE;

/// A reader for decompressing a Snappy stream.
///
/// This `FrameDecoder` wraps any other reader that implements
/// `std::io::Read`. Bytes read from this reader are decompressed using the
/// [Snappy frame format](https://github.com/google/snappy/blob/master/framing_format.txt)
/// (file extension `sz`, MIME type `application/x-snappy-framed`).
///
/// This reader can potentially make many small reads from the underlying
/// stream depending on its format, therefore, passing in a buffered reader
/// may be beneficial.
pub struct FrameDecoder<R> {
    /// The underlying reader.
    r: R,
    /// A Snappy decoder that we reuse that does the actual block based
    /// decompression.
    dec: Decoder,
    /// A CRC32 checksummer that is configured to either use the portable
    /// fallback version or the SSE4.2 accelerated version when the right
    /// CPU features are available.
    checksummer: CheckSummer,
    /// The compressed bytes buffer, taken from the underlying reader.
    src: Vec<u8>,
    /// The decompressed bytes buffer. Bytes are decompressed from src to
    /// dst before being passed back to the caller.
    dst: Vec<u8>,
    /// Index into dst: starting point of bytes not yet given back to
    /// caller.
    dsts: usize,
    /// Index into dst: ending point of bytes not yet given back to caller.
    dste: usize,
    /// Whether we've read the special stream header or not.
    read_stream_ident: bool,
}

impl<R: io::Read> FrameDecoder<R> {
    /// Create a new reader for streaming Snappy decompression.
    pub fn new(rdr: R) -> FrameDecoder<R> {
        FrameDecoder {
            r: rdr,
            dec: Decoder::new(),
            checksummer: CheckSummer::new(),
            src: vec![0; MAX_COMPRESS_BLOCK_SIZE],
            dst: vec![0; MAX_BLOCK_SIZE],
            dsts: 0,
            dste: 0,
            read_stream_ident: false,
        }
    }

    /// Gets a reference to the underlying reader in this decoder.
    pub fn get_ref(&self) -> &R {
        &self.r
    }

    /// Gets a mutable reference to the underlying reader in this decoder.
    ///
    /// Note that mutation of the stream may result in surprising results if
    /// this decoder continues to be used.
    pub fn get_mut(&mut self) -> &mut R {
        &mut self.r
    }

    /// Gets the underlying reader of this decoder.
    pub fn into_inner(self) -> R {
        self.r
    }
}
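// A sketch of typical use (illustrative; `data.sz` is a hypothetical file):
//
//     let file = std::fs::File::open("data.sz")?;
//     let mut rdr = snap::read::FrameDecoder::new(io::BufReader::new(file));
//     let mut contents = Vec::new();
//     io::Read::read_to_end(&mut rdr, &mut contents)?;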
impl<R: io::Read> io::Read for FrameDecoder<R> {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        macro_rules! fail {
            ($err:expr) => {
                return Err(io::Error::from($err))
            };
        }
        loop {
            if self.dsts < self.dste {
                let len = cmp::min(self.dste - self.dsts, buf.len());
                let dste = self.dsts.checked_add(len).unwrap();
                buf[0..len].copy_from_slice(&self.dst[self.dsts..dste]);
                self.dsts = dste;
                return Ok(len);
            }
            if !read_exact_eof(&mut self.r, &mut self.src[0..4])? {
                return Ok(0);
            }
            let ty = ChunkType::from_u8(self.src[0]);
            if !self.read_stream_ident {
                if ty != Ok(ChunkType::Stream) {
                    fail!(Error::StreamHeader { byte: self.src[0] });
                }
                self.read_stream_ident = true;
            }
            let len64 = bytes::read_u24_le(&self.src[1..]) as u64;
            if len64 > self.src.len() as u64 {
                fail!(Error::UnsupportedChunkLength {
                    len: len64,
                    header: false,
                });
            }
            let len = len64 as usize;
            match ty {
                Err(b) if 0x02 <= b && b <= 0x7F => {
                    // Spec says that chunk types 0x02-0x7F are reserved and
                    // conformant decoders must return an error.
                    fail!(Error::UnsupportedChunkType { byte: b });
                }
                Err(b) if 0x80 <= b && b <= 0xFD => {
                    // Spec says that chunk types 0x80-0xFD are reserved but
                    // skippable.
                    self.r.read_exact(&mut self.src[0..len])?;
                }
                Err(b) => {
                    // Can never happen. 0x02-0x7F and 0x80-0xFD are handled
                    // above in the error case. That leaves 0x00, 0x01, 0xFE
                    // and 0xFF, each of which correspond to one of the four
                    // defined chunk types.
                    unreachable!("BUG: unhandled chunk type: {}", b);
                }
                Ok(ChunkType::Padding) => {
                    // Just read and move on.
                    self.r.read_exact(&mut self.src[0..len])?;
                }
                Ok(ChunkType::Stream) => {
                    if len != STREAM_BODY.len() {
                        fail!(Error::UnsupportedChunkLength {
                            len: len64,
                            header: true,
                        })
                    }
                    self.r.read_exact(&mut self.src[0..len])?;
                    if &self.src[0..len] != STREAM_BODY {
                        fail!(Error::StreamHeaderMismatch {
                            bytes: self.src[0..len].to_vec(),
                        });
                    }
                }
                Ok(ChunkType::Uncompressed) => {
                    if len < 4 {
                        fail!(Error::UnsupportedChunkLength {
                            len: len as u64,
                            header: false,
                        });
                    }
                    let expected_sum = bytes::io_read_u32_le(&mut self.r)?;
                    let n = len - 4;
                    if n > self.dst.len() {
                        fail!(Error::UnsupportedChunkLength {
                            len: n as u64,
                            header: false,
                        });
                    }
                    self.r.read_exact(&mut self.dst[0..n])?;
                    let got_sum =
                        self.checksummer.crc32c_masked(&self.dst[0..n]);
                    if expected_sum != got_sum {
                        fail!(Error::Checksum {
                            expected: expected_sum,
                            got: got_sum,
                        });
                    }
                    self.dsts = 0;
                    self.dste = n;
                }
                Ok(ChunkType::Compressed) => {
                    if len < 4 {
                        fail!(Error::UnsupportedChunkLength {
                            len: len as u64,
                            header: false,
                        });
                    }
                    let expected_sum = bytes::io_read_u32_le(&mut self.r)?;
                    let sn = len - 4;
                    if sn > self.src.len() {
                        fail!(Error::UnsupportedChunkLength {
                            len: len64,
                            header: false,
                        });
                    }
                    self.r.read_exact(&mut self.src[0..sn])?;
                    let dn = decompress_len(&self.src)?;
                    if dn > self.dst.len() {
                        fail!(Error::UnsupportedChunkLength {
                            len: dn as u64,
                            header: false,
                        });
                    }
                    self.dec
                        .decompress(&self.src[0..sn], &mut self.dst[0..dn])?;
                    let got_sum =
                        self.checksummer.crc32c_masked(&self.dst[0..dn]);
                    if expected_sum != got_sum {
                        fail!(Error::Checksum {
                            expected: expected_sum,
                            got: got_sum,
                        });
                    }
                    self.dsts = 0;
                    self.dste = dn;
                }
            }
        }
    }
}

impl<R: fmt::Debug> fmt::Debug for FrameDecoder<R> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("FrameDecoder")
            .field("r", &self.r)
            .field("dec", &self.dec)
            .field("checksummer", &self.checksummer)
            .field("src", &"[...]")
            .field("dst", &"[...]")
            .field("dsts", &self.dsts)
            .field("dste", &self.dste)
            .field("read_stream_ident", &self.read_stream_ident)
            .finish()
    }
}

/// A reader for compressing data using snappy as it is read.
///
/// This `FrameEncoder` wraps any other reader that implements
/// `std::io::Read`. Bytes read from this reader are compressed using the
/// [Snappy frame format](https://github.com/google/snappy/blob/master/framing_format.txt)
/// (file extension `sz`, MIME type `application/x-snappy-framed`).
///
/// Usually you'll want
/// [`read::FrameDecoder`](struct.FrameDecoder.html)
/// (for decompressing while reading) or
/// [`write::FrameEncoder`](../write/struct.FrameEncoder.html)
/// (for compressing while writing) instead.
///
/// Unlike `FrameDecoder`, this will attempt to make large reads roughly
/// equivalent to the size of a single Snappy block. Therefore, callers may
/// not benefit from using a buffered reader.
pub struct FrameEncoder<R> {
    /// Internally, we split `FrameEncoder` in two to keep the borrow
    /// checker happy. The `inner` member contains everything that
    /// `read_frame` needs to fetch a frame's worth of data and compress it.
    inner: Inner<R>,
    /// Data that we've encoded and are ready to return to our caller.
    dst: Vec<u8>,
    /// Starting point of bytes in `dst` not yet given back to the caller.
    dsts: usize,
    /// Ending point of bytes in `dst` that we want to give to our caller.
    dste: usize,
}

struct Inner<R> {
    /// The underlying data source.
    r: R,
    /// An encoder that we reuse that does the actual block based
    /// compression.
    enc: Encoder,
    /// A CRC32 checksummer that is configured to either use the portable
    /// fallback version or the SSE4.2 accelerated version when the right
    /// CPU features are available.
    checksummer: CheckSummer,
    /// Data taken from the underlying `r`, and not yet compressed.
    src: Vec<u8>,
    /// Have we written the standard snappy header to `dst` yet?
    wrote_stream_ident: bool,
}
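// A sketch of compressing while reading (illustrative; `data` is a
// hypothetical file):
//
//     let file = std::fs::File::open("data")?;
//     let mut rdr = snap::read::FrameEncoder::new(file);
//     let mut compressed = Vec::new();
//     io::Read::read_to_end(&mut rdr, &mut compressed)?;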
impl<R: io::Read> FrameEncoder<R> {
    /// Create a new reader for streaming Snappy compression.
    pub fn new(rdr: R) -> FrameEncoder<R> {
        FrameEncoder {
            inner: Inner {
                r: rdr,
                enc: Encoder::new(),
                checksummer: CheckSummer::new(),
                src: vec![0; MAX_BLOCK_SIZE],
                wrote_stream_ident: false,
            },
            dst: vec![0; MAX_READ_FRAME_ENCODER_BLOCK_SIZE],
            dsts: 0,
            dste: 0,
        }
    }

    /// Gets a reference to the underlying reader in this encoder.
    pub fn get_ref(&self) -> &R {
        &self.inner.r
    }

    /// Gets a mutable reference to the underlying reader in this encoder.
    ///
    /// Note that mutation of the stream may result in surprising results if
    /// this encoder continues to be used.
    pub fn get_mut(&mut self) -> &mut R {
        &mut self.inner.r
    }

    /// Read previously compressed data from `self.dst`, returning the
    /// number of bytes read. If `self.dst` is empty, returns 0.
    fn read_from_dst(&mut self, buf: &mut [u8]) -> usize {
        let available_bytes = self.dste - self.dsts;
        let count = cmp::min(available_bytes, buf.len());
        buf[..count].copy_from_slice(&self.dst[self.dsts..self.dsts + count]);
        self.dsts += count;
        count
    }
}

impl<R: io::Read> io::Read for FrameEncoder<R> {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // Try reading previously compressed bytes from our `dst` buffer, if
        // any.
        let count = self.read_from_dst(buf);

        if count > 0 {
            // We had some bytes in our `dst` buffer that we used.
            Ok(count)
        } else if buf.len() >= MAX_READ_FRAME_ENCODER_BLOCK_SIZE {
            // Our output `buf` is big enough that we can directly write into
            // it, so bypass `dst` entirely.
            self.inner.read_frame(buf)
        } else {
            // We need to refill `self.dst`, and then return some bytes from
            // that.
            let count = self.inner.read_frame(&mut self.dst)?;
            self.dsts = 0;
            self.dste = count;
            Ok(self.read_from_dst(buf))
        }
    }
}

impl<R: io::Read> Inner<R> {
    /// Read from `self.r`, and create a new frame, writing it to `dst`,
    /// which must be at least `MAX_READ_FRAME_ENCODER_BLOCK_SIZE` bytes in
    /// size.
    fn read_frame(&mut self, dst: &mut [u8]) -> io::Result<usize> {
        debug_assert!(dst.len() >= MAX_READ_FRAME_ENCODER_BLOCK_SIZE);

        // We make one read to the underlying reader. If the underlying
        // reader doesn't fill the buffer but there are still bytes to be
        // read, then compression won't be optimal. The alternative would be
        // to block until our buffer is maximally full (or we see EOF), but
        // this seems more surprising. In general, io::Read implementations
        // should try to fill the caller's buffer as much as they can, so
        // this seems like the better choice.
        let nread = self.r.read(&mut self.src)?;
        if nread == 0 {
            return Ok(0);
        }

        // If we haven't yet written the stream header to `dst`, write it.
        let mut dst_write_start = 0;
        if !self.wrote_stream_ident {
            dst[0..STREAM_IDENTIFIER.len()]
                .copy_from_slice(STREAM_IDENTIFIER);
            dst_write_start += STREAM_IDENTIFIER.len();
            self.wrote_stream_ident = true;
        }

        // Reserve space for our chunk header. We need to use `split_at_mut`
        // so that we can get two mutable slices pointing at non-overlapping
        // parts of `dst`.
        let (chunk_header, remaining_dst) =
            dst[dst_write_start..].split_at_mut(CHUNK_HEADER_AND_CRC_SIZE);
        dst_write_start += CHUNK_HEADER_AND_CRC_SIZE;

        // Compress our frame if possible, telling `compress_frame` to
        // always put the output in `dst`.
        let frame_data = compress_frame(
            &mut self.enc,
            self.checksummer,
            &self.src[..nread],
            chunk_header,
            remaining_dst,
            true,
        )?;
        Ok(dst_write_start + frame_data.len())
    }
}
impl<R: fmt::Debug> fmt::Debug for FrameEncoder<R> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("FrameEncoder")
            .field("inner", &self.inner)
            .field("dst", &"[...]")
            .field("dsts", &self.dsts)
            .field("dste", &self.dste)
            .finish()
    }
}

impl<R: fmt::Debug> fmt::Debug for Inner<R> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("Inner")
            .field("r", &self.r)
            .field("enc", &self.enc)
            .field("checksummer", &self.checksummer)
            .field("src", &"[...]")
            .field("wrote_stream_ident", &self.wrote_stream_ident)
            .finish()
    }
}

// read_exact_eof is like Read::read_exact, except it detects EOF
// and returns Ok(false) instead of an error.
//
// If buf was read successfully, it returns Ok(true).
fn read_exact_eof<R: io::Read>(
    rdr: &mut R,
    buf: &mut [u8],
) -> io::Result<bool> {
    match rdr.read(buf) {
        // EOF
        Ok(0) => Ok(false),
        // Read everything w/ the read call
        Ok(i) if i == buf.len() => Ok(true),
        // There's some bytes left to fill, which can be deferred to
        // read_exact
        Ok(i) => {
            rdr.read_exact(&mut buf[i..])?;
            Ok(true)
        }
        Err(e) => Err(e),
    }
}
snap-1.1.1/src/tag.rs000064400000000000000000000001111046102023000124700ustar 00000000000000
// Generated by build.rs.
include!(concat!(env!("OUT_DIR"), "/tag.rs"));
snap-1.1.1/src/varint.rs000064400000000000000000000015621046102023000132310ustar 00000000000000
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
pub fn write_varu64(data: &mut [u8], mut n: u64) -> usize {
    let mut i = 0;
    while n >= 0b1000_0000 {
        data[i] = (n as u8) | 0b1000_0000;
        n >>= 7;
        i += 1;
    }
    data[i] = n as u8;
    i + 1
}

/// https://developers.google.com/protocol-buffers/docs/encoding#varints
pub fn read_varu64(data: &[u8]) -> (u64, usize) {
    let mut n: u64 = 0;
    let mut shift: u32 = 0;
    for (i, &b) in data.iter().enumerate() {
        if b < 0b1000_0000 {
            return match (b as u64).checked_shl(shift) {
                None => (0, 0),
                Some(b) => (n | b, i + 1),
            };
        }
        match ((b as u64) & 0b0111_1111).checked_shl(shift) {
            None => return (0, 0),
            Some(b) => n |= b,
        }
        shift += 7;
    }
    (0, 0)
}
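// A worked example of the varint functions above (illustrative): encoding
// `300 = 0b10_0101100` produces the two bytes `[0xAC, 0x02]` (low seven bits
// `0101100` with the continuation bit set, then the high bits `10`), and
// `read_varu64(&[0xAC, 0x02])` returns `(300, 2)`. A varint that would
// overflow a `u64` yields the `(0, 0)` error sentinel.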
snap-1.1.1/src/write.rs000064400000000000000000000170371046102023000130640ustar 00000000000000
/*!
This module provides a `std::io::Write` implementation:

- `write::FrameEncoder` wraps another `std::io::Write` implementation, and
  compresses its data using the Snappy frame format. Use this if you have an
  uncompressed data source and wish to write it as compressed data.

It would also be possible to provide a `write::FrameDecoder`, which
decompresses data as it writes it, but it hasn't been implemented yet.
*/

use std::fmt;
use std::io::{self, Write};

use crate::compress::Encoder;
use crate::crc32::CheckSummer;
pub use crate::error::IntoInnerError;
use crate::frame::{
    compress_frame, CHUNK_HEADER_AND_CRC_SIZE, MAX_COMPRESS_BLOCK_SIZE,
    STREAM_IDENTIFIER,
};
use crate::MAX_BLOCK_SIZE;

/// A writer for compressing a Snappy stream.
///
/// This `FrameEncoder` wraps any other writer that implements `io::Write`.
/// Bytes written to this writer are compressed using the [Snappy frame
/// format](https://github.com/google/snappy/blob/master/framing_format.txt)
/// (file extension `sz`, MIME type `application/x-snappy-framed`).
///
/// Writes are buffered automatically, so there's no need to wrap the given
/// writer in a `std::io::BufWriter`.
///
/// The writer will be flushed automatically when it is dropped. If an error
/// occurs, it is ignored.
pub struct FrameEncoder<W: io::Write> {
    /// Our main internal state, split out for borrowck reasons (happily
    /// paid).
    ///
    /// Also, it's an `Option` so we can move out of it even though
    /// `FrameEncoder` impls `Drop`.
    inner: Option<Inner<W>>,
    /// Our buffer of uncompressed bytes. This isn't part of `inner` because
    /// we may write bytes directly from the caller if the given buffer was
    /// big enough. As a result, the main `write` implementation needs to
    /// accept either the internal buffer or the caller's bytes directly.
    /// Since `write` requires a mutable borrow, we satisfy the borrow
    /// checker by separating `src` from the rest of the state.
    src: Vec<u8>,
}

struct Inner<W> {
    /// The underlying writer.
    w: W,
    /// An encoder that we reuse that does the actual block based
    /// compression.
    enc: Encoder,
    /// A CRC32 checksummer that is configured to either use the portable
    /// fallback version or the SSE4.2 accelerated version when the right
    /// CPU features are available.
    checksummer: CheckSummer,
    /// The compressed bytes buffer. Bytes are compressed from src (usually)
    /// to dst before being written to w.
    dst: Vec<u8>,
    /// When false, the stream identifier (with magic bytes) must precede
    /// the next write.
    wrote_stream_ident: bool,
    /// Space for writing the header of a chunk before writing it to the
    /// underlying writer.
    chunk_header: [u8; 8],
}

impl<W: io::Write> FrameEncoder<W> {
    /// Create a new writer for streaming Snappy compression.
    pub fn new(wtr: W) -> FrameEncoder<W> {
        FrameEncoder {
            inner: Some(Inner {
                w: wtr,
                enc: Encoder::new(),
                checksummer: CheckSummer::new(),
                dst: vec![0; MAX_COMPRESS_BLOCK_SIZE],
                wrote_stream_ident: false,
                chunk_header: [0; CHUNK_HEADER_AND_CRC_SIZE],
            }),
            src: Vec::with_capacity(MAX_BLOCK_SIZE),
        }
    }

    /// Returns the underlying stream, consuming and flushing this writer.
    ///
    /// If flushing the writer caused an error, then an `IntoInnerError` is
    /// returned, which contains both this encoder and the error that
    /// occurred while flushing.
    pub fn into_inner(
        mut self,
    ) -> Result<W, IntoInnerError<FrameEncoder<W>>> {
        match self.flush() {
            Ok(()) => Ok(self.inner.take().unwrap().w),
            Err(err) => Err(IntoInnerError::new(self, err)),
        }
    }

    /// Gets a reference to the underlying writer in this encoder.
    pub fn get_ref(&self) -> &W {
        &self.inner.as_ref().unwrap().w
    }

    /// Gets a mutable reference to the underlying writer in this encoder.
    ///
    /// Note that mutating the output/input state of the stream may corrupt
    /// this encoder, so care must be taken when using this method.
    pub fn get_mut(&mut self) -> &mut W {
        &mut self.inner.as_mut().unwrap().w
    }
}
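// A sketch of typical use (illustrative; error handling elided and `data.sz`
// is a hypothetical path):
//
//     let out = std::fs::File::create("data.sz")?;
//     let mut wtr = snap::write::FrameEncoder::new(out);
//     io::Write::write_all(&mut wtr, b"some uncompressed bytes")?;
//     let _file = wtr.into_inner()?; // flushes and returns the file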
impl<W: io::Write> Drop for FrameEncoder<W> {
    fn drop(&mut self) {
        if self.inner.is_some() {
            // Ignore errors because we can't conceivably return an error
            // and panicking in a dtor is bad juju.
            let _ = self.flush();
        }
    }
}

impl<W: io::Write> io::Write for FrameEncoder<W> {
    fn write(&mut self, mut buf: &[u8]) -> io::Result<usize> {
        let mut total = 0;
        // If there isn't enough room to add buf to src, then add only a
        // piece of it, flush it and mush on.
        loop {
            let free = self.src.capacity() - self.src.len();
            // n is the number of bytes extracted from buf.
            let n = if buf.len() <= free {
                break;
            } else if self.src.is_empty() {
                // If buf is bigger than our entire buffer then avoid
                // the indirection and write the buffer directly.
                self.inner.as_mut().unwrap().write(buf)?
            } else {
                self.src.extend_from_slice(&buf[0..free]);
                self.flush()?;
                free
            };
            buf = &buf[n..];
            total += n;
        }
        // We're only here if buf.len() will fit within the available space
        // of self.src.
        debug_assert!(buf.len() <= (self.src.capacity() - self.src.len()));
        self.src.extend_from_slice(buf);
        total += buf.len();
        // We should never expand or contract self.src.
        debug_assert!(self.src.capacity() == MAX_BLOCK_SIZE);
        Ok(total)
    }

    fn flush(&mut self) -> io::Result<()> {
        if self.src.is_empty() {
            return Ok(());
        }
        self.inner.as_mut().unwrap().write(&self.src)?;
        self.src.truncate(0);
        Ok(())
    }
}

impl<W: io::Write> Inner<W> {
    fn write(&mut self, mut buf: &[u8]) -> io::Result<usize> {
        let mut total = 0;
        if !self.wrote_stream_ident {
            self.wrote_stream_ident = true;
            self.w.write_all(STREAM_IDENTIFIER)?;
        }
        while !buf.is_empty() {
            // Advance buf and get our block.
            let mut src = buf;
            if src.len() > MAX_BLOCK_SIZE {
                src = &src[0..MAX_BLOCK_SIZE];
            }
            buf = &buf[src.len()..];
            let frame_data = compress_frame(
                &mut self.enc,
                self.checksummer,
                src,
                &mut self.chunk_header,
                &mut self.dst,
                false,
            )?;
            self.w.write_all(&self.chunk_header)?;
            self.w.write_all(frame_data)?;
            total += src.len();
        }
        Ok(total)
    }
}

impl<W: fmt::Debug + io::Write> fmt::Debug for FrameEncoder<W> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("FrameEncoder")
            .field("inner", &self.inner)
            .field("src", &"[...]")
            .finish()
    }
}

impl<W: fmt::Debug> fmt::Debug for Inner<W> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("Inner")
            .field("w", &self.w)
            .field("enc", &self.enc)
            .field("checksummer", &self.checksummer)
            .field("dst", &"[...]")
            .field("wrote_stream_ident", &self.wrote_stream_ident)
            .field("chunk_header", &self.chunk_header)
            .finish()
    }
}