deflate-1.0.0/.cargo_vcs_info.json0000644000000001120000000000100124300ustar { "git": { "sha1": "d7a1d651ae3f70904577492a2ab1dbd03a62b443" } } deflate-1.0.0/.gitattributes000064400000000000000000000000230072674642500141440ustar 00000000000000tests/*.txt binary deflate-1.0.0/.gitignore000064400000000000000000000004400072674642500132440ustar 00000000000000# Compiled files *.o *.so *.rlib *.dll # Executables *.exe # Generated by Cargo /target/ # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html Cargo.lock /data/ deflate-1.0.0/.travis.yml000064400000000000000000000001560072674642500133710ustar 00000000000000language: rust rust: - stable - beta - nightly - 1.32.0 os: - linux - osx - windows sudo: false deflate-1.0.0/Cargo.toml0000644000000023110000000000100104310ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2018" name = "deflate" version = "1.0.0" authors = ["oyvindln "] exclude = ["tests/*", "fuzz/*"] description = "A DEFLATE, zlib and gzip encoder written in Rust.\n" homepage = "https://github.com/image-rs/deflate-rs" documentation = "https://docs.rs/deflate/" readme = "README.md" keywords = ["flate", "deflate", "zlib", "compression", "gzip"] categories = ["compression"] license = "MIT/Apache-2.0" repository = "https://github.com/image-rs/deflate-rs" [package.metadata.docs.rs] features = ["gzip"] [dependencies.adler32] version = "1.2.0" [dependencies.gzip-header] version = "1.0" optional = true [dev-dependencies.miniz_oxide] version = "0.5.0" [features] benchmarks = [] gzip = ["gzip-header"] deflate-1.0.0/Cargo.toml.orig0000644000000013530000000000100113750ustar [package] name = "deflate" version = "1.0.0" edition = "2018" license = "MIT/Apache-2.0" authors = ["oyvindln "] readme = "README.md" keywords = ["flate", "deflate", "zlib", "compression", "gzip"] repository = "https://github.com/image-rs/deflate-rs" homepage = "https://github.com/image-rs/deflate-rs" documentation = "https://docs.rs/deflate/" description = """ A DEFLATE, zlib and gzip encoder written in Rust. """ categories = ["compression"] exclude = [ "tests/*", "fuzz/*" ] [dependencies] adler32 = "1.2.0" gzip-header = { version = "1.0", optional = true } [dev-dependencies] miniz_oxide = "0.5.0" [features] benchmarks = [] gzip = ["gzip-header"] [package.metadata.docs.rs] features = ["gzip"] deflate-1.0.0/Cargo.toml.orig000064400000000000000000000013530072674642500141470ustar 00000000000000[package] name = "deflate" version = "1.0.0" edition = "2018" license = "MIT/Apache-2.0" authors = ["oyvindln "] readme = "README.md" keywords = ["flate", "deflate", "zlib", "compression", "gzip"] repository = "https://github.com/image-rs/deflate-rs" homepage = "https://github.com/image-rs/deflate-rs" documentation = "https://docs.rs/deflate/" description = """ A DEFLATE, zlib and gzip encoder written in Rust. 
""" categories = ["compression"] exclude = [ "tests/*", "fuzz/*" ] [dependencies] adler32 = "1.2.0" gzip-header = { version = "1.0", optional = true } [dev-dependencies] miniz_oxide = "0.5.0" [features] benchmarks = [] gzip = ["gzip-header"] [package.metadata.docs.rs] features = ["gzip"] deflate-1.0.0/LICENSE-APACHE000064400000000000000000000251370072674642500132120ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. deflate-1.0.0/LICENSE-MIT000064400000000000000000000020530072674642500127120ustar 00000000000000The MIT License (MIT) Copyright (c) 2016 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. deflate-1.0.0/README.md000064400000000000000000000051530072674642500125410ustar 00000000000000# deflate-rs

[![Build Status](https://travis-ci.com/image-rs/deflate-rs.svg)](https://travis-ci.com/github/image-rs/deflate-rs)[![Crates.io](https://img.shields.io/crates/v/deflate.svg)](https://crates.io/crates/deflate)[![Docs](https://docs.rs/deflate/badge.svg)](https://docs.rs/deflate)

An implementation of a [DEFLATE](http://www.gzip.org/zlib/rfc-deflate.html) encoder in pure Rust. Not a direct port, but does take some inspiration from [zlib](https://www.zlib.net/), [miniz](https://github.com/richgel999/miniz) and [zopfli](https://github.com/google/zopfli). The API is based on the one in the [flate2](https://crates.io/crates/flate2) crate, which contains bindings to zlib, miniz_oxide, and miniz.

Deflate encoding with and without zlib and gzip metadata is supported (zlib dictionaries are not). No unsafe code is used.

Encoding in gzip format requires enabling the 'gzip' feature.

This library is now mostly in maintenance mode; the focus is on the Rust backend of [flate2](https://crates.io/crates/flate2) instead.

The minimum required Rust version is 1.32.0 due to the use of library functions for endianness conversion (the unit tests require a newer version).

# Usage:
## Simple compression function:

``` rust
use deflate::deflate_bytes;

let data = b"Some data";
let compressed = deflate_bytes(&data);
```

## Using a writer:

``` rust
use std::io::Write;

use deflate::Compression;
use deflate::write::ZlibEncoder;

let data = b"This is some test data";
let mut encoder = ZlibEncoder::new(Vec::new(), Compression::Default);
encoder.write_all(data).unwrap();
let compressed_data = encoder.finish().unwrap();
```

# Other deflate/zlib Rust projects from various people

* [flate2](https://github.com/rust-lang/flate2-rs) FLATE, Gzip, and Zlib bindings for Rust - can use miniz_oxide for a pure Rust implementation. 
* [Zopfli in Rust](https://github.com/carols10cents/zopfli) Rust port of zopfli * [inflate](https://github.com/image-rs/inflate) DEFLATE decoder implemented in Rust * [miniz-oxide](https://github.com/Frommi/miniz_oxide) Port of miniz to Rust. * [libflate](https://github.com/sile/libflate) Another DEFLATE/Zlib/Gzip encoder and decoder written in Rust. (Only does some very light compression). # License deflate is distributed under the terms of both the MIT and Apache 2.0 licences. bitstream.rs is © @nwin and was released under both MIT and Apache 2.0 Some code in length_encode.rs has been ported from the `miniz` library, which is public domain. The test data (tests/pg11.txt) is borrowed from [Project Gutenberg](https://www.gutenberg.org/ebooks/11) and is available under public domain, or the Project Gutenberg Licence deflate-1.0.0/changelog.md000064400000000000000000000042070072674642500135320ustar 00000000000000 ## 1.0.0 (2021-11-10) * This release does not add any new features or change the API, only some minor refactor and doc fixes. It's mainly for indicating crate stability. #### Bug Fixes * fix warnings from rustc and clippy, update dependencies ([a5fcb05e](https://github.com/image-rs/deflate-rs/commit/a5fcb05e74cf6529ebb98b46701f666f1039c5e3)) * Fix test data location and capitalize Rust, update various links, see pr #53 (thanks @atouchet) ([34fecd18](https://github.com/image-rs/deflate-rs/commit/34fecd18cafaf40ea71abfdcbf054db90e7ce5fa)) ### 0.9.1 (2021-03-24) #### Bug Fixes * Fix gzip feature that was broken in 0.9 (thanks @oheralla) ([49ac5cfe](https://github.com/image-rs/deflate-rs/commit/49ac5cfec5e1a6c4398a8753309e1f7d66108c41)) ## 0.9.0 (2021-01-21) #### Bug Fixes * Use std functions instead of byteorder (bumps minimum version to 1.32.0 ([d217fbd9](https://github.com/image-rs/deflate-rs/commit/d217fbd956597706d80efc1de93c65f4fbe957fd)) ### 0.8.6 (2020-07-06) #### Bug Fixes * try to fix issues with sync flush behaviour ([6c97e514](https://github.com/image-rs/deflate-rs/commit/6c97e5143df139af578cdd884e0dee9940414ea1), closes [#48](https://github.com/image-rs/deflate-rs/issues/48)) * add #!forbid(unsafe_code) to crate root ([fcbe4206](https://github.com/image-rs/deflate-rs/commit/fcbe4206c45cf55d80ae8feb94f0613fe795659f)) ### 0.8.5 (2020-07-04) #### Bug Fixes * Avoid infinitely looping on sync flush with short buffer writers ([99a1a75f](99a1a75f), closes [#47](https://github.com/image-rs/deflate-rs/issues/47)) * Remove unsafe in write_length_rle ([77227c8b](77227c8b), closes [#46](https://github.com/image-rs/deflate-rs/issues/46)) ### 0.8.4 (2020-04-04) #### Bug Fixes * Fix block size counter bug [#44](https://github.com/image-rs/deflate-rs/issues/44) (probably introduced in 1b70be) that triggered a debug assertion and that could possibly in theory cause stored block to start at the wrong input position at a block split with low entropy data followed by uncompressible data. deflate-1.0.0/src/bit_reverse.rs000064400000000000000000000015740072674642500147330ustar 00000000000000/// Reverse the first length bits of n. /// (Passing more than 16 as length will produce garbage. 
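///
/// An illustrative example, mirroring the unit test below; DEFLATE stores
/// Huffman codes most-significant-bit first, while the LSB-first bit writer
/// emits bits least-significant-bit first, so codes are bit-reversed before
/// being written:
///
/// ```text
/// reverse_bits(0b0111_0100, 8)              == 0b0010_1110
/// reverse_bits(0b1100_1100_1100_1100, 16)   == 0b0011_0011_0011_0011
/// ```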
pub fn reverse_bits(mut n: u16, length: u8) -> u16 {
    // Borrowed from http://aggregate.org/MAGIC/#Bit%20Reversal
    n = ((n & 0xaaaa) >> 1) | ((n & 0x5555) << 1);
    n = ((n & 0xcccc) >> 2) | ((n & 0x3333) << 2);
    n = ((n & 0xf0f0) >> 4) | ((n & 0x0f0f) << 4);
    n = ((n & 0xff00) >> 8) | ((n & 0x00ff) << 8);
    n >> (16 - length)
}

#[cfg(test)]
mod test {
    use super::reverse_bits;
    #[test]
    fn test_bit_reverse() {
        assert_eq!(reverse_bits(0b0111_0100, 8), 0b0010_1110);
        assert_eq!(
            reverse_bits(0b1100_1100_1100_1100, 16),
            0b0011_0011_0011_0011
        );
        // Check that we ignore >16 length
        // We no longer check for this.
        // assert_eq!(reverse_bits(0b11001100_11001100, 32), 0b0011001100110011);
    }
}
deflate-1.0.0/src/bitstream.rs000064400000000000000000000134770072674642500144170ustar 00000000000000// This was originally based on code from: https://github.com/nwin/lzw
// Copyright (c) 2015 nwin
// which is under both Apache 2.0 and MIT

//! This module provides a bit writer
use std::io::{self, Write};

#[cfg(target_pointer_width = "64")]
#[macro_use]
mod arch_dep {
    /// The data type of the accumulator.
    /// A 64-bit value allows us to store more before
    /// each push to the vector, but is sub-optimal
    /// on 32-bit platforms.
    pub type AccType = u64;
    pub const FLUSH_AT: u8 = 48;
    /// Push pending bits to vector.
    /// Using a macro here since an inline function
    /// didn't optimise properly.
    /// TODO June 2019: See if it's still needed.
    macro_rules! push {
        ($s:ident) => {
            $s.w.extend_from_slice(
                &[
                    $s.acc as u8,
                    ($s.acc >> 8) as u8,
                    ($s.acc >> 16) as u8,
                    ($s.acc >> 24) as u8,
                    ($s.acc >> 32) as u8,
                    ($s.acc >> 40) as u8,
                ][..],
            )
        };
    }
}

#[cfg(not(target_pointer_width = "64"))]
#[macro_use]
mod arch_dep {
    pub type AccType = u32;
    pub const FLUSH_AT: u8 = 16;
    macro_rules! push {
        ($s:ident) => {
            // Unlike the 64-bit case, using copy_from_slice seemed to worsen performance here.
            // TODO: Needs benching on a 32-bit system to see what works best.
            $s.w.push($s.acc as u8);
            $s.w.push(($s.acc >> 8) as u8);
        };
    }
}

use self::arch_dep::*;

/// Writes bits to a byte stream, LSB first.
pub struct LsbWriter {
    // Public for now so it can be replaced after initialization.
    pub w: Vec<u8>,
    bits: u8,
    acc: AccType,
}

impl LsbWriter {
    /// Creates a new bit writer
    pub const fn new(writer: Vec<u8>) -> LsbWriter {
        LsbWriter {
            w: writer,
            bits: 0,
            acc: 0,
        }
    }

    pub const fn pending_bits(&self) -> u8 {
        self.bits
    }

    /// Buffer n number of bits, and write them to the vec if there are enough pending bits.
    pub fn write_bits(&mut self, v: u16, n: u8) {
        // NOTE: This outputs garbage data if n is 0, but v is not 0
        self.acc |= (AccType::from(v)) << self.bits;
        self.bits += n;
        // Waiting until we have FLUSH_AT bits and pushing them all in one batch.
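        // A hedged trace of the flush below on a 64-bit target (FLUSH_AT == 48):
        // if `bits` has just reached 50, one pass of the loop pushes the low 48
        // bits of `acc` as six bytes via a single `extend_from_slice` (the
        // `push!` macro above), then keeps the remaining 2 bits pending:
        //
        //     push!(self);     // the low six bytes of acc appended to self.w
        //     self.acc >>= 48; // the 2 leftover bits move to the bottom
        //     self.bits -= 48; // bits == 2
        //
        // On 32-bit targets FLUSH_AT is 16, so two bytes go out per pass.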
while self.bits >= FLUSH_AT { push!(self); self.acc >>= FLUSH_AT; self.bits -= FLUSH_AT; } } fn write_bits_finish(&mut self, v: u16, n: u8) { // NOTE: This outputs garbage data if n is 0, but v is not 0 self.acc |= (AccType::from(v)) << self.bits; self.bits += n % 8; while self.bits >= 8 { self.w.push(self.acc as u8); self.acc >>= 8; self.bits -= 8; } } pub fn flush_raw(&mut self) { let missing = FLUSH_AT - self.bits; // Have to test for self.bits > 0 here, // otherwise flush would output an extra byte when flush was called at a byte boundary if missing > 0 && self.bits > 0 { self.write_bits_finish(0, missing); } } } impl Write for LsbWriter { fn write(&mut self, buf: &[u8]) -> io::Result { if self.acc == 0 { self.w.extend_from_slice(buf) } else { for &byte in buf.iter() { self.write_bits(u16::from(byte), 8) } } Ok(buf.len()) } fn flush(&mut self) -> io::Result<()> { self.flush_raw(); Ok(()) } } #[cfg(test)] mod test { use super::LsbWriter; #[test] fn write_bits() { let input = [ (3, 3), (10, 8), (88, 7), (0, 2), (0, 5), (0, 0), (238, 8), (126, 8), (161, 8), (10, 8), (238, 8), (174, 8), (126, 8), (174, 8), (65, 8), (142, 8), (62, 8), (10, 8), (1, 8), (161, 8), (78, 8), (62, 8), (158, 8), (206, 8), (10, 8), (64, 7), (0, 0), (24, 5), (0, 0), (174, 8), (126, 8), (193, 8), (174, 8), ]; let expected = [ 83, 192, 2, 220, 253, 66, 21, 220, 93, 253, 92, 131, 28, 125, 20, 2, 66, 157, 124, 60, 157, 21, 128, 216, 213, 47, 216, 21, ]; let mut writer = LsbWriter::new(Vec::new()); for v in input.iter() { writer.write_bits(v.0, v.1); } writer.flush_raw(); assert_eq!(writer.w, expected); } } #[cfg(all(test, feature = "benchmarks"))] mod bench { use super::LsbWriter; use test_std::Bencher; #[bench] fn bit_writer(b: &mut Bencher) { let input = [ (3, 3), (10, 8), (88, 7), (0, 2), (0, 5), (0, 0), (238, 8), (126, 8), (161, 8), (10, 8), (238, 8), (174, 8), (126, 8), (174, 8), (65, 8), (142, 8), (62, 8), (10, 8), (1, 8), (161, 8), (78, 8), (62, 8), (158, 8), (206, 8), (10, 8), (64, 7), (0, 0), (24, 5), (0, 0), (174, 8), (126, 8), (193, 8), (174, 8), ]; let mut writer = LsbWriter::new(Vec::with_capacity(100)); b.iter(|| { for v in input.iter() { let _ = writer.write_bits(v.0, v.1); } }); } } deflate-1.0.0/src/chained_hash_table.rs000064400000000000000000000262160072674642500161670ustar 00000000000000pub const WINDOW_SIZE: usize = 32768; pub const WINDOW_MASK: usize = WINDOW_SIZE - 1; #[cfg(test)] pub const HASH_BYTES: usize = 3; const HASH_SHIFT: u16 = 5; const HASH_MASK: u16 = WINDOW_MASK as u16; /// Helper struct to let us allocate both head and prev in the same block. struct Tables { /// Starts of hash chains (in prev) pub head: [u16; WINDOW_SIZE], /// Link to previous occurence of this hash value pub prev: [u16; WINDOW_SIZE], } impl Default for Tables { #[inline] fn default() -> Tables { Tables { head: [0; WINDOW_SIZE], prev: [0; WINDOW_SIZE], } } } impl Tables { #[inline] fn fill_prev(&mut self) { self.prev.copy_from_slice(&self.head); } } /// Create and box the hash chains. fn create_tables() -> Box { // Using default here is a trick to get around the lack of box syntax on stable Rust. // // Box::new([0u16,n]) ends up creating an temporary array on the stack which is not optimised // but using default, which simply calls `box value` internally allows us to get around this. // // We could use vec instead, but using a boxed array helps the compiler optimise // away bounds checks as `n & WINDOW_MASK < WINDOW_SIZE` will always be true. 
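    // A hedged sketch of the rejected alternative mentioned above:
    //
    //     let mut head = vec![0u16; WINDOW_SIZE];
    //     head[hash as usize] // bounds check remains unless the length is provable
    //
    // With a boxed `[u16; WINDOW_SIZE]`, the array length is part of the type,
    // so indexing with a value masked to less than WINDOW_SIZE lets the
    // compiler drop the check.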
let mut t: Box = Box::default(); for (n, b) in t.head.iter_mut().enumerate() { *b = n as u16; } t.fill_prev(); t } /// Returns a new hash value based on the previous value and the next byte #[inline] pub fn update_hash(current_hash: u16, to_insert: u8) -> u16 { update_hash_conf(current_hash, to_insert, HASH_SHIFT, HASH_MASK) } #[inline] fn update_hash_conf(current_hash: u16, to_insert: u8, shift: u16, mask: u16) -> u16 { ((current_hash << shift) ^ (u16::from(to_insert))) & mask } #[inline] fn reset_array(arr: &mut [u16; WINDOW_SIZE]) { for (n, b) in arr.iter_mut().enumerate() { *b = n as u16; } } pub struct ChainedHashTable { // Current running hash value of the last 3 bytes current_hash: u16, // Hash chains. c: Box, // Used for testing // count: DebugCounter, } impl ChainedHashTable { pub fn new() -> ChainedHashTable { ChainedHashTable { current_hash: 0, c: create_tables(), //count: DebugCounter::default(), } } #[cfg(test)] pub fn from_starting_values(v1: u8, v2: u8) -> ChainedHashTable { let mut t = ChainedHashTable::new(); t.current_hash = update_hash(t.current_hash, v1); t.current_hash = update_hash(t.current_hash, v2); t } /// Resets the hash value and hash chains pub fn reset(&mut self) { self.current_hash = 0; reset_array(&mut self.c.head); { let h = self.c.head; let mut c = self.c.prev; c[..].copy_from_slice(&h[..]); } /*if cfg!(debug_assertions) { self.count.reset(); }*/ } pub fn add_initial_hash_values(&mut self, v1: u8, v2: u8) { self.current_hash = update_hash(self.current_hash, v1); self.current_hash = update_hash(self.current_hash, v2); } /// Insert a byte into the hash table #[inline] pub fn add_hash_value(&mut self, position: usize, value: u8) { // Check that all bytes are input in order and at the correct positions. // Disabled for now as it breaks when sync flushing. /*debug_assert_eq!( position & WINDOW_MASK, self.count.get() as usize & WINDOW_MASK );*/ debug_assert!( position < WINDOW_SIZE * 2, "Position is larger than 2 * window size! {}", position ); // Storing the hash in a temporary variable here makes the compiler avoid the // bounds checks in this function. let new_hash = update_hash(self.current_hash, value); self.add_with_hash(position, new_hash); // Update the stored hash value with the new hash. self.current_hash = new_hash; } /// Directly set the current hash value #[inline] pub fn set_hash(&mut self, hash: u16) { self.current_hash = hash; } /// Update the tables directly, providing the hash. #[inline] pub fn add_with_hash(&mut self, position: usize, hash: u16) { /*if cfg!(debug_assertions) { self.count.add(1); }*/ self.c.prev[position & WINDOW_MASK] = self.c.head[hash as usize]; // Ignoring any bits over 16 here is deliberate, as we only concern ourselves about // where in the buffer (which is 64k bytes) we are referring to. 
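        // A hedged worked example of this two-step update: say hash == 0x0123
        // and the same three bytes were last seen at position 100, so
        // head[0x0123] == 100. Inserting position 5000 first links back via
        // prev[5000 & WINDOW_MASK] = 100 (the line above), then the line below
        // sets head[0x0123] = 5000, so walking get_prev() from the head visits
        // occurrences newest-first: 5000 -> 100 -> ...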
self.c.head[hash as usize] = position as u16; } // Get the head of the hash chain for the current hash value #[cfg(test)] #[inline] pub const fn current_head(&self) -> u16 { self.c.head[self.current_hash as usize] } #[inline] pub const fn current_hash(&self) -> u16 { self.current_hash } #[inline] pub const fn get_prev(&self, bytes: usize) -> u16 { self.c.prev[bytes & WINDOW_MASK] } #[cfg(test)] #[inline] pub fn farthest_next(&self, match_pos: usize, match_len: usize) -> usize { let to_check = match_len.saturating_sub(2); let mut n = 0; let mut smallest_prev = self.get_prev(match_pos); let mut smallest_pos = 0; while n < to_check { let prev = self.get_prev(match_pos + n); if prev < smallest_prev { smallest_prev = prev; smallest_pos = n; } n += 1; } smallest_pos } #[inline] fn slide_value(b: u16, pos: u16, bytes: u16) -> u16 { if b >= bytes { b - bytes } else { pos } } #[inline] fn slide_table(table: &mut [u16; WINDOW_SIZE], bytes: u16) { for (n, b) in table.iter_mut().enumerate() { *b = ChainedHashTable::slide_value(*b, n as u16, bytes); } } pub fn slide(&mut self, bytes: usize) { /*if cfg!(debug_assertions) && bytes != WINDOW_SIZE { // This should only happen in tests in this file. self.count.reset(); }*/ ChainedHashTable::slide_table(&mut self.c.head, bytes as u16); ChainedHashTable::slide_table(&mut self.c.prev, bytes as u16); } } #[cfg(test)] pub fn filled_hash_table(data: &[u8]) -> ChainedHashTable { assert!(data.len() <= (WINDOW_SIZE * 2) + 2); let mut hash_table = ChainedHashTable::from_starting_values(data[0], data[1]); for (n, b) in data[2..].iter().enumerate() { hash_table.add_hash_value(n, *b); } hash_table } #[cfg(test)] mod test { use super::{filled_hash_table, ChainedHashTable}; #[test] fn chained_hash() { use std::str; let test_string = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \ eiusmod tempor. rum. incididunt ut labore et dolore magna aliqua. Ut \ enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi \ ut aliquip ex ea commodo consequat. rum. Duis aute irure dolor in \ reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla \ pariatur. Excepteur sint occaecat cupidatat non proident, sunt in \ culpa qui officia deserunt mollit anim id est laborum."; let test_data = test_string.as_bytes(); let current_bytes = &test_data[test_data.len() - super::HASH_BYTES..test_data.len()]; let num_iters = test_string .matches(str::from_utf8(current_bytes).unwrap()) .count(); let hash_table = filled_hash_table(test_data); // Test that the positions in the chain are valid let mut prev_value = hash_table.get_prev(hash_table.current_head() as usize) as usize; let mut count = 0; let mut current = hash_table.current_head() as usize; while current != prev_value { count += 1; current = prev_value; prev_value = hash_table.get_prev(prev_value) as usize; } // There should be at least as many occurences of the hash of the checked bytes as the // numbers of occurences of the checked bytes themselves. As the hashes are not large enough // to store 8 * 3 = 24 bits, there could be more with different input data. assert!(count >= num_iters); } #[test] fn table_unique() { let mut test_data = Vec::new(); test_data.extend(0u8..255); test_data.extend(255u8..0); let hash_table = filled_hash_table(&test_data); let prev_pos = hash_table.get_prev(hash_table.current_head() as usize); // Since all sequences in the input are unique, there shouldn't be any previous values. 
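        // Why this holds (sketch): head and prev start out as identity tables
        // (slot n holds n), and the first insertion of a hash h at position p
        // sets prev[p & WINDOW_MASK] to the old head[h], which is still h
        // itself. So when nothing repeats, following the chain one step from
        // the head lands back on the hash value, as asserted here.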
assert_eq!(prev_pos, hash_table.current_hash()); } #[test] fn table_slide() { use std::fs::File; use std::io::Read; let window_size = super::WINDOW_SIZE; let window_size16 = super::WINDOW_SIZE as u16; let mut input = Vec::new(); let mut f = File::open("tests/pg11.txt").unwrap(); f.read_to_end(&mut input).unwrap(); let mut hash_table = filled_hash_table(&input[..window_size + 2]); for (n, b) in input[2..window_size + 2].iter().enumerate() { hash_table.add_hash_value(n + window_size, *b); } hash_table.slide(window_size); { let max_head = hash_table.c.head.iter().max().unwrap(); // After sliding there should be no hashes referring to values // higher than the window size assert!(*max_head < window_size16); assert!(*max_head > 0); let pos = hash_table.get_prev(hash_table.current_head() as usize); // There should be a previous occurence since we inserted the data 3 times assert!(pos < window_size16); assert!(pos > 0); } for (n, b) in input[2..(window_size / 2)].iter().enumerate() { hash_table.add_hash_value(n + window_size, *b); } // There should hashes referring to values in the upper part of the input window // at this point let max_prev = hash_table.c.prev.iter().max().unwrap(); assert!(*max_prev > window_size16); let mut pos = hash_table.current_head(); // There should be a previous occurence since we inserted the data 3 times assert!(pos > window_size16); let end_byte = input[(window_size / 2) - 1 - 2]; let mut iterations = 0; while pos > window_size16 && iterations < 5000 { assert_eq!(input[pos as usize & window_size - 1], end_byte); pos = hash_table.get_prev(pos as usize); iterations += 1; } } #[test] /// Ensure that the initial hash values are correct. fn initial_chains() { let t = ChainedHashTable::new(); for (n, &b) in t.c.head.iter().enumerate() { assert_eq!(n, b as usize); } for (n, &b) in t.c.prev.iter().enumerate() { assert_eq!(n, b as usize); } } } deflate-1.0.0/src/checksum.rs000064400000000000000000000027020072674642500142160ustar 00000000000000use adler32::RollingAdler32; pub trait RollingChecksum { fn update(&mut self, byte: u8); fn update_from_slice(&mut self, data: &[u8]); fn current_hash(&self) -> u32; } pub struct NoChecksum {} impl NoChecksum { pub const fn new() -> NoChecksum { NoChecksum {} } } impl RollingChecksum for NoChecksum { fn update(&mut self, _: u8) {} fn update_from_slice(&mut self, _: &[u8]) {} fn current_hash(&self) -> u32 { 1 } } impl<'a> RollingChecksum for &'a mut NoChecksum { fn update(&mut self, _: u8) {} fn update_from_slice(&mut self, _: &[u8]) {} fn current_hash(&self) -> u32 { 1 } } pub struct Adler32Checksum { adler32: RollingAdler32, } impl Adler32Checksum { pub fn new() -> Adler32Checksum { Adler32Checksum { adler32: RollingAdler32::new(), } } } impl RollingChecksum for Adler32Checksum { fn update(&mut self, byte: u8) { self.adler32.update(byte); } fn update_from_slice(&mut self, data: &[u8]) { self.adler32.update_buffer(data); } fn current_hash(&self) -> u32 { self.adler32.hash() } } impl<'a> RollingChecksum for &'a mut Adler32Checksum { fn update(&mut self, byte: u8) { self.adler32.update(byte); } fn update_from_slice(&mut self, data: &[u8]) { self.adler32.update_buffer(data); } fn current_hash(&self) -> u32 { self.adler32.hash() } } deflate-1.0.0/src/compress.rs000064400000000000000000000325000072674642500142460ustar 00000000000000use std::io; use std::io::Write; use crate::bitstream::LsbWriter; use crate::deflate_state::DeflateState; use crate::encoder_state::EncoderState; use crate::huffman_lengths::{gen_huffman_lengths, 
write_huffman_lengths, BlockType}; use crate::lz77::{lz77_compress_block, LZ77Status}; use crate::lzvalue::LZValue; use crate::stored_block::{compress_block_stored, write_stored_header, MAX_STORED_BLOCK_LENGTH}; const LARGEST_OUTPUT_BUF_SIZE: usize = 1024 * 32; /// Flush mode to use when compressing input received in multiple steps. /// /// (The more obscure ZLIB flush modes are not implemented.) #[derive(Eq, PartialEq, Debug, Copy, Clone)] pub enum Flush { // Simply wait for more input when we are out of input data to process. None, // Send a "sync block", corresponding to Z_SYNC_FLUSH in zlib. This finishes compressing and // outputting all pending data, and then outputs an empty stored block. // (That is, the block header indicating a stored block followed by `0000FFFF`). Sync, _Partial, _Block, _Full, // Finish compressing and output all remaining input. Finish, } /// Write all the lz77 encoded data in the buffer using the specified `EncoderState`, and finish /// with the end of block code. pub fn flush_to_bitstream(buffer: &[LZValue], state: &mut EncoderState) { for &b in buffer { state.write_lzvalue(b.value()); } state.write_end_of_block() } /// Compress the input data using only fixed huffman codes. /// /// Currently only used in tests. #[cfg(test)] pub fn compress_data_fixed(input: &[u8]) -> Vec { use crate::lz77::lz77_compress; let mut state = EncoderState::fixed(Vec::new()); let compressed = lz77_compress(input).unwrap(); // We currently don't split blocks here(this function is just used for tests anyhow) state.write_start_of_block(true, true); flush_to_bitstream(&compressed, &mut state); state.flush(); state.reset(Vec::new()) } fn write_stored_block(input: &[u8], mut writer: &mut LsbWriter, final_block: bool) { // If the input is not zero, we write stored blocks for the input data. if !input.is_empty() { let mut i = input.chunks(MAX_STORED_BLOCK_LENGTH).peekable(); while let Some(chunk) = i.next() { let last_chunk = i.peek().is_none(); // Write the block header write_stored_header(writer, final_block && last_chunk); // Write the actual data. compress_block_stored(chunk, &mut writer).expect("Write error"); } } else { // If the input length is zero, we output an empty block. This is used for syncing. write_stored_header(writer, final_block); compress_block_stored(&[], &mut writer).expect("Write error"); } } /// Inner compression function used by both the writers and the simple compression functions. pub fn compress_data_dynamic_n( input: &[u8], deflate_state: &mut DeflateState, flush: Flush, ) -> io::Result { let mut bytes_written = 0; let mut slice = input; // enter the decompression loop unless we did a sync flush, in case we want to make sure // everything is output before continuing. while !deflate_state.needs_flush { let output_buf_len = deflate_state.output_buf().len(); let output_buf_pos = deflate_state.output_buf_pos; // If the output buffer has too much data in it already, flush it before doing anything // else. if output_buf_len > LARGEST_OUTPUT_BUF_SIZE { let written = deflate_state .inner .as_mut() .expect("Missing writer!") .write(&deflate_state.encoder_state.inner_vec()[output_buf_pos..])?; if written < output_buf_len.checked_sub(output_buf_pos).unwrap() { // Only some of the data was flushed, so keep track of where we were. deflate_state.output_buf_pos += written; } else { // If we flushed all of the output, reset the output buffer. 
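                // (Illustrative numbers: if the wrapped writer accepts only
                // 10_000 of, say, 40_000 pending bytes, the branch above
                // records output_buf_pos = 10_000 and the rest is retried on a
                // later call. Only when the writer takes the whole remaining
                // slice do we reach this point and clear the buffer.)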
deflate_state.needs_flush = false; deflate_state.output_buf_pos = 0; deflate_state.output_buf().clear(); } if bytes_written == 0 { // If the buffer was already full when the function was called, this has to be // returned rather than Ok(0) to indicate that we didn't write anything, but are // not done yet. return Err(io::Error::new( io::ErrorKind::Interrupted, "Internal buffer full.", )); } else { return Ok(bytes_written); } } if deflate_state.lz77_state.is_last_block() { // The last block has already been written, so we don't have anything to compress. break; } let (written, status, position) = lz77_compress_block( slice, &mut deflate_state.lz77_state, &mut deflate_state.input_buffer, &mut deflate_state.lz77_writer, flush, ); // Bytes written in this call bytes_written += written; // Total bytes written since the compression process started // TODO: Should we realistically have to worry about overflowing here? deflate_state.bytes_written += written as u64; if status == LZ77Status::NeedInput { // If we've consumed all the data input so far, and we're not // finishing or syncing or ending the block here, simply return // the number of bytes consumed so far. return Ok(bytes_written); } // Increment start of input data slice = &slice[written..]; // We need to check if this is the last block as the header will then be // slightly different to indicate this. let last_block = deflate_state.lz77_state.is_last_block(); let current_block_input_bytes = deflate_state.lz77_state.current_block_input_bytes(); if cfg!(debug_assertions) { deflate_state .bytes_written_control .add(current_block_input_bytes); } let partial_bits = deflate_state.encoder_state.writer.pending_bits(); let res = { let (l_freqs, d_freqs) = deflate_state.lz77_writer.get_frequencies(); let (l_lengths, d_lengths) = deflate_state.encoder_state.huffman_table.get_lengths_mut(); gen_huffman_lengths( l_freqs, d_freqs, current_block_input_bytes, partial_bits, l_lengths, d_lengths, &mut deflate_state.length_buffers, ) }; // Check if we've actually managed to compress the input, and output stored blocks // if not. match res { BlockType::Dynamic(header) => { // Write the block header. deflate_state .encoder_state .write_start_of_block(false, last_block); // Output the lengths of the huffman codes used in this block. write_huffman_lengths( &header, &deflate_state.encoder_state.huffman_table, &deflate_state.length_buffers.length_buf, &mut deflate_state.encoder_state.writer, ); // Uupdate the huffman codes that will be used to encode the // lz77-compressed data. deflate_state .encoder_state .huffman_table .update_from_lengths(); // Write the huffman compressed data and the end of block marker. flush_to_bitstream( deflate_state.lz77_writer.get_buffer(), &mut deflate_state.encoder_state, ); } BlockType::Fixed => { // Write the block header for fixed code blocks. deflate_state .encoder_state .write_start_of_block(true, last_block); // Use the pre-defined static huffman codes. deflate_state.encoder_state.set_huffman_to_fixed(); // Write the compressed data and the end of block marker. flush_to_bitstream( deflate_state.lz77_writer.get_buffer(), &mut deflate_state.encoder_state, ); } BlockType::Stored => { // If compression fails, output a stored block instead. let start_pos = position.saturating_sub(current_block_input_bytes as usize); assert!( position >= current_block_input_bytes as usize, "Error! Trying to output a stored block with forgotten data!\ if you encounter this error, please file an issue!" 
); write_stored_block( &deflate_state.input_buffer.get_buffer()[start_pos..position], &mut deflate_state.encoder_state.writer, flush == Flush::Finish && last_block, ); } }; // Clear the current lz77 data in the writer for the next call. deflate_state.lz77_writer.clear(); // We are done with the block, so we reset the number of bytes taken // for the next one. deflate_state.lz77_state.reset_input_bytes(); // We are done for now. if status == LZ77Status::Finished { // This flush mode means that there should be an empty stored block at the end. if flush == Flush::Sync { write_stored_block(&[], &mut deflate_state.encoder_state.writer, false); // Indicate that we need to flush the buffers before doing anything else. deflate_state.needs_flush = true; } else if !deflate_state.lz77_state.is_last_block() { // Make sure a block with the last block header has been output. // Not sure this can actually happen, but we make sure to finish properly // if it somehow does. // An empty fixed block is the shortest. let es = &mut deflate_state.encoder_state; es.set_huffman_to_fixed(); es.write_start_of_block(true, true); es.write_end_of_block(); } break; } } // If we reach this point, the remaining data in the buffers is to be flushed. deflate_state.encoder_state.flush(); // Make sure we've output everything, and return the number of bytes written if everything // went well. let output_buf_pos = deflate_state.output_buf_pos; let written_to_writer = deflate_state .inner .as_mut() .expect("Missing writer!") .write(&deflate_state.encoder_state.inner_vec()[output_buf_pos..])?; if written_to_writer < deflate_state .output_buf() .len() .checked_sub(output_buf_pos) .unwrap() { deflate_state.output_buf_pos += written_to_writer; } else { // If we sucessfully wrote all the data, we can clear the output buffer. deflate_state.output_buf_pos = 0; deflate_state.output_buf().clear(); deflate_state.needs_flush = false; } Ok(bytes_written) } #[cfg(test)] mod test { use super::*; use crate::test_utils::{decompress_to_end, get_test_data}; #[test] /// Test compressing a short string using fixed encoding. fn fixed_string_mem() { let test_data = String::from(" GNU GENERAL PUBLIC LICENSE").into_bytes(); let compressed = compress_data_fixed(&test_data); let result = decompress_to_end(&compressed); assert_eq!(test_data, result); } #[test] fn fixed_data() { let data = vec![190u8; 400]; let compressed = compress_data_fixed(&data); let result = decompress_to_end(&compressed); assert_eq!(data, result); } /// Test deflate example. /// /// Check if the encoder produces the same code as the example given by Mark Adler here: /// https://stackoverflow.com/questions/17398931/deflate-encoding-with-static-huffman-codes/17415203 #[test] fn fixed_example() { let test_data = b"Deflate late"; // let check = // [0x73, 0x49, 0x4d, 0xcb, 0x49, 0x2c, 0x49, 0x55, 0xc8, 0x49, 0x2c, 0x49, 0x5, 0x0]; let check = [ 0x73, 0x49, 0x4d, 0xcb, 0x49, 0x2c, 0x49, 0x55, 0x00, 0x11, 0x00, ]; let compressed = compress_data_fixed(test_data); assert_eq!(&compressed, &check); let decompressed = decompress_to_end(&compressed); assert_eq!(&decompressed, test_data) } #[test] /// Test compression from a file. fn fixed_string_file() { let input = get_test_data(); let compressed = compress_data_fixed(&input); println!("Fixed codes compressed len: {}", compressed.len()); let result = decompress_to_end(&compressed); assert_eq!(input.len(), result.len()); // Not using assert_eq here deliberately to avoid massive amounts of output spam. 
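        // (A failing assert_eq! would print both operands in full - here two
        // buffers the size of the whole test file - while assert! only
        // reports that the comparison returned false.)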
assert!(input == result); } } deflate-1.0.0/src/compression_options.rs000064400000000000000000000167020072674642500165350ustar 00000000000000//! This module contains the various options to tweak how compression is performed. //! //! Note that due to the nature of the `DEFLATE` format, lower compression levels //! may for some data compress better than higher compression levels. //! //! For applications where a maximum level of compression (irrespective of compression //! speed) is required, consider using the [`Zopfli`](https://crates.io/crates/zopfli) //! compressor, which uses a specialised (but slow) algorithm to figure out the maximum //! of compression for the provided data. //! use crate::lz77::MatchingType; use std::convert::From; pub const HIGH_MAX_HASH_CHECKS: u16 = 1768; pub const HIGH_LAZY_IF_LESS_THAN: u16 = 128; /// The maximum number of hash checks that make sense as this is the length /// of the hash chain. pub const MAX_HASH_CHECKS: u16 = 32 * 1024; pub const DEFAULT_MAX_HASH_CHECKS: u16 = 128; pub const DEFAULT_LAZY_IF_LESS_THAN: u16 = 32; /// An enum describing the level of compression to be used by the encoder /// /// Higher compression ratios will take longer to encode. /// /// This is a simplified interface to specify a compression level. /// /// [See also `CompressionOptions`](./struct.CompressionOptions.html) which provides for /// tweaking the settings more finely. #[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum Compression { /// Fast minimal compression (`CompressionOptions::fast()`). Fast, /// Default level (`CompressionOptions::default()`). Default, /// Higher compression level (`CompressionOptions::high()`). /// /// Best in this context isn't actually the highest possible level /// the encoder can do, but is meant to emulate the `Best` setting in the `Flate2` /// library. Best, } impl Default for Compression { fn default() -> Compression { Compression::Default } } /// Enum allowing some special options (not implemented yet)! #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] pub enum SpecialOptions { /// Compress normally. Normal, /// Force fixed Huffman tables. (Unimplemented!). _ForceFixed, /// Force stored (uncompressed) blocks only. (Unimplemented!). _ForceStored, } impl Default for SpecialOptions { fn default() -> SpecialOptions { SpecialOptions::Normal } } pub const DEFAULT_OPTIONS: CompressionOptions = CompressionOptions { max_hash_checks: DEFAULT_MAX_HASH_CHECKS, lazy_if_less_than: DEFAULT_LAZY_IF_LESS_THAN, matching_type: MatchingType::Lazy, special: SpecialOptions::Normal, }; /// A struct describing the options for a compressor or compression function. /// /// These values are not stable and still subject to change! #[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)] pub struct CompressionOptions { /// The maximum number of checks to make in the hash table for matches. /// /// Higher numbers mean slower, but better compression. Very high (say `>1024`) values /// will impact compression speed a lot. The maximum match length is 2^15, so values higher than /// this won't make any difference, and will be truncated to 2^15 by the compression /// function/writer. /// /// Default value: `128` pub max_hash_checks: u16, // pub _window_size: u16, /// Only lazy match if we have a length less than this value. /// /// Higher values degrade compression slightly, but improve compression speed. /// /// * `0`: Never lazy match. (Same effect as setting `MatchingType` to greedy, but may be slower). 
    /// * `1...257`: Only check for a better match if the first match was shorter than this value.
    /// * `258`: Always lazy match.
    ///
    /// As the maximum length of a match is `258`, values higher than this will have
    /// no further effect.
    ///
    /// * Default value: `32`
    pub lazy_if_less_than: u16,

    // pub _decent_match: u16,
    /// Whether to use lazy or greedy matching.
    ///
    /// Lazy matching will provide better compression, at the expense of compression speed.
    ///
    /// As a special case, if max_hash_checks is set to 0, and matching_type is set to lazy,
    /// compression using only run-length encoding (i.e. a maximum match distance of 1) is
    /// performed. (This may be changed in the future, but is defined like this at the moment
    /// to avoid API breakage.)
    ///
    /// [See `MatchingType`](./enum.MatchingType.html)
    ///
    /// * Default value: `MatchingType::Lazy`
    pub matching_type: MatchingType,
    /// Force fixed/stored blocks (not implemented yet).
    /// * Default value: `SpecialOptions::Normal`
    pub special: SpecialOptions,
}

// Some standard profiles for the compression options.
// Ord should be implemented at some point, but won't be until the struct is stabilised.
impl CompressionOptions {
    /// Returns compression settings roughly corresponding to the `HIGH(9)` setting in miniz.
    pub const fn high() -> CompressionOptions {
        CompressionOptions {
            max_hash_checks: HIGH_MAX_HASH_CHECKS,
            lazy_if_less_than: HIGH_LAZY_IF_LESS_THAN,
            matching_type: MatchingType::Lazy,
            special: SpecialOptions::Normal,
        }
    }

    /// Returns a fast set of compression settings.
    ///
    /// Ideally this should roughly correspond to the `FAST(1)` setting in miniz.
    /// However, that setting makes miniz use a somewhat different algorithm,
    /// so currently the fast level in this library is slower, but compresses better,
    /// than the corresponding level in miniz.
    pub const fn fast() -> CompressionOptions {
        CompressionOptions {
            max_hash_checks: 1,
            lazy_if_less_than: 0,
            matching_type: MatchingType::Greedy,
            special: SpecialOptions::Normal,
        }
    }

    /// Returns a set of compression settings that makes the compressor only compress using
    /// Huffman coding. (Ignoring any length/distance matching.)
    ///
    /// This will normally have the worst compression ratio (besides only using uncompressed data),
    /// but may be the fastest method in some cases.
    pub const fn huffman_only() -> CompressionOptions {
        CompressionOptions {
            max_hash_checks: 0,
            lazy_if_less_than: 0,
            matching_type: MatchingType::Greedy,
            special: SpecialOptions::Normal,
        }
    }

    /// Returns a set of compression settings that makes the compressor compress only using
    /// run-length encoding (i.e. only looking for matches one byte back).
    ///
    /// This is very fast, but tends to compress worse than the slower settings, which look for
    /// more matches using hash chains.
    /// Works best on data that has runs of equivalent bytes, like binary or simple images;
    /// it is less effective on text.
    pub const fn rle() -> CompressionOptions {
        CompressionOptions {
            max_hash_checks: 0,
            lazy_if_less_than: 0,
            matching_type: MatchingType::Lazy,
            special: SpecialOptions::Normal,
        }
    }
}

impl Default for CompressionOptions {
    /// Returns the options describing the default compression level.
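    ///
    /// A minimal, hedged usage sketch (it assumes `CompressionOptions` is
    /// re-exported at the crate root; the default values shown match the
    /// constants defined at the top of this module):
    ///
    /// ```ignore
    /// use deflate::CompressionOptions;
    ///
    /// let options = CompressionOptions::default();
    /// assert_eq!(options.max_hash_checks, 128);  // DEFAULT_MAX_HASH_CHECKS
    /// assert_eq!(options.lazy_if_less_than, 32); // DEFAULT_LAZY_IF_LESS_THAN
    /// ```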
fn default() -> CompressionOptions { DEFAULT_OPTIONS } } impl From for CompressionOptions { fn from(compression: Compression) -> CompressionOptions { match compression { Compression::Fast => CompressionOptions::fast(), Compression::Default => CompressionOptions::default(), Compression::Best => CompressionOptions::high(), } } } deflate-1.0.0/src/deflate_state.rs000064400000000000000000000121310072674642500152150ustar 00000000000000use std::io::Write; use std::{cmp, io, mem}; use crate::compress::Flush; use crate::compression_options::{CompressionOptions, MAX_HASH_CHECKS}; use crate::encoder_state::EncoderState; pub use crate::huffman_table::MAX_MATCH; use crate::huffman_table::NUM_LITERALS_AND_LENGTHS; use crate::input_buffer::InputBuffer; use crate::length_encode::{EncodedLength, LeafVec}; use crate::lz77::LZ77State; use crate::output_writer::DynamicWriter; /// A counter used for checking values in debug mode. /// Does nothing when debug assertions are disabled. #[derive(Default)] pub struct DebugCounter { #[cfg(debug_assertions)] count: u64, } impl DebugCounter { #[cfg(debug_assertions)] pub const fn get(&self) -> u64 { self.count } #[cfg(not(debug_assertions))] pub const fn get(&self) -> u64 { 0 } #[cfg(debug_assertions)] pub fn reset(&mut self) { self.count = 0; } #[cfg(not(debug_assertions))] pub fn reset(&self) {} #[cfg(debug_assertions)] pub fn add(&mut self, val: u64) { self.count += val; } #[cfg(not(debug_assertions))] pub fn add(&self, _: u64) {} } pub struct LengthBuffers { pub leaf_buf: LeafVec, pub length_buf: Vec, } impl LengthBuffers { #[inline] fn new() -> LengthBuffers { LengthBuffers { leaf_buf: Vec::with_capacity(NUM_LITERALS_AND_LENGTHS), length_buf: Vec::with_capacity(19), } } } /// A struct containing all the stored state used for the encoder. pub struct DeflateState { /// State of lz77 compression. pub lz77_state: LZ77State, pub input_buffer: InputBuffer, pub compression_options: CompressionOptions, /// State the Huffman part of the compression and the output buffer. pub encoder_state: EncoderState, /// The buffer containing the raw output of the lz77-encoding. pub lz77_writer: DynamicWriter, /// Buffers used when generating Huffman code lengths. pub length_buffers: LengthBuffers, /// Total number of bytes consumed/written to the input buffer. pub bytes_written: u64, /// Wrapped writer. /// Option is used to allow us to implement `Drop` and `finish()` at the same time for the /// writer structs. pub inner: Option, /// The position in the output buffer where data should be flushed from, to keep track of /// what data has been output in case not all data is output when writing to the wrapped /// writer. pub output_buf_pos: usize, pub flush_mode: Flush, /// Whether we need to flush everything before continuing. /// Currently only used after having output a sync flush. /// This is implemented in a somewhat clunky manner at the moment, /// ideally it should be done in a more fail-safe way to avoid /// further bugs. pub needs_flush: bool, /// Number of bytes written as calculated by sum of block input lengths. /// Used to check that they are correct when `debug_assertions` are enabled. 
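    ///
    /// (As defined above, `DebugCounter` holds no data unless
    /// `debug_assertions` are enabled, so in release builds this field is
    /// zero-sized and its methods compile to no-ops.)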
pub bytes_written_control: DebugCounter, } impl DeflateState { pub fn new(compression_options: CompressionOptions, writer: W) -> DeflateState { DeflateState { input_buffer: InputBuffer::empty(), lz77_state: LZ77State::new( compression_options.max_hash_checks, cmp::min(compression_options.lazy_if_less_than, MAX_HASH_CHECKS), compression_options.matching_type, ), encoder_state: EncoderState::new(Vec::with_capacity(1024 * 32)), lz77_writer: DynamicWriter::new(), length_buffers: LengthBuffers::new(), compression_options, bytes_written: 0, inner: Some(writer), output_buf_pos: 0, flush_mode: Flush::None, needs_flush: false, bytes_written_control: DebugCounter::default(), } } #[inline] pub fn output_buf(&mut self) -> &mut Vec { self.encoder_state.inner_vec() } /// Resets the status of the decoder, leaving the compression options intact /// /// If flushing the current writer succeeds, it is replaced with the provided one, /// buffers and status (except compression options) is reset and the old writer /// is returned. /// /// If flushing fails, the rest of the writer is not cleared. pub fn reset(&mut self, writer: W) -> io::Result { self.encoder_state.flush(); self.inner .as_mut() .expect("Missing writer!") .write_all(self.encoder_state.inner_vec())?; self.encoder_state.inner_vec().clear(); self.input_buffer = InputBuffer::empty(); self.lz77_writer.clear(); self.lz77_state.reset(); self.bytes_written = 0; self.output_buf_pos = 0; self.flush_mode = Flush::None; self.needs_flush = false; if cfg!(debug_assertions) { self.bytes_written_control.reset(); } mem::replace(&mut self.inner, Some(writer)) .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "Missing writer")) } } deflate-1.0.0/src/encoder_state.rs000064400000000000000000000107450072674642500152410ustar 00000000000000use crate::bitstream::LsbWriter; use crate::huffman_table::HuffmanTable; use crate::lzvalue::LZType; #[cfg(test)] use std::mem; // The first bits of each block, which describe the type of the block // `-TTF` - TT = type, 00 = stored, 01 = fixed, 10 = dynamic, 11 = reserved, F - 1 if final block // `0000`; const FIXED_FIRST_BYTE: u16 = 0b010; const FIXED_FIRST_BYTE_FINAL: u16 = 0b011; const DYNAMIC_FIRST_BYTE: u16 = 0b100; const DYNAMIC_FIRST_BYTE_FINAL: u16 = 0b101; #[allow(dead_code)] pub enum BType { NoCompression = 0b00, FixedHuffman = 0b01, DynamicHuffman = 0b10, // Reserved = 0b11, //Error } /// A struct wrapping a writer that writes data compressed using the provided Huffman table pub struct EncoderState { pub huffman_table: HuffmanTable, pub writer: LsbWriter, } impl EncoderState { /// Creates a new encoder state using the provided Huffman table and writer pub fn new(writer: Vec) -> EncoderState { EncoderState { huffman_table: HuffmanTable::empty(), writer: LsbWriter::new(writer), } } #[cfg(test)] /// Creates a new encoder state using the fixed Huffman table pub fn fixed(writer: Vec) -> EncoderState { EncoderState { huffman_table: HuffmanTable::fixed_table(), writer: LsbWriter::new(writer), } } pub fn inner_vec(&mut self) -> &mut Vec { &mut self.writer.w } /// Encodes a literal value to the writer fn write_literal(&mut self, value: u8) { let code = self.huffman_table.get_literal(value); debug_assert!(code.length > 0); self.writer.write_bits(code.code, code.length); } /// Write a LZvalue to the contained writer, returning Err if the write operation fails pub fn write_lzvalue(&mut self, value: LZType) { match value { LZType::Literal(l) => self.write_literal(l), LZType::StoredLengthDistance(l, d) => { let (code, extra_bits_code) 
                    = self.huffman_table.get_length_huffman(l);
                debug_assert!(
                    code.length != 0,
                    "Code: {:?}, Value: {:?}",
                    code,
                    value
                );
                self.writer.write_bits(code.code, code.length);
                self.writer
                    .write_bits(extra_bits_code.code, extra_bits_code.length);

                let (code, extra_bits_code) = self.huffman_table.get_distance_huffman(d);
                debug_assert!(
                    code.length != 0,
                    "Code: {:?}, Value: {:?}",
                    code,
                    value
                );
                self.writer.write_bits(code.code, code.length);
                self.writer
                    .write_bits(extra_bits_code.code, extra_bits_code.length)
            }
        };
    }

    /// Write the start of a block.
    pub fn write_start_of_block(&mut self, fixed: bool, final_block: bool) {
        if final_block {
            // The final block has one bit flipped to indicate it's
            // the final one
            if fixed {
                self.writer.write_bits(FIXED_FIRST_BYTE_FINAL, 3)
            } else {
                self.writer.write_bits(DYNAMIC_FIRST_BYTE_FINAL, 3)
            }
        } else if fixed {
            self.writer.write_bits(FIXED_FIRST_BYTE, 3)
        } else {
            self.writer.write_bits(DYNAMIC_FIRST_BYTE, 3)
        }
    }

    /// Write the end of block code
    pub fn write_end_of_block(&mut self) {
        let code = self.huffman_table.get_end_of_block();
        self.writer.write_bits(code.code, code.length)
    }

    /// Flush the contained writer and its bitstream wrapper.
    pub fn flush(&mut self) {
        self.writer.flush_raw()
    }

    pub fn set_huffman_to_fixed(&mut self) {
        self.huffman_table.set_to_fixed()
    }

    /// Reset the encoder state with a new writer, returning the old one.
    #[cfg(test)]
    pub fn reset(&mut self, writer: Vec<u8>) -> Vec<u8> {
        // Make sure the writer is flushed
        // Ideally this should be done before this function is called, but we
        // do it here just in case.
        self.flush();
        // Reset the huffman table
        // This probably isn't needed, but again, we do it just in case to avoid leaking any data
        // If this turns out to be a performance issue, it can probably be ignored later.
        self.huffman_table = HuffmanTable::empty();
        mem::replace(&mut self.writer.w, writer)
    }
}
deflate-1.0.0/src/huffman_lengths.rs000064400000000000000000000340730072674642500155720ustar 00000000000000
use crate::bitstream::LsbWriter;
use crate::deflate_state::LengthBuffers;
use crate::huffman_table::{
    create_codes_in_place, num_extra_bits_for_distance_code, num_extra_bits_for_length_code,
    HuffmanTable, FIXED_CODE_LENGTHS, LENGTH_BITS_START, MAX_CODE_LENGTH, NUM_DISTANCE_CODES,
    NUM_LITERALS_AND_LENGTHS,
};
use crate::length_encode::{
    encode_lengths_m, huffman_lengths_from_frequency_m, EncodedLength, COPY_PREVIOUS,
    REPEAT_ZERO_3_BITS, REPEAT_ZERO_7_BITS,
};
use crate::output_writer::FrequencyType;
use crate::stored_block::MAX_STORED_BLOCK_LENGTH;

use std::cmp;

/// The minimum number of literal/length values
pub const MIN_NUM_LITERALS_AND_LENGTHS: usize = 257;
/// The minimum number of distances
pub const MIN_NUM_DISTANCES: usize = 1;

const NUM_HUFFMAN_LENGTHS: usize = 19;

/// The output ordering of the lengths for the Huffman codes used to encode the lengths
/// used to build the full Huffman tree for length/literal codes.
/// http://www.gzip.org/zlib/rfc-deflate.html#dyn
const HUFFMAN_LENGTH_ORDER: [u8; NUM_HUFFMAN_LENGTHS] = [
    16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15,
];

// Number of bits used for the values specifying the number of codes
const HLIT_BITS: u8 = 5;
const HDIST_BITS: u8 = 5;
const HCLEN_BITS: u8 = 4;

/// The longest a Huffman code describing another Huffman length can be
const MAX_HUFFMAN_CODE_LENGTH: usize = 7;

// How many bytes (not including padding and the 3-bit block type) the stored block header takes up.
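// (An illustrative note, not from the original source: per RFC 1951 a stored block is
// framed as the 3 header bits, padding up to the next byte boundary, then LEN (2 bytes,
// little-endian) and NLEN (2 bytes, the one's complement of LEN) before the raw data,
// which is where the 4 bytes below come from.)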
const STORED_BLOCK_HEADER_LENGTH: u64 = 4;
const BLOCK_MARKER_LENGTH: u8 = 3;

/// Creates a new slice from the input slice that stops at the final non-zero value
pub fn remove_trailing_zeroes<T: From<u8> + PartialEq>(input: &[T], min_length: usize) -> &[T] {
    let num_zeroes = input.iter().rev().take_while(|&a| *a == T::from(0)).count();
    &input[0..cmp::max(input.len() - num_zeroes, min_length)]
}

/// How many extra bits the Huffman length code uses to represent a value.
fn extra_bits_for_huffman_length_code(code: u8) -> u8 {
    match code {
        16..=17 => 3,
        18 => 7,
        _ => 0,
    }
}

/// Calculate how many bits the Huffman-encoded Huffman lengths will use.
fn calculate_huffman_length(frequencies: &[FrequencyType], code_lengths: &[u8]) -> u64 {
    frequencies
        .iter()
        .zip(code_lengths)
        .enumerate()
        .fold(0, |acc, (n, (&f, &l))| {
            acc + (u64::from(f)
                * (u64::from(l) + u64::from(extra_bits_for_huffman_length_code(n as u8))))
        })
}

/// Calculate how many bits data with the given frequencies will use when compressed with dynamic
/// code lengths (first return value) and static code lengths (second return value).
///
/// Parameters:
/// Frequencies, lengths of dynamic codes, and a function to get how many extra bits in addition
/// to the length of the Huffman code the symbol will use.
fn calculate_block_length<F>(
    frequencies: &[FrequencyType],
    dyn_code_lengths: &[u8],
    get_num_extra_bits: &F,
) -> (u64, u64)
where
    F: Fn(usize) -> u64,
{
    // Length of data represented by dynamic codes.
    let mut d_ll_length = 0u64;
    // Length of data represented by static codes.
    let mut s_ll_length = 0u64;

    let iter = frequencies
        .iter()
        .zip(dyn_code_lengths.iter().zip(FIXED_CODE_LENGTHS.iter()))
        .enumerate();

    // This could maybe be optimised a bit by splitting the iteration of codes using extra bits and
    // codes not using extra bits, but the extra complexity may not be worth it.
    for (c, (&f, (&l, &fl))) in iter {
        // Frequency
        let f = u64::from(f);
        // How many extra bits the current code number needs.
        let extra_bits_for_code = get_num_extra_bits(c);

        d_ll_length += f * (u64::from(l) + extra_bits_for_code);
        s_ll_length += f * (u64::from(fl) + extra_bits_for_code);
    }

    (d_ll_length, s_ll_length)
}

/// Get how many extra padding bits a stored block would use after a block start header.
///
/// # Panics
/// Panics if `pending_bits > 8`
fn stored_padding(pending_bits: u8) -> u64 {
    assert!(pending_bits <= 8);
    let free_space = 8 - pending_bits;
    if free_space >= BLOCK_MARKER_LENGTH {
        // There is space in the current byte for the header.
        free_space - BLOCK_MARKER_LENGTH
    } else {
        // The header will require an extra byte.
        8 - (BLOCK_MARKER_LENGTH - free_space)
    }
    .into()
}

/// Calculate the number of bits storing the data in stored blocks will take up, excluding the
/// first block start code and potential padding bits. As stored blocks have a maximum length
/// (as opposed to fixed and dynamic ones), multiple blocks may have to be utilised.
///
/// # Panics
/// Panics if `input_bytes` is 0.
fn stored_length(input_bytes: u64) -> u64 {
    // Check how many stored blocks these bytes would take up.
    // (Integer division rounding up.)
    let num_blocks = (input_bytes
        .checked_sub(1)
        .expect("Underflow calculating stored block length!")
        / MAX_STORED_BLOCK_LENGTH as u64)
        + 1;
    // The length will be the input length plus the headers for each block
    // (excluding the start-of-block code for the first one).
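    // (Illustrative example, not from the original source: assuming MAX_STORED_BLOCK_LENGTH
    // is the format's 16-bit limit of 65535 bytes, an input of 100_000 bytes needs
    // num_blocks = ((100_000 - 1) / 65535) + 1 = 2 blocks, giving
    // (100_000 + 4 * 2 + 1) * 8 bits: the data, two LEN/NLEN headers, and one extra
    // block-start marker byte counted for the second block.)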
    (input_bytes + (STORED_BLOCK_HEADER_LENGTH as u64 * num_blocks) + (num_blocks - 1)) * 8
}

pub enum BlockType {
    Stored,
    Fixed,
    Dynamic(DynamicBlockHeader),
}

/// A struct containing the different data needed to write the header for a dynamic block.
///
/// The code lengths are stored directly in the `HuffmanTable` struct.
/// TODO: Do the same for other things here.
pub struct DynamicBlockHeader {
    /// Length of the run-length encoding symbols.
    pub huffman_table_lengths: Vec<u8>,
    /// Number of lengths for values describing the Huffman table that encodes the length values
    /// of the main Huffman tables.
    pub used_hclens: usize,
}

/// Generate the lengths of the Huffman codes we will be using, using the
/// frequency of the different symbols/lengths/distances, and determine what block type will give
/// the shortest representation.
/// TODO: This needs a test
pub fn gen_huffman_lengths(
    l_freqs: &[FrequencyType],
    d_freqs: &[FrequencyType],
    num_input_bytes: u64,
    pending_bits: u8,
    l_lengths: &mut [u8; 288],
    d_lengths: &mut [u8; 32],
    length_buffers: &mut LengthBuffers,
) -> BlockType {
    // Avoid corner cases and issues if this is called for an empty block.
    // For blocks this short, a fixed block will be the shortest.
    // TODO: Find the minimum value it's worth doing calculations for.
    if num_input_bytes <= 4 {
        return BlockType::Fixed;
    };

    let l_freqs = remove_trailing_zeroes(l_freqs, MIN_NUM_LITERALS_AND_LENGTHS);
    let d_freqs = remove_trailing_zeroes(d_freqs, MIN_NUM_DISTANCES);

    // The huffman spec allows us to exclude zeroes at the end of the
    // table of huffman lengths.
    // Since a frequency of 0 will give a huffman length of 0, we strip off
    // the trailing zeroes before even generating the lengths to save some work.
    // There is however a minimum number of values we have to keep
    // according to the deflate spec.
    // TODO: We could probably compute some of this in parallel.
    huffman_lengths_from_frequency_m(
        l_freqs,
        MAX_CODE_LENGTH,
        &mut length_buffers.leaf_buf,
        l_lengths,
    );
    huffman_lengths_from_frequency_m(
        d_freqs,
        MAX_CODE_LENGTH,
        &mut length_buffers.leaf_buf,
        d_lengths,
    );

    let used_lengths = l_freqs.len();
    let used_distances = d_freqs.len();

    // Encode length values
    let mut freqs = [0u16; 19];
    encode_lengths_m(
        l_lengths[..used_lengths]
            .iter()
            .chain(&d_lengths[..used_distances]),
        &mut length_buffers.length_buf,
        &mut freqs,
    );

    // Create huffman lengths for the length/distance code lengths
    let mut huffman_table_lengths = vec![0; freqs.len()];
    huffman_lengths_from_frequency_m(
        &freqs,
        MAX_HUFFMAN_CODE_LENGTH,
        &mut length_buffers.leaf_buf,
        huffman_table_lengths.as_mut_slice(),
    );

    // Count how many of these lengths we use.
    let used_hclens = HUFFMAN_LENGTH_ORDER.len()
        - HUFFMAN_LENGTH_ORDER
            .iter()
            .rev()
            .take_while(|&&n| huffman_table_lengths[n as usize] == 0)
            .count();

    // There has to be at least 4 hclens, so if there isn't, something went wrong.
    debug_assert!(used_hclens >= 4);

    // Calculate how many bits of space this block will take up with the different block types
    // (excluding the 3-bit block header since it's used in all block types).

    // Total length of the compressed literals/lengths.
    let (d_ll_length, s_ll_length) = calculate_block_length(l_freqs, l_lengths, &|c| {
        num_extra_bits_for_length_code(c.saturating_sub(LENGTH_BITS_START as usize) as u8).into()
    });
    // Total length of the compressed distances.
    let (d_dist_length, s_dist_length) = calculate_block_length(d_freqs, d_lengths, &|c| {
        num_extra_bits_for_distance_code(c as u8).into()
    });

    // Total length of the compressed huffman code lengths.
    let huff_table_length = calculate_huffman_length(&freqs, &huffman_table_lengths);

    // For dynamic blocks the huffman tables take up some extra space.
    let dynamic_length = d_ll_length
        + d_dist_length
        + huff_table_length
        + (used_hclens as u64 * 3)
        + u64::from(HLIT_BITS)
        + u64::from(HDIST_BITS)
        + u64::from(HCLEN_BITS);

    // Static blocks don't have any extra header data.
    let static_length = s_ll_length + s_dist_length;

    // Calculate how many bits it will take to store the data in uncompressed (stored) block(s).
    let stored_length = stored_length(num_input_bytes) + stored_padding(pending_bits % 8);

    let used_length = cmp::min(cmp::min(dynamic_length, static_length), stored_length);

    // Check if the block is actually compressed. If using a dynamic block
    // increases the length of the block (for instance if the input data is mostly random or
    // already compressed), we want to output a stored (uncompressed) block instead to avoid
    // wasting space.
    if used_length == static_length {
        BlockType::Fixed
    } else if used_length == stored_length {
        BlockType::Stored
    } else {
        BlockType::Dynamic(DynamicBlockHeader {
            huffman_table_lengths,
            used_hclens,
        })
    }
}

/// Write the specified Huffman lengths to the bit writer
pub fn write_huffman_lengths(
    header: &DynamicBlockHeader,
    huffman_table: &HuffmanTable,
    encoded_lengths: &[EncodedLength],
    writer: &mut LsbWriter,
) {
    // Ignore trailing zero lengths as allowed by the deflate spec.
    let (literal_len_lengths, distance_lengths) = huffman_table.get_lengths();
    let literal_len_lengths =
        remove_trailing_zeroes(literal_len_lengths, MIN_NUM_LITERALS_AND_LENGTHS);
    let distance_lengths = remove_trailing_zeroes(distance_lengths, MIN_NUM_DISTANCES);

    let huffman_table_lengths = &header.huffman_table_lengths;
    let used_hclens = header.used_hclens;

    assert!(literal_len_lengths.len() <= NUM_LITERALS_AND_LENGTHS);
    assert!(literal_len_lengths.len() >= MIN_NUM_LITERALS_AND_LENGTHS);
    assert!(distance_lengths.len() <= NUM_DISTANCE_CODES);
    assert!(distance_lengths.len() >= MIN_NUM_DISTANCES);

    // Number of length codes - 257.
    let hlit = (literal_len_lengths.len() - MIN_NUM_LITERALS_AND_LENGTHS) as u16;
    writer.write_bits(hlit, HLIT_BITS);
    // Number of distance codes - 1.
    let hdist = (distance_lengths.len() - MIN_NUM_DISTANCES) as u16;
    writer.write_bits(hdist, HDIST_BITS);

    // Number of huffman table lengths - 4.
    let hclen = used_hclens.saturating_sub(4);

    // Write HCLEN.
    // Casting to u16 is safe since the length can never be more than the length of
    // `HUFFMAN_LENGTH_ORDER` anyhow.
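    // (Illustrative note, not from the original source: at this point the dynamic block
    // header consists of HLIT (5 bits) and HDIST (5 bits) written above, followed by
    // HCLEN (4 bits) written below and then `used_hclens` code lengths of 3 bits each,
    // matching the layout in RFC 1951 section 3.2.7.)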
writer.write_bits(hclen as u16, HCLEN_BITS); // Write the lengths for the huffman table describing the huffman table // Each length is 3 bits for n in &HUFFMAN_LENGTH_ORDER[..used_hclens] { writer.write_bits(u16::from(huffman_table_lengths[usize::from(*n)]), 3); } // Generate codes for the main huffman table using the lengths we just wrote let mut codes = [0u16; NUM_HUFFMAN_LENGTHS]; create_codes_in_place(&mut codes[..], huffman_table_lengths); // Write the actual huffman lengths for v in encoded_lengths { match *v { EncodedLength::Length(n) => { let (c, l) = (codes[usize::from(n)], huffman_table_lengths[usize::from(n)]); writer.write_bits(c, l); } EncodedLength::CopyPrevious(n) => { let (c, l) = (codes[COPY_PREVIOUS], huffman_table_lengths[COPY_PREVIOUS]); writer.write_bits(c, l); debug_assert!(n >= 3); debug_assert!(n <= 6); writer.write_bits((n - 3).into(), 2); } EncodedLength::RepeatZero3Bits(n) => { let (c, l) = ( codes[REPEAT_ZERO_3_BITS], huffman_table_lengths[REPEAT_ZERO_3_BITS], ); writer.write_bits(c, l); debug_assert!(n >= 3); writer.write_bits((n - 3).into(), 3); } EncodedLength::RepeatZero7Bits(n) => { let (c, l) = ( codes[REPEAT_ZERO_7_BITS], huffman_table_lengths[REPEAT_ZERO_7_BITS], ); writer.write_bits(c, l); debug_assert!(n >= 11); debug_assert!(n <= 138); writer.write_bits((n - 11).into(), 7); } } } } #[cfg(test)] mod test { use super::stored_padding; #[test] fn padding() { assert_eq!(stored_padding(0), 5); assert_eq!(stored_padding(1), 4); assert_eq!(stored_padding(2), 3); assert_eq!(stored_padding(3), 2); assert_eq!(stored_padding(4), 1); assert_eq!(stored_padding(5), 0); assert_eq!(stored_padding(6), 7); assert_eq!(stored_padding(7), 6); } } deflate-1.0.0/src/huffman_table.rs000064400000000000000000000514370072674642500152200ustar 00000000000000use crate::bit_reverse::reverse_bits; use crate::lzvalue::StoredLength; use std::fmt; /// The number of length codes in the Huffman table pub const NUM_LENGTH_CODES: usize = 29; /// The number of distance codes in the distance Huffman table // NOTE: two mode codes are actually used when constructing codes pub const NUM_DISTANCE_CODES: usize = 30; /// Combined number of literal and length codes // NOTE: two mode codes are actually used when constructing codes pub const NUM_LITERALS_AND_LENGTHS: usize = 286; /// The maximum length of a Huffman code pub const MAX_CODE_LENGTH: usize = 15; /// The minimum and maximum lengths for a match according to the DEFLATE specification pub const MIN_MATCH: u16 = 3; pub const MAX_MATCH: u16 = 258; #[cfg(test)] pub const MIN_DISTANCE: u16 = 1; pub const MAX_DISTANCE: u16 = 32768; /// The position in the literal/length table of the end of block symbol pub const END_OF_BLOCK_POSITION: usize = 256; /// Bit lengths for literal and length codes in the fixed Huffman table /// The Huffman codes are generated from this and the distance bit length table pub static FIXED_CODE_LENGTHS: [u8; NUM_LITERALS_AND_LENGTHS + 2] = [ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, ]; /// The number of extra bits for the length codes const LENGTH_EXTRA_BITS_LENGTH: [u8; NUM_LENGTH_CODES] = [ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, ]; /// Table used to get a code from a length value (see get_distance_code_and_extra_bits) const LENGTH_CODE: [u8; 256] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, ]; /// Base values to calculate the value of the bits in length codes const BASE_LENGTH: [u8; NUM_LENGTH_CODES] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 255, ]; // 258 - MIN_MATCh /// What number in the literal/length table the lengths start at pub const LENGTH_BITS_START: u16 = 257; /// Lengths for the distance codes in the pre-defined/fixed Huffman table /// (All distance codes are 5 bits long) pub const FIXED_CODE_LENGTHS_DISTANCE: [u8; NUM_DISTANCE_CODES + 2] = [5; NUM_DISTANCE_CODES + 2]; const DISTANCE_CODES: [u8; 512] = [ 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 0, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 
    26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
    27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28,
    28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
    28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
    28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
    29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
    29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
    29, 29, 29, 29, 29, 29, 29,
];

/// Number of extra bits following the distance codes
#[cfg(test)]
const DISTANCE_EXTRA_BITS: [u8; NUM_DISTANCE_CODES] = [
    0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
    13,
];

const DISTANCE_BASE: [u16; NUM_DISTANCE_CODES] = [
    0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536,
    2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576,
];

pub const fn num_extra_bits_for_length_code(code: u8) -> u8 {
    LENGTH_EXTRA_BITS_LENGTH[code as usize]
}

/// Get the number of extra bits used for a distance code.
/// (Code numbers above `NUM_DISTANCE_CODES` will give some garbage
/// value.)
pub fn num_extra_bits_for_distance_code(code: u8) -> u8 {
    // This can be easily calculated without a lookup.
    let mut c = code >> 1;
    c -= (c != 0) as u8;
    c
}

/// A struct representing the data needed to generate the bit codes for
/// a given value and Huffman table.
#[derive(Copy, Clone)]
struct ExtraBits {
    /// The position of the length in the Huffman table.
    pub code_number: u16,
    /// Number of extra bits following the code.
    pub num_bits: u8,
    /// The value of the extra bits, which together with the length/distance code
    /// allow us to calculate the exact length/distance.
    pub value: u16,
}

/// Get the length code that corresponds to the length value
/// Panics if length is out of range.
pub fn get_length_code(length: u16) -> usize {
    // Going via a u8 here helps the compiler evade bounds checking.
    usize::from(LENGTH_CODE[(length.wrapping_sub(MIN_MATCH)) as u8 as usize])
        + LENGTH_BITS_START as usize
}

/// Get the code for the Huffman table and the extra bits for the requested length.
fn get_length_code_and_extra_bits(length: StoredLength) -> ExtraBits {
    // Length values are stored as unsigned bytes, where the actual length is the value - 3
    // The `StoredLength` struct takes care of this conversion for us.
    let n = LENGTH_CODE[length.stored_length() as usize];

    // We can then get the base length from the base length table,
    // which we use to calculate the value of the extra bits.
    let base = BASE_LENGTH[n as usize];
    let num_bits = num_extra_bits_for_length_code(n);
    ExtraBits {
        code_number: n as u16 + LENGTH_BITS_START,
        num_bits,
        value: (length.stored_length() - base) as u16,
    }
}

/// Get the spot in the Huffman table for distances `distance` corresponds to.
/// Returns 0 if the distance is invalid.
/// (We avoid an `Option` here for simplicity and performance, as this being called with an
/// invalid value would be a bug.)
pub fn get_distance_code(distance: u16) -> u8 {
    let distance = distance as usize;

    match distance {
        // Since the array starts at 0, we need to subtract 1 to get the correct code number.
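        // (Illustrative example, not from the original source: for the first arm below,
        // get_distance_code(1) == DISTANCE_CODES[0] == 0, and for the second arm a distance
        // of 4733 gives DISTANCE_CODES[256 + ((4733 - 1) >> 7)] == DISTANCE_CODES[292] == 24,
        // matching the `test_distance_code` test further down.)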
        1..=256 => DISTANCE_CODES[distance - 1],
        // Due to the distribution of the distance codes above 256, we can get away with only
        // using the top bits to determine the code, rather than having a 32k long table of
        // distance codes.
        257..=32768 => DISTANCE_CODES[256 + ((distance - 1) >> 7)],
        _ => 0,
    }
}

fn get_distance_code_and_extra_bits(distance: u16) -> ExtraBits {
    let distance_code = get_distance_code(distance);
    let extra = num_extra_bits_for_distance_code(distance_code);
    // FIXME: We should add 1 to the values in distance_base to avoid having to add one here
    let base = DISTANCE_BASE[distance_code as usize] + 1;
    ExtraBits {
        code_number: distance_code.into(),
        num_bits: extra,
        value: distance - base,
    }
}

#[derive(Copy, Clone, Default)]
pub struct HuffmanCode {
    pub code: u16,
    pub length: u8,
}

impl fmt::Debug for HuffmanCode {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "HuffmanCode {{ code: {:b}, length: {}}}",
            self.code, self.length
        )
    }
}

impl HuffmanCode {
    #[inline]
    /// Create a Huffman code value from a code and length.
    const fn new(code: u16, length: u8) -> HuffmanCode {
        HuffmanCode { code, length }
    }
}

#[cfg(test)]
pub struct LengthAndDistanceBits {
    pub length_code: HuffmanCode,
    pub length_extra_bits: HuffmanCode,
    pub distance_code: HuffmanCode,
    pub distance_extra_bits: HuffmanCode,
}

/// Counts the number of values of each length.
/// Returns a tuple containing the longest length value in the table and its position,
/// and fills in lengths in the `len_counts` slice.
/// Panics if `table` is empty, or if any of the lengths exceed 15.
fn build_length_count_table(table: &[u8], len_counts: &mut [u16; 16]) -> (usize, usize) {
    // TODO: Validate the length table properly in debug mode.
    let max_length = (*table.iter().max().expect("BUG! Empty lengths!")).into();

    assert!(max_length <= MAX_CODE_LENGTH);

    let mut max_length_pos = 0;

    for (n, &length) in table.iter().enumerate() {
        // TODO: Make sure we don't have more of one length than we can make
        // codes for
        if length > 0 {
            len_counts[usize::from(length)] += 1;
            max_length_pos = n;
        }
    }
    (max_length, max_length_pos)
}

/// Generates the Huffman codes in place, given a table of bit lengths.
/// Panics if any of the lengths are > 15.
pub fn create_codes_in_place(code_table: &mut [u16], length_table: &[u8]) {
    let mut len_counts = [0; 16];
    let (max_length, max_length_pos) = build_length_count_table(length_table, &mut len_counts);
    let lengths = len_counts;

    let mut code = 0u16;
    let mut next_code = Vec::with_capacity(length_table.len());
    next_code.push(code);

    for bits in 1..=max_length {
        code = (code + lengths[bits - 1]) << 1;
        next_code.push(code);
    }
    for n in 0..=max_length_pos {
        let length = usize::from(length_table[n]);
        if length != 0 {
            // The algorithm generates the code in the reverse bit order, so we need to reverse them
            // to get the correct codes.
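            // (Illustrative example, not from the original source: canonical code
            // assignment for the lengths {1, 2, 3, 3} yields the MSB-first codes
            // 0, 10, 110 and 111; since `LsbWriter` emits bits LSB-first, a code such
            // as 110 must be stored reversed as 011 so that it appears in the stream in
            // the order a decoder reads it.)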
            code_table[n] = reverse_bits(next_code[length], length as u8);
            // We use wrapping here as we would otherwise overflow on the last code
            // This should be okay as we exit the loop after this so the value is ignored
            next_code[length] = next_code[length].wrapping_add(1);
        }
    }
}

/// A structure containing the tables of Huffman codes for lengths, literals and distances
pub struct HuffmanTable {
    // Literal, end of block and length codes
    codes: [u16; 288],
    code_lengths: [u8; 288],
    // Distance codes
    distance_codes: [u16; 32],
    distance_code_lengths: [u8; 32],
}

impl HuffmanTable {
    pub const fn empty() -> HuffmanTable {
        HuffmanTable {
            codes: [0; 288],
            code_lengths: [0; 288],
            distance_codes: [0; 32],
            distance_code_lengths: [0; 32],
        }
    }

    #[cfg(test)]
    pub fn from_length_tables(
        literals_and_lengths: &[u8; 288],
        distances: &[u8; 32],
    ) -> HuffmanTable {
        let mut table = HuffmanTable {
            codes: [0; 288],
            code_lengths: *literals_and_lengths,
            distance_codes: [0; 32],
            distance_code_lengths: *distances,
        };

        table.update_from_lengths();
        table
    }

    /// Get references to the lengths of the current Huffman codes.
    #[inline]
    pub const fn get_lengths(&self) -> (&[u8; 288], &[u8; 32]) {
        (&self.code_lengths, &self.distance_code_lengths)
    }

    /// Get mutable references to the lengths of the current Huffman codes.
    ///
    /// Used for updating the lengths in place.
    #[inline]
    pub fn get_lengths_mut(&mut self) -> (&mut [u8; 288], &mut [u8; 32]) {
        (&mut self.code_lengths, &mut self.distance_code_lengths)
    }

    /// Update the Huffman codes using the existing length values in the Huffman table.
    pub fn update_from_lengths(&mut self) {
        create_codes_in_place(self.codes.as_mut(), &self.code_lengths[..]);
        create_codes_in_place(
            self.distance_codes.as_mut(),
            &self.distance_code_lengths[..],
        );
    }

    pub fn set_to_fixed(&mut self) {
        self.code_lengths = FIXED_CODE_LENGTHS;
        self.distance_code_lengths = FIXED_CODE_LENGTHS_DISTANCE;
        self.update_from_lengths();
    }

    /// Create a `HuffmanTable` using the fixed tables specified in the DEFLATE format specification.
    #[cfg(test)]
    pub fn fixed_table() -> HuffmanTable {
        // If the fixed tables were somehow wrong, code generation would panic here;
        // the tests should catch that.
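        // (Illustrative note, not from the original source: per RFC 1951 section 3.2.6
        // the fixed literal/length lengths are 8 bits for values 0-143, 9 bits for
        // 144-255, 7 bits for 256-279 and 8 bits for 280-287, while all fixed distance
        // codes are 5 bits.)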
        HuffmanTable::from_length_tables(&FIXED_CODE_LENGTHS, &FIXED_CODE_LENGTHS_DISTANCE)
    }

    #[inline]
    const fn get_ll_huff(&self, value: usize) -> HuffmanCode {
        HuffmanCode::new(self.codes[value], self.code_lengths[value])
    }

    /// Get the Huffman code for the corresponding literal value
    #[inline]
    pub fn get_literal(&self, value: u8) -> HuffmanCode {
        let index = usize::from(value);
        HuffmanCode::new(self.codes[index], self.code_lengths[index])
    }

    /// Get the Huffman code for the end of block value
    #[inline]
    pub const fn get_end_of_block(&self) -> HuffmanCode {
        self.get_ll_huff(END_OF_BLOCK_POSITION)
    }

    /// Get the Huffman code and extra bits for the specified length
    #[inline]
    pub fn get_length_huffman(&self, length: StoredLength) -> (HuffmanCode, HuffmanCode) {
        let length_data = get_length_code_and_extra_bits(length);

        let length_huffman_code = self.get_ll_huff(length_data.code_number as usize);

        (
            length_huffman_code,
            HuffmanCode {
                code: length_data.value,
                length: length_data.num_bits,
            },
        )
    }

    /// Get the Huffman code and extra bits for the specified distance
    ///
    /// The result is only meaningful for distances between 1 and 32768 (inclusive)
    #[inline]
    pub fn get_distance_huffman(&self, distance: u16) -> (HuffmanCode, HuffmanCode) {
        //debug_assert!(distance >= MIN_DISTANCE && distance <= MAX_DISTANCE);

        let distance_data = get_distance_code_and_extra_bits(distance);

        let distance_huffman_code = self.distance_codes[distance_data.code_number as usize];
        let distance_huffman_length =
            self.distance_code_lengths[distance_data.code_number as usize];

        (
            HuffmanCode {
                code: distance_huffman_code,
                length: distance_huffman_length,
            },
            HuffmanCode {
                code: distance_data.value,
                length: distance_data.num_bits,
            },
        )
    }

    #[cfg(test)]
    pub fn get_length_distance_code(&self, length: u16, distance: u16) -> LengthAndDistanceBits {
        assert!(length >= MIN_MATCH && length <= MAX_MATCH);
        let l_codes = self.get_length_huffman(StoredLength::from_actual_length(length));
        let d_codes = self.get_distance_huffman(distance);
        LengthAndDistanceBits {
            length_code: l_codes.0,
            length_extra_bits: l_codes.1,
            distance_code: d_codes.0,
            distance_extra_bits: d_codes.1,
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use super::{
        build_length_count_table, get_distance_code_and_extra_bits, get_length_code_and_extra_bits,
    };
    use crate::lzvalue::StoredLength;

    fn l(length: u16) -> StoredLength {
        StoredLength::from_actual_length(length)
    }

    #[test]
    fn test_get_length_code() {
        let extra_bits = get_length_code_and_extra_bits(l(4));
        assert_eq!(extra_bits.code_number, 258);
        assert_eq!(extra_bits.num_bits, 0);
        assert_eq!(extra_bits.value, 0);

        let extra_bits = get_length_code_and_extra_bits(l(165));
        assert_eq!(extra_bits.code_number, 282);
        assert_eq!(extra_bits.num_bits, 5);
        assert_eq!(extra_bits.value, 2);

        let extra_bits = get_length_code_and_extra_bits(l(257));
        assert_eq!(extra_bits.code_number, 284);
        assert_eq!(extra_bits.num_bits, 5);
        assert_eq!(extra_bits.value, 30);

        let extra_bits = get_length_code_and_extra_bits(l(258));
        assert_eq!(extra_bits.code_number, 285);
        assert_eq!(extra_bits.num_bits, 0);
    }

    #[test]
    fn test_distance_code() {
        assert_eq!(get_distance_code(1), 0);
        // Using 0 for None at the moment
        assert_eq!(get_distance_code(0), 0);
        assert_eq!(get_distance_code(50000), 0);
        assert_eq!(get_distance_code(6146), 25);
        assert_eq!(get_distance_code(256), 15);
        assert_eq!(get_distance_code(4733), 24);
        assert_eq!(get_distance_code(257), 16);
    }

    #[test]
    fn test_distance_extra_bits() {
        let extra = get_distance_code_and_extra_bits(527);
        assert_eq!(extra.value, 0b1110);
        assert_eq!(extra.code_number,
            18);
        assert_eq!(extra.num_bits, 8);

        let extra = get_distance_code_and_extra_bits(256);
        assert_eq!(extra.code_number, 15);
        assert_eq!(extra.num_bits, 6);

        let extra = get_distance_code_and_extra_bits(4733);
        assert_eq!(extra.code_number, 24);
        assert_eq!(extra.num_bits, 11);
    }

    #[test]
    fn test_length_table_fixed() {
        let _ = build_length_count_table(&FIXED_CODE_LENGTHS, &mut [0; 16]);
    }

    #[test]
    #[should_panic]
    fn test_length_table_max_length() {
        let table = [16u8; 288];
        build_length_count_table(&table, &mut [0; 16]);
    }

    #[test]
    #[should_panic]
    fn test_empty_table() {
        let table = [];
        build_length_count_table(&table, &mut [0; 16]);
    }

    #[test]
    fn make_table_fixed() {
        let table = HuffmanTable::fixed_table();
        assert_eq!(table.codes[0], 0b00001100);
        assert_eq!(table.codes[143], 0b11111101);
        assert_eq!(table.codes[144], 0b000010011);
        assert_eq!(table.codes[255], 0b111111111);
        assert_eq!(table.codes[256], 0b0000000);
        assert_eq!(table.codes[279], 0b1110100);
        assert_eq!(table.codes[280], 0b00000011);
        assert_eq!(table.codes[287], 0b11100011);

        assert_eq!(table.distance_codes[0], 0);
        assert_eq!(table.distance_codes[5], 20);

        let ld = table.get_length_distance_code(4, 5);

        assert_eq!(ld.length_code.code, 0b00100000);
        assert_eq!(ld.distance_code.code, 0b00100);
        assert_eq!(ld.distance_extra_bits.length, 1);
        assert_eq!(ld.distance_extra_bits.code, 0);
    }

    #[test]
    fn extra_bits_distance() {
        use std::mem::size_of;
        for i in 0..NUM_DISTANCE_CODES {
            assert_eq!(
                num_extra_bits_for_distance_code(i as u8),
                DISTANCE_EXTRA_BITS[i]
            );
        }
        println!("Size of huffmanCode struct: {}", size_of::<HuffmanCode>());
    }
}
deflate-1.0.0/src/input_buffer.rs000064400000000000000000000115160072674642500151070ustar 00000000000000
use std::cmp;

use crate::chained_hash_table::WINDOW_SIZE;

const MAX_MATCH: usize = crate::huffman_table::MAX_MATCH as usize;

/// The maximum size of the buffer.
pub const BUFFER_SIZE: usize = (WINDOW_SIZE * 2) + MAX_MATCH;

pub struct InputBuffer {
    buffer: Vec<u8>,
}

impl InputBuffer {
    #[cfg(test)]
    pub fn new<'a>(data: &'a [u8]) -> (InputBuffer, Option<&[u8]>) {
        let mut b = InputBuffer::empty();
        let rem = b.add_data(data);
        (b, rem)
    }

    pub fn empty() -> InputBuffer {
        InputBuffer {
            buffer: Vec::with_capacity(BUFFER_SIZE),
        }
    }

    /// Add data to the buffer.
    ///
    /// Returns a slice of the data that was not added (including the lookahead if any).
    pub fn add_data<'a>(&mut self, data: &'a [u8]) -> Option<&'a [u8]> {
        debug_assert!(self.current_end() <= BUFFER_SIZE);
        if self.current_end() + data.len() > BUFFER_SIZE {
            // Add data and return how much was left.
            let consumed = {
                let space_left = BUFFER_SIZE - self.buffer.len();
                self.buffer.extend_from_slice(&data[..space_left]);
                space_left
            };
            Some(&data[consumed..])
        } else {
            // There's space for all of the data.
            self.buffer.extend_from_slice(data);
            None
        }
    }

    /// Get the current amount of data in the buffer.
    pub fn current_end(&self) -> usize {
        self.buffer.len()
    }

    /// Slide the input window and add new data.
    ///
    /// Returns a slice containing the data that did not fit, or `None` if all data was consumed.
    pub fn slide<'a>(&mut self, data: &'a [u8]) -> Option<&'a [u8]> {
        // This should only be used when the buffer is full
        assert!(self.buffer.len() > WINDOW_SIZE * 2);

        // Do this in a closure to end the borrow of buffer.
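        // (Illustrative note, not from the original source: the buffer is laid out as
        // [lower window | upper window | lookahead], each window being WINDOW_SIZE bytes
        // and the lookahead at most MAX_MATCH bytes. Sliding copies the upper window over
        // the lower one, moves the lookahead to the start of the upper window, and then
        // fills the rest of the upper window from `data`.)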
let (final_len, upper_len, end) = { // Split into lower window and upper window + lookahead let (lower, upper) = self.buffer.split_at_mut(WINDOW_SIZE); // Copy the upper window to the lower window lower.copy_from_slice(&upper[..WINDOW_SIZE]); let lookahead_len = { // Copy the lookahead to the start of the upper window let (upper_2, lookahead) = upper.split_at_mut(WINDOW_SIZE); let lookahead_len = lookahead.len(); debug_assert!(lookahead_len <= MAX_MATCH); upper_2[..lookahead_len].copy_from_slice(lookahead); lookahead_len }; // Length of the upper window minus the lookahead bytes let upper_len = upper.len() - lookahead_len; let end = cmp::min(data.len(), upper_len); upper[lookahead_len..lookahead_len + end].copy_from_slice(&data[..end]); // Remove unused data if any. (lower.len() + lookahead_len + end, upper_len, end) }; // Remove unused space. self.buffer.truncate(final_len); if data.len() > upper_len { // Return a slice of the data that was not added Some(&data[end..]) } else { None } } /// Get a mutable slice of the used part of the buffer. pub fn get_buffer(&mut self) -> &mut [u8] { &mut self.buffer } } #[cfg(test)] mod test { use super::MAX_MATCH; use super::*; use crate::chained_hash_table::WINDOW_SIZE; #[test] pub fn buffer_add_full() { let data = [10u8; BUFFER_SIZE + 10]; let (mut buf, extra) = InputBuffer::new(&data[..]); assert!(extra.unwrap() == &[10; 10]); let to_add = [2, 5, 3]; let not_added = buf.add_data(&to_add); assert_eq!(not_added.unwrap(), to_add); } #[test] pub fn buffer_add_not_full() { let data = [10u8; BUFFER_SIZE - 5]; let (mut buf, extra) = InputBuffer::new(&data[..]); assert_eq!(buf.current_end(), data.len()); assert_eq!(extra, None); let to_add = [2, 5, 3]; { let not_added = buf.add_data(&to_add); assert!(not_added.is_none()); } let not_added = buf.add_data(&to_add); assert_eq!(not_added.unwrap()[0], 3); } #[test] fn slide() { let data = [10u8; BUFFER_SIZE]; let (mut buf, extra) = InputBuffer::new(&data[..]); assert_eq!(extra, None); let to_add = [5; 5]; let rem = buf.slide(&to_add); assert!(rem.is_none()); { let slice = buf.get_buffer(); assert!(slice[..WINDOW_SIZE + MAX_MATCH] == data[WINDOW_SIZE..]); assert_eq!( slice[WINDOW_SIZE + MAX_MATCH..WINDOW_SIZE + MAX_MATCH + 5], to_add ); } assert_eq!(buf.current_end(), WINDOW_SIZE + MAX_MATCH + to_add.len()); } } deflate-1.0.0/src/length_encode.rs000064400000000000000000000604200072674642500152130ustar 00000000000000use std::clone::Clone; use std::iter::Iterator; /// An enum representing the different types in the run-length encoded data used to encode /// Huffman table lengths #[derive(Debug, PartialEq, Eq)] pub enum EncodedLength { // An actual length value Length(u8), // Copy the previous value n times CopyPrevious(u8), // Repeat zero n times (with n represented by 3 bits) RepeatZero3Bits(u8), // Repeat zero n times (with n represented by 7 bits) RepeatZero7Bits(u8), } impl EncodedLength { fn from_prev_and_repeat(prev: u8, repeat: u8) -> EncodedLength { match prev { 0 => { if repeat <= 10 { EncodedLength::RepeatZero3Bits(repeat) } else { EncodedLength::RepeatZero7Bits(repeat) } } 1..=15 => EncodedLength::CopyPrevious(repeat), _ => panic!(), } } } pub const COPY_PREVIOUS: usize = 16; pub const REPEAT_ZERO_3_BITS: usize = 17; pub const REPEAT_ZERO_7_BITS: usize = 18; const MIN_REPEAT: u8 = 3; /// Push an `EncodedLength` to the vector and update the frequency table. 
fn update_out_and_freq(
    encoded: EncodedLength,
    output: &mut Vec<EncodedLength>,
    frequencies: &mut [u16; 19],
) {
    let index = match encoded {
        EncodedLength::Length(l) => usize::from(l),
        EncodedLength::CopyPrevious(_) => COPY_PREVIOUS,
        EncodedLength::RepeatZero3Bits(_) => REPEAT_ZERO_3_BITS,
        EncodedLength::RepeatZero7Bits(_) => REPEAT_ZERO_7_BITS,
    };

    frequencies[index] += 1;

    output.push(encoded);
}

/// Convenience function to check if the repeat counter should be incremented further
fn not_max_repetitions(length_value: u8, repeats: u8) -> bool {
    (length_value == 0 && repeats < 138) || repeats < 6
}

/// Convenience version for unit tests.
#[cfg(test)]
pub fn encode_lengths<'a, I>(lengths: I) -> (Vec<EncodedLength>, [u16; 19])
where
    I: Iterator<Item = &'a u8> + Clone,
{
    let mut freqs = [0u16; 19];
    let mut encoded: Vec<EncodedLength> = Vec::new();
    encode_lengths_m(lengths, &mut encoded, &mut freqs);
    (encoded, freqs)
}

/// Run-length encodes the lengths of the values in `lengths` according to the deflate
/// specification. This is used for writing the code lengths for the Huffman tables for
/// the deflate stream.
///
/// Populates the supplied array with the frequency of the different encoded length values.
/// The frequency array is taken as a parameter rather than returned to avoid
/// excessive `memcpy`-ing.
pub fn encode_lengths_m<'a, I>(
    lengths: I,
    mut out: &mut Vec<EncodedLength>,
    mut frequencies: &mut [u16; 19],
) where
    I: Iterator<Item = &'a u8> + Clone,
{
    out.clear();
    // Number of repetitions of the current value
    let mut repeat = 0;
    let mut iter = lengths.clone().enumerate().peekable();
    // Previous value
    // We set it to the complement of the first value to simplify the code.
    let mut prev = !iter.peek().expect("No length values!").1;

    while let Some((n, &l)) = iter.next() {
        if l == prev && not_max_repetitions(l, repeat) {
            repeat += 1;
        }
        if l != prev || iter.peek().is_none() || !not_max_repetitions(l, repeat) {
            if repeat >= MIN_REPEAT {
                // The previous value has been repeated enough times to write out a repeat code.
                let val = EncodedLength::from_prev_and_repeat(prev, repeat);
                update_out_and_freq(val, &mut out, &mut frequencies);
                repeat = 0;
                // If we have a new length value, output l unless the last value is 0 or l is the
                // last byte.
                if l != prev {
                    if l != 0 || iter.peek().is_none() {
                        update_out_and_freq(EncodedLength::Length(l), &mut out, &mut frequencies);
                        repeat = 0;
                    } else {
                        // If we have a zero, we start repeat at one instead of outputting, as
                        // there are separate codes for repeats of zero so we don't need a literal
                        // to define what byte to repeat.
                        repeat = 1;
                    }
                }
            } else {
                // There haven't been enough repetitions of the previous value,
                // so we just output the lengths directly.

                // If we are at the end, and we have a value that is repeated, we need to
                // skip a byte and output the last one.
                let extra_skip = if iter.peek().is_none() && l == prev {
                    1
                } else {
                    0
                };
                // Get to the position of the next byte to output by starting at zero and skipping.
                let b_iter = lengths.clone().skip(n + extra_skip - repeat as usize);
                // As repeats of zeroes have separate codes, we don't need to output a literal here
                // if we have a zero (unless we are at the end).
                let extra = if l != 0 || iter.peek().is_none() { 1 } else { 0 };
                for &i in b_iter.take(repeat as usize + extra) {
                    update_out_and_freq(EncodedLength::Length(i), &mut out, &mut frequencies);
                }
                // If the current byte is zero we start repeat at 1 as we didn't output the literal
                // directly.
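                // (Illustrative example, not from the original source: the input
                // [0, 0, 0, 5, 2] encodes as RepeatZero3Bits(3), Length(5), Length(2),
                // so the three leading zeroes never appear as literal length codes;
                // see the `test_encode_lengths` test below.)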
                repeat = 1 - extra as u8;
            }
        }

        prev = l;
    }
}

#[cfg(test)]
pub fn huffman_lengths_from_frequency(frequencies: &[u16], max_len: usize) -> Vec<u8> {
    in_place::gen_lengths(frequencies, max_len)
}

pub type LeafVec = Vec<in_place::Node>;

/// Generate a set of canonical huffman lengths from the given frequencies, with a maximum length
/// of `max_len`. The lengths are put in the lens slice parameter. Unused lengths are set to 0.
///
/// The leaf buffer is passed in to avoid allocating it every time this function is called.
/// The existing data contained in it is not preserved.
pub fn huffman_lengths_from_frequency_m(
    frequencies: &[u16],
    max_len: usize,
    leaf_buffer: &mut LeafVec,
    lens: &mut [u8],
) {
    in_place::in_place_lengths(frequencies, max_len, leaf_buffer, lens);
}

mod in_place {
    type WeightType = u32;

    #[cfg(debug)]
    pub fn validate_lengths(lengths: &[u8]) -> bool {
        // Avoid issue with floating point on mips: https://github.com/image-rs/deflate-rs/issues/23
        if cfg!(any(
            target_arch = "mips",
            target_arch = "mipsel",
            target_arch = "mips64",
            target_arch = "mipsel64"
        )) {
            true
        } else {
            let v = lengths.iter().fold(0f64, |acc, &n| {
                acc + if n != 0 {
                    2f64.powi(-(i32::from(n)))
                } else {
                    0f64
                }
            });
            match v.partial_cmp(&1.0) {
                Some(std::cmp::Ordering::Greater) => false,
                _ => true,
            }
        }
    }

    #[cfg(not(debug))]
    pub fn validate_lengths(_: &[u8]) -> bool {
        true
    }

    #[derive(Eq, PartialEq, Debug)]
    pub struct Node {
        value: WeightType,
        symbol: u16,
    }

    fn step_1(leaves: &mut [Node]) {
        // If there are less than 2 non-zero frequencies, this function should not have been
        // called and we should not have gotten to this point.
        debug_assert!(leaves.len() >= 2);
        let mut root = 0;
        let mut leaf = 2;

        leaves[0].value += leaves[1].value;

        for next in 1..leaves.len() - 1 {
            if (leaf >= leaves.len()) || (leaves[root].value < leaves[leaf].value) {
                leaves[next].value = leaves[root].value;
                leaves[root].value = next as WeightType;
                root += 1;
            } else {
                leaves[next].value = leaves[leaf].value;
                leaf += 1;
            }

            if (leaf >= leaves.len())
                || (root < next && (leaves[root].value < leaves[leaf].value))
            {
                leaves[next].value += leaves[root].value;
                leaves[root].value = next as WeightType;
                root += 1;
            } else {
                leaves[next].value += leaves[leaf].value;
                leaf += 1;
            }
        }
    }

    fn step_2(leaves: &mut [Node]) {
        debug_assert!(leaves.len() >= 2);
        let n = leaves.len();

        leaves[n - 2].value = 0;
        for t in (0..(n + 1 - 3)).rev() {
            leaves[t].value = leaves[leaves[t].value as usize].value + 1;
        }

        let mut available = 1_usize;
        let mut used = 0;
        let mut depth = 0;
        let mut root = n as isize - 2;
        let mut next = n as isize - 1;

        while available > 0 {
            while root >= 0 && leaves[root as usize].value == depth {
                used += 1;
                root -= 1;
            }
            while available > used {
                leaves[next as usize].value = depth;
                next -= 1;
                available -= 1;
            }
            available = 2 * used;
            depth += 1;
            used = 0;
        }
    }

    const MAX_NUMBER_OF_CODES: usize = 32;
    const NUM_CODES_LENGTH: usize = MAX_NUMBER_OF_CODES + 1;

    /// Checks if any of the lengths exceed `max_len`, and if that is the case, alters the length
    /// table so that no codes exceed `max_len`.
    /// This is ported from miniz (which is released as public domain by Rich Geldreich
    /// https://github.com/richgel999/miniz/blob/master/miniz.c)
    ///
    /// This will not generate optimal (minimum-redundancy) codes, however in most cases
    /// this won't make a large difference.
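    /// (An illustrative example, not from the original source: limiting the lengths
    /// {1, 2, 3, 4, 4} to `max_len = 3` first moves the two length-4 codes to length 3,
    /// making the counts per length [1, 1, 3] and the Kraft total 9 instead of 1 << 3;
    /// the loop below then moves the single length-2 code down to length 3, giving
    /// counts [1, 0, 4], i.e. the valid set of lengths {1, 3, 3, 3, 3}.)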
    pub fn enforce_max_code_lengths(
        num_codes: &mut [u16; NUM_CODES_LENGTH],
        num_used: usize,
        max_len: usize,
    ) {
        debug_assert!(max_len <= 15);

        if num_used > 1 {
            let mut num_above_max = 0u16;
            for &l in num_codes[(max_len as usize + 1)..].iter() {
                num_above_max += l;
            }

            num_codes[max_len] += num_above_max;

            let mut total = 0u32;
            for i in (1..=max_len).rev() {
                // This should be safe as max_len won't be higher than 15, and num_codes[i] can't
                // be higher than 288,
                // and 288 << 15 will not be anywhere close to overflowing 32 bits
                total += (u32::from(num_codes[i])) << (max_len - i);
            }

            // miniz uses unsigned long here. 32-bits should be sufficient though,
            // as max_len won't be longer than 15 anyhow.
            while total != 1u32 << max_len {
                num_codes[max_len] -= 1;
                for i in (1..max_len).rev() {
                    if num_codes[i] != 0 {
                        num_codes[i] -= 1;
                        num_codes[i + 1] += 2;
                        break;
                    }
                }
                total -= 1;
            }
        }
    }

    #[cfg(test)]
    /// Convenience wrapper for tests.
    pub fn gen_lengths(frequencies: &[u16], max_len: usize) -> Vec<u8> {
        let mut lens = vec![0u8; frequencies.len()];
        let mut leaves = Vec::new();
        in_place_lengths(frequencies, max_len, &mut leaves, lens.as_mut_slice());
        lens
    }

    /// Generate huffman code lengths, using the algorithm described by
    /// Moffat and Katajainen in In-Place Calculation of Minimum-Redundancy Codes
    /// https://people.eng.unimelb.edu.au/ammoffat/abstracts/mk95wads.html
    /// and its implementation.
    ///
    /// This is significantly faster, and seems to generally create lengths that result in length
    /// tables that are better compressible than the algorithm used previously. The downside of
    /// this algorithm is that it's not length-limited, so if too long code lengths are generated,
    /// it might result in sub-optimal tables as the length-restricting function isn't optimal.
    pub fn in_place_lengths(
        frequencies: &[u16],
        max_len: usize,
        mut leaves: &mut Vec<Node>,
        lengths: &mut [u8],
    ) {
        debug_assert!(lengths.len() >= frequencies.len());

        for l in lengths.iter_mut() {
            *l = 0;
        }

        // Clear any previous leaves in the leaf buffer.
        leaves.clear();

        // Discard zero length nodes as they won't be given a code and thus don't need to
        // participate in code length generation, and create a new vec of the remaining
        // symbols and weights.
        leaves.extend(frequencies.iter().enumerate().filter_map(|(n, f)| {
            if *f > 0 {
                Some(Node {
                    value: u32::from(*f),
                    symbol: n as u16,
                })
            } else {
                None
            }
        }));

        // Special cases with zero or 1 value having a non-zero frequency
        if leaves.len() == 1 {
            lengths[leaves[0].symbol as usize] = 1;
            return;
        } else if leaves.is_empty() {
            return;
        }

        // Sort the leaves by value. As the sort in the standard library is stable, we don't
        // have to worry about the symbol code here.
        leaves.sort_by(|a, b| a.value.cmp(&b.value));

        step_1(&mut leaves);
        step_2(&mut leaves);

        // Count how many codes of each length are used, for usage in the next section.
        let mut num_codes = [0u16; NUM_CODES_LENGTH];
        for l in leaves.iter() {
            num_codes[l.value as usize] += 1;
        }

        // As the algorithm used here doesn't limit the maximum length that can be generated
        // we need to make sure none of the lengths exceed `max_len`
        enforce_max_code_lengths(&mut num_codes, leaves.len(), max_len);

        // Output the actual lengths
        let mut leaf_it = leaves.iter().rev();

        // Start at 1 since the length table is already filled with zeroes.
for (&n_codes, i) in num_codes[1..=max_len].iter().zip(1..=(max_len as u8)) { for _ in 0..n_codes { lengths[leaf_it.next().unwrap().symbol as usize] = i; } } debug_assert_eq!(leaf_it.next(), None); debug_assert!( validate_lengths(lengths), "The generated length codes were not valid!" ); } } #[cfg(test)] mod test { use super::*; use crate::huffman_table::NUM_LITERALS_AND_LENGTHS; use std::u16; fn lit(value: u8) -> EncodedLength { EncodedLength::Length(value) } fn zero(repeats: u8) -> EncodedLength { match repeats { 0..=1 => EncodedLength::Length(0), 2..=10 => EncodedLength::RepeatZero3Bits(repeats), _ => EncodedLength::RepeatZero7Bits(repeats), } } fn copy(copies: u8) -> EncodedLength { EncodedLength::CopyPrevious(copies) } #[test] fn test_encode_lengths() { use crate::huffman_table::FIXED_CODE_LENGTHS; let enc = encode_lengths(FIXED_CODE_LENGTHS.iter()); // There are no lengths lower than 6 in the fixed table assert_eq!(enc.1[0..7], [0, 0, 0, 0, 0, 0, 0]); // Neither are there any lengths above 9 assert_eq!(enc.1[10..16], [0, 0, 0, 0, 0, 0]); // Also there are no zero-length codes so there shouldn't be any repetitions of zero assert_eq!(enc.1[17..19], [0, 0]); let test_lengths = [0, 0, 5, 0, 15, 1, 0, 0, 0, 2, 4, 4, 4, 4, 3, 5, 5, 5, 5]; let enc = encode_lengths(test_lengths.iter()).0; assert_eq!( enc, vec![ lit(0), lit(0), lit(5), lit(0), lit(15), lit(1), zero(3), lit(2), lit(4), copy(3), lit(3), lit(5), copy(3), ] ); let test_lengths = [0, 0, 0, 5, 2, 3, 0, 0, 0]; let enc = encode_lengths(test_lengths.iter()).0; assert_eq!(enc, vec![zero(3), lit(5), lit(2), lit(3), zero(3)]); let test_lengths = [0, 0, 0, 3, 3, 3, 5, 4, 4, 4, 4, 0, 0]; let enc = encode_lengths(test_lengths.iter()).0; assert_eq!( enc, vec![ zero(3), lit(3), lit(3), lit(3), lit(5), lit(4), copy(3), lit(0), lit(0), ] ); let lens = [ 0, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ]; let _ = encode_lengths(lens.iter()).0; let lens = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 7, 8, 7, 8, 6, 6, 8, 0, 7, 6, 7, 8, 7, 7, 8, 0, 0, 0, 0, 0, 8, 8, 0, 8, 7, 0, 10, 8, 0, 8, 0, 10, 10, 8, 8, 10, 8, 0, 8, 7, 0, 10, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 7, 7, 6, 7, 8, 8, 6, 0, 0, 8, 8, 7, 8, 8, 0, 7, 6, 6, 8, 8, 8, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 4, 3, 3, 4, 4, 5, 5, 5, 5, 5, 
            8, 8, 6, 7, 8, 10, 10, 0, 9, /* litlen */ 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 6, 6,
            5, 5, 5, 5, 6, 5, 5, 4, 4, 4, 4, 4, 4, 3, 4, 3, 4,
        ];

        let enc = encode_lengths(lens.iter()).0;

        assert_eq!(
            &enc[..10],
            &[
                zero(10),
                lit(9),
                lit(0),
                lit(0),
                lit(9),
                zero(18),
                lit(6),
                zero(3),
                lit(8),
                zero(4)
            ]
        );
        assert_eq!(
            &enc[10..20],
            &[
                lit(8),
                lit(0),
                lit(0),
                lit(7),
                lit(8),
                lit(7),
                lit(8),
                lit(6),
                lit(6),
                lit(8)
            ]
        );

        let enc = encode_lengths([1, 1, 1, 2].iter()).0;
        assert_eq!(enc, vec![lit(1), lit(1), lit(1), lit(2)]);

        let enc = encode_lengths([0, 0, 3].iter()).0;
        assert_eq!(enc, vec![lit(0), lit(0), lit(3)]);

        let enc = encode_lengths([0, 0, 0, 5, 2].iter()).0;
        assert_eq!(enc, vec![zero(3), lit(5), lit(2)]);

        let enc = encode_lengths([0, 0, 0, 5, 0].iter()).0;
        assert!(*enc.last().unwrap() != lit(5));

        let enc = encode_lengths([0, 4, 4, 4, 4, 0].iter()).0;
        assert_eq!(*enc.last().unwrap(), zero(0));
    }

    #[test]
    fn test_lengths_from_frequencies() {
        let frequencies = [1, 1, 5, 7, 10, 14];

        let expected = [4, 4, 3, 2, 2, 2];
        let res = huffman_lengths_from_frequency(&frequencies, 4);

        assert_eq!(expected, res.as_slice());

        let frequencies = [1, 5, 1, 7, 10, 14];
        let expected = [4, 3, 4, 2, 2, 2];

        let res = huffman_lengths_from_frequency(&frequencies, 4);

        assert_eq!(expected, res.as_slice());

        let frequencies = [0, 25, 0, 10, 2, 4];

        let res = huffman_lengths_from_frequency(&frequencies, 4);
        assert_eq!(res[0], 0);
        assert_eq!(res[2], 0);
        assert!(res[1] < 4);

        // Only one value
        let frequencies = [0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0];
        let res = huffman_lengths_from_frequency(&frequencies, 5);
        let expected = [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0];
        assert_eq!(expected, res.as_slice());

        // No values
        let frequencies = [0; 30];
        let res = huffman_lengths_from_frequency(&frequencies, 5);
        for (a, b) in frequencies.iter().zip(res.iter()) {
            assert_eq!(*a, (*b).into());
        }
        // assert_eq!(frequencies, res.as_slice());

        let mut frequencies = vec![3; NUM_LITERALS_AND_LENGTHS];
        frequencies[55] = u16::MAX / 3;
        frequencies[125] = u16::MAX / 3;

        let res = huffman_lengths_from_frequency(&frequencies, 15);
        assert_eq!(res.len(), NUM_LITERALS_AND_LENGTHS);
        assert!(res[55] < 3);
        assert!(res[125] < 3);
    }

    #[test]
    /// Test if the bit lengths for a set of frequencies are optimal (give the best compression
    /// given the provided frequencies).
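    /// (Optimality is checked here by comparing the total encoded size, i.e. the sum of
    /// each frequency times its code length, against the 7701 bits that miniz produces
    /// for the same frequency table, as noted in the comments inside the test.)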
fn optimal_lengths() { let freqs = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68, 0, 14, 0, 0, 0, 0, 3, 7, 6, 1, 0, 12, 14, 9, 2, 6, 9, 4, 1, 1, 4, 1, 1, 0, 0, 1, 3, 0, 6, 0, 0, 0, 4, 4, 1, 2, 5, 3, 2, 2, 9, 0, 0, 3, 1, 5, 5, 8, 0, 6, 10, 5, 2, 0, 0, 1, 2, 0, 8, 11, 4, 0, 1, 3, 31, 13, 23, 22, 56, 22, 8, 11, 43, 0, 7, 33, 15, 45, 40, 16, 1, 28, 37, 35, 26, 3, 7, 11, 9, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 126, 114, 66, 31, 41, 25, 15, 21, 20, 16, 15, 10, 7, 5, 1, 1, ]; let lens = huffman_lengths_from_frequency(&freqs, 15); // Lengths produced by miniz for this frequency table for comparison // the number of total bits encoded with these huffman codes is 7701 // NOTE: There can be more than one set of optimal lengths for a set of frequencies, // (though there may be a difference in how well the table itself can be represented) // so testing for a specific length table is not ideal since different algorithms // may produce different length tables. // let lens3 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0, 0, 0, 0, 0, // 0, 0, 0, 0, 0, 0, 4, 0, 7, 0, 0, 0, 0, 9, 8, 8, 10, 0, 7, 7, 7, 10, 8, 7, 8, // 10, 10, 8, 10, 10, 0, 0, 10, 9, 0, 8, 0, 0, 0, 8, 8, 10, 9, 8, 9, 9, 9, 7, 0, // 0, 9, 10, 8, 8, 7, 0, 8, 7, 8, 9, 0, 0, 10, 9, 0, 7, 7, 8, 0, 10, 9, 6, 7, 6, // 6, 5, 6, 7, 7, 5, 0, 8, 5, 7, 5, 5, 6, 10, 6, 5, 5, 6, 9, 8, 7, 7, 10, 10, 0, // 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0, 0, 10, 4, 4, 4, 5, 5, 6, 7, 6, 6, 6, 6, 7, 8, 8, 10, 10]; // let num_bits = lens .iter() .zip(freqs.iter()) .fold(0, |a, (&f, &l)| a + (f as u16 * l)); assert_eq!(num_bits, 7701); } } deflate-1.0.0/src/lib.rs000064400000000000000000000347070072674642500131740ustar 00000000000000//! An implementation an encoder using [DEFLATE](http://www.gzip.org/zlib/rfc-deflate.html) //! compression algorithm in pure Rust. //! //! This library provides functions to compress data using the DEFLATE algorithm, //! optionally wrapped using the [zlib](https://tools.ietf.org/html/rfc1950) or //! [gzip](http://www.gzip.org/zlib/rfc-gzip.html) formats. //! The current implementation is still a bit lacking speed-wise compared to C-libraries //! like zlib and miniz. //! //! The deflate algorithm is an older compression algorithm that is still widely used today, //! by e.g html headers, the `.png` image format, the Unix `gzip` program and commonly in `.zip` //! files. The `zlib` and `gzip` formats are wrappers around DEFLATE-compressed data, containing //! some extra metadata and a checksum to validate the integrity of the raw data. //! //! The deflate algorithm does not perform as well as newer algorithms used in file formats such as //! `.7z`, `.rar`, `.xz` and `.bz2`, and is thus not the ideal choice for applications where //! 
the `DEFLATE` format (with or without wrappers) is not required. //! //! Support for the gzip wrapper (the wrapper that is used in `.gz` files) is disabled by default //! but can be enabled with the `gzip` feature. //! //! As this library is still in development, the compression output may change slightly //! between versions. //! //! //! # Examples: //! ## Simple compression function: //! ``` rust //! use deflate::deflate_bytes; //! //! let data = b"Some data"; //! let compressed = deflate_bytes(data); //! # let _ = compressed; //! ``` //! //! ## Using a writer: //! ``` rust //! use std::io::Write; //! //! use deflate::Compression; //! use deflate::write::ZlibEncoder; //! //! let data = b"This is some test data"; //! let mut encoder = ZlibEncoder::new(Vec::new(), Compression::Default); //! encoder.write_all(data).expect("Write error!"); //! let compressed_data = encoder.finish().expect("Failed to finish compression!"); //! # let _ = compressed_data; //! ``` #![forbid(unsafe_code)] #![cfg_attr(all(feature = "benchmarks", test), feature(test))] #[cfg(all(test, feature = "benchmarks"))] extern crate test as test_std; #[cfg(test)] extern crate miniz_oxide; extern crate adler32; #[cfg(feature = "gzip")] extern crate gzip_header; mod bit_reverse; mod bitstream; mod chained_hash_table; mod checksum; mod compress; mod compression_options; mod deflate_state; mod encoder_state; mod huffman_lengths; mod huffman_table; mod input_buffer; mod length_encode; mod lz77; mod lzvalue; mod matching; mod output_writer; mod rle; mod stored_block; #[cfg(test)] mod test_utils; mod writer; mod zlib; use std::io; use std::io::Write; #[cfg(feature = "gzip")] use gzip_header::Crc; #[cfg(feature = "gzip")] use gzip_header::GzBuilder; use crate::checksum::RollingChecksum; use crate::deflate_state::DeflateState; use crate::compress::Flush; pub use compression_options::{Compression, CompressionOptions, SpecialOptions}; pub use lz77::MatchingType; use crate::writer::compress_until_done; /// Encoders implementing a `Write` interface. pub mod write { #[cfg(feature = "gzip")] pub use crate::writer::gzip::GzEncoder; pub use crate::writer::{DeflateEncoder, ZlibEncoder}; } fn compress_data_dynamic<W: Write, RC: RollingChecksum>( input: &[u8], writer: &mut W, mut checksum: RC, compression_options: CompressionOptions, ) -> io::Result<()> { checksum.update_from_slice(input); // We use a box here to avoid putting the buffers on the stack // It's done here rather than in the structs themselves for now to // keep the data close in memory. let mut deflate_state = Box::new(DeflateState::new(compression_options, writer)); compress_until_done(input, &mut deflate_state, Flush::Finish) } /// Compress the given slice of bytes with DEFLATE compression. /// /// Returns a `Vec<u8>` of the compressed data. /// /// # Examples /// /// ``` /// use deflate::{deflate_bytes_conf, Compression}; /// /// let data = b"This is some test data"; /// let compressed_data = deflate_bytes_conf(data, Compression::Best); /// # let _ = compressed_data; /// ``` pub fn deflate_bytes_conf<O: Into<CompressionOptions>>(input: &[u8], options: O) -> Vec<u8> { let mut writer = Vec::with_capacity(input.len() / 3); compress_data_dynamic( input, &mut writer, checksum::NoChecksum::new(), options.into(), ) .expect("Write error!"); writer } /// Compress the given slice of bytes with DEFLATE compression using the default compression /// level. /// /// Returns a `Vec<u8>` of the compressed data.
/// /// # Examples /// /// ``` /// use deflate::deflate_bytes; /// /// let data = b"This is some test data"; /// let compressed_data = deflate_bytes(data); /// # let _ = compressed_data; /// ``` pub fn deflate_bytes(input: &[u8]) -> Vec<u8> { deflate_bytes_conf(input, Compression::Default) } /// Compress the given slice of bytes with DEFLATE compression, including a zlib header and trailer. /// /// Returns a `Vec<u8>` of the compressed data. /// /// Zlib dictionaries are not yet supported. /// /// # Examples /// /// ``` /// use deflate::{deflate_bytes_zlib_conf, Compression}; /// /// let data = b"This is some test data"; /// let compressed_data = deflate_bytes_zlib_conf(data, Compression::Best); /// # let _ = compressed_data; /// ``` pub fn deflate_bytes_zlib_conf<O: Into<CompressionOptions>>(input: &[u8], options: O) -> Vec<u8> { let mut writer = Vec::with_capacity(input.len() / 3); // Write header zlib::write_zlib_header(&mut writer, zlib::CompressionLevel::Default) .expect("Write error when writing zlib header!"); let mut checksum = checksum::Adler32Checksum::new(); compress_data_dynamic(input, &mut writer, &mut checksum, options.into()) .expect("Write error when writing compressed data!"); let hash = checksum.current_hash(); writer .write_all(&hash.to_be_bytes()) .expect("Write error when writing checksum!"); writer } /// Compress the given slice of bytes with DEFLATE compression, including a zlib header and trailer, /// using the default compression level. /// /// Returns a `Vec<u8>` of the compressed data. /// /// Zlib dictionaries are not yet supported. /// /// # Examples /// /// ``` /// use deflate::deflate_bytes_zlib; /// /// let data = b"This is some test data"; /// let compressed_data = deflate_bytes_zlib(data); /// # let _ = compressed_data; /// ``` pub fn deflate_bytes_zlib(input: &[u8]) -> Vec<u8> { deflate_bytes_zlib_conf(input, Compression::Default) } /// Compress the given slice of bytes with DEFLATE compression, including a gzip header and trailer /// using the given gzip header and compression options. /// /// Returns a `Vec<u8>` of the compressed data. /// /// /// # Examples /// /// ``` /// extern crate gzip_header; /// extern crate deflate; /// /// # fn main() { /// use deflate::{deflate_bytes_gzip_conf, Compression}; /// use gzip_header::GzBuilder; /// /// let data = b"This is some test data"; /// let compressed_data = deflate_bytes_gzip_conf(data, Compression::Best, GzBuilder::new()); /// # let _ = compressed_data; /// # } /// ``` #[cfg(feature = "gzip")] pub fn deflate_bytes_gzip_conf<O: Into<CompressionOptions>>( input: &[u8], options: O, gzip_header: GzBuilder, ) -> Vec<u8> { let mut writer = Vec::with_capacity(input.len() / 3); // Write header writer .write_all(&gzip_header.into_header()) .expect("Write error when writing header!"); let mut checksum = checksum::NoChecksum::new(); compress_data_dynamic(input, &mut writer, &mut checksum, options.into()) .expect("Write error when writing compressed data!"); let mut crc = Crc::new(); crc.update(input); writer .write_all(&crc.sum().to_le_bytes()) .expect("Write error when writing checksum!"); writer .write_all(&crc.amt_as_u32().to_le_bytes()) .expect("Write error when writing amt!"); writer } /// Compress the given slice of bytes with DEFLATE compression, including a gzip header and trailer, /// using the default compression level, and a gzip header with default values. /// /// Returns a `Vec<u8>` of the compressed data.
/// /// # Examples /// /// ``` /// use deflate::deflate_bytes_gzip; /// let data = b"This is some test data"; /// let compressed_data = deflate_bytes_gzip(data); /// # let _ = compressed_data; /// ``` #[cfg(feature = "gzip")] pub fn deflate_bytes_gzip(input: &[u8]) -> Vec<u8> { deflate_bytes_gzip_conf(input, Compression::Default, GzBuilder::new()) } #[cfg(test)] mod test { use super::*; use std::io::Write; #[cfg(feature = "gzip")] use test_utils::decompress_gzip; use test_utils::{decompress_to_end, decompress_zlib, get_test_data}; type CO = CompressionOptions; /// Write data to the writer in chunks of chunk_size. fn chunked_write<W: Write>(mut writer: W, data: &[u8], chunk_size: usize) { for chunk in data.chunks(chunk_size) { writer.write_all(&chunk).unwrap(); } } #[test] fn dynamic_string_mem() { let test_data = String::from(" GNU GENERAL PUBLIC LICENSE").into_bytes(); let compressed = deflate_bytes(&test_data); assert!(compressed.len() < test_data.len()); let result = decompress_to_end(&compressed); assert_eq!(test_data, result); } #[test] fn dynamic_string_file() { let input = get_test_data(); let compressed = deflate_bytes(&input); let result = decompress_to_end(&compressed); for (n, (&a, &b)) in input.iter().zip(result.iter()).enumerate() { if a != b { println!("First difference at {}, input: {}, output: {}", n, a, b); println!( "input: {:?}, output: {:?}", &input[n - 3..n + 3], &result[n - 3..n + 3] ); break; } } // Not using assert_eq here deliberately to avoid massive amounts of output spam assert!(input == result); // Check that we actually managed to compress the input assert!(compressed.len() < input.len()); } #[test] fn file_rle() { let input = get_test_data(); let compressed = deflate_bytes_conf(&input, CO::rle()); let result = decompress_to_end(&compressed); assert!(input == result); } #[test] fn file_zlib() { let test_data = get_test_data(); let compressed = deflate_bytes_zlib(&test_data); // { // use std::fs::File; // use std::io::Write; // let mut f = File::create("out.zlib").unwrap(); // f.write_all(&compressed).unwrap(); // } println!("file_zlib compressed(default) length: {}", compressed.len()); let result = decompress_zlib(&compressed); assert!(&test_data == &result); assert!(compressed.len() < test_data.len()); } #[test] fn zlib_short() { let test_data = [10, 10, 10, 10, 10, 55]; roundtrip_zlib(&test_data, CO::default()); } #[test] fn zlib_last_block() { let mut test_data = vec![22; 32768]; test_data.extend(&[5, 2, 55, 11, 12]); roundtrip_zlib(&test_data, CO::default()); } #[test] fn deflate_short() { let test_data = [10, 10, 10, 10, 10, 55]; let compressed = deflate_bytes(&test_data); let result = decompress_to_end(&compressed); assert_eq!(&test_data, result.as_slice()); // If the block type and compression are selected correctly, this should only take 5 bytes.
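// (As a rough sketch with fixed Huffman codes: a 3-bit block header, the literal 10, a // length/distance pair covering the run of 10s, the literal 55 and the 7-bit end-of-block // code add up to roughly 38 bits, which rounds up to 5 bytes.)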
assert_eq!(compressed.len(), 5); } #[cfg(feature = "gzip")] #[test] fn gzip() { let data = get_test_data(); let comment = b"Test"; let compressed = deflate_bytes_gzip_conf( &data, Compression::Default, GzBuilder::new().comment(&comment[..]), ); let (dec, decompressed) = decompress_gzip(&compressed); assert_eq!(dec.comment().unwrap(), comment); assert!(data == decompressed); } fn chunk_test(chunk_size: usize, level: CompressionOptions) { let mut compressed = Vec::with_capacity(32000); let data = get_test_data(); { let mut compressor = write::ZlibEncoder::new(&mut compressed, level); chunked_write(&mut compressor, &data, chunk_size); compressor.finish().unwrap(); } let compressed2 = deflate_bytes_zlib_conf(&data, level); let res = decompress_zlib(&compressed); assert!(res == data); assert_eq!(compressed.len(), compressed2.len()); assert!(compressed == compressed2); } fn writer_chunks_level(level: CompressionOptions) { use input_buffer::BUFFER_SIZE; let ct = |n| chunk_test(n, level); ct(1); ct(50); ct(400); ct(32768); ct(BUFFER_SIZE); ct(50000); ct((32768 * 2) + 258); } #[ignore] #[test] /// Test the writer by inputting data one chunk at a time. fn zlib_writer_chunks() { writer_chunks_level(CompressionOptions::default()); writer_chunks_level(CompressionOptions::fast()); writer_chunks_level(CompressionOptions::rle()); } /// Check that the frequency values don't overflow. #[test] fn frequency_overflow() { let _ = deflate_bytes_conf( &vec![5; 100000], compression_options::CompressionOptions::default(), ); } fn roundtrip_zlib(data: &[u8], level: CompressionOptions) { let compressed = deflate_bytes_zlib_conf(data, level); let res = decompress_zlib(&compressed); if data.len() <= 32 { assert_eq!(res, data, "Failed with level: {:?}", level); } else { assert!(res == data, "Failed with level: {:?}", level); } } fn check_zero(level: CompressionOptions) { roundtrip_zlib(&[], level); } /// Compress with an empty slice. #[test] fn empty_input() { check_zero(CompressionOptions::default()); check_zero(CompressionOptions::fast()); check_zero(CompressionOptions::rle()); } #[test] fn one_and_two_values() { let one = &[1][..]; roundtrip_zlib(one, CO::rle()); roundtrip_zlib(one, CO::fast()); roundtrip_zlib(one, CO::default()); let two = &[5, 6, 7, 8][..]; roundtrip_zlib(two, CO::rle()); roundtrip_zlib(two, CO::fast()); roundtrip_zlib(two, CO::default()); } } deflate-1.0.0/src/lz77.rs000064400000000000000000001306730072674642500132260ustar 00000000000000//! This module contains functionality for doing lz77 compression of data. #![macro_use] use std::cmp; use std::fmt; use std::iter::{self, Iterator}; use std::ops::{Range, RangeFrom}; use std::slice::Iter; use crate::chained_hash_table::{update_hash, ChainedHashTable}; use crate::compress::Flush; #[cfg(test)] use crate::compression_options::{HIGH_LAZY_IF_LESS_THAN, HIGH_MAX_HASH_CHECKS}; use crate::input_buffer::InputBuffer; #[cfg(test)] use crate::lzvalue::{LZType, LZValue}; use crate::matching::longest_match; use crate::output_writer::{BufferStatus, DynamicWriter}; use crate::rle::process_chunk_greedy_rle; const MAX_MATCH: usize = crate::huffman_table::MAX_MATCH as usize; const MIN_MATCH: usize = crate::huffman_table::MIN_MATCH as usize; const NO_RLE: u16 = 43212; /// An enum describing whether we use lazy or greedy matching. #[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum MatchingType { /// Use greedy matching: the matching algorithm simply uses a match right away /// if found.
Greedy, /// Use lazy matching: after finding a match, the next input byte is checked, to see /// if there is a better match starting at that byte. /// /// As a special case, if max_hash_checks is set to 0, compression using only run-length /// (i.e. maximum match distance of 1) is performed instead. Lazy, } impl fmt::Display for MatchingType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { MatchingType::Greedy => write!(f, "Greedy matching"), MatchingType::Lazy => write!(f, "Lazy matching"), } } } /// A struct that contains the hash table, and keeps track of where we are in the input data pub struct LZ77State { /// Struct containing hash chains that will be used to find matches. hash_table: ChainedHashTable, /// True if this is the first window that is being processed. is_first_window: bool, /// Set to true when the last block has been processed. is_last_block: bool, /// How many bytes the last match in the previous window extended into the current one. overlap: usize, /// How many bytes of input the current block contains. current_block_input_bytes: u64, /// The maximum number of hash entries to search. max_hash_checks: u16, /// Only lazy match if we have a match length less than this. lazy_if_less_than: u16, /// Whether to use greedy or lazy parsing. matching_type: MatchingType, /// Keep track of the previous match and byte in case the buffer is full when lazy matching. match_state: ChunkState, /// Keep track of how many bytes in the lookahead were part of a match but have not been /// added to the hash chain yet. bytes_to_hash: usize, /// Keep track of whether sync flush was used. If this is the case, the first two bytes need to be /// hashed. was_synced: bool, } impl LZ77State { /// Creates a new LZ77 state pub fn new( max_hash_checks: u16, lazy_if_less_than: u16, matching_type: MatchingType, ) -> LZ77State { LZ77State { hash_table: ChainedHashTable::new(), is_first_window: true, is_last_block: false, overlap: 0, current_block_input_bytes: 0, max_hash_checks, lazy_if_less_than, matching_type, match_state: ChunkState::new(), bytes_to_hash: 0, was_synced: false, } } /// Resets the state excluding max_hash_checks and lazy_if_less_than pub fn reset(&mut self) { self.hash_table.reset(); self.is_first_window = true; self.is_last_block = false; self.overlap = 0; self.current_block_input_bytes = 0; self.match_state = ChunkState::new(); self.bytes_to_hash = 0 } pub fn set_last(&mut self) { self.is_last_block = true; } /// Is this the last block we are outputting? pub const fn is_last_block(&self) -> bool { self.is_last_block } /// How many bytes of input the current block contains. pub const fn current_block_input_bytes(&self) -> u64 { self.current_block_input_bytes } /// Sets the number of input bytes for the current block to 0. pub fn reset_input_bytes(&mut self) { self.current_block_input_bytes = 0; } /// Is there a buffered byte that has not been output yet? pub const fn pending_byte(&self) -> bool { self.match_state.add } /// Returns 1 if pending_byte is true, 0 otherwise. pub fn pending_byte_as_num(&self) -> usize { // This could be implemented by using `as usize`, as the documentation states this would give // the same result, but it is not clear whether that should be relied upon. if self.match_state.add { 1 } else { 0 } } } const DEFAULT_WINDOW_SIZE: usize = 32768; #[derive(Debug)] /// Status after calling `process_chunk`. pub enum ProcessStatus { /// All the input data was processed. Ok, /// The output buffer was full.
/// /// The argument is the position of the next byte to be checked by `process_chunk`. BufferFull(usize), } #[derive(Debug)] /// A struct to keep track of status between calls of `process_chunk_lazy`. /// /// This is needed as the output buffer might become full before having output all pending data. pub struct ChunkState { /// Length of the last match that was found, if any. current_length: u16, /// Distance of the last match that was found, if any. current_distance: u16, /// The previous byte checked in process_chunk. prev_byte: u8, /// The current byte being checked in process_chunk. cur_byte: u8, /// Whether prev_byte still needs to be output. add: bool, } impl ChunkState { pub fn new() -> ChunkState { ChunkState { current_length: 0, current_distance: 0, prev_byte: 0, cur_byte: 0, add: false, } } } pub const fn buffer_full(position: usize) -> ProcessStatus { ProcessStatus::BufferFull(position) } #[allow(clippy::too_many_arguments)] fn process_chunk( data: &[u8], iterated_data: &Range<usize>, mut match_state: &mut ChunkState, hash_table: &mut ChainedHashTable, writer: &mut DynamicWriter, max_hash_checks: u16, lazy_if_less_than: usize, matching_type: MatchingType, ) -> (usize, ProcessStatus) { let avoid_rle = if cfg!(test) { // Avoid RLE if `lazy_if_less_than` is a specific value. // This is used in some tests, ideally we should probably do this in a less clunky way, // but we use a value here that is higher than the maximum sensible one anyhow, and will // be truncated by deflate_state for calls from outside the library. lazy_if_less_than == NO_RLE as usize } else { false }; match matching_type { MatchingType::Greedy => { process_chunk_greedy(data, iterated_data, hash_table, writer, max_hash_checks) } MatchingType::Lazy => { if max_hash_checks > 0 || avoid_rle { process_chunk_lazy( data, iterated_data, &mut match_state, hash_table, writer, max_hash_checks, lazy_if_less_than, ) } else { // Use the RLE method if max_hash_checks is set to 0. process_chunk_greedy_rle(data, iterated_data, writer) } } } } /// Add the specified number of bytes to the hash table from the iterators /// adding `start` to the position supplied to the hash table. fn add_to_hash_table( bytes_to_add: usize, insert_it: &mut iter::Zip<RangeFrom<usize>, Iter<u8>>, hash_it: &mut Iter<u8>, hash_table: &mut ChainedHashTable, ) { let taker = insert_it.by_ref().take(bytes_to_add); let mut hash_taker = hash_it.by_ref().take(bytes_to_add); // Update the hash manually here to keep it in a register. let mut hash = hash_table.current_hash(); // Advance the iterators and add the bytes we jump over to the hash table and // checksum for (ipos, _) in taker { if let Some(&i_hash_byte) = hash_taker.next() { hash = update_hash(hash, i_hash_byte); hash_table.add_with_hash(ipos, hash); } } // Write the hash back once we are done. hash_table.set_hash(hash); } /// Write the specified literal `byte` to the writer `w`, and return /// `ProcessStatus::BufferFull($pos)` if the buffer is full after writing. /// /// `pos` should indicate the byte to start at in the next call to `process_chunk`. macro_rules! write_literal { ($w:ident, $byte:expr, $pos:expr) => { let b_status = $w.write_literal($byte); if let BufferStatus::Full = b_status { return (0, buffer_full($pos)); } }; } /// If the match is only 3 bytes long and the distance is more than 8 * 1024, it's likely to take /// up more space than it would save.
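/// /// (At such distances the distance code alone needs up to 13 extra bits, so encoding a /// length-3 match can easily cost more bits than simply emitting the three bytes as literals.)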
#[inline] fn match_too_far(match_len: usize, match_dist: usize) -> bool { const TOO_FAR: usize = 8 * 1024; match_len == MIN_MATCH && match_dist > TOO_FAR } /// Create the iterators used when processing through a chunk of data. fn create_iterators<'a>( data: &'a [u8], iterated_data: &Range<usize>, ) -> ( usize, iter::Zip<RangeFrom<usize>, Iter<'a, u8>>, Iter<'a, u8>, ) { let end = cmp::min(data.len(), iterated_data.end); let start = iterated_data.start; let current_chunk = &data[start..end]; let insert_it = (start..).zip(current_chunk.iter()); let hash_it = { let hash_start = if data.len() - start > 2 { start + 2 } else { data.len() }; (&data[hash_start..]).iter() }; (end, insert_it, hash_it) } fn process_chunk_lazy( data: &[u8], iterated_data: &Range<usize>, state: &mut ChunkState, mut hash_table: &mut ChainedHashTable, writer: &mut DynamicWriter, max_hash_checks: u16, lazy_if_less_than: usize, ) -> (usize, ProcessStatus) { let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data); const NO_LENGTH: u16 = 0; // The previous match length, if any. let mut prev_length = state.current_length; // The distance of the previous match if any. let mut prev_distance = state.current_distance; state.current_length = 0; state.current_distance = 0; // The number of bytes past end that was added due to finding a match that extends into // the lookahead window. let mut overlap = 0; // Set to true if we found a match that is equal to or longer than `lazy_if_less_than`, // indicating that we won't lazy match (check for a better match at the next byte). // If we had a good match, carry this over from the previous call. let mut ignore_next = prev_length as usize >= lazy_if_less_than; // This is to output the correct byte in case there is one pending to be output // from the previous call. state.prev_byte = state.cur_byte; // Iterate through the slice, adding literals or length/distance pairs while let Some((position, &b)) = insert_it.next() { state.cur_byte = b; if let Some(&hash_byte) = hash_it.next() { hash_table.add_hash_value(position, hash_byte); // Only lazy match if we have a match shorter than a set value // TODO: This should be cleaned up a bit if !ignore_next { let (mut match_len, match_dist) = { // If there already was a decent match at the previous byte // and we are lazy matching, do fewer match checks in this step. let max_hash_checks = if prev_length >= 32 { max_hash_checks >> 2 } else { max_hash_checks }; // Check if we can find a better match here than the one we had at // the previous byte. longest_match( data, hash_table, position, prev_length as usize, max_hash_checks, ) }; // If the match is only 3 bytes long and very far back, it's probably not worth // outputting. if match_too_far(match_len, match_dist) { match_len = NO_LENGTH as usize; }; if match_len >= lazy_if_less_than { // We found a decent match, so we won't check for a better one at the next byte. ignore_next = true; } state.current_length = match_len as u16; state.current_distance = match_dist as u16; } else { // We already had a decent match, so we don't bother checking for another one. state.current_length = NO_LENGTH; state.current_distance = 0; // Make sure we check again next time. ignore_next = false; }; if prev_length >= state.current_length && prev_length >= MIN_MATCH as u16 { // The previous match was better so we add it. // Casting note: length and distance are already bounded by the longest match // function. Usize is just used for convenience.
let b_status = writer.write_length_distance(prev_length as u16, prev_distance as u16); // We add the bytes to the hash table and checksum. // Since we've already added two of them, we need to add two fewer than // the length. let bytes_to_add = prev_length - 2; add_to_hash_table( bytes_to_add as usize, &mut insert_it, &mut hash_it, &mut hash_table, ); // If the match is longer than the current window, we have to note how many // bytes we overlap, since we don't need to do any matching on these bytes // in the next call of this function. // We don't have to worry about setting overlap to 0 if this is false, as the // function will stop after this condition is true, and overlap is not altered // elsewhere. if position + prev_length as usize > end { // We need to subtract 1 since the byte at pos is also included. overlap = position + prev_length as usize - end - 1; }; state.add = false; // Note that there is no current match. state.current_length = 0; state.current_distance = 0; if let BufferStatus::Full = b_status { // MATCH(lazy) return (overlap, buffer_full(position + prev_length as usize - 1)); } ignore_next = false; } else if state.add { // We found a better match (or there was no previous match) // so output the previous byte. // BETTER OR NO MATCH write_literal!(writer, state.prev_byte, position + 1); } else { state.add = true } prev_length = state.current_length; prev_distance = state.current_distance; state.prev_byte = b; } else { // If there is a match at this point, it will not have been added, so we need to add it. if prev_length >= MIN_MATCH as u16 { let b_status = writer.write_length_distance(prev_length as u16, prev_distance as u16); state.current_length = 0; state.current_distance = 0; state.add = false; debug_assert!((position + prev_length as usize) >= end - 1); // Needed to note overlap as we can end up with the last window containing the rest // of the match. overlap = (position + prev_length as usize) .saturating_sub(end) .saturating_sub(1); // TODO: Not sure if we need to signal that the buffer is full here. // It's only needed in the case of syncing. if let BufferStatus::Full = b_status { // TODO: These bytes should be hashed when doing a sync flush. // This can't be done here as the new input data does not exist yet. return (overlap, buffer_full(end)); } else { return (overlap, ProcessStatus::Ok); } }; if state.add { // We may still have a leftover byte at this point, so we add it here if needed. state.add = false; // ADD write_literal!(writer, state.prev_byte, position); }; // We are at the last two bytes we want to add, so there is no point // searching for matches here. // AFTER ADD write_literal!(writer, b, position + 1); } } (overlap, ProcessStatus::Ok) } fn process_chunk_greedy( data: &[u8], iterated_data: &Range<usize>, mut hash_table: &mut ChainedHashTable, writer: &mut DynamicWriter, max_hash_checks: u16, ) -> (usize, ProcessStatus) { let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data); const NO_LENGTH: usize = 0; // The number of bytes past end that was added due to finding a match that extends into // the lookahead window. let mut overlap = 0; // Iterate through the slice, adding literals or length/distance pairs. while let Some((position, &b)) = insert_it.next() { if let Some(&hash_byte) = hash_it.next() { hash_table.add_hash_value(position, hash_byte); // TODO: This should be cleaned up a bit.
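// Greedy matching: in contrast to the lazy variant above, the longest match found at the // current position is emitted right away, without checking whether the next byte would have // yielded a better match.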
let (match_len, match_dist) = { longest_match(data, hash_table, position, NO_LENGTH, max_hash_checks) }; if match_len >= MIN_MATCH as usize && !match_too_far(match_len, match_dist) { // Casting note: length and distance are already bounded by the longest match // function. Usize is just used for convenience. let b_status = writer.write_length_distance(match_len as u16, match_dist as u16); // We add the bytes to the hash table and checksum. // Since we've already added one of them, we need to add one fewer than // the length. let bytes_to_add = match_len - 1; add_to_hash_table(bytes_to_add, &mut insert_it, &mut hash_it, &mut hash_table); // If the match is longer than the current window, we have to note how many // bytes we overlap, since we don't need to do any matching on these bytes // in the next call of this function. if position + match_len > end { // We need to subtract 1 since the byte at pos is also included. overlap = position + match_len - end; }; if let BufferStatus::Full = b_status { // MATCH return (overlap, buffer_full(position + match_len)); } } else { // NO MATCH write_literal!(writer, b, position + 1); } } else { // We are at the last two bytes we want to add, so there is no point // searching for matches here. // END write_literal!(writer, b, position + 1); } } (overlap, ProcessStatus::Ok) } #[derive(Eq, PartialEq, Clone, Copy, Debug)] pub enum LZ77Status { /// Waiting for more input before doing any processing NeedInput, /// The output buffer is full, so the current block needs to be ended so the /// buffer can be flushed. EndBlock, /// All pending data has been processed. Finished, } #[cfg(test)] pub fn lz77_compress_block_finish( data: &[u8], state: &mut LZ77State, buffer: &mut InputBuffer, mut writer: &mut DynamicWriter, ) -> (usize, LZ77Status) { let (consumed, status, _) = lz77_compress_block(data, state, buffer, &mut writer, Flush::Finish); (consumed, status) } /// Compress a slice with lz77 compression. /// /// This function processes one window at a time, and returns when there is no input left, /// or it determines it's time to end a block. /// /// Returns the number of bytes of the input that were consumed, a status describing /// whether there is no input, it's time to finish, or it's time to end the block, and the position /// of the first byte in the input buffer that has not been output (but may have been checked for /// matches). pub fn lz77_compress_block( data: &[u8], state: &mut LZ77State, buffer: &mut InputBuffer, mut writer: &mut DynamicWriter, flush: Flush, ) -> (usize, LZ77Status, usize) { // Currently we only support the maximum window size let window_size = DEFAULT_WINDOW_SIZE; // Indicates whether we should try to process all the data including the lookahead, or if we // should wait until we have at least one window size of data before doing anything. let finish = flush == Flush::Finish || flush == Flush::Sync; let sync = flush == Flush::Sync; let mut current_position = 0; // The current status of the encoding. let mut status = LZ77Status::EndBlock; // Whether to warm up the hash chain with the first two values. let mut add_initial = true; // If we have synced, add the first two bytes to the hash as they couldn't be added before.
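// (Hashing a position requires the lookahead bytes that follow it, so the last positions of a // sync-flushed chunk can only be added to the hash table once the next input arrives.)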
if state.was_synced { if buffer.current_end() > 2 { let pos_add = buffer.current_end() - 2; for (n, &b) in data.iter().take(2).enumerate() { state.hash_table.add_hash_value(n + pos_add, b); } add_initial = false; } state.was_synced = false; } // Add data to the input buffer and keep a reference to the slice of data not added yet. let mut remaining_data = buffer.add_data(data); loop { // Note if there is a pending byte from the previous call to process_chunk, // so we get the block input size right. let pending_previous = state.pending_byte_as_num(); assert!(writer.buffer_length() <= (window_size * 2)); // Don't do anything until we are either flushing, or we have at least one window of // data. if buffer.current_end() >= (window_size * 2) + MAX_MATCH || finish { if state.is_first_window { if buffer.get_buffer().len() >= 2 && add_initial && state.current_block_input_bytes == 0 { let b = buffer.get_buffer(); // Warm up the hash with the first two values, so we can find matches at // index 0. state.hash_table.add_initial_hash_values(b[0], b[1]); add_initial = false; } } else if buffer.current_end() >= window_size + 2 { for (n, &h) in buffer.get_buffer()[window_size + 2..] .iter() .enumerate() .take(state.bytes_to_hash) { state.hash_table.add_hash_value(window_size + n, h); } state.bytes_to_hash = 0; } let window_start = if state.is_first_window { 0 } else { window_size }; let start = state.overlap + window_start; let end = cmp::min(window_size + window_start, buffer.current_end()); let (overlap, p_status) = process_chunk( buffer.get_buffer(), &(start..end), &mut state.match_state, &mut state.hash_table, &mut writer, state.max_hash_checks, state.lazy_if_less_than as usize, state.matching_type, ); state.bytes_to_hash = overlap; if let ProcessStatus::BufferFull(written) = p_status { state.current_block_input_bytes += (written - start + pending_previous - state.pending_byte_as_num()) as u64; // If the buffer is full, return and end the block. // If overlap is non-zero, the buffer was full after outputting the last byte, // otherwise we have to skip to the point in the buffer where we stopped in the // next call. state.overlap = if overlap > 0 { if !state.is_first_window { // If we are at the end of the window, make sure we slide the buffer and the // hash table. if state.max_hash_checks > 0 { state.hash_table.slide(window_size); } remaining_data = buffer.slide(remaining_data.unwrap_or(&[])); } else { state.is_first_window = false; } overlap } else { written - window_start }; current_position = written - state.pending_byte_as_num(); // Status is already EndBlock at this point. // status = LZ77Status::EndBlock; break; } state.current_block_input_bytes += (end - start + overlap + pending_previous - state.pending_byte_as_num()) as u64; // The buffer is not full, but we still need to note if there is any overlap into the // next window. state.overlap = overlap; if (state.is_first_window || remaining_data.is_none()) && finish && end >= buffer.current_end() { current_position = if state.is_first_window { end - state.pending_byte_as_num() } else { debug_assert!( !state.pending_byte(), "Bug! Ended compression with a pending byte!" ); buffer.current_end() }; // We stopped before or at the window size, so we are at the end. if !sync { // If we are finishing and not syncing, we simply indicate that we are done. state.set_last(); state.is_first_window = false; } else { // For sync flushing we need to slide the buffer and the hash chains so that the // next call to this function starts at the right place.
// There won't be any overlap, since when syncing, we process to the end of the // pending data. state.overlap = if state.is_first_window { end } else { buffer.current_end() - window_size }; state.was_synced = true; } status = LZ77Status::Finished; break; } else if state.is_first_window { state.is_first_window = false; } else { // We are not at the end, so slide and continue. // We slide the hash table back to make space for new hash values // We only need to remember 2^15 bytes back (the maximum distance allowed by the // deflate spec). if state.max_hash_checks > 0 { state.hash_table.slide(window_size); } // Also slide the buffer, discarding data we no longer need and adding new data. remaining_data = buffer.slide(remaining_data.unwrap_or(&[])); } } else { // The caller has not indicated that they want to finish or flush, and there is less // than a window + lookahead of new data, so we wait for more. status = LZ77Status::NeedInput; break; } } ( data.len() - remaining_data.unwrap_or(&[]).len(), status, current_position, ) } #[cfg(test)] pub fn decompress_lz77(input: &[LZValue]) -> Vec<u8> { decompress_lz77_with_backbuffer(input, &[]) } #[cfg(test)] pub fn decompress_lz77_with_backbuffer(input: &[LZValue], back_buffer: &[u8]) -> Vec<u8> { let mut output = Vec::new(); for p in input { match p.value() { LZType::Literal(l) => output.push(l), LZType::StoredLengthDistance(l, d) => { // We found a match, so we have to get the data that the match refers to. let d = d as usize; let prev_output_len = output.len(); // Get match data from the back buffer if the match extends that far. let consumed = if d > output.len() { let into_back_buffer = d - output.len(); assert!( into_back_buffer <= back_buffer.len(), "ERROR: Attempted to refer to a match in non-existing data!\ into_back_buffer: {}, back_buffer len {}, d {}, l {:?}", into_back_buffer, back_buffer.len(), d, l ); let start = back_buffer.len() - into_back_buffer; let end = cmp::min(back_buffer.len(), start + l.actual_length() as usize); output.extend_from_slice(&back_buffer[start..end]); end - start } else { 0 }; // Get match data from the currently decompressed data.
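// Copy one byte at a time, since a match is allowed to overlap its own output // (e.g. a run-length style match with distance 1 and a length greater than 1).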
let start = prev_output_len.saturating_sub(d); let mut n = 0; while n < (l.actual_length() as usize).saturating_sub(consumed) { let b = output[start + n]; output.push(b); n += 1; } } } } output } #[cfg(test)] pub struct TestStruct { state: LZ77State, buffer: InputBuffer, writer: DynamicWriter, } #[cfg(test)] impl TestStruct { fn new() -> TestStruct { TestStruct::with_config( HIGH_MAX_HASH_CHECKS, HIGH_LAZY_IF_LESS_THAN, MatchingType::Lazy, ) } fn with_config( max_hash_checks: u16, lazy_if_less_than: u16, matching_type: MatchingType, ) -> TestStruct { TestStruct { state: LZ77State::new(max_hash_checks, lazy_if_less_than, matching_type), buffer: InputBuffer::empty(), writer: DynamicWriter::new(), } } fn compress_block(&mut self, data: &[u8], flush: bool) -> (usize, LZ77Status, usize) { lz77_compress_block( data, &mut self.state, &mut self.buffer, &mut self.writer, if flush { Flush::Finish } else { Flush::None }, ) } } #[cfg(test)] pub fn lz77_compress(data: &[u8]) -> Option<Vec<LZValue>> { lz77_compress_conf( data, HIGH_MAX_HASH_CHECKS, HIGH_LAZY_IF_LESS_THAN, MatchingType::Lazy, ) } /// Compress a slice, not storing frequency information /// /// This is a convenience function for compression with fixed Huffman values /// Only used in tests for now #[cfg(test)] pub fn lz77_compress_conf( data: &[u8], max_hash_checks: u16, lazy_if_less_than: u16, matching_type: MatchingType, ) -> Option<Vec<LZValue>> { let mut test_boxed = Box::new(TestStruct::with_config( max_hash_checks, lazy_if_less_than, matching_type, )); let mut out = Vec::<LZValue>::with_capacity(data.len() / 3); { let test = test_boxed.as_mut(); let mut slice = data; while !test.state.is_last_block { let bytes_written = lz77_compress_block_finish( slice, &mut test.state, &mut test.buffer, &mut test.writer, ) .0; slice = &slice[bytes_written..]; out.extend(test.writer.get_buffer()); test.writer.clear(); } } Some(out) } #[cfg(test)] mod test { use super::*; use crate::chained_hash_table::WINDOW_SIZE; use crate::compression_options::DEFAULT_LAZY_IF_LESS_THAN; use crate::lzvalue::{ld, lit, LZType, LZValue}; use crate::output_writer::MAX_BUFFER_LENGTH; use crate::test_utils::get_test_data; /// Helper function to print the output from the lz77 compression function fn print_output(input: &[LZValue]) { let mut output = vec![]; for l in input { match l.value() { LZType::Literal(l) => output.push(l), LZType::StoredLengthDistance(l, d) => { output.extend(format!("<L {}>", l.actual_length()).into_bytes()); output.extend(format!("<D {}>", d).into_bytes()) } } } println!("\"{}\"", String::from_utf8(output).unwrap()); } /// Test that a short string from an example on SO compresses correctly #[test] fn compress_short() { let test_bytes = String::from("Deflate late").into_bytes(); let res = lz77_compress(&test_bytes).unwrap(); let decompressed = decompress_lz77(&res); assert_eq!(test_bytes, decompressed); assert_eq!(*res.last().unwrap(), LZValue::length_distance(4, 5)); } /// Test that compression is working for a longer file #[test] fn compress_long() { let input = get_test_data(); let compressed = lz77_compress(&input).unwrap(); assert!(compressed.len() < input.len()); let decompressed = decompress_lz77(&compressed); println!("compress_long length: {}", input.len()); // This is to check where the compression fails, if it does for (n, (&a, &b)) in input.iter().zip(decompressed.iter()).enumerate() { if a != b { println!("First difference at {}, input: {}, output: {}", n, a, b); break; } } assert_eq!(input.len(), decompressed.len()); assert!(&decompressed == &input); } /// Check that lazy
matching is working as intended #[test] fn lazy() { // We want to match on `badger` rather than `nba` as it is longer // let data = b" nba nbadg badger nbadger"; let data = b"nba badger nbadger"; let compressed = lz77_compress(data).unwrap(); let test = compressed[compressed.len() - 1]; if let LZType::StoredLengthDistance(l, _) = test.value() { assert_eq!(l.actual_length(), 6); } else { print_output(&compressed); panic!(); } } fn roundtrip(data: &[u8]) { let compressed = super::lz77_compress(&data).unwrap(); let decompressed = decompress_lz77(&compressed); assert!(decompressed == data); } // Check that data with the exact window size is working properly #[test] #[allow(unused)] fn exact_window_size() { use std::io::Write; let mut data = vec![0; WINDOW_SIZE]; roundtrip(&data); { data.write(&[22; WINDOW_SIZE]); } roundtrip(&data); { data.write(&[55; WINDOW_SIZE]); } roundtrip(&data); } /// Test that matches at the window border are working correctly #[test] fn border() { use crate::chained_hash_table::WINDOW_SIZE; let mut data = vec![35; WINDOW_SIZE]; data.extend(b"Test"); let compressed = super::lz77_compress(&data).unwrap(); assert!(compressed.len() < data.len()); let decompressed = decompress_lz77(&compressed); assert_eq!(decompressed.len(), data.len()); assert!(decompressed == data); } #[test] fn border_multiple_blocks() { use crate::chained_hash_table::WINDOW_SIZE; let mut data = vec![0; (WINDOW_SIZE * 2) + 50]; data.push(1); let compressed = super::lz77_compress(&data).unwrap(); assert!(compressed.len() < data.len()); let decompressed = decompress_lz77(&compressed); assert_eq!(decompressed.len(), data.len()); assert!(decompressed == data); } #[test] fn compress_block_status() { use crate::input_buffer::InputBuffer; let data = b"Test data data"; let mut writer = DynamicWriter::new(); let mut buffer = InputBuffer::empty(); let mut state = LZ77State::new(4096, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy); let status = lz77_compress_block_finish(data, &mut state, &mut buffer, &mut writer); assert_eq!(status.1, LZ77Status::Finished); assert!(&buffer.get_buffer()[..data.len()] == data); assert_eq!(buffer.current_end(), data.len()); } #[test] fn compress_block_multiple_windows() { use crate::input_buffer::InputBuffer; use crate::output_writer::DynamicWriter; let data = get_test_data(); assert!(data.len() > (WINDOW_SIZE * 2) + super::MAX_MATCH); let mut writer = DynamicWriter::new(); let mut buffer = InputBuffer::empty(); let mut state = LZ77State::new(0, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy); let (bytes_consumed, status) = lz77_compress_block_finish(&data, &mut state, &mut buffer, &mut writer); assert_eq!( buffer.get_buffer().len(), (WINDOW_SIZE * 2) + super::MAX_MATCH ); assert_eq!(status, LZ77Status::EndBlock); let buf_len = buffer.get_buffer().len(); assert!(buffer.get_buffer()[..] 
== data[..buf_len]); writer.clear(); let (_, status) = lz77_compress_block_finish( &data[bytes_consumed..], &mut state, &mut buffer, &mut writer, ); assert_eq!(status, LZ77Status::EndBlock); } #[test] fn multiple_inputs() { let data = b"Badger badger bababa test data 25 asfgestghresjkgh"; let comp1 = lz77_compress(data).unwrap(); let comp2 = { const SPLIT: usize = 25; let first_part = &data[..SPLIT]; let second_part = &data[SPLIT..]; let mut state = TestStruct::new(); let (bytes_written, status, _) = state.compress_block(first_part, false); assert_eq!(bytes_written, first_part.len()); assert_eq!(status, LZ77Status::NeedInput); let (bytes_written, status, _) = state.compress_block(second_part, true); assert_eq!(bytes_written, second_part.len()); assert_eq!(status, LZ77Status::Finished); Vec::from(state.writer.get_buffer()) }; assert!(comp1 == comp2); } #[test] /// Test that the exit from process_chunk when the buffer is full is working correctly. fn buffer_fill() { let data = get_test_data(); // The comments above these calls refer to the positions with the // corresponding comments in process_chunk_{greedy/lazy}. // POS BETTER OR NO MATCH buffer_test_literals(&data); // POS END // POS NO MATCH buffer_test_last_bytes(MatchingType::Greedy, &data); // POS ADD // POS AFTER ADD buffer_test_last_bytes(MatchingType::Lazy, &data); // POS MATCH buffer_test_match(MatchingType::Greedy); // POS MATCH(lazy) buffer_test_match(MatchingType::Lazy); // POS END buffer_test_add_end(&data); } /// Test buffer fill when a byte is added due to no match being found. fn buffer_test_literals(data: &[u8]) { let mut state = TestStruct::with_config(0, NO_RLE, MatchingType::Lazy); let (bytes_consumed, status, position) = state.compress_block(&data, false); // There should be enough data for the block to have ended. assert_eq!(status, LZ77Status::EndBlock); assert!(bytes_consumed <= (WINDOW_SIZE * 2) + MAX_MATCH); // The buffer should be full. assert_eq!(state.writer.get_buffer().len(), MAX_BUFFER_LENGTH); assert_eq!(position, state.writer.get_buffer().len()); // Since all literals have been input, the block should have the exact number of litlens // as there were input bytes. assert_eq!( state.state.current_block_input_bytes() as usize, MAX_BUFFER_LENGTH ); state.state.reset_input_bytes(); let mut out = decompress_lz77(state.writer.get_buffer()); state.writer.clear(); // The buffer should now be cleared. assert_eq!(state.writer.get_buffer().len(), 0); assert!(data[..out.len()] == out[..]); let _ = state.compress_block(&data[bytes_consumed..], false); // We should have some new data in the buffer at this point. assert!(state.writer.get_buffer().len() > 0); assert_eq!( state.state.current_block_input_bytes() as usize, MAX_BUFFER_LENGTH ); out.extend_from_slice(&decompress_lz77(state.writer.get_buffer())); assert!(data[..out.len()] == out[..]); } /// Test buffer fill at the last two bytes that are not hashed. fn buffer_test_last_bytes(matching_type: MatchingType, data: &[u8]) { const BYTES_USED: usize = MAX_BUFFER_LENGTH; assert!( &data[..BYTES_USED] == &decompress_lz77( &lz77_compress_conf(&data[..BYTES_USED], 0, NO_RLE, matching_type,).unwrap() )[..] ); assert!( &data[..BYTES_USED + 1] == &decompress_lz77( &lz77_compress_conf(&data[..BYTES_USED + 1], 0, NO_RLE, matching_type,) .unwrap() )[..] ); } /// Test buffer fill when buffer is full at a match. fn buffer_test_match(matching_type: MatchingType) { // TODO: Also test this for the second block to make sure // buffer is slid.
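// Fill the writer to four values below capacity, so that the three literals and the final // length/distance pair of the test input land exactly on the buffer-full boundary.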
let mut state = TestStruct::with_config(1, 0, matching_type); for _ in 0..MAX_BUFFER_LENGTH - 4 { assert!(state.writer.write_literal(0) == BufferStatus::NotFull); } state.compress_block(&[1, 2, 3, 1, 2, 3, 4], true); assert!(*state.writer.get_buffer().last().unwrap() == LZValue::length_distance(3, 3)); } /// Test buffer fill for the lazy match algorithm when adding a pending byte at the end. fn buffer_test_add_end(_data: &[u8]) { // This is disabled while the buffer size has not been stabilized. /* let mut state = TestStruct::with_config(DEFAULT_MAX_HASH_CHECKS, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy); // For the test file, this is how much data needs to be added to get the buffer // full at the right spot to test that this buffer full exit is working correctly. for i in 0..31743 { assert!(state.writer.write_literal(0) == BufferStatus::NotFull, "Buffer pos: {}", i); } let dec = decompress_lz77(&state.writer.get_buffer()[pos..]); assert!(dec.len() > 0); assert!(dec[..] == data[..dec.len()]); */ } /// Check that decompressing lz77-data that refers to the back-buffer works. #[test] fn test_decompress_with_backbuffer() { let bb = vec![5; WINDOW_SIZE]; let lz_compressed = [lit(2), lit(4), ld(4, 20), lit(1), lit(1), ld(5, 10)]; let dec = decompress_lz77_with_backbuffer(&lz_compressed, &bb); // ------------l2 l4 <-ld4,20-> l1 l1 <---ld5,10--> assert!(dec == [2, 4, 5, 5, 5, 5, 1, 1, 5, 5, 2, 4, 5]); } } #[cfg(all(test, feature = "benchmarks"))] mod bench { use super::lz77_compress; use test_std::Bencher; use test_utils::get_test_data; #[bench] fn test_file_zlib_lz77_only(b: &mut Bencher) { let test_data = get_test_data(); b.iter(|| lz77_compress(&test_data)); } } deflate-1.0.0/src/lzvalue.rs000064400000000000000000000056260072674642500141060ustar 00000000000000#[cfg(test)] use crate::huffman_table::MAX_MATCH; use crate::huffman_table::{MAX_DISTANCE, MIN_MATCH}; #[derive(Copy, Clone, Eq, PartialEq, Debug)] pub struct StoredLength { length: u8, } impl StoredLength { #[cfg(test)] pub fn from_actual_length(length: u16) -> StoredLength { assert!(length <= MAX_MATCH && length >= MIN_MATCH); StoredLength { length: (length - MIN_MATCH) as u8, } } pub const fn new(stored_length: u8) -> StoredLength { StoredLength { length: stored_length, } } pub const fn stored_length(&self) -> u8 { self.length } #[cfg(test)] pub fn actual_length(&self) -> u16 { u16::from(self.length) + MIN_MATCH } } #[derive(Copy, Clone, Eq, PartialEq, Debug)] pub enum LZType { Literal(u8), StoredLengthDistance(StoredLength, u16), } #[derive(Copy, Clone, Eq, PartialEq, Debug)] pub struct LZValue { litlen: u8, distance: u16, } impl LZValue { #[inline] pub const fn literal(value: u8) -> LZValue { LZValue { litlen: value, distance: 0, } } /// Create a length-distance pair. #[inline] pub fn length_distance(length: u16, distance: u16) -> LZValue { // TODO: Enforce min/max without too much perf penalty.
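// A distance of 0 is used internally to mark literals (see `value`), so a real match must // always carry a non-zero distance.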
debug_assert!(distance > 0 && distance <= MAX_DISTANCE); let stored_length = (length - MIN_MATCH) as u8; LZValue { litlen: stored_length, distance, } } #[inline] pub fn value(&self) -> LZType { if self.distance != 0 { LZType::StoredLengthDistance(StoredLength::new(self.litlen), self.distance) } else { LZType::Literal(self.litlen) } } } #[cfg(test)] pub fn lit(l: u8) -> LZValue { LZValue::literal(l) } #[cfg(test)] pub fn ld(l: u16, d: u16) -> LZValue { LZValue::length_distance(l, d) } #[cfg(test)] mod test { use super::*; use crate::huffman_table::{MAX_DISTANCE, MAX_MATCH, MIN_DISTANCE, MIN_MATCH}; #[test] fn lzvalue() { for i in 0..255 as usize + 1 { let v = LZValue::literal(i as u8); if let LZType::Literal(n) = v.value() { assert_eq!(n as usize, i); } else { panic!(); } } for i in MIN_MATCH..MAX_MATCH + 1 { let v = LZValue::length_distance(i, 5); if let LZType::StoredLengthDistance(l, _) = v.value() { assert_eq!(l.actual_length(), i); } else { panic!(); } } for i in MIN_DISTANCE..MAX_DISTANCE + 1 { let v = LZValue::length_distance(5, i); if let LZType::StoredLengthDistance(_, d) = v.value() { assert_eq!(d, i); } else { panic!("Failed to get distance {}", i); } } } } deflate-1.0.0/src/matching.rs000064400000000000000000000354710072674642500142160ustar 00000000000000use std::cmp; use crate::chained_hash_table::{ChainedHashTable, WINDOW_SIZE}; const MAX_MATCH: usize = crate::huffman_table::MAX_MATCH as usize; #[cfg(test)] const MIN_MATCH: usize = crate::huffman_table::MIN_MATCH as usize; /// Get the length of the checked match /// The function returns the number of bytes at and including `current_pos` that are the same as the /// ones at `pos_to_check` #[inline] pub fn get_match_length(data: &[u8], current_pos: usize, pos_to_check: usize) -> usize { // Unsafe version using unaligned loads for comparison. // Faster when benching the matching function alone, // but not as significant when running the full thing. /* type Comp = u64; use std::mem::size_of; let max = cmp::min(data.len() - current_pos, MAX_MATCH); let mut left = max; let s = size_of::<Comp>(); unsafe { let mut cur = data.as_ptr().offset(current_pos as isize); let mut tc = data.as_ptr().offset(pos_to_check as isize); while left >= s && (*(cur as *const Comp) == *(tc as *const Comp)) { left -= s; cur = cur.offset(s as isize); tc = tc.offset(s as isize); } while left > 0 && *cur == *tc { left -= 1; cur = cur.offset(1); tc = tc.offset(1); } } max - left */ // Slightly faster than naive in single bench. // Does not use unaligned loads. // let l = cmp::min(MAX_MATCH, data.len() - current_pos); // let a = unsafe{&data.get_unchecked(current_pos..current_pos + l)}; // let b = unsafe{&data.get_unchecked(pos_to_check..)}; // let mut len = 0; // for (l, r) in a // .iter() // .zip(b.iter()) { // if *l == *r { // len += 1; // continue; // } else { // break; // } // } // len as usize // Naive version data[current_pos..] .iter() .zip(data[pos_to_check..].iter()) .take(MAX_MATCH) .take_while(|&(&a, &b)| a == b) .count() } /// Try finding the position and length of the longest match in the input data. /// # Returns /// (length, distance from position) /// If no match is found that was better than `prev_length` or at all, or we are at the start, /// the length value returned will be 0. /// /// # Arguments: /// `data`: The data to search in. /// `hash_table`: Hash table to use for searching. /// `position`: The position in the data to match against. /// `prev_length`: The length of the previous `longest_match` check to compare against.
/// `max_hash_checks`: The maximum number of matching hash chain positions to check. pub fn longest_match( data: &[u8], hash_table: &ChainedHashTable, position: usize, prev_length: usize, max_hash_checks: u16, ) -> (usize, usize) { // debug_assert_eq!(position, hash_table.current_head() as usize); // If we already have a match at the maximum length, // or we can't grow further, we stop here. if prev_length >= MAX_MATCH || position + prev_length >= data.len() { return (0, 0); } let limit = if position > WINDOW_SIZE { position - WINDOW_SIZE } else { 0 }; // Make sure the length is at least one to simplify the matching code, as // otherwise the matching code might underflow. let prev_length = cmp::max(prev_length, 1); let max_length = cmp::min(data.len() - position, MAX_MATCH); // The position in the hash chain we are currently checking. let mut current_head = position; // The best match length we've found so far, and its distance. let mut best_length = prev_length; let mut best_distance = 0; // The position of the previous value in the hash chain. let mut prev_head; for _ in 0..max_hash_checks { prev_head = current_head; current_head = hash_table.get_prev(current_head) as usize; if current_head >= prev_head || current_head < limit { // If the current hash chain value refers to itself, or is referring to // a value that's higher (we only move backwards through the chain), // we are at the end and can stop. break; } // We only check further if the match length can actually increase // Checking if the end byte and the potential next byte match is generally // more likely to give a quick answer rather than checking from the start first, given // that the hashes match. // If there is no previous match, best_length will be 1 and the first two bytes will // be checked instead. // Since we've made sure best_length is always at least 1, this shouldn't underflow. if data[position + best_length - 1..=position + best_length] == data[current_head + best_length - 1..=current_head + best_length] { // Actually check how many bytes match. // At the moment this will check the two bytes we just checked again, // though adding code for skipping these bytes may not result in any speed // gain due to the added complexity. let length = get_match_length(data, position, current_head); if length > best_length { best_length = length; best_distance = position - current_head; if length == max_length { // We are at the max length, so there is no point // searching any longer break; } } } } if best_length > prev_length { (best_length, best_distance) } else { (0, 0) } } /// Try finding the position and length of the longest match in the input data using the fast zlib /// hash skipping algorithm. /// # Returns /// (length, distance from position) /// If no match is found that was better than `prev_length` or at all, or we are at the start, /// the length value returned will be 0. /// /// # Arguments: /// `data`: The data to search in. /// `hash_table`: Hash table to use for searching. /// `position`: The position in the data to match against. /// `prev_length`: The length of the previous `longest_match` check to compare against. /// `max_hash_checks`: The maximum number of matching hash chain positions to check.
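/// /// After a match is found, the search uses `farthest_next` to jump to the hash chain of the /// position inside the match whose next chain entry lies farthest back, skipping checks that /// cannot improve on the current match.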
#[cfg(test)] pub fn longest_match_fast( data: &[u8], hash_table: &ChainedHashTable, position: usize, prev_length: usize, max_hash_checks: u16, ) -> (usize, usize) { // debug_assert_eq!(position, hash_table.current_head() as usize); // If we already have a match at the maximum length, // or we can't grow further, we stop here. if prev_length >= MAX_MATCH || position + prev_length >= data.len() { return (0, 0); } let limit = if position > WINDOW_SIZE { position - WINDOW_SIZE } else { 0 }; // Make sure the length is at least one to simplify the matching code, as // otherwise the matching code might underflow. let prev_length = cmp::max(prev_length, 1); let max_length = cmp::min(data.len() - position, MAX_MATCH); // The position in the hash chain we are currently checking. let mut current_head = position; // The best match length we've found so far, and its distance. let mut best_length = prev_length; let mut best_distance = 0; // The offset from the start of the match of the hash chain we are traversing. let mut offset = 0; // The position of the previous value in the hash chain. let mut prev_head; for _ in 0..max_hash_checks { prev_head = current_head; current_head = hash_table.get_prev(current_head) as usize; if current_head >= prev_head || current_head < limit + offset { // If the current hash chain value refers to itself, or is referring to // a value that's higher (we only move backwards through the chain), // we are at the end and can stop. break; } let offset_head = current_head - offset; // We only check further if the match length can actually increase // Checking if the end byte and the potential next byte match is generally // more likely to give a quick answer rather than checking from the start first, given // that the hashes match. // If there is no previous match, best_length will be 1 and the first two bytes will // be checked instead. // Since we've made sure best_length is always at least 1, this shouldn't underflow. if data[position + best_length - 1..position + best_length + 1] == data[offset_head + best_length - 1..offset_head + best_length + 1] { // Actually check how many bytes match. // At the moment this will check the two bytes we just checked again, // though adding code for skipping these bytes may not result in any speed // gain due to the added complexity. let length = get_match_length(data, position, offset_head); if length > best_length { best_length = length; best_distance = position - offset_head; if length == max_length { // We are at the max length, so there is no point // searching any longer break; } // Find the position in the match where the next hash position is the furthest away. // By moving to a different hash chain we can potentially skip a lot of checks, // saving time. // We avoid doing this for matches that extend past the starting position, as // those will contain positions that are not in the hash table yet. if best_distance > best_length { offset = hash_table.farthest_next(offset_head, length); current_head = offset_head + offset; } } } } if best_length > prev_length { (best_length, best_distance) } else { (0, 0) } } // Get the longest match from the current position of the hash table.
#[inline] #[cfg(test)] pub fn longest_match_current(data: &[u8], hash_table: &ChainedHashTable) -> (usize, usize) { use crate::compression_options::MAX_HASH_CHECKS; longest_match( data, hash_table, hash_table.current_head() as usize, MIN_MATCH as usize - 1, MAX_HASH_CHECKS, ) } #[cfg(test)] mod test { use super::{get_match_length, longest_match, longest_match_fast}; use crate::chained_hash_table::{filled_hash_table, ChainedHashTable, HASH_BYTES}; /// Test that match lengths are calculated correctly #[test] fn match_length() { let test_arr = [5u8, 5, 5, 5, 5, 9, 9, 2, 3, 5, 5, 5, 5, 5]; let l = get_match_length(&test_arr, 9, 0); assert_eq!(l, 5); let l2 = get_match_length(&test_arr, 9, 7); assert_eq!(l2, 0); let l3 = get_match_length(&test_arr, 10, 0); assert_eq!(l3, 4); } /// Test that we get the longest of the matches #[test] fn get_longest_match() { let test_data = b"xTest data, Test_data,zTest data"; let hash_table = filled_hash_table(&test_data[..23 + 1 + HASH_BYTES - 1]); let (length, distance) = super::longest_match_current(test_data, &hash_table); // We check that we get the longest match, rather than the shorter, but closer one. assert_eq!(distance, 22); assert_eq!(length, 9); let test_arr2 = [ 10u8, 10, 10, 10, 10, 10, 10, 10, 2, 3, 5, 10, 10, 10, 10, 10, ]; let hash_table = filled_hash_table(&test_arr2[..HASH_BYTES + 1 + 1 + 2]); let (length, distance) = super::longest_match_current(&test_arr2, &hash_table); assert_eq!(distance, 1); assert_eq!(length, 4); } /// Make sure we can get a match at index zero #[test] fn match_index_zero() { let test_data = b"AAAAAAA"; let mut hash_table = ChainedHashTable::from_starting_values(test_data[0], test_data[1]); for (n, &b) in test_data[2..5].iter().enumerate() { hash_table.add_hash_value(n, b); } let (match_length, match_dist) = longest_match(test_data, &hash_table, 1, 0, 4096); assert_eq!(match_dist, 1); assert!(match_length == 6); } /// Test for the fast_zlib algorithm. /// Check that it doesn't give worse matches than the default one. /// Ignored by default as it's slow, and best run in release mode. #[ignore] #[test] fn fast_match_at_least_equal() { use crate::test_utils::get_test_data; for start_pos in 10000..50000 { const NUM_CHECKS: u16 = 400; let data = get_test_data(); let hash_table = filled_hash_table(&data[..start_pos + 1]); let pos = hash_table.current_head() as usize; let naive_match = longest_match(&data[..], &hash_table, pos, 0, NUM_CHECKS); let fast_match = longest_match_fast(&data[..], &hash_table, pos, 0, NUM_CHECKS); if fast_match.0 > naive_match.0 { println!("Fast match found better match!"); } assert!( fast_match.0 >= naive_match.0, "naive match had better length! start_pos: {}, naive: {:?}, fast {:?}", start_pos, naive_match, fast_match ); assert!( fast_match.1 >= naive_match.1, "naive match had better dist!
start_pos: {} naive {:?}, fast {:?}", start_pos, naive_match, fast_match ); } } } #[cfg(all(test, feature = "benchmarks"))] mod bench { use super::{longest_match, longest_match_fast}; use crate::chained_hash_table::filled_hash_table; use test_std::Bencher; use crate::test_utils::get_test_data; #[bench] fn matching(b: &mut Bencher) { const POS: usize = 29000; let data = get_test_data(); let hash_table = filled_hash_table(&data[..POS + 1]); let pos = hash_table.current_head() as usize; println!( "M: {:?}", longest_match(&data[..], &hash_table, pos, 0, 4096) ); b.iter(|| longest_match(&data[..], &hash_table, pos, 0, 4096)); } #[bench] fn fast_matching(b: &mut Bencher) { const POS: usize = 29000; let data = get_test_data(); let hash_table = filled_hash_table(&data[..POS + 1]); let pos = hash_table.current_head() as usize; println!( "M: {:?}", longest_match_fast(&data[..], &hash_table, pos, 0, 4096) ); b.iter(|| longest_match_fast(&data[..], &hash_table, pos, 0, 4096)); } } deflate-1.0.0/src/output_writer.rs000064400000000000000000000111360072674642500153510ustar 00000000000000use std::u16; use crate::huffman_table::{ get_distance_code, get_length_code, END_OF_BLOCK_POSITION, NUM_DISTANCE_CODES, NUM_LITERALS_AND_LENGTHS, }; use crate::lzvalue::LZValue; /// The type used for representing how many times a literal, length or distance code has been output /// to the current buffer. /// As we are limiting the blocks to be at most 2^16 bytes long, we can represent frequencies using /// 16-bit values. pub type FrequencyType = u16; /// The maximum number of literals/lengths in the buffer, which in practice also means the maximum /// number of literals/lengths output before a new block is started. /// This should not be larger than the maximum value `FrequencyType` can represent to prevent /// overflowing (which would degrade, or in the worst case break compression). pub const MAX_BUFFER_LENGTH: usize = 1024 * 31; #[derive(Debug, PartialEq)] pub enum BufferStatus { NotFull, Full, } /// Struct that buffers LZ77 data and keeps track of the usage of different codes pub struct DynamicWriter { buffer: Vec<LZValue>, // The last two length codes are not actually used, but only participate in code construction. // Therefore, we ignore them to get the correct number of lengths. frequencies: [FrequencyType; NUM_LITERALS_AND_LENGTHS], distance_frequencies: [FrequencyType; NUM_DISTANCE_CODES], } impl DynamicWriter { #[inline] pub fn check_buffer_length(&self) -> BufferStatus { if self.buffer.len() >= MAX_BUFFER_LENGTH { BufferStatus::Full } else { BufferStatus::NotFull } } #[inline] pub fn write_literal(&mut self, literal: u8) -> BufferStatus { debug_assert!(self.buffer.len() < MAX_BUFFER_LENGTH); self.buffer.push(LZValue::literal(literal)); self.frequencies[usize::from(literal)] += 1; self.check_buffer_length() } #[inline] pub fn write_length_distance(&mut self, length: u16, distance: u16) -> BufferStatus { self.buffer.push(LZValue::length_distance(length, distance)); let l_code_num = get_length_code(length); // As we limit the buffer to 2^16 values, this should be safe from overflowing. self.frequencies[l_code_num] += 1; let d_code_num = get_distance_code(distance); // The compiler seems to be able to elide the bounds check here somehow.
self.distance_frequencies[usize::from(d_code_num)] += 1; self.check_buffer_length() } pub fn buffer_length(&self) -> usize { self.buffer.len() } pub fn get_buffer(&self) -> &[LZValue] { &self.buffer } pub fn new() -> DynamicWriter { let mut w = DynamicWriter { buffer: Vec::with_capacity(MAX_BUFFER_LENGTH), frequencies: [0; NUM_LITERALS_AND_LENGTHS], distance_frequencies: [0; NUM_DISTANCE_CODES], }; // This will always be 1, // since there will always only be one end of block marker in each block w.frequencies[END_OF_BLOCK_POSITION] = 1; w } /// Special output function used with RLE compression /// that avoids bothering to look up a distance code. #[inline] pub fn write_length_rle(&mut self, length: u16) -> BufferStatus { self.buffer.push(LZValue::length_distance(length, 1)); let l_code_num = get_length_code(length); // As we limit the buffer to 2^16 values, this should be safe from overflowing. self.frequencies[l_code_num] += 1; self.distance_frequencies[0] += 1; self.check_buffer_length() } pub fn get_frequencies(&self) -> (&[u16], &[u16]) { (&self.frequencies, &self.distance_frequencies) } pub fn clear_frequencies(&mut self) { self.frequencies = [0; NUM_LITERALS_AND_LENGTHS]; self.distance_frequencies = [0; NUM_DISTANCE_CODES]; self.frequencies[END_OF_BLOCK_POSITION] = 1; } pub fn clear_data(&mut self) { self.buffer.clear() } pub fn clear(&mut self) { self.clear_frequencies(); self.clear_data(); } } #[cfg(test)] mod test { use super::*; use crate::huffman_table::{get_distance_code, get_length_code}; #[test] /// Ensure that these functions won't produce values that would overflow the output_writer /// tables since we use some unsafe indexing. fn array_bounds() { let w = DynamicWriter::new(); for i in 0..u16::max_value() { assert!(get_length_code(i) < w.frequencies.len()); } for i in 0..u16::max_value() { assert!(get_distance_code(i) < w.distance_frequencies.len() as u8); } } } deflate-1.0.0/src/rle.rs000064400000000000000000000061520072674642500132010ustar 00000000000000use crate::lz77::{buffer_full, ProcessStatus}; use crate::output_writer::{BufferStatus, DynamicWriter}; use std::cmp; use std::ops::Range; const MIN_MATCH: usize = crate::huffman_table::MIN_MATCH as usize; const MAX_MATCH: usize = crate::huffman_table::MAX_MATCH as usize; /// Simple match function for run-length encoding. /// /// Checks how many of the next bytes from the start of the slice `data` match `prev`. fn get_match_length_rle(data: &[u8], prev: u8) -> usize { data.iter() .take(MAX_MATCH) .take_while(|&&b| b == prev) .count() } /// LZ77-compress data using the RLE (run-length encoding) strategy. /// /// This function simply looks for runs of data of at least length 3. pub fn process_chunk_greedy_rle( data: &[u8], iterated_data: &Range<usize>, writer: &mut DynamicWriter, ) -> (usize, ProcessStatus) { if data.is_empty() { return (0, ProcessStatus::Ok); }; let end = cmp::min(data.len(), iterated_data.end); // Start on at least byte 1. let start = cmp::max(iterated_data.start, 1); // The previous byte. let mut prev = data[start - 1]; // Iterate through the requested range, but avoid going off the end. let current_chunk = &data[cmp::min(start, end)..end]; let mut insert_it = current_chunk.iter().enumerate(); let mut overlap = 0; // Make sure to output the first byte. if iterated_data.start == 0 && !data.is_empty() { write_literal!(writer, data[0], 1); } while let Some((n, &b)) = insert_it.next() { let position = n + start; let match_len = if prev == b { //TODO: Avoid comparing with self here.
// Would use as_slice() but that doesn't work on an enumerated iterator. get_match_length_rle(&data[position..], prev) } else { 0 }; if match_len >= MIN_MATCH { if position + match_len > end { overlap = position + match_len - end; }; let b_status = writer.write_length_rle(match_len as u16); if b_status == BufferStatus::Full { return (overlap, buffer_full(position + match_len)); } insert_it.nth(match_len - 2); } else { write_literal!(writer, b, position + 1); } prev = b; } (overlap, ProcessStatus::Ok) } #[cfg(test)] mod test { use super::*; use crate::lzvalue::{ld, lit, LZValue}; fn l(c: char) -> LZValue { lit(c as u8) } #[test] fn rle_compress() { let input = b"textaaaaaaaaatext"; let mut w = DynamicWriter::new(); let r = 0..input.len(); let (overlap, _) = process_chunk_greedy_rle(input, &r, &mut w); let expected = [ l('t'), l('e'), l('x'), l('t'), l('a'), ld(8, 1), l('t'), l('e'), l('x'), l('t'), ]; //println!("expected: {:?}", expected); //println!("actual: {:?}", w.get_buffer()); assert!(w.get_buffer() == expected); assert_eq!(overlap, 0); } } deflate-1.0.0/src/stored_block.rs000064400000000000000000000057570072674642500151010ustar 00000000000000use crate::bitstream::LsbWriter; use std::io; use std::io::Write; use std::u16; #[cfg(test)] const BLOCK_SIZE: u16 = 32000; const STORED_FIRST_BYTE: u8 = 0b0000_0000; pub const STORED_FIRST_BYTE_FINAL: u8 = 0b0000_0001; pub const MAX_STORED_BLOCK_LENGTH: usize = (u16::MAX as usize) / 2; pub fn write_stored_header(writer: &mut LsbWriter, final_block: bool) { let header = if final_block { STORED_FIRST_BYTE_FINAL } else { STORED_FIRST_BYTE }; // Write the block header writer.write_bits(header.into(), 3); // Flush the writer to make sure we are aligned to the byte boundary. writer.flush_raw(); } // Compress one stored block (excluding the header) pub fn compress_block_stored<W: Write>(input: &[u8], writer: &mut W) -> io::Result<usize> { if input.len() > u16::max_value() as usize { return Err(io::Error::new( io::ErrorKind::InvalidInput, "Stored block too long!", )); }; // The header is written before this function.
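// A stored (uncompressed) block's payload, per RFC 1951, is byte-aligned and laid out as:
//   [LEN: u16, little-endian] [NLEN: u16 = !LEN] [LEN bytes of raw data]
// which is exactly what the three writes below produce.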
// The next two bytes indicate the length writer.write_all(&(input.len() as u16).to_le_bytes())?; // the next two after the length are the ones' complement of the length writer.write_all(&(!input.len() as u16).to_le_bytes())?; // After this the data is written directly with no compression writer.write(input) } #[cfg(test)] pub fn compress_data_stored(input: &[u8]) -> Vec<u8> { let block_length = BLOCK_SIZE as usize; let mut output = Vec::with_capacity(input.len() + 2); let mut i = input.chunks(block_length).peekable(); while let Some(chunk) = i.next() { let last_chunk = i.peek().is_none(); // The first bit tells us if this is the final chunk, // the next two detail the compression type (none in this case) let first_byte = if last_chunk { STORED_FIRST_BYTE_FINAL } else { STORED_FIRST_BYTE }; output.write(&[first_byte]).unwrap(); compress_block_stored(chunk, &mut output).unwrap(); } output } #[cfg(test)] mod test { use super::*; use crate::test_utils::decompress_to_end; #[test] fn no_compression_one_chunk() { let test_data = vec![1u8, 2, 3, 4, 5, 6, 7, 8]; let compressed = compress_data_stored(&test_data); let result = decompress_to_end(&compressed); assert_eq!(test_data, result); } #[test] fn no_compression_multiple_chunks() { let test_data = vec![32u8; 40000]; let compressed = compress_data_stored(&test_data); let result = decompress_to_end(&compressed); assert_eq!(test_data, result); } #[test] fn no_compression_string() { let test_data = String::from( "This is some text, this is some more text, this is even \ more text, lots of text here.", ) .into_bytes(); let compressed = compress_data_stored(&test_data); let result = decompress_to_end(&compressed); assert_eq!(test_data, result); } } deflate-1.0.0/src/test_utils.rs000064400000000000000000000035710072674642500146200ustar 00000000000000#![cfg(test)] #[cfg(feature = "gzip")] use gzip_header::GzHeader; fn get_test_file_data(name: &str) -> Vec<u8> { use std::fs::File; use std::io::Read; let mut input = Vec::new(); let mut f = File::open(name).unwrap(); f.read_to_end(&mut input).unwrap(); input } pub fn get_test_data() -> Vec<u8> { use std::env; let path = env::var("TEST_FILE").unwrap_or("tests/pg11.txt".to_string()); get_test_file_data(&path) } /// Helper function to decompress into a `Vec<u8>` pub fn decompress_to_end(input: &[u8]) -> Vec<u8> { use miniz_oxide::inflate::decompress_to_vec; decompress_to_vec(input).expect("Decompression failed!") } #[cfg(feature = "gzip")] pub fn decompress_gzip(compressed: &[u8]) -> (GzHeader, Vec<u8>) { use gzip_header::{read_gz_header, Crc}; use std::io::Cursor; let mut c = Cursor::new(compressed); let h = read_gz_header(&mut c).expect("Failed to decode gzip header!"); let pos = c.position(); let compressed = &c.into_inner()[pos as usize..]; let result = miniz_oxide::inflate::decompress_to_vec(compressed).expect("Decompression failed"); let s = compressed.len(); let crc = u32::from_le_bytes([ compressed[s - 8], compressed[s - 7], compressed[s - 6], compressed[s - 5], ]); let len = u32::from_le_bytes([ compressed[s - 4], compressed[s - 3], compressed[s - 2], compressed[s - 1], ]); let mut comp_crc = Crc::new(); comp_crc.update(&result); assert_eq!( crc, comp_crc.sum(), "Checksum failed! File: {}, computed: {}", crc, comp_crc.sum() ); assert_eq!(len, result.len() as u32, "Length mismatch"); (h, result) } pub fn decompress_zlib(compressed: &[u8]) -> Vec<u8> { miniz_oxide::inflate::decompress_to_vec_zlib(&compressed).expect("Decompression failed!") } deflate-1.0.0/src/writer.rs000064400000000000000000000550720072674642500137360ustar 
00000000000000use std::io::Write; use std::{io, thread}; use crate::checksum::{Adler32Checksum, RollingChecksum}; use crate::compress::compress_data_dynamic_n; use crate::compress::Flush; use crate::compression_options::CompressionOptions; use crate::deflate_state::DeflateState; use crate::zlib::{write_zlib_header, CompressionLevel}; const ERR_STR: &str = "Error! The wrapped writer is missing. This is a bug, please file an issue."; /// Keep compressing until all the input has been compressed and output or the writer returns `Err`. pub fn compress_until_done<W: Write>( mut input: &[u8], deflate_state: &mut DeflateState<W>, flush_mode: Flush, ) -> io::Result<()> { // This should only be used for flushing. assert!(flush_mode != Flush::None); loop { match compress_data_dynamic_n(input, deflate_state, flush_mode) { Ok(0) => { if deflate_state.output_buf().is_empty() { break; } else { // If the output buffer isn't empty, keep going until it is, as there is still // data to be flushed. input = &[]; } } Ok(n) => { if n < input.len() { input = &input[n..] } else { input = &[]; } } Err(e) => { match e.kind() { // This error means that there may still be data to flush. // This could possibly get stuck if the underlying writer keeps returning this // error. io::ErrorKind::Interrupted => (), _ => return Err(e), } } } } debug_assert_eq!( deflate_state.bytes_written, deflate_state.bytes_written_control.get() ); Ok(()) } /// A DEFLATE encoder/compressor. /// /// A struct implementing a [`Write`] interface that takes arbitrary data and compresses it to /// the provided writer using DEFLATE compression. /// /// # Examples /// /// ```rust /// # use std::io; /// # /// # fn try_main() -> io::Result<Vec<u8>> { /// # /// use std::io::Write; /// /// use deflate::Compression; /// use deflate::write::DeflateEncoder; /// /// let data = b"This is some test data"; /// let mut encoder = DeflateEncoder::new(Vec::new(), Compression::Default); /// encoder.write_all(data)?; /// let compressed_data = encoder.finish()?; /// # Ok(compressed_data) /// # /// # } /// # fn main() { /// # try_main().unwrap(); /// # } /// ``` /// [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html pub struct DeflateEncoder<W: Write> { deflate_state: DeflateState<W>, } impl<W: Write> DeflateEncoder<W> { /// Creates a new encoder using the provided compression options. pub fn new<O: Into<CompressionOptions>>(writer: W, options: O) -> DeflateEncoder<W> { DeflateEncoder { deflate_state: DeflateState::new(options.into(), writer), } } /// Encode all pending data to the contained writer, consume this `DeflateEncoder`, /// and return the contained writer if writing succeeds. pub fn finish(mut self) -> io::Result<W> { self.output_all()?; // We have to move the inner writer out of the encoder, and replace it with `None` // to let the `DeflateEncoder` drop safely. Ok(self.deflate_state.inner.take().expect(ERR_STR)) } /// Resets the encoder (except the compression options), replacing the current writer /// with a new one, returning the old one. pub fn reset(&mut self, w: W) -> io::Result<W> { self.output_all()?; self.deflate_state.reset(w) } /// Output all pending data as if encoding is done, but without resetting anything. fn output_all(&mut self) -> io::Result<()> { compress_until_done(&[], &mut self.deflate_state, Flush::Finish) } } impl<W: Write> io::Write for DeflateEncoder<W> { fn write(&mut self, buf: &[u8]) -> io::Result<usize> { let flush_mode = self.deflate_state.flush_mode; compress_data_dynamic_n(buf, &mut self.deflate_state, flush_mode) } /// Flush the encoder. /// /// This will flush the encoder, emulating the Sync flush method from Zlib.
/// This essentially finishes the current block, and sends an additional empty stored block to /// the writer. fn flush(&mut self) -> io::Result<()> { compress_until_done(&[], &mut self.deflate_state, Flush::Sync) } } impl<W: Write> Drop for DeflateEncoder<W> { /// When the encoder is dropped, output the rest of the data. /// /// WARNING: This may silently fail if writing fails, so using this to finish encoding /// for writers where writing might fail is not recommended; for that, call /// [`finish()`](#method.finish) instead. fn drop(&mut self) { // Not sure if implementing drop is a good idea or not, but we follow flate2 for now. // We only do this if we are not panicking, to avoid a double panic. if self.deflate_state.inner.is_some() && !thread::panicking() { let _ = self.output_all(); } } } /// A Zlib encoder/compressor. /// /// A struct implementing a [`Write`] interface that takes arbitrary data and compresses it to /// the provided writer using DEFLATE compression with Zlib headers and trailers. /// /// # Examples /// /// ```rust /// # use std::io; /// # /// # fn try_main() -> io::Result<Vec<u8>> { /// # /// use std::io::Write; /// /// use deflate::Compression; /// use deflate::write::ZlibEncoder; /// /// let data = b"This is some test data"; /// let mut encoder = ZlibEncoder::new(Vec::new(), Compression::Default); /// encoder.write_all(data)?; /// let compressed_data = encoder.finish()?; /// # Ok(compressed_data) /// # /// # } /// # fn main() { /// # try_main().unwrap(); /// # } /// ``` /// [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html pub struct ZlibEncoder<W: Write> { deflate_state: DeflateState<W>, checksum: Adler32Checksum, header_written: bool, } impl<W: Write> ZlibEncoder<W> { /// Create a new `ZlibEncoder` using the provided compression options. pub fn new<O: Into<CompressionOptions>>(writer: W, options: O) -> ZlibEncoder<W> { ZlibEncoder { deflate_state: DeflateState::new(options.into(), writer), checksum: Adler32Checksum::new(), header_written: false, } } /// Output all pending data, including the trailer (checksum), as if encoding is done, /// but without resetting anything. fn output_all(&mut self) -> io::Result<()> { self.check_write_header()?; compress_until_done(&[], &mut self.deflate_state, Flush::Finish)?; self.write_trailer() } /// Encode all pending data to the contained writer, consume this `ZlibEncoder`, /// and return the contained writer if writing succeeds. pub fn finish(mut self) -> io::Result<W> { self.output_all()?; // We have to move the inner writer out of the encoder, and replace it with `None` // to let the `ZlibEncoder` drop safely. Ok(self.deflate_state.inner.take().expect(ERR_STR)) } /// Resets the encoder (except the compression options), replacing the current writer /// with a new one, returning the old one. pub fn reset(&mut self, writer: W) -> io::Result<W> { self.output_all()?; self.header_written = false; self.checksum = Adler32Checksum::new(); self.deflate_state.reset(writer) } /// Check if a zlib header should be written. fn check_write_header(&mut self) -> io::Result<()> { if !self.header_written { write_zlib_header(self.deflate_state.output_buf(), CompressionLevel::Default)?; self.header_written = true; } Ok(()) } /// Write the trailer, which for zlib is the Adler32 checksum. fn write_trailer(&mut self) -> io::Result<()> { let hash = self.checksum.current_hash(); self.deflate_state .inner .as_mut() .expect(ERR_STR) .write_all(&hash.to_be_bytes())?; Ok(()) } /// Return the adler32 checksum of the currently consumed data.
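///
/// # Example
/// A minimal sketch (not part of the original docs), following the usage shown in the
/// `ZlibEncoder` example above:
/// ```no_run
/// use std::io::Write;
/// use deflate::Compression;
/// use deflate::write::ZlibEncoder;
///
/// let mut encoder = ZlibEncoder::new(Vec::new(), Compression::Default);
/// encoder.write_all(b"This is some test data").unwrap();
/// // Adler32 of everything consumed so far; the same value is written
/// // as the trailer when the encoder is finished.
/// let adler = encoder.checksum();
/// ```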
pub fn checksum(&self) -> u32 { self.checksum.current_hash() } } impl<W: Write> io::Write for ZlibEncoder<W> { fn write(&mut self, buf: &[u8]) -> io::Result<usize> { self.check_write_header()?; let flush_mode = self.deflate_state.flush_mode; let res = compress_data_dynamic_n(buf, &mut self.deflate_state, flush_mode); match res { // If this is returned, the whole buffer was consumed Ok(0) => self.checksum.update_from_slice(buf), // Otherwise, only part of it was consumed, so only that part // is added to the checksum. Ok(n) => self.checksum.update_from_slice(&buf[0..n]), _ => (), }; res } /// Flush the encoder. /// /// This will flush the encoder, emulating the Sync flush method from Zlib. /// This essentially finishes the current block, and sends an additional empty stored block to /// the writer. fn flush(&mut self) -> io::Result<()> { compress_until_done(&[], &mut self.deflate_state, Flush::Sync) } } impl<W: Write> Drop for ZlibEncoder<W> { /// When the encoder is dropped, output the rest of the data. /// /// WARNING: This may silently fail if writing fails, so using this to finish encoding /// for writers where writing might fail is not recommended; for that, call /// [`finish()`](#method.finish) instead. fn drop(&mut self) { if self.deflate_state.inner.is_some() && !thread::panicking() { let _ = self.output_all(); } } } #[cfg(feature = "gzip")] pub mod gzip { use std::io::{Cursor, Write}; use std::{io, thread}; use super::*; use gzip_header::{Crc, GzBuilder}; /// A Gzip encoder/compressor. /// /// A struct implementing a [`Write`] interface that takes arbitrary data and compresses it to /// the provided writer using DEFLATE compression with Gzip headers and trailers. /// /// # Examples /// /// ```rust /// # use std::io; /// # /// # fn try_main() -> io::Result<Vec<u8>> { /// # /// use std::io::Write; /// /// use deflate::Compression; /// use deflate::write::GzEncoder; /// /// let data = b"This is some test data"; /// let mut encoder = GzEncoder::new(Vec::new(), Compression::Default); /// encoder.write_all(data)?; /// let compressed_data = encoder.finish()?; /// # Ok(compressed_data) /// # /// # } /// # fn main() { /// # try_main().unwrap(); /// # } /// ``` /// [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html pub struct GzEncoder<W: Write> { inner: DeflateEncoder<W>, checksum: Crc, header: Vec<u8>, } impl<W: Write> GzEncoder<W> { /// Create a new `GzEncoder` writing deflate-compressed data to the underlying writer when /// written to, wrapped in a gzip header and trailer. The header details will be blank. pub fn new<O: Into<CompressionOptions>>(writer: W, options: O) -> GzEncoder<W> { GzEncoder::from_builder(GzBuilder::new(), writer, options) } /// Create a new GzEncoder from the provided `GzBuilder`. This allows customising /// the details of the header, such as the filename and comment fields. pub fn from_builder<O: Into<CompressionOptions>>( builder: GzBuilder, writer: W, options: O, ) -> GzEncoder<W> { GzEncoder { inner: DeflateEncoder::new(writer, options), checksum: Crc::new(), header: builder.into_header(), } } /// Write header to the output buffer if it hasn't been done yet. fn check_write_header(&mut self) { if !self.header.is_empty() { self.inner .deflate_state .output_buf() .extend_from_slice(&self.header); self.header.clear(); } } /// Output all pending data, including the trailer (checksum + count), as if encoding is done, /// but without resetting anything.
fn output_all(&mut self) -> io::Result<()> { self.check_write_header(); self.inner.output_all()?; self.write_trailer() } /// Encode all pending data to the contained writer, consume this `GzEncoder`, /// and return the contained writer if writing succeeds. pub fn finish(mut self) -> io::Result<W> { self.output_all()?; // We have to move the inner writer out of the encoder, and replace it with `None` // to let the `GzEncoder` drop safely. Ok(self.inner.deflate_state.inner.take().expect(ERR_STR)) } fn reset_no_header(&mut self, writer: W) -> io::Result<W> { self.output_all()?; self.checksum = Crc::new(); self.inner.deflate_state.reset(writer) } /// Resets the encoder (except the compression options), replacing the current writer /// with a new one, returning the old one (using a blank header). pub fn reset(&mut self, writer: W) -> io::Result<W> { let w = self.reset_no_header(writer); self.header = GzBuilder::new().into_header(); w } /// Resets the encoder (except the compression options), replacing the current writer /// with a new one, returning the old one, and using the provided `GzBuilder` to /// create the header. pub fn reset_with_builder(&mut self, writer: W, builder: GzBuilder) -> io::Result<W> { let w = self.reset_no_header(writer); self.header = builder.into_header(); w } /// Write the checksum and number of bytes mod 2^32 to the output writer. fn write_trailer(&mut self) -> io::Result<()> { let crc = self.checksum.sum(); let amount = self.checksum.amt_as_u32(); // We use a buffer here to make sure we don't end up writing only half the trailer if // writing fails. let mut buf = [0u8; 8]; let mut temp = Cursor::new(&mut buf[..]); temp.write_all(&crc.to_le_bytes()).unwrap(); temp.write_all(&amount.to_le_bytes()).unwrap(); self.inner .deflate_state .inner .as_mut() .expect(ERR_STR) .write_all(temp.into_inner()) } /// Get the crc32 checksum of the data consumed so far. pub fn checksum(&self) -> u32 { self.checksum.sum() } } impl<W: Write> io::Write for GzEncoder<W> { fn write(&mut self, buf: &[u8]) -> io::Result<usize> { self.check_write_header(); let res = self.inner.write(buf); match res { Ok(0) => self.checksum.update(buf), Ok(n) => self.checksum.update(&buf[0..n]), _ => (), }; res } /// Flush the encoder. /// /// This will flush the encoder, emulating the Sync flush method from Zlib. /// This essentially finishes the current block, and sends an additional empty stored /// block to the writer. fn flush(&mut self) -> io::Result<()> { self.inner.flush() } } impl<W: Write> Drop for GzEncoder<W> { /// When the encoder is dropped, output the rest of the data. /// /// WARNING: This may silently fail if writing fails, so using this to finish encoding /// for writers where writing might fail is not recommended; for that, call /// [`finish()`](#method.finish) instead.
fn drop(&mut self) { if self.inner.deflate_state.inner.is_some() && !thread::panicking() { let _ = self.output_all(); } } } #[cfg(test)] mod test { use super::*; use crate::test_utils::{decompress_gzip, get_test_data}; #[test] fn gzip_writer() { let data = get_test_data(); let comment = b"Comment"; let compressed = { let mut compressor = GzEncoder::from_builder( GzBuilder::new().comment(&comment[..]), Vec::with_capacity(data.len() / 3), CompressionOptions::default(), ); compressor.write_all(&data[0..data.len() / 2]).unwrap(); compressor.write_all(&data[data.len() / 2..]).unwrap(); compressor.finish().unwrap() }; let (dec, res) = decompress_gzip(&compressed); assert_eq!(dec.comment().unwrap(), comment); assert!(res == data); } } } #[cfg(test)] mod test { use super::*; use crate::compression_options::CompressionOptions; use crate::test_utils::{decompress_to_end, decompress_zlib, get_test_data}; use std::io::Write; #[test] fn deflate_writer() { let data = get_test_data(); let compressed = { let mut compressor = DeflateEncoder::new( Vec::with_capacity(data.len() / 3), CompressionOptions::high(), ); // Write in multiple steps to see if this works as it's supposed to. compressor.write_all(&data[0..data.len() / 2]).unwrap(); compressor.write_all(&data[data.len() / 2..]).unwrap(); compressor.finish().unwrap() }; let res = decompress_to_end(&compressed); assert!(res == data); } #[test] fn zlib_writer() { let data = get_test_data(); let compressed = { let mut compressor = ZlibEncoder::new( Vec::with_capacity(data.len() / 3), CompressionOptions::high(), ); compressor.write_all(&data[0..data.len() / 2]).unwrap(); compressor.write_all(&data[data.len() / 2..]).unwrap(); compressor.finish().unwrap() }; let res = decompress_zlib(&compressed); assert!(res == data); } #[test] /// Check if the result of compressing after resetting is the same as before. fn writer_reset() { let data = get_test_data(); let mut compressor = DeflateEncoder::new( Vec::with_capacity(data.len() / 3), CompressionOptions::default(), ); compressor.write_all(&data).unwrap(); let res1 = compressor .reset(Vec::with_capacity(data.len() / 3)) .unwrap(); compressor.write_all(&data).unwrap(); let res2 = compressor.finish().unwrap(); assert!(res1 == res2); } #[test] fn writer_reset_zlib() { let data = get_test_data(); let mut compressor = ZlibEncoder::new( Vec::with_capacity(data.len() / 3), CompressionOptions::default(), ); compressor.write_all(&data).unwrap(); let res1 = compressor .reset(Vec::with_capacity(data.len() / 3)) .unwrap(); compressor.write_all(&data).unwrap(); let res2 = compressor.finish().unwrap(); assert!(res1 == res2); } #[test] fn writer_sync() { let data = get_test_data(); let compressed = { let mut compressor = DeflateEncoder::new( Vec::with_capacity(data.len() / 3), CompressionOptions::default(), ); let split = data.len() / 2; compressor.write_all(&data[..split]).unwrap(); compressor.flush().unwrap(); { let buf = &mut compressor.deflate_state.inner.as_mut().unwrap(); let buf_len = buf.len(); // Check for the sync marker. (excluding the header as it might not line // up with the byte boundary.) assert_eq!(buf[buf_len - 4..], [0, 0, 255, 255]); } compressor.write_all(&data[split..]).unwrap(); compressor.finish().unwrap() }; let decompressed = decompress_to_end(&compressed); assert!(decompressed == data); } #[test] /// Make sure compression works with the writer when the input is between 1 and 2 window sizes. 
fn issue_18() { use crate::compression_options::Compression; let data = vec![0; 61000]; let compressed = { let mut compressor = ZlibEncoder::new(Vec::new(), Compression::Default); compressor.write_all(&data[..]).unwrap(); compressor.finish().unwrap() }; let decompressed = decompress_zlib(&compressed); assert!(decompressed == data); } #[test] fn writer_sync_multiple() { use std::cmp; let data = get_test_data(); let compressed = { let mut compressor = DeflateEncoder::new( Vec::with_capacity(data.len() / 3), CompressionOptions::default(), ); let split = data.len() / 2; compressor.write_all(&data[..split]).unwrap(); compressor.flush().unwrap(); compressor.flush().unwrap(); { let buf = &mut compressor.deflate_state.inner.as_mut().unwrap(); let buf_len = buf.len(); // Check for the sync marker. (excluding the header as it might not line // up with the byte boundary.) assert_eq!(buf[buf_len - 4..], [0, 0, 255, 255]); } compressor .write_all(&data[split..cmp::min(split + 2, data.len())]) .unwrap(); compressor.flush().unwrap(); compressor .write_all(&data[cmp::min(split + 2, data.len())..]) .unwrap(); compressor.finish().unwrap() }; let decompressed = decompress_to_end(&compressed); assert!(decompressed == data); let mut compressor = DeflateEncoder::new( Vec::with_capacity(data.len() / 3), CompressionOptions::default(), ); compressor.flush().unwrap(); compressor.write_all(&[1, 2]).unwrap(); compressor.flush().unwrap(); compressor.write_all(&[3]).unwrap(); compressor.flush().unwrap(); let compressed = compressor.finish().unwrap(); let decompressed = decompress_to_end(&compressed); assert_eq!(decompressed, [1, 2, 3]); } } deflate-1.0.0/src/zlib.rs000064400000000000000000000055420072674642500133610ustar 00000000000000//! This module contains functionality for generating a [zlib](https://tools.ietf.org/html/rfc1950) //! header. //! //! The Zlib header contains some metadata (a window size and a compression level), and optionally //! a block of data serving as an extra dictionary for the compressor/decompressor. //! The dictionary is not implemented in this library. //! The data in the header aside from the dictionary doesn't actually have any effect on the //! decompressed data; it only offers some hints for the decompressor on how the data was //! compressed. use std::io::{Result, Write}; // CM = 8 means to use the DEFLATE compression method. const DEFAULT_CM: u8 = 8; // CINFO = 7 indicates a 32k window size. const DEFAULT_CINFO: u8 = 7 << 4; const DEFAULT_CMF: u8 = DEFAULT_CM | DEFAULT_CINFO; // No dict by default. #[cfg(test)] const DEFAULT_FDICT: u8 = 0; // FLEVEL = 0 means fastest compression algorithm. const _DEFAULT_FLEVEL: u8 = 0 << 7; // The 16-bit value consisting of CMF and FLG must be divisible by this to be valid. const FCHECK_DIVISOR: u8 = 31; #[allow(dead_code)] #[repr(u8)] pub enum CompressionLevel { Fastest = 0 << 6, Fast = 1 << 6, Default = 2 << 6, Maximum = 3 << 6, } /// Generate FCHECK from CMF and FLG (without FCHECK) so that they are correct according to the /// specification, i.e. (CMF*256 + FLG) % 31 = 0. /// Returns flg with the FCHECK bits added (any existing FCHECK bits are ignored).
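///
/// For example, with CMF = 0x78 and FLG = `CompressionLevel::Default as u8` (128):
/// (0x78 * 256 + 128) % 31 = 3, so FCHECK = 31 - 3 = 28, and the returned FLG is
/// 128 + 28 = 0x9C, giving the familiar `78 9C` zlib header.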
fn add_fcheck(cmf: u8, flg: u8) -> u8 { let rem = ((cmf as usize * 256) + flg as usize) % FCHECK_DIVISOR as usize; // Clear existing FCHECK if any let flg = flg & 0b11100000; // Casting is safe as rem can't overflow since it is a value mod 31 // We can simply add the value to flg as (31 - rem) will never be above 2^5 flg + (FCHECK_DIVISOR - rem as u8) } /// Write a zlib header with an empty dictionary to the writer using the specified /// compression level preset. pub fn write_zlib_header<W: Write>(writer: &mut W, level: CompressionLevel) -> Result<()> { writer.write_all(&get_zlib_header(level)) } /// Get the zlib header for the `CompressionLevel` level using the default window size and no /// dictionary. pub fn get_zlib_header(level: CompressionLevel) -> [u8; 2] { let cmf = DEFAULT_CMF; [cmf, add_fcheck(cmf, level as u8)] } #[cfg(test)] mod test { use super::DEFAULT_CMF; use super::*; #[test] fn test_gen_fcheck() { let cmf = DEFAULT_CMF; let flg = super::add_fcheck( DEFAULT_CMF, CompressionLevel::Default as u8 | super::DEFAULT_FDICT, ); assert_eq!(((usize::from(cmf) * 256) + usize::from(flg)) % 31, 0); } #[test] fn test_header() { let header = get_zlib_header(CompressionLevel::Fastest); assert_eq!( ((usize::from(header[0]) * 256) + usize::from(header[1])) % 31, 0 ); } }
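// Illustrative extra check (a sketch, not part of the original crate): with the
// constants above, the Default-level header should come out as the well-known
// 0x78 0x9C byte pair.
#[cfg(test)]
mod header_bytes_sketch {
    use super::{get_zlib_header, CompressionLevel};

    #[test]
    fn default_header_is_78_9c() {
        // CMF = 0x78 (DEFLATE, 32k window); FLG = 0x9C (FLEVEL = Default plus a valid FCHECK).
        assert_eq!(get_zlib_header(CompressionLevel::Default), [0x78, 0x9C]);
    }
}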