weezl-0.1.5/.github/workflows/rust.yml

name: Rust CI

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        rust: [stable, beta, nightly]
        features: ["", "alloc", "std", "async"]
    steps:
    - uses: actions/checkout@v2
    - run: rustup default ${{ matrix.rust }}
    - name: build
      run: >
        cargo build --verbose --no-default-features --features "$FEATURES"
      env:
        FEATURES: ${{ matrix.features }}
    - name: test
      run: >
        cargo test --tests --benches --no-default-features --features "$FEATURES" --release
      if: ${{ matrix.rust != '1.34.2' }}
      env:
        FEATURES: ${{ matrix.features }}
  build_msrv:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        features: ["", "std"]
    steps:
    - uses: actions/checkout@v2
    - run: rustup default "1.34.2"
    - name: build
      run: cargo build --verbose --no-default-features --features "$FEATURES"
      env:
        FEATURES: ${{ matrix.features }}
  rustfmt:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v2
    - uses: actions-rs/toolchain@v1
      with:
        toolchain: stable
        override: true
        components: rustfmt, clippy
    - name: Run rustfmt check
      uses: actions-rs/cargo@v1
      with:
        command: fmt
        args: -- --check

weezl-0.1.5/.gitignore

target
Cargo.lock

weezl-0.1.5/Cargo.toml.orig

[package]
name = "weezl"
version = "0.1.5"
license = "MIT OR Apache-2.0"
description = "Fast LZW compression and decompression."
authors = ["HeroicKatora "]
readme = "README.md"
repository = "https://github.com/image-rs/lzw.git"
documentation = "https://docs.rs/weezl"
edition = "2018"
exclude = ["benches/*.lzw"]

[lib]
name = "weezl"
bench = false

[dependencies.futures]
optional = true
version = "0.3.12"
default-features = false
features = ["std"]

[dev-dependencies]
criterion = "0.3.1"

[dev-dependencies.tokio]
version = "1"
default-features = false
features = ["macros", "io-util", "net", "rt", "rt-multi-thread"]

[dev-dependencies.tokio-util]
version = "0.6.2"
default-features = false
features = ["compat"]

[features]
default = ["std"]
# Enable usage of the standard library, and in particular any interface that
# requires the io interfaces `Read` and `Write`.
std = ["alloc"]
# Enable usage of the `alloc` crate. You should always have this feature
# enabled. Without this feature neither the encoder nor the decoder exists.
# This is purely a reservation for future possibilities.
alloc = []
# Enable usage of `async` through `futures`. This is basically the blocker
# for `1.0` as we may track its version. Note that this negates no_std right
# now, but only implicitly through being required from futures. We also use
# the `std::io::Error` interface. Note that this feature is NOT supported on
# 1.34.2 but only on stable.
async = ["futures", "std"] [[bin]] name = "lzw" path = "bin/lzw.rs" bench = false required-features = ["std"] [[bench]] name = "msb8" harness = false required-features = ["std"] [[example]] name = "lzw-compress" required-features = ["std"] [[example]] name = "lzw-decompress" required-features = ["std"] [[test]] name = "async" required-features = ["async", "std"] [[test]] name = "roundtrip" required-features = ["std"] [[test]] name = "roundtrip_vec" required-features = ["alloc"] [package.metadata.docs.rs] all-features = true weezl-0.1.5/Cargo.toml0000644000000035130000000000000102240ustar00# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] edition = "2018" name = "weezl" version = "0.1.5" authors = ["HeroicKatora "] exclude = ["benches/*.lzw"] description = "Fast LZW compression and decompression." documentation = "https://docs.rs/weezl" readme = "README.md" license = "MIT OR Apache-2.0" repository = "https://github.com/image-rs/lzw.git" [package.metadata.docs.rs] all-features = true [lib] name = "weezl" bench = false [[bin]] name = "lzw" path = "bin/lzw.rs" bench = false required-features = ["std"] [[example]] name = "lzw-compress" required-features = ["std"] [[example]] name = "lzw-decompress" required-features = ["std"] [[test]] name = "async" required-features = ["async", "std"] [[test]] name = "roundtrip" required-features = ["std"] [[test]] name = "roundtrip_vec" required-features = ["alloc"] [[bench]] name = "msb8" harness = false required-features = ["std"] [dependencies.futures] version = "0.3.12" features = ["std"] optional = true default-features = false [dev-dependencies.criterion] version = "0.3.1" [dev-dependencies.tokio] version = "1" features = ["macros", "io-util", "net", "rt", "rt-multi-thread"] default-features = false [dev-dependencies.tokio-util] version = "0.6.2" features = ["compat"] default-features = false [features] alloc = [] async = ["futures", "std"] default = ["std"] std = ["alloc"] weezl-0.1.5/Changes.md010064400017500001750000000027001403664335400127650ustar0000000000000000## Version 0.1.5 - Added `IntoVec` adapters that simplify in-memory de- and encoding. A further 'one-shot' interface is exposed in the `Decoder` and `Encoder` themselves which makes the process a one liner in the simplest cases. Contrary to `IntoStream`, these are available in all cases and do not require `std`. ## Version 0.1.4 - Added `IntoAsync` adapters for asynchronous de- and encoding. The interface is implemented only in terms of `futures = 0.3` traits at the moment. - Code sizes smaller than 2 are now allowed for decoding. Since they do not roundtrip it is still an error to use them in the decoder but this avoids accidental panicking, i.e. denial of service, in parsers. ## Version 0.1.3 - Fixes an issue in compression that caused some data to be lost around clear codes. This could corrupt the data stream. ## Version 0.1.2 - Fixes incorrect compression after `Encoder::reset`. ## Version 0.1.1 - The `IntoStream` types now reuse their internal buffers. 
- Added the methods `set_buffer`, `set_buffer_size` to `IntoStream` for both the encoder and decoder, used to control the automatic allocation. - Deprecated `IntoStream` in configurations without the `std` feature where the type can't even be constructed. ## Version 0.1.0 – Aleph - Initial major release - Support gif and tiff code size changes - Rough performance numbers: On i5-4690, 8GiB DIMM DDR3 Synchronous 1600 MHz (0,6 ns) ~70MB/s encode, ~230MB/s decode weezl-0.1.5/LICENSE-APACHE010064400017500001750000000251361354024601000130120ustar0000000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.weezl-0.1.5/LICENSE-MIT010064400017500001750000000020701370761274500125330ustar0000000000000000The MIT License (MIT) Copyright (c) HeroicKatora 2020 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. weezl-0.1.5/README.md010064400017500001750000000013031371731454500123520ustar0000000000000000# weezl LZW en- and decoding that goes weeeee! ## Overview This library, written in purely safe and dependency-less Rust, provides encoding and decoding for lzw compression in the style as it occurs in `gif` and `tiff` image formats. It has a standalone binary that may be used to handle those data streams but it is _not_ compatible with Spencer's `compress` and `uncompress` binaries (though a drop-in may be developed at a later point). Using in a `no_std` environment is also possible though an allocator is required. This, too, may be relaxed in a later release. A feature flag already exists but currently turns off almost all interfaces. ## License All code is dual licensed MIT OR Apache-2.0. weezl-0.1.5/benches/msb8.rs010064400017500001750000000033671373242646200137340ustar0000000000000000extern crate criterion; extern crate weezl; use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use std::fs; use weezl::{decode::Decoder, BitOrder, LzwStatus}; pub fn criterion_benchmark(c: &mut Criterion, file: &str) { let data = fs::read(file).expect("Benchmark input not found"); let mut group = c.benchmark_group("msb-8"); let id = BenchmarkId::new(file, data.len()); let mut outbuf = vec![0; 1 << 26]; // 64MB, what wuff uses.. 
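    // Decode the whole input once per call; the byte count returned by the first
    // run also sizes the `Throughput` measurement registered further below.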
let mut decode_once = |data: &[u8]| { let mut decoder = Decoder::new(BitOrder::Msb, 8); let mut written = 0; let outbuf = outbuf.as_mut_slice(); let mut data = data; loop { let result = decoder.decode_bytes(data, &mut outbuf[..]); let done = result.status.expect("Error"); data = &data[result.consumed_in..]; written += result.consumed_out; black_box(&outbuf[..result.consumed_out]); if let LzwStatus::Done = done { break; } if let LzwStatus::NoProgress = done { panic!("Need to make progress"); } } written }; group.throughput(Throughput::Bytes(decode_once(&data) as u64)); group.bench_with_input(id, &data, |b, data| { b.iter(|| { decode_once(data); }) }); } pub fn bench_toml(c: &mut Criterion) { criterion_benchmark(c, "benches/Cargo-8-msb.lzw"); } pub fn bench_binary(c: &mut Criterion) { criterion_benchmark(c, "benches/binary-8-msb.lzw"); } pub fn bench_lib(c: &mut Criterion) { criterion_benchmark(c, "benches/lib-8-msb.lzw"); } criterion_group!(benches, bench_toml, bench_binary, bench_lib); criterion_main!(benches); weezl-0.1.5/bin/lzw.rs010064400017500001750000000116231373242646200130320ustar0000000000000000use std::path::PathBuf; use std::{env, ffi, fs, io, process}; extern crate weezl; use weezl::{decode as delzw, encode as enlzw, BitOrder}; fn main() { let args = env::args_os().skip(1); let flags = Flags::from_args(args).unwrap_or_else(|ParamError| explain()); let out = io::stdout(); let out = out.lock(); let mut files = flags.files; let input = files.pop().unwrap_or_else(explain); if !files.is_empty() { return explain(); } let operation = flags.operation.unwrap_or_else(explain); let min_code = if flags.min_code < 2 || flags.min_code > 12 { return explain(); } else { flags.min_code }; let bit_order = flags.bit_order; let result = match (input, operation) { (Input::File(file), Operation::Encode) => (|| { let data = fs::File::open(file)?; let file = io::BufReader::with_capacity(1 << 26, data); let mut encoder = enlzw::Encoder::new(bit_order, min_code); encoder.into_stream(out).encode_all(file).status })(), (Input::Stdin, Operation::Encode) => { let input = io::BufReader::with_capacity(1 << 26, io::stdin()); let mut encoder = enlzw::Encoder::new(bit_order, min_code); encoder.into_stream(out).encode_all(input).status } (Input::File(file), Operation::Decode) => (|| { let data = fs::File::open(file)?; let file = io::BufReader::with_capacity(1 << 26, data); let mut decoder = delzw::Decoder::new(bit_order, min_code); decoder.into_stream(out).decode_all(file).status })(), (Input::Stdin, Operation::Decode) => { let input = io::BufReader::with_capacity(1 << 26, io::stdin()); let mut decoder = delzw::Decoder::new(bit_order, min_code); decoder.into_stream(out).decode_all(input).status } }; result.expect("Operation Failed: "); } struct Flags { files: Vec, operation: Option, min_code: u8, bit_order: BitOrder, } struct ParamError; enum Input { File(PathBuf), Stdin, } enum Operation { Encode, Decode, } fn explain() -> T { println!( "Usage: lzw [-e|-d] \n\ Arguments:\n\ -e\t operation encode (default)\n\ -d\t operation decode\n\ \tfilepath or '-' for stdin" ); process::exit(1); } impl Default for Flags { fn default() -> Flags { Flags { files: vec![], operation: None, min_code: 8, bit_order: BitOrder::Msb, } } } impl Flags { fn from_args(mut args: impl Iterator) -> Result { let mut flags = Flags::default(); let mut operation = None; loop { match args.next().as_ref().and_then(|s| s.to_str()) { Some("-d") | Some("--decode") => { if operation.is_some() { return Err(ParamError); } operation = 
Some(Operation::Decode); } Some("-e") | Some("--encode") => { if operation.is_some() { return Err(ParamError); } operation = Some(Operation::Encode); } Some("-w") | Some("--word-bits") => match args.next() { None => return Err(ParamError), Some(bits) => { let st = bits.to_str().ok_or(ParamError)?; flags.min_code = st.parse().ok().ok_or(ParamError)?; } }, Some("-le") | Some("--little-endian") => { flags.bit_order = BitOrder::Lsb; } Some("-be") | Some("--big-endian") | Some("-ne") | Some("--network-endian") => { flags.bit_order = BitOrder::Msb; } Some("-") => { flags.files.push(Input::Stdin); } Some(other) if other.starts_with('-') => { // Reserved for future use. // -a: self-describing archive format, similar to actual compress // -b: maximum bits // -v: verbosity // some compress compatibility mode? Probably through arg(0) though. return Err(ParamError); } Some(file) => { flags.files.push(Input::File(file.into())); } None => break, }; } flags.files.extend(args.map(|file| { if let Some("-") = file.to_str() { Input::Stdin } else { Input::File(file.into()) } })); flags.operation = operation; Ok(flags) } } weezl-0.1.5/examples/lzw-compress.rs010064400017500001750000000007661371731454500157400ustar0000000000000000//! Compresses the input from stdin and writes the result to stdout. use std::io::{self, BufWriter}; fn main() { match { let mut encoder = weezl::encode::Encoder::new(weezl::BitOrder::Msb, 8); let stdin = io::stdin(); let stdin = stdin.lock(); let stdout = io::stdout(); let stdout = BufWriter::new(stdout.lock()); encoder.into_stream(stdout).encode_all(stdin).status } { Ok(()) => (), Err(err) => eprintln!("{}", err), } } weezl-0.1.5/examples/lzw-decompress.rs010064400017500001750000000007701371731454500162440ustar0000000000000000//! Decompresses the input from stdin and writes the result to stdout. use std::io::{self, BufWriter}; fn main() { match { let mut decoder = weezl::decode::Decoder::new(weezl::BitOrder::Msb, 8); let stdout = io::stdout(); let stdout = BufWriter::new(stdout.lock()); let stdin = io::stdin(); let stdin = stdin.lock(); decoder.into_stream(stdout).decode_all(stdin).status } { Ok(()) => (), Err(err) => eprintln!("{}", err), } } weezl-0.1.5/src/decode.rs010064400017500001750000001350031403437655100134560ustar0000000000000000//! A module for all decoding needs. #[cfg(feature = "std")] use crate::error::StreamResult; use crate::error::{BufferResult, LzwError, LzwStatus, VectorResult}; use crate::{BitOrder, Code, StreamBuf, MAX_CODESIZE, MAX_ENTRIES, STREAM_BUF_SIZE}; use crate::alloc::{boxed::Box, vec, vec::Vec}; #[cfg(feature = "std")] use std::io::{self, BufRead, Write}; /// The state for decoding data with an LZW algorithm. /// /// The same structure can be utilized with streams as well as your own buffers and driver logic. /// It may even be possible to mix them if you are sufficiently careful not to lose or skip any /// already decode data in the process. /// /// This is a sans-IO implementation, meaning that it only contains the state of the decoder and /// the caller will provide buffers for input and output data when calling the basic /// [`decode_bytes`] method. Nevertheless, a number of _adapters_ are provided in the `into_*` /// methods for decoding with a particular style of common IO. /// /// * [`decode`] for decoding once without any IO-loop. /// * [`into_async`] for decoding with the `futures` traits for asynchronous IO. /// * [`into_stream`] for decoding with the standard `io` traits. /// * [`into_vec`] for in-memory decoding. 
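///
/// # Example
///
/// A minimal sketch of driving [`decode_bytes`] by hand, reusing the encoded bytes from the
/// one-shot [`decode`] example; the 512-byte chunk size is arbitrary and for illustration only.
///
/// ```
/// use weezl::{decode::Decoder, BitOrder, LzwStatus};
///
/// let mut data: &[u8] = b"\x80\x04\x81\x94l\x1b\x06\xf0\xb0 \x1d\xc6\xf1\xc8l\x19 \x10";
/// let mut decoder = Decoder::new(BitOrder::Msb, 9);
/// let mut decoded = Vec::new();
/// let mut chunk = [0u8; 512];
/// loop {
///     // Each call consumes a prefix of the input and fills a prefix of the chunk.
///     let result = decoder.decode_bytes(data, &mut chunk);
///     data = &data[result.consumed_in..];
///     decoded.extend_from_slice(&chunk[..result.consumed_out]);
///     match result.status.unwrap() {
///         LzwStatus::Ok => {}
///         LzwStatus::NoProgress | LzwStatus::Done => break,
///     }
/// }
/// assert_eq!(decoded, b"Hello, world");
/// ```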
/// /// [`decode_bytes`]: #method.decode_bytes /// [`decode`]: #method.decode /// [`into_async`]: #method.into_async /// [`into_stream`]: #method.into_stream /// [`into_vec`]: #method.into_vec pub struct Decoder { state: Box, } /// A decoding stream sink. /// /// See [`Decoder::into_stream`] on how to create this type. /// /// [`Decoder::into_stream`]: struct.Decoder.html#method.into_stream #[cfg_attr( not(feature = "std"), deprecated = "This type is only useful with the `std` feature." )] #[cfg_attr(not(feature = "std"), allow(dead_code))] pub struct IntoStream<'d, W> { decoder: &'d mut Decoder, writer: W, buffer: Option>, default_size: usize, } /// An async decoding sink. /// /// See [`Decoder::into_async`] on how to create this type. /// /// [`Decoder::into_async`]: struct.Decoder.html#method.into_async #[cfg(feature = "async")] pub struct IntoAsync<'d, W> { decoder: &'d mut Decoder, writer: W, buffer: Option>, default_size: usize, } /// A decoding sink into a vector. /// /// See [`Decoder::into_vec`] on how to create this type. /// /// [`Decoder::into_vec`]: struct.Decoder.html#method.into_vec pub struct IntoVec<'d> { decoder: &'d mut Decoder, vector: &'d mut Vec, } trait Stateful { fn advance(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult; fn has_ended(&self) -> bool; /// Ignore an end code and continue decoding (no implied reset). fn restart(&mut self); /// Reset the decoder to the beginning, dropping all buffers etc. fn reset(&mut self); } #[derive(Clone)] struct Link { prev: Code, byte: u8, } #[derive(Default)] struct MsbBuffer { /// A buffer of individual bits. The oldest code is kept in the high-order bits. bit_buffer: u64, /// A precomputed mask for this code. code_mask: u16, /// The current code size. code_size: u8, /// The number of bits in the buffer. bits: u8, } #[derive(Default)] struct LsbBuffer { /// A buffer of individual bits. The oldest code is kept in the high-order bits. bit_buffer: u64, /// A precomputed mask for this code. code_mask: u16, /// The current code size. code_size: u8, /// The number of bits in the buffer. bits: u8, } trait CodeBuffer { fn new(min_size: u8) -> Self; fn reset(&mut self, min_size: u8); fn bump_code_size(&mut self); /// Retrieve the next symbol, refilling if necessary. fn next_symbol(&mut self, inp: &mut &[u8]) -> Option; /// Refill the internal buffer. fn refill_bits(&mut self, inp: &mut &[u8]); /// Get the next buffered code word. fn get_bits(&mut self) -> Option; fn max_code(&self) -> Code; fn code_size(&self) -> u8; } struct DecodeState { /// The original minimum code size. min_size: u8, /// The table of decoded codes. table: Table, /// The buffer of decoded data. buffer: Buffer, /// The link which we are still decoding and its original code. last: Option<(Code, Link)>, /// The next code entry. next_code: Code, /// Code to reset all tables. clear_code: Code, /// Code to signal the end of the stream. end_code: Code, /// A stored flag if the end code has already appeared. has_ended: bool, /// If tiff then bumps are a single code sooner. is_tiff: bool, /// The buffer for decoded words. code_buffer: CodeBuffer, } struct Buffer { bytes: Box<[u8]>, read_mark: usize, write_mark: usize, } struct Table { inner: Vec, depths: Vec, } impl Decoder { /// Create a new decoder with the specified bit order and symbol size. /// /// The algorithm for dynamically increasing the code symbol bit width is compatible with the /// original specification. 
In particular, you will need to specify an `Lsb` bit order to decode
    /// the data portion of a compressed `gif` image.
    ///
    /// # Panics
    ///
    /// The `size` needs to be in the interval `0..=12`.
    pub fn new(order: BitOrder, size: u8) -> Self {
        type Boxed = Box<dyn Stateful + Send + 'static>;
        super::assert_decode_size(size);
        let state = match order {
            BitOrder::Lsb => Box::new(DecodeState::<LsbBuffer>::new(size)) as Boxed,
            BitOrder::Msb => Box::new(DecodeState::<MsbBuffer>::new(size)) as Boxed,
        };
        Decoder { state }
    }

    /// Create a TIFF compatible decoder with the specified bit order and symbol size.
    ///
    /// The algorithm for dynamically increasing the code symbol bit width is compatible with the
    /// TIFF specification, which is a misinterpretation of the original algorithm for increasing
    /// the code size. It switches one symbol sooner.
    ///
    /// # Panics
    ///
    /// The `size` needs to be in the interval `0..=12`.
    pub fn with_tiff_size_switch(order: BitOrder, size: u8) -> Self {
        type Boxed = Box<dyn Stateful + Send + 'static>;
        super::assert_decode_size(size);
        let state = match order {
            BitOrder::Lsb => {
                let mut state = Box::new(DecodeState::<LsbBuffer>::new(size));
                state.is_tiff = true;
                state as Boxed
            }
            BitOrder::Msb => {
                let mut state = Box::new(DecodeState::<MsbBuffer>::new(size));
                state.is_tiff = true;
                state as Boxed
            }
        };
        Decoder { state }
    }

    /// Decode some bytes from `inp` and write the result to `out`.
    ///
    /// This will consume a prefix of the input buffer and write decoded output into a prefix of
    /// the output buffer. See the respective fields of the return value for the count of consumed
    /// and written bytes. For the next call, you should adjust the inputs accordingly.
    ///
    /// The call will try to decode and write as many bytes of output as available. It will be
    /// much more optimized (and avoid intermediate buffering) if it is allowed to write a large
    /// contiguous chunk at once.
    ///
    /// See [`into_stream`] for high-level functions (that are only available with the `std`
    /// feature).
    ///
    /// [`into_stream`]: #method.into_stream
    pub fn decode_bytes(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult {
        self.state.advance(inp, out)
    }

    /// Decode a single chunk of LZW encoded data.
    ///
    /// This method requires the data to contain an end marker, and returns an error otherwise.
    ///
    /// This is a convenience wrapper around [`into_vec`]. Use the `into_vec` adapter to customize
    /// buffer size, to supply an existing vector, to control whether an end marker is required,
    /// or to preserve partial data in the case of a decoding error.
    ///
    /// [`into_vec`]: #method.into_vec
    ///
    /// # Example
    ///
    /// ```
    /// use weezl::{BitOrder, decode::Decoder};
    ///
    /// // Encoded data that was created with an encoder.
    /// let data = b"\x80\x04\x81\x94l\x1b\x06\xf0\xb0 \x1d\xc6\xf1\xc8l\x19 \x10";
    /// let decoded = Decoder::new(BitOrder::Msb, 9)
    ///     .decode(data)
    ///     .unwrap();
    /// assert_eq!(decoded, b"Hello, world");
    /// ```
    pub fn decode(&mut self, data: &[u8]) -> Result<Vec<u8>, LzwError> {
        let mut output = vec![];
        self.into_vec(&mut output).decode_all(data).status?;
        Ok(output)
    }

    /// Construct a decoder into a writer.
    #[cfg(feature = "std")]
    pub fn into_stream<W: Write>(&mut self, writer: W) -> IntoStream<'_, W> {
        IntoStream {
            decoder: self,
            writer,
            buffer: None,
            default_size: STREAM_BUF_SIZE,
        }
    }

    /// Construct a decoder into an async writer.
    #[cfg(feature = "async")]
    pub fn into_async<W: futures::io::AsyncWrite>(&mut self, writer: W) -> IntoAsync<'_, W> {
        IntoAsync {
            decoder: self,
            writer,
            buffer: None,
            default_size: STREAM_BUF_SIZE,
        }
    }

    /// Construct a decoder into a vector.
    ///
    /// All decoded data is appended and the vector is __not__ cleared.
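    ///
    /// A short sketch, reusing the encoded bytes from the `decode` example above (note that the
    /// adapter appends to whatever `buffer` already contains):
    ///
    /// ```
    /// use weezl::{decode::Decoder, BitOrder};
    ///
    /// let data = b"\x80\x04\x81\x94l\x1b\x06\xf0\xb0 \x1d\xc6\xf1\xc8l\x19 \x10";
    /// let mut decoder = Decoder::new(BitOrder::Msb, 9);
    /// let mut buffer = Vec::new();
    /// decoder.into_vec(&mut buffer).decode_all(&data[..]).status.unwrap();
    /// assert_eq!(buffer, b"Hello, world");
    /// ```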
    ///
    /// Compared to `into_stream`, this interface allows high-level access to decoding without
    /// requiring the `std` feature. It can also make full use of the extra buffer control that
    /// the special target exposes.
    pub fn into_vec<'lt>(&'lt mut self, vec: &'lt mut Vec<u8>) -> IntoVec<'lt> {
        IntoVec {
            decoder: self,
            vector: vec,
        }
    }

    /// Check if the decoding has finished.
    ///
    /// No more output is produced beyond the end code that marked the finish of the stream. The
    /// decoder may have read additional bytes, including padding bits beyond the last code word,
    /// but also excess bytes provided.
    pub fn has_ended(&self) -> bool {
        self.state.has_ended()
    }

    /// Ignore an end code and continue.
    ///
    /// This will _not_ reset any of the inner code tables and does not have the effect of a
    /// clear code. It will instead continue as if the end code had not been present. If no end
    /// code has occurred then this is a no-op.
    ///
    /// You can test if an end code has occurred with [`has_ended`](#method.has_ended).
    /// FIXME: clarify how this interacts with padding introduced after end code.
    #[allow(dead_code)]
    pub(crate) fn restart(&mut self) {
        self.state.restart();
    }

    /// Reset all internal state.
    ///
    /// This produces a decoder as if it had just been constructed with `new`, but takes slightly
    /// less work. In particular, it will not deallocate any internal allocations. It will also
    /// avoid some duplicate setup work.
    pub fn reset(&mut self) {
        self.state.reset();
    }
}

#[cfg(feature = "std")]
impl<'d, W: Write> IntoStream<'d, W> {
    /// Decode data from a reader.
    ///
    /// This will read data until the stream is empty or an end marker is reached.
    pub fn decode(&mut self, read: impl BufRead) -> StreamResult {
        self.decode_part(read, false)
    }

    /// Decode data from a reader, requiring an end marker.
    pub fn decode_all(mut self, read: impl BufRead) -> StreamResult {
        self.decode_part(read, true)
    }

    /// Set the size of the intermediate decode buffer.
    ///
    /// A buffer of this size is allocated to hold one part of the decoded stream when no buffer
    /// is available and any decoding method is called. No buffer is allocated if `set_buffer`
    /// has been called. The buffer is reused.
    ///
    /// # Panics
    /// This method panics if `size` is `0`.
    pub fn set_buffer_size(&mut self, size: usize) {
        assert_ne!(size, 0, "Attempted to set empty buffer");
        self.default_size = size;
    }

    /// Use a particular buffer as an intermediate decode buffer.
    ///
    /// Calling this sets or replaces the buffer. When a buffer has been set then it is used
    /// instead of dynamically allocating a buffer. Note that the size of the buffer is critical
    /// for efficient decoding. Some optimization techniques require the buffer to hold one or
    /// more previously decoded words. There is also additional overhead from `write` calls each
    /// time the buffer has been filled.
    ///
    /// # Panics
    /// This method panics if the `buffer` is empty.
    pub fn set_buffer(&mut self, buffer: &'d mut [u8]) {
        assert_ne!(buffer.len(), 0, "Attempted to set empty buffer");
        self.buffer = Some(StreamBuf::Borrowed(buffer));
    }

    fn decode_part(&mut self, mut read: impl BufRead, must_finish: bool) -> StreamResult {
        let IntoStream {
            decoder,
            writer,
            buffer,
            default_size,
        } = self;

        enum Progress {
            Ok,
            Done,
        }

        let mut bytes_read = 0;
        let mut bytes_written = 0;

        // Converting to mutable refs to move into the `once` closure.
let read_bytes = &mut bytes_read; let write_bytes = &mut bytes_written; let outbuf: &mut [u8] = match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } { StreamBuf::Borrowed(slice) => &mut *slice, StreamBuf::Owned(vec) => &mut *vec, }; assert!(!outbuf.is_empty()); let once = move || { // Try to grab one buffer of input data. let data = read.fill_buf()?; // Decode as much of the buffer as fits. let result = decoder.decode_bytes(data, &mut outbuf[..]); // Do the bookkeeping and consume the buffer. *read_bytes += result.consumed_in; *write_bytes += result.consumed_out; read.consume(result.consumed_in); // Handle the status in the result. let done = result.status.map_err(|err| { io::Error::new(io::ErrorKind::InvalidData, &*format!("{:?}", err)) })?; // Check if we had any new data at all. if let LzwStatus::NoProgress = done { debug_assert_eq!( result.consumed_out, 0, "No progress means we have not decoded any data" ); // In particular we did not finish decoding. if must_finish { return Err(io::Error::new( io::ErrorKind::UnexpectedEof, "No more data but no end marker detected", )); } else { return Ok(Progress::Done); } } // And finish by writing our result. // TODO: we may lose data on error (also on status error above) which we might want to // deterministically handle so that we don't need to restart everything from scratch as // the only recovery strategy. Any changes welcome. writer.write_all(&outbuf[..result.consumed_out])?; Ok(if let LzwStatus::Done = done { Progress::Done } else { Progress::Ok }) }; // Decode chunks of input data until we're done. let status = core::iter::repeat_with(once) // scan+fuse can be replaced with map_while .scan((), |(), result| match result { Ok(Progress::Ok) => Some(Ok(())), Err(err) => Some(Err(err)), Ok(Progress::Done) => None, }) .fuse() .collect(); StreamResult { bytes_read, bytes_written, status, } } } impl IntoVec<'_> { /// Decode data from a slice. /// /// This will read data until the slice is empty or an end marker is reached. pub fn decode(&mut self, read: &[u8]) -> VectorResult { self.decode_part(read, false) } /// Decode data from a slice, requiring an end marker. pub fn decode_all(mut self, read: &[u8]) -> VectorResult { self.decode_part(read, true) } fn grab_buffer(&mut self) -> (&mut [u8], &mut Decoder) { const CHUNK_SIZE: usize = 1 << 12; let decoder = &mut self.decoder; let length = self.vector.len(); // Use the vector to do overflow checks and w/e. self.vector.reserve(CHUNK_SIZE); // FIXME: decoding into uninit buffer? self.vector.resize(length + CHUNK_SIZE, 0u8); (&mut self.vector[length..], decoder) } fn decode_part(&mut self, part: &[u8], must_finish: bool) -> VectorResult { let mut result = VectorResult { consumed_in: 0, consumed_out: 0, status: Ok(LzwStatus::Ok), }; enum Progress { Ok, Done, } // Converting to mutable refs to move into the `once` closure. let read_bytes = &mut result.consumed_in; let write_bytes = &mut result.consumed_out; let mut data = part; // A 64 MB buffer is quite large but should get alloc_zeroed. // Note that the decoded size can be up to quadratic in code block. let once = move || { // Grab a new output buffer. let (outbuf, decoder) = self.grab_buffer(); // Decode as much of the buffer as fits. let result = decoder.decode_bytes(data, &mut outbuf[..]); // Do the bookkeeping and consume the buffer. 
*read_bytes += result.consumed_in; *write_bytes += result.consumed_out; data = &data[result.consumed_in..]; let unfilled = outbuf.len() - result.consumed_out; let filled = self.vector.len() - unfilled; self.vector.truncate(filled); // Handle the status in the result. match result.status { Err(err) => Err(err), Ok(LzwStatus::NoProgress) if must_finish => Err(LzwError::InvalidCode), Ok(LzwStatus::NoProgress) | Ok(LzwStatus::Done) => Ok(Progress::Done), Ok(LzwStatus::Ok) => Ok(Progress::Ok), } }; // Decode chunks of input data until we're done. let status: Result<(), _> = core::iter::repeat_with(once) // scan+fuse can be replaced with map_while .scan((), |(), result| match result { Ok(Progress::Ok) => Some(Ok(())), Err(err) => Some(Err(err)), Ok(Progress::Done) => None, }) .fuse() .collect(); if let Err(err) = status { result.status = Err(err); } result } } // This is implemented in a separate file, so that 1.34.2 does not parse it. Otherwise, it would // trip over the usage of await, which is a reserved keyword in that edition/version. It only // contains an impl block. #[cfg(feature = "async")] #[path = "decode_into_async.rs"] mod impl_decode_into_async; impl DecodeState { fn new(min_size: u8) -> Self { DecodeState { min_size: min_size, table: Table::new(), buffer: Buffer::new(), last: None, clear_code: 1 << min_size, end_code: (1 << min_size) + 1, next_code: (1 << min_size) + 2, has_ended: false, is_tiff: false, code_buffer: CodeBuffer::new(min_size), } } fn init_tables(&mut self) { self.code_buffer.reset(self.min_size); self.next_code = (1 << self.min_size) + 2; self.table.init(self.min_size); } fn reset_tables(&mut self) { self.code_buffer.reset(self.min_size); self.next_code = (1 << self.min_size) + 2; self.table.clear(self.min_size); } } impl Stateful for DecodeState { fn has_ended(&self) -> bool { self.has_ended } fn restart(&mut self) { self.has_ended = false; } fn reset(&mut self) { self.table.init(self.min_size); self.buffer.read_mark = 0; self.buffer.write_mark = 0; self.last = None; self.restart(); self.code_buffer = CodeBuffer::new(self.min_size); } fn advance(&mut self, mut inp: &[u8], mut out: &mut [u8]) -> BufferResult { // Skip everything if there is nothing to do. if self.has_ended { return BufferResult { consumed_in: 0, consumed_out: 0, status: Ok(LzwStatus::Done), }; } // Rough description: // We will fill the output slice as much as possible until either there is no more symbols // to decode or an end code has been reached. This requires an internal buffer to hold a // potential tail of the word corresponding to the last symbol. This tail will then be // decoded first before continuing with the regular decoding. The same buffer is required // to persist some symbol state across calls. // // We store the words corresponding to code symbols in an index chain, bytewise, where we // push each decoded symbol. (TODO: wuffs shows some success with 8-byte units). This chain // is traversed for each symbol when it is decoded and bytes are placed directly into the // output slice. In the special case (new_code == next_code) we use an existing decoded // version that is present in either the out bytes of this call or in buffer to copy the // repeated prefix slice. // TODO: I played with a 'decoding cache' to remember the position of long symbols and // avoid traversing the chain, doing a copy of memory instead. It did however not lead to // a serious improvement. It's just unlikely to both have a long symbol and have that // repeated twice in the same output buffer. 
// // You will also find the (to my knowledge novel) concept of a _decoding burst_ which // gained some >~10% speedup in tests. This is motivated by wanting to use out-of-order // execution as much as possible and for this reason have the least possible stress on // branch prediction. Our decoding table already gives us a lookahead on symbol lengths but // only for re-used codes, not novel ones. This lookahead also makes the loop termination // when restoring each byte of the code word perfectly predictable! So a burst is a chunk // of code words which are all independent of each other, have known lengths _and_ are // guaranteed to fit into the out slice without requiring a buffer. One burst can be // decoded in an extremely tight loop. // // TODO: since words can be at most (1 << MAX_CODESIZE) = 4096 bytes long we could avoid // that intermediate buffer at the expense of not always filling the output buffer // completely. Alternatively we might follow its chain of precursor states twice. This may // be even cheaper if we store more than one byte per link so it really should be // evaluated. // TODO: if the caller was required to provide the previous last word we could also avoid // the buffer for cases where we need it to restore the next code! This could be built // backwards compatible by only doing it after an opt-in call that enables the behaviour. // Record initial lengths for the result that is returned. let o_in = inp.len(); let o_out = out.len(); // The code_link is the previously decoded symbol. // It's used to link the new code back to its predecessor. let mut code_link = None; // The status, which is written to on an invalid code. let mut status = Ok(LzwStatus::Ok); match self.last.take() { // No last state? This is the first code after a reset? None => { match self.next_symbol(&mut inp) { // Plainly invalid code. Some(code) if code > self.next_code => status = Err(LzwError::InvalidCode), // next_code would require an actual predecessor. Some(code) if code == self.next_code => status = Err(LzwError::InvalidCode), // No more symbols available and nothing decoded yet. // Assume that we didn't make progress, this may get reset to Done if we read // some bytes from the input. None => status = Ok(LzwStatus::NoProgress), // Handle a valid code. Some(init_code) => { if init_code == self.clear_code { self.init_tables(); } else if init_code == self.end_code { self.has_ended = true; status = Ok(LzwStatus::Done); } else if self.table.is_empty() { // We require an explicit reset. // TODO: allow this to be configured and do the setup implicitly. status = Err(LzwError::InvalidCode); } else { // Reconstruct the first code in the buffer. self.buffer.fill_reconstruct(&self.table, init_code); let link = self.table.at(init_code).clone(); code_link = Some((init_code, link)); } } } } // Move the tracking state to the stack. Some(tup) => code_link = Some(tup), }; // Track an empty `burst` (see below) means we made no progress. let mut burst_required_for_progress = false; // Restore the previous state, if any. if let Some((code, link)) = code_link.take() { code_link = Some((code, link)); let remain = self.buffer.buffer(); // Check if we can fully finish the buffer. 
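            // Three cases: the buffered tail is longer than `out` (copy what fits and stop),
            // the tail is empty (progress now depends on the burst loop), or the tail fits
            // entirely (drain it and continue decoding into the remainder of `out`).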
if remain.len() > out.len() { if out.is_empty() { status = Ok(LzwStatus::NoProgress); } else { out.copy_from_slice(&remain[..out.len()]); self.buffer.consume(out.len()); out = &mut []; } } else if remain.is_empty() { status = Ok(LzwStatus::NoProgress); burst_required_for_progress = true; } else { let consumed = remain.len(); out[..consumed].copy_from_slice(remain); self.buffer.consume(consumed); out = &mut out[consumed..]; burst_required_for_progress = false; } } // The tracking state for a burst. // These are actually initialized later but compiler wasn't smart enough to fully optimize // out the init code so that appears outside th loop. // TODO: maybe we can make it part of the state but it's dubious if that really gives a // benefit over stack usage? Also the slices stored here would need some treatment as we // can't infect the main struct with a lifetime. let mut burst = [0; 6]; let mut bytes = [0u16; 6]; let mut target: [&mut [u8]; 6] = Default::default(); // A special reference to out slice which holds the last decoded symbol. let mut last_decoded: Option<&[u8]> = None; while let Some((mut code, mut link)) = code_link.take() { if out.is_empty() && !self.buffer.buffer().is_empty() { code_link = Some((code, link)); break; } let mut burst_size = 0; // Ensure the code buffer is full, we're about to request some codes. // Note that this also ensures at least one code is in the buffer if any input is left. self.refill_bits(&mut inp); // A burst is a sequence of decodes that are completely independent of each other. This // is the case if neither is an end code, a clear code, or a next code, i.e. we have // all of them in the decoding table and thus known their depths, and additionally if // we can decode them directly into the output buffer. for b in &mut burst { // TODO: does it actually make a perf difference to avoid reading new bits here? *b = match self.get_bits() { None => break, Some(code) => code, }; // We can commit the previous burst code, and will take a slice from the output // buffer. This also avoids the bounds check in the tight loop later. if burst_size > 0 { let len = bytes[burst_size - 1]; let (into, tail) = out.split_at_mut(usize::from(len)); target[burst_size - 1] = into; out = tail; } // Check that we don't overflow the code size with all codes we burst decode. let potential_code = self.next_code + burst_size as u16; burst_size += 1; if potential_code == self.code_buffer.max_code() - Code::from(self.is_tiff) { break; } // A burst code can't be special. if *b == self.clear_code || *b == self.end_code || *b >= self.next_code { break; } // Read the code length and check that we can decode directly into the out slice. let len = self.table.depths[usize::from(*b)]; if out.len() < usize::from(len) { break; } bytes[burst_size - 1] = len; } // No code left, and no more bytes to fill the buffer. if burst_size == 0 { if burst_required_for_progress { status = Ok(LzwStatus::NoProgress); } code_link = Some((code, link)); break; } burst_required_for_progress = false; // Note that the very last code in the burst buffer doesn't actually belong to the // burst itself. TODO: sometimes it could, we just don't differentiate between the // breaks and a loop end condition above. That may be a speed advantage? let (&new_code, burst) = burst[..burst_size].split_last().unwrap(); // The very tight loop for restoring the actual burst. 
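            // Every code in `burst` is known to be a non-special table entry whose length fits
            // the remaining output, so each step is one `reconstruct` into its pre-split slice
            // plus one `derive` for the newly created table entry.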
for (&burst, target) in burst.iter().zip(&mut target[..burst_size - 1]) { let cha = self.table.reconstruct(burst, target); // TODO: this pushes into a Vec, maybe we can make this cleaner. // Theoretically this has a branch and llvm tends to be flaky with code layout for // the case of requiring an allocation (which can't occur in practice). let new_link = self.table.derive(&link, cha, code); self.next_code += 1; code = burst; link = new_link; } // Update the slice holding the last decoded word. if let Some(new_last) = target[..burst_size - 1].last_mut() { let slice = core::mem::replace(new_last, &mut []); last_decoded = Some(&*slice); } // Now handle the special codes. if new_code == self.clear_code { self.reset_tables(); last_decoded = None; continue; } if new_code == self.end_code { self.has_ended = true; status = Ok(LzwStatus::Done); last_decoded = None; break; } if new_code > self.next_code { status = Err(LzwError::InvalidCode); last_decoded = None; break; } let required_len = if new_code == self.next_code { self.table.depths[usize::from(code)] + 1 } else { self.table.depths[usize::from(new_code)] }; let cha; let is_in_buffer; // Check if we will need to store our current state into the buffer. if usize::from(required_len) > out.len() { is_in_buffer = true; if new_code == self.next_code { // last_decoded will be Some if we have restored any code into the out slice. // Otherwise it will still be present in the buffer. if let Some(last) = last_decoded.take() { self.buffer.bytes[..last.len()].copy_from_slice(last); self.buffer.write_mark = last.len(); self.buffer.read_mark = last.len(); } cha = self.buffer.fill_cscsc(); } else { // Restore the decoded word into the buffer. last_decoded = None; cha = self.buffer.fill_reconstruct(&self.table, new_code); } } else { is_in_buffer = false; let (target, tail) = out.split_at_mut(usize::from(required_len)); out = tail; if new_code == self.next_code { // Reconstruct high. let source = match last_decoded.take() { Some(last) => last, None => &self.buffer.bytes[..self.buffer.write_mark], }; cha = source[0]; target[..source.len()].copy_from_slice(source); target[source.len()..][0] = source[0]; } else { cha = self.table.reconstruct(new_code, target); } // A new decoded word. last_decoded = Some(target); } let new_link; // Each newly read code creates one new code/link based on the preceding code if we // have enough space to put it there. if !self.table.is_full() { let link = self.table.derive(&link, cha, code); if self.next_code == self.code_buffer.max_code() - Code::from(self.is_tiff) && self.code_buffer.code_size() < MAX_CODESIZE { self.bump_code_size(); } self.next_code += 1; new_link = link; } else { // It's actually quite likely that the next code will be a reset but just in case. // FIXME: this path hasn't been tested very well. new_link = link.clone(); } // store the information on the decoded word. code_link = Some((new_code, new_link)); // Can't make any more progress with decoding. if is_in_buffer { break; } } // We need to store the last word into the buffer in case the first code in the next // iteration is the next_code. if let Some(tail) = last_decoded { self.buffer.bytes[..tail.len()].copy_from_slice(tail); self.buffer.write_mark = tail.len(); self.buffer.read_mark = tail.len(); } // Ensure we don't indicate that no progress was made if we read some bytes from the input // (which is progress). if o_in > inp.len() { if let Ok(LzwStatus::NoProgress) = status { status = Ok(LzwStatus::Ok); } } // Store the code/link state. 
self.last = code_link; BufferResult { consumed_in: o_in.wrapping_sub(inp.len()), consumed_out: o_out.wrapping_sub(out.len()), status, } } } impl DecodeState { fn next_symbol(&mut self, inp: &mut &[u8]) -> Option { self.code_buffer.next_symbol(inp) } fn bump_code_size(&mut self) { self.code_buffer.bump_code_size() } fn refill_bits(&mut self, inp: &mut &[u8]) { self.code_buffer.refill_bits(inp) } fn get_bits(&mut self) -> Option { self.code_buffer.get_bits() } } impl CodeBuffer for MsbBuffer { fn new(min_size: u8) -> Self { MsbBuffer { code_size: min_size + 1, code_mask: (1u16 << (min_size + 1)) - 1, bit_buffer: 0, bits: 0, } } fn reset(&mut self, min_size: u8) { self.code_size = min_size + 1; self.code_mask = (1 << self.code_size) - 1; } fn next_symbol(&mut self, inp: &mut &[u8]) -> Option { if self.bits < self.code_size { self.refill_bits(inp); } self.get_bits() } fn bump_code_size(&mut self) { self.code_size += 1; self.code_mask = (self.code_mask << 1) | 1; } fn refill_bits(&mut self, inp: &mut &[u8]) { let wish_count = (64 - self.bits) / 8; let mut buffer = [0u8; 8]; let new_bits = match inp.get(..usize::from(wish_count)) { Some(bytes) => { buffer[..usize::from(wish_count)].copy_from_slice(bytes); *inp = &inp[usize::from(wish_count)..]; wish_count * 8 } None => { let new_bits = inp.len() * 8; buffer[..inp.len()].copy_from_slice(inp); *inp = &[]; new_bits as u8 } }; self.bit_buffer |= u64::from_be_bytes(buffer) >> self.bits; self.bits += new_bits; } fn get_bits(&mut self) -> Option { if self.bits < self.code_size { return None; } let mask = u64::from(self.code_mask); let rotbuf = self.bit_buffer.rotate_left(self.code_size.into()); self.bit_buffer = rotbuf & !mask; self.bits -= self.code_size; Some((rotbuf & mask) as u16) } fn max_code(&self) -> Code { self.code_mask } fn code_size(&self) -> u8 { self.code_size } } impl CodeBuffer for LsbBuffer { fn new(min_size: u8) -> Self { LsbBuffer { code_size: min_size + 1, code_mask: (1u16 << (min_size + 1)) - 1, bit_buffer: 0, bits: 0, } } fn reset(&mut self, min_size: u8) { self.code_size = min_size + 1; self.code_mask = (1 << self.code_size) - 1; } fn next_symbol(&mut self, inp: &mut &[u8]) -> Option { if self.bits < self.code_size { self.refill_bits(inp); } self.get_bits() } fn bump_code_size(&mut self) { self.code_size += 1; self.code_mask = (self.code_mask << 1) | 1; } fn refill_bits(&mut self, inp: &mut &[u8]) { let wish_count = (64 - self.bits) / 8; let mut buffer = [0u8; 8]; let new_bits = match inp.get(..usize::from(wish_count)) { Some(bytes) => { buffer[..usize::from(wish_count)].copy_from_slice(bytes); *inp = &inp[usize::from(wish_count)..]; wish_count * 8 } None => { let new_bits = inp.len() * 8; buffer[..inp.len()].copy_from_slice(inp); *inp = &[]; new_bits as u8 } }; self.bit_buffer |= u64::from_be_bytes(buffer).swap_bytes() << self.bits; self.bits += new_bits; } fn get_bits(&mut self) -> Option { if self.bits < self.code_size { return None; } let mask = u64::from(self.code_mask); let code = self.bit_buffer & mask; self.bit_buffer >>= self.code_size; self.bits -= self.code_size; Some(code as u16) } fn max_code(&self) -> Code { self.code_mask } fn code_size(&self) -> u8 { self.code_size } } impl Buffer { fn new() -> Self { Buffer { bytes: vec![0; MAX_ENTRIES].into_boxed_slice(), read_mark: 0, write_mark: 0, } } /// When encoding a sequence `cScSc` where `c` is any character and `S` is any string /// this results in two codes `AB`, `A` encoding `cS` and `B` encoding `cSc`. 
Supposing /// the buffer is already filled with the reconstruction of `A`, we can easily fill it /// with the reconstruction of `B`. fn fill_cscsc(&mut self) -> u8 { self.bytes[self.write_mark] = self.bytes[0]; self.write_mark += 1; self.read_mark = 0; self.bytes[0] } // Fill the buffer by decoding from the table fn fill_reconstruct(&mut self, table: &Table, code: Code) -> u8 { self.write_mark = 0; self.read_mark = 0; let depth = table.depths[usize::from(code)]; let mut memory = core::mem::replace(&mut self.bytes, Box::default()); let out = &mut memory[..usize::from(depth)]; let last = table.reconstruct(code, out); self.bytes = memory; self.write_mark = usize::from(depth); last } fn buffer(&self) -> &[u8] { &self.bytes[self.read_mark..self.write_mark] } fn consume(&mut self, amt: usize) { self.read_mark += amt; } } impl Table { fn new() -> Self { Table { inner: Vec::with_capacity(MAX_ENTRIES), depths: Vec::with_capacity(MAX_ENTRIES), } } fn clear(&mut self, min_size: u8) { let static_count = usize::from(1u16 << u16::from(min_size)) + 2; self.inner.truncate(static_count); self.depths.truncate(static_count); } fn init(&mut self, min_size: u8) { self.inner.clear(); self.depths.clear(); for i in 0..(1u16 << u16::from(min_size)) { self.inner.push(Link::base(i as u8)); self.depths.push(1); } // Clear code. self.inner.push(Link::base(0)); self.depths.push(0); // End code. self.inner.push(Link::base(0)); self.depths.push(0); } fn at(&self, code: Code) -> &Link { &self.inner[usize::from(code)] } fn is_empty(&self) -> bool { self.inner.is_empty() } fn is_full(&self) -> bool { self.inner.len() >= MAX_ENTRIES } fn derive(&mut self, from: &Link, byte: u8, prev: Code) -> Link { let link = from.derive(byte, prev); let depth = self.depths[usize::from(prev)] + 1; self.inner.push(link.clone()); self.depths.push(depth); link } fn reconstruct(&self, code: Code, out: &mut [u8]) -> u8 { let mut code_iter = code; let table = &self.inner[..=usize::from(code)]; let len = code_iter; for ch in out.iter_mut().rev() { //(code, cha) = self.table[k as usize]; // Note: This could possibly be replaced with an unchecked array access if // - value is asserted to be < self.next_code() in push // - min_size is asserted to be < MAX_CODESIZE let entry = &table[usize::from(code_iter)]; code_iter = core::cmp::min(len, entry.prev); *ch = entry.byte; } out[0] } } impl Link { fn base(byte: u8) -> Self { Link { prev: 0, byte } } // TODO: this has self type to make it clear we might depend on the old in a future // optimization. However, that has no practical purpose right now. 
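// An illustration of the link structure: after decoding the word "ab" the table holds an
// entry `{ prev: code_of('a'), byte: b'b' }`, and `reconstruct` above recovers a word by
// walking such `prev` links backwards while writing each entry's `byte` from the end of the
// output slice towards its front. (`code_of` is shorthand here, not a function in this file.)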
fn derive(&self, byte: u8, prev: Code) -> Self { Link { prev, byte } } } #[cfg(test)] mod tests { use crate::alloc::vec::Vec; #[cfg(feature = "std")] use crate::StreamBuf; use crate::{decode::Decoder, BitOrder}; #[test] fn invalid_code_size_low() { let _ = Decoder::new(BitOrder::Msb, 0); let _ = Decoder::new(BitOrder::Msb, 1); } #[test] #[should_panic] fn invalid_code_size_high() { let _ = Decoder::new(BitOrder::Msb, 14); } fn make_encoded() -> Vec { const FILE: &'static [u8] = include_bytes!(concat!( env!("CARGO_MANIFEST_DIR"), "/benches/binary-8-msb.lzw" )); return Vec::from(FILE); } #[test] #[cfg(feature = "std")] fn into_stream_buffer_no_alloc() { let encoded = make_encoded(); let mut decoder = Decoder::new(BitOrder::Msb, 8); let mut output = vec![]; let mut buffer = [0; 512]; let mut istream = decoder.into_stream(&mut output); istream.set_buffer(&mut buffer[..]); istream.decode(&encoded[..]).status.unwrap(); match istream.buffer { Some(StreamBuf::Borrowed(_)) => {} None => panic!("Decoded without buffer??"), Some(StreamBuf::Owned(_)) => panic!("Unexpected buffer allocation"), } } #[test] #[cfg(feature = "std")] fn into_stream_buffer_small_alloc() { struct WriteTap(W); const BUF_SIZE: usize = 512; impl std::io::Write for WriteTap { fn write(&mut self, buf: &[u8]) -> std::io::Result { assert!(buf.len() <= BUF_SIZE); self.0.write(buf) } fn flush(&mut self) -> std::io::Result<()> { self.0.flush() } } let encoded = make_encoded(); let mut decoder = Decoder::new(BitOrder::Msb, 8); let mut output = vec![]; let mut istream = decoder.into_stream(WriteTap(&mut output)); istream.set_buffer_size(512); istream.decode(&encoded[..]).status.unwrap(); match istream.buffer { Some(StreamBuf::Owned(vec)) => assert!(vec.len() <= BUF_SIZE), Some(StreamBuf::Borrowed(_)) => panic!("Unexpected borrowed buffer, where from?"), None => panic!("Decoded without buffer??"), } } #[test] #[cfg(feature = "std")] fn reset() { let encoded = make_encoded(); let mut decoder = Decoder::new(BitOrder::Msb, 8); let mut reference = None; for _ in 0..2 { let mut output = vec![]; let mut buffer = [0; 512]; let mut istream = decoder.into_stream(&mut output); istream.set_buffer(&mut buffer[..]); istream.decode_all(&encoded[..]).status.unwrap(); decoder.reset(); if let Some(reference) = &reference { assert_eq!(output, *reference); } else { reference = Some(output); } } } } weezl-0.1.5/src/decode_into_async.rs010064400017500001750000000121601403363510300156700ustar0000000000000000use crate::decode::IntoAsync; use crate::error::LzwStatus; use crate::error::StreamResult; use crate::StreamBuf; use std::io; impl<'d, W: futures::io::AsyncWrite + core::marker::Unpin> IntoAsync<'d, W> { /// Decode data from a reader. /// /// This will read data until the stream is empty or an end marker is reached. pub async fn decode(&mut self, read: impl futures::io::AsyncBufRead) -> StreamResult { self.decode_part(read, false).await } /// Decode data from a reader, requiring an end marker. pub async fn decode_all(mut self, read: impl futures::io::AsyncBufRead) -> StreamResult { self.decode_part(read, true).await } /// Set the size of the intermediate decode buffer. /// /// A buffer of this size is allocated to hold one part of the decoded stream when no buffer is /// available and any decoding method is called. No buffer is allocated if `set_buffer` has /// been called. The buffer is reused. /// /// # Panics /// This method panics if `size` is `0`. 
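    ///
    /// # Example
    ///
    /// A sketch of the intended use (the `1 << 20` size is purely illustrative):
    /// `let mut stream = decoder.into_async(writer); stream.set_buffer_size(1 << 20);`
    /// trades a single larger allocation for fewer `write` calls.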
pub fn set_buffer_size(&mut self, size: usize) { assert_ne!(size, 0, "Attempted to set empty buffer"); self.default_size = size; } /// Use a particular buffer as an intermediate decode buffer. /// /// Calling this sets or replaces the buffer. When a buffer has been set then it is used /// instead of dynamically allocating a buffer. Note that the size of the buffer is critical /// for efficient decoding. Some optimization techniques require the buffer to hold one or more /// previous decoded words. There is also additional overhead from `write` calls each time the /// buffer has been filled. /// /// # Panics /// This method panics if the `buffer` is empty. pub fn set_buffer(&mut self, buffer: &'d mut [u8]) { assert_ne!(buffer.len(), 0, "Attempted to set empty buffer"); self.buffer = Some(StreamBuf::Borrowed(buffer)); } async fn decode_part( &mut self, read: impl futures::io::AsyncBufRead, must_finish: bool, ) -> StreamResult { use futures::io::AsyncBufReadExt; use futures::io::AsyncWriteExt; let IntoAsync { decoder, writer, buffer, default_size, } = self; futures::pin_mut!(read); let mut read: core::pin::Pin<_> = read; let mut bytes_read = 0; let mut bytes_written = 0; // Converting to mutable refs to move into the `once` closure. let read_bytes = &mut bytes_read; let write_bytes = &mut bytes_written; let outbuf: &mut [u8] = match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } { StreamBuf::Borrowed(slice) => &mut *slice, StreamBuf::Owned(vec) => &mut *vec, }; assert!(!outbuf.is_empty()); let status = loop { // Try to grab one buffer of input data. let mut filler = read.as_mut(); let data = match filler.fill_buf().await { Ok(buf) => buf, Err(err) => break Err(err), }; // Decode as much of the buffer as fits. let result = decoder.decode_bytes(data, &mut outbuf[..]); // Do the bookkeeping and consume the buffer. *read_bytes += result.consumed_in; *write_bytes += result.consumed_out; read.as_mut().consume(result.consumed_in); // Handle an error status in the result. let status = match result.status { Ok(ok) => ok, Err(err) => { break Err(io::Error::new( io::ErrorKind::InvalidData, &*format!("{:?}", err), )); } }; // Check if we had any new data at all. if let LzwStatus::NoProgress = status { debug_assert_eq!( result.consumed_out, 0, "No progress means we have not decoded any data" ); // In particular we did not finish decoding. if must_finish { break Err(io::Error::new( io::ErrorKind::UnexpectedEof, "No more data but no end marker detected", )); } else { break Ok(()); } } // And finish by writing our result. // TODO: we may lose data on error (also on status error above) which we might want to // deterministically handle so that we don't need to restart everything from scratch as // the only recovery strategy. Any changes welcome. match writer.write_all(&outbuf[..result.consumed_out]).await { Ok(_) => {} Err(err) => break Err(err), } if let LzwStatus::Done = status { break Ok(()); } }; StreamResult { bytes_read, bytes_written, status, } } } weezl-0.1.5/src/encode.rs010064400017500001750000001065751403437655100135040ustar0000000000000000//! A module for all encoding needs. use crate::error::{BufferResult, LzwError, LzwStatus, VectorResult}; use crate::{BitOrder, Code, StreamBuf, MAX_CODESIZE, MAX_ENTRIES, STREAM_BUF_SIZE}; use crate::alloc::{boxed::Box, vec::Vec}; #[cfg(feature = "std")] use crate::error::StreamResult; #[cfg(feature = "std")] use std::io::{self, BufRead, Write}; /// The state for encoding data with an LZW algorithm. 
///
/// The same structure can be utilized with streams as well as your own buffers and driver logic.
/// It may even be possible to mix them if you are sufficiently careful not to lose any written
/// data in the process.
///
/// This is a sans-IO implementation, meaning that it only contains the state of the encoder and
/// the caller will provide buffers for input and output data when calling the basic
/// [`encode_bytes`] method. Nevertheless, a number of _adapters_ are provided in the `into_*`
/// methods for encoding with a particular style of common IO.
///
/// * [`encode`] for encoding once without any IO-loop.
/// * [`into_async`] for encoding with the `futures` traits for asynchronous IO.
/// * [`into_stream`] for encoding with the standard `io` traits.
/// * [`into_vec`] for in-memory encoding.
///
/// [`encode_bytes`]: #method.encode_bytes
/// [`encode`]: #method.encode
/// [`into_async`]: #method.into_async
/// [`into_stream`]: #method.into_stream
/// [`into_vec`]: #method.into_vec
pub struct Encoder {
    /// Internally dispatch via a dynamic trait object. This did not have any significant
    /// performance impact as we batch data internally and this pointer does not change after
    /// creation!
    state: Box<dyn Stateful + Send>,
}

/// An encoding stream sink.
///
/// See [`Encoder::into_stream`] on how to create this type.
///
/// [`Encoder::into_stream`]: struct.Encoder.html#method.into_stream
#[cfg_attr(
    not(feature = "std"),
    deprecated = "This type is only useful with the `std` feature."
)]
#[cfg_attr(not(feature = "std"), allow(dead_code))]
pub struct IntoStream<'d, W> {
    encoder: &'d mut Encoder,
    writer: W,
    buffer: Option<StreamBuf<'d>>,
    default_size: usize,
}

/// An async encoding sink.
///
/// See [`Encoder::into_async`] on how to create this type.
///
/// [`Encoder::into_async`]: struct.Encoder.html#method.into_async
#[cfg(feature = "async")]
pub struct IntoAsync<'d, W> {
    encoder: &'d mut Encoder,
    writer: W,
    buffer: Option<StreamBuf<'d>>,
    default_size: usize,
}

/// An encoding sink into a vector.
///
/// See [`Encoder::into_vec`] on how to create this type.
///
/// [`Encoder::into_vec`]: struct.Encoder.html#method.into_vec
pub struct IntoVec<'d> {
    encoder: &'d mut Encoder,
    vector: &'d mut Vec<u8>,
}

trait Stateful {
    fn advance(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult;
    fn mark_ended(&mut self) -> bool;
    /// Reset the state tracking if end code has been written.
    fn restart(&mut self);
    /// Reset the encoder to the beginning, dropping all buffers etc.
    fn reset(&mut self);
}

struct EncodeState<B: Buffer> {
    /// The configured minimal code size.
    min_size: u8,
    /// The current encoding symbol tree.
    tree: Tree,
    /// If we have pushed the end code.
    has_ended: bool,
    /// If tiff then bumps are a single code sooner.
    is_tiff: bool,
    /// The code corresponding to the currently read characters.
    current_code: Code,
    /// The clear code for resetting the dictionary.
    clear_code: Code,
    /// The bit buffer for encoding.
    buffer: B,
}

struct MsbBuffer {
    /// The current code length.
    code_size: u8,
    /// The buffer bits.
    buffer: u64,
    /// The number of valid buffer bits.
    bits_in_buffer: u8,
}

struct LsbBuffer {
    /// The current code length.
    code_size: u8,
    /// The buffer bits.
    buffer: u64,
    /// The number of valid buffer bits.
    bits_in_buffer: u8,
}

trait Buffer {
    fn new(size: u8) -> Self;
    /// Reset the code size in the buffer.
    fn reset(&mut self, min_size: u8);
    /// Apply effects of a Clear Code.
    fn clear(&mut self, min_size: u8);
    /// Insert a code into the buffer.
    fn buffer_code(&mut self, code: Code);
    /// Push bytes if the buffer space is getting small.
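    /// (As implemented below this returns `true` exactly when the output slice ran out while
    /// whole bytes were still buffered, mirroring `flush_out`.)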
    fn push_out(&mut self, out: &mut &mut [u8]) -> bool;
    /// Flush all full bytes, returning `true` if at least one more byte remains.
    fn flush_out(&mut self, out: &mut &mut [u8]) -> bool;
    /// Pad the buffer to a full byte.
    fn buffer_pad(&mut self);
    /// Increase the maximum code size.
    fn bump_code_size(&mut self);
    /// Return the maximum code with the current code size.
    fn max_code(&self) -> Code;
    /// Return the current code size in bits.
    fn code_size(&self) -> u8;
}

/// At most one tree node for each code.
/// To avoid using too much memory we keep nodes with few successors in an optimized form. This
/// form doesn't offer lookup by indexing but instead does a linear search.
#[derive(Default)]
struct Tree {
    simples: Vec<Simple>,
    complex: Vec<Full>,
    keys: Vec<CompressedKey>,
}

#[derive(Clone, Copy)]
enum FullKey {
    NoSuccessor,
    Simple(u16),
    Full(u16),
}

#[derive(Clone, Copy)]
struct CompressedKey(u16);

const SHORT: usize = 16;

#[derive(Clone, Copy)]
struct Simple {
    codes: [Code; SHORT],
    chars: [u8; SHORT],
    count: u8,
}

#[derive(Clone, Copy)]
struct Full {
    char_continuation: [Code; 256],
}

impl Encoder {
    /// Create a new encoder with the specified bit order and symbol size.
    ///
    /// The algorithm for dynamically increasing the code symbol bit width is compatible with the
    /// original specification. In particular you will need to specify an `Lsb` bit order to
    /// encode the data portion of a compressed `gif` image.
    ///
    /// # Panics
    ///
    /// The `size` needs to be in the interval `2..=12`.
    pub fn new(order: BitOrder, size: u8) -> Self {
        type Boxed = Box<dyn Stateful + Send>;
        super::assert_encode_size(size);
        let state = match order {
            BitOrder::Lsb => Box::new(EncodeState::<LsbBuffer>::new(size)) as Boxed,
            BitOrder::Msb => Box::new(EncodeState::<MsbBuffer>::new(size)) as Boxed,
        };
        Encoder { state }
    }

    /// Create a TIFF compatible encoder with the specified bit order and symbol size.
    ///
    /// The algorithm for dynamically increasing the code symbol bit width is compatible with the
    /// TIFF specification, which is a misinterpretation of the original algorithm for increasing
    /// the code size. It switches one symbol sooner.
    ///
    /// # Panics
    ///
    /// The `size` needs to be in the interval `2..=12`.
    pub fn with_tiff_size_switch(order: BitOrder, size: u8) -> Self {
        type Boxed = Box<dyn Stateful + Send>;
        super::assert_encode_size(size);
        let state = match order {
            BitOrder::Lsb => {
                let mut state = Box::new(EncodeState::<LsbBuffer>::new(size));
                state.is_tiff = true;
                state as Boxed
            }
            BitOrder::Msb => {
                let mut state = Box::new(EncodeState::<MsbBuffer>::new(size));
                state.is_tiff = true;
                state as Boxed
            }
        };
        Encoder { state }
    }

    /// Encode some bytes from `inp` into `out`.
    ///
    /// See [`into_stream`] for high-level functions (this interface is only available with the
    /// `std` feature) and [`finish`] for marking the input data as complete.
    ///
    /// When some input byte is invalid, i.e. is not smaller than `1 << size`, then that byte and
    /// all following ones will _not_ be consumed and the `status` of the result will signal an
    /// error. The result will also indicate that all bytes up to but not including the offending
    /// byte have been consumed. You may try again with a fixed byte.
    ///
    /// [`into_stream`]: #method.into_stream
    /// [`finish`]: #method.finish
    pub fn encode_bytes(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult {
        self.state.advance(inp, out)
    }

    /// Encode a single chunk of data.
    ///
    /// This method will add an end marker to the encoded chunk.
    ///
    /// This is a convenience wrapper around [`into_vec`].
    /// Use the `into_vec` adapter to customize buffer size, to supply an existing vector, to
    /// control whether an end marker is required, or to preserve partial data in the case of an
    /// encoding error.
    ///
    /// [`into_vec`]: #method.into_vec
    ///
    /// # Example
    ///
    /// ```
    /// use weezl::{BitOrder, encode::Encoder};
    ///
    /// let data = b"Hello, world";
    /// let encoded = Encoder::new(BitOrder::Msb, 9)
    ///     .encode(data)
    ///     .expect("All bytes valid for code size");
    /// ```
    pub fn encode(&mut self, data: &[u8]) -> Result<Vec<u8>, LzwError> {
        let mut output = Vec::new();
        self.into_vec(&mut output).encode_all(data).status?;
        Ok(output)
    }

    /// Construct an encoder into a writer.
    #[cfg(feature = "std")]
    pub fn into_stream<W: Write>(&mut self, writer: W) -> IntoStream<'_, W> {
        IntoStream {
            encoder: self,
            writer,
            buffer: None,
            default_size: STREAM_BUF_SIZE,
        }
    }

    /// Construct an encoder into an async writer.
    #[cfg(feature = "async")]
    pub fn into_async<W: futures::io::AsyncWrite>(&mut self, writer: W) -> IntoAsync<'_, W> {
        IntoAsync {
            encoder: self,
            writer,
            buffer: None,
            default_size: STREAM_BUF_SIZE,
        }
    }

    /// Construct an encoder into a vector.
    ///
    /// All encoded data is appended and the vector is __not__ cleared.
    ///
    /// Compared to `into_stream` this interface allows high-level access to encoding without
    /// requiring the `std` feature. Also, it can make full use of the extra buffer control that
    /// the special target exposes.
    pub fn into_vec<'lt>(&'lt mut self, vec: &'lt mut Vec<u8>) -> IntoVec<'lt> {
        IntoVec {
            encoder: self,
            vector: vec,
        }
    }

    /// Mark the encoding as in the process of finishing.
    ///
    /// The next following call to `encode_bytes` which is able to consume the complete input will
    /// also try to emit an end code. It's not recommended, but also not unsound, to use different
    /// byte slices in different calls from this point forward and thus to 'delay' the actual end
    /// of the data stream. The behaviour after the end marker has been written is unspecified but
    /// sound.
    pub fn finish(&mut self) {
        self.state.mark_ended();
    }

    /// Undo marking this data stream as ending.
    /// FIXME: clarify how this interacts with padding introduced after end code.
    #[allow(dead_code)]
    pub(crate) fn restart(&mut self) {
        self.state.restart()
    }

    /// Reset all internal state.
    ///
    /// This produces an encoder as if just constructed with `new`, but takes slightly less work.
    /// In particular it will not deallocate any internal allocations. It will also avoid some
    /// duplicate setup work.
    pub fn reset(&mut self) {
        self.state.reset()
    }
}

#[cfg(feature = "std")]
impl<'d, W: Write> IntoStream<'d, W> {
    /// Encode data from a reader.
    ///
    /// This will drain the supplied reader. It will not encode an end marker after all data has
    /// been processed.
    pub fn encode(&mut self, read: impl BufRead) -> StreamResult {
        self.encode_part(read, false)
    }

    /// Encode data from a reader and an end marker.
    pub fn encode_all(mut self, read: impl BufRead) -> StreamResult {
        self.encode_part(read, true)
    }

    /// Set the size of the intermediate encode buffer.
    ///
    /// A buffer of this size is allocated to hold one part of the encoded stream when no buffer
    /// is available and any encoding method is called. No buffer is allocated if `set_buffer`
    /// has been called. The buffer is reused.
    ///
    /// # Panics
    /// This method panics if `size` is `0`.
    pub fn set_buffer_size(&mut self, size: usize) {
        assert_ne!(size, 0, "Attempted to set empty buffer");
        self.default_size = size;
    }

    /// Use a particular buffer as an intermediate encode buffer.
    ///
    /// Calling this sets or replaces the buffer.
    /// When a buffer has been set then it is used instead of dynamically allocating a buffer.
    /// Note that the size of the buffer is relevant for efficient encoding as there is
    /// additional overhead from `write` calls each time the buffer has been filled.
    ///
    /// # Panics
    /// This method panics if the `buffer` is empty.
    pub fn set_buffer(&mut self, buffer: &'d mut [u8]) {
        assert_ne!(buffer.len(), 0, "Attempted to set empty buffer");
        self.buffer = Some(StreamBuf::Borrowed(buffer));
    }

    fn encode_part(&mut self, mut read: impl BufRead, finish: bool) -> StreamResult {
        let IntoStream {
            encoder,
            writer,
            buffer,
            default_size,
        } = self;

        enum Progress {
            Ok,
            Done,
        }

        let mut bytes_read = 0;
        let mut bytes_written = 0;

        let read_bytes = &mut bytes_read;
        let write_bytes = &mut bytes_written;

        let outbuf: &mut [u8] =
            match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } {
                StreamBuf::Borrowed(slice) => &mut *slice,
                StreamBuf::Owned(vec) => &mut *vec,
            };
        assert!(!outbuf.is_empty());

        let once = move || {
            let data = read.fill_buf()?;

            if data.is_empty() {
                if finish {
                    encoder.finish();
                } else {
                    return Ok(Progress::Done);
                }
            }

            let result = encoder.encode_bytes(data, &mut outbuf[..]);
            *read_bytes += result.consumed_in;
            *write_bytes += result.consumed_out;
            read.consume(result.consumed_in);

            let done = result.status.map_err(|err| {
                io::Error::new(io::ErrorKind::InvalidData, &*format!("{:?}", err))
            })?;

            if let LzwStatus::Done = done {
                writer.write_all(&outbuf[..result.consumed_out])?;
                return Ok(Progress::Done);
            }

            if let LzwStatus::NoProgress = done {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "No more data but no end marker detected",
                ));
            }

            writer.write_all(&outbuf[..result.consumed_out])?;
            Ok(Progress::Ok)
        };

        let status = core::iter::repeat_with(once)
            // scan+fuse can be replaced with map_while
            .scan((), |(), result| match result {
                Ok(Progress::Ok) => Some(Ok(())),
                Err(err) => Some(Err(err)),
                Ok(Progress::Done) => None,
            })
            .fuse()
            .collect();

        StreamResult {
            bytes_read,
            bytes_written,
            status,
        }
    }
}

impl IntoVec<'_> {
    /// Encode data from a slice.
    pub fn encode(&mut self, read: &[u8]) -> VectorResult {
        self.encode_part(read, false)
    }

    /// Encode data from a slice, adding an end marker.
    pub fn encode_all(mut self, read: &[u8]) -> VectorResult {
        self.encode_part(read, true)
    }

    fn grab_buffer(&mut self) -> (&mut [u8], &mut Encoder) {
        const CHUNK_SIZE: usize = 1 << 12;
        let encoder = &mut self.encoder;
        let length = self.vector.len();

        // Use the vector to do overflow checks and w/e.
        self.vector.reserve(CHUNK_SIZE);
        // FIXME: encoding into uninit buffer?
        self.vector.resize(length + CHUNK_SIZE, 0u8);

        (&mut self.vector[length..], encoder)
    }

    fn encode_part(&mut self, part: &[u8], finish: bool) -> VectorResult {
        let mut result = VectorResult {
            consumed_in: 0,
            consumed_out: 0,
            status: Ok(LzwStatus::Ok),
        };

        enum Progress {
            Ok,
            Done,
        }

        // Converting to mutable refs to move into the `once` closure.
        let read_bytes = &mut result.consumed_in;
        let write_bytes = &mut result.consumed_out;
        let mut data = part;

        // A 64 MB buffer is quite large but should get alloc_zeroed.
        // Note that the decoded size can be up to quadratic in code block.
        let once = move || {
            // Grab a new output buffer.
            let (outbuf, encoder) = self.grab_buffer();

            if finish {
                encoder.finish();
            }

            // Encode as much of the input as fits.
            let result = encoder.encode_bytes(data, &mut outbuf[..]);

            // Do the bookkeeping and consume the buffer.
            *read_bytes += result.consumed_in;
            *write_bytes += result.consumed_out;
            data = &data[result.consumed_in..];

            let unfilled = outbuf.len() - result.consumed_out;
            let filled = self.vector.len() - unfilled;
            self.vector.truncate(filled);

            // Handle the status in the result.
            let done = result.status?;
            if let LzwStatus::Done = done {
                Ok(Progress::Done)
            } else {
                Ok(Progress::Ok)
            }
        };

        // Encode chunks of input data until we're done.
        let status: Result<(), _> = core::iter::repeat_with(once)
            // scan+fuse can be replaced with map_while
            .scan((), |(), result| match result {
                Ok(Progress::Ok) => Some(Ok(())),
                Err(err) => Some(Err(err)),
                Ok(Progress::Done) => None,
            })
            .fuse()
            .collect();

        if let Err(err) = status {
            result.status = Err(err);
        }

        result
    }
}

// This is implemented in a separate file, so that 1.34.2 does not parse it. Otherwise, it would
// trip over the usage of await, which is a reserved keyword in that edition/version. It only
// contains an impl block.
#[cfg(feature = "async")]
#[path = "encode_into_async.rs"]
mod impl_encode_into_async;

impl<B: Buffer> EncodeState<B> {
    fn new(min_size: u8) -> Self {
        let clear_code = 1 << min_size;
        let mut tree = Tree::default();
        tree.init(min_size);
        let mut state = EncodeState {
            min_size,
            tree,
            has_ended: false,
            is_tiff: false,
            current_code: clear_code,
            clear_code,
            buffer: B::new(min_size),
        };
        state.buffer_code(clear_code);
        state
    }
}

impl<B: Buffer> Stateful for EncodeState<B> {
    fn advance(&mut self, mut inp: &[u8], mut out: &mut [u8]) -> BufferResult {
        let c_in = inp.len();
        let c_out = out.len();
        let mut status = Ok(LzwStatus::Ok);

        'encoding: loop {
            if self.push_out(&mut out) {
                break;
            }

            if inp.is_empty() && self.has_ended {
                let end = self.end_code();
                if self.current_code != end {
                    if self.current_code != self.clear_code {
                        self.buffer_code(self.current_code);

                        // When reading this code, the decoder will add an extra entry to its
                        // table before reading the end code. Thus, it may increase its code
                        // size based on this additional entry.
                        if self.tree.keys.len() + usize::from(self.is_tiff)
                            > usize::from(self.buffer.max_code())
                            && self.buffer.code_size() < MAX_CODESIZE
                        {
                            self.buffer.bump_code_size();
                        }
                    }
                    self.buffer_code(end);
                    self.current_code = end;
                    self.buffer_pad();
                }

                break;
            }

            let mut next_code = None;
            let mut bytes = inp.iter();
            while let Some(&byte) = bytes.next() {
                if self.min_size < 8 && byte >= 1 << self.min_size {
                    status = Err(LzwError::InvalidCode);
                    break 'encoding;
                }

                inp = bytes.as_slice();
                match self.tree.iterate(self.current_code, byte) {
                    Ok(code) => self.current_code = code,
                    Err(_) => {
                        next_code = Some(self.current_code);
                        self.current_code = u16::from(byte);
                        break;
                    }
                }
            }

            match next_code {
                // No more bytes, no code produced.
None => break, Some(code) => { self.buffer_code(code); if self.tree.keys.len() + usize::from(self.is_tiff) > usize::from(self.buffer.max_code()) + 1 && self.buffer.code_size() < MAX_CODESIZE { self.buffer.bump_code_size(); } if self.tree.keys.len() > MAX_ENTRIES { self.buffer_code(self.clear_code); self.tree.reset(self.min_size); self.buffer.clear(self.min_size); } } } } if inp.is_empty() && self.current_code == self.end_code() { if !self.flush_out(&mut out) { status = Ok(LzwStatus::Done); } } BufferResult { consumed_in: c_in - inp.len(), consumed_out: c_out - out.len(), status, } } fn mark_ended(&mut self) -> bool { core::mem::replace(&mut self.has_ended, true) } fn restart(&mut self) { self.has_ended = false; } fn reset(&mut self) { self.restart(); self.current_code = self.clear_code; self.tree.reset(self.min_size); self.buffer.reset(self.min_size); self.buffer_code(self.clear_code); } } impl EncodeState { fn push_out(&mut self, out: &mut &mut [u8]) -> bool { self.buffer.push_out(out) } fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { self.buffer.flush_out(out) } fn end_code(&self) -> Code { self.clear_code + 1 } fn buffer_pad(&mut self) { self.buffer.buffer_pad(); } fn buffer_code(&mut self, code: Code) { self.buffer.buffer_code(code); } } impl Buffer for MsbBuffer { fn new(min_size: u8) -> Self { MsbBuffer { code_size: min_size + 1, buffer: 0, bits_in_buffer: 0, } } fn reset(&mut self, min_size: u8) { self.code_size = min_size + 1; self.buffer = 0; self.bits_in_buffer = 0; } fn clear(&mut self, min_size: u8) { self.code_size = min_size + 1; } fn buffer_code(&mut self, code: Code) { let shift = 64 - self.bits_in_buffer - self.code_size; self.buffer |= u64::from(code) << shift; self.bits_in_buffer += self.code_size; } fn push_out(&mut self, out: &mut &mut [u8]) -> bool { if self.bits_in_buffer + 2 * self.code_size < 64 { return false; } self.flush_out(out) } fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { let want = usize::from(self.bits_in_buffer / 8); let count = want.min((*out).len()); let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count); *out = tail; for b in bytes { *b = ((self.buffer & 0xff00_0000_0000_0000) >> 56) as u8; self.buffer <<= 8; self.bits_in_buffer -= 8; } count < want } fn buffer_pad(&mut self) { let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7; self.bits_in_buffer += to_byte; } fn bump_code_size(&mut self) { self.code_size += 1; } fn max_code(&self) -> Code { (1 << self.code_size) - 1 } fn code_size(&self) -> u8 { self.code_size } } impl Buffer for LsbBuffer { fn new(min_size: u8) -> Self { LsbBuffer { code_size: min_size + 1, buffer: 0, bits_in_buffer: 0, } } fn reset(&mut self, min_size: u8) { self.code_size = min_size + 1; self.buffer = 0; self.bits_in_buffer = 0; } fn clear(&mut self, min_size: u8) { self.code_size = min_size + 1; } fn buffer_code(&mut self, code: Code) { self.buffer |= u64::from(code) << self.bits_in_buffer; self.bits_in_buffer += self.code_size; } fn push_out(&mut self, out: &mut &mut [u8]) -> bool { if self.bits_in_buffer + 2 * self.code_size < 64 { return false; } self.flush_out(out) } fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { let want = usize::from(self.bits_in_buffer / 8); let count = want.min((*out).len()); let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count); *out = tail; for b in bytes { *b = (self.buffer & 0x0000_0000_0000_00ff) as u8; self.buffer >>= 8; self.bits_in_buffer -= 8; } count < want } fn buffer_pad(&mut self) { let to_byte = 
self.bits_in_buffer.wrapping_neg() & 0x7; self.bits_in_buffer += to_byte; } fn bump_code_size(&mut self) { self.code_size += 1; } fn max_code(&self) -> Code { (1 << self.code_size) - 1 } fn code_size(&self) -> u8 { self.code_size } } impl Tree { fn init(&mut self, min_size: u8) { // We need a way to represent the state of a currently empty buffer. We use the clear code // for this, thus create one complex mapping that leads to the one-char base codes. self.keys .resize((1 << min_size) + 2, FullKey::NoSuccessor.into()); self.complex.push(Full { char_continuation: [0; 256], }); let map_of_begin = self.complex.last_mut().unwrap(); for ch in 0u16..256 { map_of_begin.char_continuation[usize::from(ch)] = ch; } self.keys[1 << min_size] = FullKey::Full(0).into(); } fn reset(&mut self, min_size: u8) { self.simples.clear(); self.keys.truncate((1 << min_size) + 2); // Keep entry for clear code. self.complex.truncate(1); // The first complex is not changed.. for k in self.keys[..(1 << min_size) + 2].iter_mut() { *k = FullKey::NoSuccessor.into(); } self.keys[1 << min_size] = FullKey::Full(0).into(); } fn at_key(&self, code: Code, ch: u8) -> Option { let key = self.keys[usize::from(code)]; match FullKey::from(key) { FullKey::NoSuccessor => None, FullKey::Simple(idx) => { let nexts = &self.simples[usize::from(idx)]; let successors = nexts .codes .iter() .zip(nexts.chars.iter()) .take(usize::from(nexts.count)); for (&scode, &sch) in successors { if sch == ch { return Some(scode); } } None } FullKey::Full(idx) => { let full = &self.complex[usize::from(idx)]; let precode = full.char_continuation[usize::from(ch)]; if usize::from(precode) < MAX_ENTRIES { Some(precode) } else { None } } } } /// Iterate to the next char. /// Return Ok when it was already in the tree or creates a new entry for it and returns Err. 
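    /// For example, on a freshly initialized table `iterate(clear_code, b'a')` hits the
    /// pre-filled continuation map and returns `Ok` with the one-char code for `'a'`, while a
    /// subsequent `iterate` from that code with `b'b'` misses, appends a fresh entry for the
    /// word `"ab"` and returns `Err` carrying the newly created code.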
fn iterate(&mut self, code: Code, ch: u8) -> Result { if let Some(next) = self.at_key(code, ch) { Ok(next) } else { Err(self.append(code, ch)) } } fn append(&mut self, code: Code, ch: u8) -> Code { let next: Code = self.keys.len() as u16; let key = self.keys[usize::from(code)]; // TODO: with debug assertions, check for non-existence match FullKey::from(key) { FullKey::NoSuccessor => { let new_key = FullKey::Simple(self.simples.len() as u16); self.simples.push(Simple::default()); let simples = self.simples.last_mut().unwrap(); simples.codes[0] = next; simples.chars[0] = ch; simples.count = 1; self.keys[usize::from(code)] = new_key.into(); } FullKey::Simple(idx) if usize::from(self.simples[usize::from(idx)].count) < SHORT => { let nexts = &mut self.simples[usize::from(idx)]; let nidx = usize::from(nexts.count); nexts.chars[nidx] = ch; nexts.codes[nidx] = next; nexts.count += 1; } FullKey::Simple(idx) => { let new_key = FullKey::Full(self.complex.len() as u16); let simples = &self.simples[usize::from(idx)]; self.complex.push(Full { char_continuation: [Code::max_value(); 256], }); let full = self.complex.last_mut().unwrap(); for (&pch, &pcont) in simples.chars.iter().zip(simples.codes.iter()) { full.char_continuation[usize::from(pch)] = pcont; } self.keys[usize::from(code)] = new_key.into(); } FullKey::Full(idx) => { let full = &mut self.complex[usize::from(idx)]; full.char_continuation[usize::from(ch)] = next; } } self.keys.push(FullKey::NoSuccessor.into()); next } } impl Default for FullKey { fn default() -> Self { FullKey::NoSuccessor } } impl Default for Simple { fn default() -> Self { Simple { codes: [0; SHORT], chars: [0; SHORT], count: 0, } } } impl From for FullKey { fn from(CompressedKey(key): CompressedKey) -> Self { match (key >> MAX_CODESIZE) & 0xf { 0 => FullKey::Full(key & 0xfff), 1 => FullKey::Simple(key & 0xfff), _ => FullKey::NoSuccessor, } } } impl From for CompressedKey { fn from(full: FullKey) -> Self { CompressedKey(match full { FullKey::NoSuccessor => 0x2000, FullKey::Simple(code) => 0x1000 | code, FullKey::Full(code) => code, }) } } #[cfg(test)] mod tests { use super::{BitOrder, Encoder, LzwError, LzwStatus}; use crate::alloc::vec::Vec; use crate::decode::Decoder; #[cfg(feature = "std")] use crate::StreamBuf; #[test] fn invalid_input_rejected() { const BIT_LEN: u8 = 2; let ref input = [0, 1 << BIT_LEN /* invalid */, 0]; let ref mut target = [0u8; 128]; let mut encoder = Encoder::new(BitOrder::Msb, BIT_LEN); encoder.finish(); // We require simulation of normality, that is byte-for-byte compression. let result = encoder.encode_bytes(input, target); assert!(if let Err(LzwError::InvalidCode) = result.status { true } else { false }); assert_eq!(result.consumed_in, 1); let fixed = encoder.encode_bytes(&[1, 0], &mut target[result.consumed_out..]); assert!(if let Ok(LzwStatus::Done) = fixed.status { true } else { false }); assert_eq!(fixed.consumed_in, 2); // Okay, now test we actually fixed it. let ref mut compare = [0u8; 4]; let mut todo = &target[..result.consumed_out + fixed.consumed_out]; let mut free = &mut compare[..]; let mut decoder = Decoder::new(BitOrder::Msb, BIT_LEN); // Decode with up to 16 rounds, far too much but inconsequential. 
        for _ in 0..16 {
            if decoder.has_ended() {
                break;
            }

            let result = decoder.decode_bytes(todo, free);
            assert!(result.status.is_ok());
            todo = &todo[result.consumed_in..];
            free = &mut free[result.consumed_out..];
        }

        let remaining = { free }.len();
        let len = compare.len() - remaining;
        assert_eq!(todo, &[]);
        assert_eq!(compare[..len], [0, 1, 0]);
    }

    #[test]
    #[should_panic]
    fn invalid_code_size_low() {
        let _ = Encoder::new(BitOrder::Msb, 1);
    }

    #[test]
    #[should_panic]
    fn invalid_code_size_high() {
        let _ = Encoder::new(BitOrder::Msb, 14);
    }

    fn make_decoded() -> Vec<u8> {
        const FILE: &'static [u8] =
            include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/Cargo.lock"));
        return Vec::from(FILE);
    }

    #[test]
    #[cfg(feature = "std")]
    fn into_stream_buffer_no_alloc() {
        let encoded = make_decoded();
        let mut encoder = Encoder::new(BitOrder::Msb, 8);

        let mut output = vec![];
        let mut buffer = [0; 512];
        let mut istream = encoder.into_stream(&mut output);
        istream.set_buffer(&mut buffer[..]);
        istream.encode(&encoded[..]).status.unwrap();

        match istream.buffer {
            Some(StreamBuf::Borrowed(_)) => {}
            None => panic!("Decoded without buffer??"),
            Some(StreamBuf::Owned(_)) => panic!("Unexpected buffer allocation"),
        }
    }

    #[test]
    #[cfg(feature = "std")]
    fn into_stream_buffer_small_alloc() {
        struct WriteTap<W: std::io::Write>(W);
        const BUF_SIZE: usize = 512;

        impl<W: std::io::Write> std::io::Write for WriteTap<W> {
            fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
                assert!(buf.len() <= BUF_SIZE);
                self.0.write(buf)
            }
            fn flush(&mut self) -> std::io::Result<()> {
                self.0.flush()
            }
        }

        let encoded = make_decoded();
        let mut encoder = Encoder::new(BitOrder::Msb, 8);

        let mut output = vec![];
        let mut istream = encoder.into_stream(WriteTap(&mut output));
        istream.set_buffer_size(512);
        istream.encode(&encoded[..]).status.unwrap();

        match istream.buffer {
            Some(StreamBuf::Owned(vec)) => assert!(vec.len() <= BUF_SIZE),
            Some(StreamBuf::Borrowed(_)) => panic!("Unexpected borrowed buffer, where from?"),
            None => panic!("Decoded without buffer??"),
        }
    }

    #[test]
    #[cfg(feature = "std")]
    fn reset() {
        let encoded = make_decoded();
        let mut encoder = Encoder::new(BitOrder::Msb, 8);
        let mut reference = None;

        for _ in 0..2 {
            let mut output = vec![];
            let mut buffer = [0; 512];
            let mut istream = encoder.into_stream(&mut output);
            istream.set_buffer(&mut buffer[..]);
            istream.encode_all(&encoded[..]).status.unwrap();

            encoder.reset();

            if let Some(reference) = &reference {
                assert_eq!(output, *reference);
            } else {
                reference = Some(output);
            }
        }
    }
}
weezl-0.1.5/src/encode_into_async.rs010064400017500001750000000117361403363510300157120ustar0000000000000000use crate::encode::IntoAsync;
use crate::error::LzwStatus;
use crate::error::StreamResult;
use crate::StreamBuf;

use std::io;

impl<'d, W: futures::io::AsyncWrite + core::marker::Unpin> IntoAsync<'d, W> {
    /// Encode data from a reader.
    ///
    /// This will drain the supplied reader. It will not encode an end marker after all data has
    /// been processed.
    pub async fn encode(&mut self, read: impl futures::io::AsyncBufRead) -> StreamResult {
        self.encode_part(read, false).await
    }

    /// Encode data from a reader and an end marker.
    pub async fn encode_all(mut self, read: impl futures::io::AsyncBufRead) -> StreamResult {
        self.encode_part(read, true).await
    }

    /// Set the size of the intermediate encode buffer.
    ///
    /// A buffer of this size is allocated to hold one part of the encoded stream when no buffer
    /// is available and any encoding method is called. No buffer is allocated if `set_buffer`
    /// has been called. The buffer is reused.
    ///
    /// # Panics
    /// This method panics if `size` is `0`.
    pub fn set_buffer_size(&mut self, size: usize) {
        assert_ne!(size, 0, "Attempted to set empty buffer");
        self.default_size = size;
    }

    /// Use a particular buffer as an intermediate encode buffer.
    ///
    /// Calling this sets or replaces the buffer. When a buffer has been set then it is used
    /// instead of dynamically allocating a buffer. Note that the size of the buffer is critical
    /// for efficient encoding. Some optimization techniques require the buffer to hold one or
    /// more previously processed words. There is also additional overhead from `write` calls
    /// each time the buffer has been filled.
    ///
    /// # Panics
    /// This method panics if the `buffer` is empty.
    pub fn set_buffer(&mut self, buffer: &'d mut [u8]) {
        assert_ne!(buffer.len(), 0, "Attempted to set empty buffer");
        self.buffer = Some(StreamBuf::Borrowed(buffer));
    }

    async fn encode_part(
        &mut self,
        read: impl futures::io::AsyncBufRead,
        finish: bool,
    ) -> StreamResult {
        use futures::io::AsyncBufReadExt;
        use futures::io::AsyncWriteExt;

        let IntoAsync {
            encoder,
            writer,
            buffer,
            default_size,
        } = self;

        futures::pin_mut!(read);
        let mut read: core::pin::Pin<_> = read;

        let mut bytes_read = 0;
        let mut bytes_written = 0;

        // Converting to mutable refs to move into the `once` closure.
        let read_bytes = &mut bytes_read;
        let write_bytes = &mut bytes_written;

        let outbuf: &mut [u8] =
            match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } {
                StreamBuf::Borrowed(slice) => &mut *slice,
                StreamBuf::Owned(vec) => &mut *vec,
            };
        assert!(!outbuf.is_empty());

        let status = loop {
            // Try to grab one buffer of input data.
            let mut filler = read.as_mut();
            let data = match filler.fill_buf().await {
                Ok(buf) => buf,
                Err(err) => break Err(err),
            };

            if data.is_empty() {
                if finish {
                    encoder.finish();
                } else {
                    break Ok(());
                }
            }

            // Encode as much of the input as fits.
            let result = encoder.encode_bytes(data, &mut outbuf[..]);

            // Do the bookkeeping and consume the buffer.
            *read_bytes += result.consumed_in;
            *write_bytes += result.consumed_out;
            read.as_mut().consume(result.consumed_in);

            // Handle an error status in the result.
            let done = match result.status {
                Ok(ok) => ok,
                Err(err) => {
                    break Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        &*format!("{:?}", err),
                    ));
                }
            };

            if let LzwStatus::Done = done {
                break writer.write_all(&outbuf[..result.consumed_out]).await;
            }

            if let LzwStatus::NoProgress = done {
                break Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "No more data but no end marker detected",
                ));
            }

            // And finish by writing our result.
            // TODO: we may lose data on error (also on status error above) which we might want to
            // deterministically handle so that we don't need to restart everything from scratch
            // as the only recovery strategy. Any changes welcome.
            match writer.write_all(&outbuf[..result.consumed_out]).await {
                Ok(_) => {}
                Err(err) => break Err(err),
            }
        };

        StreamResult {
            bytes_read,
            bytes_written,
            status,
        }
    }
}
weezl-0.1.5/src/error.rs010064400017500001750000000053051403367366000133650ustar0000000000000000/// The result of a coding operation on a pair of buffers.
#[must_use = "Contains a status with potential error information"]
pub struct BufferResult {
    /// The number of bytes consumed from the input buffer.
    pub consumed_in: usize,
    /// The number of bytes written into the output buffer.
    pub consumed_out: usize,
    /// The status after returning from the write call.
    pub status: Result<LzwStatus, LzwError>,
}

/// The result of a coding operation into a vector.
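///
/// Unlike [`BufferResult`], the `consumed_out` bytes have been appended to the target vector,
/// which grows as needed instead of imposing a fixed output limit.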
#[must_use = "Contains a status with potential error information"] pub struct VectorResult { /// The number of bytes consumed from the input buffer. pub consumed_in: usize, /// The number of bytes written into the output buffer. pub consumed_out: usize, /// The status after returning from the write call. pub status: Result, } /// The result of coding into an output stream. #[cfg(feature = "std")] #[must_use = "Contains a status with potential error information"] pub struct StreamResult { /// The total number of bytes consumed from the reader. pub bytes_read: usize, /// The total number of bytes written into the writer. pub bytes_written: usize, /// The possible error that occurred. /// /// Note that when writing into streams it is not in general possible to recover from an error. pub status: std::io::Result<()>, } /// The status after successful coding of an LZW stream. #[derive(Debug, Clone, Copy)] pub enum LzwStatus { /// Everything went well. Ok, /// No bytes were read or written and no internal state advanced. /// /// If this is returned but your application can not provide more input data then decoding is /// definitely stuck for good and it should stop trying and report some error of its own. In /// other situations this may be used as a signal to refill an internal buffer. NoProgress, /// No more data will be produced because an end marker was reached. Done, } /// The error kind after unsuccessful coding of an LZW stream. #[derive(Debug, Clone, Copy)] pub enum LzwError { /// The input contained an invalid code. /// /// For decompression this refers to a code larger than those currently known through the prior /// decoding stages. For compression this refers to a byte that has no code representation due /// to being larger than permitted by the `size` parameter given to the Encoder. InvalidCode, } impl core::fmt::Display for LzwError { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { match self { LzwError::InvalidCode => f.write_str("invalid code in LZW stream"), } } } #[cfg(feature = "std")] impl std::error::Error for LzwError {} weezl-0.1.5/src/lib.rs010064400017500001750000000113401403437406000127670ustar0000000000000000//! # LZW decoder and encoder //! //! This crates provides an `Encoder` and a `Decoder` in their respective modules. The code words //! are written from and to bit byte slices (or streams) where it is possible to write either the //! most or least significant bits first. The maximum possible code size is 12 bits, the smallest //! available code size is 2 bits. //! //! ## Example //! //! These two code blocks show the compression and corresponding decompression. Note that you must //! use the same arguments to `Encoder` and `Decoder`, otherwise the decoding might fail or produce //! bad results. //! #![cfg_attr(feature = "std", doc = "```")] #![cfg_attr(not(feature = "std"), doc = "```ignore")] //! use weezl::{BitOrder, encode::Encoder}; //! //! let data = b"Hello, world"; //! let compressed = Encoder::new(BitOrder::Msb, 9) //! .encode(data) //! .unwrap(); //! ``` //! #![cfg_attr(feature = "std", doc = "```")] #![cfg_attr(not(feature = "std"), doc = "```ignore")] //! use weezl::{BitOrder, decode::Decoder}; //! # let compressed = b"\x80\x04\x81\x94l\x1b\x06\xf0\xb0 \x1d\xc6\xf1\xc8l\x19 \x10".to_vec(); //! # let data = b"Hello, world"; //! //! let decompressed = Decoder::new(BitOrder::Msb, 9) //! .decode(&compressed) //! .unwrap(); //! assert_eq!(decompressed, data); //! ``` //! //! ## LZW Details //! //! 
//! The de- and encoder expect the LZW stream to start with a clear code and end with an
//! end code which are defined as follows:
//!
//! * `CLEAR_CODE == 1 << min_code_size`
//! * `END_CODE == CLEAR_CODE + 1`
//!
//! For optimal performance, all buffers and input and output slices should be as large as
//! possible and at least 2048 bytes long. This extends to input streams which should have
//! similarly sized buffers. This library uses Rust's standard allocation interfaces (`Box` and
//! `Vec` to be precise). Since there are no ways to handle allocation errors it is not
//! recommended to operate it on 16-bit targets.
//!
//! ## Allocations and standard library
//!
//! The main algorithm can be used in `no_std` as well, although it requires an allocator. This
//! restriction might be lifted at a later stage. For this you should deactivate the `std`
//! feature. The main interfaces stay intact but the `into_stream` combinator is not available.
#![cfg_attr(not(feature = "std"), no_std)]
#![forbid(unsafe_code)]
#![forbid(missing_docs)]

#[cfg(all(feature = "alloc", not(feature = "std")))]
extern crate alloc;
#[cfg(all(feature = "alloc", feature = "std"))]
use std as alloc;

pub(crate) const MAX_CODESIZE: u8 = 12;
pub(crate) const MAX_ENTRIES: usize = 1 << MAX_CODESIZE as usize;

/// Alias for an LZW code point
pub(crate) type Code = u16;

/// The default size for the encoding/decoding buffer.
///
/// Note that this is larger than the default size for buffers (usually 4K) since each code word
/// can expand to multiple bytes. Expanding one buffer would yield multiple and require a costly
/// break in the decoding loop. Note that the decoded size can be up to quadratic in code block.
pub(crate) const STREAM_BUF_SIZE: usize = 1 << 24;

/// The order of bits in bytes.
#[derive(Clone, Copy, Debug)]
pub enum BitOrder {
    /// The most significant bit is processed first.
    Msb,
    /// The least significant bit is processed first.
    Lsb,
}

/// An owned or borrowed buffer for stream operations.
#[cfg(feature = "alloc")]
pub(crate) enum StreamBuf<'d> {
    Borrowed(&'d mut [u8]),
    Owned(crate::alloc::vec::Vec<u8>),
}

#[cold]
fn assert_decode_size(size: u8) {
    assert!(
        size <= MAX_CODESIZE,
        "Maximum code size 12 required, got {}",
        size
    );
}

#[cold]
fn assert_encode_size(size: u8) {
    assert!(size >= 2, "Minimum code size 2 required, got {}", size);
    assert!(
        size <= MAX_CODESIZE,
        "Maximum code size 12 required, got {}",
        size
    );
}

#[cfg(feature = "alloc")]
pub mod decode;
#[cfg(feature = "alloc")]
pub mod encode;
mod error;

#[cfg(feature = "std")]
pub use self::error::StreamResult;
pub use self::error::{BufferResult, LzwError, LzwStatus};

#[cfg(all(test, feature = "alloc"))]
mod tests {
    use crate::decode::Decoder;
    use crate::encode::Encoder;
    #[cfg(feature = "std")]
    use crate::{decode, encode};

    #[test]
    fn stable_send() {
        fn must_be_send<T: Send>() {}
        must_be_send::<Decoder>();
        must_be_send::<Encoder>();

        #[cfg(feature = "std")]
        fn _send_and_lt<'lt, T: Send + 'lt>() {}

        // Check that the inference `W: Send + 'd` => `IntoStream: Send + 'd` works.
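        // (These helper functions are never called at runtime; merely getting them past the
        // type checker is the whole assertion.)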
#[cfg(feature = "std")] fn _all_send_writer<'d, W: std::io::Write + Send + 'd>() { _send_and_lt::<'d, decode::IntoStream<'d, W>>(); _send_and_lt::<'d, encode::IntoStream<'d, W>>(); } } } weezl-0.1.5/tests/async.rs010064400017500001750000000030631403363510300137110ustar0000000000000000use std::{env, fs}; use tokio::io::BufReader; use tokio::net::{TcpListener, TcpStream}; use tokio_util::compat::TokioAsyncReadCompatExt as _; use weezl::{decode, encode, BitOrder}; async fn pair() -> (TcpStream, TcpStream) { let listener = TcpListener::bind("localhost:0") .await .expect("No loop tcp for testing"); let addr = listener.local_addr().expect("No address for listener"); let connect = TcpStream::connect(addr); let accept = listener.accept(); let (a, (b, _)) = tokio::try_join!(connect, accept).expect("Can connect"); (a, b) } async fn assert_send_through(data: &[u8], send: &mut TcpStream, recv: &mut TcpStream) { let mut send = send.compat(); let mut recv = BufReader::new(recv).compat(); let mut encoder = encode::Encoder::new(BitOrder::Lsb, 8); let encode = encoder.into_async(&mut send).encode_all(data); let mut recv_buffer = vec![]; let mut decoder = decode::Decoder::new(BitOrder::Lsb, 8); let decode = decoder.into_async(&mut recv_buffer).decode_all(&mut recv); let (encode, decode) = tokio::join!(encode, decode); encode.status.expect("Could send/encoded data"); decode.status.expect("Could recv/decode data"); assert_eq!(recv_buffer, data); } #[test] fn with_streams() { let file = env::args().next().unwrap(); let data = fs::read(file).unwrap(); let rt = tokio::runtime::Runtime::new().expect("runtime"); let _enter = rt.enter(); let (mut send, mut recv) = rt.block_on(pair()); rt.block_on(assert_send_through(&data, &mut send, &mut recv)); } weezl-0.1.5/tests/roundtrip.rs010064400017500001750000000034571403363510300146310ustar0000000000000000use std::{env, fs}; use weezl::{decode, encode, BitOrder}; #[derive(Clone, Copy, Debug)] enum Flavor { Gif, Tiff, } #[test] fn roundtrip_all() { let file = env::args().next().unwrap(); let data = fs::read(file).unwrap(); for &flavor in &[Flavor::Gif, Flavor::Tiff] { for &bit_order in &[BitOrder::Lsb, BitOrder::Msb] { for bit_width in 2..8 { let data: Vec<_> = data .iter() .copied() .map(|b| b & ((1 << bit_width) - 1)) .collect(); println!("Roundtrip test {:?} {:?} {}", flavor, bit_order, bit_width); assert_roundtrips(&*data, flavor, bit_width, bit_order); } } } } fn assert_roundtrips(data: &[u8], flavor: Flavor, bit_width: u8, bit_order: BitOrder) { let (c, d): ( fn(BitOrder, u8) -> encode::Encoder, fn(BitOrder, u8) -> decode::Decoder, ) = match flavor { Flavor::Gif => (encode::Encoder::new, decode::Decoder::new), Flavor::Tiff => ( encode::Encoder::with_tiff_size_switch, decode::Decoder::with_tiff_size_switch, ), }; let mut encoder = c(bit_order, bit_width); let mut buffer = Vec::with_capacity(2 * data.len() + 40); let _ = encoder.into_stream(&mut buffer).encode_all(data); let mut decoder = d(bit_order, bit_width); let mut compare = vec![]; let result = decoder .into_stream(&mut compare) .decode_all(buffer.as_slice()); assert!( result.status.is_ok(), "{:?}, {}, {:?}", bit_order, bit_width, result.status ); assert!( data == &*compare, "{:?}, {}\n{:?}\n{:?}", bit_order, bit_width, data, compare ); } weezl-0.1.5/tests/roundtrip_vec.rs010064400017500001750000000034301403437655100154670ustar0000000000000000use std::{env, fs}; use weezl::{decode, encode, BitOrder}; #[derive(Clone, Copy, Debug)] enum Flavor { Gif, Tiff, } #[test] fn roundtrip_all() { let file = 
env::args().next().unwrap(); let data = fs::read(file).unwrap(); for &flavor in &[Flavor::Gif, Flavor::Tiff] { for &bit_order in &[BitOrder::Lsb, BitOrder::Msb] { for bit_width in 2..8 { let data: Vec<_> = data .iter() .copied() .map(|b| b & ((1 << bit_width) - 1)) .collect(); println!("Roundtrip test {:?} {:?} {}", flavor, bit_order, bit_width); assert_roundtrips(&*data, flavor, bit_width, bit_order); } } } } fn assert_roundtrips(data: &[u8], flavor: Flavor, bit_width: u8, bit_order: BitOrder) { let (c, d): ( fn(BitOrder, u8) -> encode::Encoder, fn(BitOrder, u8) -> decode::Decoder, ) = match flavor { Flavor::Gif => (encode::Encoder::new, decode::Decoder::new), Flavor::Tiff => ( encode::Encoder::with_tiff_size_switch, decode::Decoder::with_tiff_size_switch, ), }; let mut encoder = c(bit_order, bit_width); let mut buffer = Vec::with_capacity(2 * data.len() + 40); let _ = encoder.into_vec(&mut buffer).encode_all(data); let mut decoder = d(bit_order, bit_width); let mut compare = vec![]; let result = decoder.into_vec(&mut compare).decode_all(buffer.as_slice()); assert!( result.status.is_ok(), "{:?}, {}, {:?}", bit_order, bit_width, result.status ); assert!( data == &*compare, "{:?}, {}\n{:?}\n{:?}", bit_order, bit_width, data, compare ); } weezl-0.1.5/.cargo_vcs_info.json0000644000000001120000000000000122160ustar00{ "git": { "sha1": "9d494924c46898885fc3a5d35c644c366e582500" } }