lzxd-0.2.5/.cargo_vcs_info.json0000644000000001360000000000100120210ustar { "git": { "sha1": "4748e43594e3e30cff2ace3a6ad7a376c9816fdd" }, "path_in_vcs": "" }lzxd-0.2.5/.github/workflows/ci.yaml000064400000000000000000000022150072674642500155150ustar 00000000000000name: Build on: [push, pull_request] env: CARGO_TERM_COLOR: always jobs: build: runs-on: ${{ matrix.os }} env: RUST_BACKTRACE: 1 strategy: matrix: build: [ubuntu64, win64] include: - build: ubuntu64 os: ubuntu-latest host_target: x86_64-unknown-linux-gnu - build: win64 os: windows-latest host_target: x86_64-pc-windows-msvc steps: - uses: actions/checkout@v4 with: lfs: 'true' - name: Install rust toolchain uses: dtolnay/rust-toolchain@stable with: targets: ${{ matrix.host_target }} - name: build run: cargo build --target=${{ matrix.host_target }} --release - name: Cargo test run: cargo test --release fmt: name: check formatting runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Install rust toolchain uses: dtolnay/rust-toolchain@stable with: components: rustfmt, clippy - name: Cargo fmt run: cargo fmt --all -- --check - name: Cargo clippy run: cargo clippy -- -D warnings lzxd-0.2.5/.gitignore000064400000000000000000000000230072674642500126240ustar 00000000000000/target Cargo.lock lzxd-0.2.5/Cargo.toml0000644000000017530000000000100100250ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. 
[package] edition = "2021" name = "lzxd" version = "0.2.5" authors = ["Lonami Exo "] exclude = [ "/tests", "/testdata", ] description = """ Decompression implementation for Microsoft's LZXD compression format. """ homepage = "https://github.com/Lonami/lzxd" documentation = "https://docs.rs/lzxd" readme = "README.md" keywords = [ "lzx", "lzxd", "xnb", "decompress", ] categories = ["compression"] license = "MIT OR Apache-2.0" repository = "https://github.com/Lonami/lzxd" [dependencies] lzxd-0.2.5/Cargo.toml.orig000064400000000000000000000007450072674642500135360ustar 00000000000000[package] name = "lzxd" version = "0.2.5" authors = ["Lonami Exo "] license = "MIT OR Apache-2.0" description = """ Decompression implementation for Microsoft's LZXD compression format. """ homepage = "https://github.com/Lonami/lzxd" documentation = "https://docs.rs/lzxd" repository = "https://github.com/Lonami/lzxd" keywords = ["lzx", "lzxd", "xnb", "decompress"] categories = ["compression"] edition = "2021" exclude = ["/tests", "/testdata"] [dependencies] lzxd-0.2.5/LICENSE-APACHE000064400000000000000000000261350072674642500125740ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. lzxd-0.2.5/LICENSE-MIT000064400000000000000000000020670072674642500123020ustar 00000000000000MIT License Copyright (c) 2020 Lonami Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. lzxd-0.2.5/README.md000064400000000000000000000024350072674642500121240ustar 00000000000000# lzxd A Rust implementation of [Microsoft's lzxd encoding][1], based in the description and code of the document itself. This crate currently only implements decompression. 
```rust use lzxd::{Lzxd, WindowSize}; let mut lzxd = Lzxd::new(WindowSize::KB64); while let Some(chunk) = get_compressed_chunk() { let decompressed = lzxd.decompress_next(&chunk); write_data(decompressed.unwrap()); } ``` The project's motivation was to be able to read XNB files produced by XNA Game Studio, some of which are compressed under LZXD compression. Huge thanks to [LeonBlade for their xnbcli][2] project which helped greatly to debug this implementation, and special mention to [dorkbox's CabParser][3] for further helping validate that this implementation is able to decompress real-world data correctly. ## License This library is licensed under either of * Apache License, Version 2.0 ([LICENSE-APACHE] or http://www.apache.org/licenses/LICENSE-2.0) * MIT license ([LICENSE-MIT] or http://opensource.org/licenses/MIT) at your option. [1]: https://docs.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-patch/cc78752a-b4af-4eee-88cb-01f4d8a4c2bf [2]: https://github.com/LeonBlade/xnbcli [3]: https://github.com/dorkbox/CabParser/ [LICENSE-APACHE]: LICENSE-APACHE [LICENSE-MIT]: LICENSE-MIT lzxd-0.2.5/src/bitstream.rs000064400000000000000000000251270072674642500137770ustar 00000000000000/// > An LZXD bitstream is encoded as a sequence of aligned 16-bit integers stored in the /// > least-significant- byte to most-significant-byte order, also known as byte-swapped, /// > or little-endian, words. Given an input stream of bits named a, b, c,..., x, y, z, /// > A, B, C, D, E, F, the output byte stream MUST be as [ 0| 1| 2| 3|...|30|31]. /// /// It is worth mentioning that older revisions of the document explain this better: /// /// > Given an input stream of bits named a, b, c, ..., x, y, z, A, B, C, D, E, F, the output /// > byte stream (with byte boundaries highlighted) would be as follows: /// > [i|j|k|l|m|n|o#p|a|b|c|d|e|f|g|h#y|z|A|B|C|D|E|F#q|r|s|t|u|v|w|x] use crate::DecodeFailed; pub struct Bitstream<'a> { buffer: &'a [u8], // Next number in the bitstream. 
n: u16, // How many bits left in the current `n`. remaining: u8, } impl<'a> Bitstream<'a> { pub fn new(buffer: &'a [u8]) -> Self { Self { buffer, n: 0, remaining: 0, } } // Advance the buffer to the next 16-bit integer. fn advance_buffer(&mut self) -> Result<(), DecodeFailed> { if self.buffer.is_empty() { return Err(DecodeFailed::UnexpectedEof); } self.remaining = 16; self.n = u16::from_le_bytes([self.buffer[0], self.buffer[1]]); self.buffer = &self.buffer[2..]; Ok(()) } pub fn read_bit(&mut self) -> Result { if self.remaining == 0 { self.advance_buffer()?; } self.remaining -= 1; self.n = self.n.rotate_left(1); Ok(self.n & 1) } pub fn read_byte(&mut self) -> Option { if self.buffer.is_empty() { return None; } let byte = self.buffer[0]; self.buffer = &self.buffer[1..]; Some(byte) } /// Read from the bistream, no more than 16 bits (one word). fn read_bits_oneword(&mut self, bits: u8) -> Result { assert!(bits <= 16); debug_assert!(self.remaining <= 16); Ok(if bits <= self.remaining { self.remaining -= bits; self.n = self.n.rotate_left(bits as u32); self.n & ((1 << bits) - 1) } else { // No need to store `rol` result in `n` as we're about to overwrite it. let hi = self.n.rotate_left(self.remaining as u32) & ((1 << self.remaining) - 1); let bits = bits - self.remaining; self.advance_buffer()?; self.remaining -= bits; self.n = self.n.rotate_left(bits as u32); // `bits` may be 16 which would overflow the left shift, operate on `u32` and trunc. let lo = self.n & ((1u32 << bits) as u16).wrapping_sub(1); ((hi as u32) << bits) as u16 | lo }) } pub fn read_bits(&mut self, bits: u8) -> Result { if bits <= 16 { self.read_bits_oneword(bits).map(|w| w as u32) } else { assert!(bits <= 32); // Read the two words. let w0 = self.read_bits_oneword(16)? as u32; let w1 = self.read_bits_oneword(bits - 16)? as u32; Ok((w0 << (bits - 16)) | w1) } } /// Peek from the bitstream, no more than 16 bits (one word). 
fn peek_bits_oneword(&self, bits: u8) -> u16 { // Copy paste of `read_bits`, but without advancing the buffer. assert!(bits <= 16); if bits <= self.remaining { self.n.rotate_left(bits as u32) & ((1 << bits) - 1) } else { let hi = self.n.rotate_left(self.remaining as u32) & ((1 << self.remaining) - 1); let bits = bits - self.remaining; // We may peek more than we need (i.e. at the end of a chunk), due to the way // our decoder is implemented. This is a bit ugly but luckily we can pretend // there are just zeros after. let n = if self.buffer.is_empty() { 0 } else { u16::from_le_bytes([self.buffer[0], self.buffer[1]]) }; let lo = n.rotate_left(bits as u32) & ((1u32 << bits) as u16).wrapping_sub(1); ((hi as u32) << bits) as u16 | lo } } pub fn peek_bits(&self, bits: u8) -> u32 { if bits <= 16 { self.peek_bits_oneword(bits) as u32 } else { assert!(bits <= 32); // Read the two words. let mut advanced_stream = Self { buffer: self.buffer, n: self.n, remaining: self.remaining, }; let w0 = advanced_stream.read_bits_oneword(16).unwrap() as u32; let w1 = advanced_stream.peek_bits_oneword(bits - 16) as u32; (w0 << (bits - 16)) | w1 } } pub fn read_u32_le(&mut self) -> Result { let lo = self.read_bits_oneword(16)?.to_le_bytes(); let hi = self.read_bits_oneword(16)?.to_le_bytes(); Ok(u32::from_le_bytes([lo[0], lo[1], hi[0], hi[1]])) } pub fn read_u24_be(&mut self) -> Result { let hi = self.read_bits(16)?; let lo = self.read_bits(8)?; Ok(hi << 8 | lo) } pub fn align(&mut self) -> Result<(), DecodeFailed> { if self.remaining == 0 { self.read_bits(16)?; } else { self.remaining = 0; } Ok(()) } /// Copies from the current buffer to the destination output ignoring the representation. 
pub fn read_raw(&mut self, output: &mut [u8]) -> Result<(), DecodeFailed> {
    // The whole of `output` must be filled; a short buffer is an error, not a partial copy.
    let wanted = output.len();
    if wanted > self.buffer.len() {
        return Err(DecodeFailed::UnexpectedEof);
    }

    let (head, tail) = self.buffer.split_at(wanted);
    output.copy_from_slice(head);
    self.buffer = tail;
    Ok(())
}

/// Number of bytes of the underlying buffer that have not been consumed yet.
pub fn remaining_bytes(&self) -> usize {
    self.buffer.len()
}
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn read_sequential() {
        // 0..=10 and padding using the least amount of bits possible, read LTR
        let ns = [0b0_1_10_11_100_101_110_1u16, 0b11_1000_1001_1010_00u16];
        let bit_lengths = [1u8, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4];

        // Convert input sequence of 16-bit integers to byte-stream
        let bytes: Vec<u8> = ns.iter().flat_map(|n| n.to_le_bytes()).collect();

        let mut bitstream = Bitstream::new(&bytes);
        for (value, bit_length) in bit_lengths.iter().copied().enumerate() {
            assert_eq!(bitstream.read_bits(bit_length), Ok(value as u32));
        }
    }

    #[test]
    fn read_32le() {
        let bytes = [0x56, 0x78, 0x12, 0x34];
        let mut bitstream = Bitstream::new(&bytes);

        assert_eq!(bitstream.read_u32_le(), Ok(873625686));
    }

    #[test]
    fn read_24be() {
        let ns = [0b0000_1100_0001_1000_u16, 0b0001_1000_0011_0000_u16];
        let bytes: Vec<u8> = ns.iter().flat_map(|n| n.to_le_bytes()).collect();

        let mut bitstream = Bitstream::new(&bytes);
        assert_eq!(bitstream.read_bits(4), Ok(0));
        assert_eq!(bitstream.read_u24_be(), Ok(0b1100_0001_1000_0001_1000_0011));
        assert_eq!(bitstream.read_bits(4), Ok(0));
    }

    #[test]
    fn align() {
        let bytes = [0b0100_0000, 0b0010_0000, 0b1000_0000, 0b0110_0000];
        let mut bitstream = Bitstream::new(&bytes);

        assert_eq!(bitstream.read_bits(3), Ok(1));
        bitstream.align().unwrap();
        assert_eq!(bitstream.read_bits(3), Ok(3));
    }

    #[test]
    fn no_remain_after_aligned() {
        let bytes = [0b0100_0000, 0b0010_0000, 0b1000_0000, 0b0110_0000];
        let mut bitstream = Bitstream::new(&bytes);

        bitstream.read_bits(3).unwrap();
        assert_ne!(bitstream.remaining, 0);
bitstream.align().unwrap(); assert_eq!(bitstream.remaining, 0); bitstream.read_bits(16).unwrap(); assert_eq!(bitstream.remaining, 0); } #[test] fn check_read_bit() { let bytes = [0b0110_1001, 0b1001_0110]; let mut bitstream_1 = Bitstream::new(&bytes); let mut bitstream_n = Bitstream::new(&bytes); (0..16).for_each(|_| { assert_eq!( bitstream_1.read_bit().map(|b| b as u32), bitstream_n.read_bits(1) ) }); } #[test] fn read_bit_positions_match_description() { // bits _abcdefgh_ijklmnop_qrstuvwx_yzABCDEF become: let bit_indices: [u32; 32] = [ 8, // i 9, // j 10, // k 11, // l 12, // m 13, // n 14, // o 15, // p 0, // a 1, // b 2, // c 3, // d 4, // e 5, // f 6, // g 7, // h 24, // y 25, // z 26, // A 27, // B 28, // C 29, // D 30, // E 31, // F 16, // q 17, // r 18, // s 19, // t 20, // u 21, // v 22, // w 23, // x ]; for (index, bit_index) in bit_indices.iter().copied().enumerate() { let n = 1u32.rotate_right(1).rotate_right(bit_index); let bytes = n.to_be_bytes(); eprintln!("index={index}, bit_index={bit_index}, bytes={n:032b}"); let mut bitstream = Bitstream::new(&bytes); if index != 0 { assert_eq!(bitstream.read_bits(index as u8), Ok(0)); } assert_eq!(bitstream.read_bit(), Ok(1)); if let Some(remaining) = 31usize.checked_sub(index) { assert_eq!(bitstream.read_bits(remaining as u8), Ok(0)); } } } #[test] fn read_equals_peek() { for index in 0..20 { let n = (0b11_0_111_0_11111_0_1111111_0_11111111111_0_1111111111111u64).rotate_left(index); let bytes = n.to_be_bytes(); for offset in 0..20 { for size in 0..20 { let mut bitstream = Bitstream::new(&bytes); bitstream.read_bits(offset).unwrap(); let peeked = bitstream.peek_bits(size); assert_eq!( bitstream.read_bits(size), Ok(peeked), "offset={offset}, size={size}, bytes={n:032b}", ); } } } } } lzxd-0.2.5/src/block.rs000064400000000000000000000346030072674642500130760ustar 00000000000000use crate::{Bitstream, DecodeFailed, DecoderState, Tree}; // if position_slot < 4 { // 0 // } else if position_slot >= 36 { // 17 // } else 
{ // (position_slot - 2) / 2 // } const FOOTER_BITS: [u8; 289] = [ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, ]; // if position_slot == 0 { // 0 // } else { // BASE_POSITION[position_slot - 1] + (1 << FOOTER_BITS[position_slot - 1]) // } const BASE_POSITION: [u32; 290] = [ 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768, 49152, 65536, 98304, 131072, 196608, 262144, 393216, 524288, 655360, 786432, 917504, 1048576, 1179648, 1310720, 1441792, 1572864, 1703936, 1835008, 1966080, 2097152, 2228224, 2359296, 2490368, 2621440, 2752512, 2883584, 3014656, 3145728, 3276800, 3407872, 3538944, 3670016, 3801088, 3932160, 4063232, 4194304, 4325376, 4456448, 4587520, 4718592, 4849664, 4980736, 5111808, 5242880, 5373952, 5505024, 5636096, 5767168, 5898240, 6029312, 6160384, 6291456, 6422528, 6553600, 6684672, 
6815744, 6946816, 7077888, 7208960, 7340032, 7471104, 7602176, 7733248, 7864320, 7995392, 8126464, 8257536, 8388608, 8519680, 8650752, 8781824, 8912896, 9043968, 9175040, 9306112, 9437184, 9568256, 9699328, 9830400, 9961472, 10092544, 10223616, 10354688, 10485760, 10616832, 10747904, 10878976, 11010048, 11141120, 11272192, 11403264, 11534336, 11665408, 11796480, 11927552, 12058624, 12189696, 12320768, 12451840, 12582912, 12713984, 12845056, 12976128, 13107200, 13238272, 13369344, 13500416, 13631488, 13762560, 13893632, 14024704, 14155776, 14286848, 14417920, 14548992, 14680064, 14811136, 14942208, 15073280, 15204352, 15335424, 15466496, 15597568, 15728640, 15859712, 15990784, 16121856, 16252928, 16384000, 16515072, 16646144, 16777216, 16908288, 17039360, 17170432, 17301504, 17432576, 17563648, 17694720, 17825792, 17956864, 18087936, 18219008, 18350080, 18481152, 18612224, 18743296, 18874368, 19005440, 19136512, 19267584, 19398656, 19529728, 19660800, 19791872, 19922944, 20054016, 20185088, 20316160, 20447232, 20578304, 20709376, 20840448, 20971520, 21102592, 21233664, 21364736, 21495808, 21626880, 21757952, 21889024, 22020096, 22151168, 22282240, 22413312, 22544384, 22675456, 22806528, 22937600, 23068672, 23199744, 23330816, 23461888, 23592960, 23724032, 23855104, 23986176, 24117248, 24248320, 24379392, 24510464, 24641536, 24772608, 24903680, 25034752, 25165824, 25296896, 25427968, 25559040, 25690112, 25821184, 25952256, 26083328, 26214400, 26345472, 26476544, 26607616, 26738688, 26869760, 27000832, 27131904, 27262976, 27394048, 27525120, 27656192, 27787264, 27918336, 28049408, 28180480, 28311552, 28442624, 28573696, 28704768, 28835840, 28966912, 29097984, 29229056, 29360128, 29491200, 29622272, 29753344, 29884416, 30015488, 30146560, 30277632, 30408704, 30539776, 30670848, 30801920, 30932992, 31064064, 31195136, 31326208, 31457280, 31588352, 31719424, 31850496, 31981568, 32112640, 32243712, 32374784, 32505856, 32636928, 32768000, 32899072, 33030144, 33161216, 
33292288, 33423360,
];

/// Borrowed view of the trees needed to decode one element of a compressed block.
struct DecodeInfo<'a> {
    aligned_offset_tree: Option<&'a Tree>,
    main_tree: &'a Tree,
    length_tree: Option<&'a Tree>,
}

/// One decoded unit of a block.
#[derive(Debug)]
pub enum Decoded {
    /// A literal byte.
    Single(u8),
    /// A match against earlier output.
    Match { offset: usize, length: usize },
    /// Produced for uncompressed blocks; carries the block's `remaining` count.
    Read(usize),
}

/// The three valid block types, together with the data read from the head of the block.
#[derive(Debug)]
pub enum Kind {
    Verbatim {
        main_tree: Tree,
        length_tree: Option<Tree>,
    },
    AlignedOffset {
        aligned_offset_tree: Tree,
        main_tree: Tree,
        length_tree: Option<Tree>,
    },
    Uncompressed {
        r: [u32; 3],
    },
}

/// Note that this is not the block header, but the head of the block's body, which includes
/// everything except the tail of the block data (either uncompressed data or token sequence).
pub struct Block {
    /// Only 24 bits may be used.
    pub remaining: u32,
    pub size: u32,
    pub kind: Kind,
}

/// Read the pretrees for the main and length tree, and with those also read the trees
/// themselves, using the path lengths from a previous tree if any.
///
/// This is used when reading a verbatim or aligned block.
fn read_main_and_length_trees(
    bitstream: &mut Bitstream,
    state: &mut DecoderState,
) -> Result<(), DecodeFailed> {
    // Verbatim block
    // Entry                                             Comments
    // Pretree for first 256 elements of main tree       20 elements, 4 bits each
    // Path lengths of first 256 elements of main tree   Encoded using pretree
    // Pretree for remainder of main tree                20 elements, 4 bits each
    // Path lengths of remaining elements of main tree   Encoded using pretree
    // Pretree for length tree                           20 elements, 4 bits each
    // Path lengths of elements in length tree           Encoded using pretree
    // Token sequence (matches and literals)             Specified in section 2.6

    state
        .main_tree
        .update_range_with_pretree(bitstream, 0..256)?;

    state
        .main_tree
        .update_range_with_pretree(bitstream, 256..256 + 8 * state.window_size.position_slots())?;

    state
        .length_tree
        .update_range_with_pretree(bitstream, 0..249)?;

    Ok(())
}

/// Decodes the next literal or match of a verbatim or aligned offset block.
///
/// `r` is the queue of the three most recent match offsets; it is updated in place
/// whenever a match is decoded.
///
/// # Errors
///
/// Fails if the bitstream runs out, if a tree cannot decode the next element, or with
/// `DecodeFailed::EmptyTree` if a length footer is required but there is no length tree.
fn decode_element(
    bitstream: &mut Bitstream,
    r: &mut [u32; 3],
    DecodeInfo {
        aligned_offset_tree,
        main_tree,
        length_tree,
    }: DecodeInfo,
) -> Result<Decoded, DecodeFailed> {
    // Decoding Matches and Literals (Aligned and Verbatim Blocks)
    let main_element = main_tree.decode_element(bitstream)?;

    // Check if it is a literal character.
    Ok(if main_element < 256 {
        // It is a literal, so copy the literal to output.
        Decoded::Single(main_element as u8)
    } else {
        // Decode the match. For a match, there are two components, offset and length.
        let length_header = (main_element - 256) & 7;

        let match_length = if length_header == 7 {
            // Length of the footer.
            length_tree
                .ok_or(DecodeFailed::EmptyTree)?
                .decode_element(bitstream)?
                + 7
                + 2
        } else {
            length_header + 2 // no length footer
                              // Decoding a match length (if a match length < 257).
        };
        assert_ne!(match_length, 0);

        let position_slot = (main_element - 256) >> 3;

        // Check for repeated offsets (positions 0, 1, 2).
        let match_offset;
        if position_slot == 0 {
            match_offset = r[0];
        } else if position_slot == 1 {
            match_offset = r[1];
            r.swap(0, 1);
        } else if position_slot == 2 {
            match_offset = r[2];
            r.swap(0, 2);
        } else {
            // Not a repeated offset.
            let slot = position_slot as usize;

            // `FOOTER_BITS` holds 289 entries while `BASE_POSITION` holds 290, so the
            // last slot `BASE_POSITION` describes would index past the end of
            // `FOOTER_BITS` and panic. Every slot from 36 upwards uses 17 footer bits,
            // so that value is used for any slot beyond the table.
            let offset_bits = FOOTER_BITS.get(slot).copied().unwrap_or(17);

            let formatted_offset = if let Some(aligned_offset_tree) = aligned_offset_tree {
                let verbatim_bits;
                let aligned_bits;

                // This means there are some aligned bits.
                if offset_bits >= 3 {
                    verbatim_bits = bitstream.read_bits(offset_bits - 3)? << 3;
                    aligned_bits = aligned_offset_tree.decode_element(bitstream)?;
                } else {
                    // 0, 1, or 2 verbatim bits
                    verbatim_bits = bitstream.read_bits(offset_bits)?;
                    aligned_bits = 0;
                }

                BASE_POSITION[slot] + verbatim_bits + aligned_bits as u32
            } else {
                // Block_type is a verbatim_block.
                let verbatim_bits = bitstream.read_bits(offset_bits)?;
                BASE_POSITION[slot] + verbatim_bits
            };

            // Decoding a match offset.
            match_offset = formatted_offset - 2;

            // Update repeated offset least recently used queue.
            r[2] = r[1];
            r[1] = r[0];
            r[0] = match_offset;
        }

        // Check for extra length.
        // > If the match length is 257 or larger, the encoded match length token
        // > (or match length, as specified in section 2.6) value is 257, and an
        // > encoded Extra Length field follows the other match encoding components,
        // > as specified in section 2.6.7, in the bitstream.

        // TODO for some reason, if we do this, parsing .xnb files with window size
        //      64KB, it breaks and stops decompressing correctly, but no idea why.
        /*
        let match_length = if match_length == 257 {
            // Decode the extra length.
            let extra_len = if bitstream.read_bit() != 0 {
                if bitstream.read_bit() != 0 {
                    if bitstream.read_bit() != 0 {
                        // > Prefix 0b111; Number of bits to decode 15;
                        bitstream.read_bits(15)
                    } else {
                        // > Prefix 0b110; Number of bits to decode 12;
                        bitstream.read_bits(12) + 1024 + 256
                    }
                } else {
                    // > Prefix 0b10; Number of bits to decode 10;
                    bitstream.read_bits(10) + 256
                }
            } else {
                // > Prefix 0b0; Number of bits to decode 8;
                bitstream.read_bits(8)
            };

            // Get the match length (if match length >= 257).
            // In all cases,
            // > Base value to add to decoded value 257 + …
            257 + extra_len
        } else {
            match_length as u16
        };
        */

        // Get match length and offset. Perform copy and paste work.
        Decoded::Match {
            offset: match_offset as usize,
            length: match_length as usize,
        }
    })
}

impl Block {
    /// Reads the head of the next block: its type, its size and the data that precedes
    /// the block contents (trees for compressed blocks, repeated offsets for
    /// uncompressed ones).
    ///
    /// # Errors
    ///
    /// Returns `DecodeFailed::InvalidBlockSize` for a zero size,
    /// `DecodeFailed::InvalidBlock` for an unknown block type, and propagates any
    /// failure from reading the bitstream or building the trees.
    pub(crate) fn read(
        bitstream: &mut Bitstream,
        state: &mut DecoderState,
    ) -> Result<Self, DecodeFailed> {
        // > Each block of compressed data begins with a 3-bit Block Type field.
        // > Of the eight possible values, only three are valid values for the Block Type
        // > field.
        let kind = bitstream.read_bits(3)? as u8;
        let size = bitstream.read_u24_be()?;
        if size == 0 {
            return Err(DecodeFailed::InvalidBlockSize(size));
        }

        let kind = match kind {
            0b001 => {
                read_main_and_length_trees(bitstream, state)?;

                Kind::Verbatim {
                    main_tree: state.main_tree.create_instance()?,
                    length_tree: state.length_tree.create_instance_allow_empty()?,
                }
            }
            0b010 => {
                // > encoding only the delta path lengths between the current and previous trees
                //
                // This means we don't need to worry about deltas on this tree.
                let aligned_offset_tree = {
                    let mut path_lengths = Vec::with_capacity(8);
                    for _ in 0..8 {
                        path_lengths.push(bitstream.read_bits(3)? as u8);
                    }

                    Tree::from_path_lengths(path_lengths)?
                };

                // > An aligned offset block is identical to the verbatim block except for the
                // > presence of the aligned offset tree preceding the other trees.
                read_main_and_length_trees(bitstream, state)?;

                Kind::AlignedOffset {
                    aligned_offset_tree,
                    main_tree: state.main_tree.create_instance()?,
                    length_tree: state.length_tree.create_instance_allow_empty()?,
                }
            }
            0b011 => {
                bitstream.align()?;
                Kind::Uncompressed {
                    r: [
                        bitstream.read_u32_le()?,
                        bitstream.read_u32_le()?,
                        bitstream.read_u32_le()?,
                    ],
                }
            }
            _ => return Err(DecodeFailed::InvalidBlock(kind)),
        };

        Ok(Block {
            remaining: size,
            size,
            kind,
        })
    }

    /// Decodes the next element of this block.
    ///
    /// Compressed blocks decode one literal or match using their trees. An uncompressed
    /// block overwrites `r` with the offsets stored in the block and reports how many
    /// bytes remain to be read.
    ///
    /// # Errors
    ///
    /// Propagates any failure from decoding a compressed element.
    pub(crate) fn decode_element(
        &self,
        bitstream: &mut Bitstream,
        r: &mut [u32; 3],
    ) -> Result<Decoded, DecodeFailed> {
        match &self.kind {
            Kind::Verbatim {
                main_tree,
                length_tree,
            } => decode_element(
                bitstream,
                r,
                DecodeInfo {
                    aligned_offset_tree: None,
                    main_tree,
                    length_tree: length_tree.as_ref(),
                },
            ),
            Kind::AlignedOffset {
                aligned_offset_tree,
                main_tree,
                length_tree,
            } => decode_element(
                bitstream,
                r,
                DecodeInfo {
                    aligned_offset_tree: Some(aligned_offset_tree),
                    main_tree,
                    length_tree: length_tree.as_ref(),
                },
            ),
            Kind::Uncompressed { r: new_r } => {
                r.copy_from_slice(new_r);
                Ok(Decoded::Read(self.remaining as usize))
            }
        }
    }
}
lzxd-0.2.5/src/lib.rs000064400000000000000000000414420072674642500125510ustar 00000000000000//!
This library implements the LZX compression format as described in //! [LZX DELTA Compression and Decompression], revision 9.0. //! //! Lempel-Ziv Extended (LZX) is an LZ77-based compression engine, as described in [UASDC], //! that is a universal lossless data compression algorithm. It performs no analysis on the //! data. //! //! Lempel-Ziv Extended Delta (LZXD) is a derivative of the Lempel-Ziv Extended (LZX) format with //! some modifications to facilitate efficient delta compression. //! //! In order to use this module, refer to the main [`Lzxd`] type and its methods. //! //! [LZX DELTA Compression and Decompression]: https://docs.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-patch/cc78752a-b4af-4eee-88cb-01f4d8a4c2bf //! [UASDC]: https://ieeexplore.ieee.org/document/1055714 //! [`Lzxd`]: struct.Lzxd.html use std::{fmt, mem}; pub(crate) use bitstream::Bitstream; pub(crate) use block::{Block, Decoded, Kind as BlockKind}; pub(crate) use tree::{CanonicalTree, Tree}; use window::Window; pub use window::WindowSize; mod bitstream; mod block; mod tree; mod window; /// A chunk represents exactly 32 KB of uncompressed data until the last chunk in the stream, /// which can represent less than 32 KB. pub const MAX_CHUNK_SIZE: usize = 32 * 1024; /// Decoder state needed for new blocks. // TODO not sure how much we want to keep in DecoderState and Lzxd respectively pub(crate) struct DecoderState { /// The window size we're working with. window_size: WindowSize, /// This tree cannot be used directly, it exists only to apply the delta of upcoming trees /// to its path lengths. main_tree: CanonicalTree, /// This tree cannot be used directly, it exists only to apply the delta of upcoming trees /// to its path lengths. length_tree: CanonicalTree, } struct PostProcessState { /// The pointer in the file at which to stop performing E8 translation. e8_translation_size: i32, /// A buffer that can be used to hold postprocessed chunks. 
data_chunk: Box<[u8]>, } /// The main interface to perform LZXD decompression. /// /// This structure stores the required state to process the compressed chunks of data in a /// sequential order. /// /// ```no_run /// # fn get_compressed_chunk() -> Option<(Vec, usize)> { unimplemented!() } /// # fn write_data(a: &[u8]) { unimplemented!() } /// use ::lzxd::{Lzxd, WindowSize}; /// /// let mut lzxd = Lzxd::new(WindowSize::KB64); /// /// while let Some((chunk, output_size)) = get_compressed_chunk() { /// let decompressed = lzxd.decompress_next(&chunk, output_size); /// write_data(decompressed.unwrap()); /// } /// ``` pub struct Lzxd { /// Sliding window into which data is decompressed. window: Window, /// Current decoder state. state: DecoderState, /// > The three most recent real match offsets are kept in a list. r: [u32; 3], /// The current offset into the decompressed data. chunk_offset: usize, /// Has the very first chunk been read yet? Unlike the rest, it has additional data. first_chunk_read: bool, /// Current block. current_block: Block, /// Information and data related to E8 postprocessing. This is populated after /// the first chunk is read. postprocess: Option, } /// Specific cause for decompression failure. #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum DecodeFailed { /// The chunk data caused a read of more items than the current block had in a single step. OverreadBlock, /// There was not enough data in the chunk to fully decode, and a premature end was found. UnexpectedEof, /// An invalid block type was found. InvalidBlock(u8), /// An invalid block size was found. InvalidBlockSize(u32), /// An invalid pretree element was found. InvalidPretreeElement(u16), /// Invalid pretree run-length encoding. InvalidPretreeRle, /// When attempting to construct a decode tree, we encountered an invalid path length tree. InvalidPathLengths, /// A required decode tree was empty (all path lengths were 0). EmptyTree, /// The given window size was too small. 
WindowTooSmall, /// Tried to read a chunk longer than [`MAX_CHUNK_SIZE`]. /// /// [`MAX_CHUNK_SIZE`]: constant.MAX_CHUNK_SIZE.html ChunkTooLong, } impl fmt::Display for DecodeFailed { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use DecodeFailed::*; match self { OverreadBlock => write!( f, "read more items than available in the block in a single step" ), UnexpectedEof => write!(f, "reached end of chunk without fully decoding it"), InvalidBlock(kind) => write!(f, "block type {} is invalid", kind), InvalidBlockSize(size) => write!(f, "block size {} is invalid", size), InvalidPretreeElement(elem) => write!(f, "found invalid pretree element {}", elem), InvalidPretreeRle => write!(f, "found invalid pretree rle element"), InvalidPathLengths => write!(f, "encountered invalid path lengths"), EmptyTree => write!(f, "encountered empty decode tree"), WindowTooSmall => write!(f, "decode window was too small"), ChunkTooLong => write!( f, "tried reading a chunk longer than {} bytes", MAX_CHUNK_SIZE ), } } } impl std::error::Error for DecodeFailed {} /// The error type used when decompression fails. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct DecompressError(DecodeFailed); impl fmt::Display for DecompressError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) } } impl std::error::Error for DecompressError {} impl From for DecompressError { fn from(value: DecodeFailed) -> Self { Self(value) } } impl Lzxd { /// Creates a new instance of the LZXD decoder state. The [`WindowSize`] must be obtained /// from elsewhere (e.g. it may be predetermined to a certain value), and if it's wrong, /// the decompressed values won't be those expected. /// /// [`WindowSize`]: enum.WindowSize.html pub fn new(window_size: WindowSize) -> Self { // > The main tree comprises 256 elements that correspond to all possible 8-bit // > characters, plus 8 * NUM_POSITION_SLOTS elements that correspond to matches. 
let main_tree = CanonicalTree::new(256 + 8 * window_size.position_slots()); // > The length tree comprises 249 elements. let length_tree = CanonicalTree::new(249); Self { window: window_size.create_buffer(), // > Because trees are output several times during compression of large amounts of // > data (multiple blocks), LZXD optimizes compression by encoding only the delta // > path lengths lengths between the current and previous trees. // // Because it uses deltas, we need to store the previous value across blocks. state: DecoderState { window_size, main_tree, length_tree, }, // > The initial state of R0, R1, R2 is (1, 1, 1). r: [1, 1, 1], first_chunk_read: false, chunk_offset: 0, postprocess: None, // Start with some dummy value. current_block: Block { remaining: 0, size: 0, kind: BlockKind::Uncompressed { r: [1, 1, 1] }, }, } } /// Try reading the header for the first chunk. fn try_read_first_chunk(&mut self, bitstream: &mut Bitstream) -> Result<(), DecodeFailed> { // > The first bit in the first chunk in the LZXD bitstream (following the 2-byte, // > chunk-size prefix described in section 2.2.1) indicates the presence or absence of // > two 16-bit fields immediately following the single bit. If the bit is set, E8 // > translation is enabled. if !self.first_chunk_read { self.first_chunk_read = true; let e8_translation = bitstream.read_bit()? != 0; self.postprocess = if e8_translation { Some(PostProcessState { data_chunk: vec![0; MAX_CHUNK_SIZE].into_boxed_slice(), e8_translation_size: bitstream.read_bits(32)? as i32, }) } else { None }; } Ok(()) } /// Attempts to perform post-decompression E8 fixups on an output data buffer. fn postprocess( translation_size: i32, chunk_offset: usize, idata: &mut [u8], ) -> Result<&[u8], DecodeFailed> { let mut processed = 0usize; // Find the next E8 match, or finish once there are no more E8 matches. while let Some(pos) = idata[processed..] 
.iter() .position(|&e| e == 0xE8) .map(|pos| processed + pos) { // N.B: E8 fixups are only performed for up to 10 bytes before the end of a chunk. if idata.len() - pos <= 10 { break; } // This is the current file output pointer. let current_pointer = chunk_offset + pos; // Match. Fix up the following bytes. let abs_val = i32::from_le_bytes([ idata[pos + 1], idata[pos + 2], idata[pos + 3], idata[pos + 4], ]); if (abs_val >= -(current_pointer as i32)) && abs_val < translation_size { let rel_val = if abs_val.is_positive() { abs_val.wrapping_sub(current_pointer as i32) } else { abs_val.wrapping_add(translation_size) }; idata[pos + 1..pos + 5].copy_from_slice(&rel_val.to_le_bytes()); } processed = pos + 5; } Ok(idata) } /// Decompresses the next compressed `chunk` from the LZXD data stream. pub fn decompress_next( &mut self, chunk: &[u8], output_len: usize, ) -> Result<&[u8], DecompressError> { // > A chunk represents exactly 32 KB of uncompressed data until the last chunk in the // > stream, which can represent less than 32 KB. // // > The LZXD engine encodes a compressed, chunk-size prefix field preceding each // > compressed chunk in the compressed byte stream. The compressed, chunk-size prefix // > field is a byte aligned, little-endian, 16-bit field. // // However, this doesn't seem to be part of LZXD itself? At least when testing with // `.xnb` files, every chunk comes with a compressed chunk size unless it has the flag // set to 0xff where it also includes the uncompressed chunk size. // // TODO maybe the docs could clarify whether this length is compressed or not let mut bitstream = Bitstream::new(chunk); self.try_read_first_chunk(&mut bitstream)?; let mut decoded_len = 0; while decoded_len != output_len { if self.current_block.remaining == 0 { // Re-align the bitstream to word // Related: https://github.com/GNOME/gcab/blob/master/libgcab/decomp.c#L883. 
// Related: https://github.com/kyz/libmspack/blob/master/libmspack/mspack/lzxd.c#L469 if matches!(self.current_block.kind, BlockKind::Uncompressed { .. }) && self.current_block.size % 2 != 0 { bitstream.read_byte(); } self.current_block = Block::read(&mut bitstream, &mut self.state)?; assert_ne!(self.current_block.remaining, 0); } let decoded = self .current_block .decode_element(&mut bitstream, &mut self.r)?; let advance = match decoded { Decoded::Single(value) => { self.window.push(value); 1 } Decoded::Match { offset, length } => { self.window.copy_from_self(offset, length); length } Decoded::Read(length) => { // Read up to end of chunk, to allow for larger blocks. let length = usize::min(bitstream.remaining_bytes(), length); // Will re-align if needed, just as decompressed reads mandate. self.window.copy_from_bitstream(&mut bitstream, length)?; length } }; assert_ne!(advance, 0); decoded_len += advance; if let Some(value) = self.current_block.remaining.checked_sub(advance as u32) { self.current_block.remaining = value; } else { return Err(DecodeFailed::OverreadBlock.into()); } } let chunk_offset = self.chunk_offset; self.chunk_offset += decoded_len; let view = self.window.past_view(decoded_len)?; if let Some(postprocess) = self.postprocess.as_mut() { // E8 fixups are disabled after 1GB of input data, // or if the chunk size is too small. if chunk_offset >= 0x4000_0000 || decoded_len <= 10 { Ok(view) } else { let postprocess_buf = &mut postprocess.data_chunk[..decoded_len]; postprocess_buf.copy_from_slice(view); // E8 fixups are enabled. Postprocess the output buffer. let view = Self::postprocess( postprocess.e8_translation_size, chunk_offset, postprocess_buf, )?; Ok(view) } } else { Ok(view) } } /// Resets the decoder state. /// /// This is equivalent to calling [`Self::new`] with the same [`WindowSize`]. 
/// [`WindowSize`]: enum.WindowSize.html pub fn reset(&mut self) { let this = Self::new(self.state.window_size); let _ = mem::replace(self, this); } } #[cfg(test)] mod tests { use super::*; #[test] fn check_uncompressed() { let data = [ 0x00, 0x30, 0x30, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, b'a', b'b', b'c', 0x00, ]; let mut lzxd = Lzxd::new(WindowSize::KB32); // size does not matter let res = lzxd.decompress_next(&data, 3); assert_eq!(res.unwrap(), [b'a', b'b', b'c']); } #[test] fn reset() { let data = [ 0x00, 0x30, 0x30, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, b'a', b'b', b'c', 0x00, ]; let mut lzxd = Lzxd::new(WindowSize::KB32); // size does not matter let res = lzxd.decompress_next(&data, 3); assert_eq!(res.unwrap(), [b'a', b'b', b'c']); lzxd.reset(); let res = lzxd.decompress_next(&data, 3); assert_eq!(res.unwrap(), [b'a', b'b', b'c']); } #[test] fn check_e8() { let data = [ 0x5B, 0x80, 0x80, 0x8D, 0x00, 0x30, 0x80, 0x0A, 0x18, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x54, 0x68, 0x69, 0x73, 0x20, 0x66, 0x69, 0x6C, 0x65, 0x20, 0x68, 0x61, 0x73, 0x20, 0x61, 0x6E, 0x20, 0x45, 0x38, 0x20, 0x62, 0x79, 0x74, 0x65, 0x20, 0x74, 0x6F, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x45, 0x38, 0x20, 0x74, 0x72, 0x61, 0x6E, 0x73, 0x6C, 0x61, 0x74, 0x69, 0x6F, 0x6E, 0x2C, 0x20, 0x58, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0xE8, 0x7B, 0x00, 0x00, 0x00, 
0xE8, 0x7B, 0x00, 0x00, 0x00, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64, ]; let mut lzxd = Lzxd::new(WindowSize::KB32); let res = lzxd.decompress_next(&data, 168); assert_eq!( res.unwrap(), b"This file has an E8 byte to test E8 translation, Xdddddddddddddddd\ dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd\ dddddddddddddd\xE8\xE9\xFF\xFF\xFF\xE8\xE4\xFF\xFF\xFFdddddddddddd" ); } } lzxd-0.2.5/src/tree.rs000064400000000000000000000251410072674642500127400ustar 00000000000000use std::fmt; use std::num::NonZeroU8; use std::ops::Range; use crate::{Bitstream, DecodeFailed}; /// The canonical tree cannot be used to decode elements. Instead, it behaves like a builder for /// instances of the actual tree that can decode elements efficiently. #[derive(Debug)] pub struct CanonicalTree { // > Each tree element can have a path length of [0, 16], where a zero path length indicates // > that the element has a zero frequency and is not present in the tree. // // We represent them as `u8` due to their very short range. path_lengths: Vec, } pub struct Tree { path_lengths: Vec, largest_length: NonZeroU8, huffman_tree: Vec, } impl CanonicalTree { pub fn new(count: usize) -> Self { Self { // > In the case of the very first such tree, the delta is calculated against a tree // > in which all elements have a zero path length. path_lengths: vec![0; count], } } /// Create a new `Tree` instance from this cast that can be used to decode elements. If the /// resulting tree is empty (all path lengths are 0), then `Ok(None)` is returned. /// /// This method transforms the canonical Huffman tree into a different structure that can /// be used to better decode elements. // > an LZXD decoder uses only the path lengths of the Huffman tree to reconstruct the // > identical tree, pub fn create_instance_allow_empty(&self) -> Result, DecodeFailed> { // The ideas implemented by this method are heavily inspired from LeonBlade's xnbcli // on GitHub. 
// // The path lengths contains the bit indices or zero if its not present, so find the // highest path length to determine how big our tree needs to be. let largest_length = match NonZeroU8::new(*self.path_lengths.iter().max().expect("empty path lengths")) { Some(x) => x, // N.B: If all the path lengths are zero, then the tree is empty (which is allowed). None => return Ok(None), }; let mut huffman_tree = vec![0; 1 << largest_length.get()]; // > a zero path length indicates that the element has a zero frequency and is not // > present in the tree. Tree elements are output in sequential order starting with the // > first element // // We start at the MSB, 1, and write the tree elements in sequential order from index 0. let mut pos = 0; for bit in 1..=largest_length.get() { let amount = 1 << (largest_length.get() - bit); // The codes correspond with the indices of the path length (because // `path_lengths[code]` is its path length). for code in 0..self.path_lengths.len() { // As soon as a code's path length matches with our bit index write the code as // many times as the bit index itself represents. if self.path_lengths[code] == bit { huffman_tree .get_mut(pos..pos + amount) .ok_or(DecodeFailed::InvalidPathLengths)? .iter_mut() .for_each(|x| *x = code as u16); pos += amount; } } } // If we didn't fill the entire table, the path lengths were wrong. if pos != huffman_tree.len() { Err(DecodeFailed::InvalidPathLengths)?; } Ok(Some(Tree { path_lengths: self.path_lengths.clone(), largest_length, huffman_tree, })) } /// Create a new `Tree` instance from this cast that can be used to decode elements. /// /// This method transforms the canonical Huffman tree into a different structure that can /// be used to better decode elements. // > an LZXD decoder uses only the path lengths of the Huffman tree to reconstruct the // > identical tree, pub fn create_instance(&self) -> Result { self.create_instance_allow_empty()? 
.ok_or(DecodeFailed::EmptyTree) } // Note: the tree already exists and is used to apply the deltas. pub fn update_range_with_pretree( &mut self, bitstream: &mut Bitstream, range: Range, ) -> Result<(), DecodeFailed> { // > Each of the 17 possible values of (len[x] - prev_len[x]) mod 17, plus three // > additional codes used for run-length encoding, are not output directly as 5-bit // > numbers but are instead encoded via a Huffman tree called the pretree. The pretree // > is generated dynamically according to the frequencies of the 20 allowable tree // > codes. The structure of the pretree is encoded in a total of 80 bits by using 4 bits // > to output the path length of each of the 20 pretree elements. Once again, a zero // > path length indicates a zero-frequency element. let pretree = { let mut path_lengths = Vec::with_capacity(20); for _ in 0..20 { path_lengths.push(bitstream.read_bits(4)? as u8) } Tree::from_path_lengths(path_lengths)? }; // > Tree elements are output in sequential order starting with the first element. let mut i = range.start; while i < range.end { // > The "real" tree is then encoded using the pretree Huffman codes. let code = pretree.decode_element(bitstream)?; // > Elements can be encoded in one of two ways: if several consecutive elements have // > the same path length, run-length encoding is employed; otherwise, the element is // > output by encoding the difference between the current path length and the // > previous path length of the tree, mod 17. match code { 0..=16 => { self.path_lengths[i] = (17 + self.path_lengths[i] - code as u8) % 17; i += 1; } // > Codes 17, 18, and 19 are used to represent consecutive elements that have the // > same path length. 17 => { let zeros = bitstream.read_bits(4)?; self.path_lengths .get_mut(i..i + zeros as usize + 4) .ok_or(DecodeFailed::InvalidPretreeRle)? 
.iter_mut() .for_each(|x| *x = 0); i += zeros as usize + 4; } 18 => { let zeros = bitstream.read_bits(5)?; self.path_lengths .get_mut(i..i + zeros as usize + 20) .ok_or(DecodeFailed::InvalidPretreeRle)? .iter_mut() .for_each(|x| *x = 0); i += zeros as usize + 20; } 19 => { let same = bitstream.read_bits(1)?; // "Decode new code" is used to parse the next code from the bitstream, which // has a value range of [0, 16]. let code = pretree.decode_element(bitstream)?; if code > 16 { return Err(DecodeFailed::InvalidPretreeElement(code))?; } let value = (17 + self.path_lengths[i] - code as u8) % 17; self.path_lengths .get_mut(i..i + same as usize + 4) .ok_or(DecodeFailed::InvalidPretreeRle)? .iter_mut() .for_each(|x| *x = value); i += same as usize + 4; } _ => return Err(DecodeFailed::InvalidPretreeElement(code)), }; } Ok(()) } } impl Tree { /// Create a new usable tree instance directly from known path lengths. pub fn from_path_lengths(path_lengths: Vec) -> Result { CanonicalTree { path_lengths }.create_instance() } pub fn decode_element(&self, bitstream: &mut Bitstream) -> Result { // Perform the inverse translation, peeking as many bits as our tree is… let code = self.huffman_tree[bitstream.peek_bits(self.largest_length.get()) as usize]; // …and advancing the stream for as many bits this code actually takes (read to seek). 
bitstream.read_bits(self.path_lengths[code as usize])?; Ok(code) } } impl fmt::Debug for Tree { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Tree") .field("path_lengths", &self.path_lengths.len()) .field("largest_length", &self.largest_length) .finish() } } #[cfg(test)] mod tests { use super::*; #[test] fn decode_simple_table() { // Based on some aligned offset tree let tree = Tree::from_path_lengths(vec![6, 5, 1, 3, 4, 6, 2, 0]).unwrap(); let value_count = vec![(2, 32), (6, 16), (3, 8), (4, 4), (1, 2), (0, 1), (5, 1)]; let mut i = 0; for (value, count) in value_count.into_iter() { (0..count).for_each(|_| { assert_eq!(tree.huffman_tree[i], value); i += 1; }) } } #[test] fn decode_complex_table() { // Based on the pretree of some length tree let tree = Tree::from_path_lengths(vec![ 1, 0, 0, 0, 0, 7, 3, 3, 4, 4, 5, 5, 5, 7, 8, 8, 0, 7, 0, 0, ]) .unwrap(); let value_count = vec![ (0, 128), (6, 32), (7, 32), (8, 16), (9, 16), (10, 8), (11, 8), (12, 8), (5, 2), (13, 2), (17, 2), (14, 1), (15, 1), ]; let mut i = 0; for (value, count) in value_count.into_iter() { (0..count).for_each(|_| { assert_eq!(tree.huffman_tree[i], value); i += 1; }) } } #[test] fn decode_elements() { let tree = Tree::from_path_lengths(vec![6, 5, 1, 3, 4, 6, 2, 0]).unwrap(); let buffer = [0x5b, 0xda, 0x3f, 0xf8]; let mut bitstream = Bitstream::new(&buffer); bitstream.read_bits(11).unwrap(); assert_eq!(tree.decode_element(&mut bitstream), Ok(3)); assert_eq!(tree.decode_element(&mut bitstream), Ok(5)); assert_eq!(tree.decode_element(&mut bitstream), Ok(6)); assert_eq!(tree.decode_element(&mut bitstream), Ok(2)); } } lzxd-0.2.5/src/window.rs000064400000000000000000000275430072674642500133200ustar 00000000000000use crate::{Bitstream, DecodeFailed, MAX_CHUNK_SIZE}; /// The window size is not stored in the compressed data stream and must be known before /// decoding begins. 
/// /// The window size should be the smallest power of two between 2^17 and 2^25 that is greater /// than or equal to the sum of the size of the reference data rounded up to a multiple of /// 32_768 and the size of the subject data. However, some implementations also seem to support /// a window size of less than 2^17, and this one is no exception. #[repr(u32)] #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] pub enum WindowSize { /// Window size of 32 KB (2^15 bytes). KB32 = 0x0000_8000, /// Window size of 64 KB (2^16 bytes). KB64 = 0x0001_0000, /// Window size of 128 KB (2^17 bytes). KB128 = 0x0002_0000, /// Window size of 256 KB (2^18 bytes). KB256 = 0x0004_0000, /// Window size of 512 KB (2^19 bytes). KB512 = 0x0008_0000, /// Window size of 1 MB (2^20 bytes). MB1 = 0x0010_0000, /// Window size of 2 MB (2^21 bytes). MB2 = 0x0020_0000, /// Window size of 4 MB (2^22 bytes). MB4 = 0x0040_0000, /// Window size of 8 MB (2^23 bytes). MB8 = 0x0080_0000, /// Window size of 16 MB (2^24 bytes). MB16 = 0x0100_0000, /// Window size of 32 MB (2^25 bytes). MB32 = 0x0200_0000, } /// A sliding window of a certain size. /// /// A `std::collections::VecDeque` is not used because the `deque_make_contiguous` feature /// is [nightly-only experimental](https://github.com/rust-lang/rust/issues/70929). pub struct Window { pos: usize, buffer: Box<[u8]>, } impl WindowSize { /// The window size determines the number of window subdivisions, or position slots. pub(crate) fn position_slots(&self) -> usize { use WindowSize::*; match self { KB32 => 30, KB64 => 32, KB128 => 34, KB256 => 36, KB512 => 38, MB1 => 42, MB2 => 50, MB4 => 66, MB8 => 98, MB16 => 162, MB32 => 290, } } fn value(&self) -> usize { *self as usize } pub(crate) fn create_buffer(&self) -> Window { // The window must be at least as big as the smallest chunk, or else we can't possibly // contain an entire chunk inside of the sliding window. 
assert!(self.value() >= MAX_CHUNK_SIZE); // We can use bit operations if we rely on this assumption so make sure it holds. assert!(self.value().is_power_of_two()); Window { pos: 0, buffer: vec![0; self.value()].into_boxed_slice(), } } } impl Window { fn advance(&mut self, delta: usize) { self.pos += delta; if self.pos >= self.buffer.len() { self.pos -= self.buffer.len(); } } pub fn push(&mut self, value: u8) { self.buffer[self.pos] = value; self.advance(1); } pub fn copy_from_self(&mut self, offset: usize, length: usize) { // For the fast path: // * Source cannot wrap around // * `copy_within` won't overwrite as we go but we need that // * Destination cannot wrap around if offset <= self.pos && length <= offset && self.pos + length < self.buffer.len() { // Best case: neither source or destination wrap around // TODO write a test for this because it used to fail let start = self.pos - offset; self.buffer.copy_within(start..start + length, self.pos); } else { // Either source or destination wrap around. We could expand this case into three // (one for only source wrapping, one for only destination wrapping, one for both) // but it's not really worth the effort. // // We could work out the ranges for use in `copy_within` but this is a lot simpler. let mask = self.buffer.len() - 1; // relying on power of two assumption for i in 0..length { let dst = (self.pos + i) & mask; let src = (self.buffer.len() + self.pos + i - offset) & mask; self.buffer[dst] = self.buffer[src]; } } self.advance(length); } pub fn copy_from_bitstream( &mut self, bitstream: &mut Bitstream, len: usize, ) -> Result<(), DecodeFailed> { if len > self.buffer.len() { return Err(DecodeFailed::WindowTooSmall); } if self.pos + len > self.buffer.len() { let shift = self.pos + len - self.buffer.len(); self.pos -= shift; // No need to actually save the part we're about to overwrite because when reading // with the bitstream we would also overwrite it anyway. 
self.buffer.copy_within(shift.., 0); } bitstream.read_raw(&mut self.buffer[self.pos..self.pos + len])?; self.advance(len); Ok(()) } pub fn past_view(&mut self, len: usize) -> Result<&mut [u8], DecodeFailed> { if len > MAX_CHUNK_SIZE { return Err(DecodeFailed::ChunkTooLong); } // Being at zero means we're actually at max length where is impossible for `len` to be // bigger and we would not want to bother shifting the entire array to end where it was. if self.pos != 0 && len > self.pos { let shift = len - self.pos; self.advance(shift); let tmp = self.buffer[self.buffer.len() - shift..].to_vec(); self.buffer.copy_within(0..self.buffer.len() - shift, shift); self.buffer[..shift].copy_from_slice(&tmp); } // Because we want to read behind us, being at zero means we're at the end let pos = if self.pos == 0 { self.buffer.len() } else { self.pos }; Ok(&mut self.buffer[pos - len..pos]) } } #[cfg(test)] mod tests { use super::*; #[test] fn check_push() { let mut window = WindowSize::KB32.create_buffer(); window.push(1); window.push(2); window.push(3); assert_eq!(window.pos, 3); assert_eq!(&window.buffer[..3], &[1, 2, 3]); assert!(window.buffer[3..].iter().all(|&x| x == 0)); } #[test] fn check_push_before_boundary() { let mut window = WindowSize::KB32.create_buffer(); window.pos = window.buffer.len() - 1; window.push(1); assert_eq!(window.pos, 0); } #[test] fn check_push_at_boundary() { let mut window = WindowSize::KB32.create_buffer(); for _ in 0..((1 << 15) - 2) { window.push(0); } window.push(1); window.push(2); window.push(3); window.push(4); assert_eq!(window.pos, 2); assert_eq!(&window.buffer[window.buffer.len() - 2..], &[1, 2]); assert_eq!(&window.buffer[..2], &[3, 4]); assert!(window.buffer[2..window.buffer.len() - 2] .iter() .all(|&x| x == 0)); } #[test] fn check_copy_from_self() { let mut window = WindowSize::KB32.create_buffer(); window.buffer[0] = 1; window.buffer[1] = 2; window.buffer[2] = 3; window.pos = 3; window.copy_from_self(3, 2); assert_eq!(window.pos, 5); 
assert_eq!(&window.buffer[..5], &[1, 2, 3, 1, 2]); assert!(window.buffer[5..].iter().all(|&x| x == 0)); } #[test] fn check_copy_from_self_overlap() { let mut window = WindowSize::KB32.create_buffer(); window.buffer[0] = 1; window.buffer[1] = 2; window.buffer[2] = 3; window.pos = 3; window.copy_from_self(2, 3); assert_eq!(window.pos, 6); assert_eq!(&window.buffer[..6], &[1, 2, 3, 2, 3, 2]); assert!(window.buffer[6..].iter().all(|&x| x == 0)); } #[test] fn check_copy_at_boundary_from_self() { let mut window = WindowSize::KB32.create_buffer(); window.buffer[window.buffer.len() - 3] = 1; window.buffer[window.buffer.len() - 2] = 2; window.pos = window.buffer.len() - 1; window.copy_from_self(2, 2); assert_eq!(window.pos, 1); assert_eq!(window.buffer[0], 2); assert_eq!(&window.buffer[window.buffer.len() - 3..], &[1, 2, 1]); assert!(window.buffer[1..window.buffer.len() - 3] .iter() .all(|&x| x == 0)); } #[test] fn check_copy_from_self_before_boundary() { let mut window = WindowSize::KB32.create_buffer(); window.buffer[window.buffer.len() - 4] = 1; window.buffer[window.buffer.len() - 3] = 2; window.pos = window.buffer.len() - 2; window.copy_from_self(2, 2); assert_eq!(window.pos, 0); } #[test] fn check_copy_from_self_at_boundary() { let mut window = WindowSize::KB32.create_buffer(); window.buffer[window.buffer.len() - 2] = 1; window.buffer[window.buffer.len() - 1] = 2; window.buffer[0] = 3; window.buffer[1] = 4; window.pos = 2; window.copy_from_self(4, 3); assert_eq!(window.pos, 5); assert_eq!(&window.buffer[..5], &[3, 4, 1, 2, 3]); assert_eq!(&window.buffer[window.buffer.len() - 2..], &[1, 2]); assert!(window.buffer[5..window.buffer.len() - 2] .iter() .all(|&x| x == 0)); } #[test] fn check_bitstream() { let buffer = [1, 2, 3, 4]; let mut bitstream = Bitstream::new(&buffer); let mut window = WindowSize::KB32.create_buffer(); window.copy_from_bitstream(&mut bitstream, 4).unwrap(); assert_eq!(window.pos, 4); assert_eq!(&window.buffer[..4], &[1, 2, 3, 4]); 
/// Asking for one byte more than the maximum chunk size (32 KB) must be rejected.
#[test]
fn check_past_view_too_long() {
    let mut window = WindowSize::KB32.create_buffer();
    // `+` binds tighter than `<<`, so without the parentheses this would ask for
    // `1 << 16` bytes and miss the boundary the test is meant to check.
    assert_eq!(
        window.past_view((1 << 15) + 1),
        Err(DecodeFailed::ChunkTooLong)
    );
}