gix-pack-0.56.0/.cargo_vcs_info.json0000644000000001460000000000100126300ustar { "git": { "sha1": "beb0ea8c4ff94c64b7773772a9d388ccb403f3c1" }, "path_in_vcs": "gix-pack" }gix-pack-0.56.0/Cargo.toml0000644000000113730000000000100106320ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.65" name = "gix-pack" version = "0.56.0" authors = ["Sebastian Thiel "] build = false include = [ "src/**/*", "LICENSE-*", ] autolib = false autobins = false autoexamples = false autotests = false autobenches = false description = "Implements git packs and related data structures" readme = false license = "MIT OR Apache-2.0" repository = "https://github.com/GitoxideLabs/gitoxide" [package.metadata.docs.rs] all-features = true features = [ "document-features", "pack-cache-lru-dynamic", "object-cache-dynamic", "serde", ] [lib] name = "gix_pack" path = "src/lib.rs" doctest = false [dependencies.clru] version = "0.6.1" optional = true [dependencies.document-features] version = "0.2.0" optional = true [dependencies.gix-chunk] version = "^0.4.10" [dependencies.gix-diff] version = "^0.49.0" optional = true default-features = false [dependencies.gix-features] version = "^0.39.1" features = [ "crc32", "rustsha1", "progress", "zlib", ] [dependencies.gix-hash] version = "^0.15.1" [dependencies.gix-hashtable] version = "^0.6.0" optional = true [dependencies.gix-object] version = "^0.46.1" [dependencies.gix-path] version = "^0.10.13" [dependencies.gix-traverse] version = "^0.43.1" optional = true [dependencies.memmap2] version = "0.9.0" [dependencies.parking_lot] version = "0.12.0" optional = true default-features = false [dependencies.serde] version = "1.0.114" features = ["derive"] optional = true default-features = false [dependencies.smallvec] version = "1.3.0" [dependencies.thiserror] version = "2.0.0" [dependencies.uluru] version = "3.0.0" optional = true [dev-dependencies] [features] default = [ "generate", "streaming-input", ] generate = [ "dep:gix-traverse", "dep:gix-diff", "dep:parking_lot", "dep:gix-hashtable", ] object-cache-dynamic = [ "dep:clru", "dep:gix-hashtable", ] pack-cache-lru-dynamic = ["dep:clru"] pack-cache-lru-static = ["dep:uluru"] serde = [ "dep:serde", "gix-object/serde", ] streaming-input = [ "dep:parking_lot", "dep:gix-tempfile", ] wasm = ["gix-diff?/wasm"] [target.'cfg(not(target_arch = "wasm32"))'.dependencies.gix-tempfile] version = "^15.0.0" optional = true default-features = false [lints.clippy] bool_to_int_with_if = "allow" borrow_as_ptr = "allow" cast_lossless = "allow" cast_possible_truncation = "allow" cast_possible_wrap = "allow" cast_precision_loss = "allow" cast_sign_loss = "allow" checked_conversions = "allow" copy_iterator = "allow" default_trait_access = "allow" doc_markdown = "allow" empty_docs = "allow" enum_glob_use = "allow" explicit_deref_methods = "allow" explicit_into_iter_loop = "allow" explicit_iter_loop = "allow" filter_map_next = "allow" fn_params_excessive_bools = "allow" from_iter_instead_of_collect = "allow" if_not_else = "allow" ignored_unit_patterns = "allow" implicit_clone = "allow" 
inconsistent_struct_constructor = "allow" inefficient_to_string = "allow" inline_always = "allow" items_after_statements = "allow" iter_not_returning_iterator = "allow" iter_without_into_iter = "allow" manual_assert = "allow" manual_is_variant_and = "allow" manual_let_else = "allow" manual_string_new = "allow" many_single_char_names = "allow" match_bool = "allow" match_same_arms = "allow" match_wild_err_arm = "allow" match_wildcard_for_single_variants = "allow" missing_errors_doc = "allow" missing_panics_doc = "allow" module_name_repetitions = "allow" must_use_candidate = "allow" mut_mut = "allow" naive_bytecount = "allow" needless_for_each = "allow" needless_pass_by_value = "allow" needless_raw_string_hashes = "allow" no_effect_underscore_binding = "allow" option_option = "allow" range_plus_one = "allow" redundant_else = "allow" return_self_not_must_use = "allow" should_panic_without_expect = "allow" similar_names = "allow" single_match_else = "allow" stable_sort_primitive = "allow" struct_excessive_bools = "allow" struct_field_names = "allow" too_long_first_doc_paragraph = "allow" too_many_lines = "allow" transmute_ptr_to_ptr = "allow" trivially_copy_pass_by_ref = "allow" unnecessary_join = "allow" unnecessary_wraps = "allow" unreadable_literal = "allow" unused_self = "allow" used_underscore_binding = "allow" wildcard_imports = "allow" [lints.clippy.pedantic] level = "warn" priority = -1 [lints.rust] gix-pack-0.56.0/Cargo.toml.orig000064400000000000000000000055771046102023000143240ustar 00000000000000lints.workspace = true [package] name = "gix-pack" version = "0.56.0" repository = "https://github.com/GitoxideLabs/gitoxide" authors = ["Sebastian Thiel "] license = "MIT OR Apache-2.0" description = "Implements git packs and related data structures" edition = "2021" include = ["src/**/*", "LICENSE-*"] rust-version = "1.65" autotests = false [lib] doctest = false [features] default = ["generate", "streaming-input"] ## generate new packs from a set of objects. generate = ["dep:gix-traverse", "dep:gix-diff", "dep:parking_lot", "dep:gix-hashtable"] ## Receive a pack as datastream and resolve it streaming-input = ["dep:parking_lot", "dep:gix-tempfile"] ## Provide a fixed-size allocation-free LRU cache for packs. It's useful if caching is desired while keeping the memory footprint ## for the LRU-cache itself low. pack-cache-lru-static = ["dep:uluru"] ## Provide a hash-map based LRU cache whose eviction is based a memory cap calculated from object data. pack-cache-lru-dynamic = ["dep:clru"] ## If set, select algorithms may additionally use a full-object cache which is queried before the pack itself. object-cache-dynamic = ["dep:clru", "dep:gix-hashtable"] ## Data structures implement `serde::Serialize` and `serde::Deserialize`. serde = ["dep:serde", "gix-object/serde"] ## Make it possible to compile to the `wasm32-unknown-unknown` target. 
wasm = ["gix-diff?/wasm"] [dependencies] gix-features = { version = "^0.39.1", path = "../gix-features", features = ["crc32", "rustsha1", "progress", "zlib"] } gix-path = { version = "^0.10.13", path = "../gix-path" } gix-hash = { version = "^0.15.1", path = "../gix-hash" } gix-chunk = { version = "^0.4.10", path = "../gix-chunk" } gix-object = { version = "^0.46.1", path = "../gix-object" } gix-hashtable = { version = "^0.6.0", path = "../gix-hashtable", optional = true } # for streaming of packs (input, output) gix-traverse = { version = "^0.43.1", path = "../gix-traverse", optional = true } gix-diff = { version = "^0.49.0", path = "../gix-diff", default-features = false, optional = true } memmap2 = "0.9.0" smallvec = "1.3.0" parking_lot = { version = "0.12.0", default-features = false, optional = true } thiserror = "2.0.0" # for caching uluru = { version = "3.0.0", optional = true } clru = { version = "0.6.1", optional = true } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } ## If enabled, `cargo doc` will see feature documentation from this manifest. document-features = { version = "0.2.0", optional = true } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] gix-tempfile = { version = "^15.0.0", default-features = false, path = "../gix-tempfile", optional = true } [dev-dependencies] gix-testtools = { path = "../tests/tools" } [package.metadata.docs.rs] all-features = true features = ["document-features", "pack-cache-lru-dynamic", "object-cache-dynamic", "serde"] gix-pack-0.56.0/LICENSE-APACHE000064400000000000000000000247461046102023000133600ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. gix-pack-0.56.0/LICENSE-MIT000064400000000000000000000017771046102023000130670ustar 00000000000000Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. gix-pack-0.56.0/src/bundle/find.rs000064400000000000000000000052711046102023000147520ustar 00000000000000use gix_features::zlib; impl crate::Bundle { /// Find an object with the given [`ObjectId`](gix_hash::ObjectId) and place its data into `out`. /// `inflate` is used to decompress objects, and will be reset before first use, but not after the last use. /// /// [`cache`](crate::cache::DecodeEntry) is used to accelerate the lookup. 
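    ///
    /// A minimal sketch of a typical call, assuming a `bundle` was opened elsewhere and `id` names an object
    /// that may live in this pack (the identifiers below are illustrative, not part of this crate's API surface):
    ///
    /// ```ignore
    /// let mut buf = Vec::new();
    /// let mut inflate = gix_features::zlib::Inflate::default();
    /// let mut cache = gix_pack::cache::Never; // or an LRU cache if a `pack-cache-lru-*` feature is enabled
    /// if let Some((obj, _location)) = bundle.find(&id, &mut buf, &mut inflate, &mut cache)? {
    ///     // `obj.data` borrows from `buf` and is ready for object decoding.
    /// }
    /// ```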
/// /// **Note** that ref deltas are automatically resolved within this pack only, which makes this implementation unusable /// for thin packs, which by now are expected to be resolved already. pub fn find<'a>( &self, id: &gix_hash::oid, out: &'a mut Vec, inflate: &mut zlib::Inflate, cache: &mut dyn crate::cache::DecodeEntry, ) -> Result, crate::data::entry::Location)>, crate::data::decode::Error> { let idx = match self.index.lookup(id) { Some(idx) => idx, None => return Ok(None), }; self.get_object_by_index(idx, out, inflate, cache).map(Some) } /// Special-use function to get an object given an index previously returned from /// [index::File::](crate::index::File::lookup()). /// `inflate` is used to decompress objects, and will be reset before first use, but not after the last use. /// /// # Panics /// /// If `index` is out of bounds. pub fn get_object_by_index<'a>( &self, idx: u32, out: &'a mut Vec, inflate: &mut zlib::Inflate, cache: &mut dyn crate::cache::DecodeEntry, ) -> Result<(gix_object::Data<'a>, crate::data::entry::Location), crate::data::decode::Error> { let ofs = self.index.pack_offset_at_index(idx); let pack_entry = self.pack.entry(ofs)?; let header_size = pack_entry.header_size(); self.pack .decode_entry( pack_entry, out, inflate, &|id, _out| { let idx = self.index.lookup(id)?; self.pack .entry(self.index.pack_offset_at_index(idx)) .ok() .map(crate::data::decode::entry::ResolvedBase::InPack) }, cache, ) .map(move |r| { ( gix_object::Data { kind: r.kind, data: out.as_slice(), }, crate::data::entry::Location { pack_id: self.pack.id, pack_offset: ofs, entry_size: r.compressed_size + header_size, }, ) }) } } gix-pack-0.56.0/src/bundle/init.rs000064400000000000000000000032521046102023000147720ustar 00000000000000use std::path::{Path, PathBuf}; use crate::Bundle; /// Returned by [`Bundle::at()`] #[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("An 'idx' extension is expected of an index file: '{0}'")] InvalidPath(PathBuf), #[error(transparent)] Pack(#[from] crate::data::header::decode::Error), #[error(transparent)] Index(#[from] crate::index::init::Error), } /// Initialization impl Bundle { /// Create a `Bundle` from `path`, which is either a pack file _(*.pack)_ or an index file _(*.idx)_. /// /// The corresponding complementary file is expected to be present. /// /// The `object_hash` is a way to read (and write) the same file format with different hashes, as the hash kind /// isn't stored within the file format itself. 
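    ///
    /// An illustrative sketch (the pack name below is made up); passing either the `.idx` or the `.pack`
    /// path works as long as the sibling file with the other extension is present:
    ///
    /// ```ignore
    /// use gix_pack::Bundle;
    /// let bundle = Bundle::at("objects/pack/pack-feedbeef.idx", gix_hash::Kind::Sha1)?;
    /// // `bundle.index` and `bundle.pack` are now both usable, e.g. for `Bundle::find()`.
    /// ```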
pub fn at(path: impl AsRef, object_hash: gix_hash::Kind) -> Result { Self::at_inner(path.as_ref(), object_hash) } fn at_inner(path: &Path, object_hash: gix_hash::Kind) -> Result { let ext = path .extension() .and_then(std::ffi::OsStr::to_str) .ok_or_else(|| Error::InvalidPath(path.to_owned()))?; Ok(match ext { "idx" => Self { index: crate::index::File::at(path, object_hash)?, pack: crate::data::File::at(path.with_extension("pack"), object_hash)?, }, "pack" => Self { pack: crate::data::File::at(path, object_hash)?, index: crate::index::File::at(path.with_extension("idx"), object_hash)?, }, _ => return Err(Error::InvalidPath(path.to_owned())), }) } } gix-pack-0.56.0/src/bundle/mod.rs000064400000000000000000000034461046102023000146130ustar 00000000000000/// pub mod init; mod find; /// #[cfg(all(not(feature = "wasm"), feature = "streaming-input"))] pub mod write; /// pub mod verify { use std::sync::atomic::AtomicBool; use gix_features::progress::DynNestedProgress; /// pub mod integrity { /// Returned by [`Bundle::verify_integrity()`][crate::Bundle::verify_integrity()]. pub struct Outcome { /// The computed checksum of the index which matched the stored one. pub actual_index_checksum: gix_hash::ObjectId, /// The packs traversal outcome pub pack_traverse_outcome: crate::index::traverse::Statistics, } } use crate::Bundle; impl Bundle { /// Similar to [`crate::index::File::verify_integrity()`] but more convenient to call as the presence of the /// pack file is a given. pub fn verify_integrity( &self, progress: &mut dyn DynNestedProgress, should_interrupt: &AtomicBool, options: crate::index::verify::integrity::Options, ) -> Result> where C: crate::cache::DecodeEntry, F: Fn() -> C + Send + Clone, { self.index .verify_integrity( Some(crate::index::verify::PackContext { data: &self.pack, options, }), progress, should_interrupt, ) .map(|o| integrity::Outcome { actual_index_checksum: o.actual_index_checksum, pack_traverse_outcome: o.pack_traverse_statistics.expect("pack is set"), }) } } } gix-pack-0.56.0/src/bundle/write/error.rs000064400000000000000000000011701046102023000163070ustar 00000000000000use std::io; use gix_tempfile::handle::Writable; /// The error returned by [`Bundle::write_to_directory()`][crate::Bundle::write_to_directory()] #[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("An IO error occurred when reading the pack or creating a temporary file")] Io(#[from] io::Error), #[error(transparent)] PackIter(#[from] crate::data::input::Error), #[error("Could not move a temporary file into its desired place")] Persist(#[from] gix_tempfile::handle::persist::Error), #[error(transparent)] IndexWrite(#[from] crate::index::write::Error), } gix-pack-0.56.0/src/bundle/write/mod.rs000064400000000000000000000376101046102023000157450ustar 00000000000000use std::{ io, io::Write, marker::PhantomData, path::{Path, PathBuf}, sync::{atomic::AtomicBool, Arc}, }; use gix_features::{interrupt, progress, progress::Progress}; use gix_tempfile::{AutoRemove, ContainingDirectory}; use crate::data; mod error; pub use error::Error; use gix_features::progress::prodash::DynNestedProgress; mod types; use types::{LockWriter, PassThrough}; pub use types::{Options, Outcome}; use crate::bundle::write::types::SharedTempFile; /// The progress ids used in [`write_to_directory()`][crate::Bundle::write_to_directory()]. /// /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. 
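///
/// For instance, the raw 4-byte identifiers can be compared against incoming progress ids
/// (a small sketch; the `From` impl below in this module defines the actual byte values):
///
/// ```ignore
/// let id: gix_features::progress::Id = ProgressId::ReadPackBytes.into();
/// assert_eq!(id, *b"BWRB");
/// ```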
#[derive(Debug, Copy, Clone)] pub enum ProgressId { /// The amount of bytes read from the input pack data file. ReadPackBytes, /// A root progress counting logical steps towards an index file on disk. /// /// Underneath will be more progress information related to actually producing the index. IndexingSteps(PhantomData), } impl From for gix_features::progress::Id { fn from(v: ProgressId) -> Self { match v { ProgressId::ReadPackBytes => *b"BWRB", ProgressId::IndexingSteps(_) => *b"BWCI", } } } impl crate::Bundle { /// Given a `pack` data stream, write it along with a generated index into the `directory` if `Some` or discard all output if `None`. /// /// In the latter case, the functionality provided here is more a kind of pack data stream validation. /// /// * `progress` provides detailed progress information which can be discarded with [`gix_features::progress::Discard`]. /// * `should_interrupt` is checked regularly and when true, the whole operation will stop. /// * `thin_pack_base_object_lookup` If set, we expect to see a thin-pack with objects that reference their base object by object id which is /// expected to exist in the object database the bundle is contained within. /// `options` further configure how the task is performed. /// /// # Note /// /// * the resulting pack may be empty, that is, contains zero objects in some situations. This is a valid reply by a server and should /// be accounted for. /// - Empty packs always have the same name and not handling this case will result in at most one superfluous pack. pub fn write_to_directory( pack: &mut dyn io::BufRead, directory: Option<&Path>, progress: &mut dyn DynNestedProgress, should_interrupt: &AtomicBool, thin_pack_base_object_lookup: Option, options: Options, ) -> Result { let _span = gix_features::trace::coarse!("gix_pack::Bundle::write_to_directory()"); let mut read_progress = progress.add_child_with_id("read pack".into(), ProgressId::ReadPackBytes.into()); read_progress.init(None, progress::bytes()); let pack = progress::Read { inner: pack, progress: progress::ThroughputOnDrop::new(read_progress), }; let object_hash = options.object_hash; let data_file = Arc::new(parking_lot::Mutex::new(io::BufWriter::with_capacity( 64 * 1024, match directory.as_ref() { Some(directory) => gix_tempfile::new(directory, ContainingDirectory::Exists, AutoRemove::Tempfile)?, None => gix_tempfile::new(std::env::temp_dir(), ContainingDirectory::Exists, AutoRemove::Tempfile)?, }, ))); let (pack_entries_iter, pack_version): ( Box>>, _, ) = match thin_pack_base_object_lookup { Some(thin_pack_lookup) => { let pack = interrupt::Read { inner: pack, should_interrupt, }; let buffered_pack = io::BufReader::new(pack); let pack_entries_iter = data::input::LookupRefDeltaObjectsIter::new( data::input::BytesToEntriesIter::new_from_header( buffered_pack, options.iteration_mode, data::input::EntryDataMode::KeepAndCrc32, object_hash, )?, thin_pack_lookup, ); let pack_version = pack_entries_iter.inner.version(); let pack_entries_iter = data::input::EntriesToBytesIter::new( pack_entries_iter, LockWriter { writer: data_file.clone(), }, pack_version, gix_hash::Kind::Sha1, // Thin packs imply a pack being transported, and there we only ever know SHA1 at the moment. ); (Box::new(pack_entries_iter), pack_version) } None => { let pack = PassThrough { reader: interrupt::Read { inner: pack, should_interrupt, }, writer: Some(data_file.clone()), }; // This buf-reader is required to assure we call 'read()' in order to fill the (extra) buffer. 
Otherwise all the counting // we do with the wrapped pack reader doesn't work as it does not expect anyone to call BufRead functions directly. // However, this is exactly what's happening in the ZipReader implementation that is eventually used. // The performance impact of this is probably negligible, compared to all the other work that is done anyway :D. let buffered_pack = io::BufReader::new(pack); let pack_entries_iter = data::input::BytesToEntriesIter::new_from_header( buffered_pack, options.iteration_mode, data::input::EntryDataMode::Crc32, object_hash, )?; let pack_version = pack_entries_iter.version(); (Box::new(pack_entries_iter), pack_version) } }; let WriteOutcome { outcome, data_path, index_path, keep_path, } = crate::Bundle::inner_write( directory, progress, options, data_file, pack_entries_iter, should_interrupt, pack_version, )?; Ok(Outcome { index: outcome, object_hash, pack_version, data_path, index_path, keep_path, }) } /// Equivalent to [`write_to_directory()`][crate::Bundle::write_to_directory()] but offloads reading of the pack into its own thread, hence the `Send + 'static'` bounds. /// /// # Note /// /// As it sends portions of the input to a thread it requires the 'static lifetime for the interrupt flags. This can only /// be satisfied by a static `AtomicBool` which is only suitable for programs that only run one of these operations at a time /// or don't mind that all of them abort when the flag is set. pub fn write_to_directory_eagerly( pack: Box, pack_size: Option, directory: Option>, progress: &mut dyn DynNestedProgress, should_interrupt: &'static AtomicBool, thin_pack_base_object_lookup: Option, options: Options, ) -> Result { let _span = gix_features::trace::coarse!("gix_pack::Bundle::write_to_directory_eagerly()"); let mut read_progress = progress.add_child_with_id("read pack".into(), ProgressId::ReadPackBytes.into()); /* Bundle Write Read pack Bytes*/ read_progress.init(pack_size.map(|s| s as usize), progress::bytes()); let pack = progress::Read { inner: pack, progress: progress::ThroughputOnDrop::new(read_progress), }; let data_file = Arc::new(parking_lot::Mutex::new(io::BufWriter::new(match directory.as_ref() { Some(directory) => gix_tempfile::new(directory, ContainingDirectory::Exists, AutoRemove::Tempfile)?, None => gix_tempfile::new(std::env::temp_dir(), ContainingDirectory::Exists, AutoRemove::Tempfile)?, }))); let object_hash = options.object_hash; let eight_pages = 4096 * 8; let (pack_entries_iter, pack_version): ( Box> + Send + 'static>, _, ) = match thin_pack_base_object_lookup { Some(thin_pack_lookup) => { let pack = interrupt::Read { inner: pack, should_interrupt, }; let buffered_pack = io::BufReader::with_capacity(eight_pages, pack); let pack_entries_iter = data::input::LookupRefDeltaObjectsIter::new( data::input::BytesToEntriesIter::new_from_header( buffered_pack, options.iteration_mode, data::input::EntryDataMode::KeepAndCrc32, object_hash, )?, thin_pack_lookup, ); let pack_kind = pack_entries_iter.inner.version(); (Box::new(pack_entries_iter), pack_kind) } None => { let pack = PassThrough { reader: interrupt::Read { inner: pack, should_interrupt, }, writer: Some(data_file.clone()), }; let buffered_pack = io::BufReader::with_capacity(eight_pages, pack); let pack_entries_iter = data::input::BytesToEntriesIter::new_from_header( buffered_pack, options.iteration_mode, data::input::EntryDataMode::Crc32, object_hash, )?; let pack_kind = pack_entries_iter.version(); (Box::new(pack_entries_iter), pack_kind) } }; let num_objects = 
pack_entries_iter.size_hint().0; let pack_entries_iter = gix_features::parallel::EagerIterIf::new(move || num_objects > 25_000, pack_entries_iter, 5_000, 5); let WriteOutcome { outcome, data_path, index_path, keep_path, } = crate::Bundle::inner_write( directory, progress, options, data_file, Box::new(pack_entries_iter), should_interrupt, pack_version, )?; Ok(Outcome { index: outcome, object_hash, pack_version, data_path, index_path, keep_path, }) } fn inner_write<'a>( directory: Option>, progress: &mut dyn DynNestedProgress, Options { thread_limit, iteration_mode: _, index_version: index_kind, object_hash, }: Options, data_file: SharedTempFile, mut pack_entries_iter: Box> + 'a>, should_interrupt: &AtomicBool, pack_version: data::Version, ) -> Result { let mut indexing_progress = progress.add_child_with_id( "create index file".into(), ProgressId::IndexingSteps(Default::default()).into(), ); Ok(match directory { Some(directory) => { let directory = directory.as_ref(); let mut index_file = gix_tempfile::new(directory, ContainingDirectory::Exists, AutoRemove::Tempfile)?; let outcome = crate::index::File::write_data_iter_to_stream( index_kind, { let data_file = Arc::clone(&data_file); move || new_pack_file_resolver(data_file) }, &mut pack_entries_iter, thread_limit, &mut indexing_progress, &mut index_file, should_interrupt, object_hash, pack_version, )?; drop(pack_entries_iter); if outcome.num_objects == 0 { WriteOutcome { outcome, data_path: None, index_path: None, keep_path: None, } } else { let data_path = directory.join(format!("pack-{}.pack", outcome.data_hash.to_hex())); let index_path = data_path.with_extension("idx"); let keep_path = if data_path.is_file() { // avoid trying to overwrite existing files, we know they have the same content // and this is likely to fail on Windows as negotiation opened the pack. None } else { let keep_path = data_path.with_extension("keep"); std::fs::write(&keep_path, b"")?; Arc::try_unwrap(data_file) .expect("only one handle left after pack was consumed") .into_inner() .into_inner() .map_err(|err| Error::from(err.into_error()))? .persist(&data_path)?; Some(keep_path) }; if !index_path.is_file() { index_file .persist(&index_path) .map_err(|err| { gix_features::trace::warn!("pack file at \"{}\" is retained despite failing to move the index file into place. 
You can use plumbing to make it usable.",data_path.display()); err })?; } WriteOutcome { outcome, data_path: Some(data_path), index_path: Some(index_path), keep_path, } } } None => WriteOutcome { outcome: crate::index::File::write_data_iter_to_stream( index_kind, move || new_pack_file_resolver(data_file), &mut pack_entries_iter, thread_limit, &mut indexing_progress, &mut io::sink(), should_interrupt, object_hash, pack_version, )?, data_path: None, index_path: None, keep_path: None, }, }) } } fn resolve_entry(range: data::EntryRange, mapped_file: &memmap2::Mmap) -> Option<&[u8]> { mapped_file.get(range.start as usize..range.end as usize) } #[allow(clippy::type_complexity)] // cannot typedef impl Fn fn new_pack_file_resolver( data_file: SharedTempFile, ) -> io::Result<( impl Fn(data::EntryRange, &memmap2::Mmap) -> Option<&[u8]> + Send + Clone, memmap2::Mmap, )> { let mut guard = data_file.lock(); guard.flush()?; let mapped_file = crate::mmap::read_only(&guard.get_mut().with_mut(|f| f.path().to_owned())?)?; Ok((resolve_entry, mapped_file)) } struct WriteOutcome { outcome: crate::index::write::Outcome, data_path: Option, index_path: Option, keep_path: Option, } gix-pack-0.56.0/src/bundle/write/types.rs000064400000000000000000000101521046102023000163220ustar 00000000000000use std::{hash::Hash, io, io::SeekFrom, path::PathBuf, sync::Arc}; use gix_tempfile::handle::Writable; /// Configuration for [`write_to_directory`][crate::Bundle::write_to_directory()] or /// [`write_to_directory_eagerly`][crate::Bundle::write_to_directory_eagerly()] #[derive(Debug, Clone)] pub struct Options { /// The amount of threads to use at most when resolving the pack. If `None`, all logical cores are used. pub thread_limit: Option, /// Determine how much processing to spend on protecting against corruption or recovering from errors. pub iteration_mode: crate::data::input::Mode, /// The version of pack index to write, should be [`crate::index::Version::default()`] pub index_version: crate::index::Version, /// The kind of hash to use when writing the bundle. pub object_hash: gix_hash::Kind, } impl Default for Options { /// Options which favor speed and correctness and write the most commonly supported index file. fn default() -> Self { Options { thread_limit: None, iteration_mode: crate::data::input::Mode::Verify, index_version: Default::default(), object_hash: Default::default(), } } } /// Returned by [`write_to_directory`][crate::Bundle::write_to_directory()] or /// [`write_to_directory_eagerly`][crate::Bundle::write_to_directory_eagerly()] #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Outcome { /// The successful result of the index write operation. pub index: crate::index::write::Outcome, /// The version of the pack. pub pack_version: crate::data::Version, /// The kind of hash stored within the pack and indices. pub object_hash: gix_hash::Kind, /// The path to the pack index file. pub index_path: Option, /// The path to the pack data file. pub data_path: Option, /// The path to the `.keep` file to prevent collection of the newly written pack until refs are pointing to it. /// It might be `None` if the file at `data_path` already existed, indicating that we have received a pack that /// was already present locally. /// /// The file is created right before moving the pack data and index data into place (i.e. `data_path` and `index_path`) /// and is expected to be removed by the caller when ready. 
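    ///
    /// A sketch of what a caller might do once its refs point at the new pack (purely illustrative):
    ///
    /// ```ignore
    /// if let Some(keep) = outcome.keep_path.as_deref() {
    ///     std::fs::remove_file(keep)?; // allow the pack to be collected normally again
    /// }
    /// ```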
pub keep_path: Option, } impl Outcome { /// Instantiate a bundle from the newly written index and data file that are represented by this `Outcome` pub fn to_bundle(&self) -> Option> { self.index_path .as_ref() .map(|path| crate::Bundle::at(path, self.object_hash)) } } pub(crate) type SharedTempFile = Arc>>>; pub(crate) struct PassThrough { pub reader: R, pub writer: Option, } impl io::Read for PassThrough where R: io::Read, { fn read(&mut self, buf: &mut [u8]) -> io::Result { let bytes_read = self.reader.read(buf)?; if let Some(writer) = self.writer.as_mut() { use std::io::Write; writer.lock().write_all(&buf[..bytes_read])?; } Ok(bytes_read) } } impl io::BufRead for PassThrough where R: io::BufRead, { fn fill_buf(&mut self) -> io::Result<&[u8]> { self.reader.fill_buf() } fn consume(&mut self, amt: usize) { self.reader.consume(amt); } } pub(crate) struct LockWriter { pub writer: SharedTempFile, } impl io::Write for LockWriter { fn write(&mut self, buf: &[u8]) -> io::Result { self.writer.lock().write(buf) } fn flush(&mut self) -> io::Result<()> { self.writer.lock().flush() } } impl io::Read for LockWriter { fn read(&mut self, buf: &mut [u8]) -> io::Result { self.writer.lock().get_mut().read(buf) } } impl io::Seek for LockWriter { fn seek(&mut self, pos: SeekFrom) -> io::Result { self.writer.lock().seek(pos) } } gix-pack-0.56.0/src/cache/delta/from_offsets.rs000064400000000000000000000151521046102023000174100ustar 00000000000000use std::{ fs, io, io::{BufRead, Read, Seek, SeekFrom}, sync::atomic::{AtomicBool, Ordering}, time::Instant, }; use gix_features::progress::{self, Progress}; use crate::{cache::delta::Tree, data}; /// Returned by [`Tree::from_offsets_in_pack()`] #[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("{message}")] Io { source: io::Error, message: &'static str }, #[error(transparent)] Header(#[from] crate::data::header::decode::Error), #[error("Could find object with id {id} in this pack. Thin packs are not supported")] UnresolvedRefDelta { id: gix_hash::ObjectId }, #[error(transparent)] Tree(#[from] crate::cache::delta::Error), #[error("Interrupted")] Interrupted, } const PACK_HEADER_LEN: usize = 12; /// Generate tree from certain input impl Tree { /// Create a new `Tree` from any data sorted by offset, ascending as returned by the `data_sorted_by_offsets` iterator. /// * `get_pack_offset(item: &T) -> data::Offset` is a function returning the pack offset of the given item, which can be used /// for obtaining the objects entry within the pack. /// * `pack_path` is the path to the pack file itself and from which to read the entry data, which is a pack file matching the offsets /// returned by `get_pack_offset(…)`. /// * `progress` is used to track progress when creating the tree. /// * `resolve_in_pack_id(gix_hash::oid) -> Option` takes an object ID and tries to resolve it to an object within this pack if /// possible. Failing to do so aborts the operation, and this function is not expected to be called in usual packs. It's a theoretical /// possibility though as old packs might have referred to their objects using the 20 bytes hash, instead of their encoded offset from the base. /// /// Note that the sort order is ascending. The given pack file path must match the provided offsets. 
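    ///
    /// A sketch of the closure shapes only (the item type `E` and its `pack_offset` field are assumptions
    /// about the caller's data, not part of this crate):
    ///
    /// ```ignore
    /// let tree = Tree::from_offsets_in_pack(
    ///     pack_path,
    ///     entries_sorted_by_offset.into_iter(),
    ///     &|e: &E| e.pack_offset,  // where each entry starts in the pack
    ///     &|_id| None,             // only consulted for ref-deltas, which usual packs don't contain
    ///     &mut progress,
    ///     &should_interrupt,
    ///     gix_hash::Kind::Sha1,
    /// )?;
    /// ```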
pub fn from_offsets_in_pack( pack_path: &std::path::Path, data_sorted_by_offsets: impl Iterator, get_pack_offset: &dyn Fn(&T) -> data::Offset, resolve_in_pack_id: &dyn Fn(&gix_hash::oid) -> Option, progress: &mut dyn Progress, should_interrupt: &AtomicBool, object_hash: gix_hash::Kind, ) -> Result { let mut r = io::BufReader::with_capacity( 8192 * 8, // this value directly corresponds to performance, 8k (default) is about 4x slower than 64k fs::File::open(pack_path).map_err(|err| Error::Io { source: err, message: "open pack path", })?, ); let anticipated_num_objects = data_sorted_by_offsets .size_hint() .1 .map(|num_objects| { progress.init(Some(num_objects), progress::count("objects")); num_objects }) .unwrap_or_default(); let mut tree = Tree::with_capacity(anticipated_num_objects)?; { // safety check - assure ourselves it's a pack we can handle let mut buf = [0u8; PACK_HEADER_LEN]; r.read_exact(&mut buf).map_err(|err| Error::Io { source: err, message: "reading header buffer with at least 12 bytes failed - pack file truncated?", })?; crate::data::header::decode(&buf)?; } let then = Instant::now(); let mut previous_cursor_position = None::; let hash_len = object_hash.len_in_bytes(); for (idx, data) in data_sorted_by_offsets.enumerate() { let pack_offset = get_pack_offset(&data); if let Some(previous_offset) = previous_cursor_position { Self::advance_cursor_to_pack_offset(&mut r, pack_offset, previous_offset)?; }; let entry = crate::data::Entry::from_read(&mut r, pack_offset, hash_len).map_err(|err| Error::Io { source: err, message: "EOF while parsing header", })?; previous_cursor_position = Some(pack_offset + entry.header_size() as u64); use crate::data::entry::Header::*; match entry.header { Tree | Blob | Commit | Tag => { tree.add_root(pack_offset, data)?; } RefDelta { base_id } => { resolve_in_pack_id(base_id.as_ref()) .ok_or(Error::UnresolvedRefDelta { id: base_id }) .and_then(|base_pack_offset| { tree.add_child(base_pack_offset, pack_offset, data).map_err(Into::into) })?; } OfsDelta { base_distance } => { let base_pack_offset = pack_offset .checked_sub(base_distance) .expect("in bound distance for deltas"); tree.add_child(base_pack_offset, pack_offset, data)?; } }; progress.inc(); if idx % 10_000 == 0 && should_interrupt.load(Ordering::SeqCst) { return Err(Error::Interrupted); } } progress.show_throughput(then); Ok(tree) } fn advance_cursor_to_pack_offset( r: &mut io::BufReader, pack_offset: u64, previous_offset: u64, ) -> Result<(), Error> { let bytes_to_skip: u64 = pack_offset .checked_sub(previous_offset) .expect("continuously ascending pack offsets"); if bytes_to_skip == 0 { return Ok(()); } let buf = r.fill_buf().map_err(|err| Error::Io { source: err, message: "skip bytes", })?; if buf.is_empty() { // This means we have reached the end of file and can't make progress anymore, before we have satisfied our need // for more return Err(Error::Io { source: io::Error::new( io::ErrorKind::UnexpectedEof, "ran out of bytes before reading desired amount of bytes", ), message: "index file is damaged or corrupt", }); } if bytes_to_skip <= u64::try_from(buf.len()).expect("sensible buffer size") { // SAFETY: bytes_to_skip <= buf.len() <= usize::MAX r.consume(bytes_to_skip as usize); } else { r.seek(SeekFrom::Start(pack_offset)).map_err(|err| Error::Io { source: err, message: "seek to next entry", })?; } Ok(()) } } gix-pack-0.56.0/src/cache/delta/mod.rs000064400000000000000000000012321046102023000154650ustar 00000000000000/// Returned when using various methods on a [`Tree`] 
#[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("Pack offsets must only increment. The previous pack offset was {last_pack_offset}, the current one is {pack_offset}")] InvariantIncreasingPackOffset { /// The last seen pack offset last_pack_offset: crate::data::Offset, /// The invariant violating offset pack_offset: crate::data::Offset, }, } /// pub mod traverse; /// pub mod from_offsets; /// Tree datastructure // kept in separate module to encapsulate unsafety (it has field invariants) mod tree; pub use tree::{Item, Tree}; gix-pack-0.56.0/src/cache/delta/traverse/mod.rs000064400000000000000000000200411046102023000173170ustar 00000000000000use std::sync::atomic::{AtomicBool, Ordering}; use gix_features::{ parallel::in_parallel_with_slice, progress::{self, DynNestedProgress, Progress}, threading, threading::{Mutable, OwnShared}, }; use crate::{ cache::delta::{traverse::util::ItemSliceSync, Item, Tree}, data::EntryRange, }; mod resolve; pub(crate) mod util; /// Returned by [`Tree::traverse()`] #[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("{message}")] ZlibInflate { source: gix_features::zlib::inflate::Error, message: &'static str, }, #[error("The resolver failed to obtain the pack entry bytes for the entry at {pack_offset}")] ResolveFailed { pack_offset: u64 }, #[error(transparent)] EntryType(#[from] crate::data::entry::decode::Error), #[error("One of the object inspectors failed")] Inspect(#[from] Box), #[error("Interrupted")] Interrupted, #[error( "The base at {base_pack_offset} was referred to by a ref-delta, but it was never added to the tree as if the pack was still thin." )] OutOfPackRefDelta { /// The base's offset which was from a resolved ref-delta that didn't actually get added to the tree base_pack_offset: crate::data::Offset, }, #[error("Failed to spawn thread when switching to work-stealing mode")] SpawnThread(#[from] std::io::Error), } /// Additional context passed to the `inspect_object(…)` function of the [`Tree::traverse()`] method. pub struct Context<'a> { /// The pack entry describing the object pub entry: &'a crate::data::Entry, /// The offset at which `entry` ends in the pack, useful to learn about the exact range of `entry` within the pack. pub entry_end: u64, /// The decompressed object itself, ready to be decoded. pub decompressed: &'a [u8], /// The depth at which this object resides in the delta-tree. It represents the amount of base objects, with 0 indicating /// an 'undeltified' object, and higher values indicating delta objects with the given amount of bases. pub level: u16, } /// Options for [`Tree::traverse()`]. pub struct Options<'a, 's> { /// is a progress instance to track progress for each object in the traversal. pub object_progress: Box, /// is a progress instance to track the overall progress. pub size_progress: &'s mut dyn Progress, /// If `Some`, only use the given amount of threads. Otherwise, the amount of threads to use will be selected based on /// the amount of available logical cores. pub thread_limit: Option, /// Abort the operation if the value is `true`. pub should_interrupt: &'a AtomicBool, /// specifies what kind of hashes we expect to be stored in oid-delta entries, which is viable to decoding them /// with the correct size. pub object_hash: gix_hash::Kind, } /// The outcome of [`Tree::traverse()`] pub struct Outcome { /// The items that have no children in the pack, i.e. base objects. pub roots: Vec>, /// The items that children to a root object, i.e. delta objects. 
pub children: Vec>, } impl Tree where T: Send, { /// Traverse this tree of delta objects with a function `inspect_object` to process each object at will. /// /// * `should_run_in_parallel() -> bool` returns true if the underlying pack is big enough to warrant parallel traversal at all. /// * `resolve(EntrySlice, &mut Vec) -> Option<()>` resolves the bytes in the pack for the given `EntrySlice` and stores them in the /// output vector. It returns `Some(())` if the object existed in the pack, or `None` to indicate a resolution error, which would abort the /// operation as well. /// * `pack_entries_end` marks one-past-the-last byte of the last entry in the pack, as the last entries size would otherwise /// be unknown as it's not part of the index file. /// * `inspect_object(node_data: &mut T, progress: Progress, context: Context) -> Result<(), CustomError>` is a function /// running for each thread receiving fully decoded objects along with contextual information, which either succeeds with `Ok(())` /// or returns a `CustomError`. /// Note that `node_data` can be modified to allow storing maintaining computation results on a per-object basis. It should contain /// its own mutable per-thread data as required. /// /// This method returns a vector of all tree items, along with their potentially modified custom node data. /// /// _Note_ that this method consumed the Tree to assure safe parallel traversal with mutation support. pub fn traverse( mut self, resolve: F, resolve_data: &R, pack_entries_end: u64, inspect_object: MBFN, Options { thread_limit, mut object_progress, size_progress, should_interrupt, object_hash, }: Options<'_, '_>, ) -> Result, Error> where F: for<'r> Fn(EntryRange, &'r R) -> Option<&'r [u8]> + Send + Clone, R: Send + Sync, MBFN: FnMut(&mut T, &dyn Progress, Context<'_>) -> Result<(), E> + Send + Clone, E: std::error::Error + Send + Sync + 'static, { self.set_pack_entries_end_and_resolve_ref_offsets(pack_entries_end)?; let num_objects = self.num_items(); let object_counter = { let progress = &mut object_progress; progress.init(Some(num_objects), progress::count("objects")); progress.counter() }; size_progress.init(None, progress::bytes()); let size_counter = size_progress.counter(); let object_progress = OwnShared::new(Mutable::new(object_progress)); let start = std::time::Instant::now(); let (mut root_items, mut child_items_vec) = self.take_root_and_child(); let child_items = ItemSliceSync::new(&mut child_items_vec); let child_items = &child_items; in_parallel_with_slice( &mut root_items, thread_limit, { { let object_progress = object_progress.clone(); move |thread_index| resolve::State { delta_bytes: Vec::::with_capacity(4096), fully_resolved_delta_bytes: Vec::::with_capacity(4096), progress: Box::new( threading::lock(&object_progress).add_child(format!("thread {thread_index}")), ), resolve: resolve.clone(), modify_base: inspect_object.clone(), child_items, } } }, { move |node, state, threads_left, should_interrupt| { // SAFETY: This invariant is upheld since `child_items` and `node` come from the same Tree. // This means we can rely on Tree's invariant that node.children will be the only `children` array in // for nodes in this tree that will contain any of those children. 
#[allow(unsafe_code)] unsafe { resolve::deltas( object_counter.clone(), size_counter.clone(), node, state, resolve_data, object_hash.len_in_bytes(), threads_left, should_interrupt, ) } } }, || (!should_interrupt.load(Ordering::Relaxed)).then(|| std::time::Duration::from_millis(50)), |_| (), )?; threading::lock(&object_progress).show_throughput(start); size_progress.show_throughput(start); Ok(Outcome { roots: root_items, children: child_items_vec, }) } } gix-pack-0.56.0/src/cache/delta/traverse/resolve.rs000064400000000000000000000536361046102023000202370ustar 00000000000000use std::{ collections::BTreeMap, sync::atomic::{AtomicBool, AtomicIsize, Ordering}, }; use gix_features::{progress::Progress, threading, zlib}; use crate::{ cache::delta::{ traverse::{util::ItemSliceSync, Context, Error}, Item, }, data, data::EntryRange, }; mod root { use crate::cache::delta::{traverse::util::ItemSliceSync, Item}; /// An item returned by `iter_root_chunks`, allowing access to the `data` stored alongside nodes in a [`Tree`]. pub(crate) struct Node<'a, T: Send> { // SAFETY INVARIANT: see Node::new(). That function is the only one used // to create or modify these fields. item: &'a mut Item, child_items: &'a ItemSliceSync<'a, Item>, } impl<'a, T: Send> Node<'a, T> { /// SAFETY: `item.children` must uniquely reference elements in child_items that no other currently alive /// item does. All child_items must also have unique children, unless the child_item is itself `item`, /// in which case no other live item should reference it in its `item.children`. /// /// This safety invariant can be reliably upheld by making sure `item` comes from a Tree and `child_items` /// was constructed using that Tree's child_items. This works since Tree has this invariant as well: all /// child_items are referenced at most once (really, exactly once) by a node in the tree. /// /// Note that this invariant is a bit more relaxed than that on `deltas()`, because this function can be called /// for traversal within a child item, which happens in into_child_iter() #[allow(unsafe_code)] pub(super) unsafe fn new(item: &'a mut Item, child_items: &'a ItemSliceSync<'a, Item>) -> Self { Node { item, child_items } } } impl<'a, T: Send> Node<'a, T> { /// Returns the offset into the pack at which the `Node`s data is located. pub fn offset(&self) -> u64 { self.item.offset } /// Returns the slice into the data pack at which the pack entry is located. pub fn entry_slice(&self) -> crate::data::EntryRange { self.item.offset..self.item.next_offset } /// Returns the node data associated with this node. pub fn data(&mut self) -> &mut T { &mut self.item.data } /// Returns true if this node has children, e.g. is not a leaf in the tree. pub fn has_children(&self) -> bool { !self.item.children().is_empty() } /// Transform this `Node` into an iterator over its children. /// /// Children are `Node`s referring to pack entries whose base object is this pack entry. pub fn into_child_iter(self) -> impl Iterator> + 'a { let children = self.child_items; #[allow(unsafe_code)] self.item.children().iter().map(move |&index| { // SAFETY: Due to the invariant on new(), we can rely on these indices // being unique. let item = unsafe { children.get_mut(index as usize) }; // SAFETY: Since every child_item is also required to uphold the uniqueness guarantee, // creating a Node with one of the child_items that we are allowed access to is still fine. 
unsafe { Node::new(item, children) } }) } } } pub(super) struct State<'items, F, MBFN, T: Send> { pub delta_bytes: Vec, pub fully_resolved_delta_bytes: Vec, pub progress: Box, pub resolve: F, pub modify_base: MBFN, pub child_items: &'items ItemSliceSync<'items, Item>, } /// SAFETY: `item.children` must uniquely reference elements in child_items that no other currently alive /// item does. All child_items must also have unique children. /// /// This safety invariant can be reliably upheld by making sure `item` comes from a Tree and `child_items` /// was constructed using that Tree's child_items. This works since Tree has this invariant as well: all /// child_items are referenced at most once (really, exactly once) by a node in the tree. #[allow(clippy::too_many_arguments, unsafe_code)] #[deny(unsafe_op_in_unsafe_fn)] // this is a big function, require unsafe for the one small unsafe op we have pub(super) unsafe fn deltas( objects: gix_features::progress::StepShared, size: gix_features::progress::StepShared, item: &mut Item, State { delta_bytes, fully_resolved_delta_bytes, progress, resolve, modify_base, child_items, }: &mut State<'_, F, MBFN, T>, resolve_data: &R, hash_len: usize, threads_left: &AtomicIsize, should_interrupt: &AtomicBool, ) -> Result<(), Error> where T: Send, R: Send + Sync, F: for<'r> Fn(EntryRange, &'r R) -> Option<&'r [u8]> + Send + Clone, MBFN: FnMut(&mut T, &dyn Progress, Context<'_>) -> Result<(), E> + Send + Clone, E: std::error::Error + Send + Sync + 'static, { let mut decompressed_bytes_by_pack_offset = BTreeMap::new(); let mut inflate = zlib::Inflate::default(); let mut decompress_from_resolver = |slice: EntryRange, out: &mut Vec| -> Result<(data::Entry, u64), Error> { let bytes = resolve(slice.clone(), resolve_data).ok_or(Error::ResolveFailed { pack_offset: slice.start, })?; let entry = data::Entry::from_bytes(bytes, slice.start, hash_len)?; let compressed = &bytes[entry.header_size()..]; let decompressed_len = entry.decompressed_size as usize; decompress_all_at_once_with(&mut inflate, compressed, decompressed_len, out)?; Ok((entry, slice.end)) }; // each node is a base, and its children always start out as deltas which become a base after applying them. // These will be pushed onto our stack until all are processed let root_level = 0; // SAFETY: This invariant is required from the caller #[allow(unsafe_code)] let root_node = unsafe { root::Node::new(item, child_items) }; let mut nodes: Vec<_> = vec![(root_level, root_node)]; while let Some((level, mut base)) = nodes.pop() { if should_interrupt.load(Ordering::Relaxed) { return Err(Error::Interrupted); } let (base_entry, entry_end, base_bytes) = if level == root_level { let mut buf = Vec::new(); let (a, b) = decompress_from_resolver(base.entry_slice(), &mut buf)?; (a, b, buf) } else { decompressed_bytes_by_pack_offset .remove(&base.offset()) .expect("we store the resolved delta buffer when done") }; // anything done here must be repeated further down for leaf-nodes. // This way we avoid retaining their decompressed memory longer than needed (they have no children, // thus their memory can be released right away, using 18% less peak memory on the linux kernel). 
{ modify_base( base.data(), progress, Context { entry: &base_entry, entry_end, decompressed: &base_bytes, level, }, ) .map_err(|err| Box::new(err) as Box)?; objects.fetch_add(1, Ordering::Relaxed); size.fetch_add(base_bytes.len(), Ordering::Relaxed); } for mut child in base.into_child_iter() { let (mut child_entry, entry_end) = decompress_from_resolver(child.entry_slice(), delta_bytes)?; let (base_size, consumed) = data::delta::decode_header_size(delta_bytes); let mut header_ofs = consumed; assert_eq!( base_bytes.len(), base_size as usize, "recorded base size in delta does match the actual one" ); let (result_size, consumed) = data::delta::decode_header_size(&delta_bytes[consumed..]); header_ofs += consumed; fully_resolved_delta_bytes.resize(result_size as usize, 0); data::delta::apply(&base_bytes, fully_resolved_delta_bytes, &delta_bytes[header_ofs..]); // FIXME: this actually invalidates the "pack_offset()" computation, which is not obvious to consumers // at all child_entry.header = base_entry.header; // assign the actual object type, instead of 'delta' if child.has_children() { decompressed_bytes_by_pack_offset.insert( child.offset(), (child_entry, entry_end, std::mem::take(fully_resolved_delta_bytes)), ); nodes.push((level + 1, child)); } else { modify_base( child.data(), &progress, Context { entry: &child_entry, entry_end, decompressed: fully_resolved_delta_bytes, level: level + 1, }, ) .map_err(|err| Box::new(err) as Box)?; objects.fetch_add(1, Ordering::Relaxed); size.fetch_add(base_bytes.len(), Ordering::Relaxed); } } // After the first round, see if we can use additional threads, and if so we enter multi-threaded mode. // In it we will keep using new threads as they become available while using this thread for coordination. // We optimize for a low memory footprint as we are likely to get here if long delta-chains with large objects are involved. // Try to avoid going into threaded mode if there isn't more than one unit of work anyway. if nodes.len() > 1 { if let Ok(initial_threads) = threads_left.fetch_update(Ordering::SeqCst, Ordering::SeqCst, |threads_available| { (threads_available > 0).then_some(0) }) { // Assure no memory is held here. *delta_bytes = Vec::new(); *fully_resolved_delta_bytes = Vec::new(); return deltas_mt( initial_threads, decompressed_bytes_by_pack_offset, objects, size, &progress, nodes, resolve.clone(), resolve_data, modify_base.clone(), hash_len, threads_left, should_interrupt, ); } } } Ok(()) } /// * `initial_threads` is the threads we may spawn, not accounting for our own thread which is still considered used by the parent /// system. Since this thread will take a controlling function, we may spawn one more than that. In threaded mode, we will finish /// all remaining work. 
#[allow(clippy::too_many_arguments)] fn deltas_mt( mut threads_to_create: isize, decompressed_bytes_by_pack_offset: BTreeMap)>, objects: gix_features::progress::StepShared, size: gix_features::progress::StepShared, progress: &dyn Progress, nodes: Vec<(u16, root::Node<'_, T>)>, resolve: F, resolve_data: &R, modify_base: MBFN, hash_len: usize, threads_left: &AtomicIsize, should_interrupt: &AtomicBool, ) -> Result<(), Error> where T: Send, R: Send + Sync, F: for<'r> Fn(EntryRange, &'r R) -> Option<&'r [u8]> + Send + Clone, MBFN: FnMut(&mut T, &dyn Progress, Context<'_>) -> Result<(), E> + Send + Clone, E: std::error::Error + Send + Sync + 'static, { let nodes = gix_features::threading::Mutable::new(nodes); let decompressed_bytes_by_pack_offset = gix_features::threading::Mutable::new(decompressed_bytes_by_pack_offset); threads_to_create += 1; // ourselves let mut returned_ourselves = false; gix_features::parallel::threads(|s| -> Result<(), Error> { let mut threads = Vec::new(); let poll_interval = std::time::Duration::from_millis(100); loop { for tid in 0..threads_to_create { let thread = gix_features::parallel::build_thread() .name(format!("gix-pack.traverse_deltas.{tid}")) .spawn_scoped(s, { let nodes = &nodes; let decompressed_bytes_by_pack_offset = &decompressed_bytes_by_pack_offset; let resolve = resolve.clone(); let mut modify_base = modify_base.clone(); let objects = &objects; let size = &size; move || -> Result<(), Error> { let mut fully_resolved_delta_bytes = Vec::new(); let mut delta_bytes = Vec::new(); let mut inflate = zlib::Inflate::default(); let mut decompress_from_resolver = |slice: EntryRange, out: &mut Vec| -> Result<(data::Entry, u64), Error> { let bytes = resolve(slice.clone(), resolve_data).ok_or(Error::ResolveFailed { pack_offset: slice.start, })?; let entry = data::Entry::from_bytes(bytes, slice.start, hash_len)?; let compressed = &bytes[entry.header_size()..]; let decompressed_len = entry.decompressed_size as usize; decompress_all_at_once_with(&mut inflate, compressed, decompressed_len, out)?; Ok((entry, slice.end)) }; loop { let (level, mut base) = match threading::lock(nodes).pop() { Some(v) => v, None => break, }; if should_interrupt.load(Ordering::Relaxed) { return Err(Error::Interrupted); } let (base_entry, entry_end, base_bytes) = if level == 0 { let mut buf = Vec::new(); let (a, b) = decompress_from_resolver(base.entry_slice(), &mut buf)?; (a, b, buf) } else { threading::lock(decompressed_bytes_by_pack_offset) .remove(&base.offset()) .expect("we store the resolved delta buffer when done") }; // anything done here must be repeated further down for leaf-nodes. // This way we avoid retaining their decompressed memory longer than needed (they have no children, // thus their memory can be released right away, using 18% less peak memory on the linux kernel). 
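// A standalone sketch (std only) of the shared work-list shape used in this function: each worker
// repeatedly pops a unit from a mutex-protected LIFO and stops once it is drained.
// `std::thread::scope` stands in for the scoped-thread machinery of `gix_features::parallel`.
fn _sketch_shared_worklist() {
    use std::sync::{
        atomic::{AtomicUsize, Ordering},
        Mutex,
    };

    let nodes: Mutex<Vec<u32>> = Mutex::new((0..100).collect());
    let processed = AtomicUsize::new(0);

    std::thread::scope(|s| {
        for _ in 0..4 {
            s.spawn(|| loop {
                // hold the lock only while popping, never while working on the unit
                let Some(unit) = nodes.lock().expect("not poisoned").pop() else {
                    break;
                };
                let _ = unit; // stand-in for decompressing the entry and applying its delta
                processed.fetch_add(1, Ordering::Relaxed);
            });
        }
    });
    assert_eq!(processed.load(Ordering::Relaxed), 100);
}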
{ modify_base( base.data(), progress, Context { entry: &base_entry, entry_end, decompressed: &base_bytes, level, }, ) .map_err(|err| Box::new(err) as Box)?; objects.fetch_add(1, Ordering::Relaxed); size.fetch_add(base_bytes.len(), Ordering::Relaxed); } for mut child in base.into_child_iter() { let (mut child_entry, entry_end) = decompress_from_resolver(child.entry_slice(), &mut delta_bytes)?; let (base_size, consumed) = data::delta::decode_header_size(&delta_bytes); let mut header_ofs = consumed; assert_eq!( base_bytes.len(), base_size as usize, "recorded base size in delta does match the actual one" ); let (result_size, consumed) = data::delta::decode_header_size(&delta_bytes[consumed..]); header_ofs += consumed; fully_resolved_delta_bytes.resize(result_size as usize, 0); data::delta::apply( &base_bytes, &mut fully_resolved_delta_bytes, &delta_bytes[header_ofs..], ); // FIXME: this actually invalidates the "pack_offset()" computation, which is not obvious to consumers // at all child_entry.header = base_entry.header; // assign the actual object type, instead of 'delta' if child.has_children() { threading::lock(decompressed_bytes_by_pack_offset).insert( child.offset(), (child_entry, entry_end, std::mem::take(&mut fully_resolved_delta_bytes)), ); threading::lock(nodes).push((level + 1, child)); } else { modify_base( child.data(), progress, Context { entry: &child_entry, entry_end, decompressed: &fully_resolved_delta_bytes, level: level + 1, }, ) .map_err(|err| Box::new(err) as Box)?; objects.fetch_add(1, Ordering::Relaxed); size.fetch_add(base_bytes.len(), Ordering::Relaxed); } } } Ok(()) } })?; threads.push(thread); } if threads_left .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |threads_available: isize| { (threads_available > 0).then(|| { threads_to_create = threads_available.min(threading::lock(&nodes).len() as isize); threads_available - threads_to_create }) }) .is_err() { threads_to_create = 0; } // What we really want to do is either wait for one of our threads to go down // or for another scheduled thread to become available. Unfortunately we can't do that, // but may instead find a good way to set the polling interval instead of hard-coding it. std::thread::sleep(poll_interval); // Get out of threads are already starving or they would be starving soon as no work is left. // // Lint: ScopedJoinHandle is not the same depending on active features and is not exposed in some cases. #[allow(clippy::redundant_closure_for_method_calls)] if threads.iter().any(|t| t.is_finished()) { let mut running_threads = Vec::new(); for thread in threads.drain(..) 
{ if thread.is_finished() { match thread.join() { Ok(Err(err)) => return Err(err), Ok(Ok(())) => { if !returned_ourselves { returned_ourselves = true; } else { threads_left.fetch_add(1, Ordering::SeqCst); } } Err(err) => { std::panic::resume_unwind(err); } } } else { running_threads.push(thread); } } if running_threads.is_empty() && threading::lock(&nodes).is_empty() { break; } threads = running_threads; } } Ok(()) }) } fn decompress_all_at_once_with( inflate: &mut zlib::Inflate, b: &[u8], decompressed_len: usize, out: &mut Vec, ) -> Result<(), Error> { out.resize(decompressed_len, 0); inflate.reset(); inflate.once(b, out).map_err(|err| Error::ZlibInflate { source: err, message: "Failed to decompress entry", })?; Ok(()) } gix-pack-0.56.0/src/cache/delta/traverse/util.rs000064400000000000000000000051471046102023000175270ustar 00000000000000use std::marker::PhantomData; /// SAFETY: This type is used to allow access to a size-optimized vec of items that form a /// tree, and we need to access it concurrently with each thread taking its own root node, /// and working its way through all the reachable leaves. /// /// The tree was built by decoding a pack whose entries refer to its bases only by OFS_DELTA - /// they are pointing backwards only which assures bases have to be listed first, and that each entry /// only has a single parent. /// /// REF_DELTA entries aren't supported here, and cause immediate failure - they are expected to have /// been resolved before as part of the thin-pack handling. /// /// If we somehow would allow REF_DELTA entries to point to an in-pack object, then in theory malicious packs could /// cause all kinds of graphs as they can point anywhere in the pack, but they still can't link an entry to /// more than one base. And that's what one would really have to do for two threads to encounter the same child. /// /// Thus I believe it's impossible for this data structure to end up in a place where it violates its assumption. pub(super) struct ItemSliceSync<'a, T> where T: Send, { items: *mut T, #[cfg(debug_assertions)] len: usize, phantom: PhantomData<&'a mut T>, } impl<'a, T> ItemSliceSync<'a, T> where T: Send, { pub(super) fn new(items: &'a mut [T]) -> Self { ItemSliceSync { items: items.as_mut_ptr(), #[cfg(debug_assertions)] len: items.len(), phantom: PhantomData, } } // SAFETY: The index must point into the slice and must not be reused concurrently. 
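// For contrast, a standalone sketch (std only) of the same "disjoint &mut access from several
// threads" idea for the simple contiguous case, using `split_at_mut`. `ItemSliceSync` exists
// because the tree hands out arbitrary disjoint indices, which cannot be expressed as one split.
#[allow(dead_code)]
fn _sketch_disjoint_mut_access() {
    let mut items = vec![0u32; 8];
    let (left, right) = items.split_at_mut(4);
    std::thread::scope(|s| {
        s.spawn(move || left.iter_mut().for_each(|v| *v += 1));
        s.spawn(move || right.iter_mut().for_each(|v| *v += 2));
    });
    assert_eq!(items, [1, 1, 1, 1, 2, 2, 2, 2]);
}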
#[allow(unsafe_code)] pub(super) unsafe fn get_mut(&self, index: usize) -> &'a mut T { #[cfg(debug_assertions)] if index >= self.len { panic!("index out of bounds: the len is {} but the index is {index}", self.len); } // SAFETY: // - The index is within the slice (required by documentation) // - We have mutable access to `items` as ensured by Self::new() // - This is the only method on this type giving access to items // - The documentation requires that this access is unique unsafe { &mut *self.items.add(index) } } } // SAFETY: This is logically an &mut T, which is Send if T is Send // (note: this is different from &T, which also needs T: Sync) #[allow(unsafe_code)] unsafe impl Send for ItemSliceSync<'_, T> where T: Send {} // SAFETY: This is logically an &mut T, which is Sync if T is Sync #[allow(unsafe_code)] unsafe impl Sync for ItemSliceSync<'_, T> where T: Send {} gix-pack-0.56.0/src/cache/delta/tree.rs000064400000000000000000000252021046102023000156500ustar 00000000000000use super::{traverse, Error}; use crate::exact_vec; /// An item stored within the [`Tree`] whose data is stored in a pack file, identified by /// the offset of its first (`offset`) and last (`next_offset`) bytes. /// /// It represents either a root entry, or one that relies on a base to be resolvable, /// alongside associated `data` `T`. pub struct Item { /// The offset into the pack file at which the pack entry's data is located. pub offset: crate::data::Offset, /// The offset of the next item in the pack file. pub next_offset: crate::data::Offset, /// Data to store with each Item, effectively data associated with each entry in a pack. pub data: T, /// Indices into our Tree's `items`, one for each pack entry that depends on us. /// /// Limited to u32 as that's the maximum amount of objects in a pack. // SAFETY INVARIANT: // - only one Item in a tree may have any given child index. `future_child_offsets` // should also not contain any indices found in `children`.\ // - These indices should be in bounds for tree.child_items children: Vec, } impl Item { /// Get the children // (we don't want to expose mutable access) pub fn children(&self) -> &[u32] { &self.children } } /// Identify what kind of node we have last seen enum NodeKind { Root, Child, } /// A tree that allows one-time iteration over all nodes and their children, consuming it in the process, /// while being shareable among threads without a lock. /// It does this by making the guarantee that iteration only happens once. pub struct Tree { /// The root nodes, i.e. base objects // SAFETY invariant: see Item.children root_items: Vec>, /// The child nodes, i.e. those that rely a base object, like ref and ofs delta objects // SAFETY invariant: see Item.children child_items: Vec>, /// The last encountered node was either a root or a child. last_seen: Option, /// Future child offsets, associating their offset into the pack with their index in the items array. /// (parent_offset, child_index) // SAFETY invariant: // - None of these child indices should already have parents // i.e. future_child_offsets[i].1 should never be also found // in Item.children. Indices should be found here at most once. // - These indices should be in bounds for tree.child_items. future_child_offsets: Vec<(crate::data::Offset, usize)>, } impl Tree { /// Instantiate a empty tree capable of storing `num_objects` amounts of items. 
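// A hypothetical usage sketch (not part of the original crate): entries are added in increasing
// pack-offset order, full objects via `add_root()` and deltas via `add_child()`, which links the
// child to the entry found at `base_offset`.
#[allow(dead_code)]
fn _sketch_build_tree() -> Result<(), Error> {
    let mut tree = Tree::with_capacity(3)?;
    tree.add_root(12, "a full object at pack offset 12")?;
    tree.add_child(12, 40, "an ofs-delta at offset 40 whose base lives at offset 12")?;
    tree.add_child(40, 64, "a delta may itself be the base of another delta")?;
    assert_eq!(tree.num_items(), 3);
    Ok(())
}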
pub fn with_capacity(num_objects: usize) -> Result { Ok(Tree { root_items: exact_vec(num_objects / 2), child_items: exact_vec(num_objects / 2), last_seen: None, future_child_offsets: Vec::new(), }) } pub(super) fn num_items(&self) -> usize { self.root_items.len() + self.child_items.len() } /// Returns self's root and child items. /// /// You can rely on them following the same `children` invariants as they did in the tree pub(super) fn take_root_and_child(self) -> (Vec>, Vec>) { (self.root_items, self.child_items) } pub(super) fn assert_is_incrementing_and_update_next_offset( &mut self, offset: crate::data::Offset, ) -> Result<(), Error> { let items = match &self.last_seen { Some(NodeKind::Root) => &mut self.root_items, Some(NodeKind::Child) => &mut self.child_items, None => return Ok(()), }; let item = &mut items.last_mut().expect("last seen won't lie"); if offset <= item.offset { return Err(Error::InvariantIncreasingPackOffset { last_pack_offset: item.offset, pack_offset: offset, }); } item.next_offset = offset; Ok(()) } pub(super) fn set_pack_entries_end_and_resolve_ref_offsets( &mut self, pack_entries_end: crate::data::Offset, ) -> Result<(), traverse::Error> { if !self.future_child_offsets.is_empty() { for (parent_offset, child_index) in self.future_child_offsets.drain(..) { // SAFETY invariants upheld: // - We are draining from future_child_offsets and adding to children, keeping things the same. // - We can rely on the `future_child_offsets` invariant to be sure that `children` is // not getting any indices that are already in use in `children` elsewhere // - The indices are in bounds for child_items since they were in bounds for future_child_offsets, // we can carry over the invariant. if let Ok(i) = self.child_items.binary_search_by_key(&parent_offset, |i| i.offset) { self.child_items[i].children.push(child_index as u32); } else if let Ok(i) = self.root_items.binary_search_by_key(&parent_offset, |i| i.offset) { self.root_items[i].children.push(child_index as u32); } else { return Err(traverse::Error::OutOfPackRefDelta { base_pack_offset: parent_offset, }); } } } self.assert_is_incrementing_and_update_next_offset(pack_entries_end) .expect("BUG: pack now is smaller than all previously seen entries"); Ok(()) } /// Add a new root node, one that only has children but is not a child itself, at the given pack `offset` and associate /// custom `data` with it. pub fn add_root(&mut self, offset: crate::data::Offset, data: T) -> Result<(), Error> { self.assert_is_incrementing_and_update_next_offset(offset)?; self.last_seen = NodeKind::Root.into(); self.root_items.push(Item { offset, next_offset: 0, data, // SAFETY INVARIANT upheld: there are no children children: Default::default(), }); Ok(()) } /// Add a child of the item at `base_offset` which itself resides at pack `offset` and associate custom `data` with it. pub fn add_child( &mut self, base_offset: crate::data::Offset, offset: crate::data::Offset, data: T, ) -> Result<(), Error> { self.assert_is_incrementing_and_update_next_offset(offset)?; let next_child_index = self.child_items.len(); // SAFETY INVARIANT upheld: // - This is one of two methods that modifies `children` and future_child_offsets. Out // of the two, it is the only one that produces new indices in the system. // - This always pushes next_child_index to *either* `children` or `future_child_offsets`, // maintaining the cross-field invariant there. // - This method will always push to child_items (at the end), incrementing // future values of next_child_index. 
This means next_child_index is always // unique for this method call. // - As the only method producing new indices, this is the only time // next_child_index will be added to children/future_child_offsets, upholding the invariant. // - Since next_child_index will always be a valid index by the end of this method, // this always produces valid in-bounds indices, upholding the bounds invariant. if let Ok(i) = self.child_items.binary_search_by_key(&base_offset, |i| i.offset) { self.child_items[i].children.push(next_child_index as u32); } else if let Ok(i) = self.root_items.binary_search_by_key(&base_offset, |i| i.offset) { self.root_items[i].children.push(next_child_index as u32); } else { self.future_child_offsets.push((base_offset, next_child_index)); } self.last_seen = NodeKind::Child.into(); self.child_items.push(Item { offset, next_offset: 0, data, // SAFETY INVARIANT upheld: there are no children children: Default::default(), }); Ok(()) } } #[cfg(test)] mod tests { mod from_offsets_in_pack { use std::sync::atomic::AtomicBool; use crate as pack; const SMALL_PACK_INDEX: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx"; const SMALL_PACK: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack"; const INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx"; const PACK_FOR_INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack"; use gix_testtools::fixture_path; #[test] fn v1() -> Result<(), Box> { tree(INDEX_V1, PACK_FOR_INDEX_V1) } #[test] fn v2() -> Result<(), Box> { tree(SMALL_PACK_INDEX, SMALL_PACK) } fn tree(index_path: &str, pack_path: &str) -> Result<(), Box> { let idx = pack::index::File::at(fixture_path(index_path), gix_hash::Kind::Sha1)?; crate::cache::delta::Tree::from_offsets_in_pack( &fixture_path(pack_path), idx.sorted_offsets().into_iter(), &|ofs| *ofs, &|id| idx.lookup(id).map(|index| idx.pack_offset_at_index(index)), &mut gix_features::progress::Discard, &AtomicBool::new(false), gix_hash::Kind::Sha1, )?; Ok(()) } } mod size { use super::super::Item; use gix_testtools::size_ok; #[test] fn size_of_pack_tree_item() { let actual = std::mem::size_of::<[Item<()>; 7_500_000]>(); let expected = 300_000_000; assert!( size_ok(actual, expected), "we don't want these to grow unnoticed: {actual} <~ {expected}" ); } #[test] fn size_of_pack_verify_data_structure() { pub struct EntryWithDefault { _index_entry: crate::index::Entry, _kind: gix_object::Kind, _object_size: u64, _decompressed_size: u64, _compressed_size: u64, _header_size: u16, _level: u16, } let actual = std::mem::size_of::<[Item; 7_500_000]>(); let expected = 840_000_000; assert!( size_ok(actual, expected), "we don't want these to grow unnoticed: {actual} <~ {expected}" ); } } } gix-pack-0.56.0/src/cache/lru.rs000064400000000000000000000207121046102023000144230ustar 00000000000000use super::DecodeEntry; #[cfg(feature = "pack-cache-lru-dynamic")] mod memory { use super::DecodeEntry; use crate::cache::set_vec_to_slice; use clru::WeightScale; use std::num::NonZeroUsize; struct Entry { data: Vec, kind: gix_object::Kind, compressed_size: usize, } type Key = (u32, u64); struct CustomScale; impl WeightScale for CustomScale { fn weight(&self, _key: &Key, value: &Entry) -> usize { value.data.len() } } /// An LRU cache with hash map backing and an eviction rule based on the memory usage for object data in bytes. 
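// A standalone sketch (std only) of the buffer-reuse idea employed by the caches in this module:
// evicted `Vec`s are parked on a free-list so the next `put()` can recycle their allocation
// instead of allocating from scratch.
#[allow(dead_code)]
fn _sketch_free_list_reuse() {
    let mut free_list: Vec<Vec<u8>> = Vec::new();

    // store: recycle a previously evicted buffer if one is available
    let mut buf = free_list.pop().unwrap_or_default();
    buf.clear();
    buf.extend_from_slice(b"decoded object data");

    // evict: keep the allocation around so the next store can reuse it
    free_list.push(buf);
    assert!(free_list[0].capacity() >= b"decoded object data".len());
}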
pub struct MemoryCappedHashmap { inner: clru::CLruCache, free_list: Vec>, debug: gix_features::cache::Debug, } impl MemoryCappedHashmap { /// Return a new instance which evicts least recently used items if it uses more than `memory_cap_in_bytes` /// object data. pub fn new(memory_cap_in_bytes: usize) -> MemoryCappedHashmap { MemoryCappedHashmap { inner: clru::CLruCache::with_config( clru::CLruCacheConfig::new(NonZeroUsize::new(memory_cap_in_bytes).expect("non zero")) .with_scale(CustomScale), ), free_list: Vec::new(), debug: gix_features::cache::Debug::new(format!("MemoryCappedHashmap({memory_cap_in_bytes}B)")), } } } impl DecodeEntry for MemoryCappedHashmap { fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: gix_object::Kind, compressed_size: usize) { self.debug.put(); let Some(data) = set_vec_to_slice(self.free_list.pop().unwrap_or_default(), data) else { return; }; let res = self.inner.put_with_weight( (pack_id, offset), Entry { data, kind, compressed_size, }, ); match res { Ok(Some(previous_entry)) => self.free_list.push(previous_entry.data), Ok(None) => {} Err((_key, value)) => self.free_list.push(value.data), } } fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec) -> Option<(gix_object::Kind, usize)> { let res = self.inner.get(&(pack_id, offset)).and_then(|e| { set_vec_to_slice(out, &e.data)?; Some((e.kind, e.compressed_size)) }); if res.is_some() { self.debug.hit(); } else { self.debug.miss(); } res } } } #[cfg(feature = "pack-cache-lru-dynamic")] pub use memory::MemoryCappedHashmap; #[cfg(feature = "pack-cache-lru-static")] mod _static { use super::DecodeEntry; use crate::cache::set_vec_to_slice; struct Entry { pack_id: u32, offset: u64, data: Vec, kind: gix_object::Kind, compressed_size: usize, } /// A cache using a least-recently-used implementation capable of storing the `SIZE` most recent objects. /// The cache must be small as the search is 'naive' and the underlying data structure is a linked list. /// Values of 64 seem to improve performance. pub struct StaticLinkedList { inner: uluru::LRUCache, last_evicted: Vec, debug: gix_features::cache::Debug, /// the amount of bytes we are currently holding, taking into account the capacities of all Vecs we keep. mem_used: usize, /// The total amount of memory we should be able to hold with all entries combined. mem_limit: usize, } impl StaticLinkedList { /// Create a new list with a memory limit of `mem_limit` in bytes. If 0, there is no memory limit. pub fn new(mem_limit: usize) -> Self { StaticLinkedList { inner: Default::default(), last_evicted: Vec::new(), debug: gix_features::cache::Debug::new(format!("StaticLinkedList<{SIZE}>")), mem_used: 0, mem_limit: if mem_limit == 0 { usize::MAX } else { mem_limit }, } } } impl Default for StaticLinkedList { fn default() -> Self { Self::new(96 * 1024 * 1024) } } impl DecodeEntry for StaticLinkedList { fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: gix_object::Kind, compressed_size: usize) { // We cannot possibly hold this much. if data.len() > self.mem_limit { return; } // If we could hold it but are at limit, all we can do is make space. let mem_free = self.mem_limit - self.mem_used; if data.len() > mem_free { // prefer freeing free-lists instead of clearing our cache let free_list_cap = self.last_evicted.len(); self.last_evicted = Vec::new(); // still not enough? 
clear everything if data.len() > mem_free + free_list_cap { self.inner.clear(); self.mem_used = 0; } else { self.mem_used -= free_list_cap; } } self.debug.put(); let mut v = std::mem::take(&mut self.last_evicted); self.mem_used -= v.capacity(); if set_vec_to_slice(&mut v, data).is_none() { return; } self.mem_used += v.capacity(); if let Some(previous) = self.inner.insert(Entry { offset, pack_id, data: v, kind, compressed_size, }) { // No need to adjust capacity as we already counted it. self.last_evicted = previous.data; } } fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec) -> Option<(gix_object::Kind, usize)> { let res = self.inner.lookup(|e: &mut Entry| { if e.pack_id == pack_id && e.offset == offset { set_vec_to_slice(&mut *out, &e.data)?; Some((e.kind, e.compressed_size)) } else { None } }); if res.is_some() { self.debug.hit(); } else { self.debug.miss(); } res } } #[cfg(test)] mod tests { use super::*; #[test] fn no_limit() { let c = StaticLinkedList::<10>::new(0); assert_eq!( c.mem_limit, usize::MAX, "zero is automatically turned into a large limit that is equivalent to unlimited" ); } #[test] fn journey() { let mut c = StaticLinkedList::<10>::new(100); assert_eq!(c.mem_limit, 100); assert_eq!(c.mem_used, 0); // enough memory for normal operation let mut last_mem_used = 0; for _ in 0..10 { c.put(0, 0, &[0], gix_object::Kind::Blob, 1); assert!(c.mem_used > last_mem_used); last_mem_used = c.mem_used; } assert_eq!(c.mem_used, 80, "there is a minimal vec size"); assert_eq!(c.inner.len(), 10); assert_eq!(c.last_evicted.len(), 0); c.put(0, 0, &(0..20).collect::>(), gix_object::Kind::Blob, 1); assert_eq!(c.inner.len(), 10); assert_eq!(c.mem_used, 80 + 20); assert_eq!(c.last_evicted.len(), 1); c.put(0, 0, &(0..50).collect::>(), gix_object::Kind::Blob, 1); assert_eq!(c.inner.len(), 1, "cache clearance wasn't necessary"); assert_eq!(c.last_evicted.len(), 0, "the free list was cleared"); assert_eq!(c.mem_used, 50); c.put(0, 0, &(0..101).collect::>(), gix_object::Kind::Blob, 1); assert_eq!( c.inner.len(), 1, "objects that won't ever fit within the memory limit are ignored" ); } } } #[cfg(feature = "pack-cache-lru-static")] pub use _static::StaticLinkedList; gix-pack-0.56.0/src/cache/mod.rs000064400000000000000000000055531046102023000144060ustar 00000000000000use std::ops::DerefMut; use gix_object::Kind; /// A trait to model putting objects at a given pack `offset` into a cache, and fetching them. /// /// It is used to speed up [pack traversals][crate::index::File::traverse()]. pub trait DecodeEntry { /// Store a fully decoded object at `offset` of `kind` with `compressed_size` and `data` in the cache. /// /// It is up to the cache implementation whether that actually happens or not. fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: gix_object::Kind, compressed_size: usize); /// Attempt to fetch the object at `offset` and store its decoded bytes in `out`, as previously stored with [`DecodeEntry::put()`], and return /// its (object `kind`, `decompressed_size`) fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec) -> Option<(gix_object::Kind, usize)>; } /// A cache that stores nothing and retrieves nothing, thus it _never_ caches. 
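// A hypothetical single-slot implementation, purely to illustrate the `DecodeEntry` contract above;
// the real caches live in the `lru` module. It remembers exactly one `(pack_id, offset)` entry.
#[allow(dead_code)]
#[derive(Default)]
struct SingleSlot {
    slot: Option<((u32, u64), Vec<u8>, gix_object::Kind, usize)>,
}

impl DecodeEntry for SingleSlot {
    fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: gix_object::Kind, compressed_size: usize) {
        self.slot = Some(((pack_id, offset), data.to_vec(), kind, compressed_size));
    }
    fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec<u8>) -> Option<(gix_object::Kind, usize)> {
        match &self.slot {
            Some((key, data, kind, compressed_size)) if *key == (pack_id, offset) => {
                out.clear();
                out.extend_from_slice(data);
                Some((*kind, *compressed_size))
            }
            _ => None,
        }
    }
}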
#[derive(Default)] pub struct Never; impl DecodeEntry for Never { fn put(&mut self, _pack_id: u32, _offset: u64, _data: &[u8], _kind: gix_object::Kind, _compressed_size: usize) {} fn get(&mut self, _pack_id: u32, _offset: u64, _out: &mut Vec) -> Option<(gix_object::Kind, usize)> { None } } impl DecodeEntry for Box { fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: Kind, compressed_size: usize) { self.deref_mut().put(pack_id, offset, data, kind, compressed_size); } fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec) -> Option<(Kind, usize)> { self.deref_mut().get(pack_id, offset, out) } } /// A way of storing and retrieving entire objects to and from a cache. pub trait Object { /// Put the object going by `id` of `kind` with `data` into the cache. fn put(&mut self, id: gix_hash::ObjectId, kind: gix_object::Kind, data: &[u8]); /// Try to retrieve the object named `id` and place its data into `out` if available and return `Some(kind)` if found. fn get(&mut self, id: &gix_hash::ObjectId, out: &mut Vec) -> Option; } /// Various implementations of [`DecodeEntry`] using least-recently-used algorithms. #[cfg(any(feature = "pack-cache-lru-dynamic", feature = "pack-cache-lru-static"))] pub mod lru; pub mod object; /// pub(crate) mod delta; /// Replaces content of the given `Vec` with the slice. The vec will have the same length /// as the slice. The vec can be either `&mut Vec` or `Vec`. /// Returns `None` if no memory could be allocated. #[cfg(any( feature = "pack-cache-lru-static", feature = "pack-cache-lru-dynamic", feature = "object-cache-dynamic" ))] fn set_vec_to_slice>>(mut vec: V, source: &[u8]) -> Option { let out = vec.borrow_mut(); out.clear(); out.try_reserve(source.len()).ok()?; out.extend_from_slice(source); Some(vec) } gix-pack-0.56.0/src/cache/object.rs000064400000000000000000000076651046102023000151030ustar 00000000000000//! This module is a bit 'misplaced' if spelled out like '`gix_pack::cache::object::`*' but is best placed here for code reuse and //! general usefulness. use crate::cache; #[cfg(feature = "object-cache-dynamic")] mod memory { use crate::{cache, cache::set_vec_to_slice}; use clru::WeightScale; use std::num::NonZeroUsize; struct Entry { data: Vec, kind: gix_object::Kind, } type Key = gix_hash::ObjectId; struct CustomScale; impl WeightScale for CustomScale { fn weight(&self, key: &Key, value: &Entry) -> usize { value.data.len() + std::mem::size_of::() + key.as_bytes().len() } } /// An LRU cache with hash map backing and an eviction rule based on the memory usage for object data in bytes. pub struct MemoryCappedHashmap { inner: clru::CLruCache, free_list: Vec>, debug: gix_features::cache::Debug, } impl MemoryCappedHashmap { /// The amount of bytes we can hold in total, or the value we saw in `new(…)`. pub fn capacity(&self) -> usize { self.inner.capacity() } /// Return a new instance which evicts least recently used items if it uses more than `memory_cap_in_bytes` /// object data. pub fn new(memory_cap_in_bytes: usize) -> MemoryCappedHashmap { MemoryCappedHashmap { inner: clru::CLruCache::with_config( clru::CLruCacheConfig::new(NonZeroUsize::new(memory_cap_in_bytes).expect("non zero")) .with_hasher(gix_hashtable::hash::Builder) .with_scale(CustomScale), ), free_list: Vec::new(), debug: gix_features::cache::Debug::new(format!("MemoryCappedObjectHashmap({memory_cap_in_bytes}B)")), } } } impl cache::Object for MemoryCappedHashmap { /// Put the object going by `id` of `kind` with `data` into the cache. 
fn put(&mut self, id: gix_hash::ObjectId, kind: gix_object::Kind, data: &[u8]) { self.debug.put(); let Some(data) = set_vec_to_slice(self.free_list.pop().unwrap_or_default(), data) else { return; }; let res = self.inner.put_with_weight(id, Entry { data, kind }); match res { Ok(Some(previous_entry)) => self.free_list.push(previous_entry.data), Ok(None) => {} Err((_key, value)) => self.free_list.push(value.data), } } /// Try to retrieve the object named `id` and place its data into `out` if available and return `Some(kind)` if found. fn get(&mut self, id: &gix_hash::ObjectId, out: &mut Vec) -> Option { let res = self.inner.get(id).and_then(|e| { set_vec_to_slice(out, &e.data)?; Some(e.kind) }); if res.is_some() { self.debug.hit(); } else { self.debug.miss(); } res } } } #[cfg(feature = "object-cache-dynamic")] pub use memory::MemoryCappedHashmap; /// A cache implementation that doesn't do any caching. pub struct Never; impl cache::Object for Never { /// Noop fn put(&mut self, _id: gix_hash::ObjectId, _kind: gix_object::Kind, _data: &[u8]) {} /// Noop fn get(&mut self, _id: &gix_hash::ObjectId, _out: &mut Vec) -> Option { None } } impl cache::Object for Box { fn put(&mut self, id: gix_hash::ObjectId, kind: gix_object::Kind, data: &[u8]) { use std::ops::DerefMut; self.deref_mut().put(id, kind, data); } fn get(&mut self, id: &gix_hash::ObjectId, out: &mut Vec) -> Option { use std::ops::DerefMut; self.deref_mut().get(id, out) } } gix-pack-0.56.0/src/data/delta.rs000064400000000000000000000047331046102023000145650ustar 00000000000000/// Given the decompressed pack delta `d`, decode a size in bytes (either the base object size or the result object size) /// Equivalent to [this canonical git function](https://github.com/git/git/blob/311531c9de557d25ac087c1637818bd2aad6eb3a/delta.h#L89) pub fn decode_header_size(d: &[u8]) -> (u64, usize) { let mut i = 0; let mut size = 0u64; let mut consumed = 0; for cmd in d.iter() { consumed += 1; size |= (u64::from(*cmd) & 0x7f) << i; i += 7; if *cmd & 0x80 == 0 { break; } } (size, consumed) } pub fn apply(base: &[u8], mut target: &mut [u8], data: &[u8]) { let mut i = 0; while let Some(cmd) = data.get(i) { i += 1; match cmd { cmd if cmd & 0b1000_0000 != 0 => { let (mut ofs, mut size): (u32, u32) = (0, 0); if cmd & 0b0000_0001 != 0 { ofs = u32::from(data[i]); i += 1; } if cmd & 0b0000_0010 != 0 { ofs |= u32::from(data[i]) << 8; i += 1; } if cmd & 0b0000_0100 != 0 { ofs |= u32::from(data[i]) << 16; i += 1; } if cmd & 0b0000_1000 != 0 { ofs |= u32::from(data[i]) << 24; i += 1; } if cmd & 0b0001_0000 != 0 { size = u32::from(data[i]); i += 1; } if cmd & 0b0010_0000 != 0 { size |= u32::from(data[i]) << 8; i += 1; } if cmd & 0b0100_0000 != 0 { size |= u32::from(data[i]) << 16; i += 1; } if size == 0 { size = 0x10000; // 65536 } let ofs = ofs as usize; std::io::Write::write(&mut target, &base[ofs..ofs + size as usize]) .expect("delta copy from base: byte slices must match"); } 0 => panic!("encountered unsupported command code: 0"), size => { std::io::Write::write(&mut target, &data[i..i + *size as usize]) .expect("delta copy data: slice sizes to match up"); i += *size as usize; } } } assert_eq!(i, data.len()); assert_eq!(target.len(), 0); } gix-pack-0.56.0/src/data/entry/decode.rs000064400000000000000000000104401046102023000160500ustar 00000000000000use std::io; use gix_features::decode::{leb64, leb64_from_read}; use super::{BLOB, COMMIT, OFS_DELTA, REF_DELTA, TAG, TREE}; use crate::data; /// The error returned by [data::Entry::from_bytes()]. 
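// Hypothetical worked example (not an original test of this crate) for the delta payload handled by
// `crate::data::delta`: two varint sizes (base size, result size) are followed by copy and insert
// commands. The byte values in the fixture below are assumptions spelled out in the comments.
#[cfg(test)]
mod delta_format_sketch {
    #[test]
    fn base_copy_plus_literal_insert() {
        use crate::data::delta::{apply, decode_header_size};

        let base = b"hello world";
        // 0x0B: base size 11, 0x09: result size 9,
        // 0x90 0x06: copy 6 bytes starting at base offset 0, 0x03 + "gix": insert 3 literal bytes.
        let delta = [0x0B, 0x09, 0x90, 0x06, b'g', b'i', b'x'];

        let (base_size, consumed) = decode_header_size(&delta);
        assert_eq!((base_size, consumed), (11, 1));
        let (result_size, consumed2) = decode_header_size(&delta[consumed..]);
        assert_eq!((result_size, consumed2), (9, 1));

        let mut out = vec![0u8; result_size as usize];
        apply(base, &mut out, &delta[consumed + consumed2..]);
        assert_eq!(&out[..], &b"hello gix"[..]);
    }
}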
#[derive(Debug, thiserror::Error)] #[allow(missing_docs)] #[error("Object type {type_id} is unsupported")] pub struct Error { pub type_id: u8, } /// Decoding impl data::Entry { /// Decode an entry from the given entry data `d`, providing the `pack_offset` to allow tracking the start of the entry data section. /// /// # Panics /// /// If we cannot understand the header, garbage data is likely to trigger this. pub fn from_bytes(d: &[u8], pack_offset: data::Offset, hash_len: usize) -> Result { let (type_id, size, mut consumed) = parse_header_info(d); use crate::data::entry::Header::*; let object = match type_id { OFS_DELTA => { let (distance, leb_bytes) = leb64(&d[consumed..]); let delta = OfsDelta { base_distance: distance, }; consumed += leb_bytes; delta } REF_DELTA => { let delta = RefDelta { base_id: gix_hash::ObjectId::from_bytes_or_panic(&d[consumed..][..hash_len]), }; consumed += hash_len; delta } BLOB => Blob, TREE => Tree, COMMIT => Commit, TAG => Tag, other => return Err(Error { type_id: other }), }; Ok(data::Entry { header: object, decompressed_size: size, data_offset: pack_offset + consumed as u64, }) } /// Instantiate an `Entry` from the reader `r`, providing the `pack_offset` to allow tracking the start of the entry data section. pub fn from_read(r: &mut dyn io::Read, pack_offset: data::Offset, hash_len: usize) -> io::Result { let (type_id, size, mut consumed) = streaming_parse_header_info(r)?; use crate::data::entry::Header::*; let object = match type_id { OFS_DELTA => { let (distance, leb_bytes) = leb64_from_read(r)?; let delta = OfsDelta { base_distance: distance, }; consumed += leb_bytes; delta } REF_DELTA => { let mut buf = gix_hash::Kind::buf(); let hash = &mut buf[..hash_len]; r.read_exact(hash)?; #[allow(clippy::redundant_slicing)] let delta = RefDelta { base_id: gix_hash::ObjectId::from_bytes_or_panic(&hash[..]), }; consumed += hash_len; delta } BLOB => Blob, TREE => Tree, COMMIT => Commit, TAG => Tag, other => { return Err(io::Error::new( io::ErrorKind::Other, format!("Object type {other} is unsupported"), )) } }; Ok(data::Entry { header: object, decompressed_size: size, data_offset: pack_offset + consumed as u64, }) } } #[inline] fn streaming_parse_header_info(read: &mut dyn io::Read) -> Result<(u8, u64, usize), io::Error> { let mut byte = [0u8; 1]; read.read_exact(&mut byte)?; let mut c = byte[0]; let mut i = 1; let type_id = (c >> 4) & 0b0000_0111; let mut size = u64::from(c) & 0b0000_1111; let mut s = 4; while c & 0b1000_0000 != 0 { read.read_exact(&mut byte)?; c = byte[0]; i += 1; size += u64::from(c & 0b0111_1111) << s; s += 7; } Ok((type_id, size, i)) } /// Parses the header of a pack-entry, yielding object type id, decompressed object size, and consumed bytes #[inline] fn parse_header_info(data: &[u8]) -> (u8, u64, usize) { let mut c = data[0]; let mut i = 1; let type_id = (c >> 4) & 0b0000_0111; let mut size = u64::from(c) & 0b0000_1111; let mut s = 4; while c & 0b1000_0000 != 0 { c = data[i]; i += 1; size += u64::from(c & 0b0111_1111) << s; s += 7; } (type_id, size, i) } gix-pack-0.56.0/src/data/entry/header.rs000064400000000000000000000124731046102023000160650ustar 00000000000000use std::io; use super::{BLOB, COMMIT, OFS_DELTA, REF_DELTA, TAG, TREE}; use crate::data; /// The header portion of a pack data entry, identifying the kind of stored object. 
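// A hypothetical round-trip example (not an original test of this crate): the first header byte
// carries a continuation bit, a 3-bit type id and the low 4 bits of the decompressed size; each
// following byte contributes 7 more size bits. A commit of size 53 thus encodes as [0x95, 0x03].
#[cfg(test)]
mod header_encoding_sketch {
    use super::Header;

    #[test]
    fn commit_header_round_trip() -> std::io::Result<()> {
        let mut encoded = Vec::new();
        let written = Header::Commit.write_to(53, &mut encoded)?;
        assert_eq!(written, 2);
        assert_eq!(encoded, [0b1001_0101u8, 0b0000_0011]);

        let entry = crate::data::Entry::from_bytes(&encoded, 0, 20).expect("a valid type id");
        assert_eq!(entry.header, Header::Commit);
        assert_eq!(entry.decompressed_size, 53);
        assert_eq!(entry.data_offset, 2, "the payload begins right after the two header bytes");
        Ok(())
    }
}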
#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[allow(missing_docs)] pub enum Header { /// The object is a commit Commit, /// The object is a tree Tree, /// The object is a blob Blob, /// The object is a tag Tag, /// Describes a delta-object which needs to be applied to a base. The base object is identified by the `base_id` field /// which is found within the parent repository. /// Most commonly used for **thin-packs** when receiving pack files from the server to refer to objects that are not /// part of the pack but expected to be present in the receivers repository. /// /// # Note /// This could also be an object within this pack if the LSB encoded offset would be larger than 20 bytes, which is unlikely to /// happen. /// /// **The naming** is exactly the same as the canonical implementation uses, namely **REF_DELTA**. RefDelta { base_id: gix_hash::ObjectId }, /// Describes a delta-object present in this pack which acts as base for this object. /// The base object is measured as a distance from this objects /// pack offset, so that `base_pack_offset = this_objects_pack_offset - base_distance` /// /// # Note /// /// **The naming** is exactly the same as the canonical implementation uses, namely **OFS_DELTA**. OfsDelta { base_distance: u64 }, } impl Header { /// Subtract `distance` from `pack_offset` safely without the chance for overflow or no-ops if `distance` is 0. pub fn verified_base_pack_offset(pack_offset: data::Offset, distance: u64) -> Option { if distance == 0 { return None; } pack_offset.checked_sub(distance) } /// Convert the header's object kind into [`gix_object::Kind`] if possible pub fn as_kind(&self) -> Option { use gix_object::Kind::*; Some(match self { Header::Tree => Tree, Header::Blob => Blob, Header::Commit => Commit, Header::Tag => Tag, Header::RefDelta { .. } | Header::OfsDelta { .. } => return None, }) } /// Convert this header's object kind into the packs internal representation pub fn as_type_id(&self) -> u8 { use Header::*; match self { Blob => BLOB, Tree => TREE, Commit => COMMIT, Tag => TAG, OfsDelta { .. } => OFS_DELTA, RefDelta { .. } => REF_DELTA, } } /// Return's true if this is a delta object, i.e. not a full object. pub fn is_delta(&self) -> bool { matches!(self, Header::OfsDelta { .. } | Header::RefDelta { .. }) } /// Return's true if this is a base object, i.e. not a delta object. pub fn is_base(&self) -> bool { !self.is_delta() } } impl Header { /// Encode this header along the given `decompressed_size_in_bytes` into the `out` write stream for use within a data pack. /// /// Returns the amount of bytes written to `out`. 
/// `decompressed_size_in_bytes` is the full size in bytes of the object that this header represents pub fn write_to(&self, decompressed_size_in_bytes: u64, out: &mut dyn io::Write) -> io::Result { let mut size = decompressed_size_in_bytes; let mut written = 1; let mut c: u8 = (self.as_type_id() << 4) | (size as u8 & 0b0000_1111); size >>= 4; while size != 0 { out.write_all(&[c | 0b1000_0000])?; written += 1; c = size as u8 & 0b0111_1111; size >>= 7; } out.write_all(&[c])?; use Header::*; match self { RefDelta { base_id: oid } => { out.write_all(oid.as_slice())?; written += oid.as_slice().len(); } OfsDelta { base_distance } => { let mut buf = [0u8; 10]; let buf = leb64_encode(*base_distance, &mut buf); out.write_all(buf)?; written += buf.len(); } Blob | Tree | Commit | Tag => {} } Ok(written) } /// The size of the header in bytes when serialized pub fn size(&self, decompressed_size: u64) -> usize { self.write_to(decompressed_size, &mut io::sink()) .expect("io::sink() to never fail") } } #[inline] fn leb64_encode(mut n: u64, buf: &mut [u8; 10]) -> &[u8] { let mut bytes_written = 1; buf[buf.len() - 1] = n as u8 & 0b0111_1111; for out in buf.iter_mut().rev().skip(1) { n >>= 7; if n == 0 { break; } n -= 1; *out = 0b1000_0000 | (n as u8 & 0b0111_1111); bytes_written += 1; } debug_assert_eq!(n, 0, "BUG: buffer must be large enough to hold a 64 bit integer"); &buf[buf.len() - bytes_written..] } #[cfg(test)] mod tests { use super::*; #[test] fn leb64_encode_max_int() { let mut buf = [0u8; 10]; let buf = leb64_encode(u64::MAX, &mut buf); assert_eq!(buf.len(), 10, "10 bytes should be used when 64bits are encoded"); } } gix-pack-0.56.0/src/data/entry/mod.rs000064400000000000000000000036201046102023000154060ustar 00000000000000use crate::data::Entry; const _TYPE_EXT1: u8 = 0; const COMMIT: u8 = 1; const TREE: u8 = 2; const BLOB: u8 = 3; const TAG: u8 = 4; const _TYPE_EXT2: u8 = 5; const OFS_DELTA: u8 = 6; const REF_DELTA: u8 = 7; /// A way to uniquely identify the location of an entry within a pack bundle #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Location { /// The id of the pack containing the object. It's unique within its frame of reference which is the owning object database. pub pack_id: u32, /// The size of the entry of disk so that the range of bytes of the entry is `pack_offset..pack_offset + entry_size`. pub entry_size: usize, /// The start of the entry in the pack identified by `pack_id`. pub pack_offset: data::Offset, } impl Location { /// Compute a range suitable for lookup in pack data using the [`entry_slice()`][crate::data::File::entry_slice()] method. pub fn entry_range(&self, pack_offset: data::Offset) -> crate::data::EntryRange { pack_offset..pack_offset + self.entry_size as u64 } } /// Access impl Entry { /// Compute the pack offset to the base entry of the object represented by this entry. pub fn base_pack_offset(&self, distance: u64) -> data::Offset { let pack_offset = self.data_offset - self.header_size() as u64; pack_offset.checked_sub(distance).expect("in-bound distance of deltas") } /// The pack offset at which this entry starts pub fn pack_offset(&self) -> data::Offset { self.data_offset - self.header_size() as u64 } /// The amount of bytes used to describe this entry in the pack. 
The header starts at [`Self::pack_offset()`] pub fn header_size(&self) -> usize { self.header.size(self.decompressed_size) } } /// pub mod decode; mod header; pub use header::Header; use crate::data; gix-pack-0.56.0/src/data/file/decode/entry.rs000064400000000000000000000472661046102023000170070ustar 00000000000000use std::ops::Range; use gix_features::zlib; use smallvec::SmallVec; use crate::{ cache, data, data::{delta, file::decode::Error, File}, }; /// A return value of a resolve function, which given an [`ObjectId`][gix_hash::ObjectId] determines where an object can be found. #[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum ResolvedBase { /// Indicate an object is within this pack, at the given entry, and thus can be looked up locally. InPack(data::Entry), /// Indicates the object of `kind` was found outside of the pack, and its data was written into an output /// vector which now has a length of `end`. #[allow(missing_docs)] OutOfPack { kind: gix_object::Kind, end: usize }, } #[derive(Debug)] struct Delta { data: Range, base_size: usize, result_size: usize, decompressed_size: usize, data_offset: data::Offset, } /// Additional information and statistics about a successfully decoded object produced by [`File::decode_entry()`]. /// /// Useful to understand the effectiveness of the pack compression or the cost of decompression. #[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Outcome { /// The kind of resolved object. pub kind: gix_object::Kind, /// The amount of deltas in the chain of objects that had to be resolved beforehand. /// /// This number is affected by the [`Cache`][cache::DecodeEntry] implementation, with cache hits shortening the /// delta chain accordingly pub num_deltas: u32, /// The total decompressed size of all pack entries in the delta chain pub decompressed_size: u64, /// The total compressed size of all pack entries in the delta chain pub compressed_size: usize, /// The total size of the decoded object. pub object_size: u64, } impl Outcome { pub(crate) fn default_from_kind(kind: gix_object::Kind) -> Self { Self { kind, num_deltas: 0, decompressed_size: 0, compressed_size: 0, object_size: 0, } } fn from_object_entry(kind: gix_object::Kind, entry: &data::Entry, compressed_size: usize) -> Self { Self { kind, num_deltas: 0, decompressed_size: entry.decompressed_size, compressed_size, object_size: entry.decompressed_size, } } } /// Decompression of objects impl File { /// Decompress the given `entry` into `out` and return the amount of bytes read from the pack data. /// Note that `inflate` is not reset after usage, but will be reset before using it. /// /// _Note_ that this method does not resolve deltified objects, but merely decompresses their content /// `out` is expected to be large enough to hold `entry.size` bytes. /// /// # Panics /// /// If `out` isn't large enough to hold the decompressed `entry` pub fn decompress_entry( &self, entry: &data::Entry, inflate: &mut zlib::Inflate, out: &mut [u8], ) -> Result { assert!( out.len() as u64 >= entry.decompressed_size, "output buffer isn't large enough to hold decompressed result, want {}, have {}", entry.decompressed_size, out.len() ); self.decompress_entry_from_data_offset(entry.data_offset, inflate, out) .map_err(Into::into) } /// Obtain the [`Entry`][crate::data::Entry] at the given `offset` into the pack. 
/// /// The `offset` is typically obtained from the pack index file. pub fn entry(&self, offset: data::Offset) -> Result { let pack_offset: usize = offset.try_into().expect("offset representable by machine"); assert!(pack_offset <= self.data.len(), "offset out of bounds"); let object_data = &self.data[pack_offset..]; data::Entry::from_bytes(object_data, offset, self.hash_len) } /// Decompress the object expected at the given data offset, sans pack header. This information is only /// known after the pack header was parsed. /// Note that this method does not resolve deltified objects, but merely decompresses their content /// `out` is expected to be large enough to hold `entry.size` bytes. /// Returns the amount of packed bytes there read from the pack data file. pub(crate) fn decompress_entry_from_data_offset( &self, data_offset: data::Offset, inflate: &mut zlib::Inflate, out: &mut [u8], ) -> Result { let offset: usize = data_offset.try_into().expect("offset representable by machine"); assert!(offset < self.data.len(), "entry offset out of bounds"); inflate.reset(); inflate .once(&self.data[offset..], out) .map(|(_status, consumed_in, _consumed_out)| consumed_in) } /// Like `decompress_entry_from_data_offset`, but returns consumed input and output. pub(crate) fn decompress_entry_from_data_offset_2( &self, data_offset: data::Offset, inflate: &mut zlib::Inflate, out: &mut [u8], ) -> Result<(usize, usize), zlib::inflate::Error> { let offset: usize = data_offset.try_into().expect("offset representable by machine"); assert!(offset < self.data.len(), "entry offset out of bounds"); inflate.reset(); inflate .once(&self.data[offset..], out) .map(|(_status, consumed_in, consumed_out)| (consumed_in, consumed_out)) } /// Decode an entry, resolving delta's as needed, while growing the `out` vector if there is not enough /// space to hold the result object. /// /// The `entry` determines which object to decode, and is commonly obtained with the help of a pack index file or through pack iteration. /// `inflate` will be used for decompressing entries, and will not be reset after usage, but before first using it. /// /// `resolve` is a function to lookup objects with the given [`ObjectId`][gix_hash::ObjectId], in case the full object id is used to refer to /// a base object, instead of an in-pack offset. /// /// `delta_cache` is a mechanism to avoid looking up base objects multiple times when decompressing multiple objects in a row. /// Use a [Noop-Cache][cache::Never] to disable caching all together at the cost of repeating work. pub fn decode_entry( &self, entry: data::Entry, out: &mut Vec, inflate: &mut zlib::Inflate, resolve: &dyn Fn(&gix_hash::oid, &mut Vec) -> Option, delta_cache: &mut dyn cache::DecodeEntry, ) -> Result { use crate::data::entry::Header::*; match entry.header { Tree | Blob | Commit | Tag => { let size: usize = entry.decompressed_size.try_into().map_err(|_| Error::OutOfMemory)?; if let Some(additional) = size.checked_sub(out.len()) { out.try_reserve(additional)?; } out.resize(size, 0); self.decompress_entry(&entry, inflate, out.as_mut_slice()) .map(|consumed_input| { Outcome::from_object_entry( entry.header.as_kind().expect("a non-delta entry"), &entry, consumed_input, ) }) } OfsDelta { .. } | RefDelta { .. } => self.resolve_deltas(entry, resolve, inflate, out, delta_cache), } } /// resolve: technically, this shouldn't ever be required as stored local packs don't refer to objects by id /// that are outside of the pack. 
Unless, of course, the ref refers to an object within this pack, which means /// it's very, very large as 20bytes are smaller than the corresponding MSB encoded number fn resolve_deltas( &self, last: data::Entry, resolve: &dyn Fn(&gix_hash::oid, &mut Vec) -> Option, inflate: &mut zlib::Inflate, out: &mut Vec, cache: &mut dyn cache::DecodeEntry, ) -> Result { // all deltas, from the one that produces the desired object (first) to the oldest at the end of the chain let mut chain = SmallVec::<[Delta; 10]>::default(); let first_entry = last.clone(); let mut cursor = last; let mut base_buffer_size: Option = None; let mut object_kind: Option = None; let mut consumed_input: Option = None; // Find the first full base, either an undeltified object in the pack or a reference to another object. let mut total_delta_data_size: u64 = 0; while cursor.header.is_delta() { if let Some((kind, packed_size)) = cache.get(self.id, cursor.data_offset, out) { base_buffer_size = Some(out.len()); object_kind = Some(kind); // If the input entry is a cache hit, keep the packed size as it must be returned. // Otherwise, the packed size will be determined later when decompressing the input delta if total_delta_data_size == 0 { consumed_input = Some(packed_size); } break; } // This is a pessimistic guess, as worst possible compression should not be bigger than the data itself. // TODO: is this assumption actually true? total_delta_data_size += cursor.decompressed_size; let decompressed_size = cursor .decompressed_size .try_into() .expect("a single delta size small enough to fit a usize"); chain.push(Delta { data: Range { start: 0, end: decompressed_size, }, base_size: 0, result_size: 0, decompressed_size, data_offset: cursor.data_offset, }); use crate::data::entry::Header; cursor = match cursor.header { Header::OfsDelta { base_distance } => self.entry(cursor.base_pack_offset(base_distance))?, Header::RefDelta { base_id } => match resolve(base_id.as_ref(), out) { Some(ResolvedBase::InPack(entry)) => entry, Some(ResolvedBase::OutOfPack { end, kind }) => { base_buffer_size = Some(end); object_kind = Some(kind); break; } None => return Err(Error::DeltaBaseUnresolved(base_id)), }, _ => unreachable!("cursor.is_delta() only allows deltas here"), }; } // This can happen if the cache held the first entry itself // We will just treat it as an object then, even though it's technically incorrect. if chain.is_empty() { return Ok(Outcome::from_object_entry( object_kind.expect("object kind as set by cache"), &first_entry, consumed_input.expect("consumed bytes as set by cache"), )); }; // First pass will decompress all delta data and keep it in our output buffer // []... // so that we can find the biggest result size. 
let total_delta_data_size: usize = total_delta_data_size.try_into().expect("delta data to fit in memory"); let chain_len = chain.len(); let (first_buffer_end, second_buffer_end) = { let delta_start = base_buffer_size.unwrap_or(0); let delta_range = Range { start: delta_start, end: delta_start + total_delta_data_size, }; out.try_reserve(delta_range.end.saturating_sub(out.len()))?; out.resize(delta_range.end, 0); let mut instructions = &mut out[delta_range.clone()]; let mut relative_delta_start = 0; let mut biggest_result_size = 0; for (delta_idx, delta) in chain.iter_mut().rev().enumerate() { let consumed_from_data_offset = self.decompress_entry_from_data_offset( delta.data_offset, inflate, &mut instructions[..delta.decompressed_size], )?; let is_last_delta_to_be_applied = delta_idx + 1 == chain_len; if is_last_delta_to_be_applied { consumed_input = Some(consumed_from_data_offset); } let (base_size, offset) = delta::decode_header_size(instructions); let mut bytes_consumed_by_header = offset; biggest_result_size = biggest_result_size.max(base_size); delta.base_size = base_size.try_into().expect("base size fits into usize"); let (result_size, offset) = delta::decode_header_size(&instructions[offset..]); bytes_consumed_by_header += offset; biggest_result_size = biggest_result_size.max(result_size); delta.result_size = result_size.try_into().expect("result size fits into usize"); // the absolute location into the instructions buffer, so we keep track of the end point of the last delta.data.start = relative_delta_start + bytes_consumed_by_header; relative_delta_start += delta.decompressed_size; delta.data.end = relative_delta_start; instructions = &mut instructions[delta.decompressed_size..]; } // Now we can produce a buffer like this // [] // from []... let biggest_result_size: usize = biggest_result_size.try_into().map_err(|_| Error::OutOfMemory)?; let first_buffer_size = biggest_result_size; let second_buffer_size = first_buffer_size; let out_size = first_buffer_size + second_buffer_size + total_delta_data_size; out.try_reserve(out_size.saturating_sub(out.len()))?; out.resize(out_size, 0); // Now 'rescue' the deltas, because in the next step we possibly overwrite that portion // of memory with the base object (in the majority of cases) let second_buffer_end = { let end = first_buffer_size + second_buffer_size; if delta_range.start < end { // …this means that the delta size is even larger than two uncompressed worst-case // intermediate results combined. It would already be undesirable to have it bigger // then the target size (as you could just store the object in whole). // However, this just means that it reuses existing deltas smartly, which as we rightfully // remember stand for an object each. However, this means a lot of data is read to restore // a single object sometimes. Fair enough - package size is minimized that way. 
out.copy_within(delta_range, end); } else { let (buffers, instructions) = out.split_at_mut(end); instructions.copy_from_slice(&buffers[delta_range]); } end }; // If we don't have a out-of-pack object already, fill the base-buffer by decompressing the full object // at which the cursor is left after the iteration if base_buffer_size.is_none() { let base_entry = cursor; debug_assert!(!base_entry.header.is_delta()); object_kind = base_entry.header.as_kind(); self.decompress_entry_from_data_offset(base_entry.data_offset, inflate, out)?; } (first_buffer_size, second_buffer_end) }; // From oldest to most recent, apply all deltas, swapping the buffer back and forth // TODO: once we have more tests, we could optimize this memory-intensive work to // analyse the delta-chains to only copy data once - after all, with 'copy-from-base' deltas, // all data originates from one base at some point. // `out` is: [source-buffer][target-buffer][max-delta-instructions-buffer] let (buffers, instructions) = out.split_at_mut(second_buffer_end); let (mut source_buf, mut target_buf) = buffers.split_at_mut(first_buffer_end); let mut last_result_size = None; for ( delta_idx, Delta { data, base_size, result_size, .. }, ) in chain.into_iter().rev().enumerate() { let data = &mut instructions[data]; if delta_idx + 1 == chain_len { last_result_size = Some(result_size); } delta::apply(&source_buf[..base_size], &mut target_buf[..result_size], data); // use the target as source for the next delta std::mem::swap(&mut source_buf, &mut target_buf); } let last_result_size = last_result_size.expect("at least one delta chain item"); // uneven chains leave the target buffer after the source buffer // FIXME(Performance) If delta-chains are uneven, we know we will have to copy bytes over here // Instead we could use a different start buffer, to naturally end up with the result in the // right one. // However, this is a bit more complicated than just that - you have to deal with the base // object, which should also be placed in the second buffer right away. You don't have that // control/knowledge for out-of-pack bases, so this is a special case to deal with, too. // Maybe these invariants can be represented in the type system though. if chain_len % 2 == 1 { // this seems inverted, but remember: we swapped the buffers on the last iteration target_buf[..last_result_size].copy_from_slice(&source_buf[..last_result_size]); } debug_assert!(out.len() >= last_result_size); out.truncate(last_result_size); let object_kind = object_kind.expect("a base object as root of any delta chain that we are here to resolve"); let consumed_input = consumed_input.expect("at least one decompressed delta object"); cache.put( self.id, first_entry.data_offset, out.as_slice(), object_kind, consumed_input, ); Ok(Outcome { kind: object_kind, // technically depending on the cache, the chain size is not correct as it might // have been cut short by a cache hit. 
The caller must deactivate the cache to get // actual results num_deltas: chain_len as u32, decompressed_size: first_entry.decompressed_size, compressed_size: consumed_input, object_size: last_result_size as u64, }) } } #[cfg(test)] mod tests { use super::*; use gix_testtools::size_ok; #[test] fn size_of_decode_entry_outcome() { let actual = std::mem::size_of::(); let expected = 32; assert!( size_ok(actual, expected), "this shouldn't change without use noticing as it's returned a lot: {actual} <~ {expected}" ); } } gix-pack-0.56.0/src/data/file/decode/header.rs000064400000000000000000000120371046102023000170620ustar 00000000000000use gix_features::zlib; use crate::{ data, data::{delta, file::decode::Error, File}, }; /// A return value of a resolve function, which given an [`ObjectId`][gix_hash::ObjectId] determines where an object can be found. #[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum ResolvedBase { /// Indicate an object is within this pack, at the given entry, and thus can be looked up locally. InPack(data::Entry), /// Indicates the object of `kind` was found outside of the pack. OutOfPack { /// The kind of object we found when reading the header of the out-of-pack base. kind: gix_object::Kind, /// The amount of deltas encountered if the object was packed as well. num_deltas: Option, }, } /// Additional information and statistics about a successfully decoded object produced by [`File::decode_header()`]. /// /// Useful to understand the effectiveness of the pack compression or the cost of decompression. #[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Outcome { /// The kind of resolved object. pub kind: gix_object::Kind, /// The decompressed size of the object. pub object_size: u64, /// The amount of deltas in the chain of objects that had to be resolved beforehand. pub num_deltas: u32, } /// Obtain object information quickly. impl File { /// Resolve the object header information starting at `entry`, following the chain of entries as needed. /// /// The `entry` determines which object to decode, and is commonly obtained with the help of a pack index file or through pack iteration. /// `inflate` will be used for (partially) decompressing entries, and will be reset before first use, but not after the last use. /// /// `resolve` is a function to lookup objects with the given [`ObjectId`][gix_hash::ObjectId], in case the full object id /// is used to refer to a base object, instead of an in-pack offset. 
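///
/// A minimal sketch of a call (not compiled: `pack`, `entry` and `inflate` are assumed to exist already,
/// e.g. an open [`File`], an entry obtained from a pack index, and `gix_features::zlib::Inflate::default()`):
///
/// ```ignore
/// use gix_pack::data::decode::header::ResolvedBase;
///
/// let outcome = pack.decode_header(entry, &mut inflate, &|_base_id| {
///     // A real resolver would consult an object database for out-of-pack (ref-delta) bases.
///     None::<ResolvedBase>
/// })?;
/// println!("{:?} with {} deltas", outcome.kind, outcome.num_deltas);
/// ```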
pub fn decode_header( &self, mut entry: data::Entry, inflate: &mut zlib::Inflate, resolve: &dyn Fn(&gix_hash::oid) -> Option, ) -> Result { use crate::data::entry::Header::*; let mut num_deltas = 0; let mut first_delta_decompressed_size = None::; loop { match entry.header { Tree | Blob | Commit | Tag => { return Ok(Outcome { kind: entry.header.as_kind().expect("always valid for non-refs"), object_size: first_delta_decompressed_size.unwrap_or(entry.decompressed_size), num_deltas, }); } OfsDelta { base_distance } => { num_deltas += 1; if first_delta_decompressed_size.is_none() { first_delta_decompressed_size = Some(self.decode_delta_object_size(inflate, &entry)?); } entry = self.entry(entry.base_pack_offset(base_distance))?; } RefDelta { base_id } => { num_deltas += 1; if first_delta_decompressed_size.is_none() { first_delta_decompressed_size = Some(self.decode_delta_object_size(inflate, &entry)?); } match resolve(base_id.as_ref()) { Some(ResolvedBase::InPack(base_entry)) => entry = base_entry, Some(ResolvedBase::OutOfPack { kind, num_deltas: origin_num_deltas, }) => { return Ok(Outcome { kind, object_size: first_delta_decompressed_size.unwrap_or(entry.decompressed_size), num_deltas: origin_num_deltas.unwrap_or_default() + num_deltas, }) } None => return Err(Error::DeltaBaseUnresolved(base_id)), } } }; } } #[inline] fn decode_delta_object_size(&self, inflate: &mut zlib::Inflate, entry: &data::Entry) -> Result { let mut buf = [0_u8; 32]; let used = self .decompress_entry_from_data_offset_2(entry.data_offset, inflate, &mut buf)? .1; let buf = &buf[..used]; let (_base_size, offset) = delta::decode_header_size(buf); let (result_size, _offset) = delta::decode_header_size(&buf[offset..]); Ok(result_size) } } #[cfg(test)] mod tests { use super::*; #[test] fn size_of_decode_entry_outcome() { assert_eq!( std::mem::size_of::(), 16, "this shouldn't change without use noticing as it's returned a lot" ); } } gix-pack-0.56.0/src/data/file/decode/mod.rs000064400000000000000000000016151046102023000164110ustar 00000000000000use std::collections::TryReserveError; /// pub mod entry; /// pub mod header; /// Returned by [`File::decode_header()`][crate::data::File::decode_header()], /// [`File::decode_entry()`][crate::data::File::decode_entry()] and . /// [`File::decompress_entry()`][crate::data::File::decompress_entry()] #[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("Failed to decompress pack entry")] ZlibInflate(#[from] gix_features::zlib::inflate::Error), #[error("A delta chain could not be followed as the ref base with id {0} could not be found")] DeltaBaseUnresolved(gix_hash::ObjectId), #[error(transparent)] EntryType(#[from] crate::data::entry::decode::Error), #[error("Entry too large to fit in memory")] OutOfMemory, } impl From for Error { #[cold] fn from(_: TryReserveError) -> Self { Self::OutOfMemory } } gix-pack-0.56.0/src/data/file/init.rs000064400000000000000000000030241046102023000153460ustar 00000000000000use std::path::Path; use crate::data; /// Instantiation impl data::File { /// Try opening a data file at the given `path`. /// /// The `object_hash` is a way to read (and write) the same file format with different hashes, as the hash kind /// isn't stored within the file format itself. 
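///
/// For example (illustrative only - the pack path is hypothetical, and doctests are disabled for this crate):
///
/// ```no_run
/// use gix_pack::data::File;
///
/// let pack = File::at("objects/pack/pack-deadbeef.pack", gix_hash::Kind::Sha1)
///     .expect("a readable, valid pack data file at this path");
/// assert_eq!(pack.version(), gix_pack::data::Version::V2);
/// println!("{} objects", pack.num_objects());
/// ```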
pub fn at(path: impl AsRef, object_hash: gix_hash::Kind) -> Result { Self::at_inner(path.as_ref(), object_hash) } fn at_inner(path: &Path, object_hash: gix_hash::Kind) -> Result { use crate::data::header::N32_SIZE; let hash_len = object_hash.len_in_bytes(); let data = crate::mmap::read_only(path).map_err(|e| data::header::decode::Error::Io { source: e, path: path.to_owned(), })?; let pack_len = data.len(); if pack_len < N32_SIZE * 3 + hash_len { return Err(data::header::decode::Error::Corrupt(format!( "Pack data of size {pack_len} is too small for even an empty pack with shortest hash" ))); } let (kind, num_objects) = data::header::decode(&data[..12].try_into().expect("enough data after previous check"))?; Ok(data::File { data, path: path.to_owned(), id: gix_features::hash::crc32(path.as_os_str().to_string_lossy().as_bytes()), version: kind, num_objects, hash_len, object_hash, }) } } gix-pack-0.56.0/src/data/file/mod.rs000064400000000000000000000002021046102023000151550ustar 00000000000000mod init; /// pub mod verify; /// pub mod decode; /// The bytes used as header in a pack data file. pub type Header = [u8; 12]; gix-pack-0.56.0/src/data/file/verify.rs000064400000000000000000000026521046102023000157150ustar 00000000000000use std::sync::atomic::AtomicBool; use gix_features::progress::Progress; use crate::data::File; /// pub mod checksum { /// Returned by [`data::File::verify_checksum()`][crate::data::File::verify_checksum()]. pub type Error = crate::verify::checksum::Error; } /// Checksums and verify checksums impl File { /// The checksum in the trailer of this pack data file pub fn checksum(&self) -> gix_hash::ObjectId { gix_hash::ObjectId::from_bytes_or_panic(&self.data[self.data.len() - self.hash_len..]) } /// Verifies that the checksum of the packfile over all bytes preceding it indeed matches the actual checksum, /// returning the actual checksum equivalent to the return value of [`checksum()`][File::checksum()] if there /// is no mismatch. /// /// Note that if no `progress` is desired, one can pass [`gix_features::progress::Discard`]. /// /// Have a look at [`index::File::verify_integrity(…)`][crate::index::File::verify_integrity()] for an /// even more thorough integrity check. pub fn verify_checksum( &self, progress: &mut dyn Progress, should_interrupt: &AtomicBool, ) -> Result { crate::verify::checksum_on_disk_or_mmap( self.path(), &self.data, self.checksum(), self.object_hash, progress, should_interrupt, ) } } gix-pack-0.56.0/src/data/header.rs000064400000000000000000000032721046102023000147210ustar 00000000000000use crate::data; pub(crate) const N32_SIZE: usize = std::mem::size_of::(); /// Parses the first 12 bytes of a pack file, returning the pack version as well as the number of objects contained in the pack. pub fn decode(data: &[u8; 12]) -> Result<(data::Version, u32), decode::Error> { let mut ofs = 0; if &data[ofs..ofs + b"PACK".len()] != b"PACK" { return Err(decode::Error::Corrupt("Pack data type not recognized".into())); } ofs += N32_SIZE; let kind = match crate::read_u32(&data[ofs..ofs + N32_SIZE]) { 2 => data::Version::V2, 3 => data::Version::V3, v => return Err(decode::Error::UnsupportedVersion(v)), }; ofs += N32_SIZE; let num_objects = crate::read_u32(&data[ofs..ofs + N32_SIZE]); Ok((kind, num_objects)) } /// Write a pack data header at `version` with `num_objects` and return a buffer. 
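///
/// A round-trip with [`decode()`] looks like this (illustrative sketch, as doctests are disabled for this crate):
///
/// ```no_run
/// use gix_pack::data;
///
/// let buf = data::header::encode(data::Version::V2, 42);
/// assert_eq!(&buf[..4], b"PACK");
/// let (version, num_objects) = data::header::decode(&buf).expect("a freshly encoded header is valid");
/// assert_eq!((version, num_objects), (data::Version::V2, 42));
/// ```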
pub fn encode(version: data::Version, num_objects: u32) -> [u8; 12] { use crate::data::Version::*; let mut buf = [0u8; 12]; buf[..4].copy_from_slice(b"PACK"); buf[4..8].copy_from_slice( &match version { V2 => 2u32, V3 => 3, } .to_be_bytes()[..], ); buf[8..].copy_from_slice(&num_objects.to_be_bytes()[..]); buf } /// pub mod decode { /// Returned by [`decode()`][super::decode()]. #[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("Could not open pack file at '{path}'")] Io { source: std::io::Error, path: std::path::PathBuf, }, #[error("{0}")] Corrupt(String), #[error("Unsupported pack version: {0}")] UnsupportedVersion(u32), } } gix-pack-0.56.0/src/data/input/bytes_to_entries.rs000064400000000000000000000237411046102023000202140ustar 00000000000000use std::{fs, io}; use gix_features::{hash::Hasher, zlib::Decompress}; use gix_hash::ObjectId; use crate::data::input; /// An iterator over [`Entries`][input::Entry] in a byte stream. /// /// The iterator used as part of [`Bundle::write_to_directory(…)`][crate::Bundle::write_to_directory()]. pub struct BytesToEntriesIter
{ read: BR, decompressor: Decompress, offset: u64, had_error: bool, version: crate::data::Version, objects_left: u32, hash: Option, mode: input::Mode, compressed: input::EntryDataMode, compressed_buf: Option>, hash_len: usize, object_hash: gix_hash::Kind, } /// Access impl
BytesToEntriesIter
{ /// The pack version currently being iterated pub fn version(&self) -> crate::data::Version { self.version } /// The kind of iteration pub fn mode(&self) -> input::Mode { self.mode } } /// Initialization impl
BytesToEntriesIter
where BR: io::BufRead, { /// Obtain an iterator from a `read` stream to a pack data file and configure it using `mode` and `compressed`. /// `object_hash` specifies which hash is used for objects in ref-delta entries. /// /// Note that `read` is expected at the beginning of a valid pack data file with a header, entries and a trailer. pub fn new_from_header( mut read: BR, mode: input::Mode, compressed: input::EntryDataMode, object_hash: gix_hash::Kind, ) -> Result, input::Error> { let mut header_data = [0u8; 12]; read.read_exact(&mut header_data)?; let (version, num_objects) = crate::data::header::decode(&header_data)?; assert_eq!( version, crate::data::Version::V2, "let's stop here if we see undocumented pack formats" ); Ok(BytesToEntriesIter { read, decompressor: Decompress::new(true), compressed, offset: 12, had_error: false, version, objects_left: num_objects, hash: (mode != input::Mode::AsIs).then(|| { let mut hash = gix_features::hash::hasher(object_hash); hash.update(&header_data); hash }), mode, compressed_buf: None, hash_len: object_hash.len_in_bytes(), object_hash, }) } fn next_inner(&mut self) -> Result { self.objects_left -= 1; // even an error counts as objects // Read header let entry = match self.hash.as_mut() { Some(hash) => { let mut read = read_and_pass_to( &mut self.read, HashWrite { inner: io::sink(), hash, }, ); crate::data::Entry::from_read(&mut read, self.offset, self.hash_len) } None => crate::data::Entry::from_read(&mut self.read, self.offset, self.hash_len), } .map_err(input::Error::from)?; // Decompress object to learn its compressed bytes let compressed_buf = self.compressed_buf.take().unwrap_or_else(|| Vec::with_capacity(4096)); self.decompressor.reset(true); let mut decompressed_reader = DecompressRead { inner: read_and_pass_to( &mut self.read, if self.compressed.keep() { Vec::with_capacity(entry.decompressed_size as usize) } else { compressed_buf }, ), decompressor: &mut self.decompressor, }; let bytes_copied = io::copy(&mut decompressed_reader, &mut io::sink())?; if bytes_copied != entry.decompressed_size { return Err(input::Error::IncompletePack { actual: bytes_copied, expected: entry.decompressed_size, }); } let pack_offset = self.offset; let compressed_size = decompressed_reader.decompressor.total_in(); self.offset += entry.header_size() as u64 + compressed_size; let mut compressed = decompressed_reader.inner.write; debug_assert_eq!( compressed_size, compressed.len() as u64, "we must track exactly the same amount of bytes as read by the decompressor" ); if let Some(hash) = self.hash.as_mut() { hash.update(&compressed); } let crc32 = if self.compressed.crc32() { let mut header_buf = [0u8; 12 + gix_hash::Kind::longest().len_in_bytes()]; let header_len = entry.header.write_to(bytes_copied, &mut header_buf.as_mut())?; let state = gix_features::hash::crc32_update(0, &header_buf[..header_len]); Some(gix_features::hash::crc32_update(state, &compressed)) } else { None }; let compressed = if self.compressed.keep() { Some(compressed) } else { compressed.clear(); self.compressed_buf = Some(compressed); None }; // Last objects gets trailer (which is potentially verified) let trailer = self.try_read_trailer()?; Ok(input::Entry { header: entry.header, header_size: entry.header_size() as u16, compressed, compressed_size, crc32, pack_offset, decompressed_size: bytes_copied, trailer, }) } fn try_read_trailer(&mut self) -> Result, input::Error> { Ok(if self.objects_left == 0 { let mut id = gix_hash::ObjectId::null(self.object_hash); if let Err(err) = 
self.read.read_exact(id.as_mut_slice()) { if self.mode != input::Mode::Restore { return Err(err.into()); } } if let Some(hash) = self.hash.take() { let actual_id = gix_hash::ObjectId::from(hash.digest()); if self.mode == input::Mode::Restore { id = actual_id; } if id != actual_id { return Err(input::Error::ChecksumMismatch { actual: actual_id, expected: id, }); } } Some(id) } else if self.mode == input::Mode::Restore { let hash = self.hash.clone().expect("in restore mode a hash is set"); Some(gix_hash::ObjectId::from(hash.digest())) } else { None }) } } fn read_and_pass_to(read: &mut R, to: W) -> PassThrough<&mut R, W> { PassThrough { read, write: to } } impl Iterator for BytesToEntriesIter where R: io::BufRead, { type Item = Result; fn next(&mut self) -> Option { if self.had_error || self.objects_left == 0 { return None; } let result = self.next_inner(); self.had_error = result.is_err(); if self.had_error { self.objects_left = 0; } if self.mode == input::Mode::Restore && self.had_error { None } else { Some(result) } } fn size_hint(&self) -> (usize, Option) { (self.objects_left as usize, Some(self.objects_left as usize)) } } impl std::iter::ExactSizeIterator for BytesToEntriesIter where R: io::BufRead {} struct PassThrough { read: R, write: W, } impl io::BufRead for PassThrough where Self: io::Read, R: io::BufRead, W: io::Write, { fn fill_buf(&mut self) -> io::Result<&[u8]> { self.read.fill_buf() } fn consume(&mut self, amt: usize) { let buf = self .read .fill_buf() .expect("never fail as we called fill-buf before and this does nothing"); self.write .write_all(&buf[..amt]) .expect("a write to never fail - should be a memory buffer"); self.read.consume(amt); } } impl io::Read for PassThrough where W: io::Write, R: io::Read, { fn read(&mut self, buf: &mut [u8]) -> io::Result { let bytes_read = self.read.read(buf)?; self.write.write_all(&buf[..bytes_read])?; Ok(bytes_read) } } impl crate::data::File { /// Returns an iterator over [`Entries`][crate::data::input::Entry], without making use of the memory mapping. pub fn streaming_iter(&self) -> Result, input::Error> { let reader = io::BufReader::with_capacity(4096 * 8, fs::File::open(&self.path)?); BytesToEntriesIter::new_from_header( reader, input::Mode::Verify, input::EntryDataMode::KeepAndCrc32, self.object_hash, ) } } /// The boxed variant is faster for what we do (moving the decompressor in and out a lot) pub struct DecompressRead<'a, R> { /// The reader from which bytes should be decompressed. pub inner: R, /// The decompressor doing all the work. pub decompressor: &'a mut Decompress, } impl io::Read for DecompressRead<'_, R> where R: io::BufRead, { fn read(&mut self, into: &mut [u8]) -> io::Result { gix_features::zlib::stream::inflate::read(&mut self.inner, self.decompressor, into) } } /// A utility to automatically generate a hash while writing into an inner writer. pub struct HashWrite<'a, T> { /// The hash implementation. pub hash: &'a mut Hasher, /// The inner writer. 
pub inner: T, } impl std::io::Write for HashWrite<'_, T> where T: std::io::Write, { fn write(&mut self, buf: &[u8]) -> std::io::Result { let written = self.inner.write(buf)?; self.hash.update(&buf[..written]); Ok(written) } fn flush(&mut self) -> std::io::Result<()> { self.inner.flush() } } gix-pack-0.56.0/src/data/input/entries_to_bytes.rs000064400000000000000000000130041046102023000202030ustar 00000000000000use std::iter::Peekable; use gix_features::hash; use crate::data::input; /// An implementation of [`Iterator`] to write [encoded entries][input::Entry] to an inner implementation each time /// `next()` is called. /// /// It is able to deal with an unknown amount of objects as it will rewrite the pack header once the entries iterator /// is depleted and compute the hash in one go by re-reading the whole file. pub struct EntriesToBytesIter { /// An iterator for input [`input::Entry`] instances pub input: Peekable, /// A way of writing encoded bytes. output: W, /// Our trailing hash when done writing all input entries trailer: Option, /// The amount of objects in the iteration and the version of the packfile to be written. /// Will be `None` to signal the header was written already. data_version: crate::data::Version, /// The amount of entries seen so far num_entries: u32, /// If we are done, no additional writes will occur is_done: bool, /// The kind of hash to use for the digest object_hash: gix_hash::Kind, } impl EntriesToBytesIter where I: Iterator>, W: std::io::Read + std::io::Write + std::io::Seek, { /// Create a new instance reading [entries][input::Entry] from an `input` iterator and write pack data bytes to /// `output` writer, resembling a pack of `version`. The amount of entries will be dynamically determined and /// the pack is completed once the last entry was written. /// `object_hash` is the kind of hash to use for the pack checksum and maybe other places, depending on the version. /// /// # Panics /// /// Not all combinations of `object_hash` and `version` are supported currently triggering assertion errors. pub fn new(input: I, output: W, version: crate::data::Version, object_hash: gix_hash::Kind) -> Self { assert!( matches!(version, crate::data::Version::V2), "currently only pack version 2 can be written", ); assert!( matches!(object_hash, gix_hash::Kind::Sha1), "currently only Sha1 is supported, right now we don't know how other hashes are encoded", ); EntriesToBytesIter { input: input.peekable(), output, object_hash, num_entries: 0, trailer: None, data_version: version, is_done: false, } } /// Returns the trailing hash over all ~ entries once done. /// It's `None` if we are not yet done writing. pub fn digest(&self) -> Option { self.trailer } fn next_inner(&mut self, entry: input::Entry) -> Result { if self.num_entries == 0 { let header_bytes = crate::data::header::encode(self.data_version, 0); self.output.write_all(&header_bytes[..])?; } self.num_entries += 1; entry.header.write_to(entry.decompressed_size, &mut self.output)?; self.output.write_all( entry .compressed .as_deref() .expect("caller must configure generator to keep compressed bytes"), )?; Ok(entry) } fn write_header_and_digest(&mut self, last_entry: Option<&mut input::Entry>) -> Result<(), input::Error> { let header_bytes = crate::data::header::encode(self.data_version, self.num_entries); let num_bytes_written = if last_entry.is_some() { self.output.stream_position()? 
} else { header_bytes.len() as u64 }; self.output.rewind()?; self.output.write_all(&header_bytes[..])?; self.output.flush()?; self.output.rewind()?; let interrupt_never = std::sync::atomic::AtomicBool::new(false); let digest = hash::bytes( &mut self.output, num_bytes_written, self.object_hash, &mut gix_features::progress::Discard, &interrupt_never, )?; self.output.write_all(digest.as_slice())?; self.output.flush()?; self.is_done = true; if let Some(last_entry) = last_entry { last_entry.trailer = Some(digest); } self.trailer = Some(digest); Ok(()) } } impl Iterator for EntriesToBytesIter where I: Iterator>, W: std::io::Read + std::io::Write + std::io::Seek, { /// The amount of bytes written to `out` if `Ok` or the error `E` received from the input. type Item = Result; fn next(&mut self) -> Option { if self.is_done { return None; } match self.input.next() { Some(res) => Some(match res { Ok(entry) => self.next_inner(entry).and_then(|mut entry| { if self.input.peek().is_none() { self.write_header_and_digest(Some(&mut entry)).map(|_| entry) } else { Ok(entry) } }), Err(err) => { self.is_done = true; Err(err) } }), None => match self.write_header_and_digest(None) { Ok(_) => None, Err(err) => Some(Err(err)), }, } } fn size_hint(&self) -> (usize, Option) { self.input.size_hint() } } gix-pack-0.56.0/src/data/input/entry.rs000064400000000000000000000046361046102023000157760ustar 00000000000000use std::io::Write; use crate::data::{entry::Header, input}; impl input::Entry { /// Create a new input entry from a given data `obj` set to be placed at the given `pack_offset`. /// /// This method is useful when arbitrary base entries are created pub fn from_data_obj(obj: &gix_object::Data<'_>, pack_offset: u64) -> Result { let header = to_header(obj.kind); let compressed = compress_data(obj)?; let compressed_size = compressed.len() as u64; let mut entry = input::Entry { header, header_size: header.size(obj.data.len() as u64) as u16, pack_offset, compressed: Some(compressed), compressed_size, crc32: None, decompressed_size: obj.data.len() as u64, trailer: None, }; entry.crc32 = Some(entry.compute_crc32()); Ok(entry) } /// The amount of bytes this entry may consume in a pack data file pub fn bytes_in_pack(&self) -> u64 { u64::from(self.header_size) + self.compressed_size } /// Update our CRC value by recalculating it from our header and compressed data. 
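///
/// For instance, an entry created with `Entry::from_data_obj()` stores exactly this value
/// (sketch only, assuming `payload: &[u8]` holds the raw, undeltified object bytes):
///
/// ```ignore
/// use gix_pack::data::input;
///
/// let obj = gix_object::Data { kind: gix_object::Kind::Blob, data: payload };
/// let entry = input::Entry::from_data_obj(&obj, 0 /* pack offset */)?;
/// assert_eq!(entry.crc32, Some(entry.compute_crc32()));
/// ```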
pub fn compute_crc32(&self) -> u32 { let mut header_buf = [0u8; 12 + gix_hash::Kind::longest().len_in_bytes()]; let header_len = self .header .write_to(self.decompressed_size, &mut header_buf.as_mut()) .expect("write to memory will not fail"); let state = gix_features::hash::crc32_update(0, &header_buf[..header_len]); gix_features::hash::crc32_update(state, self.compressed.as_ref().expect("we always set it")) } } fn to_header(kind: gix_object::Kind) -> Header { use gix_object::Kind::*; match kind { Tree => Header::Tree, Blob => Header::Blob, Commit => Header::Commit, Tag => Header::Tag, } } fn compress_data(obj: &gix_object::Data<'_>) -> Result, input::Error> { let mut out = gix_features::zlib::stream::deflate::Write::new(Vec::new()); if let Err(err) = std::io::copy(&mut &*obj.data, &mut out) { match err.kind() { std::io::ErrorKind::Other => return Err(input::Error::Io(err)), err => { unreachable!("Should never see other errors than zlib, but got {:?}", err,) } } }; out.flush().expect("zlib flush should never fail"); Ok(out.into_inner()) } gix-pack-0.56.0/src/data/input/lookup_ref_delta_objects.rs000064400000000000000000000234351046102023000216620ustar 00000000000000use gix_hash::ObjectId; use crate::data::{entry::Header, input}; /// An iterator to resolve thin packs on the fly. pub struct LookupRefDeltaObjectsIter { /// The inner iterator whose entries we will resolve. pub inner: I, lookup: Find, /// The cached delta to provide next time we are called, it's the delta to go with the base we just resolved in its place. next_delta: Option, /// Fuse to stop iteration after first missing object. error: bool, /// The overall pack-offset we accumulated thus far. Each inserted entry offsets all following /// objects by its length. We need to determine exactly where the object was inserted to see if its affected at all. inserted_entry_length_at_offset: Vec, /// The sum of all entries added so far, as a cache to avoid recomputation inserted_entries_length_in_bytes: i64, buf: Vec, } impl LookupRefDeltaObjectsIter where I: Iterator>, Find: gix_object::Find, { /// Create a new instance wrapping `iter` and using `lookup` as function to retrieve objects that will serve as bases /// for ref deltas seen while traversing `iter`. pub fn new(iter: I, lookup: Find) -> Self { LookupRefDeltaObjectsIter { inner: iter, lookup, error: false, inserted_entry_length_at_offset: Vec::new(), inserted_entries_length_in_bytes: 0, next_delta: None, buf: Vec::new(), } } fn shifted_pack_offset(&self, pack_offset: u64) -> u64 { let new_ofs = pack_offset as i64 + self.inserted_entries_length_in_bytes; new_ofs.try_into().expect("offset value is never becomes negative") } /// positive `size_change` values mean an object grew or was more commonly, was inserted. Negative values /// mean the object shrunk, usually because there header changed from ref-deltas to ofs deltas. fn track_change(&mut self, shifted_pack_offset: u64, pack_offset: u64, size_change: i64, oid: Option) { if size_change == 0 { return; } self.inserted_entry_length_at_offset.push(Change { shifted_pack_offset, pack_offset, size_change_in_bytes: size_change, oid: oid.unwrap_or_else(|| // NOTE: this value acts as sentinel and the actual hash kind doesn't matter. 
gix_hash::Kind::Sha1.null()), }); self.inserted_entries_length_in_bytes += size_change; } fn shift_entry_and_point_to_base_by_offset(&mut self, entry: &mut input::Entry, base_distance: u64) { let pack_offset = entry.pack_offset; entry.pack_offset = self.shifted_pack_offset(pack_offset); entry.header = Header::OfsDelta { base_distance }; let previous_header_size = entry.header_size; entry.header_size = entry.header.size(entry.decompressed_size) as u16; let change = i64::from(entry.header_size) - i64::from(previous_header_size); entry.crc32 = Some(entry.compute_crc32()); self.track_change(entry.pack_offset, pack_offset, change, None); } } impl Iterator for LookupRefDeltaObjectsIter where I: Iterator>, Find: gix_object::Find, { type Item = Result; fn next(&mut self) -> Option { if self.error { return None; } if let Some(delta) = self.next_delta.take() { return Some(Ok(delta)); } match self.inner.next() { Some(Ok(mut entry)) => match entry.header { Header::RefDelta { base_id } => { match self.inserted_entry_length_at_offset.iter().rfind(|e| e.oid == base_id) { None => { let base_entry = match self.lookup.try_find(&base_id, &mut self.buf).ok()? { Some(obj) => { let current_pack_offset = entry.pack_offset; let mut entry = match input::Entry::from_data_obj(&obj, 0) { Ok(e) => e, Err(err) => return Some(Err(err)), }; entry.pack_offset = self.shifted_pack_offset(current_pack_offset); self.track_change( entry.pack_offset, current_pack_offset, entry.bytes_in_pack() as i64, Some(base_id), ); entry } None => { self.error = true; return Some(Err(input::Error::NotFound { object_id: base_id })); } }; { self.shift_entry_and_point_to_base_by_offset(&mut entry, base_entry.bytes_in_pack()); self.next_delta = Some(entry); } Some(Ok(base_entry)) } Some(base_entry) => { let base_distance = self.shifted_pack_offset(entry.pack_offset) - base_entry.shifted_pack_offset; self.shift_entry_and_point_to_base_by_offset(&mut entry, base_distance); Some(Ok(entry)) } } } _ => { if self.inserted_entries_length_in_bytes != 0 { if let Header::OfsDelta { base_distance } = entry.header { // We have to find the new distance based on the previous distance to the base, using the absolute // pack offset computed from it as stored in `base_pack_offset`. let base_pack_offset = entry .pack_offset .checked_sub(base_distance) .expect("distance to be in range of pack"); match self .inserted_entry_length_at_offset .binary_search_by_key(&base_pack_offset, |c| c.pack_offset) { Ok(index) => { let index = { let maybe_index_of_actual_entry = index + 1; self.inserted_entry_length_at_offset .get(maybe_index_of_actual_entry) .and_then(|c| { (c.pack_offset == base_pack_offset) .then_some(maybe_index_of_actual_entry) }) .unwrap_or(index) }; let new_distance = self .shifted_pack_offset(entry.pack_offset) .checked_sub(self.inserted_entry_length_at_offset[index].shifted_pack_offset) .expect("a base that is behind us in the pack"); self.shift_entry_and_point_to_base_by_offset(&mut entry, new_distance); } Err(index) => { let change_since_offset = self.inserted_entry_length_at_offset[index..] .iter() .map(|c| c.size_change_in_bytes) .sum::(); let new_distance: u64 = { (base_distance as i64 + change_since_offset) .try_into() .expect("it still points behind us") }; self.shift_entry_and_point_to_base_by_offset(&mut entry, new_distance); } } } else { // Offset this entry by all changes (positive or negative) that we saw thus far. 
entry.pack_offset = self.shifted_pack_offset(entry.pack_offset); } } Some(Ok(entry)) } }, other => other, } } fn size_hint(&self) -> (usize, Option) { let (min, max) = self.inner.size_hint(); max.map_or_else(|| (min * 2, None), |max| (min, Some(max * 2))) } } #[derive(Debug)] struct Change { /// The original pack offset as mentioned in the entry we saw. This is used to find this as base object if deltas refer to it by /// old offset. pack_offset: u64, /// The new pack offset that is the shifted location of the pack entry in the pack. shifted_pack_offset: u64, /// The size change of the entry header, negative values denote shrinking, positive denote growing. size_change_in_bytes: i64, /// The object id of the entry responsible for the change, or null if it's an entry just for tracking an insertion. oid: ObjectId, } gix-pack-0.56.0/src/data/input/mod.rs000064400000000000000000000034441046102023000154100ustar 00000000000000/// An item of the iteration produced by [`BytesToEntriesIter`] #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Entry { /// The header of a pack entry pub header: crate::data::entry::Header, /// The amount of bytes used to encode the `header`. `pack_offset + header_size` is the beginning of /// the compressed data in the pack. pub header_size: u16, /// The first byte of the entry at which the `header` can be read. pub pack_offset: u64, /// The bytes consumed while producing `decompressed` /// These do not contain the header, which makes it possible to easily replace a RefDelta with offset deltas /// when resolving thin packs. /// Depends on `CompressionMode` when the iterator is initialized. pub compressed: Option>, /// The amount of bytes the compressed portion of the entry takes, i.e. the portion behind behind the header. pub compressed_size: u64, /// The CRC32 over the complete entry, that is encoded header and compressed object data. /// Depends on `CompressionMode` when the iterator is initialized pub crc32: Option, /// The amount of decompressed bytes of the entry. pub decompressed_size: u64, /// Set for the last object in the iteration, providing the hash over all bytes of the iteration /// for use as trailer in a pack or to verify it matches the trailer. pub trailer: Option, } mod entry; mod types; pub use types::{EntryDataMode, Error, Mode}; mod bytes_to_entries; pub use bytes_to_entries::BytesToEntriesIter; mod lookup_ref_delta_objects; pub use lookup_ref_delta_objects::LookupRefDeltaObjectsIter; mod entries_to_bytes; pub use entries_to_bytes::EntriesToBytesIter; gix-pack-0.56.0/src/data/input/types.rs000064400000000000000000000060171046102023000157740ustar 00000000000000use std::io; /// Returned by [`BytesToEntriesIter::new_from_header()`][crate::data::input::BytesToEntriesIter::new_from_header()] and as part /// of `Item` of [`BytesToEntriesIter`][crate::data::input::BytesToEntriesIter]. 
#[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("An IO operation failed while streaming an entry")] Io(#[from] io::Error), #[error(transparent)] PackParse(#[from] crate::data::header::decode::Error), #[error("pack checksum in trailer was {expected}, but actual checksum was {actual}")] ChecksumMismatch { expected: gix_hash::ObjectId, actual: gix_hash::ObjectId, }, #[error("pack is incomplete: it was decompressed into {actual} bytes but {expected} bytes were expected.")] IncompletePack { actual: u64, expected: u64 }, #[error("The object {object_id} could not be decoded or wasn't found")] NotFound { object_id: gix_hash::ObjectId }, } /// Iteration Mode #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum Mode { /// Provide the trailer as read from the pack AsIs, /// Generate an own hash and trigger an error on the last iterated object /// if it does not match the hash provided with the pack. /// /// This way the one iterating the data cannot miss corruption as long as /// the iteration is continued through to the end. Verify, /// Generate an own hash and if there was an error or the objects are depleted early /// due to partial packs, return the last valid entry along with our own hash thus far. /// Note that the existing pack hash, if present, will be ignored. /// As we won't know which object fails, every object will have the hash obtained thus far. /// This also means that algorithms must know about this possibility, or else might wrongfully /// assume the pack is finished. Restore, } /// Define what to do with the compressed bytes portion of a pack [`Entry`][super::Entry] #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum EntryDataMode { /// Do nothing with the compressed bytes we read Ignore, /// Only create a CRC32 of the entry, otherwise similar to `Ignore` Crc32, /// Keep them and pass them along in a newly allocated buffer Keep, /// As above, but also compute a CRC32 KeepAndCrc32, } impl EntryDataMode { /// Returns true if a crc32 should be computed pub fn crc32(&self) -> bool { match self { EntryDataMode::KeepAndCrc32 | EntryDataMode::Crc32 => true, EntryDataMode::Keep | EntryDataMode::Ignore => false, } } /// Returns true if compressed bytes should be kept pub fn keep(&self) -> bool { match self { EntryDataMode::Keep | EntryDataMode::KeepAndCrc32 => true, EntryDataMode::Ignore | EntryDataMode::Crc32 => false, } } } gix-pack-0.56.0/src/data/mod.rs000064400000000000000000000107231046102023000142470ustar 00000000000000//! a pack data file use std::path::Path; /// The offset to an entry into the pack data file, relative to its beginning. pub type Offset = u64; /// An identifier to uniquely identify all packs loaded within a known context or namespace. pub type Id = u32; use memmap2::Mmap; /// An entry representing a full- or delta-object within a pack #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Entry { /// The entry's header pub header: entry::Header, /// The decompressed size of the entry in bytes. /// /// Note that for non-delta entries this will be the size of the object itself.
pub decompressed_size: u64, /// absolute offset to compressed object data in the pack, just behind the entry's header pub data_offset: Offset, } mod file; pub use file::{decode, verify, Header}; /// pub mod header; /// pub mod init { pub use super::header::decode::Error; } /// pub mod entry; /// #[cfg(feature = "streaming-input")] pub mod input; /// Utilities to encode pack data entries and write them to a `Write` implementation to resemble a pack data file. #[cfg(feature = "generate")] pub mod output; /// A slice into a pack file denoting a pack entry. /// /// An entry can be decoded into an object. pub type EntryRange = std::ops::Range; /// Supported versions of a pack data file #[derive(Default, PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[allow(missing_docs)] pub enum Version { #[default] V2, V3, } /// A pack data file pub struct File { data: Mmap, path: std::path::PathBuf, /// A value to represent this pack uniquely when used with cache lookup, or a way to identify this pack by its location on disk. /// The same location on disk should yield the same id. /// /// These must be unique per pack and must be stable, that is they don't change if the pack doesn't change. /// If the same id is assigned (or reassigned) to different packs, pack creation or cache access will fail in hard-to-debug ways. /// /// This value is controlled by the owning object store, which can use it in whichever way it wants as long as the above constraints are met. pub id: Id, version: Version, num_objects: u32, /// The size of the hash contained within. This is entirely determined by the caller, and repositories have to know which hash to use /// based on their configuration. hash_len: usize, object_hash: gix_hash::Kind, } /// Information about the pack data file itself impl File { /// The pack data version of this file pub fn version(&self) -> Version { self.version } /// The number of objects stored in this pack data file pub fn num_objects(&self) -> u32 { self.num_objects } /// The length of all mapped data, including the pack header and the pack trailer pub fn data_len(&self) -> usize { self.data.len() } /// The kind of hash we use internally. pub fn object_hash(&self) -> gix_hash::Kind { self.object_hash } /// The position of the byte one past the last pack entry, or in other terms, the first byte of the trailing hash. pub fn pack_end(&self) -> usize { self.data.len() - self.hash_len } /// The path to the pack data file on disk pub fn path(&self) -> &Path { &self.path } /// Returns the pack data at the given slice if its range is contained in the mapped pack data pub fn entry_slice(&self, slice: EntryRange) -> Option<&[u8]> { let entry_end: usize = slice.end.try_into().expect("end of pack fits into usize"); let entry_start = slice.start as usize; self.data.get(entry_start..entry_end) } /// Returns the CRC32 of the pack data indicated by `pack_offset` and the `size` of the mapped data. /// /// _Note:_ finding the right size is only possible by decompressing /// the pack entry beforehand, or by using the (to be sorted) offsets stored in an index file. /// /// # Panics /// /// If `pack_offset` or `size` are pointing to a range outside of the mapped pack data. 
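///
/// Sketch of a call (not compiled: `pack` is an open [`File`], and `entry_size` would typically be derived from a pack index):
///
/// ```ignore
/// // The first entry of a pack starts right behind the 12-byte header.
/// let crc = pack.entry_crc32(12, entry_size);
/// ```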
pub fn entry_crc32(&self, pack_offset: Offset, size: usize) -> u32 { let pack_offset: usize = pack_offset.try_into().expect("pack_size fits into usize"); gix_features::hash::crc32(&self.data[pack_offset..pack_offset + size]) } } pub(crate) mod delta; gix-pack-0.56.0/src/data/output/bytes.rs000064400000000000000000000140651046102023000161610ustar 00000000000000use std::io::Write; use gix_features::hash; use crate::data::output; use crate::exact_vec; /// The error returned by `next()` in the [`FromEntriesIter`] iterator. #[allow(missing_docs)] #[derive(Debug, thiserror::Error)] pub enum Error where E: std::error::Error + 'static, { #[error(transparent)] Io(#[from] std::io::Error), #[error(transparent)] Input(E), } /// An implementation of [`Iterator`] to write [encoded entries][output::Entry] to an inner implementation each time /// `next()` is called. pub struct FromEntriesIter { /// An iterator for input [`output::Entry`] instances pub input: I, /// A way of writing encoded bytes. output: hash::Write, /// Our trailing hash when done writing all input entries trailer: Option, /// The amount of objects in the iteration and the version of the packfile to be written. /// Will be `None` to signal the header was written already. header_info: Option<(crate::data::Version, u32)>, /// The pack data version with which pack entries should be written. entry_version: crate::data::Version, /// The amount of written bytes thus far written: u64, /// Required to quickly find offsets by object IDs, as future objects may refer to those in the past to become a delta offset base. /// It stores the pack offsets at which objects begin. /// Additionally we store if an object was invalid, and if so we will not write it nor will we allow delta objects to it. pack_offsets_and_validity: Vec<(u64, bool)>, /// If we are done, no additional writes will occur is_done: bool, } impl FromEntriesIter where I: Iterator, E>>, W: std::io::Write, E: std::error::Error + 'static, { /// Create a new instance reading [entries][output::Entry] from an `input` iterator and write pack data bytes to /// `output` writer, resembling a pack of `version` with exactly `num_entries` amount of objects contained in it. /// `object_hash` is the kind of hash to use for the pack checksum and maybe other places, depending on the version. /// /// The input chunks are expected to be sorted already. You can use the [`InOrderIter`][gix_features::parallel::InOrderIter] to assure /// this happens on the fly holding entire chunks in memory as long as needed for them to be dispensed in order. /// /// # Panics /// /// Not all combinations of `object_hash` and `version` are supported currently triggering assertion errors. pub fn new( input: I, output: W, num_entries: u32, version: crate::data::Version, object_hash: gix_hash::Kind, ) -> Self { assert!( matches!(version, crate::data::Version::V2), "currently only pack version 2 can be written", ); FromEntriesIter { input, output: hash::Write::new(output, object_hash), trailer: None, entry_version: version, pack_offsets_and_validity: exact_vec(num_entries as usize), written: 0, header_info: Some((version, num_entries)), is_done: false, } } /// Consume this instance and return the `output` implementation. /// /// _Note_ that the `input` iterator can be moved out of this instance beforehand. pub fn into_write(self) -> W { self.output.inner } /// Returns the trailing hash over all written entries once done. /// It's `None` if we are not yet done writing. 
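///
/// A sketch of the intended flow (not compiled: `entries` and `num_entries` are assumed inputs):
///
/// ```ignore
/// let mut out = Vec::new();
/// let mut iter = FromEntriesIter::new(entries, &mut out, num_entries, gix_pack::data::Version::V2, gix_hash::Kind::Sha1);
/// assert!(iter.digest().is_none());
/// for bytes_written in iter.by_ref() {
///     bytes_written.expect("writing to an in-memory buffer to succeed");
/// }
/// let trailer = iter.digest().expect("set once all entries and the trailer were written");
/// ```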
pub fn digest(&self) -> Option { self.trailer } fn next_inner(&mut self) -> Result> { let previous_written = self.written; if let Some((version, num_entries)) = self.header_info.take() { let header_bytes = crate::data::header::encode(version, num_entries); self.output.write_all(&header_bytes[..])?; self.written += header_bytes.len() as u64; } match self.input.next() { Some(entries) => { for entry in entries.map_err(Error::Input)? { if entry.is_invalid() { self.pack_offsets_and_validity.push((0, false)); continue; }; self.pack_offsets_and_validity.push((self.written, true)); let header = entry.to_entry_header(self.entry_version, |index| { let (base_offset, is_valid_object) = self.pack_offsets_and_validity[index]; if !is_valid_object { unreachable!("if you see this the object database is correct as a delta refers to a non-existing object") } self.written - base_offset }); self.written += header.write_to(entry.decompressed_size as u64, &mut self.output)? as u64; self.written += std::io::copy(&mut &*entry.compressed_data, &mut self.output)?; } } None => { let digest = self.output.hash.clone().digest(); self.output.inner.write_all(&digest[..])?; self.written += digest.len() as u64; self.output.inner.flush()?; self.is_done = true; self.trailer = Some(gix_hash::ObjectId::from(digest)); } }; Ok(self.written - previous_written) } } impl Iterator for FromEntriesIter where I: Iterator, E>>, W: std::io::Write, E: std::error::Error + 'static, { /// The amount of bytes written to `out` if `Ok` or the error `E` received from the input. type Item = Result>; fn next(&mut self) -> Option { if self.is_done { return None; } Some(match self.next_inner() { Err(err) => { self.is_done = true; Err(err) } Ok(written) => Ok(written), }) } } gix-pack-0.56.0/src/data/output/count/mod.rs000064400000000000000000000031511046102023000167340ustar 00000000000000use gix_hash::ObjectId; use crate::data::output::Count; /// Specifies how the pack location was handled during counting #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum PackLocation { /// We did not lookup this object NotLookedUp, /// The object was looked up and there may be a location in a pack, along with entry information LookedUp(Option), } impl PackLocation { /// Directly go through to `LookedUp` variant, panic otherwise pub fn is_none(&self) -> bool { match self { PackLocation::LookedUp(opt) => opt.is_none(), PackLocation::NotLookedUp => unreachable!("must have been resolved"), } } /// Directly go through to `LookedUp` variant, panic otherwise pub fn as_ref(&self) -> Option<&crate::data::entry::Location> { match self { PackLocation::LookedUp(opt) => opt.as_ref(), PackLocation::NotLookedUp => unreachable!("must have been resolved"), } } } impl Count { /// Create a new instance from the given `oid` and its corresponding location. 
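///
/// For example (illustrative, as doctests are disabled for this crate):
///
/// ```no_run
/// use gix_pack::data::output::Count;
///
/// let id = gix_hash::ObjectId::null(gix_hash::Kind::Sha1);
/// let count = Count::from_data(id, None);
/// assert!(count.entry_pack_location.is_none()); // no pack entry location was provided
/// ```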
pub fn from_data(oid: impl Into, location: Option) -> Self { Count { id: oid.into(), entry_pack_location: PackLocation::LookedUp(location), } } } #[path = "objects/mod.rs"] mod objects_impl; pub use objects_impl::{objects, objects_unthreaded}; /// pub mod objects { pub use super::objects_impl::{Error, ObjectExpansion, Options, Outcome}; } gix-pack-0.56.0/src/data/output/count/objects/mod.rs000064400000000000000000000430321046102023000203670ustar 00000000000000use std::{cell::RefCell, sync::atomic::AtomicBool}; use gix_features::parallel; use gix_hash::ObjectId; use crate::data::output; pub(in crate::data::output::count::objects_impl) mod reduce; mod util; mod types; pub use types::{Error, ObjectExpansion, Options, Outcome}; mod tree; /// Generate [`Count`][output::Count]s from input `objects` with object expansion based on [`options`][Options] /// to learn which objects would would constitute a pack. This step is required to know exactly how many objects would /// be in a pack while keeping data around to avoid minimize object database access. /// /// A [`Count`][output::Count] object maintains enough state to greatly accelerate future access of packed objects. /// /// * `db` - the object store to use for accessing objects. /// * `objects_ids` /// * A list of objects ids to add to the pack. Duplication checks are performed so no object is ever added to a pack twice. /// * Objects may be expanded based on the provided [`options`][Options] /// * `objects` /// * count the amount of objects we encounter /// * `should_interrupt` /// * A flag that is set to true if the operation should stop /// * `options` /// * more configuration pub fn objects( db: Find, objects_ids: Box>> + Send>, objects: &dyn gix_features::progress::Count, should_interrupt: &AtomicBool, Options { thread_limit, input_object_expansion, chunk_size, }: Options, ) -> Result<(Vec, Outcome), Error> where Find: crate::Find + Send + Clone, { let lower_bound = objects_ids.size_hint().0; let (chunk_size, thread_limit, _) = parallel::optimize_chunk_size_and_thread_limit( chunk_size, if lower_bound == 0 { None } else { Some(lower_bound) }, thread_limit, None, ); let chunks = gix_features::iter::Chunks { inner: objects_ids, size: chunk_size, }; let seen_objs = gix_hashtable::sync::ObjectIdMap::default(); let objects = objects.counter(); parallel::in_parallel( chunks, thread_limit, { move |_| { ( Vec::new(), // object data buffer Vec::new(), // object data buffer 2 to hold two objects at a time objects.clone(), ) } }, { let seen_objs = &seen_objs; move |oids: Vec<_>, (buf1, buf2, objects)| { expand::this( &db, input_object_expansion, seen_objs, &mut oids.into_iter(), buf1, buf2, objects, should_interrupt, true, /*allow pack lookups*/ ) } }, reduce::Statistics::new(), ) } /// Like [`objects()`] but using a single thread only to mostly save on the otherwise required overhead. 
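///
/// A sketch of a call (not compiled: `db` is an assumed object store implementing this crate's `Find` trait, `ids` a `Vec` of object ids):
///
/// ```ignore
/// use std::sync::atomic::AtomicBool;
/// use gix_pack::data::output::count;
///
/// let mut input = ids
///     .into_iter()
///     .map(Ok::<_, Box<dyn std::error::Error + Send + Sync + 'static>>);
/// let (counts, outcome) = count::objects_unthreaded(
///     &db,
///     &mut input,
///     &gix_features::progress::Discard,
///     &AtomicBool::new(false),
///     count::objects::ObjectExpansion::TreeContents,
/// )?;
/// assert_eq!(counts.len(), outcome.total_objects);
/// ```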
pub fn objects_unthreaded( db: &dyn crate::Find, object_ids: &mut dyn Iterator>>, objects: &dyn gix_features::progress::Count, should_interrupt: &AtomicBool, input_object_expansion: ObjectExpansion, ) -> Result<(Vec, Outcome), Error> { let seen_objs = RefCell::new(gix_hashtable::HashSet::default()); let (mut buf1, mut buf2) = (Vec::new(), Vec::new()); expand::this( db, input_object_expansion, &seen_objs, object_ids, &mut buf1, &mut buf2, &objects.counter(), should_interrupt, false, /*allow pack lookups*/ ) } mod expand { use std::{ cell::RefCell, sync::atomic::{AtomicBool, Ordering}, }; use gix_hash::{oid, ObjectId}; use gix_object::{CommitRefIter, Data, TagRefIter}; use super::{ tree, types::{Error, ObjectExpansion, Outcome}, util, }; use crate::{ data::{output, output::count::PackLocation}, FindExt, }; #[allow(clippy::too_many_arguments)] pub fn this( db: &dyn crate::Find, input_object_expansion: ObjectExpansion, seen_objs: &impl util::InsertImmutable, oids: &mut dyn Iterator>>, buf1: &mut Vec, #[allow(clippy::ptr_arg)] buf2: &mut Vec, objects: &gix_features::progress::AtomicStep, should_interrupt: &AtomicBool, allow_pack_lookups: bool, ) -> Result<(Vec, Outcome), Error> { use ObjectExpansion::*; let mut out = Vec::new(); let mut tree_traversal_state = gix_traverse::tree::breadthfirst::State::default(); let mut tree_diff_state = gix_diff::tree::State::default(); let mut parent_commit_ids = Vec::new(); let mut traverse_delegate = tree::traverse::AllUnseen::new(seen_objs); let mut changes_delegate = tree::changes::AllNew::new(seen_objs); let mut outcome = Outcome::default(); let stats = &mut outcome; for id in oids { if should_interrupt.load(Ordering::Relaxed) { return Err(Error::Interrupted); } let id = id.map_err(Error::InputIteration)?; let (obj, location) = db.find(&id, buf1)?; stats.input_objects += 1; match input_object_expansion { TreeAdditionsComparedToAncestor => { use gix_object::Kind::*; let mut obj = obj; let mut location = location; let mut id = id.to_owned(); loop { push_obj_count_unique(&mut out, seen_objs, &id, location, objects, stats, false); match obj.kind { Tree | Blob => break, Tag => { id = TagRefIter::from_bytes(obj.data) .target_id() .expect("every tag has a target"); let tmp = db.find(&id, buf1)?; obj = tmp.0; location = tmp.1; stats.expanded_objects += 1; continue; } Commit => { let current_tree_iter = { let mut commit_iter = CommitRefIter::from_bytes(obj.data); let tree_id = commit_iter.tree_id().expect("every commit has a tree"); parent_commit_ids.clear(); for token in commit_iter { match token { Ok(gix_object::commit::ref_iter::Token::Parent { id }) => { parent_commit_ids.push(id); } Ok(_) => break, Err(err) => return Err(Error::CommitDecode(err)), } } let (obj, location) = db.find(&tree_id, buf1)?; push_obj_count_unique( &mut out, seen_objs, &tree_id, location, objects, stats, true, ); gix_object::TreeRefIter::from_bytes(obj.data) }; let objects_ref = if parent_commit_ids.is_empty() { traverse_delegate.clear(); let objects = ExpandedCountingObjects::new(db, out, objects); gix_traverse::tree::breadthfirst( current_tree_iter, &mut tree_traversal_state, &objects, &mut traverse_delegate, ) .map_err(Error::TreeTraverse)?; out = objects.dissolve(stats); &traverse_delegate.non_trees } else { for commit_id in &parent_commit_ids { let parent_tree_id = { let (parent_commit_obj, location) = db.find(commit_id, buf2)?; push_obj_count_unique( &mut out, seen_objs, commit_id, location, objects, stats, true, ); CommitRefIter::from_bytes(parent_commit_obj.data) .tree_id() 
.expect("every commit has a tree") }; let parent_tree = { let (parent_tree_obj, location) = db.find(&parent_tree_id, buf2)?; push_obj_count_unique( &mut out, seen_objs, &parent_tree_id, location, objects, stats, true, ); gix_object::TreeRefIter::from_bytes(parent_tree_obj.data) }; changes_delegate.clear(); let objects = CountingObjects::new(db); gix_diff::tree( parent_tree, current_tree_iter, &mut tree_diff_state, &objects, &mut changes_delegate, ) .map_err(Error::TreeChanges)?; stats.decoded_objects += objects.into_count(); } &changes_delegate.objects }; for id in objects_ref.iter() { out.push(id_to_count(db, buf2, id, objects, stats, allow_pack_lookups)); } break; } } } } TreeContents => { use gix_object::Kind::*; let mut id = id; let mut obj = (obj, location); loop { push_obj_count_unique(&mut out, seen_objs, &id, obj.1.clone(), objects, stats, false); match obj.0.kind { Tree => { traverse_delegate.clear(); { let objects = ExpandedCountingObjects::new(db, out, objects); gix_traverse::tree::breadthfirst( gix_object::TreeRefIter::from_bytes(obj.0.data), &mut tree_traversal_state, &objects, &mut traverse_delegate, ) .map_err(Error::TreeTraverse)?; out = objects.dissolve(stats); } for id in &traverse_delegate.non_trees { out.push(id_to_count(db, buf1, id, objects, stats, allow_pack_lookups)); } break; } Commit => { id = CommitRefIter::from_bytes(obj.0.data) .tree_id() .expect("every commit has a tree"); stats.expanded_objects += 1; obj = db.find(&id, buf1)?; continue; } Blob => break, Tag => { id = TagRefIter::from_bytes(obj.0.data) .target_id() .expect("every tag has a target"); stats.expanded_objects += 1; obj = db.find(&id, buf1)?; continue; } } } } AsIs => push_obj_count_unique(&mut out, seen_objs, &id, location, objects, stats, false), } } outcome.total_objects = out.len(); Ok((out, outcome)) } #[inline] fn push_obj_count_unique( out: &mut Vec, all_seen: &impl util::InsertImmutable, id: &oid, location: Option, objects: &gix_features::progress::AtomicStep, statistics: &mut Outcome, count_expanded: bool, ) { let inserted = all_seen.insert(id.to_owned()); if inserted { objects.fetch_add(1, Ordering::Relaxed); statistics.decoded_objects += 1; if count_expanded { statistics.expanded_objects += 1; } out.push(output::Count::from_data(id, location)); } } #[inline] fn id_to_count( db: &dyn crate::Find, buf: &mut Vec, id: &oid, objects: &gix_features::progress::AtomicStep, statistics: &mut Outcome, allow_pack_lookups: bool, ) -> output::Count { objects.fetch_add(1, Ordering::Relaxed); statistics.expanded_objects += 1; output::Count { id: id.to_owned(), entry_pack_location: if allow_pack_lookups { PackLocation::LookedUp(db.location_by_oid(id, buf)) } else { PackLocation::NotLookedUp }, } } struct CountingObjects<'a> { decoded_objects: std::cell::RefCell, objects: &'a dyn crate::Find, } impl<'a> CountingObjects<'a> { fn new(objects: &'a dyn crate::Find) -> Self { Self { decoded_objects: Default::default(), objects, } } fn into_count(self) -> usize { self.decoded_objects.into_inner() } } impl gix_object::Find for CountingObjects<'_> { fn try_find<'a>(&self, id: &oid, buffer: &'a mut Vec) -> Result>, gix_object::find::Error> { let res = Ok(self.objects.try_find(id, buffer)?.map(|t| t.0)); *self.decoded_objects.borrow_mut() += 1; res } } struct ExpandedCountingObjects<'a> { decoded_objects: std::cell::RefCell, expanded_objects: std::cell::RefCell, out: std::cell::RefCell>, objects_count: &'a gix_features::progress::AtomicStep, objects: &'a dyn crate::Find, } impl<'a> ExpandedCountingObjects<'a> { fn 
new( objects: &'a dyn crate::Find, out: Vec, objects_count: &'a gix_features::progress::AtomicStep, ) -> Self { Self { decoded_objects: Default::default(), expanded_objects: Default::default(), out: RefCell::new(out), objects_count, objects, } } fn dissolve(self, stats: &mut Outcome) -> Vec { stats.decoded_objects += self.decoded_objects.into_inner(); stats.expanded_objects += self.expanded_objects.into_inner(); self.out.into_inner() } } impl gix_object::Find for ExpandedCountingObjects<'_> { fn try_find<'a>(&self, id: &oid, buffer: &'a mut Vec) -> Result>, gix_object::find::Error> { let maybe_obj = self.objects.try_find(id, buffer)?; *self.decoded_objects.borrow_mut() += 1; match maybe_obj { None => Ok(None), Some((obj, location)) => { self.objects_count.fetch_add(1, Ordering::Relaxed); *self.expanded_objects.borrow_mut() += 1; self.out.borrow_mut().push(output::Count::from_data(id, location)); Ok(Some(obj)) } } } } } gix-pack-0.56.0/src/data/output/count/objects/reduce.rs000064400000000000000000000016551046102023000210640ustar 00000000000000use std::marker::PhantomData; use gix_features::parallel; use super::Outcome; use crate::data::output; pub struct Statistics { total: Outcome, counts: Vec, _err: PhantomData, } impl Statistics { pub fn new() -> Self { Statistics { total: Default::default(), counts: Default::default(), _err: PhantomData, } } } impl parallel::Reduce for Statistics { type Input = Result<(Vec, Outcome), E>; type FeedProduce = (); type Output = (Vec, Outcome); type Error = E; fn feed(&mut self, item: Self::Input) -> Result { let (counts, stats) = item?; self.total.aggregate(stats); self.counts.extend(counts); Ok(()) } fn finalize(self) -> Result { Ok((self.counts, self.total)) } } gix-pack-0.56.0/src/data/output/count/objects/tree.rs000064400000000000000000000064161046102023000205540ustar 00000000000000pub mod changes { use gix_diff::tree::{ visit::{Action, Change}, Visit, }; use gix_hash::ObjectId; use gix_object::bstr::BStr; use crate::data::output::count::objects_impl::util::InsertImmutable; pub struct AllNew<'a, H> { pub objects: Vec, all_seen: &'a H, } impl<'a, H> AllNew<'a, H> where H: InsertImmutable, { pub fn new(all_seen: &'a H) -> Self { AllNew { objects: Default::default(), all_seen, } } pub fn clear(&mut self) { self.objects.clear(); } } impl Visit for AllNew<'_, H> where H: InsertImmutable, { fn pop_front_tracked_path_and_set_current(&mut self) {} fn push_back_tracked_path_component(&mut self, _component: &BStr) {} fn push_path_component(&mut self, _component: &BStr) {} fn pop_path_component(&mut self) {} fn visit(&mut self, change: Change) -> Action { match change { Change::Addition { oid, entry_mode, relation: _, } | Change::Modification { oid, entry_mode, .. } => { if entry_mode.is_commit() { return Action::Continue; } let inserted = self.all_seen.insert(oid); if inserted { self.objects.push(oid); } } Change::Deletion { .. 
} => {} }; Action::Continue } } } pub mod traverse { use gix_hash::ObjectId; use gix_object::{bstr::BStr, tree::EntryRef}; use gix_traverse::tree::{visit::Action, Visit}; use crate::data::output::count::objects_impl::util::InsertImmutable; pub struct AllUnseen<'a, H> { pub non_trees: Vec, all_seen: &'a H, } impl<'a, H> AllUnseen<'a, H> where H: InsertImmutable, { pub fn new(all_seen: &'a H) -> Self { AllUnseen { non_trees: Default::default(), all_seen, } } pub fn clear(&mut self) { self.non_trees.clear(); } } impl Visit for AllUnseen<'_, H> where H: InsertImmutable, { fn pop_front_tracked_path_and_set_current(&mut self) {} fn push_back_tracked_path_component(&mut self, _component: &BStr) {} fn push_path_component(&mut self, _component: &BStr) {} fn pop_path_component(&mut self) {} fn visit_tree(&mut self, entry: &EntryRef<'_>) -> Action { let inserted = self.all_seen.insert(entry.oid.to_owned()); if inserted { Action::Continue } else { Action::Skip } } fn visit_nontree(&mut self, entry: &EntryRef<'_>) -> Action { if entry.mode.is_commit() { return Action::Continue; } let inserted = self.all_seen.insert(entry.oid.to_owned()); if inserted { self.non_trees.push(entry.oid.to_owned()); } Action::Continue } } } gix-pack-0.56.0/src/data/output/count/objects/types.rs000064400000000000000000000105401046102023000207520ustar 00000000000000/// Information gathered during the run of [`iter_from_objects()`][super::objects()]. #[derive(Default, PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Outcome { /// The amount of objects provided to start the iteration. pub input_objects: usize, /// The amount of objects that have been expanded from the input source. /// It's desirable to do that as expansion happens on multiple threads, allowing the amount of input objects to be small. /// `expanded_objects - decoded_objects` is the 'cheap' object we found without decoding the object itself. pub expanded_objects: usize, /// The amount of fully decoded objects. These are the most expensive as they are fully decoded pub decoded_objects: usize, /// The total amount of encountered objects. Should be `expanded_objects + input_objects`. pub total_objects: usize, } impl Outcome { pub(in crate::data::output::count) fn aggregate( &mut self, Outcome { input_objects, decoded_objects, expanded_objects, total_objects, }: Self, ) { self.input_objects += input_objects; self.decoded_objects += decoded_objects; self.expanded_objects += expanded_objects; self.total_objects += total_objects; } } /// The way input objects are handled #[derive(Default, PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum ObjectExpansion { /// Don't do anything with the input objects except for transforming them into pack entries #[default] AsIs, /// If the input object is a Commit then turn it into a pack entry. Additionally obtain its tree, turn it into a pack entry /// along with all of its contents, that is nested trees, and any other objects reachable from it. /// Otherwise, the same as [`AsIs`][ObjectExpansion::AsIs]. /// /// This mode is useful if all reachable objects should be added, as in cloning a repository. TreeContents, /// If the input is a commit, obtain its ancestors and turn them into pack entries. Obtain the ancestor trees along with the commits /// tree and turn them into pack entries. 
Finally obtain the added/changed objects when comparing the ancestor trees with the /// current tree and turn them into entries as well. /// Otherwise, the same as [`AsIs`][ObjectExpansion::AsIs]. /// /// This mode is useful to build a pack containing only new objects compared to a previous state. TreeAdditionsComparedToAncestor, } /// Configuration options for the pack generation functions provided in [this module][crate::data::output]. #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Options { /// The amount of threads to use at most when resolving the pack. If `None`, all logical cores are used. /// If more than one thread is used, the order of returned [counts][crate::data::output::Count] is not deterministic anymore /// especially when tree traversal is involved. Thus deterministic ordering requires `Some(1)` to be set. pub thread_limit: Option, /// The amount of objects per chunk or unit of work to be sent to threads for processing pub chunk_size: usize, /// The way input objects are handled pub input_object_expansion: ObjectExpansion, } impl Default for Options { fn default() -> Self { Options { thread_limit: None, chunk_size: 10, input_object_expansion: Default::default(), } } } /// The error returned by the pack generation iterator [`bytes::FromEntriesIter`][crate::data::output::bytes::FromEntriesIter]. #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error(transparent)] CommitDecode(gix_object::decode::Error), #[error(transparent)] FindExisting(#[from] gix_object::find::existing::Error), #[error(transparent)] InputIteration(Box), #[error(transparent)] TreeTraverse(gix_traverse::tree::breadthfirst::Error), #[error(transparent)] TreeChanges(gix_diff::tree::Error), #[error("Operation interrupted")] Interrupted, } gix-pack-0.56.0/src/data/output/count/objects/util.rs000064400000000000000000000010741046102023000205650ustar 00000000000000pub trait InsertImmutable { fn insert(&self, id: gix_hash::ObjectId) -> bool; } mod trait_impls { use std::cell::RefCell; use gix_hash::ObjectId; use gix_hashtable::HashSet; use super::InsertImmutable; impl InsertImmutable for gix_hashtable::sync::ObjectIdMap<()> { fn insert(&self, id: ObjectId) -> bool { self.insert(id, ()).is_none() } } impl InsertImmutable for RefCell> { fn insert(&self, item: ObjectId) -> bool { self.borrow_mut().insert(item) } } } gix-pack-0.56.0/src/data/output/entry/iter_from_counts.rs000064400000000000000000000451761046102023000215640ustar 00000000000000pub(crate) mod function { use std::{cmp::Ordering, sync::Arc}; use gix_features::{ parallel, parallel::SequenceId, progress::{ prodash::{Count, DynNestedProgress}, Progress, }, }; use super::{reduce, util, Error, Mode, Options, Outcome, ProgressId}; use crate::data::output; /// Given a known list of object `counts`, calculate entries ready to be put into a data pack. /// /// This allows objects to be written quite soon without having to wait for the entire pack to be built in memory. /// A chunk of objects is held in memory and compressed using DEFLATE, and serve the output of this iterator. /// That way slow writers will naturally apply back pressure, and communicate to the implementation that more time can be /// spent compressing objects. /// /// * `counts` /// * A list of previously counted objects to add to the pack. Duplication checks are not performed, no object is expected to be duplicated. 
/// * `progress` /// * a way to obtain progress information /// * `options` /// * more configuration /// /// _Returns_ the checksum of the pack /// /// ## Discussion /// /// ### Advantages /// /// * Begins writing immediately and supports back-pressure. /// * Abstract over object databases and how input is provided. /// /// ### Disadvantages /// /// * ~~currently there is no way to easily write the pack index, even though the state here is uniquely positioned to do /// so with minimal overhead (especially compared to `gix index-from-pack`)~~ Probably works now by chaining Iterators /// or keeping enough state to write a pack and then generate an index with recorded data. /// pub fn iter_from_counts( mut counts: Vec, db: Find, mut progress: Box, Options { version, mode, allow_thin_pack, thread_limit, chunk_size, }: Options, ) -> impl Iterator), Error>> + parallel::reduce::Finalize> where Find: crate::Find + Send + Clone + 'static, { assert!( matches!(version, crate::data::Version::V2), "currently we can only write version 2" ); let (chunk_size, thread_limit, _) = parallel::optimize_chunk_size_and_thread_limit(chunk_size, Some(counts.len()), thread_limit, None); { let progress = Arc::new(parking_lot::Mutex::new( progress.add_child_with_id("resolving".into(), ProgressId::ResolveCounts.into()), )); progress.lock().init(None, gix_features::progress::count("counts")); let enough_counts_present = counts.len() > 4_000; let start = std::time::Instant::now(); parallel::in_parallel_if( || enough_counts_present, counts.chunks_mut(chunk_size), thread_limit, |_n| Vec::::new(), { let progress = Arc::clone(&progress); let db = db.clone(); move |chunk, buf| { let chunk_size = chunk.len(); for count in chunk { use crate::data::output::count::PackLocation::*; match count.entry_pack_location { LookedUp(_) => continue, NotLookedUp => count.entry_pack_location = LookedUp(db.location_by_oid(&count.id, buf)), } } progress.lock().inc_by(chunk_size); Ok::<_, ()>(()) } }, parallel::reduce::IdentityWithResult::<(), ()>::default(), ) .expect("infallible - we ignore none-existing objects"); progress.lock().show_throughput(start); } let counts_range_by_pack_id = match mode { Mode::PackCopyAndBaseObjects => { let mut progress = progress.add_child_with_id("sorting".into(), ProgressId::SortEntries.into()); progress.init(Some(counts.len()), gix_features::progress::count("counts")); let start = std::time::Instant::now(); use crate::data::output::count::PackLocation::*; counts.sort_by(|lhs, rhs| match (&lhs.entry_pack_location, &rhs.entry_pack_location) { (LookedUp(None), LookedUp(None)) => Ordering::Equal, (LookedUp(Some(_)), LookedUp(None)) => Ordering::Greater, (LookedUp(None), LookedUp(Some(_))) => Ordering::Less, (LookedUp(Some(lhs)), LookedUp(Some(rhs))) => lhs .pack_id .cmp(&rhs.pack_id) .then(lhs.pack_offset.cmp(&rhs.pack_offset)), (_, _) => unreachable!("counts were resolved beforehand"), }); let mut index: Vec<(u32, std::ops::Range)> = Vec::new(); let mut chunks_pack_start = counts.partition_point(|e| e.entry_pack_location.is_none()); let mut slice = &counts[chunks_pack_start..]; while !slice.is_empty() { let current_pack_id = slice[0].entry_pack_location.as_ref().expect("packed object").pack_id; let pack_end = slice.partition_point(|e| { e.entry_pack_location.as_ref().expect("packed object").pack_id == current_pack_id }); index.push((current_pack_id, chunks_pack_start..chunks_pack_start + pack_end)); slice = &slice[pack_end..]; chunks_pack_start += pack_end; } progress.set(counts.len()); 
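// Editor's note — illustrative sketch only, not part of the original function: the loop above
// groups the sorted `counts` into `(pack_id, range)` pairs via `partition_point()`. A standalone
// equivalent over plain numbers (hypothetical values) would look like this:
//
//     let sorted = [1u32, 1, 3, 3, 3, 7];
//     let mut ranges = Vec::new();
//     let (mut start, mut slice) = (0, &sorted[..]);
//     while !slice.is_empty() {
//         let key = slice[0];
//         // all equal keys are adjacent in a sorted slice, so this finds the end of the run
//         let end = slice.partition_point(|&v| v == key);
//         ranges.push((key, start..start + end));
//         slice = &slice[end..];
//         start += end;
//     }
//     assert_eq!(ranges, vec![(1, 0..2), (3, 2..5), (7, 5..6)]);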
progress.show_throughput(start); index } }; let counts = Arc::new(counts); let progress = Arc::new(parking_lot::Mutex::new(progress)); let chunks = util::ChunkRanges::new(chunk_size, counts.len()); parallel::reduce::Stepwise::new( chunks.enumerate(), thread_limit, { let progress = Arc::clone(&progress); move |n| { ( Vec::new(), // object data buffer progress .lock() .add_child_with_id(format!("thread {n}"), gix_features::progress::UNKNOWN), ) } }, { let counts = Arc::clone(&counts); move |(chunk_id, chunk_range): (SequenceId, std::ops::Range), (buf, progress)| { let mut out = Vec::new(); let chunk = &counts[chunk_range]; let mut stats = Outcome::default(); let mut pack_offsets_to_id = None; progress.init(Some(chunk.len()), gix_features::progress::count("objects")); for count in chunk.iter() { out.push(match count .entry_pack_location .as_ref() .and_then(|l| db.entry_by_location(l).map(|pe| (l, pe))) { Some((location, pack_entry)) => { if let Some((cached_pack_id, _)) = &pack_offsets_to_id { if *cached_pack_id != location.pack_id { pack_offsets_to_id = None; } } let pack_range = counts_range_by_pack_id[counts_range_by_pack_id .binary_search_by_key(&location.pack_id, |e| e.0) .expect("pack-id always present")] .1 .clone(); let base_index_offset = pack_range.start; let counts_in_pack = &counts[pack_range]; let entry = output::Entry::from_pack_entry( pack_entry, count, counts_in_pack, base_index_offset, allow_thin_pack.then_some({ |pack_id, base_offset| { let (cached_pack_id, cache) = pack_offsets_to_id.get_or_insert_with(|| { db.pack_offsets_and_oid(pack_id) .map(|mut v| { v.sort_by_key(|e| e.0); (pack_id, v) }) .expect("pack used for counts is still available") }); debug_assert_eq!(*cached_pack_id, pack_id); stats.ref_delta_objects += 1; cache .binary_search_by_key(&base_offset, |e| e.0) .ok() .map(|idx| cache[idx].1) } }), version, ); match entry { Some(entry) => { stats.objects_copied_from_pack += 1; entry } None => match db.try_find(&count.id, buf).map_err(Error::Find)? { Some((obj, _location)) => { stats.decoded_and_recompressed_objects += 1; output::Entry::from_data(count, &obj) } None => { stats.missing_objects += 1; Ok(output::Entry::invalid()) } }, } } None => match db.try_find(&count.id, buf).map_err(Error::Find)? 
{ Some((obj, _location)) => { stats.decoded_and_recompressed_objects += 1; output::Entry::from_data(count, &obj) } None => { stats.missing_objects += 1; Ok(output::Entry::invalid()) } }, }?); progress.inc(); } Ok((chunk_id, out, stats)) } }, reduce::Statistics::default(), ) } } mod util { #[derive(Clone)] pub struct ChunkRanges { cursor: usize, size: usize, len: usize, } impl ChunkRanges { pub fn new(size: usize, total: usize) -> Self { ChunkRanges { cursor: 0, size, len: total, } } } impl Iterator for ChunkRanges { type Item = std::ops::Range; fn next(&mut self) -> Option { if self.cursor >= self.len { None } else { let upper = (self.cursor + self.size).min(self.len); let range = self.cursor..upper; self.cursor = upper; Some(range) } } } } mod reduce { use std::marker::PhantomData; use gix_features::{parallel, parallel::SequenceId}; use super::Outcome; use crate::data::output; pub struct Statistics { total: Outcome, _err: PhantomData, } impl Default for Statistics { fn default() -> Self { Statistics { total: Default::default(), _err: PhantomData, } } } impl parallel::Reduce for Statistics { type Input = Result<(SequenceId, Vec, Outcome), Error>; type FeedProduce = (SequenceId, Vec); type Output = Outcome; type Error = Error; fn feed(&mut self, item: Self::Input) -> Result { item.map(|(cid, entries, stats)| { self.total.aggregate(stats); (cid, entries) }) } fn finalize(self) -> Result { Ok(self.total) } } } mod types { use crate::data::output::entry; /// Information gathered during the run of [`iter_from_counts()`][crate::data::output::entry::iter_from_counts()]. #[derive(Default, PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Outcome { /// The amount of fully decoded objects. These are the most expensive as they are fully decoded. pub decoded_and_recompressed_objects: usize, /// The amount of objects that could not be located despite them being mentioned during iteration pub missing_objects: usize, /// The amount of base or delta objects that could be copied directly from the pack. These are cheapest as they /// only cost a memory copy for the most part. pub objects_copied_from_pack: usize, /// The amount of objects that ref to their base as ref-delta, an indication for a thin back being created. pub ref_delta_objects: usize, } impl Outcome { pub(in crate::data::output::entry) fn aggregate( &mut self, Outcome { decoded_and_recompressed_objects: decoded_objects, missing_objects, objects_copied_from_pack, ref_delta_objects, }: Self, ) { self.decoded_and_recompressed_objects += decoded_objects; self.missing_objects += missing_objects; self.objects_copied_from_pack += objects_copied_from_pack; self.ref_delta_objects += ref_delta_objects; } } /// The way the iterator operates. #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum Mode { /// Copy base objects and deltas from packs, while non-packed objects will be treated as base objects /// (i.e. without trying to delta compress them). This is a fast way of obtaining a back while benefiting /// from existing pack compression and spending the smallest possible time on compressing unpacked objects at /// the cost of bandwidth. PackCopyAndBaseObjects, } /// Configuration options for the pack generation functions provided in [`iter_from_counts()`][crate::data::output::entry::iter_from_counts()]. 
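///
/// A minimal construction sketch (editor-added; the field values below are merely illustrative and
/// mirror the `Default` implementation except for `thread_limit`):
///
/// ```ignore
/// let options = Options {
///     thread_limit: Some(1), // single-threaded for deterministic entry ordering
///     mode: Mode::PackCopyAndBaseObjects,
///     allow_thin_pack: false,
///     chunk_size: 10,
///     version: Default::default(),
/// };
/// ```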
#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Options { /// The amount of threads to use at most when resolving the pack. If `None`, all logical cores are used. pub thread_limit: Option, /// The algorithm to produce a pack pub mode: Mode, /// If set, the resulting back can have deltas that refer to an object which is not in the pack. This can happen /// if the initial counted objects do not contain an object that an existing packed delta refers to, for example, because /// it wasn't part of the iteration, for instance when the iteration was performed on tree deltas or only a part of the /// commit graph. Please note that thin packs are not valid packs at rest, thus they are only valid for packs in transit. /// /// If set to false, delta objects will be decompressed and recompressed as base objects. pub allow_thin_pack: bool, /// The amount of objects per chunk or unit of work to be sent to threads for processing /// TODO: could this become the window size? pub chunk_size: usize, /// The pack data version to produce for each entry pub version: crate::data::Version, } impl Default for Options { fn default() -> Self { Options { thread_limit: None, mode: Mode::PackCopyAndBaseObjects, allow_thin_pack: false, chunk_size: 10, version: Default::default(), } } } /// The error returned by the pack generation function [`iter_from_counts()`][crate::data::output::entry::iter_from_counts()]. #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error(transparent)] Find(gix_object::find::Error), #[error(transparent)] NewEntry(#[from] entry::Error), } /// The progress ids used in [`write_to_directory()`][crate::Bundle::write_to_directory()]. /// /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. #[derive(Debug, Copy, Clone)] pub enum ProgressId { /// The amount of [`Count`][crate::data::output::Count] objects which are resolved to their pack location. ResolveCounts, /// Layout pack entries for placement into a pack (by pack-id and by offset). SortEntries, } impl From for gix_features::progress::Id { fn from(v: ProgressId) -> Self { match v { ProgressId::ResolveCounts => *b"ECRC", ProgressId::SortEntries => *b"ECSE", } } } } pub use types::{Error, Mode, Options, Outcome, ProgressId}; gix-pack-0.56.0/src/data/output/entry/mod.rs000064400000000000000000000171471046102023000167570ustar 00000000000000use std::io::Write; use gix_hash::ObjectId; use crate::{data, data::output, find}; /// pub mod iter_from_counts; pub use iter_from_counts::function::iter_from_counts; /// The kind of pack entry to be written #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum Kind { /// A complete base object, including its kind Base(gix_object::Kind), /// A delta against the object with the given index. It's always an index that was already encountered to refer only /// to object we have written already. DeltaRef { /// The absolute index to the object to serve as base. It's up to the writer to maintain enough state to allow producing /// a packed delta object from it. object_index: usize, }, /// A delta against the given object as identified by its `ObjectId`. /// This is the case for thin packs only, i.e. those that are sent over the wire. 
/// Note that there is the option of the `ObjectId` being used to refer to an object within /// the same pack, but it's a discontinued practice which won't be encountered here. DeltaOid { /// The object serving as base for this delta id: ObjectId, }, } /// The error returned by [`output::Entry::from_data()`]. #[allow(missing_docs)] #[derive(Debug, thiserror::Error)] pub enum Error { #[error("{0}")] ZlibDeflate(#[from] std::io::Error), #[error(transparent)] EntryType(#[from] crate::data::entry::decode::Error), } impl output::Entry { /// An object which can be identified as invalid easily which happens if objects didn't exist even if they were referred to. pub fn invalid() -> output::Entry { output::Entry { id: gix_hash::Kind::Sha1.null(), // NOTE: the actual object hash used in the repo doesn't matter here, this is a sentinel value. kind: Kind::Base(gix_object::Kind::Blob), decompressed_size: 0, compressed_data: vec![], } } /// Returns true if this object doesn't really exist but still has to be handled responsibly /// /// Note that this is true for tree entries that are commits/git submodules, or for objects which aren't present in our local clone /// due to shallow clones. pub fn is_invalid(&self) -> bool { self.id.is_null() } /// Create an Entry from a previously counted object which is located in a pack. It's `entry` is provided here. /// The `version` specifies what kind of target `Entry` version the caller desires. pub fn from_pack_entry( mut entry: find::Entry, count: &output::Count, potential_bases: &[output::Count], bases_index_offset: usize, pack_offset_to_oid: Option Option>, target_version: data::Version, ) -> Option> { if entry.version != target_version { return None; }; let pack_offset_must_be_zero = 0; let pack_entry = match data::Entry::from_bytes(&entry.data, pack_offset_must_be_zero, count.id.as_slice().len()) { Ok(e) => e, Err(err) => return Some(Err(err.into())), }; use crate::data::entry::Header::*; match pack_entry.header { Commit => Some(output::entry::Kind::Base(gix_object::Kind::Commit)), Tree => Some(output::entry::Kind::Base(gix_object::Kind::Tree)), Blob => Some(output::entry::Kind::Base(gix_object::Kind::Blob)), Tag => Some(output::entry::Kind::Base(gix_object::Kind::Tag)), OfsDelta { base_distance } => { let pack_location = count.entry_pack_location.as_ref().expect("packed"); let base_offset = pack_location .pack_offset .checked_sub(base_distance) .expect("pack-offset - distance is firmly within the pack"); potential_bases .binary_search_by(|e| { e.entry_pack_location .as_ref() .expect("packed") .pack_offset .cmp(&base_offset) }) .ok() .map(|idx| output::entry::Kind::DeltaRef { object_index: idx + bases_index_offset, }) .or_else(|| { pack_offset_to_oid .and_then(|mut f| f(pack_location.pack_id, base_offset)) .map(|id| output::entry::Kind::DeltaOid { id }) }) } RefDelta { base_id: _ } => None, // ref deltas are for thin packs or legacy, repack them as base objects } .map(|kind| { Ok(output::Entry { id: count.id.to_owned(), kind, decompressed_size: pack_entry.decompressed_size as usize, compressed_data: { entry.data.copy_within(pack_entry.data_offset as usize.., 0); entry.data.resize( entry.data.len() - usize::try_from(pack_entry.data_offset).expect("offset representable as usize"), 0, ); entry.data }, }) }) } /// Create a new instance from the given `oid` and its corresponding git object data `obj`. 
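///
/// A minimal usage sketch (editor-added; `count` and `obj` are assumed to be a previously gathered
/// [`output::Count`] and the matching decoded [`gix_object::Data`]):
///
/// ```ignore
/// let entry = output::Entry::from_data(&count, &obj)?;
/// // the entry keeps the object's id and records the uncompressed size of `obj.data`
/// assert_eq!(entry.id, count.id);
/// assert_eq!(entry.decompressed_size, obj.data.len());
/// ```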
pub fn from_data(count: &output::Count, obj: &gix_object::Data<'_>) -> Result { Ok(output::Entry { id: count.id.to_owned(), kind: Kind::Base(obj.kind), decompressed_size: obj.data.len(), compressed_data: { let mut out = gix_features::zlib::stream::deflate::Write::new(Vec::new()); if let Err(err) = std::io::copy(&mut &*obj.data, &mut out) { match err.kind() { std::io::ErrorKind::Other => return Err(Error::ZlibDeflate(err)), err => unreachable!("Should never see other errors than zlib, but got {:?}", err,), } }; out.flush()?; out.into_inner() }, }) } /// Transform ourselves into pack entry header of `version` which can be written into a pack. /// /// `index_to_pack(object_index) -> pack_offset` is a function to convert the base object's index into /// the input object array (if each object is numbered) to an offset into the pack. /// This information is known to the one calling the method. pub fn to_entry_header( &self, version: data::Version, index_to_base_distance: impl FnOnce(usize) -> u64, ) -> data::entry::Header { assert!( matches!(version, data::Version::V2), "we can only write V2 pack entries for now" ); use Kind::*; match self.kind { Base(kind) => { use gix_object::Kind::*; match kind { Tree => data::entry::Header::Tree, Blob => data::entry::Header::Blob, Commit => data::entry::Header::Commit, Tag => data::entry::Header::Tag, } } DeltaOid { id } => data::entry::Header::RefDelta { base_id: id.to_owned() }, DeltaRef { object_index } => data::entry::Header::OfsDelta { base_distance: index_to_base_distance(object_index), }, } } } gix-pack-0.56.0/src/data/output/mod.rs000064400000000000000000000027231046102023000156100ustar 00000000000000use gix_hash::ObjectId; /// pub mod count; /// An item representing a future Entry in the leanest way possible. /// /// One can expect to have one of these in memory when building big objects, so smaller is better here. /// They should contain everything of importance to generate a pack as fast as possible. #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Count { /// The hash of the object to write pub id: ObjectId, /// A way to locate a pack entry in the object database, only available if the object is in a pack. pub entry_pack_location: count::PackLocation, } /// An entry to be written to a file. /// /// Some of these will be in-flight and in memory while waiting to be written. Memory requirements depend on the amount of compressed /// data they hold. #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Entry { /// The hash of the object to write pub id: ObjectId, /// The kind of entry represented by `data`. It's used alongside with it to complete the pack entry /// at rest or in transit. pub kind: entry::Kind, /// The size in bytes needed once `data` gets decompressed pub decompressed_size: usize, /// The compressed data right behind the header pub compressed_data: Vec, } /// pub mod entry; /// pub mod bytes; gix-pack-0.56.0/src/find.rs000064400000000000000000000011261046102023000134740ustar 00000000000000/// An Entry in a pack providing access to its data. /// /// Its commonly retrieved by reading from a pack index file followed by a read from a pack data file. 
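///
/// A minimal retrieval sketch (editor-added; `store` is an assumed implementation of [`crate::Find`]
/// and `location` a pack location previously obtained from it):
///
/// ```ignore
/// if let Some(entry) = store.entry_by_location(&location) {
///     // `entry.data` holds the raw pack-entry bytes: header followed by compressed data,
///     // while `entry.version` names the pack data version they were written for.
///     assert!(!entry.data.is_empty());
/// }
/// ```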
#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[allow(missing_docs)] pub struct Entry { /// The pack-data encoded bytes of the pack data entry as present in the pack file, including the header followed by compressed data. pub data: Vec, /// The version of the pack file containing `data` pub version: crate::data::Version, } gix-pack-0.56.0/src/find_traits.rs000064400000000000000000000275461046102023000151000ustar 00000000000000use crate::{data, find}; /// Describe how object can be located in an object store with built-in facilities to supports packs specifically. /// /// ## Notes /// /// Find effectively needs [generic associated types][issue] to allow a trait for the returned object type. /// Until then, we will have to make due with explicit types and give them the potentially added features we want. /// /// Furthermore, despite this trait being in `gix-pack`, it leaks knowledge about objects potentially not being packed. /// This is a necessary trade-off to allow this trait to live in `gix-pack` where it is used in functions to create a pack. /// /// [issue]: https://github.com/rust-lang/rust/issues/44265 pub trait Find { /// Returns true if the object exists in the database. fn contains(&self, id: &gix_hash::oid) -> bool; /// Find an object matching `id` in the database while placing its raw, decoded data into `buffer`. /// A `pack_cache` can be used to speed up subsequent lookups, set it to [`crate::cache::Never`] if the /// workload isn't suitable for caching. /// /// Returns `Some((, ))` if it was present in the database, /// or the error that occurred during lookup or object retrieval. fn try_find<'a>( &self, id: &gix_hash::oid, buffer: &'a mut Vec, ) -> Result, Option)>, gix_object::find::Error> { self.try_find_cached(id, buffer, &mut crate::cache::Never) } /// Like [`Find::try_find()`], but with support for controlling the pack cache. /// A `pack_cache` can be used to speed up subsequent lookups, set it to [`crate::cache::Never`] if the /// workload isn't suitable for caching. /// /// Returns `Some((, ))` if it was present in the database, /// or the error that occurred during lookup or object retrieval. fn try_find_cached<'a>( &self, id: &gix_hash::oid, buffer: &'a mut Vec, pack_cache: &mut dyn crate::cache::DecodeEntry, ) -> Result, Option)>, gix_object::find::Error>; /// Find the packs location where an object with `id` can be found in the database, or `None` if there is no pack /// holding the object. /// /// _Note_ that this is always None if the object isn't packed even though it exists as loose object. fn location_by_oid(&self, id: &gix_hash::oid, buf: &mut Vec) -> Option; /// Obtain a vector of all offsets, in index order, along with their object id. fn pack_offsets_and_oid(&self, pack_id: u32) -> Option>; /// Return the [`find::Entry`] for `location` if it is backed by a pack. /// /// Note that this is only in the interest of avoiding duplicate work during pack generation. /// Pack locations can be obtained from [`Find::try_find()`]. /// /// # Notes /// /// Custom implementations might be interested in providing their own meta-data with `object`, /// which currently isn't possible as the `Locate` trait requires GATs to work like that. fn entry_by_location(&self, location: &data::entry::Location) -> Option; } mod ext { use gix_object::{BlobRef, CommitRef, CommitRefIter, Kind, ObjectRef, TagRef, TagRefIter, TreeRef, TreeRefIter}; macro_rules! 
make_obj_lookup { ($method:ident, $object_variant:path, $object_kind:path, $object_type:ty) => { /// Like [`find(…)`][Self::find()], but flattens the `Result>` into a single `Result` making a non-existing object an error /// while returning the desired object type. fn $method<'a>( &self, id: &gix_hash::oid, buffer: &'a mut Vec, ) -> Result<($object_type, Option), gix_object::find::existing_object::Error> { let id = id.as_ref(); self.try_find(id, buffer) .map_err(gix_object::find::existing_object::Error::Find)? .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.as_ref().to_owned(), }) .and_then(|(o, l)| { o.decode() .map_err(|err| gix_object::find::existing_object::Error::Decode { source: err, oid: id.to_owned(), }) .map(|o| (o, l)) }) .and_then(|(o, l)| match o { $object_variant(o) => return Ok((o, l)), o => Err(gix_object::find::existing_object::Error::ObjectKind { oid: id.to_owned(), actual: o.kind(), expected: $object_kind, }), }) } }; } macro_rules! make_iter_lookup { ($method:ident, $object_kind:path, $object_type:ty, $into_iter:tt) => { /// Like [`find(…)`][Self::find()], but flattens the `Result>` into a single `Result` making a non-existing object an error /// while returning the desired iterator type. fn $method<'a>( &self, id: &gix_hash::oid, buffer: &'a mut Vec, ) -> Result<($object_type, Option), gix_object::find::existing_iter::Error> { let id = id.as_ref(); self.try_find(id, buffer) .map_err(gix_object::find::existing_iter::Error::Find)? .ok_or_else(|| gix_object::find::existing_iter::Error::NotFound { oid: id.as_ref().to_owned(), }) .and_then(|(o, l)| { o.$into_iter() .ok_or_else(|| gix_object::find::existing_iter::Error::ObjectKind { oid: id.to_owned(), actual: o.kind, expected: $object_kind, }) .map(|i| (i, l)) }) } }; } /// An extension trait with convenience functions. pub trait FindExt: super::Find { /// Like [`try_find(…)`][super::Find::try_find()], but flattens the `Result>` into a single `Result` making a non-existing object an error. fn find<'a>( &self, id: &gix_hash::oid, buffer: &'a mut Vec, ) -> Result<(gix_object::Data<'a>, Option), gix_object::find::existing::Error> { self.try_find(id, buffer) .map_err(gix_object::find::existing::Error::Find)? 
.ok_or_else(|| gix_object::find::existing::Error::NotFound { oid: id.as_ref().to_owned(), }) } make_obj_lookup!(find_commit, ObjectRef::Commit, Kind::Commit, CommitRef<'a>); make_obj_lookup!(find_tree, ObjectRef::Tree, Kind::Tree, TreeRef<'a>); make_obj_lookup!(find_tag, ObjectRef::Tag, Kind::Tag, TagRef<'a>); make_obj_lookup!(find_blob, ObjectRef::Blob, Kind::Blob, BlobRef<'a>); make_iter_lookup!(find_commit_iter, Kind::Blob, CommitRefIter<'a>, try_into_commit_iter); make_iter_lookup!(find_tree_iter, Kind::Tree, TreeRefIter<'a>, try_into_tree_iter); make_iter_lookup!(find_tag_iter, Kind::Tag, TagRefIter<'a>, try_into_tag_iter); } impl FindExt for T {} } pub use ext::FindExt; mod find_impls { use std::{ops::Deref, rc::Rc}; use gix_hash::oid; use crate::{data, find}; impl crate::Find for &T where T: crate::Find, { fn contains(&self, id: &oid) -> bool { (*self).contains(id) } fn try_find_cached<'a>( &self, id: &oid, buffer: &'a mut Vec, pack_cache: &mut dyn crate::cache::DecodeEntry, ) -> Result, Option)>, gix_object::find::Error> { (*self).try_find_cached(id, buffer, pack_cache) } fn location_by_oid(&self, id: &oid, buf: &mut Vec) -> Option { (*self).location_by_oid(id, buf) } fn pack_offsets_and_oid(&self, pack_id: u32) -> Option> { (*self).pack_offsets_and_oid(pack_id) } fn entry_by_location(&self, location: &data::entry::Location) -> Option { (*self).entry_by_location(location) } } impl super::Find for std::sync::Arc where T: super::Find, { fn contains(&self, id: &oid) -> bool { self.deref().contains(id) } fn try_find_cached<'a>( &self, id: &oid, buffer: &'a mut Vec, pack_cache: &mut dyn crate::cache::DecodeEntry, ) -> Result, Option)>, gix_object::find::Error> { self.deref().try_find_cached(id, buffer, pack_cache) } fn location_by_oid(&self, id: &oid, buf: &mut Vec) -> Option { self.deref().location_by_oid(id, buf) } fn pack_offsets_and_oid(&self, pack_id: u32) -> Option> { self.deref().pack_offsets_and_oid(pack_id) } fn entry_by_location(&self, object: &data::entry::Location) -> Option { self.deref().entry_by_location(object) } } impl super::Find for Rc where T: super::Find, { fn contains(&self, id: &oid) -> bool { self.deref().contains(id) } fn try_find_cached<'a>( &self, id: &oid, buffer: &'a mut Vec, pack_cache: &mut dyn crate::cache::DecodeEntry, ) -> Result, Option)>, gix_object::find::Error> { self.deref().try_find_cached(id, buffer, pack_cache) } fn location_by_oid(&self, id: &oid, buf: &mut Vec) -> Option { self.deref().location_by_oid(id, buf) } fn pack_offsets_and_oid(&self, pack_id: u32) -> Option> { self.deref().pack_offsets_and_oid(pack_id) } fn entry_by_location(&self, location: &data::entry::Location) -> Option { self.deref().entry_by_location(location) } } impl super::Find for Box where T: super::Find, { fn contains(&self, id: &oid) -> bool { self.deref().contains(id) } fn try_find_cached<'a>( &self, id: &oid, buffer: &'a mut Vec, pack_cache: &mut dyn crate::cache::DecodeEntry, ) -> Result, Option)>, gix_object::find::Error> { self.deref().try_find_cached(id, buffer, pack_cache) } fn location_by_oid(&self, id: &oid, buf: &mut Vec) -> Option { self.deref().location_by_oid(id, buf) } fn pack_offsets_and_oid(&self, pack_id: u32) -> Option> { self.deref().pack_offsets_and_oid(pack_id) } fn entry_by_location(&self, location: &data::entry::Location) -> Option { self.deref().entry_by_location(location) } } } gix-pack-0.56.0/src/index/access.rs000064400000000000000000000264621046102023000151360ustar 00000000000000use std::{mem::size_of, ops::Range}; use crate::{ data, 
index::{self, EntryIndex, PrefixLookupResult, FAN_LEN}, }; const N32_SIZE: usize = size_of::(); const N64_SIZE: usize = size_of::(); const V1_HEADER_SIZE: usize = FAN_LEN * N32_SIZE; const V2_HEADER_SIZE: usize = N32_SIZE * 2 + FAN_LEN * N32_SIZE; const N32_HIGH_BIT: u32 = 1 << 31; /// Represents an entry within a pack index file, effectively mapping object [`IDs`][gix_hash::ObjectId] to pack data file locations. #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Entry { /// The ID of the object pub oid: gix_hash::ObjectId, /// The offset to the object's header in the pack data file pub pack_offset: data::Offset, /// The CRC32 hash over all bytes of the pack data entry. /// /// This can be useful for direct copies of pack data entries from one pack to another with insurance there was no bit rot. /// _Note_: Only available in index version 2 or newer pub crc32: Option, } /// Iteration and access impl index::File { fn iter_v1(&self) -> impl Iterator + '_ { match self.version { index::Version::V1 => self.data[V1_HEADER_SIZE..] .chunks_exact(N32_SIZE + self.hash_len) .take(self.num_objects as usize) .map(|c| { let (ofs, oid) = c.split_at(N32_SIZE); Entry { oid: gix_hash::ObjectId::from_bytes_or_panic(oid), pack_offset: u64::from(crate::read_u32(ofs)), crc32: None, } }), _ => panic!("Cannot use iter_v1() on index of type {:?}", self.version), } } fn iter_v2(&self) -> impl Iterator + '_ { let pack64_offset = self.offset_pack_offset64_v2(); let oids = self.data[V2_HEADER_SIZE..] .chunks_exact(self.hash_len) .take(self.num_objects as usize); let crcs = self.data[self.offset_crc32_v2()..] .chunks_exact(N32_SIZE) .take(self.num_objects as usize); let offsets = self.data[self.offset_pack_offset_v2()..] .chunks_exact(N32_SIZE) .take(self.num_objects as usize); assert_eq!(oids.len(), crcs.len()); assert_eq!(crcs.len(), offsets.len()); match self.version { index::Version::V2 => izip!(oids, crcs, offsets).map(move |(oid, crc32, ofs32)| Entry { oid: gix_hash::ObjectId::from_bytes_or_panic(oid), pack_offset: self.pack_offset_from_offset_v2(ofs32, pack64_offset), crc32: Some(crate::read_u32(crc32)), }), _ => panic!("Cannot use iter_v2() on index of type {:?}", self.version), } } /// Returns the object hash at the given index in our list of (sorted) sha1 hashes. /// The index ranges from 0 to `self.num_objects()` /// /// # Panics /// /// If `index` is out of bounds. pub fn oid_at_index(&self, index: EntryIndex) -> &gix_hash::oid { let index = index as usize; let start = match self.version { index::Version::V2 => V2_HEADER_SIZE + index * self.hash_len, index::Version::V1 => V1_HEADER_SIZE + index * (N32_SIZE + self.hash_len) + N32_SIZE, }; gix_hash::oid::from_bytes_unchecked(&self.data[start..][..self.hash_len]) } /// Returns the offset into our pack data file at which to start reading the object at `index`. /// /// # Panics /// /// If `index` is out of bounds. pub fn pack_offset_at_index(&self, index: EntryIndex) -> data::Offset { let index = index as usize; match self.version { index::Version::V2 => { let start = self.offset_pack_offset_v2() + index * N32_SIZE; self.pack_offset_from_offset_v2(&self.data[start..][..N32_SIZE], self.offset_pack_offset64_v2()) } index::Version::V1 => { let start = V1_HEADER_SIZE + index * (N32_SIZE + self.hash_len); u64::from(crate::read_u32(&self.data[start..][..N32_SIZE])) } } } /// Returns the CRC32 of the object at the given `index`. 
/// /// _Note_: These are always present for index version 2 or higher. /// # Panics /// /// If `index` is out of bounds. pub fn crc32_at_index(&self, index: EntryIndex) -> Option { let index = index as usize; match self.version { index::Version::V2 => { let start = self.offset_crc32_v2() + index * N32_SIZE; Some(crate::read_u32(&self.data[start..start + N32_SIZE])) } index::Version::V1 => None, } } /// Returns the `index` of the given hash for use with the [`oid_at_index()`][index::File::oid_at_index()], /// [`pack_offset_at_index()`][index::File::pack_offset_at_index()] or [`crc32_at_index()`][index::File::crc32_at_index()]. // NOTE: pretty much the same things as in `multi_index::File::lookup`, change things there // as well. pub fn lookup(&self, id: impl AsRef) -> Option { lookup(id.as_ref(), &self.fan, &|idx| self.oid_at_index(idx)) } /// Given a `prefix`, find an object that matches it uniquely within this index and return `Some(Ok(entry_index))`. /// If there is more than one object matching the object `Some(Err(())` is returned. /// /// Finally, if no object matches the index, the return value is `None`. /// /// Pass `candidates` to obtain the set of entry-indices matching `prefix`, with the same return value as /// one would have received if it remained `None`. It will be empty if no object matched the `prefix`. /// // NOTE: pretty much the same things as in `index::File::lookup`, change things there // as well. pub fn lookup_prefix( &self, prefix: gix_hash::Prefix, candidates: Option<&mut Range>, ) -> Option { lookup_prefix( prefix, candidates, &self.fan, &|idx| self.oid_at_index(idx), self.num_objects, ) } /// An iterator over all [`Entries`][Entry] of this index file. pub fn iter<'a>(&'a self) -> Box + 'a> { match self.version { index::Version::V2 => Box::new(self.iter_v2()), index::Version::V1 => Box::new(self.iter_v1()), } } /// Return a vector of ascending offsets into our respective pack data file. /// /// Useful to control an iteration over all pack entries in a cache-friendly way. 
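///
/// A minimal usage sketch (editor-added; `index` is an assumed [`index::File`] and `pack` the
/// matching [`crate::data::File`]):
///
/// ```ignore
/// for pack_offset in index.sorted_offsets() {
///     // offsets ascend, so entries are visited in storage order rather than in oid order
///     let entry = pack.entry(pack_offset)?;
///     // ... decode or copy `entry` ...
/// }
/// ```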
pub fn sorted_offsets(&self) -> Vec { let mut ofs: Vec<_> = match self.version { index::Version::V1 => self.iter().map(|e| e.pack_offset).collect(), index::Version::V2 => { let offset32_start = &self.data[self.offset_pack_offset_v2()..]; let offsets32 = offset32_start.chunks_exact(N32_SIZE).take(self.num_objects as usize); assert_eq!(self.num_objects as usize, offsets32.len()); let pack_offset_64_start = self.offset_pack_offset64_v2(); offsets32 .map(|offset| self.pack_offset_from_offset_v2(offset, pack_offset_64_start)) .collect() } }; ofs.sort_unstable(); ofs } #[inline] fn offset_crc32_v2(&self) -> usize { V2_HEADER_SIZE + self.num_objects as usize * self.hash_len } #[inline] fn offset_pack_offset_v2(&self) -> usize { self.offset_crc32_v2() + self.num_objects as usize * N32_SIZE } #[inline] fn offset_pack_offset64_v2(&self) -> usize { self.offset_pack_offset_v2() + self.num_objects as usize * N32_SIZE } #[inline] fn pack_offset_from_offset_v2(&self, offset: &[u8], pack64_offset: usize) -> data::Offset { debug_assert_eq!(self.version, index::Version::V2); let ofs32 = crate::read_u32(offset); if (ofs32 & N32_HIGH_BIT) == N32_HIGH_BIT { let from = pack64_offset + (ofs32 ^ N32_HIGH_BIT) as usize * N64_SIZE; crate::read_u64(&self.data[from..][..N64_SIZE]) } else { u64::from(ofs32) } } } pub(crate) fn lookup_prefix<'a>( prefix: gix_hash::Prefix, candidates: Option<&mut Range>, fan: &[u32; FAN_LEN], oid_at_index: &dyn Fn(EntryIndex) -> &'a gix_hash::oid, num_objects: u32, ) -> Option { let first_byte = prefix.as_oid().first_byte() as usize; let mut upper_bound = fan[first_byte]; let mut lower_bound = if first_byte != 0 { fan[first_byte - 1] } else { 0 }; // Bisect using indices while lower_bound < upper_bound { let mid = (lower_bound + upper_bound) / 2; let mid_sha = oid_at_index(mid); use std::cmp::Ordering::*; match prefix.cmp_oid(mid_sha) { Less => upper_bound = mid, Equal => match candidates { Some(candidates) => { let first_past_entry = ((0..mid).rev()) .take_while(|prev| prefix.cmp_oid(oid_at_index(*prev)) == Equal) .last(); let last_future_entry = ((mid + 1)..num_objects) .take_while(|next| prefix.cmp_oid(oid_at_index(*next)) == Equal) .last(); *candidates = match (first_past_entry, last_future_entry) { (Some(first), Some(last)) => first..last + 1, (Some(first), None) => first..mid + 1, (None, Some(last)) => mid..last + 1, (None, None) => mid..mid + 1, }; return if candidates.len() > 1 { Some(Err(())) } else { Some(Ok(mid)) }; } None => { let next = mid + 1; if next < num_objects && prefix.cmp_oid(oid_at_index(next)) == Equal { return Some(Err(())); } if mid != 0 && prefix.cmp_oid(oid_at_index(mid - 1)) == Equal { return Some(Err(())); } return Some(Ok(mid)); } }, Greater => lower_bound = mid + 1, } } if let Some(candidates) = candidates { *candidates = 0..0; } None } pub(crate) fn lookup<'a>( id: &gix_hash::oid, fan: &[u32; FAN_LEN], oid_at_index: &dyn Fn(EntryIndex) -> &'a gix_hash::oid, ) -> Option { let first_byte = id.first_byte() as usize; let mut upper_bound = fan[first_byte]; let mut lower_bound = if first_byte != 0 { fan[first_byte - 1] } else { 0 }; while lower_bound < upper_bound { let mid = (lower_bound + upper_bound) / 2; let mid_sha = oid_at_index(mid); use std::cmp::Ordering::*; match id.cmp(mid_sha) { Less => upper_bound = mid, Equal => return Some(mid), Greater => lower_bound = mid + 1, } } None } gix-pack-0.56.0/src/index/encode.rs000064400000000000000000000123321046102023000151210ustar 00000000000000use std::cmp::Ordering; pub(crate) const LARGE_OFFSET_THRESHOLD: u64 
= 0x7fff_ffff; pub(crate) const HIGH_BIT: u32 = 0x8000_0000; pub(crate) fn fanout(iter: &mut dyn ExactSizeIterator) -> [u32; 256] { let mut fan_out = [0u32; 256]; let entries_len = iter.len() as u32; let mut iter = iter.enumerate(); let mut idx_and_entry = iter.next(); let mut upper_bound = 0; for (offset_be, byte) in fan_out.iter_mut().zip(0u8..=255) { *offset_be = match idx_and_entry.as_ref() { Some((_idx, first_byte)) => match first_byte.cmp(&byte) { Ordering::Less => unreachable!("ids should be ordered, and we make sure to keep ahead with them"), Ordering::Greater => upper_bound, Ordering::Equal => { if byte == 255 { entries_len } else { idx_and_entry = iter.find(|(_, first_byte)| *first_byte != byte); upper_bound = idx_and_entry.as_ref().map_or(entries_len, |(idx, _)| *idx as u32); upper_bound } } }, None => entries_len, }; } fan_out } #[cfg(feature = "streaming-input")] mod function { use std::io; use gix_features::{ hash, progress::{self, DynNestedProgress}, }; use super::{fanout, HIGH_BIT, LARGE_OFFSET_THRESHOLD}; use crate::index::V2_SIGNATURE; struct Count { bytes: u64, inner: W, } impl Count { fn new(inner: W) -> Self { Count { bytes: 0, inner } } } impl io::Write for Count where W: io::Write, { fn write(&mut self, buf: &[u8]) -> io::Result { let written = self.inner.write(buf)?; self.bytes += written as u64; Ok(written) } fn flush(&mut self) -> io::Result<()> { self.inner.flush() } } pub(crate) fn write_to( out: &mut dyn io::Write, entries_sorted_by_oid: Vec>, pack_hash: &gix_hash::ObjectId, kind: crate::index::Version, progress: &mut dyn DynNestedProgress, ) -> io::Result { use io::Write; assert_eq!(kind, crate::index::Version::V2, "Can only write V2 packs right now"); assert!( entries_sorted_by_oid.len() <= u32::MAX as usize, "a pack cannot have more than u32::MAX objects" ); // Write header let mut out = Count::new(std::io::BufWriter::with_capacity( 8 * 4096, hash::Write::new(out, kind.hash()), )); out.write_all(V2_SIGNATURE)?; out.write_all(&(kind as u32).to_be_bytes())?; progress.init(Some(4), progress::steps()); let start = std::time::Instant::now(); let _info = progress.add_child_with_id("writing fan-out table".into(), gix_features::progress::UNKNOWN); let fan_out = fanout(&mut entries_sorted_by_oid.iter().map(|e| e.data.id.first_byte())); for value in fan_out.iter() { out.write_all(&value.to_be_bytes())?; } progress.inc(); let _info = progress.add_child_with_id("writing ids".into(), gix_features::progress::UNKNOWN); for entry in &entries_sorted_by_oid { out.write_all(entry.data.id.as_slice())?; } progress.inc(); let _info = progress.add_child_with_id("writing crc32".into(), gix_features::progress::UNKNOWN); for entry in &entries_sorted_by_oid { out.write_all(&entry.data.crc32.to_be_bytes())?; } progress.inc(); let _info = progress.add_child_with_id("writing offsets".into(), gix_features::progress::UNKNOWN); { let mut offsets64 = Vec::::new(); for entry in &entries_sorted_by_oid { let offset: u32 = if entry.offset > LARGE_OFFSET_THRESHOLD { assert!( offsets64.len() < LARGE_OFFSET_THRESHOLD as usize, "Encoding breakdown - way too many 64bit offsets" ); offsets64.push(entry.offset); ((offsets64.len() - 1) as u32) | HIGH_BIT } else { entry.offset as u32 }; out.write_all(&offset.to_be_bytes())?; } for value in offsets64 { out.write_all(&value.to_be_bytes())?; } } out.write_all(pack_hash.as_slice())?; let bytes_written_without_trailer = out.bytes; let out = out.inner.into_inner()?; let index_hash: gix_hash::ObjectId = out.hash.digest().into(); 
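// Editor's note: together with the pack checksum written just above, this hash forms the v2 index
// trailer — the file ends with the pack's checksum followed by the index's own checksum.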
out.inner.write_all(index_hash.as_slice())?; out.inner.flush()?; progress.inc(); progress.show_throughput_with( start, (bytes_written_without_trailer + 20) as usize, progress::bytes().expect("unit always set"), progress::MessageLevel::Success, ); Ok(index_hash) } } #[cfg(feature = "streaming-input")] pub(crate) use function::write_to; gix-pack-0.56.0/src/index/init.rs000064400000000000000000000056531046102023000146370ustar 00000000000000use std::{mem::size_of, path::Path}; use crate::index::{self, Version, FAN_LEN, V2_SIGNATURE}; /// Returned by [`index::File::at()`]. #[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("Could not open pack index file at '{path}'")] Io { source: std::io::Error, path: std::path::PathBuf, }, #[error("{message}")] Corrupt { message: String }, #[error("Unsupported index version: {version})")] UnsupportedVersion { version: u32 }, } const N32_SIZE: usize = size_of::(); /// Instantiation impl index::File { /// Open the pack index file at the given `path`. /// /// The `object_hash` is a way to read (and write) the same file format with different hashes, as the hash kind /// isn't stored within the file format itself. pub fn at(path: impl AsRef, object_hash: gix_hash::Kind) -> Result { Self::at_inner(path.as_ref(), object_hash) } fn at_inner(path: &Path, object_hash: gix_hash::Kind) -> Result { let data = crate::mmap::read_only(path).map_err(|source| Error::Io { source, path: path.to_owned(), })?; let idx_len = data.len(); let hash_len = object_hash.len_in_bytes(); let footer_size = hash_len * 2; if idx_len < FAN_LEN * N32_SIZE + footer_size { return Err(Error::Corrupt { message: format!("Pack index of size {idx_len} is too small for even an empty index"), }); } let (kind, fan, num_objects) = { let (kind, d) = { let (sig, d) = data.split_at(V2_SIGNATURE.len()); if sig == V2_SIGNATURE { (Version::V2, d) } else { (Version::V1, &data[..]) } }; let d = { if let Version::V2 = kind { let (vd, dr) = d.split_at(N32_SIZE); let version = crate::read_u32(vd); if version != Version::V2 as u32 { return Err(Error::UnsupportedVersion { version }); } dr } else { d } }; let (fan, bytes_read) = read_fan(d); let (_, _d) = d.split_at(bytes_read); let num_objects = fan[FAN_LEN - 1]; (kind, fan, num_objects) }; Ok(index::File { data, path: path.to_owned(), version: kind, num_objects, fan, hash_len, object_hash, }) } } fn read_fan(d: &[u8]) -> ([u32; FAN_LEN], usize) { assert!(d.len() >= FAN_LEN * N32_SIZE); let mut fan = [0; FAN_LEN]; for (c, f) in d.chunks_exact(N32_SIZE).zip(fan.iter_mut()) { *f = crate::read_u32(c); } (fan, FAN_LEN * N32_SIZE) } gix-pack-0.56.0/src/index/mod.rs000064400000000000000000000104061046102023000144430ustar 00000000000000//! an index into the pack file /// From itertools /// Create an iterator running multiple iterators in lockstep. /// /// The `izip!` iterator yields elements until any subiterator /// returns `None`. /// /// This is a version of the standard ``.zip()`` that's supporting more than /// two iterators. The iterator element type is a tuple with one element /// from each of the input iterators. Just like ``.zip()``, the iteration stops /// when the shortest of the inputs reaches its end. /// /// **Note:** The result of this macro is in the general case an iterator /// composed of repeated `.zip()` and a `.map()`; it has an anonymous type. /// The special cases of one and two arguments produce the equivalent of /// `$a.into_iter()` and `$a.into_iter().zip($b)` respectively. 
/// /// Prefer this macro `izip!()` over [`multizip`] for the performance benefits /// of using the standard library `.zip()`. /// /// [`multizip`]: fn.multizip.html /// /// ```ignore /// # use itertools::izip; /// # /// # fn main() { /// /// // iterate over three sequences side-by-side /// let mut results = [0, 0, 0, 0]; /// let inputs = [3, 7, 9, 6]; /// /// for (r, index, input) in izip!(&mut results, 0..10, &inputs) { /// *r = index * 10 + input; /// } /// /// assert_eq!(results, [0 + 3, 10 + 7, 29, 36]); /// # } /// ``` /// /// (The above is vendored from [itertools](https://github.com/rust-itertools/itertools), /// including the original doctest, though it has been marked `ignore` here.) macro_rules! izip { // @closure creates a tuple-flattening closure for .map() call. usage: // @closure partial_pattern => partial_tuple , rest , of , iterators // eg. izip!( @closure ((a, b), c) => (a, b, c) , dd , ee ) ( @closure $p:pat => $tup:expr ) => { |$p| $tup }; // The "b" identifier is a different identifier on each recursion level thanks to hygiene. ( @closure $p:pat => ( $($tup:tt)* ) , $_iter:expr $( , $tail:expr )* ) => { izip!(@closure ($p, b) => ( $($tup)*, b ) $( , $tail )*) }; // unary ($first:expr $(,)*) => { std::iter::IntoIterator::into_iter($first) }; // binary ($first:expr, $second:expr $(,)*) => { izip!($first) .zip($second) }; // n-ary where n > 2 ( $first:expr $( , $rest:expr )* $(,)* ) => { izip!($first) $( .zip($rest) )* .map( izip!(@closure a => (a) $( , $rest )*) ) }; } use memmap2::Mmap; /// The version of an index file #[derive(Default, PartialEq, Eq, Ord, PartialOrd, Debug, Hash, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[allow(missing_docs)] pub enum Version { V1 = 1, #[default] V2 = 2, } impl Version { /// The kind of hash to produce to be compatible to this kind of index pub fn hash(&self) -> gix_hash::Kind { match self { Version::V1 | Version::V2 => gix_hash::Kind::Sha1, } } } /// A way to indicate if a lookup, despite successful, was ambiguous or yielded exactly /// one result in the particular index. pub type PrefixLookupResult = Result; /// The type for referring to indices of an entry within the index file. pub type EntryIndex = u32; const FAN_LEN: usize = 256; /// A representation of a pack index file pub struct File { data: Mmap, path: std::path::PathBuf, version: Version, num_objects: u32, fan: [u32; FAN_LEN], hash_len: usize, object_hash: gix_hash::Kind, } /// Basic file information impl File { /// The version of the pack index pub fn version(&self) -> Version { self.version } /// The path of the opened index file pub fn path(&self) -> &std::path::Path { &self.path } /// The amount of objects stored in the pack and index, as one past the highest entry index. 
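///
/// A minimal iteration sketch (editor-added; `index` is an assumed, already opened [`File`]):
///
/// ```ignore
/// for entry_index in 0..index.num_objects() {
///     let oid = index.oid_at_index(entry_index);
///     let pack_offset = index.pack_offset_at_index(entry_index);
///     // ... look up or verify the object stored at `pack_offset` ...
/// }
/// ```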
pub fn num_objects(&self) -> EntryIndex { self.num_objects } /// The kind of hash we assume pub fn object_hash(&self) -> gix_hash::Kind { self.object_hash } } const V2_SIGNATURE: &[u8] = b"\xfftOc"; /// pub mod init; pub(crate) mod access; pub use access::Entry; pub(crate) mod encode; /// pub mod traverse; mod util; /// pub mod verify; /// #[cfg(feature = "streaming-input")] pub mod write; gix-pack-0.56.0/src/index/traverse/error.rs000064400000000000000000000033711046102023000166530ustar 00000000000000use crate::index; /// Returned by [`index::File::traverse_with_index()`] and [`index::File::traverse_with_lookup`] #[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("One of the traversal processors failed")] Processor(#[source] E), #[error("Index file, pack file or object verification failed")] VerifyChecksum(#[from] index::verify::checksum::Error), #[error("The pack delta tree index could not be built")] Tree(#[from] crate::cache::delta::from_offsets::Error), #[error("The tree traversal failed")] TreeTraversal(#[from] crate::cache::delta::traverse::Error), #[error(transparent)] EntryType(#[from] crate::data::entry::decode::Error), #[error("Object {id} at offset {offset} could not be decoded")] PackDecode { id: gix_hash::ObjectId, offset: u64, source: crate::data::decode::Error, }, #[error("The packfiles checksum didn't match the index file checksum: expected {expected}, got {actual}")] PackMismatch { expected: gix_hash::ObjectId, actual: gix_hash::ObjectId, }, #[error("The hash of {kind} object at offset {offset} didn't match the checksum in the index file: expected {expected}, got {actual}")] PackObjectMismatch { expected: gix_hash::ObjectId, actual: gix_hash::ObjectId, offset: u64, kind: gix_object::Kind, }, #[error( "The CRC32 of {kind} object at offset {offset} didn't match the checksum in the index file: expected {expected}, got {actual}" )] Crc32Mismatch { expected: u32, actual: u32, offset: u64, kind: gix_object::Kind, }, #[error("Interrupted")] Interrupted, } gix-pack-0.56.0/src/index/traverse/mod.rs000064400000000000000000000205241046102023000163000ustar 00000000000000use std::sync::atomic::AtomicBool; use gix_features::{parallel, progress::Progress, zlib}; use crate::index; mod reduce; /// pub mod with_index; /// pub mod with_lookup; use reduce::Reducer; mod error; pub use error::Error; use gix_features::progress::DynNestedProgress; mod types; pub use types::{Algorithm, ProgressId, SafetyCheck, Statistics}; /// Traversal options for [`index::File::traverse()`]. #[derive(Debug, Clone)] pub struct Options { /// The algorithm to employ. pub traversal: Algorithm, /// If `Some`, only use the given amount of threads. Otherwise, the amount of threads to use will be selected based on /// the amount of available logical cores. pub thread_limit: Option, /// The kinds of safety checks to perform. pub check: SafetyCheck, /// A function to create a pack cache pub make_pack_lookup_cache: F, } impl Default for Options crate::cache::Never> { fn default() -> Self { Options { check: Default::default(), traversal: Default::default(), thread_limit: None, make_pack_lookup_cache: || crate::cache::Never, } } } /// The outcome of the [`traverse()`][index::File::traverse()] method. pub struct Outcome { /// The checksum obtained when hashing the file, which matched the checksum contained within the file. pub actual_index_checksum: gix_hash::ObjectId, /// The statistics obtained during traversal. 
pub statistics: Statistics, } /// Traversal of pack data files using an index file impl index::File { /// Iterate through all _decoded objects_ in the given `pack` and handle them with a `Processor`. /// The return value is (pack-checksum, [`Outcome`], `progress`), thus the pack traversal will always verify /// the whole pack's checksum to assure it is correct. In case of bit-rot, the operation will abort early without /// verifying all objects, using the [interrupt mechanism][gix_features::interrupt]. /// /// # Algorithms /// /// Using the [`Options::traversal`] field one can choose between two algorithms providing different tradeoffs. Both invoke /// `new_processor()` to create functions receiving decoded objects, their object kind, index entry and a progress instance to provide /// progress information. /// /// * [`Algorithm::DeltaTreeLookup`] builds an index to avoid any unnecessary computation while resolving objects, avoiding /// the need for a cache entirely, rendering `new_cache()` unused. /// One could also call [`traverse_with_index()`][index::File::traverse_with_index()] directly. /// * [`Algorithm::Lookup`] uses a cache created by `new_cache()` to avoid having to re-compute all bases of a delta-chain while /// decoding objects. /// One could also call [`traverse_with_lookup()`][index::File::traverse_with_lookup()] directly. /// /// Use [`thread_limit`][Options::thread_limit] to further control parallelism and [`check`][SafetyCheck] to define how much the passed /// objects shall be verified beforehand. pub fn traverse( &self, pack: &crate::data::File, progress: &mut dyn DynNestedProgress, should_interrupt: &AtomicBool, processor: Processor, Options { traversal, thread_limit, check, make_pack_lookup_cache, }: Options, ) -> Result> where C: crate::cache::DecodeEntry, E: std::error::Error + Send + Sync + 'static, Processor: FnMut(gix_object::Kind, &[u8], &index::Entry, &dyn Progress) -> Result<(), E> + Send + Clone, F: Fn() -> C + Send + Clone, { match traversal { Algorithm::Lookup => self.traverse_with_lookup( processor, pack, progress, should_interrupt, with_lookup::Options { thread_limit, check, make_pack_lookup_cache, }, ), Algorithm::DeltaTreeLookup => self.traverse_with_index( pack, processor, progress, should_interrupt, with_index::Options { check, thread_limit }, ), } } fn possibly_verify( &self, pack: &crate::data::File, check: SafetyCheck, pack_progress: &mut dyn Progress, index_progress: &mut dyn Progress, should_interrupt: &AtomicBool, ) -> Result> where E: std::error::Error + Send + Sync + 'static, { Ok(if check.file_checksum() { if self.pack_checksum() != pack.checksum() { return Err(Error::PackMismatch { actual: pack.checksum(), expected: self.pack_checksum(), }); } let (pack_res, id) = parallel::join( move || pack.verify_checksum(pack_progress, should_interrupt), move || self.verify_checksum(index_progress, should_interrupt), ); pack_res?; id?
} else { self.index_checksum() }) } #[allow(clippy::too_many_arguments)] fn decode_and_process_entry( &self, check: SafetyCheck, pack: &crate::data::File, cache: &mut C, buf: &mut Vec, inflate: &mut zlib::Inflate, progress: &mut dyn Progress, index_entry: &index::Entry, processor: &mut impl FnMut(gix_object::Kind, &[u8], &index::Entry, &dyn Progress) -> Result<(), E>, ) -> Result> where C: crate::cache::DecodeEntry, E: std::error::Error + Send + Sync + 'static, { let pack_entry = pack.entry(index_entry.pack_offset)?; let pack_entry_data_offset = pack_entry.data_offset; let entry_stats = pack .decode_entry( pack_entry, buf, inflate, &|id, _| { let index = self.lookup(id)?; pack.entry(self.pack_offset_at_index(index)) .ok() .map(crate::data::decode::entry::ResolvedBase::InPack) }, cache, ) .map_err(|e| Error::PackDecode { source: e, id: index_entry.oid, offset: index_entry.pack_offset, })?; let object_kind = entry_stats.kind; let header_size = (pack_entry_data_offset - index_entry.pack_offset) as usize; let entry_len = header_size + entry_stats.compressed_size; process_entry( check, object_kind, buf, index_entry, || pack.entry_crc32(index_entry.pack_offset, entry_len), progress, processor, )?; Ok(entry_stats) } } #[allow(clippy::too_many_arguments)] fn process_entry( check: SafetyCheck, object_kind: gix_object::Kind, decompressed: &[u8], index_entry: &index::Entry, pack_entry_crc32: impl FnOnce() -> u32, progress: &dyn Progress, processor: &mut impl FnMut(gix_object::Kind, &[u8], &index::Entry, &dyn Progress) -> Result<(), E>, ) -> Result<(), Error> where E: std::error::Error + Send + Sync + 'static, { if check.object_checksum() { let actual_oid = gix_object::compute_hash(index_entry.oid.kind(), object_kind, decompressed); if actual_oid != index_entry.oid { return Err(Error::PackObjectMismatch { actual: actual_oid, expected: index_entry.oid, offset: index_entry.pack_offset, kind: object_kind, }); } if let Some(desired_crc32) = index_entry.crc32 { let actual_crc32 = pack_entry_crc32(); if actual_crc32 != desired_crc32 { return Err(Error::Crc32Mismatch { actual: actual_crc32, expected: desired_crc32, offset: index_entry.pack_offset, kind: object_kind, }); } } } processor(object_kind, decompressed, index_entry, progress).map_err(Error::Processor) } gix-pack-0.56.0/src/index/traverse/reduce.rs000064400000000000000000000102321046102023000167630ustar 00000000000000use std::{ sync::atomic::{AtomicBool, Ordering}, time::Instant, }; use gix_features::{ parallel, progress::Progress, threading::{lock, Mutable, OwnShared}, }; use crate::{data, index::traverse}; fn add_decode_result(lhs: &mut data::decode::entry::Outcome, rhs: data::decode::entry::Outcome) { lhs.num_deltas += rhs.num_deltas; lhs.decompressed_size += rhs.decompressed_size; lhs.compressed_size += rhs.compressed_size; lhs.object_size += rhs.object_size; } fn div_decode_result(lhs: &mut data::decode::entry::Outcome, div: usize) { if div != 0 { lhs.num_deltas = (lhs.num_deltas as f32 / div as f32) as u32; lhs.decompressed_size /= div as u64; lhs.compressed_size /= div; lhs.object_size /= div as u64; } } pub struct Reducer<'a, P, E> { progress: OwnShared>, check: traverse::SafetyCheck, then: Instant, entries_seen: usize, stats: traverse::Statistics, should_interrupt: &'a AtomicBool, _error: std::marker::PhantomData, } impl<'a, P, E> Reducer<'a, P, E> where P: Progress, { pub fn from_progress( progress: OwnShared>, pack_data_len_in_bytes: usize, check: traverse::SafetyCheck, should_interrupt: &'a AtomicBool, ) -> Self { let stats = 
traverse::Statistics { pack_size: pack_data_len_in_bytes as u64, ..Default::default() }; Reducer { progress, check, then: Instant::now(), entries_seen: 0, should_interrupt, stats, _error: Default::default(), } } } impl parallel::Reduce for Reducer<'_, P, E> where P: Progress, E: std::error::Error + Send + Sync + 'static, { type Input = Result, traverse::Error>; type FeedProduce = (); type Output = traverse::Statistics; type Error = traverse::Error; fn feed(&mut self, input: Self::Input) -> Result<(), Self::Error> { let chunk_stats: Vec<_> = match input { Err(err @ traverse::Error::PackDecode { .. }) if !self.check.fatal_decode_error() => { lock(&self.progress).info(format!("Ignoring decode error: {err}")); return Ok(()); } res => res, }?; self.entries_seen += chunk_stats.len(); let chunk_total = chunk_stats.into_iter().fold( data::decode::entry::Outcome::default_from_kind(gix_object::Kind::Tree), |mut total, stats| { *self.stats.objects_per_chain_length.entry(stats.num_deltas).or_insert(0) += 1; self.stats.total_decompressed_entries_size += stats.decompressed_size; self.stats.total_compressed_entries_size += stats.compressed_size as u64; self.stats.total_object_size += stats.object_size; use gix_object::Kind::*; match stats.kind { Commit => self.stats.num_commits += 1, Tree => self.stats.num_trees += 1, Blob => self.stats.num_blobs += 1, Tag => self.stats.num_tags += 1, } add_decode_result(&mut total, stats); total }, ); add_decode_result(&mut self.stats.average, chunk_total); lock(&self.progress).set(self.entries_seen); if self.should_interrupt.load(Ordering::SeqCst) { return Err(Self::Error::Interrupted); } Ok(()) } fn finalize(mut self) -> Result { div_decode_result(&mut self.stats.average, self.entries_seen); let elapsed_s = self.then.elapsed().as_secs_f32(); let objects_per_second = (self.entries_seen as f32 / elapsed_s) as u32; lock(&self.progress).info(format!( "of {} objects done in {:.2}s ({} objects/s, ~{}/s)", self.entries_seen, elapsed_s, objects_per_second, gix_features::progress::bytesize::ByteSize(self.stats.average.object_size * u64::from(objects_per_second)) )); Ok(self.stats) } } gix-pack-0.56.0/src/index/traverse/types.rs000064400000000000000000000115721046102023000166700ustar 00000000000000use std::{collections::BTreeMap, marker::PhantomData}; /// Statistics regarding object encountered during execution of the [`traverse()`][crate::index::File::traverse()] method. #[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Statistics { /// The average over all decoded objects pub average: crate::data::decode::entry::Outcome, /// A mapping of the length of the chain to the amount of objects at that length. /// /// A length of 0 indicates full objects, and everything above that involves the given amount /// of delta objects. 
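/// For example, a map of `{0: 100, 1: 20, 2: 5}` would mean 100 undeltified base objects, 20 objects reconstructed by applying a single delta, and 5 objects whose delta chain is two links long.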
pub objects_per_chain_length: BTreeMap, /// The amount of bytes in all compressed streams, one per entry pub total_compressed_entries_size: u64, /// The amount of bytes in all decompressed streams, one per entry pub total_decompressed_entries_size: u64, /// The amount of bytes occupied by all undeltified, decompressed objects pub total_object_size: u64, /// The amount of bytes occupied by the pack itself pub pack_size: u64, /// The amount of objects encountered that were commits pub num_commits: u32, /// The amount of objects encountered that were trees pub num_trees: u32, /// The amount of objects encountered that were tags pub num_tags: u32, /// The amount of objects encountered that were blobs pub num_blobs: u32, } impl Default for Statistics { fn default() -> Self { Statistics { average: crate::data::decode::entry::Outcome::default_from_kind(gix_object::Kind::Tree), objects_per_chain_length: Default::default(), total_compressed_entries_size: 0, total_decompressed_entries_size: 0, total_object_size: 0, pack_size: 0, num_blobs: 0, num_commits: 0, num_trees: 0, num_tags: 0, } } } /// The ways to validate decoded objects before passing them to the processor. #[derive(Default, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum SafetyCheck { /// Don't verify the validity of the checksums stored in the index and pack file SkipFileChecksumVerification, /// All of the above, and also don't perform any object checksum verification SkipFileAndObjectChecksumVerification, /// All of the above, and only log object decode errors. /// /// Useful if there is a damaged pack and you would like to traverse as many objects as possible. SkipFileAndObjectChecksumVerificationAndNoAbortOnDecodeError, /// Perform all available safety checks before operating on the pack and /// abort if any of them fails #[default] All, } impl SafetyCheck { pub(crate) fn file_checksum(&self) -> bool { matches!(self, SafetyCheck::All) } pub(crate) fn object_checksum(&self) -> bool { matches!(self, SafetyCheck::All | SafetyCheck::SkipFileChecksumVerification) } pub(crate) fn fatal_decode_error(&self) -> bool { match self { SafetyCheck::All | SafetyCheck::SkipFileChecksumVerification | SafetyCheck::SkipFileAndObjectChecksumVerification => true, SafetyCheck::SkipFileAndObjectChecksumVerificationAndNoAbortOnDecodeError => false, } } } /// The way we verify the pack #[derive(Default, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum Algorithm { /// Build an index to allow decoding each delta and base exactly once, saving a lot of computational /// resources at the expense of resident memory, as we will use an additional `DeltaTree` to accelerate /// delta chain resolution. #[default] DeltaTreeLookup, /// We look up each object similarly to what would happen during normal repository use. /// Uses more compute resources as it will resolve delta chains from back to front, but starts right away /// without indexing or investing any memory in indices. /// /// This option may be well suited for big packs in memory-starved systems that support memory mapping. Lookup, } /// The progress ids used in [`traverse()`][crate::index::File::traverse()]. /// /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization.
#[derive(Debug, Copy, Clone)] pub enum ProgressId { /// A root progress which isn't actually used, but links to the `ProgressId` of the lookup version of the algorithm. WithLookup(PhantomData), /// A root progress which isn't actually used, but links to the `ProgressId` of the indexed version of the algorithm. WithIndex(PhantomData), } gix-pack-0.56.0/src/index/traverse/with_index.rs000064400000000000000000000234651046102023000176720ustar 00000000000000use std::sync::atomic::{AtomicBool, Ordering}; use gix_features::{parallel, progress::DynNestedProgress}; use super::Error; use crate::{ cache::delta::traverse, index::{self, traverse::Outcome, util::index_entries_sorted_by_offset_ascending}, }; /// Traversal options for [`traverse_with_index()`][index::File::traverse_with_index()] #[derive(Default)] pub struct Options { /// If `Some`, only use the given amount of threads. Otherwise, the amount of threads to use will be selected based on /// the amount of available logical cores. pub thread_limit: Option, /// The kinds of safety checks to perform. pub check: crate::index::traverse::SafetyCheck, } /// The progress ids used in [`index::File::traverse_with_index()`]. /// /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. #[derive(Debug, Copy, Clone)] pub enum ProgressId { /// The amount of bytes currently processed to generate a checksum of the *pack data file*. HashPackDataBytes, /// The amount of bytes currently processed to generate a checksum of the *pack index file*. HashPackIndexBytes, /// Collect all object hashes into a vector and sort it by their pack offset. CollectSortedIndexEntries, /// Count the objects processed when building a cache tree from all objects in a pack index. TreeFromOffsetsObjects, /// The amount of objects which were decoded. DecodedObjects, /// The amount of bytes that were decoded in total, as the sum of all bytes to represent all decoded objects. DecodedBytes, } impl From for gix_features::progress::Id { fn from(v: ProgressId) -> Self { match v { ProgressId::HashPackDataBytes => *b"PTHP", ProgressId::HashPackIndexBytes => *b"PTHI", ProgressId::CollectSortedIndexEntries => *b"PTCE", ProgressId::TreeFromOffsetsObjects => *b"PTDI", ProgressId::DecodedObjects => *b"PTRO", ProgressId::DecodedBytes => *b"PTDB", } } } /// Traversal with index impl index::File { /// Iterate through all _decoded objects_ in the given `pack` and handle them with a `Processor`, using an index to reduce waste /// at the cost of memory. /// /// For more details, see the documentation on the [`traverse()`][index::File::traverse()] method. 
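///
/// Below is a minimal calling sketch rather than a verbatim test: the file paths are hypothetical,
/// and `gix_features::progress::Discard` is assumed to satisfy the progress bound.
///
/// ```ignore
/// use std::sync::atomic::AtomicBool;
///
/// let index = gix_pack::index::File::at("pack-1234.idx", gix_hash::Kind::Sha1)?;
/// let pack = gix_pack::data::File::at("pack-1234.pack", gix_hash::Kind::Sha1)?;
/// let mut progress = gix_features::progress::Discard; // assumed to implement `DynNestedProgress`
/// let outcome = index.traverse_with_index(
///     &pack,
///     // the processor is called once per decoded object, along with its index entry
///     |kind, _data, _entry, _progress| -> Result<(), std::convert::Infallible> {
///         if kind == gix_object::Kind::Commit {
///             // a real processor would parse or forward the decoded object here
///         }
///         Ok(())
///     },
///     &mut progress,
///     &AtomicBool::new(false),
///     Default::default(),
/// )?;
/// assert_eq!(outcome.statistics.pack_size, pack.data_len() as u64);
/// ```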
pub fn traverse_with_index( &self, pack: &crate::data::File, mut processor: Processor, progress: &mut dyn DynNestedProgress, should_interrupt: &AtomicBool, Options { check, thread_limit }: Options, ) -> Result> where Processor: FnMut(gix_object::Kind, &[u8], &index::Entry, &dyn gix_features::progress::Progress) -> Result<(), E> + Send + Clone, E: std::error::Error + Send + Sync + 'static, { let (verify_result, traversal_result) = parallel::join( { let mut pack_progress = progress.add_child_with_id( format!( "Hash of pack '{}'", pack.path().file_name().expect("pack has filename").to_string_lossy() ), ProgressId::HashPackDataBytes.into(), ); let mut index_progress = progress.add_child_with_id( format!( "Hash of index '{}'", self.path.file_name().expect("index has filename").to_string_lossy() ), ProgressId::HashPackIndexBytes.into(), ); move || { let res = self.possibly_verify(pack, check, &mut pack_progress, &mut index_progress, should_interrupt); if res.is_err() { should_interrupt.store(true, Ordering::SeqCst); } res } }, || -> Result<_, Error<_>> { let sorted_entries = index_entries_sorted_by_offset_ascending( self, &mut progress.add_child_with_id( "collecting sorted index".into(), ProgressId::CollectSortedIndexEntries.into(), ), ); /* Pack Traverse Collect sorted Entries */ let tree = crate::cache::delta::Tree::from_offsets_in_pack( pack.path(), sorted_entries.into_iter().map(Entry::from), &|e| e.index_entry.pack_offset, &|id| self.lookup(id).map(|idx| self.pack_offset_at_index(idx)), &mut progress.add_child_with_id("indexing".into(), ProgressId::TreeFromOffsetsObjects.into()), should_interrupt, self.object_hash, )?; let mut outcome = digest_statistics(tree.traverse( |slice, pack| pack.entry_slice(slice), pack, pack.pack_end() as u64, move |data, progress, traverse::Context { entry: pack_entry, entry_end, decompressed: bytes, level, }| { let object_kind = pack_entry.header.as_kind().expect("non-delta object"); data.level = level; data.decompressed_size = pack_entry.decompressed_size; data.object_kind = object_kind; data.compressed_size = entry_end - pack_entry.data_offset; data.object_size = bytes.len() as u64; let result = index::traverse::process_entry( check, object_kind, bytes, &data.index_entry, || { // TODO: Fix this - we overwrite the header of 'data' which also changes the computed entry size, // causing index and pack to seemingly mismatch. This is surprising, and should be done differently. // debug_assert_eq!(&data.index_entry.pack_offset, &pack_entry.pack_offset()); gix_features::hash::crc32( pack.entry_slice(data.index_entry.pack_offset..entry_end) .expect("slice pointing into the pack (by now data is verified)"), ) }, progress, &mut processor, ); match result { Err(err @ Error::PackDecode { .. 
}) if !check.fatal_decode_error() => { progress.info(format!("Ignoring decode error: {err}")); Ok(()) } res => res, } }, traverse::Options { object_progress: Box::new( progress.add_child_with_id("Resolving".into(), ProgressId::DecodedObjects.into()), ), size_progress: &mut progress.add_child_with_id("Decoding".into(), ProgressId::DecodedBytes.into()), thread_limit, should_interrupt, object_hash: self.object_hash, }, )?); outcome.pack_size = pack.data_len() as u64; Ok(outcome) }, ); Ok(Outcome { actual_index_checksum: verify_result?, statistics: traversal_result?, }) } } struct Entry { index_entry: crate::index::Entry, object_kind: gix_object::Kind, object_size: u64, decompressed_size: u64, compressed_size: u64, level: u16, } impl From for Entry { fn from(index_entry: crate::index::Entry) -> Self { Entry { index_entry, level: 0, object_kind: gix_object::Kind::Tree, object_size: 0, decompressed_size: 0, compressed_size: 0, } } } fn digest_statistics(traverse::Outcome { roots, children }: traverse::Outcome) -> index::traverse::Statistics { let mut res = index::traverse::Statistics::default(); let average = &mut res.average; for item in roots.iter().chain(children.iter()) { res.total_compressed_entries_size += item.data.compressed_size; res.total_decompressed_entries_size += item.data.decompressed_size; res.total_object_size += item.data.object_size; *res.objects_per_chain_length .entry(u32::from(item.data.level)) .or_insert(0) += 1; average.decompressed_size += item.data.decompressed_size; average.compressed_size += item.data.compressed_size as usize; average.object_size += item.data.object_size; average.num_deltas += u32::from(item.data.level); use gix_object::Kind::*; match item.data.object_kind { Blob => res.num_blobs += 1, Tree => res.num_trees += 1, Tag => res.num_tags += 1, Commit => res.num_commits += 1, }; } let num_nodes = roots.len() + children.len(); average.decompressed_size /= num_nodes as u64; average.compressed_size /= num_nodes; average.object_size /= num_nodes as u64; average.num_deltas /= num_nodes as u32; res } gix-pack-0.56.0/src/index/traverse/with_lookup.rs000064400000000000000000000175531046102023000200750ustar 00000000000000use std::sync::atomic::{AtomicBool, Ordering}; use gix_features::{ parallel::{self, in_parallel_if}, progress::{self, Count, DynNestedProgress, Progress}, threading::{lock, Mutable, OwnShared}, zlib, }; use super::{Error, Reducer}; use crate::{ data, exact_vec, index, index::{traverse::Outcome, util}, }; /// Traversal options for [`index::File::traverse_with_lookup()`] pub struct Options { /// If `Some`, only use the given amount of threads. Otherwise, the amount of threads to use will be selected based on /// the amount of available logical cores. pub thread_limit: Option, /// The kinds of safety checks to perform. pub check: index::traverse::SafetyCheck, /// A function to create a pack cache pub make_pack_lookup_cache: F, } impl Default for Options crate::cache::Never> { fn default() -> Self { Options { check: Default::default(), thread_limit: None, make_pack_lookup_cache: || crate::cache::Never, } } } /// The progress ids used in [`index::File::traverse_with_lookup()`]. /// /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. #[derive(Debug, Copy, Clone)] pub enum ProgressId { /// The amount of bytes currently processed to generate a checksum of the *pack data file*. 
HashPackDataBytes, /// The amount of bytes currently processed to generate a checksum of the *pack index file*. HashPackIndexBytes, /// Collect all object hashes into a vector and sort it by their pack offset. CollectSortedIndexEntries, /// The amount of objects which were decoded by brute-force. DecodedObjects, } impl From for gix_features::progress::Id { fn from(v: ProgressId) -> Self { match v { ProgressId::HashPackDataBytes => *b"PTHP", ProgressId::HashPackIndexBytes => *b"PTHI", ProgressId::CollectSortedIndexEntries => *b"PTCE", ProgressId::DecodedObjects => *b"PTRO", } } } /// Verify and validate the content of the index file impl index::File { /// Iterate through all _decoded objects_ in the given `pack` and handle them with a `Processor` using a cache to reduce the amount of /// waste while decoding objects. /// /// For more details, see the documentation on the [`traverse()`][index::File::traverse()] method. pub fn traverse_with_lookup( &self, mut processor: Processor, pack: &data::File, progress: &mut dyn DynNestedProgress, should_interrupt: &AtomicBool, Options { thread_limit, check, make_pack_lookup_cache, }: Options, ) -> Result> where C: crate::cache::DecodeEntry, E: std::error::Error + Send + Sync + 'static, Processor: FnMut(gix_object::Kind, &[u8], &index::Entry, &dyn Progress) -> Result<(), E> + Send + Clone, F: Fn() -> C + Send + Clone, { let (verify_result, traversal_result) = parallel::join( { let mut pack_progress = progress.add_child_with_id( format!( "Hash of pack '{}'", pack.path().file_name().expect("pack has filename").to_string_lossy() ), ProgressId::HashPackDataBytes.into(), ); let mut index_progress = progress.add_child_with_id( format!( "Hash of index '{}'", self.path.file_name().expect("index has filename").to_string_lossy() ), ProgressId::HashPackIndexBytes.into(), ); move || { let res = self.possibly_verify(pack, check, &mut pack_progress, &mut index_progress, should_interrupt); if res.is_err() { should_interrupt.store(true, Ordering::SeqCst); } res } }, || { let index_entries = util::index_entries_sorted_by_offset_ascending( self, &mut progress.add_child_with_id( "collecting sorted index".into(), ProgressId::CollectSortedIndexEntries.into(), ), ); let (chunk_size, thread_limit, available_cores) = parallel::optimize_chunk_size_and_thread_limit(1000, Some(index_entries.len()), thread_limit, None); let there_are_enough_entries_to_process = || index_entries.len() > chunk_size * available_cores; let input_chunks = index_entries.chunks(chunk_size); let reduce_progress = OwnShared::new(Mutable::new({ let mut p = progress.add_child_with_id("Traversing".into(), ProgressId::DecodedObjects.into()); p.init(Some(self.num_objects() as usize), progress::count("objects")); p })); let state_per_thread = { let reduce_progress = reduce_progress.clone(); move |index| { ( make_pack_lookup_cache(), Vec::with_capacity(2048), // decode buffer zlib::Inflate::default(), lock(&reduce_progress) .add_child_with_id(format!("thread {index}"), gix_features::progress::UNKNOWN), // per thread progress ) } }; in_parallel_if( there_are_enough_entries_to_process, input_chunks, thread_limit, state_per_thread, move |entries: &[index::Entry], (cache, buf, inflate, progress)| -> Result, Error<_>> { progress.init( Some(entries.len()), gix_features::progress::count_with_decimals("objects", 2), ); let mut stats = exact_vec(entries.len()); progress.set(0); for index_entry in entries.iter() { let result = self.decode_and_process_entry( check, pack, cache, buf, inflate, progress, index_entry, &mut 
processor, ); progress.inc(); let stat = match result { Err(err @ Error::PackDecode { .. }) if !check.fatal_decode_error() => { progress.info(format!("Ignoring decode error: {err}")); continue; } res => res, }?; stats.push(stat); if should_interrupt.load(Ordering::Relaxed) { break; } } Ok(stats) }, Reducer::from_progress(reduce_progress, pack.data_len(), check, should_interrupt), ) }, ); Ok(Outcome { actual_index_checksum: verify_result?, statistics: traversal_result?, }) } } gix-pack-0.56.0/src/index/util.rs000064400000000000000000000011011046102023000146310ustar 00000000000000use std::time::Instant; use crate::exact_vec; use gix_features::progress::{self, Progress}; pub(crate) fn index_entries_sorted_by_offset_ascending( idx: &crate::index::File, progress: &mut dyn Progress, ) -> Vec { progress.init(Some(idx.num_objects as usize), progress::count("entries")); let start = Instant::now(); let mut v = exact_vec(idx.num_objects as usize); for entry in idx.iter() { v.push(entry); progress.inc(); } v.sort_by_key(|e| e.pack_offset); progress.show_throughput(start); v } gix-pack-0.56.0/src/index/verify.rs000064400000000000000000000252111046102023000151700ustar 00000000000000use std::sync::atomic::AtomicBool; use gix_features::progress::{DynNestedProgress, Progress}; use gix_object::WriteTo; use crate::index; /// pub mod integrity { use std::marker::PhantomData; use gix_object::bstr::BString; /// Returned by [`index::File::verify_integrity()`][crate::index::File::verify_integrity()]. #[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("Reserialization of an object failed")] Io(#[from] std::io::Error), #[error("The fan at index {index} is out of order as it's larger then the following value.")] Fan { index: usize }, #[error("{kind} object {id} could not be decoded")] ObjectDecode { source: gix_object::decode::Error, kind: gix_object::Kind, id: gix_hash::ObjectId, }, #[error("{kind} object {id} wasn't re-encoded without change, wanted\n{expected}\n\nGOT\n\n{actual}")] ObjectEncodeMismatch { kind: gix_object::Kind, id: gix_hash::ObjectId, expected: BString, actual: BString, }, } /// Returned by [`index::File::verify_integrity()`][crate::index::File::verify_integrity()]. pub struct Outcome { /// The computed checksum of the index which matched the stored one. pub actual_index_checksum: gix_hash::ObjectId, /// The packs traversal outcome, if one was provided pub pack_traverse_statistics: Option, } /// Additional options to define how the integrity should be verified. #[derive(Clone)] pub struct Options { /// The thoroughness of the verification pub verify_mode: crate::index::verify::Mode, /// The way to traverse packs pub traversal: crate::index::traverse::Algorithm, /// The amount of threads to use of `Some(N)`, with `None|Some(0)` using all available cores are used. pub thread_limit: Option, /// A function to create a pack cache pub make_pack_lookup_cache: F, } impl Default for Options crate::cache::Never> { fn default() -> Self { Options { verify_mode: Default::default(), traversal: Default::default(), thread_limit: None, make_pack_lookup_cache: || crate::cache::Never, } } } /// The progress ids used in [`index::File::verify_integrity()`][crate::index::File::verify_integrity()]. /// /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. #[derive(Debug, Copy, Clone)] pub enum ProgressId { /// The amount of bytes read to verify the index checksum. 
ChecksumBytes, /// A root progress for traversal which isn't actually used directly, but here to link to the respective `ProgressId` types. Traverse(PhantomData), } impl From for gix_features::progress::Id { fn from(v: ProgressId) -> Self { match v { ProgressId::ChecksumBytes => *b"PTHI", ProgressId::Traverse(_) => gix_features::progress::UNKNOWN, } } } } /// pub mod checksum { /// Returned by [`index::File::verify_checksum()`][crate::index::File::verify_checksum()]. pub type Error = crate::verify::checksum::Error; } /// Various ways in which a pack and index can be verified #[derive(Default, Debug, Eq, PartialEq, Hash, Clone, Copy)] pub enum Mode { /// Validate the object hash and CRC32 HashCrc32, /// Validate hash and CRC32, and decode each non-Blob object. /// Each object should be valid, i.e. be decodable. HashCrc32Decode, /// Validate hash and CRC32, and decode and encode each non-Blob object. /// Each object should yield exactly the same hash when re-encoded. #[default] HashCrc32DecodeEncode, } /// Information to allow verifying the integrity of an index with the help of its corresponding pack. pub struct PackContext<'a, F> { /// The pack data file itself. pub data: &'a crate::data::File, /// The options further configuring the pack traversal and verification pub options: integrity::Options, } /// Verify and validate the content of the index file impl index::File { /// Returns the trailing hash stored at the end of this index file. /// /// It's a hash over all bytes of the index. pub fn index_checksum(&self) -> gix_hash::ObjectId { gix_hash::ObjectId::from_bytes_or_panic(&self.data[self.data.len() - self.hash_len..]) } /// Returns the hash of the pack data file that this index file corresponds to. /// /// It should [`crate::data::File::checksum()`] of the corresponding pack data file. pub fn pack_checksum(&self) -> gix_hash::ObjectId { let from = self.data.len() - self.hash_len * 2; gix_hash::ObjectId::from_bytes_or_panic(&self.data[from..][..self.hash_len]) } /// Validate that our [`index_checksum()`][index::File::index_checksum()] matches the actual contents /// of this index file, and return it if it does. pub fn verify_checksum( &self, progress: &mut dyn Progress, should_interrupt: &AtomicBool, ) -> Result { crate::verify::checksum_on_disk_or_mmap( self.path(), &self.data, self.index_checksum(), self.object_hash, progress, should_interrupt, ) } /// The most thorough validation of integrity of both index file and the corresponding pack data file, if provided. /// Returns the checksum of the index file, the traversal outcome and the given progress if the integrity check is successful. /// /// If `pack` is provided, it is expected (and validated to be) the pack belonging to this index. /// It will be used to validate internal integrity of the pack before checking each objects integrity /// is indeed as advertised via its SHA1 as stored in this index, as well as the CRC32 hash. /// The last member of the Option is a function returning an implementation of [`crate::cache::DecodeEntry`] to be used if /// the [`index::traverse::Algorithm`] is `Lookup`. /// To set this to `None`, use `None::<(_, _, _, fn() -> crate::cache::Never)>`. /// /// The `thread_limit` optionally specifies the amount of threads to be used for the [pack traversal][index::File::traverse()]. /// `make_cache` is only used in case a `pack` is specified, use existing implementations in the [`crate::cache`] module. 
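///
/// A minimal calling sketch follows, rather than a verbatim test: the file paths are hypothetical,
/// and `gix_features::progress::Discard` is assumed to satisfy the progress bound.
///
/// ```ignore
/// use std::sync::atomic::AtomicBool;
///
/// let index = gix_pack::index::File::at("pack-1234.idx", gix_hash::Kind::Sha1)?;
/// let pack = gix_pack::data::File::at("pack-1234.pack", gix_hash::Kind::Sha1)?;
/// let mut progress = gix_features::progress::Discard; // assumed to implement `DynNestedProgress`
/// let outcome = index.verify_integrity(
///     Some(gix_pack::index::verify::PackContext {
///         data: &pack,
///         options: gix_pack::index::verify::integrity::Options::default(),
///     }),
///     &mut progress,
///     &AtomicBool::new(false),
/// )?;
/// assert_eq!(outcome.actual_index_checksum, index.index_checksum());
/// assert!(outcome.pack_traverse_statistics.is_some());
/// ```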
/// /// # Tradeoffs /// /// The given `progress` is inevitably consumed if there is an error, which is a tradeoff chosen to easily allow using `?` in the /// error case. pub fn verify_integrity( &self, pack: Option>, progress: &mut dyn DynNestedProgress, should_interrupt: &AtomicBool, ) -> Result> where C: crate::cache::DecodeEntry, F: Fn() -> C + Send + Clone, { if let Some(first_invalid) = crate::verify::fan(&self.fan) { return Err(index::traverse::Error::Processor(integrity::Error::Fan { index: first_invalid, })); } match pack { Some(PackContext { data: pack, options: integrity::Options { verify_mode, traversal, thread_limit, make_pack_lookup_cache, }, }) => self .traverse( pack, progress, should_interrupt, { let mut encode_buf = Vec::with_capacity(2048); move |kind, data, index_entry, progress| { Self::verify_entry(verify_mode, &mut encode_buf, kind, data, index_entry, progress) } }, index::traverse::Options { traversal, thread_limit, check: index::traverse::SafetyCheck::All, make_pack_lookup_cache, }, ) .map(|o| integrity::Outcome { actual_index_checksum: o.actual_index_checksum, pack_traverse_statistics: Some(o.statistics), }), None => self .verify_checksum( &mut progress .add_child_with_id("Sha1 of index".into(), integrity::ProgressId::ChecksumBytes.into()), should_interrupt, ) .map_err(Into::into) .map(|id| integrity::Outcome { actual_index_checksum: id, pack_traverse_statistics: None, }), } } #[allow(clippy::too_many_arguments)] fn verify_entry( verify_mode: Mode, encode_buf: &mut Vec, object_kind: gix_object::Kind, buf: &[u8], index_entry: &index::Entry, _progress: &dyn gix_features::progress::Progress, ) -> Result<(), integrity::Error> { if let Mode::HashCrc32Decode | Mode::HashCrc32DecodeEncode = verify_mode { use gix_object::Kind::*; match object_kind { Tree | Commit | Tag => { let object = gix_object::ObjectRef::from_bytes(object_kind, buf).map_err(|err| { integrity::Error::ObjectDecode { source: err, kind: object_kind, id: index_entry.oid, } })?; if let Mode::HashCrc32DecodeEncode = verify_mode { encode_buf.clear(); object.write_to(&mut *encode_buf)?; if encode_buf.as_slice() != buf { return Err(integrity::Error::ObjectEncodeMismatch { kind: object_kind, id: index_entry.oid, expected: buf.into(), actual: encode_buf.clone().into(), }); } } } Blob => {} }; } Ok(()) } } gix-pack-0.56.0/src/index/write/error.rs000064400000000000000000000023711046102023000161510ustar 00000000000000use std::io; /// Returned by [`crate::index::File::write_data_iter_to_stream()`] #[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("An IO error occurred when reading the pack or creating a temporary file")] Io(#[from] io::Error), #[error("A pack entry could not be extracted")] PackEntryDecode(#[from] crate::data::input::Error), #[error("Indices of type {} cannot be written, only {} are supported", *.0 as usize, crate::index::Version::default() as usize)] Unsupported(crate::index::Version), #[error("Ref delta objects are not supported as there is no way to look them up. 
Resolve them beforehand.")] IteratorInvariantNoRefDelta, #[error("The iterator failed to set a trailing hash over all prior pack entries in the last provided entry")] IteratorInvariantTrailer, #[error("Only u32::MAX objects can be stored in a pack, found {0}")] IteratorInvariantTooManyObjects(usize), #[error("{pack_offset} is not a valid offset for pack offset {distance}")] IteratorInvariantBaseOffset { pack_offset: u64, distance: u64 }, #[error(transparent)] Tree(#[from] crate::cache::delta::Error), #[error(transparent)] TreeTraversal(#[from] crate::cache::delta::traverse::Error), } gix-pack-0.56.0/src/index/write/mod.rs000064400000000000000000000251761046102023000156060ustar 00000000000000use std::{io, sync::atomic::AtomicBool}; pub use error::Error; use gix_features::progress::{self, prodash::DynNestedProgress, Count, Progress}; use crate::cache::delta::{traverse, Tree}; mod error; pub(crate) struct TreeEntry { pub id: gix_hash::ObjectId, pub crc32: u32, } /// Information gathered while executing [`write_data_iter_to_stream()`][crate::index::File::write_data_iter_to_stream] #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Outcome { /// The version of the verified index pub index_version: crate::index::Version, /// The verified checksum of the verified index pub index_hash: gix_hash::ObjectId, /// The hash of the '.pack' file, also found in its trailing bytes pub data_hash: gix_hash::ObjectId, /// The amount of objects that were verified, always the amount of objects in the pack. pub num_objects: u32, } /// The progress ids used in [`write_data_iter_to_stream()`][crate::index::File::write_data_iter_to_stream()]. /// /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. #[derive(Debug, Copy, Clone)] pub enum ProgressId { /// Counts the amount of objects that were indexed thus far. IndexObjects, /// The amount of bytes that were decompressed while decoding pack entries. /// /// This is done to determine entry boundaries. DecompressedBytes, /// The amount of objects whose hashes were computed. /// /// This is done by decoding them, which typically involves decoding delta objects. ResolveObjects, /// The amount of bytes that were decoded in total, as the sum of all bytes to represent all resolved objects. DecodedBytes, /// The amount of bytes written to the index file. IndexBytesWritten, } impl From for gix_features::progress::Id { fn from(v: ProgressId) -> Self { match v { ProgressId::IndexObjects => *b"IWIO", ProgressId::DecompressedBytes => *b"IWDB", ProgressId::ResolveObjects => *b"IWRO", ProgressId::DecodedBytes => *b"IWDB", ProgressId::IndexBytesWritten => *b"IWBW", } } } /// Various ways of writing an index file from pack entries impl crate::index::File { /// Write information about `entries` as obtained from a pack data file into a pack index file via the `out` stream. /// The resolver produced by `make_resolver` must resolve pack entries from the same pack data file that produced the /// `entries` iterator. /// /// * `version` is the version of pack index to produce, use [`crate::index::Version::default()`] if in doubt. /// * `thread_limit` is used for a parallel tree traversal for obtaining object hashes with optimal performance. /// * `root_progress` is the top-level progress to stay informed about the progress of this potentially long-running /// computation.
/// * `object_hash` defines what kind of object hash we write into the index file. /// * `pack_version` is the version of the underlying pack for which `entries` are read. It's used in case none of these objects are provided /// to compute a pack-hash. /// /// # Remarks /// /// * neither in-pack nor out-of-pack Ref Deltas are supported here, these must have been resolved beforehand. /// * `make_resolver()` will only be called after the iterator stopped returning elements and produces a function that /// provides all bytes belonging to a pack entry writing them to the given mutable output `Vec`. /// It should return `None` if the entry cannot be resolved from the pack that produced the `entries` iterator, causing /// the write operation to fail. #[allow(clippy::too_many_arguments)] pub fn write_data_iter_to_stream( version: crate::index::Version, make_resolver: F, entries: &mut dyn Iterator>, thread_limit: Option, root_progress: &mut dyn DynNestedProgress, out: &mut dyn io::Write, should_interrupt: &AtomicBool, object_hash: gix_hash::Kind, pack_version: crate::data::Version, ) -> Result where F: FnOnce() -> io::Result<(F2, R)>, R: Send + Sync, F2: for<'r> Fn(crate::data::EntryRange, &'r R) -> Option<&'r [u8]> + Send + Clone, { if version != crate::index::Version::default() { return Err(Error::Unsupported(version)); } let mut num_objects: usize = 0; let mut last_seen_trailer = None; let (anticipated_num_objects, upper_bound) = entries.size_hint(); let worst_case_num_objects_after_thin_pack_resolution = upper_bound.unwrap_or(anticipated_num_objects); let mut tree = Tree::with_capacity(worst_case_num_objects_after_thin_pack_resolution)?; let indexing_start = std::time::Instant::now(); root_progress.init(Some(4), progress::steps()); let mut objects_progress = root_progress.add_child_with_id("indexing".into(), ProgressId::IndexObjects.into()); objects_progress.init(Some(anticipated_num_objects), progress::count("objects")); let mut decompressed_progress = root_progress.add_child_with_id("decompressing".into(), ProgressId::DecompressedBytes.into()); decompressed_progress.init(None, progress::bytes()); let mut pack_entries_end: u64 = 0; for entry in entries { let crate::data::input::Entry { header, pack_offset, crc32, header_size, compressed: _, compressed_size, decompressed_size, trailer, } = entry?; decompressed_progress.inc_by(decompressed_size as usize); let entry_len = u64::from(header_size) + compressed_size; pack_entries_end = pack_offset + entry_len; let crc32 = crc32.expect("crc32 to be computed by the iterator. Caller assures correct configuration."); use crate::data::entry::Header::*; match header { Tree | Blob | Commit | Tag => { tree.add_root( pack_offset, TreeEntry { id: object_hash.null(), crc32, }, )?; } RefDelta { .. 
} => return Err(Error::IteratorInvariantNoRefDelta), OfsDelta { base_distance } => { let base_pack_offset = crate::data::entry::Header::verified_base_pack_offset(pack_offset, base_distance).ok_or( Error::IteratorInvariantBaseOffset { pack_offset, distance: base_distance, }, )?; tree.add_child( base_pack_offset, pack_offset, TreeEntry { id: object_hash.null(), crc32, }, )?; } }; last_seen_trailer = trailer; num_objects += 1; objects_progress.inc(); } let num_objects: u32 = num_objects .try_into() .map_err(|_| Error::IteratorInvariantTooManyObjects(num_objects))?; objects_progress.show_throughput(indexing_start); decompressed_progress.show_throughput(indexing_start); drop(objects_progress); drop(decompressed_progress); root_progress.inc(); let (resolver, pack) = make_resolver()?; let sorted_pack_offsets_by_oid = { let traverse::Outcome { roots, children } = tree.traverse( resolver, &pack, pack_entries_end, |data, _progress, traverse::Context { entry, decompressed: bytes, .. }| { modify_base(data, entry, bytes, version.hash()); Ok::<_, Error>(()) }, traverse::Options { object_progress: Box::new( root_progress.add_child_with_id("Resolving".into(), ProgressId::ResolveObjects.into()), ), size_progress: &mut root_progress .add_child_with_id("Decoding".into(), ProgressId::DecodedBytes.into()), thread_limit, should_interrupt, object_hash, }, )?; root_progress.inc(); let mut items = roots; items.extend(children); { let _progress = root_progress.add_child_with_id("sorting by id".into(), gix_features::progress::UNKNOWN); items.sort_by_key(|e| e.data.id); } root_progress.inc(); items }; let pack_hash = match last_seen_trailer { Some(ph) => ph, None if num_objects == 0 => { let header = crate::data::header::encode(pack_version, 0); let mut hasher = gix_features::hash::hasher(object_hash); hasher.update(&header); gix_hash::ObjectId::from(hasher.digest()) } None => return Err(Error::IteratorInvariantTrailer), }; let index_hash = crate::index::encode::write_to( out, sorted_pack_offsets_by_oid, &pack_hash, version, &mut root_progress.add_child_with_id("writing index file".into(), ProgressId::IndexBytesWritten.into()), )?; root_progress.show_throughput_with( indexing_start, num_objects as usize, progress::count("objects").expect("unit always set"), progress::MessageLevel::Success, ); Ok(Outcome { index_version: version, index_hash, data_hash: pack_hash, num_objects, }) } } fn modify_base(entry: &mut TreeEntry, pack_entry: &crate::data::Entry, decompressed: &[u8], hash: gix_hash::Kind) { let object_kind = pack_entry.header.as_kind().expect("base object as source of iteration"); let id = gix_object::compute_hash(hash, object_kind, decompressed); entry.id = id; } gix-pack-0.56.0/src/lib.rs000075500000000000000000000043341046102023000133310ustar 00000000000000//! Git stores all of its data as _Objects_, which are data along with a hash over all data. Storing objects efficiently //! is what git packs are concerned about. //! //! Packs consist of [data files][data::File] and [index files][index::File]. The latter can be generated from a data file //! and make accessing objects within a pack feasible. //! //! A [Bundle] conveniently combines a data pack alongside its index to allow [finding][Find] objects or verifying the pack. //! Objects returned by `.find(…)` are [objects][gix_object::Data] which know their pack location in order to speed up //! various common operations like creating new packs from existing ones. //! //! 
When traversing all objects in a pack, a _delta tree acceleration structure_ can be built from pack data or an index //! in order to decompress packs in parallel and without any waste. //! ## Feature Flags #![cfg_attr( all(doc, feature = "document-features"), doc = ::document_features::document_features!() )] #![cfg_attr(all(doc, feature = "document-features"), feature(doc_cfg, doc_auto_cfg))] #![deny(missing_docs, rust_2018_idioms, unsafe_code)] /// pub mod bundle; /// A bundle of pack data and the corresponding pack index pub struct Bundle { /// The pack file corresponding to `index` pub pack: data::File, /// The index file corresponding to `pack` pub index: index::File, } /// pub mod find; /// pub mod cache; /// pub mod data; mod find_traits; pub use find_traits::{Find, FindExt}; /// pub mod index; /// pub mod multi_index; /// pub mod verify; mod mmap { use std::path::Path; pub fn read_only(path: &Path) -> std::io::Result { let file = std::fs::File::open(path)?; // SAFETY: we have to take the risk of somebody changing the file underneath. Git never writes into the same file. #[allow(unsafe_code)] unsafe { memmap2::MmapOptions::new().map_copy_read_only(&file) } } } #[inline] fn read_u32(b: &[u8]) -> u32 { u32::from_be_bytes(b.try_into().unwrap()) } #[inline] fn read_u64(b: &[u8]) -> u64 { u64::from_be_bytes(b.try_into().unwrap()) } fn exact_vec(capacity: usize) -> Vec { let mut v = Vec::new(); v.reserve_exact(capacity); v } gix-pack-0.56.0/src/multi_index/access.rs000064400000000000000000000136101046102023000163370ustar 00000000000000use std::{ ops::Range, path::{Path, PathBuf}, }; use crate::{ data, index::PrefixLookupResult, multi_index::{EntryIndex, File, PackIndex, Version}, }; /// Represents an entry within a multi index file, effectively mapping object [`IDs`][gix_hash::ObjectId] to pack data /// files and the offset within. #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Entry { /// The ID of the object. pub oid: gix_hash::ObjectId, /// The offset to the object's header in the pack data file. pub pack_offset: data::Offset, /// The index of the pack matching our [`File::index_names()`] slice. pub pack_index: PackIndex, } /// Access methods impl File { /// Returns the version of the multi-index file. pub fn version(&self) -> Version { self.version } /// Returns the path from which the multi-index file was loaded. /// /// Note that it might have changed in the mean time, or might have been removed as well. pub fn path(&self) -> &Path { &self.path } /// Returns the amount of indices stored in this multi-index file. It's the same as [File::index_names().len()][File::index_names()], /// and returned as one past the highest known index. pub fn num_indices(&self) -> PackIndex { self.num_indices } /// Returns the total amount of objects available for lookup, and returned as one past the highest known entry index pub fn num_objects(&self) -> EntryIndex { self.num_objects } /// Returns the kind of hash function used for object ids available in this index. pub fn object_hash(&self) -> gix_hash::Kind { self.object_hash } /// Returns the checksum over the entire content of the file (excluding the checksum itself). /// /// It can be used to validate it didn't change after creation. pub fn checksum(&self) -> gix_hash::ObjectId { gix_hash::ObjectId::from_bytes_or_panic(&self.data[self.data.len() - self.hash_len..]) } /// Return all names of index files (`*.idx`) whose objects we contain. 
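///
/// A sketch of deriving a pack data file from one of these names, assuming they are plain file
/// names stored next to the multi-pack index (the index path below is hypothetical):
///
/// ```ignore
/// let multi = gix_pack::multi_index::File::at(".git/objects/pack/multi-pack-index")?;
/// let pack_dir = multi.path().parent().expect("the multi-pack index lives in a directory");
/// let first_pack = pack_dir.join(multi.index_names()[0].with_extension("pack"));
/// ```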
/// /// The corresponding pack can be found by replacing the `.idx` extension with `.pack`. pub fn index_names(&self) -> &[PathBuf] { &self.index_names } } impl File { /// Return the object id at the given `index`, which ranges from 0 to [File::num_objects()]. pub fn oid_at_index(&self, index: EntryIndex) -> &gix_hash::oid { debug_assert!(index < self.num_objects, "index out of bounds"); let index: usize = index as usize; let start = self.lookup_ofs + index * self.hash_len; gix_hash::oid::from_bytes_unchecked(&self.data[start..][..self.hash_len]) } /// Given a `prefix`, find an object that matches it uniquely within this index and return `Some(Ok(entry_index))`. /// If there is more than one object matching the object `Some(Err(())` is returned. /// /// Finally, if no object matches the index, the return value is `None`. /// /// Pass `candidates` to obtain the set of entry-indices matching `prefix`, with the same return value as /// one would have received if it remained `None`. It will be empty if no object matched the `prefix`. /// // NOTE: pretty much the same things as in `index::File::lookup`, change things there // as well. pub fn lookup_prefix( &self, prefix: gix_hash::Prefix, candidates: Option<&mut Range>, ) -> Option { crate::index::access::lookup_prefix( prefix, candidates, &self.fan, &|idx| self.oid_at_index(idx), self.num_objects, ) } /// Find the index ranging from 0 to [File::num_objects()] that belongs to data associated with `id`, or `None` if it wasn't found. /// /// Use this index for finding additional information via [`File::pack_id_and_pack_offset_at_index()`]. pub fn lookup(&self, id: impl AsRef) -> Option { crate::index::access::lookup(id.as_ref(), &self.fan, &|idx| self.oid_at_index(idx)) } /// Given the `index` ranging from 0 to [File::num_objects()], return the pack index and its absolute offset into the pack. /// /// The pack-index refers to an entry in the [`index_names`][File::index_names()] list, from which the pack can be derived. pub fn pack_id_and_pack_offset_at_index(&self, index: EntryIndex) -> (PackIndex, data::Offset) { const OFFSET_ENTRY_SIZE: usize = 4 + 4; let index = index as usize; let start = self.offsets_ofs + index * OFFSET_ENTRY_SIZE; const HIGH_BIT: u32 = 1 << 31; let pack_index = crate::read_u32(&self.data[start..][..4]); let offset = &self.data[start + 4..][..4]; let ofs32 = crate::read_u32(offset); let pack_offset = if (ofs32 & HIGH_BIT) == HIGH_BIT { // We determine if large offsets are actually larger than 4GB and if not, we don't use the high-bit to signal anything // but allow the presence of the large-offset chunk to signal what's happening. if let Some(offsets_64) = self.large_offsets_ofs { let from = offsets_64 + (ofs32 ^ HIGH_BIT) as usize * 8; crate::read_u64(&self.data[from..][..8]) } else { u64::from(ofs32) } } else { u64::from(ofs32) }; (pack_index, pack_offset) } /// Return an iterator over all entries within this file. pub fn iter(&self) -> impl Iterator + '_ { (0..self.num_objects).map(move |idx| { let (pack_index, pack_offset) = self.pack_id_and_pack_offset_at_index(idx); Entry { oid: self.oid_at_index(idx).to_owned(), pack_offset, pack_index, } }) } } gix-pack-0.56.0/src/multi_index/chunk.rs000064400000000000000000000231741046102023000162140ustar 00000000000000/// Information for the chunk about index names pub mod index_names { use std::path::{Path, PathBuf}; use gix_object::bstr::{BString, ByteSlice}; /// The ID used for the index-names chunk. 
pub const ID: gix_chunk::Id = *b"PNAM"; /// pub mod decode { use gix_object::bstr::BString; /// The error returned by [`from_bytes()`][super::from_bytes()]. #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error("The pack names were not ordered alphabetically.")] NotOrderedAlphabetically, #[error("Each pack path name must be terminated with a null byte")] MissingNullByte, #[error("Couldn't turn path '{path}' into OS path due to encoding issues")] PathEncoding { path: BString }, #[error("non-padding bytes found after all paths were read.")] UnknownTrailerBytes, } } /// Parse null-separated index names from the given `chunk` of bytes and the expected number of packs and indices. /// Ignore padding bytes which are typically \0. pub fn from_bytes(mut chunk: &[u8], num_packs: u32) -> Result, decode::Error> { let mut out = Vec::new(); for _ in 0..num_packs { let null_byte_pos = chunk.find_byte(b'\0').ok_or(decode::Error::MissingNullByte)?; let path = &chunk[..null_byte_pos]; let path = gix_path::try_from_byte_slice(path) .map_err(|_| decode::Error::PathEncoding { path: BString::from(path), })? .to_owned(); if let Some(previous) = out.last() { if previous >= &path { return Err(decode::Error::NotOrderedAlphabetically); } } out.push(path); chunk = &chunk[null_byte_pos + 1..]; } if !chunk.is_empty() && !chunk.iter().all(|b| *b == 0) { return Err(decode::Error::UnknownTrailerBytes); } // NOTE: git writes garbage into this chunk, usually extra \0 bytes, which we simply ignore. If we were strict // about it we couldn't read this chunk data at all. Ok(out) } /// Calculate the size on disk for our chunk with the given index paths. Note that these are expected to have been processed already /// to actually be file names. pub fn storage_size(paths: impl IntoIterator>) -> u64 { let mut count = 0u64; for path in paths { let path = path.as_ref(); let ascii_path = path.to_str().expect("UTF-8 compatible paths"); assert!( ascii_path.is_ascii(), "must use ascii bytes for correct size computation" ); count += (ascii_path.as_bytes().len() + 1/* null byte */) as u64; } let needed_alignment = CHUNK_ALIGNMENT - (count % CHUNK_ALIGNMENT); if needed_alignment < CHUNK_ALIGNMENT { count += needed_alignment; } count } /// Write all `paths` in order to `out`, including padding. pub fn write( paths: impl IntoIterator>, out: &mut dyn std::io::Write, ) -> std::io::Result<()> { let mut written_bytes = 0; for path in paths { let path = path.as_ref().to_str().expect("UTF-8 path"); out.write_all(path.as_bytes())?; out.write_all(&[0])?; written_bytes += path.as_bytes().len() as u64 + 1; } let needed_alignment = CHUNK_ALIGNMENT - (written_bytes % CHUNK_ALIGNMENT); if needed_alignment < CHUNK_ALIGNMENT { let padding = [0u8; CHUNK_ALIGNMENT as usize]; out.write_all(&padding[..needed_alignment as usize])?; } Ok(()) } const CHUNK_ALIGNMENT: u64 = 4; } /// Information for the chunk with the fanout table pub mod fanout { use crate::multi_index; /// The size of the fanout table pub const SIZE: usize = 4 * 256; /// The id uniquely identifying the fanout table. pub const ID: gix_chunk::Id = *b"OIDF"; /// Decode the fanout table contained in `chunk`, or return `None` if it didn't have the expected size. 
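///
/// A sketch of how such a table is typically consumed: `fan[first_byte]` is one past the last
/// entry whose object id starts with `first_byte`, so it bounds the binary-search window for an
/// id, and `fan[255]` is the total object count.
///
/// ```ignore
/// let fan = from_bytes(chunk).expect("a chunk of exactly 256 * 4 bytes");
/// let byte = id.first_byte() as usize;
/// let lo = if byte == 0 { 0 } else { fan[byte - 1] };
/// let hi = fan[byte];
/// // only entries with indices `lo..hi` can possibly match `id`
/// ```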
pub fn from_bytes(chunk: &[u8]) -> Option<[u32; 256]> { if chunk.len() != SIZE { return None; } let mut out = [0; 256]; for (c, f) in chunk.chunks_exact(4).zip(out.iter_mut()) { *f = u32::from_be_bytes(c.try_into().unwrap()); } out.into() } /// Write the fanout for the given entries, which must be sorted by oid pub(crate) fn write( sorted_entries: &[multi_index::write::Entry], out: &mut dyn std::io::Write, ) -> std::io::Result<()> { let fanout = crate::index::encode::fanout(&mut sorted_entries.iter().map(|e| e.id.first_byte())); for value in fanout.iter() { out.write_all(&value.to_be_bytes())?; } Ok(()) } } /// Information about the oid lookup table. pub mod lookup { use std::ops::Range; use crate::multi_index; /// The id uniquely identifying the oid lookup table. pub const ID: gix_chunk::Id = *b"OIDL"; /// Return the amount of bytes needed to store the data on disk for the given amount of `entries` pub fn storage_size(entries: usize, object_hash: gix_hash::Kind) -> u64 { (entries * object_hash.len_in_bytes()) as u64 } pub(crate) fn write( sorted_entries: &[multi_index::write::Entry], out: &mut dyn std::io::Write, ) -> std::io::Result<()> { for entry in sorted_entries { out.write_all(entry.id.as_slice())?; } Ok(()) } /// Return true if the size of the `offset` range seems to match for a `hash` of the given kind and the amount of objects. pub fn is_valid(offset: &Range, hash: gix_hash::Kind, num_objects: u32) -> bool { (offset.end - offset.start) / hash.len_in_bytes() == num_objects as usize } } /// Information about the offsets table. pub mod offsets { use std::ops::Range; use crate::multi_index; /// The id uniquely identifying the offsets table. pub const ID: gix_chunk::Id = *b"OOFF"; /// Return the amount of bytes needed to offset data for `entries`. pub fn storage_size(entries: usize) -> u64 { (entries * (4 /*pack-id*/ + 4/* pack offset */)) as u64 } pub(crate) fn write( sorted_entries: &[multi_index::write::Entry], large_offsets_needed: bool, out: &mut dyn std::io::Write, ) -> std::io::Result<()> { use crate::index::encode::{HIGH_BIT, LARGE_OFFSET_THRESHOLD}; let mut num_large_offsets = 0u32; for entry in sorted_entries { out.write_all(&entry.pack_index.to_be_bytes())?; let offset: u32 = if large_offsets_needed { if entry.pack_offset > LARGE_OFFSET_THRESHOLD { let res = num_large_offsets | HIGH_BIT; num_large_offsets += 1; res } else { entry.pack_offset as u32 } } else { entry .pack_offset .try_into() .expect("without large offsets, pack-offset fits u32") }; out.write_all(&offset.to_be_bytes())?; } Ok(()) } /// Returns true if the `offset` range seems to match the size required for `num_objects`. pub fn is_valid(offset: &Range, num_objects: u32) -> bool { let entry_size = 4 /* pack-id */ + 4 /* pack-offset */; ((offset.end - offset.start) / num_objects as usize) == entry_size } } /// Information about the large offsets table. pub mod large_offsets { use std::ops::Range; use crate::{index::encode::LARGE_OFFSET_THRESHOLD, multi_index}; /// The id uniquely identifying the large offsets table (with 64 bit offsets) pub const ID: gix_chunk::Id = *b"LOFF"; /// Returns Some(num-large-offset) if there are offsets larger than u32. 
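///
/// Background for the encoding handled here: an entry in the offsets chunk whose pack offset does
/// not fit its 32-bit slot stores `index_into_large_offsets | HIGH_BIT` instead, and the real
/// 64-bit offset lives at that index in the large-offsets chunk. A decoding sketch with
/// illustrative names, mirroring the reader in `File::pack_id_and_pack_offset_at_index()`:
///
/// ```ignore
/// let pack_offset = if ofs32 & HIGH_BIT == HIGH_BIT {
///     read_u64(&large_offsets_chunk[(ofs32 ^ HIGH_BIT) as usize * 8..][..8])
/// } else {
///     u64::from(ofs32)
/// };
/// ```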
    pub(crate) fn num_large_offsets(entries: &[multi_index::write::Entry]) -> Option<usize> {
        let mut num_large_offsets = 0;
        let mut needs_large_offsets = false;
        for entry in entries {
            if entry.pack_offset > LARGE_OFFSET_THRESHOLD {
                num_large_offsets += 1;
            }
            if entry.pack_offset > crate::data::Offset::from(u32::MAX) {
                needs_large_offsets = true;
            }
        }
        needs_large_offsets.then_some(num_large_offsets)
    }

    /// Returns true if the `offsets` range seems to be properly aligned for the data we expect.
    pub fn is_valid(offset: &Range<usize>) -> bool {
        (offset.end - offset.start) % 8 == 0
    }

    pub(crate) fn write(
        sorted_entries: &[multi_index::write::Entry],
        mut num_large_offsets: usize,
        out: &mut dyn std::io::Write,
    ) -> std::io::Result<()> {
        for offset in sorted_entries
            .iter()
            .filter_map(|e| (e.pack_offset > LARGE_OFFSET_THRESHOLD).then_some(e.pack_offset))
        {
            out.write_all(&offset.to_be_bytes())?;
            num_large_offsets = num_large_offsets
                .checked_sub(1)
                .expect("BUG: wrote more offsets than previously found");
        }
        assert_eq!(num_large_offsets, 0, "BUG: wrote fewer offsets than initially counted");
        Ok(())
    }

    /// Return the amount of bytes needed to store the given amount of `large_offsets`.
    pub(crate) fn storage_size(large_offsets: usize) -> u64 {
        8 * large_offsets as u64
    }
}
gix-pack-0.56.0/src/multi_index/init.rs000064400000000000000000000136761046102023000160530ustar 00000000000000
use std::path::Path;

use crate::multi_index::{chunk, File, Version};

mod error {
    use crate::multi_index::chunk;

    /// The error returned by [File::at()][super::File::at()].
    #[derive(Debug, thiserror::Error)]
    #[allow(missing_docs)]
    pub enum Error {
        #[error("Could not open multi-index file at '{path}'")]
        Io {
            source: std::io::Error,
            path: std::path::PathBuf,
        },
        #[error("{message}")]
        Corrupt { message: &'static str },
        #[error("Unsupported multi-index version: {version}")]
        UnsupportedVersion { version: u8 },
        #[error("Unsupported hash kind: {kind}")]
        UnsupportedObjectHash { kind: u8 },
        #[error(transparent)]
        ChunkFileDecode(#[from] gix_chunk::file::decode::Error),
        #[error(transparent)]
        MissingChunk(#[from] gix_chunk::file::index::offset_by_kind::Error),
        #[error(transparent)]
        FileTooLarge(#[from] gix_chunk::file::index::data_by_kind::Error),
        #[error("The multi-pack fan doesn't have the correct size of 256 * 4 bytes")]
        MultiPackFanSize,
        #[error(transparent)]
        PackNames(#[from] chunk::index_names::decode::Error),
        #[error("multi-index chunk {:?} has invalid size: {message}", String::from_utf8_lossy(.id))]
        InvalidChunkSize { id: gix_chunk::Id, message: &'static str },
    }
}
pub use error::Error;

/// Initialization
impl File {
    /// Open the multi-index file at the given `path`.
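    // Usage sketch (the path is hypothetical and error handling is elided):
    //
    //     let midx = gix_pack::multi_index::File::at(".git/objects/pack/multi-pack-index")?;
    //     for index_name in midx.index_names() {
    //         // each name refers to a `.idx` file whose pack this multi-index covers
    //     }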
pub fn at(path: impl AsRef) -> Result { Self::try_from(path.as_ref()) } } impl TryFrom<&Path> for File { type Error = Error; fn try_from(path: &Path) -> Result { let data = crate::mmap::read_only(path).map_err(|source| Error::Io { source, path: path.to_owned(), })?; const TRAILER_LEN: usize = gix_hash::Kind::shortest().len_in_bytes(); /* trailing hash */ if data.len() < Self::HEADER_LEN + gix_chunk::file::Index::size_for_entries(4 /*index names, fan, offsets, oids*/) + chunk::fanout::SIZE + TRAILER_LEN { return Err(Error::Corrupt { message: "multi-index file is truncated and too short", }); } let (version, object_hash, num_chunks, num_indices) = { let (signature, data) = data.split_at(4); if signature != Self::SIGNATURE { return Err(Error::Corrupt { message: "Invalid signature", }); } let (version, data) = data.split_at(1); let version = match version[0] { 1 => Version::V1, version => return Err(Error::UnsupportedVersion { version }), }; let (object_hash, data) = data.split_at(1); let object_hash = gix_hash::Kind::try_from(object_hash[0]) .map_err(|unknown| Error::UnsupportedObjectHash { kind: unknown })?; let (num_chunks, data) = data.split_at(1); let num_chunks = num_chunks[0]; let (_num_base_files, data) = data.split_at(1); // TODO: handle base files once it's clear what this does let (num_indices, _) = data.split_at(4); let num_indices = crate::read_u32(num_indices); (version, object_hash, num_chunks, num_indices) }; let chunks = gix_chunk::file::Index::from_bytes(&data, Self::HEADER_LEN, u32::from(num_chunks))?; let index_names = chunks.data_by_id(&data, chunk::index_names::ID)?; let index_names = chunk::index_names::from_bytes(index_names, num_indices)?; let fan = chunks.data_by_id(&data, chunk::fanout::ID)?; let fan = chunk::fanout::from_bytes(fan).ok_or(Error::MultiPackFanSize)?; let num_objects = fan[255]; let lookup = chunks.validated_usize_offset_by_id(chunk::lookup::ID, |offset| { chunk::lookup::is_valid(&offset, object_hash, num_objects) .then_some(offset) .ok_or(Error::InvalidChunkSize { id: chunk::lookup::ID, message: "The chunk with alphabetically ordered object ids doesn't have the correct size", }) })??; let offsets = chunks.validated_usize_offset_by_id(chunk::offsets::ID, |offset| { chunk::offsets::is_valid(&offset, num_objects) .then_some(offset) .ok_or(Error::InvalidChunkSize { id: chunk::offsets::ID, message: "The chunk with offsets into the pack doesn't have the correct size", }) })??; let large_offsets = chunks .validated_usize_offset_by_id(chunk::large_offsets::ID, |offset| { chunk::large_offsets::is_valid(&offset) .then_some(offset) .ok_or(Error::InvalidChunkSize { id: chunk::large_offsets::ID, message: "The chunk with large offsets into the pack doesn't have the correct size", }) }) .ok() .transpose()?; let checksum_offset = chunks.highest_offset() as usize; let trailer = &data[checksum_offset..]; if trailer.len() != object_hash.len_in_bytes() { return Err(Error::Corrupt { message: "Trailing checksum didn't have the expected size or there were unknown bytes after the checksum.", }); } Ok(File { data, path: path.to_owned(), version, hash_len: object_hash.len_in_bytes(), object_hash, fan, index_names, lookup_ofs: lookup.start, offsets_ofs: offsets.start, large_offsets_ofs: large_offsets.map(|r| r.start), num_objects, num_indices, }) } } gix-pack-0.56.0/src/multi_index/mod.rs000064400000000000000000000022041046102023000156520ustar 00000000000000use std::path::PathBuf; use memmap2::Mmap; /// Known multi-index file versions #[derive(Default, PartialEq, Eq, Ord, 
PartialOrd, Debug, Hash, Clone, Copy)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[allow(missing_docs)]
pub enum Version {
    #[default]
    V1 = 1,
}

/// An index into our [`File::index_names()`] array yielding the name of the index and by implication, its pack file.
pub type PackIndex = u32;

/// The type for referring to indices of an entry within the index file.
pub type EntryIndex = u32;

/// A representation of an index file for multiple packs at the same time, typically stored in a file
/// named 'multi-pack-index'.
pub struct File {
    data: Mmap,
    path: std::path::PathBuf,
    version: Version,
    hash_len: usize,
    object_hash: gix_hash::Kind,
    /// The number of pack files contained within.
    num_indices: u32,
    num_objects: u32,
    fan: [u32; 256],
    index_names: Vec<PathBuf>,
    lookup_ofs: usize,
    offsets_ofs: usize,
    large_offsets_ofs: Option<usize>,
}

///
pub mod write;

///
mod access;

///
pub mod verify;

///
pub mod chunk;

///
pub mod init;
gix-pack-0.56.0/src/multi_index/verify.rs000064400000000000000000000330541046102023000164060ustar 00000000000000
use std::{cmp::Ordering, sync::atomic::AtomicBool, time::Instant};

use gix_features::progress::{Count, DynNestedProgress, Progress};

use crate::{exact_vec, index, multi_index::File};

///
pub mod integrity {
    use crate::multi_index::EntryIndex;

    /// Returned by [`multi_index::File::verify_integrity()`][crate::multi_index::File::verify_integrity()].
    #[derive(thiserror::Error, Debug)]
    #[allow(missing_docs)]
    pub enum Error {
        #[error("Object {id} should be at pack-offset {expected_pack_offset} but was found at {actual_pack_offset}")]
        PackOffsetMismatch {
            id: gix_hash::ObjectId,
            expected_pack_offset: u64,
            actual_pack_offset: u64,
        },
        #[error(transparent)]
        MultiIndexChecksum(#[from] crate::multi_index::verify::checksum::Error),
        #[error(transparent)]
        IndexIntegrity(#[from] crate::index::verify::integrity::Error),
        #[error(transparent)]
        BundleInit(#[from] crate::bundle::init::Error),
        #[error("Counted {actual} objects, but expected {expected} as per multi-index")]
        UnexpectedObjectCount { actual: usize, expected: usize },
        #[error("{id} wasn't found in the index referenced in the multi-pack index")]
        OidNotFound { id: gix_hash::ObjectId },
        #[error("The object id at multi-index entry {index} wasn't in order")]
        OutOfOrder { index: EntryIndex },
        #[error("The fan at index {index} is out of order as it's larger than the following value.")]
        Fan { index: usize },
        #[error("The multi-index claims to have no objects")]
        Empty,
        #[error("Interrupted")]
        Interrupted,
    }

    /// Returned by [`multi_index::File::verify_integrity()`][crate::multi_index::File::verify_integrity()].
    pub struct Outcome {
        /// The computed checksum of the multi-index which matched the stored one.
        pub actual_index_checksum: gix_hash::ObjectId,
        /// For each entry in [`index_names()`][super::File::index_names()], the corresponding pack traversal outcome.
        pub pack_traverse_statistics: Vec<crate::index::traverse::Statistics>,
    }

    /// The progress ids used in [`multi_index::File::verify_integrity()`][crate::multi_index::File::verify_integrity()].
    ///
    /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization.
    #[derive(Debug, Copy, Clone)]
    pub enum ProgressId {
        /// The amount of bytes read to verify the multi-index checksum.
        ChecksumBytes,
        /// The amount of objects whose offset has been checked.
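        // (For integrators: the `From<ProgressId>` impl that follows maps these variants to the
        // 4-byte ids `MVCK` and `MVOF`, which is what a parent application would match on to pick
        // out the checksum- and offset-verification progress respectively.)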
ObjectOffsets, } impl From for gix_features::progress::Id { fn from(v: ProgressId) -> Self { match v { ProgressId::ChecksumBytes => *b"MVCK", ProgressId::ObjectOffsets => *b"MVOF", } } } } /// pub mod checksum { /// Returned by [`multi_index::File::verify_checksum()`][crate::multi_index::File::verify_checksum()]. pub type Error = crate::verify::checksum::Error; } impl File { /// Validate that our [`checksum()`][File::checksum()] matches the actual contents /// of this index file, and return it if it does. pub fn verify_checksum( &self, progress: &mut dyn Progress, should_interrupt: &AtomicBool, ) -> Result { crate::verify::checksum_on_disk_or_mmap( self.path(), &self.data, self.checksum(), self.object_hash, progress, should_interrupt, ) } /// Similar to [`verify_integrity()`][File::verify_integrity()] but without any deep inspection of objects. /// /// Instead we only validate the contents of the multi-index itself. pub fn verify_integrity_fast( &self, progress: &mut dyn DynNestedProgress, should_interrupt: &AtomicBool, ) -> Result { self.verify_integrity_inner( progress, should_interrupt, false, index::verify::integrity::Options::default(), ) .map_err(|err| match err { index::traverse::Error::Processor(err) => err, _ => unreachable!("BUG: no other error type is possible"), }) .map(|o| o.actual_index_checksum) } /// Similar to [`crate::Bundle::verify_integrity()`] but checks all contained indices and their packs. /// /// Note that it's considered a failure if an index doesn't have a corresponding pack. pub fn verify_integrity( &self, progress: &mut dyn DynNestedProgress, should_interrupt: &AtomicBool, options: index::verify::integrity::Options, ) -> Result> where C: crate::cache::DecodeEntry, F: Fn() -> C + Send + Clone, { self.verify_integrity_inner(progress, should_interrupt, true, options) } fn verify_integrity_inner( &self, progress: &mut dyn DynNestedProgress, should_interrupt: &AtomicBool, deep_check: bool, options: index::verify::integrity::Options, ) -> Result> where C: crate::cache::DecodeEntry, F: Fn() -> C + Send + Clone, { let parent = self.path.parent().expect("must be in a directory"); let actual_index_checksum = self .verify_checksum( &mut progress.add_child_with_id( format!("{}: checksum", self.path.display()), integrity::ProgressId::ChecksumBytes.into(), ), should_interrupt, ) .map_err(integrity::Error::from) .map_err(index::traverse::Error::Processor)?; if let Some(first_invalid) = crate::verify::fan(&self.fan) { return Err(index::traverse::Error::Processor(integrity::Error::Fan { index: first_invalid, })); } if self.num_objects == 0 { return Err(index::traverse::Error::Processor(integrity::Error::Empty)); } let mut pack_traverse_statistics = Vec::new(); let operation_start = Instant::now(); let mut total_objects_checked = 0; let mut pack_ids_and_offsets = exact_vec(self.num_objects as usize); { let order_start = Instant::now(); let mut progress = progress.add_child_with_id("checking oid order".into(), gix_features::progress::UNKNOWN); progress.init( Some(self.num_objects as usize), gix_features::progress::count("objects"), ); for entry_index in 0..(self.num_objects - 1) { let lhs = self.oid_at_index(entry_index); let rhs = self.oid_at_index(entry_index + 1); if rhs.cmp(lhs) != Ordering::Greater { return Err(index::traverse::Error::Processor(integrity::Error::OutOfOrder { index: entry_index, })); } let (pack_id, _) = self.pack_id_and_pack_offset_at_index(entry_index); pack_ids_and_offsets.push((pack_id, entry_index)); progress.inc(); } { let entry_index = self.num_objects 
- 1; let (pack_id, _) = self.pack_id_and_pack_offset_at_index(entry_index); pack_ids_and_offsets.push((pack_id, entry_index)); } // sort by pack-id to allow handling all indices matching a pack while its open. pack_ids_and_offsets.sort_by(|l, r| l.0.cmp(&r.0)); progress.show_throughput(order_start); }; progress.init( Some(self.num_indices as usize), gix_features::progress::count("indices"), ); let mut pack_ids_slice = pack_ids_and_offsets.as_slice(); for (pack_id, index_file_name) in self.index_names.iter().enumerate() { progress.set_name(index_file_name.display().to_string()); progress.inc(); let mut bundle = None; let index; let index_path = parent.join(index_file_name); let index = if deep_check { bundle = crate::Bundle::at(index_path, self.object_hash) .map_err(integrity::Error::from) .map_err(index::traverse::Error::Processor)? .into(); bundle.as_ref().map(|b| &b.index).expect("just set") } else { index = Some( index::File::at(index_path, self.object_hash) .map_err(|err| integrity::Error::BundleInit(crate::bundle::init::Error::Index(err))) .map_err(index::traverse::Error::Processor)?, ); index.as_ref().expect("just set") }; let slice_end = pack_ids_slice.partition_point(|e| e.0 == pack_id as crate::data::Id); let multi_index_entries_to_check = &pack_ids_slice[..slice_end]; { let offset_start = Instant::now(); let mut offsets_progress = progress.add_child_with_id( "verify object offsets".into(), integrity::ProgressId::ObjectOffsets.into(), ); offsets_progress.init( Some(pack_ids_and_offsets.len()), gix_features::progress::count("objects"), ); pack_ids_slice = &pack_ids_slice[slice_end..]; for entry_id in multi_index_entries_to_check.iter().map(|e| e.1) { let oid = self.oid_at_index(entry_id); let (_, expected_pack_offset) = self.pack_id_and_pack_offset_at_index(entry_id); let entry_in_bundle_index = index.lookup(oid).ok_or_else(|| { index::traverse::Error::Processor(integrity::Error::OidNotFound { id: oid.to_owned() }) })?; let actual_pack_offset = index.pack_offset_at_index(entry_in_bundle_index); if actual_pack_offset != expected_pack_offset { return Err(index::traverse::Error::Processor( integrity::Error::PackOffsetMismatch { id: oid.to_owned(), expected_pack_offset, actual_pack_offset, }, )); } offsets_progress.inc(); } if should_interrupt.load(std::sync::atomic::Ordering::Relaxed) { return Err(index::traverse::Error::Processor(integrity::Error::Interrupted)); } offsets_progress.show_throughput(offset_start); } total_objects_checked += multi_index_entries_to_check.len(); if let Some(bundle) = bundle { progress.set_name(format!("Validating {}", index_file_name.display())); let crate::bundle::verify::integrity::Outcome { actual_index_checksum: _, pack_traverse_outcome, } = bundle .verify_integrity(progress, should_interrupt, options.clone()) .map_err(|err| { use index::traverse::Error::*; match err { Processor(err) => Processor(integrity::Error::IndexIntegrity(err)), VerifyChecksum(err) => VerifyChecksum(err), Tree(err) => Tree(err), TreeTraversal(err) => TreeTraversal(err), PackDecode { id, offset, source } => PackDecode { id, offset, source }, PackMismatch { expected, actual } => PackMismatch { expected, actual }, EntryType(err) => EntryType(err), PackObjectMismatch { expected, actual, offset, kind, } => PackObjectMismatch { expected, actual, offset, kind, }, Crc32Mismatch { expected, actual, offset, kind, } => Crc32Mismatch { expected, actual, offset, kind, }, Interrupted => Interrupted, } })?; pack_traverse_statistics.push(pack_traverse_outcome); } } assert_eq!( self.num_objects 
as usize, total_objects_checked, "BUG: our slicing should allow to visit all objects" ); progress.set_name("Validating multi-pack".into()); progress.show_throughput(operation_start); Ok(integrity::Outcome { actual_index_checksum, pack_traverse_statistics, }) } } gix-pack-0.56.0/src/multi_index/write.rs000064400000000000000000000222711046102023000162330ustar 00000000000000use std::{ path::PathBuf, sync::atomic::{AtomicBool, Ordering}, time::{Instant, SystemTime}, }; use gix_features::progress::{Count, DynNestedProgress, Progress}; use crate::multi_index; mod error { /// The error returned by [`multi_index::File::write_from_index_paths()`][super::multi_index::File::write_from_index_paths()].. #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error(transparent)] Io(#[from] std::io::Error), #[error("Interrupted")] Interrupted, #[error(transparent)] OpenIndex(#[from] crate::index::init::Error), } } pub use error::Error; /// An entry suitable for sorting and writing pub(crate) struct Entry { pub(crate) id: gix_hash::ObjectId, pub(crate) pack_index: u32, pub(crate) pack_offset: crate::data::Offset, /// Used for sorting in case of duplicates index_mtime: SystemTime, } /// Options for use in [`multi_index::File::write_from_index_paths()`]. pub struct Options { /// The kind of hash to use for objects and to expect in the input files. pub object_hash: gix_hash::Kind, } /// The result of [`multi_index::File::write_from_index_paths()`]. pub struct Outcome { /// The calculated multi-index checksum of the file at `multi_index_path`. pub multi_index_checksum: gix_hash::ObjectId, } /// The progress ids used in [`write_from_index_paths()`][multi_index::File::write_from_index_paths()]. /// /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. #[derive(Debug, Copy, Clone)] pub enum ProgressId { /// Counts each path in the input set whose entries we enumerate and write into the multi-index FromPathsCollectingEntries, /// The amount of bytes written as part of the multi-index. BytesWritten, } impl From for gix_features::progress::Id { fn from(v: ProgressId) -> Self { match v { ProgressId::FromPathsCollectingEntries => *b"MPCE", ProgressId::BytesWritten => *b"MPBW", } } } impl multi_index::File { pub(crate) const SIGNATURE: &'static [u8] = b"MIDX"; pub(crate) const HEADER_LEN: usize = 4 /*signature*/ + 1 /*version*/ + 1 /*object id version*/ + 1 /*num chunks */ + 1 /*num base files */ + 4 /*num pack files*/; /// Create a new multi-index file for writing to `out` from the pack index files at `index_paths`. /// /// Progress is sent to `progress` and interruptions checked via `should_interrupt`. 
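    // For orientation (an illustration derived from `HEADER_LEN` above and `write_header()` further
    // below, not normative format documentation): the 12-byte header precedes the chunk table and
    // consists of
    //
    //     b"MIDX" (4) | version (1) | object-hash kind (1) | num chunks (1) | num base files (1) | num indices, big-endian (4)
    //
    // after which the gix_chunk table of contents and the PNAM, OIDF, OIDL, OOFF and optional LOFF
    // chunks are written, each planned with the sizes computed in `multi_index::chunk`.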
pub fn write_from_index_paths( mut index_paths: Vec, out: &mut dyn std::io::Write, progress: &mut dyn DynNestedProgress, should_interrupt: &AtomicBool, Options { object_hash }: Options, ) -> Result { let out = gix_features::hash::Write::new(out, object_hash); let (index_paths_sorted, index_filenames_sorted) = { index_paths.sort(); let file_names = index_paths .iter() .map(|p| PathBuf::from(p.file_name().expect("file name present"))) .collect::>(); (index_paths, file_names) }; let entries = { let mut entries = Vec::new(); let start = Instant::now(); let mut progress = progress.add_child_with_id( "Collecting entries".into(), ProgressId::FromPathsCollectingEntries.into(), ); progress.init(Some(index_paths_sorted.len()), gix_features::progress::count("indices")); // This could be parallelized… but it's probably not worth it unless you have 500mio objects. for (index_id, index) in index_paths_sorted.iter().enumerate() { let mtime = index .metadata() .and_then(|m| m.modified()) .unwrap_or(SystemTime::UNIX_EPOCH); let index = crate::index::File::at(index, object_hash)?; entries.reserve(index.num_objects() as usize); entries.extend(index.iter().map(|e| Entry { id: e.oid, pack_index: index_id as u32, pack_offset: e.pack_offset, index_mtime: mtime, })); progress.inc(); if should_interrupt.load(Ordering::Relaxed) { return Err(Error::Interrupted); } } progress.show_throughput(start); let start = Instant::now(); progress.set_name("Deduplicate".into()); progress.init(Some(entries.len()), gix_features::progress::count("entries")); entries.sort_by(|l, r| { l.id.cmp(&r.id) .then_with(|| l.index_mtime.cmp(&r.index_mtime).reverse()) .then_with(|| l.pack_index.cmp(&r.pack_index)) }); entries.dedup_by_key(|e| e.id); progress.inc_by(entries.len()); progress.show_throughput(start); if should_interrupt.load(Ordering::Relaxed) { return Err(Error::Interrupted); } entries }; let mut cf = gix_chunk::file::Index::for_writing(); cf.plan_chunk( multi_index::chunk::index_names::ID, multi_index::chunk::index_names::storage_size(&index_filenames_sorted), ); cf.plan_chunk(multi_index::chunk::fanout::ID, multi_index::chunk::fanout::SIZE as u64); cf.plan_chunk( multi_index::chunk::lookup::ID, multi_index::chunk::lookup::storage_size(entries.len(), object_hash), ); cf.plan_chunk( multi_index::chunk::offsets::ID, multi_index::chunk::offsets::storage_size(entries.len()), ); let num_large_offsets = multi_index::chunk::large_offsets::num_large_offsets(&entries); if let Some(num_large_offsets) = num_large_offsets { cf.plan_chunk( multi_index::chunk::large_offsets::ID, multi_index::chunk::large_offsets::storage_size(num_large_offsets), ); } let mut write_progress = progress.add_child_with_id("Writing multi-index".into(), ProgressId::BytesWritten.into()); let write_start = Instant::now(); write_progress.init( Some(cf.planned_storage_size() as usize + Self::HEADER_LEN), gix_features::progress::bytes(), ); let mut out = gix_features::progress::Write { inner: out, progress: write_progress, }; let bytes_written = Self::write_header( &mut out, cf.num_chunks().try_into().expect("BUG: wrote more than 256 chunks"), index_paths_sorted.len() as u32, object_hash, )?; { progress.set_name("Writing chunks".into()); progress.init(Some(cf.num_chunks()), gix_features::progress::count("chunks")); let mut chunk_write = cf.into_write(&mut out, bytes_written)?; while let Some(chunk_to_write) = chunk_write.next_chunk() { match chunk_to_write { multi_index::chunk::index_names::ID => { multi_index::chunk::index_names::write(&index_filenames_sorted, &mut 
chunk_write)?; } multi_index::chunk::fanout::ID => multi_index::chunk::fanout::write(&entries, &mut chunk_write)?, multi_index::chunk::lookup::ID => multi_index::chunk::lookup::write(&entries, &mut chunk_write)?, multi_index::chunk::offsets::ID => { multi_index::chunk::offsets::write(&entries, num_large_offsets.is_some(), &mut chunk_write)?; } multi_index::chunk::large_offsets::ID => multi_index::chunk::large_offsets::write( &entries, num_large_offsets.expect("available if planned"), &mut chunk_write, )?, unknown => unreachable!("BUG: forgot to implement chunk {:?}", std::str::from_utf8(&unknown)), } progress.inc(); if should_interrupt.load(Ordering::Relaxed) { return Err(Error::Interrupted); } } } // write trailing checksum let multi_index_checksum: gix_hash::ObjectId = out.inner.hash.digest().into(); out.inner.inner.write_all(multi_index_checksum.as_slice())?; out.progress.show_throughput(write_start); Ok(Outcome { multi_index_checksum }) } fn write_header( out: &mut dyn std::io::Write, num_chunks: u8, num_indices: u32, object_hash: gix_hash::Kind, ) -> std::io::Result { out.write_all(Self::SIGNATURE)?; out.write_all(&[crate::multi_index::Version::V1 as u8])?; out.write_all(&[object_hash as u8])?; out.write_all(&[num_chunks])?; out.write_all(&[0])?; /* unused number of base files */ out.write_all(&num_indices.to_be_bytes())?; Ok(Self::HEADER_LEN) } } gix-pack-0.56.0/src/verify.rs000064400000000000000000000044171046102023000140660ustar 00000000000000use std::{path::Path, sync::atomic::AtomicBool}; use gix_features::progress::Progress; /// pub mod checksum { /// Returned by various methods to verify the checksum of a memory mapped file that might also exist on disk. #[derive(thiserror::Error, Debug)] #[allow(missing_docs)] pub enum Error { #[error("Interrupted by user")] Interrupted, #[error("index checksum mismatch: expected {expected}, got {actual}")] Mismatch { expected: gix_hash::ObjectId, actual: gix_hash::ObjectId, }, } } /// Returns the `index` at which the following `index + 1` value is not an increment over the value at `index`. pub fn fan(data: &[u32]) -> Option { data.windows(2) .enumerate() .find_map(|(win_index, v)| (v[0] > v[1]).then_some(win_index)) } /// Calculate the hash of the given kind by trying to read the file from disk at `data_path` or falling back on the mapped content in `data`. /// `Ok(desired_hash)` or `Err(Some(actual_hash))` is returned if the hash matches or mismatches. /// If the `Err(None)` is returned, the operation was interrupted. pub fn checksum_on_disk_or_mmap( data_path: &Path, data: &[u8], expected: gix_hash::ObjectId, object_hash: gix_hash::Kind, progress: &mut dyn Progress, should_interrupt: &AtomicBool, ) -> Result { let data_len_without_trailer = data.len() - object_hash.len_in_bytes(); let actual = match gix_features::hash::bytes_of_file( data_path, data_len_without_trailer as u64, object_hash, progress, should_interrupt, ) { Ok(id) => id, Err(err) if err.kind() == std::io::ErrorKind::Interrupted => return Err(checksum::Error::Interrupted), Err(_io_err) => { let start = std::time::Instant::now(); let mut hasher = gix_features::hash::hasher(object_hash); hasher.update(&data[..data_len_without_trailer]); progress.inc_by(data_len_without_trailer); progress.show_throughput(start); gix_hash::ObjectId::from(hasher.digest()) } }; if actual == expected { Ok(actual) } else { Err(checksum::Error::Mismatch { actual, expected }) } }
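// A minimal illustrative check of `fan()` above (a sketch, not part of the upstream test suite):
// non-decreasing data passes, and the first window whose successor is smaller is reported.
#[cfg(test)]
mod fan_sketch {
    #[test]
    fn reports_first_decreasing_window() {
        assert_eq!(super::fan(&[0, 1, 1, 4]), None); // non-decreasing fan values are accepted
        assert_eq!(super::fan(&[0, 3, 2, 4]), Some(1)); // 3 > 2 in the window starting at index 1
    }
}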