gix-diff-0.49.0/.cargo_vcs_info.json0000644000000001460000000000100126240ustar { "git": { "sha1": "c1ba5719132227410abefeb54e3032b015233e94" }, "path_in_vcs": "gix-diff" }gix-diff-0.49.0/Cargo.toml0000644000000107400000000000100106230ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.65" name = "gix-diff" version = "0.49.0" authors = ["Sebastian Thiel "] build = false include = [ "src/**/*", "LICENSE-*", ] autolib = false autobins = false autoexamples = false autotests = false autobenches = false description = "Calculate differences between various git objects" readme = false license = "MIT OR Apache-2.0" repository = "https://github.com/GitoxideLabs/gitoxide" [package.metadata.docs.rs] all-features = true features = ["document-features"] [lib] name = "gix_diff" path = "src/lib.rs" doctest = false [dependencies.bstr] version = "1.5.0" default-features = false [dependencies.document-features] version = "0.2.0" optional = true [dependencies.getrandom] version = "0.2.8" features = ["js"] optional = true default-features = false [dependencies.gix-command] version = "^0.4.0" optional = true [dependencies.gix-filter] version = "^0.16.0" optional = true [dependencies.gix-fs] version = "^0.12.1" optional = true [dependencies.gix-hash] version = "^0.15.1" [dependencies.gix-object] version = "^0.46.1" [dependencies.gix-path] version = "^0.10.13" optional = true [dependencies.gix-tempfile] version = "^15.0.0" optional = true [dependencies.gix-trace] version = "^0.1.11" optional = true [dependencies.gix-traverse] version = "^0.43.1" optional = true [dependencies.gix-worktree] version = "^0.38.0" features = ["attributes"] optional = true default-features = false [dependencies.imara-diff] version = "0.1.7" optional = true [dependencies.serde] version = "1.0.114" features = ["derive"] optional = true default-features = false [dependencies.thiserror] version = "2.0.0" [features] blob = [ "dep:imara-diff", "dep:gix-filter", "dep:gix-worktree", "dep:gix-path", "dep:gix-fs", "dep:gix-command", "dep:gix-tempfile", "dep:gix-trace", "dep:gix-traverse", ] default = ["blob"] serde = [ "dep:serde", "gix-hash/serde", "gix-object/serde", ] wasm = ["dep:getrandom"] [lints.clippy] bool_to_int_with_if = "allow" borrow_as_ptr = "allow" cast_lossless = "allow" cast_possible_truncation = "allow" cast_possible_wrap = "allow" cast_precision_loss = "allow" cast_sign_loss = "allow" checked_conversions = "allow" copy_iterator = "allow" default_trait_access = "allow" doc_markdown = "allow" empty_docs = "allow" enum_glob_use = "allow" explicit_deref_methods = "allow" explicit_into_iter_loop = "allow" explicit_iter_loop = "allow" filter_map_next = "allow" fn_params_excessive_bools = "allow" from_iter_instead_of_collect = "allow" if_not_else = "allow" ignored_unit_patterns = "allow" implicit_clone = "allow" inconsistent_struct_constructor = "allow" inefficient_to_string = "allow" inline_always = "allow" items_after_statements = "allow" iter_not_returning_iterator = "allow" iter_without_into_iter = "allow" manual_assert = "allow" manual_is_variant_and = "allow" 
manual_let_else = "allow" manual_string_new = "allow" many_single_char_names = "allow" match_bool = "allow" match_same_arms = "allow" match_wild_err_arm = "allow" match_wildcard_for_single_variants = "allow" missing_errors_doc = "allow" missing_panics_doc = "allow" module_name_repetitions = "allow" must_use_candidate = "allow" mut_mut = "allow" naive_bytecount = "allow" needless_for_each = "allow" needless_pass_by_value = "allow" needless_raw_string_hashes = "allow" no_effect_underscore_binding = "allow" option_option = "allow" range_plus_one = "allow" redundant_else = "allow" return_self_not_must_use = "allow" should_panic_without_expect = "allow" similar_names = "allow" single_match_else = "allow" stable_sort_primitive = "allow" struct_excessive_bools = "allow" struct_field_names = "allow" too_long_first_doc_paragraph = "allow" too_many_lines = "allow" transmute_ptr_to_ptr = "allow" trivially_copy_pass_by_ref = "allow" unnecessary_join = "allow" unnecessary_wraps = "allow" unreadable_literal = "allow" unused_self = "allow" used_underscore_binding = "allow" wildcard_imports = "allow" [lints.clippy.pedantic] level = "warn" priority = -1 [lints.rust] gix-diff-0.49.0/Cargo.toml.orig000064400000000000000000000042451046102023000143070ustar 00000000000000lints.workspace = true [package] name = "gix-diff" version = "0.49.0" repository = "https://github.com/GitoxideLabs/gitoxide" license = "MIT OR Apache-2.0" description = "Calculate differences between various git objects" authors = ["Sebastian Thiel "] edition = "2021" include = ["src/**/*", "LICENSE-*"] rust-version = "1.65" autotests = false [features] default = ["blob"] ## Enable diffing of blobs using imara-diff, which also allows for a generic rewrite tracking implementation. blob = ["dep:imara-diff", "dep:gix-filter", "dep:gix-worktree", "dep:gix-path", "dep:gix-fs", "dep:gix-command", "dep:gix-tempfile", "dep:gix-trace", "dep:gix-traverse"] ## Data structures implement `serde::Serialize` and `serde::Deserialize`. serde = ["dep:serde", "gix-hash/serde", "gix-object/serde"] ## Make it possible to compile to the `wasm32-unknown-unknown` target. 
wasm = ["dep:getrandom"] [lib] doctest = false [dependencies] gix-hash = { version = "^0.15.1", path = "../gix-hash" } gix-object = { version = "^0.46.1", path = "../gix-object" } gix-filter = { version = "^0.16.0", path = "../gix-filter", optional = true } gix-worktree = { version = "^0.38.0", path = "../gix-worktree", default-features = false, features = ["attributes"], optional = true } gix-command = { version = "^0.4.0", path = "../gix-command", optional = true } gix-path = { version = "^0.10.13", path = "../gix-path", optional = true } gix-fs = { version = "^0.12.1", path = "../gix-fs", optional = true } gix-tempfile = { version = "^15.0.0", path = "../gix-tempfile", optional = true } gix-trace = { version = "^0.1.11", path = "../gix-trace", optional = true } gix-traverse = { version = "^0.43.1", path = "../gix-traverse", optional = true } thiserror = "2.0.0" imara-diff = { version = "0.1.7", optional = true } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } getrandom = { version = "0.2.8", optional = true, default-features = false, features = ["js"] } bstr = { version = "1.5.0", default-features = false } document-features = { version = "0.2.0", optional = true } [package.metadata.docs.rs] all-features = true features = ["document-features"] gix-diff-0.49.0/LICENSE-APACHE000064400000000000000000000247461046102023000133540ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. gix-diff-0.49.0/LICENSE-MIT000064400000000000000000000017771046102023000130630ustar 00000000000000Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. gix-diff-0.49.0/src/blob/mod.rs000064400000000000000000000143551046102023000142550ustar 00000000000000//! For using text diffs, please have a look at the [`imara-diff` documentation](https://docs.rs/imara-diff), //! maintained by [Pascal Kuthe](https://github.com/pascalkuthe). use std::{collections::HashMap, path::PathBuf}; use bstr::BString; pub use imara_diff::*; /// pub mod pipeline; /// pub mod platform; /// Information about the diff performed to detect similarity. 
#[derive(Debug, Default, Clone, Copy, PartialEq, PartialOrd)] pub struct DiffLineStats { /// The amount of lines to remove from the source to get to the destination. pub removals: u32, /// The amount of lines to add to the source to get to the destination. pub insertions: u32, /// The amount of lines of the previous state, in the source. pub before: u32, /// The amount of lines of the new state, in the destination. pub after: u32, /// A range from 0 to 1.0, where 1.0 is a perfect match and 0.5 is a similarity of 50%. /// Similarity is the ratio between all lines in the previous blob and the current blob, /// calculated as `(old_lines_count - new_lines_count) as f32 / old_lines_count.max(new_lines_count) as f32`. pub similarity: f32, } /// A way to classify a resource suitable for diffing. #[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq, Hash)] pub enum ResourceKind { /// The source of a rewrite, rename or copy operation, or generally the old version of a resource. OldOrSource, /// The destination of a rewrite, rename or copy operation, or generally the new version of a resource. NewOrDestination, } /// A set of values to define how to diff something that is associated with it using `git-attributes`, relevant for regular files. /// /// Some values are related to diffing, some are related to conversions. #[derive(Default, Debug, Clone, PartialEq, Eq)] pub struct Driver { /// The name of the driver, as referred to by `[diff "name"]` in the git configuration. pub name: BString, /// The command to execute to perform the diff entirely like ` old-file old-hex old-mode new-file new-hex new-mode`. /// /// Please note that we don't make this call ourselves, but use it to determine that we should not run the our standard /// built-in algorithm but bail instead as the output of such a program isn't standardized. pub command: Option, /// The per-driver algorithm to use. pub algorithm: Option, /// The external filter program to call like ` /path/to/blob` which outputs a textual version of the provided /// binary file. /// Note that it's invoked with a shell if arguments are given. /// Further, if present, it will always be executed, whether `is_binary` is set or not. pub binary_to_text_command: Option, /// `Some(true)` if this driver deals with binary files, which means that a `binary_to_text_command` should be used to convert binary /// into a textual representation. /// Without such a command, anything that is considered binary is not diffed, but only the size of its data is made available. /// If `Some(false)`, it won't be considered binary, and the its data will not be sampled for the null-byte either. /// Leaving it to `None` means binary detection is automatic, and is based on the presence of the `0` byte in the first 8kB of the buffer. pub is_binary: Option, } /// A conversion pipeline to take an object or path from what's stored in `git` to what can be diffed, while /// following the guidance of git-attributes at the respective path to learn if diffing should happen or if /// the content is considered binary. /// /// There are two different conversion flows, where the target of the flow is a buffer with diffable content: // TODO: update this with information about possible directions. /// /// * `worktree on disk` -> `text conversion` /// * `object` -> `worktree-filters` -> `text conversion` #[derive(Clone)] pub struct Pipeline { /// A way to read data directly from the worktree. 
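    // A hedged construction sketch for this pipeline (identifiers such as `worktree_dir` and
    // `worktree_filter` are illustrative, not part of this file):
    //
    //   let pipeline = Pipeline::new(
    //       pipeline::WorktreeRoots { old_root: None, new_root: Some(worktree_dir) },
    //       worktree_filter,               // a previously configured gix_filter::Pipeline
    //       Vec::new(),                    // no custom `[diff "<name>"]` drivers
    //       pipeline::Options::default(),
    //   );
    //
    // With `new_root` set, the new side of a diff is read straight from disk; sides without
    // a root are read from the object database and converted via `worktree_filter` first.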
pub roots: pipeline::WorktreeRoots, /// A pipeline to convert objects from what's stored in `git` to its worktree version. pub worktree_filter: gix_filter::Pipeline, /// Options affecting the way we read files. pub options: pipeline::Options, /// Drivers to help customize the conversion behaviour depending on the location of items. drivers: Vec, /// Pre-configured attributes to obtain additional diff-related information. attrs: gix_filter::attributes::search::Outcome, /// A buffer to manipulate paths path: PathBuf, } /// A utility for performing a diff of two blobs, including flexible conversions, conversion-caching /// acquisition of diff information. /// Note that this instance will not call external filters as their output can't be known programmatically, /// but it allows to prepare their input if the caller wishes to perform this task. /// /// Optimized for NxM lookups with built-in caching. #[derive(Clone)] pub struct Platform { /// The old version of a diff-able blob, if set. old: Option, /// The new version of a diff-able blob, if set. new: Option, /// Options to alter how diffs should be performed. pub options: platform::Options, /// A way to convert objects into a diff-able format. pub filter: Pipeline, /// A way to access .gitattributes pub attr_stack: gix_worktree::Stack, /// The way we convert resources into diffable states. filter_mode: pipeline::Mode, /// A continuously growing cache keeping ready-for-diff blobs by their path in the worktree, /// as that is what affects their final diff-able state. /// /// That way, expensive rewrite-checks with NxM matrix checks would be as fast as possible, /// avoiding duplicate work. diff_cache: HashMap, /// A list of previously used buffers, ready for re-use. free_list: Vec>, } mod impls { use crate::blob::ResourceKind; impl std::fmt::Display for ResourceKind { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str(match self { ResourceKind::OldOrSource => "old", ResourceKind::NewOrDestination => "new", }) } } } gix-diff-0.49.0/src/blob/pipeline.rs000064400000000000000000000643261046102023000153060ustar 00000000000000use std::{ io::{Read, Write}, path::{Path, PathBuf}, process::{Command, Stdio}, }; use bstr::{BStr, ByteSlice}; use gix_filter::{ driver::apply::{Delay, MaybeDelayed}, pipeline::convert::{ToGitOutcome, ToWorktreeOutcome}, }; use gix_object::tree::EntryKind; use crate::blob::{Driver, Pipeline, ResourceKind}; /// A way to access roots for different kinds of resources that are possibly located and accessible in a worktree. #[derive(Clone, Debug, Default)] pub struct WorktreeRoots { /// A place where the source of a rewrite, rename or copy, or generally the previous version of resources, are located. pub old_root: Option, /// A place where the destination of a rewrite, rename or copy, or generally the new version of resources, are located. pub new_root: Option, } /// Access impl WorktreeRoots { /// Return the root path for the given `kind` pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> { match kind { ResourceKind::OldOrSource => self.old_root.as_deref(), ResourceKind::NewOrDestination => self.new_root.as_deref(), } } /// Return `true` if all worktree roots are unset. pub fn is_unset(&self) -> bool { self.new_root.is_none() && self.old_root.is_none() } } /// Data as part of an [Outcome]. 
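// A sketch of how a caller might interpret this, assuming `outcome` and `buf` come from a
// prior `convert_to_diffable()` call (names are illustrative):
//
//   match outcome.data {
//       Some(Data::Buffer) => { /* `buf` now holds text ready for diffing */ }
//       Some(Data::Binary { size }) => { /* considered binary; only its size is known */ }
//       None => { /* the resource does not exist at this revision */ }
//   }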
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] pub enum Data { /// The data to use for diffing was written into the buffer that was passed during the call to [`Pipeline::convert_to_diffable()`]. Buffer, /// The size that the binary blob had at the given revision, without having applied filters, as it's either /// considered binary or above the big-file threshold. /// /// In this state, the binary file cannot be diffed. Binary { /// The size of the object prior to performing any filtering or as it was found on disk. /// /// Note that technically, the size isn't always representative of the same 'state' of the /// content, as once it can be the size of the blob in git, and once it's the size of file /// in the worktree. size: u64, }, } /// The outcome returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()). #[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] pub struct Outcome { /// If available, an index into the `drivers` field to access more diff-related information of the driver for items /// at the given path, as previously determined by git-attributes. /// /// Note that drivers are queried even if there is no object available. pub driver_index: Option, /// The data itself, suitable for diffing, and if the object or worktree item is present at all. pub data: Option, } /// Options for use in a [`Pipeline`]. #[derive(Default, Clone, Copy, PartialEq, Eq, Debug, Hash, Ord, PartialOrd)] pub struct Options { /// The amount of bytes that an object has to reach before being treated as binary. /// These objects will not be queried, nor will their data be processed in any way. /// If `0`, no file is ever considered binary due to their size. /// /// Note that for files stored in `git`, what counts is their stored, decompressed size, /// thus `git-lfs` files would typically not be considered binary unless one explicitly sets /// them pub large_file_threshold_bytes: u64, /// Capabilities of the file system which affect how we read worktree files. pub fs: gix_fs::Capabilities, } /// The specific way to convert a resource. #[derive(Default, Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum Mode { /// Always prepare the version of the resource as it would be in the work-tree, and /// apply binary-to-text filters if present. /// /// This is typically free for resources in the worktree, and will apply filters to resources in the /// object database. #[default] ToWorktreeAndBinaryToText, /// Prepare the version of the resource as it would be in the work-tree if /// binary-to-text filters are present (and apply them), or use the version in `git` otherwise. ToGitUnlessBinaryToTextIsPresent, /// Always prepare resources as they are stored in `git`. /// /// This is usually fastest, even though resources in the worktree needed to be converted files. ToGit, } impl Mode { fn to_worktree(self) -> bool { matches!( self, Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToWorktreeAndBinaryToText ) } fn to_git(self) -> bool { matches!(self, Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToGit) } } /// pub mod convert_to_diffable { use std::collections::TryReserveError; use bstr::BString; use gix_object::tree::EntryKind; /// The error returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()). 
#[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error("Entry at '{rela_path}' must be regular file or symlink, but was {actual:?}")] InvalidEntryKind { rela_path: BString, actual: EntryKind }, #[error("Entry at '{rela_path}' could not be read as symbolic link")] ReadLink { rela_path: BString, source: std::io::Error }, #[error("Entry at '{rela_path}' could not be opened for reading or read from")] OpenOrRead { rela_path: BString, source: std::io::Error }, #[error("Entry at '{rela_path}' could not be copied from a filter process to a memory buffer")] StreamCopy { rela_path: BString, source: std::io::Error }, #[error("Failed to run '{cmd}' for binary-to-text conversion of entry at {rela_path}")] RunTextConvFilter { rela_path: BString, cmd: String, source: std::io::Error, }, #[error("Tempfile for binary-to-text conversion for entry at {rela_path} could not be created")] CreateTempfile { rela_path: BString, source: std::io::Error }, #[error("Binary-to-text conversion '{cmd}' for entry at {rela_path} failed with: {stderr}")] TextConvFilterFailed { rela_path: BString, cmd: String, stderr: BString, }, #[error(transparent)] FindObject(#[from] gix_object::find::existing_object::Error), #[error(transparent)] ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error), #[error(transparent)] ConvertToGit(#[from] gix_filter::pipeline::convert::to_git::Error), #[error("Memory allocation failed")] OutOfMemory(#[from] TryReserveError), } } /// Lifecycle impl Pipeline { /// Create a new instance of a pipeline which produces blobs suitable for diffing. `roots` allow to read worktree files directly, otherwise /// `worktree_filter` is used to transform object database data directly. `drivers` further configure individual paths. /// `options` are used to further configure the way we act.. pub fn new( roots: WorktreeRoots, worktree_filter: gix_filter::Pipeline, mut drivers: Vec, options: Options, ) -> Self { drivers.sort_by(|a, b| a.name.cmp(&b.name)); Pipeline { roots, worktree_filter, drivers, options, attrs: { let mut out = gix_filter::attributes::search::Outcome::default(); out.initialize_with_selection(&Default::default(), Some("diff")); out }, path: Default::default(), } } } /// Access impl Pipeline { /// Return all drivers that this instance was initialized with. /// /// They are sorted by [`name`](Driver::name) to support binary searches. pub fn drivers(&self) -> &[super::Driver] { &self.drivers } } /// Conversion impl Pipeline { /// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`. /// The resulting diff-able data is written into `out`, assuming it's not too large. The returned [`Outcome`] /// contains information on how to use `out`, or if it's filled at all. /// /// `attributes` must be returning the attributes at `rela_path`, and `objects` must be usable if `kind` is /// a resource in the object database, i.e. has no worktree root available. /// /// If `id` [is null](gix_hash::ObjectId::is_null()) or the file in question doesn't exist in the worktree in case /// [a root](WorktreeRoots) is present, then `out` will be left cleared and [Outcome::data] will be `None`. /// /// Note that `mode` is trusted, and we will not re-validate that the entry in the worktree actually is of that mode. /// /// Use `convert` to control what kind of the resource will be produced. 
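    // A hedged call sketch (identifiers like `pipeline`, `blob_id`, `objects` and `buf` are
    // illustrative; the attributes closure would normally query an attribute stack):
    //
    //   let outcome = pipeline.convert_to_diffable(
    //       &blob_id,
    //       EntryKind::Blob,
    //       "src/lib.rs".into(),
    //       ResourceKind::NewOrDestination,
    //       &mut |rela_path, attrs| { /* fill `attrs` with the attributes at `rela_path` */ },
    //       &objects,
    //       Mode::ToGitUnlessBinaryToTextIsPresent,
    //       &mut buf,
    //   )?;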
/// /// ### About Tempfiles /// /// When querying from the object database and a binary and a [binary-to-text](Driver::binary_to_text_command) is set, /// a temporary file will be created to serve as input for the converter program, containing the worktree-data that /// exactly as it would be present in the worktree if checked out. /// /// As these files are ultimately named tempfiles, they will be leaked unless the [gix_tempfile] is configured with /// a signal handler. If they leak, they would remain in the system's `$TMP` directory. #[allow(clippy::too_many_arguments)] pub fn convert_to_diffable( &mut self, id: &gix_hash::oid, mode: EntryKind, rela_path: &BStr, kind: ResourceKind, attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome), objects: &dyn gix_object::FindObjectOrHeader, convert: Mode, out: &mut Vec, ) -> Result { let is_symlink = match mode { EntryKind::Link if self.options.fs.symlink => true, EntryKind::Blob | EntryKind::BlobExecutable => false, _ => { return Err(convert_to_diffable::Error::InvalidEntryKind { rela_path: rela_path.to_owned(), actual: mode, }) } }; out.clear(); attributes(rela_path, &mut self.attrs); let attr = self.attrs.iter_selected().next().expect("pre-initialized with 'diff'"); let driver_index = attr .assignment .state .as_bstr() .and_then(|name| self.drivers.binary_search_by(|d| d.name.as_bstr().cmp(name)).ok()); let driver = driver_index.map(|idx| &self.drivers[idx]); let mut is_binary = if let Some(driver) = driver { driver .is_binary .map(|is_binary| is_binary && driver.binary_to_text_command.is_none()) } else { attr.assignment.state.is_unset().then_some(true) }; match self.roots.by_kind(kind) { Some(root) => { self.path.clear(); self.path.push(root); self.path.push(gix_path::from_bstr(rela_path)); let data = if is_symlink { let target = none_if_missing(std::fs::read_link(&self.path)).map_err(|err| { convert_to_diffable::Error::ReadLink { rela_path: rela_path.to_owned(), source: err, } })?; target.map(|target| { out.extend_from_slice(gix_path::into_bstr(target).as_ref()); Data::Buffer }) } else { let need_size_only = is_binary == Some(true); let size_in_bytes = (need_size_only || (is_binary != Some(false) && self.options.large_file_threshold_bytes > 0)) .then(|| { none_if_missing(self.path.metadata().map(|md| md.len())).map_err(|err| { convert_to_diffable::Error::OpenOrRead { rela_path: rela_path.to_owned(), source: err, } }) }) .transpose()?; match size_in_bytes { Some(None) => None, // missing as identified by the size check Some(Some(size)) if size > self.options.large_file_threshold_bytes || need_size_only => { Some(Data::Binary { size }) } _ => { match driver .filter(|_| convert.to_worktree()) .and_then(|d| d.prepare_binary_to_text_cmd(&self.path)) { Some(cmd) => { // Avoid letting the driver program fail if it doesn't exist. if self.options.large_file_threshold_bytes == 0 && none_if_missing(std::fs::symlink_metadata(&self.path)) .map_err(|err| convert_to_diffable::Error::OpenOrRead { rela_path: rela_path.to_owned(), source: err, })? 
.is_none() { None } else { run_cmd(rela_path, cmd, out)?; Some(Data::Buffer) } } None => { let file = none_if_missing(std::fs::File::open(&self.path)).map_err(|err| { convert_to_diffable::Error::OpenOrRead { rela_path: rela_path.to_owned(), source: err, } })?; match file { Some(mut file) => { if convert.to_git() { let res = self.worktree_filter.convert_to_git( file, gix_path::from_bstr(rela_path).as_ref(), attributes, &mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())), )?; match res { ToGitOutcome::Unchanged(mut file) => { file.read_to_end(out).map_err(|err| { convert_to_diffable::Error::OpenOrRead { rela_path: rela_path.to_owned(), source: err, } })?; } ToGitOutcome::Process(mut stream) => { stream.read_to_end(out).map_err(|err| { convert_to_diffable::Error::OpenOrRead { rela_path: rela_path.to_owned(), source: err, } })?; } ToGitOutcome::Buffer(buf) => { out.clear(); out.try_reserve(buf.len())?; out.extend_from_slice(buf); } } } else { file.read_to_end(out).map_err(|err| { convert_to_diffable::Error::OpenOrRead { rela_path: rela_path.to_owned(), source: err, } })?; } Some(if is_binary.unwrap_or_else(|| is_binary_buf(out)) { let size = out.len() as u64; out.clear(); Data::Binary { size } } else { Data::Buffer }) } None => None, } } } } } }; Ok(Outcome { driver_index, data }) } None => { let data = if id.is_null() { None } else { let header = objects .try_header(id) .map_err(gix_object::find::existing_object::Error::Find)? .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?; if is_binary.is_none() && self.options.large_file_threshold_bytes > 0 && header.size > self.options.large_file_threshold_bytes { is_binary = Some(true); }; let data = if is_binary == Some(true) { Data::Binary { size: header.size } } else { objects .try_find(id, out) .map_err(gix_object::find::existing_object::Error::Find)? 
.ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?; if matches!(mode, EntryKind::Blob | EntryKind::BlobExecutable) && convert == Mode::ToWorktreeAndBinaryToText || (convert == Mode::ToGitUnlessBinaryToTextIsPresent && driver.map_or(false, |d| d.binary_to_text_command.is_some())) { let res = self.worktree_filter .convert_to_worktree(out, rela_path, attributes, Delay::Forbid)?; let cmd_and_file = driver .and_then(|d| { d.binary_to_text_command.is_some().then(|| { gix_tempfile::new( std::env::temp_dir(), gix_tempfile::ContainingDirectory::Exists, gix_tempfile::AutoRemove::Tempfile, ) .and_then(|mut tmp_file| { self.path.clear(); tmp_file.with_mut(|tmp| self.path.push(tmp.path()))?; Ok(tmp_file) }) .map(|tmp_file| { ( d.prepare_binary_to_text_cmd(&self.path) .expect("always get cmd if command is set"), tmp_file, ) }) }) }) .transpose() .map_err(|err| convert_to_diffable::Error::CreateTempfile { source: err, rela_path: rela_path.to_owned(), })?; match cmd_and_file { Some((cmd, mut tmp_file)) => { match res { ToWorktreeOutcome::Unchanged(buf) | ToWorktreeOutcome::Buffer(buf) => { tmp_file.write_all(buf) } ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => { std::io::copy(&mut stream, &mut tmp_file).map(|_| ()) } ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => { unreachable!("we prohibit this") } } .map_err(|err| { convert_to_diffable::Error::StreamCopy { source: err, rela_path: rela_path.to_owned(), } })?; out.clear(); run_cmd(rela_path, cmd, out)?; } None => { match res { ToWorktreeOutcome::Unchanged(_) => {} ToWorktreeOutcome::Buffer(src) => { out.clear(); out.try_reserve(src.len())?; out.extend_from_slice(src); } ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => { std::io::copy(&mut stream, out).map_err(|err| { convert_to_diffable::Error::StreamCopy { rela_path: rela_path.to_owned(), source: err, } })?; } ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => { unreachable!("we prohibit this") } }; } } } if driver.map_or(true, |d| d.binary_to_text_command.is_none()) && is_binary.unwrap_or_else(|| is_binary_buf(out)) { let size = out.len() as u64; out.clear(); Data::Binary { size } } else { Data::Buffer } }; Some(data) }; Ok(Outcome { driver_index, data }) } } } } fn is_binary_buf(buf: &[u8]) -> bool { let buf = &buf[..buf.len().min(8000)]; buf.contains(&0) } fn none_if_missing(res: std::io::Result) -> std::io::Result> { match res { Ok(data) => Ok(Some(data)), Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None), Err(err) => Err(err), } } fn run_cmd(rela_path: &BStr, mut cmd: Command, out: &mut Vec) -> Result<(), convert_to_diffable::Error> { gix_trace::debug!(cmd = ?cmd, "Running binary-to-text command"); let mut res = cmd .output() .map_err(|err| convert_to_diffable::Error::RunTextConvFilter { rela_path: rela_path.to_owned(), cmd: format!("{cmd:?}"), source: err, })?; if !res.status.success() { return Err(convert_to_diffable::Error::TextConvFilterFailed { rela_path: rela_path.to_owned(), cmd: format!("{cmd:?}"), stderr: res.stderr.into(), }); } out.append(&mut res.stdout); Ok(()) } impl Driver { /// Produce an invocable command pre-configured to produce the filtered output on stdout after reading `path`. 
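    // A hedged usage sketch (the driver configuration and path are illustrative; this mirrors
    // a `[diff "<name>"] textconv = <program>` entry in git configuration):
    //
    //   if let Some(mut cmd) = driver.prepare_binary_to_text_cmd(Path::new("doc.pdf")) {
    //       let output = cmd.output()?;   // stdout carries the textual rendition of the blob
    //   }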
pub fn prepare_binary_to_text_cmd(&self, path: &Path) -> Option { let command: &BStr = self.binary_to_text_command.as_ref()?.as_ref(); let cmd = gix_command::prepare(gix_path::from_bstr(command).into_owned()) // TODO: Add support for an actual Context, validate it *can* match Git .with_context(Default::default()) .with_shell() .stdin(Stdio::null()) .stdout(Stdio::piped()) .stderr(Stdio::piped()) .arg(path) .into(); Some(cmd) } } gix-diff-0.49.0/src/blob/platform.rs000064400000000000000000000656101046102023000153220ustar 00000000000000use bstr::{BStr, BString, ByteSlice}; use std::cmp::Ordering; use std::{io::Write, process::Stdio}; use super::Algorithm; use crate::blob::{pipeline, Pipeline, Platform, ResourceKind}; /// A key to uniquely identify either a location in the worktree, or in the object database. #[derive(Clone)] pub(crate) struct CacheKey { id: gix_hash::ObjectId, location: BString, /// If `true`, this is an `id` based key, otherwise it's location based. use_id: bool, /// Only relevant when `id` is not null, to further differentiate content and allow us to /// keep track of both links and blobs with the same content (rare, but possible). is_link: bool, } /// A stored value representing a diffable resource. #[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] pub(crate) struct CacheValue { /// The outcome of converting a resource into a diffable format using [Pipeline::convert_to_diffable()]. conversion: pipeline::Outcome, /// The kind of the resource we are looking at. Only possible values are `Blob`, `BlobExecutable` and `Link`. mode: gix_object::tree::EntryKind, /// A possibly empty buffer, depending on `conversion.data` which may indicate the data is considered binary. buffer: Vec, } impl std::hash::Hash for CacheKey { fn hash(&self, state: &mut H) { if self.use_id { self.id.hash(state); self.is_link.hash(state); } else { self.location.hash(state); } } } impl PartialEq for CacheKey { fn eq(&self, other: &Self) -> bool { match (self.use_id, other.use_id) { (false, false) => self.location.eq(&other.location), (true, true) => self.id.eq(&other.id) && self.is_link.eq(&other.is_link), _ => false, } } } impl Eq for CacheKey {} impl Default for CacheKey { fn default() -> Self { CacheKey { id: gix_hash::Kind::Sha1.null(), use_id: false, is_link: false, location: BString::default(), } } } impl CacheKey { fn set_location(&mut self, rela_path: &BStr) { self.location.clear(); self.location.extend_from_slice(rela_path); } } /// A resource ready to be diffed in one way or another. #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] pub struct Resource<'a> { /// If available, an index into the `drivers` field to access more diff-related information of the driver for items /// at the given path, as previously determined by git-attributes. /// /// Note that drivers are queried even if there is no object available. pub driver_index: Option, /// The data itself, suitable for diffing, and if the object or worktree item is present at all. pub data: resource::Data<'a>, /// The kind of the resource we are looking at. Only possible values are `Blob`, `BlobExecutable` and `Link`. pub mode: gix_object::tree::EntryKind, /// The location of the resource, relative to the working tree. pub rela_path: &'a BStr, /// The id of the content as it would be stored in `git`, or `null` if the content doesn't exist anymore at /// `rela_path` or if it was never computed. 
This can happen with content read from the worktree, which has to /// go through a filter to be converted back to what `git` would store. pub id: &'a gix_hash::oid, } /// pub mod resource { use crate::blob::{ pipeline, platform::{CacheKey, CacheValue, Resource}, }; impl<'a> Resource<'a> { pub(crate) fn new(key: &'a CacheKey, value: &'a CacheValue) -> Self { Resource { driver_index: value.conversion.driver_index, data: value.conversion.data.map_or(Data::Missing, |data| match data { pipeline::Data::Buffer => Data::Buffer(&value.buffer), pipeline::Data::Binary { size } => Data::Binary { size }, }), mode: value.mode, rela_path: key.location.as_ref(), id: &key.id, } } /// Produce an iterator over lines, separated by LF or CRLF, suitable to create tokens using /// [`imara_diff::intern::InternedInput`]. pub fn intern_source(&self) -> imara_diff::sources::ByteLines<'a, true> { crate::blob::sources::byte_lines_with_terminator(self.data.as_slice().unwrap_or_default()) } } /// The data of a diffable resource, as it could be determined and computed previously. #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] pub enum Data<'a> { /// The object is missing, either because it didn't exist in the working tree or because its `id` was null. Missing, /// The textual data as processed to be in a diffable state. Buffer(&'a [u8]), /// The size that the binary blob had at the given revision, without having applied filters, as it's either /// considered binary or above the big-file threshold. /// /// In this state, the binary file cannot be diffed. Binary { /// The size of the object prior to performing any filtering or as it was found on disk. /// /// Note that technically, the size isn't always representative of the same 'state' of the /// content, as once it can be the size of the blob in git, and once it's the size of file /// in the worktree. size: u64, }, } impl<'a> Data<'a> { /// Return ourselves as slice of bytes if this instance stores data. pub fn as_slice(&self) -> Option<&'a [u8]> { match self { Data::Buffer(d) => Some(d), Data::Binary { .. } | Data::Missing => None, } } } } /// pub mod set_resource { use bstr::BString; use crate::blob::{pipeline, ResourceKind}; /// The error returned by [Platform::set_resource](super::Platform::set_resource). #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error("Can only diff blobs and links, not {mode:?}")] InvalidMode { mode: gix_object::tree::EntryKind }, #[error("Failed to read {kind} worktree data from '{rela_path}'")] Io { rela_path: BString, kind: ResourceKind, source: std::io::Error, }, #[error("Failed to obtain attributes for {kind} resource at '{rela_path}'")] Attributes { rela_path: BString, kind: ResourceKind, source: std::io::Error, }, #[error(transparent)] ConvertToDiffable(#[from] pipeline::convert_to_diffable::Error), } } /// pub mod prepare_diff { use bstr::BStr; use crate::blob::platform::Resource; /// The kind of operation that should be performed based on the configuration of the resources involved in the diff. #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub enum Operation<'a> { /// The [internal diff algorithm](imara_diff::diff) should be called with the provided arguments. /// This only happens if none of the resources are binary, and if there is no external diff program configured via git-attributes /// *or* [Options::skip_internal_diff_if_external_is_configured](super::Options::skip_internal_diff_if_external_is_configured) /// is `false`. 
/// /// Use [`Outcome::interned_input()`] to easily obtain an interner for use with [`imara_diff::diff()`], or maintain one yourself /// for greater reuse. InternalDiff { /// The algorithm we determined should be used, which is one of (in order, first set one wins): /// /// * the driver's override /// * the platforms own configuration (typically from git-config) /// * the default algorithm algorithm: imara_diff::Algorithm, }, /// Run the external diff program according as configured in the `source`-resources driver. /// This only happens if [Options::skip_internal_diff_if_external_is_configured](super::Options::skip_internal_diff_if_external_is_configured) /// was `true`, preventing the usage of the internal diff implementation. ExternalCommand { /// The command as extracted from [Driver::command](super::super::Driver::command). /// Use it in [`Platform::prepare_diff_command`](super::Platform::prepare_diff_command()) to easily prepare a compatible invocation. command: &'a BStr, }, /// One of the involved resources, [`old`](Outcome::old) or [`new`](Outcome::new), were binary and thus no diff /// cannot be performed. SourceOrDestinationIsBinary, } /// The outcome of a [`prepare_diff`](super::Platform::prepare_diff()) operation. #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub struct Outcome<'a> { /// The kind of diff that was actually performed. This may include skipping the internal diff as well. pub operation: Operation<'a>, /// The old or source of the diff operation. pub old: Resource<'a>, /// The new or destination of the diff operation. pub new: Resource<'a>, } impl<'a> Outcome<'a> { /// Produce an instance of an interner which `git` would use to perform diffs. pub fn interned_input(&self) -> imara_diff::intern::InternedInput<&'a [u8]> { crate::blob::intern::InternedInput::new(self.old.intern_source(), self.new.intern_source()) } } /// The error returned by [Platform::prepare_diff()](super::Platform::prepare_diff()). #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error("Either the source or the destination of the diff operation were not set")] SourceOrDestinationUnset, #[error("Tried to diff resources that are both considered removed")] SourceAndDestinationRemoved, } } /// pub mod prepare_diff_command { use std::ops::{Deref, DerefMut}; use bstr::BString; /// The error returned by [Platform::prepare_diff_command()](super::Platform::prepare_diff_command()). #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error("Either the source or the destination of the diff operation were not set")] SourceOrDestinationUnset, #[error("Binary resources can't be diffed with an external command (as we don't have the data anymore)")] SourceOrDestinationBinary, #[error( "Tempfile to store content of '{rela_path}' for passing to external diff command could not be created" )] CreateTempfile { rela_path: BString, source: std::io::Error }, #[error("Could not write content of '{rela_path}' to tempfile for passing to external diff command")] WriteTempfile { rela_path: BString, source: std::io::Error }, } /// The outcome of a [`prepare_diff_command`](super::Platform::prepare_diff_command()) operation. /// /// This type acts like [`std::process::Command`], ready to run, with `stdin`, `stdout` and `stderr` set to *inherit* /// all handles as this is expected to be for visual inspection. pub struct Command { pub(crate) cmd: std::process::Command, /// Possibly a tempfile to be removed after the run, or `None` if there is no old version. 
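        // A hedged sketch of obtaining and running such a prepared command (the program name
        // is illustrative); handles are inherited, so output goes straight to the terminal:
        //
        //   let mut prepared = platform.prepare_diff_command("difft".into(), Default::default(), 0, 1)?;
        //   let status = prepared.status()?;   // derefs to std::process::Command; tempfiles are cleaned up on drop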
pub(crate) old: Option>, /// Possibly a tempfile to be removed after the run, or `None` if there is no new version. pub(crate) new: Option>, } impl Deref for Command { type Target = std::process::Command; fn deref(&self) -> &Self::Target { &self.cmd } } impl DerefMut for Command { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.cmd } } } /// Options for use in [Platform::new()]. #[derive(Default, Copy, Clone)] pub struct Options { /// The algorithm to use when diffing. /// If unset, it uses the [default algorithm](Algorithm::default()). pub algorithm: Option, /// If `true`, default `false`, then an external `diff` configured using gitattributes and drivers, /// will cause the built-in diff [to be skipped](prepare_diff::Operation::ExternalCommand). /// Otherwise, the internal diff is called despite the configured external diff, which is /// typically what callers expect by default. pub skip_internal_diff_if_external_is_configured: bool, } /// Lifecycle impl Platform { /// Create a new instance with `options`, and a way to `filter` data from the object database to data that is diff-able. /// `filter_mode` decides how to do that specifically. /// Use `attr_stack` to access attributes pertaining worktree filters and diff settings. pub fn new( options: Options, filter: Pipeline, filter_mode: pipeline::Mode, attr_stack: gix_worktree::Stack, ) -> Self { Platform { old: None, new: None, diff_cache: Default::default(), free_list: Vec::with_capacity(2), options, filter, filter_mode, attr_stack, } } } /// Conversions impl Platform { /// Store enough information about a resource to eventually diff it, where… /// /// * `id` is the hash of the resource. If it [is null](gix_hash::ObjectId::is_null()), it should either /// be a resource in the worktree, or it's considered a non-existing, deleted object. /// If an `id` is known, as the hash of the object as (would) be stored in `git`, then it should be provided /// for completeness. /// * `mode` is the kind of object (only blobs and links are allowed) /// * `rela_path` is the relative path as seen from the (work)tree root. /// * `kind` identifies the side of the diff this resource will be used for. /// A diff needs both `OldOrSource` *and* `NewOrDestination`. /// * `objects` provides access to the object database in case the resource can't be read from a worktree. /// /// Note that it's assumed that either `id + mode (` or `rela_path` can serve as unique identifier for the resource, /// depending on whether or not a [worktree root](pipeline::WorktreeRoots) is set for the resource of `kind`, /// with resources with worktree roots using the `rela_path` as unique identifier. /// /// ### Important /// /// If an error occurs, the previous resource of `kind` will be cleared, preventing further diffs /// unless another attempt succeeds. 
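    // A hedged call sketch (object ids, the path and `objects` are illustrative); both sides
    // must be set before a diff can be prepared:
    //
    //   platform.set_resource(old_id, gix_object::tree::EntryKind::Blob, "src/lib.rs".into(),
    //       ResourceKind::OldOrSource, &objects)?;
    //   platform.set_resource(new_id, gix_object::tree::EntryKind::Blob, "src/lib.rs".into(),
    //       ResourceKind::NewOrDestination, &objects)?;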
pub fn set_resource( &mut self, id: gix_hash::ObjectId, mode: gix_object::tree::EntryKind, rela_path: &BStr, kind: ResourceKind, objects: &impl gix_object::FindObjectOrHeader, // TODO: make this `dyn` once https://github.com/rust-lang/rust/issues/65991 is stable, then also make tracker.rs `objects` dyn ) -> Result<(), set_resource::Error> { let res = self.set_resource_inner(id, mode, rela_path, kind, objects); if res.is_err() { *match kind { ResourceKind::OldOrSource => &mut self.old, ResourceKind::NewOrDestination => &mut self.new, } = None; } res } /// Given `diff_command` and `context`, typically obtained from git-configuration, and the currently set diff-resources, /// prepare the invocation and temporary files needed to launch it according to protocol. /// `count` / `total` are used for progress indication passed as environment variables `GIT_DIFF_PATH_(COUNTER|TOTAL)` /// respectively (0-based), so the first path has `count=0` and `total=1` (assuming there is only one path). /// Returns `None` if at least one resource is unset, see [`set_resource()`](Self::set_resource()). /// /// Please note that this is an expensive operation this will always create up to two temporary files to hold the data /// for the old and new resources. /// /// ### Deviation /// /// If one of the resources is binary, the operation reports an error as such resources don't make their data available /// which is required for the external diff to run. // TODO: fix this - the diff shouldn't fail if binary (or large) files are used, just copy them into tempfiles. pub fn prepare_diff_command( &self, diff_command: BString, context: gix_command::Context, count: usize, total: usize, ) -> Result { fn add_resource( cmd: &mut std::process::Command, res: Resource<'_>, ) -> Result>, prepare_diff_command::Error> { let tmpfile = match res.data { resource::Data::Missing => { cmd.args(["/dev/null", ".", "."]); None } resource::Data::Buffer(buf) => { let mut tmp = gix_tempfile::new( std::env::temp_dir(), gix_tempfile::ContainingDirectory::Exists, gix_tempfile::AutoRemove::Tempfile, ) .map_err(|err| prepare_diff_command::Error::CreateTempfile { rela_path: res.rela_path.to_owned(), source: err, })?; tmp.write_all(buf) .map_err(|err| prepare_diff_command::Error::WriteTempfile { rela_path: res.rela_path.to_owned(), source: err, })?; tmp.with_mut(|f| { cmd.arg(f.path()); }) .map_err(|err| prepare_diff_command::Error::WriteTempfile { rela_path: res.rela_path.to_owned(), source: err, })?; cmd.arg(res.id.to_string()).arg(res.mode.as_octal_str().to_string()); let tmp = tmp.close().map_err(|err| prepare_diff_command::Error::WriteTempfile { rela_path: res.rela_path.to_owned(), source: err, })?; Some(tmp) } resource::Data::Binary { .. 
} => return Err(prepare_diff_command::Error::SourceOrDestinationBinary), }; Ok(tmpfile) } let (old, new) = self .resources() .ok_or(prepare_diff_command::Error::SourceOrDestinationUnset)?; let mut cmd: std::process::Command = gix_command::prepare(gix_path::from_bstring(diff_command)) .with_context(context) .env("GIT_DIFF_PATH_COUNTER", (count + 1).to_string()) .env("GIT_DIFF_PATH_TOTAL", total.to_string()) .stdin(Stdio::inherit()) .stdout(Stdio::inherit()) .stderr(Stdio::inherit()) .into(); cmd.arg(gix_path::from_bstr(old.rela_path).into_owned()); let mut out = prepare_diff_command::Command { cmd, old: None, new: None, }; out.old = add_resource(&mut out.cmd, old)?; out.new = add_resource(&mut out.cmd, new)?; if old.rela_path != new.rela_path { out.cmd.arg(gix_path::from_bstr(new.rela_path).into_owned()); } Ok(out) } /// Returns the resource of the given kind if it was set. pub fn resource(&self, kind: ResourceKind) -> Option> { let key = match kind { ResourceKind::OldOrSource => self.old.as_ref(), ResourceKind::NewOrDestination => self.new.as_ref(), }?; Resource::new(key, self.diff_cache.get(key)?).into() } /// Obtain the two resources that were previously set as `(OldOrSource, NewOrDestination)`, if both are set and available. /// /// This is useful if one wishes to manually prepare the diff, maybe for invoking external programs, instead of relying on /// [`Self::prepare_diff()`]. pub fn resources(&self) -> Option<(Resource<'_>, Resource<'_>)> { let key = &self.old.as_ref()?; let value = self.diff_cache.get(key)?; let old = Resource::new(key, value); let key = &self.new.as_ref()?; let value = self.diff_cache.get(key)?; let new = Resource::new(key, value); Some((old, new)) } /// Prepare a diff operation on the [previously set](Self::set_resource()) [old](ResourceKind::OldOrSource) and /// [new](ResourceKind::NewOrDestination) resources. /// /// The returned outcome allows to easily perform diff operations, based on the [`prepare_diff::Outcome::operation`] field, /// which hints at what should be done. pub fn prepare_diff(&mut self) -> Result, prepare_diff::Error> { let old_key = &self.old.as_ref().ok_or(prepare_diff::Error::SourceOrDestinationUnset)?; let old = self .diff_cache .get(old_key) .ok_or(prepare_diff::Error::SourceOrDestinationUnset)?; let new_key = &self.new.as_ref().ok_or(prepare_diff::Error::SourceOrDestinationUnset)?; let new = self .diff_cache .get(new_key) .ok_or(prepare_diff::Error::SourceOrDestinationUnset)?; let mut out = prepare_diff::Outcome { operation: prepare_diff::Operation::SourceOrDestinationIsBinary, old: Resource::new(old_key, old), new: Resource::new(new_key, new), }; match (old.conversion.data, new.conversion.data) { (None, None) => return Err(prepare_diff::Error::SourceAndDestinationRemoved), (Some(pipeline::Data::Binary { .. }), _) | (_, Some(pipeline::Data::Binary { .. })) => return Ok(out), _either_missing_or_non_binary => { if let Some(command) = old .conversion .driver_index .and_then(|idx| self.filter.drivers[idx].command.as_deref()) .filter(|_| self.options.skip_internal_diff_if_external_is_configured) { out.operation = prepare_diff::Operation::ExternalCommand { command: command.as_bstr(), }; return Ok(out); } } } out.operation = prepare_diff::Operation::InternalDiff { algorithm: old .conversion .driver_index .and_then(|idx| self.filter.drivers[idx].algorithm) .or(self.options.algorithm) .unwrap_or_default(), }; Ok(out) } /// Every call to [set_resource()](Self::set_resource()) will keep the diffable data in memory, and that will never be cleared. 
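    // A hedged sketch of consuming `prepare_diff()` above for the internal diff path
    // (assuming both resources were set; `imara-diff` is re-exported from this module):
    //
    //   let outcome = platform.prepare_diff()?;
    //   if let prepare_diff::Operation::InternalDiff { algorithm } = outcome.operation {
    //       let input = outcome.interned_input();
    //       let counts = imara_diff::diff(algorithm, &input, imara_diff::sinks::Counter::default());
    //       // counts.insertions / counts.removals summarize the change
    //   }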
/// /// Use this method to clear the cache, releasing memory. Note that this will also lose all information about resources /// which means diffs would fail unless the resources are set again. /// /// Note that this also has to be called if the same resource is going to be diffed in different states, i.e. using different /// `id`s, but the same `rela_path`. pub fn clear_resource_cache(&mut self) { self.old = None; self.new = None; self.diff_cache.clear(); self.free_list.clear(); } /// Every call to [set_resource()](Self::set_resource()) will keep the diffable data in memory, and that will never be cleared. /// /// Use this method to clear the cache, but keep the previously used buffers around for later re-use. /// /// If there are more buffers on the free-list than there are stored sources, we half that amount each time this method is called, /// or keep as many resources as were previously stored, or 2 buffers, whatever is larger. /// If there are fewer buffers in the free-list than are in the resource cache, we will keep as many as needed to match the /// number of previously stored resources. /// /// Returns the number of available buffers. pub fn clear_resource_cache_keep_allocation(&mut self) -> usize { self.old = None; self.new = None; let diff_cache = std::mem::take(&mut self.diff_cache); match self.free_list.len().cmp(&diff_cache.len()) { Ordering::Less => { let to_take = diff_cache.len() - self.free_list.len(); self.free_list .extend(diff_cache.into_values().map(|v| v.buffer).take(to_take)); } Ordering::Equal => {} Ordering::Greater => { let new_len = (self.free_list.len() / 2).max(diff_cache.len()).max(2); self.free_list.truncate(new_len); } } self.free_list.len() } } impl Platform { fn set_resource_inner( &mut self, id: gix_hash::ObjectId, mode: gix_object::tree::EntryKind, rela_path: &BStr, kind: ResourceKind, objects: &impl gix_object::FindObjectOrHeader, ) -> Result<(), set_resource::Error> { if matches!( mode, gix_object::tree::EntryKind::Commit | gix_object::tree::EntryKind::Tree ) { return Err(set_resource::Error::InvalidMode { mode }); } let storage = match kind { ResourceKind::OldOrSource => &mut self.old, ResourceKind::NewOrDestination => &mut self.new, } .get_or_insert_with(Default::default); storage.id = id; storage.set_location(rela_path); storage.is_link = matches!(mode, gix_object::tree::EntryKind::Link); storage.use_id = self.filter.roots.by_kind(kind).is_none(); if self.diff_cache.contains_key(storage) { return Ok(()); } let entry = self.attr_stack .at_entry(rela_path, None, objects) .map_err(|err| set_resource::Error::Attributes { source: err, kind, rela_path: rela_path.to_owned(), })?; let mut buf = self.free_list.pop().unwrap_or_default(); let out = self.filter.convert_to_diffable( &id, mode, rela_path, kind, &mut |_, out| { let _ = entry.matching_attributes(out); }, objects, self.filter_mode, &mut buf, )?; let key = storage.clone(); assert!( self.diff_cache .insert( key, CacheValue { conversion: out, mode, buffer: buf, }, ) .is_none(), "The key impl makes clashes impossible with our usage" ); Ok(()) } } gix-diff-0.49.0/src/lib.rs000064400000000000000000000050601046102023000133170ustar 00000000000000//! Algorithms for diffing various git object types and for generating patches, highly optimized for performance. //! 
## Feature Flags #![cfg_attr( all(doc, feature = "document-features"), doc = ::document_features::document_features!() )] #![cfg_attr(all(doc, feature = "document-features"), feature(doc_cfg, doc_auto_cfg))] #![deny(missing_docs, rust_2018_idioms)] #![forbid(unsafe_code)] /// Re-export for use in public API. #[cfg(feature = "blob")] pub use gix_command as command; /// Re-export for use in public API. #[cfg(feature = "blob")] pub use gix_object as object; /// A structure to capture how to perform rename and copy tracking, used by the [rewrites::Tracker]. #[derive(Debug, Copy, Clone, PartialEq)] #[cfg(feature = "blob")] pub struct Rewrites { /// If `Some(…)`, also find copies. `None` is the default which does not try to detect copies at all. /// /// Note that this is an even more expensive operation than detecting renames stemming from additions and deletions /// as the resulting set to search through is usually larger. pub copies: Option, /// The percentage of similarity needed for files to be considered renamed, defaulting to `Some(0.5)`. /// This field is similar to `git diff -M50%`. /// /// If `None`, files are only considered equal if their content matches 100%. /// Note that values greater than 1.0 have no different effect than 1.0. pub percentage: Option, /// The amount of files to consider for fuzzy rename or copy tracking. Defaults to 1000, meaning that only 1000*1000 /// combinations can be tested for fuzzy matches, i.e. the ones that try to find matches by comparing similarity. /// If 0, there is no limit. /// /// If the limit would not be enough to test the entire set of combinations, the algorithm will trade in precision and not /// run the fuzzy version of identity tests at all. That way results are never partial. pub limit: usize, /// If `true`, empty blobs will be tracked. If `false`, they do not participate in rename tracking. /// /// Leaving this off usually leads to better results as empty files don't have a unique-enough identity. pub track_empty: bool, } /// Contains a [Tracker](rewrites::Tracker) to detect rewrites. #[cfg(feature = "blob")] pub mod rewrites; /// pub mod tree; pub use tree::function::diff as tree; /// #[cfg(feature = "blob")] pub mod tree_with_rewrites; #[cfg(feature = "blob")] pub use tree_with_rewrites::function::diff as tree_with_rewrites; /// #[cfg(feature = "blob")] pub mod blob; gix-diff-0.49.0/src/rewrites/mod.rs000064400000000000000000000060511046102023000151750ustar 00000000000000use crate::tree::visit::ChangeId; use crate::Rewrites; use std::collections::BTreeSet; /// Types related to the rename tracker for renames, rewrites and copies. pub mod tracker; /// A type to retain state related to an ongoing tracking operation to retain sets of interesting changes /// of which some are retained to at a later stage compute the ones that seem to be renames or copies. pub struct Tracker { /// The tracked items thus far, which will be used to determine renames/copies and rewrites later. items: Vec>, /// A place to store all paths in to reduce amount of allocations. path_backing: Vec, /// How to track copies and/or rewrites. rewrites: Rewrites, /// Previously emitted relation ids of rewrite pairs, with `(deleted source, added destination)`. child_renames: BTreeSet<(ChangeId, ChangeId)>, } /// Determine in which set of files to search for copies. #[derive(Default, Debug, Copy, Clone, Eq, PartialEq)] pub enum CopySource { /// Find copies from the set of modified files only. 
#[default] FromSetOfModifiedFiles, /// Find copies from the set of modified files, as well as all files known to the source (i.e. previous state of the tree). /// /// This can be an expensive operation as it scales exponentially with the total amount of files in the set. FromSetOfModifiedFilesAndAllSources, } /// Under which circumstances we consider a file to be a copy. #[derive(Debug, Copy, Clone, PartialEq)] pub struct Copies { /// The set of files to search when finding the source of copies. pub source: CopySource, /// Equivalent to [`Rewrites::percentage`], but used for copy tracking. /// /// Useful to have similarity-based rename tracking and cheaper copy tracking. pub percentage: Option, } impl Default for Copies { fn default() -> Self { Copies { source: CopySource::default(), percentage: Some(0.5), } } } /// Information collected while handling rewrites of files which may be tracked. #[derive(Default, Clone, Copy, Debug, PartialEq)] pub struct Outcome { /// The options used to guide the rewrite tracking. Either fully provided by the caller or retrieved from git configuration. pub options: Rewrites, /// The amount of similarity checks that have been conducted to find renamed files and potentially copies. pub num_similarity_checks: usize, /// Set to the amount of worst-case rename permutations we didn't search as our limit didn't allow it. pub num_similarity_checks_skipped_for_rename_tracking_due_to_limit: usize, /// Set to the amount of worst-case copy permutations we didn't search as our limit didn't allow it. pub num_similarity_checks_skipped_for_copy_tracking_due_to_limit: usize, } /// The default settings for rewrites according to the git configuration defaults. impl Default for Rewrites { fn default() -> Self { Rewrites { copies: None, percentage: Some(0.5), limit: 1000, track_empty: false, } } } gix-diff-0.49.0/src/rewrites/tracker.rs000064400000000000000000001026551046102023000160600ustar 00000000000000//! ### Deviation //! //! Note that the algorithm implemented here is in many ways different from what `git` does. //! //! - it's less sophisticated and doesn't use any ranking of candidates. Instead, it picks the first possible match. //! - the set used for copy-detection is probably smaller by default. use std::ops::Range; use bstr::{BStr, ByteSlice}; use gix_object::tree::{EntryKind, EntryMode}; use crate::rewrites::tracker::visit::SourceKind; use crate::tree::visit::{Action, ChangeId, Relation}; use crate::{ blob::{platform::prepare_diff::Operation, DiffLineStats, ResourceKind}, rewrites::{CopySource, Outcome, Tracker}, Rewrites, }; /// The kind of a change. #[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)] pub enum ChangeKind { /// The change represents the *deletion* of an item. Deletion, /// The change represents the *modification* of an item. Modification, /// The change represents the *addition* of an item. Addition, } /// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`]. pub trait Change: Clone { /// Return the hash of the object behind this change for identification. /// /// Note that this is the id of the object as stored in `git`, i.e. it must have gone through workspace /// conversions. What matters is that the IDs are comparable. fn id(&self) -> &gix_hash::oid; /// Return the relation that this change may have with other changes. /// /// It allows to associate a directory with its children that are added or removed at the same moment. /// Note that this is ignored for modifications. 
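    ///
    /// As a rough illustration, the implementation this crate provides for [`tree::visit::Change`](crate::tree::visit::Change)
    /// (shown further below) simply forwards the relation stored on additions and deletions:
    ///
    /// ```ignore
    /// fn relation(&self) -> Option<Relation> {
    ///     match self {
    ///         Change::Addition { relation, .. } | Change::Deletion { relation, .. } => *relation,
    ///         Change::Modification { .. } => None,
    ///     }
    /// }
    /// ```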
/// /// If rename-tracking should always be on leaf-level, this should be set to `None` consistently. /// Note that trees will never be looked up by their `id` as their children are assumed to be passed in /// with the respective relationship. /// /// Also note that the tracker only sees what's given to it, it will not lookup trees or match paths itself. fn relation(&self) -> Option; /// Return the kind of this change. fn kind(&self) -> ChangeKind; /// Return more information about the kind of entry affected by this change. fn entry_mode(&self) -> EntryMode; /// Return the id of the change along with its mode. fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode); } /// A set of tracked items allows to figure out their relations by figuring out their similarity. pub(crate) struct Item { /// The underlying raw change change: T, /// That slice into the backing for paths. path: Range, /// If true, this item was already emitted, i.e. seen by the caller. emitted: bool, } impl Item { fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr { backing[self.path.clone()].as_ref() } fn entry_mode_compatible(&self, other: EntryMode) -> bool { use EntryKind::*; matches!( (other.kind(), self.change.entry_mode().kind()), (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) | (Tree, Tree) ) } fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool { self.entry_mode_compatible(dest_item_mode) && match kind { visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion), visit::SourceKind::Copy => { matches!(self.change.kind(), ChangeKind::Modification) } } } } /// A module with types used in the user-callback in [Tracker::emit()](crate::rewrites::Tracker::emit()). pub mod visit { use bstr::BStr; use gix_object::tree::EntryMode; use crate::blob::DiffLineStats; /// The source of a rewrite, rename or copy. #[derive(Debug, Clone, PartialEq, PartialOrd)] pub struct Source<'a, T> { /// The kind of entry. pub entry_mode: EntryMode, /// The hash of the state of the source as seen in the object database. pub id: gix_hash::ObjectId, /// Further specify what kind of source this is. pub kind: SourceKind, /// The repository-relative location of this entry. pub location: &'a BStr, /// The change that was registered as source. pub change: &'a T, /// If this is a rewrite, indicate how many lines would need to change to turn this source into the destination. pub diff: Option, } /// Further identify the kind of [Source]. #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum SourceKind { /// This is the source of an entry that was renamed, as `source` was renamed to `destination`. Rename, /// This is the source of a copy, as `source` was copied into `destination`. Copy, } /// A change along with a location. #[derive(Debug, Clone)] pub struct Destination<'a, T: Clone> { /// The change at the given `location`. pub change: T, /// The repository-relative location of this destination. pub location: &'a BStr, } } /// pub mod emit { /// The error returned by [Tracker::emit()](super::Tracker::emit()). 
#[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error("Could not find blob for similarity checking")] FindExistingBlob(#[from] gix_object::find::existing_object::Error), #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")] GetItemsForExhaustiveCopyDetection(#[source] Box), #[error(transparent)] SetResource(#[from] crate::blob::platform::set_resource::Error), #[error(transparent)] PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error), } } /// Lifecycle impl Tracker { /// Create a new instance with `rewrites` configuration. pub fn new(rewrites: Rewrites) -> Self { Tracker { items: vec![], path_backing: vec![], rewrites, child_renames: Default::default(), } } } /// build state and find matches. impl Tracker { /// We may refuse the push if that information isn't needed for what we have to track. pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option { let change_kind = change.kind(); if let (None, ChangeKind::Modification { .. }) = (self.rewrites.copies, change_kind) { return Some(change); }; let entry_kind = change.entry_mode().kind(); if entry_kind == EntryKind::Commit { return Some(change); } let relation = change .relation() .filter(|_| matches!(change_kind, ChangeKind::Addition | ChangeKind::Deletion)); if let (None, EntryKind::Tree) = (relation, entry_kind) { return Some(change); }; let start = self.path_backing.len(); self.path_backing.extend_from_slice(location); let path = start..self.path_backing.len(); self.items.push(Item { path, change, emitted: false, }); None } /// Can only be called once effectively as it alters its own state to assure each item is only emitted once. /// /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's /// the destination of a copy or rename, or with `None` for source if no relation to other /// items in the tracked set exist, which is like saying 'no rename or rewrite or copy' happened. /// Note that directories with [relation](Relation) will be emitted if there is a match, along with all their matching /// child-items which are similarly bundled as rename. /// /// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database. /// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not* /// be enabled as caching is implemented by `diff_cache`, after all, the blob that's actually diffed is going /// through conversion steps. /// /// `diff_cache` is a way to retain a cache of resources that are prepared for rapid diffing, and it also controls /// the diff-algorithm (provided no user-algorithm is set). /// Note that we control a few options of `diff_cache` to assure it will ignore external commands. /// Note that we do not control how the `diff_cache` converts resources, it's left to the caller to decide /// if it should look at what's stored in `git`, or in the working tree, along with all diff-specific conversions. /// /// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source /// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push` /// will panic if `change` is not a modification, and it's valid to not call `push` at all. 
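    ///
    /// ### Example (rough sketch)
    ///
    /// A hypothetical invocation; `tracker`, `platform` (the `diff_cache`) and `odb` are assumed to exist,
    /// with changes previously fed to `tracker` via [`try_push_change()`](Self::try_push_change()):
    ///
    /// ```ignore
    /// use gix_diff::tree::visit::Action;
    ///
    /// let _outcome = tracker.emit(
    ///     |destination, source| {
    ///         match source {
    ///             // A rename, rewrite or copy was detected - `source` is where the content came from.
    ///             Some(source) => println!("{} -> {}", source.location, destination.location),
    ///             // No relation to any other change was found.
    ///             None => println!("{}", destination.location),
    ///         }
    ///         Action::Continue
    ///     },
    ///     &mut platform,
    ///     &odb,
    ///     // This sketch does not provide additional copy sources.
    ///     |_push| Ok::<_, std::io::Error>(()),
    /// )?;
    /// ```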
pub fn emit( &mut self, mut cb: impl FnMut(visit::Destination<'_, T>, Option>) -> Action, diff_cache: &mut crate::blob::Platform, objects: &impl gix_object::FindObjectOrHeader, mut push_source_tree: PushSourceTreeFn, ) -> Result where PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>, E: std::error::Error + Send + Sync + 'static, { fn is_parent(change: &impl Change) -> bool { matches!(change.relation(), Some(Relation::Parent(_))) } diff_cache.options.skip_internal_diff_if_external_is_configured = false; // The point of this is to optimize for identity-based lookups, which should be easy to find // by partitioning. fn by_id_and_location(a: &Item, b: &Item) -> std::cmp::Ordering { a.change .id() .cmp(b.change.id()) .then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end))) } let mut out = Outcome { options: self.rewrites, ..Default::default() }; self.items.sort_by(by_id_and_location); // Rewrites by directory (without local changes) can be pruned out quickly, // by finding only parents, their counterpart, and then all children can be matched by // relationship ID. self.match_pairs_of_kind( visit::SourceKind::Rename, &mut cb, None, /* by identity for parents */ &mut out, diff_cache, objects, Some(is_parent), )?; self.match_pairs_of_kind( visit::SourceKind::Rename, &mut cb, self.rewrites.percentage, &mut out, diff_cache, objects, None, )?; self.match_renamed_directories(&mut cb)?; if let Some(copies) = self.rewrites.copies { self.match_pairs_of_kind( visit::SourceKind::Copy, &mut cb, copies.percentage, &mut out, diff_cache, objects, None, )?; match copies.source { CopySource::FromSetOfModifiedFiles => {} CopySource::FromSetOfModifiedFilesAndAllSources => { push_source_tree(&mut |change, location| { if self.try_push_change(change, location).is_none() { // make sure these aren't viable to be emitted anymore. self.items.last_mut().expect("just pushed").emitted = true; } }) .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?; self.items.sort_by(by_id_and_location); self.match_pairs_of_kind( visit::SourceKind::Copy, &mut cb, copies.percentage, &mut out, diff_cache, objects, None, )?; } } } self.items .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing))); for item in self.items.drain(..).filter(|item| !item.emitted) { if cb( visit::Destination { location: item.location(&self.path_backing), change: item.change, }, None, ) == Action::Cancel { break; } } Ok(out) } } impl Tracker { #[allow(clippy::too_many_arguments)] fn match_pairs_of_kind( &mut self, kind: visit::SourceKind, cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> Action, percentage: Option, out: &mut Outcome, diff_cache: &mut crate::blob::Platform, objects: &impl gix_object::FindObjectOrHeader, filter: Option bool>, ) -> Result<(), emit::Error> { // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively. let needs_second_pass = !needs_exact_match(percentage); // https://github.com/git/git/blob/cc01bad4a9f566cf4453c7edd6b433851b0835e2/diffcore-rename.c#L350-L369 // We would need a hashmap to be OK to not use the limit here, otherwise the performance is too bad. // This also means we don't find all renames if we hit the rename limit. if self.match_pairs(cb, None /* by identity */, kind, out, diff_cache, objects, filter)? 
== Action::Cancel { return Ok(()); } if needs_second_pass { let is_limited = if self.rewrites.limit == 0 { false } else { let (num_src, num_dst) = estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind); let permutations = num_src * num_dst; if permutations > self.rewrites.limit { match kind { visit::SourceKind::Rename => { out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations; } visit::SourceKind::Copy => { out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations; } } true } else { false } }; if !is_limited { self.match_pairs(cb, percentage, kind, out, diff_cache, objects, None)?; } } Ok(()) } #[allow(clippy::too_many_arguments)] fn match_pairs( &mut self, cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> Action, percentage: Option, kind: visit::SourceKind, stats: &mut Outcome, diff_cache: &mut crate::blob::Platform, objects: &impl gix_object::FindObjectOrHeader, filter: Option bool>, ) -> Result { let mut dest_ofs = 0; let mut num_checks = 0; let max_checks = { let limit = self.rewrites.limit.saturating_pow(2); // There can be trees with a lot of entries and pathological search behaviour, as they can be repeated // and then have a lot of similar hashes. This also means we have to search a lot of candidates which // can be too slow despite best attempts. So play it save and detect such cases 'roughly' by amount of items. if self.items.len() < 100_000 { 0 } else { limit } }; while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| { (!item.emitted && matches!(item.change.kind(), ChangeKind::Addition) && filter.map_or_else( || { self.rewrites.track_empty // We always want to keep track of entries that are involved of a directory rename. // Note that this may still match them up arbitrarily if empty, but empty is empty. || matches!(item.change.relation(), Some(Relation::ChildOfParent(_))) || { let id = item.change.id(); id != gix_hash::ObjectId::empty_blob(id.kind()) } }, |f| f(&item.change), )) .then_some((idx, item)) }) { dest_idx += dest_ofs; dest_ofs = dest_idx + 1; self.items[dest_idx].location(&self.path_backing); let src = find_match( &self.items, dest, dest_idx, percentage, kind, stats, objects, diff_cache, &self.path_backing, &mut num_checks, )? 
.map(|(src_idx, src, diff)| { let (id, entry_mode) = src.change.id_and_entry_mode(); let id = id.to_owned(); let location = src.location(&self.path_backing); ( visit::Source { entry_mode, id, kind, location, change: &src.change, diff, }, src_idx, ) }); if max_checks != 0 && num_checks > max_checks { gix_trace::warn!( "Cancelled rename matching as there were too many iterations ({num_checks} > {max_checks})" ); return Ok(Action::Cancel); } let Some((src, src_idx)) = src else { continue; }; let location = dest.location(&self.path_backing); let change = dest.change.clone(); let dest = visit::Destination { change, location }; let relations = if percentage.is_none() { src.change.relation().zip(dest.change.relation()) } else { None }; let res = cb(dest, Some(src)); self.items[dest_idx].emitted = true; self.items[src_idx].emitted = true; if res == Action::Cancel { return Ok(Action::Cancel); } match relations { Some((Relation::Parent(src), Relation::Parent(dst))) => { let res = self.emit_child_renames_matching_identity(cb, kind, src, dst)?; if res == Action::Cancel { return Ok(Action::Cancel); } } Some((Relation::ChildOfParent(src), Relation::ChildOfParent(dst))) => { self.child_renames.insert((src, dst)); } _ => {} } } Ok(Action::Continue) } /// Emit the children of `src_parent_id` and `dst_parent_id` as pairs of exact matches, which are assumed /// as `src` and `dst` were an exact match (so all children have to match exactly). /// Note that we intentionally do not record them as their parents will be emitted, too. fn emit_child_renames_matching_identity( &mut self, cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> Action, kind: visit::SourceKind, src_parent_id: ChangeId, dst_parent_id: ChangeId, ) -> Result { debug_assert_ne!( src_parent_id, dst_parent_id, "src and destination directories must be distinct" ); let (mut src_items, mut dst_items) = (Vec::with_capacity(1), Vec::with_capacity(1)); for item in self.items.iter_mut().filter(|item| !item.emitted) { match item.change.relation() { Some(Relation::ChildOfParent(id)) if id == src_parent_id => { src_items.push((item.change.id().to_owned(), item)); } Some(Relation::ChildOfParent(id)) if id == dst_parent_id => { dst_items.push((item.change.id().to_owned(), item)); } _ => continue, }; } for ((src_id, src_item), (dst_id, dst_item)) in src_items.into_iter().zip(dst_items) { // Since the parent items are already identical by ID, we know that the children will also match, we just // double-check to still have a chance to be correct in case some of that goes wrong. if src_id == dst_id && filename(src_item.location(&self.path_backing)) == filename(dst_item.location(&self.path_backing)) { let entry_mode = src_item.change.entry_mode(); let location = src_item.location(&self.path_backing); let src = visit::Source { entry_mode, id: src_id, kind, location, change: &src_item.change, diff: None, }; let location = dst_item.location(&self.path_backing); let change = dst_item.change.clone(); let dst = visit::Destination { change, location }; let res = cb(dst, Some(src)); src_item.emitted = true; dst_item.emitted = true; if res == Action::Cancel { return Ok(res); } } else { gix_trace::warn!("Children of parents with change-id {src_parent_id} and {dst_parent_id} were not equal, even though their parents claimed to be"); break; } } Ok(Action::Continue) } /// Find directories with relation id that haven't been emitted yet and store them for lookup. 
/// Then use the previously stored emitted renames with relation id to learn which directories they 'link' /// and emit them, too. /// Note that this works whenever top-level directories are renamed because they are always added and deleted, /// and we only match those. Thus, one rewrite inside the directory is enough. fn match_renamed_directories( &mut self, cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> Action, ) -> Result<(), emit::Error> { fn unemitted_directory_matching_relation_id(items: &[Item], child_id: ChangeId) -> Option { items.iter().position(|i| { !i.emitted && matches!(i.change.relation(), Some(Relation::Parent(pid)) if pid == child_id) }) } for (deleted_child_id, added_child_id) in &self.child_renames { let Some(src_idx) = unemitted_directory_matching_relation_id(&self.items, *deleted_child_id) else { continue; }; let Some(dst_idx) = unemitted_directory_matching_relation_id(&self.items, *added_child_id) else { // This could go wrong in case there are mismatches, so be defensive here. // But generally, we'd expect the destination item to exist. continue; }; let (src_item, dst_item) = (&self.items[src_idx], &self.items[dst_idx]); let entry_mode = src_item.change.entry_mode(); let location = src_item.location(&self.path_backing); let src = visit::Source { entry_mode, id: src_item.change.id().to_owned(), kind: SourceKind::Rename, location, change: &src_item.change, diff: None, }; let location = dst_item.location(&self.path_backing); let change = dst_item.change.clone(); let dst = visit::Destination { change, location }; let res = cb(dst, Some(src)); self.items[src_idx].emitted = true; self.items[dst_idx].emitted = true; if res == Action::Cancel { return Ok(()); } } Ok(()) } } fn filename(path: &BStr) -> &BStr { path.rfind_byte(b'/').map_or(path, |idx| path[idx + 1..].as_bstr()) } /// Returns the amount of viable sources and destinations for `items` as eligible for the given `kind` of operation. fn estimate_involved_items( items: impl IntoIterator, kind: visit::SourceKind, ) -> (usize, usize) { items .into_iter() .filter(|(emitted, _)| match kind { visit::SourceKind::Rename => !*emitted, visit::SourceKind::Copy => true, }) .fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| { match change_kind { ChangeKind::Addition => { if kind == visit::SourceKind::Rename || !emitted { dest += 1; } } ChangeKind::Deletion => { if kind == visit::SourceKind::Rename { src += 1; } } ChangeKind::Modification => { if kind == visit::SourceKind::Copy { src += 1; } } } (src, dest) }) } fn needs_exact_match(percentage: Option) -> bool { percentage.map_or(true, |p| p >= 1.0) } /// <`src_idx`, src, possibly diff stat> type SourceTuple<'a, T> = (usize, &'a Item, Option); /// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`. /// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity. /// We also ignore emitted items entirely. /// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or /// any non-deletion otherwise. /// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set /// of items to be searched. 
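///
/// For blobs compared by similarity rather than identity, the score is computed as
/// `(old_len - removed_bytes) / max(old_len, new_len)`.
/// A rough worked example with made-up numbers: an old file of 100 bytes, a new file of 90 bytes and a diff
/// that removes 30 bytes of the old content yield `(100 - 30) / 100 = 0.7`, which passes a 50% threshold
/// (as in `-M50%`) but not a 90% one.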
#[allow(clippy::too_many_arguments)] fn find_match<'a, T: Change>( items: &'a [Item], item: &Item, item_idx: usize, percentage: Option, kind: visit::SourceKind, stats: &mut Outcome, objects: &impl gix_object::FindObjectOrHeader, diff_cache: &mut crate::blob::Platform, path_backing: &[u8], num_checks: &mut usize, ) -> Result>, emit::Error> { let (item_id, item_mode) = item.change.id_and_entry_mode(); if needs_exact_match(percentage) || item_mode.is_link() { let first_idx = items.partition_point(|a| a.change.id() < item_id); let range = items.get(first_idx..).map(|slice| { let end = slice .iter() .position(|a| a.change.id() != item_id) .map_or(items.len(), |idx| first_idx + idx); first_idx..end }); let range = match range { Some(range) => range, None => return Ok(None), }; if range.is_empty() { return Ok(None); } let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| { src_idx += range.start; *num_checks += 1; (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None)) }); if let Some(src) = res { return Ok(Some(src)); } } else if item_mode.is_blob() { let mut has_new = false; let percentage = percentage.expect("it's set to something below 1.0 and we assured this"); for (can_idx, src) in items .iter() .enumerate() .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)) { if !has_new { diff_cache.set_resource( item_id.to_owned(), item_mode.kind(), item.location(path_backing), ResourceKind::NewOrDestination, objects, )?; has_new = true; } let (src_id, src_mode) = src.change.id_and_entry_mode(); diff_cache.set_resource( src_id.to_owned(), src_mode.kind(), src.location(path_backing), ResourceKind::OldOrSource, objects, )?; let prep = diff_cache.prepare_diff()?; stats.num_similarity_checks += 1; *num_checks += 1; match prep.operation { Operation::InternalDiff { algorithm } => { let tokens = crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source()); let counts = crate::blob::diff( algorithm, &tokens, crate::blob::sink::Counter::new(diff::Statistics { removed_bytes: 0, input: &tokens, }), ); let old_data_len = prep.old.data.as_slice().unwrap_or_default().len(); let new_data_len = prep.new.data.as_slice().unwrap_or_default().len(); let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32; if similarity >= percentage { return Ok(Some(( can_idx, src, DiffLineStats { removals: counts.removals, insertions: counts.insertions, before: tokens.before.len().try_into().expect("interner handles only u32"), after: tokens.after.len().try_into().expect("interner handles only u32"), similarity, } .into(), ))); } } Operation::ExternalCommand { .. 
} => { unreachable!("we have disabled this possibility with an option") } Operation::SourceOrDestinationIsBinary => { // TODO: figure out if git does more here } }; } } Ok(None) } mod diff { use std::ops::Range; pub struct Statistics<'a, 'data> { pub removed_bytes: usize, pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>, } impl crate::blob::Sink for Statistics<'_, '_> { type Out = usize; fn process_change(&mut self, before: Range, _after: Range) { self.removed_bytes += self.input.before[before.start as usize..before.end as usize] .iter() .map(|token| self.input.interner[*token].len()) .sum::(); } fn finish(self) -> Self::Out { self.removed_bytes } } } #[cfg(test)] mod estimate_involved_items { use super::estimate_involved_items; use crate::rewrites::tracker::{visit::SourceKind, ChangeKind}; #[test] fn renames_count_unemitted_as_sources_and_destinations() { let items = [ (false, ChangeKind::Addition), (true, ChangeKind::Deletion), (true, ChangeKind::Deletion), ]; assert_eq!( estimate_involved_items(items, SourceKind::Rename), (0, 1), "here we only have one eligible source, hence nothing to do" ); assert_eq!( estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename), (2, 1), "now we have more possibilities as renames count un-emitted deletions as source" ); } #[test] fn copies_do_not_count_additions_as_sources() { let items = [ (false, ChangeKind::Addition), (true, ChangeKind::Addition), (true, ChangeKind::Deletion), ]; assert_eq!( estimate_involved_items(items, SourceKind::Copy), (0, 1), "one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\ deletions don't count" ); } #[test] fn copies_count_modifications_as_sources() { let items = [ (false, ChangeKind::Addition), (true, ChangeKind::Modification), (false, ChangeKind::Modification), ]; assert_eq!( estimate_involved_items(items, SourceKind::Copy), (2, 1), "any modifications is a valid source, emitted or not" ); } } gix-diff-0.49.0/src/tree/function.rs000064400000000000000000000402351046102023000153400ustar 00000000000000use std::{borrow::BorrowMut, collections::VecDeque}; use gix_object::{tree::EntryRef, FindExt, TreeRefIter}; use crate::tree::visit::{ChangeId, Relation}; use crate::tree::{visit::Change, Error, State, TreeInfoTuple, Visit}; /// Calculate the changes that would need to be applied to `lhs` to get `rhs` using `objects` to obtain objects as needed for traversal. /// `state` can be used between multiple calls to re-use memory. /// /// * The `state` maybe owned or mutably borrowed to allow reuses allocated data structures through multiple runs. /// * `delegate` will receive the computed changes, see the [`Visit`] trait for more information on what to expect. /// /// # Notes /// /// * `lhs` can be an empty tree to simulate what would happen if the left-hand side didn't exist. /// * To obtain progress, implement it within the `delegate`. /// * Tree entries are expected to be ordered using [`tree-entry-comparison`][git_cmp_c] (the same [in Rust][git_cmp_rs]) /// * it does a breadth first iteration as buffer space only fits two trees, the current one on the one we compare with. /// * does not do rename tracking but attempts to reduce allocations to zero (so performance is mostly determined /// by the delegate implementation which should be as specific as possible. Rename tracking can be computed on top of the changes /// received by the `delegate`. 
/// * cycle checking is not performed, but can be performed in the delegate which can return /// [`tree::visit::Action::Cancel`](crate::tree::visit::Action::Cancel) to stop the traversal. /// /// [git_cmp_c]: https://github.com/git/git/blob/ef8ce8f3d4344fd3af049c17eeba5cd20d98b69f/tree-diff.c#L72-L88 /// [git_cmp_rs]: https://github.com/GitoxideLabs/gitoxide/blob/795962b107d86f58b1f7c75006da256d19cc80ad/gix-object/src/tree/mod.rs#L263-L273 #[doc(alias = "diff_tree_to_tree", alias = "git2")] pub fn diff( lhs: TreeRefIter<'_>, rhs: TreeRefIter<'_>, mut state: StateMut, objects: impl gix_object::Find, delegate: &mut impl Visit, ) -> Result<(), Error> where StateMut: BorrowMut, { let state = state.borrow_mut(); state.clear(); let mut lhs_entries = peekable(lhs); let mut rhs_entries = peekable(rhs); let mut relation = None; let mut pop_path = false; loop { if pop_path { delegate.pop_path_component(); } pop_path = true; match (lhs_entries.next(), rhs_entries.next()) { (None, None) => { match state.trees.pop_front() { Some((None, Some(rhs), relation_to_propagate)) => { delegate.pop_front_tracked_path_and_set_current(); relation = relation_to_propagate; rhs_entries = peekable(objects.find_tree_iter(&rhs, &mut state.buf2)?); } Some((Some(lhs), Some(rhs), relation_to_propagate)) => { delegate.pop_front_tracked_path_and_set_current(); lhs_entries = peekable(objects.find_tree_iter(&lhs, &mut state.buf1)?); rhs_entries = peekable(objects.find_tree_iter(&rhs, &mut state.buf2)?); relation = relation_to_propagate; } Some((Some(lhs), None, relation_to_propagate)) => { delegate.pop_front_tracked_path_and_set_current(); lhs_entries = peekable(objects.find_tree_iter(&lhs, &mut state.buf1)?); relation = relation_to_propagate; } Some((None, None, _)) => unreachable!("BUG: it makes no sense to fill the stack with empties"), None => return Ok(()), }; pop_path = false; } (Some(lhs), Some(rhs)) => { use std::cmp::Ordering::*; let (lhs, rhs) = (lhs?, rhs?); match compare(&lhs, &rhs) { Equal => handle_lhs_and_rhs_with_equal_filenames( lhs, rhs, &mut state.trees, &mut state.change_id, relation, delegate, )?, Less => catchup_lhs_with_rhs( &mut lhs_entries, lhs, rhs, &mut state.trees, &mut state.change_id, relation, delegate, )?, Greater => catchup_rhs_with_lhs( &mut rhs_entries, lhs, rhs, &mut state.trees, &mut state.change_id, relation, delegate, )?, } } (Some(lhs), None) => { let lhs = lhs?; delete_entry_schedule_recursion(lhs, &mut state.trees, &mut state.change_id, relation, delegate)?; } (None, Some(rhs)) => { let rhs = rhs?; add_entry_schedule_recursion(rhs, &mut state.trees, &mut state.change_id, relation, delegate)?; } } } } fn compare(a: &EntryRef<'_>, b: &EntryRef<'_>) -> std::cmp::Ordering { let common = a.filename.len().min(b.filename.len()); a.filename[..common].cmp(&b.filename[..common]).then_with(|| { let a = a.filename.get(common).or_else(|| a.mode.is_tree().then_some(&b'/')); let b = b.filename.get(common).or_else(|| b.mode.is_tree().then_some(&b'/')); a.cmp(&b) }) } fn delete_entry_schedule_recursion( entry: EntryRef<'_>, queue: &mut VecDeque, change_id: &mut ChangeId, relation_to_propagate: Option, delegate: &mut impl Visit, ) -> Result<(), Error> { delegate.push_path_component(entry.filename); let relation = relation_to_propagate.or_else(|| { entry.mode.is_tree().then(|| { *change_id += 1; Relation::Parent(*change_id) }) }); let is_cancelled = delegate .visit(Change::Deletion { entry_mode: entry.mode, oid: entry.oid.to_owned(), relation, }) .cancelled(); if is_cancelled { return 
Err(Error::Cancelled); } if entry.mode.is_tree() { delegate.pop_path_component(); delegate.push_back_tracked_path_component(entry.filename); queue.push_back((Some(entry.oid.to_owned()), None, to_child(relation))); } Ok(()) } fn add_entry_schedule_recursion( entry: EntryRef<'_>, queue: &mut VecDeque, change_id: &mut ChangeId, relation_to_propagate: Option, delegate: &mut impl Visit, ) -> Result<(), Error> { delegate.push_path_component(entry.filename); let relation = relation_to_propagate.or_else(|| { entry.mode.is_tree().then(|| { *change_id += 1; Relation::Parent(*change_id) }) }); if delegate .visit(Change::Addition { entry_mode: entry.mode, oid: entry.oid.to_owned(), relation, }) .cancelled() { return Err(Error::Cancelled); } if entry.mode.is_tree() { delegate.pop_path_component(); delegate.push_back_tracked_path_component(entry.filename); queue.push_back((None, Some(entry.oid.to_owned()), to_child(relation))); } Ok(()) } fn catchup_rhs_with_lhs( rhs_entries: &mut IteratorType>, lhs: EntryRef<'_>, rhs: EntryRef<'_>, queue: &mut VecDeque, change_id: &mut ChangeId, relation_to_propagate: Option, delegate: &mut impl Visit, ) -> Result<(), Error> { use std::cmp::Ordering::*; add_entry_schedule_recursion(rhs, queue, change_id, relation_to_propagate, delegate)?; loop { match rhs_entries.peek() { Some(Ok(rhs)) => match compare(&lhs, rhs) { Equal => { let rhs = rhs_entries.next().transpose()?.expect("the peeked item to be present"); delegate.pop_path_component(); handle_lhs_and_rhs_with_equal_filenames( lhs, rhs, queue, change_id, relation_to_propagate, delegate, )?; break; } Greater => { let rhs = rhs_entries.next().transpose()?.expect("the peeked item to be present"); delegate.pop_path_component(); add_entry_schedule_recursion(rhs, queue, change_id, relation_to_propagate, delegate)?; } Less => { delegate.pop_path_component(); delete_entry_schedule_recursion(lhs, queue, change_id, relation_to_propagate, delegate)?; break; } }, Some(Err(err)) => return Err(Error::EntriesDecode(err.to_owned())), None => { delegate.pop_path_component(); delete_entry_schedule_recursion(lhs, queue, change_id, relation_to_propagate, delegate)?; break; } } } Ok(()) } fn catchup_lhs_with_rhs( lhs_entries: &mut IteratorType>, lhs: EntryRef<'_>, rhs: EntryRef<'_>, queue: &mut VecDeque, change_id: &mut ChangeId, relation_to_propagate: Option, delegate: &mut impl Visit, ) -> Result<(), Error> { use std::cmp::Ordering::*; delete_entry_schedule_recursion(lhs, queue, change_id, relation_to_propagate, delegate)?; loop { match lhs_entries.peek() { Some(Ok(lhs)) => match compare(lhs, &rhs) { Equal => { let lhs = lhs_entries.next().expect("the peeked item to be present")?; delegate.pop_path_component(); handle_lhs_and_rhs_with_equal_filenames( lhs, rhs, queue, change_id, relation_to_propagate, delegate, )?; break; } Less => { let lhs = lhs_entries.next().expect("the peeked item to be present")?; delegate.pop_path_component(); delete_entry_schedule_recursion(lhs, queue, change_id, relation_to_propagate, delegate)?; } Greater => { delegate.pop_path_component(); add_entry_schedule_recursion(rhs, queue, change_id, relation_to_propagate, delegate)?; break; } }, Some(Err(err)) => return Err(Error::EntriesDecode(err.to_owned())), None => { delegate.pop_path_component(); add_entry_schedule_recursion(rhs, queue, change_id, relation_to_propagate, delegate)?; break; } } } Ok(()) } fn handle_lhs_and_rhs_with_equal_filenames( lhs: EntryRef<'_>, rhs: EntryRef<'_>, queue: &mut VecDeque, change_id: &mut ChangeId, relation_to_propagate: Option, 
delegate: &mut impl Visit, ) -> Result<(), Error> { match (lhs.mode.is_tree(), rhs.mode.is_tree()) { (true, true) => { delegate.push_back_tracked_path_component(lhs.filename); if lhs.oid != rhs.oid && delegate .visit(Change::Modification { previous_entry_mode: lhs.mode, previous_oid: lhs.oid.to_owned(), entry_mode: rhs.mode, oid: rhs.oid.to_owned(), }) .cancelled() { return Err(Error::Cancelled); } queue.push_back(( Some(lhs.oid.to_owned()), Some(rhs.oid.to_owned()), relation_to_propagate, )); } (_, true) => { delegate.push_back_tracked_path_component(lhs.filename); if delegate .visit(Change::Deletion { entry_mode: lhs.mode, oid: lhs.oid.to_owned(), relation: None, }) .cancelled() { return Err(Error::Cancelled); }; let relation = relation_to_propagate.or_else(|| { *change_id += 1; Some(Relation::Parent(*change_id)) }); if delegate .visit(Change::Addition { entry_mode: rhs.mode, oid: rhs.oid.to_owned(), relation, }) .cancelled() { return Err(Error::Cancelled); }; queue.push_back((None, Some(rhs.oid.to_owned()), to_child(relation))); } (true, _) => { delegate.push_back_tracked_path_component(lhs.filename); let relation = relation_to_propagate.or_else(|| { *change_id += 1; Some(Relation::Parent(*change_id)) }); if delegate .visit(Change::Deletion { entry_mode: lhs.mode, oid: lhs.oid.to_owned(), relation, }) .cancelled() { return Err(Error::Cancelled); } if delegate .visit(Change::Addition { entry_mode: rhs.mode, oid: rhs.oid.to_owned(), relation: None, }) .cancelled() { return Err(Error::Cancelled); }; queue.push_back((Some(lhs.oid.to_owned()), None, to_child(relation))); } (false, false) => { delegate.push_path_component(lhs.filename); debug_assert!(lhs.mode.is_no_tree() && lhs.mode.is_no_tree()); if (lhs.oid != rhs.oid || lhs.mode != rhs.mode) && delegate .visit(Change::Modification { previous_entry_mode: lhs.mode, previous_oid: lhs.oid.to_owned(), entry_mode: rhs.mode, oid: rhs.oid.to_owned(), }) .cancelled() { return Err(Error::Cancelled); } } }; Ok(()) } type IteratorType = std::iter::Peekable; fn to_child(r: Option) -> Option { r.map(|r| match r { Relation::Parent(id) => Relation::ChildOfParent(id), Relation::ChildOfParent(id) => Relation::ChildOfParent(id), }) } fn peekable(iter: I) -> IteratorType { iter.peekable() } #[cfg(test)] mod tests { use std::cmp::Ordering; use gix_object::tree::EntryKind; use super::*; #[test] fn compare_select_samples() { let null = gix_hash::ObjectId::null(gix_hash::Kind::Sha1); let actual = compare( &EntryRef { mode: EntryKind::Blob.into(), filename: "plumbing-cli.rs".into(), oid: &null, }, &EntryRef { mode: EntryKind::Tree.into(), filename: "plumbing".into(), oid: &null, }, ); assert_eq!(actual, Ordering::Less); let actual = compare( &EntryRef { mode: EntryKind::Tree.into(), filename: "plumbing-cli.rs".into(), oid: &null, }, &EntryRef { mode: EntryKind::Blob.into(), filename: "plumbing".into(), oid: &null, }, ); assert_eq!(actual, Ordering::Greater); } } gix-diff-0.49.0/src/tree/mod.rs000064400000000000000000000047041046102023000142730ustar 00000000000000use crate::tree::visit::Relation; use bstr::BStr; use gix_hash::ObjectId; use gix_object::bstr::BString; use std::collections::VecDeque; /// The error returned by [`tree()`](super::tree()). 
#[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error(transparent)] Find(#[from] gix_object::find::existing_iter::Error), #[error("The delegate cancelled the operation")] Cancelled, #[error(transparent)] EntriesDecode(#[from] gix_object::decode::Error), } /// A trait to allow responding to a traversal designed to figure out the [changes](visit::Change) /// to turn tree A into tree B. pub trait Visit { /// Sets the full path in front of the queue so future calls to push and pop components affect it instead. fn pop_front_tracked_path_and_set_current(&mut self); /// Append a `component` to the end of a path, which may be empty. fn push_back_tracked_path_component(&mut self, component: &BStr); /// Append a `component` to the end of a path, which may be empty. fn push_path_component(&mut self, component: &BStr); /// Removes the last component from the path, which may leave it empty. fn pop_path_component(&mut self); /// Record a `change` and return an instruction whether to continue or not. /// /// The implementation may use the current path to lean where in the tree the change is located. fn visit(&mut self, change: visit::Change) -> visit::Action; } /// The state required to run [tree-diffs](super::tree()). #[derive(Default, Clone)] pub struct State { /// A buffer for object data. pub buf1: Vec, /// Another buffer for object data. pub buf2: Vec, trees: VecDeque, change_id: visit::ChangeId, } type TreeInfoTuple = (Option, Option, Option); impl State { fn clear(&mut self) { self.trees.clear(); self.buf1.clear(); self.buf2.clear(); self.change_id = 0; } } pub(super) mod function; /// pub mod visit; /// A [Visit] implementation to record every observed change and keep track of the changed paths. #[derive(Clone, Debug)] pub struct Recorder { path_deque: VecDeque, path: BString, location: Option, /// The observed changes. pub records: Vec, } /// Useful for use as delegate implementing [`Visit`] to keep track of all seen changes. Useful for debugging or printing primarily. pub mod recorder; gix-diff-0.49.0/src/tree/recorder.rs000064400000000000000000000111461046102023000153170ustar 00000000000000use gix_hash::ObjectId; use gix_object::{ bstr::{BStr, BString, ByteSlice, ByteVec}, tree, }; use crate::tree::visit::Relation; use crate::tree::{visit, Recorder, Visit}; /// Describe how to track the location of a change. #[derive(Debug, Clone, Copy, Eq, PartialEq)] pub enum Location { /// Track the entire path, relative to the repository. Path, /// Keep only the file-name as location, which may be enough for some calculations. /// /// This is less expensive than tracking the entire `Path`. FileName, } /// A Change as observed by a call to [`visit(…)`](Visit::visit()), enhanced with the path affected by the change. /// Its similar to [`visit::Change`] but includes the path that changed. #[derive(Clone, Debug, PartialEq, Eq)] #[allow(missing_docs)] pub enum Change { Addition { entry_mode: tree::EntryMode, oid: ObjectId, path: BString, relation: Option, }, Deletion { entry_mode: tree::EntryMode, oid: ObjectId, path: BString, relation: Option, }, Modification { previous_entry_mode: tree::EntryMode, previous_oid: ObjectId, entry_mode: tree::EntryMode, oid: ObjectId, path: BString, }, } impl Default for Recorder { fn default() -> Self { Recorder { path_deque: Default::default(), path: Default::default(), location: Some(Location::Path), records: vec![], } } } /// Builder impl Recorder { /// Obtain a copy of the currently tracked, full path of the entry. 
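    ///
    /// A rough usage sketch (hypothetical, illustrating the builder-style call that configures location tracking):
    ///
    /// ```
    /// use gix_diff::tree::{recorder::Location, Recorder};
    ///
    /// // Track only file names instead of full repository-relative paths.
    /// let recorder = Recorder::default().track_location(Some(Location::FileName));
    /// // Passing `None` disables location tracking, leaving recorded paths empty.
    /// let no_locations = Recorder::default().track_location(None);
    /// ```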
pub fn track_location(mut self, location: Option) -> Self { self.location = location; self } } /// Access impl Recorder { /// Obtain a copy of the currently tracked, full path of the entry. pub fn path_clone(&self) -> BString { self.path.clone() } /// Return the currently set path. pub fn path(&self) -> &BStr { self.path.as_ref() } } impl Recorder { fn pop_element(&mut self) { if let Some(pos) = self.path.rfind_byte(b'/') { self.path.resize(pos, 0); } else { self.path.clear(); } } fn push_element(&mut self, name: &BStr) { if !self.path.is_empty() { self.path.push(b'/'); } self.path.push_str(name); } } impl Visit for Recorder { fn pop_front_tracked_path_and_set_current(&mut self) { if let Some(Location::Path) = self.location { self.path = self.path_deque.pop_front().expect("every parent is set only once"); } } fn push_back_tracked_path_component(&mut self, component: &BStr) { match self.location { None => {} Some(Location::Path) => { self.push_element(component); self.path_deque.push_back(self.path.clone()); } Some(Location::FileName) => { self.path.clear(); self.path.extend_from_slice(component); } } } fn push_path_component(&mut self, component: &BStr) { match self.location { None => {} Some(Location::Path) => { self.push_element(component); } Some(Location::FileName) => { self.path.clear(); self.path.extend_from_slice(component); } } } fn pop_path_component(&mut self) { if let Some(Location::Path) = self.location { self.pop_element(); } } fn visit(&mut self, change: visit::Change) -> visit::Action { use visit::Change::*; self.records.push(match change { Deletion { entry_mode, oid, relation, } => Change::Deletion { entry_mode, oid, path: self.path_clone(), relation, }, Addition { entry_mode, oid, relation, } => Change::Addition { entry_mode, oid, path: self.path_clone(), relation, }, Modification { previous_entry_mode, previous_oid, entry_mode, oid, } => Change::Modification { previous_entry_mode, previous_oid, entry_mode, oid, path: self.path_clone(), }, }); visit::Action::Continue } } gix-diff-0.49.0/src/tree/visit.rs000064400000000000000000000131631046102023000146510ustar 00000000000000use gix_hash::ObjectId; use gix_object::{tree, tree::EntryMode}; /// A way to recognize and associate different [`Change`] instances. /// /// These are unique only within one diff operation. pub type ChangeId = u32; /// Identifies a relationship between this instance and another one. #[derive(Debug, Copy, Clone, PartialOrd, PartialEq, Ord, Eq, Hash)] pub enum Relation { /// This is a parent with the given ID, which will always have at least one child /// assuming that empty directories are not allowed in valid trees. /// It's also always a tree which is the start of a recursive deletion or addition. /// /// The change with this relation is always emitted first. Parent(ChangeId), /// This is a direct or indirect child, tree or not tree, of the parent with the given ID. ChildOfParent(ChangeId), } /// Represents any possible change in order to turn one tree into another. #[derive(Debug, Clone, PartialOrd, PartialEq, Ord, Eq, Hash)] pub enum Change { /// An entry was added, like the addition of a file or directory. Addition { /// The mode of the added entry. entry_mode: tree::EntryMode, /// The object id of the added entry. oid: ObjectId, /// Possibly associate this change with another for hierarchical rename tracking. relation: Option, }, /// An entry was deleted, like the deletion of a file or directory. Deletion { /// The mode of the deleted entry. 
entry_mode: tree::EntryMode, /// The object id of the deleted entry. oid: ObjectId, /// Possibly associate this change with another for hierarchical rename tracking. relation: Option, }, /// An entry was modified, e.g. changing the contents of a file adjusts its object id and turning /// a file into a symbolic link adjusts its mode. Modification { /// The mode of the entry before the modification. previous_entry_mode: tree::EntryMode, /// The object id of the entry before the modification. previous_oid: ObjectId, /// The mode of the entry after the modification. entry_mode: tree::EntryMode, /// The object id after the modification. oid: ObjectId, }, } impl Change { /// Return the current object id. pub fn oid(&self) -> &gix_hash::oid { match self { Change::Addition { oid, .. } | Change::Deletion { oid, .. } | Change::Modification { oid, .. } => oid, } } /// Return the current tree entry mode. pub fn entry_mode(&self) -> EntryMode { match self { Change::Addition { entry_mode, .. } | Change::Deletion { entry_mode, .. } | Change::Modification { entry_mode, .. } => *entry_mode, } } /// Return the current object id and tree entry mode of a change. pub fn oid_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode) { match self { Change::Addition { oid, entry_mode, relation: _, } | Change::Deletion { oid, entry_mode, relation: _, } | Change::Modification { oid, entry_mode, .. } => (oid, *entry_mode), } } } /// What to do after a [Change] was [recorded](super::Visit::visit()). #[derive(Default, Clone, Copy, PartialOrd, PartialEq, Ord, Eq, Hash)] pub enum Action { /// Continue the traversal of changes. #[default] Continue, /// Stop the traversal of changes, making this the last call to [visit(…)](super::Visit::visit()). Cancel, } impl Action { /// Returns true if this action means to stop the traversal. pub fn cancelled(&self) -> bool { matches!(self, Action::Cancel) } } #[cfg(feature = "blob")] mod change_impls { use gix_hash::oid; use gix_object::tree::EntryMode; use crate::tree::visit::Relation; use crate::{rewrites::tracker::ChangeKind, tree::visit::Change}; impl crate::rewrites::tracker::Change for crate::tree::visit::Change { fn id(&self) -> &oid { match self { Change::Addition { oid, .. } | Change::Deletion { oid, .. } | Change::Modification { oid, .. } => oid, } } fn relation(&self) -> Option { match self { Change::Addition { relation, .. } | Change::Deletion { relation, .. } => *relation, Change::Modification { .. } => None, } } fn kind(&self) -> ChangeKind { match self { Change::Addition { .. } => ChangeKind::Addition, Change::Deletion { .. } => ChangeKind::Deletion, Change::Modification { .. } => ChangeKind::Modification, } } fn entry_mode(&self) -> EntryMode { match self { Change::Addition { entry_mode, .. } | Change::Deletion { entry_mode, .. } | Change::Modification { entry_mode, .. } => *entry_mode, } } fn id_and_entry_mode(&self) -> (&oid, EntryMode) { match self { Change::Addition { entry_mode, oid, .. } | Change::Deletion { entry_mode, oid, .. } | Change::Modification { entry_mode, oid, .. 
} => (oid, *entry_mode), } } } } #[cfg(test)] mod tests { use super::*; #[test] fn size_of_change() { let actual = std::mem::size_of::(); assert!( actual <= 48, "{actual} <= 48: this type shouldn't grow without us knowing" ); } } gix-diff-0.49.0/src/tree_with_rewrites/change.rs000064400000000000000000000545411046102023000177240ustar 00000000000000use crate::blob::{DiffLineStats, ResourceKind}; use crate::tree; use bstr::BString; use bstr::{BStr, ByteSlice}; /// Represents any possible change in order to turn one tree into another, which references data owned by its producer. #[derive(Debug, Clone, Copy, PartialEq)] pub enum ChangeRef<'a> { /// An entry was added, like the addition of a file or directory. Addition { /// The location of the file or directory. /// /// It may be empty if [file names](super::Options::location) is `None`. location: &'a BStr, /// The mode of the added entry. entry_mode: gix_object::tree::EntryMode, /// Identifies a relationship between this instance and another one, /// making it easy to reconstruct the top-level of directory changes. relation: Option, /// The object id of the added entry. id: gix_hash::ObjectId, }, /// An entry was deleted, like the deletion of a file or directory. Deletion { /// The location of the file or directory. /// /// It may be empty if [file names](super::Options::location) is `None`. /// are tracked. location: &'a BStr, /// The mode of the deleted entry. entry_mode: gix_object::tree::EntryMode, /// Identifies a relationship between this instance and another one, /// making it easy to reconstruct the top-level of directory changes. relation: Option, /// The object id of the deleted entry. id: gix_hash::ObjectId, }, /// An entry was modified, e.g. changing the contents of a file adjusts its object id and turning /// a file into a symbolic link adjusts its mode. Modification { /// The location of the file or directory. /// /// It may be empty if [file names](super::Options::location) is `None`. /// are tracked. location: &'a BStr, /// The mode of the entry before the modification. previous_entry_mode: gix_object::tree::EntryMode, /// The object id of the entry before the modification. previous_id: gix_hash::ObjectId, /// The mode of the entry after the modification. entry_mode: gix_object::tree::EntryMode, /// The object id after the modification. id: gix_hash::ObjectId, }, /// Entries are considered rewritten if they are not trees and they, according to some understanding of identity, were renamed /// or copied. /// In case of renames, this means they originally appeared as [`Deletion`](ChangeRef::Deletion) signalling their source as well as an /// [`Addition`](ChangeRef::Addition) acting as destination. /// /// In case of copies, the `copy` flag is true and typically represents a perfect copy of a source was made. /// /// This variant can only be encountered if [rewrite tracking](super::Options::rewrites) is enabled. /// /// Note that mode changes may have occurred as well, i.e. changes from executable to non-executable or vice-versa. Rewrite { /// The location of the source of the rename or copy operation. /// /// It may be empty if [file names](super::Options::location) is `None`. /// are tracked. source_location: &'a BStr, /// The mode of the entry before the rename. source_entry_mode: gix_object::tree::EntryMode, /// Identifies a relationship between the source and another source, /// making it easy to reconstruct the top-level of directory changes. source_relation: Option, /// The object id of the entry before the rename. 
/// /// Note that this is the same as `id` if we require the [similarity to be 100%](super::Rewrites::percentage), but may /// be different otherwise. source_id: gix_hash::ObjectId, /// Information about the diff we performed to detect similarity and match the `source_id` with the current state at `id`. /// It's `None` if `source_id` is equal to `id`, as identity made an actual diff computation unnecessary. diff: Option<DiffLineStats>, /// The mode of the entry after the rename. /// It could differ but still be considered a rename as we are concerned only about content. entry_mode: gix_object::tree::EntryMode, /// The object id after the rename. id: gix_hash::ObjectId, /// The location after the rename or copy operation. /// /// It may be empty if [file names](super::Options::location) is `None`. location: &'a BStr, /// Identifies a relationship between this destination and another destination, /// making it easy to reconstruct the top-level of directory changes. relation: Option<tree::visit::Relation>, /// If true, this rewrite is created by copy, and `source_id` is pointing to its source. Otherwise, it's a rename, and `source_id` /// points to a deleted object, as renames are tracked as deletions and additions of the same or similar content. copy: bool, }, } /// Represents any possible change in order to turn one tree into another, with fully-owned data. #[derive(Debug, Clone, PartialEq)] pub enum Change { /// An entry was added, like the addition of a file or directory. Addition { /// The location of the file or directory. /// /// It may be empty if [file names](super::Options::location) is `None`. location: BString, /// Identifies a relationship between this instance and another one, /// making it easy to reconstruct the top-level of directory changes. relation: Option<tree::visit::Relation>, /// The mode of the added entry. entry_mode: gix_object::tree::EntryMode, /// The object id of the added entry. id: gix_hash::ObjectId, }, /// An entry was deleted, like the deletion of a file or directory. Deletion { /// The location of the file or directory. /// /// It may be empty if [file names](super::Options::location) is `None`. location: BString, /// Identifies a relationship between this instance and another one, /// making it easy to reconstruct the top-level of directory changes. relation: Option<tree::visit::Relation>, /// The mode of the deleted entry. entry_mode: gix_object::tree::EntryMode, /// The object id of the deleted entry. id: gix_hash::ObjectId, }, /// An entry was modified, e.g. changing the contents of a file adjusts its object id and turning /// a file into a symbolic link adjusts its mode. Modification { /// The location of the file or directory. /// /// It may be empty if [file names](super::Options::location) is `None`. location: BString, /// The mode of the entry before the modification. previous_entry_mode: gix_object::tree::EntryMode, /// The object id of the entry before the modification. previous_id: gix_hash::ObjectId, /// The mode of the entry after the modification. entry_mode: gix_object::tree::EntryMode, /// The object id after the modification. id: gix_hash::ObjectId, }, /// Entries are considered rewritten if they are not trees and they, according to some understanding of identity, were renamed /// or copied. /// In case of renames, this means they originally appeared as [`Deletion`](ChangeRef::Deletion) signalling their source as well as an /// [`Addition`](ChangeRef::Addition) acting as destination. /// /// In case of copies, the `copy` flag is true and typically indicates that a perfect copy of a source was made.
/// /// This variant can only be encountered if [rewrite tracking](super::Options::rewrites) is enabled. /// /// Note that mode changes may have occurred as well, i.e. changes from executable to non-executable or vice-versa. Rewrite { /// The location of the source of the rename or copy operation. /// /// It may be empty if [file names](super::Options::location) is `None`. source_location: BString, /// The mode of the entry before the rename. source_entry_mode: gix_object::tree::EntryMode, /// Identifies a relationship between the source and another source, /// making it easy to reconstruct the top-level of directory changes. source_relation: Option<tree::visit::Relation>, /// The object id of the entry before the rename. /// /// Note that this is the same as `id` if we require the [similarity to be 100%](super::Rewrites::percentage), but may /// be different otherwise. source_id: gix_hash::ObjectId, /// Information about the diff we performed to detect similarity and match the `source_id` with the current state at `id`. /// It's `None` if `source_id` is equal to `id`, as identity made an actual diff computation unnecessary. diff: Option<DiffLineStats>, /// The mode of the entry after the rename. /// It could differ but still be considered a rename as we are concerned only about content. entry_mode: gix_object::tree::EntryMode, /// The object id after the rename. id: gix_hash::ObjectId, /// The location after the rename or copy operation. /// /// It may be empty if [file names](super::Options::location) is `None`. location: BString, /// Identifies a relationship between this destination and another destination, /// making it easy to reconstruct the top-level of directory changes. relation: Option<tree::visit::Relation>, /// If true, this rewrite is created by copy, and `source_id` is pointing to its source. Otherwise, it's a rename, and `source_id` /// points to a deleted object, as renames are tracked as deletions and additions of the same or similar content. copy: bool, }, } /// Lifecycle impl ChangeRef<'_> { /// Copy this instance into a fully-owned version. pub fn into_owned(self) -> Change { match self { ChangeRef::Addition { location, entry_mode, id, relation, } => Change::Addition { location: location.to_owned(), entry_mode, id, relation, }, ChangeRef::Deletion { location, entry_mode, id, relation, } => Change::Deletion { location: location.to_owned(), entry_mode, id, relation, }, ChangeRef::Modification { location, previous_entry_mode, previous_id, entry_mode, id, } => Change::Modification { location: location.to_owned(), previous_entry_mode, previous_id, entry_mode, id, }, ChangeRef::Rewrite { source_location, source_relation, source_entry_mode, source_id, diff, entry_mode, id, location, relation, copy, } => Change::Rewrite { source_location: source_location.to_owned(), source_relation, source_entry_mode, source_id, diff, entry_mode, id, location: location.to_owned(), relation, copy, }, } } } /// Lifecycle impl Change { /// Return a borrowed version of this instance that references the data owned by `self`.
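    ///
    /// A minimal, hypothetical usage sketch (the `change` binding below is assumed to be an existing owned `Change`):
    /// ```ignore
    /// let borrowed: ChangeRef<'_> = change.to_ref();
    /// assert_eq!(borrowed.location(), change.location());
    /// ```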
pub fn to_ref(&self) -> ChangeRef<'_> { match self { Change::Addition { location, relation, entry_mode, id, } => ChangeRef::Addition { location: location.as_bstr(), entry_mode: *entry_mode, id: *id, relation: *relation, }, Change::Deletion { location, relation, entry_mode, id, } => ChangeRef::Deletion { location: location.as_bstr(), entry_mode: *entry_mode, id: *id, relation: *relation, }, Change::Modification { location, previous_entry_mode, previous_id, entry_mode, id, } => ChangeRef::Modification { location: location.as_bstr(), previous_entry_mode: *previous_entry_mode, previous_id: *previous_id, entry_mode: *entry_mode, id: *id, }, Change::Rewrite { source_location, source_relation, source_entry_mode, source_id, diff, entry_mode, id, location, relation, copy, } => ChangeRef::Rewrite { source_location: source_location.as_ref(), source_relation: *source_relation, source_entry_mode: *source_entry_mode, source_id: *source_id, diff: *diff, entry_mode: *entry_mode, id: *id, location: location.as_bstr(), relation: *relation, copy: *copy, }, } } } impl crate::blob::Platform { /// Set ourselves up to produce blob-diffs from `change`, so this platform can be used to produce diffs easily. /// `objects` are used to fetch object data as needed. /// /// ### Warning about Memory Consumption /// /// This instance only grows, so one should call [`crate::blob::Platform::clear_resource_cache`] occasionally. pub fn set_resource_by_change( &mut self, change: ChangeRef<'_>, objects: &impl gix_object::FindObjectOrHeader, ) -> Result<&mut Self, crate::blob::platform::set_resource::Error> { match change { ChangeRef::Addition { location, relation: _, entry_mode, id, } => { self.set_resource( id.kind().null(), entry_mode.kind(), location, ResourceKind::OldOrSource, objects, )?; self.set_resource(id, entry_mode.kind(), location, ResourceKind::NewOrDestination, objects)?; } ChangeRef::Deletion { location, relation: _, entry_mode, id, } => { self.set_resource(id, entry_mode.kind(), location, ResourceKind::OldOrSource, objects)?; self.set_resource( id.kind().null(), entry_mode.kind(), location, ResourceKind::NewOrDestination, objects, )?; } ChangeRef::Modification { location, previous_entry_mode, previous_id, entry_mode, id, } => { self.set_resource( previous_id, previous_entry_mode.kind(), location, ResourceKind::OldOrSource, objects, )?; self.set_resource(id, entry_mode.kind(), location, ResourceKind::NewOrDestination, objects)?; } ChangeRef::Rewrite { source_location, source_relation: _, source_entry_mode, source_id, entry_mode, id, location, relation: _, diff: _, copy: _, } => { self.set_resource( source_id, source_entry_mode.kind(), source_location, ResourceKind::OldOrSource, objects, )?; self.set_resource(id, entry_mode.kind(), location, ResourceKind::NewOrDestination, objects)?; } } Ok(self) } } impl<'a> ChangeRef<'a> { /// Return the relation this instance may have to other changes. pub fn relation(&self) -> Option<tree::visit::Relation> { match self { ChangeRef::Addition { relation, .. } | ChangeRef::Deletion { relation, .. } | ChangeRef::Rewrite { relation, .. } => *relation, ChangeRef::Modification { .. } => None, } } /// Return the current mode of this instance. pub fn entry_mode(&self) -> gix_object::tree::EntryMode { match self { ChangeRef::Addition { entry_mode, .. } | ChangeRef::Deletion { entry_mode, .. } | ChangeRef::Modification { entry_mode, .. } | ChangeRef::Rewrite { entry_mode, .. } => *entry_mode, } } /// Return the current mode of this instance, along with its object id.
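    ///
    /// A minimal, hypothetical sketch (assuming `change` is a `ChangeRef<'_>` received from a diff callback):
    /// ```ignore
    /// let (mode, id) = change.entry_mode_and_id();
    /// if mode.is_blob() {
    ///     println!("blob {id} at {}", change.location());
    /// }
    /// ```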
pub fn entry_mode_and_id(&self) -> (gix_object::tree::EntryMode, &gix_hash::oid) { match self { ChangeRef::Addition { entry_mode, id, .. } | ChangeRef::Deletion { entry_mode, id, .. } | ChangeRef::Modification { entry_mode, id, .. } | ChangeRef::Rewrite { entry_mode, id, .. } => (*entry_mode, id), } } /// Return the *previous* mode and id of the resource where possible, i.e. the source of a rename or copy, or a modification. pub fn source_entry_mode_and_id(&self) -> (gix_object::tree::EntryMode, &gix_hash::oid) { match self { ChangeRef::Addition { entry_mode, id, .. } | ChangeRef::Deletion { entry_mode, id, .. } | ChangeRef::Modification { previous_entry_mode: entry_mode, previous_id: id, .. } | ChangeRef::Rewrite { source_entry_mode: entry_mode, source_id: id, .. } => (*entry_mode, id), } } /// Return the *current* location of the resource, i.e. the destination of a rename or copy, or the /// location at which an addition, deletion or modification took place. pub fn location(&self) -> &'a BStr { match self { ChangeRef::Addition { location, .. } | ChangeRef::Deletion { location, .. } | ChangeRef::Modification { location, .. } | ChangeRef::Rewrite { location, .. } => location, } } /// Return the *previous* location of the resource where possible, i.e. the source of a rename or copy, or the /// location at which an addition, deletion or modification took place. pub fn source_location(&self) -> &BStr { match self { ChangeRef::Addition { location, .. } | ChangeRef::Deletion { location, .. } | ChangeRef::Modification { location, .. } => location, ChangeRef::Rewrite { source_location, .. } => source_location, } } } impl Change { /// Return the relation this instance may have to other changes. pub fn relation(&self) -> Option { match self { Change::Addition { relation, .. } | Change::Deletion { relation, .. } | Change::Rewrite { relation, .. } => *relation, Change::Modification { .. } => None, } } /// Return the current mode of this instance. pub fn entry_mode(&self) -> gix_object::tree::EntryMode { match self { Change::Addition { entry_mode, .. } | Change::Deletion { entry_mode, .. } | Change::Modification { entry_mode, .. } | Change::Rewrite { entry_mode, .. } => *entry_mode, } } /// Return the current mode of this instance, along with its object id. pub fn entry_mode_and_id(&self) -> (gix_object::tree::EntryMode, &gix_hash::oid) { match self { Change::Addition { entry_mode, id, .. } | Change::Deletion { entry_mode, id, .. } | Change::Modification { entry_mode, id, .. } | Change::Rewrite { entry_mode, id, .. } => (*entry_mode, id), } } /// Return the *previous* mode and id of the resource where possible, i.e. the source of a rename or copy, or a modification. pub fn source_entry_mode_and_id(&self) -> (gix_object::tree::EntryMode, &gix_hash::oid) { match self { Change::Addition { entry_mode, id, .. } | Change::Deletion { entry_mode, id, .. } | Change::Modification { previous_entry_mode: entry_mode, previous_id: id, .. } | Change::Rewrite { source_entry_mode: entry_mode, source_id: id, .. } => (*entry_mode, id), } } /// Return the *current* location of the resource, i.e. the destination of a rename or copy, or the /// location at which an addition, deletion or modification took place. pub fn location(&self) -> &BStr { match self { Change::Addition { location, .. } | Change::Deletion { location, .. } | Change::Modification { location, .. } | Change::Rewrite { location, .. } => location.as_bstr(), } } /// Return the *previous* location of the resource where possible, i.e. 
the source of a rename or copy, or the /// location at which an addition, deletion or modification took place. pub fn source_location(&self) -> &BStr { match self { Change::Addition { location, .. } | Change::Deletion { location, .. } | Change::Modification { location, .. } => location.as_bstr(), Change::Rewrite { source_location, .. } => source_location.as_bstr(), } } } gix-diff-0.49.0/src/tree_with_rewrites/function.rs000064400000000000000000000243601046102023000203200ustar 00000000000000use bstr::BStr; use gix_object::TreeRefIter; use super::{Action, ChangeRef, Error, Options}; use crate::rewrites; use crate::rewrites::tracker; /// Call `for_each` repeatedly with all changes that are needed to convert `lhs` to `rhs`. /// Provide a `resource_cache` to speed up obtaining blobs for similarity checks. /// `tree_diff_state` can be used to re-use tree-diff memory between calls. /// `objects` are used to lookup trees while performing the diff. /// Use `options` to further configure how the rename tracking is performed. /// /// Reusing `resource_cache` between multiple invocations saves a lot of IOps as it avoids the creation /// of a temporary `resource_cache` that triggers reading or checking for multiple gitattribute files. /// Note that it's recommended to call [`clear_resource_cache()`](`crate::blob::Platform::clear_resource_cache()`) /// between the calls to avoid runaway memory usage, as the cache isn't limited. /// /// Note that to do rename tracking like `git` does, one has to configure the `resource_cache` with /// a conversion pipeline that uses [`crate::blob::pipeline::Mode::ToGit`]. /// /// `rhs` or `lhs` can be empty to indicate deletion or addition of an entire tree. /// /// Note that the rewrite outcome is only available if [rewrite-tracking was enabled](Options::rewrites). pub fn diff( lhs: TreeRefIter<'_>, rhs: TreeRefIter<'_>, resource_cache: &mut crate::blob::Platform, tree_diff_state: &mut crate::tree::State, objects: &impl gix_object::FindObjectOrHeader, for_each: impl FnMut(ChangeRef<'_>) -> Result, options: Options, ) -> Result, Error> where E: Into>, { let mut delegate = Delegate { src_tree: lhs, recorder: crate::tree::Recorder::default().track_location(options.location), visit: for_each, location: options.location, objects, tracked: options.rewrites.map(rewrites::Tracker::new), err: None, }; match crate::tree(lhs, rhs, tree_diff_state, objects, &mut delegate) { Ok(()) => { let outcome = delegate.process_tracked_changes(resource_cache)?; match delegate.err { Some(err) => Err(Error::ForEach(err.into())), None => Ok(outcome), } } Err(crate::tree::Error::Cancelled) => delegate .err .map_or(Err(Error::Diff(crate::tree::Error::Cancelled)), |err| { Err(Error::ForEach(err.into())) }), Err(err) => Err(err.into()), } } struct Delegate<'a, 'old, VisitFn, E, Objects> { src_tree: TreeRefIter<'old>, recorder: crate::tree::Recorder, objects: &'a Objects, visit: VisitFn, tracked: Option>, location: Option, err: Option, } impl Delegate<'_, '_, VisitFn, E, Objects> where Objects: gix_object::FindObjectOrHeader, VisitFn: for<'delegate> FnMut(ChangeRef<'_>) -> Result, E: Into>, { /// Call `visit` on an attached version of `change`. 
fn emit_change( change: crate::tree::visit::Change, location: &BStr, visit: &mut VisitFn, stored_err: &mut Option, ) -> crate::tree::visit::Action { use crate::tree::visit::Change::*; let change = match change { Addition { entry_mode, oid, relation, } => ChangeRef::Addition { location, relation, entry_mode, id: oid, }, Deletion { entry_mode, oid, relation, } => ChangeRef::Deletion { entry_mode, location, relation, id: oid, }, Modification { previous_entry_mode, previous_oid, entry_mode, oid, } => ChangeRef::Modification { location, previous_entry_mode, entry_mode, previous_id: previous_oid, id: oid, }, }; match visit(change) { Ok(Action::Cancel) => crate::tree::visit::Action::Cancel, Ok(Action::Continue) => crate::tree::visit::Action::Continue, Err(err) => { *stored_err = Some(err); crate::tree::visit::Action::Cancel } } } fn process_tracked_changes( &mut self, diff_cache: &mut crate::blob::Platform, ) -> Result, Error> { use crate::rewrites::tracker::Change as _; let tracked = match self.tracked.as_mut() { Some(t) => t, None => return Ok(None), }; let outcome = tracked.emit( |dest, source| match source { Some(source) => { let (oid, mode) = dest.change.oid_and_entry_mode(); let change = ChangeRef::Rewrite { source_location: source.location, source_entry_mode: source.entry_mode, source_id: source.id, source_relation: source.change.relation(), entry_mode: mode, id: oid.to_owned(), relation: dest.change.relation(), diff: source.diff, location: dest.location, copy: match source.kind { tracker::visit::SourceKind::Rename => false, tracker::visit::SourceKind::Copy => true, }, }; match (self.visit)(change) { Ok(Action::Cancel) => crate::tree::visit::Action::Cancel, Ok(Action::Continue) => crate::tree::visit::Action::Continue, Err(err) => { self.err = Some(err); crate::tree::visit::Action::Cancel } } } None => Self::emit_change(dest.change, dest.location, &mut self.visit, &mut self.err), }, diff_cache, self.objects, |push| { let mut delegate = tree_to_changes::Delegate::new(push, self.location); let state = gix_traverse::tree::breadthfirst::State::default(); gix_traverse::tree::breadthfirst(self.src_tree, state, self.objects, &mut delegate) }, )?; Ok(Some(outcome)) } } impl crate::tree::Visit for Delegate<'_, '_, VisitFn, E, Objects> where Objects: gix_object::FindObjectOrHeader, VisitFn: for<'delegate> FnMut(ChangeRef<'_>) -> Result, E: Into>, { fn pop_front_tracked_path_and_set_current(&mut self) { self.recorder.pop_front_tracked_path_and_set_current(); } fn push_back_tracked_path_component(&mut self, component: &BStr) { self.recorder.push_back_tracked_path_component(component); } fn push_path_component(&mut self, component: &BStr) { self.recorder.push_path_component(component); } fn pop_path_component(&mut self) { self.recorder.pop_path_component(); } fn visit(&mut self, change: crate::tree::visit::Change) -> crate::tree::visit::Action { match self.tracked.as_mut() { Some(tracked) => tracked .try_push_change(change, self.recorder.path()) .map_or(crate::tree::visit::Action::Continue, |change| { Self::emit_change(change, self.recorder.path(), &mut self.visit, &mut self.err) }), None => Self::emit_change(change, self.recorder.path(), &mut self.visit, &mut self.err), } } } mod tree_to_changes { use crate::tree::visit::Change; use gix_object::tree::EntryRef; use bstr::BStr; pub struct Delegate<'a> { push: &'a mut dyn FnMut(Change, &BStr), recorder: gix_traverse::tree::Recorder, } impl<'a> Delegate<'a> { pub fn new(push: &'a mut dyn FnMut(Change, &BStr), location: Option) -> Self { let location = 
location.map(|t| match t { crate::tree::recorder::Location::FileName => gix_traverse::tree::recorder::Location::FileName, crate::tree::recorder::Location::Path => gix_traverse::tree::recorder::Location::Path, }); Self { push, recorder: gix_traverse::tree::Recorder::default().track_location(location), } } } impl gix_traverse::tree::Visit for Delegate<'_> { fn pop_front_tracked_path_and_set_current(&mut self) { self.recorder.pop_front_tracked_path_and_set_current(); } fn push_back_tracked_path_component(&mut self, component: &BStr) { self.recorder.push_back_tracked_path_component(component); } fn push_path_component(&mut self, component: &BStr) { self.recorder.push_path_component(component); } fn pop_path_component(&mut self) { self.recorder.pop_path_component(); } fn visit_tree(&mut self, _entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action { gix_traverse::tree::visit::Action::Continue } fn visit_nontree(&mut self, entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action { if entry.mode.is_blob() { (self.push)( Change::Modification { previous_entry_mode: entry.mode, previous_oid: gix_hash::ObjectId::null(entry.oid.kind()), entry_mode: entry.mode, oid: entry.oid.to_owned(), }, self.recorder.path(), ); } gix_traverse::tree::visit::Action::Continue } } } gix-diff-0.49.0/src/tree_with_rewrites/mod.rs000064400000000000000000000025511046102023000172500ustar 00000000000000use crate::tree::recorder::Location; use crate::Rewrites; mod change; pub use change::{Change, ChangeRef}; /// The error returned by [`tree_with_rewrites()`](super::tree_with_rewrites()). #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error(transparent)] Diff(#[from] crate::tree::Error), #[error("The user-provided callback failed")] ForEach(#[source] Box<dyn std::error::Error + Send + Sync>), #[error("Failure during rename tracking")] RenameTracking(#[from] crate::rewrites::tracker::emit::Error), } /// Returned by the [`tree_with_rewrites()`](super::tree_with_rewrites()) function to control flow. #[derive(Default, Clone, Copy, PartialOrd, PartialEq, Ord, Eq, Hash)] pub enum Action { /// Continue the traversal of changes. #[default] Continue, /// Stop the traversal of changes and stop calling the function that returned it. Cancel, } /// Options for use in [`tree_with_rewrites()`](super::tree_with_rewrites()). #[derive(Default, Clone, Debug)] pub struct Options { /// Determine how locations of changes, i.e. their repository-relative path, should be tracked. /// If `None`, locations will always be empty. pub location: Option<Location>, /// If not `None`, rename tracking will be performed accordingly. pub rewrites: Option<Rewrites>, } pub(super) mod function;
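
// The following module is an illustrative usage sketch and not part of the published crate:
// it only shows how `Options` might be filled in to track repository-relative paths and to
// enable rename tracking with the default `Rewrites` settings.
#[cfg(test)]
mod options_usage_sketch {
    use super::Options;
    use crate::tree::recorder::Location;
    use crate::Rewrites;

    #[test]
    fn configure_location_and_rewrites() {
        // Track full paths and detect renames/copies using the default thresholds.
        let opts = Options {
            location: Some(Location::Path),
            rewrites: Some(Rewrites::default()),
        };
        assert!(opts.rewrites.is_some());
        assert!(matches!(opts.location, Some(Location::Path)));
    }
}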