rust-unixfs-0.4.0/.cargo_vcs_info.json0000644000000001440000000000100133430ustar { "git": { "sha1": "fc1a9f1a7e48546fb0f9155ee36d86831bbcd29b" }, "path_in_vcs": "unixfs" }rust-unixfs-0.4.0/CHANGELOG.md000064400000000000000000000000451046102023000137440ustar 00000000000000# 0.4.0 # 0.3.x - See commit historyrust-unixfs-0.4.0/Cargo.lock0000644000000623140000000000100113250ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "aho-corasick" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6748e8def348ed4d14996fa801f4122cd763fff530258cdc03f64b25f89d3a5a" dependencies = [ "memchr", ] [[package]] name = "anes" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anyhow" version = "1.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" [[package]] name = "arrayref" version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" [[package]] name = "arrayvec" version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "atty" version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ "hermit-abi", "libc", "winapi", ] [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "base-x" version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270" [[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "blake2b_simd" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c2f0dc9a68c6317d884f97cc36cf5a3d20ba14ce404227df55e1af708ab04bc" dependencies = [ "arrayref", "arrayvec", "constant_time_eq 0.2.6", ] [[package]] name = "blake2s_simd" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6637f448b9e61dfadbdcbae9a885fadee1f3eaffb1f8d3c1965d3ade8bdfd44f" dependencies = [ "arrayref", "arrayvec", "constant_time_eq 0.2.6", ] [[package]] name = "blake3" version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "199c42ab6972d92c9f8995f086273d25c42fc0f7b2a1fcefba465c1352d25ba5" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq 0.3.0", ] [[package]] name = "block-buffer" version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ "generic-array", ] [[package]] name = "byteorder" version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" version = "1.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" [[package]] name = "cast" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ "libc", ] [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "ciborium" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" dependencies = [ "ciborium-io", "ciborium-ll", "serde", ] [[package]] name = "ciborium-io" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" [[package]] name = "ciborium-ll" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" dependencies = [ "ciborium-io", "half", ] [[package]] name = "cid" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd94671561e36e4e7de75f753f577edafb0e7c05d6e4547229fdf7938fbcd2c3" dependencies = [ "core2", "multibase", "multihash", "serde", "unsigned-varint", ] [[package]] name = "clap" version = "3.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" dependencies = [ "bitflags", "clap_lex", "indexmap", "textwrap", ] [[package]] name = "clap_lex" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" dependencies = [ "os_str_bytes", ] [[package]] name = "constant_time_eq" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21a53c0a4d288377e7415b53dcfc3c04da5cdc2cc95c8d5ac178b58f0b861ad6" [[package]] name = "constant_time_eq" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" [[package]] name = "core2" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" dependencies = [ "memchr", ] [[package]] name = "cpufeatures" version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" dependencies = [ "libc", ] [[package]] name = "criterion" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" dependencies = [ "anes", "atty", "cast", "ciborium", "clap", "criterion-plot", "itertools", "lazy_static", "num-traits", "oorandom", "regex", "serde", "serde_derive", "serde_json", "tinytemplate", "walkdir", ] [[package]] name = "criterion-plot" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", "itertools", ] [[package]] name = "crypto-common" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ "generic-array", "typenum", ] [[package]] name = "data-encoding" version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2e66c9d817f1720209181c316d28635c050fa304f9c79e47a520882661b7308" [[package]] name = "data-encoding-macro" version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c904b33cc60130e1aeea4956ab803d08a3f4a0ca82d64ed757afac3891f2bb99" dependencies = [ "data-encoding", "data-encoding-macro-internal", ] [[package]] name = "data-encoding-macro-internal" version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fdf3fce3ce863539ec1d7fd1b6dcc3c645663376b43ed376bbf887733e4f772" dependencies = [ "data-encoding", "syn 1.0.109", ] [[package]] name = "digest" version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", ] [[package]] name = "either" version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" [[package]] name = "filetime" version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0" dependencies = [ "cfg-if", "libc", "redox_syscall", "windows-sys", ] [[package]] name = "fnv" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "generic-array" version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", ] [[package]] name = "half" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" [[package]] name = "hash_hasher" version = "2.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" [[package]] name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" [[package]] name = "hermit-abi" version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ "libc", ] [[package]] name = "hex-literal" version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ebdb29d2ea9ed0083cd8cece49bbd968021bd99b0849edb4a9a7ee0fdf6a4e0" [[package]] name = "indexmap" version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", "hashbrown", ] [[package]] name = "itertools" version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" dependencies = [ "either", ] [[package]] name = "itoa" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" [[package]] name = "keccak" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f6d5ed8676d904364de097082f4e7d240b571b67989ced0240f08b7f966f940" dependencies = [ "cpufeatures", ] [[package]] name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" version = "0.2.147" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" [[package]] name = "libipld" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1ccd6b8ffb3afee7081fcaec00e1b099fd1c7ccf35ba5729d88538fcc3b4599" dependencies = [ "fnv", "libipld-cbor", "libipld-cbor-derive", "libipld-core", "libipld-json", "libipld-macro", "libipld-pb", "log", "multihash", "thiserror", ] [[package]] name = "libipld-cbor" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77d98c9d1747aa5eef1cf099cd648c3fd2d235249f5fed07522aaebc348e423b" dependencies = [ "byteorder", "libipld-core", "thiserror", ] [[package]] name = "libipld-cbor-derive" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d5ba3a729b72973e456a1812b0afe2e176a376c1836cc1528e9fc98ae8cb838" dependencies = [ "proc-macro-crate", "proc-macro2", "quote", "syn 1.0.109", "synstructure", ] [[package]] name = "libipld-core" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5acd707e8d8b092e967b2af978ed84709eaded82b75effe6cb6f6cc797ef8158" dependencies = [ "anyhow", "cid", "core2", "multibase", "multihash", "thiserror", ] [[package]] name = "libipld-json" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25856def940047b07b25c33d4e66d248597049ab0202085215dc4dca0487731c" dependencies = [ "libipld-core", "multihash", "serde", "serde_json", ] [[package]] name = "libipld-macro" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71171c54214f866ae6722f3027f81dff0931e600e5a61e6b1b6a49ca0b5ed4ae" dependencies = [ "libipld-core", ] [[package]] name = "libipld-pb" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3f2d0f866c4cd5dc9aa8068c429ba478d2882a3a4b70ab56f7e9a0eddf5d16f" dependencies = [ "bytes", "libipld-core", "quick-protobuf", "thiserror", ] [[package]] name = "log" version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "memchr" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "multibase" version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b3539ec3c1f04ac9748a260728e855f261b4977f5c3406612c884564f329404" dependencies = [ "base-x", "data-encoding", "data-encoding-macro", ] [[package]] name = "multihash" version = "0.18.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "cfd8a792c1694c6da4f68db0a9d707c72bd260994da179e6030a5dcee00bb815" dependencies = [ "blake2b_simd", "blake2s_simd", "blake3", "core2", "digest", "multihash-derive", "sha2", "sha3", "unsigned-varint", ] [[package]] name = "multihash-derive" version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d6d4752e6230d8ef7adf7bd5d8c4b1f6561c1014c5ba9a37445ccefe18aa1db" dependencies = [ "proc-macro-crate", "proc-macro-error", "proc-macro2", "quote", "syn 1.0.109", "synstructure", ] [[package]] name = "num-traits" version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" dependencies = [ "autocfg", ] [[package]] name = "oorandom" version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "os_str_bytes" version = "6.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d5d9eb14b174ee9aa2ef96dc2b94637a2d4b6e7cb873c7e171f0c20c6cf3eac" [[package]] name = "proc-macro-crate" version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e17d47ce914bf4de440332250b0edd23ce48c005f59fab39d3335866b114f11a" dependencies = [ "thiserror", "toml", ] [[package]] name = "proc-macro-error" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", "syn 1.0.109", "version_check", ] [[package]] name = "proc-macro-error-attr" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" dependencies = [ "proc-macro2", "quote", "version_check", ] [[package]] name = "proc-macro2" version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" dependencies = [ "unicode-ident", ] [[package]] name = "quick-protobuf" version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d6da84cc204722a989e01ba2f6e1e276e190f22263d0cb6ce8526fcdb0d2e1f" dependencies = [ "byteorder", ] [[package]] name = "quote" version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" dependencies = [ "proc-macro2", ] [[package]] name = "redox_syscall" version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" dependencies = [ "bitflags", ] [[package]] name = "regex" version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a" dependencies = [ "aho-corasick", "memchr", "regex-automata", "regex-syntax", ] [[package]] name = "regex-automata" version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] [[package]] name = "regex-syntax" version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" [[package]] name = "rust-unixfs" version = "0.4.0" dependencies = [ "criterion", "either", "filetime", "hash_hasher", "hex-literal", "libc", "libipld", "multibase", "quick-protobuf", "sha2", "tar", ] [[package]] name = "ryu" version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" [[package]] name = "same-file" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" dependencies = [ "winapi-util", ] [[package]] name = "serde" version = "1.0.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f5db24220c009de9bd45e69fb2938f4b6d2df856aa9304ce377b3180f83b7c1" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ad697f7e0b65af4983a4ce8f56ed5b357e8d3c36651bf6a7e13639c17b8e670" dependencies = [ "proc-macro2", "quote", "syn 2.0.29", ] [[package]] name = "serde_json" version = "1.0.105" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "693151e1ac27563d6dbcec9dee9fbd5da8539b20fa14ad3752b2e6d363ace360" dependencies = [ "itoa", "ryu", "serde", ] [[package]] name = "sha2" version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" dependencies = [ "cfg-if", "cpufeatures", "digest", ] [[package]] name = "sha3" version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75872d278a8f37ef87fa0ddbda7802605cb18344497949862c0d4dcb291eba60" dependencies = [ "digest", "keccak", ] [[package]] name = "syn" version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "syn" version = "2.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c324c494eba9d92503e6f1ef2e6df781e78f6a7705a0202d9801b198807d518a" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "synstructure" version = "0.12.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" dependencies = [ "proc-macro2", "quote", "syn 1.0.109", "unicode-xid", ] [[package]] name = "tar" version = "0.4.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb" dependencies = [ "filetime", "libc", ] [[package]] name = "textwrap" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" dependencies = [ "proc-macro2", "quote", "syn 2.0.29", ] 
[[package]] name = "tinytemplate" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" dependencies = [ "serde", "serde_json", ] [[package]] name = "toml" version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" dependencies = [ "serde", ] [[package]] name = "typenum" version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "unicode-ident" version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" [[package]] name = "unicode-xid" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" [[package]] name = "unsigned-varint" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d86a8dc7f45e4c1b0d30e43038c38f274e77af056aa5f74b93c2cf9eb3c1c836" [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "walkdir" version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" dependencies = [ "same-file", "winapi-util", ] [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" dependencies = [ "winapi", ] [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" rust-unixfs-0.4.0/Cargo.toml0000644000000030130000000000100113370ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" name = "rust-unixfs" version = "0.4.0" authors = ["Rust-IPFS contributors"] description = "UnixFs tree support" readme = "README.md" license = "MIT OR Apache-2.0" repository = "https://github.com/dariusc93/rust-ipfs" [[bench]] name = "ingest-tar" harness = false [[bench]] name = "adder" harness = false [dependencies.either] version = "1.8" default-features = false [dependencies.filetime] version = "0.2" optional = true [dependencies.libipld] version = "0.16" [dependencies.quick-protobuf] version = "0.8" features = ["std"] default-features = false [dependencies.sha2] version = "0.10" default-features = false [dev-dependencies.criterion] version = "0.4" default-features = false [dev-dependencies.hash_hasher] version = "2.0.3" [dev-dependencies.hex-literal] version = "0.3" default-features = false [dev-dependencies.libc] version = "0.2" default-features = false [dev-dependencies.multibase] version = "0.9" default-features = false [dev-dependencies.tar] version = "0.4" default-features = false [features] default = ["filetime"] rust-unixfs-0.4.0/Cargo.toml.orig000064400000000000000000000017131046102023000150250ustar 00000000000000[package] authors = ["Rust-IPFS contributors"] description = "UnixFs tree support" edition = "2021" license = "MIT OR Apache-2.0" name = "rust-unixfs" readme = "README.md" repository = "https://github.com/dariusc93/rust-ipfs" version = "0.4.0" [features] default = ["filetime"] [dependencies] libipld = "0.16" either = { default-features = false, version = "1.8" } filetime = { optional = true, version = "0.2" } quick-protobuf = { default-features = false, features = [ "std", ], version = "0.8" } sha2 = { default-features = false, version = "0.10" } [dev-dependencies] hash_hasher = "2.0.3" hex-literal = { default-features = false, version = "0.3" } libc = { default-features = false, version = "0.2" } multibase = { default-features = false, version = "0.9" } tar 
= { default-features = false, version = "0.4" } criterion = { default-features = false, version = "0.4" } [[bench]] name = "ingest-tar" harness = false [[bench]] name = "adder" harness = false rust-unixfs-0.4.0/README.md000064400000000000000000000006221046102023000134130ustar 00000000000000# ipfs-unixfs ## Goals * blockstore API independent way to traverse the merkledag * the core read API does not deal with loading blocks * instead access to interesting `Cid`s is given ## Status * unfiltered walking of known unixfs trees * creation of balanced file trees * creation of non HAMT-sharded directory trees See the docs at https://docs.rs/ipfs-unixfs. ## License MIT or APL2. rust-unixfs-0.4.0/benches/adder.rs000064400000000000000000000053601046102023000151740ustar 00000000000000// The goal of this benchmark was initially to expose a supposed quadratic increase in time when // adding large files. // // The behaviour isn't quadratic, though there is a slowdown observed e.g.: // // size (B): throughput: // // 100 2.06 MBps // 174 2.07 MBps // 200 2.00 MBps // 30000 1.98 MBps // 30277 1.94 MBps // 31000 1.98 MBps // 60000 1.77 MBps // 60552 1.75 MBps // 60553 1.74 MBps // 70000 1.71 MBps // // // And feeding the "add" example (basis for the benchmark below) a 5GB file over stdin: // // 0.586 MB in 1.00s or 0.586 MBps // 0.866 MB in 2.00s or 0.280 MBps // 1.076 MB in 3.00s or 0.210 MBps // 1.245 MB in 4.00s or 0.169 MBps // 1.382 MB in 5.00s or 0.137 MBps // 1.508 MB in 6.00s or 0.126 MBps // 1.618 MB in 7.00s or 0.110 MBps // 1.719 MB in 8.00s or 0.102 MBps // 1.815 MB in 9.00s or 0.096 MBps // 1.904 MB in 10.00s or 0.088 MBps // 1.988 MB in 11.00s or 0.084 MBps // 2.070 MB in 12.00s or 0.082 MBps // 2.148 MB in 13.00s or 0.078 MBps // 2.225 MB in 14.00s or 0.077 MBps // 2.295 MB in 15.00s or 0.070 MBps // 2.365 MB in 16.00s or 0.070 MBps // 2.435 MB in 17.00s or 0.070 MBps // 2.502 MB in 18.00s or 0.067 MBps // 2.567 MB in 19.00s or 0.065 MBps // 2.630 MB in 20.00s or 0.063 MBps // 2.693 MB in 21.00s or 0.063 MBps // 2.752 MB in 22.00s or 0.060 MBps // 2.811 MB in 23.00s or 0.059 MBps // 2.869 MB in 24.00s or 0.058 MBps use criterion::{ criterion_group, criterion_main, BenchmarkId, Criterion, SamplingMode, Throughput, }; use rust_unixfs::file::adder::{Chunker, FileAdder}; pub fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("adder"); group.sample_size(30); for size in [175, 30277, 60553].iter() { group.sampling_mode(SamplingMode::Flat); group.throughput(Throughput::Bytes(*size as u64)); group.bench_with_input(BenchmarkId::from_parameter(size), size, |b, _size| { b.iter(|| run_adder(*size)); }); } } pub fn run_adder(size: usize) { // Setting a small chunker size should exacerbate the issue as the BalanceCollector needs to // work harder as a result. 
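// (Illustrative note, assuming Chunker::Size(n) means fixed n-byte chunks: with n == 1 every
// single-byte push() below completes a block, so the balanced collector does the maximum
// bookkeeping per input byte.)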
let chunker = Chunker::Size(1); let mut adder = FileAdder::builder().with_chunker(chunker).build(); let mut total = 0; while total < size { let (blocks, consumed) = adder.push(&[0]); blocks.count(); total += consumed; } assert_eq!(total, size); adder.finish().count(); } criterion_group!(benches, criterion_benchmark); criterion_main!(benches); rust-unixfs-0.4.0/benches/ingest-tar.rs000064400000000000000000000121221046102023000161640ustar 00000000000000use criterion::{black_box, criterion_group, criterion_main, Criterion}; use libipld::multihash::Multihash; pub fn criterion_benchmark(c: &mut Criterion) { let file = "benchmark.tar"; match std::fs::read(file) { Ok(tar_bytes) => { // warmup should take care of right sizing these let mut buffer = Vec::new(); let mut path = String::new(); c.bench_function("ingest-tar", |b| { b.iter(|| ingest_tar(&tar_bytes, &mut buffer, &mut path)) }); } Err(e) if e.kind() == std::io::ErrorKind::NotFound => { eprintln!("could not find {file:?}:"); eprintln!("please download a linux kernel and unpack it to enable benchmark. specific version doesn't matter."); } Err(e) => panic!("failed to read the {file:?}: {e}"), } } fn ingest_tar(bytes: &[u8], buffer: &mut Vec<u8>, path: &mut String) { use libipld::Cid; use rust_unixfs::dir::builder::{BufferingTreeBuilder, TreeOptions}; use rust_unixfs::file::adder::FileAdder; use sha2::{Digest, Sha256}; use std::io::Read; let mut archive = tar::Archive::new(std::io::Cursor::new(bytes)); let entries = archive.entries().unwrap(); let mut opts = TreeOptions::default(); opts.wrap_with_directory(); let mut tree = BufferingTreeBuilder::new(opts); for entry in entries { let mut entry = entry.expect("assuming good tar"); let path_bytes = entry.path_bytes(); let tmp_path = std::str::from_utf8(&path_bytes).unwrap(); path.clear(); path.push_str(tmp_path); if let Some(link_name) = entry.link_name_bytes() { let link_name = std::str::from_utf8(&link_name).expect("symlink targets should be utf8"); buffer.clear(); rust_unixfs::symlink::serialize_symlink_block(link_name, buffer); let len = buffer.len(); let mh = Multihash::wrap( libipld::multihash::Code::Sha2_256.into(), &Sha256::digest(&buffer), ) .unwrap(); let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0"); tree.put_link(path, cid, len as u64).unwrap(); // save the &buffer[..] continue; } if !path.ends_with('/') { // TODO: reusing of adder let mut adder = FileAdder::default(); // with the std::io::Read it'd be good to read into the fileadder, or read into ... // something. trying to access the buffer from inside FileAdder does not seem to be the // way to go.
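// (note: the size_hint() arithmetic right below pre-sizes `buffer` to the adder's
// preferred input chunk size and zero-fills it, so that entry.read() can write straight
// into the slice)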
// // reusing the buffers between files would make a lot of sense as well if let Some(needed) = adder.size_hint().checked_sub(buffer.capacity()) { buffer.reserve(needed); } if let Some(mut needed) = adder.size_hint().checked_sub(buffer.len()) { let zeros = [0u8; 8]; while needed > zeros.len() { buffer.extend_from_slice(&zeros[..]); needed -= zeros.len(); } buffer.extend(std::iter::repeat(0).take(needed)); } let mut total_written = 0usize; loop { match entry.read(&mut buffer[0..]).unwrap() { 0 => { let blocks = adder.finish(); let (cid, subtotal) = blocks .fold( None, |acc: Option<(Cid, usize)>, (cid, bytes): (Cid, Vec<u8>)| match acc { Some((_, total)) => Some((cid, total + bytes.len())), None => Some((cid, bytes.len())), }, ) .expect("this is probably always present"); total_written += subtotal; tree.put_link(path, cid, total_written as u64).unwrap(); break; } n => { let mut read = 0; while read < n { let (blocks, consumed) = adder.push(&buffer[read..n]); read += consumed; total_written += blocks.map(|(_, bytes)| bytes.len()).sum::<usize>(); } } } } } else { tree.set_metadata(&path[..path.len() - 1], rust_unixfs::Metadata::default()) .unwrap(); } } let mut iter = tree.build(); let mut last: Option<(Cid, u64, usize)> = None; while let Some(res) = iter.next_borrowed() { let res = res.unwrap(); last = Some((res.cid.to_owned(), res.total_size, res.block.len())); } let last = last.unwrap(); black_box(last); } criterion_group!(benches, criterion_benchmark); criterion_main!(benches); rust-unixfs-0.4.0/examples/add.rs000064400000000000000000000071431046102023000150550ustar 00000000000000use libipld::Cid; use rust_unixfs::file::adder::FileAdder; use std::fmt; use std::io::{BufRead, BufReader}; use std::time::Duration; fn main() { // read stdin, maybe produce stdout car?
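// The loop below follows the FileAdder push/consume protocol: fill_buf() borrows a chunk
// from stdin, push() returns the blocks completed so far plus the number of input bytes it
// consumed, and the rest of the chunk is pushed again until all of it has been consumed.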
let stdin = std::io::stdin(); let stdin = stdin.lock(); let mut adder = FileAdder::default(); let mut stdin = BufReader::with_capacity(adder.size_hint(), stdin); let mut stats = Stats::default(); let mut input = 0; let start = std::time::Instant::now(); loop { match stdin.fill_buf().unwrap() { x if x.is_empty() => { eprintln!("finishing"); eprintln!("{adder:?}"); let blocks = adder.finish(); stats.process(blocks); break; } x => { let mut total = 0; while total < x.len() { let (blocks, consumed) = adder.push(&x[total..]); stats.process(blocks); input += consumed; total += consumed; } assert_eq!(total, x.len()); stdin.consume(total); } } } let process_stats = get_process_stats(); eprintln!("{stats}"); let total = start.elapsed(); if let Some(process_stats) = process_stats { eprint!("{process_stats}, "); } eprintln!("total: {total:?}"); let megabytes = 1024.0 * 1024.0; eprintln!( "Input: {:.2} MB/s (read {} bytes)", (input as f64 / megabytes) / total.as_secs_f64(), input ); eprintln!( "Output: {:.2} MB/s", (stats.block_bytes as f64 / megabytes) / total.as_secs_f64() ); } struct ProcessStats { user_time: Duration, system_time: Duration, max_rss: i64, } impl fmt::Display for ProcessStats { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!( fmt, "Max RSS: {} KB, utime: {:?}, stime: {:?}", self.max_rss, self.user_time, self.system_time ) } } #[cfg(unix)] fn get_process_stats() -> Option<ProcessStats> { fn to_duration(tv: libc::timeval) -> Duration { assert!(tv.tv_sec >= 0); Duration::new(tv.tv_sec as u64, tv.tv_usec as u32) } let (max_rss, user_time, system_time) = unsafe { let mut rusage: libc::rusage = std::mem::zeroed(); let retval = libc::getrusage(libc::RUSAGE_SELF, &mut rusage as *mut _); assert_eq!(retval, 0); (rusage.ru_maxrss, rusage.ru_utime, rusage.ru_stime) }; let user_time = to_duration(user_time); let system_time = to_duration(system_time); Some(ProcessStats { user_time, system_time, max_rss, }) } #[cfg(not(unix))] fn get_process_stats() -> Option<ProcessStats> { None } #[derive(Default)] struct Stats { blocks: usize, block_bytes: u64, last: Option<Cid>, } impl fmt::Display for Stats { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { let hash = self.last.as_ref().unwrap().hash(); let cidv1 = Cid::new_v1(libipld::IpldCodec::DagPb.into(), hash.to_owned()); write!( fmt, "{} blocks, {} block bytes, {} or {}", self.blocks, self.block_bytes, self.last.as_ref().unwrap(), cidv1, ) } } impl Stats { fn process<I: Iterator<Item = (Cid, Vec<u8>)>>(&mut self, new_blocks: I) { for (cid, block) in new_blocks { self.last = Some(cid); self.blocks += 1; self.block_bytes += block.len() as u64; } } } rust-unixfs-0.4.0/examples/cat.rs000064400000000000000000000126321046102023000150730ustar 00000000000000use libipld::Cid; use rust_unixfs::file::{visit::IdleFileVisit, FileReadFailed}; use std::convert::TryFrom; use std::fmt; use std::io::{Error as IoError, Read, Write}; use std::path::PathBuf; fn main() { let cid = match std::env::args().nth(1).map(Cid::try_from) { Some(Ok(cid)) => cid, Some(Err(e)) => { eprintln!("Invalid cid given as argument: {e}"); std::process::exit(1); } None => { eprintln!("USAGE: {} CID\n", std::env::args().next().unwrap()); eprintln!( "Will walk the unixfs file pointed out by the CID from default go-ipfs 0.5 \ configuration flatfs blockstore and write all content to stdout."
); std::process::exit(0); } }; let ipfs_path = match std::env::var("IPFS_PATH") { Ok(s) => s, Err(e) => { eprintln!("IPFS_PATH is not set or could not be read: {e}"); std::process::exit(1); } }; let mut blocks = PathBuf::from(ipfs_path); blocks.push("blocks"); let blockstore = ShardedBlockStore { root: blocks }; match walk(blockstore, &cid) { Ok((read, content)) => { eprintln!("Content bytes: {content}"); eprintln!("Total bytes: {read}"); } Err(Error::OpeningFailed(e)) => { eprintln!("{e}\n"); eprintln!("This is likely caused by either:"); eprintln!(" - ipfs does not have the block"); eprintln!(" - ipfs is configured to use non-flatfs storage"); eprintln!(" - ipfs is configured to use flatfs with different sharding"); std::process::exit(1); } Err(e) => { eprintln!("Failed to walk the merkle tree: {e}"); std::process::exit(1); } } } fn walk(blocks: ShardedBlockStore, start: &Cid) -> Result<(u64, u64), Error> { let stdout = std::io::stdout(); let mut stdout = stdout.lock(); let mut read_bytes = 0; let mut content_bytes = 0; // The blockstore specific way of reading the block. Here we assume go-ipfs 0.5 default flatfs // configuration, which puts the files at sharded directories and names the blocks as base32 // upper and a suffix of "data". // // For the ipfs-unixfs it is important that the raw block data lives long enough that the // possible content gets to be processed, at minimum one step of the walk as shown in this // example. let mut buf = Vec::new(); read_bytes += blocks.as_file(&start.to_bytes())?.read_to_end(&mut buf)? as u64; // First step of the walk can give content or continued visitation but not both. let (content, _, _metadata, mut step) = IdleFileVisit::default().start(&buf)?; stdout.write_all(content)?; content_bytes += content.len() as u64; // Following steps repeat the same pattern: while let Some(visit) = step { // Read the next link. The `pending_links()` gives the next link and an iterator over the // following links. The iterator lists the known links in the order of traversal, with the // exception of possible new links appearing before the older. let (first, _) = visit.pending_links(); buf.clear(); read_bytes += blocks.as_file(&first.to_bytes())?.read_to_end(&mut buf)? as u64; // Similar to first step, except we no longer get the file metadata. It is still accessible // from the `visit` via `AsRef` but likely only needed in // the first step. let (content, next_step) = visit.continue_walk(&buf, &mut None)?; stdout.write_all(content)?; content_bytes += content.len() as u64; // Using a while loop combined with `let Some(visit) = step` allows for easy walking. 
step = next_step; } stdout.flush()?; Ok((read_bytes, content_bytes)) } enum Error { OpeningFailed(IoError), Other(IoError), Traversal(rust_unixfs::file::FileReadFailed), } impl From<IoError> for Error { fn from(e: IoError) -> Error { Error::Other(e) } } impl From<FileReadFailed> for Error { fn from(e: FileReadFailed) -> Error { Error::Traversal(e) } } impl fmt::Display for Error { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { use Error::*; match self { OpeningFailed(e) => write!(fmt, "File opening failed: {e}"), Other(e) => write!(fmt, "Other file related io error: {e}"), Traversal(e) => write!(fmt, "Traversal failed, please report this as a bug: {e}"), } } } struct ShardedBlockStore { root: PathBuf, } impl ShardedBlockStore { fn as_path(&self, key: &[u8]) -> PathBuf { // assume that we have a block store with second-to-last/2 sharding // files in Base32Upper let encoded = multibase::Base::Base32Upper.encode(key); let len = encoded.len(); // this is safe because base32 is ascii let dir = &encoded[(len - 3)..(len - 1)]; assert_eq!(dir.len(), 2); let mut path = self.root.clone(); path.push(dir); path.push(encoded); path.set_extension("data"); path } fn as_file(&self, key: &[u8]) -> Result<std::fs::File, Error> { let path = self.as_path(key); std::fs::OpenOptions::new() .read(true) .open(path) .map_err(Error::OpeningFailed) } } rust-unixfs-0.4.0/examples/get.rs000064400000000000000000000124011046102023000150760ustar 00000000000000use libipld::Cid; use std::convert::TryFrom; use std::fmt; use std::io::{Error as IoError, Read}; use std::path::{Path, PathBuf}; fn main() { let cid = match std::env::args().nth(1).map(Cid::try_from) { Some(Ok(cid)) => cid, Some(Err(e)) => { eprintln!("Invalid cid given as argument: {e}"); std::process::exit(1); } None => { eprintln!("USAGE: {} CID\n", std::env::args().next().unwrap()); eprintln!( "Will walk the unixfs file pointed out by the CID from default go-ipfs 0.5 \ configuration flatfs blockstore and write listing to stdout." ); std::process::exit(0); } }; let ipfs_path = match std::env::var("IPFS_PATH") { Ok(s) => s, Err(e) => { eprintln!("IPFS_PATH is not set or could not be read: {e}"); std::process::exit(1); } }; let mut blocks = PathBuf::from(ipfs_path); blocks.push("blocks"); let blockstore = ShardedBlockStore { root: blocks }; match walk(blockstore, &cid) { Ok(()) => {} Err(Error::OpeningFailed(e)) => { eprintln!("{e}\n"); eprintln!("This is likely caused by either:"); eprintln!(" - ipfs does not have the block"); eprintln!(" - ipfs is configured to use non-flatfs storage"); eprintln!(" - ipfs is configured to use flatfs with different sharding"); std::process::exit(1); } Err(e) => { eprintln!("Failed to walk the merkle tree: {e}"); std::process::exit(1); } } } fn walk(blocks: ShardedBlockStore, start: &Cid) -> Result<(), Error> { use rust_unixfs::walk::{ContinuedWalk, Walker}; let mut buf = Vec::new(); let mut cache = None; let mut walker = Walker::new(start.to_owned(), String::new()); while walker.should_continue() { buf.clear(); // Note: if you bind the pending or the "prefetchable", it must be dropped before the next // call to continue_walk. let (next, _) = walker.pending_links(); blocks.as_file(&next.to_bytes())?.read_to_end(&mut buf)?; match walker.next(&buf, &mut cache)? { ContinuedWalk::Bucket(..)
=> { // Continuation of a HAMT shard directory that is usually ignored } ContinuedWalk::File(segment, _, path, metadata, size) => { if segment.is_first() { // this is set on the root block, no actual bytes are present for multiblock // files } if segment.is_last() { let mode = metadata.mode().unwrap_or(0o0644) & 0o7777; let (seconds, _) = metadata.mtime().unwrap_or((0, 0)); println!("f {mode:o} {seconds:>12} {size:>16} {path:?}"); } } ContinuedWalk::Directory(_, path, metadata) | ContinuedWalk::RootDirectory(_, path, metadata) => { let mode = metadata.mode().unwrap_or(0o0755) & 0o7777; let (seconds, _) = metadata.mtime().unwrap_or((0, 0)); println!("d {:o} {:>12} {:>16} {:?}", mode, seconds, "-", path); } ContinuedWalk::Symlink(bytes, _, path, metadata) => { let target = Path::new(std::str::from_utf8(bytes).unwrap()); let mode = metadata.mode().unwrap_or(0o0755) & 0o7777; let (seconds, _) = metadata.mtime().unwrap_or((0, 0)); println!( "s {:o} {:>12} {:>16} {:?} -> {:?}", mode, seconds, "-", path, target ); } }; } Ok(()) } enum Error { OpeningFailed(IoError), Other(IoError), Walk(rust_unixfs::walk::Error), } impl From<IoError> for Error { fn from(e: IoError) -> Error { Error::Other(e) } } impl From<rust_unixfs::walk::Error> for Error { fn from(e: rust_unixfs::walk::Error) -> Error { Error::Walk(e) } } impl fmt::Display for Error { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { use Error::*; match self { OpeningFailed(e) => write!(fmt, "Failed to open file: {e}"), Other(e) => write!(fmt, "A file-related IO error: {e}"), Walk(e) => write!(fmt, "Walk failed, please report this as a bug: {e}"), } } } struct ShardedBlockStore { root: PathBuf, } impl ShardedBlockStore { fn as_path(&self, key: &[u8]) -> PathBuf { // assume that we have a block store with second-to-last/2 sharding // files in Base32Upper let encoded = multibase::Base::Base32Upper.encode(key); let len = encoded.len(); // this is safe because base32 is ascii let dir = &encoded[(len - 3)..(len - 1)]; assert_eq!(dir.len(), 2); let mut path = self.root.clone(); path.push(dir); path.push(encoded); path.set_extension("data"); path } fn as_file(&self, key: &[u8]) -> Result<std::fs::File, Error> { let path = self.as_path(key); std::fs::OpenOptions::new() .read(true) .open(path) .map_err(Error::OpeningFailed) } } rust-unixfs-0.4.0/examples/resolve.rs000064400000000000000000000211221046102023000157750ustar 00000000000000use libipld::Cid; use rust_unixfs::dir::{resolve, LookupError, ResolveError}; use std::convert::TryFrom; use std::fmt; use std::io::{Error as IoError, Read}; use std::path::PathBuf; fn main() { let path = match std::env::args() .nth(1) .map(|s| IpfsPath::try_from(s.as_str())) { Some(Ok(path)) => path, Some(Err(e)) => { eprintln!("Invalid path given as argument: {e}"); std::process::exit(1); } None => { eprintln!("USAGE: {} IPFSPATH\n", std::env::args().next().unwrap()); eprintln!( "Will resolve the given IPFSPATH to a CID through any UnixFS \ directories or HAMT shards from default go-ipfs 0.5 \ configuration flatfs blockstore and write the final CID into \ stdout" ); std::process::exit(0); } }; let ipfs_path = match std::env::var("IPFS_PATH") { Ok(s) => s, Err(e) => { eprintln!("IPFS_PATH is not set or could not be read: {e}"); std::process::exit(1); } }; let mut blocks = PathBuf::from(ipfs_path); blocks.push("blocks"); let blockstore = ShardedBlockStore { root: blocks }; match walk(blockstore, path) { Ok(Some(cid)) => { println!("{cid}"); } Ok(None) => { eprintln!("not found"); } Err(Error::OpeningFailed(e)) => { eprintln!("{e}\n"); eprintln!("This is likely caused
by either:"); eprintln!(" - ipfs does not have the block"); eprintln!(" - ipfs is configured to use non-flatfs storage"); eprintln!(" - ipfs is configured to use flatfs with different sharding"); std::process::exit(1); } Err(e) => { eprintln!("Failed to walk the merkle tree: {e}"); std::process::exit(1); } } } #[derive(Debug)] pub enum PathError { InvalidCid(libipld::cid::Error), InvalidPath, } impl fmt::Display for PathError { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { match self { PathError::InvalidCid(e) => write!(fmt, "{e}"), PathError::InvalidPath => write!(fmt, "invalid path"), } } } impl std::error::Error for PathError {} /// Ipfs path following https://github.com/ipfs/go-path/ #[derive(Debug)] pub struct IpfsPath { /// Option to support moving the cid root: Option, path: std::vec::IntoIter, } impl From for IpfsPath { /// Creates a new `IpfsPath` from just the `Cid`, which is the same as parsing from a string /// representation of a `Cid`, but cannot fail. fn from(root: Cid) -> IpfsPath { IpfsPath { root: Some(root), path: Vec::new().into_iter(), } } } impl TryFrom<&str> for IpfsPath { type Error = PathError; fn try_from(path: &str) -> Result { let mut split = path.splitn(2, "/ipfs/"); let first = split.next(); let (_root, path) = match first { Some("") => { /* started with /ipfs/ */ if let Some(x) = split.next() { // was /ipfs/x ("ipfs", x) } else { // just the /ipfs/ return Err(PathError::InvalidPath); } } Some(x) => { /* maybe didn't start with /ipfs/, need to check second */ if split.next().is_some() { // x/ipfs/_ return Err(PathError::InvalidPath); } ("", x) } None => return Err(PathError::InvalidPath), }; let mut split = path.splitn(2, '/'); let root = split .next() .expect("first value from splitn(2, _) must exist"); let path = split .next() .iter() .flat_map(|s| s.split('/').filter(|s| !s.is_empty()).map(String::from)) .collect::>() .into_iter(); let root = Some(Cid::try_from(root).map_err(PathError::InvalidCid)?); Ok(IpfsPath { root, path }) } } impl IpfsPath { pub fn take_root(&mut self) -> Option { self.root.take() } } #[allow(clippy::result_large_err)] fn walk(blocks: ShardedBlockStore, mut path: IpfsPath) -> Result, Error> { use rust_unixfs::dir::MaybeResolved::*; let mut buf = Vec::new(); let mut root = path.take_root().unwrap(); let mut cache = None; for segment in path.path { println!("cache {cache:?}"); buf.clear(); eprintln!("reading {root} to resolve {segment:?}"); blocks.as_file(&root.to_bytes())?.read_to_end(&mut buf)?; let mut walker = match resolve(&buf, segment.as_str(), &mut cache)? { Found(cid) => { // either root was a Directory or we got lucky with a HAMT directory. // With HAMTDirectories the top level can contain a direct link to the target, but // it's more likely it will be found under some bucket, which would be the third // case in this match. println!("got lucky: found {cid} for {segment:?}"); println!("cache {cache:?}"); root = cid; continue; } NotFound => return Ok(None), // when we stumble upon a HAMT shard, we'll need to look up other blocks in order to // find the final link. The current implementation cannot search for the directory by // hashing the name and looking it up, but the implementation can be changed underneath // without changes to the API. // // HAMTDirecotories or HAMT shards are multi-block directories where the entires are // bucketed per their hash value. 
NeedToLoadMore(walker) => walker, }; eprintln!("walking {root} on {segment:?}"); let mut other_blocks = 1; loop { let (first, _) = walker.pending_links(); buf.clear(); eprintln!(" -> reading {first} while searching for {segment:?}"); blocks.as_file(&first.to_bytes())?.read_to_end(&mut buf)?; match walker.continue_walk(&buf, &mut cache)? { NotFound => { println!("cache {cache:?}"); return Ok(None); } Found(cid) => { eprintln!( " resolved {segment} from {root} after {other_blocks} blocks to {cid}" ); root = cid; break; } NeedToLoadMore(next) => walker = next, } other_blocks += 1; } } println!("cache {cache:?}"); Ok(Some(root)) } enum Error { OpeningFailed(IoError), Other(IoError), Traversal(ResolveError), } impl From<IoError> for Error { fn from(e: IoError) -> Error { Error::Other(e) } } impl From<ResolveError> for Error { fn from(e: ResolveError) -> Error { Error::Traversal(e) } } impl From<LookupError> for Error { fn from(e: LookupError) -> Error { Error::Traversal(e.into()) } } impl fmt::Display for Error { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { use Error::*; match self { OpeningFailed(e) => write!(fmt, "File opening failed: {e}"), Other(e) => write!(fmt, "Other file related io error: {e}"), Traversal(e) => write!(fmt, "Walking failed, please report this as a bug: {e:?}"), } } } struct ShardedBlockStore { root: PathBuf, } impl ShardedBlockStore { fn as_path(&self, key: &[u8]) -> PathBuf { // assume that we have a block store with second-to-last/2 sharding // files in Base32Upper let encoded = multibase::Base::Base32Upper.encode(key); let len = encoded.len(); // this is safe because base32 is ascii let dir = &encoded[(len - 3)..(len - 1)]; assert_eq!(dir.len(), 2); let mut path = self.root.clone(); path.push(dir); path.push(encoded); path.set_extension("data"); path } #[allow(clippy::result_large_err)] fn as_file(&self, key: &[u8]) -> Result<std::fs::File, Error> { let path = self.as_path(key); std::fs::OpenOptions::new() .read(true) .open(path) .map_err(Error::OpeningFailed) } } rust-unixfs-0.4.0/pb-rs-gen.sh000075500000000000000000000015761046102023000142740ustar 00000000000000#!/usr/bin/env bash # don't really use this as the files do have local modifications: # - removal of the broken owned container # - fixing of clippy warnings # - added lifetime # - fix the lint ignore # set -eu gen() { # strip the packages as apparently there is a bug with single-mod 0.8.2 local tmpfile="$(mktemp).proto" grep -v '^package ' "$1" > "$tmpfile"; retval=0 local filename="$(basename "$1")" local output="$(dirname "$1")/${filename%.*}.rs" pb-rs --single-mod --output "$output" "$tmpfile" || retval=$? rm "$tmpfile" if [[ "$retval" -ne 0 ]]; then return $retval; fi # strip out the empty lines and fix the automatically generated comment # empty lines seem to be a problem for cargo fmt sed -i -Ee '/^\s*$/d' -e "s/'$(basename "$tmpfile")'/'$filename'/g" "$output" cargo fmt -- "$output" } gen src/pb/merkledag.proto src/pb gen src/pb/unixfs.proto src/pb rust-unixfs-0.4.0/src/dagpb.rs000064400000000000000000000101061046102023000143460ustar 00000000000000//! dag-pb support operations. Placing this module inside unixfs module is a bit unfortunate but //! follows from the inseparability of dag-pb and UnixFS. use crate::pb::PBNode; use alloc::borrow::Cow; use core::convert::TryFrom; use core::fmt; use core::ops::Range; /// Extracts the PBNode::Data field from the block as it appears on the block.
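///
/// A sketch of the expected call pattern (marked `ignore` since the block bytes would
/// have to come from a blockstore and the enclosing module path may be crate-internal):
///
/// ```ignore
/// let block: &[u8] = unimplemented!("dag-pb encoded bytes from a blockstore");
/// match node_data(block)? {
///     Some(data) => { /* the UnixFS payload, borrowed from `block` */ }
///     None => { /* a dag-pb node without a Data field */ }
/// }
/// ```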
pub fn node_data(block: &[u8]) -> Result<Option<&[u8]>, quick_protobuf::Error> { let doc = PBNode::try_from(block)?; Ok(match doc.Data { Some(Cow::Borrowed(slice)) => Some(slice), Some(Cow::Owned(_)) => unreachable!("never converted to owned"), None => None, }) } /// Creates a wrapper around the given block representation which does not consume the block /// representation but allows accessing the dag-pb node Data. pub fn wrap_node_data<T>(block: T) -> Result<NodeData<T>, quick_protobuf::Error> where T: AsRef<[u8]>, { let full = block.as_ref(); let range = node_data(full)? .map(|data| subslice_to_range(full, data).expect("this has to be in range")); Ok(NodeData { inner: block, range, }) } fn subslice_to_range(full: &[u8], sub: &[u8]) -> Option<Range<usize>> { // note this doesn't work for all types, for example () or similar ZSTs. let max = full.len(); let amt = sub.len(); if max < amt { // if the latter slice is larger than the first one, surely it isn't a subslice. return None; } let full = full.as_ptr() as usize; let sub = sub.as_ptr() as usize; sub.checked_sub(full) // not needed as it would divide by one: .map(|diff| diff / mem::size_of::<u8>()) // // if there are two slices of a continuous chunk, [A|B] we need to make sure B will not be // calculated as subslice of A .and_then(|start| if start >= max { None } else { Some(start) }) .map(|start| start..(start + amt)) } /// The wrapper returned from [`wrap_node_data`], allows accessing dag-pb nodes Data. #[derive(PartialEq, Eq)] pub struct NodeData<T> { inner: T, range: Option<Range<usize>>, } impl<T: AsRef<[u8]>> fmt::Debug for NodeData<T> { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!( fmt, "NodeData<{}> {{ inner.len: {}, range: {:?} }}", std::any::type_name::<T>(), self.inner.as_ref().len(), self.range ) } } impl<T: AsRef<[u8]>> NodeData<T> { /// Returns the dag-pb nodes Data field as slice pub fn node_data(&self) -> &[u8] { if let Some(range) = self.range.as_ref() { &self.inner.as_ref()[range.clone()] } else { &[][..] } } /// Returns access to the wrapped block representation pub fn get_ref(&self) -> &T { &self.inner } /// Consumes self and returns the block representation pub fn into_inner(self) -> T { self.inner } } impl<T: AsRef<[u8]>, B: AsRef<[u8]>> PartialEq<B> for NodeData<T> { fn eq(&self, other: &B) -> bool { self.node_data() == other.as_ref() } } #[cfg(test)] mod tests { use super::subslice_to_range; #[test] fn subslice_ranges() { let full = &b"01234"[..]; for start in 0..(full.len() - 1) { for end in start..(full.len() - 1) { let sub = &full[start..end]; assert_eq!(subslice_to_range(full, sub), Some(start..end)); } } } #[test] fn not_in_following_subslice() { // this could be done with two distinct/disjoint 'static slices but there might not be any // guarantees it working in all rust released and unreleased versions, and with different // linkers. let full = &b"0123456789"[..]; let a = &full[0..4]; let b = &full[4..]; let a_sub = &a[1..3]; let b_sub = &b[0..2]; assert_eq!(subslice_to_range(a, a_sub), Some(1..3)); assert_eq!(subslice_to_range(b, b_sub), Some(0..2)); assert_eq!(subslice_to_range(a, b_sub), None); assert_eq!(subslice_to_range(b, a_sub), None); } } rust-unixfs-0.4.0/src/dir/builder/buffered.rs000064400000000000000000000354131046102023000172650ustar 00000000000000use super::{DirBuilder, Entry, Leaf, PostOrderIterator, TreeBuildingFailed, TreeOptions}; use crate::Metadata; use alloc::collections::btree_map::Entry::*; use libipld::Cid; /// UnixFs directory tree builder which buffers entries until `build()` is called.
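///
/// A minimal usage sketch (`no_run`; the leaf cid below is the one used by the tests of
/// this module, and would normally come from serializing the file blocks first):
///
/// ```no_run
/// use rust_unixfs::dir::builder::BufferingTreeBuilder;
/// use libipld::Cid;
/// use std::convert::TryFrom;
///
/// let mut builder = BufferingTreeBuilder::default();
/// let leaf = Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap();
/// builder.put_link("a/b/file.txt", leaf, 221).unwrap();
///
/// // deepest directories are yielded first; each node is one block to persist
/// for node in builder.build() {
///     let node = node.unwrap();
///     println!("{} {}", node.cid, node.path);
/// }
/// ```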
#[derive(Debug)] pub struct BufferingTreeBuilder { /// At the root there can be only one element, unless an option was given to create a new /// directory surrounding the root elements. root_builder: DirBuilder, longest_path: usize, // used to generate a unique id for each node; it is used when doing the post order traversal to // recover all children's rendered Cids counter: u64, opts: TreeOptions, } impl Default for BufferingTreeBuilder { fn default() -> Self { Self::new(TreeOptions::default()) } } impl BufferingTreeBuilder { /// Construct a new tree builder with the given configuration. pub fn new(opts: TreeOptions) -> Self { BufferingTreeBuilder { root_builder: DirBuilder::root(0), longest_path: 0, counter: 1, opts, } } /// Registers the given path to be a link to the cid that follows. The target leaf should be /// either a file, directory or symlink but could of course be anything. It will be treated as /// an opaque link. pub fn put_link( &mut self, full_path: &str, target: Cid, total_size: u64, ) -> Result<(), TreeBuildingFailed> { let leaf = Leaf { link: target, total_size, }; self.modify_with(full_path, |parent, basename, _| { parent .put_leaf(basename, leaf) .map_err(|_| TreeBuildingFailed::DuplicatePath(full_path.to_string())) }) } /// Directories get "put" implicitly through the put files, and directories need to be adjusted /// only when wanting them to have metadata. pub fn set_metadata( &mut self, full_path: &str, metadata: Metadata, ) -> Result<(), TreeBuildingFailed> { // create all paths along the way // // set if not set, error otherwise? FIXME: doesn't error atm self.modify_with(full_path, |parent, basename, id| { parent .add_or_get_node(basename, id) .map_err(|_| TreeBuildingFailed::LeafAsDirectory(full_path.to_string()))? .set_metadata(metadata); Ok(()) }) } fn modify_with(&mut self, full_path: &str, f: F) -> Result<(), TreeBuildingFailed> where F: FnOnce(&mut DirBuilder, String, &mut Option) -> Result<(), TreeBuildingFailed>, { // create all paths along the way // // assuming it's ok to split at '/' since that cannot be escaped in linux at least self.longest_path = full_path.len().max(self.longest_path); let mut remaining = full_path.split('/').enumerate().peekable(); let mut dir_builder = &mut self.root_builder; // check these before to avoid creation of bogus nodes in the tree or having to clean up. if full_path.ends_with('/') { return Err(TreeBuildingFailed::PathEndsInSlash(full_path.to_string())); } if full_path.contains("//") { return Err(TreeBuildingFailed::RepeatSlashesInPath( full_path.to_string(), )); } // needed to avoid borrowing into the DirBuilder::new calling closure let counter = &mut self.counter; while let Some((depth, next)) = remaining.next() { let last = remaining.peek().is_none(); match (depth, next, last) { // this might need to be accepted in case there is just a single file (0, "", true) => { // accepted: allows unconditional tree building in ipfs-http // but the resulting tree will have at most single node, which doesn't prompt // creation of new directories and should be fine. } (0, "", false) => { // ok to keep this inside the loop; we are yet to create any nodes. // note the ipfs-http (and for example js-ipfs) normalizes the path by // removing the slash from the start. 
return Err(TreeBuildingFailed::RootedPath(full_path.to_string())); } (_, "", false) => unreachable!("already validated: no repeat slashes"), (_, "", true) => unreachable!("already validated: path does not end in slash"), _ => {} } // our first level can be full, depending on the options given let full = depth == 0 && !self.opts.wrap_with_directory && !dir_builder.is_empty(); if last { let mut next_id = Some(*counter); let ret = if full { Err(TreeBuildingFailed::TooManyRootLevelEntries) } else { f(dir_builder, next.to_string(), &mut next_id) }; if next_id.is_none() { *counter += 1; } if ret.is_err() { // FIXME: there might be a case where we have now stale nodes in our tree but // cannot figure out an example for that. } return ret; } let parent_id = dir_builder.id; dir_builder = match (full, dir_builder.nodes.entry(next.to_string())) { (_, Occupied(oe)) => oe .into_mut() .as_dir_builder() .map_err(|_| TreeBuildingFailed::LeafAsDirectory(full_path.to_string()))?, (false, Vacant(ve)) => { let next_id = *counter; *counter += 1; ve.insert(Entry::Directory(DirBuilder::new(parent_id, next_id))) .as_dir_builder() .expect("safe: we just inserted a DirBuilder") } (true, Vacant(_)) => return Err(TreeBuildingFailed::TooManyRootLevelEntries), }; } // as the str::split will always return a single element this should not ever be hit unreachable!( "walked the full_path but failed to add anything: {:?}", full_path ); } /// Called to build the tree. The built tree will have the added files and their implied /// directory structure, along with the directory entries which were created using /// `set_metadata`. To build the whole hierarchy, one must iterate the returned iterator to /// completion while storing the created blocks. /// /// Returned `PostOrderIterator` will use the given `full_path` and `block_buffer` to store /// its data during the walk. `PostOrderIterator` implements `Iterator` while also allowing /// borrowed access via `next_borrowed`. pub fn build(self) -> PostOrderIterator { PostOrderIterator::new(self.root_builder, self.opts, self.longest_path) } } #[cfg(test)] mod tests { use super::{ super::OwnedTreeNode, BufferingTreeBuilder, Metadata, TreeBuildingFailed, TreeOptions, }; use core::convert::TryFrom; use libipld::multihash::{Code, MultihashDigest}; use libipld::Cid; #[test] fn some_directories() { let mut builder = BufferingTreeBuilder::default(); // foobar\n let five_block_foobar = Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap(); builder .put_link("a/b/c/d/e/f/g.txt", five_block_foobar, 221) .unwrap(); builder .put_link("a/b/c/d/e/h.txt", five_block_foobar, 221) .unwrap(); builder .put_link("a/b/c/d/e/i.txt", five_block_foobar, 221) .unwrap(); let actual = builder .build() .map(|res| res.map(|n| (n.path, n.cid, n.block))) .collect::, _>>() .unwrap(); let expected = vec![ ( "a/b/c/d/e/f", "Qmbgf44ztW9wLcGNRNYGinGQB6SQDQtbHVbkM5MrWms698", ), ( "a/b/c/d/e", "Qma1hCr3CuPRAq2Gw4DCNMqsi42Bjs4Bt1MGSS57kNh144", ), ("a/b/c/d", "QmUqaYatcJqiSFdykHXGh4Nog1eMSfDJBeYzcG67KV5Ri4"), ("a/b/c", "QmYwaNBaGpDCNN9XpHmjxVPHmEXZMw9KDY3uikE2UU5fVB"), ("a/b", "QmeAzCPig4o4gBLh2LvP96Sr8MUBrsu2Scw9MTq1EvTDhY"), ("a", "QmSTUFaPwJW8xD4KNRLLQRqVTYtYC29xuhYTJoYPWdzvKp"), ]; verify_results(expected, actual); } #[test] fn empty_path() { let mut builder = BufferingTreeBuilder::default(); builder.put_link("", some_cid(0), 1).unwrap(); let actual = builder .build() .map(|res| res.map(|OwnedTreeNode { path, .. 
}| path)) .collect::, _>>() .unwrap(); assert!( actual.is_empty(), "wrapping in directory was not asked, single element" ); } #[test] #[should_panic] fn rooted_path() { let mut builder = BufferingTreeBuilder::default(); builder.put_link("/a", some_cid(0), 1).unwrap(); } #[test] #[should_panic] fn successive_slashes() { let mut builder = BufferingTreeBuilder::default(); builder.put_link("a//b", some_cid(0), 1).unwrap(); } #[test] fn multiple_roots() { // foobar\n let five_block_foobar = Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap(); let mut opts = TreeOptions::default(); opts.wrap_with_directory(); let mut builder = BufferingTreeBuilder::new(opts); builder.put_link("a", five_block_foobar, 221).unwrap(); builder.put_link("b", five_block_foobar, 221).unwrap(); let actual = builder .build() .map(|res| res.map(|OwnedTreeNode { path, cid, .. }| (path, cid.to_string()))) .collect::, _>>() .unwrap(); assert_eq!( actual, &[( "".to_string(), "QmdbWuhpVCX9weVMMqvVTMeGwKMqCNJDbx7ZK1zG36sea7".to_string() )] ); } #[test] fn single_wrapped_root() { // foobar\n let five_block_foobar = "QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6" .parse() .unwrap(); let mut opts = TreeOptions::default(); opts.wrap_with_directory(); let mut builder = BufferingTreeBuilder::new(opts); builder.put_link("a", five_block_foobar, 221).unwrap(); let actual = builder .build() .map(|res| res.map(|OwnedTreeNode { path, cid, .. }| (path, cid.to_string()))) .collect::, _>>() .unwrap(); assert_eq!( actual, &[( "".to_string(), "QmQBseoi3b2FBrYhjM2E4mCF4Q7C8MgCUbzAbGNfyVwgNk".to_string() )] ); } #[test] #[should_panic] fn denied_multiple_root_dirs() { let mut builder = BufferingTreeBuilder::default(); builder.put_link("a/c.txt", some_cid(0), 1).unwrap(); builder.put_link("b/d.txt", some_cid(1), 1).unwrap(); } #[test] #[should_panic] fn denied_multiple_root_files() { let mut builder = BufferingTreeBuilder::default(); builder.put_link("a.txt", some_cid(0), 1).unwrap(); builder.put_link("b.txt", some_cid(1), 1).unwrap(); } #[test] #[should_panic] fn using_leaf_as_node() { let mut builder = BufferingTreeBuilder::default(); builder.put_link("a.txt", some_cid(0), 1).unwrap(); builder.put_link("a.txt/b.txt", some_cid(1), 1).unwrap(); } #[test] fn set_metadata_before_files() { let mut builder = BufferingTreeBuilder::default(); builder .set_metadata("a/b/c/d", Metadata::default()) .unwrap(); builder.put_link("a/b/c/d/e.txt", some_cid(1), 1).unwrap(); builder.put_link("a/b/c/d/f.txt", some_cid(2), 1).unwrap(); let actual = builder .build() .map(|res| res.map(|OwnedTreeNode { path, .. 
}| path)) .collect::, _>>() .unwrap(); assert_eq!(actual, &["a/b/c/d", "a/b/c", "a/b", "a",]) } #[test] fn set_metadata_on_file() { let mut builder = BufferingTreeBuilder::default(); builder.put_link("a/a.txt", some_cid(0), 1).unwrap(); let err = builder .set_metadata("a/a.txt", Metadata::default()) .unwrap_err(); assert!( matches!(err, TreeBuildingFailed::LeafAsDirectory(_)), "{err:?}" ); } #[test] fn dir_with_cidv1_link() { // this is `echo '{ "name": "hello" }` | ./ipfs dag put` let target = Cid::try_from("bafyreihakpd7te5nbmlhdk5ntvcvhf2hmfgrvcwna2sddq5zz5342mcbli").unwrap(); let mut builder = BufferingTreeBuilder::default(); builder.put_link("a/b", target, 12).unwrap(); let actual = builder .build() .map(|res| res.map(|n| (n.path, n.cid, n.block))) .collect::, _>>() .unwrap(); let expected = vec![("a", "QmPMDMPG8dbHDC9GuvqWr9pfruLnp4GZCAWrskwCmenVQa")]; verify_results(expected, actual); } fn verify_results( mut expected: Vec<( impl AsRef + core::fmt::Debug, impl AsRef + core::fmt::Debug, )>, mut actual: Vec<(String, Cid, Box<[u8]>)>, ) { use core::fmt; struct Hex<'a>(&'a [u8]); impl<'a> fmt::Debug for Hex<'a> { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { for b in self.0 { write!(fmt, "{b:02x}")?; } Ok(()) } } // hopefully this way the errors will be easier to hunt down actual.reverse(); expected.reverse(); while let Some(actual) = actual.pop() { let expected = expected.pop().expect("size mismatch"); assert_eq!(actual.0, expected.0.as_ref()); assert_eq!( actual.1.to_string(), expected.1.as_ref(), "{:?}: {:?}", actual.0, Hex(&actual.2) ); } assert_eq!(expected.len(), 0, "size mismatch: {actual:?}"); } /// Returns a quick and dirty sha2-256 of the given number as a Cidv0 fn some_cid(number: usize) -> Cid { let mh = Code::Sha2_256.digest(&number.to_le_bytes()); Cid::new_v0(mh).unwrap() } } rust-unixfs-0.4.0/src/dir/builder/custom_pb.rs000064400000000000000000000073551046102023000175020ustar 00000000000000//! Custom protobuf types which are used in encoding directorys. use super::NamedLeaf; use crate::pb::UnixFs; use libipld::Cid; use quick_protobuf::{MessageWrite, Writer, WriterBackend}; /// Newtype which uses the &[Option<(NamedLeaf)>] as Vec. pub(super) struct CustomFlatUnixFs<'a> { pub(super) links: &'a [Option], pub(super) data: UnixFs<'a>, } impl<'a> CustomFlatUnixFs<'a> { fn mapped(&self) -> impl Iterator> + '_ { self.links .iter() // FIXME: this unwrap here seems dangerious; it seems to follow from // `crate::dir::builder::iter::Leaves` assumption that all of these options have // already been filled at the previous stages of post-order visit .map(|triple| triple.as_ref().map(NamedLeafAsPBLink).unwrap()) } } impl<'a> MessageWrite for CustomFlatUnixFs<'a> { fn get_size(&self) -> usize { use quick_protobuf::sizeofs::*; let links = self .mapped() .map(|link| 1 + sizeof_len(link.get_size())) .sum::(); links + 1 + sizeof_len(self.data.get_size()) } fn write_message(&self, w: &mut Writer) -> quick_protobuf::Result<()> { self.mapped() .try_for_each(|l| w.write_with_tag(18, |w| w.write_message(&l)))?; w.write_with_tag(10, |w| w.write_message(&self.data)) } } /// Custom NamedLeaf as PBLink "adapter." 
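///
/// The wire tags used below follow the dag-pb `PBLink` message: field 1 (`Hash`) and field 2
/// (`Name`) are length-delimited, giving the keys `10` and `18`, while field 3 (`Tsize`) is a
/// varint, giving the key `24`.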
struct NamedLeafAsPBLink<'a>(&'a NamedLeaf); impl<'a> MessageWrite for NamedLeafAsPBLink<'a> { fn get_size(&self) -> usize { use quick_protobuf::sizeofs::*; // ones are the tags 1 + sizeof_len((self.0).0.len()) + 1 + sizeof_len(WriteableCid(&(self.0).1).get_size()) //+ sizeof_len(self.1.link.to_bytes().len()) + 1 + sizeof_varint((self.0).2) } fn write_message(&self, w: &mut Writer) -> quick_protobuf::Result<()> { w.write_with_tag(10, |w| w.write_message(&WriteableCid(&(self.0).1)))?; //w.write_with_tag(10, |w| w.write_bytes(&self.1.link.to_bytes()))?; w.write_with_tag(18, |w| w.write_string((self.0).0.as_str()))?; w.write_with_tag(24, |w| w.write_uint64((self.0).2))?; Ok(()) } } /// Newtype around Cid to allow embedding it as PBLink::Hash without allocating a vector. struct WriteableCid<'a>(&'a Cid); impl<'a> MessageWrite for WriteableCid<'a> { fn get_size(&self) -> usize { use libipld::cid::Version::*; use quick_protobuf::sizeofs::*; let hash_len = self.0.hash().to_bytes().len(); match self.0.version() { V0 => hash_len, V1 => { let version_len = 1; let codec_len = sizeof_varint(self.0.codec()); version_len + codec_len + hash_len } } } fn write_message(&self, w: &mut Writer) -> quick_protobuf::Result<()> { use libipld::cid::Version::*; match self.0.version() { V0 => { /* cidv0 has only the _multi_hash */ } V1 => { // it is possible that CidV1 should not be linked to from a unixfs // directory; at least go-ipfs 0.5 `ipfs files` denies making a cbor link // but happily accepts and does refs over one. w.write_u8(1)?; w.write_varint(self.0.codec())?; } } self.0 .hash() .to_bytes() .iter() // while this looks bad it cannot be measured; note we cannot use the // write_bytes because that is length prefixed bytes write .try_for_each(|b| w.write_u8(*b)) } } rust-unixfs-0.4.0/src/dir/builder/dir_builder.rs000064400000000000000000000041511046102023000177620ustar 00000000000000use super::{Entry, Leaf}; use crate::Metadata; use alloc::collections::btree_map::Entry::*; use alloc::collections::BTreeMap; pub(super) struct DuplicateName; pub(super) struct FoundLeaf; /// Node in a directory tree. #[derive(Debug)] pub(super) struct DirBuilder { /// Immediate files, symlinks or directories in this directory pub nodes: BTreeMap, /// Metadata for this directory metadata: Metadata, /// Id of the parent; None for the root node pub parent_id: Option, /// Internal id, used for propagating Cids back from children during post order visit. 
pub id: u64, } impl DirBuilder { pub fn new(parent_id: u64, id: u64) -> Self { assert_ne!(parent_id, id); DirBuilder { nodes: Default::default(), metadata: Default::default(), parent_id: Some(parent_id), id, } } pub fn root(id: u64) -> Self { DirBuilder { nodes: Default::default(), metadata: Default::default(), parent_id: None, id, } } pub fn put_leaf(&mut self, key: String, leaf: Leaf) -> Result<(), DuplicateName> { match self.nodes.entry(key) { Occupied(_) => Err(DuplicateName), Vacant(ve) => { ve.insert(Entry::Leaf(leaf)); Ok(()) } } } pub fn add_or_get_node( &mut self, key: String, id: &mut Option, ) -> Result<&mut DirBuilder, FoundLeaf> { match self.nodes.entry(key) { Occupied(oe) => oe.into_mut().as_dir_builder().map_err(|_| FoundLeaf), Vacant(ve) => { let id = id.take().unwrap(); let entry = ve.insert(Entry::Directory(Self::new(self.id, id))); Ok(entry.as_dir_builder().expect("just inserted")) } } } pub fn len(&self) -> usize { self.nodes.len() } pub fn is_empty(&self) -> bool { self.len() == 0 } pub fn set_metadata(&mut self, metadata: Metadata) { self.metadata = metadata; } } rust-unixfs-0.4.0/src/dir/builder/iter.rs000064400000000000000000000373261046102023000164530ustar 00000000000000use super::{ CustomFlatUnixFs, DirBuilder, Entry, Leaf, NamedLeaf, TreeConstructionFailed, TreeOptions, }; use core::fmt; use libipld::multihash::{Code, Multihash}; use libipld::Cid; use std::collections::HashMap; /// Constructs the directory nodes required for a tree. /// /// Implements the Iterator interface for owned values and the borrowed version, `next_borrowed`. /// The tree is fully constructed once this has been exhausted. pub struct PostOrderIterator { full_path: String, old_depth: usize, block_buffer: Vec, // our stack of pending work pending: Vec, // "communication channel" from nested entries back to their parents; this hashmap is only used // in the event of mixed child nodes (leaves and nodes). persisted_cids: HashMap>>, reused_children: Vec, cid: Option, total_size: u64, // from TreeOptions opts: TreeOptions, } /// The link list used to create the directory node. This list is created from a the BTreeMap /// inside DirBuilder, and initially it will have `Some` values only for the initial leaves and /// `None` values for subnodes which are not yet ready. At the time of use, this list is expected /// to have only `Some` values. type Leaves = Vec>; /// The nodes in the visit. We need to do a post-order visit, which starts from a single /// `DescentRoot`, followed by N `Descents` where N is the deepest directory in the tree. On each /// descent, we'll need to first schedule a `Post` (or `PostRoot`) followed the immediate children /// of the node. Directories are rendered when all of their direct and indirect descendants have /// been serialized into NamedLeafs. #[derive(Debug)] enum Visited { // handle root differently not to infect with the Option and Option DescentRoot(DirBuilder), Descent { node: DirBuilder, name: String, depth: usize, /// The index in the parents `Leaves` accessible through `PostOrderIterator::persisted_cids`. index: usize, }, Post { parent_id: u64, depth: usize, name: String, index: usize, /// Leaves will be stored directly in this field when there are no DirBuilder descendants, /// in the `PostOrderIterator::persisted_cids` otherwise. 
leaves: LeafStorage, }, PostRoot { leaves: LeafStorage, }, } impl PostOrderIterator { pub(super) fn new(root: DirBuilder, opts: TreeOptions, longest_path: usize) -> Self { let root = Visited::DescentRoot(root); PostOrderIterator { full_path: String::with_capacity(longest_path), old_depth: 0, block_buffer: Default::default(), pending: vec![root], persisted_cids: Default::default(), reused_children: Vec::new(), cid: None, total_size: 0, opts, } } fn render_directory( links: &[Option], buffer: &mut Vec, block_size_limit: &Option, ) -> Result { use crate::pb::{UnixFs, UnixFsType}; use quick_protobuf::{BytesWriter, MessageWrite, Writer}; use sha2::{Digest, Sha256}; // FIXME: ideas on how to turn this into a HAMT sharding on some heuristic. we probably // need to introduce states in to the "iterator": // // 1. bucketization // 2. another post order visit of the buckets? // // the nested post order visit should probably re-use the existing infra ("message // passing") and new ids can be generated by giving this iterator the counter from // BufferedTreeBuilder. // // could also be that the HAMT shard building should start earlier, since the same // heuristic can be detected *at* bufferedtreewriter. there the split would be easier, and // this would "just" be a single node rendering, and not need any additional states.. let node = CustomFlatUnixFs { links, data: UnixFs { Type: UnixFsType::Directory, ..Default::default() }, }; let size = node.get_size(); if let Some(limit) = block_size_limit { let size = size as u64; if *limit < size { // FIXME: this could probably be detected at builder return Err(TreeConstructionFailed::TooLargeBlock(size)); } } let cap = buffer.capacity(); if let Some(additional) = size.checked_sub(cap) { buffer.reserve(additional); } if let Some(mut needed_zeroes) = size.checked_sub(buffer.len()) { let zeroes = [0; 8]; while needed_zeroes > 8 { buffer.extend_from_slice(&zeroes[..]); needed_zeroes -= zeroes.len(); } buffer.extend(core::iter::repeat(0).take(needed_zeroes)); } let mut writer = Writer::new(BytesWriter::new(&mut buffer[..])); node.write_message(&mut writer) .map_err(TreeConstructionFailed::Protobuf)?; buffer.truncate(size); let mh = Multihash::wrap(Code::Sha2_256.into(), &Sha256::digest(&buffer)).unwrap(); let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0"); let combined_from_links = links .iter() .map(|opt| { opt.as_ref() .map(|NamedLeaf(_, _, total_size)| total_size) .unwrap() }) .sum::(); Ok(Leaf { link: cid, total_size: buffer.len() as u64 + combined_from_links, }) } /// Construct the next dag-pb node, if any. /// /// Returns a `TreeNode` of the latest constructed tree node. pub fn next_borrowed(&mut self) -> Option, TreeConstructionFailed>> { while let Some(visited) = self.pending.pop() { let (name, depth) = match &visited { Visited::DescentRoot(_) => (None, 0), Visited::Descent { name, depth, .. } => (Some(name.as_ref()), *depth), Visited::Post { name, depth, .. } => (Some(name.as_ref()), *depth), Visited::PostRoot { .. 
} => (None, 0), }; update_full_path((&mut self.full_path, &mut self.old_depth), name, depth); match visited { Visited::DescentRoot(node) => { let children = &mut self.reused_children; let leaves = partition_children_leaves(depth, node.nodes.into_iter(), children); let any_children = !children.is_empty(); let leaves = if any_children { self.persisted_cids.insert(node.id, leaves); LeafStorage::from(node.id) } else { leaves.into() }; self.pending.push(Visited::PostRoot { leaves }); self.pending.append(children); } Visited::Descent { node, name, depth, index, } => { let children = &mut self.reused_children; let leaves = partition_children_leaves(depth, node.nodes.into_iter(), children); let any_children = !children.is_empty(); let parent_id = node.parent_id.expect("only roots parent_id is None"); let leaves = if any_children { self.persisted_cids.insert(node.id, leaves); node.id.into() } else { leaves.into() }; self.pending.push(Visited::Post { parent_id, name, depth, leaves, index, }); self.pending.append(children); } Visited::Post { parent_id, name, leaves, index, .. } => { let leaves = leaves.into_inner(&mut self.persisted_cids); let buffer = &mut self.block_buffer; let leaf = match Self::render_directory( &leaves, buffer, &self.opts.block_size_limit, ) { Ok(leaf) => leaf, Err(e) => return Some(Err(e)), }; self.cid = Some(leaf.link); self.total_size = leaf.total_size; { // name is None only for wrap_with_directory, which cannot really be // propagated up but still the parent_id is allowed to be None let parent_leaves = self.persisted_cids.get_mut(&parent_id); match (parent_id, parent_leaves, index) { (pid, None, index) => { panic!("leaves not found for parent_id = {pid} and index = {index}") } (_, Some(vec), index) => { let cell = &mut vec[index]; // all assert!(cell.is_none()); *cell = Some(NamedLeaf(name, leaf.link, leaf.total_size)); } } } return Some(Ok(TreeNode { path: self.full_path.as_str(), cid: self.cid.as_ref().unwrap(), total_size: self.total_size, block: &self.block_buffer, })); } Visited::PostRoot { leaves } => { let leaves = leaves.into_inner(&mut self.persisted_cids); if !self.opts.wrap_with_directory { break; } let buffer = &mut self.block_buffer; let leaf = match Self::render_directory( &leaves, buffer, &self.opts.block_size_limit, ) { Ok(leaf) => leaf, Err(e) => return Some(Err(e)), }; self.cid = Some(leaf.link); self.total_size = leaf.total_size; return Some(Ok(TreeNode { path: self.full_path.as_str(), cid: self.cid.as_ref().unwrap(), total_size: self.total_size, block: &self.block_buffer, })); } } } None } } impl Iterator for PostOrderIterator { type Item = Result; fn next(&mut self) -> Option { self.next_borrowed() .map(|res| res.map(TreeNode::into_owned)) } } /// Borrowed representation of a node in the tree. pub struct TreeNode<'a> { /// Full path to the node. pub path: &'a str, /// The Cid of the document. pub cid: &'a Cid, /// Cumulative total size of the subtree in bytes. pub total_size: u64, /// Raw dag-pb document. pub block: &'a [u8], } impl<'a> fmt::Debug for TreeNode<'a> { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { fmt.debug_struct("TreeNode") .field("path", &format_args!("{:?}", self.path)) .field("cid", &format_args!("{}", self.cid)) .field("total_size", &self.total_size) .field("size", &self.block.len()) .finish() } } impl TreeNode<'_> { /// Convert to an owned and detached representation. 
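    ///
    /// The borrowed `TreeNode` aliases buffers owned by the `PostOrderIterator` (the full path
    /// and the block buffer), so the path, cid and block are copied out here to let the
    /// returned value outlive the following `next_borrowed` calls.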
pub fn into_owned(self) -> OwnedTreeNode { OwnedTreeNode { path: self.path.to_owned(), cid: self.cid.to_owned(), total_size: self.total_size, block: self.block.into(), } } } /// Owned representation of a node in the tree. pub struct OwnedTreeNode { /// Full path to the node. pub path: String, /// The Cid of the document. pub cid: Cid, /// Cumulative total size of the subtree in bytes. pub total_size: u64, /// Raw dag-pb document. pub block: Box<[u8]>, } fn update_full_path( (full_path, old_depth): (&mut String, &mut usize), name: Option<&str>, depth: usize, ) { if depth < 2 { // initially thought it might be a good idea to add a slash to all components; removing it made // it impossible to get back down to empty string, so fixing this for depths 0 and 1. full_path.clear(); *old_depth = 0; } else { while *old_depth >= depth && *old_depth > 0 { // we now want to pop the last segment // this would be easier with PathBuf let slash_at = full_path.bytes().rposition(|ch| ch == b'/'); if let Some(slash_at) = slash_at { if *old_depth == depth && Some(&full_path[(slash_at + 1)..]) == name { // minor unmeasurable perf optimization: // going from a/b/foo/zz => a/b/foo does not need to go through the a/b return; } full_path.truncate(slash_at); *old_depth -= 1; } else { todo!( "no last slash_at in {:?} yet {} >= {}", full_path, old_depth, depth ); } } } debug_assert!(*old_depth <= depth); if let Some(name) = name { if !full_path.is_empty() { full_path.push('/'); } full_path.push_str(name); *old_depth += 1; } assert_eq!(*old_depth, depth); } /// Returns a Vec of the links in order with only the leaves, the given `children` will contain yet /// incomplete nodes of the tree. fn partition_children_leaves( depth: usize, it: impl Iterator, children: &mut Vec, ) -> Leaves { let mut leaves = Vec::new(); for (i, (k, v)) in it.enumerate() { match v { Entry::Directory(node) => { children.push(Visited::Descent { node, // this needs to be pushed down to update the full_path name: k, depth: depth + 1, index: i, }); // this will be overwritten later, but the order is fixed leaves.push(None); } Entry::Leaf(leaf) => leaves.push(Some(NamedLeaf(k, leaf.link, leaf.total_size))), } } leaves } #[derive(Debug)] enum LeafStorage { Direct(Leaves), Stashed(u64), } impl LeafStorage { fn into_inner(self, stash: &mut HashMap) -> Leaves { use LeafStorage::*; match self { Direct(leaves) => leaves, Stashed(id) => stash .remove(&id) .ok_or(id) .expect("leaves are either stashed or direct, must able to find with id"), } } } impl From for LeafStorage { fn from(key: u64) -> LeafStorage { LeafStorage::Stashed(key) } } impl From for LeafStorage { fn from(leaves: Leaves) -> LeafStorage { LeafStorage::Direct(leaves) } } rust-unixfs-0.4.0/src/dir/builder.rs000064400000000000000000000104561046102023000155030ustar 00000000000000use core::fmt; use libipld::Cid; mod dir_builder; use dir_builder::DirBuilder; mod iter; pub use iter::{OwnedTreeNode, PostOrderIterator, TreeNode}; mod buffered; pub use buffered::BufferingTreeBuilder; mod custom_pb; use custom_pb::CustomFlatUnixFs; enum Entry { Leaf(Leaf), Directory(DirBuilder), } impl fmt::Debug for Entry { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { use Entry::*; match self { Leaf(leaf) => write!(fmt, "Leaf {{ {leaf:?} }}"), Directory(_) => write!(fmt, "DirBuilder {{ .. 
}}"), } } } impl Entry { fn as_dir_builder(&mut self) -> Result<&mut DirBuilder, ()> { use Entry::*; match self { Directory(ref mut d) => Ok(d), _ => Err(()), } } } struct Leaf { link: Cid, total_size: u64, } impl fmt::Debug for Leaf { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!(fmt, "{}, {}", self.link, self.total_size) } } /// Configuration for customizing how the tree is built. #[derive(Debug, Clone)] pub struct TreeOptions { block_size_limit: Option, wrap_with_directory: bool, } impl Default for TreeOptions { fn default() -> Self { TreeOptions { block_size_limit: Some(512 * 1024), wrap_with_directory: false, } } } impl TreeOptions { /// Overrides the default directory block size limit. If the size limit is set to `None`, no /// directory will be too large. pub fn block_size_limit(&mut self, limit: Option) { self.block_size_limit = limit; } /// When true, allow multiple top level entries, otherwise error on the second entry. /// Defaults to false. pub fn wrap_with_directory(&mut self) { self.wrap_with_directory = true; } } /// Tree building failure cases. #[derive(Debug)] pub enum TreeBuildingFailed { /// The given full path started with a slash; paths in the `/add` convention are not rooted. RootedPath(String), /// The given full path contained an empty segment. RepeatSlashesInPath(String), /// The given full path ends in slash. PathEndsInSlash(String), /// If the `BufferingTreeBuilder` was created without `TreeOptions` with the option /// `wrap_with_directory` enabled, then there can be only a single element at the root. TooManyRootLevelEntries, /// The given full path had already been added. DuplicatePath(String), /// The given full path had already been added as a link to an opaque entry. LeafAsDirectory(String), } impl fmt::Display for TreeBuildingFailed { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { use TreeBuildingFailed::*; match self { RootedPath(s) => write!(fmt, "path is rooted: {s:?}"), RepeatSlashesInPath(s) => write!(fmt, "path contains repeat slashes: {s:?}"), PathEndsInSlash(s) => write!(fmt, "path ends in a slash: {s:?}"), TooManyRootLevelEntries => write!( fmt, "multiple root level entries while configured wrap_with_directory = false" ), // TODO: perhaps we should allow adding two leafs with the same Cid? DuplicatePath(s) => write!(fmt, "path exists already: {s:?}"), LeafAsDirectory(s) => write!( fmt, "attempted to use already added leaf as a subdirectory: {s:?}" ), } } } impl std::error::Error for TreeBuildingFailed {} /// Failure cases for `PostOrderIterator` creating the tree dag-pb nodes. #[derive(Debug)] pub enum TreeConstructionFailed { /// Failed to serialize the protobuf node for the directory Protobuf(quick_protobuf::Error), /// The resulting directory would be too large and HAMT sharding is yet to be implemented or /// denied. TooLargeBlock(u64), } impl fmt::Display for TreeConstructionFailed { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { use TreeConstructionFailed::*; match self { Protobuf(e) => write!(fmt, "serialization failed: {e}"), TooLargeBlock(size) => write!(fmt, "attempted to create block of {size} bytes"), } } } impl std::error::Error for TreeConstructionFailed {} #[derive(Debug)] struct NamedLeaf(String, Cid, u64); rust-unixfs-0.4.0/src/dir/directory.rs000064400000000000000000000033741046102023000160620ustar 00000000000000use crate::pb::FlatUnixFs; use core::fmt; /// Ensures the directory looks like something we actually support. 
pub(crate) fn check_directory_supported( flat: FlatUnixFs<'_>, ) -> Result, UnexpectedDirectoryProperties> { let data = flat.data.Data.as_deref(); if flat.data.filesize.is_some() || !flat.data.blocksizes.is_empty() || flat.data.hashType.is_some() || flat.data.fanout.is_some() || !data.unwrap_or_default().is_empty() { let data = data.map(|s| s.to_vec()); Err(UnexpectedDirectoryProperties { filesize: flat.data.filesize, blocksizes: flat.data.blocksizes, hash_type: flat.data.hashType, fanout: flat.data.fanout, data, }) } else { Ok(flat) } } /// Error case for checking if we support this directory. #[derive(Debug)] pub struct UnexpectedDirectoryProperties { /// filesize is a property of Files filesize: Option, /// blocksizes is a property of Files blocksizes: Vec, /// hash_type is a property of HAMT Shards hash_type: Option, /// fanout is a property of HAMT shards fanout: Option, /// directories should have no Data data: Option>, } impl fmt::Display for UnexpectedDirectoryProperties { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!( fmt, "filesize={:?}, {} blocksizes, hash_type={:?}, fanout={:?}, data=[", self.filesize, self.blocksizes.len(), self.hash_type, self.fanout, )?; for b in self.data.as_deref().unwrap_or_default() { write!(fmt, "{b:02x}")?; } write!(fmt, "]") } } rust-unixfs-0.4.0/src/dir/sharded_lookup.rs000064400000000000000000000367341046102023000170670ustar 00000000000000use super::{try_convert_cid, MaybeResolved, MultipleMatchingLinks, ResolveError}; use crate::pb::{FlatUnixFs, PBLink, ParsingFailed, UnixFsType}; use crate::{InvalidCidInLink, UnexpectedNodeType}; use alloc::borrow::Cow; use alloc::collections::VecDeque; use core::convert::TryFrom; use core::fmt; use libipld::Cid; /// A cache of data structures used while traversing. Reduces allocations when walking over multiple /// path segments. pub struct Cache { buffer: VecDeque, } impl From> for Cache { fn from(mut buffer: VecDeque) -> Self { buffer.clear(); Cache { buffer } } } impl fmt::Debug for Cache { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!(fmt, "Cache {{ buffer: {} }}", self.buffer.capacity()) } } /// `ShardedLookup` can walk over multiple HAMT sharded directory nodes which allows multiple block /// spanning directories. pub struct ShardedLookup<'needle> { links: VecDeque, // this will be tricky if we ever need to have case-insensitive resolving *but* we can then // make a custom Cow type; important not to expose Cow in any API. needle: Cow<'needle, str>, } impl fmt::Debug for ShardedLookup<'_> { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!( fmt, "ShardedLookup {{ links: {}, needle: {:?} }}", self.links.len(), self.needle.as_ref(), ) } } impl<'needle> ShardedLookup<'needle> { /// Returns the next pending link and an iterator over the rest. pub fn pending_links(&self) -> (&Cid, impl Iterator) { let mut iter = self.links.iter(); let first = iter.next().expect("Already validated there are links"); (first, iter) } /// Continues the walk in the DAG of HAMT buckets searching for the original `needle`. 
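    ///
    /// A sketch of a driving loop, with `walker` being this (mutable) `ShardedLookup` and
    /// `load_block` a hypothetical helper fetching the raw block bytes of a Cid:
    ///
    /// ```ignore
    /// let mut cache = None;
    /// let found = loop {
    ///     // the first pending link is the next bucket to fetch
    ///     let (first, _rest) = walker.pending_links();
    ///     let block = load_block(first)?;
    ///     match walker.continue_walk(&block, &mut cache)? {
    ///         MaybeResolved::Found(cid) => break Some(cid),
    ///         MaybeResolved::NotFound => break None,
    ///         MaybeResolved::NeedToLoadMore(next) => walker = next,
    ///     }
    /// };
    /// ```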
#[allow(clippy::result_large_err)] pub fn continue_walk( mut self, next: &[u8], cache: &mut Option, ) -> Result, LookupError> { // just to make sure not to mess this up debug_assert_eq!(Some(self.pending_links().0), self.links.front()); self.links .pop_front() .expect("Already validated there are links"); let mut hamt = match FlatUnixFs::try_from(next) { Ok(hamt) if hamt.data.Type == UnixFsType::HAMTShard => hamt, Ok(other) => return Err(LookupError::UnexpectedBucketType(other.data.Type.into())), Err(ParsingFailed::InvalidDagPb(e)) | Err(ParsingFailed::InvalidUnixFs(e, _)) => { *cache = Some(self.links.into()); return Err(LookupError::Read(Some(e))); } Err(ParsingFailed::NoData(_)) => { *cache = Some(self.links.into()); return Err(LookupError::Read(None)); } }; Self::check_supported(&mut hamt)?; let found = Self::partition( hamt.links.into_iter(), self.needle.as_ref(), &mut self.links, )?; if let Some(cid) = found { *cache = Some(self.links.into()); Ok(MaybeResolved::Found(cid)) } else if self.links.is_empty() { *cache = Some(self.links.into()); Ok(MaybeResolved::NotFound) } else { Ok(MaybeResolved::NeedToLoadMore(self)) } } /// Transforms this `ShardedLookup` into a `ShardedLookup<'static>` by taking ownership of the /// needle we are trying to find. pub fn with_owned_needle(self) -> ShardedLookup<'static> { let ShardedLookup { links, needle } = self; let needle = Cow::Owned(needle.into_owned()); ShardedLookup { links, needle } } /// Finds or starts a lookup of multiple buckets. /// /// Returns the found link, the definitive negative or the means to continue the traversal. #[allow(clippy::result_large_err)] pub(crate) fn lookup_or_start( mut hamt: FlatUnixFs<'_>, needle: &'needle str, cache: &mut Option, ) -> Result, LookupError> { Self::check_supported(&mut hamt)?; let mut links = cache.take().map(|c| c.buffer).unwrap_or_default(); let found = Self::partition(hamt.links.into_iter(), needle, &mut links)?; if let Some(cid) = found { *cache = Some(links.into()); Ok(MaybeResolved::Found(cid)) } else if links.is_empty() { *cache = Some(links.into()); Ok(MaybeResolved::NotFound) } else { Ok(MaybeResolved::NeedToLoadMore(ShardedLookup { links, needle: Cow::Borrowed(needle), })) } } /// Takes the validated object as mutable reference to move data out of it in case of error. /// /// Returns an error if we don't support the properties on the HAMTShard-typed node pub(crate) fn check_supported(hamt: &mut FlatUnixFs<'_>) -> Result<(), ShardError> { assert_eq!(hamt.data.Type, UnixFsType::HAMTShard); if hamt.data.fanout != Some(256) || hamt.data.hashType != Some(34) { Err(ShardError::UnsupportedProperties { hash_type: hamt.data.hashType, fanout: hamt.data.fanout, }) } else if hamt.data.filesize.is_some() || !hamt.data.blocksizes.is_empty() { Err(ShardError::UnexpectedProperties { filesize: hamt.data.filesize, blocksizes: core::mem::take(&mut hamt.data.blocksizes), }) } else { Ok(()) } } /// Partition the original links based on their kind; if the link: /// /// - matches the needle uniquely, it will be returned as `Some(cid)` /// - is a bucket, it is pushed back to the work #[allow(clippy::result_large_err)] fn partition<'a>( iter: impl Iterator>, needle: &str, work: &mut VecDeque, ) -> Result, PartitioningError> { let mut found = None; for (i, link) in iter.enumerate() { let name = link.Name.as_deref().unwrap_or_default(); if name.len() > 2 && &name[2..] 
== needle {
                if let Some(first) = found.take() {
                    return Err(MultipleMatchingLinks::from((first, (i, link))).into());
                } else {
                    found = Some((i, try_convert_cid(i, link)?));
                }
            } else if name.len() == 2 {
                // the magic number of two comes from the fanout (256) probably
                let cid = try_convert_cid(i, link)?;
                work.push_back(cid);
            } else {
                // no match, not interesting for us
            }
        }

        Ok(found.map(|(_, cid)| cid))
    }
}

pub(crate) enum PartitioningError {
    Multiple(MultipleMatchingLinks),
    InvalidCid(InvalidCidInLink),
}

impl From<InvalidCidInLink> for PartitioningError {
    fn from(e: InvalidCidInLink) -> PartitioningError {
        PartitioningError::InvalidCid(e)
    }
}

impl From<MultipleMatchingLinks> for PartitioningError {
    fn from(e: MultipleMatchingLinks) -> PartitioningError {
        PartitioningError::Multiple(e)
    }
}

impl From<PartitioningError> for ResolveError {
    fn from(e: PartitioningError) -> ResolveError {
        ResolveError::Lookup(LookupError::from(e))
    }
}

impl From<PartitioningError> for LookupError {
    fn from(e: PartitioningError) -> LookupError {
        use PartitioningError::*;
        match e {
            Multiple(m) => LookupError::Multiple(m),
            InvalidCid(e) => LookupError::InvalidCid(e),
        }
    }
}

/// Shard does not fit into expectations.
#[derive(Debug)]
pub enum ShardError {
    /// Encountered an HAMT sharded directory which had an unsupported configuration.
    UnsupportedProperties {
        /// Unsupported multihash hash type.
        hash_type: Option<u64>,
        /// Unsupported fanout value.
        fanout: Option<u64>,
    },
    /// Encountered an HAMT sharded directory which had unexpected properties.
    UnexpectedProperties {
        /// Filesize is used with UnixFS files.
        filesize: Option<u64>,
        /// Blocksizes are in general used with UnixFS files.
        blocksizes: Vec<u64>,
    },
}

impl fmt::Display for ShardError {
    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
        use ShardError::*;
        match self {
            UnsupportedProperties { hash_type, fanout } => write!(
                fmt,
                "unsupported HAMTShard properties: hash_type={hash_type:?}, fanout={fanout:?}"
            ),
            UnexpectedProperties {
                filesize,
                blocksizes,
            } => write!(
                fmt,
                "unexpected HAMTShard properties: filesize=({:?}), {} blocksizes",
                filesize,
                blocksizes.len()
            ),
        }
    }
}

impl std::error::Error for ShardError {}

/// Errors which can occur when looking up a HAMTSharded directory.
#[derive(Debug)]
pub enum LookupError {
    /// Multiple matching links were found
    Multiple(MultipleMatchingLinks),
    /// Invalid Cid was matched
    InvalidCid(InvalidCidInLink),
    /// Unexpected HAMT shard bucket type
    UnexpectedBucketType(UnexpectedNodeType),
    /// Unsupported or unexpected property of the UnixFS node
    Shard(ShardError),
    /// Parsing failed or the inner dag-pb data contained no bytes.
Read(Option), } impl LookupError { /// Converts this HAMT lookup error to the more general ResolveError pub fn into_resolve_error(self) -> ResolveError { self.into() } } impl From for LookupError { fn from(e: MultipleMatchingLinks) -> Self { LookupError::Multiple(e) } } impl From for LookupError { fn from(e: InvalidCidInLink) -> Self { LookupError::InvalidCid(e) } } impl From for LookupError { fn from(e: ShardError) -> Self { LookupError::Shard(e) } } impl fmt::Display for LookupError { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { use LookupError::*; match self { InvalidCid(e) => write!(fmt, "Invalid link: {e:?}"), Multiple(e) => write!(fmt, "Multiple matching links found: {e:?}"), UnexpectedBucketType(ut) => write!(fmt, "unexpected type for HAMT bucket: {ut:?}"), Shard(e) => write!(fmt, "{e}"), Read(Some(e)) => write!( fmt, "failed to parse the block as unixfs or dag-pb node: {e}" ), Read(None) => write!(fmt, "HAMTDirectory not found in empty dag-pb node"), } } } impl std::error::Error for LookupError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { use LookupError::*; match self { Read(Some(e)) => Some(e), _ => None, } } } #[cfg(test)] mod tests { use super::{LookupError, MaybeResolved, ShardError, ShardedLookup}; use crate::pb::FlatUnixFs; use core::convert::TryFrom; use hex_literal::hex; // a directory from some linux kernel tree import: linux-5.5-rc5/tools/testing/selftests/rcutorture/ const DIR: &[u8] = &hex!("122e0a2212204baf5104fe53d495223f8e2ba95375a31fda6b18e926cb54edd61f30b5f1de6512053641646f6318b535122c0a221220fd9f545068048e647d5d0b275ed171596e0c1c04b8fed09dc13bee7607e75bc7120242391883c00312330a2212208a4a68f6b88594ce373419586c12d24bde2d519ab636b1d2dcc986eb6265b7a3120a43444d616b6566696c65189601122f0a2212201ededc99d23a7ef43a8f17e6dd8b89934993245ef39e18936a37e412e536ed681205463562696e18c5ad030a280805121f200000000020000200000000000000000004000000000000000000000000002822308002"); const FILE: &[u8] = &hex!("0a130802120d666f6f6261720a666f6f626172180d"); #[test] fn direct_hit() { let parsed = FlatUnixFs::try_from(DIR).unwrap(); // calling shardedlookup directly makes little sense, but through `resolve` it would make // sense // testing this is a bit ... 
not nice, since we can only find out if there is a negative // hit through exhausting the buckets let found = ShardedLookup::lookup_or_start(parsed, "bin", &mut None); match found { Ok(MaybeResolved::Found(cid)) if cid.to_string() == "QmQRA3JX9JNSccQpXjuKzMVCpfTaP4XpHbrrefqaQFWf5Z" => {} x => unreachable!("{:?}", x), } } #[test] fn found_in_the_other_bucket() { let parsed = FlatUnixFs::try_from(DIR).unwrap(); // there is a single bin "B9" which would contain "formal" but our implementation cannot // see it, it just guesses to start looking up in other buckets let see_next = ShardedLookup::lookup_or_start(parsed, "formal", &mut None); let next = match see_next { Ok(MaybeResolved::NeedToLoadMore(next)) => next, x => unreachable!("{:?}", x), }; { let (first, mut rest) = next.pending_links(); // there is only one bin: in other cases we would just walk in BFS order assert_eq!( first.to_string(), "QmfQgmYMYmGQP4X6V3JhTELkQmGVP9kpJgv9duejQ8vWez" ); assert!(rest.next().is_none()); } // then we error on anything other than HAMTShard let err = next.continue_walk(FILE, &mut None).unwrap_err(); match err { LookupError::UnexpectedBucketType(ut) if ut.is_file() => {} x => unreachable!("{:?}", x), } } #[test] fn unsupported_hash_type_or_fanout() { use crate::pb::{FlatUnixFs, UnixFs, UnixFsType}; use alloc::borrow::Cow; let example = FlatUnixFs { data: UnixFs { Type: UnixFsType::HAMTShard, Data: Some(Cow::Borrowed( b"this cannot be interpreted yet but would be an error", )), filesize: None, blocksizes: Vec::new(), hashType: Some(33), // supported 34 or murmur128 le cut as u64? fanout: Some(255), // supported 256 mode: None, // these are not read by the lookup mtime: None, }, links: Vec::new(), }; let err = ShardedLookup::lookup_or_start(example, "doesnt matter", &mut None).unwrap_err(); assert!( matches!( err, LookupError::Shard(ShardError::UnsupportedProperties { .. }) ), "{err:?}" ); } #[test] fn unexpected_properties() { use crate::pb::{FlatUnixFs, UnixFs, UnixFsType}; use alloc::borrow::Cow; let example = FlatUnixFs { data: UnixFs { Type: UnixFsType::HAMTShard, Data: Some(Cow::Borrowed( b"this cannot be interpreted yet but would be an error", )), filesize: Some(1), // err blocksizes: vec![1], // err hashType: Some(34), fanout: Some(256), mode: None, mtime: None, }, links: Vec::new(), }; let err = ShardedLookup::lookup_or_start(example, "doesnt matter", &mut None).unwrap_err(); assert!( matches!( err, LookupError::Shard(ShardError::UnexpectedProperties { .. }) ), "{err:?}" ); } } rust-unixfs-0.4.0/src/dir.rs000064400000000000000000000244051046102023000140540ustar 00000000000000use crate::pb::{FlatUnixFs, PBLink, PBNode, ParsingFailed, UnixFsType}; use crate::{InvalidCidInLink, UnexpectedNodeType}; use core::convert::TryFrom; use core::fmt; use libipld::Cid; mod sharded_lookup; pub use sharded_lookup::{Cache, LookupError, ShardError, ShardedLookup}; mod directory; pub(crate) use directory::{check_directory_supported, UnexpectedDirectoryProperties}; /// Directory tree builder. pub mod builder; pub(crate) fn check_hamtshard_supported( mut flat: FlatUnixFs<'_>, ) -> Result, ShardError> { ShardedLookup::check_supported(&mut flat)?; Ok(flat) } /// Resolves a single path segment on `dag-pb` or UnixFS directories (normal, sharded). /// /// The third parameter can always be substituted with a None but when repeatedly resolving over /// multiple path segments, it can be used to cache the work queue used to avoid re-allocating it /// between the steps. 
///
/// On success, returns either a walker which can be used to traverse additional links searching
/// for the link, the resolved link once it has been found, or `NotFound` when it cannot be found.
///
/// # Note
///
/// The returned walker by default borrows the needle but it can be transformed into an owned
/// walker with `ShardedLookup::with_owned_needle` which will allow moving it between tasks and
/// boundaries.
#[allow(clippy::result_large_err)]
pub fn resolve<'needle>(
    block: &[u8],
    needle: &'needle str,
    cache: &mut Option<Cache>,
) -> Result<MaybeResolved<'needle>, ResolveError> {
    let links = match FlatUnixFs::try_parse(block) {
        Ok(hamt) if hamt.data.Type == UnixFsType::HAMTShard => {
            return Ok(ShardedLookup::lookup_or_start(hamt, needle, cache)?)
        }
        Ok(flat) if flat.data.Type == UnixFsType::Directory => {
            check_directory_supported(flat)?.links
        }
        Err(ParsingFailed::InvalidUnixFs(_, PBNode { Links: links, .. }))
        | Err(ParsingFailed::NoData(PBNode { Links: links, .. })) => links,
        Ok(other) => {
            // go-ipfs does not resolve links under File, probably it's not supposed to work on
            // anything else then; returning NotFound would be correct, but perhaps it's even more
            // correct to return that we don't support this
            return Err(ResolveError::UnexpectedType(other.data.Type.into()));
        }
        Err(ParsingFailed::InvalidDagPb(e)) => return Err(ResolveError::Read(e)),
    };

    let mut matching = links.into_iter().enumerate().filter_map(|(i, link)| {
        match link.Name.as_deref().unwrap_or_default() {
            x if x == needle => Some((i, link)),
            _ => None,
        }
    });

    let first = matching.next();

    if let Some((i, first)) = first {
        let first = try_convert_cid(i, first)?;
        match matching.next() {
            Some((j, second)) => Err(MultipleMatchingLinks::from(((i, first), (j, second))).into()),
            None => Ok(MaybeResolved::Found(first)),
        }
    } else {
        Ok(MaybeResolved::NotFound)
    }
}

fn try_convert_cid(nth: usize, link: PBLink<'_>) -> Result<Cid, InvalidCidInLink> {
    let hash = link.Hash.as_deref().unwrap_or_default();
    Cid::try_from(hash).map_err(|e| InvalidCidInLink::from((nth, link, e)))
}

/// Resolving result type for the successful cases.
#[derive(Debug)]
pub enum MaybeResolved<'needle> {
    /// Link was found for the given segment.
    Found(Cid),
    /// The block presented to `resolve` was a HAMT sharded directory and other blocks need to be
    /// read in order to find the link. `ShardedLookup` will handle the lookup and navigation
    /// over the shards.
    NeedToLoadMore(ShardedLookup<'needle>),
    /// The segment could not be found.
    NotFound,
}

/// Resolving can fail similarly as with `ShardedLookup::continue_walk` but in addition to sharded
/// cases, there can be unexpected directories.
#[derive(Debug)]
pub enum ResolveError {
    /// The target block was a UnixFs node that doesn't support resolving, e.g. a file.
    UnexpectedType(UnexpectedNodeType),
    /// A directory had unsupported properties. These are not encountered during walking sharded
    /// directories.
    UnexpectedDirProperties(UnexpectedDirectoryProperties),
    /// Failed to read the block as a dag-pb node. Failure to read an inner UnixFS node is ignored
    /// and links of the outer dag-pb are processed.
    Read(quick_protobuf::Error),
    /// Lookup errors.
Lookup(LookupError), } impl From for ResolveError { fn from(e: UnexpectedDirectoryProperties) -> Self { ResolveError::UnexpectedDirProperties(e) } } impl fmt::Display for ResolveError { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { use ResolveError::*; match self { UnexpectedType(ut) => write!(fmt, "unexpected type for UnixFs: {ut:?}"), UnexpectedDirProperties(udp) => write!(fmt, "unexpected directory properties: {udp}"), Read(e) => write!(fmt, "parsing failed: {e}"), Lookup(e) => write!(fmt, "{e}"), } } } impl std::error::Error for ResolveError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { use ResolveError::*; match self { Read(e) => Some(e), Lookup(LookupError::Read(Some(e))) => Some(e), _ => None, } } } impl From for ResolveError { fn from(e: InvalidCidInLink) -> ResolveError { ResolveError::Lookup(e.into()) } } impl From for ResolveError { fn from(e: MultipleMatchingLinks) -> ResolveError { ResolveError::Lookup(e.into()) } } impl From for ResolveError { fn from(e: ShardError) -> ResolveError { ResolveError::Lookup(e.into()) } } impl From for ResolveError { fn from(e: LookupError) -> ResolveError { ResolveError::Lookup(e) } } /// Multiple matching links were found: **at least two**. #[derive(Debug)] pub enum MultipleMatchingLinks { /// Two valid links were found Two { /// The first link and its index in the links first: (usize, Cid), /// The second link and its index in the links second: (usize, Cid), }, /// Two links were matched but one of them could not be converted. OneValid { /// The first link and its index in the links first: (usize, Cid), /// The failure to parse the other link second: InvalidCidInLink, }, } impl<'a> From<((usize, Cid), (usize, PBLink<'a>))> for MultipleMatchingLinks { fn from( ((i, first), (j, second)): ((usize, Cid), (usize, PBLink<'a>)), ) -> MultipleMatchingLinks { match try_convert_cid(j, second) { Ok(second) => MultipleMatchingLinks::Two { first: (i, first), second: (j, second), }, Err(e) => MultipleMatchingLinks::OneValid { first: (i, first), second: e, }, } } } impl MultipleMatchingLinks { /// Takes the first link, ignoring the other(s). pub fn into_inner(self) -> Cid { use MultipleMatchingLinks::*; match self { Two { first, .. } | OneValid { first, .. 
} => first.1, } } } #[cfg(test)] mod tests { use super::{resolve, MaybeResolved}; use crate::test_support::FakeBlockstore; use core::convert::TryFrom; use hex_literal::hex; use libipld::Cid; #[test] fn resolve_paths_from_plain_dagpb() { let payload = hex!("12330a2212206aad27d7e2fc815cd15bf679535062565dc927a831547281fc0af9e5d7e67c74120b6166726963616e2e747874180812340a221220fd36ac5279964db0cba8f7fa45f8c4c44ef5e2ff55da85936a378c96c9c63204120c616d6572696361732e747874180812360a2212207564c20415869d77a8a40ca68a9158e397dd48bdff1325cdb23c5bcd181acd17120e6175737472616c69616e2e7478741808"); assert!( crate::dagpb::node_data(&payload).unwrap().is_none(), "this payload has no data field" ); let segments = [ ( "african.txt", Some("QmVX54jfjB8eRxLVxyQSod6b1FyDh7mR4mQie9j97i2Qk3"), ), ( "americas.txt", Some("QmfP6D9bRV4FEYDL4EHZtZG58kDwDfnzmyjuyK5d1pvzbM"), ), ( "australian.txt", Some("QmWEuXAjUGyndgr4MKqMBgzMW36XgPgvitt2jsXgtuc7JE"), ), ("not found", None), // this is not a hamt shard ("01african.txt", None), ]; let mut cache = None; for (segment, link) in &segments { let target = link.map(|link| Cid::try_from(link).unwrap()); let res = resolve(&payload[..], segment, &mut cache); match res { Ok(MaybeResolved::Found(cid)) => assert_eq!(Some(cid), target), Ok(MaybeResolved::NotFound) => { assert!(target.is_none(), "should not have found {segment:?}") } x => panic!("{x:?}"), } } } #[test] fn errors_with_file() { let payload = hex!("0a130802120d666f6f6261720a666f6f626172180d"); // MaybeResolved::NotFound would be a possible answer as well, but this perhaps highlights // that we dont know how to resolve through this resolve(&payload[..], "anything", &mut None).unwrap_err(); } #[test] fn sharded_directory_linking_to_non_sharded() { // created this test case out of doubt that we could fail a traversal as ShardedLookup // expects the linked cids to be hamt shards. However that cannot happen as we only resolve // a single step. let blocks = FakeBlockstore::with_fixtures(); let block = blocks.get_by_str("QmQXUANxYGpkwMTWQUdZBPx9jqfFP7acNgL4FHRWkndKCe"); let next = match resolve(block, "non_sharded_dir", &mut None).unwrap() { MaybeResolved::Found(cid) => cid, x => unreachable!("{:?}", x), }; let block = blocks.get_by_cid(&next); let next = match resolve(block, "foobar", &mut None).unwrap() { MaybeResolved::Found(cid) => cid, x => unreachable!("{:?}", x), }; assert_eq!( &next.to_string(), "QmRgutAxd8t7oGkSm4wmeuByG6M51wcTso6cubDdQtuEfL" ); } } rust-unixfs-0.4.0/src/file/adder.rs000064400000000000000000000735041046102023000153000ustar 00000000000000use libipld::multihash::{self, Multihash}; use libipld::Cid; use crate::pb::{FlatUnixFs, PBLink, UnixFs, UnixFsType}; use alloc::borrow::Cow; use core::fmt; use quick_protobuf::{MessageWrite, Writer}; use sha2::{Digest, Sha256}; /// File tree builder. Implements [`core::default::Default`] which tracks the recent defaults. /// /// Custom file tree builder can be created with [`FileAdder::builder()`] and configuring the /// chunker and collector. /// /// Current implementation maintains an internal buffer for the block creation and uses a /// non-customizable hash function to produce Cid version 0 links. Currently does not support /// inline links. #[derive(Default)] pub struct FileAdder { chunker: Chunker, collector: Collector, block_buffer: Vec, // all unflushed links as a flat vec; this is compacted as we grow and need to create a link // block for the last N blocks, as decided by the collector. 
// FIXME: this is a cause of likely "accidentally quadratic" behavior visible when adding a // large file and using a minimal chunk size. Could be that this must be moved to Collector to // help collector (or layout) to decide how this should be persisted. unflushed_links: Vec, } impl fmt::Debug for FileAdder { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!( fmt, "FileAdder {{ chunker: {:?}, block_buffer: {}/{}, unflushed_links: {} }}", self.chunker, self.block_buffer.len(), self.block_buffer.capacity(), LinkFormatter(&self.unflushed_links), ) } } struct LinkFormatter<'a>(&'a [Link]); impl fmt::Display for LinkFormatter<'_> { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { let mut iter = self.0.iter().peekable(); write!(fmt, "[")?; let mut current = match iter.peek() { Some(Link { depth, .. }) => depth, None => return write!(fmt, "]"), }; let mut count = 0; for Link { depth: next_depth, .. } in iter { if current == next_depth { count += 1; } else { write!(fmt, "{current}: {count}/")?; let steps_between = if current > next_depth { current - next_depth } else { next_depth - current }; for _ in 0..steps_between - 1 { write!(fmt, "0/")?; } count = 1; current = next_depth; } } write!(fmt, "{current}: {count}]") } } /// Represents an intermediate structure which will be serialized into link blocks as both PBLink /// and UnixFs::blocksize. Also holds `depth`, which helps with compaction of the link blocks. struct Link { /// Depth of this link. Zero is leaf, and anything above it is, at least for /// [`BalancedCollector`], the compacted link blocks. depth: usize, /// The link target target: Cid, /// Total size is dag-pb specific part of the link: aggregated size of the linked subtree. total_size: u64, /// File size is the unixfs specific blocksize for this link. In UnixFs link blocks, there is a /// UnixFs::blocksizes item for each link. file_size: u64, } impl fmt::Debug for Link { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { fmt.debug_struct("Link") .field("depth", &self.depth) .field("target", &format_args!("{}", self.target)) .field("total_size", &self.total_size) .field("file_size", &self.file_size) .finish() } } /// Convenience type to facilitate configuring [`FileAdder`]s. #[derive(Default)] pub struct FileAdderBuilder { chunker: Chunker, collector: Collector, } impl FileAdderBuilder { /// Configures the builder to use the given chunker. pub fn with_chunker(self, chunker: Chunker) -> Self { FileAdderBuilder { chunker, ..self } } /// Configures the builder to use the given collector or layout. pub fn with_collector(self, collector: impl Into) -> Self { FileAdderBuilder { collector: collector.into(), ..self } } /// Returns a new FileAdder pub fn build(self) -> FileAdder { let FileAdderBuilder { chunker, collector } = self; FileAdder { chunker, collector, ..Default::default() } } } impl FileAdder { /// Returns a [`FileAdderBuilder`] for creating a non-default FileAdder. pub fn builder() -> FileAdderBuilder { FileAdderBuilder::default() } /// Returns the likely amount of buffering the file adding will work with best. /// /// When using the size based chunker and input larger than or equal to the hint is `push()`'ed /// to the chunker, the internal buffer will not be used. pub fn size_hint(&self) -> usize { self.chunker.size_hint() } /// Called to push new file bytes into the tree builder. /// /// Returns the newly created blocks (at most 2) and their respective Cids, and the amount of /// `input` consumed. 
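    ///
    /// A minimal feeding loop sketch; `store_block` is a hypothetical persistence helper:
    ///
    /// ```ignore
    /// let mut adder = FileAdder::default();
    /// let mut input: &[u8] = b"hello, world!\n";
    /// while !input.is_empty() {
    ///     let (blocks, consumed) = adder.push(input);
    ///     for (cid, block) in blocks {
    ///         store_block(&cid, &block);
    ///     }
    ///     input = &input[consumed..];
    /// }
    /// // finish() flushes the buffered leaf and remaining link blocks; the last block
    /// // produced is the root of the file
    /// for (cid, block) in adder.finish() {
    ///     store_block(&cid, &block);
    /// }
    /// ```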
pub fn push(&mut self, input: &[u8]) -> (impl Iterator)>, usize) { let (accepted, ready) = self.chunker.accept(input, &self.block_buffer); if self.block_buffer.is_empty() && ready { // save single copy as the caller is giving us whole chunks. // // TODO: though, this path does make one question if there is any point in keeping // block_buffer and chunker here; perhaps FileAdder should only handle pre-chunked // blocks and user takes care of chunking (and buffering)? // // cat file | my_awesome_chunker | my_brilliant_collector let leaf = Self::flush_buffered_leaf(accepted, &mut self.unflushed_links, false); assert!(leaf.is_some(), "chunk completed, must produce a new block"); self.block_buffer.clear(); let links = self.flush_buffered_links(false); (leaf.into_iter().chain(links), accepted.len()) } else { // slower path as we manage the buffer. if self.block_buffer.capacity() == 0 { // delay the internal buffer creation until this point, as the caller clearly wants // to use it. self.block_buffer.reserve(self.size_hint()); } self.block_buffer.extend_from_slice(accepted); let written = accepted.len(); let (leaf, links) = if !ready { // a new block did not become ready, which means we couldn't have gotten a new cid. (None, Vec::new()) } else { // a new leaf must be output, as well as possibly a new link block let leaf = Self::flush_buffered_leaf( self.block_buffer.as_slice(), &mut self.unflushed_links, false, ); assert!(leaf.is_some(), "chunk completed, must produce a new block"); self.block_buffer.clear(); let links = self.flush_buffered_links(false); (leaf, links) }; (leaf.into_iter().chain(links), written) } } /// Called after the last [`FileAdder::push`] to finish the tree construction. /// /// Returns a list of Cids and their respective blocks. /// /// Note: the API will hopefully evolve in a direction which will not allocate a new Vec for /// every block in the near-ish future. pub fn finish(mut self) -> impl Iterator)> { let last_leaf = Self::flush_buffered_leaf(&self.block_buffer, &mut self.unflushed_links, true); let root_links = self.flush_buffered_links(true); // should probably error if there is neither? last_leaf.into_iter().chain(root_links) } /// Returns `None` when the input is empty but there are links, otherwise a new Cid and a /// block. fn flush_buffered_leaf( input: &[u8], unflushed_links: &mut Vec, finishing: bool, ) -> Option<(Cid, Vec)> { if input.is_empty() && (!finishing || !unflushed_links.is_empty()) { return None; } // for empty unixfs file the bytes is missing but filesize is present. let data = if !input.is_empty() { Some(Cow::Borrowed(input)) } else { None }; let filesize = Some(input.len() as u64); let inner = FlatUnixFs { links: Vec::new(), data: UnixFs { Type: UnixFsType::File, Data: data, filesize, // no blocksizes as there are no links ..Default::default() }, }; let (cid, vec) = render_and_hash(&inner); let total_size = vec.len(); let link = Link { depth: 0, target: cid, total_size: total_size as u64, file_size: input.len() as u64, }; unflushed_links.push(link); Some((cid, vec)) } fn flush_buffered_links(&mut self, finishing: bool) -> Vec<(Cid, Vec)> { self.collector .flush_links(&mut self.unflushed_links, finishing) } /// Test helper for collecting all of the produced blocks; probably not a good idea outside /// smaller test cases. When `amt` is zero, the whole content is processed at the speed of /// chunker, otherwise `all_content` is pushed at `amt` sized slices with the idea of catching /// bugs in chunkers. 
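    ///
    /// A sketch of the intended test usage (fixture bytes assumed), pushing the whole input at
    /// once and taking the last produced block as the root:
    ///
    /// ```ignore
    /// let blocks = FileAdder::default().collect_blocks(b"foobar\n", 0);
    /// let (root_cid, _root_block) = blocks.last().unwrap();
    /// ```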
#[cfg(test)] fn collect_blocks(mut self, all_content: &[u8], mut amt: usize) -> Vec<(Cid, Vec<u8>)> { let mut written = 0; let mut blocks_received = Vec::new(); if amt == 0 { amt = all_content.len(); } while written < all_content.len() { let end = written + (all_content.len() - written).min(amt); let slice = &all_content[written..end]; let (blocks, pushed) = self.push(slice); blocks_received.extend(blocks); written += pushed; } let last_blocks = self.finish(); blocks_received.extend(last_blocks); blocks_received } } fn render_and_hash(flat: &FlatUnixFs<'_>) -> (Cid, Vec<u8>) { // TODO: as shown in later dagger we don't really need to render the FlatUnixFs fully; we could // either just render a fixed header and continue with the body OR links, though the links are // a bit more complicated. let mut out = Vec::with_capacity(flat.get_size()); let mut writer = Writer::new(&mut out); flat.write_message(&mut writer) .expect("unsure how this could fail"); let mh = Multihash::wrap(multihash::Code::Sha2_256.into(), &Sha256::digest(&out)).unwrap(); let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0"); (cid, out) } /// Chunker strategy #[derive(Debug, Clone, Copy)] pub enum Chunker { /// Size based chunking Size(usize), } impl Default for Chunker { /// Returns a default chunker fn default() -> Self { Chunker::Size(256 * 1024) } } impl Chunker { fn accept<'a>(&mut self, input: &'a [u8], buffered: &[u8]) -> (&'a [u8], bool) { use Chunker::*; match self { Size(max) => { let l = input.len().min(*max - buffered.len()); let accepted = &input[..l]; let ready = buffered.len() + l >= *max; (accepted, ready) } } } fn size_hint(&self) -> usize { use Chunker::*; match self { Size(max) => *max, } } } /// Collector or layout strategy. For more information, see the [Layout section of the spec]. /// Currently only the default balanced collector/layout has been implemented. /// /// [Layout section of the spec]: https://github.com/ipfs/specs/blob/master/UNIXFS.md#layout #[derive(Debug, Clone)] pub enum Collector { /// Balanced trees. Balanced(BalancedCollector), } impl Default for Collector { fn default() -> Self { Collector::Balanced(Default::default()) } } impl Collector { fn flush_links(&mut self, pending: &mut Vec<Link>, finishing: bool) -> Vec<(Cid, Vec<u8>)> { use Collector::*; match self { Balanced(bc) => bc.flush_links(pending, finishing), } } } /// BalancedCollector creates balanced UnixFs trees, optimized for random access to different /// parts of the file. Currently only the link count threshold, that is, the branching factor, /// can be configured. #[derive(Clone)] pub struct BalancedCollector { branching_factor: usize, // reused between link block generation reused_links: Vec<PBLink<'static>>, // reused between link block generation reused_blocksizes: Vec<u64>, } impl fmt::Debug for BalancedCollector { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!( fmt, "BalancedCollector {{ branching_factor: {} }}", self.branching_factor ) } } impl Default for BalancedCollector { /// Returns a default collector which matches the go-ipfs 0.6 default. /// /// The origin for 174 is not described in the [specs], but it likely has something to do /// with being "good enough" regarding prefetching when reading, and it allows reusing some of /// the link blocks if parts of a longer file change. 
/// /// [specs]: https://github.com/ipfs/specs/blob/master/UNIXFS.md fn default() -> Self { Self::with_branching_factor(174) } } impl From for Collector { fn from(b: BalancedCollector) -> Self { Collector::Balanced(b) } } impl BalancedCollector { /// Configure Balanced collector with the given branching factor. pub fn with_branching_factor(branching_factor: usize) -> Self { assert!(branching_factor > 0); Self { branching_factor, reused_links: Vec::new(), reused_blocksizes: Vec::new(), } } /// In-place compression of the `pending` links to a balanced hierarchy. When `finishing`, the /// links will be compressed iteratively from the lowest level to produce a single root link /// block. fn flush_links(&mut self, pending: &mut Vec, finishing: bool) -> Vec<(Cid, Vec)> { /* file |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -| links-0 |-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|E|F|G| links-1 |-------|-------|-------|-------|-B-----|-C-----|-D-----|\ / links-2 |-A-----------------------------| ^^^ ^ one short \--- link.depth pending [A, B, C, D, E, F, G] #flush_buffered_links(...) first iteration: file |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -| links-0 |-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|E|F|G| links-1 |-------|-------|-------|-------|-B-----|-C-----|-D-----|=#1==| links-2 |-A-----------------------------| pending [A, B, C, D, E, F, G] => [A, B, C, D, 1] new link block #1 is created for E, F, and G. #flush_buffered_links(...) second iteration: file |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -| links-0 |-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| links-1 |-------|-------|-------|-------|-B-----|-C-----|-D-----|-#1--| links-2 |-A-----------------------------|=========================#2==| pending [A, B, C, D, 1] => [A, 2] new link block #2 is created for B, C, D, and #1. #flush_buffered_links(...) last iteration: file |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -| links-0 |-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| links-1 |-------|-------|-------|-------|-------|-------|-------|-#1--| links-2 |-A-----------------------------|-------------------------#2--| links-3 |=========================================================#3==| pending [A, 2] => [3] new link block #3 is created for A, and #2. (the root block) */ let mut ret = Vec::new(); let mut reused_links = core::mem::take(&mut self.reused_links); let mut reused_blocksizes = core::mem::take(&mut self.reused_blocksizes); if let Some(need) = self.branching_factor.checked_sub(reused_links.capacity()) { reused_links.reserve(need); } if let Some(need) = self .branching_factor .checked_sub(reused_blocksizes.capacity()) { reused_blocksizes.reserve(need); } 'outer: for level in 0.. { if pending.len() == 1 && finishing || pending.len() <= self.branching_factor && !finishing { // when there is just a single linking block left and we are finishing, we are // done. It might not be part of the `ret` as will be the case with single chunk // files for example. // // normally when not finishing we do nothing if we don't have enough links. break; } // when finishing, we iterate the level to completion in blocks of // self.branching_factor and *insert* values at the offset of the first compressed // link. on following iterations this will be the index after the higher level index. let mut starting_point = 0; // when creating the link blocks, start overwriting the pending links at the first // found link for this depth. 
this index will be incremented for successive link // blocks. let mut last_overwrite = None; while let Some(mut first_at) = &pending[starting_point..] .iter() .position(|Link { depth, .. }| depth == &level) { // fix first_at as absolute index from being possible relative to the // starting_point first_at += starting_point; if !finishing && pending[first_at..].len() <= self.branching_factor { if let Some(last_overwrite) = last_overwrite { // drain any processed pending.drain((last_overwrite + 1)..first_at); } break 'outer; } reused_links.clear(); reused_blocksizes.clear(); let mut nested_size = 0; let mut nested_total_size = 0; let last = (first_at + self.branching_factor).min(pending.len()); for (index, link) in pending[first_at..last].iter().enumerate() { assert_eq!( link.depth, level, "unexpected link depth {} when searching at level {} index {}", link.depth, level, index + first_at ); Self::partition_link( link, &mut reused_links, &mut reused_blocksizes, &mut nested_size, &mut nested_total_size, ); } debug_assert_eq!(reused_links.len(), reused_blocksizes.len()); let inner = FlatUnixFs { links: reused_links, data: UnixFs { Type: UnixFsType::File, filesize: Some(nested_size), blocksizes: reused_blocksizes, ..Default::default() }, }; let (cid, vec) = render_and_hash(&inner); // start overwriting at the first index of this level, then continue forward on // next iterations. let index = last_overwrite.map(|i| i + 1).unwrap_or(first_at); pending[index] = Link { depth: level + 1, target: cid, total_size: nested_total_size + vec.len() as u64, file_size: nested_size, }; ret.push((cid, vec)); reused_links = inner.links; reused_blocksizes = inner.data.blocksizes; starting_point = last; last_overwrite = Some(index); } if let Some(last_overwrite) = last_overwrite { pending.truncate(last_overwrite + 1); } // this holds regardless of finishing; we would had broken 'outer had there been less // than full blocks left. debug_assert_eq!( pending.iter().position(|l| l.depth == level), None, "should have no more of depth {}: {}", level, LinkFormatter(pending.as_slice()) ); } self.reused_links = reused_links; self.reused_blocksizes = reused_blocksizes; ret } /// Each link needs to be partitioned into the four mut arguments received by this function in /// order to produce the expected UnixFs output. 
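    ///
    /// As a concrete sketch: a `Link { total_size: 10, file_size: 7, .. }` contributes a
    /// `PBLink` with `Tsize: Some(10)` and an empty name, pushes `7` to `blocksizes`, and
    /// advances `nested_size` by `7` and `nested_total_size` by `10`.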
fn partition_link( link: &Link, links: &mut Vec>, blocksizes: &mut Vec, nested_size: &mut u64, nested_total_size: &mut u64, ) { links.push(PBLink { Hash: Some(link.target.to_bytes().into()), Name: Some("".into()), Tsize: Some(link.total_size), }); blocksizes.push(link.file_size); *nested_size += link.file_size; *nested_total_size += link.total_size; } } #[cfg(test)] mod tests { use super::{BalancedCollector, Chunker, FileAdder}; use crate::test_support::FakeBlockstore; use core::convert::TryFrom; use hex_literal::hex; use libipld::Cid; #[test] fn test_size_chunker() { assert_eq!(size_chunker_scenario(1, 4, 0), (1, true)); assert_eq!(size_chunker_scenario(2, 4, 0), (2, true)); assert_eq!(size_chunker_scenario(2, 1, 0), (1, false)); assert_eq!(size_chunker_scenario(2, 1, 1), (1, true)); assert_eq!(size_chunker_scenario(32, 3, 29), (3, true)); // this took some debugging time: assert_eq!(size_chunker_scenario(32, 4, 29), (3, true)); } fn size_chunker_scenario(max: usize, input_len: usize, existing_len: usize) -> (usize, bool) { let input = vec![0; input_len]; let existing = vec![0; existing_len]; let (accepted, ready) = Chunker::Size(max).accept(&input, &existing); (accepted.len(), ready) } #[test] fn favourite_single_block_file() { let blocks = FakeBlockstore::with_fixtures(); // everyones favourite content let content = b"foobar\n"; let mut adder = FileAdder::default(); { let (mut ready_blocks, bytes) = adder.push(content); assert!(ready_blocks.next().is_none()); assert_eq!(bytes, content.len()); } // real impl would probably hash this ... except maybe hashing is faster when done inline? // or maybe not let (_, file_block) = adder .finish() .next() .expect("there must have been the root block"); assert_eq!( blocks.get_by_str("QmRgutAxd8t7oGkSm4wmeuByG6M51wcTso6cubDdQtuEfL"), file_block.as_slice() ); } #[test] fn favourite_multi_block_file() { // root should be QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6 let blocks = FakeBlockstore::with_fixtures(); let content = b"foobar\n"; let adder = FileAdder::builder().with_chunker(Chunker::Size(2)).build(); let blocks_received = adder.collect_blocks(content, 0); // the order here is "fo", "ob", "ar", "\n", root block // while verifying the root Cid would be *enough* this is easier to eyeball, ... not really // that much but ... let expected = [ "QmfVyMoStzTvdnUR7Uotzh82gmL427q9z3xW5Y8fUoszi4", "QmdPyW4CWE3QBkgjWfjM5f7Tjb3HukxVuBXZtkqAGwsMnm", "QmNhDQpphvMWhdCzP74taRzXDaEfPGq8vWfFRzD7mEgePM", "Qmc5m94Gu7z62RC8waSKkZUrCCBJPyHbkpmGzEePxy2oXJ", "QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6", ] .iter() .map(|key| { let cid = Cid::try_from(*key).unwrap(); let block = blocks.get_by_str(key).to_vec(); (cid, block) }) .collect::>(); assert_eq!(blocks_received, expected); } #[test] fn three_layers() { let content = b"Lorem ipsum dolor sit amet, sit enim montes aliquam. Cras non lorem, \ rhoncus condimentum, irure et ante. 
Pulvinar suscipit odio ante, et tellus a enim, \ wisi ipsum, vel rhoncus eget faucibus varius, luctus turpis nibh vel odio nulla pede."; assert!(content.len() > 174 && content.len() < 2 * 174); // go-ipfs 0.5 result: QmRQ6NZNUs4JrCT2y7tmCC1wUhjqYuTssB8VXbbN3rMffg, 239 blocks and root // root has two links: // - QmXUcuLGKc8SCMEqG4wgct6NKsSRZQfvB2FCfjDow1PfpB (174 links) // - QmeEn8dxWTzGAFKvyXoLj4oWbh9putL4vSw4uhLXJrSZhs (63 links) // // in future, if we ever add inline Cid generation this test would need to be changed not // to use those inline cids or raw leaves let adder = FileAdder::builder().with_chunker(Chunker::Size(1)).build(); let blocks_received = adder.collect_blocks(content, 0); assert_eq!(blocks_received.len(), 240); assert_eq!( blocks_received.last().unwrap().0.to_string(), "QmRQ6NZNUs4JrCT2y7tmCC1wUhjqYuTssB8VXbbN3rMffg" ); } #[test] fn three_layers_all_subchunks() { let content = b"Lorem ipsum dolor sit amet, sit enim montes aliquam. Cras non lorem, \ rhoncus condimentum, irure et ante. Pulvinar suscipit odio ante, et tellus a enim, \ wisi ipsum, vel rhoncus eget faucibus varius, luctus turpis nibh vel odio nulla pede."; for amt in 1..32 { let adder = FileAdder::builder().with_chunker(Chunker::Size(32)).build(); let blocks_received = adder.collect_blocks(content, amt); assert_eq!( blocks_received.last().unwrap().0.to_string(), "QmYSLcVQqxKygiq7x9w1XGYxU29EShB8ZemiaQ8GAAw17h", "amt: {amt}" ); } } #[test] fn empty_file() { let blocks = FileAdder::default().collect_blocks(b"", 0); assert_eq!(blocks.len(), 1); // 0a == field dag-pb body (unixfs) // 04 == dag-pb body len, varint, 4 bytes // 08 == field type tag, varint, 1 byte // 02 == field type (File) // 18 == field filesize tag, varint // 00 == filesize, varint, 1 byte assert_eq!(blocks[0].1.as_slice(), &hex!("0a 04 08 02 18 00")); assert_eq!( blocks[0].0.to_string(), "QmbFMke1KXqnYyBBWxB74N4c5SBnJMVAiMNRcGu6x1AwQH" ); } #[test] fn full_link_block_and_a_byte() { let buf = vec![0u8; 2]; // this should produce a root with two links // +----------^---+ // | | // |----------------------| |-| <-- link blocks // ^^^^^^^^^^^^^^^^^^^^^^ ^ // 174 blocks \--- 1 block let branching_factor = 174; let mut adder = FileAdder::builder() .with_chunker(Chunker::Size(2)) .with_collector(BalancedCollector::with_branching_factor(branching_factor)) .build(); let mut blocks_count = 0; for _ in 0..branching_factor { let (blocks, written) = adder.push(buf.as_slice()); assert_eq!(written, buf.len()); blocks_count += blocks.count(); } let (blocks, written) = adder.push(&buf[0..1]); assert_eq!(written, 1); blocks_count += blocks.count(); let last_blocks = adder.finish().collect::>(); blocks_count += last_blocks.len(); // chunks == 174 // one link block for 174 // one is for the single byte block // one is a link block for the singular single byte block // other is for the root block assert_eq!(blocks_count, branching_factor + 1 + 1 + 1 + 1); assert_eq!( last_blocks.last().unwrap().0.to_string(), "QmcHNWF1d56uCDSfJPA7t9fadZRV9we5HGSTGSmwuqmMP9" ); } #[test] fn full_link_block() { let buf = vec![0u8; 1]; let branching_factor = 174; let mut adder = FileAdder::builder() .with_chunker(Chunker::Size(1)) .with_collector(BalancedCollector::with_branching_factor(branching_factor)) .build(); let mut blocks_count = 0; for _ in 0..branching_factor { let (blocks, written) = adder.push(buf.as_slice()); assert_eq!(written, buf.len()); blocks_count += blocks.count(); } let mut last_blocks = adder.finish(); // go-ipfs waits until finish to get a single link block, no 
additional root block let last_block = last_blocks.next().expect("must not have flushed yet"); blocks_count += 1; assert_eq!(last_blocks.next(), None); assert_eq!( last_block.0.to_string(), "QmdgQac8c6Bo3MP5bHAg2yQ25KebFUsmkZFvyByYzf8UCB" ); assert_eq!(blocks_count, 175); } } rust-unixfs-0.4.0/src/file/reader.rs000064400000000000000000000273651046102023000154660ustar 00000000000000use crate::pb::{FlatUnixFs, PBLink, RangeLinks, UnixFsType}; use core::convert::TryFrom; use core::fmt; use core::ops::Range; use crate::file::{FileError, FileReadFailed, Metadata, UnwrapBorrowedExt}; /// Navigates the UnixFs files, which are either: /// - single block files which have everything needed to read all of the contents /// - multi block files which have trees of trees until Raw leaf blocks /// /// The trees can have different shapes but it doesn't really matter for our depth-first approach. /// For seeking, each sub-tree linking node has blocksizes for the linked trees, describing which /// of the original file offsets are covered by each tree. /// /// A file doesn't know its name. It only has a name when part of a directory, and then the name /// is on a PBLink::Name. With UnixFs the names are always UTF-8. The root CID is not interesting /// either: we just need the root block. pub struct FileReader<'a> { offset: u64, end: Ending, links: Vec<PBLink<'a>>, data: &'a [u8], blocksizes: Vec<u64>, metadata: Metadata, file_size: u64, } impl AsRef<Metadata> for FileReader<'_> { fn as_ref(&self) -> &Metadata { &self.metadata } } // TODO: this could be Range<u64> ... It just seemed that there are "two kinds" of endings but in // reality these are closer to two kinds of ranges or spans. #[derive(Debug)] enum Ending { /// The block represented a subtree without actual content TreeCoverage(u64), /// The block represented a leaf with actual content Chunk(u64), } impl Ending { /// Checks whether the next range is suitable to be processed next. fn check_is_suitable_next(&self, offset: u64, next: &Range<u64>) -> Result<(), FileError> { match self { Ending::TreeCoverage(cover_end) if next.start <= offset && &next.end > cover_end => { // tree must be collapsing; we can't have the root be some smaller *file* range // than the child Err(FileError::TreeExpandsOnLinks) } Ending::TreeCoverage(cover_end) if &next.start < cover_end && &next.end > cover_end => { // when moving to a sibling at the same height or above, its coverage must start // from where we stopped // // This has been separated instead of making the TreeExpandsOnLinks more general as // this might be a reasonable way with unixfs to reuse lower trees but no such // example has been found at least. Err(FileError::TreeOverlapsBetweenLinks) } Ending::TreeCoverage(_) if next.start < offset => Err(FileError::EarlierLink), Ending::Chunk(chunk_end) if &next.start != chunk_end => { // when continuing on from a leaf node to either a tree above or the next chunk, // the next must continue from where we stopped Err(FileError::TreeJumpsBetweenLinks) } _ => Ok(()), } } } impl<'a> FileReader<'a> { /// Method for starting the file traversal. `data` is the raw data from a unixfs block. pub fn from_block(data: &'a [u8]) -> Result<Self, FileReadFailed> { let inner = FlatUnixFs::try_from(data)?; let metadata = Metadata::from(&inner.data); Self::from_parts(inner, 0, metadata) } pub(crate) fn from_parsed(inner: FlatUnixFs<'a>) -> Result<Self, FileReadFailed> { let metadata = Metadata::from(&inner.data); Self::from_parts(inner, 0, metadata) } /// Called by `Traversal` to continue a file tree traversal. 
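    ///
    /// Unlike the constructors used for the root block, this rejects blocks which define `mode`
    /// or `mtime`: only the root of a file may carry metadata.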
fn from_continued( traversal: Traversal, offset: u64, data: &'a [u8], ) -> Result { let inner = FlatUnixFs::try_from(data)?; if inner.data.mode.is_some() || inner.data.mtime.is_some() { let metadata = Metadata::from(&inner.data); return Err(FileError::NonRootDefinesMetadata(metadata).into()); } Self::from_parts(inner, offset, traversal.metadata) } fn from_parts( inner: FlatUnixFs<'a>, offset: u64, metadata: Metadata, ) -> Result { let empty_or_no_content = inner .data .Data .as_ref() .map(|cow| cow.as_ref().is_empty()) .unwrap_or(true); let is_zero_bytes = inner.data.filesize.unwrap_or(0) == 0; if inner.data.Type != UnixFsType::File && inner.data.Type != UnixFsType::Raw { Err(FileReadFailed::UnexpectedType(inner.data.Type.into())) } else if inner.links.len() != inner.data.blocksizes.len() { Err(FileError::LinksAndBlocksizesMismatch.into()) } else if empty_or_no_content && !is_zero_bytes && inner.links.is_empty() { Err(FileError::NoLinksNoContent.into()) } else { // raw and file seem to be same except the raw is preferred in trickle dag let data = inner.data.Data.unwrap_borrowed_or_empty(); if inner.data.hashType.is_some() || inner.data.fanout.is_some() { return Err(FileError::UnexpectedRawOrFileProperties { hash_type: inner.data.hashType, fanout: inner.data.fanout, } .into()); } let end = if inner.links.is_empty() { // can unwrap because `data` is all of the data let filesize = inner.data.filesize.unwrap_or(data.len() as u64); Ending::Chunk(offset + filesize) } else { match inner.data.filesize { Some(filesize) => Ending::TreeCoverage(offset + filesize), None => return Err(FileError::IntermediateNodeWithoutFileSize.into()), } }; Ok(Self { offset, end, links: inner.links, data, blocksizes: inner.data.blocksizes, metadata, file_size: inner.data.filesize.unwrap(), }) } } /// Returns a moved tuple of the content (bytes or links) and a traversal, which can be used to /// continue the traversal from the next block. pub fn content( self, ) -> ( FileContent<'a, impl Iterator, Range)>>, Traversal, ) { let traversal = Traversal { last_ending: self.end, last_offset: self.offset, metadata: self.metadata, file_size: self.file_size, }; let fc = if self.links.is_empty() { FileContent::Bytes(self.data) } else { let zipped = self.links.into_iter().zip(self.blocksizes); FileContent::Links(RangeLinks::from_links_and_blocksizes( zipped, Some(self.offset), )) }; (fc, traversal) } } /// Carrier of validation data used between blocks during a walk on the merkle tree. #[derive(Debug)] pub struct Traversal { last_ending: Ending, last_offset: u64, file_size: u64, metadata: Metadata, } impl Traversal { /// Continues the walk on the merkle tree with the given block contents. The block contents is /// not validated and the range is expected to be the next from previous call to /// FileContent::Links iterator. /// /// When calling this directly, it is good to note that repeatedly calling this with the same /// block contents will not be detected, and will instead grow the internal Vec of links until /// memory runs out. pub fn continue_walk<'a>( self, next_block: &'a [u8], tree_range: &Range, ) -> Result, FileReadFailed> { self.last_ending .check_is_suitable_next(self.last_offset, tree_range)?; FileReader::from_continued(self, tree_range.start, next_block) } /// Returns the total size of the file. 
pub fn file_size(&self) -> u64 { self.file_size } } impl AsRef for Traversal { fn as_ref(&self) -> &Metadata { &self.metadata } } /// Files in unixfs merkle trees can either contain content of the file, or can contain links to /// other parts of the tree. pub enum FileContent<'a, I> where I: Iterator, Range)> + 'a, { /// When reaching the leaf level of a DAG we finally find the actual content. For empty files /// without content this will be an empty slice. Bytes(&'a [u8]), /// The content of the file is spread over a number of blocks; iteration must follow from index /// depth-first from the first link to reach the given the bytes in the given byte offset /// range. Links(I), } #[cfg(test)] impl<'a, I> FileContent<'a, I> where I: Iterator, Range)>, { /// Returns the content as bytes, or panics if there were links instead. pub fn unwrap_content(self) -> &'a [u8] { match self { FileContent::Bytes(x) => x, y => panic!("Expected FileContent::Bytes, found: {y:?}"), } } } impl<'a, I> fmt::Debug for FileContent<'a, I> where I: Iterator, Range)>, { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self { FileContent::Bytes(bytes) => write!(fmt, "Bytes({} bytes)", bytes.len()), FileContent::Links(iter) => write!(fmt, "Links({:?})", iter.size_hint()), } } } #[cfg(test)] mod tests { use super::Ending; use crate::file::FileError; #[test] fn collapsing_tree() { // this is pretty much how I planned the ending might be useful but it's perhaps a bit // confusing as it's only the half of the range Ending::TreeCoverage(100) .check_is_suitable_next(0, &(0..100)) .unwrap(); Ending::TreeCoverage(100) .check_is_suitable_next(0, &(0..10)) .unwrap(); Ending::TreeCoverage(100) .check_is_suitable_next(0, &(0..2)) .unwrap(); Ending::Chunk(2) .check_is_suitable_next(0, &(2..10)) .unwrap(); Ending::TreeCoverage(10) .check_is_suitable_next(2, &(2..10)) .unwrap(); Ending::TreeCoverage(10) .check_is_suitable_next(2, &(10..20)) .unwrap(); Ending::Chunk(10) .check_is_suitable_next(2, &(10..100)) .unwrap(); } #[test] fn expanding_tree() { let res = Ending::TreeCoverage(100).check_is_suitable_next(10, &(0..102)); assert!(matches!(res, Err(FileError::TreeExpandsOnLinks)), "{res:?}"); let res = Ending::TreeCoverage(100).check_is_suitable_next(0, &(0..102)); assert!(matches!(res, Err(FileError::TreeExpandsOnLinks)), "{res:?}"); } #[test] fn overlap() { let res = Ending::TreeCoverage(100).check_is_suitable_next(10, &(88..102)); assert!( matches!(res, Err(FileError::TreeOverlapsBetweenLinks)), "{res:?}" ); } #[test] fn hole() { let res = Ending::Chunk(100).check_is_suitable_next(0, &(101..105)); assert!( matches!(res, Err(FileError::TreeJumpsBetweenLinks)), "{res:?}" ); } #[test] fn wrong_next() { let res = Ending::TreeCoverage(200).check_is_suitable_next(100, &(0..100)); assert!(matches!(res, Err(FileError::EarlierLink)), "{res:?}"); let res = Ending::TreeCoverage(101).check_is_suitable_next(100, &(0..100)); assert!(matches!(res, Err(FileError::EarlierLink)), "{res:?}"); let res = Ending::TreeCoverage(100).check_is_suitable_next(100, &(0..100)); assert!(matches!(res, Err(FileError::EarlierLink)), "{res:?}"); } } rust-unixfs-0.4.0/src/file/visit.rs000064400000000000000000000262601046102023000153540ustar 00000000000000use core::convert::TryFrom; use core::ops::Range; use libipld::Cid; use crate::file::reader::{FileContent, FileReader, Traversal}; use crate::file::{FileReadFailed, Metadata}; use crate::pb::{merkledag::PBLink, FlatUnixFs}; use crate::InvalidCidInLink; /// IdleFileVisit represents a prepared file 
visit over a tree. The user has to know the CID and be /// able to get the block for the visit. /// /// **Note**: For easier to use interface, you should consider using `ipfs_unixfs::walk::Walker`. /// It uses `IdleFileVisit` and `FileVisit` internally but has a better API. #[derive(Default, Debug)] pub struct IdleFileVisit { range: Option>, } type FileVisitResult<'a> = (&'a [u8], u64, Metadata, Option); impl IdleFileVisit { /// Target range represents the target byte range of the file we are interested in visiting. pub fn with_target_range(self, range: Range) -> Self { Self { range: Some(range) } } /// Begins the visitation by processing the first block to be visited. /// /// Returns (on success) a tuple of file bytes, total file size, any metadata associated, and /// optionally a `FileVisit` to continue the walk. pub fn start(self, block: &'_ [u8]) -> Result, FileReadFailed> { let fr = FileReader::from_block(block)?; self.start_from_reader(fr, &mut None) } pub(crate) fn start_from_parsed<'a>( self, block: FlatUnixFs<'a>, cache: &'_ mut Option, ) -> Result, FileReadFailed> { let fr = FileReader::from_parsed(block)?; self.start_from_reader(fr, cache) } fn start_from_reader<'a>( self, fr: FileReader<'a>, cache: &'_ mut Option, ) -> Result, FileReadFailed> { let metadata = fr.as_ref().to_owned(); let (content, traversal) = fr.content(); match content { FileContent::Bytes(content) => { let block = 0..content.len() as u64; let content = maybe_target_slice(content, &block, self.range.as_ref()); Ok((content, traversal.file_size(), metadata, None)) } FileContent::Links(iter) => { // we need to select suitable here let mut links = cache.take().unwrap_or_default().inner; let pending = iter.enumerate().filter_map(|(i, (link, range))| { if !block_is_in_target_range(&range, self.range.as_ref()) { return None; } Some(to_pending(i, link, range)) }); for item in pending { links.push(item?); } // order is reversed to consume them in the depth first order links.reverse(); if links.is_empty() { *cache = Some(links.into()); Ok((&[][..], traversal.file_size(), metadata, None)) } else { Ok(( &[][..], traversal.file_size(), metadata, Some(FileVisit { pending: links, state: traversal, range: self.range, }), )) } } } } } /// Optional cache for datastructures which can be re-used without re-allocation between walks of /// different files. #[derive(Default)] pub struct Cache { inner: Vec<(Cid, Range)>, } impl From)>> for Cache { fn from(mut inner: Vec<(Cid, Range)>) -> Self { inner.clear(); Cache { inner } } } /// FileVisit represents an ongoing visitation over an UnixFs File tree. /// /// The file visitor does **not** implement size validation of merkledag links at the moment. This /// could be implmented with generational storage and it would require an u64 per link. /// /// **Note**: For easier to use interface, you should consider using `ipfs_unixfs::walk::Walker`. /// It uses `IdleFileVisit` and `FileVisit` internally but has a better API. #[derive(Debug)] pub struct FileVisit { /// The internal cache for pending work. Order is such that the next is always the last item, /// so it can be popped. This currently does use a lot of memory for very large files. /// /// One workaround would be to transform excess links to relative links to some block of a Cid. pending: Vec<(Cid, Range)>, /// Target range, if any. Used to filter the links so that we will only visit interesting /// parts. range: Option>, state: Traversal, } impl FileVisit { /// Access hashes of all pending links for prefetching purposes. 
The block for the first item /// returned by this method is the one which needs to be processed next with `continue_walk`. /// /// Returns tuple of the next Cid which needs to be processed and an iterator over the /// remaining. pub fn pending_links(&self) -> (&Cid, impl Iterator) { let mut iter = self.pending.iter().rev().map(|(link, _)| link); let first = iter .next() .expect("the presence of links has been validated"); (first, iter) } /// Continues the walk with the data for the first `pending_link` key. /// /// Returns on success a tuple of bytes and new version of `FileVisit` to continue the visit, /// when there is something more to visit. pub fn continue_walk<'a>( mut self, next: &'a [u8], cache: &mut Option, ) -> Result<(&'a [u8], Option), FileReadFailed> { let traversal = self.state; let (_, range) = self .pending .pop() .expect("User called continue_walk there must have been a next link"); // interesting, validation doesn't trigger if the range is the same? let fr = traversal.continue_walk(next, &range)?; let (content, traversal) = fr.content(); match content { FileContent::Bytes(content) => { let content = maybe_target_slice(content, &range, self.range.as_ref()); if !self.pending.is_empty() { self.state = traversal; Ok((content, Some(self))) } else { *cache = Some(self.pending.into()); Ok((content, None)) } } FileContent::Links(iter) => { let before = self.pending.len(); for (i, (link, range)) in iter.enumerate() { if !block_is_in_target_range(&range, self.range.as_ref()) { continue; } self.pending.push(to_pending(i, link, range)?); } // reverse to keep the next link we need to traverse as last, where pop() operates. self.pending[before..].reverse(); self.state = traversal; Ok((&[][..], Some(self))) } } } /// Returns the total size of the file in bytes. pub fn file_size(&self) -> u64 { self.state.file_size() } } impl AsRef for FileVisit { fn as_ref(&self) -> &Metadata { self.state.as_ref() } } fn to_pending( nth: usize, link: PBLink<'_>, range: Range, ) -> Result<(Cid, Range), FileReadFailed> { let hash = link.Hash.as_deref().unwrap_or_default(); match Cid::try_from(hash) { Ok(cid) => Ok((cid, range)), Err(e) => Err(FileReadFailed::InvalidCid(InvalidCidInLink::from(( nth, link, e, )))), } } /// Returns true if the blocks byte offsets are interesting for our target range, false otherwise. /// If there is no target, all blocks are of interest. fn block_is_in_target_range(block: &Range, target: Option<&Range>) -> bool { use core::cmp::{max, min}; if let Some(target) = target { max(block.start, target.start) <= min(block.end, target.end) } else { true } } /// Whenever we propagate the content from the tree upwards, we need to make sure it's inside the /// range we were originally interested in. fn maybe_target_slice<'a>( content: &'a [u8], block: &Range, target: Option<&Range>, ) -> &'a [u8] { if let Some(target) = target { target_slice(content, block, target) } else { content } } fn target_slice<'a>(content: &'a [u8], block: &Range, target: &Range) -> &'a [u8] { use core::cmp::min; if !block_is_in_target_range(block, Some(target)) { // defaulting to empty slice is good, and similar to the "cat" HTTP API operation. &[][..] 
} else { let start; let end; // FIXME: these must have bugs and must be possible to simplify if target.start < block.start { // we mostly need something before start = 0; end = (min(target.end, block.end) - block.start) as usize; } else if target.end > block.end { // we mostly need something after start = (target.start - block.start) as usize; end = (min(target.end, block.end) - block.start) as usize; } else { // inside start = (target.start - block.start) as usize; end = start + (target.end - target.start) as usize; } &content[start..end] } } #[cfg(test)] mod tests { use super::target_slice; #[test] #[allow(clippy::type_complexity)] fn slice_for_target() { use core::ops::Range; // turns out these examples are not easy to determine at all // writing out the type here avoids &b""[..] inside the array. let cases: &[(&[u8], u64, Range, &[u8])] = &[ // xxxx xxxx cont ent_ // ^^^^ ^^^^ (b"content_", 8, 0..8, b""), // xxxx xxxx cont ent_ // ^^^^ ^^^^ ^ (b"content_", 8, 0..9, b"c"), // xxxx xxxx cont ent_ // ^^^ ^^^^ ^^^^ ^^^^ ... (b"content_", 8, 1..20, b"content_"), // xxxx xxxx cont ent_ // ^ ^^^^ ^^^^ ... (b"content_", 8, 7..20, b"content_"), // xxxx xxxx cont ent_ // ^^^^ ^^^^ ... (b"content_", 8, 8..20, b"content_"), // xxxx xxxx cont ent_ // ^^^ ^^^^ ... (b"content_", 8, 9..20, b"ontent_"), // xxxx xxxx cont ent_ // ^ ... (b"content_", 8, 15..20, b"_"), // xxxx xxxx cont ent_ yyyy // ^^^^ (b"content_", 8, 16..20, b""), ]; for (block_data, block_offset, target_range, expected) in cases { let block_range = *block_offset..(block_offset + block_data.len() as u64); let sliced = target_slice(block_data, &block_range, target_range); assert_eq!( sliced, *expected, "slice {target_range:?} of block {block_range:?}" ); } } } rust-unixfs-0.4.0/src/file.rs000064400000000000000000000266511046102023000142220ustar 00000000000000//! UnixFS file support. //! //! The module provides low-level File tree visitor support and file importing support. Note: The //! [`ipfs_unixfs::walk::Walker`] should typically be used for accessing file content. use crate::pb::ParsingFailed; use crate::{InvalidCidInLink, Metadata, UnexpectedNodeType}; use alloc::borrow::Cow; use core::fmt; /// Low level UnixFS file descriptor reader support. mod reader; /// Mid level API for visiting the file tree. pub mod visit; /// File adder capable of constructing UnixFs v1 trees pub mod adder; /// Describes the errors which can happen during a visit or lower level block-by-block walking of /// the DAG. #[derive(Debug)] pub enum FileReadFailed { /// Unsupported UnixFs file; these might exist, but currently there are no workarounds for /// handling them. File(FileError), /// FileReader can only process raw or file type of unixfs content. // This is the raw value instead of the enum by design not to expose the quick-protobuf types UnexpectedType(UnexpectedNodeType), /// Parsing failed Read(Option), /// Link could not be turned into Cid. 
InvalidCid(InvalidCidInLink), } impl fmt::Display for FileReadFailed { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { use FileReadFailed::*; match self { File(e) => write!(fmt, "{e}"), UnexpectedType(ut) => write!(fmt, "unexpected type for UnixFs: {ut:?}"), Read(Some(e)) => write!(fmt, "reading failed: {e}"), Read(None) => write!(fmt, "reading failed: missing UnixFS message"), InvalidCid(e) => write!(fmt, "{e}"), } } } impl std::error::Error for FileReadFailed { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { use FileReadFailed::*; match self { InvalidCid(e) => Some(e), Read(Some(e)) => Some(e), _ => None, } } } impl<'a> From<ParsingFailed<'a>> for FileReadFailed { fn from(e: ParsingFailed<'a>) -> Self { use ParsingFailed::*; match e { InvalidDagPb(e) => FileReadFailed::Read(Some(e)), InvalidUnixFs(e, _) => FileReadFailed::Read(Some(e)), NoData(_) => FileReadFailed::Read(None), } } } /// Errors which can happen while processing UnixFS type File or Raw blocks. #[derive(Debug, PartialEq, Eq)] pub enum FileError { /// There are unequal numbers of links and blocksizes, and thus the file ranges for linked /// trees or blocks cannot be determined. LinksAndBlocksizesMismatch, /// The filesize is non-zero although the block has neither links nor content. NoLinksNoContent, /// Unsupported: non-root block defines metadata. NonRootDefinesMetadata(Metadata), /// A non-leaf node in the tree has no filesize value, which is used to determine the file /// range for this tree. IntermediateNodeWithoutFileSize, /// The tree or merkle dag should only collapse or stay the same length. TreeExpandsOnLinks, /// The tree links contain overlapping file segments. This is at least unsupported right now, /// but the larger segment could be collapsed down to the reused part. TreeOverlapsBetweenLinks, /// Reader has been fed a link to an earlier range. EarlierLink, /// The tree links contain a hole from a file segment to the next tree. This is at least /// unsupported right now. Zeroes could be generated for the hole. TreeJumpsBetweenLinks, /// These values should not be present for unixfs blocks with type File or Raw. If they have a /// valid meaning, support for such has not been implemented. UnexpectedRawOrFileProperties { /// Hash type, as read from the protobuf descriptor; should only be used with HAMT /// directories. hash_type: Option<u64>, /// Fan out, as read from the protobuf descriptor; should only be used with HAMT /// directories. fanout: Option<u64>, }, } impl fmt::Display for FileError { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { use FileError::*; match self { LinksAndBlocksizesMismatch => write!( fmt, "different number of links and blocksizes: cannot determine subtree ranges" ), NoLinksNoContent => write!( fmt, "filesize is non-zero while there are no links or content" ), NonRootDefinesMetadata(metadata) => { write!(fmt, "unsupported: non-root defines {metadata:?}") } IntermediateNodeWithoutFileSize => { write!(fmt, "intermediary node with links but no filesize") } TreeExpandsOnLinks => write!( fmt, "total size of tree expands through links, it should only get smaller or keep size" ), TreeOverlapsBetweenLinks => write!(fmt, "unsupported: tree contains overlap"), EarlierLink => write!(fmt, "error: earlier link given"), TreeJumpsBetweenLinks => write!(fmt, "unsupported: tree contains holes"), UnexpectedRawOrFileProperties { hash_type, fanout } => write!( fmt, "unsupported: File or Raw with hash_type {hash_type:?} or fanout {fanout:?}" ), } } } impl std::error::Error for FileError {} impl From<FileError> for FileReadFailed { fn from(e: FileError) -> Self { Self::File(e) } } /// This exists to help matching the borrowed slice in `Option<Cow<'a, [u8]>>` in this file, or /// defaulting to an empty array. In the processing inside this file, the Cow never represents an /// owned value. /// /// This at least sounded useful early on as quick-protobuf produces many `Option<Cow<'a, [u8]>>` /// values which are a bit tricky to handle. We never turn them into `Option<Cow<'static, [u8]>>` /// so we can safely use these. pub(crate) trait UnwrapBorrowedExt<'a> { /// Does not default but requires there to be a borrowed inner value. fn unwrap_borrowed(self) -> &'a [u8]; /// Unwraps the Cow of [u8] into an empty or wrapped slice. fn unwrap_borrowed_or_empty(self) -> &'a [u8] where Self: 'a; } impl<'a> UnwrapBorrowedExt<'a> for Option<Cow<'a, [u8]>> { fn unwrap_borrowed(self) -> &'a [u8] { match self { Some(Cow::Borrowed(x)) => x, Some(Cow::Owned(_)) => panic!("unexpected Cow::Owned"), None => panic!("Unexpected None"), } } fn unwrap_borrowed_or_empty(self) -> &'a [u8] { match self { Some(Cow::Borrowed(x)) => x, None => &[][..], _ => panic!("should not be Cow::Owned"), } } } #[cfg(test)] pub(crate) mod tests { use super::{reader::*, visit::*, UnwrapBorrowedExt}; use crate::test_support::FakeBlockstore; use hex_literal::hex; const CONTENT_FILE: &[u8] = &hex!("0a0d08021207636f6e74656e741807"); #[test] fn just_content() { let fr = FileReader::from_block(CONTENT_FILE).unwrap(); let (content, _) = fr.content(); assert!( matches!(content, FileContent::Bytes(b"content")), "{content:?}" ); } #[test] fn visiting_just_content() { let res = IdleFileVisit::default().start(CONTENT_FILE); assert!(matches!(res, Ok((b"content", _, _, None))), "{res:?}"); } #[test] fn visiting_too_large_range_of_singleblock_file() { let res = IdleFileVisit::default() .with_target_range(500_000..600_000) .start(CONTENT_FILE); assert!(matches!(res, Ok((b"", _, _, None))), "{res:?}"); } #[test] fn empty_file() { let block = &hex!("0a0408021800"); let fr = FileReader::from_block(block).unwrap(); let (content, _) = fr.content(); assert!(matches!(content, FileContent::Bytes(b"")), "{content:?}"); } #[test] fn balanced_traversal() { let target = "QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6"; let blocks = FakeBlockstore::with_fixtures(); // filled on root let (mut links_and_ranges, mut traversal) = { let root = FileReader::from_block(blocks.get_by_str(target)).unwrap(); let (mut links_and_ranges, traversal) = match root.content() { 
(FileContent::Links(iter), traversal) => { let links_and_ranges = iter .map(|(link, range)| (link.Hash.unwrap_borrowed().to_vec(), range)) .collect::>(); (links_and_ranges, traversal) } x => unreachable!("unexpected {:?}", x), }; // reverse again to pop again links_and_ranges.reverse(); // something 'static to hold on between two blocks (links_and_ranges, traversal) }; let mut combined: Vec = Vec::new(); while let Some((key, range)) = links_and_ranges.pop() { let next = blocks.get_by_raw(&key); let fr = traversal.continue_walk(next, &range).unwrap(); let (content, next) = fr.content(); combined.extend(content.unwrap_content()); traversal = next; } assert_eq!(combined, b"foobar\n"); } fn collect_bytes(blocks: &FakeBlockstore, visit: IdleFileVisit, start: &str) -> Vec { let mut ret = Vec::new(); let (content, _, _, mut step) = visit.start(blocks.get_by_str(start)).unwrap(); ret.extend(content); while let Some(visit) = step { let (first, _) = visit.pending_links(); let block = blocks.get_by_cid(first); let (content, next_step) = visit.continue_walk(block, &mut None).unwrap(); ret.extend(content); step = next_step; } ret } #[test] fn visitor_traversal() { let blocks = FakeBlockstore::with_fixtures(); let start = "QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6"; let bytes = collect_bytes(&blocks, IdleFileVisit::default(), start); assert_eq!(&bytes[..], b"foobar\n"); } #[test] fn scoped_visitor_traversal_from_blockstore() { let blocks = FakeBlockstore::with_fixtures(); let start = "QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6"; let visit = IdleFileVisit::default().with_target_range(1..6); let bytes = collect_bytes(&blocks, visit, start); assert_eq!(&bytes[..], b"oobar"); } #[test] fn less_than_block_scoped_traversal_from_blockstore() { let blocks = FakeBlockstore::with_fixtures(); let start = "QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6"; let visit = IdleFileVisit::default().with_target_range(0..1); let bytes = collect_bytes(&blocks, visit, start); assert_eq!(&bytes[..], b"f"); } #[test] fn scoped_traversal_out_of_bounds_from_blockstore() { let blocks = FakeBlockstore::with_fixtures(); let start = "QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6"; let visit = IdleFileVisit::default().with_target_range(7..20); let bytes = collect_bytes(&blocks, visit, start); assert_eq!(&bytes[..], b""); } #[test] fn trickle_traversal() { let blocks = FakeBlockstore::with_fixtures(); let start = "QmWfQ48ChJUj4vWKFsUDe4646xCBmXgdmNfhjz9T7crywd"; let bytes = collect_bytes(&blocks, IdleFileVisit::default(), start); assert_eq!(&bytes[..], b"foobar\n"); } } rust-unixfs-0.4.0/src/lib.rs000064400000000000000000000131031046102023000140350ustar 00000000000000#![warn(rust_2018_idioms, missing_docs)] //! ipfs-unixfs: UnixFs tree support in Rust. //! //! The crate aims to provide a blockstore implementation independent of the UnixFs implementation by //! working on slices and not doing any IO operations. //! //! The main entry point for extracting information and/or data out of UnixFs trees is //! `ipfs_unixfs::walk::Walker`. To resolve `IpfsPath` segments over dag-pb nodes, //! `ipfs_unixfs::resolve` should be used. extern crate alloc; use alloc::borrow::Cow; use core::fmt; /// File support. 
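///
/// A minimal write-then-read sketch (an in-memory map stands in for a real blockstore; the
/// names are illustrative only):
///
/// ```ignore
/// use std::collections::HashMap;
/// use ipfs_unixfs::file::{adder::FileAdder, visit::IdleFileVisit};
///
/// // Write: chunk and persist every produced block, remembering the last Cid as the root.
/// let mut store = HashMap::new();
/// let mut adder = FileAdder::default();
/// let (blocks, _consumed) = adder.push(b"foobar\n");
/// blocks.for_each(|(cid, block)| { store.insert(cid, block); });
/// let mut root = None;
/// for (cid, block) in adder.finish() {
///     store.insert(cid, block);
///     root = Some(cid);
/// }
///
/// // Read: start the visit from the root block; a single-chunk file completes immediately.
/// let root_block = &store[&root.unwrap()];
/// let (bytes, _total_size, _metadata, step) = IdleFileVisit::default().start(root_block).unwrap();
/// assert_eq!(bytes, b"foobar\n");
/// assert!(step.is_none());
/// ```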
pub mod file; /// Symlink creation support pub mod symlink; /// Directory and directory tree support pub mod dir; pub use dir::{resolve, LookupError, MaybeResolved, ResolveError}; mod pb; use pb::{UnixFs, UnixFsType}; /// Support operations for the dag-pb, the outer shell of UnixFS pub mod dagpb; /// Support for walking over all UnixFs trees pub mod walk; #[cfg(test)] pub(crate) mod test_support; /// A link could not be transformed into a Cid. #[derive(Debug)] #[non_exhaustive] pub struct InvalidCidInLink { /// The index of this link, from zero pub nth: usize, /// Hash which could not be turned into a `Cid` pub hash: Cow<'static, [u8]>, /// Name of the link, most likely empty when this originates from a file, most likely non-empty /// for other kinds. pub name: Cow<'static, str>, /// Error from the attempted conversion pub source: libipld::cid::Error, } impl<'a> From<(usize, pb::PBLink<'a>, libipld::cid::Error)> for InvalidCidInLink { fn from((nth, link, source): (usize, pb::PBLink<'a>, libipld::cid::Error)) -> Self { let hash = match link.Hash { Some(Cow::Borrowed(x)) if !x.is_empty() => Cow::Owned(x.to_vec()), Some(Cow::Borrowed(_)) | None => Cow::Borrowed(&[][..]), Some(Cow::Owned(x)) => Cow::Owned(x), }; let name = match link.Name { Some(Cow::Borrowed(x)) if !x.is_empty() => Cow::Owned(x.to_string()), Some(Cow::Borrowed(_)) | None => Cow::Borrowed(""), Some(Cow::Owned(x)) => Cow::Owned(x), }; InvalidCidInLink { nth, hash, name, source, } } } impl fmt::Display for InvalidCidInLink { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!( fmt, "failed to convert link #{} ({:?}) to Cid: {}", self.nth, self.name, self.source ) } } impl std::error::Error for InvalidCidInLink { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { Some(&self.source) } } /// Wrapper around the unexpected UnixFs node type, allowing access to querying what is known about /// the type. pub struct UnexpectedNodeType(i32); impl fmt::Debug for UnexpectedNodeType { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { let converted = UnixFsType::from(self.0); // the conversion defaults to Raw if converted == UnixFsType::Raw && self.0 != 0 { write!(fmt, "{} or ", self.0) } else { write!(fmt, "{} or {:?}", self.0, converted) } } } impl From for UnexpectedNodeType { fn from(t: UnixFsType) -> UnexpectedNodeType { UnexpectedNodeType(t.into()) } } impl UnexpectedNodeType { /// Returns `true` if the type represents some directory pub fn is_directory(&self) -> bool { matches!( UnixFsType::from(self.0), UnixFsType::Directory | UnixFsType::HAMTShard ) } /// Returns `true` if the type represents a `File` pub fn is_file(&self) -> bool { matches!(UnixFsType::from(self.0), UnixFsType::File) } } /// A container for the UnixFs metadata, which can be present at the root of the file, directory, or symlink trees. #[derive(Debug, Default, PartialEq, Eq, Clone)] pub struct Metadata { mode: Option, mtime: Option<(i64, u32)>, } impl Metadata { /// Returns the full file mode, if one has been specified. /// /// The full file mode is originally read through `st_mode` field of `stat` struct defined in /// `sys/stat.h` and its defining OpenGroup standard. The lowest 3 bytes correspond to read, /// write, and execute rights per user, group, and other, while the 4th byte determines sticky bits, /// set user id or set group id. 
The following two bytes correspond to the different file types, as /// defined by the same OpenGroup standard: /// https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/sys_stat.h.html pub fn mode(&self) -> Option { self.mode } /// Returns the raw timestamp of last modification time, if specified. /// /// The timestamp is `(seconds, nanos)` - similar to `core::time::Duration`, with the exception of /// allowing seconds to be negative. The seconds are calculated from `1970-01-01 00:00:00` or /// the common "unix epoch". pub fn mtime(&self) -> Option<(i64, u32)> { self.mtime } /// Returns the mtime metadata as a `FileTime`. Enabled only in the `filetime` feature. #[cfg(feature = "filetime")] pub fn mtime_as_filetime(&self) -> Option { self.mtime() .map(|(seconds, nanos)| filetime::FileTime::from_unix_time(seconds, nanos)) } } impl<'a> From<&'a UnixFs<'_>> for Metadata { fn from(data: &'a UnixFs<'_>) -> Self { let mode = data.mode; let mtime = data .mtime .clone() .map(|ut| (ut.Seconds, ut.FractionalNanoseconds.unwrap_or(0))); Metadata { mode, mtime } } } rust-unixfs-0.4.0/src/pb/merkledag.proto000064400000000000000000000020101046102023000163350ustar 00000000000000syntax = "proto2"; package merkledag.pb; // import and options disabled // import "github.com/gogo/protobuf/gogoproto/gogo.proto"; // // option (gogoproto.gostring_all) = true; // option (gogoproto.equal_all) = true; // option (gogoproto.verbose_equal_all) = true; // option (gogoproto.goproto_stringer_all) = false; // option (gogoproto.stringer_all) = true; // option (gogoproto.populate_all) = true; // option (gogoproto.testgen_all) = true; // option (gogoproto.benchgen_all) = true; // option (gogoproto.marshaler_all) = true; // option (gogoproto.sizer_all) = true; // option (gogoproto.unmarshaler_all) = true; // An IPFS MerkleDAG Link message PBLink { // multihash of the target object optional bytes Hash = 1; // utf string name. 
should be unique per object optional string Name = 2; // cumulative size of target object optional uint64 Tsize = 3; } // An IPFS MerkleDAG Node message PBNode { // refs to other objects repeated PBLink Links = 2; // opaque user data optional bytes Data = 1; } rust-unixfs-0.4.0/src/pb/merkledag.rs000064400000000000000000000065731046102023000156400ustar 00000000000000// Modified automatically generated rust module for 'merkledag.proto' file #![allow(non_snake_case)] #![allow(non_upper_case_globals)] #![allow(non_camel_case_types)] #![allow(unused_imports)] #![allow(unknown_lints)] #![allow(clippy::all)] #![cfg_attr(rustfmt, rustfmt_skip)] use super::*; use quick_protobuf::sizeofs::*; use quick_protobuf::{BytesReader, MessageRead, MessageWrite, Result, Writer, WriterBackend}; use alloc::borrow::Cow; use core::convert::TryFrom; use std::io::Write; use core::ops::Deref; use core::ops::DerefMut; #[derive(Debug, Default, PartialEq, Clone)] pub struct PBLink<'a> { pub Hash: Option>, pub Name: Option>, pub Tsize: Option, } impl<'a> MessageRead<'a> for PBLink<'a> { fn from_reader(r: &mut BytesReader, bytes: &'a [u8]) -> Result { let mut msg = Self::default(); while !r.is_eof() { match r.next_tag(bytes) { Ok(10) => msg.Hash = Some(r.read_bytes(bytes).map(Cow::Borrowed)?), Ok(18) => msg.Name = Some(r.read_string(bytes).map(Cow::Borrowed)?), Ok(24) => msg.Tsize = Some(r.read_uint64(bytes)?), Ok(t) => { r.read_unknown(bytes, t)?; } Err(e) => return Err(e), } } Ok(msg) } } impl<'a> MessageWrite for PBLink<'a> { fn get_size(&self) -> usize { 0 + self.Hash.as_ref().map_or(0, |m| 1 + sizeof_len((m).len())) + self.Name.as_ref().map_or(0, |m| 1 + sizeof_len((m).len())) + self .Tsize .as_ref() .map_or(0, |m| 1 + sizeof_varint(*(m) as u64)) } fn write_message(&self, w: &mut Writer) -> Result<()> { if let Some(ref s) = self.Hash { w.write_with_tag(10, |w| w.write_bytes(&**s))?; } if let Some(ref s) = self.Name { w.write_with_tag(18, |w| w.write_string(&**s))?; } if let Some(ref s) = self.Tsize { w.write_with_tag(24, |w| w.write_uint64(*s))?; } Ok(()) } } #[derive(Debug, Default, PartialEq, Clone)] pub struct PBNode<'a> { pub Links: Vec>, pub Data: Option>, } impl<'a> MessageRead<'a> for PBNode<'a> { fn from_reader(r: &mut BytesReader, bytes: &'a [u8]) -> Result { let mut msg = Self::default(); while !r.is_eof() { match r.next_tag(bytes) { Ok(18) => msg.Links.push(r.read_message::>(bytes)?), Ok(10) => msg.Data = Some(r.read_bytes(bytes).map(Cow::Borrowed)?), Ok(t) => { r.read_unknown(bytes, t)?; } Err(e) => return Err(e), } } Ok(msg) } } impl<'a> MessageWrite for PBNode<'a> { fn get_size(&self) -> usize { 0 + self .Links .iter() .map(|s| 1 + sizeof_len((s).get_size())) .sum::() + self.Data.as_ref().map_or(0, |m| 1 + sizeof_len((m).len())) } fn write_message(&self, w: &mut Writer) -> Result<()> { for s in &self.Links { w.write_with_tag(18, |w| w.write_message(s))?; } if let Some(ref s) = self.Data { w.write_with_tag(10, |w| w.write_bytes(&**s))?; } Ok(()) } } rust-unixfs-0.4.0/src/pb/unixfs.proto000064400000000000000000000010511046102023000157220ustar 00000000000000syntax = "proto2"; package unixfs.pb; message Data { enum DataType { Raw = 0; Directory = 1; File = 2; Metadata = 3; Symlink = 4; HAMTShard = 5; } required DataType Type = 1; optional bytes Data = 2; optional uint64 filesize = 3; repeated uint64 blocksizes = 4; optional uint64 hashType = 5; optional uint64 fanout = 6; optional uint32 mode = 7; optional UnixTime mtime = 8; } message UnixTime { required int64 Seconds = 1; optional fixed32 
FractionalNanoseconds = 2; } message Metadata { optional string MimeType = 1; } rust-unixfs-0.4.0/src/pb/unixfs.rs000064400000000000000000000166071046102023000152200ustar 00000000000000// Modified automatically generated rust module for 'unixfs.proto' file #![allow(non_snake_case)] #![allow(non_upper_case_globals)] #![allow(non_camel_case_types)] #![allow(unused_imports)] #![allow(unknown_lints)] #![allow(clippy::all)] #![cfg_attr(rustfmt, rustfmt_skip)] use super::*; use quick_protobuf::sizeofs::*; use quick_protobuf::{BytesReader, MessageRead, MessageWrite, Result, Writer, WriterBackend}; use alloc::borrow::Cow; use core::convert::TryFrom; use std::io::Write; use core::ops::Deref; use core::ops::DerefMut; #[derive(Debug, Default, PartialEq, Clone)] pub struct Data<'a> { pub Type: mod_Data::DataType, pub Data: Option>, pub filesize: Option, pub blocksizes: Vec, pub hashType: Option, pub fanout: Option, pub mode: Option, pub mtime: Option, } impl<'a> MessageRead<'a> for Data<'a> { fn from_reader(r: &mut BytesReader, bytes: &'a [u8]) -> Result { let mut msg = Self::default(); while !r.is_eof() { match r.next_tag(bytes) { Ok(8) => msg.Type = r.read_enum(bytes)?, Ok(18) => msg.Data = Some(r.read_bytes(bytes).map(Cow::Borrowed)?), Ok(24) => msg.filesize = Some(r.read_uint64(bytes)?), Ok(32) => msg.blocksizes.push(r.read_uint64(bytes)?), Ok(40) => msg.hashType = Some(r.read_uint64(bytes)?), Ok(48) => msg.fanout = Some(r.read_uint64(bytes)?), Ok(56) => msg.mode = Some(r.read_uint32(bytes)?), Ok(66) => msg.mtime = Some(r.read_message::(bytes)?), Ok(t) => { r.read_unknown(bytes, t)?; } Err(e) => return Err(e), } } Ok(msg) } } impl<'a> MessageWrite for Data<'a> { fn get_size(&self) -> usize { 0 + 1 + sizeof_varint(*(&self.Type) as u64) + self.Data.as_ref().map_or(0, |m| 1 + sizeof_len((m).len())) + self .filesize .as_ref() .map_or(0, |m| 1 + sizeof_varint(*(m) as u64)) + self .blocksizes .iter() .map(|s| 1 + sizeof_varint(*(s) as u64)) .sum::() + self .hashType .as_ref() .map_or(0, |m| 1 + sizeof_varint(*(m) as u64)) + self .fanout .as_ref() .map_or(0, |m| 1 + sizeof_varint(*(m) as u64)) + self .mode .as_ref() .map_or(0, |m| 1 + sizeof_varint(*(m) as u64)) + self .mtime .as_ref() .map_or(0, |m| 1 + sizeof_len((m).get_size())) } fn write_message(&self, w: &mut Writer) -> Result<()> { w.write_with_tag(8, |w| w.write_enum(*&self.Type as i32))?; if let Some(ref s) = self.Data { w.write_with_tag(18, |w| w.write_bytes(&**s))?; } if let Some(ref s) = self.filesize { w.write_with_tag(24, |w| w.write_uint64(*s))?; } for s in &self.blocksizes { w.write_with_tag(32, |w| w.write_uint64(*s))?; } if let Some(ref s) = self.hashType { w.write_with_tag(40, |w| w.write_uint64(*s))?; } if let Some(ref s) = self.fanout { w.write_with_tag(48, |w| w.write_uint64(*s))?; } if let Some(ref s) = self.mode { w.write_with_tag(56, |w| w.write_uint32(*s))?; } if let Some(ref s) = self.mtime { w.write_with_tag(66, |w| w.write_message(s))?; } Ok(()) } } pub mod mod_Data { #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum DataType { Raw = 0, Directory = 1, File = 2, Metadata = 3, Symlink = 4, HAMTShard = 5, } impl Default for DataType { fn default() -> Self { DataType::Raw } } impl From for DataType { fn from(i: i32) -> Self { match i { 0 => DataType::Raw, 1 => DataType::Directory, 2 => DataType::File, 3 => DataType::Metadata, 4 => DataType::Symlink, 5 => DataType::HAMTShard, _ => Self::default(), } } } impl<'a> From<&'a str> for DataType { fn from(s: &'a str) -> Self { match s { "Raw" => DataType::Raw, "Directory" => 
DataType::Directory, "File" => DataType::File, "Metadata" => DataType::Metadata, "Symlink" => DataType::Symlink, "HAMTShard" => DataType::HAMTShard, _ => Self::default(), } } } impl From for i32 { fn from(dt: DataType) -> Self { match dt { DataType::Raw => 0, DataType::Directory => 1, DataType::File => 2, DataType::Metadata => 3, DataType::Symlink => 4, DataType::HAMTShard => 5, } } } } #[derive(Debug, Default, PartialEq, Clone)] pub struct UnixTime { pub Seconds: i64, pub FractionalNanoseconds: Option, } impl<'a> MessageRead<'a> for UnixTime { fn from_reader(r: &mut BytesReader, bytes: &'a [u8]) -> Result { let mut msg = Self::default(); while !r.is_eof() { match r.next_tag(bytes) { Ok(8) => msg.Seconds = r.read_int64(bytes)?, Ok(21) => msg.FractionalNanoseconds = Some(r.read_fixed32(bytes)?), Ok(t) => { r.read_unknown(bytes, t)?; } Err(e) => return Err(e), } } Ok(msg) } } impl MessageWrite for UnixTime { fn get_size(&self) -> usize { 0 + 1 + sizeof_varint(*(&self.Seconds) as u64) + self.FractionalNanoseconds.as_ref().map_or(0, |_| 1 + 4) } fn write_message(&self, w: &mut Writer) -> Result<()> { w.write_with_tag(8, |w| w.write_int64(*&self.Seconds))?; if let Some(ref s) = self.FractionalNanoseconds { w.write_with_tag(21, |w| w.write_fixed32(*s))?; } Ok(()) } } #[derive(Debug, Default, PartialEq, Clone)] pub struct Metadata<'a> { pub MimeType: Option>, } impl<'a> MessageRead<'a> for Metadata<'a> { fn from_reader(r: &mut BytesReader, bytes: &'a [u8]) -> Result { let mut msg = Self::default(); while !r.is_eof() { match r.next_tag(bytes) { Ok(10) => msg.MimeType = Some(r.read_string(bytes).map(Cow::Borrowed)?), Ok(t) => { r.read_unknown(bytes, t)?; } Err(e) => return Err(e), } } Ok(msg) } } impl<'a> MessageWrite for Metadata<'a> { fn get_size(&self) -> usize { 0 + self .MimeType .as_ref() .map_or(0, |m| 1 + sizeof_len((m).len())) } fn write_message(&self, w: &mut Writer) -> Result<()> { if let Some(ref s) = self.MimeType { w.write_with_tag(10, |w| w.write_string(&**s))?; } Ok(()) } } rust-unixfs-0.4.0/src/pb.rs000064400000000000000000000173401046102023000136770ustar 00000000000000use alloc::borrow::Cow; use core::convert::TryFrom; use core::fmt; use core::ops::Range; use quick_protobuf::{errors::Result as ProtobufResult, Writer, WriterBackend}; pub(crate) mod merkledag; pub(crate) use merkledag::PBLink; pub(crate) use merkledag::PBNode; pub(crate) mod unixfs; pub(crate) use unixfs::mod_Data::DataType as UnixFsType; pub(crate) use unixfs::Data as UnixFs; /// Failure cases for nested serialization, which allows recovery of the outer `PBNode` when desired. 
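///
/// A sketch of the recovery a caller might do (`use_unixfs`, `use_dag_pb` and the error
/// conversion are hypothetical names, hence this is `ignore`d rather than a doctest):
///
/// ```ignore
/// match FlatUnixFs::try_from(block) {
///     Ok(flat) => use_unixfs(flat),
///     // the outer dag-pb node was still valid, fall back to handling it as plain dag-pb
///     Err(ParsingFailed::NoData(outer) | ParsingFailed::InvalidUnixFs(_, outer)) => use_dag_pb(outer),
///     Err(ParsingFailed::InvalidDagPb(e)) => return Err(e.into()),
/// }
/// ```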
#[derive(Debug)] pub(crate) enum ParsingFailed<'a> { InvalidDagPb(quick_protobuf::Error), NoData(PBNode<'a>), InvalidUnixFs(quick_protobuf::Error, PBNode<'a>), } impl fmt::Display for ParsingFailed<'_> { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { use ParsingFailed::*; match self { InvalidDagPb(e) => write!(fmt, "failed to read the block as dag-pb: {e}"), InvalidUnixFs(e, _) => write!( fmt, "failed to read the dag-pb PBNode::Data as UnixFS message: {e}" ), NoData(_) => write!(fmt, "dag-pb PBNode::Data was missing or empty"), } } } impl std::error::Error for ParsingFailed<'_> { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { use ParsingFailed::*; match self { InvalidDagPb(e) => Some(e), InvalidUnixFs(e, _) => Some(e), NoData(_) => None, } } } // This has been aliased as UnixFs<'a> impl<'a> TryFrom<&'a merkledag::PBNode<'a>> for unixfs::Data<'a> { type Error = quick_protobuf::Error; fn try_from(node: &'a merkledag::PBNode<'a>) -> Result { UnixFs::try_from(node.Data.as_deref()) } } // This has been aliased as UnixFs<'a> impl<'a> TryFrom> for unixfs::Data<'a> { type Error = quick_protobuf::Error; fn try_from(data: Option<&'a [u8]>) -> Result { use quick_protobuf::{BytesReader, MessageRead}; let data = data.unwrap_or_default(); let mut reader = BytesReader::from_bytes(data); UnixFs::from_reader(&mut reader, data) } } // These should be derived by the pb-rs impl<'a> TryFrom<&'a [u8]> for merkledag::PBNode<'a> { type Error = quick_protobuf::Error; fn try_from(data: &'a [u8]) -> Result { use quick_protobuf::{BytesReader, MessageRead}; merkledag::PBNode::from_reader(&mut BytesReader::from_bytes(data), data) } } /// Combined dag-pb (or MerkleDAG) with UnixFs payload. #[derive(Debug)] pub(crate) struct FlatUnixFs<'a> { pub(crate) links: Vec>, pub(crate) data: UnixFs<'a>, } impl<'a> quick_protobuf::message::MessageWrite for FlatUnixFs<'a> { fn get_size(&self) -> usize { use quick_protobuf::sizeofs::sizeof_len; let links = self .links .iter() .map(|s| 1 + sizeof_len(s.get_size())) .sum::(); let body = 1 + sizeof_len(self.data.get_size()); links + body } fn write_message(&self, w: &mut Writer) -> ProtobufResult<()> { // this has been monkeyd after PBNode::write_message // // important to note that while protobuf isn't so picky when reading on field order, dag-pb // is, at least to produce the same Cids. for link in &self.links { w.write_with_tag(18, |w| w.write_message(link))?; } // writing the self.data directly saves us the trouble of serializing it first to a vec, // then using the vec to write this field. 
w.write_with_tag(10, |w| w.write_message(&self.data))?; Ok(()) } } impl<'a> FlatUnixFs<'a> { pub(crate) fn try_parse(data: &'a [u8]) -> Result> { Self::try_from(data) } } impl<'a> TryFrom<&'a [u8]> for FlatUnixFs<'a> { type Error = ParsingFailed<'a>; fn try_from(data: &'a [u8]) -> Result { let node = merkledag::PBNode::try_from(data).map_err(ParsingFailed::InvalidDagPb)?; let data = match node.Data { Some(Cow::Borrowed(bytes)) if !bytes.is_empty() => Some(bytes), Some(Cow::Owned(_)) => unreachable!(), Some(Cow::Borrowed(_)) | None => return Err(ParsingFailed::NoData(node)), }; match UnixFs::try_from(data) { Ok(data) => Ok(FlatUnixFs { links: node.Links, data, }), Err(e) => Err(ParsingFailed::InvalidUnixFs(e, node)), } } } #[cfg(test)] impl<'a> FlatUnixFs<'a> { pub fn range_links(&'a self) -> impl Iterator, Range)> { assert_eq!(self.links.len(), self.data.blocksizes.len()); let zipped = self .links .clone() .into_iter() .zip(self.data.blocksizes.iter().copied()); // important: we have validated links.len() == blocksizes.len() RangeLinks::from_links_and_blocksizes(zipped, Some(0)) } } pub(crate) struct RangeLinks { inner: I, base: u64, } impl<'a, I> RangeLinks where I: Iterator, u64)>, { /// `start_offset` is the offset of the current tree when walking the graph. pub fn from_links_and_blocksizes(zipped: I, start_offset: Option) -> RangeLinks { RangeLinks { inner: zipped, base: start_offset.unwrap_or(0), } } } impl<'a, I> Iterator for RangeLinks where I: Iterator, u64)>, { type Item = (PBLink<'a>, Range); fn next(&mut self) -> Option { self.inner.next().map(|(link, blocksize)| { let returned_base = self.base; self.base += blocksize; (link, returned_base..(returned_base + blocksize)) }) } fn size_hint(&self) -> (usize, Option) { self.inner.size_hint() } } #[cfg(test)] mod test { use super::{FlatUnixFs, PBNode, UnixFs, UnixFsType}; use alloc::borrow::Cow; use core::convert::TryFrom; use hex_literal::hex; #[test] fn parse_content() { use quick_protobuf::{BytesReader, MessageRead}; let input = hex!("0a0d08021207636f6e74656e741807"); let mut reader = BytesReader::from_bytes(&input); let dagnode = PBNode::from_reader(&mut reader, &input).expect("parse outer merkledag::PBNode"); assert!(dagnode.Links.is_empty()); let unixfs_data = UnixFs::try_from(&dagnode).expect("parse inner unixfs::Data"); assert_eq!(unixfs_data.Type, UnixFsType::File); assert_eq!(unixfs_data.Data, Some(Cow::Borrowed(&b"content"[..]))); assert_eq!(unixfs_data.filesize, Some(7)); println!("{unixfs_data:?}"); } #[test] fn linux_tarxz_range_links() { let input = hex!("122b0a2212203822560f945fd3c74522de3448512a7e45cb53f0a9a1e12161da4667531ec12e120018aed4e015122b0a2212208594eb4dd5d67e573d506cd950ac59863b9afb024a590d7fe49b42fbcb44af43120018aed4e015122b0a221220745a70b6cd7ec3e46d16fb15b5e1e5db256f6a7a52d0b359f8f49b242665e17b120018b4e7e8090a1608021888c1a835208080e015208080e0152088c1e809"); let flat = FlatUnixFs::try_from(&input[..]).unwrap(); let mut expected_ranges = vec![ 0..45_613_056, 45_613_056..91_226_112, 91_226_112..111_812_744, ]; expected_ranges.reverse(); for (link, range) in flat.range_links() { assert_eq!(link.Name, Some(Cow::Borrowed(""))); // Tsize is the subtree size, which must always be larger than the file segments // because of encoding assert!(link.Tsize >= Some(range.end - range.start)); assert_eq!(Some(range), expected_ranges.pop()); } } } rust-unixfs-0.4.0/src/symlink.rs000064400000000000000000000162011046102023000147570ustar 00000000000000//! UnixFS symlink support. 
UnixFS symlinks are UnixFS messages similar to single block files, but
//! the link name or target path is encoded in the UnixFS::Data field. This means that the target
//! path could be in any encoding; it is, however, always treated as a UTF-8 Unix path, which may
//! turn out to be the wrong assumption.

use crate::pb::{FlatUnixFs, UnixFs, UnixFsType};
use alloc::borrow::Cow;
use quick_protobuf::{MessageWrite, Writer};

/// Appends a dag-pb block for a symlink to the given `target_path`. It is expected that
/// `target_path` is a valid relative unix path relative to the place in which this is used, but
/// the target's validity cannot really be judged.
pub fn serialize_symlink_block(target_path: &str, block_buffer: &mut Vec<u8>) {
    // should this fail or not? protobuf encoding cannot fail here, however we might create a too
    // large block but what's the limit?
    //
    // why not return a (Cid, Vec<u8>) as is usual with cidv0? well...
    let node = FlatUnixFs {
        links: Vec::new(),
        data: UnixFs {
            Type: UnixFsType::Symlink,
            Data: Some(Cow::Borrowed(target_path.as_bytes())),
            ..Default::default()
        },
    };

    let mut writer = Writer::new(block_buffer);
    node.write_message(&mut writer).expect("unexpected failure");
}

#[cfg(test)]
mod tests {
    use super::serialize_symlink_block;
    use core::convert::TryFrom;
    use libipld::multihash::{self, Multihash};
    use libipld::Cid;
    use sha2::{Digest, Sha256};

    #[test]
    fn simple_symlink() {
        let mut buf = Vec::new();

        // this symlink just points to a "b" at the same level, used in `symlinks_in_trees` to
        // create the `foo_directory/a` which links to sibling `b` or the directory
        // `foo_directory/b`.
        serialize_symlink_block("b", &mut buf);

        let mh = Multihash::wrap(multihash::Code::Sha2_256.into(), &Sha256::digest(&buf)).unwrap();
        let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0");

        assert_eq!(
            cid.to_string(),
            "QmfLJN6HLyREnWr7QQNmgmuNziUhcbwUopkHQ8gD3pMfp6"
        );
    }

    #[test]
    fn symlinks_in_trees_rooted() {
        use crate::dir::builder::BufferingTreeBuilder;

        let mut tree = BufferingTreeBuilder::default();

        tree.put_link(
            "foo_directory/b/car",
            Cid::try_from("QmNYVgoDXh3dqC1jjCuYqQ9w4XfiocehPZjEPiQiCVYv33").unwrap(),
            12,
        )
        .unwrap();

        tree.put_link(
            "foo_directory/a",
            Cid::try_from("QmfLJN6HLyREnWr7QQNmgmuNziUhcbwUopkHQ8gD3pMfp6").unwrap(),
            7,
        )
        .unwrap();

        let otn = tree.build().last().unwrap().unwrap();

        assert_eq!(
            otn.cid.to_string(),
            "QmZDVQHwjHwA4SyzEDtJLNxmZeJVK1W8BWFAHV61x2Rs19"
        );
    }

    #[test]
    fn symlinks_in_trees_wrapped() {
        use crate::dir::builder::{BufferingTreeBuilder, TreeOptions};

        // note regarding the root directory: now we can add the paths without the first component
        // `foo_directory` and still get the same result as in `symlinks_in_trees_rooted`.
        let mut opts = TreeOptions::default();
        opts.wrap_with_directory();

        let mut tree = BufferingTreeBuilder::new(opts);

        tree.put_link(
            "b/car",
            Cid::try_from("QmNYVgoDXh3dqC1jjCuYqQ9w4XfiocehPZjEPiQiCVYv33").unwrap(),
            12,
        )
        .unwrap();

        tree.put_link(
            "a",
            Cid::try_from("QmfLJN6HLyREnWr7QQNmgmuNziUhcbwUopkHQ8gD3pMfp6").unwrap(),
            7,
        )
        .unwrap();

        let otn = tree.build().last().unwrap().unwrap();

        assert_eq!(
            otn.cid.to_string(),
            "QmZDVQHwjHwA4SyzEDtJLNxmZeJVK1W8BWFAHV61x2Rs19"
        );
    }

    #[test]
    fn walking_symlink_containing_tree() {
        use crate::walk::{ContinuedWalk, Walker};
        use hex_literal::hex;
        use std::path::PathBuf;

        // while this case or similar should be repeated in the walker tests, the topic of symlinks
        // and how the target path names are handled (esp. on windows) is curious enough to warrant
        // duplicating these three cases here.
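        // sanity check (the `reserialized` binding is ours; the expected bytes are the
        // `QmfLJN6HLyREnWr7QQNmgmuNziUhcbwUopkHQ8gD3pMfp6` fixture below): re-serializing
        // the "b" symlink must reproduce that fixture block exactly
        let mut reserialized = Vec::new();
        serialize_symlink_block("b", &mut reserialized);
        assert_eq!(reserialized, hex!("0a050804120162"));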
        let mut fake = crate::test_support::FakeBlockstore::default();

        // if `simple_symlink` and `symlinks_in_trees_*` passed, they would have created these
        // blocks, which we now take for granted.
        let tree_blocks: &[(&'static str, &'static [u8])] = &[
            ("QmZDVQHwjHwA4SyzEDtJLNxmZeJVK1W8BWFAHV61x2Rs19", &hex!("12290a221220fc7fac69ddb44e39686ecfd1ecc6c52ab653f4227e533ee74a2e238f8b2143d3120161180712290a221220b924ddb19181d159c29eec7c98ec506976a76d40241ccd203b226849ce6e0b72120162183d0a020801")),
            ("QmfLJN6HLyREnWr7QQNmgmuNziUhcbwUopkHQ8gD3pMfp6", &hex!("0a050804120162")),
            ("QmaoNjmCQ9774sR6H4DzgGPafXyuVVTCyBeXLaxueKYRLm", &hex!("122b0a2212200308c49252eb61966f802baf45074e074f3b3b90619766e0589c1445261a1a221203636172180c0a020801")),
            ("QmNYVgoDXh3dqC1jjCuYqQ9w4XfiocehPZjEPiQiCVYv33", &hex!("0a0a080212046361720a1804")),
        ];

        for (expected, bytes) in tree_blocks {
            assert_eq!(*expected, fake.insert_v0(bytes).to_string());
        }

        let mut walker = Walker::new(
            // note: this matches the `symlinks_in_trees` root cid (the last cid produced)
            Cid::try_from(tree_blocks[0].0).unwrap(),
            String::default(),
        );

        #[derive(Debug, PartialEq, Eq)]
        enum Entry {
            Dir(PathBuf),
            Symlink(PathBuf, String),
            File(PathBuf),
        }

        let mut actual = Vec::new();

        while walker.should_continue() {
            let (next, _) = walker.pending_links();
            let next = fake.get_by_cid(next);

            match walker.next(next, &mut None).unwrap() {
                ContinuedWalk::File(_fs, _cid, path, _metadata, _total_size) => {
                    actual.push(Entry::File(path.into()));
                }
                ContinuedWalk::RootDirectory(_cid, path, _metadata)
                | ContinuedWalk::Directory(_cid, path, _metadata) => {
                    actual.push(Entry::Dir(path.into()));
                }
                ContinuedWalk::Bucket(..) => { /* ignore */ }
                ContinuedWalk::Symlink(link_name, _cid, path, _metadata) => {
                    actual.push(Entry::Symlink(
                        path.into(),
                        core::str::from_utf8(link_name).unwrap().to_owned(),
                    ));
                }
            };
        }

        // possibly noteworthy: compare these paths to the ones used when creating; there was a
        // non-empty root component `foo_directory`, refer to `symlinks_in_trees_*` variants for
        // more.
let expected = &[ Entry::Dir(PathBuf::from("")), Entry::Symlink(PathBuf::from("a"), String::from("b")), Entry::Dir(PathBuf::from("b")), Entry::File({ let mut p = PathBuf::from("b"); p.push("car"); p }), ]; assert_eq!(expected, actual.as_slice()); } } rust-unixfs-0.4.0/src/test_support.rs000064400000000000000000000210521046102023000160440ustar 00000000000000use core::convert::TryFrom; use hash_hasher::HashedMap; use hex_literal::hex; use libipld::multihash::Multihash; use libipld::{multihash, Cid}; #[derive(Default)] pub struct FakeBlockstore { blocks: HashedMap>, } impl FakeBlockstore { pub fn get_by_cid<'a>(&'a self, cid: &Cid) -> &'a [u8] { self.blocks .get(cid) .unwrap_or_else(|| panic!("cid not found: {cid}")) } pub fn get_by_raw<'a>(&'a self, key: &[u8]) -> &'a [u8] { self.get_by_cid(&Cid::try_from(key).unwrap()) } pub fn get_by_str<'a>(&'a self, key: &str) -> &'a [u8] { self.get_by_cid(&Cid::try_from(key).unwrap()) } pub fn insert_v0(&mut self, block: &[u8]) -> Cid { use sha2::Digest; let mut sha = sha2::Sha256::new(); sha.update(block); let result = sha.finalize(); let mh = Multihash::wrap(multihash::Code::Sha2_256.into(), &result[..]).unwrap(); let cid = Cid::new_v0(mh).unwrap(); assert!( self.blocks.insert(cid, block.to_vec()).is_none(), "duplicate cid {cid}" ); cid } pub fn with_fixtures() -> Self { let mut this = Self::default(); let foobar_blocks: &[&[u8]] = &[ // root for "foobar\n" from go-ipfs 0.5 add -s size-2 // root // | // ----+----- // | | | | // fo ob ar \n // QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6 &hex!("12280a221220fef9fe1804942b35e19e145a03f9c9d5ca9c997dda0a9416f3f515a52f1b3ce11200180a12280a221220dfb94b75acb208fd4873d84872af58bd65c731770a7d4c0deeb4088e87390bfe1200180a12280a221220054497ae4e89812c83276a48e3e679013a788b7c0eb02712df15095c02d6cd2c1200180a12280a221220cc332ceb37dea7d3d7c00d1393117638d3ed963575836c6d44a24951e444cf5d120018090a0c080218072002200220022001"), // first bytes: fo or QmfVyMoStzTvdnUR7Uotzh82gmL427q9z3xW5Y8fUoszi4 &hex!("0a0808021202666f1802"), // ob or QmdPyW4CWE3QBkgjWfjM5f7Tjb3HukxVuBXZtkqAGwsMnm &hex!("0a08080212026f621802"), // ar or QmNhDQpphvMWhdCzP74taRzXDaEfPGq8vWfFRzD7mEgePM &hex!("0a080802120261721802"), // \n or Qmc5m94Gu7z62RC8waSKkZUrCCBJPyHbkpmGzEePxy2oXJ &hex!("0a07080212010a1801"), // same "foobar\n" but with go-ipfs 0.5 add --trickle -s size-2 // QmWfQ48ChJUj4vWKFsUDe4646xCBmXgdmNfhjz9T7crywd &hex!("12280a2212200f20a024ce0152161bc23e7234573374dfc3999143deaebf9b07b9c67318f9bd1200180a12280a221220b424253c25b5a7345fc7945732e363a12a790341b7c2d758516bbad5bbaab4461200180a12280a221220b7ab6350c604a885be9bd72d833f026b1915d11abe7e8dda5d0bca689342b7411200180a12280a221220a8a826652c2a3e93a751456e71139df086a1fedfd3bd9f232ad52ea1d813720e120018090a0c080218072002200220022001"), // the blocks have type raw instead of file, for some unknown reason &hex!("0a0808001202666f1802"), &hex!("0a08080012026f621802"), &hex!("0a080800120261721802"), &hex!("0a07080012010a1801"), // directory of the above two: QmVkvLsSEm2uJx1h5Fqukje8mMPYg393o5C2kMCkF2bBTA &hex!("12380a2212202bf7f75b76e336f34a04abd86af423b5063628ffd91e5392444078851dc31655120f666f6f6261722e62616c616e63656418dd0112370a2212207baaf5e250ba1352f97eddc95840705890dc5d3fc37084a4c1aa052abcf4ac58120e666f6f6261722e747269636b6c6518dd010a020801"), // a directory with the above directory of two: QmPTotyhVnnfCu9R4qwR4cdhpi5ENaiP8ZJfdqsm8Dw2jB 
&hex!("12570a2212206e396cd762f0ab55cc48e10b3c9d5a8428fc2888f4ccda86b72d6aa9fc020cb5122e516d566b764c7353456d32754a783168354671756b6a65386d4d5059673339336f3543326b4d436b46326242544118b1040a020801"), // sharded directory where all are collisions with small number of links, each // bucket has two links: QmZbFPTnDBMWbQ6iBxQAhuhLz8Nu9XptYS96e7cuf5wvbk &hex!("122b0a221220904e1485d68b56a71f79d44cd306d536ee52adb6a90c29b6a1fa95a504a038f71202303718b501122b0a221220772026a2c0e021710f8d0d8f72080255d5133556d3ae881e3405e673692f79a81202313318b001122b0a22122075e9df118a625120006c63b75c8f25f1e28397555ccf8c107029332d5e9b648a1202353418aa01122b0a221220db916fd000e12decdf0724965cbf419233a187ae415d59fbafea2c3851e584ad1202353618b101122b0a2212209adc67f730bd8b2f7eff8f2910ec8391814da9d7ae08d076165a9832bce99f921202383218af01122b0a221220bb48edba8f029483a6983ba70aef2cd86d14aa633f33007ce175680105da8d811202433118af01122b0a22122047b1f317152eb425d878e5e3577dd7c40af4bc2b005083c4bc9ec19157a8605c1202443418a601122b0a2212207b7e161cf9246d7fca2e2986aac98bbf2fef4f13f6fea497fc8f43d8899e0de51202463118a6010a280805121f020000001000020000000000000004000000000050000000000000000800802822308002"), // file linked to by all names (empty): QmbFMke1KXqnYyBBWxB74N4c5SBnJMVAiMNRcGu6x1AwQH &hex!("0a0408021800"), &hex!("123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121534386c6f6e672d6e616d65642d66696c652d3031361806123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121546426c6f6e672d6e616d65642d66696c652d30333718060a290805122008000000000000000000000000000000000000000000010000000000000000002822308002"), &hex!("123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121538376c6f6e672d6e616d65642d66696c652d3035381806123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121544446c6f6e672d6e616d65642d66696c652d30303918060a250805121c200000000000000000000080000000000000000000000000000000002822308002"), &hex!("123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121536396c6f6e672d6e616d65642d66696c652d3033381806123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121544356c6f6e672d6e616d65642d66696c652d30353018060a240805121b2000000000000000000000000002000000000000000000000000002822308002"), &hex!("123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121532416c6f6e672d6e616d65642d66696c652d3034391806123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121543436c6f6e672d6e616d65642d66696c652d30303418060a230805121a10000000000000000000000000000000000000000400000000002822308002"), &hex!("123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121543346c6f6e672d6e616d65642d66696c652d3032351806123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121543416c6f6e672d6e616d65642d66696c652d30333418060a230805121a04100000000000000000000000000000000000000000000000002822308002"), &hex!("123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121534356c6f6e672d6e616d65642d66696c652d3034311806123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121541366c6f6e672d6e616d65642d66696c652d30333318060a1e080512154000000000000000000000002000000000000000002822308002"), 
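            // the sharded directory blocks above and below are HAMTShard nodes: after the
            // links, the UnixFs Data section decodes as Type = HAMTShard (0x0805), the
            // occupied-bucket bitfield (0x12 ...), hashType = 0x22 alias murmur3-x64-64
            // (0x2822) and fanout = 256 (0x308002)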
            &hex!("123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121536396c6f6e672d6e616d65642d66696c652d3031371806123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121538306c6f6e672d6e616d65642d66696c652d30343018060a1a0805121101000002000000000000000000000000002822308002"),
            &hex!("123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121536376c6f6e672d6e616d65642d66696c652d3030331806123d0a221220bfccda787baba32b59c78450ac3d20b633360b43992c77289f9ed46d843561e6121538356c6f6e672d6e616d65642d66696c652d30343818060a1a0805121120000000800000000000000000000000002822308002"),
            // symlink linking to "foobar": QmNgQEdXVdLw79nH2bnxLMxnyWMaXrijfqMTiDVat3iyuz
            &hex!("0a0a08041206666f6f626172"),
            // sharded directory with single link to a non-sharded directory
            // QmQXUANxYGpkwMTWQUdZBPx9jqfFP7acNgL4FHRWkndKCe
            &hex!("12390a2212209b04586b8bdc01a7e0db04b8358a3717954572720f6b6803af5eec781cf73801121146416e6f6e5f736861726465645f64697218430a290805122004000000000000000000000000000000000000000000000000000000000000002822308002"),
            // the non-sharded directory linked by the above sharded directory
            // QmYmmkD3dGZjuozuqSzDYjU4ZyhAgc4T4P4SUgY6qjzBi8
            &hex!("122e0a22122031c3d57080d8463a3c63b2923df5a1d40ad7a73eae5a14af584213e5f504ac331206666f6f626172180f0a020801"),
            // single block version of "foobar\n" linked to by above non-sharded directory
            &hex!("0a0d08021207666f6f6261720a1807"),
        ];

        for block in foobar_blocks {
            this.insert_v0(block);
        }

        this
    }
}
rust-unixfs-0.4.0/src/walk.rs
use crate::dir::{ShardError, UnexpectedDirectoryProperties};
use crate::file::visit::{Cache, FileVisit, IdleFileVisit};
use crate::file::{FileError, FileReadFailed};
use crate::pb::{FlatUnixFs, PBLink, ParsingFailed, UnixFsType};
use crate::{InvalidCidInLink, Metadata, UnexpectedNodeType};
use alloc::borrow::Cow;
use core::convert::TryFrom;
use core::fmt;
use either::Either;
use libipld::Cid;
use std::path::{Path, PathBuf};

/// `Walker` helps with walking a UnixFS tree, including all of the content and files. It is
/// created with `Walker::new` and walked over each block with `Walker::next`. Use
/// `Walker::pending_links` to obtain the next [`Cid`] to be loaded and the prefetchable links.
#[derive(Debug)]
pub struct Walker {
    /// This is `None` until the first block has been visited. Any failing unwraps would be logic
    /// errors.
    current: Option<InnerEntry>,
    /// On the next call to `next` this will be the block, unless we have an ongoing file
    /// walk, in which case we short-circuit to continue it. Any failing unwraps of
    /// `self.next` would be logic errors.
    next: Option<(Cid, String, usize)>,
    pending: Vec<(Cid, String, usize)>,
    // tried to recycle the names but that was consistently as fast and used more memory than just
    // cloning the strings
    should_continue: bool,
}

/// Converts a link of specifically a Directory (and not a link of a HAMTShard).
fn convert_link(
    nested_depth: usize,
    nth: usize,
    link: PBLink<'_>,
) -> Result<(Cid, String, usize), InvalidCidInLink> {
    let hash = link.Hash.as_deref().unwrap_or_default();
    let cid = match Cid::try_from(hash) {
        Ok(cid) => cid,
        Err(e) => return Err(InvalidCidInLink::from((nth, link, e))),
    };
    let name = match link.Name {
        Some(Cow::Borrowed(s)) if !s.is_empty() => s.to_owned(),
        None | Some(Cow::Borrowed(_)) => todo!("link cannot be empty"),
        Some(Cow::Owned(_s)) => unreachable!("FlatUnixFs is never transformed to owned"),
    };
    assert!(!name.contains('/'));
    Ok((cid, name, nested_depth))
}

/// Converts a link of specifically a HAMTShard (and not a link of a Directory).
fn convert_sharded_link(
    nested_depth: usize,
    sibling_depth: usize,
    nth: usize,
    link: PBLink<'_>,
) -> Result<(Cid, String, usize), InvalidCidInLink> {
    let hash = link.Hash.as_deref().unwrap_or_default();
    let cid = match Cid::try_from(hash) {
        Ok(cid) => cid,
        Err(e) => return Err(InvalidCidInLink::from((nth, link, e))),
    };
    let (depth, name) = match link.Name {
        Some(Cow::Borrowed(s)) if s.len() > 2 => (nested_depth, s[2..].to_owned()),
        Some(Cow::Borrowed(s)) if s.len() == 2 => (sibling_depth, String::from("")),
        None | Some(Cow::Borrowed(_)) => todo!("link cannot be empty"),
        Some(Cow::Owned(_s)) => unreachable!("FlatUnixFs is never transformed to owned"),
    };
    assert!(!name.contains('/'));
    Ok((cid, name, depth))
}

impl Walker {
    /// Returns a new instance of a walker, ready to start from the given `Cid`.
    pub fn new(cid: Cid, root_name: String) -> Walker {
        // 1 == Path::ancestors().count() for an empty path
        let depth = if root_name.is_empty() { 1 } else { 2 };
        let next = Some((cid, root_name, depth));

        Walker {
            current: None,
            next,
            pending: Vec::new(),
            should_continue: true,
        }
    }

    /// Returns the next [`Cid`] to load and pass its associated content to [`next`].
    ///
    /// # Panics
    ///
    /// When [`should_continue()`] returns `false`.
    // TODO: perhaps this should return an option?
    pub fn pending_links(&self) -> (&Cid, impl Iterator<Item = &Cid> + '_) {
        use InnerKind::*;
        // rev: because we'll pop any of the pending
        let cids = self.pending.iter().map(|(cid, ..)| cid).rev();

        match self.current.as_ref().map(|c| &c.kind) {
            Some(File(Some(ref visit), _)) => {
                let (first, rest) = visit.pending_links();
                let next = self.next.iter().map(|(cid, _, _)| cid);
                (first, Either::Left(rest.chain(next.chain(cids))))
            }
            _ => {
                let next = self
                    .next
                    .as_ref()
                    .expect("we've validated that we have the next in new and continue_walk");
                (&next.0, Either::Right(cids))
            }
        }
    }

    /// Continues the walk.
    ///
    /// Returns a descriptor for the next element found as `ContinuedWalk` which includes the means
    /// to further continue the walk. `bytes` is the raw data of the next block, `cache` is an
    /// optional cache for data structures which can always be substituted with `&mut None`.
    pub fn next<'a: 'c, 'b: 'c, 'c>(
        &'a mut self,
        bytes: &'b [u8],
        cache: &mut Option<Cache>,
    ) -> Result<ContinuedWalk<'c>, Error> {
        let Self {
            current,
            next,
            pending,
            should_continue,
        } = self;

        *should_continue = false;

        if let Some(InnerEntry {
            cid,
            kind: InnerKind::File(visit @ Some(_), sz),
            metadata,
            path,
            ..
        }) = current
        {
            // we have an ongoing filevisit, the block must be related to it.
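            // note that `take()` leaves `None` behind: if `continue_walk` below returns a
            // further step it is stored back into `visit`, otherwise the file walk is done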
let (bytes, step) = visit.take().unwrap().continue_walk(bytes, cache)?; let file_continues = step.is_some(); let segment = FileSegment::later(bytes, !file_continues); *visit = step; if file_continues || next.is_some() { *should_continue = true; } return Ok(ContinuedWalk::File(segment, cid, path, metadata, *sz)); } let flat = FlatUnixFs::try_from(bytes)?; let metadata = Metadata::from(&flat.data); match flat.data.Type { UnixFsType::Directory => { let flat = crate::dir::check_directory_supported(flat)?; let (cid, name, depth) = next.take().expect("validated at new and earlier"); // depth + 1 because all entries below a directory are children of next, as in, // deeper let links = flat .links .into_iter() .enumerate() .map(|(nth, link)| convert_link(depth + 1, nth, link)) .rev(); // replacing this with try_fold takes as many lines as the R: Try cannot be // deduced without specifying the Error for link in links { pending.push(link?); } if let next_local @ Some(_) = pending.pop() { *next = next_local; *should_continue = true; } Ok(match current { None => { *current = Some(InnerEntry::new_root_dir(cid, metadata, &name, depth)); let ie = current.as_ref().unwrap(); ContinuedWalk::RootDirectory(&ie.cid, &ie.path, &ie.metadata) } Some(ie) => { ie.as_directory(cid, &name, depth, metadata); ContinuedWalk::Directory(&ie.cid, &ie.path, &ie.metadata) } }) } UnixFsType::HAMTShard => { let flat = crate::dir::check_hamtshard_supported(flat)?; let (cid, name, depth) = next.take().expect("validated at start and this method"); // similar to directory, the depth is +1 for nested entries, but the sibling buckets // are at depth let links = flat .links .into_iter() .enumerate() .map(|(nth, link)| convert_sharded_link(depth + 1, depth, nth, link)) .rev(); // TODO: it might be worthwhile to lose the `rev` and sort the pushed links using // the depth ascending. This should make sure we are first visiting the shortest // path items. 
for link in links { pending.push(link?); } if let next_local @ Some(_) = pending.pop() { *next = next_local; *should_continue = true; } Ok(match current { None => { *current = Some(InnerEntry::new_root_bucket(cid, metadata, &name, depth)); let ie = current.as_ref().unwrap(); ContinuedWalk::RootDirectory(&ie.cid, &ie.path, &ie.metadata) } Some(ie) => { // the name should be empty for all of the siblings if name.is_empty() { ie.as_bucket(cid, &name, depth); ContinuedWalk::Bucket(&ie.cid, &ie.path) } // but it should be non-empty for the directories else { ie.as_bucket_root(cid, &name, depth, metadata); ContinuedWalk::RootDirectory(&ie.cid, &ie.path, &ie.metadata) } } }) } UnixFsType::Raw | UnixFsType::File => { let (bytes, file_size, metadata, step) = IdleFileVisit::default().start_from_parsed(flat, cache)?; let (cid, name, depth) = next.take().expect("validated at new and earlier"); let file_continues = step.is_some(); match current { None => { let ie = InnerEntry::new_root_file(cid, metadata, &name, step, file_size, depth); *current = Some(ie); } Some(ie) => { ie.as_file(cid, &name, depth, metadata, step, file_size); } }; let next_local = pending.pop(); if file_continues || next_local.is_some() { *next = next_local; *should_continue = true; } let segment = FileSegment::first(bytes, !file_continues); let ie = current.as_ref().unwrap(); Ok(ContinuedWalk::File( segment, &ie.cid, &ie.path, &ie.metadata, file_size, )) } UnixFsType::Metadata => Err(Error::UnsupportedType(flat.data.Type.into())), UnixFsType::Symlink => { let contents = match flat.data.Data { Some(Cow::Borrowed(bytes)) if !bytes.is_empty() => bytes, None | Some(Cow::Borrowed(_)) => &[][..], _ => unreachable!("never used into_owned"), }; let (cid, name, depth) = next.take().expect("continued without next"); match current { None => { let ie = InnerEntry::new_root_symlink(cid, metadata, &name, depth); *current = Some(ie); } Some(ie) => { ie.as_symlink(cid, &name, depth, metadata); } }; if let next_local @ Some(_) = pending.pop() { *next = next_local; *should_continue = true; } let ie = current.as_ref().unwrap(); Ok(ContinuedWalk::Symlink( contents, &ie.cid, &ie.path, &ie.metadata, )) } } } /// Returns `true` if there are more links to walk over. pub fn should_continue(&self) -> bool { self.should_continue } // TODO: we could easily split a 'static value for a directory or bucket, which would pop all // entries at a single level out to do some parallel walking, though the skipping could already // be used to do that... Maybe we could return the filevisit on Skipped to save user from // re-creating one? How to do the same for directories? } /// Represents what the `Walker` is currently looking at. 
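///
/// The `path` and `depth` fields are kept in sync by `set_path`; as the debug assertions
/// there spell out, `depth` always equals `path.ancestors().count()`.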
struct InnerEntry {
    cid: Cid,
    kind: InnerKind,
    path: PathBuf,
    metadata: Metadata,
    depth: usize,
}

impl From<InnerEntry> for Metadata {
    fn from(e: InnerEntry) -> Self {
        e.metadata
    }
}

#[derive(Debug)]
enum InnerKind {
    /// This is necessarily at the root of the walk
    RootDirectory,
    /// This is necessarily at the root of the walk
    BucketAtRoot,
    /// This is the metadata containing bucket, for which we have a name
    RootBucket,
    /// This is a sibling to a previous named metadata containing bucket
    Bucket,
    /// Directory on any level except root
    Directory,
    /// File optionally on the root level
    File(Option<FileVisit>, u64),
    /// Symlink optionally on the root level
    Symlink,
}

impl InnerEntry {
    fn new_root_dir(cid: Cid, metadata: Metadata, name: &str, depth: usize) -> Self {
        let mut path = PathBuf::new();
        path.push(name);
        Self {
            cid,
            kind: InnerKind::RootDirectory,
            path,
            metadata,
            depth,
        }
    }

    fn new_root_bucket(cid: Cid, metadata: Metadata, name: &str, depth: usize) -> Self {
        let mut path = PathBuf::new();
        path.push(name);
        Self {
            cid,
            kind: InnerKind::BucketAtRoot,
            path,
            metadata,
            depth,
        }
    }

    fn new_root_file(
        cid: Cid,
        metadata: Metadata,
        name: &str,
        step: Option<FileVisit>,
        file_size: u64,
        depth: usize,
    ) -> Self {
        let mut path = PathBuf::new();
        path.push(name);
        Self {
            cid,
            kind: InnerKind::File(step, file_size),
            path,
            metadata,
            depth,
        }
    }

    fn new_root_symlink(cid: Cid, metadata: Metadata, name: &str, depth: usize) -> Self {
        let mut path = PathBuf::new();
        path.push(name);
        Self {
            cid,
            kind: InnerKind::Symlink,
            path,
            metadata,
            depth,
        }
    }

    fn set_path(&mut self, name: &str, depth: usize) {
        debug_assert_eq!(self.depth, self.path.ancestors().count());

        while self.depth >= depth && self.depth > 0 {
            assert!(self.path.pop());
            self.depth = self
                .depth
                .checked_sub(1)
                .expect("underflowed path components");
        }

        self.path.push(name);
        self.depth = depth;

        debug_assert_eq!(self.depth, self.path.ancestors().count());
    }

    fn as_directory(&mut self, cid: Cid, name: &str, depth: usize, metadata: Metadata) {
        use InnerKind::*;
        match self.kind {
            RootDirectory | BucketAtRoot | Bucket | RootBucket | Directory | File(None, _)
            | Symlink => {
                self.cid = cid;
                self.kind = Directory;
                self.set_path(name, depth);
                self.metadata = metadata;
            }
            ref x => unreachable!("directory ({}, {}, {}) following {:?}", cid, name, depth, x),
        }
    }

    fn as_bucket_root(&mut self, cid: Cid, name: &str, depth: usize, metadata: Metadata) {
        use InnerKind::*;
        match self.kind {
            RootDirectory | BucketAtRoot | Bucket | RootBucket | Directory | File(None, _)
            | Symlink => {
                self.cid = cid;
                self.kind = RootBucket;
                self.set_path(name, depth);
                self.metadata = metadata;
            }
            ref x => unreachable!(
                "root bucket ({}, {}, {}) following {:?}",
                cid, name, depth, x
            ),
        }
    }

    fn as_bucket(&mut self, cid: Cid, name: &str, depth: usize) {
        use InnerKind::*;
        match self.kind {
            BucketAtRoot => {
                assert_eq!(self.depth, depth, "{:?}", self.path);
            }
            RootBucket | Bucket | File(None, _) | Symlink => {
                self.cid = cid;
                self.kind = Bucket;

                assert!(name.is_empty());
                // continuation bucket going bucket -> bucket
                while self.depth > depth {
                    assert!(self.path.pop());
                    self.depth = self
                        .depth
                        .checked_sub(1)
                        .expect("underflowed depth calculation during bucket->bucket");
                }

                assert_eq!(self.depth, depth, "{:?}", self.path);
            }
            ref x => unreachable!("bucket ({}, {}, {}) following {:?}", cid, name, depth, x),
        }
    }

    fn as_file(
        &mut self,
        cid: Cid,
        name: &str,
        depth: usize,
        metadata: Metadata,
        step: Option<FileVisit>,
        file_size: u64,
    ) {
        use InnerKind::*;
        match self.kind {
            RootDirectory | BucketAtRoot | RootBucket | Bucket | Directory | File(None, _)
            | Symlink =>
{ self.cid = cid; self.kind = File(step, file_size); self.set_path(name, depth); self.metadata = metadata; } ref x => unreachable!( "file ({}, {}, {}, {}) following {:?}", cid, name, depth, file_size, x ), } } fn as_symlink(&mut self, cid: Cid, name: &str, depth: usize, metadata: Metadata) { use InnerKind::*; match self.kind { Bucket | BucketAtRoot | Directory | File(None, _) | RootBucket | RootDirectory | Symlink => { self.cid = cid; self.kind = Symlink; self.set_path(name, depth); self.metadata = metadata; } ref x => unreachable!("symlink ({}, {}, {}) following {:?}", cid, name, depth, x), } } } impl fmt::Debug for InnerEntry { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { fmt.debug_struct("InnerEntry") .field("depth", &self.depth) .field("kind", &self.kind) .field("cid", &format_args!("{}", self.cid)) .field("path", &self.path) .field("metadata", &self.metadata) .finish() } } /// Representation of the walk progress. #[derive(Debug)] pub enum ContinuedWalk<'a> { /// Currently looking at a continuation of a HAMT sharded directory. Usually safe to ignore. Bucket(&'a Cid, &'a Path), /// Currently looking at a directory. Directory(&'a Cid, &'a Path, &'a Metadata), /// Currently looking at a file. The first tuple value contains the file bytes accessible /// from the block, which can also be an empty slice. File(FileSegment<'a>, &'a Cid, &'a Path, &'a Metadata, u64), /// Currently looking at a root directory. RootDirectory(&'a Cid, &'a Path, &'a Metadata), /// Currently looking at a symlink. The first tuple value contains the symlink target path. It /// might be convertible to UTF-8, but this is not specified in the spec. Symlink(&'a [u8], &'a Cid, &'a Path, &'a Metadata), } impl ContinuedWalk<'_> { #[cfg(test)] fn path(&self) -> &Path { match self { Self::Bucket(_, p) | Self::Directory(_, p, ..) | Self::File(_, _, p, ..) | Self::RootDirectory(_, p, ..) | Self::Symlink(_, _, p, ..) => p, } } } /// A slice of bytes of a possibly multi-block file. The slice can be accessed via `as_bytes()` or /// `AsRef<[u8]>::as_ref()`. #[derive(Debug)] pub struct FileSegment<'a> { bytes: &'a [u8], first_block: bool, last_block: bool, } impl<'a> FileSegment<'a> { fn first(bytes: &'a [u8], last_block: bool) -> Self { FileSegment { bytes, first_block: true, last_block, } } fn later(bytes: &'a [u8], last_block: bool) -> Self { FileSegment { bytes, first_block: false, last_block, } } /// Returns `true` if this is the first block in the file, `false` otherwise. /// /// Note: the first block can also be the last one. pub fn is_first(&self) -> bool { self.first_block } /// Returns `true` if this is the last block in the file, `false` otherwise. /// /// Note: the last block can also be the first one. pub fn is_last(&self) -> bool { self.last_block } /// Returns a slice into the file's bytes, which can be empty, as is the case for any /// intermediate blocks which only contain links to further blocks. pub fn as_bytes(&self) -> &'a [u8] { self.bytes } } impl AsRef<[u8]> for FileSegment<'_> { fn as_ref(&self) -> &[u8] { self.bytes } } /// Errors which can occur while walking a tree. #[derive(Debug)] pub enum Error { /// An unsupported type of UnixFS node was encountered. There should be a way to skip these. Of the /// defined types only `Metadata` is unsupported, all undefined types as of 2020-06 are also /// unsupported. UnsupportedType(UnexpectedNodeType), /// This error is returned when a file e.g. links to a non-Raw or non-File subtree. 
UnexpectedType(UnexpectedNodeType), /// dag-pb node parsing failed, perhaps the block is not a dag-pb node? DagPbParsingFailed(quick_protobuf::Error), /// Failed to parse the unixfs node inside the dag-pb node. UnixFsParsingFailed(quick_protobuf::Error), /// dag-pb node contained no data. EmptyDagPbNode, /// dag-pb link could not be converted to a Cid InvalidCid(InvalidCidInLink), /// A File has an invalid structure File(FileError), /// A Directory has an unsupported structure UnsupportedDirectory(UnexpectedDirectoryProperties), /// HAMTSharded directory has unsupported properties UnsupportedHAMTShard(ShardError), } impl From> for Error { fn from(e: ParsingFailed<'_>) -> Self { use ParsingFailed::*; match e { InvalidDagPb(e) => Error::DagPbParsingFailed(e), InvalidUnixFs(e, _) => Error::UnixFsParsingFailed(e), NoData(_) => Error::EmptyDagPbNode, } } } impl From for Error { fn from(e: InvalidCidInLink) -> Self { Error::InvalidCid(e) } } impl From for Error { fn from(e: FileReadFailed) -> Self { use FileReadFailed::*; match e { File(e) => Error::File(e), UnexpectedType(ut) => Error::UnexpectedType(ut), Read(_) => unreachable!("FileVisit does not parse any blocks"), InvalidCid(l) => Error::InvalidCid(l), } } } impl From for Error { fn from(e: UnexpectedDirectoryProperties) -> Self { Error::UnsupportedDirectory(e) } } impl From for Error { fn from(e: ShardError) -> Self { Error::UnsupportedHAMTShard(e) } } impl fmt::Display for Error { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { use Error::*; match self { UnsupportedType(ut) => write!(fmt, "unsupported UnixFs type: {ut:?}"), UnexpectedType(ut) => write!(fmt, "link to unexpected UnixFs type from File: {ut:?}"), DagPbParsingFailed(e) => write!(fmt, "failed to parse the outer dag-pb: {e}"), UnixFsParsingFailed(e) => write!(fmt, "failed to parse the inner UnixFs: {e}"), EmptyDagPbNode => write!(fmt, "failed to parse the inner UnixFs: no data"), InvalidCid(e) => write!(fmt, "link contained an invalid Cid: {e}"), File(e) => write!(fmt, "invalid file: {e}"), UnsupportedDirectory(udp) => write!(fmt, "unsupported directory: {udp}"), UnsupportedHAMTShard(se) => write!(fmt, "unsupported hamtshard: {se}"), } } } impl std::error::Error for Error {} #[cfg(test)] mod tests { use super::*; use crate::test_support::FakeBlockstore; use std::collections::HashMap; use std::path::PathBuf; #[test] fn walk_two_file_directory_empty() { two_file_directory_scenario(""); } #[test] fn walk_two_file_directory_named() { two_file_directory_scenario("foo"); } fn two_file_directory_scenario(root_name: &str) { println!("new two_file_directory_scenario"); let mut counts = walk_everything(root_name, "QmPTotyhVnnfCu9R4qwR4cdhpi5ENaiP8ZJfdqsm8Dw2jB"); let mut pb = PathBuf::new(); pb.push(root_name); counts.checked_removal(&pb, 1); pb.push("QmVkvLsSEm2uJx1h5Fqukje8mMPYg393o5C2kMCkF2bBTA"); counts.checked_removal(&pb, 1); pb.push("foobar.balanced"); counts.checked_removal(&pb, 5); assert!(pb.pop()); pb.push("foobar.trickle"); counts.checked_removal(&pb, 5); assert!(counts.is_empty(), "{counts:#?}"); } #[test] fn sharded_dir_different_root_empty() { sharded_dir_scenario(""); } #[test] fn sharded_dir_different_root_named() { sharded_dir_scenario("foo"); } fn sharded_dir_scenario(root_name: &str) { use core::fmt::Write; // the hamt sharded directory is such that the root only has buckets so all of the actual files // are at second level buckets, each bucket should have 2 files. the actual files, in fact, constitute a single empty // file, linked from many names. 
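        // the root block of this fixture has eight two-character bucket links, so the root
        // path is counted once for the root directory itself plus eight times for the
        // sibling buckets, nine in total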
        let mut counts =
            walk_everything(root_name, "QmZbFPTnDBMWbQ6iBxQAhuhLz8Nu9XptYS96e7cuf5wvbk");

        let mut buf = PathBuf::from(root_name);

        counts.checked_removal(&buf, 9);

        let indices = [38, 48, 50, 58, 9, 33, 4, 34, 17, 37, 40, 16, 41, 3, 25, 49];
        let mut fmtbuf = String::new();

        for (index, i) in indices.iter().enumerate() {
            fmtbuf.clear();
            write!(fmtbuf, "long-named-file-{i:03}").unwrap();

            if index > 0 {
                buf.pop();
            }
            buf.push(&fmtbuf);

            counts.checked_removal(&buf, 1);
        }

        assert!(counts.is_empty(), "{counts:#?}");
    }

    #[test]
    fn top_level_single_block_file_empty() {
        single_block_top_level_file_scenario("");
    }

    #[test]
    fn top_level_single_block_file_named() {
        single_block_top_level_file_scenario("empty.txt");
    }

    fn single_block_top_level_file_scenario(root_name: &str) {
        let mut counts =
            walk_everything(root_name, "QmbFMke1KXqnYyBBWxB74N4c5SBnJMVAiMNRcGu6x1AwQH");
        let buf = PathBuf::from(root_name);
        counts.checked_removal(&buf, 1);
    }

    #[test]
    fn top_level_symlink_empty() {
        top_level_symlink_scenario("");
    }

    #[test]
    fn top_level_symlink_named() {
        top_level_symlink_scenario("this_links_to_foobar");
    }

    fn top_level_symlink_scenario(root_name: &str) {
        let mut counts =
            walk_everything(root_name, "QmNgQEdXVdLw79nH2bnxLMxnyWMaXrijfqMTiDVat3iyuz");
        let buf = PathBuf::from(root_name);
        counts.checked_removal(&buf, 1);
    }

    #[test]
    fn top_level_multiblock_file_empty() {
        top_level_multiblock_file_scenario("");
    }

    #[test]
    fn top_level_multiblock_file_named() {
        top_level_multiblock_file_scenario("foobar_and_newline.txt");
    }

    fn top_level_multiblock_file_scenario(root_name: &str) {
        let mut counts =
            walk_everything(root_name, "QmWfQ48ChJUj4vWKFsUDe4646xCBmXgdmNfhjz9T7crywd");
        let buf = PathBuf::from(root_name);
        counts.checked_removal(&buf, 5);
    }

    #[test]
    fn test_walked_file_segments() {
        let blocks = FakeBlockstore::with_fixtures();

        let trickle_foobar =
            libipld::Cid::try_from("QmWfQ48ChJUj4vWKFsUDe4646xCBmXgdmNfhjz9T7crywd").unwrap();
        let mut walker = Walker::new(trickle_foobar, String::new());

        let mut counter = 0;

        while walker.should_continue() {
            let (next, _) = walker.pending_links();
            let block = blocks.get_by_cid(next);

            counter += 1;

            match walker.next(block, &mut None).unwrap() {
                ContinuedWalk::File(segment, ..) => {
                    match counter {
                        1 => {
                            // the root block has only links
                            assert!(segment.as_ref().is_empty());
                            assert!(segment.is_first());
                            assert!(!segment.is_last());
                        }
                        2..=4 => {
                            assert_eq!(segment.as_ref().len(), 2);
                            assert!(!segment.is_first());
                            assert!(!segment.is_last());
                        }
                        5 => {
                            assert_eq!(segment.as_ref().len(), 1);
                            assert!(!segment.is_first());
                            assert!(segment.is_last());
                        }
                        _ => unreachable!(),
                    }
                }
                x => unreachable!("{:?}", x),
            };
        }
    }

    trait CountsExt {
        fn checked_removal(&mut self, key: &Path, expected: usize);
    }

    impl CountsExt for HashMap<PathBuf, usize> {
        fn checked_removal(&mut self, key: &Path, expected: usize) {
            use std::collections::hash_map::Entry::*;

            match self.entry(key.to_owned()) {
                Occupied(oe) => {
                    assert_eq!(oe.remove(), expected);
                }
                Vacant(_) => {
                    panic!("no such key {key:?} (expected {expected}) in {self:#?}");
                }
            }
        }
    }

    fn walk_everything(root_name: &str, cid: &str) -> HashMap<PathBuf, usize> {
        let mut ret = HashMap::new();

        let blocks = FakeBlockstore::with_fixtures();

        let mut cache = None;
        let mut walker = Walker::new(libipld::Cid::try_from(cid).unwrap(), root_name.to_string());

        while walker.should_continue() {
            let (next, _) = walker.pending_links();
            let block = blocks.get_by_cid(next);

            let cw = walker.next(block, &mut cache).unwrap();
            *ret.entry(PathBuf::from(cw.path())).or_insert(0) += 1;
        }

        ret
    }
}
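
// A minimal usage sketch of the read loop described on `Walker` above, run against the
// `FakeBlockstore` fixtures; the module and test names are illustrative additions, the
// calls are the API as defined in this file.
#[cfg(test)]
mod walker_usage_sketch {
    use super::*;
    use crate::test_support::FakeBlockstore;

    #[test]
    fn walks_a_two_file_directory() {
        let blocks = FakeBlockstore::with_fixtures();

        // the directory holding foobar.balanced and foobar.trickle, see test_support.rs
        let root = Cid::try_from("QmVkvLsSEm2uJx1h5Fqukje8mMPYg393o5C2kMCkF2bBTA").unwrap();

        let mut walker = Walker::new(root, String::new());
        let mut cache = None;
        let mut files_started = 0;

        while walker.should_continue() {
            // the caller loads the next block by cid out-of-band, then feeds it to the walker
            let (next, _prefetchable) = walker.pending_links();
            let block = blocks.get_by_cid(next);

            if let ContinuedWalk::File(segment, ..) = walker.next(block, &mut cache).unwrap() {
                if segment.is_first() {
                    files_started += 1;
                }
            }
        }

        assert_eq!(files_started, 2);
    }
}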