bgzip-0.3.1/.cargo_vcs_info.json0000644000000001430000000000100121460ustar { "git": { "sha1": "99dc1c8c74bbd3bb916da569b96727ecf1312879" }, "path_in_vcs": "bgzip" }bgzip-0.3.1/Cargo.lock0000644000000475140000000000100101360ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "adler" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "anyhow" version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bgzip" version = "0.3.1" dependencies = [ "anyhow", "clap", "csv", "flate2", "libdeflater", "log", "rand", "rand_pcg", "rayon", "tempfile", "thiserror", ] [[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "cc" version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" version = "4.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f13b9c79b5d1dd500d20ef541215a6423c75829ef43117e1b4d17fd8af0b5d76" dependencies = [ "bitflags", "clap_derive", "clap_lex", "is-terminal", "once_cell", "strsim", "termcolor", "terminal_size", ] [[package]] name = "clap_derive" version = 
"4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" dependencies = [ "heck", "proc-macro-error", "proc-macro2", "quote", "syn", ] [[package]] name = "clap_lex" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" dependencies = [ "os_str_bytes", ] [[package]] name = "cloudflare-zlib-sys" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2040b6d1edfee6d75f172d81e2d2a7807534f3f294ce18184c70e7bb0105cd6f" dependencies = [ "cc", ] [[package]] name = "cmake" version = "0.1.49" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db34956e100b30725f2eb215f90d4871051239535632f84fea3bc92722c66b7c" dependencies = [ "cc", ] [[package]] name = "crc32fast" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ "cfg-if", ] [[package]] name = "crossbeam-channel" version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if", "crossbeam-utils", ] [[package]] name = "crossbeam-deque" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" dependencies = [ "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" version = "0.9.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", "memoffset", "scopeguard", ] [[package]] name = "crossbeam-utils" version = "0.8.14" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" dependencies = [ "cfg-if", ] [[package]] name = "csv" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af91f40b7355f82b0a891f50e70399475945bb0b0da4f1700ce60761c9d3e359" dependencies = [ "csv-core", "itoa", "ryu", "serde", ] [[package]] name = "csv-core" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" dependencies = [ "memchr", ] [[package]] name = "either" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" [[package]] name = "errno" version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" dependencies = [ "errno-dragonfly", "libc", "winapi", ] [[package]] name = "errno-dragonfly" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" dependencies = [ "cc", "libc", ] [[package]] name = "fastrand" version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" dependencies = [ "instant", ] [[package]] name = "flate2" version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" dependencies = [ "cloudflare-zlib-sys", "crc32fast", "libz-ng-sys", "libz-sys", "miniz_oxide", ] [[package]] name = "getrandom" version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" 
dependencies = [ "cfg-if", "libc", "wasi", ] [[package]] name = "heck" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" dependencies = [ "libc", ] [[package]] name = "hermit-abi" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" [[package]] name = "instant" version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ "cfg-if", ] [[package]] name = "io-lifetimes" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3" dependencies = [ "libc", "windows-sys 0.45.0", ] [[package]] name = "is-terminal" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22e18b0a45d56fe973d6db23972bf5bc46f988a4a2385deac9cc29572f09daef" dependencies = [ "hermit-abi 0.3.1", "io-lifetimes", "rustix", "windows-sys 0.45.0", ] [[package]] name = "itoa" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" [[package]] name = "libc" version = "0.2.139" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" [[package]] name = "libdeflate-sys" version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1f7b0817f85e2ba608892f30fbf4c9d03f3ebf9db0c952d1b7c8f7387b54785" dependencies = [ "cc", ] 
[[package]] name = "libdeflater" version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "671e63282f642c7bcc7d292b212d5a4739fef02a77fe98429a75d308f96e7931" dependencies = [ "libdeflate-sys", ] [[package]] name = "libz-ng-sys" version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4399ae96a9966bf581e726de86969f803a81b7ce795fcd5480e640589457e0f2" dependencies = [ "cmake", "libc", ] [[package]] name = "libz-sys" version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9702761c3935f8cc2f101793272e202c72b99da8f4224a19ddcf1279a6450bbf" dependencies = [ "cc", "cmake", "libc", "pkg-config", "vcpkg", ] [[package]] name = "linux-raw-sys" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" [[package]] name = "log" version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", ] [[package]] name = "memchr" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memoffset" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" dependencies = [ "autocfg", ] [[package]] name = "miniz_oxide" version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" dependencies = [ "adler", ] [[package]] name = "num_cpus" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" dependencies = [ "hermit-abi 0.2.6", "libc", 
] [[package]] name = "once_cell" version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "os_str_bytes" version = "6.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" [[package]] name = "pkg-config" version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro-error" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", "syn", "version_check", ] [[package]] name = "proc-macro-error-attr" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" dependencies = [ "proc-macro2", "quote", "version_check", ] [[package]] name = "proc-macro2" version = "1.0.51" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" dependencies = [ "proc-macro2", ] [[package]] name = "rand" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", 
"rand_chacha", "rand_core", ] [[package]] name = "rand_chacha" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", "rand_core", ] [[package]] name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom", ] [[package]] name = "rand_pcg" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59cad018caf63deb318e5a4586d99a24424a364f40f1e5778c29aca23f4fc73e" dependencies = [ "rand_core", ] [[package]] name = "rayon" version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7" dependencies = [ "either", "rayon-core", ] [[package]] name = "rayon-core" version = "1.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b" dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", "num_cpus", ] [[package]] name = "redox_syscall" version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ "bitflags", ] [[package]] name = "remove_dir_all" version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" dependencies = [ "winapi", ] [[package]] name = "rustix" version = "0.36.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", "linux-raw-sys", "windows-sys 0.45.0", ] [[package]] name = 
"ryu" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" [[package]] name = "scopeguard" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "serde" version = "1.0.152" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" [[package]] name = "strsim" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" version = "1.0.107" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "tempfile" version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" dependencies = [ "cfg-if", "fastrand", "libc", "redox_syscall", "remove_dir_all", "winapi", ] [[package]] name = "termcolor" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" dependencies = [ "winapi-util", ] [[package]] name = "terminal_size" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb20089a8ba2b69debd491f8d2d023761cbf196e999218c591fa1e7e15a21907" dependencies = [ "rustix", "windows-sys 0.42.0", ] [[package]] name = "thiserror" version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" dependencies = [ "thiserror-impl", ] [[package]] name = 
"thiserror-impl" version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "unicode-ident" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" [[package]] name = "vcpkg" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" dependencies = [ "winapi", ] [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" version = "0.42.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows-sys" version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" [[package]] name = "windows_aarch64_msvc" version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" [[package]] name = "windows_i686_gnu" version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" [[package]] name = "windows_i686_msvc" version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" [[package]] name = "windows_x86_64_gnu" version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" [[package]] name = "windows_x86_64_gnullvm" version = 
"0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" [[package]] name = "windows_x86_64_msvc" version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" bgzip-0.3.1/Cargo.toml0000644000000037460000000000100101600ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2018" name = "bgzip" version = "0.3.1" authors = ["OKAMURA, Yasunobu "] exclude = [ "testfiles", "tmp", ] description = "Rust implementation of bgzip" homepage = "https://github.com/informationsea/bgzip-rs" readme = "README.md" keywords = [ "bioinformatics", "bgzip", "gzip", "compression", ] categories = ["compression"] license = "MIT" repository = "https://github.com/informationsea/bgzip-rs" [dependencies.flate2] version = "1" optional = true default-features = false [dependencies.libdeflater] version = "0.12.0" optional = true [dependencies.log] version = "0.4" optional = true [dependencies.rayon] version = "1.6.1" optional = true [dependencies.thiserror] version = "1.0" [dev-dependencies.anyhow] version = "1" [dev-dependencies.clap] version = "4.0" features = [ "derive", "wrap_help", ] [dev-dependencies.csv] version = "1" [dev-dependencies.flate2] version = "1" [dev-dependencies.rand] version = "0.8.5" [dev-dependencies.rand_pcg] version = "0.3.1" [dev-dependencies.tempfile] version = "3.3" [features] cloudflare_zlib = [ "flate2/cloudflare_zlib", "flate2", ] default = [ 
"rust_backend", "log", "rayon", ] flate2 = ["dep:flate2"] libdeflater = ["dep:libdeflater"] log = ["dep:log"] rayon = ["dep:rayon"] rust_backend = [ "flate2/rust_backend", "flate2", ] zlib = [ "flate2/zlib", "flate2", ] zlib-ng = [ "flate2/zlib-ng", "flate2", ] zlib-ng-compat = [ "flate2/zlib-ng-compat", "flate2", ] bgzip-0.3.1/Cargo.toml.orig000064400000000000000000000023300072674642500136550ustar 00000000000000[package] name = "bgzip" version = "0.3.1" edition = "2018" authors = ["OKAMURA, Yasunobu "] readme = "../README.md" description = "Rust implementation of bgzip" homepage = "https://github.com/informationsea/bgzip-rs" repository = "https://github.com/informationsea/bgzip-rs" license = "MIT" keywords = ["bioinformatics", "bgzip", "gzip", "compression"] categories = ["compression"] exclude = ["testfiles", "tmp"] [features] default = ["rust_backend", "log", "rayon"] flate2 = ["dep:flate2"] rust_backend = ["flate2/rust_backend", "flate2"] zlib = ["flate2/zlib", "flate2"] zlib-ng-compat = ["flate2/zlib-ng-compat", "flate2"] zlib-ng = ["flate2/zlib-ng", "flate2"] cloudflare_zlib = ["flate2/cloudflare_zlib", "flate2"] libdeflater = ["dep:libdeflater"] rayon = ["dep:rayon"] log = ["dep:log"] [dependencies] flate2 = { version = "1", default-features = false, optional = true } rayon = { version = "1.6.1", optional = true } log = { version = "0.4", optional = true } libdeflater = { version = "0.12.0", optional = true } thiserror = "1.0" [dev-dependencies] flate2 = "1" csv = "1" clap = {version = "4.0", features=["derive", "wrap_help"]} tempfile = "3.3" anyhow = "1" rand = "0.8.5" rand_pcg = "0.3.1" bgzip-0.3.1/README.md000064400000000000000000000055170072674642500122570ustar 00000000000000bgzip-rs ======== [![Build](https://github.com/informationsea/bgzip-rs/actions/workflows/build.yml/badge.svg)](https://github.com/informationsea/bgzip-rs/actions/workflows/build.yml) [![Crates.io](https://img.shields.io/crates/v/bgzip)](https://crates.io/crates/bgzip) 
[![Crates.io](https://img.shields.io/crates/d/bgzip)](https://crates.io/crates/bgzip) [![Crates.io](https://img.shields.io/crates/l/bgzip)](https://crates.io/crates/bgzip) [![doc-rs](https://docs.rs/bgzip/badge.svg)](https://docs.rs/bgzip) Rust implementation of BGZF Feature flags ------------- * `rayon`: Enable [rayon](https://github.com/rayon-rs/rayon) based multi-threaded reader/writer. This is default feature. * `log`: Enable [log](https://github.com/rust-lang/log) crate to log warnings. This is default feature. * `rust_backend`: use [miniz_oxide](https://crates.io/crates/miniz_oxide) crate for [flate2](https://github.com/rust-lang/flate2-rs) backend. This is default feature. * `zlib`: use `zlib` for flate2 backend. Please read [flate2](https://github.com/rust-lang/flate2-rs) description for the detail. * `zlib-ng`: use `zlib-ng` for flate2 backend. Please read [flate2](https://github.com/rust-lang/flate2-rs) description for the detail. * `zlib-ng-compat`: Please read [flate2](https://github.com/rust-lang/flate2-rs) description for the detail. * `cloudflare_zlib`: Please read [flate2](https://github.com/rust-lang/flate2-rs) description for the detail. * `libdeflater`: use [libdeflater](https://github.com/adamkewley/libdeflater) instead of [flate2](https://github.com/rust-lang/flate2-rs) crate. 
Write Examples -------- ```rust use bgzip::{BGZFWriter, BGZFError, Compression}; use std::io::{self, Write}; fn main() -> Result<(), BGZFError> { let mut write_buffer = Vec::new(); let mut writer = BGZFWriter::new(&mut write_buffer, Compression::default()); writer.write_all(b"##fileformat=VCFv4.2\n")?; writer.write_all(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")?; writer.close()?; Ok(()) } ``` Read Examples -------- ```rust use bgzip::{BGZFReader, BGZFError}; use std::io::{self, BufRead}; use std::fs; fn main() -> Result<(), BGZFError> { let mut reader = BGZFReader::new(fs::File::open("testfiles/common_all_20180418_half.vcf.gz")?)?; let mut line = String::new(); reader.read_line(&mut line)?; assert_eq!("##fileformat=VCFv4.0\n", line); reader.bgzf_seek(4210818610)?; line.clear(); reader.read_line(&mut line)?; assert_eq!("1\t72700625\trs12116859\tT\tA,C\t.\t.\tRS=12116859;RSPOS=72700625;dbSNPBuildID=120;SSR=0;SAO=0;VP=0x05010008000517053e000100;GENEINFO=LOC105378798:105378798;WGT=1;VC=SNV;SLO;INT;ASP;VLD;G5A;G5;HD;GNO;KGPhase1;KGPhase3;CAF=0.508,.,0.492;COMMON=1;TOPMED=0.37743692660550458,0.00608435270132517,0.61647872069317023\n", line); Ok(()) } ``` Author ------ Yasunobu OKAMURA License ------- MIT bgzip-0.3.1/examples/bgzip-header-printer.rs000064400000000000000000000026430072674642500172030ustar 00000000000000use clap::Parser; use std::fs::File; use std::io::{stdout, BufReader, Read, Seek, SeekFrom, Write}; #[derive(Debug, Parser)] struct Args { #[command()] file: String, #[arg(short, long)] output: Option, } fn main() -> anyhow::Result<()> { let parser = Args::parse(); let mut file = BufReader::new(File::open(&parser.file)?); let out: Box = if let Some(out) = parser.output { Box::new(File::create(out)?) 
} else { Box::new(stdout().lock()) }; let mut csv_out = csv::WriterBuilder::new().from_writer(out); csv_out.write_record(&[ "offset", "header-size", "compressed-size", "decompressed-size", ])?; loop { let offset = file.seek(SeekFrom::Current(0))?; let header = bgzip::header::BGZFHeader::from_reader(&mut file)?; let compressed_size = header.block_size()?; file.seek(SeekFrom::Current(compressed_size as i64 - 20 - 6 + 4))?; let mut size_buf: [u8; 4] = [0, 0, 0, 0]; file.read_exact(&mut size_buf)?; let uncompressed_size = u32::from_le_bytes(size_buf); csv_out.write_record(&[ format!("{}", offset), format!("{}", header.header_size()), format!("{}", compressed_size), format!("{}", uncompressed_size), ])?; if uncompressed_size == 0 { break; } } Ok(()) } bgzip-0.3.1/examples/compress.rs000064400000000000000000000025000072674642500150040ustar 00000000000000use bgzip::write::BGZFWriter; use clap::Parser; use std::fs::File; use std::io::prelude::*; #[cfg(not(feature = "rayon"))] #[derive(Debug, Clone, Parser, PartialEq)] struct Cli { #[command()] input_file: String, #[arg(short, long)] output: String, #[arg(short, long)] compress_level: u32, } #[cfg(feature = "rayon")] #[derive(Debug, Clone, Parser, PartialEq)] struct Cli { #[command()] input_file: String, #[arg(short, long)] output: String, #[arg(short, long)] compress_level: u32, #[arg(short = '@', long)] thread: Option, } fn main() -> anyhow::Result<()> { let cli = Cli::parse(); let mut file_reader = File::open(&cli.input_file)?; let file_writer = File::create(&cli.output)?; let level = bgzip::Compression::new(cli.compress_level)?; #[cfg(feature = "rayon")] let mut writer: Box = if let Some(thread) = cli.thread { rayon::ThreadPoolBuilder::new() .num_threads(thread) .build_global()?; Box::new(bgzip::write::BGZFMultiThreadWriter::new(file_writer, level)) } else { Box::new(BGZFWriter::new(file_writer, level)) }; #[cfg(not(feature = "rayon"))] let mut writer = BGZFWriter::new(file_writer, level)?; std::io::copy(&mut 
file_reader, &mut writer)?; Ok(()) } bgzip-0.3.1/examples/decompress.rs000064400000000000000000000021450072674642500153220ustar 00000000000000use bgzip::read::BGZFReader; use clap::Parser; use std::fs::File; use std::io::prelude::*; #[cfg(not(feature = "rayon"))] #[derive(Debug, Parser)] struct Cli { #[command()] input_file: String, #[arg(short, long)] output: String, } #[cfg(feature = "rayon")] #[derive(Debug, Parser)] struct Cli { #[command()] input_file: String, #[arg(short, long)] output: String, #[arg(short = '@', long)] thread: Option, } fn main() -> anyhow::Result<()> { let cli = Cli::parse(); let file_reader = File::open(&cli.input_file)?; let mut file_writer = File::create(&cli.output)?; #[cfg(feature = "rayon")] let mut reader: Box = if let Some(thread) = cli.thread { rayon::ThreadPoolBuilder::new() .num_threads(thread) .build_global()?; Box::new(bgzip::read::BGZFMultiThreadReader::new(file_reader)?) } else { Box::new(BGZFReader::new(file_reader)?) }; #[cfg(not(feature = "rayon"))] let mut reader = BGZFReader::new(file_reader)?; std::io::copy(&mut reader, &mut file_writer)?; Ok(()) } bgzip-0.3.1/examples/tabix-printer.rs000064400000000000000000000050320072674642500157440ustar 00000000000000use bgzip::tabix::Tabix; use clap::Parser; use std::fs::File; use std::io::{stdout, Write}; #[derive(Debug, Parser)] struct Args { #[command()] file: String, #[arg(short, long)] output: Option, } fn main() -> anyhow::Result<()> { let parser = Args::parse(); let file = Tabix::from_reader(File::open(&parser.file)?)?; let out: Box = if let Some(out) = parser.output { Box::new(File::create(out)?) 
} else { Box::new(stdout().lock()) }; let mut csv_out = csv::WriterBuilder::new().flexible(true).from_writer(out); csv_out.write_record(&[ "# of sequences", "format", "coordinate rule", "column for the sequence name", "column for the start of a region", "column for the end fo a region", "meta", "skip", "Length of concatenated sequence names", ])?; csv_out.write_record(&[ format!("{}", file.number_of_references), match file.format & 0xffff { 0 => "Generic".to_string(), 1 => "SAM".to_string(), 2 => "VCF".to_string(), _ => format!("Unknown: {}", file.format), }, match file.format & 0x10000 { 0 => "GFF Rule".to_string(), _ => "BED Rule".to_string(), }, format!("{}", file.column_for_sequence), format!("{}", file.column_for_begin), format!("{}", file.column_for_end), format!("{}", String::from_utf8_lossy(&file.meta)), format!("{}", file.skip), format!("{}", file.length_of_concatenated_sequence_names), ])?; csv_out.write_record(&[""])?; csv_out.write_record(&[ "sequence index", "sequence name", "bin index", "bin", "chunk index", "begin", "end", ])?; for (i, (ref_name, sequence)) in file.names.iter().zip(file.sequences.iter()).enumerate() { let mut bins: Vec<_> = sequence.bins.values().collect(); bins.sort_by_key(|x| x.bin); for (j, bin) in bins.iter().enumerate() { for (k, x) in bin.chunks.iter().enumerate() { csv_out.write_record(&[ format!("{}", i), String::from_utf8_lossy(ref_name).to_string(), format!("{}", j), format!("0x{:x}", bin.bin), format!("{}", k), format!("0x{:x}", x.begin), format!("0x{:x}", x.end), ])?; } } } Ok(()) } bgzip-0.3.1/src/csi.rs000064400000000000000000000022720072674642500127060ustar 00000000000000use std::convert::TryInto; /// calculate bin given an alignment covering [beg,end) (zero-based, half-close-half-open) pub fn reg2bin(beg: i64, end: i64, min_shift: u32, depth: u32) -> u32 { let end = end - 1; let mut s = min_shift; let mut t = ((1 << (depth * 3)) - 1) / 7; for l2 in 0..depth { //eprintln!("depth: {}", l2); let l = depth - l2; if beg >> 
s == end >> s { //eprintln!("value: {}", (t + (beg >> s))); return (t + (beg >> s)).try_into().unwrap(); }; s += 3; //let t2 = t; t -= 1 << ((l - 1) * 3); //eprintln!("t : {} -> {} / {} / {}", t2, t, l, 1 << (l * 3)); } 0 } /// calculate the list of bins that may overlap with region [beg,end) (zero-based) pub fn reg2bins(beg: i64, end: i64, min_shift: u32, depth: u32) -> Vec { let mut bins: Vec = Vec::new(); let end = end - 1; let mut s = min_shift + depth * 3; let mut t = 0; for l in 0..=depth { let b = t + (beg >> s); let e = t + (end >> s); for i in b..=e { bins.push(i.try_into().unwrap()); } s -= 3; t += 1 << (l * 3); } bins } bgzip-0.3.1/src/deflate.rs000064400000000000000000000165350072674642500135430ustar 00000000000000//! Binding to DEFLATE library. //! //! [libdeflater](https://crates.io/crates/libdeflater) or [flate2](https://crates.io/crates/flate2) is used to compress/decompress data. use std::convert::TryInto; use thiserror::Error; #[cfg(not(feature = "libdeflater"))] use flate2::Status; #[cfg(not(feature = "libdeflater"))] pub use flate2::Crc; #[cfg(feature = "libdeflater")] pub use libdeflater::Crc; use crate::BGZFError; /// Compression Level #[cfg(not(feature = "libdeflater"))] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct Compression(flate2::Compression); /// Compression Level #[cfg(feature = "libdeflater")] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct Compression(libdeflater::CompressionLvl); #[cfg(not(feature = "libdeflater"))] impl Compression { pub const fn new(level: u32) -> Result { Ok(Compression(flate2::Compression::new(level))) } pub const fn best() -> Self { Compression(flate2::Compression::best()) } pub const fn fast() -> Self { Compression(flate2::Compression::fast()) } } #[cfg(not(feature = "libdeflater"))] impl From for Compression { fn from(value: flate2::Compression) -> Self { Compression(value) } } #[cfg(feature = "libdeflater")] impl Compression { pub fn new(level: u32) -> Result { Ok(Compression( 
libdeflater::CompressionLvl::new(
                // A level that does not even fit the backend's integer type is
                // reported as an invalid level instead of panicking.
                level
                    .try_into()
                    .map_err(|_| BGZFError::InvalidCompressionLevel)?,
            )
            .map_err(|e| match e {
                libdeflater::CompressionLvlError::InvalidValue => {
                    BGZFError::InvalidCompressionLevel
                }
            })?,
        ))
    }

    /// Best (slowest) compression level of the backend.
    pub fn best() -> Self {
        Compression(libdeflater::CompressionLvl::best())
    }

    /// Fastest compression level of the backend.
    pub fn fast() -> Self {
        Compression(libdeflater::CompressionLvl::fastest())
    }
}

#[cfg(not(feature = "libdeflater"))]
impl Default for Compression {
    fn default() -> Self {
        Compression(flate2::Compression::default())
    }
}

#[cfg(feature = "libdeflater")]
impl Default for Compression {
    fn default() -> Self {
        Compression(libdeflater::CompressionLvl::default())
    }
}

/// Compression Error
#[derive(Debug, Error, Clone, PartialEq)]
pub enum CompressError {
    #[error("Insufficient space")]
    InsufficientSpace,
    #[error("Other: {0}")]
    Other(String),
}

/// flate2 based compressor
#[cfg(not(feature = "libdeflater"))]
#[derive(Debug)]
pub struct Compress(flate2::Compress);

#[cfg(not(feature = "libdeflater"))]
impl Compress {
    /// Create a raw DEFLATE compressor (no zlib header) for `level`.
    pub fn new(level: Compression) -> Self {
        Compress(flate2::Compress::new(level.0, false))
    }

    /// Compress `original_data` into `compressed_data` in one shot.
    ///
    /// Returns the number of bytes written. The whole stream must fit in
    /// `compressed_data`; otherwise [`CompressError::InsufficientSpace`] is returned.
    pub fn compress(
        &mut self,
        original_data: &[u8],
        compressed_data: &mut [u8],
    ) -> Result<usize, CompressError> {
        // The compressor is reused between calls, so start from a clean state.
        self.0.reset();
        let status = self
            .0
            .compress(
                original_data,
                compressed_data,
                flate2::FlushCompress::Finish,
            )
            .map_err(|e| {
                CompressError::Other(e.message().unwrap_or("Unknown error").to_string())
            })?;
        match status {
            // Anything but a finished stream means the output buffer was too small.
            flate2::Status::BufError | flate2::Status::Ok => {
                Err(CompressError::InsufficientSpace)
            }
            flate2::Status::StreamEnd => Ok(self.0.total_out().try_into().unwrap()),
        }
    }
}

/// libdeflater based compressor
#[cfg(feature = "libdeflater")]
pub struct Compress(libdeflater::Compressor);

#[cfg(feature = "libdeflater")]
impl Compress {
    /// Create a DEFLATE compressor for `level`.
    pub fn new(level: Compression) -> Self {
        Compress(libdeflater::Compressor::new(level.0))
    }

    /// Compress `original_data` into `compressed_data` in one shot.
    ///
    /// Returns the number of bytes written.
    pub fn compress(
        &mut self,
        original_data: &[u8],
        compressed_data: &mut [u8],
    ) -> Result<usize, CompressError> {
        self.0
            .deflate_compress(original_data, compressed_data)
            .map_err(|e| match e {
libdeflater::CompressionError::InsufficientSpace => {
                    CompressError::InsufficientSpace
                }
            })
    }
}

/// Decompress Error
#[derive(Debug, Error, Clone, PartialEq)]
pub enum DecompressError {
    #[error("Decompress Error: Insufficient space")]
    InsufficientSpace,
    #[error("Decompress Error: Bad data")]
    BadData,
    #[error("Decompress Error: {0}")]
    Other(String),
}

/// flate2 based decompressor
#[cfg(not(feature = "libdeflater"))]
#[derive(Debug)]
pub struct Decompress(flate2::Decompress);

#[cfg(not(feature = "libdeflater"))]
impl Decompress {
    /// Create a raw DEFLATE decompressor (no zlib header).
    pub fn new() -> Self {
        Decompress(flate2::Decompress::new(false))
    }

    /// Decompress `compressed_data` into `decompressed_data` in one shot.
    ///
    /// Returns the number of bytes written. The whole output must fit in
    /// `decompressed_data`; otherwise [`DecompressError::InsufficientSpace`] is returned.
    pub fn decompress(
        &mut self,
        compressed_data: &[u8],
        decompressed_data: &mut [u8],
    ) -> Result<usize, DecompressError> {
        // The decompressor is reused between calls, so start from a clean state.
        self.0.reset(false);
        let status = self
            .0
            .decompress(
                compressed_data,
                decompressed_data,
                flate2::FlushDecompress::Finish,
            )
            .map_err(|e| {
                DecompressError::Other(e.message().unwrap_or("Unknown Error").to_string())
            })?;
        match status {
            Status::StreamEnd => Ok(self.0.total_out().try_into().unwrap()),
            // Anything but a finished stream means the output buffer was too small.
            Status::Ok | Status::BufError => Err(DecompressError::InsufficientSpace),
        }
    }
}

/// libdeflater based decompressor
#[cfg(feature = "libdeflater")]
pub struct Decompress(libdeflater::Decompressor);

#[cfg(feature = "libdeflater")]
impl Decompress {
    /// Create a DEFLATE decompressor.
    pub fn new() -> Self {
        Decompress(libdeflater::Decompressor::new())
    }

    /// Decompress `compressed_data` into `decompressed_data` in one shot.
    ///
    /// Returns the number of bytes written.
    pub fn decompress(
        &mut self,
        compressed_data: &[u8],
        decompressed_data: &mut [u8],
    ) -> Result<usize, DecompressError> {
        self.0
            .deflate_decompress(compressed_data, decompressed_data)
            .map_err(|e| match e {
                libdeflater::DecompressionError::BadData => DecompressError::BadData,
                libdeflater::DecompressionError::InsufficientSpace => {
                    DecompressError::InsufficientSpace
                }
            })
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use rand::prelude::*;

    const BUF_SIZE: usize = 3000;

    #[test]
    fn test_deflate_inflate() -> anyhow::Result<()> {
        // Fixed seed keeps the pseudo-random payload reproducible.
        let mut rand = rand_pcg::Pcg64Mcg::seed_from_u64(0x3874aef456157523);
        let mut original_data = vec![0; BUF_SIZE];
        rand.fill_bytes(&mut original_data);
        let mut
compress = Compress::new(Compression::default());

        // An output buffer that is far too small must be reported, not truncated.
        let mut small_buf = [0; 100];
        assert_eq!(
            compress.compress(&original_data, &mut small_buf),
            Err(CompressError::InsufficientSpace)
        );

        let mut decompress = Decompress::new();
        let mut deflated_data = vec![0; BUF_SIZE + 500];
        let deflate_size = compress.compress(&original_data, &mut deflated_data)?;
        let mut inflated_data = vec![0; BUF_SIZE];
        assert_eq!(
            decompress.decompress(&deflated_data[..deflate_size], &mut small_buf),
            Err(DecompressError::InsufficientSpace)
        );
        // A truncated stream must fail as well.
        assert!(decompress
            .decompress(&deflated_data[..100], &mut inflated_data)
            .is_err());
        // Full round trip.
        let inflate_size =
            decompress.decompress(&deflated_data[..deflate_size], &mut inflated_data)?;
        assert_eq!(inflate_size, original_data.len());
        assert_eq!(inflated_data, original_data);
        Ok(())
    }
}
bgzip-0.3.1/src/error.rs000064400000000000000000000032140072674642500132560ustar 00000000000000use thiserror::Error;

/// A BGZF error.
#[derive(Debug, Error)]
pub enum BGZFError {
    /// Failed to parse header
    #[error("Failed to parse header at position: {position}")]
    HeaderParseError { position: u64 },
    /// Not tabix format
    #[error("not tabix format")]
    NotTabix,
    /// Not BGZF format
    #[error("not BGZF format")]
    NotBGZF,
    /// Not gzip format
    #[error("not gzip format")]
    NotGzip,
    /// Too large compress unit. A compress unit must be smaller than 64k bytes.
#[error("Too large compress unit")] TooLargeCompressUnit, /// I/O Error #[error("I/O Error: {0}")] IoError(#[from] std::io::Error), /// UTF-8 Error #[error("Utf8 Error: {0}")] Utf8Error(#[from] std::str::Utf8Error), /// Failed to convert native path to UTF-8 #[error("Failed to convert native path to UTF-8")] PathConvertionError, /// Deflate compresssion error #[error("Compression Error: {0}")] CompressionError(#[from] crate::deflate::CompressError), /// Inflate decompression error #[error("Decompression Error: {0}")] DecompressionError(#[from] crate::deflate::DecompressError), /// Invalid compression level #[error("Invalid Compression Level")] InvalidCompressionLevel, /// Other error #[error("Error: {0}")] Other(&'static str), } impl Into for BGZFError { fn into(self) -> std::io::Error { match self { BGZFError::IoError(e) => e, other => std::io::Error::new(std::io::ErrorKind::Other, other), } } } impl BGZFError { pub fn into_io_error(self) -> std::io::Error { self.into() } } bgzip-0.3.1/src/header.rs000064400000000000000000000266310072674642500133650ustar 00000000000000use crate::*; use std::convert::TryInto; use std::io; use std::u32; pub const GZIP_ID1: u8 = 31; pub const GZIP_ID2: u8 = 139; pub const BGZIP_HEADER_SIZE: u16 = 20 + 6; /// Gzip extra field #[derive(Debug, Clone, PartialEq, Eq)] pub struct ExtraField { sub_field_id1: u8, sub_field_id2: u8, data: Vec, } impl ExtraField { pub fn new(id1: u8, id2: u8, data: Vec) -> Self { ExtraField { sub_field_id1: id1, sub_field_id2: id2, data, } } pub fn id1(&self) -> u8 { self.sub_field_id1 } pub fn id2(&self) -> u8 { self.sub_field_id2 } pub fn data(&self) -> &[u8] { &self.data } pub fn field_len(&self) -> u16 { TryInto::::try_into(self.data.len()).unwrap() + 4 } pub fn write(&self, mut writer: W) -> io::Result<()> { writer.write_all(&[self.sub_field_id1, self.sub_field_id2])?; writer.write_all(&(TryInto::::try_into(self.data.len()).unwrap()).to_le_bytes())?; writer.write_all(&self.data)?; Ok(()) } } /// gzip file 
header
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BGZFHeader {
    /// Compress Method Field.
    ///
    /// Must be [`DEFLATE`]
    pub compression_method: u8,
    /// Flags
    ///
    /// Combination of [`FLAG_FTEXT`], [`FLAG_FHCRC`], [`FLAG_FEXTRA`], [`FLAG_FNAME`] and [`FLAG_FCOMMENT`].
    pub flags: u8,
    /// Modified date in unix epoch
    ///
    /// Set `0` if unknown.
    pub modified_time: u32,
    /// Extra flags
    pub extra_flags: u8,
    /// Operation System
    ///
    /// Common values are [`FILESYSTEM_UNKNOWN`], [`FILESYSTEM_FAT`], [`FILESYSTEM_NTFS`] and [`FILESYSTEM_UNIX`].
    pub operation_system: u8,
    /// Length of extra field
    pub extra_field_len: Option<u16>,
    /// Extra field content
    pub extra_field: Vec<ExtraField>,
    /// Original filename
    pub file_name: Option<Vec<u8>>,
    /// Comment
    pub comment: Option<Vec<u8>>,
    /// CRC16 in header
    pub crc16: Option<u16>,
}

pub const DEFLATE: u8 = 8;

pub const FLAG_FTEXT: u8 = 1;
pub const FLAG_FHCRC: u8 = 2;
pub const FLAG_FEXTRA: u8 = 4;
pub const FLAG_FNAME: u8 = 8;
pub const FLAG_FCOMMENT: u8 = 16;

pub const FILESYSTEM_FAT: u8 = 0;
pub const FILESYSTEM_UNIX: u8 = 3;
pub const FILESYSTEM_NTFS: u8 = 11;
pub const FILESYSTEM_UNKNOWN: u8 = 255;

impl BGZFHeader {
    /// Create new BGZF file header
    ///
    /// `compressed_len` is added to [`BGZIP_HEADER_SIZE`] to form the block size
    /// stored in the `BC` extra subfield, so the sum must fit in `u16`.
    pub fn new(fast: bool, modified_time: u32, compressed_len: u16) -> Self {
        let block_size = compressed_len + BGZIP_HEADER_SIZE;
        // The `BC` subfield (ids 66, 67) stores the block size minus one.
        let bgzf_field = ExtraField::new(66, 67, (block_size - 1).to_le_bytes().to_vec());

        BGZFHeader {
            compression_method: DEFLATE,
            flags: FLAG_FEXTRA,
            modified_time,
            extra_flags: if fast { 4 } else { 2 },
            operation_system: FILESYSTEM_UNKNOWN,
            extra_field_len: Some(bgzf_field.field_len()),
            extra_field: vec![bgzf_field],
            file_name: None,
            comment: None,
            crc16: None,
        }
    }

    /// Load BGZF block size
    ///
    /// # Errors
    ///
    /// * [`BGZFError::NotBGZF`] when no 2-byte `BC` subfield is present.
    /// * [`BGZFError::TooLargeCompressUnit`] when the stored value is `0xFFFF`,
    ///   because the resulting size does not fit in the `u16` return type.
    pub fn block_size(&self) -> Result<u16, BGZFError> {
        let field = self
            .extra_field
            .iter()
            .find(|x| x.sub_field_id1 == 66 && x.sub_field_id2 == 67 && x.data.len() == 2)
            .ok_or(BGZFError::NotBGZF)?;
        let stored = u16::from_le_bytes([field.data[0], field.data[1]]);
        // The field holds "size - 1"; a plain `+ 1` would overflow for 0xFFFF.
        stored.checked_add(1).ok_or(BGZFError::TooLargeCompressUnit)
    }

    /// Overwrite BGZF block
write
    ///
    /// # Errors
    ///
    /// * [`BGZFError::NotBGZF`] when no 2-byte `BC` subfield is present.
    /// * [`BGZFError::Other`] when `new_block_size` is `0` (the field stores
    ///   the size minus one).
    pub fn update_block_size(&mut self, new_block_size: u16) -> Result<(), BGZFError> {
        let stored = new_block_size
            .checked_sub(1)
            .ok_or(BGZFError::Other("BGZF block size must not be zero"))?;
        self.extra_field
            .iter_mut()
            .find(|x| x.sub_field_id1 == 66 && x.sub_field_id2 == 67 && x.data.len() == 2)
            .map(|x| {
                x.data.copy_from_slice(&stored.to_le_bytes());
            })
            .ok_or(BGZFError::NotBGZF)
    }

    /// Calculate header size
    ///
    /// Counts the 10 fixed bytes, the extra field with its 2-byte length, the
    /// NUL-terminated file name and comment, and the optional CRC16.
    pub fn header_size(&self) -> u64 {
        // A terminating NUL is added on write when the stored bytes lack one.
        fn terminated_len(bytes: &[u8]) -> u64 {
            bytes.len() as u64 + if bytes.ends_with(&[0]) { 0 } else { 1 }
        }

        10u64
            // Widen before adding so a large XLEN cannot overflow `u16`.
            + self.extra_field_len.map(|x| u64::from(x) + 2).unwrap_or(0)
            + self.file_name.as_deref().map(terminated_len).unwrap_or(0)
            + self.comment.as_deref().map(terminated_len).unwrap_or(0)
            + self.crc16.map(|_| 2).unwrap_or(0)
    }

    /// Load gzip header from `reader`
    ///
    /// # Errors
    ///
    /// * [`BGZFError::NotGzip`] when the magic bytes do not match.
    /// * [`BGZFError::Other`] for an unsupported compression method, unknown
    ///   flag bits, or an extra field whose subfields do not add up to its length.
    /// * [`BGZFError::IoError`] when reading fails.
    pub fn from_reader<R: io::Read>(reader: &mut R) -> Result<BGZFHeader, BGZFError> {
        let mut header_data = [0u8; 10];
        reader.read_exact(&mut header_data)?;
        let id1 = header_data[0];
        let id2 = header_data[1];
        if id1 != GZIP_ID1 || id2 != GZIP_ID2 {
            return Err(BGZFError::NotGzip);
        }
        let compression_method = header_data[2];
        if compression_method != DEFLATE {
            return Err(BGZFError::Other("Unsupported compression method"));
        }
        let flags = header_data[3];
        if flags | 0x1f != 0x1f {
            return Err(BGZFError::Other("Unsupported flag"));
        }
        let modified_time = u32::from_le_bytes(header_data[4..8].try_into().unwrap());
        let extra_flags = header_data[8];
        let operation_system = header_data[9];
        let (extra_field_len, extra_field) = if flags & FLAG_FEXTRA != 0 {
            let len = reader.read_le_u16()?;
            let mut remain_bytes = len;
            let mut fields = Vec::new();
            // `>= 4`: a subfield with an empty payload is exactly 4 bytes long.
            while remain_bytes >= 4 {
                let mut buf = [0u8; 4];
                reader.read_exact(&mut buf)?;
                let sub_field_id1 = buf[0];
                let sub_field_id2 = buf[1];
                let sub_field_len = u16::from_le_bytes([buf[2], buf[3]]);
                // Reject a subfield that claims more bytes than the extra field
                // has left, before reading it and before the subtraction below
                // could underflow.
                if u32::from(sub_field_len) + 4 > u32::from(remain_bytes) {
                    return Err(BGZFError::Other("Invalid extra field"));
                }
                let mut buf: Vec<u8> = vec![0; sub_field_len as usize];
                reader.read_exact(&mut buf)?;
                fields.push(ExtraField {
                    sub_field_id1,
                    sub_field_id2,
                    data: buf,
                });
                remain_bytes -= 4 + sub_field_len;
            }
            if remain_bytes != 0 {
                return Err(BGZFError::Other("Invalid extra field"));
            }

            (Some(len), fields)
        } else {
            (None, Vec::new())
        };
        let file_name = if flags & FLAG_FNAME != 0 {
            let mut buf = Vec::new();
            reader.read_until(0, &mut buf)?;
            Some(buf)
        } else {
            None
        };
        let comment = if flags & FLAG_FCOMMENT != 0 {
            let mut buf = Vec::new();
            reader.read_until(0, &mut buf)?;
            Some(buf)
        } else {
            None
        };
        let crc16 = if flags & FLAG_FHCRC != 0 {
            Some(reader.read_le_u16()?)
        } else {
            None
        };

        Ok(BGZFHeader {
            compression_method,
            flags,
            modified_time,
            extra_flags,
            operation_system,
            extra_field_len,
            extra_field,
            file_name,
            comment,
            crc16,
        })
    }

    /// Write gzip header to `writer`
    ///
    /// # Errors
    ///
    /// Returns an error when `flags` disagrees with the optional fields that
    /// are present, when `extra_field_len` differs from the summed subfield
    /// sizes, or when writing fails.
    pub fn write<W: io::Write>(&self, mut writer: W) -> io::Result<()> {
        // Rebuild the flag byte from the optional fields; FTEXT is the only bit
        // that cannot be derived and is taken from `flags`.
        let mut calculated_flags = self.flags & FLAG_FTEXT;
        if self.file_name.is_some() {
            calculated_flags |= FLAG_FNAME;
        }
        if self.comment.is_some() {
            calculated_flags |= FLAG_FCOMMENT;
        }
        if self.crc16.is_some() {
            calculated_flags |= FLAG_FHCRC;
        }
        if self.extra_field_len.is_some() {
            calculated_flags |= FLAG_FEXTRA;
        }
        if calculated_flags != self.flags {
            return Err(io::Error::new(io::ErrorKind::Other, "Invalid bgzip flag"));
        }

        writer.write_all(&[
            GZIP_ID1,
            GZIP_ID2,
            self.compression_method,
            calculated_flags,
        ])?;
        writer.write_all(&self.modified_time.to_le_bytes())?;
        writer.write_all(&[self.extra_flags, self.operation_system])?;
        if let Some(extra_field_len) = self.extra_field_len {
            let total_xlen: u16 = self.extra_field.iter().map(|x| x.field_len()).sum();
            if total_xlen != extra_field_len {
                return Err(io::Error::new(
                    io::ErrorKind::Other,
                    "Invalid bgzip extra field length",
                ));
            }
            writer.write_all(&extra_field_len.to_le_bytes())?;
            for extra in self.extra_field.iter() {
                extra.write(&mut writer)?;
            }
        }
        if let Some(file_name) = self.file_name.as_ref() {
            writer.write_all(file_name)?;
            if !file_name.ends_with(&[0]) {
                writer.write_all(&[0])?;
            }
        }
        if let Some(comment) = self.comment.as_ref() {
            writer.write_all(comment)?;
            if !comment.ends_with(&[0]) {
                writer.write_all(&[0])?;
            }
        }
        if let Some(crc16) = self.crc16 {
            writer.write_all(&crc16.to_le_bytes())?;
        }
        Ok(())
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use std::io::prelude::*;
    use std::{fs::File, io::SeekFrom};

    /// Parses the header of a BGZF file and checks that writing it back
    /// reproduces the original bytes.
    #[test]
    fn load_header() -> Result<(), BGZFError> {
        let mut reader =
            io::BufReader::new(File::open("testfiles/common_all_20180418_half.vcf.gz")?);
        let mut header = BGZFHeader::from_reader(&mut reader)?;
        assert_eq!(header.operation_system, FILESYSTEM_UNKNOWN);
        assert_eq!(header.compression_method, 8);
        assert_eq!(header.flags, 4);
        assert_eq!(header.extra_field_len, Some(6));
        assert_eq!(header.extra_field[0].data.len(), 2);
        // The reader now sits right behind the header.
        let pos = reader.seek(SeekFrom::Current(0))?;
        let mut buf: Vec = Vec::new();
        header.write(&mut buf)?;
        assert_eq!(buf.len(), header.header_size() as usize);
        assert_eq!(header.header_size(), pos);

        let mut actual_header = vec![0u8; buf.len()];
        reader.seek(SeekFrom::Start(0))?;
        reader.read_exact(&mut actual_header)?;
        assert_eq!(buf, actual_header);

        // Re-storing the same block size must not change the serialized header.
        let mut buf: Vec = Vec::new();
        header.update_block_size(header.block_size()?)?;
        header.write(&mut buf)?;
        assert_eq!(buf, actual_header);

        Ok(())
    }

    /// Same round trip for a plain gzip file that has a file name and no extra field.
    #[test]
    fn load_header2() -> Result<(), BGZFError> {
        let mut reader = io::BufReader::new(File::open(
            "testfiles/common_all_20180418_half.vcf.nobgzip.gz",
        )?);
        let header = BGZFHeader::from_reader(&mut reader)?;
        assert_eq!(header.operation_system, FILESYSTEM_UNIX);
        assert_eq!(header.compression_method, 8);
        assert_eq!(header.flags, FLAG_FNAME);
        assert_eq!(header.extra_field_len, None);
        assert_eq!(
            header.file_name,
            Some(b"common_all_20180418_half.vcf.nobgzip\0".to_vec())
        );
        let pos = reader.seek(SeekFrom::Current(0))?;
        let mut buf: Vec = Vec::new();
        header.write(&mut buf)?;
        assert_eq!(buf.len(), header.header_size() as usize);
        assert_eq!(header.header_size(), pos);

        let mut actual_header = vec![0u8; buf.len()];
        reader.seek(SeekFrom::Start(0))?;
        reader.read_exact(&mut actual_header)?;
        assert_eq!(buf, actual_header);

        Ok(())
    }
}
bgzip-0.3.1/src/index.rs000064400000000000000000000111600072674642500132330ustar 00000000000000//!
.gzi index support use std::convert::TryInto; use crate::{BGZFError, BinaryReader}; /// Represents .gzi index file #[derive(Debug, Clone, PartialEq, Default)] pub struct BGZFIndex { pub(crate) entries: Vec, } impl BGZFIndex { pub(crate) fn new() -> Self { BGZFIndex::default() } /// List of index entries pub fn entries(&self) -> &[BGZFIndexEntry] { &self.entries } /// Load .gzi index file from `reader` pub fn from_reader(mut reader: R) -> std::io::Result { let num_entries = reader.read_le_u64()?; let mut result = BGZFIndex::default(); for _ in 0..num_entries { let compressed_offset = reader.read_le_u64()?; let uncompressed_offset = reader.read_le_u64()?; result.entries.push(BGZFIndexEntry { compressed_offset, uncompressed_offset, }) } Ok(result) } /// Write .gzi index file into `writer` pub fn write(&self, mut writer: W) -> std::io::Result<()> { let entries: u64 = self.entries.len().try_into().unwrap(); writer.write_all(&entries.to_le_bytes())?; for one in &self.entries { writer.write_all(&one.compressed_offset.to_le_bytes())?; writer.write_all(&one.uncompressed_offset.to_le_bytes())?; } Ok(()) } /// Convert uncompressed position to bgzf virtual position pub fn uncompressed_pos_to_bgzf_pos(&self, pos: u64) -> Result { let i = self .entries .partition_point(|x| x.uncompressed_offset <= pos); let entry = match i { 0 => BGZFIndexEntry { compressed_offset: 0, uncompressed_offset: 0, }, i => self.entries[i - 1].clone(), }; // eprintln!( // "[{}/{}] {} / {} ", // i, // self.entries().len(), // pos, // entry.uncompressed_offset // ); Ok((entry.compressed_offset << 16) + ((pos - entry.uncompressed_offset) & ((1 << 16) - 1))) } /// Convert bgzf virtual position to uncompressed position pub fn bgzf_pos_to_uncompressed_pos(&self, bgzf_pos: u64) -> Result { let compressed_pos = bgzf_pos >> 16; if compressed_pos == 0 { return Ok(bgzf_pos); } let i = self .entries .binary_search_by(|x| x.compressed_offset.cmp(&compressed_pos)) .map_err(|_| BGZFError::Other("Invalid BGZF 
position"))?; Ok(self.entries[i].uncompressed_offset + (bgzf_pos & ((1 << 16) - 1))) } } /// One entry of .gzi #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct BGZFIndexEntry { pub compressed_offset: u64, pub uncompressed_offset: u64, } #[cfg(test)] mod test { use super::*; use crate::{BGZFWriter, Compression}; use std::fs; use std::io::prelude::*; #[test] fn test_index_read_write() -> anyhow::Result<()> { let data = fs::read("testfiles/generated.bed.gz.gzi")?; let index = BGZFIndex::from_reader(&data[..])?; assert_eq!(index.entries.len(), 295); let mut generated_data = Vec::new(); index.write(&mut generated_data)?; assert_eq!(data, generated_data); Ok(()) } #[test] fn test_index_position_convert() -> anyhow::Result<()> { let mut data_reader = std::io::BufReader::new(flate2::read::MultiGzDecoder::new( fs::File::open("testfiles/generated.bed.gz")?, )); let mut line = String::new(); let mut line_list = Vec::new(); let mut writer = BGZFWriter::new( fs::File::create("tmp/test_index_position_convert.bed.gz")?, Compression::default(), ); loop { let bgzf_pos = writer.bgzf_pos(); let uncompressed_pos = writer.pos(); line.clear(); let size = data_reader.read_line(&mut line)?; if size == 0 { break; } writer.write_all(&line.as_bytes())?; line_list.push((bgzf_pos, uncompressed_pos, line.clone())); } let index = writer.close()?.unwrap(); for (bgzf_pos, uncompressed_pos, _) in &line_list { assert_eq!( index.bgzf_pos_to_uncompressed_pos(*bgzf_pos)?, *uncompressed_pos ); assert_eq!( index.uncompressed_pos_to_bgzf_pos(*uncompressed_pos)?, *bgzf_pos ); } Ok(()) } } bgzip-0.3.1/src/lib.rs000064400000000000000000000163530072674642500127030ustar 00000000000000//! bgzip-rs //! ======== //! [![Build](https://github.com/informationsea/bgzip-rs/actions/workflows/build.yml/badge.svg)](https://github.com/informationsea/bgzip-rs/actions/workflows/build.yml) //! [![Crates.io](https://img.shields.io/crates/v/bgzip)](https://crates.io/crates/bgzip) //! 
[![Crates.io](https://img.shields.io/crates/d/bgzip)](https://crates.io/crates/bgzip)
//! [![Crates.io](https://img.shields.io/crates/l/bgzip)](https://crates.io/crates/bgzip)
//! [![doc-rs](https://docs.rs/bgzip/badge.svg)](https://docs.rs/bgzip)
//!
//!
//! Rust implementation of [BGZF format](https://samtools.github.io/hts-specs/SAMv1.pdf)
//!
//! Feature flags
//! -------------
//!
//! * `rayon`: Enable [rayon](https://github.com/rayon-rs/rayon) based multi-threaded writer. This is a default feature.
//! * `log`: Enable [log](https://github.com/rust-lang/log) crate to log warnings. This is a default feature.
//! * `rust_backend`: use the [miniz_oxide](https://crates.io/crates/miniz_oxide) crate for the [flate2](https://github.com/rust-lang/flate2-rs) backend. This is a default feature.
//! * `zlib`: use `zlib` for flate2 backend. Please read [flate2](https://github.com/rust-lang/flate2-rs) description for the detail.
//! * `zlib-ng`: use `zlib-ng` for flate2 backend. Please read [flate2](https://github.com/rust-lang/flate2-rs) description for the detail.
//! * `zlib-ng-compat`: Please read [flate2](https://github.com/rust-lang/flate2-rs) description for the detail.
//! * `cloudflare_zlib`: Please read [flate2](https://github.com/rust-lang/flate2-rs) description for the detail.
//! * `libdeflater`: use [libdeflater](https://github.com/adamkewley/libdeflater) instead of [flate2](https://github.com/rust-lang/flate2-rs) crate.
//!
//! Write Examples
//! --------
//! ```rust
//! use bgzip::{BGZFWriter, BGZFError, Compression};
//! use std::io::{self, Write};
//! fn main() -> Result<(), BGZFError> {
//! let mut write_buffer = Vec::new();
//! let mut writer = BGZFWriter::new(&mut write_buffer, Compression::default());
//! writer.write_all(b"##fileformat=VCFv4.2\n")?;
//! writer.write_all(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")?;
//! writer.close()?;
//! Ok(())
//! }
//! ```
//!
//! Multi-thread support is available via [`write::BGZFMultiThreadWriter`].
`rayon` flag is required to use this feature.
//!
//! Read Examples
//! --------
//! ```rust
//! use bgzip::{BGZFReader, BGZFError};
//! use std::io::{self, BufRead};
//! use std::fs;
//! fn main() -> Result<(), BGZFError> {
//! let mut reader =
//! BGZFReader::new(fs::File::open("testfiles/common_all_20180418_half.vcf.gz")?)?;
//! let mut line = String::new();
//! reader.read_line(&mut line)?;
//! assert_eq!("##fileformat=VCFv4.0\n", line);
//! reader.bgzf_seek(4210818610)?;
//! line.clear();
//! reader.read_line(&mut line)?;
//! assert_eq!("1\t72700625\trs12116859\tT\tA,C\t.\t.\tRS=12116859;RSPOS=72700625;dbSNPBuildID=120;SSR=0;SAO=0;VP=0x05010008000517053e000100;GENEINFO=LOC105378798:105378798;WGT=1;VC=SNV;SLO;INT;ASP;VLD;G5A;G5;HD;GNO;KGPhase1;KGPhase3;CAF=0.508,.,0.492;COMMON=1;TOPMED=0.37743692660550458,0.00608435270132517,0.61647872069317023\n", line);
//!
//! Ok(())
//! }
//! ```

mod error;

pub(crate) mod csi;
pub mod deflate;
/// BGZ header parser
pub mod header;
pub mod index;
pub mod read;
pub use deflate::Compression;
/// Tabix file parser. (This module is alpha state.)
pub mod tabix;
pub mod write;

pub use error::BGZFError;
pub use read::BGZFReader;
pub use write::BGZFWriter;
use std::io;

/// End-of-file marker.
///
/// This marker should be written at the end of BGZF files.
pub const EOF_MARKER: [u8; 28] = [
    0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x06, 0x00, 0x42, 0x43, 0x02, 0x00,
    0x1b, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];

/// Little-endian primitive readers, available on every [`io::Read`].
pub(crate) trait BinaryReader: io::Read {
    /// Read one byte.
    fn read_le_u8(&mut self) -> io::Result<u8> {
        let mut buf: [u8; 1] = [0];
        self.read_exact(&mut buf)?;
        Ok(u8::from_le_bytes(buf))
    }

    /// Read a little-endian `u16`.
    fn read_le_u16(&mut self) -> io::Result<u16> {
        let mut buf: [u8; 2] = [0, 0];
        self.read_exact(&mut buf)?;
        Ok(u16::from_le_bytes(buf))
    }

    /// Read a little-endian `u32`.
    fn read_le_u32(&mut self) -> io::Result<u32> {
        let mut buf: [u8; 4] = [0, 0, 0, 0];
        self.read_exact(&mut buf)?;
        Ok(u32::from_le_bytes(buf))
    }

    /// Read a little-endian `i32`.
    fn read_le_i32(&mut self) -> io::Result<i32> {
        let mut buf: [u8; 4] = [0, 0, 0, 0];
        self.read_exact(&mut buf)?;
        Ok(i32::from_le_bytes(buf))
    }

    /// Read a little-endian `u64`.
    fn read_le_u64(&mut self) -> io::Result<u64> {
        let mut buf: [u8; 8] = [0, 0, 0, 0, 0, 0, 0, 0];
        self.read_exact(&mut buf)?;
        Ok(u64::from_le_bytes(buf))
    }

    /// Append bytes to `buf` up to and including the first `byte`, or until
    /// end of input. Returns the number of bytes appended.
    ///
    /// Reads one byte at a time, so it never consumes input past the delimiter.
    fn read_until(&mut self, byte: u8, buf: &mut Vec<u8>) -> io::Result<usize> {
        let mut tmp = [0u8];
        let mut total_bytes: usize = 0;
        loop {
            match self.read(&mut tmp) {
                Ok(0) => break,
                Ok(_) => {}
                // An interrupted read is transient: retry instead of failing.
                Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
                Err(e) => return Err(e),
            }
            buf.push(tmp[0]);
            total_bytes += 1;
            if tmp[0] == byte {
                break;
            }
        }
        Ok(total_bytes)
    }
}

impl<R: io::Read> BinaryReader for R {}

#[cfg(test)]
mod test {
    use crate::index::BGZFIndex;

    use super::*;
    use std::fs;
    use std::io::{BufRead, Write};

    #[test]
    fn test_run() -> Result<(), BGZFError> {
        let mut write_buffer = Vec::new();
        let mut writer = BGZFWriter::new(&mut write_buffer, Compression::default());
        writer.write_all(b"##fileformat=VCFv4.2\n")?;
        writer.write_all(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")?;
        writer.close()?;
        Ok(())
    }

    #[test]
    fn test_read() -> Result<(), BGZFError> {
        let mut reader =
            BGZFReader::new(fs::File::open("testfiles/common_all_20180418_half.vcf.gz")?)?;
        let mut line = String::new();
        reader.read_line(&mut line)?;
        assert_eq!("##fileformat=VCFv4.0\n", line);
        reader.bgzf_seek(4210818610)?;
        line.clear();
        reader.read_line(&mut line)?;
assert_eq!("1\t72700625\trs12116859\tT\tA,C\t.\t.\tRS=12116859;RSPOS=72700625;dbSNPBuildID=120;SSR=0;SAO=0;VP=0x05010008000517053e000100;GENEINFO=LOC105378798:105378798;WGT=1;VC=SNV;SLO;INT;ASP;VLD;G5A;G5;HD;GNO;KGPhase1;KGPhase3;CAF=0.508,.,0.492;COMMON=1;TOPMED=0.37743692660550458,0.00608435270132517,0.61647872069317023\n", line);

        Ok(())
    }

    #[test]
    fn test_read_all() -> Result<(), BGZFError> {
        let reader =
            BGZFReader::new(fs::File::open("testfiles/common_all_20180418_half.vcf.gz")?)?;
        let expected_reader = std::io::BufReader::new(flate2::read::MultiGzDecoder::new(
            fs::File::open("testfiles/common_all_20180418_half.vcf.gz")?,
        ));
        for (line1, line2) in reader.lines().zip(expected_reader.lines()) {
            assert_eq!(line1?, line2?);
        }
        Ok(())
    }

    #[test]
    fn test_index_read_write() -> anyhow::Result<()> {
        let data = fs::read("testfiles/generated.bed.gz.gzi")?;
        let index = BGZFIndex::from_reader(&data[..])?;
        assert_eq!(index.entries.len(), 295);
        let mut generated_data = Vec::new();
        index.write(&mut generated_data)?;
        assert_eq!(data, generated_data);
        Ok(())
    }
}
bgzip-0.3.1/src/read/mod.rs000064400000000000000000000364720072674642500136330ustar 00000000000000//! BGZF reader

#[cfg(feature = "rayon")]
mod thread;

#[cfg(feature = "rayon")]
pub use thread::BGZFMultiThreadReader;

use crate::deflate::*;
use crate::index::BGZFIndex;
use crate::{header::BGZFHeader, BGZFError};
use std::convert::TryInto;
use std::io::{self, prelude::*};

/// Load single block from reader.
///
/// This function is useful when writing your own parallelized BGZF reader.
/// Loaded buffer can be decompress with [`decompress_block`] function.
///
/// `buffer` is cleared and then filled with everything that follows the gzip
/// header inside the block (compressed payload plus trailer). The parsed
/// header is returned.
///
/// # Errors
///
/// Fails when the header cannot be parsed, when it carries no BGZF block size,
/// when the block size is smaller than the header itself, or when reading fails.
pub fn load_block<R: Read>(mut reader: R, buffer: &mut Vec<u8>) -> Result<BGZFHeader, BGZFError> {
    let header = BGZFHeader::from_reader(&mut reader)?;
    let block_size: u64 = header.block_size()?.into();
    // A malformed header can claim a block smaller than the header itself;
    // report it instead of underflowing.
    let payload_len = block_size
        .checked_sub(header.header_size())
        .ok_or(BGZFError::Other("Invalid BGZF block size"))?;
    buffer.clear();
    buffer.resize(payload_len.try_into().unwrap(), 0);
    reader.read_exact(buffer)?;

    Ok(header)
}

/// Decompress single BGZF block from buffer.
The buffer should be loaded with [`load_block`] function. /// /// This function is useful when writing your own parallelized BGZF reader. pub fn decompress_block( decompressed_data: &mut Vec, compressed_block: &[u8], decompress: &mut Decompress, ) -> Result<(), BGZFError> { let original_decompress_data_len = decompressed_data.len(); let mut crc = Crc::new(); let expected_len_data = [ compressed_block[(compressed_block.len() - 4)], compressed_block[(compressed_block.len() - 3)], compressed_block[(compressed_block.len() - 2)], compressed_block[(compressed_block.len() - 1)], ]; let expected_len: usize = u32::from_le_bytes(expected_len_data).try_into().unwrap(); decompressed_data.resize(original_decompress_data_len + expected_len, 0); decompress.decompress( compressed_block, &mut decompressed_data[original_decompress_data_len..], )?; let expected_crc_data = [ compressed_block[(compressed_block.len() - 8)], compressed_block[(compressed_block.len() - 7)], compressed_block[(compressed_block.len() - 6)], compressed_block[(compressed_block.len() - 5)], ]; let expected_crc = u32::from_le_bytes(expected_crc_data); crc.update(&decompressed_data[original_decompress_data_len..]); if expected_crc != crc.sum() { return Err(BGZFError::Other("unmatched CRC32 of decompressed data")); } Ok(()) } /// A BGZF reader /// /// Decode BGZF file with seek support. pub struct BGZFReader { reader: R, decompress: Decompress, compressed_buffer: Vec, current_buffer: Vec, current_block: u64, next_block: u64, current_position_in_block: usize, eof_pos: u64, } impl BGZFReader { /// Seek BGZF with position. This position is not equal to real file offset, /// but equal to virtual file offset described in [BGZF format](https://samtools.github.io/hts-specs/SAMv1.pdf). /// Please read "4.1.1 Random access" to learn more. 
pub fn bgzf_seek(&mut self, position: u64) -> Result<(), BGZFError> {
        // Upper 48 bits: file offset of the block. Lower 16 bits: offset
        // inside the decompressed block.
        self.next_block = position >> 16;
        self.reader.seek(io::SeekFrom::Start(self.next_block))?;
        self.load_next()?;
        let offset_in_block = (position & 0xffff) as usize;
        // An offset past the end of the decompressed block is not a valid
        // virtual position (the exact end is allowed).
        if offset_in_block > self.current_buffer.len() {
            return Err(BGZFError::Other("Invalid BGZF position"));
        }
        self.current_position_in_block = offset_in_block;

        Ok(())
    }
}

impl<R: Read> BGZFReader<R> {
    /// Create a new BGZF reader from [`std::io::Read`]
    ///
    /// The first block is loaded and decompressed immediately, so this fails
    /// when `reader` does not start with a valid BGZF block.
    pub fn new(mut reader: R) -> Result<Self, BGZFError> {
        let mut decompress = Decompress::new();
        let mut compressed_buffer = Vec::new();
        let header = load_block(&mut reader, &mut compressed_buffer)?;
        let mut buffer = Vec::new();
        decompress_block(&mut buffer, &compressed_buffer, &mut decompress)?;

        // The next block starts after the header *and* the payload of the
        // first block; `compressed_buffer` holds the payload only.
        let payload_len: u64 = compressed_buffer.len().try_into().unwrap();
        let next_block = header.header_size() + payload_len;

        Ok(BGZFReader {
            reader,
            decompress,
            current_buffer: buffer,
            current_block: 0,
            next_block,
            current_position_in_block: 0,
            eof_pos: u64::MAX,
            compressed_buffer,
        })
    }

    /// Get BGZF virtual file offset. This position is not equal to real file offset,
    /// but equal to virtual file offset described in [BGZF format](https://samtools.github.io/hts-specs/SAMv1.pdf).
    /// Please read "4.1.1 Random access" to learn more.
pub fn bgzf_pos(&self) -> u64 {
        self.current_block << 16 | (self.current_position_in_block & 0xffff) as u64
    }

    /// Load and decompress the block at `next_block` from the current reader
    /// position and advance `next_block` past it.
    ///
    /// Once the end-of-file marker has been seen, loading at or beyond it
    /// yields an empty block.
    fn load_next(&mut self) -> Result<(), BGZFError> {
        // `EOF_MARKER` is a complete block; its gzip header takes this many bytes.
        const EOF_MARKER_HEADER_LEN: usize = 18;

        if self.next_block >= self.eof_pos {
            // Expose an empty block so data of a previously loaded block is
            // never served again.
            self.current_buffer.clear();
            self.current_block = self.next_block;
            self.current_position_in_block = 0;
            return Ok(());
        }

        self.compressed_buffer.clear();
        let header = load_block(&mut self.reader, &mut self.compressed_buffer)?;
        let header_size = header.header_size();
        // `load_block` strips the header, so only the part of the marker behind
        // its header can be compared (the full 28-byte marker never matches).
        if self.compressed_buffer[..] == crate::EOF_MARKER[EOF_MARKER_HEADER_LEN..] {
            self.eof_pos = self.next_block;
            self.current_buffer.clear();
            self.current_block = self.next_block;
            self.current_position_in_block = 0;
            return Ok(());
        }

        self.current_buffer.clear();
        decompress_block(
            &mut self.current_buffer,
            &self.compressed_buffer,
            &mut self.decompress,
        )?;
        self.current_block = self.next_block;
        let current_block_size: u64 = self.compressed_buffer.len().try_into().unwrap();
        self.next_block += current_block_size + header_size;
        self.current_position_in_block = 0;

        Ok(())
    }
}

impl<R: Read> BufRead for BGZFReader<R> {
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        if self.current_position_in_block >= self.current_buffer.len() {
            self.load_next().map_err(|e| e.into_io_error())?;
        }

        // `get` yields an empty slice at the exact end and `None` beyond it, so
        // an out-of-range position reads as "no data" instead of underflowing.
        Ok(self
            .current_buffer
            .get(self.current_position_in_block..)
            .unwrap_or(&[]))
    }

    fn consume(&mut self, amt: usize) {
        let remain_bytes = self
            .current_buffer
            .len()
            .saturating_sub(self.current_position_in_block);
        if amt <= remain_bytes {
            self.current_position_in_block += amt;
        } else {
            // Callers must not consume more than `fill_buf` returned.
            unreachable!()
        }
    }
}

impl<R: Read> Read for BGZFReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let internal_buf = self.fill_buf()?;
        let bytes_to_copy = buf.len().min(internal_buf.len());
        buf[0..bytes_to_copy].copy_from_slice(&internal_buf[0..bytes_to_copy]);
        self.consume(bytes_to_copy);
        Ok(bytes_to_copy)
    }
}

/// Seekable BGZF reader.
pub struct IndexedBGZFReader<R: Read + Seek> {
    reader: BGZFReader<R>,
    index: BGZFIndex,
    current_pos: u64,
    end_pos: u64,
}

impl<R: Read + Seek> IndexedBGZFReader<R> {
    /// Create new [`IndexedBGZFReader`] from [`BGZFReader`] and [`BGZFIndex`].
    ///
    /// The total uncompressed length is determined by reading from the last
    /// indexed block to the end; the reader is then rewound to the start.
    ///
    /// # Errors
    ///
    /// Returns [`BGZFError::Other`] when the index has no entries, and any
    /// error raised while seeking or reading.
    pub fn new(mut reader: BGZFReader<R>, index: BGZFIndex) -> Result<Self, BGZFError> {
        let last_entry = *index
            .entries
            .last()
            .ok_or(BGZFError::Other("Invalid index file"))?;
        reader.bgzf_seek(last_entry.compressed_offset << 16)?;
        let mut buf = Vec::new();
        reader.read_to_end(&mut buf)?;
        reader.bgzf_seek(0)?;

        Ok(IndexedBGZFReader {
            reader,
            index,
            current_pos: 0,
            end_pos: last_entry.uncompressed_offset
                + TryInto::<u64>::try_into(buf.len()).unwrap(),
        })
    }
}

impl IndexedBGZFReader<std::fs::File> {
    /// Create new [`IndexedBGZFReader`] from file path.
    ///
    /// The index is loaded from the file whose name is `path` with `.gzi`
    /// appended (for example `data.bed.gz` and `data.bed.gz.gzi`).
    pub fn from_path<P: AsRef<std::path::Path>>(path: P) -> Result<Self, BGZFError> {
        let path = path.as_ref();
        let reader = BGZFReader::new(std::fs::File::open(path)?)?;

        // Append the suffix to the raw OS string: unlike `with_extension` this
        // keeps the existing extension, and it needs no UTF-8 conversion.
        let mut index_path = path.as_os_str().to_os_string();
        index_path.push(".gzi");
        // The index is parsed 8 bytes at a time, so buffer the file.
        let index = BGZFIndex::from_reader(io::BufReader::new(std::fs::File::open(index_path)?))?;

        IndexedBGZFReader::new(reader, index)
    }
}

impl<R: Read + Seek> Seek for IndexedBGZFReader<R> {
    /// Seek to a position in the *uncompressed* data.
    ///
    /// Returns an [`io::ErrorKind::InvalidInput`] error when the target
    /// position is negative or overflows.
    fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
        // Apply a signed delta to an unsigned base without panicking.
        fn offset_from(base: u64, delta: i64) -> Option<u64> {
            if delta >= 0 {
                base.checked_add(delta as u64)
            } else {
                base.checked_sub(delta.unsigned_abs())
            }
        }

        let new_pos = match pos {
            io::SeekFrom::Start(p) => Some(p),
            io::SeekFrom::Current(delta) => offset_from(self.current_pos, delta),
            io::SeekFrom::End(delta) => offset_from(self.end_pos, delta),
        }
        .ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                "invalid seek to a negative or overflowing position",
            )
        })?;

        let bgzf_pos = self
            .index
            .uncompressed_pos_to_bgzf_pos(new_pos)
            .map_err(BGZFError::into_io_error)?;
        self.reader
            .bgzf_seek(bgzf_pos)
            .map_err(BGZFError::into_io_error)?;
        // Keep the logical position in sync; relative seeks depend on it.
        self.current_pos = new_pos;
        Ok(new_pos)
    }
}

impl<R: Read + Seek> BufRead for IndexedBGZFReader<R> {
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        self.reader.fill_buf()
    }

    fn consume(&mut self, amt: usize) {
        self.reader.consume(amt);
        self.current_pos += TryInto::<u64>::try_into(amt).unwrap();
    }
}

impl<R: Read + Seek> Read for IndexedBGZFReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let s = self.reader.read(buf)?;
        self.current_pos += TryInto::<u64>::try_into(s).unwrap();
        Ok(s)
    }
}

#[cfg(test)]
mod test {
    use flate2::Crc;
use crate::BGZFWriter; use super::*; use rand::prelude::*; use std::fs::{self, File}; #[test] fn test_load_block() -> Result<(), BGZFError> { let mut crc = Crc::new(); let mut expected_reader = io::BufReader::new(flate2::read::MultiGzDecoder::new( File::open("testfiles/common_all_20180418_half.vcf.gz")?, )); let mut buf = [0u8; 1024 * 100]; loop { let read_bytes = expected_reader.read(&mut buf[..])?; if read_bytes == 0 { break; } crc.update(&buf[0..read_bytes]); } let original_crc = crc.sum(); let mut reader = io::BufReader::new(File::open("testfiles/common_all_20180418_half.vcf.gz")?); let mut block_data = Vec::new(); let mut data_crc = Crc::new(); let mut decompress = super::Decompress::new(); let mut decompressed_data = Vec::with_capacity(crate::write::MAXIMUM_COMPRESS_UNIT_SIZE); loop { load_block(&mut reader, &mut block_data)?; if block_data == &[3, 0, 0, 0, 0, 0, 0, 0, 0, 0] { break; } decompressed_data.clear(); decompress_block(&mut decompressed_data, &block_data, &mut decompress)?; data_crc.update(&decompressed_data); } assert_eq!(original_crc, data_crc.sum()); Ok(()) } #[test] fn test_read() -> Result<(), BGZFError> { let mut expected_reader = io::BufReader::new(flate2::read::MultiGzDecoder::new( File::open("testfiles/common_all_20180418_half.vcf.gz")?, )); let mut reader = BGZFReader::new(File::open("testfiles/common_all_20180418_half.vcf.gz")?)?; let mut line1 = String::new(); let mut line2 = String::new(); for _ in 0..1000 { line1.clear(); line2.clear(); reader.read_line(&mut line1)?; expected_reader.read_line(&mut line2)?; assert_eq!(line1, line2); //println!("line: {}", line); } for _ in 0..1000 { let mut buf1: [u8; 1000] = [0; 1000]; let mut buf2: [u8; 1000] = [0; 1000]; reader.read_exact(&mut buf1)?; expected_reader.read_exact(&mut buf2)?; //assert_eq!(read_len1, buf1.len()); assert_eq!(&buf1[..], &buf2[..]); } let mut buffer = [0; 30]; reader.bgzf_seek(0)?; assert_eq!(reader.bgzf_pos(), 0); reader.bgzf_seek(35973)?; assert_eq!(reader.bgzf_pos(), 
35973); reader.read_exact(&mut buffer)?; assert!( buffer.starts_with(b"1\t4008153"), "{}", String::from_utf8_lossy(&buffer) ); //reader.bgzf_seek(reader.cache.get(&0).unwrap().next_block_position() << 16)?; reader.bgzf_seek(4210818610)?; assert_eq!(reader.bgzf_pos(), 4210818610); reader.read_exact(&mut buffer)?; assert!(buffer.starts_with(b"1\t72700625")); //eprintln!("data: {}", String::from_utf8_lossy(&buffer)); reader.bgzf_seek(9618658636)?; assert_eq!(reader.bgzf_pos(), 9618658636); reader.read_exact(&mut buffer)?; assert!(buffer.starts_with(b"1\t")); reader.bgzf_seek(135183301012)?; assert_eq!(reader.bgzf_pos(), 135183301012); reader.read_exact(&mut buffer)?; assert!(buffer.starts_with(b"11\t")); let mut tmp_buf = vec![0u8; 391474]; reader.bgzf_seek(0)?; reader.read_exact(&mut tmp_buf)?; //eprintln!("data: {}", String::from_utf8_lossy(&buffer)); assert_eq!(reader.bgzf_pos(), 4210818610); reader.read_exact(&mut buffer)?; assert!( buffer.starts_with(b"1\t72700625"), "{}", String::from_utf8_lossy(&buffer) ); Ok(()) } #[test] fn test_read_all() -> anyhow::Result<()> { let mut expected_data_reader = flate2::read::MultiGzDecoder::new(File::open("testfiles/generated.bed.gz")?); let mut expected_data = Vec::new(); expected_data_reader.read_to_end(&mut expected_data)?; let mut data_reader = crate::BGZFReader::new(File::open("testfiles/generated.bed.gz")?)?; let mut data = Vec::new(); data_reader.read_to_end(&mut data)?; assert_eq!(data, expected_data); Ok(()) } #[test] fn test_indexed_reader() -> anyhow::Result<()> { let mut data_reader = std::io::BufReader::new(flate2::read::MultiGzDecoder::new( fs::File::open("testfiles/generated.bed.gz")?, )); let mut line = String::new(); let mut line_list = Vec::new(); let mut writer = BGZFWriter::new( fs::File::create("tmp/test-indexed-reader.bed.gz")?, Compression::default(), ); let mut total_len = 0; loop { let bgzf_pos = writer.bgzf_pos(); let uncompressed_pos = writer.pos(); line.clear(); let size = data_reader.read_line(&mut 
line)?; if size == 0 { break; } writer.write_all(&line.as_bytes())?; total_len += line.as_bytes().len(); line_list.push((bgzf_pos, uncompressed_pos, line.clone())); } let index = writer.close()?.unwrap(); let mut rand = rand_pcg::Pcg64Mcg::seed_from_u64(0x9387402456157523); let mut reader = IndexedBGZFReader::new( BGZFReader::new(fs::File::open("tmp/test-indexed-reader.bed.gz")?)?, index, )?; line.clear(); reader.read_line(&mut line)?; assert_eq!(line, line_list[0].2); for _ in 0..300 { let i = rand.gen_range(0..line_list.len()); reader.seek(std::io::SeekFrom::Start(line_list[i].1))?; line.clear(); reader.read_line(&mut line)?; assert_eq!(line, line_list[i].2); } assert_eq!(TryInto::::try_into(total_len).unwrap(), reader.end_pos); Ok(()) } } bgzip-0.3.1/src/read/thread.rs000064400000000000000000000226620072674642500143170ustar 00000000000000use std::collections::HashMap; use std::io::{BufRead, Read}; use std::sync::mpsc::{channel, Receiver, Sender}; use crate::deflate::*; use crate::BGZFError; const EOF_BLOCK: [u8; 10] = [3, 0, 0, 0, 0, 0, 0, 0, 0, 0]; const DEFAULT_PROCESS_BLOCK_NUM: usize = 50; struct ReadBlock { index: u64, decompressed_data: Vec, compressed_data: Vec>, decompress: Decompress, } impl ReadBlock { pub fn new(process_block_num: usize) -> Self { let decompress = Decompress::new(); ReadBlock { index: 0, decompressed_data: Vec::with_capacity(crate::write::MAXIMUM_COMPRESS_UNIT_SIZE), compressed_data: vec![ Vec::with_capacity(crate::write::MAXIMUM_COMPRESS_UNIT_SIZE); process_block_num ], decompress, } } } /// A Multi-thread BGZF writer. /// /// [rayon](https://crates.io/crates/rayon) is used to run decompression in a thread pool. 
pub struct BGZFMultiThreadReader { reader: R, block_list: Vec, current_read_pos: usize, current_read_buffer: Option, read_waiting_blocks: HashMap, reader_receiver: Receiver>, reader_sender: Sender>, next_read_index: u64, next_decompress_index: u64, eof_read_index: u64, } impl BGZFMultiThreadReader { /// Create new [`BGZFMultiThreadReader`] from `reader` pub fn new(reader: R) -> Result { Self::with_process_block_num(reader, DEFAULT_PROCESS_BLOCK_NUM) } /// Create new [`BGZFMultiThreadReader`] from `reader` and `process_block_num`. /// /// `process_block_num` is the number blocks to dispatch a new thread. /// Default value is 50. If you have fast CPU, larger value can be improve efficiency. pub fn with_process_block_num(reader: R, process_block_num: usize) -> Result { let (tx, rx) = channel(); let mut reader = BGZFMultiThreadReader { reader, block_list: (0..(rayon::current_num_threads() * 2)) .map(|_| ReadBlock::new(process_block_num)) .collect(), current_read_pos: 0, current_read_buffer: None, read_waiting_blocks: HashMap::new(), reader_receiver: rx, reader_sender: tx, next_read_index: 0, next_decompress_index: 0, eof_read_index: u64::MAX, }; reader.dispatch_read_thread()?; Ok(reader) } fn dispatch_read_thread(&mut self) -> Result<(), BGZFError> { while !self.block_list.is_empty() && self.next_decompress_index < self.eof_read_index { let mut block = self.block_list.pop().unwrap(); block.index = self.next_decompress_index; self.next_decompress_index += 1; let mut last_index = 0; for i in 0..block.compressed_data.len() { //eprintln!("load block {}", i); super::load_block( &mut self.reader, &mut block.compressed_data.get_mut(i).unwrap(), ) .map_err(|e| -> std::io::Error { // eprintln!("load block error: {}", e); e.into() })?; last_index = i; if block.compressed_data.get(i).unwrap() == &EOF_BLOCK { //self.block_list.clear(); // eprintln!("EOF reach: {}", block.index); self.eof_read_index = self.next_decompress_index; break; } } if last_index != 
block.compressed_data.len() - 1 { block .compressed_data .drain(last_index..block.compressed_data.len()); } let sender = self.reader_sender.clone(); // eprintln!("spawn: {}", block.index); rayon::spawn(move || { let _i = block.index; block.decompressed_data.clear(); for one_compress_data in &block.compressed_data { match super::decompress_block( &mut block.decompressed_data, &one_compress_data, &mut block.decompress, ) { Ok(_) => (), Err(e) => { //eprintln!("send Error: {}", e); sender.send(Err(e)).expect("reader send error 2") } } } sender.send(Ok(block)).expect("reader send error 1"); // eprintln!("done: {}", i); }); } Ok(()) } } impl BufRead for BGZFMultiThreadReader { fn consume(&mut self, amt: usize) { self.current_read_pos += amt; } fn fill_buf(&mut self) -> std::io::Result<&[u8]> { // eprintln!( // "fill buf start: {} {} {} {}", // self.current_read_pos, // self.next_read_index, // self.current_read_buffer // .as_ref() // .map(|x| x.index) // .unwrap_or(10000000000), // self.eof_read_index // ); //eprintln!("fill buf 1"); if let Some(b) = self.current_read_buffer.as_ref() { if b.decompressed_data.len() <= self.current_read_pos { std::mem::drop(b); self.block_list .push(self.current_read_buffer.take().unwrap()); } } //eprintln!("fill buf 2"); if self.next_read_index > self.eof_read_index { //eprintln!("EOF 0 bytes fill"); return Ok(&[]); } //eprintln!("fill buf 3"); self.dispatch_read_thread() .map_err(|e| Into::::into(e))?; //eprintln!("fill buf 4"); if self.current_read_buffer.is_none() { if self.next_read_index >= self.eof_read_index { //eprintln!("EOF 0 bytes fill"); return Ok(&[]); } while !self.read_waiting_blocks.contains_key(&self.next_read_index) { let block = self .reader_receiver .recv() .expect("reader receive error") .map_err(|e| -> std::io::Error { e.into() })?; // eprintln!("fetch: {}", block.index); self.read_waiting_blocks.insert(block.index, block); } self.current_read_buffer = self.read_waiting_blocks.remove(&self.next_read_index); // 
eprintln!("read: {}", self.next_read_index); self.current_read_pos = 0; self.next_read_index += 1; } // eprintln!( // "fill buf end {} {}/{}", // self.current_read_buffer.as_ref().unwrap().index, // self.current_read_pos, // self.current_read_buffer // .as_ref() // .unwrap() // .decompressed_data // .len() // ); Ok(&self.current_read_buffer.as_ref().unwrap().decompressed_data[self.current_read_pos..]) } } impl Read for BGZFMultiThreadReader { fn read(&mut self, buf: &mut [u8]) -> std::io::Result { //eprintln!("read start: {}", buf.len()); let internal_buf = self.fill_buf()?; let bytes_to_copy = buf.len().min(internal_buf.len()); buf[0..bytes_to_copy].copy_from_slice(&internal_buf[0..bytes_to_copy]); self.consume(bytes_to_copy); //eprintln!("read end: {}", bytes_to_copy); Ok(bytes_to_copy) } } #[cfg(test)] mod test { use super::*; #[test] fn test_thread_read() -> anyhow::Result<()> { let mut expected_reader = flate2::read::MultiGzDecoder::new(std::fs::File::open( "testfiles/common_all_20180418_half.vcf.gz", )?); let mut expected_buf = Vec::new(); expected_reader.read_to_end(&mut expected_buf)?; // normal read let mut reader = BGZFMultiThreadReader::new(std::fs::File::open( "testfiles/common_all_20180418_half.vcf.gz", )?)?; let mut read_buf = Vec::new(); reader.read_to_end(&mut read_buf)?; assert_eq!(expected_buf.len(), read_buf.len()); assert_eq!(expected_buf, read_buf); // with single block let mut reader = BGZFMultiThreadReader::with_process_block_num( std::io::BufReader::new(std::fs::File::open( "testfiles/common_all_20180418_half.vcf.gz", )?), 1, )?; let mut read_buf = Vec::new(); reader.read_to_end(&mut read_buf)?; assert_eq!(expected_buf.len(), read_buf.len()); assert_eq!(expected_buf, read_buf); // read 100 bytes per loop let mut reader = BGZFMultiThreadReader::new(std::fs::File::open( "testfiles/common_all_20180418_half.vcf.gz", )?)?; let mut read_buf = Vec::new(); loop { let mut small_buf = [0; 100]; let read_bytes = reader.read(&mut small_buf)?; if 
read_bytes == 0 { break; } read_buf.extend_from_slice(&small_buf[..read_bytes]); } assert_eq!(expected_buf.len(), read_buf.len()); Ok(()) } } bgzip-0.3.1/src/tabix.rs000064400000000000000000000167000072674642500132400ustar 00000000000000use crate::*; use std::collections::HashMap; use std::convert::TryInto; use std::i32; use std::io::{self, Read}; #[derive(Debug, Clone, PartialEq)] pub struct TabixChunk { pub begin: u64, pub end: u64, } impl TabixChunk { fn from_reader(reader: &mut R) -> io::Result { let begin = reader.read_le_u64()?; let end = reader.read_le_u64()?; Ok(TabixChunk { begin, end }) } } #[derive(Debug, Clone, PartialEq)] pub struct TabixBin { pub bin: u32, pub number_of_chunk: i32, pub chunks: Vec, } impl TabixBin { fn from_reader(reader: &mut R) -> io::Result { let bin = reader.read_le_u32()?; let number_of_chunk = reader.read_le_i32()?; let mut chunks = Vec::new(); for _ in 0..number_of_chunk { chunks.push(TabixChunk::from_reader(reader)?); } Ok(TabixBin { bin, number_of_chunk, chunks, }) } } #[derive(Debug, Clone, PartialEq)] pub struct TabixSequence { pub number_of_distinct_bin: i32, pub bins: HashMap, pub number_of_intervals: i32, pub intervals: Vec, } impl TabixSequence { fn from_reader(reader: &mut R) -> io::Result { let number_of_distinct_bin = reader.read_le_i32()?; let mut bins = HashMap::new(); for _ in 0..number_of_distinct_bin { let one_bin = TabixBin::from_reader(reader)?; bins.insert(one_bin.bin, one_bin); } let number_of_intervals = reader.read_le_i32()?; let mut intervals = Vec::new(); for _ in 0..number_of_intervals { intervals.push(reader.read_le_u64()?); } Ok(TabixSequence { number_of_distinct_bin, bins, number_of_intervals, intervals, }) } } #[derive(Debug, Clone, PartialEq)] pub struct Tabix { pub number_of_references: i32, pub format: i32, pub column_for_sequence: i32, pub column_for_begin: i32, pub column_for_end: i32, pub meta: [u8; 4], pub skip: i32, pub length_of_concatenated_sequence_names: i32, pub names: Vec>, pub 
sequences: Vec, } impl Tabix { pub fn from_reader(reader: R) -> Result { let mut reader = io::BufReader::new(crate::read::BGZFReader::new(reader)?); let mut buf: [u8; 4] = [0, 0, 0, 0]; reader.read_exact(&mut buf)?; if buf != [b'T', b'B', b'I', 1] { return Err(BGZFError::Other("Not Tabix format")); } let number_of_references = reader.read_le_i32()?; let format = reader.read_le_i32()?; let column_for_sequence = reader.read_le_i32()?; let column_for_begin = reader.read_le_i32()?; let column_for_end = reader.read_le_i32()?; reader.read_exact(&mut buf)?; let meta = buf; let skip = reader.read_le_i32()?; let length_of_concatenated_sequence_names = reader.read_le_i32()?; let mut name_bytes: Vec = vec![0; length_of_concatenated_sequence_names.try_into().unwrap()]; reader.read_exact(&mut name_bytes)?; let names = split_names(&name_bytes); let mut sequences = Vec::new(); for _ in 0..number_of_references { sequences.push(TabixSequence::from_reader(&mut reader)?); } Ok(Tabix { number_of_references, format, column_for_sequence, column_for_begin, column_for_end, meta, skip, length_of_concatenated_sequence_names, names, sequences, }) } } fn split_names(data: &[u8]) -> Vec> { let mut reader = io::BufReader::new(data); let mut result = Vec::new(); loop { let mut buf = Vec::new(); let l = reader.read_until(0, &mut buf).unwrap(); if l == 0 { break; } result.push(buf); } result } const MIN_SHIFT: u32 = 14; const DEPTH: u32 = 5; /// calculate the list of bins that may overlap with region [beg,end) (zero-based) pub fn reg2bin(beg: u32, end: u32) -> u32 { crate::csi::reg2bin(beg.into(), end.into(), MIN_SHIFT, DEPTH) } /// calculate the list of bins that may overlap with region [beg,end) (zero-based) pub fn reg2bins(beg: u32, end: u32) -> Vec { crate::csi::reg2bins(beg.into(), end.into(), MIN_SHIFT, DEPTH) } #[cfg(test)] mod test { use anyhow::anyhow; use super::*; use std::fs::File; use std::str; #[test] fn test_tabix_read() -> anyhow::Result<()> { let mut reader = 
File::open("testfiles/common_all_20180418_half.vcf.gz.tbi")?; let tabix = Tabix::from_reader(&mut reader)?; //println!("{:?}", tabix); let mut chunks_writer = csv::Writer::from_path("tmp/sequence.csv")?; chunks_writer.write_record(&[ "sequence name", "bin index", "bin number", "chunk index", "chunk begin", "chunk end", ])?; for (i, one_seq) in tabix.sequences.iter().enumerate() { for (j, (_, one_bin)) in one_seq.bins.iter().enumerate() { for (k, one_chunk) in one_bin.chunks.iter().enumerate() { chunks_writer.write_record(&[ str::from_utf8(&tabix.names[i]).unwrap().to_string(), format!("{}", j), format!("{}", one_bin.bin), format!("{}", k), format!("{}", one_chunk.begin), format!("{}", one_chunk.end), ])?; } } } Ok(()) } #[test] fn test_bins() -> anyhow::Result<()> { let mut reader = csv::ReaderBuilder::new() .delimiter(b'\t') .quoting(false) .from_reader(flate2::read::MultiGzDecoder::new(File::open( "testfiles/bins.tsv.gz", )?)); for row in reader.records() { let row = row?; let start: u64 = row.get(1).ok_or_else(|| anyhow!("No Start"))?.parse()?; let end: u64 = row.get(2).ok_or_else(|| anyhow!("No End"))?.parse()?; let bin: u32 = row.get(3).ok_or_else(|| anyhow!("No Bin"))?.parse()?; let mut bins: Vec = row .get(4) .ok_or_else(|| anyhow!("No Bins"))? 
.split(',') .map(|x| x.parse().expect("Invalid bin")) .collect(); let calculated_bin = reg2bin( start.try_into().expect("Too large start"), end.try_into().expect("Too large end"), ); let mut calculated_bins = reg2bins( start.try_into().expect("Too large start"), end.try_into().expect("Too large end"), ); bins.sort(); calculated_bins.sort(); assert_eq!( bin, calculated_bin, "Start: {} / End: {} / Calculated bin: {} / Expected bin: {}", start, end, calculated_bin, bin, ); assert_eq!( bins, calculated_bins, "Start: {} / End: {} / Calculated bins: {:?} / Expected bins: {:?}", start, end, calculated_bins, bins, ); } Ok(()) } } bgzip-0.3.1/src/write/mod.rs000064400000000000000000000303620072674642500140420ustar 00000000000000//! BGZF writer #[cfg(feature = "rayon")] mod thread; #[cfg(feature = "rayon")] pub use thread::BGZFMultiThreadWriter; use crate::header::BGZFHeader; use crate::index::{BGZFIndex, BGZFIndexEntry}; use crate::{deflate::*, BGZFError}; use std::convert::TryInto; use std::io::{self, Write}; // #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] // pub struct BGZFWritePos { // block_index: u64, // wrote_bytes: u64, // position_in_block: u64, // block_position: Option, // } /// A BGZF writer pub struct BGZFWriter { writer: W, original_data: Vec, compressed_buffer: Vec, compress: Compress, compress_unit_size: usize, closed: bool, current_compressed_pos: u64, current_uncompressed_pos: u64, bgzf_index: Option, } /// Default BGZF compress unit size pub const DEFAULT_COMPRESS_UNIT_SIZE: usize = 65280; /// Maximum BGZF compress unit size pub const MAXIMUM_COMPRESS_UNIT_SIZE: usize = 64 * 1024; pub(crate) const EXTRA_COMPRESS_BUFFER_SIZE: usize = 200; impl BGZFWriter { /// Create new BGZF writer from [`std::io::Write`] pub fn new(writer: W, level: Compression) -> Self { Self::with_compress_unit_size(writer, level, DEFAULT_COMPRESS_UNIT_SIZE, true) .expect("Unreachable (BGZFWriter)") } /// Cerate new BGZF writer with compress unit size. 
///
    /// Default value of compress unit size is 65280.
    ///
    /// When `create_index` is `true`, an index is collected while writing and
    /// returned by `close`.
    ///
    /// # Errors
    ///
    /// Returns `BGZFError::TooLargeCompressUnit` when `compress_unit_size` is
    /// `MAXIMUM_COMPRESS_UNIT_SIZE` or larger, and `BGZFError::Other` when it
    /// is zero.
    pub fn with_compress_unit_size(
        writer: W,
        level: Compression,
        compress_unit_size: usize,
        create_index: bool,
    ) -> Result<Self, BGZFError> {
        if compress_unit_size >= crate::write::MAXIMUM_COMPRESS_UNIT_SIZE {
            return Err(BGZFError::TooLargeCompressUnit);
        }
        // With a unit size of zero `write` could never buffer a byte and
        // would report success while discarding the data.
        if compress_unit_size == 0 {
            return Err(BGZFError::Other(
                "Compress unit size must be greater than zero",
            ));
        }

        Ok(BGZFWriter {
            writer,
            original_data: Vec::with_capacity(compress_unit_size),
            compressed_buffer: Vec::with_capacity(compress_unit_size + EXTRA_COMPRESS_BUFFER_SIZE),
            compress_unit_size,
            compress: Compress::new(level),
            closed: false,
            current_uncompressed_pos: 0,
            current_compressed_pos: 0,
            bgzf_index: if create_index {
                Some(BGZFIndex::new())
            } else {
                None
            },
        })
    }

    /// Get BGZF virtual file offset. This position is not equal to real file offset,
    /// but equal to virtual file offset described in [BGZF format](https://samtools.github.io/hts-specs/SAMv1.pdf).
    /// Please read "4.1.1 Random access" to learn more.
    pub fn bgzf_pos(&self) -> u64 {
        // Upper bits: compressed offset of the current block; lower 16 bits:
        // offset inside the block's uncompressed data.
        self.current_compressed_pos << 16 | (self.original_data.len() & 0xffff) as u64
    }

    /// Current write position.
    ///
    /// This is the number of uncompressed bytes accepted so far, including
    /// bytes that are still buffered.
    pub fn pos(&self) -> u64 {
        self.current_uncompressed_pos
            + TryInto::<u64>::try_into(self.original_data.len()).unwrap()
    }

    // Compress the buffered data into one BGZF block, write it out and record
    // the new offsets. The caller is responsible for clearing `original_data`.
    fn write_block(&mut self) -> io::Result<()> {
        self.compressed_buffer.clear();
        write_block(
            &mut self.compressed_buffer,
            &self.original_data,
            &mut self.compress,
        )
        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
        self.writer.write_all(&self.compressed_buffer)?;

        self.current_uncompressed_pos +=
            TryInto::<u64>::try_into(self.original_data.len()).unwrap();
        self.current_compressed_pos +=
            TryInto::<u64>::try_into(self.compressed_buffer.len()).unwrap();

        // Each index entry records where the *next* block starts.
        if let Some(index) = self.bgzf_index.as_mut() {
            index.entries.push(BGZFIndexEntry {
                compressed_offset: self.current_compressed_pos,
                uncompressed_offset: self.current_uncompressed_pos,
            });
        }

        Ok(())
    }

    /// Write end-of-file marker and close BGZF.
    ///
    /// An explicit call of this method is not required unless you need the .gzi index.
/// Drop trait will write end-of-file marker automatically. /// If you need to handle I/O errors while closing, please use this method. pub fn close(mut self) -> io::Result> { if !self.closed { self.flush()?; self.writer.write_all(&crate::EOF_MARKER)?; self.closed = true; } if let Some(index) = self.bgzf_index.as_mut() { index.entries.pop(); } Ok(self.bgzf_index.take()) } } impl io::Write for BGZFWriter { fn write(&mut self, buf: &[u8]) -> io::Result { let mut process_start_pos = 0; loop { //eprintln!("process start pos: {}", process_start_pos); let to_write_bytes = (buf.len() - process_start_pos) .min(self.compress_unit_size - self.original_data.len()); if to_write_bytes == 0 { break; } self.original_data .extend_from_slice(&buf[process_start_pos..(process_start_pos + to_write_bytes)]); if self.original_data.len() >= self.compress_unit_size { self.write_block()?; self.original_data.clear(); } process_start_pos += to_write_bytes; } Ok(buf.len()) } fn flush(&mut self) -> io::Result<()> { if !self.original_data.is_empty() { self.write_block()?; } Ok(()) } } impl Drop for BGZFWriter { fn drop(&mut self) { if !self.closed { self.flush().unwrap(); self.writer.write_all(&crate::EOF_MARKER).unwrap(); self.closed = true; } } } const FOOTER_SIZE: usize = 8; /// Write single BGZF block to writer. /// /// This function is useful when writing your own parallelized BGZF writer. 
pub fn write_block( compressed_data: &mut Vec, original_data: &[u8], compress: &mut Compress, ) -> Result { //eprintln!("write block : {} ", original_data.len()); let original_compressed_data_size = compressed_data.len(); let mut header = BGZFHeader::new(false, 0, 0); let header_size: usize = header.header_size().try_into().unwrap(); compressed_data.resize( original_compressed_data_size + original_data.len() + EXTRA_COMPRESS_BUFFER_SIZE + header_size + FOOTER_SIZE, 0, ); let compressed_len = compress.compress( original_data, &mut compressed_data[(original_compressed_data_size + header_size)..], )?; compressed_data.truncate(original_compressed_data_size + header_size + compressed_len); let mut crc = Crc::new(); crc.update(original_data); compressed_data.extend_from_slice(&crc.sum().to_le_bytes()); compressed_data.extend_from_slice(&(original_data.len() as u32).to_le_bytes()); let block_size = compressed_data.len() - original_compressed_data_size; //eprintln!("block size: {} / {}", block_size, original_data.len()); header .update_block_size(block_size.try_into().unwrap()) .expect("Unreachable"); header .write( &mut compressed_data [original_compressed_data_size..(header_size + original_compressed_data_size)], ) .expect("Failed to write header"); Ok(block_size) } #[cfg(test)] mod test { use crate::BGZFReader; use crate::{deflate::Compression, BinaryReader}; use rand::prelude::*; use super::*; use std::fs::{self, File}; use std::io::{BufRead, BufReader, Read, Write}; #[test] fn test_vcf() -> anyhow::Result<()> { let mut data = Vec::new(); let mut reader = flate2::read::MultiGzDecoder::new(fs::File::open( "testfiles/common_all_20180418_half.vcf.gz", )?); reader.read_to_end(&mut data)?; let output_path = "tmp/test.vcf.gz"; let mut writer = BGZFWriter::new(fs::File::create(output_path)?, Compression::default()); writer.write_all(&data)?; writer .close()? 
.unwrap() .write(fs::File::create(format!("{}.gzi", output_path))?)?; let mut reader = flate2::read::MultiGzDecoder::new(fs::File::open(output_path)?); let mut wrote_data = Vec::new(); reader.read_to_end(&mut wrote_data)?; assert_eq!(wrote_data.len(), data.len()); Ok(()) } #[test] fn test_simple() -> anyhow::Result<()> { let output_path = "tmp/simple1.txt.gz"; let mut writer = BGZFWriter::new(fs::File::create(output_path)?, Compression::default()); writer.write_all(b"1234")?; std::mem::drop(writer); let mut reader = flate2::read::MultiGzDecoder::new(std::fs::File::open(output_path)?); let mut data = Vec::new(); reader.read_to_end(&mut data)?; assert_eq!(data, b"1234"); Ok(()) } #[test] fn test_write_bed() -> anyhow::Result<()> { const TEST_OUTPUT_PATH: &str = "tmp/test.bed.gz"; let mut writer = BGZFWriter::new(fs::File::create(TEST_OUTPUT_PATH)?, Compression::default()); let mut all_data = Vec::new(); let mut data_reader = flate2::read::MultiGzDecoder::new(fs::File::open("testfiles/generated.bed.gz")?); data_reader.read_to_end(&mut all_data)?; writer.write_all(&all_data)?; std::mem::drop(data_reader); std::mem::drop(writer); let mut result_data = Vec::new(); let mut result_reader = flate2::read::MultiGzDecoder::new(BufReader::new(File::open(TEST_OUTPUT_PATH)?)); result_reader.read_to_end(&mut result_data)?; assert_eq!(result_data, all_data); let mut result_reader = BufReader::new(File::open(TEST_OUTPUT_PATH)?); let mut decompress = flate2::Decompress::new(false); loop { let header = crate::header::BGZFHeader::from_reader(&mut result_reader)?; assert_eq!(header.comment, None); assert_eq!(header.file_name, None); assert_eq!(header.modified_time, 0); let block_size = header.block_size()?; let compressed_data_len = block_size as i64 - 20 - 6; let mut compressed_data = vec![0u8; compressed_data_len as usize]; result_reader.read_exact(&mut compressed_data)?; let crc32 = result_reader.read_le_u32()?; let uncompressed_data_len = result_reader.read_le_u32()?; if 
uncompressed_data_len == 0 { break; } let mut decompressed_data = vec![0u8; (uncompressed_data_len) as usize]; decompress.reset(false); assert_eq!( decompress.decompress( &compressed_data, &mut decompressed_data, flate2::FlushDecompress::None, )?, flate2::Status::StreamEnd ); assert_eq!(decompressed_data.len(), uncompressed_data_len as usize); let mut crc = flate2::Crc::new(); crc.update(&decompressed_data); assert_eq!(crc.sum(), crc32); } let mut buf = vec![0u8; 100]; assert_eq!(result_reader.read(&mut buf)?, 0); Ok(()) } #[test] fn test_bgzf_pos() -> anyhow::Result<()> { let mut data_reader = std::io::BufReader::new(flate2::read::MultiGzDecoder::new( fs::File::open("testfiles/generated.bed.gz")?, )); let mut line = String::new(); let mut line_list = Vec::new(); let mut writer = BGZFWriter::new( fs::File::create("tmp/write-pos.bed.gz")?, Compression::default(), ); loop { let pos = writer.bgzf_pos(); line.clear(); let size = data_reader.read_line(&mut line)?; if size == 0 { break; } writer.write_all(&line.as_bytes())?; line_list.push((pos, line.clone())); } writer.close()?; let mut rand = rand_pcg::Pcg64Mcg::seed_from_u64(0x9387402456157523); let mut reader = BGZFReader::new(fs::File::open("tmp/write-pos.bed.gz")?)?; for _ in 0..300 { let i = rand.gen_range(0..line_list.len()); reader.bgzf_seek(line_list[i].0)?; line.clear(); reader.read_line(&mut line)?; assert_eq!(line, line_list[i].1); } Ok(()) } } bgzip-0.3.1/src/write/thread.rs000064400000000000000000000262640072674642500145400ustar 00000000000000use crate::index::BGZFIndexEntry; use crate::{deflate::*, index::BGZFIndex, BGZFError}; use std::collections::HashMap; use std::convert::TryInto; use std::io::{self, Error, ErrorKind, Write}; use std::sync::mpsc::{channel, Receiver, Sender}; const DEFAULT_WRITE_BLOCK_UNIT_NUM: usize = 50; #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] struct BlockSize { uncompressed_size: usize, compressed_size: usize, } struct WriteBlock { index: u64, compress: Compress, 
compressed_buffer: Vec, raw_buffer: Vec, block_sizes: Vec, } impl WriteBlock { fn new(level: Compression, compress_unit_size: usize, write_block_num: usize) -> Self { let compress = Compress::new(level); WriteBlock { index: 0, compress, compressed_buffer: Vec::with_capacity( (compress_unit_size + crate::write::EXTRA_COMPRESS_BUFFER_SIZE) * write_block_num, ), raw_buffer: Vec::with_capacity(compress_unit_size * write_block_num), block_sizes: Vec::new(), } } fn reset(&mut self) { self.index = 0; self.compressed_buffer.clear(); self.raw_buffer.clear(); self.block_sizes.clear(); } } /// A Multi-thread BGZF writer /// /// [rayon](https://crates.io/crates/rayon) is used to run compression in a thread pool. pub struct BGZFMultiThreadWriter { writer: W, compress_unit_size: usize, write_block_num: usize, block_list: Vec, write_waiting_blocks: HashMap, writer_receiver: Receiver, writer_sender: Sender, next_write_index: u64, next_compress_index: u64, closed: bool, current_compressed_pos: u64, current_uncompressed_pos: u64, bgzf_index: Option, } impl BGZFMultiThreadWriter { /// Create new [`BGZFMultiThreadWriter`] from [`std::io::Read`] and [`Compression`] pub fn new(writer: W, level: Compression) -> Self { Self::with_compress_unit_size( writer, crate::write::DEFAULT_COMPRESS_UNIT_SIZE, DEFAULT_WRITE_BLOCK_UNIT_NUM, level, true, ) .expect("Unreachable (BGZFMultiThreadWriter)") } pub fn with_compress_unit_size( writer: W, compress_unit_size: usize, write_block_num: usize, level: Compression, create_index: bool, ) -> Result { if compress_unit_size >= crate::write::MAXIMUM_COMPRESS_UNIT_SIZE { return Err(BGZFError::TooLargeCompressUnit); } let (tx, rx) = channel(); Ok(BGZFMultiThreadWriter { writer, compress_unit_size, write_block_num, block_list: (0..(rayon::current_num_threads() * 2)) .map(|_| WriteBlock::new(level, compress_unit_size, write_block_num)) .collect(), write_waiting_blocks: HashMap::new(), writer_receiver: rx, writer_sender: tx, next_write_index: 0, 
next_compress_index: 0, closed: false, current_uncompressed_pos: 0, current_compressed_pos: 0, bgzf_index: if create_index { Some(BGZFIndex::new()) } else { None }, }) } fn write_blocks(&mut self, mut next_data: WriteBlock) -> io::Result<()> { self.writer.write_all(&next_data.compressed_buffer)?; for one in &next_data.block_sizes { self.current_compressed_pos += TryInto::::try_into(one.compressed_size).unwrap(); self.current_uncompressed_pos += TryInto::::try_into(one.uncompressed_size).unwrap(); if let Some(index) = self.bgzf_index.as_mut() { index.entries.push(BGZFIndexEntry { compressed_offset: self.current_compressed_pos, uncompressed_offset: self.current_uncompressed_pos, }) } } self.next_write_index += 1; next_data.reset(); self.block_list.push(next_data); Ok(()) } fn process_buffer(&mut self, block: bool, block_all: bool) -> io::Result<()> { let mut current_block = block; while self.next_compress_index != self.next_write_index { let next_data = if current_block { self.writer_receiver .recv() .map_err(|_| Error::new(ErrorKind::Other, "Closed channel"))? 
} else { match self.writer_receiver.try_recv() { Ok(d) => d, Err(std::sync::mpsc::TryRecvError::Empty) => return Ok(()), Err(std::sync::mpsc::TryRecvError::Disconnected) => { return Err(Error::new(ErrorKind::Other, "Closed channel")) } } }; // eprintln!( // "fetch thread data: {} / {} / {}", // next_data.index, self.next_write_index, self.next_compress_index // ); if next_data.index == self.next_write_index { self.write_blocks(next_data)?; while let Some(next_data) = self.write_waiting_blocks.remove(&self.next_write_index) { //eprintln!("write block 2: {}", next_data.index); self.write_blocks(next_data)?; } current_block = block_all; } else { //eprintln!("Insert into waiting blocks: {}", next_data.index); self.write_waiting_blocks.insert(next_data.index, next_data); } } Ok(()) } fn dispatch_current_block(&mut self) { let mut block = self.block_list.remove(0); block.index = self.next_compress_index; self.next_compress_index += 1; let sender = self.writer_sender.clone(); // eprintln!("spawn thread: {}", block.index); let compress_unit_size = self.compress_unit_size; rayon::spawn_fifo(move || { // eprintln!("started thread: {}", block.index); block.compressed_buffer.clear(); let mut wrote_bytes = 0; while wrote_bytes < block.raw_buffer.len() { // eprintln!( // "write block: {} / {}, {}", // block.index, // wrote_bytes, // String::from_utf8_lossy(&block.raw_buffer[wrote_bytes..(wrote_bytes + 10)]) // ); let bytes_to_write = (block.raw_buffer.len() - wrote_bytes).min(compress_unit_size); let compressed_size = crate::write::write_block( &mut block.compressed_buffer, &block.raw_buffer[wrote_bytes..(wrote_bytes + bytes_to_write)], &mut block.compress, ) .expect("Failed to write block"); wrote_bytes += bytes_to_write; block.block_sizes.push(BlockSize { uncompressed_size: bytes_to_write, compressed_size, }); } //eprintln!("finished thread: {}", block.index); sender.send(block).expect("failed to send write result"); }); } /// Write end-of-file marker and close BGZF. 
/// /// Explicitly call of this method is not required unless you need .gzi index. /// Drop trait will write end-of-file marker automatically. /// If you need to handle I/O errors while closing, please use this method. pub fn close(mut self) -> io::Result> { self.flush()?; self.writer.write_all(&crate::EOF_MARKER)?; self.closed = true; if let Some(index) = self.bgzf_index.as_mut() { index.entries.pop(); } Ok(self.bgzf_index.take()) } } impl Write for BGZFMultiThreadWriter { fn write(&mut self, buf: &[u8]) -> io::Result { let mut wrote_bytes = 0; while wrote_bytes < buf.len() { self.process_buffer(self.block_list.is_empty(), false)?; let current_buffer = self.block_list.get_mut(0).unwrap(); let remain_buffer = (self.compress_unit_size * self.write_block_num) - current_buffer.raw_buffer.len(); let bytes_to_write = remain_buffer.min(buf.len() - wrote_bytes); current_buffer .raw_buffer .extend_from_slice(&buf[wrote_bytes..(wrote_bytes + bytes_to_write)]); if bytes_to_write == remain_buffer { self.dispatch_current_block(); } wrote_bytes += bytes_to_write; } Ok(wrote_bytes) } fn flush(&mut self) -> io::Result<()> { self.process_buffer(self.block_list.is_empty(), false)?; if self.block_list[0].raw_buffer.len() > 0 { self.dispatch_current_block(); } self.process_buffer(true, true)?; // eprintln!( // "flush: {}/{}/{}/{}", // self.next_compress_index, // self.next_write_index, // self.block_list.len(), // rayon::current_num_threads() // ); Ok(()) } } impl Drop for BGZFMultiThreadWriter { fn drop(&mut self) { if !self.closed { self.flush().expect("BGZF: Flash Error"); self.writer .write_all(&crate::EOF_MARKER) .expect("BGZF: Cannot write EOF marker"); } } } #[cfg(test)] mod test { use std::io::Read; use super::*; use rand::prelude::*; const WRITE_UNIT: usize = 2000; const BUF_SIZE: usize = 1000 * 1000 * 10; #[test] fn test_thread_writer() -> anyhow::Result<()> { let mut rand = rand_pcg::Pcg64Mcg::seed_from_u64(0x9387402456157523); let path = 
"./tmp/test_thread_writer.data.gz"; let write_file = std::io::BufWriter::new(std::fs::File::create(path)?); let mut writer = BGZFMultiThreadWriter::with_compress_unit_size( write_file, 1024, 30, Compression::best(), true, )?; let mut data = vec![0; BUF_SIZE]; rand.fill_bytes(&mut data); let mut wrote_bytes = 0; loop { let to_write_bytes = WRITE_UNIT.min(data.len() - wrote_bytes); if to_write_bytes == 0 { break; } wrote_bytes += writer.write(&mut data[wrote_bytes..(wrote_bytes + to_write_bytes)])?; } //eprintln!("wrote_bytes: {}/{}", i, wrote_bytes); writer .close()? .unwrap() .write(std::fs::File::create(format!("{}.gzi", path))?)?; let mut rand = rand_pcg::Pcg64Mcg::seed_from_u64(0x9387402456157523); let mut reader = flate2::read::MultiGzDecoder::new(std::fs::File::open(path)?); let mut read_data = vec![]; rand.fill_bytes(&mut data); reader.read_to_end(&mut read_data)?; assert_eq!(read_data.len(), data.len()); assert!(read_data == data, "unmatched"); //writer.flush()?; Ok(()) } }