needletail-0.5.1/CHANGELOG.md000064400000000000000000000037251046102023000135550ustar 00000000000000# Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] ## [0.4.1] - 2020-03-12 ## Added - `ParseErrorKind::EmptyFile` variant to handle cases where there are less than two bytes in a file [[#51][51]] ## [0.4.0] - 2020-07-08 ## Changed - Added `parse_fastx_file` which replaces `parse_sequence_reader` and offers an iterator like usage and is faster than 0.3. Also adds `parse_fastx_reader` and `parse_fastx_stdin`. - `SequenceRecord` now offers more information about the file such as line ending, which allows writing a file identical to the input one. ## [0.3.0] - 2019-09-12 ### Added - Improved error reporting (i.e., a parse failure now gives the record it failed on). - Significant code cleanup and additional linting (`cargo clippy`). - Significant additional test coverage, including via fuzzing. - Significant improvements to library documentation. ### Changed - The `.kmers` method has been simplified and a new `.canonical_kmers` method has been introduced with much of the original's functionality. - Added `parse_sequence_reader`, which replaces `fastx_stream` and `fastx_bytes`. - `fastx_cli` updated and renamed to `parse_sequence_path`. - `SeqRecord` is now `SequenceRecord` and many of its methods are now in the `Sequence` trait (e.g., working on byte slices). - Automatic decompression now takes `Read` instead of `Read + Seek` so we can handle e.g. gzip files piped in through `stdin`. - See [this link](https://github.com/onecodex/needletail/pull/26#issuecomment-530982670) for additional details on updating code to `v0.3.0`. 
### Removed - Single-file zip handling (zip requires `Seek`) 😞 ## [0.3.1] - 2019-09-18 ### Fixed - Needletail no longer runs out of memory when parsing large, compressed files. [51]: https://github.com/onecodex/needletail/issues/51needletail-0.5.1/Cargo.lock0000644000001330230000000000100111220ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "adler" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "aho-corasick" version = "0.7.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" dependencies = [ "memchr", ] [[package]] name = "anes" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstyle" version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23ea9e81bd02e310c216d080f6223c179012256e5151c41db88d12c88a1684d2" [[package]] name = "anyhow" version = "1.0.70" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4" [[package]] name = "approx" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" dependencies = [ "num-traits", ] [[package]] name = "assert_cmd" version = "2.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0b2340f55d9661d76793b2bfc2eb0e62689bd79d067a95707ea762afd5e9dd" dependencies = [ "anstyle", "bstr", "doc-comment", "predicates", "predicates-core", "predicates-tree", "wait-timeout", ] [[package]] name = "atty" version = "0.2.14" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ "hermit-abi 0.1.19", "libc", "winapi", ] [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bio" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e07645dcc036557a09cf078268b4666f0ce737bd797b0ee554de29ea81981ab2" dependencies = [ "anyhow", "approx", "bio-types", "bit-set", "bv", "bytecount", "csv", "custom_derive", "enum-map", "fxhash", "getset", "itertools", "itertools-num", "lazy_static", "multimap", "ndarray", "newtype_derive", "num-integer", "num-traits", "ordered-float", "petgraph", "rand", "regex", "serde", "serde_derive", "statrs", "strum", "strum_macros", "thiserror", "triple_accel", "vec_map", ] [[package]] name = "bio-types" version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfa990f40a28735fa598dc3dd58d73e62e6b41458959d623903b927ba7b04c80" dependencies = [ "derive-new", "lazy_static", "regex", "strum_macros", "thiserror", ] [[package]] name = "bit-set" version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" dependencies = [ "bit-vec", ] [[package]] name = "bit-vec" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" [[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bstr" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09" dependencies = [ "memchr", "once_cell", "regex-automata", "serde", ] [[package]] name = "buf_redux" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" dependencies = [ "memchr", "safemem", ] [[package]] name = "buffer-redux" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2886ea01509598caac116942abd33ab5a88fa32acdf7e4abfa0fc489ca520c9" dependencies = [ "memchr", "safemem", ] [[package]] name = "bumpalo" version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" [[package]] name = "bv" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8834bb1d8ee5dc048ee3124f2c7c1afcc6bc9aed03f11e9dfd8c69470a5db340" dependencies = [ "feature-probe", "serde", ] [[package]] name = "bytecount" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" [[package]] name = "bytemuck" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17febce684fd15d89027105661fec94afb475cb995fbc59d2865198446ba2eea" [[package]] name = "byteorder" version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bzip2" version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" dependencies = [ "bzip2-sys", "libc", ] [[package]] name = "bzip2-sys" version = "0.1.11+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" dependencies = [ "cc", "libc", "pkg-config", ] [[package]] name = "cast" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "ciborium" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f" dependencies = [ "ciborium-io", "ciborium-ll", "serde", ] [[package]] name = "ciborium-io" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369" [[package]] name = "ciborium-ll" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b" dependencies = [ "ciborium-io", "half", ] [[package]] name = "clap" version = "3.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5" dependencies = [ "bitflags", "clap_lex", "indexmap", "textwrap", ] [[package]] name = "clap_lex" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" dependencies = [ "os_str_bytes", ] [[package]] name = "crc32fast" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ "cfg-if", ] [[package]] name = "criterion" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" dependencies = [ "anes", "atty", "cast", "ciborium", "clap", "criterion-plot", "itertools", "lazy_static", "num-traits", "oorandom", "plotters", "rayon", "regex", "serde", "serde_derive", "serde_json", "tinytemplate", "walkdir", ] [[package]] name = "criterion-plot" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", "itertools", ] [[package]] name = "crossbeam" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" dependencies = [ "cfg-if", "crossbeam-channel", "crossbeam-deque", "crossbeam-epoch", "crossbeam-queue", "crossbeam-utils", ] [[package]] name = "crossbeam-channel" version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf2b3e8478797446514c91ef04bafcb59faba183e621ad488df88983cc14128c" dependencies = [ "cfg-if", "crossbeam-utils", ] [[package]] name = "crossbeam-deque" version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" version = "0.9.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", "memoffset", "scopeguard", ] [[package]] name = "crossbeam-queue" version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" dependencies = [ "cfg-if", "crossbeam-utils", ] [[package]] name = "crossbeam-utils" version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" dependencies = [ "cfg-if", ] [[package]] name = "csv" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b015497079b9a9d69c02ad25de6c0a6edef051ea6360a327d0bd05802ef64ad" dependencies = [ "csv-core", "itoa", "ryu", "serde", ] [[package]] name = "csv-core" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" dependencies = [ "memchr", ] [[package]] name = "custom_derive" version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" [[package]] name = "derive-new" version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535" dependencies = [ "proc-macro2", "quote", "syn 1.0.109", ] [[package]] name = "difflib" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" [[package]] name = "doc-comment" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" [[package]] name = "either" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" [[package]] name = "enum-map" version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e893a7ba6116821058dec84a6fb14fb2a97cd8ce5fd0f85d5a4e760ecd7329d9" dependencies = [ "enum-map-derive", ] [[package]] name = "enum-map-derive" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84278eae0af6e34ff6c1db44c11634a694aafac559ff3080e4db4e4ac35907aa" dependencies = [ "proc-macro2", "quote", "syn 1.0.109", ] [[package]] name = "errno" version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" dependencies = [ "errno-dragonfly", "libc", "winapi", ] [[package]] name = "errno-dragonfly" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" dependencies = [ "cc", "libc", ] [[package]] name = "escargot" version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f5584ba17d7ab26a8a7284f13e5bd196294dd2f2d79773cff29b9e9edef601a6" dependencies = [ "log", "once_cell", "serde", "serde_json", ] [[package]] name = "fastrand" version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" dependencies = [ "instant", ] [[package]] name = "feature-probe" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "835a3dc7d1ec9e75e2b5fb4ba75396837112d2060b03f7d43bc1897c7f7211da" [[package]] name = "fixedbitset" version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flate2" version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" dependencies = [ "crc32fast", "miniz_oxide", ] [[package]] name = "float-cmp" version = "0.9.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4" dependencies = [ "num-traits", ] [[package]] name = "fxhash" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" dependencies = [ "byteorder", ] [[package]] name = "getrandom" version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" dependencies = [ "cfg-if", "libc", "wasi", ] [[package]] name = "getset" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e45727250e75cc04ff2846a66397da8ef2b3db8e40e0cef4df67950a07621eb9" dependencies = [ "proc-macro-error", "proc-macro2", "quote", "syn 1.0.109", ] [[package]] name = "half" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" [[package]] name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" [[package]] name = "heck" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ "libc", ] [[package]] name = "hermit-abi" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" dependencies = [ "libc", ] [[package]] name = "hermit-abi" version = "0.3.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" [[package]] name = "indexmap" version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", "hashbrown", ] [[package]] name = "indoc" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306" [[package]] name = "instant" version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ "cfg-if", ] [[package]] name = "io-lifetimes" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09270fd4fa1111bc614ed2246c7ef56239a3063d5be0d1ec3b589c505d400aeb" dependencies = [ "hermit-abi 0.3.1", "libc", "windows-sys 0.45.0", ] [[package]] name = "itertools" version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" dependencies = [ "either", ] [[package]] name = "itertools-num" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a872a22f9e6f7521ca557660adb96dd830e54f0f490fa115bb55dd69d38b27e7" dependencies = [ "num-traits", ] [[package]] name = "itoa" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" [[package]] name = "js-sys" version = "0.3.61" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" dependencies = [ "wasm-bindgen", ] [[package]] name = "lazy_static" version = "1.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" version = "0.2.140" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99227334921fae1a979cf0bfdfcc6b3e5ce376ef57e16fb6fb3ea2ed6095f80c" [[package]] name = "libm" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" [[package]] name = "linux-raw-sys" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" [[package]] name = "lock_api" version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" dependencies = [ "autocfg", "scopeguard", ] [[package]] name = "log" version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", ] [[package]] name = "lzma-sys" version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" dependencies = [ "cc", "libc", "pkg-config", ] [[package]] name = "matrixmultiply" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "add85d4dd35074e6fedc608f8c8f513a3548619a9024b751949ef0e8e45a4d84" dependencies = [ "rawpointer", ] [[package]] name = "memchr" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memoffset" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" dependencies = [ "autocfg", ] [[package]] name = "miniz_oxide" version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" dependencies = [ "adler", ] [[package]] name = "multimap" version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" dependencies = [ "serde", ] [[package]] name = "nalgebra" version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d506eb7e08d6329505faa8a3a00a5dcc6de9f76e0c77e4b75763ae3c770831ff" dependencies = [ "approx", "matrixmultiply", "nalgebra-macros", "num-complex", "num-rational", "num-traits", "rand", "rand_distr", "simba", "typenum", ] [[package]] name = "nalgebra-macros" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01fcc0b8149b4632adc89ac3b7b31a12fb6099a0317a4eb2ebff574ef7de7218" dependencies = [ "proc-macro2", "quote", "syn 1.0.109", ] [[package]] name = "ndarray" version = "0.15.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" dependencies = [ "matrixmultiply", "num-complex", "num-integer", "num-traits", "rawpointer", ] [[package]] name = "needletail" version = "0.5.1" dependencies = [ "assert_cmd", "bio", "buffer-redux", "bytecount", "bzip2", "criterion", "escargot", "flate2", "memchr", "predicates", "pyo3", "seq_io", "serde", "serde_derive", "tempfile", "toml", "xz2", ] [[package]] name = "newtype_derive" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac8cd24d9f185bb7223958d8c1ff7a961b74b1953fd05dba7cc568a63b3861ec" dependencies = [ "rustc_version", ] [[package]] name = "normalize-line-endings" version = "0.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" [[package]] name = "num-complex" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d" dependencies = [ "num-traits", ] [[package]] name = "num-integer" version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" dependencies = [ "autocfg", "num-traits", ] [[package]] name = "num-rational" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" dependencies = [ "autocfg", "num-integer", "num-traits", ] [[package]] name = "num-traits" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", "libm", ] [[package]] name = "num_cpus" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" dependencies = [ "hermit-abi 0.2.6", "libc", ] [[package]] name = "once_cell" version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "oorandom" version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "ordered-float" version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13a384337e997e6860ffbaa83708b2ef329fd8c54cb67a5f64d421e0f943254f" dependencies = [ "num-traits", ] [[package]] name = "os_str_bytes" version = "6.5.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" [[package]] name = "parking_lot" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", "parking_lot_core", ] [[package]] name = "parking_lot_core" version = "0.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", "windows-sys 0.45.0", ] [[package]] name = "paste" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79" [[package]] name = "petgraph" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" dependencies = [ "fixedbitset", "indexmap", ] [[package]] name = "pkg-config" version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" [[package]] name = "plotters" version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2538b639e642295546c50fcd545198c9d64ee2a38620a628724a3b266d5fbf97" dependencies = [ "num-traits", "plotters-backend", "plotters-svg", "wasm-bindgen", "web-sys", ] [[package]] name = "plotters-backend" version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "193228616381fecdc1224c62e96946dfbc73ff4384fba576e052ff8c1bea8142" [[package]] name = "plotters-svg" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9a81d2759aae1dae668f783c308bc5c8ebd191ff4184aaa1b37f65a6ae5a56f" dependencies = [ 
"plotters-backend", ] [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "predicates" version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c575290b64d24745b6c57a12a31465f0a66f3a4799686a6921526a33b0797965" dependencies = [ "anstyle", "difflib", "float-cmp", "itertools", "normalize-line-endings", "predicates-core", "regex", ] [[package]] name = "predicates-core" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b794032607612e7abeb4db69adb4e33590fa6cf1149e95fd7cb00e634b92f174" [[package]] name = "predicates-tree" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "368ba315fb8c5052ab692e68a0eefec6ec57b23a36959c14496f0b0df2c0cecf" dependencies = [ "predicates-core", "termtree", ] [[package]] name = "proc-macro-error" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", "syn 1.0.109", "version_check", ] [[package]] name = "proc-macro-error-attr" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" dependencies = [ "proc-macro2", "quote", "version_check", ] [[package]] name = "proc-macro2" version = "1.0.54" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e472a104799c74b514a57226160104aa483546de37e839ec50e3c2e41dd87534" dependencies = [ "unicode-ident", ] [[package]] name = "pyo3" version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfb848f80438f926a9ebddf0a539ed6065434fd7aae03a89312a9821f81b8501" dependencies = [ "cfg-if", "indoc", 
"libc", "memoffset", "parking_lot", "pyo3-build-config", "pyo3-ffi", "pyo3-macros", "unindent", ] [[package]] name = "pyo3-build-config" version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98a42e7f42e917ce6664c832d5eee481ad514c98250c49e0b03b20593e2c7ed0" dependencies = [ "once_cell", "target-lexicon", ] [[package]] name = "pyo3-ffi" version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0707f0ab26826fe4ccd59b69106e9df5e12d097457c7b8f9c0fd1d2743eec4d" dependencies = [ "libc", "pyo3-build-config", ] [[package]] name = "pyo3-macros" version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "978d18e61465ecd389e1f235ff5a467146dc4e3c3968b90d274fe73a5dd4a438" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", "syn 1.0.109", ] [[package]] name = "pyo3-macros-backend" version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e0e1128f85ce3fca66e435e08aa2089a2689c1c48ce97803e13f63124058462" dependencies = [ "proc-macro2", "quote", "syn 1.0.109", ] [[package]] name = "quote" version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" dependencies = [ "proc-macro2", ] [[package]] name = "rand" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", "rand_chacha", "rand_core", ] [[package]] name = "rand_chacha" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", "rand_core", ] [[package]] name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom", ] [[package]] name = "rand_distr" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" dependencies = [ "num-traits", "rand", ] [[package]] name = "rawpointer" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" [[package]] name = "rayon" version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" dependencies = [ "either", "rayon-core", ] [[package]] name = "rayon-core" version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", "num_cpus", ] [[package]] name = "redox_syscall" version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ "bitflags", ] [[package]] name = "regex" version = "1.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] [[package]] name = "regex-automata" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" [[package]] name = "regex-syntax" version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "rustc_version" version = "0.1.7" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" dependencies = [ "semver", ] [[package]] name = "rustix" version = "0.36.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db4165c9963ab29e422d6c26fbc1d37f15bace6b2810221f9d925023480fcf0e" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", "linux-raw-sys", "windows-sys 0.45.0", ] [[package]] name = "rustversion" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" [[package]] name = "ryu" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" [[package]] name = "safe_arch" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "794821e4ccb0d9f979512f9c1973480123f9bd62a90d74ab0f9426fcf8f4a529" dependencies = [ "bytemuck", ] [[package]] name = "safemem" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" [[package]] name = "same-file" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" dependencies = [ "winapi-util", ] [[package]] name = "scoped_threadpool" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8" [[package]] name = "scopeguard" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "semver" version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac" [[package]] name = "seq_io" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c6fd8c16e3bbea801215d58ea15624defd37feaf05015af2dac2a8e5611cc4e4" dependencies = [ "buf_redux", "crossbeam", "memchr", "scoped_threadpool", "serde", "serde_derive", ] [[package]] name = "serde" version = "1.0.158" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "771d4d9c4163ee138805e12c710dd365e4f44be8be0503cb1bb9eb989425d9c9" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.158" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e801c1712f48475582b7696ac71e0ca34ebb30e09338425384269d9717c62cad" dependencies = [ "proc-macro2", "quote", "syn 2.0.10", ] [[package]] name = "serde_json" version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c533a59c9d8a93a09c6ab31f0fd5e5f4dd1b8fc9434804029839884765d04ea" dependencies = [ "itoa", "ryu", "serde", ] [[package]] name = "serde_spanned" version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0efd8caf556a6cebd3b285caf480045fcc1ac04f6bd786b09a6f11af30c4fcf4" dependencies = [ "serde", ] [[package]] name = "simba" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0b7840f121a46d63066ee7a99fc81dcabbc6105e437cae43528cea199b5a05f" dependencies = [ "approx", "num-complex", "num-traits", "paste", "wide", ] [[package]] name = "smallvec" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "statrs" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d08e5e1748192713cc281da8b16924fb46be7b0c2431854eadc785823e5696e" dependencies = [ "approx", "lazy_static", "nalgebra", 
"num-traits", "rand", ] [[package]] name = "strum" version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" [[package]] name = "strum_macros" version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ "heck", "proc-macro2", "quote", "rustversion", "syn 1.0.109", ] [[package]] name = "syn" version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "syn" version = "2.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5aad1363ed6d37b84299588d62d3a7d95b5a5c2d9aad5c85609fda12afaa1f40" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "target-lexicon" version = "0.12.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ae9980cab1db3fceee2f6c6f643d5d8de2997c58ee8d25fb0cc8a9e9e7348e5" [[package]] name = "tempfile" version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" dependencies = [ "cfg-if", "fastrand", "redox_syscall", "rustix", "windows-sys 0.42.0", ] [[package]] name = "termtree" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "textwrap" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", "syn 2.0.10", ] [[package]] name = "tinytemplate" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" dependencies = [ "serde", "serde_json", ] [[package]] name = "toml" version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b403acf6f2bb0859c93c7f0d967cb4a75a7ac552100f9322faf64dc047669b21" dependencies = [ "serde", "serde_spanned", "toml_datetime", "toml_edit", ] [[package]] name = "toml_datetime" version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ab8ed2edee10b50132aed5f331333428b011c99402b5a534154ed15746f9622" dependencies = [ "serde", ] [[package]] name = "toml_edit" version = "0.19.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "239410c8609e8125456927e6707163a3b1fdb40561e4b803bc041f466ccfdc13" dependencies = [ "indexmap", "serde", "serde_spanned", "toml_datetime", "winnow", ] [[package]] name = "triple_accel" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63" [[package]] name = "typenum" version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "unicode-ident" version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" [[package]] name = "unindent" version = "0.1.11" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e1766d682d402817b5ac4490b3c3002d91dfa0d22812f341609f97b08757359c" [[package]] name = "vec_map" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" dependencies = [ "serde", ] [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "wait-timeout" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" dependencies = [ "libc", ] [[package]] name = "walkdir" version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" dependencies = [ "same-file", "winapi-util", ] [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" dependencies = [ "cfg-if", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", "syn 1.0.109", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" dependencies = [ 
"quote", "wasm-bindgen-macro-support", ] [[package]] name = "wasm-bindgen-macro-support" version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" dependencies = [ "proc-macro2", "quote", "syn 1.0.109", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" [[package]] name = "web-sys" version = "0.3.61" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97" dependencies = [ "js-sys", "wasm-bindgen", ] [[package]] name = "wide" version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b689b6c49d6549434bf944e6b0f39238cf63693cb7a147e9d887507fffa3b223" dependencies = [ "bytemuck", "safe_arch", ] [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" dependencies = [ "winapi", ] [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" version = "0.42.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows-sys" version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" [[package]] name = "windows_aarch64_msvc" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" [[package]] name = "windows_i686_gnu" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" [[package]] name = "windows_i686_msvc" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" [[package]] name = "windows_x86_64_gnu" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" [[package]] name = "windows_x86_64_gnullvm" version = 
"0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" [[package]] name = "windows_x86_64_msvc" version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" [[package]] name = "winnow" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae8970b36c66498d8ff1d66685dc86b91b29db0c7739899012f63a63814b4b28" dependencies = [ "memchr", ] [[package]] name = "xz2" version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" dependencies = [ "lzma-sys", ] needletail-0.5.1/Cargo.toml0000644000000041440000000000100111460ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. 
[package] edition = "2021" name = "needletail" version = "0.5.1" authors = [ "Roderick Bovee ", "Vincent Prouillet ", ] include = [ "src/**/*", "LICENSE", "README.md", "CHANGELOG.md", ] description = "FASTX parsing and k-mer methods" readme = "./README.md" keywords = [ "FASTA", "FASTQ", "kmer", "bioinformatics", ] categories = [ "science", "parsing", ] license = "MIT" repository = "https://github.com/onecodex/needletail" [profile.release] lto = true [lib] crate-type = [ "cdylib", "rlib", ] bench = false [[bench]] name = "benchmark" path = "benches/benchmark.rs" harness = false [dependencies.buffer-redux] version = "1" default_features = false [dependencies.bytecount] version = "0.6" features = ["runtime-dispatch-simd"] [dependencies.bzip2] version = "0.4" optional = true [dependencies.flate2] version = "1.0.6" optional = true [dependencies.memchr] version = "2.2.1" [dependencies.pyo3] version = "0.18" optional = true [dependencies.xz2] version = "0.1.6" optional = true [dev-dependencies.assert_cmd] version = "2" [dev-dependencies.bio] version = "1" [dev-dependencies.criterion] version = "0.4" [dev-dependencies.escargot] version = "0.5.0" [dev-dependencies.predicates] version = "3" [dev-dependencies.seq_io] version = "0.3" [dev-dependencies.serde] version = "1.0" [dev-dependencies.serde_derive] version = "1.0" [dev-dependencies.tempfile] version = "3" [dev-dependencies.toml] version = "0.7" [features] compression = [ "bzip2", "flate2", "xz2", ] default = ["compression"] python = ["pyo3/extension-module"] python_test = ["pyo3"] needletail-0.5.1/Cargo.toml.orig0000644000000025210000000000100121020ustar [package] name = "needletail" version = "0.5.1" authors = ["Roderick Bovee ", "Vincent Prouillet "] description = "FASTX parsing and k-mer methods" keywords = ["FASTA", "FASTQ", "kmer", "bioinformatics"] categories = ["science", "parsing"] repository = "https://github.com/onecodex/needletail" license = "MIT" readme = "./README.md" edition = "2021" include = ["src/**/*", 
"LICENSE", "README.md", "CHANGELOG.md"] [lib] crate-type=["cdylib", "rlib"] bench = false [features] default = ["compression"] compression = ["bzip2", "flate2", "xz2"] python = ["pyo3/extension-module"] python_test = ["pyo3"] [dependencies] flate2 = { version="1.0.6", optional=true } bzip2 = { version="0.4", optional=true } xz2 = { version="0.1.6", optional=true } pyo3 = { version = "0.18", optional = true } memchr = "2.2.1" bytecount = {version = "0.6", features = ["runtime-dispatch-simd"]} buffer-redux = { version = "1", default_features = false } [dev-dependencies] criterion = "0.4" # for stdin test escargot = "0.5.0" assert_cmd = "2" predicates = "3" tempfile = "3" # for benchmark comparisons bio = "1" seq_io = "0.3" # for testing with the FormatSpecimens.jl repo samples toml = "0.7" serde = "1.0" serde_derive = "1.0" [profile.release] lto = true [[bench]] name = "benchmark" harness = false path = "benches/benchmark.rs" needletail-0.5.1/Cargo.toml.orig000064400000000000000000000025211046102023000146240ustar 00000000000000[package] name = "needletail" version = "0.5.1" authors = ["Roderick Bovee ", "Vincent Prouillet "] description = "FASTX parsing and k-mer methods" keywords = ["FASTA", "FASTQ", "kmer", "bioinformatics"] categories = ["science", "parsing"] repository = "https://github.com/onecodex/needletail" license = "MIT" readme = "./README.md" edition = "2021" include = ["src/**/*", "LICENSE", "README.md", "CHANGELOG.md"] [lib] crate-type=["cdylib", "rlib"] bench = false [features] default = ["compression"] compression = ["bzip2", "flate2", "xz2"] python = ["pyo3/extension-module"] python_test = ["pyo3"] [dependencies] flate2 = { version="1.0.6", optional=true } bzip2 = { version="0.4", optional=true } xz2 = { version="0.1.6", optional=true } pyo3 = { version = "0.18", optional = true } memchr = "2.2.1" bytecount = {version = "0.6", features = ["runtime-dispatch-simd"]} buffer-redux = { version = "1", default_features = false } [dev-dependencies] criterion 
= "0.4" # for stdin test escargot = "0.5.0" assert_cmd = "2" predicates = "3" tempfile = "3" # for benchmark comparisons bio = "1" seq_io = "0.3" # for testing with the FormatSpecimens.jl repo samples toml = "0.7" serde = "1.0" serde_derive = "1.0" [profile.release] lto = true [[bench]] name = "benchmark" harness = false path = "benches/benchmark.rs" needletail-0.5.1/LICENSE000064400000000000000000000020711046102023000127420ustar 00000000000000The MIT License (MIT) Copyright (c) 2016 Roderick Bovee Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. needletail-0.5.1/README.md000064400000000000000000000117701046102023000132220ustar 00000000000000![CI](https://github.com/onecodex/needletail/workflows/CI/badge.svg) [![crates.io](https://img.shields.io/crates/v/needletail.svg)](https://crates.io/crates/needletail) # Needletail Needletail is a MIT-licensed, minimal-copying FASTA/FASTQ parser and _k_-mer processing library for Rust. 
The goal is to write a fast *and* well-tested set of functions that more specialized bioinformatics programs can use. Needletail's goal is to be as fast as the [readfq](https://github.com/lh3/readfq) C library at parsing FASTX files and much (i.e. 25 times) faster than equivalent Python implementations at _k_-mer counting. ## Example ```rust extern crate needletail; use needletail::{parse_fastx_file, Sequence, FastxReader}; fn main() { let filename = "tests/data/28S.fasta"; let mut n_bases = 0; let mut n_valid_kmers = 0; let mut reader = parse_fastx_file(&filename).expect("valid path/file"); while let Some(record) = reader.next() { let seqrec = record.expect("invalid record"); // keep track of the total number of bases n_bases += seqrec.num_bases(); // normalize to make sure all the bases are consistently capitalized and // that we remove the newlines since this is FASTA let norm_seq = seqrec.normalize(false); // we make a reverse complemented copy of the sequence first for // `canonical_kmers` to draw the complemented sequences from. let rc = norm_seq.reverse_complement(); // now we keep track of the number of AAAAs (or TTTTs via // canonicalization) in the file; note we also get the position (i.0; // in the event there were `N`-containing kmers that were skipped) // and whether the sequence was complemented (i.2) in addition to // the canonical kmer (i.1) for (_, kmer, _) in norm_seq.canonical_kmers(4, &rc) { if kmer == b"AAAA" { n_valid_kmers += 1; } } } println!("There are {} bases in your file.", n_bases); println!("There are {} AAAAs in your file.", n_valid_kmers); } ``` ## Installation Needletail requires `rust` and `cargo` to be installed. Please use either your local package manager (`homebrew`, `apt-get`, `pacman`, etc) or install these via [rustup](https://www.rustup.rs/). 
Once you have Rust set up, you can include needletail in your `Cargo.toml` file like: ```toml [dependencies] needletail = "0.5" ``` To install needletail itself for development: ```shell git clone https://github.com/onecodex/needletail cargo test # to run tests ``` ### Python #### Documentation For a real example, you can refer to `test_python.py`. The Python library raises only one type of exception: `NeedletailError`. There are 2 ways to parse a FASTA/FASTQ: one if you have a string (`parse_fastx_string(content: str)`) and one if you have a path to a file (`parse_fastx_file(path: str)`). Those functions will raise if the file is not found or if the content is invalid; otherwise they return an iterator. ```python from needletail import parse_fastx_file, NeedletailError, reverse_complement, normalize_seq try: for record in parse_fastx_file("myfile.fastq"): print(record.id) print(record.seq) print(record.qual) except NeedletailError: print("Invalid Fastq file") ``` A record has the following shape: ```python class Record: id: str seq: str qual: Optional[str] def is_fasta(self) -> bool def is_fastq(self) -> bool def normalize(self, iupac: bool) ``` Note that `normalize` (see the documentation of the Rust `normalize` method for what it does) will mutate `self.seq`. It is also available as the `normalize_seq(seq: str, iupac: bool)` function which will return the normalized sequence in this case. Lastly, there is also a `reverse_complement(seq: str)` that will do exactly what it says. This will not raise an error if you pass some invalid characters.
#### Building To work on the Python library on a Mac OS X/Unix system (requires Python 3): ```bash pip install maturin # finally, install the library in the local virtualenv maturin develop --cargo-extra-args="--features=python" ``` To build the binary wheels and push to PyPI: ``` # The Mac build requires switching through a few different python versions maturin build --features python --release --strip # The linux build is automated through cross-compiling in a docker image docker run --rm -v $(pwd):/io ghcr.io/pyo3/maturin:main build --features=python --release --strip -f twine upload target/wheels/* ``` ## Getting Help Questions are best directed as GitHub issues. We plan to add more documentation soon, but in the meantime "doc" comments are included in the source. ## Contributing Please do! We're happy to discuss possible additions and/or accept pull requests. ## Acknowledgements Starting from 0.4, the parser algorithms are taken from [seq_io](https://github.com/markschl/seq_io). While they have been slightly modified, they mainly come from that library. Links to the original files are available in `src/parser/fast{a,q}.rs`. needletail-0.5.1/src/bitkmer.rs000064400000000000000000000222271046102023000145340ustar 00000000000000//! Compact binary representations of nucleic acid kmers pub type BitKmerSeq = u64; pub type BitKmer = (BitKmerSeq, u8); const NUC2BIT_LOOKUP: [Option; 256] = { let mut lookup = [None; 256]; lookup[b'A' as usize] = Some(0); lookup[b'C' as usize] = Some(1); lookup[b'G' as usize] = Some(2); lookup[b'T' as usize] = Some(3); lookup[b'a' as usize] = Some(0); lookup[b'c' as usize] = Some(1); lookup[b'g' as usize] = Some(2); lookup[b't' as usize] = Some(3); lookup }; fn nuc2bti_lookup_nocheck(nuc: u8) -> Option { unsafe { *NUC2BIT_LOOKUP.get_unchecked(nuc as usize) } } /// Takes a BitKmer and adds a new base on the end, optionally lopping off the /// first base if the resulting kmer is too long.
fn extend_kmer(kmer: &mut BitKmer, new_char: u8) -> bool { if let Some(new_char_int) = nuc2bti_lookup_nocheck(new_char) { let new_kmer = (kmer.0 << 2) + new_char_int as BitKmerSeq; // mask out any overflowed bits kmer.0 = new_kmer & (BitKmerSeq::pow(2, u32::from(2 * kmer.1)) - 1) as BitKmerSeq; true } else { false } } /// Used for the BitNuclKmer iterator to handle skipping invalid bases. fn update_position( start_pos: &mut usize, kmer: &mut BitKmer, buffer: &[u8], initial: bool, ) -> bool { // check if we have enough "physical" space for one more kmer if *start_pos + kmer.1 as usize > buffer.len() { return false; } let (mut kmer_len, stop_len) = if initial { (0, (kmer.1 - 1) as usize) } else { ((kmer.1 - 1) as usize, kmer.1 as usize) }; let cur_kmer = kmer; while kmer_len < stop_len { if extend_kmer(cur_kmer, buffer[*start_pos + kmer_len]) { kmer_len += 1; } else { kmer_len = 0; *cur_kmer = (0u64, cur_kmer.1); *start_pos += kmer_len + 1; if *start_pos + cur_kmer.1 as usize > buffer.len() { return false; } } } true } pub struct BitNuclKmer<'a> { start_pos: usize, cur_kmer: BitKmer, buffer: &'a [u8], canonical: bool, } impl<'a> BitNuclKmer<'a> { pub fn new(slice: &'a [u8], k: u8, canonical: bool) -> BitNuclKmer<'a> { let mut kmer = (0u64, k); let mut start_pos = 0; update_position(&mut start_pos, &mut kmer, slice, true); BitNuclKmer { start_pos, cur_kmer: kmer, buffer: slice, canonical, } } } impl<'a> Iterator for BitNuclKmer<'a> { type Item = (usize, BitKmer, bool); fn next(&mut self) -> Option<(usize, BitKmer, bool)> { if !update_position(&mut self.start_pos, &mut self.cur_kmer, self.buffer, false) { return None; } self.start_pos += 1; if self.canonical { let (kmer, was_rc) = canonical(self.cur_kmer); Some((self.start_pos - 1, kmer, was_rc)) } else { Some((self.start_pos - 1, self.cur_kmer, false)) } } } /// Reverse complement a BitKmer (reverses the sequence and swaps A<>T and G<>C) pub fn reverse_complement(kmer: BitKmer) -> BitKmer { // FIXME: this is not going 
to work with BitKmers of u128 or u32 // inspired from https://www.biostars.org/p/113640/ let mut new_kmer = kmer.0; // reverse it new_kmer = (new_kmer >> 2 & 0x3333_3333_3333_3333) | (new_kmer & 0x3333_3333_3333_3333) << 2; new_kmer = (new_kmer >> 4 & 0x0F0F_0F0F_0F0F_0F0F) | (new_kmer & 0x0F0F_0F0F_0F0F_0F0F) << 4; new_kmer = (new_kmer >> 8 & 0x00FF_00FF_00FF_00FF) | (new_kmer & 0x00FF_00FF_00FF_00FF) << 8; new_kmer = (new_kmer >> 16 & 0x0000_FFFF_0000_FFFF) | (new_kmer & 0x0000_FFFF_0000_FFFF) << 16; new_kmer = (new_kmer >> 32 & 0x0000_0000_FFFF_FFFF) | (new_kmer & 0x0000_0000_FFFF_FFFF) << 32; // complement it new_kmer ^= 0xFFFF_FFFF_FFFF_FFFF; // shift it to the right size new_kmer >>= 2 * (32 - kmer.1); (new_kmer, kmer.1) } /// Return the lexigraphically lowest of the BitKmer and its reverse complement and /// whether the returned kmer is the reverse_complement (true) or the original (false) pub fn canonical(kmer: BitKmer) -> (BitKmer, bool) { let rc = reverse_complement(kmer); if kmer.0 > rc.0 { (rc, true) } else { (kmer, false) } } /// Find the lexicographically lowest substring of a given length in the BitKmer pub fn minimizer(kmer: BitKmer, minmer_size: u8) -> BitKmer { let mut new_kmer = kmer.0; let mut lowest = !0; let bitmask = (BitKmerSeq::pow(2, u32::from(2 * minmer_size)) - 1) as BitKmerSeq; for _ in 0..=(kmer.1 - minmer_size) { let cur = bitmask & new_kmer; if cur < lowest { lowest = cur; } let cur_rev = reverse_complement((bitmask & new_kmer, kmer.1)); if cur_rev.0 < lowest { lowest = cur_rev.0; } new_kmer >>= 2; } (lowest, kmer.1) } pub fn bitmer_to_bytes(kmer: BitKmer) -> Vec { let mut new_kmer = kmer.0; let mut new_kmer_str = Vec::new(); // we're reading the bases off from the "high" end of the integer so we need to do some // math to figure out where they start (this helps us just pop the bases on the end // of the working buffer as we read them off "left to right") let offset = (kmer.1 - 1) * 2; let bitmask = BitKmerSeq::pow(2, u32::from(2 * 
kmer.1 - 1)) + BitKmerSeq::pow(2, u32::from(2 * kmer.1 - 2)); for _ in 0..kmer.1 { let new_char = (new_kmer & bitmask) >> offset; new_kmer <<= 2; new_kmer_str.push(match new_char { 0 => b'A', 1 => b'C', 2 => b'G', 3 => b'T', _ => panic!("Mathematical impossibility"), }); } new_kmer_str } #[cfg(test)] mod tests { use super::*; #[test] fn can_kmerize() { // test general function let mut i = 0; for (_, k, _) in BitNuclKmer::new(b"AGCT", 1, false) { match i { 0 => assert_eq!(k.0, 0b00 as BitKmerSeq), 1 => assert_eq!(k.0, 0b10 as BitKmerSeq), 2 => assert_eq!(k.0, 0b01 as BitKmerSeq), 3 => assert_eq!(k.0, 0b11 as BitKmerSeq), _ => unreachable!("Too many kmers"), } i += 1; } // test that we skip over N's i = 0; for (_, k, _) in BitNuclKmer::new(b"ACNGT", 2, false) { match i { 0 => assert_eq!(k.0, 0b0001 as BitKmerSeq), 1 => assert_eq!(k.0, 0b1011 as BitKmerSeq), _ => unreachable!("Too many kmers"), } i += 1; } // test that we skip over N's and handle short kmers i = 0; for (_, k, _) in BitNuclKmer::new(b"ACNG", 2, false) { match i { 0 => assert_eq!(k.0, 0x0001 as BitKmerSeq), _ => unreachable!("Too many kmers"), } i += 1; } // test that the minimum length works i = 0; for (_, k, _) in BitNuclKmer::new(b"AC", 2, false) { match i { 0 => assert_eq!(k.0, 0x0001 as BitKmerSeq), _ => unreachable!("Too many kmers"), } i += 1; } } #[test] fn test_iterator() { let seq = b"ACGTA"; let mut kmer_iter = BitNuclKmer::new(seq, 3, false); assert_eq!(kmer_iter.next(), Some((0, (6, 3), false))); assert_eq!(kmer_iter.next(), Some((1, (27, 3), false))); assert_eq!(kmer_iter.next(), Some((2, (44, 3), false))); assert_eq!(kmer_iter.next(), None); let seq = b"TA"; let mut kmer_iter = BitNuclKmer::new(seq, 3, false); assert_eq!(kmer_iter.next(), None); } #[test] fn test_reverse_complement() { assert_eq!(reverse_complement((0b00_0000, 3)).0, 0b11_1111); assert_eq!(reverse_complement((0b11_1111, 3)).0, 0b00_0000); assert_eq!(reverse_complement((0b0000_0000, 4)).0, 0b1111_1111); 
assert_eq!(reverse_complement((0b0001_1011, 4)).0, 0b0001_1011); } #[test] fn test_minimizer() { assert_eq!(minimizer((0b00_1011, 3), 2).0, 0b0010); assert_eq!(minimizer((0b00_1011, 3), 1).0, 0b00); assert_eq!(minimizer((0b1100_0011, 4), 2).0, 0b0000); assert_eq!(minimizer((0b11_0001, 3), 2).0, 0b0001); } #[test] fn test_bytes_to_bitkmer() { let mut ikmer: BitKmer = bytes_to_bitmer(b"C"); assert_eq!(ikmer.0, 1 as BitKmerSeq); ikmer = bytes_to_bitmer(b"TTA"); assert_eq!(ikmer.0, 60 as BitKmerSeq); ikmer = bytes_to_bitmer(b"AAA"); assert_eq!(ikmer.0, 0 as BitKmerSeq); } #[test] fn test_bitmer_to_bytes() { assert_eq!(bitmer_to_bytes((1 as BitKmerSeq, 1)), b"C".to_vec()); assert_eq!(bitmer_to_bytes((60 as BitKmerSeq, 3)), b"TTA".to_vec()); assert_eq!(bitmer_to_bytes((0 as BitKmerSeq, 3)), b"AAA".to_vec()); } pub fn bytes_to_bitmer(kmer: &[u8]) -> BitKmer { let k = kmer.len() as u8; let mut bit_kmer = (0u64, k); for i in 0..k { extend_kmer(&mut bit_kmer, kmer[i as usize]); } bit_kmer } } needletail-0.5.1/src/errors.rs000064400000000000000000000117761046102023000144220ustar 00000000000000//! The errors needletail can return; only when parsing FASTA/FASTQ files use crate::parser::Format; use std::error::Error as StdError; use std::fmt; use std::io; /// Represents where we were in a file when an error occurred. 
#[derive(Debug, Clone, PartialEq, Eq, Default)] pub struct ErrorPosition { /// Line number where the error occurred (starting with 1) pub line: u64, /// ID of record if available pub id: Option, } impl fmt::Display for ErrorPosition { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { if let Some(id) = self.id.as_ref() { write!(f, "record '{}' at ", id)?; } write!(f, "line {}", self.line) } } /// The type of error that occured during file parsing #[derive(Clone, Debug, PartialEq)] pub enum ParseErrorKind { /// An error happened during file/stream input/output Io, /// The file didn't start with `@` or `>` and we didn't know what to expect yet UnknownFormat, /// Invalid start byte of record encountered (expected `@` in FASTQ and `>` in FASTA) InvalidStart, /// The separator line in a FASTQ file is not valid (no `+`) InvalidSeparator, /// Sequence and quality lengths are not equal (in a FASTQ file only) UnequalLengths, /// Truncated record found UnexpectedEnd, /// The file appears to be empty EmptyFile, } /// The only error type that needletail returns #[derive(Clone, Debug, PartialEq)] pub struct ParseError { /// A description of what went wrong pub msg: String, /// The type of error that occurred pub kind: ParseErrorKind, /// Position within file pub position: ErrorPosition, /// The format of the file we were parsing pub format: Option, } impl ParseError { pub fn new_invalid_start(byte_found: u8, position: ErrorPosition, format: Format) -> Self { let msg = format!( "Expected '{}' but found '{}", format.start_char(), (byte_found as char).escape_default() ); ParseError { kind: ParseErrorKind::InvalidStart, msg, position, format: Some(format), } } pub fn new_invalid_separator(byte_found: u8, position: ErrorPosition) -> Self { let msg = format!( "Expected '+' separator but found '{}", (byte_found as char).escape_default() ); ParseError { kind: ParseErrorKind::InvalidSeparator, msg, position, format: Some(Format::Fastq), } } pub fn new_unknown_format(byte_found: u8) 
-> Self { let msg = format!( "Expected '@' or '>' at the start of the file but found '{}'.", (byte_found as char).escape_default() ); ParseError { kind: ParseErrorKind::UnknownFormat, msg, position: ErrorPosition::default(), format: Some(Format::Fastq), } } pub fn new_unequal_length(seq_len: usize, qual_len: usize, position: ErrorPosition) -> Self { let msg = format!( "Sequence length is {} but quality length is {}", seq_len, qual_len ); ParseError { kind: ParseErrorKind::UnequalLengths, msg, position, format: Some(Format::Fastq), } } pub fn new_unexpected_end(position: ErrorPosition, format: Format) -> Self { ParseError { msg: String::new(), kind: ParseErrorKind::UnexpectedEnd, position, format: Some(format), } } pub fn new_empty_file() -> Self { ParseError { msg: String::from("Failed to read the first two bytes. Is the file empty?"), kind: ParseErrorKind::EmptyFile, position: ErrorPosition::default(), format: None, } } } impl fmt::Display for ParseError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.kind { ParseErrorKind::Io => write!(f, "I/O error: {}", self.msg), ParseErrorKind::UnequalLengths | ParseErrorKind::InvalidStart | ParseErrorKind::UnknownFormat | ParseErrorKind::EmptyFile | ParseErrorKind::InvalidSeparator => write!(f, "{} ({})", self.msg, self.position), ParseErrorKind::UnexpectedEnd => { write!(f, "Unexpected end of input ({}).", self.position) } } } } impl From for ParseError { fn from(err: io::Error) -> ParseError { ParseError { msg: err.to_string(), kind: ParseErrorKind::Io, position: ErrorPosition::default(), format: None, } } } impl StdError for ParseError { fn cause(&self) -> Option<&dyn StdError> { // Ideally we would pass the io::Error but we don't for simplicity sake // since we wouldn't be able to `==` on the error kind otherwise. // TODO: impl partialeq manually? None } } needletail-0.5.1/src/kmer.rs000064400000000000000000000151721046102023000140360ustar 00000000000000//! 
Functions for splitting sequences into fixed-width moving windows (kmers) //! and utilities for dealing with these kmers. /// Returns true if the base is a unambiguous nucleic acid base (e.g. ACGT) and /// false otherwise. fn is_good_base(chr: u8) -> bool { matches!(chr as char, 'a' | 'c' | 'g' | 't' | 'A' | 'C' | 'G' | 'T') } /// Generic moving window iterator over sequences to return k-mers /// /// Iterator returns slices to the original data. pub struct Kmers<'a> { k: u8, start_pos: usize, buffer: &'a [u8], } impl<'a> Kmers<'a> { /// Creates a new kmer-izer for a nucleotide/amino acid sequence. pub fn new(buffer: &'a [u8], k: u8) -> Self { Kmers { k, start_pos: 0, buffer, } } } impl<'a> Iterator for Kmers<'a> { type Item = &'a [u8]; fn next(&mut self) -> Option { if self.start_pos + self.k as usize > self.buffer.len() { return None; } let pos = self.start_pos; self.start_pos += 1; Some(&self.buffer[pos..pos + self.k as usize]) } } /// A kmer-izer for a nucleotide acid sequences to return canonical kmers. /// /// Iterator returns the position of the kmer, a slice to the original data, /// and an boolean indicating if the kmer returned is the original or the /// reverse complement. pub struct CanonicalKmers<'a> { k: u8, start_pos: usize, buffer: &'a [u8], rc_buffer: &'a [u8], } impl<'a> CanonicalKmers<'a> { /// Creates a new iterator. /// /// It's generally more useful to use this directly from a sequences (e.g. /// `seq.canonical_kmers`. Requires a reference to the reverse complement /// of the sequence it's created on, e.g. 
/// ``` /// use needletail::Sequence; /// use needletail::kmer::CanonicalKmers; /// /// let seq = b"ACGT"; /// let rc = seq.reverse_complement(); /// let c_iter = CanonicalKmers::new(seq, &rc, 3); /// for (pos, kmer, canonical) in c_iter { /// // process data in here /// } /// /// ``` pub fn new(buffer: &'a [u8], rc_buffer: &'a [u8], k: u8) -> Self { let mut nucl_kmers = CanonicalKmers { k, start_pos: 0, buffer, rc_buffer, }; nucl_kmers.update_position(true); nucl_kmers } fn update_position(&mut self, initial: bool) -> bool { // check if we have enough "physical" space for one more kmer if self.start_pos + self.k as usize > self.buffer.len() { return false; } let (mut kmer_len, stop_len) = if initial { (0, (self.k - 1) as usize) } else { ((self.k - 1) as usize, self.k as usize) }; while kmer_len < stop_len { if is_good_base(self.buffer[self.start_pos + kmer_len]) { kmer_len += 1; } else { kmer_len = 0; self.start_pos += kmer_len + 1; if self.start_pos + self.k as usize > self.buffer.len() { return false; } } } true } } impl<'a> Iterator for CanonicalKmers<'a> { type Item = (usize, &'a [u8], bool); fn next(&mut self) -> Option<(usize, &'a [u8], bool)> { if !self.update_position(false) { return None; } let pos = self.start_pos; self.start_pos += 1; let result = &self.buffer[pos..pos + self.k as usize]; let rc_buffer = self.rc_buffer; let rc_result = &rc_buffer[rc_buffer.len() - pos - self.k as usize..rc_buffer.len() - pos]; if result < rc_result { Some((pos, result, false)) } else { Some((pos, rc_result, true)) } } } #[cfg(test)] mod tests { use super::*; use crate::sequence::Sequence; #[test] fn can_kmerize() { let k_iter = Kmers::new(b"AGCT", 1); // test general function for (i, k) in k_iter.enumerate() { match i { 0 => assert_eq!(k, b"A"), 1 => assert_eq!(k, b"G"), 2 => assert_eq!(k, b"C"), 3 => assert_eq!(k, b"T"), _ => unreachable!("Too many kmers"), } } // test that we handle length 2 (and don't drop Ns) let k_iter = Kmers::new(b"AGNCT", 2); for (i, k) in 
k_iter.enumerate() { match i { 0 => assert_eq!(k, b"AG"), 1 => assert_eq!(k, b"GN"), 2 => assert_eq!(k, b"NC"), 3 => assert_eq!(k, b"CT"), _ => unreachable!("Too many kmers"), } } // test that the minimum length works let k_iter = Kmers::new(b"AC", 2); for k in k_iter { assert_eq!(k, &b"AC"[..]); } } #[test] fn can_canonicalize() { // test general function let seq = b"AGCT"; let rc_seq = seq.reverse_complement(); let c_iter = CanonicalKmers::new(seq, &rc_seq, 1); for (i, (_, k, is_c)) in c_iter.enumerate() { match i { 0 => { assert_eq!(k, b"A"); assert_eq!(is_c, false); } 1 => { assert_eq!(k, b"C"); assert_eq!(is_c, true); } 2 => { assert_eq!(k, b"C"); assert_eq!(is_c, false); } 3 => { assert_eq!(k, b"A"); assert_eq!(is_c, true); } _ => unreachable!("Too many kmers"), } } let seq = b"AGCTA"; let rc_seq = seq.reverse_complement(); let c_iter = CanonicalKmers::new(seq, &rc_seq, 2); for (i, (_, k, _)) in c_iter.enumerate() { match i { 0 => assert_eq!(k, b"AG"), 1 => assert_eq!(k, b"GC"), 2 => assert_eq!(k, b"AG"), 3 => assert_eq!(k, b"TA"), _ => unreachable!("Too many kmers"), } } let seq = b"AGNTA"; let rc_seq = seq.reverse_complement(); let c_iter = CanonicalKmers::new(seq, &rc_seq, 2); for (i, (ix, k, _)) in c_iter.enumerate() { match i { 0 => { assert_eq!(ix, 0); assert_eq!(k, b"AG"); } 1 => { assert_eq!(ix, 3); assert_eq!(k, b"TA"); } _ => unreachable!("Too many kmers"), } } } } needletail-0.5.1/src/lib.rs000064400000000000000000000042011046102023000136350ustar 00000000000000#![crate_name = "needletail"] //! Needletail is a crate to quickly and easily parse FASTA and FASTQ sequences out of //! streams/files and manipulate and analyse that data. //! //! A contrived example of how to use it: //! ``` //! extern crate needletail; //! use needletail::{parse_fastx_file, Sequence, FastxReader}; //! //! fn main() { //! let filename = "tests/data/28S.fasta"; //! //! let mut n_bases = 0; //! let mut n_valid_kmers = 0; //! 
let mut reader = parse_fastx_file(&filename).expect("valid path/file"); //! while let Some(record) = reader.next() { //! let seqrec = record.expect("invalid record"); //! // keep track of the total number of bases //! n_bases += seqrec.num_bases(); //! // normalize to make sure all the bases are consistently capitalized and //! // that we remove the newlines since this is FASTA //! let norm_seq = seqrec.normalize(false); //! // we make a reverse complemented copy of the sequence first for //! // `canonical_kmers` to draw the complemented sequences from. //! let rc = norm_seq.reverse_complement(); //! // now we keep track of the number of AAAAs (or TTTTs via //! // canonicalization) in the file; note we also get the position (i.0; //! // in the event there were `N`-containing kmers that were skipped) //! // and whether the sequence was complemented (i.2) in addition to //! // the canonical kmer (i.1) //! for (_, kmer, _) in norm_seq.canonical_kmers(4, &rc) { //! if kmer == b"AAAA" { //! n_valid_kmers += 1; //! } //! } //! } //! println!("There are {} bases in your file.", n_bases); //! println!("There are {} AAAAs in your file.", n_valid_kmers); //! } //! ``` //! #[cfg(any(feature = "python", feature = "python_test"))] extern crate pyo3; pub mod bitkmer; pub mod kmer; pub mod parser; pub mod sequence; pub mod errors; #[cfg(any(feature = "python", feature = "python_test"))] pub mod python; pub use parser::{parse_fastx_file, parse_fastx_reader, parse_fastx_stdin, FastxReader}; pub use sequence::Sequence; needletail-0.5.1/src/parser/fasta.rs000064400000000000000000000352561046102023000154770ustar 00000000000000//! 
The vast majority of the code is taken from https://github.com/markschl/seq_io/blob/master/src/fasta.rs use crate::errors::{ErrorPosition, ParseError}; use crate::parser::record::SequenceRecord; use crate::parser::utils::{ fill_buf, find_line_ending, grow_to, trim_cr, FastxReader, Format, LineEnding, Position, BUFSIZE, }; use memchr::{memchr2, Memchr}; use std::borrow::Cow; use std::fs::File; use std::io; use std::io::BufRead; use std::path::Path; #[derive(Clone, Debug)] pub struct BufferPosition { /// index of '>' pub(crate) start: usize, /// Indicate line start, but actually it is one byte before (start - 1), which is usually /// the line terminator of the header (if there is one). The last index in the Vec is always /// the last byte of the last sequence line (including line terminator if present). /// Therefore, the length of this Vec should never be 0. pub(crate) seq_pos: Vec, } impl BufferPosition { #[inline] fn is_new(&self) -> bool { self.seq_pos.is_empty() } #[inline] fn reset(&mut self, start: usize) { self.seq_pos.clear(); self.start = start; } #[inline] fn find_line_ending(&self, buffer: &[u8]) -> Option { find_line_ending(self.all(buffer)) } #[inline] pub(crate) fn all<'a>(&self, buffer: &'a [u8]) -> &'a [u8] { &buffer[self.start..*self.seq_pos.last().unwrap()] } #[inline] pub(crate) fn id<'a>(&self, buffer: &'a [u8]) -> &'a [u8] { trim_cr(&buffer[self.start + 1..*self.seq_pos.first().unwrap()]) } #[inline] pub(crate) fn raw_seq<'a>(&self, buffer: &'a [u8]) -> &'a [u8] { if self.seq_pos.len() > 1 { let start = *self.seq_pos.first().unwrap() + 1; let end = *self.seq_pos.last().unwrap(); trim_cr(&buffer[start..end]) } else { b"" } } #[inline] pub(crate) fn seq<'a>(&self, buffer: &'a [u8]) -> Cow<'a, [u8]> { // TODO: make that DRY let seq = if self.seq_pos.len() > 1 { let start = *self.seq_pos.first().unwrap() + 1; let end = *self.seq_pos.last().unwrap(); trim_cr(&buffer[start..end]) } else { b"" }; // first part is a fast check to see if we need to do 
any allocations let mut i; match memchr2(b'\r', b'\n', seq) { Some(break_loc) => i = break_loc, None => return seq.into(), } // we found a newline; create a new buffer and stripping out newlines // and writing into it let mut new_buf = Vec::with_capacity(seq.len() - 1); new_buf.extend_from_slice(&seq[..i]); while i < seq.len() { match memchr2(b'\r', b'\n', &seq[i..]) { None => { new_buf.extend_from_slice(&seq[i..]); break; } Some(match_pos) => { new_buf.extend_from_slice(&seq[i..i + match_pos]); i += match_pos + 1; } } } new_buf.into() } #[inline] pub(crate) fn num_bases(&self, buffer: &[u8]) -> usize { let seq = self.raw_seq(buffer); let num_lines = bytecount::count(seq, b'\n'); let windows_num_lines = bytecount::count(seq, b'\r'); seq.len() - num_lines - windows_num_lines } } /// Parser for FASTA files. /// Only use this directly if you know your file is FASTA and that it is not compressed as /// it does not handle decompression. /// If you are unsure, it's better to use [parse_fastx_file](fn.parse_fastx_file.html). pub struct Reader { buf_reader: buffer_redux::BufReader, buf_pos: BufferPosition, search_pos: usize, position: Position, finished: bool, line_ending: Option, } impl Reader where R: io::Read, { /// Creates a new reader with the default buffer size of 64 KiB /// /// # Example: /// /// ``` /// use needletail::parser::{FastaReader, FastxReader}; /// let fasta = b">id\nSEQUENCE"; /// /// let mut reader = FastaReader::new(&fasta[..]); /// let record = reader.next().unwrap().unwrap(); /// assert_eq!(record.id(), b"id") /// ``` #[inline] pub fn new(reader: R) -> Reader { Reader::with_capacity(reader, BUFSIZE) } /// Creates a new reader with a given buffer capacity. The minimum allowed /// capacity is 3. 
    /// Finds the position of the next record
    /// and returns true if found; false if end of buffer reached.
    ///
    /// When `_find` does not locate a complete record and the buffer currently
    /// holds fewer bytes than its capacity, this is treated as end of input:
    /// the reader is marked finished and `true` is returned for whatever has
    /// been collected so far.
    #[inline]
    fn find(&mut self) -> bool {
        if self._find() {
            return true;
        }

        // nothing found
        if self.get_buf().len() < self.buf_reader.capacity() {
            // EOF reached, there will be no next record
            self.finished = true;
            // Close the last record at the current search position, but only
            // if at least one line end was already recorded for it; otherwise
            // `seq_pos` stays empty.
            if !self.buf_pos.seq_pos.is_empty() {
                self.buf_pos.seq_pos.push(self.search_pos);
            }
            return true;
        }

        // buffer is full and still holds no complete record
        false
    }
    /// Scans the buffer from `search_pos` for `\n` bytes, recording each one in
    /// `buf_pos.seq_pos`, until a `\n` directly followed by `>` is seen.
    ///
    /// On success `search_pos` is left on that `>`. Otherwise `search_pos` is
    /// left where the next scan has to resume.
    #[inline]
    fn _find(&mut self) -> bool {
        let bufsize = self.get_buf().len();
        for pos in Memchr::new(b'\n', &self.buf_reader.buffer()[self.search_pos..]) {
            // `Memchr` yields offsets relative to the searched slice
            let pos = self.search_pos + pos;
            let next_line_start = pos + 1;
            if next_line_start == bufsize {
                // cannot check next byte -> treat as incomplete
                // (this newline is deliberately not recorded yet)
                self.search_pos = pos; // make sure last byte is re-searched next time
                return false;
            }
            self.buf_pos.seq_pos.push(pos);
            if self.get_buf()[next_line_start] == b'>' {
                // complete record was found
                self.search_pos = next_line_start;
                return true;
            }
        }

        // record end not found
        self.search_pos = bufsize;

        false
    }
self.finished = true; return None; } } Err(e) => { return Some(Err(e.into())); } }; if self.get_buf()[0] == b'>' { self.position.line = 1; self.position.byte = 0; self.buf_pos.start = 0; self.search_pos = 1; } else { return Some(Err(ParseError::new_invalid_start( self.get_buf()[0], ErrorPosition { line: self.position.line, id: None, }, Format::Fasta, ))); } } if !self.buf_pos.is_new() { self.next_pos(); } // Can we identify the start of the next record ? let complete = self.find(); if !complete { // Did we get a record? let got_record = match self.next_complete() { Ok(f) => f, Err(e) => { return Some(Err(e)); } }; if !got_record { return None; } } if self.buf_pos.seq_pos.is_empty() { return Some(Err(ParseError::new_unexpected_end( ErrorPosition { line: self.position.line, id: None, }, Format::Fasta, ))); } if self.line_ending.is_none() { self.line_ending = self.buf_pos.find_line_ending(self.get_buf()); } Some(Ok(SequenceRecord::new_fasta( self.get_buf(), &self.buf_pos, &self.position, self.line_ending, ))) } fn position(&self) -> &Position { &self.position } fn line_ending(&self) -> Option { self.line_ending } } #[cfg(test)] mod tests { use std::io::Cursor; use super::*; use crate::errors::ParseErrorKind; fn seq(s: &[u8]) -> Cursor<&[u8]> { Cursor::new(&s[..]) } #[test] fn test_basic() { let mut reader = Reader::new(seq(b">test\nACGT\n>test2\nTGCA\n")); assert!(reader.line_ending().is_none()); let rec = reader.next().unwrap(); assert!(rec.is_ok()); let r = rec.unwrap(); assert_eq!(r.id(), b"test"); assert_eq!(r.raw_seq(), b"ACGT"); assert_eq!(r.all(), b">test\nACGT"); assert_eq!(reader.line_ending().unwrap(), LineEnding::Unix); let rec = reader.next().unwrap(); assert!(rec.is_ok()); let r = rec.unwrap(); assert_eq!(r.id(), b"test2"); assert_eq!(r.raw_seq(), b"TGCA"); assert!(reader.next().is_none()); } #[test] fn test_wrapped_fasta() { let mut reader = Reader::new(seq(b">test\nACGT\nACGT\n>test2\nTGCA\nTG")); let rec = reader.next().unwrap(); assert!(rec.is_ok()); 
let r = rec.unwrap(); assert_eq!(r.id(), b"test"); assert_eq!(r.raw_seq(), b"ACGT\nACGT"); assert_eq!(r.num_bases(), 8); assert_eq!(reader.line_ending().unwrap(), LineEnding::Unix); let rec = reader.next().unwrap(); assert!(rec.is_ok()); let r = rec.unwrap(); assert_eq!(r.id(), b"test2"); assert_eq!(r.raw_seq(), b"TGCA\nTG"); assert_eq!(r.num_bases(), 6); assert!(reader.next().is_none()); } #[test] fn test_wrapped_fasta_windows_newlines() { let mut reader = Reader::new(seq(b">test\r\nACGT\r\nACGT\r\n>test2\r\nTGCA\r\nTG")); let rec = reader.next().unwrap(); assert!(rec.is_ok()); let r = rec.unwrap(); assert_eq!(r.id(), b"test"); assert_eq!(r.raw_seq(), b"ACGT\r\nACGT"); assert_eq!(r.num_bases(), 8); assert_eq!(r.start_line_number(), 1); assert_eq!(reader.line_ending().unwrap(), LineEnding::Windows); let rec = reader.next().unwrap(); assert!(rec.is_ok()); let r = rec.unwrap(); assert_eq!(r.id(), b"test2"); assert_eq!(r.raw_seq(), b"TGCA\r\nTG"); assert_eq!(r.num_bases(), 6); assert_eq!(r.start_line_number(), 4); assert!(reader.next().is_none()); } #[test] fn test_premature_ending() { let mut reader = Reader::new(seq(b">test\nAGCT\n>test2")); reader.next().unwrap().unwrap(); let rec = reader.next().unwrap(); assert!(rec.is_err()); let r = rec.unwrap_err(); assert_eq!(r.kind, ParseErrorKind::UnexpectedEnd); let mut reader = Reader::new(seq(b">test\r\nAGCT\r\n>test2\r\n")); reader.next().unwrap().unwrap(); let rec = reader.next().unwrap(); assert!(rec.is_err()); let r = rec.unwrap_err(); assert_eq!(r.kind, ParseErrorKind::UnexpectedEnd); } #[test] fn test_empty_records() { let mut reader = Reader::new(seq(b">\n\n>shine\nAGGAGGU")); let rec = reader.next().unwrap().unwrap(); assert_eq!(rec.id(), b""); assert_eq!(rec.raw_seq(), b""); let rec = reader.next().unwrap().unwrap(); assert_eq!(rec.id(), b"shine"); assert_eq!(rec.raw_seq(), b"AGGAGGU"); let mut reader = Reader::new(seq(b">\r\n\r\n>shine\r\nAGGAGGU")); let rec = reader.next().unwrap().unwrap(); 
assert_eq!(rec.id(), b""); assert_eq!(rec.raw_seq(), b""); let rec = reader.next().unwrap().unwrap(); assert_eq!(rec.id(), b"shine"); assert_eq!(rec.raw_seq(), b"AGGAGGU"); } } needletail-0.5.1/src/parser/fastq.rs000064400000000000000000000501041046102023000155040ustar 00000000000000//! The vast majority of the code is taken from https://github.com/markschl/seq_io/blob/master/src/fastq.rs use std::fs::File; use std::io::{self, BufRead}; use std::path::Path; use crate::errors::{ErrorPosition, ParseError}; use crate::parser::record::SequenceRecord; use crate::parser::utils::{ fill_buf, find_line_ending, grow_to, trim_cr, FastxReader, Format, LineEnding, Position, BUFSIZE, }; use memchr::memchr; /// Represents the position of a record within a buffer #[derive(Debug, Clone, Default)] pub struct BufferPosition { pub(crate) start: usize, pub(crate) end: usize, pub(crate) seq: usize, pub(crate) sep: usize, pub(crate) qual: usize, } impl BufferPosition { #[inline] pub(crate) fn is_new(&self) -> bool { self.end == 0 } #[inline] pub(crate) fn len(&self) -> u64 { (self.end + 1 - self.start) as u64 } #[inline] pub(crate) fn id<'a>(&'a self, buffer: &'a [u8]) -> &'a [u8] { trim_cr(&buffer[self.start + 1..self.seq - 1]) } #[inline] pub(crate) fn seq<'a>(&'a self, buffer: &'a [u8]) -> &'a [u8] { trim_cr(&buffer[self.seq..self.sep - 1]) } #[inline] pub(crate) fn qual<'a>(&'a self, buffer: &'a [u8]) -> &'a [u8] { trim_cr(&buffer[self.qual..self.end]) } #[inline] pub(crate) fn num_bases<'a>(&'a self, buffer: &'a [u8]) -> usize { self.seq(buffer).len() } #[inline] fn find_line_ending<'a>(&'a self, buffer: &'a [u8]) -> Option { find_line_ending(self.all(buffer)) } #[inline] pub(crate) fn all<'a>(&self, buffer: &'a [u8]) -> &'a [u8] { &buffer[self.start..self.end] } } #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] enum SearchPosition { Id, Sequence, Separator, Quality, } /// Parser for FASTQ files. 
/// Only use this directly if you know your file is FASTQ and that it is not compressed as /// it does not handle decompression. /// If you are unsure, it's better to use [parse_fastx_file](fn.parse_fastx_file.html). pub struct Reader { buf_reader: buffer_redux::BufReader, buf_pos: BufferPosition, search_pos: SearchPosition, position: Position, finished: bool, line_ending: Option, } impl Reader where R: io::Read, { /// Creates a new reader with the default buffer size of 64 KiB /// /// # Example: /// /// ``` /// use needletail::parser::{FastqReader, FastxReader}; /// let fastq = b"@id\nACGT\n+\nIIII"; /// /// let mut reader = FastqReader::new(&fastq[..]); /// let record = reader.next().unwrap().unwrap(); /// assert_eq!(record.id(), b"id") /// ``` pub fn new(reader: R) -> Reader { Reader::with_capacity(reader, BUFSIZE) } /// Creates a new reader with a given buffer capacity. The minimum allowed /// capacity is 3. pub fn with_capacity(reader: R, capacity: usize) -> Reader { assert!(capacity >= 3); Reader { buf_reader: buffer_redux::BufReader::with_capacity(capacity, reader), buf_pos: BufferPosition::default(), search_pos: SearchPosition::Id, position: Position::new(1, 0), finished: false, line_ending: None, } } } impl Reader { /// Creates a reader from a file path. /// /// # Example: /// /// ```no_run /// use needletail::parser::{FastxReader, FastqReader}; /// /// let mut reader = FastqReader::from_path("seqs.fastq").unwrap(); /// /// // (... do something with the reader) /// ``` pub fn from_path>(path: P) -> io::Result> { File::open(path).map(Reader::new) } } impl Reader where R: io::Read, { #[inline] fn get_buf(&self) -> &[u8] { self.buf_reader.buffer() } // TODO: avoid duplication with find_incomplete. // TODO: having a single fn and adding branches introduce a noticeable slowdown /// Reads the current record and returns true if found. /// Returns false if incomplete because end of buffer reached, /// meaning that the last record may be incomplete. 
/// Updates self.search_pos. fn find(&mut self) -> Result { self.buf_pos.seq = match self.find_line(self.buf_pos.start) { Some(p) => p, None => { self.search_pos = SearchPosition::Id; return Ok(false); } }; self.buf_pos.sep = match self.find_line(self.buf_pos.seq) { Some(p) => p, None => { self.search_pos = SearchPosition::Sequence; return Ok(false); } }; self.buf_pos.qual = match self.find_line(self.buf_pos.sep) { Some(p) => p, None => { self.search_pos = SearchPosition::Separator; return Ok(false); } }; self.buf_pos.end = match self.find_line(self.buf_pos.qual) { Some(p) => p - 1, None => { self.search_pos = SearchPosition::Quality; return Ok(false); } }; self.validate()?; Ok(true) } // Resumes reading an incomplete record without // re-searching positions that were already found. // The resulting position may still be incomplete (-> false). fn find_incomplete(&mut self) -> Result { if self.search_pos == SearchPosition::Id { self.buf_pos.seq = match self.find_line(self.buf_pos.start) { Some(p) => p, None => { self.search_pos = SearchPosition::Id; return Ok(false); } }; } if self.search_pos <= SearchPosition::Sequence { self.buf_pos.sep = match self.find_line(self.buf_pos.seq) { Some(p) => p, None => { self.search_pos = SearchPosition::Sequence; return Ok(false); } }; } if self.search_pos <= SearchPosition::Separator { self.buf_pos.qual = match self.find_line(self.buf_pos.sep) { Some(p) => p, None => { self.search_pos = SearchPosition::Separator; return Ok(false); } }; } if self.search_pos <= SearchPosition::Quality { self.buf_pos.end = match self.find_line(self.buf_pos.qual) { Some(p) => p - 1, None => { self.search_pos = SearchPosition::Quality; return Ok(false); } }; } self.search_pos = SearchPosition::Id; self.validate()?; Ok(true) } /// Verify that the record is valid: /// - starts with @ /// - separator line starts with - /// - quality and sequence have the same length fn validate(&mut self) -> Result<(), ParseError> { let start_byte = 
self.get_buf()[self.buf_pos.start]; if start_byte != b'@' { self.finished = true; return Err(ParseError::new_invalid_start( start_byte, self.get_error_pos(0, false), Format::Fastq, )); } let sep_byte = self.get_buf()[self.buf_pos.sep]; if sep_byte != b'+' { self.finished = true; return Err(ParseError::new_invalid_separator( sep_byte, self.get_error_pos(2, true), )); } let buf = self.get_buf(); // We assume we only have ASCII in sequence and quality let seq_len = self.buf_pos.seq(buf).len(); let qual_len = self.buf_pos.qual(buf).len(); // TODO: we don't do that every time because it's a ~90% performance penalty. // TODO: mention it on the README // And we can further validate quality chars // and the vast majority of files don't have this issue // let qual_len = self // .buf_pos // .qual(&buf) // .iter() // .filter(|c| *c >= &b'!' && *c <= &b'~') // .count(); if seq_len != qual_len { self.finished = true; return Err(ParseError::new_unequal_length( seq_len, qual_len, self.get_error_pos(0, true), )); } Ok(()) } fn get_error_pos(&self, line_offset: u64, parse_id: bool) -> ErrorPosition { let id = if parse_id && self.buf_pos.seq - self.buf_pos.start > 1 { let id = self .buf_pos .id(self.get_buf()) .split(|b| *b == b' ') .next() .unwrap(); Some(String::from_utf8_lossy(id).into()) } else { None }; ErrorPosition { line: self.position.line() + line_offset, id, } } #[inline] fn find_line(&self, search_start: usize) -> Option { memchr(b'\n', &self.get_buf()[search_start..]).map(|pos| search_start + pos + 1) } /// Called when we couldn't find a complete record. 
/// We might be at EOF, buffer might be too small or we need to refill it fn next_complete(&mut self) -> Result { loop { if self.get_buf().len() < self.buf_reader.capacity() { // EOF reached, there will be no next record return self.check_end(); } if self.buf_pos.start == 0 { // first record already incomplete -> buffer too small self.grow(); } else { // not the first record -> buffer may be big enough but we need to make some space self.make_room(); } fill_buf(&mut self.buf_reader)?; if self.find_incomplete()? { return Ok(true); } } } /// Checks for EOF. /// If there is one last record that can be sent, return `true` otherwise `false`. fn check_end(&mut self) -> Result { self.finished = true; if self.search_pos == SearchPosition::Quality { // no line ending at end of last record self.buf_pos.end = self.get_buf().len(); self.validate()?; return Ok(true); } // It allows some blank lines at the end of the file let rest = &self.get_buf()[self.buf_pos.start..]; if rest.split(|c| *c == b'\n').all(|l| trim_cr(l).is_empty()) { return Ok(false); } Err(ParseError::new_unexpected_end( self.get_error_pos(self.search_pos as u64, self.search_pos > SearchPosition::Id), Format::Fastq, )) } // Grow the internal buffer. 
Used if the original buffer is not big // enough for a record fn grow(&mut self) { let cap = self.buf_reader.capacity(); let new_size = grow_to(cap); let additional = new_size - cap; self.buf_reader.reserve(additional); } // Consume bytes from records we've seen and move incomplete bytes to start of buffer fn make_room(&mut self) { let consumed = self.buf_pos.start; self.buf_reader.consume(consumed); self.buf_reader.make_room(); self.buf_pos.start = 0; if self.search_pos >= SearchPosition::Sequence { self.buf_pos.seq -= consumed; } if self.search_pos >= SearchPosition::Separator { self.buf_pos.sep -= consumed; } if self.search_pos >= SearchPosition::Quality { self.buf_pos.qual -= consumed; } } } impl FastxReader for Reader { fn next(&mut self) -> Option> { // No more records to read if self.finished { return None; } // Empty buffer, let's fill it if self.get_buf().is_empty() { // If we get an ParseError when reading or get back 0 bytes, we're done match fill_buf(&mut self.buf_reader) { Ok(n) => { if n == 0 { self.finished = true; return None; } } Err(e) => { return Some(Err(e.into())); } }; } // If we already did look at a record, let's setup for the next one if !self.buf_pos.is_new() { self.position.byte += self.buf_pos.len(); self.position.line += 4; self.buf_pos.start = self.buf_pos.end + 1; } // Can we identify all the positions of each element of the next record? let complete = match self.find() { Ok(f) => f, Err(e) => { return Some(Err(e)); } }; // If it's not complete, try to fetch more from the buffer until we have it in full if !complete { // Did we get a record? let got_record = match self.next_complete() { Ok(f) => f, Err(e) => { return Some(Err(e)); } }; if !got_record { return None; } } if self.line_ending.is_none() { self.line_ending = self.buf_pos.find_line_ending(self.get_buf()); } // We got one! 
Some(Ok(SequenceRecord::new_fastq( self.get_buf(), &self.buf_pos, &self.position, self.line_ending, ))) } fn position(&self) -> &Position { &self.position } fn line_ending(&self) -> Option { self.line_ending } } #[cfg(test)] mod test { use std::io::Cursor; use super::Reader; use crate::errors::ParseErrorKind; use crate::parser::utils::LineEnding; use crate::FastxReader; fn seq(s: &[u8]) -> Cursor<&[u8]> { Cursor::new(&s[..]) } #[test] fn test_simple_fastq() { // Test both line endings let sequences = vec![ ( "@test\nAGCT\n+test\n~~a!\n@test2\nTGCA\n+test\nWUI9", LineEnding::Unix, ), ( "@test\r\nAGCT\r\n+test\r\n~~a!\r\n@test2\r\nTGCA\r\n+test\r\nWUI9", LineEnding::Windows, ), ]; for (sequence, line_ending) in sequences { let mut i = 0; let mut reader = Reader::new(seq(sequence.as_bytes())); while let Some(record) = reader.next() { let rec = record.unwrap(); match i { 0 => { assert_eq!(&rec.id(), b"test"); assert_eq!(&rec.raw_seq(), b"AGCT"); assert_eq!(&rec.qual().unwrap(), b"~~a!"); assert_eq!(reader.line_ending().unwrap(), line_ending); } 1 => { assert_eq!(&rec.id(), b"test2"); assert_eq!(&rec.raw_seq(), b"TGCA"); assert_eq!(&rec.qual().unwrap(), b"WUI9"); assert_eq!(reader.line_ending().unwrap(), line_ending); } _ => unreachable!("Too many records"), } i += 1; } assert_eq!(i, 2); } } #[test] fn test_eof_in_qual() { let mut reader = Reader::new(seq(b"@test\nACGT\n+\nIII")); let rec = reader.next().unwrap(); assert!(rec.is_err()); let e = rec.unwrap_err(); // Not a eof error due to the way the validate is implemented assert_eq!(e.kind, ParseErrorKind::UnequalLengths); } #[test] fn test_eof_in_seq() { let mut reader = Reader::new(seq(b"@test\nAGCT\n+test\n~~a!\n@test2\nTGCA")); let rec = reader.next().unwrap(); assert!(rec.is_ok()); let rec2 = reader.next().unwrap(); assert!(rec2.is_err()); let e = rec2.unwrap_err(); assert_eq!(e.kind, ParseErrorKind::UnexpectedEnd); } #[test] fn test_extra_empty_newlines_at_end_are_ok() { let mut reader = 
Reader::new(seq(b"@test\nAGCT\n+test\n~~a!\n\n")); let rec = reader.next().unwrap(); assert!(rec.is_ok()); assert!(reader.next().is_none()); } #[test] fn test_extra_non_empty_newlines_at_end_are_not_ok() { let mut reader = Reader::new(seq(b"@test\nAGCT\n+test\n~~a!\n\n@TEST\nA\n+TEST\n~")); let rec = reader.next().unwrap(); assert!(rec.is_ok()); let rec2 = reader.next().unwrap(); let e = rec2.unwrap_err(); assert_eq!(e.kind, ParseErrorKind::InvalidStart); } #[test] fn test_empty_records() { let mut reader = Reader::new(seq(b"@\n\n+\n\n@test2\nTGCA\n+test2\n~~~~\n")); let mut i = 0; while let Some(record) = reader.next() { let rec = record.unwrap(); match i { 0 => { assert_eq!(&rec.id(), b""); assert_eq!(&rec.raw_seq(), b""); assert_eq!(&rec.qual().unwrap(), b""); assert_eq!(rec.all(), b"@\n\n+\n"); } 1 => { assert_eq!(&rec.id(), b"test2"); assert_eq!(&rec.raw_seq(), b"TGCA"); assert_eq!(&rec.qual().unwrap(), b"~~~~"); assert_eq!(rec.all(), b"@test2\nTGCA\n+test2\n~~~~"); } _ => unreachable!("Too many records"), } i += 1 } assert_eq!(i, 2); } #[test] fn test_weird_ncbi_file() { let test = b"@NCBI actually has files like this\nACGTACGATCGTACGTAGCTGCTAGCTAGCATGCATGACACACACGTACGATCGTACGTAGCTGCTAGCTAGCATGCATGACACAC\n+\n00000000000000000000000000000000000000000000000000000000000000000000000000000000000000\n@NCBI actually has files like this\n\n+\n\n@NCBI actually has files like this\nACGTACGATCGTACGTAGCTGCTAGCTAGCATGCATGACACACACGTACGATCGTACGTAGCTGCTAGCTAGCATGCATGACACAC\n+\n00000000000000000000000000000000000000000000000000000000000000000000000000000000000000"; let mut reader = Reader::new(seq(test)); let rec = reader.next().unwrap(); assert!(rec.is_ok()); let r = rec.unwrap(); assert_eq!(r.start_line_number(), 1); let rec = reader.next().unwrap(); assert!(rec.is_ok()); let r = rec.unwrap(); assert_eq!(r.start_line_number(), 5); let rec = reader.next().unwrap(); assert!(rec.is_ok()); let r = rec.unwrap(); assert_eq!(r.start_line_number(), 9); } #[test] fn 
test_mismatched_lengths() { let mut reader = Reader::new(seq(b"@test\nAGCT\n+\nIII\n@TEST\nA\n+\nI")); let rec = reader.next().unwrap(); assert!(rec.is_err()); let e = rec.unwrap_err(); assert_eq!(e.kind, ParseErrorKind::UnequalLengths); } // https://github.com/onecodex/needletail/pull/36 #[test] fn test_bad_headers() { let mut reader = Reader::from_path("tests/data/bad_header.fastq").unwrap(); let rec = reader.next().unwrap(); assert!(rec.is_ok()); let rec2 = reader.next().unwrap(); let e = rec2.unwrap_err(); // Ideally this would not be UnexpectedEnd since we know it's an invalid record // but that's for another day assert_eq!(e.kind, ParseErrorKind::UnexpectedEnd); } // https://github.com/onecodex/needletail/pull/39 #[test] fn test_fastq_with_random_tsv_inside() { let mut reader = Reader::from_path("tests/data/random_tsv.fq").unwrap(); let rec = reader.next().unwrap(); assert!(rec.is_ok()); let rec2 = reader.next().unwrap(); let e = rec2.unwrap_err(); // It errors when it tries to validate the separator line that needs to start with `+` assert_eq!(e.kind, ParseErrorKind::InvalidSeparator); } } needletail-0.5.1/src/parser/mod.rs000064400000000000000000000125641046102023000151550ustar 00000000000000//! 
Handles all the FASTA/FASTQ parsing use std::fs::File; use std::io::{stdin, Cursor, Read}; use std::path::Path; #[cfg(feature = "compression")] use bzip2::read::BzDecoder; #[cfg(feature = "compression")] use flate2::read::MultiGzDecoder; #[cfg(feature = "compression")] use xz2::read::XzDecoder; use crate::errors::ParseError; pub use crate::parser::fasta::Reader as FastaReader; pub use crate::parser::fastq::Reader as FastqReader; mod record; mod utils; mod fasta; mod fastq; pub use crate::parser::utils::FastxReader; // Magic bytes for each compression format #[cfg(feature = "compression")] const GZ_MAGIC: [u8; 2] = [0x1F, 0x8B]; #[cfg(feature = "compression")] const BZ_MAGIC: [u8; 2] = [0x42, 0x5A]; #[cfg(feature = "compression")] const XZ_MAGIC: [u8; 2] = [0xFD, 0x37]; fn get_fastx_reader<'a, R: 'a + io::Read + Send>( reader: R, first_byte: u8, ) -> Result, ParseError> { match first_byte { b'>' => Ok(Box::new(FastaReader::new(reader))), b'@' => Ok(Box::new(FastqReader::new(reader))), _ => Err(ParseError::new_unknown_format(first_byte)), } } /// The main entry point of needletail if you're reading from something that implements [`std::io::Read`]. /// This automatically detects whether the file is: /// 1. compressed: [`gzip`][gzip], [`bz`][bz] and [`xz`][xz] are supported and will use the appropriate decoder /// 2. FASTA or FASTQ: the right parser will be automatically instantiated /// /// Option 1 is only available if the `compression` feature is enabled. /// /// # Errors /// /// If the object you're reading from has less than 2 bytes then a [`ParserError`](needletail::errors::ParserError) of the kind /// [`ParseErrorKind::EmptyFile`](needletail::errors::ParseErrorKind::EmptyFile) is returned. /// /// If the first byte in the object is unknown, then a `ParserError` of the kind /// [`ParseErrorKind::UnknownFormat`](needletail::errors::ParseErrorKind::UnknownFormat) is returned. 
///
/// # Examples
///
/// ```
/// use needletail::parse_fastx_reader;
///
/// let reader = ">read1\nACGT\n>read2\nGGGG".as_bytes();
/// let mut fastx_reader = parse_fastx_reader(reader).expect("invalid reader");
/// let mut idx = 0;
/// let read_ids = [b"read1", b"read2"];
///
/// while let Some(r) = fastx_reader.next() {
///     let record = r.expect("invalid record");
///     assert_eq!(record.id(), read_ids[idx]);
///     idx += 1;
/// }
/// assert_eq!(idx, 2);
/// ```
///
/// [gzip]: https://www.gnu.org/software/gzip/
/// [bz]: https://sourceware.org/bzip2/
/// [xz]: https://tukaani.org/xz/format.html
///
pub fn parse_fastx_reader<'a, R: 'a + io::Read + Send>(
    mut reader: R,
) -> Result<Box<dyn FastxReader + 'a>, ParseError> {
    // Peek at the first two bytes: they are either a compression magic number or the
    // start of the FASTA/FASTQ data itself.
    let mut first_two_bytes = [0; 2];
    reader
        .read_exact(&mut first_two_bytes)
        .map_err(|e| match e.kind() {
            // Fewer than two bytes available: that is what "empty file" means here
            io::ErrorKind::UnexpectedEof => ParseError::new_empty_file(),
            // Any other failure is a genuine I/O problem and must not be reported as an
            // empty input
            _ => ParseError::from(e),
        })?;

    // Put the peeked bytes back in front of the remaining stream
    let first_two_cursor = Cursor::new(first_two_bytes);
    let new_reader = first_two_cursor.chain(reader);

    match first_two_bytes {
        #[cfg(feature = "compression")]
        GZ_MAGIC => {
            let mut gz_reader = MultiGzDecoder::new(new_reader);
            // Peek at the first decompressed byte to select the parser
            let mut first = [0; 1];
            gz_reader.read_exact(&mut first)?;
            let r = Cursor::new(first).chain(gz_reader);
            get_fastx_reader(r, first[0])
        }
        #[cfg(feature = "compression")]
        BZ_MAGIC => {
            let mut bz_reader = BzDecoder::new(new_reader);
            let mut first = [0; 1];
            bz_reader.read_exact(&mut first)?;
            let r = Cursor::new(first).chain(bz_reader);
            get_fastx_reader(r, first[0])
        }
        #[cfg(feature = "compression")]
        XZ_MAGIC => {
            let mut xz_reader = XzDecoder::new(new_reader);
            let mut first = [0; 1];
            xz_reader.read_exact(&mut first)?;
            let r = Cursor::new(first).chain(xz_reader);
            get_fastx_reader(r, first[0])
        }
        _ => get_fastx_reader(new_reader, first_two_bytes[0]),
    }
}

/// The main entry point of needletail if you're reading from stdin.
/// Shortcut to calling `parse_fastx_reader` with `stdin()`
pub fn parse_fastx_stdin() -> Result<Box<dyn FastxReader>, ParseError> {
    let stdin = stdin();
    parse_fastx_reader(stdin)
}

/// The main entry point of needletail if you're reading from a file.
/// Shortcut to calling `parse_fastx_reader` with a file pub fn parse_fastx_file>(path: P) -> Result, ParseError> { parse_fastx_reader(File::open(&path)?) } pub use record::{mask_header_tabs, mask_header_utf8, write_fasta, write_fastq, SequenceRecord}; use std::io; pub use utils::{Format, LineEnding}; #[cfg(test)] mod test { use crate::errors::ParseErrorKind; use crate::parse_fastx_reader; #[test] fn test_empty_file_raises_parser_error_of_same_kind() { let reader = "".as_bytes(); let actual = parse_fastx_reader(reader); assert!(actual.is_err()); let actual_err = actual.err().unwrap().kind; let expected_err = ParseErrorKind::EmptyFile; assert_eq!(actual_err, expected_err); } #[test] fn test_only_one_byte_in_file_raises_empty_file_error() { let reader = "@".as_bytes(); let actual = parse_fastx_reader(reader); assert!(actual.is_err()); let actual_err = actual.err().unwrap().kind; let expected_err = ParseErrorKind::EmptyFile; assert_eq!(actual_err, expected_err); } } needletail-0.5.1/src/parser/record.rs000064400000000000000000000147021046102023000156500ustar 00000000000000use std::borrow::Cow; use std::io::Write; use memchr::memchr; use crate::errors::ParseError; use crate::parser::fasta::BufferPosition as FastaBufferPosition; use crate::parser::fastq::BufferPosition as FastqBufferPosition; use crate::parser::utils::{Format, LineEnding, Position}; use crate::Sequence; #[derive(Debug, Clone)] enum BufferPositionKind<'a> { Fasta(&'a FastaBufferPosition), Fastq(&'a FastqBufferPosition), } /// A FASTA or FASTQ record #[derive(Debug, Clone)] pub struct SequenceRecord<'a> { buffer: &'a [u8], buf_pos: BufferPositionKind<'a>, position: &'a Position, line_ending: LineEnding, } impl<'a> SequenceRecord<'a> { pub(crate) fn new_fasta( buffer: &'a [u8], buf_pos: &'a FastaBufferPosition, position: &'a Position, line_ending: Option, ) -> Self { Self { buffer, position, buf_pos: BufferPositionKind::Fasta(buf_pos), line_ending: line_ending.unwrap_or(LineEnding::Unix), } } pub(crate) fn 
new_fastq( buffer: &'a [u8], buf_pos: &'a FastqBufferPosition, position: &'a Position, line_ending: Option, ) -> Self { Self { buffer, position, buf_pos: BufferPositionKind::Fastq(buf_pos), line_ending: line_ending.unwrap_or(LineEnding::Unix), } } /// Returns the format of the record #[inline] pub fn format(&self) -> Format { match self.buf_pos { BufferPositionKind::Fasta(_) => Format::Fasta, BufferPositionKind::Fastq(_) => Format::Fastq, } } /// Returns the id of the record #[inline] pub fn id(&self) -> &[u8] { match self.buf_pos { BufferPositionKind::Fasta(bp) => bp.id(self.buffer), BufferPositionKind::Fastq(bp) => bp.id(self.buffer), } } /// Returns the raw sequence of the record. Only matters for FASTA since it can contain /// newlines. #[inline] pub fn raw_seq(&self) -> &[u8] { match self.buf_pos { BufferPositionKind::Fasta(bp) => bp.raw_seq(self.buffer), BufferPositionKind::Fastq(bp) => bp.seq(self.buffer), } } /// Returns the cleaned up sequence of the record. For FASTQ it is the same as `raw_seq` but /// for FASTA it is `raw_seq` minus all the `\r\n` pub fn seq(&self) -> Cow<[u8]> { match self.buf_pos { BufferPositionKind::Fasta(bp) => bp.seq(self.buffer), BufferPositionKind::Fastq(bp) => bp.seq(self.buffer).into(), } } /// Returns the quality line if there is one. /// Always `None` for FASTA and `Some` for FASTQ, even if the quality line is empty. #[inline] pub fn qual(&self) -> Option<&[u8]> { match self.buf_pos { BufferPositionKind::Fasta(_) => None, BufferPositionKind::Fastq(bp) => Some(bp.qual(self.buffer)), } } /// Returns the full sequence, including line endings. This doesn't include a trailing newline. #[inline] pub fn all(&self) -> &[u8] { match self.buf_pos { BufferPositionKind::Fasta(bp) => bp.all(self.buffer), BufferPositionKind::Fastq(bp) => bp.all(self.buffer), } } /// Return the number of bases in the sequence, computed efficiently. 
#[inline] pub fn num_bases(&self) -> usize { match self.buf_pos { BufferPositionKind::Fasta(bp) => bp.num_bases(self.buffer), BufferPositionKind::Fastq(bp) => bp.num_bases(self.buffer), } } /// Return the line number in the file of the start of the sequence pub fn start_line_number(&self) -> u64 { self.position.line } /// Which line ending is this record using? pub fn line_ending(&self) -> LineEnding { self.line_ending } /// Write record back to a `Write` instance. By default it will use the original line ending but /// you can force it to use another one. pub fn write( &self, writer: &mut dyn Write, forced_line_ending: Option, ) -> Result<(), ParseError> { match self.buf_pos { BufferPositionKind::Fasta(_) => write_fasta( self.id(), self.raw_seq(), writer, forced_line_ending.unwrap_or(self.line_ending), ), BufferPositionKind::Fastq(_) => write_fastq( self.id(), self.raw_seq(), self.qual(), writer, forced_line_ending.unwrap_or(self.line_ending), ), } } } impl<'a> Sequence<'a> for SequenceRecord<'a> { fn sequence(&'a self) -> &'a [u8] { self.raw_seq() } } /// Mask tabs in header lines to `|`s pub fn mask_header_tabs(id: &[u8]) -> Option> { memchr(b'\t', id).map(|_| { id.iter() .map(|x| if *x == b'\t' { b'|' } else { *x }) .collect() }) } /// Convert bad UTF8 characters into �s pub fn mask_header_utf8(id: &[u8]) -> Option> { // this may potentially change the length of the id; we should probably // be doing something trickier like converting match String::from_utf8_lossy(id) { Cow::Owned(s) => Some(s.into_bytes()), Cow::Borrowed(_) => None, } } /// Write a FASTA record pub fn write_fasta( id: &[u8], seq: &[u8], writer: &mut dyn Write, line_ending: LineEnding, ) -> Result<(), ParseError> { let ending = line_ending.to_bytes(); writer.write_all(b">")?; writer.write_all(id)?; writer.write_all(&ending)?; writer.write_all(seq)?; writer.write_all(&ending)?; Ok(()) } pub fn write_fastq( id: &[u8], seq: &[u8], qual: Option<&[u8]>, writer: &mut dyn Write, line_ending: 
LineEnding, ) -> Result<(), ParseError> { let ending = line_ending.to_bytes(); writer.write_all(b"@")?; writer.write_all(id)?; writer.write_all(&ending)?; writer.write_all(seq)?; writer.write_all(&ending)?; writer.write_all(b"+")?; writer.write_all(&ending)?; // this is kind of a hack, but we want to allow writing out sequences // that don't have qualitys so this will mask to "good" if the quality // slice is empty if let Some(qual) = qual { writer.write_all(qual)?; } else { writer.write_all(&vec![b'I'; seq.len()])?; } writer.write_all(&ending)?; Ok(()) } needletail-0.5.1/src/parser/utils.rs000064400000000000000000000066631046102023000155410ustar 00000000000000use std::io; use memchr::memchr; use crate::errors::ParseError; use crate::parser::record::SequenceRecord; pub(crate) const BUFSIZE: usize = 64 * 1024; /// Remove a final '\r' from a byte slice #[inline] pub(crate) fn trim_cr(line: &[u8]) -> &[u8] { if let Some((&b'\r', remaining)) = line.split_last() { remaining } else { line } } /// Standard buffer policy: buffer size /// doubles until it reaches 8 MiB. Above, it will /// increase in steps of 8 MiB. Buffer size is not limited, /// it could theoretically grow indefinitely. 
pub(crate) fn grow_to(current_size: usize) -> usize { if current_size < 1 << 23 { current_size * 2 } else { current_size + (1 << 23) } } /// Makes sure the buffer is full after this call (unless EOF reached) /// code adapted from `io::Read::read_exact` pub(crate) fn fill_buf(reader: &mut buffer_redux::BufReader) -> io::Result where R: io::Read, { let initial_size = reader.buffer().len(); let mut num_read = 0; while initial_size + num_read < reader.capacity() { match reader.read_into_buf() { Ok(0) => break, Ok(n) => num_read += n, Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } } Ok(num_read) } /// Holds line number and byte offset of our current state in a parser #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub struct Position { pub(crate) line: u64, pub(crate) byte: u64, } impl Position { pub fn new(line: u64, byte: u64) -> Position { Position { line, byte } } /// Line number (starting with 1) pub fn line(&self) -> u64 { self.line } /// Byte offset within the file pub fn byte(&self) -> u64 { self.byte } } /// FASTA or FASTQ? #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub enum Format { Fasta, Fastq, } impl Format { pub fn start_char(&self) -> char { match self { Format::Fasta => '>', Format::Fastq => '@', } } } /// Whether it uses \r\n or only \n #[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)] pub enum LineEnding { Windows, Unix, } impl LineEnding { pub fn to_bytes(&self) -> Vec { match self { LineEnding::Windows => vec![b'\r', b'\n'], LineEnding::Unix => vec![b'\n'], } } } pub fn find_line_ending(bytes: &[u8]) -> Option { if !bytes.is_empty() { if let Some(idx) = memchr(b'\n', bytes) { if idx > 0 && bytes[idx - 1] == b'\r' { return Some(LineEnding::Windows); } else { return Some(LineEnding::Unix); } } } None } /// The main trait, iterator-like, that the FASTA and FASTQ readers implement pub trait FastxReader: Send { /// Gets the next record in the stream. 
/// This imitates the Iterator API but does not support any iterator functions. /// This returns None once we reached the EOF. fn next(&mut self) -> Option>; /// Returns the current line/byte in the stream we are reading from fn position(&self) -> &Position; /// Returns whether the current stream uses Windows or Unix style line endings /// It is `None` only before calling `next`, once `next` has been called it will always /// return a line ending. fn line_ending(&self) -> Option; } needletail-0.5.1/src/python.rs000064400000000000000000000070751046102023000144240ustar 00000000000000//! Python bindings for needletail use std::io::Cursor; use pyo3::prelude::*; use pyo3::{create_exception, wrap_pyfunction}; use crate::sequence::{complement, normalize}; use crate::{ parse_fastx_file as rs_parse_fastx_file, parse_fastx_reader, parser::SequenceRecord, FastxReader, }; create_exception!(needletail, NeedletailError, pyo3::exceptions::PyException); // Avoid some boilerplate with the error handling macro_rules! py_try { ($call:expr) => { $call.map_err(|e| PyErr::new::(format!("{}", e)))? 
}; } #[pyclass] pub struct PyFastxReader { reader: Box, } #[pymethods] impl PyFastxReader { fn __repr__(&self) -> PyResult { Ok("".to_string()) } fn __iter__(slf: PyRefMut, py: Python<'_>) -> PyResult { Ok(FastxReaderIterator { t: slf.into_py(py) }) } } #[pyclass] pub struct Record { #[pyo3(get)] id: String, #[pyo3(get)] seq: String, #[pyo3(get)] qual: Option, } impl Record { fn from_sequence_record(rec: &SequenceRecord) -> Self { Self { id: String::from_utf8_lossy(rec.id()).to_string(), seq: String::from_utf8_lossy(&rec.seq()).to_string(), qual: rec.qual().map(|q| String::from_utf8_lossy(q).to_string()), } } } #[pymethods] impl Record { pub fn is_fasta(&self) -> PyResult { Ok(self.qual.is_none()) } pub fn is_fastq(&self) -> PyResult { Ok(self.qual.is_some()) } pub fn normalize(&mut self, iupac: bool) -> PyResult<()> { if let Some(s) = normalize(self.seq.as_bytes(), iupac) { self.seq = String::from_utf8_lossy(&s).to_string(); } Ok(()) } } #[pyclass] pub struct FastxReaderIterator { t: PyObject, } #[pymethods] impl FastxReaderIterator { fn __next__(slf: PyRef, py: Python<'_>) -> PyResult> { let mut parser: PyRefMut = slf.t.extract(py)?; if let Some(rec) = parser.reader.next() { let record = py_try!(rec); Ok(Some(Record::from_sequence_record(&record))) } else { Ok(None) } } } // TODO: what would be really nice is to detect the type of pyobject so it would on file object etc // not for initial release though #[pyfunction] fn parse_fastx_file(path: &str) -> PyResult { let reader = py_try!(rs_parse_fastx_file(path)); Ok(PyFastxReader { reader }) } #[pyfunction] fn parse_fastx_string(content: &str) -> PyResult { let reader = py_try!(parse_fastx_reader(Cursor::new(content.to_owned()))); Ok(PyFastxReader { reader }) } #[pyfunction] pub fn normalize_seq(seq: &str, iupac: bool) -> PyResult { if let Some(s) = normalize(seq.as_bytes(), iupac) { Ok(String::from_utf8_lossy(&s).to_string()) } else { Ok(seq.to_owned()) } } #[pyfunction] pub fn reverse_complement(seq: &str) -> 
String { let comp: Vec = seq .as_bytes() .iter() .rev() .map(|n| complement(*n)) .collect(); String::from_utf8_lossy(&comp).to_string() } #[pymodule] fn needletail(py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_wrapped(wrap_pyfunction!(parse_fastx_file))?; m.add_wrapped(wrap_pyfunction!(parse_fastx_string))?; m.add_wrapped(wrap_pyfunction!(normalize_seq))?; m.add_wrapped(wrap_pyfunction!(reverse_complement))?; m.add("NeedletailError", py.get_type::())?; Ok(()) } needletail-0.5.1/src/sequence.rs000064400000000000000000000303171046102023000147060ustar 00000000000000//! Generic functions for working with (primarily nucleic acid) sequences use std::borrow::Cow; use memchr::memchr2; use crate::bitkmer::BitNuclKmer; use crate::kmer::{CanonicalKmers, Kmers}; /// Transform a nucleic acid sequence into its "normalized" form. /// /// The normalized form is: /// - only AGCTN and possibly - (for gaps) /// - strip out any whitespace or line endings /// - lowercase versions of these are uppercased /// - U is converted to T (make everything a DNA sequence) /// - some other punctuation is converted to gaps /// - IUPAC bases may be converted to N's depending on the parameter passed in /// - everything else is considered a N pub fn normalize(seq: &[u8], allow_iupac: bool) -> Option> { let mut buf: Vec = Vec::with_capacity(seq.len()); let mut changed: bool = false; for n in seq.iter() { let (new_char, char_changed) = match (*n, allow_iupac) { c @ (b'A', _) | c @ (b'C', _) | c @ (b'G', _) | c @ (b'T', _) | c @ (b'N', _) | c @ (b'-', _) => (c.0, false), (b'a', _) => (b'A', true), (b'c', _) => (b'C', true), (b'g', _) => (b'G', true), // normalize uridine to thymine (b't', _) | (b'u', _) | (b'U', _) => (b'T', true), // normalize gaps (b'.', _) | (b'~', _) => (b'-', true), // logic for IUPAC bases (a little messy) c @ (b'B', true) | c @ (b'D', true) | c @ (b'H', true) | c @ (b'V', true) | c @ (b'R', true) | c @ (b'Y', true) | c @ (b'S', true) | c @ (b'W', true) | c @ 
(b'K', true) | c @ (b'M', true) => (c.0, false), (b'b', true) => (b'B', true), (b'd', true) => (b'D', true), (b'h', true) => (b'H', true), (b'v', true) => (b'V', true), (b'r', true) => (b'R', true), (b'y', true) => (b'Y', true), (b's', true) => (b'S', true), (b'w', true) => (b'W', true), (b'k', true) => (b'K', true), (b'm', true) => (b'M', true), // remove all whitespace and line endings (b' ', _) | (b'\t', _) | (b'\r', _) | (b'\n', _) => (b' ', true), // everything else is an N _ => (b'N', true), }; changed = changed || char_changed; if new_char != b' ' { buf.push(new_char); } } if changed { Some(buf) } else { None } } /// Returns the complementary base for a given IUPAC base code. /// /// Does not work for RNA sequences (maybe we should raise an error or something?) #[inline] pub fn complement(n: u8) -> u8 { match n { b'a' => b't', b'A' => b'T', b'c' => b'g', b'C' => b'G', b'g' => b'c', b'G' => b'C', b't' => b'a', b'T' => b'A', // IUPAC codes b'r' => b'y', b'y' => b'r', b'k' => b'm', b'm' => b'k', b'b' => b'v', b'v' => b'b', b'd' => b'h', b'h' => b'd', b's' => b's', b'w' => b'w', b'R' => b'Y', b'Y' => b'R', b'K' => b'M', b'M' => b'K', b'B' => b'V', b'V' => b'B', b'D' => b'H', b'H' => b'D', b'S' => b'S', b'W' => b'W', // anything else just pass through // 'u' | 'U' => panic!("Does not support complements of U"), x => x, } } /// Taking in a sequence string, return the canonical form of the sequence /// (e.g. 
the lexigraphically lowest of either the original sequence or its /// reverse complement) pub fn canonical(seq: &[u8]) -> Cow<[u8]> { let mut buf: Vec = Vec::with_capacity(seq.len()); // enough just keeps our comparisons from happening after they need to let mut enough = false; let mut original_was_canonical = false; // loop through the kmer and its reverse complement simultaneously for (rn, n) in seq.iter().rev().map(|n| complement(*n)).zip(seq.iter()) { buf.push(rn); if !enough && (*n < rn) { original_was_canonical = true; break; } else if !enough && (rn < *n) { enough = true; } // unstated if branch: if rn == n, keep comparing } match (original_was_canonical, enough) { (true, true) => panic!("Bug: should never set original_was_canonical if enough == true"), (true, false) => seq.into(), (false, true) => buf.into(), // the sequences were completely equal, return the ref (false, false) => seq.into(), } } /// Find the lexigraphically smallest substring of `seq` of length `length` /// /// There's probably a faster algorithm for this somewhere... pub fn minimizer(seq: &[u8], length: usize) -> Cow<[u8]> { let reverse_complement: Vec = seq.iter().rev().map(|n| complement(*n)).collect(); let mut minmer = Cow::Borrowed(&seq[..length]); for (kmer, rc_kmer) in seq.windows(length).zip(reverse_complement.windows(length)) { if *kmer < minmer[..] { minmer = kmer.into(); } if *rc_kmer < minmer[..] { minmer = rc_kmer.to_vec().into(); } } minmer } /// A generic FASTX record that also abstracts over several logical operations /// that can be performed on nucleic acid sequences. pub trait Sequence<'a> { fn sequence(&'a self) -> &'a [u8]; /// Remove newlines from the sequence; this handles `\r`, `\n`, and `\r\n` /// and removes internal newlines in addition to ones at the end. /// Primarily used for FASTA multiline records, but can also help process /// (the much rarer) multiline FASTQs. Always use before iteration methods /// below to ensure no newlines are being returned with e.g. 
`.kmers`. /// If you are using `normalize`, you do not need to call this function directly. fn strip_returns(&'a self) -> Cow<'a, [u8]> { let seq = self.sequence(); // first part is a fast check to see if we need to do any allocations let mut i; match memchr2(b'\r', b'\n', seq) { Some(break_loc) => i = break_loc, None => return seq.into(), } // we found a newline; create a new buffer and stripping out newlines // and writing into it let mut new_buf = Vec::with_capacity(seq.len() - 1); new_buf.extend_from_slice(&seq[..i]); while i < seq.len() { match memchr2(b'\r', b'\n', &seq[i..]) { None => { new_buf.extend_from_slice(&seq[i..]); break; } Some(match_pos) => { new_buf.extend_from_slice(&seq[i..i + match_pos]); i += match_pos + 1; } } } new_buf.into() } /// Returns the reverse complement of a sequence. Biologically this is /// equivalent to the sequence of the strand opposite the one you pass /// in. /// /// ``` /// use needletail::Sequence; /// /// assert_eq!(b"AACC".reverse_complement(), b"GGTT"); /// ``` fn reverse_complement(&'a self) -> Vec { self.sequence() .iter() .rev() .map(|n| complement(*n)) .collect() } /// [Nucleic Acids] Normalizes the sequence. See documentation for /// `needletail::sequence::normalize`. Do not use on amino acid /// sequences. Note that this returns a Cow so you may have to coerce /// to a Vec or &[u8] as necessary. 
/// /// ``` /// use needletail::Sequence; /// /// // IUPAC bases are coerced to N's if `false` /// assert_eq!(b"ADGH".normalize(false).as_ref(), b"ANGN"); /// // otherwise they're preserved /// assert_eq!(b"ADGH".normalize(true).as_ref(), b"ADGH"); /// /// // Uridine residues are converted to thymidine /// assert_eq!(b"ACGU".normalize(true).as_ref(), b"ACGT"); /// ``` fn normalize(&'a self, iupac: bool) -> Cow<'a, [u8]> { if let Some(s) = normalize(self.sequence(), iupac) { s.into() } else { self.sequence().into() } } /// [Nucleic Acids] Returns an iterator over the sequence that skips /// non-ACGT bases and returns a tuple containing (position, the /// canonicalized kmer, if the sequence is the complement of the original). fn canonical_kmers(&'a self, k: u8, reverse_complement: &'a [u8]) -> CanonicalKmers<'a> { CanonicalKmers::new(self.sequence(), reverse_complement, k) } /// Returns an iterator that returns a sliding window of k-sized /// sequences (k-mers). Does not skip whitespace or correct bases in the /// original sequence so `.normalize` or `.strip_returns` may be /// appropriate to use first. fn kmers(&'a self, k: u8) -> Kmers<'a> { Kmers::new(self.sequence(), k) } /// Return an iterator that returns valid kmers in 4-bit form fn bit_kmers(&'a self, k: u8, canonical: bool) -> BitNuclKmer<'a> { BitNuclKmer::new(self.sequence(), k, canonical) } } impl<'a> Sequence<'a> for &'a [u8] { fn sequence(&'a self) -> &'a [u8] { self } } impl<'a> Sequence<'a> for [u8] { fn sequence(&'a self) -> &'a [u8] { self } } impl<'a> Sequence<'a> for Cow<'a, [u8]> { fn sequence(&'a self) -> &'a [u8] { self } } /// [⚠️Unstable] A trait to wrap over sequence data that has associated /// quality information. 
/// /// Will be stabilized once we figure out a good way to handle sequences that /// have _optional_ quality information (like SequenceRecord) because the /// return trait requires a slice from an immutable reference and /// SequenceRecords can't return that without modifying themselves. pub trait QualitySequence<'a>: Sequence<'a> { fn quality(&'a self) -> &'a [u8]; /// Given a SeqRecord and a quality cutoff, mask out low-quality bases with /// `N` characters. fn quality_mask(&'a self, score: u8) -> Cow<'a, [u8]> { let qual = self.quality(); // could maybe speed this up by doing a copy of base and then // iterating though qual and masking? let seq: Vec = self .sequence() .iter() .zip(qual.iter()) .map(|(base, qual)| if *qual < score { b'N' } else { *base }) .collect(); seq.into() } } impl<'a> Sequence<'a> for (&'a [u8], &'a [u8]) { fn sequence(&'a self) -> &'a [u8] { self.0 } } impl<'a> QualitySequence<'a> for (&'a [u8], &'a [u8]) { fn quality(&'a self) -> &'a [u8] { self.1 } } #[cfg(test)] mod tests { use super::*; #[test] fn test_normalize() { assert_eq!(normalize(b"ACGTU", false), Some(b"ACGTT".to_vec())); assert_eq!(normalize(b"acgtu", false), Some(b"ACGTT".to_vec())); assert_eq!(normalize(b"N.N-N~N N", false), Some(b"N-N-N-NN".to_vec())); assert_eq!(normalize(b"BDHVRYSWKM", true), None); assert_eq!(normalize(b"bdhvryswkm", true), Some(b"BDHVRYSWKM".to_vec())); assert_eq!( normalize(b"BDHVRYSWKM", false), Some(b"NNNNNNNNNN".to_vec()) ); assert_eq!( normalize(b"bdhvryswkm", false), Some(b"NNNNNNNNNN".to_vec()) ); } #[test] fn test_complement() { assert_eq!(complement(b'a'), b't'); assert_eq!(complement(b'c'), b'g'); assert_eq!(complement(b'g'), b'c'); assert_eq!(complement(b'n'), b'n'); } #[test] fn can_canonicalize() { assert_eq!(canonical(b"A"), Cow::Borrowed(b"A")); assert_eq!(canonical(b"T"), Cow::Owned::<[u8]>(b"A".to_vec())); assert_eq!(canonical(b"AAGT"), Cow::Borrowed(b"AAGT")); assert_eq!(canonical(b"ACTT"), Cow::Owned::<[u8]>(b"AAGT".to_vec())); 
assert_eq!(canonical(b"GC"), Cow::Borrowed(b"GC")); } #[test] fn can_minimize() { let minmer = minimizer(&b"ATTTCG"[..], 3); assert_eq!(&minmer[..], b"AAA"); } #[test] fn test_quality_mask() { let seq_rec = (&b"AGCT"[..], &b"AAA0"[..]); let filtered_rec = seq_rec.quality_mask(b'5'); assert_eq!(&filtered_rec[..], &b"AGCN"[..]); } }