tendril-0.4.3/.cargo_vcs_info.json0000644000000001360000000000100125010ustar { "git": { "sha1": "f882b602039a56dd00927461ed9631c6259ed67f" }, "path_in_vcs": "" }tendril-0.4.3/.github/workflows/main.yml000064400000000000000000000020150072674642500163630ustar 00000000000000name: CI on: push: branches: [auto] pull_request: workflow_dispatch: jobs: linux-ci: name: Linux runs-on: ubuntu-latest strategy: matrix: toolchain: ["stable", "beta", "nightly", "1.36.0"] steps: - uses: actions/checkout@v2 - name: Install toolchain uses: actions-rs/toolchain@v1 with: profile: minimal toolchain: ${{ matrix.toolchain }} override: true - name: Cargo build run: cargo build - name: Cargo doc run: cargo doc - name: Cargo test run: cargo test --features 'encoding encoding_rs' - name: Cargo bench if: matrix.toolchain == 'nightly' run: cargo test --features bench build_result: name: homu build finished runs-on: ubuntu-latest needs: - "linux-ci" steps: - name: Mark the job as successful run: exit 0 if: success() - name: Mark the job as unsuccessful run: exit 1 if: "!success()" tendril-0.4.3/.gitignore000064400000000000000000000000220072674642500133030ustar 00000000000000target Cargo.lock tendril-0.4.3/Cargo.lock0000644000000123370000000000100104620ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "encoding" version = "0.2.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" dependencies = [ "encoding-index-japanese", "encoding-index-korean", "encoding-index-simpchinese", "encoding-index-singlebyte", "encoding-index-tradchinese", ] [[package]] name = "encoding-index-japanese" version = "1.20141219.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" dependencies = [ "encoding_index_tests", ] [[package]] name = "encoding-index-korean" version = "1.20141219.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" dependencies = [ "encoding_index_tests", ] [[package]] name = "encoding-index-simpchinese" version = "1.20141219.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" dependencies = [ "encoding_index_tests", ] [[package]] name = "encoding-index-singlebyte" version = "1.20141219.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" dependencies = [ "encoding_index_tests", ] [[package]] name = "encoding-index-tradchinese" version = "1.20141219.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" dependencies = [ "encoding_index_tests", ] [[package]] name = "encoding_index_tests" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" [[package]] name = "encoding_rs" version = "0.8.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7896dc8abb250ffdda33912550faa54c88ec8b998dec0b2c55ab224921ce11df" dependencies = [ "cfg-if", ] [[package]] name = "fuchsia-cprng" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" [[package]] name = "futf" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" dependencies = [ "mac", "new_debug_unreachable", ] [[package]] name = "libc" version = "0.2.121" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efaa7b300f3b5fe8eb6bf21ce3895e1751d9665086af2d64b42f19701015ff4f" [[package]] name = "mac" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "new_debug_unreachable" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" [[package]] name = "rand" version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" dependencies = [ "fuchsia-cprng", "libc", "rand_core 0.3.1", "rdrand", "winapi", ] [[package]] name = "rand_core" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" dependencies = [ "rand_core 0.4.2", ] [[package]] name = "rand_core" version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" [[package]] name = "rdrand" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" dependencies = [ "rand_core 0.3.1", ] [[package]] name = "tendril" version = "0.4.3" dependencies = [ "encoding", "encoding_rs", "futf", "mac", "rand", "utf-8", ] [[package]] name = "utf-8" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" tendril-0.4.3/Cargo.toml0000644000000021220000000000100104740ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). 
# See Cargo.toml.orig for the original contents. [package] name = "tendril" version = "0.4.3" authors = ["Keegan McAllister ", "Simon Sapin ", "Chris Morgan "] description = "Compact buffer/string type for zero-copy parsing" readme = "README.md" license = "MIT/Apache-2.0" repository = "https://github.com/servo/tendril" [dependencies.encoding] version = "0.2" optional = true [dependencies.encoding_rs] version = "0.8.12" optional = true [dependencies.futf] version = "0.1.5" [dependencies.mac] version = "0.1" [dependencies.utf-8] version = "0.7" [dev-dependencies.rand] version = "0.4" [features] bench = [] tendril-0.4.3/Cargo.toml.orig000064400000000000000000000011010072674642500142010ustar 00000000000000[package] name = "tendril" version = "0.4.3" authors = ["Keegan McAllister ", "Simon Sapin ", "Chris Morgan "] repository = "https://github.com/servo/tendril" readme = "README.md" license = "MIT/Apache-2.0" description = "Compact buffer/string type for zero-copy parsing" [dependencies] mac = "0.1" encoding = {version = "0.2", optional = true} encoding_rs = {version = "0.8.12", optional = true} futf = "0.1.5" utf-8 = "0.7" [dev-dependencies] rand = "0.4" [features] bench = [] tendril-0.4.3/LICENSE-APACHE000064400000000000000000000251370072674642500132550ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. tendril-0.4.3/LICENSE-MIT000064400000000000000000000020450072674642500127560ustar 00000000000000Copyright (c) 2015 Keegan McAllister Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. tendril-0.4.3/README.md000064400000000000000000000107020072674642500126000ustar 00000000000000# tendril **Warning**: This library is at a very early stage of development, and it contains a substantial amount of `unsafe` code. Use at your own risk! [![Build Status](https://github.com/servo/tendril/workflows/CI/badge.svg)](https://github.com/servo/tendril/actions) [API Documentation](https://doc.servo.org/tendril/index.html) ## Introduction `Tendril` is a compact string/buffer type, optimized for zero-copy parsing. Tendrils have the semantics of owned strings, but are sometimes views into shared buffers. When you mutate a tendril, an owned copy is made if necessary. Further mutations occur in-place until the string becomes shared, e.g. with `clone()` or `subtendril()`. Buffer sharing is accomplished through thread-local (non-atomic) reference counting, which has very low overhead. The Rust type system will prevent you at compile time from sending a tendril between threads. (See below for thoughts on relaxing this restriction.) Whereas `String` allocates in the heap for any non-empty string, `Tendril` can store small strings (up to 8 bytes) in-line, without a heap allocation. `Tendril` is also smaller than `String` on 64-bit platforms — 16 bytes versus 24. `Option` is the same size as `Tendril`, thanks to [`NonZero`][NonZero]. The maximum length of a tendril is 4 GB. The library will panic if you attempt to go over the limit. ## Formats and encoding `Tendril` uses [phantom types](https://doc.rust-lang.org/stable/rust-by-example/generics/phantom.html) to track a buffer's format. This determines at compile time which operations are available on a given tendril. For example, `Tendril` and `Tendril` can be borrowed as `&str` and `&[u8]` respectively. `Tendril` also integrates with [rust-encoding](https://github.com/lifthrasiir/rust-encoding) and has preliminary support for [WTF-8][] buffers. ## Plans for the future ### Ropes [html5ever][] will use `Tendril` as a zero-copy text representation. It would be good to preserve this all the way through to Servo's DOM. This would reduce memory consumption, and possibly speed up text shaping and painting. However, DOM text may conceivably be larger than 4 GB, and will anyway not be contiguous in memory around e.g. a character entity reference. *Solution:* Build a **[rope][] on top of these strings** and use that as Servo's representation of DOM text. We can perhaps do text shaping and/or painting in parallel for different chunks of a rope. html5ever can additionally use this rope type as a replacement for `BufferQueue`. Because the underlying buffers are reference-counted, the bulk of this rope is already a [persistent data structure][]. Consider what happens when appending two ropes to get a "new" rope. A vector-backed rope would copy a vector of small structs, one for each chunk, and would bump the corresponding refcounts. But it would not copy any of the string data. If we want more sharing, then a [2-3 finger tree][] could be a good choice. We would probably stick with `VecDeque` for ropes under a certain size. ### UTF-16 compatibility SpiderMonkey expects text to be in UCS-2 format for the most part. The semantics of JavaScript strings are difficult to implement on UTF-8. 
This also applies to HTML parsing via `document.write`. Also, passing SpiderMonkey a string that isn't contiguous in memory will incur additional overhead and complexity, if not a full copy. *Solution:* Use **WTF-8 in parsing** and in the DOM. Servo will **convert to contiguous UTF-16 when necessary**. The conversion can easily be parallelized, if we find a practical need to convert huge chunks of text all at once. ### Source span information Some html5ever API consumers want to know the originating location in the HTML source file(s) of each token or parse error. An example application would be a command-line HTML validator with diagnostic output similar to `rustc`'s. *Solution:* Accept **some metadata along with each input string**. The type of metadata is chosen by the API consumer; it defaults to `()`, which has size zero. For any non-inline string, we can provide the associated metadata as well as a byte offset. [NonZero]: https://doc.rust-lang.org/core/nonzero/struct.NonZero.html [html5ever]: https://github.com/servo/html5ever [WTF-8]: https://simonsapin.github.io/wtf-8/ [rope]: https://en.wikipedia.org/wiki/Rope_%28data_structure%29 [persistent data structure]: https://en.wikipedia.org/wiki/Persistent_data_structure [2-3 finger tree]: https://www.staff.city.ac.uk/~ross/papers/FingerTree.html tendril-0.4.3/examples/fuzz.rs000064400000000000000000000125440072674642500145110ustar 00000000000000// Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! A simple fuzz tester for the library. #![deny(warnings)] extern crate rand; extern crate tendril; use std::borrow::ToOwned; use rand::distributions::{IndependentSample, Range}; use rand::Rng; use tendril::StrTendril; fn fuzz() { let mut rng = rand::thread_rng(); let capacity = Range::new(0u32, 1 << 14).ind_sample(&mut rng); let mut buf_string = String::with_capacity(capacity as usize); let mut buf_tendril = StrTendril::with_capacity(capacity); let mut string_slices = vec![]; let mut tendril_slices = vec![]; for _ in 1..100_000 { if buf_string.len() > (1 << 30) { buf_string.truncate(0); buf_tendril.clear(); } let dist_action = Range::new(0, 100); match dist_action.ind_sample(&mut rng) { 0..=15 => { let (start, end) = random_slice(&mut rng, TEXT); let snip = &TEXT[start..end]; buf_string.push_str(snip); buf_tendril.push_slice(snip); assert_eq!(&*buf_string, &*buf_tendril); } 16..=31 => { let (start, end) = random_slice(&mut rng, &buf_string); let snip = &buf_string[start..end].to_owned(); buf_string.push_str(&snip); buf_tendril.push_slice(&snip); assert_eq!(&*buf_string, &*buf_tendril); } 32..=47 => { let lenstr = format!("[length = {}]", buf_tendril.len()); buf_string.push_str(&lenstr); buf_tendril.push_slice(&lenstr); assert_eq!(&*buf_string, &*buf_tendril); } 48..=63 => { let n = random_boundary(&mut rng, &buf_string); buf_tendril.pop_front(n as u32); buf_string = buf_string[n..].to_owned(); assert_eq!(&*buf_string, &*buf_tendril); } 64..=79 => { let new_len = random_boundary(&mut rng, &buf_string); let n = buf_string.len() - new_len; buf_string.truncate(new_len); buf_tendril.pop_back(n as u32); assert_eq!(&*buf_string, &*buf_tendril); } 80..=90 => { let (start, end) = random_slice(&mut rng, &buf_string); buf_string = buf_string[start..end].to_owned(); buf_tendril = buf_tendril.subtendril(start as u32, (end - start) as u32); assert_eq!(&*buf_string, &*buf_tendril); } 91..=96 => { let c = rng.gen(); 
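// Mirror the same random char into both buffers, then re-check that they agree.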
buf_string.push(c); assert!(buf_tendril.try_push_char(c).is_ok()); assert_eq!(&*buf_string, &*buf_tendril); } 97 => { buf_string.truncate(0); buf_tendril.clear(); assert_eq!(&*buf_string, &*buf_tendril); } _ => { let (start, end) = random_slice(&mut rng, &buf_string); string_slices.push(buf_string[start..end].to_owned()); tendril_slices.push(buf_tendril.subtendril(start as u32, (end - start) as u32)); assert_eq!(string_slices.len(), tendril_slices.len()); assert!(string_slices .iter() .zip(tendril_slices.iter()) .all(|(s, t)| **s == **t)); } } } } fn random_boundary(rng: &mut R, text: &str) -> usize { loop { let i = Range::new(0, text.len() + 1).ind_sample(rng); if text.is_char_boundary(i) { return i; } } } fn random_slice(rng: &mut R, text: &str) -> (usize, usize) { loop { let start = Range::new(0, text.len() + 1).ind_sample(rng); let end = Range::new(start, text.len() + 1).ind_sample(rng); if !text.is_char_boundary(start) { continue; } if end < text.len() && !text.is_char_boundary(end) { continue; } return (start, end); } } static TEXT: &'static str = "It was from the artists and poets that the pertinent answers came, and I \ know that panic would have broken loose had they been able to compare notes. \ As it was, lacking their original letters, I half suspected the compiler of \ having asked leading questions, or of having edited the correspondence in \ corroboration of what he had latently resolved to see.\ \ ˙ǝǝs oʇ pǝʌʃosǝɹ ʎʃʇuǝʇɐʃ pɐɥ ǝɥ ʇɐɥʍ ɟo uoıʇɐɹoqoɹɹoɔ uı ǝɔuǝpuodsǝɹɹoɔ ǝɥʇ \ pǝʇıpǝ ƃuıʌɐɥ ɟo ɹo 'suoıʇsǝnb ƃuıpɐǝʃ pǝʞsɐ ƃuıʌɐɥ ɟo ɹǝʃıdɯoɔ ǝɥʇ pǝʇɔǝdsns \ ɟʃɐɥ I 'sɹǝʇʇǝʃ ʃɐuıƃıɹo ɹıǝɥʇ ƃuıʞɔɐʃ 'sɐʍ ʇı s∀ ˙sǝʇou ǝɹɐdɯoɔ oʇ ǝʃqɐ uǝǝq \ ʎǝɥʇ pɐɥ ǝsooʃ uǝʞoɹq ǝʌɐɥ pʃnoʍ ɔıuɐd ʇɐɥʇ ʍouʞ I puɐ 'ǝɯɐɔ sɹǝʍsuɐ ʇuǝuıʇɹǝd \ ǝɥʇ ʇɐɥʇ sʇǝod puɐ sʇsıʇɹɐ ǝɥʇ ɯoɹɟ sɐʍ ʇI"; fn main() { fuzz(); } tendril-0.4.3/src/bench.rs000064400000000000000000000150360072674642500135420ustar 00000000000000// Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use std::borrow::ToOwned; use std::collections::hash_map::{Entry, HashMap}; use tendril::StrTendril; fn index_words_string(input: &String) -> HashMap> { let mut index = HashMap::new(); for word in input.split(|c| c == ' ') { if word.len() == 0 { continue; } let word = word.to_owned(); match index.entry(word.chars().next().unwrap()) { Entry::Occupied(mut e) => { let x: &mut Vec = e.get_mut(); x.push(word); } Entry::Vacant(e) => { e.insert(vec![word]); } } } index } fn index_words_tendril(input: &StrTendril) -> HashMap> { let mut index = HashMap::new(); let mut t = input.clone(); loop { match t.pop_front_char_run(|c| c != ' ') { None => return index, Some((_, false)) => (), Some((word, true)) => match index.entry(word.chars().next().unwrap()) { Entry::Occupied(mut e) => { e.get_mut().push(word); } Entry::Vacant(e) => { e.insert(vec![word]); } }, } } } static EN_1: &'static str = "Days turn to nights turn to paper into rocks into plastic"; static EN_2: &'static str = "Here the notes in my laboratory journal cease. I was able to write the last \ words only with great effort. By now it was already clear to me that LSD had \ been the cause of the remarkable experience of the previous Friday, for the \ altered perceptions were of the same type as before, only much more intense. I \ had to struggle to speak intelligibly. I asked my laboratory assistant, who was \ informed of the self-experiment, to escort me home. 
We went by bicycle, no \ automobile being available because of wartime restrictions on their use. On the \ way home, my condition began to assume threatening forms. Everything in my \ field of vision wavered and was distorted as if seen in a curved mirror. I also \ had the sensation of being unable to move from the spot. Nevertheless, my \ assistant later told me that we had traveled very rapidly. Finally, we arrived \ at home safe and sound, and I was just barely capable of asking my companion to \ summon our family doctor and request milk from the neighbors.\n\n\ In spite of my delirious, bewildered condition, I had brief periods of clear \ and effective thinking—and chose milk as a nonspecific antidote for poisoning."; static KR_1: &'static str = "러스트(Rust)는 모질라(mozilla.org)에서 개발하고 있는, 메모리-안전하고 병렬 \ 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. 아직 \ 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다."; static HTML_KR_1: &'static str = "

러스트(Rust)는 모질라(mozilla.org)에서 개발하고 있는, \ 메모리-안전하고 병렬 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. \ 아직 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다.

"; mod index_words { macro_rules! bench { ($txt:ident) => { #[allow(non_snake_case)] mod $txt { const SMALL_SIZE: usize = 65536; const LARGE_SIZE: usize = (1 << 20); #[bench] fn index_words_string(b: &mut ::test::Bencher) { let mut s = String::new(); while s.len() < SMALL_SIZE { s.push_str(::tendril::bench::$txt); } b.iter(|| ::tendril::bench::index_words_string(&s)); } #[bench] fn index_words_tendril(b: &mut ::test::Bencher) { let mut t = ::tendril::StrTendril::new(); while t.len() < SMALL_SIZE { t.push_slice(::tendril::bench::$txt); } b.iter(|| ::tendril::bench::index_words_tendril(&t)); } #[bench] fn index_words_big_string(b: &mut ::test::Bencher) { let mut s = String::new(); while s.len() < LARGE_SIZE { s.push_str(::tendril::bench::$txt); } b.iter(|| ::tendril::bench::index_words_string(&s)); } #[bench] fn index_words_big_tendril(b: &mut ::test::Bencher) { let mut t = ::tendril::StrTendril::new(); while t.len() < LARGE_SIZE { t.push_slice(::tendril::bench::$txt); } b.iter(|| ::tendril::bench::index_words_tendril(&t)); } #[test] fn correctness() { use std::borrow::ToOwned; use tendril::bench::{index_words_string, index_words_tendril}; use tendril::SliceExt; let txt = ::tendril::bench::$txt; let input_string = txt.to_owned(); let count_s = index_words_string(&input_string); let mut keys: Vec = count_s.keys().cloned().collect(); keys.sort(); let input_tendril = txt.to_tendril(); let count_t = index_words_tendril(&input_tendril); let mut keys_t: Vec = count_t.keys().cloned().collect(); keys_t.sort(); assert_eq!(keys, keys_t); for k in &keys { let vs = &count_s[k]; let vt = &count_t[k]; assert_eq!(vs.len(), vt.len()); assert!(vs.iter().zip(vt.iter()).all(|(s, t)| **s == **t)); } } } }; } bench!(EN_1); bench!(EN_2); bench!(KR_1); bench!(HTML_KR_1); } tendril-0.4.3/src/buf32.rs000064400000000000000000000061570072674642500134100ustar 00000000000000// Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! Provides an unsafe owned buffer type, used in implementing `Tendril`. use std::{mem, ptr, slice, u32}; use OFLOW; pub const MIN_CAP: u32 = 16; pub const MAX_LEN: usize = u32::MAX as usize; /// A buffer points to a header of type `H`, which is followed by `MIN_CAP` or more /// bytes of storage. pub struct Buf32 { pub ptr: *mut H, pub len: u32, pub cap: u32, } #[inline(always)] fn bytes_to_vec_capacity(x: u32) -> usize { let header = mem::size_of::(); debug_assert!(header > 0); let x = (x as usize).checked_add(header).expect(OFLOW); // Integer ceil https://stackoverflow.com/a/2745086/1162888 1 + ((x - 1) / header) } impl Buf32 { #[inline] pub unsafe fn with_capacity(mut cap: u32, h: H) -> Buf32 { if cap < MIN_CAP { cap = MIN_CAP; } let mut vec = Vec::::with_capacity(bytes_to_vec_capacity::(cap)); let ptr = vec.as_mut_ptr(); mem::forget(vec); ptr::write(ptr, h); Buf32 { ptr: ptr, len: 0, cap: cap, } } #[inline] pub unsafe fn destroy(self) { mem::drop(Vec::from_raw_parts( self.ptr, 1, bytes_to_vec_capacity::(self.cap), )); } #[inline(always)] pub unsafe fn data_ptr(&self) -> *mut u8 { (self.ptr as *mut u8).offset(mem::size_of::() as isize) } #[inline(always)] pub unsafe fn data(&self) -> &[u8] { slice::from_raw_parts(self.data_ptr(), self.len as usize) } #[inline(always)] pub unsafe fn data_mut(&mut self) -> &mut [u8] { slice::from_raw_parts_mut(self.data_ptr(), self.len as usize) } /// Grow the capacity to at least `new_cap`. 
/// /// This will panic if the capacity calculation overflows `u32`. #[inline] pub unsafe fn grow(&mut self, new_cap: u32) { if new_cap <= self.cap { return; } let new_cap = new_cap.checked_next_power_of_two().expect(OFLOW); let mut vec = Vec::from_raw_parts(self.ptr, 0, bytes_to_vec_capacity::(self.cap)); vec.reserve_exact(bytes_to_vec_capacity::(new_cap)); self.ptr = vec.as_mut_ptr(); self.cap = new_cap; mem::forget(vec); } } #[cfg(test)] mod test { use super::Buf32; use std::ptr; #[test] fn smoke_test() { unsafe { let mut b = Buf32::with_capacity(0, 0u8); assert_eq!(b"", b.data()); b.grow(5); ptr::copy_nonoverlapping(b"Hello".as_ptr(), b.data_ptr(), 5); assert_eq!(b"", b.data()); b.len = 5; assert_eq!(b"Hello", b.data()); b.grow(1337); assert!(b.cap >= 1337); assert_eq!(b"Hello", b.data()); b.destroy(); } } } tendril-0.4.3/src/fmt.rs000064400000000000000000000324040072674642500132470ustar 00000000000000// Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! Marker types for formats. //! //! This module defines the types and traits used to mark a `Tendril` //! with the format of data it contains. It includes those formats //! for which `Tendril` supports at least some operations without //! conversion. //! //! To convert a string tendril to/from a byte tendril in an arbitrary //! character encoding, see the `encode` and `decode` methods on //! `Tendril`. //! //! `Tendril` operations may become memory-unsafe if data invalid for //! the format sneaks in. For that reason, these traits require //! `unsafe impl`. use std::default::Default; use std::{char, mem, str}; use futf::{self, Codepoint, Meaning}; /// Implementation details. /// /// You don't need these unless you are implementing /// a new format. pub mod imp { use std::default::Default; use std::{iter, mem, slice}; /// Describes how to fix up encodings when concatenating. /// /// We can drop characters on either side of the splice, /// and insert up to 4 bytes in the middle. pub struct Fixup { pub drop_left: u32, pub drop_right: u32, pub insert_len: u32, pub insert_bytes: [u8; 4], } impl Default for Fixup { #[inline(always)] fn default() -> Fixup { Fixup { drop_left: 0, drop_right: 0, insert_len: 0, insert_bytes: [0; 4], } } } #[inline(always)] unsafe fn from_u32_unchecked(n: u32) -> char { mem::transmute(n) } pub struct SingleByteCharIndices<'a> { inner: iter::Enumerate>, } impl<'a> Iterator for SingleByteCharIndices<'a> { type Item = (usize, char); #[inline] fn next(&mut self) -> Option<(usize, char)> { self.inner .next() .map(|(i, &b)| unsafe { (i, from_u32_unchecked(b as u32)) }) } } impl<'a> SingleByteCharIndices<'a> { #[inline] pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> { SingleByteCharIndices { inner: buf.iter().enumerate(), } } } } /// Trait for format marker types. /// /// The type implementing this trait is usually not instantiated. /// It's used with a phantom type parameter of `Tendril`. pub unsafe trait Format { /// Check whether the buffer is valid for this format. fn validate(buf: &[u8]) -> bool; /// Check whether the buffer is valid for this format. /// /// You may assume the buffer is a prefix of a valid buffer. #[inline] fn validate_prefix(buf: &[u8]) -> bool { ::validate(buf) } /// Check whether the buffer is valid for this format. /// /// You may assume the buffer is a suffix of a valid buffer. 
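///
/// The default implementation falls back to revalidating the entire buffer.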
#[inline] fn validate_suffix(buf: &[u8]) -> bool { ::validate(buf) } /// Check whether the buffer is valid for this format. /// /// You may assume the buffer is a contiguous subsequence /// of a valid buffer, but not necessarily a prefix or /// a suffix. #[inline] fn validate_subseq(buf: &[u8]) -> bool { ::validate(buf) } /// Compute any fixup needed when concatenating buffers. /// /// The default is to do nothing. /// /// The function is `unsafe` because it may assume the input /// buffers are already valid for the format. Also, no /// bounds-checking is performed on the return value! #[inline(always)] unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup { Default::default() } } /// Indicates that one format is a subset of another. /// /// The subset format can be converted to the superset format /// for free. pub unsafe trait SubsetOf: Format where Super: Format, { /// Validate the *other* direction of conversion; check if /// this buffer from the superset format conforms to the /// subset format. /// /// The default calls `Self::validate`, but some conversions /// may implement a check which is cheaper than validating /// from scratch. fn revalidate_subset(x: &[u8]) -> bool { Self::validate(x) } } /// Indicates a format which corresponds to a Rust slice type, /// representing exactly the same invariants. pub unsafe trait SliceFormat: Format + Sized { type Slice: ?Sized + Slice; } /// Indicates a format which contains characters from Unicode /// (all of it, or some proper subset). pub unsafe trait CharFormat<'a>: Format { /// Iterator for characters and their byte indices. type Iter: Iterator; /// Iterate over the characters of the string and their byte /// indices. /// /// You may assume the buffer is *already validated* for `Format`. unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter; /// Encode the character as bytes and pass them to a continuation. /// /// Returns `Err(())` iff the character cannot be represented. fn encode_char(ch: char, cont: F) -> Result<(), ()> where F: FnOnce(&[u8]); } /// Indicates a Rust slice type that is represented in memory as bytes. pub unsafe trait Slice { /// Access the raw bytes of the slice. fn as_bytes(&self) -> &[u8]; /// Convert a byte slice to this kind of slice. /// /// You may assume the buffer is *already validated* /// for `Format`. unsafe fn from_bytes(x: &[u8]) -> &Self; /// Convert a byte slice to this kind of slice. /// /// You may assume the buffer is *already validated* /// for `Format`. unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self; } /// Marker type for uninterpreted bytes. /// /// Validation will never fail for this format. #[derive(Copy, Clone, Default, Debug)] pub struct Bytes; unsafe impl Format for Bytes { #[inline(always)] fn validate(_: &[u8]) -> bool { true } } unsafe impl SliceFormat for Bytes { type Slice = [u8]; } unsafe impl Slice for [u8] { #[inline(always)] fn as_bytes(&self) -> &[u8] { self } #[inline(always)] unsafe fn from_bytes(x: &[u8]) -> &[u8] { x } #[inline(always)] unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] { x } } /// Marker type for ASCII text. 
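///
/// Every byte must be at most 0x7F. Any prefix, suffix, or subsequence of
/// valid ASCII is itself valid ASCII, so those checks are trivially true.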
#[derive(Copy, Clone, Default, Debug)] pub struct ASCII; unsafe impl Format for ASCII { #[inline] fn validate(buf: &[u8]) -> bool { buf.iter().all(|&n| n <= 127) } #[inline(always)] fn validate_prefix(_: &[u8]) -> bool { true } #[inline(always)] fn validate_suffix(_: &[u8]) -> bool { true } #[inline(always)] fn validate_subseq(_: &[u8]) -> bool { true } } unsafe impl SubsetOf for ASCII {} unsafe impl SubsetOf for ASCII {} unsafe impl<'a> CharFormat<'a> for ASCII { type Iter = imp::SingleByteCharIndices<'a>; #[inline] unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { imp::SingleByteCharIndices::new(buf) } #[inline] fn encode_char(ch: char, cont: F) -> Result<(), ()> where F: FnOnce(&[u8]), { let n = ch as u32; if n > 0x7F { return Err(()); } cont(&[n as u8]); Ok(()) } } /// Marker type for UTF-8 text. #[derive(Copy, Clone, Default, Debug)] pub struct UTF8; unsafe impl Format for UTF8 { #[inline] fn validate(buf: &[u8]) -> bool { str::from_utf8(buf).is_ok() } #[inline] fn validate_prefix(buf: &[u8]) -> bool { if buf.len() == 0 { return true; } match futf::classify(buf, buf.len() - 1) { Some(Codepoint { meaning: Meaning::Whole(_), .. }) => true, _ => false, } } #[inline] fn validate_suffix(buf: &[u8]) -> bool { if buf.len() == 0 { return true; } match futf::classify(buf, 0) { Some(Codepoint { meaning: Meaning::Whole(_), .. }) => true, _ => false, } } #[inline] fn validate_subseq(buf: &[u8]) -> bool { ::validate_prefix(buf) && ::validate_suffix(buf) } } unsafe impl SubsetOf for UTF8 {} unsafe impl SliceFormat for UTF8 { type Slice = str; } unsafe impl Slice for str { #[inline(always)] fn as_bytes(&self) -> &[u8] { str::as_bytes(self) } #[inline(always)] unsafe fn from_bytes(x: &[u8]) -> &str { str::from_utf8_unchecked(x) } #[inline(always)] unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str { mem::transmute(x) } } unsafe impl<'a> CharFormat<'a> for UTF8 { type Iter = str::CharIndices<'a>; #[inline] unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> { str::from_utf8_unchecked(buf).char_indices() } #[inline] fn encode_char(ch: char, cont: F) -> Result<(), ()> where F: FnOnce(&[u8]), { cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes()); Ok(()) } } /// Marker type for WTF-8 text. /// /// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/). 
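///
/// Like UTF-8, but surrogate code points are additionally allowed, as long as
/// a lead surrogate is not immediately followed by a trail surrogate; `fixup`
/// merges such a pair into one code point when two buffers are concatenated.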
#[derive(Copy, Clone, Default, Debug)] pub struct WTF8; #[inline] fn wtf8_meaningful(m: Meaning) -> bool { match m { Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true, _ => false, } } unsafe impl Format for WTF8 { #[inline] fn validate(buf: &[u8]) -> bool { let mut i = 0; let mut prev_lead = false; while i < buf.len() { let codept = unwrap_or_return!(futf::classify(buf, i), false); if !wtf8_meaningful(codept.meaning) { return false; } i += codept.bytes.len(); prev_lead = match codept.meaning { Meaning::TrailSurrogate(_) if prev_lead => return false, Meaning::LeadSurrogate(_) => true, _ => false, }; } true } #[inline] fn validate_prefix(buf: &[u8]) -> bool { if buf.len() == 0 { return true; } match futf::classify(buf, buf.len() - 1) { Some(c) => wtf8_meaningful(c.meaning), _ => false, } } #[inline] fn validate_suffix(buf: &[u8]) -> bool { if buf.len() == 0 { return true; } match futf::classify(buf, 0) { Some(c) => wtf8_meaningful(c.meaning), _ => false, } } #[inline] fn validate_subseq(buf: &[u8]) -> bool { ::validate_prefix(buf) && ::validate_suffix(buf) } #[inline] unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup { const ERR: &'static str = "WTF8: internal error"; if lhs.len() >= 3 && rhs.len() >= 3 { if let ( Some(Codepoint { meaning: Meaning::LeadSurrogate(hi), .. }), Some(Codepoint { meaning: Meaning::TrailSurrogate(lo), .. }), ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0)) { let mut fixup = imp::Fixup { drop_left: 3, drop_right: 3, insert_len: 0, insert_bytes: [0_u8; 4], }; let n = 0x10000 + ((hi as u32) << 10) + (lo as u32); let ch = char::from_u32(n).expect(ERR); fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32; return fixup; } } Default::default() } } /// Marker type for the single-byte encoding of the first 256 Unicode codepoints. /// /// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the /// C0 and C1 control characters from ECMA-48 / ISO 6429. /// /// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the /// many other aliases), which actually stand for Windows-1252. #[derive(Copy, Clone, Default, Debug)] pub struct Latin1; unsafe impl Format for Latin1 { #[inline(always)] fn validate(_: &[u8]) -> bool { true } #[inline(always)] fn validate_prefix(_: &[u8]) -> bool { true } #[inline(always)] fn validate_suffix(_: &[u8]) -> bool { true } #[inline(always)] fn validate_subseq(_: &[u8]) -> bool { true } } unsafe impl<'a> CharFormat<'a> for Latin1 { type Iter = imp::SingleByteCharIndices<'a>; #[inline] unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { imp::SingleByteCharIndices::new(buf) } #[inline] fn encode_char(ch: char, cont: F) -> Result<(), ()> where F: FnOnce(&[u8]), { let n = ch as u32; if n > 0xFF { return Err(()); } cont(&[n as u8]); Ok(()) } } tendril-0.4.3/src/lib.rs000064400000000000000000000020130072674642500132200ustar 00000000000000// Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. 
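//! Compact buffer/string type for zero-copy parsing (see `README.md` for the
//! full story). A minimal usage sketch, using only APIs that are exercised
//! elsewhere in this crate (`SliceExt::to_tendril`, `subtendril`, `push_slice`):
//!
//! ```
//! use tendril::{SliceExt, StrTendril};
//!
//! let mut t: StrTendril = "Hello, world!".to_tendril();
//! let hello = t.subtendril(0, 5);  // shared, zero-copy view of "Hello"
//! t.push_slice(" Bye!");           // mutating `t` copies it if the buffer is shared
//! assert_eq!(&*hello, "Hello");
//! assert_eq!(&*t, "Hello, world! Bye!");
//! ```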
#![cfg_attr(all(test, feature = "bench"), feature(test))] //#![cfg_attr(test, deny(warnings))] #[cfg(feature = "encoding")] pub extern crate encoding; #[cfg(feature = "encoding_rs")] pub extern crate encoding_rs; #[cfg(all(test, feature = "bench"))] extern crate test; #[macro_use] extern crate mac; extern crate futf; extern crate utf8; pub use fmt::Format; pub use stream::TendrilSink; pub use tendril::{Atomic, Atomicity, NonAtomic, SendTendril}; pub use tendril::{ByteTendril, ReadExt, SliceExt, StrTendril, SubtendrilError, Tendril}; pub use utf8_decode::IncompleteUtf8; pub mod fmt; pub mod stream; mod buf32; mod tendril; mod utf8_decode; mod util; static OFLOW: &'static str = "tendril: overflow in buffer arithmetic"; tendril-0.4.3/src/stream.rs000064400000000000000000000600610072674642500137540ustar 00000000000000// Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! Streams of tendrils. use fmt; use tendril::{Atomicity, NonAtomic, Tendril}; use std::borrow::Cow; use std::fs::File; use std::io; use std::marker::PhantomData; use std::path::Path; #[cfg(feature = "encoding")] use encoding; #[cfg(feature = "encoding_rs")] use encoding_rs::{self, DecoderResult}; use utf8; /// Trait for types that can process a tendril. /// /// This is a "push" interface, unlike the "pull" interface of /// `Iterator>`. The push interface matches /// [html5ever][] and other incremental parsers with a similar /// architecture. /// /// [html5ever]: https://github.com/servo/html5ever pub trait TendrilSink where F: fmt::Format, A: Atomicity, { /// Process this tendril. fn process(&mut self, t: Tendril); /// Indicates that an error has occurred. fn error(&mut self, desc: Cow<'static, str>); /// What the overall result of processing is. type Output; /// Indicates the end of the stream. fn finish(self) -> Self::Output; /// Process one tendril and finish. fn one(mut self, t: T) -> Self::Output where Self: Sized, T: Into>, { self.process(t.into()); self.finish() } /// Consume an iterator of tendrils, processing each item, then finish. fn from_iter(mut self, i: I) -> Self::Output where Self: Sized, I: IntoIterator, I::Item: Into>, { for t in i { self.process(t.into()) } self.finish() } /// Read from the given stream of bytes until exhaustion and process incrementally, /// then finish. Return `Err` at the first I/O error. fn read_from(mut self, r: &mut R) -> io::Result where Self: Sized, R: io::Read, F: fmt::SliceFormat, { const BUFFER_SIZE: u32 = 4 * 1024; loop { let mut tendril = Tendril::::new(); // FIXME: this exposes uninitialized bytes to a generic R type // this is fine for R=File which never reads these bytes, // but user-defined types might. // The standard library pushes zeros to `Vec` for that reason. unsafe { tendril.push_uninitialized(BUFFER_SIZE); } loop { match r.read(&mut tendril) { Ok(0) => return Ok(self.finish()), Ok(n) => { tendril.pop_back(BUFFER_SIZE - n as u32); self.process(tendril); break; } Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } } } } /// Read from the file at the given path and process incrementally, /// then finish. Return `Err` at the first I/O error. fn from_file

(self, path: P) -> io::Result where Self: Sized, P: AsRef, F: fmt::SliceFormat, { self.read_from(&mut File::open(path)?) } } /// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8, /// lossily replace ill-formed byte sequences with U+FFFD replacement characters, /// and emits Unicode (`StrTendril`). /// /// This does not allocate memory: the output is either subtendrils on the input, /// on inline tendrils for a single code point. pub struct Utf8LossyDecoder where Sink: TendrilSink, A: Atomicity, { pub inner_sink: Sink, incomplete: Option, marker: PhantomData, } impl Utf8LossyDecoder where Sink: TendrilSink, A: Atomicity, { /// Create a new incremental UTF-8 decoder. #[inline] pub fn new(inner_sink: Sink) -> Self { Utf8LossyDecoder { inner_sink: inner_sink, incomplete: None, marker: PhantomData, } } } impl TendrilSink for Utf8LossyDecoder where Sink: TendrilSink, A: Atomicity, { #[inline] fn process(&mut self, mut t: Tendril) { // FIXME: remove take() and map() when non-lexical borrows are stable. if let Some(mut incomplete) = self.incomplete.take() { let resume_at = incomplete.try_complete(&t).map(|(result, rest)| { match result { Ok(s) => self.inner_sink.process(Tendril::from_slice(s)), Err(_) => { self.inner_sink.error("invalid byte sequence".into()); self.inner_sink .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); } } t.len() - rest.len() }); match resume_at { None => { self.incomplete = Some(incomplete); return; } Some(resume_at) => t.pop_front(resume_at as u32), } } while !t.is_empty() { let unborrowed_result = match utf8::decode(&t) { Ok(s) => { debug_assert!(s.as_ptr() == t.as_ptr()); debug_assert!(s.len() == t.len()); Ok(()) } Err(utf8::DecodeError::Invalid { valid_prefix, invalid_sequence, .. }) => { debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); debug_assert!(valid_prefix.len() <= t.len()); Err(( valid_prefix.len(), Err(valid_prefix.len() + invalid_sequence.len()), )) } Err(utf8::DecodeError::Incomplete { valid_prefix, incomplete_suffix, }) => { debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); debug_assert!(valid_prefix.len() <= t.len()); Err((valid_prefix.len(), Ok(incomplete_suffix))) } }; match unborrowed_result { Ok(()) => { unsafe { self.inner_sink.process(t.reinterpret_without_validating()) } return; } Err((valid_len, and_then)) => { if valid_len > 0 { let subtendril = t.subtendril(0, valid_len as u32); unsafe { self.inner_sink .process(subtendril.reinterpret_without_validating()) } } match and_then { Ok(incomplete) => { self.incomplete = Some(incomplete); return; } Err(offset) => { self.inner_sink.error("invalid byte sequence".into()); self.inner_sink .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); t.pop_front(offset as u32); } } } } } } #[inline] fn error(&mut self, desc: Cow<'static, str>) { self.inner_sink.error(desc); } type Output = Sink::Output; #[inline] fn finish(mut self) -> Sink::Output { if self.incomplete.is_some() { self.inner_sink .error("incomplete byte sequence at end of stream".into()); self.inner_sink .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); } self.inner_sink.finish() } } /// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding, /// lossily replace ill-formed byte sequences with U+FFFD replacement characters, /// and emits Unicode (`StrTendril`). /// /// This allocates new tendrils for encodings other than UTF-8. 
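///
/// Depending on which cargo features are enabled, it is constructed with
/// `new` (an `encoding` decoder), `new_encoding_rs` (an `encoding_rs` decoder),
/// or `utf8`.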
#[cfg(any(feature = "encoding", feature = "encoding_rs"))] pub struct LossyDecoder where Sink: TendrilSink, A: Atomicity, { inner: LossyDecoderInner, } #[cfg(any(feature = "encoding", feature = "encoding_rs"))] enum LossyDecoderInner where Sink: TendrilSink, A: Atomicity, { Utf8(Utf8LossyDecoder), #[cfg(feature = "encoding")] Encoding(Box, Sink), #[cfg(feature = "encoding_rs")] EncodingRs(encoding_rs::Decoder, Sink), } #[cfg(any(feature = "encoding", feature = "encoding_rs"))] impl LossyDecoder where Sink: TendrilSink, A: Atomicity, { /// Create a new incremental decoder using the encoding crate. #[cfg(feature = "encoding")] #[inline] pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self { if encoding.name() == "utf-8" { LossyDecoder::utf8(sink) } else { LossyDecoder { inner: LossyDecoderInner::Encoding(encoding.raw_decoder(), sink), } } } /// Create a new incremental decoder using the encoding_rs crate. #[cfg(feature = "encoding_rs")] #[inline] pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self { if encoding == encoding_rs::UTF_8 { return Self::utf8(sink); } Self { inner: LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink), } } /// Create a new incremental decoder for the UTF-8 encoding. /// /// This is useful for content that is known at run-time to be UTF-8 /// (whereas `Utf8LossyDecoder` requires knowning at compile-time.) #[inline] pub fn utf8(sink: Sink) -> LossyDecoder { LossyDecoder { inner: LossyDecoderInner::Utf8(Utf8LossyDecoder::new(sink)), } } /// Give a reference to the inner sink. pub fn inner_sink(&self) -> &Sink { match self.inner { LossyDecoderInner::Utf8(ref utf8) => &utf8.inner_sink, #[cfg(feature = "encoding")] LossyDecoderInner::Encoding(_, ref inner_sink) => inner_sink, #[cfg(feature = "encoding_rs")] LossyDecoderInner::EncodingRs(_, ref inner_sink) => inner_sink, } } /// Give a mutable reference to the inner sink. 
pub fn inner_sink_mut(&mut self) -> &mut Sink { match self.inner { LossyDecoderInner::Utf8(ref mut utf8) => &mut utf8.inner_sink, #[cfg(feature = "encoding")] LossyDecoderInner::Encoding(_, ref mut inner_sink) => inner_sink, #[cfg(feature = "encoding_rs")] LossyDecoderInner::EncodingRs(_, ref mut inner_sink) => inner_sink, } } } #[cfg(any(feature = "encoding", feature = "encoding_rs"))] impl TendrilSink for LossyDecoder where Sink: TendrilSink, A: Atomicity, { #[inline] fn process(&mut self, t: Tendril) { match self.inner { LossyDecoderInner::Utf8(ref mut utf8) => return utf8.process(t), #[cfg(feature = "encoding")] LossyDecoderInner::Encoding(ref mut decoder, ref mut sink) => { let mut out = Tendril::new(); let mut t = t; loop { match decoder.raw_feed(&*t, &mut out) { (_, Some(err)) => { out.push_char('\u{fffd}'); sink.error(err.cause); debug_assert!(err.upto >= 0); t.pop_front(err.upto as u32); // continue loop and process remainder of t } (_, None) => break, } } if out.len() > 0 { sink.process(out); } } #[cfg(feature = "encoding_rs")] LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => { if t.is_empty() { return; } decode_to_sink(t, decoder, sink, false); } } } #[inline] fn error(&mut self, desc: Cow<'static, str>) { match self.inner { LossyDecoderInner::Utf8(ref mut utf8) => utf8.error(desc), #[cfg(feature = "encoding")] LossyDecoderInner::Encoding(_, ref mut sink) => sink.error(desc), #[cfg(feature = "encoding_rs")] LossyDecoderInner::EncodingRs(_, ref mut sink) => sink.error(desc), } } type Output = Sink::Output; #[inline] fn finish(self) -> Sink::Output { match self.inner { LossyDecoderInner::Utf8(utf8) => return utf8.finish(), #[cfg(feature = "encoding")] LossyDecoderInner::Encoding(mut decoder, mut sink) => { let mut out = Tendril::new(); if let Some(err) = decoder.raw_finish(&mut out) { out.push_char('\u{fffd}'); sink.error(err.cause); } if out.len() > 0 { sink.process(out); } sink.finish() } #[cfg(feature = "encoding_rs")] LossyDecoderInner::EncodingRs(mut decoder, mut sink) => { decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true); sink.finish() } } } } #[cfg(feature = "encoding_rs")] fn decode_to_sink( mut t: Tendril, decoder: &mut encoding_rs::Decoder, sink: &mut Sink, last: bool, ) where Sink: TendrilSink, A: Atomicity, { loop { let mut out = >::new(); let max_len = decoder .max_utf8_buffer_length_without_replacement(t.len()) .unwrap_or(8192); unsafe { out.push_uninitialized(std::cmp::min(max_len as u32, 8192)); } let (result, bytes_read, bytes_written) = decoder.decode_to_utf8_without_replacement(&t, &mut out, last); if bytes_written > 0 { sink.process(unsafe { out.subtendril(0, bytes_written as u32) .reinterpret_without_validating() }); } match result { DecoderResult::InputEmpty => return, DecoderResult::OutputFull => {} DecoderResult::Malformed(_, _) => { sink.error(Cow::Borrowed("invalid sequence")); sink.process("\u{FFFD}".into()); } } t.pop_front(bytes_read as u32); if t.is_empty() { return; } } } #[cfg(test)] mod test { use super::{TendrilSink, Utf8LossyDecoder}; use fmt; use std::borrow::Cow; use tendril::{Atomicity, NonAtomic, Tendril}; #[cfg(any(feature = "encoding", feature = "encoding_rs"))] use super::LossyDecoder; #[cfg(any(feature = "encoding", feature = "encoding_rs"))] use tendril::SliceExt; #[cfg(feature = "encoding")] use encoding::all as enc; #[cfg(feature = "encoding_rs")] use encoding_rs as enc_rs; struct Accumulate where A: Atomicity, { tendrils: Vec>, errors: Vec, } impl Accumulate where A: Atomicity, { fn new() -> Accumulate { 
Accumulate { tendrils: vec![], errors: vec![], } } } impl TendrilSink for Accumulate where A: Atomicity, { fn process(&mut self, t: Tendril) { self.tendrils.push(t); } fn error(&mut self, desc: Cow<'static, str>) { self.errors.push(desc.into_owned()); } type Output = (Vec>, Vec); fn finish(self) -> Self::Output { (self.tendrils, self.errors) } } fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) { let decoder = Utf8LossyDecoder::new(Accumulate::::new()); let (tendrils, errors) = decoder.from_iter(input.iter().cloned()); assert_eq!( expected, &*tendrils.iter().map(|t| &**t).collect::>() ); assert_eq!(errs, errors.len()); } #[test] fn utf8() { check_utf8(&[], &[], 0); check_utf8(&[b""], &[], 0); check_utf8(&[b"xyz"], &["xyz"], 0); check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0); check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0); check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); check_utf8( &[b"xy\xEA", b"\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0, ); check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0); check_utf8( &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], &["\u{a66e}"], 0, ); check_utf8( &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"], 4, ); check_utf8( &[b"xy\xEA\x99", b"\xFFz"], &["xy", "\u{fffd}", "\u{fffd}", "z"], 2, ); check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0); check_utf8( &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"], &["ő", "ő", "ő"], 0, ); check_utf8( &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"], &["ő", "ő", "ő"], 0, ); check_utf8( &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"], &["ő", "\u{fffd}", "\u{fffd}", "ő"], 2, ); // incomplete char at end of input check_utf8(&[b"\xC0"], &["\u{fffd}"], 1); check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1); } #[cfg(any(feature = "encoding", feature = "encoding_rs"))] fn check_decode( mut decoder: LossyDecoder>, input: &[&[u8]], expected: &str, errs: usize, ) { for x in input { decoder.process(x.to_tendril()); } let (tendrils, errors) = decoder.finish(); let mut tendril: Tendril = Tendril::new(); for t in tendrils { tendril.push_tendril(&t); } assert_eq!(expected, &*tendril); assert_eq!(errs, errors.len()); } #[cfg(any(feature = "encoding", feature = "encoding_rs"))] pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)]; #[cfg(any(feature = "encoding"))] const ASCII: Tests = &[ (&[], "", 0), (&[b""], "", 0), (&[b"xyz"], "xyz", 0), (&[b"xy", b"", b"", b"z"], "xyz", 0), (&[b"x", b"y", b"z"], "xyz", 0), (&[b"\xFF"], "\u{fffd}", 1), (&[b"x\xC0yz"], "x\u{fffd}yz", 1), (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1), (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3), ]; #[cfg(feature = "encoding")] #[test] fn decode_ascii() { for &(input, expected, errs) in ASCII { let decoder = LossyDecoder::new(enc::ASCII, Accumulate::new()); check_decode(decoder, input, expected, errs); } } #[cfg(any(feature = "encoding", feature = "encoding_rs"))] const UTF_8: Tests = &[ (&[], "", 0), (&[b""], "", 0), (&[b"xyz"], "xyz", 0), (&[b"x", b"y", b"z"], "xyz", 0), (&[b"\xEA\x99\xAE"], "\u{a66e}", 0), (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0), (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0), (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0), (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0), ( &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], "\u{a66e}", 0, ), (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0), ( &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], 
"xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z", 4, ), (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2), // incomplete char at end of input (&[b"\xC0"], "\u{fffd}", 1), (&[b"\xEA\x99"], "\u{fffd}", 1), ]; #[cfg(feature = "encoding")] #[test] fn decode_utf8() { for &(input, expected, errs) in UTF_8 { let decoder = LossyDecoder::new(enc::UTF_8, Accumulate::new()); check_decode(decoder, input, expected, errs); } } #[cfg(feature = "encoding_rs")] #[test] fn decode_utf8_encoding_rs() { for &(input, expected, errs) in UTF_8 { let decoder = LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new()); check_decode(decoder, input, expected, errs); } } #[cfg(any(feature = "encoding", feature = "encoding_rs"))] const KOI8_U: Tests = &[ (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0), ( &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""], "Энергия", 0, ), ]; #[cfg(feature = "encoding")] #[test] fn decode_koi8_u() { for &(input, expected, errs) in KOI8_U { let decoder = LossyDecoder::new(enc::KOI8_U, Accumulate::new()); check_decode(decoder, input, expected, errs); } } #[cfg(feature = "encoding_rs")] #[test] fn decode_koi8_u_encoding_rs() { for &(input, expected, errs) in KOI8_U { let decoder = LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new()); check_decode(decoder, input, expected, errs); } } #[cfg(any(feature = "encoding", feature = "encoding_rs"))] const WINDOWS_949: Tests = &[ (&[], "", 0), (&[b""], "", 0), (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0), (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0), (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0), ( &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"], "안녕하세요", 0, ), (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1), (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1), (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1), ]; #[cfg(feature = "encoding")] #[test] fn decode_windows_949() { for &(input, expected, errs) in WINDOWS_949 { let decoder = LossyDecoder::new(enc::WINDOWS_949, Accumulate::new()); check_decode(decoder, input, expected, errs); } } #[cfg(feature = "encoding_rs")] #[test] fn decode_windows_949_encoding_rs() { for &(input, expected, errs) in WINDOWS_949 { let decoder = LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new()); check_decode(decoder, input, expected, errs); } } #[test] fn read_from() { let decoder = Utf8LossyDecoder::new(Accumulate::::new()); let mut bytes: &[u8] = b"foo\xffbar"; let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap(); assert_eq!( &*tendrils.iter().map(|t| &**t).collect::>(), &["foo", "\u{FFFD}", "bar"] ); assert_eq!(errors, &["invalid byte sequence"]); } } tendril-0.4.3/src/tendril.rs000064400000000000000000002147030072674642500141260ustar 00000000000000// Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. 
use std::borrow::Borrow; use std::cell::{Cell, UnsafeCell}; use std::cmp::Ordering; use std::default::Default; use std::fmt as strfmt; use std::iter::FromIterator; use std::marker::PhantomData; use std::num::NonZeroUsize; use std::ops::{Deref, DerefMut}; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{self, AtomicUsize}; use std::{hash, io, mem, ptr, str, u32}; #[cfg(feature = "encoding")] use encoding::{self, DecoderTrap, EncoderTrap, EncodingRef}; use buf32::{self, Buf32}; use fmt::imp::Fixup; use fmt::{self, Slice}; use util::{copy_and_advance, copy_lifetime, copy_lifetime_mut, unsafe_slice, unsafe_slice_mut}; use OFLOW; const MAX_INLINE_LEN: usize = 8; const MAX_INLINE_TAG: usize = 0xF; const EMPTY_TAG: usize = 0xF; #[inline(always)] fn inline_tag(len: u32) -> NonZeroUsize { debug_assert!(len <= MAX_INLINE_LEN as u32); unsafe { NonZeroUsize::new_unchecked(if len == 0 { EMPTY_TAG } else { len as usize }) } } /// The multithreadedness of a tendril. /// /// Exactly two types implement this trait: /// /// - `Atomic`: use this in your tendril and you will have a `Send` tendril which works /// across threads; this is akin to `Arc`. /// /// - `NonAtomic`: use this in your tendril and you will have a tendril which is neither /// `Send` nor `Sync` but should be a tad faster; this is akin to `Rc`. /// /// The layout of this trait is also mandated to be that of a `usize`, /// for it is used for reference counting. pub unsafe trait Atomicity: 'static { #[doc(hidden)] fn new() -> Self; #[doc(hidden)] fn increment(&self) -> usize; #[doc(hidden)] fn decrement(&self) -> usize; #[doc(hidden)] fn fence_acquire(); } /// A marker of a non-atomic tendril. /// /// This is the default for the second type parameter of a `Tendril` /// and so doesn't typically need to be written. /// /// This is akin to using `Rc` for reference counting. #[repr(C)] pub struct NonAtomic(Cell); unsafe impl Atomicity for NonAtomic { #[inline] fn new() -> Self { NonAtomic(Cell::new(1)) } #[inline] fn increment(&self) -> usize { let value = self.0.get(); self.0.set(value.checked_add(1).expect(OFLOW)); value } #[inline] fn decrement(&self) -> usize { let value = self.0.get(); self.0.set(value - 1); value } #[inline] fn fence_acquire() {} } /// A marker of an atomic (and hence concurrent) tendril. /// /// This is used as the second, optional type parameter of a `Tendril`; /// `Tendril` thus implements`Send`. /// /// This is akin to using `Arc` for reference counting. pub struct Atomic(AtomicUsize); unsafe impl Atomicity for Atomic { #[inline] fn new() -> Self { Atomic(AtomicUsize::new(1)) } #[inline] fn increment(&self) -> usize { // Relaxed is OK because we have a reference already. self.0.fetch_add(1, AtomicOrdering::Relaxed) } #[inline] fn decrement(&self) -> usize { self.0.fetch_sub(1, AtomicOrdering::Release) } #[inline] fn fence_acquire() { atomic::fence(AtomicOrdering::Acquire); } } #[repr(C)] // Preserve field order for cross-atomicity transmutes struct Header { refcount: A, cap: u32, } impl Header where A: Atomicity, { #[inline(always)] unsafe fn new() -> Header { Header { refcount: A::new(), cap: 0, } } } /// Errors that can occur when slicing a `Tendril`. #[derive(Copy, Clone, Hash, Debug, PartialEq, Eq)] pub enum SubtendrilError { OutOfBounds, ValidationFailed, } /// Compact string type for zero-copy parsing. /// /// `Tendril`s have the semantics of owned strings, but are sometimes views /// into shared buffers. When you mutate a `Tendril`, an owned copy is made /// if necessary. 
Further mutations occur in-place until the string becomes /// shared, e.g. with `clone()` or `subtendril()`. /// /// Buffer sharing is accomplished through thread-local (non-atomic) reference /// counting, which has very low overhead. The Rust type system will prevent /// you at compile time from sending a `Tendril` between threads. We plan to /// relax this restriction in the future; see `README.md`. /// /// Whereas `String` allocates in the heap for any non-empty string, `Tendril` /// can store small strings (up to 8 bytes) in-line, without a heap allocation. /// `Tendril` is also smaller than `String` on 64-bit platforms — 16 bytes /// versus 24. /// /// The type parameter `F` specifies the format of the tendril, for example /// UTF-8 text or uninterpreted bytes. The parameter will be instantiated /// with one of the marker types from `tendril::fmt`. See the `StrTendril` /// and `ByteTendril` type aliases for two examples. /// /// The type parameter `A` indicates the atomicity of the tendril; it is by /// default `NonAtomic`, but can be specified as `Atomic` to get a tendril /// which implements `Send` (viz. a thread-safe tendril). /// /// The maximum length of a `Tendril` is 4 GB. The library will panic if /// you attempt to go over the limit. #[repr(C)] pub struct Tendril where F: fmt::Format, A: Atomicity, { ptr: Cell, buf: UnsafeCell, marker: PhantomData<*mut F>, refcount_marker: PhantomData, } #[repr(C)] union Buffer { heap: Heap, inline: [u8; 8], } #[derive(Copy, Clone)] #[repr(C)] struct Heap { len: u32, aux: u32, } unsafe impl Send for Tendril where F: fmt::Format, A: Atomicity + Sync, { } /// `Tendril` for storing native Rust strings. pub type StrTendril = Tendril; /// `Tendril` for storing binary data. pub type ByteTendril = Tendril; impl Clone for Tendril where F: fmt::Format, A: Atomicity, { #[inline] fn clone(&self) -> Tendril { unsafe { if self.ptr.get().get() > MAX_INLINE_TAG { self.make_buf_shared(); self.incref(); } ptr::read(self) } } } impl Drop for Tendril where F: fmt::Format, A: Atomicity, { #[inline] fn drop(&mut self) { unsafe { let p = self.ptr.get().get(); if p <= MAX_INLINE_TAG { return; } let (buf, shared, _) = self.assume_buf(); if shared { let header = self.header(); if (*header).refcount.decrement() == 1 { A::fence_acquire(); buf.destroy(); } } else { buf.destroy(); } } } } macro_rules! 
from_iter_method { ($ty:ty) => { #[inline] fn from_iter(iterable: I) -> Self where I: IntoIterator, { let mut output = Self::new(); output.extend(iterable); output } }; } impl Extend for Tendril where A: Atomicity, { #[inline] fn extend(&mut self, iterable: I) where I: IntoIterator, { let iterator = iterable.into_iter(); self.force_reserve(iterator.size_hint().0 as u32); for c in iterator { self.push_char(c); } } } impl FromIterator for Tendril where A: Atomicity, { from_iter_method!(char); } impl Extend for Tendril where A: Atomicity, { #[inline] fn extend(&mut self, iterable: I) where I: IntoIterator, { let iterator = iterable.into_iter(); self.force_reserve(iterator.size_hint().0 as u32); for b in iterator { self.push_slice(&[b]); } } } impl FromIterator for Tendril where A: Atomicity, { from_iter_method!(u8); } impl<'a, A> Extend<&'a u8> for Tendril where A: Atomicity, { #[inline] fn extend(&mut self, iterable: I) where I: IntoIterator, { let iterator = iterable.into_iter(); self.force_reserve(iterator.size_hint().0 as u32); for &b in iterator { self.push_slice(&[b]); } } } impl<'a, A> FromIterator<&'a u8> for Tendril where A: Atomicity, { from_iter_method!(&'a u8); } impl<'a, A> Extend<&'a str> for Tendril where A: Atomicity, { #[inline] fn extend(&mut self, iterable: I) where I: IntoIterator, { for s in iterable { self.push_slice(s); } } } impl<'a, A> FromIterator<&'a str> for Tendril where A: Atomicity, { from_iter_method!(&'a str); } impl<'a, A> Extend<&'a [u8]> for Tendril where A: Atomicity, { #[inline] fn extend(&mut self, iterable: I) where I: IntoIterator, { for s in iterable { self.push_slice(s); } } } impl<'a, A> FromIterator<&'a [u8]> for Tendril where A: Atomicity, { from_iter_method!(&'a [u8]); } impl<'a, F, A> Extend<&'a Tendril> for Tendril where F: fmt::Format + 'a, A: Atomicity, { #[inline] fn extend(&mut self, iterable: I) where I: IntoIterator>, { for t in iterable { self.push_tendril(t); } } } impl<'a, F, A> FromIterator<&'a Tendril> for Tendril where F: fmt::Format + 'a, A: Atomicity, { from_iter_method!(&'a Tendril); } impl Deref for Tendril where F: fmt::SliceFormat, A: Atomicity, { type Target = F::Slice; #[inline] fn deref(&self) -> &F::Slice { unsafe { F::Slice::from_bytes(self.as_byte_slice()) } } } impl DerefMut for Tendril where F: fmt::SliceFormat, A: Atomicity, { #[inline] fn deref_mut(&mut self) -> &mut F::Slice { unsafe { F::Slice::from_mut_bytes(self.as_mut_byte_slice()) } } } impl Borrow<[u8]> for Tendril where F: fmt::SliceFormat, A: Atomicity, { fn borrow(&self) -> &[u8] { self.as_byte_slice() } } // Why not impl Borrow for Tendril? str and [u8] hash differently, // and so a HashMap would silently break if we indexed by str. Ick. 
// https://github.com/rust-lang/rust/issues/27108 impl PartialEq for Tendril where F: fmt::Format, A: Atomicity, { #[inline] fn eq(&self, other: &Self) -> bool { self.as_byte_slice() == other.as_byte_slice() } #[inline] fn ne(&self, other: &Self) -> bool { self.as_byte_slice() != other.as_byte_slice() } } impl Eq for Tendril where F: fmt::Format, A: Atomicity, { } impl PartialOrd for Tendril where F: fmt::SliceFormat, ::Slice: PartialOrd, A: Atomicity, { #[inline] fn partial_cmp(&self, other: &Self) -> Option { PartialOrd::partial_cmp(&**self, &**other) } } impl Ord for Tendril where F: fmt::SliceFormat, ::Slice: Ord, A: Atomicity, { #[inline] fn cmp(&self, other: &Self) -> Ordering { Ord::cmp(&**self, &**other) } } impl Default for Tendril where F: fmt::Format, A: Atomicity, { #[inline(always)] fn default() -> Tendril { Tendril::new() } } impl strfmt::Debug for Tendril where F: fmt::SliceFormat + Default + strfmt::Debug, ::Slice: strfmt::Debug, A: Atomicity, { #[inline] fn fmt(&self, f: &mut strfmt::Formatter) -> strfmt::Result { let kind = match self.ptr.get().get() { p if p <= MAX_INLINE_TAG => "inline", p if p & 1 == 1 => "shared", _ => "owned", }; write!(f, "Tendril<{:?}>({}: ", ::default(), kind)?; <::Slice as strfmt::Debug>::fmt(&**self, f)?; write!(f, ")") } } impl hash::Hash for Tendril where F: fmt::Format, A: Atomicity, { #[inline] fn hash(&self, hasher: &mut H) { self.as_byte_slice().hash(hasher) } } impl Tendril where F: fmt::Format, A: Atomicity, { /// Create a new, empty `Tendril` in any format. #[inline(always)] pub fn new() -> Tendril { unsafe { Tendril::inline(&[]) } } /// Create a new, empty `Tendril` with a specified capacity. #[inline] pub fn with_capacity(capacity: u32) -> Tendril { let mut t: Tendril = Tendril::new(); if capacity > MAX_INLINE_LEN as u32 { unsafe { t.make_owned_with_capacity(capacity); } } t } /// Reserve space for additional bytes. /// /// This is only a suggestion. There are cases where `Tendril` will /// decline to allocate until the buffer is actually modified. #[inline] pub fn reserve(&mut self, additional: u32) { if !self.is_shared() { // Don't grow a shared tendril because we'd have to copy // right away. self.force_reserve(additional); } } /// Reserve space for additional bytes, even for shared buffers. #[inline] fn force_reserve(&mut self, additional: u32) { let new_len = self.len32().checked_add(additional).expect(OFLOW); if new_len > MAX_INLINE_LEN as u32 { unsafe { self.make_owned_with_capacity(new_len); } } } /// Get the length of the `Tendril`. /// /// This is named not to conflict with `len()` on the underlying /// slice, if any. #[inline(always)] pub fn len32(&self) -> u32 { match self.ptr.get().get() { EMPTY_TAG => 0, n if n <= MAX_INLINE_LEN => n as u32, _ => unsafe { self.raw_len() }, } } /// Is the backing buffer shared? #[inline] pub fn is_shared(&self) -> bool { let n = self.ptr.get().get(); (n > MAX_INLINE_TAG) && ((n & 1) == 1) } /// Is the backing buffer shared with this other `Tendril`? #[inline] pub fn is_shared_with(&self, other: &Tendril) -> bool { let n = self.ptr.get().get(); (n > MAX_INLINE_TAG) && (n == other.ptr.get().get()) } /// Truncate to length 0 without discarding any owned storage. #[inline] pub fn clear(&mut self) { if self.ptr.get().get() <= MAX_INLINE_TAG { self.ptr .set(unsafe { NonZeroUsize::new_unchecked(EMPTY_TAG) }); } else { let (_, shared, _) = unsafe { self.assume_buf() }; if shared { // No need to keep a reference alive for a 0-size slice. 
*self = Tendril::new(); } else { unsafe { self.set_len(0) }; } } } /// Build a `Tendril` by copying a byte slice, if it conforms to the format. #[inline] pub fn try_from_byte_slice(x: &[u8]) -> Result, ()> { match F::validate(x) { true => Ok(unsafe { Tendril::from_byte_slice_without_validating(x) }), false => Err(()), } } /// View as uninterpreted bytes. #[inline(always)] pub fn as_bytes(&self) -> &Tendril { unsafe { mem::transmute(self) } } /// Convert into uninterpreted bytes. #[inline(always)] pub fn into_bytes(self) -> Tendril { unsafe { mem::transmute(self) } } /// Convert `self` into a type which is `Send`. /// /// If the tendril is owned or inline, this is free, /// but if it's shared this will entail a copy of the contents. #[inline] pub fn into_send(mut self) -> SendTendril { self.make_owned(); SendTendril { // This changes the header.refcount from A to NonAtomic, but that's // OK because we have defined the format of A as a usize. tendril: unsafe { mem::transmute(self) }, } } /// View as a superset format, for free. #[inline(always)] pub fn as_superset(&self) -> &Tendril where F: fmt::SubsetOf, Super: fmt::Format, { unsafe { mem::transmute(self) } } /// Convert into a superset format, for free. #[inline(always)] pub fn into_superset(self) -> Tendril where F: fmt::SubsetOf, Super: fmt::Format, { unsafe { mem::transmute(self) } } /// View as a subset format, if the `Tendril` conforms to that subset. #[inline] pub fn try_as_subset(&self) -> Result<&Tendril, ()> where Sub: fmt::SubsetOf, { match Sub::revalidate_subset(self.as_byte_slice()) { true => Ok(unsafe { mem::transmute(self) }), false => Err(()), } } /// Convert into a subset format, if the `Tendril` conforms to that subset. #[inline] pub fn try_into_subset(self) -> Result, Self> where Sub: fmt::SubsetOf, { match Sub::revalidate_subset(self.as_byte_slice()) { true => Ok(unsafe { mem::transmute(self) }), false => Err(self), } } /// View as another format, if the bytes of the `Tendril` are valid for /// that format. #[inline] pub fn try_reinterpret_view(&self) -> Result<&Tendril, ()> where Other: fmt::Format, { match Other::validate(self.as_byte_slice()) { true => Ok(unsafe { mem::transmute(self) }), false => Err(()), } } /// Convert into another format, if the `Tendril` conforms to that format. /// /// This only re-validates the existing bytes under the new format. It /// will *not* change the byte content of the tendril! /// /// See the `encode` and `decode` methods for character encoding conversion. #[inline] pub fn try_reinterpret(self) -> Result, Self> where Other: fmt::Format, { match Other::validate(self.as_byte_slice()) { true => Ok(unsafe { mem::transmute(self) }), false => Err(self), } } /// Push some bytes onto the end of the `Tendril`, if they conform to the /// format. #[inline] pub fn try_push_bytes(&mut self, buf: &[u8]) -> Result<(), ()> { match F::validate(buf) { true => unsafe { self.push_bytes_without_validating(buf); Ok(()) }, false => Err(()), } } /// Push another `Tendril` onto the end of this one. 
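    // Editor's illustration (not part of the original source): when `self` and `other` are
    // adjacent views into the same shared buffer, the method below simply extends `self`'s
    // length instead of copying, as exercised by the `merge_shared` test:
    //
    //     let t = "012345678901234567890123456789".to_tendril();
    //     let mut b = t.subtendril(0, 10);
    //     b.push_tendril(&t.subtendril(10, 20));  // re-joined in place, still sharing `t`'s buffer
    //     assert!(b.is_shared_with(&t));
    //     assert_eq!("012345678901234567890123456789", &*b);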
#[inline] pub fn push_tendril(&mut self, other: &Tendril) { let new_len = self.len32().checked_add(other.len32()).expect(OFLOW); unsafe { if (self.ptr.get().get() > MAX_INLINE_TAG) && (other.ptr.get().get() > MAX_INLINE_TAG) { let (self_buf, self_shared, _) = self.assume_buf(); let (other_buf, other_shared, _) = other.assume_buf(); if self_shared && other_shared && (self_buf.data_ptr() == other_buf.data_ptr()) && other.aux() == self.aux() + self.raw_len() { self.set_len(new_len); return; } } self.push_bytes_without_validating(other.as_byte_slice()) } } /// Attempt to slice this `Tendril` as a new `Tendril`. /// /// This will share the buffer when possible. Mutating a shared buffer /// will copy the contents. /// /// The offset and length are in bytes. The function will return /// `Err` if these are out of bounds, or if the resulting slice /// does not conform to the format. #[inline] pub fn try_subtendril( &self, offset: u32, length: u32, ) -> Result, SubtendrilError> { let self_len = self.len32(); if offset > self_len || length > (self_len - offset) { return Err(SubtendrilError::OutOfBounds); } unsafe { let byte_slice = unsafe_slice(self.as_byte_slice(), offset as usize, length as usize); if !F::validate_subseq(byte_slice) { return Err(SubtendrilError::ValidationFailed); } Ok(self.unsafe_subtendril(offset, length)) } } /// Slice this `Tendril` as a new `Tendril`. /// /// Panics on bounds or validity check failure. #[inline] pub fn subtendril(&self, offset: u32, length: u32) -> Tendril { self.try_subtendril(offset, length).unwrap() } /// Try to drop `n` bytes from the front. /// /// Returns `Err` if the bytes are not available, or the suffix fails /// validation. #[inline] pub fn try_pop_front(&mut self, n: u32) -> Result<(), SubtendrilError> { if n == 0 { return Ok(()); } let old_len = self.len32(); if n > old_len { return Err(SubtendrilError::OutOfBounds); } let new_len = old_len - n; unsafe { if !F::validate_suffix(unsafe_slice( self.as_byte_slice(), n as usize, new_len as usize, )) { return Err(SubtendrilError::ValidationFailed); } self.unsafe_pop_front(n); Ok(()) } } /// Drop `n` bytes from the front. /// /// Panics if the bytes are not available, or the suffix fails /// validation. #[inline] pub fn pop_front(&mut self, n: u32) { self.try_pop_front(n).unwrap() } /// Drop `n` bytes from the back. /// /// Returns `Err` if the bytes are not available, or the prefix fails /// validation. #[inline] pub fn try_pop_back(&mut self, n: u32) -> Result<(), SubtendrilError> { if n == 0 { return Ok(()); } let old_len = self.len32(); if n > old_len { return Err(SubtendrilError::OutOfBounds); } let new_len = old_len - n; unsafe { if !F::validate_prefix(unsafe_slice(self.as_byte_slice(), 0, new_len as usize)) { return Err(SubtendrilError::ValidationFailed); } self.unsafe_pop_back(n); Ok(()) } } /// Drop `n` bytes from the back. /// /// Panics if the bytes are not available, or the prefix fails /// validation. #[inline] pub fn pop_back(&mut self, n: u32) { self.try_pop_back(n).unwrap() } /// View as another format, without validating. #[inline(always)] pub unsafe fn reinterpret_view_without_validating(&self) -> &Tendril where Other: fmt::Format, { mem::transmute(self) } /// Convert into another format, without validating. #[inline(always)] pub unsafe fn reinterpret_without_validating(self) -> Tendril where Other: fmt::Format, { mem::transmute(self) } /// Build a `Tendril` by copying a byte slice, without validating. 
#[inline] pub unsafe fn from_byte_slice_without_validating(x: &[u8]) -> Tendril { assert!(x.len() <= buf32::MAX_LEN); if x.len() <= MAX_INLINE_LEN { Tendril::inline(x) } else { Tendril::owned_copy(x) } } /// Push some bytes onto the end of the `Tendril`, without validating. #[inline] pub unsafe fn push_bytes_without_validating(&mut self, buf: &[u8]) { assert!(buf.len() <= buf32::MAX_LEN); let Fixup { drop_left, drop_right, insert_len, insert_bytes, } = F::fixup(self.as_byte_slice(), buf); // FIXME: think more about overflow let adj_len = self.len32() + insert_len - drop_left; let new_len = adj_len.checked_add(buf.len() as u32).expect(OFLOW) - drop_right; let drop_left = drop_left as usize; let drop_right = drop_right as usize; if new_len <= MAX_INLINE_LEN as u32 { let mut tmp = [0_u8; MAX_INLINE_LEN]; { let old = self.as_byte_slice(); let mut dest = tmp.as_mut_ptr(); copy_and_advance(&mut dest, unsafe_slice(old, 0, old.len() - drop_left)); copy_and_advance( &mut dest, unsafe_slice(&insert_bytes, 0, insert_len as usize), ); copy_and_advance( &mut dest, unsafe_slice(buf, drop_right, buf.len() - drop_right), ); } *self = Tendril::inline(&tmp[..new_len as usize]); } else { self.make_owned_with_capacity(new_len); let (owned, _, _) = self.assume_buf(); let mut dest = owned .data_ptr() .offset((owned.len as usize - drop_left) as isize); copy_and_advance( &mut dest, unsafe_slice(&insert_bytes, 0, insert_len as usize), ); copy_and_advance( &mut dest, unsafe_slice(buf, drop_right, buf.len() - drop_right), ); self.set_len(new_len); } } /// Slice this `Tendril` as a new `Tendril`. /// /// Does not check validity or bounds! #[inline] pub unsafe fn unsafe_subtendril(&self, offset: u32, length: u32) -> Tendril { if length <= MAX_INLINE_LEN as u32 { Tendril::inline(unsafe_slice( self.as_byte_slice(), offset as usize, length as usize, )) } else { self.make_buf_shared(); self.incref(); let (buf, _, _) = self.assume_buf(); Tendril::shared(buf, self.aux() + offset, length) } } /// Drop `n` bytes from the front. /// /// Does not check validity or bounds! #[inline] pub unsafe fn unsafe_pop_front(&mut self, n: u32) { let new_len = self.len32() - n; if new_len <= MAX_INLINE_LEN as u32 { *self = Tendril::inline(unsafe_slice( self.as_byte_slice(), n as usize, new_len as usize, )); } else { self.make_buf_shared(); self.set_aux(self.aux() + n); let len = self.raw_len(); self.set_len(len - n); } } /// Drop `n` bytes from the back. /// /// Does not check validity or bounds! #[inline] pub unsafe fn unsafe_pop_back(&mut self, n: u32) { let new_len = self.len32() - n; if new_len <= MAX_INLINE_LEN as u32 { *self = Tendril::inline(unsafe_slice(self.as_byte_slice(), 0, new_len as usize)); } else { self.make_buf_shared(); let len = self.raw_len(); self.set_len(len - n); } } #[inline] unsafe fn incref(&self) { (*self.header()).refcount.increment(); } #[inline] unsafe fn make_buf_shared(&self) { let p = self.ptr.get().get(); if p & 1 == 0 { let header = p as *mut Header; (*header).cap = self.aux(); self.ptr.set(NonZeroUsize::new_unchecked(p | 1)); self.set_aux(0); } } // This is not public as it is of no practical value to users. // By and large they shouldn't need to worry about the distinction at all, // and going out of your way to make it owned is pointless. 
#[inline] fn make_owned(&mut self) { unsafe { let ptr = self.ptr.get().get(); if ptr <= MAX_INLINE_TAG || (ptr & 1) == 1 { *self = Tendril::owned_copy(self.as_byte_slice()); } } } #[inline] unsafe fn make_owned_with_capacity(&mut self, cap: u32) { self.make_owned(); let mut buf = self.assume_buf().0; buf.grow(cap); self.ptr.set(NonZeroUsize::new_unchecked(buf.ptr as usize)); self.set_aux(buf.cap); } #[inline(always)] unsafe fn header(&self) -> *mut Header { (self.ptr.get().get() & !1) as *mut Header } #[inline] unsafe fn assume_buf(&self) -> (Buf32>, bool, u32) { let ptr = self.ptr.get().get(); let header = self.header(); let shared = (ptr & 1) == 1; let (cap, offset) = match shared { true => ((*header).cap, self.aux()), false => (self.aux(), 0), }; ( Buf32 { ptr: header, len: offset + self.len32(), cap: cap, }, shared, offset, ) } #[inline] unsafe fn inline(x: &[u8]) -> Tendril { let len = x.len(); let t = Tendril { ptr: Cell::new(inline_tag(len as u32)), buf: UnsafeCell::new(Buffer { inline: [0; 8] }), marker: PhantomData, refcount_marker: PhantomData, }; ptr::copy_nonoverlapping(x.as_ptr(), (*t.buf.get()).inline.as_mut_ptr(), len); t } #[inline] unsafe fn owned(x: Buf32>) -> Tendril { Tendril { ptr: Cell::new(NonZeroUsize::new_unchecked(x.ptr as usize)), buf: UnsafeCell::new(Buffer { heap: Heap { len: x.len, aux: x.cap, }, }), marker: PhantomData, refcount_marker: PhantomData, } } #[inline] unsafe fn owned_copy(x: &[u8]) -> Tendril { let len32 = x.len() as u32; let mut b = Buf32::with_capacity(len32, Header::new()); ptr::copy_nonoverlapping(x.as_ptr(), b.data_ptr(), x.len()); b.len = len32; Tendril::owned(b) } #[inline] unsafe fn shared(buf: Buf32>, off: u32, len: u32) -> Tendril { Tendril { ptr: Cell::new(NonZeroUsize::new_unchecked((buf.ptr as usize) | 1)), buf: UnsafeCell::new(Buffer { heap: Heap { len, aux: off }, }), marker: PhantomData, refcount_marker: PhantomData, } } #[inline] fn as_byte_slice<'a>(&'a self) -> &'a [u8] { unsafe { match self.ptr.get().get() { EMPTY_TAG => &[], n if n <= MAX_INLINE_LEN => (*self.buf.get()).inline.get_unchecked(..n), _ => { let (buf, _, offset) = self.assume_buf(); copy_lifetime( self, unsafe_slice(buf.data(), offset as usize, self.len32() as usize), ) } } } } // There's no need to worry about locking on an atomic Tendril, because it makes it unique as // soon as you do that. #[inline] fn as_mut_byte_slice<'a>(&'a mut self) -> &'a mut [u8] { unsafe { match self.ptr.get().get() { EMPTY_TAG => &mut [], n if n <= MAX_INLINE_LEN => (*self.buf.get()).inline.get_unchecked_mut(..n), _ => { self.make_owned(); let (mut buf, _, offset) = self.assume_buf(); let len = self.len32() as usize; copy_lifetime_mut(self, unsafe_slice_mut(buf.data_mut(), offset as usize, len)) } } } } unsafe fn raw_len(&self) -> u32 { (*self.buf.get()).heap.len } unsafe fn set_len(&mut self, len: u32) { (*self.buf.get()).heap.len = len; } unsafe fn aux(&self) -> u32 { (*self.buf.get()).heap.aux } unsafe fn set_aux(&self, aux: u32) { (*self.buf.get()).heap.aux = aux; } } impl Tendril where F: fmt::SliceFormat, A: Atomicity, { /// Build a `Tendril` by copying a slice. #[inline] pub fn from_slice(x: &F::Slice) -> Tendril { unsafe { Tendril::from_byte_slice_without_validating(x.as_bytes()) } } /// Push a slice onto the end of the `Tendril`. #[inline] pub fn push_slice(&mut self, x: &F::Slice) { unsafe { self.push_bytes_without_validating(x.as_bytes()) } } } /// A simple wrapper to make `Tendril` `Send`. 
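// Editor's illustration (not part of the original source): the round trip described in
// this doc comment, as exercised by the `send` test below:
//
//     let s = "this is a string".to_tendril();
//     let sendable: SendTendril<fmt::UTF8> = s.into_send();  // copies only if the buffer was shared
//     std::thread::spawn(move || {
//         let s = StrTendril::from(sendable);                // convert back on the other thread
//         assert_eq!("this is a string", &*s);
//     });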
/// /// Although there is a certain subset of the operations on a `Tendril` that a `SendTendril` could /// reasonably implement, in order to clearly separate concerns this type is deliberately /// minimalist, acting as a safe encapsulation around the invariants which permit `Send`ness and /// behaving as an opaque object. /// /// A `SendTendril` may be produced by `Tendril.into_send()` or `SendTendril::from(tendril)`, /// and may be returned to a `Tendril` by `Tendril::from(self)`. #[derive(Clone)] pub struct SendTendril where F: fmt::Format, { tendril: Tendril, } unsafe impl Send for SendTendril where F: fmt::Format {} impl From> for SendTendril where F: fmt::Format, A: Atomicity, { #[inline] fn from(tendril: Tendril) -> SendTendril { tendril.into_send() } } impl From> for Tendril where F: fmt::Format, A: Atomicity, { #[inline] fn from(send: SendTendril) -> Tendril { unsafe { mem::transmute(send.tendril) } // header.refcount may have been initialised as an Atomic or a NonAtomic, but the value // will be the same (1) regardless, because the layout is defined. // Thus we don't need to fiddle about resetting it or anything like that. } } /// `Tendril`-related methods for Rust slices. pub trait SliceExt: fmt::Slice where F: fmt::SliceFormat, { /// Make a `Tendril` from this slice. #[inline] fn to_tendril(&self) -> Tendril { // It should be done thusly, but at the time of writing the defaults don't help inference: //fn to_tendril(&self) -> Tendril // where A: Atomicity, //{ Tendril::from_slice(self) } } impl SliceExt for str {} impl SliceExt for [u8] {} impl Tendril where F: for<'a> fmt::CharFormat<'a>, A: Atomicity, { /// Remove and return the first character, if any. #[inline] pub fn pop_front_char<'a>(&'a mut self) -> Option { unsafe { let next_char; // first char in iterator let mut skip = 0; // number of bytes to skip, or 0 to clear { // <--+ // | Creating an iterator borrows self, so introduce a // +- scope to contain the borrow (that way we can mutate // self below, after this scope exits). let mut iter = F::char_indices(self.as_byte_slice()); match iter.next() { Some((_, c)) => { next_char = Some(c); if let Some((n, _)) = iter.next() { skip = n as u32; } } None => { next_char = None; } } } if skip != 0 { self.unsafe_pop_front(skip); } else { self.clear(); } next_char } } /// Remove and return a run of characters at the front of the `Tendril` /// which are classified the same according to the function `classify`. /// /// Returns `None` on an empty string. #[inline] pub fn pop_front_char_run<'a, C, R>(&'a mut self, mut classify: C) -> Option<(Tendril, R)> where C: FnMut(char) -> R, R: PartialEq, { let (class, first_mismatch); { let mut chars = unsafe { F::char_indices(self.as_byte_slice()) }; let (_, first) = unwrap_or_return!(chars.next(), None); class = classify(first); first_mismatch = chars.find(|&(_, ch)| &classify(ch) != &class); } match first_mismatch { Some((idx, _)) => unsafe { let t = self.unsafe_subtendril(0, idx as u32); self.unsafe_pop_front(idx as u32); Some((t, class)) }, None => { let t = self.clone(); self.clear(); Some((t, class)) } } } /// Push a character, if it can be represented in this format. #[inline] pub fn try_push_char(&mut self, c: char) -> Result<(), ()> { F::encode_char(c, |b| unsafe { self.push_bytes_without_validating(b); }) } } /// Extension trait for `io::Read`. pub trait ReadExt: io::Read { fn read_to_tendril(&mut self, buf: &mut Tendril) -> io::Result where A: Atomicity; } impl ReadExt for T where T: io::Read, { /// Read all bytes until EOF. 
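    // Editor's illustration (not part of the original source): any `io::Read` value gets
    // this method; e.g., as in the `read` test below, reading from an in-memory cursor:
    //
    //     use std::io::Cursor;
    //     let mut buf = ByteTendril::new();
    //     let n = Cursor::new(&b"abcd"[..]).read_to_tendril(&mut buf).unwrap();
    //     assert_eq!(4, n);
    //     assert_eq!(b"abcd", &*buf);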
fn read_to_tendril(&mut self, buf: &mut Tendril) -> io::Result where A: Atomicity, { // Adapted from libstd/io/mod.rs. const DEFAULT_BUF_SIZE: u32 = 64 * 1024; let start_len = buf.len(); let mut len = start_len; let mut new_write_size = 16; let ret; loop { if len == buf.len() { if new_write_size < DEFAULT_BUF_SIZE { new_write_size *= 2; } // FIXME: this exposes uninitialized bytes to a generic R type // this is fine for R=File which never reads these bytes, // but user-defined types might. // The standard library pushes zeros to `Vec` for that reason. unsafe { buf.push_uninitialized(new_write_size); } } match self.read(&mut buf[len..]) { Ok(0) => { ret = Ok(len - start_len); break; } Ok(n) => len += n, Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} Err(e) => { ret = Err(e); break; } } } let buf_len = buf.len32(); buf.pop_back(buf_len - (len as u32)); ret } } impl io::Write for Tendril where A: Atomicity, { #[inline] fn write(&mut self, buf: &[u8]) -> io::Result { self.push_slice(buf); Ok(buf.len()) } #[inline] fn write_all(&mut self, buf: &[u8]) -> io::Result<()> { self.push_slice(buf); Ok(()) } #[inline(always)] fn flush(&mut self) -> io::Result<()> { Ok(()) } } #[cfg(feature = "encoding")] impl encoding::ByteWriter for Tendril where A: Atomicity, { #[inline] fn write_byte(&mut self, b: u8) { self.push_slice(&[b]); } #[inline] fn write_bytes(&mut self, v: &[u8]) { self.push_slice(v); } #[inline] fn writer_hint(&mut self, additional: usize) { self.reserve(::std::cmp::min(u32::MAX as usize, additional) as u32); } } impl Tendril where A: Atomicity, F: fmt::SliceFormat, { /// Decode from some character encoding into UTF-8. /// /// See the [rust-encoding docs](https://lifthrasiir.github.io/rust-encoding/encoding/) /// for more information. #[inline] #[cfg(feature = "encoding")] pub fn decode( &self, encoding: EncodingRef, trap: DecoderTrap, ) -> Result, ::std::borrow::Cow<'static, str>> { let mut ret = Tendril::new(); encoding.decode_to(&*self, trap, &mut ret).map(|_| ret) } /// Push "uninitialized bytes" onto the end. /// /// Really, this grows the tendril without writing anything to the new area. /// It's only defined for byte tendrils because it's only useful if you /// plan to then mutate the buffer. #[inline] pub unsafe fn push_uninitialized(&mut self, n: u32) { let new_len = self.len32().checked_add(n).expect(OFLOW); if new_len <= MAX_INLINE_LEN as u32 && self.ptr.get().get() <= MAX_INLINE_TAG { self.ptr.set(inline_tag(new_len)) } else { self.make_owned_with_capacity(new_len); self.set_len(new_len); } } } impl strfmt::Display for Tendril where A: Atomicity, { #[inline] fn fmt(&self, f: &mut strfmt::Formatter) -> strfmt::Result { ::fmt(&**self, f) } } impl str::FromStr for Tendril where A: Atomicity, { type Err = (); #[inline] fn from_str(s: &str) -> Result { Ok(Tendril::from_slice(s)) } } impl strfmt::Write for Tendril where A: Atomicity, { #[inline] fn write_str(&mut self, s: &str) -> strfmt::Result { self.push_slice(s); Ok(()) } } #[cfg(feature = "encoding")] impl encoding::StringWriter for Tendril where A: Atomicity, { #[inline] fn write_char(&mut self, c: char) { self.push_char(c); } #[inline] fn write_str(&mut self, s: &str) { self.push_slice(s); } #[inline] fn writer_hint(&mut self, additional: usize) { self.reserve(::std::cmp::min(u32::MAX as usize, additional) as u32); } } impl Tendril where A: Atomicity, { /// Encode from UTF-8 into some other character encoding. 
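    // Editor's illustration (not part of the original source; assumes the `encoding`
    // feature): round-tripping through a legacy encoding with `encode` and `decode`,
    // mirroring the `encode`/`decode` tests below:
    //
    //     use encoding::{all, DecoderTrap, EncoderTrap};
    //     let koi8 = "Энергия".to_tendril().encode(all::KOI8_U, EncoderTrap::Strict).unwrap();
    //     let back = koi8.decode(all::KOI8_U, DecoderTrap::Strict).unwrap();
    //     assert_eq!("Энергия", &*back);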
/// /// See the [rust-encoding docs](https://lifthrasiir.github.io/rust-encoding/encoding/) /// for more information. #[inline] #[cfg(feature = "encoding")] pub fn encode( &self, encoding: EncodingRef, trap: EncoderTrap, ) -> Result, ::std::borrow::Cow<'static, str>> { let mut ret = Tendril::new(); encoding.encode_to(&*self, trap, &mut ret).map(|_| ret) } /// Push a character onto the end. #[inline] pub fn push_char(&mut self, c: char) { unsafe { self.push_bytes_without_validating(c.encode_utf8(&mut [0_u8; 4]).as_bytes()); } } /// Create a `Tendril` from a single character. #[inline] pub fn from_char(c: char) -> Tendril { let mut t: Tendril = Tendril::new(); t.push_char(c); t } /// Helper for the `format_tendril!` macro. #[inline] pub fn format(args: strfmt::Arguments) -> Tendril { use std::fmt::Write; let mut output: Tendril = Tendril::new(); let _ = write!(&mut output, "{}", args); output } } /// Create a `StrTendril` through string formatting. /// /// Works just like the standard `format!` macro. #[macro_export] macro_rules! format_tendril { ($($arg:tt)*) => ($crate::StrTendril::format(format_args!($($arg)*))) } impl<'a, F, A> From<&'a F::Slice> for Tendril where F: fmt::SliceFormat, A: Atomicity, { #[inline] fn from(input: &F::Slice) -> Tendril { Tendril::from_slice(input) } } impl From for Tendril where A: Atomicity, { #[inline] fn from(input: String) -> Tendril { Tendril::from_slice(&*input) } } impl AsRef for Tendril where F: fmt::SliceFormat, A: Atomicity, { #[inline] fn as_ref(&self) -> &F::Slice { &**self } } impl From> for String where A: Atomicity, { #[inline] fn from(input: Tendril) -> String { String::from(&*input) } } impl<'a, A> From<&'a Tendril> for String where A: Atomicity, { #[inline] fn from(input: &'a Tendril) -> String { String::from(&**input) } } #[cfg(all(test, feature = "bench"))] #[path = "bench.rs"] mod bench; #[cfg(test)] mod test { use super::{ Atomic, ByteTendril, Header, NonAtomic, ReadExt, SendTendril, SliceExt, StrTendril, Tendril, }; use fmt; use std::iter; use std::thread; fn assert_send() {} #[test] fn smoke_test() { assert_eq!("", &*"".to_tendril()); assert_eq!("abc", &*"abc".to_tendril()); assert_eq!("Hello, world!", &*"Hello, world!".to_tendril()); assert_eq!(b"", &*b"".to_tendril()); assert_eq!(b"abc", &*b"abc".to_tendril()); assert_eq!(b"Hello, world!", &*b"Hello, world!".to_tendril()); } #[test] fn assert_sizes() { use std::mem; struct EmptyWithDrop; impl Drop for EmptyWithDrop { fn drop(&mut self) {} } let compiler_uses_inline_drop_flags = mem::size_of::() > 0; let correct = mem::size_of::<*const ()>() + 8 + if compiler_uses_inline_drop_flags { 1 } else { 0 }; assert_eq!(correct, mem::size_of::()); assert_eq!(correct, mem::size_of::()); assert_eq!(correct, mem::size_of::>()); assert_eq!(correct, mem::size_of::>()); assert_eq!( mem::size_of::<*const ()>() * 2, mem::size_of::>(), ); assert_eq!( mem::size_of::>(), mem::size_of::>(), ); } #[test] fn validate_utf8() { assert!(ByteTendril::try_from_byte_slice(b"\xFF").is_ok()); assert!(StrTendril::try_from_byte_slice(b"\xFF").is_err()); assert!(StrTendril::try_from_byte_slice(b"\xEA\x99\xFF").is_err()); assert!(StrTendril::try_from_byte_slice(b"\xEA\x99").is_err()); assert!(StrTendril::try_from_byte_slice(b"\xEA\x99\xAE\xEA").is_err()); assert_eq!( "\u{a66e}", &*StrTendril::try_from_byte_slice(b"\xEA\x99\xAE").unwrap() ); let mut t = StrTendril::new(); assert!(t.try_push_bytes(b"\xEA\x99").is_err()); assert!(t.try_push_bytes(b"\xAE").is_err()); assert!(t.try_push_bytes(b"\xEA\x99\xAE").is_ok()); 
assert_eq!("\u{a66e}", &*t); } #[test] fn share_and_unshare() { let s = b"foobarbaz".to_tendril(); assert_eq!(b"foobarbaz", &*s); assert!(!s.is_shared()); let mut t = s.clone(); assert_eq!(s.as_ptr(), t.as_ptr()); assert!(s.is_shared()); assert!(t.is_shared()); t.push_slice(b"quux"); assert_eq!(b"foobarbaz", &*s); assert_eq!(b"foobarbazquux", &*t); assert!(s.as_ptr() != t.as_ptr()); assert!(!t.is_shared()); } #[test] fn format_display() { assert_eq!("foobar", &*format!("{}", "foobar".to_tendril())); let mut s = "foo".to_tendril(); assert_eq!("foo", &*format!("{}", s)); let t = s.clone(); assert_eq!("foo", &*format!("{}", s)); assert_eq!("foo", &*format!("{}", t)); s.push_slice("barbaz!"); assert_eq!("foobarbaz!", &*format!("{}", s)); assert_eq!("foo", &*format!("{}", t)); } #[test] fn format_debug() { assert_eq!( r#"Tendril(inline: "foobar")"#, &*format!("{:?}", "foobar".to_tendril()) ); assert_eq!( r#"Tendril(inline: [102, 111, 111, 98, 97, 114])"#, &*format!("{:?}", b"foobar".to_tendril()) ); let t = "anextralongstring".to_tendril(); assert_eq!( r#"Tendril(owned: "anextralongstring")"#, &*format!("{:?}", t) ); let _ = t.clone(); assert_eq!( r#"Tendril(shared: "anextralongstring")"#, &*format!("{:?}", t) ); } #[test] fn subtendril() { assert_eq!("foo".to_tendril(), "foo-bar".to_tendril().subtendril(0, 3)); assert_eq!("bar".to_tendril(), "foo-bar".to_tendril().subtendril(4, 3)); let mut t = "foo-bar".to_tendril(); t.pop_front(2); assert_eq!("o-bar".to_tendril(), t); t.pop_back(1); assert_eq!("o-ba".to_tendril(), t); assert_eq!( "foo".to_tendril(), "foo-a-longer-string-bar-baz".to_tendril().subtendril(0, 3) ); assert_eq!( "oo-a-".to_tendril(), "foo-a-longer-string-bar-baz".to_tendril().subtendril(1, 5) ); assert_eq!( "bar".to_tendril(), "foo-a-longer-string-bar-baz".to_tendril().subtendril(20, 3) ); let mut t = "another rather long string".to_tendril(); t.pop_front(2); assert!(t.starts_with("other rather")); t.pop_back(1); assert_eq!("other rather long strin".to_tendril(), t); assert!(t.is_shared()); } #[test] fn subtendril_invalid() { assert!("\u{a66e}".to_tendril().try_subtendril(0, 2).is_err()); assert!("\u{a66e}".to_tendril().try_subtendril(1, 2).is_err()); assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 3).is_err()); assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 2).is_err()); assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 1).is_err()); assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 3).is_err()); assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 2).is_err()); assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 1).is_err()); assert!("\u{1f4a9}".to_tendril().try_subtendril(2, 2).is_err()); assert!("\u{1f4a9}".to_tendril().try_subtendril(2, 1).is_err()); assert!("\u{1f4a9}".to_tendril().try_subtendril(3, 1).is_err()); let mut t = "\u{1f4a9}zzzzzz".to_tendril(); assert!(t.try_pop_front(1).is_err()); assert!(t.try_pop_front(2).is_err()); assert!(t.try_pop_front(3).is_err()); assert!(t.try_pop_front(4).is_ok()); assert_eq!("zzzzzz", &*t); let mut t = "zzzzzz\u{1f4a9}".to_tendril(); assert!(t.try_pop_back(1).is_err()); assert!(t.try_pop_back(2).is_err()); assert!(t.try_pop_back(3).is_err()); assert!(t.try_pop_back(4).is_ok()); assert_eq!("zzzzzz", &*t); } #[test] fn conversion() { assert_eq!( &[0x66, 0x6F, 0x6F].to_tendril(), "foo".to_tendril().as_bytes() ); assert_eq!( [0x66, 0x6F, 0x6F].to_tendril(), "foo".to_tendril().into_bytes() ); let ascii: Tendril = b"hello".to_tendril().try_reinterpret().unwrap(); assert_eq!(&"hello".to_tendril(), ascii.as_superset()); 
assert_eq!("hello".to_tendril(), ascii.clone().into_superset()); assert!(b"\xFF" .to_tendril() .try_reinterpret::() .is_err()); let t = "hello".to_tendril(); let ascii: &Tendril = t.try_as_subset().unwrap(); assert_eq!(b"hello", &**ascii.as_bytes()); assert!("ő" .to_tendril() .try_reinterpret_view::() .is_err()); assert!("ő".to_tendril().try_as_subset::().is_err()); let ascii: Tendril = "hello".to_tendril().try_into_subset().unwrap(); assert_eq!(b"hello", &**ascii.as_bytes()); assert!("ő".to_tendril().try_reinterpret::().is_err()); assert!("ő".to_tendril().try_into_subset::().is_err()); } #[test] fn clear() { let mut t = "foo-".to_tendril(); t.clear(); assert_eq!(t.len(), 0); assert_eq!(t.len32(), 0); assert_eq!(&*t, ""); let mut t = "much longer".to_tendril(); let s = t.clone(); t.clear(); assert_eq!(t.len(), 0); assert_eq!(t.len32(), 0); assert_eq!(&*t, ""); assert_eq!(&*s, "much longer"); } #[test] fn push_tendril() { let mut t = "abc".to_tendril(); t.push_tendril(&"xyz".to_tendril()); assert_eq!("abcxyz", &*t); } #[test] fn wtf8() { assert!(Tendril::::try_from_byte_slice(b"\xED\xA0\xBD").is_ok()); assert!(Tendril::::try_from_byte_slice(b"\xED\xB2\xA9").is_ok()); assert!(Tendril::::try_from_byte_slice(b"\xED\xA0\xBD\xED\xB2\xA9").is_err()); let t: Tendril = Tendril::try_from_byte_slice(b"\xED\xA0\xBD\xEA\x99\xAE").unwrap(); assert!(b"\xED\xA0\xBD".to_tendril().try_reinterpret().unwrap() == t.subtendril(0, 3)); assert!(b"\xEA\x99\xAE".to_tendril().try_reinterpret().unwrap() == t.subtendril(3, 3)); assert!(t.try_reinterpret_view::().is_err()); assert!(t.try_subtendril(0, 1).is_err()); assert!(t.try_subtendril(0, 2).is_err()); assert!(t.try_subtendril(1, 1).is_err()); assert!(t.try_subtendril(3, 1).is_err()); assert!(t.try_subtendril(3, 2).is_err()); assert!(t.try_subtendril(4, 1).is_err()); // paired surrogates let mut t: Tendril = Tendril::try_from_byte_slice(b"\xED\xA0\xBD").unwrap(); assert!(t.try_push_bytes(b"\xED\xB2\xA9").is_ok()); assert_eq!(b"\xF0\x9F\x92\xA9", t.as_byte_slice()); assert!(t.try_reinterpret_view::().is_ok()); // unpaired surrogates let mut t: Tendril = Tendril::try_from_byte_slice(b"\xED\xA0\xBB").unwrap(); assert!(t.try_push_bytes(b"\xED\xA0").is_err()); assert!(t.try_push_bytes(b"\xED").is_err()); assert!(t.try_push_bytes(b"\xA0").is_err()); assert!(t.try_push_bytes(b"\xED\xA0\xBD").is_ok()); assert_eq!(b"\xED\xA0\xBB\xED\xA0\xBD", t.as_byte_slice()); assert!(t.try_push_bytes(b"\xED\xB2\xA9").is_ok()); assert_eq!(b"\xED\xA0\xBB\xF0\x9F\x92\xA9", t.as_byte_slice()); assert!(t.try_reinterpret_view::().is_err()); } #[test] fn front_char() { let mut t = "".to_tendril(); assert_eq!(None, t.pop_front_char()); assert_eq!(None, t.pop_front_char()); let mut t = "abc".to_tendril(); assert_eq!(Some('a'), t.pop_front_char()); assert_eq!(Some('b'), t.pop_front_char()); assert_eq!(Some('c'), t.pop_front_char()); assert_eq!(None, t.pop_front_char()); assert_eq!(None, t.pop_front_char()); let mut t = "főo-a-longer-string-bar-baz".to_tendril(); assert_eq!(28, t.len()); assert_eq!(Some('f'), t.pop_front_char()); assert_eq!(Some('ő'), t.pop_front_char()); assert_eq!(Some('o'), t.pop_front_char()); assert_eq!(Some('-'), t.pop_front_char()); assert_eq!(23, t.len()); } #[test] fn char_run() { for &(s, exp) in &[ ("", None), (" ", Some((" ", true))), ("x", Some(("x", false))), (" \t \n", Some((" \t \n", true))), ("xyzzy", Some(("xyzzy", false))), (" xyzzy", Some((" ", true))), ("xyzzy ", Some(("xyzzy", false))), (" xyzzy ", Some((" ", true))), ("xyzzy hi", Some(("xyzzy", false))), ("中 
", Some(("中", false))), (" 中 ", Some((" ", true))), (" 中 ", Some((" ", true))), (" 中 ", Some((" ", true))), ] { let mut t = s.to_tendril(); let res = t.pop_front_char_run(char::is_whitespace); match exp { None => assert!(res.is_none()), Some((es, ec)) => { let (rt, rc) = res.unwrap(); assert_eq!(es, &*rt); assert_eq!(ec, rc); } } } } #[test] fn deref_mut_inline() { let mut t = "xyő".to_tendril().into_bytes(); t[3] = 0xff; assert_eq!(b"xy\xC5\xFF", &*t); assert!(t.try_reinterpret_view::().is_err()); t[3] = 0x8b; assert_eq!("xyŋ", &**t.try_reinterpret_view::().unwrap()); unsafe { t.push_uninitialized(3); t[4] = 0xEA; t[5] = 0x99; t[6] = 0xAE; assert_eq!( "xyŋ\u{a66e}", &**t.try_reinterpret_view::().unwrap() ); t.push_uninitialized(20); t.pop_back(20); assert_eq!( "xyŋ\u{a66e}", &**t.try_reinterpret_view::().unwrap() ); } } #[test] fn deref_mut() { let mut t = b"0123456789".to_tendril(); let u = t.clone(); assert!(t.is_shared()); t[9] = 0xff; assert!(!t.is_shared()); assert_eq!(b"0123456789", &*u); assert_eq!(b"012345678\xff", &*t); } #[test] fn push_char() { let mut t = "xyz".to_tendril(); t.push_char('o'); assert_eq!("xyzo", &*t); t.push_char('ő'); assert_eq!("xyzoő", &*t); t.push_char('\u{a66e}'); assert_eq!("xyzoő\u{a66e}", &*t); t.push_char('\u{1f4a9}'); assert_eq!("xyzoő\u{a66e}\u{1f4a9}", &*t); assert_eq!(t.len(), 13); } #[test] #[cfg(feature = "encoding")] fn encode() { use encoding::{all, EncoderTrap}; let t = "안녕하세요 러스트".to_tendril(); assert_eq!( b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4\x20\xb7\xaf\xbd\xba\xc6\xae", &*t.encode(all::WINDOWS_949, EncoderTrap::Strict).unwrap() ); let t = "Энергия пробуждения ия-я-я! \u{a66e}".to_tendril(); assert_eq!( b"\xfc\xce\xc5\xd2\xc7\xc9\xd1 \xd0\xd2\xcf\xc2\xd5\xd6\xc4\xc5\xce\ \xc9\xd1 \xc9\xd1\x2d\xd1\x2d\xd1\x21 ?", &*t.encode(all::KOI8_U, EncoderTrap::Replace).unwrap() ); let t = "\u{1f4a9}".to_tendril(); assert!(t.encode(all::WINDOWS_1252, EncoderTrap::Strict).is_err()); } #[test] #[cfg(feature = "encoding")] fn decode() { use encoding::{all, DecoderTrap}; let t = b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\ \xbf\xe4\x20\xb7\xaf\xbd\xba\xc6\xae" .to_tendril(); assert_eq!( "안녕하세요 러스트", &*t.decode(all::WINDOWS_949, DecoderTrap::Strict).unwrap() ); let t = b"\xfc\xce\xc5\xd2\xc7\xc9\xd1 \xd0\xd2\xcf\xc2\xd5\xd6\xc4\xc5\xce\ \xc9\xd1 \xc9\xd1\x2d\xd1\x2d\xd1\x21" .to_tendril(); assert_eq!( "Энергия пробуждения ия-я-я!", &*t.decode(all::KOI8_U, DecoderTrap::Replace).unwrap() ); let t = b"x \xff y".to_tendril(); assert!(t.decode(all::UTF_8, DecoderTrap::Strict).is_err()); let t = b"x \xff y".to_tendril(); assert_eq!( "x \u{fffd} y", &*t.decode(all::UTF_8, DecoderTrap::Replace).unwrap() ); } #[test] fn ascii() { fn mk(x: &[u8]) -> Tendril { x.to_tendril().try_reinterpret().unwrap() } let mut t = mk(b"xyz"); assert_eq!(Some('x'), t.pop_front_char()); assert_eq!(Some('y'), t.pop_front_char()); assert_eq!(Some('z'), t.pop_front_char()); assert_eq!(None, t.pop_front_char()); let mut t = mk(b" \t xyz"); assert!(Some((mk(b" \t "), true)) == t.pop_front_char_run(char::is_whitespace)); assert!(Some((mk(b"xyz"), false)) == t.pop_front_char_run(char::is_whitespace)); assert!(t.pop_front_char_run(char::is_whitespace).is_none()); let mut t = Tendril::::new(); assert!(t.try_push_char('x').is_ok()); assert!(t.try_push_char('\0').is_ok()); assert!(t.try_push_char('\u{a0}').is_err()); assert_eq!(b"x\0", t.as_byte_slice()); } #[test] fn latin1() { fn mk(x: &[u8]) -> Tendril { x.to_tendril().try_reinterpret().unwrap() } let mut t = mk(b"\xd8_\xd8"); assert_eq!(Some('Ø'), 
t.pop_front_char()); assert_eq!(Some('_'), t.pop_front_char()); assert_eq!(Some('Ø'), t.pop_front_char()); assert_eq!(None, t.pop_front_char()); let mut t = mk(b" \t \xfe\xa7z"); assert!(Some((mk(b" \t "), true)) == t.pop_front_char_run(char::is_whitespace)); assert!(Some((mk(b"\xfe\xa7z"), false)) == t.pop_front_char_run(char::is_whitespace)); assert!(t.pop_front_char_run(char::is_whitespace).is_none()); let mut t = Tendril::::new(); assert!(t.try_push_char('x').is_ok()); assert!(t.try_push_char('\0').is_ok()); assert!(t.try_push_char('\u{a0}').is_ok()); assert!(t.try_push_char('ő').is_err()); assert!(t.try_push_char('я').is_err()); assert!(t.try_push_char('\u{a66e}').is_err()); assert!(t.try_push_char('\u{1f4a9}').is_err()); assert_eq!(b"x\0\xa0", t.as_byte_slice()); } #[test] fn format() { assert_eq!("", &*format_tendril!("")); assert_eq!( "two and two make 4", &*format_tendril!("two and two make {}", 2 + 2) ); } #[test] fn merge_shared() { let t = "012345678901234567890123456789".to_tendril(); let a = t.subtendril(10, 20); assert!(a.is_shared()); assert_eq!("01234567890123456789", &*a); let mut b = t.subtendril(0, 10); assert!(b.is_shared()); assert_eq!("0123456789", &*b); b.push_tendril(&a); assert!(b.is_shared()); assert!(a.is_shared()); assert!(a.is_shared_with(&b)); assert!(b.is_shared_with(&a)); assert_eq!("012345678901234567890123456789", &*b); assert!(t.is_shared()); assert!(t.is_shared_with(&a)); assert!(t.is_shared_with(&b)); } #[test] fn merge_cant_share() { let t = "012345678901234567890123456789".to_tendril(); let mut b = t.subtendril(0, 10); assert!(b.is_shared()); assert_eq!("0123456789", &*b); b.push_tendril(&"abcd".to_tendril()); assert!(!b.is_shared()); assert_eq!("0123456789abcd", &*b); } #[test] fn shared_doesnt_reserve() { let mut t = "012345678901234567890123456789".to_tendril(); let a = t.subtendril(1, 10); assert!(t.is_shared()); t.reserve(10); assert!(t.is_shared()); let _ = a; } #[test] fn out_of_bounds() { assert!("".to_tendril().try_subtendril(0, 1).is_err()); assert!("abc".to_tendril().try_subtendril(0, 4).is_err()); assert!("abc".to_tendril().try_subtendril(3, 1).is_err()); assert!("abc".to_tendril().try_subtendril(7, 1).is_err()); let mut t = "".to_tendril(); assert!(t.try_pop_front(1).is_err()); assert!(t.try_pop_front(5).is_err()); assert!(t.try_pop_front(500).is_err()); assert!(t.try_pop_back(1).is_err()); assert!(t.try_pop_back(5).is_err()); assert!(t.try_pop_back(500).is_err()); let mut t = "abcd".to_tendril(); assert!(t.try_pop_front(1).is_ok()); assert!(t.try_pop_front(4).is_err()); assert!(t.try_pop_front(500).is_err()); assert!(t.try_pop_back(1).is_ok()); assert!(t.try_pop_back(3).is_err()); assert!(t.try_pop_back(500).is_err()); } #[test] fn compare() { for &a in &[ "indiscretions", "validity", "hallucinogenics", "timelessness", "original", "microcosms", "boilers", "mammoth", ] { for &b in &[ "intrepidly", "frigid", "spa", "cardigans", "guileful", "evaporated", "unenthusiastic", "legitimate", ] { let ta = a.to_tendril(); let tb = b.to_tendril(); assert_eq!(a.eq(b), ta.eq(&tb)); assert_eq!(a.ne(b), ta.ne(&tb)); assert_eq!(a.lt(b), ta.lt(&tb)); assert_eq!(a.le(b), ta.le(&tb)); assert_eq!(a.gt(b), ta.gt(&tb)); assert_eq!(a.ge(b), ta.ge(&tb)); assert_eq!(a.partial_cmp(b), ta.partial_cmp(&tb)); assert_eq!(a.cmp(b), ta.cmp(&tb)); } } } #[test] fn extend_and_from_iterator() { // Testing Extend and FromIterator for the various Ts. 
// Tendril let mut t = "Hello".to_tendril(); t.extend(None::<&Tendril<_>>.into_iter()); assert_eq!("Hello", &*t); t.extend(&[", ".to_tendril(), "world".to_tendril(), "!".to_tendril()]); assert_eq!("Hello, world!", &*t); assert_eq!( "Hello, world!", &*[ "Hello".to_tendril(), ", ".to_tendril(), "world".to_tendril(), "!".to_tendril() ] .iter() .collect::() ); // &str let mut t = "Hello".to_tendril(); t.extend(None::<&str>.into_iter()); assert_eq!("Hello", &*t); t.extend([", ", "world", "!"].iter().map(|&s| s)); assert_eq!("Hello, world!", &*t); assert_eq!( "Hello, world!", &*["Hello", ", ", "world", "!"] .iter() .map(|&s| s) .collect::() ); // &[u8] let mut t = b"Hello".to_tendril(); t.extend(None::<&[u8]>.into_iter()); assert_eq!(b"Hello", &*t); t.extend( [b", ".as_ref(), b"world".as_ref(), b"!".as_ref()] .iter() .map(|&s| s), ); assert_eq!(b"Hello, world!", &*t); assert_eq!( b"Hello, world!", &*[ b"Hello".as_ref(), b", ".as_ref(), b"world".as_ref(), b"!".as_ref() ] .iter() .map(|&s| s) .collect::() ); let string = "the quick brown fox jumps over the lazy dog"; let string_expected = string.to_tendril(); let bytes = string.as_bytes(); let bytes_expected = bytes.to_tendril(); // char assert_eq!(string_expected, string.chars().collect()); let mut tendril = StrTendril::new(); tendril.extend(string.chars()); assert_eq!(string_expected, tendril); // &u8 assert_eq!(bytes_expected, bytes.iter().collect()); let mut tendril = ByteTendril::new(); tendril.extend(bytes); assert_eq!(bytes_expected, tendril); // u8 assert_eq!(bytes_expected, bytes.iter().map(|&b| b).collect()); let mut tendril = ByteTendril::new(); tendril.extend(bytes.iter().map(|&b| b)); assert_eq!(bytes_expected, tendril); } #[test] fn from_str() { use std::str::FromStr; let t: Tendril<_> = FromStr::from_str("foo bar baz").unwrap(); assert_eq!("foo bar baz", &*t); } #[test] fn from_char() { assert_eq!("o", &*StrTendril::from_char('o')); assert_eq!("ő", &*StrTendril::from_char('ő')); assert_eq!("\u{a66e}", &*StrTendril::from_char('\u{a66e}')); assert_eq!("\u{1f4a9}", &*StrTendril::from_char('\u{1f4a9}')); } #[test] #[cfg_attr(miri, ignore)] // slow fn read() { fn check(x: &[u8]) { use std::io::Cursor; let mut t = ByteTendril::new(); assert_eq!(x.len(), Cursor::new(x).read_to_tendril(&mut t).unwrap()); assert_eq!(x, &*t); } check(b""); check(b"abcd"); let long: Vec = iter::repeat(b'x').take(1_000_000).collect(); check(&long); } #[test] fn hash_map_key() { use std::collections::HashMap; // As noted with Borrow, indexing on HashMap is byte-based because of // https://github.com/rust-lang/rust/issues/27108. 
        let mut map = HashMap::new();
        map.insert("foo".to_tendril(), 1);
        assert_eq!(map.get(b"foo".as_ref()), Some(&1));
        assert_eq!(map.get(b"bar".as_ref()), None);

        let mut map = HashMap::new();
        map.insert(b"foo".to_tendril(), 1);
        assert_eq!(map.get(b"foo".as_ref()), Some(&1));
        assert_eq!(map.get(b"bar".as_ref()), None);
    }

    #[test]
    fn atomic() {
        assert_send::<Tendril<fmt::UTF8, Atomic>>();
        let s: Tendril<fmt::UTF8, Atomic> = Tendril::from_slice("this is a string");
        assert!(!s.is_shared());

        let mut t = s.clone();
        assert!(s.is_shared());

        let sp = s.as_ptr() as usize;
        thread::spawn(move || {
            assert!(t.is_shared());
            t.push_slice(" extended");
            assert_eq!("this is a string extended", &*t);
            assert!(t.as_ptr() as usize != sp);
            assert!(!t.is_shared());
        })
        .join()
        .unwrap();

        assert!(s.is_shared());
        assert_eq!("this is a string", &*s);
    }

    #[test]
    fn send() {
        assert_send::<SendTendril<fmt::UTF8>>();
        let s = "this is a string".to_tendril();
        let t = s.clone();
        let s2 = s.into_send();
        thread::spawn(move || {
            let s = StrTendril::from(s2);
            assert!(!s.is_shared());
            assert_eq!("this is a string", &*s);
        })
        .join()
        .unwrap();
        assert_eq!("this is a string", &*t);
    }

    /// https://github.com/servo/tendril/issues/58
    #[test]
    fn issue_58() {
        let data = "

Hello!

, World!";
        let s: Tendril<fmt::UTF8> = data.into();
        assert_eq!(&*s, data);
        let s: Tendril<fmt::UTF8> = s.into_send().into();
        assert_eq!(&*s, data);
    }

    #[test]
    fn inline_send() {
        let s = "x".to_tendril();
        let t = s.clone();
        let s2 = s.into_send();
        thread::spawn(move || {
            let s = StrTendril::from(s2);
            assert!(!s.is_shared());
            assert_eq!("x", &*s);
        })
        .join()
        .unwrap();
        assert_eq!("x", &*t);
    }
}
tendril-0.4.3/src/utf8_decode.rs000064400000000000000000000065250072674642500146570ustar 00000000000000
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use fmt;
use tendril::{Atomicity, Tendril};
use utf8;

pub struct IncompleteUtf8(utf8::Incomplete);

impl<A> Tendril<fmt::Bytes, A>
where
    A: Atomicity,
{
    pub fn decode_utf8_lossy<F>(mut self, mut push_utf8: F) -> Option<IncompleteUtf8>
    where
        F: FnMut(Tendril<fmt::UTF8, A>),
    {
        loop {
            if self.is_empty() {
                return None;
            }
            // Decode against a borrowed view first and reduce the outcome to
            // plain lengths/offsets, so that `self` is no longer borrowed when
            // it is consumed or truncated below.
            let unborrowed_result = match utf8::decode(&self) {
                Ok(s) => {
                    debug_assert!(s.as_ptr() == self.as_ptr());
                    debug_assert!(s.len() == self.len());
                    Ok(())
                }
                Err(utf8::DecodeError::Invalid {
                    valid_prefix,
                    invalid_sequence,
                    ..
                }) => {
                    debug_assert!(valid_prefix.as_ptr() == self.as_ptr());
                    debug_assert!(valid_prefix.len() <= self.len());
                    Err((
                        valid_prefix.len(),
                        Err(valid_prefix.len() + invalid_sequence.len()),
                    ))
                }
                Err(utf8::DecodeError::Incomplete {
                    valid_prefix,
                    incomplete_suffix,
                }) => {
                    debug_assert!(valid_prefix.as_ptr() == self.as_ptr());
                    debug_assert!(valid_prefix.len() <= self.len());
                    Err((valid_prefix.len(), Ok(incomplete_suffix)))
                }
            };
            match unborrowed_result {
                Ok(()) => {
                    unsafe { push_utf8(self.reinterpret_without_validating()) }
                    return None;
                }
                Err((valid_len, and_then)) => {
                    if valid_len > 0 {
                        let subtendril = self.subtendril(0, valid_len as u32);
                        unsafe { push_utf8(subtendril.reinterpret_without_validating()) }
                    }
                    match and_then {
                        Ok(incomplete) => return Some(IncompleteUtf8(incomplete)),
                        Err(offset) => {
                            push_utf8(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
                            self.pop_front(offset as u32)
                        }
                    }
                }
            }
        }
    }
}

impl IncompleteUtf8 {
    pub fn try_complete<F, A>(
        &mut self,
        mut input: Tendril<fmt::Bytes, A>,
        mut push_utf8: F,
    ) -> Result<Tendril<fmt::Bytes, A>, ()>
    where
        A: Atomicity,
        F: FnMut(Tendril<fmt::UTF8, A>),
    {
        let resume_at;
        match self.0.try_complete(&input) {
            None => return Err(()),
            Some((result, rest)) => {
                push_utf8(Tendril::from_slice(
                    result.unwrap_or(utf8::REPLACEMENT_CHARACTER),
                ));
                resume_at = input.len() - rest.len();
            }
        }
        input.pop_front(resume_at as u32);
        Ok(input)
    }
}
tendril-0.4.3/src/util.rs000064400000000000000000000026060072674642500134370ustar 00000000000000
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::mem;
use std::{ptr, slice};

#[inline(always)]
pub unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
    debug_assert!(start <= buf.len());
    debug_assert!(new_len <= (buf.len() - start));
    slice::from_raw_parts(buf.as_ptr().offset(start as isize), new_len)
}

#[inline(always)]
pub unsafe fn unsafe_slice_mut<'a>(
    buf: &'a mut [u8],
    start: usize,
    new_len: usize,
) -> &'a mut [u8] {
    debug_assert!(start <= buf.len());
    debug_assert!(new_len <= (buf.len() - start));
    slice::from_raw_parts_mut(buf.as_mut_ptr().offset(start as isize), new_len)
}

#[inline(always)]
pub unsafe fn copy_and_advance(dest: &mut *mut u8, src: &[u8]) {
    ptr::copy_nonoverlapping(src.as_ptr(), *dest, src.len());
    *dest = dest.offset(src.len() as isize)
}

#[inline(always)]
pub unsafe fn copy_lifetime_mut<'a, S: ?Sized, T: ?Sized + 'a>(
    _ptr: &'a mut S,
    ptr: &mut T,
) -> &'a mut T {
    mem::transmute(ptr)
}

#[inline(always)]
pub unsafe fn copy_lifetime<'a, S: ?Sized, T: ?Sized + 'a>(_ptr: &'a S, ptr: &T) -> &'a T {
    mem::transmute(ptr)
}
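
The decode_utf8_lossy / try_complete pair in src/utf8_decode.rs above is easiest to follow on a concrete input. The sketch below is illustrative only and is not part of the crate: the module name decode_sketch, the chunk values, and the assertions are invented for the example, and it assumes the two methods are reachable from in-crate test code. It feeds a two-byte UTF-8 character split across two byte chunks: the valid prefix is pushed immediately, the dangling lead byte is carried in IncompleteUtf8, and the next chunk completes it before normal decoding resumes.

// Hypothetical in-crate test module (not part of the original sources); it
// exercises Tendril::decode_utf8_lossy and IncompleteUtf8::try_complete.
#[cfg(test)]
mod decode_sketch {
    use tendril::{ByteTendril, StrTendril};

    #[test]
    fn split_multibyte_char() {
        // U+00E9 is 0xC3 0xA9 in UTF-8; split it across two chunks.
        let first = ByteTendril::from_slice(b"ab\xc3");
        let second = ByteTendril::from_slice(b"\xa9cd");

        let mut out = StrTendril::new();

        // "ab" is complete and gets pushed; the lone 0xC3 lead byte is
        // returned as pending state.
        let mut pending = first
            .decode_utf8_lossy(|s| out.push_tendril(&s))
            .expect("trailing lead byte should leave an incomplete state");
        assert_eq!("ab", &*out);

        // 0xA9 completes the pending character; the remainder of the chunk
        // is handed back for normal decoding.
        let rest = pending
            .try_complete(second, |s| out.push_tendril(&s))
            .expect("0xA9 completes the pending character");
        assert_eq!("ab\u{e9}", &*out);

        assert!(rest.decode_utf8_lossy(|s| out.push_tendril(&s)).is_none());
        assert_eq!("ab\u{e9}cd", &*out);
    }
}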