compact_str-0.7.1/.cargo_vcs_info.json0000644000000001510000000000100133540ustar { "git": { "sha1": "13fce3a7f095f574434a522cf967b9a8e027f685" }, "path_in_vcs": "compact_str" }compact_str-0.7.1/Cargo.toml0000644000000050070000000000100113570ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" name = "compact_str" version = "0.7.1" authors = ["Parker Timmerman "] description = "A memory efficient string type that transparently stores strings on the stack, when possible" homepage = "https://github.com/ParkMyCar/compact_str" readme = "README.md" keywords = [ "string", "compact", "small", "memory", "mutable", ] categories = [ "encoding", "parsing", "memory-management", "text-processing", ] license = "MIT" repository = "https://github.com/ParkMyCar/compact_str" resolver = "1" [package.metadata.docs.rs] all-features = true rustdoc-args = [ "--cfg", "docsrs", ] [dependencies.arbitrary] version = "1" optional = true default-features = false [dependencies.bytes] version = "1" optional = true [dependencies.castaway] version = "0.2" [dependencies.cfg-if] version = "1" [dependencies.itoa] version = "1" [dependencies.markup] version = "0.13" optional = true default-features = false [dependencies.proptest] version = "1" features = ["std"] optional = true default-features = false [dependencies.quickcheck] version = "1" optional = true default-features = false [dependencies.rkyv] version = "0.7" features = ["size_32"] optional = true default-features = false [dependencies.ryu] version = "1" [dependencies.serde] version = "1" optional = true 
[dependencies.smallvec] version = "1" features = ["union"] optional = true [dependencies.static_assertions] version = "1" [dev-dependencies.cfg-if] version = "1" [dev-dependencies.proptest] version = "1.0.*" features = ["std"] default-features = false [dev-dependencies.quickcheck] version = "1" default-features = false [dev-dependencies.quickcheck_macros] version = "1" [dev-dependencies.rayon] version = "1.6.0" [dev-dependencies.rkyv] version = "0.7" features = [ "alloc", "size_32", ] default-features = false [dev-dependencies.serde] version = "1" features = ["derive"] [dev-dependencies.serde_json] version = "1" [dev-dependencies.test-case] version = "2" [dev-dependencies.test-strategy] version = "0.2" compact_str-0.7.1/Cargo.toml.orig000064400000000000000000000031731046102023000150420ustar 00000000000000[package] name = "compact_str" description = "A memory efficient string type that transparently stores strings on the stack, when possible" version = "0.7.1" authors = ["Parker Timmerman "] edition = "2021" license = "MIT" homepage = "https://github.com/ParkMyCar/compact_str" repository = "https://github.com/ParkMyCar/compact_str" readme = "../README.md" keywords = ["string", "compact", "small", "memory", "mutable"] categories = ["encoding", "parsing", "memory-management", "text-processing"] [dependencies] arbitrary = { version = "1", optional = true, default-features = false } bytes = { version = "1", optional = true } markup = { version = "0.13", optional = true, default-features = false } proptest = { version = "1", optional = true, default-features = false, features = ["std"] } quickcheck = { version = "1", optional = true, default-features = false } rkyv = { version = "0.7", optional = true, default-features = false, features = ["size_32"] } serde = { version = "1", optional = true } smallvec = { version = "1", optional = true, features = ["union"] } castaway = "0.2" cfg-if = "1" itoa = "1" ryu = "1" static_assertions = "1" [dev-dependencies] cfg-if = "1" 
proptest = { version = "1.0.*", default-features = false, features = ["std"] } quickcheck = { version = "1", default-features = false } quickcheck_macros = "1" rayon = "1.6.0" rkyv = { version = "0.7", default-features = false, features = ["alloc", "size_32"] } serde = { version = "1", features = ["derive"] } serde_json = "1" test-case = "2" test-strategy = "0.2" [package.metadata.docs.rs] all-features = true rustdoc-args = ["--cfg", "docsrs"] compact_str-0.7.1/LICENSE000064400000000000000000000020611046102023000131530ustar 00000000000000MIT License Copyright (c) 2021 Parker Timmerman Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. compact_str-0.7.1/README.md000064400000000000000000000273111046102023000134320ustar 00000000000000

compact_str

A memory efficient string type that can store up to 24* bytes on the stack.

version on crates.io Minimum supported Rust Version: 1.57 mit license
Continuous Integration Status Cross Platform Status Minimum Supported Rust Version Status Clippy Status

* 12 bytes for 32-bit architectures


### About A `CompactString` is a more memory efficient string type, that can store smaller strings on the stack, and transparently stores longer strings on the heap (aka a small string optimization). It can mostly be used as a drop-in replacement for `String` and is particularly useful in parsing, deserializing, or any other application where you may have smaller strings. ### Properties A `CompactString` specifically has the following properties: * `size_of::<CompactString>() == size_of::<String>()` * Stores up to 24 bytes on the stack * 12 bytes if running on a 32 bit architecture * Strings longer than 24 bytes are stored on the heap * `Clone` is `O(n)` * `From<String>` or `From<Box<str>>` re-uses underlying buffer * Eagerly inlines small strings * Heap based string grows at a rate of 1.5x * The std library `String` grows at a rate of 2x * Space optimized for `Option<_>` * `size_of::<CompactString>() == size_of::<Option<CompactString>>()` * Uses [branchless instructions](https://en.algorithmica.org/hpc/pipelining/branchless/) for string accesses ### Traits This crate exposes two traits, `ToCompactString` and `CompactStringExt`. #### `ToCompactString` Provides the `to_compact_string(&self)` method for converting types into a `CompactString`. This trait is automatically implemented for all types that are `std::fmt::Display`, with specialized higher performance impls for: * `u8`, `u16`, `u32`, `u64`, `usize`, `u128` * `i8`, `i16`, `i32`, `i64`, `isize`, `i128` * `f32`, `f64` * `bool`, `char` * `NonZeroU*`, `NonZeroI*` * `String`, `CompactString` #### `CompactStringExt` Provides two methods `join_compact(separator: impl AsRef<str>)` and `concat_compact()`. This trait is automatically implemented for all types that can be converted into an iterator and yield types that `impl AsRef<str>`. This allows you to join Vec's, slices, and any other collection to form `CompactString`s. ### Macros This crate exposes one macro `format_compact!` that can be used to create `CompactString`s from arguments, like you can `String`s with the `std::format!` macro. 
### Features `compact_str` has the following optional features: * `serde`, which implements [`Deserialize`](https://docs.rs/serde/1/serde/trait.Deserialize.html) and [`Serialize`](https://docs.rs/serde/1/serde/trait.Serialize.html) from the popular [`serde`](https://docs.rs/serde/1/serde/) crate, for `CompactString` * `bytes`, which provides two methods `from_utf8_buf<B: Buf>(buf: &mut B)` and `from_utf8_buf_unchecked<B: Buf>(buf: &mut B)`, which allow for the creation of a `CompactString` from a [`bytes::Buf`](https://docs.rs/bytes/1/bytes/trait.Buf.html) * `markup`, which implements the [`Render`](https://docs.rs/markup/0.13/markup/trait.Render.html) trait, so `CompactString`s can be used in templates as HTML escaped strings * `arbitrary`, which implements the [`arbitrary::Arbitrary`](https://docs.rs/arbitrary/1/arbitrary/trait.Arbitrary.html) trait for fuzzing * `proptest`, which implements the [`proptest::arbitrary::Arbitrary`](https://docs.rs/proptest/1/proptest/arbitrary/trait.Arbitrary.html) trait for fuzzing * `quickcheck`, which implements the [`quickcheck::Arbitrary`](https://docs.rs/quickcheck/1/quickcheck/trait.Arbitrary.html) trait for fuzzing * `rkyv`, which implements [`rkyv::Archive`](https://docs.rs/rkyv/0.7/rkyv/trait.Archive.html), [`rkyv::Serialize`](https://docs.rs/rkyv/0.7/rkyv/trait.Serialize.html) and [`rkyv::Deserialize`](https://docs.rs/rkyv/0.7/rkyv/trait.Deserialize.html) for fast zero-copy serialization, interchangeable with serialized Strings * `smallvec`, provides the `into_bytes()` method which enables you to convert a `CompactString` into a byte vector, using [`smallvec::SmallVec`](https://docs.rs/smallvec/latest/smallvec/struct.SmallVec.html) ### How it works Note: this explanation assumes a 64-bit architecture, for 32-bit architectures generally divide any number by 2. Normally strings are stored on the heap since they're dynamically sized. In Rust a `String` consists of three fields, each of which is the size of a `usize`. e.g. 
its layout is something like the following: `String: [ ptr<8> | len<8> | cap<8> ]` 1. `ptr` is a pointer to a location on the heap that stores the string 2. `len` is the length of the string 3. `cap` is the total capacity of the buffer being pointed to This results in 24 bytes being stored on the stack, 8 bytes for each field. Then the actual string is stored on the heap, usually with additional memory allocated to prevent re-allocating if the string is mutated. The idea of `CompactString` is instead of storing metadata on the stack, just store the string itself. This way for smaller strings we save a bit of memory, and we don't have to heap allocate so it's more performant. A `CompactString` is limited to 24 bytes (aka `size_of::<String>()`) so it won't ever use more memory than a `String` would. The memory layout of a `CompactString` looks something like: `CompactString: [ buffer<23> | len<1> ]` #### Memory Layout Internally a `CompactString` has two variants: 1. **Inline**, a string <= 24 bytes long 2. **Heap** allocated, a string > 24 bytes long We define a discriminant (aka track which variant we are) *within* the last byte, specifically: 1. `0b11111110` - All 1s with a trailing 0, indicates **heap** allocated 2. `0b11XXXXXX` - Two leading 1s, indicates **inline**, with the trailing 6 bits used to store the length and the overall memory layout of a `CompactString` is: 1. `heap: { ptr: NonNull<u8>, len: usize, cap: Capacity }` 2. `inline: { buffer: [u8; 24] }` Both variants are 24 bytes long. For **heap** allocated strings we use a custom `HeapBuffer` which normally stores the capacity of the string on the stack, but also optionally allows us to store it on the heap. Since we use the last byte to track our discriminant, we only have 7 bytes to store the capacity, or 3 bytes on a 32-bit architecture. 7 bytes allows us to store a value up to `2^56`, aka 64 petabytes, while 3 bytes only allows us to store a value up to `2^24`, aka 16 megabytes. 
For 64-bit architectures we always inline the capacity, because we can safely assume our strings will never be larger than 64 petabytes, but on 32-bit architectures, when creating or growing a `CompactString`, if the text is larger than 16MB then we move the capacity onto the heap. We handle the capacity in this way for two reasons: 1. Users shouldn't have to pay for what they don't use. Meaning, in the _majority_ of cases the capacity of the buffer could easily fit into 7 or 3 bytes, so the user shouldn't have to pay the memory cost of storing the capacity on the heap, if they don't need to. 2. Allows us to convert `From<String>` in `O(1)` time, by taking the parts of a `String` (e.g. `ptr`, `len`, and `cap`) and using those to create a `CompactString`, without having to do any heap allocations. This is important when using `CompactString` in large codebases where you might have `CompactString` working alongside `String`. For **inline** strings we only have a 24 byte buffer on the stack. This might make you wonder how can we store a 24 byte long string, inline? Don't we also need to store the length somewhere? To do this, we utilize the fact that the last byte of our string could only ever have a value in the range `[0, 192)`. We know this because all strings in Rust are valid [UTF-8](https://en.wikipedia.org/wiki/UTF-8), and the only valid byte pattern for the last byte of a UTF-8 character (and thus the possible last byte of a string) is `0b0XXXXXXX` aka `[0, 128)` or `0b10XXXXXX` aka `[128, 192)`. This leaves all values in `[192, 255]` as unused in our last byte. Therefore, we can use values in the range of `[192, 215]` to represent a length in the range of `[0, 23]`, and if our last byte has a value `< 192`, we know that's a UTF-8 character, and can interpret the length of our string as `24`. 
Specifically, the last byte on the stack for a `CompactString` has the following uses: * `[0, 192)` - Is the last byte of a UTF-8 char, the `CompactString` is stored on the stack and implicitly has a length of `24` * `[192, 215]` - Denotes a length in the range of `[0, 23]`, this `CompactString` is stored on the stack. * `[216, 254)` - Unused * `254` - Denotes this `CompactString` is stored on the heap * `255` - Denotes the `None` variant for an `Option<CompactString>` ### Testing Strings and unicode can be quite messy, even further, we're working with things at the bit level. `compact_str` has an _extensive_ test suite comprised of unit testing, property testing, and fuzz testing, to ensure our invariants are upheld. We test across all major OSes (Windows, macOS, and Linux), architectures (64-bit and 32-bit), and endian-ness (big endian and little endian). Fuzz testing is run with `libFuzzer`, `AFL++`, *and* `honggfuzz`, with `AFL++` running on both `x86_64` and `ARMv7` architectures. We test with [`miri`](https://github.com/rust-lang/miri) to catch cases of undefined behavior, and run all tests on every Rust compiler since `v1.57` to ensure support for our minimum supported Rust version (MSRV). ### `unsafe` code `CompactString` uses a bit of unsafe code because we manually define what variant we are, so unlike an enum, the compiler can't guarantee what value is actually stored. We also have some manually implemented heap data structures, i.e. `HeapBuffer`, and mess with bytes at a bit level, to make the most out of our resources. That being said, uses of unsafe code in this library are constrained to only where *absolutely* necessary, and always documented with `// SAFETY: <reason>`. ### Similar Crates Storing strings on the stack is not a new idea, in fact there are a few other crates in the Rust ecosystem that do similar things, an incomplete list: 1. [`smol_str`](https://crates.io/crates/smol_str) - Can inline 22 bytes, `Clone` is `O(1)`, doesn't adjust for 32-bit archs 2. 
[`smartstring`](https://crates.io/crates/smartstring) - Can inline 23 bytes, `Clone` is `O(n)`, is mutable 3. [`kstring`](https://crates.io/crates/kstring) - Can inline 15 or 22 bytes dependent on crate features, `Clone` is `O(1)`, can also store `&'static str`s 4. [`flexstr`](https://crates.io/crates/flexstr) - Can inline 22 bytes, `Clone` is `O(1)`, can also store `&'static str`s
Thanks for readingme! compact_str-0.7.1/src/features/arbitrary.rs000064400000000000000000000026451046102023000171300ustar 00000000000000//! Implements the [`arbitrary::Arbitrary`] trait for [`CompactString`] use arbitrary::{ Arbitrary, Result, Unstructured, }; use crate::CompactString; #[cfg_attr(docsrs, doc(cfg(feature = "arbitrary")))] impl<'a> Arbitrary<'a> for CompactString { fn arbitrary(u: &mut Unstructured<'a>) -> Result { <&str as Arbitrary>::arbitrary(u).map(CompactString::new) } fn arbitrary_take_rest(u: Unstructured<'a>) -> Result { <&str as Arbitrary>::arbitrary_take_rest(u).map(CompactString::new) } #[inline] fn size_hint(depth: usize) -> (usize, Option) { <&str as Arbitrary>::size_hint(depth) } } #[cfg(test)] mod test { use arbitrary::{ Arbitrary, Unstructured, }; use crate::CompactString; #[test] fn arbitrary_sanity() { let mut data = Unstructured::new(&[42; 50]); let compact = CompactString::arbitrary(&mut data).expect("generate a CompactString"); // we don't really care what the content of the CompactString is, just that one's generated assert!(!compact.is_empty()); } #[test] fn arbitrary_inlines_strings() { let mut data = Unstructured::new(&[42; 20]); let compact = CompactString::arbitrary(&mut data).expect("generate a CompactString"); // running this manually, we generate the string "**" assert!(!compact.is_heap_allocated()); } } compact_str-0.7.1/src/features/bytes.rs000064400000000000000000000076761046102023000162700ustar 00000000000000use core::str::Utf8Error; use bytes::Buf; use crate::{ CompactString, Repr, }; impl CompactString { /// Converts a buffer of bytes to a [`CompactString`] /// /// # Examples /// ### Basic usage /// ``` /// # use compact_str::CompactString; /// # use std::collections::VecDeque; /// /// // `bytes::Buf` is implemented for `VecDeque` /// let mut sparkle_heart = VecDeque::from(vec![240, 159, 146, 150]); /// // We know these bytes are valid, so we can `.unwrap()` or `.expect(...)` here /// let compact_str = 
CompactString::from_utf8_buf(&mut sparkle_heart).expect("valid utf-8"); /// /// assert_eq!(compact_str, "💖"); /// ``` /// /// ### With invalid/non-UTF8 bytes /// ``` /// # use compact_str::CompactString; /// # use std::io; /// /// // `bytes::Buf` is implemented for `std::io::Cursor<&[u8]>` /// let mut invalid = io::Cursor::new(&[0, 159]); /// /// // The provided buffer is invalid, so trying to create a `CompactString` will fail /// assert!(CompactString::from_utf8_buf(&mut invalid).is_err()); /// ``` #[cfg_attr(docsrs, doc(cfg(feature = "bytes")))] pub fn from_utf8_buf(buf: &mut B) -> Result { Repr::from_utf8_buf(buf).map(CompactString) } /// Converts a buffer of bytes to a [`CompactString`], without checking that the provided buffer /// is valid UTF-8. /// /// # Safety /// This function is unsafe because it does not check that the provided bytes are valid UTF-8. /// If this constraint is violated, it may cause memory safety issues with future uses of the /// `CompactString`, as the rest of the library assumes that `CompactString`s are valid UTF-8. /// /// # Examples /// ``` /// # use compact_str::CompactString; /// # use std::io; /// /// let word = "hello world"; /// // `bytes::Buf` is implemented for `std::io::Cursor<&[u8]>` /// let mut buffer = io::Cursor::new(word.as_bytes()); /// let compact_str = unsafe { CompactString::from_utf8_buf_unchecked(&mut buffer) }; /// /// assert_eq!(compact_str, word); /// ``` #[cfg_attr(docsrs, doc(cfg(feature = "bytes")))] pub unsafe fn from_utf8_buf_unchecked(buf: &mut B) -> Self { let repr = Repr::from_utf8_buf_unchecked(buf); CompactString(repr) } } #[cfg(test)] mod test { use std::io::Cursor; use proptest::prelude::*; use test_strategy::proptest; use crate::tests::{ rand_bytes, rand_unicode, }; use crate::CompactString; const MAX_SIZE: usize = core::mem::size_of::(); #[proptest] #[cfg_attr(miri, ignore)] fn proptest_buffers_roundtrip(#[strategy(rand_unicode())] word: String) { let mut buf = Cursor::new(word.as_bytes()); 
let compact = CompactString::from_utf8_buf(&mut buf).unwrap(); proptest::prop_assert_eq!(&word, &compact); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_allocated_properly(#[strategy(rand_unicode())] word: String) { let mut buf = Cursor::new(word.as_bytes()); let compact = CompactString::from_utf8_buf(&mut buf).unwrap(); if word.len() <= MAX_SIZE { proptest::prop_assert!(!compact.is_heap_allocated()) } else { proptest::prop_assert!(compact.is_heap_allocated()) } } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_only_accept_valid_utf8(#[strategy(rand_bytes())] bytes: Vec) { let mut buf = Cursor::new(bytes.as_slice()); let compact_result = CompactString::from_utf8_buf(&mut buf); let str_result = core::str::from_utf8(bytes.as_slice()); match (compact_result, str_result) { (Ok(c), Ok(s)) => prop_assert_eq!(c, s), (Err(c_err), Err(s_err)) => prop_assert_eq!(c_err, s_err), _ => panic!("CompactString and core::str read UTF-8 differently?"), } } } compact_str-0.7.1/src/features/markup.rs000064400000000000000000000015641046102023000164270ustar 00000000000000use std::fmt; use markup::Render; use crate::CompactString; #[cfg_attr(docsrs, doc(cfg(feature = "markup")))] impl Render for CompactString { #[inline] fn render(&self, writer: &mut impl fmt::Write) -> fmt::Result { self.as_str().render(writer) } } #[cfg(test)] #[test] fn test_markup() { const TEXT: &str = ""; markup::define!(Template(msg: M) { textarea { @msg } }); let compact = Template { msg: CompactString::from(TEXT), }; let control = Template { msg: String::from(TEXT), }; assert_eq!( compact.to_string(), "", ); assert_eq!( control.to_string(), "", ); } compact_str-0.7.1/src/features/mod.rs000064400000000000000000000006511046102023000157030ustar 00000000000000//! A module that contains the implementations for optional features. 
For example `serde` support #[cfg(feature = "arbitrary")] mod arbitrary; #[cfg(feature = "bytes")] mod bytes; #[cfg(feature = "markup")] mod markup; #[cfg(feature = "proptest")] mod proptest; #[cfg(feature = "quickcheck")] mod quickcheck; #[cfg(feature = "rkyv")] mod rkyv; #[cfg(feature = "serde")] mod serde; #[cfg(feature = "smallvec")] mod smallvec; compact_str-0.7.1/src/features/proptest.rs000064400000000000000000000027501046102023000170060ustar 00000000000000//! Implements the [`proptest::arbitrary::Arbitrary`] trait for [`CompactString`] use proptest::arbitrary::{ Arbitrary, StrategyFor, }; use proptest::prelude::*; use proptest::strategy::{ MapInto, Strategy, }; use proptest::string::StringParam; use crate::CompactString; #[cfg_attr(docsrs, doc(cfg(feature = "proptest")))] impl Arbitrary for CompactString { type Parameters = StringParam; type Strategy = MapInto, Self>; fn arbitrary_with(a: Self::Parameters) -> Self::Strategy { any_with::(a).prop_map_into() } } #[cfg(test)] mod test { use proptest::prelude::*; use crate::CompactString; const MAX_SIZE: usize = std::mem::size_of::(); proptest! { #[test] #[cfg_attr(miri, ignore)] fn proptest_sanity(compact: CompactString) { let control: String = compact.clone().into(); assert_eq!(control, compact); } /// We rely on [`proptest`]'s `String` strategy for generating a `CompactString`. When /// converting from a `String` into a `CompactString`, if it's short enough we should /// eagerly inline strings #[test] #[cfg_attr(miri, ignore)] fn proptest_does_not_inline_strings(compact: CompactString) { if compact.len() <= MAX_SIZE { assert!(!compact.is_heap_allocated()); } else { assert!(compact.is_heap_allocated()); } } } } compact_str-0.7.1/src/features/quickcheck.rs000064400000000000000000000027231046102023000172400ustar 00000000000000//! 
Implements the [`quickcheck::Arbitrary`] trait for [`CompactString`] use quickcheck::{ Arbitrary, Gen, }; use crate::CompactString; #[cfg_attr(docsrs, doc(cfg(feature = "quickcheck")))] impl Arbitrary for CompactString { fn arbitrary(g: &mut Gen) -> CompactString { let max = g.size(); // pick some value in [0, max] let x = usize::arbitrary(g); let ratio = (x as f64) / (usize::MAX as f64); let size = (ratio * max as f64) as usize; (0..size).map(|_| char::arbitrary(g)).collect() } fn shrink(&self) -> Box> { // Shrink a string by shrinking a vector of its characters. let chars: Vec = self.chars().collect(); Box::new( chars .shrink() .map(|x| x.into_iter().collect::()), ) } } #[cfg(test)] mod test { use quickcheck_macros::quickcheck; use crate::CompactString; #[quickcheck] #[cfg_attr(miri, ignore)] fn quickcheck_sanity(compact: CompactString) { let control: String = compact.clone().into(); assert_eq!(control, compact); } #[quickcheck] #[cfg_attr(miri, ignore)] fn quickcheck_inlines_strings(compact: CompactString) { if compact.len() <= std::mem::size_of::() { assert!(!compact.is_heap_allocated()) } else { assert!(compact.is_heap_allocated()) } } } compact_str-0.7.1/src/features/rkyv.rs000064400000000000000000000072711046102023000161240ustar 00000000000000#![cfg_attr(docsrs, doc(cfg(feature = "rkyv")))] use rkyv::string::{ ArchivedString, StringResolver, }; use rkyv::{ Archive, Deserialize, DeserializeUnsized, Fallible, Serialize, SerializeUnsized, }; use crate::CompactString; impl Archive for CompactString { type Archived = ArchivedString; type Resolver = StringResolver; #[inline] unsafe fn resolve(&self, pos: usize, resolver: Self::Resolver, out: *mut Self::Archived) { ArchivedString::resolve_from_str(self.as_str(), pos, resolver, out); } } impl Serialize for CompactString where str: SerializeUnsized, { #[inline] fn serialize(&self, serializer: &mut S) -> Result { ArchivedString::serialize_from_str(self.as_str(), serializer) } } impl Deserialize for ArchivedString 
where str: DeserializeUnsized, { #[inline] fn deserialize(&self, _: &mut D) -> Result { Ok(self.as_str().into()) } } impl PartialEq for ArchivedString { #[inline] fn eq(&self, other: &CompactString) -> bool { PartialEq::eq(self.as_str(), other.as_str()) } } impl PartialOrd for ArchivedString { #[inline] fn partial_cmp(&self, other: &CompactString) -> Option { PartialOrd::partial_cmp(self.as_str(), other.as_str()) } } #[cfg(test)] mod tests { use rkyv::Deserialize; use test_strategy::proptest; use crate::CompactString; #[cfg_attr(miri, ignore)] // https://github.com/rust-lang/unsafe-code-guidelines/issues/134 #[test] fn test_roundtrip() { const VALUE: &str = "Hello, ๐ŸŒ!"; let bytes_compact = rkyv::to_bytes::<_, 32>(&CompactString::from(VALUE)).unwrap(); let bytes_control = rkyv::to_bytes::<_, 32>(&String::from(VALUE)).unwrap(); assert_eq!(&*bytes_compact, &*bytes_control); let archived = unsafe { rkyv::archived_root::(&bytes_compact) }; let compact: CompactString = archived.deserialize(&mut rkyv::Infallible).unwrap(); let control: String = archived.deserialize(&mut rkyv::Infallible).unwrap(); assert_eq!(archived, VALUE); assert_eq!(compact, VALUE); assert_eq!(control, VALUE); let archived = unsafe { rkyv::archived_root::(&bytes_compact) }; let compact: CompactString = archived.deserialize(&mut rkyv::Infallible).unwrap(); let control: String = archived.deserialize(&mut rkyv::Infallible).unwrap(); assert_eq!(archived, VALUE); assert_eq!(compact, VALUE); assert_eq!(control, VALUE); } #[cfg_attr(miri, ignore)] #[proptest] fn proptest_roundtrip(s: String) { let bytes_compact = rkyv::to_bytes::<_, 32>(&CompactString::from(&s)).unwrap(); let bytes_control = rkyv::to_bytes::<_, 32>(&s).unwrap(); assert_eq!(&*bytes_compact, &*bytes_control); let archived = unsafe { rkyv::archived_root::(&bytes_compact) }; let compact: CompactString = archived.deserialize(&mut rkyv::Infallible).unwrap(); let control: String = archived.deserialize(&mut rkyv::Infallible).unwrap(); 
assert_eq!(archived, &s); assert_eq!(compact, s); assert_eq!(control, s); let archived = unsafe { rkyv::archived_root::(&bytes_compact) }; let compact: CompactString = archived.deserialize(&mut rkyv::Infallible).unwrap(); let control: String = archived.deserialize(&mut rkyv::Infallible).unwrap(); assert_eq!(archived, &s); assert_eq!(compact, s); assert_eq!(control, s); } } compact_str-0.7.1/src/features/serde.rs000064400000000000000000000121411046102023000162230ustar 00000000000000use std::fmt; use serde::de::{ Deserializer, Error, Unexpected, Visitor, }; use crate::CompactString; fn compact_string<'de: 'a, 'a, D: Deserializer<'de>>( deserializer: D, ) -> Result { struct CompactStringVisitor; impl<'a> Visitor<'a> for CompactStringVisitor { type Value = CompactString; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { formatter.write_str("a string") } fn visit_str(self, v: &str) -> Result { Ok(CompactString::from(v)) } fn visit_borrowed_str(self, v: &'a str) -> Result { Ok(CompactString::from(v)) } fn visit_string(self, v: String) -> Result { Ok(CompactString::from(v)) } fn visit_bytes(self, v: &[u8]) -> Result { match std::str::from_utf8(v) { Ok(s) => Ok(CompactString::from(s)), Err(_) => Err(Error::invalid_value(Unexpected::Bytes(v), &self)), } } fn visit_borrowed_bytes(self, v: &'a [u8]) -> Result { match std::str::from_utf8(v) { Ok(s) => Ok(CompactString::from(s)), Err(_) => Err(Error::invalid_value(Unexpected::Bytes(v), &self)), } } fn visit_byte_buf(self, v: Vec) -> Result { match String::from_utf8(v) { Ok(s) => Ok(CompactString::from(s)), Err(e) => Err(Error::invalid_value( Unexpected::Bytes(&e.into_bytes()), &self, )), } } } deserializer.deserialize_str(CompactStringVisitor) } #[cfg_attr(docsrs, doc(cfg(feature = "serde")))] impl serde::Serialize for CompactString { fn serialize(&self, serializer: S) -> Result { self.as_str().serialize(serializer) } } #[cfg_attr(docsrs, doc(cfg(feature = "serde")))] impl<'de> serde::Deserialize<'de> for 
CompactString { fn deserialize>(deserializer: D) -> Result { compact_string(deserializer) } } #[cfg(test)] mod tests { use serde::{ Deserialize, Serialize, }; use test_strategy::proptest; use crate::CompactString; #[derive(Debug, PartialEq, Eq, Deserialize, Serialize)] struct PersonString { name: String, phones: Vec, address: Option, } #[derive(Debug, PartialEq, Eq, Deserialize, Serialize)] struct PersonCompactString { name: CompactString, phones: Vec, address: Option, } #[test] fn test_roundtrip() { let name = "Ferris the Crab"; let phones = vec!["1-800-111-1111", "2-222-222-2222"]; let address = Some("123 Sesame Street"); let std = PersonString { name: name.to_string(), phones: phones.iter().map(|s| s.to_string()).collect(), address: address.as_ref().map(|s| s.to_string()), }; let compact = PersonCompactString { name: name.into(), phones: phones.iter().map(|s| CompactString::from(*s)).collect(), address: address.as_ref().map(|s| CompactString::from(*s)), }; let std_json = serde_json::to_string(&std).unwrap(); let compact_json = serde_json::to_string(&compact).unwrap(); // the serialized forms should be the same assert_eq!(std_json, compact_json); let std_de_compact: PersonString = serde_json::from_str(&compact_json).unwrap(); let compact_de_std: PersonCompactString = serde_json::from_str(&std_json).unwrap(); // we should be able to deserialize from the opposite, serialized, source assert_eq!(std_de_compact, std); assert_eq!(compact_de_std, compact); } #[cfg_attr(miri, ignore)] #[proptest] fn proptest_roundtrip(name: String, phones: Vec, address: Option) { let std = PersonString { name: name.clone(), phones: phones.iter().map(|s| s.clone()).collect(), address: address.clone(), }; let compact = PersonCompactString { name: name.into(), phones: phones.iter().map(|s| CompactString::from(s)).collect(), address: address.map(|s| CompactString::from(s)), }; let std_json = serde_json::to_string(&std).unwrap(); let compact_json = serde_json::to_string(&compact).unwrap(); // 
the serialized forms should be the same assert_eq!(std_json, compact_json); let std_de_compact: PersonString = serde_json::from_str(&compact_json).unwrap(); let compact_de_std: PersonCompactString = serde_json::from_str(&std_json).unwrap(); // we should be able to deserialize from the opposite, serialized, source assert_eq!(std_de_compact, std); assert_eq!(compact_de_std, compact); } } compact_str-0.7.1/src/features/smallvec.rs000064400000000000000000000052161046102023000167340ustar 00000000000000use smallvec::SmallVec; use crate::repr::MAX_SIZE; use crate::CompactString; impl CompactString { /// Converts a [`CompactString`] into a byte vector /// /// This consumes the [`CompactString`] and returns a [`SmallVec`], so we do not need to copy /// contents /// /// Note: [`SmallVec`] is an inline-able version of [`Vec`], just like [`CompactString`] is an /// inline-able version of [`String`]. /// /// # Example /// ``` /// use compact_str::CompactString; /// /// let c = CompactString::new("hello"); /// let bytes = c.into_bytes(); /// /// assert_eq!(&[104, 101, 108, 108, 111][..], &bytes[..]); /// ``` #[cfg_attr(docsrs, doc(cfg(feature = "smallvec")))] pub fn into_bytes(self) -> SmallVec<[u8; MAX_SIZE]> { self.0.into_bytes() } } #[cfg(test)] mod tests { use proptest::prelude::*; use test_strategy::proptest; use crate::repr::MAX_SIZE; use crate::tests::rand_unicode; use crate::CompactString; /// generates random unicode strings, that are more than MAX_SIZE bytes long pub fn rand_long_unicode() -> impl Strategy { proptest::collection::vec(proptest::char::any(), (MAX_SIZE + 1)..80) .prop_map(|v| v.into_iter().collect()) } #[test] fn test_buffer_reuse() { let c = CompactString::from("I am a longer string that will be on the heap"); let c_ptr = c.as_ptr(); let bytes = c.into_bytes(); let b_ptr = bytes.as_ptr(); // Note: inlined CompactStrings also get their buffers re-used, but we can't assert their // re-use the same way we do for longer strings, because the underlying array may 
move on // the callstack, whereas for longer strings the buffer is not moving on the heap // converting into_bytes should _always_ re-use the underlying buffer assert_eq!(c_ptr, b_ptr); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_buffer_reuse(#[strategy(rand_long_unicode())] s: String) { let c = CompactString::from(s); let c_ptr = c.as_ptr(); let bytes = c.into_bytes(); let b_ptr = bytes.as_ptr(); // converting into_bytes should _always_ re-use the underlying buffer prop_assert_eq!(c_ptr, b_ptr); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_roundtrip(#[strategy(rand_unicode())] s: String) { let og_compact = CompactString::from(s.clone()); prop_assert_eq!(&og_compact, &s); let bytes = og_compact.into_bytes(); let ex_compact = CompactString::from_utf8(bytes).unwrap(); prop_assert_eq!(&ex_compact, &s); } } compact_str-0.7.1/src/lib.rs000064400000000000000000001676641046102023000140760ustar 00000000000000#![doc = include_str!("../README.md")] #![cfg_attr(docsrs, feature(doc_cfg))] #[doc(hidden)] pub use core; use core::borrow::{ Borrow, BorrowMut, }; use core::cmp::Ordering; use core::hash::{ Hash, Hasher, }; use core::iter::FromIterator; use core::ops::{ Add, AddAssign, Bound, Deref, DerefMut, RangeBounds, }; use core::str::{ FromStr, Utf8Error, }; use core::{ fmt, slice, }; use std::borrow::Cow; use std::ffi::OsStr; use std::iter::FusedIterator; mod features; mod macros; mod repr; use repr::Repr; mod traits; pub use traits::{ CompactStringExt, ToCompactString, }; #[cfg(test)] mod tests; /// A [`CompactString`] is a compact string type that can be used almost anywhere a /// [`String`] or [`str`] can be used. 
///
/// ## Using `CompactString`
/// ```
/// use compact_str::CompactString;
/// # use std::collections::HashMap;
///
/// // CompactString auto derefs into a str so you can use all methods from `str`
/// // that take a `&self`
/// if CompactString::new("hello world!").is_ascii() {
///     println!("we're all ASCII")
/// }
///
/// // You can use a CompactString in collections like you would a String or &str
/// let mut map: HashMap<CompactString, CompactString> = HashMap::new();
///
/// // directly construct a new `CompactString`
/// map.insert(CompactString::new("nyc"), CompactString::new("empire state building"));
/// // create a `CompactString` from a `&str`
/// map.insert("sf".into(), "transamerica pyramid".into());
/// // create a `CompactString` from a `String`
/// map.insert(String::from("sea").into(), String::from("space needle").into());
///
/// fn wrapped_print<T: AsRef<str>>(text: T) {
///     println!("{}", text.as_ref());
/// }
///
/// // CompactString impls AsRef<str> and Borrow<str>, so it can be used anywhere
/// // that expects a generic string
/// if let Some(building) = map.get("nyc") {
///     wrapped_print(building);
/// }
///
/// // CompactString can also be directly compared to a String or &str
/// assert_eq!(CompactString::new("chicago"), "chicago");
/// assert_eq!(CompactString::new("houston"), String::from("houston"));
/// ```
///
/// # Converting from a `String`
/// It's important that a `CompactString` interops well with `String`, so you can easily use both in
/// your code base.
///
/// `CompactString` implements `From<String>` and operates in the following manner:
/// - Eagerly inlines the string, possibly dropping excess capacity
/// - Otherwise re-uses the same underlying buffer from `String`
///
/// ```
/// use compact_str::CompactString;
///
/// // eagerly inlining
/// let short = String::from("hello world");
/// let short_c = CompactString::from(short);
/// assert!(!short_c.is_heap_allocated());
///
/// // dropping excess capacity
/// let mut excess = String::with_capacity(256);
/// excess.push_str("abc");
///
/// let excess_c = CompactString::from(excess);
/// assert!(!excess_c.is_heap_allocated());
/// assert!(excess_c.capacity() < 256);
///
/// // re-using the same buffer
/// let long = String::from("this is a longer string that will be heap allocated");
///
/// let long_ptr = long.as_ptr();
/// let long_len = long.len();
/// let long_cap = long.capacity();
///
/// let mut long_c = CompactString::from(long);
/// assert!(long_c.is_heap_allocated());
///
/// let cpt_ptr = long_c.as_ptr();
/// let cpt_len = long_c.len();
/// let cpt_cap = long_c.capacity();
///
/// // the original String and the CompactString point to the same place in memory, buffer re-use!
/// assert_eq!(cpt_ptr, long_ptr);
/// assert_eq!(cpt_len, long_len);
/// assert_eq!(cpt_cap, long_cap);
/// ```
///
/// ### Prevent Eagerly Inlining
/// A consequence of eagerly inlining is that you then need to de-allocate the existing buffer,
/// which might not always be desirable if you're converting a very large amount of `String`s. If
/// your code is very sensitive to allocations, consider the
/// [`CompactString::from_string_buffer`] API.
#[derive(Clone)]
#[repr(transparent)]
pub struct CompactString(Repr);

impl CompactString {
/// Creates a new [`CompactString`] from any type that implements `AsRef<str>`.
/// If the string is short enough, then it will be inlined on the stack!
///
/// # Examples
///
/// ### Inlined
/// ```
/// # use compact_str::CompactString;
/// // We can inline strings up to 12 characters long on 32-bit architectures...
/// #[cfg(target_pointer_width = "32")]
/// let s = "i'm 12 chars";
/// // ...and up to 24 characters on 64-bit architectures!
/// #[cfg(target_pointer_width = "64")]
/// let s = "i am 24 characters long!";
///
/// let compact = CompactString::new(&s);
///
/// assert_eq!(compact, s);
/// // we are not allocated on the heap!
/// assert!(!compact.is_heap_allocated());
/// ```
///
/// ### Heap
/// ```
/// # use compact_str::CompactString;
/// // For longer strings though, we get allocated on the heap
/// let long = "I am a longer string that will be allocated on the heap";
/// let compact = CompactString::new(long);
///
/// assert_eq!(compact, long);
/// // we are allocated on the heap!
/// assert!(compact.is_heap_allocated());
/// ```
///
/// ### Creation
/// ```
/// use compact_str::CompactString;
///
/// // Using a `&'static str`
/// let s = "hello world!";
/// let hello = CompactString::new(&s);
///
/// // Using a `String`
/// let u = String::from("🦄🌈");
/// let unicorn = CompactString::new(u);
///
/// // Using a `Box<str>`
/// let b: Box<str> = String::from("📦📦📦").into_boxed_str();
/// let boxed = CompactString::new(&b);
/// ```
#[inline]
pub fn new<T: AsRef<str>>(text: T) -> Self {
    // Only the borrowed `&str` view is handed on; `text` itself is dropped afterwards.
    CompactString(Repr::new(text.as_ref()))
}

/// Creates a new inline [`CompactString`] at compile time.
///
/// # Examples
/// ```
/// use compact_str::CompactString;
///
/// const DEFAULT_NAME: CompactString = CompactString::new_inline("untitled");
/// ```
///
/// Note: Trying to create a long string that can't be inlined, will fail to build.
/// ```compile_fail
/// # use compact_str::CompactString;
/// const LONG: CompactString = CompactString::new_inline("this is a long string that can't be stored on the stack");
/// ```
#[inline]
pub const fn new_inline(text: &str) -> Self {
    CompactString(Repr::new_inline(text))
}

/// Creates a new empty [`CompactString`] with the capacity to fit at least `capacity` bytes.
///
/// A `CompactString` will inline strings on the stack, if they're small enough. Specifically,
/// if the string has a length less than or equal to `std::mem::size_of::<String>` bytes
/// then it will be inlined. This also means that `CompactString`s have a minimum capacity
/// of `std::mem::size_of::<String>`.
///
/// # Examples
///
/// ### "zero" Capacity
/// ```
/// # use compact_str::CompactString;
/// // Creating a CompactString with a capacity of 0 will create
/// // one with capacity of std::mem::size_of::<String>();
/// let empty = CompactString::with_capacity(0);
/// let min_size = std::mem::size_of::<String>();
///
/// assert_eq!(empty.capacity(), min_size);
/// assert_ne!(0, min_size);
/// assert!(!empty.is_heap_allocated());
/// ```
///
/// ### Max Inline Size
/// ```
/// # use compact_str::CompactString;
/// // Creating a CompactString with a capacity of std::mem::size_of::<String>()
/// // will not heap allocate.
/// let str_size = std::mem::size_of::<String>();
/// let empty = CompactString::with_capacity(str_size);
///
/// assert_eq!(empty.capacity(), str_size);
/// assert!(!empty.is_heap_allocated());
/// ```
///
/// ### Heap Allocating
/// ```
/// # use compact_str::CompactString;
/// // If you create a `CompactString` with a capacity greater than
/// // `std::mem::size_of::<String>`, it will be heap allocated.
For heap /// // allocated strings we have a minimum capacity /// /// const MIN_HEAP_CAPACITY: usize = std::mem::size_of::() * 4; /// /// let heap_size = std::mem::size_of::() + 1; /// let empty = CompactString::with_capacity(heap_size); /// /// assert_eq!(empty.capacity(), MIN_HEAP_CAPACITY); /// assert!(empty.is_heap_allocated()); /// ``` #[inline] pub fn with_capacity(capacity: usize) -> Self { CompactString(Repr::with_capacity(capacity)) } /// Convert a slice of bytes into a [`CompactString`]. /// /// A [`CompactString`] is a contiguous collection of bytes (`u8`s) that is valid [`UTF-8`](https://en.wikipedia.org/wiki/UTF-8). /// This method converts from an arbitrary contiguous collection of bytes into a /// [`CompactString`], failing if the provided bytes are not `UTF-8`. /// /// Note: If you want to create a [`CompactString`] from a non-contiguous collection of bytes, /// enable the `bytes` feature of this crate, and see `CompactString::from_utf8_buf` /// /// # Examples /// ### Valid UTF-8 /// ``` /// # use compact_str::CompactString; /// let bytes = vec![240, 159, 166, 128, 240, 159, 146, 175]; /// let compact = CompactString::from_utf8(bytes).expect("valid UTF-8"); /// /// assert_eq!(compact, "๐Ÿฆ€๐Ÿ’ฏ"); /// ``` /// /// ### Invalid UTF-8 /// ``` /// # use compact_str::CompactString; /// let bytes = vec![255, 255, 255]; /// let result = CompactString::from_utf8(bytes); /// /// assert!(result.is_err()); /// ``` #[inline] pub fn from_utf8>(buf: B) -> Result { Repr::from_utf8(buf).map(CompactString) } /// Converts a vector of bytes to a [`CompactString`] without checking that the string contains /// valid UTF-8. /// /// See the safe version, [`CompactString::from_utf8`], for more details. /// /// # Safety /// /// This function is unsafe because it does not check that the bytes passed to it are valid /// UTF-8. 
If this constraint is violated, it may cause memory unsafety issues with future users /// of the [`CompactString`], as the rest of the standard library assumes that /// [`CompactString`]s are valid UTF-8. /// /// # Examples /// /// Basic usage: /// /// ``` /// # use compact_str::CompactString; /// // some bytes, in a vector /// let sparkle_heart = vec![240, 159, 146, 150]; /// /// let sparkle_heart = unsafe { /// CompactString::from_utf8_unchecked(sparkle_heart) /// }; /// /// assert_eq!("๐Ÿ’–", sparkle_heart); /// ``` #[inline] #[must_use] pub unsafe fn from_utf8_unchecked>(buf: B) -> Self { CompactString(Repr::from_utf8_unchecked(buf)) } /// Decode a [`UTF-16`](https://en.wikipedia.org/wiki/UTF-16) slice of bytes into a /// [`CompactString`], returning an [`Err`] if the slice contains any invalid data. /// /// # Examples /// ### Valid UTF-16 /// ``` /// # use compact_str::CompactString; /// let buf: &[u16] = &[0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0x0069, 0x0063]; /// let compact = CompactString::from_utf16(buf).unwrap(); /// /// assert_eq!(compact, "๐„žmusic"); /// ``` /// /// ### Invalid UTF-16 /// ``` /// # use compact_str::CompactString; /// let buf: &[u16] = &[0xD834, 0xDD1E, 0x006d, 0x0075, 0xD800, 0x0069, 0x0063]; /// let res = CompactString::from_utf16(buf); /// /// assert!(res.is_err()); /// ``` #[inline] pub fn from_utf16>(buf: B) -> Result { // Note: we don't use collect::>() because that fails to pre-allocate a buffer, // even though the size of our iterator, `buf`, is known ahead of time. // // rustlang issue #48994 is tracking the fix let buf = buf.as_ref(); let mut ret = CompactString::with_capacity(buf.len()); for c in core::char::decode_utf16(buf.iter().copied()) { if let Ok(c) = c { ret.push(c); } else { return Err(Utf16Error(())); } } Ok(ret) } /// Decode a UTF-16โ€“encoded slice `v` into a `CompactString`, replacing invalid data with /// the replacement character (`U+FFFD`), ๏ฟฝ. 
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// # use compact_str::CompactString;
/// // 𝄞mus<invalid>ic<invalid>
/// let v = &[0xD834, 0xDD1E, 0x006d, 0x0075,
///           0x0073, 0xDD1E, 0x0069, 0x0063,
///           0xD834];
///
/// assert_eq!(CompactString::from("𝄞mus\u{FFFD}ic\u{FFFD}"),
///            CompactString::from_utf16_lossy(v));
/// ```
#[inline]
pub fn from_utf16_lossy<B: AsRef<[u16]>>(buf: B) -> Self {
    let buf = buf.as_ref();
    let mut ret = CompactString::with_capacity(buf.len());
    for c in std::char::decode_utf16(buf.iter().copied()) {
        match c {
            Ok(c) => ret.push(c),
            // U+FFFD is written as an escape so the literal cannot be damaged by a
            // re-encoding of the source file; a `&str` avoids encoding a `char` first.
            Err(_) => ret.push_str("\u{FFFD}"),
        }
    }
    ret
}

/// Returns the length of the [`CompactString`] in `bytes`, not [`char`]s or graphemes.
///
/// When using `UTF-8` encoding (which all strings in Rust do) a single character will be 1 to 4
/// bytes long, therefore the return value of this method might not be what a human considers
/// the length of the string.
///
/// # Examples
/// ```
/// # use compact_str::CompactString;
/// let ascii = CompactString::new("hello world");
/// assert_eq!(ascii.len(), 11);
///
/// let emoji = CompactString::new("👱");
/// assert_eq!(emoji.len(), 4);
/// ```
#[inline]
pub fn len(&self) -> usize {
    self.0.len()
}

/// Returns `true` if the [`CompactString`] has a length of 0, `false` otherwise
///
/// # Examples
/// ```
/// # use compact_str::CompactString;
/// let mut msg = CompactString::new("");
/// assert!(msg.is_empty());
///
/// // add some characters
/// msg.push_str("hello reader!");
/// assert!(!msg.is_empty());
/// ```
#[inline]
pub fn is_empty(&self) -> bool {
    self.len() == 0
}

/// Returns the capacity of the [`CompactString`], in bytes.
///
/// # Note
/// * A `CompactString` will always have a capacity of at least `std::mem::size_of::<String>()`
///
/// # Examples
/// ### Minimum Size
/// ```
/// # use compact_str::CompactString;
/// let min_size = std::mem::size_of::<String>();
/// let compact = CompactString::new("");
///
/// assert!(compact.capacity() >= min_size);
/// ```
///
/// ### Heap Allocated
/// ```
/// # use compact_str::CompactString;
/// let compact = CompactString::with_capacity(128);
/// assert_eq!(compact.capacity(), 128);
/// ```
#[inline]
pub fn capacity(&self) -> usize {
    self.0.capacity()
}

/// Ensures that this [`CompactString`]'s capacity is at least `additional` bytes longer than
/// its length. The capacity may be increased by more than `additional` bytes if it chooses,
/// to prevent frequent reallocations.
///
/// # Note
/// * A `CompactString` will always have at least a capacity of `std::mem::size_of::<String>()`
/// * Reserving additional bytes may cause the `CompactString` to become heap allocated
///
/// # Panics
/// Panics if the new capacity overflows `usize`
///
/// # Examples
/// ```
/// # use compact_str::CompactString;
///
/// const WORD: usize = std::mem::size_of::<usize>();
/// let mut compact = CompactString::default();
/// assert!(compact.capacity() >= (WORD * 3) - 1);
///
/// compact.reserve(200);
/// assert!(compact.is_heap_allocated());
/// assert!(compact.capacity() >= 200);
/// ```
#[inline]
pub fn reserve(&mut self, additional: usize) {
    self.0.reserve(additional)
}

/// Returns a string slice containing the entire [`CompactString`].
///
/// # Examples
/// ```
/// # use compact_str::CompactString;
/// let s = CompactString::new("hello");
///
/// assert_eq!(s.as_str(), "hello");
/// ```
#[inline]
pub fn as_str(&self) -> &str {
    self.0.as_str()
}

/// Returns a mutable string slice containing the entire [`CompactString`].
///
/// # Examples
/// ```
/// # use compact_str::CompactString;
/// let mut s = CompactString::new("hello");
/// s.as_mut_str().make_ascii_uppercase();
///
/// assert_eq!(s.as_str(), "HELLO");
/// ```
#[inline]
pub fn as_mut_str(&mut self) -> &mut str {
    let end = self.len();
    // SAFETY: only the first `len` bytes of the buffer are exposed, and those are the
    // string's contents, which every `CompactString` keeps as valid UTF-8.
    unsafe {
        let contents = &mut self.0.as_mut_buf()[..end];
        std::str::from_utf8_unchecked_mut(contents)
    }
}

/// Borrows the contents of the [`CompactString`] as a slice of bytes.
///
/// # Examples
/// ```
/// # use compact_str::CompactString;
/// let s = CompactString::new("hello");
///
/// assert_eq!(&[104, 101, 108, 108, 111], s.as_bytes());
/// ```
#[inline]
pub fn as_bytes(&self) -> &[u8] {
    let whole = self.0.as_slice();
    &whole[..self.len()]
}

// TODO: Implement a `try_as_mut_slice(...)` that will fail if it results in cloning?
//
/// Hands out the underlying byte buffer for in-place modification.
///
/// # Safety
/// * All Rust strings, including `CompactString`, must be valid UTF-8. The caller must
///   guarantee that any modifications made to the underlying buffer are valid UTF-8.
///
/// # Examples
/// ```
/// # use compact_str::CompactString;
/// let mut s = CompactString::new("hello");
///
/// let slice = unsafe { s.as_mut_bytes() };
/// // copy bytes into our string
/// slice[5..11].copy_from_slice(" world".as_bytes());
/// // set the len of the string
/// unsafe { s.set_len(11) };
///
/// assert_eq!(s, "hello world");
/// ```
#[inline]
pub unsafe fn as_mut_bytes(&mut self) -> &mut [u8] {
    self.0.as_mut_buf()
}

/// Pushes a single [`char`] onto the end of this [`CompactString`].
///
/// # Examples
/// ```
/// # use compact_str::CompactString;
/// let mut s = CompactString::new("foo");
///
/// s.push('b');
/// s.push('a');
/// s.push('r');
///
/// assert_eq!("foobar", s);
/// ```
pub fn push(&mut self, ch: char) {
    // a `char` needs at most four bytes once encoded as UTF-8
    let mut utf8 = [0; 4];
    let encoded = ch.encode_utf8(&mut utf8);
    self.push_str(encoded);
}

/// Removes the last character from the [`CompactString`] and returns it.
/// Returns `None` if this [`CompactString`] is empty.
/// /// # Examples /// ``` /// # use compact_str::CompactString; /// let mut s = CompactString::new("abc"); /// /// assert_eq!(s.pop(), Some('c')); /// assert_eq!(s.pop(), Some('b')); /// assert_eq!(s.pop(), Some('a')); /// /// assert_eq!(s.pop(), None); /// ``` #[inline] pub fn pop(&mut self) -> Option { self.0.pop() } /// Appends a given string slice onto the end of this [`CompactString`] /// /// # Examples /// ``` /// # use compact_str::CompactString; /// let mut s = CompactString::new("abc"); /// /// s.push_str("123"); /// /// assert_eq!("abc123", s); /// ``` #[inline] pub fn push_str(&mut self, s: &str) { self.0.push_str(s) } /// Removes a [`char`] from this [`CompactString`] at a byte position and returns it. /// /// This is an *O*(*n*) operation, as it requires copying every element in the /// buffer. /// /// # Panics /// /// Panics if `idx` is larger than or equal to the [`CompactString`]'s length, /// or if it does not lie on a [`char`] boundary. /// /// # Examples /// /// ### Basic usage: /// /// ``` /// # use compact_str::CompactString; /// let mut c = CompactString::from("hello world"); /// /// assert_eq!(c.remove(0), 'h'); /// assert_eq!(c, "ello world"); /// /// assert_eq!(c.remove(5), 'w'); /// assert_eq!(c, "ello orld"); /// ``` /// /// ### Past total length: /// /// ```should_panic /// # use compact_str::CompactString; /// let mut c = CompactString::from("hello there!"); /// c.remove(100); /// ``` /// /// ### Not on char boundary: /// /// ```should_panic /// # use compact_str::CompactString; /// let mut c = CompactString::from("๐Ÿฆ„"); /// c.remove(1); /// ``` #[inline] pub fn remove(&mut self, idx: usize) -> char { let len = self.len(); let substr = &mut self.as_mut_str()[idx..]; // get the char we want to remove let ch = substr .chars() .next() .expect("cannot remove a char from the end of a string"); let ch_len = ch.len_utf8(); // shift everything back one character let num_bytes = substr.len() - ch_len; let ptr = substr.as_mut_ptr(); // SAFETY: 
Both src and dest are valid for reads of `num_bytes` amount of bytes, // and are properly aligned unsafe { core::ptr::copy(ptr.add(ch_len) as *const u8, ptr, num_bytes); self.set_len(len - ch_len); } ch } /// Forces the length of the [`CompactString`] to `new_len`. /// /// This is a low-level operation that maintains none of the normal invariants for /// `CompactString`. If you want to modify the `CompactString` you should use methods like /// `push`, `push_str` or `pop`. /// /// # Safety /// * `new_len` must be less than or equal to `capacity()` /// * The elements at `old_len..new_len` must be initialized #[inline] pub unsafe fn set_len(&mut self, new_len: usize) { self.0.set_len(new_len) } /// Returns whether or not the [`CompactString`] is heap allocated. /// /// # Examples /// ### Inlined /// ``` /// # use compact_str::CompactString; /// let hello = CompactString::new("hello world"); /// /// assert!(!hello.is_heap_allocated()); /// ``` /// /// ### Heap Allocated /// ``` /// # use compact_str::CompactString; /// let msg = CompactString::new("this message will self destruct in 5, 4, 3, 2, 1 ๐Ÿ’ฅ"); /// /// assert!(msg.is_heap_allocated()); /// ``` #[inline] pub fn is_heap_allocated(&self) -> bool { self.0.is_heap_allocated() } /// Ensure that the given range is inside the set data, and that no codepoints are split. /// /// Returns the range `start..end` as a tuple. #[inline] fn ensure_range(&self, range: impl RangeBounds) -> (usize, usize) { #[cold] #[inline(never)] fn illegal_range() -> ! 
{
        panic!("illegal range");
    }

    // normalise both bounds to a half-open `start..end` pair
    let start = match range.start_bound() {
        Bound::Included(&n) => n,
        Bound::Excluded(&n) => match n.checked_add(1) {
            Some(n) => n,
            None => illegal_range(),
        },
        Bound::Unbounded => 0,
    };
    let end = match range.end_bound() {
        Bound::Included(&n) => match n.checked_add(1) {
            Some(n) => n,
            None => illegal_range(),
        },
        Bound::Excluded(&n) => n,
        Bound::Unbounded => self.len(),
    };
    if end < start {
        illegal_range();
    }

    // `is_char_boundary` is also `false` for indices past the end of the string
    let s = self.as_str();
    if !s.is_char_boundary(start) || !s.is_char_boundary(end) {
        illegal_range();
    }

    (start, end)
}

/// Removes the specified range in the [`CompactString`],
/// and replaces it with the given string.
/// The given string doesn't need to be the same length as the range.
///
/// # Panics
///
/// Panics if the starting point or end point do not lie on a [`char`]
/// boundary, or if they're out of bounds.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// # use compact_str::CompactString;
/// let mut s = CompactString::new("Hello, world!");
///
/// s.replace_range(7..12, "WORLD");
/// assert_eq!(s, "Hello, WORLD!");
///
/// s.replace_range(7..=11, "you");
/// assert_eq!(s, "Hello, you!");
///
/// s.replace_range(5.., "! Is it me you're looking for?");
/// assert_eq!(s, "Hello! Is it me you're looking for?");
/// ```
#[inline]
pub fn replace_range(&mut self, range: impl RangeBounds<usize>, replace_with: &str) {
    let (start, end) = self.ensure_range(range);
    let dest_len = end - start;
    // SAFETY (all three arms): `ensure_range` verified that `start..end` lies inside the
    // string on char boundaries, and each helper is only called for the length relation
    // (equal / shorter / longer replacement) it is written for.
    match dest_len.cmp(&replace_with.len()) {
        Ordering::Equal => unsafe { self.replace_range_same_size(start, end, replace_with) },
        Ordering::Greater => unsafe { self.replace_range_shrink(start, end, replace_with) },
        Ordering::Less => unsafe { self.replace_range_grow(start, end, replace_with) },
    }
}

/// Replace into the same size.
///
/// # Safety
/// * `start..end` must lie inside the string, on [`char`] boundaries
/// * `replace_with.len()` must equal `end - start`
unsafe fn replace_range_same_size(&mut self, start: usize, end: usize, replace_with: &str) {
    // the length does not change, so the range is simply overwritten in place
    core::ptr::copy_nonoverlapping(
        replace_with.as_ptr(),
        self.as_mut_ptr().add(start),
        end - start,
    );
}

/// Replace, so self.len() gets smaller.
///
/// # Safety
/// * `start..end` must lie inside the string, on [`char`] boundaries
/// * `replace_with.len()` must not exceed `end - start`
unsafe fn replace_range_shrink(&mut self, start: usize, end: usize, replace_with: &str) {
    let total_len = self.len();
    let dest_len = end - start;
    let new_len = total_len - (dest_len - replace_with.len());
    // number of bytes behind the replaced range
    let amount = total_len - end;
    let data = self.as_mut_ptr();
    // first insert the replacement string, overwriting the current content
    core::ptr::copy_nonoverlapping(replace_with.as_ptr(), data.add(start), replace_with.len());
    // then move the tail of the CompactString forward to its new place, filling the gap
    core::ptr::copy(
        data.add(total_len - amount),
        data.add(new_len - amount),
        amount,
    );
    // and lastly we set the new length
    self.set_len(new_len);
}

/// Replace, so self.len() gets bigger.
///
/// # Safety
/// * `start..end` must lie inside the string, on [`char`] boundaries
/// * `replace_with.len()` must not be smaller than `end - start`
unsafe fn replace_range_grow(&mut self, start: usize, end: usize, replace_with: &str) {
    let dest_len = end - start;
    // make room for the bytes by which the string grows
    self.reserve(replace_with.len() - dest_len);
    let total_len = self.len();
    let new_len = total_len + (replace_with.len() - dest_len);
    // number of bytes behind the replaced range
    let amount = total_len - end;
    // first grow the string, so MIRI knows that the full range is usable
    self.set_len(new_len);
    let data = self.as_mut_ptr();
    // then move the tail of the CompactString back to its new place
    core::ptr::copy(
        data.add(total_len - amount),
        data.add(new_len - amount),
        amount,
    );
    // and lastly insert the replacement string
    core::ptr::copy_nonoverlapping(replace_with.as_ptr(), data.add(start), replace_with.len());
}

/// Truncate the [`CompactString`] to a shorter length.
///
/// If the length of the [`CompactString`] is less or equal to `new_len`, the call is a no-op.
///
/// Calling this function does not change the capacity of the [`CompactString`].
///
/// # Panics
///
/// Panics if the new end of the string does not lie on a [`char`] boundary.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// # use compact_str::CompactString;
/// let mut s = CompactString::new("Hello, world!");
/// s.truncate(5);
/// assert_eq!(s, "Hello");
/// ```
pub fn truncate(&mut self, new_len: usize) {
    // nothing to cut off when the string is already short enough
    if new_len < self.as_str().len() {
        assert!(
            self.as_str().is_char_boundary(new_len),
            "new_len must lie on char boundary",
        );
        // SAFETY: `new_len` is smaller than the current length and on a char boundary
        unsafe { self.set_len(new_len) };
    }
}

/// Returns a raw pointer to the first byte of the [`CompactString`].
#[inline]
pub fn as_ptr(&self) -> *const u8 {
    let bytes = self.0.as_slice();
    bytes.as_ptr()
}

/// Returns a mutable raw pointer to the first byte of the [`CompactString`].
#[inline]
pub fn as_mut_ptr(&mut self) -> *mut u8 {
    // only the address of the buffer is taken here, no bytes are modified
    unsafe {
        let buffer = self.0.as_mut_buf();
        buffer.as_mut_ptr()
    }
}

/// Inserts a string slice at the given byte index.
///
/// # Panics
///
/// Panics if `idx` does not lie on a [`char`] boundary.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// # use compact_str::CompactString;
/// let mut s = CompactString::new("Hello!");
/// s.insert_str(5, ", world");
/// assert_eq!(s, "Hello, world!");
/// ```
pub fn insert_str(&mut self, idx: usize, string: &str) {
    assert!(self.is_char_boundary(idx), "idx must lie on char boundary");

    let inserted = string.len();
    let new_len = self.len() + inserted;
    self.reserve(inserted);

    // SAFETY: `idx` was checked to be a valid split point, the memory for `new_len` bytes
    // was reserved before anything is written, and the gap is filled with valid UTF-8
    // before the new length is published.
    unsafe {
        let data = self.as_mut_ptr();

        // open a gap of `inserted` bytes by moving the tail towards the back
        std::ptr::copy(
            data.add(idx),
            data.add(idx + inserted),
            new_len - idx - inserted,
        );
        // fill the gap with the new bytes
        std::ptr::copy_nonoverlapping(string.as_ptr(), data.add(idx), inserted);
        // and only then grow the string
        self.set_len(new_len);
    }
}

/// Insert a character at an index.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// # use compact_str::CompactString;
/// let mut s = CompactString::new("Hello world!");
/// s.insert(5, ',');
/// assert_eq!(s, "Hello, world!");
/// ```
pub fn insert(&mut self, idx: usize, ch: char) {
    // a `char` needs at most four bytes once encoded as UTF-8
    let mut utf8 = [0; 4];
    let encoded = ch.encode_utf8(&mut utf8);
    self.insert_str(idx, encoded);
}

/// Empties the [`CompactString`] by setting its length to zero.
///
/// The capacity of the [`CompactString`] is left untouched.
///
/// # Examples
///
/// ```
/// # use compact_str::CompactString;
/// let mut s = CompactString::new("Rust is the most loved language on Stackoverflow!");
/// assert_eq!(s.capacity(), 49);
///
/// s.clear();
///
/// assert_eq!(s, "");
/// assert_eq!(s.capacity(), 49);
/// ```
pub fn clear(&mut self) {
    // SAFETY: a length of zero is always within capacity and trivially valid UTF-8
    unsafe {
        self.set_len(0);
    }
}

/// Splits the [`CompactString`] in two at the given byte index.
///
/// `self` keeps the bytes `..at`, the bytes `at..` are returned as a new [`CompactString`].
/// The capacity of `self` is left untouched.
///
/// # Panics
///
/// Panics if `at` does not lie on a [`char`] boundary.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// # use compact_str::CompactString;
/// let mut s = CompactString::new("Hello, world!");
/// assert_eq!(s.split_off(5), ", world!");
/// assert_eq!(s, "Hello");
/// ```
pub fn split_off(&mut self, at: usize) -> Self {
    // copy the tail first; the slicing panics for an invalid `at`
    let tail: Self = self[at..].into();
    // SAFETY: reaching this line means `at` is in bounds and on a char boundary
    unsafe {
        self.set_len(at);
    }
    tail
}

/// Remove a range from the [`CompactString`], and return it as an iterator.
///
/// Calling this function does not change the capacity of the [`CompactString`].
///
/// # Panics
///
/// Panics if the start or end of the range does not lie on a [`char`] boundary.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// # use compact_str::CompactString;
/// let mut s = CompactString::new("Hello, world!");
///
/// let mut d = s.drain(5..12);
/// assert_eq!(d.next(), Some(',')); // iterate over the extracted data
/// assert_eq!(d.as_str(), " world"); // or get the whole data as &str
///
/// // The iterator keeps a reference to `s`, so you have to drop() the iterator,
/// // before you can access `s` again.
/// drop(d);
/// assert_eq!(s, "Hello!");
/// ```
pub fn drain(&mut self, range: impl RangeBounds<usize>) -> Drain<'_> {
    // validates the bounds (panicking on an illegal range) before anything is handed out
    let (start, end) = self.ensure_range(range);
    Drain {
        compact_string: self as *mut Self,
        start,
        end,
        chars: self[start..end].chars(),
    }
}

/// Shrinks the capacity of this [`CompactString`] with a lower bound.
///
/// The resulting capacity is never less than the size of 3×[`usize`],
/// i.e. the capacity that can be inlined.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// # use compact_str::CompactString;
/// let mut s = CompactString::with_capacity(100);
/// assert_eq!(s.capacity(), 100);
///
/// // if the capacity is not bigger than the argument, the call is a no-op
/// s.shrink_to(100);
/// assert_eq!(s.capacity(), 100);
///
/// s.shrink_to(50);
/// assert_eq!(s.capacity(), 50);
///
/// // if the string can be inlined, it is
/// s.shrink_to(10);
/// assert_eq!(s.capacity(), 3 * std::mem::size_of::<usize>());
/// ```
#[inline]
pub fn shrink_to(&mut self, min_capacity: usize) {
    self.0.shrink_to(min_capacity);
}

/// Shrinks the capacity of this [`CompactString`] to match its length.
///
/// The resulting capacity is never less than the size of 3×[`usize`],
/// i.e. the capacity that can be inlined.
///
/// This method is effectively the same as calling [`string.shrink_to(0)`].
/// /// # Examples /// /// Basic usage: /// /// ``` /// # use compact_str::CompactString; /// let mut s = CompactString::from("This is a string with more than 24 characters."); /// /// s.reserve(100); /// assert!(s.capacity() >= 100); /// /// s.shrink_to_fit(); /// assert_eq!(s.len(), s.capacity()); /// ``` /// /// ``` /// # use compact_str::CompactString; /// let mut s = CompactString::from("short string"); /// /// s.reserve(100); /// assert!(s.capacity() >= 100); /// /// s.shrink_to_fit(); /// assert_eq!(s.capacity(), 3 * std::mem::size_of::()); /// ``` #[inline] pub fn shrink_to_fit(&mut self) { self.0.shrink_to(0); } /// Retains only the characters specified by the predicate. /// /// The method iterates over the characters in the string and calls the `predicate`. /// /// If the `predicate` returns `false`, then the character gets removed. /// If the `predicate` returns `true`, then the character is kept. /// /// # Examples /// /// ``` /// # use compact_str::CompactString; /// let mut s = CompactString::from("รคb๐„ždโ‚ฌ"); /// /// let keep = [false, true, true, false, true]; /// let mut iter = keep.iter(); /// s.retain(|_| *iter.next().unwrap()); /// /// assert_eq!(s, "b๐„žโ‚ฌ"); /// ``` pub fn retain(&mut self, mut predicate: impl FnMut(char) -> bool) { // We iterate over the string, and copy character by character. let s = self.as_mut_str(); let mut dest_idx = 0; let mut src_idx = 0; while let Some(ch) = s[src_idx..].chars().next() { let ch_len = ch.len_utf8(); if predicate(ch) { // SAFETY: We know that both indices are valid, and that we don't split a char. unsafe { let p = s.as_mut_ptr(); core::ptr::copy(p.add(src_idx), p.add(dest_idx), ch_len); } dest_idx += ch_len; } src_idx += ch_len; } // SAFETY: We know that the index is a valid position to break the string. 
unsafe { self.set_len(dest_idx) }; } /// Decode a bytes slice as UTF-8 string, replacing any illegal codepoints /// /// # Examples /// /// ``` /// # use compact_str::CompactString; /// let chess_knight = b"\xf0\x9f\xa8\x84"; /// /// assert_eq!( /// "๐Ÿจ„", /// CompactString::from_utf8_lossy(chess_knight), /// ); /// /// // For valid UTF-8 slices, this is the same as: /// assert_eq!( /// "๐Ÿจ„", /// CompactString::new(std::str::from_utf8(chess_knight).unwrap()), /// ); /// ``` /// /// Incorrect bytes: /// /// ``` /// # use compact_str::CompactString; /// let broken = b"\xf0\x9f\xc8\x84"; /// /// assert_eq!( /// "๏ฟฝศ„", /// CompactString::from_utf8_lossy(broken), /// ); /// /// // For invalid UTF-8 slices, this is an optimized implemented for: /// assert_eq!( /// "๏ฟฝศ„", /// CompactString::from(String::from_utf8_lossy(broken)), /// ); /// ``` pub fn from_utf8_lossy(v: &[u8]) -> Self { fn next_char<'a>( iter: &mut <&[u8] as IntoIterator>::IntoIter, buf: &'a mut [u8; 4], ) -> Option<&'a [u8]> { const REPLACEMENT: &[u8] = "\u{FFFD}".as_bytes(); macro_rules! ensure_range { ($idx:literal, $range:pat) => {{ let mut i = iter.clone(); match i.next() { Some(&c) if matches!(c, $range) => { buf[$idx] = c; *iter = i; } _ => return Some(REPLACEMENT), } }}; } macro_rules! 
ensure_cont { ($idx:literal) => {{ ensure_range!($idx, 0x80..=0xBF); }}; } let c = *iter.next()?; buf[0] = c; match c { 0x00..=0x7F => { // simple ASCII: push as is Some(&buf[..1]) } 0xC2..=0xDF => { // two bytes ensure_cont!(1); Some(&buf[..2]) } 0xE0..=0xEF => { // three bytes match c { // 0x80..=0x9F encodes surrogate half 0xE0 => ensure_range!(1, 0xA0..=0xBF), // 0xA0..=0xBF encodes surrogate half 0xED => ensure_range!(1, 0x80..=0x9F), // all UTF-8 continuation bytes are valid _ => ensure_cont!(1), } ensure_cont!(2); Some(&buf[..3]) } 0xF0..=0xF4 => { // four bytes match c { // 0x80..=0x8F encodes overlong three byte codepoint 0xF0 => ensure_range!(1, 0x90..=0xBF), // 0x90..=0xBF encodes codepoint > U+10FFFF 0xF4 => ensure_range!(1, 0x80..=0x8F), // all UTF-8 continuation bytes are valid _ => ensure_cont!(1), } ensure_cont!(2); ensure_cont!(3); Some(&buf[..4]) } | 0x80..=0xBF // unicode continuation, invalid | 0xC0..=0xC1 // overlong one byte character | 0xF5..=0xF7 // four bytes that encode > U+10FFFF | 0xF8..=0xFB // five bytes, invalid | 0xFC..=0xFD // six bytes, invalid | 0xFE..=0xFF => Some(REPLACEMENT), // always invalid } } let mut buf = [0; 4]; let mut result = Self::with_capacity(v.len()); let mut iter = v.iter(); while let Some(s) = next_char(&mut iter, &mut buf) { // SAFETY: next_char() only returns valid strings let s = unsafe { std::str::from_utf8_unchecked(s) }; result.push_str(s); } result } fn from_utf16x( v: &[u8], from_int: impl Fn(u16) -> u16, from_bytes: impl Fn([u8; 2]) -> u16, ) -> Result { if v.len() % 2 != 0 { // Input had an odd number of bytes. return Err(Utf16Error(())); } // Note: we don't use collect::>() because that fails to pre-allocate a buffer, // even though the size of our iterator, `v`, is known ahead of time. 
// // rustlang issue #48994 is tracking the fix let mut result = CompactString::with_capacity(v.len() / 2); // SAFETY: `u8` and `u16` are `Copy`, so if the alignment fits, we can transmute a // `[u8; 2*N]` to `[u16; N]`. `slice::align_to()` checks if the alignment is right. match unsafe { v.align_to::() } { (&[], v, &[]) => { // Input is correcty aligned. for c in std::char::decode_utf16(v.iter().copied().map(from_int)) { result.push(c.map_err(|_| Utf16Error(()))?); } } _ => { // Input's alignment is off. // SAFETY: we can always reinterpret a `[u8; 2*N]` slice as `[[u8; 2]; N]` let v = unsafe { slice::from_raw_parts(v.as_ptr().cast(), v.len() / 2) }; for c in std::char::decode_utf16(v.iter().copied().map(from_bytes)) { result.push(c.map_err(|_| Utf16Error(()))?); } } } Ok(result) } fn from_utf16x_lossy( v: &[u8], from_int: impl Fn(u16) -> u16, from_bytes: impl Fn([u8; 2]) -> u16, ) -> Self { // Notice: We write the string "๏ฟฝ" instead of the character '๏ฟฝ', so the character does not // have to be formatted before it can be appended. let (trailing_extra_byte, v) = match v.len() % 2 != 0 { true => (true, &v[..v.len() - 1]), false => (false, v), }; let mut result = CompactString::with_capacity(v.len() / 2); // SAFETY: `u8` and `u16` are `Copy`, so if the alignment fits, we can transmute a // `[u8; 2*N]` to `[u16; N]`. `slice::align_to()` checks if the alignment is right. match unsafe { v.align_to::() } { (&[], v, &[]) => { // Input is correcty aligned. for c in std::char::decode_utf16(v.iter().copied().map(from_int)) { match c { Ok(c) => result.push(c), Err(_) => result.push_str("๏ฟฝ"), } } } _ => { // Input's alignment is off. 
// SAFETY: we can always reinterpret a `[u8; 2*N]` slice as `[[u8; 2]; N]` let v = unsafe { slice::from_raw_parts(v.as_ptr().cast(), v.len() / 2) }; for c in std::char::decode_utf16(v.iter().copied().map(from_bytes)) { match c { Ok(c) => result.push(c), Err(_) => result.push_str("๏ฟฝ"), } } } } if trailing_extra_byte { result.push_str("๏ฟฝ"); } result } /// Decode a slice of bytes as UTF-16 encoded string, in little endian. /// /// # Errors /// /// If the slice has an odd number of bytes, or if it did not contain valid UTF-16 characters, /// a [`Utf16Error`] is returned. /// /// # Examples /// /// ``` /// # use compact_str::CompactString; /// const DANCING_MEN: &[u8] = b"\x3d\xd8\x6f\xdc\x0d\x20\x42\x26\x0f\xfe"; /// let dancing_men = CompactString::from_utf16le(DANCING_MEN).unwrap(); /// assert_eq!(dancing_men, "๐Ÿ‘ฏโ€โ™‚๏ธ"); /// ``` #[inline] pub fn from_utf16le(v: impl AsRef<[u8]>) -> Result { CompactString::from_utf16x(v.as_ref(), u16::from_le, u16::from_le_bytes) } /// Decode a slice of bytes as UTF-16 encoded string, in big endian. /// /// # Errors /// /// If the slice has an odd number of bytes, or if it did not contain valid UTF-16 characters, /// a [`Utf16Error`] is returned. /// /// # Examples /// /// ``` /// # use compact_str::CompactString; /// const DANCING_WOMEN: &[u8] = b"\xd8\x3d\xdc\x6f\x20\x0d\x26\x40\xfe\x0f"; /// let dancing_women = CompactString::from_utf16be(DANCING_WOMEN).unwrap(); /// assert_eq!(dancing_women, "๐Ÿ‘ฏโ€โ™€๏ธ"); /// ``` #[inline] pub fn from_utf16be(v: impl AsRef<[u8]>) -> Result { CompactString::from_utf16x(v.as_ref(), u16::from_be, u16::from_be_bytes) } /// Lossy decode a slice of bytes as UTF-16 encoded string, in little endian. /// /// In this context "lossy" means that any broken characters in the input are replaced by the /// \ `'๏ฟฝ'`. Please notice that, unlike UTF-8, UTF-16 is not self /// synchronizing. I.e. if a byte in the input is dropped, all following data is broken. 
/// Lossy decode a slice of bytes as UTF-16 encoded string, in big endian.
///
/// In this context "lossy" means that any broken characters in the input are replaced by the
/// U+FFFD REPLACEMENT CHARACTER (`'\u{FFFD}'`). Please notice that, unlike UTF-8, UTF-16 is
/// not self synchronizing. I.e. if a byte in the input is dropped, all following data is
/// broken.
///
/// # Examples
///
/// ```
/// # use compact_str::CompactString;
/// // A "random" bit was flipped in the 9th byte:
/// const DANCING_WOMEN: &[u8] = b"\xd8\x3d\xdc\x6f\x20\x0d\x26\x40\xde\x0f";
/// let dancing_women = CompactString::from_utf16be_lossy(DANCING_WOMEN);
/// assert_eq!(dancing_women, "👯\u{200d}♀\u{FFFD}");
/// ```
#[inline]
pub fn from_utf16be_lossy(v: impl AsRef<[u8]>) -> Self {
    // `u16::from_be` / `u16::from_be_bytes` select big endian in the shared decoder
    CompactString::from_utf16x_lossy(v.as_ref(), u16::from_be, u16::from_be_bytes)
}
/// But this results in [`Drop`]-ing the original [`String`], which causes memory it owned on /// the heap to be deallocated. Instead when using this method, we always reuse the buffer that /// was previously owned by the [`String`], so no trips to the allocator are needed. /// /// # Examples /// /// ### Short Strings /// ``` /// use compact_str::CompactString; /// /// let short = "hello world".to_string(); /// let c_heap = CompactString::from_string_buffer(short); /// /// // using CompactString::from_string_buffer, we'll re-use the String's underlying buffer /// assert!(c_heap.is_heap_allocated()); /// /// // note: when Clone-ing a short heap allocated string, we'll eagerly inline at that point /// let c_inline = c_heap.clone(); /// assert!(!c_inline.is_heap_allocated()); /// /// assert_eq!(c_heap, c_inline); /// ``` /// /// ### Longer Strings /// ``` /// use compact_str::CompactString; /// /// let x = "longer string that will be on the heap".to_string(); /// let c1 = CompactString::from(x); /// /// let y = "longer string that will be on the heap".to_string(); /// let c2 = CompactString::from_string_buffer(y); /// /// // for longer strings, we re-use the underlying String's buffer in both cases /// assert!(c1.is_heap_allocated()); /// assert!(c2.is_heap_allocated()); /// ``` /// /// ### Buffer Re-use /// ``` /// use compact_str::CompactString; /// /// let og = "hello world".to_string(); /// let og_addr = og.as_ptr(); /// /// let mut c = CompactString::from_string_buffer(og); /// let ex_addr = c.as_ptr(); /// /// // When converting to/from String and CompactString with from_string_buffer we always re-use /// // the same underlying allocated memory/buffer /// assert_eq!(og_addr, ex_addr); /// /// let long = "this is a long string that will be on the heap".to_string(); /// let long_addr = long.as_ptr(); /// /// let mut long_c = CompactString::from(long); /// let long_ex_addr = long_c.as_ptr(); /// /// // When converting to/from String and CompactString with From, 
we'll also re-use the /// // underlying buffer, if the string is long, otherwise when converting to CompactString we /// // eagerly inline /// assert_eq!(long_addr, long_ex_addr); /// ``` #[inline] pub fn from_string_buffer(s: String) -> Self { let repr = Repr::from_string(s, false); CompactString(repr) } } impl Default for CompactString { #[inline] fn default() -> Self { CompactString::new("") } } impl Deref for CompactString { type Target = str; #[inline] fn deref(&self) -> &str { self.as_str() } } impl DerefMut for CompactString { #[inline] fn deref_mut(&mut self) -> &mut str { self.as_mut_str() } } impl AsRef for CompactString { #[inline] fn as_ref(&self) -> &str { self.as_str() } } impl AsRef for CompactString { #[inline] fn as_ref(&self) -> &OsStr { OsStr::new(self.as_str()) } } impl AsRef<[u8]> for CompactString { #[inline] fn as_ref(&self) -> &[u8] { self.as_bytes() } } impl Borrow for CompactString { #[inline] fn borrow(&self) -> &str { self.as_str() } } impl BorrowMut for CompactString { #[inline] fn borrow_mut(&mut self) -> &mut str { self.as_mut_str() } } impl Eq for CompactString {} impl> PartialEq for CompactString { fn eq(&self, other: &T) -> bool { self.as_str() == other.as_ref() } } impl PartialEq for String { fn eq(&self, other: &CompactString) -> bool { self.as_str() == other.as_str() } } impl PartialEq for &str { fn eq(&self, other: &CompactString) -> bool { *self == other.as_str() } } impl<'a> PartialEq for Cow<'a, str> { fn eq(&self, other: &CompactString) -> bool { *self == other.as_str() } } impl Ord for CompactString { fn cmp(&self, other: &Self) -> Ordering { self.as_str().cmp(other.as_str()) } } impl PartialOrd for CompactString { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Hash for CompactString { fn hash(&self, state: &mut H) { self.as_str().hash(state) } } impl<'a> From<&'a str> for CompactString { fn from(s: &'a str) -> Self { let repr = Repr::new(s); CompactString(repr) } } impl From for 
CompactString { fn from(s: String) -> Self { let repr = Repr::from_string(s, true); CompactString(repr) } } impl<'a> From<&'a String> for CompactString { fn from(s: &'a String) -> Self { CompactString::new(s) } } impl<'a> From> for CompactString { fn from(cow: Cow<'a, str>) -> Self { match cow { Cow::Borrowed(s) => s.into(), // we separate these two so we can re-use the underlying buffer in the owned case Cow::Owned(s) => s.into(), } } } impl From> for CompactString { fn from(b: Box) -> Self { let s = b.into_string(); let repr = Repr::from_string(s, true); CompactString(repr) } } impl From for String { #[inline] fn from(s: CompactString) -> Self { s.into_string() } } impl From for Cow<'_, str> { #[inline] fn from(s: CompactString) -> Self { Self::Owned(s.into_string()) } } impl<'a> From<&'a CompactString> for Cow<'a, str> { #[inline] fn from(s: &'a CompactString) -> Self { Self::Borrowed(s) } } impl FromStr for CompactString { type Err = core::convert::Infallible; fn from_str(s: &str) -> Result { Ok(CompactString::from(s)) } } impl fmt::Debug for CompactString { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(self.as_str(), f) } } impl fmt::Display for CompactString { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(self.as_str(), f) } } impl FromIterator for CompactString { fn from_iter>(iter: T) -> Self { let repr = iter.into_iter().collect(); CompactString(repr) } } impl<'a> FromIterator<&'a char> for CompactString { fn from_iter>(iter: T) -> Self { let repr = iter.into_iter().collect(); CompactString(repr) } } impl<'a> FromIterator<&'a str> for CompactString { fn from_iter>(iter: T) -> Self { let repr = iter.into_iter().collect(); CompactString(repr) } } impl FromIterator> for CompactString { fn from_iter>>(iter: T) -> Self { let repr = iter.into_iter().collect(); CompactString(repr) } } impl<'a> FromIterator> for CompactString { fn from_iter>>(iter: T) -> Self { let repr = iter.into_iter().collect(); 
/// Lets a [`CompactString`] be the target of `write!` / `writeln!`.
impl fmt::Write for CompactString {
    /// Appends `s` to the string; this implementation always returns `Ok(())`.
    fn write_str(&mut self, s: &str) -> fmt::Result {
        self.push_str(s);
        Ok(())
    }

    /// Appends the formatted `args` to the string.
    ///
    /// `self` is declared as `mut self: &mut Self` so that `&mut self` below is a mutable
    /// reference to a value implementing `fmt::Write`, which is what `fmt::write` expects.
    fn write_fmt(mut self: &mut Self, args: fmt::Arguments<'_>) -> fmt::Result {
        match args.as_str() {
            // `as_str()` yields the text when nothing has to be formatted at runtime, so we
            // can append it directly and skip the formatting machinery
            Some(s) => {
                self.push_str(s);
                Ok(())
            }
            // otherwise run the formatter, which calls back into `write_str`
            None => fmt::write(&mut self, args),
        }
    }
}
/// The error produced when a [`CompactString`] cannot be built from UTF-16 input.
///
/// This type is the error type for the [`from_utf16`] method on [`CompactString`].
///
/// [`from_utf16`]: CompactString::from_utf16
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// # use compact_str::CompactString;
/// // 𝄞mu<invalid>ic
/// let v = &[0xD834, 0xDD1E, 0x006d, 0x0075,
///           0xD800, 0x0069, 0x0063];
///
/// assert!(CompactString::from_utf16(v).is_err());
/// ```
#[derive(Copy, Clone, Debug)]
pub struct Utf16Error(());

impl fmt::Display for Utf16Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        const MESSAGE: &str = "invalid utf-16: lone surrogate found";
        // `pad` applies the caller's width / precision / alignment flags, exactly like
        // displaying the `&str` itself would
        f.pad(MESSAGE)
    }
}
#[inline] pub fn as_str(&self) -> &str { self.chars.as_str() } } impl Deref for Drain<'_> { type Target = str; #[inline] fn deref(&self) -> &Self::Target { self.as_str() } } impl Iterator for Drain<'_> { type Item = char; #[inline] fn next(&mut self) -> Option { self.chars.next() } #[inline] fn count(self) -> usize { // ::count() is specialized, and cloning is trivial. self.chars.clone().count() } fn size_hint(&self) -> (usize, Option) { self.chars.size_hint() } #[inline] fn last(mut self) -> Option { self.chars.next_back() } } impl DoubleEndedIterator for Drain<'_> { #[inline] fn next_back(&mut self) -> Option { self.chars.next_back() } } impl FusedIterator for Drain<'_> {} static_assertions::assert_eq_size!(CompactString, String); compact_str-0.7.1/src/macros.rs000064400000000000000000000030161046102023000145700ustar 00000000000000/// Creates a `CompactString` using interpolation of runtime expressions. /// /// The first argument `format_compact!` receives is a format string. /// This must be a string literal. /// The power of the formatting string is in the `{}`s contained. /// /// Additional parameters passed to `format_compact!` replace the `{}`s within /// the formatting string in the order given unless named or /// positional parameters are used; see [`std::fmt`] for more information. /// /// A common use for `format_compact!` is concatenation and interpolation /// of strings. /// The same convention is used with [`print!`] and [`write!`] macros, /// depending on the intended destination of the string. /// /// To convert a single value to a string, use the /// `ToCompactString::to_compact_string` method, which uses /// the [`std::fmt::Display`] formatting trait. /// /// # Panics /// /// `format_compact!` panics if a formatting trait implementation returns /// an error. /// /// This indicates an incorrect implementation since /// `ToCompactString::to_compact_string` never returns an error itself. #[macro_export] macro_rules! 
format_compact { ($($arg:tt)*) => { $crate::ToCompactString::to_compact_string(&$crate::core::format_args!($($arg)*)) } } #[cfg(test)] mod tests { #[test] fn test_macros() { assert_eq!(format_compact!("2"), "2"); assert_eq!(format_compact!("{}", 2), "2"); assert!(!format_compact!("2").is_heap_allocated()); assert!(!format_compact!("{}", 2).is_heap_allocated()); } } compact_str-0.7.1/src/repr/bytes.rs000064400000000000000000000143211046102023000154030ustar 00000000000000use core::str::Utf8Error; use bytes::Buf; use super::{ Repr, MAX_SIZE, }; impl Repr { /// Converts a [`Buf`] of bytes to a [`Repr`], checking that the provided bytes are valid UTF-8 pub fn from_utf8_buf(buf: &mut B) -> Result { // SAFETY: We check below to make sure the provided buffer is valid UTF-8 let (repr, bytes_written) = unsafe { Self::collect_buf(buf) }; // Check to make sure the provided bytes are valid UTF-8, return the Repr if they are! match core::str::from_utf8(&repr.as_slice()[..bytes_written]) { Ok(_) => Ok(repr), Err(e) => Err(e), } } /// Converts a [`Buf`] of bytes to a [`Repr`], without checking for valid UTF-8 /// /// # Safety /// * The provided buffer must be valid UTF-8 pub unsafe fn from_utf8_buf_unchecked(buf: &mut B) -> Self { let (repr, _bytes_written) = Self::collect_buf(buf); repr } /// Collects the bytes from a [`Buf`] into a [`Repr`] /// /// # Safety /// * The caller must guarantee that `buf` is valid UTF-8 unsafe fn collect_buf(buf: &mut B) -> (Self, usize) { // Get an empty Repr we can write into let mut repr = super::EMPTY; let mut bytes_written = 0; debug_assert_eq!(repr.len(), bytes_written); while buf.has_remaining() { let chunk = buf.chunk(); let chunk_len = chunk.len(); // There's an edge case where the final byte of this buffer == `HEAP_MASK`, which is // invalid UTF-8, but would result in us creating an inline variant, that identifies as // a heap variant. 
If a user ever tried to reference the data at all, we'd incorrectly // try and read data from an invalid memory address, causing undefined behavior. if bytes_written < MAX_SIZE && bytes_written + chunk_len == MAX_SIZE { let last_byte = chunk[chunk_len - 1]; // If we hit the edge case, reserve additional space to make the repr becomes heap // allocated, which prevents us from writing this last byte inline if last_byte >= 0b11000000 { repr.reserve(MAX_SIZE + 1); } } // reserve at least enough space to fit this chunk repr.reserve(chunk_len); // SAFETY: The caller is responsible for making sure the provided buffer is UTF-8. This // invariant is documented in the public API let slice = repr.as_mut_buf(); // write the chunk into the Repr slice[bytes_written..bytes_written + chunk_len].copy_from_slice(chunk); // Set the length of the Repr // SAFETY: We just wrote an additional `chunk_len` bytes into the Repr bytes_written += chunk_len; repr.set_len(bytes_written); // advance the pointer of the buffer buf.advance(chunk_len); } (repr, bytes_written) } } #[cfg(test)] mod test { use std::io::Cursor; use test_case::test_case; use super::Repr; #[test_case(""; "empty")] #[test_case("hello world"; "short")] #[test_case("hello, this is a long string which should be heap allocated"; "long")] fn test_from_utf8_buf(word: &'static str) { let mut buf = Cursor::new(word.as_bytes()); let repr = Repr::from_utf8_buf(&mut buf).unwrap(); assert_eq!(repr.as_str(), word); assert_eq!(repr.len(), word.len()); } #[test] fn test_from_utf8_packed() { cfg_if::cfg_if! 
{ if #[cfg(target_pointer_width = "64")] { let packed = "this string is 24 chars!"; } else if #[cfg(target_pointer_width = "32")] { let packed = "i am 12 char"; } else { compile_error!("unsupported architecture!") } } let mut buf = Cursor::new(packed.as_bytes()); let repr = Repr::from_utf8_buf(&mut buf).unwrap(); assert_eq!(repr.as_str(), packed); // This repr should __not__ be heap allocated assert!(!repr.is_heap_allocated()); } #[test] fn test_fuzz_panic() { let bytes = &[ 255, 255, 255, 255, 255, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 12, 0, 0, 96, ]; let mut buf: Cursor<&[u8]> = Cursor::new(bytes); assert!(Repr::from_utf8_buf(&mut buf).is_err()); } #[test] fn test_valid_repr_but_invalid_utf8() { let bytes = &[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 192, ]; let mut buf: Cursor<&[u8]> = Cursor::new(bytes); assert!(Repr::from_utf8_buf(&mut buf).is_err()); } #[test] fn test_fake_heap_variant() { let bytes = &[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, ]; let mut buf: Cursor<&[u8]> = Cursor::new(bytes); assert!(Repr::from_utf8_buf(&mut buf).is_err()); } #[test] fn test_from_non_contiguous() { let data = [ 211, 247, 211, 247, 121, 135, 151, 255, 126, 205, 255, 204, 211, 51, 51, 0, 52, 55, 247, 204, 45, 37, 44, 210, 132, 50, 206, 121, 135, 151, 255, 126, 205, 255, 204, 211, 51, 51, 0, 52, 55, 247, 204, 45, 44, 210, 132, 50, 206, 51, ]; let (front, back) = data.split_at(data.len() / 2 + 1); let mut queue = std::collections::VecDeque::with_capacity(data.len()); // create a non-contiguous slice of memory in queue front.into_iter().copied().for_each(|x| queue.push_back(x)); back.into_iter().copied().for_each(|x| queue.push_front(x)); // make sure it's non-contiguous let (a, b) = queue.as_slices(); assert!(data.is_empty() || !a.is_empty()); assert!(data.is_empty() || !b.is_empty()); assert_eq!(data.len(), queue.len()); assert!(Repr::from_utf8_buf(&mut queue).is_err()); } #[test] #[should_panic(expected = 
"Utf8Error")] fn test_invalid_utf8() { let invalid = &[0, 159]; let mut buf: Cursor<&[u8]> = Cursor::new(invalid); Repr::from_utf8_buf(&mut buf).unwrap(); } } compact_str-0.7.1/src/repr/capacity.rs000064400000000000000000000143661046102023000160630ustar 00000000000000use crate::repr::HEAP_MASK; // how many bytes a `usize` occupies const USIZE_SIZE: usize = core::mem::size_of::(); /// Used to generate [`CAPACITY_IS_ON_THE_HEAP`] #[allow(non_snake_case)] const fn CAP_ON_HEAP_FLAG() -> [u8; USIZE_SIZE] { // all bytes 255, with the last being HEAP_MASK let mut flag = [255; USIZE_SIZE]; flag[USIZE_SIZE - 1] = HEAP_MASK; flag } /// State that describes the capacity as being stored on the heap. /// /// All bytes `255`, with the last being [`HEAP_MASK`], using the same amount of bytes as `usize` /// Example (64-bit): `[255, 255, 255, 255, 255, 255, 255, 254]` const CAPACITY_IS_ON_THE_HEAP: [u8; USIZE_SIZE] = CAP_ON_HEAP_FLAG(); // how many bytes we can use for capacity const SPACE_FOR_CAPACITY: usize = USIZE_SIZE - 1; // the maximum value we're able to store, e.g. on 64-bit arch this is 2^56 - 2 // // note: Preferably we'd used usize.pow(..) here, but that's not a `const fn`, so we need to use // bitshift operators, and there's a lint against using them in this pattern, which IMO isn't a // great lint pub const MAX_VALUE: usize = 2usize.pow(SPACE_FOR_CAPACITY as u32 * 8) - 2; /// An integer type that uses `core::mem::size_of::() - 1` bytes to store the capacity of /// a heap buffer. /// /// Assumming a 64-bit arch, a [`super::BoxString`] uses 8 bytes for a pointer, 8 bytes for a /// length, and then needs 1 byte for a discriminant. We need to store the capacity somewhere, and /// we could store it on the heap, but we also have 7 unused bytes. [`Capacity`] handles storing a /// value in these 7 bytes, returning an error if it's not possible, at which point we'll store the /// capacity on the heap. 
/// /// # Max Values /// * __64-bit:__ `(2 ^ (7 * 8)) - 2 = 72_057_594_037_927_934 ~= 64 petabytes` /// * __32-bit:__ `(2 ^ (3 * 8)) - 2 = 16_777_214 ~= 16 megabytes` /// /// Practically speaking, on a 64-bit architecture we'll never need to store the capacity on the /// heap, because with it's impossible to create a string that is 64 petabytes or larger. But for /// 32-bit architectures we need to be able to store a capacity larger than 16 megabytes, since a /// string larger than 16 megabytes probably isn't that uncommon. #[derive(Copy, Clone, Debug, PartialEq, Eq)] #[cfg_attr(target_pointer_width = "64", repr(align(8)))] #[cfg_attr(target_pointer_width = "32", repr(align(4)))] pub struct Capacity([u8; USIZE_SIZE]); static_assertions::assert_eq_size!(Capacity, usize); static_assertions::assert_eq_align!(Capacity, usize); impl Capacity { #[inline] pub const fn new(capacity: usize) -> Self { cfg_if::cfg_if! { if #[cfg(target_pointer_width = "64")] { // on 64-bit arches we can always fit the capacity inline debug_assert!(capacity <= MAX_VALUE); let mut bytes = capacity.to_le_bytes(); bytes[core::mem::size_of::() - 1] = HEAP_MASK; Capacity(bytes) } else if #[cfg(target_pointer_width = "32")] { // on 32-bit arches we might need to store the capacity on the heap if capacity > MAX_VALUE { // if we need the last byte to encode this capacity then we need to put the capacity on // the heap. return an Error so `BoxString` can do the right thing Capacity(CAPACITY_IS_ON_THE_HEAP) } else { // otherwise, we can store this capacity inline! 
Set the last byte to be our `HEAP_MASK` // for our discriminant, using the leading bytes to store the actual value let mut bytes = capacity.to_le_bytes(); bytes[core::mem::size_of::() - 1] = HEAP_MASK; Capacity(bytes) } } else { compile_error!("Unsupported target_pointer_width"); } } } /// Re-interprets a [`Capacity`] as a `usize` /// /// # SAFETY: /// * `self` must be less than or equal to [`MAX_VALUE`] #[inline(always)] pub unsafe fn as_usize(&self) -> usize { let mut usize_buf = [0u8; USIZE_SIZE]; // SAFETY: // * `src` is valid for reads of `SPACE_FOR_CAPACITY` because it is less than `USIZE_SIZE` // * `dst` is valid for reads of `SPACE_FOR_CAPACITY` because it is less than `USIZE_SIZE` // * `src` and `dst` do not overlap because we created `usize_buf` core::ptr::copy_nonoverlapping(self.0.as_ptr(), usize_buf.as_mut_ptr(), SPACE_FOR_CAPACITY); usize::from_le_bytes(usize_buf) } /// Returns whether or not this [`Capacity`] has a value that indicates the capacity is being /// stored on the heap #[inline(always)] pub fn is_heap(&self) -> bool { self.0 == CAPACITY_IS_ON_THE_HEAP } } #[cfg(test)] mod tests { use rayon::prelude::*; use super::Capacity; #[test] fn test_zero_roundtrips() { let og = 0; let cap = Capacity::new(og); let after = unsafe { cap.as_usize() }; assert_eq!(og, after); } #[test] fn test_max_value() { let available_bytes = (core::mem::size_of::() - 1) as u32; let max_value = 2usize.pow(available_bytes * 8) - 2; #[cfg(target_pointer_width = "64")] assert_eq!(max_value, 72057594037927934); #[cfg(target_pointer_width = "32")] assert_eq!(max_value, 16777214); let cap = Capacity::new(max_value); let after = unsafe { cap.as_usize() }; assert_eq!(max_value, after); } #[cfg(target_pointer_width = "32")] #[test] fn test_invalid_value() { let invalid_val = usize::MAX; let cap = Capacity::new(invalid_val); let after = unsafe { cap.as_usize() }; // anything greater than or equal to 16777215, should "resolve" to 16777215 assert_eq!(16777215, after); } #[test] 
/// Computes the capacity to grow to when `additional` more bytes are needed on top of
/// `cur_len`.
///
/// The result is the larger of `cur_len + additional` and `cur_len * 1.5`, with both
/// intermediate computations saturating instead of overflowing.
///
/// Note: this is different than [`std::string::String`], which grows at a rate of 2x. It's
/// debated which is better, for now we'll stick with a rate of 1.5x
#[inline(always)]
pub fn amortized_growth(cur_len: usize, additional: usize) -> usize {
    // 1.5x the current length, rounded down
    let grown = cur_len.saturating_mul(3) / 2;
    // the bare minimum that satisfies the request
    let needed = cur_len.saturating_add(additional);

    if grown >= needed {
        grown
    } else {
        needed
    }
}
We also know they're non-overlapping because `dest` is newly allocated unsafe { ptr.as_ptr().copy_from_nonoverlapping(text.as_ptr(), len) }; HeapBuffer { ptr, len, cap } } /// Create an empty [`HeapBuffer`] with a specific capacity #[inline] pub fn with_capacity(capacity: usize) -> Self { let len = 0; let (cap, ptr) = allocate_ptr(capacity); HeapBuffer { ptr, len, cap } } /// Create a [`HeapBuffer`] with `text` that has _at least_ `additional` bytes of capacity /// /// To prevent frequent re-allocations, this method will create a [`HeapBuffer`] with a capacity /// of `text.len() + additional` or `text.len() * 1.5`, whichever is greater #[inline] pub fn with_additional(text: &str, additional: usize) -> Self { let len = text.len(); let new_capacity = amortized_growth(len, additional); let (cap, ptr) = allocate_ptr(new_capacity); // copy our string into the buffer we just allocated // // SAFETY: We know both `src` and `dest` are valid for respectively reads and writes of // length `len` because `len` comes from `src`, and `dest` was allocated to be at least that // length. 
We also know they're non-overlapping because `dest` is newly allocated unsafe { ptr.as_ptr().copy_from_nonoverlapping(text.as_ptr(), len) }; HeapBuffer { ptr, len, cap } } /// Return the capacity of the [`HeapBuffer`] #[inline] pub fn capacity(&self) -> usize { #[cold] fn read_capacity_from_heap(this: &HeapBuffer) -> usize { // re-adjust the pointer to include the capacity that's on the heap let adj_ptr: *const u8 = this.ptr.as_ptr().wrapping_sub(mem::size_of::()); let mut buf = [0u8; mem::size_of::()]; // SAFETY: `src` and `dst` don't overlap, and are valid for usize number of bytes unsafe { ptr::copy_nonoverlapping(adj_ptr, buf.as_mut_ptr(), mem::size_of::()); } usize::from_ne_bytes(buf) } if self.cap.is_heap() { read_capacity_from_heap(self) } else { // SAFETY: Checked above that the capacity is on the stack unsafe { self.cap.as_usize() } } } /// Try to grow the [`HeapBuffer`] by reallocating, returning an error if we fail pub fn realloc(&mut self, new_capacity: usize) -> Result { let new_cap = Capacity::new(new_capacity); // We can't reallocate to a size less than our length, or else we'd clip the string if new_capacity < self.len { return Err(()); } // HeapBuffer doesn't support 0 byte heap sizes if new_capacity == 0 { return Err(()); } // Always allocate at least MIN_HEAP_SIZE let new_capacity = cmp::max(new_capacity, MIN_HEAP_SIZE); let (new_cap, new_ptr) = match (self.cap.is_heap(), new_cap.is_heap()) { // both current and new capacity can be stored inline (false, false) => { // SAFETY: checked above that our capacity is valid let cap = unsafe { self.cap.as_usize() }; // current capacity is the same as the new, nothing to do! 
if cap == new_capacity { return Ok(new_capacity); } let cur_layout = inline_capacity::layout(cap); let new_layout = inline_capacity::layout(new_capacity); let new_size = new_layout.size(); // It's possible `new_size` could overflow since inline_capacity::layout pads for // alignment if new_size < new_capacity { return Err(()); } // SAFETY: // * We're using the same allocator that we used for `ptr` // * The layout is the same because we checked that the capacity is inline // * `new_size` will be > 0, we return early if the requested capacity is 0 // * Checked above if `new_size` overflowed when rounding to alignment match ptr::NonNull::new(unsafe { std::alloc::realloc(self.ptr.as_ptr(), cur_layout, new_size) }) { Some(ptr) => (new_cap, ptr), None => return Err(()), } } // both current and new capacity need to be stored on the heap (true, true) => { let cur_layout = heap_capacity::layout(self.capacity()); let new_layout = heap_capacity::layout(new_capacity); let new_size = new_layout.size(); // alloc::realloc requires that size > 0 debug_assert!(new_size > 0); // It's possible `new_size` could overflow since heap_capacity::layout requires a // few additional bytes if new_size < new_capacity { return Err(()); } // move our pointer back one WORD since our capacity is behind it let raw_ptr = self.ptr.as_ptr(); let adj_ptr = raw_ptr.wrapping_sub(mem::size_of::()); // SAFETY: // * We're using the same allocator that we used for `ptr` // * The layout is the same because we checked that the capacity is on the heap // * `new_size` will be > 0, we return early if the requested capacity is 0 // * Checked above if `new_size` overflowed when rounding to alignment let cap_ptr = unsafe { std::alloc::realloc(adj_ptr, cur_layout, new_size) }; // Check if reallocation succeeded if cap_ptr.is_null() { return Err(()); } // Our allocation succeeded! 
Write the new capacity // // SAFTEY: // * `src` and `dst` are both valid for reads of `usize` number of bytes // * `src` and `dst` don't overlap because we created `src` unsafe { ptr::copy_nonoverlapping( new_capacity.to_ne_bytes().as_ptr(), cap_ptr, mem::size_of::(), ) }; // Finally, adjust our pointer backup so it points at the string content let str_ptr = cap_ptr.wrapping_add(mem::size_of::()); // SAFETY: We checked above to make sure the pointer was non-null let ptr = unsafe { ptr::NonNull::new_unchecked(str_ptr) }; (new_cap, ptr) } // capacity is currently inline or on the heap, but needs to move, can't realloc because // we'd need to change the layout! (false, true) | (true, false) => return Err(()), }; // set our new pointer and new capacity self.ptr = new_ptr; self.cap = new_cap; Ok(new_capacity) } /// Set's the length of the content for this [`HeapBuffer`] /// /// # SAFETY: /// * The caller must guarantee that `len` bytes in the buffer are valid UTF-8 #[inline] pub unsafe fn set_len(&mut self, len: usize) { self.len = len; } /// Deallocates the memory owned by the provided [`HeapBuffer`] #[inline] pub fn dealloc(&mut self) { deallocate_ptr(self.ptr, self.cap); } } impl Clone for HeapBuffer { fn clone(&self) -> Self { // Create a new HeapBuffer with the same capacity as the original let mut new = Self::with_capacity(self.capacity()); // SAFETY: // * `src` and `dst` don't overlap because we just created `dst` // * `src` and `dst` are both valid for `self.len` bytes because self.len < capacity unsafe { new.ptr .as_ptr() .copy_from_nonoverlapping(self.ptr.as_ptr(), self.len) }; // SAFETY: // * We copied the text from self, which is valid UTF-8 unsafe { new.set_len(self.len) }; new } } impl Drop for HeapBuffer { fn drop(&mut self) { self.dealloc() } } /// Allocates a buffer on the heap that we can use to store a string, optionally stores the capacity /// of said buffer on the heap. 
/// /// Returns a [`Capacity`] that either indicates the capacity is stored on the heap, or is stored /// in the `Capacity` itself. #[inline] pub fn allocate_ptr(capacity: usize) -> (Capacity, ptr::NonNull) { // We allocate at least MIN_HEAP_SIZE bytes because we need to allocate at least one byte let capacity = capacity.max(MIN_HEAP_SIZE); let cap = Capacity::new(capacity); // HeapBuffer doesn't support 0 sized allocations, we should always allocate at least // MIN_HEAP_SIZE bytes debug_assert!(capacity > 0); #[cold] fn allocate_with_capacity_on_heap(capacity: usize) -> ptr::NonNull { // write our capacity onto the heap let ptr = heap_capacity::alloc(capacity); // SAFETY: // * `src` and `dst` don't overlap and are both valid for `usize` bytes unsafe { ptr::copy_nonoverlapping( capacity.to_ne_bytes().as_ptr(), ptr.as_ptr(), mem::size_of::(), ) }; let raw_ptr = ptr.as_ptr().wrapping_add(core::mem::size_of::()); // SAFETY: We know `raw_ptr` is non-null because we just created it unsafe { ptr::NonNull::new_unchecked(raw_ptr) } } let ptr = if cap.is_heap() { allocate_with_capacity_on_heap(capacity) } else { unsafe { inline_capacity::alloc(capacity) } }; (cap, ptr) } /// Deallocates a buffer on the heap, handling when the capacity is also stored on the heap #[inline] pub fn deallocate_ptr(ptr: ptr::NonNull, cap: Capacity) { #[cold] fn deallocate_with_capacity_on_heap(ptr: ptr::NonNull) { // re-adjust the pointer to include the capacity that's on the heap let adj_ptr = ptr.as_ptr().wrapping_sub(mem::size_of::()); // read the capacity from the heap so we know how much to deallocate let mut buf = [0u8; mem::size_of::()]; // SAFETY: `src` and `dst` don't overlap, and are valid for usize number of bytes unsafe { ptr::copy_nonoverlapping(adj_ptr, buf.as_mut_ptr(), mem::size_of::()); } let capacity = usize::from_ne_bytes(buf); // SAFETY: We know the pointer is not null since we got it as a NonNull let ptr = unsafe { ptr::NonNull::new_unchecked(adj_ptr) }; // SAFETY: We checked 
above that our capacity is on the heap, and we readjusted the // pointer to reference the capacity unsafe { heap_capacity::dealloc(ptr, capacity) } } if cap.is_heap() { deallocate_with_capacity_on_heap(ptr); } else { // SAFETY: Our capacity is always inline on 64-bit archs unsafe { inline_capacity::dealloc(ptr, cap.as_usize()) } } } mod heap_capacity { use core::ptr; use std::alloc; use super::StrBuffer; #[inline] pub fn alloc(capacity: usize) -> ptr::NonNull { let layout = layout(capacity); debug_assert!(layout.size() > 0); // SAFETY: `alloc(...)` has undefined behavior if the layout is zero-sized. We know the // layout can't be zero-sized though because we're always at least allocating one `usize` let raw_ptr = unsafe { alloc::alloc(layout) }; // Check to make sure our pointer is non-null, some allocators return null pointers instead // of panicking match ptr::NonNull::new(raw_ptr) { Some(ptr) => ptr, None => alloc::handle_alloc_error(layout), } } /// Deallocates a pointer which references a `HeapBuffer` whose capacity is on the heap /// /// # Saftey /// * `ptr` must point to the start of a `HeapBuffer` whose capacity is on the heap. i.e. we /// must have `ptr -> [cap ; string]` pub unsafe fn dealloc(ptr: ptr::NonNull, capacity: usize) { let layout = layout(capacity); alloc::dealloc(ptr.as_ptr(), layout); } #[repr(C)] struct HeapBufferInnerHeapCapacity { capacity: usize, buffer: StrBuffer, } #[inline(always)] pub fn layout(capacity: usize) -> alloc::Layout { let buffer_layout = alloc::Layout::array::(capacity).expect("valid capacity"); alloc::Layout::new::() .extend(buffer_layout) .expect("valid layout") .0 .pad_to_align() } } mod inline_capacity { use core::ptr; use std::alloc; use super::StrBuffer; /// # SAFETY: /// * `capacity` must be > 0 #[inline] pub unsafe fn alloc(capacity: usize) -> ptr::NonNull { let layout = layout(capacity); debug_assert!(layout.size() > 0); // SAFETY: `alloc(...)` has undefined behavior if the layout is zero-sized. 
We specify that // `capacity` must be > 0 as a constraint to uphold the safety of this method. If capacity // is greater than 0, then our layout will be non-zero-sized. let raw_ptr = alloc::alloc(layout); // Check to make sure our pointer is non-null, some allocators return null pointers instead // of panicking match ptr::NonNull::new(raw_ptr) { Some(ptr) => ptr, None => alloc::handle_alloc_error(layout), } } /// Deallocates a pointer which references a `HeapBuffer` whose capacity is stored inline /// /// # Saftey /// * `ptr` must point to the start of a `HeapBuffer` whose capacity is on the inline pub unsafe fn dealloc(ptr: ptr::NonNull, capacity: usize) { let layout = layout(capacity); alloc::dealloc(ptr.as_ptr(), layout); } #[repr(C)] struct HeapBufferInnerInlineCapacity { buffer: StrBuffer, } #[inline(always)] pub fn layout(capacity: usize) -> alloc::Layout { let buffer_layout = alloc::Layout::array::(capacity).expect("valid capacity"); alloc::Layout::new::() .extend(buffer_layout) .expect("valid layout") .0 .pad_to_align() } } #[cfg(test)] mod test { use test_case::test_case; use super::{ HeapBuffer, MIN_HEAP_SIZE, }; const EIGHTEEN_MB: usize = 18 * 1024 * 1024; #[test] fn test_min_capacity() { let h = HeapBuffer::new("short"); assert_eq!(h.capacity(), MIN_HEAP_SIZE); } #[test_case(&[42; 8]; "short")] #[test_case(&[42; 50]; "long")] #[test_case(&[42; EIGHTEEN_MB]; "huge")] fn test_capacity(buf: &[u8]) { // we know the buffer is valid UTF-8 let s = unsafe { core::str::from_utf8_unchecked(buf) }; let h = HeapBuffer::new(s); assert_eq!(h.capacity(), core::cmp::max(s.len(), MIN_HEAP_SIZE)); } #[test_case(&[42; 0], 0, Err(MIN_HEAP_SIZE); "empty_empty")] #[test_case(&[42; 64], 0, Err(64); "short_empty")] #[test_case(&[42; 64], 32, Err(64); "short_to_shorter")] #[test_case(&[42; 64], 128, Ok(128); "short_to_longer")] #[test_case(&[42; EIGHTEEN_MB], EIGHTEEN_MB + 128, Ok(EIGHTEEN_MB + 128); "heap_to_heap")] fn test_realloc(buf: &[u8], realloc: usize, result: Result) { 
// we know the buffer is valid UTF-8 let s = unsafe { core::str::from_utf8_unchecked(buf) }; let mut h = HeapBuffer::new(s); // reallocate, asserting our result let expected_cap = match result { Ok(c) | Err(c) => c, }; let expected_res = result.map_err(|_| ()); assert_eq!(h.realloc(realloc), expected_res); assert_eq!(h.capacity(), expected_cap); } #[test] fn test_realloc_inline_to_heap() { // we know the buffer is valid UTF-8 let s = unsafe { core::str::from_utf8_unchecked(&[42; 128]) }; let mut h = HeapBuffer::new(s); cfg_if::cfg_if! { if #[cfg(target_pointer_width = "64")] { let expected_result = Ok(EIGHTEEN_MB); let expected_capacity = EIGHTEEN_MB; } else if #[cfg(target_pointer_width = "32")] { // on 32-bit architectures we'd need to change the layout from capacity being inline // to the capacity being on the heap, which isn't possible let expected_result = Err(()); let expected_capacity = 128; } else { compile_error!("Unsupported pointer width!"); } } assert_eq!(h.realloc(EIGHTEEN_MB), expected_result); assert_eq!(h.capacity(), expected_capacity); } #[test_case(&[42; 64], 128, 100, Ok(100); "sanity")] fn test_realloc_shrink( buf: &[u8], realloc_one: usize, realloc_two: usize, exp_result: Result, ) { // we know the buffer is valid UTF-8 let s = unsafe { core::str::from_utf8_unchecked(buf) }; let mut h = HeapBuffer::new(s); assert!( realloc_one > realloc_two, "we have to grow before we can shrink" ); // grow our allocation assert_eq!(h.realloc(realloc_one), Ok(realloc_one)); // shrink our allocation, asserting our result let expected_cap = match exp_result { Ok(c) | Err(c) => c, }; let expected_res = exp_result.map_err(|_| ()); assert_eq!(h.realloc(realloc_two), expected_res); assert_eq!(h.capacity(), expected_cap); } #[test] fn test_realloc_shrink_heap_to_inline() { // TODO: test this case assert!(true) } #[test_case(&[42; 0]; "empty")] #[test_case(&[42; 3]; "short")] #[test_case(&[42; 64]; "long")] #[test_case(&[42; EIGHTEEN_MB]; "huge")] fn test_clone(buf: 
&[u8]) { let s = unsafe { core::str::from_utf8_unchecked(buf) }; let h_a = HeapBuffer::new(s); let h_b = h_a.clone(); assert_eq!(h_a.capacity(), h_b.capacity()); } } compact_str-0.7.1/src/repr/inline.rs000064400000000000000000000147301046102023000155370ustar 00000000000000use core::ptr; use super::{ Repr, LENGTH_MASK, MAX_SIZE, }; /// A buffer stored on the stack whose size is equal to the stack size of `String` #[repr(transparent)] pub struct InlineBuffer(pub [u8; MAX_SIZE]); static_assertions::assert_eq_size!(InlineBuffer, Repr); impl InlineBuffer { /// Construct a new [`InlineString`]. A string that lives in a small buffer on the stack /// /// SAFETY: /// * The caller must guarantee that the length of `text` is less than [`MAX_SIZE`] #[inline] pub unsafe fn new(text: &str) -> Self { debug_assert!(text.len() <= MAX_SIZE); let len = text.len(); let mut buffer = [0u8; MAX_SIZE]; // set the length in the last byte buffer[MAX_SIZE - 1] = len as u8 | LENGTH_MASK; // copy the string into our buffer // // note: in the case where len == MAX_SIZE, we'll overwrite the len, but that's okay because // when reading the length we can detect that the last byte is part of UTF-8 and return a // length of MAX_SIZE // // SAFETY: // * src (`text`) is valid for `len` bytes because `len` comes from `text` // * dst (`buffer`) is valid for `len` bytes because we assert src is less than MAX_SIZE // * src and dst don't overlap because we created dst // ptr::copy_nonoverlapping(text.as_ptr(), buffer.as_mut_ptr(), len); InlineBuffer(buffer) } #[inline] pub const fn new_const(text: &str) -> Self { if text.len() > MAX_SIZE { panic!("Provided string has a length greater than our MAX_SIZE"); } let len = text.len(); let mut buffer = [0u8; MAX_SIZE]; // set the length buffer[MAX_SIZE - 1] = len as u8 | LENGTH_MASK; // Note: for loops aren't allowed in `const fn`, hence the while. // Note: Iterating forward results in badly optimized code, because the compiler tries to // unroll the loop. 
let text = text.as_bytes();
        let mut i = len;
        while i > 0 {
            buffer[i - 1] = text[i - 1];
            i -= 1;
        }

        InlineBuffer(buffer)
    }

    /// Returns an empty [`InlineBuffer`]
    #[inline(always)]
    pub const fn empty() -> Self {
        Self::new_const("")
    }

    /// Consumes the [`InlineBuffer`] returning the entire underlying array and the length of the
    /// string that it contains
    ///
    /// If the last byte was only storing the length (i.e. the string is shorter than
    /// `MAX_SIZE`), it is zeroed in the returned array.
    #[inline]
    #[cfg(feature = "smallvec")]
    pub fn into_array(self) -> ([u8; MAX_SIZE], usize) {
        let mut buffer = self.0;

        // decode the length from the last byte, clamping to MAX_SIZE for the case where the last
        // byte is part of the string itself
        let length = core::cmp::min(
            (buffer[MAX_SIZE - 1].wrapping_sub(LENGTH_MASK)) as usize,
            MAX_SIZE,
        );

        let last_byte_ref = &mut buffer[MAX_SIZE - 1];

        // unset the last byte of the buffer if it's just storing the length of the string
        //
        // Note: we should never add an `else` statement here, keeping the conditional simple allows
        // the compiler to optimize this to a conditional-move instead of a branch
        if length < MAX_SIZE {
            *last_byte_ref = 0;
        }

        (buffer, length)
    }

    /// Sets the length of the content for this [`InlineBuffer`]
    ///
    /// # SAFETY:
    /// * The caller must guarantee that `len` bytes in the buffer are valid UTF-8
    #[inline]
    pub unsafe fn set_len(&mut self, len: usize) {
        debug_assert!(len <= MAX_SIZE);

        // If `length` == MAX_SIZE, then we infer the length to be the capacity of the buffer. We
        // can infer this because the way we encode length doesn't overlap with any valid UTF-8
        // bytes
        if len < MAX_SIZE {
            self.0[MAX_SIZE - 1] = len as u8 | LENGTH_MASK;
        }
    }

    /// Returns a byte-for-byte copy of this [`InlineBuffer`]
    #[inline(always)]
    pub fn copy(&self) -> Self {
        InlineBuffer(self.0)
    }
}

#[cfg(test)]
mod tests {
    use rayon::prelude::*;

    #[test]
    #[ignore] // we run this in CI, but unless you're compiling in release, this takes a while
    fn test_unused_utf8_bytes() {
        // test to validate for all char the first and last bytes are never within a specified range
        // note: according to the UTF-8 spec it shouldn't be, but we double check that here
        (0..u32::MAX).into_par_iter().for_each(|i| {
            if let Ok(c) = char::try_from(i) {
                let mut buf = [0_u8; 4];
                c.encode_utf8(&mut buf);

                // check ranges for first byte
                match buf[0] {
                    x @ 128..=191 => panic!("first byte within 128..=191, {}", x),
                    x @ 248..=255 => panic!("first byte within 248..=255, {}", x),
                    _ => (),
                }

                // check ranges for last byte
                match buf[c.len_utf8() - 1] {
                    x @ 192..=255 => panic!("last byte within 192..=255, {}", x),
                    _ => (),
                }
            }
        })
    }

    #[cfg(feature = "smallvec")]
    mod smallvec {
        use quickcheck_macros::quickcheck;

        use crate::repr::{
            InlineBuffer,
            MAX_SIZE,
        };

        #[test]
        fn test_into_array() {
            let s = "hello world!";

            let inline = unsafe { InlineBuffer::new(s) };
            let (array, length) = inline.into_array();

            assert_eq!(s.len(), length);

            // all bytes after the length should be 0
            assert!(array[length..].iter().all(|b| *b == 0));

            // taking a string slice should give back the same string as the original
            let ex_s = unsafe { std::str::from_utf8_unchecked(&array[..length]) };
            assert_eq!(s, ex_s);
        }

        #[quickcheck]
        #[cfg_attr(miri, ignore)]
        fn quickcheck_into_array(s: String) {
            // keep only as many leading chars as fit strictly below MAX_SIZE bytes
            let mut total_length = 0;
            let s: String = s
                .chars()
                .take_while(|c| {
                    total_length += c.len_utf8();
                    total_length < MAX_SIZE
                })
                .collect();

            let inline = unsafe { InlineBuffer::new(&s) };
            let (array, length) = inline.into_array();
            assert_eq!(s.len(), length);

            // all bytes after the length should be 0
assert!(array[length..].iter().all(|b| *b == 0)); // taking a string slice should give back the same string as the original let ex_s = unsafe { std::str::from_utf8_unchecked(&array[..length]) }; assert_eq!(s, ex_s); } } } compact_str-0.7.1/src/repr/iter.rs000064400000000000000000000152151046102023000152230ustar 00000000000000//! Implementations of the [`FromIterator`] trait to make building [`Repr`]s more ergonomic use core::iter::FromIterator; use std::borrow::Cow; use super::{ InlineBuffer, Repr, MAX_SIZE, }; use crate::CompactString; impl FromIterator for Repr { fn from_iter>(iter: T) -> Self { let mut iter = iter.into_iter(); // If the size hint indicates we can't store this inline, then create a heap string let (size_hint, _) = iter.size_hint(); if size_hint > MAX_SIZE { return Repr::from_string(iter.collect(), true); } // Otherwise, continuously pull chars from the iterator let mut curr_len = 0; let mut inline_buf = InlineBuffer::new_const(""); while let Some(c) = iter.next() { let char_len = c.len_utf8(); // If this new character is too large to fit into the inline buffer, then create a heap // string if char_len + curr_len > MAX_SIZE { let (min_remaining, _) = iter.size_hint(); let mut string = String::with_capacity(char_len + curr_len + min_remaining); // push existing characters onto the heap // SAFETY: `inline_buf` has been filled with `char`s which are valid UTF-8 string .push_str(unsafe { core::str::from_utf8_unchecked(&inline_buf.0[..curr_len]) }); // push current char onto the heap string.push(c); // extend heap with remaining characters string.extend(iter); return Repr::from_string(string, true); } // write the current char into a slice of the unoccupied space c.encode_utf8(&mut inline_buf.0[curr_len..]); curr_len += char_len; } // SAFETY: Everything we just pushed onto the buffer is a `str` which is valid UTF-8 unsafe { inline_buf.set_len(curr_len) } Repr::from_inline(inline_buf) } } impl<'a> FromIterator<&'a char> for Repr { fn from_iter>(iter: T) 
-> Self { iter.into_iter().copied().collect() } } fn from_as_ref_str_iterator(mut iter: I) -> Repr where S: AsRef, I: Iterator, String: core::iter::Extend, String: FromIterator, { // Note: We don't check the lower bound here like we do in the character iterator because it's // possible for the iterator to be full of empty strings! In which case checking the lower bound // could cause us to heap allocate when there's no need. // Continuously pull strings from the iterator let mut curr_len = 0; let mut inline_buf = InlineBuffer::new_const(""); while let Some(s) = iter.next() { let str_slice = s.as_ref(); let bytes_len = str_slice.len(); // this new string is too large to fit into our inline buffer, so heap allocate the rest if bytes_len + curr_len > MAX_SIZE { let (min_remaining, _) = iter.size_hint(); let mut string = String::with_capacity(bytes_len + curr_len + min_remaining); // push existing strings onto the heap // SAFETY: `inline_buf` has been filled with `&str`s which are valid UTF-8 string.push_str(unsafe { core::str::from_utf8_unchecked(&inline_buf.0[..curr_len]) }); // push current string onto the heap string.push_str(str_slice); // extend heap with remaining strings string.extend(iter); return Repr::from_string(string, true); } // write the current string into a slice of the unoccupied space inline_buf.0[curr_len..][..bytes_len].copy_from_slice(str_slice.as_bytes()); curr_len += bytes_len; } // SAFETY: Everything we just pushed onto the buffer is a `str` which is valid UTF-8 unsafe { inline_buf.set_len(curr_len) } Repr::from_inline(inline_buf) } impl<'a> FromIterator<&'a str> for Repr { fn from_iter>(iter: T) -> Self { from_as_ref_str_iterator(iter.into_iter()) } } impl FromIterator> for Repr { fn from_iter>>(iter: T) -> Self { from_as_ref_str_iterator(iter.into_iter()) } } impl FromIterator for Repr { fn from_iter>(iter: T) -> Self { from_as_ref_str_iterator(iter.into_iter()) } } impl FromIterator for Repr { fn from_iter>(iter: T) -> Self { 
from_as_ref_str_iterator(iter.into_iter()) } } impl<'a> FromIterator> for Repr { fn from_iter>>(iter: T) -> Self { from_as_ref_str_iterator(iter.into_iter()) } } #[cfg(test)] mod tests { use super::Repr; #[test] fn short_char_iter() { let chars = ['a', 'b', 'c']; let repr: Repr = chars.iter().collect(); assert_eq!(repr.as_str(), "abc"); assert!(!repr.is_heap_allocated()); } #[test] fn short_char_ref_iter() { let chars = ['a', 'b', 'c']; let repr: Repr = chars.iter().collect(); assert_eq!(repr.as_str(), "abc"); assert!(!repr.is_heap_allocated()); } #[test] #[cfg_attr(target_pointer_width = "32", ignore)] fn packed_char_iter() { let chars = [ '\u{92f01}', '\u{81515}', '\u{81515}', '\u{81515}', '\u{81515}', '\u{41515}', ]; let repr: Repr = chars.iter().collect(); let s: String = chars.iter().collect(); assert_eq!(repr.as_str(), s.as_str()); assert!(!repr.is_heap_allocated()); } #[test] fn long_char_iter() { let long = "This is supposed to be a really long string"; let repr: Repr = long.chars().collect(); assert_eq!(repr.as_str(), "This is supposed to be a really long string"); assert!(repr.is_heap_allocated()); } #[test] fn short_string_iter() { let strings = vec!["hello", "world"]; let repr: Repr = strings.into_iter().collect(); assert_eq!(repr.as_str(), "helloworld"); assert!(!repr.is_heap_allocated()); } #[test] fn long_short_string_iter() { let strings = vec![ "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", ]; let repr: Repr = strings.into_iter().collect(); assert_eq!(repr.as_str(), "1234567891011121314151617181920"); assert!(repr.is_heap_allocated()); } } compact_str-0.7.1/src/repr/mod.rs000064400000000000000000001053341046102023000150410ustar 00000000000000use core::str::Utf8Error; use core::{ mem, ptr, }; use std::borrow::Cow; #[cfg(feature = "bytes")] mod bytes; #[cfg(feature = "smallvec")] mod smallvec; mod capacity; mod heap; mod inline; mod iter; mod nonmax; mod num; mod traits; use 
capacity::Capacity; use heap::HeapBuffer; use inline::InlineBuffer; use nonmax::NonMaxU8; pub use traits::IntoRepr; /// The max size of a string we can fit inline pub const MAX_SIZE: usize = std::mem::size_of::(); /// Used as a discriminant to identify different variants pub const HEAP_MASK: u8 = 0b11111110; /// When our string is stored inline, we represent the length of the string in the last byte, offset /// by `LENGTH_MASK` pub const LENGTH_MASK: u8 = 0b11000000; const EMPTY: Repr = Repr::new_inline(""); #[repr(C)] pub struct Repr( // We have a pointer in the repesentation to properly carry provenance *const (), // Then we need two `usize`s (aka WORDs) of data, for the first we just define a `usize`... usize, // ...but the second we breakup into multiple pieces... #[cfg(target_pointer_width = "64")] u32, u16, u8, // ...so that the last byte can be a NonMax, which allows the compiler to see a niche value NonMaxU8, ); unsafe impl Send for Repr {} unsafe impl Sync for Repr {} impl Repr { #[inline] pub fn new(text: &str) -> Self { let len = text.len(); if len == 0 { EMPTY } else if len <= MAX_SIZE { // SAFETY: We checked that the length of text is less than or equal to MAX_SIZE let inline = unsafe { InlineBuffer::new(text) }; Repr::from_inline(inline) } else { let heap = HeapBuffer::new(text); Repr::from_heap(heap) } } #[inline] pub const fn new_inline(text: &str) -> Self { let len = text.len(); if len <= MAX_SIZE { let inline = InlineBuffer::new_const(text); Repr::from_inline(inline) } else { panic!("Inline string was too long, max length is `std::mem::size_of::()` bytes"); } } /// Create a [`Repr`] with the provided `capacity` #[inline] pub fn with_capacity(capacity: usize) -> Self { if capacity <= MAX_SIZE { EMPTY } else { let heap = HeapBuffer::with_capacity(capacity); Repr::from_heap(heap) } } /// Create a [`Repr`] from a slice of bytes that is UTF-8 #[inline] pub fn from_utf8>(buf: B) -> Result { // Get a &str from the Vec, failing if it's not valid UTF-8 let 
s = core::str::from_utf8(buf.as_ref())?; // Construct a Repr from the &str Ok(Self::new(s)) } /// Create a [`Repr`] from a slice of bytes that is UTF-8, without validating that it is indeed /// UTF-8 /// /// # Safety /// * The caller must guarantee that `buf` is valid UTF-8 #[inline] pub unsafe fn from_utf8_unchecked>(buf: B) -> Self { let bytes = buf.as_ref(); let bytes_len = bytes.len(); // Create a Repr with enough capacity for the entire buffer let mut repr = Repr::with_capacity(bytes_len); // There's an edge case where the final byte of this buffer == `HEAP_MASK`, which is // invalid UTF-8, but would result in us creating an inline variant, that identifies as // a heap variant. If a user ever tried to reference the data at all, we'd incorrectly // try and read data from an invalid memory address, causing undefined behavior. if bytes_len == MAX_SIZE { let last_byte = bytes[bytes_len - 1]; // If we hit the edge case, reserve additional space to make the repr becomes heap // allocated, which prevents us from writing this last byte inline if last_byte >= 0b11000000 { repr.reserve(MAX_SIZE + 1); } } // SAFETY: The caller is responsible for making sure the provided buffer is UTF-8. This // invariant is documented in the public API let slice = repr.as_mut_buf(); // write the chunk into the Repr slice[..bytes_len].copy_from_slice(bytes); // Set the length of the Repr // SAFETY: We just wrote the entire `buf` into the Repr repr.set_len(bytes_len); repr } /// Create a [`Repr`] from a [`String`], in `O(1)` time. We'll attempt to inline the string /// if `should_inline` is `true` /// /// Note: If the provided [`String`] is >16 MB and we're on a 32-bit arch, we'll copy the /// `String`. 
#[inline]
    pub fn from_string(s: String, should_inline: bool) -> Self {
        let og_cap = s.capacity();
        let cap = Capacity::new(og_cap);

        #[cold]
        fn capacity_on_heap(s: String) -> Repr {
            let heap = HeapBuffer::new(s.as_str());
            Repr::from_heap(heap)
        }

        #[cold]
        fn empty() -> Repr {
            EMPTY
        }

        if cap.is_heap() {
            // We only hit this case if the provided String is > 16MB and we're on a 32-bit arch. We
            // expect it to be unlikely, thus we hint that to the compiler
            capacity_on_heap(s)
        } else if og_cap == 0 {
            // We don't expect converting from an empty String often, so we make this code path cold
            empty()
        } else if should_inline && s.len() <= MAX_SIZE {
            // SAFETY: Checked to make sure the string would fit inline
            let inline = unsafe { InlineBuffer::new(s.as_str()) };
            Repr::from_inline(inline)
        } else {
            // Take over the String's allocation; ManuallyDrop keeps its buffer from being freed
            let mut s = mem::ManuallyDrop::new(s.into_bytes());
            let len = s.len();
            let raw_ptr = s.as_mut_ptr();

            let ptr = ptr::NonNull::new(raw_ptr).expect("string with capacity has null ptr?");
            let heap = HeapBuffer { ptr, len, cap };

            Repr::from_heap(heap)
        }
    }

    /// Converts a [`Repr`] into a [`String`], in `O(1)` time, if possible
    ///
    /// The content is copied when it is stored inline, or when the capacity of the heap buffer
    /// is itself stored on the heap.
    #[inline]
    pub fn into_string(self) -> String {
        let last_byte = self.last_byte();

        #[cold]
        fn into_string_heap(this: HeapBuffer) -> String {
            // SAFETY: We know pointer is valid for `length` bytes
            let slice = unsafe { core::slice::from_raw_parts(this.ptr.as_ptr(), this.len) };
            // SAFETY: A `Repr` contains valid UTF-8
            let s = unsafe { core::str::from_utf8_unchecked(slice) };

            String::from(s)
        }

        if last_byte == HEAP_MASK {
            // SAFETY: we just checked that the discriminant indicates we're a HeapBuffer
            let heap_buffer = unsafe { self.into_heap() };

            if heap_buffer.cap.is_heap() {
                // We don't expect capacity to be on the heap often, so we mark it as cold
                into_string_heap(heap_buffer)
            } else {
                // Wrap the BoxString in a ManuallyDrop so the underlying buffer doesn't get freed
                let this = mem::ManuallyDrop::new(heap_buffer);

                // SAFETY: We checked above to make sure capacity is valid
                let cap = unsafe { this.cap.as_usize() };

                // SAFETY:
                // * The memory in `ptr` was previously allocated by the same allocator the standard
                //   library uses, with a required alignment of exactly 1.
                // * `length` is less than or equal to capacity, due to internal invariants.
                // * `capacity` is correctly maintained internally.
                // * `BoxString` only ever contains valid UTF-8.
                unsafe { String::from_raw_parts(this.ptr.as_ptr(), this.len, cap) }
            }
        } else {
            let pointer = &self as *const _ as *const u8;
            let length = core::cmp::min((last_byte.wrapping_sub(LENGTH_MASK)) as usize, MAX_SIZE);

            // SAFETY: We know pointer is valid for `length` bytes
            let slice = unsafe { core::slice::from_raw_parts(pointer, length) };
            // SAFETY: A `Repr` contains valid UTF-8
            let s = unsafe { core::str::from_utf8_unchecked(slice) };

            String::from(s)
        }
    }

    /// Reserves at least `additional` bytes. If there is already enough capacity to store
    /// `additional` bytes this is a no-op
    ///
    /// # Panics
    /// Panics if `len + additional` overflows `usize`.
    #[inline]
    pub fn reserve(&mut self, additional: usize) {
        let len = self.len();
        let needed_capacity = len
            .checked_add(additional)
            .expect("Attempted to reserve more than 'usize' bytes");

        // `<=`, not `<`: a buffer whose capacity is exactly what we need already has enough
        // space, so we must not rebuild or reallocate it
        if needed_capacity <= self.capacity() {
            // we already have enough space, no-op
            return;
        }

        if needed_capacity <= MAX_SIZE {
            // It's possible to have a `Repr` that is heap allocated with a capacity less than
            // MAX_SIZE, if that `Repr` was created From a String or Box
            //
            // SAFETY: Our needed_capacity is >= our length, which is <= than MAX_SIZE
            let inline = unsafe { InlineBuffer::new(self.as_str()) };
            *self = Repr::from_inline(inline);
        } else if !self.is_heap_allocated() {
            // We're not heap allocated, but need to be, create a HeapBuffer
            let heap = HeapBuffer::with_additional(self.as_str(), additional);
            *self = Repr::from_heap(heap);
        } else {
            // We're already heap allocated, but we need more capacity
            //
            // SAFETY: We checked above to see if we're heap allocated
            let heap_buffer = unsafe { self.as_mut_heap() };

            // To reduce allocations, we amortize our growth
            let amortized_capacity = heap::amortized_growth(len, additional);
            // Attempt to grow our capacity, allocating a new HeapBuffer on failure
            if heap_buffer.realloc(amortized_capacity).is_err() {
                // Create a new HeapBuffer
                let heap = HeapBuffer::with_additional(self.as_str(), additional);
                *self = Repr::from_heap(heap);
            }
        }
    }

    /// Shrinks the capacity of a heap allocated string to `max(len, min_capacity)`, moving the
    /// content inline when that fits in [`MAX_SIZE`]. Inline strings are left untouched.
    pub fn shrink_to(&mut self, min_capacity: usize) {
        let last_byte = self.last_byte();

        // Note: We can't shrink the inline variant since its buffer is a fixed size, so we only
        // take action here if our string is heap allocated
        if last_byte == HEAP_MASK {
            // SAFETY: We just checked the discriminant to make sure we're heap allocated
            let heap = unsafe { self.as_mut_heap() };

            let old_capacity = heap.capacity();
            let new_capacity = heap.len.max(min_capacity);

            if new_capacity <= MAX_SIZE {
                // String can be inlined.
                let mut inline = InlineBuffer::empty();
                // SAFETY: Our src is on the heap, so it does not overlap with our new inline
                // buffer, and the src is a `Repr` so we can assume it's valid UTF-8
                unsafe {
                    inline
                        .0
                        .as_mut_ptr()
                        .copy_from_nonoverlapping(heap.ptr.as_ptr(), heap.len)
                };
                // SAFETY: The src we wrote from was a `Repr` which we can assume is valid UTF-8
                unsafe { inline.set_len(heap.len) }
                *self = Repr::from_inline(inline);
            } else if new_capacity < old_capacity {
                // String can be shrunk.
                // We can ignore the result. The string keeps its old capacity, but that's okay.
let _ = heap.realloc(new_capacity); } } } #[inline] pub fn push_str(&mut self, s: &str) { let len = self.len(); let str_len = s.len(); // Reserve at least enough space to fit `s` self.reserve(str_len); // SAFTEY: `s` which we're appending to the buffer, is valid UTF-8 let slice = unsafe { self.as_mut_buf() }; let push_buffer = &mut slice[len..len + str_len]; debug_assert_eq!(push_buffer.len(), s.as_bytes().len()); // Copy the string into our buffer push_buffer.copy_from_slice(s.as_bytes()); // Increment the length of our string // // SAFETY: We appened `s` which is valid UTF-8, and if our size became greater than // MAX_SIZE, our call to reserve would make us heap allocated unsafe { self.set_len(len + str_len) }; } #[inline] pub fn pop(&mut self) -> Option { let ch = self.as_str().chars().rev().next()?; // SAFETY: We know this is is a valid length which falls on a char boundary unsafe { self.set_len(self.len() - ch.len_utf8()) }; Some(ch) } /// Returns the string content, and only the string content, as a slice of bytes. #[inline] pub fn as_slice(&self) -> &[u8] { // the last byte stores our discriminant and stack length let last_byte = self.last_byte(); // initially has the value of the stack pointer, conditionally becomes the heap pointer let mut pointer = self as *const Self as *const u8; let heap_pointer = self.0 as *const u8; // initially has the value of the stack length, conditionally becomes the heap length let mut length = core::cmp::min((last_byte.wrapping_sub(LENGTH_MASK)) as usize, MAX_SIZE); let heap_length = self.1; // our discriminant is stored in the last byte and denotes stack vs heap // // Note: We should never add an `else` statement here, keeping the conditional simple allows // the compiler to optimize this to a conditional-move instead of a branch if last_byte == HEAP_MASK { pointer = heap_pointer; length = heap_length; } // SAFETY: We know the data is valid, aligned, and part of the same contiguous allocated // chunk. 
It's also valid for the lifetime of self unsafe { core::slice::from_raw_parts(pointer, length) } } #[inline] pub fn as_str(&self) -> &str { // SAFETY: A `Repr` contains valid UTF-8 unsafe { core::str::from_utf8_unchecked(self.as_slice()) } } /// Returns the length of the string that we're storing #[allow(clippy::len_without_is_empty)] // is_empty exists on CompactString #[inline] pub fn len(&self) -> usize { // the last byte stores our discriminant and stack length let last_byte = self.last_byte(); // initially has the value of the stack length, conditionally becomes the heap length let mut length = core::cmp::min((last_byte.wrapping_sub(LENGTH_MASK)) as usize, MAX_SIZE); let heap_length = self.1; let length_ref = &mut length; // our discriminant is stored in the last byte and denotes stack vs heap // // Note: We should never add an `else` statement here, keeping the conditional simple allows // the compiler to optimize this to a conditional-move instead of a branch if last_byte == HEAP_MASK { *length_ref = heap_length; } *length_ref } /// Returns the overall capacity of the underlying buffer #[inline] pub fn capacity(&self) -> usize { // the last byte stores our discriminant and stack length let last_byte = self.last_byte(); #[cold] fn heap_capacity(this: &Repr) -> usize { // SAFETY: We just checked the discriminant to make sure we're heap allocated let heap_buffer = unsafe { this.as_heap() }; heap_buffer.capacity() } if last_byte == HEAP_MASK { heap_capacity(self) } else { MAX_SIZE } } #[inline(always)] pub fn is_heap_allocated(&self) -> bool { let last_byte = self.last_byte(); last_byte == HEAP_MASK } /// Return a mutable reference to the entirely underlying buffer /// /// # Safety /// * Callers must guarantee that any modifications made to the buffer are valid UTF-8 pub unsafe fn as_mut_buf(&mut self) -> &mut [u8] { // the last byte stores our discriminant and stack length let last_byte = self.last_byte(); let (ptr, cap) = if last_byte == HEAP_MASK { // SAFETY: 
We just checked the discriminant to make sure we're heap allocated let heap_buffer = self.as_heap(); let ptr = heap_buffer.ptr.as_ptr(); let cap = heap_buffer.capacity(); (ptr, cap) } else { let ptr = self as *mut Self as *mut u8; (ptr, MAX_SIZE) }; // SAFETY: Our data is valid for `cap` bytes, and is initialized core::slice::from_raw_parts_mut(ptr, cap) } /// Sets the length of the string that our underlying buffer contains /// /// # Safety /// * `len` bytes in the buffer must be valid UTF-8 /// * If the underlying buffer is stored inline, `len` must be <= MAX_SIZE pub unsafe fn set_len(&mut self, len: usize) { let last_byte = self.last_byte(); if last_byte == HEAP_MASK { // SAFETY: We just checked the discriminant to make sure we're heap allocated let heap_buffer = self.as_mut_heap(); // SAFETY: The caller guarantees that `len` bytes is valid UTF-8 heap_buffer.set_len(len); } else { // SAFETY: We just checked the discriminant to make sure we're an InlineBuffer let inline_buffer = self.as_mut_inline(); // SAFETY: The caller guarantees that len <= MAX_SIZE, and `len` bytes is valid UTF-8 inline_buffer.set_len(len); } } /// Returns the last byte that's on the stack. /// /// The last byte stores the discriminant that indicates whether the string is on the stack or /// on the heap. When the string is on the stack the last byte also stores the length #[inline(always)] const fn last_byte(&self) -> u8 { cfg_if::cfg_if! { if #[cfg(target_pointer_width = "64")] { let last_byte = self.5; } else if #[cfg(target_pointer_width = "32")] { let last_byte = self.4; } else { compile_error!("Unsupported target_pointer_width"); } }; last_byte as u8 } /// Reinterprets an [`InlineBuffer`] into a [`Repr`] /// /// Note: This is safe because [`InlineBuffer`] and [`Repr`] are the same size. 
We used to /// define [`Repr`] as a `union` which implicitly transmuted between the two types, but that /// prevented us from defining a "niche" value to make `Option` the same size as /// just `CompactString` #[inline(always)] const fn from_inline(inline: InlineBuffer) -> Self { // SAFETY: An `InlineBuffer` and `Repr` have the same size unsafe { core::mem::transmute(inline) } } /// Reinterprets a [`HeapBuffer`] into a [`Repr`] /// /// Note: This is safe because [`HeapBuffer`] and [`Repr`] are the same size. We used to define /// [`Repr`] as a `union` which implicitly transmuted between the two types, but that prevented /// us from defining a "niche" value to make `Option` the same size as just /// `CompactString` #[inline(always)] const fn from_heap(heap: HeapBuffer) -> Self { // SAFETY: A `HeapBuffer` and `Repr` have the same size unsafe { core::mem::transmute(heap) } } /// Reinterprets a [`Repr`] as a [`HeapBuffer`] /// /// # SAFETY /// * The caller must guarantee that the provided [`Repr`] is actually a [`HeapBuffer`] by /// checking the discriminant /// /// Note: We used to define [`Repr`] as a `union` which implicitly transmuted between the two /// types, but that prevented us from defining a "niche" value to make `Option` /// the same size as just `CompactString` #[inline(always)] const unsafe fn into_heap(self) -> HeapBuffer { core::mem::transmute(self) } /// Reinterprets a `&mut Repr` as a `&mut HeapBuffer` /// /// # SAFETY /// * The caller must guarantee that the provided [`Repr`] is actually a [`HeapBuffer`] by /// checking the discriminant /// /// Note: We used to define [`Repr`] as a `union` which implicitly transmuted between the two /// types, but that prevented us from defining a "niche" value to make `Option` /// the same size as just `CompactString` #[inline(always)] unsafe fn as_mut_heap(&mut self) -> &mut HeapBuffer { // SAFETY: A `HeapBuffer` and `Repr` have the same size &mut *(self as *mut _ as *mut HeapBuffer) } /// Reinterprets a `&Repr` as 
a `&HeapBuffer` /// /// # SAFETY /// * The caller must guarantee that the provided [`Repr`] is actually a [`HeapBuffer`] by /// checking the discriminant /// /// Note: We used to define [`Repr`] as a `union` which implicitly transmuted between the two /// types, but that prevented us from defining a "niche" value to make `Option` /// the same size as just `CompactString` #[inline(always)] unsafe fn as_heap(&self) -> &HeapBuffer { // SAFETY: A `HeapBuffer` and `Repr` have the same size &*(self as *const _ as *const HeapBuffer) } /// Reinterprets a [`Repr`] as an [`InlineBuffer`] /// /// # SAFETY /// * The caller must guarantee that the provided [`Repr`] is actually an [`InlineBuffer`] by /// checking the discriminant /// /// Note: We used to define [`Repr`] as a `union` which implicitly transmuted between the two /// types, but that prevented us from defining a "niche" value to make `Option` /// the same size as just `CompactString` #[inline(always)] #[cfg(feature = "smallvec")] const unsafe fn into_inline(self) -> InlineBuffer { core::mem::transmute(self) } /// Reinterprets a `&mut Repr` as an `&mut InlineBuffer` /// /// # SAFETY /// * The caller must guarantee that the provided [`Repr`] is actually an [`InlineBuffer`] by /// checking the discriminant /// /// Note: We used to define [`Repr`] as a `union` which implicitly transmuted between the two /// types, but that prevented us from defining a "niche" value to make `Option` /// the same size as just `CompactString` #[inline(always)] unsafe fn as_mut_inline(&mut self) -> &mut InlineBuffer { // SAFETY: An `InlineBuffer` and `Repr` have the same size &mut *(self as *mut _ as *mut InlineBuffer) } /// Reinterprets a `&Repr` as an `&InlineBuffer` /// /// # SAFETY /// * The caller must guarantee that the provided [`Repr`] is actually an [`InlineBuffer`] by /// checking the discriminant /// /// Note: We used to define [`Repr`] as a `union` which implicitly transmuted between the two /// types, but that prevented us from 
defining a "niche" value to make `Option` /// the same size as just `CompactString` #[inline(always)] unsafe fn as_inline(&self) -> &InlineBuffer { // SAFETY: An `InlineBuffer` and `Repr` have the same size &*(self as *const _ as *const InlineBuffer) } } impl Clone for Repr { #[inline] fn clone(&self) -> Self { let last_byte = self.last_byte(); #[cold] fn clone_heap(this: &Repr) -> Repr { // SAFETY: We just checked the discriminant to make sure we're heap allocated let heap = unsafe { this.as_heap() }; // If the contained string is small enough, we will inline it instead of allocating if heap.len <= MAX_SIZE { // SAFETY: Checked to make sure the length is <= MAX_SIZE let inline = unsafe { InlineBuffer::new(this.as_str()) }; Repr::from_inline(inline) } else { let new = heap.clone(); Repr::from_heap(new) } } if last_byte == HEAP_MASK { clone_heap(self) } else { // SAFETY: We checked above that the discriminant indicates we're inline let inline = unsafe { self.as_inline() }; Repr::from_inline(inline.copy()) } } } impl Drop for Repr { #[inline] fn drop(&mut self) { // By "outlining" the actual Drop code and only calling it if we're a heap variant, it // allows dropping an inline variant to be as cheap as possible. if self.is_heap_allocated() { outlined_drop(self) } #[cold] fn outlined_drop(this: &mut Repr) { // SAFETY: We just checked the discriminant to make sure we're heap allocated let heap_buffer = unsafe { this.as_mut_heap() }; heap_buffer.dealloc(); } } } impl Extend for Repr { #[inline] fn extend>(&mut self, iter: T) { let mut iterator = iter.into_iter().peekable(); // if the iterator is empty, no work needs to be done! 
if iterator.peek().is_none() { return; } let (lower_bound, _) = iterator.size_hint(); self.reserve(lower_bound); iterator.for_each(|c| self.push_str(c.encode_utf8(&mut [0; 4]))); } } impl<'a> Extend<&'a char> for Repr { fn extend>(&mut self, iter: T) { self.extend(iter.into_iter().copied()); } } impl<'a> Extend<&'a str> for Repr { fn extend>(&mut self, iter: T) { iter.into_iter().for_each(|s| self.push_str(s)); } } impl Extend> for Repr { fn extend>>(&mut self, iter: T) { iter.into_iter().for_each(move |s| self.push_str(&s)); } } impl<'a> Extend> for Repr { fn extend>>(&mut self, iter: T) { iter.into_iter().for_each(move |s| self.push_str(&s)); } } impl Extend for Repr { fn extend>(&mut self, iter: T) { iter.into_iter().for_each(move |s| self.push_str(&s)); } } #[cfg(test)] mod tests { use quickcheck_macros::quickcheck; use test_case::test_case; use super::{ Repr, MAX_SIZE, }; const EIGHTEEN_MB: usize = 18 * 1024 * 1024; const EIGHTEEN_MB_STR: &'static str = unsafe { core::str::from_utf8_unchecked(&[42; EIGHTEEN_MB]) }; #[test_case("hello world!"; "inline")] #[test_case("this is a long string that should be stored on the heap"; "heap")] fn test_create(s: &'static str) { let repr = Repr::new(s); assert_eq!(repr.as_str(), s); assert_eq!(repr.len(), s.len()); } #[quickcheck] #[cfg_attr(miri, ignore)] fn quickcheck_create(s: String) { let repr = Repr::new(&s); assert_eq!(repr.as_str(), s); assert_eq!(repr.len(), s.len()); } #[test_case(0; "empty")] #[test_case(10; "short")] #[test_case(64; "long")] #[test_case(EIGHTEEN_MB; "huge")] fn test_with_capacity(cap: usize) { let r = Repr::with_capacity(cap); assert!(r.capacity() >= MAX_SIZE); assert_eq!(r.len(), 0); } #[test_case(""; "empty")] #[test_case("abc"; "short")] #[test_case("hello world! 
I am a longer string ๐Ÿฆ€"; "long")] fn test_from_utf8_valid(s: &'static str) { let bytes = s.as_bytes(); let r = Repr::from_utf8(bytes).expect("valid UTF-8"); assert_eq!(r.as_str(), s); assert_eq!(r.len(), s.len()); } #[quickcheck] #[cfg_attr(miri, ignore)] fn quickcheck_from_utf8(buf: Vec) { match (core::str::from_utf8(&buf), Repr::from_utf8(&buf)) { (Ok(s), Ok(r)) => { assert_eq!(r.as_str(), s); assert_eq!(r.len(), s.len()); } (Err(e), Err(r)) => assert_eq!(e, r), _ => panic!("core::str and Repr differ on what is valid UTF-8!"), } } #[test_case(String::new(), true; "empty should inline")] #[test_case(String::new(), false; "empty not inline")] #[test_case(String::with_capacity(10), true ; "empty with small capacity inline")] #[test_case(String::with_capacity(10), false ; "empty with small capacity not inline")] #[test_case(String::with_capacity(128), true ; "empty with large capacity inline")] #[test_case(String::with_capacity(128), false ; "empty with large capacity not inline")] #[test_case(String::from("nyc ๐Ÿ—ฝ"), true; "short should inline")] #[test_case(String::from("nyc ๐Ÿ—ฝ"), false ; "short not inline")] #[test_case(String::from("this is a really long string, which is intended"), true; "long")] #[test_case(String::from("this is a really long string, which is intended"), false; "long not inline")] #[test_case(EIGHTEEN_MB_STR.to_string(), true ; "huge should inline")] #[test_case(EIGHTEEN_MB_STR.to_string(), false ; "huge not inline")] fn test_from_string(s: String, try_to_inline: bool) { // note: when cloning a String it truncates capacity, which is why we measure these values // before cloning the string let s_len = s.len(); let s_cap = s.capacity(); let s_str = s.clone(); let r = Repr::from_string(s, try_to_inline); assert_eq!(r.len(), s_len); assert_eq!(r.as_str(), s_str.as_str()); if s_cap == 0 { // we should always inline the string, if the length of the source string is 0 assert!(!r.is_heap_allocated()); } else if try_to_inline && s_len <= MAX_SIZE 
{ // we should inline the string, if we were asked to, and the length of the string would // fit inline, meaning we would truncate capacity assert!(!r.is_heap_allocated()); } else { assert!(r.is_heap_allocated()); } } #[quickcheck] #[cfg_attr(miri, ignore)] fn quickcheck_from_string(s: String, try_to_inline: bool) { let r = Repr::from_string(s.clone(), try_to_inline); assert_eq!(r.len(), s.len()); assert_eq!(r.as_str(), s.as_str()); if s.capacity() == 0 { // we should always inline the string, if the length of the source string is 0 assert!(!r.is_heap_allocated()); } else if s.capacity() <= MAX_SIZE { // we should inline the string, if we were asked to assert_eq!(!r.is_heap_allocated(), try_to_inline); } else { assert!(r.is_heap_allocated()); } } #[test_case(""; "empty")] #[test_case("nyc ๐Ÿ—ฝ"; "short")] #[test_case("this is a really long string, which is intended"; "long")] fn test_into_string(control: &'static str) { let r = Repr::new(control); let s = r.into_string(); assert_eq!(control.len(), s.len()); assert_eq!(control, s.as_str()); } #[quickcheck] #[cfg_attr(miri, ignore)] fn quickcheck_into_string(control: String) { let r = Repr::new(&control); let s = r.into_string(); assert_eq!(control.len(), s.len()); assert_eq!(control, s.as_str()); } #[test_case("", "a", false; "empty")] #[test_case("", "๐Ÿ—ฝ", false; "empty_emoji")] #[test_case("abc", "๐Ÿ—ฝ๐Ÿ™‚๐Ÿฆ€๐ŸŒˆ๐Ÿ‘๐Ÿถ", true; "inline_to_heap")] #[test_case("i am a long string that will be on the heap", "extra", true; "heap_to_heap")] fn test_push_str(control: &'static str, append: &'static str, is_heap: bool) { let mut r = Repr::new(control); let mut c = String::from(control); r.push_str(append); c.push_str(append); assert_eq!(r.as_str(), c.as_str()); assert_eq!(r.len(), c.len()); assert_eq!(r.is_heap_allocated(), is_heap); } #[quickcheck] #[cfg_attr(miri, ignore)] fn quickcheck_push_str(control: String, append: String) { let mut r = Repr::new(&control); let mut c = control; r.push_str(&append); 
c.push_str(&append); assert_eq!(r.as_str(), c.as_str()); assert_eq!(r.len(), c.len()); } #[test_case(&[42; 0], &[42; EIGHTEEN_MB]; "empty_to_heap_capacity")] #[test_case(&[42; 8], &[42; EIGHTEEN_MB]; "inline_to_heap_capacity")] #[test_case(&[42; 128], &[42; EIGHTEEN_MB]; "heap_inline_to_heap_capacity")] #[test_case(&[42; EIGHTEEN_MB], &[42; 64]; "heap_capacity_to_heap_capacity")] fn test_push_str_from_buf(buf: &[u8], append: &[u8]) { // The goal of this test is to exercise the scenario when our capacity is stored on the heap let control = unsafe { core::str::from_utf8_unchecked(buf) }; let append = unsafe { core::str::from_utf8_unchecked(append) }; let mut r = Repr::new(control); let mut c = String::from(control); r.push_str(append); c.push_str(append); assert_eq!(r.as_str(), c.as_str()); assert_eq!(r.len(), c.len()); assert!(r.is_heap_allocated()); } #[test_case("", 0, false; "empty_zero")] #[test_case("", 10, false; "empty_small")] #[test_case("", 64, true; "empty_large")] #[test_case("abc", 0, false; "short_zero")] #[test_case("abc", 8, false; "short_small")] #[test_case("abc", 64, true; "short_large")] #[test_case("I am a long string that will be on the heap", 0, true; "large_zero")] #[test_case("I am a long string that will be on the heap", 10, true; "large_small")] #[test_case("I am a long string that will be on the heap", EIGHTEEN_MB, true; "large_huge")] fn test_reserve(initial: &'static str, additional: usize, is_heap: bool) { let mut r = Repr::new(initial); r.reserve(additional); assert!(r.capacity() >= initial.len() + additional); assert_eq!(r.is_heap_allocated(), is_heap); } #[test] #[should_panic(expected = "Attempted to reserve more than 'usize' bytes")] fn test_reserve_overflow() { let mut r = Repr::new("abc"); r.reserve(usize::MAX); } #[test_case(""; "empty")] #[test_case("abc"; "short")] #[test_case("i am a longer string that will be on the heap"; "long")] #[test_case(EIGHTEEN_MB_STR; "huge")] fn test_clone(initial: &'static str) { let r_a = 
Repr::new(initial); let r_b = r_a.clone(); assert_eq!(r_a.as_str(), initial); assert_eq!(r_a.len(), initial.len()); assert_eq!(r_a.as_str(), r_b.as_str()); assert_eq!(r_a.len(), r_b.len()); assert_eq!(r_a.capacity(), r_b.capacity()); assert_eq!(r_a.is_heap_allocated(), r_b.is_heap_allocated()); } #[quickcheck] #[cfg_attr(miri, ignore)] fn quickcheck_clone(initial: String) { let r_a = Repr::new(&initial); let r_b = r_a.clone(); assert_eq!(r_a.as_str(), initial); assert_eq!(r_a.len(), initial.len()); assert_eq!(r_a.as_str(), r_b.as_str()); assert_eq!(r_a.len(), r_b.len()); assert_eq!(r_a.capacity(), r_b.capacity()); assert_eq!(r_a.is_heap_allocated(), r_b.is_heap_allocated()); } } compact_str-0.7.1/src/repr/nonmax.rs000064400000000000000000000105211046102023000155530ustar 00000000000000/// [`NonMaxU8`] is an unsigned 8-bit integer data type that has a valid range of `[0, 254]`. /// Excluding `255` allows the Rust compiler to use `255` as a niche. /// /// Specifically the compiler can use `255` to encode the `None` variant of `Option` /// allowing `std::mem::size_of:: == std::mem::size_of::>()` #[allow(clippy::upper_case_acronyms)] #[allow(dead_code)] #[allow(non_camel_case_types)] #[derive(Copy, Clone, Debug)] #[repr(u8)] pub enum NonMaxU8 { V0 = 0, V1 = 1, V2 = 2, V3 = 3, V4 = 4, V5 = 5, V6 = 6, V7 = 7, V8 = 8, V9 = 9, V10 = 10, V11 = 11, V12 = 12, V13 = 13, V14 = 14, V15 = 15, V16 = 16, V17 = 17, V18 = 18, V19 = 19, V20 = 20, V21 = 21, V22 = 22, V23 = 23, V24 = 24, V25 = 25, V26 = 26, V27 = 27, V28 = 28, V29 = 29, V30 = 30, V31 = 31, V32 = 32, V33 = 33, V34 = 34, V35 = 35, V36 = 36, V37 = 37, V38 = 38, V39 = 39, V40 = 40, V41 = 41, V42 = 42, V43 = 43, V44 = 44, V45 = 45, V46 = 46, V47 = 47, V48 = 48, V49 = 49, V50 = 50, V51 = 51, V52 = 52, V53 = 53, V54 = 54, V55 = 55, V56 = 56, V57 = 57, V58 = 58, V59 = 59, V60 = 60, V61 = 61, V62 = 62, V63 = 63, V64 = 64, V65 = 65, V66 = 66, V67 = 67, V68 = 68, V69 = 69, V70 = 70, V71 = 71, V72 = 72, V73 = 73, V74 = 74, V75 = 75, 
V76 = 76, V77 = 77, V78 = 78, V79 = 79, V80 = 80, V81 = 81, V82 = 82, V83 = 83, V84 = 84, V85 = 85, V86 = 86, V87 = 87, V88 = 88, V89 = 89, V90 = 90, V91 = 91, V92 = 92, V93 = 93, V94 = 94, V95 = 95, V96 = 96, V97 = 97, V98 = 98, V99 = 99, V100 = 100, V101 = 101, V102 = 102, V103 = 103, V104 = 104, V105 = 105, V106 = 106, V107 = 107, V108 = 108, V109 = 109, V110 = 110, V111 = 111, V112 = 112, V113 = 113, V114 = 114, V115 = 115, V116 = 116, V117 = 117, V118 = 118, V119 = 119, V120 = 120, V121 = 121, V122 = 122, V123 = 123, V124 = 124, V125 = 125, V126 = 126, V127 = 127, V128 = 128, V129 = 129, V130 = 130, V131 = 131, V132 = 132, V133 = 133, V134 = 134, V135 = 135, V136 = 136, V137 = 137, V138 = 138, V139 = 139, V140 = 140, V141 = 141, V142 = 142, V143 = 143, V144 = 144, V145 = 145, V146 = 146, V147 = 147, V148 = 148, V149 = 149, V150 = 150, V151 = 151, V152 = 152, V153 = 153, V154 = 154, V155 = 155, V156 = 156, V157 = 157, V158 = 158, V159 = 159, V160 = 160, V161 = 161, V162 = 162, V163 = 163, V164 = 164, V165 = 165, V166 = 166, V167 = 167, V168 = 168, V169 = 169, V170 = 170, V171 = 171, V172 = 172, V173 = 173, V174 = 174, V175 = 175, V176 = 176, V177 = 177, V178 = 178, V179 = 179, V180 = 180, V181 = 181, V182 = 182, V183 = 183, V184 = 184, V185 = 185, V186 = 186, V187 = 187, V188 = 188, V189 = 189, V190 = 190, V191 = 191, V192 = 192, V193 = 193, V194 = 194, V195 = 195, V196 = 196, V197 = 197, V198 = 198, V199 = 199, V200 = 200, V201 = 201, V202 = 202, V203 = 203, V204 = 204, V205 = 205, V206 = 206, V207 = 207, V208 = 208, V209 = 209, V210 = 210, V211 = 211, V212 = 212, V213 = 213, V214 = 214, V215 = 215, V216 = 216, V217 = 217, V218 = 218, V219 = 219, V220 = 220, V221 = 221, V222 = 222, V223 = 223, V224 = 224, V225 = 225, V226 = 226, V227 = 227, V228 = 228, V229 = 229, V230 = 230, V231 = 231, V232 = 232, V233 = 233, V234 = 234, V235 = 235, V236 = 236, V237 = 237, V238 = 238, V239 = 239, V240 = 240, V241 = 241, V242 = 242, V243 = 243, V244 = 244, V245 = 245, V246 = 
246, V247 = 247, V248 = 248, V249 = 249, V250 = 250, V251 = 251, V252 = 252, V253 = 253, V254 = 254, } static_assertions::assert_eq_size!(NonMaxU8, Option, u8); compact_str-0.7.1/src/repr/num.rs000064400000000000000000000375551046102023000150720ustar 00000000000000//! Implementations for efficiently converting a number into a [`Repr`] //! //! Adapted from the implemenation in the `std` library at //! use core::{ mem, num, ptr, }; use super::traits::IntoRepr; use super::Repr; const DEC_DIGITS_LUT: &[u8] = b"\ 0001020304050607080910111213141516171819\ 2021222324252627282930313233343536373839\ 4041424344454647484950515253545556575859\ 6061626364656667686970717273747576777879\ 8081828384858687888990919293949596979899"; /// Defines the implementation of [`IntoRepr`] for integer types macro_rules! impl_IntoRepr { ($t:ident, $conv_ty:ident) => { impl IntoRepr for $t { #[inline] fn into_repr(self) -> Repr { // Determine the number of digits in this value // // Note: this considers the `-` symbol let num_digits = NumChars::num_chars(self); let mut repr = Repr::with_capacity(num_digits); #[allow(unused_comparisons)] let is_nonnegative = self >= 0; let mut n = if is_nonnegative { self as $conv_ty } else { // convert the negative num to positive by summing 1 to it's 2 complement (!(self as $conv_ty)).wrapping_add(1) }; let mut curr = num_digits as isize; // our string will end up being num_digits long unsafe { repr.set_len(num_digits) }; // get mutable pointer to our buffer let buf_ptr = unsafe { repr.as_mut_buf().as_mut_ptr() }; let lut_ptr = DEC_DIGITS_LUT.as_ptr(); unsafe { // need at least 16 bits for the 4-characters-at-a-time to work. 
if mem::size_of::<$t>() >= 2 { // eagerly decode 4 characters at a time while n >= 10000 { let rem = (n % 10000) as isize; n /= 10000; let d1 = (rem / 100) << 1; let d2 = (rem % 100) << 1; curr -= 4; ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2); ptr::copy_nonoverlapping( lut_ptr.offset(d2), buf_ptr.offset(curr + 2), 2, ); } } // if we reach here numbers are <= 9999, so at most 4 chars long let mut n = n as isize; // possibly reduce 64bit math // decode 2 more chars, if > 2 chars if n >= 100 { let d1 = (n % 100) << 1; n /= 100; curr -= 2; ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2); } // decode last 1 or 2 chars if n < 10 { curr -= 1; *buf_ptr.offset(curr) = (n as u8) + b'0'; } else { let d1 = n << 1; curr -= 2; ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2); } if !is_nonnegative { curr -= 1; *buf_ptr.offset(curr) = b'-'; } } // we should have moved all the way down our buffer debug_assert_eq!(curr, 0); repr } } }; } impl_IntoRepr!(u8, u32); impl_IntoRepr!(i8, u32); impl_IntoRepr!(u16, u32); impl_IntoRepr!(i16, u32); impl_IntoRepr!(u32, u32); impl_IntoRepr!(i32, u32); impl_IntoRepr!(u64, u64); impl_IntoRepr!(i64, u64); #[cfg(target_pointer_width = "32")] impl_IntoRepr!(usize, u32); #[cfg(target_pointer_width = "32")] impl_IntoRepr!(isize, u32); #[cfg(target_pointer_width = "64")] impl_IntoRepr!(usize, u64); #[cfg(target_pointer_width = "64")] impl_IntoRepr!(isize, u64); /// For 128-bit integer types we use the [`itoa`] crate because writing into a buffer, and then /// copying the amount of characters we've written, is faster than determining the number of /// characters and then writing. 
impl IntoRepr for u128 { #[inline] fn into_repr(self) -> Repr { let mut buffer = itoa::Buffer::new(); Repr::new(buffer.format(self)) } } impl IntoRepr for i128 { #[inline] fn into_repr(self) -> Repr { let mut buffer = itoa::Buffer::new(); Repr::new(buffer.format(self)) } } /// Defines the implementation of [`IntoRepr`] for NonZero integer types macro_rules! impl_NonZero_IntoRepr { ($t:path) => { impl IntoRepr for $t { #[inline] fn into_repr(self) -> Repr { self.get().into_repr() } } }; } impl_NonZero_IntoRepr!(num::NonZeroU8); impl_NonZero_IntoRepr!(num::NonZeroI8); impl_NonZero_IntoRepr!(num::NonZeroU16); impl_NonZero_IntoRepr!(num::NonZeroI16); impl_NonZero_IntoRepr!(num::NonZeroU32); impl_NonZero_IntoRepr!(num::NonZeroI32); impl_NonZero_IntoRepr!(num::NonZeroU64); impl_NonZero_IntoRepr!(num::NonZeroI64); impl_NonZero_IntoRepr!(num::NonZeroUsize); impl_NonZero_IntoRepr!(num::NonZeroIsize); impl_NonZero_IntoRepr!(num::NonZeroU128); impl_NonZero_IntoRepr!(num::NonZeroI128); /// All of these `num_chars(...)` methods are kind of crazy, but they are necessary. /// /// An alternate way to calculate the number of digits in a value is to do: /// ``` /// let val = 42; /// let num_digits = ((val as f32).log10().floor()) as usize + 1; /// assert_eq!(num_digits, 2); /// ``` /// But there are two problems with this approach: /// 1. floating point math is slow /// 2. results are dependent on floating point precision, which is too inaccurate for larger values /// /// For example, consider this relatively large value... /// /// ``` /// let val = 9999995; /// let num_digits = ((val as f32).log10().floor()) as usize + 1; /// /// // this is wrong! There are only 7 digits in this number! /// assert_eq!(num_digits, 8); /// ``` /// /// you can use `f64` to get better precision, e.g. 
/// /// ``` /// let val = 9999995; /// let num_digits = ((val as f64).log10().floor()) as usize + 1; /// /// // the precision is enough to get the correct value /// assert_eq!(num_digits, 7); /// ``` /// /// ...but still not precise enough! /// /// ``` /// let val: u64 = 9999999999999999999; /// let num_digits = ((val as f64).log10().floor()) as usize + 1; /// /// // this is wrong! the number is only 19 digits but the formula returns 20 /// assert_eq!(num_digits, 20); /// ``` trait NumChars { fn num_chars(val: Self) -> usize; } impl NumChars for u8 { #[inline(always)] fn num_chars(val: u8) -> usize { match val { u8::MIN..=9 => 1, 10..=99 => 2, 100..=u8::MAX => 3, } } } impl NumChars for i8 { #[inline(always)] fn num_chars(val: i8) -> usize { match val { i8::MIN..=-100 => 4, -99..=-10 => 3, -9..=-1 => 2, 0..=9 => 1, 10..=99 => 2, 100..=i8::MAX => 3, } } } impl NumChars for u16 { #[inline(always)] fn num_chars(val: u16) -> usize { match val { u16::MIN..=9 => 1, 10..=99 => 2, 100..=999 => 3, 1000..=9999 => 4, 10000..=u16::MAX => 5, } } } impl NumChars for i16 { #[inline(always)] fn num_chars(val: i16) -> usize { match val { i16::MIN..=-10000 => 6, -9999..=-1000 => 5, -999..=-100 => 4, -99..=-10 => 3, -9..=-1 => 2, 0..=9 => 1, 10..=99 => 2, 100..=999 => 3, 1000..=9999 => 4, 10000..=i16::MAX => 5, } } } impl NumChars for u32 { #[inline(always)] fn num_chars(val: u32) -> usize { match val { u32::MIN..=9 => 1, 10..=99 => 2, 100..=999 => 3, 1000..=9999 => 4, 10000..=99999 => 5, 100000..=999999 => 6, 1000000..=9999999 => 7, 10000000..=99999999 => 8, 100000000..=999999999 => 9, 1000000000..=u32::MAX => 10, } } } impl NumChars for i32 { #[inline(always)] fn num_chars(val: i32) -> usize { match val { i32::MIN..=-1000000000 => 11, -999999999..=-100000000 => 10, -99999999..=-10000000 => 9, -9999999..=-1000000 => 8, -999999..=-100000 => 7, -99999..=-10000 => 6, -9999..=-1000 => 5, -999..=-100 => 4, -99..=-10 => 3, -9..=-1 => 2, 0..=9 => 1, 10..=99 => 2, 100..=999 => 3, 
1000..=9999 => 4, 10000..=99999 => 5, 100000..=999999 => 6, 1000000..=9999999 => 7, 10000000..=99999999 => 8, 100000000..=999999999 => 9, 1000000000..=i32::MAX => 10, } } } impl NumChars for u64 { #[inline(always)] fn num_chars(val: u64) -> usize { match val { u64::MIN..=9 => 1, 10..=99 => 2, 100..=999 => 3, 1000..=9999 => 4, 10000..=99999 => 5, 100000..=999999 => 6, 1000000..=9999999 => 7, 10000000..=99999999 => 8, 100000000..=999999999 => 9, 1000000000..=9999999999 => 10, 10000000000..=99999999999 => 11, 100000000000..=999999999999 => 12, 1000000000000..=9999999999999 => 13, 10000000000000..=99999999999999 => 14, 100000000000000..=999999999999999 => 15, 1000000000000000..=9999999999999999 => 16, 10000000000000000..=99999999999999999 => 17, 100000000000000000..=999999999999999999 => 18, 1000000000000000000..=9999999999999999999 => 19, 10000000000000000000..=u64::MAX => 20, } } } impl NumChars for i64 { #[inline(always)] fn num_chars(val: i64) -> usize { match val { i64::MIN..=-1000000000000000000 => 20, -999999999999999999..=-100000000000000000 => 19, -99999999999999999..=-10000000000000000 => 18, -9999999999999999..=-1000000000000000 => 17, -999999999999999..=-100000000000000 => 16, -99999999999999..=-10000000000000 => 15, -9999999999999..=-1000000000000 => 14, -999999999999..=-100000000000 => 13, -99999999999..=-10000000000 => 12, -9999999999..=-1000000000 => 11, -999999999..=-100000000 => 10, -99999999..=-10000000 => 9, -9999999..=-1000000 => 8, -999999..=-100000 => 7, -99999..=-10000 => 6, -9999..=-1000 => 5, -999..=-100 => 4, -99..=-10 => 3, -9..=-1 => 2, 0..=9 => 1, 10..=99 => 2, 100..=999 => 3, 1000..=9999 => 4, 10000..=99999 => 5, 100000..=999999 => 6, 1000000..=9999999 => 7, 10000000..=99999999 => 8, 100000000..=999999999 => 9, 1000000000..=9999999999 => 10, 10000000000..=99999999999 => 11, 100000000000..=999999999999 => 12, 1000000000000..=9999999999999 => 13, 10000000000000..=99999999999999 => 14, 100000000000000..=999999999999999 => 15, 
1000000000000000..=9999999999999999 => 16, 10000000000000000..=99999999999999999 => 17, 100000000000000000..=999999999999999999 => 18, 1000000000000000000..=i64::MAX => 19, } } } impl NumChars for usize { fn num_chars(val: usize) -> usize { #[cfg(target_pointer_width = "32")] { u32::num_chars(val as u32) } #[cfg(target_pointer_width = "64")] { u64::num_chars(val as u64) } } } impl NumChars for isize { fn num_chars(val: isize) -> usize { #[cfg(target_pointer_width = "32")] { i32::num_chars(val as i32) } #[cfg(target_pointer_width = "64")] { i64::num_chars(val as i64) } } } #[cfg(test)] mod tests { use super::IntoRepr; #[test] fn test_from_u8_sanity() { let vals = [u8::MIN, u8::MIN + 1, 0, 42, u8::MAX - 1, u8::MAX]; for x in &vals { let repr = u8::into_repr(*x); assert_eq!(repr.as_str(), x.to_string()); } } #[test] fn test_from_i8_sanity() { let vals = [i8::MIN, i8::MIN + 1, 0, 42, i8::MAX - 1, i8::MAX]; for x in &vals { let repr = i8::into_repr(*x); assert_eq!(repr.as_str(), x.to_string()); } } #[test] fn test_from_u16_sanity() { let vals = [u16::MIN, u16::MIN + 1, 0, 42, u16::MAX - 1, u16::MAX]; for x in &vals { let repr = u16::into_repr(*x); assert_eq!(repr.as_str(), x.to_string()); } } #[test] fn test_from_i16_sanity() { let vals = [i16::MIN, i16::MIN + 1, 0, 42, i16::MAX - 1, i16::MAX]; for x in &vals { let repr = i16::into_repr(*x); assert_eq!(repr.as_str(), x.to_string()); } } #[test] fn test_from_u32_sanity() { let vals = [u32::MIN, u32::MIN + 1, 0, 42, u32::MAX - 1, u32::MAX]; for x in &vals { let repr = u32::into_repr(*x); assert_eq!(repr.as_str(), x.to_string()); } } #[test] fn test_from_i32_sanity() { let vals = [i32::MIN, i32::MIN + 1, 0, 42, i32::MAX - 1, i32::MAX]; for x in &vals { let repr = i32::into_repr(*x); assert_eq!(repr.as_str(), x.to_string()); } } #[test] fn test_from_u64_sanity() { let vals = [u64::MIN, u64::MIN + 1, 0, 42, u64::MAX - 1, u64::MAX]; for x in &vals { let repr = u64::into_repr(*x); assert_eq!(repr.as_str(), x.to_string()); } } 
#[test] fn test_from_i64_sanity() { let vals = [i64::MIN, i64::MIN + 1, 0, 42, i64::MAX - 1, i64::MAX]; for x in &vals { let repr = i64::into_repr(*x); assert_eq!(repr.as_str(), x.to_string()); } } #[test] fn test_from_usize_sanity() { let vals = [ usize::MIN, usize::MIN + 1, 0, 42, usize::MAX - 1, usize::MAX, ]; for x in &vals { let repr = usize::into_repr(*x); assert_eq!(repr.as_str(), x.to_string()); } } #[test] fn test_from_isize_sanity() { let vals = [ isize::MIN, isize::MIN + 1, 0, 42, isize::MAX - 1, isize::MAX, ]; for x in &vals { let repr = isize::into_repr(*x); assert_eq!(repr.as_str(), x.to_string()); } } #[test] fn test_from_u128_sanity() { let vals = [u128::MIN, u128::MIN + 1, 0, 42, u128::MAX - 1, u128::MAX]; for x in &vals { let repr = u128::into_repr(*x); assert_eq!(repr.as_str(), x.to_string()); } } #[test] fn test_from_i128_sanity() { let vals = [i128::MIN, i128::MIN + 1, 0, 42, i128::MAX - 1, i128::MAX]; for x in &vals { let repr = i128::into_repr(*x); assert_eq!(repr.as_str(), x.to_string()); } } } compact_str-0.7.1/src/repr/smallvec.rs000064400000000000000000000024661046102023000160720ustar 00000000000000use smallvec::SmallVec; use super::{ Repr, HEAP_MASK, MAX_SIZE, }; impl Repr { /// Consumes the [`Repr`] returning a byte vector in a [`SmallVec`] /// /// Note: both for the inlined case and the heap case, the buffers are re-used #[inline] pub fn into_bytes(self) -> SmallVec<[u8; MAX_SIZE]> { let last_byte = self.last_byte(); if last_byte == HEAP_MASK { let string = self.into_string(); let bytes = string.into_bytes(); SmallVec::from_vec(bytes) } else { // SAFETY: We just checked the discriminant to make sure we're an InlineBuffer let inline = unsafe { self.into_inline() }; let (array, length) = inline.into_array(); SmallVec::from_buf_and_len(array, length) } } } #[cfg(test)] mod tests { use test_case::test_case; use crate::CompactString; #[test_case("" ; "empty")] #[test_case("abc" ; "short")] #[test_case("I am a long string 
๐Ÿ˜Š๐Ÿ˜Š๐Ÿ˜Š๐Ÿ˜Š๐Ÿ˜Š" ; "long")] fn proptest_roundtrip(s: &'static str) { let og_compact = CompactString::from(s); assert_eq!(og_compact, s); let bytes = og_compact.into_bytes(); let ex_compact = CompactString::from_utf8(bytes).unwrap(); assert_eq!(ex_compact, s); } } compact_str-0.7.1/src/repr/traits.rs000064400000000000000000000075661046102023000156000ustar 00000000000000use super::Repr; const FALSE: Repr = Repr::new_inline("false"); const TRUE: Repr = Repr::new_inline("true"); /// Defines how to _efficiently_ create a [`Repr`] from `self` pub trait IntoRepr { fn into_repr(self) -> Repr; } impl IntoRepr for f32 { fn into_repr(self) -> Repr { let mut buf = ryu::Buffer::new(); let s = buf.format(self); Repr::new(s) } } impl IntoRepr for f64 { fn into_repr(self) -> Repr { let mut buf = ryu::Buffer::new(); let s = buf.format(self); Repr::new(s) } } impl IntoRepr for bool { fn into_repr(self) -> Repr { if self { TRUE } else { FALSE } } } impl IntoRepr for char { fn into_repr(self) -> Repr { let mut buf = [0_u8; 4]; Repr::new_inline(self.encode_utf8(&mut buf)) } } #[cfg(test)] mod tests { use quickcheck_macros::quickcheck; use super::IntoRepr; #[test] fn test_into_repr_bool() { let t = true; let repr = t.into_repr(); assert_eq!(repr.as_str(), t.to_string()); let f = false; let repr = f.into_repr(); assert_eq!(repr.as_str(), f.to_string()); } #[quickcheck] #[cfg_attr(miri, ignore)] fn quickcheck_into_repr_char(val: char) { let repr = char::into_repr(val); assert_eq!(repr.as_str(), val.to_string()); } #[test] fn test_into_repr_f64_sanity() { let vals = [ f64::MIN, f64::MIN_POSITIVE, f64::MAX, f64::NEG_INFINITY, f64::INFINITY, ]; for x in &vals { let repr = f64::into_repr(*x); let roundtrip = repr.as_str().parse::().unwrap(); assert_eq!(*x, roundtrip); } } #[test] fn test_into_repr_f64_nan() { let repr = f64::into_repr(f64::NAN); let roundtrip = repr.as_str().parse::().unwrap(); assert!(roundtrip.is_nan()); } #[quickcheck] #[cfg_attr(miri, ignore)] fn 
quickcheck_into_repr_f64(val: f64) { let repr = f64::into_repr(val); let roundtrip = repr.as_str().parse::().unwrap(); // Note: The formatting of floats by `ryu` sometimes differs from that of `std`, so instead // of asserting equality with `std` we just make sure the value roundtrips if val.is_nan() != roundtrip.is_nan() { assert_eq!(val, roundtrip); } } // `f32` formatting is broken on powerpc64le, not only in `ryu` but also `std` // // See: https://github.com/rust-lang/rust/issues/96306 #[test] #[cfg_attr(all(target_arch = "powerpc64", target_pointer_width = "64"), ignore)] fn test_into_repr_f32_sanity() { let vals = [ f32::MIN, f32::MIN_POSITIVE, f32::MAX, f32::NEG_INFINITY, f32::INFINITY, ]; for x in &vals { let repr = f32::into_repr(*x); let roundtrip = repr.as_str().parse::().unwrap(); assert_eq!(*x, roundtrip); } } #[test] #[cfg_attr(all(target_arch = "powerpc64", target_pointer_width = "64"), ignore)] fn test_into_repr_f32_nan() { let repr = f32::into_repr(f32::NAN); let roundtrip = repr.as_str().parse::().unwrap(); assert!(roundtrip.is_nan()); } #[quickcheck] #[cfg_attr(all(target_arch = "powerpc64", target_pointer_width = "64"), ignore)] fn proptest_into_repr_f32(val: f32) { let repr = f32::into_repr(val); let roundtrip = repr.as_str().parse::().unwrap(); // Note: The formatting of floats by `ryu` sometimes differs from that of `std`, so instead // of asserting equality with `std` we just make sure the value roundtrips if val.is_nan() != roundtrip.is_nan() { assert_eq!(val, roundtrip); } } } compact_str-0.7.1/src/tests.rs000064400000000000000000001257501046102023000144600ustar 00000000000000use core::slice; use std::borrow::Cow; use std::num; use std::str::FromStr; use proptest::collection::SizeRange; use proptest::prelude::*; use proptest::strategy::Strategy; use test_strategy::proptest; use crate::{ format_compact, CompactString, ToCompactString, }; #[cfg(target_pointer_width = "64")] const MAX_SIZE: usize = 24; #[cfg(target_pointer_width = "32")] 
const MAX_SIZE: usize = 12; const SIXTEEN_MB: usize = 16 * 1024 * 1024; /// generates random unicode strings, upto 80 chars long pub fn rand_unicode() -> impl Strategy { proptest::collection::vec(proptest::char::any(), 0..80).prop_map(|v| v.into_iter().collect()) } /// generates a random collection of bytes, upto 80 bytes long pub fn rand_bytes() -> impl Strategy> { proptest::collection::vec(any::(), 0..80) } /// generates a random collection of `u16`s, upto 80 elements long pub fn rand_u16s() -> impl Strategy> { proptest::collection::vec(any::(), 0..80) } /// [`proptest::strategy::Strategy`] that generates [`String`]s with up to `len` bytes pub fn rand_unicode_with_range(range: impl Into) -> impl Strategy { proptest::collection::vec(proptest::char::any(), range).prop_map(|v| v.into_iter().collect()) } /// generates groups upto 40 strings long of random unicode strings, upto 80 chars long fn rand_unicode_collection() -> impl Strategy> { proptest::collection::vec(rand_unicode(), 0..40) } /// Asserts a [`CompactString`] is allocated properly fn assert_allocated_properly(compact: &CompactString) { if compact.len() <= MAX_SIZE { assert!(!compact.is_heap_allocated()) } else { assert!(compact.is_heap_allocated()) } } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_strings_roundtrip(#[strategy(rand_unicode())] word: String) { let compact = CompactString::new(&word); prop_assert_eq!(&word, &compact); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_strings_allocated_properly(#[strategy(rand_unicode())] word: String) { let compact = CompactString::new(&word); assert_allocated_properly(&compact); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_char_iterator_roundtrips(#[strategy(rand_unicode())] word: String) { let compact: CompactString = word.clone().chars().collect(); prop_assert_eq!(&word, &compact) } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_string_iterator_roundtrips( #[strategy(rand_unicode_collection())] collection: Vec, ) { let compact: 
CompactString = collection.clone().into_iter().collect(); let word: String = collection.into_iter().collect(); prop_assert_eq!(&word, &compact); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_from_bytes_roundtrips(#[strategy(rand_unicode())] word: String) { let bytes = word.into_bytes(); let compact = CompactString::from_utf8(&bytes).unwrap(); let word = String::from_utf8(bytes).unwrap(); prop_assert_eq!(compact, word); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_from_bytes_only_valid_utf8(#[strategy(rand_bytes())] bytes: Vec) { let compact_result = CompactString::from_utf8(&bytes); let word_result = String::from_utf8(bytes); match (compact_result, word_result) { (Ok(c), Ok(s)) => prop_assert_eq!(c, s), (Err(c_err), Err(s_err)) => prop_assert_eq!(c_err, s_err.utf8_error()), _ => panic!("CompactString and core::str read UTF-8 differently?"), } } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_from_lossy_cow_roundtrips(#[strategy(rand_bytes())] bytes: Vec) { let cow = String::from_utf8_lossy(&bytes[..]); let compact = CompactString::from(cow.clone()); prop_assert_eq!(cow, compact); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_reserve_and_write_bytes(#[strategy(rand_unicode())] word: String) { let mut compact = CompactString::default(); prop_assert!(compact.is_empty()); // reserve enough space to write our bytes compact.reserve(word.len()); // SAFETY: We're writing a String which we know is UTF-8 let slice = unsafe { compact.as_mut_bytes() }; slice[..word.len()].copy_from_slice(word.as_bytes()); // SAFTEY: We know this is the length of our string, since `compact` started with 0 bytes // and we just wrote `word.len()` bytes unsafe { compact.set_len(word.len()) } prop_assert_eq!(&word, &compact); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_reserve_and_write_bytes_allocated_properly(#[strategy(rand_unicode())] word: String) { let mut compact = CompactString::default(); prop_assert!(compact.is_empty()); // reserve enough space to write our 
bytes compact.reserve(word.len()); // SAFETY: We're writing a String which we know is UTF-8 let slice = unsafe { compact.as_mut_bytes() }; slice[..word.len()].copy_from_slice(word.as_bytes()); // SAFTEY: We know this is the length of our string, since `compact` started with 0 bytes // and we just wrote `word.len()` bytes unsafe { compact.set_len(word.len()) } prop_assert_eq!(compact.len(), word.len()); // The string should be heap allocated if `word` was > MAX_SIZE // // NOTE: The reserve and write API's don't currently support the Packed representation prop_assert_eq!(compact.is_heap_allocated(), word.len() > MAX_SIZE); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_arbitrary_compact_string_converts_to_string(#[strategy(rand_unicode())] word: String) { let compact = CompactString::new(&word); let result = String::from(compact); prop_assert_eq!(result.len(), word.len()); prop_assert_eq!(result, word); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_extend_chars_allocated_properly( #[strategy(rand_unicode())] start: String, #[strategy(rand_unicode())] extend: String, ) { let mut compact = CompactString::new(&start); compact.extend(extend.chars()); let mut control = start.clone(); control.extend(extend.chars()); prop_assert_eq!(&compact, &control); assert_allocated_properly(&compact); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_truncate(#[strategy(rand_unicode())] mut control: String, val: u8) { let initial_len = control.len(); let mut compact = CompactString::new(&control); // turn the arbitrary number `val` into character indices let new_len = control .char_indices() .into_iter() .cycle() .nth(val as usize) .unwrap_or_default() .0; // then truncate both strings string control.truncate(new_len); compact.truncate(new_len); // assert they're equal prop_assert_eq!(&control, &compact); prop_assert_eq!(control.len(), compact.len()); // If we started as heap allocated, we should stay heap allocated. 
This prevents us from // needing to deallocate the buffer on the heap if initial_len > MAX_SIZE { prop_assert!(compact.is_heap_allocated()); } else { prop_assert!(!compact.is_heap_allocated()); } } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_from_utf16_roundtrips(#[strategy(rand_unicode())] control: String) { let utf16_buf: Vec = control.encode_utf16().collect(); let compact = CompactString::from_utf16(&utf16_buf).unwrap(); assert_eq!(compact, control); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_from_utf16_random(#[strategy(rand_u16s())] buf: Vec) { let compact = CompactString::from_utf16(&buf); let std_str = String::from_utf16(&buf); match (compact, std_str) { (Ok(c), Ok(s)) => assert_eq!(c, s), (Err(_), Err(_)) => (), (c_res, s_res) => panic!( "CompactString and String decode UTF-16 differently? {:?} {:?}", c_res, s_res ), } } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_from_utf16_lossy_roundtrips(#[strategy(rand_unicode())] control: String) { let utf16_buf: Vec = control.encode_utf16().collect(); let compact = CompactString::from_utf16_lossy(&utf16_buf); assert_eq!(compact, control); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_from_utf16_lossy_random(#[strategy(rand_u16s())] buf: Vec) { let control = String::from_utf16_lossy(&buf); let compact = CompactString::from_utf16_lossy(&buf); assert_eq!(compact, control); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_remove(#[strategy(rand_unicode_with_range(1..80))] mut control: String, val: u8) { let initial_len = control.len(); let mut compact = CompactString::new(&control); let idx = control .char_indices() .into_iter() .cycle() .nth(val as usize) .unwrap_or_default() .0; let control_char = control.remove(idx); let compact_char = compact.remove(idx); prop_assert_eq!(control_char, compact_char); prop_assert_eq!(control_char, compact_char); prop_assert_eq!(control.len(), compact.len()); // If we started as heap allocated, we should stay heap allocated. 
This prevents us from // needing to deallocate the buffer on the heap if initial_len > MAX_SIZE { prop_assert!(compact.is_heap_allocated()); } else { prop_assert!(!compact.is_heap_allocated()); } } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_from_utf8_unchecked(#[strategy(rand_bytes())] bytes: Vec) { let compact = unsafe { CompactString::from_utf8_unchecked(&bytes) }; let std_str = unsafe { String::from_utf8_unchecked(bytes.clone()) }; // we might not make valid strings, but we should be able to read the underlying bytes assert_eq!(compact.as_bytes(), std_str.as_bytes()); assert_eq!(compact.as_bytes(), bytes); // make sure the length is correct assert_eq!(compact.len(), bytes.len()); // check if we were valid UTF-8, if so, assert the data written into the CompactString is // correct let data_is_valid = std::str::from_utf8(&bytes); let compact_is_valid = std::str::from_utf8(compact.as_bytes()); let std_str_is_valid = std::str::from_utf8(std_str.as_bytes()); match (data_is_valid, compact_is_valid, std_str_is_valid) { (Ok(d), Ok(c), Ok(s)) => { // if we get &str's back, make sure they're all equal assert_eq!(d, c); assert_eq!(c, s); } (Err(d), Err(c), Err(s)) => { // if we get errors back, the errors should be the same assert_eq!(d, c); assert_eq!(c, s); } _ => panic!("data, CompactString, and String disagreed?"), } } #[test] fn test_const_creation() { const EMPTY: CompactString = CompactString::new_inline(""); const SHORT: CompactString = CompactString::new_inline("rust"); #[cfg(target_pointer_width = "64")] const PACKED: CompactString = CompactString::new_inline("i am 24 characters long!"); #[cfg(target_pointer_width = "32")] const PACKED: CompactString = CompactString::new_inline("i am 12 char"); assert_eq!(EMPTY, CompactString::new("")); assert_eq!(SHORT, CompactString::new("rust")); #[cfg(target_pointer_width = "64")] assert_eq!(PACKED, CompactString::new("i am 24 characters long!")); #[cfg(target_pointer_width = "32")] assert_eq!(PACKED, 
CompactString::new("i am 12 char")); } #[test] fn test_short_ascii() { // always inlined on all archs let strs = vec!["nyc", "statue", "liberty", "img_1234.png"]; for s in strs { let compact = CompactString::new(s); assert_eq!(compact, s); assert_eq!(s, compact); assert_eq!(compact.is_heap_allocated(), false); } } #[test] fn test_short_unicode() { let strs = vec![ ("๐Ÿฆ€", false), ("๐ŸŒงโ˜€๏ธ", false), // str is 12 bytes long, and leading character is non-ASCII ("ๅ’ฌ๐“…ˆ๊ˆ:_", false), ]; for (s, is_heap) in strs { let compact = CompactString::new(s); assert_eq!(compact, s); assert_eq!(s, compact); assert_eq!(compact.is_heap_allocated(), is_heap); } } #[test] fn test_medium_ascii() { let strs = vec![ "rustconf 2021", "new york city", "nyc pizza is good", "test the 24 char limit!!", ]; for s in strs { let compact = CompactString::new(s); assert_eq!(compact, s); assert_eq!(s, compact); #[cfg(target_pointer_width = "64")] let is_heap = false; #[cfg(target_pointer_width = "32")] let is_heap = true; assert_eq!(compact.is_heap_allocated(), is_heap); } } #[test] fn test_medium_unicode() { let strs = vec![ ("โ˜•๏ธ๐Ÿ‘€๐Ÿ˜๐ŸŽ‰", false), // str is 24 bytes long, and leading character is non-ASCII ("๐Ÿฆ€๐Ÿ˜€๐Ÿ˜ƒ๐Ÿ˜„๐Ÿ˜๐Ÿฆ€", false), ]; #[allow(unused_variables)] for (s, is_heap) in strs { let compact = CompactString::new(s); assert_eq!(compact, s); assert_eq!(s, compact); #[cfg(target_pointer_width = "64")] let is_heap = is_heap; #[cfg(target_pointer_width = "32")] let is_heap = true; assert_eq!(compact.is_heap_allocated(), is_heap); } } #[test] fn test_from_str_trait() { let s = "hello_world"; // Until the never type `!` is stabilized, we have to unwrap here let c = CompactString::from_str(s).unwrap(); assert_eq!(s, c); } #[test] #[cfg_attr(target_pointer_width = "32", ignore)] fn test_from_char_iter() { let s = "\u{0} 0 \u{0}a๐€€๐€€ ๐€€a๐€€"; println!("{}", s.len()); let compact: CompactString = s.chars().into_iter().collect(); assert!(!compact.is_heap_allocated()); 
assert_eq!(s, compact);
}

#[test]
#[cfg_attr(target_pointer_width = "32", ignore)]
fn test_extend_packed_from_empty() {
    let s = " 0\u{80}A\u{0}𐀀 𐀀¡a𐀀0";

    let mut compact = CompactString::new(s);
    assert!(!compact.is_heap_allocated());

    // extend from an empty iterator
    compact.extend("".chars());

    // we should still be inlined
    assert!(!compact.is_heap_allocated());
}

#[test]
fn test_pop_empty() {
    let num_pops = 256;
    let mut compact = CompactString::from("");

    // popping from an empty string must keep returning `None`
    (0..num_pops).for_each(|_| {
        let ch = compact.pop();
        assert!(ch.is_none());
    });
    assert!(compact.is_empty());
    assert_eq!(compact, "");
}

#[test]
fn test_extend_from_empty_strs() {
    let strs = vec![
        "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
        "", "",
    ];
    let compact: CompactString = strs.clone().into_iter().collect();

    assert_eq!(compact, "");
    assert!(compact.is_empty());
    assert!(!compact.is_heap_allocated());
}

#[test]
fn test_compact_str_is_send_and_sync() {
    // only compiles if `CompactString` is both `Send` and `Sync`
    fn is_send_and_sync<T: Send + Sync>() {}
    is_send_and_sync::<CompactString>();
}

#[test]
fn test_fmt_write() {
    use core::fmt::Write;

    let mut compact = CompactString::default();

    write!(compact, "test").unwrap();
    assert_eq!(compact, "test");

    writeln!(compact, "{}", 1234).unwrap();
    assert_eq!(compact, "test1234\n");

    // `{:>8}` pads "some" on the left and `{:<8}` pads "words" on the right, each to 8 columns
    write!(compact, "{:>8} {} {:<8}", "some", "more", "words").unwrap();
    assert_eq!(compact, "test1234\n    some more words   ");
}

#[test]
fn test_plus_operator() {
    // + &CompactString
    assert_eq!(CompactString::from("a") + &CompactString::from("b"), "ab");
    // + &str
    assert_eq!(CompactString::from("a") + "b", "ab");
    // + &String
    assert_eq!(CompactString::from("a") + &String::from("b"), "ab");
    // + &Box<str>
    let box_str = String::from("b").into_boxed_str();
    assert_eq!(CompactString::from("a") + &box_str, "ab");
    // + &Cow<'a, str>
    let cow = Cow::from("b");
    assert_eq!(CompactString::from("a") + &cow, "ab");

    // Implementing `Add for String` can break adding &String or other types to String, so we
    // explicitly don't do this. See https://github.com/rust-lang/rust/issues/77143 for more details.
    // Below we assert adding types to String still compiles

    // String + &CompactString
    assert_eq!(String::from("a") + &CompactString::from("b"), "ab");
    // String + &String
    assert_eq!(String::from("a") + &("b".to_string()), "ab");
    // String + &str
    assert_eq!(String::from("a") + &"b", "ab");
}

#[test]
fn test_plus_equals_operator() {
    let mut m = CompactString::from("a");
    m += "b";
    assert_eq!(m, "ab");
}

#[test]
fn test_u8_to_compact_string() {
    let vals = [u8::MIN, 1, 42, u8::MAX - 2, u8::MAX - 1, u8::MAX];

    for x in &vals {
        let c = x.to_compact_string();
        let s = x.to_string();

        assert_eq!(c, s);
        assert!(!c.is_heap_allocated());
    }
}

#[test]
fn test_i8_to_compact_string() {
    let vals = [
        i8::MIN,
        i8::MIN + 1,
        i8::MIN + 2,
        -1,
        0,
        1,
        42,
        i8::MAX - 2,
        i8::MAX - 1,
        i8::MAX,
    ];

    for x in &vals {
        let c = x.to_compact_string();
        let s = x.to_string();

        assert_eq!(c, s);
        assert!(!c.is_heap_allocated());
    }
}

#[test]
fn test_u16_to_compact_string() {
    let vals = [u16::MIN, 1, 42, 999, u16::MAX - 2, u16::MAX - 1, u16::MAX];

    for x in &vals {
        let c = x.to_compact_string();
        let s = x.to_string();

        assert_eq!(c, s);
        assert!(!c.is_heap_allocated());
    }
}

#[test]
fn test_i16_to_compact_string() {
    let vals = [
        i16::MIN,
        i16::MIN + 1,
        i16::MIN + 2,
        -42,
        -1,
        0,
        1,
        42,
        999,
        i16::MAX - 2,
        i16::MAX - 1,
        i16::MAX,
    ];

    for x in &vals {
        let c = x.to_compact_string();
        let s = x.to_string();

        assert_eq!(c, s);
        assert!(!c.is_heap_allocated());
    }
}

#[test]
fn test_u32_to_compact_string() {
    let vals = [
        u32::MIN,
        1,
        42,
        999,
        123456789,
        u32::MAX - 2,
        u32::MAX - 1,
        u32::MAX,
    ];

    for x in &vals {
        let c = x.to_compact_string();
        let s = x.to_string();

        assert_eq!(c, s);
        assert!(!c.is_heap_allocated());
    }
}

#[test]
fn test_i32_to_compact_string() {
    let vals = [
        i32::MIN,
        i32::MIN + 2,
        i32::MIN + 1,
        -12345678,
        -42,
        -1,
        0,
        1,
        999,
        123456789,
        i32::MAX - 2,
        i32::MAX - 1,
        i32::MAX,
    ];

    for x in &vals {
        let c = x.to_compact_string();
        let s = x.to_string();
assert_eq!(c, s);
        assert!(!c.is_heap_allocated());
    }
}

#[test]
fn test_u64_to_compact_string() {
    let vals = [
        u64::MIN,
        1,
        999,
        123456789,
        98765432123456,
        u64::MAX - 2,
        u64::MAX - 1,
        u64::MAX,
    ];

    for x in &vals {
        let c = x.to_compact_string();
        let s = x.to_string();

        assert_eq!(c, s);

        // u64 can be up-to 20 characters long, which can't be inlined on 32-bit arches
        #[cfg(target_pointer_width = "64")]
        assert!(!c.is_heap_allocated());
    }
}

#[test]
fn test_i64_to_compact_string() {
    let vals = [
        i64::MIN,
        i64::MIN + 1,
        i64::MIN + 2,
        -22222222,
        -42,
        0,
        1,
        999,
        123456789,
        i64::MAX - 2,
        i64::MAX - 1,
        i64::MAX,
    ];

    for x in &vals {
        let c = x.to_compact_string();
        let s = x.to_string();

        assert_eq!(c, s);

        // i64 can be up-to 20 characters long, which can't be inlined on 32-bit arches
        #[cfg(target_pointer_width = "64")]
        assert!(!c.is_heap_allocated());
    }
}

#[test]
fn test_u128_to_compact_string() {
    let vals = [
        u128::MIN,
        1,
        999,
        123456789,
        u128::MAX - 2,
        u128::MAX - 1,
        u128::MAX,
    ];

    for x in &vals {
        let c = x.to_compact_string();
        let s = x.to_string();

        assert_eq!(c, s);
    }
}

#[test]
fn test_i128_to_compact_string() {
    let vals = [
        i128::MIN,
        i128::MIN + 1,
        i128::MIN + 2,
        -22222222,
        -42,
        0,
        1,
        999,
        123456789,
        i128::MAX - 2,
        i128::MAX - 1,
        i128::MAX,
    ];

    for x in &vals {
        let c = x.to_compact_string();
        let s = x.to_string();

        assert_eq!(c, s);
    }
}

#[test]
fn test_bool_to_compact_string() {
    let c = true.to_compact_string();
    let s = true.to_string();

    assert_eq!("true", c);
    assert_eq!(c, s);
    assert!(!c.is_heap_allocated());

    let c = false.to_compact_string();
    let s = false.to_string();

    assert_eq!("false", c);
    assert_eq!(c, s);
    assert!(!c.is_heap_allocated());
}

/// Asserts that `MAX` of the given integer type renders identically through `to_string()` and
/// `to_compact_string()`
macro_rules! assert_int_MAX_to_compact_string {
    ($int: ty) => {
        assert_eq!(&*<$int>::MAX.to_string(), &*<$int>::MAX.to_compact_string());
    };
}

#[test]
fn test_to_compact_string() {
    // Test specialisation for bool, char and String
    //
    // the receivers are `bool` values on purpose: calling `to_compact_string()` on the `&str`
    // literals "true"/"false" would bypass the `bool` specialisation
    assert_eq!(&*true.to_string(), true.to_compact_string());
    assert_eq!(&*false.to_string(), false.to_compact_string());

    assert_eq!("1", '1'.to_compact_string());
    assert_eq!("2333", "2333".to_string().to_compact_string());
    assert_eq!("2333", "2333".to_compact_string().to_compact_string());

    // Test specialisation for int and nonzero_int using itoa
    assert_eq!("234", 234.to_compact_string());
    assert_eq!(
        "234",
        num::NonZeroU64::new(234).unwrap().to_compact_string()
    );

    assert_int_MAX_to_compact_string!(u8);
    assert_int_MAX_to_compact_string!(i8);

    assert_int_MAX_to_compact_string!(u16);
    assert_int_MAX_to_compact_string!(i16);

    assert_int_MAX_to_compact_string!(u32);
    assert_int_MAX_to_compact_string!(i32);

    assert_int_MAX_to_compact_string!(u64);
    assert_int_MAX_to_compact_string!(i64);

    assert_int_MAX_to_compact_string!(usize);
    assert_int_MAX_to_compact_string!(isize);

    // Test specialisation for f32 and f64 using ryu
    // TODO: Fix bug in powerpc64, which is a little endian system
    #[cfg(not(all(target_arch = "powerpc64", target_pointer_width = "64")))]
    {
        assert_eq!(
            (&*3.2_f32.to_string(), &*288888.290028_f64.to_string()),
            (
                &*3.2_f32.to_compact_string(),
                &*288888.290028_f64.to_compact_string()
            )
        );

        assert_eq!("inf", f32::INFINITY.to_compact_string());
        assert_eq!("-inf", f32::NEG_INFINITY.to_compact_string());

        assert_eq!("inf", f64::INFINITY.to_compact_string());
        assert_eq!("-inf", f64::NEG_INFINITY.to_compact_string());

        assert_eq!("NaN", f32::NAN.to_compact_string());
        assert_eq!("NaN", f64::NAN.to_compact_string());
    }

    // Test generic Display implementation
    assert_eq!("234", "234".to_compact_string());
    assert_eq!("12345", format_compact!("{}", "12345"));
    assert_eq!("112345", format_compact!("1{}", "12345"));
    assert_eq!("1123452", format_compact!("1{}{}", "12345", 2));
    assert_eq!("11234522", format_compact!("1{}{}{}", "12345", 2, '2'));
    assert_eq!(
        "112345221000",
        format_compact!("1{}{}{}{}", "12345", 2, '2', 1000)
    );

    // Test string longer than repr::MAX_SIZE
    assert_eq!(
        "01234567890123456789999999",
        format_compact!("0{}67890123456789{}", "12345", 999999)
    );
}

#[test]
fn test_into_string_large_string_with_excess_capacity() {
    let mut string = String::with_capacity(128);
    string.push_str("abcdefghijklmnopqrstuvwxyz");
    let str_addr = string.as_ptr();
    let str_len = string.len();
    let str_cap = string.capacity();

    let compact = CompactString::from(string);
    let new_string = String::from(compact);
    let new_str_addr = new_string.as_ptr();
    let new_str_len = new_string.len();
    let new_str_cap = new_string.capacity();

    // the heap buffer is handed over in both directions, never copied
    assert_eq!(str_addr, new_str_addr);
    assert_eq!(str_len, new_str_len);
    assert_eq!(str_cap, new_str_cap);
}

#[test]
fn test_into_string_where_32_bit_capacity_is_on_heap() {
    let buf = vec![b'a'; SIXTEEN_MB - 1];
    // SAFETY: `buf` is filled with ASCII `a`s.
    // This primarily speeds up miri, as we don't need to check every byte
    // in the input buffer
    let string = unsafe { String::from_utf8_unchecked(buf) };

    let str_addr = string.as_ptr();
    let str_len = string.len();
    let str_cap = string.capacity();

    let compact = CompactString::from(string);
    let new_string = String::from(compact);
    let new_str_addr = new_string.as_ptr();
    let new_str_len = new_string.len();
    let new_str_cap = new_string.capacity();

    assert_eq!(str_len, new_str_len);

    if cfg!(target_pointer_width = "64") {
        assert_eq!(str_addr, new_str_addr);
        assert_eq!(str_cap, new_str_cap);
    } else {
        assert_eq!(&new_string.as_bytes()[0..10], b"aaaaaaaaaa");
        assert_eq!(str_len, new_str_cap);
    }
}

#[test]
fn test_into_string_small_string_with_excess_capacity() {
    let mut string = String::with_capacity(128);
    string.push_str("abcdef");
    let str_len = string.len();

    let compact = CompactString::from(string);

    // we should inline this string, which would truncate capacity
    //
    // note: String truncates capacity on Clone, so truncating here seems reasonable
    assert!(!compact.is_heap_allocated());
    assert_eq!(compact.len(), str_len);
    assert_eq!(compact.capacity(), MAX_SIZE);
}

#[test]
fn test_from_string_buffer_small_string_with_excess_capacity() {
    let mut string = String::with_capacity(128);
    string.push_str("abcedfg");

    let str_ptr = string.as_ptr();
    let str_len = string.len();
    let str_cap = string.capacity();

    // using from_string_buffer should always re-use the underlying buffer
    let compact = CompactString::from_string_buffer(string);
    assert!(compact.is_heap_allocated());

    let cpt_ptr = compact.as_ptr();
    let cpt_len = compact.len();
    let cpt_cap = compact.capacity();

    assert_eq!(str_ptr, cpt_ptr);
    assert_eq!(str_len, cpt_len);
    assert_eq!(str_cap, cpt_cap);
}

#[test]
fn test_into_string_small_string_with_no_excess_capacity() {
    let string = String::from("abcdef");
    let str_len = string.len();

    let compact = CompactString::from(string);

    // we should eagerly inline the string
assert!(!compact.is_heap_allocated()); assert_eq!(compact.len(), str_len); assert_eq!(compact.capacity(), MAX_SIZE); } #[test] fn test_from_string_buffer_small_string_with_no_excess_capacity() { let string = String::from("abcdefg"); let str_ptr = string.as_ptr(); let str_len = string.len(); let str_cap = string.capacity(); // using from_string_buffer should always re-use the underlying buffer let compact = CompactString::from_string_buffer(string); assert!(compact.is_heap_allocated()); let cpt_ptr = compact.as_ptr(); let cpt_len = compact.len(); let cpt_cap = compact.capacity(); assert_eq!(str_ptr, cpt_ptr); assert_eq!(str_len, cpt_len); assert_eq!(str_cap, cpt_cap); } #[test] fn test_roundtrip_from_string_empty_string() { let string = String::new(); let str_ptr = string.as_ptr(); let str_len = string.len(); let str_cap = string.capacity(); let compact = CompactString::from(string); // we should always inline empty strings assert!(!compact.is_heap_allocated()); let new_string = String::from(compact); let new_str_ptr = new_string.as_ptr(); let new_str_len = new_string.len(); let new_str_cap = new_string.capacity(); assert_eq!(str_ptr, new_str_ptr); assert_eq!(str_len, new_str_len); assert_eq!(str_cap, new_str_cap); } #[test] fn test_roundtrip_from_string_buffer_empty_string() { let string = String::new(); let str_ptr = string.as_ptr(); let str_len = string.len(); let str_cap = string.capacity(); let compact = CompactString::from_string_buffer(string); // we should always inline empty strings assert!(!compact.is_heap_allocated()); let new_string = String::from(compact); let new_str_ptr = new_string.as_ptr(); let new_str_len = new_string.len(); let new_str_cap = new_string.capacity(); assert_eq!(str_ptr, new_str_ptr); assert_eq!(str_len, new_str_len); assert_eq!(str_cap, new_str_cap); } #[test] fn test_into_string_small_str() { let data = "abcdef"; let str_addr = data.as_ptr(); let str_len = data.len(); let compact = CompactString::from(data); let new_string = 
String::from(compact); let new_str_addr = new_string.as_ptr(); let new_str_len = new_string.len(); let new_str_cap = new_string.capacity(); assert_ne!(str_addr, new_str_addr); assert_eq!(str_len, new_str_len); assert_eq!(str_len, new_str_cap); } #[test] fn test_into_string_long_str() { let data = "this is a long string that will be on the heap"; let str_addr = data.as_ptr(); let str_len = data.len(); let compact = CompactString::from(data); let new_string = String::from(compact); let new_str_addr = new_string.as_ptr(); let new_str_len = new_string.len(); let new_str_cap = new_string.capacity(); assert_ne!(str_addr, new_str_addr); assert_eq!(str_len, new_str_len); assert_eq!(str_len, new_str_cap); } #[test] fn test_into_string_empty_str() { let data = ""; let str_len = data.len(); let compact = CompactString::from(data); let new_string = String::from(compact); let new_str_addr = new_string.as_ptr(); let new_str_len = new_string.len(); let new_str_cap = new_string.capacity(); assert_eq!(String::new().as_ptr(), new_str_addr); assert_eq!(str_len, new_str_len); assert_eq!(str_len, new_str_cap); } #[test] fn test_truncate_noops_if_new_len_greater_than_current() { let mut short = CompactString::from("short"); let short_cap = short.capacity(); short.truncate(100); assert_eq!(short.len(), 5); assert_eq!(short.capacity(), short_cap); let mut long = CompactString::from("i am a long string that will be allocated on the heap"); let long_cap = long.capacity(); long.truncate(500); assert_eq!(long.len(), 53); assert_eq!(long.capacity(), long_cap); } #[test] #[should_panic(expected = "new_len must lie on char boundary")] fn test_truncate_panics_on_non_char_boundary() { let mut emojis = CompactString::from("๐Ÿ˜€๐Ÿ˜€๐Ÿ˜€๐Ÿ˜€"); assert!('๐Ÿ˜€'.len_utf8() > 1); emojis.truncate(1); } #[test] fn test_insert() { // insert into empty string let mut one_byte = CompactString::from(""); one_byte.insert(0, '.'); assert_eq!(one_byte, "."); let mut two_bytes = CompactString::from(""); 
two_bytes.insert(0, 'รœ'); assert_eq!(two_bytes, "รœ"); let mut three_bytes = CompactString::from(""); three_bytes.insert(0, 'โ‚ฌ'); assert_eq!(three_bytes, "โ‚ฌ"); let mut four_bytes = CompactString::from(""); four_bytes.insert(0, '๐Ÿ˜€'); assert_eq!(four_bytes, "๐Ÿ˜€"); // insert at the front of string let mut one_byte = CompactString::from("๐Ÿ˜€"); one_byte.insert(0, '.'); assert_eq!(one_byte, ".๐Ÿ˜€"); let mut two_bytes = CompactString::from("๐Ÿ˜€"); two_bytes.insert(0, 'รœ'); assert_eq!(two_bytes, "รœ๐Ÿ˜€"); let mut three_bytes = CompactString::from("๐Ÿ˜€"); three_bytes.insert(0, 'โ‚ฌ'); assert_eq!(three_bytes, "โ‚ฌ๐Ÿ˜€"); let mut four_bytes = CompactString::from("๐Ÿ˜€"); four_bytes.insert(0, '๐Ÿ˜€'); assert_eq!(four_bytes, "๐Ÿ˜€๐Ÿ˜€"); // insert at the end of string let mut one_byte = CompactString::from("๐Ÿ˜€"); one_byte.insert(4, '.'); assert_eq!(one_byte, "๐Ÿ˜€."); let mut two_bytes = CompactString::from("๐Ÿ˜€"); two_bytes.insert(4, 'รœ'); assert_eq!(two_bytes, "๐Ÿ˜€รœ"); let mut three_bytes = CompactString::from("๐Ÿ˜€"); three_bytes.insert(4, 'โ‚ฌ'); assert_eq!(three_bytes, "๐Ÿ˜€โ‚ฌ"); let mut four_bytes = CompactString::from("๐Ÿ˜€"); four_bytes.insert(4, '๐Ÿ˜€'); assert_eq!(four_bytes, "๐Ÿ˜€๐Ÿ˜€"); // insert in the middle of string let mut one_byte = CompactString::from("๐Ÿ˜€๐Ÿ˜€"); one_byte.insert(4, '.'); assert_eq!(one_byte, "๐Ÿ˜€.๐Ÿ˜€"); let mut two_bytes = CompactString::from("๐Ÿ˜€๐Ÿ˜€"); two_bytes.insert(4, 'รœ'); assert_eq!(two_bytes, "๐Ÿ˜€รœ๐Ÿ˜€"); let mut three_bytes = CompactString::from("๐Ÿ˜€๐Ÿ˜€"); three_bytes.insert(4, 'โ‚ฌ'); assert_eq!(three_bytes, "๐Ÿ˜€โ‚ฌ๐Ÿ˜€"); let mut four_bytes = CompactString::from("๐Ÿ˜€๐Ÿ˜€"); four_bytes.insert(4, '๐Ÿ˜€'); assert_eq!(four_bytes, "๐Ÿ˜€๐Ÿ˜€๐Ÿ˜€"); // edge case: new length is 24 bytes let mut s = CompactString::from("\u{ffff}\u{ffff}\u{ffff}\u{ffff}\u{ffff}\u{ffff}\u{ffff}"); s.insert(21, '\u{ffff}'); assert_eq!( s, "\u{ffff}\u{ffff}\u{ffff}\u{ffff}\u{ffff}\u{ffff}\u{ffff}\u{ffff}", ); } #[test] fn 
test_remove() { let mut control = String::from("๐Ÿฆ„๐Ÿฆ€hello๐ŸŽถworld๐Ÿ‡บ๐Ÿ‡ธ"); let mut compact = CompactString::from(&control); assert_eq!(control.remove(0), compact.remove(0)); assert_eq!(control, compact); assert_eq!(compact, "๐Ÿฆ€hello๐ŸŽถworld๐Ÿ‡บ๐Ÿ‡ธ"); let music_idx = control .char_indices() .find(|(_idx, c)| *c == '๐ŸŽถ') .map(|(idx, _c)| idx) .unwrap(); assert_eq!(control.remove(music_idx), compact.remove(music_idx)); assert_eq!(control, compact); assert_eq!(compact, "๐Ÿฆ€helloworld๐Ÿ‡บ๐Ÿ‡ธ"); } #[test] #[should_panic(expected = "cannot remove a char from the end of a string")] fn test_remove_empty_string() { let mut compact = CompactString::new(""); compact.remove(0); } #[test] #[should_panic(expected = "cannot remove a char from the end of a string")] fn test_remove_str_len() { let mut compact = CompactString::new("hello world"); compact.remove(compact.len()); } #[test] fn test_with_capacity_16711422() { // Fuzzing with AFL on a 32-bit ARM arch found this bug! // // We have our own heap implementation called BoxString, which optionally stores the capacity // on the heap, which is really only relevant for 32-bit architectures. The discriminant it used // to determine if capacity was on the heap, was when the last `usize` number of bytes were all // equal to our internal HEAP_MASK, which at the time was `255`. At the time this worked and was // correct. // // When we released support to make the size of CompactString == Option<CompactString>, we // changed the HEAP_MASK to `254`, which unintentionally made our discriminant for determining // if our capacity was on the heap, all `254`s, yet our "max inline capacity value" was still // based on the discriminant being all `255`s. // // When creating a BoxString with capacity 16711422, we'd correctly decide we could store the // capacity inline, but this would create a capacity with an underlying value of // [254, 254, 254, HEAP_MASK].
Once the HEAP_MASK changed to 254, this capacity was now the same // as the discriminant to determine if the capacity was on the heap, so we'd incorrectly // identify the capacity as being on the heap, when it was really inline. assert_eq!(16711422_u32.to_le_bytes(), [254, 254, 254, 0]); let compact = CompactString::with_capacity(16711422); let std_str = String::with_capacity(16711422); assert!(compact.is_heap_allocated()); assert_eq!(compact.capacity(), std_str.capacity()); assert_eq!(compact, ""); assert_eq!(compact, std_str); } #[test] fn test_from_utf16() { let control = String::from("๐Ÿฆ„ hello world! ๐ŸŽฎ "); let utf16_buf: Vec = control.encode_utf16().collect(); let compact = CompactString::from_utf16(&utf16_buf).unwrap(); assert_eq!(compact, control); cfg_if::cfg_if! { if #[cfg(target_pointer_width = "64")] { assert!(!compact.is_heap_allocated()); } else if #[cfg(target_pointer_width = "32")] { assert!(compact.is_heap_allocated()); } else { compile_error!("unsupported pointer width!"); } } } #[test] fn test_reserve_shrink_roundtrip() { const TEXT: &str = "Hello."; let mut s = CompactString::new(TEXT); assert!(!s.is_heap_allocated()); assert_eq!(s.capacity(), MAX_SIZE); assert_eq!(s, TEXT); s.reserve(128); assert!(s.is_heap_allocated()); assert!(s.capacity() >= 128 + TEXT.len()); assert_eq!(s, TEXT); s.shrink_to(64); assert!(s.is_heap_allocated()); assert!(s.capacity() >= 64); assert_eq!(s, TEXT); s.shrink_to_fit(); assert!(!s.is_heap_allocated()); assert_eq!(s.capacity(), MAX_SIZE); assert_eq!(s, TEXT); s.reserve(SIXTEEN_MB); assert!(s.is_heap_allocated()); assert!(s.capacity() >= SIXTEEN_MB + TEXT.len()); assert_eq!(s, TEXT); s.shrink_to(64); assert!(s.is_heap_allocated()); assert!(s.capacity() >= 64); assert_eq!(s, TEXT); s.reserve(SIXTEEN_MB); assert!(s.is_heap_allocated()); assert!(s.capacity() >= SIXTEEN_MB + TEXT.len()); assert_eq!(s, TEXT); s.shrink_to_fit(); assert!(!s.is_heap_allocated()); assert_eq!(s.capacity(), MAX_SIZE); assert_eq!(s, TEXT); } 
#[test] fn test_from_utf8_unchecked_sanity() { let text = "hello ๐ŸŒŽ, you are nice"; let compact = unsafe { CompactString::from_utf8_unchecked(text) }; assert_eq!(compact, text); } #[test] fn test_from_utf8_unchecked_long() { let bytes = [255; 2048]; let compact = unsafe { CompactString::from_utf8_unchecked(bytes) }; assert_eq!(compact.len(), 2048); assert_eq!(compact.as_bytes(), bytes); } #[test] fn test_from_utf8_unchecked_short() { let bytes = [255; 10]; let compact = unsafe { CompactString::from_utf8_unchecked(bytes) }; assert_eq!(compact.len(), 10); assert_eq!(compact.as_bytes(), bytes); } #[test] fn test_from_utf8_unchecked_empty() { let bytes = [255; 0]; let compact = unsafe { CompactString::from_utf8_unchecked(bytes) }; assert_eq!(compact.len(), 0); assert_eq!(compact.as_bytes(), bytes); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_from_utf8_lossy(#[strategy(rand_bytes())] bytes: Vec) { let compact = CompactString::from_utf8_lossy(&bytes); let control = String::from_utf8_lossy(&bytes); assert_eq!(compact, control); assert_eq!(compact.len(), control.len()); } #[proptest] #[cfg_attr(miri, ignore)] fn proptest_from_utf16(#[strategy(rand_u16s())] buf: Vec) { const FUNCS: &[( fn(&[u8]) -> Result, fn(u16) -> u16, fn([u8; 2]) -> u16, )] = &[ ( |v| CompactString::from_utf16le(v), u16::from_le, u16::from_le_bytes, ), ( |v| CompactString::from_utf16be(v), u16::from_be, u16::from_be_bytes, ), ]; for (new_compact_string, from_int, from_bytes) in FUNCS { let buf = &*buf; let bytes: &[u8] = unsafe { slice::from_raw_parts(buf.as_ptr().cast(), buf.len() * 2) }; let compact = new_compact_string(bytes); let control = String::from_utf16(&buf.iter().copied().map(from_int).collect::>()); assert_eq!(compact.is_ok(), control.is_ok()); if let (Ok(compact), Ok(control)) = (compact, control) { assert_eq!(compact.len(), control.len()); assert_eq!(compact, control); } if bytes.len() >= 2 { // Test if `CompactString::from_utf16x()` works with misaligned slices. 
let bytes: &[u8] = &bytes[1..bytes.len() - 1]; let buf: Vec = bytes .chunks_exact(2) .map(|v| from_bytes([v[0], v[1]])) .collect(); let compact = new_compact_string(bytes); let control = String::from_utf16(&buf); assert_eq!(compact.is_ok(), control.is_ok()); if let (Ok(compact), Ok(control)) = (compact, control) { assert_eq!(compact.len(), control.len()); assert_eq!(compact, control); } } } } #[test] fn test_from_utf16x() { let dancing_men = b"\x3d\xd8\x6f\xdc\x0d\x20\x42\x26\x0f\xfe"; assert_eq!(CompactString::from_utf16le(dancing_men).unwrap(), "๐Ÿ‘ฏโ€โ™‚๏ธ"); let dancing_men = b"0\x3d\xd8\x6f\xdc\x0d\x20\x42\x26\x0f\xfe"; assert!(CompactString::from_utf16le(dancing_men).is_err()); assert_eq!( CompactString::from_utf16le(&dancing_men[1..]).unwrap(), "๐Ÿ‘ฏโ€โ™‚๏ธ", ); let dancing_women = b"\xd8\x3d\xdc\x6f\x20\x0d\x26\x40\xfe\x0f"; assert_eq!(CompactString::from_utf16be(dancing_women).unwrap(), "๐Ÿ‘ฏโ€โ™€๏ธ"); let dancing_women = b"0\xd8\x3d\xdc\x6f\x20\x0d\x26\x40\xfe\x0f"; assert!(CompactString::from_utf16be(dancing_women).is_err()); assert_eq!( CompactString::from_utf16be(&dancing_women[1..]).unwrap(), "๐Ÿ‘ฏโ€โ™€๏ธ", ); } #[test] fn test_from_utf16x_lossy() { let dancing_men = b"\x3d\xd8\x6f\xfc\x0d\x20\x42\x26\x0f\xfe"; assert_eq!( CompactString::from_utf16le_lossy(dancing_men), "๏ฟฝ\u{fc6f}\u{200d}โ™‚๏ธ", ); let dancing_men = b"0\x3d\xd8\x6f\xfc\x0d\x20\x42\x26\x0f\xfe"; assert_eq!( CompactString::from_utf16le_lossy(&dancing_men[1..]), "๏ฟฝ\u{fc6f}\u{200d}โ™‚๏ธ", ); let dancing_women = b"\xd8\x3d\xdc\x6f\x20\x0d\x26\x40\xde\x0f"; assert_eq!( CompactString::from_utf16be_lossy(dancing_women), "๐Ÿ‘ฏ\u{200d}โ™€๏ฟฝ", ); let dancing_women = b"0\xd8\x3d\xdc\x6f\x20\x0d\x26\x40\xde\x0f"; assert_eq!( CompactString::from_utf16be_lossy(&dancing_women[1..]), "๐Ÿ‘ฏ\u{200d}โ™€๏ฟฝ", ); } #[test] fn test_collect() { const VALUES: &[&str] = &["foo", "bar", "baz"]; assert_eq!( VALUES .iter() .copied() .map(Cow::Borrowed) .collect::(), "foobarbaz", ); assert_eq!( 
VALUES .iter() .copied() .map(|s| Cow::Owned(s.into())) .collect::(), "foobarbaz", ); assert_eq!( VALUES .iter() .copied() .map(Box::::from) .collect::(), "foobarbaz", ); assert_eq!( VALUES .iter() .copied() .map(CompactString::from) .collect::(), "foobarbaz", ); assert_eq!( VALUES .iter() .copied() .map(CompactString::from) .collect::>(), "foobarbaz", ); assert_eq!( VALUES .iter() .copied() .flat_map(|s| s.chars()) .collect::>(), "foobarbaz", ); } #[test] fn test_into_cow() { let og = "aaa"; let compact = CompactString::new(og); let cow: std::borrow::Cow<'_, str> = compact.into(); assert_eq!(og, cow); } #[test] fn test_from_string_buffer_inlines_on_push() { let mut compact = CompactString::from_string_buffer("hello".to_string()); assert!(compact.is_heap_allocated()); compact.push_str(" world"); // when growing the CompactString we should inline it assert!(!compact.is_heap_allocated()); } #[test] fn test_from_string_buffer_inlines_on_clone() { let a = CompactString::from_string_buffer("hello".to_string()); assert!(a.is_heap_allocated()); let b = a.clone(); // when cloning the CompactString we should inline it assert!(!b.is_heap_allocated()); } compact_str-0.7.1/src/traits.rs000064400000000000000000000335231046102023000146200ustar 00000000000000use core::fmt::{ self, Write, }; use core::num; use castaway::{ match_type, LifetimeFree, }; use super::repr::{ IntoRepr, Repr, }; use crate::CompactString; /// A trait for converting a value to a `CompactString`. /// /// This trait is automatically implemented for any type which implements the /// [`fmt::Display`] trait. As such, [`ToCompactString`] shouldn't be implemented directly: /// [`fmt::Display`] should be implemented instead, and you get the [`ToCompactString`] /// implementation for free. pub trait ToCompactString { /// Converts the given value to a [`CompactString`]. 
///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// use compact_str::ToCompactString;
    /// # use compact_str::CompactString;
    ///
    /// let i = 5;
    /// let five = CompactString::new("5");
    ///
    /// assert_eq!(i.to_compact_string(), five);
    /// ```
    fn to_compact_string(&self) -> CompactString;
}

/// # Safety
///
/// * [`CompactString`] does not contain any lifetime
/// * [`CompactString`] is 'static
/// * [`CompactString`] is a container to `u8`, which is `LifetimeFree`.
unsafe impl LifetimeFree for CompactString {}
// NOTE(review): `Repr` is the value wrapped by `CompactString` (see `CompactString(repr)` below),
// so presumably the same argument applies to it — confirm against the definition in `repr`.
unsafe impl LifetimeFree for Repr {}

/// # Panics
///
/// In this implementation, the `to_compact_string` method panics if the `Display` implementation
/// returns an error. This indicates an incorrect `Display` implementation since
/// `std::fmt::Write for CompactString` never returns an error itself.
///
/// # Note
///
/// We use the [`castaway`] crate to provide zero-cost specialization for several types, those are:
/// * `u8`, `u16`, `u32`, `u64`, `u128`, `usize`
/// * `i8`, `i16`, `i32`, `i64`, `i128`, `isize`
/// * `NonZeroU*`, `NonZeroI*`
/// * `bool`
/// * `char`
/// * `String`, `CompactString`
/// * `f32`, `f64`
///     * For floats we use [`ryu`] crate which sometimes provides different formatting than [`std`]
impl<T: fmt::Display> ToCompactString for T {
    #[inline]
    fn to_compact_string(&self) -> CompactString {
        // `match_type!` selects an arm by the concrete type of `T`. Every listed type is turned
        // into a `Repr` directly; any other type falls through to the last arm and is formatted
        // through its `Display` implementation.
        let repr = match_type!(self, {
            &u8 as s => s.into_repr(),
            &i8 as s => s.into_repr(),
            &u16 as s => s.into_repr(),
            &i16 as s => s.into_repr(),
            &u32 as s => s.into_repr(),
            &i32 as s => s.into_repr(),
            &u64 as s => s.into_repr(),
            &i64 as s => s.into_repr(),
            &u128 as s => s.into_repr(),
            &i128 as s => s.into_repr(),
            &usize as s => s.into_repr(),
            &isize as s => s.into_repr(),
            &f32 as s => s.into_repr(),
            &f64 as s => s.into_repr(),
            &bool as s => s.into_repr(),
            &char as s => s.into_repr(),
            &String as s => Repr::new(s),
            &CompactString as s => Repr::new(s),
            &num::NonZeroU8 as s => s.into_repr(),
            &num::NonZeroI8 as s => s.into_repr(),
            &num::NonZeroU16 as s => s.into_repr(),
            &num::NonZeroI16 as s => s.into_repr(),
            &num::NonZeroU32 as s => s.into_repr(),
            &num::NonZeroI32 as s => s.into_repr(),
            &num::NonZeroU64 as s => s.into_repr(),
            &num::NonZeroI64 as s => s.into_repr(),
            &num::NonZeroUsize as s => s.into_repr(),
            &num::NonZeroIsize as s => s.into_repr(),
            &num::NonZeroU128 as s => s.into_repr(),
            &num::NonZeroI128 as s => s.into_repr(),
            s => {
                // Generic path: this arm already produces a finished `CompactString`, so it
                // returns early instead of yielding a `Repr` like the arms above.
                let mut c = CompactString::new_inline("");
                write!(&mut c, "{}", s).expect("fmt::Display incorrectly implemented!");
                return c;
            }
        });

        CompactString(repr)
    }
}

/// A trait that provides convenience methods for creating a [`CompactString`] from a collection of
/// items. It is implemented for all types that can be converted into an iterator, and that iterator
/// yields types that can be converted into a `str`.
///
/// i.e. `C: IntoIterator<Item = AsRef<str>>`.
///
/// # Concatenate and Join
/// Two methods that this trait provides are `concat_compact(...)` and `join_compact(...)`
/// ```
/// use compact_str::CompactStringExt;
///
/// let words = vec!["☀️", "🌕", "🌑", "☀️"];
///
/// // directly concatenate all the words together
/// let concat = words.concat_compact();
/// assert_eq!(concat, "☀️🌕🌑☀️");
///
/// // join the words, with a separator
/// let join = words.join_compact(" ➡️ ");
/// assert_eq!(join, "☀️ ➡️ 🌕 ➡️ 🌑 ➡️ ☀️");
/// ```
pub trait CompactStringExt {
    /// Concatenates all the items of a collection into a [`CompactString`]
    ///
    /// # Example
    /// ```
    /// use compact_str::CompactStringExt;
    ///
    /// let items = ["hello", " ", "world", "!"];
    /// let compact = items.concat_compact();
    ///
    /// assert_eq!(compact, "hello world!");
    /// ```
    fn concat_compact(&self) -> CompactString;

    /// Joins all the items of a collection, placing a separator between them, forming a
    /// [`CompactString`]
    ///
    /// # Example
    /// ```
    /// use compact_str::CompactStringExt;
    ///
    /// let fruits = vec!["apples", "oranges", "bananas"];
    /// let compact = fruits.join_compact(", ");
    ///
    /// assert_eq!(compact, "apples, oranges, bananas");
    /// ```
    fn join_compact<S: AsRef<str>>(&self, seperator: S) -> CompactString;
}

impl<I, C> CompactStringExt for C
where
    I: AsRef<str>,
    for<'a> &'a C: IntoIterator<Item = &'a I>,
{
    fn concat_compact(&self) -> CompactString {
        self.into_iter()
            .fold(CompactString::new_inline(""), |mut s, item| {
                s.push_str(item.as_ref());
                s
            })
    }

    fn join_compact<S: AsRef<str>>(&self, seperator: S) -> CompactString {
        let sep = seperator.as_ref();
        let mut items = self.into_iter();
        let mut joined = CompactString::new_inline("");

        // Push the first item on its own, then prefix every later item with the separator.
        // The separator therefore only ever appears *between* items, and an empty collection
        // yields an empty string.
        if let Some(first) = items.next() {
            joined.push_str(first.as_ref());
            for item in items {
                joined.push_str(sep);
                joined.push_str(item.as_ref());
            }
        }

        joined
    }
}

#[cfg(test)]
mod tests {
    use core::num;

    use proptest::prelude::*;
    use test_strategy::proptest;

    use super::{
        CompactStringExt,
        ToCompactString,
    };
    use crate::CompactString;

    #[test]
    fn test_join() {
        let slice = ["hello", "world"];
        let c = slice.join_compact(" ");
        assert_eq!(c, "hello world");

        let vector = vec!["🍎", "🍊", "🍌"];
        let c = vector.join_compact(",");
        assert_eq!(c, "🍎,🍊,🍌");
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_join(items: Vec<String>, seperator: String) {
        let c: CompactString = items.join_compact(&seperator);
        let s: String = items.join(&seperator);
        assert_eq!(c, s);
    }

    #[test]
    fn test_concat() {
        // Exercise `concat_compact` itself: no separator may appear between the items.
        let items = vec!["hello", "world"];
        let c = items.concat_compact();
        assert_eq!(c, "helloworld");

        let vector = vec!["🍎", "🍊", "🍌"];
        let c = vector.concat_compact();
        assert_eq!(c, "🍎🍊🍌");
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_concat(items: Vec<String>) {
        let c: CompactString = items.concat_compact();
        let s: String = items.concat();
        assert_eq!(c, s);
    }

    // The integer tests below compare `to_compact_string` against `ToString` for arbitrary values.

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_u8(val: u8) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_i8(val: i8) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_u16(val: u16) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_i16(val: i16) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_u32(val: u32) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_i32(val: i32) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_u64(val: u64) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_i64(val: i64) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_usize(val: usize) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_isize(val: isize) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_u128(val: u128) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_i128(val: i128) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    // Non-zero strategies draw from `1..=MAX` of the unsigned type, so the checked constructor
    // can never observe zero. For the signed variants the value is reinterpreted with `as`: a
    // non-zero bit pattern stays non-zero, and the upper half of the range maps onto the negatives.

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_non_zero_u8(
        #[strategy((1..=u8::MAX).prop_map(|x| num::NonZeroU8::new(x).expect("range excludes zero")))]
        val: num::NonZeroU8,
    ) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_non_zero_u16(
        #[strategy((1..=u16::MAX).prop_map(|x| num::NonZeroU16::new(x).expect("range excludes zero")))]
        val: num::NonZeroU16,
    ) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_non_zero_u32(
        #[strategy((1..=u32::MAX).prop_map(|x| num::NonZeroU32::new(x).expect("range excludes zero")))]
        val: num::NonZeroU32,
    ) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_non_zero_u64(
        #[strategy((1..=u64::MAX).prop_map(|x| num::NonZeroU64::new(x).expect("range excludes zero")))]
        val: num::NonZeroU64,
    ) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_non_zero_u128(
        #[strategy((1..=u128::MAX).prop_map(|x| num::NonZeroU128::new(x).expect("range excludes zero")))]
        val: num::NonZeroU128,
    ) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_non_zero_usize(
        #[strategy((1..=usize::MAX).prop_map(|x| num::NonZeroUsize::new(x).expect("range excludes zero")))]
        val: num::NonZeroUsize,
    ) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_non_zero_i8(
        #[strategy((1..=u8::MAX).prop_map(|x| num::NonZeroI8::new(x as i8).expect("range excludes zero")))]
        val: num::NonZeroI8,
    ) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_non_zero_i16(
        #[strategy((1..=u16::MAX).prop_map(|x| num::NonZeroI16::new(x as i16).expect("range excludes zero")))]
        val: num::NonZeroI16,
    ) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_non_zero_i32(
        #[strategy((1..=u32::MAX).prop_map(|x| num::NonZeroI32::new(x as i32).expect("range excludes zero")))]
        val: num::NonZeroI32,
    ) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_non_zero_i64(
        #[strategy((1..=u64::MAX).prop_map(|x| num::NonZeroI64::new(x as i64).expect("range excludes zero")))]
        val: num::NonZeroI64,
    ) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_non_zero_i128(
        #[strategy((1..=u128::MAX).prop_map(|x| num::NonZeroI128::new(x as i128).expect("range excludes zero")))]
        val: num::NonZeroI128,
    ) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }

    #[proptest]
    #[cfg_attr(miri, ignore)]
    fn proptest_to_compact_string_non_zero_isize(
        #[strategy((1..=usize::MAX).prop_map(|x| num::NonZeroIsize::new(x as isize).expect("range excludes zero")))]
        val: num::NonZeroIsize,
    ) {
        let compact = val.to_compact_string();
        prop_assert_eq!(compact.as_str(), val.to_string());
    }
}