potential_utf-0.1.2/.cargo_vcs_info.json0000644000000001610000000000100137070ustar { "git": { "sha1": "f4290a877dfcb0f87cad6de4abdd65f0cbb33c9c" }, "path_in_vcs": "utils/potential_utf" }potential_utf-0.1.2/Cargo.lock0000644000000067030000000000100116720ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "bincode" version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" dependencies = [ "serde", ] [[package]] name = "databake" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef" dependencies = [ "proc-macro2", "quote", ] [[package]] name = "itoa" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "memchr" version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "potential_utf" version = "0.1.2" dependencies = [ "bincode", "databake", "serde", "serde_json", "writeable", "zerovec", ] [[package]] name = "proc-macro2" version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] [[package]] name = "ryu" version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" [[package]] name = "serde" version = "1.0.218" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.218" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "serde_json" version = "1.0.139" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44f86c3acccc9c65b153fe1b85a3be07fe5515274ec9f0653b4a0875731c72a6" dependencies = [ "itoa", "memchr", "ryu", "serde", ] [[package]] name = "syn" version = "2.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "unicode-ident" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00e2473a93778eb0bad35909dff6a10d28e63f792f16ed15e404fca9d5eeedbe" [[package]] name = "writeable" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74b3b5b7c6114bf7253093603034e102d479ecc8501deca33b6c1c816418b6d2" [[package]] name = "zerofrom" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" [[package]] name = "zerovec" 
version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94e62113720e311984f461c56b00457ae9981c0bc7859d22306cc2ae2f95571c" dependencies = [ "zerofrom", ] potential_utf-0.1.2/Cargo.toml0000644000000033450000000000100117140ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.81" name = "potential_utf" version = "0.1.2" authors = ["The ICU4X Project Developers"] build = false include = [ "data/**/*", "src/**/*", "examples/**/*", "benches/**/*", "tests/**/*", "Cargo.toml", "LICENSE", "README.md", ] autolib = false autobins = false autoexamples = false autotests = false autobenches = false description = "Unvalidated string and character types" homepage = "https://icu4x.unicode.org" readme = "README.md" categories = ["internationalization"] license = "Unicode-3.0" repository = "https://github.com/unicode-org/icu4x" [features] alloc = [ "serde?/alloc", "zerovec?/alloc", ] databake = ["dep:databake"] serde = ["dep:serde"] writeable = [ "dep:writeable", "alloc", ] zerovec = ["dep:zerovec"] [lib] name = "potential_utf" path = "src/lib.rs" [dependencies.databake] version = "0.2.0" optional = true default-features = false [dependencies.serde] version = "1.0.110" optional = true default-features = false [dependencies.writeable] version = "0.6.0" optional = true default-features = false [dependencies.zerovec] version = "0.11.1" optional = true default-features = false [dev-dependencies.bincode] version = "1.3.1" [dev-dependencies.serde_json] version = "1.0.45" potential_utf-0.1.2/Cargo.toml.orig000064400000000000000000000017261046102023000153760ustar 00000000000000# This file is part of ICU4X. For terms of use, please see the file # called LICENSE at the top level of the ICU4X source tree # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). [package] name = "potential_utf" description = "Unvalidated string and character types" version = "0.1.2" rust-version.workspace = true authors.workspace = true edition.workspace = true repository.workspace = true homepage.workspace = true license.workspace = true categories.workspace = true include.workspace = true [dependencies] databake = { workspace = true, optional = true } serde = { workspace = true, optional = true } zerovec = { workspace = true, optional = true } writeable = { workspace = true, optional = true } [dev-dependencies] serde_json = { workspace = true } bincode = { workspace = true } [features] alloc = ["serde?/alloc", "zerovec?/alloc"] databake = ["dep:databake"] serde = ["dep:serde"] writeable = ["dep:writeable", "alloc"] zerovec = ["dep:zerovec"] potential_utf-0.1.2/LICENSE000064400000000000000000000042231046102023000135070ustar 00000000000000UNICODE LICENSE V3 COPYRIGHT AND PERMISSION NOTICE Copyright © 2020-2024 Unicode, Inc. NOTICE TO USER: Carefully read the following legal agreement. BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. 
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.

Permission is hereby granted, free of charge, to any person obtaining a copy of data files and any associated documentation (the "Data Files") or software and any associated documentation (the "Software") to deal in the Data Files or Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies of the Data Files or Software, and to permit persons to whom the Data Files or Software are furnished to do so, provided that either (a) this copyright and permission notice appear with all copies of the Data Files or Software, or (b) this copyright and permission notice appear in associated Documentation.

THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE.

Except as contained in this notice, the name of a copyright holder shall not be used in advertising or otherwise to promote the sale, use or other dealings in these Data Files or Software without prior written authorization of the copyright holder.

SPDX-License-Identifier: Unicode-3.0

----------

Portions of ICU4X may have been adapted from ICU4C and/or ICU4J. ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.
potential_utf-0.1.2/README.md000064400000000000000000000006141046102023000137610ustar 00000000000000
# potential_utf

[![crates.io](https://img.shields.io/crates/v/potential_utf)](https://crates.io/crates/potential_utf)

A crate providing unvalidated string and character types.

## More Information

For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x).
potential_utf-0.1.2/src/lib.rs000064400000000000000000000015011046102023000144010ustar 00000000000000
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
    not(test),
    deny(
        clippy::indexing_slicing,
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::panic,
        clippy::exhaustive_structs,
        clippy::exhaustive_enums,
        clippy::trivially_copy_pass_by_ref,
        missing_debug_implementations,
    )
)]

//! A crate providing unvalidated string and character types.

#[cfg(feature = "alloc")]
extern crate alloc;

mod uchar;
mod ustr;

pub use uchar::PotentialCodePoint;
pub use ustr::PotentialUtf16;
pub use ustr::PotentialUtf8;

#[cfg(feature = "writeable")]
mod writeable;
potential_utf-0.1.2/src/uchar.rs000064400000000000000000000257041046102023000147500ustar 00000000000000
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::cmp::Ordering;
use core::fmt;

/// A 24-bit numeric data type that is expected to be a Unicode scalar value, but is not
/// validated as such.
///
/// Use this type instead of `char` when you want to deal with data that is expected to be valid
/// Unicode scalar values, but you want control over when or if you validate that assumption.
///
/// # Examples
///
/// ```
/// use potential_utf::PotentialCodePoint;
///
/// assert_eq!(PotentialCodePoint::from_u24(0x68).try_to_char(), Ok('h'));
/// assert_eq!(PotentialCodePoint::from_char('i').try_to_char(), Ok('i'));
/// assert_eq!(
///     PotentialCodePoint::from_u24(0x1F44B).try_to_char(),
///     Ok('👋')
/// );
///
/// assert!(PotentialCodePoint::from_u24(0xDE01).try_to_char().is_err());
/// assert_eq!(
///     PotentialCodePoint::from_u24(0xDE01).to_char_lossy(),
///     char::REPLACEMENT_CHARACTER
/// );
/// ```
#[repr(transparent)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
pub struct PotentialCodePoint([u8; 3]);

impl PotentialCodePoint {
    /// Create a [`PotentialCodePoint`] from a `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(a.try_to_char().unwrap(), 'a');
    /// ```
    #[inline]
    pub const fn from_char(c: char) -> Self {
        let [u0, u1, u2, _u3] = (c as u32).to_le_bytes();
        Self([u0, u1, u2])
    }

    /// Create a [`PotentialCodePoint`] from a u32 value, ignoring the most significant 8 bits.
    #[inline]
    pub const fn from_u24(c: u32) -> Self {
        let [u0, u1, u2, _u3] = c.to_le_bytes();
        Self([u0, u1, u2])
    }

    /// Attempt to convert a [`PotentialCodePoint`] to a `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    /// use zerovec::ule::AsULE;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(a.try_to_char(), Ok('a'));
    ///
    /// let b = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert!(matches!(b.try_to_char(), Err(_)));
    /// ```
    #[inline]
    pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> {
        char::try_from(u32::from(self))
    }

    /// Convert a [`PotentialCodePoint`] to a `char`, returning [`char::REPLACEMENT_CHARACTER`]
    /// if the `PotentialCodePoint` does not represent a valid Unicode scalar value.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    /// use zerovec::ule::AsULE;
    ///
    /// let a = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER);
    /// ```
    #[inline]
    pub fn to_char_lossy(self) -> char {
        self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
    }

    /// Convert a [`PotentialCodePoint`] to a `char` without checking that it is
    /// a valid Unicode scalar value.
    ///
    /// # Safety
    ///
    /// The `PotentialCodePoint` must be a valid Unicode scalar value in little-endian order.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a');
    /// ```
    #[inline]
    pub unsafe fn to_char_unchecked(self) -> char {
        char::from_u32_unchecked(u32::from(self))
    }

    /// For converting to the ULE type in a const context
    ///
    /// Can be removed once const traits are a thing
    #[inline]
    #[cfg(feature = "zerovec")]
    pub const fn to_unaligned(self) -> zerovec::ule::RawBytesULE<3> {
        zerovec::ule::RawBytesULE(self.0)
    }
}

/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
impl zerovec::ule::AsULE for PotentialCodePoint {
    type ULE = zerovec::ule::RawBytesULE<3>;

    #[inline]
    fn to_unaligned(self) -> Self::ULE {
        zerovec::ule::RawBytesULE(self.0)
    }

    #[inline]
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        Self(unaligned.0)
    }
}

// Safety: PotentialCodePoint is always the little-endian representation of a char,
// which corresponds to its AsULE::ULE type
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::EqULE for PotentialCodePoint {}

impl fmt::Debug for PotentialCodePoint {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug as a char if possible
        match self.try_to_char() {
            Ok(c) => fmt::Debug::fmt(&c, f),
            Err(_) => fmt::Debug::fmt(&self.0, f),
        }
    }
}

impl PartialOrd for PotentialCodePoint {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq<char> for PotentialCodePoint {
    fn eq(&self, other: &char) -> bool {
        self.eq(&Self::from_char(*other))
    }
}

impl PartialOrd<char> for PotentialCodePoint {
    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
        self.partial_cmp(&Self::from_char(*other))
    }
}

impl PartialEq<PotentialCodePoint> for char {
    fn eq(&self, other: &PotentialCodePoint) -> bool {
        PotentialCodePoint::from_char(*self).eq(other)
    }
}

impl PartialOrd<PotentialCodePoint> for char {
    fn partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering> {
        PotentialCodePoint::from_char(*self).partial_cmp(other)
    }
}

impl Ord for PotentialCodePoint {
    // custom implementation, as derived Ord would compare lexicographically
    fn cmp(&self, other: &Self) -> Ordering {
        let a = u32::from(*self);
        let b = u32::from(*other);
        a.cmp(&b)
    }
}

impl From<PotentialCodePoint> for u32 {
    fn from(x: PotentialCodePoint) -> Self {
        let [a0, a1, a2] = x.0;
        u32::from_le_bytes([a0, a1, a2, 0])
    }
}

impl TryFrom<u32> for PotentialCodePoint {
    type Error = ();
    fn try_from(x: u32) -> Result<Self, Self::Error> {
        let [u0, u1, u2, u3] = x.to_le_bytes();
        if u3 != 0 {
            return Err(());
        }
        Ok(Self([u0, u1, u2]))
    }
}

impl From<char> for PotentialCodePoint {
    #[inline]
    fn from(value: char) -> Self {
        Self::from_char(value)
    }
}

impl TryFrom<PotentialCodePoint> for char {
    type Error = core::char::CharTryFromError;

    #[inline]
    fn try_from(value: PotentialCodePoint) -> Result<Self, Self::Error> {
        value.try_to_char()
    }
}

/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde::Serialize for PotentialCodePoint {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        let c = self
            .try_to_char()
            .map_err(|_| S::Error::custom("invalid Unicode scalar value in PotentialCodePoint"))?;
        if serializer.is_human_readable() {
            serializer.serialize_char(c)
        } else {
            self.0.serialize(serializer)
        }
    }
}

/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for PotentialCodePoint {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let c = <char>::deserialize(deserializer)?;
            Ok(PotentialCodePoint::from_char(c))
        } else {
            let bytes = <[u8; 3]>::deserialize(deserializer)?;
            Ok(PotentialCodePoint(bytes))
        }
    }
}

/// This impl requires enabling the optional `databake` Cargo feature
#[cfg(feature = "databake")]
impl databake::Bake for PotentialCodePoint {
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        match self.try_to_char() {
            Ok(ch) => {
                env.insert("potential_utf");
                let ch = ch.bake(env);
                databake::quote! {
                    potential_utf::PotentialCodePoint::from_char(#ch)
                }
            }
            Err(_) => {
                env.insert("potential_utf");
                let u24 = u32::from_le_bytes([self.0[0], self.0[1], self.0[2], 0]);
                databake::quote! {
                    potential_utf::PotentialCodePoint::from_u24(#u24)
                }
            }
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use zerovec::ZeroVec;

    #[test]
    fn test_serde_fail() {
        let uc = PotentialCodePoint([0xFF, 0xFF, 0xFF]);
        serde_json::to_string(&uc).expect_err("serialize invalid char bytes");
        bincode::serialize(&uc).expect_err("serialize invalid char bytes");
    }

    #[test]
    fn test_serde_json() {
        let c = '🙃';
        let uc = PotentialCodePoint::from_char(c);
        let json_ser = serde_json::to_string(&uc).unwrap();
        assert_eq!(json_ser, r#""🙃""#);
        let json_de: PotentialCodePoint = serde_json::from_str(&json_ser).unwrap();
        assert_eq!(uc, json_de);
    }

    #[test]
    fn test_serde_bincode() {
        let c = '🙃';
        let uc = PotentialCodePoint::from_char(c);
        let bytes_ser = bincode::serialize(&uc).unwrap();
        assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]);
        let bytes_de: PotentialCodePoint = bincode::deserialize(&bytes_ser).unwrap();
        assert_eq!(uc, bytes_de);
    }

    #[test]
    fn test_representation() {
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];

        // backed by [PotentialCodePoint]
        let uvchars: Vec<_> = chars
            .iter()
            .copied()
            .map(PotentialCodePoint::from_char)
            .collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = uvchars.clone().into_iter().collect();

        let ule_bytes = zvec.as_bytes();
        let uvbytes;
        unsafe {
            let ptr = &uvchars[..] as *const _ as *const u8;
            uvbytes = core::slice::from_raw_parts(ptr, ule_bytes.len());
        }

        // PotentialCodePoint is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(uvbytes, ule_bytes);
        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            ule_bytes
        );
    }

    #[test]
    fn test_char_bake() {
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_char('b'),
            potential_utf
        );
        // surrogate code point
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_u24(55296u32),
            potential_utf
        );
    }
}
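
// Illustrative sketch (not part of the original crate): an additional test
// module demonstrating the documented ordering and 24-bit range behavior of
// PotentialCodePoint, using only the public API above and no optional Cargo
// features. The module name `example_usage` is hypothetical.
#[cfg(test)]
mod example_usage {
    use super::PotentialCodePoint;

    #[test]
    fn test_ordering_and_u32_round_trip() {
        let a = PotentialCodePoint::from_char('a');
        // Ord/PartialOrd compare by code point value, and comparisons with
        // `char` work in both directions.
        assert!(a < PotentialCodePoint::from_char('z'));
        assert!(a < 'b');
        assert!('A' < a);
        // Conversion to u32 recovers the code point; values above 0xFF_FFFF do
        // not fit in 24 bits and are rejected by TryFrom<u32>.
        assert_eq!(u32::from(a), 0x61);
        assert!(PotentialCodePoint::try_from(0x10_FFFF_u32).is_ok());
        assert!(PotentialCodePoint::try_from(0x0100_0000_u32).is_err());
    }
}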
potential_utf-0.1.2/src/ustr.rs000064400000000000000000000204741046102023000146420ustar 00000000000000
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

#[cfg(feature = "alloc")]
use alloc::boxed::Box;
use core::cmp::Ordering;
use core::fmt;
use core::ops::Deref;

/// A byte slice that is expected to be a UTF-8 string but does not enforce that invariant.
///
/// Use this type instead of `str` if you don't need to enforce UTF-8 during deserialization. For
/// example, strings that are keys of a map don't need to ever be reified as `str`s.
///
/// [`PotentialUtf8`] derefs to `[u8]`. To obtain a `str`, use [`Self::try_as_str()`].
///
/// The main advantage of this type over `[u8]` is that it serializes as a string in
/// human-readable formats like JSON.
///
/// # Examples
///
/// Using a [`PotentialUtf8`] as the key of a [`ZeroMap`]:
///
/// ```
/// use potential_utf::PotentialUtf8;
/// use zerovec::ZeroMap;
///
/// // This map is cheap to deserialize, as we don't need to perform UTF-8 validation.
/// let map: ZeroMap<PotentialUtf8, usize> = [
///     (PotentialUtf8::from_bytes(b"abc"), 11),
///     (PotentialUtf8::from_bytes(b"def"), 22),
///     (PotentialUtf8::from_bytes(b"ghi"), 33),
/// ]
/// .into_iter()
/// .collect();
///
/// let key = "abc";
/// let value = map.get_copied(PotentialUtf8::from_str(key));
/// assert_eq!(Some(11), value);
/// ```
///
/// [`ZeroMap`]: zerovec::ZeroMap
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct PotentialUtf8(pub [u8]);

impl fmt::Debug for PotentialUtf8 {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug as a string if possible
        match self.try_as_str() {
            Ok(s) => fmt::Debug::fmt(s, f),
            Err(_) => fmt::Debug::fmt(&self.0, f),
        }
    }
}

impl PotentialUtf8 {
    /// Create a [`PotentialUtf8`] from a byte slice.
    #[inline]
    pub const fn from_bytes(other: &[u8]) -> &Self {
        // Safety: PotentialUtf8 is transparent over [u8]
        unsafe { core::mem::transmute(other) }
    }

    /// Create a [`PotentialUtf8`] from a string slice.
    #[inline]
    pub const fn from_str(s: &str) -> &Self {
        Self::from_bytes(s.as_bytes())
    }

    /// Create a [`PotentialUtf8`] from boxed bytes.
    #[inline]
    #[cfg(feature = "alloc")]
    pub fn from_boxed_bytes(other: Box<[u8]>) -> Box<Self> {
        // Safety: PotentialUtf8 is transparent over [u8]
        unsafe { core::mem::transmute(other) }
    }

    /// Create a [`PotentialUtf8`] from a boxed `str`.
    #[inline]
    #[cfg(feature = "alloc")]
    pub fn from_boxed_str(other: Box<str>) -> Box<Self> {
        Self::from_boxed_bytes(other.into_boxed_bytes())
    }

    /// Get the bytes from a [`PotentialUtf8`].
    #[inline]
    pub const fn as_bytes(&self) -> &[u8] {
        &self.0
    }

    /// Attempt to convert a [`PotentialUtf8`] to a `str`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialUtf8;
    ///
    /// static A: &PotentialUtf8 = PotentialUtf8::from_bytes(b"abc");
    ///
    /// let b = A.try_as_str().unwrap();
    /// assert_eq!(b, "abc");
    /// ```
    // Note: this is const starting in 1.63
    #[inline]
    pub fn try_as_str(&self) -> Result<&str, core::str::Utf8Error> {
        core::str::from_utf8(&self.0)
    }
}

impl<'a> From<&'a str> for &'a PotentialUtf8 {
    #[inline]
    fn from(other: &'a str) -> Self {
        PotentialUtf8::from_str(other)
    }
}

impl PartialEq<str> for PotentialUtf8 {
    fn eq(&self, other: &str) -> bool {
        self.eq(Self::from_str(other))
    }
}

impl PartialOrd<str> for PotentialUtf8 {
    fn partial_cmp(&self, other: &str) -> Option<Ordering> {
        self.partial_cmp(Self::from_str(other))
    }
}

impl PartialEq<PotentialUtf8> for str {
    fn eq(&self, other: &PotentialUtf8) -> bool {
        PotentialUtf8::from_str(self).eq(other)
    }
}

impl PartialOrd<PotentialUtf8> for str {
    fn partial_cmp(&self, other: &PotentialUtf8) -> Option<Ordering> {
        PotentialUtf8::from_str(self).partial_cmp(other)
    }
}

#[cfg(feature = "alloc")]
impl From<Box<str>> for Box<PotentialUtf8> {
    #[inline]
    fn from(other: Box<str>) -> Self {
        PotentialUtf8::from_boxed_str(other)
    }
}

impl Deref for PotentialUtf8 {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(all(feature = "zerovec", feature = "alloc"))]
impl<'a> zerovec::maps::ZeroMapKV<'a> for PotentialUtf8 {
    type Container = zerovec::VarZeroVec<'a, PotentialUtf8>;
    type Slice = zerovec::VarZeroSlice<PotentialUtf8>;
    type GetType = PotentialUtf8;
    type OwnedType = Box<PotentialUtf8>;
}

// Safety (based on the safety checklist on the VarULE trait):
// 1. PotentialUtf8 does not include any uninitialized or padding bytes (transparent over a ULE)
// 2. PotentialUtf8 is aligned to 1 byte (transparent over a ULE)
// 3. The impl of `validate_bytes()` returns an error if any byte is not valid (impossible)
// 4. The impl of `validate_bytes()` returns an error if the slice cannot be used in its entirety (impossible)
// 5. The impl of `from_bytes_unchecked()` returns a reference to the same data (returns the argument directly)
// 6. All other methods are defaulted
// 7. `[T]` byte equality is semantic equality (transparent over a ULE)
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::VarULE for PotentialUtf8 {
    #[inline]
    fn validate_bytes(_: &[u8]) -> Result<(), zerovec::ule::UleError> {
        Ok(())
    }

    #[inline]
    unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self {
        PotentialUtf8::from_bytes(bytes)
    }
}

/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde::Serialize for PotentialUtf8 {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        let s = self
            .try_as_str()
            .map_err(|_| S::Error::custom("invalid UTF-8 in PotentialUtf8"))?;
        if serializer.is_human_readable() {
            serializer.serialize_str(s)
        } else {
            serializer.serialize_bytes(s.as_bytes())
        }
    }
}

/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(all(feature = "serde", feature = "alloc"))]
impl<'de> serde::Deserialize<'de> for Box<PotentialUtf8> {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let boxed_str = Box::<str>::deserialize(deserializer)?;
            Ok(PotentialUtf8::from_boxed_str(boxed_str))
        } else {
            let boxed_bytes = Box::<[u8]>::deserialize(deserializer)?;
            Ok(PotentialUtf8::from_boxed_bytes(boxed_bytes))
        }
    }
}

/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de, 'a> serde::Deserialize<'de> for &'a PotentialUtf8
where
    'de: 'a,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let s = <&str>::deserialize(deserializer)?;
            Ok(PotentialUtf8::from_str(s))
        } else {
            let bytes = <&[u8]>::deserialize(deserializer)?;
            Ok(PotentialUtf8::from_bytes(bytes))
        }
    }
}

#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct PotentialUtf16(pub [u16]);

impl fmt::Debug for PotentialUtf16 {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug as a string if possible
        for c in char::decode_utf16(self.0.iter().copied()) {
            match c {
                Ok(c) => write!(f, "{c}")?,
                Err(e) => write!(f, "\\0x{:x}", e.unpaired_surrogate())?,
            }
        }
        Ok(())
    }
}

impl PotentialUtf16 {
    /// Create a [`PotentialUtf16`] from a u16 slice.
    #[inline]
    pub const fn from_slice(other: &[u16]) -> &Self {
        // Safety: PotentialUtf16 is transparent over [u16]
        unsafe { core::mem::transmute(other) }
    }
}
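
// Illustrative sketch (not part of the original crate): a small test module
// exercising the documented PotentialUtf8 and PotentialUtf16 behavior without
// any optional Cargo features. The module name `example_usage` is hypothetical.
#[cfg(test)]
mod example_usage {
    use super::*;

    #[test]
    fn test_potential_utf8_basics() {
        let valid = PotentialUtf8::from_str("abc");
        assert_eq!(valid.try_as_str(), Ok("abc"));
        assert_eq!(valid, "abc");

        // Ill-formed bytes are stored as-is and only fail on conversion to &str.
        let invalid = PotentialUtf8::from_bytes(b"ab\xFF");
        assert!(invalid.try_as_str().is_err());
        assert_eq!(invalid.as_bytes(), b"ab\xFF");
    }

    #[test]
    fn test_potential_utf16_debug() {
        // Debug decodes UTF-16 lossily, escaping any unpaired surrogate.
        let s = PotentialUtf16::from_slice(&[0x68, 0x69, 0xD83E]);
        assert_eq!(format!("{s:?}"), "hi\\0xd83e");
    }
}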
potential_utf-0.1.2/src/writeable.rs000064400000000000000000000127701046102023000156230ustar 00000000000000
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::{PotentialUtf16, PotentialUtf8};
use alloc::borrow::Cow;
use core::fmt::Write;
use writeable::{LengthHint, Part, PartsWrite, TryWriteable};
use core::{char::DecodeUtf16Error, fmt, str::Utf8Error};

/// This impl requires enabling the optional `writeable` Cargo feature
impl TryWriteable for &'_ PotentialUtf8 {
    type Error = Utf8Error;

    fn try_write_to_parts<S: PartsWrite + ?Sized>(
        &self,
        sink: &mut S,
    ) -> Result<Result<(), Self::Error>, fmt::Error> {
        let mut remaining = &self.0;
        let mut r = Ok(());
        loop {
            match core::str::from_utf8(remaining) {
                Ok(valid) => {
                    sink.write_str(valid)?;
                    return Ok(r);
                }
                Err(e) => {
                    // SAFETY: By Utf8Error invariants
                    let valid = unsafe {
                        core::str::from_utf8_unchecked(remaining.get_unchecked(..e.valid_up_to()))
                    };
                    sink.write_str(valid)?;
                    sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?;

                    if r.is_ok() {
                        r = Err(e);
                    }

                    let Some(error_len) = e.error_len() else {
                        return Ok(r); // end of string
                    };
                    // SAFETY: By Utf8Error invariants
                    remaining = unsafe { remaining.get_unchecked(e.valid_up_to() + error_len..) }
                }
            }
        }
    }

    fn writeable_length_hint(&self) -> LengthHint {
        // Lower bound is all valid UTF-8, upper bound is all bytes with the high bit, which become replacement characters.
        LengthHint::between(self.0.len(), self.0.len() * 3)
    }

    fn try_write_to_string(&self) -> Result<Cow<str>, (Self::Error, Cow<str>)> {
        match core::str::from_utf8(&self.0) {
            Ok(valid) => Ok(Cow::Borrowed(valid)),
            Err(e) => {
                // SAFETY: By Utf8Error invariants
                let valid = unsafe {
                    core::str::from_utf8_unchecked(self.0.get_unchecked(..e.valid_up_to()))
                };
                // Let's assume this is the only error
                let mut out = alloc::string::String::with_capacity(
                    self.0.len() + char::REPLACEMENT_CHARACTER.len_utf8()
                        - e.error_len().unwrap_or(0),
                );
                out.push_str(valid);
                out.push(char::REPLACEMENT_CHARACTER);
                // If there's more, we can use `try_write_to`
                if let Some(error_len) = e.error_len() {
                    // SAFETY: By Utf8Error invariants
                    let remaining = unsafe { self.0.get_unchecked(e.valid_up_to() + error_len..) };
                    let _discard = PotentialUtf8::from_bytes(remaining).try_write_to(&mut out);
                }
                Err((e, Cow::Owned(out)))
            }
        }
    }
}

/// This impl requires enabling the optional `writeable` Cargo feature
impl TryWriteable for &'_ PotentialUtf16 {
    type Error = DecodeUtf16Error;

    fn try_write_to_parts<S: PartsWrite + ?Sized>(
        &self,
        sink: &mut S,
    ) -> Result<Result<(), Self::Error>, fmt::Error> {
        let mut r = Ok(());
        for c in core::char::decode_utf16(self.0.iter().copied()) {
            match c {
                Ok(c) => sink.write_char(c)?,
                Err(e) => {
                    if r.is_ok() {
                        r = Err(e);
                    }
                    sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?;
                }
            }
        }
        Ok(r)
    }

    fn writeable_length_hint(&self) -> LengthHint {
        // Lower bound is all ASCII, upper bound is all 3-byte code points (including replacement character)
        LengthHint::between(self.0.len(), self.0.len() * 3)
    }
}

#[cfg(test)]
mod test {
    #![allow(invalid_from_utf8)] // only way to construct the error
    use super::*;
    use writeable::assert_try_writeable_parts_eq;

    #[test]
    fn test_utf8() {
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(b"Foo Bar"),
            "Foo Bar",
            Ok(()),
            []
        );
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(b"Foo\xFDBar"),
            "Foo�Bar",
            Err(core::str::from_utf8(b"Foo\xFDBar").unwrap_err()),
            [(3, 6, Part::ERROR)]
        );
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(b"Foo\xFDBar\xff"),
            "Foo�Bar�",
            Err(core::str::from_utf8(b"Foo\xFDBar\xff").unwrap_err()),
            [(3, 6, Part::ERROR), (9, 12, Part::ERROR)],
        );
    }

    #[test]
    fn test_utf16() {
        assert_try_writeable_parts_eq!(
            PotentialUtf16::from_slice(&[0xD83E, 0xDD73]),
            "🥳",
            Ok(()),
            []
        );
        assert_try_writeable_parts_eq!(
            PotentialUtf16::from_slice(&[0xD83E, 0x20, 0xDD73]),
            "� �",
            Err(core::char::decode_utf16([0xD83E].into_iter())
                .next()
                .unwrap()
                .unwrap_err()),
            [(0, 3, Part::ERROR), (4, 7, Part::ERROR)]
        );
    }
}
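
// Illustrative sketch (not part of the original crate): shows how a caller
// might obtain a lossily repaired String from ill-formed UTF-8 through the
// TryWriteable impl above. The module name `lossy_example` is hypothetical.
#[cfg(test)]
mod lossy_example {
    use crate::PotentialUtf8;
    use writeable::TryWriteable;

    #[test]
    fn test_try_write_to_string_lossy() {
        // Well-formed input is borrowed unchanged.
        let ok = PotentialUtf8::from_bytes(b"Hello")
            .try_write_to_string()
            .unwrap();
        assert_eq!(&*ok, "Hello");

        // Ill-formed input yields the Utf8Error alongside a repaired, owned
        // copy in which the bad byte is replaced by U+FFFD.
        let (_err, repaired) = PotentialUtf8::from_bytes(b"Hi\xFF!")
            .try_write_to_string()
            .unwrap_err();
        assert_eq!(&*repaired, "Hi\u{FFFD}!");
    }
}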