pax_global_header00006660000000000000000000000064145734116510014521gustar00rootroot0000000000000052 comment=c79e291edb25022cfd638f86defa00e5350915b7 oxilangtag-0.1.5/000077500000000000000000000000001457341165100136615ustar00rootroot00000000000000oxilangtag-0.1.5/.github/000077500000000000000000000000001457341165100152215ustar00rootroot00000000000000oxilangtag-0.1.5/.github/dependabot.yml000066400000000000000000000002711457341165100200510ustar00rootroot00000000000000version: 2 updates: - package-ecosystem: cargo directory: "/" schedule: interval: weekly - package-ecosystem: "github-actions" directory: "/" schedule: interval: weekly oxilangtag-0.1.5/.github/workflows/000077500000000000000000000000001457341165100172565ustar00rootroot00000000000000oxilangtag-0.1.5/.github/workflows/build.yml000066400000000000000000000046271457341165100211110ustar00rootroot00000000000000name: build on: push: branches: - main pull_request: branches: - main jobs: fmt: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: rustup update && rustup component add rustfmt - run: cargo fmt -- --check clippy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: rustup update && rustup component add clippy - uses: Swatinem/rust-cache@v2 - run: cargo clippy --all-targets -- -D warnings -D clippy::all - run: cargo clippy --all-targets --all-features -- -D warnings -D clippy::all - run: cargo clippy --all-targets --no-default-features -- -D warnings -D clippy::all clippy_msrv: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: rustup update && rustup override set 1.63.0 && rustup component add clippy && rustup toolchain install nightly - uses: Swatinem/rust-cache@v2 - run: cargo +nightly update -Z direct-minimal-versions - run: cargo clippy -- -D warnings -D clippy::all - run: cargo clippy --all-features -- -D warnings -D clippy::all - run: cargo clippy --no-default-features -- -D warnings -D clippy::all test: runs-on: ubuntu-latest steps: - uses: 
actions/checkout@v3 - run: rustup update - uses: Swatinem/rust-cache@v2 - run: cargo build --all-features - run: cargo test - run: cargo test --all-features - run: cargo test --no-default-features rustdoc: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: rustup update - uses: Swatinem/rust-cache@v2 - run: cargo doc --all-features --no-deps env: RUSTDOCFLAGS: -D warnings deny: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: rustup update - uses: Swatinem/rust-cache@v2 - run: cargo install cargo-deny || true - run: cargo deny check semver_checks: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 with: submodules: true - run: rustup update - uses: Swatinem/rust-cache@v2 - run: cargo install cargo-semver-checks || true - run: cargo semver-checks check-release typos: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: Swatinem/rust-cache@v2 - run: cargo install typos-cli || true - run: typos oxilangtag-0.1.5/.gitignore000066400000000000000000000000311457341165100156430ustar00rootroot00000000000000/target Cargo.lock .idea oxilangtag-0.1.5/CHANGELOG.md000066400000000000000000000013031457341165100154670ustar00rootroot00000000000000# Changelog ## [0.1.5] - 2024-03-10 ### Changed - Moves back Serde feature from `dep:serde` to `serde`. ## [0.1.4] - 2024-03-04 ### Added - Support for `no_std`: Rust std usage is now behind the enabled by default `std` feature. ### Changed - Rust minimum supported version is set to 1.63. ## [0.1.3] - 2022-03-26 ### Added - `LanguageTag` now implements Serde `Serialize` and `Deserialize` trait if the `serde` crate is present. The serialization is a plain string. ## [0.1.2] - 2021-04-16 ### Added - `LanguageTag` struct with a parser, case normalization and components accessors. ### Changed - Proper attribution from [`language-tags`](https://github.com/pyfisch/rust-language-tags/). 
oxilangtag-0.1.5/Cargo.toml000066400000000000000000000013561457341165100156160ustar00rootroot00000000000000[package] name = "oxilangtag" version = "0.1.5" authors = [ "Tpt " ] license = "MIT" readme = "README.md" documentation = "https://docs.rs/oxilangtag" keywords = ["language-tag", "BCP47"] repository = "https://github.com/oxigraph/oxilangtag" description = """ Simple and fast implementation of language tag normalization and validation """ edition = "2021" rust-version = "1.63" [features] default = ["std"] std = ["serde?/std"] alloc = ["serde?/alloc"] serialize = ["serde"] [dependencies] serde = { version = "1.0.100", optional = true, default_features = false } [dev-dependencies] criterion = ">=0.4,<0.6" serde_test = "1" [[bench]] name = "lib" harness = false [package.metadata.docs.rs] all-features = true oxilangtag-0.1.5/LICENSE000066400000000000000000000020441457341165100146660ustar00rootroot00000000000000Copyright (c) 2015-2021 Pyfisch Tpt Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
oxilangtag-0.1.5/README.md000066400000000000000000000045671457341165100151540ustar00rootroot00000000000000oxilangtag ========== [![actions status](https://github.com/oxigraph/oxilangtag/workflows/build/badge.svg)](https://github.com/oxigraph/oxilangtag/actions) [![Latest Version](https://img.shields.io/crates/v/oxilangtag.svg)](https://crates.io/crates/oxilangtag) [![Released API docs](https://docs.rs/oxilangtag/badge.svg)](https://docs.rs/oxilangtag) OxiLangTag is a Rust library allowing to validate and normalize language tags following [RFC 5646](https://tools.ietf.org/html/rfc5646) ([BCP 47](https://tools.ietf.org/html/bcp47)). It is a fork of the [`language-tags`](https://github.com/pyfisch/rust-language-tags/) focusing on [RDF use cases](https://www.w3.org/TR/rdf11-primer/). You might find the [`language-tags`](https://github.com/pyfisch/rust-language-tags/) crate more convenient. It allows zero stack allocation language tag validation. Getters are also provided to easily retrieve the various language tag components. If [`serde`](https://serde.rs/) is available, `LanguageTag` implements the `Serialize` and `Deserialize` traits and encodes the language tag as a string. 
Example: ```rust use oxilangtag::LanguageTag; // Parsing and validation let language_tag = LanguageTag::parse("zh-cmn-Hans-CN-x-test").unwrap(); assert_eq!(language_tag.as_str(), "zh-cmn-Hans-CN-x-test"); // Language tag components assert_eq!(language_tag.primary_language(), "zh"); assert_eq!(language_tag.extended_language(), Some("cmn")); assert_eq!(language_tag.full_language(), "zh-cmn"); assert_eq!(language_tag.script(), Some("Hans")); assert_eq!(language_tag.region(), Some("CN")); assert_eq!(language_tag.extension(), None); assert_eq!(language_tag.private_use_subtags().collect::>(), vec!["test"]); ``` It is also possible to use this crate in `no_std` (with `alloc`) by opting-out of the default `std` feature: ```toml serde = { version = "*", default-features = false } ``` ## License This project is licensed under the MIT license ([LICENSE-MIT](LICENSE-MIT) or ``). It is based on the [`language-tags`](https://github.com/pyfisch/rust-language-tags/) crate by [pyfisch](https://github.com/pyfisch) under MIT license. ### Contribution Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in Oxilangtag by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. 
oxilangtag-0.1.5/benches/000077500000000000000000000000001457341165100152705ustar00rootroot00000000000000oxilangtag-0.1.5/benches/lib.rs000066400000000000000000000033211457341165100164030ustar00rootroot00000000000000use criterion::{criterion_group, criterion_main, Criterion}; use oxilangtag::LanguageTag; fn bench_language_tag_parse(c: &mut Criterion) { let examples = [ "fr", "fr-Latn", "fr-fra", "fr-Latn-FR", "fr-Latn-419", "fr-FR", "ax-TZ", "fr-shadok", "fr-y-myext-myext2", "fra-Latn", "fra", "fra-FX", "i-klingon", "I-kLINgon", "no-bok", "fr-Lat", "mn-Cyrl-MN", "mN-cYrL-Mn", "fr-Latn-CA", "en-US", "fr-Latn-CA", "i-enochian", "x-fr-CH", "sr-Latn-CS", "es-419", "sl-nedis", "de-CH-1996", "de-Latg-1996", "sl-IT-nedis", "en-a-bbb-x-a-ccc", "de-a-value", "en-Latn-GB-boont-r-extended-sequence-x-private", "en-x-US", "az-Arab-x-AZE-derbend", "es-Latn-CO-x-private", "en-US-boont", "ab-x-abc-x-abc", "ab-x-abc-a-a", "i-default", "i-klingon", "abcd-Latn", "AaBbCcDd-x-y-any-x", "en", "de-AT", "es-419", "de-CH-1901", "sr-Cyrl", "sr-Cyrl-CS", "sl-Latn-IT-rozaj", "en-US-x-twain", "zh-cmn", "zh-cmn-Hant", "zh-cmn-Hant-HK", "zh-gan", "zh-yue-Hant-HK", "xr-lxs-qut", "xr-lqt-qu", "xr-p-lze", ]; c.bench_function("language tag parse tests", |b| { b.iter(|| { for tag in examples.iter() { LanguageTag::parse(*tag).unwrap(); } }) }); } criterion_group!(language_tag, bench_language_tag_parse); criterion_main!(language_tag); oxilangtag-0.1.5/deny.toml000066400000000000000000000002461457341165100155170ustar00rootroot00000000000000[licenses] unlicensed = "deny" allow = [ "MIT", "Apache-2.0", "Unicode-DFS-2016" ] default = "deny" [bans] multiple-versions = "warn" wildcards = "deny" oxilangtag-0.1.5/fuzz/000077500000000000000000000000001457341165100146575ustar00rootroot00000000000000oxilangtag-0.1.5/fuzz/.gitignore000066400000000000000000000000271457341165100166460ustar00rootroot00000000000000target corpus 
artifactsoxilangtag-0.1.5/fuzz/Cargo.toml000066400000000000000000000004461457341165100166130ustar00rootroot00000000000000[package] name = "oxilangtag-fuzz" version = "0.0.0" authors = ["Automatically generated"] publish = false edition = "2021" [package.metadata] cargo-fuzz = true [dependencies] libfuzzer-sys = "0.4" [dependencies.oxilangtag] path = ".." [[bin]] name = "parse" path = "fuzz_targets/parse.rs" oxilangtag-0.1.5/fuzz/fuzz_targets/000077500000000000000000000000001457341165100174065ustar00rootroot00000000000000oxilangtag-0.1.5/fuzz/fuzz_targets/parse.rs000066400000000000000000000003201457341165100210610ustar00rootroot00000000000000#![no_main] use libfuzzer_sys::fuzz_target; use oxilangtag::LanguageTag; use std::str; fuzz_target!(|data: &[u8]| { if let Ok(s) = str::from_utf8(data) { let _ = LanguageTag::parse(s); } }); oxilangtag-0.1.5/src/000077500000000000000000000000001457341165100144505ustar00rootroot00000000000000oxilangtag-0.1.5/src/lib.rs000066400000000000000000000664171457341165100156020ustar00rootroot00000000000000#![doc = include_str!("../README.md")] #![cfg_attr(docsrs, feature(doc_auto_cfg))] #![deny(unsafe_code)] #![no_std] #[cfg(feature = "std")] extern crate std; extern crate alloc; use alloc::borrow::{Borrow, Cow}; use alloc::boxed::Box; use alloc::fmt; use alloc::str::{FromStr, Split}; use alloc::string::String; use core::cmp::Ordering; use core::hash::{Hash, Hasher}; use core::iter::once; use core::ops::Deref; #[cfg(feature = "serde")] use serde::{Deserialize, Deserializer, Serialize, Serializer}; /// A [RFC 5646](https://tools.ietf.org/html/rfc5646) language tag. /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("en-us").unwrap(); /// assert_eq!(language_tag.into_inner(), "en-us") /// ``` #[derive(Copy, Clone)] pub struct LanguageTag { tag: T, positions: TagElementsPositions, } impl> LanguageTag { /// Parses a language tag according to [RFC 5646](https://tools.ietf.org/html/rfc5646). 
/// and checks if the tag is ["well-formed"](https://tools.ietf.org/html/rfc5646#section-2.2.9). /// /// This operation keeps internally the `tag` parameter and does not allocate on the heap. /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("en-us").unwrap(); /// assert_eq!(language_tag.into_inner(), "en-us") /// ``` pub fn parse(tag: T) -> Result { let positions = parse_language_tag(&tag, &mut VoidOutputBuffer::default())?; Ok(Self { tag, positions }) } /// Returns the underlying language tag representation. #[inline] pub fn as_str(&self) -> &str { &self.tag } /// Returns the underlying language tag representation. #[inline] pub fn into_inner(self) -> T { self.tag } /// Returns the [primary language subtag](https://tools.ietf.org/html/rfc5646#section-2.2.1). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap(); /// assert_eq!(language_tag.primary_language(), "zh"); /// ``` #[inline] pub fn primary_language(&self) -> &str { &self.tag[..self.positions.language_end] } /// Returns the [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2). /// /// Valid language tags have at most one extended language. /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap(); /// assert_eq!(language_tag.extended_language(), Some("cmn")); /// ``` #[inline] pub fn extended_language(&self) -> Option<&str> { if self.positions.language_end == self.positions.extlang_end { None } else { Some(&self.tag[self.positions.language_end + 1..self.positions.extlang_end]) } } /// Iterates on the [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2). /// /// Valid language tags have at most one extended language. 
/// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap(); /// assert_eq!(language_tag.extended_language_subtags().collect::>(), vec!["cmn"]); /// ``` #[inline] pub fn extended_language_subtags(&self) -> impl Iterator { self.extended_language().unwrap_or("").split_terminator('-') } /// Returns the [primary language subtag](https://tools.ietf.org/html/rfc5646#section-2.2.1) /// and its [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap(); /// assert_eq!(language_tag.full_language(), "zh-cmn"); /// ``` #[inline] pub fn full_language(&self) -> &str { &self.tag[..self.positions.extlang_end] } /// Returns the [script subtag](https://tools.ietf.org/html/rfc5646#section-2.2.3). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap(); /// assert_eq!(language_tag.script(), Some("Hans")); /// ``` #[inline] pub fn script(&self) -> Option<&str> { if self.positions.extlang_end == self.positions.script_end { None } else { Some(&self.tag[self.positions.extlang_end + 1..self.positions.script_end]) } } /// Returns the [region subtag](https://tools.ietf.org/html/rfc5646#section-2.2.4). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap(); /// assert_eq!(language_tag.region(), Some("CN")); /// ``` #[inline] pub fn region(&self) -> Option<&str> { if self.positions.script_end == self.positions.region_end { None } else { Some(&self.tag[self.positions.script_end + 1..self.positions.region_end]) } } /// Returns the [variant subtags](https://tools.ietf.org/html/rfc5646#section-2.2.5). 
/// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-Latn-TW-pinyin").unwrap(); /// assert_eq!(language_tag.variant(), Some("pinyin")); /// ``` #[inline] pub fn variant(&self) -> Option<&str> { if self.positions.region_end == self.positions.variant_end { None } else { Some(&self.tag[self.positions.region_end + 1..self.positions.variant_end]) } } /// Iterates on the [variant subtags](https://tools.ietf.org/html/rfc5646#section-2.2.5). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-Latn-TW-pinyin").unwrap(); /// assert_eq!(language_tag.variant_subtags().collect::>(), vec!["pinyin"]); /// ``` #[inline] pub fn variant_subtags(&self) -> impl Iterator { self.variant().unwrap_or("").split_terminator('-') } /// Returns the [extension subtags](https://tools.ietf.org/html/rfc5646#section-2.2.6). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("de-DE-u-co-phonebk").unwrap(); /// assert_eq!(language_tag.extension(), Some("u-co-phonebk")); /// ``` #[inline] pub fn extension(&self) -> Option<&str> { if self.positions.variant_end == self.positions.extension_end { None } else { Some(&self.tag[self.positions.variant_end + 1..self.positions.extension_end]) } } /// Iterates on the [extension subtags](https://tools.ietf.org/html/rfc5646#section-2.2.6). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("de-DE-u-co-phonebk").unwrap(); /// assert_eq!(language_tag.extension_subtags().collect::>(), vec![('u', "co-phonebk")]); /// ``` #[inline] pub fn extension_subtags(&self) -> impl Iterator { match self.extension() { Some(parts) => ExtensionsIterator::new(parts), None => ExtensionsIterator::new(""), } } /// Returns the [private use subtags](https://tools.ietf.org/html/rfc5646#section-2.2.7). 
/// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("de-x-foo-bar").unwrap(); /// assert_eq!(language_tag.private_use(), Some("x-foo-bar")); /// ``` #[inline] pub fn private_use(&self) -> Option<&str> { if self.tag.starts_with("x-") { Some(&self.tag) } else if self.positions.extension_end == self.tag.len() { None } else { Some(&self.tag[self.positions.extension_end + 1..]) } } /// Iterates on the [private use subtags](https://tools.ietf.org/html/rfc5646#section-2.2.7). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("de-x-foo-bar").unwrap(); /// assert_eq!(language_tag.private_use_subtags().collect::>(), vec!["foo", "bar"]); /// ``` #[inline] pub fn private_use_subtags(&self) -> impl Iterator { self.private_use() .map(|part| &part[2..]) .unwrap_or("") .split_terminator('-') } } impl LanguageTag { /// Parses a language tag according to [RFC 5646](https://tools.ietf.org/html/rfc5646) /// and normalizes its case. /// /// This parser accepts the language tags that are "well-formed" according to /// [RFC 5646](https://tools.ietf.org/html/rfc5646#section-2.2.9). /// /// This operation does heap allocation. 
/// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse_and_normalize("en-us").unwrap(); /// assert_eq!(language_tag.into_inner(), "en-US") /// ``` pub fn parse_and_normalize(tag: &str) -> Result { let mut output_buffer = String::with_capacity(tag.len()); let positions = parse_language_tag(tag, &mut output_buffer)?; Ok(Self { tag: output_buffer, positions, }) } } impl, Rhs> PartialEq> for LanguageTag { #[inline] fn eq(&self, other: &LanguageTag) -> bool { self.tag.eq(&other.tag) } } impl> PartialEq for LanguageTag { #[inline] fn eq(&self, other: &str) -> bool { self.tag.eq(other) } } impl<'a, T: PartialEq<&'a str>> PartialEq<&'a str> for LanguageTag { #[inline] fn eq(&self, other: &&'a str) -> bool { self.tag.eq(other) } } impl> PartialEq for LanguageTag { #[inline] fn eq(&self, other: &String) -> bool { self.tag.eq(other) } } impl<'a, T: PartialEq>> PartialEq> for LanguageTag { #[inline] fn eq(&self, other: &Cow<'a, str>) -> bool { self.tag.eq(other) } } impl> PartialEq> for str { #[inline] fn eq(&self, other: &LanguageTag) -> bool { other.tag.eq(self) } } impl<'a, T: PartialEq<&'a str>> PartialEq> for &'a str { #[inline] fn eq(&self, other: &LanguageTag) -> bool { other.tag.eq(self) } } impl> PartialEq> for String { #[inline] fn eq(&self, other: &LanguageTag) -> bool { other.tag.eq(self) } } impl<'a, T: PartialEq>> PartialEq> for Cow<'a, str> { #[inline] fn eq(&self, other: &LanguageTag) -> bool { other.tag.eq(self) } } impl Eq for LanguageTag {} impl Hash for LanguageTag { #[inline] fn hash(&self, state: &mut H) { self.tag.hash(state) } } impl PartialOrd for LanguageTag { #[inline] fn partial_cmp(&self, other: &Self) -> Option { self.tag.partial_cmp(&other.tag) } } impl Ord for LanguageTag { #[inline] fn cmp(&self, other: &Self) -> Ordering { self.tag.cmp(&other.tag) } } impl> Deref for LanguageTag { type Target = str; #[inline] fn deref(&self) -> &str { self.tag.deref() } } impl> AsRef for LanguageTag { #[inline] fn 
as_ref(&self) -> &str { self.tag.as_ref() } } impl> Borrow for LanguageTag { #[inline] fn borrow(&self) -> &str { self.tag.borrow() } } impl fmt::Debug for LanguageTag { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.tag.fmt(f) } } impl fmt::Display for LanguageTag { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.tag.fmt(f) } } impl FromStr for LanguageTag { type Err = LanguageTagParseError; #[inline] fn from_str(tag: &str) -> Result { Self::parse_and_normalize(tag) } } impl<'a> From> for LanguageTag { #[inline] fn from(tag: LanguageTag<&'a str>) -> Self { Self { tag: tag.tag.into(), positions: tag.positions, } } } impl<'a> From>> for LanguageTag { #[inline] fn from(tag: LanguageTag>) -> Self { Self { tag: tag.tag.into(), positions: tag.positions, } } } impl From>> for LanguageTag { #[inline] fn from(tag: LanguageTag>) -> Self { Self { tag: tag.tag.into(), positions: tag.positions, } } } impl<'a> From> for LanguageTag> { #[inline] fn from(tag: LanguageTag<&'a str>) -> Self { Self { tag: tag.tag.into(), positions: tag.positions, } } } impl<'a> From> for LanguageTag> { #[inline] fn from(tag: LanguageTag) -> Self { Self { tag: tag.tag.into(), positions: tag.positions, } } } #[cfg(feature = "serde")] impl Serialize for LanguageTag { fn serialize(&self, serializer: S) -> Result { self.tag.serialize(serializer) } } #[cfg(feature = "serde")] impl<'de, T: Deref + Deserialize<'de>> Deserialize<'de> for LanguageTag { fn deserialize>(deserializer: D) -> Result, D::Error> { use serde::de::Error; Self::parse(T::deserialize(deserializer)?).map_err(D::Error::custom) } } /// An error raised during [`LanguageTag`](struct.LanguageTag.html) validation. 
#[derive(Debug)] pub struct LanguageTagParseError { kind: TagParseErrorKind, } impl fmt::Display for LanguageTagParseError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.kind { TagParseErrorKind::EmptyExtension => { write!(f, "If an extension subtag is present, it must not be empty") } TagParseErrorKind::EmptyPrivateUse => { write!(f, "If the `x` subtag is present, it must not be empty") } TagParseErrorKind::ForbiddenChar => { write!(f, "The langtag contains a char not allowed") } TagParseErrorKind::InvalidSubtag => write!( f, "A subtag fails to parse, it does not match any other subtags" ), TagParseErrorKind::InvalidLanguage => write!(f, "The given language subtag is invalid"), TagParseErrorKind::SubtagTooLong => { write!(f, "A subtag may be eight characters in length at maximum") } TagParseErrorKind::EmptySubtag => write!(f, "A subtag should not be empty"), TagParseErrorKind::TooManyExtlangs => { write!(f, "At maximum three extlangs are allowed") } } } } // Move to core::error::Error once stable see // https://github.com/rust-lang/rust/issues/103765 #[cfg(feature = "std")] impl std::error::Error for LanguageTagParseError {} #[derive(Debug)] enum TagParseErrorKind { /// If an extension subtag is present, it must not be empty. EmptyExtension, /// If the `x` subtag is present, it must not be empty. EmptyPrivateUse, /// The langtag contains a char that is not A-Z, a-z, 0-9 or the dash. ForbiddenChar, /// A subtag fails to parse, it does not match any other subtags. InvalidSubtag, /// The given language subtag is invalid. InvalidLanguage, /// A subtag may be eight characters in length at maximum. SubtagTooLong, /// A subtag should not be empty. EmptySubtag, /// At maximum three extlangs are allowed, but zero to one extlangs are preferred. 
TooManyExtlangs, } #[derive(Copy, Clone, Debug)] struct TagElementsPositions { language_end: usize, extlang_end: usize, script_end: usize, region_end: usize, variant_end: usize, extension_end: usize, } trait OutputBuffer: Extend { fn push(&mut self, c: char); fn push_str(&mut self, s: &str); } #[derive(Default)] struct VoidOutputBuffer {} impl OutputBuffer for VoidOutputBuffer { #[inline] fn push(&mut self, _: char) {} #[inline] fn push_str(&mut self, _: &str) {} } impl Extend for VoidOutputBuffer { #[inline] fn extend>(&mut self, _: T) {} } impl OutputBuffer for String { #[inline] fn push(&mut self, c: char) { self.push(c); } #[inline] fn push_str(&mut self, s: &str) { self.push_str(s); } } /// Parses language tag following [the RFC5646 grammar](https://tools.ietf.org/html/rfc5646#section-2.1) fn parse_language_tag( input: &str, output: &mut impl OutputBuffer, ) -> Result { //grandfathered tags if let Some(tag) = GRANDFATHEREDS .iter() .find(|record| record.eq_ignore_ascii_case(input)) { output.push_str(tag); Ok(TagElementsPositions { language_end: tag.len(), extlang_end: tag.len(), script_end: tag.len(), region_end: tag.len(), variant_end: tag.len(), extension_end: tag.len(), }) } else if input.starts_with("x-") || input.starts_with("X-") { // private use if !is_alphanumeric_or_dash(input) { Err(LanguageTagParseError { kind: TagParseErrorKind::ForbiddenChar, }) } else if input.len() == 2 { Err(LanguageTagParseError { kind: TagParseErrorKind::EmptyPrivateUse, }) } else { output.extend(input.chars().map(|c| c.to_ascii_lowercase())); Ok(TagElementsPositions { language_end: input.len(), extlang_end: input.len(), script_end: input.len(), region_end: input.len(), variant_end: input.len(), extension_end: input.len(), }) } } else { parse_langtag(input, output) } } /// Handles normal tags. 
fn parse_langtag( input: &str, output: &mut impl OutputBuffer, ) -> Result { #[derive(PartialEq, Eq)] enum State { Start, AfterLanguage, AfterExtLang, AfterScript, AfterRegion, InExtension { expected: bool }, InPrivateUse { expected: bool }, } let mut state = State::Start; let mut language_end = 0; let mut extlang_end = 0; let mut script_end = 0; let mut region_end = 0; let mut variant_end = 0; let mut extension_end = 0; let mut extlangs_count = 0; for (subtag, end) in SubTagIterator::new(input) { if subtag.is_empty() { return Err(LanguageTagParseError { kind: TagParseErrorKind::EmptySubtag, }); } if subtag.len() > 8 { return Err(LanguageTagParseError { kind: TagParseErrorKind::SubtagTooLong, }); } if state == State::Start { // Primary language if subtag.len() < 2 || !is_alphabetic(subtag) { return Err(LanguageTagParseError { kind: TagParseErrorKind::InvalidLanguage, }); } language_end = end; output.extend(to_lowercase(subtag)); if subtag.len() < 4 { // extlangs are only allowed for short language tags state = State::AfterLanguage; } else { state = State::AfterExtLang; } } else if let State::InPrivateUse { .. 
} = state { if !is_alphanumeric(subtag) { return Err(LanguageTagParseError { kind: TagParseErrorKind::InvalidSubtag, }); } output.push('-'); output.extend(to_lowercase(subtag)); state = State::InPrivateUse { expected: false }; } else if subtag == "x" || subtag == "X" { // We make sure extension is found if let State::InExtension { expected: true } = state { return Err(LanguageTagParseError { kind: TagParseErrorKind::EmptyExtension, }); } output.push('-'); output.push('x'); state = State::InPrivateUse { expected: true }; } else if subtag.len() == 1 && is_alphanumeric(subtag) { // We make sure extension is found if let State::InExtension { expected: true } = state { return Err(LanguageTagParseError { kind: TagParseErrorKind::EmptyExtension, }); } let extension_tag = subtag.chars().next().unwrap().to_ascii_lowercase(); output.push('-'); output.push(extension_tag); state = State::InExtension { expected: true }; } else if let State::InExtension { .. } = state { if !is_alphanumeric(subtag) { return Err(LanguageTagParseError { kind: TagParseErrorKind::InvalidSubtag, }); } extension_end = end; output.push('-'); output.extend(to_lowercase(subtag)); state = State::InExtension { expected: false }; } else if state == State::AfterLanguage && subtag.len() == 3 && is_alphabetic(subtag) { extlangs_count += 1; if extlangs_count > 3 { return Err(LanguageTagParseError { kind: TagParseErrorKind::TooManyExtlangs, }); } // valid extlangs extlang_end = end; output.push('-'); output.extend(to_lowercase(subtag)); } else if (state == State::AfterLanguage || state == State::AfterExtLang) && subtag.len() == 4 && is_alphabetic(subtag) { // Script script_end = end; output.push('-'); output.extend(to_uppercase_first(subtag)); state = State::AfterScript; } else if (state == State::AfterLanguage || state == State::AfterExtLang || state == State::AfterScript) && (subtag.len() == 2 && is_alphabetic(subtag) || subtag.len() == 3 && is_numeric(subtag)) { // Region region_end = end; output.push('-'); 
output.extend(to_uppercase(subtag)); state = State::AfterRegion; } else if (state == State::AfterLanguage || state == State::AfterExtLang || state == State::AfterScript || state == State::AfterRegion) && is_alphanumeric(subtag) && (subtag.len() >= 5 && is_alphabetic(&subtag[0..1]) || subtag.len() >= 4 && is_numeric(&subtag[0..1])) { // Variant variant_end = end; output.push('-'); output.extend(to_lowercase(subtag)); state = State::AfterRegion; } else { return Err(LanguageTagParseError { kind: TagParseErrorKind::InvalidSubtag, }); } } //We make sure we are in a correct final state if let State::InExtension { expected: true } = state { return Err(LanguageTagParseError { kind: TagParseErrorKind::EmptyExtension, }); } if let State::InPrivateUse { expected: true } = state { return Err(LanguageTagParseError { kind: TagParseErrorKind::EmptyPrivateUse, }); } //We make sure we have not skipped anyone if extlang_end < language_end { extlang_end = language_end; } if script_end < extlang_end { script_end = extlang_end; } if region_end < script_end { region_end = script_end; } if variant_end < region_end { variant_end = region_end; } if extension_end < variant_end { extension_end = variant_end; } Ok(TagElementsPositions { language_end, extlang_end, script_end, region_end, variant_end, extension_end, }) } struct ExtensionsIterator<'a> { input: &'a str, } impl<'a> ExtensionsIterator<'a> { fn new(input: &'a str) -> Self { Self { input } } } impl<'a> Iterator for ExtensionsIterator<'a> { type Item = (char, &'a str); fn next(&mut self) -> Option<(char, &'a str)> { let mut parts_iterator = self.input.split_terminator('-'); let singleton = parts_iterator.next()?.chars().next().unwrap(); let mut content_size: usize = 2; for part in parts_iterator { if part.len() == 1 { let content = &self.input[2..content_size - 1]; self.input = &self.input[content_size..]; return Some((singleton, content)); } else { content_size += part.len() + 1; } } let result = self.input.get(2..).map(|content| 
(singleton, content)); self.input = ""; result } } struct SubTagIterator<'a> { split: Split<'a, char>, position: usize, } impl<'a> SubTagIterator<'a> { #[inline] fn new(input: &'a str) -> Self { Self { split: input.split('-'), position: 0, } } } impl<'a> Iterator for SubTagIterator<'a> { type Item = (&'a str, usize); #[inline] fn next(&mut self) -> Option<(&'a str, usize)> { let tag = self.split.next()?; let tag_end = self.position + tag.len(); self.position = tag_end + 1; Some((tag, tag_end)) } } #[inline] fn is_alphabetic(s: &str) -> bool { s.chars().all(|x| x.is_ascii_alphabetic()) } #[inline] fn is_numeric(s: &str) -> bool { s.chars().all(|x| x.is_ascii_digit()) } #[inline] fn is_alphanumeric(s: &str) -> bool { s.chars().all(|x| x.is_ascii_alphanumeric()) } #[inline] fn is_alphanumeric_or_dash(s: &str) -> bool { s.chars().all(|x| x.is_ascii_alphanumeric() || x == '-') } #[inline] fn to_uppercase(s: &str) -> impl Iterator + '_ { s.chars().map(|c| c.to_ascii_uppercase()) } // Beware: panics if s.len() == 0 (should never happen in our code) #[inline] fn to_uppercase_first(s: &str) -> impl Iterator + '_ { let mut chars = s.chars(); once(chars.next().unwrap().to_ascii_uppercase()).chain(chars.map(|c| c.to_ascii_lowercase())) } #[inline] fn to_lowercase(s: &str) -> impl Iterator + '_ { s.chars().map(|c| c.to_ascii_lowercase()) } const GRANDFATHEREDS: [&str; 26] = [ "art-lojban", "cel-gaulish", "en-GB-oed", "i-ami", "i-bnn", "i-default", "i-enochian", "i-hak", "i-klingon", "i-lux", "i-mingo", "i-navajo", "i-pwn", "i-tao", "i-tay", "i-tsu", "no-bok", "no-nyn", "sgn-BE-FR", "sgn-BE-NL", "sgn-CH-DE", "zh-guoyu", "zh-hakka", "zh-min", "zh-min-nan", "zh-xiang", ]; oxilangtag-0.1.5/tests/000077500000000000000000000000001457341165100150235ustar00rootroot00000000000000oxilangtag-0.1.5/tests/lib.rs000066400000000000000000000462731457341165100161530ustar00rootroot00000000000000use oxilangtag::LanguageTag; #[cfg(feature = "serde")] use serde_test::{assert_de_tokens, 
// Tests from RFC 5646 2.1.1
#[test]
fn test_formatting() {
    // Each pair is (raw input, expected normalized serialization).
    let cases = [
        ("mn-Cyrl-MN", "mn-Cyrl-MN"),
        ("MN-cYRL-mn", "mn-Cyrl-MN"),
        ("mN-cYrL-Mn", "mn-Cyrl-MN"),
        ("en-CA-x-ca", "en-CA-x-ca"),
        ("sgn-BE-FR", "sgn-BE-FR"),
        ("az-Latn-x-latn", "az-Latn-x-latn"),
        ("i-ami", "i-ami"),
        ("I-AMI", "i-ami"),
        ("SL-AFB-lATN-005-nEdis", "sl-afb-Latn-005-nedis"),
    ];
    for (input, expected) in cases {
        let normalized = LanguageTag::parse_and_normalize(input).unwrap();
        assert_eq!(expected, normalized.as_str());
    }
}

// Tests from RFC 5646 2.2.1
#[test]
fn test_primary_language() {
    // Each pair is (raw input, expected primary language of the normalized tag).
    let cases = [
        ("fr", "fr"),
        ("de", "de"),
        ("x-fr-CH", "x-fr-ch"),
        ("i-klingon", "i-klingon"),
        ("i-bnn", "i-bnn"),
        ("zh-hakka", "zh-hakka"),
    ];
    for (input, expected) in cases {
        let normalized = LanguageTag::parse_and_normalize(input).unwrap();
        assert_eq!(expected, normalized.primary_language());
    }
}
// Tests from RFC 5646 2.2.3
#[test]
fn test_script() {
    // Projects a tag onto the components under test.
    fn parts(tag: &LanguageTag<String>) -> (&str, Option<&str>) {
        (tag.primary_language(), tag.script())
    }

    // Each pair is (raw input, expected (primary language, script)).
    let cases = [
        ("sr-Latn", ("sr", Some("Latn"))),
        ("ar-afb-Latn", ("ar", Some("Latn"))),
    ];
    for (input, expected) in cases {
        assert_eq!(expected, parts(&input.parse().unwrap()));
    }
}

// Tests from RFC 5646 2.2.4
#[test]
fn test_region() {
    // Projects a tag onto the components under test.
    fn parts(tag: &LanguageTag<String>) -> (&str, Option<&str>, Option<&str>) {
        (tag.primary_language(), tag.script(), tag.region())
    }

    // Each pair is (raw input, expected (primary language, script, region)).
    let cases = [
        ("de-AT", ("de", None, Some("AT"))),
        ("sr-Latn-RS", ("sr", Some("Latn"), Some("RS"))),
        ("es-419", ("es", None, Some("419"))),
        ("ar-DE", ("ar", None, Some("DE"))),
        ("ar-005", ("ar", None, Some("005"))),
    ];
    for (input, expected) in cases {
        assert_eq!(expected, parts(&input.parse().unwrap()));
    }
}
("art-lojban", None, vec![]), parts(&"art-lojban".parse().unwrap()) ); } // Tests from RFC 5646 2.2.6 #[test] fn test_extension() { fn parts(tag: &LanguageTag) -> (&str, Option<&str>, Vec<(char, &str)>) { ( tag.primary_language(), tag.extension(), tag.extension_subtags().collect(), ) } assert_eq!(("en", None, vec![]), parts(&"en".parse().unwrap())); assert_eq!( ("en", Some("a-bbb"), vec![('a', "bbb")]), parts(&"en-a-bbb-x-a-ccc".parse().unwrap()) ); assert_eq!( ( "en", Some("a-babble-b-warble"), vec![('a', "babble"), ('b', "warble")] ), parts(&"en-a-babble-b-warble".parse().unwrap()) ); assert_eq!( ("fr", Some("a-latn"), vec![('a', "latn")]), parts(&"fr-a-Latn".parse().unwrap()) ); assert_eq!( ( "en", Some("r-extended-sequence"), vec![('r', "extended-sequence")] ), parts( &"en-Latn-GB-boont-r-extended-sequence-x-private" .parse() .unwrap() ) ); assert_eq!( ("en", Some("r-az-r-qt"), vec![('r', "az"), ('r', "qt")]), parts(&"en-r-az-r-qt".parse().unwrap()) ); assert_eq!(("i-tsu", None, vec![]), parts(&"i-tsu".parse().unwrap())); } // Tests from RFC 5646 2.2.7 #[test] fn test_privateuse() { fn parts(tag: &LanguageTag) -> (&str, Option<&str>, Vec<&str>) { ( tag.primary_language(), tag.private_use(), tag.private_use_subtags().collect(), ) } assert_eq!(("en", None, vec![]), parts(&"en".parse().unwrap())); assert_eq!( ("en", Some("x-us"), vec!["us"]), parts(&"en-x-US".parse().unwrap()) ); assert_eq!( ("el", Some("x-koine"), vec!["koine"]), parts(&"el-x-koine".parse().unwrap()) ); assert_eq!( ("x-fr-ch", Some("x-fr-ch"), vec!["fr", "ch"]), parts(&"x-fr-ch".parse().unwrap()) ); assert_eq!( ("es", Some("x-foobar-at-007"), vec!["foobar", "at", "007"]), parts(&"es-x-foobar-AT-007".parse().unwrap()) ) } #[test] fn test_fmt() { assert_eq!( "ar-arb-Latn-DE-nedis-foobar", LanguageTag::parse_and_normalize("ar-arb-Latn-DE-nedis-foobar") .unwrap() .as_str() ); assert_eq!( "ar-arb-Latn-DE-nedis-foobar", LanguageTag::parse_and_normalize("ar-arb-latn-de-nedis-foobar") .unwrap() .as_str() 
#[test]
fn test_unicode() {
    // Non-ASCII characters must be rejected, even inside a private-use section.
    let parsed = LanguageTag::parse("zh-x-Üńìcødê");
    assert!(parsed.is_err());
}

#[test]
fn test_cmp() {
    let normalize = |raw: &'static str| LanguageTag::parse_and_normalize(raw).unwrap();

    // Tags differing only by case are equal once normalized.
    assert_eq!(normalize("dE-AraB-lY"), normalize("DE-aRaB-LY"));
    // Tags with different components stay different.
    assert_ne!(normalize("zh"), normalize("zh-Latn"));
}
"no-bok", // grandfathered without singleton "fr-Lat", // Extended", "mn-Cyrl-MN", "mN-cYrL-Mn", "fr-Latn-CA", "en-US", "fr-Latn-CA", "i-enochian", // Grand fathered "x-fr-CH", "sr-Latn-CS", "es-419", "sl-nedis", "de-CH-1996", "de-Latg-1996", "sl-IT-nedis", "en-a-bbb-x-a-ccc", "de-a-value", "en-Latn-GB-boont-r-extended-sequence-x-private", "en-x-US", "az-Arab-x-AZE-derbend", "es-Latn-CO-x-private", "en-US-boont", "ab-x-abc-x-abc", // anything goes after x "ab-x-abc-a-a", // ditto", "i-default", // grandfathered", "i-klingon", // grandfathered", "abcd-Latn", // Language of 4 chars reserved for future use "AaBbCcDd-x-y-any-x", // Language of 5-8 chars, registered "en", "de-AT", "es-419", "de-CH-1901", "sr-Cyrl", "sr-Cyrl-CS", "sl-Latn-IT-rozaj", "en-US-x-twain", "zh-cmn", "zh-cmn-Hant", "zh-cmn-Hant-HK", "zh-gan", "zh-yue-Hant-HK", "xr-lxs-qut", // extlangS "xr-lqt-qu", // extlang + region "xr-p-lze", // Extension ]; for tag in tags { let result = LanguageTag::parse(tag); assert!( result.is_ok(), "{} should be considered well-formed but returned error {}", tag, result.err().unwrap() ); } } // http://www.langtag.net/test-suites/broken-tags.txt #[test] fn test_broken_tags() { let tags = vec![ "", "f", "f-Latn", "fr-Latn-F", "a-value", "tlh-a-b-foo", "i-notexist", // grandfathered but not registered: always invalid "abcdefghi-012345678", "ab-abc-abc-abc-abc", "ab-abcd-abc", "ab-ab-abc", "ab-123-abc", "a-Hant-ZH", "a1-Hant-ZH", "ab-abcde-abc", "ab-1abc-abc", "ab-ab-abcd", "ab-123-abcd", "ab-abcde-abcd", "ab-1abc-abcd", "ab-a-b", "ab-a-x", "ab--ab", "ab-abc-", "-ab-abc", "abcd-efg", "aabbccddE", ]; for tag in tags { let result = LanguageTag::parse(tag); assert!( result.is_err(), "{} should be considered not well-formed but returned result {:?}", tag, result.ok().unwrap() ); } } #[test] fn test_random_good_tags() { // http://unicode.org/repos/cldr/trunk/tools/java/org/unicode/cldr/util/data/langtagTest.txt let tags = vec![ "zszLDm-sCVS-es-x-gn762vG-83-S-mlL", 
"IIJdFI-cfZv", "kbAxSgJ-685", "tbutP", "hDL-595", "dUf-iUjq-0hJ4P-5YkF-WD8fk", "FZAABA-FH", "xZ-lh-4QfM5z9J-1eG4-x-K-R6VPr2z", "Fyi", "SeI-DbaG", "ch-xwFn", "OeC-GPVI", "JLzvUSi", "Fxh-hLAs", "pKHzCP-sgaO-554", "eytqeW-hfgH-uQ", "ydn-zeOP-PR", "uoWmBM-yHCf-JE", "xwYem", "zie", "Re-wjSv-Ey-i-XE-E-JjWTEB8-f-DLSH-NVzLH-AtnFGWoH-SIDE", "Ri-063-c-u6v-ZfhkToTB-C-IFfmv-XT-j-rdyYFMhK-h-pY-D5-Oh6FqBhL-hcXt-v-WdpNx71-\ K-c74m4-eBTT7-JdH7Q1Z", "ji", "IM-487", "EPZ-zwcB", "GauwEcwo", "kDEP", "FwDYt-TNvo", "ottqP-KLES-x-9-i9", "fcflR-grQQ", "TvFwdu-kYhs", "WE-336", "MgxQa-ywEp-8lcW-7bvT-h-dP1Md-0h7-0Z3ir-K-Srkm-kA-7LXM-Z-whb2MiO-2mNsvbLm-W3O\ -4r-U-KceIxHdI-gvMVgUBV-2uRUni-J0-7C8yTK2", "Hyr-B-evMtVoB1-mtsVZf-vQMV-gM-I-rr-kvLzg-f-lAUK-Qb36Ne-Z-7eFzOD-mv6kKf-l-miZ\ 7U3-k-XDGtNQG", "ybrlCpzy", "PTow-w-cAQ51-8Xd6E-cumicgt-WpkZv3NY-q-ORYPRy-v-A4jL4A-iNEqQZZ-sjKn-W-N1F-pzy\ c-xP5eWz-LmsCiCcZ", "ih-DlPR-PE", "Krf-362", "WzaD", "EPaOnB-gHHn", "XYta", "NZ-RgOO-tR", "at-FE", "Tpc-693", "YFp", "gRQrQULo", "pVomZ-585", "laSu-ZcAq-338", "gCW", "PydSwHRI-TYfF", "zKmWDD", "X-bCrL5RL", "HK", "YMKGcLY", "GDJ-nHYa-bw-X-ke-rohH5GfS-LdJKsGVe", "tfOxdau-yjge-489-a-oB-I8Csb-1ESaK1v-VFNz-N-FT-ZQyn-On2-I-hu-vaW3-jIQb-vg0U-h\ Ul-h-dO6KuJqB-U-tde2L-P3gHUY-vnl5c-RyO-H-gK1-zDPu-VF1oeh8W-kGzzvBbW-yuAJZ", "LwDux", "Zl-072", "Ri-Ar", "vocMSwo-cJnr-288", "kUWq-gWfQ-794", "YyzqKL-273", "Xrw-ZHwH-841-9ddT-ESSZF-6OqO-0knk-991U-9p3m-b-JhiV-0Kq7Y-h-cxphLb-cDlXUBOQ-X\ -4Ti-jty94yPp", "en-GB-oed", "LEuZl-so", "HyvBvFi-cCAl-X-irMQA-Pzt-H", "uDbsrAA-304", "wTS", "IWXS", "XvDqNkSn-jRDR", "gX-Ycbb-iLphEks-AQ1aJ5", "FbSBz-VLcR-VL", "JYoVQOP-Iytp", "gDSoDGD-lq-v-7aFec-ag-k-Z4-0kgNxXC-7h", "Bjvoayy-029", "qSDJd", "qpbQov", "fYIll-516", "GfgLyfWE-EHtB", "Wc-ZMtk", "cgh-VEYK", "WRZs-AaFd-yQ", "eSb-CpsZ-788", "YVwFU", "JSsHiQhr-MpjT-381", "LuhtJIQi-JKYt", "vVTvS-RHcP", "SY", "fSf-EgvQfI-ktWoG-8X5z-63PW", "NOKcy", "OjJb-550", "KB", "qzKBv-zDKk-589", "Jr", "Acw-GPXf-088", "WAFSbos", "HkgnmerM-x-e5-zf-VdDjcpz-1V6", 
"UAfYflJU-uXDc-YV", "x-CHsHx-VDcOUAur-FqagDTx-H-V0e74R", "uZIAZ-Xmbh-pd", ]; for tag in tags { let result = LanguageTag::parse(tag); assert!( result.is_ok(), "{} should be considered well-formed but returned error {}", tag, result.err().unwrap() ); } } #[test] fn test_random_bad_tags() { // http://unicode.org/repos/cldr/trunk/tools/java/org/unicode/cldr/util/data/langtagTest.txt let tags = vec![ "EdY-z_H791Xx6_m_kj", "qWt85_8S0-L_rbBDq0gl_m_O_zsAx_nRS", "VzyL2", "T_VFJq-L-0JWuH_u2_VW-hK-kbE", "u-t", "Q-f_ZVJXyc-doj_k-i", "JWB7gNa_K-5GB-25t_W-s-ZbGVwDu1-H3E", "b-2T-Qob_L-C9v_2CZxK86", "fQTpX_0_4Vg_L3L_g7VtALh2", "S-Z-E_J", "f6wsq-02_i-F", "9_GcUPq_G", "QjsIy_9-0-7_Dv2yPV09_D-JXWXM", "D_se-f-k", "ON47Wv1_2_W", "f-z-R_s-ha", "N3APeiw_195_Bx2-mM-pf-Z-Ip5lXWa-5r", "IRjxU-E_6kS_D_b1b_H", "NB-3-5-AyW_FQ-9hB-TrRJg3JV_3C", "yF-3a_V_FJQAHeL_Z-Mc-u", "n_w_bbunOG_1-s-tJMT5je", "Q-AEWE_X", "57b1O_k_R6MU_sb", "hK_65J_i-o_SI-Y", "wB4B7u_5I2_I_NZPI", "J24Nb_q_d-zE", "v6-dHjJmvPS_IEb-x_A-O-i", "8_8_dl-ZgBr84u-P-E", "nIn-xD7EVhe_C", "5_N-6P_x7Of_Lo_6_YX_R", "0_46Oo0sZ-YNwiU8Wr_d-M-pg1OriV", "laiY-5", "K-8Mdd-j_ila0sSpo_aO8_J", "wNATtSL-Cp4_gPa_fD41_9z", "H_FGz5V8_n6rrcoz0_1O6d-kH-7-N", "wDOrnHU-odqJ_vWl", "gP_qO-I-jH", "h", "dJ0hX-o_csBykEhU-F", "L-Vf7_BV_eRJ5goSF_Kp", "y-oF-chnavU-H", "9FkG-8Q-8_v", "W_l_QqI-O_SFSAOVq", "kDG3fzXw", "t-nsSp-7-t-mUK2", "Yw-F", "1-S_3_l", "u-v_brn-Y", "4_ft_3ZPZC5lA_D", "n_dR-QodsqJnh_e", "Hwvt-bSwZwj_KL-hxg0m-3_hUG", "mQHzvcV-UL-o2O_1KhUJQo_G2_uryk3-a", "b-UTn33HF", "r-Ep-jY-aFM_N_H", "K-k-krEZ0gwD_k_ua-9dm3Oy-s_v", "XS_oS-p", "EIx_h-zf5", "p_z-0_i-omQCo3B", "1_q0N_jo_9", "0Ai-6-S", "L-LZEp_HtW", "Zj-A4JD_2A5Aj7_b-m3", "x", "p-qPuXQpp_d-jeKifB-c-7_G-X", "X94cvJ_A", "F2D25R_qk_W-w_Okf_kx", "rc-f", "D", "gD_WrDfxmF-wu-E-U4t", "Z_BN9O4_D9-D_0E_KnCwZF-84b-19", "T-8_g-u-0_E", "lXTtys9j_X_A_m-vtNiNMw_X_b-C6Nr", "V_Ps-4Y-S", "X5wGEA", "mIbHFf_ALu4_Jo1Z1", "ET-TacYx_c", "Z-Lm5cAP_ri88-d_q_fi8-x", "rTi2ah-4j_j_4AlxTs6m_8-g9zqncIf-N5", "FBLB85_u-0NxhAy-ZU_9c", 
"x_j_l-5_aV95_s_tY_jp4", "PL768_D-m7jNWjfD-Nl_7qvb_bs_8_Vg", "9-yOc-gbh", "6DYxZ_SL-S_Ye", "ZCa-U-muib-6-d-f_oEh_O", "Qt-S-o8340F_f_aGax-c-jbV0gfK_p", "WE_SzOI_OGuoBDk-gDp", "cs-Y_9", "m1_uj", "Y-ob_PT", "li-B", "f-2-7-9m_f8den_J_T_d", "p-Os0dua-H_o-u", "L", "rby-w", ]; for tag in tags { let result = LanguageTag::parse(tag); assert!( result.is_err(), "{} should be considered not well-formed but returned result {:?}", tag, result.ok().unwrap() ); } } #[test] fn test_eq() { let tag = LanguageTag::parse("en-fr").unwrap(); assert_eq!(tag, "en-fr"); assert_ne!(tag, "en-FR"); assert_eq!("en-fr", tag); assert_eq!(hash(tag), hash("en-fr")); assert_ne!(hash(tag), hash("en-FR")); } fn hash(value: impl Hash) -> u64 { let mut hasher = DefaultHasher::new(); value.hash(&mut hasher); hasher.finish() } #[test] fn test_str() { let tag = LanguageTag::parse("en-fr").unwrap(); assert!(tag.starts_with("en-")); } #[cfg(feature = "serde")] #[test] fn test_serd_impl() { assert_tokens( &LanguageTag::parse("en-us").unwrap(), &[Token::BorrowedStr("en-us")], ); assert_tokens( &LanguageTag::parse("en-US".to_string()).unwrap(), &[Token::String("en-US")], ); assert_de_tokens( &LanguageTag::parse("en-US".to_string()).unwrap(), &[Token::BorrowedStr("en-US")], ); assert_de_tokens_error::>( &[Token::String("verybadvalue")], "A subtag may be eight characters in length at maximum", ); }