unic-bidi-0.9.0/benches/bidi_basic_benches.rs010064400007650000024000000033151343520353600173240ustar0000000000000000// Copyright 2015 The Servo Project Developers. // Copyright 2017 The UNIC Project Developers. // // See the COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. #![cfg(all(test, feature = "bench_it"))] #![feature(test)] extern crate test; use test::Bencher; use unic_bidi::BidiInfo; const LTR_TEXTS: &[&str] = &["abc\ndef\nghi", "abc 123\ndef 456\nghi 789"]; const BIDI_TEXTS: &[&str] = &[ "ابجد\nهوز\nحتی", "ابجد ۱۲۳\nهوز ۴۵۶\nحتی ۷۸۹\nabc\ndef", ]; fn bench_bidi_info_new(b: &mut Bencher, texts: &[&str]) { for text in texts { b.iter(|| { BidiInfo::new(text, None); }); } } fn bench_reorder_line(b: &mut Bencher, texts: &[&str]) { for text in texts { let bidi_info = BidiInfo::new(text, None); b.iter(|| { for para in &bidi_info.paragraphs { let line = para.range.clone(); bidi_info.reorder_line(para, line); } }); } } #[bench] fn bench_1_bidi_info_new_for_ltr_texts(b: &mut Bencher) { bench_bidi_info_new(b, LTR_TEXTS); } #[bench] fn bench_2_bidi_info_new_for_bidi_texts(b: &mut Bencher) { bench_bidi_info_new(b, BIDI_TEXTS); } #[bench] fn bench_3_reorder_line_for_ltr_texts(b: &mut Bencher) { bench_reorder_line(b, LTR_TEXTS); } #[bench] fn bench_4_reorder_line_for_bidi_texts(b: &mut Bencher) { bench_reorder_line(b, BIDI_TEXTS); } unic-bidi-0.9.0/Cargo.toml.orig010064400007650000024000000024051343665764000145100ustar0000000000000000[package] name = "unic-bidi" version = "0.9.0" edition = "2018" authors = ["The UNIC Project Developers"] repository = "https://github.com/open-i18n/rust-unic/" license = "MIT/Apache-2.0" description = "UNIC — Unicode Bidirectional Algorithm" keywords = ["text", "unicode", "bidi", "rtl", "layout"] categories = ["internationalization", 
"text-processing", "parsing", "rendering"] readme = "README.md" # No tests/benches that depends on /data/ exclude = [ "tests/conformance_tests.rs", "benches/bidi_udhr_benches.rs", ] [features] default = [] bench_it = [] [dependencies] matches = "0.1" serde = { version = ">=0.8, <2.0", optional = true, features = ["derive"] } unic-ucd-bidi = { path = "../ucd/bidi/", version = "0.9.0" } [dev-dependencies] serde_test = ">=0.8, <2.0" unic-char-property = { path = "../char/property/", version = "0.9.0" } unic-ucd-version = { path = "../ucd/version/", version = "0.9.0" } [badges] maintenance = { status = "actively-developed" } is-it-maintained-issue-resolution = { repository = "open-i18n/rust-unic" } is-it-maintained-open-issues = { repository = "open-i18n/rust-unic" } appveyor = { repository = "open-i18n/rust-unic", branch = "master", service = "github" } travis-ci = { repository = "open-i18n/rust-unic", branch = "master" } unic-bidi-0.9.0/Cargo.toml0000644000000033200000000000000107400ustar00# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g. crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. 
If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] edition = "2018" name = "unic-bidi" version = "0.9.0" authors = ["The UNIC Project Developers"] exclude = ["tests/conformance_tests.rs", "benches/bidi_udhr_benches.rs"] description = "UNIC — Unicode Bidirectional Algorithm" readme = "README.md" keywords = ["text", "unicode", "bidi", "rtl", "layout"] categories = ["internationalization", "text-processing", "parsing", "rendering"] license = "MIT/Apache-2.0" repository = "https://github.com/open-i18n/rust-unic/" [dependencies.matches] version = "0.1" [dependencies.serde] version = ">=0.8, <2.0" features = ["derive"] optional = true [dependencies.unic-ucd-bidi] version = "0.9.0" [dev-dependencies.serde_test] version = ">=0.8, <2.0" [dev-dependencies.unic-char-property] version = "0.9.0" [dev-dependencies.unic-ucd-version] version = "0.9.0" [features] bench_it = [] default = [] [badges.appveyor] branch = "master" repository = "open-i18n/rust-unic" service = "github" [badges.is-it-maintained-issue-resolution] repository = "open-i18n/rust-unic" [badges.is-it-maintained-open-issues] repository = "open-i18n/rust-unic" [badges.maintenance] status = "actively-developed" [badges.travis-ci] branch = "master" repository = "open-i18n/rust-unic" unic-bidi-0.9.0/README.md010064400007650000024000000011731343520353600130670ustar0000000000000000# UNIC — Unicode Bidirectional Algorithm [![Crates.io](https://img.shields.io/crates/v/unic-bidi.svg)](https://crates.io/crates/unic-bidi) [![Documentation](https://docs.rs/unic-bidi/badge.svg)](https://docs.rs/unic-bidi/) This UNIC component implements algorithms from [Unicode® Standard Annex #9 - Unicode Bidirectional Algorithm](http://unicode.org/reports/tr9/), a.k.a. *UBA*, used for display of mixed right-to-left and left-to-right text. It is written in safe Rust, compatible with the current stable release. 
## Notes Initial code for this component is based on [`unicode-bidi`](https://github.com/servo/unicode-bidi). unic-bidi-0.9.0/src/bidi_info.rs010064400007650000024000000664611343520353600147020ustar0000000000000000// Copyright 2015 The Servo Project Developers. // Copyright 2017 The UNIC Project Developers. // // See the COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use std::borrow::Cow; use std::cmp::{max, min}; use std::fmt; use std::iter::repeat; use std::ops::Range; use unic_ucd_bidi::bidi_class::abbr_names::*; use unic_ucd_bidi::BidiClass; use crate::explicit; use crate::format_chars; use crate::implicit; use crate::level; use crate::prepare; use crate::level::{Level, LTR_LEVEL, RTL_LEVEL}; use crate::prepare::LevelRun; /// Bidi information about a single paragraph #[derive(Clone, Debug, Eq, PartialEq, Hash)] pub struct ParagraphInfo { /// The paragraphs boundaries within the text, as byte indices. /// /// TODO: Shrink this to only include the starting index? pub range: Range, /// The paragraph embedding level. /// /// pub level: Level, } /// Initial bidi information of the text /// /// Contains the paragraphs and `BidiClass`es in a string of text. #[derive(Clone, Debug, Eq, PartialEq, Hash)] pub struct InitialInfo<'text> { /// The text pub text: &'text str, /// The BidiClass of the character at each byte in the text. /// If a character is multiple bytes, its class will appear multiple times in the vector. pub original_classes: Vec, /// The boundaries and level of each paragraph within the text. pub paragraphs: Vec, } impl<'text> InitialInfo<'text> { /// Find the paragraphs and `BidiClass`es in a string of text. 
/// /// /// /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong /// character is found before the matching PDI. If no strong character is found, the class will /// remain FSI, and it's up to later stages to treat these as LRI when needed. pub fn new(text: &str, default_para_level: Option) -> InitialInfo<'_> { let mut original_classes = Vec::with_capacity(text.len()); // The stack contains the starting byte index for each nested isolate we're inside. let mut isolate_stack = Vec::new(); let mut paragraphs = Vec::new(); let mut para_start = 0; let mut para_level = default_para_level; for (i, c) in text.char_indices() { let class = BidiClass::of(c); original_classes.extend(repeat(class).take(c.len_utf8())); match class { B => { // P1. Split the text into separate paragraphs. The paragraph separator is kept // with the previous paragraph. let para_end = i + c.len_utf8(); paragraphs.push(ParagraphInfo { range: para_start..para_end, // P3. If no character is found in p2, set the paragraph level to zero. level: para_level.unwrap_or(LTR_LEVEL), }); // Reset state for the start of the next paragraph. para_start = para_end; // TODO: Support defaulting to direction of previous paragraph // // para_level = default_para_level; isolate_stack.clear(); } L | R | AL => { match isolate_stack.last() { Some(&start) => { if original_classes[start] == FSI { // X5c. If the first strong character between FSI and its matching // PDI is R or AL, treat it as RLI. Otherwise, treat it as LRI. for j in 0..format_chars::FSI.len_utf8() { original_classes[start + j] = if class == L { LRI } else { RLI }; } } } None => { if para_level.is_none() { // P2. Find the first character of type L, AL, or R, while skipping // any characters between an isolate initiator and its matching // PDI. 
para_level = Some(if class != L { RTL_LEVEL } else { LTR_LEVEL }); } } } } RLI | LRI | FSI => { isolate_stack.push(i); } PDI => { isolate_stack.pop(); } _ => {} } } if para_start < text.len() { paragraphs.push(ParagraphInfo { range: para_start..text.len(), level: para_level.unwrap_or(LTR_LEVEL), }); } assert_eq!(original_classes.len(), text.len()); InitialInfo { text, original_classes, paragraphs, } } } /// Bidi information of the text /// /// The `original_classes` and `levels` vectors are indexed by byte offsets into the text. If a /// character is multiple bytes wide, then its class and level will appear multiple times in these /// vectors. // TODO: Impl `struct StringProperty { values: Vec }` and use instead of Vec #[derive(Debug, Eq, PartialEq, Hash)] pub struct BidiInfo<'text> { /// The text pub text: &'text str, /// The BidiClass of the character at each byte in the text. pub original_classes: Vec, /// The directional embedding level of each byte in the text. pub levels: Vec, /// The boundaries and paragraph embedding level of each paragraph within the text. /// /// TODO: Use SmallVec or similar to avoid overhead when there are only one or two paragraphs? /// Or just don't include the first paragraph, which always starts at 0? pub paragraphs: Vec, } impl<'text> BidiInfo<'text> { /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph. /// /// TODO: In early steps, check for special cases that allow later steps to be skipped. like /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison. /// /// TODO: Support auto-RTL base direction pub fn new(text: &str, default_para_level: Option) -> BidiInfo<'_> { let InitialInfo { original_classes, paragraphs, .. 
} = InitialInfo::new(text, default_para_level); let mut levels = Vec::::with_capacity(text.len()); let mut processing_classes = original_classes.clone(); for para in ¶graphs { let text = &text[para.range.clone()]; let original_classes = &original_classes[para.range.clone()]; let processing_classes = &mut processing_classes[para.range.clone()]; let new_len = levels.len() + para.range.len(); levels.resize(new_len, para.level); let levels = &mut levels[para.range.clone()]; explicit::compute( text, para.level, original_classes, levels, processing_classes, ); let sequences = prepare::isolating_run_sequences(para.level, original_classes, levels); for sequence in &sequences { implicit::resolve_weak(sequence, processing_classes); implicit::resolve_neutral(sequence, levels, processing_classes); } implicit::resolve_levels(processing_classes, levels); Self::assign_levels_to_removed_chars(para.level, original_classes, levels); } BidiInfo { text, original_classes, paragraphs, levels, } } /// Assign levels to characters removed by rule X9. /// /// The levels assigned to these characters are not specified by the algorithm. This function /// assigns each one the level of the previous character, to avoid breaking level runs. fn assign_levels_to_removed_chars( para_level: Level, classes: &[BidiClass], levels: &mut [Level], ) { for i in 0..levels.len() { if prepare::removed_by_x9(classes[i]) { levels[i] = if i > 0 { levels[i - 1] } else { para_level }; } } } /// Re-order a line based on resolved levels and return only the embedding levels, one `Level` /// per *byte*. pub fn reordered_levels(&self, para: &ParagraphInfo, line: Range) -> Vec { let (levels, _) = self.visual_runs(para, line.clone()); levels } /// Re-order a line based on resolved levels and return only the embedding levels, one `Level` /// per *character*. 
pub fn reordered_levels_per_char( &self, para: &ParagraphInfo, line: Range, ) -> Vec { let levels = self.reordered_levels(para, line); self.text.char_indices().map(|(i, _)| levels[i]).collect() } /// Re-order a line based on resolved levels and return the line in display order. pub fn reorder_line(&self, para: &ParagraphInfo, line: Range) -> Cow<'text, str> { let (levels, runs) = self.visual_runs(para, line.clone()); // If all isolating run sequences are LTR, no reordering is needed if runs.iter().all(|run| levels[run.start].is_ltr()) { return self.text[line.clone()].into(); } let mut result = String::with_capacity(line.len()); for run in runs { if levels[run.start].is_rtl() { result.extend(self.text[run].chars().rev()); } else { result.push_str(&self.text[run]); } } result.into() } /// Find the level runs within a line and return them in visual order. /// /// `line` is a range of bytes indices within `levels`. /// /// #[cfg_attr(feature = "cargo-clippy", allow(needless_range_loop))] pub fn visual_runs( &self, para: &ParagraphInfo, line: Range, ) -> (Vec, Vec) { assert!(line.start <= self.levels.len()); assert!(line.end <= self.levels.len()); let mut levels = self.levels.clone(); // Reset some whitespace chars to paragraph level. 
// let line_str: &str = &self.text[line.clone()]; let mut reset_from: Option = Some(0); let mut reset_to: Option = None; for (i, c) in line_str.char_indices() { match self.original_classes[i] { // Ignored by X9 RLE | LRE | RLO | LRO | PDF | BN => {} // Segment separator, Paragraph separator B | S => { assert_eq!(reset_to, None); reset_to = Some(i + c.len_utf8()); if reset_from == None { reset_from = Some(i); } } // Whitespace, isolate formatting WS | FSI | LRI | RLI | PDI => { if reset_from == None { reset_from = Some(i); } } _ => { reset_from = None; } } if let (Some(from), Some(to)) = (reset_from, reset_to) { for j in from..to { levels[j] = para.level; } reset_from = None; reset_to = None; } } if let Some(from) = reset_from { for j in from..line_str.len() { levels[j] = para.level; } } // Find consecutive level runs. let mut runs = Vec::new(); let mut start = line.start; let mut level = levels[start]; let mut min_level = level; let mut max_level = level; for i in (start + 1)..line.end { let new_level = levels[i]; if new_level != level { // End of the previous run, start of a new one. runs.push(start..i); start = i; level = new_level; min_level = min(level, min_level); max_level = max(level, max_level); } } runs.push(start..line.end); let run_count = runs.len(); // Re-order the odd runs. // // Stop at the lowest *odd* level. min_level = min_level.new_lowest_ge_rtl().expect("Level error"); while max_level >= min_level { // Look for the start of a sequence of consecutive runs of max_level or higher. let mut seq_start = 0; while seq_start < run_count { if self.levels[runs[seq_start].start] < max_level { seq_start += 1; continue; } // Found the start of a sequence. Now find the end. let mut seq_end = seq_start + 1; while seq_end < run_count { if self.levels[runs[seq_end].start] < max_level { break; } seq_end += 1; } // Reverse the runs within this sequence. 
runs[seq_start..seq_end].reverse(); seq_start = seq_end; } max_level .lower(1) .expect("Lowering embedding level below zero"); } (levels, runs) } /// If processed text has any computed RTL levels /// /// This information is usually used to skip re-ordering of text when no RTL level is present #[inline] pub fn has_rtl(&self) -> bool { level::has_rtl(&self.levels) } } impl<'text> fmt::Display for BidiInfo<'text> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{} paragraphs with a maximum bidirectional level of {}", self.paragraphs.len(), self.levels.iter().max().unwrap_or(&Level::ltr()), ) } } #[cfg(test)] mod tests { use super::*; #[test] fn test_initial_text_info() { let text = "a1"; assert_eq!( InitialInfo::new(text, None), InitialInfo { text: &text, original_classes: vec![L, EN], paragraphs: vec![ParagraphInfo { range: 0..2, level: LTR_LEVEL, },], } ); let text = "غ א"; assert_eq!( InitialInfo::new(text, None), InitialInfo { text: &text, original_classes: vec![AL, AL, WS, R, R], paragraphs: vec![ParagraphInfo { range: 0..5, level: RTL_LEVEL, },], } ); let text = "a\u{2029}b"; assert_eq!( InitialInfo::new(text, None), InitialInfo { text: &text, original_classes: vec![L, B, B, B, L], paragraphs: vec![ ParagraphInfo { range: 0..4, level: LTR_LEVEL, }, ParagraphInfo { range: 4..5, level: LTR_LEVEL, }, ], } ); let text = format!("{}א{}a", format_chars::FSI, format_chars::PDI); assert_eq!( InitialInfo::new(&text, None), InitialInfo { text: &text, original_classes: vec![RLI, RLI, RLI, R, R, PDI, PDI, PDI, L], paragraphs: vec![ParagraphInfo { range: 0..9, level: LTR_LEVEL, },], } ); } #[test] fn test_bidi_info() { let text = "abc123"; assert_eq!( BidiInfo::new(text, Some(LTR_LEVEL)), BidiInfo { text: &text, levels: Level::vec(&[0, 0, 0, 0, 0, 0]), original_classes: vec![L, L, L, EN, EN, EN], paragraphs: vec![ParagraphInfo { range: 0..6, level: LTR_LEVEL, },], } ); let text = "abc אבג"; assert_eq!( BidiInfo::new(text, Some(LTR_LEVEL)), BidiInfo { 
text: &text, levels: Level::vec(&[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]), original_classes: vec![L, L, L, WS, R, R, R, R, R, R], paragraphs: vec![ParagraphInfo { range: 0..10, level: LTR_LEVEL, },], } ); assert_eq!( BidiInfo::new(text, Some(RTL_LEVEL)), BidiInfo { text: &text, levels: Level::vec(&[2, 2, 2, 1, 1, 1, 1, 1, 1, 1]), original_classes: vec![L, L, L, WS, R, R, R, R, R, R], paragraphs: vec![ParagraphInfo { range: 0..10, level: RTL_LEVEL, },], } ); let text = "אבג abc"; assert_eq!( BidiInfo::new(text, Some(LTR_LEVEL)), BidiInfo { text: &text, levels: Level::vec(&[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]), original_classes: vec![R, R, R, R, R, R, WS, L, L, L], paragraphs: vec![ParagraphInfo { range: 0..10, level: LTR_LEVEL, },], } ); assert_eq!( BidiInfo::new(text, None), BidiInfo { text: &text, levels: Level::vec(&[1, 1, 1, 1, 1, 1, 1, 2, 2, 2]), original_classes: vec![R, R, R, R, R, R, WS, L, L, L], paragraphs: vec![ParagraphInfo { range: 0..10, level: RTL_LEVEL, },], } ); let text = "غ2ظ א2ג"; assert_eq!( BidiInfo::new(text, Some(LTR_LEVEL)), BidiInfo { text: &text, levels: Level::vec(&[1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1]), original_classes: vec![AL, AL, EN, AL, AL, WS, R, R, EN, R, R], paragraphs: vec![ParagraphInfo { range: 0..11, level: LTR_LEVEL, },], } ); let text = "a א.\nג"; assert_eq!( BidiInfo::new(text, None), BidiInfo { text: &text, original_classes: vec![L, WS, R, R, CS, B, R, R], levels: Level::vec(&[0, 0, 1, 1, 0, 0, 1, 1]), paragraphs: vec![ ParagraphInfo { range: 0..6, level: LTR_LEVEL, }, ParagraphInfo { range: 6..8, level: RTL_LEVEL, }, ], } ); // BidiTest:69635 (AL ET EN) let bidi_info = BidiInfo::new("\u{060B}\u{20CF}\u{06F9}", None); assert_eq!(bidi_info.original_classes, vec![AL, AL, ET, ET, ET, EN, EN]); } #[test] fn test_bidi_info_has_rtl() { // ASCII only assert_eq!(BidiInfo::new("123", None).has_rtl(), false); assert_eq!(BidiInfo::new("123", Some(LTR_LEVEL)).has_rtl(), false); assert_eq!(BidiInfo::new("123", Some(RTL_LEVEL)).has_rtl(), false); 
assert_eq!(BidiInfo::new("abc", None).has_rtl(), false); assert_eq!(BidiInfo::new("abc", Some(LTR_LEVEL)).has_rtl(), false); assert_eq!(BidiInfo::new("abc", Some(RTL_LEVEL)).has_rtl(), false); assert_eq!(BidiInfo::new("abc 123", None).has_rtl(), false); assert_eq!(BidiInfo::new("abc\n123", None).has_rtl(), false); // With Hebrew assert_eq!(BidiInfo::new("אבּג", None).has_rtl(), true); assert_eq!(BidiInfo::new("אבּג", Some(LTR_LEVEL)).has_rtl(), true); assert_eq!(BidiInfo::new("אבּג", Some(RTL_LEVEL)).has_rtl(), true); assert_eq!(BidiInfo::new("abc אבּג", None).has_rtl(), true); assert_eq!(BidiInfo::new("abc\nאבּג", None).has_rtl(), true); assert_eq!(BidiInfo::new("אבּג abc", None).has_rtl(), true); assert_eq!(BidiInfo::new("אבּג\nabc", None).has_rtl(), true); assert_eq!(BidiInfo::new("אבּג 123", None).has_rtl(), true); assert_eq!(BidiInfo::new("אבּג\n123", None).has_rtl(), true); } fn reorder_paras(text: &str) -> Vec> { let bidi_info = BidiInfo::new(text, None); bidi_info .paragraphs .iter() .map(|para| bidi_info.reorder_line(para, para.range.clone())) .collect() } #[test] fn test_reorder_line() { /// Bidi_Class: L L L B L L L B L L L assert_eq!( reorder_paras("abc\ndef\nghi"), vec!["abc\n", "def\n", "ghi"] ); /// Bidi_Class: L L EN B L L EN B L L EN assert_eq!( reorder_paras("ab1\nde2\ngh3"), vec!["ab1\n", "de2\n", "gh3"] ); /// Bidi_Class: L L L B AL AL AL assert_eq!(reorder_paras("abc\nابج"), vec!["abc\n", "جبا"]); /// Bidi_Class: AL AL AL B L L L assert_eq!(reorder_paras("ابج\nabc"), vec!["\nجبا", "abc"]); assert_eq!(reorder_paras("1.-2"), vec!["1.-2"]); assert_eq!(reorder_paras("1-.2"), vec!["1-.2"]); assert_eq!(reorder_paras("abc אבג"), vec!["abc גבא"]); // Numbers being weak LTR characters, cannot reorder strong RTL assert_eq!(reorder_paras("123 אבג"), vec!["גבא 123"]); assert_eq!(reorder_paras("abc\u{202A}def"), vec!["abc\u{202A}def"]); assert_eq!( reorder_paras("abc\u{202A}def\u{202C}ghi"), vec!["abc\u{202A}def\u{202C}ghi"] ); assert_eq!( 
reorder_paras("abc\u{2066}def\u{2069}ghi"), vec!["abc\u{2066}def\u{2069}ghi"] ); // Testing for RLE Character assert_eq!( reorder_paras("\u{202B}abc אבג\u{202C}"), vec!["\u{202B}\u{202C}גבא abc"] ); // Testing neutral characters assert_eq!(reorder_paras("אבג? אבג"), vec!["גבא ?גבא"]); // Testing neutral characters with special case assert_eq!(reorder_paras("A אבג?"), vec!["A גבא?"]); // Testing neutral characters with Implicit RTL Marker assert_eq!( reorder_paras("A אבג?\u{200F}"), vec!["A \u{200F}?גבא"] ); assert_eq!(reorder_paras("אבג abc"), vec!["abc גבא"]); assert_eq!( reorder_paras("abc\u{2067}.-\u{2069}ghi"), vec!["abc\u{2067}-.\u{2069}ghi"] ); assert_eq!( reorder_paras("Hello, \u{2068}\u{202E}world\u{202C}\u{2069}!"), vec!["Hello, \u{2068}\u{202E}\u{202C}dlrow\u{2069}!"] ); // With mirrorable characters in RTL run assert_eq!(reorder_paras("א(ב)ג."), vec![".ג)ב(א"]); // With mirrorable characters on level boundry assert_eq!( reorder_paras("אב(גד[&ef].)gh"), vec!["ef].)gh&[דג(בא"] ); } fn reordered_levels_for_paras(text: &str) -> Vec> { let bidi_info = BidiInfo::new(text, None); bidi_info .paragraphs .iter() .map(|para| bidi_info.reordered_levels(para, para.range.clone())) .collect() } fn reordered_levels_per_char_for_paras(text: &str) -> Vec> { let bidi_info = BidiInfo::new(text, None); bidi_info .paragraphs .iter() .map(|para| bidi_info.reordered_levels_per_char(para, para.range.clone())) .collect() } #[test] fn test_reordered_levels() { // BidiTest:946 (LRI PDI) let text = "\u{2067}\u{2069}"; assert_eq!( reordered_levels_for_paras(text), vec![Level::vec(&[0, 0, 0, 0, 0, 0])] ); assert_eq!( reordered_levels_per_char_for_paras(text), vec![Level::vec(&[0, 0])] ); /* TODO // BidiTest:69635 (AL ET EN) let text = "\u{060B}\u{20CF}\u{06F9}"; assert_eq!( reordered_levels_for_paras(text), vec![Level::vec(&[1, 1, 1, 1, 1, 2, 2])] ); assert_eq!( reordered_levels_per_char_for_paras(text), vec![Level::vec(&[1, 1, 2])] ); */ /* TODO // BidiTest:291284 (AN RLI PDF R) 
assert_eq!( reordered_levels_per_char_for_paras("\u{0605}\u{2067}\u{202C}\u{0590}"), vec![&["2", "0", "x", "1"]] ); */ } #[test] fn test_display() { assert_eq!( format!("{}", BidiInfo::new("", None)), "0 paragraphs with a maximum bidirectional level of 0" ); assert_eq!( format!("{}", BidiInfo::new("abc\nאבּג", None)), "2 paragraphs with a maximum bidirectional level of 1" ); } } #[cfg(all(feature = "serde", test))] mod serde_tests { use super::*; use serde_test::{assert_tokens, Token}; #[test] fn test_levels() { let text = "abc אבג"; let bidi_info = BidiInfo::new(text, None); let levels = bidi_info.levels; assert_eq!(text.as_bytes().len(), 10); assert_eq!(levels.len(), 10); assert_tokens( &levels, &[ Token::Seq { len: Some(10) }, Token::NewtypeStruct { name: "Level" }, Token::U8(0), Token::NewtypeStruct { name: "Level" }, Token::U8(0), Token::NewtypeStruct { name: "Level" }, Token::U8(0), Token::NewtypeStruct { name: "Level" }, Token::U8(0), Token::NewtypeStruct { name: "Level" }, Token::U8(1), Token::NewtypeStruct { name: "Level" }, Token::U8(1), Token::NewtypeStruct { name: "Level" }, Token::U8(1), Token::NewtypeStruct { name: "Level" }, Token::U8(1), Token::NewtypeStruct { name: "Level" }, Token::U8(1), Token::NewtypeStruct { name: "Level" }, Token::U8(1), Token::SeqEnd, ], ); } } unic-bidi-0.9.0/src/explicit.rs010064400007650000024000000147321343520353600145730ustar0000000000000000// Copyright 2015 The Servo Project Developers. // Copyright 2017 The UNIC Project Developers. // // See the COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! 3.3.2 Explicit Levels and Directions //! //! use unic_ucd_bidi::bidi_class::abbr_names::*; use unic_ucd_bidi::BidiClass; use super::level::Level; /// Compute explicit embedding levels for one paragraph of text (X1-X8). 
/// /// `processing_classes[i]` must contain the `BidiClass` of the char at byte index `i`, /// for each char in `text`. pub fn compute( text: &str, para_level: Level, original_classes: &[BidiClass], levels: &mut [Level], processing_classes: &mut [BidiClass], ) { assert_eq!(text.len(), original_classes.len()); // let mut stack = DirectionalStatusStack::new(); stack.push(para_level, OverrideStatus::Neutral); let mut overflow_isolate_count = 0u32; let mut overflow_embedding_count = 0u32; let mut valid_isolate_count = 0u32; for (i, c) in text.char_indices() { match original_classes[i] { // Rules X2-X5c RLE | LRE | RLO | LRO | RLI | LRI | FSI => { let last_level = stack.last().level; // X5a-X5c: Isolate initiators get the level of the last entry on the stack. let is_isolate = matches!(original_classes[i], RLI | LRI | FSI); if is_isolate { levels[i] = last_level; match stack.last().status { OverrideStatus::RTL => processing_classes[i] = R, OverrideStatus::LTR => processing_classes[i] = L, _ => {} } } let new_level = if original_classes[i].is_rtl() { last_level.new_explicit_next_rtl() } else { last_level.new_explicit_next_ltr() }; if new_level.is_ok() && overflow_isolate_count == 0 && overflow_embedding_count == 0 { let new_level = new_level.unwrap(); stack.push( new_level, match original_classes[i] { RLO => OverrideStatus::RTL, LRO => OverrideStatus::LTR, RLI | LRI | FSI => OverrideStatus::Isolate, _ => OverrideStatus::Neutral, }, ); if is_isolate { valid_isolate_count += 1; } else { // The spec doesn't explicitly mention this step, but it is necessary. // See the reference implementations for comparison. levels[i] = new_level; } } else if is_isolate { overflow_isolate_count += 1; } else if overflow_isolate_count == 0 { overflow_embedding_count += 1; } } // PDI => { if overflow_isolate_count > 0 { overflow_isolate_count -= 1; } else if valid_isolate_count > 0 { overflow_embedding_count = 0; loop { // Pop everything up to and including the last Isolate status. 
match stack.vec.pop() { None | Some(Status { status: OverrideStatus::Isolate, .. }) => break, _ => continue, } } valid_isolate_count -= 1; } let last = stack.last(); levels[i] = last.level; match last.status { OverrideStatus::RTL => processing_classes[i] = R, OverrideStatus::LTR => processing_classes[i] = L, _ => {} } } // PDF => { if overflow_isolate_count > 0 { continue; } if overflow_embedding_count > 0 { overflow_embedding_count -= 1; continue; } if stack.last().status != OverrideStatus::Isolate && stack.vec.len() >= 2 { stack.vec.pop(); } // The spec doesn't explicitly mention this step, but it is necessary. // See the reference implementations for comparison. levels[i] = stack.last().level; } // Nothing B | BN => {} // _ => { let last = stack.last(); levels[i] = last.level; match last.status { OverrideStatus::RTL => processing_classes[i] = R, OverrideStatus::LTR => processing_classes[i] = L, _ => {} } } } // Handle multi-byte characters. for j in 1..c.len_utf8() { levels[i + j] = levels[i]; processing_classes[i + j] = processing_classes[i]; } } } /// Entries in the directional status stack: struct Status { level: Level, status: OverrideStatus, } #[derive(PartialEq)] enum OverrideStatus { Neutral, RTL, LTR, Isolate, } struct DirectionalStatusStack { vec: Vec, } impl DirectionalStatusStack { fn new() -> Self { DirectionalStatusStack { vec: Vec::with_capacity(Level::max_explicit_depth() as usize + 2), } } fn push(&mut self, level: Level, status: OverrideStatus) { self.vec.push(Status { level, status }); } fn last(&self) -> &Status { self.vec.last().unwrap() } } unic-bidi-0.9.0/src/format_chars.rs010064400007650000024000000026121343520353600154140ustar0000000000000000// Copyright 2015 The Servo Project Developers. // Copyright 2017 The UNIC Project Developers. // // See the COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. 
This file may not be copied, modified, or distributed // except according to those terms. //! Directional Formatting Characters //! //! // == Implicit == /// ARABIC LETTER MARK pub const ALM: char = '\u{061C}'; /// LEFT-TO-RIGHT MARK pub const LRM: char = '\u{200E}'; /// RIGHT-TO-LEFT MARK pub const RLM: char = '\u{200F}'; // == Explicit Isolates == /// LEFT‑TO‑RIGHT ISOLATE pub const LRI: char = '\u{2066}'; /// RIGHT‑TO‑LEFT ISOLATE pub const RLI: char = '\u{2067}'; /// FIRST STRONG ISOLATE pub const FSI: char = '\u{2068}'; /// POP DIRECTIONAL ISOLATE pub const PDI: char = '\u{2069}'; // == Explicit Embeddings and Overrides == /// LEFT-TO-RIGHT EMBEDDING pub const LRE: char = '\u{202A}'; /// RIGHT-TO-LEFT EMBEDDING pub const RLE: char = '\u{202B}'; /// POP DIRECTIONAL FORMATTING pub const PDF: char = '\u{202C}'; /// LEFT-TO-RIGHT OVERRIDE pub const LRO: char = '\u{202D}'; /// RIGHT-TO-LEFT OVERRIDE pub const RLO: char = '\u{202E}'; unic-bidi-0.9.0/src/implicit.rs010064400007650000024000000176731343520353600145730ustar0000000000000000// Copyright 2015 The Servo Project Developers. // Copyright 2017 The UNIC Project Developers. // // See the COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! 3.3.4 - 3.3.6. Resolve implicit levels and types. use std::cmp::max; use unic_ucd_bidi::bidi_class::abbr_names::*; use unic_ucd_bidi::BidiClass; use super::level::Level; use super::prepare::{not_removed_by_x9, removed_by_x9, IsolatingRunSequence, LevelRun}; /// 3.3.4 Resolving Weak Types /// /// pub fn resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [BidiClass]) { // FIXME (#8): This function applies steps W1-W6 in a single pass. 
This can produce // incorrect results in cases where a "later" rule changes the value of `prev_class` seen // by an "earlier" rule. We should either split this into separate passes, or preserve // extra state so each rule can see the correct previous class. // FIXME: Also, this could be the cause of increased failure for using longer-UTF-8 chars in // conformance tests, like BidiTest:69635 (AL ET EN) let mut prev_class = sequence.sos; let mut last_strong_is_al = false; let mut et_run_indices = Vec::new(); // for W5 // Like sequence.runs.iter().flat_map(Clone::clone), but make indices itself clonable. fn id(x: LevelRun) -> LevelRun { x } let mut indices = sequence .runs .iter() .cloned() .flat_map(id as fn(LevelRun) -> LevelRun); while let Some(i) = indices.next() { match processing_classes[i] { // https://www.unicode.org/reports/tr9/#W1 NSM => { processing_classes[i] = match prev_class { RLI | LRI | FSI | PDI => ON, _ => prev_class, }; } EN => { if last_strong_is_al { // W2. If previous strong char was AL, change EN to AN. processing_classes[i] = AN; } else { // W5. If a run of ETs is adjacent to an EN, change the ETs to EN. for j in &et_run_indices { processing_classes[*j] = EN; } et_run_indices.clear(); } } // AL => processing_classes[i] = R, // ES | CS => { let next_class = indices .clone() .map(|j| processing_classes[j]) .find(not_removed_by_x9) .unwrap_or(sequence.eos); processing_classes[i] = match (prev_class, processing_classes[i], next_class) { (EN, ES, EN) | (EN, CS, EN) => EN, (AN, CS, AN) => AN, (_, _, _) => ON, } } // ET => { match prev_class { EN => processing_classes[i] = EN, _ => et_run_indices.push(i), // In case this is followed by an EN. } } class => { if removed_by_x9(class) { continue; } } } prev_class = processing_classes[i]; match prev_class { L | R => { last_strong_is_al = false; } AL => { last_strong_is_al = true; } _ => {} } if prev_class != ET { // W6. If we didn't find an adjacent EN, turn any ETs into ON instead. 
for j in &et_run_indices { processing_classes[*j] = ON; } et_run_indices.clear(); } } // W7. If the previous strong char was L, change EN to L. let mut last_strong_is_l = sequence.sos == L; for run in &sequence.runs { for i in run.clone() { match processing_classes[i] { EN if last_strong_is_l => { processing_classes[i] = L; } L => { last_strong_is_l = true; } R | AL => { last_strong_is_l = false; } _ => {} } } } } /// 3.3.5 Resolving Neutral Types /// /// pub fn resolve_neutral( sequence: &IsolatingRunSequence, levels: &[Level], processing_classes: &mut [BidiClass], ) { let e: BidiClass = levels[sequence.runs[0].start].bidi_class(); let mut indices = sequence.runs.iter().flat_map(Clone::clone); let mut prev_class = sequence.sos; while let Some(mut i) = indices.next() { // N0. Process bracket pairs. // TODO // Process sequences of NI characters. let mut ni_run = Vec::new(); if is_NI(processing_classes[i]) { // Consume a run of consecutive NI characters. ni_run.push(i); let mut next_class; loop { match indices.next() { Some(j) => { i = j; if removed_by_x9(processing_classes[i]) { continue; } next_class = processing_classes[j]; if is_NI(next_class) { ni_run.push(i); } else { break; } } None => { next_class = sequence.eos; break; } }; } // N1-N2. // // // let new_class = match (prev_class, next_class) { (L, L) => L, (R, R) | (R, AN) | (R, EN) | (AN, R) | (AN, AN) | (AN, EN) | (EN, R) | (EN, AN) | (EN, EN) => R, (_, _) => e, }; for j in &ni_run { processing_classes[*j] = new_class; } ni_run.clear(); } prev_class = processing_classes[i]; } } /// 3.3.6 Resolving Implicit Levels /// /// Returns the maximum embedding level in the paragraph. 
/// /// pub fn resolve_levels(original_classes: &[BidiClass], levels: &mut [Level]) -> Level { let mut max_level = Level::ltr(); assert_eq!(original_classes.len(), levels.len()); for i in 0..levels.len() { // // match (levels[i].is_rtl(), original_classes[i]) { (true, L) | (false, R) | (true, EN) | (true, AN) => { levels[i].raise(1).expect("Level number error") } (false, AN) | (false, EN) => levels[i].raise(2).expect("Level number error"), (_, _) => {} } max_level = max(max_level, levels[i]); } max_level } /// Neutral or Isolate formatting character (B, S, WS, ON, FSI, LRI, RLI, PDI) /// /// #[allow(non_snake_case)] fn is_NI(class: BidiClass) -> bool { matches!(class, B | S | WS | ON | FSI | LRI | RLI | PDI) } unic-bidi-0.9.0/src/level.rs010064400007650000024000000267011343520353600140600ustar0000000000000000// Copyright 2017 The Servo Project Developers. // Copyright 2017 The UNIC Project Developers. // // See the COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! Bidi Embedding Level //! //! See [`Level`](struct.Level.html) for more details. //! //! use std::convert::{From, Into}; use std::fmt; use unic_ucd_bidi::BidiClass; /// Embedding Level /// /// Embedding Levels are numbers between 0 and 126 (inclusive), where even values denote a /// left-to-right (LTR) direction and odd values a right-to-left (RTL) direction. /// /// This struct maintains a *valid* status for level numbers, meaning that creating a new level, or /// mutating an existing level, with the value smaller than `0` (before conversion to `u8`) or /// larger than 125 results in an `Error`. /// /// #[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct Level(u8); /// LTR level with smallest number value (0). 
pub const LTR_LEVEL: Level = Level(0); /// RTL level with smallest number value (0). pub const RTL_LEVEL: Level = Level(1); const MAX_DEPTH: u8 = 125; /// During explicit level resolution, embedding level can go as high as `max_depth`. pub const MAX_EXPLICIT_DEPTH: u8 = MAX_DEPTH; /// During implicit level resolution, embedding level can go as high as `max_depth + 1`. pub const MAX_IMPLICIT_DEPTH: u8 = MAX_DEPTH + 1; /// Errors that can occur on Level creation or mutation #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] pub enum Error { /// Out-of-range (invalid) embedding level number. OutOfRangeNumber, } impl Level { /// New LTR level with smallest number value (0). #[inline] pub fn ltr() -> Level { LTR_LEVEL } /// New RTL level with smallest number value (1). #[inline] pub fn rtl() -> Level { RTL_LEVEL } /// Maximum depth of the directional status stack during implicit resolutions. pub fn max_implicit_depth() -> u8 { MAX_IMPLICIT_DEPTH } /// Maximum depth of the directional status stack during explicit resolutions. pub fn max_explicit_depth() -> u8 { MAX_EXPLICIT_DEPTH } // == Inquiries == /// Create new level, fail if number is larger than `max_depth + 1`. #[inline] pub fn new(number: u8) -> Result { if number <= MAX_IMPLICIT_DEPTH { Ok(Level(number)) } else { Err(Error::OutOfRangeNumber) } } /// Create new level, fail if number is larger than `max_depth`. #[inline] pub fn new_explicit(number: u8) -> Result { if number <= MAX_EXPLICIT_DEPTH { Ok(Level(number)) } else { Err(Error::OutOfRangeNumber) } } // == Inquiries == /// The level number. #[inline] pub fn number(&self) -> u8 { self.0 } /// If this level is left-to-right. #[inline] pub fn is_ltr(&self) -> bool { self.0 % 2 == 0 } /// If this level is right-to-left. #[inline] pub fn is_rtl(&self) -> bool { self.0 % 2 == 1 } // == Mutators == /// Raise level by `amount`, fail if number is larger than `max_depth + 1`. 
#[inline] pub fn raise(&mut self, amount: u8) -> Result<(), Error> { match self.0.checked_add(amount) { Some(number) => { if number <= MAX_IMPLICIT_DEPTH { self.0 = number; Ok(()) } else { Err(Error::OutOfRangeNumber) } } None => Err(Error::OutOfRangeNumber), } } /// Raise level by `amount`, fail if number is larger than `max_depth`. #[inline] pub fn raise_explicit(&mut self, amount: u8) -> Result<(), Error> { match self.0.checked_add(amount) { Some(number) => { if number <= MAX_EXPLICIT_DEPTH { self.0 = number; Ok(()) } else { Err(Error::OutOfRangeNumber) } } None => Err(Error::OutOfRangeNumber), } } /// Lower level by `amount`, fail if number goes below zero. #[inline] pub fn lower(&mut self, amount: u8) -> Result<(), Error> { match self.0.checked_sub(amount) { Some(number) => { self.0 = number; Ok(()) } None => Err(Error::OutOfRangeNumber), } } // == Helpers == /// The next LTR (even) level greater than this, or fail if number is larger than `max_depth`. #[inline] pub fn new_explicit_next_ltr(&self) -> Result { Level::new_explicit((self.0 + 2) & !1) } /// The next RTL (odd) level greater than this, or fail if number is larger than `max_depth`. #[inline] pub fn new_explicit_next_rtl(&self) -> Result { Level::new_explicit((self.0 + 1) | 1) } /// The lowest RTL (odd) level greater than or equal to this, or fail if number is larger than /// `max_depth + 1`. #[inline] pub fn new_lowest_ge_rtl(&self) -> Result { Level::new(self.0 | 1) } /// Generate a character type based on a level (as specified in steps X10 and N2). 
#[inline] pub fn bidi_class(&self) -> BidiClass { if self.is_rtl() { BidiClass::RightToLeft } else { BidiClass::LeftToRight } } /// Create a `Vec` from a slice of `u8` numbers pub fn vec(v: &[u8]) -> Vec { v.iter().map(|&x| x.into()).collect() } } impl fmt::Display for Level { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.0) } } /// If levels has any RTL (odd) level /// /// This information is usually used to skip re-ordering of text when no RTL level is present #[inline] pub fn has_rtl(levels: &[Level]) -> bool { levels.iter().any(|&lvl| lvl.is_rtl()) } impl Into for Level { /// Convert to the level number #[inline] fn into(self) -> u8 { self.number() } } impl From for Level { /// Create level by number #[inline] fn from(number: u8) -> Level { Level::new(number).expect("Level number error") } } /// Used for matching levels in conformance tests impl<'a> PartialEq<&'a str> for Level { #[inline] fn eq(&self, s: &&'a str) -> bool { *s == "x" || *s == self.0.to_string() } } /// Used for matching levels in conformance tests impl<'a> PartialEq for Level { #[inline] fn eq(&self, s: &String) -> bool { self == &s.as_str() } } #[cfg(test)] mod tests { use super::*; #[test] fn test_new() { assert_eq!(Level::new(0), Ok(Level(0))); assert_eq!(Level::new(1), Ok(Level(1))); assert_eq!(Level::new(10), Ok(Level(10))); assert_eq!(Level::new(125), Ok(Level(125))); assert_eq!(Level::new(126), Ok(Level(126))); assert_eq!(Level::new(127), Err(Error::OutOfRangeNumber)); assert_eq!(Level::new(255), Err(Error::OutOfRangeNumber)); } #[test] fn test_new_explicit() { assert_eq!(Level::new_explicit(0), Ok(Level(0))); assert_eq!(Level::new_explicit(1), Ok(Level(1))); assert_eq!(Level::new_explicit(10), Ok(Level(10))); assert_eq!(Level::new_explicit(125), Ok(Level(125))); assert_eq!(Level::new_explicit(126), Err(Error::OutOfRangeNumber)); assert_eq!(Level::new_explicit(255), Err(Error::OutOfRangeNumber)); } #[test] fn test_is_ltr() { 
// Unit tests for `Level`: construction limits, direction queries, mutators and conversions.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_new() {
        assert_eq!(Level::new(0), Ok(Level(0)));
        assert_eq!(Level::new(1), Ok(Level(1)));
        assert_eq!(Level::new(10), Ok(Level(10)));
        assert_eq!(Level::new(125), Ok(Level(125)));
        assert_eq!(Level::new(126), Ok(Level(126)));
        assert_eq!(Level::new(127), Err(Error::OutOfRangeNumber));
        assert_eq!(Level::new(255), Err(Error::OutOfRangeNumber));
    }

    #[test]
    fn test_new_explicit() {
        assert_eq!(Level::new_explicit(0), Ok(Level(0)));
        assert_eq!(Level::new_explicit(1), Ok(Level(1)));
        assert_eq!(Level::new_explicit(10), Ok(Level(10)));
        assert_eq!(Level::new_explicit(125), Ok(Level(125)));
        assert_eq!(Level::new_explicit(126), Err(Error::OutOfRangeNumber));
        assert_eq!(Level::new_explicit(255), Err(Error::OutOfRangeNumber));
    }

    #[test]
    fn test_is_ltr() {
        assert!(Level(0).is_ltr());
        assert!(!Level(1).is_ltr());
        assert!(Level(10).is_ltr());
        assert!(!Level(11).is_ltr());
        assert!(Level(124).is_ltr());
        assert!(!Level(125).is_ltr());
    }

    #[test]
    fn test_is_rtl() {
        assert!(!Level(0).is_rtl());
        assert!(Level(1).is_rtl());
        assert!(!Level(10).is_rtl());
        assert!(Level(11).is_rtl());
        assert!(!Level(124).is_rtl());
        assert!(Level(125).is_rtl());
    }

    #[test]
    fn test_raise() {
        let mut level = Level::ltr();
        assert_eq!(level.number(), 0);
        assert!(level.raise(100).is_ok());
        assert_eq!(level.number(), 100);
        assert!(level.raise(26).is_ok());
        assert_eq!(level.number(), 126);
        assert!(level.raise(1).is_err()); // invalid!
        assert!(level.raise(250).is_err()); // overflow!
        assert_eq!(level.number(), 126);
    }

    #[test]
    fn test_raise_explicit() {
        let mut level = Level::ltr();
        assert_eq!(level.number(), 0);
        assert!(level.raise_explicit(100).is_ok());
        assert_eq!(level.number(), 100);
        assert!(level.raise_explicit(25).is_ok());
        assert_eq!(level.number(), 125);
        assert!(level.raise_explicit(1).is_err()); // invalid!
        assert!(level.raise_explicit(250).is_err()); // overflow!
        assert_eq!(level.number(), 125);
    }

    #[test]
    fn test_lower() {
        let mut level = Level::rtl();
        assert_eq!(level.number(), 1);
        assert!(level.lower(1).is_ok());
        assert_eq!(level.number(), 0);
        assert!(level.lower(1).is_err()); // underflow!
        assert!(level.lower(250).is_err()); // underflow!
        assert_eq!(level.number(), 0);
    }

    #[test]
    fn test_has_rtl() {
        assert!(!has_rtl(&Level::vec(&[0, 0, 0])));
        assert!(has_rtl(&Level::vec(&[0, 1, 0])));
        assert!(!has_rtl(&Level::vec(&[0, 2, 0])));
        assert!(has_rtl(&Level::vec(&[0, 125, 0])));
        assert!(!has_rtl(&Level::vec(&[0, 126, 0])));
    }

    #[test]
    fn test_into() {
        let level = Level::rtl();
        assert_eq!(1u8, level.into());
    }

    #[test]
    fn test_vec() {
        assert_eq!(
            Level::vec(&[0, 1, 125]),
            vec![Level(0), Level(1), Level(125)]
        );
    }

    #[test]
    fn test_str_eq() {
        // "x" is a wildcard that matches any level.
        assert_eq!(Level::vec(&[0, 1, 4, 125]), vec!["0", "1", "x", "125"]);
        assert_ne!(Level::vec(&[0, 1, 4, 125]), vec!["0", "1", "5", "125"]);
    }

    #[test]
    fn test_string_eq() {
        assert_eq!(
            Level::vec(&[0, 1, 4, 125]),
            vec![
                "0".to_string(),
                "1".to_string(),
                "x".to_string(),
                "125".to_string(),
            ]
        );
    }
}

// Serialization round-trip tests; only built when the `serde` feature is enabled.
#[cfg(all(feature = "serde", test))]
mod serde_tests {
    use super::*;
    use serde_test::{assert_tokens, Token};

    #[test]
    fn test_statics() {
        assert_tokens(
            &Level::ltr(),
            &[Token::NewtypeStruct { name: "Level" }, Token::U8(0)],
        );
        assert_tokens(
            &Level::rtl(),
            &[Token::NewtypeStruct { name: "Level" }, Token::U8(1)],
        );
    }

    #[test]
    fn test_new() {
        let level = Level::new(42).unwrap();
        assert_tokens(
            &level,
            &[Token::NewtypeStruct { name: "Level" }, Token::U8(42)],
        );
    }
}
//! A component of [`unic`: Unicode and Internationalization Crates for Rust](/unic/).
//!
//! This UNIC component implements algorithms from [Unicode Standard Annex #9 - Unicode
//! Bidirectional Algorithm](https://www.unicode.org/reports/tr9/), a.k.a. *UBA*, used for
//! display of mixed right-to-left and left-to-right text. It is written in safe Rust,
//! compatible with the current stable release.
//!
//!
//! ## Example
//!
//! ```rust
//! use unic_bidi::BidiInfo;
//!
//! // This example text is defined using `concat!` because some browsers
//! // and text editors have trouble displaying bidi strings.
//! let text = concat![
//!     "א",
//!     "ב",
//!     "ג",
//!     "a",
//!     "b",
//!     "c",
//! ];
//!
//! // Resolve embedding levels within the text. Pass `None` to detect the
//! // paragraph level automatically.
//! let bidi_info = BidiInfo::new(&text, None);
//!
//! // This paragraph has embedding level 1 because its first strong character is RTL.
//! assert_eq!(bidi_info.paragraphs.len(), 1);
//! let para = &bidi_info.paragraphs[0];
//! assert_eq!(para.level.number(), 1);
//! assert_eq!(para.level.is_rtl(), true);
//!
//! // Re-ordering is done after wrapping each paragraph into a sequence of
//! // lines. For this example, I'll just use a single line that spans the
//! // entire paragraph.
//! let line = para.range.clone();
//!
//! let display = bidi_info.reorder_line(para, line);
//! assert_eq!(display, concat![
//!     "a",
//!     "b",
//!     "c",
//!     "ג",
//!     "ב",
//!     "א",
//! ]);
//! ```
//!
//!
[tr9]: https://www.unicode.org/reports/tr9/ #[macro_use] extern crate matches; #[cfg(feature = "serde")] #[macro_use] extern crate serde; pub use unic_ucd_bidi::UNICODE_VERSION; pub use unic_ucd_bidi::{bidi_class, BidiClass, BidiClassCategory}; mod pkg_info; pub use crate::pkg_info::{PKG_DESCRIPTION, PKG_NAME, PKG_VERSION}; pub mod format_chars; pub mod level; pub use crate::level::Level; mod bidi_info; pub use crate::bidi_info::{BidiInfo, ParagraphInfo}; mod explicit; mod implicit; mod prepare; pub use crate::prepare::LevelRun; unic-bidi-0.9.0/src/pkg_info.rs010064400007650000024000000013121343520353600145340ustar0000000000000000// Copyright 2017 The UNIC Project Developers. // // See the COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! Package information /// UNIC component version. pub const PKG_VERSION: &str = env!("CARGO_PKG_VERSION"); /// UNIC component name. pub const PKG_NAME: &str = env!("CARGO_PKG_NAME"); /// UNIC component description. pub const PKG_DESCRIPTION: &str = env!("CARGO_PKG_DESCRIPTION"); unic-bidi-0.9.0/src/prepare.rs010064400007650000024000000275021343520353600144070ustar0000000000000000// Copyright 2015 The Servo Project Developers. // Copyright 2017 The UNIC Project Developers. // // See the COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! 3.3.3 Preparations for Implicit Processing //! //! use std::cmp::max; use std::ops::Range; use unic_ucd_bidi::bidi_class::abbr_names::*; use unic_ucd_bidi::BidiClass; use super::level::Level; /// A maximal substring of characters with the same embedding level. 
/// /// Represented as a range of byte indices. pub type LevelRun = Range; /// Output of `isolating_run_sequences` (steps X9-X10) #[derive(Debug, PartialEq)] pub struct IsolatingRunSequence { pub runs: Vec, pub sos: BidiClass, // Start-of-sequence type. pub eos: BidiClass, // End-of-sequence type. } /// Compute the set of isolating run sequences. /// /// An isolating run sequence is a maximal sequence of level runs such that for all level runs /// except the last one in the sequence, the last character of the run is an isolate initiator /// whose matching PDI is the first character of the next level run in the sequence. /// /// Note: This function does *not* return the sequences in order by their first characters. pub fn isolating_run_sequences( para_level: Level, original_classes: &[BidiClass], levels: &[Level], ) -> Vec { let runs = level_runs(levels, original_classes); // Compute the set of isolating run sequences. // let mut sequences = Vec::with_capacity(runs.len()); // When we encounter an isolate initiator, we push the current sequence onto the // stack so we can resume it after the matching PDI. let mut stack = vec![Vec::new()]; for run in runs { assert!(run.len() > 0); assert!(!stack.is_empty()); let start_class = original_classes[run.start]; let end_class = original_classes[run.end - 1]; let mut sequence = if start_class == PDI && stack.len() > 1 { // Continue a previous sequence interrupted by an isolate. stack.pop().unwrap() } else { // Start a new sequence. Vec::new() }; sequence.push(run); if matches!(end_class, RLI | LRI | FSI) { // Resume this sequence after the isolate. stack.push(sequence); } else { // This sequence is finished. sequences.push(sequence); } } // Pop any remaning sequences off the stack. sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty())); // Determine the `sos` and `eos` class for each sequence. 
// sequences .into_iter() .map(|sequence: Vec| { assert!(!sequence.is_empty()); let start_of_seq = sequence[0].start; let end_of_seq = sequence[sequence.len() - 1].end; let seq_level = levels[start_of_seq]; #[cfg(test)] for run in sequence.clone() { for idx in run { if not_removed_by_x9(&original_classes[idx]) { assert_eq!(seq_level, levels[idx]); } } } // Get the level of the last non-removed char before the runs. let pred_level = match original_classes[..start_of_seq] .iter() .rposition(not_removed_by_x9) { Some(idx) => levels[idx], None => para_level, }; // Get the level of the next non-removed char after the runs. let succ_level = if matches!(original_classes[end_of_seq - 1], RLI | LRI | FSI) { para_level } else { match original_classes[end_of_seq..] .iter() .position(not_removed_by_x9) { Some(idx) => levels[end_of_seq + idx], None => para_level, } }; IsolatingRunSequence { runs: sequence, sos: max(seq_level, pred_level).bidi_class(), eos: max(seq_level, succ_level).bidi_class(), } }) .collect() } /// Finds the level runs in a paragraph. /// /// fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec { assert_eq!(levels.len(), original_classes.len()); let mut runs = Vec::new(); if levels.is_empty() { return runs; } let mut current_run_level = levels[0]; let mut current_run_start = 0; for i in 1..levels.len() { if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level { // End the last run and start a new one. runs.push(current_run_start..i); current_run_level = levels[i]; current_run_start = i; } } runs.push(current_run_start..levels.len()); runs } /// Should this character be ignored in steps after X9? 
/// /// pub fn removed_by_x9(class: BidiClass) -> bool { matches!(class, RLE | LRE | RLO | LRO | PDF | BN) } // For use as a predicate for `position` / `rposition` pub fn not_removed_by_x9(class: &BidiClass) -> bool { !removed_by_x9(*class) } #[cfg(test)] mod tests { use super::*; #[test] fn test_level_runs() { assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]); assert_eq!( level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]), &[0..3, 3..5, 5..6, 6..8] ); } // From #[cfg_attr(rustfmt, rustfmt_skip)] #[test] fn test_isolating_run_sequences() { // == Example 1 == // text1·RLE·text2·PDF·RLE·text3·PDF·text4 // index 0 1 2 3 4 5 6 7 let classes = &[L, RLE, L, PDF, RLE, L, PDF, L]; let levels = &[0, 1, 1, 1, 1, 1, 1, 0]; let para_level = Level::ltr(); let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); assert_eq!( sequences.iter().map(|s| s.runs.clone()).collect::>(), vec![vec![0..2], vec![2..7], vec![7..8]] ); // == Example 2 == // text1·RLI·text2·PDI·RLI·text3·PDI·text4 // index 0 1 2 3 4 5 6 7 let classes = &[L, RLI, L, PDI, RLI, L, PDI, L]; let levels = &[0, 0, 1, 0, 0, 1, 0, 0]; let para_level = Level::ltr(); let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); assert_eq!( sequences.iter().map(|s| s.runs.clone()).collect::>(), vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]] ); // == Example 3 == // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7 // index 0 1 2 3 4 5 6 7 8 9 10 11 12 let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI, L]; let levels = &[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0]; let para_level = Level::ltr(); let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); assert_eq!( sequences.iter().map(|s| 
s.runs.clone()).collect::>(), vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]] ); } // From #[cfg_attr(rustfmt, rustfmt_skip)] #[test] fn test_isolating_run_sequences_sos_and_eos() { // == Example 1 == // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6 // index 0 1 2 3 4 5 6 7 8 9 10 11 let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF, L]; let levels = &[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0]; let para_level = Level::ltr(); let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); // text1 assert_eq!( &sequences[0], &IsolatingRunSequence { runs: vec![0..2], sos: L, eos: R, } ); // text2 assert_eq!( &sequences[1], &IsolatingRunSequence { runs: vec![2..4], sos: R, eos: L, } ); // text3 assert_eq!( &sequences[2], &IsolatingRunSequence { runs: vec![4..6], sos: L, eos: L, } ); // text4 text5 assert_eq!( &sequences[3], &IsolatingRunSequence { runs: vec![6..11], sos: L, eos: R, } ); // text6 assert_eq!( &sequences[4], &IsolatingRunSequence { runs: vec![11..12], sos: R, eos: L, } ); // == Example 2 == // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6 // index 0 1 2 3 4 5 6 7 8 9 10 11 let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI, L]; let levels = &[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0]; let para_level = Level::ltr(); let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); // text1·RLI·PDI·RLI·PDI·text6 assert_eq!( &sequences[0], &IsolatingRunSequence { runs: vec![0..2, 7..9, 10..12], sos: L, eos: L, } ); // text2·LRI·PDI·text4 assert_eq!( &sequences[1], &IsolatingRunSequence { runs: vec![2..4, 5..7], sos: R, eos: R, } ); // text3 assert_eq!( &sequences[2], &IsolatingRunSequence { runs: vec![4..5], sos: L, eos: L, } ); // text5 assert_eq!( &sequences[3], &IsolatingRunSequence { runs: vec![9..10], sos: R, 
eos: R, } ); } #[test] fn test_removed_by_x9() { let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN]; let not_classes = &[L, RLI, AL, LRI, PDI]; for x in rem_classes { assert_eq!(removed_by_x9(*x), true); } for x in not_classes { assert_eq!(removed_by_x9(*x), false); } } #[test] fn test_not_removed_by_x9() { let non_x9_classes = &[ L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI, ]; for x in non_x9_classes { assert_eq!(not_removed_by_x9(&x), true); } } } unic-bidi-0.9.0/tests/unicode_version_tests.rs010064400007650000024000000011521343520353600177320ustar0000000000000000// Copyright 2017 The UNIC Project Developers. // // See the COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use unic_bidi; use unic_ucd_version; #[test] fn test_version_against_ucd_version() { assert_eq!( unic_ucd_version::UNICODE_VERSION, unic_bidi::UNICODE_VERSION ); } unic-bidi-0.9.0/.cargo_vcs_info.json0000644000000001120000000000000127360ustar00{ "git": { "sha1": "8a6ce83063d90b91ae2ce59eddb803edd393fca9" } }