hyphenation_commons-0.7.1/Cargo.toml.orig010064400007650000024000000010071335245453500167150ustar0000000000000000[package] name = "hyphenation_commons" version = "0.7.1" # version-locked to the main `hyphenation` crate authors = ["Andrew "] license = "Apache-2.0/MIT" repository = "https://github.com/tapeinosyne/hyphenation" homepage = "https://github.com/tapeinosyne/hyphenation" documentation = "https://docs.rs/hyphenation" description = "Proemial code for the `hyphenation` library" [dependencies] serde = { version = "1.0", features = ["derive"] } atlatl = { version = "0.1.2", features = ["serde"] } hyphenation_commons-0.7.1/Cargo.toml0000644000000017400000000000000131610ustar00# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g. crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] name = "hyphenation_commons" version = "0.7.1" authors = ["Andrew "] description = "Proemial code for the `hyphenation` library" homepage = "https://github.com/tapeinosyne/hyphenation" documentation = "https://docs.rs/hyphenation" license = "Apache-2.0/MIT" repository = "https://github.com/tapeinosyne/hyphenation" [dependencies.atlatl] version = "0.1.2" features = ["serde"] [dependencies.serde] version = "1.0" features = ["derive"] hyphenation_commons-0.7.1/README.md010064400007650000024000000001611335173534200153020ustar0000000000000000# hyphenation commons Proemial code for `hyphenation`. Mostly internal, slightly haphazard, leastly dependable. hyphenation_commons-0.7.1/src/dictionary.rs010064400007650000024000000072761335173534200173430ustar0000000000000000//! 
Data structures for the storage of hyphenation patterns and exceptions.

use atlatl::fst::FST;
use std::collections::HashMap;

use language::Language;

/// An `(index, value)` pair of bytes.
///
/// The pattern parser in this crate builds these with `index` set to a byte
/// offset into the digit-free form of a pattern and `value` set to the digit
/// found at that position.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct Locus {
    pub index : u8,
    pub value : u8
}

/// A trie mapping hyphenation patterns to their tallies.
// NOTE(review): the element type of `tallies` and any type arguments of `FST`
// appear to have been lost from this copy (note the unbalanced `>`); restore
// them from the upstream crate before compiling.
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct Patterns {
    pub tallies : Vec>,
    pub automaton : FST
}

/// A specialized hashmap associating words to their known hyphenation.
// NOTE(review): the key and value types of the `HashMap` are missing from this
// copy (unbalanced `>`); restore them from the upstream crate.
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct Exceptions(pub HashMap>);

/// A dictionary for standard Knuth–Liang hyphenation.
///
/// It comprises the working language, the pattern and exception sets,
/// and the character boundaries for hyphenation.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct Standard {
    pub language : Language,
    pub patterns : Patterns,
    pub exceptions : Exceptions,
    /// The minimum number of `char`s from the start and end of a word where breaks
    /// may not occur.
    pub minima : (usize, usize)
}


// Extended hyphenation

pub use self::extended::Extended;

/// Counterparts of the standard dictionary types for extended hyphenation,
/// where a break may also rewrite the text around it.
pub mod extended {
    use atlatl::fst::FST;
    use std::collections::HashMap;

    use language::Language;
    use super::Locus;

    /// The partial score carried by an extended hyphenation pattern.
    #[derive(Clone, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
    pub struct Tally {
        /// The pattern tally, equivalent to that found in standard patterns.
        // NOTE(review): the element type of this `Vec` is missing from this copy.
        pub standard : Vec,
        /// An optional subregion which may replace part of the string around the
        /// opportunity.
        pub subregion : Option<(Locus, Subregion)>
    }

    /// Word alterations extending a standard Knuth–Liang pattern.
    #[derive(Clone, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
    pub struct Subregion {
        /// The number of bytes that the substitution will replace before the break.
        pub left : usize,
        /// The number of bytes that the substitution will replace after the break.
        pub right : usize,
        /// The replacement for the substring to be altered around the break, as
        /// delimited by the `left` and `right` subregion boundaries.
        pub substitution : String,
        /// An index into the substitution, denoting the hyphenation opportunity
        /// within this subregion.
        pub breakpoint : usize,
    }

    /// A trie mapping hyphenation patterns to their extended tallies.
    // NOTE(review): the element type of `tallies` and any type arguments of
    // `FST` are missing from this copy; restore them from the upstream crate.
    #[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
    pub struct Patterns {
        pub tallies : Vec,
        pub automaton : FST
    }

    /// A specialized hashmap associating words to their known hyphenation.
    // NOTE(review): the `HashMap` type arguments are truncated in this copy
    // (only the trailing `)>>` survives); restore them from the upstream crate.
    #[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
    pub struct Exceptions(pub HashMap)>>);

    /// A dictionary for extended Knuth–Liang hyphenation, based on the strategy
    /// described by Németh in "Automatic non-standard hyphenation in OpenOffice.org".
    ///
    /// It comprises the working language, the set of extended patterns and
    /// exceptions, and the character boundaries for hyphenation.
    #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
    pub struct Extended {
        pub language : Language,
        pub patterns : Patterns,
        pub exceptions : Exceptions,
        /// The minimum number of `char`s from the start and end of a word where
        /// breaks may not occur.
        pub minima: (usize, usize)
    }
}
hyphenation_commons-0.7.1/src/language.rs010064400007650000024000000124531335173766500167640ustar0000000000000000//! Available languages and related data.

use std::fmt;

// Generates the `Language` enum together with its `minima`, `code` and
// `Display` implementations from a list of `Variant, bounds, code` entries.
macro_rules! fiant_linguae {
    // Entry list in which every entry, including the last, ends with `;`.
    // It is re-emitted as a `;`-separated list and handled by the arm below.
    ( $($lang:ident, $bounds:expr, $code:expr;)* ) => {
        fiant_linguae! { $($lang, $bounds, $code);* }
    };
    // Entry list separated by `;`: one enum variant is produced per entry.
    ( $($lang:ident, $bounds:expr, $code:expr);* ) => {
        /// The set of languages available for hyphenation.
        #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
        pub enum Language {
            $( $lang, )*
        }

        impl Language {
            /// The default number of characters from the start and end of a word
            /// where breaks may not occur.
            pub fn minima(&self) -> (usize, usize) {
                match *self {
                    $( Language::$lang => $bounds, )*
                }
            }

            /// The TeX language code.
            pub fn code(&self) -> &'static str {
                match *self {
                    $( Language::$lang => $code, )*
                }
            }
        }

        impl fmt::Display for Language {
            // Displays the variant exactly as its derived `Debug` output.
            fn fmt(&self, f : &mut fmt::Formatter) -> fmt::Result {
                write!(f, "{:?}", *self)
            }
        }
    }
}

// NOTE: These hyphenation bounds were taken directly from the relevant TeX
// packages, but it is not entirely clear how well they map to the notion of
// Unicode `char` in Rust.
//
// In the worst case, a language featuring graphemes larger than 1 `char` may
// set boundaries mid-grapheme. This should be of no practical consequence,
// since well-formed hyphenation patterns only match full graphemes; moreover,
// well-behaved hyphenators are expected to validate hyphenation opportunities,
// discarding any which arise outside `char` boundaries.
fiant_linguae!
{
    // One entry per supported language, in the form
    //
    //     Variant, (left, right), "code";
    //
    // `Variant` becomes a variant of the generated `Language` enum,
    // `(left, right)` is the value returned by `Language::minima` (the default
    // number of characters from the start and end of a word where breaks may
    // not occur), and `"code"` is the TeX language code returned by
    // `Language::code`.
    Afrikaans, (1, 2), "af";
    Armenian, (1, 2), "hy";
    Assamese, (1, 1), "as";
    Basque, (2, 2), "eu";
    Belarusian, (2, 2), "be";
    Bengali, (1, 1), "bn";
    Bulgarian, (2, 2), "bg";
    Catalan, (2, 2), "ca";
    Chinese, (1, 1), "zh-latn-pinyin";
    Coptic, (1, 1), "cop";
    Croatian, (2, 2), "hr";
    Czech, (2, 3), "cs";
    Danish, (2, 2), "da";
    Dutch, (2, 2), "nl";
    EnglishGB, (2, 3), "en-gb";
    EnglishUS, (2, 3), "en-us";
    Esperanto, (2, 2), "eo";
    Estonian, (2, 3), "et";
    Ethiopic, (1, 1), "mul-ethi";
    Finnish, (2, 2), "fi";
    French, (2, 3), "fr";
    Friulan, (2, 2), "fur";
    Galician, (2, 2), "gl";
    Georgian, (1, 2), "ka";
    German1901, (2, 2), "de-1901";
    German1996, (2, 2), "de-1996";
    GermanSwiss, (2, 2), "de-ch-1901";
    GreekAncient, (1, 1), "grc";
    GreekMono, (1, 1), "el-monoton";
    GreekPoly, (1, 1), "el-polyton";
    Gujarati, (1, 1), "gu";
    Hindi, (1, 1), "hi";
    Hungarian, (2, 2), "hu";
    Icelandic, (2, 2), "is";
    Indonesian, (2, 2), "id";
    Interlingua, (2, 2), "ia";
    Irish, (2, 3), "ga";
    Italian, (2, 2), "it";
    Kannada, (1, 1), "kn";
    Kurmanji, (2, 2), "kmr";
    Latin, (2, 2), "la";
    LatinClassic, (2, 2), "la-x-classic";
    LatinLiturgical, (2, 2), "la-x-liturgic";
    Latvian, (2, 2), "lv";
    Lithuanian, (2, 2), "lt";
    Malayalam, (1, 1), "ml";
    Marathi, (1, 1), "mr";
    Mongolian, (2, 2), "mn-cyrl";
    NorwegianBokmal, (2, 2), "nb";
    NorwegianNynorsk, (2, 2), "nn";
    Occitan, (2, 2), "oc";
    Oriya, (1, 1), "or";
    Pali, (1, 2), "pi";
    Panjabi, (1, 1), "pa";
    Piedmontese, (2, 2), "pms";
    Polish, (2, 2), "pl";
    Portuguese, (2, 3), "pt";
    Romanian, (2, 2), "ro";
    Romansh, (2, 2), "rm";
    Russian, (2, 2), "ru";
    Sanskrit, (1, 3), "sa";
    SerbianCyrillic, (2, 2), "sr-cyrl";
    SerbocroatianCyrillic, (2, 2), "sh-cyrl";
    SerbocroatianLatin, (2, 2), "sh-latn";
    SlavonicChurch, (1, 2), "cu";
    Slovak, (2, 3), "sk";
    Slovenian, (2, 2), "sl";
    Spanish, (2, 2), "es";
    Swedish, (2, 2), "sv";
    Tamil, (1, 1), "ta";
    Telugu, (1, 1), "te";
    Thai, (2, 3), "th";
    Turkish, (2, 2), "tr";
    Turkmen, (2, 2), "tk";
    Ukrainian, (2, 2), "uk";
    Uppersorbian, (2, 2), "hsb";
    Welsh, (2, 3), "cy";
}
hyphenation_commons-0.7.1/src/lib.rs010064400007650000024000000002331335173534200157260ustar0000000000000000/* Hyphenation internals */ #[macro_use] extern crate serde; extern crate atlatl; mod language; pub mod dictionary; pub mod parse; pub use language::*; hyphenation_commons-0.7.1/src/parse.rs010064400007650000024000000104151335173534200162750ustar0000000000000000//! Pattern and exception parsing. use dictionary::*; use dictionary::extended::{self as ext, Subregion}; // TODO: make parsing fallible pub trait Parse { type Tally : Eq; fn value(char) -> Option; fn tally(&str) -> Self::Tally; fn alphabetical(s : &str) -> String { s.chars().filter(|c| Self::value(c.clone()) == None).collect() } fn pair(str_klp : &str, normalize : N) -> (String, Self::Tally) where N : Fn(&str) -> String { let normalized = normalize(str_klp); (Self::alphabetical(&normalized), Self::tally(&normalized)) } } impl<'a> Parse for Patterns { type Tally = Vec; #[inline] fn value(c : char) -> Option { c.to_digit(10).map(|n| n as u8) } fn tally(pattern : &str) -> Self::Tally { pattern.bytes() .enumerate() .filter_map(|(i, b)| Self::value(b as char).map(|v| (i, v))) .enumerate() .map(|(j, (i, v))| Locus { index : (i - j) as u8, value : v }) .collect() } } impl<'a> Parse for Exceptions { type Tally = Vec; #[inline] fn value(c : char) -> Option { match c { '-' => Some(2), _ => None } } fn tally(exception : &str) -> Self::Tally { exception.bytes() .enumerate() .filter_map(|(i, b)| Self::value(b as char).map(|_| i)) .enumerate() .map(|(j, i)| i - j) .collect() } } impl<'a> Parse for ext::Patterns { type Tally = ext::Tally; #[inline] fn value(c : char) -> Option { c.to_digit(10).map(|n| n as u8) } fn alphabetical(s : &str) -> String { match s.find('/') { None => Patterns::alphabetical(s), Some(i) => Patterns::alphabetical(&s[.. 
i]) } } fn tally(pattern : &str) -> Self::Tally { use std::str::FromStr; // TODO: refactor match pattern.find('/') { None => ext::Tally { standard : Patterns::tally(pattern), subregion : None, }, Some(i) => { // Exoneration: we unwrap liberally within this match arm, since failure // would denote a malformed pattern. let err = &["Malformed extended hyphenation pattern: ", pattern].concat(); let (standard, extension) = (&pattern[.. i], &pattern[i + 1 ..]); let breakpoint = extension.find('=').expect(err); let sub_pattern_end = extension.find(',').expect(err); let sub_pattern = &extension[.. sub_pattern_end]; let sub_idxs = &extension[sub_pattern_end + 1 ..]; let dot_offset = if standard.starts_with('.') { 1 } else { 0 }; let (chars_to_op, span) = { let v : Vec<_> = sub_idxs.split(',').map(|s| usize::from_str(s).expect(err)).collect(); assert!(v.len() == 2, "Malformed extended hyphenation pattern: {}", pattern); (v[0] + dot_offset, v[1]) }; let tally = Patterns::tally(standard); let alphabetical = Patterns::alphabetical(standard); let substitution = sub_pattern.chars().filter(|&c| c.is_alphabetic()).collect(); // Németh always starts the subregion at the character immediately preceding // the opportunity. let chars_to_start = chars_to_op.saturating_sub(1); let start = alphabetical.char_indices().nth(chars_to_start).expect(err).0; let end = alphabetical.char_indices().nth(chars_to_start + span).expect(err).0; let index = alphabetical.char_indices().nth(chars_to_op).expect(err).0 as u8; let (left, right) = (index as usize - start, end - index as usize); let value = tally.iter().find(|&&locus| locus.index == index) .map(|&locus| locus.value).expect(err); ext::Tally { standard : tally, subregion : ( Locus { index, value } , Subregion { left, right, substitution, breakpoint } ).into() } } } } } hyphenation_commons-0.7.1/.cargo_vcs_info.json0000644000000001120000000000000151530ustar00{ "git": { "sha1": "cab10551e468d95605e0d41bdd2c6b12da9fe697" } }