encoding-0.2.33/.gitignore01006440000765000002400000000022127606113120013565 0ustar0000000000000000target Cargo.lock encoding-0.2.33/.travis.yml01006440000765000002400000000604127606113120013714 0ustar0000000000000000language: rust os: - linux - osx env: global: - secure: WtFY+Nu8Erb9JOqX38XHyMH0C4b0y5sDAVw2GSo3pr9o5Re/is8Fa7CBtikoZp1IfB70b7mNK7T5hqvh289M+Ur43OA4EAjWi9rKZYAoK94GXRMNCwhUQR4OiPkQ8s/oJxcNGgb2lKT4Bwtpa2/kT4HA2Md6wo1Db30D4lskrsc= script: - cargo build -v - make test - cargo doc after_script: - cd target && curl http://www.rust-ci.org/artifacts/put?t=$RUSTCI_TOKEN | sh encoding-0.2.33/AUTHORS.txt01006440000765000002400000002357127606134610013507 0ustar0000000000000000Encoding is mainly written by Kang Seonghoon , and also the following people (in ascending order): Aaron Weiss Alex Crichton Andrew Cann Aneesh Agrawal Björn Steinbrink Brian Koropoff Clark Gaebel D.K Filipe Gonçalves Florian Gilcher Frank Denis Jack Moffitt Jason Ozias Jason Ozias Joonas Javanainen Joshua DeSeno Keegan McAllister Ken Tossell Kyle Dewey Manish Goregaokar Matt Brubeck Michael Neumann Michael Sproul Peter Atashian Pierre Baillet Robert Straw Simon Sapin Son Steve Klabnik klutzy Сухарик encoding-0.2.33/Cargo.toml01006440000765000002400000002500127606134320013535 0ustar0000000000000000# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g. crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. 
If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] name = "encoding" version = "0.2.33" authors = ["Kang Seonghoon "] description = "Character encoding support for Rust" homepage = "https://github.com/lifthrasiir/rust-encoding" documentation = "https://lifthrasiir.github.io/rust-encoding/" readme = "README.md" keywords = ["encoding", "unicode", "charset"] license = "MIT" repository = "https://github.com/lifthrasiir/rust-encoding" [lib] name = "encoding" [dependencies.encoding-index-japanese] version = "~1.20141219.5" [dependencies.encoding-index-korean] version = "~1.20141219.5" [dependencies.encoding-index-simpchinese] version = "~1.20141219.5" [dependencies.encoding-index-singlebyte] version = "~1.20141219.5" [dependencies.encoding-index-tradchinese] version = "~1.20141219.5" [dev-dependencies.getopts] version = "0.2" encoding-0.2.33/Cargo.toml.orig01006440000765000002400000002353127606134320014502 0ustar0000000000000000[package] name = "encoding" version = "0.2.33" authors = ["Kang Seonghoon "] description = "Character encoding support for Rust" homepage = "https://github.com/lifthrasiir/rust-encoding" documentation = "https://lifthrasiir.github.io/rust-encoding/" repository = "https://github.com/lifthrasiir/rust-encoding" keywords = ["encoding", "unicode", "charset"] readme = "README.md" license = "MIT" [lib] name = "encoding" # version policy for index tables: # - major: addition or deletion of index tables # - minor: any content changes to index tables, numbered by the date # - patch: language changes # # we definitely don't want to use the old index table, # so we should use tilde requirements here. 
[dependencies.encoding-index-singlebyte] version = "~1.20141219.5" path = "src/index/singlebyte" [dependencies.encoding-index-korean] version = "~1.20141219.5" path = "src/index/korean" [dependencies.encoding-index-japanese] version = "~1.20141219.5" path = "src/index/japanese" [dependencies.encoding-index-simpchinese] version = "~1.20141219.5" path = "src/index/simpchinese" [dependencies.encoding-index-tradchinese] version = "~1.20141219.5" path = "src/index/tradchinese" [dev-dependencies] getopts = "0.2" # for examples encoding-0.2.33/examples/recode.rs01006440000765000002400000006715125331707360015251 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2014-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. extern crate encoding; extern crate getopts; use std::{io, env}; use std::io::{Read, Write}; use std::fs::File; use std::path::Path; use encoding::{EncoderTrap, DecoderTrap}; use encoding::label::encoding_from_whatwg_label; use getopts::Options; fn main() { let args: Vec<_> = env::args().collect(); let mut opts = Options::new(); opts.optopt("f", "from-code", "set input encoding", "NAME"); opts.optopt("t", "to-code", "set output encoding", "NAME"); opts.optopt("e", "error-policy", "set error policy (one of strict, ignore, replace, ncr-escape)", "POLICY"); opts.optflag("c", "", "same as `--error-policy=ignore`"); opts.optopt("o", "output", "output file", "FILE"); opts.optflag("h", "help", "print this help menu"); let matches = match opts.parse(&args[1..]) { Ok(m) => m, Err(e) => panic!(e.to_string()), }; if matches.opt_present("h") { println!("{}", opts.usage("Converts the character encoding using rust-encoding.")); return; } let inencname = matches.opt_str("f"); let outencname = matches.opt_str("t"); let inenc = match inencname.as_ref().map(|s| &s[..]) { Some(name) => match encoding_from_whatwg_label(name) { Some(enc) => enc, None => panic!("invalid input encoding name {}", name), }, None => encoding::all::UTF_8 as 
encoding::EncodingRef, }; let outenc = match outencname.as_ref().map(|s| &s[..]) { Some(name) => match encoding_from_whatwg_label(name) { Some(enc) => enc, None => panic!("invalid output encoding name {}", name), }, None => encoding::all::UTF_8 as encoding::EncodingRef, }; let mut policy = matches.opt_str("e"); if matches.opt_present("c") { policy = Some("ignore".to_string()); } let (intrap, outtrap) = match policy.as_ref().map(|s| &s[..]) { Some("strict") | None => (DecoderTrap::Strict, EncoderTrap::Strict), Some("ignore") => (DecoderTrap::Ignore, EncoderTrap::Ignore), Some("replace") => (DecoderTrap::Replace, EncoderTrap::Replace), Some("ncr-escape") => (DecoderTrap::Replace, EncoderTrap::NcrEscape), Some(s) => panic!("invalid error policy {}", s), }; let mut input = match matches.free.first().map(|s| &s[..]) { Some("-") | None => Box::new(io::stdin()) as Box, Some(f) => match File::open(&Path::new(f)) { Ok(f) => Box::new(f) as Box, Err(e) => panic!("cannot open the input {}: {}", f, e), }, }; let mut output = match matches.opt_str("o").as_ref().map(|s| &s[..]) { Some("-") | None => Box::new(io::stdout()) as Box, Some(f) => match File::create(&Path::new(f)) { Ok(f) => Box::new(f) as Box, Err(e) => panic!("cannot open the output {}: {}", f, e), }, }; // XXX should really use the incremental interface let mut ret = Vec::new(); input.read_to_end(&mut ret).ok().expect("cannot read from the input"); let decoded = match inenc.decode(&ret, intrap) { Ok(s) => s, Err(e) => panic!("decoder error: {}", e), }; let encoded = match outenc.encode(&decoded, outtrap) { Ok(s) => s, Err(e) => panic!("encoder error: {}", e), }; output.write_all(&encoded).unwrap(); } encoding-0.2.33/LICENSE.txt01006440000765000002400000002074125331707360013441 0ustar0000000000000000The MIT License (MIT) Copyright (c) 2013, Kang Seonghoon. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. encoding-0.2.33/Makefile01006440000765000002400000003632127606125520013256 0ustar0000000000000000.PHONY: all all: @echo 'Try `cargo build` instead.' .PHONY: authors authors: echo 'Encoding is mainly written by Kang Seonghoon ,' > AUTHORS.txt echo 'and also the following people (in ascending order):' >> AUTHORS.txt echo >> AUTHORS.txt git log --format='%aN <%aE>' | grep -v 'Kang Seonghoon' | sort -u >> AUTHORS.txt .PHONY: test test: cargo test -v cargo test -v -p encoding-index-singlebyte cargo test -v -p encoding-index-korean cargo test -v -p encoding-index-japanese cargo test -v -p encoding-index-simpchinese cargo test -v -p encoding-index-tradchinese .PHONY: readme readme: README.md README.md: src/lib.rs # really, really sorry for this mess. awk '/^\/\/! # Encoding /{print "[Encoding][doc]",$$4}' $< > $@ awk '/^\/\/! 
# Encoding /{print "[Encoding][doc]",$$4}' $< | sed 's/./=/g' >> $@ echo >> $@ echo '[![Encoding on Travis CI][travis-image]][travis]' >> $@ echo >> $@ echo '[travis-image]: https://travis-ci.org/lifthrasiir/rust-encoding.png' >> $@ echo '[travis]: https://travis-ci.org/lifthrasiir/rust-encoding' >> $@ awk '/^\/\/! # Encoding /,/^\/\/! ## /' $< | cut -b 5- | grep -v '^# ' >> $@ echo >> $@ echo '[Complete Documentation][doc]' >> $@ echo >> $@ echo '[doc]: https://lifthrasiir.github.io/rust-encoding/' >> $@ echo >> $@ awk '/^\/\/! ## /,!/^\/\/!/' $< | cut -b 5- | grep -v '^# ' >> $@ .PHONY: doc doc: authors readme cargo doc .PHONY: doc-publish doc-publish: doc ( \ PKGID="$$(cargo pkgid)"; \ PKGNAMEVER="$${PKGID#*#}"; \ PKGNAME="$${PKGNAMEVER%:*}"; \ REMOTE="$$(git config --get remote.origin.url)"; \ cd target/doc && \ rm -rf .git && \ git init && \ git checkout --orphan gh-pages && \ echo '' > index.html && \ git add . && \ git commit -m 'updated docs.' && \ git push "$$REMOTE" gh-pages -f; \ ) encoding-0.2.33/README.md01006440000765000002400000017415127606123020013072 0ustar0000000000000000[Encoding][doc] 0.2.33 ====================== [![Encoding on Travis CI][travis-image]][travis] [travis-image]: https://travis-ci.org/lifthrasiir/rust-encoding.png [travis]: https://travis-ci.org/lifthrasiir/rust-encoding Character encoding support for Rust. (also known as `rust-encoding`) It is based on [WHATWG Encoding Standard](http://encoding.spec.whatwg.org/), and also provides an advanced interface for error detection and recovery. 
## Usage [Complete Documentation][doc] [doc]: https://lifthrasiir.github.io/rust-encoding/ ## Usage Put this in your `Cargo.toml`: ```toml [dependencies] encoding = "0.2" ``` Then put this in your crate root: ```rust extern crate encoding; ``` ## Overview To encode a string: ~~~~ {.rust} use encoding::{Encoding, EncoderTrap}; use encoding::all::ISO_8859_1; assert_eq!(ISO_8859_1.encode("caf\u{e9}", EncoderTrap::Strict), Ok(vec![99,97,102,233])); ~~~~ To encode a string with unrepresentable characters: ~~~~ {.rust} use encoding::{Encoding, EncoderTrap}; use encoding::all::ISO_8859_2; assert!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::Strict).is_err()); assert_eq!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::Replace), Ok(vec![65,99,109,101,63])); assert_eq!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::Ignore), Ok(vec![65,99,109,101])); assert_eq!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::NcrEscape), Ok(vec![65,99,109,101,38,35,49,54,57,59])); ~~~~ To decode a byte sequence: ~~~~ {.rust} use encoding::{Encoding, DecoderTrap}; use encoding::all::ISO_8859_1; assert_eq!(ISO_8859_1.decode(&[99,97,102,233], DecoderTrap::Strict), Ok("caf\u{e9}".to_string())); ~~~~ To decode a byte sequence with invalid sequences: ~~~~ {.rust} use encoding::{Encoding, DecoderTrap}; use encoding::all::ISO_8859_6; assert!(ISO_8859_6.decode(&[65,99,109,101,169], DecoderTrap::Strict).is_err()); assert_eq!(ISO_8859_6.decode(&[65,99,109,101,169], DecoderTrap::Replace), Ok("Acme\u{fffd}".to_string())); assert_eq!(ISO_8859_6.decode(&[65,99,109,101,169], DecoderTrap::Ignore), Ok("Acme".to_string())); ~~~~ To encode or decode the input into the already allocated buffer: ~~~~ {.rust} use encoding::{Encoding, EncoderTrap, DecoderTrap}; use encoding::all::{ISO_8859_2, ISO_8859_6}; let mut bytes = Vec::new(); let mut chars = String::new(); assert!(ISO_8859_2.encode_to("Acme\u{a9}", EncoderTrap::Ignore, &mut bytes).is_ok()); assert!(ISO_8859_6.decode_to(&[65,99,109,101,169], DecoderTrap::Replace, &mut 
chars).is_ok()); assert_eq!(bytes, [65,99,109,101]); assert_eq!(chars, "Acme\u{fffd}"); ~~~~ A practical example of custom encoder traps: ~~~~ {.rust} use encoding::{Encoding, ByteWriter, EncoderTrap, DecoderTrap}; use encoding::types::RawEncoder; use encoding::all::ASCII; // hexadecimal numeric character reference replacement fn hex_ncr_escape(_encoder: &mut RawEncoder, input: &str, output: &mut ByteWriter) -> bool { let escapes: Vec<String> = input.chars().map(|ch| format!("&#x{:x};", ch as isize)).collect(); let escapes = escapes.concat(); output.write_bytes(escapes.as_bytes()); true } static HEX_NCR_ESCAPE: EncoderTrap = EncoderTrap::Call(hex_ncr_escape); let orig = "Hello, 世界!".to_string(); let encoded = ASCII.encode(&orig, HEX_NCR_ESCAPE).unwrap(); assert_eq!(ASCII.decode(&encoded, DecoderTrap::Strict), Ok("Hello, &#x4e16;&#x754c;!".to_string())); ~~~~ Getting the encoding from the string label, as specified in WHATWG Encoding Standard: ~~~~ {.rust} use encoding::{Encoding, DecoderTrap}; use encoding::label::encoding_from_whatwg_label; use encoding::all::WINDOWS_949; let euckr = encoding_from_whatwg_label("euc-kr").unwrap(); assert_eq!(euckr.name(), "windows-949"); assert_eq!(euckr.whatwg_name(), Some("euc-kr")); // for the sake of compatibility let broken = &[0xbf, 0xec, 0xbf, 0xcd, 0xff, 0xbe, 0xd3]; assert_eq!(euckr.decode(broken, DecoderTrap::Replace), Ok("\u{c6b0}\u{c640}\u{fffd}\u{c559}".to_string())); // corresponding Encoding native API: assert_eq!(WINDOWS_949.decode(broken, DecoderTrap::Replace), Ok("\u{c6b0}\u{c640}\u{fffd}\u{c559}".to_string())); ~~~~ ## Types and Stuffs There are three main entry points to Encoding. **`Encoding`** is a single character encoding. It contains `encode` and `decode` methods for converting `String` to `Vec<u8>` and vice versa. For the error handling, they receive **traps** (`EncoderTrap` and `DecoderTrap` respectively) which replace any error with some string (e.g. `U+FFFD`) or sequence (e.g. `?`).
You can also use `EncoderTrap::Strict` and `DecoderTrap::Strict` traps to stop on an error. There are two ways to get `Encoding`: * `encoding::all` has static items for every supported encoding. You should use them when the encoding would not change or only a handful of them are required. Combined with link-time optimization, any unused encoding would be discarded from the binary. * `encoding::label` has functions to dynamically get an encoding from a given string ("label"). They will return a static reference to the encoding, whose type is also known as `EncodingRef`. It is useful when a list of required encodings is not available in advance, but it will result in a larger binary and missed optimization opportunities. **`RawEncoder`** is an experimental incremental encoder. At each step of `raw_feed`, it receives a slice of string and emits any encoded bytes to a generic `ByteWriter` (normally `Vec<u8>`). It will stop at the first error if any, and would return a `CodecError` struct in that case. The caller is responsible for calling `raw_finish` at the end of the encoding process. **`RawDecoder`** is an experimental incremental decoder. At each step of `raw_feed`, it receives a slice of byte sequence and emits any decoded characters to a generic `StringWriter` (normally `String`). Otherwise it is identical to `RawEncoder`s. One should prefer `Encoding::{encode,decode}` as a primary interface. `RawEncoder` and `RawDecoder` are experimental and can change substantially. See the additional documentation in the `encoding::types` module for more information on them. 
## Supported Encodings Encoding covers all encodings specified by WHATWG Encoding Standard and some more: * 7-bit strict ASCII (`ascii`) * UTF-8 (`utf-8`) * UTF-16 in little endian (`utf-16` or `utf-16le`) and big endian (`utf-16be`) * All single byte encoding in WHATWG Encoding Standard: * IBM code page 866 * ISO 8859-{2,3,4,5,6,7,8,10,13,14,15,16} * KOI8-R, KOI8-U * MacRoman (`macintosh`), Macintosh Cyrillic encoding (`x-mac-cyrillic`) * Windows code pages 874, 1250, 1251, 1252 (instead of ISO 8859-1), 1253, 1254 (instead of ISO 8859-9), 1255, 1256, 1257, 1258 * All multi byte encodings in WHATWG Encoding Standard: * Windows code page 949 (`euc-kr`, since the strict EUC-KR is hardly used) * EUC-JP and Windows code page 932 (`shift_jis`, since it's the most widespread extension to Shift_JIS) * ISO-2022-JP with asymmetric JIS X 0212 support (Note: this is not yet up to date to the current standard) * GBK * GB 18030 * Big5-2003 with HKSCS-2008 extensions * Encodings that were originally specified by WHATWG Encoding Standard: * HZ * ISO 8859-1 (distinct from Windows code page 1252) Parenthesized names refer to the encoding's primary name assigned by WHATWG Encoding Standard. Many legacy character encodings lack the proper specification, and even those that have a specification are highly dependent of the actual implementation. Consequently one should be careful when picking a desired character encoding. The only standards reliable in this regard are WHATWG Encoding Standard and [vendor-provided mappings from the Unicode consortium](http://www.unicode.org/Public/MAPPINGS/). Whenever in doubt, look at the source code and specifications for detailed explanations. encoding-0.2.33/src/all.rs01006440000765000002400000014120127606113120013506 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013, Kang Seonghoon. // See README.md and LICENSE.txt for details. //! A list of all supported encodings. Useful for encodings fixed in the compile time. 
use index_singlebyte as index; use codec; use types::EncodingRef; macro_rules! unique( ($(#[$attr:meta])* var=$var:ident, mod=$($module:ident)::+, val=$val:ident) => ( unique!($(#[$attr])* var=$var, mod=$($module)::+, ty=$val, val=$val); ); ($(#[$attr:meta])* var=$var:ident, mod=$($module:ident)::+, ty=$ty:ident, val=$val:ident) => ( $(#[$attr])* pub const $var: &'static $($module)::+::$ty = &$($module)::+::$val; ); ); macro_rules! singlebyte( ($(#[$attr:meta])* var=$var:ident, mod=$($module:ident)::+, name=$name:expr) => ( singlebyte!($(#[$attr])* var=$var, mod=$($module)::+, name=$name, whatwg=None); ); ($(#[$attr:meta])* var=$var:ident, mod=$($module:ident)::+, name|whatwg=$name:expr) => ( singlebyte!($(#[$attr])* var=$var, mod=$($module)::+, name=$name, whatwg=Some($name)); ); ($(#[$attr:meta])* var=$var:ident, mod=$($module:ident)::+, name=$name:expr, whatwg=$whatwg:expr) => ( $(#[$attr])* pub const $var: &'static codec::singlebyte::SingleByteEncoding = &codec::singlebyte::SingleByteEncoding { name: $name, whatwg_name: $whatwg, index_forward: $($module)::+::forward, index_backward: $($module)::+::backward, }; ) ); unique!(var=ERROR, mod=codec::error, val=ErrorEncoding); unique!(var=ASCII, mod=codec::ascii, val=ASCIIEncoding); singlebyte!(var=IBM866, mod=index::ibm866, name|whatwg="ibm866"); singlebyte!(var=ISO_8859_1, mod=codec::singlebyte::iso_8859_1, name="iso-8859-1"); singlebyte!(var=ISO_8859_2, mod=index::iso_8859_2, name|whatwg="iso-8859-2"); singlebyte!(var=ISO_8859_3, mod=index::iso_8859_3, name|whatwg="iso-8859-3"); singlebyte!(var=ISO_8859_4, mod=index::iso_8859_4, name|whatwg="iso-8859-4"); singlebyte!(var=ISO_8859_5, mod=index::iso_8859_5, name|whatwg="iso-8859-5"); singlebyte!(var=ISO_8859_6, mod=index::iso_8859_6, name|whatwg="iso-8859-6"); singlebyte!(var=ISO_8859_7, mod=index::iso_8859_7, name|whatwg="iso-8859-7"); singlebyte!(var=ISO_8859_8, mod=index::iso_8859_8, name|whatwg="iso-8859-8"); singlebyte!(var=ISO_8859_10, mod=index::iso_8859_10, 
name|whatwg="iso-8859-10"); singlebyte!(var=ISO_8859_13, mod=index::iso_8859_13, name|whatwg="iso-8859-13"); singlebyte!(var=ISO_8859_14, mod=index::iso_8859_14, name|whatwg="iso-8859-14"); singlebyte!(var=ISO_8859_15, mod=index::iso_8859_15, name|whatwg="iso-8859-15"); singlebyte!(var=ISO_8859_16, mod=index::iso_8859_16, name|whatwg="iso-8859-16"); singlebyte!(var=KOI8_R, mod=index::koi8_r, name|whatwg="koi8-r"); singlebyte!(var=KOI8_U, mod=index::koi8_u, name|whatwg="koi8-u"); singlebyte!(var=MAC_ROMAN, mod=index::macintosh, name="mac-roman", whatwg=Some("macintosh")); singlebyte!(var=WINDOWS_874, mod=index::windows_874, name|whatwg="windows-874"); singlebyte!(var=WINDOWS_1250, mod=index::windows_1250, name|whatwg="windows-1250"); singlebyte!(var=WINDOWS_1251, mod=index::windows_1251, name|whatwg="windows-1251"); singlebyte!(var=WINDOWS_1252, mod=index::windows_1252, name|whatwg="windows-1252"); singlebyte!(var=WINDOWS_1253, mod=index::windows_1253, name|whatwg="windows-1253"); singlebyte!(var=WINDOWS_1254, mod=index::windows_1254, name|whatwg="windows-1254"); singlebyte!(var=WINDOWS_1255, mod=index::windows_1255, name|whatwg="windows-1255"); singlebyte!(var=WINDOWS_1256, mod=index::windows_1256, name|whatwg="windows-1256"); singlebyte!(var=WINDOWS_1257, mod=index::windows_1257, name|whatwg="windows-1257"); singlebyte!(var=WINDOWS_1258, mod=index::windows_1258, name|whatwg="windows-1258"); singlebyte!(var=MAC_CYRILLIC, mod=index::x_mac_cyrillic, name="mac-cyrillic", whatwg=Some("x-mac-cyrillic")); unique!(var=UTF_8, mod=codec::utf_8, val=UTF8Encoding); unique!(var=UTF_16LE, mod=codec::utf_16, ty=UTF16LEEncoding, val=UTF_16LE_ENCODING); unique!(var=UTF_16BE, mod=codec::utf_16, ty=UTF16BEEncoding, val=UTF_16BE_ENCODING); unique!(var=WINDOWS_949, mod=codec::korean, val=Windows949Encoding); unique!(var=EUC_JP, mod=codec::japanese, val=EUCJPEncoding); unique!(var=WINDOWS_31J, mod=codec::japanese, val=Windows31JEncoding); unique!(var=ISO_2022_JP, mod=codec::japanese, 
val=ISO2022JPEncoding); unique!(var=GBK, mod=codec::simpchinese, ty=GBKEncoding, val=GBK_ENCODING); unique!(var=GB18030, mod=codec::simpchinese, ty=GB18030Encoding, val=GB18030_ENCODING); unique!(var=HZ, mod=codec::simpchinese, val=HZEncoding); unique!(var=BIG5_2003, mod=codec::tradchinese, val=BigFive2003Encoding); pub mod whatwg { use index_singlebyte as index; use codec; singlebyte!(var=X_USER_DEFINED, mod=codec::whatwg::x_user_defined, name="pua-mapped-binary", whatwg=Some("x-user-defined")); singlebyte!(var=ISO_8859_8_I, mod=index::iso_8859_8, name|whatwg="iso-8859-8-i"); unique!(var=REPLACEMENT, mod=codec::whatwg, val=EncoderOnlyUTF8Encoding); } /// Returns a list of references to the encodings available. pub fn encodings() -> &'static [EncodingRef] { // TODO should be generated automatically const ENCODINGS: &'static [EncodingRef] = &[ ERROR, ASCII, IBM866, ISO_8859_1, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5, ISO_8859_6, ISO_8859_7, ISO_8859_8, ISO_8859_10, ISO_8859_13, ISO_8859_14, ISO_8859_15, ISO_8859_16, KOI8_R, KOI8_U, MAC_ROMAN, WINDOWS_874, WINDOWS_1250, WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254, WINDOWS_1255, WINDOWS_1256, WINDOWS_1257, WINDOWS_1258, MAC_CYRILLIC, UTF_8, UTF_16LE, UTF_16BE, WINDOWS_949, EUC_JP, WINDOWS_31J, ISO_2022_JP, GBK, GB18030, HZ, BIG5_2003, whatwg::X_USER_DEFINED, whatwg::ISO_8859_8_I, whatwg::REPLACEMENT, ]; ENCODINGS } encoding-0.2.33/src/codec/ascii.rs01006440000765000002400000011445125331707360015122 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. //! 7-bit ASCII encoding. use std::mem; use std::convert::Into; use types::*; /** * ASCII, also known as ISO/IEC 646:US. * * It is both a basis and a lowest common denominator of many other encodings * including UTF-8, which Rust internally assumes. 
*/ #[derive(Clone, Copy)] pub struct ASCIIEncoding; impl Encoding for ASCIIEncoding { fn name(&self) -> &'static str { "ascii" } fn raw_encoder(&self) -> Box { ASCIIEncoder::new() } fn raw_decoder(&self) -> Box { ASCIIDecoder::new() } } /// An encoder for ASCII. #[derive(Clone, Copy)] pub struct ASCIIEncoder; impl ASCIIEncoder { pub fn new() -> Box { Box::new(ASCIIEncoder) } } impl RawEncoder for ASCIIEncoder { fn from_self(&self) -> Box { ASCIIEncoder::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option) { output.writer_hint(input.len()); match input.as_bytes().iter().position(|&ch| ch >= 0x80) { Some(first_error) => { output.write_bytes(&input.as_bytes()[..first_error]); let len = input[first_error..].chars().next().unwrap().len_utf8(); (first_error, Some(CodecError { upto: (first_error + len) as isize, cause: "unrepresentable character".into() })) } None => { output.write_bytes(input.as_bytes()); (input.len(), None) } } } fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option { None } } /// A decoder for ASCII. 
#[derive(Clone, Copy)] pub struct ASCIIDecoder; impl ASCIIDecoder { pub fn new() -> Box { Box::new(ASCIIDecoder) } } impl RawDecoder for ASCIIDecoder { fn from_self(&self) -> Box { ASCIIDecoder::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option) { output.writer_hint(input.len()); fn write_ascii_bytes(output: &mut StringWriter, buf: &[u8]) { output.write_str(unsafe {mem::transmute(buf)}); } match input.iter().position(|&ch| ch >= 0x80) { Some(first_error) => { write_ascii_bytes(output, &input[..first_error]); (first_error, Some(CodecError { upto: first_error as isize + 1, cause: "invalid sequence".into() })) } None => { write_ascii_bytes(output, input); (input.len(), None) } } } fn raw_finish(&mut self, _output: &mut StringWriter) -> Option { None } } #[cfg(test)] mod tests { extern crate test; use super::ASCIIEncoding; use testutils; use types::*; #[test] fn test_encoder() { let mut e = ASCIIEncoding.raw_encoder(); assert_feed_ok!(e, "A", "", [0x41]); assert_feed_ok!(e, "BC", "", [0x42, 0x43]); assert_feed_ok!(e, "", "", []); assert_feed_err!(e, "", "\u{a0}", "", []); assert_feed_err!(e, "X", "\u{a0}", "Z", [0x58]); assert_finish_ok!(e, []); } #[test] fn test_decoder() { let mut d = ASCIIEncoding.raw_decoder(); assert_feed_ok!(d, [0x41], [], "A"); assert_feed_ok!(d, [0x42, 0x43], [], "BC"); assert_feed_ok!(d, [], [], ""); assert_feed_err!(d, [], [0xa0], [], ""); assert_feed_err!(d, [0x58], [0xa0], [0x5a], "X"); assert_finish_ok!(d, ""); } #[bench] fn bench_encode(bencher: &mut test::Bencher) { let s = testutils::ASCII_TEXT; bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ ASCIIEncoding.encode(s, EncoderTrap::Strict) })) } #[bench] fn bench_decode(bencher: &mut test::Bencher) { let s = testutils::ASCII_TEXT.as_bytes(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ ASCIIEncoding.decode(s, DecoderTrap::Strict) })) } #[bench] fn 
bench_encode_replace(bencher: &mut test::Bencher) { let s = testutils::KOREAN_TEXT; bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ ASCIIEncoding.encode(s, EncoderTrap::Replace) })) } #[bench] fn bench_decode_replace(bencher: &mut test::Bencher) { let s = testutils::KOREAN_TEXT.as_bytes(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ ASCIIEncoding.decode(s, DecoderTrap::Replace) })) } } encoding-0.2.33/src/codec/error.rs01006440000765000002400000005170125331707360015161 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. //! A placeholder encoding that returns encoder/decoder error for every case. use std::convert::Into; use types::*; /// An encoding that returns encoder/decoder error for every case. #[derive(Clone, Copy)] pub struct ErrorEncoding; impl Encoding for ErrorEncoding { fn name(&self) -> &'static str { "error" } fn raw_encoder(&self) -> Box { ErrorEncoder::new() } fn raw_decoder(&self) -> Box { ErrorDecoder::new() } } /// An encoder that always returns error. #[derive(Clone, Copy)] pub struct ErrorEncoder; impl ErrorEncoder { pub fn new() -> Box { Box::new(ErrorEncoder) } } impl RawEncoder for ErrorEncoder { fn from_self(&self) -> Box { ErrorEncoder::new() } fn raw_feed(&mut self, input: &str, _output: &mut ByteWriter) -> (usize, Option) { if let Some(ch) = input.chars().next() { (0, Some(CodecError { upto: ch.len_utf8() as isize, cause: "unrepresentable character".into() })) } else { (0, None) } } fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option { None } } /// A decoder that always returns error. 
#[derive(Clone, Copy)] pub struct ErrorDecoder; impl ErrorDecoder { pub fn new() -> Box { Box::new(ErrorDecoder) } } impl RawDecoder for ErrorDecoder { fn from_self(&self) -> Box { ErrorDecoder::new() } fn raw_feed(&mut self, input: &[u8], _output: &mut StringWriter) -> (usize, Option) { if input.len() > 0 { (0, Some(CodecError { upto: 1, cause: "invalid sequence".into() })) } else { (0, None) } } fn raw_finish(&mut self, _output: &mut StringWriter) -> Option { None } } #[cfg(test)] mod tests { use super::ErrorEncoding; use types::*; #[test] fn test_encoder() { let mut e = ErrorEncoding.raw_encoder(); assert_feed_err!(e, "", "A", "", []); assert_feed_err!(e, "", "B", "C", []); assert_feed_ok!(e, "", "", []); assert_feed_err!(e, "", "\u{a0}", "", []); assert_finish_ok!(e, []); } #[test] fn test_decoder() { let mut d = ErrorEncoding.raw_decoder(); assert_feed_err!(d, [], [0x41], [], ""); assert_feed_err!(d, [], [0x42], [0x43], ""); assert_feed_ok!(d, [], [], ""); assert_feed_err!(d, [], [0xa0], [], ""); assert_finish_ok!(d, ""); } } encoding-0.2.33/src/codec/japanese.rs01006440000765000002400000142171125331707360015621 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. //! Legacy Japanese encodings based on JIS X 0208 and JIS X 0212. use std::convert::Into; use std::default::Default; use util::StrCharIndex; use index_japanese as index; use types::*; use self::ISO2022JPState::{ASCII,Katakana,Lead}; /** * EUC-JP. (XXX with asymmetric JIS X 0212 support) * * This is a Japanese encoding created from three JIS character sets: * * - JIS X 0201, which lower half is ISO/IEC 646:JP (US-ASCII with yen sign and overline) * and upper half contains legacy half-width Katakanas. * - JIS X 0208, a primary graphic character set (94x94). * - JIS X 0212, a supplementary graphic character set (94x94). 
* * EUC-JP contains the lower half of JIS X 0201 in G0 (`[21-7E]`), * JIS X 0208 in G1 (`[A1-FE] [A1-FE]`), * the upper half of JIS X 0201 in G2 (`8E [A1-DF]`), and * JIS X 0212 in G3 (`8F [A1-FE] [A1-FE]`). */ #[derive(Clone, Copy)] pub struct EUCJPEncoding; impl Encoding for EUCJPEncoding { fn name(&self) -> &'static str { "euc-jp" } fn whatwg_name(&self) -> Option<&'static str> { Some("euc-jp") } fn raw_encoder(&self) -> Box { EUCJPEncoder::new() } fn raw_decoder(&self) -> Box { EUCJP0212Decoder::new() } } /// An encoder for EUC-JP with unused G3 character set. #[derive(Clone, Copy)] pub struct EUCJPEncoder; impl EUCJPEncoder { pub fn new() -> Box { Box::new(EUCJPEncoder) } } impl RawEncoder for EUCJPEncoder { fn from_self(&self) -> Box { EUCJPEncoder::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option) { output.writer_hint(input.len()); for ((i,j), ch) in input.index_iter() { match ch { '\u{0}'...'\u{7f}' => { output.write_byte(ch as u8); } '\u{a5}' => { output.write_byte(0x5c); } '\u{203e}' => { output.write_byte(0x7e); } '\u{ff61}'...'\u{ff9f}' => { output.write_byte(0x8e); output.write_byte((ch as usize - 0xff61 + 0xa1) as u8); } _ => { let ptr = index::jis0208::backward(ch as u32); if ptr == 0xffff { return (i, Some(CodecError { upto: j as isize, cause: "unrepresentable character".into() })); } else { let lead = ptr / 94 + 0xa1; let trail = ptr % 94 + 0xa1; output.write_byte(lead as u8); output.write_byte(trail as u8); } } } } (input.len(), None) } fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option { None } } /// A decoder for EUC-JP with JIS X 0212 in G3.
#[derive(Clone, Copy)]
struct EUCJP0212Decoder {
    // current state of the `eucjp` state machine declared by `stateful_decoder!` below
    st: eucjp::State,
}

impl EUCJP0212Decoder {
    /// Creates a boxed decoder whose state machine starts from its default state.
    pub fn new() -> Box<RawDecoder> {
        Box::new(EUCJP0212Decoder { st: Default::default() })
    }
}

impl RawDecoder for EUCJP0212Decoder {
    /// Returns a fresh decoder; the current state is not carried over.
    fn from_self(&self) -> Box<RawDecoder> { EUCJP0212Decoder::new() }
    fn is_ascii_compatible(&self) -> bool { true }

    /// Feeds `input` to the `eucjp` state machine, writing decoded text to `output`.
    ///
    /// Returns the `processed` count and the optional error reported by `eucjp::raw_feed`;
    /// the state it hands back is stored so that a later call continues from it.
    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
        let (st, processed, err) = eucjp::raw_feed(self.st, input, output, &());
        self.st = st;
        (processed, err)
    }

    /// Finishes the stream via `eucjp::raw_finish`, storing the state it returns
    /// and passing its optional error through.
    fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
        let (st, err) = eucjp::raw_finish(self.st, output, &());
        self.st = st;
        err
    }
}

stateful_decoder! {
    module eucjp;

    // Maps a two-byte sequence to an index `(lead - 0xa1) * 94 + (trail - 0xa1)` and looks it up
    // in the JIS X 0208 forward table. Bytes outside `[A1-FE] [A1-FE]` use the index 0xffff,
    // which the states below treat as "no mapping" when it comes back from the lookup.
    internal pub fn map_two_0208_bytes(lead: u8, trail: u8) -> u32 {
        use index_japanese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0xa1...0xfe, 0xa1...0xfe) => (lead - 0xa1) * 94 + trail - 0xa1,
            _ => 0xffff,
        };
        index::jis0208::forward(index)
    }

    // Same index computation as above, but looked up in the JIS X 0212 forward table.
    internal pub fn map_two_0212_bytes(lead: u8, trail: u8) -> u32 {
        use index_japanese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0xa1...0xfe, 0xa1...0xfe) => (lead - 0xa1) * 94 + trail - 0xa1,
            _ => 0xffff,
        };
        index::jis0212::forward(index)
    }

initial:
    // euc-jp lead = 0x00
    state S0(ctx: Context) {
        case b @ 0x00...0x7f => ctx.emit(b as u32);
        case 0x8e => S1(ctx);
        case 0x8f => S2(ctx);
        case b @ 0xa1...0xfe => S3(ctx, b);
        case _ => ctx.err("invalid sequence");
    }

transient:
    // euc-jp lead = 0x8e
    // JIS X 0201 half-width katakana
    state S1(ctx: Context) {
        case b @ 0xa1...0xdf => ctx.emit(0xff61 + b as u32 - 0xa1);
        // reached only for 0xe0...0xfe, as the arm above already took 0xa1...0xdf
        case 0xa1...0xfe => ctx.err("invalid sequence");
        case _ => ctx.backup_and_err(1, "invalid sequence");
    }

    // euc-jp lead = 0x8f
    // first byte of a JIS X 0212 three-byte sequence
    state S2(ctx: Context) {
        case b @ 0xa1...0xfe => S4(ctx, b);
        case _ => ctx.backup_and_err(1, "invalid sequence");
    }

    // euc-jp lead != 0x00, euc-jp jis0212 flag = unset
    // JIS X 0208 two-byte sequence
    state S3(ctx: Context, lead: u8) {
        case b @ 0xa1...0xfe =>
            match map_two_0208_bytes(lead, b) {
                // do NOT backup, we only backup for out-of-range trails.
                0xffff => ctx.err("invalid sequence"),
                ch => ctx.emit(ch as u32)
            };
        case _ => ctx.backup_and_err(1, "invalid sequence");
    }

    // euc-jp lead != 0x00, euc-jp jis0212 flag = set
    // JIS X 0212 three-byte sequence
    state S4(ctx: Context, lead: u8) {
        case b @ 0xa1...0xfe =>
            match map_two_0212_bytes(lead, b) {
                // do NOT backup, we only backup for out-of-range trails.
                0xffff => ctx.err("invalid sequence"),
                ch => ctx.emit(ch as u32)
            };
        case _ => ctx.backup_and_err(1, "invalid sequence");
    }
}

#[cfg(test)]
mod eucjp_tests {
    extern crate test;
    use super::EUCJPEncoding;
    use testutils;
    use types::*;

    #[test]
    fn test_encoder_valid() {
        let mut e = EUCJPEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(e, "\u{a5}", "", [0x5c]);
        assert_feed_ok!(e, "\u{203e}", "", [0x7e]);
        assert_feed_ok!(e, "\u{306b}\u{307b}\u{3093}", "", [0xa4, 0xcb, 0xa4, 0xdb, 0xa4, 0xf3]);
        assert_feed_ok!(e, "\u{ff86}\u{ff8e}\u{ff9d}", "", [0x8e, 0xc6, 0x8e, 0xce, 0x8e, 0xdd]);
        assert_feed_ok!(e, "\u{65e5}\u{672c}", "", [0xc6, 0xfc, 0xcb, 0xdc]);
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_encoder_double_mapped() {
        // these characters are double-mapped to both EUDC area and Shift_JIS extension area
        // but only the former should be used. (note that U+FFE2 is triple-mapped!)
        let mut e = EUCJPEncoding.raw_encoder();
        assert_feed_ok!(e, "\u{9ed1}\u{2170}\u{ffe2}", "", [0xfc, 0xee, 0xfc, 0xf1, 0xa2, 0xcc]);
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_encoder_invalid() {
        let mut e = EUCJPEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        // JIS X 0212 is not supported in the encoder
        assert_feed_err!(e, "", "\u{736c}", "\u{8c78}", []);
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_decoder_valid() {
        let mut d = EUCJPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0x5c], [], "\\");
        assert_feed_ok!(d, [0x7e], [], "~");
        assert_feed_ok!(d, [0xa4, 0xcb, 0xa4, 0xdb, 0xa4, 0xf3], [], "\u{306b}\u{307b}\u{3093}");
        assert_feed_ok!(d, [0x8e, 0xc6, 0x8e, 0xce, 0x8e, 0xdd], [], "\u{ff86}\u{ff8e}\u{ff9d}");
        assert_feed_ok!(d, [0xc6, 0xfc, 0xcb, 0xdc], [], "\u{65e5}\u{672c}");
        assert_feed_ok!(d, [0x8f, 0xcb, 0xc6, 0xec, 0xb8], [], "\u{736c}\u{8c78}");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_valid_partial() {
        let mut d = EUCJPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0xa4], "");
        assert_feed_ok!(d, [0xcb], [0xa4], "\u{306b}");
        assert_feed_ok!(d, [0xdb], [0xa4], "\u{307b}");
        assert_feed_ok!(d, [0xf3], [], "\u{3093}");
        assert_feed_ok!(d, [], [0x8e], "");
        assert_feed_ok!(d, [0xc6], [0x8e], "\u{ff86}");
        assert_feed_ok!(d, [0xce], [0x8e], "\u{ff8e}");
        assert_feed_ok!(d, [0xdd], [], "\u{ff9d}");
        assert_feed_ok!(d, [], [0xc6], "");
        assert_feed_ok!(d, [0xfc], [0xcb], "\u{65e5}");
        assert_feed_ok!(d, [0xdc], [], "\u{672c}");
        assert_feed_ok!(d, [], [0x8f], "");
        assert_feed_ok!(d, [], [0xcb], "");
        assert_feed_ok!(d, [0xc6], [0xec], "\u{736c}");
        assert_feed_ok!(d, [0xb8], [], "\u{8c78}");
        assert_feed_ok!(d, [], [0x8f, 0xcb], "");
        assert_feed_ok!(d, [0xc6, 0xec, 0xb8], [], "\u{736c}\u{8c78}");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
        for i in 0x8e..0x90 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }

        for i in 0xa1..0xff {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }

        // immediate failures
        let mut d = EUCJPEncoding.raw_decoder();
        for i in 0x80..0x8e {
            assert_feed_err!(d, [], [i], [], "");
        }
        for i in 0x90..0xa1 {
            assert_feed_err!(d, [], [i], [], "");
        }
        assert_feed_err!(d, [], [0xff], [], "");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_lone_lead_followed_by_space() {
        for i in 0x80..0x100 {
            let i = i as u8;
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x20], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
        for i in 0x80..0x100 {
            let i = i as u8;
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x80], "");
            assert_feed_err!(d, [], [i], [0xff], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_lone_lead_for_0212_immediate_test_finish() {
        for i in 0xa1..0xff {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [0x8f, i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_lone_lead_for_0212_immediate_test_finish_partial() {
        for i in 0xa1..0xff {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [0x8f], "");
            assert_feed_ok!(d, [], [i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_trail_for_0201() {
        for i in 0..0xa1 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_err!(d, [], [0x8e], [i], "");
            assert_finish_ok!(d, "");
        }

        for i in 0xe0..0xff {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_err!(d, [], [0x8e, i], [], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_trail_for_0201_partial() {
        for i in 0..0xa1 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [0x8e], "");
            assert_feed_err!(d, [], [], [i], "");
            assert_finish_ok!(d, "");
        }

        for i in 0xe0..0xff {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [0x8e], "");
            assert_feed_err!(d, [], [i], [], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_middle_for_0212() {
        for i in 0..0xa1 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_err!(d, [], [0x8f], [i], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_middle_for_0212_partial() {
        for i in 0..0xa1 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [0x8f], "");
            assert_feed_err!(d, [], [], [i], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_trail_for_0212() {
        for i in 0..0xa1 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_err!(d, [], [0x8f, 0xa1], [i], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_trail_for_0212_partial() {
        for i in 0..0xa1 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [0x8f], "");
            assert_feed_ok!(d, [], [0xa1], "");
            assert_feed_err!(d, [], [], [i], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = EUCJPEncoding.raw_decoder();
        assert_feed_ok!(d, [0xa4, 0xa2], [0xa4], "\u{3042}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xa4, 0xa2], [], "\u{3042}");
        assert_finish_ok!(d, "");
    }

    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::JAPANESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            EUCJPEncoding.encode(&s, EncoderTrap::Strict)
        }))
    }

    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = EUCJPEncoding.encode(testutils::JAPANESE_TEXT,
                                     EncoderTrap::Strict).ok().unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            EUCJPEncoding.decode(&s, DecoderTrap::Strict)
        }))
    }
}

/**
 * Windows code page 932, i.e. Shift_JIS with IBM/NEC extensions.
* * This is a Japanese encoding for JIS X 0208 * compatible to the original assignments of JIS X 0201 (`[21-7E A1-DF]`). * The 94 by 94 region of JIS X 0208 is sliced, or rather "shifted" into * the odd half (odd row number) and even half (even row number), * and merged into the 188 by 47 region mapped to `[81-9F E0-EF] [40-7E 80-FC]`. * The remaining area, `[80 A0 F0-FF] [40-7E 80-FC]`, has been subjected to * numerous extensions incompatible to each other. * This particular implementation uses IBM/NEC extensions * which assigns more characters to `[F0-FC 80-FC]` and also to the Private Use Area (PUA). * It requires some cares to handle * since the second byte of JIS X 0208 can have its MSB unset. */ #[derive(Clone, Copy)] pub struct Windows31JEncoding; impl Encoding for Windows31JEncoding { fn name(&self) -> &'static str { "windows-31j" } fn whatwg_name(&self) -> Option<&'static str> { Some("shift_jis") } // WHATWG compatibility fn raw_encoder(&self) -> Box { Windows31JEncoder::new() } fn raw_decoder(&self) -> Box { Windows31JDecoder::new() } } /// An encoder for Shift_JIS with IBM/NEC extensions. 
#[derive(Clone, Copy)] pub struct Windows31JEncoder; impl Windows31JEncoder { pub fn new() -> Box { Box::new(Windows31JEncoder) } } impl RawEncoder for Windows31JEncoder { fn from_self(&self) -> Box { Windows31JEncoder::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option) { output.writer_hint(input.len()); for ((i,j), ch) in input.index_iter() { match ch { '\u{0}'...'\u{80}' => { output.write_byte(ch as u8); } '\u{a5}' => { output.write_byte(0x5c); } '\u{203e}' => { output.write_byte(0x7e); } '\u{ff61}'...'\u{ff9f}' => { output.write_byte((ch as usize - 0xff61 + 0xa1) as u8); } _ => { // corresponds to the "index shift_jis pointer" in the WHATWG spec let ptr = index::jis0208::backward_remapped(ch as u32); if ptr == 0xffff { return (i, Some(CodecError { upto: j as isize, cause: "unrepresentable character".into(), })); } else { let lead = ptr / 188; let leadoffset = if lead < 0x1f {0x81} else {0xc1}; let trail = ptr % 188; let trailoffset = if trail < 0x3f {0x40} else {0x41}; output.write_byte((lead + leadoffset) as u8); output.write_byte((trail + trailoffset) as u8); } } } } (input.len(), None) } fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option { None } } /// A decoder for Shift_JIS with IBM/NEC extensions. 
#[derive(Clone, Copy)]
struct Windows31JDecoder {
    // current state of the `windows31j` state machine declared by `stateful_decoder!` below
    st: windows31j::State,
}

impl Windows31JDecoder {
    /// Creates a boxed decoder whose state machine starts from its default state.
    pub fn new() -> Box<RawDecoder> {
        Box::new(Windows31JDecoder { st: Default::default() })
    }
}

impl RawDecoder for Windows31JDecoder {
    /// Returns a fresh decoder; the current state is not carried over.
    fn from_self(&self) -> Box<RawDecoder> { Windows31JDecoder::new() }
    fn is_ascii_compatible(&self) -> bool { true }

    /// Feeds `input` to the `windows31j` state machine, writing decoded text to `output`.
    ///
    /// Returns the `processed` count and the optional error reported by
    /// `windows31j::raw_feed`; the state it hands back is stored for the next call.
    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
        let (st, processed, err) = windows31j::raw_feed(self.st, input, output, &());
        self.st = st;
        (processed, err)
    }

    /// Finishes the stream via `windows31j::raw_finish`, storing the state it returns
    /// and passing its optional error through.
    fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
        let (st, err) = windows31j::raw_finish(self.st, output, &());
        self.st = st;
        err
    }
}

stateful_decoder! {
    module windows31j;

    // Maps a lead/trail pair to a code point.
    // Leads 0xf0...0xf9 with a valid trail are computed directly as 0xe000 + offset
    // (the tests below expect U+E000 for `F0 40` and U+E757 for `F9 FC`);
    // other valid pairs are looked up in the JIS X 0208 forward table.
    // Everything else uses the index 0xffff, which S1 treats as "no mapping"
    // when it comes back from the lookup.
    internal pub fn map_two_0208_bytes(lead: u8, trail: u8) -> u32 {
        use index_japanese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let leadoffset = if lead < 0xa0 {0x81} else {0xc1};
        let trailoffset = if trail < 0x7f {0x40} else {0x41};
        let index = match (lead, trail) {
            (0xf0...0xf9, 0x40...0x7e) | (0xf0...0xf9, 0x80...0xfc) =>
                return (0xe000 + (lead - 0xf0) * 188 + trail - trailoffset) as u32,
            (0x81...0x9f, 0x40...0x7e) | (0x81...0x9f, 0x80...0xfc) |
            (0xe0...0xfc, 0x40...0x7e) | (0xe0...0xfc, 0x80...0xfc) =>
                (lead - leadoffset) * 188 + trail - trailoffset,
            _ => 0xffff,
        };
        index::jis0208::forward(index)
    }

initial:
    // shift_jis lead = 0x00
    state S0(ctx: Context) {
        case b @ 0x00...0x80 => ctx.emit(b as u32);
        case b @ 0xa1...0xdf => ctx.emit(0xff61 + b as u32 - 0xa1);
        case b @ 0x81...0x9f, b @ 0xe0...0xfc => S1(ctx, b);
        case _ => ctx.err("invalid sequence");
    }

transient:
    // shift_jis lead != 0x00
    state S1(ctx: Context, lead: u8) {
        case b =>
            match map_two_0208_bytes(lead, b) {
                0xffff => ctx.backup_and_err(1, "invalid sequence"), // unconditional
                ch => ctx.emit(ch)
            };
    }
}

#[cfg(test)]
mod windows31j_tests {
    extern crate test;
    use super::Windows31JEncoding;
    use testutils;
    use types::*;

    #[test]
    fn test_encoder_valid() {
        let mut e = Windows31JEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(e, "\u{a5}", "", [0x5c]);
        assert_feed_ok!(e, "\u{203e}", "", [0x7e]);
        assert_feed_ok!(e, "\u{306b}\u{307b}\u{3093}", "", [0x82, 0xc9, 0x82, 0xd9, 0x82, 0xf1]);
        assert_feed_ok!(e, "\u{ff86}\u{ff8e}\u{ff9d}", "", [0xc6, 0xce, 0xdd]);
        assert_feed_ok!(e, "\u{65e5}\u{672c}", "", [0x93, 0xfa, 0x96, 0x7b]);
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_encoder_no_eudc() {
        let mut e = Windows31JEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{e000}", "", []);
        assert_feed_err!(e, "", "\u{e757}", "", []);
        assert_feed_err!(e, "", "\u{e758}", "", []);
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_encoder_double_mapped() {
        // these characters are double-mapped to both EUDC area and Shift_JIS extension area
        // but only the latter should be used. (note that U+FFE2 is triple-mapped!)
        let mut e = Windows31JEncoding.raw_encoder();
        assert_feed_ok!(e, "\u{9ed1}\u{2170}\u{ffe2}", "", [0xfc, 0x4b, 0xfa, 0x40, 0x81, 0xca]);
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_encoder_invalid() {
        let mut e = Windows31JEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        assert_feed_err!(e, "", "\u{736c}", "\u{8c78}", []);
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_decoder_valid() {
        let mut d = Windows31JEncoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0x5c], [], "\\");
        assert_feed_ok!(d, [0x7e], [], "~");
        assert_feed_ok!(d, [0x80], [], "\u{80}"); // compatibility
        assert_feed_ok!(d, [0x82, 0xc9, 0x82, 0xd9, 0x82, 0xf1], [], "\u{306b}\u{307b}\u{3093}");
        assert_feed_ok!(d, [0xc6, 0xce, 0xdd], [], "\u{ff86}\u{ff8e}\u{ff9d}");
        assert_feed_ok!(d, [0x93, 0xfa, 0x96, 0x7b], [], "\u{65e5}\u{672c}");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_eudc() {
        let mut d = Windows31JEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0xf0], "");
        assert_feed_ok!(d, [0x40], [], "\u{e000}");
        assert_feed_ok!(d, [0xf9, 0xfc], [], "\u{e757}");
        assert_feed_err!(d, [], [0xf0], [0x00], "");
        assert_feed_err!(d, [], [0xf0], [0xff], "");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
        for i in 0x81..0xa0 {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_ok!(d, [], [i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }

        for i in 0xe0..0xfd {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_ok!(d, [], [i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }

        // A0/FD/FE/FF: immediate failure
        let mut d = Windows31JEncoding.raw_decoder();
        assert_feed_err!(d, [], [0xa0], [], "");
        assert_feed_err!(d, [], [0xfd], [], "");
        assert_feed_err!(d, [], [0xfe], [], "");
        assert_feed_err!(d, [], [0xff], [], "");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_lone_lead_followed_by_space() {
        for i in 0x81..0xa0 {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x20], "");
            assert_finish_ok!(d, "");
        }

        for i in 0xe0..0xfd {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x20], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
        for i in 0x81..0xa0 {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x3f], "");
            assert_feed_err!(d, [], [i], [0x7f], "");
            assert_feed_err!(d, [], [i], [0xfd], "");
            assert_feed_err!(d, [], [i], [0xfe], "");
            assert_feed_err!(d, [], [i], [0xff], "");
            assert_finish_ok!(d, "");
        }

        for i in 0xe0..0xfd {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x3f], "");
            assert_feed_err!(d, [], [i], [0x7f], "");
            assert_feed_err!(d, [], [i], [0xfd], "");
            assert_feed_err!(d, [], [i], [0xfe], "");
            assert_feed_err!(d, [], [i], [0xff], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail_partial() {
        for i in 0x81..0xa0 {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_ok!(d, [], [i], "");
            assert_feed_err!(d, [], [], [0xff], "");
            assert_finish_ok!(d, "");
        }

        for i in 0xe0..0xfd {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_ok!(d, [], [i], "");
            assert_feed_err!(d, [], [], [0xff], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = Windows31JEncoding.raw_decoder();
        assert_feed_ok!(d, [0x82, 0xa0], [0x82], "\u{3042}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x82, 0xa0], [], "\u{3042}");
        assert_finish_ok!(d, "");
    }

    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::JAPANESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            Windows31JEncoding.encode(&s, EncoderTrap::Strict)
        }))
    }

    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = Windows31JEncoding.encode(testutils::JAPANESE_TEXT,
                                          EncoderTrap::Strict).ok().unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            Windows31JEncoding.decode(&s, DecoderTrap::Strict)
        }))
    }
}

/**
 * ISO-2022-JP.
 *
 * This version of ISO-2022-JP does not correspond to any standardized repertoire of character sets
 * due to the widespread implementation differences.
The following character sets are supported: * * - JIS X 0201-1976 roman (`ESC ( J` or `ESC ( B`; the latter is originally allocated to ASCII * but willfully violated) * - JIS X 0201-1976 kana (`ESC ( I`) * - JIS X 0208-1983 (`ESC $ B` or `ESC $ @`; the latter is originally allocated to JIS X 0208-1978 * but willfully violated) * - JIS X 0212-1990 (`ESC $ ( D`, XXX asymmetric support) */ #[derive(Clone, Copy)] pub struct ISO2022JPEncoding; impl Encoding for ISO2022JPEncoding { fn name(&self) -> &'static str { "iso-2022-jp" } fn whatwg_name(&self) -> Option<&'static str> { Some("iso-2022-jp") } fn raw_encoder(&self) -> Box { ISO2022JPEncoder::new() } fn raw_decoder(&self) -> Box { ISO2022JPDecoder::new() } } #[derive(PartialEq,Clone,Copy)] enum ISO2022JPState { ASCII, // U+0000..007F, U+00A5, U+203E Katakana, // JIS X 0201: U+FF61..FF9F Lead, // JIS X 0208 } /// An encoder for ISO-2022-JP without JIS X 0212/0213 support. #[derive(Clone, Copy)] pub struct ISO2022JPEncoder { st: ISO2022JPState } impl ISO2022JPEncoder { pub fn new() -> Box { Box::new(ISO2022JPEncoder { st: ASCII }) } } impl RawEncoder for ISO2022JPEncoder { fn from_self(&self) -> Box { ISO2022JPEncoder::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option) { output.writer_hint(input.len()); let mut st = self.st; macro_rules! ensure_ASCII( () => (if st != ASCII { output.write_bytes(b"\x1b(B"); st = ASCII; }) ); macro_rules! ensure_Katakana( () => (if st != Katakana { output.write_bytes(b"\x1b(I"); st = Katakana; }) ); macro_rules! 
ensure_Lead( () => (if st != Lead { output.write_bytes(b"\x1b$B"); st = Lead; }) ); for ((i,j), ch) in input.index_iter() { match ch { '\u{0}'...'\u{7f}' => { ensure_ASCII!(); output.write_byte(ch as u8); } '\u{a5}' => { ensure_ASCII!(); output.write_byte(0x5c); } '\u{203e}' => { ensure_ASCII!(); output.write_byte(0x7e); } '\u{ff61}'...'\u{ff9f}' => { ensure_Katakana!(); output.write_byte((ch as usize - 0xff61 + 0x21) as u8); } _ => { let ptr = index::jis0208::backward(ch as u32); if ptr == 0xffff { self.st = st; // do NOT reset the state! return (i, Some(CodecError { upto: j as isize, cause: "unrepresentable character".into() })); } else { ensure_Lead!(); let lead = ptr / 94 + 0x21; let trail = ptr % 94 + 0x21; output.write_byte(lead as u8); output.write_byte(trail as u8); } } } } self.st = st; (input.len(), None) } fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option { None } } /// A decoder for ISO-2022-JP with JIS X 0212 support. #[derive(Clone, Copy)] struct ISO2022JPDecoder { st: iso2022jp::State, } impl ISO2022JPDecoder { pub fn new() -> Box { Box::new(ISO2022JPDecoder { st: Default::default() }) } } impl RawDecoder for ISO2022JPDecoder { fn from_self(&self) -> Box { ISO2022JPDecoder::new() } fn is_ascii_compatible(&self) -> bool { false } fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option) { let (st, processed, err) = iso2022jp::raw_feed(self.st, input, output, &()); self.st = st; (processed, err) } fn raw_finish(&mut self, output: &mut StringWriter) -> Option { let (st, err) = iso2022jp::raw_finish(self.st, output, &()); self.st = st; err } } stateful_decoder! 
{
    module iso2022jp;

    // Maps a two-byte sequence to an index `(lead - 0x21) * 94 + (trail - 0x21)` and looks it up
    // in the JIS X 0208 forward table. Bytes outside `[21-7E] [21-7E]` use the index 0xffff,
    // which the Trail states below treat as "no mapping" when it comes back from the lookup.
    internal pub fn map_two_0208_bytes(lead: u8, trail: u8) -> u32 {
        use index_japanese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0x21...0x7e, 0x21...0x7e) => (lead - 0x21) * 94 + trail - 0x21,
            _ => 0xffff,
        };
        index::jis0208::forward(index)
    }

    // Same index computation as above, but looked up in the JIS X 0212 forward table.
    internal pub fn map_two_0212_bytes(lead: u8, trail: u8) -> u32 {
        use index_japanese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0x21...0x7e, 0x21...0x7e) => (lead - 0x21) * 94 + trail - 0x21,
            _ => 0xffff,
        };
        index::jis0212::forward(index)
    }

    // NOTE(review): the meaning of the `initial:` / `checkpoint:` / `transient:` sections and of
    // the `ctx.*` actions is defined by the `stateful_decoder!` macro, which is not visible here.

initial:
    // iso-2022-jp state = ASCII, iso-2022-jp jis0212 flag = unset, iso-2022-jp lead = 0x00
    state ASCII(ctx: Context) {
        case 0x1b => EscapeStart(ctx);
        case b @ 0x00...0x7f => ctx.emit(b as u32), ASCII(ctx);
        case _ => ctx.err("invalid sequence"), ASCII(ctx);
        final => ctx.reset();
    }

checkpoint:
    // iso-2022-jp state = Lead, iso-2022-jp jis0212 flag = unset
    state Lead0208(ctx: Context) {
        case 0x0a => ctx.emit(0x000a); // return to ASCII
        case 0x1b => EscapeStart(ctx);
        case b => Trail0208(ctx, b);
        final => ctx.reset();
    }

    // iso-2022-jp state = Lead, iso-2022-jp jis0212 flag = set
    state Lead0212(ctx: Context) {
        case 0x0a => ctx.emit(0x000a); // return to ASCII
        case 0x1b => EscapeStart(ctx);
        case b => Trail0212(ctx, b);
        final => ctx.reset();
    }

    // iso-2022-jp state = Katakana
    state Katakana(ctx: Context) {
        case 0x1b => EscapeStart(ctx);
        // 0x21...0x5f map onto the half-width katakana block starting at U+FF61
        case b @ 0x21...0x5f => ctx.emit(0xff61 + b as u32 - 0x21), Katakana(ctx);
        case _ => ctx.err("invalid sequence"), Katakana(ctx);
        final => ctx.reset();
    }

transient:
    // iso-2022-jp state = EscapeStart
    // ESC
    state EscapeStart(ctx: Context) {
        case 0x24 => EscapeMiddle24(ctx); // ESC $
        case 0x28 => EscapeMiddle28(ctx); // ESC (
        case _ => ctx.backup_and_err(1, "invalid sequence");
        final => ctx.err("incomplete sequence");
    }

    // iso-2022-jp state = EscapeMiddle, iso-2022-jp lead = 0x24
    // ESC $
    state EscapeMiddle24(ctx: Context) {
        case 0x40, 0x42 => Lead0208(ctx); // ESC $ @ (JIS X 0208-1978) or ESC $ B (-1983)
        case 0x28 => EscapeFinal(ctx); // ESC $ (
        case _ => ctx.backup_and_err(2, "invalid sequence");
        final => ctx.err("incomplete sequence");
    }

    // iso-2022-jp state = EscapeMiddle, iso-2022-jp lead = 0x28
    // ESC (
    state EscapeMiddle28(ctx: Context) {
        case 0x42, 0x4a => ctx.reset(); // ESC ( B (ASCII) or ESC ( J (JIS X 0201-1976 roman)
        case 0x49 => Katakana(ctx); // ESC ( I (JIS X 0201-1976 kana)
        case _ => ctx.backup_and_err(2, "invalid sequence");
        final => ctx.err("incomplete sequence");
    }

    // iso-2022-jp state = EscapeFinal
    // ESC $ (
    state EscapeFinal(ctx: Context) {
        case 0x44 => Lead0212(ctx); // ESC $ ( D (JIS X 0212-1990)
        case _ => ctx.backup_and_err(3, "invalid sequence");
        // NOTE(review): the only `final` arm that backs up; the other escape states use
        // plain `ctx.err` here -- confirm this asymmetry is intended
        final => ctx.backup_and_err(1, "incomplete sequence");
    }

    // iso-2022-jp state = Trail, iso-2022-jp jis0212 flag = unset
    state Trail0208(ctx: Context, lead: u8) {
        case b =>
            match map_two_0208_bytes(lead, b) {
                0xffff => ctx.err("invalid sequence"),
                ch => ctx.emit(ch as u32)
            },
            Lead0208(ctx);
        final => ctx.err("incomplete sequence");
    }

    // iso-2022-jp state = Trail, iso-2022-jp jis0212 flag = set
    state Trail0212(ctx: Context, lead: u8) {
        case b =>
            match map_two_0212_bytes(lead, b) {
                0xffff => ctx.err("invalid sequence"),
                ch => ctx.emit(ch as u32)
            },
            Lead0212(ctx);
        final => ctx.err("incomplete sequence");
    }
}

#[cfg(test)]
mod iso2022jp_tests {
    extern crate test;
    use super::ISO2022JPEncoding;
    use testutils;
    use types::*;

    #[test]
    fn test_encoder_valid() {
        let mut e = ISO2022JPEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "\x1b\x24\x42", "", [0x1b, 0x24, 0x42]); // no round-trip guarantee
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(e, "\u{a5}", "", [0x5c]);
        assert_feed_ok!(e, "\u{203e}", "", [0x7e]);
        assert_feed_ok!(e, "\u{306b}\u{307b}\u{3093}", "", [0x1b, 0x24, 0x42,
                                                           0x24, 0x4b,
                                                           0x24, 0x5b,
                                                           0x24, 0x73]);
        assert_feed_ok!(e, "\u{65e5}\u{672c}", "", [0x46, 0x7c,
0x4b, 0x5c]);
        assert_feed_ok!(e, "\u{ff86}\u{ff8e}\u{ff9d}", "", [0x1b, 0x28, 0x49, 0x46, 0x4e, 0x5d]);
        assert_feed_ok!(e, "XYZ", "", [0x1b, 0x28, 0x42, 0x58, 0x59, 0x5a]);
        assert_finish_ok!(e, []);

        // one ASCII character and two similarly looking characters:
        // - A: U+0020 SPACE (requires ASCII state)
        // - B: U+30CD KATAKANA LETTER NE (requires JIS X 0208 Lead state)
        // - C: U+FF88 HALFWIDTH KATAKANA LETTER NE (requires Katakana state)
        // - D is omitted as the encoder does not support JIS X 0212.
        // a (3,2) De Bruijn near-sequence "ABCACBA" is used to test all possible cases.
        const AD: &'static str = "\x20";
        const BD: &'static str = "\u{30cd}";
        const CD: &'static str = "\u{ff88}";
        const AE: &'static [u8] = &[0x1b, 0x28, 0x42, 0x20];
        const BE: &'static [u8] = &[0x1b, 0x24, 0x42, 0x25, 0x4d];
        const CE: &'static [u8] = &[0x1b, 0x28, 0x49, 0x48];
        let mut e = ISO2022JPEncoding.raw_encoder();
        // the very first character needs no escape sequence, hence the bare "\x20" / [0x20]
        let decoded: String = ["\x20", BD, CD, AD, CD, BD, AD].concat();
        let encoded: Vec<_> = [&[0x20][..], BE, CE, AE, CE, BE, AE].concat();
        assert_feed_ok!(e, decoded, "", encoded);
        assert_finish_ok!(e, []);
    }

    /// Unrepresentable characters must be reported without emitting any bytes for them.
    #[test]
    fn test_encoder_invalid() {
        let mut e = ISO2022JPEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        // JIS X 0212 is not supported in the encoder
        assert_feed_err!(e, "", "\u{736c}", "\u{8c78}", []);
        assert_finish_ok!(e, []);
    }

    /// Well-formed input in every supported character set decodes as expected.
    #[test]
    fn test_decoder_valid() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [0x1b, 0x28, 0x4a, 0x44, 0x45, 0x46], [], "DEF");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0x5c], [], "\\");
        assert_feed_ok!(d, [0x7e], [], "~");
        assert_feed_ok!(d, [0x1b, 0x24, 0x42, 0x24, 0x4b,
                            0x1b, 0x24, 0x42, 0x24, 0x5b,
                            0x24, 0x73], [], "\u{306b}\u{307b}\u{3093}");
        assert_feed_ok!(d, [0x46, 0x7c, 0x4b, 0x5c], [], "\u{65e5}\u{672c}");
        assert_feed_ok!(d, [0x1b, 0x28, 0x49, 0x46, 0x4e, 0x5d],
                        [], "\u{ff86}\u{ff8e}\u{ff9d}");
        assert_feed_ok!(d, [0x1b, 0x24, 0x28, 0x44, 0x4b, 0x46,
                            0x1b, 0x24, 0x40, 0x6c, 0x38], [], "\u{736c}\u{8c78}");
        assert_feed_ok!(d, [0x1b, 0x28, 0x42, 0x58, 0x59, 0x5a], [], "XYZ");
        assert_finish_ok!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x42, 0x24, 0x4b, 0x24, 0x5b, 0x24, 0x73],
                        [], "\u{306b}\u{307b}\u{3093}");
        assert_finish_ok!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x28, 0x49, 0x46, 0x4e, 0x5d],
                        [], "\u{ff86}\u{ff8e}\u{ff9d}");
        assert_finish_ok!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x28, 0x44, 0x4b, 0x46], [], "\u{736c}");
        assert_finish_ok!(d, "");

        // one ASCII character and three similarly looking characters:
        // - A: U+0020 SPACE (requires ASCII state)
        // - B: U+30CD KATAKANA LETTER NE (requires JIS X 0208 Lead state)
        // - C: U+FF88 HALFWIDTH KATAKANA LETTER NE (requires Katakana state)
        // - D: U+793B CJK UNIFIED IDEOGRAPH-793B (requires JIS X 0212 Lead state)
        // a (4,2) De Bruijn sequence "AABBCCACBADDBDCDA" is used to test all possible cases.
        const AD: &'static str = "\x20";
        const BD: &'static str = "\u{30cd}";
        const CD: &'static str = "\u{ff88}";
        const DD: &'static str = "\u{793b}";
        const AE: &'static [u8] = &[0x1b, 0x28, 0x42, 0x20];
        const BE: &'static [u8] = &[0x1b, 0x24, 0x42, 0x25, 0x4d];
        const CE: &'static [u8] = &[0x1b, 0x28, 0x49, 0x48];
        const DE: &'static [u8] = &[0x1b, 0x24, 0x28, 0x44, 0x50, 0x4b];
        let mut d = ISO2022JPEncoding.raw_decoder();
        let dec: String = ["\x20", AD,BD,BD,CD,CD,AD,CD,BD,AD,DD,DD,BD,DD,CD,DD,AD].concat();
        let enc: Vec<_> = [&[0x20][..],AE,BE,BE,CE,CE,AE,CE,BE,AE,DE,DE,BE,DE,CE,DE,AE].concat();
        assert_feed_ok!(d, enc, [], dec);
        assert_finish_ok!(d, "");
    }

    /// Escape sequences and two-byte characters may be split across `raw_feed` calls.
    #[test]
    fn test_decoder_valid_partial() {
        let mut d = ISO2022JPEncoding.raw_decoder();

        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_ok!(d, [], [0x28], "");
        assert_feed_ok!(d, [0x4a, 0x41], [], "A");
        assert_feed_ok!(d, [], [0x1b, 0x28], "");
        assert_feed_ok!(d, [0x4a, 0x42], [0x1b], "B");
        assert_feed_ok!(d, [0x28, 0x4a, 0x43], [], "C");

        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_ok!(d, [], [0x24], "");
        assert_feed_ok!(d, [0x42], [0x24], "");
        assert_feed_ok!(d, [0x4b], [0x1b, 0x24], "\u{306b}");
        assert_feed_ok!(d, [0x42, 0x24, 0x5b], [], "\u{307b}");
        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_ok!(d, [0x24, 0x42, 0x24, 0x73], [], "\u{3093}");

        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_ok!(d, [], [0x28], "");
        assert_feed_ok!(d, [0x49, 0x46], [], "\u{ff86}");
        assert_feed_ok!(d, [], [0x1b, 0x28], "");
        assert_feed_ok!(d, [0x49, 0x4e], [0x1b], "\u{ff8e}");
        assert_feed_ok!(d, [0x28, 0x49, 0x5d], [], "\u{ff9d}");

        assert_feed_ok!(d, [], [0x1b, 0x24], "");
        assert_feed_ok!(d, [], [0x28], "");
        assert_feed_ok!(d, [0x44], [0x4b], "");
        assert_feed_ok!(d, [0x46], [0x1b, 0x24, 0x28], "\u{736c}");
        assert_feed_ok!(d, [0x44, 0x4b, 0x46], [], "\u{736c}");

        assert_finish_ok!(d, "");
    }

    /// Line-break handling per decoder state.
    ///
    /// The test keeps its historical name, but the byte actually exercised is
    /// 0x0a (line feed, decoded as "\n"), not 0x0d.
    #[test]
    fn test_decoder_carriage_return() {
        // a line feed (0x0a) in Lead state "resets to ASCII"
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x42, 0x25, 0x4d, 0x0a, 0x25, 0x4d],
                        [], "\u{30cd}\n\x25\x4d");
        assert_feed_ok!(d, [0x1b, 0x24, 0x28, 0x44, 0x50, 0x4b, 0x0a, 0x50, 0x4b],
                        [], "\u{793b}\n\x50\x4b");
        assert_finish_ok!(d, "");

        // other states don't allow a line feed
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_err!(d, [0x1b, 0x28, 0x49, 0x48], [0x0a], [], "\u{ff88}"); // Katakana
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x25, 0x0a], [], ""); // Trail
        assert_finish_ok!(d, "");
    }

    /// A dangling lead byte at the end of input is an error on finish.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x42, 0x24, 0x4b], [0x24], "\u{306b}");
        assert_finish_err!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x28, 0x44, 0x4b, 0x46], [0x50], "\u{736c}");
        assert_finish_err!(d, "");
    }

    /// An unterminated escape sequence at the end of input is an error on finish.
    #[test]
    fn test_decoder_invalid_partial_escape() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0x1b], "");
        assert_finish_err!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0x1b, 0x24], "");
        assert_finish_err!(d, ""); // no backup

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0x1b, 0x24, 0x28], "");
        assert_finish_err!(d, -1, ""); // backup of -1, not -2

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0x1b, 0x28], "");
        assert_finish_err!(d, ""); // no backup

        assert_eq!(ISO2022JPEncoding.decode(&[0x1b], DecoderTrap::Replace),
                   Ok("\u{fffd}".to_string()));
        assert_eq!(ISO2022JPEncoding.decode(&[0x1b, 0x24], DecoderTrap::Replace),
                   Ok("\u{fffd}".to_string()));
        assert_eq!(ISO2022JPEncoding.decode(&[0x1b, 0x24, 0x28], DecoderTrap::Replace),
                   Ok("\u{fffd}\x28".to_string()));
        assert_eq!(ISO2022JPEncoding.decode(&[0x1b, 0x28], DecoderTrap::Replace),
                   Ok("\u{fffd}".to_string()));
    }

    /// Escape sequences outside the supported set are rejected.
    #[test]
    fn test_decoder_invalid_escape() {
        // also tests allowed but never used escape codes in ISO 2022
        let mut d = ISO2022JPEncoding.raw_decoder();
        // feeds a known-good sequence before each case so that every case starts
        // from the same decoder state
        macro_rules! reset(() => (
            assert_feed_ok!(d, [0x41, 0x42, 0x43, 0x1b, 0x24, 0x42, 0x21, 0x21], [],
                            "ABC\u{3000}")
        ));

        reset!(); assert_feed_ok!(d, [], [0x1b], "");
                  assert_feed_err!(d, [], [], [0x00], "");
        reset!(); assert_feed_err!(d, [], [0x1b], [0x0a], "");
        reset!(); assert_feed_err!(d, [], [0x1b], [0x20], "");
        reset!(); assert_feed_err!(d, [], [0x1b], [0x21, 0x5a], ""); // ESC ! Z (CZD)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x22, 0x5a], ""); // ESC " Z (C1D)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x24, 0x5a], ""); // ESC $ Z (GZDM4)
        reset!(); assert_feed_ok!(d, [], [0x1b, 0x24], "");
                  assert_feed_err!(d, -1, [], [], [0x24, 0x5a], "");
        reset!(); assert_feed_err!(d, [], [0x1b], [0x24, 0x28, 0x5a], ""); // ESC $ ( Z (GZDM4)
        reset!(); assert_feed_ok!(d, [], [0x1b, 0x24, 0x28], "");
                  assert_feed_err!(d, -2, [], [], [0x24, 0x28, 0x5a], "");
        reset!(); assert_feed_err!(d, [], [0x1b], [0x24, 0x29, 0x5a], ""); // ESC $ ) Z (G1DM4)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x24, 0x2a, 0x5a], ""); // ESC $ * Z (G2DM4)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x24, 0x2b, 0x5a], ""); // ESC $ + Z (G3DM4)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x24, 0x2d, 0x5a], ""); // ESC $ - Z (G1DM6)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x24, 0x2e, 0x5a], ""); // ESC $ . Z (G2DM6)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x24, 0x2f, 0x5a], ""); // ESC $ / Z (G3DM6)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x25, 0x5a], ""); // ESC % Z (DOCS)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x25, 0x2f, 0x5a], ""); // ESC % / Z (DOCS)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x28, 0x5a], ""); // ESC ( Z (GZD4)
        reset!(); assert_feed_ok!(d, [], [0x1b, 0x28], "");
                  assert_feed_err!(d, -1, [], [], [0x28, 0x5a], "");
        reset!(); assert_feed_err!(d, [], [0x1b], [0x29, 0x5a], ""); // ESC ) Z (G1D4)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x2a, 0x5a], ""); // ESC * Z (G2D4)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x2b, 0x5a], ""); // ESC + Z (G3D4)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x2d, 0x5a], ""); // ESC - Z (G1D6)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x2e, 0x5a], ""); // ESC . Z (G2D6)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x2f, 0x5a], ""); // ESC / Z (G3D6)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x4e], ""); // ESC N (SS2)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x4f], ""); // ESC O (SS3)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x6e], ""); // ESC n (LS2)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x6f], ""); // ESC o (LS3)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x7c], ""); // ESC | (LS3R)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x7d], ""); // ESC } (LS2R)
        reset!(); assert_feed_err!(d, [], [0x1b], [0x7e], ""); // ESC ~ (LS1R)
        reset!(); assert_feed_err!(d, [], [0x1b], [0xff], "");
        reset!(); assert_finish_ok!(d, "");
    }

    /// Bytes outside the valid range of the current state are rejected.
    #[test]
    fn test_decoder_invalid_out_of_range() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_err!(d, [], [0x80], [], "");
        assert_feed_err!(d, [], [0xff], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x80, 0x21], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x21, 0x80], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x20, 0x21], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x21, 0x20], [], "");
        assert_feed_err!(d, [0x1b, 0x28, 0x49], [0x20], [], "");
        assert_feed_err!(d, [0x1b, 0x28, 0x49], [0x60], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x28, 0x44], [0x80, 0x21], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x28, 0x44], [0x21, 0x80], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x28, 0x44], [0x20, 0x21], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x28, 0x44], [0x21, 0x20], [], "");
        assert_finish_ok!(d, "");
    }

    /// A decoder that failed in `raw_finish` can be fed again from a clean state.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x24, 0x22,
                            0x1b, 0x24, 0x42, 0x24, 0x22], [0x24], "\x24\x22\u{3042}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x24, 0x22,
                            0x1b, 0x24, 0x42, 0x24, 0x22], [], "\x24\x22\u{3042}");
        assert_finish_ok!(d, "");
    }

    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::JAPANESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            ISO2022JPEncoding.encode(&s, EncoderTrap::Strict)
        }))
    }

    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = ISO2022JPEncoding.encode(testutils::JAPANESE_TEXT,
                                         EncoderTrap::Strict).ok().unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            ISO2022JPEncoding.decode(&s, DecoderTrap::Strict)
        }))
    }
}
encoding-0.2.33/src/codec/korean.rs01006440000765000002400000022734125331707360015314 0ustar0000000000000000// This is a part of rust-encoding.
// Copyright (c) 2013-2015, Kang Seonghoon.
// See README.md and LICENSE.txt for details.

//! Legacy Korean encodings based on KS X 1001.

use std::convert::Into;
use std::default::Default;
use util::StrCharIndex;
use index_korean as index;
use types::*;

/**
 * Windows code page 949.
 *
 * This is a Korean encoding derived from EUC-KR,
 * which is so widespread that most occurrences of EUC-KR actually mean this encoding.
 * Unlike KS X 1001 (and EUC-KR) which only contains a set of 2,350 common Hangul syllables,
 * it assigns the remaining 8,822 Hangul syllables to two-byte sequences
 * whose second byte has its MSB unset (i.e. `[81-C6] [41-5A 61-7A 81-FE]`).
* Its design strongly resembles that of Shift_JIS but less prone to errors * since the set of MSB-unset second bytes is much limited compared to Shift_JIS. */ #[derive(Clone, Copy)] pub struct Windows949Encoding; impl Encoding for Windows949Encoding { fn name(&self) -> &'static str { "windows-949" } fn whatwg_name(&self) -> Option<&'static str> { Some("euc-kr") } // WHATWG compatibility fn raw_encoder(&self) -> Box { Windows949Encoder::new() } fn raw_decoder(&self) -> Box { Windows949Decoder::new() } } /// An encoder for Windows code page 949. #[derive(Clone, Copy)] pub struct Windows949Encoder; impl Windows949Encoder { pub fn new() -> Box { Box::new(Windows949Encoder) } } impl RawEncoder for Windows949Encoder { fn from_self(&self) -> Box { Windows949Encoder::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option) { output.writer_hint(input.len()); for ((i,j), ch) in input.index_iter() { if ch <= '\u{7f}' { output.write_byte(ch as u8); } else { let ptr = index::euc_kr::backward(ch as u32); if ptr == 0xffff { return (i, Some(CodecError { upto: j as isize, cause: "unrepresentable character".into() })); } else { output.write_byte((ptr / 190 + 0x81) as u8); output.write_byte((ptr % 190 + 0x41) as u8); } } } (input.len(), None) } fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option { None } } /// A decoder for Windows code page 949. 
#[derive(Clone, Copy)]
struct Windows949Decoder {
    /// Current state of the `windows949` state machine, carried across `raw_feed` calls.
    st: windows949::State,
}

impl Windows949Decoder {
    /// Creates a boxed decoder in the state machine's default (initial) state.
    pub fn new() -> Box<RawDecoder> {
        Box::new(Windows949Decoder { st: Default::default() })
    }
}

impl RawDecoder for Windows949Decoder {
    fn from_self(&self) -> Box<RawDecoder> { Windows949Decoder::new() }
    fn is_ascii_compatible(&self) -> bool { true }

    /// Runs the state machine over `input`, storing the resulting state back into `self`.
    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
        let (st, processed, err) = windows949::raw_feed(self.st, input, output, &());
        self.st = st;
        (processed, err)
    }

    /// Finalizes the state machine, storing the resulting state back into `self`.
    fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
        let (st, err) = windows949::raw_finish(self.st, output, &());
        self.st = st;
        err
    }
}

stateful_decoder! {
    module windows949;

    // Maps a lead/trail pair to an index pointer (190 trail bytes per lead byte)
    // and looks it up; out-of-range pairs are looked up as pointer 0xffff.
    internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
        use index_korean as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0x81...0xfe, 0x41...0xfe) => (lead - 0x81) * 190 + (trail - 0x41),
            (_, _) => 0xffff,
        };
        index::euc_kr::forward(index)
    }

initial:
    // euc-kr lead = 0x00
    state S0(ctx: Context) {
        case b @ 0x00...0x7f => ctx.emit(b as u32);
        case b @ 0x81...0xfe => S1(ctx, b);
        case _ => ctx.err("invalid sequence");
    }

transient:
    // euc-kr lead != 0x00
    state S1(ctx: Context, lead: u8) {
        case b => match map_two_bytes(lead, b) {
            0xffff => {
                // an ASCII-range trail byte is backed up so that it is decoded again
                // on its own; a non-ASCII one is consumed as part of the error
                let backup = if b < 0x80 {1} else {0};
                ctx.backup_and_err(backup, "invalid sequence")
            },
            ch => ctx.emit(ch as u32)
        };
    }
}

#[cfg(test)]
mod windows949_tests {
    extern crate test;
    use super::Windows949Encoding;
    use testutils;
    use types::*;

    #[test]
    fn test_encoder_valid() {
        let mut e = Windows949Encoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(e, "\u{ac00}", "", [0xb0, 0xa1]);
        assert_feed_ok!(e, "\u{b098}\u{b2e4}", "", [0xb3, 0xaa, 0xb4, 0xd9]);
        assert_feed_ok!(e, "\u{bdc1}\u{314b}\u{d7a3}", "", [0x94, 0xee, 0xa4, 0xbb, 0xc6, 0x52]);
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_encoder_invalid() {
        let mut e = Windows949Encoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        assert_feed_err!(e, "?", "\u{fffd}", "!", [0x3f]); // for invalid table entries
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_decoder_valid() {
        let mut d = Windows949Encoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0xb0, 0xa1], [], "\u{ac00}");
        assert_feed_ok!(d, [0xb3, 0xaa, 0xb4, 0xd9], [], "\u{b098}\u{b2e4}");
        assert_feed_ok!(d, [0x94, 0xee, 0xa4, 0xbb, 0xc6, 0x52, 0xc1, 0x64], [],
                        "\u{bdc1}\u{314b}\u{d7a3}\u{d58f}");
        assert_finish_ok!(d, "");
    }

    /// Two-byte sequences may be split across `raw_feed` calls.
    #[test]
    fn test_decoder_valid_partial() {
        let mut d = Windows949Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xb0], "");
        assert_feed_ok!(d, [0xa1], [], "\u{ac00}");
        assert_feed_ok!(d, [0xb3, 0xaa], [0xb4], "\u{b098}");
        assert_feed_ok!(d, [0xd9], [0x94], "\u{b2e4}");
        assert_feed_ok!(d, [0xee, 0xa4, 0xbb], [0xc6], "\u{bdc1}\u{314b}");
        assert_feed_ok!(d, [0x52, 0xc1, 0x64], [], "\u{d7a3}\u{d58f}");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
        for i in 0x81..0xff {
            let mut d = Windows949Encoding.raw_decoder();
            assert_feed_ok!(d, [], [i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }

        // 80/FF: immediate failure
        let mut d = Windows949Encoding.raw_decoder();
        assert_feed_err!(d, [], [0x80], [], "");
        assert_feed_err!(d, [], [0xff], [], "");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_lone_lead_followed_by_space() {
        for i in 0x80..0x100 {
            let i = i as u8;
            let mut d = Windows949Encoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x20], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
        // should behave similarly to Big5.
        // https://www.w3.org/Bugs/Public/show_bug.cgi?id=16691
        for i in 0x81..0xff {
            let mut d = Windows949Encoding.raw_decoder();
            assert_feed_err!(d, [], [i, 0x80], [0x20], "");
            assert_feed_err!(d, [], [i, 0xff], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = Windows949Encoding.raw_decoder();
            assert_feed_ok!(d, [], [i], "");
            assert_feed_err!(d, [], [0x80], [0x20], "");
            assert_feed_ok!(d, [], [i], "");
            assert_feed_err!(d, [], [0xff], [0x20], "");
            assert_finish_ok!(d, "");
        }

        let mut d = Windows949Encoding.raw_decoder();
        assert_feed_err!(d, [], [0x80], [0x80], "");
        assert_feed_err!(d, [], [0x80], [0xff], "");
        assert_feed_err!(d, [], [0xff], [0x80], "");
        assert_feed_err!(d, [], [0xff], [0xff], "");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_boundary() {
        // U+D7A3 (C6 52) is the last Hangul syllable not in KS X 1001, C6 53 is invalid.
        // note that since the trail byte may coincide with ASCII, the trail byte 53 is
        // not considered to be part of the problem. this is compatible with the WHATWG
        // Encoding standard.
        let mut d = Windows949Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xc6], "");
        assert_feed_err!(d, [], [], [0x53], "");
        assert_finish_ok!(d, "");
    }

    /// A decoder that failed in `raw_finish` can be fed again from a clean state.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = Windows949Encoding.raw_decoder();
        assert_feed_ok!(d, [0xb0, 0xa1], [0xb0], "\u{ac00}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xb0, 0xa1], [], "\u{ac00}");
        assert_finish_ok!(d, "");
    }

    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::KOREAN_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            Windows949Encoding.encode(&s, EncoderTrap::Strict)
        }))
    }

    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = Windows949Encoding.encode(testutils::KOREAN_TEXT,
                                          EncoderTrap::Strict).ok().unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            Windows949Encoding.decode(&s, DecoderTrap::Strict)
        }))
    }
}
encoding-0.2.33/src/codec/simpchinese.rs01006440000765000002400000070465127606115130016343 0ustar0000000000000000// This is a part of rust-encoding.
// Copyright (c) 2013-2015, Kang Seonghoon.
// See README.md and LICENSE.txt for details.

//! Legacy simplified Chinese encodings based on GB 2312 and GB 18030.

use std::convert::Into;
use std::marker::PhantomData;
use std::default::Default;
use util::StrCharIndex;
use index_simpchinese as index;
use types::*;

/// An implementation type for GBK.
///
/// Can be used as a type parameter to `GBEncoding` and `GBEncoder`.
/// (GB18030Decoder is shared by both.)
#[derive(Clone, Copy)]
pub struct GBK;

/// An implementation type for GB18030.
///
/// Can be used as a type parameter to `GBEncoding` and `GBEncoder`.
/// (GB18030Decoder is shared by both.)
#[derive(Clone, Copy)]
pub struct GB18030;

/// An internal trait used to customize GBK and GB18030 implementations.
#[doc(hidden)] // XXX never intended to be used publicly, should be gone later pub trait GBType: Clone + 'static { fn name() -> &'static str; fn whatwg_name() -> Option<&'static str>; fn initial_gbk_flag() -> bool; } impl GBType for GBK { fn name() -> &'static str { "gbk" } fn whatwg_name() -> Option<&'static str> { Some("gbk") } fn initial_gbk_flag() -> bool { true } } impl GBType for GB18030 { fn name() -> &'static str { "gb18030" } fn whatwg_name() -> Option<&'static str> { Some("gb18030") } fn initial_gbk_flag() -> bool { false } } /** * GBK and GB 18030-2005. * * The original GBK 1.0 region spans `[81-FE] [40-7E 80-FE]`, and is derived from * several different revisions of a family of encodings named "GBK": * * - GBK as specified in the normative annex of GB 13000.1-93, * the domestic standard equivalent to Unicode 1.1, * consisted of characters included in Unicode 1.1 and not in GB 2312-80. * - Windows code page 936 is the widespread extension to GBK. * - Due to the popularity of Windows code page 936, * a formal encoding based on Windows code page 936 (while adding new characters) * was standardized into GBK 1.0. * - Finally, GB 18030 added four-byte sequences to GBK for becoming a pan-Unicode encoding, * while adding new characters to the (former) GBK region again. * * GB 18030-2005 is a simplified Chinese encoding which extends GBK 1.0 to a pan-Unicode encoding. * It assigns four-byte sequences to every Unicode codepoint missing from the GBK area, * lexicographically ordered with occasional "gaps" for codepoints in the GBK area. * Due to this compatibility decision, * there is no simple relationship between these four-byte sequences and Unicode codepoints, * though there *exists* a relatively simple mapping algorithm with a small lookup table. * * ## Specialization * * This type is specialized with GBType `T`, * which should be either `GBK` or `GB18030`. */ #[derive(Clone, Copy)] pub struct GBEncoding { _marker: PhantomData } /// A type for GBK. 
pub type GBKEncoding = GBEncoding; /// A type for GB18030. pub type GB18030Encoding = GBEncoding; /// An instance for GBK. pub const GBK_ENCODING: GBKEncoding = GBEncoding { _marker: PhantomData }; /// An instance for GB18030. pub const GB18030_ENCODING: GB18030Encoding = GBEncoding { _marker: PhantomData }; impl Encoding for GBEncoding { fn name(&self) -> &'static str { ::name() } fn whatwg_name(&self) -> Option<&'static str> { ::whatwg_name() } fn raw_encoder(&self) -> Box { GBEncoder::::new() } fn raw_decoder(&self) -> Box { GB18030Decoder::new() } } /** * An encoder for GBK and GB18030. * * ## Specialization * * This type is specialized with GBType `T`, * which should be either `GBK` or `GB18030`. */ #[derive(Clone, Copy)] pub struct GBEncoder { _marker: PhantomData } impl GBEncoder { pub fn new() -> Box { Box::new(GBEncoder:: { _marker: PhantomData }) } } impl RawEncoder for GBEncoder { fn from_self(&self) -> Box { GBEncoder::::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option) { output.writer_hint(input.len()); let gbk_flag = ::initial_gbk_flag(); for ((i, j), ch) in input.index_iter() { if ch < '\u{80}' { output.write_byte(ch as u8); } else if gbk_flag && ch == '\u{20AC}' { output.write_byte('\u{80}' as u8) } else { let ptr = index::gb18030::backward(ch as u32); if ptr == 0xffff { if gbk_flag { return (i, Some(CodecError { upto: j as isize, cause: "gbk doesn't support gb18030 extensions".into() })); } let ptr = index::gb18030_ranges::backward(ch as u32); assert!(ptr != 0xffffffff); let (ptr, byte4) = (ptr / 10, ptr % 10); let (ptr, byte3) = (ptr / 126, ptr % 126); let (byte1, byte2) = (ptr / 10, ptr % 10); output.write_byte((byte1 + 0x81) as u8); output.write_byte((byte2 + 0x30) as u8); output.write_byte((byte3 + 0x81) as u8); output.write_byte((byte4 + 0x30) as u8); } else { let lead = ptr / 190 + 0x81; let trail = ptr % 190; let trailoffset = if trail < 0x3f {0x40} else 
{0x41}; output.write_byte(lead as u8); output.write_byte((trail + trailoffset) as u8); } } } (input.len(), None) } fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option { None } } /// A decoder for GB 18030 (also used by GBK). #[derive(Clone, Copy)] struct GB18030Decoder { st: gb18030::State, } impl GB18030Decoder { pub fn new() -> Box { Box::new(GB18030Decoder { st: Default::default() }) } } impl RawDecoder for GB18030Decoder { fn from_self(&self) -> Box { GB18030Decoder::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option) { let (st, processed, err) = gb18030::raw_feed(self.st, input, output, &()); self.st = st; (processed, err) } fn raw_finish(&mut self, output: &mut StringWriter) -> Option { let (st, err) = gb18030::raw_finish(self.st, output, &()); self.st = st; err } } stateful_decoder! { module gb18030; internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 { use index_simpchinese as index; let lead = lead as u16; let trail = trail as u16; let index = match (lead, trail) { (0x81...0xfe, 0x40...0x7e) | (0x81...0xfe, 0x80...0xfe) => { let trailoffset = if trail < 0x7f {0x40} else {0x41}; (lead - 0x81) * 190 + trail - trailoffset } _ => 0xffff, }; index::gb18030::forward(index) } internal pub fn map_four_bytes(b1: u8, b2: u8, b3: u8, b4: u8) -> u32 { use index_simpchinese as index; // no range check here, caller should have done all checks let index = (b1 as u32 - 0x81) * 12600 + (b2 as u32 - 0x30) * 1260 + (b3 as u32 - 0x81) * 10 + (b4 as u32 - 0x30); index::gb18030_ranges::forward(index) } initial: // gb18030 first = 0x00, gb18030 second = 0x00, gb18030 third = 0x00 state S0(ctx: Context) { case b @ 0x00...0x7f => ctx.emit(b as u32); case 0x80 => ctx.emit(0x20ac); case b @ 0x81...0xfe => S1(ctx, b); case _ => ctx.err("invalid sequence"); } transient: // gb18030 first != 0x00, gb18030 second = 0x00, gb18030 third = 0x00 state S1(ctx: Context, first: u8) { case b @ 
0x30...0x39 => S2(ctx, first, b); case b => match map_two_bytes(first, b) { 0xffff => ctx.backup_and_err(1, "invalid sequence"), // unconditional ch => ctx.emit(ch) }; } // gb18030 first != 0x00, gb18030 second != 0x00, gb18030 third = 0x00 state S2(ctx: Context, first: u8, second: u8) { case b @ 0x81...0xfe => S3(ctx, first, second, b); case _ => ctx.backup_and_err(2, "invalid sequence"); } // gb18030 first != 0x00, gb18030 second != 0x00, gb18030 third != 0x00 state S3(ctx: Context, first: u8, second: u8, third: u8) { case b @ 0x30...0x39 => match map_four_bytes(first, second, third, b) { 0xffffffff => ctx.backup_and_err(3, "invalid sequence"), // unconditional ch => ctx.emit(ch) }; case _ => ctx.backup_and_err(3, "invalid sequence"); } } #[cfg(test)] mod gb18030_tests { extern crate test; use super::GB18030_ENCODING; use testutils; use types::*; #[test] fn test_encoder() { let mut e = GB18030_ENCODING.raw_encoder(); assert_feed_ok!(e, "A", "", [0x41]); assert_feed_ok!(e, "BC", "", [0x42, 0x43]); assert_feed_ok!(e, "", "", []); assert_feed_ok!(e, "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}", "", [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1, 0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa]); assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0xa2, 0xe3, 0x2f, 0x6d]); assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3]); assert_feed_ok!(e, "\u{80}", "", [0x81, 0x30, 0x81, 0x30]); assert_feed_ok!(e, "\u{81}", "", [0x81, 0x30, 0x81, 0x31]); assert_feed_ok!(e, "\u{a3}", "", [0x81, 0x30, 0x84, 0x35]); assert_feed_ok!(e, "\u{a4}", "", [0xa1, 0xe8]); assert_feed_ok!(e, "\u{a5}", "", [0x81, 0x30, 0x84, 0x36]); assert_feed_ok!(e, "\u{10ffff}", "", [0xe3, 0x32, 0x9a, 0x35]); assert_feed_ok!(e, "\u{2a6a5}\u{3007}", "", [0x98, 0x35, 0xee, 0x37, 0xa9, 0x96]); assert_finish_ok!(e, []); } #[test] fn test_decoder_valid() { let mut d = GB18030_ENCODING.raw_decoder(); assert_feed_ok!(d, [0x41], [], "A"); assert_feed_ok!(d, [0x42, 0x43], [], "BC"); 
assert_feed_ok!(d, [], [], ""); assert_feed_ok!(d, [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1, 0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa], [], "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}"); assert_feed_ok!(d, [0x31, 0x80, 0x2f, 0x6d], [], "1\u{20ac}/m"); assert_feed_ok!(d, [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3], [], "\u{ff21}\u{ff22}\u{ff23}"); assert_feed_ok!(d, [0x81, 0x30, 0x81, 0x30], [], "\u{80}"); assert_feed_ok!(d, [0x81, 0x30, 0x81, 0x31], [], "\u{81}"); assert_feed_ok!(d, [0x81, 0x30, 0x84, 0x35], [], "\u{a3}"); assert_feed_ok!(d, [0xa1, 0xe8], [], "\u{a4}" ); assert_feed_ok!(d, [0x81, 0x30, 0x84, 0x36], [], "\u{a5}"); assert_feed_ok!(d, [0xe3, 0x32, 0x9a, 0x35], [], "\u{10ffff}"); assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37, 0xa9, 0x96], [], "\u{2a6a5}\u{3007}"); assert_finish_ok!(d, ""); } #[test] fn test_decoder_valid_partial() { let mut d = GB18030_ENCODING.raw_decoder(); assert_feed_ok!(d, [], [0xa1], ""); assert_feed_ok!(d, [0xa1], [], "\u{3000}"); assert_feed_ok!(d, [], [0x81], ""); assert_feed_ok!(d, [], [0x30], ""); assert_feed_ok!(d, [], [0x81], ""); assert_feed_ok!(d, [0x30], [], "\u{80}"); assert_feed_ok!(d, [], [0x81], ""); assert_feed_ok!(d, [], [0x30], ""); assert_feed_ok!(d, [0x81, 0x31], [], "\u{81}"); assert_feed_ok!(d, [], [0x81], ""); assert_feed_ok!(d, [0x30, 0x81, 0x32], [], "\u{82}"); assert_feed_ok!(d, [], [0x81], ""); assert_feed_ok!(d, [], [0x30, 0x81], ""); assert_feed_ok!(d, [0x33], [], "\u{83}"); assert_feed_ok!(d, [], [0x81, 0x30], ""); assert_feed_ok!(d, [], [0x81], ""); assert_feed_ok!(d, [0x34], [], "\u{84}"); assert_feed_ok!(d, [], [0x81, 0x30], ""); assert_feed_ok!(d, [0x81, 0x35], [], "\u{85}"); assert_feed_ok!(d, [], [0x81, 0x30, 0x81], ""); assert_feed_ok!(d, [0x36], [], "\u{86}"); assert_finish_ok!(d, ""); } #[test] fn test_decoder_invalid_partial() { let mut d = GB18030_ENCODING.raw_decoder(); assert_feed_ok!(d, [], [0xa1], ""); assert_finish_err!(d, ""); let mut d = GB18030_ENCODING.raw_decoder(); 
assert_feed_ok!(d, [], [0x81], ""); assert_finish_err!(d, ""); let mut d = GB18030_ENCODING.raw_decoder(); assert_feed_ok!(d, [], [0x81, 0x30], ""); assert_finish_err!(d, ""); let mut d = GB18030_ENCODING.raw_decoder(); assert_feed_ok!(d, [], [0x81, 0x30, 0x81], ""); assert_finish_err!(d, ""); } #[test] fn test_decoder_invalid_out_of_range() { let mut d = GB18030_ENCODING.raw_decoder(); assert_feed_err!(d, [], [0xff], [], ""); assert_feed_err!(d, [], [0x81], [0x00], ""); assert_feed_err!(d, [], [0x81], [0x7f], ""); assert_feed_err!(d, [], [0x81], [0xff], ""); assert_feed_err!(d, [], [0x81], [0x31, 0x00], ""); assert_feed_err!(d, [], [0x81], [0x31, 0x80], ""); assert_feed_err!(d, [], [0x81], [0x31, 0xff], ""); assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x00], ""); assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x2f], ""); assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x3a], ""); assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0xff], ""); assert_finish_ok!(d, ""); } #[test] fn test_decoder_invalid_boundary() { // U+10FFFF (E3 32 9A 35) is the last Unicode codepoint, E3 32 9A 36 is invalid. // note that since the 2nd to 4th bytes may coincide with ASCII, bytes 32 9A 36 is // not considered to be in the problem. this is compatible to WHATWG Encoding standard. 
let mut d = GB18030_ENCODING.raw_decoder(); assert_feed_ok!(d, [], [0xe3], ""); assert_feed_err!(d, [], [], [0x32, 0x9a, 0x36], ""); assert_finish_ok!(d, ""); let mut d = GB18030_ENCODING.raw_decoder(); assert_feed_ok!(d, [], [0xe3], ""); assert_feed_ok!(d, [], [0x32, 0x9a], ""); assert_feed_err!(d, -2, [], [], [0x32, 0x9a, 0x36], ""); assert_finish_ok!(d, ""); } #[test] fn test_decoder_feed_after_finish() { let mut d = GB18030_ENCODING.raw_decoder(); assert_feed_ok!(d, [0xd2, 0xbb], [0xd2], "\u{4e00}"); assert_finish_err!(d, ""); assert_feed_ok!(d, [0xd2, 0xbb], [], "\u{4e00}"); assert_finish_ok!(d, ""); let mut d = GB18030_ENCODING.raw_decoder(); assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98, 0x35, 0xee], "\u{2a6a5}"); assert_finish_err!(d, ""); assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98, 0x35], "\u{2a6a5}"); assert_finish_err!(d, ""); assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98], "\u{2a6a5}"); assert_finish_err!(d, ""); assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [], "\u{2a6a5}"); assert_finish_ok!(d, ""); } #[bench] fn bench_encode_short_text(bencher: &mut test::Bencher) { let s = testutils::SIMPLIFIED_CHINESE_TEXT; bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ GB18030_ENCODING.encode(&s, EncoderTrap::Strict) })) } #[bench] fn bench_decode_short_text(bencher: &mut test::Bencher) { let s = GB18030_ENCODING.encode(testutils::SIMPLIFIED_CHINESE_TEXT, EncoderTrap::Strict).ok().unwrap(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ GB18030_ENCODING.decode(&s, DecoderTrap::Strict) })) } } #[cfg(test)] mod gbk_tests { extern crate test; use super::GBK_ENCODING; use testutils; use types::*; // GBK and GB 18030 share the same decoder logic. 
#[test] fn test_encoder() { let mut e = GBK_ENCODING.raw_encoder(); assert_feed_ok!(e, "A", "", [0x41]); assert_feed_ok!(e, "BC", "", [0x42, 0x43]); assert_feed_ok!(e, "", "", []); assert_feed_ok!(e, "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}", "", [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1, 0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa]); assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0x80, 0x2f, 0x6d]); assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3]); assert_feed_err!(e, "", "\u{80}", "", []); assert_feed_err!(e, "", "\u{81}", "", []); assert_feed_err!(e, "", "\u{a3}", "", []); assert_feed_ok!(e, "\u{a4}", "", [0xa1, 0xe8]); assert_feed_err!(e, "", "\u{a5}", "", []); assert_feed_err!(e, "", "\u{10ffff}", "", []); assert_feed_err!(e, "", "\u{2a6a5}", "\u{3007}", []); assert_feed_err!(e, "\u{3007}", "\u{2a6a5}", "", [0xa9, 0x96]); assert_finish_ok!(e, []); } #[bench] fn bench_encode_short_text(bencher: &mut test::Bencher) { let s = testutils::SIMPLIFIED_CHINESE_TEXT; bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ GBK_ENCODING.encode(&s, EncoderTrap::Strict) })) } } /** * HZ. (RFC 1843) * * This is a simplified Chinese encoding based on GB 2312. * It bears a resemblance to ISO 2022 encodings in such that the printable escape sequences `~{` * and `~}` are used to delimit a sequence of 7-bit-safe GB 2312 sequences. For the comparison, * they are equivalent to ISO-2022-CN escape sequences `ESC $ ) A` and `ESC ( B`. * Additional escape sequences `~~` (for a literal `~`) and `~\n` (ignored) are also supported. */ #[derive(Clone, Copy)] pub struct HZEncoding; impl Encoding for HZEncoding { fn name(&self) -> &'static str { "hz" } fn whatwg_name(&self) -> Option<&'static str> { None } fn raw_encoder(&self) -> Box { HZEncoder::new() } fn raw_decoder(&self) -> Box { HZDecoder::new() } } /// An encoder for HZ. 
#[derive(Clone, Copy)] pub struct HZEncoder { escaped: bool, } impl HZEncoder { pub fn new() -> Box { Box::new(HZEncoder { escaped: false }) } } impl RawEncoder for HZEncoder { fn from_self(&self) -> Box { HZEncoder::new() } fn is_ascii_compatible(&self) -> bool { false } fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option) { output.writer_hint(input.len()); let mut escaped = self.escaped; macro_rules! ensure_escaped( () => (if !escaped { output.write_bytes(b"~{"); escaped = true; }) ); macro_rules! ensure_unescaped( () => (if escaped { output.write_bytes(b"~}"); escaped = false; }) ); for ((i,j), ch) in input.index_iter() { if ch < '\u{80}' { ensure_unescaped!(); output.write_byte(ch as u8); if ch == '~' { output.write_byte('~' as u8); } } else { let ptr = index::gb18030::backward(ch as u32); if ptr == 0xffff { self.escaped = escaped; // do NOT reset the state! return (i, Some(CodecError { upto: j as isize, cause: "unrepresentable character".into() })); } else { let lead = ptr / 190; let trail = ptr % 190; if lead < 0x21 - 1 || trail < 0x21 + 0x3f { // GBK extension, ignored self.escaped = escaped; // do NOT reset the state! return (i, Some(CodecError { upto: j as isize, cause: "unrepresentable character".into() })); } else { ensure_escaped!(); output.write_byte((lead + 1) as u8); output.write_byte((trail - 0x3f) as u8); } } } } self.escaped = escaped; (input.len(), None) } fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option { None } } /// A decoder for HZ. 
#[derive(Clone, Copy)] struct HZDecoder { st: hz::State, } impl HZDecoder { pub fn new() -> Box { Box::new(HZDecoder { st: Default::default() }) } } impl RawDecoder for HZDecoder { fn from_self(&self) -> Box { HZDecoder::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option) { let (st, processed, err) = hz::raw_feed(self.st, input, output, &()); self.st = st; (processed, err) } fn raw_finish(&mut self, output: &mut StringWriter) -> Option { let (st, err) = hz::raw_finish(self.st, output, &()); self.st = st; err } } stateful_decoder! { module hz; internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 { use index_simpchinese as index; let lead = lead as u16; let trail = trail as u16; let index = match (lead, trail) { (0x20...0x7f, 0x21...0x7e) => (lead - 1) * 190 + (trail + 0x3f), _ => 0xffff, }; index::gb18030::forward(index) } initial: // hz-gb-2312 flag = unset, hz-gb-2312 lead = 0x00 state A0(ctx: Context) { case 0x7e => A1(ctx); case b @ 0x00...0x7f => ctx.emit(b as u32); case _ => ctx.err("invalid sequence"); final => ctx.reset(); } checkpoint: // hz-gb-2312 flag = set, hz-gb-2312 lead = 0x00 state B0(ctx: Context) { case 0x7e => B1(ctx); case b @ 0x20...0x7f => B2(ctx, b); case 0x0a => ctx.err("invalid sequence"); // error *and* reset case _ => ctx.err("invalid sequence"), B0(ctx); final => ctx.reset(); } transient: // hz-gb-2312 flag = unset, hz-gb-2312 lead = 0x7e state A1(ctx: Context) { case 0x7b => B0(ctx); case 0x7d => A0(ctx); case 0x7e => ctx.emit(0x7e), A0(ctx); case 0x0a => A0(ctx); case _ => ctx.backup_and_err(1, "invalid sequence"); final => ctx.err("incomplete sequence"); } // hz-gb-2312 flag = set, hz-gb-2312 lead = 0x7e state B1(ctx: Context) { case 0x7b => B0(ctx); case 0x7d => A0(ctx); case 0x7e => ctx.emit(0x7e), B0(ctx); case 0x0a => A0(ctx); case _ => ctx.backup_and_err(1, "invalid sequence"), B0(ctx); final => ctx.err("incomplete sequence"); } // 
hz-gb-2312 flag = set, hz-gb-2312 lead != 0 & != 0x7e state B2(ctx: Context, lead: u8) { case 0x0a => ctx.err("invalid sequence"); // should reset the state! case b => match map_two_bytes(lead, b) { 0xffff => ctx.err("invalid sequence"), ch => ctx.emit(ch) }, B0(ctx); final => ctx.err("incomplete sequence"); } } #[cfg(test)] mod hz_tests { extern crate test; use super::HZEncoding; use testutils; use types::*; #[test] fn test_encoder_valid() { let mut e = HZEncoding.raw_encoder(); assert_feed_ok!(e, "A", "", *b"A"); assert_feed_ok!(e, "BC", "", *b"BC"); assert_feed_ok!(e, "", "", *b""); assert_feed_ok!(e, "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}", "", *b"~{VP;*HKCq92:M9z"); assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", *b"#A#B#C"); assert_feed_ok!(e, "1\u{20ac}/m", "", *b"~}1~{\"c~}/m"); assert_feed_ok!(e, "~<\u{a4}~\u{0a4}>~", "", *b"~~<~{!h~}~~~{!h~}>~~"); assert_finish_ok!(e, []); } #[test] fn test_encoder_invalid() { let mut e = HZEncoding.raw_encoder(); assert_feed_err!(e, "", "\u{ffff}", "", []); assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]); // no support for GBK extension assert_feed_err!(e, "", "\u{3007}", "", []); assert_finish_ok!(e, []); } #[test] fn test_decoder_valid() { let mut d = HZEncoding.raw_decoder(); assert_feed_ok!(d, *b"A", *b"", "A"); assert_feed_ok!(d, *b"BC", *b"", "BC"); assert_feed_ok!(d, *b"D~~E", *b"~", "D~E"); assert_feed_ok!(d, *b"~F~\nG", *b"~", "~FG"); assert_feed_ok!(d, *b"", *b"", ""); assert_feed_ok!(d, *b"\nH", *b"~", "H"); assert_feed_ok!(d, *b"{VP~}~{;*~{HKCq92:M9z", *b"", "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}"); assert_feed_ok!(d, *b"", *b"#", ""); assert_feed_ok!(d, *b"A", *b"~", "\u{ff21}"); assert_feed_ok!(d, *b"~#B~~#C", *b"~", "~\u{ff22}~\u{ff23}"); assert_feed_ok!(d, *b"", *b"", ""); assert_feed_ok!(d, *b"\n#D~{#E~\n#F~{#G", *b"~", "#D\u{ff25}#F\u{ff27}"); assert_feed_ok!(d, *b"}X~}YZ", *b"", "XYZ"); assert_finish_ok!(d, ""); } #[test] fn test_decoder_invalid_out_or_range() 
{ let mut d = HZEncoding.raw_decoder(); assert_feed_ok!(d, *b"~{", *b"", ""); assert_feed_err!(d, *b"", *b"\x20\x20", *b"", ""); assert_feed_err!(d, *b"", *b"\x20\x7f", *b"", ""); // do not reset the state (except for CR) assert_feed_err!(d, *b"", *b"\x21\x7f", *b"", ""); assert_feed_err!(d, *b"", *b"\x7f\x20", *b"", ""); assert_feed_err!(d, *b"", *b"\x7f\x21", *b"", ""); assert_feed_err!(d, *b"", *b"\x7f\x7f", *b"", ""); assert_finish_ok!(d, ""); } #[test] fn test_decoder_invalid_carriage_return() { // CR in the multibyte mode is invalid but *also* resets the state let mut d = HZEncoding.raw_decoder(); assert_feed_ok!(d, *b"~{#A", *b"", "\u{ff21}"); assert_feed_err!(d, *b"", *b"\n", *b"", ""); assert_feed_ok!(d, *b"#B~{#C", *b"", "#B\u{ff23}"); assert_feed_err!(d, *b"", *b"#\n", *b"", ""); assert_feed_ok!(d, *b"#D", *b"", "#D"); assert_finish_ok!(d, ""); } #[test] fn test_decoder_invalid_partial() { let mut d = HZEncoding.raw_decoder(); assert_feed_ok!(d, *b"", *b"~", ""); assert_finish_err!(d, ""); let mut d = HZEncoding.raw_decoder(); assert_feed_ok!(d, *b"~{", *b"#", ""); assert_finish_err!(d, ""); let mut d = HZEncoding.raw_decoder(); assert_feed_ok!(d, *b"~{#A", *b"~", "\u{ff21}"); assert_finish_err!(d, ""); } #[test] fn test_decoder_invalid_escape() { let mut d = HZEncoding.raw_decoder(); assert_feed_ok!(d, *b"#A", *b"", "#A"); assert_feed_err!(d, *b"", *b"~", *b"xy", ""); assert_feed_ok!(d, *b"#B", *b"", "#B"); assert_feed_ok!(d, *b"", *b"~", ""); assert_feed_err!(d, *b"", *b"", *b"xy", ""); assert_feed_ok!(d, *b"#C~{#D", *b"", "#C\u{ff24}"); assert_feed_err!(d, *b"", *b"~", *b"xy", ""); assert_feed_ok!(d, *b"#E", *b"", "\u{ff25}"); // does not reset to ASCII assert_feed_ok!(d, *b"", *b"~", ""); assert_feed_err!(d, *b"", *b"", *b"xy", ""); assert_feed_ok!(d, *b"#F~}#G", *b"", "\u{ff26}#G"); assert_finish_ok!(d, ""); } #[test] fn test_decoder_feed_after_finish() { let mut d = HZEncoding.raw_decoder(); assert_feed_ok!(d, *b"R;~{R;", *b"R", "R;\u{4e00}"); 
assert_finish_err!(d, ""); assert_feed_ok!(d, *b"R;~{R;", *b"", "R;\u{4e00}"); assert_finish_ok!(d, ""); } #[bench] fn bench_encode_short_text(bencher: &mut test::Bencher) { let s = testutils::SIMPLIFIED_CHINESE_TEXT; bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ HZEncoding.encode(&s, EncoderTrap::Strict) })) } #[bench] fn bench_decode_short_text(bencher: &mut test::Bencher) { let s = HZEncoding.encode(testutils::SIMPLIFIED_CHINESE_TEXT, EncoderTrap::Strict).ok().unwrap(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ HZEncoding.decode(&s, DecoderTrap::Strict) })) } } encoding-0.2.33/src/codec/singlebyte.rs01006440000765000002400000010051125331707360016167 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. //! Common codec implementation for single-byte encodings. use std::convert::Into; use util::{as_char, StrCharIndex}; use types::*; /// A common framework for single-byte encodings based on ASCII. #[derive(Copy, Clone)] pub struct SingleByteEncoding { pub name: &'static str, pub whatwg_name: Option<&'static str>, pub index_forward: extern "Rust" fn(u8) -> u16, pub index_backward: extern "Rust" fn(u32) -> u8, } impl Encoding for SingleByteEncoding { fn name(&self) -> &'static str { self.name } fn whatwg_name(&self) -> Option<&'static str> { self.whatwg_name } fn raw_encoder(&self) -> Box { SingleByteEncoder::new(self.index_backward) } fn raw_decoder(&self) -> Box { SingleByteDecoder::new(self.index_forward) } } /// An encoder for single-byte encodings based on ASCII. 
#[derive(Clone, Copy)]
pub struct SingleByteEncoder {
    /// Maps a code point to its byte; a return value of 0 is treated as "no mapping".
    index_backward: extern "Rust" fn(u32) -> u8,
}

impl SingleByteEncoder {
    /// Creates a boxed encoder that uses `index_backward` for code points above U+007F.
    pub fn new(index_backward: extern "Rust" fn(u32) -> u8) -> Box<RawEncoder> {
        Box::new(SingleByteEncoder { index_backward: index_backward })
    }
}

impl RawEncoder for SingleByteEncoder {
    /// Returns a new encoder sharing the same index function.
    fn from_self(&self) -> Box<RawEncoder> { SingleByteEncoder::new(self.index_backward) }

    /// Reports `true`: code points up to U+007F are written unchanged by `raw_feed`.
    fn is_ascii_compatible(&self) -> bool { true }

    /// Encodes `input` into `output`, one byte per character.
    ///
    /// Returns `(input.len(), None)` on success. On the first character for which
    /// the index function returns 0, returns the first offset of the pair yielded by
    /// `index_iter` for that character together with a `CodecError` whose `upto` is
    /// the second offset.
    fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
        output.writer_hint(input.len());

        for ((i,j), ch) in input.index_iter() {
            if ch <= '\u{7f}' {
                // ASCII bypasses the index entirely.
                output.write_byte(ch as u8);
            } else {
                let index = (self.index_backward)(ch as u32);
                if index == 0 {
                    return (i, Some(CodecError {
                        upto: j as isize, cause: "unrepresentable character".into()
                    }));
                }
                output.write_byte(index);
            }
        }
        (input.len(), None)
    }

    /// Finishes the stream; this encoder keeps no state, so there is nothing to flush.
    fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
        None
    }
}

/// A decoder for single-byte encodings based on ASCII.
#[derive(Clone, Copy)]
pub struct SingleByteDecoder {
    /// Maps a byte to its code point; a return value of 0xffff is treated as "no mapping".
    index_forward: extern "Rust" fn(u8) -> u16,
}

impl SingleByteDecoder {
    /// Creates a boxed decoder that uses `index_forward` for bytes above 0x7f.
    pub fn new(index_forward: extern "Rust" fn(u8) -> u16) -> Box<RawDecoder> {
        Box::new(SingleByteDecoder { index_forward: index_forward })
    }
}

impl RawDecoder for SingleByteDecoder {
    /// Returns a new decoder sharing the same index function.
    fn from_self(&self) -> Box<RawDecoder> { SingleByteDecoder::new(self.index_forward) }

    /// Reports `true`: bytes up to 0x7f are emitted unchanged by `raw_feed`.
    fn is_ascii_compatible(&self) -> bool { true }

    /// Decodes `input` into `output`, one character per byte.
    ///
    /// Returns `(input.len(), None)` on success. On the first byte for which the
    /// index function returns 0xffff, returns that byte's offset together with
    /// a `CodecError` whose `upto` points just past the offending byte.
    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
        output.writer_hint(input.len());

        for (i, &byte) in input.iter().enumerate() {
            if byte <= 0x7f {
                // ASCII bypasses the index entirely.
                output.write_char(byte as char);
            } else {
                let ch = (self.index_forward)(byte);
                if ch == 0xffff {
                    return (i, Some(CodecError {
                        upto: i as isize + 1, cause: "invalid sequence".into()
                    }));
                }
                output.write_char(as_char(ch as u32));
            }
        }
        (input.len(), None)
    }

    /// Finishes the stream; this decoder keeps no state, so there is nothing pending.
    fn raw_finish(&mut self, _output: &mut StringWriter) -> Option<CodecError> {
        None
    }
}

/// Algorithmic mapping for ISO 8859-1.
pub mod iso_8859_1 { #[inline] pub fn forward(code: u8) -> u16 { code as u16 } #[inline] pub fn backward(code: u32) -> u8 { if (code & !0x7f) == 0x80 {code as u8} else {0} } } #[cfg(test)] mod tests { use all::ISO_8859_2; use types::*; #[test] fn test_encoder_non_bmp() { let mut e = ISO_8859_2.raw_encoder(); assert_feed_err!(e, "A", "\u{FFFF}", "B", [0x41]); assert_feed_err!(e, "A", "\u{10000}", "B", [0x41]); } } encoding-0.2.33/src/codec/tradchinese.rs01006440000765000002400000023771125331707360016330 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. //! Legacy traditional Chinese encodings. use std::convert::Into; use std::default::Default; use util::StrCharIndex; use index_tradchinese as index; use types::*; /** * Big5-2003 with common extensions. (XXX with asymmetric HKSCS-2008 support) * * This is a traditional Chinese encoding spanning the region `[81-FE] [40-7E A1-FE]`. * Originally a proprietary encoding by the consortium of five companies (hence the name), * the Republic of China government standardized Big5-2003 in an appendix of CNS 11643 * so that CNS 11643 plane 1 and plane 2 have * an almost identical set of characters as Big5 (but with a different mapping). * The Hong Kong government has an official extension to Big5 * named Hong Kong Supplementary Character Set (HKSCS). * * This particular implementation of Big5 includes the widespread ETEN and HKSCS extensions, * but excludes less common extensions such as Big5+, Big-5E and Unicode-at-on. */ #[derive(Clone, Copy)] pub struct BigFive2003Encoding; impl Encoding for BigFive2003Encoding { fn name(&self) -> &'static str { "big5-2003" } fn whatwg_name(&self) -> Option<&'static str> { Some("big5") } // WHATWG compatibility fn raw_encoder(&self) -> Box { BigFive2003Encoder::new() } fn raw_decoder(&self) -> Box { BigFive2003HKSCS2008Decoder::new() } } /// An encoder for Big5-2003. 
#[derive(Clone, Copy)] pub struct BigFive2003Encoder; impl BigFive2003Encoder { pub fn new() -> Box { Box::new(BigFive2003Encoder) } } impl RawEncoder for BigFive2003Encoder { fn from_self(&self) -> Box { BigFive2003Encoder::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option) { output.writer_hint(input.len()); for ((i,j), ch) in input.index_iter() { if ch < '\u{80}' { output.write_byte(ch as u8); } else { let ptr = index::big5::backward(ch as u32); if ptr == 0xffff || ptr < (0xa1 - 0x81) * 157 { // no HKSCS extension (XXX doesn't HKSCS include 0xFA40..0xFEFE?) return (i, Some(CodecError { upto: j as isize, cause: "unrepresentable character".into() })); } let lead = ptr / 157 + 0x81; let trail = ptr % 157; let trailoffset = if trail < 0x3f {0x40} else {0x62}; output.write_byte(lead as u8); output.write_byte((trail + trailoffset) as u8); } } (input.len(), None) } fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option { None } } /// A decoder for Big5-2003 with HKSCS-2008 extension. #[derive(Clone, Copy)] struct BigFive2003HKSCS2008Decoder { st: bigfive2003::State, } impl BigFive2003HKSCS2008Decoder { pub fn new() -> Box { Box::new(BigFive2003HKSCS2008Decoder { st: Default::default() }) } } impl RawDecoder for BigFive2003HKSCS2008Decoder { fn from_self(&self) -> Box { BigFive2003HKSCS2008Decoder::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option) { let (st, processed, err) = bigfive2003::raw_feed(self.st, input, output, &()); self.st = st; (processed, err) } fn raw_finish(&mut self, output: &mut StringWriter) -> Option { let (st, err) = bigfive2003::raw_finish(self.st, output, &()); self.st = st; err } } stateful_decoder! 
{ module bigfive2003; internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 { use index_tradchinese as index; let lead = lead as u16; let trail = trail as u16; let index = match (lead, trail) { (0x81...0xfe, 0x40...0x7e) | (0x81...0xfe, 0xa1...0xfe) => { let trailoffset = if trail < 0x7f {0x40} else {0x62}; (lead - 0x81) * 157 + trail - trailoffset } _ => 0xffff, }; index::big5::forward(index) // may return two-letter replacements 0..3 } initial: // big5 lead = 0x00 state S0(ctx: Context) { case b @ 0x00...0x7f => ctx.emit(b as u32); case b @ 0x81...0xfe => S1(ctx, b); case _ => ctx.err("invalid sequence"); } transient: // big5 lead != 0x00 state S1(ctx: Context, lead: u8) { case b => match map_two_bytes(lead, b) { 0xffff => { let backup = if b < 0x80 {1} else {0}; ctx.backup_and_err(backup, "invalid sequence") }, 0 /*index=1133*/ => ctx.emit_str("\u{ca}\u{304}"), 1 /*index=1135*/ => ctx.emit_str("\u{ca}\u{30c}"), 2 /*index=1164*/ => ctx.emit_str("\u{ea}\u{304}"), 3 /*index=1166*/ => ctx.emit_str("\u{ea}\u{30c}"), ch => ctx.emit(ch), }; } } #[cfg(test)] mod bigfive2003_tests { extern crate test; use super::BigFive2003Encoding; use testutils; use types::*; #[test] fn test_encoder_valid() { let mut e = BigFive2003Encoding.raw_encoder(); assert_feed_ok!(e, "A", "", [0x41]); assert_feed_ok!(e, "BC", "", [0x42, 0x43]); assert_feed_ok!(e, "", "", []); assert_feed_ok!(e, "\u{4e2d}\u{83ef}\u{6c11}\u{570b}", "", [0xa4, 0xa4, 0xb5, 0xd8, 0xa5, 0xc1, 0xb0, 0xea]); assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0xa3, 0xe1, 0x2f, 0x6d]); assert_feed_ok!(e, "\u{ffed}", "", [0xf9, 0xfe]); assert_finish_ok!(e, []); } #[test] fn test_encoder_invalid() { let mut e = BigFive2003Encoding.raw_encoder(); assert_feed_err!(e, "", "\u{ffff}", "", []); assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]); assert_feed_err!(e, "", "\u{3eec}", "\u{4e00}", []); // HKSCS-2008 addition assert_finish_ok!(e, []); } #[test] fn test_decoder_valid() { let mut d = BigFive2003Encoding.raw_decoder(); 
assert_feed_ok!(d, [0x41], [], "A"); assert_feed_ok!(d, [0x42, 0x43], [], "BC"); assert_feed_ok!(d, [], [], ""); assert_feed_ok!(d, [0xa4, 0xa4, 0xb5, 0xd8, 0xa5, 0xc1, 0xb0, 0xea], [], "\u{4e2d}\u{83ef}\u{6c11}\u{570b}"); assert_feed_ok!(d, [], [0xa4], ""); assert_feed_ok!(d, [0xa4, 0xb5, 0xd8], [0xa5], "\u{4e2d}\u{83ef}"); assert_feed_ok!(d, [0xc1, 0xb0, 0xea], [], "\u{6c11}\u{570b}"); assert_feed_ok!(d, [0x31, 0xa3, 0xe1, 0x2f, 0x6d], [], "1\u{20ac}/m"); assert_feed_ok!(d, [0xf9, 0xfe], [], "\u{ffed}"); assert_feed_ok!(d, [0x87, 0x7e], [], "\u{3eec}"); // HKSCS-2008 addition assert_feed_ok!(d, [0x88, 0x62, 0x88, 0x64, 0x88, 0xa3, 0x88, 0xa5], [], "\u{ca}\u{304}\u{00ca}\u{30c}\u{ea}\u{304}\u{ea}\u{30c}"); // 2-byte output assert_finish_ok!(d, ""); } #[test] fn test_decoder_invalid_lone_lead_immediate_test_finish() { for i in 0x81..0xff { let mut d = BigFive2003Encoding.raw_decoder(); assert_feed_ok!(d, [], [i], ""); // wait for a trail assert_finish_err!(d, ""); } // 80/FF: immediate failure let mut d = BigFive2003Encoding.raw_decoder(); assert_feed_err!(d, [], [0x80], [], ""); assert_feed_err!(d, [], [0xff], [], ""); assert_finish_ok!(d, ""); } #[test] fn test_decoder_invalid_lone_lead_followed_by_space() { for i in 0x80..0x100 { let i = i as u8; let mut d = BigFive2003Encoding.raw_decoder(); assert_feed_err!(d, [], [i], [0x20], ""); assert_finish_ok!(d, ""); } } #[test] fn test_decoder_invalid_lead_followed_by_invalid_trail() { // unlike most other cases, valid lead + invalid MSB-set trail are entirely consumed. 
// https://www.w3.org/Bugs/Public/show_bug.cgi?id=16771 for i in 0x81..0xff { let mut d = BigFive2003Encoding.raw_decoder(); assert_feed_err!(d, [], [i, 0x80], [0x20], ""); assert_feed_err!(d, [], [i, 0xff], [0x20], ""); assert_finish_ok!(d, ""); let mut d = BigFive2003Encoding.raw_decoder(); assert_feed_ok!(d, [], [i], ""); assert_feed_err!(d, [], [0x80], [0x20], ""); assert_feed_ok!(d, [], [i], ""); assert_feed_err!(d, [], [0xff], [0x20], ""); assert_finish_ok!(d, ""); } // 80/FF is not a valid lead and the trail is not consumed let mut d = BigFive2003Encoding.raw_decoder(); assert_feed_err!(d, [], [0x80], [0x80], ""); assert_feed_err!(d, [], [0x80], [0xff], ""); assert_feed_err!(d, [], [0xff], [0x80], ""); assert_feed_err!(d, [], [0xff], [0xff], ""); assert_finish_ok!(d, ""); } #[test] fn test_decoder_feed_after_finish() { let mut d = BigFive2003Encoding.raw_decoder(); assert_feed_ok!(d, [0xa4, 0x40], [0xa4], "\u{4e00}"); assert_finish_err!(d, ""); assert_feed_ok!(d, [0xa4, 0x40], [], "\u{4e00}"); assert_finish_ok!(d, ""); } #[bench] fn bench_encode_short_text(bencher: &mut test::Bencher) { let s = testutils::TRADITIONAL_CHINESE_TEXT; bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ BigFive2003Encoding.encode(&s, EncoderTrap::Strict) })) } #[bench] fn bench_decode_short_text(bencher: &mut test::Bencher) { let s = BigFive2003Encoding.encode(testutils::TRADITIONAL_CHINESE_TEXT, EncoderTrap::Strict).ok().unwrap(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ BigFive2003Encoding.decode(&s, DecoderTrap::Strict) })) } } encoding-0.2.33/src/codec/utf_16.rs01006440000765000002400000054272127606115240015140 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. //! UTF-16. use std::convert::Into; use std::marker::PhantomData; use util::as_char; use types::*; /// An implementation type for little endian. 
/// /// Can be used as a type parameter to `UTF16Encoding`, `UTF16Encoder` and `UTF16Decoder`. #[derive(Clone, Copy)] pub struct Little; /// An implementation type for big endian. /// /// Can be used as a type parameter to `UTF16Encoding`, `UTF16Encoder` and `UTF16Decoder`. #[derive(Clone, Copy)] pub struct Big; /// An internal trait used to customize UTF-16 implementations. #[doc(hidden)] // XXX never intended to be used publicly, should be gone later pub trait Endian: Clone + 'static { fn name() -> &'static str; fn whatwg_name() -> Option<&'static str>; fn write_two_bytes(output: &mut ByteWriter, msb: u8, lsb: u8); fn concat_two_bytes(lead: u16, trail: u8) -> u16; } impl Endian for Little { fn name() -> &'static str { "utf-16le" } fn whatwg_name() -> Option<&'static str> { Some("utf-16le") } fn write_two_bytes(output: &mut ByteWriter, msb: u8, lsb: u8) { output.write_byte(lsb); output.write_byte(msb); } fn concat_two_bytes(lead: u16, trail: u8) -> u16 { lead | ((trail as u16) << 8) } } impl Endian for Big { fn name() -> &'static str { "utf-16be" } fn whatwg_name() -> Option<&'static str> { Some("utf-16be") } fn write_two_bytes(output: &mut ByteWriter, msb: u8, lsb: u8) { output.write_byte(msb); output.write_byte(lsb); } fn concat_two_bytes(lead: u16, trail: u8) -> u16 { (lead << 8) | trail as u16 } } /** * UTF-16 (UCS Transformation Format, 16-bit). * * This is a Unicode encoding where one codepoint may use * 2 (up to U+FFFF) or 4 bytes (up to U+10FFFF) depending on its value. * It uses a "surrogate" mechanism to encode non-BMP codepoints, * which are represented as a pair of lower surrogate and upper surrogate characters. * In this effect, surrogate characters (U+D800..DFFF) cannot appear alone * and cannot be included in a valid Unicode string. * * ## Specialization * * This type is specialized with endianness type `E`, * which should be either `Little` (little endian) or `Big` (big endian). 
*/ #[derive(Clone, Copy)] pub struct UTF16Encoding { _marker: PhantomData } /// A type for UTF-16 in little endian. pub type UTF16LEEncoding = UTF16Encoding; /// A type for UTF-16 in big endian. pub type UTF16BEEncoding = UTF16Encoding; /// An instance for UTF-16 in little endian. pub const UTF_16LE_ENCODING: UTF16LEEncoding = UTF16Encoding { _marker: PhantomData }; /// An instance for UTF-16 in big endian. pub const UTF_16BE_ENCODING: UTF16BEEncoding = UTF16Encoding { _marker: PhantomData }; impl Encoding for UTF16Encoding { fn name(&self) -> &'static str { ::name() } fn whatwg_name(&self) -> Option<&'static str> { ::whatwg_name() } fn raw_encoder(&self) -> Box { UTF16Encoder::::new() } fn raw_decoder(&self) -> Box { UTF16Decoder::::new() } } /** * An encoder for UTF-16. * * ## Specialization * * This type is specialized with endianness type `E`, * which should be either `Little` (little endian) or `Big` (big endian). */ #[derive(Clone, Copy)] pub struct UTF16Encoder { _marker: PhantomData } impl UTF16Encoder { fn new() -> Box { Box::new(UTF16Encoder:: { _marker: PhantomData }) } } impl RawEncoder for UTF16Encoder { fn from_self(&self) -> Box { UTF16Encoder::::new() } fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option) { output.writer_hint(input.len() * 2); let write_two_bytes = |output: &mut ByteWriter, msb: u8, lsb: u8| ::write_two_bytes(output, msb, lsb); for ch in input.chars() { match ch { '\u{0}'...'\u{d7ff}' | '\u{e000}'...'\u{ffff}' => { let ch = ch as u32; write_two_bytes(output, (ch >> 8) as u8, (ch & 0xff) as u8); } '\u{10000}'...'\u{10ffff}' => { let ch = ch as u32 - 0x10000; write_two_bytes(output, (0xd8 | (ch >> 18)) as u8, ((ch >> 10) & 0xff) as u8); write_two_bytes(output, (0xdc | ((ch >> 8) & 0x3)) as u8, (ch & 0xff) as u8); } _ => unreachable!() // XXX Rust issue #12483, this is redundant } } (input.len(), None) } fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option { None } } /** * A decoder for UTF-16. 
* * ## Specialization * * This type is specialized with endianness type `E`, * which should be either `Little` (little endian) or `Big` (big endian). */ pub struct UTF16Decoder { leadbyte: u16, leadsurrogate: u16, _marker: PhantomData } impl UTF16Decoder { pub fn new() -> Box { Box::new(UTF16Decoder:: { leadbyte: 0xffff, leadsurrogate: 0xffff, _marker: PhantomData }) } } impl RawDecoder for UTF16Decoder { fn from_self(&self) -> Box { UTF16Decoder::::new() } fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option) { output.writer_hint(input.len() / 2); // when every codepoint is U+0000..007F let concat_two_bytes = |lead: u16, trail: u8| ::concat_two_bytes(lead, trail); let mut i = 0; let mut processed = 0; let len = input.len(); if i >= len { return (processed, None); } if self.leadbyte != 0xffff { let ch = concat_two_bytes(self.leadbyte, input[i]); i += 1; self.leadbyte = 0xffff; if self.leadsurrogate != 0xffff { // `ch` is lower surrogate let upper = self.leadsurrogate; self.leadsurrogate = 0xffff; match ch { 0xdc00...0xdfff => { let ch = ((upper as u32 - 0xd800) << 10) + (ch as u32 - 0xdc00); output.write_char(as_char(ch + 0x10000)); processed = i; } _ => { return (processed, Some(CodecError { upto: i as isize - 2, cause: "invalid sequence".into() })); } } } else { match ch { 0xd800...0xdbff => { self.leadsurrogate = ch; // pass through } 0xdc00...0xdfff => { return (processed, Some(CodecError { upto: i as isize, cause: "invalid sequence".into() })); } _ => { output.write_char(as_char(ch as u32)); processed = i; } } } if i >= len { return (processed, None); } } if self.leadsurrogate != 0xffff { i += 1; if i >= len { self.leadbyte = input[i-1] as u16; return (processed, None); } let upper = self.leadsurrogate; let ch = concat_two_bytes(input[i-1] as u16, input[i]); i += 1; match ch { 0xdc00...0xdfff => { let ch = ((upper as u32 - 0xd800) << 10) + (ch as u32 - 0xdc00); output.write_char(as_char(ch + 0x10000)); } _ => { self.leadbyte = 
0xffff; self.leadsurrogate = 0xffff; return (processed, Some(CodecError { upto: i as isize - 2, cause: "invalid sequence".into() })); } } } self.leadbyte = 0xffff; self.leadsurrogate = 0xffff; processed = i; while i < len { i += 1; if i >= len { self.leadbyte = input[i-1] as u16; break; } let ch = concat_two_bytes(input[i-1] as u16, input[i]); match ch { 0xd800...0xdbff => { i += 2; if i >= len { self.leadsurrogate = ch; if i-1 < len { self.leadbyte = input[i-1] as u16; } break; } let ch2 = concat_two_bytes(input[i-1] as u16, input[i]); match ch2 { 0xdc00...0xdfff => { let ch = ((ch as u32 - 0xd800) << 10) + (ch2 as u32 - 0xdc00); output.write_char(as_char(ch + 0x10000)); } _ => { return (processed, Some(CodecError { upto: i as isize - 1, cause: "invalid sequence".into() })); } } } 0xdc00...0xdfff => { return (processed, Some(CodecError { upto: i as isize + 1, cause: "invalid sequence".into() })); } _ => { output.write_char(as_char(ch as u32)); } } i += 1; processed = i; } (processed, None) } fn raw_finish(&mut self, _output: &mut StringWriter) -> Option { let leadbyte = self.leadbyte; let leadsurrogate = self.leadsurrogate; self.leadbyte = 0xffff; self.leadsurrogate = 0xffff; if leadbyte != 0xffff || leadsurrogate != 0xffff { Some(CodecError { upto: 0, cause: "incomplete sequence".into() }) } else { None } } } #[cfg(test)] mod tests { // little endian and big endian is symmetric to each other, there's no need to test both. // since big endian is easier to inspect we test UTF_16BE only. 
use super::UTF_16BE_ENCODING as UTF_16BE;
    use types::*;

    // NOTE(review): the `assert_feed_ok!`, `assert_feed_err!`, `assert_finish_ok!` and
    // `assert_finish_err!` macros are defined outside this view. From their use here,
    // `assert_feed_ok!(x, processed, unprocessed, output)` presumably feeds
    // `processed ++ unprocessed` and checks how much was consumed and what was written;
    // `assert_feed_err!` presumably takes an extra middle slice holding the offending
    // bytes and an optional leading backup offset -- confirm against the macro definitions.

    // Encodes BMP characters and then supplementary characters (surrogate pairs).
    #[test]
    fn test_encoder_valid() {
        let mut e = UTF_16BE.raw_encoder();
        assert_feed_ok!(e, "\u{0}\
                            \u{1}\u{02}\u{004}\u{0008}\
                            \u{10}\u{020}\u{0040}\u{80}\
                            \u{100}\u{0200}\u{400}\u{800}\
                            \u{1000}\u{2000}\u{4000}\u{8000}\
                            \u{ffff}", "",
                        [0x00, 0x00,
                         0x00, 0x01, 0x00, 0x02, 0x00, 0x04, 0x00, 0x08,
                         0x00, 0x10, 0x00, 0x20, 0x00, 0x40, 0x00, 0x80,
                         0x01, 0x00, 0x02, 0x00, 0x04, 0x00, 0x08, 0x00,
                         0x10, 0x00, 0x20, 0x00, 0x40, 0x00, 0x80, 0x00,
                         0xff, 0xff]);
        assert_feed_ok!(e, "\u{10000}\
                            \u{10001}\u{010002}\
                            \u{10004}\u{010008}\
                            \u{10010}\u{010020}\
                            \u{10040}\u{010080}\
                            \u{10100}\u{010200}\
                            \u{10400}\u{010800}\
                            \u{11000}\u{012000}\
                            \u{14000}\u{018000}\
                            \u{20000}\u{030000}\
                            \u{50000}\u{090000}\
                            \u{10FFFF}", "",
                        [0xd8, 0x00, 0xdc, 0x00,
                         0xd8, 0x00, 0xdc, 0x01, 0xd8, 0x00, 0xdc, 0x02,
                         0xd8, 0x00, 0xdc, 0x04, 0xd8, 0x00, 0xdc, 0x08,
                         0xd8, 0x00, 0xdc, 0x10, 0xd8, 0x00, 0xdc, 0x20,
                         0xd8, 0x00, 0xdc, 0x40, 0xd8, 0x00, 0xdc, 0x80,
                         0xd8, 0x00, 0xdd, 0x00, 0xd8, 0x00, 0xde, 0x00,
                         0xd8, 0x01, 0xdc, 0x00, 0xd8, 0x02, 0xdc, 0x00,
                         0xd8, 0x04, 0xdc, 0x00, 0xd8, 0x08, 0xdc, 0x00,
                         0xd8, 0x10, 0xdc, 0x00, 0xd8, 0x20, 0xdc, 0x00,
                         0xd8, 0x40, 0xdc, 0x00, 0xd8, 0x80, 0xdc, 0x00,
                         0xd9, 0x00, 0xdc, 0x00, 0xda, 0x00, 0xdc, 0x00,
                         0xdb, 0xff, 0xdf, 0xff]);
        assert_finish_ok!(e, []);
    }

    // Decodes the same byte sequences that `test_encoder_valid` produces.
    #[test]
    fn test_decoder_valid() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0x00, 0x00,
                            0x00, 0x01, 0x00, 0x02, 0x00, 0x04, 0x00, 0x08,
                            0x00, 0x10, 0x00, 0x20, 0x00, 0x40, 0x00, 0x80,
                            0x01, 0x00, 0x02, 0x00, 0x04, 0x00, 0x08, 0x00,
                            0x10, 0x00, 0x20, 0x00, 0x40, 0x00, 0x80, 0x00,
                            0xff, 0xff], [],
                        "\u{0}\
                         \u{1}\u{02}\u{004}\u{0008}\
                         \u{10}\u{020}\u{0040}\u{80}\
                         \u{100}\u{0200}\u{400}\u{800}\
                         \u{1000}\u{2000}\u{4000}\u{8000}\
                         \u{ffff}");
        assert_feed_ok!(d, [0xd8, 0x00, 0xdc, 0x00,
                            0xd8, 0x00, 0xdc, 0x01, 0xd8, 0x00, 0xdc, 0x02,
                            0xd8, 0x00, 0xdc, 0x04, 0xd8, 0x00, 0xdc, 0x08,
                            0xd8, 0x00, 0xdc, 0x10, 0xd8, 0x00, 0xdc, 0x20,
                            0xd8, 0x00, 0xdc, 0x40, 0xd8, 0x00, 0xdc, 0x80,
                            0xd8, 0x00, 0xdd, 0x00, 0xd8, 0x00, 0xde, 0x00,
                            0xd8, 0x01, 0xdc, 0x00, 0xd8, 0x02, 0xdc, 0x00,
                            0xd8, 0x04, 0xdc, 0x00, 0xd8, 0x08, 0xdc, 0x00,
                            0xd8, 0x10, 0xdc, 0x00, 0xd8, 0x20, 0xdc, 0x00,
                            0xd8, 0x40, 0xdc, 0x00, 0xd8, 0x80, 0xdc, 0x00,
                            0xd9, 0x00, 0xdc, 0x00, 0xda, 0x00, 0xdc, 0x00,
                            0xdb, 0xff, 0xdf, 0xff], [],
                        "\u{10000}\
                         \u{10001}\u{010002}\
                         \u{10004}\u{010008}\
                         \u{10010}\u{010020}\
                         \u{10040}\u{010080}\
                         \u{10100}\u{010200}\
                         \u{10400}\u{010800}\
                         \u{11000}\u{012000}\
                         \u{14000}\u{018000}\
                         \u{20000}\u{030000}\
                         \u{50000}\u{090000}\
                         \u{10FFFF}");
        assert_finish_ok!(d, "");
    }

    // BMP characters whose two bytes arrive in separate feeds.
    #[test]
    fn test_decoder_valid_partial_bmp() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0x12], "");
        assert_feed_ok!(d, [0x34], [], "\u{1234}");
        assert_feed_ok!(d, [], [0x56], "");
        assert_feed_ok!(d, [0x78], [], "\u{5678}");
        assert_finish_ok!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0x12], "");
        assert_feed_ok!(d, [0x34], [0x56], "\u{1234}");
        assert_feed_ok!(d, [0x78, 0xab, 0xcd], [], "\u{5678}\u{abcd}");
        assert_finish_ok!(d, "");
    }

    // Surrogate pairs split across feeds at every byte position (1, 2 or 3 bytes pending).
    #[test]
    fn test_decoder_valid_partial_non_bmp() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8], "");
        assert_feed_ok!(d, [], [0x08], "");
        assert_feed_ok!(d, [], [0xdf], "");
        assert_feed_ok!(d, [0x45], [0xd9], "\u{12345}");
        assert_feed_ok!(d, [], [0x5e], "");
        assert_feed_ok!(d, [], [0xdc], "");
        assert_feed_ok!(d, [0x90], [], "\u{67890}");
        assert_finish_ok!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8], "");
        assert_feed_ok!(d, [], [0x08, 0xdf], "");
        assert_feed_ok!(d, [0x45], [0xd9, 0x5e], "\u{12345}");
        assert_feed_ok!(d, [0xdc, 0x90], [], "\u{67890}");
        assert_finish_ok!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8, 0x08, 0xdf], "");
        assert_feed_ok!(d, [0x45], [0xd9, 0x5e, 0xdc], "\u{12345}");
        assert_feed_ok!(d, [0x90], [], "\u{67890}");
        assert_finish_ok!(d, "");
    }

    // Finishing while 1, 2 or 3 bytes of a sequence are still pending is an error.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0x12], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8, 0x08], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8, 0x08, 0xdf], "");
        assert_finish_err!(d, "");
    }

    // An upper surrogate followed by something other than a lower surrogate,
    // for the lowest (0xd800) and highest (0xdbff) upper surrogate.
    #[test]
    fn test_decoder_invalid_lone_upper_surrogate() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8, 0x00], "");
        assert_feed_err!(d, [], [], [0x12, 0x34], "");
        assert_feed_err!(d, [], [0xd8, 0x00], [0x56, 0x78], "");
        assert_feed_ok!(d, [], [0xd8, 0x00], "");
        assert_feed_err!(d, [], [], [0xd8, 0x00], "");
        assert_feed_ok!(d, [], [0xd8, 0x00], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xdb, 0xff], "");
        assert_feed_err!(d, [], [], [0x12, 0x34], "");
        assert_feed_err!(d, [], [0xdb, 0xff], [0x56, 0x78], "");
        assert_feed_ok!(d, [], [0xdb, 0xff], "");
        assert_feed_err!(d, [], [], [0xdb, 0xff], "");
        assert_feed_ok!(d, [], [0xdb, 0xff], "");
        assert_finish_err!(d, "");
    }

    // Same as above, but with the lone upper surrogate (and sometimes the first byte
    // of the following unit) delivered in an earlier feed; the `-1` cases are the ones
    // where the offending unit began one byte before the current input.
    #[test]
    fn test_decoder_invalid_lone_upper_surrogate_partial() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8], "");
        assert_feed_err!(d, [], [0x00], [0x12, 0x34], "");
        assert_feed_ok!(d, [], [0xd8, 0x00, 0x56], "");
        assert_feed_err!(d, -1, [], [], [0x56, 0x78], "");
        assert_feed_ok!(d, [], [0xd8], "");
        assert_feed_err!(d, [], [0x00], [0xd8, 0x00], "");
        assert_feed_ok!(d, [], [0xd8, 0x00, 0xdb], "");
        assert_feed_err!(d, -1, [], [], [0xdb, 0xff], "");
        assert_feed_ok!(d, [], [0xd8], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xdb], "");
        assert_feed_err!(d, [], [0xff], [0x12, 0x34], "");
        assert_feed_ok!(d, [], [0xdb, 0xff, 0x56], "");
        assert_feed_err!(d, -1, [], [], [0x56, 0x78], "");
        assert_feed_ok!(d, [], [0xdb], "");
        assert_feed_err!(d, [], [0xff], [0xdb, 0xff], "");
        assert_feed_ok!(d, [], [0xdb, 0xff, 0xd8], "");
        assert_feed_err!(d, -1, [], [], [0xd8, 0x00], "");
        assert_feed_ok!(d, [], [0xdb], "");
        assert_finish_err!(d, "");
    }

    // A lower surrogate with no preceding upper surrogate,
    // for the lowest (0xdc00) and highest (0xdfff) lower surrogate.
    #[test]
    fn test_decoder_invalid_lone_lower_surrogate() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_err!(d, [], [0xdc, 0x00], [], "");
        assert_feed_err!(d, [0x12, 0x34], [0xdc, 0x00], [0x56, 0x78], "\u{1234}");
        assert_finish_ok!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_err!(d, [], [0xdf, 0xff], [], "");
        assert_feed_err!(d, [0x12, 0x34], [0xdf, 0xff], [0x56, 0x78], "\u{1234}");
        assert_finish_ok!(d, "");
    }

    // Lone lower surrogate whose two bytes arrive in separate feeds.
    // The second half reuses the same decoder after `assert_finish_ok!`.
    #[test]
    fn test_decoder_invalid_lone_lower_surrogate_partial() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xdc], "");
        assert_feed_err!(d, [], [0x00], [], "");
        assert_feed_ok!(d, [0x12, 0x34], [0xdc], "\u{1234}");
        assert_feed_err!(d, [], [0x00], [0x56, 0x78], "");
        assert_finish_ok!(d, "");

        assert_feed_ok!(d, [], [0xdf], "");
        assert_feed_err!(d, [], [0xff], [], "");
        assert_feed_ok!(d, [0x12, 0x34], [0xdf], "\u{1234}");
        assert_feed_err!(d, [], [0xff], [0x56, 0x78], "");
        assert_finish_ok!(d, "");
    }

    // A single dangling byte at the end of the stream is reported by finish.
    #[test]
    fn test_decoder_invalid_one_byte_before_finish() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0x12], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0x12, 0x34], [0x56], "\u{1234}");
        assert_finish_err!(d, "");
    }

    // An upper surrogate plus half of a lower surrogate at the end of the stream.
    #[test]
    fn test_decoder_invalid_three_bytes_before_finish() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8, 0x00, 0xdc], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0x12, 0x34], [0xd8, 0x00, 0xdc], "\u{1234}");
        assert_finish_err!(d, "");
    }

    // Same three dangling bytes, delivered over several feeds.
    #[test]
    fn test_decoder_invalid_three_bytes_before_finish_partial() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8], "");
        assert_feed_ok!(d, [], [0x00], "");
        assert_feed_ok!(d, [], [0xdc], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0x12, 0x34], [0xd8], "\u{1234}");
        assert_feed_ok!(d, [], [0x00, 0xdc], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0x12, 0x34], [0xd8, 0x00], "\u{1234}");
        assert_feed_ok!(d, [], [0xdc], "");
        assert_finish_err!(d, "");
    }

    // After a failing finish the decoder must be usable again from a clean state.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0x12, 0x34], [0x12], "\u{1234}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x12, 0x34], [], "\u{1234}");
        assert_finish_ok!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0xd8, 0x08, 0xdf, 0x45], [0xd8, 0x08, 0xdf], "\u{12345}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xd8, 0x08, 0xdf, 0x45], [0xd8, 0x08], "\u{12345}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xd8, 0x08, 0xdf, 0x45], [0xd8], "\u{12345}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xd8, 0x08, 0xdf, 0x45], [], "\u{12345}");
        assert_finish_ok!(d, "");
    }
}

encoding-0.2.33/src/codec/utf_8.rs01006440000765000002400000072224125331707360015061 0ustar0000000000000000// This is a part of rust-encoding.
// Copyright (c) 2013-2015, Kang Seonghoon.
// See README.md and LICENSE.txt for details.
//
// Portions Copyright (c) 2008-2009 Bjoern Hoehrmann
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. //! UTF-8, the universal encoding. use std::{str, mem}; use std::convert::Into; use types::*; /** * UTF-8 (UCS Transformation Format, 8-bit). * * This is a Unicode encoding compatible to ASCII (ISO/IEC 646:US) * and able to represent all Unicode codepoints uniquely and unambiguously. * It has a variable-length design, * where one codepoint may use 1 (up to U+007F), 2 (up to U+07FF), 3 (up to U+FFFF) * and 4 bytes (up to U+10FFFF) depending on its value. * The first byte of the sequence is distinct from other "continuation" bytes of the sequence * making UTF-8 self-synchronizable and easy to handle. * It has a fixed endianness, and can be lexicographically sorted by codepoints. * * The UTF-8 scanner used by this module is heavily based on Bjoern Hoehrmann's * [Flexible and Economical UTF-8 Decoder](http://bjoern.hoehrmann.de/utf-8/decoder/dfa/). */ #[derive(Clone, Copy)] pub struct UTF8Encoding; impl Encoding for UTF8Encoding { fn name(&self) -> &'static str { "utf-8" } fn whatwg_name(&self) -> Option<&'static str> { Some("utf-8") } fn raw_encoder(&self) -> Box { UTF8Encoder::new() } fn raw_decoder(&self) -> Box { UTF8Decoder::new() } } /// An encoder for UTF-8. 
#[derive(Clone, Copy)] pub struct UTF8Encoder; impl UTF8Encoder { pub fn new() -> Box { Box::new(UTF8Encoder) } } impl RawEncoder for UTF8Encoder { fn from_self(&self) -> Box { UTF8Encoder::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option) { let input: &[u8] = input.as_bytes(); assert!(str::from_utf8(input).is_ok()); output.write_bytes(input); (input.len(), None) } fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option { None } } /// A decoder for UTF-8. #[derive(Clone, Copy)] pub struct UTF8Decoder { queuelen: usize, queue: [u8; 4], state: u8, } impl UTF8Decoder { pub fn new() -> Box { Box::new(UTF8Decoder { queuelen: 0, queue: [0; 4], state: INITIAL_STATE }) } } static CHAR_CATEGORY: [u8; 256] = [ // 0 (00-7F): one byte sequence // 1 (80-8F): continuation byte // 2 (C2-DF): start of two byte sequence // 3 (E1-EC,EE-EF): start of three byte sequence, next byte unrestricted // 4 (ED): start of three byte sequence, next byte restricted to non-surrogates (80-9F) // 5 (F4): start of four byte sequence, next byte restricted to 0+10FFFF (80-8F) // 6 (F1-F3): start of four byte sequence, next byte unrestricted // 7 (A0-BF): continuation byte // 8 (C0-C1,F5-FF): invalid (overlong or out-of-range) start of multi byte sequences // 9 (90-9F): continuation byte // 10 (E0): start of three byte sequence, next byte restricted to non-overlong (A0-BF) // 11 (F0): start of four byte sequence, next byte restricted to non-overlong (90-BF) 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, ]; static STATE_TRANSITIONS: [u8; 110] = [ 0,98,12,24,48,84,72,98,98,98,36,60, // 0: '?? 86, 0,86,86,86,86,86, 0,86, 0,86,86, // 12: .. 'cc 86,12,86,86,86,86,86,12,86,12,86,86, // 24: .. 'cc cc 86,86,86,86,86,86,86,12,86,86,86,86, // 36: .. 'cc(A0-BF) cc 86,12,86,86,86,86,86,86,86,12,86,86, // 48: .. 'cc(80-9F) cc 86,86,86,86,86,86,86,24,86,24,86,86, // 60: .. 'cc(90-BF) cc cc 86,24,86,86,86,86,86,24,86,24,86,86, // 72: .. 'cc cc cc 86,24,86,86,86,86,86,86,86,86,86,86,86,86, // 84: .. 'cc(80-8F) cc cc // 86,86,86,86,86,86,86,86,86,86,86,86, // 86: .. xx '.. 98,98,98,98,98,98,98,98,98,98,98,98, // 98: xx '.. ]; static INITIAL_STATE: u8 = 0; static ACCEPT_STATE: u8 = 0; static REJECT_STATE: u8 = 98; static REJECT_STATE_WITH_BACKUP: u8 = 86; macro_rules! is_reject_state(($state:expr) => ($state >= REJECT_STATE_WITH_BACKUP)); macro_rules! next_state(($state:expr, $ch:expr) => ( STATE_TRANSITIONS[($state + CHAR_CATEGORY[$ch as usize]) as usize] )); impl RawDecoder for UTF8Decoder { fn from_self(&self) -> Box { UTF8Decoder::new() } fn is_ascii_compatible(&self) -> bool { true } fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option) { output.writer_hint(input.len()); fn write_bytes(output: &mut StringWriter, bytes: &[u8]) { output.write_str(unsafe {mem::transmute(bytes)}); } let mut state = self.state; let mut processed = 0; let mut offset = 0; // optimization: if we are in the initial state, quickly skip to the first non-MSB-set byte. 
if state == INITIAL_STATE { let first_msb = input.iter().position(|&ch| ch >= 0x80).unwrap_or(input.len()); offset += first_msb; processed += first_msb; } for (i, &ch) in input[offset..].iter().enumerate() { state = next_state!(state, ch); if state == ACCEPT_STATE { processed = i + offset + 1; } else if is_reject_state!(state) { let upto = if state == REJECT_STATE {i + offset + 1} else {i + offset}; self.state = INITIAL_STATE; if processed > 0 && self.queuelen > 0 { // flush `queue` outside the problem write_bytes(output, &self.queue[0..self.queuelen]); } self.queuelen = 0; write_bytes(output, &input[0..processed]); return (processed, Some(CodecError { upto: upto as isize, cause: "invalid sequence".into() })); } } self.state = state; if processed > 0 && self.queuelen > 0 { // flush `queue` write_bytes(output, &self.queue[0..self.queuelen]); self.queuelen = 0; } write_bytes(output, &input[0..processed]); if processed < input.len() { let morequeuelen = input.len() - processed; for i in 0..morequeuelen { self.queue[self.queuelen + i] = input[processed + i]; } self.queuelen += morequeuelen; } (processed, None) } fn raw_finish(&mut self, _output: &mut StringWriter) -> Option { let state = self.state; let queuelen = self.queuelen; self.state = INITIAL_STATE; self.queuelen = 0; if state != ACCEPT_STATE { Some(CodecError { upto: 0, cause: "incomplete sequence".into() }) } else { assert!(queuelen == 0); None } } } /// Almost equivalent to `std::str::from_utf8`. /// This function is provided for the fair benchmark against the stdlib's UTF-8 conversion /// functions, as rust-encoding always allocates a new string. pub fn from_utf8<'a>(input: &'a [u8]) -> Option<&'a str> { let mut iter = input.iter(); let mut state; macro_rules! return_as_whole(() => (return Some(unsafe {mem::transmute(input)}))); // optimization: if we are in the initial state, quickly skip to the first non-MSB-set byte. 
loop { match iter.next() { Some(&ch) if ch < 0x80 => {} Some(&ch) => { state = next_state!(INITIAL_STATE, ch); break; } None => { return_as_whole!(); } } } for &ch in iter { state = next_state!(state, ch); if is_reject_state!(state) { return None; } } if state != ACCEPT_STATE { return None; } return_as_whole!(); } #[cfg(test)] mod tests { // portions of these tests are adopted from Markus Kuhn's UTF-8 decoder capability and // stress test: . use super::{UTF8Encoding, from_utf8}; use std::str; use testutils; use types::*; #[test] fn test_valid() { // one byte let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0x41], [], "A"); assert_feed_ok!(d, [0x42, 0x43], [], "BC"); assert_feed_ok!(d, [], [], ""); assert_feed_ok!(d, [0x44, 0x45, 0x46], [], "DEF"); assert_finish_ok!(d, ""); // two bytes let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0xc2, 0xa2], [], "\u{a2}"); assert_feed_ok!(d, [0xc2, 0xac, 0xc2, 0xa9], [], "\u{ac}\u{0a9}"); assert_feed_ok!(d, [], [], ""); assert_feed_ok!(d, [0xd5, 0xa1, 0xd5, 0xb5, 0xd5, 0xa2, 0xd5, 0xb8, 0xd6, 0x82, 0xd5, 0xa2, 0xd5, 0xa5, 0xd5, 0xb6], [], "\u{561}\u{0575}\u{562}\u{578}\u{582}\u{562}\u{565}\u{576}"); assert_finish_ok!(d, ""); // three bytes let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0xed, 0x92, 0x89], [], "\u{d489}"); assert_feed_ok!(d, [0xe6, 0xbc, 0xa2, 0xe5, 0xad, 0x97], [], "\u{6f22}\u{5b57}"); assert_feed_ok!(d, [], [], ""); assert_feed_ok!(d, [0xc9, 0x99, 0xc9, 0x94, 0xc9, 0x90], [], "\u{259}\u{0254}\u{250}"); assert_finish_ok!(d, ""); // four bytes let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0xf0, 0x90, 0x82, 0x82], [], "\u{10082}"); assert_feed_ok!(d, [], [], ""); assert_finish_ok!(d, ""); // we don't test encoders as it is largely a no-op. 
} #[test] fn test_valid_boundary() { let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0x00], [], "\x00"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0x7f], [], "\x7f"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0xc2, 0x80], [], "\u{80}"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0xdf, 0xbf], [], "\u{7ff}"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0xe0, 0xa0, 0x80], [], "\u{800}"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0xed, 0x9f, 0xbf], [], "\u{d7ff}"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0xee, 0x80, 0x80], [], "\u{e000}"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0xef, 0xbf, 0xbf], [], "\u{ffff}"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0xf0, 0x90, 0x80, 0x80], [], "\u{10000}"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0xf4, 0x8f, 0xbf, 0xbf], [], "\u{10ffff}"); assert_finish_ok!(d, ""); } #[test] fn test_valid_partial() { let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [0xf0], ""); assert_feed_ok!(d, [], [0x90], ""); assert_feed_ok!(d, [], [0x82], ""); assert_feed_ok!(d, [0x82], [0xed], "\u{10082}"); assert_feed_ok!(d, [0x92, 0x89], [], "\u{d489}"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [0xc2], ""); assert_feed_ok!(d, [0xa9, 0x20], [], "\u{a9}\u{020}"); assert_finish_ok!(d, ""); } #[test] fn test_invalid_continuation() { for c in 0x80..0xc0 { let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [c], [], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [c], [c], ""); assert_finish_ok!(d, ""); let mut d = 
UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [c], [c, c], ""); assert_finish_ok!(d, ""); } } #[test] fn test_invalid_surrogate() { // surrogates should fail at the second byte. let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xed], [0xa0, 0x80], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xed], [0xad, 0xbf], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xed], [0xae, 0x80], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xed], [0xaf, 0xbf], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xed], [0xb0, 0x80], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xed], [0xbe, 0x80], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xed], [0xbf, 0xbf], ""); assert_finish_ok!(d, ""); } #[test] fn test_invalid_boundary() { // as with surrogates, should fail at the second byte. let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xf4], [0x90, 0x90, 0x90], ""); // U+110000 assert_finish_ok!(d, ""); } #[test] fn test_invalid_start_immediate_test_finish() { for c in 0xf5..0x100 { let c = c as u8; let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [c], [], ""); assert_finish_ok!(d, ""); } } #[test] fn test_invalid_start_followed_by_space() { for c in 0xf5..0x100 { let c = c as u8; let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [c], [0x20], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [c], [], ""); assert_feed_ok!(d, [0x20], [], "\x20"); assert_finish_ok!(d, ""); } } #[test] fn test_invalid_lone_start_immediate_test_finish() { for c in 0xc2..0xf5 { let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [c], ""); // wait for cont. 
bytes assert_finish_err!(d, ""); } } #[test] fn test_invalid_lone_start_followed_by_space() { for c in 0xc2..0xf5 { let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [c], [0x20], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes assert_feed_err!(d, [], [], [0x20], ""); assert_finish_ok!(d, ""); } } #[test] fn test_invalid_incomplete_three_byte_seq_followed_by_space() { for b in 0xe0..0xf5 { let c = if b == 0xe0 || b == 0xf0 {0xa0} else {0x80}; let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [b, c], [0x20], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [b, c], ""); // wait for cont. bytes assert_feed_err!(d, [], [], [0x20], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [b], ""); // wait for cont. bytes assert_feed_err!(d, [], [c], [0x20], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [b], ""); // wait for cont. bytes assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes assert_feed_err!(d, [], [], [0x20], ""); assert_finish_ok!(d, ""); } } #[test] fn test_invalid_incomplete_four_byte_seq_followed_by_space() { for a in 0xf0..0xf5 { let b = if a == 0xf0 {0xa0} else {0x80}; let c = 0x80; let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [a, b, c], [0x20], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [a], ""); // wait for cont. bytes assert_feed_ok!(d, [], [b], ""); // wait for cont. bytes assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes assert_feed_err!(d, [], [], [0x20], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [a, b], ""); // wait for cont. 
bytes assert_feed_err!(d, [], [c], [0x20], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [a, b, c], ""); // wait for cont. bytes assert_feed_err!(d, [], [], [0x20], ""); assert_finish_ok!(d, ""); } } #[test] fn test_invalid_too_many_cont_bytes() { let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [0xc2, 0x80], [0x80], [], "\u{80}"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [0xe0, 0xa0, 0x80], [0x80], [], "\u{800}"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [0xf0, 0x90, 0x80, 0x80], [0x80], [], "\u{10000}"); assert_finish_ok!(d, ""); // no continuation byte is consumed after 5/6-byte sequence starters and FE/FF let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xf8], [0x88, 0x80, 0x80, 0x80, 0x80], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xfc], [0x84, 0x80, 0x80, 0x80, 0x80, 0x80], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xfe], [0x80], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xff], [0x80], ""); assert_finish_ok!(d, ""); } #[test] fn test_invalid_too_many_cont_bytes_partial() { let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [0xc2], ""); assert_feed_err!(d, [0x80], [0x80], [], "\u{80}"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [0xe0, 0xa0], ""); assert_feed_err!(d, [0x80], [0x80], [], "\u{800}"); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [], [0xf0, 0x90, 0x80], ""); assert_feed_err!(d, [0x80], [0x80], [], "\u{10000}"); assert_finish_ok!(d, ""); // no continuation byte is consumed after 5/6-byte sequence starters and FE/FF let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xf8], [], ""); assert_feed_err!(d, [], [0x88], 
[0x80, 0x80, 0x80, 0x80], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xfc], [], ""); assert_feed_err!(d, [], [0x84], [0x80, 0x80, 0x80, 0x80, 0x80], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xfe], [], ""); assert_feed_err!(d, [], [0x80], [], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xff], [], ""); assert_feed_err!(d, [], [0x80], [], ""); assert_finish_ok!(d, ""); } #[test] fn test_invalid_overlong_minimal() { let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xc0], [0x80], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xe0], [0x80, 0x80], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xf0], [0x80, 0x80, 0x80], ""); assert_finish_ok!(d, ""); } #[test] fn test_invalid_overlong_maximal() { let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xc1], [0xbf], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xe0], [0x9f, 0xbf], ""); assert_finish_ok!(d, ""); let mut d = UTF8Encoding.raw_decoder(); assert_feed_err!(d, [], [0xf0], [0x8f, 0xbf, 0xbf], ""); assert_finish_ok!(d, ""); } #[test] fn test_feed_after_finish() { let mut d = UTF8Encoding.raw_decoder(); assert_feed_ok!(d, [0xc2, 0x80], [0xc2], "\u{80}"); assert_finish_err!(d, ""); assert_feed_ok!(d, [0xc2, 0x80], [], "\u{80}"); assert_finish_ok!(d, ""); } #[test] fn test_correct_from_utf8() { let s = testutils::ASCII_TEXT.as_bytes(); assert_eq!(from_utf8(s), str::from_utf8(s).ok()); let s = testutils::KOREAN_TEXT.as_bytes(); assert_eq!(from_utf8(s), str::from_utf8(s).ok()); let s = testutils::INVALID_UTF8_TEXT; assert_eq!(from_utf8(s), str::from_utf8(s).ok()); } mod bench_ascii { extern crate test; use super::super::{UTF8Encoding, from_utf8}; use std::str; use testutils; use types::*; 
#[bench] fn bench_encode(bencher: &mut test::Bencher) { let s = testutils::ASCII_TEXT; bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ UTF8Encoding.encode(s, EncoderTrap::Strict) })) } #[bench] fn bench_decode(bencher: &mut test::Bencher) { let s = testutils::ASCII_TEXT.as_bytes(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ UTF8Encoding.decode(s, DecoderTrap::Strict) })) } #[bench] fn bench_from_utf8(bencher: &mut test::Bencher) { let s = testutils::ASCII_TEXT.as_bytes(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ from_utf8(s) })) } #[bench] // for the comparison fn bench_stdlib_from_utf8(bencher: &mut test::Bencher) { let s = testutils::ASCII_TEXT.as_bytes(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ str::from_utf8(s) })) } #[bench] // for the comparison fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) { let s = testutils::ASCII_TEXT.as_bytes(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ String::from_utf8_lossy(s) })) } } // why Korean? it has an excellent mix of multibyte sequences and ASCII sequences // unlike other CJK scripts, so it reflects a practical use case a bit better. 
mod bench_korean { extern crate test; use super::super::{UTF8Encoding, from_utf8}; use std::str; use testutils; use types::*; #[bench] fn bench_encode(bencher: &mut test::Bencher) { let s = testutils::KOREAN_TEXT; bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ UTF8Encoding.encode(s, EncoderTrap::Strict) })) } #[bench] fn bench_decode(bencher: &mut test::Bencher) { let s = testutils::KOREAN_TEXT.as_bytes(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ UTF8Encoding.decode(s, DecoderTrap::Strict) })) } #[bench] fn bench_from_utf8(bencher: &mut test::Bencher) { let s = testutils::KOREAN_TEXT.as_bytes(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ from_utf8(s) })) } #[bench] // for the comparison fn bench_stdlib_from_utf8(bencher: &mut test::Bencher) { let s = testutils::KOREAN_TEXT.as_bytes(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ str::from_utf8(s) })) } #[bench] // for the comparison fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) { let s = testutils::KOREAN_TEXT.as_bytes(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ String::from_utf8_lossy(s) })) } } mod bench_lossy_invalid { extern crate test; use super::super::{UTF8Encoding, from_utf8}; use std::str; use testutils; use types::*; use types::DecoderTrap::Replace as DecodeReplace; #[bench] fn bench_decode_replace(bencher: &mut test::Bencher) { let s = testutils::INVALID_UTF8_TEXT; bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ UTF8Encoding.decode(s, DecodeReplace) })) } #[bench] // for the comparison fn bench_from_utf8_failing(bencher: &mut test::Bencher) { let s = testutils::INVALID_UTF8_TEXT; bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ from_utf8(s) })) } #[bench] // for the comparison fn bench_stdlib_from_utf8_failing(bencher: &mut test::Bencher) { let s = testutils::INVALID_UTF8_TEXT; bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ 
str::from_utf8(s) })) } #[bench] // for the comparison fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) { let s = testutils::INVALID_UTF8_TEXT; bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ String::from_utf8_lossy(s) })) } } mod bench_lossy_external { extern crate test; use super::super::{UTF8Encoding, from_utf8}; use std::str; use testutils; use types::*; use types::DecoderTrap::Replace as DecodeReplace; #[bench] fn bench_decode_replace(bencher: &mut test::Bencher) { let s = testutils::get_external_bench_data(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ UTF8Encoding.decode(&s, DecodeReplace) })) } #[bench] // for the comparison fn bench_from_utf8_failing(bencher: &mut test::Bencher) { let s = testutils::get_external_bench_data(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ from_utf8(&s) })) } #[bench] // for the comparison fn bench_stdlib_from_utf8_failing(bencher: &mut test::Bencher) { let s = testutils::get_external_bench_data(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ str::from_utf8(&s) })) } #[bench] // for the comparison fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) { let s = testutils::get_external_bench_data(); bencher.bytes = s.len() as u64; bencher.iter(|| test::black_box({ String::from_utf8_lossy(&s) })) } } } encoding-0.2.33/src/codec/whatwg.rs01006440000765000002400000002176125331707360015334 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. //! Asymmetric or special encoding constructions required by the WHATWG Encoding standard. use codec; use types::*; /// Replacement encoding used to solve a particular attack vector due to mismatching server and /// client supports for encodings. It is rarely useful outside. 
/// Algorithmic mapping for `x-user-defined` encoding.
pub mod x_user_defined {
    /// Forward mapping: puts `code` into the low byte of a code unit in the U+F7xx block.
    #[inline]
    pub fn forward(code: u8) -> u16 {
        let low = u16::from(code);
        low | 0xf700
    }

    /// Backward mapping: returns the low byte of `code` when it lies in
    /// U+F780 through U+F7FF, and 0 for every other value.
    #[inline]
    pub fn backward(code: u32) -> u8 {
        let mapped = 0xf780 <= code && code <= 0xf7ff;
        if !mapped {
            return 0;
        }
        (code & 0xff) as u8
    }
}

معاهدة المبادئ المنظمة لأنشطة الدول في ميدان
استكشاف واستخدام الفضاء الخارجي، بما في ذلك
القمر والأجرام السماوية الأخرى

  

  

إن الدول الأطراف في هذه المعاهدة

،إذ تستلهم الآفاق الواسعة التي فتحها أمام الانسانية ولوج الانسان الفضاء الخارجي

وإذ تدرك المصلحة المشتركة التي تعود على جميع الانسانية من التقدم في ميدان
،استكشاف الفضاء الخارجي واستخدامه للأغراض السلمية  

وإذ تعتقد ان استكشاف الفضاء الخارجي واستخدامه يجب أن يباشرا لتحقيق فائدة
،جميع الشعوب أيا كانت درجة نمائها الاقتصادي أو العلمي  

وإذ تود الاسهام في تعاون دولي واسع يتناول النواحي العلمية إلى جانب النواحي
،القانونية من استكشاف الفضاء الخارجي واستخدامه للأغراض السلمية  

وإذ تعتقد ان هذا التعاون سيسهم في انماء التفاهم المتبادل وفي توثيق العلاقات الودية
،بين الأمم والشعوب  

وإذ تشير إلى القرار ١٩٦٢ (د- ١٨ ) ذي العنوان التالي "اعلان المبادئ القانونية
المنظمة لأنشطة الدول في ميدان استكشاف الفضاء الخارجي واستخدامه "، وهو القرار الذي
،اتخذته الجمعية العامة للأمم المتحدة بالاجماع في ١٣ كانون الأول/ديسمبر ١٩٦٣  

وإذ تشير إلى القرار ١٨٨٤ (د- ١٨ ) الذي يدعو الدول إلى الامتناع عن وضع أية
أجسام، تحمل أية أسلحة نووية أو أي نوع آخر من أسلحة التدمير الشامل، في أي مدار حول
الأرض، أو عن وضع مثل هذه الأسلحة على أية أجرام سماوية، وهو القرار الذي اتخذته
،الجمعية العامة للأمم المتحدة بالاجماع في ١٧ تشرين الأول/أكتوبر ١٩٦٣

وإذ تراعي القرار ١١٠ (د- ٢) الذي اتخذته الجمعية العامة للأمم المتحدة في ٣
تشرين الثاني/نوفمبر ١٩٤٧، وشجبت فيه الدعاية الرامية أو المؤدية إلى إثارة أو تشجيع أي
تهديد أو خرق للسلم أو أي عمل عدواني، وإذ ترى ان القرار السالف الذكر يسري على
الفضاء الخارجي،

واقتناعا منها بأن عقد معاهدة تتضمن المبادئ المنظمة لأنشطة الدول في ميدان
استكشاف واستخدام الفضاء الخارجي، بما في ذلك القمر والأجرام السماوية الأخرى، من
،شأنه تعزيز مقاصد ميثاق الأمم المتحدة ومبادئه

:قد اتفقت على ما يلي

 

المادة الأولى

يباشر استكشاف واستخدام الفضاء الخارجي، بما في ذلك القمر والأجرام السماوية
،الأخرى، لتحقيق فائدة ومصالح جميع البلدان، أيا كانت درجة نمائها الاقتصادي أو العلمي
.ويكونان ميدانا للبشرية قاطبة

وتكون لجميع الدول حرية استكشاف واستخدام الفضاء الخارجي، بما في ذلك القمر
والأجرام السماوية الأخرى دون تمييز وعلى قدم المساواة وفقا للقانون الدولي، ويكون حرا
.الوصول إلى جميع مناطق الأجرام السماوية

ويكون حرا اجراء الأبحاث العلمية في الفضاء الخارجي، بما في ذلك القمر والأجرام
.السماوية الأخرى، وتراعي الدول تيسير وتشجيع التعاون الدولي في مثل هذه الأبحاث

 

المادة الثانية

لا يجوز التملك القومي للفضاء الخارجي، بما في ذلك القمر والأجرام السماوية
الأخرى، بدعوى السيادة أو بطريق الاستخدام أو الاحتلال أو بأية وسيلة أخرى.

 

المادة الثالثة

تلتزم الدول الأطراف في المعاهدة، في مباشرة أنشطتها في ميدان استكشاف
واستخدام الفضاء الخارجي، بما في ذلك القمر والأجرام السماوية الأخرى، مراعاة القانون
الدولي، بما في ذلك ميثاق الأمم المتحدة، بغية صيانة السلم والأمن الدوليين وتعزيز التعاون
.والتفاهم الدوليين

 

المادة الرابعة

تتعهد الدول الأطراف في المعاهدة بعدم وضع أية أجسام تحمل أية أسلحة نووية أو
أي نوع آخر من أسلحة التدمير الشامل في أي مدار حول الأرض، أو وضع مثل هذه
.الأسلحة على أية أجرام سماوية أو في الفضاء الخارجي بأية طريقة أخرى

وتراعي جميع الدول الأطراف في المعاهدة قصر استخدامها للقمر والأجرام السماوية
الأخرى على الأغراض السلمية . ويحظر انشاء أية قواعد أو منشآت أو تحصينات عسكرية
وتجريب أي نوع من الأسلحة واجراء أية مناورات عسكرية في الأجرام السماوية . ولا يحظر
استخدام الملاكات العسكرية لأغراض البحث العلمي أو لأية أغراض سلمية أخرى . وكذلك
لا يحظر استخدام أية معدات أو مرافق تكون لازمة للاستكشاف السلمي للقمر وللأجرام
.السماوية الأخرى

 

المادة الخامسة

تراعي الدول الأطراف في المعاهدة اعتبار الملاحين الفضائيين بمثابة مبعوثي الانسانية
في الفضاء الخارجي وتزويدهم بكل مساعدة ممكنة عند حصول أي حادث أو محنة أو هبوط
اضطراري في اقليم أية دولة من الدول الأطراف أو في أعالي البحار . ويبادر، في حالة هبوط
الملاحين الفضائيين اضطرارا، إلى اعادتهم سالمين إلى الدولة المسجلة فيها مركبتهم الفضائية.

ويراعي الملاحون الفضائيون التابعون لأية دولة من الدول الأطراف تقديم كل
مساعدة ممكنة، عند مباشرة أية نشاطات في الفضاء الخارجي أو الأجرام السماوية، إلى
.الملاحين الفضائيين التابعين للدول الأطراف الأخرى

وتلتزم الدول المعنية الأطراف في المعاهدة القيام فورا باعلام الدول الأخرى الأطراف
في المعاهدة أو الأمين العام للأمم المتحدة بأية ظاهرة تكتشفها في الفضاء الخارجي، بما في ذلك
القمر والأجرام السماوية الأخرى، ويكون من شأنها تعريض حياة الملاحين الفضائيين أو
صحتهم للخطر.

 

المادة السادسة

تترتب على الدول الأطراف في المعاهدة مسؤولية دولية عن الأنشطة القومية المباشرة
في الفضاء الخارجي، بما في ذلك القمر والأجرام السماوية الأخرى، سواء باشرتها الهيئات
الحكومية أو غير الحكومية، وعن تأمين مباشرة الأنشطة القومية وفقا للمبادئ المقررة في هذه
المعاهدة. وتراعي الدولة المعنية الطرف في المعاهدة فرض الاجازة والإشراف المستمر على
أنشطة الهيئات غير الحكومية في الفضاء الخارجي، بما في ذلك القمر والأجرام السماوية
الأخرى، وفي حالة صدور الأنشطة المباشرة في الفضاء الخارجي، بما في ذلك القمر والأجرام
السماوية الأخرى، عن احدى المنظمات الدولية، تكون هذه المنظمة، مع الدول التي تكون
.مشتركة فيها وأطرافا في المعاهدة، هي صاحبة المسؤولية عن التزام أحكام المعاهدة

 

المادة السابعة

تترتب على كل دولة من الدول الأطراف في المعاهدة تطلق أو تتيح اطلاق أي جسم
في الفضاء الخارجي، بما في ذلك القمر والأجرام السماوية الأخرى، وعلى كل دولة من الدول
الأطراف يطلق أي جسم من اقليمها أو من منشآتها، المسؤولية الدولية عن الأضرار التي تلحق
أية دولة أخرى من الدول الأطراف في المعاهدة أي شخص من أشخاصها الطبيعيين أو
،القانونيين بسبب ذلك الجسم أجزائه فوق الأرض أو في الفضاء الجوي أو في الفضاء الخارجي
.بما في ذلك القمر والأجرام السماوية الأخرى

 

المادة الثامنة

تحتفظ الدولة الطرف في المعاهدة والمقيد في سجلها أي جسم مطلق في الفضاء
الخارجي بالولاية والمراقبة على ذلك الجسم وعلى أي أشخاص يحملهم أثناء وجوده
ووجودهم في الفضاء الخارجي أو على أي جرم سماوي، ولا تتأثر ملكية الأجسام المطلقة في
الفضاء الخارجي، بما في ذلك الأجسام الهابطة أو المنشأة على أي جرم سماوي، ولا ملكية
أجزائها، بوجودها في الفضاء الخارجي أو على جرم سماوي أو بعودتها إلى الأرض. وترد إلى
دولة السجل التي تكون طرفا في المعاهدة أية أجسام مقيدة في سجلها أو أية أجزاء منها يعثر
عليها خارج حدودها، على أن تقوم تلك الدولة قبل الرد بتقديم البيانات الثبوتية اللازمة عند
.طلبها

 

المادة التاسعة

تلتزم الدول الأطراف في المعاهدة، في استكشاف واستخدام الفضاء الخارجي، بما في
ذلك القمر والأجرام السماوية الأخرى، الاسترشاد بمبدأ التعاون والتساعد المتبادل، والمراعاة
،الحقة في مباشرة أنشطتها في الفضاء الخارجي، بما في ذلك القمر والأجرام السماوية الأخرى
للمصالح المقابلة التي تكون لجميع الدول الأخرى الأطراف في المعاهدة . وتلتزم الدول الأطراف
في المعاهدة، في دراسة واستكشاف الفضاء الخارجي، بما في ذلك القمر والأجرام السماوية
الأخرى، تفادي إحداث أي تلويث ضار لها وكذلك أية تغييرات ضارة في البيئة الأرضية
يسببها إدخال أية مواد غير أرضية، والقيام عند الاقتضاء باتخاذ التدابير المناسبة لهذا الغرض.
ويجب على كل دولة من الدول الأطراف في المعاهدة، يكون لديها من الأسباب ما يحملها
على الاعتقاد بأن ثمة نشاطا تجريبيا مزمعا منها أو من مواطنيها في الفضاء الخارجي، بما في
ذلك القمر والأجرام السماوية الأخرى، قد يتسبب في عرقلة، محتملة الإضرار، لأنشطة الدول
الأطراف الأخرى في ميدان استكشاف واستخدام الفضاء الخارجي، بما في ذلك القمر
والأجرام السماوية الأخرى، للأغراض السلمية، اجراء المشاورات الدولية المناسبة قبل الشروع
في ذلك النشاط أو التجريب. ويجوز لكل دولة من الدول الأطراف في المعاهدة يكون لديها
من الأسباب ما يحملها على الاعتقاد بأن ثمة نشاطا أو تجريبا مزمعا من أية دولة أخرى من
الدول الأطراف في الفضاء الخارجي، بما في ذلك القمر والأجرام السماوية الأخرى، قد
يتسبب في عرقلة، محتملة الإضرار، للأنشطة المباشرة في ميدان استكشاف واستخدام الفضاء
الخارجي، بما في ذلك القمر والأجرام السماوية الأخرى، للأغراض السلمية، طلب اجراء
.المشاورات اللازمة بشأن ذلك النشاط التجريبي

 

المادة العاشرة

تراعي الدول الأطراف في المعاهدة والمطلقة لأية أجسام فضائية، تعزيزا للتعاون
الدولي في ميدان استكشاف واستخدام الفضاء الخارجي، بما في ذلك القمر والأجرام السماوية
الأخرى، ووفقا لمقاصد هذه المعاهدة، النظر على قدم المساواة في أية طلبات من الدول
الأخرى الأطراف في المعاهدة تطلب اليها فيها توفير التسهيلات اللازمة لها لمراقبة طيران
.الأجسام الفضائية المطلقة منها

ويجري، بالاتفاق بين الدول المعنية، تحديد طبيعة تلك التسهيلات اللازمة للمراقبة
.وتعيين الشروط المناسبة لتوفيرها

 

المادة الحادية عشرة

توافق الدول الأطراف في المعاهدة والمباشرة لأية أنشطة في الفضاء الخارجي، بما في
ذلك القمر والأجرام السماوية الأخرى، تعزيزا للتعاون الدولي في ميدان استكشاف الفضاء
الخارجي واستخدامه، على القيام، في أوسع نطاق عملي ممكن، بموافاة الأمين العام للأمم
المتحدة، وكذلك الجمهور والمجتمع العلمي الدولي، بالمعلومات اللازمة عن طبيعة تلك الأنشطة
ومباشرتها وأماكنها ونتائجها، ويجب على الأمين العام للأمم المتحدة أن يكون مستعدا، عند
.تلقي المعلومات المذكورة، لإذاعتها ونشرها فورا بالطريقة الفعالة

 

المادة الثانية عشرة

تتاح لممثلي الدول الأخرى الأطراف في المعاهدة، وعلى أساس التبادل، زيارة جميع
المحطات والمنشآت والمعدات والمركبات الفضائية التي تكون موجودة على القمر أو على
الأجرام السماوية الأخرى . ويراعي الممثلون المذكورون إرسال اعلان مسبق بزيارتهم المزمعة
لاتاحة اجراء المشاورات المناسبة وتيسير اتخاذ الاحتياطات القصوى اللازمة لكفالة السلامة
.ولتفادي عرقلة السير الطبيعية للعمليات المعتادة في المرفق المزمعة زيارته

 

المادة الثالثة عشرة

تسري أحكام هذه المعاهدة على الأنشطة التي تباشرها الدول الأطراف فيها في ميدان
استكشاف واستخدام الفضاء الخارجي، بما في ذلك القمر والأجرام السماوية الأخرى، سواء
كانت تلك الأنشطة مباشرة من احدى الدول الأطراف في المعاهدة على سبيل الانفراد أو
بالاشتراك مع الدول الأخرى، بما في ذلك الحالات التي تكون فيها تلك الأنشطة مباشرة ضمن
.اطار المنظمات الحكومية الدولية

وتتولى الدول الأطراف في المعاهدة، بالنسبة إلى أية مسائل عملية تنشأ بصدد
الأنشطة المباشرة من المنظمات الحكومية الدولية في ميدان استكشاف واستخدام الفضاء
الخارجي، بما في ذلك القمر والأجرام السماوية الأخرى، التماس الحلول اللازمة لتلك المسائل
إما مع المنظمة الدولية المختصة وإما مع واحدة أو أكثر من الدول الأعضاء في تلك المنظمة
.والتي تكون أطرافا في هذه المعاهدة

 

المادة الرابعة عشرة

١- تعرض هذه المعاهدة لتوقيع جميع الدول . ويجوز الانضمام إلى هذه المعاهدة
في أي وقت لأية دولة لم توقعها قبل بدء نفاذها وفقا للفقرة ٣ من هذه
.المادة

٢- تخضع هذه المعاهدة لتصديق الدول الموقعة لها وتودع وثائق التصديق ووثائق
الانضمام لدى حكومات اتحاد الجمهوريات الاشتراكية السوفياتية والمملكة
،المتحدة لبريطانيا العظمى وايرلندا الشمالية والولايات المتحدة الأمريكية
.المعنية بحكم هذه المعاهدة باعتبارها الحكومات الوديعة

٣- يبدأ نفاذ هذه المعاهدة بإيداع وثائق تصديق خمس حكومات تكون من
.بينها الحكومات المعنية بحكم هذه المعاهدة باعتبارها الحكومات الوديعة

٤- يبدأ نفاذ هذه المعاهدة، بالنسبة إلى الدول التي تكون قد أودعت وثائق
تصديقها عليها أو انضمامها اليها بعد بدء نفاذها، ابتداء من تاريخ ايداع
.تلك الدول لوثائق تصديقها أو انضمامها

٥- تنهي الحكومات الوديعة، على وجه السرعة، إلى جميع الدول الموقعة لهذه
المعاهدة أو المنضمة اليها، تاريخ كل توقيع لها، وتاريخ ايداع كل وثيقة
تصديق عليها أو انضمام اليها، وتاريخ بدء نفاذها، وأية اعلانات أخرى
.تتصل بها

٦- تقوم الحكومات الوديعة بتسجيل هذه المعاهدة وفقا للمادة ١٠٢ من ميثاق
.الأمم المتحدة

 

المادة الخامسة عشرة

يجوز لأية دولة من الدول الأطراف في المعاهدة اقتراح ادخال التعديلات عليها
وتصبح التعديلات نافذة، بالنسبة إلى كل دولة تقبلها من الدول الأطراف في المعاهدة، فور
نيلها قبول أغلبية الدول الأطراف في المعاهدة، وتنفذ بعد ذلك بالنسبة إلى كل دولة أخرى من
.الدول الأطراف في المعاهدة، ابتداء من تاريخ قبول هذه الدولة لها

 

المادة السادسة عشرة

يجوز لكل دولة من الدول الأطراف في المعاهدة، بعد سنة من نفاذها، تخطر
بانسحابها منها باعلان كتابي ترسله إلى الحكومات الوديعة، ويسري الانسحاب بعد سنة من
.ورود هذا الاعلان

 

المادة السابعة عشرة

حررت هذه المعاهدة بخمس لغات رسمية متساوية الحجية هي الاسبانية والانكليزية
والروسية والصينية والفرنسية، وتودع في محفوظات الحكومات الوديعة . وتقوم الحكومات
الوديعة بارسال نسخ مصدقة من هذه المعاهدة إلى حكومات الدول الموقعة لها أو المنضمة
.اليها

واثباتا لما تقدم ، قام الموقعون أدناه، المفوضون بذلك حسب الأصول، بتوقيع هذه
.المعاهدة

حررت بثلاث نسخ في مدن لندن وموسكو وواشنطن العاصمة في اليوم السابع
.والعشرين من شهر كانون الثاني/يناير عام ألف وتسعمائة وسبعة وستين

 


关于各国探索和利用外层空间包括月球与其他天体活动所应遵守原则的条约

 

本条约各缔约国,

受到由于人类进入外层空间而在人类面前展现的伟大前景的鼓舞,

承认为和平目的而探索和利用外层空间所取得的进展关系到全人类共同的利益,

相信外层空间的探索和利用应造福于各国人民,不论他们的经济或科学发展的程度如何,

愿意在为和平目的而探索和利用外层空间的科学以及法律方面的广泛国际合作作出贡献,

相信这种合作将有助于促进各国和各国人民之间的相互谅解并加强他们之间的友好关系,

回顾联合国大会1963 年12 月13 日一致通过的题为"关于各国探索和利用外层空间活动的法律原则宣言"的第1962(XVIII)号决议,

回顾联合国大会1963 年10 月17 日一致通过的第1884(XVIII)号决议,要求各国不要将任何载有核武器或任何其他种类大规模毁灭性武器的物体放置在环绕地球的轨道上,也不要在天体上装置这种武器,

考虑到联合国大会1947 年11 月3 日第110(II)号决议,谴责旨在或可能煽动或鼓励任何威胁和平、破坏和平或侵略行为的宣传,并认为上述决议也适用于外层空间,

深信缔结关于各国探索和利用外层空间包括月球与其他天体活动所应遵守原则的条约,将促进联合国宪章的宗旨和原则,

议定条款如下:

  

第一条

探索和利用外层空间,包括月球与其他天体在内,应本着为所有国家谋福利与利益的精神,不论其经济或科学发展的程度如何,这种探索和利用应是全人类的事情。

外层空间,包括月球与其他天体在内,应由各国在平等基础上并按国际法自由探索和利用,不得有任何歧视,天体的所有地区均得自由进入。

对外层空间,包括月球与其他天体在内,应有科学调查的自由,各国应在这类调查方面便利并鼓励国际合作。

  

第二条

外层空间,包括月球与其他天体在内,不得由国家通过提出主权主张,通过使用或占领,或以任何其他方法,据为己有。

  

第三条

本条约各缔约国探索和利用外层空间,包括月球与其他天体在内的活动,应按照国际法,包括联合国宪章,并为了维护国际和平与安全及增进国际合作与谅解而进行。

  

第四条

本条约各缔约国承诺不在环绕地球的轨道上放置任何载有核武器或任何其他种类大规模毁灭性武器的物体,不在天体上装置这种武器,也不以任何其他方式在外层空间设置这种武器。

本条约所有缔约国应专为和平目的使用月球和其他天体。禁止在天体上建立军事基地、军事设施和工事,试验任何类型的武器和进行军事演习。不禁止为了科学研究或任何其他和平目的而使用军事人员。为和平探索月球与其他天体所必需的任何装置或设备,也不在禁止之列。

  

第五条

本条约各缔约国应把航天员视为人类在外层空间的使者,航天员如遇意外事故、危难或在另一缔约国领土上或公海上紧急降落时,应给予他们一切可能的协助。航天员降落后,应将他们安全和迅速地送回航天器的登记国。

在外层空间及天体上进行活动时,任一缔约国的航天员应给予其他缔约国的航天员一切可能的协助。

本条约各缔约国如发现在包括月球与其他天体在内的外层空间有对航天员的生命或健康可能构成危险的任何现象,应立即通知本条约其他缔约国或联合国秘书长。

  

第六条

本条约各缔约国对本国在外层空间,包括月球与其他天体在内的活动应负国际责任,不论这类活动是由政府机构或是由非政府团体进行的。它并应负国际责任保证本国的活动符合本条约的规定。非政府团体在外层空间,包括月球与其他天体在内的活动,应经本条约有关缔约国批准并受其不断的监督。一个国际组织在外层空间,包括月球与其他天体在内进行活动时,遵守本条约的责任应由该国际组织和参加该国际组织的本条约各缔约国共同承担。

  

第七条

凡发射或促使发射物体进入外层空间,包括月球与其他天体在内的缔约国,以及以其领土或设备供发射物体用的缔约国,对于这种物体或其组成部分在地球上、在大气空间或在外层空间,包括月球与其他天体在内,使另一缔约国或其自然人或法人遭受损害时,应负国际责任。

  

第八条

凡本条约缔约国为射入外层空间物体的登记国者,对于该物体及其所载人员,当其在外层空间或在某一天体上时,应保有管辖权和控制权。向外层空间发射的物体,包括在某一天体上着陆或建筑的物体及其组成部分的所有权,不因其在外层空间或在某一天体上或因其返回地球而受影响。这类物体或组成部分如果在其所登记的缔约国境外发现,应交还该缔约国,如经请求,该缔约国应在交还前提供认证资料。

  

第九条

本条约各缔约国探索和利用外层空间,包括月球与其他天体在内,应以合作和互助的原则为指导,其在外层空间,包括月球与其他天体在内进行的各种活动,应充分注意本条约所有其他缔约国的相应利益。本条约各缔约国对外层空间,包括月球与其他天体在内进行的研究和探索,应避免使它们受到有害污染以及将地球外物质带入而使地球环境发生不利变化,并应在必要时为此目的采取适当措施。如果本条约某一缔约国有理由认为,该国或其国民在外层空间,包括月球与其他天体在内计划进行的活动或实验可能对其他缔约国和平探索和利用外层空间,包括月球与其他天体在内的活动产生有害干扰时,则该缔约国在开始进行任何这种活动或实验之前,应进行适当的国际磋商。如果本条约某一缔约国有理由认为,另一缔约国在外层空间,包括月球与其他天体在内计划进行的活动或实验,可能对和平探索和利用外层空间,包括月球与其他天体在内的活动产生有害干扰时,则该缔约国可请求就该活动或实验进行磋商。

  

第十条

为了按照本条约的宗旨促进在探索和利用外层空间,包括月球与其他天体在内的国际合作,本条约各缔约国应在平等基础上,考虑本条约其他缔约国就提供机会对其发射的外层空间物体的飞行进行观察所提出的任何要求。

这种观察机会的性质和提供这种机会的条件,应由有关国家议定。

  

第十一条

为了促进在和平探索和利用外层空间方面的国际合作,在外层空间,包括月球与其他天体在内进行活动的本条约各缔约国同意,在最大可能和实际可行的范围内,将这类活动的性质、进行情况、地点和结果通知联合国秘书长,并通告公众和国际科学界。联合国秘书长在接到上述情报后,应准备立即作有效传播。

  

第十二条

在月球与其他天体上的一切站所、设施、装备和航天器,应在对等的基础上对本条约其他缔约国的代表开放。这些代表应将所计划的参观,在合理的时间内提前通知,以便进行适当的磋商和采取最大限度的预防措施,以保证安全并避免干扰所要参观的设备的正常运行。

  

第十三条

本条约的规定应适用于本条约各缔约国探索和利用外层空间,包括月球与其他天体在内的活动,不论这类活动是由某一缔约国单独进行还是与其他国家联合进行,包括在国际政府间组织的范围内进行的活动在内。

国际政府间组织在进行探索和利用外层空间,包括月球与其他天体在内的活动时所产生的任何实际问题,应由本条约各缔约国与有关国际组织或与该国际组织内本条约一个或一个以上的缔约国成员解决。

  

第十四条

1. 本条约应开放供所有国家签署。未在本条约按照本条第三款生效之前签署的任何国家,得随时加入本条约。

2. 本条约须经签署国批准。批准书和加入书应交苏维埃社会主义共和国联盟、大不列颠及北爱尔兰联合王国和美利坚合众国三国政府保存,该三国政府经指定为保存国政府。

3. 本条约应自包括经指定为本条约保存国政府的三国政府在内的五国政府交存批准书起生效。

4. 对于在本条约生效后交存批准书或加入书的国家,本条约应自其批准书或加入书交存之日起生效。

5. 保存国政府应将每一签字的日期、本条约每份批准书和加入书的交存日期和本条约生效日期以及其他通知事项,迅速告知所有签署国和加入国。

6. 本条约应由保存国政府遵照联合国宪章第一百零二条办理登记。

  

第十五条

本条约任何缔约国得对本条约提出修正案。修正案应自本条约多数缔约国接受之日起,对接受修正案的各缔约国生效,其后,对其余各缔约国则应自其接受之日起生效。

  

第十六条

本条约任何缔约国得在条约生效一年后用书面通知保存国政府退出本条约。这种退出应自接到通知一年后生效。

  

第十七条

本条约的中文、英文、法文、西班牙文和俄文五种文本具有同等效力;本条约应保存在保存国政府的档案库内。本条约经正式核证的副本应由保存国政府分送签署国和加入国政府。

下列签署人,经正式授权,在本条约上签字,以资证明。

一九六七年一月二十七日订于伦敦、莫斯科和华盛顿,一式三份。


RESOLUTION ADOPTED BY THE GENERAL ASSEMBLY

2222 (XXI). Treaty on Principles Governing the Activities of States in the Exploration and Use of Outer Space, including the Moon and Other Celestial Bodies

The General Assembly,

Having considered the report of the Committee on the Peaceful Uses of Outer Space covering its work during 1966, 1 and in particular the work accomplished by the Legal Subcommittee during its fifth session, held at Geneva from 12 July to 4 August and at New York from 12 September to 16 September,

Noting further the progress achieved through subsequent consultations among States Members of the United Nations,

Reaffirming the importance of international cooperation in the field of activities in the peaceful exploration and use of outer space, including the Moon and other celestial bodies, and the importance of developing the rule of law in this new area of human endeavour,

1.     Commends the Treaty on Principles Governing the Activities of States in the Exploration and Use of Outer Space, including the Moon and Other Celestial Bodies, the text of which is annexed to the present resolution;

2.     Requests the Depositary Governments to open the Treaty for signature and ratification at the earliest possible date;

3.     Expresses its hope for the widest possible adherence to this Treaty;

4.     Requests the Committee on the Peaceful Uses of Outer Space:

(a) To continue to work on the elaboration of an agreement on liability for damages caused by the launching of objects into outer space and an agreement on assistance to and return of astronauts and space vehicles, which are on the agenda of the Committee;

(b) To begin at the same time the study of questions relative to the definition of outer space and the utilization of outer space and celestial bodies, including the various implications of space communications;

(c) To report on the progress of its work to the General Assembly at its twenty-second session.

 

 

1499th plenary meeting,
19 December 1966.

 

ANNEX

Treaty on Principles Governing the Activities of States
in the Exploration and Use of Outer Space, including
the Moon and Other Celestial Bodies

 

     The States Parties to this Treaty,

     Inspired by the great prospects opening up before mankind as a result of man's entry into outer space,

     Recognizing the common interest of all mankind in the progress of the exploration and use of outer space for peaceful purposes,

     Believing that the exploration and use of outer space should be carried on for the benefit of all peoples irrespective of the degree of their economic or scientific development,

     Desiring to contribute to broad international co-operation in the scientific as well as the legal aspects of the exploration and use of outer space for peaceful purposes,

     Believing that such co-operation will contribute to the development of mutual understanding and to the strengthening of friendly relations between States and peoples,

     Recalling resolution 1962 (XVIII), entitled "Declaration of Legal Principles Governing the Activities of States in the Exploration and Use of Outer Space", which was adopted unanimously by the United Nations General Assembly on 13 December 1963,

     Recalling resolution 1884 (XVIII), calling upon States to refrain from placing in orbit around the earth any objects carrying nuclear weapons or any other kinds of weapons of mass destruction or from installing such weapons on celestial bodies, which was adopted unanimously by the United Nations General Assembly on 17 October 1963,

     Taking account of United Nations General Assembly resolution 110 (II) of 3 November 1947, which condemned propaganda designed or likely to provoke or encourage any threat to the peace, breach of the peace or act of aggression, and considering that the aforementioned resolution is applicable to outer space,

     Convinced that a Treaty on Principles Governing the Activities of States in the Exploration and Use of Outer Space, including the Moon and Other Celestial Bodies, will further the purposes and principles of the Charter of the United Nations,

     Have agreed on the following:

 

Article I

    The exploration and use of outer space, including the moon and other celestial bodies, shall be carried out for the benefit and in the interests of all countries, irrespective of their degree of economic or scientific development, and shall be the province of all mankind.

    Outer space, including the moon and other celestial bodies, shall be free for exploration and use by all States without discrimination of any kind, on a basis of equality and in accordance with international law, and there shall be free access to all areas of celestial bodies.

    There shall be freedom of scientific investigation in outer space, including the moon and other celestial bodies, and States shall facilitate and encourage international co-operation in such investigation.

 

Article II

    Outer space, including the moon and other celestial bodies, is not subject to national appropriation by claim of sovereignty, by means of use or occupation, or by any other means.

 

Article III

    States Parties to the Treaty shall carry on activities in the exploration and use of outer space, including the moon and other celestial bodies, in accordance with international law, including the Charter of the United Nations, in the interest of maintaining international peace and security and promoting international co-operation and understanding.

 

Article IV

    States Parties to the Treaty undertake not to place in orbit around the earth any objects carrying nuclear weapons or any other kinds of weapons of mass destruction, install such weapons on celestial bodies, or station such weapons in outer space in any other manner.

    The moon and other celestial bodies shall be used by all States Parties to the Treaty exclusively for peaceful purposes. The establishment of military bases, installations and fortifications, the testing of any type of weapons and the conduct of military manoeuvres on celestial bodies shall be forbidden. The use of military personnel for scientific research or for any other peaceful purposes shall not be prohibited. The use of any equipment or facility necessary for peaceful exploration of the moon and other celestial bodies shall also not be prohibited.

Article V

    States Parties to the Treaty shall regard astronauts as envoys of mankind in outer space and shall render to them all possible assistance in the event of accident, distress, or emergency landing on the territory of another State Party or on the high seas. When astronauts make such a landing, they shall be safely and promptly returned to the State of registry of their space vehicle.

    In carrying on activities in outer space and on celestial bodies, the astronauts of one State Party shall render all possible assistance to the astronauts of other States Parties.

    States Parties to the Treaty shall immediately inform the other States Parties to the Treaty or the Secretary-General of the United Nations of any phenomena they discover in outer space, including the moon and other celestial bodies, which could constitute a danger to the life or health of astronauts.

 

Article VI

    States Parties to the Treaty shall bear international responsibility for national activities in outer space, including the moon and other celestial bodies, whether such activities are carried on by governmental agencies or by non-governmental entities, and for assuring that national activities are carried out in conformity with the provisions set forth in the present Treaty. The activities of non-governmental entities in outer space, including the moon and other celestial bodies, shall require authorization and continuing supervision by the appropriate State Party to the Treaty. When activities are carried on in outer space, including the moon and other celestial bodies, by an international organization, responsibility for compliance with this Treaty shall be borne both by the international organization and by the States Parties to the Treaty participating in such organization.

 

Article VII

    Each State Party to the Treaty that launches or procures the launching of an object into outer space, including the moon and other celestial bodies, and each State Party from whose territory or facility an object is launched, is internationally liable for damage to another State Party to the Treaty or to its natural or juridical persons by such object or its component parts on the Earth, in air or in outer space, including the moon and other celestial bodies.

Article VIII

    A State Party to the Treaty on whose registry an object launched into outer space is carried shall retain jurisdiction and control over such object, and over any personnel thereof, while in outer space or on a celestial body. Ownership of objects launched into outer space, including objects landed or constructed on a celestial body, and of their component parts, is not affected by their presence in outer space or on a celestial body or by their return to the Earth. Such objects or component parts found beyond the limits of the State Party to the Treaty on whose registry they are carried shall be returned to that State Party, which shall, upon request, furnish identifying data prior to their return.

 

Article IX

    In the exploration and use of outer space, including the moon and other celestial bodies, States Parties to the Treaty shall be guided by the principle of co-operation and mutual assistance and shall conduct all their activities in outer space, including the moon and other celestial bodies, with due regard to the corresponding interests of all other States Parties to the Treaty. States Parties to the Treaty shall pursue studies of outer space, including the moon and other celestial bodies, and conduct exploration of them so as to avoid their harmful contamination and also adverse changes in the environment of the Earth resulting from the introduction of extraterrestrial matter and, where necessary, shall adopt appropriate measures for this purpose. If a State Party to the Treaty has reason to believe that an activity or experiment planned by it or its nationals in outer space, including the moon and other celestial bodies, would cause potentially harmful interference with activities of other States Parties in the peaceful exploration and use of outer space, including the moon and other celestial bodies, it shall undertake appropriate international consultations before proceeding with any such activity or experiment. A State Party to the Treaty which has reason to believe that an activity or experiment planned by another State Party in outer space, including the moon and other celestial bodies, would cause potentially harmful interference with activities in the peaceful exploration and use of outer space, including the moon and other celestial bodies, may request consultation concerning the activity or experiment.

 

Article X

    In order to promote international co-operation in the exploration and use of outer space, including the moon and other celestial bodies, in conformity with the purposes of this Treaty, the States Parties to the Treaty shall consider on a basis of equality any requests by other States Parties to the Treaty to be afforded an opportunity to observe the flight of space objects launched by those States. The nature of such an opportunity for observation and the conditions under which it could be afforded shall be determined by agreement between the States concerned.

  

Article XI

    In order to promote international co-operation in the peaceful exploration and use of outer space, States Parties to the Treaty conducting activities in outer space, including the moon and other celestial bodies, agree to inform the Secretary-General of the United Nations as well as the public and the international scientific community, to the greatest extent feasible and practicable, of the nature, conduct, locations and results of such activities. On receiving the said information, the Secretary-General of the United Nations should be prepared to disseminate it immediately and effectively.

 

Article XII

    All stations, installations, equipment and space vehicles on the moon and other celestial bodies shall be open to representatives of other States Parties to the Treaty on a basis of reciprocity. Such representatives shall give reasonable advance notice of a projected visit, in order that appropriate consultations may be held and that maximum precautions may be taken to assure safety and to avoid interference with normal operations in the facility to be visited.

 

Article XIII

    The provisions of this Treaty shall apply to the activities of States Parties to the Treaty in the exploration and use of outer space, including the moon and other celestial bodies, whether such activities are carried on by a single State Party to the Treaty or jointly with other States, including cases where they are carried on within the framework of international intergovernmental organizations.

    Any practical questions arising in connection with activities carried on by international intergovernmental organizations in the exploration and use of outer space, including the moon and other celestial bodies, shall be resolved by the States Parties to the Treaty either with the appropriate international organization or with one or more States members of that international organization, which are Parties to this Treaty.

 

Article XIV

1.   This Treaty shall be open to all States for signature. Any State which does not sign this Treaty before its entry into force in accordance with paragraph 3 of this article may accede to it at any time.

2.  This Treaty shall be subject to ratification by signatory States. Instruments of ratification and instruments of accession shall be deposited with the Governments of the United Kingdom of Great Britain and Northern Ireland, the Union of Soviet Socialist Republics and the United States of America, which are hereby designated the Depositary Governments.

3.  This Treaty shall enter into force upon the deposit of instruments of ratification by five Governments including the Governments designated as Depositary Governments under this Treaty.

4.  For States whose instruments of ratification or accession are deposited subsequent to the entry into force of this Treaty, it shall enter into force on the date of the deposit of their instruments of ratification or accession.

5.  The Depositary Governments shall promptly inform all signatory and acceding States of the date of each signature, the date of deposit of each instrument of ratification of and accession to this Treaty, the date of its entry into force and other notices.

6.  This Treaty shall be registered by the Depositary Governments pursuant to Article 102 of the Charter of the United Nations.

  

Article XV

    Any State Party to the Treaty may propose amendments to this Treaty. Amendments shall enter into force for each State Party to the Treaty accepting the amendments upon their acceptance by a majority of the States Parties to the Treaty and thereafter for each remaining State Party to the Treaty on the date of acceptance by it.

 

Article XVI

    Any State Party to the Treaty may give notice of its withdrawal from the Treaty one year after its entry into force by written notification to the Depositary Governments. Such withdrawal shall take effect one year from the date of receipt of this notification.

 

Article XVII

    This Treaty, of which the English, Russian, French, Spanish and Chinese texts are equally authentic, shall be deposited in the archives of the Depositary Governments. Duly certified copies of this Treaty shall be transmitted by the Depositary Governments to the Governments of the signatory and acceding States.

    IN WITNESS WHEREOF the undersigned, duly authorized, have signed this Treaty.

DONE in triplicate, at the cities of London, Moscow and Washington, the twenty-seventh day of January, one thousand nine hundred and sixty-seven.


 

Note

1.    Official Records of the General Assembly, Twenty-first Session, agenda items 30, 89 and 91, document A/6431.

Back


 

 

2222 (XXI). Traité sur les principes régissant les activités des États en matière d'exploration et d'utilisation de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes

L'Assemblée générale,

Ayant examiné le rapport du Comité des utilisations pacifiques de l'espace extra-atmosphérique sur ses travaux de l'année 1966 1, et en particulier l'œuvre accomplie par le Sous-Comité juridique à sa cinquième session, tenue à Genève du 12 juillet au 4 août et à New York du 12 au 16 septembre,

Notant en outre les progrès accomplis grâce à des consultations ultérieures entre les États Membres de l'Organisation des Nations Unies,

Réaffirmant l'importance de la coopération internationale dans le domaine des activités touchant l'exploration et l'utilisation pacifiques de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, et l'importance qu'il y a à promouvoir le règne du droit dans ce nouveau domaine de l'effort humain,

1.     Se félicite du Traité sur les principes régissant les activités des États en matière d'exploration et d'utilisation de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, dont le texte est joint en annexe à la présente résolution;

2.     Prie les gouvernements dépositaires d'ouvrir le Traité aussitôt à la signature et à la ratification;

3.     Exprime l'espoir d'une adhésion aussi large que possible audit Traité;

4.     Prie le Comité des utilisations pacifiques de l'espace extra-atmosphérique:

(a) De poursuivre ses travaux concernant l'élaboration d'un accord sur la responsabilité pour les dommages causés par des objets lancés dans l'espace extra-atmosphérique et d'un accord sur l'assistance aux astronautes et aux véhicules spatiaux, le retour des astronautes et la restitution des véhicules spatiaux, qui sont à l'ordre du jour du Comité;

(b) D'entreprendre en même temps l'étude de questions relatives à la définition de l'espace extra-atmosphérique et des corps célestes, y compris les diverses conséquences des communications spatiales;

(c) De rendre compte de la marche de ses travaux à l'Assemblée générale lors de la vingt-deuxième session.

1499 e séance plénière,
19 décembre 1966

  

  

ANNEXE

Traité sur les principes régissant les activités des États
en matière d'exploration et d'utilisation de l'espace
extra-atmosphérique, y compris la Lune et les
autres corps célestes

 

Les États parties au présent Traité

S'inspirant des vastes perspectives qui s'offrent à l'humanité du fait de la découverte de l'espace extra-atmosphérique par l'homme,

Reconnaissant l'intérêt que présente pour l'humanité tout entière le progrès de l'exploration et de l'utilisation de l'espace extra-atmosphérique à des fins pacifiques,

Estimant que l'exploration et l'utilisation de l'espace extra-atmosphérique devraient s'effectuer pour le bien de tous les peuples, quel que soit le stade de leur
développement économique ou scientifique,

Désireux de contribuer au développement d'une large coopération internationale en ce qui concerne les aspects scientifiques aussi bien que juridiques de l'exploration et de l'utilisation de l'espace extra-atmosphérique à des fins pacifiques,

Estimant que cette coopération contribuera à développer la compréhension mutuelle et à consolider les relations amicales entre les États et entre les peuples,

Rappelant la résolution 1962 (XVIII), intitulée "Déclaration des principes juridiques régissant les activités des États en matière d'exploration et d'utilisation de l'espace extra-atmosphérique", que l'Assemblée générale des Nations Unies a adoptée à l'unanimité le 13 décembre 1963,

Rappelant la résolution 1884 (XVIII), qui engage les États à s'abstenir de mettre sur orbite autour de la Terre tous objets porteurs d'armes nucléaires ou de tout autre type d'armes de destruction massive et d'installer de telles armes sur des corps célestes, résolution que l'Assemblée générale des Nations Unies a adoptée à l'unanimité le 17 octobre 1963,

Tenant compte de la résolution 110 (II) de l'Assemblée générale des Nations Unies en date du 3 novembre 1947, résolution qui condamne la propagande destinée ou de nature à provoquer ou à encourager toute menace à la paix, toute rupture de la paix ou tout acte d'agression, et considérant que ladite résolution est applicable à l'espace extra-atmosphérique,

Convaincus que le Traité sur les principes régissant les activités des États en matière d'exploration et d'utilisation de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, contribuera à la réalisation des buts et principes de la Charte des Nations Unies,

Sont convenus de ce qui suit:

  

Article premier

L'exploration et l'utilisation de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, doivent se faire pour le bien et dans l'intérêt de tous les pays, quel que soit le stade de leur développement économique ou scientifique; elles sont l'apanage de l'humanité tout entière.

L'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, peut être exploré et utilisé librement par tous les États sans aucune discrimination, dans des conditions d'égalité et conformément au droit international, toutes les régions des corps célestes devant être librement accessibles.

Les recherches scientifiques sont libres dans l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, et les États doivent faciliter et encourager la coopération internationale dans ces recherches.

  

Article II

L'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, ne peut faire l'objet d'appropriation nationale par proclamation de souveraineté, ni par voie d'utilisation ou d'occupation, ni par aucun autre moyen.

  

Article III

Les activités des États parties au Traité relatives à l'exploration et à l'utilisation de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, doivent s'effectuer conformément au droit international, y compris la Charte des Nations Unies, en vue de maintenir la paix et la sécurité internationales et de favoriser la coopération et la compréhension internationales.

  

Article IV

Les États parties au Traité s'engagent à ne mettre sur orbite autour de la Terre aucun objet porteur d'armes nucléaires ou de tout autre type d'armes de destruction massive, à ne pas installer de telles armes sur des corps célestes et à ne pas placer de telles armes, de toute autre manière, dans l'espace extra-atmosphérique.

Tous les États parties au Traité utiliseront la Lune et les autres corps célestes exclusivement à des fins pacifiques. Sont interdits sur les corps célestes l'aménagement de bases et installations militaires et de fortifications, les essais d'armes de tous types et l'exécution de manoeuvres militaires. N'est pas interdite l'utilisation de personnel militaire à des fins de recherche scientifique ou à toute autre fin pacifique. N'est pas interdite non plus l'utilisation de tout équipement ou installation nécessaire à l'exploration pacifique de la Lune et des autres corps célestes.

  

Article V

Les États parties au Traité considéreront les astronautes comme des envoyés de l'humanité dans l'espace extra-atmosphérique et leur prêteront toute l'assistance possible en cas d'accident, de détresse ou d'atterrissage forcé sur le territoire d'un autre État partie au Traité ou d'amerrissage en haute mer. En cas d'un tel atterrissage ou amerrissage, le retour des astronautes à l'État d'immatriculation de leur véhicule spatial devra être effectué promptement et en toute sécurité.

Lorsqu'ils poursuivront des activités dans l'espace extra-atmosphérique et sur les corps célestes, les astronautes d'un État partie au Traité prêteront toute l'assistance possible aux astronautes des autres États parties au Traité.

Les États parties au Traité porteront immédiatement à la connaissance des autres États parties au Traité ou du Secrétaire général de l'Organisation des Nations Unies tout phénomène découvert par eux dans l'espace extra-atmosphérique, y compris la Lune et les corps célestes, qui pourrait présenter un danger pour la vie ou la santé des astronautes.

  

Article VI

Les États parties au Traité ont la responsabilité internationale des activités nationales dans l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, qu'elles soient entreprises par des organismes gouvernementaux ou par des entités non gouvernementales, et de veiller à ce que les activités nationales soient poursuivies conformément aux dispositions énoncées dans le présent Traité. Les activités des entités non gouvernementales dans l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, doivent faire l'objet d'une autorisation et d'une surveillance continue de la part de l'État approprié partie au Traité. En cas d'activités poursuivies par une organisation internationale dans l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, la responsabilité du respect des dispositions du présent Traité incombera à cette organisation internationale et aux États parties au Traité qui font partie de ladite organisation.

  

Article VII

Tout État partie au Traité qui procède ou fait procéder au lancement d'un objet dans l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, et tout État partie dont le territoire ou les installations servent au lancement d'un objet, est responsable du point de vue international des dommages causés par ledit objet ou par ses éléments constitutifs, sur la Terre, dans l'atmosphère ou dans l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, à un autre État partie au Traité ou aux personnes physiques ou morales qui relèvent de cet autre État.

  

Article VIII

L'État partie au Traité sur le registre duquel est inscrit un objet lancé dans l'espace extra-atmosphérique conservera sous sa juridiction et son contrôle ledit objet et tout le personnel dudit objet, alors qu'ils se trouvent dans l'espace extra-atmosphérique ou sur un corps céleste. Les droits de propriété sur les objets lancés dans l'espace extra-atmosphérique, y compris les objets amenés ou construits sur un corps céleste, ainsi que sur leurs éléments constitutifs, demeurent entiers lorsque ces objets ou éléments se trouvent dans l'espace extra-atmosphérique ou sur un corps céleste, et lorsqu'ils reviennent sur la Terre. Les objets ou éléments constitutifs d'objets trouvés au-delà des limites de l'État partie au Traité sur le registre duquel ils sont inscrits doivent être restitués à cet État partie au Traité, celui-ci étant tenu de fournir, sur demande, des données d'identification avant la restitution.

  

Article IX

En ce qui concerne l'exploration et l'utilisation de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, les États parties au Traité devront se fonder sur les principes de la coopération et de l'assistance mutuelle et poursuivront toutes leurs activités dans l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, en tenant dûment compte des intérêts correspondants de tous les autres États parties au Traité. Les États parties au Traité effectueront l'étude de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, et procéderont à leur exploration de manière à éviter les effets préjudiciables de leur contamination ainsi que les modifications nocives du milieu terrestre résultant de l'introduction de substances extraterrestres et, en cas de besoin, ils prendront les mesures appropriées à cette fin. Si un État partie au Traité a lieu de croire qu'une activité ou expérience envisagée par lui-même ou par ses ressortissants dans l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, causerait une gêne potentiellement nuisible aux activités d'autres États parties au Traité en matière d'exploration et d'utilisation pacifiques de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, il devra engager les consultations internationales appropriées avant d'entreprendre ladite activité ou expérience. Tout État partie au Traité ayant lieu de croire qu'une activité ou expérience envisagée par un autre État partie au Traité dans l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, causerait une gêne potentiellement nuisible aux activités poursuivies en matière d'exploration et d'utilisation pacifiques de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, peut demander que des consultations soient ouvertes au sujet de ladite activité ou expérience.

  

Article X

Pour favoriser la coopération en matière d'exploration et d'utilisation de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, conformément aux buts du présent Traité, les États parties au Traité examineront dans des conditions d'égalité les demandes des autres États parties au Traité tendant à obtenir des facilités pour l'observation du vol des objets spatiaux lancés par ces États.

La nature de telles facilités d'observation et les conditions dans lesquelles elles pourraient être consenties seront déterminées d'un commun accord par les États intéressés.

  

Article XI

Pour favoriser la coopération internationale en matière d'exploration et d'utilisation pacifiques de l'espace extra-atmosphérique, les États parties au Traité qui mènent des activités dans l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, conviennent, dans toute la mesure où cela est possible et réalisable, d'informer le Secrétaire général de l'Organisation des Nations Unies, ainsi que le public et la communauté scientifique internationale, de la nature et de la conduite de ces activités, des lieux où elles sont poursuivies et de leurs résultats. Le Secrétaire général de l'Organisation des Nations Unies devra être prêt à assurer, aussitôt après les avoir reçus, la diffusion effective de ces renseignements.

  

Article XII

Toutes les stations et installations, tout le matériel et tous les véhicules spatiaux se trouvant sur la Lune ou sur d'autres corps célestes seront accessibles, dans des conditions de réciprocité, aux représentants des autres États parties au Traité. Ces représentants notifieront au préalable toute visite projetée, de façon que les consultations voulues puissent avoir lieu et que le maximum de précautions puissent être prises pour assurer la sécurité et éviter de gêner les opérations normales sur les lieux de l'installation à visiter.

  

Article XIII

Les dispositions du présent Traité s'appliquent aux activités poursuivies par les États parties au Traité en matière d'exploration et d'utilisation de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, que ces activités soient menées par un État partie au Traité seul ou en commun avec d'autres États, notamment dans le cadre d'organisations intergouvernementales internationales.

Toutes questions pratiques se posant à l'occasion des activités poursuivies par des organisations intergouvernementales internationales en matière d'exploration et d'utilisation de l'espace extra-atmosphérique, y compris la Lune et les autres corps célestes, seront réglées par les États parties au Traité soit avec l'organisation internationale compétente, soit avec un ou plusieurs des États membres de ladite organisation qui sont parties au Traité.

  

Article XIV

1. Le présent Traité est ouvert à la signature de tous les États. Tout État qui n'aura pas signé le présent Traité avant son entrée en vigueur conformément au paragraphe 3 du présent article pourra y adhérer à tout moment.

2. Le présent Traité sera soumis à la ratification des États signataires. Les instruments de ratification et les instruments d'adhésion seront déposés auprès des Gouvernements des États-Unis d'Amérique, du Royaume-Uni de Grande-Bretagne et d'Irlande du Nord et de l'Union des Républiques socialistes soviétiques, qui sont, dans le présent Traité, désignés comme étant les gouvernements dépositaires.

3. Le présent Traité entrera en vigueur lorsque cinq gouvernements, y compris ceux qui sont désignés comme étant les gouvernements dépositaires aux termes du présent Traité, auront déposé leurs instruments de ratification.

4. Pour les États dont les instruments de ratification ou d'adhésion seront déposés après l'entrée en vigueur du présent Traité, celui-ci entrera en vigueur à la date du dépôt de leurs instruments de ratification ou d'adhésion.

5. Les gouvernements dépositaires informeront sans délai tous les États qui auront signé le présent Traité ou y auront adhéré de la date de chaque signature, de la date du dépôt de chaque instrument de ratification du présent Traité ou d'adhésion au présent Traité, de la date d'entrée en vigueur du Traité ainsi que de toute autre communication.

6. Le présent Traité sera enregistré par les gouvernements dépositaires conformément à l'Article 102 de la Charte des Nations Unies.

  

Article XV

Tout État partie au présent Traité peut proposer des amendements au Traité. Les amendements prendront effet à l'égard de chaque État partie au Traité acceptant les amendements dès qu'ils auront été acceptés par la majorité des États parties au Traité et, par la suite, pour chacun des autres États parties au Traité, à la date de son acceptation desdits amendements.

  

Article XVI

Tout État partie au présent Traité peut, un an après l'entrée en vigueur du Traité, communiquer son intention de cesser d'y être partie par voie de notification écrite adressée aux gouvernements dépositaires. Cette notification prendra effet un an après la date à laquelle elle aura été reçue.

  

Article XVII

Le présent Traité, dont les textes anglais, chinois, espagnol, français et russe font également foi, sera déposé dans les archives des gouvernements dépositaires. Des copies dûment certifiées du présent Traité seront adressées par les gouvernements dépositaires aux gouvernements des États qui auront signé le Traité ou qui y auront adhéré.

EN FOI DE QUOI les soussignés, dûment habilités à cet effet, ont signé le présent Traité.

FAIT en trois exemplaires, à Londres, Moscou et Washington, le vingt-sept janvier mil neuf cent soixante-sept.


2222 (XXI). Договор о принципах деятельности государств по исследованию и использованию космического пространства, включая Луну и другие небесные тела

 

Государства - участники настоящего Договора,

воодушевленные великими перспективами, открывающимися перед человечеством в результате проникновения человека в космос,

признавая общую заинтересованность всего человечества в прогрессе исследования и использования космического пространства в мирных целях,

полагая, что исследование и использование космического пространства должны быть направлены на благо всех народов, независимо от степени их экономического или научного развития,

желая содействовать развитию широкого международного сотрудничества как в научных, так и в юридических аспектах исследования и использования космического пространства в мирных целях,

полагая, что такое сотрудничество будет содействовать развитию взаимопонимания и укреплению дружественных отношений между государствами и народами,

напоминая резолюцию 1962 (XVIII), озаглавленную "Декларация правовых принципов в деятельности государств по исследованию и использованию космического пространства", единодушно принятую Генеральной Ассамблеей Организации Объединенных Наций 13 декабря 1963 года,

напоминая резолюцию 1884 (XVIII), призывающую государства воздерживаться от вывода на орбиту вокруг Земли любых объектов с ядерным оружием или любыми другими видами оружия массового уничтожения или от установки такого оружия на небесных телах, единодушно принятую Генеральной Ассамблеей Организации Объединенных Наций 17 октября 1963 года,

принимая во внимание резолюцию Генеральной Ассамблеи Организации Объединенных Наций 110 (II) от 3 ноября 1947 года, которая осуждает пропаганду, имеющую целью или способную создать или усилить угрозу миру, нарушение мира или акты агрессии, и считая, что указанная резолюция применима к космическому пространству,

будучи убежденными, что Договор о принципах деятельности государств по исследованию и использованию космического пространства, включая Луну и другие небесные тела, будет способствовать осуществлению целей и принципов Устава Организации Объединенных Наций,

согласились о нижеследующем:

  

Статья I

Исследование и использование космического пространства, включая Луну и другие небесные тела, осуществляются на благо и в интересах всех стран, независимо от степени их экономического или научного развития, и являются достоянием всего человечества.

Космическое пространство, включая Луну и другие небесные тела, открыто для исследования и использования всеми государствами без какой бы то ни было дискриминации на основе равенства и в соответствии с международным правом, при свободном доступе во все районы небесных тел.

Космическое пространство, включая Луну и другие небесные тела, свободно для научных исследований, и государства содействуют и поощряют международное сотрудничество в таких исследованиях.

  

Статья II

Космическое пространство, включая Луну и другие небесные тела, не подлежит национальному присвоению ни путем провозглашения на них суверенитета, ни путем использования или оккупации, ни любыми другими средствами.

  

Статья III

Государства - участники Договора осуществляют деятельность по исследованию и использованию космического пространства, в том числе Луны и других небесных тел, в соответствии с международным правом, включая Устав Организации Объединенных Наций, в интересах поддержания международного мира и безопасности и развития международного сотрудничества и взаимопонимания.

  

Статья IV

Государства - участники Договора обязуются не выводить на орбиту вокруг Земли любые объекты с ядерным оружием или любыми другими видами оружия массового уничтожения, не устанавливать такое оружие на небесных телах и не размещать такое оружие в космическом пространстве каким-либо иным образом.

Луна и другие небесные тела используются всеми государствами - участниками Договора исключительно в мирных целях. Запрещается создание на небесных телах военных баз, сооружений и укреплений, испытание любых типов оружия и проведение военных маневров. Использование военного персонала для научных исследований или каких-либо иных мирных целей не запрещается. Не запрещается также использование любого оборудования или средств, необходимых для мирного исследования Луны и других небесных тел.

  

Статья V

Государства - участники Договора рассматривают космонавтов как посланцев человечества в космос и оказывают им всемерную помощь в случае аварии, бедствия или вынужденной посадки на территории другого государства - участника Договора или в открытом море. Космонавты, которые совершают такую вынужденную посадку, должны быть в безопасности и незамедлительно возвращены государству, в регистр которого занесен их космический корабль.

При осуществлении деятельности в космическом пространстве, в том числе и на небесных телах, космонавты одного государства - участника Договора оказывают возможную помощь космонавтам других государств - участников Договора.

Государства - участники Договора незамедлительно информируют другие государства - участники Договора или Генерального секретаря Организации Объединенных Наций об установленных ими явлениях в космическом пространстве, включая Луну и другие небесные тела, которые могли бы представить опасность для жизни или здоровья космонавтов.

  

Статья VI

Государства - участники Договора несут международную ответственность за национальную деятельность в космическом пространстве, включая Луну и другие небесные тела, независимо от того, осуществляется ли она правительственными органами или неправительственными юридическими лицами, и за обеспечение того, чтобы национальная деятельность проводилась в соответствии с положениями, содержащимися в настоящем Договоре. Деятельность неправительственных юридических лиц в космическом пространстве, включая Луну и другие небесные тела, должна проводиться с разрешения и под постоянным наблюдением соответствующего государства - участника Договора. В случае деятельности в космическом пространстве, включая Луну и другие небесные тела, международной организации, ответственность за выполнение настоящего Договора несут, наряду с международной организацией, также и участвующие в ней государства - участники Договора.

  

Статья VII

Каждое государство - участник Договора, которое осуществляет или организует запуск объекта в космическое пространство, включая Луну и другие небесные тела, а также каждое государство - участник Договора, с территории или установок которого производится запуск объекта, несет международную ответственность за ущерб, причиненный такими объектами или их составными частями на Земле, в воздушном или в космическом пространстве, включая Луну и другие небесные тела, другому государству - участнику Договора, его физическим или юридическим лицам.

  

Статья VIII

Государство - участник Договора, в регистр которого занесен объект, запущенный в космическое пространство, сохраняет юрисдикцию и контроль над таким объектом и над любым экипажем этого объекта во время их нахождения в космическом пространстве, в том числе и на небесном теле. Права собственности на космические объекты, запущенные в космическое пространство, включая объекты, доставленные или сооруженные на небесном теле, и на их составные части остаются незатронутыми во время их нахождения в космическом пространстве или на небесном теле, или по возвращении на Землю. Такие объекты или их составные части, обнаруженные за пределами государства - участника Договора, в регистр которого они занесены, должны быть возвращены этому государству - участнику Договора; при этом такое государство должно по требованию представить до возвращения опознавательные данные.

  

Статья IX

При исследовании и использовании космического пространства, включая Луну и другие небесные тела, государства - участники Договора должны руководствоваться принципом сотрудничества и взаимной помощи и должны осуществлять всю свою деятельность в космическом пространстве, включая Луну и другие небесные тела, с должным учетом соответствующих интересов всех других государств - участников Договора. Государства - участники Договора осуществляют изучение и исследование космического пространства, включая Луну и другие небесные тела, таким образом, чтобы избегать их вредного загрязнения, а также неблагоприятных изменений земной среды вследствие доставки внеземного вещества, и с этой целью, в случае необходимости, принимают соответствующие меры. Если какое-либо государство - участник Договора имеет основания полагать, что деятельность или эксперимент, запланированные этим государством - участником Договора или гражданами этого государства - участника Договора в космическом пространстве, включая Луну и другие небесные тела, создадут потенциально вредные помехи деятельности других государств - участников Договора в деле мирного исследования и использования космического пространства, включая Луну и другие небесные тела, то оно должно провести соответствующие международные консультации, прежде чем приступить к такой деятельности или эксперименту. Государство - участник Договора, имеющее основание полагать, что деятельность или эксперимент, запланированные другим государством - участником Договора в космическом пространстве, включая Луну и другие небесные тела, создадут потенциально вредные помехи деятельности в деле мирного исследования и использования космического пространства, включая Луну и другие небесные тела, может запросить проведения консультаций относительно такой деятельности или эксперимента.

  

Статья X

Для содействия международному сотрудничеству в исследовании и использовании космического пространства, включая Луну и другие небесные тела, в соответствии с целями настоящего Договора, государства - участники Договора будут на равных основаниях рассматривать просьбы других государств - участников Договора о предоставлении им возможности для наблюдения за полетом запускаемых этими государствами космических объектов.

Характер и условия предоставления упомянутой выше возможности определяются по соглашению между заинтересованными государствами.

  

Статья XI

Для содействия международному сотрудничеству в мирном исследовании и использовании космического пространства государства - участники Договора, осуществляющие деятельность в космическом пространстве, включая Луну и другие небесные тела, соглашаются в максимально возможной и практически осуществимой степени информировать Генерального секретаря Организации Объединенных Наций, а также общественность и международное научное сообщество о характере, ходе, местах и результатах такой деятельности. По получении указанной выше информации Генеральный секретарь Организации Объединенных Наций должен быть готов к ее немедленному и эффективному распространению.

  

Статья XII

Все станции, установки, оборудование и космические корабли на Луне и на других небесных телах открыты для представителей других государств - участников настоящего Договора на основе взаимности. Эти представители заблаговременно сообщают о проектируемом посещении, чтобы позволить провести соответствующие консультации и принять меры максимальной предосторожности для обеспечения безопасности и во избежание помех для нормальных операций на установке, подлежащей посещению.

  

Статья XIII

Положения настоящего Договора применяются в отношении деятельности государств - участников Договора по исследованию и использованию космического пространства, включая Луну и другие небесные тела, независимо от того, осуществляется ли такая деятельность одним государством - участником Договора или совместно с другими государствами, в том числе в рамках международных межправительственных организаций.

Практические вопросы, которые могут возникать в связи с осуществлением международными межправительственными организациями деятельности по исследованию и использованию космического пространства, включая Луну и другие небесные тела, решаются государствами - участниками Договора либо с соответствующей международной организацией, либо с одним или несколькими государствами - членами этой международной организации, являющимися участниками настоящего Договора.

  

Статья XIV

1. Настоящий Договор будет открыт для подписания его всеми государствами. Любое государство, которое не подпишет настоящий Договор до вступления его в силу в соответствии с пунктом 3 данной статьи, может присоединиться к нему в любое время.

2. Настоящий Договор подлежит ратификации государствами, подписавшими его. Ратификационные грамоты и документы о присоединении должны быть сданы на хранение правительствам Союза Советских Социалистических Республик, Соединенного Королевства Великобритании и Северной Ирландии и Соединенных Штатов Америки, которые настоящим назначаются в качестве правительств-депозитариев.

3. Настоящий Договор вступает в силу после сдачи на хранение ратификационных грамот пятью правительствами, включая правительства, назначенные в качестве правительств-депозитариев настоящего Договора.

4. Для государств, ратификационные грамоты или документы о присоединении которых будут сданы на хранение после вступления в силу настоящего Договора, он вступит в силу в день сдачи на хранение их ратификационных грамот или документов о присоединении.

5. Правительства-депозитарии незамедлительно уведомляют все подписавшие и присоединившиеся к настоящему Договору государства о дате каждого подписания, о дате сдачи на хранение каждой ратификационной грамоты и документа о присоединении, о дате вступления в силу настоящего Договора, а также о других уведомлениях.

6. Настоящий Договор будет зарегистрирован правительствами-депозитариями в соответствии со статьей 102 Устава Организации Объединенных Наций.

  

Статья XV

Любое государство - участник Договора может предлагать поправки к настоящему Договору. Поправки вступают в силу для каждого государства - участника Договора, принимающего эти поправки, после принятия их большинством государств - участников Договора, а впоследствии для каждого оставшегося государства - участника Договора в день принятия им этих поправок.

  

Статья XVI

Любое государство - участник Договора может уведомить о своем выходе из Договора через год после вступления его в силу путем письменного уведомления правительств-депозитариев. Такой выход приобретает силу по истечении одного года со дня получения этого уведомления.

  

Статья XVII

Настоящий Договор, английский, испанский, китайский, русский и французский тексты которого являются равно аутентичными, будет сдан на хранение в архивы правительств-депозитариев. Должным образом заверенные копии настоящего Договора будут препровождены правительствами-депозитариями правительствам государств, подписавших Договор и присоединившихся к нему.

В УДОСТОВЕРЕНИЕ ЧЕГО нижеподписавшиеся, должным образом на то уполномоченные, подписали настоящий Договор.

СОВЕРШЕНО в трех экземплярах в городах Вашингтоне, О.К., Лондоне и Москве января месяца двадцать седьмого дня тысяча девятьсот шестьдесят седьмого года.


2222 (XXI): Tratado sobre los principios que deben regir las actividades de los Estados en la exploración y utilización del espacio ultraterrestre, incluso la Luna y otros cuerpos celestes

 

Los Estados Partes en este Tratado,

Inspirándose en las grandes perspectivas que se ofrecen a la humanidad como consecuencia de la entrada del hombre en el espacio ultraterrestre,

Reconociendo el interés general de toda la humanidad en el proceso de la exploración y utilización del espacio ultraterrestre con fines pacíficos,

Estimando que la exploración y la utilización del espacio ultraterrestre se debe efectuar en bien de todos los pueblos, sea cual fuere su grado de desarrollo económico y científico,

Deseando contribuir a una amplia cooperación internacional en lo que se refiere a los aspectos científicos y jurídicos de la exploración y utilización del espacio ultraterrestre con fines pacíficos,

Estimando que tal cooperación contribuirá al desarrollo de la comprensión mutua y al afianzamiento de las relaciones amistosas entre los Estados y pueblos,

Recordando la resolución 1962 (XVIII), titulada "Declaración de los principios jurídicos que deben regir las actividades de los Estados en la exploración y utilización del espacio ultraterrestre", que fue aprobada unánimemente por la Asamblea General de las Naciones Unidas el 13 de diciembre de 1963,

Recordando la resolución 1884 (XVIII), en que se insta a los Estados a no poner en órbita alrededor de la Tierra ningún objeto portador de armas nucleares u otras clases de armas de destrucción en masa, ni a emplazar tales armas en los cuerpos celestes, que fue aprobada unánimemente por la Asamblea General de las Naciones Unidas el 17 de octubre de 1963,

Tomando nota de la resolución 110 (II), aprobada por la Asamblea General el 3 de noviembre de 1947, que condena la propaganda destinada a provocar o alentar, o susceptible de provocar o alentar cualquier amenaza de la paz, quebrantamiento de la paz o acto de agresión, y considerando que dicha resolución es aplicable al espacio ultraterrestre,

Convencidos de que un Tratado sobre los principios que deben regir las actividades de los Estados en la exploración y utilización del espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, promoverá los propósitos y principios de la Carta de las Naciones Unidas,

Han convenido en lo siguiente:

  

Artículo I

La exploración y utilización del espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, deberán hacerse en provecho y en interés de todos los países, sea cual fuere su grado de desarrollo económico y científico, e incumben a toda la humanidad.

El espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, estará abierto para su exploración y utilización a todos los Estados sin discriminación alguna en condiciones de igualdad y en conformidad con el derecho internacional, y habrá libertad de acceso a todas las regiones de los cuerpos celestes.

El espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, estarán abiertos a la investigación científica, y los Estados facilitarán y fomentarán la cooperación internacional en dichas investigaciones.

  

Artículo II

El espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, no podrá ser objeto de apropiación nacional por reivindicación de soberanía, uso u ocupación, ni de ninguna otra manera.

  

Artículo III

Los Estados Partes en el Tratado deberán realizar sus actividades de exploración y utilización del espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, de conformidad con el derecho internacional, incluida la Carta de las Naciones Unidas, en interés del mantenimiento de la paz y la seguridad internacionales y del fomento de la cooperación y la comprensión internacionales.

  

Artículo IV

Los Estados Partes en el Tratado se comprometen a no colocar en órbita alrededor de la Tierra ningún objeto portador de armas nucleares ni de ningún otro tipo de armas de destrucción en masa, a no emplazar tales armas en los cuerpos celestes y a no colocar tales armas en el espacio ultraterrestre en ninguna otra forma.

La Luna y los demás cuerpos celestes se utilizarán exclusivamente con fines pacíficos por todos los Estados Partes en el Tratado. Queda prohibido establecer en los cuerpos celestes bases, instalaciones y fortificaciones militares, efectuar ensayos con cualquier tipo de armas y realizar maniobras militares. No se prohíbe la utilización de personal militar para investigaciones científicas ni para cualquier otro objetivo pacífico. Tampoco se prohíbe la utilización de cualquier equipo o medios necesarios para la exploración de la Luna y de otros cuerpos celestes con fines pacíficos.

  

Artículo V

Los Estados Partes en el Tratado considerarán a todos los astronautas como enviados de la humanidad en el espacio ultraterrestre, y les prestarán toda la ayuda posible en caso de accidente, peligro o aterrizaje forzoso en el territorio de otro Estado Parte o en alta mar. Cuando los astronautas hagan tal aterrizaje serán devueltos con seguridad y sin demora al Estado de registro de su vehículo espacial.

Al realizar actividades en el espacio ultraterrestre, así como en los cuerpos celestes, los astronautas de un Estado Parte en el Tratado deberán prestar toda la ayuda posible a los astronautas de los demás Estados Partes en el Tratado.

Los Estados Partes en el Tratado tendrán que informar inmediatamente a los demás Estados Partes en el Tratado o al Secretario General de las Naciones Unidas sobre los fenómenos por ellos observados en el espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, que podrían constituir un peligro para la vida o la salud de los astronautas.

  

Artículo VI

Los Estados Partes en el Tratado serán responsables internacionalmente de las actividades nacionales que realicen en el espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, los organismos gubernamentales o las entidades no gubernamentales, y deberán asegurar que dichas actividades se efectúen en conformidad con las disposiciones del presente Tratado. Las actividades de las entidades no gubernamentales en el espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, deberán ser autorizadas y fiscalizadas constantemente por el pertinente Estado Parte en el Tratado. Cuando se trate de actividades que realiza en el espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, una organización internacional, la responsabilidad en cuanto al presente Tratado corresponderá a esa organización internacional y a los Estados Partes en el Tratado que pertenecen a ella.

  

Artículo VII

Todo Estado Parte en el Tratado que lance o promueva el lanzamiento de un objeto al espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, y todo Estado Parte en el Tratado, desde cuyo territorio o cuyas instalaciones se lance un objeto, será responsable internacionalmente de los daños causados a otro Estado Parte en el Tratado o a sus personas naturales o jurídicas por dicho objeto o sus partes componentes en la Tierra, en el espacio aéreo o en el espacio ultraterrestre, incluso la Luna y otros cuerpos celestes.

  

Artículo VIII

El Estado Parte en el Tratado, en cuyo registro figura el objeto lanzado al espacio ultraterrestre, retendrá su jurisdicción y control sobre tal objeto, así como sobre todo el personal que vaya en él, mientras se encuentre en el espacio ultraterrestre o en un cuerpo celeste. El derecho de propiedad de los objetos lanzados al espacio ultraterrestre, incluso de los objetos que hayan descendido o se construyan en un cuerpo celeste, y de sus partes componentes, no sufrirá ninguna alteración mientras estén en el espacio ultraterrestre, incluso en un cuerpo celeste, ni en su retorno a la Tierra. Cuando esos objetos o esas partes componentes sean hallados fuera de los límites del Estado Parte en el Tratado en cuyo registro figuran, deberán ser devueltos a ese Estado Parte, el que deberá proporcionar los datos de identificación que se le soliciten antes de efectuarse la restitución.

  

Artículo IX

En la exploración y utilización del espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, los Estados Partes en el Tratado deberán guiarse por el principio de la cooperación y la asistencia mutua, y en todas sus actividades en el espacio ultraterrestre, incluso en la Luna y otros cuerpos celestes, deberán tener debidamente en cuenta los intereses correspondientes de los demás Estados Partes en el Tratado. Los Estados Partes en el Tratado harán los estudios e investigaciones del espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, y procederán a su exploración de tal forma que no se produzca una contaminación nociva ni cambios desfavorables en el medio ambiente de la Tierra como consecuencia de la introducción en él de materias extraterrestres, y cuando sea necesario adoptarán las medidas pertinentes a tal efecto. Si un Estado Parte en el Tratado tiene motivos para creer que una actividad o un experimento en el espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, proyectado por él o por sus nacionales, crearía un obstáculo capaz de perjudicar las actividades de otros Estados Partes en el Tratado en la exploración y utilización del espacio ultraterrestre con fines pacíficos, incluso en la Luna y otros cuerpos celestes, deberá celebrar las consultas internacionales oportunas antes de iniciar esa actividad o ese experimento. Si un Estado Parte en el Tratado tiene motivos para creer que una actividad o un experimento en el espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, proyectado por otro Estado Parte en el Tratado, crearía un obstáculo capaz de perjudicar las actividades de exploración y utilización del espacio ultraterrestre con fines pacíficos, incluso en la Luna y otros cuerpos celestes, podrá pedir que se celebren consultas sobre dicha actividad o experimento.

  

Artículo X

A fin de contribuir a la cooperación internacional en la exploración y la utilización del espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, conforme a los objetivos del presente Tratado, los Estados Partes en él examinarán, en condiciones de igualdad, las solicitudes formuladas por otros Estados Partes en el Tratado para que se les brinde la oportunidad a fin de observar el vuelo de los objetos espaciales lanzados por dichos Estados.

La naturaleza de tal oportunidad y las condiciones en que podría ser concedida se determinarán por acuerdo entre los Estados interesados.

  

Artículo XI

A fin de fomentar la cooperación internacional en la exploración y utilización del espacio ultraterrestre con fines pacíficos, los Estados Partes en el Tratado que desarrollan actividades en el espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, convienen en informar, en la mayor medida posible dentro de lo viable y factible, al Secretario General de las Naciones Unidas, así como al público y a la comunidad científica internacional, acerca de la naturaleza, marcha, localización y resultados de dichas actividades. El Secretario General de las Naciones Unidas debe estar en condiciones de difundir eficazmente tal información, inmediatamente después de recibirla.

  

Artículo XII

Todas las estaciones, instalaciones, equipo y vehículos espaciales situados en la Luna y otros cuerpos celestes serán accesibles a los representantes de otros Estados Parte en el presente Tratado, sobre la base de reciprocidad. Dichos representantes notificarán con antelación razonable su intención de hacer una visita, a fin de permitir celebrar las consultas que procedan y adoptar un máximo de precauciones para velar por la seguridad y evitar toda perturbación del funcionamiento normal de la instalación visitada.

  

Artículo XIII

Las disposiciones del presente Tratado se aplicarán a las actividades de exploración y utilización del espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, que realicen los Estados Partes en el Tratado, tanto en el caso de que esas actividades las lleve a cabo un Estado Parte en el Tratado por sí solo o junto con otros Estados, incluso cuando se efectúen dentro del marco de organizaciones intergubernamentales internacionales.

Los Estados Partes en el Tratado resolverán los problemas prácticos que puedan surgir en relación con las actividades que desarrollen las organizaciones intergubernamentales internacionales en la exploración y utilización del espacio ultraterrestre, incluso la Luna y otros cuerpos celestes, con la organización internacional pertinente o con uno o varios Estados miembros de dicha organización internacional que sean Partes en el presente Tratado.

  

Artículo XIV

1. Este Tratado estará abierto a la firma de todos los Estados. El Estado que no firmare este Tratado antes de su entrada en vigor, de conformidad con el párrafo 3 de este artículo, podrá adherirse a él en cualquier momento.

2. Este Tratado estará sujeto a ratificación por los Estados signatarios. Los instrumentos de ratificación y los instrumentos de adhesión se depositarán en los archivos de los Gobiernos de los Estados Unidos de América, del Reino Unido de Gran Bretaña e Irlanda del Norte y de la Unión de Repúblicas Socialistas Soviéticas, a los que por el presente se designa como Gobiernos depositarios.

3. Este Tratado entrará en vigor cuando hayan depositado los instrumentos de ratificación cinco gobiernos, incluidos los designados como Gobiernos depositarios en virtud del presente Tratado.

4. Para los Estados cuyos instrumentos de ratificación o de adhesión se depositaren después de la entrada en vigor de este Tratado, el Tratado entrará en vigor en la fecha del depósito de sus instrumentos de ratificación o adhesión.

5. Los Gobiernos depositarios informarán sin tardanza a todos los Estados signatarios y a todos los Estados que se hayan adherido a este Tratado, de la fecha de cada firma, de la fecha de depósito de cada instrumento de ratificación y de adhesión a este Tratado, de la fecha de su entrada en vigor y de cualquier otra notificación.

6. Este Tratado será registrado por los Gobiernos depositarios, de conformidad con el Artículo 102 de la Carta de las Naciones Unidas.

  

Artículo XV

Cualquier Estado Parte en el Tratado podrá proponer enmiendas al mismo. Las enmiendas entrarán en vigor para cada Estado Parte en el Tratado que las acepte cuando éstas hayan sido aceptadas por la mayoría de los Estados Partes en el Tratado, y en lo sucesivo para cada Estado restante que sea Parte en el Tratado en la fecha en que las acepte.

  

Artículo XVI

Todo Estado Parte podrá comunicar su retiro de este Tratado al cabo de un año de su entrada en vigor, mediante notificación por escrito dirigida a los Gobiernos depositarios. Tal retiro surtirá efecto un año después de la fecha en que se reciba la notificación.

  

Artículo XVII

Este Tratado, cuyos textos en chino, español, francés, inglés y ruso son igualmente auténticos, se depositará en los archivos de los Gobiernos depositarios. Los Gobiernos depositarios remitirán copias debidamente certificadas de este Tratado a los gobiernos de los Estados signatarios y de los Estados que se adhieran al Tratado.

EN TESTIMONIO DE LO CUAL, los infrascritos, debidamente autorizados, firman este Tratado.

HECHO en tres ejemplares, en las ciudades de Londres, Moscú y Washington D.C., el día veintisiete de enero de mil novecientos sesenta y siete.

encoding-0.2.33/src/examples/UTF-8-test.txt01006440000765000002400000047556125331707360016562 0ustar0000000000000000UTF-8 decoder capability and stress test ---------------------------------------- Markus Kuhn - 2003-02-19 This test file can help you examine, how your UTF-8 decoder handles various types of correct, malformed, or otherwise interesting UTF-8 sequences. This file is not meant to be a conformance test. It does not prescribes any particular outcome and therefore there is no way to "pass" or "fail" this test file, even though the texts suggests a preferable decoder behaviour at some places. The aim is instead to help you think about and test the behaviour of your UTF-8 on a systematic collection of unusual inputs. Experience so far suggests that most first-time authors of UTF-8 decoders find at least one serious problem in their decoder by using this file. The test lines below cover boundary conditions, malformed UTF-8 sequences as well as correctly encoded UTF-8 sequences of Unicode code points that should never occur in a correct UTF-8 file. According to ISO 10646-1:2000, sections D.7 and 2.3c, a device receiving UTF-8 shall interpret a "malformed sequence in the same way that it interprets a character that is outside the adopted subset" and "characters that are not within the adopted subset shall be indicated to the user" by a receiving device. A quite commonly used approach in UTF-8 decoders is to replace any malformed UTF-8 sequence by a replacement character (U+FFFD), which looks a bit like an inverted question mark, or a similar symbol. It might be a good idea to visually distinguish a malformed UTF-8 sequence from a correctly encoded Unicode character that is just not available in the current font but otherwise fully legal, even though ISO 10646-1 doesn't mandate this. In any case, just ignoring malformed sequences or unavailable characters does not conform to ISO 10646, will make debugging more difficult, and can lead to user confusion. 
Please check, whether a malformed UTF-8 sequence is (1) represented at all, (2) represented by exactly one single replacement character (or equivalent signal), and (3) the following quotation mark after an illegal UTF-8 sequence is correctly displayed, i.e. proper resynchronization takes place immageately after any malformed sequence. This file says "THE END" in the last line, so if you don't see that, your decoder crashed somehow before, which should always be cause for concern. All lines in this file are exactly 79 characters long (plus the line feed). In addition, all lines end with "|", except for the two test lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls U+0000 and U+007F. If you display this file with a fixed-width font, these "|" characters should all line up in column 79 (right margin). This allows you to test quickly, whether your UTF-8 decoder finds the correct number of characters in every line, that is whether each malformed sequences is replaced by a single replacement character. Note that as an alternative to the notion of malformed sequence used here, it is also a perfectly acceptable (and in some situations even preferable) solution to represent each individual byte of a malformed sequence by a replacement character. If you follow this strategy in your decoder, then please ignore the "|" column. 
Here come the tests: | | 1 Some correct UTF-8 text | | You should see the Greek word 'kosme': "κόσμε" | | 2 Boundary condition test cases | | 2.1 First possible sequence of a certain length | | 2.1.1 1 byte (U-00000000): "" 2.1.2 2 bytes (U-00000080): "€" | 2.1.3 3 bytes (U-00000800): "ࠀ" | 2.1.4 4 bytes (U-00010000): "𐀀" | 2.1.5 5 bytes (U-00200000): "" | 2.1.6 6 bytes (U-04000000): "" | | 2.2 Last possible sequence of a certain length | | 2.2.1 1 byte (U-0000007F): "" 2.2.2 2 bytes (U-000007FF): "߿" | 2.2.3 3 bytes (U-0000FFFF): "￿" | 2.2.4 4 bytes (U-001FFFFF): "" | 2.2.5 5 bytes (U-03FFFFFF): "" | 2.2.6 6 bytes (U-7FFFFFFF): "" | | 2.3 Other boundary conditions | | 2.3.1 U-0000D7FF = ed 9f bf = "퟿" | 2.3.2 U-0000E000 = ee 80 80 = "" | 2.3.3 U-0000FFFD = ef bf bd = "�" | 2.3.4 U-0010FFFF = f4 8f bf bf = "􏿿" | 2.3.5 U-00110000 = f4 90 80 80 = "" | | 3 Malformed sequences | | 3.1 Unexpected continuation bytes | | Each unexpected continuation byte should be separately signalled as a | malformed sequence of its own. 
| | 3.1.1 First continuation byte 0x80: "" | 3.1.2 Last continuation byte 0xbf: "" | | 3.1.3 2 continuation bytes: "" | 3.1.4 3 continuation bytes: "" | 3.1.5 4 continuation bytes: "" | 3.1.6 5 continuation bytes: "" | 3.1.7 6 continuation bytes: "" | 3.1.8 7 continuation bytes: "" | | 3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): | | " | | | " | | 3.2 Lonely start characters | | 3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), | each followed by a space character: | | " | " | | 3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), | each followed by a space character: | | " " | | 3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), | each followed by a space character: | | " " | | 3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), | each followed by a space character: | | " " | | 3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), | each followed by a space character: | | " " | | 3.3 Sequences with last continuation byte missing | | All bytes of an incomplete sequence should be signalled as a single | malformed sequence, i.e., you should see only a single replacement | character in each of the next 10 tests. 
(Characters as in section 2) | | 3.3.1 2-byte sequence with last byte missing (U+0000): "" | 3.3.2 3-byte sequence with last byte missing (U+0000): "" | 3.3.3 4-byte sequence with last byte missing (U+0000): "" | 3.3.4 5-byte sequence with last byte missing (U+0000): "" | 3.3.5 6-byte sequence with last byte missing (U+0000): "" | 3.3.6 2-byte sequence with last byte missing (U-000007FF): "" | 3.3.7 3-byte sequence with last byte missing (U-0000FFFF): "" | 3.3.8 4-byte sequence with last byte missing (U-001FFFFF): "" | 3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): "" | 3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "" | | 3.4 Concatenation of incomplete sequences | | All the 10 sequences of 3.3 concatenated, you should see 10 malformed | sequences being signalled: | | "" | | 3.5 Impossible bytes | | The following two bytes cannot appear in a correct UTF-8 string | | 3.5.1 fe = "" | 3.5.2 ff = "" | 3.5.3 fe fe ff ff = "" | | 4 Overlong sequences | | The following sequences are not malformed according to the letter of | the Unicode 2.0 standard. However, they are longer then necessary and | a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8 | decoder" should reject them just like malformed sequences for two | reasons: (1) It helps to debug applications if overlong sequences are | not treated as valid representations of characters, because this helps | to spot problems more quickly. (2) Overlong sequences provide | alternative representations of characters, that could maliciously be | used to bypass filters that check only for ASCII characters. For | instance, a 2-byte encoded line feed (LF) would not be caught by a | line counter that counts only 0x0a bytes, but it would still be | processed as a line feed by an unsafe UTF-8 decoder later in the | pipeline. 
From a security point of view, ASCII compatibility of UTF-8 | sequences means also, that ASCII characters are *only* allowed to be | represented by ASCII bytes in the range 0x00-0x7f. To ensure this | aspect of ASCII compatibility, use only "safe UTF-8 decoders" that | reject overlong UTF-8 sequences for which a shorter encoding exists. | | 4.1 Examples of an overlong ASCII character | | With a safe UTF-8 decoder, all of the following five overlong | representations of the ASCII character slash ("/") should be rejected | like a malformed UTF-8 sequence, for instance by substituting it with | a replacement character. If you see a slash below, you do not have a | safe UTF-8 decoder! | | 4.1.1 U+002F = c0 af = "" | 4.1.2 U+002F = e0 80 af = "" | 4.1.3 U+002F = f0 80 80 af = "" | 4.1.4 U+002F = f8 80 80 80 af = "" | 4.1.5 U+002F = fc 80 80 80 80 af = "" | | 4.2 Maximum overlong sequences | | Below you see the highest Unicode value that is still resulting in an | overlong sequence if represented with the given number of bytes. This | is a boundary test for safe UTF-8 decoders. All five characters should | be rejected like malformed UTF-8 sequences. | | 4.2.1 U-0000007F = c1 bf = "" | 4.2.2 U-000007FF = e0 9f bf = "" | 4.2.3 U-0000FFFF = f0 8f bf bf = "" | 4.2.4 U-001FFFFF = f8 87 bf bf bf = "" | 4.2.5 U-03FFFFFF = fc 83 bf bf bf bf = "" | | 4.3 Overlong representation of the NUL character | | The following five sequences should also be rejected like malformed | UTF-8 sequences and should not be treated like the ASCII NUL | character. 
| | 4.3.1 U+0000 = c0 80 = "" | 4.3.2 U+0000 = e0 80 80 = "" | 4.3.3 U+0000 = f0 80 80 80 = "" | 4.3.4 U+0000 = f8 80 80 80 80 = "" | 4.3.5 U+0000 = fc 80 80 80 80 80 = "" | | 5 Illegal code positions | | The following UTF-8 sequences should be rejected like malformed | sequences, because they never represent valid ISO 10646 characters and | a UTF-8 decoder that accepts them might introduce security problems | comparable to overlong UTF-8 sequences. | | 5.1 Single UTF-16 surrogates | | 5.1.1 U+D800 = ed a0 80 = "" | 5.1.2 U+DB7F = ed ad bf = "" | 5.1.3 U+DB80 = ed ae 80 = "" | 5.1.4 U+DBFF = ed af bf = "" | 5.1.5 U+DC00 = ed b0 80 = "" | 5.1.6 U+DF80 = ed be 80 = "" | 5.1.7 U+DFFF = ed bf bf = "" | | 5.2 Paired UTF-16 surrogates | | 5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = "" | 5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = "" | 5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = "" | 5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = "" | 5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = "" | 5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = "" | 5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = "" | 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = "" | | 5.3 Other illegal code positions | | 5.3.1 U+FFFE = ef bf be = "￾" | 5.3.2 U+FFFF = ef bf bf = "￿" | | THE END | encoding-0.2.33/src/index/gen_index.py01006440000765000002400000037243127606113120016024 0ustar0000000000000000# This is a part of rust-encoding. # Copyright (c) 2013-2015, Kang Seonghoon. # See README.md and LICENSE.txt for details. 
import urllib import sys import os.path def whatwg_index(name, comments): for line in urllib.urlopen('http://encoding.spec.whatwg.org/index-%s.txt' % name): line = line.strip() if not line: continue if line.startswith('#'): comments.append('//' + line[1:]) continue parts = line.split(None, 2) key = int(parts[0], 0) value = int(parts[1], 0) yield key, value def mkdir_and_open(crate, name): dirname = os.path.join(os.path.dirname(__file__), crate) try: os.mkdir(dirname) except Exception: pass return open(os.path.join(dirname, '%s.rs' % name.replace('-', '_')), 'wb') def write_header(f, name, comments): print >>f, '// AUTOGENERATED FROM index-%s.txt, ORIGINAL COMMENT FOLLOWS:' % name print >>f, '//' for line in comments: print >>f, line def write_comma_separated(f, prefix, l, width=80): buffered = '' for i in l: i = str(i) if len(prefix) + len(buffered) + len(i) <= width: buffered += i else: print >>f, prefix + buffered.rstrip() buffered = i if buffered: print >>f, prefix + buffered.rstrip() def make_minimal_trie(invdata, lowerlimit=0x10000): maxvalue = max(invdata.keys()) + 1 best = 0xffffffff besttrie = None for triebits in xrange(21): lower = [None] * (1<= len(lower) + len(upper): best = len(lower) + len(upper) besttrie = (triebits, lower, upper) return besttrie def generate_single_byte_index(crate, name): modname = name.replace('-', '_') data = [None] * 128 invdata = {} comments = [] for key, value in whatwg_index(name, comments): assert 0 <= key < 128 and 0 <= value < 0xffff and data[key] is None and value not in invdata data[key] = value invdata[value] = key # generate a trie with a minimal amount of data triebits, lower, upper = make_minimal_trie(invdata, lowerlimit=0x10000) with mkdir_and_open(crate, name) as f: write_header(f, name, comments) print >>f print >>f, "static FORWARD_TABLE: &'static [u16] = &[" write_comma_separated(f, ' ', ['%d, ' % (0xffff if value is None else value) for value in data]) print >>f, '];' print >>f print >>f, '/// Returns the index 
code point for pointer `code` in this index.' print >>f, '#[inline]' print >>f, 'pub fn forward(code: u8) -> u16 {' print >>f, ' FORWARD_TABLE[(code - 0x80) as usize]' print >>f, '}' print >>f print >>f, "static BACKWARD_TABLE_LOWER: &'static [u8] = &[" write_comma_separated(f, ' ', ['%d, ' % (0 if v is None else v+0x80) for v in lower]) print >>f, '];' print >>f print >>f, "static BACKWARD_TABLE_UPPER: &'static [u16] = &[" write_comma_separated(f, ' ', ['%d, ' % v for v in upper]) print >>f, '];' print >>f print >>f, '/// Returns the index pointer for code point `code` in this index.' print >>f, '#[inline]' print >>f, 'pub fn backward(code: u32) -> u8 {' print >>f, ' let offset = (code >> %d) as usize;' % triebits print >>f, ' let offset = if offset < %d {BACKWARD_TABLE_UPPER[offset] as usize} else {0};' % len(upper) print >>f, ' BACKWARD_TABLE_LOWER[offset + ((code & %d) as usize)]' % ((1<>f, '}' print >>f print >>f, '#[cfg(test)]' print >>f, 'single_byte_tests!(' print >>f, ' mod = %s' % modname print >>f, ');' return 2 * len(data) + len(lower) + 2 * len(upper) def generate_multi_byte_index(crate, name): modname = name.replace('-', '_') data = {} invdata = {} dups = [] comments = [] morebits = False for key, value in whatwg_index(name, comments): assert 0 <= key < 0xffff and 0 <= value < 0x110000 and value != 0xffff and key not in data if value >= 0x10001: assert (value >> 16) == 2 morebits = True data[key] = value if value not in invdata: invdata[value] = key else: dups.append(key) # Big5 has four two-letter forward mappings, we use special entries for them if name == 'big5': specialidx = [1133, 1135, 1164, 1166] assert all(key not in data for key in specialidx) assert all(value not in invdata for value in xrange(len(specialidx))) for value, key in enumerate(specialidx): data[key] = value dups.append(key) # no consistency testing for them # generate a trie with a minimal amount of data triebits, lower, upper = make_minimal_trie(invdata, lowerlimit=0x10000) # 
JIS X 0208 index has two ranges [8272,8836) and [8836,11280) to support two slightly # different encodings EUC-JP and Shift_JIS; the default backward function would favor # the former, so we need a separate mapping for the latter. # # fortunately for us, all allocated codes in [8272,8836) have counterparts in others, # so we only need a smaller remapping from [8272,8836) to other counterparts. remap = None if name == 'jis0208': REMAP_MIN = 8272 REMAP_MAX = 8835 invdataminusremap = {} for key, value in data.items(): if value not in invdataminusremap and not REMAP_MIN <= key <= REMAP_MAX: invdataminusremap[value] = key remap = [] for i in xrange(REMAP_MIN, REMAP_MAX+1): if i in data: assert data[i] in invdataminusremap value = invdataminusremap[data[i]] assert value < 0x10000 remap.append(value) else: remap.append(0xffff) minkey = min(data) maxkey = max(data) + 1 with mkdir_and_open(crate, name) as f: write_header(f, name, comments) print >>f print >>f, "static FORWARD_TABLE: &'static [u16] = &[" write_comma_separated(f, ' ', ['%d, ' % (data.get(key, 0xffff) & 0xffff) for key in xrange(minkey, maxkey)]) print >>f, '];' if morebits: print >>f print >>f, "static FORWARD_TABLE_MORE: &'static [u32] = &[" bits = [] for i in xrange(minkey, maxkey, 32): v = 0 for j in xrange(32): v |= (data.get(i+j, 0) >= 0x10000) << j bits.append(v) write_comma_separated(f, ' ', ['%d, ' % v for v in bits]) print >>f, '];' print >>f print >>f, '/// Returns the index code point for pointer `code` in this index.' 
print >>f, '#[inline]' print >>f, 'pub fn forward(code: u16) -> u32 {' if minkey != 0: print >>f, ' let code = (code as usize).wrapping_sub(%d);' % minkey else: print >>f, ' let code = code as usize;' print >>f, ' if code < %d {' % (maxkey - minkey) if morebits: print >>f, ' (FORWARD_TABLE[code] as u32) | ' + \ '(((FORWARD_TABLE_MORE[code >> 5] >> (code & 31)) & 1) << 17)' else: print >>f, ' FORWARD_TABLE[code] as u32' print >>f, ' } else {' print >>f, ' 0xffff' print >>f, ' }' print >>f, '}' print >>f print >>f, "static BACKWARD_TABLE_LOWER: &'static [u16] = &[" write_comma_separated(f, ' ', ['%d, ' % (0xffff if v is None else v) for v in lower]) print >>f, '];' print >>f print >>f, "static BACKWARD_TABLE_UPPER: &'static [u16] = &[" write_comma_separated(f, ' ', ['%d, ' % v for v in upper]) print >>f, '];' if remap: print >>f print >>f, "static BACKWARD_TABLE_REMAPPED: &'static [u16] = &[" write_comma_separated(f, ' ', ['%d, ' % v for v in remap]) print >>f, '];' print >>f print >>f, '/// Returns the index pointer for code point `code` in this index.' print >>f, '#[inline]' print >>f, 'pub fn backward(code: u32) -> u16 {' print >>f, ' let offset = (code >> %d) as usize;' % triebits print >>f, ' let offset = if offset < %d {BACKWARD_TABLE_UPPER[offset] as usize} else {0};' % len(upper) print >>f, ' BACKWARD_TABLE_LOWER[offset + ((code & %d) as usize)]' % ((1<>f, '}' if remap: print >>f assert name == 'jis0208' print >>f, '/// Returns the index shift_jis pointer for code point `code`.' 
print >>f, '#[inline]' print >>f, 'pub fn backward_remapped(code: u32) -> u16 {' print >>f, ' let value = backward(code);' print >>f, ' if %d <= value && value <= %d {' % (REMAP_MIN, REMAP_MAX) print >>f, ' BACKWARD_TABLE_REMAPPED[(value - %d) as usize]' % REMAP_MIN print >>f, ' } else {' print >>f, ' value' print >>f, ' }' print >>f, '}' print >>f print >>f, '#[cfg(test)]' print >>f, 'multi_byte_tests!(' print >>f, ' mod = %s,' % modname if remap: print >>f, ' remap = [%d, %d],' % (REMAP_MIN, REMAP_MAX) if dups: print >>f, ' dups = [' write_comma_separated(f, ' ', ['%d, ' % v for v in sorted(dups)]) print >>f, ' ]' else: print >>f, ' dups = []' print >>f, ');' tablesz = 2 * (maxkey - minkey) + 2 * len(lower) + 2 * len(upper) if morebits: tablesz += 4 * ((maxkey - minkey + 31) // 32) if remap: tablesz += 2 * len(remap) return tablesz def generate_multi_byte_range_lbound_index(crate, name): modname = name.replace('-', '_') data = [] comments = [] for key, value in whatwg_index(name, comments): data.append((key, value)) assert data and data == sorted(data) minkey, minvalue = data[0] maxkey, maxvalue = data[-1] if data[0] != (0, 0): data.insert(0, (0, 0)) maxlog2 = 0 while 2**(maxlog2 + 1) <= len(data): maxlog2 += 1 if name == 'gb18030-ranges': keyubound = 0x110000 valueubound = 126 * 10 * 126 * 10 else: keyubound = maxkey + 1 valueubound = maxvalue + 1 with mkdir_and_open(crate, name) as f: write_header(f, name, comments) print >>f print >>f, "static FORWARD_TABLE: &'static [u32] = &[" write_comma_separated(f, ' ', ['%d, ' % value for key, value in data]) print >>f, '];' print >>f print >>f, "static BACKWARD_TABLE: &'static [u32] = &[" write_comma_separated(f, ' ', ['%d, ' % key for key, value in data]) print >>f, '];' print >>f print >>f, '/// Returns the index code point for pointer `code` in this index.' 
print >>f, '#[inline]' print >>f, 'pub fn forward(code: u32) -> u32 {' if minkey > 0: print >>f, ' if code < %d { return 0xffffffff; }' % minkey if name == 'gb18030-ranges': # has "invalid" region inside print >>f, ' if (code > 39419 && code < 189000) || code > 1237575 { return 0xffffffff; }' print >>f, ' let mut i = if code >= BACKWARD_TABLE[%d] {%d} else {0};' % \ (2**maxlog2 - 1, len(data) - 2**maxlog2 + 1) for i in xrange(maxlog2-1, -1, -1): print >>f, ' if code >= BACKWARD_TABLE[i%s] { i += %d; }' % \ ('+%d' % (2**i-1) if i > 0 else '', 2**i) print >>f, ' (code - BACKWARD_TABLE[i-1]) + FORWARD_TABLE[i-1]' print >>f, '}' print >>f print >>f, '/// Returns the index pointer for code point `code` in this index.' print >>f, '#[inline]' print >>f, 'pub fn backward(code: u32) -> u32 {' if minvalue > 0: print >>f, ' if code < %d { return 0xffffffff; }' % minvalue print >>f, ' let mut i = if code >= FORWARD_TABLE[%d] {%d} else {0};' % \ (2**maxlog2 - 1, len(data) - 2**maxlog2 + 1) for i in xrange(maxlog2-1, -1, -1): print >>f, ' if code >= FORWARD_TABLE[i%s] { i += %d; }' % \ ('+%d' % (2**i-1) if i > 0 else '', 2**i) print >>f, ' (code - FORWARD_TABLE[i-1]) + BACKWARD_TABLE[i-1]' print >>f, '}' print >>f print >>f, '#[cfg(test)]' print >>f, 'multi_byte_range_tests!(' print >>f, ' mod = %s,' % modname print >>f, ' key = [%d, %d], key < %d,' % (minkey, maxkey, keyubound) print >>f, ' value = [%d, %d], value < %d' % (minvalue, maxvalue, valueubound) print >>f, ');' return 8 * len(data) INDICES = { 'singlebyte/ibm866': generate_single_byte_index, 'singlebyte/iso-8859-2': generate_single_byte_index, 'singlebyte/iso-8859-3': generate_single_byte_index, 'singlebyte/iso-8859-4': generate_single_byte_index, 'singlebyte/iso-8859-5': generate_single_byte_index, 'singlebyte/iso-8859-6': generate_single_byte_index, 'singlebyte/iso-8859-7': generate_single_byte_index, 'singlebyte/iso-8859-8': generate_single_byte_index, 'singlebyte/iso-8859-10': generate_single_byte_index, 
'singlebyte/iso-8859-13': generate_single_byte_index, 'singlebyte/iso-8859-14': generate_single_byte_index, 'singlebyte/iso-8859-15': generate_single_byte_index, 'singlebyte/iso-8859-16': generate_single_byte_index, 'singlebyte/koi8-r': generate_single_byte_index, 'singlebyte/koi8-u': generate_single_byte_index, 'singlebyte/macintosh': generate_single_byte_index, 'singlebyte/windows-874': generate_single_byte_index, 'singlebyte/windows-1250': generate_single_byte_index, 'singlebyte/windows-1251': generate_single_byte_index, 'singlebyte/windows-1252': generate_single_byte_index, 'singlebyte/windows-1253': generate_single_byte_index, 'singlebyte/windows-1254': generate_single_byte_index, 'singlebyte/windows-1255': generate_single_byte_index, 'singlebyte/windows-1256': generate_single_byte_index, 'singlebyte/windows-1257': generate_single_byte_index, 'singlebyte/windows-1258': generate_single_byte_index, 'singlebyte/x-mac-cyrillic': generate_single_byte_index, 'tradchinese/big5': generate_multi_byte_index, 'korean/euc-kr': generate_multi_byte_index, 'simpchinese/gb18030': generate_multi_byte_index, 'japanese/jis0208': generate_multi_byte_index, 'japanese/jis0212': generate_multi_byte_index, 'simpchinese/gb18030-ranges': generate_multi_byte_range_lbound_index, } if __name__ == '__main__': import sys filter = sys.argv[1] if len(sys.argv) > 1 else '' for index, generate in INDICES.items(): crate, _, index = index.partition('/') if filter not in index: continue print >>sys.stderr, 'generating index %s...' % index, tablesz = generate(crate, index) print >>sys.stderr, '%d bytes.' % tablesz encoding-0.2.33/src/label.rs01006440000765000002400000026113127606113120014022 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. //! An interface for retrieving an encoding (or a set of encodings) from a string/numeric label. 
use all; use types::EncodingRef; /// Returns an encoding from given label, defined in the WHATWG Encoding standard, if any. /// Implements "get an encoding" algorithm: http://encoding.spec.whatwg.org/#concept-encoding-get pub fn encoding_from_whatwg_label(label: &str) -> Option { let label = label.trim_matches(&[' ', '\n', '\r', '\t', '\x0C'][..]); let label: String = label.chars().map(|c| match c { 'A'...'Z' => (c as u8 + 32) as char, _ => c }).collect(); match &label[..] { "unicode-1-1-utf-8" | "utf-8" | "utf8" => Some(all::UTF_8 as EncodingRef), "866" | "cp866" | "csibm866" | "ibm866" => Some(all::IBM866 as EncodingRef), "csisolatin2" | "iso-8859-2" | "iso-ir-101" | "iso8859-2" | "iso88592" | "iso_8859-2" | "iso_8859-2:1987" | "l2" | "latin2" => Some(all::ISO_8859_2 as EncodingRef), "csisolatin3" | "iso-8859-3" | "iso-ir-109" | "iso8859-3" | "iso88593" | "iso_8859-3" | "iso_8859-3:1988" | "l3" | "latin3" => Some(all::ISO_8859_3 as EncodingRef), "csisolatin4" | "iso-8859-4" | "iso-ir-110" | "iso8859-4" | "iso88594" | "iso_8859-4" | "iso_8859-4:1988" | "l4" | "latin4" => Some(all::ISO_8859_4 as EncodingRef), "csisolatincyrillic" | "cyrillic" | "iso-8859-5" | "iso-ir-144" | "iso8859-5" | "iso88595" | "iso_8859-5" | "iso_8859-5:1988" => Some(all::ISO_8859_5 as EncodingRef), "arabic" | "asmo-708" | "csiso88596e" | "csiso88596i" | "csisolatinarabic" | "ecma-114" | "iso-8859-6" | "iso-8859-6-e" | "iso-8859-6-i" | "iso-ir-127" | "iso8859-6" | "iso88596" | "iso_8859-6" | "iso_8859-6:1987" => Some(all::ISO_8859_6 as EncodingRef), "csisolatingreek" | "ecma-118" | "elot_928" | "greek" | "greek8" | "iso-8859-7" | "iso-ir-126" | "iso8859-7" | "iso88597" | "iso_8859-7" | "iso_8859-7:1987" | "sun_eu_greek" => Some(all::ISO_8859_7 as EncodingRef), "csiso88598e" | "csisolatinhebrew" | "hebrew" | "iso-8859-8" | "iso-8859-8-e" | "iso-ir-138" | "iso8859-8" | "iso88598" | "iso_8859-8" | "iso_8859-8:1988" | "visual" => Some(all::ISO_8859_8 as EncodingRef), "csiso88598i" | 
"iso-8859-8-i" | "logical" => Some(all::whatwg::ISO_8859_8_I as EncodingRef), "csisolatin6" | "iso-8859-10" | "iso-ir-157" | "iso8859-10" | "iso885910" | "l6" | "latin6" => Some(all::ISO_8859_10 as EncodingRef), "iso-8859-13" | "iso8859-13" | "iso885913" => Some(all::ISO_8859_13 as EncodingRef), "iso-8859-14" | "iso8859-14" | "iso885914" => Some(all::ISO_8859_14 as EncodingRef), "csisolatin9" | "iso-8859-15" | "iso8859-15" | "iso885915" | "iso_8859-15" | "l9" => Some(all::ISO_8859_15 as EncodingRef), "iso-8859-16" => Some(all::ISO_8859_16 as EncodingRef), "cskoi8r" | "koi" | "koi8" | "koi8-r" | "koi8_r" => Some(all::KOI8_R as EncodingRef), "koi8-u" => Some(all::KOI8_U as EncodingRef), "csmacintosh" | "mac" | "macintosh" | "x-mac-roman" => Some(all::MAC_ROMAN as EncodingRef), "dos-874" | "iso-8859-11" | "iso8859-11" | "iso885911" | "tis-620" | "windows-874" => Some(all::WINDOWS_874 as EncodingRef), "cp1250" | "windows-1250" | "x-cp1250" => Some(all::WINDOWS_1250 as EncodingRef), "cp1251" | "windows-1251" | "x-cp1251" => Some(all::WINDOWS_1251 as EncodingRef), "ansi_x3.4-1968" | "ascii" | "cp1252" | "cp819" | "csisolatin1" | "ibm819" | "iso-8859-1" | "iso-ir-100" | "iso8859-1" | "iso88591" | "iso_8859-1" | "iso_8859-1:1987" | "l1" | "latin1" | "us-ascii" | "windows-1252" | "x-cp1252" => Some(all::WINDOWS_1252 as EncodingRef), "cp1253" | "windows-1253" | "x-cp1253" => Some(all::WINDOWS_1253 as EncodingRef), "cp1254" | "csisolatin5" | "iso-8859-9" | "iso-ir-148" | "iso8859-9" | "iso88599" | "iso_8859-9" | "iso_8859-9:1989" | "l5" | "latin5" | "windows-1254" | "x-cp1254" => Some(all::WINDOWS_1254 as EncodingRef), "cp1255" | "windows-1255" | "x-cp1255" => Some(all::WINDOWS_1255 as EncodingRef), "cp1256" | "windows-1256" | "x-cp1256" => Some(all::WINDOWS_1256 as EncodingRef), "cp1257" | "windows-1257" | "x-cp1257" => Some(all::WINDOWS_1257 as EncodingRef), "cp1258" | "windows-1258" | "x-cp1258" => Some(all::WINDOWS_1258 as EncodingRef), "x-mac-cyrillic" | 
"x-mac-ukrainian" => Some(all::MAC_CYRILLIC as EncodingRef), "chinese" | "csgb2312" | "csiso58gb231280" | "gb2312" | "gb_2312" | "gb_2312-80" | "gbk" | "iso-ir-58" | "x-gbk" => Some(all::GBK as EncodingRef), "gb18030" => Some(all::GB18030 as EncodingRef), "big5" | "big5-hkscs" | "cn-big5" | "csbig5" | "x-x-big5" => Some(all::BIG5_2003 as EncodingRef), "cseucpkdfmtjapanese" | "euc-jp" | "x-euc-jp" => Some(all::EUC_JP as EncodingRef), "csiso2022jp" | "iso-2022-jp" => Some(all::ISO_2022_JP as EncodingRef), "csshiftjis" | "ms_kanji" | "shift-jis" | "shift_jis" | "sjis" | "windows-31j" | "x-sjis" => Some(all::WINDOWS_31J as EncodingRef), "cseuckr" | "csksc56011987" | "euc-kr" | "iso-ir-149" | "korean" | "ks_c_5601-1987" | "ks_c_5601-1989" | "ksc5601" | "ksc_5601" | "windows-949" => Some(all::WINDOWS_949 as EncodingRef), "csiso2022kr" | "hz-gb-2312" | "iso-2022-kr" | "iso-2022-cn" | "iso-2022-cn-ext" => Some(all::whatwg::REPLACEMENT as EncodingRef), "utf-16be" => Some(all::UTF_16BE as EncodingRef), "utf-16" | "utf-16le" => Some(all::UTF_16LE as EncodingRef), "x-user-defined" => Some(all::whatwg::X_USER_DEFINED as EncodingRef), _ => None } } /// Returns an encoding from Windows code page number. /// http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx /// Sometimes it can return a *superset* of the requested encoding, e.g. for several CJK encodings. 
pub fn encoding_from_windows_code_page(cp: usize) -> Option<EncodingRef> {
    // Maps a Windows code page identifier to the matching encoding object;
    // identifiers that are not listed yield `None`.
    match cp {
        65001 => Some(all::UTF_8 as EncodingRef),
        866 => Some(all::IBM866 as EncodingRef),
        28591 => Some(all::ISO_8859_1 as EncodingRef),
        28592 => Some(all::ISO_8859_2 as EncodingRef),
        28593 => Some(all::ISO_8859_3 as EncodingRef),
        28594 => Some(all::ISO_8859_4 as EncodingRef),
        28595 => Some(all::ISO_8859_5 as EncodingRef),
        28596 => Some(all::ISO_8859_6 as EncodingRef),
        28597 => Some(all::ISO_8859_7 as EncodingRef),
        28598 => Some(all::ISO_8859_8 as EncodingRef),
        38598 => Some(all::whatwg::ISO_8859_8_I as EncodingRef),
        28603 => Some(all::ISO_8859_13 as EncodingRef),
        28605 => Some(all::ISO_8859_15 as EncodingRef),
        20866 => Some(all::KOI8_R as EncodingRef),
        21866 => Some(all::KOI8_U as EncodingRef),
        10000 => Some(all::MAC_ROMAN as EncodingRef),
        874 => Some(all::WINDOWS_874 as EncodingRef),
        1250 => Some(all::WINDOWS_1250 as EncodingRef),
        1251 => Some(all::WINDOWS_1251 as EncodingRef),
        1252 => Some(all::WINDOWS_1252 as EncodingRef),
        1253 => Some(all::WINDOWS_1253 as EncodingRef),
        1254 => Some(all::WINDOWS_1254 as EncodingRef),
        1255 => Some(all::WINDOWS_1255 as EncodingRef),
        1256 => Some(all::WINDOWS_1256 as EncodingRef),
        1257 => Some(all::WINDOWS_1257 as EncodingRef),
        1258 => Some(all::WINDOWS_1258 as EncodingRef),
        // 10007 is the identifier Microsoft lists for x-mac-cyrillic;
        // 1259 is kept so that existing callers keep getting the same answer.
        1259 | 10007 => Some(all::MAC_CYRILLIC as EncodingRef),
        936 | 54936 => Some(all::GB18030 as EncodingRef), // XXX technically wrong
        52936 => Some(all::HZ as EncodingRef),
        950 => Some(all::BIG5_2003 as EncodingRef),
        20932 => Some(all::EUC_JP as EncodingRef),
        50220 => Some(all::ISO_2022_JP as EncodingRef),
        932 => Some(all::WINDOWS_31J as EncodingRef),
        949 => Some(all::WINDOWS_949 as EncodingRef),
        1201 => Some(all::UTF_16BE as EncodingRef),
        1200 => Some(all::UTF_16LE as EncodingRef),
        _ => None
    }
}

#[cfg(test)]
mod tests {
    extern crate test;
    use all;
    use super::encoding_from_whatwg_label;

    #[test]
    fn test_encoding_from_whatwg_label() {
        assert!(encoding_from_whatwg_label("utf-8").is_some());
assert!(encoding_from_whatwg_label("UTF-8").is_some()); assert!(encoding_from_whatwg_label("\t\n\x0C\r utf-8\t\n\x0C\r ").is_some()); assert!(encoding_from_whatwg_label("\u{A0}utf-8").is_none(), "Non-ASCII whitespace should not be trimmed"); assert!(encoding_from_whatwg_label("greek").is_some()); assert!(encoding_from_whatwg_label("gree\u{212A}").is_none(), "Case-insensitive matching should be ASCII only. Kelvin sign does not match k."); // checks if the `whatwg_name` method returns the label that resolves back to that encoding for encoding in all::encodings() { if let Some(whatwg_name) = encoding.whatwg_name() { if whatwg_name == "replacement" { continue; } assert_eq!(encoding_from_whatwg_label(whatwg_name).and_then(|e| e.whatwg_name()), Some(whatwg_name)); } } } #[bench] fn bench_encoding_from_whatwg_label(bencher: &mut test::Bencher) { bencher.iter(|| test::black_box({ encoding_from_whatwg_label("iso-8859-bazinga") })) } } encoding-0.2.33/src/lib.rs01006440000765000002400000024233127606122770013525 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. //! # Encoding 0.2.33 //! //! Character encoding support for Rust. (also known as `rust-encoding`) //! It is based on [WHATWG Encoding Standard](http://encoding.spec.whatwg.org/), //! and also provides an advanced interface for error detection and recovery. //! //! ## Usage //! //! Put this in your `Cargo.toml`: //! //! ```toml //! [dependencies] //! encoding = "0.2" //! ``` //! //! Then put this in your crate root: //! //! ```rust //! extern crate encoding; //! ``` //! //! ## Overview //! //! To encode a string: //! //! ~~~~ {.rust} //! use encoding::{Encoding, EncoderTrap}; //! use encoding::all::ISO_8859_1; //! //! assert_eq!(ISO_8859_1.encode("caf\u{e9}", EncoderTrap::Strict), //! Ok(vec![99,97,102,233])); //! ~~~~ //! //! To encode a string with unrepresentable characters: //! //! ~~~~ {.rust} //! 
use encoding::{Encoding, EncoderTrap}; //! use encoding::all::ISO_8859_2; //! //! assert!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::Strict).is_err()); //! assert_eq!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::Replace), //! Ok(vec![65,99,109,101,63])); //! assert_eq!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::Ignore), //! Ok(vec![65,99,109,101])); //! assert_eq!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::NcrEscape), //! Ok(vec![65,99,109,101,38,35,49,54,57,59])); //! ~~~~ //! //! To decode a byte sequence: //! //! ~~~~ {.rust} //! use encoding::{Encoding, DecoderTrap}; //! use encoding::all::ISO_8859_1; //! //! assert_eq!(ISO_8859_1.decode(&[99,97,102,233], DecoderTrap::Strict), //! Ok("caf\u{e9}".to_string())); //! ~~~~ //! //! To decode a byte sequence with invalid sequences: //! //! ~~~~ {.rust} //! use encoding::{Encoding, DecoderTrap}; //! use encoding::all::ISO_8859_6; //! //! assert!(ISO_8859_6.decode(&[65,99,109,101,169], DecoderTrap::Strict).is_err()); //! assert_eq!(ISO_8859_6.decode(&[65,99,109,101,169], DecoderTrap::Replace), //! Ok("Acme\u{fffd}".to_string())); //! assert_eq!(ISO_8859_6.decode(&[65,99,109,101,169], DecoderTrap::Ignore), //! Ok("Acme".to_string())); //! ~~~~ //! //! To encode or decode the input into the already allocated buffer: //! //! ~~~~ {.rust} //! use encoding::{Encoding, EncoderTrap, DecoderTrap}; //! use encoding::all::{ISO_8859_2, ISO_8859_6}; //! //! let mut bytes = Vec::new(); //! let mut chars = String::new(); //! //! assert!(ISO_8859_2.encode_to("Acme\u{a9}", EncoderTrap::Ignore, &mut bytes).is_ok()); //! assert!(ISO_8859_6.decode_to(&[65,99,109,101,169], DecoderTrap::Replace, &mut chars).is_ok()); //! //! assert_eq!(bytes, [65,99,109,101]); //! assert_eq!(chars, "Acme\u{fffd}"); //! ~~~~ //! //! A practical example of custom encoder traps: //! //! ~~~~ {.rust} //! use encoding::{Encoding, ByteWriter, EncoderTrap, DecoderTrap}; //! use encoding::types::RawEncoder; //! use encoding::all::ASCII; //! //! 
// hexadecimal numeric character reference replacement //! fn hex_ncr_escape(_encoder: &mut RawEncoder, input: &str, output: &mut ByteWriter) -> bool { //! let escapes: Vec<String> = //! input.chars().map(|ch| format!("&#x{:x};", ch as isize)).collect(); //! let escapes = escapes.concat(); //! output.write_bytes(escapes.as_bytes()); //! true //! } //! static HEX_NCR_ESCAPE: EncoderTrap = EncoderTrap::Call(hex_ncr_escape); //! //! let orig = "Hello, 世界!".to_string(); //! let encoded = ASCII.encode(&orig, HEX_NCR_ESCAPE).unwrap(); //! assert_eq!(ASCII.decode(&encoded, DecoderTrap::Strict), //! Ok("Hello, &#x4e16;&#x754c;!".to_string())); //! ~~~~ //! //! Getting the encoding from the string label, as specified in WHATWG Encoding standard: //! //! ~~~~ {.rust} //! use encoding::{Encoding, DecoderTrap}; //! use encoding::label::encoding_from_whatwg_label; //! use encoding::all::WINDOWS_949; //! //! let euckr = encoding_from_whatwg_label("euc-kr").unwrap(); //! assert_eq!(euckr.name(), "windows-949"); //! assert_eq!(euckr.whatwg_name(), Some("euc-kr")); // for the sake of compatibility //! let broken = &[0xbf, 0xec, 0xbf, 0xcd, 0xff, 0xbe, 0xd3]; //! assert_eq!(euckr.decode(broken, DecoderTrap::Replace), //! Ok("\u{c6b0}\u{c640}\u{fffd}\u{c559}".to_string())); //! //! // corresponding Encoding native API: //! assert_eq!(WINDOWS_949.decode(broken, DecoderTrap::Replace), //! Ok("\u{c6b0}\u{c640}\u{fffd}\u{c559}".to_string())); //! ~~~~ //! //! ## Types and Stuffs //! //! There are three main entry points to Encoding. //! //! **`Encoding`** is a single character encoding. //! It contains `encode` and `decode` methods for converting `String` to `Vec<u8>` and vice versa. //! For the error handling, they receive **traps** (`EncoderTrap` and `DecoderTrap` respectively) //! which replace any error with some string (e.g. `U+FFFD`) or sequence (e.g. `?`). //! You can also use `EncoderTrap::Strict` and `DecoderTrap::Strict` traps to stop on an error. //! //! There are two ways to get `Encoding`: //! //! 
* `encoding::all` has static items for every supported encoding. //! You should use them when the encoding would not change or only a handful of them are required. //! Combined with link-time optimization, any unused encoding would be discarded from the binary. //! //! * `encoding::label` has functions to dynamically get an encoding from given string ("label"). //! They will return a static reference to the encoding, //! whose type is also known as `EncodingRef`. //! It is useful when a list of required encodings is not available in advance, //! but it will result in the larger binary and missed optimization opportunities. //! //! **`RawEncoder`** is an experimental incremental encoder. //! At each step of `raw_feed`, it receives a slice of string //! and emits any encoded bytes to a generic `ByteWriter` (normally `Vec<u8>`). //! It will stop at the first error if any, and would return a `CodecError` struct in that case. //! The caller is responsible for calling `raw_finish` at the end of encoding process. //! //! **`RawDecoder`** is an experimental incremental decoder. //! At each step of `raw_feed`, it receives a slice of byte sequence //! and emits any decoded characters to a generic `StringWriter` (normally `String`). //! Otherwise it is identical to `RawEncoder`s. //! //! One should prefer `Encoding::{encode,decode}` as a primary interface. //! `RawEncoder` and `RawDecoder` are experimental and can change substantially. //! See the additional documents on `encoding::types` module for more information on them. //! //! ## Supported Encodings //! //! Encoding covers all encodings specified by WHATWG Encoding Standard and some more: //! //! * 7-bit strict ASCII (`ascii`) //! * UTF-8 (`utf-8`) //! * UTF-16 in little endian (`utf-16` or `utf-16le`) and big endian (`utf-16be`) //! * All single byte encodings in WHATWG Encoding Standard: //! * IBM code page 866 //! * ISO 8859-{2,3,4,5,6,7,8,10,13,14,15,16} //! * KOI8-R, KOI8-U //! 
* MacRoman (`macintosh`), Macintosh Cyrillic encoding (`x-mac-cyrillic`) //! * Windows code pages 874, 1250, 1251, 1252 (instead of ISO 8859-1), 1253, //! 1254 (instead of ISO 8859-9), 1255, 1256, 1257, 1258 //! * All multi byte encodings in WHATWG Encoding Standard: //! * Windows code page 949 (`euc-kr`, since the strict EUC-KR is hardly used) //! * EUC-JP and Windows code page 932 (`shift_jis`, //! since it's the most widespread extension to Shift_JIS) //! * ISO-2022-JP with asymmetric JIS X 0212 support //! (Note: this is not yet up to date with the current standard) //! * GBK //! * GB 18030 //! * Big5-2003 with HKSCS-2008 extensions //! * Encodings that were originally specified by WHATWG Encoding Standard: //! * HZ //! * ISO 8859-1 (distinct from Windows code page 1252) //! //! Parenthesized names refer to the encoding's primary name assigned by WHATWG Encoding Standard. //! //! Many legacy character encodings lack the proper specification, //! and even those that have a specification are highly dependent on the actual implementation. //! Consequently one should be careful when picking a desired character encoding. //! The only standards reliable in this regard are WHATWG Encoding Standard and //! [vendor-provided mappings from the Unicode consortium](http://www.unicode.org/Public/MAPPINGS/). //! Whenever in doubt, look at the source code and specifications for detailed explanations.
#![cfg_attr(test, feature(test))] // lib stability features as per RFC #507 extern crate encoding_index_singlebyte as index_singlebyte; extern crate encoding_index_korean as index_korean; extern crate encoding_index_japanese as index_japanese; extern crate encoding_index_simpchinese as index_simpchinese; extern crate encoding_index_tradchinese as index_tradchinese; #[cfg(test)] extern crate test; pub use self::types::{CodecError, ByteWriter, StringWriter, RawEncoder, RawDecoder, EncodingRef, Encoding, EncoderTrapFunc, DecoderTrapFunc, DecoderTrap, EncoderTrap, decode}; // reexport #[macro_use] mod util; #[cfg(test)] #[macro_use] mod testutils; pub mod types; /// Codec implementations. pub mod codec { pub mod error; pub mod ascii; pub mod singlebyte; pub mod utf_8; pub mod utf_16; pub mod korean; pub mod japanese; pub mod simpchinese; pub mod tradchinese; pub mod whatwg; } pub mod all; pub mod label; #[cfg(test)] mod tests { use super::*; #[test] fn test_decode() { fn test_one(input: &[u8], expected_result: &str, expected_encoding: &str) { let (result, used_encoding) = decode( input, DecoderTrap::Strict, all::ISO_8859_1 as EncodingRef); let result = result.unwrap(); assert_eq!(used_encoding.name(), expected_encoding); assert_eq!(&result[..], expected_result); } test_one(&[0xEF, 0xBB, 0xBF, 0xC3, 0xA9], "é", "utf-8"); test_one(&[0xC3, 0xA9], "é", "iso-8859-1"); test_one(&[0xFE, 0xFF, 0x00, 0xE9], "é", "utf-16be"); test_one(&[0x00, 0xE9], "\x00é", "iso-8859-1"); test_one(&[0xFF, 0xFE, 0xE9, 0x00], "é", "utf-16le"); test_one(&[0xE9, 0x00], "é\x00", "iso-8859-1"); } } encoding-0.2.33/src/testutils.rs01006440000765000002400000035646125331707360015026 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. //! Macros and utilities for testing. 
use std::borrow::ToOwned;
use types::{RawDecoder, RawEncoder};

/// The expected and the actual outcome of a single `raw_feed` or `raw_finish` call.
pub struct TestResult<'a, Output: 'a + ?Sized + ToOwned> {
    /// The expected first offset and the expected `upto` of the error, if any.
    pub expected_return: (usize, Option<isize>),
    /// The output the call is expected to push.
    pub expected_push: &'a Output,
    /// The first offset and the `upto` of the error actually returned.
    pub actual_return: (usize, Option<isize>),
    /// The output the call actually pushed.
    pub actual_push: Output::Owned,
}

/// A common interface for driving an encoder or a decoder from the assertion macros below.
/// Every method performs one call and reports the expectation next to the actual result.
pub trait Testable {
    type Input: ?Sized;
    type Output: ?Sized + ToOwned;

    /// Feeds `processed` followed by `unprocessed`; no error is expected
    /// and the first offset is expected to be the length of `processed`.
    fn process_feed_ok<'a>(&mut self, processed: &Self::Input, unprocessed: &Self::Input,
                           output: &'a Self::Output) -> TestResult<'a, Self::Output>;
    /// Feeds `processed`, `problem` and `remaining` joined together, minus the first
    /// `-backup` units; an error ending right after `problem` is expected.
    fn process_feed_err<'a>(&mut self, backup: isize,
                            processed: &Self::Input, problem: &Self::Input,
                            remaining: &Self::Input,
                            output: &'a Self::Output) -> TestResult<'a, Self::Output>;
    /// Finishes the stream; no error is expected.
    fn process_finish_ok<'a>(&mut self, output: &'a Self::Output) -> TestResult<'a, Self::Output>;
    /// Finishes the stream; an error whose `upto` equals `backup` is expected.
    fn process_finish_err<'a>(&mut self, backup: isize,
                              output: &'a Self::Output) -> TestResult<'a, Self::Output>;
}

impl Testable for RawDecoder {
    type Input = [u8];
    type Output = str;

    fn process_feed_ok<'a>(&mut self, processed: &[u8], unprocessed: &[u8],
                           output: &'a str) -> TestResult<'a, str> {
        let mut input = Vec::with_capacity(processed.len() + unprocessed.len());
        input.extend(processed.iter().cloned());
        input.extend(unprocessed.iter().cloned());

        let mut buf = String::new();
        let (nprocessed, err) = self.raw_feed(&input, &mut buf);
        TestResult {
            expected_return: (processed.len(), None),
            expected_push: output,
            actual_return: (nprocessed, err.map(|e| e.upto)),
            actual_push: buf,
        }
    }

    fn process_feed_err<'a>(&mut self, backup: isize,
                            processed: &[u8], problem: &[u8], remaining: &[u8],
                            output: &'a str) -> TestResult<'a, str> {
        let mut input = Vec::with_capacity(processed.len() + problem.len() + remaining.len());
        input.extend(processed.iter().cloned());
        input.extend(problem.iter().cloned());
        input.extend(remaining.iter().cloned());

        let mut buf = String::new();
        // `-backup as usize` is `(-backup) as usize`, so `backup` has to be zero or negative
        let (nprocessed, err) = self.raw_feed(&input[-backup as usize..], &mut buf);
        TestResult {
            expected_return: (processed.len(),
                              Some(processed.len() as isize + problem.len() as isize + backup)),
            expected_push: output,
            actual_return: (nprocessed, err.map(|e| e.upto)),
            actual_push: buf,
        }
    }

    fn process_finish_ok<'a>(&mut self, output: &'a str) -> TestResult<'a, str> {
        let mut buf = String::new();
        let err = self.raw_finish(&mut buf);
        TestResult {
            expected_return: (0, None),
            expected_push: output,
            actual_return: (0, err.map(|e| e.upto)),
            actual_push: buf,
        }
    }

    fn process_finish_err<'a>(&mut self, backup: isize, output: &'a str) -> TestResult<'a, str> {
        let mut buf = String::new();
        let err = self.raw_finish(&mut buf);
        TestResult {
            expected_return: (0, Some(backup)),
            expected_push: output,
            actual_return: (0, err.map(|e| e.upto)),
            actual_push: buf,
        }
    }
}

impl Testable for RawEncoder {
    type Input = str;
    type Output = [u8];

    fn process_feed_ok<'a>(&mut self, processed: &str, unprocessed: &str,
                           output: &'a [u8]) -> TestResult<'a, [u8]> {
        let mut input = String::with_capacity(processed.len() + unprocessed.len());
        input.push_str(processed);
        input.push_str(unprocessed);

        let mut buf = Vec::new();
        let (nprocessed, err) = self.raw_feed(&input, &mut buf);
        TestResult {
            expected_return: (processed.len(), None),
            expected_push: output,
            actual_return: (nprocessed, err.map(|e| e.upto)),
            actual_push: buf,
        }
    }

    fn process_feed_err<'a>(&mut self, backup: isize,
                            processed: &str, problem: &str, remaining: &str,
                            output: &'a [u8]) -> TestResult<'a, [u8]> {
        let mut input = String::with_capacity(processed.len() + problem.len() + remaining.len());
        input.push_str(processed);
        input.push_str(problem);
        input.push_str(remaining);

        let mut buf = Vec::new();
        // `-backup as usize` is `(-backup) as usize`, so `backup` has to be zero or negative
        let (nprocessed, err) = self.raw_feed(&input[-backup as usize..], &mut buf);
        TestResult {
            expected_return: (processed.len(),
                              Some(processed.len() as isize + problem.len() as isize + backup)),
            expected_push: output,
            actual_return: (nprocessed, err.map(|e| e.upto)),
            actual_push: buf,
        }
    }

    fn process_finish_ok<'a>(&mut self, output: &'a [u8]) -> TestResult<'a, [u8]> {
        let mut buf = Vec::new();
        let err = self.raw_finish(&mut buf);
        TestResult {
            expected_return: (0, None),
            expected_push: output,
            actual_return: (0, err.map(|e| e.upto)),
            actual_push: buf,
        }
    }

    fn process_finish_err<'a>(&mut self, backup: isize, output: &'a [u8]) -> TestResult<'a, [u8]> {
        let mut buf = Vec::new();
        let err = self.raw_finish(&mut buf);
        TestResult {
            expected_return: (0, Some(backup)),
            expected_push: output,
            actual_return: (0, err.map(|e| e.upto)),
            actual_push: buf,
        }
    }
}

// Checks a `TestResult`: the return values first, then the pushed output.
// `$filter` picks the part of the return value that is shown in the failure message.
macro_rules! assert_expected {
    ($result:expr, $func:expr, $filter:expr) => ({
        use testutils::Testable;
        match $result {
            result => {
                assert!(result.expected_return == result.actual_return,
                        "{} should return {:?}, but instead returned {:?}",
                        $func, $filter(result.expected_return), $filter(result.actual_return));
                assert!(&result.expected_push[..] == &result.actual_push[..],
                        "{} should push {:?}, but instead pushed {:?}",
                        $func, result.expected_push, result.actual_push);
            }
        }
    });
}

macro_rules! assert_feed_ok {
    ($this:expr, $processed:expr, $unprocessed:expr, $output:expr) => (
        assert_expected!($this.process_feed_ok(&$processed, &$unprocessed, &$output),
                         "raw_feed", |r| r)
    );
}

macro_rules! assert_feed_err {
    ($this:expr, $backup:expr, $processed:expr, $problem:expr, $remaining:expr, $output:expr) => (
        assert_expected!($this.process_feed_err($backup, &$processed, &$problem,
                                                &$remaining, &$output),
                         "raw_feed", |r| r)
    );
    ($this:expr, $processed:expr, $problem:expr, $remaining:expr, $output:expr) => (
        assert_feed_err!($this, 0, $processed, $problem, $remaining, $output)
    );
}

// For `raw_finish` the first element of the pair is always zero,
// so the messages show the second element, which carries the error offset.
macro_rules! assert_finish_ok {
    ($this:expr, $output:expr) => (
        assert_expected!($this.process_finish_ok(&$output),
                         "raw_finish", |r: (usize, Option<isize>)| r.1)
    );
}

macro_rules! assert_finish_err {
    ($this:expr, $backup:expr, $output:expr) => (
        assert_expected!($this.process_finish_err($backup, &$output),
                         "raw_finish", |r: (usize, Option<isize>)| r.1)
    );
    ($this:expr, $output:expr) => (
        assert_finish_err!($this, 0, $output)
    );
}

/// Some ASCII-only text to test.
// // the first paragraphs of the article "English Language" from English Wikipedia. // https://en.wikipedia.org/w/index.php?title=English_language&oldid=608500518 pub static ASCII_TEXT: &'static str = "English is a West Germanic language that was first spoken in early medieval England \ and is now a global lingua franca. It is spoken as a first language by \ the majority populations of several sovereign states, including the United Kingdom, \ the United States, Canada, Australia, Ireland, New Zealand and a number of Caribbean nations; \ and it is an official language of almost 60 sovereign states. It is the third-most-common \ native language in the world, after Mandarin Chinese and Spanish. It is widely learned as \ a second language and is an official language of the European Union, many Commonwealth \ countries and the United Nations, as well as in many world organisations."; /// Some Korean text to test. // // the first paragraphs of the article "Korean Language" from Korean Wikipedia. // https://ko.wikipedia.org/w/index.php?title=%ED%95%9C%EA%B5%AD%EC%96%B4&oldid=12331875 pub static KOREAN_TEXT: &'static str = "한국어(韓國語)는 주로 한반도(韓半島)와 한민족(韓民族) 거주 지역에서 쓰이는 언어로, \ 대한민국에서는 한국어, 한국말이라고 부르고, 조선민주주의인민공화국과 중국, 일본에서는 \ 조선어(朝鮮語), 조선말이라고 불린다. 우즈베키스탄, 러시아 등 구 소련의 고려인들 사이에서는 \ 고려말(高麗語)로 불린다. 19세기 중반 이후 한반도와 주변 정세의 혼란, 20세기 전반 \ 일본 제국주의의 침략, 20세기 후반 대한민국의 해외 이민에 의해 중국 동북 지방, 일본, \ 러시아 연해주와 사할린, 우즈베키스탄, 미국, 캐나다, 오스트레일리아, 필리핀, 베트남, 브라질 등 \ 세계 곳곳에 한민족이 이주하면서 한국어가 쓰이고 있다. 한국어 쓰는 인구는 전 세계를 통틀어 \ 약 8천250만 명으로 추산된다."; /// Some Japanese text to test. // // the first paragraphs of the article "Japanese Language" from Japanese Wikipedia. 
// https://ja.wikipedia.org/w/index.php?title=%E6%97%A5%E6%9C%AC%E8%AA%9E&oldid=51443986 pub static JAPANESE_TEXT: &'static str = "日本語(にほんご、にっぽんご)とは、主に日本国内や日本人同士の間で使われている言語である。\ 日本は法令によって「公用語」を規定していないが、法令その他の公用文は日本語で記述され、\ 各種法令(裁判所法第74条、会社計算規則第57条、特許法施行規則第2条など)において\ 日本語を用いることが定められるなど事実上の公用語となっており、学校教育の「国語」でも\ 教えられる。使用人口について正確な統計はないが、日本国内の人口、および日本国外に住む\ 日本人や日系人、日本がかつて統治した地域の一部の住民など、約1億3千万人以上と考えられる。\ 統計によって前後する可能性はあるが、この数は世界の母語話者数で上位10位以内に入る人数である。"; /// Some simplified Chinese text to test. // // the first paragraphs of the article "Chinese Language" from Chinese Wikipedia. // https://zh.wikipedia.org/w/index.php?title=%E6%B1%89%E8%AF%AD&variant=zh-cn&oldid=31224104 pub static SIMPLIFIED_CHINESE_TEXT: &'static str = "汉语,又称中文、华语(东南亚)、国语(中华民国国语)、中国语(日本、韩国等),\ 其他名称有汉文(通常指文言文)、华文、唐文、唐话、中国话等,是属汉藏语系的分析语,具有声调。\ 汉语的文字系统——汉字是一种意音文字,表意的同时也具一定的表音功能。\ 汉语包含书面语以及口语两部分,古代书面汉语称为文言文,现代书面汉语一般指使用现代标准汉语语法,\ 词汇的中文通行文体。目前全球有六分之一人口使用汉语作为母语。现代汉语书面语高度统一,\ 口语则有官话、粤语、吴语、湘语、赣语、客家语、闽语等七种主要汉语言\ (也有人认为晋语和(或)徽语和(或)平话(广西平话)也应为独立汉语言,\ 也有其他人认为闽语其实是一个语族,下辖闽南语、闽东语、闽中语以及莆仙语,\ 国际标准化组织即持此观点,部分资料将其中的一至六种也算成单独的汉语言,\ 这就是八至十三种汉语言的由来)。"; /// Some traditional Chinese text to test. // // the first paragraphs of the article "Chinese Language" from Chinese Wikipedia. // https://zh.wikipedia.org/w/index.php?title=%E6%B1%89%E8%AF%AD&variant=zh-tw&oldid=31224104 pub static TRADITIONAL_CHINESE_TEXT: &'static str = "漢語,又稱中文、華語(東南亞)、國語(中華民國國語)、中國語(日本、韓國等),\ 其他名稱有漢文(通常指文言文)、華文、唐文、唐話、中國話等,是屬漢藏語系的分析語,具有聲調。\ 漢語的文字系統——漢字是一種意音文字,表意的同時也具一定的表音功能。\ 漢語包含書面語以及口語兩部分,古代書面漢語稱為文言文,現代書面漢語一般指使用現代標準漢語語法,\ 詞彙的中文通行文體。目前全球有六分之一人口使用漢語作為母語。現代漢語書面語高度統一,\ 口語則有官話、粵語、吳語、湘語、贛語、客家語、閩語等七種主要漢語言\ (也有人認為晉語和(或)徽語和(或)平話(廣西平話)也應為獨立漢語言,\ 也有其他人認為閩語其實是一個語族,下轄閩南語、閩東語、閩中語以及莆仙語,\ 國際標準化組織即持此觀點,部分資料將其中的一至六種也算成單獨的漢語言,\ 這就是八至十三種漢語言的由來)。"; /// Some text with various invalid UTF-8 sequences. // // Markus Kuhn's UTF-8 decoder capability and stress test. 
// http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt pub static INVALID_UTF8_TEXT: &'static [u8] = include_bytes!("examples/UTF-8-test.txt"); /// Returns a longer text used for external data benchmarks. /// This can be overriden with an environment variable `EXTERNAL_BENCH_DATA`, /// or it will use a built-in sample data (of about 100KB). pub fn get_external_bench_data() -> Vec { use std::env; use std::io::Read; use std::fs::File; use std::path::Path; // An HTML file derived from the Outer Space Treaty of 1967, in six available languages. // http://www.unoosa.org/oosa/SpaceLaw/outerspt.html static LONGER_TEXT: &'static [u8] = include_bytes!("examples/outer-space-treaty.html"); match env::var("EXTERNAL_BENCH_DATA") { Ok(path) => { let path = Path::new(&path); let mut file = File::open(&path).ok().expect("cannot read an external bench data"); let mut ret = Vec::new(); file.read_to_end(&mut ret).ok().expect("cannot read an external bench data"); ret } Err(..) => { LONGER_TEXT.to_vec() } } } encoding-0.2.33/src/types.rs01006440000765000002400000051752127606113120014116 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. /*! * Interface to the character encoding. * * # Raw incremental interface * * Methods which name starts with `raw_` constitute the raw incremental interface, * the lowest-available API for encoders and decoders. * This interface divides the entire input to four parts: * * - **Processed** bytes do not affect the future result. * - **Unprocessed** bytes may affect the future result * and can be a part of problematic sequence according to the future input. * - **Problematic** byte is the first byte that causes an error condition. * - **Remaining** bytes are not yet processed nor read, * so the caller should feed any remaining bytes again. 
* * The following figure illustrates an example of successive `raw_feed` calls: * * ````notrust * 1st raw_feed :2nd raw_feed :3rd raw_feed * ----------+----:---------------:--+--+--------- * | : : | | * ----------+----:---------------:--+--+--------- * processed unprocessed | remaining * problematic * ```` * * Since these parts can span the multiple input sequences to `raw_feed`, * `raw_feed` returns two offsets (one optional) * with that the caller can track the problematic sequence. * The first offset (the first `usize` in the tuple) points to the first unprocessed bytes, * or is zero when unprocessed bytes have started before the current call. * (The first unprocessed byte can also be at offset 0, * which doesn't make a difference for the caller.) * The second offset (`upto` field in the `CodecError` struct), if any, * points to the first remaining bytes. * * If the caller needs to recover the error via the problematic sequence, * then the caller starts to save the unprocessed bytes when the first offset < the input length, * appends any new unprocessed bytes while the first offset is zero, * and discards unprocessed bytes when first offset becomes non-zero * while saving new unprocessed bytes when the first offset < the input length. * Then the caller checks for the error condition * and can use the saved unprocessed bytes for error recovery. * Alternatively, if the caller only wants to replace the problematic sequence * with a fixed string (like U+FFFD), * then it can just discard the first sequence and can emit the fixed string on an error. * It still has to feed the input bytes starting at the second offset again. */ use std::borrow::Cow; /// Error information from either encoder or decoder. pub struct CodecError { /// The byte position of the first remaining byte, with respect to the *current* input. /// For the `finish` call, this should be no more than zero (since there is no input). 
/// It can be negative if the remaining byte is in the prior inputs, /// as long as the remaining byte is not yet processed. /// The caller should feed the bytes starting from this point again /// in order to continue encoding or decoding after an error. pub upto: isize, /// A human-readable cause of the error. pub cause: Cow<'static, str>, } /// Byte writer used by encoders. In most cases this will be an owned vector of `u8`. pub trait ByteWriter { /// Hints an expected lower bound on the length (in bytes) of the output /// until the next call to `writer_hint`, /// so that the writer can reserve the memory for writing. /// `RawEncoder`s are recommended but not required to call this method /// with an appropriate estimate. /// By default this method does nothing. fn writer_hint(&mut self, _expectedlen: usize) {} /// Writes a single byte. fn write_byte(&mut self, b: u8); /// Writes a number of bytes. fn write_bytes(&mut self, v: &[u8]); } impl ByteWriter for Vec { fn writer_hint(&mut self, expectedlen: usize) { self.reserve(expectedlen); } fn write_byte(&mut self, b: u8) { self.push(b); } fn write_bytes(&mut self, v: &[u8]) { self.extend(v.iter().cloned()); } } /// String writer used by decoders. In most cases this will be an owned string. pub trait StringWriter { /// Hints an expected lower bound on the length (in bytes) of the output /// until the next call to `writer_hint`, /// so that the writer can reserve the memory for writing. /// `RawDecoder`s are recommended but not required to call this method /// with an appropriate estimate. /// By default this method does nothing. fn writer_hint(&mut self, _expectedlen: usize) {} /// Writes a single character. fn write_char(&mut self, c: char); /// Writes a string. 
fn write_str(&mut self, s: &str);
}

impl StringWriter for String {
    /// Reserves room for at least `expectedlen` more bytes of output.
    fn writer_hint(&mut self, expectedlen: usize) {
        // `String::reserve` takes the *additional* capacity beyond the current length,
        // so the current length must not be added to the hint (doing so over-reserves
        // by `self.len()` bytes on every call).
        self.reserve(expectedlen);
    }

    fn write_char(&mut self, c: char) {
        self.push(c);
    }

    fn write_str(&mut self, s: &str) {
        self.push_str(s);
    }
}

/// Encoder converting a Unicode string into a byte sequence.
/// This is a lower level interface, and normally `Encoding::encode` should be used instead.
pub trait RawEncoder: 'static {
    /// Creates a fresh `RawEncoder` instance whose parameters are the same as `self`.
    fn from_self(&self) -> Box<RawEncoder>;

    /// Returns true if this encoding is compatible to ASCII,
    /// i.e. U+0000 through U+007F always map to bytes 00 through 7F and nothing else.
    fn is_ascii_compatible(&self) -> bool { false }

    /// Feeds given portion of string to the encoder,
    /// pushes an encoded byte sequence at the end of the given output,
    /// and returns a byte offset to the first unprocessed character
    /// (that can be zero when the first such character appeared in the prior calls to `raw_feed`)
    /// and optional error information (None means success).
    fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>);

    /// Finishes the encoder,
    /// pushes an encoded byte sequence at the end of the given output,
    /// and returns optional error information (None means success).
    /// The `upto` offset of the error information, if any, is never positive,
    /// since `raw_finish` takes no input.
    fn raw_finish(&mut self, output: &mut ByteWriter) -> Option<CodecError>;
}

/// Decoder converting a byte sequence into a Unicode string.
/// This is a lower level interface, and normally `Encoding::decode` should be used instead.
pub trait RawDecoder: 'static {
    /// Creates a fresh `RawDecoder` instance whose parameters are the same as `self`.
    fn from_self(&self) -> Box<RawDecoder>;

    /// Returns true if this encoding is compatible to ASCII,
    /// i.e. bytes 00 through 7F always map to U+0000 through U+007F and nothing else.
fn is_ascii_compatible(&self) -> bool { false } /// Feeds given portion of byte sequence to the encoder, /// pushes the a decoded string at the end of the given output, /// and returns an offset to the first unprocessed byte /// (that can be zero when the first such byte appeared in the prior calls to `raw_feed`) /// and optional error information (None means success). fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option); /// Finishes the decoder, /// pushes the a decoded string at the end of the given output, /// and returns optional error information (None means success). fn raw_finish(&mut self, output: &mut StringWriter) -> Option; } /// A trait object using dynamic dispatch which is a sendable reference to the encoding, /// for code where the encoding is not known at compile-time. pub type EncodingRef = &'static (Encoding + Send + Sync); /// Character encoding. pub trait Encoding { /// Returns the canonical name of given encoding. /// This name is guaranteed to be unique across built-in encodings, /// but it is not normative and would be at most arbitrary. fn name(&self) -> &'static str; /// Returns a name of given encoding defined in the WHATWG Encoding standard, if any. /// This name often differs from `name` due to the compatibility reason. fn whatwg_name(&self) -> Option<&'static str> { None } /// Creates a new encoder. fn raw_encoder(&self) -> Box; /// Creates a new decoder. fn raw_decoder(&self) -> Box; /// An easy-to-use interface to `RawEncoder`. /// On the encoder error `trap` is called, /// which may return a replacement sequence to continue processing, /// or a failure to return the error. fn encode(&self, input: &str, trap: EncoderTrap) -> Result, Cow<'static, str>> { let mut ret = Vec::new(); self.encode_to(input, trap, &mut ret).map(|_| ret) } /// Encode into a `ByteWriter`. 
fn encode_to(&self, input: &str, trap: EncoderTrap, ret: &mut ByteWriter)
        -> Result<(), Cow<'static, str>>
    {
        // we don't need to keep `unprocessed` here;
        // `raw_feed` should process as much input as possible.
        let mut encoder = self.raw_encoder();
        // Offset into `input` of the first byte that still has to be fed.
        let mut remaining = 0;

        loop {
            let (offset, err) = encoder.raw_feed(&input[remaining..], ret);
            // `offset` is relative to the slice just fed; make it absolute.
            let unprocessed = remaining + offset;
            match err {
                Some(err) => {
                    // `upto` is relative to the slice just fed as well.
                    remaining = (remaining as isize + err.upto) as usize;
                    if !trap.trap(&mut *encoder, &input[unprocessed..remaining], ret) {
                        return Err(err.cause);
                    }
                }
                None => {
                    // Everything was consumed; flush the encoder.
                    remaining = input.len();
                    if let Some(err) = encoder.raw_finish(ret) {
                        // `upto` is at most zero here, so this may move `remaining` back.
                        remaining = (remaining as isize + err.upto) as usize;
                        if !trap.trap(&mut *encoder, &input[unprocessed..remaining], ret) {
                            return Err(err.cause);
                        }
                    }
                    if remaining >= input.len() {
                        return Ok(());
                    }
                }
            }
        }
    }

    /// An easy-to-use interface to `RawDecoder`.
    /// On the decoder error `trap` is called,
    /// which may return a replacement string to continue processing,
    /// or a failure to return the error.
    fn decode(&self, input: &[u8], trap: DecoderTrap) -> Result<String, Cow<'static, str>> {
        let mut ret = String::new();
        // On success hand back the filled string; on failure propagate the error cause.
        self.decode_to(input, trap, &mut ret).map(|_| ret)
    }

    /// Decode into a `StringWriter`.
    ///
    /// This does *not* handle partial characters at the beginning or end of `input`!
    /// Use `RawDecoder` for incremental decoding.
    fn decode_to(&self, input: &[u8], trap: DecoderTrap, ret: &mut StringWriter)
        -> Result<(), Cow<'static, str>>
    {
        // we don't need to keep `unprocessed` here;
        // `raw_feed` should process as much input as possible.
let mut decoder = self.raw_decoder();
        // Offset into `input` of the first byte that still has to be fed.
        let mut remaining = 0;

        loop {
            let (offset, err) = decoder.raw_feed(&input[remaining..], ret);
            // `offset` is relative to the slice just fed; make it absolute.
            let unprocessed = remaining + offset;
            match err {
                Some(err) => {
                    // `upto` is relative to the slice just fed as well.
                    remaining = (remaining as isize + err.upto) as usize;
                    if !trap.trap(&mut *decoder, &input[unprocessed..remaining], ret) {
                        return Err(err.cause);
                    }
                }
                None => {
                    // Everything was consumed; flush the decoder.
                    remaining = input.len();
                    if let Some(err) = decoder.raw_finish(ret) {
                        remaining = (remaining as isize + err.upto) as usize;
                        if !trap.trap(&mut *decoder, &input[unprocessed..remaining], ret) {
                            return Err(err.cause);
                        }
                    }
                    if remaining >= input.len() {
                        return Ok(());
                    }
                }
            }
        }
    }
}

/// A type of the bare function in `EncoderTrap` values.
pub type EncoderTrapFunc =
    extern "Rust" fn(encoder: &mut RawEncoder, input: &str, output: &mut ByteWriter) -> bool;

/// A type of the bare function in `DecoderTrap` values.
pub type DecoderTrapFunc =
    extern "Rust" fn(decoder: &mut RawDecoder, input: &[u8], output: &mut StringWriter) -> bool;

/// Trap, which handles decoder errors.
#[derive(Copy)]
pub enum DecoderTrap {
    /// Immediately fails on errors.
    /// Corresponds to WHATWG "fatal" error algorithm.
    Strict,
    /// Replaces an error with a U+FFFD (decoder).
    /// Corresponds to WHATWG "replacement" error algorithm.
    Replace,
    /// Silently ignores an error, effectively replacing it with an empty sequence.
    Ignore,
    /// Calls given function to handle decoder errors.
    /// The function is given the current decoder, input and output writer,
    /// and should return true only when it is fine to keep going.
    Call(DecoderTrapFunc),
}

impl DecoderTrap {
    /// Handles a decoder error. May write to the output writer.
    /// Returns true only when it is fine to keep going.
pub fn trap(&self, decoder: &mut RawDecoder, input: &[u8], output: &mut StringWriter) -> bool {
        match *self {
            DecoderTrap::Strict => false,
            DecoderTrap::Replace => { output.write_char('\u{fffd}'); true },
            DecoderTrap::Ignore => true,
            DecoderTrap::Call(func) => func(decoder, input, output),
        }
    }
}

impl Clone for DecoderTrap {
    fn clone(&self) -> DecoderTrap {
        // `DecoderTrap` is `Copy`, so cloning is a plain bitwise copy.
        *self
    }
}

/// Trap, which handles encoder errors.
#[derive(Copy)]
pub enum EncoderTrap {
    /// Immediately fails on errors.
    /// Corresponds to WHATWG "fatal" error algorithm.
    Strict,
    /// Replaces an error with `?` in given encoding.
    /// Note that this fails when `?` cannot be represented in given encoding.
    /// Corresponds to WHATWG "URL" error algorithms.
    Replace,
    /// Silently ignores an error, effectively replacing it with an empty sequence.
    Ignore,
    /// Replaces an error with XML numeric character references (e.g. `&#1234;`).
    /// The encoder trap fails when NCRs cannot be represented in given encoding.
    /// Corresponds to WHATWG "<form>" error algorithms.
    NcrEscape,
    /// Calls given function to handle encoder errors.
    /// The function is given the current encoder, input and output writer,
    /// and should return true only when it is fine to keep going.
    Call(EncoderTrapFunc),
}

impl EncoderTrap {
    /// Handles an encoder error. May write to the output writer.
    /// Returns true only when it is fine to keep going.
    ///
    /// # Panics
    ///
    /// `Replace` and `NcrEscape` panic when the replacement text itself
    /// cannot be encoded by `encoder`.
    pub fn trap(&self, encoder: &mut RawEncoder, input: &str, output: &mut ByteWriter) -> bool {
        /// Writes the (ASCII-only) replacement `input` through `encoder`; always returns true.
        fn reencode(encoder: &mut RawEncoder, input: &str, output: &mut ByteWriter,
                    trapname: &str) -> bool {
            if encoder.is_ascii_compatible() { // optimization!
                // The replacement is pure ASCII, which such an encoder maps byte-for-byte.
                output.write_bytes(input.as_bytes());
            } else {
                let (_, err) = encoder.raw_feed(input, output);
                if err.is_some() {
                    panic!("{} cannot reencode a replacement string", trapname);
                }
            }
            true
        }

        match *self {
            EncoderTrap::Strict => false,
            EncoderTrap::Replace => reencode(encoder, "?", output, "Replace"),
            EncoderTrap::Ignore => true,
            EncoderTrap::NcrEscape => {
                let mut escapes = String::new();
                for ch in input.chars() {
                    // A scalar value is an unsigned 32-bit quantity; `u32` states that
                    // directly and prints the same digits as the wider signed cast did.
                    escapes.push_str(&format!("&#{};", ch as u32));
                }
                reencode(encoder, &escapes, output, "NcrEscape")
            },
            EncoderTrap::Call(func) => func(encoder, input, output),
        }
    }
}

impl Clone for EncoderTrap {
    fn clone(&self) -> EncoderTrap {
        // `EncoderTrap` is `Copy`, so cloning is a plain bitwise copy.
        *self
    }
}

/// Determine the encoding by looking for a Byte Order Mark (BOM)
/// and decode a single string in memory.
/// Return the result and the used encoding.
pub fn decode(input: &[u8], trap: DecoderTrap, fallback_encoding: EncodingRef) -> (Result>, EncodingRef) { use all::{UTF_8, UTF_16LE, UTF_16BE}; if input.starts_with(&[0xEF, 0xBB, 0xBF]) { (UTF_8.decode(&input[3..], trap), UTF_8 as EncodingRef) } else if input.starts_with(&[0xFE, 0xFF]) { (UTF_16BE.decode(&input[2..], trap), UTF_16BE as EncodingRef) } else if input.starts_with(&[0xFF, 0xFE]) { (UTF_16LE.decode(&input[2..], trap), UTF_16LE as EncodingRef) } else { (fallback_encoding.decode(input, trap), fallback_encoding) } } #[cfg(test)] mod tests { use super::*; use super::EncoderTrap::NcrEscape; use util::StrCharIndex; use std::convert::Into; // a contrived encoding example: same as ASCII, but inserts `prepend` between each character // within two "e"s (so that `widespread` becomes `wide*s*p*r*ead` and `eeeeasel` becomes // `e*ee*ease*l` where `*` is substituted by `prepend`) and prohibits `prohibit` character. struct MyEncoder { flag: bool, prohibit: char, prepend: &'static str, toggle: bool } impl RawEncoder for MyEncoder { fn from_self(&self) -> Box { Box::new(MyEncoder { flag: self.flag, prohibit: self.prohibit, prepend: self.prepend, toggle: false }) } fn is_ascii_compatible(&self) -> bool { self.flag } fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option) { for ((i,j), ch) in input.index_iter() { if ch <= '\u{7f}' && ch != self.prohibit { if self.toggle && !self.prepend.is_empty() { output.write_bytes(self.prepend.as_bytes()); } output.write_byte(ch as u8); if ch == 'e' { self.toggle = !self.toggle; } } else { return (i, Some(CodecError { upto: j as isize, cause: "!!!".into() })); } } (input.len(), None) } fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option { None } } struct MyEncoding { flag: bool, prohibit: char, prepend: &'static str } impl Encoding for MyEncoding { fn name(&self) -> &'static str { "my encoding" } fn raw_encoder(&self) -> Box { Box::new(MyEncoder { flag: self.flag, prohibit: self.prohibit, prepend: 
self.prepend, toggle: false }) } fn raw_decoder(&self) -> Box { panic!("not supported") } } #[test] fn test_reencoding_trap_with_ascii_compatible_encoding() { static COMPAT: &'static MyEncoding = &MyEncoding { flag: true, prohibit: '\u{80}', prepend: "" }; static INCOMPAT: &'static MyEncoding = &MyEncoding { flag: false, prohibit: '\u{80}', prepend: "" }; assert_eq!(COMPAT.encode("Hello\u{203d} I'm fine.", NcrEscape), Ok(b"Hello‽ I'm fine.".to_vec())); assert_eq!(INCOMPAT.encode("Hello\u{203d} I'm fine.", NcrEscape), Ok(b"Hello‽ I'm fine.".to_vec())); } #[test] fn test_reencoding_trap_with_ascii_incompatible_encoding() { static COMPAT: &'static MyEncoding = &MyEncoding { flag: true, prohibit: '\u{80}', prepend: "*" }; static INCOMPAT: &'static MyEncoding = &MyEncoding { flag: false, prohibit: '\u{80}', prepend: "*" }; // this should behave incorrectly as the encoding broke the assumption. assert_eq!(COMPAT.encode("Hello\u{203d} I'm fine.", NcrEscape), Ok(b"He*l*l*o‽* *I*'*m* *f*i*n*e.".to_vec())); assert_eq!(INCOMPAT.encode("Hello\u{203d} I'm fine.", NcrEscape), Ok(b"He*l*l*o*&*#*8*2*5*3*;* *I*'*m* *f*i*n*e.".to_vec())); } #[test] #[should_panic] fn test_reencoding_trap_can_fail() { static FAIL: &'static MyEncoding = &MyEncoding { flag: false, prohibit: '&', prepend: "" }; // this should fail as this contrived encoding does not support `&` at all let _ = FAIL.encode("Hello\u{203d} I'm fine.", NcrEscape); } } encoding-0.2.33/src/util.rs01006440000765000002400000027336125331707360013740 0ustar0000000000000000// This is a part of rust-encoding. // Copyright (c) 2013-2015, Kang Seonghoon. // See README.md and LICENSE.txt for details. //! Internal utilities. use std::{str, char, mem}; use std::marker::PhantomData; use std::convert::Into; use std::default::Default; use types; /// Unchecked conversion to `char`. 
pub fn as_char(ch: u32) -> char { debug_assert!(char::from_u32(ch).is_some()); unsafe { mem::transmute(ch) } } /// External iterator for a string's characters with its corresponding byte offset range. pub struct StrCharIndexIterator<'r> { index: usize, chars: str::Chars<'r>, } impl<'r> Iterator for StrCharIndexIterator<'r> { type Item = ((usize,usize), char); #[inline] fn next(&mut self) -> Option<((usize,usize), char)> { if let Some(ch) = self.chars.next() { let prev = self.index; let next = prev + ch.len_utf8(); self.index = next; Some(((prev, next), ch)) } else { None } } } /// A trait providing an `index_iter` method. pub trait StrCharIndex<'r> { fn index_iter(&self) -> StrCharIndexIterator<'r>; } impl<'r> StrCharIndex<'r> for &'r str { /// Iterates over each character with corresponding byte offset range. fn index_iter(&self) -> StrCharIndexIterator<'r> { StrCharIndexIterator { index: 0, chars: self.chars() } } } /// A helper struct for the stateful decoder DSL. pub struct StatefulDecoderHelper<'a, St, Data: 'a> { /// The current buffer. pub buf: &'a [u8], /// The current index to the buffer. pub pos: usize, /// The output buffer. pub output: &'a mut (types::StringWriter + 'a), /// The last codec error. The caller will later collect this. pub err: Option, /// The additional data attached for the use from transition functions. pub data: &'a Data, /// A marker for the phantom type parameter `St`. _marker: PhantomData, } impl<'a, St: Default, Data> StatefulDecoderHelper<'a, St, Data> { /// Makes a new decoder context out of given buffer and output callback. #[inline(always)] pub fn new(buf: &'a [u8], output: &'a mut (types::StringWriter + 'a), data: &'a Data) -> StatefulDecoderHelper<'a, St, Data> { StatefulDecoderHelper { buf: buf, pos: 0, output: output, err: None, data: data, _marker: PhantomData } } /// Reads one byte from the buffer if any. 
#[inline(always)] pub fn read(&mut self) -> Option { match self.buf.get(self.pos) { Some(&c) => { self.pos += 1; Some(c) } None => None } } /// Resets back to the initial state. /// This should be the last expr in the rules. #[inline(always)] pub fn reset(&self) -> St { Default::default() } /// Writes one Unicode scalar value to the output. /// There is intentionally no check for `c`, so the caller should ensure that it's valid. /// If this is the last expr in the rules, also resets back to the initial state. #[inline(always)] pub fn emit(&mut self, c: u32) -> St { self.output.write_char(unsafe {mem::transmute(c)}); Default::default() } /// Writes a Unicode string to the output. /// If this is the last expr in the rules, also resets back to the initial state. #[inline(always)] pub fn emit_str(&mut self, s: &str) -> St { self.output.write_str(s); Default::default() } /// Issues a codec error with given message at the current position. /// If this is the last expr in the rules, also resets back to the initial state. #[inline(always)] pub fn err(&mut self, msg: &'static str) -> St { self.err = Some(types::CodecError { upto: self.pos as isize, cause: msg.into() }); Default::default() } /// Issues a codec error with given message at the current position minus `backup` bytes. /// If this is the last expr in the rules, also resets back to the initial state. /// /// This should be used to implement "prepending byte to the stream" in the Encoding spec, /// which corresponds to `ctx.backup_and_err(1, ...)`. #[inline(always)] pub fn backup_and_err(&mut self, backup: usize, msg: &'static str) -> St { let upto = self.pos as isize - backup as isize; self.err = Some(types::CodecError { upto: upto, cause: msg.into() }); Default::default() } } /// Defines a stateful decoder from given state machine. macro_rules! 
stateful_decoder { ( module $stmod:ident; // should be unique from other existing identifiers $(internal $item:item)* // will only be visible from state functions initial: state $inist:ident($inictx:ident: Context) { $(case $($inilhs:pat),+ => $($inirhs:expr),+;)+ final => $($inifin:expr),+; } checkpoint: $(state $ckst:ident($ckctx:ident: Context $(, $ckarg:ident: $ckty:ty)*) { $(case $($cklhs:pat),+ => $($ckrhs:expr),+;)+ final => $($ckfin:expr),+; })* transient: $(state $st:ident($ctx:ident: Context $(, $arg:ident: $ty:ty)*) { $(case $($lhs:pat),+ => $($rhs:expr),+;)+ final => $($fin:expr),+; })* ) => ( #[allow(non_snake_case)] mod $stmod { pub use self::State::*; #[derive(PartialEq, Clone, Copy)] pub enum State { $inist, $( $ckst(() $(, $ckty)*), )* $( $st(() $(, $ty)*), )* } impl ::std::default::Default for State { #[inline(always)] fn default() -> State { $inist } } pub mod internal { pub type Context<'a, Data> = ::util::StatefulDecoderHelper<'a, super::State, Data>; $($item)* } pub mod start { use super::internal::*; #[inline(always)] pub fn $inist($inictx: &mut Context) -> super::State { // prohibits all kind of recursions, including self-recursions #[allow(unused_imports)] use super::transient::*; match $inictx.read() { None => super::$inist, Some(c) => match c { $($($inilhs)|+ => { $($inirhs);+ })+ }, } } $( #[inline(always)] pub fn $ckst($ckctx: &mut Context $(, $ckarg: $ckty)*) -> super::State { // prohibits all kind of recursions, including self-recursions #[allow(unused_imports)] use super::transient::*; match $ckctx.read() { None => super::$ckst(() $(, $ckarg)*), Some(c) => match c { $($($cklhs)|+ => { $($ckrhs);+ })+ }, } } )* } pub mod transient { use super::internal::*; #[inline(always)] #[allow(dead_code)] pub fn $inist(_: &mut Context) -> super::State { super::$inist // do not recurse further } $( #[inline(always)] #[allow(dead_code)] pub fn $ckst(_: &mut Context $(, $ckarg: $ckty)*) -> super::State { super::$ckst(() $(, $ckarg)*) // do not 
recurse further } )* $( #[inline(always)] pub fn $st($ctx: &mut Context $(, $arg: $ty)*) -> super::State { match $inictx.read() { None => super::$st(() $(, $arg)*), Some(c) => match c { $($($lhs)|+ => { $($rhs);+ })+ }, } } )* } pub fn raw_feed(mut st: State, input: &[u8], output: &mut ::types::StringWriter, data: &T) -> (State, usize, Option<::types::CodecError>) { output.writer_hint(input.len()); let mut ctx = ::util::StatefulDecoderHelper::new(input, output, data); let mut processed = 0; let st_ = match st { $inist => $inist, $( $ckst(() $(, $ckarg)*) => start::$ckst(&mut ctx $(, $ckarg)*), )* $( $st(() $(, $arg)*) => transient::$st(&mut ctx $(, $arg)*), )* }; match (ctx.err.take(), st_) { (None, $inist) $(| (None, $ckst(..)))* => { st = st_; processed = ctx.pos; } // XXX splitting the match case improves the performance somehow, but why? (None, _) => { return (st_, processed, None); } (Some(err), _) => { return (st_, processed, Some(err)); } } while ctx.pos < ctx.buf.len() { let st_ = match st { $inist => start::$inist(&mut ctx), $( $ckst(() $(, $ckarg)*) => start::$ckst(&mut ctx $(, $ckarg)*), )* _ => unreachable!(), }; match (ctx.err.take(), st_) { (None, $inist) $(| (None, $ckst(..)))* => { st = st_; processed = ctx.pos; } // XXX splitting the match case improves the performance somehow, but why? 
(None, _) => { return (st_, processed, None); } (Some(err), _) => { return (st_, processed, Some(err)); } } } (st, processed, None) } pub fn raw_finish(mut st: State, output: &mut ::types::StringWriter, data: &T) -> (State, Option<::types::CodecError>) { #![allow(unused_mut, unused_variables)] let mut ctx = ::util::StatefulDecoderHelper::new(&[], output, data); let st = match ::std::mem::replace(&mut st, $inist) { $inist => { let $inictx = &mut ctx; $($inifin);+ }, $( $ckst(() $(, $ckarg)*) => { let $ckctx = &mut ctx; $($ckfin);+ }, )* $( $st(() $(, $arg)*) => { let $ctx = &mut ctx; $($fin);+ }, )* }; (st, ctx.err.take()) } } ); // simplified rules: no checkpoint and default final actions ( module $stmod:ident; // should be unique from other existing identifiers $(internal $item:item)* // will only be visible from state functions initial: state $inist:ident($inictx:ident: Context) { $(case $($inilhs:pat),+ => $($inirhs:expr),+;)+ } transient: $(state $st:ident($ctx:ident: Context $(, $arg:ident: $ty:ty)*) { $(case $($lhs:pat),+ => $($rhs:expr),+;)+ })* ) => ( stateful_decoder! { module $stmod; $(internal $item)* initial: state $inist($inictx: Context) { $(case $($inilhs),+ => $($inirhs),+;)+ final => $inictx.reset(); } checkpoint: transient: $(state $st($ctx: Context $(, $arg: $ty)*) { $(case $($lhs),+ => $($rhs),+;)+ final => $ctx.err("incomplete sequence"); })* } ); }