xml5ever-0.16.1/Cargo.toml.orig010064400017500001750000000013441355453247100144710ustar0000000000000000[package] name = "xml5ever" version = "0.16.1" authors = ["The xml5ever project developers"] license = "MIT / Apache-2.0" repository = "https://github.com/servo/html5ever" description = "Push based streaming parser for xml" documentation = "https://docs.rs/xml5ever/" homepage = "https://github.com/servo/html5ever/blob/master/xml5ever/README.md" readme = "README.md" keywords = ["xml", "xml5", "parser", "parsing"] exclude = ["xml5lib-tests/*"] categories = [ "parser-implementations", "web-programming" ] edition = "2018" [dependencies] time = "0.1" log = "0.4" mac = "0.1" markup5ever = {version = "0.10", path = "../markup5ever" } [dev-dependencies] rustc-test = "0.3" criterion = "0.3" [[bench]] name = "xml5ever" harness = false xml5ever-0.16.1/Cargo.toml0000644000000024710000000000000107310ustar00# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. 
If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] edition = "2018" name = "xml5ever" version = "0.16.1" authors = ["The xml5ever project developers"] exclude = ["xml5lib-tests/*"] description = "Push based streaming parser for xml" homepage = "https://github.com/servo/html5ever/blob/master/xml5ever/README.md" documentation = "https://docs.rs/xml5ever/" readme = "README.md" keywords = ["xml", "xml5", "parser", "parsing"] categories = ["parser-implementations", "web-programming"] license = "MIT / Apache-2.0" repository = "https://github.com/servo/html5ever" [[bench]] name = "xml5ever" harness = false [dependencies.log] version = "0.4" [dependencies.mac] version = "0.1" [dependencies.markup5ever] version = "0.10" [dependencies.time] version = "0.1" [dev-dependencies.criterion] version = "0.3" [dev-dependencies.rustc-test] version = "0.3" xml5ever-0.16.1/LICENSE-APACHE010064400017500001750000000251371355452147500135360ustar0000000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. xml5ever-0.16.1/LICENSE-MIT010064400017500001750000000020641355452147500132400ustar0000000000000000Copyright (c) 2014 The html5ever Project Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
xml5ever-0.16.1/README.md010064400017500001750000000057221355452147500130670ustar0000000000000000# xml5ever ![http://www.apache.org/licenses/LICENSE-2.0](https://img.shields.io/badge/license-Apache-blue.svg) ![https://opensource.org/licenses/MIT](https://img.shields.io/badge/license-MIT-blue.svg) [![Docs.rs](https://docs.rs/xml5ever/badge.svg)](https://docs.rs/xml5ever) [![](http://meritbadge.herokuapp.com/xml5ever)](https://crates.io/crates/xml5ever) [API documentation](https://Ygg01.github.io/docs/xml5ever/xml5ever/index.html) **Warning:** This library is alpha quality, so no guarantees are given. This crate provides a push based XML parser library that trades well-formedness for error recovery. xml5ever is based largely on [html5ever](https://github.com/servo/html5ever) parser, so if you have experience with html5ever you will be familiar with xml5ever. The library is dual licensed under MIT and Apache license. # Why you should use xml5ever Main use case for this library is when XML is badly formatted, usually from bad XML templates. XML5 tries to handle most common errors, in a manner similar to HTML5. ## When you should use it? - You aren't interested in well-formed documents. - You need to get some info from your data even if it has errors (although not all possible errors are handled). - You want to features like character references or xml namespaces. ## When you shouldn't use it - You need to have your document validated. - You require DTD support. - You require an easy to use parser, with lots of extensions (e.g. XPath, XQuery). - You require a battle tested, industry proven solution. # Installation Add xml5ever as a dependency in your project manifest. 
```toml [dependencies] xml5ever = "0.1.3" ``` And add crate declaration in your lib.rs ```rust extern crate xml5ever ``` # Getting started Here is a very simple RcDom backed parser: ```rust let input = "".to_tendril(); // To parse XML into a tree form, we need a TreeSink // luckily xml5ever comes with a static RC backed tree represetation. let dom: RcDom = parse(std::iter::once(input), Default::default()); // Do something with dom ``` The thing that does actual parsing is the `parse` function. It expects an iterator that can be converted into `StrTendril`, so you can use `std::iter::once(input)` or `Some(input).into_iter()` (where `input` is `StrTendril` like structure). # Working on xml5ever To build examples and tests you need to do something along the lines of: ```rust git submodule update --init # to fetch xml5lib-tests cargo build cargo test ``` This will fetch tests from outside repository and it will invoke cargo to build and test the crate. If you need docs checkout either [API docs](https://ygg01.github.io/docs/xml5ever/xml5ever/index.html) or run `cargo docs` to generate documentation. ## Easy first tasks What I generally recommend is to look at Clippy Linting badge results and create a PR for fixing the said lints. Other than that try to look for any tasks labeled easy or just update docs/examples. xml5ever-0.16.1/benches/xml5ever.rs010064400017500001750000000045231355452147500153320ustar0000000000000000#[macro_use] extern crate criterion; extern crate markup5ever; extern crate xml5ever; use std::fs; use std::path::PathBuf; use criterion::{black_box, Criterion}; use markup5ever::buffer_queue::BufferQueue; use xml5ever::tendril::*; use xml5ever::tokenizer::{Token, TokenSink, XmlTokenizer}; struct Sink; impl TokenSink for Sink { fn process_token(&mut self, token: Token) { // Don't use the token, but make sure we don't get // optimized out entirely. 
black_box(token); } } fn run_bench(c: &mut Criterion, name: &str) { let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); path.push("data/bench/"); path.push(name); let mut file = fs::File::open(&path).ok().expect("can't open file"); // Read the file and treat it as an infinitely repeating sequence of characters. let mut file_input = ByteTendril::new(); file.read_to_tendril(&mut file_input) .ok() .expect("can't read file"); let file_input: StrTendril = file_input.try_reinterpret().unwrap(); let size = file_input.len(); let mut stream = file_input.chars().cycle(); // Break the input into chunks of 1024 chars (= a few kB). // This simulates reading from the network. let mut input = vec![]; let mut total = 0usize; while total < size { // The by_ref() call is important, otherwise we get wrong results! // See rust-lang/rust#18045. let sz = std::cmp::min(1024, size - total); input.push(stream.by_ref().take(sz).collect::().to_tendril()); total += sz; } let test_name = format!("xml tokenizing {}", name); c.bench_function(&test_name, move |b| { b.iter(|| { let mut tok = XmlTokenizer::new(Sink, Default::default()); let mut buffer = BufferQueue::new(); // We are doing clone inside the bench function, this is not ideal, but possibly // necessary since our iterator consumes the underlying buffer. for buf in input.clone().into_iter() { buffer.push_back(buf); let _ = tok.feed(&mut buffer); } let _ = tok.feed(&mut buffer); tok.end(); }) }); } fn xml5ever_benchmarks(c: &mut Criterion) { run_bench(c, "strong.xml"); } criterion_group!(benches, xml5ever_benchmarks); criterion_main!(benches); xml5ever-0.16.1/data/bench/strong.xml010064400017500001750000000020001344230726100156070ustar0000000000000000xml5ever-0.16.1/examples/README.md010064400017500001750000000201451344230726100146700ustar0000000000000000# Examples The examples have been designed with [`cargo-script`](https://github.com/DanielKeep/cargo-script) in mind. 
Here I'll just give broad overview how to install [`cargo script`] for Rust 1.5. For more details, check out [cargo-script repository](https://github.com/DanielKeep/cargo-script). cargo install cargo-script # Token printer The basis of xml5ever is its tokenizer and tree builder. Roughly speaking tokenizer takes input and returns a set of tokens like comment, processing instruction, start tag, end tag, etc. First let's define our dependencies: ```toml [dependencies] xml5ever = "0.2.0" tendril = "0.1.3" ``` With dependencies declared, we can now make a simple tokenizer sink. First step is to define a [`TokenSink`](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/trait.TokenSink.html). [`TokenSink`](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/trait.TokenSink.html) are traits that received stream of [`Tokens`](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/enum.Token.html). In our case we'll define a unit struct (i.e. a struct without any fields). ```rust struct SimpleTokenPrinter; ``` To make `SimpleTokenPrinter` a [`TokenSink`](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/trait.TokenSink.html), we need to implement [process_token](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/trait.TokenSink.html#tymethod.process_token) method. ```rust impl TokenSink for SimpleTokenPrinter { fn process_token(&mut self, token: Token) { match token { CharacterTokens(b) => { println!("TEXT: {}", &*b); }, NullCharacterToken => print!("NULL"), TagToken(tag) => { println!("{:?} {} ", tag.kind, &*tag.name.local); }, ParseError(err) => { println!("ERROR: {}", err); }, PIToken(Pi{ref target, ref data}) => { println!("PI : ", &*target, &*data); }, CommentToken(ref comment) => { println!("", &*comment); }, EOFToken => { println!("EOF"); }, DoctypeToken(Doctype{ref name, ref public_id, ..}) => { println!("", &*name, &*public_id); } } } } ``` Now, we need some input to process. For input we'll use `stdin`. 
However, xml5ever `tokenize_to` method only takes `StrTendril`. So we need to construct a [`ByteTendril`](https://doc.servo.org/tendril/type.ByteTendril.html) using `ByteTendril::new()`, then read the `stdin` using [`read_to_tendril`](https://doc.servo.org/tendril/trait.ReadExt.html#tymethod.read_to_tendril) extension. Once that is set, to make `SimpleTokenPrinter` parse the input, call, `tokenize_to` with it as the first parameter, input wrapped in Option for second parameter and XmlToke. ```rust fn main() { let sink = SimpleTokenPrinter; // We need a ByteTendril to read a file let mut input = ByteTendril::new(); // Using SliceExt.read_to_tendril we read stdin io::stdin().read_to_tendril(&mut input).unwrap(); // For xml5ever we need StrTendril, so we reinterpret it // into StrTendril. // // You might wonder, how does `try_reinterpret` know we // need StrTendril and the answer is type inference based // on `tokenize_xml_to` signature. let input = input.try_reinterpret().unwrap(); // Here we create and run tokenizer let mut tok = XmlTokenizer::new(sink, Default::default()); // We pass input to parsed. tok.feed(input); // tok.end must be invoked for final bytes to be processed. tok.end(); } ``` NOTE: `unwrap` causes panic, it's only OK to use in simple examples. For full source code check out: [`examples/simple_xml_tokenizer.rs`](https://github.com/Ygg01/xml5ever/blob/master/examples/simple_xml_tokenizer.rs) Once we have successfully compiled the example we run the example with inline xml ```bash cargo script simple_xml_tokenizer.rs <<< "Text with bold words!" ``` or by sending an [`examples/example.xml`](https://github.com/Ygg01/xml5ever/blob/master/examples/simple_xml_tokenizer.rs) located in same folder as examples. ```bash cargo script simple_xml_tokenizer.rs < example.xml ``` # Tree printer To actually get an XML document tree from the xml5ever, you need to use a `TreeSink`. `TreeSink` is in many way similar to the TokenSink. 
Basically, TokenSink takes data and returns list of tokens, while TreeSink takes tokens and returns a tree of parsed XML document. Do note, that this is a simplified explanation and consult documentation for more info. Ok, with that in mind, let's build us a TreePrinter. For example if we get an XML file like: ```xml Bobby Tables ``` We'd want a structure similar to this: ``` #document student first-name #text Bobby last-name #text Tables ``` We won't print anything other than element names and text fields. So comments, doctypes and other such elements are ignored. First part is similar to making SimpleTokenPrinter: ```rust // We need to allocate an input tendril for xml5ever let mut input = ByteTendril::new(); // Using SliceExt.read_to_tendril functions we can read stdin io::stdin().read_to_tendril(&mut input).unwrap(); let input = input.try_reinterpret().unwrap(); ``` This time, we need an implementation of [`TreeSink`](https://ygg01.github.io/docs/xml5ever/xml5ever/tree_builder/interface/trait.TreeSink.html). xml5ever comes with a built-in `TreeSink` implementation called [`RcDom`](https://ygg01.github.io/docs/xml5ever/xml5ever/rcdom/struct.RcDom.html). To process input into a `TreeSink` we use the following line: ```rust let dom: RcDom = parse(one_input(input), Default::default()); ``` Let's analyze it a bit. First there is `let dom: RcDom`. We need this part, because the type inferencer can't infer which TreeSink implementation we mean in this scenario. Function [`one_input`](https://ygg01.github.io/docs/xml5ever/xml5ever/fn.one_input.html) is a convenience function that turns any value into an iterator. In this case it converts a StrTendril into an Iterator over itself. Ok, so now that we parsed our tree what with it? Well, for that we might need some kind of function that will help us traverse it. We shall call that function `walk`. 
```rust fn walk(prefix: &str, handle: Handle) { let node = handle.borrow(); // We print out the prefix before we start print!("{}", prefix); // We are only interested in following nodes: // Document, Text and Element, so our match // reflects that. match node.node { Document => println!("#document"), Text(ref text) => { println!("#text {}", escape_default(text)) }, Element(ref name, _) => { println!("{}", name.local); }, _ => {}, } // We increase indent in child nodes let new_indent = { let mut temp = String::new(); temp.push_str(prefix); temp.push_str(" "); temp }; for child in node.children.iter() // In order to avoid weird indentation, we filter // only Text/Element nodes. // We don't need to filter Document since its guaranteed // child elements don't contain documents .filter(|child| match child.borrow().node { Text(_) | Element (_, _) => true, _ => false, } ) { // Recursion - Yay! walk(&new_indent, child.clone()); } } ``` For full source code check out: [`examples/xml_tree_printer.rs`](https://github.com/Ygg01/xml5ever/blob/master/examples/xml_tree_printer.rs) xml5ever-0.16.1/examples/example.xml010064400017500001750000000002421344230726100155620ustar0000000000000000 BobbyTables xml5ever-0.16.1/examples/simple_xml_tokenizer.rs010064400017500001750000000047041355452217200202310ustar0000000000000000#!/usr/bin/env run-cargo-script //! This is a regular crate doc comment, but it also contains a partial //! Cargo manifest. Note the use of a *fenced* code block, and the //! `cargo` "language". //! //! ```cargo //! [dependencies] //! xml5ever = "0.1.1" //! tendril = "0.1.3" //! markup5ever = "0.7.4" //! 
``` extern crate markup5ever; extern crate xml5ever; use std::default::Default; use std::io; use markup5ever::buffer_queue::BufferQueue; use xml5ever::tendril::{ByteTendril, ReadExt}; use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken}; use xml5ever::tokenizer::{CommentToken, PIToken, Pi}; use xml5ever::tokenizer::{Doctype, DoctypeToken, EOFToken}; use xml5ever::tokenizer::{ParseError, Token, TokenSink, XmlTokenizer}; struct SimpleTokenPrinter; impl TokenSink for SimpleTokenPrinter { fn process_token(&mut self, token: Token) { match token { CharacterTokens(b) => { println!("TEXT: {}", &*b); }, NullCharacterToken => print!("NULL"), TagToken(tag) => { println!("{:?} {} ", tag.kind, &*tag.name.local); }, ParseError(err) => { println!("ERROR: {}", err); }, PIToken(Pi { ref target, ref data, }) => { println!("PI : ", &*target, &*data); }, CommentToken(ref comment) => { println!("", &*comment); }, EOFToken => { println!("EOF"); }, DoctypeToken(Doctype { ref name, ref public_id, .. }) => { println!("", &*name, &*public_id); }, } } } fn main() { // Our implementation of TokenSink let sink = SimpleTokenPrinter; // We need a ByteTendril to read a file let mut input = ByteTendril::new(); // Using SliceExt.read_to_tendril we can read stdin io::stdin().read_to_tendril(&mut input).unwrap(); // For xml5ever we need StrTendril, so we reinterpret it // into StrTendril. // Load input into BufferQueue let mut input_buffer = BufferQueue::new(); input_buffer.push_back(input.try_reinterpret().unwrap()); // Here we create and run tokenizer let mut tok = XmlTokenizer::new(sink, Default::default()); tok.feed(&mut input_buffer); tok.end(); } xml5ever-0.16.1/examples/xml_tokenizer.rs010064400017500001750000000065411355452217200166610ustar0000000000000000#!/usr/bin/env run-cargo-script //! This is a regular crate doc comment, but it also contains a partial //! Cargo manifest. Note the use of a *fenced* code block, and the //! `cargo` "language". //! //! ```cargo //! 
[dependencies] //! xml5ever = "0.2.0" //! tendril = "0.1.3" //! markup5ever = "0.7.4" //! ``` extern crate markup5ever; extern crate xml5ever; use std::default::Default; use std::io; use markup5ever::buffer_queue::BufferQueue; use xml5ever::tendril::{ByteTendril, ReadExt}; use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken}; use xml5ever::tokenizer::{EmptyTag, EndTag, ShortTag, StartTag}; use xml5ever::tokenizer::{PIToken, Pi}; use xml5ever::tokenizer::{ParseError, Token, TokenSink, XmlTokenizer, XmlTokenizerOpts}; #[derive(Copy, Clone)] struct TokenPrinter { in_char_run: bool, } impl TokenPrinter { fn is_char(&mut self, is_char: bool) { match (self.in_char_run, is_char) { (false, true) => print!("CHAR : \""), (true, false) => println!("\""), _ => (), } self.in_char_run = is_char; } fn do_char(&mut self, c: char) { self.is_char(true); print!("{}", c.escape_default().collect::()); } } impl TokenSink for TokenPrinter { fn process_token(&mut self, token: Token) { match token { CharacterTokens(b) => { for c in b.chars() { self.do_char(c); } }, NullCharacterToken => self.do_char('\0'), TagToken(tag) => { self.is_char(false); // This is not proper HTML serialization, of course. 
match tag.kind { StartTag => print!("TAG : <\x1b[32m{}\x1b[0m", tag.name.local), EndTag => print!("END TAG : <\x1b[31m/{}\x1b[0m", tag.name.local), ShortTag => print!("Short TAG : <\x1b[31m/{}\x1b[0m", tag.name.local), EmptyTag => print!("Empty TAG : <\x1b[31m{}\x1b[0m", tag.name.local), } for attr in tag.attrs.iter() { print!( " \x1b[36m{}\x1b[0m='\x1b[34m{}\x1b[0m'", attr.name.local, attr.value ); } if tag.kind == EmptyTag { print!("/"); } println!(">"); }, ParseError(err) => { self.is_char(false); println!("ERROR: {}", err); }, PIToken(Pi { target, data }) => { self.is_char(false); println!("PI : ", target, data); }, _ => { self.is_char(false); println!("OTHER: {:?}", token); }, } } } fn main() { let mut sink = TokenPrinter { in_char_run: false }; let mut input = ByteTendril::new(); io::stdin().read_to_tendril(&mut input).unwrap(); let mut input_buffer = BufferQueue::new(); input_buffer.push_back(input.try_reinterpret().unwrap()); let mut tok = XmlTokenizer::new( sink, XmlTokenizerOpts { profile: true, exact_errors: true, ..Default::default() }, ); tok.feed(&mut input_buffer); tok.end(); sink.is_char(false); } xml5ever-0.16.1/src/driver.rs010064400017500001750000000052641355452220200142260ustar0000000000000000// Copyright 2014-2017 The html5ever Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use crate::tokenizer::{XmlTokenizer, XmlTokenizerOpts}; use crate::tree_builder::{TreeSink, XmlTreeBuilder, XmlTreeBuilderOpts}; use std::borrow::Cow; use markup5ever::buffer_queue::BufferQueue; use crate::tendril; use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder}; use crate::tendril::StrTendril; /// All-encompasing parser setting structure. #[derive(Clone, Default)] pub struct XmlParseOpts { /// Xml tokenizer options. 
pub tokenizer: XmlTokenizerOpts, /// Xml tree builder . pub tree_builder: XmlTreeBuilderOpts, } /// Parse and send results to a `TreeSink`. /// /// ## Example /// /// ```ignore /// let mut sink = MySink; /// parse_document(&mut sink, iter::once(my_str), Default::default()); /// ``` pub fn parse_document(sink: Sink, opts: XmlParseOpts) -> XmlParser where Sink: TreeSink, { let tb = XmlTreeBuilder::new(sink, opts.tree_builder); let tok = XmlTokenizer::new(tb, opts.tokenizer); XmlParser { tokenizer: tok, input_buffer: BufferQueue::new(), } } /// An XML parser, /// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods. pub struct XmlParser where Sink: TreeSink, { /// Tokenizer used by XmlParser. pub tokenizer: XmlTokenizer>, /// Input used by XmlParser. pub input_buffer: BufferQueue, } impl TendrilSink for XmlParser { type Output = Sink::Output; fn process(&mut self, t: StrTendril) { self.input_buffer.push_back(t); self.tokenizer.feed(&mut self.input_buffer); } // FIXME: Is it too noisy to report every character decoding error? fn error(&mut self, desc: Cow<'static, str>) { self.tokenizer.sink.sink.parse_error(desc) } fn finish(mut self) -> Self::Output { self.tokenizer.end(); self.tokenizer.sink.sink.finish() } } impl XmlParser { /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes. /// /// Use this when your input is bytes that are known to be in the UTF-8 encoding. /// Decoding is lossy, like `String::from_utf8_lossy`. pub fn from_utf8(self) -> Utf8LossyDecoder { Utf8LossyDecoder::new(self) } } xml5ever-0.16.1/src/lib.rs010064400017500001750000000035761355452147500135200ustar0000000000000000// Copyright 2014-2017 The html5ever Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! 
This crate provides a push based XML parser library that //! adheres to XML5 specification. In other words this library //! trades well-formedness for error recovery. //! //! The idea behind this, was to minimize number of errors from //! tools that generate XML (e.g. `S` won't just return `S` //! as text, but will parse it into `S` ). //! You can check out full specification [here](https://ygg01.github.io/xml5_draft/). //! //! What this library provides is a solid XML parser that can: //! //! * Parse somewhat erroneous XML input //! * Provide support for [Numeric character references](https://en.wikipedia.org/wiki/Numeric_character_reference). //! * Provide partial [XML namespace](http://www.w3.org/TR/xml-names11/) support. //! * Provide full set of SVG/MathML entities //! //! What isn't in scope for this library: //! //! * Document Type Definition parsing - this is pretty hard to do right and nowadays, its used //! #![crate_name = "xml5ever"] #![crate_type = "dylib"] #![deny(missing_docs)] pub use markup5ever::*; macro_rules! time { ($e:expr) => {{ let t0 = ::time::precise_time_ns(); let result = $e; let dt = ::time::precise_time_ns() - t0; (result, dt) }}; } mod util; /// Driver pub mod driver; /// Serializer for XML5. pub mod serialize; /// XML5 tokenizer - converts input into tokens pub mod tokenizer; /// XML5 tree builder - converts tokens into a tree like structure pub mod tree_builder; xml5ever-0.16.1/src/serialize/mod.rs010064400017500001750000000156761355452220200155110ustar0000000000000000// Copyright 2014-2017 The html5ever Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. 
pub use markup5ever::serialize::{AttrRef, Serialize, Serializer, TraversalScope}; use std::io::{self, Write}; use crate::tree_builder::NamespaceMap; use crate::QualName; #[derive(Clone)] /// Struct for setting serializer options. pub struct SerializeOpts { /// Serialize the root node? Default: ChildrenOnly pub traversal_scope: TraversalScope, } impl Default for SerializeOpts { fn default() -> SerializeOpts { SerializeOpts { traversal_scope: TraversalScope::ChildrenOnly(None), } } } /// Method for serializing generic node to a given writer. pub fn serialize(writer: Wr, node: &T, opts: SerializeOpts) -> io::Result<()> where Wr: Write, T: Serialize, { let mut ser = XmlSerializer::new(writer); node.serialize(&mut ser, opts.traversal_scope) } /// Struct used for serializing nodes into a text that other XML /// parses can read. /// /// Serializer contains a set of functions (start_elem, end_elem...) /// that make parsing nodes easier. pub struct XmlSerializer { writer: Wr, namespace_stack: NamespaceMapStack, } #[derive(Debug)] struct NamespaceMapStack(Vec); impl NamespaceMapStack { fn new() -> NamespaceMapStack { NamespaceMapStack(vec![]) } fn push(&mut self, namespace: NamespaceMap) { self.0.push(namespace); } fn pop(&mut self) { self.0.pop(); } } /// Writes given text into the Serializer, escaping it, /// depending on where the text is written inside the tag or attribute value. 
/// /// For example ///```text /// '&-quotes' becomes '&-quotes' /// becomes (writer: &mut W, text: &str, attr_mode: bool) -> io::Result<()> { for c in text.chars() { match c { '&' => writer.write_all(b"&"), '\'' if attr_mode => writer.write_all(b"'"), '"' if attr_mode => writer.write_all(b"""), '<' if !attr_mode => writer.write_all(b"<"), '>' if !attr_mode => writer.write_all(b">"), c => writer.write_fmt(format_args!("{}", c)), }?; } Ok(()) } #[inline] fn write_qual_name(writer: &mut W, name: &QualName) -> io::Result<()> { if let Some(ref prefix) = name.prefix { writer.write_all(&prefix.as_bytes())?; writer.write_all(b":")?; writer.write_all(&*name.local.as_bytes())?; } else { writer.write_all(&*name.local.as_bytes())?; } Ok(()) } impl XmlSerializer { /// Creates a new Serializier from a writer and given serialization options. pub fn new(writer: Wr) -> Self { XmlSerializer { writer: writer, namespace_stack: NamespaceMapStack::new(), } } #[inline(always)] fn qual_name(&mut self, name: &QualName) -> io::Result<()> { self.find_or_insert_ns(name); write_qual_name(&mut self.writer, name) } #[inline(always)] fn qual_attr_name(&mut self, name: &QualName) -> io::Result<()> { self.find_or_insert_ns(name); write_qual_name(&mut self.writer, name) } fn find_uri(&self, name: &QualName) -> bool { let mut found = false; for stack in self.namespace_stack.0.iter().rev() { if let Some(&Some(ref el)) = stack.get(&name.prefix) { found = *el == name.ns; break; } } found } fn find_or_insert_ns(&mut self, name: &QualName) { if name.prefix.is_some() || &*name.ns != "" { if !self.find_uri(name) { if let Some(last_ns) = self.namespace_stack.0.last_mut() { last_ns.insert(name); } } } } } impl Serializer for XmlSerializer { /// Serializes given start element into text. Start element contains /// qualified name and an attributes iterator. 
fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()> where AttrIter: Iterator>, { self.namespace_stack.push(NamespaceMap::empty()); self.writer.write_all(b"<")?; self.qual_name(&name)?; if let Some(current_namespace) = self.namespace_stack.0.last() { for (prefix, url_opt) in current_namespace.get_scope_iter() { self.writer.write_all(b" xmlns")?; if let &Some(ref p) = prefix { self.writer.write_all(b":")?; self.writer.write_all(&*p.as_bytes())?; } self.writer.write_all(b"=\"")?; let url = if let &Some(ref a) = url_opt { a.as_bytes() } else { b"" }; self.writer.write_all(url)?; self.writer.write_all(b"\"")?; } } for (name, value) in attrs { self.writer.write_all(b" ")?; self.qual_attr_name(&name)?; self.writer.write_all(b"=\"")?; write_to_buf_escaped(&mut self.writer, value, true)?; self.writer.write_all(b"\"")?; } self.writer.write_all(b">")?; Ok(()) } /// Serializes given end element into text. fn end_elem(&mut self, name: QualName) -> io::Result<()> { self.namespace_stack.pop(); self.writer.write_all(b"") } /// Serializes comment into text. fn write_comment(&mut self, text: &str) -> io::Result<()> { self.writer.write_all(b"") } /// Serializes given doctype fn write_doctype(&mut self, name: &str) -> io::Result<()> { self.writer.write_all(b"") } /// Serializes text for a node or an attributes. fn write_text(&mut self, text: &str) -> io::Result<()> { write_to_buf_escaped(&mut self.writer, text, false) } /// Serializes given processing instruction. fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> { self.writer.write_all(b"") } } xml5ever-0.16.1/src/tokenizer/char_ref/mod.rs010064400017500001750000000347761355452220200173070ustar0000000000000000// Copyright 2014-2017 The html5ever Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. 
This file may not be copied, modified, or distributed // except according to those terms. use super::{TokenSink, XmlTokenizer}; use crate::data; use log::debug; use mac::{format_if, unwrap_or_return}; use markup5ever::buffer_queue::BufferQueue; use std::borrow::Cow::Borrowed; use std::char::from_u32; use crate::tendril::StrTendril; use crate::util::is_ascii_alnum; use self::State::*; pub use self::Status::*; //§ tokenizing-character-references pub struct CharRef { /// The resulting character(s) pub chars: [char; 2], /// How many slots in `chars` are valid? pub num_chars: u8, } pub enum Status { Stuck, Progress, Done, } #[derive(Debug)] enum State { Begin, Octothorpe, Numeric(u32), // base NumericSemicolon, Named, BogusName, } pub struct CharRefTokenizer { state: State, addnl_allowed: Option, result: Option, num: u32, num_too_big: bool, seen_digit: bool, hex_marker: Option, name_buf_opt: Option, name_match: Option<(u32, u32)>, name_len: usize, } impl CharRefTokenizer { // NB: We assume that we have an additional allowed character iff we're // tokenizing in an attribute value. pub fn new(addnl_allowed: Option) -> CharRefTokenizer { CharRefTokenizer { state: Begin, addnl_allowed: addnl_allowed, result: None, num: 0, num_too_big: false, seen_digit: false, hex_marker: None, name_buf_opt: None, name_match: None, name_len: 0, } } // A CharRefTokenizer can only tokenize one character reference, // so this method consumes the tokenizer. 
pub fn get_result(self) -> CharRef { self.result.expect("get_result called before done") } fn name_buf<'t>(&'t self) -> &'t StrTendril { self.name_buf_opt .as_ref() .expect("name_buf missing in named character reference") } fn name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril { self.name_buf_opt .as_mut() .expect("name_buf missing in named character reference") } fn finish_none(&mut self) -> Status { self.result = Some(CharRef { chars: ['\0', '\0'], num_chars: 0, }); Done } fn finish_one(&mut self, c: char) -> Status { self.result = Some(CharRef { chars: [c, '\0'], num_chars: 1, }); Done } } impl CharRefTokenizer { pub fn step( &mut self, tokenizer: &mut XmlTokenizer, input: &mut BufferQueue, ) -> Status { if self.result.is_some() { return Done; } debug!("char ref tokenizer stepping in state {:?}", self.state); match self.state { Begin => self.do_begin(tokenizer, input), Octothorpe => self.do_octothorpe(tokenizer, input), Numeric(base) => self.do_numeric(tokenizer, base, input), NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), Named => self.do_named(tokenizer, input), BogusName => self.do_bogus_name(tokenizer, input), } } fn do_begin( &mut self, tokenizer: &mut XmlTokenizer, input: &mut BufferQueue, ) -> Status { match unwrap_or_return!(tokenizer.peek(input), Stuck) { '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(), c if Some(c) == self.addnl_allowed => self.finish_none(), '#' => { tokenizer.discard_char(input); self.state = Octothorpe; Progress }, _ => { self.state = Named; self.name_buf_opt = Some(StrTendril::new()); Progress }, } } fn do_octothorpe( &mut self, tokenizer: &mut XmlTokenizer, input: &mut BufferQueue, ) -> Status { let c = unwrap_or_return!(tokenizer.peek(input), Stuck); match c { 'x' | 'X' => { tokenizer.discard_char(input); self.hex_marker = Some(c); self.state = Numeric(16); }, _ => { self.hex_marker = None; self.state = Numeric(10); }, } Progress } fn do_numeric( &mut self, tokenizer: &mut XmlTokenizer, base: u32, 
input: &mut BufferQueue, ) -> Status { let c = unwrap_or_return!(tokenizer.peek(input), Stuck); match c.to_digit(base) { Some(n) => { tokenizer.discard_char(input); self.num = self.num.wrapping_mul(base); if self.num > 0x10FFFF { // We might overflow, and the character is definitely invalid. // We still parse digits and semicolon, but don't use the result. self.num_too_big = true; } self.num = self.num.wrapping_add(n); self.seen_digit = true; Progress }, None if !self.seen_digit => self.unconsume_numeric(tokenizer, input), None => { self.state = NumericSemicolon; Progress }, } } fn do_numeric_semicolon( &mut self, tokenizer: &mut XmlTokenizer, input: &mut BufferQueue, ) -> Status { match unwrap_or_return!(tokenizer.peek(input), Stuck) { ';' => tokenizer.discard_char(input), _ => tokenizer.emit_error(Borrowed( "Semicolon missing after numeric character reference", )), }; self.finish_numeric(tokenizer) } fn unconsume_numeric( &mut self, tokenizer: &mut XmlTokenizer, input: &mut BufferQueue, ) -> Status { let mut unconsume = StrTendril::from_char('#'); match self.hex_marker { Some(c) => unconsume.push_char(c), None => (), } tokenizer.unconsume(input, unconsume); tokenizer.emit_error(Borrowed("Numeric character reference without digits")); self.finish_none() } fn finish_numeric(&mut self, tokenizer: &mut XmlTokenizer) -> Status { fn conv(n: u32) -> char { from_u32(n).expect("invalid char missed by error handling cases") } let (c, error) = match self.num { n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true), 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true), 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { Some(c) => (c, true), None => (conv(self.num), true), }, 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true), n if (n & 0xFFFE) == 0xFFFE => (conv(n), true), n => (conv(n), false), }; if error { let msg = format_if!( tokenizer.opts.exact_errors, "Invalid numeric character reference", "Invalid numeric 
character reference value 0x{:06X}", self.num ); tokenizer.emit_error(msg); } self.finish_one(c) } fn do_named( &mut self, tokenizer: &mut XmlTokenizer, input: &mut BufferQueue, ) -> Status { let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); self.name_buf_mut().push_char(c); match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { // We have either a full match or a prefix of one. Some(&m) => { if m.0 != 0 { // We have a full match, but there might be a longer one to come. self.name_match = Some(m); self.name_len = self.name_buf().len(); } // Otherwise we just have a prefix match. Progress }, // Can't continue the match. None => self.finish_named(tokenizer, Some(c), input), } } fn emit_name_error(&mut self, tokenizer: &mut XmlTokenizer) { let msg = format_if!( tokenizer.opts.exact_errors, "Invalid character reference", "Invalid character reference &{}", self.name_buf() ); tokenizer.emit_error(msg); } fn unconsume_name( &mut self, tokenizer: &mut XmlTokenizer, input: &mut BufferQueue, ) { tokenizer.unconsume(input, self.name_buf_opt.take().unwrap()); } fn finish_named( &mut self, tokenizer: &mut XmlTokenizer, end_char: Option, input: &mut BufferQueue, ) -> Status { match self.name_match { None => { match end_char { Some(c) if is_ascii_alnum(c) => { // Keep looking for a semicolon, to determine whether // we emit a parse error. self.state = BogusName; return Progress; }, // Check length because &; is not a parse error. Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer), _ => (), } self.unconsume_name(tokenizer, input); self.finish_none() }, Some((c1, c2)) => { // We have a complete match, but we may have consumed // additional characters into self.name_buf. 
Usually // at least one, but several in cases like // // ¬ => match for U+00AC // ¬i => valid prefix for ¬in // ¬it => can't continue match let name_len = self.name_len; assert!(name_len > 0); let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap(); // There might not be a next character after the match, if // we had a full match and then hit EOF. let next_after = if name_len == self.name_buf().len() { None } else { Some(self.name_buf()[name_len..].chars().next().unwrap()) }; // "If the character reference is being consumed as part of an // attribute, and the last character matched is not a U+003B // SEMICOLON character (;), and the next character is either a // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII // character, then, for historical reasons, all the characters // that were matched after the U+0026 AMPERSAND character (&) // must be unconsumed, and nothing is returned. However, if // this next character is in fact a U+003D EQUALS SIGN // character (=), then this is a parse error" let unconsume_all = match (self.addnl_allowed, last_matched, next_after) { (_, ';', _) => false, (Some(_), _, Some('=')) => { tokenizer.emit_error(Borrowed( "Equals sign after character reference in attribute", )); true }, (Some(_), _, Some(c)) if is_ascii_alnum(c) => true, _ => { tokenizer.emit_error(Borrowed( "Character reference does not end with semicolon", )); false }, }; if unconsume_all { self.unconsume_name(tokenizer, input); self.finish_none() } else { tokenizer .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..])); self.result = Some(CharRef { chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], num_chars: if c2 == 0 { 1 } else { 2 }, }); Done } }, } } fn do_bogus_name( &mut self, tokenizer: &mut XmlTokenizer, input: &mut BufferQueue, ) -> Status { let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); self.name_buf_mut().push_char(c); match c { _ if is_ascii_alnum(c) => return Progress, ';' => 
self.emit_name_error(tokenizer), _ => (), } self.unconsume_name(tokenizer, input); self.finish_none() } pub fn end_of_file( &mut self, tokenizer: &mut XmlTokenizer, input: &mut BufferQueue, ) { while self.result.is_none() { match self.state { Begin => drop(self.finish_none()), Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)), Numeric(_) | NumericSemicolon => { tokenizer.emit_error(Borrowed("EOF in numeric character reference")); self.finish_numeric(tokenizer); }, Named => drop(self.finish_named(tokenizer, None, input)), BogusName => { self.unconsume_name(tokenizer, input); self.finish_none(); }, Octothorpe => { tokenizer.unconsume(input, StrTendril::from_slice("#")); tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); self.finish_none(); }, } } } } xml5ever-0.16.1/src/tokenizer/interface.rs010064400017500001750000000077041355452147500167210ustar0000000000000000// Copyright 2014-2017 The html5ever Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use std::borrow::Cow; use crate::tendril::StrTendril; use crate::{Attribute, QualName}; pub use self::TagKind::{EmptyTag, EndTag, ShortTag, StartTag}; pub use self::Token::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; pub use self::Token::{CommentToken, DoctypeToken, PIToken, TagToken}; use super::states; /// Tag kind denotes which kind of tag did we encounter. #[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] pub enum TagKind { /// Beginning of a tag (e.g. ``). StartTag, /// End of a tag (e.g. ``). EndTag, /// Empty tag (e.g. ``). EmptyTag, /// Short tag (e.g. ``). ShortTag, } /// XML 5 Tag Token #[derive(PartialEq, Eq, Debug, Clone)] pub struct Tag { /// Token kind denotes which type of token was encountered. /// E.g. 
if parser parsed `` the token kind would be `EndTag`. pub kind: TagKind, /// Qualified name of the tag. pub name: QualName, /// List of attributes attached to this tag. /// Only valid in start and empty tag. pub attrs: Vec, } impl Tag { /// Sorts attributes in a tag. pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool { if (self.kind != other.kind) || (self.name != other.name) { return false; } let mut self_attrs = self.attrs.clone(); let mut other_attrs = other.attrs.clone(); self_attrs.sort(); other_attrs.sort(); self_attrs == other_attrs } } /// A `DOCTYPE` token. /// Doctype token in XML5 is rather limited for reasons, such as: /// security and simplicity. XML5 only supports declaring DTD with /// name, public identifier and system identifier #[derive(PartialEq, Eq, Clone, Debug)] pub struct Doctype { /// Name of DOCTYPE declared pub name: Option, /// Public identifier of this DOCTYPE. pub public_id: Option, /// System identifier of this DOCTYPE. pub system_id: Option, } impl Doctype { /// Constructs an empty DOCTYPE, with all fields set to None. pub fn new() -> Doctype { Doctype { name: None, public_id: None, system_id: None, } } } /// A ProcessingInstruction token. #[derive(PartialEq, Eq, Clone, Debug)] pub struct Pi { /// What is the name of processing instruction. pub target: StrTendril, /// Text of processing instruction. pub data: StrTendril, } /// Describes tokens encountered during parsing of input. #[derive(PartialEq, Eq, Debug)] pub enum Token { /// Doctype token DoctypeToken(Doctype), /// Token tag founds. This token applies to all /// possible kinds of tags (like start, end, empty tag, etc.). TagToken(Tag), /// Processing Instruction token PIToken(Pi), /// Comment token. CommentToken(StrTendril), /// Token that represents a series of characters. CharacterTokens(StrTendril), /// End of File found. EOFToken, /// NullCharacter encountered. 
NullCharacterToken, /// Error happened ParseError(Cow<'static, str>), } /// Types which can receive tokens from the tokenizer. pub trait TokenSink { /// Process a token. fn process_token(&mut self, token: Token); /// Signal to the sink that parsing has ended. fn end(&mut self) {} /// The tokenizer will call this after emitting any start tag. /// This allows the tree builder to change the tokenizer's state. /// By default no state changes occur. fn query_state_change(&mut self) -> Option { None } } xml5ever-0.16.1/src/tokenizer/mod.rs010064400017500001750000001450761355452220200155320ustar0000000000000000// Copyright 2014-2017 The html5ever Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. mod char_ref; mod interface; mod qname; pub mod states; pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken}; pub use self::interface::{CommentToken, DoctypeToken, PIToken, TagToken}; pub use self::interface::{Doctype, Pi}; pub use self::interface::{EmptyTag, EndTag, ShortTag, StartTag}; pub use self::interface::{ParseError, Tag, TagKind, Token, TokenSink}; pub use crate::{LocalName, Namespace, Prefix}; use log::debug; use mac::{format_if, unwrap_or_return}; use markup5ever::{local_name, namespace_prefix, namespace_url, ns, small_char_set}; use std::borrow::Cow::{self, Borrowed}; use std::collections::BTreeMap; use std::mem::replace; use crate::tendril::StrTendril; use crate::{buffer_queue, Attribute, QualName, SmallCharSet}; use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; use self::char_ref::{CharRef, CharRefTokenizer}; use self::qname::QualNameTokenizer; use self::states::XmlState; use self::states::{DoctypeKind, Public, System}; use self::states::{DoubleQuoted, SingleQuoted, Unquoted}; /// Copy of Tokenizer 
options, with an impl for `Default`. #[derive(Copy, Clone)] pub struct XmlTokenizerOpts { /// Report all parse errors described in the spec, at some /// performance penalty? Default: false pub exact_errors: bool, /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning /// of the stream? Default: true pub discard_bom: bool, /// Keep a record of how long we spent in each state? Printed /// when `end()` is called. Default: false pub profile: bool, /// Initial state override. Only the test runner should use /// a non-`None` value! pub initial_state: Option, } fn process_qname(tag_name: StrTendril) -> QualName { // If tag name can't possibly contain full namespace, skip qualified name // parsing altogether. For a tag to have namespace it must look like: // a:b // Since StrTendril are UTF-8, we know that minimal size in bytes must be // three bytes minimum. let split = if (&*tag_name).as_bytes().len() < 3 { None } else { QualNameTokenizer::new((&*tag_name).as_bytes()).run() }; match split { None => QualName::new(None, ns!(), LocalName::from(&*tag_name)), Some(col) => { let len = (&*tag_name).as_bytes().len() as u32; let prefix = tag_name.subtendril(0, col); let local = tag_name.subtendril(col + 1, len - col - 1); let ns = ns!(); // Actual namespace URL set in XmlTreeBuilder::bind_qname QualName::new(Some(Prefix::from(&*prefix)), ns, LocalName::from(&*local)) }, } } fn option_push(opt_str: &mut Option, c: char) { match *opt_str { Some(ref mut s) => s.push_char(c), None => *opt_str = Some(StrTendril::from_char(c)), } } impl Default for XmlTokenizerOpts { fn default() -> XmlTokenizerOpts { XmlTokenizerOpts { exact_errors: false, discard_bom: true, profile: false, initial_state: None, } } } /// The Xml tokenizer. pub struct XmlTokenizer { /// Options controlling the behavior of the tokenizer. opts: XmlTokenizerOpts, /// Destination for tokens we emit. pub sink: Sink, /// The abstract machine state as described in the spec. 
state: states::XmlState, /// Are we at the end of the file, once buffers have been processed /// completely? This affects whether we will wait for lookahead or not. at_eof: bool, /// Tokenizer for character references, if we're tokenizing /// one at the moment. char_ref_tokenizer: Option>, /// Current input character. Just consumed, may reconsume. current_char: char, /// Should we reconsume the current input character? reconsume: bool, /// Did we just consume \r, translating it to \n? In that case we need /// to ignore the next character if it's \n. ignore_lf: bool, /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the /// beginning of the stream. discard_bom: bool, /// Temporary buffer temp_buf: StrTendril, /// Current tag kind. current_tag_kind: TagKind, /// Current tag name. current_tag_name: StrTendril, /// Current tag attributes. current_tag_attrs: Vec, /// Current attribute name. current_attr_name: StrTendril, /// Current attribute value. current_attr_value: StrTendril, current_doctype: Doctype, /// Current comment. current_comment: StrTendril, /// Current processing instruction target. current_pi_target: StrTendril, /// Current processing instruction value. current_pi_data: StrTendril, /// Record of how many ns we spent in each state, if profiling is enabled. state_profile: BTreeMap, /// Record of how many ns we spent in the token sink. time_in_sink: u64, } impl XmlTokenizer { /// Create a new tokenizer which feeds tokens to a particular `TokenSink`. 
pub fn new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer { if opts.profile && cfg!(for_c) { panic!("Can't profile tokenizer when built as a C library"); } let state = *opts.initial_state.as_ref().unwrap_or(&states::Data); let discard_bom = opts.discard_bom; XmlTokenizer { opts: opts, sink: sink, state: state, char_ref_tokenizer: None, at_eof: false, current_char: '\0', reconsume: false, ignore_lf: false, temp_buf: StrTendril::new(), discard_bom: discard_bom, current_tag_kind: StartTag, current_tag_name: StrTendril::new(), current_tag_attrs: vec![], current_attr_name: StrTendril::new(), current_attr_value: StrTendril::new(), current_comment: StrTendril::new(), current_pi_data: StrTendril::new(), current_pi_target: StrTendril::new(), current_doctype: Doctype::new(), state_profile: BTreeMap::new(), time_in_sink: 0, } } /// Feed an input string into the tokenizer. pub fn feed(&mut self, input: &mut BufferQueue) { if input.is_empty() { return; } if self.discard_bom { if let Some(c) = input.peek() { if c == '\u{feff}' { input.next(); } } else { return; } }; self.run(input); } fn process_token(&mut self, token: Token) { if self.opts.profile { let (_, dt) = time!(self.sink.process_token(token)); self.time_in_sink += dt; } else { self.sink.process_token(token); } } // Get the next input character, which might be the character // 'c' that we already consumed from the buffers. 
fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option { if self.ignore_lf { self.ignore_lf = false; if c == '\n' { c = unwrap_or_return!(input.next(), None); } } if c == '\r' { self.ignore_lf = true; c = '\n'; } // Normalize \x00 into \uFFFD if c == '\x00' { c = '\u{FFFD}' } // Exclude forbidden Unicode characters if self.opts.exact_errors && match c as u32 { 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true, n if (n & 0xFFFE) == 0xFFFE => true, _ => false, } { let msg = format!("Bad character {}", c); self.emit_error(Cow::Owned(msg)); } debug!("got character {}", c); self.current_char = c; Some(c) } fn bad_eof_error(&mut self) { let msg = format_if!( self.opts.exact_errors, "Unexpected EOF", "Saw EOF in state {:?}", self.state ); self.emit_error(msg); } fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option { // Bail to the slow path for various corner cases. // This means that `FromSet` can contain characters not in the set! // It shouldn't matter because the fallback `FromSet` case should // always do the same thing as the `NotFromSet` case. if self.opts.exact_errors || self.reconsume || self.ignore_lf { return self.get_char(input).map(|x| FromSet(x)); } let d = input.pop_except_from(set); debug!("got characters {:?}", d); match d { Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(|x| FromSet(x)), // NB: We don't set self.current_char for a run of characters not // in the set. It shouldn't matter for the codepaths that use // this. _ => d, } } // Check if the next characters are an ASCII case-insensitive match. See // BufferQueue::eat. // // NB: this doesn't do input stream preprocessing or set the current input // character. 
// Try to consume `pat` (ASCII case-insensitive) from the front of the input.
// Returns Some(true/false) once enough input is available to decide, or
// None when more lookahead is needed (the consumed chars are stashed in
// temp_buf and pushed back on the next call).
// NOTE(review): the return type was presumably `Option<bool>` before
// extraction stripped the generics — confirm upstream.
fn eat(&mut self, input: &mut BufferQueue, pat: &str) -> Option {
    // Restore any lookahead saved from a previous partial match.
    input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
    match input.eat(pat, u8::eq_ignore_ascii_case) {
        None if self.at_eof => Some(false),
        None => {
            // Not enough input; buffer what we have and ask for more.
            while let Some(c) = input.next() {
                self.temp_buf.push_char(c);
            }
            None
        },
        Some(matched) => Some(matched),
    }
}

/// Run the state machine for as long as we can.
pub fn run(&mut self, input: &mut BufferQueue) {
    if self.opts.profile {
        loop {
            let state = self.state;
            let old_sink = self.time_in_sink;
            let (run, mut dt) = time!(self.step(input));
            // Attribute time spent inside the sink to the sink, not the state.
            dt -= self.time_in_sink - old_sink;
            let new = match self.state_profile.get_mut(&state) {
                Some(x) => {
                    *x += dt;
                    false
                },
                None => true,
            };
            if new {
                // do this here because of borrow shenanigans
                self.state_profile.insert(state, dt);
            }
            if !run {
                break;
            }
        }
    } else {
        while self.step(input) {}
    }
}

//§ tokenization
// Get the next input character, if one is available.
fn get_char(&mut self, input: &mut BufferQueue) -> Option {
    if self.reconsume {
        // Re-deliver the character the previous state pushed back.
        self.reconsume = false;
        Some(self.current_char)
    } else {
        input
            .next()
            .and_then(|c| self.get_preprocessed_char(c, input))
    }
}

// Report an unexpected character in the current state.
fn bad_char_error(&mut self) {
    let msg = format_if!(
        self.opts.exact_errors,
        "Bad character",
        "Saw {} in state {:?}",
        self.current_char,
        self.state
    );
    self.emit_error(msg);
}

// Throw away the partially-built tag (name and attributes).
fn discard_tag(&mut self) {
    self.current_tag_name = StrTendril::new();
    self.current_tag_attrs = Vec::new();
}

// Begin a fresh tag of the given kind whose name starts with `c`.
fn create_tag(&mut self, kind: TagKind, c: char) {
    self.discard_tag();
    self.current_tag_name.push_char(c);
    self.current_tag_kind = kind;
}

// This method creates a PI token and
// sets its target to given char
fn create_pi(&mut self, c: char) {
    self.current_pi_target = StrTendril::new();
    self.current_pi_data = StrTendril::new();
    self.current_pi_target.push_char(c);
}

// Emit a single character token; NUL is replaced by U+FFFD.
fn emit_char(&mut self, c: char) {
    self.process_token(CharacterTokens(StrTendril::from_char(match c {
        '\0' => '\u{FFFD}',
        c => c,
    })));
}

// Emit `</>` — an XML5 "short" end tag with no name.
fn emit_short_tag(&mut self) {
    self.current_tag_kind = ShortTag;
    self.current_tag_name = StrTendril::new();
    self.emit_current_tag();
}

// Emit the current tag as a self-closing (`<a/>`) tag.
fn emit_empty_tag(&mut self) {
    self.current_tag_kind = EmptyTag;
    self.emit_current_tag();
}

// Mark the tag under construction as self-closing without emitting it yet.
fn set_empty_tag(&mut self) {
    self.current_tag_kind = EmptyTag;
}

// Emit the current tag, forcing its kind to a start tag.
fn emit_start_tag(&mut self) {
    self.current_tag_kind = StartTag;
    self.emit_current_tag();
}

// Finish and emit the tag under construction, then let the sink
// optionally switch the tokenizer state (e.g. for CDATA-like content).
fn emit_current_tag(&mut self) {
    self.finish_attribute();
    let qname = process_qname(replace(&mut self.current_tag_name, StrTendril::new()));
    match self.current_tag_kind {
        StartTag | EmptyTag => {},
        EndTag => {
            if !self.current_tag_attrs.is_empty() {
                self.emit_error(Borrowed("Attributes on an end tag"));
            }
        },
        ShortTag => {
            if !self.current_tag_attrs.is_empty() {
                self.emit_error(Borrowed("Attributes on a short tag"));
            }
        },
    }
    let token = TagToken(Tag {
        kind: self.current_tag_kind,
        name: qname,
        attrs: replace(&mut self.current_tag_attrs, vec![]),
    });
    self.process_token(token);
    // The sink may request a tokenizer state change after seeing the tag.
    match self.sink.query_state_change() {
        None => (),
        Some(s) => self.state = s,
    }
}

// The string must not contain '\0'!
fn emit_chars(&mut self, b: StrTendril) {
    self.process_token(CharacterTokens(b));
}

// Emits the current Processing Instruction
fn emit_pi(&mut self) {
    let token = PIToken(Pi {
        target: replace(&mut self.current_pi_target, StrTendril::new()),
        data: replace(&mut self.current_pi_data, StrTendril::new()),
    });
    self.process_token(token);
}

// Hand control to the character-reference sub-tokenizer.
fn consume_char_ref(&mut self, addnl_allowed: Option) {
    // NB: The char ref tokenizer assumes we have an additional allowed
    // character iff we're tokenizing in an attribute value.
    self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
}

// Emit the end-of-file token.
fn emit_eof(&mut self) {
    self.process_token(EOFToken);
}

// Report a parse error to the sink.
fn emit_error(&mut self, error: Cow<'static, str>) {
    self.process_token(ParseError(error));
}

// Emit the accumulated comment text and reset the buffer.
fn emit_current_comment(&mut self) {
    let comment = replace(&mut self.current_comment, StrTendril::new());
    self.process_token(CommentToken(comment));
}

// Emit the accumulated DOCTYPE and reset it.
fn emit_current_doctype(&mut self) {
    let doctype = replace(&mut self.current_doctype, Doctype::new());
    self.process_token(DoctypeToken(doctype));
}

// Borrow the public or system identifier slot of the current DOCTYPE.
// NOTE(review): return type was presumably `&'a mut Option<StrTendril>`
// before extraction stripped the generics.
fn doctype_id<'a>(&'a mut self, kind: DoctypeKind) -> &'a mut Option {
    match kind {
        Public => &mut self.current_doctype.public_id,
        System => &mut self.current_doctype.system_id,
    }
}

// Reset the chosen DOCTYPE identifier to an empty (but present) string.
fn clear_doctype_id(&mut self, kind: DoctypeKind) {
    let id = self.doctype_id(kind);
    match *id {
        Some(ref mut s) => s.clear(),
        None => *id = Some(StrTendril::new()),
    }
}

// Look at the next character without consuming it.
fn peek(&mut self, input: &mut BufferQueue) -> Option {
    if self.reconsume {
        Some(self.current_char)
    } else {
        input.peek()
    }
}

// Consume and discard one character; the caller guarantees one is available.
fn discard_char(&mut self, input: &mut BufferQueue) {
    let c = self.get_char(input);
    assert!(c.is_some());
}

// Push characters back onto the front of the input queue.
fn unconsume(&mut self, input: &mut BufferQueue, buf: StrTendril) {
    input.push_front(buf);
}
}

// Shorthand for common state machine behaviors.
// Each arm maps a short command name (used via the `go!` macro below)
// onto the corresponding tokenizer method or field mutation.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr ) => ( $me.emit_char($c); );
    ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c); );
    ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c); );
    // NOTE(review): discard_tag takes no argument on the method, but this
    // arm forwards `$input` — the arm appears unused; confirm upstream.
    ( $me:ident : discard_tag $input:expr ) => ( $me.discard_tag($input); );
    ( $me:ident : discard_char ) => ( $me.discard_char(); );
    ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c); );
    ( $me:ident : emit_temp ) => ( $me.emit_temp_buf(); );
    ( $me:ident : clear_temp ) => ( $me.clear_temp_buf(); );
    ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c); );
    ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c); );
    ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c); );
    ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c); );
    ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c); );
    ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c); );
    ( $me:ident : emit_comment ) => ( $me.emit_current_comment(); );
    ( $me:ident : clear_comment ) => ( $me.current_comment.clear(); );
    ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new(); );
    ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c); );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c); );
    ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k); );
    ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype(); );
    ( $me:ident : error ) => ( $me.bad_char_error(); );
    ( $me:ident : error_eof ) => ( $me.bad_eof_error(); );
    ( $me:ident : create_pi $c:expr ) => ( $me.create_pi($c); );
    ( $me:ident : push_pi_target $c:expr ) => ( $me.current_pi_target.push_char($c); );
    ( $me:ident : push_pi_data $c:expr ) => ( $me.current_pi_data.push_char($c); );
    ( $me:ident : set_empty_tag ) => ( $me.set_empty_tag(); );
);

// Tracing of tokenizer actions.
// This adds significant bloat and compile time,
// so it's behind a cfg flag.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    debug!(" {:s}", stringify!($($cmds)*));
    // NOTE(review): `$me:expr` inside the expansion looks wrong (fragment
    // specifiers belong in the matcher, not the expansion); this arm is
    // only compiled with --cfg trace_tokenizer — confirm upstream.
    shorthand!($me:expr : $($cmds)*);
}));

#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );

// A little DSL for sequencing shorthand actions.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.
    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.
    // `to` switches state and returns from step(); `reconsume` additionally
    // re-delivers the current character in the new state.
    ( $me:ident : to $s:ident ) => ({ $me.state = states::$s; return true; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state = states::$s($k1); return true; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return true; });

    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume = true; go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return true; });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return true; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_current_tag();
        return true;
    });

    // We have a special when dealing with empty and short tags in Xml
    ( $me:ident : emit_short_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_short_tag();
        return true;
    });

    ( $me:ident : emit_empty_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_empty_tag();
        return true;
    });

    ( $me:ident : emit_start_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_start_tag();
        return true;
    });

    ( $me:ident : emit_pi $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_pi();
        return true;
    });

    ( $me:ident : eof ) => ({ $me.emit_eof(); return false; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );

    // or nothing.
    ( $me:ident : ) => (());
);

// This is a macro because it can cause early return
// from the function where it is used.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), false)
));

macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), false)
));

macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat), false)
));

// NOTE(review): the impl header presumably carried generics
// (`impl<Sink: TokenSink> XmlTokenizer<Sink>`) before extraction.
impl XmlTokenizer {
    // Run the state machine for a while.
    // Return true if we should be immediately re-invoked
    // (this just simplifies control flow vs. break / continue).
// One step of the XML5 tokenizer state machine. Each `loop` consumes
// characters until a `go!` arm changes state (which returns from this fn)
// or `get_char!`/`pop_except_from!`/`eat!` runs out of input (early return
// via unwrap_or_return!).
fn step(&mut self, input: &mut BufferQueue) -> bool {
    // A pending character-reference sub-tokenizer takes priority.
    if self.char_ref_tokenizer.is_some() {
        return self.step_char_ref_tokenizer(input);
    }
    debug!("processing in state {:?}", self.state);
    match self.state {
        XmlState::Quiescent => {
            self.state = XmlState::Data;
            return false;
        },
        //§ data-state
        XmlState::Data => loop {
            match pop_except_from!(self, input, small_char_set!('\r' '&' '<')) {
                FromSet('&') => go!(self: consume_char_ref),
                FromSet('<') => go!(self: to TagState),
                FromSet(c) => go!(self: emit c),
                NotFromSet(b) => self.emit_chars(b),
            }
        },
        //§ tag-state
        XmlState::TagState => loop {
            match get_char!(self, input) {
                '!' => go!(self: to MarkupDecl),
                '/' => go!(self: to EndTagState),
                '?' => go!(self: to Pi),
                '\t' | '\n' | ' ' | ':' | '<' | '>' => {
                    go!(self: error; emit '<'; reconsume Data)
                },
                cl => go!(self: create_tag StartTag cl; to TagName),
            }
        },
        //§ end-tag-state
        XmlState::EndTagState => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_short_tag Data),
                '\t' | '\n' | ' ' | '<' | ':' => {
                    go!(self: error; emit '<'; emit '/'; reconsume Data)
                },
                cl => go!(self: create_tag EndTag cl; to EndTagName),
            }
        },
        //§ end-tag-name-state
        XmlState::EndTagName => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => go!(self: to EndTagNameAfter),
                '/' => go!(self: error; to EndTagNameAfter),
                '>' => go!(self: emit_tag Data),
                cl => go!(self: push_tag cl),
            }
        },
        //§ end-tag-name-after-state
        XmlState::EndTagNameAfter => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_tag Data),
                '\t' | '\n' | ' ' => (),
                _ => self.emit_error(Borrowed("Unexpected element in tag name")),
            }
        },
        //§ pi-state
        XmlState::Pi => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => go!(self: error; reconsume BogusComment),
                cl => go!(self: create_pi cl; to PiTarget),
            }
        },
        //§ pi-target-state
        XmlState::PiTarget => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => go!(self: to PiTargetAfter),
                '?' => go!(self: to PiAfter),
                cl => go!(self: push_pi_target cl),
            }
        },
        //§ pi-target-after-state
        XmlState::PiTargetAfter => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => (),
                _ => go!(self: reconsume PiData),
            }
        },
        //§ pi-data-state
        XmlState::PiData => loop {
            match get_char!(self, input) {
                '?' => go!(self: to PiAfter),
                cl => go!(self: push_pi_data cl),
            }
        },
        //§ pi-after-state
        XmlState::PiAfter => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_pi Data),
                '?' => go!(self: to PiAfter),
                cl => go!(self: push_pi_data cl),
            }
        },
        //§ markup-declaration-state
        XmlState::MarkupDecl => loop {
            if eat!(self, input, "--") {
                go!(self: clear_comment; to CommentStart);
            } else if eat!(self, input, "[CDATA[") {
                go!(self: to Cdata);
            } else if eat!(self, input, "DOCTYPE") {
                go!(self: to Doctype);
            } else {
                // FIXME: 'error' gives wrong message
                go!(self: error; to BogusComment);
            }
        },
        //§ comment-start-state
        XmlState::CommentStart => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentStartDash),
                '>' => go!(self: error; emit_comment; to Data),
                _ => go!(self: reconsume Comment),
            }
        },
        //§ comment-start-dash-state
        XmlState::CommentStartDash => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentEnd),
                '>' => go!(self: error; emit_comment; to Data),
                _ => go!(self: push_comment '-'; reconsume Comment),
            }
        },
        //§ comment-state
        XmlState::Comment => loop {
            match get_char!(self, input) {
                '<' => go!(self: push_comment '<'; to CommentLessThan),
                '-' => go!(self: to CommentEndDash),
                c => go!(self: push_comment c),
            }
        },
        //§ comment-less-than-sign-state
        XmlState::CommentLessThan => loop {
            match get_char!(self, input) {
                '!' => go!(self: push_comment '!';to CommentLessThanBang),
                '<' => go!(self: push_comment '<'),
                _ => go!(self: reconsume Comment),
            }
        },
        //§ comment-less-than-sign-bang-state
        XmlState::CommentLessThanBang => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentLessThanBangDash),
                _ => go!(self: reconsume Comment),
            }
        },
        //§ comment-less-than-sign-bang-dash-state
        XmlState::CommentLessThanBangDash => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentLessThanBangDashDash),
                _ => go!(self: reconsume CommentEndDash),
            }
        },
        //§ comment-less-than-sign-bang-dash-dash-state
        XmlState::CommentLessThanBangDashDash => loop {
            match get_char!(self, input) {
                '>' => go!(self: reconsume CommentEnd),
                _ => go!(self: error; reconsume CommentEnd),
            }
        },
        //§ comment-end-dash-state
        XmlState::CommentEndDash => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentEnd),
                _ => go!(self: push_comment '-'; reconsume Comment),
            }
        },
        //§ comment-end-state
        XmlState::CommentEnd => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_comment; to Data),
                '!' => go!(self: to CommentEndBang),
                '-' => go!(self: push_comment '-'),
                _ => go!(self: append_comment "--"; reconsume Comment),
            }
        },
        //§ comment-end-bang-state
        XmlState::CommentEndBang => loop {
            match get_char!(self, input) {
                '-' => go!(self: append_comment "--!"; to CommentEndDash),
                '>' => go!(self: error; emit_comment; to Data),
                _ => go!(self: append_comment "--!"; reconsume Comment),
            }
        },
        //§ bogus-comment-state
        XmlState::BogusComment => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_comment; to Data),
                c => go!(self: push_comment c),
            }
        },
        //§ cdata-state
        XmlState::Cdata => loop {
            match get_char!(self, input) {
                ']' => go!(self: to CdataBracket),
                cl => go!(self: emit cl),
            }
        },
        //§ cdata-bracket-state
        XmlState::CdataBracket => loop {
            match get_char!(self, input) {
                ']' => go!(self: to CdataEnd),
                cl => go!(self: emit ']'; emit cl; to Cdata),
            }
        },
        //§ cdata-end-state
        XmlState::CdataEnd => loop {
            match get_char!(self, input) {
                '>' => go!(self: to Data),
                ']' => go!(self: emit ']'),
                cl => go!(self: emit ']'; emit ']'; emit cl; to Cdata),
            }
        },
        //§ tag-name-state
        XmlState::TagName => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => go!(self: to TagAttrNameBefore),
                '>' => go!(self: emit_tag Data),
                '/' => go!(self: set_empty_tag; to TagEmpty),
                cl => go!(self: push_tag cl),
            }
        },
        //§ empty-tag-state
        XmlState::TagEmpty => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_empty_tag Data),
                _ => go!(self: reconsume TagAttrValueBefore),
            }
        },
        //§ tag-attribute-name-before-state
        XmlState::TagAttrNameBefore => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => (),
                '>' => go!(self: emit_tag Data),
                '/' => go!(self: set_empty_tag; to TagEmpty),
                ':' => go!(self: error),
                cl => go!(self: create_attr cl; to TagAttrName),
            }
        },
        //§ tag-attribute-name-state
        XmlState::TagAttrName => loop {
            match get_char!(self, input) {
                '=' => go!(self: to TagAttrValueBefore),
                '>' => go!(self: emit_tag Data),
                '\t' | '\n' | ' ' => go!(self: to TagAttrNameAfter),
                '/' => go!(self: set_empty_tag; to TagEmpty),
                cl => go!(self: push_name cl),
            }
        },
        //§ tag-attribute-name-after-state
        XmlState::TagAttrNameAfter => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => (),
                '=' => go!(self: to TagAttrValueBefore),
                '>' => go!(self: emit_tag Data),
                '/' => go!(self: set_empty_tag; to TagEmpty),
                cl => go!(self: create_attr cl; to TagAttrName),
            }
        },
        //§ tag-attribute-value-before-state
        XmlState::TagAttrValueBefore => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => (),
                '"' => go!(self: to TagAttrValue DoubleQuoted),
                '\'' => go!(self: to TagAttrValue SingleQuoted),
                '&' => go!(self: reconsume TagAttrValue(Unquoted)),
                '>' => go!(self: emit_tag Data),
                cl => go!(self: push_value cl; to TagAttrValue(Unquoted)),
            }
        },
        //§ tag-attribute-value-double-quoted-state
        XmlState::TagAttrValue(DoubleQuoted) => loop {
            match pop_except_from!(self, input, small_char_set!('\n' '"' '&')) {
                FromSet('"') => go!(self: to TagAttrNameBefore),
                FromSet('&') => go!(self: consume_char_ref '"' ),
                FromSet(c) => go!(self: push_value c),
                NotFromSet(ref b) => go!(self: append_value b),
            }
        },
        //§ tag-attribute-value-single-quoted-state
        XmlState::TagAttrValue(SingleQuoted) => loop {
            match pop_except_from!(self, input, small_char_set!('\n' '\'' '&')) {
                FromSet('\'') => go!(self: to TagAttrNameBefore),
                FromSet('&') => go!(self: consume_char_ref '\''),
                FromSet(c) => go!(self: push_value c),
                NotFromSet(ref b) => go!(self: append_value b),
            }
        },
        //§ tag-attribute-value-unquoted-state
        XmlState::TagAttrValue(Unquoted) => loop {
            match pop_except_from!(self, input, small_char_set!('\n' '\t' ' ' '&' '>')) {
                FromSet('\t') | FromSet('\n') | FromSet(' ') => go!(self: to TagAttrNameBefore),
                FromSet('&') => go!(self: consume_char_ref),
                FromSet('>') => go!(self: emit_tag Data),
                FromSet(c) => go!(self: push_value c),
                NotFromSet(ref b) => go!(self: append_value b),
            }
        },
        //§ doctype-state
        XmlState::Doctype => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
                _ => go!(self: error; reconsume BeforeDoctypeName),
            }
        },
        //§ before-doctype-name-state
        XmlState::BeforeDoctypeName => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '>' => go!(self: error; emit_doctype; to Data),
                c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase()); to DoctypeName),
            }
        },
        //§ doctype-name-state
        XmlState::DoctypeName => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterDoctypeName),
                '>' => go!(self: emit_doctype; to Data),
                c => go!(self: push_doctype_name (c.to_ascii_lowercase()); to DoctypeName),
            }
        },
        //§ after-doctype-name-state
        XmlState::AfterDoctypeName => loop {
            if eat!(self, input, "public") {
                go!(self: to AfterDoctypeKeyword Public);
            } else if eat!(self, input, "system") {
                go!(self: to AfterDoctypeKeyword System);
            } else {
                match get_char!(self, input) {
                    '\t' | '\n' | '\x0C' | ' ' => (),
                    '>' => go!(self: emit_doctype; to Data),
                    _ => go!(self: error; to BogusDoctype),
                }
            }
        },
        //§ after-doctype-public-keyword-state
        XmlState::AfterDoctypeKeyword(Public) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier Public),
                '"' => {
                    go!(self: error; clear_doctype_id Public; to DoctypeIdentifierDoubleQuoted Public)
                },
                '\'' => {
                    go!(self: error; clear_doctype_id Public; to DoctypeIdentifierSingleQuoted Public)
                },
                '>' => go!(self: error; emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        //§ after-doctype-system-keyword-state
        XmlState::AfterDoctypeKeyword(System) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier System),
                '"' => {
                    go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
                },
                '\'' => {
                    go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
                },
                '>' => go!(self: error; emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        //§ before_doctype_public_identifier_state before_doctype_system_identifier_state
        XmlState::BeforeDoctypeIdentifier(kind) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
                '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
                '>' => go!(self: error; emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        //§ doctype_public_identifier_double_quoted_state doctype_system_identifier_double_quoted_state
        XmlState::DoctypeIdentifierDoubleQuoted(kind) => loop {
            match get_char!(self, input) {
                '"' => go!(self: to AfterDoctypeIdentifier kind),
                '>' => go!(self: error; emit_doctype; to Data),
                c => go!(self: push_doctype_id kind c),
            }
        },
        //§ doctype_public_identifier_single_quoted_state doctype_system_identifier_single_quoted_state
        XmlState::DoctypeIdentifierSingleQuoted(kind) => loop {
            match get_char!(self, input) {
                '\'' => go!(self: to AfterDoctypeIdentifier kind),
                '>' => go!(self: error; emit_doctype; to Data),
                c => go!(self: push_doctype_id kind c),
            }
        },
        //§ after_doctype_public_identifier_state
        XmlState::AfterDoctypeIdentifier(Public) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => {
                    go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
                },
                '\'' => {
                    go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted(System))
                },
                '"' => {
                    go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted(System))
                },
                '>' => go!(self: emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        //§ after_doctype_system_identifier_state
        XmlState::AfterDoctypeIdentifier(System) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '>' => go!(self: emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        //§ between_doctype_public_and_system_identifier_state
        XmlState::BetweenDoctypePublicAndSystemIdentifiers => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '>' => go!(self: emit_doctype; to Data),
                '\'' => go!(self: to DoctypeIdentifierSingleQuoted System),
                '"' => go!(self: to DoctypeIdentifierDoubleQuoted System),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        //§ bogus_doctype_state
        XmlState::BogusDoctype => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_doctype; to Data),
                _ => (),
            }
        },
    }
}

/// Indicate that we have reached the end of the input.
pub fn end(&mut self) {
    // Handle EOF in the char ref sub-tokenizer, if there is one.
    // Do this first because it might un-consume stuff.
    let mut input = BufferQueue::new();
    match self.char_ref_tokenizer.take() {
        None => (),
        Some(mut tok) => {
            tok.end_of_file(self, &mut input);
            self.process_char_ref(tok.get_result());
        },
    }
    // Process all remaining buffered input.
    // If we're waiting for lookahead, we're not gonna get it.
    self.at_eof = true;
    self.run(&mut input);
    // Drive the EOF transitions until the machine reaches a terminal state.
    while self.eof_step() {
        // loop
    }
    self.sink.end();
    if self.opts.profile {
        self.dump_profile();
    }
}

#[cfg(for_c)]
fn dump_profile(&self) {
    unreachable!();
}

// Log per-state timing collected while profiling, sorted descending.
#[cfg(not(for_c))]
fn dump_profile(&self) {
    let mut results: Vec<(states::XmlState, u64)> =
        self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
    results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
    let total: u64 = results
        .iter()
        .map(|&(_, t)| t)
        .fold(0, ::std::ops::Add::add);
    debug!("\nTokenizer profile, in nanoseconds");
    debug!("\n{:12} total in token sink", self.time_in_sink);
    debug!("\n{:12} total in tokenizer", total);
    for (k, v) in results.into_iter() {
        let pct = 100.0 * (v as f64) / (total as f64);
        debug!("{:12} {:4.1}% {:?}", v, pct, k);
    }
}

// Per-state EOF handling: flush whatever token is in flight, report
// errors, and return false once nothing more can be emitted.
fn eof_step(&mut self) -> bool {
    debug!("processing EOF in state {:?}", self.state);
    match self.state {
        XmlState::Data | XmlState::Quiescent => go!(self: eof),
        XmlState::CommentStart | XmlState::CommentLessThan | XmlState::CommentLessThanBang => {
            go!(self: reconsume Comment)
        },
        XmlState::CommentLessThanBangDash => go!(self: reconsume CommentEndDash),
        XmlState::CommentLessThanBangDashDash => go!(self: reconsume CommentEnd),
        XmlState::CommentStartDash
        | XmlState::Comment
        | XmlState::CommentEndDash
        | XmlState::CommentEnd
        | XmlState::CommentEndBang => go!(self: error_eof; emit_comment; eof),
        XmlState::TagState => go!(self: error_eof; emit '<'; to Data),
        XmlState::EndTagState => go!(self: error_eof; emit '<'; emit '/'; to Data),
        XmlState::TagEmpty => go!(self: error_eof; to TagAttrNameBefore),
        XmlState::Cdata | XmlState::CdataBracket | XmlState::CdataEnd => {
            go!(self: error_eof; to Data)
        },
        XmlState::Pi => go!(self: error_eof; to BogusComment),
        XmlState::PiTargetAfter | XmlState::PiAfter => go!(self: reconsume PiData),
        XmlState::MarkupDecl => go!(self: error_eof; to BogusComment),
        XmlState::TagName
        | XmlState::TagAttrNameBefore
        | XmlState::EndTagName
        | XmlState::TagAttrNameAfter
        | XmlState::EndTagNameAfter
        | XmlState::TagAttrValueBefore
        | XmlState::TagAttrValue(_) => go!(self: error_eof; emit_tag Data),
        XmlState::PiData | XmlState::PiTarget => go!(self: error_eof; emit_pi Data),
        XmlState::TagAttrName => go!(self: error_eof; emit_start_tag Data),
        XmlState::BeforeDoctypeName
        | XmlState::Doctype
        | XmlState::DoctypeName
        | XmlState::AfterDoctypeName
        | XmlState::AfterDoctypeKeyword(_)
        | XmlState::BeforeDoctypeIdentifier(_)
        | XmlState::AfterDoctypeIdentifier(_)
        | XmlState::DoctypeIdentifierSingleQuoted(_)
        | XmlState::DoctypeIdentifierDoubleQuoted(_)
        | XmlState::BetweenDoctypePublicAndSystemIdentifiers => {
            go!(self: error_eof; emit_doctype; to Data)
        },
        XmlState::BogusDoctype => go!(self: emit_doctype; to Data),
        XmlState::BogusComment => go!(self: emit_comment; to Data),
    }
}

// Deliver the characters produced by the char-ref sub-tokenizer into the
// state we were in when the reference started.
fn process_char_ref(&mut self, char_ref: CharRef) {
    let CharRef {
        mut chars,
        mut num_chars,
    } = char_ref;
    if num_chars == 0 {
        // No valid reference: re-emit the literal '&'.
        chars[0] = '&';
        num_chars = 1;
    }
    for i in 0..num_chars {
        let c = chars[i as usize];
        match self.state {
            states::Data | states::Cdata => go!(self: emit c),
            states::TagAttrValue(_) => go!(self: push_value c),
            _ => panic!(
                "state {:?} should not be reachable in process_char_ref",
                self.state
            ),
        }
    }
}

// Advance the char-ref sub-tokenizer by one step; reinstall it unless done.
fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> bool {
    let mut tok = self.char_ref_tokenizer.take().unwrap();
    let outcome = tok.step(self, input);
    let progress = match outcome {
        char_ref::Done => {
            self.process_char_ref(tok.get_result());
            return true;
        },
        char_ref::Stuck => false,
        char_ref::Progress => true,
    };
    self.char_ref_tokenizer = Some(tok);
    progress
}

// Commit the attribute under construction to the tag, dropping duplicates
// and hoisting xmlns declarations to the front of the attribute list.
fn finish_attribute(&mut self) {
    if self.current_attr_name.len() == 0 {
        return;
    }
    // Check for a duplicate attribute.
    // FIXME: the spec says we should error as soon as the name is finished.
    // FIXME: linear time search, do we care?
    let dup = {
        let name = &self.current_attr_name[..];
        self.current_tag_attrs
            .iter()
            .any(|a| &*a.name.local == name)
    };
    if dup {
        self.emit_error(Borrowed("Duplicate attribute"));
        self.current_attr_name.clear();
        self.current_attr_value.clear();
    } else {
        let qname = process_qname(replace(&mut self.current_attr_name, StrTendril::new()));
        let attr = Attribute {
            name: qname.clone(),
            value: replace(&mut self.current_attr_value, StrTendril::new()),
        };
        // Namespace declarations go first so the tree builder sees them
        // before any prefixed attributes on the same tag.
        if qname.local == local_name!("xmlns")
            || qname.prefix == Some(namespace_prefix!("xmlns"))
        {
            self.current_tag_attrs.insert(0, attr);
        } else {
            self.current_tag_attrs.push(attr);
        }
    }
}

// Finish any pending attribute, then start a new one whose name begins with `c`.
fn create_attribute(&mut self, c: char) {
    self.finish_attribute();
    self.current_attr_name.push_char(c);
}
}

#[cfg(test)]
mod test {
    use super::process_qname;
    use crate::tendril::SliceExt;
    use crate::{LocalName, Prefix};

    // Well-formed "prefix:local" names split into (prefix, local).
    #[test]
    fn simple_namespace() {
        let qname = process_qname("prefix:local".to_tendril());
        assert_eq!(qname.prefix, Some(Prefix::from("prefix")));
        assert_eq!(qname.local, LocalName::from("local"));

        let qname = process_qname("a:b".to_tendril());
        assert_eq!(qname.prefix, Some(Prefix::from("a")));
        assert_eq!(qname.local, LocalName::from("b"));
    }

    // Malformed colon placements keep the whole name as the local part.
    #[test]
    fn wrong_namespaces() {
        let qname = process_qname(":local".to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from(":local"));

        let qname = process_qname("::local".to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from("::local"));

        let qname = process_qname("a::local".to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from("a::local"));

        let qname = process_qname("fake::".to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from("fake::"));

        let qname = process_qname(":::".to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from(":::"));

        let qname = process_qname(":a:b:".to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from(":a:b:"));
    }
}

// --- archive boundary (tar residue): xml5ever-0.16.1/src/tokenizer/qname.rs ---
// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license,
// at your option. This file may not be copied, modified, or distributed
// except according to those terms.
/// Internal scanner states for [`QualNameTokenizer`].
enum QualNameState {
    BeforeName,
    InName,
    AfterColon,
}

/// Scans a tag name (as raw bytes) for a single well-formed `prefix:local`
/// split point.
pub struct QualNameTokenizer<'a> {
    state: QualNameState,
    slice: &'a [u8],
    // Byte index of the colon separating prefix from local name, if a valid
    // one has been found so far.
    // NOTE(review): the `u32` type parameter was lost in extraction
    // (`Option` -> `Option<u32>`) and has been restored here.
    valid_index: Option<u32>,
    curr_ind: usize,
}

impl<'a> QualNameTokenizer<'a> {
    /// Creates a scanner over `tag`; no work happens until [`run`](Self::run).
    pub fn new(tag: &[u8]) -> QualNameTokenizer {
        QualNameTokenizer {
            state: QualNameState::BeforeName,
            slice: tag,
            valid_index: None,
            curr_ind: 0,
        }
    }

    /// Drives the scanner over the whole slice.
    ///
    /// Returns the byte index of the colon separating prefix from local name,
    /// or `None` when the input has no usable split (empty input, leading
    /// colon, trailing colon, or more than one colon).
    pub fn run(&mut self) -> Option<u32> {
        if self.slice.len() > 0 {
            // Each step consumes one byte; `step` returns false when done.
            loop {
                if !self.step() {
                    break;
                }
            }
        }
        self.valid_index
    }

    /// Advances the cursor; returns false when the end of input is reached.
    fn incr(&mut self) -> bool {
        if self.curr_ind + 1 < self.slice.len() {
            self.curr_ind += 1;
            return true;
        }
        false
    }

    /// Dispatches one byte to the handler for the current state.
    fn step(&mut self) -> bool {
        match self.state {
            QualNameState::BeforeName => self.do_before_name(),
            QualNameState::InName => self.do_in_name(),
            QualNameState::AfterColon => self.do_after_colon(),
        }
    }

    /// A leading colon means there is no prefix — abort with no valid index.
    fn do_before_name(&mut self) -> bool {
        if self.slice[self.curr_ind] == b':' {
            false
        } else {
            self.state = QualNameState::InName;
            self.incr()
        }
    }

    /// Records the first colon as the candidate split point, but only when at
    /// least one byte follows it (a trailing colon is not a valid split).
    fn do_in_name(&mut self) -> bool {
        if self.slice[self.curr_ind] == b':' && self.curr_ind + 1 < self.slice.len() {
            self.valid_index = Some(self.curr_ind as u32);
            self.state = QualNameState::AfterColon;
        }
        self.incr()
    }

    /// A second colon invalidates the whole name.
    fn do_after_colon(&mut self) -> bool {
        if self.slice[self.curr_ind] == b':' {
            self.valid_index = None;
            return false;
        }
        self.incr()
    }
}

// ---- archive-member boundary (tar-dump residue, preserved as comments) ----
// xml5ever-0.16.1/src/tokenizer/states.rs010064400017500001750000000037431344230726100162510ustar0000000000000000
// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// (module docs of states.rs) Tokenizer states.
// This is public for use by the tokenizer tests. Other library
// users should not have to care about this.
// Re-export all variants so tokenizer code can name states unqualified.
pub use self::AttrValueKind::*;
pub use self::DoctypeKind::*;
pub use self::XmlState::*;

// Distinguishes the PUBLIC and SYSTEM identifier forms of a DOCTYPE.
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
#[doc(hidden)]
pub enum DoctypeKind {
    Public,
    System,
}

// States of the XML tokenizer state machine. Several variants carry the
// relevant sub-kind (attribute quoting style or DOCTYPE identifier kind)
// so related states can share one variant.
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
#[doc(hidden)]
pub enum XmlState {
    // Character data and tag dispatch.
    Data,
    TagState,
    EndTagState,
    EndTagName,
    EndTagNameAfter,
    // Processing instructions (<?target data?>).
    Pi,
    PiTarget,
    PiTargetAfter,
    PiData,
    PiAfter,
    // Markup declarations (<!...): comments, CDATA, DOCTYPE.
    MarkupDecl,
    CommentStart,
    CommentStartDash,
    Comment,
    CommentLessThan,
    CommentLessThanBang,
    CommentLessThanBangDash,
    CommentLessThanBangDashDash,
    CommentEnd,
    CommentEndDash,
    CommentEndBang,
    Cdata,
    CdataBracket,
    CdataEnd,
    // Start tags and their attributes.
    TagName,
    TagEmpty,
    TagAttrNameBefore,
    TagAttrName,
    TagAttrNameAfter,
    TagAttrValueBefore,
    TagAttrValue(AttrValueKind),
    // DOCTYPE parsing; the payload tracks PUBLIC vs SYSTEM where relevant.
    Doctype,
    BeforeDoctypeName,
    DoctypeName,
    AfterDoctypeName,
    AfterDoctypeKeyword(DoctypeKind),
    BeforeDoctypeIdentifier(DoctypeKind),
    DoctypeIdentifierDoubleQuoted(DoctypeKind),
    DoctypeIdentifierSingleQuoted(DoctypeKind),
    AfterDoctypeIdentifier(DoctypeKind),
    BetweenDoctypePublicAndSystemIdentifiers,
    // Recovery states for malformed input, plus the idle state.
    BogusDoctype,
    BogusComment,
    Quiescent,
}

// How an attribute value is delimited, which controls value termination.
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
#[doc(hidden)]
pub enum AttrValueKind {
    Unquoted,
    SingleQuoted,
    DoubleQuoted,
}
xml5ever-0.16.1/src/tree_builder/mod.rs010064400017500001750000000576641355452220200161660ustar0000000000000000// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
mod types; use log::{debug, warn}; use mac::{matches, _tt_as_expr_hack, unwrap_or_return}; use markup5ever::{local_name, namespace_prefix, namespace_url, ns}; use std::borrow::Cow; use std::borrow::Cow::Borrowed; use std::collections::btree_map::Iter; use std::collections::{BTreeMap, HashSet, VecDeque}; use std::fmt::{Debug, Error, Formatter}; use std::mem; use std::result::Result; pub use self::interface::{NextParserState, NodeOrText, Tracer, TreeSink}; use self::types::*; use crate::interface::{self, create_element, AppendNode, Attribute, QualName}; use crate::interface::{AppendText, ExpandedName}; use crate::tokenizer::states::Quiescent; use crate::tokenizer::{self, EndTag, StartTag, Tag, TokenSink}; use crate::tokenizer::{Doctype, EmptyTag, Pi, ShortTag}; use crate::{LocalName, Namespace, Prefix}; use crate::tendril::{StrTendril, Tendril}; static XML_URI: &'static str = "http://www.w3.org/XML/1998/namespace"; static XMLNS_URI: &'static str = "http://www.w3.org/2000/xmlns/"; type InsResult = Result<(), Cow<'static, str>>; #[derive(Debug)] struct NamespaceMapStack(Vec); impl NamespaceMapStack { fn new() -> NamespaceMapStack { NamespaceMapStack({ let mut vec = Vec::new(); vec.push(NamespaceMap::default()); vec }) } fn push(&mut self, map: NamespaceMap) { self.0.push(map); } #[doc(hidden)] pub fn pop(&mut self) { self.0.pop(); } } #[doc(hidden)] pub struct NamespaceMap { // Map that maps prefixes to URI. // // Key denotes namespace prefix, and value denotes // URI it maps to. // // If value of value is None, that means the namespace // denoted by key has been undeclared. scope: BTreeMap, Option>, } impl Debug for NamespaceMap { fn fmt(&self, f: &mut Formatter) -> Result<(), Error> { write!(f, "\nNamespaceMap[")?; for (key, value) in &self.scope { write!(f, " {:?} : {:?}\n", key, value)?; } write!(f, "]") } } impl NamespaceMap { // Returns an empty namespace. 
#[doc(hidden)] pub fn empty() -> NamespaceMap { NamespaceMap { scope: BTreeMap::new(), } } fn default() -> NamespaceMap { NamespaceMap { scope: { let mut map = BTreeMap::new(); map.insert(None, None); map.insert(Some(namespace_prefix!("xml")), Some(ns!(xml))); map.insert(Some(namespace_prefix!("xmlns")), Some(ns!(xmlns))); map }, } } #[doc(hidden)] pub fn get(&self, prefix: &Option) -> Option<&Option> { self.scope.get(prefix) } #[doc(hidden)] pub fn get_scope_iter(&self) -> Iter, Option> { self.scope.iter() } #[doc(hidden)] pub fn insert(&mut self, name: &QualName) { let prefix = if let Some(ref p) = name.prefix { Some(p.clone()) } else { None }; let namespace = Some(Namespace::from(&*name.ns)); self.scope.insert(prefix, namespace); } fn insert_ns(&mut self, attr: &Attribute) -> InsResult { if &*attr.value == XMLNS_URI { return Err(Borrowed("Can't declare XMLNS URI")); }; let opt_uri = if attr.value.is_empty() { None } else { Some(Namespace::from(&*attr.value)) }; let result = match (&attr.name.prefix, &*attr.name.local) { (&Some(namespace_prefix!("xmlns")), "xml") => { if &*attr.value != XML_URI { Err(Borrowed("XML namespace can't be redeclared")) } else { Ok(()) } }, (&Some(namespace_prefix!("xmlns")), "xmlns") => { Err(Borrowed("XMLNS namespaces can't be changed")) }, (&Some(namespace_prefix!("xmlns")), _) | (&None, "xmlns") => { // We can have two cases of properly defined xmlns // First with default namespace e.g. // // let ns_prefix = if &*attr.name.local == "xmlns" { None // Second is with named namespace e.g. // // } else { Some(Prefix::from(&*attr.name.local)) }; if opt_uri.is_some() && self.scope.contains_key(&ns_prefix) { Err(Borrowed("Namespace already defined")) } else { self.scope.insert(ns_prefix, opt_uri); Ok(()) } }, (_, _) => Err(Borrowed("Invalid namespace declaration.")), }; result } } /// Tree builder options, with an impl for Default. 
#[derive(Copy, Clone)] pub struct XmlTreeBuilderOpts {} impl Default for XmlTreeBuilderOpts { fn default() -> XmlTreeBuilderOpts { XmlTreeBuilderOpts {} } } /// The XML tree builder. pub struct XmlTreeBuilder { /// Configuration options for XmlTreeBuilder _opts: XmlTreeBuilderOpts, /// Consumer of tree modifications. pub sink: Sink, /// The document node, which is created by the sink. doc_handle: Handle, /// Next state change for the tokenizer, if any. next_tokenizer_state: Option, /// Stack of open elements, most recently added at end. open_elems: Vec, /// Current element pointer. curr_elem: Option, /// Stack of namespace identifiers and namespaces. namespace_stack: NamespaceMapStack, /// Current namespace identifier current_namespace: NamespaceMap, /// List of already present namespace local name attribute pairs. present_attrs: HashSet<(Namespace, LocalName)>, /// Current tree builder phase. phase: XmlPhase, } impl XmlTreeBuilder where Handle: Clone, Sink: TreeSink, { /// Create a new tree builder which sends tree modifications to a particular `TreeSink`. /// /// The tree builder is also a `TokenSink`. pub fn new(mut sink: Sink, opts: XmlTreeBuilderOpts) -> XmlTreeBuilder { let doc_handle = sink.get_document(); XmlTreeBuilder { _opts: opts, sink: sink, doc_handle: doc_handle, next_tokenizer_state: None, open_elems: vec![], curr_elem: None, namespace_stack: NamespaceMapStack::new(), current_namespace: NamespaceMap::empty(), present_attrs: HashSet::new(), phase: StartPhase, } } /// Call the `Tracer`'s `trace_handle` method on every `Handle` in the tree builder's /// internal state. This is intended to support garbage-collected DOMs. 
pub fn trace_handles(&self, tracer: &dyn Tracer) { tracer.trace_handle(&self.doc_handle); for e in self.open_elems.iter() { tracer.trace_handle(&e); } self.curr_elem.as_ref().map(|h| tracer.trace_handle(&h)); } // Debug helper #[cfg(not(for_c))] #[allow(dead_code)] fn dump_state(&self, label: String) { debug!("dump_state on {}", label); debug!(" open_elems:"); for node in self.open_elems.iter() { debug!(" {:?}", self.sink.elem_name(node)); } debug!(""); } #[cfg(for_c)] fn debug_step(&self, _mode: XmlPhase, _token: &Token) {} #[cfg(not(for_c))] fn debug_step(&self, mode: XmlPhase, token: &Token) { debug!( "processing {:?} in insertion mode {:?}", format!("{:?}", token), mode ); } fn declare_ns(&mut self, attr: &mut Attribute) { if let Err(msg) = self.current_namespace.insert_ns(&attr) { self.sink.parse_error(msg); } else { attr.name.ns = ns!(xmlns); } } fn find_uri(&self, prefix: &Option) -> Result, Cow<'static, str>> { let mut uri = Err(Borrowed("No appropriate namespace found")); for ns in self .namespace_stack .0 .iter() .chain(Some(&self.current_namespace)) .rev() { if let Some(el) = ns.get(prefix) { uri = Ok(el.clone()); break; } } uri } fn bind_qname(&mut self, name: &mut QualName) { match self.find_uri(&name.prefix) { Ok(uri) => { let ns_uri = match uri { Some(e) => e, None => ns!(), }; name.ns = ns_uri; }, Err(msg) => { self.sink.parse_error(msg); }, } } // This method takes in name qualified name and binds it to the // existing namespace context. // // Returns false if the attribute is a duplicate, returns true otherwise. 
fn bind_attr_qname(&mut self, name: &mut QualName) -> bool { // Attributes don't have default namespace let mut not_duplicate = true; if name.prefix.is_some() { self.bind_qname(name); not_duplicate = self.check_duplicate_attr(name); } not_duplicate } fn check_duplicate_attr(&mut self, name: &QualName) -> bool { let pair = (name.ns.clone(), name.local.clone()); if self.present_attrs.contains(&pair) { return false; } self.present_attrs.insert(pair); true } fn process_namespaces(&mut self, tag: &mut Tag) { let mut new_attr = vec![]; // First we extract all namespace declarations for mut attr in tag.attrs.iter_mut().filter(|attr| { attr.name.prefix == Some(namespace_prefix!("xmlns")) || attr.name.local == local_name!("xmlns") }) { self.declare_ns(&mut attr); } // Then we bind those namespace declarations to attributes for attr in tag.attrs.iter_mut().filter(|attr| { attr.name.prefix != Some(namespace_prefix!("xmlns")) && attr.name.local != local_name!("xmlns") }) { if self.bind_attr_qname(&mut attr.name) { new_attr.push(attr.clone()); } } mem::replace(&mut tag.attrs, new_attr); // Then we bind the tags namespace. self.bind_qname(&mut tag.name); // Finally, we dump current namespace if its unneeded. let x = mem::replace(&mut self.current_namespace, NamespaceMap::empty()); // Only start tag doesn't dump current namespace. However,