xml5ever-0.16.1/Cargo.toml.orig 0100644 0001750 0001750 00000001344 13554532471 0014471 0 ustar 00 0000000 0000000 [package]
name = "xml5ever"
version = "0.16.1"
authors = ["The xml5ever project developers"]
license = "MIT / Apache-2.0"
repository = "https://github.com/servo/html5ever"
description = "Push based streaming parser for xml"
documentation = "https://docs.rs/xml5ever/"
homepage = "https://github.com/servo/html5ever/blob/master/xml5ever/README.md"
readme = "README.md"
keywords = ["xml", "xml5", "parser", "parsing"]
exclude = ["xml5lib-tests/*"]
categories = [ "parser-implementations", "web-programming" ]
edition = "2018"
[dependencies]
time = "0.1"
log = "0.4"
mac = "0.1"
markup5ever = {version = "0.10", path = "../markup5ever" }
[dev-dependencies]
rustc-test = "0.3"
criterion = "0.3"
[[bench]]
name = "xml5ever"
harness = false
xml5ever-0.16.1/Cargo.toml 0000644 00000002471 00000000000 0010731 0 ustar 00 # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)
[package]
edition = "2018"
name = "xml5ever"
version = "0.16.1"
authors = ["The xml5ever project developers"]
exclude = ["xml5lib-tests/*"]
description = "Push based streaming parser for xml"
homepage = "https://github.com/servo/html5ever/blob/master/xml5ever/README.md"
documentation = "https://docs.rs/xml5ever/"
readme = "README.md"
keywords = ["xml", "xml5", "parser", "parsing"]
categories = ["parser-implementations", "web-programming"]
license = "MIT / Apache-2.0"
repository = "https://github.com/servo/html5ever"
[[bench]]
name = "xml5ever"
harness = false
[dependencies.log]
version = "0.4"
[dependencies.mac]
version = "0.1"
[dependencies.markup5ever]
version = "0.10"
[dependencies.time]
version = "0.1"
[dev-dependencies.criterion]
version = "0.3"
[dev-dependencies.rustc-test]
version = "0.3"
xml5ever-0.16.1/LICENSE-APACHE 0100644 0001750 0001750 00000025137 13554521475 0013536 0 ustar 00 0000000 0000000 Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
xml5ever-0.16.1/LICENSE-MIT 0100644 0001750 0001750 00000002064 13554521475 0013240 0 ustar 00 0000000 0000000 Copyright (c) 2014 The html5ever Project Developers
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
xml5ever-0.16.1/README.md 0100644 0001750 0001750 00000005722 13554521475 0013067 0 ustar 00 0000000 0000000 # xml5ever
 
[](https://docs.rs/xml5ever)
[](https://crates.io/crates/xml5ever)
[API documentation](https://Ygg01.github.io/docs/xml5ever/xml5ever/index.html)
**Warning:** This library is alpha quality, so no guarantees are given.
This crate provides a push based XML parser library that trades well-formedness for error recovery.
xml5ever is based largely on [html5ever](https://github.com/servo/html5ever) parser, so if you have experience with html5ever you will be familiar with xml5ever.
The library is dual licensed under MIT and Apache license.
# Why you should use xml5ever
Main use case for this library is when XML is badly formatted, usually from bad XML
templates. XML5 tries to handle most common errors, in a manner similar to HTML5.
## When you should use it
- You aren't interested in well-formed documents.
- You need to get some info from your data even if it has errors (although not all possible errors are handled).
- You want features like character references or XML namespaces.
## When you shouldn't use it
- You need to have your document validated.
- You require DTD support.
- You require an easy to use parser, with lots of extensions (e.g. XPath, XQuery).
- You require a battle tested, industry proven solution.
# Installation
Add xml5ever as a dependency in your project manifest.
```toml
[dependencies]
xml5ever = "0.1.3"
```
And add crate declaration in your lib.rs
```rust
extern crate xml5ever;
```
# Getting started
Here is a very simple RcDom backed parser:
```rust
let input = "<xml></xml>".to_tendril();
// To parse XML into a tree form, we need a TreeSink
// luckily xml5ever comes with a static RC backed tree representation.
let dom: RcDom = parse(std::iter::once(input), Default::default());
// Do something with dom
```
The thing that does actual parsing is the `parse` function. It expects an iterator that can be converted into `StrTendril`, so you can use `std::iter::once(input)` or `Some(input).into_iter()` (where `input` is `StrTendril` like structure).
# Working on xml5ever
To build examples and tests you need to do something along the lines of:
```bash
git submodule update --init # to fetch xml5lib-tests
cargo build
cargo test
```
This will fetch tests from outside repository and it will invoke cargo to
build and test the crate. If you need docs check out either [API docs](https://ygg01.github.io/docs/xml5ever/xml5ever/index.html) or run `cargo doc`
to generate documentation.
## Easy first tasks
What I generally recommend is to look at Clippy Linting badge results and create
a PR for fixing the said lints. Other than that try to look for any tasks labeled
easy or just update docs/examples.
xml5ever-0.16.1/benches/xml5ever.rs 0100644 0001750 0001750 00000004523 13554521475 0015332 0 ustar 00 0000000 0000000 #[macro_use]
extern crate criterion;
extern crate markup5ever;
extern crate xml5ever;
use std::fs;
use std::path::PathBuf;
use criterion::{black_box, Criterion};
use markup5ever::buffer_queue::BufferQueue;
use xml5ever::tendril::*;
use xml5ever::tokenizer::{Token, TokenSink, XmlTokenizer};
/// A no-op token sink used purely to drive the tokenizer in benchmarks.
struct Sink;

impl TokenSink for Sink {
    /// Receives each token the tokenizer produces and immediately discards it.
    fn process_token(&mut self, token: Token) {
        // Don't use the token, but make sure we don't get
        // optimized out entirely.
        black_box(token);
    }
}
/// Benchmark tokenizing the file `data/bench/<name>`.
///
/// The file is read once, split into ~1 kB chunks (to simulate reading
/// from the network), and a fresh `XmlTokenizer` consumes all chunks on
/// every benchmark iteration.
fn run_bench(c: &mut Criterion, name: &str) {
    let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    path.push("data/bench/");
    path.push(name);
    // `.expect` replaces the deprecated `.ok().expect(..)` chain.
    let mut file = fs::File::open(&path).expect("can't open file");

    // Read the file and treat it as an infinitely repeating sequence of characters.
    let mut file_input = ByteTendril::new();
    file.read_to_tendril(&mut file_input)
        .expect("can't read file");
    let file_input: StrTendril = file_input.try_reinterpret().unwrap();
    let size = file_input.len();
    let mut stream = file_input.chars().cycle();

    // Break the input into chunks of 1024 chars (= a few kB).
    // This simulates reading from the network.
    let mut input = vec![];
    let mut total = 0usize;
    while total < size {
        // The by_ref() call is important, otherwise we get wrong results!
        // See rust-lang/rust#18045.
        let sz = std::cmp::min(1024, size - total);
        // Restored turbofish target type lost in extraction: collect::<String>().
        input.push(stream.by_ref().take(sz).collect::<String>().to_tendril());
        total += sz;
    }

    let test_name = format!("xml tokenizing {}", name);
    c.bench_function(&test_name, move |b| {
        b.iter(|| {
            let mut tok = XmlTokenizer::new(Sink, Default::default());
            let mut buffer = BufferQueue::new();
            // We are doing clone inside the bench function, this is not ideal, but possibly
            // necessary since our iterator consumes the underlying buffer.
            for buf in input.clone().into_iter() {
                buffer.push_back(buf);
                let _ = tok.feed(&mut buffer);
            }
            let _ = tok.feed(&mut buffer);
            tok.end();
        })
    });
}
/// Registers all xml5ever benchmarks with criterion.
fn xml5ever_benchmarks(c: &mut Criterion) {
    run_bench(c, "strong.xml");
}
criterion_group!(benches, xml5ever_benchmarks);
criterion_main!(benches);
xml5ever-0.16.1/data/bench/strong.xml 0100644 0001750 0001750 00000002000 13442307261 0015607 0 ustar 00 0000000 0000000 xml5ever-0.16.1/examples/README.md 0100644 0001750 0001750 00000020145 13442307261 0014670 0 ustar 00 0000000 0000000 # Examples
The examples have been designed with [`cargo-script`](https://github.com/DanielKeep/cargo-script) in mind.
Here I'll just give broad overview how to install [`cargo script`] for Rust 1.5. For more details, check out [cargo-script repository](https://github.com/DanielKeep/cargo-script).
cargo install cargo-script
# Token printer
The basis of xml5ever is its tokenizer and tree builder. Roughly speaking tokenizer
takes input and returns a set of tokens like comment, processing instruction, start
tag, end tag, etc.
First let's define our dependencies:
```toml
[dependencies]
xml5ever = "0.2.0"
tendril = "0.1.3"
```
With dependencies declared, we can now make a simple tokenizer sink. First step is to
define a [`TokenSink`](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/trait.TokenSink.html). [`TokenSink`](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/trait.TokenSink.html) are traits that received stream of [`Tokens`](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/enum.Token.html).
In our case we'll define a unit struct (i.e. a struct without any fields).
```rust
struct SimpleTokenPrinter;
```
To make `SimpleTokenPrinter` a [`TokenSink`](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/trait.TokenSink.html), we need to implement [process_token](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/trait.TokenSink.html#tymethod.process_token) method.
```rust
impl TokenSink for SimpleTokenPrinter {
fn process_token(&mut self, token: Token) {
match token {
CharacterTokens(b) => {
println!("TEXT: {}", &*b);
},
NullCharacterToken => print!("NULL"),
TagToken(tag) => {
println!("{:?} {} ", tag.kind, &*tag.name.local);
},
ParseError(err) => {
println!("ERROR: {}", err);
},
PIToken(Pi{ref target, ref data}) => {
println!("PI : {} {}?>", &*target, &*data);
},
CommentToken(ref comment) => {
println!("", &*comment);
},
EOFToken => {
println!("EOF");
},
DoctypeToken(Doctype{ref name, ref public_id, ..}) => {
println!("", &*name, &*public_id);
}
}
}
}
```
Now, we need some input to process. For input we'll use `stdin`. However, xml5ever `tokenize_to` method only takes `StrTendril`. So we need to construct a
[`ByteTendril`](https://doc.servo.org/tendril/type.ByteTendril.html) using `ByteTendril::new()`, then read the `stdin` using [`read_to_tendril`](https://doc.servo.org/tendril/trait.ReadExt.html#tymethod.read_to_tendril) extension.
Once that is set, to make `SimpleTokenPrinter` parse the input, call,
`tokenize_to` with it as the first parameter, input wrapped in Option for the second parameter, and `XmlTokenizerOpts` for the third.
```rust
fn main() {
let sink = SimpleTokenPrinter;
// We need a ByteTendril to read a file
let mut input = ByteTendril::new();
// Using SliceExt.read_to_tendril we read stdin
io::stdin().read_to_tendril(&mut input).unwrap();
// For xml5ever we need StrTendril, so we reinterpret it
// into StrTendril.
//
// You might wonder, how does `try_reinterpret` know we
// need StrTendril and the answer is type inference based
// on `tokenize_xml_to` signature.
let input = input.try_reinterpret().unwrap();
// Here we create and run tokenizer
let mut tok = XmlTokenizer::new(sink, Default::default());
// We pass input to parsed.
tok.feed(input);
// tok.end must be invoked for final bytes to be processed.
tok.end();
}
```
NOTE: `unwrap` causes panic, it's only OK to use in simple examples.
For full source code check out: [`examples/simple_xml_tokenizer.rs`](https://github.com/Ygg01/xml5ever/blob/master/examples/simple_xml_tokenizer.rs)
Once we have successfully compiled the example we run the example with inline
xml
```bash
cargo script simple_xml_tokenizer.rs <<< "<xml>Text with <bold>bold words</bold>!</xml>"
```
or by sending an [`examples/example.xml`](https://github.com/Ygg01/xml5ever/blob/master/examples/example.xml) located in same folder as examples.
```bash
cargo script simple_xml_tokenizer.rs < example.xml
```
# Tree printer
To actually get an XML document tree from the xml5ever, you need to use a `TreeSink`.
`TreeSink` is in many way similar to the TokenSink. Basically, TokenSink takes data
and returns list of tokens, while TreeSink takes tokens and returns a tree of parsed
XML document. Do note, that this is a simplified explanation and consult
documentation for more info.
Ok, with that in mind, let's build us a TreePrinter. For example if we get an XML
file like:
```xml
<student>
    <first-name>Bobby</first-name>
    <last-name>Tables</last-name>
</student>
```
We'd want a structure similar to this:
```
#document
student
first-name
#text Bobby
last-name
#text Tables
```
We won't print anything other than element names and text fields. So comments,
doctypes and other such elements are ignored.
First part is similar to making SimpleTokenPrinter:
```rust
// We need to allocate an input tendril for xml5ever
let mut input = ByteTendril::new();
// Using SliceExt.read_to_tendril functions we can read stdin
io::stdin().read_to_tendril(&mut input).unwrap();
let input = input.try_reinterpret().unwrap();
```
This time, we need an implementation of [`TreeSink`](https://ygg01.github.io/docs/xml5ever/xml5ever/tree_builder/interface/trait.TreeSink.html). xml5ever comes with a
built-in `TreeSink` implementation called [`RcDom`](https://ygg01.github.io/docs/xml5ever/xml5ever/rcdom/struct.RcDom.html). To process input into
a `TreeSink` we use the following line:
```rust
let dom: RcDom = parse(one_input(input), Default::default());
```
Let's analyze it a bit. First there is `let dom: RcDom`. We need this part,
because the type inferencer can't infer which TreeSink implementation we mean
in this scenario.
Function [`one_input`](https://ygg01.github.io/docs/xml5ever/xml5ever/fn.one_input.html) is a convenience function that turns any value into an iterator. In this case
it converts a StrTendril into an Iterator over itself.
Ok, so now that we parsed our tree what with it? Well, for that we might need some
kind of function that will help us traverse it. We shall call that function `walk`.
```rust
fn walk(prefix: &str, handle: Handle) {
let node = handle.borrow();
// We print out the prefix before we start
print!("{}", prefix);
// We are only interested in following nodes:
// Document, Text and Element, so our match
// reflects that.
match node.node {
Document
=> println!("#document"),
Text(ref text) => {
println!("#text {}", escape_default(text))
},
Element(ref name, _) => {
println!("{}", name.local);
},
_ => {},
}
// We increase indent in child nodes
let new_indent = {
let mut temp = String::new();
temp.push_str(prefix);
temp.push_str(" ");
temp
};
for child in node.children.iter()
// In order to avoid weird indentation, we filter
// only Text/Element nodes.
// We don't need to filter Document since its guaranteed
// child elements don't contain documents
.filter(|child| match child.borrow().node {
Text(_) | Element (_, _) => true,
_ => false,
}
) {
// Recursion - Yay!
walk(&new_indent, child.clone());
}
}
```
For full source code check out: [`examples/xml_tree_printer.rs`](https://github.com/Ygg01/xml5ever/blob/master/examples/xml_tree_printer.rs)
xml5ever-0.16.1/examples/example.xml 0100644 0001750 0001750 00000000242 13442307261 0015562 0 ustar 00 0000000 0000000
<student>
    <first-name>Bobby</first-name>
    <last-name>Tables</last-name>
</student>
xml5ever-0.16.1/examples/simple_xml_tokenizer.rs 0100644 0001750 0001750 00000004704 13554522172 0020231 0 ustar 00 0000000 0000000 #!/usr/bin/env run-cargo-script
//! This is a regular crate doc comment, but it also contains a partial
//! Cargo manifest. Note the use of a *fenced* code block, and the
//! `cargo` "language".
//!
//! ```cargo
//! [dependencies]
//! xml5ever = "0.1.1"
//! tendril = "0.1.3"
//! markup5ever = "0.7.4"
//! ```
extern crate markup5ever;
extern crate xml5ever;
use std::default::Default;
use std::io;
use markup5ever::buffer_queue::BufferQueue;
use xml5ever::tendril::{ByteTendril, ReadExt};
use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken};
use xml5ever::tokenizer::{CommentToken, PIToken, Pi};
use xml5ever::tokenizer::{Doctype, DoctypeToken, EOFToken};
use xml5ever::tokenizer::{ParseError, Token, TokenSink, XmlTokenizer};
/// A minimal `TokenSink` that prints a one-line description of every
/// token it receives.
struct SimpleTokenPrinter;

impl TokenSink for SimpleTokenPrinter {
    /// Prints each token kind with its payload.
    ///
    /// NOTE(review): the markup-like format strings below (`<?…?>`,
    /// `<!--…-->`, `<!DOCTYPE …>`) were reconstructed — the published
    /// source lost them to `<…>` stripping, which left `println!` calls
    /// with arguments but no placeholders (a compile error).
    fn process_token(&mut self, token: Token) {
        match token {
            CharacterTokens(b) => {
                println!("TEXT: {}", &*b);
            },
            NullCharacterToken => print!("NULL"),
            TagToken(tag) => {
                println!("{:?} {} ", tag.kind, &*tag.name.local);
            },
            ParseError(err) => {
                println!("ERROR: {}", err);
            },
            PIToken(Pi {
                ref target,
                ref data,
            }) => {
                // Processing instruction, e.g. <?target data?>.
                println!("PI : <?{} {}?>", &*target, &*data);
            },
            CommentToken(ref comment) => {
                println!("<!--{}-->", &*comment);
            },
            EOFToken => {
                println!("EOF");
            },
            DoctypeToken(Doctype {
                ref name,
                ref public_id,
                ..
            }) => {
                println!("<!DOCTYPE {} {}>", &*name, &*public_id);
            },
        }
    }
}
/// Reads XML from stdin and prints every token via `SimpleTokenPrinter`.
fn main() {
    // Our implementation of TokenSink
    let sink = SimpleTokenPrinter;

    // We need a ByteTendril to read a file
    let mut input = ByteTendril::new();
    // Using SliceExt.read_to_tendril we can read stdin
    io::stdin().read_to_tendril(&mut input).unwrap();
    // For xml5ever we need StrTendril, so we reinterpret it
    // into StrTendril.

    // Load input into BufferQueue
    let mut input_buffer = BufferQueue::new();
    input_buffer.push_back(input.try_reinterpret().unwrap());

    // Here we create and run tokenizer
    let mut tok = XmlTokenizer::new(sink, Default::default());
    tok.feed(&mut input_buffer);
    // tok.end must be invoked for the final bytes to be processed.
    tok.end();
}
xml5ever-0.16.1/examples/xml_tokenizer.rs 0100644 0001750 0001750 00000006541 13554522172 0016661 0 ustar 00 0000000 0000000 #!/usr/bin/env run-cargo-script
//! This is a regular crate doc comment, but it also contains a partial
//! Cargo manifest. Note the use of a *fenced* code block, and the
//! `cargo` "language".
//!
//! ```cargo
//! [dependencies]
//! xml5ever = "0.2.0"
//! tendril = "0.1.3"
//! markup5ever = "0.7.4"
//! ```
extern crate markup5ever;
extern crate xml5ever;
use std::default::Default;
use std::io;
use markup5ever::buffer_queue::BufferQueue;
use xml5ever::tendril::{ByteTendril, ReadExt};
use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken};
use xml5ever::tokenizer::{EmptyTag, EndTag, ShortTag, StartTag};
use xml5ever::tokenizer::{PIToken, Pi};
use xml5ever::tokenizer::{ParseError, Token, TokenSink, XmlTokenizer, XmlTokenizerOpts};
/// Prints tokens in a colourful, human-readable form, coalescing
/// consecutive character tokens into a single quoted run.
#[derive(Copy, Clone)]
struct TokenPrinter {
    // True while we are in the middle of printing a quoted character run.
    in_char_run: bool,
}

impl TokenPrinter {
    /// Opens or closes the quoted character run when the token kind
    /// changes (non-char -> char prints the opening quote, char ->
    /// non-char prints the closing one).
    fn is_char(&mut self, is_char: bool) {
        match (self.in_char_run, is_char) {
            (false, true) => print!("CHAR : \""),
            (true, false) => println!("\""),
            _ => (),
        }
        self.in_char_run = is_char;
    }

    /// Prints a single character, escaped, inside the current run.
    fn do_char(&mut self, c: char) {
        self.is_char(true);
        // Restored turbofish lost in extraction: `escape_default()` yields
        // an iterator of chars, which we collect into a String to print.
        print!("{}", c.escape_default().collect::<String>());
    }
}
impl TokenSink for TokenPrinter {
    /// Prints a coloured description of each token, grouping character
    /// tokens into quoted runs via `is_char`/`do_char`.
    fn process_token(&mut self, token: Token) {
        match token {
            CharacterTokens(b) => {
                for c in b.chars() {
                    self.do_char(c);
                }
            },
            NullCharacterToken => self.do_char('\0'),
            TagToken(tag) => {
                self.is_char(false);
                // This is not proper HTML serialization, of course.
                match tag.kind {
                    StartTag => print!("TAG : <\x1b[32m{}\x1b[0m", tag.name.local),
                    EndTag => print!("END TAG : <\x1b[31m/{}\x1b[0m", tag.name.local),
                    ShortTag => print!("Short TAG : <\x1b[31m/{}\x1b[0m", tag.name.local),
                    EmptyTag => print!("Empty TAG : <\x1b[31m{}\x1b[0m", tag.name.local),
                }
                for attr in tag.attrs.iter() {
                    print!(
                        " \x1b[36m{}\x1b[0m='\x1b[34m{}\x1b[0m'",
                        attr.name.local, attr.value
                    );
                }
                if tag.kind == EmptyTag {
                    print!("/");
                }
                println!(">");
            },
            ParseError(err) => {
                self.is_char(false);
                println!("ERROR: {}", err);
            },
            PIToken(Pi { target, data }) => {
                self.is_char(false);
                // Restored the leading `<?` lost in extraction so the
                // output reads as a processing instruction: <?target data?>.
                println!("PI : <?{:?} {:?}?>", target, data);
            },
            _ => {
                self.is_char(false);
                println!("OTHER: {:?}", token);
            },
        }
    }
}
/// Reads XML from stdin and pretty-prints every token with
/// `TokenPrinter`, with profiling and exact error reporting enabled.
fn main() {
    let mut sink = TokenPrinter { in_char_run: false };
    let mut input = ByteTendril::new();
    io::stdin().read_to_tendril(&mut input).unwrap();
    let mut input_buffer = BufferQueue::new();
    input_buffer.push_back(input.try_reinterpret().unwrap());
    // TokenPrinter is Copy, so the tokenizer receives its own copy of
    // `sink`; our local `sink` stays usable after tokenizing.
    let mut tok = XmlTokenizer::new(
        sink,
        XmlTokenizerOpts {
            profile: true,
            exact_errors: true,
            ..Default::default()
        },
    );
    tok.feed(&mut input_buffer);
    tok.end();
    // Close any character run left open by the final token.
    sink.is_char(false);
}
xml5ever-0.16.1/src/driver.rs 0100644 0001750 0001750 00000005264 13554522202 0014226 0 ustar 00 0000000 0000000 // Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use crate::tokenizer::{XmlTokenizer, XmlTokenizerOpts};
use crate::tree_builder::{TreeSink, XmlTreeBuilder, XmlTreeBuilderOpts};
use std::borrow::Cow;
use markup5ever::buffer_queue::BufferQueue;
use crate::tendril;
use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder};
use crate::tendril::StrTendril;
/// All-encompassing parser setting structure.
///
/// Bundles the options for both parsing stages: tokenizing and tree
/// building.
#[derive(Clone, Default)]
pub struct XmlParseOpts {
    /// Xml tokenizer options.
    pub tokenizer: XmlTokenizerOpts,
    /// Xml tree builder options.
    pub tree_builder: XmlTreeBuilderOpts,
}
/// Parse and send results to a `TreeSink`.
///
/// ## Example
///
/// ```ignore
/// let mut sink = MySink;
/// parse_document(&mut sink, iter::once(my_str), Default::default());
/// ```
// NOTE(review): the `<Sink>` generic parameters below were restored —
// extraction stripped every `<…>` span, leaving `Sink` unresolved.
pub fn parse_document<Sink>(sink: Sink, opts: XmlParseOpts) -> XmlParser<Sink>
where
    Sink: TreeSink,
{
    let tb = XmlTreeBuilder::new(sink, opts.tree_builder);
    let tok = XmlTokenizer::new(tb, opts.tokenizer);
    XmlParser {
        tokenizer: tok,
        input_buffer: BufferQueue::new(),
    }
}
/// An XML parser,
/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
pub struct XmlParser
where
Sink: TreeSink,
{
/// Tokenizer used by XmlParser.
pub tokenizer: XmlTokenizer>,
/// Input used by XmlParser.
pub input_buffer: BufferQueue,
}
impl TendrilSink for XmlParser {
type Output = Sink::Output;
fn process(&mut self, t: StrTendril) {
self.input_buffer.push_back(t);
self.tokenizer.feed(&mut self.input_buffer);
}
// FIXME: Is it too noisy to report every character decoding error?
fn error(&mut self, desc: Cow<'static, str>) {
self.tokenizer.sink.sink.parse_error(desc)
}
fn finish(mut self) -> Self::Output {
self.tokenizer.end();
self.tokenizer.sink.sink.finish()
}
}
impl XmlParser {
/// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
///
/// Use this when your input is bytes that are known to be in the UTF-8 encoding.
/// Decoding is lossy, like `String::from_utf8_lossy`.
pub fn from_utf8(self) -> Utf8LossyDecoder {
Utf8LossyDecoder::new(self)
}
}
xml5ever-0.16.1/src/lib.rs 0100644 0001750 0001750 00000003576 13554521475 0013520 0 ustar 00 0000000 0000000 // Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! This crate provides a push based XML parser library that
//! adheres to XML5 specification. In other words this library
//! trades well-formedness for error recovery.
//!
//! The idea behind this was to minimize the number of errors from
//! tools that generate XML: malformed markup won't simply be returned
//! as raw text, but will be recovered into a best-effort parse tree.
//! You can check out full specification [here](https://ygg01.github.io/xml5_draft/).
//!
//! What this library provides is a solid XML parser that can:
//!
//! * Parse somewhat erroneous XML input
//! * Provide support for [Numeric character references](https://en.wikipedia.org/wiki/Numeric_character_reference).
//! * Provide partial [XML namespace](http://www.w3.org/TR/xml-names11/) support.
//! * Provide full set of SVG/MathML entities
//!
//! What isn't in scope for this library:
//!
//! * Document Type Definition parsing - this is pretty hard to do right, and
//!   nowadays it's used mostly to declare additional entities, which is out of
//!   scope for this parser.
//!
#![crate_name = "xml5ever"]
#![crate_type = "dylib"]
#![deny(missing_docs)]
pub use markup5ever::*;
// Measures the wall-clock time of evaluating `$e` in nanoseconds.
// Expands to a block yielding the tuple `(result, elapsed_ns)`.
// Used by the tokenizer when the `profile` option is enabled.
macro_rules! time {
    ($e:expr) => {{
        let t0 = ::time::precise_time_ns();
        let result = $e;
        let dt = ::time::precise_time_ns() - t0;
        (result, dt)
    }};
}
mod util;
/// Driver
pub mod driver;
/// Serializer for XML5.
pub mod serialize;
/// XML5 tokenizer - converts input into tokens
pub mod tokenizer;
/// XML5 tree builder - converts tokens into a tree like structure
pub mod tree_builder;
xml5ever-0.16.1/src/serialize/mod.rs 0100644 0001750 0001750 00000015676 13554522202 0015511 0 ustar 00 0000000 0000000 // Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
pub use markup5ever::serialize::{AttrRef, Serialize, Serializer, TraversalScope};
use std::io::{self, Write};
use crate::tree_builder::NamespaceMap;
use crate::QualName;
/// Struct for setting serializer options.
#[derive(Clone)]
pub struct SerializeOpts {
    /// Serialize the root node? Default: ChildrenOnly
    pub traversal_scope: TraversalScope,
}
impl Default for SerializeOpts {
    /// By default only the children of the root node are serialized,
    /// not the root node itself.
    fn default() -> SerializeOpts {
        let scope = TraversalScope::ChildrenOnly(None);
        SerializeOpts {
            traversal_scope: scope,
        }
    }
}
/// Method for serializing generic node to a given writer.
///
/// Constructs an `XmlSerializer` over `writer` and asks `node` to
/// serialize itself into it with the requested traversal scope.
pub fn serialize<Wr, T>(writer: Wr, node: &T, opts: SerializeOpts) -> io::Result<()>
where
    Wr: Write,
    T: Serialize,
{
    let mut ser = XmlSerializer::new(writer);
    node.serialize(&mut ser, opts.traversal_scope)
}
/// Struct used for serializing nodes into a text that other XML
/// parses can read.
///
/// Serializer contains a set of functions (start_elem, end_elem...)
/// that make parsing nodes easier.
pub struct XmlSerializer {
writer: Wr,
namespace_stack: NamespaceMapStack,
}
#[derive(Debug)]
struct NamespaceMapStack(Vec);
impl NamespaceMapStack {
fn new() -> NamespaceMapStack {
NamespaceMapStack(vec![])
}
fn push(&mut self, namespace: NamespaceMap) {
self.0.push(namespace);
}
fn pop(&mut self) {
self.0.pop();
}
}
/// Writes given text into the writer, escaping it
/// depending on whether the text sits in element content
/// (`attr_mode == false`) or inside an attribute value (`attr_mode == true`).
///
/// For example
/// ```text
///    '&-quotes'  becomes '&amp;-quotes'
///    <hello>     becomes &lt;hello&gt;
/// ```
fn write_to_buf_escaped<W: Write>(writer: &mut W, text: &str, attr_mode: bool) -> io::Result<()> {
    for c in text.chars() {
        match c {
            // '&' must always be escaped, in both contexts.
            '&' => writer.write_all(b"&amp;"),
            // Quotes only need escaping inside attribute values.
            '\'' if attr_mode => writer.write_all(b"&apos;"),
            '"' if attr_mode => writer.write_all(b"&quot;"),
            // Angle brackets only need escaping in element content.
            '<' if !attr_mode => writer.write_all(b"&lt;"),
            '>' if !attr_mode => writer.write_all(b"&gt;"),
            c => writer.write_fmt(format_args!("{}", c)),
        }?;
    }
    Ok(())
}
/// Writes a qualified name to the writer as `prefix:local`, or just
/// `local` when the name has no prefix.
#[inline]
fn write_qual_name<W: Write>(writer: &mut W, name: &QualName) -> io::Result<()> {
    if let Some(ref prefix) = name.prefix {
        writer.write_all(&prefix.as_bytes())?;
        writer.write_all(b":")?;
        writer.write_all(&*name.local.as_bytes())?;
    } else {
        writer.write_all(&*name.local.as_bytes())?;
    }
    Ok(())
}
impl XmlSerializer {
/// Creates a new Serializier from a writer and given serialization options.
pub fn new(writer: Wr) -> Self {
XmlSerializer {
writer: writer,
namespace_stack: NamespaceMapStack::new(),
}
}
#[inline(always)]
fn qual_name(&mut self, name: &QualName) -> io::Result<()> {
self.find_or_insert_ns(name);
write_qual_name(&mut self.writer, name)
}
#[inline(always)]
fn qual_attr_name(&mut self, name: &QualName) -> io::Result<()> {
self.find_or_insert_ns(name);
write_qual_name(&mut self.writer, name)
}
fn find_uri(&self, name: &QualName) -> bool {
let mut found = false;
for stack in self.namespace_stack.0.iter().rev() {
if let Some(&Some(ref el)) = stack.get(&name.prefix) {
found = *el == name.ns;
break;
}
}
found
}
fn find_or_insert_ns(&mut self, name: &QualName) {
if name.prefix.is_some() || &*name.ns != "" {
if !self.find_uri(name) {
if let Some(last_ns) = self.namespace_stack.0.last_mut() {
last_ns.insert(name);
}
}
}
}
}
impl Serializer for XmlSerializer {
/// Serializes given start element into text. Start element contains
/// qualified name and an attributes iterator.
fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()>
where
AttrIter: Iterator>,
{
self.namespace_stack.push(NamespaceMap::empty());
self.writer.write_all(b"<")?;
self.qual_name(&name)?;
if let Some(current_namespace) = self.namespace_stack.0.last() {
for (prefix, url_opt) in current_namespace.get_scope_iter() {
self.writer.write_all(b" xmlns")?;
if let &Some(ref p) = prefix {
self.writer.write_all(b":")?;
self.writer.write_all(&*p.as_bytes())?;
}
self.writer.write_all(b"=\"")?;
let url = if let &Some(ref a) = url_opt {
a.as_bytes()
} else {
b""
};
self.writer.write_all(url)?;
self.writer.write_all(b"\"")?;
}
}
for (name, value) in attrs {
self.writer.write_all(b" ")?;
self.qual_attr_name(&name)?;
self.writer.write_all(b"=\"")?;
write_to_buf_escaped(&mut self.writer, value, true)?;
self.writer.write_all(b"\"")?;
}
self.writer.write_all(b">")?;
Ok(())
}
/// Serializes given end element into text.
fn end_elem(&mut self, name: QualName) -> io::Result<()> {
self.namespace_stack.pop();
self.writer.write_all(b"")?;
self.qual_name(&name)?;
self.writer.write_all(b">")
}
/// Serializes comment into text.
fn write_comment(&mut self, text: &str) -> io::Result<()> {
self.writer.write_all(b"")
}
/// Serializes given doctype
fn write_doctype(&mut self, name: &str) -> io::Result<()> {
self.writer.write_all(b"")
}
/// Serializes text for a node or an attributes.
fn write_text(&mut self, text: &str) -> io::Result<()> {
write_to_buf_escaped(&mut self.writer, text, false)
}
/// Serializes given processing instruction.
fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> {
self.writer.write_all(b"")?;
self.writer.write_all(target.as_bytes())?;
self.writer.write_all(b" ")?;
self.writer.write_all(data.as_bytes())?;
self.writer.write_all(b"?>")
}
}
xml5ever-0.16.1/src/tokenizer/char_ref/mod.rs 0100644 0001750 0001750 00000034776 13554522202 0017307 0 ustar 00 0000000 0000000 // Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::{TokenSink, XmlTokenizer};
use crate::data;
use log::debug;
use mac::{format_if, unwrap_or_return};
use markup5ever::buffer_queue::BufferQueue;
use std::borrow::Cow::Borrowed;
use std::char::from_u32;
use crate::tendril::StrTendril;
use crate::util::is_ascii_alnum;
use self::State::*;
pub use self::Status::*;
//§ tokenizing-character-references
/// The decoded form of one character reference: at most two characters.
pub struct CharRef {
    /// The resulting character(s)
    pub chars: [char; 2],
    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}
/// Outcome of one `step` of the character-reference tokenizer.
pub enum Status {
    /// More input is needed before any progress can be made.
    Stuck,
    /// Consumed some input; call `step` again.
    Progress,
    /// Finished; the result can be retrieved with `get_result`.
    Done,
}
/// Internal states of the character-reference state machine.
#[derive(Debug)]
enum State {
    /// Just after the `&`.
    Begin,
    /// Just after `&#`.
    Octothorpe,
    /// Consuming digits.
    Numeric(u32), // base
    /// Digits consumed; expecting `;`.
    NumericSemicolon,
    /// Matching a named entity.
    Named,
    /// Consuming an invalid name to decide whether to report an error.
    BogusName,
}
pub struct CharRefTokenizer {
state: State,
addnl_allowed: Option,
result: Option,
num: u32,
num_too_big: bool,
seen_digit: bool,
hex_marker: Option,
name_buf_opt: Option,
name_match: Option<(u32, u32)>,
name_len: usize,
}
impl CharRefTokenizer {
// NB: We assume that we have an additional allowed character iff we're
// tokenizing in an attribute value.
pub fn new(addnl_allowed: Option) -> CharRefTokenizer {
CharRefTokenizer {
state: Begin,
addnl_allowed: addnl_allowed,
result: None,
num: 0,
num_too_big: false,
seen_digit: false,
hex_marker: None,
name_buf_opt: None,
name_match: None,
name_len: 0,
}
}
// A CharRefTokenizer can only tokenize one character reference,
// so this method consumes the tokenizer.
pub fn get_result(self) -> CharRef {
self.result.expect("get_result called before done")
}
fn name_buf<'t>(&'t self) -> &'t StrTendril {
self.name_buf_opt
.as_ref()
.expect("name_buf missing in named character reference")
}
fn name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril {
self.name_buf_opt
.as_mut()
.expect("name_buf missing in named character reference")
}
fn finish_none(&mut self) -> Status {
self.result = Some(CharRef {
chars: ['\0', '\0'],
num_chars: 0,
});
Done
}
fn finish_one(&mut self, c: char) -> Status {
self.result = Some(CharRef {
chars: [c, '\0'],
num_chars: 1,
});
Done
}
}
impl CharRefTokenizer {
pub fn step(
&mut self,
tokenizer: &mut XmlTokenizer,
input: &mut BufferQueue,
) -> Status {
if self.result.is_some() {
return Done;
}
debug!("char ref tokenizer stepping in state {:?}", self.state);
match self.state {
Begin => self.do_begin(tokenizer, input),
Octothorpe => self.do_octothorpe(tokenizer, input),
Numeric(base) => self.do_numeric(tokenizer, base, input),
NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
Named => self.do_named(tokenizer, input),
BogusName => self.do_bogus_name(tokenizer, input),
}
}
fn do_begin(
&mut self,
tokenizer: &mut XmlTokenizer,
input: &mut BufferQueue,
) -> Status {
match unwrap_or_return!(tokenizer.peek(input), Stuck) {
'\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
c if Some(c) == self.addnl_allowed => self.finish_none(),
'#' => {
tokenizer.discard_char(input);
self.state = Octothorpe;
Progress
},
_ => {
self.state = Named;
self.name_buf_opt = Some(StrTendril::new());
Progress
},
}
}
fn do_octothorpe(
&mut self,
tokenizer: &mut XmlTokenizer,
input: &mut BufferQueue,
) -> Status {
let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
match c {
'x' | 'X' => {
tokenizer.discard_char(input);
self.hex_marker = Some(c);
self.state = Numeric(16);
},
_ => {
self.hex_marker = None;
self.state = Numeric(10);
},
}
Progress
}
fn do_numeric(
&mut self,
tokenizer: &mut XmlTokenizer,
base: u32,
input: &mut BufferQueue,
) -> Status {
let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
match c.to_digit(base) {
Some(n) => {
tokenizer.discard_char(input);
self.num = self.num.wrapping_mul(base);
if self.num > 0x10FFFF {
// We might overflow, and the character is definitely invalid.
// We still parse digits and semicolon, but don't use the result.
self.num_too_big = true;
}
self.num = self.num.wrapping_add(n);
self.seen_digit = true;
Progress
},
None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
None => {
self.state = NumericSemicolon;
Progress
},
}
}
fn do_numeric_semicolon(
&mut self,
tokenizer: &mut XmlTokenizer,
input: &mut BufferQueue,
) -> Status {
match unwrap_or_return!(tokenizer.peek(input), Stuck) {
';' => tokenizer.discard_char(input),
_ => tokenizer.emit_error(Borrowed(
"Semicolon missing after numeric character reference",
)),
};
self.finish_numeric(tokenizer)
}
fn unconsume_numeric(
&mut self,
tokenizer: &mut XmlTokenizer,
input: &mut BufferQueue,
) -> Status {
let mut unconsume = StrTendril::from_char('#');
match self.hex_marker {
Some(c) => unconsume.push_char(c),
None => (),
}
tokenizer.unconsume(input, unconsume);
tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
self.finish_none()
}
fn finish_numeric(&mut self, tokenizer: &mut XmlTokenizer) -> Status {
fn conv(n: u32) -> char {
from_u32(n).expect("invalid char missed by error handling cases")
}
let (c, error) = match self.num {
n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
Some(c) => (c, true),
None => (conv(self.num), true),
},
0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),
n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),
n => (conv(n), false),
};
if error {
let msg = format_if!(
tokenizer.opts.exact_errors,
"Invalid numeric character reference",
"Invalid numeric character reference value 0x{:06X}",
self.num
);
tokenizer.emit_error(msg);
}
self.finish_one(c)
}
fn do_named(
&mut self,
tokenizer: &mut XmlTokenizer,
input: &mut BufferQueue,
) -> Status {
let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
self.name_buf_mut().push_char(c);
match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
// We have either a full match or a prefix of one.
Some(&m) => {
if m.0 != 0 {
// We have a full match, but there might be a longer one to come.
self.name_match = Some(m);
self.name_len = self.name_buf().len();
}
// Otherwise we just have a prefix match.
Progress
},
// Can't continue the match.
None => self.finish_named(tokenizer, Some(c), input),
}
}
fn emit_name_error(&mut self, tokenizer: &mut XmlTokenizer) {
let msg = format_if!(
tokenizer.opts.exact_errors,
"Invalid character reference",
"Invalid character reference &{}",
self.name_buf()
);
tokenizer.emit_error(msg);
}
fn unconsume_name(
&mut self,
tokenizer: &mut XmlTokenizer,
input: &mut BufferQueue,
) {
tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
}
fn finish_named(
&mut self,
tokenizer: &mut XmlTokenizer,
end_char: Option,
input: &mut BufferQueue,
) -> Status {
match self.name_match {
None => {
match end_char {
Some(c) if is_ascii_alnum(c) => {
// Keep looking for a semicolon, to determine whether
// we emit a parse error.
self.state = BogusName;
return Progress;
},
// Check length because &; is not a parse error.
Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
_ => (),
}
self.unconsume_name(tokenizer, input);
self.finish_none()
},
Some((c1, c2)) => {
// We have a complete match, but we may have consumed
// additional characters into self.name_buf. Usually
// at least one, but several in cases like
//
// ¬ => match for U+00AC
// ¬i => valid prefix for ¬in
// ¬it => can't continue match
let name_len = self.name_len;
assert!(name_len > 0);
let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
// There might not be a next character after the match, if
// we had a full match and then hit EOF.
let next_after = if name_len == self.name_buf().len() {
None
} else {
Some(self.name_buf()[name_len..].chars().next().unwrap())
};
// "If the character reference is being consumed as part of an
// attribute, and the last character matched is not a U+003B
// SEMICOLON character (;), and the next character is either a
// U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
// character, then, for historical reasons, all the characters
// that were matched after the U+0026 AMPERSAND character (&)
// must be unconsumed, and nothing is returned. However, if
// this next character is in fact a U+003D EQUALS SIGN
// character (=), then this is a parse error"
let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
(_, ';', _) => false,
(Some(_), _, Some('=')) => {
tokenizer.emit_error(Borrowed(
"Equals sign after character reference in attribute",
));
true
},
(Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
_ => {
tokenizer.emit_error(Borrowed(
"Character reference does not end with semicolon",
));
false
},
};
if unconsume_all {
self.unconsume_name(tokenizer, input);
self.finish_none()
} else {
tokenizer
.unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
self.result = Some(CharRef {
chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
num_chars: if c2 == 0 { 1 } else { 2 },
});
Done
}
},
}
}
fn do_bogus_name(
&mut self,
tokenizer: &mut XmlTokenizer,
input: &mut BufferQueue,
) -> Status {
let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
self.name_buf_mut().push_char(c);
match c {
_ if is_ascii_alnum(c) => return Progress,
';' => self.emit_name_error(tokenizer),
_ => (),
}
self.unconsume_name(tokenizer, input);
self.finish_none()
}
pub fn end_of_file(
&mut self,
tokenizer: &mut XmlTokenizer,
input: &mut BufferQueue,
) {
while self.result.is_none() {
match self.state {
Begin => drop(self.finish_none()),
Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
Numeric(_) | NumericSemicolon => {
tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
self.finish_numeric(tokenizer);
},
Named => drop(self.finish_named(tokenizer, None, input)),
BogusName => {
self.unconsume_name(tokenizer, input);
self.finish_none();
},
Octothorpe => {
tokenizer.unconsume(input, StrTendril::from_slice("#"));
tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
self.finish_none();
},
}
}
}
}
xml5ever-0.16.1/src/tokenizer/interface.rs 0100644 0001750 0001750 00000007704 13554521475 0016721 0 ustar 00 0000000 0000000 // Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::borrow::Cow;
use crate::tendril::StrTendril;
use crate::{Attribute, QualName};
pub use self::TagKind::{EmptyTag, EndTag, ShortTag, StartTag};
pub use self::Token::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
pub use self::Token::{CommentToken, DoctypeToken, PIToken, TagToken};
use super::states;
/// Tag kind denotes which kind of tag did we encounter.
#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)]
pub enum TagKind {
    /// Beginning of a tag (e.g. `<a>`).
    StartTag,
    /// End of a tag (e.g. `</a>`).
    EndTag,
    /// Empty tag (e.g. `<a/>`).
    EmptyTag,
    /// Short tag (e.g. `</>`).
    ShortTag,
}
/// XML 5 Tag Token
#[derive(PartialEq, Eq, Debug, Clone)]
pub struct Tag {
/// Token kind denotes which type of token was encountered.
/// E.g. if parser parsed `` the token kind would be `EndTag`.
pub kind: TagKind,
/// Qualified name of the tag.
pub name: QualName,
/// List of attributes attached to this tag.
/// Only valid in start and empty tag.
pub attrs: Vec,
}
impl Tag {
    /// Compares two tags for equivalence, ignoring the order in which
    /// their attributes appear.
    pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {
        if self.kind != other.kind || self.name != other.name {
            return false;
        }

        // Sort clones of both attribute lists so order doesn't matter.
        let mut lhs = self.attrs.clone();
        let mut rhs = other.attrs.clone();
        lhs.sort();
        rhs.sort();
        lhs == rhs
    }
}
/// A `DOCTYPE` token.
/// Doctype token in XML5 is rather limited for reasons, such as:
/// security and simplicity. XML5 only supports declaring DTD with
/// name, public identifier and system identifier
#[derive(PartialEq, Eq, Clone, Debug)]
pub struct Doctype {
/// Name of DOCTYPE declared
pub name: Option,
/// Public identifier of this DOCTYPE.
pub public_id: Option,
/// System identifier of this DOCTYPE.
pub system_id: Option,
}
impl Doctype {
/// Constructs an empty DOCTYPE, with all fields set to None.
pub fn new() -> Doctype {
Doctype {
name: None,
public_id: None,
system_id: None,
}
}
}
/// A ProcessingInstruction token.
#[derive(PartialEq, Eq, Clone, Debug)]
pub struct Pi {
    /// Name (target) of the processing instruction.
    pub target: StrTendril,
    /// Text of processing instruction.
    pub data: StrTendril,
}
/// Describes tokens encountered during parsing of input.
#[derive(PartialEq, Eq, Debug)]
pub enum Token {
    /// Doctype token
    DoctypeToken(Doctype),
    /// Tag token. This token applies to all
    /// possible kinds of tags (like start, end, empty tag, etc.).
    TagToken(Tag),
    /// Processing Instruction token
    PIToken(Pi),
    /// Comment token.
    CommentToken(StrTendril),
    /// Token that represents a series of characters.
    CharacterTokens(StrTendril),
    /// End of File found.
    EOFToken,
    /// NullCharacter encountered.
    NullCharacterToken,
    /// Error happened
    ParseError(Cow<'static, str>),
}
/// Types which can receive tokens from the tokenizer.
pub trait TokenSink {
/// Process a token.
fn process_token(&mut self, token: Token);
/// Signal to the sink that parsing has ended.
fn end(&mut self) {}
/// The tokenizer will call this after emitting any start tag.
/// This allows the tree builder to change the tokenizer's state.
/// By default no state changes occur.
fn query_state_change(&mut self) -> Option {
None
}
}
xml5ever-0.16.1/src/tokenizer/mod.rs 0100644 0001750 0001750 00000145076 13554522202 0015532 0 ustar 00 0000000 0000000 // Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
mod char_ref;
mod interface;
mod qname;
pub mod states;
pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken};
pub use self::interface::{CommentToken, DoctypeToken, PIToken, TagToken};
pub use self::interface::{Doctype, Pi};
pub use self::interface::{EmptyTag, EndTag, ShortTag, StartTag};
pub use self::interface::{ParseError, Tag, TagKind, Token, TokenSink};
pub use crate::{LocalName, Namespace, Prefix};
use log::debug;
use mac::{format_if, unwrap_or_return};
use markup5ever::{local_name, namespace_prefix, namespace_url, ns, small_char_set};
use std::borrow::Cow::{self, Borrowed};
use std::collections::BTreeMap;
use std::mem::replace;
use crate::tendril::StrTendril;
use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
use self::char_ref::{CharRef, CharRefTokenizer};
use self::qname::QualNameTokenizer;
use self::states::XmlState;
use self::states::{DoctypeKind, Public, System};
use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
/// Copy of Tokenizer options, with an impl for `Default`.
#[derive(Copy, Clone)]
pub struct XmlTokenizerOpts {
/// Report all parse errors described in the spec, at some
/// performance penalty? Default: false
pub exact_errors: bool,
/// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
/// of the stream? Default: true
pub discard_bom: bool,
/// Keep a record of how long we spent in each state? Printed
/// when `end()` is called. Default: false
pub profile: bool,
/// Initial state override. Only the test runner should use
/// a non-`None` value!
pub initial_state: Option,
}
/// Splits a raw tag name into a `QualName`, separating an optional
/// `prefix:` from the local part. The namespace URL is left empty here
/// and resolved later (see comment below).
fn process_qname(tag_name: StrTendril) -> QualName {
    // If tag name can't possibly contain full namespace, skip qualified name
    // parsing altogether. For a tag to have namespace it must look like:
    //     a:b
    // Since StrTendril are UTF-8, we know that minimal size in bytes must be
    // three bytes minimum.
    let split = if (&*tag_name).as_bytes().len() < 3 {
        None
    } else {
        QualNameTokenizer::new((&*tag_name).as_bytes()).run()
    };

    match split {
        None => QualName::new(None, ns!(), LocalName::from(&*tag_name)),
        Some(col) => {
            // `col` is presumably the byte index of the ':' separator
            // (see QualNameTokenizer) — everything before it is the prefix.
            let len = (&*tag_name).as_bytes().len() as u32;
            let prefix = tag_name.subtendril(0, col);
            let local = tag_name.subtendril(col + 1, len - col - 1);
            let ns = ns!(); // Actual namespace URL set in XmlTreeBuilder::bind_qname
            QualName::new(Some(Prefix::from(&*prefix)), ns, LocalName::from(&*local))
        },
    }
}
fn option_push(opt_str: &mut Option, c: char) {
match *opt_str {
Some(ref mut s) => s.push_char(c),
None => *opt_str = Some(StrTendril::from_char(c)),
}
}
impl Default for XmlTokenizerOpts {
    /// Defaults: cheap error messages, BOM discarded, no profiling,
    /// and the tokenizer's normal initial state.
    fn default() -> XmlTokenizerOpts {
        XmlTokenizerOpts {
            profile: false,
            exact_errors: false,
            discard_bom: true,
            initial_state: None,
        }
    }
}
/// The Xml tokenizer.
pub struct XmlTokenizer {
/// Options controlling the behavior of the tokenizer.
opts: XmlTokenizerOpts,
/// Destination for tokens we emit.
pub sink: Sink,
/// The abstract machine state as described in the spec.
state: states::XmlState,
/// Are we at the end of the file, once buffers have been processed
/// completely? This affects whether we will wait for lookahead or not.
at_eof: bool,
/// Tokenizer for character references, if we're tokenizing
/// one at the moment.
char_ref_tokenizer: Option>,
/// Current input character. Just consumed, may reconsume.
current_char: char,
/// Should we reconsume the current input character?
reconsume: bool,
/// Did we just consume \r, translating it to \n? In that case we need
/// to ignore the next character if it's \n.
ignore_lf: bool,
/// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
/// beginning of the stream.
discard_bom: bool,
/// Temporary buffer
temp_buf: StrTendril,
/// Current tag kind.
current_tag_kind: TagKind,
/// Current tag name.
current_tag_name: StrTendril,
/// Current tag attributes.
current_tag_attrs: Vec,
/// Current attribute name.
current_attr_name: StrTendril,
/// Current attribute value.
current_attr_value: StrTendril,
current_doctype: Doctype,
/// Current comment.
current_comment: StrTendril,
/// Current processing instruction target.
current_pi_target: StrTendril,
/// Current processing instruction value.
current_pi_data: StrTendril,
/// Record of how many ns we spent in each state, if profiling is enabled.
state_profile: BTreeMap,
/// Record of how many ns we spent in the token sink.
time_in_sink: u64,
}
impl XmlTokenizer {
/// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
pub fn new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer {
if opts.profile && cfg!(for_c) {
panic!("Can't profile tokenizer when built as a C library");
}
let state = *opts.initial_state.as_ref().unwrap_or(&states::Data);
let discard_bom = opts.discard_bom;
XmlTokenizer {
opts: opts,
sink: sink,
state: state,
char_ref_tokenizer: None,
at_eof: false,
current_char: '\0',
reconsume: false,
ignore_lf: false,
temp_buf: StrTendril::new(),
discard_bom: discard_bom,
current_tag_kind: StartTag,
current_tag_name: StrTendril::new(),
current_tag_attrs: vec![],
current_attr_name: StrTendril::new(),
current_attr_value: StrTendril::new(),
current_comment: StrTendril::new(),
current_pi_data: StrTendril::new(),
current_pi_target: StrTendril::new(),
current_doctype: Doctype::new(),
state_profile: BTreeMap::new(),
time_in_sink: 0,
}
}
/// Feed an input string into the tokenizer.
pub fn feed(&mut self, input: &mut BufferQueue) {
if input.is_empty() {
return;
}
if self.discard_bom {
if let Some(c) = input.peek() {
if c == '\u{feff}' {
input.next();
}
} else {
return;
}
};
self.run(input);
}
fn process_token(&mut self, token: Token) {
if self.opts.profile {
let (_, dt) = time!(self.sink.process_token(token));
self.time_in_sink += dt;
} else {
self.sink.process_token(token);
}
}
// Get the next input character, which might be the character
// 'c' that we already consumed from the buffers.
fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option {
if self.ignore_lf {
self.ignore_lf = false;
if c == '\n' {
c = unwrap_or_return!(input.next(), None);
}
}
if c == '\r' {
self.ignore_lf = true;
c = '\n';
}
// Normalize \x00 into \uFFFD
if c == '\x00' {
c = '\u{FFFD}'
}
// Exclude forbidden Unicode characters
if self.opts.exact_errors &&
match c as u32 {
0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
n if (n & 0xFFFE) == 0xFFFE => true,
_ => false,
}
{
let msg = format!("Bad character {}", c);
self.emit_error(Cow::Owned(msg));
}
debug!("got character {}", c);
self.current_char = c;
Some(c)
}
fn bad_eof_error(&mut self) {
let msg = format_if!(
self.opts.exact_errors,
"Unexpected EOF",
"Saw EOF in state {:?}",
self.state
);
self.emit_error(msg);
}
fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option {
// Bail to the slow path for various corner cases.
// This means that `FromSet` can contain characters not in the set!
// It shouldn't matter because the fallback `FromSet` case should
// always do the same thing as the `NotFromSet` case.
if self.opts.exact_errors || self.reconsume || self.ignore_lf {
return self.get_char(input).map(|x| FromSet(x));
}
let d = input.pop_except_from(set);
debug!("got characters {:?}", d);
match d {
Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(|x| FromSet(x)),
// NB: We don't set self.current_char for a run of characters not
// in the set. It shouldn't matter for the codepaths that use
// this.
_ => d,
}
}
// Check if the next characters are an ASCII case-insensitive match. See
// BufferQueue::eat.
//
// NB: this doesn't do input stream preprocessing or set the current input
// character.
fn eat(&mut self, input: &mut BufferQueue, pat: &str) -> Option {
input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
match input.eat(pat, u8::eq_ignore_ascii_case) {
None if self.at_eof => Some(false),
None => {
while let Some(c) = input.next() {
self.temp_buf.push_char(c);
}
None
},
Some(matched) => Some(matched),
}
}
/// Run the state machine for as long as we can.
pub fn run(&mut self, input: &mut BufferQueue) {
if self.opts.profile {
loop {
let state = self.state;
let old_sink = self.time_in_sink;
let (run, mut dt) = time!(self.step(input));
dt -= self.time_in_sink - old_sink;
let new = match self.state_profile.get_mut(&state) {
Some(x) => {
*x += dt;
false
},
None => true,
};
if new {
// do this here because of borrow shenanigans
self.state_profile.insert(state, dt);
}
if !run {
break;
}
}
} else {
while self.step(input) {}
}
}
//§ tokenization
// Get the next input character, if one is available.
fn get_char(&mut self, input: &mut BufferQueue) -> Option {
if self.reconsume {
self.reconsume = false;
Some(self.current_char)
} else {
input
.next()
.and_then(|c| self.get_preprocessed_char(c, input))
}
}
fn bad_char_error(&mut self) {
let msg = format_if!(
self.opts.exact_errors,
"Bad character",
"Saw {} in state {:?}",
self.current_char,
self.state
);
self.emit_error(msg);
}
/// Throw away the tag under construction: reset both its accumulated
/// attributes and its name to empty.
fn discard_tag(&mut self) {
    self.current_tag_attrs = Vec::new();
    self.current_tag_name = StrTendril::new();
}
/// Start building a fresh tag of the given kind, seeding its name with the
/// first character `c`. Any previously in-progress tag is discarded.
fn create_tag(&mut self, kind: TagKind, c: char) {
    self.discard_tag();
    self.current_tag_kind = kind;
    self.current_tag_name.push_char(c);
}
/// Start building a fresh processing-instruction token whose target begins
/// with the character `c`; the data buffer starts empty.
fn create_pi(&mut self, c: char) {
    self.current_pi_data = StrTendril::new();
    self.current_pi_target = StrTendril::new();
    self.current_pi_target.push_char(c);
}
/// Emit a single character token. A NUL byte is mapped to U+FFFD
/// REPLACEMENT CHARACTER before emission.
fn emit_char(&mut self, c: char) {
    let out = if c == '\0' { '\u{FFFD}' } else { c };
    self.process_token(CharacterTokens(StrTendril::from_char(out)));
}
/// Emit the pending tag as a short tag (`</>`): its name is cleared and its
/// kind forced to `ShortTag` before emission.
fn emit_short_tag(&mut self) {
    self.current_tag_name = StrTendril::new();
    self.current_tag_kind = ShortTag;
    self.emit_current_tag();
}
fn emit_empty_tag(&mut self) {
self.current_tag_kind = EmptyTag;
self.emit_current_tag();
}
// Mark the pending tag as an empty-element tag without emitting it yet.
fn set_empty_tag(&mut self) {
    self.current_tag_kind = EmptyTag;
}
// Emit the pending tag as a start tag.
fn emit_start_tag(&mut self) {
    self.current_tag_kind = StartTag;
    self.emit_current_tag();
}
/// Finalize and emit the tag under construction.
///
/// Completes the attribute in progress, processes the tag's qualified name,
/// reports attributes illegally present on end/short tags, emits the
/// `TagToken`, and finally applies any tokenizer state change requested by
/// the sink.
fn emit_current_tag(&mut self) {
    self.finish_attribute();

    let raw_name = replace(&mut self.current_tag_name, StrTendril::new());
    let qname = process_qname(raw_name);

    // Only start and empty-element tags may carry attributes.
    if !self.current_tag_attrs.is_empty() {
        match self.current_tag_kind {
            StartTag | EmptyTag => {},
            EndTag => self.emit_error(Borrowed("Attributes on an end tag")),
            ShortTag => self.emit_error(Borrowed("Attributes on a short tag")),
        }
    }

    self.process_token(TagToken(Tag {
        kind: self.current_tag_kind,
        name: qname,
        attrs: replace(&mut self.current_tag_attrs, Vec::new()),
    }));

    // The sink may ask the tokenizer to switch states after seeing this tag.
    if let Some(s) = self.sink.query_state_change() {
        self.state = s;
    }
}
// Emit a run of characters as a single token.
// The string must not contain '\0'!
fn emit_chars(&mut self, b: StrTendril) {
    self.process_token(CharacterTokens(b));
}
/// Emit the accumulated processing instruction as a `PIToken`, resetting
/// the stored target and data buffers.
fn emit_pi(&mut self) {
    let target = replace(&mut self.current_pi_target, StrTendril::new());
    let data = replace(&mut self.current_pi_data, StrTendril::new());
    self.process_token(PIToken(Pi { target, data }));
}
// Hand control to a fresh character-reference sub-tokenizer; `step` will
// drive it until it finishes.
// NOTE(review): parameter type reads bare `Option` here — presumably
// `Option<char>` (the "additional allowed character"); the generic
// parameter appears stripped, confirm upstream.
fn consume_char_ref(&mut self, addnl_allowed: Option) {
    // NB: The char ref tokenizer assumes we have an additional allowed
    // character iff we're tokenizing in an attribute value.
    self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
}
// Emit the end-of-file token.
fn emit_eof(&mut self) {
    self.process_token(EOFToken);
}
// Report a parse error to the sink as a `ParseError` token.
fn emit_error(&mut self, error: Cow<'static, str>) {
    self.process_token(ParseError(error));
}
// Emit the accumulated comment text as a `CommentToken`, resetting the
// comment buffer.
fn emit_current_comment(&mut self) {
    let comment = replace(&mut self.current_comment, StrTendril::new());
    self.process_token(CommentToken(comment));
}
// Emit the accumulated doctype as a `DoctypeToken`, resetting the pending
// doctype to a fresh one.
fn emit_current_doctype(&mut self) {
    let doctype = replace(&mut self.current_doctype, Doctype::new());
    self.process_token(DoctypeToken(doctype));
}
// Borrow the public or system identifier slot of the pending doctype,
// selected by `kind`.
// NOTE(review): return type reads `&'a mut Option` here — presumably
// `Option<StrTendril>`; the generic parameter appears stripped, confirm
// upstream.
fn doctype_id<'a>(&'a mut self, kind: DoctypeKind) -> &'a mut Option {
    match kind {
        Public => &mut self.current_doctype.public_id,
        System => &mut self.current_doctype.system_id,
    }
}
fn clear_doctype_id(&mut self, kind: DoctypeKind) {
let id = self.doctype_id(kind);
match *id {
Some(ref mut s) => s.clear(),
None => *id = Some(StrTendril::new()),
}
}
// Look at the next input character without consuming it. A pending
// reconsumed character is reported before fresh input.
// NOTE(review): return type reads bare `Option` here — presumably
// `Option<char>`; the generic parameter appears stripped, confirm upstream.
fn peek(&mut self, input: &mut BufferQueue) -> Option {
    if self.reconsume {
        Some(self.current_char)
    } else {
        input.peek()
    }
}
// Consume and discard the next input character. Callers must only invoke
// this when a character is known to be available (asserted here).
fn discard_char(&mut self, input: &mut BufferQueue) {
    let c = self.get_char(input);
    assert!(c.is_some());
}
// Push characters back onto the front of the input queue so they are seen
// again by subsequent reads.
fn unconsume(&mut self, input: &mut BufferQueue, buf: StrTendril) {
    input.push_front(buf);
}
}
// Shorthand for common state machine behaviors.
//
// Each arm expands to a call on the tokenizer (`$me`), invoked through the
// `go!` DSL below. Note that a macro arm is only type-checked where it is
// actually expanded.
//
// Fix: the `discard_tag` and `discard_char` arms previously had argument
// lists that did not match their methods (`fn discard_tag(&mut self)` takes
// no argument; `fn discard_char(&mut self, input: &mut BufferQueue)`
// requires the input queue). Neither arm is expanded by any current state,
// but as written they could never compile if used; they now mirror the
// method signatures.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr ) => ( $me.emit_char($c); );
    ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c); );
    ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c); );
    ( $me:ident : discard_tag ) => ( $me.discard_tag(); );
    ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input); );
    ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c); );
    ( $me:ident : emit_temp ) => ( $me.emit_temp_buf(); );
    ( $me:ident : clear_temp ) => ( $me.clear_temp_buf(); );
    ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c); );
    ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c); );
    ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c); );
    ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c); );
    ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c); );
    ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c); );
    ( $me:ident : emit_comment ) => ( $me.emit_current_comment(); );
    ( $me:ident : clear_comment ) => ( $me.current_comment.clear(); );
    ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new(); );
    ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c); );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c); );
    ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k); );
    ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype(); );
    ( $me:ident : error ) => ( $me.bad_char_error(); );
    ( $me:ident : error_eof ) => ( $me.bad_eof_error(); );
    ( $me:ident : create_pi $c:expr ) => ( $me.create_pi($c); );
    ( $me:ident : push_pi_target $c:expr ) => ( $me.current_pi_target.push_char($c); );
    ( $me:ident : push_pi_data $c:expr ) => ( $me.current_pi_data.push_char($c); );
    ( $me:ident : set_empty_tag ) => ( $me.set_empty_tag(); );
);
// Tracing of tokenizer actions. This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// Fixes: the format string used the pre-1.0 `{:s}` specifier, which modern
// rustc rejects, and the expansion passed `$me:expr` — a fragment specifier
// is only valid in a matcher, so this pasted literal `: expr` tokens into
// the `shorthand!` invocation and could never match any of its arms.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    // Log the shorthand command before executing it.
    debug!("  {}", stringify!($($cmds)*));
    shorthand!($me : $($cmds)*);
}));
// Non-tracing variant: expand the shorthand command directly with no logging.
#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
// A little DSL for sequencing shorthand actions.
//
// Commands are separated by `;` and executed via `sh_trace!`. The terminal
// forms (`to`, `reconsume`, `consume_char_ref`, the `emit_*` variants, and
// `eof`) end the enclosing `step` invocation with an explicit `return`,
// which is why this must be a macro rather than a function.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.
    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.
    // `to`: switch to the named state (optionally with arguments) and report
    // that progress was made.
    ( $me:ident : to $s:ident ) => ({ $me.state = states::$s; return true; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state = states::$s($k1); return true; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return true; });

    // `reconsume`: like `to`, but re-process the current character in the
    // new state.
    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume = true; go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    // Hand off to the character-reference sub-tokenizer.
    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return true; });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return true; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_current_tag();
        return true;
    });

    // We have a special when dealing with empty and short tags in Xml
    ( $me:ident : emit_short_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_short_tag();
        return true;
    });

    ( $me:ident : emit_empty_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_empty_tag();
        return true;
    });

    ( $me:ident : emit_start_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_start_tag();
        return true;
    });

    ( $me:ident : emit_pi $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_pi();
        return true;
    });

    ( $me:ident : eof ) => ({ $me.emit_eof(); return false; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );

    // or nothing.
    ( $me:ident : ) => (());
);
// This is a macro because it can cause early return
// from the function where it is used.
//
// Expands to the next input character, or returns `false` ("no progress")
// from the enclosing function when input is exhausted.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), false)
));
// Like `get_char!`, but pops a run of characters not in `$set` when
// possible; early-returns `false` when input is exhausted.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), false)
));
// ASCII case-insensitive lookahead match; early-returns `false` from the
// enclosing function while more input is needed to decide.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat), false)
));
impl XmlTokenizer {
// Run the state machine for a while.
// Return true if we should be immediately re-invoked
// (this just simplifies control flow vs. break / continue).
//
// Each match arm implements one tokenizer state (the `//§` markers name
// the corresponding spec state). The `go!` DSL performs shorthand actions
// and returns on any state transition; `get_char!` / `pop_except_from!` /
// `eat!` return `false` when input runs dry.
fn step(&mut self, input: &mut BufferQueue) -> bool {
    // A pending character-reference sub-tokenizer takes priority over the
    // main state machine.
    if self.char_ref_tokenizer.is_some() {
        return self.step_char_ref_tokenizer(input);
    }
    debug!("processing in state {:?}", self.state);
    match self.state {
        XmlState::Quiescent => {
            self.state = XmlState::Data;
            return false;
        },
        //§ data-state
        XmlState::Data => loop {
            match pop_except_from!(self, input, small_char_set!('\r' '&' '<')) {
                FromSet('&') => go!(self: consume_char_ref),
                FromSet('<') => go!(self: to TagState),
                FromSet(c) => go!(self: emit c),
                NotFromSet(b) => self.emit_chars(b),
            }
        },
        //§ tag-state
        XmlState::TagState => loop {
            match get_char!(self, input) {
                '!' => go!(self: to MarkupDecl),
                '/' => go!(self: to EndTagState),
                '?' => go!(self: to Pi),
                '\t' | '\n' | ' ' | ':' | '<' | '>' => {
                    go!(self: error; emit '<'; reconsume Data)
                },
                cl => go!(self: create_tag StartTag cl; to TagName),
            }
        },
        //§ end-tag-state
        XmlState::EndTagState => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_short_tag Data),
                '\t' | '\n' | ' ' | '<' | ':' => {
                    go!(self: error; emit '<'; emit '/'; reconsume Data)
                },
                cl => go!(self: create_tag EndTag cl; to EndTagName),
            }
        },
        //§ end-tag-name-state
        XmlState::EndTagName => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => go!(self: to EndTagNameAfter),
                '/' => go!(self: error; to EndTagNameAfter),
                '>' => go!(self: emit_tag Data),
                cl => go!(self: push_tag cl),
            }
        },
        //§ end-tag-name-after-state
        XmlState::EndTagNameAfter => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_tag Data),
                '\t' | '\n' | ' ' => (),
                _ => self.emit_error(Borrowed("Unexpected element in tag name")),
            }
        },
        //§ pi-state
        XmlState::Pi => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => go!(self: error; reconsume BogusComment),
                cl => go!(self: create_pi cl; to PiTarget),
            }
        },
        //§ pi-target-state
        XmlState::PiTarget => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => go!(self: to PiTargetAfter),
                '?' => go!(self: to PiAfter),
                cl => go!(self: push_pi_target cl),
            }
        },
        //§ pi-target-after-state
        XmlState::PiTargetAfter => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => (),
                _ => go!(self: reconsume PiData),
            }
        },
        //§ pi-data-state
        XmlState::PiData => loop {
            match get_char!(self, input) {
                '?' => go!(self: to PiAfter),
                cl => go!(self: push_pi_data cl),
            }
        },
        //§ pi-after-state
        XmlState::PiAfter => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_pi Data),
                '?' => go!(self: to PiAfter),
                cl => go!(self: push_pi_data cl),
            }
        },
        //§ markup-declaration-state
        XmlState::MarkupDecl => loop {
            if eat!(self, input, "--") {
                go!(self: clear_comment; to CommentStart);
            } else if eat!(self, input, "[CDATA[") {
                go!(self: to Cdata);
            } else if eat!(self, input, "DOCTYPE") {
                go!(self: to Doctype);
            } else {
                // FIXME: 'error' gives wrong message
                go!(self: error; to BogusComment);
            }
        },
        //§ comment-start-state
        XmlState::CommentStart => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentStartDash),
                '>' => go!(self: error; emit_comment; to Data),
                _ => go!(self: reconsume Comment),
            }
        },
        //§ comment-start-dash-state
        XmlState::CommentStartDash => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentEnd),
                '>' => go!(self: error; emit_comment; to Data),
                _ => go!(self: push_comment '-'; reconsume Comment),
            }
        },
        //§ comment-state
        XmlState::Comment => loop {
            match get_char!(self, input) {
                '<' => go!(self: push_comment '<'; to CommentLessThan),
                '-' => go!(self: to CommentEndDash),
                c => go!(self: push_comment c),
            }
        },
        //§ comment-less-than-sign-state
        XmlState::CommentLessThan => loop {
            match get_char!(self, input) {
                '!' => go!(self: push_comment '!';to CommentLessThanBang),
                '<' => go!(self: push_comment '<'),
                _ => go!(self: reconsume Comment),
            }
        },
        //§ comment-less-than-sign-bang-state
        XmlState::CommentLessThanBang => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentLessThanBangDash),
                _ => go!(self: reconsume Comment),
            }
        },
        //§ comment-less-than-sign-bang-dash-state
        XmlState::CommentLessThanBangDash => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentLessThanBangDashDash),
                _ => go!(self: reconsume CommentEndDash),
            }
        },
        //§ comment-less-than-sign-bang-dash-dash-state
        XmlState::CommentLessThanBangDashDash => loop {
            match get_char!(self, input) {
                '>' => go!(self: reconsume CommentEnd),
                _ => go!(self: error; reconsume CommentEnd),
            }
        },
        //§ comment-end-dash-state
        XmlState::CommentEndDash => loop {
            match get_char!(self, input) {
                '-' => go!(self: to CommentEnd),
                _ => go!(self: push_comment '-'; reconsume Comment),
            }
        },
        //§ comment-end-state
        XmlState::CommentEnd => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_comment; to Data),
                '!' => go!(self: to CommentEndBang),
                '-' => go!(self: push_comment '-'),
                _ => go!(self: append_comment "--"; reconsume Comment),
            }
        },
        //§ comment-end-bang-state
        XmlState::CommentEndBang => loop {
            match get_char!(self, input) {
                '-' => go!(self: append_comment "--!"; to CommentEndDash),
                '>' => go!(self: error; emit_comment; to Data),
                _ => go!(self: append_comment "--!"; reconsume Comment),
            }
        },
        //§ bogus-comment-state
        XmlState::BogusComment => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_comment; to Data),
                c => go!(self: push_comment c),
            }
        },
        //§ cdata-state
        XmlState::Cdata => loop {
            match get_char!(self, input) {
                ']' => go!(self: to CdataBracket),
                cl => go!(self: emit cl),
            }
        },
        //§ cdata-bracket-state
        XmlState::CdataBracket => loop {
            match get_char!(self, input) {
                ']' => go!(self: to CdataEnd),
                cl => go!(self: emit ']'; emit cl; to Cdata),
            }
        },
        //§ cdata-end-state
        XmlState::CdataEnd => loop {
            match get_char!(self, input) {
                '>' => go!(self: to Data),
                ']' => go!(self: emit ']'),
                cl => go!(self: emit ']'; emit ']'; emit cl; to Cdata),
            }
        },
        //§ tag-name-state
        XmlState::TagName => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => go!(self: to TagAttrNameBefore),
                '>' => go!(self: emit_tag Data),
                '/' => go!(self: set_empty_tag; to TagEmpty),
                cl => go!(self: push_tag cl),
            }
        },
        //§ empty-tag-state
        XmlState::TagEmpty => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_empty_tag Data),
                _ => go!(self: reconsume TagAttrValueBefore),
            }
        },
        //§ tag-attribute-name-before-state
        XmlState::TagAttrNameBefore => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => (),
                '>' => go!(self: emit_tag Data),
                '/' => go!(self: set_empty_tag; to TagEmpty),
                ':' => go!(self: error),
                cl => go!(self: create_attr cl; to TagAttrName),
            }
        },
        //§ tag-attribute-name-state
        XmlState::TagAttrName => loop {
            match get_char!(self, input) {
                '=' => go!(self: to TagAttrValueBefore),
                '>' => go!(self: emit_tag Data),
                '\t' | '\n' | ' ' => go!(self: to TagAttrNameAfter),
                '/' => go!(self: set_empty_tag; to TagEmpty),
                cl => go!(self: push_name cl),
            }
        },
        //§ tag-attribute-name-after-state
        XmlState::TagAttrNameAfter => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => (),
                '=' => go!(self: to TagAttrValueBefore),
                '>' => go!(self: emit_tag Data),
                '/' => go!(self: set_empty_tag; to TagEmpty),
                cl => go!(self: create_attr cl; to TagAttrName),
            }
        },
        //§ tag-attribute-value-before-state
        XmlState::TagAttrValueBefore => loop {
            match get_char!(self, input) {
                '\t' | '\n' | ' ' => (),
                '"' => go!(self: to TagAttrValue DoubleQuoted),
                '\'' => go!(self: to TagAttrValue SingleQuoted),
                '&' => go!(self: reconsume TagAttrValue(Unquoted)),
                '>' => go!(self: emit_tag Data),
                cl => go!(self: push_value cl; to TagAttrValue(Unquoted)),
            }
        },
        //§ tag-attribute-value-double-quoted-state
        XmlState::TagAttrValue(DoubleQuoted) => loop {
            match pop_except_from!(self, input, small_char_set!('\n' '"' '&')) {
                FromSet('"') => go!(self: to TagAttrNameBefore),
                FromSet('&') => go!(self: consume_char_ref '"' ),
                FromSet(c) => go!(self: push_value c),
                NotFromSet(ref b) => go!(self: append_value b),
            }
        },
        //§ tag-attribute-value-single-quoted-state
        XmlState::TagAttrValue(SingleQuoted) => loop {
            match pop_except_from!(self, input, small_char_set!('\n' '\'' '&')) {
                FromSet('\'') => go!(self: to TagAttrNameBefore),
                FromSet('&') => go!(self: consume_char_ref '\''),
                FromSet(c) => go!(self: push_value c),
                NotFromSet(ref b) => go!(self: append_value b),
            }
        },
        //§ tag-attribute-value-double-quoted-state
        XmlState::TagAttrValue(Unquoted) => loop {
            match pop_except_from!(self, input, small_char_set!('\n' '\t' ' ' '&' '>')) {
                FromSet('\t') | FromSet('\n') | FromSet(' ') => go!(self: to TagAttrNameBefore),
                FromSet('&') => go!(self: consume_char_ref),
                FromSet('>') => go!(self: emit_tag Data),
                FromSet(c) => go!(self: push_value c),
                NotFromSet(ref b) => go!(self: append_value b),
            }
        },
        //§ doctype-state
        XmlState::Doctype => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
                _ => go!(self: error; reconsume BeforeDoctypeName),
            }
        },
        //§ before-doctype-name-state
        XmlState::BeforeDoctypeName => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '>' => go!(self: error; emit_doctype; to Data),
                c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
                         to DoctypeName),
            }
        },
        //§ doctype-name-state
        XmlState::DoctypeName => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterDoctypeName),
                '>' => go!(self: emit_doctype; to Data),
                c => go!(self: push_doctype_name (c.to_ascii_lowercase());
                         to DoctypeName),
            }
        },
        //§ after-doctype-name-state
        XmlState::AfterDoctypeName => loop {
            if eat!(self, input, "public") {
                go!(self: to AfterDoctypeKeyword Public);
            } else if eat!(self, input, "system") {
                go!(self: to AfterDoctypeKeyword System);
            } else {
                match get_char!(self, input) {
                    '\t' | '\n' | '\x0C' | ' ' => (),
                    '>' => go!(self: emit_doctype; to Data),
                    _ => go!(self: error; to BogusDoctype),
                }
            }
        },
        //§ after-doctype-public-keyword-state
        XmlState::AfterDoctypeKeyword(Public) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier Public),
                '"' => {
                    go!(self: error; clear_doctype_id Public; to DoctypeIdentifierDoubleQuoted Public)
                },
                '\'' => {
                    go!(self: error; clear_doctype_id Public; to DoctypeIdentifierSingleQuoted Public)
                },
                '>' => go!(self: error; emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        //§ after-doctype-system-keyword-state
        XmlState::AfterDoctypeKeyword(System) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier System),
                '"' => {
                    go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
                },
                '\'' => {
                    go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
                },
                '>' => go!(self: error; emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        //§ before_doctype_public_identifier_state before_doctype_system_identifier_state
        XmlState::BeforeDoctypeIdentifier(kind) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
                '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
                '>' => go!(self: error; emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        //§ doctype_public_identifier_double_quoted_state doctype_system_identifier_double_quoted_state
        XmlState::DoctypeIdentifierDoubleQuoted(kind) => loop {
            match get_char!(self, input) {
                '"' => go!(self: to AfterDoctypeIdentifier kind),
                '>' => go!(self: error; emit_doctype; to Data),
                c => go!(self: push_doctype_id kind c),
            }
        },
        //§ doctype_public_identifier_single_quoted_state doctype_system_identifier_single_quoted_state
        XmlState::DoctypeIdentifierSingleQuoted(kind) => loop {
            match get_char!(self, input) {
                '\'' => go!(self: to AfterDoctypeIdentifier kind),
                '>' => go!(self: error; emit_doctype; to Data),
                c => go!(self: push_doctype_id kind c),
            }
        },
        //§ doctype_public_identifier_single_quoted_state
        XmlState::AfterDoctypeIdentifier(Public) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => {
                    go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
                },
                '\'' => {
                    go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted(System))
                },
                '"' => {
                    go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted(System))
                },
                '>' => go!(self: emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        //§ doctype_system_identifier_single_quoted_state
        XmlState::AfterDoctypeIdentifier(System) => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '>' => go!(self: emit_doctype; to Data),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        //§ between_doctype_public_and_system_identifier_state
        XmlState::BetweenDoctypePublicAndSystemIdentifiers => loop {
            match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '>' => go!(self: emit_doctype; to Data),
                '\'' => go!(self: to DoctypeIdentifierSingleQuoted System),
                '"' => go!(self: to DoctypeIdentifierDoubleQuoted System),
                _ => go!(self: error; to BogusDoctype),
            }
        },
        //§ bogus_doctype_state
        XmlState::BogusDoctype => loop {
            match get_char!(self, input) {
                '>' => go!(self: emit_doctype; to Data),
                _ => (),
            }
        },
    }
}
/// Indicate that we have reached the end of the input.
///
/// Flushes the char-ref sub-tokenizer first (it may un-consume characters),
/// drains any remaining buffered input, then repeatedly applies the EOF
/// rules until the state machine settles, and finally notifies the sink.
pub fn end(&mut self) {
    // Handle EOF in the char ref sub-tokenizer, if there is one.
    // Do this first because it might un-consume stuff.
    let mut input = BufferQueue::new();
    match self.char_ref_tokenizer.take() {
        None => (),
        Some(mut tok) => {
            tok.end_of_file(self, &mut input);
            self.process_char_ref(tok.get_result());
        },
    }

    // Process all remaining buffered input.
    // If we're waiting for lookahead, we're not gonna get it.
    self.at_eof = true;
    self.run(&mut input);

    while self.eof_step() {
        // loop
    }

    self.sink.end();

    if self.opts.profile {
        self.dump_profile();
    }
}
// Stub: profiling output is not available in the C API build.
#[cfg(for_c)]
fn dump_profile(&self) {
    unreachable!();
}
// Log the per-state timing profile gathered by `run`, sorted by time spent
// (descending), followed by each state's share of total tokenizer time.
// Idiom fix: total is computed with `Iterator::sum` instead of the manual
// `fold(0, Add::add)`.
#[cfg(not(for_c))]
fn dump_profile(&self) {
    let mut results: Vec<(states::XmlState, u64)> =
        self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
    // Most expensive states first.
    results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));

    // Total time attributed to the tokenizer itself (sink time excluded).
    let total: u64 = results.iter().map(|&(_, t)| t).sum();
    debug!("\nTokenizer profile, in nanoseconds");
    debug!("\n{:12} total in token sink", self.time_in_sink);
    debug!("\n{:12} total in tokenizer", total);

    for (k, v) in results.into_iter() {
        let pct = 100.0 * (v as f64) / (total as f64);
        debug!("{:12} {:4.1}% {:?}", v, pct, k);
    }
}
// Apply one step of the end-of-file rules for the current state: emit
// whatever token is pending and move toward the Data state. Returns true
// while further EOF processing remains, false once the EOF token is out.
fn eof_step(&mut self) -> bool {
    debug!("processing EOF in state {:?}", self.state);
    match self.state {
        XmlState::Data | XmlState::Quiescent => go!(self: eof),
        XmlState::CommentStart | XmlState::CommentLessThan | XmlState::CommentLessThanBang => {
            go!(self: reconsume Comment)
        },
        XmlState::CommentLessThanBangDash => go!(self: reconsume CommentEndDash),
        XmlState::CommentLessThanBangDashDash => go!(self: reconsume CommentEnd),
        XmlState::CommentStartDash |
        XmlState::Comment |
        XmlState::CommentEndDash |
        XmlState::CommentEnd |
        XmlState::CommentEndBang => go!(self: error_eof; emit_comment; eof),
        XmlState::TagState => go!(self: error_eof; emit '<'; to Data),
        XmlState::EndTagState => go!(self: error_eof; emit '<'; emit '/'; to Data),
        XmlState::TagEmpty => go!(self: error_eof; to TagAttrNameBefore),
        XmlState::Cdata | XmlState::CdataBracket | XmlState::CdataEnd => {
            go!(self: error_eof; to Data)
        },
        XmlState::Pi => go!(self: error_eof; to BogusComment),
        XmlState::PiTargetAfter | XmlState::PiAfter => go!(self: reconsume PiData),
        XmlState::MarkupDecl => go!(self: error_eof; to BogusComment),
        XmlState::TagName |
        XmlState::TagAttrNameBefore |
        XmlState::EndTagName |
        XmlState::TagAttrNameAfter |
        XmlState::EndTagNameAfter |
        XmlState::TagAttrValueBefore |
        XmlState::TagAttrValue(_) => go!(self: error_eof; emit_tag Data),
        XmlState::PiData | XmlState::PiTarget => go!(self: error_eof; emit_pi Data),
        XmlState::TagAttrName => go!(self: error_eof; emit_start_tag Data),
        XmlState::BeforeDoctypeName |
        XmlState::Doctype |
        XmlState::DoctypeName |
        XmlState::AfterDoctypeName |
        XmlState::AfterDoctypeKeyword(_) |
        XmlState::BeforeDoctypeIdentifier(_) |
        XmlState::AfterDoctypeIdentifier(_) |
        XmlState::DoctypeIdentifierSingleQuoted(_) |
        XmlState::DoctypeIdentifierDoubleQuoted(_) |
        XmlState::BetweenDoctypePublicAndSystemIdentifiers => {
            go!(self: error_eof; emit_doctype; to Data)
        },
        XmlState::BogusDoctype => go!(self: emit_doctype; to Data),
        XmlState::BogusComment => go!(self: emit_comment; to Data),
    }
}
// Feed the characters produced by the char-ref sub-tokenizer back into the
// token stream: as character tokens in Data/Cdata, or appended to the
// attribute value under construction.
fn process_char_ref(&mut self, char_ref: CharRef) {
    let CharRef {
        mut chars,
        mut num_chars,
    } = char_ref;

    // Zero characters means the reference did not resolve; emit the
    // literal '&' instead.
    if num_chars == 0 {
        chars[0] = '&';
        num_chars = 1;
    }

    for i in 0..num_chars {
        let c = chars[i as usize];
        match self.state {
            states::Data | states::Cdata => go!(self: emit c),
            states::TagAttrValue(_) => go!(self: push_value c),
            _ => panic!(
                "state {:?} should not be reachable in process_char_ref",
                self.state
            ),
        }
    }
}
// Advance the character-reference sub-tokenizer by one step. When it is
// done, its result is processed and control returns to the main machine;
// otherwise it is put back and this reports whether progress was made.
fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> bool {
    // Temporarily take ownership so we can pass `self` to the sub-tokenizer.
    let mut tok = self.char_ref_tokenizer.take().unwrap();
    let outcome = tok.step(self, input);

    let progress = match outcome {
        char_ref::Done => {
            self.process_char_ref(tok.get_result());
            return true;
        },
        char_ref::Stuck => false,
        char_ref::Progress => true,
    };

    self.char_ref_tokenizer = Some(tok);
    progress
}
// Complete the attribute currently being built, if any: drop duplicates
// with an error, otherwise process its qualified name and store it on the
// pending tag. Namespace declarations (`xmlns` / `xmlns:*`) are placed at
// the front of the attribute list.
fn finish_attribute(&mut self) {
    // No attribute in progress.
    if self.current_attr_name.len() == 0 {
        return;
    }

    // Check for a duplicate attribute.
    // FIXME: the spec says we should error as soon as the name is finished.
    // FIXME: linear time search, do we care?
    let dup = {
        let name = &self.current_attr_name[..];
        self.current_tag_attrs
            .iter()
            .any(|a| &*a.name.local == name)
    };

    if dup {
        self.emit_error(Borrowed("Duplicate attribute"));
        self.current_attr_name.clear();
        self.current_attr_value.clear();
    } else {
        let qname = process_qname(replace(&mut self.current_attr_name, StrTendril::new()));
        let attr = Attribute {
            name: qname.clone(),
            value: replace(&mut self.current_attr_value, StrTendril::new()),
        };

        // Namespace declarations go first so the tree builder sees them
        // before any attributes that might use the declared prefixes.
        if qname.local == local_name!("xmlns") ||
            qname.prefix == Some(namespace_prefix!("xmlns"))
        {
            self.current_tag_attrs.insert(0, attr);
        } else {
            self.current_tag_attrs.push(attr);
        }
    }
}
// Finish any attribute in progress, then begin a new one whose name starts
// with `c`.
fn create_attribute(&mut self, c: char) {
    self.finish_attribute();
    self.current_attr_name.push_char(c);
}
}
#[cfg(test)]
mod test {
    use super::process_qname;
    use crate::tendril::SliceExt;
    use crate::{LocalName, Prefix};

    // A well-formed `prefix:local` name splits at the colon.
    #[test]
    fn simple_namespace() {
        let qname = process_qname("prefix:local".to_tendril());
        assert_eq!(qname.prefix, Some(Prefix::from("prefix")));
        assert_eq!(qname.local, LocalName::from("local"));

        let qname = process_qname("a:b".to_tendril());
        assert_eq!(qname.prefix, Some(Prefix::from("a")));
        assert_eq!(qname.local, LocalName::from("b"));
    }

    // Malformed names (leading, doubled, or trailing colons) yield no
    // prefix; the entire string becomes the local name.
    #[test]
    fn wrong_namespaces() {
        let qname = process_qname(":local".to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from(":local"));

        let qname = process_qname("::local".to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from("::local"));

        let qname = process_qname("a::local".to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from("a::local"));

        let qname = process_qname("fake::".to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from("fake::"));

        let qname = process_qname(":::".to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from(":::"));

        let qname = process_qname(":a:b:".to_tendril());
        assert_eq!(qname.prefix, None);
        assert_eq!(qname.local, LocalName::from(":a:b:"));
    }
}
xml5ever-0.16.1/src/tokenizer/qname.rs 0100644 0001750 0001750 00000004274 13442307261 0016050 0 ustar 00 0000000 0000000 // Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// States of the small scanner that locates the colon splitting a
// qualified name (`prefix:local`).
enum QualNameState {
    // No byte examined yet.
    BeforeName,
    // Scanning name bytes; no valid colon accepted so far.
    InName,
    // A valid separating colon has been recorded; scanning the local part.
    AfterColon,
}
// One-pass scanner over the raw bytes of a tag name that locates the colon
// separating a namespace prefix from the local name, if any.
pub struct QualNameTokenizer<'a> {
    // Current scanner state.
    state: QualNameState,
    // The candidate qualified name, as bytes.
    slice: &'a [u8],
    // Index of the valid separating colon found so far, if any.
    // NOTE(review): type reads bare `Option` here — upstream this is
    // `Option<u32>`; the generic parameter appears stripped, confirm
    // against the original source.
    valid_index: Option,
    // Current scan position within `slice`.
    curr_ind: usize,
}
impl<'a> QualNameTokenizer<'a> {
    // Create a scanner over the raw bytes of a tag name.
    pub fn new(tag: &[u8]) -> QualNameTokenizer {
        QualNameTokenizer {
            state: QualNameState::BeforeName,
            slice: tag,
            valid_index: None,
            curr_ind: 0,
        }
    }

    // Scan the whole slice. Returns the byte index of the colon separating
    // prefix from local name, or None if the name has no valid split.
    // NOTE(review): return type reads bare `Option` here — upstream this is
    // `Option<u32>`; confirm against the original source.
    pub fn run(&mut self) -> Option {
        if self.slice.len() > 0 {
            loop {
                if !self.step() {
                    break;
                }
            }
        }
        self.valid_index
    }

    // Advance one byte; returns false when already at the last byte.
    fn incr(&mut self) -> bool {
        if self.curr_ind + 1 < self.slice.len() {
            self.curr_ind += 1;
            return true;
        }
        false
    }

    // Dispatch one scan step to the handler for the current state.
    // Returns false to stop scanning.
    fn step(&mut self) -> bool {
        match self.state {
            QualNameState::BeforeName => self.do_before_name(),
            QualNameState::InName => self.do_in_name(),
            QualNameState::AfterColon => self.do_after_colon(),
        }
    }

    // A leading colon means the name cannot be a qualified name: stop.
    fn do_before_name(&mut self) -> bool {
        if self.slice[self.curr_ind] == b':' {
            false
        } else {
            self.state = QualNameState::InName;
            self.incr()
        }
    }

    // Record a colon as the prefix/local split only if it is not the last
    // byte (an empty local part is invalid).
    fn do_in_name(&mut self) -> bool {
        if self.slice[self.curr_ind] == b':' && self.curr_ind + 1 < self.slice.len() {
            self.valid_index = Some(self.curr_ind as u32);
            self.state = QualNameState::AfterColon;
        }
        self.incr()
    }

    // A second colon invalidates the split entirely: discard and stop.
    fn do_after_colon(&mut self) -> bool {
        if self.slice[self.curr_ind] == b':' {
            self.valid_index = None;
            return false;
        }
        self.incr()
    }
}
xml5ever-0.16.1/src/tokenizer/states.rs 0100644 0001750 0001750 00000003743 13442307261 0016252 0 ustar 00 0000000 0000000 // Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Tokenizer states.
//!
//! This is public for use by the tokenizer tests. Other library
//! users should not have to care about this.
pub use self::AttrValueKind::*;
pub use self::DoctypeKind::*;
pub use self::XmlState::*;
/// Which DOCTYPE external identifier a doctype-related state refers to.
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
#[doc(hidden)]
pub enum DoctypeKind {
    /// The PUBLIC identifier.
    Public,
    /// The SYSTEM identifier.
    System,
}
/// All states of the XML5 tokenizer state machine; variants correspond to
/// the states of the XML5 tokenization algorithm. (Variant order must not
/// change: `PartialOrd`/`Ord` derive from it.)
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
#[doc(hidden)]
pub enum XmlState {
    // Character data between markup.
    Data,
    // Start/end tag parsing.
    TagState,
    EndTagState,
    EndTagName,
    EndTagNameAfter,
    // Processing instructions: <?target data?>.
    Pi,
    PiTarget,
    PiTargetAfter,
    PiData,
    PiAfter,
    // Markup declarations (<!...): comments, CDATA, DOCTYPE.
    MarkupDecl,
    CommentStart,
    CommentStartDash,
    Comment,
    CommentLessThan,
    CommentLessThanBang,
    CommentLessThanBangDash,
    CommentLessThanBangDashDash,
    CommentEnd,
    CommentEndDash,
    CommentEndBang,
    Cdata,
    CdataBracket,
    CdataEnd,
    // Tag name and attribute parsing.
    TagName,
    TagEmpty,
    TagAttrNameBefore,
    TagAttrName,
    TagAttrNameAfter,
    TagAttrValueBefore,
    TagAttrValue(AttrValueKind),
    // DOCTYPE parsing.
    Doctype,
    BeforeDoctypeName,
    DoctypeName,
    AfterDoctypeName,
    AfterDoctypeKeyword(DoctypeKind),
    BeforeDoctypeIdentifier(DoctypeKind),
    DoctypeIdentifierDoubleQuoted(DoctypeKind),
    DoctypeIdentifierSingleQuoted(DoctypeKind),
    AfterDoctypeIdentifier(DoctypeKind),
    BetweenDoctypePublicAndSystemIdentifiers,
    BogusDoctype,
    BogusComment,
    // Initial state before any input has been processed.
    Quiescent,
}
/// How an attribute value is quoted; carried by `XmlState::TagAttrValue`.
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Hash, Debug)]
#[doc(hidden)]
pub enum AttrValueKind {
    /// Value written with no surrounding quotes.
    Unquoted,
    /// Value delimited by single quotes (').
    SingleQuoted,
    /// Value delimited by double quotes (").
    DoubleQuoted,
}
xml5ever-0.16.1/src/tree_builder/mod.rs 0100644 0001750 0001750 00000057664 13554522202 0016172 0 ustar 00 0000000 0000000 // Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
mod types;
use log::{debug, warn};
use mac::{matches, _tt_as_expr_hack, unwrap_or_return};
use markup5ever::{local_name, namespace_prefix, namespace_url, ns};
use std::borrow::Cow;
use std::borrow::Cow::Borrowed;
use std::collections::btree_map::Iter;
use std::collections::{BTreeMap, HashSet, VecDeque};
use std::fmt::{Debug, Error, Formatter};
use std::mem;
use std::result::Result;
pub use self::interface::{NextParserState, NodeOrText, Tracer, TreeSink};
use self::types::*;
use crate::interface::{self, create_element, AppendNode, Attribute, QualName};
use crate::interface::{AppendText, ExpandedName};
use crate::tokenizer::states::Quiescent;
use crate::tokenizer::{self, EndTag, StartTag, Tag, TokenSink};
use crate::tokenizer::{Doctype, EmptyTag, Pi, ShortTag};
use crate::{LocalName, Namespace, Prefix};
use crate::tendril::{StrTendril, Tendril};
static XML_URI: &'static str = "http://www.w3.org/XML/1998/namespace";
static XMLNS_URI: &'static str = "http://www.w3.org/2000/xmlns/";
type InsResult = Result<(), Cow<'static, str>>;
// Stack of in-scope namespace maps: a new map is pushed for each element
// that declares namespaces and popped when that element closes.
// NOTE(review): the field reads bare `Vec` here — upstream this is
// `Vec<NamespaceMap>`; the generic parameter appears stripped, confirm
// against the original source.
#[derive(Debug)]
struct NamespaceMapStack(Vec);
impl NamespaceMapStack {
    // Start with a single scope holding the default namespace declarations.
    fn new() -> NamespaceMapStack {
        NamespaceMapStack({
            let mut vec = Vec::new();
            vec.push(NamespaceMap::default());
            vec
        })
    }

    // Enter a new namespace scope.
    fn push(&mut self, map: NamespaceMap) {
        self.0.push(map);
    }

    // Leave the innermost namespace scope.
    #[doc(hidden)]
    pub fn pop(&mut self) {
        self.0.pop();
    }
}
#[doc(hidden)]
pub struct NamespaceMap {
// Map that maps prefixes to URI.
//
// Key denotes namespace prefix, and value denotes
// URI it maps to.
//
// If value of value is None, that means the namespace
// denoted by key has been undeclared.
scope: BTreeMap