xml-rs-0.8.19/.cargo_vcs_info.json0000644000000001360000000000100123550ustar { "git": { "sha1": "bfb185ede18170f7b21f9b17ab65cbb4aba2de22" }, "path_in_vcs": "" }xml-rs-0.8.19/Cargo.lock0000644000000002270000000000100103310ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "xml-rs" version = "0.8.19" xml-rs-0.8.19/Cargo.toml0000644000000024640000000000100103610ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.58" name = "xml-rs" version = "0.8.19" authors = ["Vladimir Matveev "] include = [ "src/**", "LICENSE", "README.md", ] description = "An XML library in pure Rust" homepage = "https://lib.rs/crates/xml-rs" documentation = "https://docs.rs/xml-rs/" readme = "README.md" keywords = [ "xml", "parser", "sax", "parsing", "writer", ] categories = ["parser-implementations"] license = "MIT" repository = "https://github.com/kornelski/xml-rs" [package.metadata.docs.rs] rustdoc-args = ["--generate-link-to-definition"] targets = ["x86_64-unknown-linux-gnu"] [package.metadata.release] tag-message = "" tag-name = "{{version}}" [lib] name = "xml" path = "src/lib.rs" [[bin]] name = "xml-analyze" path = "src/analyze.rs" [badges.maintenance] status = "actively-developed" xml-rs-0.8.19/Cargo.toml.orig000064400000000000000000000015011046102023000140310ustar 00000000000000[package] name = "xml-rs" version = "0.8.19" authors = ["Vladimir Matveev "] license = "MIT" description = "An XML library in pure Rust" repository = 
"https://github.com/kornelski/xml-rs" homepage = "https://lib.rs/crates/xml-rs" documentation = "https://docs.rs/xml-rs/" readme = "README.md" keywords = ["xml", "parser", "sax", "parsing", "writer"] categories = ["parser-implementations"] edition = "2021" rust-version = "1.58" include = ["src/**", "LICENSE", "README.md"] [lib] name = "xml" path = "src/lib.rs" [[bin]] name = "xml-analyze" path = "src/analyze.rs" [badges] maintenance = { status = "actively-developed" } [package.metadata.docs.rs] targets = ["x86_64-unknown-linux-gnu"] rustdoc-args = ["--generate-link-to-definition"] [package.metadata.release] tag-name = "{{version}}" tag-message = "" xml-rs-0.8.19/LICENSE000064400000000000000000000020731046102023000121540ustar 00000000000000The MIT License (MIT) Copyright (c) 2014 Vladimir Matveev Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
xml-rs-0.8.19/README.md000064400000000000000000000172521046102023000124330ustar 00000000000000xml-rs, an XML library for Rust =============================== [![CI](https://github.com/kornelski/xml-rs/actions/workflows/main.yml/badge.svg)](https://github.com/kornelski/xml-rs/actions/workflows/main.yml) [![crates.io][crates-io-img]](https://lib.rs/crates/xml-rs) [![docs][docs-img]](https://docs.rs/xml-rs/) [Documentation](https://docs.rs/xml-rs/) [crates-io-img]: https://img.shields.io/crates/v/xml-rs.svg [docs-img]: https://img.shields.io/badge/docs-latest%20release-6495ed.svg xml-rs is an XML library for the [Rust](https://www.rust-lang.org/) programming language. It supports reading and writing of XML documents in a streaming fashion (without DOM). ### Features * XML spec conformance better than other pure-Rust libraries. * Easy to use API based on `Iterator`s and regular `String`s without tricky lifetimes. * Support for UTF-16, UTF-8, ISO-8859-1, and ASCII encodings. * Written entirely in the safe Rust subset. Designed to safely handle untrusted input. The API is heavily inspired by Java Streaming API for XML ([StAX][stax]). It contains a pull parser much like StAX event reader. It provides an iterator API, so you can leverage Rust's existing iterators library features. [stax]: https://en.wikipedia.org/wiki/StAX It also provides a streaming document writer much like StAX event writer. This writer consumes its own set of events, but reader events can be converted to writer events easily, and so it is possible to write XML transformation chains in a pretty clean manner. This parser is mostly full-featured, however, there are limitations: * Legacy code pages and non-Unicode encodings are not supported; * DTD validation is not supported (but entities defined in the internal subset are supported); * attribute value normalization is not performed, and end-of-line characters are not normalized either. Other than that the parser tries to be mostly XML-1.1-compliant. 
Writer is also mostly full-featured with the following limitations:

* no support for encodings other than UTF-8,
* no support for emitting `<!DOCTYPE>` declarations;
* more validations of input are needed, for example, checking that namespace prefixes are bound
  or comments are well-formed.

Building and using
------------------

xml-rs uses [Cargo](https://crates.io), so add it with `cargo add xml` or modify `Cargo.toml`:

```toml
[dependencies]
xml = "0.8.16"
```

The package exposes a single crate called `xml`.

Reading XML documents
---------------------

[`xml::reader::EventReader`][EventReader] requires a [`Read`][stdread] instance to read from. It can be a `File` wrapped in `BufReader`, or a `Vec<u8>`, or a `&[u8]` slice.

[EventReader]: https://docs.rs/xml-rs/latest/xml/reader/struct.EventReader.html
[stdread]: https://doc.rust-lang.org/stable/std/io/trait.Read.html

`EventReader` implements `IntoIterator` trait, so you can use it in a `for` loop directly:

```rust,no_run
use std::fs::File;
use std::io::BufReader;

use xml::reader::{EventReader, XmlEvent};

fn main() -> std::io::Result<()> {
    let file = File::open("file.xml")?;
    let file = BufReader::new(file); // Buffering is important for performance

    let parser = EventReader::new(file);
    let mut depth = 0;
    for e in parser {
        match e {
            Ok(XmlEvent::StartElement { name, .. }) => {
                println!("{:spaces$}+{name}", "", spaces = depth * 2);
                depth += 1;
            }
            Ok(XmlEvent::EndElement { name }) => {
                depth -= 1;
                println!("{:spaces$}-{name}", "", spaces = depth * 2);
            }
            Err(e) => {
                eprintln!("Error: {e}");
                break;
            }
            // There's more: https://docs.rs/xml-rs/latest/xml/reader/enum.XmlEvent.html
            _ => {}
        }
    }

    Ok(())
}
```

Document parsing can end normally or with an error. Regardless of exact cause, the parsing process will be stopped, and the iterator will terminate normally.

You can also have finer control over when to pull the next event from the parser using its own `next()` method:

```rust,ignore
match parser.next() {
    ...
}
```

Upon the end of the document or an error, the parser will remember the last event and will always return it in the result of `next()` call afterwards. If the iterator is used, then it will yield an error or end-of-document event once and will produce `None` afterwards.

It is also possible to tweak the parsing process a little using the [`xml::reader::ParserConfig`][ParserConfig] structure. See its documentation for more information and examples.

[ParserConfig]: https://docs.rs/xml-rs/latest/xml/reader/struct.ParserConfig.html

You can find a more extensive example of using `EventReader` in `src/analyze.rs`, which is a small program (BTW, it is built with `cargo build` and can be run after that) which shows various statistics about the specified XML document. It can also be used to check for well-formedness of XML documents - if a document is not well-formed, this program will exit with an error.

## Parsing untrusted inputs

The parser is written in the safe Rust subset, so by Rust's guarantees the worst that it can do is to cause a panic. You can use `ParserConfig` to set limits on maximum lengths of names, attributes, text, entities, etc. You should also set a maximum document size via `io::Read`'s [`take(max)`](https://doc.rust-lang.org/stable/std/io/trait.Read.html#method.take) method.

Writing XML documents
---------------------

xml-rs also provides a streaming writer much like StAX event writer. With it you can write an XML document to any `Write` implementor.
```rust,no_run
use std::io;
use xml::writer::{EmitterConfig, XmlEvent};

/// A simple demo syntax where "+foo" makes `<foo>`, "-foo" makes `</foo>`
fn make_event_from_line(line: &str) -> XmlEvent {
    let line = line.trim();
    if let Some(name) = line.strip_prefix("+") {
        XmlEvent::start_element(name).into()
    } else if line.starts_with("-") {
        XmlEvent::end_element().into()
    } else {
        XmlEvent::characters(line).into()
    }
}

fn main() -> io::Result<()> {
    let input = io::stdin();
    let output = io::stdout();
    let mut writer = EmitterConfig::new()
        .perform_indent(true)
        .create_writer(output);

    let mut line = String::new();
    loop {
        line.clear();
        let bytes_read = input.read_line(&mut line)?;
        if bytes_read == 0 {
            break; // EOF
        }

        let event = make_event_from_line(&line);
        if let Err(e) = writer.write(event) {
            panic!("Write error: {e}")
        }
    }
    Ok(())
}
```

The code example above also demonstrates how to create a writer out of its configuration. A similar thing also works with `EventReader`.

The library provides an XML event building DSL which helps to construct complex events, e.g. ones having namespace definitions. Some examples:

```rust,ignore
// <a:hello a:param="value" xmlns:a="urn:some:document">
XmlEvent::start_element("a:hello").attr("a:param", "value").ns("a", "urn:some:document")

// <hello b:config="value" xmlns="urn:default:uri">
XmlEvent::start_element("hello").attr("b:config", "value").default_ns("urn:default:uri")

// <![CDATA[some unescaped text]]>
XmlEvent::cdata("some unescaped text")
```

Of course, one can create `XmlEvent` enum variants directly instead of using the builder DSL. There are more examples in [`xml::writer::XmlEvent`][XmlEvent] documentation.

[XmlEvent]: https://docs.rs/xml-rs/latest/xml/writer/enum.XmlEvent.html

The writer has multiple configuration options; see [`EmitterConfig`][EmitterConfig] documentation for more information.

[EmitterConfig]: https://docs.rs/xml-rs/latest/xml/writer/struct.EmitterConfig.html

Bug reports
------------

Please report issues at: <https://github.com/kornelski/xml-rs/issues>.
xml-rs-0.8.19/src/analyze.rs000064400000000000000000000056671046102023000137630ustar 00000000000000#![forbid(unsafe_code)] use std::cmp; use std::collections::HashSet; use std::env; use std::fs::File; use std::io::{self, BufReader, Read}; use xml::reader::XmlEvent; use xml::ParserConfig; fn main() -> Result<(), Box> { let mut file; let mut stdin; let source: &mut dyn Read = if let Some(file_name) = env::args().nth(1) { file = File::open(file_name).map_err(|e| format!("Cannot open input file: {e}"))?; &mut file } else { stdin = io::stdin(); &mut stdin }; let reader = ParserConfig::new() .whitespace_to_characters(true) .ignore_comments(false) .create_reader(BufReader::new(source)); let mut processing_instructions = 0; let mut elements = 0; let mut character_blocks = 0; let mut cdata_blocks = 0; let mut characters = 0; let mut comment_blocks = 0; let mut comment_characters = 0; let mut namespaces = HashSet::new(); let mut depth = 0; let mut max_depth = 0; for e in reader { let e = e.map_err(|e| format!("Error parsing XML document: {e}"))?; match e { XmlEvent::StartDocument { version, encoding, standalone } => println!( "XML document version {}, encoded in {}, {}standalone", version, encoding, if standalone.unwrap_or(false) { "" } else { "not " } ), XmlEvent::EndDocument => println!("Document finished"), XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1, XmlEvent::Whitespace(_) => {} // can't happen due to configuration XmlEvent::Characters(s) => { character_blocks += 1; characters += s.len(); } XmlEvent::CData(s) => { cdata_blocks += 1; characters += s.len(); } XmlEvent::Comment(s) => { comment_blocks += 1; comment_characters += s.len(); } XmlEvent::StartElement { namespace, .. } => { depth += 1; max_depth = cmp::max(max_depth, depth); elements += 1; namespaces.extend(namespace.0.into_values()); } XmlEvent::EndElement { .. 
} => { depth -= 1; } }; } namespaces.remove(xml::namespace::NS_EMPTY_URI); namespaces.remove(xml::namespace::NS_XMLNS_URI); namespaces.remove(xml::namespace::NS_XML_URI); println!("Elements: {elements}, maximum depth: {max_depth}"); println!("Namespaces (excluding built-in): {}", namespaces.len()); println!("Characters: {characters}, characters blocks: {character_blocks}, CDATA blocks: {cdata_blocks}"); println!("Comment blocks: {comment_blocks}, comment characters: {comment_characters}"); println!("Processing instructions (excluding built-in): {processing_instructions}"); Ok(()) } xml-rs-0.8.19/src/attribute.rs000064400000000000000000000051771046102023000143170ustar 00000000000000//! Contains XML attributes manipulation types and functions. //! use std::fmt; use crate::escape::{AttributeEscapes, Escaped}; use crate::name::{Name, OwnedName}; /// A borrowed version of an XML attribute. /// /// Consists of a borrowed qualified name and a borrowed string value. #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)] pub struct Attribute<'a> { /// Attribute name. pub name: Name<'a>, /// Attribute value. pub value: &'a str, } impl<'a> fmt::Display for Attribute<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}=\"{}\"", self.name, Escaped::::new(self.value)) } } impl<'a> Attribute<'a> { /// Creates an owned attribute out of this borrowed one. #[inline] #[must_use] pub fn to_owned(&self) -> OwnedAttribute { OwnedAttribute { name: self.name.into(), value: self.value.into(), } } /// Creates a borrowed attribute using the provided borrowed name and a borrowed string value. #[inline] #[must_use] pub fn new(name: Name<'a>, value: &'a str) -> Attribute<'a> { Attribute { name, value } } } /// An owned version of an XML attribute. /// /// Consists of an owned qualified name and an owned string value. #[derive(Clone, Eq, PartialEq, Hash, Debug)] pub struct OwnedAttribute { /// Attribute name. pub name: OwnedName, /// Attribute value. 
pub value: String, } impl OwnedAttribute { /// Returns a borrowed `Attribute` out of this owned one. #[must_use] #[inline] pub fn borrow(&self) -> Attribute<'_> { Attribute { name: self.name.borrow(), value: &self.value, } } /// Creates a new owned attribute using the provided owned name and an owned string value. #[inline] pub fn new>(name: OwnedName, value: S) -> OwnedAttribute { OwnedAttribute { name, value: value.into(), } } } impl fmt::Display for OwnedAttribute { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}=\"{}\"", self.name, Escaped::::new(&self.value)) } } #[cfg(test)] mod tests { use super::Attribute; use crate::name::Name; #[test] fn attribute_display() { let attr = Attribute::new( Name::qualified("attribute", "urn:namespace", Some("n")), "its value with > & \" ' < weird symbols", ); assert_eq!( &*attr.to_string(), "{urn:namespace}n:attribute=\"its value with > & " ' < weird symbols\"" ); } } xml-rs-0.8.19/src/common.rs000064400000000000000000000110551046102023000135740ustar 00000000000000//! Contains common types and functions used throughout the library. use std::fmt; /// Represents a position inside some textual document. 
#[derive(Copy, Clone, PartialEq, Eq)] pub struct TextPosition { /// Row, counting from 0 pub row: u64, /// Column, counting from 0 pub column: u64, } impl TextPosition { /// Creates a new position initialized to the beginning of the document #[inline] #[must_use] pub fn new() -> TextPosition { TextPosition { row: 0, column: 0 } } /// Advances the position in a line #[inline] pub fn advance(&mut self, count: u8) { self.column += u64::from(count); } /// Advances the position in a line to the next tab position #[inline] pub fn advance_to_tab(&mut self, width: u8) { let width = u64::from(width); self.column += width - self.column % width; } /// Advances the position to the beginning of the next line #[inline] pub fn new_line(&mut self) { self.column = 0; self.row += 1; } } impl fmt::Debug for TextPosition { #[cold] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}:{}", self.row + 1, self.column + 1) } } impl fmt::Display for TextPosition { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}:{}", self.row + 1, self.column + 1) } } /// Get the position in the document corresponding to the object /// /// This trait is implemented by parsers, lexers and errors. pub trait Position { /// Returns the current position or a position corresponding to the object. fn position(&self) -> TextPosition; } impl Position for TextPosition { #[inline] fn position(&self) -> TextPosition { *self } } /// XML version enumeration. #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] pub enum XmlVersion { /// XML version 1.0. Version10, /// XML version 1.1. 
Version11, } impl fmt::Display for XmlVersion { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { XmlVersion::Version10 => "1.0", XmlVersion::Version11 => "1.1", }.fmt(f) } } impl fmt::Debug for XmlVersion { #[cold] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(self, f) } } /// Checks whether the given character is a white space character (`S`) /// as is defined by XML 1.1 specification, [section 2.3][1]. /// /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn #[must_use] #[inline] pub fn is_whitespace_char(c: char) -> bool { matches!(c, '\x20' | '\x0a' | '\x09' | '\x0d') } /// Checks whether the given string is compound only by white space /// characters (`S`) using the previous `is_whitespace_char` to check /// all characters of this string pub fn is_whitespace_str(s: &str) -> bool { s.chars().all(is_whitespace_char) } #[must_use] pub fn is_xml10_char(c: char) -> bool { matches!(c, '\u{09}' | '\u{0A}' | '\u{0D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..) } #[must_use] pub fn is_xml11_char(c: char) -> bool { matches!(c, '\u{01}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..) } #[must_use] pub fn is_xml11_char_not_restricted(c: char) -> bool { is_xml11_char(c) && !matches!(c, '\u{01}'..='\u{08}' | '\u{0B}'..='\u{0C}' | '\u{0E}'..='\u{1F}' | '\u{7F}'..='\u{84}' | '\u{86}'..='\u{9F}') } /// Checks whether the given character is a name start character (`NameStartChar`) /// as is defined by XML 1.1 specification, [section 2.3][1]. 
/// /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn #[must_use] pub fn is_name_start_char(c: char) -> bool { match c { ':' | 'A'..='Z' | '_' | 'a'..='z' | '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | '\u{10000}'..='\u{EFFFF}' => true, _ => false } } /// Checks whether the given character is a name character (`NameChar`) /// as is defined by XML 1.1 specification, [section 2.3][1]. /// /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn #[must_use] pub fn is_name_char(c: char) -> bool { match c { _ if is_name_start_char(c) => true, '-' | '.' | '0'..='9' | '\u{B7}' | '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}' => true, _ => false } } xml-rs-0.8.19/src/escape.rs000064400000000000000000000101761046102023000135470ustar 00000000000000//! Contains functions for performing XML special characters escaping. 
use std::{borrow::Cow, marker::PhantomData, fmt::{Display, Result, Formatter}}; pub(crate) trait Escapes { fn escape(c: u8) -> Option<&'static str>; fn byte_needs_escaping(c: u8) -> bool { Self::escape(c).is_some() } fn str_needs_escaping(s: &str) -> bool { s.bytes().any(|c| Self::escape(c).is_some()) } } pub(crate) struct Escaped<'a, E: Escapes> { _escape_phantom: PhantomData, to_escape: &'a str, } impl<'a, E: Escapes> Escaped<'a, E> { pub fn new(s: &'a str) -> Self { Escaped { _escape_phantom: PhantomData, to_escape: s, } } } impl<'a, E: Escapes> Display for Escaped<'a, E> { fn fmt(&self, f: &mut Formatter<'_>) -> Result { let mut total_remaining = self.to_escape; // find the next occurence while let Some(n) = total_remaining .bytes() .position(E::byte_needs_escaping) { let (start, remaining) = total_remaining.split_at(n); f.write_str(start)?; // unwrap is safe because we checked is_some for position n earlier let next_byte = remaining.bytes().next().unwrap(); let replacement = E::escape(next_byte).unwrap(); f.write_str(replacement)?; total_remaining = &remaining[1..]; } f.write_str(total_remaining) } } fn escape_str(s: &str) -> Cow<'_, str> { if E::str_needs_escaping(s) { Cow::Owned(format!("{}", Escaped::::new(s))) } else { Cow::Borrowed(s) } } macro_rules! escapes { { $name: ident, $($k: expr => $v: expr),* $(,)? } => { pub(crate) struct $name; impl Escapes for $name { fn escape(c: u8) -> Option<&'static str> { match c { $( $k => Some($v),)* _ => None } } } }; } escapes!( AttributeEscapes, b'<' => "<", b'>' => ">", b'"' => """, b'\'' => "'", b'&' => "&", b'\n' => " ", b'\r' => " ", ); escapes!( PcDataEscapes, b'<' => "<", b'&' => "&", ); /// Performs escaping of common XML characters inside an attribute value. 
/// /// This function replaces several important markup characters with their /// entity equivalents: /// /// * `<` → `<` /// * `>` → `>` /// * `"` → `"` /// * `'` → `'` /// * `&` → `&` /// /// The following characters are escaped so that attributes are printed on /// a single line: /// * `\n` → ` ` /// * `\r` → ` ` /// /// The resulting string is safe to use inside XML attribute values or in PCDATA sections. /// /// Does not perform allocations if the given string does not contain escapable characters. #[inline] #[must_use] pub fn escape_str_attribute(s: &str) -> Cow<'_, str> { escape_str::(s) } /// Performs escaping of common XML characters inside PCDATA. /// /// This function replaces several important markup characters with their /// entity equivalents: /// /// * `<` → `<` /// * `&` → `&` /// /// The resulting string is safe to use inside PCDATA sections but NOT inside attribute values. /// /// Does not perform allocations if the given string does not contain escapable characters. #[inline] #[must_use] pub fn escape_str_pcdata(s: &str) -> Cow<'_, str> { escape_str::(s) } #[cfg(test)] mod tests { use super::{escape_str_attribute, escape_str_pcdata}; #[test] fn test_escape_str_attribute() { assert_eq!(escape_str_attribute("<>'\"&\n\r"), "<>'"& "); assert_eq!(escape_str_attribute("no_escapes"), "no_escapes"); } #[test] fn test_escape_str_pcdata() { assert_eq!(escape_str_pcdata("<&"), "<&"); assert_eq!(escape_str_pcdata("no_escapes"), "no_escapes"); } #[test] fn test_escape_multibyte_code_points() { assert_eq!(escape_str_attribute("☃<"), "☃<"); assert_eq!(escape_str_pcdata("☃<"), "☃<"); } } xml-rs-0.8.19/src/lib.rs000064400000000000000000000016001046102023000130450ustar 00000000000000//#![warn(missing_doc)] #![forbid(non_camel_case_types)] #![forbid(unsafe_code)] #![allow(clippy::redundant_closure_for_method_calls)] #![allow(clippy::module_name_repetitions)] //! This crate currently provides an almost XML 1.0/1.1-compliant pull parser. //! //! 
Please note that functions of this parser may panic. //! If a panic could cause a Denial Of Service in your codebase, *you're* responsible for wrapping access to this library in `catch_unwind`. //! #![cfg_attr(doctest, doc = include_str!("../README.md"))] pub use crate::reader::EventReader; pub use crate::reader::ParserConfig; pub use crate::util::Encoding; pub use crate::writer::EmitterConfig; pub use crate::writer::EventWriter; pub mod attribute; pub mod common; pub mod escape; #[doc(hidden)] // FIXME: not supposed to be public pub mod macros; pub mod name; pub mod namespace; pub mod reader; mod util; pub mod writer; xml-rs-0.8.19/src/macros.rs000064400000000000000000000036051046102023000135720ustar 00000000000000#![macro_use] //! Contains several macros used in this crate. macro_rules! gen_setter { ($(#[$comments:meta])* $field:ident : into $t:ty) => { $(#[$comments])* /// /// See [`ParserConfig`][crate::ParserConfig] fields docs for details #[inline] pub fn $field>(mut self, value: T) -> Self { self.$field = value.into(); self } }; ($(#[$comments:meta])* $field:ident : val $t:ty) => { $(#[$comments])* /// /// See [`ParserConfig`][crate::ParserConfig] fields docs for details #[inline] #[must_use] pub fn $field(mut self, value: $t) -> Self { self.$field = value; self } }; ($(#[$comments:meta])* $field:ident : delegate $t:ty) => { $(#[$comments])* /// /// See [`ParserConfig`][crate::ParserConfig] fields docs for details #[inline] #[must_use] pub fn $field(mut self, value: $t) -> Self { self.c.$field = value; self } }; ($(#[$comments:meta])* $field:ident : c2 $t:ty) => { $(#[$comments])* /// /// See [`ParserConfig2`][crate::reader::ParserConfig2] fields docs for details #[inline] #[must_use] pub fn $field(self, value: $t) -> ParserConfig2 { ParserConfig2 { c: self, ..Default::default() } .$field(value) } }; } macro_rules! gen_setters { ($target:ident, $($(#[$comments:meta])* $field:ident : $k:tt $tpe:ty),+) => ( impl $target {$( gen_setter! 
{ $(#[$comments])* $field : $k $tpe } )+ }) } xml-rs-0.8.19/src/name.rs000064400000000000000000000234761046102023000132360ustar 00000000000000//! Contains XML qualified names manipulation types and functions. //! use std::fmt; use std::str::FromStr; use crate::namespace::NS_NO_PREFIX; /// Represents a qualified XML name. /// /// A qualified name always consists at least of a local name. It can optionally contain /// a prefix; when reading an XML document, if it contains a prefix, it must also contain a /// namespace URI, but this is not enforced statically; see below. The name can contain a /// namespace without a prefix; in that case a default, empty prefix is assumed. /// /// When writing XML documents, it is possible to omit the namespace URI, leaving only /// the prefix. In this case the writer will check that the specifed prefix is bound to some /// URI in the current namespace context. If both prefix and namespace URI are specified, /// it is checked that the current namespace context contains this exact correspondence /// between prefix and namespace URI. /// /// # Prefixes and URIs /// /// A qualified name with a prefix must always contain a proper namespace URI --- names with /// a prefix but without a namespace associated with that prefix are meaningless. However, /// it is impossible to obtain proper namespace URI by a prefix without a context, and such /// context is only available when parsing a document (or it can be constructed manually /// when writing a document). Tying a name to a context statically seems impractical. This /// may change in future, though. /// /// # Conversions /// /// `Name` implements some `From` instances for conversion from strings and tuples. 
For example: /// /// ```rust /// # use xml::name::Name; /// let n1: Name = "p:some-name".into(); /// let n2: Name = ("p", "some-name").into(); /// /// assert_eq!(n1, n2); /// assert_eq!(n1.local_name, "some-name"); /// assert_eq!(n1.prefix, Some("p")); /// assert!(n1.namespace.is_none()); /// ``` /// /// This is added to support easy specification of XML elements when writing XML documents. #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] pub struct Name<'a> { /// A local name, e.g. `string` in `xsi:string`. pub local_name: &'a str, /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`. pub namespace: Option<&'a str>, /// A name prefix, e.g. `xsi` in `xsi:string`. pub prefix: Option<&'a str>, } impl<'a> From<&'a str> for Name<'a> { fn from(s: &'a str) -> Name<'a> { let mut parts = s.splitn(2, ':').fuse(); match (parts.next(), parts.next()) { (Some(name), None) => Name::local(name), (Some(prefix), Some(name)) => Name::prefixed(name, prefix), _ => unreachable!(), } } } impl<'a> From<(&'a str, &'a str)> for Name<'a> { fn from((prefix, name): (&'a str, &'a str)) -> Name<'a> { Name::prefixed(name, prefix) } } impl<'a> fmt::Display for Name<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let Some(namespace) = self.namespace { write!(f, "{{{namespace}}}")?; } if let Some(prefix) = self.prefix { write!(f, "{prefix}:")?; } f.write_str(self.local_name) } } impl<'a> Name<'a> { /// Returns an owned variant of the qualified name. #[must_use] pub fn to_owned(&self) -> OwnedName { OwnedName { local_name: self.local_name.into(), namespace: self.namespace.map(std::convert::Into::into), prefix: self.prefix.map(std::convert::Into::into), } } /// Returns a new `Name` instance representing plain local name. #[inline] #[must_use] pub fn local(local_name: &str) -> Name<'_> { Name { local_name, prefix: None, namespace: None, } } /// Returns a new `Name` instance with the given local name and prefix. 
#[inline] #[must_use] pub fn prefixed(local_name: &'a str, prefix: &'a str) -> Name<'a> { Name { local_name, namespace: None, prefix: Some(prefix), } } /// Returns a new `Name` instance representing a qualified name with or without a prefix and /// with a namespace URI. #[inline] #[must_use] pub fn qualified(local_name: &'a str, namespace: &'a str, prefix: Option<&'a str>) -> Name<'a> { Name { local_name, namespace: Some(namespace), prefix, } } /// Returns a correct XML representation of this local name and prefix. /// /// This method is different from the autoimplemented `to_string()` because it does not /// include namespace URI in the result. #[must_use] pub fn to_repr(&self) -> String { self.repr_display().to_string() } /// Returns a structure which can be displayed with `std::fmt` machinery to obtain this /// local name and prefix. /// /// This method is needed for efficiency purposes in order not to create unnecessary /// allocations. #[inline] #[must_use] pub fn repr_display(&self) -> ReprDisplay<'_, '_> { ReprDisplay(self) } /// Returns either a prefix of this name or `namespace::NS_NO_PREFIX` constant. #[inline] #[must_use] pub fn prefix_repr(&self) -> &str { self.prefix.unwrap_or(NS_NO_PREFIX) } } /// A wrapper around `Name` whose `Display` implementation prints the wrapped name as it is /// displayed in an XML document. pub struct ReprDisplay<'a, 'b>(&'a Name<'b>); impl<'a, 'b: 'a> fmt::Display for ReprDisplay<'a, 'b> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.0.prefix { Some(prefix) => write!(f, "{}:{}", prefix, self.0.local_name), None => self.0.local_name.fmt(f), } } } /// An owned variant of `Name`. /// /// Everything about `Name` applies to this structure as well. #[derive(Clone, PartialEq, Eq, Hash, Debug)] pub struct OwnedName { /// A local name, e.g. `string` in `xsi:string`. pub local_name: String, /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`. pub namespace: Option, /// A name prefix, e.g. 
`xsi` in `xsi:string`. pub prefix: Option, } impl fmt::Display for OwnedName { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(&self.borrow(), f) } } impl OwnedName { /// Constructs a borrowed `Name` based on this owned name. #[must_use] #[inline] pub fn borrow(&self) -> Name<'_> { Name { local_name: &self.local_name, namespace: self.namespace.as_deref(), prefix: self.prefix.as_deref(), } } /// Returns a new `OwnedName` instance representing a plain local name. #[inline] pub fn local(local_name: S) -> OwnedName where S: Into { OwnedName { local_name: local_name.into(), namespace: None, prefix: None, } } /// Returns a new `OwnedName` instance representing a qualified name with or without /// a prefix and with a namespace URI. #[inline] pub fn qualified(local_name: S1, namespace: S2, prefix: Option) -> OwnedName where S1: Into, S2: Into, S3: Into { OwnedName { local_name: local_name.into(), namespace: Some(namespace.into()), prefix: prefix.map(std::convert::Into::into), } } /// Returns an optional prefix by reference, equivalent to `self.borrow().prefix` /// but avoids extra work. #[inline] #[must_use] pub fn prefix_ref(&self) -> Option<&str> { self.prefix.as_deref() } /// Returns an optional namespace by reference, equivalen to `self.borrow().namespace` /// but avoids extra work. #[inline] #[must_use] pub fn namespace_ref(&self) -> Option<&str> { self.namespace.as_deref() } } impl<'a> From> for OwnedName { #[inline] fn from(n: Name<'a>) -> OwnedName { n.to_owned() } } impl FromStr for OwnedName { type Err = (); /// Parses the given string slice into a qualified name. /// /// This function, when finishes sucessfully, always return a qualified /// name without a namespace (`name.namespace == None`). It should be filled later /// using proper `NamespaceStack`. /// /// It is supposed that all characters in the argument string are correct /// as defined by the XML specification. 
No additional checks except a check /// for emptiness are done. fn from_str(s: &str) -> Result { let mut it = s.split(':'); let r = match (it.next(), it.next(), it.next()) { (Some(prefix), Some(local_name), None) if !prefix.is_empty() && !local_name.is_empty() => Some((local_name.into(), Some(prefix.into()))), (Some(local_name), None, None) if !local_name.is_empty() => Some((local_name.into(), None)), (_, _, _) => None }; r.map(|(local_name, prefix)| OwnedName { local_name, namespace: None, prefix }).ok_or(()) } } #[cfg(test)] mod tests { use super::OwnedName; #[test] fn test_owned_name_from_str() { assert_eq!("prefix:name".parse(), Ok(OwnedName { local_name: "name".into(), namespace: None, prefix: Some("prefix".into()) })); assert_eq!("name".parse(), Ok(OwnedName { local_name: "name".into(), namespace: None, prefix: None })); assert_eq!("".parse(), Err::(())); assert_eq!(":".parse(), Err::(())); assert_eq!(":a".parse(), Err::(())); assert_eq!("a:".parse(), Err::(())); assert_eq!("a:b:c".parse(), Err::(())); } } xml-rs-0.8.19/src/namespace.rs000064400000000000000000000403421046102023000142410ustar 00000000000000//! Contains namespace manipulation types and functions. use std::borrow::Cow; use std::collections::btree_map::Iter as Entries; use std::collections::btree_map::{BTreeMap, Entry}; use std::collections::HashSet; use std::iter::{Map, Rev}; use std::slice::Iter; /// Designates prefix for namespace definitions. /// /// See [Namespaces in XML][namespace] spec for more information. /// /// [namespace]: http://www.w3.org/TR/xml-names/#ns-decl pub const NS_XMLNS_PREFIX: &str = "xmlns"; /// Designates the standard URI for `xmlns` prefix. /// /// See [A Namespace Name for xmlns Attributes][namespace] for more information. /// /// [namespace]: http://www.w3.org/2000/xmlns/ pub const NS_XMLNS_URI: &str = "http://www.w3.org/2000/xmlns/"; /// Designates prefix for a namespace containing several special predefined attributes. 
/// /// See [2.10 White Space handling][1], [2.1 Language Identification][2], /// [XML Base specification][3] and [xml:id specification][4] for more information. /// /// [1]: http://www.w3.org/TR/REC-xml/#sec-white-space /// [2]: http://www.w3.org/TR/REC-xml/#sec-lang-tag /// [3]: http://www.w3.org/TR/xmlbase/ /// [4]: http://www.w3.org/TR/xml-id/ pub const NS_XML_PREFIX: &str = "xml"; /// Designates the standard URI for `xml` prefix. /// /// See `NS_XML_PREFIX` documentation for more information. pub const NS_XML_URI: &str = "http://www.w3.org/XML/1998/namespace"; /// Designates the absence of prefix in a qualified name. /// /// This constant should be used to define or query default namespace which should be used /// for element or attribute names without prefix. For example, if a namespace mapping /// at a particular point in the document contains correspondence like /// /// ```none /// NS_NO_PREFIX --> urn:some:namespace /// ``` /// /// then all names declared without an explicit prefix `urn:some:namespace` is assumed as /// a namespace URI. /// /// By default empty prefix corresponds to absence of namespace, but this can change either /// when writing an XML document (manually) or when reading an XML document (based on namespace /// declarations). pub const NS_NO_PREFIX: &str = ""; /// Designates an empty namespace URI, which is equivalent to absence of namespace. /// /// This constant should not usually be used directly; it is used to designate that /// empty prefix corresponds to absent namespace in `NamespaceStack` instances created with /// `NamespaceStack::default()`. Therefore, it can be used to restore `NS_NO_PREFIX` mapping /// in a namespace back to its default value. pub const NS_EMPTY_URI: &str = ""; /// Namespace is a map from prefixes to namespace URIs. /// /// No prefix (i.e. default namespace) is designated by `NS_NO_PREFIX` constant. 
#[derive(PartialEq, Eq, Clone, Debug)] pub struct Namespace(pub BTreeMap); impl Namespace { /// Returns an empty namespace. #[inline] #[must_use] pub fn empty() -> Namespace { Namespace(BTreeMap::new()) } /// Checks whether this namespace is empty. #[inline] #[must_use] pub fn is_empty(&self) -> bool { self.0.is_empty() } /// Checks whether this namespace is essentially empty, that is, it does not contain /// anything but default mappings. #[must_use] pub fn is_essentially_empty(&self) -> bool { // a shortcut for a namespace which is definitely not empty if self.0.len() > 3 { return false; } self.0.iter().all(|(k, v)| match (&**k, &**v) { (NS_NO_PREFIX, NS_EMPTY_URI) => true, (NS_XMLNS_PREFIX, NS_XMLNS_URI) => true, (NS_XML_PREFIX, NS_XML_URI) => true, _ => false }) } /// Checks whether this namespace mapping contains the given prefix. /// /// # Parameters /// * `prefix` --- namespace prefix. /// /// # Return value /// `true` if this namespace contains the given prefix, `false` otherwise. #[inline] pub fn contains>(&self, prefix: &P) -> bool { self.0.contains_key(prefix.as_ref()) } /// Puts a mapping into this namespace. /// /// This method does not override any already existing mappings. /// /// Returns a boolean flag indicating whether the map already contained /// the given prefix. /// /// # Parameters /// * `prefix` --- namespace prefix; /// * `uri` --- namespace URI. /// /// # Return value /// `true` if `prefix` has been inserted successfully; `false` if the `prefix` /// was already present in the namespace. pub fn put(&mut self, prefix: P, uri: U) -> bool where P: Into, U: Into { match self.0.entry(prefix.into()) { Entry::Occupied(_) => false, Entry::Vacant(ve) => { ve.insert(uri.into()); true } } } /// Puts a mapping into this namespace forcefully. /// /// This method, unlike `put()`, does replace an already existing mapping. /// /// Returns previous URI which was assigned to the given prefix, if it is present. 
/// /// # Parameters /// * `prefix` --- namespace prefix; /// * `uri` --- namespace URI. /// /// # Return value /// `Some(uri)` with `uri` being a previous URI assigned to the `prefix`, or /// `None` if such prefix was not present in the namespace before. pub fn force_put(&mut self, prefix: P, uri: U) -> Option where P: Into, U: Into { self.0.insert(prefix.into(), uri.into()) } /// Queries the namespace for the given prefix. /// /// # Parameters /// * `prefix` --- namespace prefix. /// /// # Return value /// Namespace URI corresponding to the given prefix, if it is present. pub fn get<'a, P: ?Sized + AsRef>(&'a self, prefix: &P) -> Option<&'a str> { self.0.get(prefix.as_ref()).map(|s| &**s) } /// Borrowed namespace for the writer #[must_use] pub fn borrow(&self) -> Cow<'_, Self> { Cow::Borrowed(self) } } /// An alias for iterator type for namespace mappings contained in a namespace. pub type NamespaceMappings<'a> = Map< Entries<'a, String, String>, for<'b> fn((&'b String, &'b String)) -> UriMapping<'b> >; impl<'a> IntoIterator for &'a Namespace { type Item = UriMapping<'a>; type IntoIter = NamespaceMappings<'a>; fn into_iter(self) -> Self::IntoIter { fn mapper<'a>((prefix, uri): (&'a String, &'a String)) -> UriMapping<'a> { (prefix, uri) } self.0.iter().map(mapper) } } /// Namespace stack is a sequence of namespaces. /// /// Namespace stack is used to represent cumulative namespace consisting of /// combined namespaces from nested elements. #[derive(Clone, Eq, PartialEq, Debug)] pub struct NamespaceStack(pub Vec); impl NamespaceStack { /// Returns an empty namespace stack. #[inline] #[must_use] pub fn empty() -> NamespaceStack { NamespaceStack(Vec::with_capacity(2)) } /// Returns a namespace stack with default items in it. /// /// Default items are the following: /// /// * `xml` → `http://www.w3.org/XML/1998/namespace`; /// * `xmlns` → `http://www.w3.org/2000/xmlns/`. 
#[inline] #[must_use] pub fn default() -> NamespaceStack { let mut nst = NamespaceStack::empty(); nst.push_empty(); // xml namespace nst.put(NS_XML_PREFIX, NS_XML_URI); // xmlns namespace nst.put(NS_XMLNS_PREFIX, NS_XMLNS_URI); // empty namespace nst.put(NS_NO_PREFIX, NS_EMPTY_URI); nst } /// Adds an empty namespace to the top of this stack. #[inline] pub fn push_empty(&mut self) -> &mut NamespaceStack { self.0.push(Namespace::empty()); self } /// Removes the topmost namespace in this stack. /// /// Panics if the stack is empty. #[inline] pub fn pop(&mut self) -> Namespace { self.0.pop().unwrap() } /// Removes the topmost namespace in this stack. /// /// Returns `Some(namespace)` if this stack is not empty and `None` otherwise. #[inline] pub fn try_pop(&mut self) -> Option { self.0.pop() } /// Borrows the topmost namespace mutably, leaving the stack intact. /// /// Panics if the stack is empty. #[inline] pub fn peek_mut(&mut self) -> &mut Namespace { self.0.last_mut().unwrap() } /// Borrows the topmost namespace immutably, leaving the stack intact. /// /// Panics if the stack is empty. #[inline] #[must_use] pub fn peek(&self) -> &Namespace { self.0.last().unwrap() } /// Puts a mapping into the topmost namespace if this stack does not already contain one. /// /// Returns a boolean flag indicating whether the insertion has completed successfully. /// Note that both key and value are matched and the mapping is inserted if either /// namespace prefix is not already mapped, or if it is mapped, but to a different URI. /// /// # Parameters /// * `prefix` --- namespace prefix; /// * `uri` --- namespace URI. /// /// # Return value /// `true` if `prefix` has been inserted successfully; `false` if the `prefix` /// was already present in the namespace stack. 
pub fn put_checked(&mut self, prefix: P, uri: U) -> bool where P: Into + AsRef, U: Into + AsRef { if self.0.iter().any(|ns| ns.get(&prefix) == Some(uri.as_ref())) { false } else { self.put(prefix, uri); true } } /// Puts a mapping into the topmost namespace in this stack. /// /// This method does not override a mapping in the topmost namespace if it is /// already present, however, it does not depend on other namespaces in the stack, /// so it is possible to put a mapping which is present in lower namespaces. /// /// Returns a boolean flag indicating whether the insertion has completed successfully. /// /// # Parameters /// * `prefix` --- namespace prefix; /// * `uri` --- namespace URI. /// /// # Return value /// `true` if `prefix` has been inserted successfully; `false` if the `prefix` /// was already present in the namespace. #[inline] pub fn put(&mut self, prefix: P, uri: U) -> bool where P: Into, U: Into { if let Some(ns) = self.0.last_mut() { ns.put(prefix, uri) } else { false } } /// Performs a search for the given prefix in the whole stack. /// /// This method walks the stack from top to bottom, querying each namespace /// in order for the given prefix. If none of the namespaces contains the prefix, /// `None` is returned. /// /// # Parameters /// * `prefix` --- namespace prefix. #[inline] pub fn get<'a, P: ?Sized + AsRef>(&'a self, prefix: &P) -> Option<&'a str> { let prefix = prefix.as_ref(); for ns in self.0.iter().rev() { match ns.get(prefix) { None => {}, r => return r, } } None } /// Combines this stack of namespaces into a single namespace. /// /// Namespaces are combined in left-to-right order, that is, rightmost namespace /// elements take priority over leftmost ones. #[must_use] pub fn squash(&self) -> Namespace { let mut result = BTreeMap::new(); for ns in &self.0 { result.extend(ns.0.iter().map(|(k, v)| (k.clone(), v.clone()))); } Namespace(result) } /// Returns an object which implements `Extend` using `put_checked()` instead of `put()`. 
/// /// See `CheckedTarget` for more information. #[inline] pub fn checked_target(&mut self) -> CheckedTarget<'_> { CheckedTarget(self) } /// Returns an iterator over all mappings in this namespace stack. #[inline] #[must_use] pub fn iter(&self) -> NamespaceStackMappings<'_> { self.into_iter() } } /// An iterator over mappings from prefixes to URIs in a namespace stack. /// /// # Example /// ``` /// # use xml::namespace::NamespaceStack; /// let mut nst = NamespaceStack::empty(); /// nst.push_empty(); /// nst.put("a", "urn:A"); /// nst.put("b", "urn:B"); /// nst.push_empty(); /// nst.put("c", "urn:C"); /// /// assert_eq!(vec![("c", "urn:C"), ("a", "urn:A"), ("b", "urn:B")], nst.iter().collect::>()); /// ``` pub struct NamespaceStackMappings<'a> { namespaces: Rev>, current_namespace: Option>, used_keys: HashSet<&'a str>, } impl<'a> NamespaceStackMappings<'a> { fn go_to_next_namespace(&mut self) -> bool { self.current_namespace = self.namespaces.next().map(|ns| ns.into_iter()); self.current_namespace.is_some() } } impl<'a> Iterator for NamespaceStackMappings<'a> { type Item = UriMapping<'a>; fn next(&mut self) -> Option> { // If there is no current namespace and no next namespace, we're finished if self.current_namespace.is_none() && !self.go_to_next_namespace() { return None; } let next_item = self.current_namespace.as_mut()?.next(); match next_item { // There is an element in the current namespace Some((k, v)) => if self.used_keys.contains(&k) { // If the current key is used, go to the next one self.next() } else { // Otherwise insert the current key to the set of used keys and // return the mapping self.used_keys.insert(k); Some((k, v)) }, // Current namespace is exhausted None => if self.go_to_next_namespace() { // If there is next namespace, continue from it self.next() } else { // No next namespace, exiting None } } } } impl<'a> IntoIterator for &'a NamespaceStack { type Item = UriMapping<'a>; type IntoIter = NamespaceStackMappings<'a>; fn into_iter(self) -> 
Self::IntoIter { NamespaceStackMappings { namespaces: self.0.iter().rev(), current_namespace: None, used_keys: HashSet::new(), } } } /// A type alias for a pair of `(prefix, uri)` values returned by namespace iterators. pub type UriMapping<'a> = (&'a str, &'a str); impl<'a> Extend> for Namespace { fn extend(&mut self, iterable: T) where T: IntoIterator> { for (prefix, uri) in iterable { self.put(prefix, uri); } } } impl<'a> Extend> for NamespaceStack { fn extend(&mut self, iterable: T) where T: IntoIterator> { for (prefix, uri) in iterable { self.put(prefix, uri); } } } /// A wrapper around `NamespaceStack` which implements `Extend` using `put_checked()`. /// /// # Example /// /// ``` /// # use xml::namespace::NamespaceStack; /// /// let mut nst = NamespaceStack::empty(); /// nst.push_empty(); /// nst.put("a", "urn:A"); /// nst.put("b", "urn:B"); /// nst.push_empty(); /// nst.put("c", "urn:C"); /// /// nst.checked_target().extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]); /// assert_eq!( /// vec![("a", "urn:Z"), ("c", "urn:C"), ("d", "urn:D"), ("b", "urn:B")], /// nst.iter().collect::>() /// ); /// ``` /// /// Compare: /// /// ``` /// # use xml::namespace::NamespaceStack; /// # let mut nst = NamespaceStack::empty(); /// # nst.push_empty(); /// # nst.put("a", "urn:A"); /// # nst.put("b", "urn:B"); /// # nst.push_empty(); /// # nst.put("c", "urn:C"); /// /// nst.extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]); /// assert_eq!( /// vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:C"), ("d", "urn:D")], /// nst.iter().collect::>() /// ); /// ``` pub struct CheckedTarget<'a>(&'a mut NamespaceStack); impl<'a, 'b> Extend> for CheckedTarget<'a> { fn extend(&mut self, iterable: T) where T: IntoIterator> { for (prefix, uri) in iterable { self.0.put_checked(prefix, uri); } } } xml-rs-0.8.19/src/reader/config.rs000064400000000000000000000337411046102023000150210ustar 00000000000000//! 
Contains parser configuration structure. use std::collections::HashMap; use std::io::Read; use crate::reader::EventReader; use crate::util::Encoding; /// Limits to defend from billion laughs attack const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000; const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10; /// Parser configuration structure. **There are more config methods than public fileds — see methods below**. /// /// This structure contains various configuration options which affect /// behavior of the parser. #[derive(Clone, PartialEq, Eq, Debug)] pub struct ParserConfig { /// Whether or not should whitespace in textual events be removed. Default is false. /// /// When true, all standalone whitespace will be removed (this means no /// `Whitespace` events will be emitted), and leading and trailing whitespace /// from `Character` events will be deleted. If after trimming `Characters` /// event will be empty, it will also be omitted from output stream. This is /// possible, however, only if `whitespace_to_characters` or /// `cdata_to_characters` options are set. /// /// This option does not affect CDATA events, unless `cdata_to_characters` /// option is also set. In that case CDATA content will also be trimmed. pub trim_whitespace: bool, /// Whether or not should whitespace be converted to characters. /// Default is false. /// /// If true, instead of `Whitespace` events `Characters` events with the /// same content will be emitted. If `trim_whitespace` is also true, these /// events will be trimmed to nothing and, consequently, not emitted. pub whitespace_to_characters: bool, /// Whether or not should CDATA be converted to characters. /// Default is false. /// /// If true, instead of `CData` events `Characters` events with the same /// content will be emitted. If `trim_whitespace` is also true, these events /// will be trimmed. If corresponding CDATA contained nothing but whitespace, /// this event will be omitted from the stream. 
pub cdata_to_characters: bool, /// Whether or not should comments be omitted. Default is true. /// /// If true, `Comment` events will not be emitted at all. pub ignore_comments: bool, /// Whether or not should sequential `Characters` events be merged. /// Default is true. /// /// If true, multiple sequential `Characters` events will be merged into /// a single event, that is, their data will be concatenated. /// /// Multiple sequential `Characters` events are only possible if either /// `cdata_to_characters` or `ignore_comments` are set. Otherwise character /// events will always be separated by other events. pub coalesce_characters: bool, /// A map of extra entities recognized by the parser. Default is an empty map. /// /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes, /// however, it is convenient to make the parser recognize additional entities which /// are also not available through the DTD definitions (especially given that at the moment /// DTD parsing is not supported). pub extra_entities: HashMap, /// Whether or not the parser should ignore the end of stream. Default is false. /// /// By default the parser will either error out when it encounters a premature end of /// stream or complete normally if the end of stream was expected. If you want to continue /// reading from a stream whose input is supplied progressively, you can set this option to true. /// In this case the parser will allow you to invoke the next() method even if a supposed end /// of stream has happened. /// /// Note that support for this functionality is incomplete; for example, the parser will fail if /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk. 
pub ignore_end_of_stream: bool, /// Whether or not non-unicode entity references get replaced with the replacement character /// /// When true, any decimal or hexadecimal character reference that cannot be converted from a /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html) /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD). pub replace_unknown_entity_references: bool, /// Whether or not whitespace at the root level of the document is ignored. Default is true. /// /// By default any whitespace that is not enclosed within at least one level of elements will be /// ignored. Setting this value to false will cause root level whitespace events to be emitted. /// /// **There are configuration options – see methods below** pub ignore_root_level_whitespace: bool, } impl ParserConfig { /// Returns a new config with default values. /// /// You can tweak default values using builder-like pattern: /// /// ```rust /// use xml::reader::ParserConfig; /// /// let config = ParserConfig::new() /// .trim_whitespace(true) /// .ignore_comments(true) /// .coalesce_characters(false); /// ``` #[must_use] #[inline] pub fn new() -> ParserConfig { ParserConfig { trim_whitespace: false, whitespace_to_characters: false, cdata_to_characters: false, ignore_comments: true, coalesce_characters: true, extra_entities: HashMap::new(), ignore_end_of_stream: false, replace_unknown_entity_references: false, ignore_root_level_whitespace: true, } } /// Creates an XML reader with this configuration. 
/// /// This is a convenience method for configuring and creating a reader at the same time: /// /// ```rust /// use xml::reader::ParserConfig; /// /// let mut source: &[u8] = b"..."; /// /// let reader = ParserConfig::new() /// .trim_whitespace(true) /// .ignore_comments(true) /// .coalesce_characters(false) /// .create_reader(&mut source); /// ``` /// /// This method is exactly equivalent to calling `EventReader::new_with_config()` with /// this configuration object. #[inline] pub fn create_reader(self, source: R) -> EventReader { EventReader::new_with_config(source, self) } /// Adds a new entity mapping and returns an updated config object. /// /// This is a convenience method for adding external entities mappings to the XML parser. /// An example: /// /// ```rust /// use xml::reader::ParserConfig; /// /// let mut source: &[u8] = b"..."; /// /// let reader = ParserConfig::new() /// .add_entity("nbsp", " ") /// .add_entity("copy", "©") /// .add_entity("reg", "®") /// .create_reader(&mut source); /// ``` pub fn add_entity, T: Into>(mut self, entity: S, value: T) -> ParserConfig { self.extra_entities.insert(entity.into(), value.into()); self } } impl Default for ParserConfig { #[inline] fn default() -> ParserConfig { ParserConfig::new() } } gen_setters! { ParserConfig, trim_whitespace: val bool, whitespace_to_characters: val bool, cdata_to_characters: val bool, ignore_comments: val bool, coalesce_characters: val bool, ignore_end_of_stream: val bool, replace_unknown_entity_references: val bool, ignore_root_level_whitespace: val bool } /// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct #[derive(Clone, PartialEq, Eq, Debug)] #[non_exhaustive] pub struct ParserConfig2 { pub(crate) c: ParserConfig, /// Use this encoding as the default. Necessary for UTF-16 files without BOM. 
pub override_encoding: Option, /// Allow `` to contain unsupported encoding names, /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing. pub ignore_invalid_encoding_declarations: bool, /// Documents with multiple root elements are ill-formed pub allow_multiple_root_elements: bool, /// Abort if custom entities create a string longer than this pub max_entity_expansion_length: usize, /// Entities can expand into other entities this many times (be careful about exponential cost!) pub max_entity_expansion_depth: u8, /// Maximum length of tag name or attribute name pub max_name_length: usize, /// Max number of attributes per element pub max_attributes: usize, /// Max number of bytes in each attribute pub max_attribute_length: usize, /// Maximum length of strings reprsenting characters, comments, and processing instructions pub max_data_length: usize, } impl Default for ParserConfig2 { fn default() -> Self { ParserConfig2 { c: Default::default(), override_encoding: None, ignore_invalid_encoding_declarations: false, allow_multiple_root_elements: true, max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH, max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH, max_attributes: 1<<16, max_attribute_length: 1<<30, max_data_length: 1<<30, max_name_length: 1<<18, } } } impl ParserConfig2 { #[inline] #[must_use] pub fn new() -> Self { Self::default() } /// Read character encoding from `Content-Type` header. /// Set this when parsing XML documents fetched over HTTP. /// /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback. 
#[must_use] pub fn content_type(mut self, mime_type: &str) -> Self { let charset = mime_type.split_once(';') .and_then(|(_, args)| args.split_once("charset")) .and_then(|(_, args)| args.split_once('=')); if let Some((_, charset)) = charset { let name = charset.trim().trim_matches('"'); match name.parse() { Ok(enc) => { self.override_encoding = Some(enc); }, Err(_) => {}, } } self } /// Creates an XML reader with this configuration. /// /// This is a convenience method for configuring and creating a reader at the same time: /// /// ```rust /// use xml::reader::ParserConfig; /// /// let mut source: &[u8] = b"..."; /// /// let reader = ParserConfig::new() /// .trim_whitespace(true) /// .ignore_comments(true) /// .coalesce_characters(false) /// .create_reader(&mut source); /// ``` /// /// This method is exactly equivalent to calling `EventReader::new_with_config()` with /// this configuration object. #[inline] pub fn create_reader(self, source: R) -> EventReader { EventReader::new_with_config(source, self) } } impl From for ParserConfig2 { #[inline] fn from(c: ParserConfig) -> Self { Self { c, ..Default::default() } } } gen_setters! { ParserConfig2, /// Set if you got one in the HTTP header override_encoding: val Option, /// Allows invalid documents. There should be only a single root element in XML. allow_multiple_root_elements: val bool, /// Abort if custom entities create a string longer than this max_entity_expansion_length: val usize, /// Entities can expand into other entities this many times (be careful about exponential cost!) 
max_entity_expansion_depth: val u8, /// Max number of attributes per element max_attributes: val usize, /// Maximum length of tag name or attribute name max_name_length: val usize, /// Max number of bytes in each attribute max_attribute_length: val usize, /// Maximum length of strings reprsenting characters, comments, and processing instructions max_data_length: val usize, /// Allow `` ignore_invalid_encoding_declarations: val bool } gen_setters! { ParserConfig, /// Set if you got one in the HTTP header (see `content_type`) override_encoding: c2 Option, /// Allow `` ignore_invalid_encoding_declarations: c2 bool, /// Allows invalid documents. There should be only a single root element in XML. allow_multiple_root_elements: c2 bool, /// Abort if custom entities create a string longer than this max_entity_expansion_length: c2 usize, /// Entities can expand into other entities this many times (be careful about exponential cost!) max_entity_expansion_depth: c2 u8, /// Max number of attributes per element max_attributes: c2 usize, /// Maximum length of tag name or attribute name max_name_length: c2 usize, /// Max number of bytes in each attribute max_attribute_length: c2 usize, /// Maximum length of strings reprsenting characters, comments, and processing instructions max_data_length: c2 usize, /// Set encoding from the MIME type. Important for HTTP compatibility. content_type: c2 &str } gen_setters! { ParserConfig2, trim_whitespace: delegate bool, whitespace_to_characters: delegate bool, cdata_to_characters: delegate bool, ignore_comments: delegate bool, coalesce_characters: delegate bool, ignore_end_of_stream: delegate bool, replace_unknown_entity_references: delegate bool, /// Whether or not whitespace at the root level of the document is ignored. Default is true. 
ignore_root_level_whitespace: delegate bool } #[test] fn mime_parse() { let c = ParserConfig2::new().content_type("text/xml;charset=Us-AScii").max_entity_expansion_length(1000); assert_eq!(c.override_encoding, Some(Encoding::Ascii)); let c = ParserConfig2::new().max_entity_expansion_depth(3).content_type("text/xml;charset = \"UTF-16\""); assert_eq!(c.override_encoding, Some(Encoding::Utf16)); } xml-rs-0.8.19/src/reader/error.rs000064400000000000000000000232551046102023000147040ustar 00000000000000use crate::Encoding; use crate::reader::lexer::Token; use std::borrow::Cow; use std::error; use std::error::Error as _; use std::fmt; use std::io; use std::str; use crate::common::{Position, TextPosition}; use crate::util; #[derive(Debug)] pub enum ErrorKind { Syntax(Cow<'static, str>), Io(io::Error), Utf8(str::Utf8Error), UnexpectedEof, } #[derive(Debug, Clone, PartialEq)] #[non_exhaustive] pub(crate) enum SyntaxError { CannotRedefineXmlnsPrefix, CannotRedefineXmlPrefix, /// Recursive custom entity expanded to too many chars, it could be DoS EntityTooBig, EmptyEntity, NoRootElement, ProcessingInstructionWithoutName, UnbalancedRootElement, UnexpectedEof, UnexpectedOpeningTag, /// Missing `]]>` UnclosedCdata, UnexpectedQualifiedName(Token), UnexpectedTokenOutsideRoot(Token), UnexpectedToken(Token), UnexpectedTokenInEntity(Token), UnexpectedTokenInClosingTag(Token), UnexpectedTokenInOpeningTag(Token), InvalidQualifiedName(Box), UnboundAttribute(Box), UnboundElementPrefix(Box), UnexpectedClosingTag(Box), UnexpectedName(Box), /// Found , Token), CannotUndefinePrefix(Box), InvalidCharacterEntity(u32), InvalidDefaultNamespace(Box), InvalidNamePrefix(Box), InvalidNumericEntity(Box), InvalidStandaloneDeclaration(Box), InvalidXmlProcessingInstruction(Box), RedefinedAttribute(Box), UndefinedEntity(Box), UnexpectedEntity(Box), UnexpectedNameInsideXml(Box), UnsupportedEncoding(Box), /// In DTD UnknownMarkupDeclaration(Box), UnexpectedXmlVersion(Box), ConflictingEncoding(Encoding, 
Encoding), UnexpectedTokenBefore(&'static str, char), /// Document has more stuff than `ParserConfig` allows ExceededConfiguredLimit, } impl fmt::Display for SyntaxError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { self.to_cow().fmt(f) } } impl SyntaxError { #[inline(never)] #[cold] pub(crate) fn to_cow(&self) -> Cow<'static, str> { match *self { Self::CannotRedefineXmlnsPrefix => "Cannot redefine XMLNS prefix".into(), Self::CannotRedefineXmlPrefix => "Default XMLNS prefix cannot be rebound to another value".into(), Self::EmptyEntity => "Encountered empty entity".into(), Self::EntityTooBig => "Entity too big".into(), Self::NoRootElement => "Unexpected end of stream: no root element found".into(), Self::ProcessingInstructionWithoutName => "Encountered processing instruction without a name".into(), Self::UnbalancedRootElement => "Unexpected end of stream: still inside the root element".into(), Self::UnclosedCdata => "Unclosed "Unexpected end of stream".into(), Self::UnexpectedOpeningTag => "'<' is not allowed in attributes".into(), Self::CannotUndefinePrefix(ref ln) => format!("Cannot undefine prefix '{ln}'").into(), Self::ConflictingEncoding(a, b) => format!("Declared encoding {a}, but uses {b}").into(), Self::InvalidCharacterEntity(num) => format!("Invalid character U+{num:04X}").into(), Self::InvalidDefaultNamespace(ref name) => format!( "Namespace '{name}' cannot be default").into(), Self::InvalidNamePrefix(ref prefix) => format!("'{prefix}' cannot be an element name prefix").into(), Self::InvalidNumericEntity(ref v) => format!("Invalid numeric entity: {v}").into(), Self::InvalidQualifiedName(ref e) => format!("Qualified name is invalid: {e}").into(), Self::InvalidStandaloneDeclaration(ref value) => format!("Invalid standalone declaration value: {value}").into(), Self::InvalidXmlProcessingInstruction(ref name) => format!("Invalid processing instruction: format!("Attribute '{name}' is redefined").into(), Self::UnboundAttribute(ref name) => 
format!("Attribute {name} prefix is unbound").into(), Self::UnboundElementPrefix(ref name) => format!("Element {name} prefix is unbound").into(), Self::UndefinedEntity(ref v) => format!("Undefined entity: {v}").into(), Self::UnexpectedClosingTag(ref expected_got) => format!("Unexpected closing tag: {expected_got}").into(), Self::UnexpectedEntity(ref name) => format!("Unexpected entity: {name}").into(), Self::UnexpectedName(ref name) => format!("Unexpected name: {name}").into(), Self::UnexpectedNameInsideXml(ref name) => format!("Unexpected name inside XML declaration: {name}").into(), Self::UnexpectedProcessingInstruction(ref buf, token) => format!("Unexpected token inside processing instruction: format!("Unexpected token inside qualified name: {e}").into(), Self::UnexpectedToken(token) => format!("Unexpected token: {token}").into(), Self::UnexpectedTokenBefore(before, c) => format!("Unexpected token '{before}' before '{c}'").into(), Self::UnexpectedTokenInClosingTag(token) => format!("Unexpected token inside closing tag: {token}").into(), Self::UnexpectedTokenInEntity(token) => format!("Unexpected token inside entity: {token}").into(), Self::UnexpectedTokenInOpeningTag(token) => format!("Unexpected token inside opening tag: {token}").into(), Self::UnexpectedTokenOutsideRoot(token) => format!("Unexpected characters outside the root element: {token}").into(), Self::UnexpectedXmlVersion(ref version) => format!("Invalid XML version: {version}").into(), Self::UnknownMarkupDeclaration(ref v) => format!("Unknown markup declaration: {v}").into(), Self::UnsupportedEncoding(ref v) => format!("Unsupported encoding: {v}").into(), Self::ExceededConfiguredLimit => "This document is larger/more complex than allowed by the parser's configuration".into(), } } } /// An XML parsing error. /// /// Consists of a 2D position in a document and a textual message describing the error. 
#[derive(Clone, PartialEq, Eq, Debug)] pub struct Error { pub(crate) pos: TextPosition, pub(crate) kind: ErrorKind, } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; write!(f, "{} ", self.pos)?; match &self.kind { Io(io_error) => io_error.fmt(f), Utf8(reason) => reason.fmt(f), Syntax(msg) => f.write_str(msg), UnexpectedEof => f.write_str("Unexpected EOF"), } } } impl Position for Error { #[inline] fn position(&self) -> TextPosition { self.pos } } impl Error { /// Returns a reference to a message which is contained inside this error. #[cold] #[doc(hidden)] #[allow(deprecated)] #[must_use] pub fn msg(&self) -> &str { use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; match &self.kind { Io(io_error) => io_error.description(), Utf8(reason) => reason.description(), Syntax(msg) => msg.as_ref(), UnexpectedEof => "Unexpected EOF", } } #[must_use] #[inline] pub fn kind(&self) -> &ErrorKind { &self.kind } } impl error::Error for Error { #[allow(deprecated)] #[cold] fn description(&self) -> &str { self.msg() } } impl<'a, P, M> From<(&'a P, M)> for Error where P: Position, M: Into> { #[cold] fn from(orig: (&'a P, M)) -> Self { Error { pos: orig.0.position(), kind: ErrorKind::Syntax(orig.1.into()), } } } impl From for Error { #[cold] fn from(e: util::CharReadError) -> Self { use crate::util::CharReadError::{Io, UnexpectedEof, Utf8}; Error { pos: TextPosition::new(), kind: match e { UnexpectedEof => ErrorKind::UnexpectedEof, Utf8(reason) => ErrorKind::Utf8(reason), Io(io_error) => ErrorKind::Io(io_error), }, } } } impl From for Error { #[cold] fn from(e: io::Error) -> Self { Error { pos: TextPosition::new(), kind: ErrorKind::Io(e), } } } impl Clone for ErrorKind { #[cold] fn clone(&self) -> Self { use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; match self { UnexpectedEof => UnexpectedEof, Utf8(reason) => Utf8(*reason), Io(io_error) => Io(io::Error::new(io_error.kind(), 
io_error.to_string())), Syntax(msg) => Syntax(msg.clone()), } } } impl PartialEq for ErrorKind { #[allow(deprecated)] fn eq(&self, other: &ErrorKind) -> bool { use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; match (self, other) { (UnexpectedEof, UnexpectedEof) => true, (Utf8(left), Utf8(right)) => left == right, (Io(left), Io(right)) => left.kind() == right.kind() && left.description() == right.description(), (Syntax(left), Syntax(right)) => left == right, (_, _) => false, } } } impl Eq for ErrorKind {} #[test] fn err_size() { assert!(std::mem::size_of::() <= 24); } xml-rs-0.8.19/src/reader/events.rs000064400000000000000000000212331046102023000150510ustar 00000000000000//! Contains `XmlEvent` datatype, instances of which are emitted by the parser. use std::fmt; use crate::attribute::OwnedAttribute; use crate::common::XmlVersion; use crate::name::OwnedName; use crate::namespace::Namespace; /// An element of an XML input stream. /// /// Items of this enum are emitted by `reader::EventReader`. They correspond to different /// elements of an XML document. #[derive(PartialEq, Clone)] pub enum XmlEvent { /// Corresponds to XML document declaration. /// /// This event is always emitted before any other event. It is emitted /// even if the actual declaration is not present in the document. StartDocument { /// XML version. /// /// If XML declaration is not present, defaults to `Version10`. version: XmlVersion, /// XML document encoding. /// /// If XML declaration is not present or does not contain `encoding` attribute, /// defaults to `"UTF-8"`. This field is currently used for no other purpose than /// informational. encoding: String, /// XML standalone declaration. /// /// If XML document is not present or does not contain `standalone` attribute, /// defaults to `None`. This field is currently used for no other purpose than /// informational. standalone: Option, }, /// Denotes to the end of the document stream. 
/// /// This event is always emitted after any other event (except `Error`). After it /// is emitted for the first time, it will always be emitted on next event pull attempts. EndDocument, /// Denotes an XML processing instruction. /// /// This event contains a processing instruction target (`name`) and opaque `data`. It /// is up to the application to process them. ProcessingInstruction { /// Processing instruction target. name: String, /// Processing instruction content. data: Option, }, /// Denotes a beginning of an XML element. /// /// This event is emitted after parsing opening tags or after parsing bodiless tags. In the /// latter case `EndElement` event immediately follows. StartElement { /// Qualified name of the element. name: OwnedName, /// A list of attributes associated with the element. /// /// Currently attributes are not checked for duplicates (TODO) attributes: Vec, /// Contents of the namespace mapping at this point of the document. namespace: Namespace, }, /// Denotes an end of an XML element. /// /// This event is emitted after parsing closing tags or after parsing bodiless tags. In the /// latter case it is emitted immediately after corresponding `StartElement` event. EndElement { /// Qualified name of the element. name: OwnedName, }, /// Denotes CDATA content. /// /// This event contains unparsed data. No unescaping will be performed. /// /// It is possible to configure a parser to emit `Characters` event instead of `CData`. See /// `pull::ParserConfiguration` structure for more information. CData(String), /// Denotes a comment. /// /// It is possible to configure a parser to ignore comments, so this event will never be emitted. /// See `pull::ParserConfiguration` structure for more information. Comment(String), /// Denotes character data outside of tags. /// /// Contents of this event will always be unescaped, so no entities like `<` or `&` or `{` /// will appear in it. 
/// /// It is possible to configure a parser to trim leading and trailing whitespace for this event. /// See `pull::ParserConfiguration` structure for more information. Characters(String), /// Denotes a chunk of whitespace outside of tags. /// /// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`. /// See `pull::ParserConfiguration` structure for more information. When combined with whitespace /// trimming, it will eliminate standalone whitespace from the event stream completely. Whitespace(String), } impl fmt::Debug for XmlEvent { #[cold] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { XmlEvent::StartDocument { ref version, ref encoding, standalone } => write!(f, "StartDocument({}, {}, {:?})", version, *encoding, standalone), XmlEvent::EndDocument => write!(f, "EndDocument"), XmlEvent::ProcessingInstruction { ref name, ref data } => write!(f, "ProcessingInstruction({}{})", *name, match *data { Some(ref data) => format!(", {data}"), None => String::new() }), XmlEvent::StartElement { ref name, ref attributes, namespace: Namespace(ref namespace) } => write!(f, "StartElement({}, {:?}{})", name, namespace, if attributes.is_empty() { String::new() } else { let attributes: Vec = attributes.iter().map( |a| format!("{} -> {}", a.name, a.value) ).collect(); format!(", [{}]", attributes.join(", ")) }), XmlEvent::EndElement { ref name } => write!(f, "EndElement({name})"), XmlEvent::Comment(ref data) => write!(f, "Comment({data})"), XmlEvent::CData(ref data) => write!(f, "CData({data})"), XmlEvent::Characters(ref data) => write!(f, "Characters({data})"), XmlEvent::Whitespace(ref data) => write!(f, "Whitespace({data})") } } } impl XmlEvent { /// Obtains a writer event from this reader event. /// /// This method is useful for streaming processing of XML documents where the output /// is also an XML document. 
With this method it is possible to process some events /// while passing other events through to the writer unchanged: /// /// ```rust /// use std::str; /// /// use xml::{EventReader, EventWriter}; /// use xml::reader::XmlEvent as ReaderEvent; /// use xml::writer::XmlEvent as WriterEvent; /// /// let mut input: &[u8] = b"world"; /// let mut output: Vec = Vec::new(); /// /// { /// let mut reader = EventReader::new(&mut input); /// let mut writer = EventWriter::new(&mut output); /// /// for e in reader { /// match e.unwrap() { /// ReaderEvent::Characters(s) => /// writer.write(WriterEvent::characters(&s.to_uppercase())).unwrap(), /// e => if let Some(e) = e.as_writer_event() { /// writer.write(e).unwrap() /// } /// } /// } /// } /// /// assert_eq!( /// str::from_utf8(&output).unwrap(), /// r#"WORLD"# /// ); /// ``` /// /// Note that this API may change or get additions in future to improve its ergonomics. #[must_use] pub fn as_writer_event(&self) -> Option> { match *self { XmlEvent::StartDocument { version, ref encoding, standalone } => Some(crate::writer::events::XmlEvent::StartDocument { version, encoding: Some(encoding), standalone }), XmlEvent::ProcessingInstruction { ref name, ref data } => Some(crate::writer::events::XmlEvent::ProcessingInstruction { name, data: data.as_ref().map(|s| &**s) }), XmlEvent::StartElement { ref name, ref attributes, ref namespace } => Some(crate::writer::events::XmlEvent::StartElement { name: name.borrow(), attributes: attributes.iter().map(|a| a.borrow()).collect(), namespace: namespace.borrow(), }), XmlEvent::EndElement { ref name } => Some(crate::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }), XmlEvent::Comment(ref data) => Some(crate::writer::events::XmlEvent::Comment(data)), XmlEvent::CData(ref data) => Some(crate::writer::events::XmlEvent::CData(data)), XmlEvent::Characters(ref data) | XmlEvent::Whitespace(ref data) => Some(crate::writer::events::XmlEvent::Characters(data)), XmlEvent::EndDocument => None, } 
} } xml-rs-0.8.19/src/reader/indexset.rs000064400000000000000000000063771046102023000154040ustar 00000000000000use crate::attribute::OwnedAttribute; use crate::name::OwnedName; use std::collections::hash_map::RandomState; use std::collections::HashSet; use std::hash::BuildHasher; use std::hash::Hash; use std::hash::Hasher; /// An ordered set pub(crate) struct AttributesSet { vec: Vec, /// Uses a no-op hasher, because these u64s are hashes already may_contain: HashSet, /// This is real hasher for the `OwnedName` hasher: RandomState, } /// Use linear search and don't allocate `HashSet` if there are few attributes, /// because allocation costs more than a few comparisons. const HASH_THRESHOLD: usize = 8; impl AttributesSet { pub fn new() -> Self { Self { vec: Vec::new(), hasher: RandomState::new(), may_contain: HashSet::default(), } } fn hash(&self, val: &OwnedName) -> u64 { let mut h = self.hasher.build_hasher(); val.hash(&mut h); h.finish() } pub fn len(&self) -> usize { self.vec.len() } pub fn contains(&self, name: &OwnedName) -> bool { // fall back to linear search only on duplicate or hash collision (self.vec.len() < HASH_THRESHOLD || self.may_contain.contains(&self.hash(name))) && self.vec.iter().any(move |a| &a.name == name) } pub fn push(&mut self, attr: OwnedAttribute) { if self.vec.len() >= HASH_THRESHOLD { if self.vec.len() == HASH_THRESHOLD { self.may_contain.reserve(HASH_THRESHOLD * 2); for attr in &self.vec { self.may_contain.insert(self.hash(&attr.name)); } } self.may_contain.insert(self.hash(&attr.name)); } self.vec.push(attr); } pub fn into_vec(self) -> Vec { self.vec } } #[test] fn indexset() { let mut s = AttributesSet::new(); let not_here = OwnedName { local_name: "attr1000".into(), namespace: Some("test".into()), prefix: None, }; // this test will take a lot of time if the `contains()` is linear, and the loop is quadratic for i in 0..50000 { let name = OwnedName { local_name: format!("attr{i}"), namespace: None, prefix: None, }; 
assert!(!s.contains(&name)); s.push(OwnedAttribute { name, value: String::new() }); assert!(!s.contains(¬_here)); } assert!(s.contains(&OwnedName { local_name: "attr1234".into(), namespace: None, prefix: None, })); assert!(s.contains(&OwnedName { local_name: "attr0".into(), namespace: None, prefix: None, })); assert!(s.contains(&OwnedName { local_name: "attr49999".into(), namespace: None, prefix: None, })); } /// Hashser that does nothing except passing u64 through struct U64Hasher(u64); impl Hasher for U64Hasher { fn finish(&self) -> u64 { self.0 } fn write(&mut self, slice: &[u8]) { for &v in slice { self.0 ^= u64::from(v) } // unused in practice } fn write_u64(&mut self, i: u64) { self.0 ^= i; } } #[derive(Default)] struct U64HasherBuilder; impl BuildHasher for U64HasherBuilder { type Hasher = U64Hasher; fn build_hasher(&self) -> U64Hasher { U64Hasher(0) } } xml-rs-0.8.19/src/reader/lexer.rs000064400000000000000000001212301046102023000146620ustar 00000000000000//! Contains simple lexer for XML documents. //! //! This module is for internal use. Use `xml::pull` module to do parsing. use crate::reader::ErrorKind; use crate::reader::error::SyntaxError; use std::collections::VecDeque; use std::fmt; use std::io::Read; use std::result; use crate::common::{is_name_char, is_whitespace_char, Position, TextPosition, is_xml10_char, is_xml11_char}; use crate::reader::Error; use crate::util::{CharReader, Encoding}; use super::ParserConfig2; /// `Token` represents a single lexeme of an XML document. These lexemes /// are used to perform actual parsing. #[derive(Copy, Clone, PartialEq, Eq, Debug)] pub(crate) enum Token { /// `` ProcessingInstructionEnd, /// `` TagEnd, /// `/>` EmptyTagEnd, /// `` CommentEnd, /// Any non-special character except whitespace. 
Character(char), /// `=` EqualsSign, /// `'` SingleQuote, /// `"` DoubleQuote, /// `` CDataEnd, /// `&` ReferenceStart, /// `;` ReferenceEnd, /// `) -> fmt::Result { match *self { Token::Character(c) => c.fmt(f), other => match other { Token::OpeningTagStart => "<", Token::ProcessingInstructionStart => " " " "", Token::CDataEnd => "]]>", Token::ReferenceStart => "&", Token::ReferenceEnd => ";", Token::EqualsSign => "=", Token::SingleQuote => "'", Token::DoubleQuote => "\"", Token::MarkupDeclarationStart => " unreachable!() }.fmt(f), } } } impl Token { pub fn as_static_str(&self) -> Option<&'static str> { match *self { Token::OpeningTagStart => Some("<"), Token::ProcessingInstructionStart => Some(" Some(" Some(" Some(""), Token::CDataEnd => Some("]]>"), Token::ReferenceStart => Some("&"), Token::ReferenceEnd => Some(";"), Token::EqualsSign => Some("="), Token::SingleQuote => Some("'"), Token::DoubleQuote => Some("\""), _ => None } } // using String.push_str(token.to_string()) is simply way too slow pub fn push_to_string(&self, target: &mut String) { match *self { Token::Character(c) => { debug_assert!(is_xml10_char(c) || is_xml11_char(c)); target.push(c) }, _ => if let Some(s) = self.as_static_str() { target.push_str(s); } } } } #[derive(Copy, Clone)] enum State { /// Default state Normal, /// Triggered on '<' TagStarted, /// Triggered on '` InsideMarkupDeclarationQuotedString(QuoteStyle), } #[derive(Copy, Clone, Eq, PartialEq)] enum QuoteStyle { Single, Double } #[derive(Copy, Clone)] enum ClosingSubstate { First, Second } #[derive(Copy, Clone)] enum DoctypeStartedSubstate { D, DO, DOC, DOCT, DOCTY, DOCTYP } #[derive(Copy, Clone)] enum CDataStartedSubstate { E, C, CD, CDA, CDAT, CDATA } /// `Result` represents lexing result. It is either a token or an error message. 
pub(crate) type Result, E = Error> = result::Result; /// Helps to set up a dispatch table for lexing large unambigous tokens like /// ` ( match $s { $( $st => match $c { $stc => $_self.move_to($is($next_st)), _ => $_self.handle_error($chunk, $c) }, )+ $end_st => match $c { $end_c => $e, _ => $_self.handle_error($end_chunk, $c) } } ) ); /// `Lexer` is a lexer for XML documents, which implements pull API. /// /// Main method is `next_token` which accepts an `std::io::Read` instance and /// tries to read the next lexeme from it. /// /// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s. /// When it is not set, errors will be reported as `Err` objects with a string message. /// By default this flag is not set. Use `enable_errors` and `disable_errors` methods /// to toggle the behavior. pub(crate) struct Lexer { st: State, reader: CharReader, pos: TextPosition, head_pos: TextPosition, char_queue: VecDeque, /// Default state to go back to after a tag end (may be `InsideDoctype`) normal_state: State, inside_token: bool, eof_handled: bool, reparse_depth: u8, #[cfg(test)] skip_errors: bool, max_entity_expansion_depth: u8, max_entity_expansion_length: usize, } impl Position for Lexer { #[inline] /// Returns the position of the last token produced by the lexer fn position(&self) -> TextPosition { self.pos } } impl Lexer { /// Returns a new lexer with default state. 
pub(crate) fn new(config: &ParserConfig2) -> Lexer { Lexer { reader: CharReader::new(), pos: TextPosition::new(), head_pos: TextPosition::new(), char_queue: VecDeque::with_capacity(4), // TODO: check size st: State::Normal, normal_state: State::Normal, inside_token: false, eof_handled: false, reparse_depth: 0, #[cfg(test)] skip_errors: false, max_entity_expansion_depth: config.max_entity_expansion_depth, max_entity_expansion_length: config.max_entity_expansion_length, } } pub(crate) fn encoding(&mut self) -> Encoding { self.reader.encoding } pub(crate) fn set_encoding(&mut self, encoding: Encoding) { self.reader.encoding = encoding; } /// Disables error handling so `next_token` will return `Some(Chunk(..))` /// upon invalid lexeme with this lexeme content. #[cfg(test)] fn disable_errors(&mut self) { self.skip_errors = true; } /// Reset the eof handled flag of the lexer. #[inline] pub fn reset_eof_handled(&mut self) { self.eof_handled = false; } /// Tries to read the next token from the buffer. /// /// It is possible to pass different instaces of `BufReader` each time /// this method is called, but the resulting behavior is undefined in this case. /// /// Return value: /// * `Err(reason) where reason: reader::Error` - when an error occurs; /// * `Ok(None)` - upon end of stream is reached; /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream. pub fn next_token(&mut self, b: &mut B) -> Result { // Already reached end of buffer if self.eof_handled { return Ok(None); } if !self.inside_token { self.pos = self.head_pos; self.inside_token = true; } // Check if we have saved a char or two for ourselves while let Some(c) = self.char_queue.pop_front() { match self.dispatch_char(c)? { Some(t) => { self.inside_token = false; return Ok(Some(t)); } None => {} // continue } } // if char_queue is empty, all circular reparsing is done self.reparse_depth = 0; loop { let c = match self.reader.next_char_from(b)? 
{ Some(c) => c, // got next char None => break, // nothing to read left }; if c == '\n' { self.head_pos.new_line(); } else { self.head_pos.advance(1); } match self.dispatch_char(c)? { Some(t) => { self.inside_token = false; return Ok(Some(t)); } None => { // continue } } } self.end_of_stream() } #[inline(never)] fn end_of_stream(&mut self) -> Result { // Handle end of stream self.eof_handled = true; self.pos = self.head_pos; match self.st { State::InsideCdata | State::CDataClosing(_) => Err(self.error(SyntaxError::UnclosedCdata)), State::TagStarted | State::CommentOrCDataOrDoctypeStarted | State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) | State::CommentClosing(ClosingSubstate::Second) | State::InsideComment | State::InsideMarkupDeclaration | State::InsideProcessingInstruction | State::ProcessingInstructionClosing | State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) => Err(self.error(SyntaxError::UnexpectedEof)), State::EmptyTagClosing => Ok(Some(Token::Character('/'))), State::CommentClosing(ClosingSubstate::First) => Ok(Some(Token::Character('-'))), State::InvalidCDataClosing(ClosingSubstate::First) => Ok(Some(Token::Character(']'))), State::InvalidCDataClosing(ClosingSubstate::Second) => { self.eof_handled = false; self.move_to_with_unread(State::Normal, &[']'], Token::Character(']')) }, State::Normal => Ok(None), } } #[cold] fn error(&self, e: SyntaxError) -> Error { Error { pos: self.position(), kind: ErrorKind::Syntax(e.to_cow()), } } #[inline(never)] fn dispatch_char(&mut self, c: char) -> Result { match self.st { State::Normal => self.normal(c), State::TagStarted => self.tag_opened(c), State::EmptyTagClosing => self.empty_element_closing(c), State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c), State::InsideCdata => self.inside_cdata(c), State::CDataStarted(s) => self.cdata_started(c, s), State::InsideComment => self.inside_comment_state(c), State::CommentStarted => 
self.comment_started(c), State::InsideProcessingInstruction => self.inside_processing_instruction(c), State::ProcessingInstructionClosing => self.processing_instruction_closing(c), State::CommentClosing(s) => self.comment_closing(c, s), State::CDataClosing(s) => self.cdata_closing(c, s), State::InsideDoctype => self.inside_doctype(c), State::DoctypeStarted(s) => self.doctype_started(c, s), State::InvalidCDataClosing(s) => self.invalid_cdata_closing(c, s), State::InsideMarkupDeclaration => self.markup_declaration(c), State::InsideMarkupDeclarationQuotedString(q) => self.markup_declaration_string(c, q), } } #[inline] fn move_to(&mut self, st: State) -> Result { self.st = st; Ok(None) } #[inline] fn move_to_with(&mut self, st: State, token: Token) -> Result { self.st = st; Ok(Some(token)) } #[inline] fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Result { self.normal_state = st; self.st = st; Ok(Some(token)) } fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result { for c in cs.iter().rev().copied() { self.char_queue.push_front(c); } self.move_to_with(st, token) } pub(crate) fn reparse(&mut self, markup: &str) -> Result<()> { if markup.is_empty() { return Ok(()); } self.reparse_depth += 1; if self.reparse_depth > self.max_entity_expansion_depth || self.char_queue.len() > self.max_entity_expansion_length { return Err(self.error(SyntaxError::EntityTooBig)) } self.eof_handled = false; self.char_queue.reserve(markup.len()); for c in markup.chars().rev() { self.char_queue.push_front(c); } Ok(()) } fn handle_error(&mut self, chunk: &'static str, c: char) -> Result { debug_assert!(!chunk.is_empty()); #[cfg(test)] if self.skip_errors { let mut chars = chunk.chars(); let first = chars.next().unwrap_or('\0'); self.char_queue.extend(chars); self.char_queue.push_back(c); return self.move_to_with(State::Normal, Token::Character(first)); } Err(self.error(SyntaxError::UnexpectedTokenBefore(chunk, c))) } /// Encountered a char fn 
normal(&mut self, c: char) -> Result { match c { '<' => self.move_to(State::TagStarted), '>' => Ok(Some(Token::TagEnd)), '/' => self.move_to(State::EmptyTagClosing), '=' => Ok(Some(Token::EqualsSign)), '"' => Ok(Some(Token::DoubleQuote)), '\'' => Ok(Some(Token::SingleQuote)), ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::First)), '&' => Ok(Some(Token::ReferenceStart)), ';' => Ok(Some(Token::ReferenceEnd)), _ => Ok(Some(Token::Character(c))) } } fn inside_cdata(&mut self, c: char) -> Result { match c { ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)), _ => Ok(Some(Token::Character(c))) } } fn inside_processing_instruction(&mut self, c: char) -> Result { // These tokens are used by `` parser match c { '?' => self.move_to(State::ProcessingInstructionClosing), '<' => Ok(Some(Token::OpeningTagStart)), '>' => Ok(Some(Token::TagEnd)), '/' => Ok(Some(Token::ClosingTagStart)), '=' => Ok(Some(Token::EqualsSign)), '"' => Ok(Some(Token::DoubleQuote)), '\'' => Ok(Some(Token::SingleQuote)), '&' => Ok(Some(Token::ReferenceStart)), ';' => Ok(Some(Token::ReferenceEnd)), _ => Ok(Some(Token::Character(c))) } } fn inside_comment_state(&mut self, c: char) -> Result { match c { '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), _ => Ok(Some(Token::Character(c))) } } /// Encountered '<' fn tag_opened(&mut self, c: char) -> Result { match c { '?' => self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart), '/' => self.move_to_with(self.normal_state, Token::ClosingTagStart), '!' 
=> self.move_to(State::CommentOrCDataOrDoctypeStarted), _ if is_whitespace_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart), _ if is_name_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart), _ => self.handle_error("<", c) } } /// Encountered ' Result { match c { '-' => self.move_to(State::CommentStarted), '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)), 'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)), 'E' | 'A' | 'N' if matches!(self.normal_state, State::InsideDoctype) => { self.move_to_with_unread(State::InsideMarkupDeclaration, &[c], Token::MarkupDeclarationStart) }, _ => self.handle_error(" Result { match c { '-' => self.move_to_with(State::InsideComment, Token::CommentStart), _ => self.handle_error(" Result { use self::CDataStartedSubstate::{C, CD, CDA, CDAT, CDATA, E}; dispatch_on_enum_state!(self, s, c, State::CDataStarted, E ; 'C' ; C ; " Result { match c { '<' => self.handle_error("' => self.move_to_with(self.normal_state, Token::TagEnd), '&' => Ok(Some(Token::ReferenceStart)), ';' => Ok(Some(Token::ReferenceEnd)), '"' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote), '\'' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote), _ => Ok(Some(Token::Character(c))), } } fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Result { match c { '"' if q == QuoteStyle::Double => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote), '\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote), _ => Ok(Some(Token::Character(c))), } } /// Encountered ' Result { use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP}; dispatch_on_enum_state!(self, s, c, State::DoctypeStarted, D ; 'O' ; DO ; " Result { match c { '>' => self.move_to_and_reset_normal(State::Normal, 
Token::TagEnd), '<' => self.move_to(State::TagStarted), '&' => Ok(Some(Token::ReferenceStart)), ';' => Ok(Some(Token::ReferenceEnd)), '"' => Ok(Some(Token::DoubleQuote)), '\'' => Ok(Some(Token::SingleQuote)), _ => Ok(Some(Token::Character(c))), } } /// Encountered '?' fn processing_instruction_closing(&mut self, c: char) -> Result { match c { '>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd), _ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')), } } /// Encountered '/' fn empty_element_closing(&mut self, c: char) -> Result { match c { '>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd), _ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')), } } /// Encountered '-' fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result { match s { ClosingSubstate::First => match c { '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)), _ => self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')), }, ClosingSubstate::Second => match c { '>' => self.move_to_with(self.normal_state, Token::CommentEnd), // double dash not followed by a greater-than is a hard error inside comment _ => self.handle_error("--", c), }, } } /// Encountered ']' fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result { match s { ClosingSubstate::First => match c { ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)), _ => self.move_to_with_unread(State::InsideCdata, &[c], Token::Character(']')), }, ClosingSubstate::Second => match c { '>' => self.move_to_with(State::Normal, Token::CDataEnd), _ => self.move_to_with_unread(State::InsideCdata, &[']', c], Token::Character(']')), }, } } /// Encountered ']' fn invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result { match s { ClosingSubstate::First => match c { ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::Second)), _ => 
self.move_to_with_unread(State::Normal, &[c], Token::Character(']')), }, ClosingSubstate::Second => match c { '>' => self.move_to_with(self.normal_state, Token::CDataEnd), _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')), }, } } } #[cfg(test)] mod tests { use crate::{common::Position, reader::ParserConfig2}; use std::io::{BufReader, Cursor}; use super::{Lexer, Token}; macro_rules! assert_oks( (for $lex:ident and $buf:ident ; $($e:expr)+) => ({ $( assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf)); )+ }) ); macro_rules! assert_err( (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({ let err = $lex.next_token(&mut $buf); assert!(err.is_err()); let err = err.unwrap_err(); assert_eq!($r as u64, err.position().row); assert_eq!($c as u64, err.position().column); }) ); macro_rules! assert_none( (for $lex:ident and $buf:ident) => ( assert_eq!(Ok(None), $lex.next_token(&mut $buf)) ) ); fn make_lex_and_buf(s: &str) -> (Lexer, BufReader>>) { (Lexer::new(&ParserConfig2::default()), BufReader::new(Cursor::new(s.to_owned().into_bytes()))) } #[test] fn tricky_pi() { let (mut lex, mut buf) = make_lex_and_buf(r#""#); assert_oks!(for lex and buf ; Token::ProcessingInstructionStart Token::Character('x') Token::OpeningTagStart // processing of relies on the extra tokens Token::Character('!') Token::Character('-') Token::Character('-') Token::Character(' ') Token::ReferenceStart Token::Character('?') Token::ProcessingInstructionEnd Token::OpeningTagStart Token::Character('x') Token::TagEnd ); assert_none!(for lex and buf); } #[test] fn reparser() { let (mut lex, mut buf) = make_lex_and_buf(r#"&a;"#); assert_oks!(for lex and buf ; Token::ReferenceStart Token::Character('a') Token::ReferenceEnd ); lex.reparse("").unwrap(); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('h') Token::Character('i') Token::EmptyTagEnd ); assert_none!(for lex and buf); } #[test] fn simple_lexer_test() { let (mut lex, mut buf) = 
make_lex_and_buf( r#" xd

 "# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::Character(' ') Token::Character('p') Token::EqualsSign Token::SingleQuote Token::Character('q') Token::SingleQuote Token::TagEnd Token::Character(' ') Token::Character('x') Token::OpeningTagStart Token::Character('b') Token::Character(' ') Token::Character('z') Token::EqualsSign Token::DoubleQuote Token::Character('y') Token::DoubleQuote Token::TagEnd Token::Character('d') Token::Character('\t') Token::ClosingTagStart Token::Character('b') Token::TagEnd Token::ClosingTagStart Token::Character('a') Token::TagEnd Token::OpeningTagStart Token::Character('p') Token::EmptyTagEnd Token::Character(' ') Token::ProcessingInstructionStart Token::Character('n') Token::Character('m') Token::Character(' ') Token::ProcessingInstructionEnd Token::Character(' ') Token::CommentStart Token::Character(' ') Token::Character('a') Token::Character(' ') Token::Character('c') Token::Character(' ') Token::CommentEnd Token::Character(' ') Token::ReferenceStart Token::Character('n') Token::Character('b') Token::Character('s') Token::Character('p') Token::ReferenceEnd ); assert_none!(for lex and buf); } #[test] fn special_chars_test() { let (mut lex, mut buf) = make_lex_and_buf( r#"?x!+ // -| ]z]]"# ); assert_oks!(for lex and buf ; Token::Character('?') Token::Character('x') Token::Character('!') Token::Character('+') Token::Character(' ') Token::Character('/') Token::Character('/') Token::Character(' ') Token::Character('-') Token::Character('|') Token::Character(' ') Token::Character(']') Token::Character('z') Token::Character(']') Token::Character(']') ); assert_none!(for lex and buf); } #[test] fn cdata_test() { let (mut lex, mut buf) = make_lex_and_buf( r#" "# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::CDataStart Token::Character('x') Token::Character(' ') Token::Character('y') Token::Character(' ') Token::Character('?') Token::CDataEnd 
Token::Character(' ') Token::ClosingTagStart Token::Character('a') Token::TagEnd ); assert_none!(for lex and buf); } #[test] fn cdata_closers_test() { let (mut lex, mut buf) = make_lex_and_buf( r#" ]> ]]>]]"# ); assert_oks!(for lex and buf ; Token::CDataStart Token::Character(']') Token::Character(' ') Token::Character('>') Token::Character(' ') Token::Character(']') Token::Character('>') Token::Character(' ') Token::CDataEnd Token::CommentStart Token::CommentEnd Token::Character(']') Token::Character(']') Token::OpeningTagStart Token::Character('a') Token::TagEnd ); assert_none!(for lex and buf); } #[test] fn doctype_test() { let (mut lex, mut buf) = make_lex_and_buf( r#" "# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::DoctypeStart Token::Character(' ') Token::Character('a') Token::Character('b') Token::Character(' ') Token::Character('x') Token::Character('x') Token::Character(' ') Token::Character('z') Token::TagEnd Token::Character(' ') ); assert_none!(for lex and buf); } #[test] fn tricky_comments() { let (mut lex, mut buf) = make_lex_and_buf( r#""# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::CommentStart Token::Character(' ') Token::Character('C') Token::Character(' ') Token::Character('-') Token::Character('>') Token::CommentEnd Token::ClosingTagStart Token::Character('a') Token::TagEnd ); assert_none!(for lex and buf); } #[test] fn doctype_with_internal_subset_test() { let (mut lex, mut buf) = make_lex_and_buf( r#">>"> ]> "# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::DoctypeStart Token::Character(' ') Token::Character('a') Token::Character('b') Token::Character('[') Token::MarkupDeclarationStart Token::Character('E') Token::Character('L') Token::Character('E') Token::Character('M') Token::Character('E') Token::Character('N') Token::Character('T') Token::Character(' ') Token::Character('b') 
Token::Character('a') Token::Character(' ') Token::DoubleQuote Token::Character('>') Token::Character('>') Token::Character('>') Token::DoubleQuote Token::TagEnd Token::Character(' ') Token::Character(']') Token::TagEnd Token::Character(' ') ); assert_none!(for lex and buf); } #[test] fn doctype_internal_pi_comment() { let (mut lex, mut buf) = make_lex_and_buf( " ?> \n]>" ); assert_oks!(for lex and buf ; Token::DoctypeStart Token::Character(' ') Token::Character('a') Token::Character(' ') Token::Character('[') Token::Character('\n') Token::MarkupDeclarationStart Token::Character('E') Token::Character('L') Token::Character('E') Token::Character('M') Token::Character('E') Token::Character('N') Token::Character('T') Token::Character(' ') Token::Character('l') Token::Character(' ') Token::Character('A') Token::Character('N') Token::Character('Y') Token::TagEnd Token::Character(' ') Token::CommentStart Token::Character(' ') Token::Character('<') Token::Character('?') Token::Character('n') Token::Character('o') Token::Character('n') Token::Character('?') Token::Character('>') Token::CommentEnd Token::Character(' ') Token::ProcessingInstructionStart Token::Character('p') Token::Character('i') Token::Character(' ') Token::TagEnd // not really Token::Character(' ') Token::ProcessingInstructionEnd Token::Character(' ') Token::Character('\n') Token::Character(']') Token::TagEnd // DTD ); assert_none!(for lex and buf); } #[test] fn end_of_stream_handling_ok() { macro_rules! eof_check( ($data:expr ; $token:expr) => ({ let (mut lex, mut buf) = make_lex_and_buf($data); assert_oks!(for lex and buf ; $token); assert_none!(for lex and buf); }) ); eof_check!("?" ; Token::Character('?')); eof_check!("/" ; Token::Character('/')); eof_check!("-" ; Token::Character('-')); eof_check!("]" ; Token::Character(']')); eof_check!("]" ; Token::Character(']')); eof_check!("]" ; Token::Character(']')); } #[test] fn end_of_stream_handling_error() { macro_rules! 
eof_check( ($data:expr; $r:expr, $c:expr) => ({ let (mut lex, mut buf) = make_lex_and_buf($data); assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream"); assert_none!(for lex and buf); }) ); eof_check!("<" ; 0, 1); eof_check!(" ({ let (mut lex, mut buf) = make_lex_and_buf($data); assert_err!(for lex and buf expect row $r ; $c, $s); let (mut lex, mut buf) = make_lex_and_buf($data); lex.disable_errors(); for c in $chunk.chars() { assert_eq!(Ok(Some(Token::Character(c))), lex.next_token(&mut buf)); } assert_oks!(for lex and buf ; Token::Character($app) ); assert_none!(for lex and buf); }) ); #[test] fn token_size() { assert_eq!(4, std::mem::size_of::()); assert_eq!(2, std::mem::size_of::()); } #[test] fn error_in_cdata_started() { check_case!(""# ); assert_oks!(for lex and buf ; Token::CDataStart Token::Character('F') Token::Character('o') Token::Character('o') Token::Character(' ') Token::Character('[') Token::Character('B') Token::Character('a') Token::Character('r') Token::Character(']') Token::CDataEnd ); assert_none!(for lex and buf); } } xml-rs-0.8.19/src/reader/parser/inside_cdata.rs000064400000000000000000000023411046102023000174470ustar 00000000000000use crate::reader::error::SyntaxError; use crate::reader::lexer::Token; use crate::{common::is_whitespace_char, reader::events::XmlEvent}; use super::{PullParser, Result, State}; impl PullParser { pub fn inside_cdata(&mut self, t: Token) -> Option { match t { Token::CDataEnd => { let event = if self.config.c.cdata_to_characters { // start called push_pos, but there will be no event to pop it if self.buf.is_empty() { self.next_pos(); } None } else { let data = self.take_buf(); Some(Ok(XmlEvent::CData(data))) }; self.into_state(State::OutsideTag, event) } Token::Character(c) if !self.is_valid_xml_char(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) }, Token::Character(c) => { if !is_whitespace_char(c) { self.inside_whitespace = false; } self.buf.push(c); None } _ => 
unreachable!(), } } } xml-rs-0.8.19/src/reader/parser/inside_closing_tag_name.rs000064400000000000000000000031351046102023000216660ustar 00000000000000use crate::reader::error::SyntaxError; use crate::{common::is_whitespace_char, namespace}; use crate::reader::lexer::Token; use super::{ClosingTagSubstate, PullParser, QualifiedNameTarget, Result, State}; impl PullParser { pub fn inside_closing_tag_name(&mut self, t: Token, s: ClosingTagSubstate) -> Option { match s { ClosingTagSubstate::CTInsideName => self.read_qualified_name(t, QualifiedNameTarget::ClosingTagNameTarget, |this, token, name| { match name.prefix_ref() { Some(prefix) if prefix == namespace::NS_XML_PREFIX || prefix == namespace::NS_XMLNS_PREFIX => Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))), _ => { this.data.element_name = Some(name.clone()); match token { Token::TagEnd => this.emit_end_element(), Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)), _ => Some(this.error(SyntaxError::UnexpectedTokenInClosingTag(token))) } } } }), ClosingTagSubstate::CTAfterName => match t { Token::TagEnd => self.emit_end_element(), Token::Character(c) if is_whitespace_char(c) => None, // Skip whitespace _ => Some(self.error(SyntaxError::UnexpectedTokenInClosingTag(t))) } } } } xml-rs-0.8.19/src/reader/parser/inside_comment.rs000064400000000000000000000021541046102023000200370ustar 00000000000000use crate::reader::error::SyntaxError; use crate::reader::events::XmlEvent; use crate::reader::lexer::Token; use super::{PullParser, Result, State}; impl PullParser { pub fn inside_comment(&mut self, t: Token) -> Option { match t { Token::CommentEnd if self.config.c.ignore_comments => { self.into_state_continue(State::OutsideTag) } Token::CommentEnd => { let data = self.take_buf(); self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data))) } Token::Character(c) if !self.is_valid_xml_char(c) => { 
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) }, _ if self.config.c.ignore_comments => None, // Do not modify buffer if ignoring the comment _ => { if self.buf.len() > self.config.max_data_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } t.push_to_string(&mut self.buf); None } } } } xml-rs-0.8.19/src/reader/parser/inside_declaration.rs000064400000000000000000000224111046102023000206600ustar 00000000000000use crate::common::{is_whitespace_char, XmlVersion}; use crate::reader::error::SyntaxError; use crate::reader::events::XmlEvent; use crate::reader::lexer::Token; use crate::util::Encoding; use super::{ DeclarationSubstate, Encountered, PullParser, QualifiedNameTarget, Result, State, DEFAULT_VERSION, }; impl PullParser { #[inline(never)] fn emit_start_document(&mut self) -> Option { debug_assert!(self.encountered == Encountered::None); self.encountered = Encountered::Declaration; let version = self.data.version; let encoding = self.data.take_encoding(); let standalone = self.data.standalone; if let Some(new_encoding) = encoding.as_deref() { let new_encoding = match new_encoding.parse() { Ok(e) => e, Err(_) if self.config.ignore_invalid_encoding_declarations => Encoding::Latin1, Err(_) => return Some(self.error(SyntaxError::UnsupportedEncoding(new_encoding.into()))), }; let current_encoding = self.lexer.encoding(); if current_encoding != new_encoding { let set = match (current_encoding, new_encoding) { (Encoding::Unknown | Encoding::Default, new) if new != Encoding::Utf16 => new, (Encoding::Utf16Be | Encoding::Utf16Le, Encoding::Utf16) => current_encoding, _ if self.config.ignore_invalid_encoding_declarations => current_encoding, _ => return Some(self.error(SyntaxError::ConflictingEncoding(new_encoding, current_encoding))), }; self.lexer.set_encoding(set); } } let current_encoding = self.lexer.encoding(); self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument { version: version.unwrap_or(DEFAULT_VERSION), 
encoding: encoding.unwrap_or_else(move || current_encoding.to_string()), standalone })) } // TODO: remove redundancy via macros or extra methods pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option { match s { DeclarationSubstate::BeforeVersion => match t { Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)), Token::Character(c) if is_whitespace_char(c) => None, // continue _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { match &*name.local_name { "ersion" if name.namespace.is_none() => this.into_state_continue(State::InsideDeclaration( if token == Token::EqualsSign { DeclarationSubstate::InsideVersionValue } else { DeclarationSubstate::AfterVersion } )), _ => Some(this.error(SyntaxError::UnexpectedNameInsideXml(name.to_string().into()))), } }), DeclarationSubstate::AfterVersion => match t { Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)), Token::Character(c) if is_whitespace_char(c) => None, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| { this.data.version = match &*value { "1.0" => Some(XmlVersion::Version10), "1.1" => Some(XmlVersion::Version11), _ => None }; if this.data.version.is_some() { this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue)) } else { Some(this.error(SyntaxError::UnexpectedXmlVersion(value.into()))) } }), DeclarationSubstate::AfterVersionValue => match t { Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeEncoding)), Token::ProcessingInstructionEnd => self.emit_start_document(), _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, 
DeclarationSubstate::BeforeEncoding => match t { Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)), Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), Token::ProcessingInstructionEnd => self.emit_start_document(), Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { match &*name.local_name { "ncoding" if name.namespace.is_none() => this.into_state_continue(State::InsideDeclaration( if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding } )), _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))) } }), DeclarationSubstate::AfterEncoding => match t { Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)), Token::Character(c) if is_whitespace_char(c) => None, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| { this.data.encoding = Some(value); this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterEncodingValue)) }), DeclarationSubstate::AfterEncodingValue => match t { Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)), Token::ProcessingInstructionEnd => self.emit_start_document(), _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::BeforeStandaloneDecl => match t { Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), Token::ProcessingInstructionEnd => self.emit_start_document(), 
Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { match &*name.local_name { "tandalone" if name.namespace.is_none() => this.into_state_continue(State::InsideDeclaration( if token == Token::EqualsSign { DeclarationSubstate::InsideStandaloneDeclValue } else { DeclarationSubstate::AfterStandaloneDecl } )), _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))), } }), DeclarationSubstate::AfterStandaloneDecl => match t { Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)), Token::Character(c) if is_whitespace_char(c) => None, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| { let standalone = match &*value { "yes" => Some(true), "no" => Some(false), _ => None }; if standalone.is_some() { this.data.standalone = standalone; this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue)) } else { Some(this.error(SyntaxError::InvalidStandaloneDeclaration(value.into()))) } }), DeclarationSubstate::AfterStandaloneDeclValue => match t { Token::ProcessingInstructionEnd => self.emit_start_document(), Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, } } } xml-rs-0.8.19/src/reader/parser/inside_doctype.rs000064400000000000000000000305421046102023000200460ustar 00000000000000use crate::reader::error::SyntaxError; use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; use crate::reader::lexer::Token; use super::{DoctypeSubstate, PullParser, QuoteToken, Result, State}; impl PullParser { pub fn inside_doctype(&mut self, t: Token, substate: 
DoctypeSubstate) -> Option { match substate { DoctypeSubstate::Outside => match t { Token::TagEnd => self.into_state_continue(State::OutsideTag), Token::MarkupDeclarationStart => { self.buf.clear(); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InsideName)) }, Token::Character('%') => { self.data.ref_data.clear(); self.data.ref_data.push('%'); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInDtd)) }, Token::CommentStart => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Comment)) }, Token::SingleQuote | Token::DoubleQuote => { // just discard string literals self.data.quote = Some(super::QuoteToken::from_token(&t)); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::String)) }, Token::CDataEnd | Token::CDataStart => Some(self.error(SyntaxError::UnexpectedToken(t))), // TODO: parse SYSTEM, and [ _ => None, }, DoctypeSubstate::String => match t { Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => None, Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => None, Token::SingleQuote | Token::DoubleQuote => { self.data.quote = None; self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) }, _ => None, }, DoctypeSubstate::Comment => match t { Token::CommentEnd => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) }, _ => None, }, DoctypeSubstate::InsideName => match t { Token::Character(c @ 'A'..='Z') => { self.buf.push(c); None }, Token::Character(c) if is_whitespace_char(c) => { let buf = self.take_buf(); match buf.as_str() { "ENTITY" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityName)), "NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)), _ => Some(self.error(SyntaxError::UnknownMarkupDeclaration(buf.into()))), } }, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DoctypeSubstate::BeforeEntityName 
=> { self.data.name.clear(); match t { Token::Character(c) if is_whitespace_char(c) => None, Token::Character('%') => { // % is for PEDecl self.data.name.push('%'); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinitionStart)) }, Token::Character(c) if is_name_start_char(c) => { if self.data.name.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.data.name.push(c); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName)) }, _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), } }, DoctypeSubstate::EntityName => match t { Token::Character(c) if is_whitespace_char(c) => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue)) }, Token::Character(c) if is_name_char(c) => { if self.data.name.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.data.name.push(c); None }, _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::BeforeEntityValue => { self.buf.clear(); match t { Token::Character(c) if is_whitespace_char(c) => None, // SYSTEM/PUBLIC not supported Token::Character('S' | 'P') => { let name = self.data.take_name(); self.entities.entry(name).or_insert_with(String::new); // Dummy value, but at least the name is recognized self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) }, Token::SingleQuote | Token::DoubleQuote => { self.data.quote = Some(super::QuoteToken::from_token(&t)); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) }, _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), } }, DoctypeSubstate::EntityValue => match t { Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { self.buf.push('\''); None }, Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"'); None }, Token::SingleQuote | 
Token::DoubleQuote => { self.data.quote = None; let name = self.data.take_name(); let val = self.take_buf(); self.entities.entry(name).or_insert(val); // First wins self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) // FIXME }, Token::ReferenceStart | Token::Character('&') => { self.data.ref_data.clear(); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReferenceStart)) }, Token::Character('%') => { self.data.ref_data.clear(); self.data.ref_data.push('%'); // include literal % in the name to distinguish from regular entities self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInValue)) }, Token::Character(c) if !self.is_valid_xml_char(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) }, Token::Character(c) => { self.buf.push(c); None }, _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::PEReferenceDefinitionStart => match t { Token::Character(c) if is_whitespace_char(c) => { None }, Token::Character(c) if is_name_start_char(c) => { debug_assert_eq!(self.data.name, "%"); self.data.name.push(c); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinition)) }, _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::PEReferenceDefinition => match t { Token::Character(c) if is_name_char(c) => { if self.data.name.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.data.name.push(c); None }, Token::Character(c) if is_whitespace_char(c) => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue)) }, _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::PEReferenceInDtd => match t { Token::Character(c) if is_name_char(c) => { self.data.ref_data.push(c); None }, Token::ReferenceEnd | Token::Character(';') => { let name = self.data.take_ref_data(); match self.entities.get(&name) { 
Some(ent) => { if let Err(e) = self.lexer.reparse(ent) { return Some(Err(e)); } self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) }, None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))), } }, _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::PEReferenceInValue => match t { Token::Character(c) if is_name_char(c) => { self.data.ref_data.push(c); None }, Token::ReferenceEnd | Token::Character(';') => { let name = self.data.take_ref_data(); match self.entities.get(&name) { Some(ent) => { self.buf.push_str(ent); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) }, None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))), } }, _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::NumericReferenceStart => match t { Token::Character('#') => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReference)) }, Token::Character(c) if !self.is_valid_xml_char(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) }, Token::Character(c) => { self.buf.push('&'); self.buf.push(c); // named entities are not expanded inside doctype self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) }, _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::NumericReference => match t { Token::ReferenceEnd | Token::Character(';') => { let r = self.data.take_ref_data(); // https://www.w3.org/TR/xml/#sec-entexpand match self.numeric_reference_from_str(&r) { Ok(c) => { self.buf.push(c); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) } Err(e) => Some(self.error(e)), } }, Token::Character(c) if !self.is_valid_xml_char(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) }, Token::Character(c) => { self.data.ref_data.push(c); None }, _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::SkipDeclaration 
=> match t { Token::TagEnd => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) }, _ => None, }, } } } xml-rs-0.8.19/src/reader/parser/inside_opening_tag.rs000064400000000000000000000147511046102023000206750ustar 00000000000000use crate::reader::error::SyntaxError; use crate::common::is_name_start_char; use crate::namespace; use crate::{attribute::OwnedAttribute, common::is_whitespace_char}; use crate::reader::lexer::Token; use super::{OpeningTagSubstate, PullParser, QualifiedNameTarget, Result, State}; impl PullParser { pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option { let max_attrs = self.config.max_attributes; match s { OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| { match name.prefix_ref() { Some(prefix) if prefix == namespace::NS_XML_PREFIX || prefix == namespace::NS_XMLNS_PREFIX => Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))), _ => { this.data.element_name = Some(name.clone()); match token { Token::TagEnd => this.emit_start_element(false), Token::EmptyTagEnd => this.emit_start_element(true), Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), _ => unreachable!() } } } }), OpeningTagSubstate::InsideTag => match t { Token::TagEnd => self.emit_start_element(false), Token::EmptyTagEnd => self.emit_start_element(true), Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace Token::Character(c) if is_name_start_char(c) => { if self.buf.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.buf.push(c); self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName)) } _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))), }, OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, 
QualifiedNameTarget::AttributeNameTarget, |this, token, name| { // check that no attribute with such name is already present // if there is one, XML is not well-formed if this.data.attributes.contains(&name) { return Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into()))) } this.data.attr_name = Some(name); match token { Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)), _ => Some(this.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) // likely unreachable } }), OpeningTagSubstate::AfterAttributeName => match t { Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), Token::Character(c) if is_whitespace_char(c) => None, _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) }, OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| { let name = this.data.take_attr_name()?; // will always succeed here match name.prefix_ref() { // declaring a new prefix; it is sufficient to check prefix only // because "xmlns" prefix is reserved Some(namespace::NS_XMLNS_PREFIX) => { let ln = &*name.local_name; if ln == namespace::NS_XMLNS_PREFIX { Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix)) } else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI { Some(this.error(SyntaxError::CannotRedefineXmlPrefix)) } else if value.is_empty() { Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into()))) } else { this.nst.put(name.local_name.clone(), value); this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) } } // declaring default namespace None if &*name.local_name == namespace::NS_XMLNS_PREFIX => match &*value { namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | 
namespace::NS_XMLNS_URI => Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))), _ => { this.nst.put(namespace::NS_NO_PREFIX, value.clone()); this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) } }, // regular attribute _ => { if this.data.attributes.len() >= max_attrs { return Some(this.error(SyntaxError::ExceededConfiguredLimit)); } this.data.attributes.push(OwnedAttribute { name, value }); this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) } } }), OpeningTagSubstate::AfterAttributeValue => match t { Token::Character(c) if is_whitespace_char(c) => { self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) }, Token::TagEnd => self.emit_start_element(false), Token::EmptyTagEnd => self.emit_start_element(true), _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))), }, } } } xml-rs-0.8.19/src/reader/parser/inside_processing_instruction.rs000064400000000000000000000123721046102023000232150ustar 00000000000000use crate::reader::error::SyntaxError; use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; use crate::reader::events::XmlEvent; use crate::reader::lexer::Token; use super::{DeclarationSubstate, ProcessingInstructionSubstate, PullParser, Result, State, Encountered}; impl PullParser { pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option { match s { ProcessingInstructionSubstate::PIInsideName => match t { Token::Character(c) if self.buf.is_empty() && is_name_start_char(c) || self.buf_has_data() && is_name_char(c) => { if self.buf.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.buf.push(c); None }, Token::ProcessingInstructionEnd => { // self.buf contains PI name let name = self.take_buf(); // Don't need to check for declaration because it has mandatory attributes // but there is none match &*name { // Name 
is empty, it is an error "" => Some(self.error(SyntaxError::ProcessingInstructionWithoutName)), // Found Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))), // All is ok, emitting event _ => { debug_assert!(self.next_event.is_none(), "{:?}", self.next_event); // can't have a PI before ` { // self.buf contains PI name let name = self.take_buf(); match &*name { // We have not ever encountered an element and have not parsed XML declaration "xml" if self.encountered == Encountered::None => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)), // Found Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))), // All is ok, starting parsing PI data _ => { self.data.name = name; // can't have a PI before ` { let buf = self.take_buf(); Some(self.error(SyntaxError::UnexpectedProcessingInstruction(buf.into(), t))) } }, ProcessingInstructionSubstate::PIInsideData => match t { Token::ProcessingInstructionEnd => { let name = self.data.take_name(); let data = self.take_buf(); self.into_state_emit( State::OutsideTag, Ok(XmlEvent::ProcessingInstruction { name, data: Some(data), }), ) }, Token::Character(c) if !self.is_valid_xml_char(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) }, // Any other token should be treated as plain characters _ => { if self.buf.len() > self.config.max_data_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } t.push_to_string(&mut self.buf); None } }, } } } xml-rs-0.8.19/src/reader/parser/inside_reference.rs000064400000000000000000000066631046102023000203440ustar 00000000000000use crate::reader::error::SyntaxError; use std::char; use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; use crate::reader::lexer::Token; use super::{PullParser, Result, State}; impl PullParser { pub fn inside_reference(&mut self, t: Token) -> Option { match t { Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) 
|| self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => { self.data.ref_data.push(c); None } Token::ReferenceEnd => { let name = self.data.take_ref_data(); if name.is_empty() { return Some(self.error(SyntaxError::EmptyEntity)); } let c = match &*name { "lt" => Some('<'), "gt" => Some('>'), "amp" => Some('&'), "apos" => Some('\''), "quot" => Some('"'), _ if name.starts_with('#') => match self.numeric_reference_from_str(&name[1..]) { Ok(c) => Some(c), Err(e) => return Some(self.error(e)) }, _ => None, }; if let Some(c) = c { self.buf.push(c); } else if let Some(v) = self.config.c.extra_entities.get(&name) { self.buf.push_str(v); } else if let Some(v) = self.entities.get(&name) { if self.state_after_reference == State::OutsideTag { // an entity can expand to *elements*, so outside of a tag it needs a full reparse if let Err(e) = self.lexer.reparse(v) { return Some(Err(e)); } } else { // however, inside attributes it's not allowed to affect attribute quoting, // so it can't be fed to the lexer self.buf.push_str(v); } } else { return Some(self.error(SyntaxError::UnexpectedEntity(name.into()))); } let prev_st = self.state_after_reference; if prev_st == State::OutsideTag && !is_whitespace_char(self.buf.chars().last().unwrap_or('\0')) { self.inside_whitespace = false; } self.into_state_continue(prev_st) } _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), } } pub(crate) fn numeric_reference_from_str(&self, num_str: &str) -> std::result::Result { let val = if let Some(hex) = num_str.strip_prefix('x') { u32::from_str_radix(hex, 16).map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))? } else { u32::from_str_radix(num_str, 10).map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))? 
}; match char::from_u32(val) { Some(c) if self.is_valid_xml_char(c) => Ok(c), Some(_) if self.config.c.replace_unknown_entity_references => Ok('\u{fffd}'), None if self.config.c.replace_unknown_entity_references => { Ok('\u{fffd}') }, _ => Err(SyntaxError::InvalidCharacterEntity(val)), } } } xml-rs-0.8.19/src/reader/parser/outside_tag.rs000064400000000000000000000217221046102023000173530ustar 00000000000000use crate::reader::error::SyntaxError; use crate::common::is_whitespace_char; use crate::reader::events::XmlEvent; use crate::reader::lexer::Token; use super::{ ClosingTagSubstate, DoctypeSubstate, Encountered, OpeningTagSubstate, ProcessingInstructionSubstate, PullParser, Result, State, }; impl PullParser { pub fn outside_tag(&mut self, t: Token) -> Option { match t { Token::Character(c) => { if is_whitespace_char(c) { // skip whitespace outside of the root element if (self.config.c.trim_whitespace && self.buf.is_empty()) || (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) { return None; } } else { self.inside_whitespace = false; if self.depth() == 0 { return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); } } if !self.is_valid_xml_char_not_restricted(c) { return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); } if self.buf.is_empty() { self.push_pos(); } else if self.buf.len() > self.config.max_data_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.buf.push(c); None }, Token::CommentEnd | Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::ProcessingInstructionEnd | Token::EmptyTagEnd => { if self.depth() == 0 { return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); } self.inside_whitespace = false; if let Some(s) = t.as_static_str() { if self.buf.is_empty() { self.push_pos(); } else if self.buf.len() > self.config.max_data_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.buf.push_str(s); } None }, 
Token::ReferenceStart if self.depth() > 0 => { self.state_after_reference = State::OutsideTag; self.into_state_continue(State::InsideReference) }, Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity self.inside_whitespace = false; if self.buf.len() > self.config.max_data_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } Token::ReferenceEnd.push_to_string(&mut self.buf); None }, Token::CommentStart if self.config.c.coalesce_characters && self.config.c.ignore_comments => { let next_event = self.set_encountered(Encountered::Comment); // We need to switch the lexer into a comment mode inside comments self.into_state(State::InsideComment, next_event) } Token::CDataStart if self.depth() > 0 && self.config.c.coalesce_characters && self.config.c.cdata_to_characters => { if self.buf.is_empty() { self.push_pos(); } self.into_state_continue(State::InsideCData) }, _ => { // Encountered some markup event, flush the buffer as characters // or a whitespace let mut next_event = if self.buf_has_data() { let buf = self.take_buf(); if self.inside_whitespace && self.config.c.trim_whitespace { None } else if self.inside_whitespace && !self.config.c.whitespace_to_characters { debug_assert!(buf.chars().all(|ch| ch.is_whitespace()), "ws={buf:?}"); Some(Ok(XmlEvent::Whitespace(buf))) } else if self.config.c.trim_whitespace { Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into()))) } else { Some(Ok(XmlEvent::Characters(buf))) } } else { None }; self.inside_whitespace = true; // Reset inside_whitespace flag // pos is popped whenever an event is emitted, so pushes must happen only if there will be an event to balance it // and ignored comments don't pop if t != Token::CommentStart || !self.config.c.ignore_comments { self.push_pos(); } match t { Token::OpeningTagStart if self.depth() > 0 || self.encountered < Encountered::Element || self.config.allow_multiple_root_elements => { if let Some(e) = 
self.set_encountered(Encountered::Element) { next_event = Some(e); } self.nst.push_empty(); self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) }, Token::ClosingTagStart if self.depth() > 0 => self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event), Token::CommentStart => { if let Some(e) = self.set_encountered(Encountered::Comment) { next_event = Some(e); } // We need to switch the lexer into a comment mode inside comments self.into_state(State::InsideComment, next_event) }, Token::DoctypeStart if self.encountered < Encountered::Doctype => { if let Some(e) = self.set_encountered(Encountered::Doctype) { next_event = Some(e); } // We don't have a doctype event so skip this position // FIXME: update when we have a doctype event self.next_pos(); self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event) }, Token::ProcessingInstructionStart => self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event), Token::CDataStart if self.depth() > 0 => { self.into_state(State::InsideCData, next_event) }, _ => Some(self.error(SyntaxError::UnexpectedToken(t))) } } } } pub fn document_start(&mut self, t: Token) -> Option { debug_assert!(self.encountered < Encountered::Declaration); match t { Token::Character(c) => { let next_event = self.set_encountered(Encountered::AnyChars); if !is_whitespace_char(c) { return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); } self.inside_whitespace = true; // skip whitespace outside of the root element if (self.config.c.trim_whitespace && self.buf.is_empty()) || (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) { return self.into_state(State::OutsideTag, next_event); } self.push_pos(); self.buf.push(c); self.into_state(State::OutsideTag, next_event) }, Token::CommentStart => { let next_event = self.set_encountered(Encountered::Comment); self.into_state(State::InsideComment, next_event) } 
Token::OpeningTagStart => { let next_event = self.set_encountered(Encountered::Element); self.nst.push_empty(); self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) }, Token::DoctypeStart => { let next_event = self.set_encountered(Encountered::Doctype); // We don't have a doctype event so skip this position // FIXME: update when we have a doctype event self.next_pos(); self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event) }, Token::ProcessingInstructionStart => { self.push_pos(); self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName)) }, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), } } } xml-rs-0.8.19/src/reader/parser.rs000064400000000000000000000641401046102023000150450ustar 00000000000000//! Contains an implementation of pull-based XML parser. use crate::common::{is_xml10_char, is_xml11_char, is_xml11_char_not_restricted, is_name_char, is_name_start_char, is_whitespace_char}; use crate::common::{Position, TextPosition, XmlVersion}; use crate::name::OwnedName; use crate::namespace::NamespaceStack; use crate::reader::config::ParserConfig2; use crate::reader::error::SyntaxError; use crate::reader::events::XmlEvent; use crate::reader::indexset::AttributesSet; use crate::reader::lexer::{Lexer, Token}; use super::{Error, ErrorKind}; use std::collections::HashMap; use std::io::Read; macro_rules! 
gen_takes( ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => ( $( impl MarkupData { #[inline] #[allow(clippy::mem_replace_option_with_none)] fn $method(&mut self) -> $t { std::mem::replace(&mut self.$field, $def) } } )+ ) ); gen_takes!( name -> take_name, String, String::new(); ref_data -> take_ref_data, String, String::new(); encoding -> take_encoding, Option, None; element_name -> take_element_name, Option, None; attr_name -> take_attr_name, Option, None; attributes -> take_attributes, AttributesSet, AttributesSet::new() ); mod inside_cdata; mod inside_closing_tag_name; mod inside_comment; mod inside_declaration; mod inside_doctype; mod inside_opening_tag; mod inside_processing_instruction; mod inside_reference; mod outside_tag; static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; static DEFAULT_STANDALONE: Option = None; type ElementStack = Vec; pub type Result = super::Result; /// Pull-based XML parser. pub(crate) struct PullParser { config: ParserConfig2, lexer: Lexer, st: State, state_after_reference: State, buf: String, /// From DTD internal subset entities: HashMap, nst: NamespaceStack, data: MarkupData, final_result: Option, next_event: Option, est: ElementStack, pos: Vec, encountered: Encountered, inside_whitespace: bool, read_prefix_separator: bool, pop_namespace: bool, } // Keeps track when XML declaration can happen #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] enum Encountered { None = 0, AnyChars, // whitespace before ) -> PullParser { let config = config.into(); Self::new_with_config2(config) } #[inline] fn new_with_config2(config: ParserConfig2) -> PullParser { let mut lexer = Lexer::new(&config); if let Some(enc) = config.override_encoding { lexer.set_encoding(enc); } let mut pos = Vec::with_capacity(16); pos.push(TextPosition::new()); PullParser { config, lexer, st: State::DocumentStart, state_after_reference: State::OutsideTag, buf: String::new(), entities: HashMap::new(), nst: NamespaceStack::default(), data: MarkupData 
{ name: String::new(), version: None, encoding: None, standalone: None, ref_data: String::new(), element_name: None, quote: None, attr_name: None, attributes: AttributesSet::new(), }, final_result: None, next_event: None, est: Vec::new(), pos, encountered: Encountered::None, inside_whitespace: true, read_prefix_separator: false, pop_namespace: false, } } /// Checks if this parser ignores the end of stream errors. pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.c.ignore_end_of_stream } #[inline(never)] fn set_encountered(&mut self, new_encounter: Encountered) -> Option { if new_encounter <= self.encountered { return None; } let prev_enc = self.encountered; self.encountered = new_encounter; // If declaration was not parsed and we have encountered an element, // emit this declaration as the next event. if prev_enc == Encountered::None { self.push_pos(); Some(Ok(XmlEvent::StartDocument { version: DEFAULT_VERSION, encoding: self.lexer.encoding().to_string(), standalone: DEFAULT_STANDALONE, })) } else { None } } } impl Position for PullParser { /// Returns the position of the last event produced by the parser #[inline] fn position(&self) -> TextPosition { self.pos[0] } } #[derive(Copy, Clone, PartialEq)] pub enum State { OutsideTag, InsideOpeningTag(OpeningTagSubstate), InsideClosingTag(ClosingTagSubstate), InsideProcessingInstruction(ProcessingInstructionSubstate), InsideComment, InsideCData, InsideDeclaration(DeclarationSubstate), InsideDoctype(DoctypeSubstate), InsideReference, DocumentStart, } #[derive(Copy, Clone, PartialEq)] pub enum DoctypeSubstate { Outside, String, InsideName, BeforeEntityName, EntityName, BeforeEntityValue, EntityValue, NumericReferenceStart, NumericReference, /// expansion PEReferenceInValue, PEReferenceInDtd, /// name definition PEReferenceDefinitionStart, PEReferenceDefinition, SkipDeclaration, Comment, } #[derive(Copy, Clone, PartialEq)] pub enum OpeningTagSubstate { InsideName, InsideTag, InsideAttributeName, 
AfterAttributeName, InsideAttributeValue, AfterAttributeValue, } #[derive(Copy, Clone, PartialEq)] pub enum ClosingTagSubstate { CTInsideName, CTAfterName, } #[derive(Copy, Clone, PartialEq)] pub enum ProcessingInstructionSubstate { PIInsideName, PIInsideData, } #[derive(Copy, Clone, PartialEq)] pub enum DeclarationSubstate { BeforeVersion, InsideVersion, AfterVersion, InsideVersionValue, AfterVersionValue, BeforeEncoding, InsideEncoding, AfterEncoding, InsideEncodingValue, AfterEncodingValue, BeforeStandaloneDecl, InsideStandaloneDecl, AfterStandaloneDecl, InsideStandaloneDeclValue, AfterStandaloneDeclValue, } #[derive(PartialEq)] enum QualifiedNameTarget { AttributeNameTarget, OpeningTagNameTarget, ClosingTagNameTarget, } #[derive(Copy, Clone, PartialEq, Eq)] enum QuoteToken { SingleQuoteToken, DoubleQuoteToken, } impl QuoteToken { fn from_token(t: &Token) -> QuoteToken { match *t { Token::SingleQuote => QuoteToken::SingleQuoteToken, Token::DoubleQuote => QuoteToken::DoubleQuoteToken, _ => panic!("Unexpected token: {t}"), } } fn as_token(self) -> Token { match self { QuoteToken::SingleQuoteToken => Token::SingleQuote, QuoteToken::DoubleQuoteToken => Token::DoubleQuote, } } } struct MarkupData { name: String, // used for processing instruction name ref_data: String, // used for reference content version: Option, // used for XML declaration version encoding: Option, // used for XML declaration encoding standalone: Option, // used for XML declaration standalone parameter element_name: Option, // used for element name quote: Option, // used to hold opening quote for attribute value attr_name: Option, // used to hold attribute name attributes: AttributesSet, // used to hold all accumulated attributes } impl PullParser { /// Returns next event read from the given buffer. /// /// This method should be always called with the same buffer. If you call it /// providing different buffers each time, the result will be undefined. 
pub fn next(&mut self, r: &mut R) -> Result { if let Some(ref ev) = self.final_result { return ev.clone(); } if let Some(ev) = self.next_event.take() { return ev; } if self.pop_namespace { self.pop_namespace = false; self.nst.pop(); } loop { debug_assert!(self.next_event.is_none()); debug_assert!(!self.pop_namespace); // While lexer gives us Ok(maybe_token) -- we loop. // Upon having a complete XML-event -- we return from the whole function. match self.lexer.next_token(r) { Ok(Some(token)) => { match self.dispatch_token(token) { None => {} // continue Some(Ok(xml_event)) => { self.next_pos(); return Ok(xml_event) }, Some(Err(xml_error)) => { self.next_pos(); return self.set_final_result(Err(xml_error)) }, } }, Ok(None) => break, Err(lexer_error) => { return self.set_final_result(Err(lexer_error)) }, } } self.handle_eof() } /// Handle end of stream fn handle_eof(&mut self) -> std::result::Result { // Forward pos to the lexer head self.next_pos(); let ev = if self.depth() == 0 { if self.encountered == Encountered::Element && self.st == State::OutsideTag { // all is ok Ok(XmlEvent::EndDocument) } else if self.encountered < Encountered::Element { self.error(SyntaxError::NoRootElement) } else { // self.st != State::OutsideTag self.error(SyntaxError::UnexpectedEof) // TODO: add expected hint? } } else if self.config.c.ignore_end_of_stream { self.final_result = None; self.lexer.reset_eof_handled(); return self.error(SyntaxError::UnbalancedRootElement); } else { self.error(SyntaxError::UnbalancedRootElement) }; self.set_final_result(ev) } // This function is to be called when a terminal event is reached. // The function sets up the `self.final_result` into `Some(result)` and return `result`. 
#[inline] fn set_final_result(&mut self, result: Result) -> Result { self.final_result = Some(result.clone()); result } #[cold] fn error(&self, e: SyntaxError) -> Result { Err(Error { pos: self.lexer.position(), kind: ErrorKind::Syntax(e.to_cow()), }) } #[inline] fn next_pos(&mut self) { // unfortunately calls to next_pos will never be perfectly balanced with push_pos, // at very least because parse errors and EOF can happen unexpectedly without a prior push. if !self.pos.is_empty() { if self.pos.len() > 1 { self.pos.remove(0); } else { self.pos[0] = self.lexer.position(); } } } #[inline] #[track_caller] fn push_pos(&mut self) { debug_assert!(self.pos.len() != self.pos.capacity(), "You've found a bug in xml-rs, caused by calls to push_pos() in states that don't end up emitting events. This case is ignored in release mode, and merely causes document positions to be out of sync. Please file a bug and include the XML document that triggers this assert."); // it has capacity preallocated for more than it ever needs, so this reduces code size if self.pos.len() != self.pos.capacity() { self.pos.push(self.lexer.position()); } else if self.pos.len() > 1 { self.pos.remove(0); // this mitigates the excessive push_pos() call } } #[inline(never)] fn dispatch_token(&mut self, t: Token) -> Option { match self.st { State::OutsideTag => self.outside_tag(t), State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), State::InsideReference => self.inside_reference(t), State::InsideComment => self.inside_comment(t), State::InsideCData => self.inside_cdata(t), State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), State::InsideDoctype(s) => self.inside_doctype(t, s), State::InsideDeclaration(s) => self.inside_declaration(t, s), State::DocumentStart => self.document_start(t), } } #[inline] fn depth(&self) -> usize { self.est.len() } #[inline] fn buf_has_data(&self) -> bool { 
!self.buf.is_empty() } #[inline] fn take_buf(&mut self) -> String { std::mem::take(&mut self.buf) } #[inline] fn into_state(&mut self, st: State, ev: Option) -> Option { self.st = st; ev } #[inline] fn into_state_continue(&mut self, st: State) -> Option { self.into_state(st, None) } #[inline] fn into_state_emit(&mut self, st: State, ev: Result) -> Option { self.into_state(st, Some(ev)) } /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed, /// an error is returned. /// /// # Parameters /// * `t` --- next token; /// * `on_name` --- a callback which is executed when whitespace is encountered. fn read_qualified_name(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option where F: Fn(&mut PullParser, Token, OwnedName) -> Option { // We can get here for the first time only when self.data.name contains zero or one character, // but first character cannot be a colon anyway if self.buf.len() <= 1 { self.read_prefix_separator = false; } let invoke_callback = move |this: &mut PullParser, t| { let name = this.take_buf(); match name.parse() { Ok(name) => on_name(this, t, name), Err(_) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))), } }; match t { // There can be only one colon, and not as the first character Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => { self.buf.push(':'); self.read_prefix_separator = true; None } Token::Character(c) if c != ':' && (self.buf.is_empty() && is_name_start_char(c) || self.buf_has_data() && is_name_char(c)) => { if self.buf.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.buf.push(c); None }, Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t), Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t), Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget || target == 
QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t), Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t), _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))), } } /// Dispatches tokens in order to process attribute value. /// /// # Parameters /// * `t` --- next token; /// * `on_value` --- a callback which is called when terminating quote is encountered. fn read_attribute_value(&mut self, t: Token, on_value: F) -> Option where F: Fn(&mut PullParser, String) -> Option { match t { Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace Token::DoubleQuote | Token::SingleQuote => match self.data.quote { None => { // Entered attribute value self.data.quote = Some(QuoteToken::from_token(&t)); None } Some(q) if q.as_token() == t => { self.data.quote = None; let value = self.take_buf(); on_value(self, value) } _ => { if let Token::Character(c) = t { if !self.is_valid_xml_char_not_restricted(c) { return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); } } if self.buf.len() > self.config.max_attribute_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } t.push_to_string(&mut self.buf); None } }, Token::ReferenceStart if self.data.quote.is_some() => { self.state_after_reference = self.st; self.into_state_continue(State::InsideReference) }, Token::OpeningTagStart => Some(self.error(SyntaxError::UnexpectedOpeningTag)), Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) }, // Every character except " and ' and < is okay _ if self.data.quote.is_some() => { if self.buf.len() > self.config.max_attribute_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } t.push_to_string(&mut self.buf); None } _ => Some(self.error(SyntaxError::UnexpectedToken(t))), } } fn emit_start_element(&mut self, emit_end_element: bool) -> Option { let mut name = 
self.data.take_element_name()?; let mut attributes = self.data.take_attributes().into_vec(); // check whether the name prefix is bound and fix its namespace match self.nst.get(name.borrow().prefix_repr()) { Some("") => name.namespace = None, // default namespace Some(ns) => name.namespace = Some(ns.into()), None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))) } // check and fix accumulated attributes prefixes for attr in &mut attributes { if let Some(ref pfx) = attr.name.prefix { let new_ns = match self.nst.get(pfx) { Some("") => None, // default namespace Some(ns) => Some(ns.into()), None => return Some(self.error(SyntaxError::UnboundAttribute(attr.name.to_string().into()))) }; attr.name.namespace = new_ns; } } if emit_end_element { self.pop_namespace = true; self.next_event = Some(Ok(XmlEvent::EndElement { name: name.clone() })); } else { self.est.push(name.clone()); } let namespace = self.nst.squash(); self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { name, attributes, namespace })) } fn emit_end_element(&mut self) -> Option { let mut name = self.data.take_element_name()?; // check whether the name prefix is bound and fix its namespace match self.nst.get(name.borrow().prefix_repr()) { Some("") => name.namespace = None, // default namespace Some(ns) => name.namespace = Some(ns.into()), None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))) } let op_name = self.est.pop()?; if name == op_name { self.pop_namespace = true; self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name })) } else { Some(self.error(SyntaxError::UnexpectedClosingTag(format!("{name} != {op_name}").into()))) } } #[inline] fn is_valid_xml_char(&self, c: char) -> bool { if Some(XmlVersion::Version11) == self.data.version { is_xml11_char(c) } else { is_xml10_char(c) } } #[inline] fn is_valid_xml_char_not_restricted(&self, c: char) -> bool { if Some(XmlVersion::Version11) == self.data.version 
{ is_xml11_char_not_restricted(c) } else { is_xml10_char(c) } } } #[cfg(test)] mod tests { use std::io::BufReader; use crate::attribute::OwnedAttribute; use crate::common::TextPosition; use crate::name::OwnedName; use crate::reader::events::XmlEvent; use crate::reader::parser::PullParser; use crate::reader::ParserConfig; fn new_parser() -> PullParser { PullParser::new(ParserConfig::new()) } macro_rules! expect_event( ($r:expr, $p:expr, $t:pat) => ( match $p.next(&mut $r) { $t => {} e => panic!("Unexpected event: {e:?}\nExpected: {}", stringify!($t)) } ); ($r:expr, $p:expr, $t:pat => $c:expr ) => ( match $p.next(&mut $r) { $t if $c => {} e => panic!("Unexpected event: {e:?}\nExpected: {} if {}", stringify!($t), stringify!($c)) } ) ); macro_rules! test_data( ($d:expr) => ({ static DATA: &'static str = $d; let r = BufReader::new(DATA.as_bytes()); let p = new_parser(); (r, p) }) ); #[test] fn issue_3_semicolon_in_attribute_value() { let (mut r, mut p) = test_data!(r#" "#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) => *name == OwnedName::local("a") && attributes.len() == 1 && attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") && namespace.is_essentially_empty() ); expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a")); expect_event!(r, p, Ok(XmlEvent::EndDocument)); } #[test] fn issue_140_entity_reference_inside_tag() { let (mut r, mut p) = test_data!(r#" "#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla")); expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}"); expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. 
}) => *name == OwnedName::local("bla")); expect_event!(r, p, Ok(XmlEvent::EndDocument)); } #[test] fn issue_220_comment() { let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); expect_event!(r, p, Ok(XmlEvent::EndElement { .. })); expect_event!(r, p, Ok(XmlEvent::EndDocument)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); expect_event!(r, p, Err(_)); // ---> is forbidden in comments let (mut r, mut p) = test_data!(r#""#); p.config.c.ignore_comments = false; expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); expect_event!(r, p, Ok(XmlEvent::Comment(s)) => s == " "#); expect_event!(r, p, Err(_)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Err(_)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Err(_)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Err(_)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Err(_)); } #[test] fn opening_tag_in_attribute_value() { use crate::reader::error::{SyntaxError, Error, ErrorKind}; let (mut r, mut p) = test_data!(r#" "#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Err(ref e) => *e == Error { kind: ErrorKind::Syntax(SyntaxError::UnexpectedOpeningTag.to_cow()), pos: TextPosition { row: 1, column: 24 } } ); } #[test] fn reference_err() { let (mut r, mut p) = test_data!(r#" && "#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); expect_event!(r, p, Err(_)); } #[test] fn state_size() { assert_eq!(2, std::mem::size_of::()); assert_eq!(1, std::mem::size_of::()); } } xml-rs-0.8.19/src/reader.rs000064400000000000000000000107751046102023000135560ustar 00000000000000//! 
Contains high-level interface for a pull-based XML parser. //! //! The most important type in this module is `EventReader`, which provides an iterator //! view for events in XML document. use std::io::Read; use std::iter::FusedIterator; use std::result; use crate::common::{Position, TextPosition}; pub use self::config::ParserConfig; pub use self::config::ParserConfig2; pub use self::error::{Error, ErrorKind}; pub use self::events::XmlEvent; use self::parser::PullParser; mod config; mod events; mod lexer; mod parser; mod indexset; mod error; /// A result type yielded by `XmlReader`. pub type Result = result::Result; /// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing. pub struct EventReader { source: R, parser: PullParser, } impl EventReader { /// Creates a new reader, consuming the given stream. #[inline] pub fn new(source: R) -> EventReader { EventReader::new_with_config(source, ParserConfig2::new()) } /// Creates a new reader with the provded configuration, consuming the given stream. #[inline] pub fn new_with_config(source: R, config: impl Into) -> EventReader { EventReader { source, parser: PullParser::new(config) } } /// Pulls and returns next XML event from the stream. /// /// If returned event is `XmlEvent::Error` or `XmlEvent::EndDocument`, then /// further calls to this method will return this event again. #[inline] pub fn next(&mut self) -> Result { self.parser.next(&mut self.source) } /// Skips all XML events until the next end tag at the current level. /// /// Convenience function that is useful for the case where you have /// encountered a start tag that is of no interest and want to /// skip the entire XML subtree until the corresponding end tag. #[inline] pub fn skip(&mut self) -> Result<()> { let mut depth = 1; while depth > 0 { match self.next()? { XmlEvent::StartElement { .. } => depth += 1, XmlEvent::EndElement { .. 
} => depth -= 1, XmlEvent::EndDocument => unreachable!(), _ => {} } } Ok(()) } pub fn source(&self) -> &R { &self.source } pub fn source_mut(&mut self) -> &mut R { &mut self.source } /// Unwraps this `EventReader`, returning the underlying reader. /// /// Note that this operation is destructive; unwrapping the reader and wrapping it /// again with `EventReader::new()` will create a fresh reader which will attempt /// to parse an XML document from the beginning. pub fn into_inner(self) -> R { self.source } } impl Position for EventReader { /// Returns the position of the last event produced by the reader. #[inline] fn position(&self) -> TextPosition { self.parser.position() } } impl IntoIterator for EventReader { type Item = Result; type IntoIter = Events; fn into_iter(self) -> Events { Events { reader: self, finished: false } } } /// An iterator over XML events created from some type implementing `Read`. /// /// When the next event is `xml::event::Error` or `xml::event::EndDocument`, then /// it will be returned by the iterator once, and then it will stop producing events. pub struct Events { reader: EventReader, finished: bool, } impl Events { /// Unwraps the iterator, returning the internal `EventReader`. #[inline] pub fn into_inner(self) -> EventReader { self.reader } pub fn source(&self) -> &R { &self.reader.source } pub fn source_mut(&mut self) -> &mut R { &mut self.reader.source } } impl FusedIterator for Events { } impl Iterator for Events { type Item = Result; #[inline] fn next(&mut self) -> Option> { if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { None } else { let ev = self.reader.next(); if let Ok(XmlEvent::EndDocument) | Err(_) = ev { self.finished = true; } Some(ev) } } } impl<'r> EventReader<&'r [u8]> { /// A convenience method to create an `XmlReader` from a string slice. 
#[inline]
#[must_use]
pub fn from_str(source: &'r str) -> EventReader<&'r [u8]> {
    EventReader::new(source.as_bytes())
}
} // closes `impl<'r> EventReader<&'r [u8]>`
xml-rs-0.8.19/src/util.rs000064400000000000000000000271301046102023000132620ustar 00000000000000use std::fmt;
use std::io::{self, Read};
use std::str::{self, FromStr};

/// Error produced while decoding a single character from a byte stream.
#[derive(Debug)]
pub enum CharReadError {
    /// The stream ended in the middle of a character.
    UnexpectedEof,
    /// The bytes read are not valid UTF-8.
    Utf8(str::Utf8Error),
    /// The underlying reader failed, or the bytes are invalid for the current encoding.
    Io(io::Error),
}

impl From<str::Utf8Error> for CharReadError {
    #[cold]
    fn from(e: str::Utf8Error) -> CharReadError {
        CharReadError::Utf8(e)
    }
}

impl From<io::Error> for CharReadError {
    #[cold]
    fn from(e: io::Error) -> CharReadError {
        CharReadError::Io(e)
    }
}

impl fmt::Display for CharReadError {
    #[cold]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use self::CharReadError::{Io, UnexpectedEof, Utf8};
        match *self {
            UnexpectedEof => write!(f, "unexpected end of stream"),
            Utf8(ref e) => write!(f, "UTF-8 decoding error: {e}"),
            Io(ref e) => write!(f, "I/O error: {e}"),
        }
    }
}

/// Character encoding used for parsing
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
#[non_exhaustive]
pub enum Encoding {
    /// Explicitly UTF-8 only
    Utf8,
    /// UTF-8 fallback, but can be any 8-bit encoding
    Default,
    /// ISO-8859-1
    Latin1,
    /// US-ASCII
    Ascii,
    /// Big-Endian
    Utf16Be,
    /// Little-Endian
    Utf16Le,
    /// Unknown endianness yet, will be sniffed
    Utf16,
    /// Not determined yet, may be sniffed to be anything
    Unknown,
}

// Rustc inlines eq_ignore_ascii_case and creates kilobytes of code!
#[inline(never)] fn icmp(lower: &str, varcase: &str) -> bool { lower.bytes().zip(varcase.bytes()).all(|(l, v)| l == v.to_ascii_lowercase()) } impl FromStr for Encoding { type Err = &'static str; fn from_str(val: &str) -> Result { if ["utf-8", "utf8"].into_iter().any(move |label| icmp(label, val)) { Ok(Encoding::Utf8) } else if ["iso-8859-1", "latin1"].into_iter().any(move |label| icmp(label, val)) { Ok(Encoding::Latin1) } else if ["utf-16", "utf16"].into_iter().any(move |label| icmp(label, val)) { Ok(Encoding::Utf16) } else if ["ascii", "us-ascii"].into_iter().any(move |label| icmp(label, val)) { Ok(Encoding::Ascii) } else { Err("unknown encoding name") } } } impl fmt::Display for Encoding { #[cold] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(match self { Encoding::Utf8 => "UTF-8", Encoding::Default => "UTF-8", Encoding::Latin1 => "ISO-8859-1", Encoding::Ascii => "US-ASCII", Encoding::Utf16Be => "UTF-16", Encoding::Utf16Le => "UTF-16", Encoding::Utf16 => "UTF-16", Encoding::Unknown => "(unknown)", }) } } pub(crate) struct CharReader { pub encoding: Encoding, } impl CharReader { pub fn new() -> Self { Self { encoding: Encoding::Unknown, } } pub fn next_char_from(&mut self, source: &mut R) -> Result, CharReadError> { let mut bytes = source.bytes(); const MAX_CODEPOINT_LEN: usize = 4; let mut buf = [0u8; MAX_CODEPOINT_LEN]; let mut pos = 0; loop { let next = match bytes.next() { Some(Ok(b)) => b, Some(Err(e)) => return Err(e.into()), None if pos == 0 => return Ok(None), None => return Err(CharReadError::UnexpectedEof), }; match self.encoding { Encoding::Utf8 | Encoding::Default => { // fast path for ASCII subset if pos == 0 && next.is_ascii() { return Ok(Some(next.into())); } buf[pos] = next; pos += 1; match str::from_utf8(&buf[..pos]) { Ok(s) => return Ok(s.chars().next()), // always Some(..) 
Err(_) if pos < MAX_CODEPOINT_LEN => continue, Err(e) => return Err(e.into()), } }, Encoding::Latin1 => { return Ok(Some(next.into())); }, Encoding::Ascii => { if next.is_ascii() { return Ok(Some(next.into())); } else { return Err(CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, "char is not ASCII"))); } }, Encoding::Unknown | Encoding::Utf16 => { buf[pos] = next; pos += 1; // sniff BOM if pos <= 3 && buf[..pos] == [0xEF, 0xBB, 0xBF][..pos] { if pos == 3 && self.encoding != Encoding::Utf16 { pos = 0; self.encoding = Encoding::Utf8; } } else if pos <= 2 && buf[..pos] == [0xFE, 0xFF][..pos] { if pos == 2 { pos = 0; self.encoding = Encoding::Utf16Be; } } else if pos <= 2 && buf[..pos] == [0xFF, 0xFE][..pos] { if pos == 2 { pos = 0; self.encoding = Encoding::Utf16Le; } } else if pos == 1 && self.encoding == Encoding::Utf16 { // sniff ASCII char in UTF-16 self.encoding = if next == 0 { Encoding::Utf16Be } else { Encoding::Utf16Le }; } else { // UTF-8 is the default, but XML decl can change it to other 8-bit encoding self.encoding = Encoding::Default; if pos == 1 && next.is_ascii() { return Ok(Some(next.into())); } } }, Encoding::Utf16Be => { buf[pos] = next; pos += 1; if pos == 2 { if let Some(Ok(c)) = char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap())]).next() { return Ok(Some(c)); } } else if pos == 4 { // surrogate return char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap()), u16::from_be_bytes(buf[2..4].try_into().unwrap())]) .next().transpose() .map_err(|e| CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e))); } }, Encoding::Utf16Le => { buf[pos] = next; pos += 1; if pos == 2 { if let Some(Ok(c)) = char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap())]).next() { return Ok(Some(c)); } } else if pos == 4 { // surrogate return char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap()), u16::from_le_bytes(buf[2..4].try_into().unwrap())]) .next().transpose() .map_err(|e| 
CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e))); } }, } } } } #[cfg(test)] mod tests { use super::{CharReadError, CharReader, Encoding}; #[test] fn test_next_char_from() { use std::io; let mut bytes: &[u8] = "correct".as_bytes(); // correct ASCII assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('c')); let mut bytes: &[u8] = b"\xEF\xBB\xBF\xE2\x80\xA2!"; // BOM assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('•')); let mut bytes: &[u8] = b"\xEF\xBB\xBFx123"; // BOM assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('x')); let mut bytes: &[u8] = b"\xEF\xBB\xBF"; // Nothing after BOM assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); let mut bytes: &[u8] = b"\xEF\xBB"; // Nothing after BO assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof))); let mut bytes: &[u8] = b"\xEF\xBB\x42"; // Nothing after BO assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(_))); let mut bytes: &[u8] = b"\xFE\xFF\x00\x42"; // UTF-16 assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B')); let mut bytes: &[u8] = b"\xFF\xFE\x42\x00"; // UTF-16 assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B')); let mut bytes: &[u8] = b"\xFF\xFE"; // UTF-16 assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); let mut bytes: &[u8] = b"\xFF\xFE\x00"; // UTF-16 assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof))); let mut bytes: &[u8] = "правильно".as_bytes(); // correct BMP assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('п')); let mut bytes: &[u8] = "правильно".as_bytes(); assert_eq!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes).unwrap(), Some('킿')); let mut bytes: &[u8] = "правильно".as_bytes(); assert_eq!(CharReader { encoding: Encoding::Utf16Le }.next_char_from(&mut bytes).unwrap(), 
Some('뿐')); let mut bytes: &[u8] = b"\xD8\xD8\x80"; assert!(matches!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes), Err(_))); let mut bytes: &[u8] = b"\x00\x42"; assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B')); let mut bytes: &[u8] = b"\x42\x00"; assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B')); let mut bytes: &[u8] = b"\x00"; assert!(matches!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes), Err(_))); let mut bytes: &[u8] = "😊".as_bytes(); // correct non-BMP assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('😊')); let mut bytes: &[u8] = b""; // empty assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); let mut bytes: &[u8] = b"\xf0\x9f\x98"; // incomplete code point match CharReader::new().next_char_from(&mut bytes).unwrap_err() { super::CharReadError::UnexpectedEof => {}, e => panic!("Unexpected result: {e:?}") }; let mut bytes: &[u8] = b"\xff\x9f\x98\x32"; // invalid code point match CharReader::new().next_char_from(&mut bytes).unwrap_err() { super::CharReadError::Utf8(_) => {}, e => panic!("Unexpected result: {e:?}") }; // error during read struct ErrorReader; impl io::Read for ErrorReader { fn read(&mut self, _: &mut [u8]) -> io::Result { Err(io::Error::new(io::ErrorKind::Other, "test error")) } } let mut r = ErrorReader; match CharReader::new().next_char_from(&mut r).unwrap_err() { super::CharReadError::Io(ref e) if e.kind() == io::ErrorKind::Other && e.to_string().contains("test error") => {}, e => panic!("Unexpected result: {e:?}") } } } xml-rs-0.8.19/src/writer/config.rs000064400000000000000000000141751046102023000150730ustar 00000000000000//! Contains emitter configuration structure. use std::borrow::Cow; use std::io::Write; use crate::writer::EventWriter; /// Emitter configuration structure. 
/// /// This structure contains various options which control XML document emitter behavior. #[derive(Clone, PartialEq, Eq, Debug)] pub struct EmitterConfig { /// Line separator used to separate lines in formatted output. Default is `"\n"`. pub line_separator: Cow<'static, str>, /// A string which will be used for a single level of indentation. Default is `" "` /// (two spaces). pub indent_string: Cow<'static, str>, /// Whether or not the emitted document should be indented. Default is false. /// /// The emitter is capable to perform automatic indentation of the emitted XML document. /// It is done in stream-like fashion and does not require the knowledge of the whole /// document in advance. /// /// Sometimes, however, automatic indentation is undesirable, e.g. when you want to keep /// existing layout when processing an existing XML document. Also the indentiation algorithm /// is not thoroughly tested. Hence by default it is disabled. pub perform_indent: bool, /// Whether or not characters in output events will be escaped. Default is true. /// /// The emitter can automatically escape characters which can't appear in PCDATA sections /// or element attributes of an XML document, like `<` or `"` (in attributes). This may /// introduce some overhead because then every corresponding piece of character data /// should be scanned for invalid characters. /// /// If this option is disabled, the XML writer may produce non-well-formed documents, so /// use `false` value for this option with care. pub perform_escaping: bool, /// Whether or not to write XML document declaration at the beginning of a document. /// Default is true. /// /// This option controls whether the document declaration should be emitted automatically /// before a root element is written if it was not emitted explicitly by the user. pub write_document_declaration: bool, /// Whether or not to convert elements with empty content to empty elements. Default is true. 
/// /// This option allows turning elements like `` (an element with empty content) /// into `` (an empty element). pub normalize_empty_elements: bool, /// Whether or not to emit CDATA events as plain characters. Default is false. /// /// This option forces the emitter to convert CDATA events into regular character events, /// performing all the necessary escaping beforehand. This may be occasionally useful /// for feeding the document into incorrect parsers which do not support CDATA. pub cdata_to_characters: bool, /// Whether or not to keep element names to support `EndElement` events without explicit names. /// Default is true. /// /// This option makes the emitter to keep names of written elements in order to allow /// omitting names when writing closing element tags. This could incur some memory overhead. pub keep_element_names_stack: bool, /// Whether or not to automatically insert leading and trailing spaces in emitted comments, /// if necessary. Default is true. /// /// This is a convenience option in order for the user not to append spaces before and after /// comments text in order to get more pretty comments: `` instead of /// ``. pub autopad_comments: bool, /// Whether or not to automatically insert spaces before the trailing `/>` in self-closing /// elements. Default is true. /// /// This option is only meaningful if `normalize_empty_elements` is true. For example, the /// element `` would be unaffected. When `normalize_empty_elements` is true, then when /// this option is also true, the same element would appear ``. If this option is false, /// then the same element would appear ``. pub pad_self_closing: bool, } impl EmitterConfig { /// Creates an emitter configuration with default values. 
/// /// You can tweak default options with builder-like pattern: /// /// ```rust /// use xml::writer::EmitterConfig; /// /// let config = EmitterConfig::new() /// .line_separator("\r\n") /// .perform_indent(true) /// .normalize_empty_elements(false); /// ``` #[inline] #[must_use] pub fn new() -> EmitterConfig { EmitterConfig { line_separator: "\n".into(), indent_string: " ".into(), // two spaces perform_indent: false, perform_escaping: true, write_document_declaration: true, normalize_empty_elements: true, cdata_to_characters: false, keep_element_names_stack: true, autopad_comments: true, pad_self_closing: true, } } /// Creates an XML writer with this configuration. /// /// This is a convenience method for configuring and creating a writer at the same time: /// /// ```rust /// use xml::writer::EmitterConfig; /// /// let mut target: Vec = Vec::new(); /// /// let writer = EmitterConfig::new() /// .line_separator("\r\n") /// .perform_indent(true) /// .normalize_empty_elements(false) /// .create_writer(&mut target); /// ``` /// /// This method is exactly equivalent to calling `EventWriter::new_with_config()` with /// this configuration object. 
#[inline] pub fn create_writer(self, sink: W) -> EventWriter { EventWriter::new_with_config(sink, self) } } impl Default for EmitterConfig { #[inline] fn default() -> EmitterConfig { EmitterConfig::new() } } gen_setters!(EmitterConfig, line_separator: into Cow<'static, str>, indent_string: into Cow<'static, str>, perform_indent: val bool, write_document_declaration: val bool, normalize_empty_elements: val bool, cdata_to_characters: val bool, keep_element_names_stack: val bool, autopad_comments: val bool, pad_self_closing: val bool ); xml-rs-0.8.19/src/writer/emitter.rs000064400000000000000000000332631046102023000152760ustar 00000000000000use std::error::Error; use std::fmt; use std::io; use std::io::prelude::*; use std::result; use crate::attribute::Attribute; use crate::common; use crate::common::XmlVersion; use crate::escape::{AttributeEscapes, Escaped, PcDataEscapes}; use crate::name::{Name, OwnedName}; use crate::namespace::{NamespaceStack, NS_EMPTY_URI, NS_NO_PREFIX, NS_XMLNS_PREFIX, NS_XML_PREFIX}; use crate::writer::config::EmitterConfig; /// An error which may be returned by `XmlWriter` when writing XML events. #[derive(Debug)] pub enum EmitterError { /// An I/O error occured in the underlying `Write` instance. Io(io::Error), /// Document declaration has already been written to the output stream. DocumentStartAlreadyEmitted, /// The name of the last opening element is not available. LastElementNameNotAvailable, /// The name of the last opening element is not equal to the name of the provided /// closing element. EndElementNameIsNotEqualToLastStartElementName, /// End element name is not specified when it is needed, for example, when automatic /// closing is not enabled in configuration. 
EndElementNameIsNotSpecified,
}

/// Wraps an I/O failure of the underlying sink so that `?` can be used on write calls.
impl From<io::Error> for EmitterError {
    #[cold]
    fn from(err: io::Error) -> EmitterError {
        EmitterError::Io(err)
    }
}

/// Human-readable rendering; every message is prefixed with `emitter error: `.
impl fmt::Display for EmitterError {
    #[cold]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("emitter error: ")?;
        match self {
            EmitterError::Io(e) => write!(f, "I/O error: {e}"),
            EmitterError::DocumentStartAlreadyEmitted => f.write_str("document start event has already been emitted"),
            EmitterError::LastElementNameNotAvailable => f.write_str("last element name is not available"),
            EmitterError::EndElementNameIsNotEqualToLastStartElementName => f.write_str("end element name is not equal to last start element name"),
            EmitterError::EndElementNameIsNotSpecified => f.write_str("end element name is not specified and can't be inferred"),
        }
    }
}

impl Error for EmitterError {
}

/// A result type yielded by `XmlWriter`.
pub type Result<T> = result::Result<T, EmitterError>;

// TODO: split into a low-level fast writer without any checks and formatting logic and a
// high-level indenting validating writer
/// Stateful XML serializer: tracks namespaces, indentation and open element names while
/// the caller supplies the output sink on every call.
pub struct Emitter {
    /// Options controlling formatting, escaping and declaration handling.
    config: EmitterConfig,

    /// Namespace scopes of the currently open elements.
    nst: NamespaceStack,

    /// Number of currently open elements (nesting depth used for indentation).
    indent_level: usize,
    /// One entry per open element (plus one for the document level) recording what was
    /// last written at that level.
    indent_stack: Vec<IndentFlags>,

    /// Names of open elements; only filled when `keep_element_names_stack` is enabled.
    element_names: Vec<OwnedName>,

    /// Set once the XML declaration has been written (explicitly or automatically).
    start_document_emitted: bool,
    /// Set when a start tag was written whose closing `>` may still be pending.
    just_wrote_start_element: bool,
}

impl Emitter {
    /// Creates an emitter with the given configuration and an empty document state.
    pub fn new(config: EmitterConfig) -> Emitter {
        // The bottom entry describes the document level, so the stack is never empty.
        let mut indent_stack = Vec::with_capacity(16);
        indent_stack.push(IndentFlags::WroteNothing);

        Emitter {
            config,

            nst: NamespaceStack::empty(),

            indent_level: 0,
            indent_stack,

            element_names: Vec::new(),

            start_document_emitted: false,
            just_wrote_start_element: false,
        }
    }
}

/// What was most recently written at one nesting level; drives the indentation decisions.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum IndentFlags {
    WroteNothing,
    WroteMarkup,
    WroteText,
}

impl Emitter {
    /// Returns the current state of namespaces.
#[inline] pub fn namespace_stack_mut(&mut self) -> &mut NamespaceStack { &mut self.nst } #[inline] fn wrote_text(&self) -> bool { self.indent_stack.last().map_or(false, |&e| e == IndentFlags::WroteText) } #[inline] fn wrote_markup(&self) -> bool { self.indent_stack.last().map_or(false, |&e| e == IndentFlags::WroteMarkup) } #[inline] fn set_wrote_text(&mut self) { if let Some(e) = self.indent_stack.last_mut() { *e = IndentFlags::WroteText; } } #[inline] fn set_wrote_markup(&mut self) { if let Some(e) = self.indent_stack.last_mut() { *e = IndentFlags::WroteMarkup; } } fn write_newline(&mut self, target: &mut W, level: usize) -> Result<()> { target.write_all(self.config.line_separator.as_bytes())?; for _ in 0..level { target.write_all(self.config.indent_string.as_bytes())?; } Ok(()) } fn before_markup(&mut self, target: &mut W) -> Result<()> { if self.config.perform_indent && !self.wrote_text() && (self.indent_level > 0 || self.wrote_markup()) { let indent_level = self.indent_level; self.write_newline(target, indent_level)?; if self.indent_level > 0 && self.config.indent_string.len() > 0 { self.after_markup(); } } Ok(()) } fn after_markup(&mut self) { self.set_wrote_markup(); } fn before_start_element(&mut self, target: &mut W) -> Result<()> { self.before_markup(target)?; self.indent_stack.push(IndentFlags::WroteNothing); Ok(()) } fn after_start_element(&mut self) { self.after_markup(); self.indent_level += 1; } fn before_end_element(&mut self, target: &mut W) -> Result<()> { if self.config.perform_indent && self.indent_level > 0 && self.wrote_markup() && !self.wrote_text() { let indent_level = self.indent_level; self.write_newline(target, indent_level - 1) } else { Ok(()) } } fn after_end_element(&mut self) { if self.indent_level > 0 { self.indent_level -= 1; self.indent_stack.pop(); } self.set_wrote_markup(); } fn after_text(&mut self) { self.set_wrote_text(); } pub fn emit_start_document(&mut self, target: &mut W, version: XmlVersion, encoding: &str, standalone: 
Option) -> Result<()> { if self.start_document_emitted { return Err(EmitterError::DocumentStartAlreadyEmitted); } self.start_document_emitted = true; self.before_markup(target)?; let result = { let mut write = move || { write!(target, "")?; Ok(()) }; write() }; self.after_markup(); result } fn check_document_started(&mut self, target: &mut W) -> Result<()> { if !self.start_document_emitted && self.config.write_document_declaration { self.emit_start_document(target, common::XmlVersion::Version10, "utf-8", None) } else { Ok(()) } } fn fix_non_empty_element(&mut self, target: &mut W) -> Result<()> { if self.config.normalize_empty_elements && self.just_wrote_start_element { self.just_wrote_start_element = false; target.write_all(b">").map_err(From::from) } else { Ok(()) } } pub fn emit_processing_instruction(&mut self, target: &mut W, name: &str, data: Option<&str>) -> Result<()> { self.check_document_started(target)?; self.fix_non_empty_element(target)?; self.before_markup(target)?; let result = { let mut write = move || { write!(target, "")?; Ok(()) }; write() }; self.after_markup(); result } fn emit_start_element_initial(&mut self, target: &mut W, name: Name<'_>, attributes: &[Attribute<'_>]) -> Result<()> where W: Write { self.check_document_started(target)?; self.fix_non_empty_element(target)?; self.before_start_element(target)?; write!(target, "<{}", name.repr_display())?; self.emit_current_namespace_attributes(target)?; self.emit_attributes(target, attributes)?; self.after_start_element(); Ok(()) } pub fn emit_start_element(&mut self, target: &mut W, name: Name<'_>, attributes: &[Attribute<'_>]) -> Result<()> where W: Write { if self.config.keep_element_names_stack { self.element_names.push(name.to_owned()); } self.emit_start_element_initial(target, name, attributes)?; self.just_wrote_start_element = true; if !self.config.normalize_empty_elements { write!(target, ">")?; } Ok(()) } pub fn emit_current_namespace_attributes(&mut self, target: &mut W) -> Result<()> 
where W: Write { for (prefix, uri) in self.nst.peek() { match prefix { // internal namespaces are not emitted NS_XMLNS_PREFIX | NS_XML_PREFIX => Ok(()), //// there is already a namespace binding with this prefix in scope //prefix if self.nst.get(prefix) == Some(uri) => Ok(()), // emit xmlns only if it is overridden NS_NO_PREFIX => if uri != NS_EMPTY_URI { write!(target, " xmlns=\"{uri}\"") } else { Ok(()) }, // everything else prefix => write!(target, " xmlns:{prefix}=\"{uri}\"") }?; } Ok(()) } pub fn emit_attributes(&mut self, target: &mut W, attributes: &[Attribute<'_>]) -> Result<()> { for attr in attributes.iter() { write!(target, " {}=\"", attr.name.repr_display())?; if self.config.perform_escaping { write!(target, "{}", Escaped::::new(attr.value))?; } else { write!(target, "{}", attr.value)?; } write!(target, "\"")?; } Ok(()) } pub fn emit_end_element(&mut self, target: &mut W, name: Option>) -> Result<()> { let owned_name = if self.config.keep_element_names_stack { Some(self.element_names.pop().ok_or(EmitterError::LastElementNameNotAvailable)?) 
} else { None }; // Check that last started element name equals to the provided name, if there are both if let Some(ref last_name) = owned_name { if let Some(ref name) = name { if last_name.borrow() != *name { return Err(EmitterError::EndElementNameIsNotEqualToLastStartElementName); } } } if let Some(name) = owned_name.as_ref().map(|n| n.borrow()).or(name) { if self.config.normalize_empty_elements && self.just_wrote_start_element { self.just_wrote_start_element = false; let termination = if self.config.pad_self_closing { " />" } else { "/>" }; let result = target.write_all(termination.as_bytes()).map_err(From::from); self.after_end_element(); result } else { self.just_wrote_start_element = false; self.before_end_element(target)?; let result = write!(target, "", name.repr_display()).map_err(From::from); self.after_end_element(); result } } else { Err(EmitterError::EndElementNameIsNotSpecified) } } pub fn emit_cdata(&mut self, target: &mut W, content: &str) -> Result<()> { self.fix_non_empty_element(target)?; if self.config.cdata_to_characters { self.emit_characters(target, content) } else { // TODO: escape ']]>' characters in CDATA as two adjacent CDATA blocks target.write_all(b"")?; self.after_text(); Ok(()) } } pub fn emit_characters(&mut self, target: &mut W, content: &str) -> Result<()> { self.check_document_started(target)?; self.fix_non_empty_element(target)?; if self.config.perform_escaping { write!(target, "{}", Escaped::::new(content))?; } else { target.write_all(content.as_bytes())?; } self.after_text(); Ok(()) } pub fn emit_comment(&mut self, target: &mut W, content: &str) -> Result<()> { self.fix_non_empty_element(target)?; // TODO: add escaping dashes at the end of the comment let autopad_comments = self.config.autopad_comments; let write = move |target: &mut W| -> Result<()> { target.write_all(b"")?; Ok(()) }; self.before_markup(target)?; let result = write(target); self.after_markup(); result } } 
xml-rs-0.8.19/src/writer/events.rs000064400000000000000000000210011046102023000151140ustar 00000000000000//! Contains `XmlEvent` datatype, instances of which are consumed by the writer. use std::borrow::Cow; use crate::attribute::Attribute; use crate::common::XmlVersion; use crate::name::Name; use crate::namespace::{Namespace, NS_NO_PREFIX}; /// A part of an XML output stream. /// /// Objects of this enum are consumed by `EventWriter`. They correspond to different parts of /// an XML document. #[derive(Debug, Clone)] pub enum XmlEvent<'a> { /// Corresponds to XML document declaration. /// /// This event should always be written before any other event. If it is not written /// at all, a default XML declaration will be outputted if the corresponding option /// is set in the configuration. Otherwise an error will be returned. StartDocument { /// XML version. /// /// Defaults to `XmlVersion::Version10`. version: XmlVersion, /// XML document encoding. /// /// Defaults to `Some("UTF-8")`. encoding: Option<&'a str>, /// XML standalone declaration. /// /// Defaults to `None`. standalone: Option, }, /// Denotes an XML processing instruction. ProcessingInstruction { /// Processing instruction target. name: &'a str, /// Processing instruction content. data: Option<&'a str>, }, /// Denotes a beginning of an XML element. StartElement { /// Qualified name of the element. name: Name<'a>, /// A list of attributes associated with the element. /// /// Currently attributes are not checked for duplicates (TODO). Attribute values /// will be escaped, and all characters invalid for attribute values like `"` or `<` /// will be changed into character entities. attributes: Cow<'a, [Attribute<'a>]>, /// Contents of the namespace mapping at this point of the document. /// /// This mapping will be inspected for "new" entries, and if at this point of the document /// a particular pair of prefix and namespace URI is already defined, no namespace /// attributes will be emitted. 
namespace: Cow<'a, Namespace>,
    },

    /// Denotes an end of an XML element.
    EndElement {
        /// Optional qualified name of the element.
        ///
        /// If `None`, then it is assumed that the element name should be the last valid one.
        /// If `Some` and element names tracking is enabled, then the writer will check it for
        /// correctness.
        name: Option<Name<'a>>,
    },

    /// Denotes CDATA content.
    ///
    /// This event contains unparsed data, and no escaping will be performed when writing it
    /// to the output stream.
    CData(&'a str),

    /// Denotes a comment.
    ///
    /// The string will be checked for invalid sequences and error will be returned by the
    /// write operation.
    ///
    /// NOTE(review): the emitter still carries a TODO about escaping dashes in comments,
    /// so this validation may not be implemented yet - confirm before relying on it.
    Comment(&'a str),

    /// Denotes character data outside of tags.
    ///
    /// Contents of this event will be escaped if `perform_escaping` option is enabled,
    /// that is, every character invalid for PCDATA will appear as a character entity.
    Characters(&'a str),
}

impl<'a> XmlEvent<'a> {
    /// Returns a writer event for a processing instruction.
    #[inline]
    #[must_use]
    pub fn processing_instruction(name: &'a str, data: Option<&'a str>) -> XmlEvent<'a> {
        XmlEvent::ProcessingInstruction { name, data }
    }

    /// Returns a builder for a starting element.
    ///
    /// This builder can then be used to tweak attributes and namespace starting at
    /// this element.
    #[inline]
    pub fn start_element<S>(name: S) -> StartElementBuilder<'a> where S: Into<Name<'a>> {
        StartElementBuilder {
            name: name.into(),
            attributes: Vec::new(),
            namespace: Namespace::empty(),
        }
    }

    /// Returns a builder for a closing element.
    ///
    /// This method, unlike `start_element()`, does not accept a name because by default
    /// the writer is able to determine it automatically. However, when this functionality
    /// is disabled, it is possible to specify the name with `name()` method on the builder.
    #[inline]
    #[must_use]
    pub fn end_element() -> EndElementBuilder<'a> {
        EndElementBuilder { name: None }
    }

    /// Returns a CDATA event.
    ///
    /// Naturally, the provided string won't be escaped, except for closing CDATA token `]]>`
    /// (depending on the configuration).
    ///
    /// NOTE(review): the emitter still carries a TODO for splitting `]]>`, so the token may
    /// currently be written as-is - confirm before passing content that contains it.
    #[inline]
    #[must_use]
    pub fn cdata(data: &'a str) -> XmlEvent<'a> {
        XmlEvent::CData(data)
    }

    /// Returns a regular characters (PCDATA) event.
    ///
    /// All offending symbols, in particular, `&` and `<`, will be escaped by the writer.
    #[inline]
    #[must_use]
    pub fn characters(data: &'a str) -> XmlEvent<'a> {
        XmlEvent::Characters(data)
    }

    /// Returns a comment event.
    #[inline]
    #[must_use]
    pub fn comment(data: &'a str) -> XmlEvent<'a> {
        XmlEvent::Comment(data)
    }
}

/// A bare string slice converts into a `Characters` event.
impl<'a> From<&'a str> for XmlEvent<'a> {
    #[inline]
    fn from(s: &'a str) -> XmlEvent<'a> {
        XmlEvent::Characters(s)
    }
}

/// A builder for a closing element event.
pub struct EndElementBuilder<'a> {
    name: Option<Name<'a>>,
}

impl<'a> EndElementBuilder<'a> {
    /// Sets the name of this closing element.
    ///
    /// Usually the writer is able to determine closing element names automatically. If
    /// this functionality is enabled (by default it is), then this name is checked for correctness.
    /// It is possible, however, to disable such behavior; then the user must ensure that
    /// closing element name is correct manually.
    #[inline]
    pub fn name<N>(mut self, name: N) -> EndElementBuilder<'a> where N: Into<Name<'a>> {
        self.name = Some(name.into());
        self
    }
}

/// Finishes the builder, producing an `EndElement` event.
impl<'a> From<EndElementBuilder<'a>> for XmlEvent<'a> {
    fn from(b: EndElementBuilder<'a>) -> XmlEvent<'a> {
        XmlEvent::EndElement { name: b.name }
    }
}

/// A builder for a starting element event.
pub struct StartElementBuilder<'a> {
    name: Name<'a>,
    attributes: Vec<Attribute<'a>>,
    namespace: Namespace,
}

impl<'a> StartElementBuilder<'a> {
    /// Sets an attribute value of this element to the given string.
    ///
    /// This method can be used to add attributes to the starting element. Name is a qualified
    /// name; its namespace is ignored, but its prefix is checked for correctness, that is,
    /// it is checked that the prefix is bound to some namespace in the current context.
/// /// Currently attributes are not checked for duplicates. Note that duplicate attributes /// are a violation of XML document well-formedness. /// /// The writer checks that you don't specify reserved prefix names, for example `xmlns`. #[inline] pub fn attr(mut self, name: N, value: &'a str) -> StartElementBuilder<'a> where N: Into> { self.attributes.push(Attribute::new(name.into(), value)); self } /// Adds a namespace to the current namespace context. /// /// If no namespace URI was bound to the provided prefix at this point of the document, /// then the mapping from the prefix to the provided namespace URI will be written as /// a part of this element attribute set. /// /// If the same namespace URI was bound to the provided prefix at this point of the document, /// then no namespace attributes will be emitted. /// /// If some other namespace URI was bound to the provided prefix at this point of the document, /// then another binding will be added as a part of this element attribute set, shadowing /// the outer binding. #[inline] #[must_use] pub fn ns(mut self, prefix: S1, uri: S2) -> StartElementBuilder<'a> where S1: Into, S2: Into { self.namespace.put(prefix, uri); self } /// Adds a default namespace mapping to the current namespace context. /// /// Same rules as for `ns()` are also valid for the default namespace mapping. #[inline] #[must_use] pub fn default_ns(mut self, uri: S) -> StartElementBuilder<'a> where S: Into { self.namespace.put(NS_NO_PREFIX, uri); self } } impl<'a> From> for XmlEvent<'a> { #[inline] fn from(b: StartElementBuilder<'a>) -> XmlEvent<'a> { XmlEvent::StartElement { name: b.name, attributes: Cow::Owned(b.attributes), namespace: Cow::Owned(b.namespace), } } } xml-rs-0.8.19/src/writer.rs000064400000000000000000000072521046102023000136240ustar 00000000000000//! Contains high-level interface for an events-based XML emitter. //! //! The most important type in this module is `EventWriter` which allows writing an XML document //! 
to some output stream. pub use self::config::EmitterConfig; pub use self::emitter::EmitterError as Error; pub use self::emitter::Result; pub use self::events::XmlEvent; use self::emitter::Emitter; use std::io::prelude::*; mod config; mod emitter; pub mod events; /// A wrapper around an `std::io::Write` instance which emits XML document according to provided /// events. pub struct EventWriter { sink: W, emitter: Emitter, } impl EventWriter { /// Creates a new `EventWriter` out of an `std::io::Write` instance using the default /// configuration. #[inline] pub fn new(sink: W) -> EventWriter { EventWriter::new_with_config(sink, EmitterConfig::new()) } /// Creates a new `EventWriter` out of an `std::io::Write` instance using the provided /// configuration. #[inline] pub fn new_with_config(sink: W, config: EmitterConfig) -> EventWriter { EventWriter { sink, emitter: Emitter::new(config), } } /// Writes the next piece of XML document according to the provided event. /// /// Note that output data may not exactly correspond to the written event because /// of various configuration options. For example, `XmlEvent::EndElement` may /// correspond to a separate closing element or it may cause writing an empty element. /// Another example is that `XmlEvent::CData` may be represented as characters in /// the output stream. 
pub fn write<'a, E>(&mut self, event: E) -> Result<()> where E: Into> { match event.into() { XmlEvent::StartDocument { version, encoding, standalone } => self.emitter.emit_start_document(&mut self.sink, version, encoding.unwrap_or("UTF-8"), standalone), XmlEvent::ProcessingInstruction { name, data } => self.emitter.emit_processing_instruction(&mut self.sink, name, data), XmlEvent::StartElement { name, attributes, namespace } => { self.emitter.namespace_stack_mut().push_empty().checked_target().extend(namespace.as_ref()); self.emitter.emit_start_element(&mut self.sink, name, &attributes) } XmlEvent::EndElement { name } => { let r = self.emitter.emit_end_element(&mut self.sink, name); self.emitter.namespace_stack_mut().try_pop(); r } XmlEvent::Comment(content) => self.emitter.emit_comment(&mut self.sink, content), XmlEvent::CData(content) => self.emitter.emit_cdata(&mut self.sink, content), XmlEvent::Characters(content) => self.emitter.emit_characters(&mut self.sink, content), } } /// Returns a mutable reference to the underlying `Writer`. /// /// Note that having a reference to the underlying sink makes it very easy to emit invalid XML /// documents. Use this method with care. Valid use cases for this method include accessing /// methods like `Write::flush`, which do not emit new data but rather change the state /// of the stream itself. pub fn inner_mut(&mut self) -> &mut W { &mut self.sink } /// Unwraps this `EventWriter`, returning the underlying writer. /// /// Note that this is a destructive operation: unwrapping a writer and then wrapping /// it again with `EventWriter::new()` will create a fresh writer whose state will be /// blank; for example, accumulated namespaces will be reset. pub fn into_inner(self) -> W { self.sink } }