xml-1.2.0/.cargo_vcs_info.json0000644000000001361046102023000116560ustar { "git": { "sha1": "5ec715b9f09cd793772e13ec5bc8a3a241c9ed63" }, "path_in_vcs": "" }xml-1.2.0/Cargo.lock0000644000000002231046102023000076260ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "xml" version = "1.2.0" xml-1.2.0/Cargo.toml0000644000000026271046102023000076630ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.71" name = "xml" version = "1.2.0" authors = [ "Vladimir Matveev ", "Kornel (https://github.com/kornelski)", ] build = false include = [ "src/**", "LICENSE", "README.md", ] autolib = false autobins = false autoexamples = false autotests = false autobenches = false description = "An XML library in pure Rust" homepage = "https://lib.rs/crates/xml" documentation = "https://docs.rs/xml/" readme = "README.md" keywords = [ "xml", "parser", "sax", "event-reader", "writer", ] categories = ["parser-implementations"] license = "MIT" repository = "https://github.com/kornelski/xml-rs" [package.metadata.docs.rs] targets = ["x86_64-unknown-linux-gnu"] rustdoc-args = ["--generate-link-to-definition"] [package.metadata.release] tag-name = "{{version}}" tag-message = "" [badges.maintenance] status = "actively-developed" [lib] name = "xml" path = "src/lib.rs" xml-1.2.0/Cargo.toml.orig000064400000000000000000000014251046102023000133150ustar 00000000000000[package] name = "xml" version = "1.2.0" authors = ["Vladimir Matveev ", "Kornel 
(https://github.com/kornelski)"] license = "MIT" description = "An XML library in pure Rust" repository = "https://github.com/kornelski/xml-rs" homepage = "https://lib.rs/crates/xml" documentation = "https://docs.rs/xml/" readme = "README.md" keywords = ["xml", "parser", "sax", "event-reader", "writer"] categories = ["parser-implementations"] edition = "2021" rust-version = "1.71" # bump CI too include = ["src/**", "LICENSE", "README.md"] [badges] maintenance = { status = "actively-developed" } [package.metadata.docs.rs] targets = ["x86_64-unknown-linux-gnu"] rustdoc-args = ["--generate-link-to-definition"] [package.metadata.release] tag-name = "{{version}}" tag-message = "" xml-1.2.0/LICENSE000064400000000000000000000020731046102023000114330ustar 00000000000000The MIT License (MIT) Copyright (c) 2014 Vladimir Matveev Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
xml-1.2.0/README.md000064400000000000000000000202521046102023000117040ustar 00000000000000xml-rs, an XML library for Rust =============================== [![CI](https://github.com/kornelski/xml-rs/actions/workflows/main.yml/badge.svg)](https://github.com/kornelski/xml-rs/actions/workflows/main.yml) [![crates.io][crates-io-img]](https://lib.rs/crates/xml) [![docs][docs-img]](https://docs.rs/xml/) [Documentation](https://docs.rs/xml/) [crates-io-img]: https://img.shields.io/crates/v/xml-rs.svg [docs-img]: https://img.shields.io/badge/docs-latest%20release-6495ed.svg xml-rs is an XML library for the [Rust](https://www.rust-lang.org/) programming language. It supports reading and writing of XML documents in a streaming fashion (without DOM). ### Features * XML spec conformance better than other pure-Rust libraries. * Easy to use API based on `Iterator`s and regular `String`s without tricky lifetimes. * Support for UTF-16, UTF-8, ISO-8859-1, and ASCII encodings. * Written entirely in the safe Rust subset. Designed to safely handle untrusted input. The API is heavily inspired by Java Streaming API for XML ([StAX][stax]). It contains a pull parser much like StAX event reader. It provides an iterator API, so you can leverage Rust's existing iterators library features. [stax]: https://en.wikipedia.org/wiki/StAX It also provides a streaming document writer much like StAX event writer. This writer consumes its own set of events, but reader events can be converted to writer events easily, and so it is possible to write XML transformation chains in a pretty clean manner. This parser is mostly full-featured, however, there are limitations: * Legacy code pages and non-Unicode encodings are not supported; * DTD validation is not supported (but entities defined in the internal subset are supported); * attribute value normalization is not performed, and end-of-line characters are not normalized either. Other than that the parser tries to be mostly XML-1.1-compliant. 
Writer is also mostly full-featured with the following limitations: * no support for encodings other than UTF-8, * no support for emitting `<!DOCTYPE>` declarations; * more validations of input are needed, for example, checking that namespace prefixes are bound or comments are well-formed. Building and using ------------------ xml-rs uses [Cargo](https://crates.io), so add it with `cargo add xml` or modify `Cargo.toml`: ```toml [dependencies] xml = "1.0" ``` The package exposes a single crate called `xml`. Reading XML documents --------------------- [`xml::reader::EventReader`][EventReader] requires a [`Read`][stdread] instance to read from. It can be a `File` wrapped in `BufReader`, or a `Vec`, or a `&[u8]` slice. [EventReader]: https://docs.rs/xml/latest/xml/reader/struct.EventReader.html [stdread]: https://doc.rust-lang.org/stable/std/io/trait.Read.html `EventReader` implements the `IntoIterator` trait, so you can use it in a `for` loop directly: ```rust,no_run use std::fs::File; use std::io::BufReader; use xml::reader::{EventReader, XmlEvent}; fn main() -> std::io::Result<()> { let file = File::open("file.xml")?; let file = BufReader::new(file); // Buffering is important for performance let parser = EventReader::new(file); let mut depth = 0; for e in parser { match e { Ok(XmlEvent::StartElement { name, .. }) => { println!("{:spaces$}+{name}", "", spaces = depth * 2); depth += 1; } Ok(XmlEvent::EndElement { name }) => { depth -= 1; println!("{:spaces$}-{name}", "", spaces = depth * 2); } Err(e) => { eprintln!("Error: {e}"); break; } // There's more: https://docs.rs/xml/latest/xml/reader/enum.XmlEvent.html _ => {} } } Ok(()) } ``` Document parsing can end normally or with an error. Regardless of the exact cause, the parsing process will be stopped, and the iterator will terminate normally. You can also have finer control over when to pull the next event from the parser using its own `next()` method: ```rust,ignore match parser.next() { ...
} ``` Upon the end of the document or an error, the parser will remember the last event and will always return it from every subsequent `next()` call. If the iterator is used, it will yield the error or end-of-document event once and will produce `None` afterwards. It is also possible to tweak the parsing process a little using the [`xml::reader::ParserConfig`][ParserConfig] structure. See its documentation for more information and examples. [ParserConfig]: https://docs.rs/xml/latest/xml/reader/struct.ParserConfig.html You can find a more extensive example of using `EventReader` in `src/analyze.rs`, which is a small program (BTW, it is built with `cargo build` and can be run after that) which shows various statistics about the specified XML document. It can also be used to check for well-formedness of XML documents - if a document is not well-formed, this program will exit with an error. ## Parsing untrusted inputs The parser is written in the safe Rust subset, so by Rust's guarantees the worst that it can do is to cause a panic. You can use `ParserConfig` to set limits on maximum lengths of names, attributes, text, entities, etc. You should also set a maximum document size via `io::Read`'s [`take(max)`](https://doc.rust-lang.org/stable/std/io/trait.Read.html#method.take) method. Writing XML documents --------------------- xml-rs also provides a streaming writer much like StAX event writer. With it you can write an XML document to any `Write` implementor.
```rust,no_run use std::io; use xml::writer::{EmitterConfig, XmlEvent}; /// A simple demo syntax where "+foo" makes `<foo>`, "-foo" makes `</foo>` fn make_event_from_line(line: &str) -> XmlEvent { let line = line.trim(); if let Some(name) = line.strip_prefix("+") { XmlEvent::start_element(name).into() } else if line.starts_with("-") { XmlEvent::end_element().into() } else { XmlEvent::characters(line).into() } } fn main() -> io::Result<()> { let input = io::stdin(); let output = io::stdout(); let mut writer = EmitterConfig::new() .perform_indent(true) .create_writer(output); let mut line = String::new(); loop { line.clear(); let bytes_read = input.read_line(&mut line)?; if bytes_read == 0 { break; // EOF } let event = make_event_from_line(&line); if let Err(e) = writer.write(event) { panic!("Write error: {e}") } } Ok(()) } ``` The code example above also demonstrates how to create a writer out of its configuration. A similar approach also works with `EventReader`. The library provides an XML event building DSL which helps to construct complex events, e.g. ones having namespace definitions. Some examples: ```rust,ignore // XmlEvent::start_element("a:hello").attr("a:param", "value").ns("a", "urn:some:document") // XmlEvent::start_element("hello").attr("b:config", "value").default_ns("urn:defaul:uri") // XmlEvent::cdata("some unescaped text") ``` Of course, one can create `XmlEvent` enum variants directly instead of using the builder DSL. There are more examples in the [`xml::writer::XmlEvent`][XmlEvent] documentation. [XmlEvent]: https://docs.rs/xml/latest/xml/writer/enum.XmlEvent.html The writer has multiple configuration options; see the [`EmitterConfig`][EmitterConfig] documentation for more information. [EmitterConfig]: https://docs.rs/xml/latest/xml/writer/struct.EmitterConfig.html Bug reports ------------ Please report issues at: <https://github.com/kornelski/xml-rs/issues>. Before reporting issues with XML conformance, please find the relevant section in the XML spec first.
## [Upgrading from 0.8 to 1.0](https://github.com/kornelski/xml-rs/blob/main/Changelog.md) It should be pretty painless: * Change `xml-rs = "0.8"` to `xml = "1.0"` in `Cargo.toml` * Add `_ => {}` to `match` statements where the compiler complains. A new `Doctype` event has been added, and error enums are non-exhaustive. * If you were creating `ParserConfig` using a struct literal, please use `ParserConfig::new()` and the setters. xml-1.2.0/src/attribute.rs000064400000000000000000000050771046102023000135750ustar 00000000000000//! Contains XML attributes manipulation types and functions. use std::fmt; use crate::escape::{AttributeEscapes, Escaped}; use crate::name::{Name, OwnedName}; /// A borrowed version of an XML attribute. /// /// Consists of a borrowed qualified name and a borrowed string value. #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)] pub struct Attribute<'a> { /// Attribute name. pub name: Name<'a>, /// Attribute value. pub value: &'a str, } impl fmt::Display for Attribute<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}=\"{}\"", self.name, Escaped::::new(self.value)) } } impl<'a> Attribute<'a> { /// Creates an owned attribute out of this borrowed one. #[inline] #[must_use] pub fn to_owned(&self) -> OwnedAttribute { OwnedAttribute { name: self.name.into(), value: self.value.into(), } } /// Creates a borrowed attribute using the provided borrowed name and a borrowed string value. #[inline] #[must_use] pub const fn new(name: Name<'a>, value: &'a str) -> Self { Attribute { name, value } } } /// An owned version of an XML attribute. /// /// Consists of an owned qualified name and an owned string value. #[derive(Clone, Eq, PartialEq, Hash, Debug)] pub struct OwnedAttribute { /// Attribute name. pub name: OwnedName, /// Attribute value. pub value: String, } impl OwnedAttribute { /// Returns a borrowed `Attribute` out of this owned one. 
#[must_use] #[inline] pub fn borrow(&self) -> Attribute<'_> { Attribute { name: self.name.borrow(), value: &self.value, } } /// Creates a new owned attribute using the provided owned name and an owned string value. #[inline] pub fn new>(name: OwnedName, value: S) -> Self { Self { name, value: value.into() } } } impl fmt::Display for OwnedAttribute { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}=\"{}\"", self.name, Escaped::::new(&self.value)) } } #[cfg(test)] mod tests { use super::Attribute; use crate::name::Name; #[test] fn attribute_display() { let attr = Attribute::new( Name::qualified("attribute", "urn:namespace", Some("n")), "its value with > & \" ' < weird symbols", ); assert_eq!( &*attr.to_string(), "{urn:namespace}n:attribute=\"its value with > & " ' < weird symbols\"" ); } } xml-1.2.0/src/common.rs000064400000000000000000000133341046102023000130550ustar 00000000000000//! Contains common types and functions used throughout the library. use std::fmt; /// Represents a position inside some textual document. #[derive(Copy, Clone, PartialEq, Eq)] pub struct TextPosition { #[doc(hidden)] pub row: u64, #[doc(hidden)] pub column: u64, } impl TextPosition { /// Creates a new position initialized to the beginning of the document #[inline] #[must_use] pub const fn new() -> Self { Self { row: 0, column: 0 } } /// Advances the position in a line #[inline] pub fn advance(&mut self, count: u8) { self.column += u64::from(count); } #[doc(hidden)] #[deprecated] pub fn advance_to_tab(&mut self, width: u8) { let width = u64::from(width); self.column += width - self.column % width; } /// Advances the position to the beginning of the next line #[inline] pub fn new_line(&mut self) { self.column = 0; self.row += 1; } /// Row, counting from 0. Add 1 to display as users expect! #[must_use] pub fn row(&self) -> u64 { self.row } /// Column, counting from 0. Add 1 to display as users expect! 
#[must_use] pub fn column(&self) -> u64 { self.column } } impl fmt::Debug for TextPosition { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(self, f) } } impl fmt::Display for TextPosition { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}:{}", self.row + 1, self.column + 1) } } /// Get the position in the document corresponding to the object /// /// This trait is implemented by parsers, lexers and errors. pub trait Position { /// Returns the current position or a position corresponding to the object. fn position(&self) -> TextPosition; } impl Position for TextPosition { #[inline] fn position(&self) -> TextPosition { *self } } /// XML version enumeration. #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] pub enum XmlVersion { /// XML version 1.0, or any 1.x version other than 1.1 /// /// All future versions are disallowed since XML 1.1, so any version beyond 1.1 is an error tolerated only in XML 1.0. /// Version10, /// XML version 1.1. Version11, } impl XmlVersion { /// Convenience helper which returns a string representation of the given version. /// /// ``` /// # use xml::common::XmlVersion; /// assert_eq!(XmlVersion::Version10.as_str(), "1.0"); /// assert_eq!(XmlVersion::Version11.as_str(), "1.1"); /// ``` pub fn as_str(self) -> &'static str { match self { Self::Version10 => "1.0", Self::Version11 => "1.1", } } } impl fmt::Display for XmlVersion { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.as_str().fmt(f) } } impl fmt::Debug for XmlVersion { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(self, f) } } /// Checks whether the given character is a white space character (`S`) /// as is defined by XML 1.1 specification, [section 2.3][1]. 
/// /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn #[must_use] #[inline] pub const fn is_whitespace_char(c: char) -> bool { matches!(c, '\x20' | '\x0a' | '\x09' | '\x0d') } /// Matches the PubIdChar production. pub (crate) fn is_pubid_char(c: char) -> bool { matches!(c, '\x20' | '\x0D' | '\x0A' | 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '\'' | '(' | ')' | '+' | ',' | '.' | '/' | ':' | '=' | '?' | ';' | '!' | '*' | '#' | '@' | '$' | '_' | '%') } /// Checks whether the given string is compound only by white space /// characters (`S`) using the previous `is_whitespace_char` to check /// all characters of this string pub fn is_whitespace_str(s: &str) -> bool { s.chars().all(is_whitespace_char) } /// Is it a valid character in XML 1.0 #[must_use] pub const fn is_xml10_char(c: char) -> bool { matches!(c, '\u{09}' | '\u{0A}' | '\u{0D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..) } /// Is it a valid character in XML 1.1 #[must_use] pub const fn is_xml11_char(c: char) -> bool { matches!(c, '\u{01}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..) } /// Is it a valid character in XML 1.1 but not part of the restricted character set #[must_use] pub const fn is_xml11_char_not_restricted(c: char) -> bool { is_xml11_char(c) && !matches!(c, '\u{01}'..='\u{08}' | '\u{0B}'..='\u{0C}' | '\u{0E}'..='\u{1F}' | '\u{7F}'..='\u{84}' | '\u{86}'..='\u{9F}') } /// Checks whether the given character is a name start character (`NameStartChar`) /// as is defined by XML 1.1 specification, [section 2.3][1]. 
/// /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn #[must_use] pub const fn is_name_start_char(c: char) -> bool { matches!(c, ':' | 'A'..='Z' | '_' | 'a'..='z' | '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | '\u{10000}'..='\u{EFFFF}' ) } /// Checks whether the given character is a name character (`NameChar`) /// as is defined by XML 1.1 specification, [section 2.3][1]. /// /// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn #[must_use] pub const fn is_name_char(c: char) -> bool { if is_name_start_char(c) { return true; } matches!(c, '-' | '.' | '0'..='9' | '\u{B7}' | '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}' ) } xml-1.2.0/src/escape.rs000064400000000000000000000102311046102023000130160ustar 00000000000000//! Contains functions for performing XML special characters escaping. 
use std::borrow::Cow; use std::fmt::{Display, Formatter, Result}; use std::marker::PhantomData; pub(crate) trait Escapes { fn escape(c: u8) -> Option<&'static str>; fn byte_needs_escaping(c: u8) -> bool { Self::escape(c).is_some() } fn str_needs_escaping(s: &str) -> bool { s.bytes().any(|c| Self::escape(c).is_some()) } } pub(crate) struct Escaped<'a, E: Escapes> { _escape_phantom: PhantomData, to_escape: &'a str, } impl<'a, E: Escapes> Escaped<'a, E> { pub const fn new(s: &'a str) -> Self { Escaped { _escape_phantom: PhantomData, to_escape: s, } } } impl Display for Escaped<'_, E> { fn fmt(&self, f: &mut Formatter<'_>) -> Result { let mut total_remaining = self.to_escape; // find the next occurence while let Some(n) = total_remaining.bytes().position(E::byte_needs_escaping) { let (start, remaining) = total_remaining.split_at(n); f.write_str(start)?; // unwrap is safe because we checked is_some for position n earlier let next_byte = remaining.bytes().next().unwrap(); let replacement = E::escape(next_byte).unwrap_or("unexpected token"); f.write_str(replacement)?; total_remaining = &remaining[1..]; } f.write_str(total_remaining) } } fn escape_str(s: &str) -> Cow<'_, str> { if E::str_needs_escaping(s) { Cow::Owned(Escaped::::new(s).to_string()) } else { Cow::Borrowed(s) } } macro_rules! escapes { { $name: ident, $($k: expr => $v: expr),* $(,)? } => { pub(crate) struct $name; impl Escapes for $name { fn escape(c: u8) -> Option<&'static str> { match c { $( $k => Some($v),)* _ => None } } } }; } escapes!( AttributeEscapes, b'<' => "<", b'>' => ">", b'"' => """, b'\'' => "'", b'&' => "&", b'\n' => " ", b'\r' => " ", ); escapes!( PcDataEscapes, b'<' => "<", b'>' => ">", b'&' => "&", ); /// Performs escaping of common XML characters inside an attribute value. 
/// /// This function replaces several important markup characters with their /// entity equivalents: /// /// * `<` → `<` /// * `>` → `>` /// * `"` → `"` /// * `'` → `'` /// * `&` → `&` /// /// The following characters are escaped so that attributes are printed on /// a single line: /// * `\n` → ` ` /// * `\r` → ` ` /// /// The resulting string is safe to use inside XML attribute values or in PCDATA sections. /// /// Does not perform allocations if the given string does not contain escapable characters. #[inline] #[must_use] pub fn escape_str_attribute(s: &str) -> Cow<'_, str> { escape_str::(s) } /// Performs escaping of common XML characters inside PCDATA. /// /// This function replaces several important markup characters with their /// entity equivalents: /// /// * `<` → `<` /// * `&` → `&` /// /// The resulting string is safe to use inside PCDATA sections but NOT inside attribute values. /// /// Does not perform allocations if the given string does not contain escapable characters. #[inline] #[must_use] pub fn escape_str_pcdata(s: &str) -> Cow<'_, str> { escape_str::(s) } #[cfg(test)] mod tests { use super::{escape_str_attribute, escape_str_pcdata}; #[test] fn test_escape_str_attribute() { assert_eq!(escape_str_attribute("<>'\"&\n\r"), "<>'"& "); assert_eq!(escape_str_attribute("no_escapes"), "no_escapes"); } #[test] fn test_escape_str_pcdata() { assert_eq!(escape_str_pcdata("<>&"), "<>&"); assert_eq!(escape_str_pcdata("no_escapes"), "no_escapes"); } #[test] fn test_escape_multibyte_code_points() { assert_eq!(escape_str_attribute("☃<"), "☃<"); assert_eq!(escape_str_pcdata("☃<"), "☃<"); } } xml-1.2.0/src/lib.rs000064400000000000000000000015211046102023000123260ustar 00000000000000#![warn(missing_docs)] #![forbid(non_camel_case_types)] #![forbid(unsafe_code)] #![allow(clippy::redundant_closure_for_method_calls)] #![allow(clippy::module_name_repetitions)] //! This crate currently provides an almost XML 1.0/1.1-compliant pull parser. //! //! 
Please note that functions of this parser may panic. //! If a panic could cause a Denial Of Service in your codebase, *you're* responsible for wrapping access to this library in `catch_unwind`. #![cfg_attr(doctest, doc = include_str!("../README.md"))] pub use crate::reader::{EventReader, ParserConfig}; pub use crate::util::Encoding; pub use crate::writer::{EmitterConfig, EventWriter}; pub mod attribute; pub mod common; pub mod escape; #[doc(hidden)] // FIXME: not supposed to be public pub mod macros; pub mod name; pub mod namespace; pub mod reader; mod util; pub mod writer; xml-1.2.0/src/macros.rs000064400000000000000000000021151046102023000130440ustar 00000000000000#![macro_use] //! Contains several macros used in this crate. macro_rules! gen_setter { ($(#[$comments:meta])* $field:ident : into $t:ty) => { $(#[$comments])* /// /// See [`ParserConfig`][crate::ParserConfig] fields docs for details #[inline] #[must_use] pub fn $field>(mut self, value: T) -> Self { self.$field = value.into(); self } }; ($(#[$comments:meta])* $field:ident : val $t:ty) => { $(#[$comments])* /// /// See [`ParserConfig`][crate::ParserConfig] fields docs for details #[inline] #[must_use] pub const fn $field(mut self, value: $t) -> Self { self.$field = value; self } }; } macro_rules! gen_setters { ($target:ident, $($(#[$comments:meta])* $field:ident : $k:tt $tpe:ty),+) => ( impl $target {$( gen_setter! { $(#[$comments])* $field : $k $tpe } )+ }) } xml-1.2.0/src/name.rs000064400000000000000000000251731046102023000125110ustar 00000000000000//! Contains XML qualified names manipulation types and functions. use std::fmt; use std::str::FromStr; use crate::namespace::NS_NO_PREFIX; /// Represents a qualified XML name. /// /// A qualified name always consists at least of a local name. It can optionally contain /// a prefix; when reading an XML document, if it contains a prefix, it must also contain a /// namespace URI, but this is not enforced statically; see below. 
The name can contain a /// namespace without a prefix; in that case a default, empty prefix is assumed. /// /// When writing XML documents, it is possible to omit the namespace URI, leaving only /// the prefix. In this case the writer will check that the specifed prefix is bound to some /// URI in the current namespace context. If both prefix and namespace URI are specified, /// it is checked that the current namespace context contains this exact correspondence /// between prefix and namespace URI. /// /// # Prefixes and URIs /// /// A qualified name with a prefix must always contain a proper namespace URI --- names with /// a prefix but without a namespace associated with that prefix are meaningless. However, /// it is impossible to obtain proper namespace URI by a prefix without a context, and such /// context is only available when parsing a document (or it can be constructed manually /// when writing a document). Tying a name to a context statically seems impractical. This /// may change in future, though. /// /// # Conversions /// /// `Name` implements some `From` instances for conversion from strings and tuples. For example: /// /// ```rust /// # use xml::name::Name; /// let n1: Name = "p:some-name".into(); /// let n2: Name = ("p", "some-name").into(); /// /// assert_eq!(n1, n2); /// assert_eq!(n1.local_name, "some-name"); /// assert_eq!(n1.prefix, Some("p")); /// assert!(n1.namespace.is_none()); /// ``` /// /// This is added to support easy specification of XML elements when writing XML documents. #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] pub struct Name<'a> { /// A local name, e.g. `string` in `xsi:string`. pub local_name: &'a str, /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`. pub namespace: Option<&'a str>, /// A name prefix, e.g. `xsi` in `xsi:string`. 
pub prefix: Option<&'a str>, } impl<'a> From<&'a str> for Name<'a> { fn from(s: &'a str) -> Self { if let Some((prefix, name)) = s.split_once(':') { Name::prefixed(name, prefix) } else { Name::local(s) } } } impl<'a> From<(&'a str, &'a str)> for Name<'a> { fn from((prefix, name): (&'a str, &'a str)) -> Self { Name::prefixed(name, prefix) } } impl fmt::Display for Name<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let Some(namespace) = self.namespace { write!(f, "{{{namespace}}}")?; } if let Some(prefix) = self.prefix { write!(f, "{prefix}:")?; } f.write_str(self.local_name) } } impl<'a> Name<'a> { /// Returns an owned variant of the qualified name. #[must_use] pub fn to_owned(&self) -> OwnedName { OwnedName { local_name: self.local_name.into(), namespace: self.namespace.map(std::convert::Into::into), prefix: self.prefix.map(std::convert::Into::into), } } /// Returns a new `Name` instance representing plain local name. #[inline] #[must_use] pub const fn local(local_name: &str) -> Name<'_> { Name { local_name, prefix: None, namespace: None, } } /// Returns a new `Name` instance with the given local name and prefix. #[inline] #[must_use] pub const fn prefixed(local_name: &'a str, prefix: &'a str) -> Self { Name { local_name, namespace: None, prefix: Some(prefix), } } /// Returns a new `Name` instance representing a qualified name with or without a prefix and /// with a namespace URI. #[inline] #[must_use] pub const fn qualified(local_name: &'a str, namespace: &'a str, prefix: Option<&'a str>) -> Self { Name { local_name, namespace: Some(namespace), prefix, } } /// Returns a correct XML representation of this local name and prefix. /// /// This method is different from the autoimplemented `to_string()` because it does not /// include namespace URI in the result. 
#[must_use] pub fn to_repr(&self) -> String { self.repr_display().to_string() } /// Returns a structure which can be displayed with `std::fmt` machinery to obtain this /// local name and prefix. /// /// This method is needed for efficiency purposes in order not to create unnecessary /// allocations. #[inline] #[must_use] pub const fn repr_display(&self) -> ReprDisplay<'_, '_> { ReprDisplay(self) } /// Returns either a prefix of this name or `namespace::NS_NO_PREFIX` constant. #[inline] #[must_use] pub fn prefix_repr(&self) -> &str { self.prefix.unwrap_or(NS_NO_PREFIX) } } /// A wrapper around `Name` whose `Display` implementation prints the wrapped name as it is /// displayed in an XML document. pub struct ReprDisplay<'a, 'b>(&'a Name<'b>); impl<'a, 'b: 'a> fmt::Display for ReprDisplay<'a, 'b> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.0.prefix { Some(prefix) => write!(f, "{}:{}", prefix, self.0.local_name), None => self.0.local_name.fmt(f), } } } /// An owned variant of `Name`. /// /// Everything about `Name` applies to this structure as well. #[derive(Clone, PartialEq, Eq, Hash, Debug)] pub struct OwnedName { /// A local name, e.g. `string` in `xsi:string`. /// /// Local name is ambiguous, because multiple namespaces can share the same name. /// Always check `namespace` too. pub local_name: String, /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`. /// /// `None` for default namespace. /// /// Note that in XML attributes don't inherit element's namespace /// and are in the default namespace unless they have a prefix. pub namespace: Option, /// Semantically meaningless name prefix, e.g. `xsi` in `xsi:string`. /// /// Prefixes are just a syntactic detail used for decoration or to avoid repetition. /// Only `namespace` matters for identity of the element. pub prefix: Option, } impl PartialEq<(&str, &str)> for OwnedName { /// Compare `(namespaceURI, localName)`. Default namespace is `""`. 
fn eq(&self, other: &(&str, &str)) -> bool {
        // A missing namespace compares equal to the empty string.
        other.1.eq(&self.local_name) && other.0.eq(self.namespace.as_deref().unwrap_or_default())
    }
}

impl PartialEq<OwnedName> for (&str, &str) {
    /// Compare `(namespaceURI, localName)`. Default namespace is `""`.
    fn eq(&self, other: &OwnedName) -> bool {
        other.eq(self)
    }
}

impl fmt::Display for OwnedName {
    /// Formats exactly like the borrowed `Name` obtained from `borrow()`.
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(&self.borrow(), f)
    }
}

impl OwnedName {
    /// Constructs a borrowed `Name` based on this owned name.
    #[must_use]
    #[inline]
    pub fn borrow(&self) -> Name<'_> {
        Name {
            local_name: &self.local_name,
            namespace: self.namespace.as_deref(),
            prefix: self.prefix.as_deref(),
        }
    }

    /// Returns a new `OwnedName` instance representing a plain local name.
    #[inline]
    pub fn local<S>(local_name: S) -> Self where S: Into<String> {
        Self {
            local_name: local_name.into(),
            namespace: None,
            prefix: None,
        }
    }

    /// Returns a new `OwnedName` instance representing a qualified name with or without
    /// a prefix and with a namespace URI.
    #[inline]
    pub fn qualified<S1, S2, S3>(local_name: S1, namespace: S2, prefix: Option<S3>) -> Self
        where S1: Into<String>, S2: Into<String>, S3: Into<String>
    {
        Self {
            local_name: local_name.into(),
            namespace: Some(namespace.into()),
            prefix: prefix.map(std::convert::Into::into),
        }
    }

    /// Returns an optional prefix by reference, equivalent to `self.borrow().prefix`
    /// but avoids extra work.
    #[inline]
    #[must_use]
    pub fn prefix_ref(&self) -> Option<&str> {
        self.prefix.as_deref()
    }

    /// Returns an optional namespace by reference, equivalent to `self.borrow().namespace`
    /// but avoids extra work.
    #[inline]
    #[must_use]
    pub fn namespace_ref(&self) -> Option<&str> {
        self.namespace.as_deref()
    }
}

impl<'a> From<Name<'a>> for OwnedName {
    #[inline]
    fn from(n: Name<'a>) -> Self {
        n.to_owned()
    }
}

impl FromStr for OwnedName {
    type Err = ();

    /// Parses the given string slice into a qualified name.
    ///
    /// This function, when it finishes successfully, always returns a qualified
    /// name without a namespace (`name.namespace == None`).
It should be filled later /// using proper `NamespaceStack`. /// /// It is supposed that all characters in the argument string are correct /// as defined by the XML specification. No additional checks except a check /// for emptiness are done. fn from_str(s: &str) -> Result { let mut it = s.split(':'); let r = match (it.next(), it.next(), it.next()) { (Some(prefix), Some(local_name), None) if !prefix.is_empty() && !local_name.is_empty() => Some((local_name.into(), Some(prefix.into()))), (Some(local_name), None, None) if !local_name.is_empty() => Some((local_name.into(), None)), (_, _, _) => None }; r.map(|(local_name, prefix)| Self { local_name, namespace: None, prefix }).ok_or(()) } } #[cfg(test)] mod tests { use super::OwnedName; #[test] fn test_owned_name_from_str() { assert_eq!("prefix:name".parse(), Ok(OwnedName { local_name: "name".into(), namespace: None, prefix: Some("prefix".into()) })); assert_eq!("name".parse(), Ok(OwnedName { local_name: "name".into(), namespace: None, prefix: None })); assert_eq!("".parse(), Err::(())); assert_eq!(":".parse(), Err::(())); assert_eq!(":a".parse(), Err::(())); assert_eq!("a:".parse(), Err::(())); assert_eq!("a:b:c".parse(), Err::(())); } } xml-1.2.0/src/namespace.rs000064400000000000000000000405061046102023000135220ustar 00000000000000//! Contains namespace manipulation types and functions. use std::borrow::Cow; use std::collections::btree_map::{BTreeMap, Entry, Iter as Entries}; use std::collections::HashSet; use std::iter::{Map, Rev}; use std::slice::Iter; /// Designates prefix for namespace definitions. /// /// See [Namespaces in XML][namespace] spec for more information. /// /// [namespace]: http://www.w3.org/TR/xml-names/#ns-decl pub const NS_XMLNS_PREFIX: &str = "xmlns"; /// Designates the standard URI for `xmlns` prefix. /// /// See [A Namespace Name for xmlns Attributes][namespace] for more information. 
///
/// [namespace]: http://www.w3.org/2000/xmlns/
pub const NS_XMLNS_URI: &str = "http://www.w3.org/2000/xmlns/";

/// Designates prefix for a namespace containing several special predefined attributes.
///
/// See [2.10 White Space handling][1], [2.1 Language Identification][2],
/// [XML Base specification][3] and [xml:id specification][4] for more information.
///
/// [1]: http://www.w3.org/TR/REC-xml/#sec-white-space
/// [2]: http://www.w3.org/TR/REC-xml/#sec-lang-tag
/// [3]: http://www.w3.org/TR/xmlbase/
/// [4]: http://www.w3.org/TR/xml-id/
pub const NS_XML_PREFIX: &str = "xml";

/// Designates the standard URI for `xml` prefix.
///
/// See `NS_XML_PREFIX` documentation for more information.
pub const NS_XML_URI: &str = "http://www.w3.org/XML/1998/namespace";

/// Designates the absence of prefix in a qualified name.
///
/// This constant should be used to define or query default namespace which should be used
/// for element or attribute names without prefix. For example, if a namespace mapping
/// at a particular point in the document contains correspondence like
///
/// ```none
/// NS_NO_PREFIX --> urn:some:namespace
/// ```
///
/// then for all names declared without an explicit prefix, `urn:some:namespace` is assumed
/// as the namespace URI.
///
/// By default empty prefix corresponds to absence of namespace, but this can change either
/// when writing an XML document (manually) or when reading an XML document (based on namespace
/// declarations).
pub const NS_NO_PREFIX: &str = "";

/// Designates an empty namespace URI, which is equivalent to absence of namespace.
///
/// This constant should not usually be used directly; it is used to designate that
/// empty prefix corresponds to absent namespace in `NamespaceStack` instances created with
/// `NamespaceStack::default()`. Therefore, it can be used to restore `NS_NO_PREFIX` mapping
/// in a namespace back to its default value.
pub const NS_EMPTY_URI: &str = ""; /// Namespace is a map from prefixes to namespace URIs. /// /// No prefix (i.e. default namespace) is designated by `NS_NO_PREFIX` constant. #[derive(PartialEq, Eq, Clone, Debug)] pub struct Namespace(pub BTreeMap); impl Namespace { /// Returns an empty namespace. #[inline] #[must_use] pub fn empty() -> Self { Self(BTreeMap::new()) } /// Checks whether this namespace is empty. #[inline] #[must_use] pub fn is_empty(&self) -> bool { self.0.is_empty() } /// Checks whether this namespace is essentially empty, that is, it does not contain /// anything but default mappings. #[must_use] pub fn is_essentially_empty(&self) -> bool { // a shortcut for a namespace which is definitely not empty if self.0.len() > 3 { return false; } self.0.iter().all(|(k, v)| matches!((&**k, &**v), (NS_NO_PREFIX, NS_EMPTY_URI) | (NS_XMLNS_PREFIX, NS_XMLNS_URI) | (NS_XML_PREFIX, NS_XML_URI)) ) } /// Checks whether this namespace mapping contains the given prefix. /// /// # Parameters /// * `prefix` --- namespace prefix. /// /// # Return value /// `true` if this namespace contains the given prefix, `false` otherwise. #[inline] pub fn contains>(&self, prefix: &P) -> bool { self.0.contains_key(prefix.as_ref()) } /// Puts a mapping into this namespace. /// /// This method does not override any already existing mappings. /// /// Returns a boolean flag indicating whether the map already contained /// the given prefix. /// /// # Parameters /// * `prefix` --- namespace prefix; /// * `uri` --- namespace URI. /// /// # Return value /// `true` if `prefix` has been inserted successfully; `false` if the `prefix` /// was already present in the namespace. pub fn put(&mut self, prefix: P, uri: U) -> bool where P: Into, U: Into { match self.0.entry(prefix.into()) { Entry::Occupied(_) => false, Entry::Vacant(ve) => { ve.insert(uri.into()); true }, } } /// Puts a mapping into this namespace forcefully. /// /// This method, unlike `put()`, does replace an already existing mapping. 
/// /// Returns previous URI which was assigned to the given prefix, if it is present. /// /// # Parameters /// * `prefix` --- namespace prefix; /// * `uri` --- namespace URI. /// /// # Return value /// `Some(uri)` with `uri` being a previous URI assigned to the `prefix`, or /// `None` if such prefix was not present in the namespace before. pub fn force_put(&mut self, prefix: P, uri: U) -> Option where P: Into, U: Into { self.0.insert(prefix.into(), uri.into()) } /// Queries the namespace for the given prefix. /// /// # Parameters /// * `prefix` --- namespace prefix. /// /// # Return value /// Namespace URI corresponding to the given prefix, if it is present. pub fn get<'a, P: ?Sized + AsRef>(&'a self, prefix: &P) -> Option<&'a str> { self.0.get(prefix.as_ref()).map(|s| &**s) } /// Borrowed namespace for the writer #[must_use] pub const fn borrow(&self) -> Cow<'_, Self> { Cow::Borrowed(self) } /// Namespace mappings contained in a namespace. pub fn iter(&self) -> NamespaceMappings<'_> { self.into_iter() } } /// An alias for iterator type for namespace mappings contained in a namespace. pub type NamespaceMappings<'a> = Map< Entries<'a, String, String>, for<'b> fn((&'b String, &'b String)) -> UriMapping<'b> >; impl<'a> IntoIterator for &'a Namespace { type IntoIter = NamespaceMappings<'a>; type Item = UriMapping<'a>; fn into_iter(self) -> Self::IntoIter { fn mapper<'a>((prefix, uri): (&'a String, &'a String)) -> UriMapping<'a> { (prefix, uri) } self.0.iter().map(mapper) } } /// Namespace stack is a sequence of namespaces. /// /// Namespace stack is used to represent cumulative namespace consisting of /// combined namespaces from nested elements. #[derive(Clone, Eq, PartialEq, Debug)] pub struct NamespaceStack(pub Vec); impl NamespaceStack { /// Returns an empty namespace stack. #[inline] #[must_use] pub fn empty() -> Self { Self(Vec::with_capacity(2)) } /// Returns a namespace stack with default items in it. 
///
    /// Default items are the following:
    ///
    /// * `xml` → `http://www.w3.org/XML/1998/namespace`;
    /// * `xmlns` → `http://www.w3.org/2000/xmlns/`;
    /// * the empty prefix (`NS_NO_PREFIX`) → the empty URI (`NS_EMPTY_URI`).
    #[inline]
    #[must_use]
    #[allow(clippy::should_implement_trait)]
    pub fn default() -> Self {
        let mut nst = Self::empty();
        nst.push_empty();
        // xml namespace
        nst.put(NS_XML_PREFIX, NS_XML_URI);
        // xmlns namespace
        nst.put(NS_XMLNS_PREFIX, NS_XMLNS_URI);
        // empty namespace
        nst.put(NS_NO_PREFIX, NS_EMPTY_URI);
        nst
    }

    /// Adds an empty namespace to the top of this stack.
    #[inline]
    pub fn push_empty(&mut self) -> &mut Self {
        self.0.push(Namespace::empty());
        self
    }

    /// Removes the topmost namespace in this stack.
    ///
    /// Panics if the stack is empty.
    #[inline]
    #[track_caller]
    pub fn pop(&mut self) -> Namespace {
        self.0.pop().unwrap()
    }

    /// Removes the topmost namespace in this stack.
    ///
    /// Returns `Some(namespace)` if this stack is not empty and `None` otherwise.
    #[inline]
    pub fn try_pop(&mut self) -> Option<Namespace> {
        self.0.pop()
    }

    /// Borrows the topmost namespace mutably, leaving the stack intact.
    ///
    /// Panics if the stack is empty.
    #[inline]
    #[track_caller]
    pub fn peek_mut(&mut self) -> &mut Namespace {
        self.0.last_mut().unwrap()
    }

    /// Borrows the topmost namespace immutably, leaving the stack intact.
    ///
    /// Panics if the stack is empty.
    #[inline]
    #[must_use]
    #[track_caller]
    pub fn peek(&self) -> &Namespace {
        self.0.last().unwrap()
    }

    /// Puts a mapping into the topmost namespace if this stack does not already contain one.
    ///
    /// Returns a boolean flag indicating whether the insertion has completed successfully.
    /// Note that both key and value are matched and the mapping is inserted if either
    /// namespace prefix is not already mapped, or if it is mapped, but to a different URI.
    ///
    /// # Parameters
    /// * `prefix` --- namespace prefix;
    /// * `uri`    --- namespace URI.
    ///
    /// # Return value
    /// `true` if `prefix` has been inserted successfully; `false` if the `prefix`
    /// was already present in the namespace stack.
pub fn put_checked<P, U>(&mut self, prefix: P, uri: U) -> bool
        where P: Into<String> + AsRef<str>,
              U: Into<String> + AsRef<str>
    {
        // Every namespace in the stack is consulted, including those whose
        // mapping for `prefix` is shadowed by a namespace closer to the top.
        if self.0.iter().any(|ns| ns.get(&prefix) == Some(uri.as_ref())) {
            false
        } else {
            self.put(prefix, uri);
            true
        }
    }

    /// Puts a mapping into the topmost namespace in this stack.
    ///
    /// This method does not override a mapping in the topmost namespace if it is
    /// already present, however, it does not depend on other namespaces in the stack,
    /// so it is possible to put a mapping which is present in lower namespaces.
    ///
    /// Returns a boolean flag indicating whether the insertion has completed successfully.
    ///
    /// # Parameters
    /// * `prefix` --- namespace prefix;
    /// * `uri`    --- namespace URI.
    ///
    /// # Return value
    /// `true` if `prefix` has been inserted successfully; `false` if the `prefix`
    /// was already present in the topmost namespace, or if the stack is empty.
    #[inline]
    pub fn put<P, U>(&mut self, prefix: P, uri: U) -> bool
        where P: Into<String>, U: Into<String>
    {
        if let Some(ns) = self.0.last_mut() {
            ns.put(prefix, uri)
        } else {
            false
        }
    }

    /// Performs a search for the given prefix in the whole stack.
    ///
    /// This method walks the stack from top to bottom, querying each namespace
    /// in order for the given prefix. If none of the namespaces contains the prefix,
    /// `None` is returned.
    ///
    /// # Parameters
    /// * `prefix` --- namespace prefix.
    #[inline]
    pub fn get<'a, P: ?Sized + AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> {
        let prefix = prefix.as_ref();
        // The first hit from the top wins, so inner declarations shadow outer ones.
        self.0.iter().rev().find_map(|ns| ns.get(prefix))
    }

    /// Combines this stack of namespaces into a single namespace.
    ///
    /// Namespaces are combined in left-to-right order, that is, rightmost namespace
    /// elements take priority over leftmost ones.
    #[must_use]
    pub fn squash(&self) -> Namespace {
        let mut result = BTreeMap::new();
        for ns in &self.0 {
            // `extend` overwrites existing keys, which gives later namespaces priority.
            result.extend(ns.0.iter().map(|(k, v)| (k.clone(), v.clone())));
        }
        Namespace(result)
    }

    /// Returns an object which implements `Extend` using `put_checked()` instead of `put()`.
/// /// See `CheckedTarget` for more information. #[inline] pub fn checked_target(&mut self) -> CheckedTarget<'_> { CheckedTarget(self) } /// Returns an iterator over all mappings in this namespace stack. #[inline] #[must_use] pub fn iter(&self) -> NamespaceStackMappings<'_> { self.into_iter() } } /// An iterator over mappings from prefixes to URIs in a namespace stack. /// /// # Example /// ``` /// # use xml::namespace::NamespaceStack; /// let mut nst = NamespaceStack::empty(); /// nst.push_empty(); /// nst.put("a", "urn:A"); /// nst.put("b", "urn:B"); /// nst.push_empty(); /// nst.put("c", "urn:C"); /// /// assert_eq!(vec![("c", "urn:C"), ("a", "urn:A"), ("b", "urn:B")], nst.iter().collect::>()); /// ``` pub struct NamespaceStackMappings<'a> { namespaces: Rev>, current_namespace: Option>, used_keys: HashSet<&'a str>, } impl NamespaceStackMappings<'_> { fn go_to_next_namespace(&mut self) -> bool { self.current_namespace = self.namespaces.next().map(|ns| ns.into_iter()); self.current_namespace.is_some() } } impl<'a> Iterator for NamespaceStackMappings<'a> { type Item = UriMapping<'a>; fn next(&mut self) -> Option> { // If there is no current namespace and no next namespace, we're finished if self.current_namespace.is_none() && !self.go_to_next_namespace() { return None; } let next_item = self.current_namespace.as_mut()?.next(); match next_item { // There is an element in the current namespace Some((k, v)) => if self.used_keys.contains(&k) { // If the current key is used, go to the next one self.next() } else { // Otherwise insert the current key to the set of used keys and // return the mapping self.used_keys.insert(k); Some((k, v)) }, // Current namespace is exhausted None => if self.go_to_next_namespace() { // If there is next namespace, continue from it self.next() } else { // No next namespace, exiting None } } } } impl<'a> IntoIterator for &'a NamespaceStack { type IntoIter = NamespaceStackMappings<'a>; type Item = UriMapping<'a>; fn into_iter(self) -> 
Self::IntoIter {
        NamespaceStackMappings {
            // Topmost namespace first, so that inner declarations win.
            namespaces: self.0.iter().rev(),
            current_namespace: None,
            used_keys: HashSet::new(),
        }
    }
}

/// A type alias for a pair of `(prefix, uri)` values returned by namespace iterators.
pub type UriMapping<'a> = (&'a str, &'a str);

impl<'a> Extend<UriMapping<'a>> for Namespace {
    /// Inserts every mapping with `put()`, so existing prefixes are left untouched.
    fn extend<T>(&mut self, iterable: T) where T: IntoIterator<Item = UriMapping<'a>> {
        for (prefix, uri) in iterable {
            self.put(prefix, uri);
        }
    }
}

impl<'a> Extend<UriMapping<'a>> for NamespaceStack {
    /// Inserts every mapping into the topmost namespace with `put()`.
    fn extend<T>(&mut self, iterable: T) where T: IntoIterator<Item = UriMapping<'a>> {
        for (prefix, uri) in iterable {
            self.put(prefix, uri);
        }
    }
}

/// A wrapper around `NamespaceStack` which implements `Extend` using `put_checked()`.
///
/// # Example
///
/// ```
/// # use xml::namespace::NamespaceStack;
///
/// let mut nst = NamespaceStack::empty();
/// nst.push_empty();
/// nst.put("a", "urn:A");
/// nst.put("b", "urn:B");
/// nst.push_empty();
/// nst.put("c", "urn:C");
///
/// nst.checked_target().extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]);
/// assert_eq!(
///     vec![("a", "urn:Z"), ("c", "urn:C"), ("d", "urn:D"), ("b", "urn:B")],
///     nst.iter().collect::<Vec<_>>()
/// );
/// ```
///
/// Compare:
///
/// ```
/// # use xml::namespace::NamespaceStack;
/// # let mut nst = NamespaceStack::empty();
/// # nst.push_empty();
/// # nst.put("a", "urn:A");
/// # nst.put("b", "urn:B");
/// # nst.push_empty();
/// # nst.put("c", "urn:C");
///
/// nst.extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]);
/// assert_eq!(
///     vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:C"), ("d", "urn:D")],
///     nst.iter().collect::<Vec<_>>()
/// );
/// ```
pub struct CheckedTarget<'a>(&'a mut NamespaceStack);

impl<'b> Extend<UriMapping<'b>> for CheckedTarget<'_> {
    /// Inserts every mapping into the wrapped stack with `put_checked()`.
    fn extend<T>(&mut self, iterable: T) where T: IntoIterator<Item = UriMapping<'b>> {
        for (prefix, uri) in iterable {
            self.0.put_checked(prefix, uri);
        }
    }
}
xml-1.2.0/src/reader/config.rs000064400000000000000000000276721046102023000143060ustar 00000000000000
//! Contains parser configuration structure.
use std::collections::HashMap;
use std::io::Read;

use crate::reader::EventReader;
use crate::util::Encoding;

/// Limits to defend from billion laughs attack
const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;

/// Parser configuration structure. **There are more config methods than public fields — see methods below**.
///
/// This structure contains various configuration options which affect
/// behavior of the parser.
#[derive(Clone, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub struct ParserConfig {
    /// Whether or not should whitespace in textual events be removed. Default is false.
    ///
    /// When true, all standalone whitespace will be removed (this means no
    /// `Whitespace` events will be emitted), and leading and trailing whitespace
    /// from `Characters` events will be deleted. If after trimming `Characters`
    /// event will be empty, it will also be omitted from output stream. This is
    /// possible, however, only if `whitespace_to_characters` or
    /// `cdata_to_characters` options are set.
    ///
    /// This option does not affect CDATA events, unless `cdata_to_characters`
    /// option is also set. In that case CDATA content will also be trimmed.
    pub trim_whitespace: bool,

    /// Whether or not should whitespace be converted to characters.
    /// Default is false.
    ///
    /// If true, instead of `Whitespace` events `Characters` events with the
    /// same content will be emitted. If `trim_whitespace` is also true, these
    /// events will be trimmed to nothing and, consequently, not emitted.
    pub whitespace_to_characters: bool,

    /// Whether or not should CDATA be converted to characters.
    /// Default is false.
    ///
    /// If true, instead of `CData` events `Characters` events with the same
    /// content will be emitted. If `trim_whitespace` is also true, these events
    /// will be trimmed. If corresponding CDATA contained nothing but whitespace,
    /// this event will be omitted from the stream.
pub cdata_to_characters: bool,

    /// Whether or not should comments be omitted. Default is true.
    ///
    /// If true, `Comment` events will not be emitted at all.
    pub ignore_comments: bool,

    /// Whether or not should sequential `Characters` events be merged.
    /// Default is true.
    ///
    /// If true, multiple sequential `Characters` events will be merged into
    /// a single event, that is, their data will be concatenated.
    ///
    /// Multiple sequential `Characters` events are only possible if either
    /// `cdata_to_characters` or `ignore_comments` are set. Otherwise character
    /// events will always be separated by other events.
    pub coalesce_characters: bool,

    /// A map of extra entities recognized by the parser. Default is an empty map.
    ///
    /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes,
    /// however, it is convenient to make the parser recognize additional entities which
    /// are also not available through the DTD definitions (especially given that at the moment
    /// DTD parsing is not supported).
    pub extra_entities: HashMap<String, String>,

    /// Whether or not the parser should ignore the end of stream. Default is false.
    ///
    /// By default the parser will either error out when it encounters a premature end of
    /// stream or complete normally if the end of stream was expected. If you want to continue
    /// reading from a stream whose input is supplied progressively, you can set this option to true.
    /// In this case the parser will allow you to invoke the `next()` method even if a supposed end
    /// of stream has happened.
    ///
    /// Note that support for this functionality is incomplete; for example, the parser will fail if
    /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk.
pub ignore_end_of_stream: bool, /// Whether or not non-unicode entity references get replaced with the replacement character /// /// When true, any decimal or hexadecimal character reference that cannot be converted from a /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html) /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD). pub replace_unknown_entity_references: bool, /// Whether or not whitespace at the root level of the document is ignored. Default is true. /// /// By default any whitespace that is not enclosed within at least one level of elements will be /// ignored. Setting this value to false will cause root level whitespace events to be emitted. pub ignore_root_level_whitespace: bool, /// Use this encoding as the default. Necessary for UTF-16 files without BOM. pub override_encoding: Option, /// Allow `` to contain unsupported encoding names, /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing. pub ignore_invalid_encoding_declarations: bool, /// Documents with multiple root elements are ill-formed pub allow_multiple_root_elements: bool, /// Abort if custom entities create a string longer than this pub max_entity_expansion_length: usize, /// Entities can expand into other entities this many times (be careful about exponential cost!) pub max_entity_expansion_depth: u8, /// Maximum length of tag name or attribute name pub max_name_length: usize, /// Max number of attributes per element pub max_attributes: usize, /// Max number of bytes in each attribute pub max_attribute_length: usize, /// Maximum length of strings reprsenting characters, comments, and processing instructions pub max_data_length: usize, } impl ParserConfig { /// Returns a new config with default values. 
///
    /// You can tweak default values using builder-like pattern:
    ///
    /// ```rust
    /// use xml::reader::ParserConfig;
    ///
    /// let config = ParserConfig::new()
    ///     .trim_whitespace(true)
    ///     .ignore_comments(true)
    ///     .coalesce_characters(false);
    /// ```
    #[must_use]
    #[inline]
    pub fn new() -> Self {
        Self {
            trim_whitespace: false,
            whitespace_to_characters: false,
            cdata_to_characters: false,
            ignore_comments: true,
            coalesce_characters: true,
            extra_entities: HashMap::new(),
            ignore_end_of_stream: false,
            replace_unknown_entity_references: false,
            ignore_root_level_whitespace: true,

            override_encoding: None,
            ignore_invalid_encoding_declarations: false,
            allow_multiple_root_elements: true,
            max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH,
            max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH,
            max_attributes: 1 << 16,
            max_attribute_length: 1 << 30,
            max_data_length: 1 << 30,
            max_name_length: 1 << 18,
        }
    }

    /// Creates an XML reader with this configuration. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow.
    ///
    /// This is a convenience method for configuring and creating a reader at the same time:
    ///
    /// ```rust
    /// use xml::reader::ParserConfig;
    ///
    /// let mut source: &[u8] = b"...";
    ///
    /// let reader = ParserConfig::new()
    ///     .trim_whitespace(true)
    ///     .ignore_comments(true)
    ///     .coalesce_characters(false)
    ///     .create_reader(&mut source);
    /// ```
    ///
    /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
    /// this configuration object.
    #[inline]
    pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
        EventReader::new_with_config(source, self)
    }

    /// Adds a new entity mapping and returns an updated config object.
    ///
    /// This is a convenience method for adding external entities mappings to the XML parser.
/// An example: /// /// ```rust /// use xml::reader::ParserConfig; /// /// let mut source: &[u8] = b"..."; /// /// let reader = ParserConfig::new() /// .add_entity("nbsp", " ") /// .add_entity("copy", "©") /// .add_entity("reg", "®") /// .create_reader(&mut source); /// ``` #[must_use] #[inline] pub fn add_entity, T: Into>(mut self, entity: S, value: T) -> Self { self.extra_entities.insert(entity.into(), value.into()); self } /// Adds entities and returns an updated config object. /// /// This is a convenience method for adding external entities mappings to the XML parser. /// An example: /// /// ```rust /// use xml::reader::ParserConfig; /// /// let mut source: &[u8] = b"..."; /// /// let reader = ParserConfig::new() /// .add_entities([ /// ("nbsp", " "), /// ("copy", "©"), /// ("reg", "®"), /// ]) /// .create_reader(&mut source); /// ``` #[must_use] #[inline] pub fn add_entities, T: Into>(mut self, entities: impl IntoIterator) -> Self { self.extra_entities.extend(entities.into_iter().map(|(k, v)| (k.into(), v.into()))); self } } gen_setters! { ParserConfig, trim_whitespace: val bool, whitespace_to_characters: val bool, cdata_to_characters: val bool, ignore_comments: val bool, coalesce_characters: val bool, ignore_end_of_stream: val bool, replace_unknown_entity_references: val bool, /// Whether or not whitespace at the root level of the document is ignored. Default is true. ignore_root_level_whitespace: val bool } impl Default for ParserConfig { fn default() -> Self { Self::new() } } impl ParserConfig { /// Read character encoding from `Content-Type` header. /// Set this when parsing XML documents fetched over HTTP. /// /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback. 
#[must_use]
    pub fn content_type(mut self, mime_type: &str) -> Self {
        // Parameters follow the media type and are separated by `;`. Parameter
        // names are case-insensitive, and `charset` need not be the last one.
        let charset = mime_type.split(';').skip(1).find_map(|param| {
            let (name, value) = param.split_once('=')?;
            name.trim()
                .eq_ignore_ascii_case("charset")
                .then(|| value.trim().trim_matches('"'))
        });
        if let Some(name) = charset {
            // An unrecognized charset name is ignored and the previous setting is kept.
            if let Ok(enc) = name.parse() {
                self.override_encoding = Some(enc);
            }
        }
        self
    }
}

gen_setters! { ParserConfig,
    /// Set if you got one in the HTTP header
    override_encoding: val Option<Encoding>,
    /// Allows invalid documents. There should be only a single root element in XML.
    allow_multiple_root_elements: val bool,
    /// Abort if custom entities create a string longer than this
    max_entity_expansion_length: val usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    max_entity_expansion_depth: val u8,
    /// Max number of attributes per element
    max_attributes: val usize,
    /// Maximum length of tag name or attribute name
    max_name_length: val usize,
    /// Max number of bytes in each attribute
    max_attribute_length: val usize,
    /// Maximum length of strings representing characters, comments, and processing instructions
    max_data_length: val usize,
    /// Allow `<?xml encoding="…">` declarations that name an unsupported encoding
    ignore_invalid_encoding_declarations: val bool
}

#[test]
fn mime_parse() {
    let c = ParserConfig::new()
        .content_type("text/xml;charset=Us-AScii")
        .max_entity_expansion_length(1000);
    assert_eq!(c.override_encoding, Some(Encoding::Ascii));

    let c = ParserConfig::new()
        .max_entity_expansion_depth(3)
        .content_type("text/xml;charset = \"UTF-16\"");
    assert_eq!(c.override_encoding, Some(Encoding::Utf16));

    // Case-insensitive parameter name, followed by another parameter
    let c = ParserConfig::new()
        .content_type("text/xml; CHARSET=us-ascii; q=1");
    assert_eq!(c.override_encoding, Some(Encoding::Ascii));
}
xml-1.2.0/src/reader/error.rs000064400000000000000000000271021046102023000141560ustar 00000000000000
use crate::Encoding;
use crate::reader::lexer::Token;
use crate::writer::Error as EmitterError;

use std::borrow::Cow;
use std::error::Error as _;
use std::{error, fmt, io, str};

use crate::common::{Position, TextPosition};
use crate::util;

/// Failure reason
#[derive(Debug)]
#[non_exhaustive]
pub enum ErrorKind {
/// This is an ill-formed XML document Syntax(Cow<'static, str>), /// Reader/writer reported an error Io(io::Error), /// The document contains bytes that are not allowed in UTF-8 strings Utf8(str::Utf8Error), /// The document ended while they were elements/comments/etc. still open UnexpectedEof, /// [Writer error](crate::writer::Error) for convenience of using a single [`Error`] type EmitterError(Box), } /// Returned by `add_entities()` #[derive(Clone, PartialEq)] #[non_exhaustive] pub enum ImmutableEntitiesError { /// Too late to modify ElementEncountered, /// `` can't have entities StandaloneDocument, } #[derive(Debug, Clone, PartialEq)] #[non_exhaustive] pub(crate) enum SyntaxError { CannotRedefineXmlnsPrefix, CannotRedefineXmlPrefix, /// Recursive custom entity expanded to too many chars, it could be DoS EntityTooBig, EmptyEntity, NoRootElement, ProcessingInstructionWithoutName, UnbalancedRootElement, UnexpectedEof, UnexpectedOpeningTag, /// Missing `]]>` UnclosedCdata, UnexpectedQualifiedName(Token), UnexpectedTokenOutsideRoot(Token), UnexpectedToken(Token), UnexpectedTokenInEntity(Token), UnexpectedTokenInClosingTag(Token), UnexpectedTokenInOpeningTag(Token), InvalidQualifiedName(Box), UnboundAttribute(Box), UnboundElementPrefix(Box), UnexpectedClosingTag(Box), UnexpectedName(Box), /// Found , Token), CannotUndefinePrefix(Box), InvalidCharacterEntity(u32), InvalidDefaultNamespace(Box), InvalidNamePrefix(Box), InvalidNumericEntity(Box), InvalidStandaloneDeclaration(Box), InvalidXmlProcessingInstruction(Box), RedefinedAttribute(Box), UndefinedEntity(Box), UnexpectedEntity(Box), UnexpectedNameInsideXml(Box), UnsupportedEncoding(Box), /// In DTD UnknownMarkupDeclaration(Box), UnexpectedXmlVersion(Box), ConflictingEncoding(Encoding, Encoding), UnexpectedTokenBefore(&'static str, char), /// Document has more stuff than `ParserConfig` allows ExceededConfiguredLimit, } impl fmt::Display for SyntaxError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 
self.to_cow().fmt(f) } } impl SyntaxError { #[inline(never)] #[cold] pub(crate) fn to_cow(&self) -> Cow<'static, str> { match *self { Self::CannotRedefineXmlnsPrefix => "Cannot redefine XMLNS prefix".into(), Self::CannotRedefineXmlPrefix => "Default XMLNS prefix cannot be rebound to another value".into(), Self::EmptyEntity => "Encountered empty entity".into(), Self::EntityTooBig => "Entity too big".into(), Self::NoRootElement => "Unexpected end of stream: no root element found".into(), Self::ProcessingInstructionWithoutName => "Encountered processing instruction without a name".into(), Self::UnbalancedRootElement => "Unexpected end of stream: still inside the root element".into(), Self::UnclosedCdata => "Unclosed "Unexpected end of stream".into(), Self::UnexpectedOpeningTag => "'<' is not allowed in attributes".into(), Self::CannotUndefinePrefix(ref ln) => format!("Cannot undefine prefix '{ln}'").into(), Self::ConflictingEncoding(a, b) => format!("Declared encoding {a}, but uses {b}").into(), Self::InvalidCharacterEntity(num) => format!("Invalid character U+{num:04X}").into(), Self::InvalidDefaultNamespace(ref name) => format!("Namespace '{name}' cannot be default").into(), Self::InvalidNamePrefix(ref prefix) => format!("'{prefix}' cannot be an element name prefix").into(), Self::InvalidNumericEntity(ref v) => format!("Invalid numeric entity: {v}").into(), Self::InvalidQualifiedName(ref e) => format!("Qualified name is invalid: {e}").into(), Self::InvalidStandaloneDeclaration(ref value) => format!("Invalid standalone declaration value: {value}").into(), Self::InvalidXmlProcessingInstruction(ref name) => format!("Invalid processing instruction: format!("Attribute '{name}' is redefined").into(), Self::UnboundAttribute(ref name) => format!("Attribute {name} prefix is unbound").into(), Self::UnboundElementPrefix(ref name) => format!("Element {name} prefix is unbound").into(), Self::UndefinedEntity(ref v) => format!("Undefined entity: {v}").into(), 
Self::UnexpectedClosingTag(ref expected_got) => format!("Unexpected closing tag: {expected_got}").into(), Self::UnexpectedEntity(ref name) => format!("Unexpected entity: {name}").into(), Self::UnexpectedName(ref name) => format!("Unexpected name: {name}").into(), Self::UnexpectedNameInsideXml(ref name) => format!("Unexpected name inside XML declaration: {name}").into(), Self::UnexpectedProcessingInstruction(ref buf, token) => format!("Unexpected token inside processing instruction: format!("Unexpected token inside qualified name: {e}").into(), Self::UnexpectedToken(token) => format!("Unexpected token: {token}").into(), Self::UnexpectedTokenBefore(before, c) => format!("Unexpected token '{before}' before '{c}'").into(), Self::UnexpectedTokenInClosingTag(token) => format!("Unexpected token inside closing tag: {token}").into(), Self::UnexpectedTokenInEntity(token) => format!("Unexpected token inside entity: {token}").into(), Self::UnexpectedTokenInOpeningTag(token) => format!("Unexpected token inside opening tag: {token}").into(), Self::UnexpectedTokenOutsideRoot(token) => format!("Unexpected characters outside the root element: {token}").into(), Self::UnexpectedXmlVersion(ref version) => format!("Invalid XML version: {version}").into(), Self::UnknownMarkupDeclaration(ref v) => format!("Unknown markup declaration: {v}").into(), Self::UnsupportedEncoding(ref v) => format!("Unsupported encoding: {v}").into(), Self::ExceededConfiguredLimit => "This document is larger/more complex than allowed by the parser's configuration".into(), } } } /// An XML parsing error. /// /// Consists of a 2D position in a document and a textual message describing the error. 
#[derive(Clone, PartialEq, Eq, Debug)] pub struct Error { pub(crate) pos: TextPosition, pub(crate) kind: ErrorKind, } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::ErrorKind::{EmitterError, Io, Syntax, UnexpectedEof, Utf8}; write!(f, "{} ", self.pos)?; match &self.kind { Io(io_error) => io_error.fmt(f), Utf8(reason) => reason.fmt(f), Syntax(msg) => f.write_str(msg), UnexpectedEof => f.write_str("Unexpected EOF"), EmitterError(e) => e.fmt(f), } } } impl fmt::Display for ImmutableEntitiesError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(match self { Self::ElementEncountered => "Element encountered", Self::StandaloneDocument => "Standalone XML", }) } } impl fmt::Debug for ImmutableEntitiesError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(self, f) } } impl error::Error for ImmutableEntitiesError { } impl Position for Error { #[inline] fn position(&self) -> TextPosition { self.pos } } impl Error { #[doc(hidden)] #[must_use] pub fn msg(&self) -> String { self.to_string() } /// Failure reason #[must_use] #[inline] pub fn kind(&self) -> &ErrorKind { &self.kind } pub(crate) fn syntax(syntax_msg: Cow<'static, str>, pos: TextPosition) -> Self { Self { kind: ErrorKind::Syntax(syntax_msg), pos } } } impl error::Error for Error { fn source(&self) -> Option<&(dyn error::Error + 'static)> { match &self.kind { ErrorKind::Io(e) => e.source(), ErrorKind::Utf8(e) => Some(e), ErrorKind::EmitterError(e) => Some(e), _ => None, } } } impl<'a, P, M> From<(&'a P, M)> for Error where P: Position, M: Into> { #[cold] fn from(orig: (&'a P, M)) -> Self { Self { pos: orig.0.position(), kind: ErrorKind::Syntax(orig.1.into()), } } } impl From for Error { #[cold] fn from(e: util::CharReadError) -> Self { use crate::util::CharReadError::{Io, UnexpectedEof, Utf8}; Self { pos: TextPosition::new(), kind: match e { UnexpectedEof => ErrorKind::UnexpectedEof, Utf8(reason) => 
ErrorKind::Utf8(reason), Io(io_error) => ErrorKind::Io(io_error), }, } } } impl From for Error { #[cold] fn from(e: io::Error) -> Self { Self { pos: TextPosition::new(), kind: ErrorKind::Io(e), } } } impl From for Error { #[cold] fn from(e: EmitterError) -> Self { Self { pos: TextPosition::new(), kind: ErrorKind::EmitterError(Box::new(e)), } } } impl From for Error { #[cold] fn from(e: ImmutableEntitiesError) -> Self { Self { pos: TextPosition::new(), kind: ErrorKind::Io(io::Error::new(io::ErrorKind::Other, e)), } } } impl From for Error { fn from(kind: ErrorKind) -> Self { Self { kind, pos: TextPosition::new() } } } impl Clone for ErrorKind { #[cold] fn clone(&self) -> Self { use self::ErrorKind::{EmitterError, Io, Syntax, UnexpectedEof, Utf8}; match self { UnexpectedEof => UnexpectedEof, Utf8(reason) => Utf8(*reason), Io(io_error) => Io(io::Error::new(io_error.kind(), io_error.to_string())), Syntax(msg) => Syntax(msg.clone()), EmitterError(e) => EmitterError(e.clone()), } } } impl PartialEq for ErrorKind { #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8}; match (self, other) { (UnexpectedEof, UnexpectedEof) => true, (Utf8(left), Utf8(right)) => left == right, (Io(left), Io(right)) => left.kind() == right.kind() && left.description() == right.description(), (Syntax(left), Syntax(right)) => left == right, (_, _) => false, } } } impl Eq for ErrorKind {} #[test] fn err_size() { assert!(std::mem::size_of::() <= 24); } xml-1.2.0/src/reader/events.rs000064400000000000000000000240001046102023000143230ustar 00000000000000//! Contains `XmlEvent` datatype, instances of which are emitted by the parser. use crate::attribute::OwnedAttribute; use crate::common::XmlVersion; use crate::name::OwnedName; use crate::namespace::Namespace; use std::fmt; /// An element of an XML input stream. /// /// Items of this enum are emitted by `reader::EventReader`. They correspond to different /// elements of an XML document. 
#[derive(PartialEq, Clone)] pub enum XmlEvent { /// Corresponds to XML document declaration. /// /// This event is always emitted before any other event. It is emitted /// even if the actual declaration is not present in the document. StartDocument { /// XML version. /// /// If XML declaration is not present, defaults to `Version10`. version: XmlVersion, /// XML document encoding. /// /// If XML declaration is not present or does not contain `encoding` attribute, /// defaults to `"UTF-8"`. This field is currently used for no other purpose than /// informational. encoding: String, /// XML standalone declaration. /// /// If XML document is not present or does not contain `standalone` attribute, /// defaults to `None`. This field is currently used for no other purpose than /// informational. standalone: Option, }, /// Denotes to the end of the document stream. /// /// This event is always emitted after any other event (except `Error`). After it /// is emitted for the first time, it will always be emitted on next event pull attempts. EndDocument, /// Denotes an XML processing instruction. /// /// This event contains a processing instruction target (`name`) and opaque `data`. It /// is up to the application to process them. ProcessingInstruction { /// Processing instruction target. name: String, /// Processing instruction content. data: Option, }, /// Denotes a beginning of an XML element. /// /// This event is emitted after parsing opening tags or after parsing bodiless tags. In the /// latter case `EndElement` event immediately follows. StartElement { /// Qualified name of the element. name: OwnedName, /// A list of attributes associated with the element. /// /// Currently attributes are not checked for duplicates (TODO) attributes: Vec, /// Contents of the namespace mapping at this point of the document. namespace: Namespace, }, /// Denotes an end of an XML element. /// /// This event is emitted after parsing closing tags or after parsing bodiless tags. 
In the /// latter case it is emitted immediately after corresponding `StartElement` event. EndElement { /// Qualified name of the element. name: OwnedName, }, /// Denotes CDATA content. /// /// This event contains unparsed data. No unescaping will be performed. /// /// It is possible to configure a parser to emit `Characters` event instead of `CData`. See /// `pull::ParserConfiguration` structure for more information. CData(String), /// Denotes a comment. /// /// It is possible to configure a parser to ignore comments, so this event will never be emitted. /// See `pull::ParserConfiguration` structure for more information. Comment(String), /// Denotes character data outside of tags. /// /// Contents of this event will always be unescaped, so no entities like `<` or `&` or `{` /// will appear in it. /// /// It is possible to configure a parser to trim leading and trailing whitespace for this event. /// See `pull::ParserConfiguration` structure for more information. Characters(String), /// Denotes a chunk of whitespace outside of tags. /// /// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`. /// See `pull::ParserConfiguration` structure for more information. When combined with whitespace /// trimming, it will eliminate standalone whitespace from the event stream completely. Whitespace(String), /// The whole DOCTYPE markup Doctype { /// Everything including `<` and `>` syntax: String, }, } /// Supplement to the Doctype event (use the event if you want the full syntax) pub struct DoctypeRef<'tmp> { pub(crate) syntax: &'tmp str, /// Doctype name, following , /// System id of Doctype, if available See https://www.w3.org/TR/xml/#NT-ExternalID pub(crate) system_id: Option<&'tmp str>, } impl DoctypeRef<'_> { /// Doctype name, following &str { self.name } /// Public id of Doctype, if available. 
See https://www.w3.org/TR/xml/#NT-ExternalID pub fn public_id(&self) -> Option<&str> { self.public_id } /// System id of Doctype, if available See https://www.w3.org/TR/xml/#NT-ExternalID pub fn system_id(&self) -> Option<&str> { self.system_id } } impl std::ops::Deref for DoctypeRef<'_> { type Target = str; /// Don't use it. It's for back-compat with v0.8 fn deref(&self) -> &Self::Target { self.syntax } } impl fmt::Debug for XmlEvent { #[cold] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::StartDocument { version, encoding, standalone } => write!(f, "StartDocument({}, {}, {:?})", version, *encoding, standalone), Self::EndDocument => write!(f, "EndDocument"), Self::ProcessingInstruction { name, data } => write!(f, "ProcessingInstruction({}{})", *name, match data { Some(data) => format!(", {data}"), None => String::new() }), Self::StartElement { name, attributes, namespace: Namespace(namespace) } => write!(f, "StartElement({}, {:?}{})", name, namespace, if attributes.is_empty() { String::new() } else { let attributes: Vec = attributes.iter().map( |a| format!("{} -> {}", a.name, a.value) ).collect(); format!(", [{}]", attributes.join(", ")) }), Self::EndElement { name } => write!(f, "EndElement({name})"), Self::Comment(data) => write!(f, "Comment({data})"), Self::CData(data) => write!(f, "CData({data})"), Self::Characters(data) => write!(f, "Characters({data})"), Self::Whitespace(data) => write!(f, "Whitespace({data})"), Self::Doctype { syntax } => write!(f, "Doctype({syntax})"), } } } impl XmlEvent { /// Obtains a writer event from this reader event. /// /// This method is useful for streaming processing of XML documents where the output /// is also an XML document. 
With this method it is possible to process some events /// while passing other events through to the writer unchanged: /// /// ```rust /// use std::str; /// /// use xml::reader::XmlEvent as ReaderEvent; /// use xml::writer::XmlEvent as WriterEvent; /// use xml::{EventReader, EventWriter}; /// /// let mut input: &[u8] = b"world"; /// let mut output: Vec = Vec::new(); /// /// { /// let mut reader = EventReader::new(&mut input); /// let mut writer = EventWriter::new(&mut output); /// /// for e in reader { /// match e.unwrap() { /// ReaderEvent::Characters(s) => { /// writer.write(WriterEvent::characters(&s.to_uppercase())).unwrap() /// }, /// e => { /// if let Some(e) = e.as_writer_event() { /// writer.write(e).unwrap() /// } /// }, /// } /// } /// } /// /// assert_eq!( /// str::from_utf8(&output).unwrap(), /// r#"WORLD"# /// ); /// ``` /// /// Note that this API may change or get additions in future to improve its ergonomics. #[must_use] pub fn as_writer_event(&self) -> Option> { match self { Self::StartDocument { version, encoding, standalone } => Some(crate::writer::events::XmlEvent::StartDocument { version: *version, encoding: Some(encoding), standalone: *standalone }), Self::ProcessingInstruction { name, data } => Some(crate::writer::events::XmlEvent::ProcessingInstruction { name, data: data.as_ref().map(|s| &**s) }), Self::StartElement { name, attributes, namespace } => Some(crate::writer::events::XmlEvent::StartElement { name: name.borrow(), attributes: attributes.iter().map(|a| a.borrow()).collect(), namespace: namespace.borrow(), }), Self::EndElement { name } => Some(crate::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }), Self::Comment(data) => Some(crate::writer::events::XmlEvent::Comment(data)), Self::CData(data) => Some(crate::writer::events::XmlEvent::CData(data)), Self::Characters(data) | Self::Whitespace(data) => Some(crate::writer::events::XmlEvent::Characters(data)), Self::Doctype { syntax, .. 
} => Some(crate::writer::events::XmlEvent::Doctype(syntax)),
            Self::EndDocument => None,
        }
    }
}
xml-1.2.0/src/reader/indexset.rs000064400000000000000000000062331046102023000146520ustar 00000000000000
use crate::attribute::OwnedAttribute;
use crate::name::OwnedName;

use std::collections::hash_map::RandomState;
use std::collections::HashSet;
use std::hash::{BuildHasher, Hasher};

/// An ordered set
///
/// Attributes are kept in insertion order in a vector. Once the vector grows past
/// `HASH_THRESHOLD` entries, hashes of the attribute names are additionally tracked
/// so that `contains()` can usually answer "absent" without scanning the vector.
pub struct AttributesSet {
    vec: Vec<OwnedAttribute>,
    /// Uses a no-op hasher, because these u64s are hashes already
    may_contain: HashSet<u64, U64HasherBuilder>,
    /// This is real hasher for the `OwnedName`
    hasher: RandomState,
}

/// Use linear search and don't allocate `HashSet` if there are few attributes,
/// because allocation costs more than a few comparisons.
const HASH_THRESHOLD: usize = 8;

impl AttributesSet {
    /// Creates an empty set.
    pub fn new() -> Self {
        Self {
            vec: Vec::new(),
            hasher: RandomState::new(),
            may_contain: HashSet::default(),
        }
    }

    /// Hashes a name with this set's own `RandomState`.
    fn hash(&self, val: &OwnedName) -> u64 {
        self.hasher.hash_one(val)
    }

    /// Number of attributes pushed so far.
    pub fn len(&self) -> usize {
        self.vec.len()
    }

    /// Returns `true` if an attribute with this `name` has already been pushed.
    pub fn contains(&self, name: &OwnedName) -> bool {
        // `may_contain` is first filled by `push()` when the vector already holds
        // `HASH_THRESHOLD` entries, so it is only complete once `len() > HASH_THRESHOLD`.
        // Up to and including the threshold every lookup has to scan the vector; otherwise
        // a set with exactly `HASH_THRESHOLD` entries would report every name as absent.
        // Beyond the threshold the scan only runs on a duplicate or a hash collision.
        (self.vec.len() <= HASH_THRESHOLD || self.may_contain.contains(&self.hash(name))) &&
            self.vec.iter().any(move |a| &a.name == name)
    }

    /// Appends an attribute. Does not check for duplicates; call `contains()` first.
    pub fn push(&mut self, attr: OwnedAttribute) {
        if self.vec.len() >= HASH_THRESHOLD {
            if self.vec.len() == HASH_THRESHOLD {
                // Crossing the threshold: back-fill hashes of everything stored so far
                self.may_contain.reserve(HASH_THRESHOLD * 2);
                for attr in &self.vec {
                    self.may_contain.insert(self.hash(&attr.name));
                }
            }
            self.may_contain.insert(self.hash(&attr.name));
        }
        self.vec.push(attr);
    }

    /// Consumes the set and returns the attributes in insertion order.
    pub fn into_vec(self) -> Vec<OwnedAttribute> {
        self.vec
    }
}

#[test]
fn indexset_lookup_at_threshold() {
    let name_of = |i: usize| OwnedName {
        local_name: format!("attr{i}"),
        namespace: None,
        prefix: None,
    };
    let mut s = AttributesSet::new();
    for i in 0..HASH_THRESHOLD {
        s.push(OwnedAttribute { name: name_of(i), value: String::new() });
    }
    // exactly HASH_THRESHOLD entries: the hash set has not been populated yet
    for i in 0..HASH_THRESHOLD {
        assert!(s.contains(&name_of(i)));
    }
    assert!(!s.contains(&name_of(HASH_THRESHOLD)));
}

#[test]
fn indexset() {
    let mut s = AttributesSet::new();
    let not_here = OwnedName {
        local_name: "attr1000".into(),
        namespace: Some("test".into()),
        prefix: None,
    };

    // this test will take a lot of time if the `contains()` is linear, and the loop is quadratic
    for i in 0..50000 {
        let name = OwnedName {
            local_name: format!("attr{i}"),
            namespace: None,
            prefix: None,
        };
assert!(!s.contains(&name)); s.push(OwnedAttribute { name, value: String::new() }); assert!(!s.contains(¬_here)); } assert!(s.contains(&OwnedName { local_name: "attr1234".into(), namespace: None, prefix: None, })); assert!(s.contains(&OwnedName { local_name: "attr0".into(), namespace: None, prefix: None, })); assert!(s.contains(&OwnedName { local_name: "attr49999".into(), namespace: None, prefix: None, })); } /// Hashser that does nothing except passing u64 through struct U64Hasher(u64); impl Hasher for U64Hasher { fn finish(&self) -> u64 { self.0 } fn write(&mut self, slice: &[u8]) { for &v in slice { self.0 ^= u64::from(v) } // unused in practice } fn write_u64(&mut self, i: u64) { self.0 ^= i; } } #[derive(Default)] struct U64HasherBuilder; impl BuildHasher for U64HasherBuilder { type Hasher = U64Hasher; fn build_hasher(&self) -> U64Hasher { U64Hasher(0) } } xml-1.2.0/src/reader/lexer.rs000064400000000000000000001211041046102023000141410ustar 00000000000000//! Contains simple lexer for XML documents. //! //! This module is for internal use. Use `xml::pull` module to do parsing. use crate::common::{is_name_char, is_whitespace_char, is_xml10_char, is_xml11_char, Position, TextPosition}; use crate::reader::error::SyntaxError; use crate::reader::Error; use crate::util::{CharReader, Encoding}; use std::collections::VecDeque; use std::io::Read; use std::{fmt, result}; use super::ParserConfig; /// `Token` represents a single lexeme of an XML document. These lexemes /// are used to perform actual parsing. #[derive(Copy, Clone, PartialEq, Eq, Debug)] pub(crate) enum Token { /// `` ProcessingInstructionEnd, /// `` TagEnd, /// `/>` EmptyTagEnd, /// `` CommentEnd, /// Any non-special character except whitespace. 
Character(char), /// `=` EqualsSign, /// `'` SingleQuote, /// `"` DoubleQuote, /// `` CDataEnd, /// `&` ReferenceStart, /// `;` ReferenceEnd, /// `) -> fmt::Result { match *self { Token::Character(c) => c.fmt(f), other => match other { Token::OpeningTagStart => "<", Token::ProcessingInstructionStart => " " " "", Token::CDataEnd => "]]>", Token::ReferenceStart => "&", Token::ReferenceEnd => ";", Token::EqualsSign => "=", Token::SingleQuote => "'", Token::DoubleQuote => "\"", Token::MarkupDeclarationStart => " { debug_assert!(false); "" }, }.fmt(f), } } } impl Token { pub const fn as_static_str(self) -> Option<&'static str> { match self { Self::OpeningTagStart => Some("<"), Self::ProcessingInstructionStart => Some(" Some(" Some(" Some(""), Self::CDataEnd => Some("]]>"), Self::ReferenceStart => Some("&"), Self::ReferenceEnd => Some(";"), Self::EqualsSign => Some("="), Self::SingleQuote => Some("'"), Self::DoubleQuote => Some("\""), _ => None } } // using String.push_str(token.to_string()) is simply way too slow pub fn push_to_string(self, target: &mut String) { match self { Self::Character(c) => { debug_assert!(is_xml10_char(c) || is_xml11_char(c)); target.push(c); }, _ => if let Some(s) = self.as_static_str() { target.push_str(s); } } } } #[derive(Copy, Clone)] enum State { /// Default state Normal, /// Triggered on '<' TagStarted, /// Triggered on '` InsideMarkupDeclarationQuotedString(QuoteStyle), } #[derive(Copy, Clone, Eq, PartialEq)] enum QuoteStyle { Single, Double } #[derive(Copy, Clone)] enum ClosingSubstate { First, Second } #[derive(Copy, Clone)] #[allow(clippy::upper_case_acronyms)] enum DoctypeStartedSubstate { D, DO, DOC, DOCT, DOCTY, DOCTYP } #[derive(Copy, Clone)] #[allow(clippy::upper_case_acronyms)] enum CDataStartedSubstate { E, C, CD, CDA, CDAT, CDATA } /// `Result` represents lexing result. It is either a token or an error message. 
pub(crate) type Result, E = Error> = result::Result; /// Helps to set up a dispatch table for lexing large unambigous tokens like /// ` ( match $s { $( $st => match $c { $stc => Ok($_self.move_to($is($next_st))), _ => $_self.handle_error($chunk, $c) }, )+ $end_st => match $c { $end_c => $e, _ => $_self.handle_error($end_chunk, $c) } } ) ); /// `Lexer` is a lexer for XML documents, which implements pull API. /// /// Main method is `next_token` which accepts an `std::io::Read` instance and /// tries to read the next lexeme from it. /// /// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s. /// When it is not set, errors will be reported as `Err` objects with a string message. /// By default this flag is not set. Use `enable_errors` and `disable_errors` methods /// to toggle the behavior. pub(crate) struct Lexer { st: State, reader: CharReader, pos: TextPosition, head_pos: TextPosition, char_queue: VecDeque, /// Default state to go back to after a tag end (may be `InsideDoctype`) normal_state: State, inside_token: bool, eof_handled: bool, reparse_depth: u8, #[cfg(test)] skip_errors: bool, max_entity_expansion_depth: u8, max_entity_expansion_length: usize, } impl Position for Lexer { #[inline] /// Returns the position of the last token produced by the lexer fn position(&self) -> TextPosition { self.pos } } impl Lexer { /// Returns a new lexer with default state. 
///
    /// The entity expansion limits (`max_entity_expansion_depth` and
    /// `max_entity_expansion_length`) are copied from `config`.
    pub(crate) fn new(config: &ParserConfig) -> Self {
        Self {
            reader: CharReader::new(),
            pos: TextPosition::new(),
            head_pos: TextPosition::new(),
            char_queue: VecDeque::with_capacity(4), // TODO: check size
            st: State::Normal,
            normal_state: State::Normal,
            inside_token: false,
            eof_handled: false,
            reparse_depth: 0,
            #[cfg(test)]
            skip_errors: false,

            max_entity_expansion_depth: config.max_entity_expansion_depth,
            max_entity_expansion_length: config.max_entity_expansion_length,
        }
    }

    /// Encoding currently used by the underlying character reader.
    pub(crate) fn encoding(&self) -> Encoding {
        self.reader.encoding
    }

    /// Switches the encoding used by the underlying character reader.
    pub(crate) fn set_encoding(&mut self, encoding: Encoding) {
        self.reader.encoding = encoding;
    }

    /// Disables error handling, so that upon an invalid lexeme `next_token` yields
    /// the characters of that lexeme as `Token::Character`s instead of returning an error.
    #[cfg(test)]
    fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Reset the eof handled flag of the lexer.
    #[inline]
    pub fn reset_eof_handled(&mut self) {
        self.eof_handled = false;
    }

    /// Tries to read the next token from the buffer.
    ///
    /// It is possible to pass different instances of `BufReader` each time
    /// this method is called, but the resulting behavior is undefined in this case.
    ///
    /// Characters queued inside the lexer (unread or re-parsed ones) are consumed
    /// before anything is pulled from `b`.
    ///
    /// Return value:
    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
    /// * `Ok(Token::Eof)` - when the end of stream is reached;
    /// * `Ok(token) where token: Token` - in case a complete token has been read from the stream.
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result<Token> {
        // Already reached end of buffer
        if self.eof_handled {
            return Ok(Token::Eof);
        }

        // Remember where the token being assembled starts
        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Check if we have saved a char or two for ourselves
        while let Some(c) = self.char_queue.pop_front() {
            if let Some(t) = self.dispatch_char(c)? {
                self.inside_token = false;
                return Ok(t);
            }
        }
        // if char_queue is empty, all circular reparsing is done
        self.reparse_depth = 0;
        while let Some(c) = self.reader.next_char_from(b)?
{ if c == '\n' { self.head_pos.new_line(); } else { self.head_pos.advance(1); } if let Some(t) = self.dispatch_char(c)? { self.inside_token = false; return Ok(t); } } self.end_of_stream() } #[inline(never)] fn end_of_stream(&mut self) -> Result { // Handle end of stream self.eof_handled = true; self.pos = self.head_pos; match self.st { State::InsideCdata | State::CDataClosing(_) => Err(self.error(SyntaxError::UnclosedCdata)), State::TagStarted | State::CommentOrCDataOrDoctypeStarted | State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) | State::CommentClosing(ClosingSubstate::Second) | State::InsideComment | State::InsideMarkupDeclaration | State::InsideProcessingInstruction | State::ProcessingInstructionClosing | State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) => Err(self.error(SyntaxError::UnexpectedEof)), State::EmptyTagClosing => Ok(Token::Character('/')), State::CommentClosing(ClosingSubstate::First) => Ok(Token::Character('-')), State::InvalidCDataClosing(ClosingSubstate::First) => Ok(Token::Character(']')), State::InvalidCDataClosing(ClosingSubstate::Second) => { self.eof_handled = false; Ok(self.move_to_with_unread(State::Normal, &[']'], Token::Character(']'))) }, State::Normal => Ok(Token::Eof), } } #[cold] #[allow(clippy::needless_pass_by_value)] fn error(&self, e: SyntaxError) -> Error { Error::syntax(e.to_cow(), self.position()) } #[inline(never)] fn dispatch_char(&mut self, c: char) -> Result { match self.st { State::Normal => Ok(self.normal(c)), State::TagStarted => self.tag_opened(c), State::EmptyTagClosing => Ok(Some(self.empty_element_closing(c))), State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c), State::InsideCdata => Ok(self.inside_cdata(c)), State::CDataStarted(s) => self.cdata_started(c, s), State::InsideComment => Ok(self.inside_comment_state(c)), State::CommentStarted => self.comment_started(c), State::InsideProcessingInstruction => 
Ok(self.inside_processing_instruction(c)),
            State::ProcessingInstructionClosing => Ok(Some(self.processing_instruction_closing(c))),
            State::CommentClosing(s) => self.comment_closing(c, s),
            State::CDataClosing(s) => Ok(self.cdata_closing(c, s)),
            State::InsideDoctype => Ok(self.inside_doctype(c)),
            State::DoctypeStarted(s) => self.doctype_started(c, s),
            State::InvalidCDataClosing(s) => Ok(self.invalid_cdata_closing(c, s)),
            State::InsideMarkupDeclaration => self.markup_declaration(c),
            State::InsideMarkupDeclarationQuotedString(q) => Ok(Some(self.markup_declaration_string(c, q))),
        }
    }

    /// Switches to state `st` without producing a token.
    #[inline]
    fn move_to(&mut self, st: State) -> Option<Token> {
        self.st = st;
        None
    }

    /// Switches to state `st` and hands `token` back to the caller.
    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Token {
        self.st = st;
        token
    }

    /// Like `move_to_with`, but also makes `st` the state that later code
    /// returns to through `normal_state`.
    #[inline]
    fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Token {
        self.normal_state = st;
        self.st = st;
        token
    }

    /// Switches to state `st`, returns `token`, and puts the characters `cs` back
    /// at the front of the queue (in their original order) to be lexed again.
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Token {
        for c in cs.iter().rev().copied() {
            self.char_queue.push_front(c);
        }
        self.move_to_with(st, token)
    }

    /// Queues `markup` at the front of the pending characters, so that it is lexed
    /// before anything else that is pending or still unread.
    ///
    /// Empty `markup` is a no-op. Otherwise the nesting counter is incremented (it is reset
    /// by `next_token` once the queue has been drained), and `SyntaxError::EntityTooBig` is
    /// returned if the counter exceeds `max_entity_expansion_depth`, or if the queue already
    /// holds more than `max_entity_expansion_length` characters.
    pub(crate) fn reparse(&mut self, markup: &str) -> Result<()> {
        if markup.is_empty() {
            return Ok(());
        }

        // Saturate instead of `+= 1`: with `max_entity_expansion_depth == u8::MAX` the limit
        // check below can never fire, and a plain increment would overflow the counter
        // (panic in debug builds, silent wrap-around to 0 in release builds).
        self.reparse_depth = self.reparse_depth.saturating_add(1);
        if self.reparse_depth > self.max_entity_expansion_depth || self.char_queue.len() > self.max_entity_expansion_length {
            return Err(self.error(SyntaxError::EntityTooBig));
        }

        // New input is available again even if the end of stream was already reported
        self.eof_handled = false;
        self.char_queue.reserve(markup.len());
        for c in markup.chars().rev() {
            self.char_queue.push_front(c);
        }

        Ok(())
    }

    /// Reports that `c` was found right after the partially matched lexeme `chunk`.
    ///
    /// Normally this is an `UnexpectedTokenBefore` syntax error. In tests with
    /// `skip_errors` set, the first character of `chunk` is returned as a
    /// `Token::Character` instead, and the rest of `chunk` plus `c` are queued for lexing.
    #[allow(clippy::needless_pass_by_ref_mut)]
    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        debug_assert!(!chunk.is_empty());

        #[cfg(test)]
        if self.skip_errors {
            let mut chars = chunk.chars();
            let first = chars.next().unwrap_or('\0');
            self.char_queue.extend(chars);
            self.char_queue.push_back(c);
            return Ok(Some(self.move_to_with(State::Normal, Token::Character(first))));
        }
        Err(self.error(SyntaxError::UnexpectedTokenBefore(chunk, c)))
    }

    /// Encountered a char
    fn
normal(&mut self, c: char) -> Option { match c { '<' => self.move_to(State::TagStarted), '>' => Some(Token::TagEnd), '/' => self.move_to(State::EmptyTagClosing), '=' => Some(Token::EqualsSign), '"' => Some(Token::DoubleQuote), '\'' => Some(Token::SingleQuote), ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::First)), '&' => Some(Token::ReferenceStart), ';' => Some(Token::ReferenceEnd), _ => Some(Token::Character(c)) } } fn inside_cdata(&mut self, c: char) -> Option { match c { ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)), _ => Some(Token::Character(c)), } } fn inside_processing_instruction(&mut self, c: char) -> Option { // These tokens are used by `` parser match c { '?' => self.move_to(State::ProcessingInstructionClosing), '<' => Some(Token::OpeningTagStart), '>' => Some(Token::TagEnd), '=' => Some(Token::EqualsSign), '"' => Some(Token::DoubleQuote), '\'' => Some(Token::SingleQuote), '&' => Some(Token::ReferenceStart), ';' => Some(Token::ReferenceEnd), _ => Some(Token::Character(c)) } } fn inside_comment_state(&mut self, c: char) -> Option { match c { '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), _ => Some(Token::Character(c)), } } /// Encountered '<' fn tag_opened(&mut self, c: char) -> Result { match c { '?' => Ok(Some(self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart))), '/' => Ok(Some(self.move_to_with(self.normal_state, Token::ClosingTagStart))), '!' 
=> Ok(self.move_to(State::CommentOrCDataOrDoctypeStarted)), _ if is_whitespace_char(c) => Ok(Some(self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart))), _ if is_name_char(c) => Ok(Some(self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart))), _ => self.handle_error("<", c) } } /// Encountered ' Result { match c { '-' => Ok(self.move_to(State::CommentStarted)), '[' => Ok(self.move_to(State::CDataStarted(CDataStartedSubstate::E))), 'D' => Ok(self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D))), 'E' | 'A' | 'N' if matches!(self.normal_state, State::InsideDoctype) => { Ok(Some(self.move_to_with_unread(State::InsideMarkupDeclaration, &[c], Token::MarkupDeclarationStart))) }, _ => self.handle_error(" Result { match c { '-' => Ok(Some(self.move_to_with(State::InsideComment, Token::CommentStart))), _ => self.handle_error(" Result { use self::CDataStartedSubstate::{C, CD, CDA, CDAT, CDATA, E}; dispatch_on_enum_state!(self, s, c, State::CDataStarted, E ; 'C' ; C ; " Result { match c { '<' => self.handle_error("' => Ok(Some(self.move_to_with(self.normal_state, Token::TagEnd))), '&' => Ok(Some(Token::ReferenceStart)), ';' => Ok(Some(Token::ReferenceEnd)), '"' => Ok(Some(self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote))), '\'' => Ok(Some(self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote))), _ => Ok(Some(Token::Character(c))), } } fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Token { match c { '"' if q == QuoteStyle::Double => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote), '\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote), _ => Token::Character(c), } } /// Encountered ' Result { use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP}; dispatch_on_enum_state!(self, s, c, State::DoctypeStarted, D ; 'O' ; DO ; " Option 
{ match c { '>' => Some(self.move_to_and_reset_normal(State::Normal, Token::TagEnd)), '<' => self.move_to(State::TagStarted), '&' => Some(Token::ReferenceStart), ';' => Some(Token::ReferenceEnd), '"' => Some(Token::DoubleQuote), '\'' => Some(Token::SingleQuote), _ => Some(Token::Character(c)), } } /// Encountered '?' fn processing_instruction_closing(&mut self, c: char) -> Token { match c { '>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd), _ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')), } } /// Encountered '/' fn empty_element_closing(&mut self, c: char) -> Token { match c { '>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd), _ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')), } } /// Encountered '-' fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result { match s { ClosingSubstate::First => match c { '-' => Ok(self.move_to(State::CommentClosing(ClosingSubstate::Second))), _ => Ok(Some(self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')))), }, ClosingSubstate::Second => match c { '>' => Ok(Some(self.move_to_with(self.normal_state, Token::CommentEnd))), // double dash not followed by a greater-than is a hard error inside comment _ => self.handle_error("--", c), }, } } /// Encountered ']' fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Option { match s { ClosingSubstate::First => match c { ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)), _ => Some(self.move_to_with_unread(State::InsideCdata, &[c], Token::Character(']'))), }, ClosingSubstate::Second => match c { '>' => Some(self.move_to_with(State::Normal, Token::CDataEnd)), _ => Some(self.move_to_with_unread(State::InsideCdata, &[']', c], Token::Character(']'))), }, } } /// Encountered ']' fn invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Option { match s { ClosingSubstate::First => match c { ']' => 
self.move_to(State::InvalidCDataClosing(ClosingSubstate::Second)), _ => Some(self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))), }, ClosingSubstate::Second => match c { '>' => Some(self.move_to_with(self.normal_state, Token::CDataEnd)), _ => Some(self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))), }, } } } #[cfg(test)] mod tests { use crate::common::Position; use crate::reader::ParserConfig; use std::io::{BufReader, Cursor}; use super::{Lexer, Token}; macro_rules! assert_oks( (for $lex:ident and $buf:ident ; $($e:expr)+) => ({ $( assert_eq!(Ok($e), $lex.next_token(&mut $buf)); )+ }) ); macro_rules! assert_err( (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({ let err = $lex.next_token(&mut $buf); assert!(err.is_err()); let err = err.unwrap_err(); assert_eq!($r as u64, err.position().row); assert_eq!($c as u64, err.position().column); }) ); macro_rules! assert_none( (for $lex:ident and $buf:ident) => ( assert_eq!(Ok(Token::Eof), $lex.next_token(&mut $buf)) ) ); fn make_lex_and_buf(s: &str) -> (Lexer, BufReader>>) { (Lexer::new(&ParserConfig::default()), BufReader::new(Cursor::new(s.to_owned().into_bytes()))) } #[test] fn tricky_pi() { let (mut lex, mut buf) = make_lex_and_buf(r""); assert_oks!(for lex and buf ; Token::ProcessingInstructionStart Token::Character('x') Token::OpeningTagStart // processing of relies on the extra tokens Token::Character('!') Token::Character('-') Token::Character('-') Token::Character(' ') Token::ReferenceStart Token::Character('?') Token::ProcessingInstructionEnd Token::OpeningTagStart Token::Character('x') Token::TagEnd ); assert_none!(for lex and buf); } #[test] fn reparser() { let (mut lex, mut buf) = make_lex_and_buf(r"&a;"); assert_oks!(for lex and buf ; Token::ReferenceStart Token::Character('a') Token::ReferenceEnd ); lex.reparse("").unwrap(); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('h') Token::Character('i') Token::EmptyTagEnd ); 
assert_none!(for lex and buf); } #[test] fn simple_lexer_test() { let (mut lex, mut buf) = make_lex_and_buf( r#" xd

 "# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::Character(' ') Token::Character('p') Token::EqualsSign Token::SingleQuote Token::Character('q') Token::SingleQuote Token::TagEnd Token::Character(' ') Token::Character('x') Token::OpeningTagStart Token::Character('b') Token::Character(' ') Token::Character('z') Token::EqualsSign Token::DoubleQuote Token::Character('y') Token::DoubleQuote Token::TagEnd Token::Character('d') Token::Character('\t') Token::ClosingTagStart Token::Character('b') Token::TagEnd Token::ClosingTagStart Token::Character('a') Token::TagEnd Token::OpeningTagStart Token::Character('p') Token::EmptyTagEnd Token::Character(' ') Token::ProcessingInstructionStart Token::Character('n') Token::Character('m') Token::Character(' ') Token::ProcessingInstructionEnd Token::Character(' ') Token::CommentStart Token::Character(' ') Token::Character('a') Token::Character(' ') Token::Character('c') Token::Character(' ') Token::CommentEnd Token::Character(' ') Token::ReferenceStart Token::Character('n') Token::Character('b') Token::Character('s') Token::Character('p') Token::ReferenceEnd ); assert_none!(for lex and buf); } #[test] fn special_chars_test() { let (mut lex, mut buf) = make_lex_and_buf( r"?x!+ // -| ]z]]" ); assert_oks!(for lex and buf ; Token::Character('?') Token::Character('x') Token::Character('!') Token::Character('+') Token::Character(' ') Token::Character('/') Token::Character('/') Token::Character(' ') Token::Character('-') Token::Character('|') Token::Character(' ') Token::Character(']') Token::Character('z') Token::Character(']') Token::Character(']') ); assert_none!(for lex and buf); } #[test] fn cdata_test() { let (mut lex, mut buf) = make_lex_and_buf( r" " ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::CDataStart Token::Character('x') Token::Character(' ') Token::Character('y') Token::Character(' ') Token::Character('?') Token::CDataEnd 
Token::Character(' ') Token::ClosingTagStart Token::Character('a') Token::TagEnd ); assert_none!(for lex and buf); } #[test] fn cdata_closers_test() { let (mut lex, mut buf) = make_lex_and_buf( r" ]> ]]>]]" ); assert_oks!(for lex and buf ; Token::CDataStart Token::Character(']') Token::Character(' ') Token::Character('>') Token::Character(' ') Token::Character(']') Token::Character('>') Token::Character(' ') Token::CDataEnd Token::CommentStart Token::CommentEnd Token::Character(']') Token::Character(']') Token::OpeningTagStart Token::Character('a') Token::TagEnd ); assert_none!(for lex and buf); } #[test] fn doctype_test() { let (mut lex, mut buf) = make_lex_and_buf( r" " ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::DoctypeStart Token::Character(' ') Token::Character('a') Token::Character('b') Token::Character(' ') Token::Character('x') Token::Character('x') Token::Character(' ') Token::Character('z') Token::TagEnd Token::Character(' ') ); assert_none!(for lex and buf); } #[test] fn tricky_comments() { let (mut lex, mut buf) = make_lex_and_buf( r"" ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::CommentStart Token::Character(' ') Token::Character('C') Token::Character(' ') Token::Character('-') Token::Character('>') Token::CommentEnd Token::ClosingTagStart Token::Character('a') Token::TagEnd ); assert_none!(for lex and buf); } #[test] fn doctype_with_internal_subset_test() { let (mut lex, mut buf) = make_lex_and_buf( r#">>"> ]> "# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::DoctypeStart Token::Character(' ') Token::Character('a') Token::Character('b') Token::Character('[') Token::MarkupDeclarationStart Token::Character('E') Token::Character('L') Token::Character('E') Token::Character('M') Token::Character('E') Token::Character('N') Token::Character('T') Token::Character(' ') Token::Character('b') 
Token::Character('a') Token::Character(' ') Token::DoubleQuote Token::Character('>') Token::Character('>') Token::Character('>') Token::DoubleQuote Token::TagEnd Token::Character(' ') Token::Character(']') Token::TagEnd Token::Character(' ') ); assert_none!(for lex and buf); } #[test] fn doctype_internal_pi_comment() { let (mut lex, mut buf) = make_lex_and_buf( " ?> \n]>" ); assert_oks!(for lex and buf ; Token::DoctypeStart Token::Character(' ') Token::Character('a') Token::Character(' ') Token::Character('[') Token::Character('\n') Token::MarkupDeclarationStart Token::Character('E') Token::Character('L') Token::Character('E') Token::Character('M') Token::Character('E') Token::Character('N') Token::Character('T') Token::Character(' ') Token::Character('l') Token::Character(' ') Token::Character('A') Token::Character('N') Token::Character('Y') Token::TagEnd Token::Character(' ') Token::CommentStart Token::Character(' ') Token::Character('<') Token::Character('?') Token::Character('n') Token::Character('o') Token::Character('n') Token::Character('?') Token::Character('>') Token::CommentEnd Token::Character(' ') Token::ProcessingInstructionStart Token::Character('p') Token::Character('i') Token::Character(' ') Token::TagEnd // not really Token::Character(' ') Token::ProcessingInstructionEnd Token::Character(' ') Token::Character('\n') Token::Character(']') Token::TagEnd // DTD ); assert_none!(for lex and buf); } #[test] fn end_of_stream_handling_ok() { macro_rules! eof_check( ($data:expr ; $token:expr) => ({ let (mut lex, mut buf) = make_lex_and_buf($data); assert_oks!(for lex and buf ; $token); assert_none!(for lex and buf); }) ); eof_check!("?" ; Token::Character('?')); eof_check!("/" ; Token::Character('/')); eof_check!("-" ; Token::Character('-')); eof_check!("]" ; Token::Character(']')); eof_check!("]" ; Token::Character(']')); eof_check!("]" ; Token::Character(']')); } #[test] fn end_of_stream_handling_error() { macro_rules! 
eof_check( ($data:expr; $r:expr, $c:expr) => ({ let (mut lex, mut buf) = make_lex_and_buf($data); assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream"); assert_none!(for lex and buf); }) ); eof_check!("<" ; 0, 1); eof_check!(" ({ let (mut lex, mut buf) = make_lex_and_buf($data); assert_err!(for lex and buf expect row $r ; $c, $s); let (mut lex, mut buf) = make_lex_and_buf($data); lex.disable_errors(); for c in $chunk.chars() { assert_eq!(Ok(Token::Character(c)), lex.next_token(&mut buf)); } assert_oks!(for lex and buf ; Token::Character($app) ); assert_none!(for lex and buf); }) ); #[test] fn token_size() { assert_eq!(4, std::mem::size_of::()); assert_eq!(2, std::mem::size_of::()); } #[test] fn error_in_cdata_started() { check_case!("" ); assert_oks!(for lex and buf ; Token::CDataStart Token::Character('F') Token::Character('o') Token::Character('o') Token::Character(' ') Token::Character('[') Token::Character('B') Token::Character('a') Token::Character('r') Token::Character(']') Token::CDataEnd ); assert_none!(for lex and buf); } } xml-1.2.0/src/reader/parser/inside_cdata.rs000064400000000000000000000024641046102023000167340ustar 00000000000000use crate::common::is_whitespace_char; use crate::reader::error::SyntaxError; use crate::reader::events::XmlEvent; use crate::reader::lexer::Token; use super::{PullParser, Result, State}; impl PullParser { pub fn inside_cdata(&mut self, t: Token) -> Option { match t { Token::CDataEnd => { let event = if self.config.cdata_to_characters { // start called push_pos, but there will be no event to pop it if self.buf.is_empty() { self.next_pos(); } None } else { let data = self.take_buf(); Some(Ok(XmlEvent::CData(data))) }; self.into_state(State::OutsideTag, event) }, Token::Character(c) if !self.is_valid_xml_char(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) }, Token::Character(c) => { if !is_whitespace_char(c) { self.inside_whitespace = false; } self.buf.push(c); None }, _ => { 
debug_assert!(false, "unreachable"); None }, } } } xml-1.2.0/src/reader/parser/inside_closing_tag_name.rs000064400000000000000000000031361046102023000211460ustar 00000000000000use super::{ClosingTagSubstate, PullParser, QualifiedNameTarget, Result, State}; use crate::common::is_whitespace_char; use crate::namespace; use crate::reader::error::SyntaxError; use crate::reader::lexer::Token; impl PullParser { pub fn inside_closing_tag_name(&mut self, t: Token, s: ClosingTagSubstate) -> Option { match s { ClosingTagSubstate::CTInsideName => self.read_qualified_name(t, QualifiedNameTarget::ClosingTag, |this, token, name| { match name.prefix_ref() { Some(prefix) if prefix == namespace::NS_XML_PREFIX || prefix == namespace::NS_XMLNS_PREFIX => Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))), _ => { this.data.element_name = Some(name.clone()); match token { Token::TagEnd => this.emit_end_element(), Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)), _ => Some(this.error(SyntaxError::UnexpectedTokenInClosingTag(token))), } } } }), ClosingTagSubstate::CTAfterName => match t { Token::TagEnd => self.emit_end_element(), Token::Character(c) if is_whitespace_char(c) => None, // Skip whitespace _ => Some(self.error(SyntaxError::UnexpectedTokenInClosingTag(t))), }, } } } xml-1.2.0/src/reader/parser/inside_comment.rs000064400000000000000000000021531046102023000173150ustar 00000000000000use crate::reader::error::SyntaxError; use crate::reader::events::XmlEvent; use crate::reader::lexer::Token; use super::{PullParser, Result, State}; impl PullParser { pub fn inside_comment(&mut self, t: Token) -> Option { match t { Token::CommentEnd if self.config.ignore_comments => { self.into_state_continue(State::OutsideTag) }, Token::CommentEnd => { let data = self.take_buf(); self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data))) }, Token::Character(c) if !self.is_valid_xml_char(c) => { 
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) }, _ if self.config.ignore_comments => None, // Do not modify buffer if ignoring the comment _ => { if self.buf.len() > self.config.max_data_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } t.push_to_string(&mut self.buf); None }, } } } xml-1.2.0/src/reader/parser/inside_declaration.rs000064400000000000000000000240671046102023000201500ustar 00000000000000use crate::common::{is_whitespace_char, XmlVersion}; use crate::reader::error::SyntaxError; use crate::reader::events::XmlEvent; use crate::reader::lexer::Token; use crate::util::Encoding; use super::{ DeclarationSubstate, Encountered, PullParser, QualifiedNameTarget, Result, State, DEFAULT_VERSION, }; impl PullParser { #[inline(never)] fn emit_start_document(&mut self) -> Option { debug_assert!(self.encountered == Encountered::None); self.encountered = Encountered::Declaration; let version = self.data.version; let encoding = self.data.take_encoding(); let standalone = self.data.standalone; if let Some(new_encoding) = encoding.as_deref() { let new_encoding = match new_encoding.parse() { Ok(e) => e, Err(_) if self.config.ignore_invalid_encoding_declarations => Encoding::Latin1, Err(_) => return Some(self.error(SyntaxError::UnsupportedEncoding(new_encoding.into()))), }; let current_encoding = self.lexer.encoding(); if current_encoding != new_encoding { let set = match (current_encoding, new_encoding) { (Encoding::Unknown | Encoding::Default, new) if new != Encoding::Utf16 => new, (Encoding::Utf16Be | Encoding::Utf16Le, Encoding::Utf16) => current_encoding, _ if self.config.ignore_invalid_encoding_declarations => current_encoding, _ => return Some(self.error(SyntaxError::ConflictingEncoding(new_encoding, current_encoding))), }; self.lexer.set_encoding(set); } } let current_encoding = self.lexer.encoding(); self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument { version: version.unwrap_or(DEFAULT_VERSION), encoding: 
encoding.unwrap_or_else(move || current_encoding.to_string()), standalone })) } // TODO: remove redundancy via macros or extra methods pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option { match s { DeclarationSubstate::BeforeVersion => match t { Token::Character('v') => { self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)) }, Token::Character(c) if is_whitespace_char(c) => None, // continue _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::Attribute, |this, token, name| { match &*name.local_name { "ersion" if name.namespace.is_none() => this.into_state_continue(State::InsideDeclaration( if token == Token::EqualsSign { DeclarationSubstate::InsideVersionValue } else { DeclarationSubstate::AfterVersion } )), _ => Some(this.error(SyntaxError::UnexpectedNameInsideXml(name.to_string().into()))), } }), DeclarationSubstate::AfterVersion => match t { Token::EqualsSign => { self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)) }, Token::Character(c) if is_whitespace_char(c) => None, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| { this.data.version = match &*value { "1.0" => Some(XmlVersion::Version10), "1.1" => Some(XmlVersion::Version11), // https://www.w3.org/TR/REC-xml/#sec-prolog-dtd // VersionNum ::= '1.' [0-9]+ // "[...] This means that an XML 1.0 processor will accept 1.x // documents provided they do not use any non-1.0 features." 
v if v.starts_with("1.") && v.len() >= 3 && v.chars().skip(2).all(|c| c.is_ascii_digit()) => { // XML 1.1 forbids any other versions Some(XmlVersion::Version10) } _ => None, }; if this.data.version.is_some() { this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue)) } else { Some(this.error(SyntaxError::UnexpectedXmlVersion(value.into()))) } }), DeclarationSubstate::AfterVersionValue => match t { Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeEncoding)), Token::ProcessingInstructionEnd => self.emit_start_document(), _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::BeforeEncoding => match t { Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)), Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), Token::ProcessingInstructionEnd => self.emit_start_document(), Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::Attribute, |this, token, name| { match &*name.local_name { "ncoding" if name.namespace.is_none() => this.into_state_continue(State::InsideDeclaration( if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding } )), _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))) } }), DeclarationSubstate::AfterEncoding => match t { Token::EqualsSign => { self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)) }, Token::Character(c) if is_whitespace_char(c) => None, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| { 
this.data.encoding = Some(value); this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterEncodingValue)) }), DeclarationSubstate::AfterEncodingValue => match t { Token::Character(c) if is_whitespace_char(c) => { self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)) }, Token::ProcessingInstructionEnd => self.emit_start_document(), _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::BeforeStandaloneDecl => match t { Token::Character('s') => { self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)) }, Token::ProcessingInstructionEnd => self.emit_start_document(), Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::Attribute, |this, token, name| { match &*name.local_name { "tandalone" if name.namespace.is_none() => this.into_state_continue(State::InsideDeclaration( if token == Token::EqualsSign { DeclarationSubstate::InsideStandaloneDeclValue } else { DeclarationSubstate::AfterStandaloneDecl } )), _ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))), } }), DeclarationSubstate::AfterStandaloneDecl => match t { Token::EqualsSign => { self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)) }, Token::Character(c) if is_whitespace_char(c) => None, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| { let standalone = match &*value { "yes" => Some(true), "no" => Some(false), _ => None }; if standalone.is_some() { this.data.standalone = standalone; this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue)) } else { 
Some(this.error(SyntaxError::InvalidStandaloneDeclaration(value.into()))) } }), DeclarationSubstate::AfterStandaloneDeclValue => match t { Token::ProcessingInstructionEnd => self.emit_start_document(), Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, } } } xml-1.2.0/src/reader/parser/inside_doctype.rs000064400000000000000000000504431046102023000173270ustar 00000000000000use std::fmt::Write; use crate::common::{is_name_char, is_name_start_char, is_pubid_char, is_whitespace_char}; use crate::reader::error::SyntaxError; use crate::reader::lexer::Token; use crate::reader::XmlEvent; use super::{DoctypeSubstate, PullParser, QuoteToken, Result, State}; impl PullParser { pub fn inside_doctype(&mut self, t: Token, substate: DoctypeSubstate) -> Option { if let Some(ref mut doctype) = self.data.doctype { write!(doctype, "{t}").ok()?; if doctype.len() > self.config.max_data_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } } match substate { DoctypeSubstate::BeforeDoctypeName => match t { Token::Character(c) if is_whitespace_char(c) => None, Token::Character(c) if is_name_start_char(c) => { self.buf.push(c); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::DoctypeName)) } _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DoctypeSubstate::DoctypeName => match t { Token::TagEnd => { self.data.doctype_name = Some(self.take_buf_boxed()); let event = XmlEvent::Doctype { syntax: self.data.doctype.clone().unwrap_or_default(), }; self.into_state_emit(State::OutsideTag, Ok(event)) } Token::Character(c) if is_whitespace_char(c) => { self.data.doctype_name = Some(self.take_buf_boxed()); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) } Token::Character(c) if is_name_char(c) => { self.buf.push(c); if self.buf.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } None } _ => 
Some(self.error(SyntaxError::UnexpectedToken(t))), }, DoctypeSubstate::ExternalIdKeyword => match t { Token::Character(c @ 'A'..='Z') => { self.buf.push(c); if self.buf == "SYSTEM" { self.buf.clear(); return self.into_state_continue(State::InsideDoctype( DoctypeSubstate::BeforeSystemLiteral, )); } if self.buf == "PUBLIC" { self.buf.clear(); return self.into_state_continue(State::InsideDoctype( DoctypeSubstate::BeforePubId, )); } if "PUBLIC".starts_with(&self.buf) || "SYSTEM".starts_with(&self.buf) { return None; } Some(self.error(SyntaxError::UnexpectedToken(t))) } _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DoctypeSubstate::BeforeSystemLiteral => match t { Token::Character(c) if is_whitespace_char(c) => None, Token::SingleQuote | Token::DoubleQuote => { self.data.quote = super::QuoteToken::from_token(t); self.buf.clear(); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SystemLiteral)) } _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DoctypeSubstate::SystemLiteral => match t { Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { self.buf.push('\''); None } Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"'); None } Token::SingleQuote | Token::DoubleQuote => { self.data.quote = None; self.data.doctype_system_id = Some(self.take_buf_boxed()); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) } Token::Character(c) => { self.buf.push(c); None } _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DoctypeSubstate::BeforePubId => match t { Token::Character(c) if is_whitespace_char(c) => None, Token::SingleQuote | Token::DoubleQuote => { self.data.quote = super::QuoteToken::from_token(t); self.buf.clear(); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PubId)) } _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DoctypeSubstate::PubId => match t { Token::SingleQuote if self.data.quote != 
Some(QuoteToken::SingleQuoteToken) => { self.buf.push('\''); None } Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"'); None } Token::SingleQuote | Token::DoubleQuote => { self.data.quote = None; self.data.doctype_public_id = Some(self.take_buf_boxed()); self.into_state_continue(State::InsideDoctype( DoctypeSubstate::BeforeSystemLiteral, )) } Token::Character(c) if is_pubid_char(c) => { self.buf.push(c); None } Token::ReferenceEnd => { self.buf.push(';'); None } _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DoctypeSubstate::Outside => match t { Token::TagEnd => { let event = XmlEvent::Doctype { syntax: self.data.doctype.clone().unwrap_or_default(), }; self.into_state_emit(State::OutsideTag, Ok(event)) } Token::CDataEnd | Token::CDataStart => { Some(self.error(SyntaxError::UnexpectedToken(t))) } Token::Character(c) if c == '[' => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InternalSubset)) } Token::Character(c) if c == 'S' || c == 'P' => { self.buf.push(c); self.into_state_continue(State::InsideDoctype( DoctypeSubstate::ExternalIdKeyword, )) } Token::Character(c) if is_whitespace_char(c) => None, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DoctypeSubstate::InternalSubset => match t { Token::Character(c) if c == ']' => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) } Token::SingleQuote | Token::DoubleQuote => { // just discard string literals self.data.quote = super::QuoteToken::from_token(t); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::String)) } Token::CommentStart => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Comment)) } Token::Character('%') => { self.data.ref_data.clear(); self.data.ref_data.push('%'); self.into_state_continue(State::InsideDoctype( DoctypeSubstate::PEReferenceInDtd, )) } Token::MarkupDeclarationStart => { self.buf.clear(); 
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InsideName)) } Token::Character(c) if is_whitespace_char(c) => None, Token::ProcessingInstructionStart => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::IgnorePI)) } _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DoctypeSubstate::IgnorePI => match t { Token::ProcessingInstructionEnd => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InternalSubset)) } _ => None, }, DoctypeSubstate::String => match t { Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => None, Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => None, Token::SingleQuote | Token::DoubleQuote => { self.data.quote = None; self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InternalSubset)) } _ => None, }, DoctypeSubstate::Comment => match t { Token::CommentEnd => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InternalSubset)) } _ => None, }, DoctypeSubstate::InsideName => match t { Token::Character(c @ 'A'..='Z') => { self.buf.push(c); None } Token::Character(c) if is_whitespace_char(c) => { let state = match self.buf.as_str() { "ENTITY" => self.into_state_continue(State::InsideDoctype( DoctypeSubstate::BeforeEntityName, )), "NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue( State::InsideDoctype(DoctypeSubstate::SkipDeclaration), ), _ => Some(self.error(SyntaxError::UnknownMarkupDeclaration(self.buf.as_str().into()))), }; self.buf.clear(); state } _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, DoctypeSubstate::BeforeEntityName => { self.data.name.clear(); match t { Token::Character(c) if is_whitespace_char(c) => None, Token::Character('%') => { // % is for PEDecl self.data.name.push('%'); self.into_state_continue(State::InsideDoctype( DoctypeSubstate::PEReferenceDefinitionStart, )) } Token::Character(c) if is_name_start_char(c) => { if self.data.name.len() > self.config.max_name_length { 
return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.data.name.push(c); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName)) } _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), } } DoctypeSubstate::EntityName => match t { Token::Character(c) if is_whitespace_char(c) => self .into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue)), Token::Character(c) if is_name_char(c) => { if self.data.name.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.data.name.push(c); None } _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::BeforeEntityValue => { self.buf.clear(); match t { Token::Character(c) if is_whitespace_char(c) => None, // SYSTEM/PUBLIC not supported Token::Character('S' | 'P') => { let name = self.data.take_name(); self.entities.entry(name).or_default(); // Dummy value, but at least the name is recognized self.into_state_continue(State::InsideDoctype( DoctypeSubstate::SkipDeclaration, )) } Token::SingleQuote | Token::DoubleQuote => { self.data.quote = super::QuoteToken::from_token(t); self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) } _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), } } DoctypeSubstate::EntityValue => match t { Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { self.buf.push('\''); None } Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"'); None } Token::SingleQuote | Token::DoubleQuote => { self.data.quote = None; let name = self.data.take_name(); let val = self.take_buf(); self.entities.entry(name).or_insert(val); // First wins self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) // FIXME } Token::ReferenceStart | Token::Character('&') => { self.data.ref_data.clear(); self.into_state_continue(State::InsideDoctype( 
DoctypeSubstate::NumericReferenceStart, )) } Token::Character('%') => { self.data.ref_data.clear(); self.data.ref_data.push('%'); // include literal % in the name to distinguish from regular entities self.into_state_continue(State::InsideDoctype( DoctypeSubstate::PEReferenceInValue, )) } Token::Character(c) if !self.is_valid_xml_char(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) } Token::Character(c) => { self.buf.push(c); None } _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::PEReferenceDefinitionStart => match t { Token::Character(c) if is_whitespace_char(c) => None, Token::Character(c) if is_name_start_char(c) => { debug_assert_eq!(self.data.name, "%"); self.data.name.push(c); self.into_state_continue(State::InsideDoctype( DoctypeSubstate::PEReferenceDefinition, )) } _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::PEReferenceDefinition => match t { Token::Character(c) if is_name_char(c) => { if self.data.name.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.data.name.push(c); None } Token::Character(c) if is_whitespace_char(c) => self .into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue)), _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::PEReferenceInDtd => match t { Token::Character(c) if is_name_char(c) => { self.data.ref_data.push(c); None } Token::ReferenceEnd | Token::Character(';') => { let name = self.data.take_ref_data(); match self.entities.get(&name) { Some(ent) => { if let Err(e) = self.lexer.reparse(ent) { return Some(Err(e)); } self.into_state_continue(State::InsideDoctype( DoctypeSubstate::InternalSubset, )) } None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))), } } _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::PEReferenceInValue => match t { Token::Character(c) if is_name_char(c) => 
{ self.data.ref_data.push(c); None } Token::ReferenceEnd | Token::Character(';') => { let name = self.data.take_ref_data(); match self.entities.get(&name) { Some(ent) => { self.buf.push_str(ent); self.into_state_continue(State::InsideDoctype( DoctypeSubstate::EntityValue, )) } None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))), } } _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::NumericReferenceStart => match t { Token::Character('#') => self .into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReference)), Token::Character(c) if !self.is_valid_xml_char(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) } Token::Character(c) => { self.buf.push('&'); self.buf.push(c); // named entities are not expanded inside doctype self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) } _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::NumericReference => match t { Token::ReferenceEnd | Token::Character(';') => { let r = self.data.take_ref_data(); // https://www.w3.org/TR/xml/#sec-entexpand match self.numeric_reference_from_str(&r) { Ok(c) => { self.buf.push(c); self.into_state_continue(State::InsideDoctype( DoctypeSubstate::EntityValue, )) } Err(e) => Some(self.error(e)), } } Token::Character(c) if !self.is_valid_xml_char(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) } Token::Character(c) => { self.data.ref_data.push(c); None } _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), }, DoctypeSubstate::SkipDeclaration => match t { Token::TagEnd => { self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InternalSubset)) } _ => None, }, } } } xml-1.2.0/src/reader/parser/inside_opening_tag.rs000064400000000000000000000150451046102023000201510ustar 00000000000000use crate::attribute::OwnedAttribute; use crate::common::{is_name_start_char, is_whitespace_char}; use crate::namespace; use 
crate::reader::error::SyntaxError; use crate::reader::lexer::Token; use super::{OpeningTagSubstate, PullParser, QualifiedNameTarget, Result, State}; impl PullParser { pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option { let max_attrs = self.config.max_attributes; match s { OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTag, |this, token, name| { match name.prefix_ref() { Some(prefix) if prefix == namespace::NS_XML_PREFIX || prefix == namespace::NS_XMLNS_PREFIX => Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))), _ => { this.data.element_name = Some(name.clone()); match token { Token::TagEnd => this.emit_start_element(false), Token::EmptyTagEnd => this.emit_start_element(true), Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), _ => { debug_assert!(false, "unreachable"); None }, } } } }), OpeningTagSubstate::InsideTag => match t { Token::TagEnd => self.emit_start_element(false), Token::EmptyTagEnd => self.emit_start_element(true), Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace Token::Character(c) if is_name_start_char(c) => { if self.buf.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.buf.push(c); self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName)) }, _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))), }, OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::Attribute, |this, token, name| { // check that no attribute with such name is already present // if there is one, XML is not well-formed if this.data.attributes.contains(&name) { return Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into()))) } this.data.attr_name = Some(name); match token { Token::EqualsSign => 
this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)), _ => Some(this.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) // likely unreachable } }), OpeningTagSubstate::AfterAttributeName => match t { Token::EqualsSign => { self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)) }, Token::Character(c) if is_whitespace_char(c) => None, _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))), }, OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| { let name = this.data.take_attr_name()?; // will always succeed here match name.prefix_ref() { // declaring a new prefix; it is sufficient to check prefix only // because "xmlns" prefix is reserved Some(namespace::NS_XMLNS_PREFIX) => { let ln = &*name.local_name; if ln == namespace::NS_XMLNS_PREFIX { Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix)) } else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI { Some(this.error(SyntaxError::CannotRedefineXmlPrefix)) } else if value.is_empty() { Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into()))) } else { this.nst.put(name.local_name.clone(), value); this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) } }, // declaring default namespace None if &*name.local_name == namespace::NS_XMLNS_PREFIX => match &*value { namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | namespace::NS_XMLNS_URI => Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))), _ => { this.nst.put(namespace::NS_NO_PREFIX, value.clone()); this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) } }, // regular attribute _ => { if this.data.attributes.len() >= max_attrs { return 
Some(this.error(SyntaxError::ExceededConfiguredLimit)); } this.data.attributes.push(OwnedAttribute { name, value }); this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue)) }, } }), OpeningTagSubstate::AfterAttributeValue => match t { Token::Character(c) if is_whitespace_char(c) => { self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) }, Token::TagEnd => self.emit_start_element(false), Token::EmptyTagEnd => self.emit_start_element(true), _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))), }, } } } xml-1.2.0/src/reader/parser/inside_processing_instruction.rs000064400000000000000000000122461046102023000224740ustar 00000000000000use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; use crate::reader::error::SyntaxError; use crate::reader::events::XmlEvent; use crate::reader::lexer::Token; use super::{DeclarationSubstate, Encountered, ProcessingInstructionSubstate, PullParser, Result, State}; impl PullParser { pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option { match s { ProcessingInstructionSubstate::PIInsideName => match t { Token::Character(c) if self.buf.is_empty() && is_name_start_char(c) || self.buf_has_data() && is_name_char(c) => { if self.buf.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.buf.push(c); None }, Token::ProcessingInstructionEnd => { // self.buf contains PI name let name = self.take_buf(); // Don't need to check for declaration because it has mandatory attributes // but there is none match &*name { // Name is empty, it is an error "" => Some(self.error(SyntaxError::ProcessingInstructionWithoutName)), // Found { Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))) }, // All is ok, emitting event _ => { debug_assert!(self.next_event.is_none(), "{:?}", self.next_event); // can't have a PI before ` { // self.buf 
contains PI name let name = self.take_buf(); match &*name { // We have not ever encountered an element and have not parsed XML declaration "xml" if self.encountered == Encountered::None => { self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)) }, // Found { Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))) }, // All is ok, starting parsing PI data _ => { self.data.name = name; // can't have a PI before ` { let buf = self.take_buf_boxed(); Some(self.error(SyntaxError::UnexpectedProcessingInstruction(buf, t))) }, }, ProcessingInstructionSubstate::PIInsideData => match t { Token::ProcessingInstructionEnd => { let name = self.data.take_name(); let data = self.take_buf(); self.into_state_emit( State::OutsideTag, Ok(XmlEvent::ProcessingInstruction { name, data: Some(data) }), ) }, Token::Character(c) if !self.is_valid_xml_char(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) }, // Any other token should be treated as plain characters _ => { if self.buf.len() > self.config.max_data_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } t.push_to_string(&mut self.buf); None }, }, } } } xml-1.2.0/src/reader/parser/inside_reference.rs000064400000000000000000000066061046102023000176200ustar 00000000000000use super::{PullParser, Result, State}; use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; use crate::reader::error::SyntaxError; use crate::reader::lexer::Token; use std::char; impl PullParser { pub fn inside_reference(&mut self, t: Token) -> Option { match t { Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) || self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => { self.data.ref_data.push(c); None }, Token::ReferenceEnd => { let name = self.data.take_ref_data(); if name.is_empty() { return Some(self.error(SyntaxError::EmptyEntity)); } let c = match &*name { "lt" => Some('<'), "gt" => Some('>'), "amp" => 
Some('&'), "apos" => Some('\''), "quot" => Some('"'), _ if name.starts_with('#') => match self.numeric_reference_from_str(&name[1..]) { Ok(c) => Some(c), Err(e) => return Some(self.error(e)), }, _ => None, }; if let Some(c) = c { self.buf.push(c); } else if let Some(v) = self.config.extra_entities.get(&name) { self.buf.push_str(v); } else if let Some(v) = self.entities.get(&name) { if self.state_after_reference == State::OutsideTag { // an entity can expand to *elements*, so outside of a tag it needs a full reparse if let Err(e) = self.lexer.reparse(v) { return Some(Err(e)); } } else { // however, inside attributes it's not allowed to affect attribute quoting, // so it can't be fed to the lexer self.buf.push_str(v); } } else { return Some(self.error(SyntaxError::UnexpectedEntity(name.into()))); } let prev_st = self.state_after_reference; if prev_st == State::OutsideTag && !is_whitespace_char(self.buf.chars().last().unwrap_or('\0')) { self.inside_whitespace = false; } self.into_state_continue(prev_st) }, _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), } } pub(crate) fn numeric_reference_from_str(&self, num_str: &str) -> std::result::Result { let val = if let Some(hex) = num_str.strip_prefix('x') { u32::from_str_radix(hex, 16).map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))? } else { num_str.parse::().map_err(move |_| SyntaxError::InvalidNumericEntity(num_str.into()))? 
}; match char::from_u32(val) { Some(c) if self.is_valid_xml_char(c) => Ok(c), Some(_) if self.config.replace_unknown_entity_references => Ok('\u{fffd}'), None if self.config.replace_unknown_entity_references => Ok('\u{fffd}'), _ => Err(SyntaxError::InvalidCharacterEntity(val)), } } } xml-1.2.0/src/reader/parser/outside_tag.rs000064400000000000000000000223161046102023000166320ustar 00000000000000use crate::common::is_whitespace_char; use crate::reader::error::SyntaxError; use crate::reader::events::XmlEvent; use crate::reader::lexer::Token; use super::{ ClosingTagSubstate, DoctypeSubstate, Encountered, OpeningTagSubstate, ProcessingInstructionSubstate, PullParser, Result, State, }; impl PullParser { pub fn outside_tag(&mut self, t: Token) -> Option { match t { Token::Character(c) => { if is_whitespace_char(c) { // skip whitespace outside of the root element if (self.config.trim_whitespace && self.buf.is_empty()) || (self.depth() == 0 && self.config.ignore_root_level_whitespace) { return None; } } else { self.inside_whitespace = false; if self.depth() == 0 { return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); } } if !self.is_valid_xml_char_not_restricted(c) { return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); } if self.buf.is_empty() { self.push_pos(); } else if self.buf.len() > self.config.max_data_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.buf.push(c); None }, Token::CommentEnd | Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::ProcessingInstructionEnd | Token::EmptyTagEnd => { if self.depth() == 0 { return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); } self.inside_whitespace = false; if let Some(s) = t.as_static_str() { if self.buf.is_empty() { self.push_pos(); } else if self.buf.len() > self.config.max_data_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.buf.push_str(s); } None }, Token::ReferenceStart if 
self.depth() > 0 => { self.state_after_reference = State::OutsideTag; self.into_state_continue(State::InsideReference) }, Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity self.inside_whitespace = false; if self.buf.len() > self.config.max_data_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } Token::ReferenceEnd.push_to_string(&mut self.buf); None }, Token::CommentStart if self.config.coalesce_characters && self.config.ignore_comments => { let next_event = self.set_encountered(Encountered::Comment); // We need to switch the lexer into a comment mode inside comments self.into_state(State::InsideComment, next_event) }, Token::CDataStart if self.depth() > 0 && self.config.coalesce_characters && self.config.cdata_to_characters => { if self.buf.is_empty() { self.push_pos(); // CDataEnd will pop pos if the buffer remains empty } // if coalescing chars, continue without event self.into_state_continue(State::InsideCData) }, _ => { // Encountered some markup event, flush the buffer as characters // or a whitespace let mut next_event = if self.buf_has_data() { let buf = self.take_buf(); if self.inside_whitespace && self.config.trim_whitespace { // there will be no event emitted for this, but start of buffering has pushed a pos self.next_pos(); None } else if self.inside_whitespace && !self.config.whitespace_to_characters { debug_assert!(buf.chars().all(|ch| ch.is_whitespace()), "ws={buf:?}"); Some(Ok(XmlEvent::Whitespace(buf))) } else if self.config.trim_whitespace { Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into()))) } else { Some(Ok(XmlEvent::Characters(buf))) } } else { None }; self.inside_whitespace = true; // Reset inside_whitespace flag // pos is popped whenever an event is emitted, so pushes must happen only if there will be an event to balance it // and ignored comments don't pop if t != Token::CommentStart || !self.config.ignore_comments { self.push_pos(); } match t { 
Token::OpeningTagStart if self.depth() > 0 || self.encountered < Encountered::Element || self.config.allow_multiple_root_elements => { if let Some(e) = self.set_encountered(Encountered::Element) { next_event = Some(e); } self.nst.push_empty(); self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) }, Token::ClosingTagStart if self.depth() > 0 => { self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event) }, Token::CommentStart => { if let Some(e) = self.set_encountered(Encountered::Comment) { next_event = Some(e); } // We need to switch the lexer into a comment mode inside comments self.into_state(State::InsideComment, next_event) }, Token::DoctypeStart if self.encountered < Encountered::Doctype => { if let Some(e) = self.set_encountered(Encountered::Doctype) { next_event = Some(e); } self.data.doctype = Some(Token::DoctypeStart.to_string()); self.push_pos(); self.into_state(State::InsideDoctype(DoctypeSubstate::BeforeDoctypeName), next_event) }, Token::ProcessingInstructionStart => self.into_state( State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event, ), Token::CDataStart if self.depth() > 0 => { self.into_state(State::InsideCData, next_event) }, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), } }, } } pub fn document_start(&mut self, t: Token) -> Option { debug_assert!(self.encountered < Encountered::Declaration); match t { Token::Character(c) => { let next_event = self.set_encountered(Encountered::AnyChars); if !is_whitespace_char(c) { return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); } self.inside_whitespace = true; // skip whitespace outside of the root element if (self.config.trim_whitespace && self.buf.is_empty()) || (self.depth() == 0 && self.config.ignore_root_level_whitespace) { return self.into_state(State::OutsideTag, next_event); } self.push_pos(); self.buf.push(c); self.into_state(State::OutsideTag, next_event) }, 
Token::CommentStart => { let next_event = self.set_encountered(Encountered::Comment); self.into_state(State::InsideComment, next_event) }, Token::OpeningTagStart => { let next_event = self.set_encountered(Encountered::Element); self.nst.push_empty(); self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) }, Token::DoctypeStart => { let next_event = self.set_encountered(Encountered::Doctype); self.data.doctype = Some(Token::DoctypeStart.to_string()); self.push_pos(); self.into_state(State::InsideDoctype(DoctypeSubstate::BeforeDoctypeName), next_event) }, Token::ProcessingInstructionStart => { self.push_pos(); self.into_state_continue(State::InsideProcessingInstruction( ProcessingInstructionSubstate::PIInsideName, )) }, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), } } } xml-1.2.0/src/reader/parser.rs000064400000000000000000000711001046102023000143160ustar 00000000000000//! Contains an implementation of pull-based XML parser. use crate::reader::DoctypeRef; use crate::common::{is_xml10_char, is_xml11_char, is_xml11_char_not_restricted, is_name_char, is_name_start_char, is_whitespace_char}; use crate::common::{Position, TextPosition, XmlVersion}; use crate::name::OwnedName; use crate::namespace::NamespaceStack; use crate::reader::config::ParserConfig; use crate::reader::error::{ImmutableEntitiesError, SyntaxError}; use crate::reader::error::Error; use crate::reader::events::XmlEvent; use crate::reader::indexset::AttributesSet; use crate::reader::lexer::{Lexer, Token}; use std::collections::HashMap; use std::io::Read; macro_rules! 
gen_takes( ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => ( $( impl MarkupData { #[inline] #[allow(clippy::mem_replace_option_with_none)] #[allow(clippy::mem_replace_with_default)] fn $method(&mut self) -> $t { std::mem::replace(&mut self.$field, $def) } } )+ ) ); gen_takes!( name -> take_name, String, String::new(); ref_data -> take_ref_data, String, String::new(); encoding -> take_encoding, Option, None; element_name -> take_element_name, Option, None; attr_name -> take_attr_name, Option, None; attributes -> take_attributes, AttributesSet, AttributesSet::new() ); mod inside_cdata; mod inside_closing_tag_name; mod inside_comment; mod inside_declaration; mod inside_doctype; mod inside_opening_tag; mod inside_processing_instruction; mod inside_reference; mod outside_tag; static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; static DEFAULT_STANDALONE: Option = None; type ElementStack = Vec; /// Newtype for `XmlEvent` only. If you import this, use `std::result::Result` for other results. pub type Result = super::Result; /// Pull-based XML parser. 
pub(crate) struct PullParser { config: ParserConfig, lexer: Lexer, st: State, state_after_reference: State, buf: String, /// From DTD internal subset entities: HashMap, nst: NamespaceStack, data: MarkupData, final_result: Option, next_event: Option, est: ElementStack, pos: Vec, encountered: Encountered, inside_whitespace: bool, seen_prefix_separator: bool, pop_namespace: bool, } // Keeps track when XML declaration can happen #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] enum Encountered { None = 0, AnyChars, // whitespace before ) -> Self { Self::new_with_config(config.into()) } #[inline] fn new_with_config(config: ParserConfig) -> Self { let mut lexer = Lexer::new(&config); if let Some(enc) = config.override_encoding { lexer.set_encoding(enc); } let mut pos = Vec::with_capacity(16); pos.push(TextPosition::new()); Self { config, lexer, st: State::DocumentStart, state_after_reference: State::OutsideTag, buf: String::new(), entities: HashMap::new(), nst: NamespaceStack::default(), data: MarkupData { name: String::new(), doctype: None, doctype_name: None, doctype_public_id: None, doctype_system_id: None, version: None, encoding: None, standalone: None, ref_data: String::new(), element_name: None, quote: None, attr_name: None, attributes: AttributesSet::new(), }, final_result: None, next_event: None, est: Vec::new(), pos, encountered: Encountered::None, inside_whitespace: true, seen_prefix_separator: false, pop_namespace: false, } } /// Checks if this parser ignores the end of stream errors. 
pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream } /// Retrieves the Doctype from the document if any #[inline] #[deprecated(note = "there is `XmlEvent::Doctype` now")] pub fn doctype(&self) -> Option<&str> { self.data.doctype.as_deref() } pub fn doctype_ids(&self) -> Option> { Some(DoctypeRef { syntax: self.data.doctype.as_deref()?, name: self.data.doctype_name.as_deref()?, public_id: self.data.doctype_public_id.as_deref(), system_id: self.data.doctype_system_id.as_deref(), }) } #[inline(never)] fn set_encountered(&mut self, new_encounter: Encountered) -> Option { if new_encounter <= self.encountered { return None; } let prev_enc = self.encountered; self.encountered = new_encounter; // If declaration was not parsed and we have encountered an element, // emit this declaration as the next event. if prev_enc == Encountered::None { self.push_pos(); Some(Ok(XmlEvent::StartDocument { version: DEFAULT_VERSION, encoding: self.lexer.encoding().to_string(), standalone: DEFAULT_STANDALONE, })) } else { None } } #[inline] pub fn add_entities, T: Into>(&mut self, entities: impl IntoIterator) -> std::result::Result<(), ImmutableEntitiesError> { if self.data.standalone == Some(true) { return Err(ImmutableEntitiesError::StandaloneDocument); } if self.encountered == Encountered::Element { return Err(ImmutableEntitiesError::ElementEncountered); } self.config.extra_entities.extend(entities.into_iter().map(|(k, v)| (k.into(), v.into()))); Ok(()) } } impl Position for PullParser { /// Returns the position of the last event produced by the parser #[inline] fn position(&self) -> TextPosition { self.pos.first().copied().unwrap_or_else(TextPosition::new) } } #[derive(Copy, Clone, PartialEq, Debug)] pub(crate) enum State { OutsideTag, InsideOpeningTag(OpeningTagSubstate), InsideClosingTag(ClosingTagSubstate), InsideProcessingInstruction(ProcessingInstructionSubstate), InsideComment, InsideCData, InsideDeclaration(DeclarationSubstate), 
InsideDoctype(DoctypeSubstate), InsideReference, DocumentStart, } #[derive(Copy, Clone, PartialEq, Debug)] pub(crate) enum DoctypeSubstate { BeforeDoctypeName, DoctypeName, Outside, // PUBLIC ... SYSTEM... public and system literal parts. ExternalIdKeyword, BeforeSystemLiteral, SystemLiteral, BeforePubId, PubId, // Internal Subset related bits, parts inside [...]. InternalSubset, String, InsideName, BeforeEntityName, EntityName, BeforeEntityValue, EntityValue, NumericReferenceStart, NumericReference, /// expansion PEReferenceInValue, PEReferenceInDtd, /// name definition PEReferenceDefinitionStart, PEReferenceDefinition, IgnorePI, SkipDeclaration, Comment, } #[derive(Copy, Clone, PartialEq, Debug)] pub(crate) enum OpeningTagSubstate { InsideName, InsideTag, InsideAttributeName, AfterAttributeName, InsideAttributeValue, AfterAttributeValue, } #[derive(Copy, Clone, PartialEq, Debug)] pub(crate) enum ClosingTagSubstate { CTInsideName, CTAfterName, } #[derive(Copy, Clone, PartialEq, Debug)] pub(crate) enum ProcessingInstructionSubstate { PIInsideName, PIInsideData, } #[derive(Copy, Clone, PartialEq, Debug)] pub(crate) enum DeclarationSubstate { BeforeVersion, InsideVersion, AfterVersion, InsideVersionValue, AfterVersionValue, BeforeEncoding, InsideEncoding, AfterEncoding, InsideEncodingValue, AfterEncodingValue, BeforeStandaloneDecl, InsideStandaloneDecl, AfterStandaloneDecl, InsideStandaloneDeclValue, AfterStandaloneDeclValue, } #[derive(Copy, Clone, PartialEq, Debug)] enum QualifiedNameTarget { Attribute, OpeningTag, ClosingTag, } #[derive(Copy, Clone, PartialEq, Eq)] enum QuoteToken { SingleQuoteToken, DoubleQuoteToken, } impl QuoteToken { #[inline] fn from_token(t: Token) -> Option { match t { Token::SingleQuote => Some(Self::SingleQuoteToken), Token::DoubleQuote => Some(Self::DoubleQuoteToken), _ => { debug_assert!(false); None }, } } const fn as_token(self) -> Token { match self { Self::SingleQuoteToken => Token::SingleQuote, Self::DoubleQuoteToken => 
Token::DoubleQuote, } } } struct MarkupData { name: String, // used for processing instruction name ref_data: String, // used for reference content doctype: Option, // keeps a copy of the original doctype doctype_name: Option>, doctype_public_id: Option>, doctype_system_id: Option>, version: Option, // used for XML declaration version encoding: Option, // used for XML declaration encoding standalone: Option, // used for XML declaration standalone parameter element_name: Option, // used for element name quote: Option, // used to hold opening quote for attribute value attr_name: Option, // used to hold attribute name attributes: AttributesSet, // used to hold all accumulated attributes } impl PullParser { /// Returns next event read from the given buffer. /// /// This method should be always called with the same buffer. If you call it /// providing different buffers each time, the result will be undefined. pub fn next(&mut self, r: &mut R) -> Result { if let Some(ref ev) = self.final_result { return ev.clone(); } if let Some(ev) = self.next_event.take() { return ev; } if self.pop_namespace { self.pop_namespace = false; self.nst.pop(); } loop { debug_assert!(self.next_event.is_none()); debug_assert!(!self.pop_namespace); // While lexer gives us Ok(maybe_token) -- we loop. // Upon having a complete XML-event -- we return from the whole function. 
match self.lexer.next_token(r) { Ok(Token::Eof) => { // Forward pos to the lexer head self.next_pos(); return self.handle_eof(); }, Ok(token) => match self.dispatch_token(token) { None => continue, Some(Ok(xml_event)) => { self.next_pos(); return Ok(xml_event); }, Some(Err(xml_error)) => { self.next_pos(); return self.set_final_result(Err(xml_error)); }, }, Err(lexer_error) => { self.next_pos(); return self.set_final_result(Err(lexer_error)); }, } } } /// Handle end of stream #[cold] fn handle_eof(&mut self) -> std::result::Result { let ev = if self.depth() == 0 { if self.encountered == Encountered::Element && self.st == State::OutsideTag { // all is ok Ok(XmlEvent::EndDocument) } else if self.encountered < Encountered::Element { self.error(SyntaxError::NoRootElement) } else { // self.st != State::OutsideTag self.error(SyntaxError::UnexpectedEof) // TODO: add expected hint? } } else if self.config.ignore_end_of_stream { self.final_result = None; self.lexer.reset_eof_handled(); return self.error(SyntaxError::UnbalancedRootElement); } else { self.error(SyntaxError::UnbalancedRootElement) }; self.set_final_result(ev) } // This function is to be called when a terminal event is reached. // The function sets up the `self.final_result` into `Some(result)` and return `result`. #[inline] fn set_final_result(&mut self, result: Result) -> Result { self.final_result = Some(result.clone()); result } #[cold] #[allow(clippy::needless_pass_by_value)] fn error(&self, e: SyntaxError) -> Result { Err(Error::syntax(e.to_cow(), self.lexer.position())) } #[inline] fn next_pos(&mut self) { // unfortunately calls to next_pos will never be perfectly balanced with push_pos, // at very least because parse errors and EOF can happen unexpectedly without a prior push. 
if !self.pos.is_empty() { if self.pos.len() > 1 { self.pos.remove(0); } else { self.pos[0] = self.lexer.position(); } } } #[inline] #[track_caller] fn push_pos(&mut self) { debug_assert!(self.pos.len() != self.pos.capacity(), "You've found a bug in xml-rs, caused by calls to push_pos() in states that don't end up emitting events. This case is ignored in release mode, and merely causes document positions to be out of sync. Please file a bug and include the XML document that triggers this assert."); // it has capacity preallocated for more than it ever needs, so this reduces code size if self.pos.len() != self.pos.capacity() { self.pos.push(self.lexer.position()); } else if self.pos.len() > 1 { self.pos.remove(0); // this mitigates the excessive push_pos() call } } #[inline(never)] fn dispatch_token(&mut self, t: Token) -> Option { match self.st { State::OutsideTag => self.outside_tag(t), State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), State::InsideReference => self.inside_reference(t), State::InsideComment => self.inside_comment(t), State::InsideCData => self.inside_cdata(t), State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), State::InsideDoctype(s) => self.inside_doctype(t, s), State::InsideDeclaration(s) => self.inside_declaration(t, s), State::DocumentStart => self.document_start(t), } } #[inline] fn depth(&self) -> usize { self.est.len() } #[inline] fn buf_has_data(&self) -> bool { !self.buf.is_empty() } #[inline] fn take_buf(&mut self) -> String { std::mem::take(&mut self.buf) } #[inline] fn take_buf_boxed(&mut self) -> Box { let res = self.buf.as_str().into(); self.buf.clear(); res } #[inline] #[allow(clippy::wrong_self_convention)] fn into_state(&mut self, st: State, ev: Option) -> Option { self.st = st; ev } #[inline] #[allow(clippy::wrong_self_convention)] fn into_state_continue(&mut self, st: State) -> Option { self.into_state(st, None) } 
/// Switches the parser to state `st` and hands back `ev` wrapped in `Some`,
/// i.e. the state transition and the event to report are done in one step.
#[inline]
#[allow(clippy::wrong_self_convention)]
fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> {
    self.into_state(st, Some(ev))
}

/// Dispatches tokens in order to process a qualified name (`prefix:local` or `local`).
///
/// Name characters are accumulated in `self.buf` one token at a time. When a
/// token that terminates the name for the given `target` arrives, the buffer
/// is taken, parsed as an `OwnedName` and passed to `on_name`.
///
/// # Parameters
/// * `t` --- next token;
/// * `target` --- what kind of name is being read; it decides which tokens may
///   terminate the name:
///   `Token::EqualsSign` for `Attribute`,
///   `Token::EmptyTagEnd` for `OpeningTag`,
///   `Token::TagEnd` for `OpeningTag` and `ClosingTag`,
///   and a whitespace character for every target;
/// * `on_name` --- a callback which is executed once the name is complete. It
///   receives the parser, the terminating token and the parsed name, and its
///   return value becomes the return value of this method.
///
/// # Returns
/// * `None` while the name is still being accumulated;
/// * `Some(Err(..))` with
///   `SyntaxError::ExceededConfiguredLimit` when the buffer is already longer
///   than `config.max_name_length`,
///   `SyntaxError::InvalidQualifiedName` when the accumulated text does not
///   parse as a name, or
///   `SyntaxError::UnexpectedQualifiedName` for any token that neither extends
///   nor terminates the name (this includes a leading `:` and a second `:`);
/// * otherwise whatever `on_name` returns.
fn read_qualified_name<F>(
    &mut self,
    t: Token,
    target: QualifiedNameTarget,
    on_name: F,
) -> Option<Result>
where
    F: Fn(&mut Self, Token, OwnedName) -> Option<Result>,
{
    // Shared tail for every terminating token: take the buffered text, reset the
    // colon tracking for the next name, and hand the parsed name to the callback.
    let try_consume_name = move |this: &mut Self, t| {
        let name = this.take_buf();
        this.seen_prefix_separator = false;
        match name.parse() {
            Ok(name) => on_name(this, t, name),
            Err(()) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))),
        }
    };

    match t {
        // There can be only one colon, and not as the first character
        Token::Character(':') if self.buf_has_data() && !self.seen_prefix_separator => {
            self.buf.push(':');
            self.seen_prefix_separator = true;
            None
        },

        // The first character must be a name-start character; later ones only
        // need to be name characters. Colons are handled by the arm above.
        Token::Character(c)
            if c != ':'
                && (self.buf.is_empty() && is_name_start_char(c)
                    || self.buf_has_data() && is_name_char(c)) =>
        {
            // The limit is checked before the push, against the current buffer length.
            if self.buf.len() > self.config.max_name_length {
                return Some(self.error(SyntaxError::ExceededConfiguredLimit));
            }
            self.buf.push(c);
            None
        },

        Token::EqualsSign if target == QualifiedNameTarget::Attribute => try_consume_name(self, t),

        Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTag => try_consume_name(self, t),

        Token::TagEnd
            if target == QualifiedNameTarget::OpeningTag
                || target == QualifiedNameTarget::ClosingTag =>
        {
            try_consume_name(self, t)
        },

        Token::Character(c) if is_whitespace_char(c) => try_consume_name(self, t),

        _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))),
    }
}

/// Dispatches tokens in order to process attribute value.
///
/// # Parameters
/// * `t` --- next token;
/// * `on_value` --- a callback which is called when terminating quote is encountered.
fn read_attribute_value(&mut self, t: Token, on_value: F) -> Option where F: Fn(&mut Self, String) -> Option { match t { Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace Token::DoubleQuote | Token::SingleQuote => match self.data.quote { None => { // Entered attribute value self.data.quote = QuoteToken::from_token(t); None }, Some(q) if q.as_token() == t => { self.data.quote = None; let value = self.take_buf(); on_value(self, value) }, _ => { if let Token::Character(c) = t { if !self.is_valid_xml_char_not_restricted(c) { return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); } } if self.buf.len() > self.config.max_attribute_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } t.push_to_string(&mut self.buf); None }, }, Token::ReferenceStart if self.data.quote.is_some() => { self.state_after_reference = self.st; self.into_state_continue(State::InsideReference) }, Token::OpeningTagStart | Token::ProcessingInstructionStart => { Some(self.error(SyntaxError::UnexpectedOpeningTag)) }, Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) }, // Every character except " and ' and < is okay _ if self.data.quote.is_some() => { if self.buf.len() > self.config.max_attribute_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } t.push_to_string(&mut self.buf); None }, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), } } fn emit_start_element(&mut self, emit_end_element: bool) -> Option { let mut name = self.data.take_element_name()?; let mut attributes = self.data.take_attributes().into_vec(); // check whether the name prefix is bound and fix its namespace match self.nst.get(name.borrow().prefix_repr()) { Some("") => name.namespace = None, // default namespace Some(ns) => name.namespace = Some(ns.into()), None => return 
Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))), } // check and fix accumulated attributes prefixes for attr in &mut attributes { if let Some(ref pfx) = attr.name.prefix { let new_ns = match self.nst.get(pfx) { Some("") => None, // default namespace Some(ns) => Some(ns.into()), None => return Some(self.error(SyntaxError::UnboundAttribute(attr.name.to_string().into()))), }; attr.name.namespace = new_ns; } } if emit_end_element { self.pop_namespace = true; self.next_event = Some(Ok(XmlEvent::EndElement { name: name.clone() })); } else { self.est.push(name.clone()); } let namespace = self.nst.squash(); self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { name, attributes, namespace })) } fn emit_end_element(&mut self) -> Option { let mut name = self.data.take_element_name()?; // check whether the name prefix is bound and fix its namespace match self.nst.get(name.borrow().prefix_repr()) { Some("") => name.namespace = None, // default namespace Some(ns) => name.namespace = Some(ns.into()), None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))), } let op_name = self.est.pop()?; if name == op_name { self.pop_namespace = true; self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name })) } else { Some(self.error(SyntaxError::UnexpectedClosingTag(format!("{name} != {op_name}").into()))) } } #[inline] fn is_valid_xml_char(&self, c: char) -> bool { if Some(XmlVersion::Version11) == self.data.version { is_xml11_char(c) } else { is_xml10_char(c) } } #[inline] fn is_valid_xml_char_not_restricted(&self, c: char) -> bool { if Some(XmlVersion::Version11) == self.data.version { is_xml11_char_not_restricted(c) } else { is_xml10_char(c) } } } #[cfg(test)] mod tests { use crate::attribute::OwnedAttribute; use crate::common::TextPosition; use crate::name::OwnedName; use crate::reader::events::XmlEvent; use crate::reader::parser::PullParser; use crate::reader::ParserConfig; use 
std::io::BufReader; fn new_parser() -> PullParser { PullParser::new(ParserConfig::new()) } macro_rules! expect_event( ($r:expr, $p:expr, $t:pat) => ( match $p.next(&mut $r) { $t => {} e => panic!("Unexpected event: {e:?}\nExpected: {}", stringify!($t)) } ); ($r:expr, $p:expr, $t:pat => $c:expr ) => ( match $p.next(&mut $r) { $t if $c => {} e => panic!("Unexpected event: {e:?}\nExpected: {} if {}", stringify!($t), stringify!($c)) } ) ); macro_rules! test_data( ($d:expr) => ({ static DATA: &'static str = $d; let r = BufReader::new(DATA.as_bytes()); let p = new_parser(); (r, p) }) ); #[test] fn issue_3_semicolon_in_attribute_value() { let (mut r, mut p) = test_data!(r#" "#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) => *name == OwnedName::local("a") && attributes.len() == 1 && attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") && namespace.is_essentially_empty() ); expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a")); expect_event!(r, p, Ok(XmlEvent::EndDocument)); } #[test] fn issue_140_entity_reference_inside_tag() { let (mut r, mut p) = test_data!(r" "); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla")); expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}"); expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla")); expect_event!(r, p, Ok(XmlEvent::EndDocument)); } #[test] fn issue_220_comment() { let (mut r, mut p) = test_data!(r""); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); expect_event!(r, p, Ok(XmlEvent::EndElement { .. })); expect_event!(r, p, Ok(XmlEvent::EndDocument)); let (mut r, mut p) = test_data!(r""); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. 
})); expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); expect_event!(r, p, Err(_)); // ---> is forbidden in comments let (mut r, mut p) = test_data!(r""); p.config.ignore_comments = false; expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); expect_event!(r, p, Ok(XmlEvent::Comment(s)) => s == " "#); expect_event!(r, p, Err(_)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Err(_)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Err(_)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Err(_)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Err(_)); } #[test] fn opening_tag_in_attribute_value() { use crate::reader::error::{SyntaxError, Error}; let (mut r, mut p) = test_data!(r#" "#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Err(ref e) => *e == Error::syntax(SyntaxError::UnexpectedOpeningTag.to_cow(), TextPosition { row: 1, column: 24 })); } #[test] fn processing_instruction_in_attribute_value() { use crate::reader::error::{SyntaxError, Error}; let (mut r, mut p) = test_data!(r#" "#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Err(ref e) => *e == Error::syntax(SyntaxError::UnexpectedOpeningTag.to_cow(), TextPosition { row: 1, column: 18 })); } #[test] fn reference_err() { let (mut r, mut p) = test_data!(r" && "); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); expect_event!(r, p, Err(_)); } #[test] fn state_size() { assert_eq!(2, std::mem::size_of::()); assert_eq!(1, std::mem::size_of::()); } } xml-1.2.0/src/reader.rs000064400000000000000000000156011046102023000130260ustar 00000000000000//! Contains high-level interface for a pull-based XML parser. //! //! The most important type in this module is `EventReader`, which provides an iterator //! view for events in XML document. 
use std::io::Read; use std::iter::FusedIterator; use std::result; use crate::common::{Position, TextPosition}; pub use self::config::ParserConfig; pub use self::error::{Error, ErrorKind}; pub use events::{XmlEvent, DoctypeRef}; // back compat #[doc(hidden)] #[deprecated(note = "Merged into ParserConfig")] pub type ParserConfig2 = ParserConfig; use self::parser::PullParser; mod config; mod error; mod events; mod indexset; mod lexer; mod parser; /// A result type yielded by `XmlReader`. pub type Result = result::Result; /// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing. /// /// The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow. pub struct EventReader { source: R, parser: PullParser, } impl EventReader { /// Creates a new reader, consuming the given stream. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow. #[inline] pub fn new(source: R) -> Self { Self::new_with_config(source, ParserConfig::new()) } /// Creates a new reader with the provded configuration, consuming the given stream. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow. #[inline] pub fn new_with_config(source: R, config: impl Into) -> Self { Self { source, parser: PullParser::new(config), } } /// Pulls and returns next XML event from the stream. /// /// If this returns [Err] or [`XmlEvent::EndDocument`] then further calls to /// this method will return this event again. #[inline] #[allow(clippy::should_implement_trait)] pub fn next(&mut self) -> Result { self.parser.next(&mut self.source) } /// Skips all XML events until the next end tag at the current level. /// /// Convenience function that is useful for the case where you have /// encountered a start tag that is of no interest and want to /// skip the entire XML subtree until the corresponding end tag. #[inline] pub fn skip(&mut self) -> Result<()> { let mut depth = 1; while depth > 0 { match self.next()? 
{ XmlEvent::StartElement { .. } => depth += 1, XmlEvent::EndElement { .. } => depth -= 1, XmlEvent::EndDocument => return Err(Error { kind: ErrorKind::UnexpectedEof, pos: self.parser.position(), }), _ => {}, } } Ok(()) } /// Access underlying reader /// /// Using it directly while the event reader is parsing is not recommended pub fn source(&self) -> &R { &self.source } /// Access underlying reader /// /// Using it directly while the event reader is parsing is not recommended pub fn source_mut(&mut self) -> &mut R { &mut self.source } /// Unwraps this `EventReader`, returning the underlying reader. /// /// Note that this operation is destructive; unwrapping the reader and wrapping it /// again with `EventReader::new()` will create a fresh reader which will attempt /// to parse an XML document from the beginning. pub fn into_inner(self) -> R { self.source } /// Returns the DOCTYPE of the document if it has already been seen /// /// Available only after the `Doctype` event #[inline] #[deprecated(note = "there is `XmlEvent::Doctype` now")] #[allow(deprecated)] pub fn doctype(&self) -> Option<&str> { self.parser.doctype() } /// Returns PUBLIC/SYSTEM DOCTYPE IDs if it has already been seen /// /// Available only after the `Doctype` event #[inline] pub fn doctype_ids(&self) -> Option> { self.parser.doctype_ids() } /// Add new entity definitions **before any XML elements have been parsed**. /// /// ## Errors /// /// It's valid to call this after DOCTYPE, but not later. It won't be possible to add entities to a document without either XML decl or DOCTYPE. /// /// It will fail if the document is declared as _standalone_. #[inline] pub fn add_entities, T: Into>(&mut self, entities: impl IntoIterator) -> std::result::Result<(), crate::reader::error::ImmutableEntitiesError> { self.parser.add_entities(entities) } } impl Position for EventReader { /// Returns the position of the last event produced by the reader. 
#[inline]
    fn position(&self) -> TextPosition {
        self.parser.position()
    }
}

impl<R: Read> IntoIterator for EventReader<R> {
    type IntoIter = Events<R>;
    type Item = Result<XmlEvent>;

    /// Wraps the reader in an [`Events`] iterator that has not finished yet.
    fn into_iter(self) -> Events<R> {
        Events { reader: self, finished: false }
    }
}

/// An iterator over XML events created from some type implementing `Read`.
///
/// When the next event is an error or `XmlEvent::EndDocument`, then
/// it will be returned by the iterator once, and then it will stop producing events
/// (unless the parser reports that it is ignoring the end of the stream).
pub struct Events<R: Read> {
    reader: EventReader<R>,
    /// Set once an error or `EndDocument` has been returned.
    finished: bool,
}

impl<R: Read> Events<R> {
    /// Unwraps the iterator, returning the internal `EventReader`.
    #[inline]
    pub fn into_inner(self) -> EventReader<R> {
        self.reader
    }

    /// Access the underlying reader
    ///
    /// It's not recommended to use it while the events are still being parsed
    pub fn source(&self) -> &R {
        &self.reader.source
    }

    /// Access the underlying reader
    ///
    /// It's not recommended to use it while the events are still being parsed
    pub fn source_mut(&mut self) -> &mut R {
        &mut self.reader.source
    }
}

impl<R: Read> std::ops::Deref for Events<R> {
    type Target = EventReader<R>;

    fn deref(&self) -> &Self::Target {
        &self.reader
    }
}

impl<R: Read> std::ops::DerefMut for Events<R> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.reader
    }
}

impl<R: Read> FusedIterator for Events<R> {
}

impl<R: Read> Iterator for Events<R> {
    type Item = Result<XmlEvent>;

    /// Returns the next event, or `None` once a terminal event has been yielded and the
    /// parser is not ignoring the end of the stream.
    #[inline]
    fn next(&mut self) -> Option<Result<XmlEvent>> {
        if self.finished && !self.reader.parser.is_ignoring_end_of_stream() {
            None
        } else {
            let ev = self.reader.next();
            // Both the end of the document and any error are terminal.
            if let Ok(XmlEvent::EndDocument) | Err(_) = ev {
                self.finished = true;
            }
            Some(ev)
        }
    }
}

impl<'r> EventReader<&'r [u8]> {
    /// A convenience method to create an `EventReader` from a string slice.
#[inline]
    #[must_use]
    #[allow(clippy::should_implement_trait)]
    pub fn from_str(source: &'r str) -> Self {
        EventReader::new(source.as_bytes())
    }
}
xml-1.2.0/src/util.rs000064400000000000000000000271201046102023000125400ustar 00000000000000use std::fmt;
use std::io::{self, Read};
use std::str::{self, FromStr};

/// Failure modes of reading a single character from a byte stream.
#[derive(Debug)]
pub(crate) enum CharReadError {
    /// The stream ended in the middle of a character.
    UnexpectedEof,
    /// The bytes read are not valid UTF-8.
    Utf8(str::Utf8Error),
    /// The underlying reader failed, or the data is invalid for the selected encoding.
    Io(io::Error),
}

impl From<str::Utf8Error> for CharReadError {
    #[cold]
    fn from(e: str::Utf8Error) -> Self {
        Self::Utf8(e)
    }
}

impl From<io::Error> for CharReadError {
    #[cold]
    fn from(e: io::Error) -> Self {
        Self::Io(e)
    }
}

impl fmt::Display for CharReadError {
    #[cold]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use self::CharReadError::{Io, UnexpectedEof, Utf8};
        match *self {
            UnexpectedEof => write!(f, "unexpected end of stream"),
            Utf8(ref e) => write!(f, "UTF-8 decoding error: {e}"),
            Io(ref e) => write!(f, "I/O error: {e}"),
        }
    }
}

/// Character encoding used for parsing
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
#[non_exhaustive]
pub enum Encoding {
    /// Explicitly UTF-8 only
    Utf8,
    /// UTF-8 fallback, but can be any 8-bit encoding
    Default,
    /// ISO-8859-1
    Latin1,
    /// US-ASCII
    Ascii,
    /// Big-Endian
    Utf16Be,
    /// Little-Endian
    Utf16Le,
    /// Unknown endianness yet, will be sniffed
    Utf16,
    /// Not determined yet, may be sniffed to be anything
    Unknown,
}

// Rustc inlines eq_ignore_ascii_case and creates kilobytes of code!
#[inline(never)] fn icmp(lower: &str, varcase: &str) -> bool { lower.bytes().zip(varcase.bytes()).all(|(l, v)| l == v.to_ascii_lowercase()) } impl FromStr for Encoding { type Err = &'static str; fn from_str(val: &str) -> Result { if ["utf-8", "utf8"].into_iter().any(move |label| icmp(label, val)) { Ok(Self::Utf8) } else if ["iso-8859-1", "latin1"].into_iter().any(move |label| icmp(label, val)) { Ok(Self::Latin1) } else if ["utf-16", "utf16"].into_iter().any(move |label| icmp(label, val)) { Ok(Self::Utf16) } else if ["ascii", "us-ascii"].into_iter().any(move |label| icmp(label, val)) { Ok(Self::Ascii) } else { Err("unknown encoding name") } } } impl fmt::Display for Encoding { #[cold] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(match self { Self::Utf8 | Self::Default => "UTF-8", Self::Latin1 => "ISO-8859-1", Self::Ascii => "US-ASCII", Self::Utf16Be | Self::Utf16Le | Self::Utf16 => "UTF-16", Self::Unknown => "(unknown)", }) } } pub(crate) struct CharReader { pub encoding: Encoding, } impl CharReader { pub const fn new() -> Self { Self { encoding: Encoding::Unknown } } #[inline] pub fn next_char_from(&mut self, source: &mut R) -> Result, CharReadError> { let mut bytes = source.bytes(); const MAX_CODEPOINT_LEN: usize = 4; let mut buf = [0u8; MAX_CODEPOINT_LEN]; let mut pos = 0; while pos < MAX_CODEPOINT_LEN { let next = match bytes.next() { Some(Ok(b)) => b, Some(Err(e)) => return Err(e.into()), None if pos == 0 => return Ok(None), None => return Err(CharReadError::UnexpectedEof), }; match self.encoding { Encoding::Utf8 | Encoding::Default => { // fast path for ASCII subset if pos == 0 && next.is_ascii() { return Ok(Some(next.into())); } buf[pos] = next; pos += 1; match str::from_utf8(&buf[..pos]) { Ok(s) => return Ok(s.chars().next()), // always Some(..) 
Err(_) if pos < MAX_CODEPOINT_LEN => continue, Err(e) => return Err(e.into()), } }, Encoding::Latin1 => { return Ok(Some(next.into())); }, Encoding::Ascii => { return if next.is_ascii() { Ok(Some(next.into())) } else { Err(CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, "char is not ASCII"))) }; }, Encoding::Unknown | Encoding::Utf16 => { buf[pos] = next; pos += 1; if let Some(value) = self.sniff_bom(&buf[..pos], &mut pos) { return value; } }, Encoding::Utf16Be => { buf[pos] = next; pos += 1; if pos == 2 { if let Some(Ok(c)) = char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap())]).next() { return Ok(Some(c)); } } else if pos == 4 { return Self::surrogate([u16::from_be_bytes(buf[..2].try_into().unwrap()), u16::from_be_bytes(buf[2..4].try_into().unwrap())]); } }, Encoding::Utf16Le => { buf[pos] = next; pos += 1; if pos == 2 { if let Some(Ok(c)) = char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap())]).next() { return Ok(Some(c)); } } else if pos == 4 { return Self::surrogate([u16::from_le_bytes(buf[..2].try_into().unwrap()), u16::from_le_bytes(buf[2..4].try_into().unwrap())]); } }, } } Err(CharReadError::Io(io::ErrorKind::InvalidData.into())) } #[cold] fn sniff_bom(&mut self, buf: &[u8], pos: &mut usize) -> Option, CharReadError>> { // sniff BOM if buf.len() <= 3 && [0xEF, 0xBB, 0xBF].starts_with(buf) { if buf.len() == 3 && self.encoding != Encoding::Utf16 { *pos = 0; self.encoding = Encoding::Utf8; } } else if buf.len() <= 2 && [0xFE, 0xFF].starts_with(buf) { if buf.len() == 2 { *pos = 0; self.encoding = Encoding::Utf16Be; } } else if buf.len() <= 2 && [0xFF, 0xFE].starts_with(buf) { if buf.len() == 2 { *pos = 0; self.encoding = Encoding::Utf16Le; } } else if buf.len() == 1 && self.encoding == Encoding::Utf16 { // sniff ASCII char in UTF-16 self.encoding = if buf[0] == 0 { Encoding::Utf16Be } else { Encoding::Utf16Le }; } else { // UTF-8 is the default, but XML decl can change it to other 8-bit encoding self.encoding = 
Encoding::Default; if buf.len() == 1 && buf[0].is_ascii() { return Some(Ok(Some(buf[0].into()))); } } None } fn surrogate(buf: [u16; 2]) -> Result, CharReadError> { char::decode_utf16(buf).next().transpose() .map_err(|e| CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e))) } } #[cfg(test)] mod tests { use super::{CharReadError, CharReader, Encoding}; #[test] fn test_next_char_from() { use std::io; let mut bytes: &[u8] = b"correct"; // correct ASCII assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('c')); let mut bytes: &[u8] = b"\xEF\xBB\xBF\xE2\x80\xA2!"; // BOM assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('•')); let mut bytes: &[u8] = b"\xEF\xBB\xBFx123"; // BOM assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('x')); let mut bytes: &[u8] = b"\xEF\xBB\xBF"; // Nothing after BOM assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); let mut bytes: &[u8] = b"\xEF\xBB"; // Nothing after BO assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof))); let mut bytes: &[u8] = b"\xEF\xBB\x42"; // Nothing after BO assert!(CharReader::new().next_char_from(&mut bytes).is_err()); let mut bytes: &[u8] = b"\xFE\xFF\x00\x42"; // UTF-16 assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B')); let mut bytes: &[u8] = b"\xFF\xFE\x42\x00"; // UTF-16 assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B')); let mut bytes: &[u8] = b"\xFF\xFE"; // UTF-16 assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); let mut bytes: &[u8] = b"\xFF\xFE\x00"; // UTF-16 assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof))); let mut bytes: &[u8] = "правильно".as_bytes(); // correct BMP assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('п')); let mut bytes: &[u8] = "правильно".as_bytes(); assert_eq!(CharReader { encoding: Encoding::Utf16Be 
}.next_char_from(&mut bytes).unwrap(), Some('킿')); let mut bytes: &[u8] = "правильно".as_bytes(); assert_eq!(CharReader { encoding: Encoding::Utf16Le }.next_char_from(&mut bytes).unwrap(), Some('뿐')); let mut bytes: &[u8] = b"\xD8\xD8\x80"; assert!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).is_err()); let mut bytes: &[u8] = b"\x00\x42"; assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B')); let mut bytes: &[u8] = b"\x42\x00"; assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B')); let mut bytes: &[u8] = &[0xEF, 0xBB, 0xBF, 0xFF, 0xFF]; assert!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).is_err()); let mut bytes: &[u8] = b"\x00"; assert!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes).is_err()); let mut bytes: &[u8] = "😊".as_bytes(); // correct non-BMP assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('😊')); let mut bytes: &[u8] = b""; // empty assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None); let mut bytes: &[u8] = b"\xf0\x9f\x98"; // incomplete code point match CharReader::new().next_char_from(&mut bytes).unwrap_err() { super::CharReadError::UnexpectedEof => {}, e => panic!("Unexpected result: {e:?}") } let mut bytes: &[u8] = b"\xff\x9f\x98\x32"; // invalid code point match CharReader::new().next_char_from(&mut bytes).unwrap_err() { super::CharReadError::Utf8(_) => {}, e => panic!("Unexpected result: {e:?}"), } // error during read struct ErrorReader; impl io::Read for ErrorReader { fn read(&mut self, _: &mut [u8]) -> io::Result { Err(io::Error::new(io::ErrorKind::Other, "test error")) } } let mut r = ErrorReader; match CharReader::new().next_char_from(&mut r).unwrap_err() { super::CharReadError::Io(ref e) if e.kind() == io::ErrorKind::Other && e.to_string().contains("test error") => {}, e => panic!("Unexpected result: {e:?}") } } } 
xml-1.2.0/src/writer/config.rs000064400000000000000000000141311046102023000143420ustar 00000000000000//! Contains emitter configuration structure. use crate::writer::EventWriter; use std::borrow::Cow; use std::io::Write; /// Emitter configuration structure. /// /// This structure contains various options which control XML document emitter behavior. #[derive(Clone, PartialEq, Eq, Debug)] pub struct EmitterConfig { /// Line separator used to separate lines in formatted output. Default is `"\n"`. pub line_separator: Cow<'static, str>, /// A string which will be used for a single level of indentation. Default is `" "` /// (two spaces). pub indent_string: Cow<'static, str>, /// Whether or not the emitted document should be indented. Default is false. /// /// The emitter is capable to perform automatic indentation of the emitted XML document. /// It is done in stream-like fashion and does not require the knowledge of the whole /// document in advance. /// /// Sometimes, however, automatic indentation is undesirable, e.g. when you want to keep /// existing layout when processing an existing XML document. Also the indentiation algorithm /// is not thoroughly tested. Hence by default it is disabled. pub perform_indent: bool, /// Whether or not characters in output events will be escaped. Default is true. /// /// The emitter can automatically escape characters which can't appear in PCDATA sections /// or element attributes of an XML document, like `<` or `"` (in attributes). This may /// introduce some overhead because then every corresponding piece of character data /// should be scanned for invalid characters. /// /// If this option is disabled, the XML writer may produce non-well-formed documents, so /// use `false` value for this option with care. pub perform_escaping: bool, /// Whether or not to write XML document declaration at the beginning of a document. /// Default is true. 
/// /// This option controls whether the document declaration should be emitted automatically /// before a root element is written if it was not emitted explicitly by the user. pub write_document_declaration: bool, /// Whether or not to convert elements with empty content to empty elements. Default is true. /// /// This option allows turning elements like `` (an element with empty content) /// into `` (an empty element). pub normalize_empty_elements: bool, /// Whether or not to emit CDATA events as plain characters. Default is false. /// /// This option forces the emitter to convert CDATA events into regular character events, /// performing all the necessary escaping beforehand. This may be occasionally useful /// for feeding the document into incorrect parsers which do not support CDATA. pub cdata_to_characters: bool, /// Whether or not to keep element names to support `EndElement` events without explicit names. /// Default is true. /// /// This option makes the emitter to keep names of written elements in order to allow /// omitting names when writing closing element tags. This could incur some memory overhead. pub keep_element_names_stack: bool, /// Whether or not to automatically insert leading and trailing spaces in emitted comments, /// if necessary. Default is true. /// /// This is a convenience option in order for the user not to append spaces before and after /// comments text in order to get more pretty comments: `` instead of /// ``. pub autopad_comments: bool, /// Whether or not to automatically insert spaces before the trailing `/>` in self-closing /// elements. Default is true. /// /// This option is only meaningful if `normalize_empty_elements` is true. For example, the /// element `` would be unaffected. When `normalize_empty_elements` is true, then when /// this option is also true, the same element would appear ``. If this option is false, /// then the same element would appear ``. 
pub pad_self_closing: bool, } impl EmitterConfig { /// Creates an emitter configuration with default values. /// /// You can tweak default options with builder-like pattern: /// /// ```rust /// use xml::writer::EmitterConfig; /// /// let config = EmitterConfig::new() /// .line_separator("\r\n") /// .perform_indent(true) /// .normalize_empty_elements(false); /// ``` #[inline] #[must_use] pub fn new() -> Self { Self { line_separator: "\n".into(), indent_string: " ".into(), // two spaces perform_indent: false, perform_escaping: true, write_document_declaration: true, normalize_empty_elements: true, cdata_to_characters: false, keep_element_names_stack: true, autopad_comments: true, pad_self_closing: true, } } /// Creates an XML writer with this configuration. /// /// This is a convenience method for configuring and creating a writer at the same time: /// /// ```rust /// use xml::writer::EmitterConfig; /// /// let mut target: Vec = Vec::new(); /// /// let writer = EmitterConfig::new() /// .line_separator("\r\n") /// .perform_indent(true) /// .normalize_empty_elements(false) /// .create_writer(&mut target); /// ``` /// /// This method is exactly equivalent to calling `EventWriter::new_with_config()` with /// this configuration object. 
#[inline]
    pub fn create_writer<W: Write>(self, sink: W) -> EventWriter<W> {
        EventWriter::new_with_config(sink, self)
    }
}

impl Default for EmitterConfig {
    #[inline]
    fn default() -> Self {
        Self::new()
    }
}

// One builder-style setter per public field of `EmitterConfig`.
gen_setters!(EmitterConfig,
    line_separator: into Cow<'static, str>,
    indent_string: into Cow<'static, str>,
    perform_indent: val bool,
    perform_escaping: val bool,
    write_document_declaration: val bool,
    normalize_empty_elements: val bool,
    cdata_to_characters: val bool,
    keep_element_names_stack: val bool,
    autopad_comments: val bool,
    pad_self_closing: val bool
);
xml-1.2.0/src/writer/emitter.rs000064400000000000000000000362271046102023000145600ustar 00000000000000use std::error::Error;
use std::io::prelude::*;
use std::{fmt, io, result};

use crate::attribute::Attribute;
use crate::common;
use crate::common::XmlVersion;
use crate::escape::{AttributeEscapes, Escaped, PcDataEscapes};
use crate::name::{Name, OwnedName};
use crate::namespace::{NamespaceStack, NS_EMPTY_URI, NS_NO_PREFIX, NS_XMLNS_PREFIX, NS_XML_PREFIX};

use crate::writer::config::EmitterConfig;

/// An error which may be returned by `XmlWriter` when writing XML events.
#[derive(Debug)]
#[non_exhaustive]
pub enum EmitterError {
    /// An I/O error occurred in the underlying `Write` instance.
    Io(io::Error),

    /// Document declaration has already been written to the output stream.
    DocumentStartAlreadyEmitted,

    /// The name of the last opening element is not available.
    LastElementNameNotAvailable,

    /// The name of the last opening element is not equal to the name of the provided
    /// closing element.
    EndElementNameIsNotEqualToLastStartElementName,

    /// End element name is not specified when it is needed, for example, when automatic
    /// closing is not enabled in configuration.
    EndElementNameIsNotSpecified,
}

impl Clone for EmitterError {
    /// `io::Error` is not `Clone`, so the `Io` variant is rebuilt from the original error's
    /// kind and message.
    #[cold]
    fn clone(&self) -> Self {
        match self {
            Self::Io(io_error) => Self::Io(io::Error::new(io_error.kind(), io_error.to_string())),
            Self::DocumentStartAlreadyEmitted => Self::DocumentStartAlreadyEmitted,
            Self::LastElementNameNotAvailable => Self::LastElementNameNotAvailable,
            Self::EndElementNameIsNotEqualToLastStartElementName => Self::EndElementNameIsNotEqualToLastStartElementName,
            Self::EndElementNameIsNotSpecified => Self::EndElementNameIsNotSpecified,
        }
    }
}

impl From<io::Error> for EmitterError {
    #[cold]
    fn from(err: io::Error) -> Self {
        Self::Io(err)
    }
}

impl fmt::Display for EmitterError {
    #[cold]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("emitter error: ")?;
        match self {
            Self::Io(e) => write!(f, "I/O error: {e}"),
            Self::DocumentStartAlreadyEmitted => f.write_str("document start event has already been emitted"),
            Self::LastElementNameNotAvailable => f.write_str("last element name is not available"),
            Self::EndElementNameIsNotEqualToLastStartElementName => f.write_str("end element name is not equal to last start element name"),
            Self::EndElementNameIsNotSpecified => f.write_str("end element name is not specified and can't be inferred"),
        }
    }
}

impl Error for EmitterError {
    /// For `Io`, forwards to the wrapped error's own source; the wrapped error's message is
    /// already part of this type's `Display` output.
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        match self {
            Self::Io(e) => e.source(),
            _ => None,
        }
    }
}

/// A result type yielded by `XmlWriter`.
pub type Result = result::Result; // TODO: split into a low-level fast writer without any checks and formatting logic and a // high-level indenting validating writer pub struct Emitter { config: EmitterConfig, nst: NamespaceStack, indent_level: usize, indent_stack: Vec, element_names: Vec, start_document_emitted: bool, just_wrote_start_element: bool, } impl Emitter { pub fn new(config: EmitterConfig) -> Self { let mut indent_stack = Vec::with_capacity(16); indent_stack.push(IndentFlags::WroteNothing); Self { config, nst: NamespaceStack::empty(), indent_level: 0, indent_stack, element_names: Vec::new(), start_document_emitted: false, just_wrote_start_element: false, } } } #[derive(Copy, Clone, Eq, PartialEq, Debug)] enum IndentFlags { WroteNothing, WroteMarkup, WroteText, } impl Emitter { /// Returns the current state of namespaces. #[inline] pub fn namespace_stack_mut(&mut self) -> &mut NamespaceStack { &mut self.nst } #[inline] fn wrote_text(&self) -> bool { self.indent_stack.last().is_some_and(|&e| e == IndentFlags::WroteText) } #[inline] fn wrote_markup(&self) -> bool { self.indent_stack.last().is_some_and(|&e| e == IndentFlags::WroteMarkup) } #[inline] fn set_wrote_text(&mut self) { if let Some(e) = self.indent_stack.last_mut() { *e = IndentFlags::WroteText; } } #[inline] fn set_wrote_markup(&mut self) { if let Some(e) = self.indent_stack.last_mut() { *e = IndentFlags::WroteMarkup; } } fn write_newline(&self, target: &mut W, level: usize) -> Result<()> { target.write_all(self.config.line_separator.as_bytes())?; for _ in 0..level { target.write_all(self.config.indent_string.as_bytes())?; } Ok(()) } fn before_markup(&mut self, target: &mut W) -> Result<()> { if self.config.perform_indent && !self.wrote_text() && (self.indent_level > 0 || self.wrote_markup()) { let indent_level = self.indent_level; self.write_newline(target, indent_level)?; if self.indent_level > 0 && !self.config.indent_string.is_empty() { self.after_markup(); } } Ok(()) } fn after_markup(&mut 
self) { self.set_wrote_markup(); } fn before_start_element(&mut self, target: &mut W) -> Result<()> { self.before_markup(target)?; self.indent_stack.push(IndentFlags::WroteNothing); Ok(()) } fn after_start_element(&mut self) { self.after_markup(); self.indent_level += 1; } fn before_end_element(&self, target: &mut W) -> Result<()> { if self.config.perform_indent && self.indent_level > 0 && self.wrote_markup() && !self.wrote_text() { let indent_level = self.indent_level; self.write_newline(target, indent_level - 1) } else { Ok(()) } } fn after_end_element(&mut self) { if self.indent_level > 0 { self.indent_level -= 1; self.indent_stack.pop(); } self.set_wrote_markup(); } fn after_text(&mut self) { self.set_wrote_text(); } pub fn emit_start_document(&mut self, target: &mut W, version: XmlVersion, encoding: &str, standalone: Option) -> Result<()> { if self.start_document_emitted { return Err(EmitterError::DocumentStartAlreadyEmitted); } self.start_document_emitted = true; self.before_markup(target)?; let result = { let mut write = move || { write!(target, "")?; Ok(()) }; write() }; self.after_markup(); result } fn check_document_started(&mut self, target: &mut W) -> Result<()> { if !self.start_document_emitted && self.config.write_document_declaration { self.emit_start_document(target, common::XmlVersion::Version10, "UTF-8", None) } else { Ok(()) } } fn fix_non_empty_element(&mut self, target: &mut W) -> Result<()> { if self.config.normalize_empty_elements && self.just_wrote_start_element { self.just_wrote_start_element = false; target.write_all(b">").map_err(From::from) } else { Ok(()) } } pub fn emit_processing_instruction(&mut self, target: &mut W, name: &str, data: Option<&str>) -> Result<()> { self.check_document_started(target)?; self.fix_non_empty_element(target)?; self.before_markup(target)?; let result = { let mut write = move || { write!(target, "")?; Ok(()) }; write() }; self.after_markup(); result } #[track_caller] fn emit_start_element_initial(&mut self, 
target: &mut W, name: Name<'_>, attributes: &[Attribute<'_>]) -> Result<()> where W: Write { self.check_document_started(target)?; self.fix_non_empty_element(target)?; self.before_start_element(target)?; write!(target, "<{}", name.repr_display())?; self.emit_current_namespace_attributes(target)?; self.emit_attributes(target, attributes)?; self.after_start_element(); Ok(()) } #[track_caller] pub fn emit_start_element(&mut self, target: &mut W, name: Name<'_>, attributes: &[Attribute<'_>]) -> Result<()> where W: Write { if self.config.keep_element_names_stack { self.element_names.push(name.to_owned()); } self.emit_start_element_initial(target, name, attributes)?; self.just_wrote_start_element = true; if !self.config.normalize_empty_elements { write!(target, ">")?; } Ok(()) } #[track_caller] pub fn emit_current_namespace_attributes(&self, target: &mut W) -> Result<()> where W: Write { for (prefix, uri) in self.nst.peek() { match prefix { // internal namespaces are not emitted NS_XMLNS_PREFIX | NS_XML_PREFIX => Ok(()), //// there is already a namespace binding with this prefix in scope //prefix if self.nst.get(prefix) == Some(uri) => Ok(()), // emit xmlns only if it is overridden NS_NO_PREFIX => if uri == NS_EMPTY_URI { Ok(()) } else { write!(target, " xmlns=\"{uri}\"") }, // everything else prefix => write!(target, " xmlns:{prefix}=\"{uri}\""), }?; } Ok(()) } pub fn emit_attributes(&self, target: &mut W, attributes: &[Attribute<'_>]) -> Result<()> { for attr in attributes { write!(target, " {}=\"", attr.name.repr_display())?; if self.config.perform_escaping { write!(target, "{}", Escaped::::new(attr.value))?; } else { write!(target, "{}", attr.value)?; } write!(target, "\"")?; } Ok(()) } pub fn emit_end_element(&mut self, target: &mut W, name: Option>) -> Result<()> { let owned_name = if self.config.keep_element_names_stack { Some(self.element_names.pop().ok_or(EmitterError::LastElementNameNotAvailable)?) 
} else { None }; // Check that last started element name equals to the provided name, if there are both if let Some(ref last_name) = owned_name { if let Some(ref name) = name { if last_name.borrow() != *name { return Err(EmitterError::EndElementNameIsNotEqualToLastStartElementName); } } } if let Some(name) = owned_name.as_ref().map(|n| n.borrow()).or(name) { if self.config.normalize_empty_elements && self.just_wrote_start_element { self.just_wrote_start_element = false; let termination = if self.config.pad_self_closing { " />" } else { "/>" }; let result = target.write_all(termination.as_bytes()).map_err(From::from); self.after_end_element(); result } else { self.just_wrote_start_element = false; self.before_end_element(target)?; let result = write!(target, "", name.repr_display()).map_err(From::from); self.after_end_element(); result } } else { Err(EmitterError::EndElementNameIsNotSpecified) } } pub fn emit_cdata(&mut self, target: &mut W, content: &str) -> Result<()> { self.fix_non_empty_element(target)?; if self.config.cdata_to_characters { self.emit_characters(target, content) } else { target.write_all(b"") { let chunk_safe = chunk.strip_suffix("]]>"); let emit_escaped = chunk_safe.is_some(); target.write_all(chunk_safe.unwrap_or(chunk).as_bytes())?; if emit_escaped { target.write_all(b"]]]]>")?; } } target.write_all(b"]]>")?; self.after_text(); Ok(()) } } pub fn emit_characters(&mut self, target: &mut W, content: &str) -> Result<()> { self.check_document_started(target)?; self.fix_non_empty_element(target)?; if self.config.perform_escaping { write!(target, "{}", Escaped::::new(content))?; } else { target.write_all(content.as_bytes())?; } self.after_text(); Ok(()) } pub fn emit_raw_characters(&mut self, target: &mut W, content: &str) -> Result<()> { self.check_document_started(target)?; self.fix_non_empty_element(target)?; target.write_all(content.as_bytes())?; self.after_text(); Ok(()) } pub fn emit_comment(&mut self, target: &mut W, content: &str) -> 
Result<()> { self.fix_non_empty_element(target)?; // TODO: add escaping dashes at the end of the comment let autopad_comments = self.config.autopad_comments; let write = move |target: &mut W| -> Result<()> { target.write_all(b"")?; Ok(()) }; self.before_markup(target)?; let result = write(target); self.after_markup(); result } } xml-1.2.0/src/writer/events.rs000064400000000000000000000231111046102023000143770ustar 00000000000000//! Contains `XmlEvent` datatype, instances of which are consumed by the writer. use std::borrow::Cow; use crate::attribute::Attribute; use crate::common::XmlVersion; use crate::name::Name; use crate::namespace::{Namespace, NS_NO_PREFIX}; use crate::reader::ErrorKind; /// A part of an XML output stream. /// /// Objects of this enum are consumed by `EventWriter`. They correspond to different parts of /// an XML document. #[derive(Debug, Clone)] #[non_exhaustive] pub enum XmlEvent<'a> { /// Corresponds to XML document declaration. /// /// This event should always be written before any other event. If it is not written /// at all, a default XML declaration will be outputted if the corresponding option /// is set in the configuration. Otherwise an error will be returned. StartDocument { /// XML version. /// /// Defaults to `XmlVersion::Version10`. version: XmlVersion, /// XML document encoding. /// /// Defaults to `Some("UTF-8")`. encoding: Option<&'a str>, /// XML standalone declaration. /// /// Defaults to `None`. standalone: Option, }, /// Denotes an XML processing instruction. ProcessingInstruction { /// Processing instruction target. name: &'a str, /// Processing instruction content. data: Option<&'a str>, }, /// Denotes a beginning of an XML element. StartElement { /// Qualified name of the element. name: Name<'a>, /// A list of attributes associated with the element. /// /// Currently attributes are not checked for duplicates (TODO). 
Attribute values /// will be escaped, and all characters invalid for attribute values like `"` or `<` /// will be changed into character entities. attributes: Cow<'a, [Attribute<'a>]>, /// Contents of the namespace mapping at this point of the document. /// /// This mapping will be inspected for "new" entries, and if at this point of the document /// a particular pair of prefix and namespace URI is already defined, no namespace /// attributes will be emitted. namespace: Cow<'a, Namespace>, }, /// Denotes an end of an XML element. EndElement { /// Optional qualified name of the element. /// /// If `None`, then it is assumed that the element name should be the last valid one. /// If `Some` and element names tracking is enabled, then the writer will check it for /// correctness. name: Option>, }, /// Denotes CDATA content. /// /// This event contains unparsed data, and no escaping will be performed when writing it /// to the output stream. CData(&'a str), /// Denotes a comment. /// /// The string will be checked for invalid sequences and error will be returned by the /// write operation Comment(&'a str), /// Denotes character data outside of tags. /// /// Contents of this event will be escaped if `perform_escaping` option is enabled, /// that is, every character invalid for PCDATA will appear as a character entity. Characters(&'a str), /// Emits raw characters which will never be escaped. /// /// This event is only used for writing to an output stream, there is no equivalent /// reader event. Care must be taken when using this event, as it can easily result /// non-well-formed documents. RawCharacters(&'a str), /// Syntax of the `DOCTYPE`, everyhing including `<` and `>` Doctype(&'a str), } impl<'a> XmlEvent<'a> { /// Returns an writer event for a processing instruction. #[inline] #[must_use] pub const fn processing_instruction(name: &'a str, data: Option<&'a str>) -> Self { XmlEvent::ProcessingInstruction { name, data } } /// Returns a builder for a starting element. 
/// /// This builder can then be used to tweak attributes and namespace starting at /// this element. #[inline] pub fn start_element(name: S) -> StartElementBuilder<'a> where S: Into> { StartElementBuilder { name: name.into(), attributes: Vec::new(), namespace: Namespace::empty(), } } /// Returns a builder for an closing element. /// /// This method, unline `start_element()`, does not accept a name because by default /// the writer is able to determine it automatically. However, when this functionality /// is disabled, it is possible to specify the name with `name()` method on the builder. #[inline] #[must_use] pub const fn end_element() -> EndElementBuilder<'a> { EndElementBuilder { name: None } } /// Returns a CDATA event. /// /// Naturally, the provided string won't be escaped, except for closing CDATA token `]]>` /// (depending on the configuration). #[inline] #[must_use] pub const fn cdata(data: &'a str) -> Self { XmlEvent::CData(data) } /// Returns a regular characters (PCDATA) event. /// /// All offending symbols, in particular, `&` and `<`, will be escaped by the writer. #[inline] #[must_use] pub const fn characters(data: &'a str) -> Self { XmlEvent::Characters(data) } /// Returns a raw characters event. /// /// No escaping takes place. /// This event is only used for writing to an output stream, there is no equivalent /// reader event. Care must be taken when using this event, as it can easily result /// non-well-formed documents. #[inline] #[must_use] pub const fn raw_characters(data: &'a str) -> Self { XmlEvent::RawCharacters(data) } /// Returns a comment event. #[inline] #[must_use] pub const fn comment(data: &'a str) -> Self { XmlEvent::Comment(data) } } impl<'a> From<&'a str> for XmlEvent<'a> { #[inline] fn from(s: &'a str) -> Self { XmlEvent::Characters(s) } } /// A builder for a closing element event. pub struct EndElementBuilder<'a> { name: Option>, } /// A builder for a closing element event. 
impl<'a> EndElementBuilder<'a> { /// Sets the name of this closing element. /// /// Usually the writer is able to determine closing element names automatically. If /// this functionality is enabled (by default it is), then this name is checked for correctness. /// It is possible, however, to disable such behavior; then the user must ensure that /// closing element name is correct manually. #[inline] #[must_use] pub fn name(mut self, name: N) -> Self where N: Into> { self.name = Some(name.into()); self } } impl<'a> From> for XmlEvent<'a> { fn from(b: EndElementBuilder<'a>) -> Self { XmlEvent::EndElement { name: b.name } } } /// A builder for a starting element event. pub struct StartElementBuilder<'a> { name: Name<'a>, attributes: Vec>, namespace: Namespace, } impl<'a> StartElementBuilder<'a> { /// Sets an attribute value of this element to the given string. /// /// This method can be used to add attributes to the starting element. Name is a qualified /// name; its namespace is ignored, but its prefix is checked for correctness, that is, /// it is checked that the prefix is bound to some namespace in the current context. /// /// Currently attributes are not checked for duplicates. Note that duplicate attributes /// are a violation of XML document well-formedness. /// /// The writer checks that you don't specify reserved prefix names, for example `xmlns`. #[inline] #[must_use] pub fn attr(mut self, name: N, value: &'a str) -> Self where N: Into> { self.attributes.push(Attribute::new(name.into(), value)); self } /// Adds a namespace to the current namespace context. /// /// If no namespace URI was bound to the provided prefix at this point of the document, /// then the mapping from the prefix to the provided namespace URI will be written as /// a part of this element attribute set. /// /// If the same namespace URI was bound to the provided prefix at this point of the document, /// then no namespace attributes will be emitted. 
/// /// If some other namespace URI was bound to the provided prefix at this point of the document, /// then another binding will be added as a part of this element attribute set, shadowing /// the outer binding. #[inline] #[must_use] pub fn ns(mut self, prefix: S1, uri: S2) -> Self where S1: Into, S2: Into { self.namespace.put(prefix, uri); self } /// Adds a default namespace mapping to the current namespace context. /// /// Same rules as for `ns()` are also valid for the default namespace mapping. #[inline] #[must_use] pub fn default_ns(mut self, uri: S) -> Self where S: Into { self.namespace.put(NS_NO_PREFIX, uri); self } } impl<'a> From> for XmlEvent<'a> { #[inline] fn from(b: StartElementBuilder<'a>) -> Self { XmlEvent::StartElement { name: b.name, attributes: Cow::Owned(b.attributes), namespace: Cow::Owned(b.namespace), } } } impl<'a> TryFrom<&'a crate::reader::XmlEvent> for XmlEvent<'a> { type Error = crate::reader::Error; fn try_from(event: &crate::reader::XmlEvent) -> Result, Self::Error> { Ok(event.as_writer_event().ok_or(ErrorKind::UnexpectedEof)?) } } xml-1.2.0/src/writer.rs000064400000000000000000000077061046102023000131070ustar 00000000000000//! Contains high-level interface for an events-based XML emitter. //! //! The most important type in this module is `EventWriter` which allows writing an XML document //! to some output stream. pub use self::config::EmitterConfig; pub use self::emitter::{EmitterError as Error, Result}; pub use self::events::XmlEvent; use self::emitter::Emitter; use std::io::prelude::*; mod config; mod emitter; pub mod events; /// A wrapper around an `std::io::Write` instance which emits XML document according to provided /// events. pub struct EventWriter { sink: W, emitter: Emitter, } impl EventWriter { /// Creates a new `EventWriter` out of an `std::io::Write` instance using the default /// configuration. 
#[inline] pub fn new(sink: W) -> Self { Self::new_with_config(sink, EmitterConfig::new()) } /// Creates a new `EventWriter` out of an `std::io::Write` instance using the provided /// configuration. #[inline] pub fn new_with_config(sink: W, config: EmitterConfig) -> Self { Self { sink, emitter: Emitter::new(config), } } /// Writes the next piece of XML document according to the provided event. /// /// Note that output data may not exactly correspond to the written event because /// of various configuration options. For example, `XmlEvent::EndElement` may /// correspond to a separate closing element or it may cause writing an empty element. /// Another example is that `XmlEvent::CData` may be represented as characters in /// the output stream. pub fn write<'a, E>(&mut self, event: E) -> Result<()> where E: Into> { match event.into() { XmlEvent::StartDocument { version, encoding, standalone } => self.emitter.emit_start_document(&mut self.sink, version, encoding.unwrap_or("UTF-8"), standalone), XmlEvent::ProcessingInstruction { name, data } => self.emitter.emit_processing_instruction(&mut self.sink, name, data), XmlEvent::StartElement { name, attributes, namespace } => { self.emitter.namespace_stack_mut().push_empty().checked_target().extend(namespace.as_ref()); self.emitter.emit_start_element(&mut self.sink, name, &attributes) }, XmlEvent::EndElement { name } => { let r = self.emitter.emit_end_element(&mut self.sink, name); self.emitter.namespace_stack_mut().try_pop(); r }, XmlEvent::Comment(content) => self.emitter.emit_comment(&mut self.sink, content), XmlEvent::CData(content) => self.emitter.emit_cdata(&mut self.sink, content), XmlEvent::Characters(content) => self.emitter.emit_characters(&mut self.sink, content), XmlEvent::RawCharacters(content) => self.emitter.emit_raw_characters(&mut self.sink, content), XmlEvent::Doctype(content) => self.emitter.emit_raw_characters(&mut self.sink, content), } } /// Returns a mutable reference to the underlying `Writer`. 
/// /// Note that having a reference to the underlying sink makes it very easy to emit invalid XML /// documents. Use this method with care. Valid use cases for this method include accessing /// methods like `Write::flush`, which do not emit new data but rather change the state /// of the stream itself. pub fn inner_mut(&mut self) -> &mut W { &mut self.sink } /// Returns an immutable reference to the underlying `Writer`. pub fn inner_ref(&self) -> &W { &self.sink } /// Unwraps this `EventWriter`, returning the underlying writer. /// /// Note that this is a destructive operation: unwrapping a writer and then wrapping /// it again with `EventWriter::new()` will create a fresh writer whose state will be /// blank; for example, accumulated namespaces will be reset. pub fn into_inner(self) -> W { self.sink } }