logos-codegen-0.14.0/.cargo_vcs_info.json0000644000000001530000000000100136420ustar { "git": { "sha1": "171422c2c3ccc708c12b3f33c165319718b34882" }, "path_in_vcs": "logos-codegen" }logos-codegen-0.14.0/Cargo.toml0000644000000026540000000000100116500ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.65.0" name = "logos-codegen" version = "0.14.0" authors = [ "Maciej Hirsz ", "Jérome Eertmans (maintainer) ", ] description = "Create ridiculously fast Lexers" homepage = "https://logos.maciej.codes/" readme = "README.md" keywords = [ "lexer", "lexical", "tokenizer", "parser", "no_std", ] categories = [ "parsing", "text-processing", ] license = "MIT OR Apache-2.0" repository = "https://github.com/maciejhirsz/logos" [package.metadata.release] shared-version = true [lib] bench = false [dependencies.beef] version = "0.5.0" [dependencies.fnv] version = "1.0.6" [dependencies.lazy_static] version = "1.4.0" [dependencies.proc-macro2] version = "1.0.9" [dependencies.quote] version = "1.0.3" [dependencies.regex-syntax] version = "0.8.2" [dependencies.syn] version = "2.0.13" features = ["full"] [dev-dependencies.pretty_assertions] version = "1.4.0" logos-codegen-0.14.0/Cargo.toml.orig000064400000000000000000000011551046102023000153240ustar 00000000000000[dependencies] beef = "0.5.0" fnv = "1.0.6" lazy_static = "1.4.0" proc-macro2 = "1.0.9" quote = "1.0.3" regex-syntax = "0.8.2" syn = { version = "2.0.13", features = ["full"] } [dev-dependencies] pretty_assertions = "1.4.0" [lib] bench = false [package] name = 
"logos-codegen" authors.workspace = true categories.workspace = true description.workspace = true edition.workspace = true homepage.workspace = true keywords.workspace = true license.workspace = true readme.workspace = true repository.workspace = true rust-version.workspace = true version.workspace = true [package.metadata.release] shared-version = true logos-codegen-0.14.0/LICENSE-APACHE000064400000000000000000000251541046102023000143660ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright 2018 Maciej Hirsz Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. logos-codegen-0.14.0/LICENSE-MIT000064400000000000000000000021201046102023000140620ustar 00000000000000Copyright (c) 2018 Maciej Hirsz The MIT License (MIT) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
logos-codegen-0.14.0/README.md000064400000000000000000000067211046102023000137200ustar 00000000000000Logos logo # Logos ![Test](https://github.com/maciejhirsz/logos/workflows/Test/badge.svg?branch=master) [![Crates.io version shield](https://img.shields.io/crates/v/logos.svg)](https://crates.io/crates/logos2) [![Docs](https://docs.rs/logos2/badge.svg)](https://docs.rs/logos2) [![Crates.io license shield](https://img.shields.io/crates/l/logos.svg)](https://crates.io/crates/logos2) _Create ridiculously fast Lexers._ **Logos** has two goals: + To make it easy to create a Lexer, so you can focus on more complex problems. + To make the generated Lexer faster than anything you'd write by hand. To achieve those, **Logos**: + Combines all token definitions into a single [deterministic state machine](https://en.wikipedia.org/wiki/Deterministic_finite_automaton). + Optimizes branches into [lookup tables](https://en.wikipedia.org/wiki/Lookup_table) or [jump tables](https://en.wikipedia.org/wiki/Branch_table). + Prevents [backtracking](https://en.wikipedia.org/wiki/ReDoS) inside token definitions. + [Unwinds loops](https://en.wikipedia.org/wiki/Loop_unrolling), and batches reads to minimize bounds checking. + Does all of that heavy lifting at compile time. ## Example ```rust use logos::Logos; #[derive(Logos, Debug, PartialEq)] #[logos(skip r"[ \t\n\f]+")] // Ignore this regex pattern between tokens enum Token { // Tokens can be literal strings, of any length. #[token("fast")] Fast, #[token(".")] Period, // Or regular expressions. 
#[regex("[a-zA-Z]+")] Text, } fn main() { let mut lex = Token::lexer("Create ridiculously fast Lexers."); assert_eq!(lex.next(), Some(Ok(Token::Text))); assert_eq!(lex.span(), 0..6); assert_eq!(lex.slice(), "Create"); assert_eq!(lex.next(), Some(Ok(Token::Text))); assert_eq!(lex.span(), 7..19); assert_eq!(lex.slice(), "ridiculously"); assert_eq!(lex.next(), Some(Ok(Token::Fast))); assert_eq!(lex.span(), 20..24); assert_eq!(lex.slice(), "fast"); assert_eq!(lex.next(), Some(Ok(Token::Text))); assert_eq!(lex.slice(), "Lexers"); assert_eq!(lex.span(), 25..31); assert_eq!(lex.next(), Some(Ok(Token::Period))); assert_eq!(lex.span(), 31..32); assert_eq!(lex.slice(), "."); assert_eq!(lex.next(), None); } ``` For more examples and documentation, please refer to the [Logos handbook](https://maciejhirsz.github.io/logos/) or the [crate documentation](https://docs.rs/logos2/latest/logos/). ## How fast? Ridiculously fast! ```norust test identifiers ... bench: 647 ns/iter (+/- 27) = 1204 MB/s test keywords_operators_and_punctators ... bench: 2,054 ns/iter (+/- 78) = 1037 MB/s test strings ... bench: 553 ns/iter (+/- 34) = 1575 MB/s ``` ## Acknowledgements + [Pedrors](https://pedrors.pt/) for the **Logos** logo. ## Thank you **Logos** is very much a labor of love. If you find it useful, consider [getting me some coffee](https://github.com/sponsors/maciejhirsz). ☕ If you'd like to contribute to Logos, then consider reading the [Contributing guide](https://maciejhirsz.github.io/logos/contributing). ## License This code is distributed under the terms of both the MIT license and the Apache License (Version 2.0), choose whatever works for you. See [LICENSE-APACHE](LICENSE-APACHE) and [LICENSE-MIT](LICENSE-MIT) for details. 
logos-codegen-0.14.0/src/error.rs000064400000000000000000000042261046102023000147250ustar 00000000000000use std::fmt; use beef::lean::Cow; use proc_macro2::{Span, TokenStream}; use quote::quote; use quote::{quote_spanned, ToTokens, TokenStreamExt}; pub type Result = std::result::Result; #[derive(Default)] pub struct Errors { collected: Vec, } impl Errors { pub fn err(&mut self, message: M, span: Span) -> &mut Self where M: Into>, { self.collected.push(SpannedError { message: message.into(), span, }); self } pub fn render(self) -> Option { let errors = self.collected; match errors.len() { 0 => None, _ => Some(quote! { fn _logos_derive_compile_errors() { #(#errors)* } }), } } } pub struct Error(Cow<'static, str>); #[derive(Debug)] pub struct SpannedError { message: Cow<'static, str>, span: Span, } impl Error { pub fn new(message: M) -> Self where M: Into>, { Error(message.into()) } pub fn span(self, span: Span) -> SpannedError { SpannedError { message: self.0, span, } } } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { self.0.fmt(f) } } impl fmt::Debug for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fmt::Display::fmt(self, f) } } impl From for Error { fn from(err: regex_syntax::Error) -> Error { Error(err.to_string().into()) } } impl From<&'static str> for Error { fn from(err: &'static str) -> Error { Error(err.into()) } } impl From for Error { fn from(err: String) -> Error { Error(err.into()) } } impl From for Cow<'static, str> { fn from(err: Error) -> Self { err.0 } } impl ToTokens for SpannedError { fn to_tokens(&self, tokens: &mut TokenStream) { let message = &*self.message; tokens.append_all(quote_spanned!(self.span => { compile_error!(#message) })) } } logos-codegen-0.14.0/src/generator/context.rs000064400000000000000000000070321046102023000172440ustar 00000000000000use proc_macro2::TokenStream; use quote::quote; use crate::generator::Generator; use crate::graph::NodeId; /// This struct keeps track of 
bytes available to be read without /// bounds checking across the tree. /// /// For example, a branch that matches 4 bytes followed by a fork /// with smallest branch containing of 2 bytes can do a bounds check /// for 6 bytes ahead, and leave the remaining 2 byte array (fixed size) /// to be handled by the fork, avoiding bound checks there. #[derive(Default, Clone, Copy, PartialEq, Eq, Hash, Debug)] pub struct Context { /// Amount of bytes that haven't been bumped yet but should /// before a new read is performed at: usize, /// Number of bytes available without bound checks available: usize, /// Whether or not the Lexer has been bumped at least by 1 byte bumped: bool, /// Node to backtrack to to in case an explicit match has failed. /// If `None` will instead produce an error token. backtrack: Option, } impl Context { pub fn can_backtrack(&self) -> bool { self.backtrack.is_some() } pub fn switch(&mut self, miss: Option) -> Option { self.backtrack = Some(miss?); self.bump() } pub const fn advance(self, n: usize) -> Self { Context { at: self.at + n, ..self } } pub fn bump(&mut self) -> Option { match self.at { 0 => None, n => { let tokens = quote!(lex.bump_unchecked(#n);); self.at = 0; self.available = 0; self.bumped = true; Some(tokens) } } } pub fn remainder(&self) -> usize { self.available.saturating_sub(self.at) } pub fn read_unchecked(&mut self, len: usize) -> TokenStream { let at = self.at; match len { 0 => { self.advance(1); quote!(lex.read_unchecked::(#at)) } l => { self.advance(l); quote!(lex.read_unchecked::<&[u8; #l]>(#at)) } } } pub fn read(&mut self, len: usize) -> TokenStream { self.available = len; match (self.at, len) { (0, 0) => quote!(lex.read::()), (a, 0) => quote!(lex.read_at::(#a)), (0, l) => quote!(lex.read::<&[u8; #l]>()), (a, l) => quote!(lex.read_at::<&[u8; #l]>(#a)), } } pub fn wipe(&mut self) { self.available = 0; } const fn backtrack(self) -> Self { Context { at: 0, available: 0, bumped: self.bumped, backtrack: None, } } pub fn miss(mut 
self, miss: Option, gen: &mut Generator) -> TokenStream { self.wipe(); match (miss, self.backtrack) { (Some(id), _) => gen.goto(id, self).clone(), (_, Some(id)) => gen.goto(id, self.backtrack()).clone(), _ if self.bumped => quote!(lex.error()), _ => quote!(_error(lex)), } } pub fn write_suffix(&self, buf: &mut String) { use std::fmt::Write; if self.at > 0 { let _ = write!(buf, "_at{}", self.at); } if self.available > 0 { let _ = write!(buf, "_with{}", self.available); } if let Some(id) = self.backtrack { let _ = write!(buf, "_ctx{}", id); } if self.bumped { buf.push_str("_x"); } } } logos-codegen-0.14.0/src/generator/fork.rs000064400000000000000000000150651046102023000165260ustar 00000000000000use std::cmp::max; use fnv::FnvHashMap as Map; use proc_macro2::TokenStream; use quote::quote; use crate::generator::{Context, Generator}; use crate::graph::{Fork, NodeId, Range}; use crate::util::ToIdent; type Targets = Map>; impl<'a> Generator<'a> { pub fn generate_fork(&mut self, this: NodeId, fork: &Fork, mut ctx: Context) -> TokenStream { let mut targets: Targets = Map::default(); for (range, then) in fork.branches() { targets.entry(then).or_default().push(range); } let loops_to_self = self.meta[this].loop_entry_from.contains(&this); match targets.len() { 1 if loops_to_self => return self.generate_fast_loop(fork, ctx), 0..=2 => (), _ => return self.generate_fork_jump_table(this, fork, targets, ctx), } let miss = ctx.miss(fork.miss, self); let end = self.fork_end(this, &miss); let (byte, read) = self.fork_read(this, end, &mut ctx); let branches = targets.into_iter().map(|(id, ranges)| { let next = self.goto(id, ctx.advance(1)); match *ranges { [range] => { quote!(#range => #next,) } [a, b] if a.is_byte() && b.is_byte() => { quote!(#a | #b => #next,) } _ => { let test = self.generate_test(ranges).clone(); let next = self.goto(id, ctx.advance(1)); quote!(byte if #test(byte) => #next,) } } }); quote! 
{ #read match #byte { #(#branches)* _ => #miss, } } } fn generate_fork_jump_table( &mut self, this: NodeId, fork: &Fork, targets: Targets, mut ctx: Context, ) -> TokenStream { let miss = ctx.miss(fork.miss, self); let end = self.fork_end(this, &miss); let (byte, read) = self.fork_read(this, end, &mut ctx); let mut table: [u8; 256] = [0; 256]; let mut jumps = vec!["__".to_ident()]; let branches = targets .into_iter() .enumerate() .map(|(idx, (id, ranges))| { let idx = (idx as u8) + 1; let next = self.goto(id, ctx.advance(1)); jumps.push(format!("J{}", id).to_ident()); for byte in ranges.into_iter().flatten() { table[byte as usize] = idx; } let jump = jumps.last().unwrap(); quote!(Jump::#jump => #next,) }) .collect::(); let jumps = &jumps; let table = table.iter().copied().map(|idx| &jumps[idx as usize]); quote! { enum Jump { #(#jumps,)* } const LUT: [Jump; 256] = { use Jump::*; [#(#table),*] }; #read match LUT[#byte as usize] { #branches Jump::__ => #miss, } } } fn fork_end(&self, this: NodeId, miss: &TokenStream) -> TokenStream { if this == self.root { quote!(_end(lex)) } else { miss.clone() } } fn fork_read( &self, this: NodeId, end: TokenStream, ctx: &mut Context, ) -> (TokenStream, TokenStream) { let min_read = self.meta[this].min_read; if ctx.remainder() >= max(min_read, 1) { let read = ctx.read_unchecked(0); return (quote!(byte), quote!(let byte = unsafe { #read };)); } match min_read { 0 | 1 => { let read = ctx.read(0); ( quote!(byte), quote! { let byte = match #read { Some(byte) => byte, None => return #end, }; }, ) } len => { let read = ctx.read(len); ( quote!(arr[0]), quote! { let arr = match #read { Some(arr) => arr, None => return #end, }; }, ) } } } fn generate_fast_loop(&mut self, fork: &Fork, ctx: Context) -> TokenStream { let miss = ctx.miss(fork.miss, self); let ranges = fork.branches().map(|(range, _)| range).collect::>(); let test = self.generate_test(ranges); quote! 
{ _fast_loop!(lex, #test, #miss); } } pub fn fast_loop_macro() -> TokenStream { quote! { macro_rules! _fast_loop { ($lex:ident, $test:ident, $miss:expr) => { // Do one bounds check for multiple bytes till EOF while let Some(arr) = $lex.read::<&[u8; 16]>() { if $test(arr[0]) { if $test(arr[1]) { if $test(arr[2]) { if $test(arr[3]) { if $test(arr[4]) { if $test(arr[5]) { if $test(arr[6]) { if $test(arr[7]) { if $test(arr[8]) { if $test(arr[9]) { if $test(arr[10]) { if $test(arr[11]) { if $test(arr[12]) { if $test(arr[13]) { if $test(arr[14]) { if $test(arr[15]) { $lex.bump_unchecked(16); continue; } $lex.bump_unchecked(15); return $miss; } $lex.bump_unchecked(14); return $miss; } $lex.bump_unchecked(13); return $miss; } $lex.bump_unchecked(12); return $miss; } $lex.bump_unchecked(11); return $miss; } $lex.bump_unchecked(10); return $miss; } $lex.bump_unchecked(9); return $miss; } $lex.bump_unchecked(8); return $miss; } $lex.bump_unchecked(7); return $miss; } $lex.bump_unchecked(6); return $miss; } $lex.bump_unchecked(5); return $miss; } $lex.bump_unchecked(4); return $miss; } $lex.bump_unchecked(3); return $miss; } $lex.bump_unchecked(2); return $miss; } $lex.bump_unchecked(1); return $miss; } return $miss; } while $lex.test($test) { $lex.bump_unchecked(1); } $miss }; } } } } logos-codegen-0.14.0/src/generator/leaf.rs000064400000000000000000000033421046102023000164670ustar 00000000000000use proc_macro2::TokenStream; use quote::quote; use crate::generator::{Context, Generator}; use crate::leaf::{Callback, Leaf}; use crate::util::MaybeVoid; impl<'a> Generator<'a> { pub fn generate_leaf(&mut self, leaf: &Leaf, mut ctx: Context) -> TokenStream { let bump = ctx.bump(); let ident = &leaf.ident; let name = self.name; let this = self.this; let ty = &leaf.field; let constructor = match leaf.field { MaybeVoid::Some(_) => quote!(#name::#ident), MaybeVoid::Void => quote!(|()| #name::#ident), }; match &leaf.callback { Some(Callback::Label(callback)) => quote! 
{ #bump #callback(lex).construct(#constructor, lex); }, Some(Callback::Inline(inline)) => { let arg = &inline.arg; let body = &inline.body; quote! { #bump #[inline] fn callback<'s>(#arg: &mut Lexer<'s>) -> impl CallbackResult<'s, #ty, #this> { #body } callback(lex).construct(#constructor, lex); } } Some(Callback::Skip(_)) => { quote! { #bump lex.trivia(); #name::lex(lex); } } None if matches!(leaf.field, MaybeVoid::Void) => quote! { #bump lex.set(Ok(#name::#ident)); }, None => quote! { #bump let token = #name::#ident(lex.slice()); lex.set(Ok(token)); }, } } } logos-codegen-0.14.0/src/generator/mod.rs000064400000000000000000000202251046102023000163360ustar 00000000000000use fnv::{FnvHashMap as Map, FnvHashSet as Set}; use proc_macro2::TokenStream; use quote::{quote, ToTokens, TokenStreamExt}; use syn::Ident; use crate::graph::{Graph, Meta, Node, NodeId, Range}; use crate::leaf::Leaf; use crate::util::ToIdent; mod context; mod fork; mod leaf; mod rope; mod tables; use self::context::Context; use self::tables::TableStack; pub struct Generator<'a> { /// Name of the type we are implementing the `Logos` trait for name: &'a Ident, /// Name of the type with any generics it might need this: &'a TokenStream, /// Id to the root node root: NodeId, /// Reference to the graph with all of the nodes graph: &'a Graph>, /// Meta data collected for the nodes meta: Meta, /// Buffer with functions growing during generation rendered: TokenStream, /// Set of functions that have already been rendered fns: Set<(NodeId, Context)>, /// Function name identifiers idents: Map<(NodeId, Context), Ident>, /// Local function calls. Note: a call might change its context, /// so we can't use `idents` for this purpose. 
gotos: Map<(NodeId, Context), TokenStream>, /// Identifiers for helper functions matching a byte to a given /// set of ranges tests: Map, Ident>, /// Related to above, table stack manages tables that need to be tables: TableStack, } impl<'a> Generator<'a> { pub fn new( name: &'a Ident, this: &'a TokenStream, root: NodeId, graph: &'a Graph, ) -> Self { let rendered = Self::fast_loop_macro(); let meta = Meta::analyze(root, graph); Generator { name, this, root, graph, meta, rendered, fns: Set::default(), idents: Map::default(), gotos: Map::default(), tests: Map::default(), tables: TableStack::new(), } } pub fn generate(mut self) -> TokenStream { let root = self.goto(self.root, Context::default()).clone(); let rendered = &self.rendered; let tables = &self.tables; quote! { #tables #rendered #root } } fn generate_fn(&mut self, id: NodeId, ctx: Context) { if self.fns.contains(&(id, ctx)) { return; } self.fns.insert((id, ctx)); let body = match &self.graph[id] { Node::Fork(fork) => self.generate_fork(id, fork, ctx), Node::Rope(rope) => self.generate_rope(rope, ctx), Node::Leaf(leaf) => self.generate_leaf(leaf, ctx), }; let ident = self.generate_ident(id, ctx); let out = quote! { #[inline] fn #ident<'s>(lex: &mut Lexer<'s>) { #body } }; self.rendered.append_all(out); } fn goto(&mut self, id: NodeId, mut ctx: Context) -> &TokenStream { let key = (id, ctx); // Allow contains_key + insert because self.generate_ident borrows a mutable ref to self // too. 
#[allow(clippy::map_entry)] if !self.gotos.contains_key(&key) { let meta = &self.meta[id]; let enters_loop = !meta.loop_entry_from.is_empty(); let bump = if enters_loop || !ctx.can_backtrack() { ctx.switch(self.graph[id].miss()) } else { None }; let bump = match (bump, enters_loop, meta.min_read) { (Some(t), _, _) => Some(t), (None, true, _) => ctx.bump(), (None, false, 0) => ctx.bump(), (None, false, _) => None, }; if meta.min_read == 0 || ctx.remainder() < meta.min_read { ctx.wipe(); } let ident = self.generate_ident(id, ctx); let mut call_site = quote!(#ident(lex)); if let Some(bump) = bump { call_site = quote!({ #bump #call_site }); } self.gotos.insert(key, call_site); self.generate_fn(id, ctx); } &self.gotos[&key] } fn generate_ident(&mut self, id: NodeId, ctx: Context) -> &Ident { self.idents.entry((id, ctx)).or_insert_with(|| { let mut ident = format!("goto{}", id); ctx.write_suffix(&mut ident); ident.to_ident() }) } /// Returns an identifier to a function that matches a byte to any /// of the provided ranges. This will generate either a simple /// match expression, or use a lookup table internally. fn generate_test(&mut self, ranges: Vec) -> &Ident { if !self.tests.contains_key(&ranges) { let idx = self.tests.len(); let ident = format!("pattern{}", idx).to_ident(); let lo = ranges.first().unwrap().start; let hi = ranges.last().unwrap().end; let body = match ranges.len() { 0..=2 => { quote! { match byte { #(#ranges)|* => true, _ => false, } } } _ if hi - lo < 64 => { let mut offset = hi.saturating_sub(63); while offset.count_ones() > 1 && lo - offset > 0 { offset += 1; } let mut table = 0u64; for byte in ranges.iter().flat_map(|range| *range) { if byte - offset >= 64 { panic!("{:#?} {} {} {}", ranges, hi, lo, offset); } table |= 1 << (byte - offset); } let search = match offset { 0 => quote!(byte), _ => quote!(byte.wrapping_sub(#offset)), }; quote! 
{ const LUT: u64 = #table; match 1u64.checked_shl(#search as u32) { Some(shift) => LUT & shift != 0, None => false, } } } _ => { let mut view = self.tables.view(); for byte in ranges.iter().flat_map(|range| *range) { view.flag(byte); } let mask = view.mask(); let lut = view.ident(); quote! { #lut[byte as usize] & #mask > 0 } } }; self.rendered.append_all(quote! { #[inline] fn #ident(byte: u8) -> bool { #body } }); self.tests.insert(ranges.clone(), ident); } &self.tests[&ranges] } } macro_rules! match_quote { ($source:expr; $($byte:tt,)* ) => {match $source { $( $byte => quote!($byte), )* byte => quote!(#byte), }} } fn byte_to_tokens(byte: u8) -> TokenStream { match_quote! { byte; b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W', b'X', b'Y', b'Z', b'!', b'@', b'#', b'$', b'%', b'^', b'&', b'*', b'(', b')', b'{', b'}', b'[', b']', b'<', b'>', b'-', b'=', b'_', b'+', b':', b';', b',', b'.', b'/', b'?', b'|', b'"', b'\'', b'\\', } } impl ToTokens for Range { fn to_tokens(&self, tokens: &mut TokenStream) { let Range { start, end } = self; tokens.append_all(byte_to_tokens(*start)); if start != end { tokens.append_all(quote!(..=)); tokens.append_all(byte_to_tokens(*end)); } } } logos-codegen-0.14.0/src/generator/rope.rs000064400000000000000000000020411046102023000165200ustar 00000000000000use proc_macro2::TokenStream; use quote::quote; use crate::generator::{Context, Generator}; use crate::graph::Rope; impl<'a> Generator<'a> { pub fn generate_rope(&mut self, rope: &Rope, mut ctx: Context) -> TokenStream { let miss = ctx.miss(rope.miss.first(), self); let read = ctx.read(rope.pattern.len()); let then = self.goto(rope.then, ctx.advance(rope.pattern.len())); let pat = match 
rope.pattern.to_bytes() {
            // The whole pattern is a fixed byte sequence: render it as a
            // single byte-string literal for a compact match arm.
            Some(bytes) => byte_slice_literal(&bytes),
            // At least one step is a byte range: render a slice pattern
            // with one (possibly ranged) element per byte read.
            None => {
                let ranges = rope.pattern.iter();

                quote!([#(#ranges),*])
            }
        };

        quote! {
            match #read {
                Some(#pat) => #then,
                _ => #miss,
            }
        }
    }
}

/// Renders `bytes` as a `b"..."` byte-string literal when every byte is
/// printable ASCII (`0x20..0x7F`), otherwise falls back to an explicit
/// `&[..]` slice of numeric byte literals.
fn byte_slice_literal(bytes: &[u8]) -> TokenStream {
    if bytes.iter().any(|&b| !(0x20..0x7F).contains(&b)) {
        return quote!(&[#(#bytes),*]);
    }

    let slice = std::str::from_utf8(bytes).unwrap();

    syn::parse_str(&format!("b{:?}", slice)).unwrap()
}
logos-codegen-0.14.0/src/generator/tables.rs000064400000000000000000000032751046102023000170370ustar 00000000000000use crate::util::ToIdent;

use proc_macro2::{Literal, TokenStream};
use quote::{quote, ToTokens};
use syn::Ident;

/// A stack of 256-entry lookup tables. Each table packs up to 8 boolean
/// byte-sets, one per bit, so several byte-class tests can share storage.
pub struct TableStack {
    // Each entry is a generated table together with the identifier of the
    // `static` it is rendered under (see the `ToTokens` impl below).
    tables: Vec<(Ident, [u8; 256])>,
    // Number of bit-planes already claimed in the newest table (0 means no
    // `view` was ever taken; 8 means the table is full).
    shift: u8,
}

/// A single bit-plane view into one table of a `TableStack`.
pub struct TableView<'a> {
    ident: &'a Ident,
    table: &'a mut [u8; 256],
    mask: u8,
}

impl TableStack {
    pub fn new() -> Self {
        TableStack {
            tables: vec![("COMPACT_TABLE_0".to_ident(), [0; 256])],
            shift: 0,
        }
    }

    /// Claims the next free bit-plane, allocating a fresh table once all
    /// 8 bits of the current one are taken.
    pub fn view(&mut self) -> TableView {
        let mask = if self.shift < 8 {
            // Reusing existing table with a shifted mask
            let mask = 1u8 << self.shift;
            self.shift += 1;
            mask
        } else {
            // Need to create a new table
            let ident = format!("COMPACT_TABLE_{}", self.tables.len()).to_ident();
            self.tables.push((ident, [0; 256]));
            self.shift = 1;
            1
        };
        // `tables` is never empty (see `new`), so `last_mut` can't fail.
        let (ref ident, ref mut table) = self.tables.last_mut().unwrap();
        TableView { ident, table, mask }
    }
}

impl<'a> TableView<'a> {
    /// Identifier of the generated `static` this view's bits live in.
    pub fn ident(&self) -> &'a Ident {
        self.ident
    }

    /// Marks `byte` as a member of this view's byte-set.
    pub fn flag(&mut self, byte: u8) {
        self.table[byte as usize] |= self.mask;
    }

    /// The bit mask selecting this view's plane, as an unsuffixed literal.
    pub fn mask(&self) -> Literal {
        Literal::u8_unsuffixed(self.mask)
    }
}

impl ToTokens for TableStack {
    fn to_tokens(&self, out: &mut TokenStream) {
        // `shift == 0` means no view was ever taken: nothing to render.
        if self.shift == 0 {
            return;
        }
        for (ident, table) in self.tables.iter() {
            let bytes = table.iter().copied().map(Literal::u8_unsuffixed);

            out.extend(quote!
{ static #ident: [u8; 256] = [#(#bytes),*]; }); } } } logos-codegen-0.14.0/src/graph/fork.rs000064400000000000000000000155221046102023000156370ustar 00000000000000use crate::graph::{Disambiguate, Graph, NodeId, Range}; #[derive(Clone)] pub struct Fork { /// LUT matching byte -> node id lut: Box<[Option; 256]>, /// State to go to if no arms are matching pub miss: Option, } impl Fork { pub fn new() -> Self { Fork { lut: Box::new([None; 256]), miss: None, } } pub fn miss(mut self, miss: M) -> Self where M: Into>, { self.miss = miss.into(); self } pub fn add_branch(&mut self, range: R, then: NodeId, graph: &mut Graph) where R: Into, T: Disambiguate, { for byte in range.into() { match &mut self.lut[byte as usize] { Some(other) if *other != then => { *other = graph.merge(*other, then); } opt => *opt = Some(then), } } } // TODO: Add result with a printable error pub fn merge(&mut self, other: Fork, graph: &mut Graph) where T: Disambiguate, { self.miss = match (self.miss, other.miss) { (None, None) => None, (Some(id), None) | (None, Some(id)) => Some(id), (Some(a), Some(b)) => Some(graph.merge(a, b)), }; for (left, right) in self.lut.iter_mut().zip(other.lut.iter()) { *left = match (*left, *right) { (None, None) => continue, (Some(id), None) | (None, Some(id)) => Some(id), (Some(a), Some(b)) => Some(graph.merge(a, b)), } } } pub fn branches(&self) -> ForkIter<'_> { ForkIter { offset: 0, lut: &self.lut, } } /// Checks if all bytes in the `range` have a branch on this /// fork, and those branches are resolve to the same `NodeId`. pub fn contains(&self, range: R) -> Option where R: Into, { let mut range = range.into(); let byte = range.next()?; let first = self.lut[byte as usize]?; for byte in range { if first != self.lut[byte as usize]? 
{
                return None;
            }
        }

        Some(first)
    }

    /// Sets a branch for every byte in `range` to `then`, returning `self`
    /// for chaining. Unlike `add_branch`, a conflicting existing target is
    /// a hard error: this is a construction helper, so an overlap is a bug.
    ///
    /// NOTE(review): the archived copy lost angle-bracketed spans to
    /// HTML-tag stripping; the generic parameters below (`<R>`,
    /// `Into<Range>`, `<T>`, `Graph<T>`, `Option<NodeId>`,
    /// `Option<Self::Item>`) are restored to match the surrounding usage.
    pub fn branch<R>(mut self, range: R, then: NodeId) -> Self
    where
        R: Into<Range>,
    {
        for byte in range.into() {
            match &mut self.lut[byte as usize] {
                Some(other) if *other != then => {
                    panic!("Overlapping branches");
                }
                opt => *opt = Some(then),
            }
        }
        self
    }

    /// Marks every node reachable from this fork (its miss node and all
    /// branch targets) in `filter`, recursing through `graph`. Used by
    /// `Graph::shake` to find and drop unreferenced nodes.
    pub fn shake<T>(&self, graph: &Graph<T>, filter: &mut [bool]) {
        if let Some(id) = self.miss {
            if !filter[id.get()] {
                filter[id.get()] = true;
                graph[id].shake(graph, filter);
            }
        }

        for (_, id) in self.branches() {
            if !filter[id.get()] {
                filter[id.get()] = true;
                graph[id].shake(graph, filter);
            }
        }
    }
}

/// Iterator over a fork's lookup table that coalesces consecutive bytes
/// pointing at the same node into a single `(Range, NodeId)` arm.
pub struct ForkIter<'a> {
    offset: usize,
    lut: &'a [Option<NodeId>; 256],
}

impl<'a> Iterator for ForkIter<'a> {
    type Item = (Range, NodeId);

    fn next(&mut self) -> Option<Self::Item> {
        // Consume empty slots
        self.offset += self.lut[self.offset..]
            .iter()
            .take_while(|next| next.is_none())
            .count();

        // `get` doubles as the end-of-table check: past index 255 it
        // yields `None` and the iterator terminates.
        let then = self.lut.get(self.offset).copied().flatten()?;
        let start = self.offset;

        // Consume all slots with same NodeId target
        self.offset += self.lut[self.offset..]
.iter() .take_while(|next| **next == Some(then)) .count(); Some(( Range { start: start as u8, end: (self.offset - 1) as u8, }, then, )) } } #[cfg(test)] mod tests { use super::*; use crate::graph::Node; use pretty_assertions::assert_eq; #[test] fn fork_iter() { let mut buf = [None; 256]; for byte in b'4'..=b'7' { buf[byte as usize] = Some(NodeId::new(1)); } for byte in b'a'..=b'd' { buf[byte as usize] = Some(NodeId::new(2)); } let iter = ForkIter { offset: 0, lut: &buf, }; assert_eq!( &[ ( Range { start: b'4', end: b'7' }, NodeId::new(1) ), ( Range { start: b'a', end: b'd' }, NodeId::new(2) ), ], &*iter.collect::>(), ); } #[test] fn merge_no_conflict() { let mut graph = Graph::new(); let leaf1 = graph.push(Node::Leaf("FOO")); let leaf2 = graph.push(Node::Leaf("BAR")); let mut fork = Fork::new().branch(b'1', leaf1); fork.merge(Fork::new().branch(b'2', leaf2), &mut graph); assert_eq!(fork, Fork::new().branch(b'1', leaf1).branch(b'2', leaf2)); } #[test] fn merge_miss_right() { let mut graph = Graph::new(); let leaf1 = graph.push(Node::Leaf("FOO")); let leaf2 = graph.push(Node::Leaf("BAR")); let mut fork = Fork::new().branch(b'1', leaf1); fork.merge(Fork::new().miss(leaf2), &mut graph); assert_eq!(fork, Fork::new().branch(b'1', leaf1).miss(leaf2)); } #[test] fn merge_miss_left() { let mut graph = Graph::new(); let leaf1 = graph.push(Node::Leaf("FOO")); let leaf2 = graph.push(Node::Leaf("BAR")); let mut fork = Fork::new().miss(leaf1); fork.merge(Fork::new().branch(b'2', leaf2), &mut graph); assert_eq!(fork, Fork::new().branch(b'2', leaf2).miss(leaf1)); } #[test] fn contains_byte() { let fork = Fork::new().branch('a'..='z', NodeId::new(42)); assert_eq!(fork.contains(b't'), Some(NodeId::new(42))); } #[test] fn contains_range() { let fork = Fork::new() .branch('a'..='m', NodeId::new(42)) .branch('n'..='z', NodeId::new(42)); assert_eq!(fork.contains('i'..='r'), Some(NodeId::new(42))); assert_eq!(fork.contains('a'..='z'), Some(NodeId::new(42))); } #[test] fn 
contains_different_ranges() { let fork = Fork::new() .branch('a'..='m', NodeId::new(42)) .branch('n'..='z', NodeId::new(47)); assert_eq!(fork.contains('i'..='r'), None); assert_eq!(fork.contains('a'..='z'), None); assert_eq!(fork.contains('d'..='f'), Some(NodeId::new(42))); assert_eq!(fork.contains('n'..='p'), Some(NodeId::new(47))); } } logos-codegen-0.14.0/src/graph/impls.rs000064400000000000000000000131201046102023000160120ustar 00000000000000use std::fmt::{self, Debug, Display}; use std::hash::{Hash, Hasher}; use crate::graph::{Fork, Graph, Node, NodeId, Range, Rope}; impl From for Node { fn from(fork: Fork) -> Self { Node::Fork(fork) } } impl From for Node { fn from(rope: Rope) -> Self { Node::Rope(rope) } } fn is_ascii(byte: u8) -> bool { (0x20..0x7F).contains(&byte) } impl Hash for Fork { fn hash(&self, state: &mut H) { for branch in self.branches() { branch.hash(state); } self.miss.hash(state); } } impl Hash for Node { fn hash(&self, state: &mut H) { match self { Node::Rope(rope) => { b"ROPE".hash(state); rope.hash(state); } Node::Fork(fork) => { b"FORK".hash(state); fork.hash(state); } Node::Leaf(_) => b"LEAF".hash(state), } } } impl Debug for NodeId { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { Debug::fmt(&self.0, f) } } /// We don't need debug impls in release builds // #[cfg(test)] mod debug { use super::*; use crate::graph::rope::Miss; use crate::graph::Disambiguate; use std::cmp::{Ord, Ordering}; impl Disambiguate for &str { fn cmp(left: &&str, right: &&str) -> Ordering { Ord::cmp(left, right) } } impl Debug for Range { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let Range { start, end } = *self; if start != end || !is_ascii(start) { f.write_str("[")?; } match is_ascii(start) { true => write!(f, "{}", start as char), false => write!(f, "{:02X}", start), }?; if start != end { match is_ascii(end) { true => write!(f, "-{}]", end as char), false => write!(f, "-{:02X}]", end), }?; } else if !is_ascii(start) { f.write_str("]")?; } 
Ok(()) } } impl Display for Range { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { ::fmt(self, f) } } impl Debug for Graph { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let entries = self .nodes() .iter() .enumerate() .filter_map(|(i, n)| n.as_ref().map(|n| (i, n))); f.debug_map().entries(entries).finish() } } struct Arm(T, U); impl Debug for Arm where T: Display, U: Display, { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{} ⇒ {}", self.0, self.1) } } impl Debug for Fork { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut list = f.debug_set(); for (range, then) in self.branches() { list.entry(&Arm(range, then)); } if let Some(id) = self.miss { list.entry(&Arm('_', id)); } list.finish() } } impl Display for Miss { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Miss::First(id) => Display::fmt(id, f), Miss::Any(id) => write!(f, "{}*", id), Miss::None => f.write_str("n/a"), } } } impl Debug for Rope { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use std::fmt::Write; let mut rope = String::with_capacity(self.pattern.len()); for range in self.pattern.iter() { write!(rope, "{}", range)?; } match self.miss.is_none() { false => { let mut list = f.debug_list(); list.entry(&Arm(rope, self.then)); list.entry(&Arm('_', self.miss)); list.finish() } true => Arm(rope, self.then).fmt(f), } } } impl PartialEq for Fork { fn eq(&self, other: &Self) -> bool { self.miss == other.miss && self.branches().eq(other.branches()) } } impl Debug for Node { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Node::Fork(fork) => fork.fmt(f), Node::Rope(rope) => rope.fmt(f), Node::Leaf(leaf) => leaf.fmt(f), } } } use std::ops::RangeInclusive; impl From> for Range { fn from(range: RangeInclusive) -> Range { Range { start: *range.start(), end: *range.end(), } } } impl From> for Range { fn from(range: RangeInclusive) -> Range { Range { start: *range.start() as u8, end: *range.end() as u8, } } } impl 
PartialEq for Node { fn eq(&self, other: &Rope) -> bool { match self { Node::Rope(rope) => rope == other, _ => false, } } } impl PartialEq for Node { fn eq(&self, other: &Fork) -> bool { match self { Node::Fork(fork) => fork == other, _ => false, } } } } logos-codegen-0.14.0/src/graph/meta.rs000064400000000000000000000112001046102023000156110ustar 00000000000000use std::cmp::min; use std::collections::BTreeMap; use std::ops::{Index, IndexMut}; use crate::graph::{Graph, Node, NodeId}; #[derive(Debug)] pub struct Meta { map: BTreeMap, } #[derive(Debug, Default)] pub struct MetaItem { /// Number of references to this node pub refcount: usize, /// Minimum number of bytes that ought to be read for this /// node to find a match pub min_read: usize, /// Marks whether or not this node leads to a loop entry node. pub is_loop_init: bool, /// Ids of other nodes that point to this node while this /// node is on a stack (creating a loop) pub loop_entry_from: Vec, } impl Index for Meta { type Output = MetaItem; fn index(&self, id: NodeId) -> &MetaItem { &self.map[&id] } } impl IndexMut for Meta { fn index_mut(&mut self, id: NodeId) -> &mut MetaItem { self.map.entry(id).or_default() } } impl MetaItem { fn loop_entry(&mut self, id: NodeId) { if let Err(idx) = self.loop_entry_from.binary_search(&id) { self.loop_entry_from.insert(idx, id); } } } impl Meta { pub fn analyze(root: NodeId, graph: &Graph) -> Self { let mut meta = Meta { map: Default::default(), }; meta.first_pass(root, root, graph, &mut Vec::new()); meta } pub fn first_pass( &mut self, this: NodeId, parent: NodeId, graph: &Graph, stack: &mut Vec, ) -> &MetaItem { let meta = &mut self[this]; let is_done = meta.refcount > 0; meta.refcount += 1; if stack.contains(&this) { meta.loop_entry(parent); self[parent].is_loop_init = true; } if is_done { return &self[this]; } stack.push(this); let mut min_read; match &graph[this] { Node::Fork(fork) => { min_read = usize::max_value(); for (_, id) in fork.branches() { let meta = 
self.first_pass(id, this, graph, stack); if meta.is_loop_init { min_read = 1; } else { min_read = min(min_read, meta.min_read + 1); } } if let Some(id) = fork.miss { let meta = self.first_pass(id, this, graph, stack); if meta.is_loop_init { min_read = 0; } else { min_read = min(min_read, meta.min_read); } } if min_read == usize::max_value() { min_read = 0; } } Node::Rope(rope) => { min_read = rope.pattern.len(); let meta = self.first_pass(rope.then, this, graph, stack); if !meta.is_loop_init { min_read += meta.min_read; } if let Some(id) = rope.miss.first() { let meta = self.first_pass(id, this, graph, stack); if meta.is_loop_init { min_read = 0; } else { min_read = min(min_read, meta.min_read); } } } Node::Leaf(_) => min_read = 0, } stack.pop(); let meta = &mut self[this]; meta.min_read = min_read; let second_pass = meta.loop_entry_from.clone(); for id in second_pass { self.meta_second_pass(id, graph); } &self[this] } fn meta_second_pass(&mut self, id: NodeId, graph: &Graph) { let mut min_read; match &graph[id] { Node::Fork(fork) => { min_read = usize::max_value(); for (_, id) in fork.branches() { let meta = &self[id]; if meta.is_loop_init { min_read = 1; } else { min_read = min(min_read, meta.min_read + 1); } } if min_read == usize::max_value() { min_read = 0; } } Node::Rope(rope) => { min_read = rope.pattern.len(); let meta = &self[rope.then]; if !meta.is_loop_init { min_read += meta.min_read; } } Node::Leaf(_) => unreachable!(), } self[id].min_read = min_read; } } logos-codegen-0.14.0/src/graph/mod.rs000064400000000000000000000404321046102023000154530ustar 00000000000000use std::cmp::Ordering; use std::collections::btree_map::Entry; use std::collections::BTreeMap as Map; use std::hash::{Hash, Hasher}; use std::num::NonZeroU32; use std::ops::Index; use fnv::FnvHasher; mod fork; mod impls; mod meta; mod range; mod regex; mod rope; pub use self::fork::Fork; pub use self::meta::Meta; pub use self::range::Range; pub use self::rope::Rope; /// Disambiguation error 
during the attempt to merge two leaf /// nodes with the same priority #[derive(Debug)] pub struct DisambiguationError(pub NodeId, pub NodeId); pub struct Graph { /// Internal storage of all allocated nodes. Once a node is /// put here, it should never be mutated. nodes: Vec>>, /// When merging two nodes into a new node, we store the two /// entry keys and the result, so that we don't merge the same /// two nodes multiple times. /// /// Most of the time the entry we want to find will be the last /// one that has been inserted, so we can use a vec with reverse /// order search to get O(1) searches much faster than any *Map. merges: Map, /// Another map used for accounting. Before `.push`ing a new node /// onto the graph (inserts are exempt), we hash it and find if /// an identical(!) node has been created before. hashes: Map, /// Instead of handling errors over return types, opt to collect /// them internally. errors: Vec, /// Deferred merges. When when attempting to merge a node with an /// empty reserved slot, the merging operation is deferred until /// the reserved slot is populated. This is a stack that keeps track /// of all such deferred merges deferred: Vec, } /// Trait to be implemented on `Leaf` nodes in order to disambiguate /// between them. pub trait Disambiguate { fn cmp(left: &Self, right: &Self) -> Ordering; } /// Id of a Node in the graph. `NodeId` can be referencing an empty /// slot that is going to be populated later in time. #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct NodeId(NonZeroU32); impl NodeId { fn get(self) -> usize { self.0.get() as usize } fn new(n: usize) -> NodeId { NodeId(NonZeroU32::new(n as u32).expect("Invalid NodeId")) } } /// Unique reserved `NodeId` that is guaranteed to point to an /// empty allocated slot in the graph. It's safe to create multiple /// `NodeId` copies of `ReservedId`, however API users should never /// be able to clone a `ReservedId`, or create a new one from `NodeId`. 
/// /// `ReservedId` is consumed once passed into `Graph::insert`. #[derive(Debug)] pub struct ReservedId(NodeId); impl ReservedId { pub fn get(&self) -> NodeId { self.0 } } /// Merge key used to lookup whether two nodes have been previously /// mered, so we can avoid duplicating merges, potentially running into /// loops that blow out the stack. /// /// `Merge::new(a, b)` should always equal to `Merge::new(b, a)` to ensure /// that node merges are symmetric. #[derive(PartialEq, Eq, PartialOrd, Ord, Debug)] struct Merge(NodeId, NodeId); impl Merge { fn new(a: NodeId, b: NodeId) -> Self { if a < b { Merge(a, b) } else { Merge(b, a) } } } /// When attempting to merge two nodes, one of which was not yet created, /// we can record such attempt, and execute the merge later on when the /// `awaiting` has been `insert`ed into the graph. #[derive(Debug)] pub struct DeferredMerge { awaiting: NodeId, with: NodeId, into: ReservedId, } impl Graph { pub fn new() -> Self { Graph { // Start with an empty slot so we can start // counting NodeIds from 1 and use NonZero // optimizations nodes: vec![None], merges: Map::new(), hashes: Map::new(), errors: Vec::new(), deferred: Vec::new(), } } pub fn errors(&self) -> &[DisambiguationError] { &self.errors } fn next_id(&self) -> NodeId { NodeId::new(self.nodes.len()) } /// Reserve an empty slot for a node on the graph and return an /// id for it. `ReservedId` cannot be cloned, and must be consumed /// by calling `insert` on the graph. pub fn reserve(&mut self) -> ReservedId { let id = self.next_id(); self.nodes.push(None); ReservedId(id) } /// Insert a node at a given, previously reserved id. Returns the /// inserted `NodeId`. 
pub fn insert(&mut self, reserved: ReservedId, node: N) -> NodeId where N: Into>, Leaf: Disambiguate, { let id = reserved.get(); self.nodes[id.get()] = Some(node.into()); let mut awaiting = Vec::new(); // Partition out all `DeferredMerge`s that can be completed // now that this `ReservedId` has a `Node` inserted into it. for idx in (0..self.deferred.len()).rev() { if self.deferred[idx].awaiting == id { awaiting.push(self.deferred.remove(idx)); } } // Complete deferred merges. We've collected them from the back, // so we must iterate through them from the back as well to restore // proper order of merges in case there is some cascading going on. for DeferredMerge { awaiting, with, into, } in awaiting.into_iter().rev() { self.merge_unchecked(awaiting, with, into); } id } /// Push a node onto the graph and get an id to it. If an identical /// node has already been pushed on the graph, it will return the id /// of that node instead. pub fn push(&mut self, node: B) -> NodeId where B: Into>, { let node = node.into(); if let Node::Leaf(_) = node { return self.push_unchecked(node); } let mut hasher = FnvHasher::default(); node.hash(&mut hasher); let next_id = self.next_id(); match self.hashes.entry(hasher.finish()) { Entry::Occupied(occupied) => { let id = *occupied.get(); if self[id].eq(&node) { return id; } } Entry::Vacant(vacant) => { vacant.insert(next_id); } } self.push_unchecked(node) } fn push_unchecked(&mut self, node: Node) -> NodeId { let id = self.next_id(); self.nodes.push(Some(node)); id } /// If nodes `a` and `b` have been already merged, return the /// `NodeId` of the node they have been merged into. fn find_merged(&self, a: NodeId, b: NodeId) -> Option { let probe = Merge::new(a, b); self.merges.get(&probe).copied() } /// Mark that nodes `a` and `b` have been merged into `product`. /// /// This will also mark merging `a` and `product`, as well as /// `b` and `product` into `product`, since those are symmetric /// operations. 
/// /// This is necessary to break out asymmetric merge loops. fn set_merged(&mut self, a: NodeId, b: NodeId, product: NodeId) { self.merges.insert(Merge::new(a, b), product); self.merges.insert(Merge::new(a, product), product); self.merges.insert(Merge::new(b, product), product); } /// Merge the nodes at id `a` and `b`, returning a new id. pub fn merge(&mut self, a: NodeId, b: NodeId) -> NodeId where Leaf: Disambiguate, { if a == b { return a; } // If the id pair is already merged (or is being merged), just return the id if let Some(id) = self.find_merged(a, b) { return id; } match (self.get(a), self.get(b)) { (None, None) => { panic!( "Merging two reserved nodes! This is a bug, please report it:\n\ \n\ https://github.com/maciejhirsz/logos/issues" ); } (None, Some(_)) => { let reserved = self.reserve(); let id = reserved.get(); self.deferred.push(DeferredMerge { awaiting: a, with: b, into: reserved, }); self.set_merged(a, b, id); return id; } (Some(_), None) => { let reserved = self.reserve(); let id = reserved.get(); self.deferred.push(DeferredMerge { awaiting: b, with: a, into: reserved, }); self.set_merged(a, b, id); return id; } (Some(Node::Leaf(left)), Some(Node::Leaf(right))) => { return match Disambiguate::cmp(left, right) { Ordering::Less => b, Ordering::Greater => a, Ordering::Equal => { self.errors.push(DisambiguationError(a, b)); a } }; } _ => (), } // Reserve the id for the merge and save it. Since the graph can contain loops, // this prevents us from trying to merge the same id pair in a loop, blowing up // the stack. let reserved = self.reserve(); self.set_merged(a, b, reserved.get()); self.merge_unchecked(a, b, reserved) } /// Unchecked merge of `a` and `b`. This fn assumes that `a` and `b` are /// not pointing to empty slots. 
fn merge_unchecked(&mut self, a: NodeId, b: NodeId, reserved: ReservedId) -> NodeId where Leaf: Disambiguate, { let merged_rope = match (self.get(a), self.get(b)) { (Some(Node::Rope(rope)), _) => { let rope = rope.clone(); self.merge_rope(rope, b) } (_, Some(Node::Rope(rope))) => { let rope = rope.clone(); self.merge_rope(rope, a) } _ => None, }; if let Some(rope) = merged_rope { return self.insert(reserved, rope); } let mut fork = self.fork_off(a); fork.merge(self.fork_off(b), self); let mut stack = vec![reserved.get()]; // Flatten the fork while let Some(miss) = fork.miss { if stack.contains(&miss) { break; } stack.push(miss); let other = match self.get(miss) { Some(Node::Fork(other)) => other.clone(), Some(Node::Rope(other)) => other.clone().into_fork(self), _ => break, }; match other.miss { Some(id) if self.get(id).is_none() => break, _ => (), } fork.miss = None; fork.merge(other, self); } self.insert(reserved, fork) } fn merge_rope(&mut self, rope: Rope, other: NodeId) -> Option where Leaf: Disambiguate, { match self.get(other) { Some(Node::Fork(fork)) if rope.miss.is_none() => { // Count how many consecutive ranges in this rope would // branch into the fork that results in a loop. 
// // e.g.: for rope "foobar" and a looping fork [a-z]: 6 let count = rope .pattern .iter() .take_while(|range| fork.contains(**range) == Some(other)) .count(); let mut rope = rope.split_at(count, self)?.miss_any(other); rope.then = self.merge(rope.then, other); Some(rope) } Some(Node::Rope(other)) => { let (prefix, miss) = rope.prefix(other)?; let (a, b) = (rope, other.clone()); let a = a.remainder(prefix.len(), self); let b = b.remainder(prefix.len(), self); let rope = Rope::new(prefix, self.merge(a, b)).miss(miss); Some(rope) } Some(Node::Leaf(_)) | None => { if rope.miss.is_none() { Some(rope.miss(other)) } else { None } } _ => None, } } pub fn fork_off(&mut self, id: NodeId) -> Fork where Leaf: Disambiguate, { match self.get(id) { Some(Node::Fork(fork)) => fork.clone(), Some(Node::Rope(rope)) => rope.clone().into_fork(self), Some(Node::Leaf(_)) | None => Fork::new().miss(id), } } pub fn nodes(&self) -> &[Option>] { &self.nodes } /// Find all nodes that have no references and remove them. pub fn shake(&mut self, root: NodeId) { let mut filter = vec![false; self.nodes.len()]; filter[root.get()] = true; self[root].shake(self, &mut filter); for (id, referenced) in filter.into_iter().enumerate() { if !referenced { self.nodes[id] = None; } } } pub fn get(&self, id: NodeId) -> Option<&Node> { self.nodes.get(id.get())?.as_ref() } } impl Index for Graph { type Output = Node; fn index(&self, id: NodeId) -> &Node { self.get(id).expect( "Indexing into an empty node. 
This is a bug, please report it at:\n\ \n\ https://github.com/maciejhirsz/logos/issues", ) } } impl std::fmt::Display for NodeId { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { std::fmt::Display::fmt(&self.0, f) } } #[cfg_attr(test, derive(PartialEq))] pub enum Node { /// Fork node, can lead to more than one state Fork(Fork), /// Rope node, can lead to one state on match, one state on miss Rope(Rope), /// Leaf node, terminal state Leaf(Leaf), } impl Node { pub fn miss(&self) -> Option { match self { Node::Rope(rope) => rope.miss.first(), Node::Fork(fork) => fork.miss, Node::Leaf(_) => None, } } fn eq(&self, other: &Node) -> bool { match (self, other) { (Node::Fork(a), Node::Fork(b)) => a == b, (Node::Rope(a), Node::Rope(b)) => a == b, _ => false, } } fn shake(&self, graph: &Graph, filter: &mut [bool]) { match self { Node::Fork(fork) => fork.shake(graph, filter), Node::Rope(rope) => rope.shake(graph, filter), Node::Leaf(_) => (), } } pub fn unwrap_leaf(&self) -> &Leaf { match self { Node::Fork(_) => panic!("Internal Error: called unwrap_leaf on a fork"), Node::Rope(_) => panic!("Internal Error: called unwrap_leaf on a rope"), Node::Leaf(leaf) => leaf, } } } #[cfg(test)] mod tests { use super::*; use pretty_assertions::assert_eq; #[test] fn leaf_stack_size() { use std::mem::size_of; const WORD: usize = size_of::(); const NODE: usize = size_of::>(); assert!(NODE <= 6 * WORD, "Size of Node<()> is {} bytes!", NODE); } #[test] fn create_a_loop() { let mut graph = Graph::new(); let token = graph.push(Node::Leaf("IDENT")); let id = graph.reserve(); let fork = Fork::new().branch('a'..='z', id.get()).miss(token); let root = graph.insert(id, fork); assert_eq!(graph[token], Node::Leaf("IDENT")); assert_eq!(graph[root], Fork::new().branch('a'..='z', root).miss(token),); } #[test] fn fork_off() { let mut graph = Graph::new(); let leaf = graph.push(Node::Leaf("LEAF")); let rope = graph.push(Rope::new("rope", leaf)); let fork = 
graph.push(Fork::new().branch(b'!', leaf)); assert_eq!(graph.fork_off(leaf), Fork::new().miss(leaf)); assert_eq!( graph.fork_off(rope), Fork::new().branch(b'r', NodeId::new(graph.nodes.len() - 1)) ); assert_eq!(graph.fork_off(fork), Fork::new().branch(b'!', leaf)); } } logos-codegen-0.14.0/src/graph/range.rs000064400000000000000000000056721046102023000157770ustar 00000000000000use regex_syntax::hir::ClassBytesRange; use regex_syntax::hir::ClassUnicodeRange; use regex_syntax::utf8::Utf8Range; use std::cmp::{Ord, Ordering}; #[derive(Clone, Copy, PartialEq, Eq, Hash)] pub struct Range { pub start: u8, pub end: u8, } impl Range { pub fn as_byte(&self) -> Option { if self.is_byte() { Some(self.start) } else { None } } pub fn is_byte(&self) -> bool { self.start == self.end } } impl From for Range { fn from(byte: u8) -> Range { Range { start: byte, end: byte, } } } impl From<&u8> for Range { fn from(byte: &u8) -> Range { Range::from(*byte) } } impl Iterator for Range { type Item = u8; fn next(&mut self) -> Option { match self.start.cmp(&self.end) { std::cmp::Ordering::Less => { let res = self.start; self.start += 1; Some(res) } std::cmp::Ordering::Equal => { let res = self.start; // Necessary so that range 0xFF-0xFF doesn't loop forever self.start = 0xFF; self.end = 0x00; Some(res) } std::cmp::Ordering::Greater => None, } } } impl PartialOrd for Range { fn partial_cmp(&self, other: &Range) -> Option { Some(self.cmp(other)) } } impl Ord for Range { fn cmp(&self, other: &Self) -> Ordering { self.start.cmp(&other.start) } } impl From for Range { fn from(r: Utf8Range) -> Range { Range { start: r.start, end: r.end, } } } impl From for Range { fn from(r: ClassUnicodeRange) -> Range { let start = r.start() as u32; let end = r.end() as u32; if start >= 128 || end >= 128 && end != 0x0010FFFF { panic!("Casting non-ascii ClassUnicodeRange to Range") } Range { start: start as u8, end: end as u8, } } } impl From for Range { fn from(r: ClassBytesRange) -> Range { Range { start: 
r.start(), end: r.end(), } } } #[cfg(test)] mod tests { use super::*; #[test] fn range_iter_one() { let byte = Range::from(b'!'); let collected = byte.take(1000).collect::>(); assert_eq!(b"!", &collected[..]); } #[test] fn range_iter_few() { let byte = Range { start: b'a', end: b'd', }; let collected = byte.take(1000).collect::>(); assert_eq!(b"abcd", &collected[..]); } #[test] fn range_iter_bounds() { let byte = Range::from(0xFA..=0xFF); let collected = byte.take(1000).collect::>(); assert_eq!(b"\xFA\xFB\xFC\xFD\xFE\xFF", &collected[..]); } } logos-codegen-0.14.0/src/graph/regex.rs000064400000000000000000000172641046102023000160150ustar 00000000000000use std::fmt::Debug; use regex_syntax::utf8::Utf8Sequences; use crate::graph::{Disambiguate, Fork, Graph, Node, NodeId, Range, ReservedId, Rope}; use crate::mir::{Class, ClassUnicode, Literal, Mir}; impl Graph { pub fn regex(&mut self, mir: Mir, then: NodeId) -> NodeId { self.parse_mir(mir, then, None, None) } fn parse_mir( &mut self, mir: Mir, then: NodeId, miss: Option, reserved: Option, ) -> NodeId { match mir { Mir::Empty => then, Mir::Loop(mir) => { let miss = match miss { Some(id) => self.merge(id, then), None => then, }; let this = match reserved { Some(rid) => rid, None => self.reserve(), }; self.parse_mir(*mir, this.get(), Some(miss), Some(this)) } Mir::Maybe(mir) => { let miss = match miss { Some(id) => self.merge(id, then), None => then, }; self.parse_mir(*mir, then, Some(miss), reserved) } Mir::Alternation(alternation) => { let mut fork = Fork::new().miss(miss); for mir in alternation { let id = self.parse_mir(mir, then, None, None); let alt = self.fork_off(id); fork.merge(alt, self); } self.insert_or_push(reserved, fork) } Mir::Literal(literal) => { let pattern = literal.0.to_vec(); self.insert_or_push(reserved, Rope::new(pattern, then).miss(miss)) } Mir::Concat(mut concat) => { // We'll be writing from the back, so need to allocate enough // space here. 
Worst case scenario is all unicode codepoints // producing 4 byte utf8 sequences let mut ropebuf = vec![Range::from(0); concat.len() * 4]; let mut cur = ropebuf.len(); let mut end = ropebuf.len(); let mut then = then; let mut handle_bytes = |graph: &mut Self, mir, then: &mut NodeId| match mir { Mir::Literal(Literal(bytes)) => { cur -= bytes.len(); for (i, byte) in bytes.iter().enumerate() { ropebuf[cur + i] = byte.into(); } None } Mir::Class(Class::Unicode(class)) if is_one_ascii(&class) => { cur -= 1; ropebuf[cur] = class.ranges()[0].into(); None } Mir::Class(Class::Bytes(class)) if class.ranges().len() == 1 => { cur -= 1; ropebuf[cur] = class.ranges()[0].into(); None } mir => { if end > cur { let rope = Rope::new(&ropebuf[cur..end], *then); *then = graph.push(rope); end = cur; } Some(mir) } }; for mir in concat.drain(1..).rev() { if let Some(mir) = handle_bytes(self, mir, &mut then) { then = self.parse_mir(mir, then, None, None); } } match handle_bytes(self, concat.remove(0), &mut then) { None => { let rope = Rope::new(&ropebuf[cur..end], then).miss(miss); self.insert_or_push(reserved, rope) } Some(mir) => self.parse_mir(mir, then, miss, reserved), } } Mir::Class(Class::Unicode(class)) if !is_ascii(&class) => { let mut ropes = class .iter() .flat_map(|range| Utf8Sequences::new(range.start(), range.end())) .map(|sequence| Rope::new(sequence.as_slice(), then)) .collect::>(); if ropes.len() == 1 { let rope = ropes.remove(0); return self.insert_or_push(reserved, rope.miss(miss)); } let mut root = Fork::new().miss(miss); for rope in ropes { let fork = rope.into_fork(self); root.merge(fork, self); } self.insert_or_push(reserved, root) } Mir::Class(class) => { let mut fork = Fork::new().miss(miss); let class: Vec = match class { Class::Unicode(u) => u.iter().copied().map(Into::into).collect(), Class::Bytes(b) => b.iter().copied().map(Into::into).collect(), }; for range in class { fork.add_branch(range, then, self); } self.insert_or_push(reserved, fork) } } } fn 
insert_or_push(&mut self, id: Option, node: N) -> NodeId where N: Into>, { match id { Some(id) => self.insert(id, node), None => self.push(node), } } } fn is_ascii(class: &ClassUnicode) -> bool { class.iter().all(|range| { let start = range.start() as u32; let end = range.end() as u32; start < 128 && (end < 128 || end == 0x0010_FFFF) }) } fn is_one_ascii(class: &ClassUnicode) -> bool { if class.ranges().len() != 1 { return false; } let range = &class.ranges()[0]; let start = range.start() as u32; let end = range.end() as u32; start < 128 && (end < 128 || end == 0x0010_FFFF) } #[cfg(test)] mod tests { use super::*; use crate::graph::Node; use pretty_assertions::assert_eq; #[test] fn rope() { let mut graph = Graph::new(); let mir = Mir::utf8("foobar").unwrap(); assert_eq!(mir.priority(), 12); let leaf = graph.push(Node::Leaf("LEAF")); let id = graph.regex(mir, leaf); assert_eq!(graph[id], Node::Rope(Rope::new("foobar", leaf)),) } #[test] fn alternation() { let mut graph = Graph::new(); let mir = Mir::utf8("a|b").unwrap(); assert_eq!(mir.priority(), 2); let leaf = graph.push(Node::Leaf("LEAF")); let id = graph.regex(mir, leaf); assert_eq!( graph[id], Node::Fork(Fork::new().branch(b'a', leaf).branch(b'b', leaf)), ); } #[test] fn repeat() { let mut graph = Graph::new(); let mir = Mir::utf8("[a-z]*").unwrap(); assert_eq!(mir.priority(), 0); let leaf = graph.push(Node::Leaf("LEAF")); let id = graph.regex(mir, leaf); assert_eq!( graph[id], Node::Fork( Fork::new() .branch('a'..='z', id) // goto self == loop .miss(leaf) ), ); } #[test] fn maybe() { let mut graph = Graph::new(); let mir = Mir::utf8("[a-z]?").unwrap(); assert_eq!(mir.priority(), 0); let leaf = graph.push(Node::Leaf("LEAF")); let id = graph.regex(mir, leaf); assert_eq!( graph[id], Node::Fork(Fork::new().branch('a'..='z', leaf).miss(leaf)), ); } } logos-codegen-0.14.0/src/graph/rope.rs000064400000000000000000000172731046102023000156500ustar 00000000000000use std::ops::Deref; use crate::graph::{Disambiguate, 
Fork, Graph, NodeId, Range}; #[derive(PartialEq, Clone, Hash)] pub struct Rope { pub pattern: Pattern, pub then: NodeId, pub miss: Miss, } #[derive(PartialEq, Clone, Hash)] pub struct Pattern(pub Vec); impl Deref for Pattern { type Target = [Range]; fn deref(&self) -> &[Range] { &self.0 } } /// Because Ropes could potentially fail a match mid-pattern, /// a regular `Option` is not sufficient here. #[derive(PartialEq, Clone, Copy, Hash)] pub enum Miss { /// Same as Option::None, error on fail None, /// Jump to id if first byte does not match, fail on partial match First(NodeId), /// Jump to id on partial or empty match Any(NodeId), } impl Miss { pub fn is_none(&self) -> bool { matches!(self, Miss::None) } pub fn first(self) -> Option { match self { Miss::First(id) | Miss::Any(id) => Some(id), _ => None, } } pub fn take_first(&mut self) -> Option { match *self { Miss::First(id) => { *self = Miss::None; Some(id) } Miss::Any(id) => Some(id), Miss::None => None, } } } impl From> for Miss { fn from(miss: Option) -> Self { match miss { Some(id) => Miss::First(id), None => Miss::None, } } } impl From for Miss { fn from(id: NodeId) -> Self { Miss::First(id) } } impl Rope { pub fn new

(pattern: P, then: NodeId) -> Self where P: Into, { Rope { pattern: pattern.into(), then, miss: Miss::None, } } pub fn miss(mut self, miss: M) -> Self where M: Into, { self.miss = miss.into(); self } pub fn miss_any(mut self, miss: NodeId) -> Self { self.miss = Miss::Any(miss); self } pub fn into_fork(mut self, graph: &mut Graph) -> Fork where T: Disambiguate, { let first = self.pattern.0.remove(0); let miss = self.miss.take_first(); // The new fork will lead to a new rope, // or the old target if no new rope was created let then = match self.pattern.len() { 0 => self.then, _ => graph.push(self), }; Fork::new().branch(first, then).miss(miss) } pub fn prefix(&self, other: &Self) -> Option<(Pattern, Miss)> { let count = self .pattern .iter() .zip(other.pattern.iter()) .take_while(|(a, b)| a == b) .count(); let pattern = match count { 0 => return None, n => self.pattern[..n].into(), }; let miss = match (self.miss, other.miss) { (Miss::None, miss) => miss, (miss, Miss::None) => miss, _ => return None, }; Some((pattern, miss)) } pub fn split_at(mut self, at: usize, graph: &mut Graph) -> Option where T: Disambiguate, { match at { 0 => return None, n if n == self.pattern.len() => return Some(self), _ => (), } let (this, next) = self.pattern.split_at(at); let next_miss = match self.miss { Miss::Any(_) => self.miss, _ => Miss::None, }; let next = graph.push(Rope { pattern: next.into(), miss: next_miss, then: self.then, }); self.pattern = this.into(); self.then = next; Some(self) } pub fn remainder(mut self, at: usize, graph: &mut Graph) -> NodeId where T: Disambiguate, { self.pattern = self.pattern[at..].into(); match self.pattern.len() { 0 => self.then, _ => graph.push(self), } } pub fn shake(&self, graph: &Graph, filter: &mut [bool]) { if let Some(id) = self.miss.first() { if !filter[id.get()] { filter[id.get()] = true; graph[id].shake(graph, filter); } } if !filter[self.then.get()] { filter[self.then.get()] = true; graph[self.then].shake(graph, filter); } } } impl 
Pattern { pub fn to_bytes(&self) -> Option> { let mut out = Vec::with_capacity(self.len()); for range in self.iter() { out.push(range.as_byte()?); } Some(out) } } impl From<&[T]> for Pattern where T: Into + Copy, { fn from(slice: &[T]) -> Self { Pattern(slice.iter().copied().map(Into::into).collect()) } } impl From> for Pattern where T: Into, { fn from(vec: Vec) -> Self { Pattern(vec.into_iter().map(Into::into).collect()) } } impl From<&str> for Pattern { fn from(slice: &str) -> Self { slice.as_bytes().into() } } #[cfg(test)] mod tests { use super::*; use crate::graph::Node; use pretty_assertions::assert_eq; #[test] fn into_fork() { let mut graph = Graph::new(); let leaf = graph.push(Node::Leaf("LEAF")); let rope = Rope::new("foobar", leaf); let fork = rope.into_fork(&mut graph); assert_eq!(leaf, NodeId::new(1)); assert_eq!(fork, Fork::new().branch(b'f', NodeId::new(2))); assert_eq!(graph[NodeId::new(2)], Rope::new("oobar", leaf)); } #[test] fn into_fork_one_byte() { let mut graph = Graph::new(); let leaf = graph.push(Node::Leaf("LEAF")); let rope = Rope::new("!", leaf); let fork = rope.into_fork(&mut graph); assert_eq!(leaf, NodeId::new(1)); assert_eq!(fork, Fork::new().branch(b'!', NodeId::new(1))); } #[test] fn into_fork_miss_any() { let mut graph = Graph::new(); let leaf = graph.push(Node::Leaf("LEAF")); let rope = Rope::new("42", leaf).miss_any(NodeId::new(42)); let fork = rope.into_fork(&mut graph); assert_eq!(leaf, NodeId::new(1)); assert_eq!( fork, Fork::new() .branch(b'4', NodeId::new(2)) .miss(NodeId::new(42)) ); assert_eq!( graph[NodeId::new(2)], Rope::new("2", leaf).miss_any(NodeId::new(42)) ); } #[test] fn into_fork_miss_first() { let mut graph = Graph::new(); let leaf = graph.push(Node::Leaf("LEAF")); let rope = Rope::new("42", leaf).miss(Miss::First(NodeId::new(42))); let fork = rope.into_fork(&mut graph); assert_eq!(leaf, NodeId::new(1)); assert_eq!( fork, Fork::new() .branch(b'4', NodeId::new(2)) .miss(NodeId::new(42)) ); 
assert_eq!(graph[NodeId::new(2)], Rope::new("2", leaf)); } #[test] fn split_at() { let mut graph = Graph::new(); let leaf = graph.push(Node::Leaf("LEAF")); let rope = Rope::new("foobar", leaf); assert_eq!(rope.clone().split_at(6, &mut graph).unwrap(), rope); let split = rope.split_at(3, &mut graph).unwrap(); let expected_id = NodeId::new(leaf.get() + 1); assert_eq!(split, Rope::new("foo", expected_id)); assert_eq!(graph[expected_id], Rope::new("bar", leaf)); } #[test] fn pattern_to_bytes() { let pat = Pattern::from("foobar"); assert_eq!(pat.to_bytes().unwrap(), b"foobar"); let ranges = Pattern::from(vec![0..=0, 42..=42, b'{'..=b'}']); assert_eq!(ranges.to_bytes(), None); } } logos-codegen-0.14.0/src/leaf.rs000064400000000000000000000052301046102023000144770ustar 00000000000000use std::cmp::{Ord, Ordering}; use std::fmt::{self, Debug, Display}; use proc_macro2::{Span, TokenStream}; use syn::{spanned::Spanned, Ident}; use crate::graph::{Disambiguate, Node}; use crate::util::MaybeVoid; #[derive(Clone)] pub struct Leaf<'t> { pub ident: Option<&'t Ident>, pub span: Span, pub priority: usize, pub field: MaybeVoid, pub callback: Option, } #[derive(Clone)] pub enum Callback { Label(TokenStream), Inline(Box), Skip(Span), } #[derive(Clone)] pub struct InlineCallback { pub arg: Ident, pub body: TokenStream, pub span: Span, } impl From for Callback { fn from(inline: InlineCallback) -> Callback { Callback::Inline(Box::new(inline)) } } impl Callback { pub fn span(&self) -> Span { match self { Callback::Label(tokens) => tokens.span(), Callback::Inline(inline) => inline.span, Callback::Skip(span) => *span, } } } impl<'t> Leaf<'t> { pub fn new(ident: &'t Ident, span: Span) -> Self { Leaf { ident: Some(ident), span, priority: 0, field: MaybeVoid::Void, callback: None, } } pub fn new_skip(span: Span) -> Self { Leaf { ident: None, span, priority: 0, field: MaybeVoid::Void, callback: Some(Callback::Skip(span)), } } pub fn callback(mut self, callback: Option) -> Self { self.callback = 
callback; self } pub fn field(mut self, field: MaybeVoid) -> Self { self.field = field; self } pub fn priority(mut self, priority: usize) -> Self { self.priority = priority; self } } impl Disambiguate for Leaf<'_> { fn cmp(left: &Leaf, right: &Leaf) -> Ordering { Ord::cmp(&left.priority, &right.priority) } } impl<'t> From> for Node> { fn from(leaf: Leaf<'t>) -> Self { Node::Leaf(leaf) } } impl Debug for Leaf<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "::{}", self)?; match self.callback { Some(Callback::Label(ref label)) => write!(f, " ({})", label), Some(Callback::Inline(_)) => f.write_str(" ()"), Some(Callback::Skip(_)) => f.write_str(" ()"), None => Ok(()), } } } impl Display for Leaf<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.ident { Some(ident) => Display::fmt(ident, f), None => f.write_str(""), } } } logos-codegen-0.14.0/src/lib.rs000064400000000000000000000266751046102023000143560ustar 00000000000000//! Logos logo //! //! # Logos //! //! This is a `#[derive]` macro crate, [for documentation go to main crate](https://docs.rs/logos2). // The `quote!` macro requires deep recursion. #![recursion_limit = "196"] #![doc(html_logo_url = "https://maciej.codes/kosz/logos.png")] mod error; mod generator; mod graph; mod leaf; mod mir; mod parser; mod util; use generator::Generator; use graph::{DisambiguationError, Fork, Graph, Rope}; use leaf::Leaf; use parser::{IgnoreFlags, Mode, Parser}; use quote::ToTokens; use util::MaybeVoid; use proc_macro2::{Delimiter, TokenStream, TokenTree}; use quote::quote; use syn::parse_quote; use syn::spanned::Spanned; use syn::{Fields, ItemEnum}; const LOGOS_ATTR: &str = "logos"; const ERROR_ATTR: &str = "error"; const TOKEN_ATTR: &str = "token"; const REGEX_ATTR: &str = "regex"; /// Generate a `Logos` implementation for the given struct, provided as a stream of rust tokens. 
pub fn generate(input: TokenStream) -> TokenStream { let mut item: ItemEnum = syn::parse2(input).expect("Logos can be only be derived for enums"); let name = &item.ident; let mut parser = Parser::default(); for param in item.generics.params { parser.parse_generic(param); } for attr in &mut item.attrs { parser.try_parse_logos(attr); } let mut ropes = Vec::new(); let mut regex_ids = Vec::new(); let mut graph = Graph::new(); { let errors = &mut parser.errors; for literal in &parser.skips { match literal.to_mir(&parser.subpatterns, IgnoreFlags::Empty, errors) { Ok(mir) => { let then = graph.push(Leaf::new_skip(literal.span()).priority(mir.priority())); let id = graph.regex(mir, then); regex_ids.push(id); } Err(err) => { errors.err(err, literal.span()); } } } } for variant in &mut item.variants { let field = match &mut variant.fields { Fields::Unit => MaybeVoid::Void, Fields::Unnamed(fields) => { if fields.unnamed.len() != 1 { parser.err( format!( "Logos currently only supports variants with one field, found {}", fields.unnamed.len(), ), fields.span(), ); } let ty = &mut fields .unnamed .first_mut() .expect("Already checked len; qed") .ty; let ty = parser.get_type(ty); MaybeVoid::Some(ty) } Fields::Named(fields) => { parser.err("Logos doesn't support named fields yet.", fields.span()); MaybeVoid::Void } }; // Lazy leaf constructor to avoid cloning let var_ident = &variant.ident; let leaf = move |span| Leaf::new(var_ident, span).field(field.clone()); for attr in &mut variant.attrs { let attr_name = match attr.path().get_ident() { Some(ident) => ident.to_string(), None => continue, }; match attr_name.as_str() { ERROR_ATTR => { // TODO: Remove in future versions parser.err( "\ Since 0.13 Logos no longer requires the #[error] variant.\n\ \n\ For help with migration see release notes: \ https://github.com/maciejhirsz/logos/releases\ ", attr.span(), ); } TOKEN_ATTR => { let definition = match parser.parse_definition(attr) { Some(definition) => definition, None => { 
parser.err("Expected #[token(...)]", attr.span()); continue; } }; if definition.ignore_flags.is_empty() { let bytes = definition.literal.to_bytes(); let then = graph.push( leaf(definition.literal.span()) .priority(definition.priority.unwrap_or(bytes.len() * 2)) .callback(definition.callback), ); ropes.push(Rope::new(bytes, then)); } else { let mir = definition .literal .escape_regex() .to_mir( &Default::default(), definition.ignore_flags, &mut parser.errors, ) .expect("The literal should be perfectly valid regex"); let then = graph.push( leaf(definition.literal.span()) .priority(definition.priority.unwrap_or_else(|| mir.priority())) .callback(definition.callback), ); let id = graph.regex(mir, then); regex_ids.push(id); } } REGEX_ATTR => { let definition = match parser.parse_definition(attr) { Some(definition) => definition, None => { parser.err("Expected #[regex(...)]", attr.span()); continue; } }; let mir = match definition.literal.to_mir( &parser.subpatterns, definition.ignore_flags, &mut parser.errors, ) { Ok(mir) => mir, Err(err) => { parser.err(err, definition.literal.span()); continue; } }; let then = graph.push( leaf(definition.literal.span()) .priority(definition.priority.unwrap_or_else(|| mir.priority())) .callback(definition.callback), ); let id = graph.regex(mir, then); regex_ids.push(id); } _ => (), } } } let mut root = Fork::new(); let error_type = parser.error_type.take(); let extras = parser.extras.take(); let source = parser .source .take() .map(strip_wrapping_parens) .unwrap_or(match parser.mode { Mode::Utf8 => quote!(str), Mode::Binary => quote!([u8]), }); let logos_path = parser .logos_path .take() .unwrap_or_else(|| parse_quote!(::logos)); let generics = parser.generics(); let this = quote!(#name #generics); let impl_logos = |body| { quote! 
{ impl<'s> #logos_path::Logos<'s> for #this { type Error = #error_type; type Extras = #extras; type Source = #source; fn lex(lex: &mut #logos_path::Lexer<'s, Self>) { #body } } } }; for id in regex_ids { let fork = graph.fork_off(id); root.merge(fork, &mut graph); } for rope in ropes { root.merge(rope.into_fork(&mut graph), &mut graph); } while let Some(id) = root.miss.take() { let fork = graph.fork_off(id); if fork.branches().next().is_some() { root.merge(fork, &mut graph); } else { break; } } for &DisambiguationError(a, b) in graph.errors() { let a = graph[a].unwrap_leaf(); let b = graph[b].unwrap_leaf(); let disambiguate = a.priority + 1; let mut err = |a: &Leaf, b: &Leaf| { parser.err( format!( "\ A definition of variant `{a}` can match the same input as another definition of variant `{b}`.\n\ \n\ hint: Consider giving one definition a higher priority: \ #[regex(..., priority = {disambiguate})]\ ", ), a.span ); }; err(a, b); err(b, a); } if let Some(errors) = parser.errors.render() { return impl_logos(errors); } let root = graph.push(root); graph.shake(root); let generator = Generator::new(name, &this, root, &graph); let body = generator.generate(); impl_logos(quote! { use #logos_path::internal::{LexerInternal, CallbackResult}; type Lexer<'s> = #logos_path::Lexer<'s, #this>; fn _end<'s>(lex: &mut Lexer<'s>) { lex.end() } fn _error<'s>(lex: &mut Lexer<'s>) { lex.bump_unchecked(1); lex.error(); } #body }) } /// Strip all logos attributes from the given struct, allowing it to be used in code without `logos-derive` present. 
pub fn strip_attributes(input: TokenStream) -> TokenStream { let mut item: ItemEnum = syn::parse2(input).expect("Logos can be only be derived for enums"); strip_attrs_from_vec(&mut item.attrs); for attr in &mut item.attrs { if let syn::Meta::List(meta) = &mut attr.meta { if meta.path.is_ident("derive") { let mut tokens = std::mem::replace(&mut meta.tokens, TokenStream::new()).into_iter(); while let Some(TokenTree::Ident(ident)) = tokens.next() { let punct = tokens.next(); if ident == "Logos" { continue; } meta.tokens.extend([TokenTree::Ident(ident)]); meta.tokens.extend(punct); } } } } for variant in &mut item.variants { strip_attrs_from_vec(&mut variant.attrs); for field in &mut variant.fields { strip_attrs_from_vec(&mut field.attrs); } } item.to_token_stream() } fn strip_attrs_from_vec(attrs: &mut Vec) { attrs.retain(|attr| !is_logos_attr(attr)) } fn is_logos_attr(attr: &syn::Attribute) -> bool { attr.path().is_ident(LOGOS_ATTR) || attr.path().is_ident(TOKEN_ATTR) || attr.path().is_ident(REGEX_ATTR) } fn strip_wrapping_parens(t: TokenStream) -> TokenStream { let tts: Vec = t.into_iter().collect(); if tts.len() != 1 { tts.into_iter().collect() } else { match tts.into_iter().next().unwrap() { TokenTree::Group(g) => { if g.delimiter() == Delimiter::Parenthesis { g.stream() } else { core::iter::once(TokenTree::Group(g)).collect() } } tt => core::iter::once(tt).collect(), } } } logos-codegen-0.14.0/src/mir.rs000064400000000000000000000173271046102023000143710ustar 00000000000000use std::convert::TryFrom; use lazy_static::lazy_static; use regex_syntax::hir::{Dot, Hir, HirKind}; use regex_syntax::ParserBuilder; pub use regex_syntax::hir::{Class, ClassUnicode, Literal}; use crate::error::{Error, Result}; lazy_static! { static ref DOT_UTF8: Hir = Hir::dot(Dot::AnyChar); static ref DOT_BYTES: Hir = Hir::dot(Dot::AnyByte); } /// Middle Intermediate Representation of the regex, built from /// `regex_syntax`'s `Hir`. 
The goal here is to strip and canonicalize /// the tree, so that we don't have to do transformations later on the /// graph, with the potential of running into looping references. #[derive(Clone, Debug)] pub enum Mir { Empty, Loop(Box), Maybe(Box), Concat(Vec), Alternation(Vec), Class(Class), Literal(Literal), } impl Mir { pub fn utf8(source: &str) -> Result { Mir::try_from(ParserBuilder::new().build().parse(source)?) } pub fn utf8_ignore_case(source: &str) -> Result { Mir::try_from( ParserBuilder::new() .case_insensitive(true) .build() .parse(source)?, ) } pub fn binary(source: &str) -> Result { Mir::try_from( ParserBuilder::new() .utf8(false) .unicode(false) .build() .parse(source)?, ) } pub fn binary_ignore_case(source: &str) -> Result { Mir::try_from( ParserBuilder::new() .utf8(false) .unicode(false) .case_insensitive(true) .build() .parse(source)?, ) } pub fn priority(&self) -> usize { match self { Mir::Empty | Mir::Loop(_) | Mir::Maybe(_) => 0, Mir::Concat(concat) => concat.iter().map(Mir::priority).sum(), Mir::Alternation(alt) => alt.iter().map(Mir::priority).min().unwrap_or(0), Mir::Class(_) => 2, Mir::Literal(lit) => match std::str::from_utf8(&lit.0) { Ok(s) => 2 * s.chars().count(), Err(_) => 2 * lit.0.len(), }, } } } impl TryFrom for Mir { type Error = Error; fn try_from(hir: Hir) -> Result { match hir.into_kind() { HirKind::Empty => Ok(Mir::Empty), HirKind::Concat(concat) => { let mut out = Vec::with_capacity(concat.len()); fn extend(mir: Mir, out: &mut Vec) { match mir { Mir::Concat(nested) => { for child in nested { extend(child, out); } } mir => out.push(mir), } } for hir in concat { extend(Mir::try_from(hir)?, &mut out); } Ok(Mir::Concat(out)) } HirKind::Alternation(alternation) => { let alternation = alternation .into_iter() .map(Mir::try_from) .collect::>()?; Ok(Mir::Alternation(alternation)) } HirKind::Literal(literal) => Ok(Mir::Literal(literal)), HirKind::Class(class) => Ok(Mir::Class(class)), HirKind::Repetition(repetition) => { if 
!repetition.greedy { return Err("#[regex]: non-greedy parsing is currently unsupported.".into()); } let is_dot = if repetition.sub.properties().is_utf8() { *repetition.sub == *DOT_UTF8 } else { *repetition.sub == *DOT_BYTES }; let mir = Mir::try_from(*repetition.sub)?; match (repetition.min, repetition.max) { (0..=1, None) if is_dot => { Err( "#[regex]: \".+\" and \".*\" patterns will greedily consume \ the entire source till the end as Logos does not allow \ backtracking. If you are looking to match everything until \ a specific character, you should use a negative character \ class. E.g., use regex r\"'[^']*'\" to match anything in \ between two quotes. Read more about that here: \ https://github.com/maciejhirsz/logos/issues/302#issuecomment-1521342541." .into() ) } // 0 or 1 (0, Some(1)) => Ok(Mir::Maybe(Box::new(mir))), // 0 or more (0, None) => Ok(Mir::Loop(Box::new(mir))), // 1 or more (1, None) => { Ok(Mir::Concat(vec![mir.clone(), Mir::Loop(Box::new(mir))])) } // Exact {n} (n, Some(m)) if m == n => { let mut out = Vec::with_capacity(n as usize); for _ in 0..n { out.push(mir.clone()); } Ok(Mir::Concat(out)) } // At least {n,} (n, None) => { let mut out = Vec::with_capacity(n as usize); for _ in 0..n { out.push(mir.clone()); } out.push(Mir::Loop(Box::new(mir))); Ok(Mir::Concat(out)) } // Bounded {n, m} (n, Some(m)) => { let mut out = Vec::with_capacity(m as usize); for _ in 0..n { out.push(mir.clone()); } for _ in n..m { out.push(Mir::Maybe(Box::new(mir.clone()))); } Ok(Mir::Concat(out)) } } } HirKind::Capture(capture) => Mir::try_from(*capture.sub), HirKind::Look(_) => { Err("#[regex]: look-around assertions are currently unsupported.".into()) } } } } #[cfg(test)] mod tests { use super::Mir; #[test] fn priorities() { let regexes = [ ("a", 2), ("à", 2), ("京", 2), ("Eté", 6), ("Été", 6), ("[a-z]+", 2), ("a|b", 2), ("a|[b-z]", 2), ("(foo)+", 6), ("foobar", 12), ("(fooz|bar)+qux", 12), ]; for (regex, expected) in regexes.iter() { let mir = 
Mir::utf8(regex).unwrap(); assert_eq!(mir.priority(), *expected, "Failed for regex \"{}\"", regex); } } #[test] fn equivalent_patterns() { let regexes = [ ("a|b", "[a-b]"), ("1|2|3", "[1-3]"), ("1+", "[1]+"), ("c*", "[c]*"), ("aaa", "a{3}"), ("a[a]{2}", "a{3}"), ]; for (regex_left, regex_right) in regexes.iter() { let mir_left = Mir::utf8(regex_left).unwrap(); let mir_right = Mir::utf8(regex_right).unwrap(); assert_eq!( mir_left.priority(), mir_right.priority(), "Regexes \"{regex_left}\" and \"{regex_right}\" \ are equivalent but have different priorities" ); } } } logos-codegen-0.14.0/src/parser/definition.rs000064400000000000000000000137051046102023000172220ustar 00000000000000use proc_macro2::{Ident, Span}; use syn::{spanned::Spanned, LitByteStr, LitStr}; use crate::error::{Errors, Result}; use crate::leaf::Callback; use crate::mir::Mir; use crate::parser::nested::NestedValue; use crate::parser::{IgnoreFlags, Parser, Subpatterns}; use super::ignore_flags::ascii_case::MakeAsciiCaseInsensitive; pub struct Definition { pub literal: Literal, pub priority: Option, pub callback: Option, pub ignore_flags: IgnoreFlags, } pub enum Literal { Utf8(LitStr), Bytes(LitByteStr), } impl Definition { pub fn new(literal: Literal) -> Self { Definition { literal, priority: None, callback: None, ignore_flags: IgnoreFlags::Empty, } } pub fn named_attr(&mut self, name: Ident, value: NestedValue, parser: &mut Parser) { match (name.to_string().as_str(), value) { ("priority", NestedValue::Assign(tokens)) => { let prio = match tokens.to_string().parse() { Ok(prio) => prio, Err(_) => { parser.err("Expected an unsigned integer", tokens.span()); return; } }; if self.priority.replace(prio).is_some() { parser.err("Resetting previously set priority", tokens.span()); } } ("priority", _) => { parser.err("Expected: priority = ", name.span()); } ("callback", NestedValue::Assign(tokens)) => { let span = tokens.span(); let callback = match parser.parse_callback(tokens) { Some(callback) => callback, 
None => { parser.err("Not a valid callback", span); return; } }; if let Some(previous) = self.callback.replace(callback) { parser .err( "Callback has been already set", span.join(name.span()).unwrap(), ) .err("Previous callback set here", previous.span()); } } ("callback", _) => { parser.err("Expected: callback = ...", name.span()); } ("ignore", NestedValue::Group(tokens)) => { self.ignore_flags.parse_group(name, tokens, parser); } ("ignore", _) => { parser.err("Expected: ignore(, ...)", name.span()); } (unknown, _) => { parser.err( format!( "\ Unknown nested attribute: {}\n\ \n\ Expected one of: priority, callback\ ", unknown ), name.span(), ); } } } } impl Literal { pub fn to_bytes(&self) -> Vec { match self { Literal::Utf8(string) => string.value().into_bytes(), Literal::Bytes(bytes) => bytes.value(), } } pub fn escape_regex(&self) -> Literal { match self { Literal::Utf8(string) => Literal::Utf8(LitStr::new( regex_syntax::escape(&string.value()).as_str(), self.span(), )), Literal::Bytes(bytes) => Literal::Bytes(LitByteStr::new( regex_syntax::escape(&bytes_to_regex_string(bytes.value())).as_bytes(), self.span(), )), } } pub fn to_mir( &self, subpatterns: &Subpatterns, ignore_flags: IgnoreFlags, errors: &mut Errors, ) -> Result { let value = subpatterns.fix(self, errors); if ignore_flags.contains(IgnoreFlags::IgnoreAsciiCase) { match self { Literal::Utf8(_) => { Mir::utf8(&value).map(MakeAsciiCaseInsensitive::make_ascii_case_insensitive) } Literal::Bytes(_) => Mir::binary_ignore_case(&value), } } else if ignore_flags.contains(IgnoreFlags::IgnoreCase) { match self { Literal::Utf8(_) => Mir::utf8_ignore_case(&value), Literal::Bytes(_) => Mir::binary_ignore_case(&value), } } else { match self { Literal::Utf8(_) => Mir::utf8(&value), Literal::Bytes(_) => Mir::binary(&value), } } } pub fn span(&self) -> Span { match self { Literal::Utf8(string) => string.span(), Literal::Bytes(bytes) => bytes.span(), } } } impl syn::parse::Parse for Literal { fn parse(input: 
syn::parse::ParseStream) -> syn::Result { let la = input.lookahead1(); if la.peek(LitStr) { Ok(Literal::Utf8(input.parse()?)) } else if la.peek(LitByteStr) { Ok(Literal::Bytes(input.parse()?)) } else { Err(la.error()) } } } pub fn bytes_to_regex_string(bytes: Vec) -> String { if bytes.is_ascii() { unsafe { // Unicode values are prohibited, so we can't use // safe version of String::from_utf8 // // We can, however, construct a safe ASCII string return String::from_utf8_unchecked(bytes); } } let mut string = String::with_capacity(bytes.len() * 2); for byte in bytes { if byte < 0x80 { string.push(byte as char); } else { static DIGITS: [u8; 16] = *b"0123456789abcdef"; string.push_str(r"\x"); string.push(DIGITS[(byte / 16) as usize] as char); string.push(DIGITS[(byte % 16) as usize] as char); } } string } logos-codegen-0.14.0/src/parser/ignore_flags.rs000064400000000000000000000361771046102023000175410ustar 00000000000000use std::ops::{BitAnd, BitOr}; use proc_macro2::{Ident, TokenStream, TokenTree}; use crate::parser::Parser; use crate::util::is_punct; #[derive(Clone, Copy, PartialEq, Eq)] pub struct IgnoreFlags { bits: u8, } #[allow(non_upper_case_globals)] impl IgnoreFlags { pub const Empty: Self = Self::new(0x00); pub const IgnoreCase: Self = Self::new(0x01); pub const IgnoreAsciiCase: Self = Self::new(0x02); #[inline] pub const fn new(bits: u8) -> Self { Self { bits } } /// Enables a variant. #[inline] pub fn enable(&mut self, variant: Self) { self.bits |= variant.bits; } /// Checks if this `IgnoreFlags` contains *any* of the given variants. #[inline] pub fn contains(&self, variants: Self) -> bool { self.bits & variants.bits != 0 } #[inline] pub fn is_empty(&self) -> bool { self.bits == 0 } /// Parses an identifier an enables it for `self`. 
/// /// Valid inputs are (that produces `true`): /// * `"case"` (incompatible with `"ascii_case"`) /// * `"ascii_case"` (incompatible with `"case"`) /// /// An error causes this function to return `false` and emits an error to /// the given `Parser`. fn parse_ident(&mut self, ident: Ident, parser: &mut Parser) -> bool { match ident.to_string().as_str() { "case" => { if self.contains(Self::IgnoreAsciiCase) { parser.err( "\ The flag \"case\" cannot be used along with \"ascii_case\"\ ", ident.span(), ); false } else { self.enable(Self::IgnoreCase); true } } "ascii_case" => { if self.contains(Self::IgnoreCase) { parser.err( "\ The flag \"ascii_case\" cannot be used along with \"case\"\ ", ident.span(), ); false } else { self.enable(Self::IgnoreAsciiCase); true } } unknown => { parser.err( format!( "\ Unknown flag: {}\n\ \n\ Expected one of: case, ascii_case\ ", unknown ), ident.span(), ); false } } } pub fn parse_group(&mut self, name: Ident, tokens: TokenStream, parser: &mut Parser) { // Little finite state machine to parse "(,)*,?" 
// FSM description for future maintenance // 0: Initial state // -> 1 // _ -> error // 1: A flag was found // , -> 2 // None -> done // _ -> error // 2: A comma was found (after a ) // -> 1 // None -> done // _ -> error let mut state = 0u8; let mut tokens = tokens.into_iter(); loop { state = match state { 0 => match tokens.next() { Some(TokenTree::Ident(ident)) => { if self.parse_ident(ident, parser) { 1 } else { return; } } _ => { parser.err( "\ Invalid ignore flag\n\ \n\ Expected one of: case, ascii_case\ ", name.span(), ); return; } }, 1 => match tokens.next() { Some(tt) if is_punct(&tt, ',') => 2, None => return, Some(unexpected_tt) => { parser.err( format!( "\ Unexpected token: {:?}\ ", unexpected_tt.to_string(), ), unexpected_tt.span(), ); return; } }, 2 => match tokens.next() { Some(TokenTree::Ident(ident)) => { if self.parse_ident(ident, parser) { 1 } else { return; } } None => return, Some(unexpected_tt) => { parser.err( format!( "\ Unexpected token: {:?}\ ", unexpected_tt.to_string(), ), unexpected_tt.span(), ); return; } }, _ => unreachable!("Internal Error: invalid state ({})", state), } } } } impl BitOr for IgnoreFlags { type Output = Self; fn bitor(self, other: Self) -> Self { Self::new(self.bits | other.bits) } } impl BitAnd for IgnoreFlags { type Output = Self; fn bitand(self, other: Self) -> Self { Self::new(self.bits & other.bits) } } pub mod ascii_case { use regex_syntax::hir; use crate::mir::Mir; use crate::parser::Literal; macro_rules! literal { ($byte:expr) => { hir::Literal(Box::new([$byte])) }; (@char $c:expr) => { hir::Literal( $c.encode_utf8(&mut [0; 4]) .as_bytes() .to_vec() .into_boxed_slice(), ) }; } pub trait MakeAsciiCaseInsensitive { /// Creates a equivalent regular expression which ignore the letter casing /// of ascii characters. 
fn make_ascii_case_insensitive(self) -> Mir; } impl MakeAsciiCaseInsensitive for u8 { fn make_ascii_case_insensitive(self) -> Mir { if self.is_ascii_lowercase() { Mir::Alternation(vec![ Mir::Literal(literal!(self - 32)), Mir::Literal(literal!(self)), ]) } else if self.is_ascii_uppercase() { Mir::Alternation(vec![ Mir::Literal(literal!(self)), Mir::Literal(literal!(self + 32)), ]) } else { Mir::Literal(literal!(self)) } } } impl MakeAsciiCaseInsensitive for char { fn make_ascii_case_insensitive(self) -> Mir { if self.is_ascii() { (self as u8).make_ascii_case_insensitive() } else { Mir::Literal(literal!(@char self)) } } } impl MakeAsciiCaseInsensitive for hir::Literal { fn make_ascii_case_insensitive(self) -> Mir { Mir::Concat( self.0 .iter() .map(|x| x.make_ascii_case_insensitive()) .collect(), ) } } impl MakeAsciiCaseInsensitive for hir::ClassBytes { fn make_ascii_case_insensitive(mut self) -> Mir { self.case_fold_simple(); Mir::Class(hir::Class::Bytes(self)) } } impl MakeAsciiCaseInsensitive for hir::ClassUnicode { fn make_ascii_case_insensitive(mut self) -> Mir { use std::cmp; // Manuall implementation to only perform the case folding on ascii characters. 
let mut ranges = Vec::new(); for range in self.ranges() { #[inline] fn overlaps(st1: u8, end1: u8, st2: u8, end2: u8) -> bool { (st2 <= st1 && st1 <= end2) || (st1 <= st2 && st2 <= end1) } #[inline] fn make_ascii(c: char) -> Option { if c.is_ascii() { Some(c as u8) } else { None } } match (make_ascii(range.start()), make_ascii(range.end())) { (Some(start), Some(end)) => { if overlaps(b'a', b'z', start, end) { let lower = cmp::max(start, b'a'); let upper = cmp::min(end, b'z'); ranges.push(hir::ClassUnicodeRange::new( (lower - 32) as char, (upper - 32) as char, )) } if overlaps(b'A', b'Z', start, end) { let lower = cmp::max(start, b'A'); let upper = cmp::min(end, b'Z'); ranges.push(hir::ClassUnicodeRange::new( (lower + 32) as char, (upper + 32) as char, )) } } (Some(start), None) => { if overlaps(b'a', b'z', start, b'z') { let lower = cmp::max(start, b'a'); ranges.push(hir::ClassUnicodeRange::new((lower - 32) as char, 'Z')) } if overlaps(b'A', b'Z', start, b'Z') { let lower = cmp::max(start, b'A'); ranges.push(hir::ClassUnicodeRange::new((lower + 32) as char, 'Z')) } } _ => (), } } self.union(&hir::ClassUnicode::new(ranges)); Mir::Class(hir::Class::Unicode(self)) } } impl MakeAsciiCaseInsensitive for hir::Class { fn make_ascii_case_insensitive(self) -> Mir { match self { hir::Class::Bytes(b) => b.make_ascii_case_insensitive(), hir::Class::Unicode(u) => u.make_ascii_case_insensitive(), } } } impl MakeAsciiCaseInsensitive for &Literal { fn make_ascii_case_insensitive(self) -> Mir { match self { Literal::Bytes(bytes) => Mir::Concat( bytes .value() .into_iter() .map(|b| b.make_ascii_case_insensitive()) .collect(), ), Literal::Utf8(s) => Mir::Concat( s.value() .chars() .map(|b| b.make_ascii_case_insensitive()) .collect(), ), } } } impl MakeAsciiCaseInsensitive for Mir { fn make_ascii_case_insensitive(self) -> Mir { match self { Mir::Empty => Mir::Empty, Mir::Loop(l) => Mir::Loop(Box::new(l.make_ascii_case_insensitive())), Mir::Maybe(m) => 
Mir::Maybe(Box::new(m.make_ascii_case_insensitive())), Mir::Concat(c) => Mir::Concat( c.into_iter() .map(|m| m.make_ascii_case_insensitive()) .collect(), ), Mir::Alternation(a) => Mir::Alternation( a.into_iter() .map(|m| m.make_ascii_case_insensitive()) .collect(), ), Mir::Class(c) => c.make_ascii_case_insensitive(), Mir::Literal(l) => l.make_ascii_case_insensitive(), } } } #[cfg(test)] mod tests { use super::MakeAsciiCaseInsensitive; use crate::mir::{Class, Mir}; use regex_syntax::hir::{ClassUnicode, ClassUnicodeRange}; fn assert_range(in_s: char, in_e: char, expected: &[(char, char)]) { let range = ClassUnicodeRange::new(in_s, in_e); let class = ClassUnicode::new(vec![range]); let expected = ClassUnicode::new(expected.iter().map(|&(a, b)| ClassUnicodeRange::new(a, b))); if let Mir::Class(Class::Unicode(result)) = class.make_ascii_case_insensitive() { assert_eq!(result, expected); } else { panic!("Not a unicode class"); }; } #[test] fn no_letters_left() { assert_range(' ', '+', &[(' ', '+')]); } #[test] fn no_letters_right() { assert_range('{', '~', &[('{', '~')]); } #[test] fn no_letters_middle() { assert_range('[', '`', &[('[', '`')]); } #[test] fn lowercase_left_edge() { assert_range('a', 'd', &[('a', 'd'), ('A', 'D')]); } #[test] fn lowercase_right_edge() { assert_range('r', 'z', &[('r', 'z'), ('R', 'Z')]); } #[test] fn lowercase_total() { assert_range('a', 'z', &[('a', 'z'), ('A', 'Z')]); } #[test] fn uppercase_left_edge() { assert_range('A', 'D', &[('a', 'd'), ('A', 'D')]); } #[test] fn uppercase_right_edge() { assert_range('R', 'Z', &[('r', 'z'), ('R', 'Z')]); } #[test] fn uppercase_total() { assert_range('A', 'Z', &[('a', 'z'), ('A', 'Z')]); } #[test] fn lowercase_cross_left() { assert_range('[', 'h', &[('[', 'h'), ('A', 'H')]); } #[test] fn lowercase_cross_right() { assert_range('d', '}', &[('d', '}'), ('D', 'Z')]); } #[test] fn uppercase_cross_left() { assert_range(';', 'H', &[(';', 'H'), ('a', 'h')]); } #[test] fn uppercase_cross_right() { 
assert_range('T', ']', &[('t', 'z'), ('T', ']')]); } #[test] fn cross_both() { assert_range('X', 'c', &[('X', 'c'), ('x', 'z'), ('A', 'C')]); } #[test] fn all_letters() { assert_range('+', '|', &[('+', '|')]); } #[test] fn oob_all_letters() { assert_range('#', 'é', &[('#', 'é')]); } #[test] fn oob_from_uppercase() { assert_range('Q', 'é', &[('A', 'é')]); } #[test] fn oob_from_lowercase() { assert_range('q', 'é', &[('q', 'é'), ('Q', 'Z')]); } #[test] fn oob_no_letters() { assert_range('|', 'é', &[('|', 'é')]); } } } logos-codegen-0.14.0/src/parser/mod.rs000064400000000000000000000263621046102023000156540ustar 00000000000000use beef::lean::Cow; use proc_macro2::{Span, TokenStream, TokenTree}; use quote::quote; use syn::spanned::Spanned; use syn::{Attribute, GenericParam, Lit, Meta, Type}; use crate::error::Errors; use crate::leaf::{Callback, InlineCallback}; use crate::util::{expect_punct, MaybeVoid}; use crate::LOGOS_ATTR; mod definition; mod ignore_flags; mod nested; mod subpattern; mod type_params; pub use self::definition::{Definition, Literal}; pub use self::ignore_flags::IgnoreFlags; use self::nested::{AttributeParser, Nested, NestedValue}; pub use self::subpattern::Subpatterns; use self::type_params::{replace_lifetime, traverse_type, TypeParams}; #[derive(Default)] pub struct Parser { pub errors: Errors, pub mode: Mode, pub source: Option, pub skips: Vec, pub extras: MaybeVoid, pub error_type: MaybeVoid, pub subpatterns: Subpatterns, pub logos_path: Option, types: TypeParams, } #[derive(Default)] pub enum Mode { #[default] Utf8, Binary, } impl Parser { pub fn parse_generic(&mut self, param: GenericParam) { match param { GenericParam::Lifetime(lt) => { self.types.explicit_lifetime(lt, &mut self.errors); } GenericParam::Type(ty) => { self.types.add(ty.ident); } GenericParam::Const(c) => { self.err("Logos doesn't support const generics.", c.span()); } } } pub fn generics(&mut self) -> Option { self.types.generics(&mut self.errors) } fn parse_attr(&mut self, attr: 
&mut Attribute) -> Option { match &mut attr.meta { Meta::List(list) => { let tokens = std::mem::replace(&mut list.tokens, TokenStream::new()); Some(AttributeParser::new(tokens)) } _ => None, } } /// Try to parse the main `#[logos(...)]`, does nothing if /// the attribute's name isn't `logos`. pub fn try_parse_logos(&mut self, attr: &mut Attribute) { if !attr.path().is_ident(LOGOS_ATTR) { return; } let nested = match self.parse_attr(attr) { Some(tokens) => tokens, None => { self.err("Expected #[logos(...)]", attr.span()); return; } }; for nested in nested { let (name, value) = match nested { Nested::Named(name, value) => (name, value), Nested::Unexpected(tokens) | Nested::Unnamed(tokens) => { self.err("Invalid nested attribute", tokens.span()); continue; } }; // IMPORTANT: Keep these sorted alphabetically for binary search down the line #[allow(clippy::type_complexity)] static NESTED_LOOKUP: &[(&str, fn(&mut Parser, Span, NestedValue))] = &[ ("crate", |parser, span, value| match value { NestedValue::Assign(logos_path) => parser.logos_path = Some(logos_path), _ => { parser.err("Expected: #[logos(crate = path::to::logos)]", span); } }), ("error", |parser, span, value| match value { NestedValue::Assign(value) => { let span = value.span(); if let MaybeVoid::Some(previous) = parser.error_type.replace(value) { parser .err("Error type can be defined only once", span) .err("Previous definition here", previous.span()); } } _ => { parser.err("Expected: #[logos(error = SomeType)]", span); } }), ("extras", |parser, span, value| match value { NestedValue::Assign(value) => { let span = value.span(); if let MaybeVoid::Some(previous) = parser.extras.replace(value) { parser .err("Extras can be defined only once", span) .err("Previous definition here", previous.span()); } } _ => { parser.err("Expected: #[logos(extras = SomeType)]", span); } }), ("skip", |parser, span, value| match value { NestedValue::Literal(lit) => { if let Some(literal) = parser.parse_literal(Lit::new(lit)) { 
parser.skips.push(literal); } } _ => { parser.err("Expected: #[logos(skip \"regex literal\")]", span); } }), ("source", |parser, span, value| match value { NestedValue::Assign(value) => { let span = value.span(); if let Some(previous) = parser.source.replace(value) { parser .err("Source can be defined only once", span) .err("Previous definition here", previous.span()); } } _ => { parser.err("Expected: #[logos(source = SomeType)]", span); } }), ("subpattern", |parser, span, value| match value { NestedValue::KeywordAssign(name, value) => { parser.subpatterns.add(name, value, &mut parser.errors); } _ => { parser.err(r#"Expected: #[logos(subpattern name = r"regex")]"#, span); } }), ("type", |parser, span, value| match value { NestedValue::KeywordAssign(generic, ty) => { parser.types.set(generic, ty, &mut parser.errors); } _ => { parser.err("Expected: #[logos(type T = SomeType)]", span); } }), ]; match NESTED_LOOKUP.binary_search_by_key(&name.to_string().as_str(), |(n, _)| n) { Ok(idx) => NESTED_LOOKUP[idx].1(self, name.span(), value), Err(_) => { let mut err = format!( "Unknown nested attribute #[logos({name})], expected one of: {}", NESTED_LOOKUP[0].0 ); for (allowed, _) in &NESTED_LOOKUP[1..] { err.push_str(", "); err.push_str(allowed); } self.err(err, name.span()); } } } } pub fn parse_literal(&mut self, lit: Lit) -> Option { match lit { Lit::Str(string) => Some(Literal::Utf8(string)), Lit::ByteStr(bytes) => { self.mode = Mode::Binary; Some(Literal::Bytes(bytes)) } _ => { self.err("Expected a &str or &[u8] slice", lit.span()); None } } } /// Parse attribute definition of a token: /// /// + `#[token(literal[, callback])]` /// + `#[regex(literal[, callback])]` pub fn parse_definition(&mut self, attr: &mut Attribute) -> Option { let mut nested = self.parse_attr(attr)?; let literal = match nested.parsed::()? 
{ Ok(lit) => self.parse_literal(lit)?, Err(err) => { self.err(err.to_string(), err.span()); return None; } }; let mut def = Definition::new(literal); for (position, next) in nested.enumerate() { match next { Nested::Unexpected(tokens) => { self.err("Unexpected token in attribute", tokens.span()); } Nested::Unnamed(tokens) => match position { 0 => def.callback = self.parse_callback(tokens), _ => { self.err( "\ Expected a named argument at this position\n\ \n\ hint: If you are trying to define a callback here use: callback = ...\ ", tokens.span(), ); } }, Nested::Named(name, value) => { def.named_attr(name, value, self); } } } Some(def) } fn parse_callback(&mut self, tokens: TokenStream) -> Option { let span = tokens.span(); let mut tokens = tokens.into_iter(); if let Some(tt) = expect_punct(tokens.next(), '|') { let mut label = TokenStream::from(tt); label.extend(tokens); return Some(Callback::Label(label)); } let first = tokens.next(); let error = expect_punct(tokens.next(), '|'); let arg = match (error, first) { (None, Some(TokenTree::Ident(arg))) => arg, _ => { self.err( "Inline callbacks must use closure syntax with exactly one parameter", span, ); return None; } }; let body = match tokens.next() { Some(TokenTree::Group(group)) => group.stream(), Some(first) => { let mut body = TokenStream::from(first); body.extend(tokens); body } None => { self.err("Callback missing a body", span); return None; } }; let inline = InlineCallback { arg, body, span }; Some(inline.into()) } /// Checks if `ty` is a declared generic param, if so replaces it /// with a concrete type defined using #[logos(type T = Type)] /// /// If no matching generic param is found, all lifetimes are fixed /// to the source lifetime pub fn get_type(&self, ty: &mut Type) -> TokenStream { traverse_type(ty, &mut |ty| { if let Type::Path(tp) = ty { // Skip types that begin with `self::` if tp.qself.is_none() { // If `ty` is a generic type parameter, try to find // its concrete type defined with 
#[logos(type T = Type)] if let Some(substitute) = self.types.find(&tp.path) { *ty = substitute; } } } // If `ty` is a concrete type, fix its lifetimes to 'source replace_lifetime(ty); }); quote!(#ty) } pub fn err(&mut self, message: M, span: Span) -> &mut Errors where M: Into>, { self.errors.err(message, span) } } logos-codegen-0.14.0/src/parser/nested.rs000064400000000000000000000074211046102023000163520ustar 00000000000000use proc_macro2::token_stream::IntoIter as TokenIter; use proc_macro2::{Ident, Literal, TokenStream, TokenTree}; use quote::quote; use crate::util::{expect_punct, is_punct}; pub enum NestedValue { /// `name = ...` Assign(TokenStream), /// `name "literal"` Literal(Literal), /// `name(...)` Group(TokenStream), /// `name ident = ...` KeywordAssign(Ident, TokenStream), } pub enum Nested { /// Unnamed nested attribute, such as a string, /// callback closure, or a lone ident/path /// /// Note: a lone ident will be Named with no value instead Unnamed(TokenStream), /// Named: name ... 
Named(Ident, NestedValue), /// Unexpected token, Unexpected(TokenStream), } pub struct AttributeParser { inner: TokenIter, } pub struct Empty; impl From for TokenStream { fn from(_: Empty) -> TokenStream { TokenStream::new() } } impl AttributeParser { pub fn new(stream: TokenStream) -> Self { AttributeParser { inner: stream.into_iter(), } } pub fn parsed(&mut self) -> Option> where T: syn::parse::Parse, { let tokens = self.collect_tail(TokenStream::new()); if tokens.is_empty() { return None; } Some(syn::parse2(tokens)) } fn next_tt(&mut self) -> Option { expect_punct(self.inner.next(), ',') } fn collect_tail(&mut self, first: T) -> TokenStream where T: Into, { let mut out = first.into(); while let Some(tt) = self.next_tt() { out.extend(Some(tt)); } out } fn parse_unnamed(&mut self, first: Ident, next: TokenTree) -> Nested { let mut out = TokenStream::from(TokenTree::Ident(first)); out.extend(self.collect_tail(next)); Nested::Unnamed(out.into_iter().collect()) } fn parse_assign(&mut self, name: Ident) -> Nested { let value = self.collect_tail(Empty); Nested::Named(name, NestedValue::Assign(value)) } fn parse_literal(&mut self, name: Ident, lit: Literal) -> Nested { // TODO: Error if there are any tokens following let _ = self.collect_tail(Empty); Nested::Named(name, NestedValue::Literal(lit)) } fn parse_group(&mut self, name: Ident, group: TokenStream) -> Nested { Nested::Named(name, NestedValue::Group(group)) } fn parse_keyword(&mut self, keyword: Ident, name: Ident) -> Nested { let error = expect_punct(self.next_tt(), '='); match error { Some(error) => { let error = self.collect_tail(error); Nested::Unexpected(error) } None => { let value = self.collect_tail(Empty); Nested::Named(keyword, NestedValue::KeywordAssign(name, value)) } } } } impl Iterator for AttributeParser { type Item = Nested; fn next(&mut self) -> Option { let first = self.inner.next()?; let name = match first { TokenTree::Ident(ident) => ident, tt => { let stream = self.collect_tail(tt); return 
Some(Nested::Unnamed(stream.into_iter().collect())); } }; match self.next_tt() { Some(tt) if is_punct(&tt, '=') => Some(self.parse_assign(name)), Some(TokenTree::Literal(lit)) => Some(self.parse_literal(name, lit)), Some(TokenTree::Group(group)) => Some(self.parse_group(name, group.stream())), Some(TokenTree::Ident(next)) => Some(self.parse_keyword(name, next)), Some(next) => Some(self.parse_unnamed(name, next)), None => Some(Nested::Unnamed(quote!(#name))), } } } logos-codegen-0.14.0/src/parser/subpattern.rs000064400000000000000000000060441046102023000172570ustar 00000000000000use proc_macro2::TokenStream; use syn::Ident; use crate::error::Errors; use crate::mir::Mir; use crate::parser::definition::{bytes_to_regex_string, Literal}; #[derive(Default)] pub struct Subpatterns { map: Vec<(Ident, String)>, } impl Subpatterns { pub fn add(&mut self, param: Ident, pattern: TokenStream, errors: &mut Errors) { let lit = match syn::parse2::(pattern) { Ok(lit) => lit, Err(e) => { errors.err(e.to_string(), e.span()); return; } }; if let Some((name, _)) = self.map.iter().find(|(name, _)| *name == param) { errors .err(format!("{} can only be assigned once", param), param.span()) .err("Previously assigned here", name.span()); return; } let fixed = self.fix(&lit, errors); // Validate the literal as proper regex. If it's not, emit an error. 
let mir = match &lit { Literal::Utf8(_) => Mir::utf8(&fixed), Literal::Bytes(_) => Mir::binary(&fixed), }; if let Err(err) = mir { errors.err(err, lit.span()); }; self.map.push((param, fixed)); } pub fn fix(&self, lit: &Literal, errors: &mut Errors) -> String { let mut i = 0; let mut pattern = match lit { Literal::Utf8(s) => s.value(), Literal::Bytes(b) => bytes_to_regex_string(b.value()), }; while let Some(f) = pattern[i..].find("(?&") { i += f; pattern.replace_range(i..i + 3, "(?:"); i += 3; let subref_end = if let Some(f) = pattern[i..].find(')') { i + f } else { pattern.truncate(i); // truncate so latter error doesn't suppress break; // regex-syntax will report the unclosed group }; let name = &pattern[i..subref_end]; let name = match syn::parse_str::(name) { Ok(name) => name, Err(_) => { errors.err( format!("subpattern reference `{}` is not an identifier", name), lit.span(), ); // we emitted the error; make something up and continue pattern.replace_range(i..subref_end, "_"); i += 2; continue; } }; match self.map.iter().find(|(def, _)| *def == name) { Some((_, subpattern)) => { pattern.replace_range(i..subref_end, subpattern); i += subpattern.len() + 1; } None => { errors.err( format!("subpattern reference `{}` has not been defined", name), lit.span(), ); // leaving `(?:name)` is fine i = subref_end + 1; } } } pattern } } logos-codegen-0.14.0/src/parser/type_params.rs000064400000000000000000000141011046102023000174050ustar 00000000000000use proc_macro2::{Ident, Span, TokenStream}; use quote::quote; use syn::spanned::Spanned; use syn::{Lifetime, LifetimeParam, Path, Type}; use crate::error::Errors; #[derive(Default)] pub struct TypeParams { lifetime: bool, type_params: Vec<(Ident, Option)>, } impl TypeParams { pub fn explicit_lifetime(&mut self, lt: LifetimeParam, errors: &mut Errors) { if self.lifetime { let span = lt.span(); errors.err("Logos types can only have one lifetime can be set", span); } self.lifetime = true; } pub fn add(&mut self, param: Ident) { 
self.type_params.push((param, None)); } pub fn set(&mut self, param: Ident, ty: TokenStream, errors: &mut Errors) { let ty = match syn::parse2::(ty) { Ok(mut ty) => { replace_lifetimes(&mut ty); ty } Err(err) => { errors.err(err.to_string(), err.span()); return; } }; match self.type_params.iter_mut().find(|(name, _)| *name == param) { Some((_, slot)) => { if let Some(previous) = slot.replace(ty) { errors .err( format!("{} can only have one type assigned to it", param), param.span(), ) .err("Previously assigned here", previous.span()); } } None => { errors.err( format!("{} is not a declared type parameter", param), param.span(), ); } } } pub fn find(&self, path: &Path) -> Option { for (ident, ty) in &self.type_params { if path.is_ident(ident) { return ty.clone(); } } None } pub fn generics(&self, errors: &mut Errors) -> Option { if !self.lifetime && self.type_params.is_empty() { return None; } let mut generics = Vec::new(); if self.lifetime { generics.push(quote!('s)); } for (ty, replace) in self.type_params.iter() { match replace { Some(ty) => generics.push(quote!(#ty)), None => { errors.err( format!( "Generic type parameter without a concrete type\n\ \n\ Define a concrete type Logos can use: #[logos(type {} = Type)]", ty, ), ty.span(), ); } } } if generics.is_empty() { None } else { Some(quote!(<#(#generics),*>)) } } } pub fn replace_lifetimes(ty: &mut Type) { traverse_type(ty, &mut replace_lifetime) } pub fn replace_lifetime(ty: &mut Type) { use syn::{GenericArgument, PathArguments}; match ty { Type::Path(p) => { p.path .segments .iter_mut() .filter_map(|segment| match &mut segment.arguments { PathArguments::AngleBracketed(ab) => Some(ab), _ => None, }) .flat_map(|ab| ab.args.iter_mut()) .for_each(|arg| { if let GenericArgument::Lifetime(lt) = arg { *lt = Lifetime::new("'s", lt.span()); } }); } Type::Reference(r) => { let span = match r.lifetime.take() { Some(lt) => lt.span(), None => Span::call_site(), }; r.lifetime = Some(Lifetime::new("'s", span)); } _ => (), 
} } pub fn traverse_type(ty: &mut Type, f: &mut impl FnMut(&mut Type)) { f(ty); match ty { Type::Array(array) => traverse_type(&mut array.elem, f), Type::BareFn(bare_fn) => { for input in &mut bare_fn.inputs { traverse_type(&mut input.ty, f); } if let syn::ReturnType::Type(_, ty) = &mut bare_fn.output { traverse_type(ty, f); } } Type::Group(group) => traverse_type(&mut group.elem, f), Type::Paren(paren) => traverse_type(&mut paren.elem, f), Type::Path(path) => traverse_path(&mut path.path, f), Type::Ptr(p) => traverse_type(&mut p.elem, f), Type::Reference(r) => traverse_type(&mut r.elem, f), Type::Slice(slice) => traverse_type(&mut slice.elem, f), Type::TraitObject(object) => object.bounds.iter_mut().for_each(|bound| { if let syn::TypeParamBound::Trait(trait_bound) = bound { traverse_path(&mut trait_bound.path, f); } }), Type::Tuple(tuple) => tuple .elems .iter_mut() .for_each(|elem| traverse_type(elem, f)), _ => (), } } fn traverse_path(path: &mut Path, f: &mut impl FnMut(&mut Type)) { for segment in &mut path.segments { match &mut segment.arguments { syn::PathArguments::None => (), syn::PathArguments::AngleBracketed(args) => { for arg in &mut args.args { match arg { syn::GenericArgument::Type(ty) => { traverse_type(ty, f); } syn::GenericArgument::AssocType(assoc) => { traverse_type(&mut assoc.ty, f); } _ => (), } } } syn::PathArguments::Parenthesized(args) => { for arg in &mut args.inputs { traverse_type(arg, f); } if let syn::ReturnType::Type(_, ty) = &mut args.output { traverse_type(ty, f); } } } } } logos-codegen-0.14.0/src/util.rs000064400000000000000000000032501046102023000145450ustar 00000000000000use proc_macro2::{Spacing, Span, TokenStream, TokenTree}; use quote::{quote, ToTokens}; use syn::Ident; /// Analog to Option, except when put into the quote! 
/// macro, `MaybeVoid::Void` will produce `()` #[derive(Clone, Default)] pub enum MaybeVoid { Some(TokenStream), #[default] Void, } impl MaybeVoid { pub fn replace(&mut self, stream: TokenStream) -> MaybeVoid { std::mem::replace(self, MaybeVoid::Some(stream)) } pub fn take(&mut self) -> MaybeVoid { std::mem::replace(self, MaybeVoid::Void) } } impl ToTokens for MaybeVoid { fn to_tokens(&self, out: &mut TokenStream) { match self { MaybeVoid::Some(stream) => out.extend(stream.clone()), MaybeVoid::Void => out.extend(quote!(())), } } fn to_token_stream(&self) -> TokenStream { match self { MaybeVoid::Some(stream) => stream.clone(), MaybeVoid::Void => quote!(()), } } fn into_token_stream(self) -> TokenStream { match self { MaybeVoid::Some(stream) => stream, MaybeVoid::Void => quote!(()), } } } pub fn is_punct(tt: &TokenTree, expect: char) -> bool { matches!(tt, TokenTree::Punct(punct) if punct.as_char() == expect && punct.spacing() == Spacing::Alone) } /// If supplied `tt` is a punct matching a char, returns `None`, else returns `tt` pub fn expect_punct(tt: Option, expect: char) -> Option { tt.filter(|tt| !is_punct(tt, expect)) } pub trait ToIdent { fn to_ident(&self) -> Ident; } impl ToIdent for str { fn to_ident(&self) -> Ident { Ident::new(self, Span::call_site()) } }