pax_global_header00006660000000000000000000000064150617533270014523gustar00rootroot0000000000000052 comment=c5094a84e56eb445297556500f19bcc2128a8859 ammar-regexp_parser-68cdeff/000077500000000000000000000000001506175332700162535ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/.github/000077500000000000000000000000001506175332700176135ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/.github/workflows/000077500000000000000000000000001506175332700216505ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/.github/workflows/gouteur.yml000066400000000000000000000006661506175332700240750ustar00rootroot00000000000000name: gouteur on: [push, pull_request] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Ruby uses: ruby/setup-ruby@v1 with: ruby-version: 3.2 bundler-cache: true - name: Install and run ragel run: | sudo apt-get install -yqq ragel bundle exec rake ragel - name: Test run: bundle exec gouteur ammar-regexp_parser-68cdeff/.github/workflows/lint.yml000066400000000000000000000010031506175332700233330ustar00rootroot00000000000000# based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml name: rubocop linting on: [push, pull_request] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Ruby uses: ruby/setup-ruby@v1 with: ruby-version: 3.2 bundler-cache: true - name: Install and run ragel run: | sudo apt-get install -yqq ragel bundle exec rake ragel - name: Run rubocop run: bundle exec rubocop ammar-regexp_parser-68cdeff/.github/workflows/tests.yml000066400000000000000000000014611506175332700235370ustar00rootroot00000000000000name: tests on: push: pull_request: jobs: build: runs-on: ubuntu-latest strategy: matrix: ruby: [ '2.3', '2.4', '2.5', '2.6', '2.7', '3.0', '3.1', '3.2', '3.3', 'ruby-head' ] env: # The RUBYOPT flags can be removed if RuboCop style checks are added # to the lint workflow and support for Ruby < 2.3 is dropped. RUBYOPT: '--enable=frozen-string-literal --debug=frozen-string-literal' steps: - uses: actions/checkout@v4 - name: Set up Ruby ${{ matrix.ruby }} uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby }} rubygems: latest bundler-cache: true - name: Install ragel run: sudo apt-get install -yqq ragel - name: Test with Rake run: bundle exec rake test:full ammar-regexp_parser-68cdeff/.gitignore000066400000000000000000000002341506175332700202420ustar00rootroot00000000000000*.gem .*.swp .DS_Store .ruby-version .tags .tags1 .tool-versions Gemfile.lock lib/regexp_parser/scanner.rb doc .yardoc .bundle/* pkg/* coverage/* tmp/* ammar-regexp_parser-68cdeff/.gouteur.yml000066400000000000000000000007411506175332700205500ustar00rootroot00000000000000# Usage: https://github.com/jaynetics/gouteur/blob/main/README.md repos: - uri: https://github.com/jaynetics/js_regex - uri: https://github.com/jaynetics/repper - uri: https://github.com/rubocop-hq/rubocop tasks: rspec --pattern "**/{,*}regexp{,*,*/**/*}_spec.rb" - uri: https://github.com/mbj/mutant tasks: rspec --pattern "**/{,*}regexp{,*,*/**/*}_spec.rb" - uri: https://github.com/teamcapybara/capybara tasks: rspec spec/regexp_dissassembler_spec.rb ammar-regexp_parser-68cdeff/.rubocop.yml000066400000000000000000000010351506175332700205240ustar00rootroot00000000000000AllCops: DisabledByDefault: true Exclude: - '{bin,pkg,tmp,vendor}/**/*' # vendored dependencies etc. - 'lib/regexp_parser/scanner.rb' # Ragel-generated code NewCops: enable RubyInterpreters: - ruby - rake SuggestExtensions: false TargetRubyVersion: 2.0 Lint: Enabled: true Lint/UselessConstantScoping: Enabled: false # ignore weird looking regexps in specs, we have these on purpose Lint/DuplicateRegexpCharacterClassElement: Exclude: ['spec/**/*'] Lint/MixedRegexpCaptureTypes: Exclude: ['spec/**/*'] ammar-regexp_parser-68cdeff/CHANGELOG.md000066400000000000000000000776331506175332700201040ustar00rootroot00000000000000# Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] ## [2.11.3] - 2025-09-15 - Janosch Müller ### Fixed - correctly emit backslash-escaped UTF8 characters in character classes as one token (#104) * thanks to [Earlopain](https://github.com/Earlopain) for the report and fix ## [2.11.2] - 2025-08-12 - Janosch Müller ### Added - enable frozen string literals (#98) * thanks to [Geremia Taglialatela](https://github.com/tagliala) ### Fixed - scan with correct encoding when passing regopt individually (#102) * thanks to [Earlopain](https://github.com/Earlopain) for the report ## [2.11.1] - 2025-08-08 - Janosch Müller ### Fixed - restored compatibility with Ruby < 2.6, broken in regexp_parser v2.11.0 * thanks to [DanielFinkWoven](https://github.com/DanielFinkWoven) for the report ## [2.11.0] - 2025-08-03 - Janosch Müller ### Added - a new token `:escape, :utf8_hex` and expression `EscapeSequence::UTF8Hex` * used for UTF-8 hex escapes, e.g. `\xE2\x82\xAC` representing U+20AC "€" ### Fixed - detection of numeric backrefs > 9, e.g. `((((((((((foo))))))))))\10` * these are only backrefs in Ruby if sufficient groups have been opened * they were previously always scanned as octal or literal escapes ## [2.10.0] - 2024-12-25 - Janosch Müller ### Added - `#referenced_expressions` * like `#referenced_expression`, but for multiplexing backrefs * returns the `Group` expressions that are being referenced ### Fixed - fixed `#char` & `#codepoint` errors for single-digit hex escapes * e.g. `\xA` ## [2.9.3] - 2024-11-29 - Janosch Müller ### Fixed - fixed positive lookbehinds with character ">" being treated as named groups * e.g. `(?<=foo>)` * thanks to [Daniel Vandersluis](https://github.com/dvandersluis) ## [2.9.2] - 2024-05-15 - Janosch Müller ### Fixed - made the MFA requirement for changes to this gem visible on rubygems * thanks to [Geremia Taglialatela](https://github.com/tagliala) ## [2.9.1] - 2024-05-11 - Janosch Müller ### Fixed - fixed unnecessary `$LOAD_PATH` searches at load time * thanks to [Koichi ITO](https://github.com/koic) ## [2.9.0] - 2024-01-07 - Janosch Müller ### Added - all expressions now respond to `#negative?` / `#negated?` * previously only sets, props, and posix classes did - implemented `#negative?` / `#negated?` for more applicable expressions * `\B`, `\D`, `\H`, `\S`, `\W`, `(?!...)`, `(?`, `(a)\g<-01>`, `(a)?(?(01)b|c)` * thanks to [Markus Schirp](https://github.com/mbj) for the report ## [2.8.2] - 2023-10-10 - Janosch Müller ### Fixed - handle a corner case where parsing redundant number escapes raised an error * e.g. `parse(/\99/)`, which in Ruby is a valid Regexp that matches `99` * thanks to [Markus Schirp](https://github.com/mbj) for the report ## [2.8.1] - 2023-06-10 - Janosch Müller ### Fixed - support for extpict unicode property, added in Ruby 2.6 - support for 10 unicode script/block properties added in Ruby 3.2 ## [2.8.0] - 2023-04-17 - Janosch Müller ### Added - `Regexp::Expression::Shared#ends_at` * e.g. `parse(/a +/x)[0].ends_at # => 3` * e.g. `parse(/a +/x)[0].ends_at(include_quantifier = false) # => 1` - `Regexp::Expression::Shared#{capturing?,comment?}` * previously only available on capturing and comment groups - `Regexp::Expression::Shared#{decorative?}` * true for decorations: comment groups as well as comments and whitespace in x-mode - `Regexp::Expression::Shared#parent` - new format argument `:original` for `Regexp::Expression::Base#to_s` * includes decorative elements between node and its quantifier * e.g. `parse(/a (?#comment) +/x)[0].to_s(:original) # => "a (?#comment) +"` * using it is not needed when calling `Root#to_s` as Root can't be quantified - support calling `Subexpression#{each_expression,flat_map}` with a one-argument block * in this case, only the expressions are passed to the block, no indices - support calling test methods at Expression class level * `capturing?`, `comment?`, `decorative?`, `referential?`, `terminal?` * e.g. `Regexp::Expression::CharacterSet.terminal? # => false` ### Fixed - `Regexp::Expression::Shared#full_length` with whitespace before quantifier * e.g. `parse(/a +/x)[0].full_length` used to yield `2`, now it yields `3` - `Subexpression#to_s` output with children with whitespace before their quantifier * e.g. `parse(/a + /x).to_s` used to yield `"a+ "`, now it yields `"a + "` * calling `#to_s` on sub-nodes still omits such decorative interludes by default - use new `#to_s` format `:original` to include it - e.g. `parse(/a + /x)[0].to_s(:original) # => "a +"` - fixed `Subexpression#te` behaving differently from other expressions * only `Subexpression#te` used to include the quantifier * now `#te` is the end index without quantifier, as for other expressions - fixed `NoMethodError` when calling `#starts_at` or `#ts` on empty sequences * e.g. `Regexp::Parser.parse(/|/)[0].starts_at` * e.g. `Regexp::Parser.parse(/[&&]/)[0][0].starts_at` - fixed nested comment groups breaking local x-options * e.g. in `/(?x:(?#hello)) /`, the x-option wrongly applied to the whitespace - fixed nested comment groups breaking conditionals * e.g. in `/(a)(?(1)b|c(?#hello)d)e/`, the 2nd conditional branch included "e" - fixed quantifiers after comment groups being mis-assigned to that group * e.g. in `/a(?#foo){3}/` (matches 'aaa') - fixed Scanner accepting two cases of invalid Regexp syntax * unmatched closing parentheses (`)`) and k-backrefs with number 0 (`\k<0>`) * these are a `SyntaxError` in Ruby, so could only be passed as a String * they now raise a `Regexp::Scanner::ScannerError` - fixed some scanner errors not inheriting from `Regexp::Scanner::ScannerError` - reduced verbosity of inspect / pretty print output ## [2.7.0] - 2023-02-08 - Janosch Müller ### Added - `Regexp::Lexer.lex` now streams tokens when called with a block * it can now take arbitrarily large input, just like `Regexp::Scanner` * this also slightly improves `Regexp::Parser.parse` performance * note: `Regexp::Parser.parse` still does not and will not support streaming - improved performance of `Subexpression#each_expression` - minor improvements to `Regexp::Scanner` performance - overall improvement of parse performance: about 10% for large Regexps ### Fixed - parsing of octal escape sequences in sets, e.g. `[\141]` * thanks to [Randy Stauner](https://github.com/rwstauner) for the report ## [2.6.2] - 2023-01-19 - Janosch Müller ### Fixed - fixed `SystemStackError` when cloning recursive subexpression calls * e.g. `Regexp::Parser.parse(/a|b\g<0>/).dup` ## [2.6.1] - 2022-11-16 - Janosch Müller ### Fixed - fixed scanning of two negative lookbehind edge cases * `(?` used to raise a ScannerError * `(?)y` used to be misinterpreted as a named group * thanks to [Sergio Medina](https://github.com/serch) for the report ## [2.6.0] - 2022-09-26 - Janosch Müller ### Fixed - fixed `#referenced_expression` for `\g<0>` (was `nil`, is now the `Root` exp) - fixed `#reference`, `#referenced_expression` for recursion level backrefs * e.g. `(a)(b)\k<-1+1>` * `#referenced_expression` was `nil`, now it is the correct `Group` exp - detect and raise for two more syntax errors when parsing String input * quantification of option switches (e.g. `(?i)+`) * invalid references (e.g. `/\k<1>/`) * these are a `SyntaxError` in Ruby, so could only be passed as a String ### Added - `Regexp::Expression::Base#human_name` * returns a nice, human-readable description of the expression - `Regexp::Expression::Base#optional?` * returns `true` if the expression is quantified accordingly (e.g. with `*`, `{,n}`) - added a deprecation warning when calling `#to_re` on set members ## [2.5.0] - 2022-05-27 - Janosch Müller ### Added - `Regexp::Expression::Base.construct` and `.token_class` methods * see the [wiki](https://github.com/ammar/regexp_parser/wiki) for details ## [2.4.0] - 2022-05-09 - Janosch Müller ### Fixed - fixed interpretation of `+` and `?` after interval quantifiers (`{n,n}`) * they used to be treated as reluctant or possessive mode indicators * however, Ruby does not support these modes for interval quantifiers * they are now treated as chained quantifiers instead, as Ruby does it * c.f. [#3](https://github.com/ammar/regexp_parser/issues/3) - fixed `Expression::Base#nesting_level` for some tree rewrite cases * e.g. the alternatives in `/a|[b]/` had an inconsistent nesting_level - fixed `Scanner` accepting invalid posix classes, e.g. `[[:foo:]]` * they raise a `SyntaxError` when used in a Regexp, so could only be passed as String * they now raise a `Regexp::Scanner::ValidationError` in the `Scanner` ### Added - added `Expression::Base#==` for (deep) comparison of expressions - added `Expression::Base#parts` * returns the text elements and subexpressions of an expression * e.g. `parse(/(a)/)[0].parts # => ["(", #, ")"]` - added `Expression::Base#te` (a.k.a. token end index) * `Expression::Subexpression` always had `#te`, only terminal nodes lacked it so far - made some `Expression::Base` methods available on `Quantifier` instances, too * `#type`, `#type?`, `#is?`, `#one_of?`, `#options`, `#terminal?` * `#base_length`, `#full_length`, `#starts_at`, `#te`, `#ts`, `#offset` * `#conditional_level`, `#level`, `#nesting_level` , `#set_level` * this allows a more unified handling with `Expression::Base` instances - allowed `Quantifier#initialize` to take a token and options Hash like other nodes - added a deprecation warning for initializing Quantifiers with 4+ arguments: Calling `Expression::Base#quantify` or `Quantifier.new` with 4+ arguments is deprecated. It will no longer be supported in regexp_parser v3.0.0. Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode` with `::Regexp::Token.new(:quantifier, token, text)`. min, max, and mode will be derived automatically. Or do `exp.quantifier = Quantifier.construct(token: token, text: str)`. This is consistent with how Expression::Base instances are created. ## [2.3.1] - 2022-04-24 - Janosch Müller ### Fixed - removed five inexistent unicode properties from `Syntax#features` * these were never supported by Ruby or the `Regexp::Scanner` * thanks to [Markus Schirp](https://github.com/mbj) for the report ## [2.3.0] - 2022-04-08 - Janosch Müller ### Added - improved parsing performance through `Syntax` refactoring * instead of fresh `Syntax` instances, pre-loaded constants are now re-used * this approximately doubles the parsing speed for simple regexps - added methods to `Syntax` classes to show relative feature sets * e.g. `Regexp::Syntax::V3_2_0.added_features` - support for new unicode properties of Ruby 3.2 / Unicode 14.0 ## [2.2.1] - 2022-02-11 - Janosch Müller ### Fixed - fixed Syntax version of absence groups (`(?~...)`) * the lexer accepted them for any Ruby version * now they are only recognized for Ruby >= 2.4.1 in which they were introduced - reduced gem size by excluding specs from package - removed deprecated `test_files` gemspec setting - no longer depend on `yaml`/`psych` (except for Ruby <= 2.4) - no longer depend on `set` * `set` was removed from the stdlib and made a standalone gem as of Ruby 3 * this made it a hidden/undeclared dependency of `regexp_parser` ## [2.2.0] - 2021-12-04 - Janosch Müller ### Added - added support for 13 new unicode properties introduced in Ruby 3.1.0 ## [2.1.1] - 2021-02-23 - Janosch Müller ### Fixed - fixed `NameError` when requiring only `'regexp_parser/scanner'` in v2.1.0 * thanks to [Jared White and Sam Ruby](https://github.com/ruby2js/ruby2js) for the report ## [2.1.0] - 2021-02-22 - Janosch Müller ### Added - common ancestor for all scanning/parsing/lexing errors * `Regexp::Parser::Error` can now be rescued as a catch-all * the following errors (and their many descendants) now inherit from it: - `Regexp::Expression::Conditional::TooManyBranches` - `Regexp::Parser::ParserError` - `Regexp::Scanner::ScannerError` - `Regexp::Scanner::ValidationError` - `Regexp::Syntax::SyntaxError` * it replaces `ArgumentError` in some rare cases (`Regexp::Parser.parse('?')`) * thanks to [sandstrom](https://github.com/sandstrom) for the cue ### Fixed - fixed scanning of whole-pattern recursion calls `\g<0>` and `\g'0'` * a regression in v2.0.1 had caused them to be scanned as literals - fixed scanning of some backreference and subexpression call edge cases * e.g. `\k<+1>`, `\g` - fixed tokenization of some escapes in character sets * `.`, `|`, `{`, `}`, `(`, `)`, `^`, `$`, `?`, `+`, `*` * all of these correctly emitted `#type` `:literal` and `#token` `:literal` if *not* escaped * if escaped, they emitted e.g. `#type` `:escape` and `#token` `:group_open` for `[\(]` * the escaped versions now correctly emit `#type` `:escape` and `#token` `:literal` - fixed handling of control/metacontrol escapes in character sets * e.g. `[\cX]`, `[\M-\C-X]` * they were misread as bunch of individual literals, escapes, and ranges - fixed some cases where calling `#dup`/`#clone` on expressions led to shared state ## [2.0.3] - 2020-12-28 - Janosch Müller ### Fixed - fixed error when scanning some unlikely and redundant but valid charset patterns * e.g. `/[[.a-b.]]/`, `/[[=e=]]/`, - fixed ancestry of some error classes related to syntax version lookup * `NotImplementedError`, `InvalidVersionNameError`, `UnknownSyntaxNameError` * they now correctly inherit from `Regexp::Syntax::SyntaxError` instead of Rubys `::SyntaxError` ## [2.0.2] - 2020-12-25 - Janosch Müller ### Fixed - fixed `FrozenError` when calling `#to_s` on a frozen `Group::Passive` * thanks to [Daniel Gollahon](https://github.com/dgollahon) ## [2.0.1] - 2020-12-20 - Janosch Müller ### Fixed - fixed error when scanning some group names * this affected names containing hyphens, digits or multibyte chars, e.g. `/(?a)/` * thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report - fixed error when scanning hex escapes with just one hex digit * e.g. `/\x0A/` was scanned correctly, but the equivalent `/\xA/` was not * thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report ## [2.0.0] - 2020-11-25 - Janosch Müller ### Changed - some methods that used to return byte-based indices now return char-based indices * the returned values have only changed for Regexps that contain multibyte chars * this is only a breaking change if you used such methods directly AND relied on them pointing to bytes * affected methods: * `Regexp::Token` `#length`, `#offset`, `#te`, `#ts` * `Regexp::Expression::Base` `#full_length`, `#offset`, `#starts_at`, `#te`, `#ts` * thanks to [Akinori MUSHA](https://github.com/knu) for the report - removed some deprecated methods/signatures * these are rarely used and have been showing deprecation warnings for a long time * `Regexp::Expression::Subexpression.new` with 3 arguments * `Regexp::Expression::Root.new` without a token argument * `Regexp::Expression.parsed` ### Added - `Regexp::Expression::Base#base_length` * returns the character count of an expression body, ignoring any quantifier - pragmatic, experimental support for chained quantifiers * e.g.: `/^a{10}{4,6}$/` matches exactly 40, 50 or 60 `a`s * successive quantifiers used to be silently dropped by the parser * they are now wrapped with passive groups as if they were written `(?:a{10}){4,6}` * thanks to [calfeld](https://github.com/calfeld) for reporting this a while back ### Fixed - incorrect encoding output for non-ascii comments * this led to a crash when calling `#to_s` on parse results containing such comments * thanks to [Michael Glass](https://github.com/michaelglass) for the report - some crashes when scanning contrived patterns such as `'\😋'` ### [1.8.2] - 2020-10-11 - Janosch Müller ### Fixed - fix `FrozenError` in `Expression::Base#repetitions` on Ruby 3.0 * thanks to [Thomas Walpole](https://github.com/twalpole) - removed "unknown future version" warning on Ruby 3.0 ### [1.8.1] - 2020-09-28 - Janosch Müller ### Fixed - fixed scanning of comment-like text in normal mode * this was an old bug, but had become more prevalent in v1.8.0 * thanks to [Tietew](https://github.com/Tietew) for the report - specified correct minimum Ruby version in gemspec * it said 1.9 but really required 2.0 as of v1.8.0 ### [1.8.0] - 2020-09-20 - Janosch Müller ### Changed - dropped support for running on Ruby 1.9.x ### Added - regexp flags can now be passed when parsing a `String` as regexp body * see the [README](/README.md#usage) for details * thanks to [Owen Stephens](https://github.com/owst) - bare occurrences of `\g` and `\k` are now allowed and scanned as literal escapes * matches Onigmo behavior * thanks for the report to [Marc-André Lafortune](https://github.com/marcandre) ### Fixed - fixed parsing comments without preceding space or trailing newline in x-mode * thanks to [Owen Stephens](https://github.com/owst) ### [1.7.1] - 2020-06-07 - Ammar Ali ### Fixed - Support for literals that include the unescaped delimiters `{`, `}`, and `]`. These delimiters are informally supported by various regexp engines. ### [1.7.0] - 2020-02-23 - Janosch Müller ### Added - `Expression::Base#each_expression` and `#traverse` can now be called without a block * this returns an `Enumerator` and allows chaining, e.g. `each_expression.select` * thanks to [Masataka Kuwabara](https://github.com/pocke) ### Fixed - `MatchLength#each` no longer ignores the given `limit:` when called without a block ### [1.6.0] - 2019-06-16 - Janosch Müller ### Added - Added support for 16 new unicode properties introduced in Ruby 2.6.2 and 2.6.3 ### [1.5.1] - 2019-05-23 - Janosch Müller ### Fixed - Fixed `#options` (and thus `#i?`, `#u?` etc.) not being set for some expressions: * this affected posix classes as well as alternation, conditional, and intersection branches * `#options` was already correct for all child expressions of such branches * this only made an operational difference for posix classes as they respect encoding flags - Fixed `#options` not respecting all negative options in weird cases like '(?u-m-x)' - Fixed `Group#option_changes` not accounting for indirectly disabled (overridden) encoding flags - Fixed `Scanner` allowing negative encoding options if there were no positive options, e.g. '(?-u)' - Fixed `ScannerError` for some valid meta/control sequences such as '\\C-\\\\' - Fixed `Expression::Base#match` and `#=~` not working with a single argument ### [1.5.0] - 2019-05-14 - Janosch Müller ### Added - Added `#referenced_expression` for backrefs, subexp calls and conditionals * returns the `Group` expression that is being referenced via name or number - Added `Expression::Base#repetitions` * returns a `Range` of allowed repetitions (`1..1` if there is no quantifier) * like `#quantity` but with a more uniform interface - Added `Expression::Base#match_length` * allows to inspect and iterate over String lengths matched by the Expression ### Fixed - Fixed `Expression::Base#clone` "direction" * it used to dup ivars onto the callee, leaving only the clone referencing the original objects * this will affect you if you call `#eql?`/`#equal?` on expressions or use them as Hash keys - Fixed `#clone` results for `Sequences`, e.g. alternations and conditionals * the inner `#text` was cloned onto the `Sequence` and thus duplicated * e.g. `Regexp::Parser.parse(/(a|bc)/).clone.to_s # => (aa|bcbc)` - Fixed inconsistent `#to_s` output for `Sequences` * it used to return only the "specific" text, e.g. "|" for an alternation * now it includes nested expressions as it does for all other `Subexpressions` - Fixed quantification of codepoint lists with more than one entry (`\u{62 63 64}+`) * quantifiers apply only to the last entry, so this token is now split up if quantified ### [1.4.0] - 2019-04-02 - Janosch Müller ### Added - Added support for 19 new unicode properties introduced in Ruby 2.6.0 ### [1.3.0] - 2018-11-14 - Janosch Müller ### Added - `Syntax#features` returns a `Hash` of all types and tokens supported by a given `Syntax` ### Fixed - Thanks to [Akira Matsuda](https://github.com/amatsuda) * eliminated warning "assigned but unused variable - testEof" ## [1.2.0] - 2018-09-28 - Janosch Müller ### Added - `Subexpression` (branch node) includes `Enumerable`, allowing to `#select` children etc. ### Fixed - Fixed missing quantifier in `Conditional::Expression` methods `#to_s`, `#to_re` - `Conditional::Condition` no longer lives outside the recursive `#expressions` tree * it used to be the only expression stored in a custom ivar, complicating traversal * its setter and getter (`#condition=`, `#condition`) still work as before ## [1.1.0] - 2018-09-17 - Janosch Müller ### Added - Added `Quantifier` methods `#greedy?`, `#possessive?`, `#reluctant?`/`#lazy?` - Added `Group::Options#option_changes` * shows the options enabled or disabled by the given options group * as with all other expressions, `#options` shows the overall active options - Added `Conditional#reference` and `Condition#reference`, indicating the determinative group - Added `Subexpression#dig`, acts like [`Array#dig`](http://ruby-doc.org/core-2.5.0/Array.html#method-i-dig) ### Fixed - Fixed parsing of quantified conditional expressions (quantifiers were assigned to the wrong expression) - Fixed scanning and parsing of forward-referring subexpression calls (e.g. `\g<+1>`) - `Root` and `Sequence` expressions now support the same constructor signature as all other expressions ## [1.0.0] - 2018-09-01 - Janosch Müller This release includes several breaking changes, mostly to character sets, #map and properties. ### Changed - Changed handling of sets (a.k.a. character classes or "bracket expressions") * see PR [#55](https://github.com/ammar/regexp_parser/pull/55) / issue [#47](https://github.com/ammar/regexp_parser/issues/47) for details * sets are now parsed to expression trees like other nestable expressions * `#scan` now emits the same tokens as outside sets (no longer `:set, :member`) * `CharacterSet#members` has been removed * new `Range` and `Intersection` classes represent corresponding syntax features * a new `PosixClass` expression class represents e.g. `[[:ascii:]]` * `PosixClass` instances behave like `Property` ones, e.g. support `#negative?` * `#scan` emits `:(non)posixclass, :` instead of `:set, :char_(non)` - Changed `Subexpression#map` to act like regular `Enumerable#map` * the old behavior is available as `Subexpression#flat_map` * e.g. `parse(/[a]/).map(&:to_s) == ["[a]"]`; used to be `["[a]", "a"]` - Changed expression emissions for some escape sequences * `EscapeSequence::Codepoint`, `CodepointList`, `Hex` and `Octal` are now all used * they already existed, but were all parsed as `EscapeSequence::Literal` * e.g. `\x97` is now `EscapeSequence::Hex` instead of `EscapeSequence::Literal` - Changed naming of many property tokens (emitted for `\p{...}`) * if you work with these tokens, see PR [#56](https://github.com/ammar/regexp_parser/pull/56) for details * e.g. `:punct_dash` is now `:dash_punctuation` - Changed `(?m)` and the likes to emit as `:options_switch` token (@4ade4d1) * allows differentiating from group-local `:options`, e.g. `(?m:.)` - Changed name of `Backreference::..NestLevel` to `..RecursionLevel` (@4184339) - Changed `Backreference::Number#number` from `String` to `Integer` (@40a2231) ### Added - Added support for all previously missing properties (about 250) - Added `Expression::UnicodeProperty#shortcut` (e.g. returns "m" for `\p{mark}`) - Added `#char(s)` and `#codepoint(s)` methods to all `EscapeSequence` expressions - Added `#number`/`#name`/`#recursion_level` to all backref/call expressions (@174bf21) - Added `#number` and `#number_at_level` to capturing group expressions (@40a2231) ### Fixed - Fixed Ruby version mapping of some properties - Fixed scanning of some property spellings, e.g. with dashes - Fixed some incorrect property alias normalizations - Fixed scanning of codepoint escapes with 6 digits (e.g. `\u{10FFFF}`) - Fixed scanning of `\R` and `\X` within sets; they act as literals there ## [0.5.0] - 2018-04-29 - Janosch Müller ### Changed - Changed handling of Ruby versions (PR [#53](https://github.com/ammar/regexp_parser/pull/53)) * New Ruby versions are now supported by default * Some deep-lying APIs have changed, which should not affect most users: * `Regexp::Syntax::VERSIONS` is gone * Syntax version names have changed from `Regexp::Syntax::Ruby::Vnnn` to `Regexp::Syntax::Vn_n_n` * Syntax version classes for Ruby versions without regex feature changes are no longer predefined and are now only created on demand / lazily * `Regexp::Syntax::supported?` returns true for any argument >= 1.8.6 ### Fixed - Fixed some use cases of Expression methods #strfregexp and #to_h (@e738107) ### Added - Added full signature support to collection methods of Expressions (@aa7c55a) ## [0.4.13] - 2018-04-04 - Ammar Ali - Added ruby version files for 2.2.10 and 2.3.7 ## [0.4.12] - 2018-03-30 - Janosch Müller - Added ruby version files for 2.4.4 and 2.5.1 ## [0.4.11] - 2018-03-04 - Janosch Müller - Fixed UnknownSyntaxNameError introduced in v0.4.10 if the gems parent dir tree included a 'ruby' dir ## [0.4.10] - 2018-03-04 - Janosch Müller - Added ruby version file for 2.6.0 - Added support for Emoji properties (available in Ruby since 2.5.0) - Added support for XPosixPunct and Regional_Indicator properties - Fixed parsing of Unicode 6.0 and 7.0 script properties - Fixed parsing of the special Assigned property - Fixed scanning of InCyrillic_Supplement property ## [0.4.9] - 2017-12-25 - Ammar Ali - Added ruby version file for 2.5.0 ## [0.4.8] - 2017-12-18 - Janosch Müller - Added ruby version files for 2.2.9, 2.3.6, and 2.4.3 ## [0.4.7] - 2017-10-15 - Janosch Müller - Fixed a thread safety issue (issue #45) - Some public class methods that were only reliable for internal use are now private instance methods (PR #46) - Improved the usefulness of Expression::Base#options (issue #43) - #options and derived methods such as #i?, #m? and #x? are now defined for all Expressions that are affected by such flags. - Fixed scanning of whitespace following (?x) (commit 5c94bd2) - Fixed a Parser bug where the #number attribute of traditional numerical backreferences was not set correctly (commit 851b620) ## [0.4.6] - 2017-09-18 - Janosch Müller - Added Parser support for hex escapes in sets (PR #36) - Added Parser support for octal escapes (PR #37) - Added support for cluster types \R and \X (PR #38) - Added support for more metacontrol notations (PR #39) ## [0.4.5] - 2017-09-17 - Ammar Ali - Thanks to [Janosch Müller](https://github.com/janosch-x): * Support ruby 2.2.7 (PR #42) - Added ruby version files for 2.2.8, 2.3.5, and 2.4.2 ## [0.4.4] - 2017-07-10 - Ammar Ali - Thanks to [Janosch Müller](https://github.com/janosch-x): * Add support for new absence operator (PR #33) - Thanks to [Bartek Bułat](https://github.com/barthez): * Add support for Ruby 2.3.4 version (PR #40) ## [0.4.3] - 2017-03-24 - Ammar Ali - Added ruby version file for 2.4.1 ## [0.4.2] - 2017-01-10 - Ammar Ali - Thanks to [Janosch Müller](https://github.com/janosch-x): * Support ruby 2.4 (PR #30) * Improve codepoint handling (PR #27) ## [0.4.1] - 2016-11-22 - Ammar Ali - Updated ruby version file for 2.3.3 ## [0.4.0] - 2016-11-20 - Ammar Ali - Added Syntax.supported? method - Updated ruby versions for latest releases; 2.1.10, 2.2.6, and 2.3.2 ## [0.3.6] - 2016-06-08 - Ammar Ali - Thanks to [John Backus](https://github.com/backus): * Remove warnings (PR #26) ## [0.3.5] - 2016-05-30 - Ammar Ali - Thanks to [John Backus](https://github.com/backus): * Fix parsing of /\xFF/n (hex:escape) (PR #24) ## [0.3.4] - 2016-05-25 - Ammar Ali - Thanks to [John Backus](https://github.com/backus): * Fix warnings (PR #19) - Thanks to [Dana Scheider](https://github.com/danascheider): * Correct error in README (PR #20) - Fixed mistyped \h and \H character types (issue #21) - Added ancestry syntax files for latest rubies (issue #22) ## [0.3.3] - 2016-04-26 - Ammar Ali - Thanks to [John Backus](https://github.com/backus): * Fixed scanning of zero length comments (PR #12) * Fixed missing escape:codepoint_list syntax token (PR #14) * Fixed to_s for modified interval quantifiers (PR #17) ## [0.3.2] - 2016-01-01 - Ammar Ali - Updated ruby versions for latest releases; 2.1.8, 2.2.4, and 2.3.0 - Fixed class name for UnknownSyntaxNameError exception - Added UnicodeBlocks support to the parser. - Added UnicodeBlocks support to the scanner. - Added expand_members method to CharacterSet, returns traditional or unicode property forms of shothands (\d, \W, \s, etc.) - Improved meaning and output of %t and %T in strfregexp. - Added syntax versions for ruby 2.1.4 and 2.1.5 and updated latest 2.1 version. - Added to_h methods to Expression, Subexpression, and Quantifier. - Added traversal methods; traverse, each_expression, and map. - Added token/type test methods; type?, is?, and one_of? - Added printing method strfregexp, inspired by strftime. - Added scanning and parsing of free spacing (x mode) expressions. - Improved handling of inline options (?mixdau:...) - Added conditional expressions. Ruby 2.0. - Added keep (\K) markers. Ruby 2.0. - Added d, a, and u options. Ruby 2.0. - Added missing meta sequences to the parser. They were supported by the scanner only. - Renamed Lexer's method to lex, added an alias to the old name (scan) - Use #map instead of #each to run the block in Lexer.lex. - Replaced VERSION.yml file with a constant. - Update tokens and scanner with new additions in Unicode 7.0. ## [0.1.6] - 2014-10-06 - Ammar Ali - Fixed test and gem building rake tasks and extracted the gem specification from the Rakefile into a .gemspec file. - Added syntax files for missing ruby 2.x versions. These do not add extra syntax support, they just make the gem work with the newer ruby versions. - Fixed a parser bug where an alternation sequence that contained nested expressions was incorrectly being appended to the parent expression when the nesting was exited. e.g. in /a|(b)c/, c was appended to the root. - Fixed a bug where character types were not being correctly scanned within character sets. e.g. in [\d], two tokens were scanned; one for the backslash '\' and one for the 'd' ## [0.1.5] - 2014-01-14 - Ammar Ali - Added syntax stubs for ruby versions 2.0 and 2.1 - Added clone methods for deep copying expressions. - Added optional format argument for to_s on expressions to return the text of the expression with (:full, the default) or without (:base) its quantifier. - Renamed the :beginning_of_line and :end_of_line tokens to :bol and :eol. - Fixed a bug where alternations with more than two alternatives and one of them ending in a group were being incorrectly nested. - Improved EOF handling in general and especially from sequences like hex and control escapes. - Fixed a bug where named groups with an empty name would return a blank token []. - Fixed a bug where member of a parent set where being added to its last subset. - Fixed a few mutable string bugs by calling dup on the originals. - Made ruby 1.8.6 the base for all 1.8 syntax, and the 1.8 name a pointer to the latest (1.8.7 at this time) - Removed look-behind assertions (positive and negative) from 1.8 syntax - Added control (\cc and \C-c) and meta (\M-c) escapes to 1.8 syntax - The default syntax is now the one of the running ruby version in both the lexer and the parser. ## [0.1.0] - 2010-11-21 - Ammar Ali - Initial release ammar-regexp_parser-68cdeff/Gemfile000066400000000000000000000005321506175332700175460ustar00rootroot00000000000000# frozen_string_literal: true source 'https://rubygems.org' gemspec group :development, :test do gem 'leto', '~> 2.1' gem 'rake', '~> 13.1' gem 'regexp_property_values', '~> 1.5' gem 'rspec', '~> 3.10' if RUBY_VERSION.to_f >= 2.7 gem 'benchmark-ips', '~> 2.1' gem 'gouteur', '~> 1.1' gem 'rubocop', '>= 1.80.2' end end ammar-regexp_parser-68cdeff/LICENSE000066400000000000000000000020521506175332700172570ustar00rootroot00000000000000Copyright (c) 2010, 2012-2025, Ammar Ali Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ammar-regexp_parser-68cdeff/README.md000066400000000000000000000516521506175332700175430ustar00rootroot00000000000000# Regexp::Parser [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser) [![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions) [![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions) A Ruby gem for tokenizing, parsing, and transforming regular expressions. * Multilayered * A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/) * A lexer that produces a "stream" of [Token objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects) * A parser that produces a "tree" of [Expression objects (OO API)](https://github.com/ammar/regexp_parser/wiki/Expression-Objects) * Runs on Ruby 2.x, 3.x and JRuby runtimes * Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax) _For examples of regexp_parser in use, see [Example Projects](#example-projects)._ ## Requirements * Ruby >= 2.0 * Ragel >= 6.0, but only if you want to build the gem or work on the scanner. ## Install Install the gem with: `gem install regexp_parser` Or, add it to your project's `Gemfile`: ```gem 'regexp_parser', '~> X.Y.Z'``` See the badge at the top of this README or [rubygems](https://rubygems.org/gems/regexp_parser) for the the latest version number. ## Usage The three main modules are **Scanner**, **Lexer**, and **Parser**. Each of them provides a single method that takes a regular expression (as a Regexp object or a string) and returns its results. The **Lexer** and the **Parser** accept an optional second argument that specifies the syntax version, like 'ruby/2.0', which defaults to the host Ruby version (using RUBY_VERSION). Here are the basic usage examples: ```ruby require 'regexp_parser' Regexp::Scanner.scan(regexp) Regexp::Lexer.lex(regexp) Regexp::Parser.parse(regexp) ``` All three methods accept a block as the last argument, which, if given, gets called with the results as follows: * **Scanner**: the block gets passed the results as they are scanned. See the example in the next section for details. * **Lexer**: the block gets passed the tokens one by one as they are scanned. _The result of the block is returned._ * **Parser**: after completion, the block gets passed the root expression. _The result of the block is returned._ All three methods accept either a `Regexp` or `String` (containing the pattern) - if a String is passed, `options` can be supplied: ```ruby require 'regexp_parser' Regexp::Parser.parse( "a+ # Recognizes a and A...", options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE ) ``` ## Components ### Scanner A Ragel-generated scanner that recognizes the cumulative syntax of all supported syntax versions. It breaks a given expression's text into the smallest parts, and identifies their type, token, text, and start/end offsets within the pattern. #### Example The following scans the given pattern and prints out the type, token, text and start/end offsets for each token found. ```ruby require 'regexp_parser' Regexp::Scanner.scan(/(ab?(cd)*[e-h]+)/) do |type, token, text, ts, te| puts "type: #{type}, token: #{token}, text: '#{text}' [#{ts}..#{te}]" end # output # type: group, token: capture, text: '(' [0..1] # type: literal, token: literal, text: 'ab' [1..3] # type: quantifier, token: zero_or_one, text: '?' [3..4] # type: group, token: capture, text: '(' [4..5] # type: literal, token: literal, text: 'cd' [5..7] # type: group, token: close, text: ')' [7..8] # type: quantifier, token: zero_or_more, text: '*' [8..9] # type: set, token: open, text: '[' [9..10] # type: set, token: range, text: 'e-h' [10..13] # type: set, token: close, text: ']' [13..14] # type: quantifier, token: one_or_more, text: '+' [14..15] # type: group, token: close, text: ')' [15..16] ``` A one-liner that uses map on the result of the scan to return the textual parts of the pattern: ```ruby Regexp::Scanner.scan(/(cat?([bhm]at)){3,5}/).map { |token| token[2] } # => ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"] ``` #### Notes * The scanner performs basic syntax error checking, like detecting missing balancing punctuation and premature end of pattern. Flavor validity checks are performed in the lexer, which uses a syntax object. * If the input is a Ruby **Regexp** object, the scanner calls #source on it to get its string representation. #source does not include the options of the expression (m, i, and x). To include the options in the scan, #to_s should be called on the **Regexp** before passing it to the scanner or the lexer. For the parser, however, this is not necessary. It automatically exposes the options of a passed **Regexp** in the returned root expression. * To keep the scanner simple(r) and fairly reusable for other purposes, it does not perform lexical analysis on the tokens, sticking to the task of identifying the smallest possible tokens and leaving lexical analysis to the lexer. * The MRI implementation may accept expressions that either conflict with the documentation or are undocumented, like `{}` and `]` _(unescaped)_. The scanner will try to support as many of these cases as possible. ### Syntax Defines the supported tokens for a specific engine implementation (aka a flavor). Syntax classes act as lookup tables, and are layered to create flavor variations. Syntax only comes into play in the lexer. #### Example The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and checks a few of their implementation features. ```ruby require 'regexp_parser' ruby_20 = Regexp::Syntax.for 'ruby/2.0' ruby_20.implements? :quantifier, :zero_or_one # => true ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true ruby_20.implements? :quantifier, :zero_or_one_possessive # => true ruby_20.implements? :conditional, :condition # => true ruby_19 = Regexp::Syntax.for 'ruby/1.9' ruby_19.implements? :quantifier, :zero_or_one # => true ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true ruby_19.implements? :quantifier, :zero_or_one_possessive # => true ruby_19.implements? :conditional, :condition # => false ruby_18 = Regexp::Syntax.for 'ruby/1.8' ruby_18.implements? :quantifier, :zero_or_one # => true ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true ruby_18.implements? :quantifier, :zero_or_one_possessive # => false ruby_18.implements? :conditional, :condition # => false ``` Syntax objects can also be queried about their complete and relative feature sets. ```ruby require 'regexp_parser' ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0 ruby_20.added_features # => { conditional: [...], ... } ruby_20.removed_features # => { property: [:newline], ... } ruby_20.features # => { anchor: [...], ... } ``` #### Notes * Variations on a token, for example a named group with angle brackets (< and >) vs one with a pair of single quotes, are specified with an underscore followed by two characters appended to the base token. In the previous named group example, the tokens would be :named_ab (angle brackets) and :named_sq (single quotes). These variations are normalized by the syntax to :named. ### Lexer Sits on top of the scanner and performs lexical analysis on the tokens that it emits. Among its tasks are; breaking quantified literal runs, collecting the emitted token attributes into Token objects, calculating their nesting depth, normalizing tokens for the parser, and checking if the tokens are implemented by the given syntax version. See the [Token Objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects) wiki page for more information on Token objects. #### Example The following example lexes the given pattern, checks it against the Ruby 1.9 syntax, and prints the token objects' text indented to their level. ```ruby require 'regexp_parser' Regexp::Lexer.lex(/a?(b(c))*[d]+/, 'ruby/1.9') do |token| puts "#{' ' * token.level}#{token.text}" end # output # a # ? # ( # b # ( # c # ) # ) # * # [ # d # ] # + ``` A one-liner that returns an array of the textual parts of the given pattern. Compare the output with that of the one-liner example of the **Scanner**; notably how the sequence 'cat' is treated. The 't' is separated because it's followed by a quantifier that only applies to it. ```ruby Regexp::Lexer.scan(/(cat?([b]at)){3,5}/).map { |token| token.text } # => ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"] ``` #### Notes * The syntax argument is optional. It defaults to the version of the Ruby interpreter in use, as returned by RUBY_VERSION. * The lexer normalizes some tokens, as noted in the Syntax section above. ### Parser Sits on top of the lexer and transforms the "stream" of Token objects emitted by it into a tree of Expression objects represented by an instance of the `Expression::Root` class. See the [Expression Objects](https://github.com/ammar/regexp_parser/wiki/Expression-Objects) wiki page for attributes and methods. #### Example This example uses the tree traversal method `#each_expression` and the method `#strfregexp` to print each object in the tree. ```ruby include_root = true indent_offset = include_root ? 1 : 0 tree.each_expression(include_root) do |exp| puts exp.strfregexp("%>> %c", indent_offset) end # Output # > Regexp::Expression::Root # > Regexp::Expression::Literal # > Regexp::Expression::Group::Capture # > Regexp::Expression::Literal # > Regexp::Expression::Group::Capture # > Regexp::Expression::Literal # > Regexp::Expression::Literal # > Regexp::Expression::Group::Named # > Regexp::Expression::CharacterSet ``` _Note: quantifiers do not appear in the output because they are members of the Expression class. See the next section for details._ Another example, using `#traverse` for a more fine-grained tree traversal: ```ruby require 'regexp_parser' regex = /a?(b+(c)d)*(?[0-9]+)/ tree = Regexp::Parser.parse(regex, 'ruby/2.1') tree.traverse do |event, exp| puts "#{event}: #{exp.type} `#{exp.to_s}`" end # Output # visit: literal `a?` # enter: group `(b+(c)d)*` # visit: literal `b+` # enter: group `(c)` # visit: literal `c` # exit: group `(c)` # visit: literal `d` # exit: group `(b+(c)d)*` # enter: group `(?[0-9]+)` # visit: set `[0-9]+` # exit: group `(?[0-9]+)` ``` _See the traverse.rb and strfregexp.rb files under `lib/regexp_parser/expression/methods` for more information on these methods._ ## Supported Syntax The three modules support all the regular expression syntax features of Ruby 1.8, 1.9, 2.x and 3.x: _Note that not all of these are available in all versions of Ruby_ | Syntax Feature | Examples | ⋯ | | ------------------------------------- | ------------------------------------------------------- |:--------:| | **Alternation** | `a\|b\|c` | ✓ | | **Anchors** | `\A`, `^`, `\b` | ✓ | | **Character Classes** | `[abc]`, `[^\\]`, `[a-d&&aeiou]` | ✓ | | **Character Types** | `\d`, `\H`, `\s` | ✓ | | **Cluster Types** | `\R`, `\X` | ✓ | | **Conditional Exps.** | `(?(cond)yes-subexp)`, `(?(cond)yes-subexp\|no-subexp)` | ✓ | | **Escape Sequences** | `\t`, `\\+`, `\?` | ✓ | | **Free Space** | whitespace and `# Comments` _(x modifier)_ | ✓ | | **Grouped Exps.** | | ⋱ | |   _**Assertions**_ | | ⋱ | |   _Lookahead_ | `(?=abc)` | ✓ | |   _Negative Lookahead_ | `(?!abc)` | ✓ | |   _Lookbehind_ | `(?<=abc)` | ✓ | |   _Negative Lookbehind_ | `(?abc)` | ✓ | |   _**Absence**_ | `(?~abc)` | ✓ | |   _**Back-references**_ | | ⋱ | |   _Named_ | `\k` | ✓ | |   _Nest Level_ | `\k` | ✓ | |   _Numbered_ | `\k<1>` | ✓ | |   _Relative_ | `\k<-2>` | ✓ | |   _Traditional_ | `\1` through `\9` | ✓ | |   _**Capturing**_ | `(abc)` | ✓ | |   _**Comments**_ | `(?# comment text)` | ✓ | |   _**Named**_ | `(?abc)`, `(?'name'abc)` | ✓ | |   _**Options**_ | `(?mi-x:abc)`, `(?a:\s\w+)`, `(?i)` | ✓ | |   _**Passive**_ | `(?:abc)` | ✓ | |   _**Subexp. Calls**_ | `\g`, `\g<1>` | ✓ | | **Keep** | `\K`, `(ab\Kc\|d\Ke)f` | ✓ | | **Literals** _(utf-8)_ | `Ruby`, `ルビー`, `روبي` | ✓ | | **POSIX Classes** | `[:alpha:]`, `[:^digit:]` | ✓ | | **Quantifiers** | | ⋱ | |   _**Greedy**_ | `?`, `*`, `+`, `{m,M}` | ✓ | |   _**Reluctant** (Lazy)_ | `??`, `*?`, `+?` \[1\] | ✓ | |   _**Possessive**_ | `?+`, `*+`, `++` \[1\] | ✓ | | **String Escapes** | | ⋱ | |   _**Control** \[2\]_ | `\C-C`, `\cD` | ✓ | |   _**Hex**_ | `\x20`, `\xE2\x82\xAC` | ✓ | |   _**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ | |   _**Octal**_ | `\0`, `\01`, `\012` | ✓ | |   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ | | **Unicode Properties** | _([Unicode 15.0.0])_ | ⋱ | |   _**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | ✓ | |   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | ✓ | |   _**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | ✓ | |   _**Derived**_ | `\p{Math}`, `\P{Lowercase}`, `\p{^Cased}` | ✓ | |   _**General Categories**_ | `\p{Lu}`, `\P{Cs}`, `\p{^sc}` | ✓ | |   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | ✓ | |   _**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | ✓ | [Unicode 15.0.0]: https://www.unicode.org/versions/Unicode15.0.0/ **\[1\]**: Ruby does not support lazy or possessive interval quantifiers. Any `+` or `?` that follows an interval quantifier will be treated as another, chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issue/3), [#69](https://github.com/ammar/regexp_parser/pull/69). **\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](https://github.com/ruby/ruby/commit/11ae581), so they will only reach the scanner and will only be emitted if a String or a Regexp that has been built with the `::new` constructor is scanned. ### Inapplicable Features Some Regexp options are not relevant to parsing. The option `o` modifies how Ruby deduplicates the **Regexp** object and does not appear in its source or options. Other such modifiers include the encoding modifiers `e`, `n`, `s` and `u` [See](https://ruby-doc.org/3.2.2/Regexp.html#class-Regexp-label-Encoding). These are not seen by the scanner. The following features are not currently enabled for Ruby by its regular expressions library (Onigmo). They are not supported by the scanner. - **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_ - **Capture History**: `(?@...)`, `(?@...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_ See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues) _**Note**: Attempting to process expressions with unsupported syntax features can raise an error, or incorrectly return tokens/objects as literals._ ## Testing To run the tests simply run rake from the root directory. The default task generates the scanner's code from the Ragel source files and runs all the specs, thus it requires Ragel to be installed. Note that changes to Ragel files will not be reflected when running `rspec` on its own, so to run individual tests you might want to run: ``` rake ragel && rspec spec/scanner/properties_spec.rb ``` ## Building Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/) to be installed. The build tasks will automatically invoke the 'ragel' task to generate the Ruby scanner code. The project uses the standard rubygems package tasks, so: To build the gem, run: ``` rake build ``` To install the gem from the cloned project, run: ``` rake install ``` ## References ### Example Projects Projects using regexp_parser. - [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool that uses regexp_parser to convert Regexps to css/xpath selectors. - [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions. - [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support. - [mutant](https://github.com/mbj/mutant) manipulates your regular expressions (amongst others) to see if your tests cover their behavior. - [repper](https://github.com/jaynetics/repper) is a regular expression pretty-printer and formatter for Ruby. - [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that uses regexp_parser to lint Regexps. - [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper that uses regexp_parser to generate examples of postal codes. Documentation and books used while working on this project. ### Ruby Flavors * Oniguruma Regular Expressions (Ruby 1.9.x) [link](https://github.com/kkos/oniguruma/blob/master/doc/RE) * Onigmo Regular Expressions (Ruby >= 2.0) [link](https://github.com/k-takata/Onigmo/blob/master/doc/RE) ### Regular Expressions * Mastering Regular Expressions, By Jeffrey E.F. Friedl (2nd Edition) [book](http://oreilly.com/catalog/9781565922570/) * Regular Expression Flavor Comparison [link](http://www.regular-expressions.info/refflavors.html) * Enumerating the strings of regular languages [link](http://www.cs.dartmouth.edu/~doug/nfa.ps.gz) * Stack Overflow Regular Expressions FAQ [link](http://stackoverflow.com/questions/22937618/reference-what-does-this-regex-mean/22944075#22944075) ### Unicode * Unicode Explained, By Jukka K. Korpela. [book](http://oreilly.com/catalog/9780596101213) * Unicode Derived Properties [link](http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt) * Unicode Property Aliases [link](http://www.unicode.org/Public/UNIDATA/PropertyAliases.txt) * Unicode Regular Expressions [link](http://www.unicode.org/reports/tr18/) * Unicode Standard Annex #44 [link](http://www.unicode.org/reports/tr44/) ## Copyright _Copyright (c) 2010-2025 Ammar Ali. See LICENSE file for details._ ammar-regexp_parser-68cdeff/Rakefile000066400000000000000000000010761506175332700177240ustar00rootroot00000000000000# frozen_string_literal: true require 'bundler' require 'rubygems' require 'rubygems/package_task' require 'rake' require 'rake/testtask' require 'rspec/core/rake_task' Dir['tasks/**/*.rake'].each { |file| load(file) } Bundler::GemHelper.install_tasks RSpec::Core::RakeTask.new(:spec) task :default => [:'test:full'] namespace :test do task full: [:ragel, :spec] end # Add ragel task as a prerequisite for building the gem to ensure that the # latest scanner code is generated and included in the build. desc "Runs ragel before building the gem" task build: :ragel ammar-regexp_parser-68cdeff/bin/000077500000000000000000000000001506175332700170235ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/bin/console000077500000000000000000000005531506175332700204160ustar00rootroot00000000000000#!/usr/bin/env ruby require 'bundler/setup' require 'regexp_parser' require 'regexp_property_values' RL = Regexp::Lexer RP = Regexp::Parser RS = Regexp::Scanner PV = RegexpPropertyValues def lex(...); Regexp::Lexer.lex(...) end def parse(...); Regexp::Parser.parse(...) end def scan(...); Regexp::Scanner.scan(...) end require 'irb' IRB.start(__FILE__) ammar-regexp_parser-68cdeff/bin/setup000077500000000000000000000001271506175332700201110ustar00rootroot00000000000000#!/bin/sh set -euo pipefail # install gems bundle # install ragel rake ragel:install ammar-regexp_parser-68cdeff/lib/000077500000000000000000000000001506175332700170215ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/lib/regexp_parser.rb000066400000000000000000000004171506175332700222160ustar00rootroot00000000000000# frozen_string_literal: true require_relative 'regexp_parser/version' require_relative 'regexp_parser/token' require_relative 'regexp_parser/scanner' require_relative 'regexp_parser/syntax' require_relative 'regexp_parser/lexer' require_relative 'regexp_parser/parser' ammar-regexp_parser-68cdeff/lib/regexp_parser/000077500000000000000000000000001506175332700216675ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/lib/regexp_parser/error.rb000066400000000000000000000002061506175332700233430ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Parser # base class for all gem-specific errors class Error < StandardError; end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression.rb000066400000000000000000000034251506175332700244170ustar00rootroot00000000000000# frozen_string_literal: true require_relative 'error' require_relative 'expression/shared' require_relative 'expression/base' require_relative 'expression/quantifier' require_relative 'expression/subexpression' require_relative 'expression/sequence' require_relative 'expression/sequence_operation' require_relative 'expression/classes/alternation' require_relative 'expression/classes/anchor' require_relative 'expression/classes/backreference' require_relative 'expression/classes/character_set' require_relative 'expression/classes/character_set/intersection' require_relative 'expression/classes/character_set/range' require_relative 'expression/classes/character_type' require_relative 'expression/classes/conditional' require_relative 'expression/classes/escape_sequence' require_relative 'expression/classes/free_space' require_relative 'expression/classes/group' require_relative 'expression/classes/keep' require_relative 'expression/classes/literal' require_relative 'expression/classes/posix_class' require_relative 'expression/classes/root' require_relative 'expression/classes/unicode_property' require_relative 'expression/methods/construct' require_relative 'expression/methods/escape_sequence_char' require_relative 'expression/methods/escape_sequence_codepoint' require_relative 'expression/methods/human_name' require_relative 'expression/methods/match' require_relative 'expression/methods/match_length' require_relative 'expression/methods/negative' require_relative 'expression/methods/options' require_relative 'expression/methods/parts' require_relative 'expression/methods/printing' require_relative 'expression/methods/referenced_expressions' require_relative 'expression/methods/strfregexp' require_relative 'expression/methods/tests' require_relative 'expression/methods/traverse' ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/000077500000000000000000000000001506175332700240665ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/base.rb000066400000000000000000000036731506175332700253360ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class Base include Regexp::Expression::Shared def initialize(token, options = {}) init_from_token_and_options(token, options) end def to_re(format = :full) if set_level > 0 warn "Calling #to_re on character set members is deprecated - "\ "their behavior might not be equivalent outside of the set." end ::Regexp.new(to_s(format)) end def quantify(*args) self.quantifier = Quantifier.new(*args) end def unquantified_clone clone.tap { |exp| exp.quantifier = nil } end # Deprecated. Prefer `#repetitions` which has a more uniform interface. def quantity return [nil,nil] unless quantified? [quantifier.min, quantifier.max] end def repetitions @repetitions ||= if quantified? min = quantifier.min max = quantifier.max < 0 ? Float::INFINITY : quantifier.max range = min..max # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807 if RUBY_VERSION.to_f < 2.7 range.define_singleton_method(:minmax) { [min, max] } end range else 1..1 end end def greedy? quantified? and quantifier.greedy? end def reluctant? quantified? and quantifier.reluctant? end alias :lazy? :reluctant? def possessive? quantified? and quantifier.possessive? end def to_h { type: type, token: token, text: to_s(:base), starts_at: ts, length: full_length, level: level, set_level: set_level, conditional_level: conditional_level, options: options, quantifier: quantified? ? quantifier.to_h : nil, } end alias :attributes :to_h end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/000077500000000000000000000000001506175332700255235ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/alternation.rb000066400000000000000000000005071506175332700303720ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression # A sequence of expressions, used by Alternation as one of its alternatives. class Alternative < Regexp::Expression::Sequence; end class Alternation < Regexp::Expression::SequenceOperation OPERAND = Alternative alias :alternatives :expressions end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/anchor.rb000066400000000000000000000014101506175332700273160ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module Anchor class Base < Regexp::Expression::Base; end class BeginningOfLine < Anchor::Base; end class EndOfLine < Anchor::Base; end class BeginningOfString < Anchor::Base; end class EndOfString < Anchor::Base; end class EndOfStringOrBeforeEndOfLine < Anchor::Base; end class WordBoundary < Anchor::Base; end class NonWordBoundary < Anchor::Base; end class MatchStart < Anchor::Base; end BOL = BeginningOfLine EOL = EndOfLine BOS = BeginningOfString EOS = EndOfString EOSobEOL = EndOfStringOrBeforeEndOfLine end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/backreference.rb000066400000000000000000000027361506175332700306370ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module Backreference class Base < Regexp::Expression::Base; end class Number < Backreference::Base attr_reader :number alias reference number def initialize(token, options = {}) @number = token.text[/-?\d+/].to_i super end end class Name < Backreference::Base attr_reader :name alias reference name def initialize(token, options = {}) @name = token.text[3..-2] super end end class NumberRelative < Backreference::Number attr_accessor :effective_number alias reference effective_number end class NumberCall < Backreference::Number; end class NameCall < Backreference::Name; end class NumberCallRelative < Backreference::NumberRelative; end class NumberRecursionLevel < Backreference::NumberRelative attr_reader :recursion_level def initialize(token, options = {}) super @number, @recursion_level = token.text[3..-2].split(/(?=[+-])/).map(&:to_i) end end class NameRecursionLevel < Backreference::Name attr_reader :recursion_level def initialize(token, options = {}) super @name, recursion_level = token.text[3..-2].split(/(?=[+-])/) @recursion_level = recursion_level.to_i end end end # alias for symmetry between token symbol and Expression class name Backref = Backreference end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/character_set.rb000066400000000000000000000010141506175332700306530ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class CharacterSet < Regexp::Expression::Subexpression attr_accessor :closed, :negative alias :closed? :closed def initialize(token, options = {}) self.negative = false self.closed = false super end def negate self.negative = true end def close self.closed = true end end # alias for symmetry between token symbol and Expression class name Set = CharacterSet end # module Regexp::Expression ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/character_set/000077500000000000000000000000001506175332700303325ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/character_set/intersection.rb000066400000000000000000000004521506175332700333660ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class CharacterSet < Regexp::Expression::Subexpression class IntersectedSequence < Regexp::Expression::Sequence; end class Intersection < Regexp::Expression::SequenceOperation OPERAND = IntersectedSequence end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/character_set/range.rb000066400000000000000000000007131506175332700317540ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class CharacterSet < Regexp::Expression::Subexpression class Range < Regexp::Expression::Subexpression def ts (head = expressions.first) ? head.ts : @ts end def <<(exp) complete? and raise Regexp::Parser::Error, "Can't add more than 2 expressions to a Range" super end def complete? count == 2 end end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/character_type.rb000066400000000000000000000013341506175332700310460ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module CharacterType class Base < Regexp::Expression::Base; end class Any < CharacterType::Base; end class Digit < CharacterType::Base; end class NonDigit < CharacterType::Base; end class Hex < CharacterType::Base; end class NonHex < CharacterType::Base; end class Word < CharacterType::Base; end class NonWord < CharacterType::Base; end class Space < CharacterType::Base; end class NonSpace < CharacterType::Base; end class Linebreak < CharacterType::Base; end class ExtendedGrapheme < CharacterType::Base; end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/conditional.rb000066400000000000000000000025161506175332700303570ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module Conditional class TooManyBranches < Regexp::Parser::Error def initialize super('The conditional expression has more than 2 branches') end end class Condition < Regexp::Expression::Base # Name or number of the referenced capturing group that determines state. # Returns a String if reference is by name, Integer if by number. def reference ref = text.tr("'<>()", "") ref =~ /\D/ ? ref : Integer(ref) end end class Branch < Regexp::Expression::Sequence; end class Expression < Regexp::Expression::Subexpression def <<(exp) expressions.last << exp end def add_sequence(active_opts = {}, params = { ts: 0 }) raise TooManyBranches.new if branches.length == 2 params = params.merge({ conditional_level: conditional_level + 1 }) Branch.add_to(self, params, active_opts) end alias :branch :add_sequence def condition=(exp) expressions.delete(condition) expressions.unshift(exp) end def condition find { |subexp| subexp.is_a?(Condition) } end def branches select { |subexp| subexp.is_a?(Sequence) } end def reference condition.reference end end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/escape_sequence.rb000066400000000000000000000022311506175332700311760ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module EscapeSequence Base = Class.new(Regexp::Expression::Base) AsciiEscape = Class.new(Base) # \e Backspace = Class.new(Base) # \b Bell = Class.new(Base) # \a FormFeed = Class.new(Base) # \f Newline = Class.new(Base) # \n Return = Class.new(Base) # \r Tab = Class.new(Base) # \t VerticalTab = Class.new(Base) # \v Literal = Class.new(Base) # e.g. \j, \@, \😀 (ineffectual escapes) Octal = Class.new(Base) # e.g. \012 Hex = Class.new(Base) # e.g. \x0A Codepoint = Class.new(Base) # e.g. \u000A CodepointList = Class.new(Base) # e.g. \u{A B} UTF8Hex = Class.new(Base) # e.g. \xE2\x82\xAC AbstractMetaControlSequence = Class.new(Base) Control = Class.new(AbstractMetaControlSequence) # e.g. \cB Meta = Class.new(AbstractMetaControlSequence) # e.g. \M-Z MetaControl = Class.new(AbstractMetaControlSequence) # e.g. \M-\cX end # alias for symmetry between Token::* and Expression::* Escape = EscapeSequence end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/free_space.rb000066400000000000000000000007251506175332700301500ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class FreeSpace < Regexp::Expression::Base def quantify(*_args) raise Regexp::Parser::Error, 'Can not quantify a free space object' end end class Comment < Regexp::Expression::FreeSpace end class WhiteSpace < Regexp::Expression::FreeSpace def merge(exp) warn("#{self.class}##{__method__} is deprecated and will be removed in v3.0.0.") text << exp.text end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/group.rb000066400000000000000000000032141506175332700272040ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module Group class Base < Regexp::Expression::Subexpression end class Passive < Group::Base attr_writer :implicit def initialize(*) @implicit = false super end def implicit? @implicit end end class Absence < Group::Base; end class Atomic < Group::Base; end # TODO: should split off OptionsSwitch in v3.0.0. Maybe even make it no # longer inherit from Group because it is effectively a terminal expression. class Options < Group::Base attr_accessor :option_changes def initialize_copy(orig) self.option_changes = orig.option_changes.dup super end def quantify(*args) if token == :options_switch raise Regexp::Parser::Error, 'Can not quantify an option switch' else super end end end class Capture < Group::Base attr_accessor :number, :number_at_level alias identifier number end class Named < Group::Capture attr_reader :name alias identifier name def initialize(token, options = {}) @name = token.text[3..-2] super end def initialize_copy(orig) @name = orig.name.dup super end end class Comment < Group::Base end end module Assertion class Base < Regexp::Expression::Group::Base; end class Lookahead < Assertion::Base; end class NegativeLookahead < Assertion::Base; end class Lookbehind < Assertion::Base; end class NegativeLookbehind < Assertion::Base; end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/keep.rb000066400000000000000000000004031506175332700267710ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module Keep # TODO: in regexp_parser v3.0.0 this should possibly be a Subexpression # that contains all expressions to its left. class Mark < Regexp::Expression::Base; end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/literal.rb000066400000000000000000000001551506175332700275050ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class Literal < Regexp::Expression::Base; end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/posix_class.rb000066400000000000000000000004311506175332700303750ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class PosixClass < Regexp::Expression::Base def name text[/\w+/] end end # alias for symmetry between token symbol and Expression class name Posixclass = PosixClass Nonposixclass = PosixClass end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/root.rb000066400000000000000000000005331506175332700270340ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class Root < Regexp::Expression::Subexpression def self.build(options = {}) warn "`#{self.class}.build(options)` is deprecated and will raise in "\ "regexp_parser v3.0.0. Please use `.construct(options: options)`." construct(options: options) end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/classes/unicode_property.rb000066400000000000000000000071771506175332700314560ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module UnicodeProperty class Base < Regexp::Expression::Base def name text[/\A\\[pP]\{([^}]+)\}\z/, 1] end def shortcut Regexp::Scanner.short_prop_map.key(token.to_s) end end class Alnum < Base; end class Alpha < Base; end class Ascii < Base; end class Blank < Base; end class Cntrl < Base; end class Digit < Base; end class Graph < Base; end class Lower < Base; end class Print < Base; end class Punct < Base; end class Space < Base; end class Upper < Base; end class Word < Base; end class Xdigit < Base; end class XPosixPunct < Base; end class Newline < Base; end class Any < Base; end class Assigned < Base; end module Letter class Base < UnicodeProperty::Base; end class Any < Letter::Base; end class Cased < Letter::Base; end class Uppercase < Letter::Base; end class Lowercase < Letter::Base; end class Titlecase < Letter::Base; end class Modifier < Letter::Base; end class Other < Letter::Base; end end module Mark class Base < UnicodeProperty::Base; end class Any < Mark::Base; end class Combining < Mark::Base; end class Nonspacing < Mark::Base; end class Spacing < Mark::Base; end class Enclosing < Mark::Base; end end module Number class Base < UnicodeProperty::Base; end class Any < Number::Base; end class Decimal < Number::Base; end class Letter < Number::Base; end class Other < Number::Base; end end module Punctuation class Base < UnicodeProperty::Base; end class Any < Punctuation::Base; end class Connector < Punctuation::Base; end class Dash < Punctuation::Base; end class Open < Punctuation::Base; end class Close < Punctuation::Base; end class Initial < Punctuation::Base; end class Final < Punctuation::Base; end class Other < Punctuation::Base; end end module Separator class Base < UnicodeProperty::Base; end class Any < Separator::Base; end class Space < Separator::Base; end class Line < Separator::Base; end class Paragraph < Separator::Base; end end module Symbol class Base < UnicodeProperty::Base; end class Any < Symbol::Base; end class Math < Symbol::Base; end class Currency < Symbol::Base; end class Modifier < Symbol::Base; end class Other < Symbol::Base; end end module Codepoint class Base < UnicodeProperty::Base; end class Any < Codepoint::Base; end class Control < Codepoint::Base; end class Format < Codepoint::Base; end class Surrogate < Codepoint::Base; end class PrivateUse < Codepoint::Base; end class Unassigned < Codepoint::Base; end end class Age < UnicodeProperty::Base; end class Block < UnicodeProperty::Base; end class Derived < UnicodeProperty::Base; end class Emoji < UnicodeProperty::Base; end class Enumerated < UnicodeProperty::Base; end class Script < UnicodeProperty::Base; end end # alias for symmetry between token symbol and Expression class name Property = UnicodeProperty Nonproperty = UnicodeProperty end # module Regexp::Expression ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/000077500000000000000000000000001506175332700255315ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/construct.rb000066400000000000000000000024541506175332700301070ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module Shared module ClassMethods # Convenience method to init a valid Expression without a Regexp::Token def construct(params = {}) attrs = construct_defaults.merge(params) options = attrs.delete(:options) token_args = Regexp::TOKEN_KEYS.map { |k| attrs.delete(k) } token = Regexp::Token.new(*token_args) raise ArgumentError, "unsupported attribute(s): #{attrs}" if attrs.any? new(token, options) end def construct_defaults if self == Root { type: :expression, token: :root, ts: 0 } elsif self < Sequence { type: :expression, token: :sequence } else { type: token_class::Type } end.merge(level: 0, set_level: 0, conditional_level: 0, text: '') end def token_class if self == Root || self < Sequence nil # no token class because these objects are Parser-generated # TODO: synch exp class, token class & type names for this in v3.0.0 elsif self == CharacterType::Any Regexp::Syntax::Token::Meta else Regexp::Syntax::Token.const_get(name.split('::')[2]) end end end def token_class self.class.token_class end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/escape_sequence_char.rb000066400000000000000000000002061506175332700322010ustar00rootroot00000000000000# frozen_string_literal: true Regexp::Expression::EscapeSequence::Base.class_eval do def char codepoint.chr('utf-8') end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb000066400000000000000000000036641506175332700332630ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression::EscapeSequence AsciiEscape.class_eval { def codepoint; 0x1B end } Backspace.class_eval { def codepoint; 0x8 end } Bell.class_eval { def codepoint; 0x7 end } FormFeed.class_eval { def codepoint; 0xC end } Newline.class_eval { def codepoint; 0xA end } Return.class_eval { def codepoint; 0xD end } Tab.class_eval { def codepoint; 0x9 end } VerticalTab.class_eval { def codepoint; 0xB end } Literal.class_eval { def codepoint; text[1].ord end } Octal.class_eval { def codepoint; text[/\d+/].to_i(8) end } Hex.class_eval { def codepoint; text[/\h+/].hex end } Codepoint.class_eval { def codepoint; text[/\h+/].hex end } UTF8Hex.class_eval do def codepoint text.scan(/\h+/).map(&:hex).pack('C*').force_encoding('utf-8').ord end end CodepointList.class_eval do # Maybe this should be a unique top-level expression class? def char raise NoMethodError, 'CodepointList responds only to #chars' end def codepoint raise NoMethodError, 'CodepointList responds only to #codepoints' end def chars codepoints.map { |cp| cp.chr('utf-8') } end def codepoints text.scan(/\h+/).map(&:hex) end end AbstractMetaControlSequence.class_eval do private def control_sequence_to_s(control_sequence) five_lsb = control_sequence.unpack('B*').first[-5..-1] ["000#{five_lsb}"].pack('B*') end def meta_char_to_codepoint(meta_char) byte_value = meta_char.ord byte_value < 128 ? byte_value + 128 : byte_value end end Control.class_eval do def codepoint control_sequence_to_s(text).ord end end Meta.class_eval do def codepoint meta_char_to_codepoint(text[-1]) end end MetaControl.class_eval do def codepoint meta_char_to_codepoint(control_sequence_to_s(text)) end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/human_name.rb000066400000000000000000000071351506175332700301740ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module Shared # default implementation, e.g. "atomic group", "hex escape", "word type", .. def human_name [token, type].compact.join(' ').tr('_', ' ') end end Alternation.class_eval { def human_name; 'alternation' end } Alternative.class_eval { def human_name; 'alternative' end } Anchor::BOL.class_eval { def human_name; 'beginning of line' end } Anchor::BOS.class_eval { def human_name; 'beginning of string' end } Anchor::EOL.class_eval { def human_name; 'end of line' end } Anchor::EOS.class_eval { def human_name; 'end of string' end } Anchor::EOSobEOL.class_eval { def human_name; 'newline-ready end of string' end } Anchor::MatchStart.class_eval { def human_name; 'match start' end } Anchor::NonWordBoundary.class_eval { def human_name; 'no word boundary' end } Anchor::WordBoundary.class_eval { def human_name; 'word boundary' end } Assertion::Lookahead.class_eval { def human_name; 'lookahead' end } Assertion::Lookbehind.class_eval { def human_name; 'lookbehind' end } Assertion::NegativeLookahead.class_eval { def human_name; 'negative lookahead' end } Assertion::NegativeLookbehind.class_eval { def human_name; 'negative lookbehind' end } Backreference::Name.class_eval { def human_name; 'backreference by name' end } Backreference::NameCall.class_eval { def human_name; 'subexpression call by name' end } Backreference::Number.class_eval { def human_name; 'backreference' end } Backreference::NumberRelative.class_eval { def human_name; 'relative backreference' end } Backreference::NumberCall.class_eval { def human_name; 'subexpression call' end } Backreference::NumberCallRelative.class_eval { def human_name; 'relative subexpression call' end } CharacterSet::IntersectedSequence.class_eval { def human_name; 'intersected sequence' end } CharacterSet::Intersection.class_eval { def human_name; 'intersection' end } CharacterSet::Range.class_eval { def human_name; 'character range' end } CharacterType::Any.class_eval { def human_name; 'match-all' end } Comment.class_eval { def human_name; 'comment' end } Conditional::Branch.class_eval { def human_name; 'conditional branch' end } Conditional::Condition.class_eval { def human_name; 'condition' end } Conditional::Expression.class_eval { def human_name; 'conditional' end } Group::Capture.class_eval { def human_name; "capture group #{number}" end } Group::Named.class_eval { def human_name; 'named capture group' end } Keep::Mark.class_eval { def human_name; 'keep-mark lookbehind' end } Literal.class_eval { def human_name; 'literal' end } Root.class_eval { def human_name; 'root' end } WhiteSpace.class_eval { def human_name; 'free space' end } end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/match.rb000066400000000000000000000004161506175332700271530ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class Base def match?(string) !!match(string) end alias :matches? :match? def match(string, offset = 0) Regexp.new(to_s).match(string, offset) end alias :=~ :match end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/match_length.rb000066400000000000000000000102321506175332700305110ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::MatchLength include Enumerable def self.of(obj) exp = obj.is_a?(Regexp::Expression::Base) ? obj : Regexp::Parser.parse(obj) exp.match_length end def initialize(exp, opts = {}) self.exp_class = exp.class self.min_rep = exp.repetitions.min self.max_rep = exp.repetitions.max if (base = opts[:base]) self.base_min = base self.base_max = base self.reify = ->{ '.' * base } else self.base_min = opts.fetch(:base_min) self.base_max = opts.fetch(:base_max) self.reify = opts.fetch(:reify) end end def each(opts = {}) return enum_for(__method__, opts) unless block_given? limit = opts[:limit] || 1000 yielded = 0 (min..max).each do |num| next unless include?(num) yield(num) break if (yielded += 1) >= limit end end def endless_each return enum_for(__method__) unless block_given? (min..max).each { |num| yield(num) if include?(num) } end def include?(length) test_regexp.match?('X' * length) end def fixed? min == max end def min min_rep * base_min end def max max_rep * base_max end def minmax [min, max] end def inspect type = exp_class.name.sub('Regexp::Expression::', '') "#<#{self.class}<#{type}> min=#{min} max=#{max}>" end def to_re /(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}/ end private attr_accessor :base_min, :base_max, :min_rep, :max_rep, :exp_class, :reify if Regexp.method_defined?(:match?) # ruby >= 2.4 def test_regexp @test_regexp ||= /^#{to_re}$/ end else def test_regexp @test_regexp ||= /^#{to_re}$/.tap { |r| def r.match?(s); !!match(s) end } end end end module Regexp::Expression MatchLength = Regexp::MatchLength [ CharacterSet, CharacterSet::Intersection, CharacterSet::IntersectedSequence, CharacterSet::Range, CharacterType::Base, EscapeSequence::Base, PosixClass, UnicodeProperty::Base, ].each do |klass| klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1 def match_length MatchLength.new(self, base: 1) end RUBY end class Literal def match_length MatchLength.new(self, base: text.length) end end class Subexpression def match_length MatchLength.new(self, base_min: map { |exp| exp.match_length.min }.inject(0, :+), base_max: map { |exp| exp.match_length.max }.inject(0, :+), reify: ->{ map { |exp| exp.match_length.to_re }.join }) end def inner_match_length dummy = Regexp::Expression::Root.construct dummy.expressions = expressions.map(&:clone) dummy.quantifier = quantifier && quantifier.clone dummy.match_length end end [ Alternation, Conditional::Expression, ].each do |klass| klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1 def match_length MatchLength.new(self, base_min: map { |exp| exp.match_length.min }.min, base_max: map { |exp| exp.match_length.max }.max, reify: ->{ map { |exp| exp.match_length.to_re }.join('|') }) end RUBY end [ Anchor::Base, Assertion::Base, Conditional::Condition, FreeSpace, Keep::Mark, ].each do |klass| klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1 def match_length MatchLength.new(self, base: 0) end RUBY end class Backreference::Base def match_length if referenced_expression.nil? raise ArgumentError, 'Missing referenced_expression - not parsed?' end referenced_expression.unquantified_clone.match_length end end class EscapeSequence::CodepointList def match_length MatchLength.new(self, base: codepoints.count) end end # Special case. Absence group can match 0.. chars, irrespective of content. # TODO: in theory, they *can* exclude match lengths with `.`: `(?~.{3})` class Group::Absence def match_length MatchLength.new(self, base_min: 0, base_max: Float::INFINITY, reify: ->{ '.*' }) end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/negative.rb000066400000000000000000000016001506175332700276550ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module Shared def negative? false end # not an alias so as to respect overrides of #negative? def negated? negative? end end Anchor::NonWordBoundary.class_eval { def negative?; true end } Assertion::NegativeLookahead.class_eval { def negative?; true end } Assertion::NegativeLookbehind.class_eval { def negative?; true end } CharacterSet.class_eval { def negative?; negative end } CharacterType::Base.class_eval { def negative?; token.to_s.start_with?('non') end } PosixClass.class_eval { def negative?; type == :nonposixclass end } UnicodeProperty::Base.class_eval { def negative?; type == :nonproperty end } end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/options.rb000066400000000000000000000012621506175332700275520ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class Base def multiline? options[:m] == true end alias :m? :multiline? def case_insensitive? options[:i] == true end alias :i? :case_insensitive? alias :ignore_case? :case_insensitive? def free_spacing? options[:x] == true end alias :x? :free_spacing? alias :extended? :free_spacing? def default_classes? options[:d] == true end alias :d? :default_classes? def ascii_classes? options[:a] == true end alias :a? :ascii_classes? def unicode_classes? options[:u] == true end alias :u? :unicode_classes? end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/parts.rb000066400000000000000000000022361506175332700272120ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module Shared # default implementation def parts [text.dup] end private def intersperse(expressions, separator) expressions.flat_map { |exp| [exp, separator] }.slice(0...-1) end end CharacterSet.class_eval { def parts; ["#{text}#{'^' if negated?}", *expressions, ']'] end } CharacterSet::Range.class_eval { def parts; intersperse(expressions, text.dup) end } Conditional::Expression.class_eval { def parts; [text.dup, condition, *intersperse(branches, '|'), ')'] end } Group::Base.class_eval { def parts; [text.dup, *expressions, ')'] end } Group::Passive.class_eval { def parts; implicit? ? expressions : super end } Group::Comment.class_eval { def parts; [text.dup] end } Subexpression.class_eval { def parts; expressions end } SequenceOperation.class_eval { def parts; intersperse(expressions, text.dup) end } end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/printing.rb000066400000000000000000000012431506175332700277100ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module Shared def inspect [ "#<#{self.class}", pretty_print_instance_variables.map { |v| " #{v}=#{instance_variable_get(v).inspect}" }, ">" ].join end # Make pretty-print work despite #inspect implementation. def pretty_print(q) q.pp_object(self) end # Called by pretty_print (ruby/pp) and #inspect. def pretty_print_instance_variables [ (:@text unless text.to_s.empty?), (:@quantifier if quantified?), (:@options unless options.empty?), (:@expressions unless terminal?), ].compact end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/referenced_expressions.rb000066400000000000000000000013711506175332700326240ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module ReferencedExpressions attr_accessor :referenced_expressions def referenced_expression referenced_expressions && referenced_expressions.first end def initialize_copy(orig) exp_id = [self.class, self.starts_at] # prevent infinite recursion for recursive subexp calls copied = self.class.instance_eval { @copied_ref_exps ||= {} } self.referenced_expressions = if copied[exp_id] orig.referenced_expressions else copied[exp_id] = true orig.referenced_expressions && orig.referenced_expressions.map(&:dup) end copied.clear super end end Base.include ReferencedExpressions end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/strfregexp.rb000066400000000000000000000062221506175332700302510ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class Base # %l Level (depth) of the expression. Returns 'root' for the root # expression, returns zero or higher for all others. # # %> Indentation at expression's level. # # %x Index of the expression at its depth. Available when using # the sprintf_tree method only. # # %s Start offset within the whole expression. # %e End offset within the whole expression. # %S Length of expression. # # %o Coded offset and length, same as '@%s+%S' # # %y Type of expression. # %k Token of expression. # %i ID, same as '%y:%k' # %c Class name # # %q Quantifier info, as {m[,M]} # %Q Quantifier text # # %z Quantifier min # %Z Quantifier max # # %t Base text of the expression (excludes quantifier, if any) # %~t Full text if the expression is terminal, otherwise %i # %T Full text of the expression (includes quantifier, if any) # # %b Basic info, same as '%o %i' # %m Most info, same as '%b %q' # %a All info, same as '%m %t' # def strfregexp(format = '%a', indent_offset = 0, index = nil) have_index = index ? true : false part = {} print_level = nesting_level > 0 ? nesting_level - 1 : nil # Order is important! Fields that use other fields in their # definition must appear before the fields they use. part_keys = %w[a m b o i l x s e S y k c q Q z Z t ~t T >] part.keys.each {|k| part[k] = ""} part['>'] = print_level ? (' ' * (print_level + indent_offset)) : '' part['l'] = print_level ? "#{'%d' % print_level}" : 'root' part['x'] = "#{'%d' % index}" if have_index part['s'] = starts_at part['S'] = full_length part['e'] = starts_at + full_length part['o'] = coded_offset part['k'] = token part['y'] = type part['i'] = '%y:%k' part['c'] = self.class.name if quantified? if quantifier.max == -1 part['q'] = "{#{quantifier.min}, or-more}" else part['q'] = "{#{quantifier.min}, #{quantifier.max}}" end part['Q'] = quantifier.text part['z'] = quantifier.min part['Z'] = quantifier.max else part['q'] = '{1}' part['Q'] = '' part['z'] = '1' part['Z'] = '1' end part['t'] = to_s(:base) part['~t'] = terminal? ? to_s : "#{type}:#{token}" part['T'] = to_s(:full) part['b'] = '%o %i' part['m'] = '%b %q' part['a'] = '%m %t' out = format.dup part_keys.each do |k| out.gsub!(/%#{k}/, part[k].to_s) end out end alias :strfre :strfregexp end class Subexpression < Regexp::Expression::Base def strfregexp_tree(format = '%a', include_self = true, separator = "\n") output = include_self ? [self.strfregexp(format)] : [] output += flat_map do |exp, index| exp.strfregexp(format, (include_self ? 1 : 0), index) end output.join(separator) end alias :strfre_tree :strfregexp_tree end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/tests.rb000066400000000000000000000116741506175332700272310ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module Shared # Test if this expression has the given test_type, which can be either # a symbol or an array of symbols to check against the expression's type. # # # is it a :group expression # exp.type? :group # # # is it a :set, or :meta # exp.type? [:set, :meta] # def type?(test_type) test_types = Array(test_type).map(&:to_sym) test_types.include?(:*) || test_types.include?(type) end # Test if this expression has the given test_token, and optionally a given # test_type. # # # Any expressions # exp.is? :* # always returns true # # # is it a :capture # exp.is? :capture # # # is it a :character and a :set # exp.is? :character, :set # # # is it a :meta :dot # exp.is? :dot, :meta # # # is it a :meta or :escape :dot # exp.is? :dot, [:meta, :escape] # def is?(test_token, test_type = nil) return true if test_token === :* token == test_token and (test_type ? type?(test_type) : true) end # Test if this expression matches an entry in the given scope spec. # # A scope spec can be one of: # # . An array: Interpreted as a set of tokens, tested for inclusion # of the expression's token. # # . A hash: Where the key is interpreted as the expression type # and the value is either a symbol or an array. In this # case, when the scope is a hash, one_of? calls itself to # evaluate the key's value. # # . A symbol: matches the expression's token or type, depending on # the level of the call. If one_of? is called directly with # a symbol then it will always be checked against the # type of the expression. If it's being called for a value # from a hash, it will be checked against the token of the # expression. # # # any expression # exp.one_of?(:*) # always true # # # like exp.type?(:group) # exp.one_of?(:group) # # # any expression of type meta # exp.one_of?(:meta => :*) # # # meta dots and alternations # exp.one_of?(:meta => [:dot, :alternation]) # # # meta dots and any set tokens # exp.one_of?({meta: [:dot], set: :*}) # def one_of?(scope, top = true) case scope when Array scope.include?(:*) || scope.include?(token) when Hash if scope.has_key?(:*) test_type = scope.has_key?(type) ? type : :* one_of?(scope[test_type], false) else scope.has_key?(type) && one_of?(scope[type], false) end when Symbol scope.equal?(:*) || (top ? type?(scope) : is?(scope)) else raise ArgumentError, "Array, Hash, or Symbol expected, #{scope.class.name} given" end end # Deep-compare two expressions for equality. # # When changing the conditions, please make sure to update # #pretty_print_instance_variables so that it includes all relevant values. def ==(other) self.class == other.class && text == other.text && quantifier == other.quantifier && options == other.options && (terminal? || expressions == other.expressions) end alias :=== :== alias :eql? :== def optional? quantified? && quantifier.min == 0 end def quantified? !quantifier.nil? end end Shared.class_eval { def terminal?; self.class.terminal? end } Shared::ClassMethods.class_eval { def terminal?; true end } Subexpression.instance_eval { def terminal?; false end } Shared.class_eval { def capturing?; self.class.capturing? end } Shared::ClassMethods.class_eval { def capturing?; false end } Group::Capture.instance_eval { def capturing?; true end } Shared.class_eval { def comment?; self.class.comment? end } Shared::ClassMethods.class_eval { def comment?; false end } Comment.instance_eval { def comment?; true end } Group::Comment.instance_eval { def comment?; true end } Shared.class_eval { def decorative?; self.class.decorative? end } Shared::ClassMethods.class_eval { def decorative?; false end } FreeSpace.instance_eval { def decorative?; true end } Group::Comment.instance_eval { def decorative?; true end } Shared.class_eval { def referential?; self.class.referential? end } Shared::ClassMethods.class_eval { def referential?; false end } Backreference::Base.instance_eval { def referential?; true end } Conditional::Condition.instance_eval { def referential?; true end } Conditional::Expression.instance_eval { def referential?; true end } end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/methods/traverse.rb000066400000000000000000000047721506175332700277230ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class Subexpression < Regexp::Expression::Base # Traverses the expression, passing each recursive child to the # given block. # If the block takes two arguments, the indices of the children within # their parents are also passed to it. def each_expression(include_self = false, &block) return enum_for(__method__, include_self) unless block if block.arity == 1 block.call(self) if include_self each_expression_without_index(&block) else block.call(self, 0) if include_self each_expression_with_index(&block) end end # Traverses the subexpression (depth-first, pre-order) and calls the given # block for each expression with three arguments; the traversal event, # the expression, and the index of the expression within its parent. # # The event argument is passed as follows: # # - For subexpressions, :enter upon entering the subexpression, and # :exit upon exiting it. # # - For terminal expressions, :visit is called once. # # Returns self. def traverse(include_self = false, &block) return enum_for(__method__, include_self) unless block_given? block.call(:enter, self, 0) if include_self each_with_index do |exp, index| if exp.terminal? block.call(:visit, exp, index) else block.call(:enter, exp, index) exp.traverse(&block) block.call(:exit, exp, index) end end block.call(:exit, self, 0) if include_self self end alias :walk :traverse # Returns a new array with the results of calling the given block once # for every expression. If a block is not given, returns an array with # each expression and its level index as an array. def flat_map(include_self = false, &block) case block && block.arity when nil then each_expression(include_self).to_a when 2 then each_expression(include_self).map(&block) else each_expression(include_self).map { |exp| block.call(exp) } end end protected def each_expression_with_index(&block) each_with_index do |exp, index| block.call(exp, index) exp.each_expression_with_index(&block) unless exp.terminal? end end def each_expression_without_index(&block) each do |exp| block.call(exp) exp.each_expression_without_index(&block) unless exp.terminal? end end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/quantifier.rb000066400000000000000000000047041506175332700265670ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression # TODO: in v3.0.0, maybe put Shared back into Base, and inherit from Base and # call super in #initialize, but raise in #quantifier= and #quantify, # or introduce an Expression::Quantifiable intermediate class. # Or actually allow chaining as a more concise but tricky solution than PR#69. class Quantifier include Regexp::Expression::Shared MODES = %i[greedy possessive reluctant].freeze def initialize(*args) deprecated_old_init(*args) and return if args.count == 4 || args.count == 5 init_from_token_and_options(*args) # TODO: remove in v3.0.0, stop removing parts of #token (?) self.token = token.to_s.sub(/_(greedy|possessive|reluctant)/, '').to_sym end def to_h { token: token, text: text, mode: mode, min: min, max: max, } end MODES.each do |mode| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{mode}? mode.equal?(:#{mode}) end RUBY end alias :lazy? :reluctant? def min derived_data[:min] end def max derived_data[:max] end def mode derived_data[:mode] end private def deprecated_old_init(token, text, _min, _max, _mode = :greedy) warn "Calling `Expression::Base#quantify` or `#{self.class}.new` with 4+ arguments "\ "is deprecated.\nIt will no longer be supported in regexp_parser v3.0.0.\n"\ "Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode` "\ "with `::Regexp::Token.new(:quantifier, token, text)`. min, max, and mode "\ "will be derived automatically.\n"\ "Or do `exp.quantifier = #{self.class}.construct(token: token, text: str)`.\n"\ "This is consistent with how Expression::Base instances are created. " @token = token @text = text end def derived_data @derived_data ||= begin min, max = case text[0] when '?'; [0, 1] when '*'; [0, -1] when '+'; [1, -1] else int_min = text[/\{(\d*)/, 1] int_max = text[/,?(\d*)\}/, 1] [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)] end mod = text[/.([?+])/, 1] mode = (mod == '?' && :reluctant) || (mod == '+' && :possessive) || :greedy { min: min, max: max, mode: mode } end end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/sequence.rb000066400000000000000000000020351506175332700262230ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression # A sequence of expressions. Differs from a Subexpressions by how it handles # quantifiers, as it applies them to its last element instead of itself as # a whole subexpression. # # Used as the base class for the Alternation alternatives, Conditional # branches, and CharacterSet::Intersection intersected sequences. class Sequence < Regexp::Expression::Subexpression class << self def add_to(exp, params = {}, active_opts = {}) sequence = construct( level: exp.level, set_level: exp.set_level, conditional_level: params[:conditional_level] || exp.conditional_level, ts: params[:ts], ) sequence.options = active_opts exp.expressions << sequence sequence end end def ts (head = expressions.first) ? head.ts : @ts end def quantify(token, *args) extract_quantifier_target(token.text).quantify(token, *args) end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/sequence_operation.rb000066400000000000000000000007551506175332700303120ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression # abstract class class SequenceOperation < Regexp::Expression::Subexpression alias :sequences :expressions alias :operands :expressions alias :operator :text def ts (head = expressions.first) ? head.ts : @ts end def <<(exp) expressions.last << exp end def add_sequence(active_opts = {}, params = { ts: 0 }) self.class::OPERAND.add_to(self, params, active_opts) end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/shared.rb000066400000000000000000000063001506175332700256600ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression module Shared module ClassMethods; end # filled in ./methods/*.rb def self.included(mod) mod.class_eval do extend Shared::ClassMethods attr_accessor :type, :token, :text, :ts, :te, :level, :set_level, :conditional_level, :options, :parent, :custom_to_s_handling, :pre_quantifier_decorations attr_reader :nesting_level, :quantifier end end def init_from_token_and_options(token, options = {}) self.type = token.type self.token = token.token self.text = token.text self.ts = token.ts self.te = token.te self.level = token.level self.set_level = token.set_level self.conditional_level = token.conditional_level self.nesting_level = 0 self.options = options || {} end private :init_from_token_and_options def initialize_copy(orig) self.text = orig.text.dup if orig.text self.options = orig.options.dup if orig.options self.quantifier = orig.quantifier.clone if orig.quantifier self.parent = nil # updated by Subexpression#initialize_copy if orig.pre_quantifier_decorations self.pre_quantifier_decorations = orig.pre_quantifier_decorations.map(&:dup) end super end def starts_at ts end def ends_at(include_quantifier = true) ts + (include_quantifier ? full_length : base_length) end def base_length to_s(:base).length end def full_length to_s(:original).length end # #to_s reproduces the original source, as an unparser would. # # It takes an optional format argument. # # Example: # # lit = Regexp::Parser.parse(/a +/x)[0] # # lit.to_s # => 'a+' # default; with quantifier # lit.to_s(:full) # => 'a+' # default; with quantifier # lit.to_s(:base) # => 'a' # without quantifier # lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations # def to_s(format = :full) base = ''.dup parts.each do |part| if part.instance_of?(String) base << part elsif !part.custom_to_s_handling base << part.to_s(:original) end end "#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}" end alias :to_str :to_s def pre_quantifier_decoration(expression_format = :original) pre_quantifier_decorations.to_a.join if expression_format == :original end def quantifier_affix(expression_format = :full) quantifier.to_s if quantified? && expression_format != :base end def offset [starts_at, full_length] end def coded_offset '@%d+%d' % offset end def nesting_level=(lvl) @nesting_level = lvl quantifier && quantifier.nesting_level = lvl terminal? || each { |subexp| subexp.nesting_level = lvl + 1 } end def quantifier=(qtf) @quantifier = qtf @repetitions = nil # clear memoized value end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/expression/subexpression.rb000066400000000000000000000031641506175332700273300ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Expression class Subexpression < Regexp::Expression::Base include Enumerable attr_accessor :expressions def initialize(token, options = {}) self.expressions = [] super end # Override base method to clone the expressions as well. def initialize_copy(orig) self.expressions = orig.expressions.map do |exp| exp.clone.tap { |copy| copy.parent = self } end super end def <<(exp) exp.parent = self expressions << exp end %w[[] at each empty? fetch index join last length values_at].each do |method| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{method}(*args, &block) expressions.#{method}(*args, &block) end RUBY end def dig(*indices) exp = self indices.each { |idx| exp = exp.nil? || exp.terminal? ? nil : exp[idx] } exp end def te ts + base_length end def to_h attributes.merge( text: to_s(:base), expressions: expressions.map(&:to_h) ) end def extract_quantifier_target(quantifier_description) pre_quantifier_decorations = [] target = expressions.reverse.find do |exp| if exp.decorative? exp.custom_to_s_handling = true pre_quantifier_decorations << exp.text next end exp end target or raise Regexp::Parser::ParserError, "No valid target found for '#{quantifier_description}' quantifier" target.pre_quantifier_decorations = pre_quantifier_decorations target end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/lexer.rb000066400000000000000000000121661506175332700233410ustar00rootroot00000000000000# frozen_string_literal: true # A very thin wrapper around the scanner that breaks quantified literal runs, # collects emitted tokens into an array, calculates their nesting depth, and # normalizes tokens for the parser, and checks if they are implemented by the # given syntax flavor. class Regexp::Lexer OPENING_TOKENS = %i[ capture passive lookahead nlookahead lookbehind nlookbehind atomic options options_switch named absence open ].freeze CLOSING_TOKENS = %i[close].freeze CONDITION_TOKENS = %i[condition condition_close].freeze def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block) new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block) end def lex(input, syntax = nil, options: nil, collect_tokens: true, &block) syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT self.block = block self.collect_tokens = collect_tokens self.tokens = [] self.prev_token = nil self.preprev_token = nil self.nesting = 0 self.set_nesting = 0 self.conditional_nesting = 0 self.shift = 0 Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te| type, token = *syntax.normalize(type, token) syntax.check! type, token ascend(type, token) if (last = prev_token) && type == :quantifier && ( (last.type == :literal && (parts = break_literal(last))) || (last.token == :codepoint_list && (parts = break_codepoint_list(last))) ) emit(parts[0]) last = parts[1] end current = Regexp::Token.new(type, token, text, ts + shift, te + shift, nesting, set_nesting, conditional_nesting) if type == :conditional && CONDITION_TOKENS.include?(token) current = merge_condition(current, last) elsif last last.next = current current.previous = last emit(last) end self.preprev_token = last self.prev_token = current descend(type, token) end emit(prev_token) if prev_token collect_tokens ? tokens : nil end def emit(token) if block # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block res = block.call(token) tokens << res if collect_tokens else tokens << token end end class << self alias :scan :lex end private attr_accessor :block, :collect_tokens, :tokens, :prev_token, :preprev_token, :nesting, :set_nesting, :conditional_nesting, :shift def ascend(type, token) return unless CLOSING_TOKENS.include?(token) case type when :group, :assertion self.nesting = nesting - 1 when :set self.set_nesting = set_nesting - 1 when :conditional self.conditional_nesting = conditional_nesting - 1 else raise "unhandled nesting type #{type}" end end def descend(type, token) return unless OPENING_TOKENS.include?(token) case type when :group, :assertion self.nesting = nesting + 1 when :set self.set_nesting = set_nesting + 1 when :conditional self.conditional_nesting = conditional_nesting + 1 else raise "unhandled nesting type #{type}" end end # called by scan to break a literal run that is longer than one character # into two separate tokens when it is followed by a quantifier def break_literal(token) lead, last, _ = token.text.partition(/.\z/mu) return if lead.empty? token_1 = Regexp::Token.new(:literal, :literal, lead, token.ts, (token.te - last.length), nesting, set_nesting, conditional_nesting) token_2 = Regexp::Token.new(:literal, :literal, last, (token.ts + lead.length), token.te, nesting, set_nesting, conditional_nesting) token_1.previous = preprev_token token_1.next = token_2 token_2.previous = token_1 # .next will be set by #lex [token_1, token_2] end # if a codepoint list is followed by a quantifier, that quantifier applies # to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc' # c.f. #break_literal. def break_codepoint_list(token) lead, _, tail = token.text.rpartition(' ') return if lead.empty? token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}', token.ts, (token.te - tail.length), nesting, set_nesting, conditional_nesting) token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail, (token.ts + lead.length + 1), (token.te + 3), nesting, set_nesting, conditional_nesting) self.shift = shift + 3 # one space less, but extra \, u, {, and } token_1.previous = preprev_token token_1.next = token_2 token_2.previous = token_1 # .next will be set by #lex [token_1, token_2] end def merge_condition(current, last) token = Regexp::Token.new(:conditional, :condition, last.text + current.text, last.ts, current.te, nesting, set_nesting, conditional_nesting) token.previous = preprev_token # .next will be set by #lex token end end # module Regexp::Lexer ammar-regexp_parser-68cdeff/lib/regexp_parser/parser.rb000066400000000000000000000527031506175332700235170ustar00rootroot00000000000000# frozen_string_literal: true require_relative 'error' require_relative 'expression' class Regexp::Parser include Regexp::Expression class ParserError < Regexp::Parser::Error; end class UnknownTokenTypeError < ParserError def initialize(type, token) super "Unknown token type #{type} #{token.inspect}" end end class UnknownTokenError < ParserError def initialize(type, token) super "Unknown #{type} token #{token.token}" end end def self.parse(input, syntax = nil, options: nil, &block) new.parse(input, syntax, options: options, &block) end def parse(input, syntax = nil, options: nil, &block) root = Root.construct(options: extract_options(input, options)) self.root = root self.node = root self.nesting = [root] self.options_stack = [root.options] self.switching_options = false self.conditional_nesting = [] self.captured_group_counts = Hash.new(0) Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token| parse_token(token) end # Trigger recursive setting of #nesting_level, which reflects how deep # a node is in the tree. Do this at the end to account for tree rewrites. root.nesting_level = 0 assign_referenced_expressions if block_given? block.call(root) else root end end private attr_accessor :root, :node, :nesting, :options_stack, :switching_options, :conditional_nesting, :captured_group_counts def extract_options(input, options) if options && !input.is_a?(String) raise ArgumentError, 'options cannot be supplied unless parsing a String' end options = input.options if input.is_a?(::Regexp) return {} unless options enabled_options = {} enabled_options[:i] = true if options & ::Regexp::IGNORECASE != 0 enabled_options[:m] = true if options & ::Regexp::MULTILINE != 0 enabled_options[:x] = true if options & ::Regexp::EXTENDED != 0 enabled_options end def parse_token(token) case token.type when :anchor; anchor(token) when :assertion, :group; group(token) when :backref; backref(token) when :conditional; conditional(token) when :escape; escape(token) when :free_space; free_space(token) when :keep; keep(token) when :literal; literal(token) when :meta; meta(token) when :posixclass, :nonposixclass; posixclass(token) when :property, :nonproperty; property(token) when :quantifier; quantifier(token) when :set; set(token) when :type; type(token) else raise UnknownTokenTypeError.new(token.type, token) end close_completed_character_set_range end def anchor(token) case token.token when :bol; node << Anchor::BeginningOfLine.new(token, active_opts) when :bos; node << Anchor::BOS.new(token, active_opts) when :eol; node << Anchor::EndOfLine.new(token, active_opts) when :eos; node << Anchor::EOS.new(token, active_opts) when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts) when :match_start; node << Anchor::MatchStart.new(token, active_opts) when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts) when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts) else raise UnknownTokenError.new('Anchor', token) end end def group(token) case token.token when :options, :options_switch options_group(token) when :close close_group when :comment node << Group::Comment.new(token, active_opts) else open_group(token) end end MOD_FLAGS = %w[i m x].map(&:to_sym) ENC_FLAGS = %w[a d u].map(&:to_sym) def options_group(token) positive, negative = token.text.split('-', 2) negative ||= '' self.switching_options = token.token.equal?(:options_switch) opt_changes = {} new_active_opts = active_opts.dup MOD_FLAGS.each do |flag| if positive.include?(flag.to_s) opt_changes[flag] = new_active_opts[flag] = true end if negative.include?(flag.to_s) opt_changes[flag] = false new_active_opts.delete(flag) end end if (enc_flag = positive.reverse[/[adu]/]) enc_flag = enc_flag.to_sym (ENC_FLAGS - [enc_flag]).each do |other| opt_changes[other] = false if new_active_opts[other] new_active_opts.delete(other) end opt_changes[enc_flag] = new_active_opts[enc_flag] = true end options_stack << new_active_opts options_group = Group::Options.new(token, active_opts) options_group.option_changes = opt_changes nest(options_group) end def open_group(token) group_class = case token.token when :absence; Group::Absence when :atomic; Group::Atomic when :capture; Group::Capture when :named; Group::Named when :passive; Group::Passive when :lookahead; Assertion::Lookahead when :lookbehind; Assertion::Lookbehind when :nlookahead; Assertion::NegativeLookahead when :nlookbehind; Assertion::NegativeLookbehind else raise UnknownTokenError.new('Group type open', token) end group = group_class.new(token, active_opts) if group.capturing? group.number = total_captured_group_count + 1 group.number_at_level = captured_group_count_at_level + 1 count_captured_group end # Push the active options to the stack again. This way we can simply pop the # stack for any group we close, no matter if it had its own options or not. options_stack << active_opts nest(group) end def total_captured_group_count captured_group_counts.values.reduce(0, :+) end def captured_group_count_at_level captured_group_counts[node] end def count_captured_group captured_group_counts[node] += 1 end def close_group options_stack.pop unless switching_options self.switching_options = false decrease_nesting end def decrease_nesting while nesting.last.is_a?(SequenceOperation) nesting.pop self.node = nesting.last end nesting.pop yield(node) if block_given? self.node = nesting.last self.node = node.last if node.last.is_a?(SequenceOperation) end def backref(token) case token.token when :name_ref node << Backreference::Name.new(token, active_opts) when :name_recursion_ref node << Backreference::NameRecursionLevel.new(token, active_opts) when :name_call node << Backreference::NameCall.new(token, active_opts) when :number, :number_ref # TODO: split in v3.0.0 node << Backreference::Number.new(token, active_opts) when :number_recursion_ref node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp| # TODO: should split off new token number_recursion_rel_ref and new # class NumberRelativeRecursionLevel in v3.0.0 to get rid of this if exp.text =~ /[<'][+-]/ assign_effective_number(exp) else exp.effective_number = exp.number end end when :number_call node << Backreference::NumberCall.new(token, active_opts) when :number_rel_ref node << Backreference::NumberRelative.new(token, active_opts).tap do |exp| assign_effective_number(exp) end when :number_rel_call node << Backreference::NumberCallRelative.new(token, active_opts).tap do |exp| assign_effective_number(exp) end else raise UnknownTokenError.new('Backreference', token) end end def assign_effective_number(exp) exp.effective_number = exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0) exp.effective_number > 0 || raise(ParserError, "Invalid reference: #{exp.reference}") end def conditional(token) case token.token when :open nest_conditional(Conditional::Expression.new(token, active_opts)) when :condition conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts) conditional_nesting.last.add_sequence(active_opts, { ts: token.te }) when :separator conditional_nesting.last.add_sequence(active_opts, { ts: token.te }) self.node = conditional_nesting.last.branches.last when :close conditional_nesting.pop decrease_nesting self.node = if conditional_nesting.empty? nesting.last else conditional_nesting.last end else raise UnknownTokenError.new('Conditional', token) end end def nest_conditional(exp) conditional_nesting.push(exp) nest(exp) end def nest(exp) nesting.push(exp) node << exp self.node = exp end def escape(token) case token.token when :backspace; node << EscapeSequence::Backspace.new(token, active_opts) when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts) when :bell; node << EscapeSequence::Bell.new(token, active_opts) when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts) when :newline; node << EscapeSequence::Newline.new(token, active_opts) when :carriage; node << EscapeSequence::Return.new(token, active_opts) when :tab; node << EscapeSequence::Tab.new(token, active_opts) when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts) when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts) when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts) when :hex; node << EscapeSequence::Hex.new(token, active_opts) when :octal; node << EscapeSequence::Octal.new(token, active_opts) when :utf8_hex; node << EscapeSequence::UTF8Hex.new(token, active_opts) when :control if token.text =~ /\A(?:\\C-\\M|\\c\\M)/ # TODO: emit :meta_control_sequence token in v3.0.0 node << EscapeSequence::MetaControl.new(token, active_opts) else node << EscapeSequence::Control.new(token, active_opts) end when :meta_sequence if token.text =~ /\A\\M-\\[Cc]/ # TODO: emit :meta_control_sequence token in v3.0.0: node << EscapeSequence::MetaControl.new(token, active_opts) else node << EscapeSequence::Meta.new(token, active_opts) end else # treating everything else as a literal # TODO: maybe split this up a bit more in v3.0.0? # E.g. escaped quantifiers or set meta chars are not the same # as stuff that would be a literal even without the backslash. # Right now, they all end up here. node << EscapeSequence::Literal.new(token, active_opts) end end def free_space(token) case token.token when :comment node << Comment.new(token, active_opts) when :whitespace node << WhiteSpace.new(token, active_opts) else raise UnknownTokenError.new('FreeSpace', token) end end def keep(token) node << Keep::Mark.new(token, active_opts) end def literal(token) node << Literal.new(token, active_opts) end def meta(token) case token.token when :dot node << CharacterType::Any.new(token, active_opts) when :alternation sequence_operation(Alternation, token) else raise UnknownTokenError.new('Meta', token) end end def sequence_operation(klass, token) unless node.instance_of?(klass) operator = klass.new(token, active_opts) sequence = operator.add_sequence(active_opts, { ts: token.ts }) sequence.expressions = node.expressions node.expressions = [] nest(operator) end node.add_sequence(active_opts, { ts: token.te }) end def posixclass(token) node << PosixClass.new(token, active_opts) end UP = Regexp::Expression::Property UPTokens = Regexp::Syntax::Token::Property def property(token) case token.token when :alnum; node << UP::Alnum.new(token, active_opts) when :alpha; node << UP::Alpha.new(token, active_opts) when :ascii; node << UP::Ascii.new(token, active_opts) when :blank; node << UP::Blank.new(token, active_opts) when :cntrl; node << UP::Cntrl.new(token, active_opts) when :digit; node << UP::Digit.new(token, active_opts) when :graph; node << UP::Graph.new(token, active_opts) when :lower; node << UP::Lower.new(token, active_opts) when :print; node << UP::Print.new(token, active_opts) when :punct; node << UP::Punct.new(token, active_opts) when :space; node << UP::Space.new(token, active_opts) when :upper; node << UP::Upper.new(token, active_opts) when :word; node << UP::Word.new(token, active_opts) when :xdigit; node << UP::Xdigit.new(token, active_opts) when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts) # only in Oniguruma (old rubies) when :newline; node << UP::Newline.new(token, active_opts) when :any; node << UP::Any.new(token, active_opts) when :assigned; node << UP::Assigned.new(token, active_opts) when :letter; node << UP::Letter::Any.new(token, active_opts) when :cased_letter; node << UP::Letter::Cased.new(token, active_opts) when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts) when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts) when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts) when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts) when :other_letter; node << UP::Letter::Other.new(token, active_opts) when :mark; node << UP::Mark::Any.new(token, active_opts) when :combining_mark; node << UP::Mark::Combining.new(token, active_opts) when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts) when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts) when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts) when :number; node << UP::Number::Any.new(token, active_opts) when :decimal_number; node << UP::Number::Decimal.new(token, active_opts) when :letter_number; node << UP::Number::Letter.new(token, active_opts) when :other_number; node << UP::Number::Other.new(token, active_opts) when :punctuation; node << UP::Punctuation::Any.new(token, active_opts) when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts) when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts) when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts) when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts) when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts) when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts) when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts) when :separator; node << UP::Separator::Any.new(token, active_opts) when :space_separator; node << UP::Separator::Space.new(token, active_opts) when :line_separator; node << UP::Separator::Line.new(token, active_opts) when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts) when :symbol; node << UP::Symbol::Any.new(token, active_opts) when :math_symbol; node << UP::Symbol::Math.new(token, active_opts) when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts) when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts) when :other_symbol; node << UP::Symbol::Other.new(token, active_opts) when :other; node << UP::Codepoint::Any.new(token, active_opts) when :control; node << UP::Codepoint::Control.new(token, active_opts) when :format; node << UP::Codepoint::Format.new(token, active_opts) when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts) when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts) when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts) when *UPTokens::Age; node << UP::Age.new(token, active_opts) when *UPTokens::Derived; node << UP::Derived.new(token, active_opts) when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts) when *UPTokens::Enumerated; node << UP::Enumerated.new(token, active_opts) when *UPTokens::Script; node << UP::Script.new(token, active_opts) when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts) else raise UnknownTokenError.new('UnicodeProperty', token) end end def quantifier(token) target_node = node.extract_quantifier_target(token.text) # in case of chained quantifiers, wrap target in an implicit passive group # description of the problem: https://github.com/ammar/regexp_parser/issues/3 # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69 if target_node.quantified? new_group = Group::Passive.construct( token: :passive, ts: target_node.ts, level: target_node.level, set_level: target_node.set_level, conditional_level: target_node.conditional_level, options: active_opts, ) new_group.implicit = true new_group << target_node increase_group_level(target_node) node.expressions[node.expressions.index(target_node)] = new_group target_node = new_group end unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval) (?:_greedy|_reluctant|_possessive)?\z/x raise UnknownTokenError.new('Quantifier', token) end target_node.quantify(token, active_opts) end def increase_group_level(exp) exp.level += 1 exp.quantifier.level += 1 if exp.quantifier exp.terminal? || exp.each { |subexp| increase_group_level(subexp) } end def set(token) case token.token when :open; open_set(token) when :close; close_set when :negate; negate_set when :range; range(token) when :intersection; intersection(token) else raise UnknownTokenError.new('CharacterSet', token) end end def open_set(token) # TODO: this and Quantifier are the only cases where Expression#token # does not match the scanner/lexer output. Fix in v3.0.0. token.token = :character nest(CharacterSet.new(token, active_opts)) end def negate_set node.negate end def close_set decrease_nesting(&:close) end def range(token) exp = CharacterSet::Range.new(token, active_opts) scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node exp << scope.expressions.pop nest(exp) end def intersection(token) sequence_operation(CharacterSet::Intersection, token) end def type(token) case token.token when :digit; node << CharacterType::Digit.new(token, active_opts) when :hex; node << CharacterType::Hex.new(token, active_opts) when :linebreak; node << CharacterType::Linebreak.new(token, active_opts) when :nondigit; node << CharacterType::NonDigit.new(token, active_opts) when :nonhex; node << CharacterType::NonHex.new(token, active_opts) when :nonspace; node << CharacterType::NonSpace.new(token, active_opts) when :nonword; node << CharacterType::NonWord.new(token, active_opts) when :space; node << CharacterType::Space.new(token, active_opts) when :word; node << CharacterType::Word.new(token, active_opts) when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts) else raise UnknownTokenError.new('CharacterType', token) end end def close_completed_character_set_range decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete? end def active_opts options_stack.last end # Assigns referenced expressions to referring expressions, e.g. if there is # an instance of Backreference::Number, its #referenced_expression is set to # the instance of Group::Capture that it refers to via its number. def assign_referenced_expressions # find all referenceable and referring expressions targets = { 0 => [root] } referrers = [] root.each_expression do |exp| if exp.referential? referrers << exp elsif exp.is_a?(Group::Capture) (targets[exp.identifier] ||= []) << exp end end # assign referenced expressions to referring expressions # (in a second iteration because there might be forward references) referrers.each do |exp| exp.referenced_expressions = targets[exp.reference] || raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}") end end end # module Regexp::Parser ammar-regexp_parser-68cdeff/lib/regexp_parser/scanner/000077500000000000000000000000001506175332700233205ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/lib/regexp_parser/scanner/char_type.rl000066400000000000000000000015651506175332700256440ustar00rootroot00000000000000%%{ machine re_char_type; single_codepoint_char_type = [dDhHsSwW]; multi_codepoint_char_type = [RX]; char_type_char = single_codepoint_char_type | multi_codepoint_char_type; # Char types scanner # -------------------------------------------------------------------------- char_type := |* char_type_char { case text = copy(data, ts-1, te) when '\d'; emit(:type, :digit, text) when '\D'; emit(:type, :nondigit, text) when '\h'; emit(:type, :hex, text) when '\H'; emit(:type, :nonhex, text) when '\s'; emit(:type, :space, text) when '\S'; emit(:type, :nonspace, text) when '\w'; emit(:type, :word, text) when '\W'; emit(:type, :nonword, text) when '\R'; emit(:type, :linebreak, text) when '\X'; emit(:type, :xgrapheme, text) end fret; }; *|; }%% ammar-regexp_parser-68cdeff/lib/regexp_parser/scanner/errors/000077500000000000000000000000001506175332700246345ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/lib/regexp_parser/scanner/errors/premature_end_error.rb000066400000000000000000000003401506175332700312210ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Scanner # Unexpected end of pattern class PrematureEndError < ScannerError def initialize(where = '') super "Premature end of pattern at #{where}" end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/scanner/errors/scanner_error.rb000066400000000000000000000003021506175332700300160ustar00rootroot00000000000000# frozen_string_literal: true require_relative '../../../regexp_parser/error' class Regexp::Scanner # General scanner error (catch all) class ScannerError < Regexp::Parser::Error; end end ammar-regexp_parser-68cdeff/lib/regexp_parser/scanner/errors/validation_error.rb000066400000000000000000000036111506175332700305250ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Scanner # Base for all scanner validation errors class ValidationError < ScannerError # Centralizes and unifies the handling of validation related errors. def self.for(type, problem, reason = nil) types.fetch(type).new(problem, reason) end def self.types @types ||= { backref: InvalidBackrefError, group: InvalidGroupError, group_option: InvalidGroupOption, posix_class: UnknownPosixClassError, property: UnknownUnicodePropertyError, sequence: InvalidSequenceError, } end end # Invalid sequence format. Used for escape sequences, mainly. class InvalidSequenceError < ValidationError def initialize(what = 'sequence', where = '') super "Invalid #{what} at #{where}" end end # Invalid group. Used for named groups. class InvalidGroupError < ValidationError def initialize(what, reason) super "Invalid #{what}, #{reason}." end end # Invalid groupOption. Used for inline options. # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency class InvalidGroupOption < ValidationError def initialize(option, text) super "Invalid group option #{option} in #{text}" end end # Invalid back reference. Used for name a number refs/calls. class InvalidBackrefError < ValidationError def initialize(what, reason) super "Invalid back reference #{what}, #{reason}" end end # The property name was not recognized by the scanner. class UnknownUnicodePropertyError < ValidationError def initialize(name, _) super "Unknown unicode character property name #{name}" end end # The POSIX class name was not recognized by the scanner. class UnknownPosixClassError < ValidationError def initialize(text, _) super "Unknown POSIX class #{text}" end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/scanner/properties/000077500000000000000000000000001506175332700255145ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/lib/regexp_parser/scanner/properties/long.csv000066400000000000000000000474671506175332700272120ustar00rootroot00000000000000# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT adlam,adlam age=1.1,age=1.1 age=10.0,age=10.0 age=11.0,age=11.0 age=12.0,age=12.0 age=12.1,age=12.1 age=13.0,age=13.0 age=14.0,age=14.0 age=15.0,age=15.0 age=15.1,age=15.1 age=16.0,age=16.0 age=2.0,age=2.0 age=2.1,age=2.1 age=3.0,age=3.0 age=3.1,age=3.1 age=3.2,age=3.2 age=4.0,age=4.0 age=4.1,age=4.1 age=5.0,age=5.0 age=5.1,age=5.1 age=5.2,age=5.2 age=6.0,age=6.0 age=6.1,age=6.1 age=6.2,age=6.2 age=6.3,age=6.3 age=7.0,age=7.0 age=8.0,age=8.0 age=9.0,age=9.0 ahom,ahom alnum,alnum alpha,alpha alphabetic,alphabetic anatolianhieroglyphs,anatolian_hieroglyphs any,any arabic,arabic armenian,armenian ascii,ascii asciihexdigit,ascii_hex_digit assigned,assigned avestan,avestan balinese,balinese bamum,bamum bassavah,bassa_vah batak,batak bengali,bengali bhaiksuki,bhaiksuki bidicontrol,bidi_control blank,blank bopomofo,bopomofo brahmi,brahmi braille,braille buginese,buginese buhid,buhid canadianaboriginal,canadian_aboriginal carian,carian cased,cased casedletter,cased_letter caseignorable,case_ignorable caucasianalbanian,caucasian_albanian chakma,chakma cham,cham changeswhencasefolded,changes_when_casefolded changeswhencasemapped,changes_when_casemapped changeswhenlowercased,changes_when_lowercased changeswhentitlecased,changes_when_titlecased changeswhenuppercased,changes_when_uppercased cherokee,cherokee chorasmian,chorasmian closepunctuation,close_punctuation cntrl,cntrl common,common connectorpunctuation,connector_punctuation control,control coptic,coptic cuneiform,cuneiform currencysymbol,currency_symbol cypriot,cypriot cyprominoan,cypro_minoan cyrillic,cyrillic dash,dash dashpunctuation,dash_punctuation decimalnumber,decimal_number defaultignorablecodepoint,default_ignorable_code_point deprecated,deprecated deseret,deseret devanagari,devanagari diacritic,diacritic digit,digit divesakuru,dives_akuru dogra,dogra duployan,duployan egyptianhieroglyphs,egyptian_hieroglyphs elbasan,elbasan elymaic,elymaic emoji,emoji emojicomponent,emoji_component emojimodifier,emoji_modifier emojimodifierbase,emoji_modifier_base emojipresentation,emoji_presentation enclosingmark,enclosing_mark ethiopic,ethiopic extendedpictographic,extended_pictographic extender,extender finalpunctuation,final_punctuation format,format garay,garay georgian,georgian glagolitic,glagolitic gothic,gothic grantha,grantha graph,graph graphemebase,grapheme_base graphemeclusterbreak=control,grapheme_cluster_break=control graphemeclusterbreak=cr,grapheme_cluster_break=cr graphemeclusterbreak=extend,grapheme_cluster_break=extend graphemeclusterbreak=l,grapheme_cluster_break=l graphemeclusterbreak=lf,grapheme_cluster_break=lf graphemeclusterbreak=lv,grapheme_cluster_break=lv graphemeclusterbreak=lvt,grapheme_cluster_break=lvt graphemeclusterbreak=prepend,grapheme_cluster_break=prepend graphemeclusterbreak=regionalindicator,grapheme_cluster_break=regional_indicator graphemeclusterbreak=spacingmark,grapheme_cluster_break=spacingmark graphemeclusterbreak=t,grapheme_cluster_break=t graphemeclusterbreak=v,grapheme_cluster_break=v graphemeclusterbreak=zwj,grapheme_cluster_break=zwj graphemeextend,grapheme_extend graphemelink,grapheme_link greek,greek gujarati,gujarati gunjalagondi,gunjala_gondi gurmukhi,gurmukhi gurungkhema,gurung_khema han,han hangul,hangul hanifirohingya,hanifi_rohingya hanunoo,hanunoo hatran,hatran hebrew,hebrew hexdigit,hex_digit hiragana,hiragana hyphen,hyphen idcompatmathcontinue,id_compat_math_continue idcompatmathstart,id_compat_math_start idcontinue,id_continue ideographic,ideographic idsbinaryoperator,ids_binary_operator idstart,id_start idstrinaryoperator,ids_trinary_operator idsunaryoperator,ids_unary_operator imperialaramaic,imperial_aramaic inadlam,in_adlam inaegeannumbers,in_aegean_numbers inahom,in_ahom inalchemicalsymbols,in_alchemical_symbols inalphabeticpresentationforms,in_alphabetic_presentation_forms inanatolianhieroglyphs,in_anatolian_hieroglyphs inancientgreekmusicalnotation,in_ancient_greek_musical_notation inancientgreeknumbers,in_ancient_greek_numbers inancientsymbols,in_ancient_symbols inarabic,in_arabic inarabicextendeda,in_arabic_extended_a inarabicextendedb,in_arabic_extended_b inarabicextendedc,in_arabic_extended_c inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols inarabicpresentationformsa,in_arabic_presentation_forms_a inarabicpresentationformsb,in_arabic_presentation_forms_b inarabicsupplement,in_arabic_supplement inarmenian,in_armenian inarrows,in_arrows inavestan,in_avestan inbalinese,in_balinese inbamum,in_bamum inbamumsupplement,in_bamum_supplement inbasiclatin,in_basic_latin inbassavah,in_bassa_vah inbatak,in_batak inbengali,in_bengali inbhaiksuki,in_bhaiksuki inblockelements,in_block_elements inbopomofo,in_bopomofo inbopomofoextended,in_bopomofo_extended inboxdrawing,in_box_drawing inbrahmi,in_brahmi inbraillepatterns,in_braille_patterns inbuginese,in_buginese inbuhid,in_buhid inbyzantinemusicalsymbols,in_byzantine_musical_symbols incarian,in_carian incaucasianalbanian,in_caucasian_albanian inchakma,in_chakma incham,in_cham incherokee,in_cherokee incherokeesupplement,in_cherokee_supplement inchesssymbols,in_chess_symbols inchorasmian,in_chorasmian incjkcompatibility,in_cjk_compatibility incjkcompatibilityforms,in_cjk_compatibility_forms incjkcompatibilityideographs,in_cjk_compatibility_ideographs incjkcompatibilityideographssupplement,in_cjk_compatibility_ideographs_supplement incjkradicalssupplement,in_cjk_radicals_supplement incjkstrokes,in_cjk_strokes incjksymbolsandpunctuation,in_cjk_symbols_and_punctuation incjkunifiedideographs,in_cjk_unified_ideographs incjkunifiedideographsextensiona,in_cjk_unified_ideographs_extension_a incjkunifiedideographsextensionb,in_cjk_unified_ideographs_extension_b incjkunifiedideographsextensionc,in_cjk_unified_ideographs_extension_c incjkunifiedideographsextensiond,in_cjk_unified_ideographs_extension_d incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g incjkunifiedideographsextensionh,in_cjk_unified_ideographs_extension_h incjkunifiedideographsextensioni,in_cjk_unified_ideographs_extension_i incombiningdiacriticalmarks,in_combining_diacritical_marks incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols incombiningdiacriticalmarkssupplement,in_combining_diacritical_marks_supplement incombininghalfmarks,in_combining_half_marks incommonindicnumberforms,in_common_indic_number_forms incontrolpictures,in_control_pictures incoptic,in_coptic incopticepactnumbers,in_coptic_epact_numbers incountingrodnumerals,in_counting_rod_numerals incuneiform,in_cuneiform incuneiformnumbersandpunctuation,in_cuneiform_numbers_and_punctuation incurrencysymbols,in_currency_symbols incypriotsyllabary,in_cypriot_syllabary incyprominoan,in_cypro_minoan incyrillic,in_cyrillic incyrillicextendeda,in_cyrillic_extended_a incyrillicextendedb,in_cyrillic_extended_b incyrillicextendedc,in_cyrillic_extended_c incyrillicextendedd,in_cyrillic_extended_d incyrillicsupplement,in_cyrillic_supplement indeseret,in_deseret indevanagari,in_devanagari indevanagariextended,in_devanagari_extended indevanagariextendeda,in_devanagari_extended_a indingbats,in_dingbats indivesakuru,in_dives_akuru indogra,in_dogra indominotiles,in_domino_tiles induployan,in_duployan inearlydynasticcuneiform,in_early_dynastic_cuneiform inegyptianhieroglyphformatcontrols,in_egyptian_hieroglyph_format_controls inegyptianhieroglyphs,in_egyptian_hieroglyphs inegyptianhieroglyphsextendeda,in_egyptian_hieroglyphs_extended_a inelbasan,in_elbasan inelymaic,in_elymaic inemoticons,in_emoticons inenclosedalphanumerics,in_enclosed_alphanumerics inenclosedalphanumericsupplement,in_enclosed_alphanumeric_supplement inenclosedcjklettersandmonths,in_enclosed_cjk_letters_and_months inenclosedideographicsupplement,in_enclosed_ideographic_supplement inethiopic,in_ethiopic inethiopicextended,in_ethiopic_extended inethiopicextendeda,in_ethiopic_extended_a inethiopicextendedb,in_ethiopic_extended_b inethiopicsupplement,in_ethiopic_supplement ingaray,in_garay ingeneralpunctuation,in_general_punctuation ingeometricshapes,in_geometric_shapes ingeometricshapesextended,in_geometric_shapes_extended ingeorgian,in_georgian ingeorgianextended,in_georgian_extended ingeorgiansupplement,in_georgian_supplement inglagolitic,in_glagolitic inglagoliticsupplement,in_glagolitic_supplement ingothic,in_gothic ingrantha,in_grantha ingreekandcoptic,in_greek_and_coptic ingreekextended,in_greek_extended ingujarati,in_gujarati ingunjalagondi,in_gunjala_gondi ingurmukhi,in_gurmukhi ingurungkhema,in_gurung_khema inhalfwidthandfullwidthforms,in_halfwidth_and_fullwidth_forms inhangulcompatibilityjamo,in_hangul_compatibility_jamo inhanguljamo,in_hangul_jamo inhanguljamoextendeda,in_hangul_jamo_extended_a inhanguljamoextendedb,in_hangul_jamo_extended_b inhangulsyllables,in_hangul_syllables inhanifirohingya,in_hanifi_rohingya inhanunoo,in_hanunoo inhatran,in_hatran inhebrew,in_hebrew inherited,inherited inhighprivateusesurrogates,in_high_private_use_surrogates inhighsurrogates,in_high_surrogates inhiragana,in_hiragana inideographicdescriptioncharacters,in_ideographic_description_characters inideographicsymbolsandpunctuation,in_ideographic_symbols_and_punctuation inimperialaramaic,in_imperial_aramaic inindicsiyaqnumbers,in_indic_siyaq_numbers ininscriptionalpahlavi,in_inscriptional_pahlavi ininscriptionalparthian,in_inscriptional_parthian inipaextensions,in_ipa_extensions initialpunctuation,initial_punctuation injavanese,in_javanese inkaithi,in_kaithi inkaktoviknumerals,in_kaktovik_numerals inkanaextendeda,in_kana_extended_a inkanaextendedb,in_kana_extended_b inkanasupplement,in_kana_supplement inkanbun,in_kanbun inkangxiradicals,in_kangxi_radicals inkannada,in_kannada inkatakana,in_katakana inkatakanaphoneticextensions,in_katakana_phonetic_extensions inkawi,in_kawi inkayahli,in_kayah_li inkharoshthi,in_kharoshthi inkhitansmallscript,in_khitan_small_script inkhmer,in_khmer inkhmersymbols,in_khmer_symbols inkhojki,in_khojki inkhudawadi,in_khudawadi inkiratrai,in_kirat_rai inlao,in_lao inlatin1supplement,in_latin_1_supplement inlatinextendeda,in_latin_extended_a inlatinextendedadditional,in_latin_extended_additional inlatinextendedb,in_latin_extended_b inlatinextendedc,in_latin_extended_c inlatinextendedd,in_latin_extended_d inlatinextendede,in_latin_extended_e inlatinextendedf,in_latin_extended_f inlatinextendedg,in_latin_extended_g inlepcha,in_lepcha inletterlikesymbols,in_letterlike_symbols inlimbu,in_limbu inlineara,in_linear_a inlinearbideograms,in_linear_b_ideograms inlinearbsyllabary,in_linear_b_syllabary inlisu,in_lisu inlisusupplement,in_lisu_supplement inlowsurrogates,in_low_surrogates inlycian,in_lycian inlydian,in_lydian inmahajani,in_mahajani inmahjongtiles,in_mahjong_tiles inmakasar,in_makasar inmalayalam,in_malayalam inmandaic,in_mandaic inmanichaean,in_manichaean inmarchen,in_marchen inmasaramgondi,in_masaram_gondi inmathematicalalphanumericsymbols,in_mathematical_alphanumeric_symbols inmathematicaloperators,in_mathematical_operators inmayannumerals,in_mayan_numerals inmedefaidrin,in_medefaidrin inmeeteimayek,in_meetei_mayek inmeeteimayekextensions,in_meetei_mayek_extensions inmendekikakui,in_mende_kikakui inmeroiticcursive,in_meroitic_cursive inmeroitichieroglyphs,in_meroitic_hieroglyphs inmiao,in_miao inmiscellaneousmathematicalsymbolsa,in_miscellaneous_mathematical_symbols_a inmiscellaneousmathematicalsymbolsb,in_miscellaneous_mathematical_symbols_b inmiscellaneoussymbols,in_miscellaneous_symbols inmiscellaneoussymbolsandarrows,in_miscellaneous_symbols_and_arrows inmiscellaneoussymbolsandpictographs,in_miscellaneous_symbols_and_pictographs inmiscellaneoustechnical,in_miscellaneous_technical inmodi,in_modi inmodifiertoneletters,in_modifier_tone_letters inmongolian,in_mongolian inmongoliansupplement,in_mongolian_supplement inmro,in_mro inmultani,in_multani inmusicalsymbols,in_musical_symbols inmyanmar,in_myanmar inmyanmarextendeda,in_myanmar_extended_a inmyanmarextendedb,in_myanmar_extended_b inmyanmarextendedc,in_myanmar_extended_c innabataean,in_nabataean innagmundari,in_nag_mundari innandinagari,in_nandinagari innewa,in_newa innewtailue,in_new_tai_lue innko,in_nko innoblock,in_no_block innumberforms,in_number_forms innushu,in_nushu innyiakengpuachuehmong,in_nyiakeng_puachue_hmong inogham,in_ogham inolchiki,in_ol_chiki inoldhungarian,in_old_hungarian inolditalic,in_old_italic inoldnortharabian,in_old_north_arabian inoldpermic,in_old_permic inoldpersian,in_old_persian inoldsogdian,in_old_sogdian inoldsoutharabian,in_old_south_arabian inoldturkic,in_old_turkic inolduyghur,in_old_uyghur inolonal,in_ol_onal inopticalcharacterrecognition,in_optical_character_recognition inoriya,in_oriya inornamentaldingbats,in_ornamental_dingbats inosage,in_osage inosmanya,in_osmanya inottomansiyaqnumbers,in_ottoman_siyaq_numbers inpahawhhmong,in_pahawh_hmong inpalmyrene,in_palmyrene inpaucinhau,in_pau_cin_hau inphagspa,in_phags_pa inphaistosdisc,in_phaistos_disc inphoenician,in_phoenician inphoneticextensions,in_phonetic_extensions inphoneticextensionssupplement,in_phonetic_extensions_supplement inplayingcards,in_playing_cards inprivateusearea,in_private_use_area inpsalterpahlavi,in_psalter_pahlavi inrejang,in_rejang inruminumeralsymbols,in_rumi_numeral_symbols inrunic,in_runic insamaritan,in_samaritan insaurashtra,in_saurashtra inscriptionalpahlavi,inscriptional_pahlavi inscriptionalparthian,inscriptional_parthian insharada,in_sharada inshavian,in_shavian inshorthandformatcontrols,in_shorthand_format_controls insiddham,in_siddham insinhala,in_sinhala insinhalaarchaicnumbers,in_sinhala_archaic_numbers insmallformvariants,in_small_form_variants insmallkanaextension,in_small_kana_extension insogdian,in_sogdian insorasompeng,in_sora_sompeng insoyombo,in_soyombo inspacingmodifierletters,in_spacing_modifier_letters inspecials,in_specials insundanese,in_sundanese insundanesesupplement,in_sundanese_supplement insunuwar,in_sunuwar insuperscriptsandsubscripts,in_superscripts_and_subscripts insupplementalarrowsa,in_supplemental_arrows_a insupplementalarrowsb,in_supplemental_arrows_b insupplementalarrowsc,in_supplemental_arrows_c insupplementalmathematicaloperators,in_supplemental_mathematical_operators insupplementalpunctuation,in_supplemental_punctuation insupplementalsymbolsandpictographs,in_supplemental_symbols_and_pictographs insupplementaryprivateuseareaa,in_supplementary_private_use_area_a insupplementaryprivateuseareab,in_supplementary_private_use_area_b insuttonsignwriting,in_sutton_signwriting insylotinagri,in_syloti_nagri insymbolsandpictographsextendeda,in_symbols_and_pictographs_extended_a insymbolsforlegacycomputing,in_symbols_for_legacy_computing insymbolsforlegacycomputingsupplement,in_symbols_for_legacy_computing_supplement insyriac,in_syriac insyriacsupplement,in_syriac_supplement intagalog,in_tagalog intagbanwa,in_tagbanwa intags,in_tags intaile,in_tai_le intaitham,in_tai_tham intaiviet,in_tai_viet intaixuanjingsymbols,in_tai_xuan_jing_symbols intakri,in_takri intamil,in_tamil intamilsupplement,in_tamil_supplement intangsa,in_tangsa intangut,in_tangut intangutcomponents,in_tangut_components intangutsupplement,in_tangut_supplement intelugu,in_telugu inthaana,in_thaana inthai,in_thai intibetan,in_tibetan intifinagh,in_tifinagh intirhuta,in_tirhuta intodhri,in_todhri intoto,in_toto intransportandmapsymbols,in_transport_and_map_symbols intulutigalari,in_tulu_tigalari inugaritic,in_ugaritic inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended inunifiedcanadianaboriginalsyllabicsextendeda,in_unified_canadian_aboriginal_syllabics_extended_a invai,in_vai invariationselectors,in_variation_selectors invariationselectorssupplement,in_variation_selectors_supplement invedicextensions,in_vedic_extensions inverticalforms,in_vertical_forms invithkuqi,in_vithkuqi inwancho,in_wancho inwarangciti,in_warang_citi inyezidi,in_yezidi inyijinghexagramsymbols,in_yijing_hexagram_symbols inyiradicals,in_yi_radicals inyisyllables,in_yi_syllables inzanabazarsquare,in_zanabazar_square inznamennymusicalnotation,in_znamenny_musical_notation javanese,javanese joincontrol,join_control kaithi,kaithi kannada,kannada katakana,katakana kawi,kawi kayahli,kayah_li kharoshthi,kharoshthi khitansmallscript,khitan_small_script khmer,khmer khojki,khojki khudawadi,khudawadi kiratrai,kirat_rai lao,lao latin,latin lepcha,lepcha letter,letter letternumber,letter_number limbu,limbu lineara,linear_a linearb,linear_b lineseparator,line_separator lisu,lisu logicalorderexception,logical_order_exception lower,lower lowercase,lowercase lowercaseletter,lowercase_letter lycian,lycian lydian,lydian mahajani,mahajani makasar,makasar malayalam,malayalam mandaic,mandaic manichaean,manichaean marchen,marchen mark,mark masaramgondi,masaram_gondi math,math mathsymbol,math_symbol medefaidrin,medefaidrin meeteimayek,meetei_mayek mendekikakui,mende_kikakui meroiticcursive,meroitic_cursive meroitichieroglyphs,meroitic_hieroglyphs miao,miao modi,modi modifiercombiningmark,modifier_combining_mark modifierletter,modifier_letter modifiersymbol,modifier_symbol mongolian,mongolian mro,mro multani,multani myanmar,myanmar nabataean,nabataean nagmundari,nag_mundari nandinagari,nandinagari newa,newa newline,newline newtailue,new_tai_lue nko,nko noncharactercodepoint,noncharacter_code_point nonspacingmark,nonspacing_mark number,number nushu,nushu nyiakengpuachuehmong,nyiakeng_puachue_hmong ogham,ogham olchiki,ol_chiki oldhungarian,old_hungarian olditalic,old_italic oldnortharabian,old_north_arabian oldpermic,old_permic oldpersian,old_persian oldsogdian,old_sogdian oldsoutharabian,old_south_arabian oldturkic,old_turkic olduyghur,old_uyghur olonal,ol_onal openpunctuation,open_punctuation oriya,oriya osage,osage osmanya,osmanya other,other otheralphabetic,other_alphabetic otherdefaultignorablecodepoint,other_default_ignorable_code_point othergraphemeextend,other_grapheme_extend otheridcontinue,other_id_continue otheridstart,other_id_start otherletter,other_letter otherlowercase,other_lowercase othermath,other_math othernumber,other_number otherpunctuation,other_punctuation othersymbol,other_symbol otheruppercase,other_uppercase pahawhhmong,pahawh_hmong palmyrene,palmyrene paragraphseparator,paragraph_separator patternsyntax,pattern_syntax patternwhitespace,pattern_white_space paucinhau,pau_cin_hau phagspa,phags_pa phoenician,phoenician prependedconcatenationmark,prepended_concatenation_mark print,print privateuse,private_use psalterpahlavi,psalter_pahlavi punct,punct punctuation,punctuation quotationmark,quotation_mark radical,radical regionalindicator,regional_indicator rejang,rejang runic,runic samaritan,samaritan saurashtra,saurashtra sentenceterminal,sentence_terminal separator,separator sharada,sharada shavian,shavian siddham,siddham signwriting,signwriting sinhala,sinhala softdotted,soft_dotted sogdian,sogdian sorasompeng,sora_sompeng soyombo,soyombo space,space spaceseparator,space_separator spacingmark,spacing_mark sundanese,sundanese sunuwar,sunuwar surrogate,surrogate sylotinagri,syloti_nagri symbol,symbol syriac,syriac tagalog,tagalog tagbanwa,tagbanwa taile,tai_le taitham,tai_tham taiviet,tai_viet takri,takri tamil,tamil tangsa,tangsa tangut,tangut telugu,telugu terminalpunctuation,terminal_punctuation thaana,thaana thai,thai tibetan,tibetan tifinagh,tifinagh tirhuta,tirhuta titlecaseletter,titlecase_letter todhri,todhri toto,toto tulutigalari,tulu_tigalari ugaritic,ugaritic unassigned,unassigned unifiedideograph,unified_ideograph unknown,unknown upper,upper uppercase,uppercase uppercaseletter,uppercase_letter vai,vai variationselector,variation_selector vithkuqi,vithkuqi wancho,wancho warangciti,warang_citi whitespace,white_space word,word xdigit,xdigit xidcontinue,xid_continue xidstart,xid_start xposixpunct,xposixpunct yezidi,yezidi yi,yi zanabazarsquare,zanabazar_square ammar-regexp_parser-68cdeff/lib/regexp_parser/scanner/properties/short.csv000066400000000000000000000103421506175332700273700ustar00rootroot00000000000000# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT adlm,adlam aghb,caucasian_albanian ahex,ascii_hex_digit arab,arabic armi,imperial_aramaic armn,armenian avst,avestan bali,balinese bamu,bamum bass,bassa_vah batk,batak beng,bengali bhks,bhaiksuki bidic,bidi_control bopo,bopomofo brah,brahmi brai,braille bugi,buginese buhd,buhid c,other cakm,chakma cans,canadian_aboriginal cari,carian cc,control cf,format cher,cherokee chrs,chorasmian ci,case_ignorable cn,unassigned co,private_use combiningmark,mark copt,coptic cpmn,cypro_minoan cprt,cypriot cs,surrogate cwcf,changes_when_casefolded cwcm,changes_when_casemapped cwl,changes_when_lowercased cwt,changes_when_titlecased cwu,changes_when_uppercased cyrl,cyrillic dep,deprecated deva,devanagari di,default_ignorable_code_point dia,diacritic diak,dives_akuru dogr,dogra dsrt,deseret dupl,duployan ebase,emoji_modifier_base ecomp,emoji_component egyp,egyptian_hieroglyphs elba,elbasan elym,elymaic emod,emoji_modifier epres,emoji_presentation ethi,ethiopic ext,extender extpict,extended_pictographic gara,garay geor,georgian glag,glagolitic gong,gunjala_gondi gonm,masaram_gondi goth,gothic gran,grantha grbase,grapheme_base grek,greek grext,grapheme_extend grlink,grapheme_link gujr,gujarati gukh,gurung_khema guru,gurmukhi hang,hangul hani,han hano,hanunoo hatr,hatran hebr,hebrew hex,hex_digit hira,hiragana hluw,anatolian_hieroglyphs hmng,pahawh_hmong hmnp,nyiakeng_puachue_hmong hung,old_hungarian idc,id_continue ideo,ideographic ids,id_start idsb,ids_binary_operator idst,ids_trinary_operator idsu,ids_unary_operator ital,old_italic java,javanese joinc,join_control kali,kayah_li kana,katakana khar,kharoshthi khmr,khmer khoj,khojki kits,khitan_small_script knda,kannada krai,kirat_rai kthi,kaithi l,letter lana,tai_tham laoo,lao latn,latin lc,cased_letter lepc,lepcha limb,limbu lina,linear_a linb,linear_b ll,lowercase_letter lm,modifier_letter lo,other_letter loe,logical_order_exception lt,titlecase_letter lu,uppercase_letter lyci,lycian lydi,lydian m,mark mahj,mahajani maka,makasar mand,mandaic mani,manichaean marc,marchen mc,spacing_mark mcm,modifier_combining_mark me,enclosing_mark medf,medefaidrin mend,mende_kikakui merc,meroitic_cursive mero,meroitic_hieroglyphs mlym,malayalam mn,nonspacing_mark mong,mongolian mroo,mro mtei,meetei_mayek mult,multani mymr,myanmar n,number nagm,nag_mundari nand,nandinagari narb,old_north_arabian nbat,nabataean nchar,noncharacter_code_point nd,decimal_number nkoo,nko nl,letter_number no,other_number nshu,nushu oalpha,other_alphabetic odi,other_default_ignorable_code_point ogam,ogham ogrext,other_grapheme_extend oidc,other_id_continue oids,other_id_start olck,ol_chiki olower,other_lowercase omath,other_math onao,ol_onal orkh,old_turkic orya,oriya osge,osage osma,osmanya ougr,old_uyghur oupper,other_uppercase p,punctuation palm,palmyrene patsyn,pattern_syntax patws,pattern_white_space pauc,pau_cin_hau pc,connector_punctuation pcm,prepended_concatenation_mark pd,dash_punctuation pe,close_punctuation perm,old_permic pf,final_punctuation phag,phags_pa phli,inscriptional_pahlavi phlp,psalter_pahlavi phnx,phoenician pi,initial_punctuation plrd,miao po,other_punctuation prti,inscriptional_parthian ps,open_punctuation qaac,coptic qaai,inherited qmark,quotation_mark ri,regional_indicator rjng,rejang rohg,hanifi_rohingya runr,runic s,symbol samr,samaritan sarb,old_south_arabian saur,saurashtra sc,currency_symbol sd,soft_dotted sgnw,signwriting shaw,shavian shrd,sharada sidd,siddham sind,khudawadi sinh,sinhala sk,modifier_symbol sm,math_symbol so,other_symbol sogd,sogdian sogo,old_sogdian sora,sora_sompeng soyo,soyombo sterm,sentence_terminal sund,sundanese sunu,sunuwar sylo,syloti_nagri syrc,syriac tagb,tagbanwa takr,takri tale,tai_le talu,new_tai_lue taml,tamil tang,tangut tavt,tai_viet telu,telugu term,terminal_punctuation tfng,tifinagh tglg,tagalog thaa,thaana tibt,tibetan tirh,tirhuta tnsa,tangsa todr,todhri tutg,tulu_tigalari ugar,ugaritic uideo,unified_ideograph vaii,vai vith,vithkuqi vs,variation_selector wara,warang_citi wcho,wancho wspace,white_space xidc,xid_continue xids,xid_start xpeo,old_persian xsux,cuneiform yezi,yezidi yiii,yi z,separator zanb,zanabazar_square zinh,inherited zl,line_separator zp,paragraph_separator zs,space_separator zyyy,common zzzz,unknown ammar-regexp_parser-68cdeff/lib/regexp_parser/scanner/property.rl000066400000000000000000000014401506175332700255420ustar00rootroot00000000000000%%{ machine re_property; property_char = [pP]; property_sequence = property_char . '{' . '^'? (alnum|space|[_\-\.=])+ '}'; action premature_property_end { raise PrematureEndError.new('unicode property') } # Unicode properties scanner # -------------------------------------------------------------------------- unicode_property := |* property_sequence < eof(premature_property_end) { text = copy(data, ts-1, te) type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase token = self.class.short_prop_map[name] || self.class.long_prop_map[name] raise ValidationError.for(:property, name) unless token self.emit(type, token.to_sym, text) fret; }; *|; }%% ammar-regexp_parser-68cdeff/lib/regexp_parser/scanner/scanner.rl000066400000000000000000000654671506175332700253320ustar00rootroot00000000000000%%{ machine re_scanner; include re_char_type "char_type.rl"; include re_property "property.rl"; utf8_2_byte = (0xc2..0xdf 0x80..0xbf); utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf); utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf); utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte; dot = '.'; backslash = '\\'; alternation = '|'; beginning_of_line = '^'; end_of_line = '$'; range_open = '{'; range_close = '}'; curlies = range_open | range_close; group_open = '('; group_close = ')'; parentheses = group_open | group_close; set_open = '['; set_close = ']'; brackets = set_open | set_close; comment = ('#' . [^\n]* . '\n'?); class_posix = ('[:' . '^'? . [^\[\]]* . ':]'); line_anchor = beginning_of_line | end_of_line; anchor_char = [AbBzZG]; escaped_ascii = [abefnrtv]; octal_sequence = [0-7]{1,3}; hex_sequence = 'x' . xdigit{1,2}; hex_sequence_err = 'x' . [^0-9A-Fa-f]; high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*; codepoint_single = 'u' . xdigit{4}; codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}'; codepoint_sequence = codepoint_single | codepoint_list; control_sequence = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any; meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any; sequence_char = [CMcux]; zero_or_one = '?' | '??' | '?+'; zero_or_more = '*' | '*?' | '*+'; one_or_more = '+' | '+?' | '++'; quantifier_greedy = '?' | '*' | '+'; quantity_exact = (digit+); quantity_minimum = (digit+) . ','; quantity_maximum = ',' . (digit+); quantity_range = (digit+) . ',' . (digit+); quantifier_interval = range_open . ( quantity_exact | quantity_minimum | quantity_maximum | quantity_range ) . range_close; conditional = '(?('; group_comment = '?#' . [^)]* . group_close; group_atomic = '?>'; group_passive = '?:'; group_absence = '?~'; assertion_lookahead = '?='; assertion_nlookahead = '?!'; assertion_lookbehind = '?<='; assertion_nlookbehind = '?~]+ . ':'? ) ?; group_name_id_ab = ([^!=0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*; group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*; group_number = '-'? . [0-9]+; group_level = [+\-] . [0-9]+; group_name = ('<' . group_name_id_ab? . '>') | ("'" . group_name_id_sq? . "'"); group_lookup = group_name | group_number; group_named = ('?' . group_name ); group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') | ("'" . (group_name_id_sq? | group_number) . group_level? "'")); group_ref = 'k' . group_ref_body; group_call = 'g' . group_ref_body; group_type = group_atomic | group_passive | group_absence | group_named; keep_mark = 'K'; assertion_type = assertion_lookahead | assertion_nlookahead | assertion_lookbehind | assertion_nlookbehind; # characters that 'break' a literal meta_char = dot | backslash | alternation | curlies | parentheses | brackets | line_anchor | quantifier_greedy; literal_delimiters = ']' | '}'; ascii_print = ((0x20..0x7e) - meta_char - '#'); ascii_nonprint = (0x01..0x1f | 0x7f); non_literal_escape = char_type_char | anchor_char | escaped_ascii | keep_mark | sequence_char; # escapes that also work within a character set set_escape = backslash | brackets | escaped_ascii | octal_sequence | property_char | sequence_char | single_codepoint_char_type; # EOF error, used where it can be detected action premature_end_error { text = copy(data, ts ? ts-1 : 0, -1) raise PrematureEndError.new(text) } # Invalid sequence error, used from sequences, like escapes and sets action invalid_sequence_error { text = copy(data, ts ? ts-1 : 0, -1) raise ValidationError.for(:sequence, 'sequence', text) } # group (nesting) and set open/close actions action group_opened { self.group_depth = group_depth + 1 } action group_closed { self.group_depth = group_depth - 1 } action set_opened { self.set_depth = set_depth + 1 } action set_closed { self.set_depth = set_depth - 1 } # Character set scanner, continues consuming characters until it meets the # closing bracket of the set. # -------------------------------------------------------------------------- character_set := |* set_close > (set_meta, 2) @set_closed { emit(:set, :close, copy(data, ts, te)) if in_set? fret; else fgoto main; end }; '-]' @set_closed { # special case, emits two tokens emit(:literal, :literal, '-') emit(:set, :close, ']') if in_set? fret; else fgoto main; end }; '-&&' { # special case, emits two tokens emit(:literal, :literal, '-') emit(:set, :intersection, '&&') }; '^' { if prev_token[1] == :open emit(:set, :negate, '^') else emit(:literal, :literal, '^') end }; '-' { # ranges cant start with the opening bracket, a subset, or # intersection/negation/range operators if prev_token[0] == :set emit(:literal, :literal, '-') else emit(:set, :range, '-') end }; # Unlike ranges, intersections can start or end at set boundaries, whereupon # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil] '&&' { emit(:set, :intersection, '&&') }; backslash { fcall set_escape_sequence; }; set_open >(open_bracket, 1) >set_opened { emit(:set, :open, '[') fcall character_set; }; class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) { text = copy(data, ts, te) type = :posixclass class_name = text[2..-3] if class_name[0] == '^' class_name = class_name[1..-1] type = :nonposixclass end unless POSIX_CLASSES[class_name] raise ValidationError.for(:posix_class, text) end emit(type, class_name.to_sym, text) }; meta_char > (set_meta, 1) { emit(:literal, :literal, copy(data, ts, te)) }; any | ascii_nonprint | utf8_multibyte { text = copy(data, ts, te) emit(:literal, :literal, text) }; *|; # set escapes scanner # -------------------------------------------------------------------------- set_escape_sequence := |* # Special case: in sets, octal sequences have higher priority than backrefs octal_sequence { emit(:escape, :octal, copy(data, ts-1, te)) fret; }; # Scan all other escapes that work in sets with the generic escape scanner set_escape > (escaped_set_alpha, 2) { fhold; fnext character_set; fcall escape_sequence; }; # Treat all remaining escapes - those not supported in sets - as literal. # (This currently includes \^, \-, \&, \:, although these could potentially # be meta chars when not escaped, depending on their position in the set.) (any | utf8_multibyte) > (escaped_set_alpha, 1) { emit(:escape, :literal, copy(data, ts-1, te)) fret; }; *|; # escape sequence scanner # -------------------------------------------------------------------------- escape_sequence := |* [1-9] . [0-9]* { text = copy(data, ts-1, te) # If not enough groups have been opened, there is a fallback to either an # octal or literal interpretation for 2+ digit numerical escapes. digits = text[1..-1] if digits.size == 1 || digits.to_i <= capturing_group_count emit(:backref, :number, text) elsif digits =~ /\A[0-7]{2,}\z/ emit(:escape, :octal, text) else emit(:escape, :literal, text[0..1]) emit(:literal, :literal, text[2..-1]) end fret; }; octal_sequence { emit(:escape, :octal, copy(data, ts-1, te)) fret; }; [8-9] . [0-9] { # special case, emits two tokens text = copy(data, ts-1, te) emit(:escape, :literal, text[0, 2]) emit(:literal, :literal, text[2]) fret; }; meta_char { case text = copy(data, ts-1, te) when '\.'; emit(:escape, :dot, text) when '\|'; emit(:escape, :alternation, text) when '\^'; emit(:escape, :bol, text) when '\$'; emit(:escape, :eol, text) when '\?'; emit(:escape, :zero_or_one, text) when '\*'; emit(:escape, :zero_or_more, text) when '\+'; emit(:escape, :one_or_more, text) when '\('; emit(:escape, :group_open, text) when '\)'; emit(:escape, :group_close, text) when '\{'; emit(:escape, :interval_open, text) when '\}'; emit(:escape, :interval_close, text) when '\['; emit(:escape, :set_open, text) when '\]'; emit(:escape, :set_close, text) when "\\\\"; emit(:escape, :backslash, text) end fret; }; escaped_ascii > (escaped_alpha, 7) { # \b is emitted as backspace only when inside a character set, otherwise # it is a word boundary anchor. A syntax might "normalize" it if needed. case text = copy(data, ts-1, te) when '\a'; emit(:escape, :bell, text) when '\b'; emit(:escape, :backspace, text) when '\e'; emit(:escape, :escape, text) when '\f'; emit(:escape, :form_feed, text) when '\n'; emit(:escape, :newline, text) when '\r'; emit(:escape, :carriage, text) when '\t'; emit(:escape, :tab, text) when '\v'; emit(:escape, :vertical_tab, text) end fret; }; codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) { text = copy(data, ts-1, te) if text[2] == '{' emit(:escape, :codepoint_list, text) else emit(:escape, :codepoint, text) end fret; }; high_hex_sequence > (escaped_alpha, 5) { text = copy(data, ts-1, te) if regexp_encoding == Encoding::BINARY text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) } else emit(:escape, :utf8_hex, text) end fret; }; hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) { emit(:escape, :hex, copy(data, ts-1, te)) fret; }; hex_sequence_err @invalid_sequence_error { fret; }; control_sequence >(escaped_alpha, 4) $eof(premature_end_error) { emit_meta_control_sequence(data, ts, te, :control) fret; }; meta_sequence >(backslashed, 3) $eof(premature_end_error) { emit_meta_control_sequence(data, ts, te, :meta_sequence) fret; }; char_type_char > (escaped_alpha, 2) { fhold; fnext *(in_set? ? fentry(character_set) : fentry(main)); fcall char_type; }; property_char > (escaped_alpha, 2) { fhold; fnext *(in_set? ? fentry(character_set) : fentry(main)); fcall unicode_property; }; (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) { emit(:escape, :literal, copy(data, ts-1, te)) fret; }; *|; # conditional expressions scanner # -------------------------------------------------------------------------- conditional_expression := |* group_lookup . ')' { text = copy(data, ts, te-1) text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID') emit(:conditional, :condition, text) emit(:conditional, :condition_close, ')') }; any { fhold; fcall main; }; *|; # Main scanner # -------------------------------------------------------------------------- main := |* # Meta characters # ------------------------------------------------------------------------ dot { emit(:meta, :dot, copy(data, ts, te)) }; alternation { if conditional_stack.last == group_depth emit(:conditional, :separator, copy(data, ts, te)) else emit(:meta, :alternation, copy(data, ts, te)) end }; # Anchors # ------------------------------------------------------------------------ beginning_of_line { emit(:anchor, :bol, copy(data, ts, te)) }; end_of_line { emit(:anchor, :eol, copy(data, ts, te)) }; backslash . keep_mark > (backslashed, 4) { emit(:keep, :mark, copy(data, ts, te)) }; backslash . anchor_char > (backslashed, 3) { case text = copy(data, ts, te) when '\A'; emit(:anchor, :bos, text) when '\z'; emit(:anchor, :eos, text) when '\Z'; emit(:anchor, :eos_ob_eol, text) when '\b'; emit(:anchor, :word_boundary, text) when '\B'; emit(:anchor, :nonword_boundary, text) when '\G'; emit(:anchor, :match_start, text) end }; literal_delimiters { append_literal(data, ts, te) }; # Character sets # ------------------------------------------------------------------------ set_open >set_opened { emit(:set, :open, copy(data, ts, te)) fcall character_set; }; # Conditional expression # (?(condition)Y|N) conditional expression # ------------------------------------------------------------------------ conditional { text = copy(data, ts, te) conditional_stack << group_depth emit(:conditional, :open, text[0..-2]) emit(:conditional, :condition_open, '(') fcall conditional_expression; }; # (?#...) comments: parsed as a single expression, without introducing a # new nesting level. Comments may not include parentheses, escaped or not. # special case for close to get the correct closing count. # ------------------------------------------------------------------------ (group_open . group_comment) @group_closed { emit(:group, :comment, copy(data, ts, te)) }; # Expression options: # (?imxdau-imx) option on/off # i: ignore case # m: multi-line (dot(.) match newline) # x: extended form # d: default class rules (1.9 compatible) # a: ASCII class rules (\s, \w, etc.) # u: Unicode class rules (\s, \w, etc.) # # (?imxdau-imx:subexp) option on/off for subexp # ------------------------------------------------------------------------ (group_open . group_options) >group_opened { text = copy(data, ts, te) if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/ raise ValidationError.for(:group_option, $1 || "-#{$2}", text) end emit_options(text) }; # Assertions # (?=subexp) look-ahead # (?!subexp) negative look-ahead # (?<=subexp) look-behind # (?group_opened { case text = copy(data, ts, te) when '(?='; emit(:assertion, :lookahead, text) when '(?!'; emit(:assertion, :nlookahead, text) when '(?<='; emit(:assertion, :lookbehind, text) when '(?subexp) atomic group, don't backtrack in subexp. # (?~subexp) absence group, matches anything that is not subexp # (?subexp) named group # (?'name'subexp) named group (single quoted version) # (subexp) captured group # ------------------------------------------------------------------------ (group_open . group_type) >group_opened { case text = copy(data, ts, te) when '(?:'; emit(:group, :passive, text) when '(?>'; emit(:group, :atomic, text) when '(?~'; emit(:group, :absence, text) when /^\(\?(?:<>|'')/ raise ValidationError.for(:group, 'named group', 'name is empty') when /^\(\?<[^>]+>/ emit(:group, :named_ab, text) when /^\(\?'[^']+'/ emit(:group, :named_sq, text) end }; group_open @group_opened { self.capturing_group_count = capturing_group_count + 1 text = copy(data, ts, te) emit(:group, :capture, text) }; group_close @group_closed { if conditional_stack.last == group_depth + 1 conditional_stack.pop emit(:conditional, :close, ')') elsif group_depth >= 0 if spacing_stack.length > 1 && spacing_stack.last[:depth] == group_depth + 1 spacing_stack.pop self.free_spacing = spacing_stack.last[:free_spacing] end emit(:group, :close, ')') else raise ValidationError.for(:group, 'group', 'unmatched close parenthesis') end }; # Group backreference, named and numbered # ------------------------------------------------------------------------ backslash . (group_ref) > (backslashed, 4) { case text = copy(data, ts, te) when /^\\k(.)[^0-9\-][^+\-]*['>]$/ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text) when /^\\k(.)0*[1-9]\d*['>]$/ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text) when /^\\k(.)-0*[1-9]\d*['>]$/ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text) when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text) when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text) else raise ValidationError.for(:backref, 'backreference', 'invalid ref ID') end }; # Group call, named and numbered # ------------------------------------------------------------------------ backslash . (group_call) > (backslashed, 4) { case text = copy(data, ts, te) when /^\\g(.)[^0-9+\-].*['>]$/ emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text) when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text) when /^\\g(.)[+-]0*[1-9]\d*/ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text) else raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID') end }; # Quantifiers # ------------------------------------------------------------------------ zero_or_one { case text = copy(data, ts, te) when '?' ; emit(:quantifier, :zero_or_one, text) when '??'; emit(:quantifier, :zero_or_one_reluctant, text) when '?+'; emit(:quantifier, :zero_or_one_possessive, text) end }; zero_or_more { case text = copy(data, ts, te) when '*' ; emit(:quantifier, :zero_or_more, text) when '*?'; emit(:quantifier, :zero_or_more_reluctant, text) when '*+'; emit(:quantifier, :zero_or_more_possessive, text) end }; one_or_more { case text = copy(data, ts, te) when '+' ; emit(:quantifier, :one_or_more, text) when '+?'; emit(:quantifier, :one_or_more_reluctant, text) when '++'; emit(:quantifier, :one_or_more_possessive, text) end }; quantifier_interval { emit(:quantifier, :interval, copy(data, ts, te)) }; # Catch unmatched curly braces as literals range_open { append_literal(data, ts, te) }; # Escaped sequences # ------------------------------------------------------------------------ backslash > (backslashed, 1) { fcall escape_sequence; }; comment { if free_spacing emit(:free_space, :comment, copy(data, ts, te)) else # consume only the pound sign (#) and backtrack to do regular scanning append_literal(data, ts, ts + 1) fexec ts + 1; end }; space+ { if free_spacing emit(:free_space, :whitespace, copy(data, ts, te)) else append_literal(data, ts, te) end }; # Literal: any run of ASCII (pritable or non-printable), and/or UTF-8, # except meta characters. # ------------------------------------------------------------------------ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ { append_literal(data, ts, te) }; *|; }%% require_relative 'scanner/errors/scanner_error' require_relative 'scanner/errors/premature_end_error' require_relative 'scanner/errors/validation_error' class Regexp::Scanner # Scans the given regular expression text, or Regexp object and collects the # emitted token into an array that gets returned at the end. If a block is # given, it gets called for each emitted token. # # This method may raise errors if a syntax error is encountered. # -------------------------------------------------------------------------- def self.scan(input_object, options: nil, collect_tokens: true, &block) new.scan(input_object, options: options, collect_tokens: collect_tokens, &block) end def scan(input_object, options: nil, collect_tokens: true, &block) self.collect_tokens = collect_tokens self.literal_run = nil stack = [] input = input_object.is_a?(Regexp) ? input_object.source : input_object self.free_spacing = free_spacing?(input_object, options) self.regexp_encoding = extract_encoding(input_object, options) self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}] data = input.unpack("c*") eof = data.length self.tokens = [] self.block = block self.set_depth = 0 self.group_depth = 0 self.capturing_group_count = 0 self.conditional_stack = [] self.char_pos = 0 %% write data; %% write init; %% write exec; # to avoid "warning: assigned but unused variable - testEof" testEof = testEof if cs == re_scanner_error text = copy(data, ts ? ts-1 : 0, -1) raise ScannerError.new("Scan error at '#{text}'") end raise PrematureEndError.new("(missing group closing paranthesis) "+ "[#{group_depth}]") if in_group? raise PrematureEndError.new("(missing set closing bracket) "+ "[#{set_depth}]") if in_set? # when the entire expression is a literal run emit_literal if literal_run tokens end # lazy-load property maps when first needed def self.short_prop_map @short_prop_map ||= parse_prop_map('short') end def self.long_prop_map @long_prop_map ||= parse_prop_map('long') end def self.parse_prop_map(name) File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h end # Use each_with_object for required_ruby_version >= 2.2, or #to_h for >= 2.6 POSIX_CLASSES = %w[alnum alpha ascii blank cntrl digit graph lower print punct space upper word xdigit] .inject({}) { |o, e| o.merge(e => true) }.freeze # Emits an array with the details of the scanned pattern def emit(type, token, text) #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}" emit_literal if literal_run # Ragel runs with byte-based indices (ts, te). These are of little value to # end-users, so we keep track of char-based indices and emit those instead. ts_char_pos = char_pos te_char_pos = char_pos + text.length tok = [type, token, text, ts_char_pos, te_char_pos] self.prev_token = tok self.char_pos = te_char_pos if block block.call type, token, text, ts_char_pos, te_char_pos # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given tokens << tok if collect_tokens elsif collect_tokens tokens << tok end end attr_accessor :capturing_group_count, :literal_run # only public for #||= to work on ruby <= 2.5 private attr_accessor :block, :collect_tokens, :tokens, :prev_token, :free_spacing, :spacing_stack, :regexp_encoding, :group_depth, :set_depth, :conditional_stack, :char_pos def extract_encoding(input_object, options) if input_object.is_a?(::Regexp) input_object.encoding elsif options && (options & Regexp::NOENCODING) Encoding::BINARY end end def free_spacing?(input_object, options) if options && !input_object.is_a?(String) raise ArgumentError, 'options cannot be supplied unless scanning a String' end options = input_object.options if input_object.is_a?(::Regexp) return false unless options options & Regexp::EXTENDED != 0 end def in_group? group_depth > 0 end def in_set? set_depth > 0 end # Copy from ts to te from data as text def copy(data, ts, te) data[ts...te].pack('c*').force_encoding('utf-8') end # Appends one or more characters to the literal buffer, to be emitted later # by a call to emit_literal. def append_literal(data, ts, te) (self.literal_run ||= []) << copy(data, ts, te) end # Emits the literal run collected by calls to the append_literal method. def emit_literal text = literal_run.join self.literal_run = nil emit(:literal, :literal, text) end def emit_options(text) token = nil # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'. text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/ positive, negative, group_local = $1, $2, $3 if positive.include?('x') self.free_spacing = true end # If the x appears in both, treat it like ruby does, the second cancels # the first. if negative && negative.include?('x') self.free_spacing = false end if group_local spacing_stack << {:free_spacing => free_spacing, :depth => group_depth} token = :options else # switch for parent group level spacing_stack.last[:free_spacing] = free_spacing token = :options_switch end emit(:group, token, text) end def emit_meta_control_sequence(data, ts, te, token) if data.last < 0x00 || data.last > 0x7F raise ValidationError.for(:sequence, 'escape', token.to_s) end emit(:escape, token, copy(data, ts-1, te)) end end # module Regexp::Scanner ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax.rb000066400000000000000000000004561506175332700235470ustar00rootroot00000000000000# frozen_string_literal: true require_relative 'error' module Regexp::Syntax class SyntaxError < Regexp::Parser::Error; end end require_relative 'syntax/token' require_relative 'syntax/base' require_relative 'syntax/any' require_relative 'syntax/version_lookup' require_relative 'syntax/versions' ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/000077500000000000000000000000001506175332700232155ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/any.rb000066400000000000000000000005321506175332700243310ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax # A syntax that always returns true, passing all tokens as implemented. This # is useful during development, testing, and should be useful for some types # of transformations as well. class Any < Base implements :*, [:*] def self.implements?(_type, _token) true end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/base.rb000066400000000000000000000064661506175332700244700ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax class NotImplementedError < Regexp::Syntax::SyntaxError def initialize(syntax, type, token) super "#{syntax} does not implement: [#{type}:#{token}]" end end # A lookup map of supported types and tokens in a given syntax class Base include Regexp::Syntax::Token class << self attr_accessor :features # automatically inherit features through the syntax class hierarchy def inherited(subclass) super subclass.features = features.to_h.map { |k, v| [k, v.dup] }.to_h end def implements(type, tokens) (features[type] ||= []).concat(tokens) added_features[type] = tokens end def excludes(type, tokens) tokens.each { |tok| features[type].delete(tok) } removed_features[type] = tokens end def implements?(type, token) implementations(type).include?(token) end alias :check? :implements? def implementations(type) features[type] || [] end def implements!(type, token) raise NotImplementedError.new(self, type, token) unless implements?(type, token) end alias :check! :implements! def added_features @added_features ||= {} end def removed_features @removed_features ||= {} end def normalize(type, token) case type when :group normalize_group(type, token) when :backref normalize_backref(type, token) else [type, token] end end def normalize_group(type, token) case token when :named_ab, :named_sq %i[group named] else [type, token] end end def normalize_backref(type, token) case token when :name_ref_ab, :name_ref_sq %i[backref name_ref] when :name_call_ab, :name_call_sq %i[backref name_call] when :name_recursion_ref_ab, :name_recursion_ref_sq %i[backref name_recursion_ref] when :number_ref_ab, :number_ref_sq %i[backref number_ref] when :number_call_ab, :number_call_sq %i[backref number_call] when :number_rel_ref_ab, :number_rel_ref_sq %i[backref number_rel_ref] when :number_rel_call_ab, :number_rel_call_sq %i[backref number_rel_call] when :number_recursion_ref_ab, :number_recursion_ref_sq %i[backref number_recursion_ref] else [type, token] end end end # TODO: drop this backwards compatibility code in v3.0.0, do `private :new` def initialize warn 'Using instances of Regexp::Parser::Syntax is deprecated ' \ "and will no longer be supported in v3.0.0." end def method_missing(name, *args) if self.class.respond_to?(name) warn 'Using instances of Regexp::Parser::Syntax is deprecated ' \ "and will no longer be supported in v3.0.0. Please call "\ "methods on the class directly, e.g.: #{self.class}.#{name}" self.class.send(name, *args) else super end end def respond_to_missing?(name, include_private = false) self.class.respond_to?(name) || super end # end of backwards compatibility code end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token.rb000066400000000000000000000022311506175332700246600ustar00rootroot00000000000000# frozen_string_literal: true # Define the base module and the simplest of tokens. module Regexp::Syntax module Token Map = Hash.new module Literal All = %i[literal].freeze Type = :literal end module FreeSpace All = %i[comment whitespace].freeze Type = :free_space end Map[FreeSpace::Type] = FreeSpace::All Map[Literal::Type] = Literal::All end end # Load all the token files, they will populate the Map constant. require_relative 'token/anchor' require_relative 'token/assertion' require_relative 'token/backreference' require_relative 'token/posix_class' require_relative 'token/character_set' require_relative 'token/character_type' require_relative 'token/conditional' require_relative 'token/escape' require_relative 'token/group' require_relative 'token/keep' require_relative 'token/meta' require_relative 'token/quantifier' require_relative 'token/unicode_property' # After loading all the tokens the map is full. Extract all tokens and types # into the All and Types constants. module Regexp::Syntax module Token All = Map.values.flatten.uniq.sort.freeze Types = Map.keys.freeze end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/000077500000000000000000000000001506175332700243355ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/anchor.rb000066400000000000000000000006211506175332700261330ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module Anchor Basic = %i[bol eol].freeze Extended = Basic + %i[word_boundary nonword_boundary] String = %i[bos eos eos_ob_eol].freeze MatchStart = %i[match_start].freeze All = Extended + String + MatchStart Type = :anchor end Map[Anchor::Type] = Anchor::All end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/assertion.rb000066400000000000000000000004711506175332700266730ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module Assertion Lookahead = %i[lookahead nlookahead].freeze Lookbehind = %i[lookbehind nlookbehind].freeze All = Lookahead + Lookbehind Type = :assertion end Map[Assertion::Type] = Assertion::All end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/backreference.rb000066400000000000000000000016201506175332700274400ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module Backreference Plain = %i[number].freeze NumberRef = %i[number_ref number_rel_ref].freeze Number = Plain + NumberRef Name = %i[name_ref].freeze RecursionLevel = %i[name_recursion_ref number_recursion_ref].freeze V1_8_6 = Plain V1_9_1 = Name + NumberRef + RecursionLevel All = V1_8_6 + V1_9_1 Type = :backref end # Type is the same as Backreference so keeping it here, for now. module SubexpressionCall Name = %i[name_call].freeze Number = %i[number_call number_rel_call].freeze All = Name + Number end Map[Backreference::Type] = Backreference::All + SubexpressionCall::All # alias for symmetry between token symbol and Expression class name Backref = Backreference end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/character_set.rb000066400000000000000000000006031506175332700274700ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module CharacterSet Basic = %i[open close negate range].freeze Extended = Basic + %i[intersection] All = Extended Type = :set end Map[CharacterSet::Type] = CharacterSet::All # alias for symmetry between token symbol and Token module name Set = CharacterSet end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/character_type.rb000066400000000000000000000006431506175332700276620ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module CharacterType Basic = [].freeze Extended = %i[digit nondigit space nonspace word nonword].freeze Hex = %i[hex nonhex].freeze Clustered = %i[linebreak xgrapheme].freeze All = Basic + Extended + Hex + Clustered Type = :type end Map[CharacterType::Type] = CharacterType::All end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/conditional.rb000066400000000000000000000006471506175332700271740ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module Conditional Delimiters = %i[open close].freeze Condition = %i[condition_open condition condition_close].freeze Separator = %i[separator].freeze All = Conditional::Delimiters + Conditional::Condition + Conditional::Separator Type = :conditional end Map[Conditional::Type] = Conditional::All end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/escape.rb000066400000000000000000000015731506175332700261300ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module Escape Basic = %i[backslash literal].freeze Control = %i[control meta_sequence].freeze ASCII = %i[bell backspace escape form_feed newline carriage tab vertical_tab].freeze Unicode = %i[codepoint codepoint_list].freeze Meta = %i[dot alternation zero_or_one zero_or_more one_or_more bol eol group_open group_close interval_open interval_close set_open set_close].freeze Hex = %i[hex utf8_hex].freeze Octal = %i[octal].freeze All = Basic + Control + ASCII + Unicode + Meta + Hex + Octal Type = :escape end Map[Escape::Type] = Escape::All # alias for symmetry between Token::* and Expression::* EscapeSequence = Escape end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/group.rb000066400000000000000000000011001506175332700260060ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module Group Basic = %i[capture close].freeze Extended = Basic + %i[options options_switch] Named = %i[named].freeze Atomic = %i[atomic].freeze Passive = %i[passive].freeze Comment = %i[comment].freeze V1_8_6 = Group::Extended + Group::Named + Group::Atomic + Group::Passive + Group::Comment V2_4_1 = %i[absence].freeze All = V1_8_6 + V2_4_1 Type = :group end Map[Group::Type] = Group::All end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/keep.rb000066400000000000000000000003121506175332700256020ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module Keep Mark = %i[mark].freeze All = Mark Type = :keep end Map[Keep::Type] = Keep::All end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/meta.rb000066400000000000000000000006711506175332700256140ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module Meta Basic = %i[dot].freeze Alternation = %i[alternation].freeze Extended = Basic + Alternation All = Extended Type = :meta end Map[Meta::Type] = Meta::All # alias for symmetry between Token::* and Expression::* module Alternation All = Meta::Alternation Type = Meta::Type end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/posix_class.rb000066400000000000000000000007111506175332700272100ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module PosixClass Standard = %i[alnum alpha blank cntrl digit graph lower print punct space upper xdigit].freeze Extensions = %i[ascii word].freeze All = Standard + Extensions Type = :posixclass NonType = :nonposixclass end Map[PosixClass::Type] = PosixClass::All Map[PosixClass::NonType] = PosixClass::All end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/quantifier.rb000066400000000000000000000016031506175332700270310ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module Quantifier Greedy = %i[ zero_or_one zero_or_more one_or_more ].freeze Reluctant = %i[ zero_or_one_reluctant zero_or_more_reluctant one_or_more_reluctant ].freeze Possessive = %i[ zero_or_one_possessive zero_or_more_possessive one_or_more_possessive ].freeze Interval = %i[interval].freeze IntervalReluctant = %i[interval_reluctant].freeze IntervalPossessive = %i[interval_possessive].freeze IntervalAll = Interval + IntervalReluctant + IntervalPossessive V1_8_6 = Greedy + Reluctant + Interval + IntervalReluctant All = Greedy + Reluctant + Possessive + IntervalAll Type = :quantifier end Map[Quantifier::Type] = Quantifier::All end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/unicode_property.rb000066400000000000000000000437331506175332700302660ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module UnicodeProperty all = proc { |name| constants.grep(/#{name}/).flat_map(&method(:const_get)) } CharType_V1_9_0 = %i[alnum alpha ascii blank cntrl digit graph lower print punct space upper word xdigit].freeze CharType_V2_5_0 = %i[xposixpunct].freeze POSIX = %i[any assigned newline].freeze module Category Letter = %i[letter uppercase_letter lowercase_letter titlecase_letter modifier_letter other_letter].freeze Mark = %i[mark nonspacing_mark spacing_mark enclosing_mark].freeze Number = %i[number decimal_number letter_number other_number].freeze Punctuation = %i[punctuation connector_punctuation dash_punctuation open_punctuation close_punctuation initial_punctuation final_punctuation other_punctuation].freeze Symbol = %i[symbol math_symbol currency_symbol modifier_symbol other_symbol].freeze Separator = %i[separator space_separator line_separator paragraph_separator].freeze Codepoint = %i[other control format surrogate private_use unassigned].freeze All = Letter + Mark + Number + Punctuation + Symbol + Separator + Codepoint end Age_V1_9_3 = %i[age=1.1 age=2.0 age=2.1 age=3.0 age=3.1 age=3.2 age=4.0 age=4.1 age=5.0 age=5.1 age=5.2 age=6.0].freeze Age_V2_0_0 = %i[age=6.1].freeze Age_V2_2_0 = %i[age=6.2 age=6.3 age=7.0].freeze Age_V2_3_0 = %i[age=8.0].freeze Age_V2_4_0 = %i[age=9.0].freeze Age_V2_5_0 = %i[age=10.0].freeze Age_V2_6_0 = %i[age=11.0].freeze Age_V2_6_2 = %i[age=12.0].freeze Age_V2_6_3 = %i[age=12.1].freeze Age_V3_1_0 = %i[age=13.0].freeze Age_V3_2_0 = %i[age=14.0 age=15.0].freeze Age_V3_5_0 = %i[age=15.1] Age = all[:Age_V] Derived_V1_9_0 = %i[ ascii_hex_digit alphabetic cased changes_when_casefolded changes_when_casemapped changes_when_lowercased changes_when_titlecased changes_when_uppercased case_ignorable bidi_control dash deprecated default_ignorable_code_point diacritic extender grapheme_base grapheme_extend grapheme_link hex_digit hyphen id_continue ideographic id_start ids_binary_operator ids_trinary_operator join_control logical_order_exception lowercase math noncharacter_code_point other_alphabetic other_default_ignorable_code_point other_grapheme_extend other_id_continue other_id_start other_lowercase other_math other_uppercase pattern_syntax pattern_white_space quotation_mark radical sentence_terminal soft_dotted terminal_punctuation unified_ideograph uppercase variation_selector white_space xid_start xid_continue ].freeze Derived_V2_0_0 = %i[ cased_letter combining_mark ].freeze Derived_V2_4_0 = %i[ prepended_concatenation_mark ].freeze Derived_V2_5_0 = %i[ regional_indicator ].freeze Derived_V3_5_0 = %i[ id_compat_math_continue id_compat_math_start ids_unary_operator ] Derived = all[:Derived_V] Script_V1_9_0 = %i[ arabic imperial_aramaic armenian avestan balinese bamum bengali bopomofo braille buginese buhid canadian_aboriginal carian cham cherokee coptic cypriot cyrillic devanagari deseret egyptian_hieroglyphs ethiopic georgian glagolitic gothic greek gujarati gurmukhi hangul han hanunoo hebrew hiragana old_italic javanese kayah_li katakana kharoshthi khmer kannada kaithi tai_tham lao latin lepcha limbu linear_b lisu lycian lydian malayalam mongolian meetei_mayek myanmar nko ogham ol_chiki old_turkic oriya osmanya phags_pa inscriptional_pahlavi phoenician inscriptional_parthian rejang runic samaritan old_south_arabian saurashtra shavian sinhala sundanese syloti_nagri syriac tagbanwa tai_le new_tai_lue tamil tai_viet telugu tifinagh tagalog thaana thai tibetan ugaritic vai old_persian cuneiform yi inherited common unknown ].freeze Script_V1_9_3 = %i[ brahmi batak mandaic ].freeze Script_V2_0_0 = %i[ chakma meroitic_cursive meroitic_hieroglyphs miao sharada sora_sompeng takri ].freeze Script_V2_2_0 = %i[ caucasian_albanian bassa_vah duployan elbasan grantha pahawh_hmong khojki linear_a mahajani manichaean mende_kikakui modi mro old_north_arabian nabataean palmyrene pau_cin_hau old_permic psalter_pahlavi siddham khudawadi tirhuta warang_citi ].freeze Script_V2_3_0 = %i[ ahom anatolian_hieroglyphs hatran multani old_hungarian signwriting ].freeze Script_V2_4_0 = %i[ adlam bhaiksuki marchen newa osage tangut ].freeze Script_V2_5_0 = %i[ masaram_gondi nushu soyombo zanabazar_square ].freeze Script_V2_6_0 = %i[ dogra gunjala_gondi hanifi_rohingya makasar medefaidrin old_sogdian sogdian ].freeze Script_V2_6_2 = %i[ elymaic nandinagari nyiakeng_puachue_hmong wancho ].freeze Script_V3_1_0 = %i[ chorasmian dives_akuru khitan_small_script yezidi ].freeze Script_V3_2_0 = %i[ cypro_minoan kawi nag_mundari old_uyghur tangsa toto vithkuqi ].freeze Script = all[:Script_V] UnicodeBlock_V1_9_0 = %i[ in_alphabetic_presentation_forms in_arabic in_armenian in_arrows in_basic_latin in_bengali in_block_elements in_bopomofo_extended in_bopomofo in_box_drawing in_braille_patterns in_buhid in_cjk_compatibility_forms in_cjk_compatibility_ideographs in_cjk_compatibility in_cjk_radicals_supplement in_cjk_symbols_and_punctuation in_cjk_unified_ideographs_extension_a in_cjk_unified_ideographs in_cherokee in_combining_diacritical_marks_for_symbols in_combining_diacritical_marks in_combining_half_marks in_control_pictures in_currency_symbols in_cyrillic_supplement in_cyrillic in_devanagari in_dingbats in_enclosed_alphanumerics in_enclosed_cjk_letters_and_months in_ethiopic in_general_punctuation in_geometric_shapes in_georgian in_greek_extended in_greek_and_coptic in_gujarati in_gurmukhi in_halfwidth_and_fullwidth_forms in_hangul_compatibility_jamo in_hangul_jamo in_hangul_syllables in_hanunoo in_hebrew in_high_private_use_surrogates in_high_surrogates in_hiragana in_ipa_extensions in_ideographic_description_characters in_kanbun in_kangxi_radicals in_kannada in_katakana_phonetic_extensions in_katakana in_khmer_symbols in_khmer in_lao in_latin_extended_additional in_letterlike_symbols in_limbu in_low_surrogates in_malayalam in_mathematical_operators in_miscellaneous_symbols_and_arrows in_miscellaneous_symbols in_miscellaneous_technical in_mongolian in_myanmar in_number_forms in_ogham in_optical_character_recognition in_oriya in_phonetic_extensions in_private_use_area in_runic in_sinhala in_small_form_variants in_spacing_modifier_letters in_specials in_superscripts_and_subscripts in_supplemental_mathematical_operators in_syriac in_tagalog in_tagbanwa in_tai_le in_tamil in_telugu in_thaana in_thai in_tibetan in_unified_canadian_aboriginal_syllabics in_variation_selectors in_yi_radicals in_yi_syllables in_yijing_hexagram_symbols ].freeze UnicodeBlock_V2_0_0 = %i[ in_aegean_numbers in_alchemical_symbols in_ancient_greek_musical_notation in_ancient_greek_numbers in_ancient_symbols in_arabic_extended_a in_arabic_mathematical_alphabetic_symbols in_arabic_presentation_forms_a in_arabic_presentation_forms_b in_arabic_supplement in_avestan in_balinese in_bamum in_bamum_supplement in_batak in_brahmi in_buginese in_byzantine_musical_symbols in_cjk_compatibility_ideographs_supplement in_cjk_strokes in_cjk_unified_ideographs_extension_b in_cjk_unified_ideographs_extension_c in_cjk_unified_ideographs_extension_d in_carian in_chakma in_cham in_combining_diacritical_marks_supplement in_common_indic_number_forms in_coptic in_counting_rod_numerals in_cuneiform in_cuneiform_numbers_and_punctuation in_cypriot_syllabary in_cyrillic_extended_a in_cyrillic_extended_b in_deseret in_devanagari_extended in_domino_tiles in_egyptian_hieroglyphs in_emoticons in_enclosed_alphanumeric_supplement in_enclosed_ideographic_supplement in_ethiopic_extended in_ethiopic_extended_a in_ethiopic_supplement in_georgian_supplement in_glagolitic in_gothic in_hangul_jamo_extended_a in_hangul_jamo_extended_b in_imperial_aramaic in_inscriptional_pahlavi in_inscriptional_parthian in_javanese in_kaithi in_kana_supplement in_kayah_li in_kharoshthi in_latin_1_supplement in_latin_extended_a in_latin_extended_b in_latin_extended_c in_latin_extended_d in_lepcha in_linear_b_ideograms in_linear_b_syllabary in_lisu in_lycian in_lydian in_mahjong_tiles in_mandaic in_mathematical_alphanumeric_symbols in_meetei_mayek in_meetei_mayek_extensions in_meroitic_cursive in_meroitic_hieroglyphs in_miao in_miscellaneous_mathematical_symbols_a in_miscellaneous_mathematical_symbols_b in_miscellaneous_symbols_and_pictographs in_modifier_tone_letters in_musical_symbols in_myanmar_extended_a in_nko in_new_tai_lue in_no_block in_ol_chiki in_old_italic in_old_persian in_old_south_arabian in_old_turkic in_osmanya in_phags_pa in_phaistos_disc in_phoenician in_phonetic_extensions_supplement in_playing_cards in_rejang in_rumi_numeral_symbols in_samaritan in_saurashtra in_sharada in_shavian in_sora_sompeng in_sundanese in_sundanese_supplement in_supplemental_arrows_a in_supplemental_arrows_b in_supplemental_punctuation in_supplementary_private_use_area_a in_supplementary_private_use_area_b in_syloti_nagri in_tags in_tai_tham in_tai_viet in_tai_xuan_jing_symbols in_takri in_tifinagh in_transport_and_map_symbols in_ugaritic in_unified_canadian_aboriginal_syllabics_extended in_vai in_variation_selectors_supplement in_vedic_extensions in_vertical_forms ].freeze UnicodeBlock_V2_2_0 = %i[ in_bassa_vah in_caucasian_albanian in_combining_diacritical_marks_extended in_coptic_epact_numbers in_duployan in_elbasan in_geometric_shapes_extended in_grantha in_khojki in_khudawadi in_latin_extended_e in_linear_a in_mahajani in_manichaean in_mende_kikakui in_modi in_mro in_myanmar_extended_b in_nabataean in_old_north_arabian in_old_permic in_ornamental_dingbats in_pahawh_hmong in_palmyrene in_pau_cin_hau in_psalter_pahlavi in_shorthand_format_controls in_siddham in_sinhala_archaic_numbers in_supplemental_arrows_c in_tirhuta in_warang_citi ].freeze UnicodeBlock_V2_3_0 = %i[ in_ahom in_anatolian_hieroglyphs in_cjk_unified_ideographs_extension_e in_cherokee_supplement in_early_dynastic_cuneiform in_hatran in_multani in_old_hungarian in_supplemental_symbols_and_pictographs in_sutton_signwriting ].freeze UnicodeBlock_V2_4_0 = %i[ in_adlam in_bhaiksuki in_cyrillic_extended_c in_glagolitic_supplement in_ideographic_symbols_and_punctuation in_marchen in_mongolian_supplement in_newa in_osage in_tangut in_tangut_components ].freeze UnicodeBlock_V2_5_0 = %i[ in_cjk_unified_ideographs_extension_f in_kana_extended_a in_masaram_gondi in_nushu in_soyombo in_syriac_supplement in_zanabazar_square ].freeze UnicodeBlock_V2_6_0 = %i[ in_chess_symbols in_dogra in_georgian_extended in_gunjala_gondi in_hanifi_rohingya in_indic_siyaq_numbers in_makasar in_mayan_numerals in_medefaidrin in_old_sogdian in_sogdian ].freeze UnicodeBlock_V2_6_2 = %i[ in_egyptian_hieroglyph_format_controls in_elymaic in_nandinagari in_nyiakeng_puachue_hmong in_ottoman_siyaq_numbers in_small_kana_extension in_symbols_and_pictographs_extended_a in_tamil_supplement in_wancho ].freeze UnicodeBlock_V3_1_0 = %i[ in_chorasmian in_cjk_unified_ideographs_extension_g in_dives_akuru in_khitan_small_script in_lisu_supplement in_symbols_for_legacy_computing in_tangut_supplement in_yezidi ].freeze UnicodeBlock_V3_2_0 = %i[ in_arabic_extended_b in_arabic_extended_c in_cjk_unified_ideographs_extension_h in_cypro_minoan in_cyrillic_extended_d in_devanagari_extended_a in_ethiopic_extended_b in_kaktovik_numerals in_kana_extended_b in_kawi in_latin_extended_f in_latin_extended_g in_nag_mundari in_old_uyghur in_tangsa in_toto in_unified_canadian_aboriginal_syllabics_extended_a in_vithkuqi in_znamenny_musical_notation ].freeze UnicodeBlock_V3_5_0 = %i[ in_cjk_unified_ideographs_extension_i ] UnicodeBlock = all[:UnicodeBlock_V] Emoji_V2_5_0 = %i[ emoji emoji_component emoji_modifier emoji_modifier_base emoji_presentation ].freeze Emoji_V2_6_0 = %i[ extended_pictographic ].freeze Enumerated_V2_4_0 = %i[ grapheme_cluster_break=control grapheme_cluster_break=cr grapheme_cluster_break=extend grapheme_cluster_break=l grapheme_cluster_break=lf grapheme_cluster_break=lv grapheme_cluster_break=lvt grapheme_cluster_break=prepend grapheme_cluster_break=regional_indicator grapheme_cluster_break=spacingmark grapheme_cluster_break=t grapheme_cluster_break=v grapheme_cluster_break=zwj ].freeze Enumerated = all[:Enumerated_V] Emoji = all[:Emoji_V] V1_9_0 = Category::All + POSIX + all[:V1_9_0] V1_9_3 = all[:V1_9_3] V2_0_0 = all[:V2_0_0] V2_2_0 = all[:V2_2_0] V2_3_0 = all[:V2_3_0] V2_4_0 = all[:V2_4_0] V2_5_0 = all[:V2_5_0] V2_6_0 = all[:V2_6_0] V2_6_2 = all[:V2_6_2] V2_6_3 = all[:V2_6_3] V3_1_0 = all[:V3_1_0] V3_2_0 = all[:V3_2_0] V3_5_0 = all[:V3_5_0] All = all[/^V\d+_\d+_\d+$/] Type = :property NonType = :nonproperty end Map[UnicodeProperty::Type] = UnicodeProperty::All Map[UnicodeProperty::NonType] = UnicodeProperty::All # alias for symmetry between token symbol and Token module name Property = UnicodeProperty end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/token/virtual.rb000066400000000000000000000003571506175332700263550ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax module Token module Virtual Root = %i[root].freeze Sequence = %i[sequence].freeze All = %i[root sequence].freeze Type = :expression end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/version_lookup.rb000066400000000000000000000037421506175332700266260ustar00rootroot00000000000000# frozen_string_literal: true module Regexp::Syntax VERSION_FORMAT = '\Aruby/\d+\.\d+(\.\d+)?\z' VERSION_REGEXP = /#{VERSION_FORMAT}/.freeze VERSION_CONST_REGEXP = /\AV\d+_\d+(?:_\d+)?\z/.freeze class InvalidVersionNameError < Regexp::Syntax::SyntaxError def initialize(name) super "Invalid version name '#{name}'. Expected format is '#{VERSION_FORMAT}'" end end class UnknownSyntaxNameError < Regexp::Syntax::SyntaxError def initialize(name) super "Unknown syntax name '#{name}'." end end module_function # Returns the syntax specification class for the given syntax # version name. The special names 'any' and '*' return Syntax::Any. def for(name) (@alias_map ||= {})[name] ||= version_class(name) end def new(name) warn 'Regexp::Syntax.new is deprecated in favor of Regexp::Syntax.for. '\ 'It does not return distinct instances and will be removed in v3.0.0.' self.for(name) end def supported?(name) name =~ VERSION_REGEXP && comparable(name) >= comparable('1.8.6') end def version_class(version) return Regexp::Syntax::Any if ['*', 'any'].include?(version.to_s) version =~ VERSION_REGEXP || raise(InvalidVersionNameError, version) version_const_name = "V#{version.to_s.scan(/\d+/).join('_')}" const_get(version_const_name) || raise(UnknownSyntaxNameError, version) end def const_missing(const_name) if const_name =~ VERSION_CONST_REGEXP return fallback_version_class(const_name) end super end def fallback_version_class(version) sorted = (specified_versions + [version]).sort_by { |ver| comparable(ver) } index = sorted.index(version) index > 0 && const_get(sorted[index - 1]) end def specified_versions constants.select { |const_name| const_name =~ VERSION_CONST_REGEXP } end def comparable(name) # add .99 to treat versions without a patch value as latest patch version Gem::Version.new((name.to_s.scan(/\d+/) << 99).join('.')) end end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions.rb000066400000000000000000000006101506175332700254070ustar00rootroot00000000000000# frozen_string_literal: true # Ruby 1.x is no longer a supported runtime, # but its regex features are still recognized. # # Aliases for the latest patch version are provided as 'ruby/n.n', # e.g. 'ruby/1.9' refers to Ruby v1.9.3. Dir[File.expand_path('../versions/*.rb', __FILE__)].sort.each { |f| require_relative f } Regexp::Syntax::CURRENT = Regexp::Syntax.for("ruby/#{RUBY_VERSION}") ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/000077500000000000000000000000001506175332700250655ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/1.8.6.rb000066400000000000000000000012201506175332700260570ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V1_8_6 < Regexp::Syntax::Base implements :anchor, Anchor::All implements :assertion, Assertion::Lookahead implements :backref, Backreference::V1_8_6 implements :escape, Escape::Basic + Escape::ASCII + Escape::Meta + Escape::Control implements :free_space, FreeSpace::All implements :group, Group::V1_8_6 implements :literal, Literal::All implements :meta, Meta::Extended implements :posixclass, PosixClass::Standard implements :quantifier, Quantifier::V1_8_6 implements :set, CharacterSet::All implements :type, CharacterType::Extended end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/1.9.1.rb000066400000000000000000000011631506175332700260610ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V1_9_1 < Regexp::Syntax::V1_8_6 implements :assertion, Assertion::Lookbehind implements :backref, Backreference::V1_9_1 + SubexpressionCall::All implements :escape, Escape::Unicode + Escape::Hex + Escape::Octal implements :posixclass, PosixClass::Extensions implements :nonposixclass, PosixClass::All implements :property, UnicodeProperty::V1_9_0 implements :nonproperty, UnicodeProperty::V1_9_0 implements :quantifier, Quantifier::Possessive + Quantifier::IntervalPossessive implements :type, CharacterType::Hex end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/1.9.3.rb000066400000000000000000000002771506175332700260700ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V1_9_3 < Regexp::Syntax::V1_9_1 implements :property, UnicodeProperty::V1_9_3 implements :nonproperty, UnicodeProperty::V1_9_3 end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/2.0.0.rb000066400000000000000000000006231506175332700260500ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V2_0_0 < Regexp::Syntax::V1_9_3 implements :keep, Keep::All implements :conditional, Conditional::All implements :property, UnicodeProperty::V2_0_0 implements :nonproperty, UnicodeProperty::V2_0_0 implements :type, CharacterType::Clustered excludes :property, %i[newline] excludes :nonproperty, %i[newline] end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/2.2.0.rb000066400000000000000000000002771506175332700260570ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V2_2_0 < Regexp::Syntax::V2_0_0 implements :property, UnicodeProperty::V2_2_0 implements :nonproperty, UnicodeProperty::V2_2_0 end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/2.3.0.rb000066400000000000000000000002771506175332700260600ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V2_3_0 < Regexp::Syntax::V2_2_0 implements :property, UnicodeProperty::V2_3_0 implements :nonproperty, UnicodeProperty::V2_3_0 end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/2.4.0.rb000066400000000000000000000002771506175332700260610ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V2_4_0 < Regexp::Syntax::V2_3_0 implements :property, UnicodeProperty::V2_4_0 implements :nonproperty, UnicodeProperty::V2_4_0 end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/2.4.1.rb000066400000000000000000000001741506175332700260560ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V2_4_1 < Regexp::Syntax::V2_4_0 implements :group, Group::V2_4_1 end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/2.5.0.rb000066400000000000000000000002771506175332700260620ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V2_5_0 < Regexp::Syntax::V2_4_1 implements :property, UnicodeProperty::V2_5_0 implements :nonproperty, UnicodeProperty::V2_5_0 end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/2.6.0.rb000066400000000000000000000002771506175332700260630ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V2_6_0 < Regexp::Syntax::V2_5_0 implements :property, UnicodeProperty::V2_6_0 implements :nonproperty, UnicodeProperty::V2_6_0 end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/2.6.2.rb000066400000000000000000000002771506175332700260650ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V2_6_2 < Regexp::Syntax::V2_6_0 implements :property, UnicodeProperty::V2_6_2 implements :nonproperty, UnicodeProperty::V2_6_2 end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/2.6.3.rb000066400000000000000000000002771506175332700260660ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V2_6_3 < Regexp::Syntax::V2_6_2 implements :property, UnicodeProperty::V2_6_3 implements :nonproperty, UnicodeProperty::V2_6_3 end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/3.1.0.rb000066400000000000000000000002771506175332700260570ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V3_1_0 < Regexp::Syntax::V2_6_3 implements :property, UnicodeProperty::V3_1_0 implements :nonproperty, UnicodeProperty::V3_1_0 end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/3.2.0.rb000066400000000000000000000002771506175332700260600ustar00rootroot00000000000000# frozen_string_literal: true class Regexp::Syntax::V3_2_0 < Regexp::Syntax::V3_1_0 implements :property, UnicodeProperty::V3_2_0 implements :nonproperty, UnicodeProperty::V3_2_0 end ammar-regexp_parser-68cdeff/lib/regexp_parser/syntax/versions/3.5.0.rb000066400000000000000000000002401506175332700260510ustar00rootroot00000000000000class Regexp::Syntax::V3_5_0 < Regexp::Syntax::V3_2_0 implements :property, UnicodeProperty::V3_5_0 implements :nonproperty, UnicodeProperty::V3_5_0 end ammar-regexp_parser-68cdeff/lib/regexp_parser/token.rb000066400000000000000000000005021506175332700233310ustar00rootroot00000000000000# frozen_string_literal: true class Regexp TOKEN_KEYS = %i[ type token text ts te level set_level conditional_level ].freeze Token = Struct.new(*TOKEN_KEYS) do attr_accessor :previous, :next def offset [ts, te] end def length te - ts end end end ammar-regexp_parser-68cdeff/lib/regexp_parser/version.rb000066400000000000000000000001341506175332700236770ustar00rootroot00000000000000# frozen_string_literal: true class Regexp class Parser VERSION = '2.11.3' end end ammar-regexp_parser-68cdeff/regexp_parser.gemspec000066400000000000000000000023451506175332700224720ustar00rootroot00000000000000# frozen_string_literal: true $:.unshift File.join(File.dirname(__FILE__), 'lib') require 'regexp_parser/version' Gem::Specification.new do |spec| spec.name = 'regexp_parser' spec.version = ::Regexp::Parser::VERSION spec.summary = "Scanner, lexer, parser for ruby's regular expressions" spec.description = 'A library for tokenizing, lexing, and parsing Ruby regular expressions.' spec.homepage = 'https://github.com/ammar/regexp_parser' spec.metadata['bug_tracker_uri'] = "#{spec.homepage}/issues" spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/master/CHANGELOG.md" spec.metadata['homepage_uri'] = spec.homepage spec.metadata['source_code_uri'] = spec.homepage spec.metadata['wiki_uri'] = "#{spec.homepage}/wiki" spec.metadata['rubygems_mfa_required'] = 'true' spec.authors = ['Ammar Ali', 'Janosch Müller'] spec.email = ['ammarabuali@gmail.com', 'janosch84@gmail.com'] spec.license = 'MIT' spec.require_paths = ['lib'] spec.files = Dir.glob('lib/**/*.{csv,rb,rl}') + %w[Gemfile Rakefile LICENSE regexp_parser.gemspec] spec.platform = Gem::Platform::RUBY spec.required_ruby_version = '>= 2.0.0' end ammar-regexp_parser-68cdeff/spec/000077500000000000000000000000001506175332700172055ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/spec/expression/000077500000000000000000000000001506175332700214045ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/spec/expression/base_spec.rb000066400000000000000000000071171506175332700236630ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Expression::Base) do # test #level include_examples 'parse', /^a(b(c(d)))e$/, [0] => [to_s: '^', level: 0], [1] => [to_s: 'a', level: 0], [2] => [to_s: '(b(c(d)))', level: 0], [2, 0] => [to_s: 'b', level: 1], [2, 1] => [to_s: '(c(d))', level: 1], [2, 1, 0] => [to_s: 'c', level: 2], [2, 1, 1] => [to_s: '(d)', level: 2], [2, 1, 1, 0] => [to_s: 'd', level: 3], [3] => [to_s: 'e', level: 0], [4] => [to_s: '$', level: 0] # test #coded_offset include_examples 'parse', /^a*(b+(c?))$/, [] => [Root, coded_offset: '@0+12'], [0] => [to_s: '^', coded_offset: '@0+1'], [1] => [to_s: 'a*', coded_offset: '@1+2'], [2] => [to_s: '(b+(c?))', coded_offset: '@3+8'], [2, 0] => [to_s: 'b+', coded_offset: '@4+2'], [2, 1] => [to_s: '(c?)', coded_offset: '@6+4'], [2, 1, 0] => [to_s: 'c?', coded_offset: '@7+2'], [3] => [to_s: '$', coded_offset: '@11+1'] # test #quantity include_examples 'parse', /aa/, [0] => [quantity: [nil, nil]] include_examples 'parse', /a?/, [0] => [quantity: [0, 1]] include_examples 'parse', /a*/, [0] => [quantity: [0, -1]] include_examples 'parse', /a+/, [0] => [quantity: [1, -1]] # test #repetitions include_examples 'parse', /aa/, [0] => [repetitions: 1..1] include_examples 'parse', /a?/, [0] => [repetitions: 0..1] include_examples 'parse', /a*/, [0] => [repetitions: 0..(Float::INFINITY)] include_examples 'parse', /a+/, [0] => [repetitions: 1..(Float::INFINITY)] # test #base_length, #full_length, #starts_at, #ends_at include_examples 'parse', /(aa)/, [] => [Root, base_length: 4, full_length: 4, starts_at: 0, ends_at: 4], [0] => [Group::Capture, base_length: 4, full_length: 4, starts_at: 0, ends_at: 4], [0, 0] => [Literal, base_length: 2, full_length: 2, starts_at: 1, ends_at: 3] include_examples 'parse', /(aa){42}/, [] => [Root, base_length: 8, full_length: 8, starts_at: 0, ends_at: 8], [0] => [Group::Capture, base_length: 4, full_length: 8, starts_at: 0, ends_at: 8], [0, 0] => [Literal, base_length: 2, full_length: 2, starts_at: 1, ends_at: 3] include_examples 'parse', /(aa) {42}/x, [] => [Root, base_length: 9, full_length: 9, starts_at: 0, ends_at: 9], [0] => [Group::Capture, base_length: 4, full_length: 9, starts_at: 0, ends_at: 9], [0, 0] => [Literal, base_length: 2, full_length: 2, starts_at: 1, ends_at: 3] # test #to_re include_examples 'parse', '^a*(b([cde]+))+f?$', [] => [Root, to_re: /^a*(b([cde]+))+f?$/] specify '#parent' do root = Regexp::Parser.parse(/(a(b)){42}/) expect(root.parent).to be_nil expect(root[0].parent).to eq root expect(root[0].quantifier.parent).to be_nil expect(root[0][0].parent).to eq root[0] expect(root[0][1].parent).to eq root[0] expect(root[0][1][0].parent).to eq root[0][1] end specify '#to_re warns when used on set members' do expect do result = Regexp::Parser.parse(/[\b]/)[0][0].to_re expect(result).to eq(/\b/) end.to output(/set member/).to_stderr end specify 'updating #quantifier updates #repetitions' do exp = Regexp::Parser.parse(/a{3}/)[0] expect(exp.repetitions).to eq 3..3 exp.quantifier = Regexp::Parser.parse(/b{5}/)[0].quantifier expect(exp.repetitions).to eq 5..5 end end ammar-regexp_parser-68cdeff/spec/expression/clone_spec.rb000066400000000000000000000131251506175332700240450ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Expression::Base#clone') do specify('Base#clone') do root = RP.parse(/^(?i:a)b+$/i) copy = root.clone expect(copy.to_s).to eq root.to_s expect(root).not_to equal copy expect(root.text).to eq copy.text expect(root.text).not_to equal copy.text root_1 = root[1] copy_1 = copy[1] expect(root_1.options).to eq copy_1.options expect(root_1.options).not_to equal copy_1.options expect(root_1.parent).to eq root expect(root_1.parent).not_to equal copy expect(copy_1.parent).to eq copy expect(copy_1.parent).not_to equal root root_2 = root[2] copy_2 = copy[2] expect(root_2).to be_quantified expect(copy_2).to be_quantified expect(root_2.quantifier.text).to eq copy_2.quantifier.text expect(root_2.quantifier.text).not_to equal copy_2.quantifier.text expect(root_2.quantifier).not_to equal copy_2.quantifier # regression test expect { root_2.clone }.not_to(change { root_2.quantifier.object_id }) expect { root_2.clone }.not_to(change { root_2.quantifier.text.object_id }) end specify('Base#clone causes no shared state') do root = RP.parse(regexp_with_all_features) copy = root.clone shared = Leto.shared_mutables(root, copy) expect(shared).to be_empty, "found shared mutables:\n#{shared.join("\n")}" end specify('Subexpression#clone') do root = RP.parse(/^a(b([cde])f)g$/) copy = root.clone expect(copy.to_s).to eq root.to_s expect(root).to respond_to(:expressions) expect(copy).to respond_to(:expressions) expect(root.expressions).not_to equal copy.expressions copy.expressions.each_with_index do |exp, index| expect(root[index]).not_to equal exp end copy[2].each_with_index do |exp, index| expect(root[2][index]).not_to equal exp end # regression test expect { root.clone }.not_to(change { root.expressions.object_id }) end specify('Group::Named#clone') do root = RP.parse('^(?a)+bc$') copy = root.clone expect(copy.to_s).to eq root.to_s root_1 = root[1] copy_1 = copy[1] expect(root_1.name).to eq copy_1.name expect(root_1.name).not_to equal copy_1.name expect(root_1.text).to eq copy_1.text expect(root_1.expressions).not_to equal copy_1.expressions copy_1.expressions.each_with_index do |exp, index| expect(root_1[index]).not_to equal exp end # regression test expect { root_1.clone }.not_to(change { root_1.name.object_id }) end specify('Group::Options#clone') do root = RP.parse('foo(?i)bar') copy = root.clone expect(copy.to_s).to eq root.to_s root_1 = root[1] copy_1 = copy[1] expect(root_1.option_changes).to eq copy_1.option_changes expect(root_1.option_changes).not_to equal copy_1.option_changes # regression test expect { root_1.clone }.not_to(change { root_1.option_changes.object_id }) end specify('Backreference::Base#clone') do root = RP.parse('(foo)\1') copy = root.clone expect(copy.to_s).to eq root.to_s root_1 = root[1] copy_1 = copy[1] expect(root_1.referenced_expression).to eq copy_1.referenced_expression expect(root_1.referenced_expression.to_s).to eq copy_1.referenced_expression.to_s expect(root_1.referenced_expression).not_to equal copy_1.referenced_expression # regression test expect { root_1.clone }.not_to(change { root_1.referenced_expression.object_id }) end specify('Backreference::Base#clone works for recursive subexp calls') do root = RP.parse('a|b\g<0>') copy = root.clone expect(copy.to_s).to eq root.to_s root_call = root.dig(0, 1, 1) copy_call = copy.dig(0, 1, 1) expect(root).to eq copy expect(root).not_to equal copy expect(root_call).to eq copy_call expect(root_call).not_to equal copy_call expect(root_call.referenced_expression).not_to be_nil expect(root_call.referenced_expression.object_id).to eq root.object_id expect(copy_call.referenced_expression).not_to be_nil # Mapping the reference to the cloned referenced_expression would # probably require a context or 2-way bindings in the tree. Maybe later ... # expect(copy_call.referenced_expression.object_id).to eq copy.object_id end specify('Sequence#clone') do root = RP.parse(/(a|b)/) copy = root.clone # regression test expect(copy.to_s).to eq root.to_s root_seq_op = root[0][0] copy_seq_op = copy[0][0] root_seq_1 = root[0][0][0] copy_seq_1 = copy[0][0][0] expect(root_seq_op).not_to equal copy_seq_op expect(root_seq_1).not_to equal copy_seq_1 copy_seq_1.expressions.each_with_index do |exp, index| expect(root_seq_1[index]).not_to equal exp end end describe('Base#unquantified_clone') do it 'produces a clone' do root = RP.parse(/^a(b([cde])f)g$/) copy = root.unquantified_clone expect(copy.to_s).to eq root.to_s expect(copy).not_to equal root end it 'does not carry over the callee quantifier' do expect(RP.parse(/a{3}/)[0]).to be_quantified expect(RP.parse(/a{3}/)[0].unquantified_clone).not_to be_quantified expect(RP.parse(/[a]{3}/)[0]).to be_quantified expect(RP.parse(/[a]{3}/)[0].unquantified_clone).not_to be_quantified expect(RP.parse(/(a|b){3}/)[0]).to be_quantified expect(RP.parse(/(a|b){3}/)[0].unquantified_clone).not_to be_quantified end it 'keeps quantifiers of callee children' do expect(RP.parse(/(a{3}){3}/)[0][0]).to be_quantified expect(RP.parse(/(a{3}){3}/)[0].unquantified_clone[0]).to be_quantified end end end ammar-regexp_parser-68cdeff/spec/expression/conditional_spec.rb000066400000000000000000000016621506175332700252530ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Expression::Conditional) do specify('Conditional#condition, #branches') do conditional = RP.parse(/(?a)(?()T|F)/)[1] expect(conditional.condition).to eq conditional[0] expect(conditional.branches).to eq conditional[1..2] end specify('Condition#referenced_expression') do root = RP.parse(/(?a)(?()T|F)/) condition = root[1].condition expect(condition.referenced_expression).to eq root[0] expect(condition.referenced_expression.to_s).to eq '(?a)' root = RP.parse(/(a)(?(1)T|F)/) condition = root[1].condition expect(condition.referenced_expression).to eq root[0] expect(condition.referenced_expression.to_s).to eq '(a)' end specify('parse conditional excessive branches') do regexp = '(?a)(?()T|F|X)' expect { RP.parse(regexp) }.to raise_error(Conditional::TooManyBranches) end end ammar-regexp_parser-68cdeff/spec/expression/free_space_spec.rb000066400000000000000000000012471506175332700250430ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Expression::FreeSpace) do specify('white space quantify raises error') do regexp = / a # Comment /x root = RP.parse(regexp) space = root[0] expect(space).to be_instance_of(FreeSpace::WhiteSpace) expect { space.quantify(:dummy, '#') }.to raise_error(Regexp::Parser::Error) end specify('comment quantify raises error') do regexp = / a # Comment /x root = RP.parse(regexp) comment = root[3] expect(comment).to be_instance_of(FreeSpace::Comment) expect { comment.quantify(:dummy, '#') }.to raise_error(Regexp::Parser::Error) end end ammar-regexp_parser-68cdeff/spec/expression/methods/000077500000000000000000000000001506175332700230475ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/spec/expression/methods/construct_spec.rb000066400000000000000000000044061506175332700264360ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Expression::Shared) do describe '::construct' do { Alternation => :meta, Alternative => :expression, Anchor::Base => :anchor, Anchor::EndOfLine => :anchor, Assertion::Base => :assertion, Assertion::Lookahead => :assertion, Backreference::Base => :backref, Backreference::Number => :backref, CharacterSet => :set, CharacterSet::IntersectedSequence => :expression, CharacterSet::Intersection => :set, CharacterSet::Range => :set, CharacterType::Any => :meta, CharacterType::Base => :type, CharacterType::Digit => :type, Conditional::Branch => :expression, Conditional::Condition => :conditional, Conditional::Expression => :conditional, EscapeSequence::Base => :escape, EscapeSequence::Literal => :escape, FreeSpace => :free_space, Group::Base => :group, Group::Capture => :group, Keep::Mark => :keep, Literal => :literal, PosixClass => :posixclass, Quantifier => :quantifier, Root => :expression, UnicodeProperty::Base => :property, UnicodeProperty::Number::Decimal => :property, }.each do |klass, expected_type| it "works for #{klass}" do result = klass.construct expect(result).to be_a klass expect(result.type).to eq expected_type end end it 'allows overriding defaults' do expect(Literal.construct(type: :foo).type).to eq :foo end it 'allows passing options' do expect(Literal.construct(options: { i: true }).options[:i]).to eq true end it 'raises ArgumentError for unknown parameters' do expect { Literal.construct(foo: :foo) }.to raise_error(ArgumentError) end end end ammar-regexp_parser-68cdeff/spec/expression/methods/human_name_spec.rb000066400000000000000000000042751506175332700265260ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Regexp::Expression::Shared#human_name') do include_examples 'parse', //, [] => [human_name: 'root'] include_examples 'parse', /a/, [0] => [human_name: 'literal'] include_examples 'parse', /./, [0] => [human_name: 'match-all'] include_examples 'parse', /[abc]/, [0] => [human_name: 'character set'] include_examples 'parse', /[a-c]/, [0, 0] => [human_name: 'character range'] include_examples 'parse', /\d/, [0] => [human_name: 'digit type'] include_examples 'parse', /\n/, [0] => [human_name: 'newline escape'] include_examples 'parse', /\u{61 62 63}/, [0] => [human_name: 'codepoint list escape'] include_examples 'parse', /\p{ascii}/, [0] => [human_name: 'ascii property'] include_examples 'parse', /[[:ascii:]]/, [0, 0] => [human_name: 'ascii posixclass'] include_examples 'parse', /a{5}/, [0, :q] => [human_name: 'interval quantifier'] include_examples 'parse', /^/, [0] => [human_name: 'beginning of line'] include_examples 'parse', /(?=abc)/, [0] => [human_name: 'lookahead'] include_examples 'parse', /(a)(b)/, [0] => [human_name: 'capture group 1'] include_examples 'parse', /(a)(b)/, [1] => [human_name: 'capture group 2'] include_examples 'parse', /(?abc)/, [0] => [human_name: 'named capture group'] include_examples 'parse', / /x, [0] => [human_name: 'free space'] include_examples 'parse', /#comment /x, [0] => [human_name: 'comment'] include_examples 'parse', /(?#comment)/x, [0] => [human_name: 'comment group'] include_examples 'parse', /(abc)\1/, [1] => [human_name: 'backreference'] include_examples 'parse', /(?)\k/, [1] => [human_name: 'backreference by name'] include_examples 'parse', /(abc)\g<-1>/, [1] => [human_name: 'relative subexpression call'] include_examples 'parse', /a|bc/, [0] => [human_name: 'alternation'] include_examples 'parse', /a|bc/, [0, 0] => [human_name: 'alternative'] end ammar-regexp_parser-68cdeff/spec/expression/methods/match_length_spec.rb000066400000000000000000000147061506175332700270530ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' ML = Regexp::MatchLength RSpec.describe(Regexp::MatchLength) do specify('literal') { expect(ML.of(/a/).minmax).to eq [1, 1] } specify('literal sequence') { expect(ML.of(/abc/).minmax).to eq [3, 3] } specify('dot') { expect(ML.of(/./).minmax).to eq [1, 1] } specify('set') { expect(ML.of(/[abc]/).minmax).to eq [1, 1] } specify('type') { expect(ML.of(/\d/).minmax).to eq [1, 1] } specify('escape') { expect(ML.of(/\n/).minmax).to eq [1, 1] } specify('property') { expect(ML.of(/\p{ascii}/).minmax).to eq [1, 1] } specify('codepoint list') { expect(ML.of(/\u{61 62 63}/).minmax).to eq [3, 3] } specify('multi-char literal') { expect(ML.of(/abc/).minmax).to eq [3, 3] } specify('fixed quantified') { expect(ML.of(/a{5}/).minmax).to eq [5, 5] } specify('range quantified') { expect(ML.of(/a{5,9}/).minmax).to eq [5, 9] } specify('nested quantified') { expect(ML.of(/(a{2}){3,4}/).minmax).to eq [6, 8] } specify('open-end quantified') { expect(ML.of(/a*/).minmax).to eq [0, Float::INFINITY] } specify('empty subexpression') { expect(ML.of(//).minmax).to eq [0, 0] } specify('anchor') { expect(ML.of(/^$/).minmax).to eq [0, 0] } specify('lookaround') { expect(ML.of(/(?=abc)/).minmax).to eq [0, 0] } specify('free space') { expect(ML.of(/ /x).minmax).to eq [0, 0] } specify('comment') { expect(ML.of(/(?#comment)/x).minmax).to eq [0, 0] } specify('backreference') { expect(ML.of(/(abc){2}\1/).minmax).to eq [9, 9] } specify('subexp call') { expect(ML.of(/(abc){2}\g<-1>/).minmax).to eq [9, 9] } specify('alternation') { expect(ML.of(/a|bcde/).minmax).to eq [1, 4] } specify('nested alternation') { expect(ML.of(/a|bc(d|efg)/).minmax).to eq [1, 5] } specify('quantified alternation') { expect(ML.of(/a|bcde?/).minmax).to eq [1, 4] } if ruby_version_at_least('2.4.1') specify('absence group') { expect(ML.of('(?~abc)').minmax).to eq [0, Float::INFINITY] } end specify('raises for missing references') do exp = RP.parse(/(a)\1/).last exp.referenced_expressions = nil expect { exp.match_length }.to raise_error(ArgumentError) end describe('::of') do it('works with Regexps') { expect(ML.of(/foo/).minmax).to eq [3, 3] } it('works with Strings') { expect(ML.of('foo').minmax).to eq [3, 3] } it('works with Expressions') { expect(ML.of(RP.parse(/foo/)).minmax).to eq [3, 3] } end describe('Expression::Base#match_length') do it('returns the MatchLength') { expect(RP.parse(/abc/).match_length.minmax).to eq [3, 3] } end describe('Expression::Base#inner_match_length') do it 'returns the MatchLength of an expression that does not count towards parent match_length' do exp = RP.parse(/(?=ab|cdef)/)[0] expect(exp).to be_a Regexp::Expression::Assertion::Base expect(exp.match_length.minmax).to eq [0, 0] expect(exp.inner_match_length.minmax).to eq [2, 4] end end describe('#include?') do specify('unquantified') do expect(ML.of(/a/)).to include 1 expect(ML.of(/a/)).not_to include 0 expect(ML.of(/a/)).not_to include 2 end specify('fixed quantified') do expect(ML.of(/a{5}/)).to include 5 expect(ML.of(/a{5}/)).not_to include 0 expect(ML.of(/a{5}/)).not_to include 4 expect(ML.of(/a{5}/)).not_to include 6 end specify('variably quantified') do expect(ML.of(/a?/)).to include 0 expect(ML.of(/a?/)).to include 1 expect(ML.of(/a?/)).not_to include 2 end specify('nested quantified') do expect(ML.of(/(a{2}){3,4}/)).to include 6 expect(ML.of(/(a{2}){3,4}/)).to include 8 expect(ML.of(/(a{2}){3,4}/)).not_to include 0 expect(ML.of(/(a{2}){3,4}/)).not_to include 5 expect(ML.of(/(a{2}){3,4}/)).not_to include 7 expect(ML.of(/(a{2}){3,4}/)).not_to include 9 end specify('branches') do expect(ML.of(/ab|cdef/)).to include 2 expect(ML.of(/ab|cdef/)).to include 4 expect(ML.of(/ab|cdef/)).not_to include 0 expect(ML.of(/ab|cdef/)).not_to include 3 expect(ML.of(/ab|cdef/)).not_to include 5 end specify('called on leaf node') do expect(ML.of(RP.parse(/a{2}/)[0])).to include 2 expect(ML.of(RP.parse(/a{2}/)[0])).not_to include 0 expect(ML.of(RP.parse(/a{2}/)[0])).not_to include 1 expect(ML.of(RP.parse(/a{2}/)[0])).not_to include 3 end end describe('#fixed?') do specify('unquantified') { expect(ML.of(/a/)).to be_fixed } specify('fixed quantified') { expect(ML.of(/a{5}/)).to be_fixed } specify('variably quantified') { expect(ML.of(/a?/)).not_to be_fixed } specify('equal branches') { expect(ML.of(/ab|cd/)).to be_fixed } specify('unequal branches') { expect(ML.of(/ab|cdef/)).not_to be_fixed } specify('equal quantified branches') { expect(ML.of(/a{2}|cd/)).to be_fixed } specify('unequal quantified branches') { expect(ML.of(/a{3}|cd/)).not_to be_fixed } specify('empty') { expect(ML.of(//)).to be_fixed } end describe('#each') do it 'returns an Enumerator if called without a block' do result = ML.of(/a?/).each expect(result).to be_a(Enumerator) expect(result.next).to eq 0 expect(result.next).to eq 1 expect { result.next }.to raise_error(StopIteration) end it 'is aware of limit option even if called without a block' do result = ML.of(/a?/).each(limit: 1) expect(result).to be_a(Enumerator) expect(result.next).to eq 0 expect { result.next }.to raise_error(StopIteration) end it 'is limited to 1000 iterations in case there are infinite match lengths' do expect(ML.of(/a*/).first(3000).size).to eq 1000 end it 'scaffolds the Enumerable interface' do expect(ML.of(/abc|defg/).count).to eq 2 expect(ML.of(/(ab)*/).first(5)).to eq [0, 2, 4, 6, 8] expect(ML.of(/a{,10}/).any? { |len| len > 20 }).to be false end end describe('#endless_each') do it 'returns an Enumerator if called without a block' do result = ML.of(/a?/).endless_each expect(result).to be_a(Enumerator) expect(result.next).to eq 0 expect(result.next).to eq 1 expect { result.next }.to raise_error(StopIteration) end it 'never stops iterating for infinite match lengths' do expect(ML.of(/a*/).endless_each.first(3000).size).to eq 3000 end end describe('#inspect') do it 'is nice' do result = RP.parse(/a{2,4}/)[0].match_length expect(result.inspect).to eq '# min=2 max=4>' end end end ammar-regexp_parser-68cdeff/spec/expression/methods/match_spec.rb000066400000000000000000000013101506175332700254750ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Expression::Base#match') do it 'returns the #match result of the respective Regexp' do expect(RP.parse(/a/).match('a')[0]).to eq 'a' end it 'can be given an offset, just like Regexp#match' do expect(RP.parse(/./).match('ab', 1)[0]).to eq 'b' end it 'works with the #=~ alias' do expect(RP.parse(/a/) =~ 'a').to be_a MatchData end end RSpec.describe('Expression::Base#match?') do it 'returns true if the Respective Regexp matches' do expect(RP.parse(/a/).match?('a')).to be true end it 'returns false if the Respective Regexp does not match' do expect(RP.parse(/a/).match?('b')).to be false end end ammar-regexp_parser-68cdeff/spec/expression/methods/negative_spec.rb000066400000000000000000000032301506175332700262060ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Expression::Base#negative?') do include_examples 'parse', //, [] => [:root, negative?: false] include_examples 'parse', /a/, [0] => [:literal, negative?: false] include_examples 'parse', /\b/, [0] => [:word_boundary, negative?: false] include_examples 'parse', /\B/, [0] => [:nonword_boundary, negative?: true] include_examples 'parse', /(?=)/, [0] => [:lookahead, negative?: false] include_examples 'parse', /(?!)/, [0] => [:nlookahead, negative?: true] include_examples 'parse', /(?<=)/, [0] => [:lookbehind, negative?: false] include_examples 'parse', /(? [:nlookbehind, negative?: true] include_examples 'parse', /[a]/, [0] => [:character, negative?: false] include_examples 'parse', /[^a]/, [0] => [:character, negative?: true] include_examples 'parse', /\d/, [0] => [:digit, negative?: false] include_examples 'parse', /\D/, [0] => [:nondigit, negative?: true] include_examples 'parse', /[[:word:]]/, [0, 0] => [:word, negative?: false] include_examples 'parse', /[[:^word:]]/, [0, 0] => [:word, negative?: true] include_examples 'parse', /\p{word}/, [0] => [:word, negative?: false] include_examples 'parse', /\p{^word}/, [0] => [:word, negative?: true] include_examples 'parse', //, [] => [:root, negated?: false] include_examples 'parse', /[^a]/, [0] => [:character, negated?: true] end ammar-regexp_parser-68cdeff/spec/expression/methods/parts_spec.rb000066400000000000000000000056331506175332700255460ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Expression::Base#parts') do include_examples 'parse', //, [] => [:root, parts: []] include_examples 'parse', /a/, [0] => [:literal, parts: ['a']] include_examples 'parse', /\K/, [0] => [:mark, parts: ['\K']] include_examples 'parse', /\p{any}/, [0] => [:any, parts: ['\p{any}']] include_examples 'parse', /[a]/, [0] => [:character, parts: ['[', s(Literal, 'a'), ']']] include_examples 'parse', /[^a]/, [0] => [:character, parts: ['[^', s(Literal, 'a'), ']']] include_examples 'parse', /(a)/, [0] => [:capture, parts: ['(', s(Literal, 'a'), ')']] include_examples 'parse', /(?>a)/, [0] => [:atomic, parts: ['(?>', s(Literal, 'a'), ')']] include_examples 'parse', /(?=a)/, [0] => [:lookahead, parts: ['(?=', s(Literal, 'a'), ')']] include_examples 'parse', /(?#a)/, [0] => [:comment, parts: ['(?#a)']] include_examples 'parse', /(a(b(c)))/, [0] => [:capture, parts: [ '(', s(Literal, 'a'), s(Group::Capture, '(', s(Literal, 'b'), s(Group::Capture, '(', s(Literal, 'c'), ) ), ')' ]] include_examples 'parse', /a|b|c/, [] => [:root, parts: [ s(Alternation, '|', s(Alternative, nil, s(Literal, 'a')), s(Alternative, nil, s(Literal, 'b')), s(Alternative, nil, s(Literal, 'c')) ) ]], [0] => [:alternation, parts: [ s(Alternative, nil, s(Literal, 'a')), '|', s(Alternative, nil, s(Literal, 'b')), '|', s(Alternative, nil, s(Literal, 'c')) ]] include_examples 'parse', /[a-z]/, [] => [:root, parts: [ s(CharacterSet, '[', s(CharacterSet::Range, '-', s(Literal, 'a'), s(Literal, 'z')), ) ]], [0] => [:character, parts: [ '[', s(CharacterSet::Range, '-', s(Literal, 'a'), s(Literal, 'z')), ']' ]], [0, 0] => [:range, parts: [ s(Literal, 'a'), '-', s(Literal, 'z') ]] include_examples 'parse', /[a&&b&&c]/, [] => [:root, parts: [ s(CharacterSet, '[', s(CharacterSet::Intersection, '&&', s(CharacterSet::IntersectedSequence, nil, s(Literal, 'a')), s(CharacterSet::IntersectedSequence, nil, s(Literal, 'b')), s(CharacterSet::IntersectedSequence, nil, s(Literal, 'c')) ) ) ]], [0, 0] => [:intersection, parts: [ s(CharacterSet::IntersectedSequence, nil, s(Literal, 'a')), '&&', s(CharacterSet::IntersectedSequence, nil, s(Literal, 'b')), '&&', s(CharacterSet::IntersectedSequence, nil, s(Literal, 'c')) ]] include_examples 'parse', /(a)(?(1)T|F)/, [1] => [Conditional::Expression, parts: [ '(?', s(Conditional::Condition, '(1)'), s(Conditional::Branch, nil, s(Literal, 'T')), '|', s(Conditional::Branch, nil, s(Literal, 'F')), ')' ]] end ammar-regexp_parser-68cdeff/spec/expression/methods/printing_spec.rb000066400000000000000000000024001506175332700262340ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Expression::Shared#inspect') do it 'includes only essential information' do root = Regexp::Parser.parse(//) expect(root.inspect).to eq '#' root = Regexp::Parser.parse(/(a)+/) expect(root.inspect) .to match(/#' expect(root[0][0].inspect) .to eq '#' end end RSpec.describe('Expression::Shared#pretty_print') do it 'works' do require 'pp' pp_to_s = ->(arg) { ''.dup.tap { |buffer| PP.new(buffer).pp(arg) } } root = Regexp::Parser.parse(/(a)+/) expect(pp_to_s.(root)).to start_with '#')).to eq '<1>' c = root[1][1].first expect(c.strfregexp('[at: %l]')).to eq '[at: 2]' end specify('#strfregexp start end') do root = RP.parse(/a(b(c))/) expect(root.strfregexp('%s')).to eq '0' expect(root.strfregexp('%e')).to eq '7' a = root.first expect(a.strfregexp('%%s')).to eq '%0' expect(a.strfregexp('%e')).to eq '1' group_1 = root[1] expect(group_1.strfregexp('GRP:%s')).to eq 'GRP:1' expect(group_1.strfregexp('%e')).to eq '7' b = group_1.first expect(b.strfregexp('<@%s>')).to eq '<@2>' expect(b.strfregexp('%e')).to eq '3' c = group_1.last.first expect(c.strfregexp('[at: %s]')).to eq '[at: 4]' expect(c.strfregexp('%e')).to eq '5' end specify('#strfregexp length') do root = RP.parse(/a[b]c/) expect(root.strfregexp('%S')).to eq '5' a = root.first expect(a.strfregexp('%S')).to eq '1' set = root[1] expect(set.strfregexp('%S')).to eq '3' end specify('#strfregexp coded offset') do root = RP.parse(/a[b]c/) expect(root.strfregexp('%o')).to eq '@0+5' a = root.first expect(a.strfregexp('%o')).to eq '@0+1' set = root[1] expect(set.strfregexp('%o')).to eq '@1+3' end specify('#strfregexp type token') do root = RP.parse(/a[b](c)/) expect(root.strfregexp('%y')).to eq 'expression' expect(root.strfregexp('%k')).to eq 'root' expect(root.strfregexp('%i')).to eq 'expression:root' expect(root.strfregexp('%c')).to eq 'Regexp::Expression::Root' a = root.first expect(a.strfregexp('%y')).to eq 'literal' expect(a.strfregexp('%k')).to eq 'literal' expect(a.strfregexp('%i')).to eq 'literal:literal' expect(a.strfregexp('%c')).to eq 'Regexp::Expression::Literal' set = root[1] expect(set.strfregexp('%y')).to eq 'set' expect(set.strfregexp('%k')).to eq 'character' expect(set.strfregexp('%i')).to eq 'set:character' expect(set.strfregexp('%c')).to eq 'Regexp::Expression::CharacterSet' group = root.last expect(group.strfregexp('%y')).to eq 'group' expect(group.strfregexp('%k')).to eq 'capture' expect(group.strfregexp('%i')).to eq 'group:capture' expect(group.strfregexp('%c')).to eq 'Regexp::Expression::Group::Capture' end specify('#strfregexp quantifier') do root = RP.parse(/a+[b](c)?d{3,4}/) expect(root.strfregexp('%q')).to eq '{1}' expect(root.strfregexp('%Q')).to eq '' expect(root.strfregexp('%z, %Z')).to eq '1, 1' a = root.first expect(a.strfregexp('%q')).to eq '{1, or-more}' expect(a.strfregexp('%Q')).to eq '+' expect(a.strfregexp('%z, %Z')).to eq '1, -1' set = root[1] expect(set.strfregexp('%q')).to eq '{1}' expect(set.strfregexp('%Q')).to eq '' expect(set.strfregexp('%z, %Z')).to eq '1, 1' group = root[2] expect(group.strfregexp('%q')).to eq '{0, 1}' expect(group.strfregexp('%Q')).to eq '?' expect(group.strfregexp('%z, %Z')).to eq '0, 1' d = root.last expect(d.strfregexp('%q')).to eq '{3, 4}' expect(d.strfregexp('%Q')).to eq '{3,4}' expect(d.strfregexp('%z, %Z')).to eq '3, 4' end specify('#strfregexp text') do root = RP.parse(/a(b(c))|[d-gk-p]+/) expect(root.strfregexp('%t')).to eq 'a(b(c))|[d-gk-p]+' expect(root.strfregexp('%~t')).to eq 'expression:root' alt = root.first expect(alt.strfregexp('%t')).to eq 'a(b(c))|[d-gk-p]+' expect(alt.strfregexp('%T')).to eq 'a(b(c))|[d-gk-p]+' expect(alt.strfregexp('%~t')).to eq 'meta:alternation' seq_1 = alt.first expect(seq_1.strfregexp('%t')).to eq 'a(b(c))' expect(seq_1.strfregexp('%T')).to eq 'a(b(c))' expect(seq_1.strfregexp('%~t')).to eq 'expression:sequence' group = seq_1[1] expect(group.strfregexp('%t')).to eq '(b(c))' expect(group.strfregexp('%T')).to eq '(b(c))' expect(group.strfregexp('%~t')).to eq 'group:capture' seq_2 = alt.last expect(seq_2.strfregexp('%t')).to eq '[d-gk-p]+' expect(seq_2.strfregexp('%T')).to eq '[d-gk-p]+' set = seq_2.first expect(set.strfregexp('%t')).to eq '[d-gk-p]' expect(set.strfregexp('%T')).to eq '[d-gk-p]+' expect(set.strfregexp('%~t')).to eq 'set:character' end specify('#strfregexp combined') do root = RP.parse(/a{5}|[b-d]+/) expect(root.strfregexp('%b')).to eq '@0+11 expression:root' expect(root.strfregexp('%b')).to eq root.strfregexp('%o %i') expect(root.strfregexp('%m')).to eq '@0+11 expression:root {1}' expect(root.strfregexp('%m')).to eq root.strfregexp('%b %q') expect(root.strfregexp('%a')).to eq '@0+11 expression:root {1} a{5}|[b-d]+' expect(root.strfregexp('%a')).to eq root.strfregexp('%m %t') end specify('#strfregexp conditional') do root = RP.parse('(?a)(?()b|c)') expect { root.strfregexp }.not_to(raise_error) end specify('#strfregexp_tree') do root = RP.parse(/a[b-d]*(e(f+))?/) expect(root.strfregexp_tree('%>%o %~t')).to eq( "@0+15 expression:root\n" + " @0+1 a\n" + " @1+6 set:character\n" + " @2+3 set:range\n" + " @2+1 b\n" + " @4+1 d\n" + " @7+8 group:capture\n" + " @8+1 e\n" + " @9+4 group:capture\n" + " @10+2 f+" ) end specify('#strfregexp_tree separator') do root = RP.parse(/a[b-d]*(e(f+))?/) expect(root.strfregexp_tree('%>%o %~t', true, '-SEP-')).to eq( "@0+15 expression:root-SEP-" + " @0+1 a-SEP-" + " @1+6 set:character-SEP-" + " @2+3 set:range-SEP-" + " @2+1 b-SEP-" + " @4+1 d-SEP-" + " @7+8 group:capture-SEP-" + " @8+1 e-SEP-" + " @9+4 group:capture-SEP-" + " @10+2 f+" ) end specify('#strfregexp_tree excluding self') do root = RP.parse(/a[b-d]*(e(f+))?/) expect(root.strfregexp_tree('%>%o %~t', false)).to eq( "@0+1 a\n" + "@1+6 set:character\n" + " @2+3 set:range\n" + " @2+1 b\n" + " @4+1 d\n" + "@7+8 group:capture\n" + " @8+1 e\n" + " @9+4 group:capture\n" + " @10+2 f+" ) end end ammar-regexp_parser-68cdeff/spec/expression/methods/tests_spec.rb000066400000000000000000000211741506175332700255550ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('ExpressionTests') do specify('#type?') do root = RP.parse(/abcd|(ghij)|[klmn]/) alt = root.first expect(alt.type?(:meta)).to be true expect(alt.type?(:escape)).to be false expect(alt.type?(%i[meta escape])).to be true expect(alt.type?(%i[literal escape])).to be false expect(alt.type?(:*)).to be true expect(alt.type?([:*])).to be true expect(alt.type?(%i[literal escape *])).to be true seq_1 = alt[0] expect(seq_1.type?(:expression)).to be true expect(seq_1.first.type?(:literal)).to be true seq_2 = alt[1] expect(seq_2.type?(:*)).to be true expect(seq_2.first.type?(:group)).to be true seq_3 = alt[2] expect(seq_3.first.type?(:set)).to be true end specify('#is?') do root = RP.parse(/.+|\.?/) expect(root.is?(:*)).to be true alt = root.first expect(alt.is?(:*)).to be true expect(alt.is?(:alternation)).to be true expect(alt.is?(:alternation, :meta)).to be true seq_1 = alt[0] expect(seq_1.is?(:sequence)).to be true expect(seq_1.is?(:sequence, :expression)).to be true expect(seq_1.first.is?(:dot)).to be true expect(seq_1.first.is?(:dot, :escape)).to be false expect(seq_1.first.is?(:dot, :meta)).to be true expect(seq_1.first.is?(:dot, %i[escape meta])).to be true seq_2 = alt[1] expect(seq_2.first.is?(:dot)).to be true expect(seq_2.first.is?(:dot, :escape)).to be true expect(seq_2.first.is?(:dot, :meta)).to be false expect(seq_2.first.is?(:dot, %i[meta escape])).to be true end specify('#one_of?') do root = RP.parse(/\Aab(c[\w])d|e.\z/) expect(root.one_of?(:*)).to be true expect(root.one_of?(:* => :*)).to be true expect(root.one_of?(:* => [:*])).to be true alt = root.first expect(alt.one_of?(:*)).to be true expect(alt.one_of?(:meta)).to be true expect(alt.one_of?(:meta, :alternation)).to be true expect(alt.one_of?(meta: %i[dot bogus])).to be false expect(alt.one_of?(meta: %i[dot alternation])).to be true seq_1 = alt[0] expect(seq_1.one_of?(:expression)).to be true expect(seq_1.one_of?(expression: :sequence)).to be true expect(seq_1.first.one_of?(:anchor)).to be true expect(seq_1.first.one_of?(anchor: :bos)).to be true expect(seq_1.first.one_of?(anchor: :eos)).to be false expect(seq_1.first.one_of?(anchor: %i[escape meta bos])).to be true expect(seq_1.first.one_of?(anchor: %i[escape meta eos])).to be false seq_2 = alt[1] expect(seq_2.first.one_of?(:literal)).to be true expect(seq_2[1].one_of?(:meta)).to be true expect(seq_2[1].one_of?(meta: :dot)).to be true expect(seq_2[1].one_of?(meta: :alternation)).to be false expect(seq_2[1].one_of?(meta: [:dot])).to be true expect(seq_2.last.one_of?(:group)).to be false expect(seq_2.last.one_of?(group: [:*])).to be false expect(seq_2.last.one_of?(group: [:*], meta: :*)).to be false expect(seq_2.last.one_of?(:meta => [:*], :* => :*)).to be true expect(seq_2.last.one_of?(meta: [:*], anchor: :*)).to be true expect(seq_2.last.one_of?(meta: [:*], anchor: :eos)).to be true expect(seq_2.last.one_of?(meta: [:*], anchor: [:bos])).to be false expect(seq_2.last.one_of?(meta: [:*], anchor: %i[bos eos])).to be true expect { root.one_of?(Object.new) }.to raise_error(ArgumentError) end specify('#==') do expect(RP.parse(/a/)).to eq RP.parse(/a/) expect(RP.parse(/a/)).not_to eq RP.parse(/B/) expect(RP.parse(/a+/)).to eq RP.parse(/a+/) expect(RP.parse(/a+/)).not_to eq RP.parse(/a++/) expect(RP.parse(/a+/)).not_to eq RP.parse(/a?/) expect(RP.parse(/\A/)).to eq RP.parse(/\A/) expect(RP.parse(/\A/)).not_to eq RP.parse(/\b/) expect(RP.parse(/[a]/)).to eq RP.parse(/[a]/) expect(RP.parse(/[a]/)).not_to eq RP.parse(/[B]/) expect(RP.parse(/(a)/)).to eq RP.parse(/(a)/) expect(RP.parse(/(a)/)).not_to eq RP.parse(/(B)/) expect(RP.parse(/(a|A)/)).to eq RP.parse(/(a|A)/) expect(RP.parse(/(a|A)/)).not_to eq RP.parse(/(a|B)/) expect(RP.parse(/(?:a)/)).to eq RP.parse(/(?:a)/) expect(RP.parse(/(?:a)/)).not_to eq RP.parse(/(a)/) expect(RP.parse(/(?a)/)).to eq RP.parse(/(?a)/) expect(RP.parse(/(?a)/)).not_to eq RP.parse(/(?B)/) expect(RP.parse(/(?a)/)).not_to eq RP.parse(/(?a)/) expect(RP.parse(/(?a)/)).not_to eq RP.parse(/(?'a'a)/) expect(RP.parse(/(a)(x)(?(1)T|F)/)).to eq RP.parse(/(a)(x)(?(1)T|F)/) expect(RP.parse(/(a)(x)(?(1)T|F)/)).not_to eq RP.parse(/(a)(x)(?(2)T|F)/) expect(RP.parse(/(a)(x)(?(1)T|F)/)).not_to eq RP.parse(/(B)(x)(?(1)T|F)/) expect(RP.parse(/(a)(x)(?(1)T|F)/)).not_to eq RP.parse(/(a)(x)(?(1)T|T)/) expect(RP.parse(/a+/)[0].quantifier).to eq RP.parse(/a+/)[0].quantifier expect(RP.parse(/a+/)[0].quantifier).not_to eq RP.parse(/a++/)[0].quantifier expect(RP.parse(/a+/)[0].quantifier).not_to eq RP.parse(/a?/)[0].quantifier expect(RP.parse(/a+/)[0].quantifier).not_to eq RP.parse(/a{1,}/)[0].quantifier # active options should differentiate expressions expect(RP.parse(/a/)[0]).to eq RP.parse(/a/)[0] expect(RP.parse(/a/i)[0]).not_to eq RP.parse(/a/)[0] expect(RP.parse(/(?i)a/)[1]).not_to eq RP.parse(/a/)[0] expect(RP.parse(/(?i:a)/)[0][0]).not_to eq RP.parse(/a/)[0] # levels should be ignored expect(RP.parse(/([a])/)[0][0][0]).to eq RP.parse(/a/)[0] end # test #capturing? include_examples 'parse', /(a)\1/, [] => [capturing?: false], [0] => [capturing?: true], [1] => [capturing?: false] # test #comment?, #decorative? include_examples 'parse', /(a)(?#b)c # d/x, [] => [comment?: false], [0] => [comment?: false], [1] => [comment?: true, decorative?: true], [2] => [comment?: false], [3] => [comment?: false, decorative?: true], [4] => [comment?: true, decorative?: true] # test #optional? include_examples 'parse', /a?/, [0] => [optional?: true] include_examples 'parse', /a*/, [0] => [optional?: true] include_examples 'parse', /a{,5}/, [0] => [optional?: true] include_examples 'parse', /a{0,5}/, [0] => [optional?: true] include_examples 'parse', /a/, [0] => [optional?: false] include_examples 'parse', /a+/, [0] => [optional?: false] include_examples 'parse', /a{1}/, [0] => [optional?: false] include_examples 'parse', /a{1,5}/, [0] => [optional?: false] # test #quantified? include_examples 'parse', /a?b/, [] => [quantified?: false], [0] => [quantified?: true], [1] => [quantified?: false] # test #referential? include_examples 'parse', /(a)\1/, [] => [referential?: false], [0] => [referential?: false], [1] => [referential?: true] # test #terminal? include_examples 'parse', /^a([b]+)c$/, [] => [Root, terminal?: false], [0] => [to_s: '^', terminal?: true], [1] => [to_s: 'a', terminal?: true], [2] => [to_s: '([b]+)', terminal?: false], [2, 0] => [to_s: '[b]+', terminal?: false], [2, 0, 0] => [to_s: 'b', terminal?: true], [3] => [to_s: 'c', terminal?: true], [4] => [to_s: '$', terminal?: true] include_examples 'parse', /^(ab|cd)$/, [] => [Root, terminal?: false], [0] => [:bol, to_s: '^', terminal?: true], [1] => [:capture, to_s: '(ab|cd)', terminal?: false], [1, 0] => [:alternation, to_s: 'ab|cd', terminal?: false], [1, 0, 0] => [:sequence, to_s: 'ab', terminal?: false], [1, 0, 0, 0] => [:literal, to_s: 'ab', terminal?: true], [1, 0, 1] => [:sequence, to_s: 'cd', terminal?: false], [1, 0, 1, 0] => [:literal, to_s: 'cd', terminal?: true], [2] => [:eol, to_s: '$', terminal?: true] specify('tests at expression class level') do expect(Regexp::Expression::Literal).to be_terminal expect(Regexp::Expression::CharacterSet).not_to be_terminal expect(Regexp::Expression::Comment).to be_comment expect(Regexp::Expression::Literal).not_to be_comment expect(Regexp::Expression::Comment).to be_decorative expect(Regexp::Expression::Literal).not_to be_decorative expect(Regexp::Expression::Backreference::Base).to be_referential expect(Regexp::Expression::Literal).not_to be_referential end end ammar-regexp_parser-68cdeff/spec/expression/methods/traverse_spec.rb000066400000000000000000000106601506175332700262440ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Subexpression#traverse') do specify('Subexpression#traverse') do root = RP.parse(/a(b(c(d)))|g[h-i]j|klmn/) enters = 0 visits = 0 exits = 0 root.traverse do |event, _exp, _index| enters = (enters + 1) if event == :enter visits = (visits + 1) if event == :visit exits = (exits + 1) if event == :exit end expect(enters).to eq 9 expect(enters).to eq exits expect(visits).to eq 9 end specify('Subexpression#traverse including self') do root = RP.parse(/a(b(c(d)))|g[h-i]j|klmn/) enters = 0 visits = 0 exits = 0 root.traverse(true) do |event, _exp, _index| enters = (enters + 1) if event == :enter visits = (visits + 1) if event == :visit exits = (exits + 1) if event == :exit end expect(enters).to eq 10 expect(enters).to eq exits expect(visits).to eq 9 end specify('Subexpression#traverse without a block') do root = RP.parse(/abc/) enum = root.traverse expect(enum).to be_a(Enumerator) event, expr, idx = enum.next expect(event).to eq(:visit) expect(expr).to be_a(Regexp::Expression::Literal) expect(idx).to eq(0) end specify('Subexpression#walk alias') do root = RP.parse(/abc/) expect(root).to respond_to(:walk) end specify('Subexpression#each_expression') do root = RP.parse(/a(?x:b(c))|g[h-k]/) count = 0 root.each_expression { count += 1 } expect(count).to eq 13 end specify('Subexpression#each_expression including self') do root = RP.parse(/a(?x:b(c))|g[h-k]/) count = 0 root.each_expression(true) { count += 1 } expect(count).to eq 14 end specify('Subexpression#each_expression with block arity 1') do root = RP.parse(/a(b)c/) texts = [] root.each_expression { |exp| texts << exp.text } expect(texts).to eq ['a', '(', 'b', 'c'] end specify('Subexpression#each_expression indices') do root = RP.parse(/a(b)c/) indices = [] root.each_expression { |_exp, index| (indices << index) } expect(indices).to eq [0, 1, 0, 2] end specify('Subexpression#each_expression indices including self') do root = RP.parse(/a(b)c/) indices = [] root.each_expression(true) { |_exp, index| (indices << index) } expect(indices).to eq [0, 0, 1, 0, 2] end specify('Subexpression#each_expression without a block') do root = RP.parse(/abc/) enum = root.each_expression expect(enum).to be_a(Enumerator) expr, idx = enum.next expect(expr).to be_a(Regexp::Expression::Literal) expect(idx).to eq(0) end specify('Subexpression#flat_map without block') do root = RP.parse(/a(b([c-e]+))?/) array = root.flat_map expect(array).to be_instance_of(Array) expect(array.length).to eq 8 array.each do |item| expect(item).to be_instance_of(Array) expect(item.length).to eq 2 expect(item.first).to be_a(Regexp::Expression::Base) expect(item.last).to be_a(Integer) end end specify('Subexpression#flat_map without block including self') do root = RP.parse(/a(b([c-e]+))?/) array = root.flat_map(true) expect(array).to be_instance_of(Array) expect(array.length).to eq 9 end specify('Subexpression#flat_map expressions for block with arity 1') do root = RP.parse(/a(b(c(d)))/) result = root.flat_map { |exp| exp.text if exp.terminal? }.compact expect(result).to eq ['a', 'b', 'c', 'd'] end specify('Subexpression#flat_map indices') do root = RP.parse(/a(b([c-e]+))?f*g/) indices = root.flat_map { |_exp, index| index } expect(indices).to eq [0, 1, 0, 1, 0, 0, 0, 1, 2, 3] end specify('Subexpression#flat_map indices including self') do root = RP.parse(/a(b([c-e]+))?f*g/) indices = root.flat_map(true) { |_exp, index| index } expect(indices).to eq [0, 0, 1, 0, 1, 0, 0, 0, 1, 2, 3] end specify('Subexpression#flat_map expressions') do root = RP.parse(/a(b(c(d)))/) levels = root.flat_map { |exp| [exp.level, exp.text] if exp.terminal? }.compact expect(levels).to eq [[0, 'a'], [1, 'b'], [2, 'c'], [3, 'd']] end specify('Subexpression#flat_map expressions including self') do root = RP.parse(/a(b(c(d)))/) levels = root.flat_map(true) { |exp| [exp.level, exp.to_s] }.compact expect(levels).to eq [[0, 'a(b(c(d)))'], [0, 'a'], [0, '(b(c(d)))'], [1, 'b'], [1, '(c(d))'], [2, 'c'], [2, '(d)'], [3, 'd']] end end ammar-regexp_parser-68cdeff/spec/expression/options_spec.rb000066400000000000000000000133531506175332700244430ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Expression::Base#options') do it 'returns a hash of options/flags that affect the expression' do exp = RP.parse(/a/ix)[0] expect(exp).to be_a Literal expect(exp.options).to eq(i: true, x: true) end it 'includes options that are locally enabled via special groups' do exp = RP.parse(/(?x)(?m:a)/i)[1][0] expect(exp).to be_a Literal expect(exp.options).to eq(i: true, m: true, x: true) end it 'excludes locally disabled options' do exp = RP.parse(/(?x)(?-im:a)/i)[1][0] expect(exp).to be_a Literal expect(exp.options).to eq(x: true) end it 'gives correct precedence to negative options' do # Negative options have precedence. E.g. /(?i-i)a/ is case-sensitive. regexp = /(?i-i:a)/ expect(regexp).to match 'a' expect(regexp).not_to match 'A' exp = RP.parse(regexp)[0][0] expect(exp).to be_a Literal expect(exp.options).to eq({}) end it 'correctly handles multiple negative option parts' do regexp = /(?--m--mx--) . /mx expect(regexp).to match ' . ' expect(regexp).not_to match '.' expect(regexp).not_to match "\n" exp = RP.parse(regexp)[2] expect(exp.options).to eq({}) end it 'gives correct precedence when encountering multiple encoding flags' do # Any encoding flag overrides all previous encoding flags. If there are # multiple encoding flags in an options string, the last one wins. # E.g. /(?dau)\w/ matches UTF-8 chars but /(?dua)\w/ only ASCII chars. regexp1 = /(?dau)\w/ regexp2 = /(?dua)\w/ expect(regexp1).to match 'ü' expect(regexp2).not_to match 'ü' exp1 = RP.parse(regexp1)[1] exp2 = RP.parse(regexp2)[1] expect(exp1.options).to eq(u: true) expect(exp2.options).to eq(a: true) end it 'is accessible via shortcuts' do exp = Root.construct expect { exp.options[:i] = true } .to change { exp.i? }.from(false).to(true) .and change { exp.ignore_case? }.from(false).to(true) .and change { exp.case_insensitive? }.from(false).to(true) expect { exp.options[:m] = true } .to change { exp.m? }.from(false).to(true) .and change { exp.multiline? }.from(false).to(true) expect { exp.options[:x] = true } .to change { exp.x? }.from(false).to(true) .and change { exp.extended? }.from(false).to(true) .and change { exp.free_spacing? }.from(false).to(true) expect { exp.options[:a] = true } .to change { exp.a? }.from(false).to(true) .and change { exp.ascii_classes? }.from(false).to(true) expect { exp.options[:d] = true } .to change { exp.d? }.from(false).to(true) .and change { exp.default_classes? }.from(false).to(true) expect { exp.options[:u] = true } .to change { exp.u? }.from(false).to(true) .and change { exp.unicode_classes? }.from(false).to(true) end include_examples 'parse', //i, [] => [:root, i?: true, x?: false] include_examples 'parse', /a/i, [0] => [:literal, i?: true, x?: false] include_examples 'parse', /\A/i, [0] => [:bos, i?: true, x?: false] include_examples 'parse', /\d/i, [0] => [:digit, i?: true, x?: false] include_examples 'parse', /\n/i, [0] => [:newline, i?: true, x?: false] include_examples 'parse', /\K/i, [0] => [:mark, i?: true, x?: false] include_examples 'parse', /./i, [0] => [:dot, i?: true, x?: false] include_examples 'parse', /(a)/i, [0] => [:capture, i?: true, x?: false] include_examples 'parse', /(a)/i, [0, 0] => [:literal, i?: true, x?: false] include_examples 'parse', /(?=a)/i, [0] => [:lookahead, i?: true, x?: false] include_examples 'parse', /(?=a)/i, [0, 0] => [:literal, i?: true, x?: false] include_examples 'parse', /(a|b)/i, [0] => [:capture, i?: true, x?: false] include_examples 'parse', /(a|b)/i, [0, 0] => [:alternation, i?: true, x?: false] include_examples 'parse', /(a|b)/i, [0, 0, 0] => [:sequence, i?: true, x?: false] include_examples 'parse', /(a|b)/i, [0, 0, 0, 0] => [:literal, i?: true, x?: false] include_examples 'parse', /(a)\1/i, [1] => [:number, i?: true, x?: false] include_examples 'parse', /(a)\k<1>/i, [1] => [:number_ref, i?: true, x?: false] include_examples 'parse', /(a)\g<1>/i, [1] => [:number_call, i?: true, x?: false] include_examples 'parse', /[a]/i, [0] => [:character, i?: true, x?: false] include_examples 'parse', /[a]/i, [0, 0] => [:literal, i?: true, x?: false] include_examples 'parse', /[a-z]/i, [0, 0] => [:range, i?: true, x?: false] include_examples 'parse', /[a-z]/i, [0, 0, 0] => [:literal, i?: true, x?: false] include_examples 'parse', /[a&&z]/i, [0, 0] => [:intersection, i?: true, x?: false] include_examples 'parse', /[a&&z]/i, [0, 0, 0, 0] => [:literal, i?: true, x?: false] include_examples 'parse', /[[:ascii:]]/i, [0, 0] => [:ascii, i?: true, x?: false] include_examples 'parse', /\p{word}/i, [0] => [:word, i?: true, x?: false] include_examples 'parse', /(a)(?(1)b|c)/i, [1] => [:open, i?: true, x?: false] include_examples 'parse', /(a)(?(1)b|c)/i, [1, 0] => [:condition, i?: true, x?: false] include_examples 'parse', /(a)(?(1)b|c)/i, [1, 1] => [:sequence, i?: true, x?: false] include_examples 'parse', /(a)(?(1)b|c)/i, [1, 1, 0] => [:literal, i?: true, x?: false] end ammar-regexp_parser-68cdeff/spec/expression/subexpression_spec.rb000066400000000000000000000033111506175332700256520ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Expression::Subexpression) do # check #ts, #te include_examples 'parse', /abcd|ghij|klmn|pqur/, [0] => [Alternation, ts: 0, te: 19], [0, 0] => [Alternative, ts: 0, te: 4], [0, 1] => [Alternative, ts: 5, te: 9], [0, 2] => [Alternative, ts: 10, te: 14], [0, 3] => [Alternative, ts: 15, te: 19] # check #nesting_level include_examples 'parse', /a(b(\d|[ef-g[h]]))/, [0] => [Literal, to_s: 'a', nesting_level: 1], [1, 0] => [Literal, to_s: 'b', nesting_level: 2], [1, 1, 0] => [Alternation, to_s: '\d|[ef-g[h]]', nesting_level: 3], [1, 1, 0, 0] => [Alternative, to_s: '\d', nesting_level: 4], [1, 1, 0, 0, 0] => [CharacterType::Digit, to_s: '\d', nesting_level: 5], [1, 1, 0, 1] => [Alternative, to_s: '[ef-g[h]]', nesting_level: 4], [1, 1, 0, 1, 0] => [CharacterSet, to_s: '[ef-g[h]]', nesting_level: 5], [1, 1, 0, 1, 0, 0] => [Literal, to_s: 'e', nesting_level: 6], [1, 1, 0, 1, 0, 1] => [CharacterSet::Range, to_s: 'f-g', nesting_level: 6], [1, 1, 0, 1, 0, 1, 0] => [Literal, to_s: 'f', nesting_level: 7], [1, 1, 0, 1, 0, 2, 0] => [Literal, to_s: 'h', nesting_level: 7] specify('#dig') do root = RP.parse(/(((a)))/) expect(root.dig(0).to_s).to eq '(((a)))' expect(root.dig(0, 0, 0, 0).to_s).to eq 'a' expect(root.dig(0, 0, 0, 0, 0)).to be_nil expect(root.dig(3, 7)).to be_nil end end ammar-regexp_parser-68cdeff/spec/expression/te_ts_spec.rb000066400000000000000000000020021506175332700240530ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Expression::Shared#te,ts') do # Many tokens/expressions have their own tests for #te and #ts. # This is an integration-like test to ensure they are correct in conjunction. it 'is correct irrespective of nesting or preceding tokens' do regexp = regexp_with_all_features source = regexp.source root = RP.parse(regexp) checked_exps = root.each_expression.with_object([]) do |(exp), acc| acc.each { |e| fail "dupe: #{[e, exp]}" if e.to_s == exp.to_s } acc << exp unless exp.is_a?(Sequence) || exp.is_a?(WhiteSpace) end expect(checked_exps).not_to be_empty checked_exps.each do |exp| start = source.index(exp.to_s(:original)) expect(exp.ts).to eq(start), "expected #{exp.class} #{exp} to start at #{start}, got #{exp.ts}" end_idx = start + exp.base_length expect(exp.te).to eq(end_idx), "expected #{exp.class} #{exp} to end at #{end_idx}, got #{exp.te}" end end end ammar-regexp_parser-68cdeff/spec/expression/to_h_spec.rb000066400000000000000000000016071506175332700237000ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Expression::Base#to_h') do include_examples 'parse', /abc/, [] => [Root, to_h: { token: :root, type: :expression, text: 'abc', starts_at: 0, length: 3, quantifier: nil, options: {}, level: 0, set_level: 0, conditional_level: 0, expressions: [ { token: :literal, type: :literal, text: 'abc', starts_at: 0, length: 3, quantifier: nil, options: {}, level: 0, set_level: 0, conditional_level: 0 } ] }] include_examples 'parse', /a{2,4}/, [0, :q] => [Quantifier, to_h: { max: 4, min: 2, mode: :greedy, text: '{2,4}', token: :interval, }] specify('Conditional#to_h') do root = RP.parse('(?a)(?()b|c)') expect { root.to_h }.not_to(raise_error) end end ammar-regexp_parser-68cdeff/spec/expression/to_s_spec.rb000066400000000000000000000052221506175332700237100ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Expression::Base#to_s') do def parse_frozen(pattern) Leto.deep_freeze(RP.parse(pattern)) end def expect_round_trip(pattern) parsed = parse_frozen(pattern) expect(parsed.to_s).to eql(pattern) end specify('literal alternation') do expect_round_trip('abcd|ghij|klmn|pqur') end specify('quantified alternations') do expect_round_trip('(?:a?[b]+(c){2}|d+[e]*(f)?)|(?:g+[h]?(i){2,3}|j*[k]{3,5}(l)?)') end specify('quantified sets') do expect_round_trip('[abc]+|[^def]{3,6}') end specify('property sets') do expect_round_trip('[\a\b\p{Lu}\P{Z}\c\d]+') end specify('groups') do expect_round_trip("(a(?>b(?:c(?d(?'N'e)??f)+g)*+h)*i)++") end specify('assertions') do expect_round_trip('(a+(?=b+(?!c+(?<=d+(?a)(?()b|c)/, 3 => [:conditional, :open, '(?', 7, 9, 0, 0, 0], 4 => [:conditional, :condition, '()', 9, 14, 0, 0, 1], 6 => [:conditional, :separator, '|', 15, 16, 0, 0, 1], 8 => [:conditional, :close, ')', 17, 18, 0, 0, 0] include_examples 'lex', /((?a)(?(?()b|((?()[e-g]|[h-j])))))/, 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:group, :named, '(?', 1, 6, 1, 0, 0], 5 => [:conditional, :open, '(?', 13, 15, 2, 0, 0], 6 => [:conditional, :condition, '()', 15, 20, 2, 0, 1], 8 => [:conditional, :separator, '|', 21, 22, 2, 0, 1], 10 => [:conditional, :open, '(?', 23, 25, 3, 0, 1], 11 => [:conditional, :condition, '()', 25, 30, 3, 0, 2], 12 => [:set, :open, '[', 30, 31, 3, 0, 2], 13 => [:literal, :literal, 'e', 31, 32, 3, 1, 2], 14 => [:set, :range, '-', 32, 33, 3, 1, 2], 15 => [:literal, :literal, 'g', 33, 34, 3, 1, 2], 16 => [:set, :close, ']', 34, 35, 3, 0, 2], 17 => [:conditional, :separator, '|', 35, 36, 3, 0, 2], 23 => [:conditional, :close, ')', 41, 42, 3, 0, 1], 25 => [:conditional, :close, ')', 43, 44, 2, 0, 0], 26 => [:group, :close, ')', 44, 45, 1, 0, 0], 27 => [:group, :close, ')', 45, 46, 0, 0, 0] include_examples 'lex', /(a(b(c)))(?(1)(?(2)(?(3)d|e))|(?(3)(?(2)f|g)|(?(1)f|g)))/, 9 => [:conditional, :open, '(?', 9, 11, 0, 0, 0], 10 => [:conditional, :condition, '(1)', 11, 14, 0, 0, 1], 11 => [:conditional, :open, '(?', 14, 16, 0, 0, 1], 12 => [:conditional, :condition, '(2)', 16, 19, 0, 0, 2], 13 => [:conditional, :open, '(?', 19, 21, 0, 0, 2], 14 => [:conditional, :condition, '(3)', 21, 24, 0, 0, 3], 16 => [:conditional, :separator, '|', 25, 26, 0, 0, 3], 18 => [:conditional, :close, ')', 27, 28, 0, 0, 2], 19 => [:conditional, :close, ')', 28, 29, 0, 0, 1], 20 => [:conditional, :separator, '|', 29, 30, 0, 0, 1], 21 => [:conditional, :open, '(?', 30, 32, 0, 0, 1], 22 => [:conditional, :condition, '(3)', 32, 35, 0, 0, 2], 23 => [:conditional, :open, '(?', 35, 37, 0, 0, 2], 24 => [:conditional, :condition, '(2)', 37, 40, 0, 0, 3], 26 => [:conditional, :separator, '|', 41, 42, 0, 0, 3], 28 => [:conditional, :close, ')', 43, 44, 0, 0, 2], 29 => [:conditional, :separator, '|', 44, 45, 0, 0, 2], 30 => [:conditional, :open, '(?', 45, 47, 0, 0, 2], 31 => [:conditional, :condition, '(1)', 47, 50, 0, 0, 3], 33 => [:conditional, :separator, '|', 51, 52, 0, 0, 3], 35 => [:conditional, :close, ')', 53, 54, 0, 0, 2], 36 => [:conditional, :close, ')', 54, 55, 0, 0, 1], 37 => [:conditional, :close, ')', 55, 56, 0, 0, 0] end ammar-regexp_parser-68cdeff/spec/lexer/delimiters_spec.rb000066400000000000000000000055501506175332700240310ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Literal delimiter lexing') do include_examples 'lex', '}', 0 => [:literal, :literal, '}', 0, 1, 0, 0, 0] include_examples 'lex', '}}', 0 => [:literal, :literal, '}}', 0, 2, 0, 0, 0] include_examples 'lex', '{', 0 => [:literal, :literal, '{', 0, 1, 0, 0, 0] include_examples 'lex', '{{', 0 => [:literal, :literal, '{{', 0, 2, 0, 0, 0] include_examples 'lex', '{}', 0 => [:literal, :literal, '{}', 0, 2, 0, 0, 0] include_examples 'lex', '}{', 0 => [:literal, :literal, '}{', 0, 2, 0, 0, 0] include_examples 'lex', '}{+', 0 => [:literal, :literal, '}', 0, 1, 0, 0, 0], 1 => [:literal, :literal, '{', 1, 2, 0, 0, 0], 2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0] include_examples 'lex', '{{var}}', 0 => [:literal, :literal, '{{var}}', 0, 7, 0, 0, 0] include_examples 'lex', 'a{b}c', 0 => [:literal, :literal, 'a{b}c', 0, 5, 0, 0, 0] include_examples 'lex', 'a{1,2', 0 => [:literal, :literal, 'a{1,2', 0, 5, 0, 0, 0] include_examples 'lex', '({.+})', 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:literal, :literal, '{', 1, 2, 1, 0, 0], 2 => [:meta, :dot, '.', 2, 3, 1, 0, 0], 3 => [:quantifier, :one_or_more, '+', 3, 4, 1, 0, 0], 4 => [:literal, :literal, '}', 4, 5, 1, 0, 0], 5 => [:group, :close, ')', 5, 6, 0, 0, 0] include_examples 'lex', ']', 0 => [:literal, :literal, ']', 0, 1, 0, 0, 0] include_examples 'lex', ']]', 0 => [:literal, :literal, ']]', 0, 2, 0, 0, 0] include_examples 'lex', ']\[', 0 => [:literal, :literal, ']', 0, 1, 0, 0, 0], 1 => [:escape, :set_open, '\[', 1, 3, 0, 0, 0] include_examples 'lex', '()', 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:group, :close, ')', 1, 2, 0, 0, 0] include_examples 'lex', '{abc:.+}}}[^}]]}', 0 => [:literal, :literal, '{abc:', 0, 5, 0, 0, 0], 1 => [:meta, :dot, '.', 5, 6, 0, 0, 0], 2 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0], 3 => [:literal, :literal, '}}}', 7, 10, 0, 0, 0], 4 => [:set, :open, '[', 10, 11, 0, 0, 0], 5 => [:set, :negate, '^', 11, 12, 0, 1, 0], 6 => [:literal, :literal, '}', 12, 13, 0, 1, 0], 7 => [:set, :close, ']', 13, 14, 0, 0, 0], 8 => [:literal, :literal, ']}', 14, 16, 0, 0, 0] end ammar-regexp_parser-68cdeff/spec/lexer/escapes_spec.rb000066400000000000000000000010711506175332700233050ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Escape lexing') do include_examples 'lex', '\u{62}', 0 => [:escape, :codepoint_list, '\u{62}', 0, 6, 0, 0, 0] include_examples 'lex', '\u{62 63 64}', 0 => [:escape, :codepoint_list, '\u{62 63 64}', 0, 12, 0, 0, 0] include_examples 'lex', '\u{62 63 64}+', 0 => [:escape, :codepoint_list, '\u{62 63}', 0, 9, 0, 0, 0], 1 => [:escape, :codepoint_list, '\u{64}', 9, 15, 0, 0, 0], 2 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0, 0] end ammar-regexp_parser-68cdeff/spec/lexer/keep_spec.rb000066400000000000000000000004751506175332700226150ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Keep lexing') do include_examples 'lex', /ab\Kcd/, 1 => [:keep, :mark, '\K', 2, 4, 0, 0, 0] include_examples 'lex', /(a\Kb)|(c\\\Kd)ef/, 2 => [:keep, :mark, '\K', 2, 4, 1, 0, 0], 9 => [:keep, :mark, '\K', 11, 13, 1, 0, 0] end ammar-regexp_parser-68cdeff/spec/lexer/literals_spec.rb000066400000000000000000000054631506175332700235120ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Literal lexing') do # ascii, single byte characters include_examples 'lex', 'a', 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0] include_examples 'lex', 'ab+', 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0], 1 => [:literal, :literal, 'b', 1, 2, 0, 0, 0], 2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0] # 2 byte wide characters include_examples 'lex', 'äöü+', 0 => [:literal, :literal, 'äö', 0, 2, 0, 0, 0], 1 => [:literal, :literal, 'ü', 2, 3, 0, 0, 0], 2 => [:quantifier, :one_or_more, '+', 3, 4, 0, 0, 0] # 3 byte wide characters, Japanese include_examples 'lex', 'ab?れます+cd', 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0], 1 => [:literal, :literal, 'b', 1, 2, 0, 0, 0], 2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0, 0], 3 => [:literal, :literal, 'れま', 3, 5, 0, 0, 0], 4 => [:literal, :literal, 'す', 5, 6, 0, 0, 0], 5 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0], 6 => [:literal, :literal, 'cd', 7, 9, 0, 0, 0] # 4 byte wide characters, Osmanya include_examples 'lex', '𐒀𐒁?𐒂ab+𐒃', 0 => [:literal, :literal, '𐒀', 0, 1, 0, 0, 0], 1 => [:literal, :literal, '𐒁', 1, 2, 0, 0, 0], 2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0, 0], 3 => [:literal, :literal, '𐒂a', 3, 5, 0, 0, 0], 4 => [:literal, :literal, 'b', 5, 6, 0, 0, 0], 5 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0], 6 => [:literal, :literal, '𐒃', 7, 8, 0, 0, 0] include_examples 'lex', 'mu𝄞?si*𝄫c+', 0 => [:literal, :literal, 'mu', 0, 2, 0, 0, 0], 1 => [:literal, :literal, '𝄞', 2, 3, 0, 0, 0], 2 => [:quantifier, :zero_or_one, '?', 3, 4, 0, 0, 0], 3 => [:literal, :literal, 's', 4, 5, 0, 0, 0], 4 => [:literal, :literal, 'i', 5, 6, 0, 0, 0], 5 => [:quantifier, :zero_or_more, '*', 6, 7, 0, 0, 0], 6 => [:literal, :literal, '𝄫', 7, 8, 0, 0, 0], 7 => [:literal, :literal, 'c', 8, 9, 0, 0, 0], 8 => [:quantifier, :one_or_more, '+', 9, 10, 0, 0, 0] specify('lex single 2 byte char') do tokens = RL.lex("\u0627+") expect(tokens.count).to eq 2 end specify('lex single 3 byte char') do tokens = RL.lex("\u308C+") expect(tokens.count).to eq 2 end specify('lex single 4 byte char') do tokens = RL.lex("\u{1D11E}+") expect(tokens.count).to eq 2 end end ammar-regexp_parser-68cdeff/spec/lexer/nesting_spec.rb000066400000000000000000000126671506175332700233460ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Nesting lexing') do include_examples 'lex', /(((b)))/, 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:group, :capture, '(', 1, 2, 1, 0, 0], 2 => [:group, :capture, '(', 2, 3, 2, 0, 0], 3 => [:literal, :literal, 'b', 3, 4, 3, 0, 0], 4 => [:group, :close, ')', 4, 5, 2, 0, 0], 5 => [:group, :close, ')', 5, 6, 1, 0, 0], 6 => [:group, :close, ')', 6, 7, 0, 0, 0] include_examples 'lex', /(\((b)\))/, 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:escape, :group_open, '\(', 1, 3, 1, 0, 0], 2 => [:group, :capture, '(', 3, 4, 1, 0, 0], 3 => [:literal, :literal, 'b', 4, 5, 2, 0, 0], 4 => [:group, :close, ')', 5, 6, 1, 0, 0], 5 => [:escape, :group_close, '\)', 6, 8, 1, 0, 0], 6 => [:group, :close, ')', 8, 9, 0, 0, 0] include_examples 'lex', /(?>a(?>b(?>c)))/, 0 => [:group, :atomic, '(?>', 0, 3, 0, 0, 0], 2 => [:group, :atomic, '(?>', 4, 7, 1, 0, 0], 4 => [:group, :atomic, '(?>', 8, 11, 2, 0, 0], 6 => [:group, :close, ')', 12, 13, 2, 0, 0], 7 => [:group, :close, ')', 13, 14, 1, 0, 0], 8 => [:group, :close, ')', 14, 15, 0, 0, 0] include_examples 'lex', /(?:a(?:b(?:c)))/, 0 => [:group, :passive, '(?:', 0, 3, 0, 0, 0], 2 => [:group, :passive, '(?:', 4, 7, 1, 0, 0], 4 => [:group, :passive, '(?:', 8, 11, 2, 0, 0], 6 => [:group, :close, ')', 12, 13, 2, 0, 0], 7 => [:group, :close, ')', 13, 14, 1, 0, 0], 8 => [:group, :close, ')', 14, 15, 0, 0, 0] include_examples 'lex', /(?=a(?!b(?<=c(? [:assertion, :lookahead, '(?=', 0, 3, 0, 0, 0], 2 => [:assertion, :nlookahead, '(?!', 4, 7, 1, 0, 0], 4 => [:assertion, :lookbehind, '(?<=', 8, 12, 2, 0, 0], 6 => [:assertion, :nlookbehind, '(? [:group, :close, ')', 18, 19, 3, 0, 0], 9 => [:group, :close, ')', 19, 20, 2, 0, 0], 10 => [:group, :close, ')', 20, 21, 1, 0, 0], 11 => [:group, :close, ')', 21, 22, 0, 0, 0] include_examples 'lex', /((?#a)b(?#c)d(?#e))/, 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:group, :comment, '(?#a)', 1, 6, 1, 0, 0], 3 => [:group, :comment, '(?#c)', 7, 12, 1, 0, 0], 5 => [:group, :comment, '(?#e)', 13, 18, 1, 0, 0], 6 => [:group, :close, ')', 18, 19, 0, 0, 0] include_examples 'lex', /a[b-e]f/, 1 => [:set, :open, '[', 1, 2, 0, 0, 0], 2 => [:literal, :literal, 'b', 2, 3, 0, 1, 0], 3 => [:set, :range, '-', 3, 4, 0, 1, 0], 4 => [:literal, :literal, 'e', 4, 5, 0, 1, 0], 5 => [:set, :close, ']', 5, 6, 0, 0, 0] include_examples 'lex', '[[:word:]&&[^c]z]', 0 => [:set, :open, '[', 0, 1, 0, 0, 0], 1 => [:posixclass, :word, '[:word:]', 1, 9, 0, 1, 0], 2 => [:set, :intersection, '&&', 9, 11, 0, 1, 0], 3 => [:set, :open, '[', 11, 12, 0, 1, 0], 4 => [:set, :negate, '^', 12, 13, 0, 2, 0], 5 => [:literal, :literal, 'c', 13, 14, 0, 2, 0], 6 => [:set, :close, ']', 14, 15, 0, 1, 0], 7 => [:literal, :literal, 'z', 15, 16, 0, 1, 0], 8 => [:set, :close, ']', 16, 17, 0, 0, 0] include_examples 'lex', '[\p{word}&&[^c]z]', 0 => [:set, :open, '[', 0, 1, 0, 0, 0], 1 => [:property, :word, '\p{word}', 1, 9, 0, 1, 0], 2 => [:set, :intersection, '&&', 9, 11, 0, 1, 0], 3 => [:set, :open, '[', 11, 12, 0, 1, 0], 4 => [:set, :negate, '^', 12, 13, 0, 2, 0], 5 => [:literal, :literal, 'c', 13, 14, 0, 2, 0], 6 => [:set, :close, ']', 14, 15, 0, 1, 0], 7 => [:literal, :literal, 'z', 15, 16, 0, 1, 0], 8 => [:set, :close, ']', 16, 17, 0, 0, 0] include_examples 'lex', /[a[b[c[d-g]]]]/, 0 => [:set, :open, '[', 0, 1, 0, 0, 0], 1 => [:literal, :literal, 'a', 1, 2, 0, 1, 0], 2 => [:set, :open, '[', 2, 3, 0, 1, 0], 3 => [:literal, :literal, 'b', 3, 4, 0, 2, 0], 4 => [:set, :open, '[', 4, 5, 0, 2, 0], 5 => [:literal, :literal, 'c', 5, 6, 0, 3, 0], 6 => [:set, :open, '[', 6, 7, 0, 3, 0], 7 => [:literal, :literal, 'd', 7, 8, 0, 4, 0], 8 => [:set, :range, '-', 8, 9, 0, 4, 0], 9 => [:literal, :literal, 'g', 9, 10, 0, 4, 0], 10 => [:set, :close, ']', 10, 11, 0, 3, 0], 11 => [:set, :close, ']', 11, 12, 0, 2, 0], 12 => [:set, :close, ']', 12, 13, 0, 1, 0], 13 => [:set, :close, ']', 13, 14, 0, 0, 0] end ammar-regexp_parser-68cdeff/spec/lexer/refcalls_spec.rb000066400000000000000000000052701506175332700234620ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('RefCall lexing') do # Traditional numerical group back-reference include_examples 'lex', '(abc)\1', 3 => [:backref, :number, '\1', 5, 7, 0, 0, 0] # Group back-references, named, numbered, and relative include_examples 'lex', '(?abc)\k', 3 => [:backref, :name_ref, '\k', 9, 14, 0, 0, 0] include_examples 'lex', "(?abc)\\k'X'", 3 => [:backref, :name_ref, "\\k'X'", 9, 14, 0, 0, 0] include_examples 'lex', '(abc)\k<1>', 3 => [:backref, :number_ref, '\k<1>', 5, 10, 0, 0, 0] include_examples 'lex', "(abc)\\k'1'", 3 => [:backref, :number_ref, "\\k'1'", 5, 10, 0, 0, 0] include_examples 'lex', '(abc)\k<-1>', 3 => [:backref, :number_rel_ref, '\k<-1>', 5, 11, 0, 0, 0] include_examples 'lex', "(abc)\\k'-1'", 3 => [:backref, :number_rel_ref, "\\k'-1'", 5, 11, 0, 0, 0] # Sub-expression invocation, named, numbered, and relative include_examples 'lex', '(?abc)\g', 3 => [:backref, :name_call, '\g', 9, 14, 0, 0, 0] include_examples 'lex', "(?abc)\\g'X'", 3 => [:backref, :name_call, "\\g'X'", 9, 14, 0, 0, 0] include_examples 'lex', '(abc)\g<1>', 3 => [:backref, :number_call, '\g<1>', 5, 10, 0, 0, 0] include_examples 'lex', "(abc)\\g'1'", 3 => [:backref, :number_call, "\\g'1'", 5, 10, 0, 0, 0] include_examples 'lex', '\g<0>', 0 => [:backref, :number_call, '\g<0>', 0, 5, 0, 0, 0] include_examples 'lex', "\\g'0'", 0 => [:backref, :number_call, "\\g'0'", 0, 5, 0, 0, 0] include_examples 'lex', '(abc)\g<-1>', 3 => [:backref, :number_rel_call, '\g<-1>', 5, 11, 0, 0, 0] include_examples 'lex', "(abc)\\g'-1'", 3 => [:backref, :number_rel_call, "\\g'-1'", 5, 11, 0, 0, 0] include_examples 'lex', '(abc)\g<+1>', 3 => [:backref, :number_rel_call, '\g<+1>', 5, 11, 0, 0, 0] include_examples 'lex', "(abc)\\g'+1'", 3 => [:backref, :number_rel_call, "\\g'+1'", 5, 11, 0, 0, 0] # Group back-references, with nesting level include_examples 'lex', '(?abc)\k', 3 => [:backref, :name_recursion_ref, '\k', 9, 16, 0, 0, 0] include_examples 'lex', "(?abc)\\k'X-0'", 3 => [:backref, :name_recursion_ref, "\\k'X-0'", 9, 16, 0, 0, 0] include_examples 'lex', '(abc)\k<1-0>', 3 => [:backref, :number_recursion_ref, '\k<1-0>', 5, 12, 0, 0, 0] include_examples 'lex', "(abc)\\k'1-0'", 3 => [:backref, :number_recursion_ref, "\\k'1-0'", 5, 12, 0, 0, 0] end ammar-regexp_parser-68cdeff/spec/parser/000077500000000000000000000000001506175332700205015ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/spec/parser/all_spec.rb000066400000000000000000000016501506175332700226120ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Parser) do specify('parse returns a root expression') do expect(RP.parse('abc')).to be_instance_of(Root) end specify('parse can be called with block') do expect(RP.parse('abc') { |root| root.class }).to eq Root end specify('parse root contains expressions') do root = RP.parse(/^a.c+[^one]{2,3}\b\d\\\C-C$/) expect(root.expressions).to all(be_a Regexp::Expression::Base) end specify('parse root options mi') do root = RP.parse(/[abc]/mi) expect(root.m?).to be true expect(root.i?).to be true expect(root.x?).to be false end specify('parse no quantifier target raises error') do expect { RP.parse('?abc') }.to raise_error(Regexp::Parser::Error) end specify('parse sequence no quantifier target raises error') do expect { RP.parse('abc|?def') }.to raise_error(Regexp::Parser::Error) end end ammar-regexp_parser-68cdeff/spec/parser/alternation_spec.rb000066400000000000000000000045471506175332700243720ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Alternation parsing') do include_examples 'parse', /a|b/, [0] => [Alternation, text: '|', count: 2], [0, 0] => [Alternative, text: '', count: 1], [0, 0, 0] => [:literal, text: 'a' ], [0, 1] => [Alternative, text: '', count: 1], [0, 1, 0] => [:literal, text: 'b' ] include_examples 'parse', /a|(b)c/, [0] => [Alternation, text: '|', count: 2], [0, 0] => [Alternative, text: '', count: 1], [0, 0, 0] => [:literal, text: 'a' ], [0, 1] => [Alternative, text: '', count: 2], [0, 1, 0] => [:capture, to_s: '(b)' ], [0, 1, 1] => [:literal, text: 'c' ] include_examples 'parse', /(ab??|cd*|ef+)*|(gh|ij|kl)?/, [0] => [Alternation, text: '|', count: 2, quantified?: false], [0, 0] => [Alternative, text: '', count: 1, quantified?: false], [0, 0, 0] => [:capture, count: 1, quantified?: true ], [0, 0, 0, 0] => [Alternation, text: '|', count: 3 ], [0, 0, 0, 0, 0] => [Alternative, text: '', count: 2 ], [0, 0, 0, 0, 0, 0] => [:literal, to_s: 'a' ], [0, 0, 0, 0, 0, 1] => [:literal, to_s: 'b??' ], [0, 1] => [Alternative, text: '', count: 1, quantified?: false], [0, 1, 0] => [:capture, count: 1, quantified?: true ] # test correct ts values for empty sequences include_examples 'parse', /|||/, [0] => [Alternation, text: '|', count: 4, starts_at: 0], [0, 0] => [Alternative, to_s: '', count: 0, starts_at: 0], [0, 1] => [Alternative, to_s: '', count: 0, starts_at: 1], [0, 2] => [Alternative, to_s: '', count: 0, starts_at: 2], [0, 3] => [Alternative, to_s: '', count: 0, starts_at: 3] # test correct ts values for non-empty sequences include_examples 'parse', /ab|cd|ef|gh/, [0] => [Alternation, text: '|', count: 4, starts_at: 0], [0, 0] => [Alternative, to_s: 'ab', count: 1, starts_at: 0], [0, 1] => [Alternative, to_s: 'cd', count: 1, starts_at: 3], [0, 2] => [Alternative, to_s: 'ef', count: 1, starts_at: 6], [0, 3] => [Alternative, to_s: 'gh', count: 1, starts_at: 9] end ammar-regexp_parser-68cdeff/spec/parser/anchors_spec.rb000066400000000000000000000015611506175332700235000ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Anchor parsing') do include_examples 'parse', /^a/, 0 => [:anchor, :bol, Anchor::BOL] include_examples 'parse', /a$/, 1 => [:anchor, :eol, Anchor::EOL] include_examples 'parse', /\Aa/, 0 => [:anchor, :bos, Anchor::BOS] include_examples 'parse', /a\z/, 1 => [:anchor, :eos, Anchor::EOS] include_examples 'parse', /a\Z/, 1 => [:anchor, :eos_ob_eol, Anchor::EOSobEOL] include_examples 'parse', /a\b/, 1 => [:anchor, :word_boundary, Anchor::WordBoundary] include_examples 'parse', /a\B/, 1 => [:anchor, :nonword_boundary, Anchor::NonWordBoundary] include_examples 'parse', /a\G/, 1 => [:anchor, :match_start, Anchor::MatchStart] include_examples 'parse', /\\A/, 0 => [:escape, :backslash, EscapeSequence::Literal] end ammar-regexp_parser-68cdeff/spec/parser/conditionals_spec.rb000066400000000000000000000064351506175332700245360ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Conditional parsing') do include_examples 'parse', /(?a)(?()T|F)/, [1] => [:conditional, :open, Conditional::Expression, to_s: '(?()T|F)', reference: 'A', ts: 7], [1, 0] => [:conditional, :condition, Conditional::Condition, to_s: '()', reference: 'A', ts: 9], [1, 1] => [:expression, :sequence, Conditional::Branch, to_s: 'T', ts: 14], [1, 1, 0] => [:literal, text: 'T', ts: 14], [1, 2] => [:expression, :sequence, Conditional::Branch, to_s: 'F', ts: 16], [1, 2, 0] => [:literal, text: 'F', ts: 16] include_examples 'parse', /(a)(?(1)T|F)/, [1] => [:conditional, :open, Conditional::Expression, to_s: '(?(1)T|F)', reference: 1, ts: 3], [1, 0] => [:conditional, :condition, Conditional::Condition, to_s: '(1)', reference: 1, ts: 5], [1, 1] => [:expression, :sequence, Conditional::Branch, to_s: 'T', ts: 8], [1, 1, 0] => [:literal, text: 'T', ts: 8], [1, 2] => [:expression, :sequence, Conditional::Branch, to_s: 'F', ts: 10], [1, 2, 0] => [:literal, text: 'F', ts: 10] include_examples 'parse', /(foo)(?(1)\d+|(\w)){42}/, [1] => [Conditional::Expression, quantified?: true, to_s: '(?(1)\d+|(\w)){42}'], [1, 0] => [Conditional::Condition, quantified?: false], [1, 1] => [Conditional::Branch, quantified?: false], [1, 1, 0] => [:digit, quantified?: true, to_s: '\d+'], [1, 2] => [Conditional::Branch, quantified?: false] # test nested and mixed with alternations include_examples 'parse', <<-EOS.gsub(/\s/, ''), ( (a) | (b) | ( ( ?(2) (c(d|e)+)? | ( ?(3) f | ( ?(4) (g|(h)(i)) ) ) ) ) ) EOS [0] => [Group::Capture, count: 1], [0, 0] => [Alternation, count: 3], [0, 0, 2] => [Alternative, count: 1], [0, 0, 2, 0] => [Group::Capture, count: 1], [0, 0, 2, 0, 0] => [Conditional::Expression, count: 3, conditional_level: 0], [0, 0, 2, 0, 0, 0] => [Conditional::Condition, to_s: '(2)', conditional_level: 1], [0, 0, 2, 0, 0, 1] => [Conditional::Branch, to_s: '(c(d|e)+)?', conditional_level: 1], [0, 0, 2, 0, 0, 2] => [Conditional::Branch, to_s: '(?(3)f|(?(4)(g|(h)(i))))', conditional_level: 1], [0, 0, 2, 0, 0, 2, 0] => [Conditional::Expression, count: 3, conditional_level: 1], [0, 0, 2, 0, 0, 2, 0, 0] => [Conditional::Condition, to_s: '(3)', conditional_level: 2], [0, 0, 2, 0, 0, 2, 0, 1] => [Conditional::Branch, count: 1, to_s: 'f', conditional_level: 2], [0, 0, 2, 0, 0, 2, 0, 1, 0] => [Literal, text: 'f', conditional_level: 2] # test empty branch include_examples 'parse', /(?a)(?()T|)/, [1] => [Conditional::Expression, count: 3, to_s: '(?()T|)'], [1, 2] => [Conditional::Branch, to_s: '', ts: 16] # test insignificant leading zeros in the condition's group number ref include_examples 'parse', /(a)(?(001)T)/, [1, 0] => [Conditional::Condition, to_s: '(001)', reference: 1] end ammar-regexp_parser-68cdeff/spec/parser/errors_spec.rb000066400000000000000000000022431506175332700233550ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Parsing errors') do let(:parser) { Regexp::Parser.new } before { parser.parse(/foo/) } # initializes ivars it('raises UnknownTokenTypeError for unknown token types') do expect { parser.send(:parse_token, Regexp::Token.new(:foo, :bar)) } .to raise_error(Regexp::Parser::UnknownTokenTypeError) end RSpec.shared_examples 'UnknownTokenError' do |type| it "raises for unknown tokens of type #{type}" do expect { parser.send(:parse_token, Regexp::Token.new(type, :foo)) } .to raise_error(Regexp::Parser::UnknownTokenError) end end include_examples 'UnknownTokenError', :anchor include_examples 'UnknownTokenError', :backref include_examples 'UnknownTokenError', :conditional include_examples 'UnknownTokenError', :free_space include_examples 'UnknownTokenError', :group include_examples 'UnknownTokenError', :meta include_examples 'UnknownTokenError', :nonproperty include_examples 'UnknownTokenError', :property include_examples 'UnknownTokenError', :quantifier include_examples 'UnknownTokenError', :set include_examples 'UnknownTokenError', :type end ammar-regexp_parser-68cdeff/spec/parser/escapes_spec.rb000066400000000000000000000104471506175332700234710ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('EscapeSequence parsing') do es = EscapeSequence include_examples 'parse', /a\ac/, 1 => [:escape, :bell, es::Bell] include_examples 'parse', /a\ec/, 1 => [:escape, :escape, es::AsciiEscape] include_examples 'parse', /a\fc/, 1 => [:escape, :form_feed, es::FormFeed] include_examples 'parse', /a\nc/, 1 => [:escape, :newline, es::Newline] include_examples 'parse', /a\rc/, 1 => [:escape, :carriage, es::Return] include_examples 'parse', /a\tc/, 1 => [:escape, :tab, es::Tab] include_examples 'parse', /a\vc/, 1 => [:escape, :vertical_tab, es::VerticalTab] # meta character escapes include_examples 'parse', /a\.c/, 1 => [:escape, :dot, es::Literal] include_examples 'parse', /a\?c/, 1 => [:escape, :zero_or_one, es::Literal] include_examples 'parse', /a\*c/, 1 => [:escape, :zero_or_more, es::Literal] include_examples 'parse', /a\+c/, 1 => [:escape, :one_or_more, es::Literal] include_examples 'parse', /a\|c/, 1 => [:escape, :alternation, es::Literal] include_examples 'parse', /a\(c/, 1 => [:escape, :group_open, es::Literal] include_examples 'parse', /a\)c/, 1 => [:escape, :group_close, es::Literal] include_examples 'parse', /a\{c/, 1 => [:escape, :interval_open, es::Literal] include_examples 'parse', /a\}c/, 1 => [:escape, :interval_close, es::Literal] # unicode escapes include_examples 'parse', /a\u0640/, 1 => [:escape, :codepoint, es::Codepoint] include_examples 'parse', /a\u{41 1F60D}/, 1 => [:escape, :codepoint_list, es::CodepointList] include_examples 'parse', /a\u{10FFFF}/, 1 => [:escape, :codepoint_list, es::CodepointList] # hex escapes include_examples 'parse', /a\xFF/n, 1 => [:escape, :hex, es::Hex] include_examples 'parse', /a\xFF\xFF/n, 2 => [:escape, :hex, es::Hex] # octal escapes include_examples 'parse', /a\177/n, 1 => [:escape, :octal, es::Octal] # test #char and #codepoint include_examples 'parse', /\n/, 0 => [char: "\n", codepoint: 10 ] include_examples 'parse', /\?/, 0 => [char: '?', codepoint: 63 ] include_examples 'parse', /\101/, 0 => [char: 'A', codepoint: 65 ] include_examples 'parse', /\x42/, 0 => [char: 'B', codepoint: 66 ] include_examples 'parse', /\xA/, 0 => [char: "\n", codepoint: 10 ] include_examples 'parse', /\xE2\x82\xAC/, 0 => [char: "€", codepoint: 8364 ] include_examples 'parse', /\u0043/, 0 => [char: 'C', codepoint: 67 ] include_examples 'parse', /\u{44 45}/, 0 => [chars: %w[D E], codepoints: [68, 69]] specify('codepoint_list #char and #codepoint raise errors') do exp = RP.parse(/\u{44 45}/)[0] expect { exp.char }.to raise_error(/#chars/) expect { exp.codepoint }.to raise_error(/#codepoints/) end # Meta/control escapes # # After the following fix in Ruby 3.1, a Regexp#source containing meta/control # escapes can only be set with the Regexp::new constructor. # In Regexp literals, these escapes are now pre-processed to hex escapes. # # https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 n = ->(regexp_body){ Regexp.new(regexp_body.dup.force_encoding('ascii-8bit')) } include_examples 'parse', n.('\\\\\c2b'), 1 => [es::Control, text: '\c2', char: "\x12", codepoint: 18 ] include_examples 'parse', n.('\d\C-C\w'), 1 => [es::Control, text: '\C-C', char: "\x03", codepoint: 3 ] include_examples 'parse', n.('\Z\M-Z'), 1 => [es::Meta, text: '\M-Z', char: "\u00DA", codepoint: 218] include_examples 'parse', n.('\A\M-\C-X'), 1 => [es::MetaControl, text: '\M-\C-X', char: "\u0098", codepoint: 152] include_examples 'parse', n.('\A\M-\cX'), 1 => [es::MetaControl, text: '\M-\cX', char: "\u0098", codepoint: 152] include_examples 'parse', n.('\A\C-\M-X'), 1 => [es::MetaControl, text: '\C-\M-X', char: "\u0098", codepoint: 152] include_examples 'parse', n.('\A\c\M-X'), 1 => [es::MetaControl, text: '\c\M-X', char: "\u0098", codepoint: 152] end ammar-regexp_parser-68cdeff/spec/parser/free_space_spec.rb000066400000000000000000000043431506175332700241400ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('FreeSpace parsing') do include_examples 'parse', /a b c/, [0] => [Literal, text: 'a b c'] include_examples 'parse', /a b c/x, [0] => [Literal, text: 'a'], [1] => [WhiteSpace, text: ' '], [2] => [Literal, text: 'b'], [3] => [WhiteSpace, text: ' '], [4] => [Literal, text: 'c'] include_examples 'parse', /a * b + c/x, [0] => [Literal, to_s: 'a*', quantified?: true], [1] => [WhiteSpace, text: ' '], [2] => [WhiteSpace, text: ' '], [3] => [Literal, to_s: 'b+', quantified?: true], [4] => [WhiteSpace, text: ' '], [5] => [WhiteSpace, text: ' '], [6] => [Literal, to_s: 'c'] include_examples 'parse', / a ? # One letter b {2,5} # Another one [c-g] + # A set (h|i|j) # A group /x, [0] => [WhiteSpace], [1] => [Literal, to_s: 'a?', quantified?: true], [2] => [WhiteSpace, text: ' '], [3] => [WhiteSpace, text: ' '], [4] => [Comment, to_s: "# One letter\n"], [5] => [WhiteSpace], [6] => [Literal, to_s: 'b{2,5}', quantified?: true], [7] => [WhiteSpace, text: ' '], [8] => [WhiteSpace, text: ' '], [9] => [Comment, to_s: "# Another one\n"], [10] => [WhiteSpace], [11] => [CharacterSet, to_s: '[c-g]+', quantified?: true], [12] => [WhiteSpace], [13] => [WhiteSpace], [14] => [Comment, to_s: "# A set\n"], [15] => [WhiteSpace], [16] => [Group::Capture], [17] => [WhiteSpace], [18] => [Comment, to_s: "# A group\n",] include_examples 'parse', / a # comment 1 ? ( b # comment 2 # comment 3 + ) # comment 4 * /x, [0] => [WhiteSpace], [1] => [Literal, to_s: 'a?', quantified?: true], [2] => [WhiteSpace], [3] => [Comment], [4] => [WhiteSpace], [5] => [WhiteSpace], [6] => [Group::Capture, quantified?: true], [6, 0] => [WhiteSpace], [6, 1] => [Literal, to_s: 'b+', quantified?: true], [6, 2] => [WhiteSpace], [6, 3] => [Comment, to_s: "# comment 2\n"], [6, 4] => [WhiteSpace], [6, 5] => [Comment, to_s: "# comment 3\n"], [6, 6] => [WhiteSpace], [6, 7] => [WhiteSpace] end ammar-regexp_parser-68cdeff/spec/parser/groups_spec.rb000066400000000000000000000140441506175332700233620ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Group parsing') do include_examples 'parse', /(?=abc)(?!def)/, 0 => [:assertion, :lookahead, Assertion::Lookahead], 1 => [:assertion, :nlookahead, Assertion::NegativeLookahead] include_examples 'parse', /(?<=abc)(? [:assertion, :lookbehind, Assertion::Lookbehind], 1 => [:assertion, :nlookbehind, Assertion::NegativeLookbehind] include_examples 'parse', /a(?# is for apple)b(?# for boy)c(?# cat)/, 1 => [:group, :comment, Group::Comment, capturing?: false, comment?: true], 3 => [:group, :comment, Group::Comment, capturing?: false, comment?: true], 5 => [:group, :comment, Group::Comment, capturing?: false, comment?: true] include_examples 'parse', /a(?# is for apple){3}/, [0] => [Literal, text: 'a', quantified?: true], [0, :q] => [Quantifier, text: '{3}'], [1] => [Group::Comment, text: '(?# is for apple)', quantified?: false] if ruby_version_at_least('2.4.1') include_examples 'parse', 'a(?~b)c(?~d)e', 1 => [:group, :absence, Group::Absence], 3 => [:group, :absence, Group::Absence] end include_examples 'parse', /(?m:a)/, 0 => [:group, :options, Group::Options, capturing?: false, options: { m: true }, option_changes: { m: true }] # self-defeating group option include_examples 'parse', /(?m-m:a)/, 0 => [:group, :options, Group::Options, options: {}, option_changes: { m: false }] # activate one option in nested group include_examples 'parse', /(?x-mi:a(?m:b))/, 0 => [:group, :options, Group::Options, options: { x: true }, option_changes: { i: false, m: false, x: true }], [0, 1] => [:group, :options, Group::Options, options: { m: true, x: true }, option_changes: { m: true }] # deactivate one option in nested group include_examples 'parse', /(?ix-m:a(?-i:b))/, 0 => [:group, :options, Group::Options, options: { i: true, x: true }, option_changes: { i: true, m: false, x: true }], [0, 1] => [:group, :options, Group::Options, options: { x: true }, option_changes: { i: false }] # invert all options in nested group include_examples 'parse', /(?xi-m:a(?m-ix:b))/, 0 => [:group, :options, Group::Options, options: { i: true, x: true }, option_changes: { i: true, m: false, x: true }], [0, 1] => [:group, :options, Group::Options, options: { m: true }, option_changes: { i: false, m: true, x: false }] # nested options affect literal subexpressions include_examples 'parse', /(?x-mi:a(?m:b))/, [0, 0] => [:literal, :literal, Literal, text: 'a', options: { x: true }], [0, 1, 0] => [:literal, :literal, Literal, text: 'b', options: { m: true, x: true }] # option switching group include_examples 'parse', /a(?i-m)b/m, 0 => [:literal, :literal, Literal, text: 'a', options: { m: true }], 1 => [:group, :options_switch, Group::Options, options: { i: true }, option_changes: { i: true, m: false }], 2 => [:literal, :literal, Literal, text: 'b', options: { i: true }] # option switch in group include_examples 'parse', /(a(?i-m)b)c/m, 0 => [:group, :capture, Group::Capture, capturing?: true, options: { m: true }], [0, 0] => [:literal, :literal, Literal, capturing?: false, text: 'a', options: { m: true }], [0, 1] => [:group, :options_switch, Group::Options, options: { i: true }, option_changes: { i: true, m: false }], [0, 2] => [:literal, :literal, Literal, text: 'b', options: { i: true }], 1 => [:literal, :literal, Literal, text: 'c', options: { m: true }] # nested option switch in group include_examples 'parse', /((?i-m)(a(?-i)b))/m, [0, 1] => [:group, :capture, Group::Capture, options: { i: true }], [0, 1, 0] => [:literal, :literal, Literal, text: 'a', options: { i: true }], [0, 1, 1] => [:group, :options_switch, Group::Options, options: {}, option_changes: { i: false }], [0, 1, 2] => [:literal, :literal, Literal, text: 'b', options: {}] # options dau include_examples 'parse', /(?dua:abc)/, 0 => [:group, :options, Group::Options, options: { a: true }, option_changes: { a: true }] # nested options dau include_examples 'parse', /(?u:a(?d:b))/, 0 => [:group, :options, Group::Options, options: { u: true }, option_changes: { u: true }], [0, 1] => [:group, :options, Group::Options, options: { d: true }, option_changes: { d: true, u: false }], [0, 1, 0] => [:literal, :literal, Literal, text: 'b', options: { d: true }] # nested options da include_examples 'parse', /(?di-xm:a(?da-x:b))/, 0 => [:group, :options, Group::Options, options: { d: true, i:true }], [0, 1] => [:group, :options, Group::Options, options: { a: true, i: true }, option_changes: { a: true, d: false, x: false}], [0, 1, 0] => [:literal, :literal, Literal, text: 'b', options: { a: true, i: true }] specify('parse group number') do root = RP.parse(/(a)(?=b)((?:c)(d|(e)))/) expect(root.dig(0).number).to eq 1 expect(root.dig(1)).not_to respond_to(:number) expect(root.dig(2).number).to eq 2 expect(root.dig(2, 0)).not_to respond_to(:number) expect(root.dig(2, 1).number).to eq 3 expect(root.dig(2, 1, 0, 1, 0).number).to eq 4 end specify('parse group number at level') do root = RP.parse(/(a)(?=b)((?:c)(d|(e)))/) expect(root.dig(0).number_at_level).to eq 1 expect(root.dig(1)).not_to respond_to(:number_at_level) expect(root.dig(2).number_at_level).to eq 2 expect(root.dig(2, 0)).not_to respond_to(:number_at_level) expect(root.dig(2, 1).number_at_level).to eq 1 expect(root.dig(2, 1, 0, 1, 0).number_at_level).to eq 1 end specify('parse invalid option switch quantification') do expect { RP.parse('a(?i)+') }.to raise_error(/Can not quantify/) expect { RP.parse('a(?i)*') }.to raise_error(/Can not quantify/) expect { RP.parse('a(?i)?') }.to raise_error(/Can not quantify/) expect { RP.parse('a(?i){5}') }.to raise_error(/Can not quantify/) end end ammar-regexp_parser-68cdeff/spec/parser/keep_spec.rb000066400000000000000000000004121506175332700227610ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Keep parsing') do include_examples 'parse', /ab\Kcd/, 1 => [:keep, :mark, Keep::Mark, text: '\K'] include_examples 'parse', /(a\K)/, [0, 1] => [:keep, :mark, Keep::Mark, text: '\K'] end ammar-regexp_parser-68cdeff/spec/parser/options_spec.rb000066400000000000000000000015011506175332700235300ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('passing options to parse') do it 'raises if if parsing from a Regexp and options are passed' do expect { RP.parse(/a+/, options: ::Regexp::EXTENDED) }.to raise_error( ArgumentError, 'options cannot be supplied unless parsing a String' ) end it 'sets options if parsing from a String' do root = RP.parse('a+', options: ::Regexp::MULTILINE | ::Regexp::EXTENDED) expect(root.options).to eq(m: true, x: true) end it 'allows options to not be supplied when parsing from a Regexp' do root = RP.parse(/a+/ix) expect(root.options).to eq(i: true, x: true) end it 'has an empty option-hash when parsing from a String and passing no options' do root = RP.parse('a+') expect(root.options).to be_empty end end ammar-regexp_parser-68cdeff/spec/parser/posix_classes_spec.rb000066400000000000000000000013611506175332700247200ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('PosixClass parsing') do include_examples 'parse', /[[:word:]]/, [0] => [CharacterSet, count: 1], [0, 0] => [:posixclass, :word, PosixClass, name: 'word', text: '[:word:]'] include_examples 'parse', /[[:^word:]]/, [0] => [CharacterSet, count: 1], [0, 0] => [:nonposixclass, :word, PosixClass, name: 'word', text: '[:^word:]'] # cases treated as regular subsets by Ruby, not as (invalid) posix classes include_examples 'parse', '[[:ab]c:]', [0, 0] => [CharacterSet, count: 3], [0, 0, 0] => [Literal, text: ':'] include_examples 'parse', '[[:a[b]c:]]', [0, 0] => [CharacterSet, count: 5], [0, 0, 0] => [Literal, text: ':'] end ammar-regexp_parser-68cdeff/spec/parser/properties_spec.rb000066400000000000000000000075551506175332700242500ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Property parsing') do # test various notations supported by Ruby include_examples 'parse', '\p{sd}', 0 => [:property, :soft_dotted] include_examples 'parse', '\p{SD}', 0 => [:property, :soft_dotted] include_examples 'parse', '\p{Soft Dotted}', 0 => [:property, :soft_dotted] include_examples 'parse', '\p{Soft-Dotted}', 0 => [:property, :soft_dotted] include_examples 'parse', '\p{sOfT_dOtTeD}', 0 => [:property, :soft_dotted] # test ^-negation include_examples 'parse', '\p{^sd}', 0 => [:nonproperty, :soft_dotted] include_examples 'parse', '\p{^SD}', 0 => [:nonproperty, :soft_dotted] include_examples 'parse', '\p{^Soft Dotted}', 0 => [:nonproperty, :soft_dotted] include_examples 'parse', '\p{^Soft-Dotted}', 0 => [:nonproperty, :soft_dotted] include_examples 'parse', '\p{^sOfT_dOtTeD}', 0 => [:nonproperty, :soft_dotted] # test P-negation include_examples 'parse', '\P{sd}', 0 => [:nonproperty, :soft_dotted] include_examples 'parse', '\P{SD}', 0 => [:nonproperty, :soft_dotted] include_examples 'parse', '\P{Soft Dotted}', 0 => [:nonproperty, :soft_dotted] include_examples 'parse', '\P{Soft-Dotted}', 0 => [:nonproperty, :soft_dotted] include_examples 'parse', '\P{sOfT_dOtTeD}', 0 => [:nonproperty, :soft_dotted] # double negation is positive again include_examples 'parse', '\P{^sd}', 0 => [:property, :soft_dotted] include_examples 'parse', '\P{^SD}', 0 => [:property, :soft_dotted] include_examples 'parse', '\P{^Soft Dotted}', 0 => [:property, :soft_dotted] include_examples 'parse', '\P{^Soft-Dotted}', 0 => [:property, :soft_dotted] include_examples 'parse', '\P{^sOfT_dOtTeD}', 0 => [:property, :soft_dotted] # test #shortcut include_examples 'parse', '\p{soft_dotted}', 0 => [:property, :soft_dotted, shortcut: 'sd'] include_examples 'parse', '\p{sd}', 0 => [:property, :soft_dotted, shortcut: 'sd'] include_examples 'parse', '\p{in_bengali}', 0 => [:property, :in_bengali, shortcut: nil] # test classification include_examples 'parse', '\p{age=5.2}', 0 => [UnicodeProperty::Age] include_examples 'parse', '\p{InArmenian}', 0 => [UnicodeProperty::Block] include_examples 'parse', '\p{Math}', 0 => [UnicodeProperty::Derived] include_examples 'parse', '\p{Emoji}', 0 => [UnicodeProperty::Emoji] include_examples 'parse', '\p{GraphemeClusterBreak=Extend}', 0 => [UnicodeProperty::Enumerated] include_examples 'parse', '\p{Hiragana}', 0 => [UnicodeProperty::Script] specify('parse abandoned newline property') do root = RP.parse('\p{newline}', 'ruby/1.9') expect(root.expressions.last).to be_a(UnicodeProperty::Base) expect { RP.parse('\p{newline}', 'ruby/2.0') }.to raise_error(Regexp::Syntax::NotImplementedError) end # cannot test older Rubies because of https://bugs.ruby-lang.org/issues/18686 if ruby_version_at_least('3.2.0') specify('parse all properties of current ruby') do unsupported = RegexpPropertyValues.all_for_current_ruby.reject do |prop| RP.parse("\\p{#{prop}}") rescue false end expect(unsupported).to be_empty end end # Ruby 2.3 supports a short prop name (sterm) without supporting the long name # of the same prop (sentence_terminal). Let's ignore this unique case. if ruby_version_at_least('2.4.0') specify('parse only properties of current ruby') do syntax = Regexp::Syntax.for("ruby/#{RUBY_VERSION}") excessive = syntax.features.fetch(:property, []).reject do |prop| begin Regexp.new("\\p{#{prop}}") rescue RegexpError, SyntaxError # error class depends on Ruby version false end end expect(excessive).to be_empty end end end ammar-regexp_parser-68cdeff/spec/parser/quantifiers_spec.rb000066400000000000000000000067111506175332700243770ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Quantifier parsing') do include_examples 'parse', /a?b/, [0, :q] => [:zero_or_one, text: '?', mode: :greedy, min: 0, max: 1, ts: 1] include_examples 'parse', /a??b/, [0, :q] => [:zero_or_one, text: '??', mode: :reluctant, min: 0, max: 1, ts: 1] include_examples 'parse', /a?+b/, [0, :q] => [:zero_or_one, text: '?+', mode: :possessive, min: 0, max: 1, ts: 1] include_examples 'parse', /a*b/, [0, :q] => [:zero_or_more, text: '*', mode: :greedy, min: 0, max: -1, ts: 1] include_examples 'parse', /a*?b/, [0, :q] => [:zero_or_more, text: '*?', mode: :reluctant, min: 0, max: -1, ts: 1] include_examples 'parse', /a*+b/, [0, :q] => [:zero_or_more, text: '*+', mode: :possessive, min: 0, max: -1, ts: 1] include_examples 'parse', /a+b/, [0, :q] => [:one_or_more, text: '+', mode: :greedy, min: 1, max: -1, ts: 1] include_examples 'parse', /a+?b/, [0, :q] => [:one_or_more, text: '+?', mode: :reluctant, min: 1, max: -1, ts: 1] include_examples 'parse', /a++b/, [0, :q] => [:one_or_more, text: '++', mode: :possessive, min: 1, max: -1, ts: 1] include_examples 'parse', /a{2,4}b/, [0, :q] => [:interval, text: '{2,4}', mode: :greedy, min: 2, max: 4, ts: 1] include_examples 'parse', /a{2,}b/, [0, :q] => [:interval, text: '{2,}', mode: :greedy, min: 2, max: -1, ts: 1] include_examples 'parse', /a{,3}b/, [0, :q] => [:interval, text: '{,3}', mode: :greedy, min: 0, max: 3, ts: 1] include_examples 'parse', /a{4}b/, [0, :q] => [:interval, text: '{4}', mode: :greedy, min: 4, max: 4, ts: 1] include_examples 'parse', /a{004}b/, [0, :q] => [:interval, text: '{004}', mode: :greedy, min: 4, max: 4, ts: 1] # special case: exps with chained quantifiers are wrapped in implicit passive groups include_examples 'parse', /a+{2}{3}/, [0] => [:group, :passive, Group::Passive, implicit?: true, level: 0], [0, :q] => [:quantifier, :interval, Quantifier, text: '{3}', level: 0], [0, 0] => [:group, :passive, Group::Passive, implicit?: true, level: 1], [0, 0, :q] => [:quantifier, :interval, Quantifier, text: '{2}', level: 1], [0, 0, 0] => [:literal, :literal, Literal, text: 'a', level: 2], [0, 0, 0, :q] => [:quantifier, :one_or_more, Quantifier, text: '+', level: 2] # Ruby does not support modes for intervals, following `?` and `+` are read as chained quantifiers include_examples 'parse', /a{2,4}?b/, [0, :q] => [:quantifier, :zero_or_one, Quantifier, text: '?', mode: :greedy, min: 0, max: 1, ts: 6], [0, 0, :q] => [:quantifier, :interval, Quantifier, text: '{2,4}', mode: :greedy, min: 2, max: 4, ts: 1] include_examples 'parse', /a{2,4}+b/, [0, :q] => [:quantifier, :one_or_more, Quantifier, text: '+', mode: :greedy, min: 1, max: -1, ts: 6], [0, 0, :q] => [:quantifier, :interval, Quantifier, text: '{2,4}', mode: :greedy, min: 2, max: 4, ts: 1] specify('mode-checking methods') do exp = RP.parse(/a??/).first expect(exp).to be_reluctant expect(exp).to be_lazy expect(exp).not_to be_greedy expect(exp).not_to be_possessive expect(exp.quantifier).to be_reluctant expect(exp.quantifier).to be_lazy expect(exp.quantifier).not_to be_greedy expect(exp.quantifier).not_to be_possessive end end ammar-regexp_parser-68cdeff/spec/parser/refcalls_spec.rb000066400000000000000000000124661506175332700236440ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Refcall parsing') do include_examples 'parse', /(abc)\1/, 1 => [Backreference::Number, reference: 1] include_examples 'parse', /(?abc)\k/, 1 => [Backreference::Name, name: 'X', reference: 'X'] include_examples 'parse', /(?abc)\k'X'/, 1 => [Backreference::Name, name: 'X', reference: 'X'] include_examples 'parse', /(abc)\k<1>/, 1 => [Backreference::Number, number: 1, reference: 1] include_examples 'parse', /(abc)\k<001>/, 1 => [Backreference::Number, number: 1, reference: 1] include_examples 'parse', /(abc)\k<-1>/, 1 => [Backreference::NumberRelative, number: -1, reference: 1] include_examples 'parse', /(abc)\k'-1'/, 1 => [Backreference::NumberRelative, number: -1, reference: 1] include_examples 'parse', /(abc)\k'-001'/, 1 => [Backreference::NumberRelative, number: -1, reference: 1] include_examples 'parse', /(?abc)\g/, 1 => [Backreference::NameCall, reference: 'X'] include_examples 'parse', /(abc)\g<1>/, 1 => [Backreference::NumberCall, reference: 1] include_examples 'parse', '(abc)\g<001>', 1 => [Backreference::NumberCall, reference: 1] include_examples 'parse', '\g<0>', 0 => [Backreference::NumberCall, reference: 0] include_examples 'parse', /(abc)\g<-1>/, 1 => [Backreference::NumberCallRelative, reference: 1] include_examples 'parse', /(abc)\g<-001>/, 1 => [Backreference::NumberCallRelative, reference: 1] include_examples 'parse', /\g<+1>(abc)/, 0 => [Backreference::NumberCallRelative, reference: 1] include_examples 'parse', /(?abc)\k/, 1 => [Backreference::NameRecursionLevel, name: 'X', recursion_level: 0] include_examples 'parse', /(abc)\k<1-0>/, 1 => [Backreference::NumberRecursionLevel, number: 1, recursion_level: 0] include_examples 'parse', /(abc)\k<1-0>/, 1 => [Backreference::NumberRecursionLevel, number: 1, recursion_level: 0] include_examples 'parse', /(abc)\k<-1+0>/, 1 => [Backreference::NumberRecursionLevel, number: -1, recursion_level: 0] include_examples 'parse', /(abc)\k<1+1>/, 1 => [Backreference::NumberRecursionLevel, number: 1, recursion_level: 1] include_examples 'parse', /(abc)\k<1-1>/, 1 => [Backreference::NumberRecursionLevel, number: 1, recursion_level: -1] # test #effective_number/#reference for complex cases include_examples 'parse', '(abc)(def)\k<-1>(ghi)\k<-3>\k<-1>', 2 => [:number_rel_ref, reference: 2], 4 => [:number_rel_ref, reference: 1], 5 => [:number_rel_ref, reference: 3] include_examples 'parse', '\g<+1>(abc)\g<+2>(def)(ghi)\g<-2>', 0 => [:number_rel_call, reference: 1], 2 => [:number_rel_call, reference: 3], 5 => [:number_rel_call, reference: 2] specify('parse backref referenced_expression') do root = RP.parse('(abc)(def)\\k<-1>(ghi)\\k<-3>\\k<-1>') exp1 = root[2] exp2 = root[4] exp3 = root[5] expect([exp1, exp2, exp3]).to all be_instance_of(Backreference::NumberRelative) expect(exp1.referenced_expression).to eq root[1] expect(exp1.referenced_expression.to_s).to eq '(def)' expect(exp2.referenced_expression).to eq root[0] expect(exp2.referenced_expression.to_s).to eq '(abc)' expect(exp3.referenced_expression).to eq root[3] expect(exp3.referenced_expression.to_s).to eq '(ghi)' end specify('parse backref referenced_expressions (multiplex)') do root = RP.parse('(?A)(?B)\\k') exp = root.last expect(exp.referenced_expressions).to eq [root[0], root[1]] expect(exp.referenced_expressions.map(&:to_s)).to eq ['(?A)', '(?B)'] end specify('parse backref call referenced_expression') do root = RP.parse('\\g<+1>(abc)\\g<+2>(def)(ghi)\\g<-2>') exp1 = root[0] exp2 = root[2] exp3 = root[5] expect([exp1, exp2, exp3]).to all be_instance_of(Backreference::NumberCallRelative) expect(exp1.referenced_expression).to eq root[1] expect(exp1.referenced_expression.to_s).to eq '(abc)' expect(exp2.referenced_expression).to eq root[4] expect(exp2.referenced_expression.to_s).to eq '(ghi)' expect(exp3.referenced_expression).to eq root[3] expect(exp3.referenced_expression.to_s).to eq '(def)' end specify('parse backref call referenced_expression root') do root = RP.parse('\g<0>') expect(root[0].referenced_expression).to eq root end specify('parse invalid reference') do expect { RP.parse('\1') }.to raise_error(/Invalid reference/) expect { RP.parse('1\1') }.to raise_error(/Invalid reference/) expect { RP.parse('\8') }.to raise_error(/Invalid reference/) expect { RP.parse('8\8') }.to raise_error(/Invalid reference/) expect { RP.parse('(a)\2') }.to raise_error(/Invalid reference/) expect { RP.parse('\k<1>') }.to raise_error(/Invalid reference/) expect { RP.parse('\k<+1>') }.to raise_error(/Invalid reference/) expect { RP.parse('\k<+2>(a)') }.to raise_error(/Invalid reference/) expect { RP.parse('\k<-1>') }.to raise_error(/Invalid reference/) expect { RP.parse('(a)\k<-2>') }.to raise_error(/Invalid reference/) expect { RP.parse('\k<1+1>') }.to raise_error(/Invalid reference/) expect { RP.parse('\k<1-1>') }.to raise_error(/Invalid reference/) expect { RP.parse('\k') }.to raise_error(/Invalid reference/) expect { RP.parse('(?)\k') }.to raise_error(/Invalid reference/) end end ammar-regexp_parser-68cdeff/spec/parser/set/000077500000000000000000000000001506175332700212745ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/spec/parser/set/intersections_spec.rb000066400000000000000000000074561506175332700255400ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' # edge cases with `...-&&...` and `...&&-...` are checked in ./ranges_spec.rb RSpec.describe('CharacterSet::Intersection parsing') do include_examples 'parse', /[a&&z]/, [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 2], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 0, 0] => [:literal, text: 'a'], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 1, 0] => [:literal, text: 'z'] include_examples 'parse', /[a-z&&[^a]]/, [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 2], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 0, 0] => [CharacterSet::Range, count: 2], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 1, 0] => [CharacterSet, count: 1] include_examples 'parse', /[a&&a-z]/, [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 2], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 0, 0] => [:literal, text: 'a'], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 1, 0] => [CharacterSet::Range, count: 2] include_examples 'parse', /[a&&\w]/, [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 2], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 1, 0] => [:word, text: '\w'] include_examples 'parse', /[\h&&\w&&efg]/, [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 3], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 0, 0] => [:hex, text: '\h'], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 1, 0] => [:word, text: '\w'], [0, 0, 2] => [CharacterSet::IntersectedSequence, count: 3], [0, 0, 2, 0] => [:literal, text: 'e'], [0, 0, 2, 1] => [:literal, text: 'f'], [0, 0, 2, 2] => [:literal, text: 'g'] # test correct ts values for empty sequences include_examples 'parse', /[&&]/, [0, 0] => [CharacterSet::Intersection, text: '&&', count: 2, ts: 1], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 0, ts: 1], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 0, ts: 3] # test correct ts values for non-empty sequences include_examples 'parse', /[ab&&cd&&ef]/, [0, 0] => [CharacterSet::Intersection, count: 3, text: '&&', ts: 1], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 2, to_s: 'ab', ts: 1], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 2, to_s: 'cd', ts: 5], [0, 0, 2] => [CharacterSet::IntersectedSequence, count: 2, to_s: 'ef', ts: 9] # Some edge-case patterns are evaluated with #match to make sure that # their matching behavior still reflects the way they are parsed. # #capturing_stderr is used to skip any warnings generated by this. specify('intersections behavior remains unchanged') do capturing_stderr do expect(/[a&&z]/).not_to match 'a' expect(/[a&&z]/).not_to match '&' expect(/[a&&z]/).not_to match 'z' expect(/[a-z&&[^a]]/).not_to match 'a' expect(/[a-z&&[^a]]/).not_to match '&' expect(/[a-z&&[^a]]/).to match 'b' expect(/[a&&a-z]/).to match 'a' expect(/[a&&a-z]/).not_to match '&' expect(/[a&&a-z]/).not_to match 'b' expect(/[a&&\w]/).to match 'a' expect(/[a&&\w]/).not_to match '&' expect(/[a&&\w]/).not_to match 'b' expect(/[\h&&\w&&efg]/).to match 'e' expect(/[\h&&\w&&efg]/).to match 'f' expect(/[\h&&\w&&efg]/).not_to match 'a' expect(/[\h&&\w&&efg]/).not_to match 'g' end end end ammar-regexp_parser-68cdeff/spec/parser/set/ranges_spec.rb000066400000000000000000000067201506175332700241170ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('CharacterSet::Range parsing') do include_examples 'parse', '[a-z]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Range, count: 2], [0, 0, 0] => [:literal, text: 'a'], [0, 0, 1] => [:literal, text: 'z'] include_examples 'parse', '[\x00-\x22]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Range, count: 2], [0, 0, 0] => [:hex, text: '\x00'], [0, 0, 1] => [:hex, text: '\x22'] include_examples 'parse', '[\u{40 42}-\u1234]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Range, count: 2], [0, 0, 0] => [:codepoint_list, text: '\u{40 42}'], [0, 0, 1] => [:codepoint, text: '\u1234'] include_examples 'parse', '[--z]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Range, count: 2], [0, 0, 0] => [:literal, text: '-'], [0, 0, 1] => [:literal, text: 'z'] include_examples 'parse', '[!--]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Range, count: 2], [0, 0, 0] => [:literal, text: '!'], [0, 0, 1] => [:literal, text: '-'] include_examples 'parse', '[!-^]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Range, count: 2], [0, 0, 0] => [:literal, text: '!'], [0, 0, 1] => [:literal, text: '^'] # edge cases that are NOT treated as range include_examples 'parse', '[^-z]', [0] => [CharacterSet, count: 2], [0, 0] => [:literal, text: '-'], [0, 1] => [:literal, text: 'z'] include_examples 'parse', '[[\-ab]&&-bc]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 2], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 0, 0] => [CharacterSet, count: 3], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 3], [0, 0, 1, 0] => [:literal, text: '-'] include_examples 'parse', '[bc-&&[\-ab]]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 2], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 3], [0, 0, 0, 2] => [:literal, text: '-'], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 1, 0] => [CharacterSet, count: 3] # Some edge-case patterns are evaluated with #match to make sure that # their matching behavior still reflects the way they are parsed. # #capturing_stderr is used to skip any warnings generated by this. specify('ranges behavior remains unchanged') do capturing_stderr do expect(Regexp.new('[\x00-\x22]')).to match "\x11" expect(Regexp.new('[\u{40 42}-\u1234]')).to match "\u0600" expect(Regexp.new('[--z]')).to match 'a' expect(Regexp.new('[!--]')).to match '$' expect(Regexp.new('[!-^]')).to match '$' # edge cases that are NOT treated as ranges expect(Regexp.new('[^-z]')).to match 'a' expect(Regexp.new('[^-z]')).not_to match 'z' expect(Regexp.new('[[\-ab]&&-bc]')).to match '-' expect(Regexp.new('[[\-ab]&&-bc]')).to match 'b' expect(Regexp.new('[[\-ab]&&-bc]')).not_to match 'a' expect(Regexp.new('[[\-ab]&&-bc]')).not_to match 'c' expect(Regexp.new('[bc-&&[\-ab]]')).to match '-' expect(Regexp.new('[bc-&&[\-ab]]')).to match 'b' expect(Regexp.new('[bc-&&[\-ab]]')).not_to match 'a' expect(Regexp.new('[bc-&&[\-ab]]')).not_to match 'c' end end end ammar-regexp_parser-68cdeff/spec/parser/sets_spec.rb000066400000000000000000000106441506175332700230230ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('CharacterSet parsing') do include_examples 'parse', /[ab]+/, [0] => [:set, :character, CharacterSet, text: '[', count: 2, quantified?: true], [0, 0] => [:literal, :literal, Literal, text: 'a', set_level: 1], [0, 1] => [:literal, :literal, Literal, text: 'b', set_level: 1] include_examples 'parse', /[a\dc]/, [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 1] => [:type, :digit, CharacterType::Digit] include_examples 'parse', /[a\bc]/, [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 1] => [:escape, :backspace, EscapeSequence::Backspace, text: '\b'] include_examples 'parse', '[a\xFz]', [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 1] => [:escape, :hex, EscapeSequence::Hex, text: '\xF'] include_examples 'parse', '[a\x20c]', [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 1] => [:escape, :hex, EscapeSequence::Hex, text: '\x20'] include_examples 'parse', '[a\77c]', [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 1] => [:escape, :octal, EscapeSequence::Octal, text: '\77'] include_examples 'parse', '[a\u0640c]', [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 1] => [:escape, :codepoint, EscapeSequence::Codepoint, text: '\u0640'] include_examples 'parse', '[a\u{41 1F60D}c]', [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 1] => [:escape, :codepoint_list, EscapeSequence::CodepointList, text: '\u{41 1F60D}'] include_examples 'parse', '[[:digit:][:^lower:]]+', [0] => [:set, :character, CharacterSet, text: '[', count: 2], [0, 0] => [:posixclass, :digit, PosixClass, text: '[:digit:]'], [0, 1] => [:nonposixclass, :lower, PosixClass, text: '[:^lower:]'] include_examples 'parse', '[a[b[c]d]e]', [0] => [:set, :character, CharacterSet, text: '[', count: 3, set_level: 0], [0, 0] => [:literal, :literal, Literal, text: 'a', set_level: 1], [0, 1] => [:set, :character, CharacterSet, text: '[', count: 3, set_level: 1], [0, 2] => [:literal, :literal, Literal, text: 'e', set_level: 1], [0, 1, 1] => [:set, :character, CharacterSet, text: '[', count: 1, set_level: 2], [0, 1, 1, 0] => [:literal, :literal, Literal, text: 'c', set_level: 3] include_examples 'parse', '[a[^b[c]]]', [0] => [:set, :character, CharacterSet, text: '[', count: 2, set_level: 0], [0, 0] => [:literal, :literal, Literal, text: 'a', set_level: 1], [0, 1] => [:set, :character, CharacterSet, text: '[', count: 2, set_level: 1], [0, 1, 0] => [:literal, :literal, Literal, text: 'b', set_level: 2], [0, 1, 1] => [:set, :character, CharacterSet, text: '[', count: 1, set_level: 2], [0, 1, 1, 0] => [:literal, :literal, Literal, text: 'c', set_level: 3] include_examples 'parse', '[aaa]', [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 0] => [:literal, :literal, Literal, text: 'a'], [0, 1] => [:literal, :literal, Literal, text: 'a'], [0, 2] => [:literal, :literal, Literal, text: 'a'] include_examples 'parse', '[ ]', [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 0] => [:literal, :literal, Literal, text: ' '], [0, 1] => [:literal, :literal, Literal, text: ' '], [0, 2] => [:literal, :literal, Literal, text: ' '] include_examples 'parse', '(?x)[ ]', # shouldn't merge whitespace even in x-mode [1] => [:set, :character, CharacterSet, text: '[', count: 3], [1, 0] => [:literal, :literal, Literal, text: ' '], [1, 1] => [:literal, :literal, Literal, text: ' '], [1, 2] => [:literal, :literal, Literal, text: ' '] include_examples 'parse', '[[.span-ll.]]', # collating sequences are disabled in Onigmo [0, 0] => [:set, :character, CharacterSet, text: '[', count: 7], [0, 0, 0] => [:literal, :literal, Literal, text: '.'] include_examples 'parse', '[[=e=]]', # character equivalents are disabled in Onigmo [0, 0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 0, 0] => [:literal, :literal, Literal, text: '='] end ammar-regexp_parser-68cdeff/spec/parser/types_spec.rb000066400000000000000000000016741506175332700232140ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('CharacterType parsing') do include_examples 'parse', /a\dc/, 1 => [:type, :digit, CharacterType::Digit] include_examples 'parse', /a\Dc/, 1 => [:type, :nondigit, CharacterType::NonDigit] include_examples 'parse', /a\sc/, 1 => [:type, :space, CharacterType::Space] include_examples 'parse', /a\Sc/, 1 => [:type, :nonspace, CharacterType::NonSpace] include_examples 'parse', /a\hc/, 1 => [:type, :hex, CharacterType::Hex] include_examples 'parse', /a\Hc/, 1 => [:type, :nonhex, CharacterType::NonHex] include_examples 'parse', /a\wc/, 1 => [:type, :word, CharacterType::Word] include_examples 'parse', /a\Wc/, 1 => [:type, :nonword, CharacterType::NonWord] include_examples 'parse', 'a\Rc', 1 => [:type, :linebreak, CharacterType::Linebreak] include_examples 'parse', 'a\Xc', 1 => [:type, :xgrapheme, CharacterType::ExtendedGrapheme] end ammar-regexp_parser-68cdeff/spec/scanner/000077500000000000000000000000001506175332700206365ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/spec/scanner/all_spec.rb000066400000000000000000000010071506175332700227430ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Scanner) do specify('scanner returns an array') do expect(RS.scan('abc')).to be_instance_of(Array) end specify('scanner returns tokens as arrays') do tokens = RS.scan('^abc+[^one]{2,3}\b\d\C-C$') expect(tokens).to all(be_a Array) expect(tokens.map(&:length)).to all(eq 5) end specify('scanner token count') do re = /^(one|two){2,3}([^d\]efm-qz\,\-]*)(ghi)+$/i expect(RS.scan(re).length).to eq 28 end end ammar-regexp_parser-68cdeff/spec/scanner/anchors_spec.rb000066400000000000000000000023161506175332700236340ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Anchor scanning') do include_examples 'scan', '^abc', 0 => [:anchor, :bol, '^', 0, 1] include_examples 'scan', 'abc$', 1 => [:anchor, :eol, '$', 3, 4] include_examples 'scan', '\Aabc', 0 => [:anchor, :bos, '\A', 0, 2] include_examples 'scan', 'abc\z', 1 => [:anchor, :eos, '\z', 3, 5] include_examples 'scan', 'abc\Z', 1 => [:anchor, :eos_ob_eol, '\Z', 3, 5] include_examples 'scan', 'a\bc', 1 => [:anchor, :word_boundary, '\b', 1, 3] include_examples 'scan', 'a\Bc', 1 => [:anchor, :nonword_boundary, '\B', 1, 3] include_examples 'scan', 'a\Gc', 1 => [:anchor, :match_start, '\G', 1, 3] include_examples 'scan', "\\\\Ac", 0 => [:escape, :backslash, '\\\\', 0, 2] include_examples 'scan', "a\\\\z", 1 => [:escape, :backslash, '\\\\', 1, 3] include_examples 'scan', "a\\\\Z", 1 => [:escape, :backslash, '\\\\', 1, 3] include_examples 'scan', "a\\\\bc", 1 => [:escape, :backslash, '\\\\', 1, 3] include_examples 'scan', "a\\\\Bc", 1 => [:escape, :backslash, '\\\\', 1, 3] end ammar-regexp_parser-68cdeff/spec/scanner/conditionals_spec.rb000066400000000000000000000200261506175332700246630ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Conditional scanning') do include_examples 'scan', /(a)(?(1)T|F)1/, 3 => [:conditional, :open, '(?', 3, 5] include_examples 'scan', /(a)(?(1)T|F)2/, 4 => [:conditional, :condition_open, '(', 5, 6] include_examples 'scan', /(a)(?(1)T|F)3/, 5 => [:conditional, :condition, '1', 6, 7] include_examples 'scan', /(a)(?(1)T|F)4/, 6 => [:conditional, :condition_close, ')', 7, 8] include_examples 'scan', /(a)(?(1)T|F)5/, 7 => [:literal, :literal, 'T', 8, 9] include_examples 'scan', /(a)(?(1)T|F)6/, 8 => [:conditional, :separator, '|', 9, 10] include_examples 'scan', /(a)(?(1)T|F)7/, 9 => [:literal, :literal, 'F', 10, 11] include_examples 'scan', /(a)(?(1)T|F)8/, 10 => [:conditional, :close, ')', 11, 12] include_examples 'scan', /(a)(?(1)TRUE)9/, 8 => [:conditional, :close, ')', 12, 13] include_examples 'scan', /(a)(?(1)TRUE|)10/, 8 => [:conditional, :separator, '|', 12, 13] include_examples 'scan', /(a)(?(1)TRUE|)11/, 9 => [:conditional, :close, ')', 13, 14] include_examples 'scan', /(?A)(?()T|F)1/, 5 => [:conditional, :condition, '', 10, 13] include_examples 'scan', /(?'N'A)(?('N')T|F)2/, 5 => [:conditional, :condition, "'N'", 10, 13] include_examples 'scan', /(a)(?(001)T)/, 5 => [:conditional, :condition, '001', 6, 9] include_examples 'scan', /(a(b(c)))(?(1)(?(2)d|(?(3)e|f))|(?(2)(?(1)g|h)))/, 0 => [:group, :capture, '(', 0, 1], 1 => [:literal, :literal, 'a', 1, 2], 2 => [:group, :capture, '(', 2, 3], 3 => [:literal, :literal, 'b', 3, 4], 4 => [:group, :capture, '(', 4, 5], 5 => [:literal, :literal, 'c', 5, 6], 6 => [:group, :close, ')', 6, 7], 7 => [:group, :close, ')', 7, 8], 8 => [:group, :close, ')', 8, 9], 9 => [:conditional, :open, '(?', 9, 11], 10 => [:conditional, :condition_open, '(', 11, 12], 11 => [:conditional, :condition, '1', 12, 13], 12 => [:conditional, :condition_close, ')', 13, 14], 13 => [:conditional, :open, '(?', 14, 16], 14 => [:conditional, :condition_open, '(', 16, 17], 15 => [:conditional, :condition, '2', 17, 18], 16 => [:conditional, :condition_close, ')', 18, 19], 17 => [:literal, :literal, 'd', 19, 20], 18 => [:conditional, :separator, '|', 20, 21], 19 => [:conditional, :open, '(?', 21, 23], 20 => [:conditional, :condition_open, '(', 23, 24], 21 => [:conditional, :condition, '3', 24, 25], 22 => [:conditional, :condition_close, ')', 25, 26], 23 => [:literal, :literal, 'e', 26, 27], 24 => [:conditional, :separator, '|', 27, 28], 25 => [:literal, :literal, 'f', 28, 29], 26 => [:conditional, :close, ')', 29, 30], 27 => [:conditional, :close, ')', 30, 31], 28 => [:conditional, :separator, '|', 31, 32], 29 => [:conditional, :open, '(?', 32, 34], 30 => [:conditional, :condition_open, '(', 34, 35], 31 => [:conditional, :condition, '2', 35, 36], 32 => [:conditional, :condition_close, ')', 36, 37], 33 => [:conditional, :open, '(?', 37, 39], 34 => [:conditional, :condition_open, '(', 39, 40], 35 => [:conditional, :condition, '1', 40, 41], 36 => [:conditional, :condition_close, ')', 41, 42], 37 => [:literal, :literal, 'g', 42, 43], 38 => [:conditional, :separator, '|', 43, 44], 39 => [:literal, :literal, 'h', 44, 45], 40 => [:conditional, :close, ')', 45, 46], 41 => [:conditional, :close, ')', 46, 47], 42 => [:conditional, :close, ')', 47, 48] include_examples 'scan', /((a)|(b)|((?(2)(c(d|e)+)?|(?(3)f|(?(4)(g|(h)(i)))))))/, 0 => [:group, :capture, '(', 0, 1], 1 => [:group, :capture, '(', 1, 2], 2 => [:literal, :literal, 'a', 2, 3], 3 => [:group, :close, ')', 3, 4], 4 => [:meta, :alternation, '|', 4, 5], 5 => [:group, :capture, '(', 5, 6], 6 => [:literal, :literal, 'b', 6, 7], 7 => [:group, :close, ')', 7, 8], 8 => [:meta, :alternation, '|', 8, 9], 9 => [:group, :capture, '(', 9, 10], 10 => [:conditional, :open, '(?', 10, 12], 11 => [:conditional, :condition_open, '(', 12, 13], 12 => [:conditional, :condition, '2', 13, 14], 13 => [:conditional, :condition_close, ')', 14, 15], 14 => [:group, :capture, '(', 15, 16], 15 => [:literal, :literal, 'c', 16, 17], 16 => [:group, :capture, '(', 17, 18], 17 => [:literal, :literal, 'd', 18, 19], 18 => [:meta, :alternation, '|', 19, 20], 19 => [:literal, :literal, 'e', 20, 21], 20 => [:group, :close, ')', 21, 22], 21 => [:quantifier, :one_or_more, '+', 22, 23], 22 => [:group, :close, ')', 23, 24], 23 => [:quantifier, :zero_or_one, '?', 24, 25], 24 => [:conditional, :separator, '|', 25, 26], 25 => [:conditional, :open, '(?', 26, 28], 26 => [:conditional, :condition_open, '(', 28, 29], 27 => [:conditional, :condition, '3', 29, 30], 28 => [:conditional, :condition_close, ')', 30, 31], 29 => [:literal, :literal, 'f', 31, 32], 30 => [:conditional, :separator, '|', 32, 33], 31 => [:conditional, :open, '(?', 33, 35], 32 => [:conditional, :condition_open, '(', 35, 36], 33 => [:conditional, :condition, '4', 36, 37], 34 => [:conditional, :condition_close, ')', 37, 38], 35 => [:group, :capture, '(', 38, 39], 36 => [:literal, :literal, 'g', 39, 40], 37 => [:meta, :alternation, '|', 40, 41], 38 => [:group, :capture, '(', 41, 42], 39 => [:literal, :literal, 'h', 42, 43], 40 => [:group, :close, ')', 43, 44], 41 => [:group, :capture, '(', 44, 45], 42 => [:literal, :literal, 'i', 45, 46], 43 => [:group, :close, ')', 46, 47], 44 => [:group, :close, ')', 47, 48], 45 => [:conditional, :close, ')', 48, 49], 46 => [:conditional, :close, ')', 49, 50], 47 => [:conditional, :close, ')', 50, 51], 48 => [:group, :close, ')', 51, 52], 49 => [:group, :close, ')', 52, 53] include_examples 'scan', /(a)(?(1)(b|c|d)|(e|f|g))(h)(?(2)(i|j|k)|(l|m|n))|o|p/, 9 => [:meta, :alternation, '|', 10, 11], 11 => [:meta, :alternation, '|', 12, 13], 14 => [:conditional, :separator, '|', 15, 16], 17 => [:meta, :alternation, '|', 18, 19], 19 => [:meta, :alternation, '|', 20, 21], 32 => [:meta, :alternation, '|', 34, 35], 34 => [:meta, :alternation, '|', 36, 37], 37 => [:conditional, :separator, '|', 39, 40], 40 => [:meta, :alternation, '|', 42, 43], 42 => [:meta, :alternation, '|', 44, 45], 46 => [:meta, :alternation, '|', 48, 49], 48 => [:meta, :alternation, '|', 50, 51] include_examples 'scan', /(a)(?(1)b|c(?#hello)d)/, 9 => [:literal, :literal, 'c', 10, 11], 10 => [:group, :comment, '(?#hello)', 11, 20], 11 => [:literal, :literal, 'd', 20, 21], 12 => [:conditional, :close, ')', 21, 22] end ammar-regexp_parser-68cdeff/spec/scanner/delimiters_spec.rb000066400000000000000000000034561506175332700243460ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Literal delimiter scanning') do include_examples 'scan', '}', 0 => [:literal, :literal, '}', 0, 1] include_examples 'scan', '}}', 0 => [:literal, :literal, '}}', 0, 2] include_examples 'scan', '{', 0 => [:literal, :literal, '{', 0, 1] include_examples 'scan', '{{', 0 => [:literal, :literal, '{{', 0, 2] include_examples 'scan', '{}', 0 => [:literal, :literal, '{}', 0, 2] include_examples 'scan', '}{', 0 => [:literal, :literal, '}{', 0, 2] include_examples 'scan', '}{+', 0 => [:literal, :literal, '}{', 0, 2] include_examples 'scan', '{{var}}', 0 => [:literal, :literal, '{{var}}', 0, 7] include_examples 'scan', 'a{1,2', 0 => [:literal, :literal, 'a{1,2', 0, 5] include_examples 'scan', '({.+})', 0 => [:group, :capture, '(', 0, 1], 1 => [:literal, :literal, '{', 1, 2], 2 => [:meta, :dot, '.', 2, 3], 3 => [:quantifier, :one_or_more, '+', 3, 4], 4 => [:literal, :literal, '}', 4, 5], 5 => [:group, :close, ')', 5, 6] include_examples 'scan', ']', 0 => [:literal, :literal, ']', 0, 1] include_examples 'scan', ']]', 0 => [:literal, :literal, ']]', 0, 2] include_examples 'scan', ']\[', 0 => [:literal, :literal, ']', 0, 1], 1 => [:escape, :set_open, '\[', 1, 3] include_examples 'scan', '()', 0 => [:group, :capture, '(', 0, 1], 1 => [:group, :close, ')', 1, 2] end ammar-regexp_parser-68cdeff/spec/scanner/errors_spec.rb000066400000000000000000000147431506175332700235220ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Scanner) do RSpec.shared_examples 'scan error' do |error, issue, source| it "raises #{error} for #{issue} `#{source}`" do expect { RS.scan(source) }.to raise_error(error) end end include_examples 'scan error', RS::PrematureEndError, 'unbalanced set', '[a' include_examples 'scan error', RS::PrematureEndError, 'unbalanced set', '[a[b]' include_examples 'scan error', RS::PrematureEndError, 'unbalanced set', '[[:alpha:]' include_examples 'scan error', RS::PrematureEndError, 'unbalanced group', '(abc' include_examples 'scan error', RS::PrematureEndError, 'eof in property', '\p{asci' include_examples 'scan error', RS::PrematureEndError, 'incomplete property', '\p{ascii abc' include_examples 'scan error', RS::PrematureEndError, 'eof options', '(?mix' include_examples 'scan error', RS::PrematureEndError, 'eof escape', '\\' include_examples 'scan error', RS::PrematureEndError, 'eof in hex escape', '\x' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u0' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u00' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u000' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u{' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u{00' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u{0000' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u{0000 ' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u{0000 0000' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\c' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\c\M' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\c\M-' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\C' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\C-' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\C-\M' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\C-\M-' include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M' include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M-' include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M-\\' include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M-\c' include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M-\C' include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M-\C-' include_examples 'scan error', RS::InvalidSequenceError, 'invalid hex', '\xZ' include_examples 'scan error', RS::InvalidSequenceError, 'invalid hex', '\xZ0' include_examples 'scan error', RS::InvalidSequenceError, 'invalid hex', '\x{' include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\cü' include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\c\M-ü' include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\C-ü' include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\C-\M-ü' include_examples 'scan error', RS::InvalidSequenceError, 'invalid m-seq', '\M-ü' include_examples 'scan error', RS::InvalidSequenceError, 'invalid m-seq', '\M-\cü' include_examples 'scan error', RS::InvalidSequenceError, 'invalid m-seq', '\M-\C-ü' include_examples 'scan error', RS::ScannerError, 'invalid c-seq', '\Ca' include_examples 'scan error', RS::ScannerError, 'invalid m-seq', '\Ma' include_examples 'scan error', RS::InvalidGroupError, 'invalid group', ")" include_examples 'scan error', RS::InvalidGroupError, 'invalid group', "())" include_examples 'scan error', RS::InvalidGroupError, 'invalid group', "(?'')" include_examples 'scan error', RS::InvalidGroupError, 'invalid group', "(?''empty-name)" include_examples 'scan error', RS::InvalidGroupError, 'invalid group', '(?<>)' include_examples 'scan error', RS::InvalidGroupError, 'invalid group', '(?<>empty-name)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid option', '(?foo)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid option', '(?mix abc)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid option', '(?mix^bc' include_examples 'scan error', RS::InvalidGroupOption, 'invalid option', '(?)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid neg option', '(?-foo)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid neg option', '(?-u)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid neg option', '(?-mixu)' include_examples 'scan error', RS::InvalidBackrefError, 'invalid backref', '\k<>' include_examples 'scan error', RS::InvalidBackrefError, 'invalid backref', '\k\'\'' include_examples 'scan error', RS::InvalidBackrefError, 'invalid backref', '\k<0>' include_examples 'scan error', RS::InvalidBackrefError, 'invalid backref', '\k\'0\'' include_examples 'scan error', RS::InvalidBackrefError, 'invalid backref', '\k<-0>' include_examples 'scan error', RS::InvalidBackrefError, 'invalid backref', '\k<000>' include_examples 'scan error', RS::InvalidBackrefError, 'invalid backref', '\k<-000>' include_examples 'scan error', RS::InvalidBackrefError, 'invalid refcall', '\g<>' include_examples 'scan error', RS::InvalidBackrefError, 'invalid refcall', '\g\'\'' include_examples 'scan error', RS::InvalidBackrefError, 'invalid refcall', '\g<000>' include_examples 'scan error', RS::InvalidBackrefError, 'invalid refcall', '\g<-000>' include_examples 'scan error', RS::InvalidBackrefError, 'invalid condition', '(a)(?(0)b)' include_examples 'scan error', RS::InvalidBackrefError, 'invalid condition', '(a)(?(000)b)' include_examples 'scan error', RS::UnknownUnicodePropertyError, 'unknown property', '\p{foobar}' include_examples 'scan error', RS::UnknownPosixClassError, 'unknown POSIX class [::]', '[[::]]' include_examples 'scan error', RS::UnknownPosixClassError, 'unknown POSIX class [:^:]', '[[:^:]]' include_examples 'scan error', RS::UnknownPosixClassError, 'unknown POSIX class [:x:]', '[[:x:]]' include_examples 'scan error', RS::UnknownPosixClassError, 'unknown POSIX class', '[[:^x:]]' include_examples 'scan error', RS::UnknownPosixClassError, 'unknown POSIX class', '[[:WORD:]]' end ammar-regexp_parser-68cdeff/spec/scanner/escapes_spec.rb000066400000000000000000000164471506175332700236340ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Escape scanning') do include_examples 'scan', /c\at/, 1 => [:escape, :bell, '\a', 1, 3] # not an escape outside a character set include_examples 'scan', /c\bt/, 1 => [:anchor, :word_boundary, '\b', 1, 3] include_examples 'scan', /c\ft/, 1 => [:escape, :form_feed, '\f', 1, 3] include_examples 'scan', /c\nt/, 1 => [:escape, :newline, '\n', 1, 3] include_examples 'scan', /c\tt/, 1 => [:escape, :tab, '\t', 1, 3] include_examples 'scan', /c\vt/, 1 => [:escape, :vertical_tab, '\v', 1, 3] # ineffectual literal escapes # these cause "Unknown escape" warnings in Ruby for ascii chars, # and simply drop the backslash for non-ascii chars (/\ü/.inspect == '/ü/'). # In terms of matching, Ruby treats them both like non-escaped literals. include_examples 'scan', 'c\qt', 1 => [:escape, :literal, '\q', 1, 3] include_examples 'scan', 'a\üc', 1 => [:escape, :literal, '\ü', 1, 3] include_examples 'scan', 'a\😋c', 1 => [:escape, :literal, '\😋', 1, 3] # these incomplete ref/call sequences are treated as literal escapes by Ruby include_examples 'scan', 'c\gt', 1 => [:escape, :literal, '\g', 1, 3] include_examples 'scan', 'c\kt', 1 => [:escape, :literal, '\k', 1, 3] include_examples 'scan', 'a\012c', 1 => [:escape, :octal, '\012', 1, 5] include_examples 'scan', 'a\0124', 1 => [:escape, :octal, '\012', 1, 5] include_examples 'scan', '\712+7', 0 => [:escape, :octal, '\712', 0, 4] # Multi-digit escaped numbers that occur before sufficient capturing groups # have been opened are treated as octal or literal. # "\10"[/\10()()()()()()()()()()/] # => "\b" # treated as octal # "\70"[/\70()()()()()()()()()()/] # => "8" # treated as octal # "90"[/\90()()()()()()()()()()/] # => "90" # treated as literal # For cases treated as backrefs, see ./refcalls_spec.rb include_examples 'scan', "\\10#{'()' * 10}",0 => [:escape, :octal, '\10', 0, 3] include_examples 'scan', "\\90#{'()' * 90}",0 => [:escape, :literal, '\9', 0, 2], 1 => [:literal, :literal, '0', 2, 3] # special case: "out-of-bound octal escapes" (digits > 7) are not treated as backrefs include_examples 'scan', '\80', 0 => [:escape, :literal, '\8', 0, 2] include_examples 'scan', '\80', 1 => [:literal, :literal, '0', 2, 3] include_examples 'scan', 'a\xA', 1 => [:escape, :hex, '\xA', 1, 4] include_examples 'scan', 'a\x24c', 1 => [:escape, :hex, '\x24', 1, 5] include_examples 'scan', 'a\x0640c', 1 => [:escape, :hex, '\x06', 1, 5] include_examples 'scan', 'a\xE2\x82\xAC', 1 => [:escape, :utf8_hex, '\xE2\x82\xAC', 1, 13] include_examples 'scan', /a\xE2\x82\xAC/n, 1 => [:escape, :hex, '\xE2', 1, 5] include_examples 'scan', /a\xE2\x82\xAC/n, 2 => [:escape, :hex, '\x82', 5, 9] include_examples 'scan', /a\xE2\x82\xAC/n, 3 => [:escape, :hex, '\xAC', 9, 13] include_examples 'scan', 'a\u0640c', 1 => [:escape, :codepoint, '\u0640', 1, 7] include_examples 'scan', 'a\u{640 0641}c', 1 => [:escape, :codepoint_list, '\u{640 0641}', 1, 13] include_examples 'scan', 'a\u{10FFFF}c', 1 => [:escape, :codepoint_list, '\u{10FFFF}', 1, 11] include_examples 'scan', 'ab\\\xcd', 1 => [:escape, :backslash, '\\\\', 2, 4] include_examples 'scan', 'ab\\\0cd', 1 => [:escape, :backslash, '\\\\', 2, 4] include_examples 'scan', 'ab\\\Kcd', 1 => [:escape, :backslash, '\\\\', 2, 4] include_examples 'scan', 'ab\^cd', 1 => [:escape, :bol, '\^', 2, 4] include_examples 'scan', 'ab\$cd', 1 => [:escape, :eol, '\$', 2, 4] include_examples 'scan', 'ab\[cd', 1 => [:escape, :set_open, '\[', 2, 4] # escaped whitespace in x-mode include_examples 'scan', /a\ b/x, 0 => [:literal, :literal, 'a', 0, 1], 1 => [:escape, :literal, '\ ', 1, 3], 2 => [:literal, :literal, 'b', 3, 4] # newline literals can't be escaped in x-mode, c.f. https://bugs.ruby-lang.org/issues/19639 include_examples 'scan', /a\ b/x, 0 => [:literal, :literal, 'ab', 0, 2] # Meta/control escapes # # After the following fix in Ruby 3.1, a Regexp#source containing meta/control # escapes can only be set with the Regexp::new constructor. # In Regexp literals, these escapes are now pre-processed to hex escapes. # # https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 n = ->(regexp_body){ Regexp.new(regexp_body.dup.force_encoding('ascii-8bit')) } include_examples 'scan', 'a\cBc', 1 => [:escape, :control, '\cB', 1, 4] include_examples 'scan', 'a\c^c', 1 => [:escape, :control, '\c^', 1, 4] include_examples 'scan', 'a\c\n', 1 => [:escape, :control, '\c\n', 1, 5] include_examples 'scan', 'a\c\\\\b', 1 => [:escape, :control, '\c\\\\', 1, 5] include_examples 'scan', 'a\C-bc', 1 => [:escape, :control, '\C-b', 1, 5] include_examples 'scan', 'a\C-^b', 1 => [:escape, :control, '\C-^', 1, 5] include_examples 'scan', 'a\C-\nb', 1 => [:escape, :control, '\C-\n', 1, 6] include_examples 'scan', 'a\C-\\\\b', 1 => [:escape, :control, '\C-\\\\', 1, 6] include_examples 'scan', n.('a\c\M-Bc'), 1 => [:escape, :control, '\c\M-B', 1, 7] include_examples 'scan', n.('a\C-\M-Bc'), 1 => [:escape, :control, '\C-\M-B', 1, 8] include_examples 'scan', n.('a\M-Bc'), 1 => [:escape, :meta_sequence, '\M-B', 1, 5] include_examples 'scan', n.('a\M-\cBc'), 1 => [:escape, :meta_sequence, '\M-\cB', 1, 7] include_examples 'scan', n.('a\M-\c^'), 1 => [:escape, :meta_sequence, '\M-\c^', 1, 7] include_examples 'scan', n.('a\M-\c\n'), 1 => [:escape, :meta_sequence, '\M-\c\n', 1, 8] include_examples 'scan', n.('a\M-\c\\\\'), 1 => [:escape, :meta_sequence, '\M-\c\\\\', 1, 8] include_examples 'scan', n.('a\M-\C-Bc'), 1 => [:escape, :meta_sequence, '\M-\C-B', 1, 8] include_examples 'scan', n.('a\M-\C-\\\\'), 1 => [:escape, :meta_sequence, '\M-\C-\\\\', 1, 9] end ammar-regexp_parser-68cdeff/spec/scanner/free_space_spec.rb000066400000000000000000000166001506175332700242740ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('FreeSpace scanning') do describe('scan free space tokens') do let(:tokens) { RS.scan(/ a b ? c * d {2,3} e + | f + /x) } 0.upto(24).select(&:even?).each do |i| it "scans #{i} as free space" do expect(tokens[i][0]).to eq :free_space expect(tokens[i][1]).to eq :whitespace end end 0.upto(24).reject(&:even?).each do |i| it "does not scan #{i} as free space" do expect(tokens[i][0]).not_to eq :free_space expect(tokens[i][1]).not_to eq :whitespace end end it 'sets the correct text' do [0, 2, 10, 14].each { |i| expect(tokens[i][2]).to eq "\n " } [4, 6, 8, 12].each { |i| expect(tokens[i][2]).to eq ' ' } end end describe('scan free space comments') do include_examples 'scan', / a + # A + comment b ? # B ? comment c {2,3} # C {2,3} comment d + | e + # D|E comment /x, 5 => [:free_space, :comment, "# A + comment\n", 11, 25], 11 => [:free_space, :comment, "# B ? comment\n", 37, 51], 17 => [:free_space, :comment, "# C {2,3} comment\n", 66, 84], 29 => [:free_space, :comment, "# D|E comment\n", 100, 114] # single line / no trailing newline (c.f. issue #66) include_examples 'scan', /a # b/x, 0 => [:literal, :literal, 'a', 0, 1], 1 => [:free_space, :whitespace, ' ', 1, 2], 2 => [:free_space, :comment, "# b", 2, 5] # without spaces (c.f. issue #66) include_examples 'scan', /a#b/x, 0 => [:literal, :literal, 'a', 0, 1], 1 => [:free_space, :comment, "#b", 1, 3] end describe('scan free space inlined') do include_examples 'scan', /a b(?x:c d e)f g/, 0 => [:literal, :literal, 'a b', 0, 3], 1 => [:group, :options, '(?x:', 3, 7], 2 => [:literal, :literal, 'c', 7, 8], 3 => [:free_space, :whitespace, ' ', 8, 9], 4 => [:literal, :literal, 'd', 9, 10], 5 => [:free_space, :whitespace, ' ', 10, 11], 6 => [:literal, :literal, 'e', 11, 12], 7 => [:group, :close, ')', 12, 13], 8 => [:literal, :literal, 'f g', 13, 16] end describe('scan free space nested') do include_examples 'scan', /a b(?x:c d(?-x:e f)g h)i j/, 0 => [:literal, :literal, 'a b', 0, 3], 1 => [:group, :options, '(?x:', 3, 7], 2 => [:literal, :literal, 'c', 7, 8], 3 => [:free_space, :whitespace, ' ', 8, 9], 4 => [:literal, :literal, 'd', 9, 10], 5 => [:group, :options, '(?-x:', 10, 15], 6 => [:literal, :literal, 'e f', 15, 18], 7 => [:group, :close, ')', 18, 19], 8 => [:literal, :literal, 'g', 19, 20], 9 => [:free_space, :whitespace, ' ', 20, 21], 10 => [:literal, :literal, 'h', 21, 22], 11 => [:group, :close, ')', 22, 23], 12 => [:literal, :literal, 'i j', 23, 26] end describe('scan free space nested groups') do include_examples 'scan', /(a (b(?x: (c d) (?-x:(e f) )g) h)i j)/, 0 => [:group, :capture, '(', 0, 1], 1 => [:literal, :literal, 'a ', 1, 3], 2 => [:group, :capture, '(', 3, 4], 3 => [:literal, :literal, 'b', 4, 5], 4 => [:group, :options, '(?x:', 5, 9], 5 => [:free_space, :whitespace, ' ', 9, 10], 6 => [:group, :capture, '(', 10, 11], 7 => [:literal, :literal, 'c', 11, 12], 8 => [:free_space, :whitespace, ' ', 12, 13], 9 => [:literal, :literal, 'd', 13, 14], 10 => [:group, :close, ')', 14, 15], 11 => [:free_space, :whitespace, ' ', 15, 16], 12 => [:group, :options, '(?-x:', 16, 21], 13 => [:group, :capture, '(', 21, 22], 14 => [:literal, :literal, 'e f', 22, 25], 15 => [:group, :close, ')', 25, 26], 16 => [:literal, :literal, ' ', 26, 27], 17 => [:group, :close, ')', 27, 28], 18 => [:literal, :literal, 'g', 28, 29], 19 => [:group, :close, ')', 29, 30], 20 => [:literal, :literal, ' h', 30, 32], 21 => [:group, :close, ')', 32, 33], 22 => [:literal, :literal, 'i j', 33, 36], 23 => [:group, :close, ')', 36, 37] include_examples 'scan', /(?x:(?#hello) ) /, 2 => [:free_space, :whitespace, ' ', 13, 14], 4 => [:literal, :literal, ' ', 15, 16] end describe('scan free space switch groups') do include_examples 'scan', /(a (b((?x) (c d) ((?-x)(e f) )g) h)i j)/, 0 => [:group, :capture, '(', 0, 1], 1 => [:literal, :literal, 'a ', 1, 3], 2 => [:group, :capture, '(', 3, 4], 3 => [:literal, :literal, 'b', 4, 5], 4 => [:group, :capture, '(', 5, 6], 5 => [:group, :options_switch, '(?x', 6, 9], 6 => [:group, :close, ')', 9, 10], 7 => [:free_space, :whitespace, ' ', 10, 11], 8 => [:group, :capture, '(', 11, 12], 9 => [:literal, :literal, 'c', 12, 13], 10 => [:free_space, :whitespace, ' ', 13, 14], 11 => [:literal, :literal, 'd', 14, 15], 12 => [:group, :close, ')', 15, 16], 13 => [:free_space, :whitespace, ' ', 16, 17], 14 => [:group, :capture, '(', 17, 18], 15 => [:group, :options_switch, '(?-x', 18, 22], 16 => [:group, :close, ')', 22, 23], 17 => [:group, :capture, '(', 23, 24], 18 => [:literal, :literal, 'e f', 24, 27], 19 => [:group, :close, ')', 27, 28], 20 => [:literal, :literal, ' ', 28, 29], 21 => [:group, :close, ')', 29, 30], 22 => [:literal, :literal, 'g', 30, 31], 23 => [:group, :close, ')', 31, 32], 24 => [:literal, :literal, ' h', 32, 34], 25 => [:group, :close, ')', 34, 35], 26 => [:literal, :literal, 'i j', 35, 38], 27 => [:group, :close, ')', 38, 39] end describe('scanning `#` in regular (non-x mode)') do # c.f. issue 70 include_examples 'scan', /a#bcd/, 0 => [:literal, :literal, 'a#bcd', 0, 5] include_examples 'scan', /a # bcd/, 0 => [:literal, :literal, 'a # bcd', 0, 7] include_examples 'scan', /a#\d/, 0 => [:literal, :literal, 'a#', 0, 2], 1 => [:type, :digit, '\d', 2, 4] include_examples 'scan', /a # \d/, 0 => [:literal, :literal, 'a # ', 0, 4], 1 => [:type, :digit, '\d', 4, 6] include_examples 'scan', /a#()/, 0 => [:literal, :literal, 'a#', 0, 2], 1 => [:group, :capture, '(', 2, 3] include_examples 'scan', /a # ()/, 0 => [:literal, :literal, 'a # ', 0, 4], 1 => [:group, :capture, '(', 4, 5] end end ammar-regexp_parser-68cdeff/spec/scanner/groups_spec.rb000066400000000000000000000133141506175332700235160ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Group scanning') do # Group types include_examples 'scan', '(?>abc)', 0 => [:group, :atomic, '(?>', 0, 3] include_examples 'scan', '(abc)', 0 => [:group, :capture, '(', 0, 1] # Named groups # Names that start with a hyphen or digit (ascii or other) are invalid. # ")" is only allowed as first char of the name. # "!" and "=" are allowed anywhere, but (?…) and (?<=…>…) are treated as lookbehinds by Ruby. include_examples 'scan', '(?abc)', 0 => [:group, :named_ab, '(?', 0, 8] include_examples 'scan', "(?'name'abc)", 0 => [:group, :named_sq, "(?'name'", 0, 8] include_examples 'scan', '(?abc)', 0 => [:group, :named_ab, '(?', 0,10] include_examples 'scan', "(?'name_1'abc)", 0 => [:group, :named_sq, "(?'name_1'", 0,10] include_examples 'scan', '(?abc)', 0 => [:group, :named_ab, '(?', 0,10] include_examples 'scan', "(?'name-1'abc)", 0 => [:group, :named_sq, "(?'name-1'", 0,10] include_examples 'scan', "(?abc)", 0 => [:group, :named_ab, "(?", 0,10] include_examples 'scan', "(?'name>1'abc)", 0 => [:group, :named_sq, "(?'name>1'", 0,10] include_examples 'scan', "(?abc)", 0 => [:group, :named_ab, "(?", 0,10] include_examples 'scan', "(?'!ame1>'abc)", 0 => [:group, :named_sq, "(?'!ame1>'", 0,10] include_examples 'scan', "(?abc)", 0 => [:group, :named_ab, "(?", 0,10] include_examples 'scan', "(?'=ame1>'abc)", 0 => [:group, :named_sq, "(?'=ame1>'", 0,10] include_examples 'scan', '(?<üüuuüü>abc)', 0 => [:group, :named_ab, '(?<üüuuüü>', 0,10] include_examples 'scan', "(?'üüuuüü'abc)", 0 => [:group, :named_sq, "(?'üüuuüü'", 0,10] include_examples 'scan', "(?<😋1234😋>abc)", 0 => [:group, :named_ab, "(?<😋1234😋>", 0,10] include_examples 'scan', "(?'😋1234😋'abc)", 0 => [:group, :named_sq, "(?'😋1234😋'", 0,10] include_examples 'scan', "(?<)x>y)", 0 => [:group, :named_ab, '(?<)x>', 0, 6] include_examples 'scan', "(?')x'y)", 0 => [:group, :named_sq, "(?')x'", 0, 6] include_examples 'scan', "(?'!x'y)", 0 => [:group, :named_sq, "(?'!x'", 0, 6] # Passive groups include_examples 'scan', '(?:abc)', 0 => [:group, :passive, '(?:', 0, 3] include_examples 'scan', '(?:)', 0 => [:group, :passive, '(?:', 0, 3] include_examples 'scan', '(?::)', 0 => [:group, :passive, '(?:', 0, 3] # Comments include_examples 'scan', '(?#abc)', 0 => [:group, :comment, '(?#abc)', 0, 7] include_examples 'scan', '(?#)', 0 => [:group, :comment, '(?#)', 0, 4] # Assertions include_examples 'scan', '(?=abc)', 0 => [:assertion, :lookahead, '(?=', 0, 3] include_examples 'scan', '(?!abc)', 0 => [:assertion, :nlookahead, '(?!', 0, 3] include_examples 'scan', '(?<=abc)', 0 => [:assertion, :lookbehind, '(?<=', 0, 4] include_examples 'scan', '(?<=x>)y', 0 => [:assertion, :lookbehind, '(?<=', 0, 4] include_examples 'scan', '(? [:assertion, :nlookbehind, '(?', 0 => [:assertion, :nlookbehind, '(?)y', 0 => [:assertion, :nlookbehind, '(? [:group, :options, '(?-mix:', 0, 7] include_examples 'scan', '(?m-ix:abc)', 0 => [:group, :options, '(?m-ix:', 0, 7] include_examples 'scan', '(?mi-x:abc)', 0 => [:group, :options, '(?mi-x:', 0, 7] include_examples 'scan', '(?mix:abc)', 0 => [:group, :options, '(?mix:', 0, 6] include_examples 'scan', '(?m:)', 0 => [:group, :options, '(?m:', 0, 4] include_examples 'scan', '(?i:)', 0 => [:group, :options, '(?i:', 0, 4] include_examples 'scan', '(?x:)', 0 => [:group, :options, '(?x:', 0, 4] include_examples 'scan', '(?mix)', 0 => [:group, :options_switch, '(?mix', 0, 5] include_examples 'scan', '(?d-mix:abc)', 0 => [:group, :options, '(?d-mix:', 0, 8] include_examples 'scan', '(?a-mix:abc)', 0 => [:group, :options, '(?a-mix:', 0, 8] include_examples 'scan', '(?u-mix:abc)', 0 => [:group, :options, '(?u-mix:', 0, 8] include_examples 'scan', '(?da-m:abc)', 0 => [:group, :options, '(?da-m:', 0, 7] include_examples 'scan', '(?du-x:abc)', 0 => [:group, :options, '(?du-x:', 0, 7] include_examples 'scan', '(?dau-i:abc)', 0 => [:group, :options, '(?dau-i:', 0, 8] include_examples 'scan', '(?dau:abc)', 0 => [:group, :options, '(?dau:', 0, 6] include_examples 'scan', '(?d:)', 0 => [:group, :options, '(?d:', 0, 4] include_examples 'scan', '(?a:)', 0 => [:group, :options, '(?a:', 0, 4] include_examples 'scan', '(?u:)', 0 => [:group, :options, '(?u:', 0, 4] include_examples 'scan', '(?dau)', 0 => [:group, :options_switch, '(?dau', 0, 5] if ruby_version_at_least('2.4.1') include_examples 'scan', '(?~abc)', 0 => [:group, :absence, '(?~', 0, 3] end end ammar-regexp_parser-68cdeff/spec/scanner/keep_spec.rb000066400000000000000000000004441506175332700231230ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Keep scanning') do include_examples 'scan', /ab\Kcd/, 1 => [:keep, :mark, '\K', 2, 4] include_examples 'scan', /(a\Kb)|(c\\\Kd)ef/, 2 => [:keep, :mark, '\K', 2, 4], 9 => [:keep, :mark, '\K', 11, 13] end ammar-regexp_parser-68cdeff/spec/scanner/literals_spec.rb000066400000000000000000000032211506175332700240120ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('UTF-8 scanning') do # ascii, single byte characters include_examples 'scan', 'a', 0 => [:literal, :literal, 'a', 0, 1] include_examples 'scan', 'ab+', 0 => [:literal, :literal, 'ab', 0, 2], 1 => [:quantifier, :one_or_more, '+', 2, 3] # 2 byte wide characters include_examples 'scan', 'äöü', 0 => [:literal, :literal, 'äöü', 0, 3] # 3 byte wide characters, Japanese include_examples 'scan', 'ab?れます+cd', 0 => [:literal, :literal, 'ab', 0, 2], 1 => [:quantifier, :zero_or_one, '?', 2, 3], 2 => [:literal, :literal, 'れます', 3, 6], 3 => [:quantifier, :one_or_more, '+', 6, 7], 4 => [:literal, :literal, 'cd', 7, 9] # 4 byte wide characters, Osmanya include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 0 => [:literal, :literal, '𐒀𐒁', 0, 2], 1 => [:quantifier, :zero_or_one, '?', 2, 3], 2 => [:literal, :literal, '𐒂ab', 3, 6], 3 => [:quantifier, :one_or_more, '+', 6, 7], 4 => [:literal, :literal, '𐒃', 7, 8] include_examples 'scan', 'mu𝄞?si*𝄫c+', 0 => [:literal, :literal, 'mu𝄞', 0, 3], 1 => [:quantifier, :zero_or_one, '?', 3, 4], 2 => [:literal, :literal, 'si', 4, 6], 3 => [:quantifier, :zero_or_more, '*', 6, 7], 4 => [:literal, :literal, '𝄫c', 7, 9], 5 => [:quantifier, :one_or_more, '+', 9, 10] end ammar-regexp_parser-68cdeff/spec/scanner/meta_spec.rb000066400000000000000000000016451506175332700231310ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Meta scanning') do include_examples 'scan', /abc??|def*+|ghi+/, 0 => [:literal, :literal, 'abc', 0, 3], 1 => [:quantifier, :zero_or_one_reluctant, '??', 3, 5], 2 => [:meta, :alternation, '|', 5, 6], 3 => [:literal, :literal, 'def', 6, 9], 4 => [:quantifier, :zero_or_more_possessive, '*+', 9, 11], 5 => [:meta, :alternation, '|', 11, 12] include_examples 'scan', /(a\|b)|(c|d)\|(e[|]f)/, 2 => [:escape, :alternation, '\|', 2, 4], 5 => [:meta, :alternation, '|', 6, 7], 8 => [:meta, :alternation, '|', 9, 10], 11 => [:escape, :alternation, '\|', 12, 14], 15 => [:literal, :literal, '|', 17, 18] end ammar-regexp_parser-68cdeff/spec/scanner/options_spec.rb000066400000000000000000000024471506175332700236770ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('passing options to scan') do def expect_type_tokens(tokens, type_tokens) expect(tokens.map { |type, token, *| [type, token] }).to eq(type_tokens) end it 'raises if scanning from a Regexp and options are passed' do expect { RS.scan(/a+/, options: ::Regexp::EXTENDED) }.to raise_error( ArgumentError, 'options cannot be supplied unless scanning a String' ) end it 'sets free_spacing based on options if scanning from a String' do expect_type_tokens( RS.scan('a+#c', options: ::Regexp::MULTILINE | ::Regexp::EXTENDED), [ %i[literal literal], %i[quantifier one_or_more], %i[free_space comment] ] ) end it 'sets encoding based on options if scanning from a String' do expect_type_tokens( RS.scan('\x94\x95', options: ::Regexp::NOENCODING), [ # in non-binary encodings, these would be seen as a single utf8 escape %i[escape hex], %i[escape hex], ] ) end it 'does not set free_spacing if scanning from a String and passing no options' do expect_type_tokens( RS.scan('a+#c'), [ %i[literal literal], %i[quantifier one_or_more], %i[literal literal] ] ) end end ammar-regexp_parser-68cdeff/spec/scanner/properties_spec.rb000066400000000000000000000052621506175332700243760ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Property scanning') do RSpec.shared_examples 'scan property' do |text, token| it("scans \\p{#{text}} as property #{token}") do result = RS.scan("\\p{#{text}}")[0] expect(result[0..1]).to eq [:property, token] end it("scans \\P{#{text}} as nonproperty #{token}") do result = RS.scan("\\P{#{text}}")[0] expect(result[0..1]).to eq [:nonproperty, token] end it("scans \\p{^#{text}} as nonproperty #{token}") do result = RS.scan("\\p{^#{text}}")[0] expect(result[0..1]).to eq [:nonproperty, token] end it("scans double-negated \\P{^#{text}} as property #{token}") do result = RS.scan("\\P{^#{text}}")[0] expect(result[0..1]).to eq [:property, token] end end include_examples 'scan property', 'Alnum', :alnum include_examples 'scan property', 'XPosixPunct', :xposixpunct include_examples 'scan property', 'Newline', :newline include_examples 'scan property', 'Any', :any include_examples 'scan property', 'Assigned', :assigned include_examples 'scan property', 'Age=1.1', :'age=1.1' include_examples 'scan property', 'Age=10.0', :'age=10.0' include_examples 'scan property', 'ahex', :ascii_hex_digit include_examples 'scan property', 'ASCII_Hex_Digit', :ascii_hex_digit # test underscore include_examples 'scan property', 'sd', :soft_dotted include_examples 'scan property', 'Soft-Dotted', :soft_dotted # test dash include_examples 'scan property', 'Egyp', :egyptian_hieroglyphs include_examples 'scan property', 'Egyptian Hieroglyphs', :egyptian_hieroglyphs # test whitespace include_examples 'scan property', 'Linb', :linear_b include_examples 'scan property', 'Linear-B', :linear_b # test dash include_examples 'scan property', 'InArabic', :in_arabic # test block include_examples 'scan property', 'in Arabic', :in_arabic # test block w. whitespace include_examples 'scan property', 'In_Arabic', :in_arabic # test block w. underscore include_examples 'scan property', 'Yiii', :yi include_examples 'scan property', 'Yi', :yi include_examples 'scan property', 'Zinh', :inherited include_examples 'scan property', 'Inherited', :inherited include_examples 'scan property', 'Qaai', :inherited include_examples 'scan property', 'Zzzz', :unknown include_examples 'scan property', 'Unknown', :unknown end ammar-regexp_parser-68cdeff/spec/scanner/quantifiers_spec.rb000066400000000000000000000033151506175332700245310ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Quantifier scanning') do include_examples 'scan', 'a?', 1 => [:quantifier, :zero_or_one, '?', 1, 2] include_examples 'scan', 'a??', 1 => [:quantifier, :zero_or_one_reluctant, '??', 1, 3] include_examples 'scan', 'a?+', 1 => [:quantifier, :zero_or_one_possessive, '?+', 1, 3] include_examples 'scan', 'a*', 1 => [:quantifier, :zero_or_more, '*', 1, 2] include_examples 'scan', 'a*?', 1 => [:quantifier, :zero_or_more_reluctant, '*?', 1, 3] include_examples 'scan', 'a*+', 1 => [:quantifier, :zero_or_more_possessive, '*+', 1, 3] include_examples 'scan', 'a+', 1 => [:quantifier, :one_or_more, '+', 1, 2] include_examples 'scan', 'a+?', 1 => [:quantifier, :one_or_more_reluctant, '+?', 1, 3] include_examples 'scan', 'a++', 1 => [:quantifier, :one_or_more_possessive, '++', 1, 3] include_examples 'scan', 'a{2}', 1 => [:quantifier, :interval, '{2}', 1, 4] include_examples 'scan', 'a{2,}', 1 => [:quantifier, :interval, '{2,}', 1, 5] include_examples 'scan', 'a{,2}', 1 => [:quantifier, :interval, '{,2}', 1, 5] include_examples 'scan', 'a{2,4}', 1 => [:quantifier, :interval, '{2,4}', 1, 6] # special case: chained quantifiers include_examples 'scan', 'a+{2}{3}', 1 => [:quantifier, :one_or_more, '+', 1, 2] include_examples 'scan', 'a+{2}{3}', 2 => [:quantifier, :interval, '{2}', 2, 5] include_examples 'scan', 'a+{2}{3}', 3 => [:quantifier, :interval, '{3}', 5, 8] end ammar-regexp_parser-68cdeff/spec/scanner/refcalls_spec.rb000066400000000000000000000114051506175332700237710ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('RefCall scanning') do # Traditional numerical group back-reference. # For non-matched cases see ./escapes_spec.rb include_examples 'scan', '(abc)\1' , 3 => [:backref, :number, '\1', 5, 7] # They can have two or more digits: # "#{[*2..101].join}101"[/#{(2..101).map { |n| "(#{n})" }.join}\K\100/] # => '101' include_examples 'scan', '(((((((((())))))))))\10', -1 => [:backref, :number, '\10', 20, 23] include_examples 'scan', "#{[*1..100].map { |n| "(#{n})" }.join}\\100", -1 => [:backref, :number, '\100', 392, 396] # Double digit escapes are treated as backref as soon as a fitting group is open: # "\10"[/((((((((((\10))))))))))/] # => nil include_examples 'scan', '((((((((((\10))))))))))', 10 => [:backref, :number, '\10', 10, 13] # Group back-references, named, numbered, and relative # # NOTE: only \g supports forward-looking references using '+', e.g. \g<+1> # refers to the next group, but \k<+1> refers to a group named '+1'. # Inversely, only \k supports addition or subtraction of a recursion level. # E.g. \k refers to a group named 'x' at the current recursion level, # but \g refers to a a group named 'x+0'. # include_examples 'scan', '(?abc)\k', 3 => [:backref, :name_ref_ab, '\k', 9, 14] include_examples 'scan', "(?abc)\\k'X'", 3 => [:backref, :name_ref_sq, "\\k'X'", 9, 14] include_examples 'scan', '(?<+1>abc)\k<+1>', 3 => [:backref, :name_ref_ab, '\k<+1>', 10, 16] include_examples 'scan', "(?<+1>abc)\\k'+1'", 3 => [:backref, :name_ref_sq, "\\k'+1'", 10, 16] include_examples 'scan', '(abc)\k<1>', 3 => [:backref, :number_ref_ab, '\k<1>', 5, 10] include_examples 'scan', "(abc)\\k'1'", 3 => [:backref, :number_ref_sq, "\\k'1'", 5, 10] include_examples 'scan', "(abc)\\k'001'", 3 => [:backref, :number_ref_sq, "\\k'001'", 5, 12] include_examples 'scan', '(abc)\k<-1>', 3 => [:backref, :number_rel_ref_ab, '\k<-1>', 5, 11] include_examples 'scan', "(abc)\\k'-1'", 3 => [:backref, :number_rel_ref_sq, "\\k'-1'", 5, 11] include_examples 'scan', '(abc)\k<-001>', 3 => [:backref, :number_rel_ref_ab, '\k<-001>', 5, 13] # Sub-expression invocation, named, numbered, and relative include_examples 'scan', '(?abc)\g', 3 => [:backref, :name_call_ab, '\g', 9, 14] include_examples 'scan', "(?abc)\\g'X'", 3 => [:backref, :name_call_sq, "\\g'X'", 9, 14] include_examples 'scan', '(?abc)\g', 3 => [:backref, :name_call_ab, '\g', 9, 16] include_examples 'scan', "(?abc)\\g'X-1'", 3 => [:backref, :name_call_sq, "\\g'X-1'", 9, 16] include_examples 'scan', '(abc)\g<1>', 3 => [:backref, :number_call_ab, '\g<1>', 5, 10] include_examples 'scan', "(abc)\\g'1'", 3 => [:backref, :number_call_sq, "\\g'1'", 5, 10] include_examples 'scan', '(abc)\g<001>', 3 => [:backref, :number_call_ab, '\g<001>', 5, 12] include_examples 'scan', 'a(b|\g<0>)', 4 => [:backref, :number_call_ab, '\g<0>', 4, 9] include_examples 'scan', "a(b|\\g'0')", 4 => [:backref, :number_call_sq, "\\g'0'", 4, 9] include_examples 'scan', '(abc)\g<-1>', 3 => [:backref, :number_rel_call_ab, '\g<-1>', 5, 11] include_examples 'scan', "(abc)\\g'-1'", 3 => [:backref, :number_rel_call_sq, "\\g'-1'", 5, 11] include_examples 'scan', '(abc)\g<-001>', 3 => [:backref, :number_rel_call_ab, '\g<-001>', 5, 13] include_examples 'scan', '\g<+1>(abc)', 0 => [:backref, :number_rel_call_ab, '\g<+1>', 0, 6] include_examples 'scan', "\\g'+1'(abc)", 0 => [:backref, :number_rel_call_sq, "\\g'+1'", 0, 6] # Group back-references, with recursion level include_examples 'scan', '(?abc)\k', 3 => [:backref, :name_recursion_ref_ab, '\k', 9, 16] include_examples 'scan', "(?abc)\\k'X-0'", 3 => [:backref, :name_recursion_ref_sq, "\\k'X-0'", 9, 16] include_examples 'scan', '(abc)\k<1-0>', 3 => [:backref, :number_recursion_ref_ab, '\k<1-0>', 5, 12] include_examples 'scan', "(abc)\\k'1-0'", 3 => [:backref, :number_recursion_ref_sq, "\\k'1-0'", 5, 12] include_examples 'scan', '(abc)\k<+1-0>', 3 => [:backref, :name_recursion_ref_ab, '\k<+1-0>', 5, 13] include_examples 'scan', "(abc)\\k'+1-0'", 3 => [:backref, :name_recursion_ref_sq, "\\k'+1-0'", 5, 13] end ammar-regexp_parser-68cdeff/spec/scanner/sets_spec.rb000066400000000000000000000300701506175332700231530ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Set scanning') do include_examples 'scan', /[a]/, 0 => [:set, :open, '[', 0, 1] include_examples 'scan', /[b]/, 2 => [:set, :close, ']', 2, 3] include_examples 'scan', /[^n]/, 1 => [:set, :negate, '^', 1, 2] include_examples 'scan', /[c]/, 1 => [:literal, :literal, 'c', 1, 2] include_examples 'scan', /[^d]/, 2 => [:literal, :literal, 'd', 2, 3] include_examples 'scan', /[\b]/, 1 => [:escape, :backspace, '\b', 1, 3] include_examples 'scan', /[A\bX]/, 2 => [:escape, :backspace, '\b', 2, 4] include_examples 'scan', /[\a]/, 1 => [:escape, :bell, '\a', 1, 3] include_examples 'scan', /[\e]/, 1 => [:escape, :escape, '\e', 1, 3] include_examples 'scan', /[\f]/, 1 => [:escape, :form_feed, '\f', 1, 3] include_examples 'scan', /[\n]/, 1 => [:escape, :newline, '\n', 1, 3] include_examples 'scan', /[\r]/, 1 => [:escape, :carriage, '\r', 1, 3] include_examples 'scan', /[\t]/, 1 => [:escape, :tab, '\t', 1, 3] include_examples 'scan', /[\v]/, 1 => [:escape, :vertical_tab, '\v', 1, 3] include_examples 'scan', /[.]/, 1 => [:literal, :literal, '.', 1, 2] include_examples 'scan', /[?]/, 1 => [:literal, :literal, '?', 1, 2] include_examples 'scan', /[*]/, 1 => [:literal, :literal, '*', 1, 2] include_examples 'scan', /[+]/, 1 => [:literal, :literal, '+', 1, 2] include_examples 'scan', /[{]/, 1 => [:literal, :literal, '{', 1, 2] include_examples 'scan', /[}]/, 1 => [:literal, :literal, '}', 1, 2] include_examples 'scan', /[<]/, 1 => [:literal, :literal, '<', 1, 2] include_examples 'scan', /[>]/, 1 => [:literal, :literal, '>', 1, 2] include_examples 'scan', '[\\\\]', 1 => [:escape, :backslash, '\\\\', 1, 3] include_examples 'scan', '[\u0040]', 1 => [:escape, :codepoint, '\u0040', 1, 7] include_examples 'scan', '[\u{40}]', 1 => [:escape, :codepoint_list, '\u{40}', 1, 7] include_examples 'scan', '[\c2]', 1 => [:escape, :control, '\c2', 1, 4] include_examples 'scan', '[\C-C]', 1 => [:escape, :control, '\C-C', 1, 5] include_examples 'scan', '[\xF]', 1 => [:escape, :hex, '\xF', 1, 4] include_examples 'scan', '[\x20]', 1 => [:escape, :hex, '\x20', 1, 5] include_examples 'scan', '[\M-Z]', 1 => [:escape, :meta_sequence, '\M-Z', 1, 5] include_examples 'scan', '[\M-\C-X]', 1 => [:escape, :meta_sequence, '\M-\C-X', 1, 8] include_examples 'scan', '[\7]', 1 => [:escape, :octal, '\7', 1, 3] include_examples 'scan', '[\77]', 1 => [:escape, :octal, '\77', 1, 4] include_examples 'scan', '[\777]', 1 => [:escape, :octal, '\777', 1, 5] include_examples 'scan', '[\8]', 1 => [:escape, :literal, '\8', 1, 3] include_examples 'scan', '[\88]', 1 => [:escape, :literal, '\8', 1, 3] include_examples 'scan', '[\\[]', 1 => [:escape, :set_open, '\[', 1, 3] include_examples 'scan', '[\\]]', 1 => [:escape, :set_close, '\]', 1, 3] include_examples 'scan', '[a\-]', 2 => [:escape, :literal, '\-', 2, 4] include_examples 'scan', '[\-c]', 1 => [:escape, :literal, '\-', 1, 3] include_examples 'scan', '[\.]', 1 => [:escape, :literal, '\.', 1, 3] include_examples 'scan', '[\?]', 1 => [:escape, :literal, '\?', 1, 3] include_examples 'scan', '[\*]', 1 => [:escape, :literal, '\*', 1, 3] include_examples 'scan', '[\+]', 1 => [:escape, :literal, '\+', 1, 3] include_examples 'scan', '[\|]', 1 => [:escape, :literal, '\|', 1, 3] include_examples 'scan', '[\{]', 1 => [:escape, :literal, '\{', 1, 3] include_examples 'scan', '[\}]', 1 => [:escape, :literal, '\}', 1, 3] include_examples 'scan', '[\(]', 1 => [:escape, :literal, '\(', 1, 3] include_examples 'scan', '[\)]', 1 => [:escape, :literal, '\)', 1, 3] include_examples 'scan', '[\!]', 1 => [:escape, :literal, '\!', 1, 3] include_examples 'scan', '[\#]', 1 => [:escape, :literal, '\#', 1, 3] include_examples 'scan', '[\A]', 1 => [:escape, :literal, '\A', 1, 3] include_examples 'scan', '[\z]', 1 => [:escape, :literal, '\z', 1, 3] include_examples 'scan', '[\g]', 1 => [:escape, :literal, '\g', 1, 3] include_examples 'scan', '[\K]', 1 => [:escape, :literal, '\K', 1, 3] include_examples 'scan', '[\R]', 1 => [:escape, :literal, '\R', 1, 3] include_examples 'scan', '[\X]', 1 => [:escape, :literal, '\X', 1, 3] include_examples 'scan', '[\B]', 1 => [:escape, :literal, '\B', 1, 3] include_examples 'scan', '[\💎]', 1 => [:escape, :literal, '\💎', 1, 3] include_examples 'scan', /[\d]/, 1 => [:type, :digit, '\d', 1, 3] include_examples 'scan', /[\da-z]/, 1 => [:type, :digit, '\d', 1, 3] include_examples 'scan', /[\D]/, 1 => [:type, :nondigit, '\D', 1, 3] include_examples 'scan', /[\h]/, 1 => [:type, :hex, '\h', 1, 3] include_examples 'scan', /[\H]/, 1 => [:type, :nonhex, '\H', 1, 3] include_examples 'scan', /[\s]/, 1 => [:type, :space, '\s', 1, 3] include_examples 'scan', /[\S]/, 1 => [:type, :nonspace, '\S', 1, 3] include_examples 'scan', /[\w]/, 1 => [:type, :word, '\w', 1, 3] include_examples 'scan', /[\W]/, 1 => [:type, :nonword, '\W', 1, 3] include_examples 'scan', /[a-b]/, 1 => [:literal, :literal, 'a', 1, 2] include_examples 'scan', /[a-c]/, 2 => [:set, :range, '-', 2, 3] include_examples 'scan', /[a-d]/, 3 => [:literal, :literal, 'd', 3, 4] include_examples 'scan', /[a-b-]/, 4 => [:literal, :literal, '-', 4, 5] include_examples 'scan', /[-a]/, 1 => [:literal, :literal, '-', 1, 2] include_examples 'scan', /[a-c^]/, 4 => [:literal, :literal, '^', 4, 5] include_examples 'scan', /[a-bd-f]/, 2 => [:set, :range, '-', 2, 3] include_examples 'scan', /[a-cd-f]/, 5 => [:set, :range, '-', 5, 6] # this is a buggy range, it matches only `c`, but not `a`, `b` or `-` # (this is a string to work around a rubocop v1.56.4 error in Lint/MixedCaseRange) include_examples 'scan', '[a-[c]]', 2 => [:set, :range, '-', 2, 3] # these are not ranges, they match `a`, `c` and `-` (or non-`-` if negated) include_examples 'scan', /[[a]-[c]]/, 4 => [:literal, :literal, '-', 4, 5] include_examples 'scan', /[[a]-c]/, 4 => [:literal, :literal, '-', 4, 5] include_examples 'scan', /[^-c]/, 2 => [:literal, :literal, '-', 2, 3] include_examples 'scan', /[a[:digit:]c]/, 2 => [:posixclass, :digit, '[:digit:]', 2, 11] include_examples 'scan', /[[:digit:][:space:]]/, 2 => [:posixclass, :space, '[:space:]', 10, 19] include_examples 'scan', /[[:^digit:]]/, 1 => [:nonposixclass, :digit, '[:^digit:]', 1, 11] include_examples 'scan', /[a-d&&g-h]/, 4 => [:set, :intersection, '&&', 4, 6] include_examples 'scan', /[a&&]/, 2 => [:set, :intersection, '&&', 2, 4] include_examples 'scan', /[&&z]/, 1 => [:set, :intersection, '&&', 1, 3] include_examples 'scan', /[&&]/, 1 => [:set, :intersection, '&&', 1, 3] include_examples 'scan', '[a\p{digit}c]', 2 => [:property, :digit, '\p{digit}', 2, 11] include_examples 'scan', '[a\P{digit}c]', 2 => [:nonproperty, :digit, '\P{digit}', 2, 11] include_examples 'scan', '[a\p{^digit}c]', 2 => [:nonproperty, :digit, '\p{^digit}', 2, 12] include_examples 'scan', '[a\P{^digit}c]', 2 => [:property, :digit, '\P{^digit}', 2, 12] include_examples 'scan', '[a\p{ALPHA}c]', 2 => [:property, :alpha, '\p{ALPHA}', 2, 11] include_examples 'scan', '[a\p{P}c]', 2 => [:property, :punctuation,'\p{P}', 2, 7] include_examples 'scan', '[a\p{P}\P{P}c]', 3 => [:nonproperty, :punctuation,'\P{P}', 7, 12] include_examples 'scan', '[\x20-\x27]', 1 => [:escape, :hex, '\x20', 1, 5], 2 => [:set, :range, '-', 5, 6], 3 => [:escape, :hex, '\x27', 6, 10] include_examples 'scan', '[a-w&&[^c-g]z]', 5 => [:set, :open, '[', 6, 7], 6 => [:set, :negate, '^', 7, 8], 8 => [:set, :range, '-', 9, 10], 10=> [:set, :close, ']', 11, 12] # Collations/collating sequences and character equivalents are not enabled # in Ruby at the moment. If they ever are, revert and uncomment b82976b, # add them to a new syntax version, and handle them in the parser. Until then, # expect them to be scanned as regular subsets containing literals. # include_examples 'scan', /[a[.a-b.]c]/, 2 => [:set, :collation, '[.a-b.]', 2, 9] # include_examples 'scan', /[a[=e=]c]/, 2 => [:set, :equivalent, '[=e=]', 2, 7] include_examples 'scan', '[a[.a-b.]c]', 2 => [:set, :open, '[', 2, 3], 3 => [:literal, :literal, '.', 3, 4], 4 => [:literal, :literal, 'a', 4, 5] include_examples 'scan', '[a[=e=]c]', 2 => [:set, :open, '[', 2, 3], 3 => [:literal, :literal, '=', 3, 4], 4 => [:literal, :literal, 'e', 4, 5] # multi-byte characters should not affect indices include_examples 'scan', /[れます]/, 0 => [:set, :open, '[', 0, 1], 1 => [:literal, :literal, 'れ', 1, 2], 2 => [:literal, :literal, 'ま', 2, 3], 3 => [:literal, :literal, 'す', 3, 4], 4 => [:set, :close, ']', 4, 5] specify('set literal encoding') do text = RS.scan('[a]')[1][2].to_s expect(text).to eq 'a' expect(text.encoding.to_s).to eq 'UTF-8' text = RS.scan("[\u{1F632}]")[1][2].to_s expect(text).to eq "\u{1F632}" expect(text.encoding.to_s).to eq 'UTF-8' end end ammar-regexp_parser-68cdeff/spec/scanner/types_spec.rb000066400000000000000000000014551506175332700233460ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe('Type scanning') do include_examples 'scan', 'a\dc', 1 => [:type, :digit, '\d', 1, 3] include_examples 'scan', 'a\Dc', 1 => [:type, :nondigit, '\D', 1, 3] include_examples 'scan', 'a\hc', 1 => [:type, :hex, '\h', 1, 3] include_examples 'scan', 'a\Hc', 1 => [:type, :nonhex, '\H', 1, 3] include_examples 'scan', 'a\sc', 1 => [:type, :space, '\s', 1, 3] include_examples 'scan', 'a\Sc', 1 => [:type, :nonspace, '\S', 1, 3] include_examples 'scan', 'a\wc', 1 => [:type, :word, '\w', 1, 3] include_examples 'scan', 'a\Wc', 1 => [:type, :nonword, '\W', 1, 3] include_examples 'scan', 'a\Rc', 1 => [:type, :linebreak, '\R', 1, 3] include_examples 'scan', 'a\Xc', 1 => [:type, :xgrapheme, '\X', 1, 3] end ammar-regexp_parser-68cdeff/spec/spec_helper.rb000066400000000000000000000031061506175332700220230ustar00rootroot00000000000000# frozen_string_literal: true $VERBOSE = true require 'leto' require 'regexp_property_values' require_relative 'support/capturing_stderr' require_relative 'support/shared_examples' req_warn = capturing_stderr { @required_now = require('regexp_parser') } req_warn.empty? || fail("requiring parser generated warnings:\n#{req_warn}") @required_now || fail("regexp_parser was required earlier than expected") RS = Regexp::Scanner RL = Regexp::Lexer RP = Regexp::Parser RE = Regexp::Expression T = Regexp::Syntax::Token include Regexp::Expression def ruby_version_at_least(version) Gem::Version.new(RUBY_VERSION.dup) >= Gem::Version.new(version) end RSpec.configure do |config| config.around(:example) do |example| # treat unexpected warnings as failures expect { example.run }.not_to output.to_stderr end end def s(klass, text = '', *children) exp = klass.construct(text: text.to_s) children.each { |child| exp.expressions << child } exp end def regexp_with_all_features return /dummy/ unless ruby_version_at_least('2.4.1') Regexp.new(<<-'REGEXP', Regexp::EXTENDED) \A a++ (?: \b {2} (?> c ?? 😀😀😀 # 😄😄😄 (?# 😃😃😃 ) ( \d *+ ( ALT1 | ALT2 ) ) {004} | [ä-ü&&ö[:ascii:]\p{thai}] {6} | \z ) (?=lm{8}) ?+ \K (?~ \1 \g<-1> {10} \uFFFF \012 ) (?(1) BRANCH1 | BRANCH2 ) ) REGEXP end ammar-regexp_parser-68cdeff/spec/support/000077500000000000000000000000001506175332700207215ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/spec/support/capturing_stderr.rb000066400000000000000000000002761506175332700246320ustar00rootroot00000000000000# frozen_string_literal: true require 'stringio' def capturing_stderr(&block) old_stderr, $stderr = $stderr, StringIO.new block.call $stderr.string ensure $stderr = old_stderr end ammar-regexp_parser-68cdeff/spec/support/shared_examples.rb000066400000000000000000000056531506175332700244230ustar00rootroot00000000000000# frozen_string_literal: true RSpec.shared_examples 'syntax' do |opts| opts[:implements].each do |type, tokens| tokens.each do |token| it("implements #{token} #{type}") do expect(described_class.implements?(type, token)).to be true end end end opts[:excludes] && opts[:excludes].each do |type, tokens| tokens.each do |token| it("does not implement #{token} #{type}") do expect(described_class.implements?(type, token)).to be false end end end end RSpec.shared_examples 'scan' do |pattern, checks| context "given the pattern #{pattern}" do before(:all) { @tokens = Regexp::Scanner.scan(pattern) } checks.each do |index, (type, token, text, ts, te)| it "scans token #{index} as #{token} #{type} at #{ts}..#{te}" do result = @tokens.at(index) result || fail("no token at index #{index}, max is #{@tokens.size - 1}") expect(result[0]).to eq type expect(result[1]).to eq token expect(result[2]).to eq text expect(result[3]).to eq ts expect(result[4]).to eq te end end end end RSpec.shared_examples 'lex' do |pattern, checks| context "given the pattern #{pattern}" do before(:all) { @tokens = Regexp::Lexer.lex(pattern) } checks.each do |index, (type, token, text, ts, te, lvl, set_lvl, cond_lvl)| it "lexes token #{index} as #{token} #{type} at #{lvl}, #{set_lvl}, #{cond_lvl}" do struct = @tokens.at(index) expect(struct.type).to eq type expect(struct.token).to eq token expect(struct.text).to eq text expect(struct.ts).to eq ts expect(struct.te).to eq te expect(struct.level).to eq lvl expect(struct.set_level).to eq set_lvl expect(struct.conditional_level).to eq cond_lvl end end end end RSpec.shared_examples 'parse' do |pattern, checks| context "given the pattern #{pattern}" do before(:all) { @root = Regexp::Parser.parse(pattern, '*') } checks.each do |path, expectations| path = Array(path) inspect_quantifier = path.last == :q && path.pop attributes = expectations.pop if expectations.last.is_a?(Hash) klass = expectations.pop if expectations.last.is_a?(Class) token = expectations.pop type = expectations.pop description = klass || token || type || 'Expression' it "parses expression at #{path} as #{description}" do exp = @root.dig(*path) exp = exp.quantifier if inspect_quantifier klass && expect(exp).to(be_instance_of(klass)) type && expect(exp.type).to(eq(type)) token && expect(exp.token).to(eq(token)) attributes && attributes.each do |method, value| actual = exp.send(method) expect(actual).to eq(value), "expected #{description} at #{path} to "\ "have #{method} #{value.inspect}, got #{actual.inspect}" end end end end end ammar-regexp_parser-68cdeff/spec/syntax/000077500000000000000000000000001506175332700205335ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/spec/syntax/syntax_spec.rb000066400000000000000000000072741506175332700234320ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Syntax) do describe('::for') do it { expect(Regexp::Syntax.for('ruby/1.8.6')).to eq Regexp::Syntax::V1_8_6 } it { expect(Regexp::Syntax.for('ruby/1.8')).to eq Regexp::Syntax::V1_8_6 } it { expect(Regexp::Syntax.for('ruby/1.9.1')).to eq Regexp::Syntax::V1_9_1 } it { expect(Regexp::Syntax.for('ruby/1.9')).to eq Regexp::Syntax::V1_9_3 } it { expect(Regexp::Syntax.for('ruby/2.0.0')).to eq Regexp::Syntax::V2_0_0 } it { expect(Regexp::Syntax.for('ruby/2.0')).to eq Regexp::Syntax::V2_0_0 } it { expect(Regexp::Syntax.for('ruby/2.1')).to eq Regexp::Syntax::V2_0_0 } it { expect(Regexp::Syntax.for('ruby/2.2.0')).to eq Regexp::Syntax::V2_2_0 } it { expect(Regexp::Syntax.for('ruby/2.2.10')).to eq Regexp::Syntax::V2_2_0 } it { expect(Regexp::Syntax.for('ruby/2.2')).to eq Regexp::Syntax::V2_2_0 } it { expect(Regexp::Syntax.for('ruby/2.3.0')).to eq Regexp::Syntax::V2_3_0 } it { expect(Regexp::Syntax.for('ruby/2.3')).to eq Regexp::Syntax::V2_3_0 } it { expect(Regexp::Syntax.for('ruby/2.4.0')).to eq Regexp::Syntax::V2_4_0 } it { expect(Regexp::Syntax.for('ruby/2.4.1')).to eq Regexp::Syntax::V2_4_1 } it { expect(Regexp::Syntax.for('ruby/2.5.0')).to eq Regexp::Syntax::V2_5_0 } it { expect(Regexp::Syntax.for('ruby/2.5')).to eq Regexp::Syntax::V2_5_0 } it { expect(Regexp::Syntax.for('ruby/2.6.0')).to eq Regexp::Syntax::V2_6_0 } it { expect(Regexp::Syntax.for('ruby/2.6.2')).to eq Regexp::Syntax::V2_6_2 } it { expect(Regexp::Syntax.for('ruby/2.6.3')).to eq Regexp::Syntax::V2_6_3 } it { expect(Regexp::Syntax.for('ruby/2.6')).to eq Regexp::Syntax::V2_6_3 } it { expect(Regexp::Syntax.for('ruby/3.0.0')).to eq Regexp::Syntax::V2_6_3 } it { expect(Regexp::Syntax.for('ruby/3.0')).to eq Regexp::Syntax::V2_6_3 } it { expect(Regexp::Syntax.for('ruby/3.1.0')).to eq Regexp::Syntax::V3_1_0 } it { expect(Regexp::Syntax.for('ruby/3.1')).to eq Regexp::Syntax::V3_1_0 } it { expect(Regexp::Syntax.for('ruby/3.2.0')).to eq Regexp::Syntax::V3_2_0 } it { expect(Regexp::Syntax.for('ruby/3.2')).to eq Regexp::Syntax::V3_2_0 } it { expect(Regexp::Syntax.for('any')).to eq Regexp::Syntax::Any } it { expect(Regexp::Syntax.for('*')).to eq Regexp::Syntax::Any } it 'raises for unknown names' do expect { Regexp::Syntax.for('ruby/1.0') }.to raise_error(Regexp::Syntax::UnknownSyntaxNameError) end it 'raises for invalid names' do expect { Regexp::Syntax.version_class('2.0.0') }.to raise_error(Regexp::Syntax::InvalidVersionNameError) expect { Regexp::Syntax.version_class('ruby/20') }.to raise_error(Regexp::Syntax::InvalidVersionNameError) end end specify('::new is a deprecated alias of ::for') do expect { expect(Regexp::Syntax.new('ruby/2.0.0')).to eq Regexp::Syntax::V2_0_0 } .to output(/deprecated/).to_stderr end specify('not implemented') do expect { RP.parse('\p{alpha}', 'ruby/1.8') }.to raise_error(Regexp::Syntax::NotImplementedError) end specify('supported?') do expect(Regexp::Syntax.supported?('ruby/1.1.1')).to be false expect(Regexp::Syntax.supported?('ruby/2.4.3')).to be true expect(Regexp::Syntax.supported?('ruby/2.5')).to be true end specify('raises for unknown constant lookups') do expect { Regexp::Syntax::V1 }.to raise_error(/V1/) end specify('instantiation is deprecated but still works') do expect { @instance = Regexp::Syntax::V3_1_0.new } .to output(/deprecated/).to_stderr expect { expect(@instance.implements?(:literal, :literal)).to be true } .to output(/deprecated/).to_stderr end end ammar-regexp_parser-68cdeff/spec/syntax/syntax_token_map_spec.rb000066400000000000000000000014451506175332700254610ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Syntax::Token::Map) do let(:map) { Regexp::Syntax::Token::Map } let(:current_syntax) { Regexp::Syntax::CURRENT } specify('is complete') do current_syntax.features.each do |type, tokens| tokens.each { |token| expect(map[type]).to include(token) } end end specify('contains no duplicate tokens') do current_syntax.features.each do |_type, tokens| expect(tokens).to eq tokens.uniq end end specify('contains no duplicate type/token combinations') do combinations = map.flat_map do |type, tokens| tokens.map { |token| "#{type} #{token}" } end non_uniq = combinations.group_by { |str| str }.select { |_, v| v.count > 1 } expect(non_uniq.keys).to be_empty end end ammar-regexp_parser-68cdeff/spec/syntax/versions/000077500000000000000000000000001506175332700224035ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/spec/syntax/versions/1.8.6_spec.rb000066400000000000000000000012261506175332700244150ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Syntax::V1_8_6) do include_examples 'syntax', implements: { assertion: T::Assertion::Lookahead, backref: T::Backreference::Plain, escape: T::Escape::Basic + T::Escape::ASCII + T::Escape::Meta + T::Escape::Control, group: T::Group::V1_8_6, quantifier: T::Quantifier::Greedy + T::Quantifier::Reluctant + T::Quantifier::Interval + T::Quantifier::IntervalReluctant }, excludes: { assertion: T::Assertion::Lookbehind, backref: T::Backreference::All - T::Backreference::Plain + T::SubexpressionCall::All, quantifier: T::Quantifier::Possessive } end ammar-regexp_parser-68cdeff/spec/syntax/versions/1.9.1_spec.rb000066400000000000000000000005271506175332700244140ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Syntax::V1_9_1) do include_examples 'syntax', implements: { escape: T::Escape::Hex + T::Escape::Octal + T::Escape::Unicode, type: T::CharacterType::Hex, quantifier: T::Quantifier::Greedy + T::Quantifier::Reluctant + T::Quantifier::Possessive } end ammar-regexp_parser-68cdeff/spec/syntax/versions/1.9.3_spec.rb000066400000000000000000000004731506175332700244160ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Syntax::V1_9_3) do include_examples 'syntax', implements: { property: T::UnicodeProperty::Script_V1_9_3 + T::UnicodeProperty::Age_V1_9_3, nonproperty: T::UnicodeProperty::Script_V1_9_3 + T::UnicodeProperty::Age_V1_9_3 } end ammar-regexp_parser-68cdeff/spec/syntax/versions/2.0.0_spec.rb000066400000000000000000000004761506175332700244060ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Syntax::V2_0_0) do include_examples 'syntax', implements: { property: T::UnicodeProperty::Age_V2_0_0, nonproperty: T::UnicodeProperty::Age_V2_0_0 }, excludes: { property: %i[newline], nonproperty: %i[newline] } end ammar-regexp_parser-68cdeff/spec/syntax/versions/2.2.0_spec.rb000066400000000000000000000004731506175332700244050ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Syntax::V2_2_0) do include_examples 'syntax', implements: { property: T::UnicodeProperty::Script_V2_2_0 + T::UnicodeProperty::Age_V2_2_0, nonproperty: T::UnicodeProperty::Script_V2_2_0 + T::UnicodeProperty::Age_V2_2_0 } end ammar-regexp_parser-68cdeff/spec/syntax/versions/3.2.0_spec.rb000066400000000000000000000004731506175332700244060ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Syntax::V3_2_0) do include_examples 'syntax', implements: { property: T::UnicodeProperty::Script_V3_2_0 + T::UnicodeProperty::Age_V3_2_0, nonproperty: T::UnicodeProperty::Script_V3_2_0 + T::UnicodeProperty::Age_V3_2_0 } end ammar-regexp_parser-68cdeff/spec/token/000077500000000000000000000000001506175332700203255ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/spec/token/token_spec.rb000066400000000000000000000036711506175332700230130ustar00rootroot00000000000000# frozen_string_literal: true require 'spec_helper' RSpec.describe(Regexp::Token) do specify('#offset') do regexp = /ab?cd/ tokens = RL.lex(regexp) expect(tokens[1].text).to eq 'b' expect(tokens[1].offset).to eq [1, 2] expect(tokens[2].text).to eq '?' expect(tokens[2].offset).to eq [2, 3] expect(tokens[3].text).to eq 'cd' expect(tokens[3].offset).to eq [3, 5] end specify('#length') do regexp = /abc?def/ tokens = RL.lex(regexp) expect(tokens[0].text).to eq 'ab' expect(tokens[0].length).to eq 2 expect(tokens[1].text).to eq 'c' expect(tokens[1].length).to eq 1 expect(tokens[2].text).to eq '?' expect(tokens[2].length).to eq 1 expect(tokens[3].text).to eq 'def' expect(tokens[3].length).to eq 3 end specify('#to_h') do regexp = /abc?def/ tokens = RL.lex(regexp) expect(tokens[0].text).to eq 'ab' expect(tokens[0].to_h).to eq type: :literal, token: :literal, text: 'ab', ts: 0, te: 2, level: 0, set_level: 0, conditional_level: 0 expect(tokens[2].text).to eq '?' expect(tokens[2].to_h).to eq type: :quantifier, token: :zero_or_one, text: '?', ts: 3, te: 4, level: 0, set_level: 0, conditional_level: 0 end specify('#next') do regexp = /a+b?c*d{2,3}/ tokens = RL.lex(regexp) a = tokens.first expect(a.text).to eq 'a' plus = a.next expect(plus.text).to eq '+' b = plus.next expect(b.text).to eq 'b' interval = tokens.last expect(interval.text).to eq '{2,3}' expect(interval.next).to be_nil end specify('#previous') do regexp = /a+b?c*d{2,3}/ tokens = RL.lex(regexp) interval = tokens.last expect(interval.text).to eq '{2,3}' d = interval.previous expect(d.text).to eq 'd' star = d.previous expect(star.text).to eq '*' c = star.previous expect(c.text).to eq 'c' a = tokens.first expect(a.text).to eq 'a' expect(a.previous).to be_nil end end ammar-regexp_parser-68cdeff/tasks/000077500000000000000000000000001506175332700174005ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/tasks/benchmark.rake000066400000000000000000000016221506175332700221770ustar00rootroot00000000000000# frozen_string_literal: true BENCHMARKS_DIR = "#{__dir__}/benchmarks" desc 'Run all IPS benchmarks' task :benchmark do Dir["#{BENCHMARKS_DIR}/*.rb"].sort.each { |file| load(file) } end namespace :benchmark do desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md' task :write_to_file do require 'stringio' string_io = StringIO.new with_stdouts(STDOUT, string_io) { Rake.application[:benchmark].invoke } File.write "#{BENCHMARKS_DIR}/log", "Results of rake:benchmark on #{RUBY_DESCRIPTION}\n\n" + string_io.string.gsub(/Warming up.*?Comparison:/m, '') end end def with_stdouts(*ios) old_stdout = $stdout ios.define_singleton_method(:method_missing) { |*args| each { |io| io.send(*args) } } ios.define_singleton_method(:respond_to?) { |*args| IO.respond_to?(*args) } $stdout = ios yield ensure $stdout = old_stdout end ammar-regexp_parser-68cdeff/tasks/benchmarks/000077500000000000000000000000001506175332700215155ustar00rootroot00000000000000ammar-regexp_parser-68cdeff/tasks/benchmarks/log000066400000000000000000000007661506175332700222320ustar00rootroot00000000000000Results of rake:benchmark on ruby 3.1.0p0 (2021-12-25 revision fb4df44d16) [arm64-darwin21] Parsing a minimal Regexp Scanner::scan: 32069.4 i/s Lexer::lex: 30700.6 i/s - same-ish: difference falls within error Parser::parse: 26248.5 i/s - 1.22x (± 0.00) slower Parsing a complex Regexp (URI.regexp) Scanner::scan: 843.4 i/s Lexer::lex: 546.3 i/s - 1.54x (± 0.00) slower Parser::parse: 332.5 i/s - 2.54x (± 0.00) slower ammar-regexp_parser-68cdeff/tasks/benchmarks/minimal_regexp.rb000066400000000000000000000005571506175332700250510ustar00rootroot00000000000000# frozen_string_literal: true require 'benchmark/ips' require_relative '../../lib/regexp_parser' puts 'Parsing a minimal Regexp' regexp = /./ Benchmark.ips do |x| x.report('Scanner::scan') { Regexp::Scanner.scan(regexp) } x.report('Lexer::lex') { Regexp::Lexer.lex(regexp) } x.report('Parser::parse') { Regexp::Parser.parse(regexp) } x.compare! end ammar-regexp_parser-68cdeff/tasks/benchmarks/uri_regexp.rb000066400000000000000000000006461506175332700242210ustar00rootroot00000000000000# frozen_string_literal: true require 'benchmark/ips' require_relative '../../lib/regexp_parser' puts 'Parsing a complex Regexp (URI.regexp)' require 'uri' regexp = URI::DEFAULT_PARSER.make_regexp Benchmark.ips do |x| x.report('Scanner::scan') { Regexp::Scanner.scan(regexp) } x.report('Lexer::lex') { Regexp::Lexer.lex(regexp) } x.report('Parser::parse') { Regexp::Parser.parse(regexp) } x.compare! end ammar-regexp_parser-68cdeff/tasks/props.rake000066400000000000000000000017061506175332700214130ustar00rootroot00000000000000# frozen_string_literal: true namespace :props do desc 'Write new property value hashes for the properties scanner' task :update do require 'regexp_property_values' RegexpPropertyValues.update dir = File.join(__dir__, '../lib/regexp_parser/scanner/properties') write_hash_to_file = ->(hash, path) do File.open(path, 'w') do |f| f.puts '# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT', *hash.sort.map { |pair| pair.join(',') } end puts "Wrote #{hash.count} aliases to `#{path}`" end long_names_to_tokens = RegexpPropertyValues.all.map do |val| [val.identifier, val.full_name.downcase] end write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.csv") short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v| [k.identifier, v.full_name.downcase] end write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.csv") end end ammar-regexp_parser-68cdeff/tasks/ragel.rake000066400000000000000000000046101506175332700213370ustar00rootroot00000000000000# frozen_string_literal: true # scanner.rl imports the other files RAGEL_SOURCE_PATH = File.join(__dir__, '../lib/regexp_parser/scanner/scanner.rl') RAGEL_OUTPUT_PATH = File.join(__dir__, '../lib/regexp_parser/scanner.rb') desc 'Process the ragel source files and output ruby code' task ragel: 'ragel:install' do # -L = omit line hint comments; -p = print human-readable labels flags = ENV['DEBUG_RAGEL'].to_i == 1 ? ['-p'] : ['-L'] # -F1 = use flat table-driven FSM, about 25% larger code, but about 30% faster flags << '-F1' sh 'ragel', '-R', RAGEL_SOURCE_PATH, '-o', RAGEL_OUTPUT_PATH, *flags cleaned_contents = File .read(RAGEL_OUTPUT_PATH) .gsub(/[ \t]+$/, '') # remove trailing whitespace emitted by ragel .gsub(/ then$/, '') # remove redundant then keywords emitted by ragel .gsub(/(?<=\d,)[ \t]+|^[ \t]+(?=-?\d)/, '') # compact FSM tables (saves ~6KB) .gsub(/(?<=\S ) +/, '') # compact in-line spaces .gsub(/\n(?:[ \t]*\n){2,}/, "\n\n") # compact blank lines File.open(RAGEL_OUTPUT_PATH, 'w') do |file| file.puts(<<-RUBY.gsub(/^\s+/, '')) # -*- frozen_string_literal: true; warn-indent: false -*- # # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY # # This file was generated from #{RAGEL_SOURCE_PATH.split('/').last} # by running `$ bundle exec rake ragel` RUBY file.write(cleaned_contents) end # Remove redundant begin/end blocks emitted by ragel. # This saves 1KB, but is disabled for now because it increases build time from 0.2s to 1s. # `bundle exec rubocop --only Style/RedundantBegin --autocorrect #{RAGEL_OUTPUT_PATH}` end namespace :ragel do desc 'Delete the ragel generated source file' task :clean do sh "rm -f #{RAGEL_OUTPUT_PATH}" end desc 'Make sure that ragel is installed' task :install do next if ENV['CI'] if system('sh -c "command -v ragel"') # already installed elsif system('sh -c "command -v brew"') puts 'ragel not found, installing with homebrew ...' `brew install ragel` elsif system('sh -c "command -v apt-get"') puts 'ragel not found, installing with apt-get ...' `sudo apt-get install -y ragel` else raise 'Could not install ragel. Please install it manually.' end end desc 'Deprecated alias for the ragel task' task rb: :ragel do warn 'The `ragel:rb` task is deprecated, please use `ragel` instead.' end end