pax_global_header00006660000000000000000000000064143352531350014516gustar00rootroot0000000000000052 comment=922ce35791df962ea0e8170466920e7672488cf9 ammar-regexp_parser-0494e56/000077500000000000000000000000001433525313500157415ustar00rootroot00000000000000ammar-regexp_parser-0494e56/.github/000077500000000000000000000000001433525313500173015ustar00rootroot00000000000000ammar-regexp_parser-0494e56/.github/workflows/000077500000000000000000000000001433525313500213365ustar00rootroot00000000000000ammar-regexp_parser-0494e56/.github/workflows/gouteur.yml000066400000000000000000000006571433525313500235630ustar00rootroot00000000000000name: gouteur on: [push, pull_request] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Set up Ruby uses: ruby/setup-ruby@v1 with: ruby-version: 2.7 - name: Prepare run: | bundle install --jobs 4 sudo apt-get install -yqq ragel bundle exec rake ragel:rb - name: Test run: bundle exec gouteur ammar-regexp_parser-0494e56/.github/workflows/lint.yml000066400000000000000000000016201433525313500230260ustar00rootroot00000000000000# based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml name: rubocop linting on: [push, pull_request] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Set up Ruby uses: ruby/setup-ruby@v1 with: ruby-version: 2.7 - name: Cache gems uses: actions/cache@v1 with: path: vendor/bundle key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }} restore-keys: | ${{ runner.os }}-rubocop- - name: Install gems run: | bundle config path vendor/bundle bundle install --jobs 4 --retry 3 # Create a dummy scanner.rb so we don't need Ragel but require statements work - name: Stub scanner.rb run: 'echo "class Regexp::Scanner; end" > lib/regexp_parser/scanner.rb' - name: Run rubocop run: bundle exec rubocop --lint 
ammar-regexp_parser-0494e56/.github/workflows/tests.yml000066400000000000000000000012101433525313500232150ustar00rootroot00000000000000name: tests on: push: pull_request: schedule: - cron: '11 11 14 * *' # at 11:11 am on the 14th of every month jobs: build: runs-on: ubuntu-latest strategy: matrix: ruby: [ '2.3', '2.4', '2.5', '2.6', '2.7', '3.0', '3.1', 'ruby-head' ] steps: - uses: actions/checkout@v2 - name: Set up Ruby ${{ matrix.ruby }} uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby }} - name: Install dependencies run: | bundle install --jobs 4 sudo apt-get install -yqq ragel - name: Test with Rake run: bundle exec rake test:full ammar-regexp_parser-0494e56/.gitignore000066400000000000000000000002341433525313500177300ustar00rootroot00000000000000*.gem .*.swp .DS_Store .ruby-version .tags .tags1 .tool-versions Gemfile.lock lib/regexp_parser/scanner.rb doc .yardoc .bundle/* pkg/* coverage/* tmp/* ammar-regexp_parser-0494e56/.gouteur.yml000066400000000000000000000007411433525313500202360ustar00rootroot00000000000000# Usage: https://github.com/jaynetics/gouteur/blob/main/README.md repos: - uri: https://github.com/jaynetics/js_regex - uri: https://github.com/jaynetics/repper - uri: https://github.com/rubocop-hq/rubocop tasks: rspec --pattern "**/{,*}regexp{,*,*/**/*}_spec.rb" - uri: https://github.com/mbj/mutant tasks: rspec --pattern "**/{,*}regexp{,*,*/**/*}_spec.rb" - uri: https://github.com/teamcapybara/capybara tasks: rspec spec/regexp_dissassembler_spec.rb ammar-regexp_parser-0494e56/.rubocop.yml000066400000000000000000000007271433525313500202210ustar00rootroot00000000000000AllCops: Exclude: - '{bin,pkg,tmp,vendor}/**/*' # vendored dependencies etc. 
- 'lib/regexp_parser/scanner.rb' # Ragel-generated code NewCops: enable RubyInterpreters: - ruby - rake TargetRubyVersion: 2.6 # really 2.0, but 2.6 is lowest supported by rubocop # ignore weird looking regexps in specs, we have these on purpose Lint/DuplicateRegexpCharacterClassElement: Exclude: ['spec/**/*'] Lint/MixedRegexpCaptureTypes: Exclude: ['spec/**/*'] ammar-regexp_parser-0494e56/CHANGELOG.md000066400000000000000000000652511433525313500175630ustar00rootroot00000000000000# Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] ## [2.6.1] - 2022-11-16 - [Janosch Müller](mailto:janosch84@gmail.com) ### Fixed - fixed scanning of two negative lookbehind edge cases * `(?` used to raise a ScannerError * `(?)y` used to be misinterpreted as a named group * thanks to [Sergio Medina](https://github.com/serch) for the report ## [2.6.0] - 2022-09-26 - [Janosch Müller](mailto:janosch84@gmail.com) ### Fixed - fixed `#referenced_expression` for `\g<0>` (was `nil`, is now the `Root` exp) - fixed `#reference`, `#referenced_expression` for recursion level backrefs * e.g. `(a)(b)\k<-1+1>` * `#referenced_expression` was `nil`, now it is the correct `Group` exp - detect and raise for two more syntax errors when parsing String input * quantification of option switches (e.g. `(?i)+`) * invalid references (e.g. `/\k<1>/`) * these are a `SyntaxError` in Ruby, so could only be passed as a String ### Added - `Regexp::Expression::Base#human_name` * returns a nice, human-readable description of the expression - `Regexp::Expression::Base#optional?` * returns `true` if the expression is quantified accordingly (e.g. 
with `*`, `{,n}`) - added a deprecation warning when calling `#to_re` on set members ## [2.5.0] - 2022-05-27 - [Janosch Müller](mailto:janosch84@gmail.com) ### Added - `Regexp::Expression::Base.construct` and `.token_class` methods * see the [wiki](https://github.com/ammar/regexp_parser/wiki) for details ## [2.4.0] - 2022-05-09 - [Janosch Müller](mailto:janosch84@gmail.com) ### Fixed - fixed interpretation of `+` and `?` after interval quantifiers (`{n,n}`) * they used to be treated as reluctant or possessive mode indicators * however, Ruby does not support these modes for interval quantifiers * they are now treated as chained quantifiers instead, as Ruby does it * c.f. [#3](https://github.com/ammar/regexp_parser/issues/3) - fixed `Expression::Base#nesting_level` for some tree rewrite cases * e.g. the alternatives in `/a|[b]/` had an inconsistent nesting_level - fixed `Scanner` accepting invalid posix classes, e.g. `[[:foo:]]` * they raise a `SyntaxError` when used in a Regexp, so could only be passed as String * they now raise a `Regexp::Scanner::ValidationError` in the `Scanner` ### Added - added `Expression::Base#==` for (deep) comparison of expressions - added `Expression::Base#parts` * returns the text elements and subexpressions of an expression * e.g. `parse(/(a)/)[0].parts # => ["(", #, ")"]` - added `Expression::Base#te` (a.k.a. 
token end index) * `Expression::Subexpression` always had `#te`, only terminal nodes lacked it so far - made some `Expression::Base` methods available on `Quantifier` instances, too * `#type`, `#type?`, `#is?`, `#one_of?`, `#options`, `#terminal?` * `#base_length`, `#full_length`, `#starts_at`, `#te`, `#ts`, `#offset` * `#conditional_level`, `#level`, `#nesting_level` , `#set_level` * this allows a more unified handling with `Expression::Base` instances - allowed `Quantifier#initialize` to take a token and options Hash like other nodes - added a deprecation warning for initializing Quantifiers with 4+ arguments: Calling `Expression::Base#quantify` or `Quantifier.new` with 4+ arguments is deprecated. It will no longer be supported in regexp_parser v3.0.0. Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode` with `::Regexp::Token.new(:quantifier, token, text)`. min, max, and mode will be derived automatically. Or do `exp.quantifier = Quantifier.construct(token: token, text: str)`. This is consistent with how Expression::Base instances are created. ## [2.3.1] - 2022-04-24 - [Janosch Müller](mailto:janosch84@gmail.com) ### Fixed - removed five inexistent unicode properties from `Syntax#features` * these were never supported by Ruby or the `Regexp::Scanner` * thanks to [Markus Schirp](https://github.com/mbj) for the report ## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com) ### Added - improved parsing performance through `Syntax` refactoring * instead of fresh `Syntax` instances, pre-loaded constants are now re-used * this approximately doubles the parsing speed for simple regexps - added methods to `Syntax` classes to show relative feature sets * e.g. 
`Regexp::Syntax::V3_2_0.added_features` - support for new unicode properties of Ruby 3.2 / Unicode 14.0 ## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com) ### Fixed - fixed Syntax version of absence groups (`(?~...)`) * the lexer accepted them for any Ruby version * now they are only recognized for Ruby >= 2.4.1 in which they were introduced - reduced gem size by excluding specs from package - removed deprecated `test_files` gemspec setting - no longer depend on `yaml`/`psych` (except for Ruby <= 2.4) - no longer depend on `set` * `set` was removed from the stdlib and made a standalone gem as of Ruby 3 * this made it a hidden/undeclared dependency of `regexp_parser` ## [2.2.0] - 2021-12-04 - [Janosch Müller](mailto:janosch84@gmail.com) ### Added - added support for 13 new unicode properties introduced in Ruby 3.1.0 ## [2.1.1] - 2021-02-23 - [Janosch Müller](mailto:janosch84@gmail.com) ### Fixed - fixed `NameError` when requiring only `'regexp_parser/scanner'` in v2.1.0 * thanks to [Jared White and Sam Ruby](https://github.com/ruby2js/ruby2js) for the report ## [2.1.0] - 2021-02-22 - [Janosch Müller](mailto:janosch84@gmail.com) ### Added - common ancestor for all scanning/parsing/lexing errors * `Regexp::Parser::Error` can now be rescued as a catch-all * the following errors (and their many descendants) now inherit from it: - `Regexp::Expression::Conditional::TooManyBranches` - `Regexp::Parser::ParserError` - `Regexp::Scanner::ScannerError` - `Regexp::Scanner::ValidationError` - `Regexp::Syntax::SyntaxError` * it replaces `ArgumentError` in some rare cases (`Regexp::Parser.parse('?')`) * thanks to [sandstrom](https://github.com/sandstrom) for the cue ### Fixed - fixed scanning of whole-pattern recursion calls `\g<0>` and `\g'0'` * a regression in v2.0.1 had caused them to be scanned as literals - fixed scanning of some backreference and subexpression call edge cases * e.g. 
`\k<+1>`, `\g` - fixed tokenization of some escapes in character sets * `.`, `|`, `{`, `}`, `(`, `)`, `^`, `$`, `?`, `+`, `*` * all of these correctly emitted `#type` `:literal` and `#token` `:literal` if *not* escaped * if escaped, they emitted e.g. `#type` `:escape` and `#token` `:group_open` for `[\(]` * the escaped versions now correctly emit `#type` `:escape` and `#token` `:literal` - fixed handling of control/metacontrol escapes in character sets * e.g. `[\cX]`, `[\M-\C-X]` * they were misread as bunch of individual literals, escapes, and ranges - fixed some cases where calling `#dup`/`#clone` on expressions led to shared state ## [2.0.3] - 2020-12-28 - [Janosch Müller](mailto:janosch84@gmail.com) ### Fixed - fixed error when scanning some unlikely and redundant but valid charset patterns * e.g. `/[[.a-b.]]/`, `/[[=e=]]/`, - fixed ancestry of some error classes related to syntax version lookup * `NotImplementedError`, `InvalidVersionNameError`, `UnknownSyntaxNameError` * they now correctly inherit from `Regexp::Syntax::SyntaxError` instead of Rubys `::SyntaxError` ## [2.0.2] - 2020-12-25 - [Janosch Müller](mailto:janosch84@gmail.com) ### Fixed - fixed `FrozenError` when calling `#to_s` on a frozen `Group::Passive` * thanks to [Daniel Gollahon](https://github.com/dgollahon) ## [2.0.1] - 2020-12-20 - [Janosch Müller](mailto:janosch84@gmail.com) ### Fixed - fixed error when scanning some group names * this affected names containing hyphens, digits or multibyte chars, e.g. `/(?a)/` * thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report - fixed error when scanning hex escapes with just one hex digit * e.g. 
`/\x0A/` was scanned correctly, but the equivalent `/\xA/` was not * thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report ## [2.0.0] - 2020-11-25 - [Janosch Müller](mailto:janosch84@gmail.com) ### Changed - some methods that used to return byte-based indices now return char-based indices * the returned values have only changed for Regexps that contain multibyte chars * this is only a breaking change if you used such methods directly AND relied on them pointing to bytes * affected methods: * `Regexp::Token` `#length`, `#offset`, `#te`, `#ts` * `Regexp::Expression::Base` `#full_length`, `#offset`, `#starts_at`, `#te`, `#ts` * thanks to [Akinori MUSHA](https://github.com/knu) for the report - removed some deprecated methods/signatures * these are rarely used and have been showing deprecation warnings for a long time * `Regexp::Expression::Subexpression.new` with 3 arguments * `Regexp::Expression::Root.new` without a token argument * `Regexp::Expression.parsed` ### Added - `Regexp::Expression::Base#base_length` * returns the character count of an expression body, ignoring any quantifier - pragmatic, experimental support for chained quantifiers * e.g.: `/^a{10}{4,6}$/` matches exactly 40, 50 or 60 `a`s * successive quantifiers used to be silently dropped by the parser * they are now wrapped with passive groups as if they were written `(?:a{10}){4,6}` * thanks to [calfeld](https://github.com/calfeld) for reporting this a while back ### Fixed - incorrect encoding output for non-ascii comments * this led to a crash when calling `#to_s` on parse results containing such comments * thanks to [Michael Glass](https://github.com/michaelglass) for the report - some crashes when scanning contrived patterns such as `'\😋'` ### [1.8.2] - 2020-10-11 - [Janosch Müller](mailto:janosch84@gmail.com) ### Fixed - fix `FrozenError` in `Expression::Base#repetitions` on Ruby 3.0 * thanks to [Thomas Walpole](https://github.com/twalpole) - removed "unknown future version" 
warning on Ruby 3.0 ### [1.8.1] - 2020-09-28 - [Janosch Müller](mailto:janosch84@gmail.com) ### Fixed - fixed scanning of comment-like text in normal mode * this was an old bug, but had become more prevalent in v1.8.0 * thanks to [Tietew](https://github.com/Tietew) for the report - specified correct minimum Ruby version in gemspec * it said 1.9 but really required 2.0 as of v1.8.0 ### [1.8.0] - 2020-09-20 - [Janosch Müller](mailto:janosch84@gmail.com) ### Changed - dropped support for running on Ruby 1.9.x ### Added - regexp flags can now be passed when parsing a `String` as regexp body * see the [README](/README.md#usage) for details * thanks to [Owen Stephens](https://github.com/owst) - bare occurrences of `\g` and `\k` are now allowed and scanned as literal escapes * matches Onigmo behavior * thanks for the report to [Marc-André Lafortune](https://github.com/marcandre) ### Fixed - fixed parsing comments without preceding space or trailing newline in x-mode * thanks to [Owen Stephens](https://github.com/owst) ### [1.7.1] - 2020-06-07 - [Ammar Ali](mailto:ammarabuali@gmail.com) ### Fixed - Support for literals that include the unescaped delimiters `{`, `}`, and `]`. These delimiters are informally supported by various regexp engines. ### [1.7.0] - 2020-02-23 - [Janosch Müller](mailto:janosch84@gmail.com) ### Added - `Expression::Base#each_expression` and `#traverse` can now be called without a block * this returns an `Enumerator` and allows chaining, e.g. `each_expression.select` * thanks to [Masataka Kuwabara](https://github.com/pocke) ### Fixed - `MatchLength#each` no longer ignores the given `limit:` when called without a block ### [1.6.0] - 2019-06-16 - [Janosch Müller](mailto:janosch84@gmail.com) ### Added - Added support for 16 new unicode properties introduced in Ruby 2.6.2 and 2.6.3 ### [1.5.1] - 2019-05-23 - [Janosch Müller](mailto:janosch84@gmail.com) ### Fixed - Fixed `#options` (and thus `#i?`, `#u?` etc.) 
not being set for some expressions: * this affected posix classes as well as alternation, conditional, and intersection branches * `#options` was already correct for all child expressions of such branches * this only made an operational difference for posix classes as they respect encoding flags - Fixed `#options` not respecting all negative options in weird cases like '(?u-m-x)' - Fixed `Group#option_changes` not accounting for indirectly disabled (overridden) encoding flags - Fixed `Scanner` allowing negative encoding options if there were no positive options, e.g. '(?-u)' - Fixed `ScannerError` for some valid meta/control sequences such as '\\C-\\\\' - Fixed `Expression::Base#match` and `#=~` not working with a single argument ### [1.5.0] - 2019-05-14 - [Janosch Müller](mailto:janosch84@gmail.com) ### Added - Added `#referenced_expression` for backrefs, subexp calls and conditionals * returns the `Group` expression that is being referenced via name or number - Added `Expression::Base#repetitions` * returns a `Range` of allowed repetitions (`1..1` if there is no quantifier) * like `#quantity` but with a more uniform interface - Added `Expression::Base#match_length` * allows to inspect and iterate over String lengths matched by the Expression ### Fixed - Fixed `Expression::Base#clone` "direction" * it used to dup ivars onto the callee, leaving only the clone referencing the original objects * this will affect you if you call `#eql?`/`#equal?` on expressions or use them as Hash keys - Fixed `#clone` results for `Sequences`, e.g. alternations and conditionals * the inner `#text` was cloned onto the `Sequence` and thus duplicated * e.g. `Regexp::Parser.parse(/(a|bc)/).clone.to_s # => (aa|bcbc)` - Fixed inconsistent `#to_s` output for `Sequences` * it used to return only the "specific" text, e.g. 
"|" for an alternation * now it includes nested expressions as it does for all other `Subexpressions` - Fixed quantification of codepoint lists with more than one entry (`\u{62 63 64}+`) * quantifiers apply only to the last entry, so this token is now split up if quantified ### [1.4.0] - 2019-04-02 - [Janosch Müller](mailto:janosch84@gmail.com) ### Added - Added support for 19 new unicode properties introduced in Ruby 2.6.0 ### [1.3.0] - 2018-11-14 - [Janosch Müller](mailto:janosch84@gmail.com) ### Added - `Syntax#features` returns a `Hash` of all types and tokens supported by a given `Syntax` ### Fixed - Thanks to [Akira Matsuda](https://github.com/amatsuda) * eliminated warning "assigned but unused variable - testEof" ## [1.2.0] - 2018-09-28 - [Janosch Müller](mailto:janosch84@gmail.com) ### Added - `Subexpression` (branch node) includes `Enumerable`, allowing to `#select` children etc. ### Fixed - Fixed missing quantifier in `Conditional::Expression` methods `#to_s`, `#to_re` - `Conditional::Condition` no longer lives outside the recursive `#expressions` tree * it used to be the only expression stored in a custom ivar, complicating traversal * its setter and getter (`#condition=`, `#condition`) still work as before ## [1.1.0] - 2018-09-17 - [Janosch Müller](mailto:janosch84@gmail.com) ### Added - Added `Quantifier` methods `#greedy?`, `#possessive?`, `#reluctant?`/`#lazy?` - Added `Group::Options#option_changes` * shows the options enabled or disabled by the given options group * as with all other expressions, `#options` shows the overall active options - Added `Conditional#reference` and `Condition#reference`, indicating the determinative group - Added `Subexpression#dig`, acts like [`Array#dig`](http://ruby-doc.org/core-2.5.0/Array.html#method-i-dig) ### Fixed - Fixed parsing of quantified conditional expressions (quantifiers were assigned to the wrong expression) - Fixed scanning and parsing of forward-referring subexpression calls (e.g. 
`\g<+1>`) - `Root` and `Sequence` expressions now support the same constructor signature as all other expressions ## [1.0.0] - 2018-09-01 - [Janosch Müller](mailto:janosch84@gmail.com) This release includes several breaking changes, mostly to character sets, #map and properties. ### Changed - Changed handling of sets (a.k.a. character classes or "bracket expressions") * see PR [#55](https://github.com/ammar/regexp_parser/pull/55) / issue [#47](https://github.com/ammar/regexp_parser/issues/47) for details * sets are now parsed to expression trees like other nestable expressions * `#scan` now emits the same tokens as outside sets (no longer `:set, :member`) * `CharacterSet#members` has been removed * new `Range` and `Intersection` classes represent corresponding syntax features * a new `PosixClass` expression class represents e.g. `[[:ascii:]]` * `PosixClass` instances behave like `Property` ones, e.g. support `#negative?` * `#scan` emits `:(non)posixclass, :` instead of `:set, :char_(non)` - Changed `Subexpression#map` to act like regular `Enumerable#map` * the old behavior is available as `Subexpression#flat_map` * e.g. `parse(/[a]/).map(&:to_s) == ["[a]"]`; used to be `["[a]", "a"]` - Changed expression emissions for some escape sequences * `EscapeSequence::Codepoint`, `CodepointList`, `Hex` and `Octal` are now all used * they already existed, but were all parsed as `EscapeSequence::Literal` * e.g. `\x97` is now `EscapeSequence::Hex` instead of `EscapeSequence::Literal` - Changed naming of many property tokens (emitted for `\p{...}`) * if you work with these tokens, see PR [#56](https://github.com/ammar/regexp_parser/pull/56) for details * e.g. `:punct_dash` is now `:dash_punctuation` - Changed `(?m)` and the likes to emit as `:options_switch` token (@4ade4d1) * allows differentiating from group-local `:options`, e.g. 
`(?m:.)` - Changed name of `Backreference::..NestLevel` to `..RecursionLevel` (@4184339) - Changed `Backreference::Number#number` from `String` to `Integer` (@40a2231) ### Added - Added support for all previously missing properties (about 250) - Added `Expression::UnicodeProperty#shortcut` (e.g. returns "m" for `\p{mark}`) - Added `#char(s)` and `#codepoint(s)` methods to all `EscapeSequence` expressions - Added `#number`/`#name`/`#recursion_level` to all backref/call expressions (@174bf21) - Added `#number` and `#number_at_level` to capturing group expressions (@40a2231) ### Fixed - Fixed Ruby version mapping of some properties - Fixed scanning of some property spellings, e.g. with dashes - Fixed some incorrect property alias normalizations - Fixed scanning of codepoint escapes with 6 digits (e.g. `\u{10FFFF}`) - Fixed scanning of `\R` and `\X` within sets; they act as literals there ## [0.5.0] - 2018-04-29 - [Janosch Müller](mailto:janosch84@gmail.com) ### Changed - Changed handling of Ruby versions (PR [#53](https://github.com/ammar/regexp_parser/pull/53)) * New Ruby versions are now supported by default * Some deep-lying APIs have changed, which should not affect most users: * `Regexp::Syntax::VERSIONS` is gone * Syntax version names have changed from `Regexp::Syntax::Ruby::Vnnn` to `Regexp::Syntax::Vn_n_n` * Syntax version classes for Ruby versions without regex feature changes are no longer predefined and are now only created on demand / lazily * `Regexp::Syntax::supported?` returns true for any argument >= 1.8.6 ### Fixed - Fixed some use cases of Expression methods #strfregexp and #to_h (@e738107) ### Added - Added full signature support to collection methods of Expressions (@aa7c55a) ## [0.4.13] - 2018-04-04 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Added ruby version files for 2.2.10 and 2.3.7 ## [0.4.12] - 2018-03-30 - [Janosch Müller](mailto:janosch84@gmail.com) - Added ruby version files for 2.4.4 and 2.5.1 ## [0.4.11] - 2018-03-04 - [Janosch 
Müller](mailto:janosch84@gmail.com) - Fixed UnknownSyntaxNameError introduced in v0.4.10 if the gems parent dir tree included a 'ruby' dir ## [0.4.10] - 2018-03-04 - [Janosch Müller](mailto:janosch84@gmail.com) - Added ruby version file for 2.6.0 - Added support for Emoji properties (available in Ruby since 2.5.0) - Added support for XPosixPunct and Regional_Indicator properties - Fixed parsing of Unicode 6.0 and 7.0 script properties - Fixed parsing of the special Assigned property - Fixed scanning of InCyrillic_Supplement property ## [0.4.9] - 2017-12-25 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Added ruby version file for 2.5.0 ## [0.4.8] - 2017-12-18 - [Janosch Müller](mailto:janosch84@gmail.com) - Added ruby version files for 2.2.9, 2.3.6, and 2.4.3 ## [0.4.7] - 2017-10-15 - [Janosch Müller](mailto:janosch84@gmail.com) - Fixed a thread safety issue (issue #45) - Some public class methods that were only reliable for internal use are now private instance methods (PR #46) - Improved the usefulness of Expression::Base#options (issue #43) - #options and derived methods such as #i?, #m? and #x? are now defined for all Expressions that are affected by such flags. 
- Fixed scanning of whitespace following (?x) (commit 5c94bd2) - Fixed a Parser bug where the #number attribute of traditional numerical backreferences was not set correctly (commit 851b620) ## [0.4.6] - 2017-09-18 - [Janosch Müller](mailto:janosch84@gmail.com) - Added Parser support for hex escapes in sets (PR #36) - Added Parser support for octal escapes (PR #37) - Added support for cluster types \R and \X (PR #38) - Added support for more metacontrol notations (PR #39) ## [0.4.5] - 2017-09-17 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Thanks to [Janosch Müller](https://github.com/janosch-x): * Support ruby 2.2.7 (PR #42) - Added ruby version files for 2.2.8, 2.3.5, and 2.4.2 ## [0.4.4] - 2017-07-10 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Thanks to [Janosch Müller](https://github.com/janosch-x): * Add support for new absence operator (PR #33) - Thanks to [Bartek Bułat](https://github.com/barthez): * Add support for Ruby 2.3.4 version (PR #40) ## [0.4.3] - 2017-03-24 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Added ruby version file for 2.4.1 ## [0.4.2] - 2017-01-10 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Thanks to [Janosch Müller](https://github.com/janosch-x): * Support ruby 2.4 (PR #30) * Improve codepoint handling (PR #27) ## [0.4.1] - 2016-11-22 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Updated ruby version file for 2.3.3 ## [0.4.0] - 2016-11-20 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Added Syntax.supported? 
method - Updated ruby versions for latest releases; 2.1.10, 2.2.6, and 2.3.2 ## [0.3.6] - 2016-06-08 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Thanks to [John Backus](https://github.com/backus): * Remove warnings (PR #26) ## [0.3.5] - 2016-05-30 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Thanks to [John Backus](https://github.com/backus): * Fix parsing of /\xFF/n (hex:escape) (PR #24) ## [0.3.4] - 2016-05-25 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Thanks to [John Backus](https://github.com/backus): * Fix warnings (PR #19) - Thanks to [Dana Scheider](https://github.com/danascheider): * Correct error in README (PR #20) - Fixed mistyped \h and \H character types (issue #21) - Added ancestry syntax files for latest rubies (issue #22) ## [0.3.3] - 2016-04-26 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Thanks to [John Backus](https://github.com/backus): * Fixed scanning of zero length comments (PR #12) * Fixed missing escape:codepoint_list syntax token (PR #14) * Fixed to_s for modified interval quantifiers (PR #17) ## [0.3.2] - 2016-01-01 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Updated ruby versions for latest releases; 2.1.8, 2.2.4, and 2.3.0 - Fixed class name for UnknownSyntaxNameError exception - Added UnicodeBlocks support to the parser. - Added UnicodeBlocks support to the scanner. - Added expand_members method to CharacterSet, returns traditional or unicode property forms of shothands (\d, \W, \s, etc.) - Improved meaning and output of %t and %T in strfregexp. - Added syntax versions for ruby 2.1.4 and 2.1.5 and updated latest 2.1 version. - Added to_h methods to Expression, Subexpression, and Quantifier. - Added traversal methods; traverse, each_expression, and map. - Added token/type test methods; type?, is?, and one_of? - Added printing method strfregexp, inspired by strftime. - Added scanning and parsing of free spacing (x mode) expressions. - Improved handling of inline options (?mixdau:...) - Added conditional expressions. Ruby 2.0. 
- Added keep (\K) markers. Ruby 2.0. - Added d, a, and u options. Ruby 2.0. - Added missing meta sequences to the parser. They were supported by the scanner only. - Renamed Lexer's method to lex, added an alias to the old name (scan) - Use #map instead of #each to run the block in Lexer.lex. - Replaced VERSION.yml file with a constant. - Update tokens and scanner with new additions in Unicode 7.0. ## [0.1.6] - 2014-10-06 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Fixed test and gem building rake tasks and extracted the gem specification from the Rakefile into a .gemspec file. - Added syntax files for missing ruby 2.x versions. These do not add extra syntax support, they just make the gem work with the newer ruby versions. - Fixed a parser bug where an alternation sequence that contained nested expressions was incorrectly being appended to the parent expression when the nesting was exited. e.g. in /a|(b)c/, c was appended to the root. - Fixed a bug where character types were not being correctly scanned within character sets. e.g. in [\d], two tokens were scanned; one for the backslash '\' and one for the 'd' ## [0.1.5] - 2014-01-14 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Added syntax stubs for ruby versions 2.0 and 2.1 - Added clone methods for deep copying expressions. - Added optional format argument for to_s on expressions to return the text of the expression with (:full, the default) or without (:base) its quantifier. - Renamed the :beginning_of_line and :end_of_line tokens to :bol and :eol. - Fixed a bug where alternations with more than two alternatives and one of them ending in a group were being incorrectly nested. - Improved EOF handling in general and especially from sequences like hex and control escapes. - Fixed a bug where named groups with an empty name would return a blank token []. - Fixed a bug where member of a parent set where being added to its last subset. - Fixed a few mutable string bugs by calling dup on the originals. 
- Made ruby 1.8.6 the base for all 1.8 syntax, and the 1.8 name a pointer to the latest (1.8.7 at this time) - Removed look-behind assertions (positive and negative) from 1.8 syntax - Added control (\cc and \C-c) and meta (\M-c) escapes to 1.8 syntax - The default syntax is now the one of the running ruby version in both the lexer and the parser. ## [0.1.0] - 2010-11-21 - [Ammar Ali](mailto:ammarabuali@gmail.com) - Initial release ammar-regexp_parser-0494e56/Gemfile000066400000000000000000000004651433525313500172410ustar00rootroot00000000000000source 'https://rubygems.org' gemspec group :development, :test do gem 'ice_nine', '~> 0.11.2' gem 'rake', '~> 13.0' gem 'regexp_property_values', '~> 1.3' gem 'rspec', '~> 3.10' if RUBY_VERSION.to_f >= 2.7 gem 'benchmark-ips', '~> 2.1' gem 'gouteur' gem 'rubocop', '~> 1.7' end end ammar-regexp_parser-0494e56/LICENSE000066400000000000000000000020521433525313500167450ustar00rootroot00000000000000Copyright (c) 2010, 2012-2022, Ammar Ali Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ammar-regexp_parser-0494e56/README.md000066400000000000000000000516641433525313500172340ustar00rootroot00000000000000# Regexp::Parser [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser) [![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions) [![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions) [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges) A Ruby gem for tokenizing, parsing, and transforming regular expressions. * Multilayered * A scanner/tokenizer based on [Ragel](http://www.colm.net/open-source/ragel/) * A lexer that produces a "stream" of [Token objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects) * A parser that produces a "tree" of [Expression objects (OO API)](https://github.com/ammar/regexp_parser/wiki/Expression-Objects) * Runs on Ruby 2.x, 3.x and JRuby runtimes * Recognizes Ruby 1.8, 1.9, 2.x and 3.x regular expressions [See Supported Syntax](#supported-syntax) _For examples of regexp_parser in use, see [Example Projects](#example-projects)._ --- ## Requirements * Ruby >= 2.0 * Ragel >= 6.0, but only if you want to build the gem or work on the scanner. --- ## Install Install the gem with: `gem install regexp_parser` Or, add it to your project's `Gemfile`: ```gem 'regexp_parser', '~> X.Y.Z'``` See the badge at the top of this README or [rubygems](https://rubygems.org/gems/regexp_parser) for the latest version number. --- ## Usage The three main modules are **Scanner**, **Lexer**, and **Parser**.
Each of them provides a single method that takes a regular expression (as a Regexp object or a string) and returns its results. The **Lexer** and the **Parser** accept an optional second argument that specifies the syntax version, like 'ruby/2.0', which defaults to the host Ruby version (using RUBY_VERSION). Here are the basic usage examples: ```ruby require 'regexp_parser' Regexp::Scanner.scan(regexp) Regexp::Lexer.lex(regexp) Regexp::Parser.parse(regexp) ``` All three methods accept a block as the last argument, which, if given, gets called with the results as follows: * **Scanner**: the block gets passed the results as they are scanned. See the example in the next section for details. * **Lexer**: after completion, the block gets passed the tokens one by one. _The result of the block is returned._ * **Parser**: after completion, the block gets passed the root expression. _The result of the block is returned._ All three methods accept either a `Regexp` or `String` (containing the pattern) - if a String is passed, `options` can be supplied: ```ruby require 'regexp_parser' Regexp::Parser.parse( "a+ # Recognizes a and A...", options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE ) ``` --- ## Components ### Scanner A Ragel-generated scanner that recognizes the cumulative syntax of all supported syntax versions. It breaks a given expression's text into the smallest parts, and identifies their type, token, text, and start/end offsets within the pattern. #### Example The following scans the given pattern and prints out the type, token, text and start/end offsets for each token found. ```ruby require 'regexp_parser' Regexp::Scanner.scan(/(ab?(cd)*[e-h]+)/) do |type, token, text, ts, te| puts "type: #{type}, token: #{token}, text: '#{text}' [#{ts}..#{te}]" end # output # type: group, token: capture, text: '(' [0..1] # type: literal, token: literal, text: 'ab' [1..3] # type: quantifier, token: zero_or_one, text: '?' 
[3..4] # type: group, token: capture, text: '(' [4..5] # type: literal, token: literal, text: 'cd' [5..7] # type: group, token: close, text: ')' [7..8] # type: quantifier, token: zero_or_more, text: '*' [8..9] # type: set, token: open, text: '[' [9..10] # type: set, token: range, text: 'e-h' [10..13] # type: set, token: close, text: ']' [13..14] # type: quantifier, token: one_or_more, text: '+' [14..15] # type: group, token: close, text: ')' [15..16] ``` A one-liner that uses map on the result of the scan to return the textual parts of the pattern: ```ruby Regexp::Scanner.scan(/(cat?([bhm]at)){3,5}/).map { |token| token[2] } #=> ["(", "cat", "?", "(", "[", "b", "h", "m", "]", "at", ")", ")", "{3,5}"] ``` #### Notes * The scanner performs basic syntax error checking, like detecting missing balancing punctuation and premature end of pattern. Flavor validity checks are performed in the lexer, which uses a syntax object. * If the input is a Ruby **Regexp** object, the scanner calls #source on it to get its string representation. #source does not include the options of the expression (m, i, and x). To include the options in the scan, #to_s should be called on the **Regexp** before passing it to the scanner or the lexer. For the parser, however, this is not necessary. It automatically exposes the options of a passed **Regexp** in the returned root expression. * To keep the scanner simple(r) and fairly reusable for other purposes, it does not perform lexical analysis on the tokens, sticking to the task of identifying the smallest possible tokens and leaving lexical analysis to the lexer. * The MRI implementation may accept expressions that either conflict with the documentation or are undocumented, like `{}` and `]` _(unescaped)_. The scanner will try to support as many of these cases as possible. --- ### Syntax Defines the supported tokens for a specific engine implementation (aka a flavor). 
Syntax classes act as lookup tables, and are layered to create flavor variations. Syntax only comes into play in the lexer. #### Example The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and checks a few of their implementation features. ```ruby require 'regexp_parser' ruby_20 = Regexp::Syntax.for 'ruby/2.0' ruby_20.implements? :quantifier, :zero_or_one # => true ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true ruby_20.implements? :quantifier, :zero_or_one_possessive # => true ruby_20.implements? :conditional, :condition # => true ruby_19 = Regexp::Syntax.for 'ruby/1.9' ruby_19.implements? :quantifier, :zero_or_one # => true ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true ruby_19.implements? :quantifier, :zero_or_one_possessive # => true ruby_19.implements? :conditional, :condition # => false ruby_18 = Regexp::Syntax.for 'ruby/1.8' ruby_18.implements? :quantifier, :zero_or_one # => true ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true ruby_18.implements? :quantifier, :zero_or_one_possessive # => false ruby_18.implements? :conditional, :condition # => false ``` Syntax objects can also be queried about their complete and relative feature sets. ```ruby require 'regexp_parser' ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0 ruby_20.added_features # => { conditional: [...], ... } ruby_20.removed_features # => { property: [:newline], ... } ruby_20.features # => { anchor: [...], ... } ``` #### Notes * Variations on a token, for example a named group with angle brackets (< and >) vs one with a pair of single quotes, are specified with an underscore followed by two characters appended to the base token. In the previous named group example, the tokens would be :named_ab (angle brackets) and :named_sq (single quotes). These variations are normalized by the syntax to :named. --- ### Lexer Sits on top of the scanner and performs lexical analysis on the tokens that it emits. 
Among its tasks are: breaking quantified literal runs, collecting the emitted token attributes into Token objects, calculating their nesting depth, normalizing tokens for the parser, and checking if the tokens are implemented by the given syntax version. See the [Token Objects](https://github.com/ammar/regexp_parser/wiki/Token-Objects) wiki page for more information on Token objects. #### Example The following example lexes the given pattern, checks it against the Ruby 1.9 syntax, and prints the token objects' text indented to their level. ```ruby require 'regexp_parser' Regexp::Lexer.lex(/a?(b(c))*[d]+/, 'ruby/1.9') do |token| puts "#{' ' * token.level}#{token.text}" end # output # a # ? # ( # b # ( # c # ) # ) # * # [ # d # ] # + ``` A one-liner that returns an array of the textual parts of the given pattern. Compare the output with that of the one-liner example of the **Scanner**; notably how the sequence 'cat' is treated. The 't' is separated because it's followed by a quantifier that only applies to it. ```ruby Regexp::Lexer.lex(/(cat?([b]at)){3,5}/).map { |token| token.text } #=> ["(", "ca", "t", "?", "(", "[", "b", "]", "at", ")", ")", "{3,5}"] ``` #### Notes * The syntax argument is optional. It defaults to the version of the Ruby interpreter in use, as returned by RUBY_VERSION. * The lexer normalizes some tokens, as noted in the Syntax section above. --- ### Parser Sits on top of the lexer and transforms the "stream" of Token objects emitted by it into a tree of Expression objects represented by an instance of the Expression::Root class. See the [Expression Objects](https://github.com/ammar/regexp_parser/wiki/Expression-Objects) wiki page for attributes and methods.
#### Example ```ruby require 'regexp_parser' regex = /a?(b+(c)d)*(?<name>[0-9]+)/ tree = Regexp::Parser.parse(regex, 'ruby/2.1') tree.traverse do |event, exp| puts "#{event}: #{exp.type} `#{exp.to_s}`" end # Output # visit: literal `a?` # enter: group `(b+(c)d)*` # visit: literal `b+` # enter: group `(c)` # visit: literal `c` # exit: group `(c)` # visit: literal `d` # exit: group `(b+(c)d)*` # enter: group `(?<name>[0-9]+)` # visit: set `[0-9]+` # exit: group `(?<name>[0-9]+)` ``` Another example, using each_expression and strfregexp to print the object tree. _See the traverse.rb and strfregexp.rb files under `lib/regexp_parser/expression/methods` for more information on these methods._ ```ruby include_root = true indent_offset = include_root ? 1 : 0 tree.each_expression(include_root) do |exp, level_index| puts exp.strfregexp("%>> %c", indent_offset) end # Output # > Regexp::Expression::Root # > Regexp::Expression::Literal # > Regexp::Expression::Group::Capture # > Regexp::Expression::Literal # > Regexp::Expression::Group::Capture # > Regexp::Expression::Literal # > Regexp::Expression::Literal # > Regexp::Expression::Group::Named # > Regexp::Expression::CharacterSet ``` _Note: quantifiers do not appear in the output because they are members of the Expression class.
See the next section for details._ --- ## Supported Syntax The three modules support all the regular expression syntax features of Ruby 1.8, 1.9, 2.x and 3.x: _Note that not all of these are available in all versions of Ruby_ | Syntax Feature | Examples | ⋯ | | ------------------------------------- | ------------------------------------------------------- |:--------:| | **Alternation** | `a\|b\|c` | ✓ | | **Anchors** | `\A`, `^`, `\b` | ✓ | | **Character Classes** | `[abc]`, `[^\\]`, `[a-d&&aeiou]`, `[a=e=b]` | ✓ | | **Character Types** | `\d`, `\H`, `\s` | ✓ | | **Cluster Types** | `\R`, `\X` | ✓ | | **Conditional Exps.** | `(?(cond)yes-subexp)`, `(?(cond)yes-subexp\|no-subexp)` | ✓ | | **Escape Sequences** | `\t`, `\\+`, `\?` | ✓ | | **Free Space** | whitespace and `# Comments` _(x modifier)_ | ✓ | | **Grouped Exps.** | | ⋱ | |   _**Assertions**_ | | ⋱ | |   _Lookahead_ | `(?=abc)` | ✓ | |   _Negative Lookahead_ | `(?!abc)` | ✓ | |   _Lookbehind_ | `(?<=abc)` | ✓ | |   _Negative Lookbehind_ | `(?abc)` | ✓ | |   _**Absence**_ | `(?~abc)` | ✓ | |   _**Back-references**_ | | ⋱ | |   _Named_ | `\k` | ✓ | |   _Nest Level_ | `\k` | ✓ | |   _Numbered_ | `\k<1>` | ✓ | |   _Relative_ | `\k<-2>` | ✓ | |   _Traditional_ | `\1` through `\9` | ✓ | |   _**Capturing**_ | `(abc)` | ✓ | |   _**Comments**_ | `(?# comment text)` | ✓ | |   _**Named**_ | `(?abc)`, `(?'name'abc)` | ✓ | |   _**Options**_ | `(?mi-x:abc)`, `(?a:\s\w+)`, `(?i)` | ✓ | |   _**Passive**_ | `(?:abc)` | ✓ | |   _**Subexp. 
Calls**_ | `\g`, `\g<1>` | ✓ | | **Keep** | `\K`, `(ab\Kc\|d\Ke)f` | ✓ | | **Literals** _(utf-8)_ | `Ruby`, `ルビー`, `روبي` | ✓ | | **POSIX Classes** | `[:alpha:]`, `[:^digit:]` | ✓ | | **Quantifiers** | | ⋱ | |   _**Greedy**_ | `?`, `*`, `+`, `{m,M}` | ✓ | |   _**Reluctant** (Lazy)_ | `??`, `*?`, `+?` \[1\] | ✓ | |   _**Possessive**_ | `?+`, `*+`, `++` \[1\] | ✓ | | **String Escapes** | | ⋱ | |   _**Control** \[2\]_ | `\C-C`, `\cD` | ✓ | |   _**Hex**_ | `\x20`, `\x{701230}` | ✓ | |   _**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ | |   _**Octal**_ | `\0`, `\01`, `\012` | ✓ | |   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ | | **Unicode Properties** | _([Unicode 13.0.0])_ | ⋱ | |   _**Age**_ | `\p{Age=5.2}`, `\P{age=7.0}`, `\p{^age=8.0}` | ✓ | |   _**Blocks**_ | `\p{InArmenian}`, `\P{InKhmer}`, `\p{^InThai}` | ✓ | |   _**Classes**_ | `\p{Alpha}`, `\P{Space}`, `\p{^Alnum}` | ✓ | |   _**Derived**_ | `\p{Math}`, `\P{Lowercase}`, `\p{^Cased}` | ✓ | |   _**General Categories**_ | `\p{Lu}`, `\P{Cs}`, `\p{^sc}` | ✓ | |   _**Scripts**_ | `\p{Arabic}`, `\P{Hiragana}`, `\p{^Greek}` | ✓ | |   _**Simple**_ | `\p{Dash}`, `\p{Extender}`, `\p{^Hyphen}` | ✓ | [Unicode 13.0.0]: https://www.unicode.org/versions/Unicode13.0.0/ **\[1\]**: Ruby does not support lazy or possessive interval quantifiers. Any `+` or `?` that follows an interval quantifier will be treated as another, chained quantifier. See also [#3](https://github.com/ammar/regexp_parser/issue/3), [#69](https://github.com/ammar/regexp_parser/pull/69). **\[2\]**: As of Ruby 3.1, meta and control sequences are [pre-processed to hex escapes when used in Regexp literals](https://github.com/ruby/ruby/commit/11ae581), so they will only reach the scanner and will only be emitted if a String or a Regexp that has been built with the `::new` constructor is scanned. ##### Inapplicable Features Some modifiers, like `o` and `s`, apply to the **Regexp** object itself and do not appear in its source. 
Other such modifiers include the encoding modifiers `e` and `n` [See](http://www.ruby-doc.org/core-2.5.0/Regexp.html#class-Regexp-label-Encoding). These are not seen by the scanner. The following features are not currently enabled for Ruby by its regular expressions library (Onigmo). They are not supported by the scanner. - **Quotes**: `\Q...\E` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L499)_ - **Capture History**: `(?@...)`, `(?@<name>...)` _[[See]](https://github.com/k-takata/Onigmo/blob/7911409/doc/RE#L550)_ See something missing? Please submit an [issue](https://github.com/ammar/regexp_parser/issues) _**Note**: Attempting to process expressions with unsupported syntax features can raise an error, or incorrectly return tokens/objects as literals._ ## Testing To run the tests simply run rake from the root directory. The default task generates the scanner's code from the Ragel source files and runs all the specs, thus it requires Ragel to be installed. Note that changes to Ragel files will not be reflected when running `rspec` on its own, so to run individual tests you might want to run: ``` rake ragel:rb && rspec spec/scanner/properties_spec.rb ``` ## Building Building the scanner and the gem requires [Ragel](http://www.colm.net/open-source/ragel/) to be installed. The build tasks will automatically invoke the 'ragel:rb' task to generate the Ruby scanner code. The project uses the standard rubygems package tasks, so: To build the gem, run: ``` rake build ``` To install the gem from the cloned project, run: ``` rake install ``` ## Example Projects Projects using regexp_parser. - [capybara](https://github.com/teamcapybara/capybara) is an integration testing tool that uses regexp_parser to convert Regexps to css/xpath selectors. - [js_regex](https://github.com/jaynetics/js_regex) converts Ruby regular expressions to JavaScript-compatible regular expressions.
- [meta_re](https://github.com/ammar/meta_re) is a regular expression preprocessor with alias support. - [mutant](https://github.com/mbj/mutant) manipulates your regular expressions (amongst others) to see if your tests cover their behavior. - [repper](https://github.com/jaynetics/repper) is a regular expression pretty-printer and formatter for Ruby. - [rubocop](https://github.com/rubocop-hq/rubocop) is a linter for Ruby that uses regexp_parser to lint Regexps. - [twitter-cldr-rb](https://github.com/twitter/twitter-cldr-rb) is a localization helper that uses regexp_parser to generate examples of postal codes. ## References Documentation and books used while working on this project. #### Ruby Flavors * Oniguruma Regular Expressions (Ruby 1.9.x) [link](https://github.com/kkos/oniguruma/blob/master/doc/RE) * Onigmo Regular Expressions (Ruby >= 2.0) [link](https://github.com/k-takata/Onigmo/blob/master/doc/RE) #### Regular Expressions * Mastering Regular Expressions, By Jeffrey E.F. Friedl (2nd Edition) [book](http://oreilly.com/catalog/9781565922570/) * Regular Expression Flavor Comparison [link](http://www.regular-expressions.info/refflavors.html) * Enumerating the strings of regular languages [link](http://www.cs.dartmouth.edu/~doug/nfa.ps.gz) * Stack Overflow Regular Expressions FAQ [link](http://stackoverflow.com/questions/22937618/reference-what-does-this-regex-mean/22944075#22944075) #### Unicode * Unicode Explained, By Jukka K. Korpela. [book](http://oreilly.com/catalog/9780596101213) * Unicode Derived Properties [link](http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt) * Unicode Property Aliases [link](http://www.unicode.org/Public/UNIDATA/PropertyAliases.txt) * Unicode Regular Expressions [link](http://www.unicode.org/reports/tr18/) * Unicode Standard Annex #44 [link](http://www.unicode.org/reports/tr44/) --- ##### Copyright _Copyright (c) 2010-2022 Ammar Ali. 
See LICENSE file for details._ ammar-regexp_parser-0494e56/Rakefile000066400000000000000000000010601433525313500174030ustar00rootroot00000000000000require 'bundler' require 'rubygems' require 'rubygems/package_task' require 'rake' require 'rake/testtask' require 'rspec/core/rake_task' Dir['tasks/**/*.rake'].each { |file| load(file) } Bundler::GemHelper.install_tasks RSpec::Core::RakeTask.new(:spec) task :default => [:'test:full'] namespace :test do task full: [:'ragel:rb', :spec] end # Add ragel task as a prerequisite for building the gem to ensure that the # latest scanner code is generated and included in the build. desc "Runs ragel:rb before building the gem" task :build => ['ragel:rb'] ammar-regexp_parser-0494e56/bin/000077500000000000000000000000001433525313500165115ustar00rootroot00000000000000ammar-regexp_parser-0494e56/bin/console000077500000000000000000000005531433525313500201040ustar00rootroot00000000000000#!/usr/bin/env ruby require 'bundler/setup' require 'regexp_parser' require 'regexp_property_values' RL = Regexp::Lexer RP = Regexp::Parser RS = Regexp::Scanner PV = RegexpPropertyValues def lex(...); Regexp::Lexer.lex(...) end def parse(...); Regexp::Parser.parse(...) end def scan(...); Regexp::Scanner.scan(...) 
end require 'irb' IRB.start(__FILE__) ammar-regexp_parser-0494e56/bin/setup000077500000000000000000000005231433525313500175770ustar00rootroot00000000000000#!/bin/sh set -euo pipefail # install gems bundle # install ragel if [[ $(command -v ragel) == "" ]]; then if [[ $(command -v brew) != "" ]]; then brew install ragel elif [[ $(command -v apt-get) != "" ]]; then sudo apt-get install -y ragel else echo "could not install ragel, please do so manually" exit 1 fi fi ammar-regexp_parser-0494e56/lib/000077500000000000000000000000001433525313500165075ustar00rootroot00000000000000ammar-regexp_parser-0494e56/lib/regexp_parser.rb000066400000000000000000000002721433525313500217030ustar00rootroot00000000000000require 'regexp_parser/version' require 'regexp_parser/token' require 'regexp_parser/scanner' require 'regexp_parser/syntax' require 'regexp_parser/lexer' require 'regexp_parser/parser' ammar-regexp_parser-0494e56/lib/regexp_parser/000077500000000000000000000000001433525313500213555ustar00rootroot00000000000000ammar-regexp_parser-0494e56/lib/regexp_parser/error.rb000066400000000000000000000001471433525313500230350ustar00rootroot00000000000000class Regexp::Parser # base class for all gem-specific errors class Error < StandardError; end end ammar-regexp_parser-0494e56/lib/regexp_parser/expression.rb000066400000000000000000000031171433525313500241030ustar00rootroot00000000000000require 'regexp_parser/error' require 'regexp_parser/expression/shared' require 'regexp_parser/expression/base' require 'regexp_parser/expression/quantifier' require 'regexp_parser/expression/subexpression' require 'regexp_parser/expression/sequence' require 'regexp_parser/expression/sequence_operation' require 'regexp_parser/expression/classes/alternation' require 'regexp_parser/expression/classes/anchor' require 'regexp_parser/expression/classes/backreference' require 'regexp_parser/expression/classes/character_set' require 'regexp_parser/expression/classes/character_set/intersection' require 
'regexp_parser/expression/classes/character_set/range' require 'regexp_parser/expression/classes/character_type' require 'regexp_parser/expression/classes/conditional' require 'regexp_parser/expression/classes/escape_sequence' require 'regexp_parser/expression/classes/free_space' require 'regexp_parser/expression/classes/group' require 'regexp_parser/expression/classes/keep' require 'regexp_parser/expression/classes/literal' require 'regexp_parser/expression/classes/posix_class' require 'regexp_parser/expression/classes/root' require 'regexp_parser/expression/classes/unicode_property' require 'regexp_parser/expression/methods/construct' require 'regexp_parser/expression/methods/human_name' require 'regexp_parser/expression/methods/match' require 'regexp_parser/expression/methods/match_length' require 'regexp_parser/expression/methods/options' require 'regexp_parser/expression/methods/strfregexp' require 'regexp_parser/expression/methods/tests' require 'regexp_parser/expression/methods/traverse' ammar-regexp_parser-0494e56/lib/regexp_parser/expression/000077500000000000000000000000001433525313500235545ustar00rootroot00000000000000ammar-regexp_parser-0494e56/lib/regexp_parser/expression/base.rb000066400000000000000000000042111433525313500250110ustar00rootroot00000000000000module Regexp::Expression class Base include Regexp::Expression::Shared def initialize(token, options = {}) init_from_token_and_options(token, options) end def initialize_copy(orig) self.text = orig.text.dup if orig.text self.options = orig.options.dup if orig.options self.quantifier = orig.quantifier.clone if orig.quantifier super end def to_re(format = :full) if set_level > 0 warn "Calling #to_re on character set members is deprecated - "\ "their behavior might not be equivalent outside of the set." end ::Regexp.new(to_s(format)) end def quantify(*args) self.quantifier = Quantifier.new(*args) end def unquantified_clone clone.tap { |exp| exp.quantifier = nil } end # Deprecated. 
Prefer `#repetitions` which has a more uniform interface. def quantity return [nil,nil] unless quantified? [quantifier.min, quantifier.max] end def repetitions @repetitions ||= if quantified? min = quantifier.min max = quantifier.max < 0 ? Float::INFINITY : quantifier.max range = min..max # fix Range#minmax on old Rubies - https://bugs.ruby-lang.org/issues/15807 if RUBY_VERSION.to_f < 2.7 range.define_singleton_method(:minmax) { [min, max] } end range else 1..1 end end def greedy? quantified? and quantifier.greedy? end def reluctant? quantified? and quantifier.reluctant? end alias :lazy? :reluctant? def possessive? quantified? and quantifier.possessive? end def to_h { type: type, token: token, text: to_s(:base), starts_at: ts, length: full_length, level: level, set_level: set_level, conditional_level: conditional_level, options: options, quantifier: quantified? ? quantifier.to_h : nil, } end alias :attributes :to_h end end ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/000077500000000000000000000000001433525313500252115ustar00rootroot00000000000000ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/alternation.rb000066400000000000000000000004471433525313500300630ustar00rootroot00000000000000module Regexp::Expression # A sequence of expressions, used by Alternation as one of its alternative. 
class Alternative < Regexp::Expression::Sequence; end class Alternation < Regexp::Expression::SequenceOperation OPERAND = Alternative alias :alternatives :expressions end end ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/anchor.rb000066400000000000000000000013511433525313500270100ustar00rootroot00000000000000module Regexp::Expression module Anchor class Base < Regexp::Expression::Base; end class BeginningOfLine < Anchor::Base; end class EndOfLine < Anchor::Base; end class BeginningOfString < Anchor::Base; end class EndOfString < Anchor::Base; end class EndOfStringOrBeforeEndOfLine < Anchor::Base; end class WordBoundary < Anchor::Base; end class NonWordBoundary < Anchor::Base; end class MatchStart < Anchor::Base; end BOL = BeginningOfLine EOL = EndOfLine BOS = BeginningOfString EOS = EndOfString EOSobEOL = EndOfStringOrBeforeEndOfLine end end ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/backreference.rb000066400000000000000000000031671433525313500303240ustar00rootroot00000000000000module Regexp::Expression # TODO: unify name with token :backref, one way or the other, in v3.0.0 module Backreference class Base < Regexp::Expression::Base attr_accessor :referenced_expression def initialize_copy(orig) self.referenced_expression = orig.referenced_expression.dup super end end class Number < Backreference::Base attr_reader :number alias reference number def initialize(token, options = {}) @number = token.text[token.token.equal?(:number) ? 
1..-1 : 3..-2].to_i super end end class Name < Backreference::Base attr_reader :name alias reference name def initialize(token, options = {}) @name = token.text[3..-2] super end end class NumberRelative < Backreference::Number attr_accessor :effective_number alias reference effective_number end class NumberCall < Backreference::Number; end class NameCall < Backreference::Name; end class NumberCallRelative < Backreference::NumberRelative; end class NumberRecursionLevel < Backreference::NumberRelative attr_reader :recursion_level def initialize(token, options = {}) super @number, @recursion_level = token.text[3..-2].split(/(?=[+-])/).map(&:to_i) end end class NameRecursionLevel < Backreference::Name attr_reader :recursion_level def initialize(token, options = {}) super @name, recursion_level = token.text[3..-2].split(/(?=[+-])/) @recursion_level = recursion_level.to_i end end end end ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/character_set.rb000066400000000000000000000010401433525313500303400ustar00rootroot00000000000000module Regexp::Expression class CharacterSet < Regexp::Expression::Subexpression attr_accessor :closed, :negative alias :negative? :negative alias :negated? :negative alias :closed? 
:closed def initialize(token, options = {}) self.negative = false self.closed = false super end def negate self.negative = true end def close self.closed = true end def parts ["#{text}#{'^' if negated?}", *expressions, ']'] end end end # module Regexp::Expression ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/character_set/000077500000000000000000000000001433525313500300205ustar00rootroot00000000000000ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/character_set/intersection.rb000066400000000000000000000004131433525313500330510ustar00rootroot00000000000000module Regexp::Expression class CharacterSet < Regexp::Expression::Subexpression class IntersectedSequence < Regexp::Expression::Sequence; end class Intersection < Regexp::Expression::SequenceOperation OPERAND = IntersectedSequence end end end ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/character_set/range.rb000066400000000000000000000010051433525313500314350ustar00rootroot00000000000000module Regexp::Expression class CharacterSet < Regexp::Expression::Subexpression class Range < Regexp::Expression::Subexpression def starts_at expressions.first.starts_at end alias :ts :starts_at def <<(exp) complete? and raise Regexp::Parser::Error, "Can't add more than 2 expressions to a Range" super end def complete? 
count == 2 end def parts intersperse(expressions, text.dup) end end end end ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/character_type.rb000066400000000000000000000012751433525313500305400ustar00rootroot00000000000000module Regexp::Expression module CharacterType class Base < Regexp::Expression::Base; end class Any < CharacterType::Base; end class Digit < CharacterType::Base; end class NonDigit < CharacterType::Base; end class Hex < CharacterType::Base; end class NonHex < CharacterType::Base; end class Word < CharacterType::Base; end class NonWord < CharacterType::Base; end class Space < CharacterType::Base; end class NonSpace < CharacterType::Base; end class Linebreak < CharacterType::Base; end class ExtendedGrapheme < CharacterType::Base; end end end ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/conditional.rb000066400000000000000000000032721433525313500300450ustar00rootroot00000000000000module Regexp::Expression module Conditional class TooManyBranches < Regexp::Parser::Error def initialize super('The conditional expression has more than 2 branches') end end class Condition < Regexp::Expression::Base attr_accessor :referenced_expression # Name or number of the referenced capturing group that determines state. # Returns a String if reference is by name, Integer if by number. def reference ref = text.tr("'<>()", "") ref =~ /\D/ ? 
ref : Integer(ref) end def initialize_copy(orig) self.referenced_expression = orig.referenced_expression.dup super end end class Branch < Regexp::Expression::Sequence; end class Expression < Regexp::Expression::Subexpression attr_accessor :referenced_expression def <<(exp) expressions.last << exp end def add_sequence(active_opts = {}) raise TooManyBranches.new if branches.length == 2 params = { conditional_level: conditional_level + 1 } Branch.add_to(self, params, active_opts) end alias :branch :add_sequence def condition=(exp) expressions.delete(condition) expressions.unshift(exp) end def condition find { |subexp| subexp.is_a?(Condition) } end def branches select { |subexp| subexp.is_a?(Sequence) } end def reference condition.reference end def parts [text.dup, condition, *intersperse(branches, '|'), ')'] end def initialize_copy(orig) self.referenced_expression = orig.referenced_expression.dup super end end end end ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/escape_sequence.rb000066400000000000000000000046471433525313500307010ustar00rootroot00000000000000module Regexp::Expression # TODO: unify naming with Token::Escape, one way or the other, in v3.0.0 module EscapeSequence class Base < Regexp::Expression::Base def codepoint char.ord end if ''.respond_to?(:undump) def char %("#{text}").undump end else # poor man's unescape without using eval require 'yaml' def char YAML.load(%Q(---\n"#{text}"\n)) end end end class Literal < EscapeSequence::Base def char text[1..-1] end end class AsciiEscape < EscapeSequence::Base; end class Backspace < EscapeSequence::Base; end class Bell < EscapeSequence::Base; end class FormFeed < EscapeSequence::Base; end class Newline < EscapeSequence::Base; end class Return < EscapeSequence::Base; end class Tab < EscapeSequence::Base; end class VerticalTab < EscapeSequence::Base; end class Hex < EscapeSequence::Base; end class Codepoint < EscapeSequence::Base; end class CodepointList < EscapeSequence::Base def char raise 
NoMethodError, 'CodepointList responds only to #chars' end def codepoint raise NoMethodError, 'CodepointList responds only to #codepoints' end def chars codepoints.map { |cp| cp.chr('utf-8') } end def codepoints text.scan(/\h+/).map(&:hex) end end class Octal < EscapeSequence::Base def char text[1..-1].to_i(8).chr('utf-8') end end class AbstractMetaControlSequence < EscapeSequence::Base def char codepoint.chr('utf-8') end private def control_sequence_to_s(control_sequence) five_lsb = control_sequence.unpack('B*').first[-5..-1] ["000#{five_lsb}"].pack('B*') end def meta_char_to_codepoint(meta_char) byte_value = meta_char.ord byte_value < 128 ? byte_value + 128 : byte_value end end class Control < AbstractMetaControlSequence def codepoint control_sequence_to_s(text).ord end end class Meta < AbstractMetaControlSequence def codepoint meta_char_to_codepoint(text[-1]) end end class MetaControl < AbstractMetaControlSequence def codepoint meta_char_to_codepoint(control_sequence_to_s(text)) end end end end ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/free_space.rb000066400000000000000000000005361433525313500276360ustar00rootroot00000000000000module Regexp::Expression class FreeSpace < Regexp::Expression::Base def quantify(*_args) raise Regexp::Parser::Error, 'Can not quantify a free space object' end end class Comment < Regexp::Expression::FreeSpace; end class WhiteSpace < Regexp::Expression::FreeSpace def merge(exp) text << exp.text end end end ammar-regexp_parser-0494e56/lib/regexp_parser/expression/classes/group.rb000066400000000000000000000037071433525313500267010ustar00rootroot00000000000000module Regexp::Expression module Group class Base < Regexp::Expression::Subexpression def parts [text.dup, *expressions, ')'] end def capturing?; false end def comment?; false end end class Passive < Group::Base attr_writer :implicit def initialize(*) @implicit = false super end def parts if implicit? expressions else super end end def implicit? 
# Options group. Carries the option changes it introduces in
# #option_changes.
class Options < Group::Base
  attr_accessor :option_changes

  # Gives the copy its own duplicate of the option changes.
  def initialize_copy(orig)
    self.option_changes = orig.option_changes.dup
    super
  end

  # Option switches (token :options_switch) can not be quantified;
  # everything else is handled by the superclass.
  def quantify(*args)
    raise Regexp::Parser::Error, 'Can not quantify an option switch' if token == :options_switch

    super
  end
end
# Deprecated constructor. Prints a deprecation warning and delegates to
# `.construct(options: options)`.
def self.build(options = {})
  # Interpolate the receiver itself. Inside a class method `self` already is
  # the class, so `self.class` would always be rendered as "Class".
  warn "`#{self}.build(options)` is deprecated and will raise in "\
       "regexp_parser v3.0.0. Please use `.construct(options: options)`."
  construct(options: options)
end
# Convenience method to init a valid Expression without a Regexp::Token.
#
# +params+ are merged over #construct_defaults. The :options entry is passed
# on as the expression's options, the Regexp::TOKEN_KEYS entries are used to
# build the token, and any leftover key raises an ArgumentError.
def construct(params = {})
  remaining = construct_defaults.merge(params)
  options = remaining.delete(:options)

  token_values = Regexp::TOKEN_KEYS.map { |key| remaining.delete(key) }
  token = Regexp::Token.new(*token_values)

  unless remaining.empty?
    raise ArgumentError, "unsupported attribute(s): #{remaining}"
  end

  new(token, options)
end
# Default implementation, e.g. "atomic group", "hex escape", "word type", ..
# Joins token and type (skipping nil values) and turns underscores into
# spaces.
def human_name
  words = [token, type].reject(&:nil?)
  words.join(' ').tr('_', ' ')
end
# Describes the lengths of the strings an expression can match.
#
# The minimum and maximum are the expression's repetition bounds multiplied
# by the base bounds given on construction. Whether a particular length is
# possible is checked by matching a string of that many "X" characters
# against a regexp built from #to_re.
class Regexp::MatchLength
  include Enumerable

  # Returns the match length of +obj+. Anything that is not already an
  # expression is run through Regexp::Parser.parse first.
  def self.of(obj)
    exp = obj
    exp = Regexp::Parser.parse(obj) unless obj.is_a?(Regexp::Expression::Base)
    exp.match_length
  end

  # opts must contain either :base (a fixed base length), or all of
  # :base_min, :base_max and :reify (a callable returning regexp source).
  def initialize(exp, opts = {})
    repetitions = exp.repetitions
    self.exp_class = exp.class
    self.min_rep = repetitions.min
    self.max_rep = repetitions.max

    fixed_base = opts[:base]
    if fixed_base
      self.base_min = self.base_max = fixed_base
      self.reify = -> { '.' * fixed_base }
    else
      self.base_min = opts.fetch(:base_min)
      self.base_max = opts.fetch(:base_max)
      self.reify = opts.fetch(:reify)
    end
  end

  # Yields each possible length in ascending order, stopping after
  # opts[:limit] lengths (1000 unless given).
  def each(opts = {})
    return enum_for(__method__, opts) unless block_given?

    limit = opts[:limit] || 1000
    count = 0
    (min..max).each do |length|
      next unless include?(length)

      yield(length)
      count += 1
      break if count >= limit
    end
  end

  # Like #each, but without a limit.
  def endless_each
    return enum_for(__method__) unless block_given?

    (min..max).each do |length|
      yield(length) if include?(length)
    end
  end

  def include?(length)
    test_regexp.match?('X' * length)
  end

  def fixed?
    min == max
  end

  def min
    min_rep * base_min
  end

  def max
    max_rep * base_max
  end

  def minmax
    [min, max]
  end

  def inspect
    short_name = exp_class.name.sub('Regexp::Expression::', '')
    format('#<%s<%s> min=%s max=%s>', self.class, short_name, min, max)
  end

  # Regexp source matching any string of a possible length. The upper
  # repetition bound is left open when it is infinite.
  def to_re
    upper_bound = max_rep == Float::INFINITY ? '' : max_rep
    "(?:#{reify.call}){#{min_rep},#{upper_bound}}"
  end

  private

  attr_accessor :base_min, :base_max, :min_rep, :max_rep, :exp_class, :reify

  # Memoized regexp used by #include?. It gets a singleton #match? when
  # the regexp does not respond to one.
  def test_regexp
    @test_regexp ||= begin
      regexp = Regexp.new("^#{to_re}$")
      unless regexp.respond_to?(:match?)
        def regexp.match?(str); !!match(str) end
      end
      regexp
    end
  end
end
# Option predicates. Each one is true only if the corresponding key in
# #options is set to exactly `true`.
class Base
  # :m option
  def multiline?; options[:m] == true end
  alias_method :m?, :multiline?

  # :i option
  def case_insensitive?; options[:i] == true end
  alias_method :i?, :case_insensitive?
  alias_method :ignore_case?, :case_insensitive?

  # :x option
  def free_spacing?; options[:x] == true end
  alias_method :x?, :free_spacing?
  alias_method :extended?, :free_spacing?

  # :d option
  def default_classes?; options[:d] == true end
  alias_method :d?, :default_classes?

  # :a option
  def ascii_classes?; options[:a] == true end
  alias_method :a?, :ascii_classes?

  # :u option
  def unicode_classes?; options[:u] == true end
  alias_method :u?, :unicode_classes?
end
# Renders this expression according to +format+, replacing the %-fields
# listed below with the corresponding values.
#
#   %l, %>, %x          level, indentation, index (index only if given)
#   %s, %e, %S, %o      start, end, length, coded offset
#   %y, %k, %i, %c      type, token, '%y:%k', class name
#   %q, %Q, %z, %Z      quantifier info, text, min, max
#   %t, %~t, %T         base text, text or type:token, full text
#   %b, %m, %a          '%o %i', '%b %q', '%m %t'
#
# Substituted values are inserted verbatim: backslashes in the expression
# text are not treated as replacement escapes, and a value that happens to
# contain something like "%T" is not expanded a second time.
def strfregexp(format = '%a', indent_offset = 0, index = nil)
  have_index = index ? true : false
  print_level = nesting_level > 0 ? nesting_level - 1 : nil

  part = {}

  # NOTE(review): the indentation unit is reproduced as it appears in the
  # reviewed text, in which runs of whitespace were collapsed. Confirm the
  # intended width against the pristine file.
  part['>'] = print_level ? (' ' * (print_level + indent_offset)) : ''

  part['l'] = print_level ? "#{'%d' % print_level}" : 'root'
  part['x'] = have_index ? "#{'%d' % index}" : ''

  part['s'] = starts_at
  part['S'] = full_length
  part['e'] = starts_at + full_length
  part['o'] = coded_offset

  part['k'] = token
  part['y'] = type
  part['c'] = self.class.name

  if quantified?
    if quantifier.max == -1
      part['q'] = "{#{quantifier.min}, or-more}"
    else
      part['q'] = "{#{quantifier.min}, #{quantifier.max}}"
    end

    part['Q'] = quantifier.text
    part['z'] = quantifier.min
    part['Z'] = quantifier.max
  else
    part['q'] = '{1}'
    part['Q'] = ''
    part['z'] = '1'
    part['Z'] = '1'
  end

  part['t'] = to_s(:base)
  part['~t'] = terminal? ? to_s : "#{type}:#{token}"
  part['T'] = to_s(:full)

  # Composite fields are templates built from other fields. Order is
  # important: a template must be expanded before the fields it uses.
  templates = [['a', '%m %t'], ['m', '%b %q'], ['b', '%o %i'], ['i', '%y:%k']]

  out = format.dup
  templates.each do |key, template|
    out.gsub!(/%#{key}/) { template }
  end

  # All value fields are substituted in one pass. The block form of gsub
  # inserts each value as-is, and a single pass never rescans inserted text.
  out.gsub(/%(?:~t|[olxseSykcqQzZtT>])/) { |field| part[field[1..-1]].to_s }
end
# Test if this expression has the given test_token, and optionally a given
# test_type.
#
#   exp.is? :*                        # always returns true
#   exp.is? :capture                  # token is :capture
#   exp.is? :dot, :meta               # token is :dot and type is :meta
#   exp.is? :dot, [:meta, :escape]    # token is :dot, type is one of these
#
def is?(test_token, test_type = nil)
  return true if test_token === :*
  return false unless token == test_token
  return true unless test_type

  type?(test_type)
end
# Test if this expression matches an entry in the given scope spec.
#
# - Array:  true if it includes :* or the expression's token.
# - Hash:   the value stored under the expression's type (or, if the type
#           is not a key, under :*) is evaluated by calling one_of? again
#           with top = false. False if neither key exists.
# - Symbol: :* always matches. Otherwise it is checked against the type
#           when top is true, and against the token when top is false.
#
# Anything else raises an ArgumentError.
#
#   exp.one_of?(:*)                               # always true
#   exp.one_of?(:group)                           # like exp.type?(:group)
#   exp.one_of?(:meta => :*)                      # any meta expression
#   exp.one_of?(:meta => [:dot, :alternation])    # meta dots, alternations
#   exp.one_of?({meta: [:dot], set: :*})          # meta dots, any set token
#
def one_of?(scope, top = true)
  if scope.is_a?(Array)
    scope.include?(:*) || scope.include?(token)
  elsif scope.is_a?(Hash)
    key = scope.has_key?(type) ? type : :*
    scope.has_key?(key) && one_of?(scope[key], false)
  elsif scope.is_a?(Symbol)
    return true if scope.equal?(:*)

    top ? type?(scope) : is?(scope)
  else
    raise ArgumentError,
          "Array, Hash, or Symbol expected, #{scope.class.name} given"
  end
end
# Returns a new array with the results of calling the given block once
# for every expression yielded by #each_expression. Without a block, the
# array holds [expression, index] pairs instead.
def flat_map(include_self = false)
  collected = []

  each_expression(include_self) do |exp, index|
    collected << (block_given? ? yield(exp, index) : [exp, index])
  end

  collected
end
# Derives [min, max] from the token and text, with -1 standing for "no
# upper bound". Tokens containing zero_or_one, zero_or_more or one_or_more
# have fixed bounds; for :interval the bounds are read from the text, where
# a missing maximum (as in "{3,}") yields -1.
# Returns nil for any other token.
def minmax
  token_name = token.to_s
  return [0, 1]  if token_name.include?('zero_or_one')
  return [0, -1] if token_name.include?('zero_or_more')
  return [1, -1] if token_name.include?('one_or_more')
  return unless token == :interval

  lower = text[/\{(\d*)/, 1]
  upper = text[/,?(\d*)\}/, 1]
  [lower.to_i, upper.empty? ? -1 : upper.to_i]
end
# Returns the start offset of the sequence, which is that of its first
# child expression.
#
# A sequence without any children falls back to its own recorded start
# offset instead of raising a NoMethodError on nil. The instance variable is
# read directly because #ts is aliased to this method.
def starts_at
  first_child = expressions.first
  first_child ? first_child.starts_at : @ts
end
# Returns the quantifier as a string, or nil if the expression is not
# quantified or the :base format (text without quantifier) was requested.
def quantifier_affix(expression_format)
  return unless quantified?
  return if expression_format == :base

  quantifier.to_s
end
# Returns a new array with +separator+ placed between each pair of
# neighbouring elements of +expressions+.
def intersperse(expressions, separator)
  expressions.each_with_index.flat_map do |exp, position|
    position.zero? ? [exp] : [separator, exp]
  end
end
# Increases the nesting counter that belongs to the given token type:
# group and assertion tokens listed in OPENING_TOKENS raise the group
# nesting, while an :open token raises the set or conditional nesting.
# All other tokens leave the counters untouched.
def descend(type, token)
  if type == :group || type == :assertion
    self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
  elsif type == :set
    self.set_nesting = set_nesting + 1 if token == :open
  elsif type == :conditional
    self.conditional_nesting = conditional_nesting + 1 if token == :open
  end
end
# Removes the most recent token from the token list and returns a new
# :conditional / :condition token that joins its text with that of
# +current+. The new token starts where the removed token started, ends
# where +current+ ends, and carries the lexer's current nesting values.
def merge_condition(current)
  opener = tokens.pop
  merged_text = opener.text + current.text

  Regexp::Token.new(
    :conditional, :condition, merged_text,
    opener.ts, current.te,
    nesting, set_nesting, conditional_nesting
  )
end
block.call(root) else root end end private attr_accessor :root, :node, :nesting, :options_stack, :switching_options, :conditional_nesting, :captured_group_counts def extract_options(input, options) if options && !input.is_a?(String) raise ArgumentError, 'options cannot be supplied unless parsing a String' end options = input.options if input.is_a?(::Regexp) return {} unless options enabled_options = {} enabled_options[:i] = true if options & ::Regexp::IGNORECASE != 0 enabled_options[:m] = true if options & ::Regexp::MULTILINE != 0 enabled_options[:x] = true if options & ::Regexp::EXTENDED != 0 enabled_options end def parse_token(token) case token.type when :anchor; anchor(token) when :assertion, :group; group(token) when :backref; backref(token) when :conditional; conditional(token) when :escape; escape(token) when :free_space; free_space(token) when :keep; keep(token) when :literal; literal(token) when :meta; meta(token) when :posixclass, :nonposixclass; posixclass(token) when :property, :nonproperty; property(token) when :quantifier; quantifier(token) when :set; set(token) when :type; type(token) else raise UnknownTokenTypeError.new(token.type, token) end close_completed_character_set_range end def anchor(token) case token.token when :bol; node << Anchor::BeginningOfLine.new(token, active_opts) when :bos; node << Anchor::BOS.new(token, active_opts) when :eol; node << Anchor::EndOfLine.new(token, active_opts) when :eos; node << Anchor::EOS.new(token, active_opts) when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts) when :match_start; node << Anchor::MatchStart.new(token, active_opts) when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts) when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts) else raise UnknownTokenError.new('Anchor', token) end end def group(token) case token.token when :options, :options_switch options_group(token) when :close close_group when :comment node << Group::Comment.new(token, 
active_opts) else open_group(token) end end MOD_FLAGS = %w[i m x].map(&:to_sym) ENC_FLAGS = %w[a d u].map(&:to_sym) def options_group(token) positive, negative = token.text.split('-', 2) negative ||= '' self.switching_options = token.token.equal?(:options_switch) opt_changes = {} new_active_opts = active_opts.dup MOD_FLAGS.each do |flag| if positive.include?(flag.to_s) opt_changes[flag] = new_active_opts[flag] = true end if negative.include?(flag.to_s) opt_changes[flag] = false new_active_opts.delete(flag) end end if (enc_flag = positive.reverse[/[adu]/]) enc_flag = enc_flag.to_sym (ENC_FLAGS - [enc_flag]).each do |other| opt_changes[other] = false if new_active_opts[other] new_active_opts.delete(other) end opt_changes[enc_flag] = new_active_opts[enc_flag] = true end options_stack << new_active_opts options_group = Group::Options.new(token, active_opts) options_group.option_changes = opt_changes nest(options_group) end def open_group(token) group_class = case token.token when :absence; Group::Absence when :atomic; Group::Atomic when :capture; Group::Capture when :named; Group::Named when :passive; Group::Passive when :lookahead; Assertion::Lookahead when :lookbehind; Assertion::Lookbehind when :nlookahead; Assertion::NegativeLookahead when :nlookbehind; Assertion::NegativeLookbehind else raise UnknownTokenError.new('Group type open', token) end group = group_class.new(token, active_opts) if group.capturing? group.number = total_captured_group_count + 1 group.number_at_level = captured_group_count_at_level + 1 count_captured_group end # Push the active options to the stack again. This way we can simply pop the # stack for any group we close, no matter if it had its own options or not. 
options_stack << active_opts nest(group) end def total_captured_group_count captured_group_counts.values.reduce(0, :+) end def captured_group_count_at_level captured_group_counts[node] end def count_captured_group captured_group_counts[node] += 1 end def close_group options_stack.pop unless switching_options self.switching_options = false decrease_nesting end def decrease_nesting while nesting.last.is_a?(SequenceOperation) nesting.pop self.node = nesting.last end nesting.pop yield(node) if block_given? self.node = nesting.last self.node = node.last if node.last.is_a?(SequenceOperation) end def backref(token) case token.token when :name_ref node << Backreference::Name.new(token, active_opts) when :name_recursion_ref node << Backreference::NameRecursionLevel.new(token, active_opts) when :name_call node << Backreference::NameCall.new(token, active_opts) when :number, :number_ref node << Backreference::Number.new(token, active_opts) when :number_recursion_ref node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp| # TODO: should split off new token number_recursion_rel_ref and new # class NumberRelativeRecursionLevel in v3.0.0 to get rid of this if exp.text =~ /[<'][+-]/ assign_effective_number(exp) else exp.effective_number = exp.number end end when :number_call node << Backreference::NumberCall.new(token, active_opts) when :number_rel_ref node << Backreference::NumberRelative.new(token, active_opts).tap do |exp| assign_effective_number(exp) end when :number_rel_call node << Backreference::NumberCallRelative.new(token, active_opts).tap do |exp| assign_effective_number(exp) end else raise UnknownTokenError.new('Backreference', token) end end def assign_effective_number(exp) exp.effective_number = exp.number + total_captured_group_count + (exp.number < 0 ? 
1 : 0) exp.effective_number > 0 || raise(ParserError, "Invalid reference: #{exp.reference}") end def conditional(token) case token.token when :open nest_conditional(Conditional::Expression.new(token, active_opts)) when :condition conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts) conditional_nesting.last.add_sequence(active_opts) when :separator conditional_nesting.last.add_sequence(active_opts) self.node = conditional_nesting.last.branches.last when :close conditional_nesting.pop decrease_nesting self.node = if conditional_nesting.empty? nesting.last else conditional_nesting.last end else raise UnknownTokenError.new('Conditional', token) end end def nest_conditional(exp) conditional_nesting.push(exp) nest(exp) end def nest(exp) nesting.push(exp) node << exp self.node = exp end def escape(token) case token.token when :backspace; node << EscapeSequence::Backspace.new(token, active_opts) when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts) when :bell; node << EscapeSequence::Bell.new(token, active_opts) when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts) when :newline; node << EscapeSequence::Newline.new(token, active_opts) when :carriage; node << EscapeSequence::Return.new(token, active_opts) when :tab; node << EscapeSequence::Tab.new(token, active_opts) when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts) when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts) when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts) when :hex; node << EscapeSequence::Hex.new(token, active_opts) when :octal; node << EscapeSequence::Octal.new(token, active_opts) when :control if token.text =~ /\A(?:\\C-\\M|\\c\\M)/ node << EscapeSequence::MetaControl.new(token, active_opts) else node << EscapeSequence::Control.new(token, active_opts) end when :meta_sequence if token.text =~ /\A\\M-\\[Cc]/ node << EscapeSequence::MetaControl.new(token, 
active_opts) else node << EscapeSequence::Meta.new(token, active_opts) end else # treating everything else as a literal # TODO: maybe split this up a bit more in v3.0.0? # E.g. escaped quantifiers or set meta chars are not the same # as stuff that would be a literal even without the backslash. # Right now, they all end up here. node << EscapeSequence::Literal.new(token, active_opts) end end def free_space(token) case token.token when :comment node << Comment.new(token, active_opts) when :whitespace if node.last.is_a?(WhiteSpace) node.last.merge(WhiteSpace.new(token, active_opts)) else node << WhiteSpace.new(token, active_opts) end else raise UnknownTokenError.new('FreeSpace', token) end end def keep(token) node << Keep::Mark.new(token, active_opts) end def literal(token) node << Literal.new(token, active_opts) end def meta(token) case token.token when :dot node << CharacterType::Any.new(token, active_opts) when :alternation sequence_operation(Alternation, token) else raise UnknownTokenError.new('Meta', token) end end def sequence_operation(klass, token) unless node.is_a?(klass) operator = klass.new(token, active_opts) sequence = operator.add_sequence(active_opts) sequence.expressions = node.expressions node.expressions = [] nest(operator) end node.add_sequence(active_opts) end def posixclass(token) node << PosixClass.new(token, active_opts) end include Regexp::Expression::UnicodeProperty UPTokens = Regexp::Syntax::Token::UnicodeProperty def property(token) case token.token when :alnum; node << Alnum.new(token, active_opts) when :alpha; node << Alpha.new(token, active_opts) when :ascii; node << Ascii.new(token, active_opts) when :blank; node << Blank.new(token, active_opts) when :cntrl; node << Cntrl.new(token, active_opts) when :digit; node << Digit.new(token, active_opts) when :graph; node << Graph.new(token, active_opts) when :lower; node << Lower.new(token, active_opts) when :print; node << Print.new(token, active_opts) when :punct; node << Punct.new(token, 
active_opts) when :space; node << Space.new(token, active_opts) when :upper; node << Upper.new(token, active_opts) when :word; node << Word.new(token, active_opts) when :xdigit; node << Xdigit.new(token, active_opts) when :xposixpunct; node << XPosixPunct.new(token, active_opts) # only in Oniguruma (old rubies) when :newline; node << Newline.new(token, active_opts) when :any; node << Any.new(token, active_opts) when :assigned; node << Assigned.new(token, active_opts) when :letter; node << Letter::Any.new(token, active_opts) when :cased_letter; node << Letter::Cased.new(token, active_opts) when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts) when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts) when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts) when :modifier_letter; node << Letter::Modifier.new(token, active_opts) when :other_letter; node << Letter::Other.new(token, active_opts) when :mark; node << Mark::Any.new(token, active_opts) when :combining_mark; node << Mark::Combining.new(token, active_opts) when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts) when :spacing_mark; node << Mark::Spacing.new(token, active_opts) when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts) when :number; node << Number::Any.new(token, active_opts) when :decimal_number; node << Number::Decimal.new(token, active_opts) when :letter_number; node << Number::Letter.new(token, active_opts) when :other_number; node << Number::Other.new(token, active_opts) when :punctuation; node << Punctuation::Any.new(token, active_opts) when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts) when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts) when :open_punctuation; node << Punctuation::Open.new(token, active_opts) when :close_punctuation; node << Punctuation::Close.new(token, active_opts) when :initial_punctuation; node << Punctuation::Initial.new(token, 
active_opts) when :final_punctuation; node << Punctuation::Final.new(token, active_opts) when :other_punctuation; node << Punctuation::Other.new(token, active_opts) when :separator; node << Separator::Any.new(token, active_opts) when :space_separator; node << Separator::Space.new(token, active_opts) when :line_separator; node << Separator::Line.new(token, active_opts) when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts) when :symbol; node << Symbol::Any.new(token, active_opts) when :math_symbol; node << Symbol::Math.new(token, active_opts) when :currency_symbol; node << Symbol::Currency.new(token, active_opts) when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts) when :other_symbol; node << Symbol::Other.new(token, active_opts) when :other; node << Codepoint::Any.new(token, active_opts) when :control; node << Codepoint::Control.new(token, active_opts) when :format; node << Codepoint::Format.new(token, active_opts) when :surrogate; node << Codepoint::Surrogate.new(token, active_opts) when :private_use; node << Codepoint::PrivateUse.new(token, active_opts) when :unassigned; node << Codepoint::Unassigned.new(token, active_opts) when *UPTokens::Age; node << Age.new(token, active_opts) when *UPTokens::Derived; node << Derived.new(token, active_opts) when *UPTokens::Emoji; node << Emoji.new(token, active_opts) when *UPTokens::Script; node << Script.new(token, active_opts) when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts) else raise UnknownTokenError.new('UnicodeProperty', token) end end def quantifier(token) target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) } target_node or raise ParserError, "No valid target found for '#{token.text}'" # in case of chained quantifiers, wrap target in an implicit passive group # description of the problem: https://github.com/ammar/regexp_parser/issues/3 # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69 if 
target_node.quantified? new_group = Group::Passive.construct( token: :passive, ts: target_node.ts, level: target_node.level, set_level: target_node.set_level, conditional_level: target_node.conditional_level, options: active_opts, ) new_group.implicit = true new_group << target_node increase_group_level(target_node) node.expressions[node.expressions.index(target_node)] = new_group target_node = new_group end unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval) (?:_greedy|_reluctant|_possessive)?\z/x raise UnknownTokenError.new('Quantifier', token) end target_node.quantify(token, active_opts) end def increase_group_level(exp) exp.level += 1 exp.quantifier.level += 1 if exp.quantifier exp.terminal? || exp.each { |subexp| increase_group_level(subexp) } end def set(token) case token.token when :open; open_set(token) when :close; close_set when :negate; negate_set when :range; range(token) when :intersection; intersection(token) else raise UnknownTokenError.new('CharacterSet', token) end end def open_set(token) token.token = :character nest(CharacterSet.new(token, active_opts)) end def negate_set node.negate end def close_set decrease_nesting(&:close) end def range(token) exp = CharacterSet::Range.new(token, active_opts) scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? 
node.last : node exp << scope.expressions.pop nest(exp) end def intersection(token) sequence_operation(CharacterSet::Intersection, token) end def type(token) case token.token when :digit; node << CharacterType::Digit.new(token, active_opts) when :hex; node << CharacterType::Hex.new(token, active_opts) when :linebreak; node << CharacterType::Linebreak.new(token, active_opts) when :nondigit; node << CharacterType::NonDigit.new(token, active_opts) when :nonhex; node << CharacterType::NonHex.new(token, active_opts) when :nonspace; node << CharacterType::NonSpace.new(token, active_opts) when :nonword; node << CharacterType::NonWord.new(token, active_opts) when :space; node << CharacterType::Space.new(token, active_opts) when :word; node << CharacterType::Word.new(token, active_opts) when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts) else raise UnknownTokenError.new('CharacterType', token) end end def close_completed_character_set_range decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete? end def active_opts options_stack.last end # Assigns referenced expressions to refering expressions, e.g. if there is # an instance of Backreference::Number, its #referenced_expression is set to # the instance of Group::Capture that it refers to via its number. 
def assign_referenced_expressions # find all referencable expressions targets = { 0 => root } root.each_expression do |exp| exp.is_a?(Group::Capture) && targets[exp.identifier] = exp end # assign them to any refering expressions root.each_expression do |exp| next unless exp.respond_to?(:reference) exp.referenced_expression = targets[exp.reference] || raise(ParserError, "Invalid reference: #{exp.reference}") end end end # module Regexp::Parser ammar-regexp_parser-0494e56/lib/regexp_parser/scanner/000077500000000000000000000000001433525313500230065ustar00rootroot00000000000000ammar-regexp_parser-0494e56/lib/regexp_parser/scanner/char_type.rl000066400000000000000000000015651433525313500253320ustar00rootroot00000000000000%%{ machine re_char_type; single_codepoint_char_type = [dDhHsSwW]; multi_codepoint_char_type = [RX]; char_type_char = single_codepoint_char_type | multi_codepoint_char_type; # Char types scanner # -------------------------------------------------------------------------- char_type := |* char_type_char { case text = copy(data, ts-1, te) when '\d'; emit(:type, :digit, text) when '\D'; emit(:type, :nondigit, text) when '\h'; emit(:type, :hex, text) when '\H'; emit(:type, :nonhex, text) when '\s'; emit(:type, :space, text) when '\S'; emit(:type, :nonspace, text) when '\w'; emit(:type, :word, text) when '\W'; emit(:type, :nonword, text) when '\R'; emit(:type, :linebreak, text) when '\X'; emit(:type, :xgrapheme, text) end fret; }; *|; }%% ammar-regexp_parser-0494e56/lib/regexp_parser/scanner/properties/000077500000000000000000000000001433525313500252025ustar00rootroot00000000000000ammar-regexp_parser-0494e56/lib/regexp_parser/scanner/properties/long.csv000066400000000000000000000437711433525313500266720ustar00rootroot00000000000000# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT adlam,adlam age=1.1,age=1.1 age=10.0,age=10.0 age=11.0,age=11.0 age=12.0,age=12.0 age=12.1,age=12.1 age=13.0,age=13.0 age=14.0,age=14.0 age=2.0,age=2.0 
age=2.1,age=2.1 age=3.0,age=3.0 age=3.1,age=3.1 age=3.2,age=3.2 age=4.0,age=4.0 age=4.1,age=4.1 age=5.0,age=5.0 age=5.1,age=5.1 age=5.2,age=5.2 age=6.0,age=6.0 age=6.1,age=6.1 age=6.2,age=6.2 age=6.3,age=6.3 age=7.0,age=7.0 age=8.0,age=8.0 age=9.0,age=9.0 ahom,ahom alnum,alnum alpha,alpha alphabetic,alphabetic anatolianhieroglyphs,anatolian_hieroglyphs any,any arabic,arabic armenian,armenian ascii,ascii asciihexdigit,ascii_hex_digit assigned,assigned avestan,avestan balinese,balinese bamum,bamum bassavah,bassa_vah batak,batak bengali,bengali bhaiksuki,bhaiksuki bidicontrol,bidi_control blank,blank bopomofo,bopomofo brahmi,brahmi braille,braille buginese,buginese buhid,buhid canadianaboriginal,canadian_aboriginal carian,carian cased,cased casedletter,cased_letter caseignorable,case_ignorable caucasianalbanian,caucasian_albanian chakma,chakma cham,cham changeswhencasefolded,changes_when_casefolded changeswhencasemapped,changes_when_casemapped changeswhenlowercased,changes_when_lowercased changeswhentitlecased,changes_when_titlecased changeswhenuppercased,changes_when_uppercased cherokee,cherokee chorasmian,chorasmian closepunctuation,close_punctuation cntrl,cntrl common,common connectorpunctuation,connector_punctuation control,control coptic,coptic cuneiform,cuneiform currencysymbol,currency_symbol cypriot,cypriot cyprominoan,cypro_minoan cyrillic,cyrillic dash,dash dashpunctuation,dash_punctuation decimalnumber,decimal_number defaultignorablecodepoint,default_ignorable_code_point deprecated,deprecated deseret,deseret devanagari,devanagari diacritic,diacritic digit,digit divesakuru,dives_akuru dogra,dogra duployan,duployan egyptianhieroglyphs,egyptian_hieroglyphs elbasan,elbasan elymaic,elymaic emoji,emoji emojicomponent,emoji_component emojimodifier,emoji_modifier emojimodifierbase,emoji_modifier_base emojipresentation,emoji_presentation enclosingmark,enclosing_mark ethiopic,ethiopic extender,extender finalpunctuation,final_punctuation format,format 
georgian,georgian glagolitic,glagolitic gothic,gothic grantha,grantha graph,graph graphemebase,grapheme_base graphemeextend,grapheme_extend graphemelink,grapheme_link greek,greek gujarati,gujarati gunjalagondi,gunjala_gondi gurmukhi,gurmukhi han,han hangul,hangul hanifirohingya,hanifi_rohingya hanunoo,hanunoo hatran,hatran hebrew,hebrew hexdigit,hex_digit hiragana,hiragana hyphen,hyphen idcontinue,id_continue ideographic,ideographic idsbinaryoperator,ids_binary_operator idstart,id_start idstrinaryoperator,ids_trinary_operator imperialaramaic,imperial_aramaic inadlam,in_adlam inaegeannumbers,in_aegean_numbers inahom,in_ahom inalchemicalsymbols,in_alchemical_symbols inalphabeticpresentationforms,in_alphabetic_presentation_forms inanatolianhieroglyphs,in_anatolian_hieroglyphs inancientgreekmusicalnotation,in_ancient_greek_musical_notation inancientgreeknumbers,in_ancient_greek_numbers inancientsymbols,in_ancient_symbols inarabic,in_arabic inarabicextendeda,in_arabic_extended_a inarabicextendedb,in_arabic_extended_b inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols inarabicpresentationformsa,in_arabic_presentation_forms_a inarabicpresentationformsb,in_arabic_presentation_forms_b inarabicsupplement,in_arabic_supplement inarmenian,in_armenian inarrows,in_arrows inavestan,in_avestan inbalinese,in_balinese inbamum,in_bamum inbamumsupplement,in_bamum_supplement inbasiclatin,in_basic_latin inbassavah,in_bassa_vah inbatak,in_batak inbengali,in_bengali inbhaiksuki,in_bhaiksuki inblockelements,in_block_elements inbopomofo,in_bopomofo inbopomofoextended,in_bopomofo_extended inboxdrawing,in_box_drawing inbrahmi,in_brahmi inbraillepatterns,in_braille_patterns inbuginese,in_buginese inbuhid,in_buhid inbyzantinemusicalsymbols,in_byzantine_musical_symbols incarian,in_carian incaucasianalbanian,in_caucasian_albanian inchakma,in_chakma incham,in_cham incherokee,in_cherokee incherokeesupplement,in_cherokee_supplement inchesssymbols,in_chess_symbols 
inchorasmian,in_chorasmian incjkcompatibility,in_cjk_compatibility incjkcompatibilityforms,in_cjk_compatibility_forms incjkcompatibilityideographs,in_cjk_compatibility_ideographs incjkcompatibilityideographssupplement,in_cjk_compatibility_ideographs_supplement incjkradicalssupplement,in_cjk_radicals_supplement incjkstrokes,in_cjk_strokes incjksymbolsandpunctuation,in_cjk_symbols_and_punctuation incjkunifiedideographs,in_cjk_unified_ideographs incjkunifiedideographsextensiona,in_cjk_unified_ideographs_extension_a incjkunifiedideographsextensionb,in_cjk_unified_ideographs_extension_b incjkunifiedideographsextensionc,in_cjk_unified_ideographs_extension_c incjkunifiedideographsextensiond,in_cjk_unified_ideographs_extension_d incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g incombiningdiacriticalmarks,in_combining_diacritical_marks incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols incombiningdiacriticalmarkssupplement,in_combining_diacritical_marks_supplement incombininghalfmarks,in_combining_half_marks incommonindicnumberforms,in_common_indic_number_forms incontrolpictures,in_control_pictures incoptic,in_coptic incopticepactnumbers,in_coptic_epact_numbers incountingrodnumerals,in_counting_rod_numerals incuneiform,in_cuneiform incuneiformnumbersandpunctuation,in_cuneiform_numbers_and_punctuation incurrencysymbols,in_currency_symbols incypriotsyllabary,in_cypriot_syllabary incyprominoan,in_cypro_minoan incyrillic,in_cyrillic incyrillicextendeda,in_cyrillic_extended_a incyrillicextendedb,in_cyrillic_extended_b incyrillicextendedc,in_cyrillic_extended_c incyrillicsupplement,in_cyrillic_supplement indeseret,in_deseret indevanagari,in_devanagari indevanagariextended,in_devanagari_extended 
indingbats,in_dingbats indivesakuru,in_dives_akuru indogra,in_dogra indominotiles,in_domino_tiles induployan,in_duployan inearlydynasticcuneiform,in_early_dynastic_cuneiform inegyptianhieroglyphformatcontrols,in_egyptian_hieroglyph_format_controls inegyptianhieroglyphs,in_egyptian_hieroglyphs inelbasan,in_elbasan inelymaic,in_elymaic inemoticons,in_emoticons inenclosedalphanumerics,in_enclosed_alphanumerics inenclosedalphanumericsupplement,in_enclosed_alphanumeric_supplement inenclosedcjklettersandmonths,in_enclosed_cjk_letters_and_months inenclosedideographicsupplement,in_enclosed_ideographic_supplement inethiopic,in_ethiopic inethiopicextended,in_ethiopic_extended inethiopicextendeda,in_ethiopic_extended_a inethiopicextendedb,in_ethiopic_extended_b inethiopicsupplement,in_ethiopic_supplement ingeneralpunctuation,in_general_punctuation ingeometricshapes,in_geometric_shapes ingeometricshapesextended,in_geometric_shapes_extended ingeorgian,in_georgian ingeorgianextended,in_georgian_extended ingeorgiansupplement,in_georgian_supplement inglagolitic,in_glagolitic inglagoliticsupplement,in_glagolitic_supplement ingothic,in_gothic ingrantha,in_grantha ingreekandcoptic,in_greek_and_coptic ingreekextended,in_greek_extended ingujarati,in_gujarati ingunjalagondi,in_gunjala_gondi ingurmukhi,in_gurmukhi inhalfwidthandfullwidthforms,in_halfwidth_and_fullwidth_forms inhangulcompatibilityjamo,in_hangul_compatibility_jamo inhanguljamo,in_hangul_jamo inhanguljamoextendeda,in_hangul_jamo_extended_a inhanguljamoextendedb,in_hangul_jamo_extended_b inhangulsyllables,in_hangul_syllables inhanifirohingya,in_hanifi_rohingya inhanunoo,in_hanunoo inhatran,in_hatran inhebrew,in_hebrew inherited,inherited inhighprivateusesurrogates,in_high_private_use_surrogates inhighsurrogates,in_high_surrogates inhiragana,in_hiragana inideographicdescriptioncharacters,in_ideographic_description_characters inideographicsymbolsandpunctuation,in_ideographic_symbols_and_punctuation 
inimperialaramaic,in_imperial_aramaic inindicsiyaqnumbers,in_indic_siyaq_numbers ininscriptionalpahlavi,in_inscriptional_pahlavi ininscriptionalparthian,in_inscriptional_parthian inipaextensions,in_ipa_extensions initialpunctuation,initial_punctuation injavanese,in_javanese inkaithi,in_kaithi inkanaextendeda,in_kana_extended_a inkanaextendedb,in_kana_extended_b inkanasupplement,in_kana_supplement inkanbun,in_kanbun inkangxiradicals,in_kangxi_radicals inkannada,in_kannada inkatakana,in_katakana inkatakanaphoneticextensions,in_katakana_phonetic_extensions inkayahli,in_kayah_li inkharoshthi,in_kharoshthi inkhitansmallscript,in_khitan_small_script inkhmer,in_khmer inkhmersymbols,in_khmer_symbols inkhojki,in_khojki inkhudawadi,in_khudawadi inlao,in_lao inlatin1supplement,in_latin_1_supplement inlatinextendeda,in_latin_extended_a inlatinextendedadditional,in_latin_extended_additional inlatinextendedb,in_latin_extended_b inlatinextendedc,in_latin_extended_c inlatinextendedd,in_latin_extended_d inlatinextendede,in_latin_extended_e inlatinextendedf,in_latin_extended_f inlatinextendedg,in_latin_extended_g inlepcha,in_lepcha inletterlikesymbols,in_letterlike_symbols inlimbu,in_limbu inlineara,in_linear_a inlinearbideograms,in_linear_b_ideograms inlinearbsyllabary,in_linear_b_syllabary inlisu,in_lisu inlisusupplement,in_lisu_supplement inlowsurrogates,in_low_surrogates inlycian,in_lycian inlydian,in_lydian inmahajani,in_mahajani inmahjongtiles,in_mahjong_tiles inmakasar,in_makasar inmalayalam,in_malayalam inmandaic,in_mandaic inmanichaean,in_manichaean inmarchen,in_marchen inmasaramgondi,in_masaram_gondi inmathematicalalphanumericsymbols,in_mathematical_alphanumeric_symbols inmathematicaloperators,in_mathematical_operators inmayannumerals,in_mayan_numerals inmedefaidrin,in_medefaidrin inmeeteimayek,in_meetei_mayek inmeeteimayekextensions,in_meetei_mayek_extensions inmendekikakui,in_mende_kikakui inmeroiticcursive,in_meroitic_cursive 
inmeroitichieroglyphs,in_meroitic_hieroglyphs inmiao,in_miao inmiscellaneousmathematicalsymbolsa,in_miscellaneous_mathematical_symbols_a inmiscellaneousmathematicalsymbolsb,in_miscellaneous_mathematical_symbols_b inmiscellaneoussymbols,in_miscellaneous_symbols inmiscellaneoussymbolsandarrows,in_miscellaneous_symbols_and_arrows inmiscellaneoussymbolsandpictographs,in_miscellaneous_symbols_and_pictographs inmiscellaneoustechnical,in_miscellaneous_technical inmodi,in_modi inmodifiertoneletters,in_modifier_tone_letters inmongolian,in_mongolian inmongoliansupplement,in_mongolian_supplement inmro,in_mro inmultani,in_multani inmusicalsymbols,in_musical_symbols inmyanmar,in_myanmar inmyanmarextendeda,in_myanmar_extended_a inmyanmarextendedb,in_myanmar_extended_b innabataean,in_nabataean innandinagari,in_nandinagari innewa,in_newa innewtailue,in_new_tai_lue innko,in_nko innoblock,in_no_block innumberforms,in_number_forms innushu,in_nushu innyiakengpuachuehmong,in_nyiakeng_puachue_hmong inogham,in_ogham inolchiki,in_ol_chiki inoldhungarian,in_old_hungarian inolditalic,in_old_italic inoldnortharabian,in_old_north_arabian inoldpermic,in_old_permic inoldpersian,in_old_persian inoldsogdian,in_old_sogdian inoldsoutharabian,in_old_south_arabian inoldturkic,in_old_turkic inolduyghur,in_old_uyghur inopticalcharacterrecognition,in_optical_character_recognition inoriya,in_oriya inornamentaldingbats,in_ornamental_dingbats inosage,in_osage inosmanya,in_osmanya inottomansiyaqnumbers,in_ottoman_siyaq_numbers inpahawhhmong,in_pahawh_hmong inpalmyrene,in_palmyrene inpaucinhau,in_pau_cin_hau inphagspa,in_phags_pa inphaistosdisc,in_phaistos_disc inphoenician,in_phoenician inphoneticextensions,in_phonetic_extensions inphoneticextensionssupplement,in_phonetic_extensions_supplement inplayingcards,in_playing_cards inprivateusearea,in_private_use_area inpsalterpahlavi,in_psalter_pahlavi inrejang,in_rejang inruminumeralsymbols,in_rumi_numeral_symbols inrunic,in_runic insamaritan,in_samaritan 
insaurashtra,in_saurashtra inscriptionalpahlavi,inscriptional_pahlavi inscriptionalparthian,inscriptional_parthian insharada,in_sharada inshavian,in_shavian inshorthandformatcontrols,in_shorthand_format_controls insiddham,in_siddham insinhala,in_sinhala insinhalaarchaicnumbers,in_sinhala_archaic_numbers insmallformvariants,in_small_form_variants insmallkanaextension,in_small_kana_extension insogdian,in_sogdian insorasompeng,in_sora_sompeng insoyombo,in_soyombo inspacingmodifierletters,in_spacing_modifier_letters inspecials,in_specials insundanese,in_sundanese insundanesesupplement,in_sundanese_supplement insuperscriptsandsubscripts,in_superscripts_and_subscripts insupplementalarrowsa,in_supplemental_arrows_a insupplementalarrowsb,in_supplemental_arrows_b insupplementalarrowsc,in_supplemental_arrows_c insupplementalmathematicaloperators,in_supplemental_mathematical_operators insupplementalpunctuation,in_supplemental_punctuation insupplementalsymbolsandpictographs,in_supplemental_symbols_and_pictographs insupplementaryprivateuseareaa,in_supplementary_private_use_area_a insupplementaryprivateuseareab,in_supplementary_private_use_area_b insuttonsignwriting,in_sutton_signwriting insylotinagri,in_syloti_nagri insymbolsandpictographsextendeda,in_symbols_and_pictographs_extended_a insymbolsforlegacycomputing,in_symbols_for_legacy_computing insyriac,in_syriac insyriacsupplement,in_syriac_supplement intagalog,in_tagalog intagbanwa,in_tagbanwa intags,in_tags intaile,in_tai_le intaitham,in_tai_tham intaiviet,in_tai_viet intaixuanjingsymbols,in_tai_xuan_jing_symbols intakri,in_takri intamil,in_tamil intamilsupplement,in_tamil_supplement intangsa,in_tangsa intangut,in_tangut intangutcomponents,in_tangut_components intangutsupplement,in_tangut_supplement intelugu,in_telugu inthaana,in_thaana inthai,in_thai intibetan,in_tibetan intifinagh,in_tifinagh intirhuta,in_tirhuta intoto,in_toto intransportandmapsymbols,in_transport_and_map_symbols inugaritic,in_ugaritic 
inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended inunifiedcanadianaboriginalsyllabicsextendeda,in_unified_canadian_aboriginal_syllabics_extended_a invai,in_vai invariationselectors,in_variation_selectors invariationselectorssupplement,in_variation_selectors_supplement invedicextensions,in_vedic_extensions inverticalforms,in_vertical_forms invithkuqi,in_vithkuqi inwancho,in_wancho inwarangciti,in_warang_citi inyezidi,in_yezidi inyijinghexagramsymbols,in_yijing_hexagram_symbols inyiradicals,in_yi_radicals inyisyllables,in_yi_syllables inzanabazarsquare,in_zanabazar_square inznamennymusicalnotation,in_znamenny_musical_notation javanese,javanese joincontrol,join_control kaithi,kaithi kannada,kannada katakana,katakana kayahli,kayah_li kharoshthi,kharoshthi khitansmallscript,khitan_small_script khmer,khmer khojki,khojki khudawadi,khudawadi lao,lao latin,latin lepcha,lepcha letter,letter letternumber,letter_number limbu,limbu lineara,linear_a linearb,linear_b lineseparator,line_separator lisu,lisu logicalorderexception,logical_order_exception lower,lower lowercase,lowercase lowercaseletter,lowercase_letter lycian,lycian lydian,lydian mahajani,mahajani makasar,makasar malayalam,malayalam mandaic,mandaic manichaean,manichaean marchen,marchen mark,mark masaramgondi,masaram_gondi math,math mathsymbol,math_symbol medefaidrin,medefaidrin meeteimayek,meetei_mayek mendekikakui,mende_kikakui meroiticcursive,meroitic_cursive meroitichieroglyphs,meroitic_hieroglyphs miao,miao modi,modi modifierletter,modifier_letter modifiersymbol,modifier_symbol mongolian,mongolian mro,mro multani,multani myanmar,myanmar nabataean,nabataean nandinagari,nandinagari newa,newa newline,newline newtailue,new_tai_lue nko,nko noncharactercodepoint,noncharacter_code_point nonspacingmark,nonspacing_mark number,number nushu,nushu nyiakengpuachuehmong,nyiakeng_puachue_hmong ogham,ogham 
olchiki,ol_chiki oldhungarian,old_hungarian olditalic,old_italic oldnortharabian,old_north_arabian oldpermic,old_permic oldpersian,old_persian oldsogdian,old_sogdian oldsoutharabian,old_south_arabian oldturkic,old_turkic olduyghur,old_uyghur openpunctuation,open_punctuation oriya,oriya osage,osage osmanya,osmanya other,other otheralphabetic,other_alphabetic otherdefaultignorablecodepoint,other_default_ignorable_code_point othergraphemeextend,other_grapheme_extend otheridcontinue,other_id_continue otheridstart,other_id_start otherletter,other_letter otherlowercase,other_lowercase othermath,other_math othernumber,other_number otherpunctuation,other_punctuation othersymbol,other_symbol otheruppercase,other_uppercase pahawhhmong,pahawh_hmong palmyrene,palmyrene paragraphseparator,paragraph_separator patternsyntax,pattern_syntax patternwhitespace,pattern_white_space paucinhau,pau_cin_hau phagspa,phags_pa phoenician,phoenician prependedconcatenationmark,prepended_concatenation_mark print,print privateuse,private_use psalterpahlavi,psalter_pahlavi punct,punct punctuation,punctuation quotationmark,quotation_mark radical,radical regionalindicator,regional_indicator rejang,rejang runic,runic samaritan,samaritan saurashtra,saurashtra sentenceterminal,sentence_terminal separator,separator sharada,sharada shavian,shavian siddham,siddham signwriting,signwriting sinhala,sinhala softdotted,soft_dotted sogdian,sogdian sorasompeng,sora_sompeng soyombo,soyombo space,space spaceseparator,space_separator spacingmark,spacing_mark sundanese,sundanese surrogate,surrogate sylotinagri,syloti_nagri symbol,symbol syriac,syriac tagalog,tagalog tagbanwa,tagbanwa taile,tai_le taitham,tai_tham taiviet,tai_viet takri,takri tamil,tamil tangsa,tangsa tangut,tangut telugu,telugu terminalpunctuation,terminal_punctuation thaana,thaana thai,thai tibetan,tibetan tifinagh,tifinagh tirhuta,tirhuta titlecaseletter,titlecase_letter toto,toto ugaritic,ugaritic unassigned,unassigned 
unifiedideograph,unified_ideograph unknown,unknown upper,upper uppercase,uppercase uppercaseletter,uppercase_letter vai,vai variationselector,variation_selector vithkuqi,vithkuqi wancho,wancho warangciti,warang_citi whitespace,white_space word,word xdigit,xdigit xidcontinue,xid_continue xidstart,xid_start xposixpunct,xposixpunct yezidi,yezidi yi,yi zanabazarsquare,zanabazar_square ammar-regexp_parser-0494e56/lib/regexp_parser/scanner/properties/short.csv000066400000000000000000000100321433525313500270520ustar00rootroot00000000000000# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT adlm,adlam aghb,caucasian_albanian ahex,ascii_hex_digit arab,arabic armi,imperial_aramaic armn,armenian avst,avestan bali,balinese bamu,bamum bass,bassa_vah batk,batak beng,bengali bhks,bhaiksuki bidic,bidi_control bopo,bopomofo brah,brahmi brai,braille bugi,buginese buhd,buhid c,other cakm,chakma cans,canadian_aboriginal cari,carian cc,control cf,format cher,cherokee chrs,chorasmian ci,case_ignorable cn,unassigned co,private_use combiningmark,mark copt,coptic cpmn,cypro_minoan cprt,cypriot cs,surrogate cwcf,changes_when_casefolded cwcm,changes_when_casemapped cwl,changes_when_lowercased cwt,changes_when_titlecased cwu,changes_when_uppercased cyrl,cyrillic dep,deprecated deva,devanagari di,default_ignorable_code_point dia,diacritic diak,dives_akuru dogr,dogra dsrt,deseret dupl,duployan ebase,emoji_modifier_base ecomp,emoji_component egyp,egyptian_hieroglyphs elba,elbasan elym,elymaic emod,emoji_modifier epres,emoji_presentation ethi,ethiopic ext,extender geor,georgian glag,glagolitic gong,gunjala_gondi gonm,masaram_gondi goth,gothic gran,grantha grbase,grapheme_base grek,greek grext,grapheme_extend grlink,grapheme_link gujr,gujarati guru,gurmukhi hang,hangul hani,han hano,hanunoo hatr,hatran hebr,hebrew hex,hex_digit hira,hiragana hluw,anatolian_hieroglyphs hmng,pahawh_hmong hmnp,nyiakeng_puachue_hmong hung,old_hungarian idc,id_continue ideo,ideographic ids,id_start 
idsb,ids_binary_operator idst,ids_trinary_operator ital,old_italic java,javanese joinc,join_control kali,kayah_li kana,katakana khar,kharoshthi khmr,khmer khoj,khojki kits,khitan_small_script knda,kannada kthi,kaithi l,letter lana,tai_tham laoo,lao latn,latin lc,cased_letter lepc,lepcha limb,limbu lina,linear_a linb,linear_b ll,lowercase_letter lm,modifier_letter lo,other_letter loe,logical_order_exception lt,titlecase_letter lu,uppercase_letter lyci,lycian lydi,lydian m,mark mahj,mahajani maka,makasar mand,mandaic mani,manichaean marc,marchen mc,spacing_mark me,enclosing_mark medf,medefaidrin mend,mende_kikakui merc,meroitic_cursive mero,meroitic_hieroglyphs mlym,malayalam mn,nonspacing_mark mong,mongolian mroo,mro mtei,meetei_mayek mult,multani mymr,myanmar n,number nand,nandinagari narb,old_north_arabian nbat,nabataean nchar,noncharacter_code_point nd,decimal_number nkoo,nko nl,letter_number no,other_number nshu,nushu oalpha,other_alphabetic odi,other_default_ignorable_code_point ogam,ogham ogrext,other_grapheme_extend oidc,other_id_continue oids,other_id_start olck,ol_chiki olower,other_lowercase omath,other_math orkh,old_turkic orya,oriya osge,osage osma,osmanya ougr,old_uyghur oupper,other_uppercase p,punctuation palm,palmyrene patsyn,pattern_syntax patws,pattern_white_space pauc,pau_cin_hau pc,connector_punctuation pcm,prepended_concatenation_mark pd,dash_punctuation pe,close_punctuation perm,old_permic pf,final_punctuation phag,phags_pa phli,inscriptional_pahlavi phlp,psalter_pahlavi phnx,phoenician pi,initial_punctuation plrd,miao po,other_punctuation prti,inscriptional_parthian ps,open_punctuation qaac,coptic qaai,inherited qmark,quotation_mark ri,regional_indicator rjng,rejang rohg,hanifi_rohingya runr,runic s,symbol samr,samaritan sarb,old_south_arabian saur,saurashtra sc,currency_symbol sd,soft_dotted sgnw,signwriting shaw,shavian shrd,sharada sidd,siddham sind,khudawadi sinh,sinhala sk,modifier_symbol sm,math_symbol so,other_symbol sogd,sogdian 
sogo,old_sogdian sora,sora_sompeng soyo,soyombo sterm,sentence_terminal sund,sundanese sylo,syloti_nagri syrc,syriac tagb,tagbanwa takr,takri tale,tai_le talu,new_tai_lue taml,tamil tang,tangut tavt,tai_viet telu,telugu term,terminal_punctuation tfng,tifinagh tglg,tagalog thaa,thaana tibt,tibetan tirh,tirhuta tnsa,tangsa ugar,ugaritic uideo,unified_ideograph vaii,vai vith,vithkuqi vs,variation_selector wara,warang_citi wcho,wancho wspace,white_space xidc,xid_continue xids,xid_start xpeo,old_persian xsux,cuneiform yezi,yezidi yiii,yi z,separator zanb,zanabazar_square zinh,inherited zl,line_separator zp,paragraph_separator zs,space_separator zyyy,common zzzz,unknown ammar-regexp_parser-0494e56/lib/regexp_parser/scanner/property.rl000066400000000000000000000014471433525313500252370ustar00rootroot00000000000000%%{ machine re_property; property_char = [pP]; property_sequence = property_char . '{' . '^'? (alnum|space|[_\-\.=])+ '}'; action premature_property_end { raise PrematureEndError.new('unicode property') } # Unicode properties scanner # -------------------------------------------------------------------------- unicode_property := |* property_sequence < eof(premature_property_end) { text = copy(data, ts-1, te) type = (text[1] == 'P') ^ (text[3] == '^') ? 
:nonproperty : :property name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase token = self.class.short_prop_map[name] || self.class.long_prop_map[name] validation_error(:property, name) unless token self.emit(type, token.to_sym, text) fret; }; *|; }%% ammar-regexp_parser-0494e56/lib/regexp_parser/scanner/scanner.rl000066400000000000000000000674141433525313500250120ustar00rootroot00000000000000%%{ machine re_scanner; include re_char_type "char_type.rl"; include re_property "property.rl"; utf8_2_byte = (0xc2..0xdf 0x80..0xbf); utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf); utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf); utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte; dot = '.'; backslash = '\\'; alternation = '|'; beginning_of_line = '^'; end_of_line = '$'; range_open = '{'; range_close = '}'; curlies = range_open | range_close; group_open = '('; group_close = ')'; parentheses = group_open | group_close; set_open = '['; set_close = ']'; brackets = set_open | set_close; comment = ('#' . [^\n]* . '\n'?); class_posix = ('[:' . '^'? . [^\[\]]* . ':]'); # these are not supported in ruby at the moment collating_sequence = '[.' . (alpha | [\-])+ . '.]'; character_equivalent = '[=' . alpha . '=]'; line_anchor = beginning_of_line | end_of_line; anchor_char = [AbBzZG]; escaped_ascii = [abefnrtv]; octal_sequence = [0-7]{1,3}; hex_sequence = 'x' . xdigit{1,2}; hex_sequence_err = 'x' . [^0-9a-fA-F{]; codepoint_single = 'u' . xdigit{4}; codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}'; codepoint_sequence = codepoint_single | codepoint_list; control_sequence = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any; meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any; sequence_char = [CMcux]; zero_or_one = '?' | '??' | '?+'; zero_or_more = '*' | '*?' | '*+'; one_or_more = '+' | '+?' | '++'; quantifier_greedy = '?' | '*' | '+'; quantifier_reluctant = '??' | '*?' 
| '+?'; quantifier_possessive = '?+' | '*+' | '++'; quantifier_mode = '?' | '+'; quantity_exact = (digit+); quantity_minimum = (digit+) . ','; quantity_maximum = ',' . (digit+); quantity_range = (digit+) . ',' . (digit+); quantifier_interval = range_open . ( quantity_exact | quantity_minimum | quantity_maximum | quantity_range ) . range_close; quantifiers = quantifier_greedy | quantifier_reluctant | quantifier_possessive | quantifier_interval; conditional = '(?('; group_comment = '?#' . [^)]* . group_close; group_atomic = '?>'; group_passive = '?:'; group_absence = '?~'; assertion_lookahead = '?='; assertion_nlookahead = '?!'; assertion_lookbehind = '?<='; assertion_nlookbehind = '?~]+ . ':'? ) ?; group_ref = [gk]; group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*; group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*; group_number = '-'? . [1-9] . [0-9]*; group_level = [+\-] . [0-9]+; group_name = ('<' . group_name_id_ab? . '>') | ("'" . group_name_id_sq? . "'"); group_lookup = group_name | group_number; group_named = ('?' . group_name ); group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') | ("'" . group_name_id_sq? . group_level? "'")); group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') | ("'" . group_name_id_sq? . group_level? "'")); group_number_backref = 'k' . (('<' . group_number . group_level? '>') | ("'" . group_number . group_level? "'")); group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') | ("'" . ((group_number . group_level?) 
| '0') "'")); group_type = group_atomic | group_passive | group_absence | group_named; keep_mark = 'K'; assertion_type = assertion_lookahead | assertion_nlookahead | assertion_lookbehind | assertion_nlookbehind; # characters that 'break' a literal meta_char = dot | backslash | alternation | curlies | parentheses | brackets | line_anchor | quantifier_greedy; literal_delimiters = ']' | '}'; ascii_print = ((0x20..0x7e) - meta_char - '#'); ascii_nonprint = (0x01..0x1f | 0x7f); non_literal_escape = char_type_char | anchor_char | escaped_ascii | keep_mark | sequence_char; # escapes that also work within a character set set_escape = backslash | brackets | escaped_ascii | property_char | sequence_char | single_codepoint_char_type; # EOF error, used where it can be detected action premature_end_error { text = copy(data, ts ? ts-1 : 0, -1) raise PrematureEndError.new( text ) } # Invalid sequence error, used from sequences, like escapes and sets action invalid_sequence_error { text = copy(data, ts ? ts-1 : 0, -1) validation_error(:sequence, 'sequence', text) } # group (nesting) and set open/close actions action group_opened { self.group_depth = group_depth + 1 } action group_closed { self.group_depth = group_depth - 1 } action set_opened { self.set_depth = set_depth + 1 } action set_closed { self.set_depth = set_depth - 1 } # Character set scanner, continues consuming characters until it meets the # closing bracket of the set. # -------------------------------------------------------------------------- character_set := |* set_close > (set_meta, 2) @set_closed { emit(:set, :close, copy(data, ts, te)) if in_set? fret; else fgoto main; end }; '-]' @set_closed { # special case, emits two tokens emit(:literal, :literal, copy(data, ts, te-1)) emit(:set, :close, copy(data, ts+1, te)) if in_set? 
fret; else fgoto main; end }; '-&&' { # special case, emits two tokens emit(:literal, :literal, '-') emit(:set, :intersection, '&&') }; '^' { text = copy(data, ts, te) if tokens.last[1] == :open emit(:set, :negate, text) else emit(:literal, :literal, text) end }; '-' { text = copy(data, ts, te) # ranges cant start with a subset or intersection/negation/range operator if tokens.last[0] == :set emit(:literal, :literal, text) else emit(:set, :range, text) end }; # Unlike ranges, intersections can start or end at set boundaries, whereupon # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil] '&&' { emit(:set, :intersection, copy(data, ts, te)) }; backslash { fcall set_escape_sequence; }; set_open >(open_bracket, 1) >set_opened { emit(:set, :open, copy(data, ts, te)) fcall character_set; }; class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) { text = copy(data, ts, te) type = :posixclass class_name = text[2..-3] if class_name[0] == '^' class_name = class_name[1..-1] type = :nonposixclass end unless self.class.posix_classes.include?(class_name) validation_error(:posix_class, text) end emit(type, class_name.to_sym, text) }; # These are not supported in ruby at the moment. Enable them if they are. 
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) { # emit(:set, :collation, copy(data, ts, te)) # }; # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) { # emit(:set, :equivalent, copy(data, ts, te)) # }; meta_char > (set_meta, 1) { emit(:literal, :literal, copy(data, ts, te)) }; any | ascii_nonprint | utf8_multibyte { text = copy(data, ts, te) emit(:literal, :literal, text) }; *|; # set escapes scanner # -------------------------------------------------------------------------- set_escape_sequence := |* set_escape > (escaped_set_alpha, 2) { fhold; fnext character_set; fcall escape_sequence; }; any > (escaped_set_alpha, 1) { emit(:escape, :literal, copy(data, ts-1, te)) fret; }; *|; # escape sequence scanner # -------------------------------------------------------------------------- escape_sequence := |* [1-9] { text = copy(data, ts-1, te) emit(:backref, :number, text) fret; }; octal_sequence { emit(:escape, :octal, copy(data, ts-1, te)) fret; }; meta_char { case text = copy(data, ts-1, te) when '\.'; emit(:escape, :dot, text) when '\|'; emit(:escape, :alternation, text) when '\^'; emit(:escape, :bol, text) when '\$'; emit(:escape, :eol, text) when '\?'; emit(:escape, :zero_or_one, text) when '\*'; emit(:escape, :zero_or_more, text) when '\+'; emit(:escape, :one_or_more, text) when '\('; emit(:escape, :group_open, text) when '\)'; emit(:escape, :group_close, text) when '\{'; emit(:escape, :interval_open, text) when '\}'; emit(:escape, :interval_close, text) when '\['; emit(:escape, :set_open, text) when '\]'; emit(:escape, :set_close, text) when "\\\\"; emit(:escape, :backslash, text) end fret; }; escaped_ascii > (escaped_alpha, 7) { # \b is emitted as backspace only when inside a character set, otherwise # it is a word boundary anchor. A syntax might "normalize" it if needed. 
case text = copy(data, ts-1, te) when '\a'; emit(:escape, :bell, text) when '\b'; emit(:escape, :backspace, text) when '\e'; emit(:escape, :escape, text) when '\f'; emit(:escape, :form_feed, text) when '\n'; emit(:escape, :newline, text) when '\r'; emit(:escape, :carriage, text) when '\t'; emit(:escape, :tab, text) when '\v'; emit(:escape, :vertical_tab, text) end fret; }; codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) { text = copy(data, ts-1, te) if text[2] == '{' emit(:escape, :codepoint_list, text) else emit(:escape, :codepoint, text) end fret; }; hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) { emit(:escape, :hex, copy(data, ts-1, te)) fret; }; hex_sequence_err @invalid_sequence_error { fret; }; control_sequence >(escaped_alpha, 4) $eof(premature_end_error) { emit_meta_control_sequence(data, ts, te, :control) fret; }; meta_sequence >(backslashed, 3) $eof(premature_end_error) { emit_meta_control_sequence(data, ts, te, :meta_sequence) fret; }; char_type_char > (escaped_alpha, 2) { fhold; fnext *(in_set? ? fentry(character_set) : fentry(main)); fcall char_type; }; property_char > (escaped_alpha, 2) { fhold; fnext *(in_set? ? fentry(character_set) : fentry(main)); fcall unicode_property; }; (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) { emit(:escape, :literal, copy(data, ts-1, te)) fret; }; *|; # conditional expressions scanner # -------------------------------------------------------------------------- conditional_expression := |* group_lookup . 
')' { text = copy(data, ts, te-1) emit(:conditional, :condition, text) emit(:conditional, :condition_close, ')') }; any { fhold; fcall main; }; *|; # Main scanner # -------------------------------------------------------------------------- main := |* # Meta characters # ------------------------------------------------------------------------ dot { emit(:meta, :dot, copy(data, ts, te)) }; alternation { if conditional_stack.last == group_depth emit(:conditional, :separator, copy(data, ts, te)) else emit(:meta, :alternation, copy(data, ts, te)) end }; # Anchors # ------------------------------------------------------------------------ beginning_of_line { emit(:anchor, :bol, copy(data, ts, te)) }; end_of_line { emit(:anchor, :eol, copy(data, ts, te)) }; backslash . keep_mark > (backslashed, 4) { emit(:keep, :mark, copy(data, ts, te)) }; backslash . anchor_char > (backslashed, 3) { case text = copy(data, ts, te) when '\A'; emit(:anchor, :bos, text) when '\z'; emit(:anchor, :eos, text) when '\Z'; emit(:anchor, :eos_ob_eol, text) when '\b'; emit(:anchor, :word_boundary, text) when '\B'; emit(:anchor, :nonword_boundary, text) when '\G'; emit(:anchor, :match_start, text) end }; literal_delimiters { append_literal(data, ts, te) }; # Character sets # ------------------------------------------------------------------------ set_open >set_opened { emit(:set, :open, copy(data, ts, te)) fcall character_set; }; # Conditional expression # (?(condition)Y|N) conditional expression # ------------------------------------------------------------------------ conditional { text = copy(data, ts, te) conditional_stack << group_depth emit(:conditional, :open, text[0..-2]) emit(:conditional, :condition_open, '(') fcall conditional_expression; }; # (?#...) comments: parsed as a single expression, without introducing a # new nesting level. Comments may not include parentheses, escaped or not. # special case for close, action performed on all transitions to get the # correct closing count. 
# ------------------------------------------------------------------------ group_open . group_comment $group_closed { emit(:group, :comment, copy(data, ts, te)) }; # Expression options: # (?imxdau-imx) option on/off # i: ignore case # m: multi-line (dot(.) match newline) # x: extended form # d: default class rules (1.9 compatible) # a: ASCII class rules (\s, \w, etc.) # u: Unicode class rules (\s, \w, etc.) # # (?imxdau-imx:subexp) option on/off for subexp # ------------------------------------------------------------------------ group_open . group_options >group_opened { text = copy(data, ts, te) if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/ validation_error(:group_option, $1 || "-#{$2}", text) end emit_options(text) }; # Assertions # (?=subexp) look-ahead # (?!subexp) negative look-ahead # (?<=subexp) look-behind # (?group_opened { case text = copy(data, ts, te) when '(?='; emit(:assertion, :lookahead, text) when '(?!'; emit(:assertion, :nlookahead, text) when '(?<='; emit(:assertion, :lookbehind, text) when '(?subexp) atomic group, don't backtrack in subexp. # (?~subexp) absence group, matches anything that is not subexp # (?subexp) named group # (?'name'subexp) named group (single quoted version) # (subexp) captured group # ------------------------------------------------------------------------ group_open . 
group_type >group_opened { case text = copy(data, ts, te) when '(?:'; emit(:group, :passive, text) when '(?>'; emit(:group, :atomic, text) when '(?~'; emit(:group, :absence, text) when /^\(\?(?:<>|'')/ validation_error(:group, 'named group', 'name is empty') when /^\(\?<[^>]+>/ emit(:group, :named_ab, text) when /^\(\?'[^']+'/ emit(:group, :named_sq, text) end }; group_open @group_opened { text = copy(data, ts, te) emit(:group, :capture, text) }; group_close @group_closed { if conditional_stack.last == group_depth + 1 conditional_stack.pop emit(:conditional, :close, copy(data, ts, te)) else if spacing_stack.length > 1 && spacing_stack.last[:depth] == group_depth + 1 spacing_stack.pop self.free_spacing = spacing_stack.last[:free_spacing] end emit(:group, :close, copy(data, ts, te)) end }; # Group backreference, named and numbered # ------------------------------------------------------------------------ backslash . (group_name_backref | group_number_backref) > (backslashed, 4) { case text = copy(data, ts, te) when /^\\k(<>|'')/ validation_error(:backref, 'backreference', 'ref ID is empty') when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text) when /^\\k(.)\d+\D$/ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text) when /^\\k(.)-\d+\D$/ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text) when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text) when /^\\k(.)-?\d+[+\-]\d+\D$/ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text) end }; # Group call, named and numbered # ------------------------------------------------------------------------ backslash . (group_name_call | group_number_call) > (backslashed, 4) { case text = copy(data, ts, te) when /^\\g(<>|'')/ validation_error(:backref, 'subexpression call', 'ref ID is empty') when /^\\g(.)[^\p{digit}+\->][^+\-]*/ emit(:backref, $1 == '<' ? 
:name_call_ab : :name_call_sq, text) when /^\\g(.)\d+\D$/ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text) when /^\\g(.)[+-]\d+/ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text) end }; # Quantifiers # ------------------------------------------------------------------------ zero_or_one { case text = copy(data, ts, te) when '?' ; emit(:quantifier, :zero_or_one, text) when '??'; emit(:quantifier, :zero_or_one_reluctant, text) when '?+'; emit(:quantifier, :zero_or_one_possessive, text) end }; zero_or_more { case text = copy(data, ts, te) when '*' ; emit(:quantifier, :zero_or_more, text) when '*?'; emit(:quantifier, :zero_or_more_reluctant, text) when '*+'; emit(:quantifier, :zero_or_more_possessive, text) end }; one_or_more { case text = copy(data, ts, te) when '+' ; emit(:quantifier, :one_or_more, text) when '+?'; emit(:quantifier, :one_or_more_reluctant, text) when '++'; emit(:quantifier, :one_or_more_possessive, text) end }; quantifier_interval { emit(:quantifier, :interval, copy(data, ts, te)) }; # Catch unmatched curly braces as literals range_open { append_literal(data, ts, te) }; # Escaped sequences # ------------------------------------------------------------------------ backslash > (backslashed, 1) { fcall escape_sequence; }; comment { if free_spacing emit(:free_space, :comment, copy(data, ts, te)) else # consume only the pound sign (#) and backtrack to do regular scanning append_literal(data, ts, ts + 1) fexec ts + 1; end }; space+ { if free_spacing emit(:free_space, :whitespace, copy(data, ts, te)) else append_literal(data, ts, te) end }; # Literal: any run of ASCII (pritable or non-printable), and/or UTF-8, # except meta characters. 
# ------------------------------------------------------------------------ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ { append_literal(data, ts, te) }; *|; }%% # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY # This file was generated from lib/regexp_parser/scanner/scanner.rl require 'regexp_parser/error' class Regexp::Scanner # General scanner error (catch all) class ScannerError < Regexp::Parser::Error; end # Base for all scanner validation errors class ValidationError < Regexp::Parser::Error def initialize(reason) super reason end end # Unexpected end of pattern class PrematureEndError < ScannerError def initialize(where = '') super "Premature end of pattern at #{where}" end end # Invalid sequence format. Used for escape sequences, mainly. class InvalidSequenceError < ValidationError def initialize(what = 'sequence', where = '') super "Invalid #{what} at #{where}" end end # Invalid group. Used for named groups. class InvalidGroupError < ValidationError def initialize(what, reason) super "Invalid #{what}, #{reason}." end end # Invalid groupOption. Used for inline options. # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency class InvalidGroupOption < ValidationError def initialize(option, text) super "Invalid group option #{option} in #{text}" end end # Invalid back reference. Used for name a number refs/calls. class InvalidBackrefError < ValidationError def initialize(what, reason) super "Invalid back reference #{what}, #{reason}" end end # The property name was not recognized by the scanner. class UnknownUnicodePropertyError < ValidationError def initialize(name) super "Unknown unicode character property name #{name}" end end # The POSIX class name was not recognized by the scanner. 
class UnknownPosixClassError < ValidationError
  # text - the bracket expression that could not be resolved.
  def initialize(text)
    super "Unknown POSIX class #{text}"
  end
end

# Scans the given regular expression text, or Regexp object and collects the
# emitted token into an array that gets returned at the end. If a block is
# given, it gets called for each emitted token.
#
# This is a convenience wrapper that builds a new scanner and delegates to
# the instance method #scan.
#
# This method may raise errors if a syntax error is encountered.
# --------------------------------------------------------------------------
def self.scan(input_object, options: nil, &block)
  new.scan(input_object, options: options, &block)
end

# Runs the state machine over one pattern.
#
# input_object - a String holding regexp source, or a Regexp (its #source is
#                scanned and its #options decide free-spacing mode).
# options      - option bits tested against Regexp::EXTENDED; only allowed
#                when input_object is a String (see #free_spacing?).
# block        - optional; called with (type, token, text, ts, te) for every
#                emitted token.
#
# Returns the Array of emitted tokens, each of the form
# [type, token, text, start_char_pos, end_char_pos].
#
# Raises ScannerError when the machine ends in its error state, and
# PrematureEndError when a group or set is still open at the end of input.
# The machine's actions may also raise validation errors.
def scan(input_object, options: nil, &block)
  self.literal = nil
  # Not referenced by the Ruby code visible here; presumably the call stack
  # used by the generated fcall/fret code -- TODO confirm against the
  # generated scanner.rb.
  stack = []

  input = input_object.is_a?(Regexp) ? input_object.source : input_object
  self.free_spacing = free_spacing?(input_object, options)
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]

  # The machine works on signed bytes, not characters.
  # NOTE(review): data stays nil when input is not a String, so the next line
  # raises NoMethodError for any other kind of input.
  data = input.unpack("c*") if input.is_a?(String)
  eof = data.length

  self.tokens = []
  self.block = block_given? ? block : nil

  self.set_depth = 0
  self.group_depth = 0
  self.conditional_stack = []
  self.char_pos = 0

  # Ragel directives: the generated tables, initialisation and main loop of
  # the machine are written in place of these three lines.
  %% write data;
  %% write init;
  %% write exec;

  # to avoid "warning: assigned but unused variable - testEof"
  testEof = testEof

  # cs and re_scanner_error are not defined in the visible Ruby code;
  # presumably they come from the code generated above.
  if cs == re_scanner_error
    # NOTE(review): #copy slices with an exclusive range, so te = -1 leaves
    # out the final byte of data -- confirm this is intended.
    text = copy(data, ts ? ts-1 : 0, -1)
    raise ScannerError.new("Scan error at '#{text}'")
  end

  raise PrematureEndError.new("(missing group closing paranthesis) "+
        "[#{group_depth}]") if in_group?
  raise PrematureEndError.new("(missing set closing bracket) "+
        "[#{set_depth}]") if in_set?

  # when the entire expression is a literal run
  emit_literal if literal

  tokens
end

# lazy-load property maps when first needed

# Memoized Hash built from properties/short.csv (first column => second).
def self.short_prop_map
  @short_prop_map ||= parse_prop_map('short')
end

# Memoized Hash built from properties/long.csv (first column => second).
def self.long_prop_map
  @long_prop_map ||= parse_prop_map('long')
end

# Reads scanner/properties/<name>.csv next to this file and turns every
# "key,value" line into a Hash entry. Lines without a comma, such as the
# generated-file header, do not match the pattern and are skipped.
def self.parse_prop_map(name)
  File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
end

# Names accepted inside [:...:] bracket expressions. A new Array is built on
# every call.
def self.posix_classes
  %w[alnum alpha ascii blank cntrl digit graph lower print punct space upper
     word xdigit]
end

# Emits an array with the details of the scanned pattern
#
# type, token - symbols classifying the token.
# text        - the matched source text.
#
# Any pending literal run is flushed first so that tokens stay in source
# order. The token is passed to the block (if one was given), appended to
# tokens, and char_pos is advanced past it.
def emit(type, token, text)
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"

  emit_literal if literal

  # Ragel runs with byte-based indices (ts, te). These are of little value to
  # end-users, so we keep track of char-based indices and emit those instead.
  ts_char_pos = char_pos
  te_char_pos = char_pos + text.length

  if block
    block.call type, token, text, ts_char_pos, te_char_pos
  end

  tokens << [type, token, text, ts_char_pos, te_char_pos]

  self.char_pos = te_char_pos
end

private

# Scanner state; all of it is reset at the start of #scan.
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
              :group_depth, :set_depth, :conditional_stack, :char_pos

# Decides whether scanning starts in free-spacing (extended, "x") mode.
#
# Raises ArgumentError when options are given for anything but a String.
# For a Regexp the object's own options are used. Returns false when there
# are no options, otherwise whether the Regexp::EXTENDED bit is set.
def free_spacing?(input_object, options)
  if options && !input_object.is_a?(String)
    raise ArgumentError, 'options cannot be supplied unless scanning a String'
  end

  options = input_object.options if input_object.is_a?(::Regexp)

  return false unless options

  options & Regexp::EXTENDED != 0
end

# True while at least one group is open.
def in_group?
  group_depth > 0
end

# True while at least one character set is open.
def in_set?
  set_depth > 0
end

# Copy from ts to te from data as text
#
# data is the Array of signed bytes produced in #scan; te is exclusive. The
# bytes are packed back into a String that is tagged as UTF-8.
def copy(data, ts, te)
  data[ts...te].pack('c*').force_encoding('utf-8')
end

# Appends one or more characters to the literal buffer, to be emitted later
# by a call to emit_literal.
def append_literal(data, ts, te)
  self.literal = literal || []
  literal << copy(data, ts, te)
end

# Emits the literal run collected by calls to the append_literal method.
def emit_literal text = literal.join self.literal = nil emit(:literal, :literal, text) end def emit_options(text) token = nil # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'. text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/ positive, negative, group_local = $1, $2, $3 if positive.include?('x') self.free_spacing = true end # If the x appears in both, treat it like ruby does, the second cancels # the first. if negative && negative.include?('x') self.free_spacing = false end if group_local spacing_stack << {:free_spacing => free_spacing, :depth => group_depth} token = :options else # switch for parent group level spacing_stack.last[:free_spacing] = free_spacing token = :options_switch end emit(:group, token, text) end def emit_meta_control_sequence(data, ts, te, token) if data.last < 0x00 || data.last > 0x7F validation_error(:sequence, 'escape', token.to_s) end emit(:escape, token, copy(data, ts-1, te)) end # Centralizes and unifies the handling of validation related # errors. def validation_error(type, what, reason = nil) error = case type when :backref then InvalidBackrefError.new(what, reason) when :group then InvalidGroupError.new(what, reason) when :group_option then InvalidGroupOption.new(what, reason) when :posix_class then UnknownPosixClassError.new(what) when :property then UnknownUnicodePropertyError.new(what) when :sequence then InvalidSequenceError.new(what, reason) end raise error # unless @@config.validation_ignore end end # module Regexp::Scanner ammar-regexp_parser-0494e56/lib/regexp_parser/syntax.rb000066400000000000000000000004241433525313500232300ustar00rootroot00000000000000require 'regexp_parser/error' module Regexp::Syntax class SyntaxError < Regexp::Parser::Error; end end require_relative 'syntax/token' require_relative 'syntax/base' require_relative 'syntax/any' require_relative 'syntax/version_lookup' require_relative 'syntax/versions' 
ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/000077500000000000000000000000001433525313500227035ustar00rootroot00000000000000ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/any.rb000066400000000000000000000004731433525313500240230ustar00rootroot00000000000000module Regexp::Syntax # A syntax that always returns true, passing all tokens as implemented. This # is useful during development, testing, and should be useful for some types # of transformations as well. class Any < Base implements :*, [:*] def self.implements?(_type, _token) true end end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/base.rb000066400000000000000000000064271433525313500241530ustar00rootroot00000000000000module Regexp::Syntax class NotImplementedError < Regexp::Syntax::SyntaxError def initialize(syntax, type, token) super "#{syntax} does not implement: [#{type}:#{token}]" end end # A lookup map of supported types and tokens in a given syntax class Base include Regexp::Syntax::Token class << self attr_accessor :features # automatically inherit features through the syntax class hierarchy def inherited(subclass) super subclass.features = features.to_h.map { |k, v| [k, v.dup] }.to_h end def implements(type, tokens) (features[type] ||= []).concat(tokens) added_features[type] = tokens end def excludes(type, tokens) tokens.each { |tok| features[type].delete(tok) } removed_features[type] = tokens end def implements?(type, token) implementations(type).include?(token) end alias :check? :implements? def implementations(type) features[type] || [] end def implements!(type, token) raise NotImplementedError.new(self, type, token) unless implements?(type, token) end alias :check! :implements! 
def added_features @added_features ||= {} end def removed_features @removed_features ||= {} end def normalize(type, token) case type when :group normalize_group(type, token) when :backref normalize_backref(type, token) else [type, token] end end def normalize_group(type, token) case token when :named_ab, :named_sq %i[group named] else [type, token] end end def normalize_backref(type, token) case token when :name_ref_ab, :name_ref_sq %i[backref name_ref] when :name_call_ab, :name_call_sq %i[backref name_call] when :name_recursion_ref_ab, :name_recursion_ref_sq %i[backref name_recursion_ref] when :number_ref_ab, :number_ref_sq %i[backref number_ref] when :number_call_ab, :number_call_sq %i[backref number_call] when :number_rel_ref_ab, :number_rel_ref_sq %i[backref number_rel_ref] when :number_rel_call_ab, :number_rel_call_sq %i[backref number_rel_call] when :number_recursion_ref_ab, :number_recursion_ref_sq %i[backref number_recursion_ref] else [type, token] end end end # TODO: drop this backwards compatibility code in v3.0.0, do `private :new` def initialize warn 'Using instances of Regexp::Parser::Syntax is deprecated ' \ "and will no longer be supported in v3.0.0." end def method_missing(name, *args) if self.class.respond_to?(name) warn 'Using instances of Regexp::Parser::Syntax is deprecated ' \ "and will no longer be supported in v3.0.0. Please call "\ "methods on the class directly, e.g.: #{self.class}.#{name}" self.class.send(name, *args) else super end end def respond_to_missing?(name, include_private = false) self.class.respond_to?(name) || super end # end of backwards compatibility code end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token.rb000066400000000000000000000024021433525313500243460ustar00rootroot00000000000000# Define the base module and the simplest of tokens. 
module Regexp::Syntax module Token Map = {} module Literal All = %i[literal] Type = :literal end module FreeSpace All = %i[comment whitespace] Type = :free_space end Map[FreeSpace::Type] = FreeSpace::All Map[Literal::Type] = Literal::All end end # Load all the token files, they will populate the Map constant. require 'regexp_parser/syntax/token/anchor' require 'regexp_parser/syntax/token/assertion' require 'regexp_parser/syntax/token/backreference' require 'regexp_parser/syntax/token/posix_class' require 'regexp_parser/syntax/token/character_set' require 'regexp_parser/syntax/token/character_type' require 'regexp_parser/syntax/token/conditional' require 'regexp_parser/syntax/token/escape' require 'regexp_parser/syntax/token/group' require 'regexp_parser/syntax/token/keep' require 'regexp_parser/syntax/token/meta' require 'regexp_parser/syntax/token/quantifier' require 'regexp_parser/syntax/token/unicode_property' # After loading all the tokens the map is full. Extract all tokens and types # into the All and Types constants. 
module Regexp::Syntax module Token All = Map.values.flatten.uniq.sort.freeze Types = Map.keys.freeze end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/000077500000000000000000000000001433525313500240235ustar00rootroot00000000000000ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/anchor.rb000066400000000000000000000005351433525313500256250ustar00rootroot00000000000000module Regexp::Syntax module Token module Anchor Basic = %i[bol eol] Extended = Basic + %i[word_boundary nonword_boundary] String = %i[bos eos eos_ob_eol] MatchStart = %i[match_start] All = Extended + String + MatchStart Type = :anchor end Map[Anchor::Type] = Anchor::All end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/assertion.rb000066400000000000000000000004141433525313500263560ustar00rootroot00000000000000module Regexp::Syntax module Token module Assertion Lookahead = %i[lookahead nlookahead] Lookbehind = %i[lookbehind nlookbehind] All = Lookahead + Lookbehind Type = :assertion end Map[Assertion::Type] = Assertion::All end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/backreference.rb000066400000000000000000000013421433525313500271270ustar00rootroot00000000000000module Regexp::Syntax module Token module Backreference Plain = %i[number] NumberRef = %i[number_ref number_rel_ref] Number = Plain + NumberRef Name = %i[name_ref] RecursionLevel = %i[name_recursion_ref number_recursion_ref] V1_8_6 = Plain V1_9_1 = Name + NumberRef + RecursionLevel All = V1_8_6 + V1_9_1 Type = :backref end # Type is the same as Backreference so keeping it here, for now. 
module SubexpressionCall Name = %i[name_call] Number = %i[number_call number_rel_call] All = Name + Number end Map[Backreference::Type] = Backreference::All + SubexpressionCall::All end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/character_set.rb000066400000000000000000000004011433525313500271520ustar00rootroot00000000000000module Regexp::Syntax module Token module CharacterSet Basic = %i[open close negate range] Extended = Basic + %i[intersection] All = Extended Type = :set end Map[CharacterSet::Type] = CharacterSet::All end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/character_type.rb000066400000000000000000000005501433525313500273450ustar00rootroot00000000000000module Regexp::Syntax module Token module CharacterType Basic = [] Extended = %i[digit nondigit space nonspace word nonword] Hex = %i[hex nonhex] Clustered = %i[linebreak xgrapheme] All = Basic + Extended + Hex + Clustered Type = :type end Map[CharacterType::Type] = CharacterType::All end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/conditional.rb000066400000000000000000000005631433525313500266570ustar00rootroot00000000000000module Regexp::Syntax module Token module Conditional Delimiters = %i[open close] Condition = %i[condition_open condition condition_close] Separator = %i[separator] All = Conditional::Delimiters + Conditional::Condition + Conditional::Separator Type = :conditional end Map[Conditional::Type] = Conditional::All end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/escape.rb000066400000000000000000000014331433525313500256110ustar00rootroot00000000000000module Regexp::Syntax module Token # TODO: unify naming with RE::EscapeSequence, one way or the other, in v3.0.0 module Escape Basic = %i[backslash literal] Control = %i[control meta_sequence] ASCII = %i[bell backspace escape form_feed newline carriage tab vertical_tab] Unicode = %i[codepoint codepoint_list] Meta = %i[dot alternation zero_or_one zero_or_more one_or_more 
bol eol group_open group_close interval_open interval_close set_open set_close] Hex = %i[hex] Octal = %i[octal] All = Basic + Control + ASCII + Unicode + Meta + Hex + Octal Type = :escape end Map[Escape::Type] = Escape::All end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/group.rb000066400000000000000000000007671433525313500255160ustar00rootroot00000000000000module Regexp::Syntax module Token module Group Basic = %i[capture close] Extended = Basic + %i[options options_switch] Named = %i[named] Atomic = %i[atomic] Passive = %i[passive] Comment = %i[comment] V1_8_6 = Group::Extended + Group::Named + Group::Atomic + Group::Passive + Group::Comment V2_4_1 = %i[absence] All = V1_8_6 + V2_4_1 Type = :group end Map[Group::Type] = Group::All end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/keep.rb000066400000000000000000000002441433525313500252740ustar00rootroot00000000000000module Regexp::Syntax module Token module Keep Mark = %i[mark] All = Mark Type = :keep end Map[Keep::Type] = Keep::All end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/meta.rb000066400000000000000000000003231433525313500252740ustar00rootroot00000000000000module Regexp::Syntax module Token module Meta Basic = %i[dot] Extended = Basic + %i[alternation] All = Extended Type = :meta end Map[Meta::Type] = Meta::All end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/posix_class.rb000066400000000000000000000006341433525313500267020ustar00rootroot00000000000000module Regexp::Syntax module Token module PosixClass Standard = %i[alnum alpha blank cntrl digit graph lower print punct space upper xdigit] Extensions = %i[ascii word] All = Standard + Extensions Type = :posixclass NonType = :nonposixclass end Map[PosixClass::Type] = PosixClass::All Map[PosixClass::NonType] = PosixClass::All end end 
ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/quantifier.rb000066400000000000000000000014721433525313500265230ustar00rootroot00000000000000module Regexp::Syntax module Token module Quantifier Greedy = %i[ zero_or_one zero_or_more one_or_more ] Reluctant = %i[ zero_or_one_reluctant zero_or_more_reluctant one_or_more_reluctant ] Possessive = %i[ zero_or_one_possessive zero_or_more_possessive one_or_more_possessive ] Interval = %i[interval] IntervalReluctant = %i[interval_reluctant] IntervalPossessive = %i[interval_possessive] IntervalAll = Interval + IntervalReluctant + IntervalPossessive V1_8_6 = Greedy + Reluctant + Interval + IntervalReluctant All = Greedy + Reluctant + Possessive + IntervalAll Type = :quantifier end Map[Quantifier::Type] = Quantifier::All end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/token/unicode_property.rb000066400000000000000000000406121433525313500277450ustar00rootroot00000000000000module Regexp::Syntax module Token module UnicodeProperty all = proc { |name| constants.grep(/#{name}/).flat_map(&method(:const_get)) } CharType_V1_9_0 = %i[alnum alpha ascii blank cntrl digit graph lower print punct space upper word xdigit] CharType_V2_5_0 = %i[xposixpunct] POSIX = %i[any assigned newline] module Category Letter = %i[letter uppercase_letter lowercase_letter titlecase_letter modifier_letter other_letter] Mark = %i[mark nonspacing_mark spacing_mark enclosing_mark] Number = %i[number decimal_number letter_number other_number] Punctuation = %i[punctuation connector_punctuation dash_punctuation open_punctuation close_punctuation initial_punctuation final_punctuation other_punctuation] Symbol = %i[symbol math_symbol currency_symbol modifier_symbol other_symbol] Separator = %i[separator space_separator line_separator paragraph_separator] Codepoint = %i[other control format surrogate private_use unassigned] All = Letter + Mark + Number + Punctuation + Symbol + Separator + Codepoint end Age_V1_9_3 = %i[age=1.1 age=2.0 age=2.1 
age=3.0 age=3.1 age=3.2 age=4.0 age=4.1 age=5.0 age=5.1 age=5.2 age=6.0] Age_V2_0_0 = %i[age=6.1] Age_V2_2_0 = %i[age=6.2 age=6.3 age=7.0] Age_V2_3_0 = %i[age=8.0] Age_V2_4_0 = %i[age=9.0] Age_V2_5_0 = %i[age=10.0] Age_V2_6_0 = %i[age=11.0] Age_V2_6_2 = %i[age=12.0] Age_V2_6_3 = %i[age=12.1] Age_V3_1_0 = %i[age=13.0] Age_V3_2_0 = %i[age=14.0] Age = all[:Age_V] Derived_V1_9_0 = %i[ ascii_hex_digit alphabetic cased changes_when_casefolded changes_when_casemapped changes_when_lowercased changes_when_titlecased changes_when_uppercased case_ignorable bidi_control dash deprecated default_ignorable_code_point diacritic extender grapheme_base grapheme_extend grapheme_link hex_digit hyphen id_continue ideographic id_start ids_binary_operator ids_trinary_operator join_control logical_order_exception lowercase math noncharacter_code_point other_alphabetic other_default_ignorable_code_point other_grapheme_extend other_id_continue other_id_start other_lowercase other_math other_uppercase pattern_syntax pattern_white_space quotation_mark radical sentence_terminal soft_dotted terminal_punctuation unified_ideograph uppercase variation_selector white_space xid_start xid_continue ] Derived_V2_0_0 = %i[ cased_letter combining_mark ] Derived_V2_4_0 = %i[ prepended_concatenation_mark ] Derived_V2_5_0 = %i[ regional_indicator ] Derived = all[:Derived_V] Script_V1_9_0 = %i[ arabic imperial_aramaic armenian avestan balinese bamum bengali bopomofo braille buginese buhid canadian_aboriginal carian cham cherokee coptic cypriot cyrillic devanagari deseret egyptian_hieroglyphs ethiopic georgian glagolitic gothic greek gujarati gurmukhi hangul han hanunoo hebrew hiragana old_italic javanese kayah_li katakana kharoshthi khmer kannada kaithi tai_tham lao latin lepcha limbu linear_b lisu lycian lydian malayalam mongolian meetei_mayek myanmar nko ogham ol_chiki old_turkic oriya osmanya phags_pa inscriptional_pahlavi phoenician inscriptional_parthian rejang runic samaritan old_south_arabian 
saurashtra shavian sinhala sundanese syloti_nagri syriac tagbanwa tai_le new_tai_lue tamil tai_viet telugu tifinagh tagalog thaana thai tibetan ugaritic vai old_persian cuneiform yi inherited common unknown ] Script_V1_9_3 = %i[ brahmi batak mandaic ] Script_V2_0_0 = %i[ chakma meroitic_cursive meroitic_hieroglyphs miao sharada sora_sompeng takri ] Script_V2_2_0 = %i[ caucasian_albanian bassa_vah duployan elbasan grantha pahawh_hmong khojki linear_a mahajani manichaean mende_kikakui modi mro old_north_arabian nabataean palmyrene pau_cin_hau old_permic psalter_pahlavi siddham khudawadi tirhuta warang_citi ] Script_V2_3_0 = %i[ ahom anatolian_hieroglyphs hatran multani old_hungarian signwriting ] Script_V2_4_0 = %i[ adlam bhaiksuki marchen newa osage tangut ] Script_V2_5_0 = %i[ masaram_gondi nushu soyombo zanabazar_square ] Script_V2_6_0 = %i[ dogra gunjala_gondi hanifi_rohingya makasar medefaidrin old_sogdian sogdian ] Script_V2_6_2 = %i[ elymaic nandinagari nyiakeng_puachue_hmong wancho ] Script_V3_1_0 = %i[ chorasmian dives_akuru khitan_small_script yezidi ] Script_V3_2_0 = %i[ cypro_minoan old_uyghur tangsa toto vithkuqi ] Script = all[:Script_V] UnicodeBlock_V1_9_0 = %i[ in_alphabetic_presentation_forms in_arabic in_armenian in_arrows in_basic_latin in_bengali in_block_elements in_bopomofo_extended in_bopomofo in_box_drawing in_braille_patterns in_buhid in_cjk_compatibility_forms in_cjk_compatibility_ideographs in_cjk_compatibility in_cjk_radicals_supplement in_cjk_symbols_and_punctuation in_cjk_unified_ideographs_extension_a in_cjk_unified_ideographs in_cherokee in_combining_diacritical_marks_for_symbols in_combining_diacritical_marks in_combining_half_marks in_control_pictures in_currency_symbols in_cyrillic_supplement in_cyrillic in_devanagari in_dingbats in_enclosed_alphanumerics in_enclosed_cjk_letters_and_months in_ethiopic in_general_punctuation in_geometric_shapes in_georgian in_greek_extended in_greek_and_coptic in_gujarati in_gurmukhi 
in_halfwidth_and_fullwidth_forms in_hangul_compatibility_jamo in_hangul_jamo in_hangul_syllables in_hanunoo in_hebrew in_high_private_use_surrogates in_high_surrogates in_hiragana in_ipa_extensions in_ideographic_description_characters in_kanbun in_kangxi_radicals in_kannada in_katakana_phonetic_extensions in_katakana in_khmer_symbols in_khmer in_lao in_latin_extended_additional in_letterlike_symbols in_limbu in_low_surrogates in_malayalam in_mathematical_operators in_miscellaneous_symbols_and_arrows in_miscellaneous_symbols in_miscellaneous_technical in_mongolian in_myanmar in_number_forms in_ogham in_optical_character_recognition in_oriya in_phonetic_extensions in_private_use_area in_runic in_sinhala in_small_form_variants in_spacing_modifier_letters in_specials in_superscripts_and_subscripts in_supplemental_mathematical_operators in_syriac in_tagalog in_tagbanwa in_tai_le in_tamil in_telugu in_thaana in_thai in_tibetan in_unified_canadian_aboriginal_syllabics in_variation_selectors in_yi_radicals in_yi_syllables in_yijing_hexagram_symbols ] UnicodeBlock_V2_0_0 = %i[ in_aegean_numbers in_alchemical_symbols in_ancient_greek_musical_notation in_ancient_greek_numbers in_ancient_symbols in_arabic_extended_a in_arabic_mathematical_alphabetic_symbols in_arabic_presentation_forms_a in_arabic_presentation_forms_b in_arabic_supplement in_avestan in_balinese in_bamum in_bamum_supplement in_batak in_brahmi in_buginese in_byzantine_musical_symbols in_cjk_compatibility_ideographs_supplement in_cjk_strokes in_cjk_unified_ideographs_extension_b in_cjk_unified_ideographs_extension_c in_cjk_unified_ideographs_extension_d in_carian in_chakma in_cham in_combining_diacritical_marks_supplement in_common_indic_number_forms in_coptic in_counting_rod_numerals in_cuneiform in_cuneiform_numbers_and_punctuation in_cypriot_syllabary in_cyrillic_extended_a in_cyrillic_extended_b in_deseret in_devanagari_extended in_domino_tiles in_egyptian_hieroglyphs in_emoticons 
in_enclosed_alphanumeric_supplement in_enclosed_ideographic_supplement in_ethiopic_extended in_ethiopic_extended_a in_ethiopic_supplement in_georgian_supplement in_glagolitic in_gothic in_hangul_jamo_extended_a in_hangul_jamo_extended_b in_imperial_aramaic in_inscriptional_pahlavi in_inscriptional_parthian in_javanese in_kaithi in_kana_supplement in_kayah_li in_kharoshthi in_latin_1_supplement in_latin_extended_a in_latin_extended_b in_latin_extended_c in_latin_extended_d in_lepcha in_linear_b_ideograms in_linear_b_syllabary in_lisu in_lycian in_lydian in_mahjong_tiles in_mandaic in_mathematical_alphanumeric_symbols in_meetei_mayek in_meetei_mayek_extensions in_meroitic_cursive in_meroitic_hieroglyphs in_miao in_miscellaneous_mathematical_symbols_a in_miscellaneous_mathematical_symbols_b in_miscellaneous_symbols_and_pictographs in_modifier_tone_letters in_musical_symbols in_myanmar_extended_a in_nko in_new_tai_lue in_no_block in_ol_chiki in_old_italic in_old_persian in_old_south_arabian in_old_turkic in_osmanya in_phags_pa in_phaistos_disc in_phoenician in_phonetic_extensions_supplement in_playing_cards in_rejang in_rumi_numeral_symbols in_samaritan in_saurashtra in_sharada in_shavian in_sora_sompeng in_sundanese in_sundanese_supplement in_supplemental_arrows_a in_supplemental_arrows_b in_supplemental_punctuation in_supplementary_private_use_area_a in_supplementary_private_use_area_b in_syloti_nagri in_tags in_tai_tham in_tai_viet in_tai_xuan_jing_symbols in_takri in_tifinagh in_transport_and_map_symbols in_ugaritic in_unified_canadian_aboriginal_syllabics_extended in_vai in_variation_selectors_supplement in_vedic_extensions in_vertical_forms ] UnicodeBlock_V2_2_0 = %i[ in_bassa_vah in_caucasian_albanian in_combining_diacritical_marks_extended in_coptic_epact_numbers in_duployan in_elbasan in_geometric_shapes_extended in_grantha in_khojki in_khudawadi in_latin_extended_e in_linear_a in_mahajani in_manichaean in_mende_kikakui in_modi in_mro in_myanmar_extended_b 
in_nabataean in_old_north_arabian in_old_permic in_ornamental_dingbats in_pahawh_hmong in_palmyrene in_pau_cin_hau in_psalter_pahlavi in_shorthand_format_controls in_siddham in_sinhala_archaic_numbers in_supplemental_arrows_c in_tirhuta in_warang_citi ] UnicodeBlock_V2_3_0 = %i[ in_ahom in_anatolian_hieroglyphs in_cjk_unified_ideographs_extension_e in_cherokee_supplement in_early_dynastic_cuneiform in_hatran in_multani in_old_hungarian in_supplemental_symbols_and_pictographs in_sutton_signwriting ] UnicodeBlock_V2_4_0 = %i[ in_adlam in_bhaiksuki in_cyrillic_extended_c in_glagolitic_supplement in_ideographic_symbols_and_punctuation in_marchen in_mongolian_supplement in_newa in_osage in_tangut in_tangut_components ] UnicodeBlock_V2_5_0 = %i[ in_cjk_unified_ideographs_extension_f in_kana_extended_a in_masaram_gondi in_nushu in_soyombo in_syriac_supplement in_zanabazar_square ] UnicodeBlock_V2_6_0 = %i[ in_chess_symbols in_dogra in_georgian_extended in_gunjala_gondi in_hanifi_rohingya in_indic_siyaq_numbers in_makasar in_mayan_numerals in_medefaidrin in_old_sogdian in_sogdian ] UnicodeBlock_V2_6_2 = %i[ in_egyptian_hieroglyph_format_controls in_elymaic in_nandinagari in_nyiakeng_puachue_hmong in_ottoman_siyaq_numbers in_small_kana_extension in_symbols_and_pictographs_extended_a in_tamil_supplement in_wancho ] UnicodeBlock_V3_1_0 = %i[ in_chorasmian in_cjk_unified_ideographs_extension_g in_dives_akuru in_khitan_small_script in_lisu_supplement in_symbols_for_legacy_computing in_tangut_supplement in_yezidi ] UnicodeBlock_V3_2_0 = %i[ in_arabic_extended_b in_cypro_minoan in_ethiopic_extended_b in_kana_extended_b in_latin_extended_f in_latin_extended_g in_old_uyghur in_tangsa in_toto in_unified_canadian_aboriginal_syllabics_extended_a in_vithkuqi in_znamenny_musical_notation ] UnicodeBlock = all[:UnicodeBlock_V] Emoji_V2_5_0 = %i[ emoji emoji_component emoji_modifier emoji_modifier_base emoji_presentation ] Emoji = all[:Emoji_V] V1_9_0 = Category::All + POSIX + all[:V1_9_0] 
V1_9_3 = all[:V1_9_3] V2_0_0 = all[:V2_0_0] V2_2_0 = all[:V2_2_0] V2_3_0 = all[:V2_3_0] V2_4_0 = all[:V2_4_0] V2_5_0 = all[:V2_5_0] V2_6_0 = all[:V2_6_0] V2_6_2 = all[:V2_6_2] V2_6_3 = all[:V2_6_3] V3_1_0 = all[:V3_1_0] V3_2_0 = all[:V3_2_0] All = all[/^V\d+_\d+_\d+$/] Type = :property NonType = :nonproperty end Map[UnicodeProperty::Type] = UnicodeProperty::All Map[UnicodeProperty::NonType] = UnicodeProperty::All end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/version_lookup.rb000066400000000000000000000042671433525313500263170ustar00rootroot00000000000000module Regexp::Syntax VERSION_FORMAT = '\Aruby/\d+\.\d+(\.\d+)?\z' VERSION_REGEXP = /#{VERSION_FORMAT}/ VERSION_CONST_REGEXP = /\AV\d+_\d+(?:_\d+)?\z/ class InvalidVersionNameError < Regexp::Syntax::SyntaxError def initialize(name) super "Invalid version name '#{name}'. Expected format is '#{VERSION_FORMAT}'" end end class UnknownSyntaxNameError < Regexp::Syntax::SyntaxError def initialize(name) super "Unknown syntax name '#{name}'." end end module_function # Returns the syntax specification class for the given syntax # version name. The special names 'any' and '*' return Syntax::Any. def for(name) (@alias_map ||= {})[name] ||= version_class(name) end def new(name) warn 'Regexp::Syntax.new is deprecated in favor of Regexp::Syntax.for. '\ 'It does not return distinct instances and will be removed in v3.0.0.' 
self.for(name) end def supported?(name) name =~ VERSION_REGEXP && comparable(name) >= comparable('1.8.6') end def version_class(version) return Regexp::Syntax::Any if ['*', 'any'].include?(version.to_s) version =~ VERSION_REGEXP || raise(InvalidVersionNameError, version) warn_if_future_version(version) version_const_name = "V#{version.to_s.scan(/\d+/).join('_')}" const_get(version_const_name) || raise(UnknownSyntaxNameError, version) end def const_missing(const_name) if const_name =~ VERSION_CONST_REGEXP return fallback_version_class(const_name) end super end def fallback_version_class(version) sorted = (specified_versions + [version]).sort_by { |ver| comparable(ver) } index = sorted.index(version) index > 0 && const_get(sorted[index - 1]) end def specified_versions constants.select { |const_name| const_name =~ VERSION_CONST_REGEXP } end def comparable(name) # add .99 to treat versions without a patch value as latest patch version Gem::Version.new((name.to_s.scan(/\d+/) << 99).join('.')) end def warn_if_future_version(const_name) return if comparable(const_name) < comparable('4.0.0') warn('This library has only been tested up to Ruby 3.x, '\ "but you are running with #{const_name}") end end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions.rb000066400000000000000000000004321433525313500250770ustar00rootroot00000000000000# Ruby 1.x is no longer a supported runtime, # but its regex features are still recognized. # # Aliases for the latest patch version are provided as 'ruby/n.n', # e.g. 'ruby/1.9' refers to Ruby v1.9.3. 
Dir[File.expand_path('../versions/*.rb', __FILE__)].sort.each { |f| require f } ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/000077500000000000000000000000001433525313500245535ustar00rootroot00000000000000ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/1.8.6.rb000066400000000000000000000011611433525313500255510ustar00rootroot00000000000000class Regexp::Syntax::V1_8_6 < Regexp::Syntax::Base implements :anchor, Anchor::All implements :assertion, Assertion::Lookahead implements :backref, Backreference::V1_8_6 implements :escape, Escape::Basic + Escape::ASCII + Escape::Meta + Escape::Control implements :free_space, FreeSpace::All implements :group, Group::V1_8_6 implements :literal, Literal::All implements :meta, Meta::Extended implements :posixclass, PosixClass::Standard implements :quantifier, Quantifier::V1_8_6 implements :set, CharacterSet::All implements :type, CharacterType::Extended end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/1.9.1.rb000066400000000000000000000011241433525313500255440ustar00rootroot00000000000000class Regexp::Syntax::V1_9_1 < Regexp::Syntax::V1_8_6 implements :assertion, Assertion::Lookbehind implements :backref, Backreference::V1_9_1 + SubexpressionCall::All implements :escape, Escape::Unicode + Escape::Hex + Escape::Octal implements :posixclass, PosixClass::Extensions implements :nonposixclass, PosixClass::All implements :property, UnicodeProperty::V1_9_0 implements :nonproperty, UnicodeProperty::V1_9_0 implements :quantifier, Quantifier::Possessive + Quantifier::IntervalPossessive implements :type, CharacterType::Hex end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/1.9.3.rb000066400000000000000000000002401433525313500255440ustar00rootroot00000000000000class Regexp::Syntax::V1_9_3 < Regexp::Syntax::V1_9_1 implements :property, UnicodeProperty::V1_9_3 implements :nonproperty, UnicodeProperty::V1_9_3 end 
ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/2.0.0.rb000066400000000000000000000005641433525313500255420ustar00rootroot00000000000000class Regexp::Syntax::V2_0_0 < Regexp::Syntax::V1_9_3 implements :keep, Keep::All implements :conditional, Conditional::All implements :property, UnicodeProperty::V2_0_0 implements :nonproperty, UnicodeProperty::V2_0_0 implements :type, CharacterType::Clustered excludes :property, %i[newline] excludes :nonproperty, %i[newline] end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/2.2.0.rb000066400000000000000000000002401433525313500255330ustar00rootroot00000000000000class Regexp::Syntax::V2_2_0 < Regexp::Syntax::V2_0_0 implements :property, UnicodeProperty::V2_2_0 implements :nonproperty, UnicodeProperty::V2_2_0 end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/2.3.0.rb000066400000000000000000000002401433525313500255340ustar00rootroot00000000000000class Regexp::Syntax::V2_3_0 < Regexp::Syntax::V2_2_0 implements :property, UnicodeProperty::V2_3_0 implements :nonproperty, UnicodeProperty::V2_3_0 end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/2.4.0.rb000066400000000000000000000002401433525313500255350ustar00rootroot00000000000000class Regexp::Syntax::V2_4_0 < Regexp::Syntax::V2_3_0 implements :property, UnicodeProperty::V2_4_0 implements :nonproperty, UnicodeProperty::V2_4_0 end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/2.4.1.rb000066400000000000000000000001351433525313500255410ustar00rootroot00000000000000class Regexp::Syntax::V2_4_1 < Regexp::Syntax::V2_4_0 implements :group, Group::V2_4_1 end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/2.5.0.rb000066400000000000000000000002401433525313500255360ustar00rootroot00000000000000class Regexp::Syntax::V2_5_0 < Regexp::Syntax::V2_4_1 implements :property, UnicodeProperty::V2_5_0 implements :nonproperty, UnicodeProperty::V2_5_0 end 
ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/2.6.0.rb000066400000000000000000000002401433525313500255370ustar00rootroot00000000000000class Regexp::Syntax::V2_6_0 < Regexp::Syntax::V2_5_0 implements :property, UnicodeProperty::V2_6_0 implements :nonproperty, UnicodeProperty::V2_6_0 end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/2.6.2.rb000066400000000000000000000002401433525313500255410ustar00rootroot00000000000000class Regexp::Syntax::V2_6_2 < Regexp::Syntax::V2_6_0 implements :property, UnicodeProperty::V2_6_2 implements :nonproperty, UnicodeProperty::V2_6_2 end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/2.6.3.rb000066400000000000000000000002401433525313500255420ustar00rootroot00000000000000class Regexp::Syntax::V2_6_3 < Regexp::Syntax::V2_6_2 implements :property, UnicodeProperty::V2_6_3 implements :nonproperty, UnicodeProperty::V2_6_3 end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/3.1.0.rb000066400000000000000000000002401433525313500255330ustar00rootroot00000000000000class Regexp::Syntax::V3_1_0 < Regexp::Syntax::V2_6_3 implements :property, UnicodeProperty::V3_1_0 implements :nonproperty, UnicodeProperty::V3_1_0 end ammar-regexp_parser-0494e56/lib/regexp_parser/syntax/versions/3.2.0.rb000066400000000000000000000002401433525313500255340ustar00rootroot00000000000000class Regexp::Syntax::V3_2_0 < Regexp::Syntax::V3_1_0 implements :property, UnicodeProperty::V3_2_0 implements :nonproperty, UnicodeProperty::V3_2_0 end ammar-regexp_parser-0494e56/lib/regexp_parser/token.rb000066400000000000000000000004431433525313500230230ustar00rootroot00000000000000class Regexp TOKEN_KEYS = %i[ type token text ts te level set_level conditional_level ].freeze Token = Struct.new(*TOKEN_KEYS) do attr_accessor :previous, :next def offset [ts, te] end def length te - ts end end end 
ammar-regexp_parser-0494e56/lib/regexp_parser/version.rb000066400000000000000000000000741433525313500233700ustar00rootroot00000000000000class Regexp class Parser VERSION = '2.6.1' end end ammar-regexp_parser-0494e56/regexp_parser.gemspec000066400000000000000000000022771433525313500221640ustar00rootroot00000000000000$:.unshift File.join(File.dirname(__FILE__), 'lib') require 'regexp_parser/version' Gem::Specification.new do |spec| spec.name = 'regexp_parser' spec.version = ::Regexp::Parser::VERSION spec.summary = "Scanner, lexer, parser for ruby's regular expressions" spec.description = 'A library for tokenizing, lexing, and parsing Ruby regular expressions.' spec.homepage = 'https://github.com/ammar/regexp_parser' spec.metadata['bug_tracker_uri'] = "#{spec.homepage}/issues" spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/master/CHANGELOG.md" spec.metadata['homepage_uri'] = spec.homepage spec.metadata['source_code_uri'] = spec.homepage spec.metadata['wiki_uri'] = "#{spec.homepage}/wiki" spec.authors = ['Ammar Ali'] spec.email = ['ammarabuali@gmail.com'] spec.license = 'MIT' spec.require_paths = ['lib'] spec.files = Dir.glob('lib/**/*.{csv,rb,rl}') + %w[Gemfile Rakefile LICENSE README.md CHANGELOG.md regexp_parser.gemspec] spec.rdoc_options = ["--inline-source", "--charset=UTF-8"] spec.platform = Gem::Platform::RUBY spec.required_ruby_version = '>= 2.0.0' end ammar-regexp_parser-0494e56/spec/000077500000000000000000000000001433525313500166735ustar00rootroot00000000000000ammar-regexp_parser-0494e56/spec/expression/000077500000000000000000000000001433525313500210725ustar00rootroot00000000000000ammar-regexp_parser-0494e56/spec/expression/base_spec.rb000066400000000000000000000103041433525313500233410ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Expression::Base) do # test #level include_examples 'parse', /^a(b(c(d)))e$/, [0] => [to_s: '^', level: 0], [1] => [to_s: 'a', level: 0], [2] => [to_s: '(b(c(d)))', level: 0], [2, 0] => [to_s: 
'b', level: 1], [2, 1] => [to_s: '(c(d))', level: 1], [2, 1, 0] => [to_s: 'c', level: 2], [2, 1, 1] => [to_s: '(d)', level: 2], [2, 1, 1, 0] => [to_s: 'd', level: 3], [3] => [to_s: 'e', level: 0], [4] => [to_s: '$', level: 0] # test #terminal? include_examples 'parse', /^a([b]+)c$/, [] => [Root, terminal?: false], [0] => [to_s: '^', terminal?: true], [1] => [to_s: 'a', terminal?: true], [2] => [to_s: '([b]+)', terminal?: false], [2, 0] => [to_s: '[b]+', terminal?: false], [2, 0, 0] => [to_s: 'b', terminal?: true], [3] => [to_s: 'c', terminal?: true], [4] => [to_s: '$', terminal?: true] include_examples 'parse', /^(ab|cd)$/, [] => [Root, terminal?: false], [0] => [:bol, to_s: '^', terminal?: true], [1] => [:capture, to_s: '(ab|cd)', terminal?: false], [1, 0] => [:alternation, to_s: 'ab|cd', terminal?: false], [1, 0, 0] => [:sequence, to_s: 'ab', terminal?: false], [1, 0, 0, 0] => [:literal, to_s: 'ab', terminal?: true], [1, 0, 1] => [:sequence, to_s: 'cd', terminal?: false], [1, 0, 1, 0] => [:literal, to_s: 'cd', terminal?: true], [2] => [:eol, to_s: '$', terminal?: true] # test #coded_offset include_examples 'parse', /^a*(b+(c?))$/, [] => [Root, coded_offset: '@0+12'], [0] => [to_s: '^', coded_offset: '@0+1'], [1] => [to_s: 'a*', coded_offset: '@1+2'], [2] => [to_s: '(b+(c?))', coded_offset: '@3+8'], [2, 0] => [to_s: 'b+', coded_offset: '@4+2'], [2, 1] => [to_s: '(c?)', coded_offset: '@6+4'], [2, 1, 0] => [to_s: 'c?', coded_offset: '@7+2'], [3] => [to_s: '$', coded_offset: '@11+1'] # test #quantity include_examples 'parse', /aa/, [0] => [quantity: [nil, nil]] include_examples 'parse', /a?/, [0] => [quantity: [0, 1]] include_examples 'parse', /a*/, [0] => [quantity: [0, -1]] include_examples 'parse', /a+/, [0] => [quantity: [1, -1]] # test #repetitions include_examples 'parse', /aa/, [0] => [repetitions: 1..1] include_examples 'parse', /a?/, [0] => [repetitions: 0..1] include_examples 'parse', /a*/, [0] => [repetitions: 0..(Float::INFINITY)] include_examples 
'parse', /a+/, [0] => [repetitions: 1..(Float::INFINITY)] # test #optional? include_examples 'parse', /a?/, [0] => [optional?: true] include_examples 'parse', /a*/, [0] => [optional?: true] include_examples 'parse', /a{,5}/, [0] => [optional?: true] include_examples 'parse', /a{0,5}/, [0] => [optional?: true] include_examples 'parse', /a/, [0] => [optional?: false] include_examples 'parse', /a+/, [0] => [optional?: false] include_examples 'parse', /a{1}/, [0] => [optional?: false] include_examples 'parse', /a{1,5}/, [0] => [optional?: false] # test #base_length, #full_length include_examples 'parse', /(aa)/, [0] => [base_length: 4] include_examples 'parse', /(aa)/, [0] => [full_length: 4] include_examples 'parse', /(aa){42}/, [0] => [base_length: 4] include_examples 'parse', /(aa){42}/, [0] => [full_length: 8] # test #to_re include_examples 'parse', '^a*(b([cde]+))+f?$', [] => [Root, to_re: /^a*(b([cde]+))+f?$/] specify '#to_re warns when used on set members' do expect do result = Regexp::Parser.parse(/[\b]/)[0][0].to_re expect(result).to eq(/\b/) end.to output(/set member/).to_stderr end specify 'updating #quantifier updates #repetitions' do exp = Regexp::Parser.parse(/a{3}/)[0] expect(exp.repetitions).to eq 3..3 exp.quantifier = Regexp::Parser.parse(/b{5}/)[0].quantifier expect(exp.repetitions).to eq 5..5 end end ammar-regexp_parser-0494e56/spec/expression/clone_spec.rb000066400000000000000000000112151433525313500235310ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Expression::Base#clone') do specify('Base#clone') do root = RP.parse(/^(?i:a)b+$/i) copy = root.clone expect(copy.to_s).to eq root.to_s expect(root.object_id).not_to eq copy.object_id expect(root.text).to eq copy.text expect(root.text.object_id).not_to eq copy.text.object_id root_1 = root[1] copy_1 = copy[1] expect(root_1.options).to eq copy_1.options expect(root_1.options.object_id).not_to eq copy_1.options.object_id root_2 = root[2] copy_2 = copy[2] expect(root_2).to be_quantified 
expect(copy_2).to be_quantified expect(root_2.quantifier.text).to eq copy_2.quantifier.text expect(root_2.quantifier.text.object_id).not_to eq copy_2.quantifier.text.object_id expect(root_2.quantifier.object_id).not_to eq copy_2.quantifier.object_id # regression test expect { root_2.clone }.not_to(change { root_2.quantifier.object_id }) expect { root_2.clone }.not_to(change { root_2.quantifier.text.object_id }) end specify('Subexpression#clone') do root = RP.parse(/^a(b([cde])f)g$/) copy = root.clone expect(copy.to_s).to eq root.to_s expect(root).to respond_to(:expressions) expect(copy).to respond_to(:expressions) expect(root.expressions.object_id).not_to eq copy.expressions.object_id copy.expressions.each_with_index do |exp, index| expect(root[index].object_id).not_to eq exp.object_id end copy[2].each_with_index do |exp, index| expect(root[2][index].object_id).not_to eq exp.object_id end # regression test expect { root.clone }.not_to(change { root.expressions.object_id }) end specify('Group::Named#clone') do root = RP.parse('^(?a)+bc$') copy = root.clone expect(copy.to_s).to eq root.to_s root_1 = root[1] copy_1 = copy[1] expect(root_1.name).to eq copy_1.name expect(root_1.name.object_id).not_to eq copy_1.name.object_id expect(root_1.text).to eq copy_1.text expect(root_1.expressions.object_id).not_to eq copy_1.expressions.object_id copy_1.expressions.each_with_index do |exp, index| expect(root_1[index].object_id).not_to eq exp.object_id end # regression test expect { root_1.clone }.not_to(change { root_1.name.object_id }) end specify('Group::Options#clone') do root = RP.parse('foo(?i)bar') copy = root.clone expect(copy.to_s).to eq root.to_s root_1 = root[1] copy_1 = copy[1] expect(root_1.option_changes).to eq copy_1.option_changes expect(root_1.option_changes.object_id).not_to eq copy_1.option_changes.object_id # regression test expect { root_1.clone }.not_to(change { root_1.option_changes.object_id }) end specify('Backreference::Base#clone') do root = 
RP.parse('(foo)\1') copy = root.clone expect(copy.to_s).to eq root.to_s root_1 = root[1] copy_1 = copy[1] expect(root_1.referenced_expression).to eq copy_1.referenced_expression expect(root_1.referenced_expression.to_s).to eq copy_1.referenced_expression.to_s expect(root_1.referenced_expression.object_id).not_to eq copy_1.referenced_expression.object_id # regression test expect { root_1.clone }.not_to(change { root_1.referenced_expression.object_id }) end specify('Sequence#clone') do root = RP.parse(/(a|b)/) copy = root.clone # regression test expect(copy.to_s).to eq root.to_s root_seq_op = root[0][0] copy_seq_op = copy[0][0] root_seq_1 = root[0][0][0] copy_seq_1 = copy[0][0][0] expect(root_seq_op.object_id).not_to eq copy_seq_op.object_id expect(root_seq_1.object_id).not_to eq copy_seq_1.object_id copy_seq_1.expressions.each_with_index do |exp, index| expect(root_seq_1[index].object_id).not_to eq exp.object_id end end describe('Base#unquantified_clone') do it 'produces a clone' do root = RP.parse(/^a(b([cde])f)g$/) copy = root.unquantified_clone expect(copy.to_s).to eq root.to_s expect(copy.object_id).not_to eq root.object_id end it 'does not carry over the callee quantifier' do expect(RP.parse(/a{3}/)[0]).to be_quantified expect(RP.parse(/a{3}/)[0].unquantified_clone).not_to be_quantified expect(RP.parse(/[a]{3}/)[0]).to be_quantified expect(RP.parse(/[a]{3}/)[0].unquantified_clone).not_to be_quantified expect(RP.parse(/(a|b){3}/)[0]).to be_quantified expect(RP.parse(/(a|b){3}/)[0].unquantified_clone).not_to be_quantified end it 'keeps quantifiers of callee children' do expect(RP.parse(/(a{3}){3}/)[0][0]).to be_quantified expect(RP.parse(/(a{3}){3}/)[0].unquantified_clone[0]).to be_quantified end end end ammar-regexp_parser-0494e56/spec/expression/conditional_spec.rb000066400000000000000000000016231433525313500247360ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Expression::Conditional) do specify('Conditional#condition, #branches') do 
conditional = RP.parse(/(?a)(?()T|F)/)[1] expect(conditional.condition).to eq conditional[0] expect(conditional.branches).to eq conditional[1..2] end specify('Condition#referenced_expression') do root = RP.parse(/(?a)(?()T|F)/) condition = root[1].condition expect(condition.referenced_expression).to eq root[0] expect(condition.referenced_expression.to_s).to eq '(?a)' root = RP.parse(/(a)(?(1)T|F)/) condition = root[1].condition expect(condition.referenced_expression).to eq root[0] expect(condition.referenced_expression.to_s).to eq '(a)' end specify('parse conditional excessive branches') do regexp = '(?a)(?()T|F|X)' expect { RP.parse(regexp) }.to raise_error(Conditional::TooManyBranches) end end ammar-regexp_parser-0494e56/spec/expression/free_space_spec.rb000066400000000000000000000012101433525313500245170ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Expression::FreeSpace) do specify('white space quantify raises error') do regexp = / a # Comment /x root = RP.parse(regexp) space = root[0] expect(space).to be_instance_of(FreeSpace::WhiteSpace) expect { space.quantify(:dummy, '#') }.to raise_error(Regexp::Parser::Error) end specify('comment quantify raises error') do regexp = / a # Comment /x root = RP.parse(regexp) comment = root[3] expect(comment).to be_instance_of(FreeSpace::Comment) expect { comment.quantify(:dummy, '#') }.to raise_error(Regexp::Parser::Error) end end ammar-regexp_parser-0494e56/spec/expression/methods/000077500000000000000000000000001433525313500225355ustar00rootroot00000000000000ammar-regexp_parser-0494e56/spec/expression/methods/construct_spec.rb000066400000000000000000000043471433525313500261300ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Expression::Shared) do describe '::construct' do { Alternation => :meta, Alternative => :expression, Anchor::Base => :anchor, Anchor::EndOfLine => :anchor, Assertion::Base => :assertion, Assertion::Lookahead => :assertion, Backreference::Base => 
:backref, Backreference::Number => :backref, CharacterSet => :set, CharacterSet::IntersectedSequence => :expression, CharacterSet::Intersection => :set, CharacterSet::Range => :set, CharacterType::Any => :meta, CharacterType::Base => :type, CharacterType::Digit => :type, Conditional::Branch => :expression, Conditional::Condition => :conditional, Conditional::Expression => :conditional, EscapeSequence::Base => :escape, EscapeSequence::Literal => :escape, FreeSpace => :free_space, Group::Base => :group, Group::Capture => :group, Keep::Mark => :keep, Literal => :literal, PosixClass => :posixclass, Quantifier => :quantifier, Root => :expression, UnicodeProperty::Base => :property, UnicodeProperty::Number::Decimal => :property, }.each do |klass, expected_type| it "works for #{klass}" do result = klass.construct expect(result).to be_a klass expect(result.type).to eq expected_type end end it 'allows overriding defaults' do expect(Literal.construct(type: :foo).type).to eq :foo end it 'allows passing options' do expect(Literal.construct(options: { i: true }).options[:i]).to eq true end it 'raises ArgumentError for unknown parameters' do expect { Literal.construct(foo: :foo) }.to raise_error(ArgumentError) end end end ammar-regexp_parser-0494e56/spec/expression/methods/human_name_spec.rb000066400000000000000000000042361433525313500262110ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Regexp::Expression::Shared#human_name') do include_examples 'parse', //, [] => [human_name: 'root'] include_examples 'parse', /a/, [0] => [human_name: 'literal'] include_examples 'parse', /./, [0] => [human_name: 'match-all'] include_examples 'parse', /[abc]/, [0] => [human_name: 'character set'] include_examples 'parse', /[a-c]/, [0, 0] => [human_name: 'character range'] include_examples 'parse', /\d/, [0] => [human_name: 'digit type'] include_examples 'parse', /\n/, [0] => [human_name: 'newline escape'] include_examples 'parse', /\u{61 62 63}/, [0] => [human_name: 'codepoint 
list escape'] include_examples 'parse', /\p{ascii}/, [0] => [human_name: 'ascii property'] include_examples 'parse', /[[:ascii:]]/, [0, 0] => [human_name: 'ascii posixclass'] include_examples 'parse', /a{5}/, [0, :q] => [human_name: 'interval quantifier'] include_examples 'parse', /^/, [0] => [human_name: 'beginning of line'] include_examples 'parse', /(?=abc)/, [0] => [human_name: 'lookahead'] include_examples 'parse', /(a)(b)/, [0] => [human_name: 'capture group 1'] include_examples 'parse', /(a)(b)/, [1] => [human_name: 'capture group 2'] include_examples 'parse', /(?abc)/, [0] => [human_name: 'named capture group'] include_examples 'parse', / /x, [0] => [human_name: 'free space'] include_examples 'parse', /#comment /x, [0] => [human_name: 'comment'] include_examples 'parse', /(?#comment)/x, [0] => [human_name: 'comment group'] include_examples 'parse', /(abc)\1/, [1] => [human_name: 'backreference'] include_examples 'parse', /(?)\k/, [1] => [human_name: 'backreference by name'] include_examples 'parse', /(abc)\g<-1>/, [1] => [human_name: 'relative subexpression call'] include_examples 'parse', /a|bc/, [0] => [human_name: 'alternation'] include_examples 'parse', /a|bc/, [0, 0] => [human_name: 'alternative'] end ammar-regexp_parser-0494e56/spec/expression/methods/match_length_spec.rb000066400000000000000000000146461433525313500265440ustar00rootroot00000000000000require 'spec_helper' ML = Regexp::MatchLength RSpec.describe(Regexp::MatchLength) do specify('literal') { expect(ML.of(/a/).minmax).to eq [1, 1] } specify('literal sequence') { expect(ML.of(/abc/).minmax).to eq [3, 3] } specify('dot') { expect(ML.of(/./).minmax).to eq [1, 1] } specify('set') { expect(ML.of(/[abc]/).minmax).to eq [1, 1] } specify('type') { expect(ML.of(/\d/).minmax).to eq [1, 1] } specify('escape') { expect(ML.of(/\n/).minmax).to eq [1, 1] } specify('property') { expect(ML.of(/\p{ascii}/).minmax).to eq [1, 1] } specify('codepoint list') { expect(ML.of(/\u{61 62 63}/).minmax).to eq [3, 3] } 
specify('multi-char literal') { expect(ML.of(/abc/).minmax).to eq [3, 3] } specify('fixed quantified') { expect(ML.of(/a{5}/).minmax).to eq [5, 5] } specify('range quantified') { expect(ML.of(/a{5,9}/).minmax).to eq [5, 9] } specify('nested quantified') { expect(ML.of(/(a{2}){3,4}/).minmax).to eq [6, 8] } specify('open-end quantified') { expect(ML.of(/a*/).minmax).to eq [0, Float::INFINITY] } specify('empty subexpression') { expect(ML.of(//).minmax).to eq [0, 0] } specify('anchor') { expect(ML.of(/^$/).minmax).to eq [0, 0] } specify('lookaround') { expect(ML.of(/(?=abc)/).minmax).to eq [0, 0] } specify('free space') { expect(ML.of(/ /x).minmax).to eq [0, 0] } specify('comment') { expect(ML.of(/(?#comment)/x).minmax).to eq [0, 0] } specify('backreference') { expect(ML.of(/(abc){2}\1/).minmax).to eq [9, 9] } specify('subexp call') { expect(ML.of(/(abc){2}\g<-1>/).minmax).to eq [9, 9] } specify('alternation') { expect(ML.of(/a|bcde/).minmax).to eq [1, 4] } specify('nested alternation') { expect(ML.of(/a|bc(d|efg)/).minmax).to eq [1, 5] } specify('quantified alternation') { expect(ML.of(/a|bcde?/).minmax).to eq [1, 4] } if ruby_version_at_least('2.4.1') specify('absence group') { expect(ML.of('(?~abc)').minmax).to eq [0, Float::INFINITY] } end specify('raises for missing references') do exp = RP.parse(/(a)\1/).last exp.referenced_expression = nil expect { exp.match_length }.to raise_error(ArgumentError) end describe('::of') do it('works with Regexps') { expect(ML.of(/foo/).minmax).to eq [3, 3] } it('works with Strings') { expect(ML.of('foo').minmax).to eq [3, 3] } it('works with Expressions') { expect(ML.of(RP.parse(/foo/)).minmax).to eq [3, 3] } end describe('Expression::Base#match_length') do it('returns the MatchLength') { expect(RP.parse(/abc/).match_length.minmax).to eq [3, 3] } end describe('Expression::Base#inner_match_length') do it 'returns the MatchLength of an expression that does not count towards parent match_length' do exp = RP.parse(/(?=ab|cdef)/)[0] 
expect(exp).to be_a Regexp::Expression::Assertion::Base expect(exp.match_length.minmax).to eq [0, 0] expect(exp.inner_match_length.minmax).to eq [2, 4] end end describe('#include?') do specify('unquantified') do expect(ML.of(/a/)).to include 1 expect(ML.of(/a/)).not_to include 0 expect(ML.of(/a/)).not_to include 2 end specify('fixed quantified') do expect(ML.of(/a{5}/)).to include 5 expect(ML.of(/a{5}/)).not_to include 0 expect(ML.of(/a{5}/)).not_to include 4 expect(ML.of(/a{5}/)).not_to include 6 end specify('variably quantified') do expect(ML.of(/a?/)).to include 0 expect(ML.of(/a?/)).to include 1 expect(ML.of(/a?/)).not_to include 2 end specify('nested quantified') do expect(ML.of(/(a{2}){3,4}/)).to include 6 expect(ML.of(/(a{2}){3,4}/)).to include 8 expect(ML.of(/(a{2}){3,4}/)).not_to include 0 expect(ML.of(/(a{2}){3,4}/)).not_to include 5 expect(ML.of(/(a{2}){3,4}/)).not_to include 7 expect(ML.of(/(a{2}){3,4}/)).not_to include 9 end specify('branches') do expect(ML.of(/ab|cdef/)).to include 2 expect(ML.of(/ab|cdef/)).to include 4 expect(ML.of(/ab|cdef/)).not_to include 0 expect(ML.of(/ab|cdef/)).not_to include 3 expect(ML.of(/ab|cdef/)).not_to include 5 end specify('called on leaf node') do expect(ML.of(RP.parse(/a{2}/)[0])).to include 2 expect(ML.of(RP.parse(/a{2}/)[0])).not_to include 0 expect(ML.of(RP.parse(/a{2}/)[0])).not_to include 1 expect(ML.of(RP.parse(/a{2}/)[0])).not_to include 3 end end describe('#fixed?') do specify('unquantified') { expect(ML.of(/a/)).to be_fixed } specify('fixed quantified') { expect(ML.of(/a{5}/)).to be_fixed } specify('variably quantified') { expect(ML.of(/a?/)).not_to be_fixed } specify('equal branches') { expect(ML.of(/ab|cd/)).to be_fixed } specify('unequal branches') { expect(ML.of(/ab|cdef/)).not_to be_fixed } specify('equal quantified branches') { expect(ML.of(/a{2}|cd/)).to be_fixed } specify('unequal quantified branches') { expect(ML.of(/a{3}|cd/)).not_to be_fixed } specify('empty') { expect(ML.of(//)).to be_fixed } 
end describe('#each') do it 'returns an Enumerator if called without a block' do result = ML.of(/a?/).each expect(result).to be_a(Enumerator) expect(result.next).to eq 0 expect(result.next).to eq 1 expect { result.next }.to raise_error(StopIteration) end it 'is aware of limit option even if called without a block' do result = ML.of(/a?/).each(limit: 1) expect(result).to be_a(Enumerator) expect(result.next).to eq 0 expect { result.next }.to raise_error(StopIteration) end it 'is limited to 1000 iterations in case there are infinite match lengths' do expect(ML.of(/a*/).first(3000).size).to eq 1000 end it 'scaffolds the Enumerable interface' do expect(ML.of(/abc|defg/).count).to eq 2 expect(ML.of(/(ab)*/).first(5)).to eq [0, 2, 4, 6, 8] expect(ML.of(/a{,10}/).any? { |len| len > 20 }).to be false end end describe('#endless_each') do it 'returns an Enumerator if called without a block' do result = ML.of(/a?/).endless_each expect(result).to be_a(Enumerator) expect(result.next).to eq 0 expect(result.next).to eq 1 expect { result.next }.to raise_error(StopIteration) end it 'never stops iterating for infinite match lengths' do expect(ML.of(/a*/).endless_each.first(3000).size).to eq 3000 end end describe('#inspect') do it 'is nice' do result = RP.parse(/a{2,4}/)[0].match_length expect(result.inspect).to eq '# min=2 max=4>' end end end ammar-regexp_parser-0494e56/spec/expression/methods/match_spec.rb000066400000000000000000000012511433525313500251670ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Expression::Base#match') do it 'returns the #match result of the respective Regexp' do expect(RP.parse(/a/).match('a')[0]).to eq 'a' end it 'can be given an offset, just like Regexp#match' do expect(RP.parse(/./).match('ab', 1)[0]).to eq 'b' end it 'works with the #=~ alias' do expect(RP.parse(/a/) =~ 'a').to be_a MatchData end end RSpec.describe('Expression::Base#match?') do it 'returns true if the Respective Regexp matches' do expect(RP.parse(/a/).match?('a')).to 
be true end it 'returns false if the Respective Regexp does not match' do expect(RP.parse(/a/).match?('b')).to be false end end ammar-regexp_parser-0494e56/spec/expression/methods/strfregexp_spec.rb000066400000000000000000000145621433525313500262750ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Expression::Base#strfregexp') do specify('#strfre alias') do expect(RP.parse(/a/)).to respond_to(:strfre) end specify('#strfregexp level') do root = RP.parse(/a(b(c))/) expect(root.strfregexp('%l')).to eq 'root' a = root.first expect(a.strfregexp('%%l')).to eq '%0' b = root[1].first expect(b.strfregexp('<%l>')).to eq '<1>' c = root[1][1].first expect(c.strfregexp('[at: %l]')).to eq '[at: 2]' end specify('#strfregexp start end') do root = RP.parse(/a(b(c))/) expect(root.strfregexp('%s')).to eq '0' expect(root.strfregexp('%e')).to eq '7' a = root.first expect(a.strfregexp('%%s')).to eq '%0' expect(a.strfregexp('%e')).to eq '1' group_1 = root[1] expect(group_1.strfregexp('GRP:%s')).to eq 'GRP:1' expect(group_1.strfregexp('%e')).to eq '7' b = group_1.first expect(b.strfregexp('<@%s>')).to eq '<@2>' expect(b.strfregexp('%e')).to eq '3' c = group_1.last.first expect(c.strfregexp('[at: %s]')).to eq '[at: 4]' expect(c.strfregexp('%e')).to eq '5' end specify('#strfregexp length') do root = RP.parse(/a[b]c/) expect(root.strfregexp('%S')).to eq '5' a = root.first expect(a.strfregexp('%S')).to eq '1' set = root[1] expect(set.strfregexp('%S')).to eq '3' end specify('#strfregexp coded offset') do root = RP.parse(/a[b]c/) expect(root.strfregexp('%o')).to eq '@0+5' a = root.first expect(a.strfregexp('%o')).to eq '@0+1' set = root[1] expect(set.strfregexp('%o')).to eq '@1+3' end specify('#strfregexp type token') do root = RP.parse(/a[b](c)/) expect(root.strfregexp('%y')).to eq 'expression' expect(root.strfregexp('%k')).to eq 'root' expect(root.strfregexp('%i')).to eq 'expression:root' expect(root.strfregexp('%c')).to eq 'Regexp::Expression::Root' a = root.first 
expect(a.strfregexp('%y')).to eq 'literal' expect(a.strfregexp('%k')).to eq 'literal' expect(a.strfregexp('%i')).to eq 'literal:literal' expect(a.strfregexp('%c')).to eq 'Regexp::Expression::Literal' set = root[1] expect(set.strfregexp('%y')).to eq 'set' expect(set.strfregexp('%k')).to eq 'character' expect(set.strfregexp('%i')).to eq 'set:character' expect(set.strfregexp('%c')).to eq 'Regexp::Expression::CharacterSet' group = root.last expect(group.strfregexp('%y')).to eq 'group' expect(group.strfregexp('%k')).to eq 'capture' expect(group.strfregexp('%i')).to eq 'group:capture' expect(group.strfregexp('%c')).to eq 'Regexp::Expression::Group::Capture' end specify('#strfregexp quantifier') do root = RP.parse(/a+[b](c)?d{3,4}/) expect(root.strfregexp('%q')).to eq '{1}' expect(root.strfregexp('%Q')).to eq '' expect(root.strfregexp('%z, %Z')).to eq '1, 1' a = root.first expect(a.strfregexp('%q')).to eq '{1, or-more}' expect(a.strfregexp('%Q')).to eq '+' expect(a.strfregexp('%z, %Z')).to eq '1, -1' set = root[1] expect(set.strfregexp('%q')).to eq '{1}' expect(set.strfregexp('%Q')).to eq '' expect(set.strfregexp('%z, %Z')).to eq '1, 1' group = root[2] expect(group.strfregexp('%q')).to eq '{0, 1}' expect(group.strfregexp('%Q')).to eq '?' 
expect(group.strfregexp('%z, %Z')).to eq '0, 1' d = root.last expect(d.strfregexp('%q')).to eq '{3, 4}' expect(d.strfregexp('%Q')).to eq '{3,4}' expect(d.strfregexp('%z, %Z')).to eq '3, 4' end specify('#strfregexp text') do root = RP.parse(/a(b(c))|[d-gk-p]+/) expect(root.strfregexp('%t')).to eq 'a(b(c))|[d-gk-p]+' expect(root.strfregexp('%~t')).to eq 'expression:root' alt = root.first expect(alt.strfregexp('%t')).to eq 'a(b(c))|[d-gk-p]+' expect(alt.strfregexp('%T')).to eq 'a(b(c))|[d-gk-p]+' expect(alt.strfregexp('%~t')).to eq 'meta:alternation' seq_1 = alt.first expect(seq_1.strfregexp('%t')).to eq 'a(b(c))' expect(seq_1.strfregexp('%T')).to eq 'a(b(c))' expect(seq_1.strfregexp('%~t')).to eq 'expression:sequence' group = seq_1[1] expect(group.strfregexp('%t')).to eq '(b(c))' expect(group.strfregexp('%T')).to eq '(b(c))' expect(group.strfregexp('%~t')).to eq 'group:capture' seq_2 = alt.last expect(seq_2.strfregexp('%t')).to eq '[d-gk-p]+' expect(seq_2.strfregexp('%T')).to eq '[d-gk-p]+' set = seq_2.first expect(set.strfregexp('%t')).to eq '[d-gk-p]' expect(set.strfregexp('%T')).to eq '[d-gk-p]+' expect(set.strfregexp('%~t')).to eq 'set:character' end specify('#strfregexp combined') do root = RP.parse(/a{5}|[b-d]+/) expect(root.strfregexp('%b')).to eq '@0+11 expression:root' expect(root.strfregexp('%b')).to eq root.strfregexp('%o %i') expect(root.strfregexp('%m')).to eq '@0+11 expression:root {1}' expect(root.strfregexp('%m')).to eq root.strfregexp('%b %q') expect(root.strfregexp('%a')).to eq '@0+11 expression:root {1} a{5}|[b-d]+' expect(root.strfregexp('%a')).to eq root.strfregexp('%m %t') end specify('#strfregexp conditional') do root = RP.parse('(?a)(?()b|c)') expect { root.strfregexp }.not_to(raise_error) end specify('#strfregexp_tree') do root = RP.parse(/a[b-d]*(e(f+))?/) expect(root.strfregexp_tree('%>%o %~t')).to eq( "@0+15 expression:root\n" + " @0+1 a\n" + " @1+6 set:character\n" + " @2+3 set:range\n" + " @2+1 b\n" + " @4+1 d\n" + " @7+8 
group:capture\n" + " @8+1 e\n" + " @9+4 group:capture\n" + " @10+2 f+" ) end specify('#strfregexp_tree separator') do root = RP.parse(/a[b-d]*(e(f+))?/) expect(root.strfregexp_tree('%>%o %~t', true, '-SEP-')).to eq( "@0+15 expression:root-SEP-" + " @0+1 a-SEP-" + " @1+6 set:character-SEP-" + " @2+3 set:range-SEP-" + " @2+1 b-SEP-" + " @4+1 d-SEP-" + " @7+8 group:capture-SEP-" + " @8+1 e-SEP-" + " @9+4 group:capture-SEP-" + " @10+2 f+" ) end specify('#strfregexp_tree excluding self') do root = RP.parse(/a[b-d]*(e(f+))?/) expect(root.strfregexp_tree('%>%o %~t', false)).to eq( "@0+1 a\n" + "@1+6 set:character\n" + " @2+3 set:range\n" + " @2+1 b\n" + " @4+1 d\n" + "@7+8 group:capture\n" + " @8+1 e\n" + " @9+4 group:capture\n" + " @10+2 f+" ) end end ammar-regexp_parser-0494e56/spec/expression/methods/tests_spec.rb000066400000000000000000000132351433525313500252420ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('ExpressionTests') do specify('#type?') do root = RP.parse(/abcd|(ghij)|[klmn]/) alt = root.first expect(alt.type?(:meta)).to be true expect(alt.type?(:escape)).to be false expect(alt.type?(%i[meta escape])).to be true expect(alt.type?(%i[literal escape])).to be false expect(alt.type?(:*)).to be true expect(alt.type?([:*])).to be true expect(alt.type?(%i[literal escape *])).to be true seq_1 = alt[0] expect(seq_1.type?(:expression)).to be true expect(seq_1.first.type?(:literal)).to be true seq_2 = alt[1] expect(seq_2.type?(:*)).to be true expect(seq_2.first.type?(:group)).to be true seq_3 = alt[2] expect(seq_3.first.type?(:set)).to be true end specify('#is?') do root = RP.parse(/.+|\.?/) expect(root.is?(:*)).to be true alt = root.first expect(alt.is?(:*)).to be true expect(alt.is?(:alternation)).to be true expect(alt.is?(:alternation, :meta)).to be true seq_1 = alt[0] expect(seq_1.is?(:sequence)).to be true expect(seq_1.is?(:sequence, :expression)).to be true expect(seq_1.first.is?(:dot)).to be true expect(seq_1.first.is?(:dot, :escape)).to be 
false expect(seq_1.first.is?(:dot, :meta)).to be true expect(seq_1.first.is?(:dot, %i[escape meta])).to be true seq_2 = alt[1] expect(seq_2.first.is?(:dot)).to be true expect(seq_2.first.is?(:dot, :escape)).to be true expect(seq_2.first.is?(:dot, :meta)).to be false expect(seq_2.first.is?(:dot, %i[meta escape])).to be true end specify('#one_of?') do root = RP.parse(/\Aab(c[\w])d|e.\z/) expect(root.one_of?(:*)).to be true expect(root.one_of?(:* => :*)).to be true expect(root.one_of?(:* => [:*])).to be true alt = root.first expect(alt.one_of?(:*)).to be true expect(alt.one_of?(:meta)).to be true expect(alt.one_of?(:meta, :alternation)).to be true expect(alt.one_of?(meta: %i[dot bogus])).to be false expect(alt.one_of?(meta: %i[dot alternation])).to be true seq_1 = alt[0] expect(seq_1.one_of?(:expression)).to be true expect(seq_1.one_of?(expression: :sequence)).to be true expect(seq_1.first.one_of?(:anchor)).to be true expect(seq_1.first.one_of?(anchor: :bos)).to be true expect(seq_1.first.one_of?(anchor: :eos)).to be false expect(seq_1.first.one_of?(anchor: %i[escape meta bos])).to be true expect(seq_1.first.one_of?(anchor: %i[escape meta eos])).to be false seq_2 = alt[1] expect(seq_2.first.one_of?(:literal)).to be true expect(seq_2[1].one_of?(:meta)).to be true expect(seq_2[1].one_of?(meta: :dot)).to be true expect(seq_2[1].one_of?(meta: :alternation)).to be false expect(seq_2[1].one_of?(meta: [:dot])).to be true expect(seq_2.last.one_of?(:group)).to be false expect(seq_2.last.one_of?(group: [:*])).to be false expect(seq_2.last.one_of?(group: [:*], meta: :*)).to be false expect(seq_2.last.one_of?(:meta => [:*], :* => :*)).to be true expect(seq_2.last.one_of?(meta: [:*], anchor: :*)).to be true expect(seq_2.last.one_of?(meta: [:*], anchor: :eos)).to be true expect(seq_2.last.one_of?(meta: [:*], anchor: [:bos])).to be false expect(seq_2.last.one_of?(meta: [:*], anchor: %i[bos eos])).to be true expect { root.one_of?(Object.new) }.to raise_error(ArgumentError) end 
specify('#==') do expect(RP.parse(/a/)).to eq RP.parse(/a/) expect(RP.parse(/a/)).not_to eq RP.parse(/B/) expect(RP.parse(/a+/)).to eq RP.parse(/a+/) expect(RP.parse(/a+/)).not_to eq RP.parse(/a++/) expect(RP.parse(/a+/)).not_to eq RP.parse(/a?/) expect(RP.parse(/\A/)).to eq RP.parse(/\A/) expect(RP.parse(/\A/)).not_to eq RP.parse(/\b/) expect(RP.parse(/[a]/)).to eq RP.parse(/[a]/) expect(RP.parse(/[a]/)).not_to eq RP.parse(/[B]/) expect(RP.parse(/(a)/)).to eq RP.parse(/(a)/) expect(RP.parse(/(a)/)).not_to eq RP.parse(/(B)/) expect(RP.parse(/(a|A)/)).to eq RP.parse(/(a|A)/) expect(RP.parse(/(a|A)/)).not_to eq RP.parse(/(a|B)/) expect(RP.parse(/(?:a)/)).to eq RP.parse(/(?:a)/) expect(RP.parse(/(?:a)/)).not_to eq RP.parse(/(a)/) expect(RP.parse(/(?a)/)).to eq RP.parse(/(?a)/) expect(RP.parse(/(?a)/)).not_to eq RP.parse(/(?B)/) expect(RP.parse(/(?a)/)).not_to eq RP.parse(/(?a)/) expect(RP.parse(/(?a)/)).not_to eq RP.parse(/(?'a'a)/) expect(RP.parse(/(a)(x)(?(1)T|F)/)).to eq RP.parse(/(a)(x)(?(1)T|F)/) expect(RP.parse(/(a)(x)(?(1)T|F)/)).not_to eq RP.parse(/(a)(x)(?(2)T|F)/) expect(RP.parse(/(a)(x)(?(1)T|F)/)).not_to eq RP.parse(/(B)(x)(?(1)T|F)/) expect(RP.parse(/(a)(x)(?(1)T|F)/)).not_to eq RP.parse(/(a)(x)(?(1)T|T)/) expect(RP.parse(/a+/)[0].quantifier).to eq RP.parse(/a+/)[0].quantifier expect(RP.parse(/a+/)[0].quantifier).not_to eq RP.parse(/a++/)[0].quantifier expect(RP.parse(/a+/)[0].quantifier).not_to eq RP.parse(/a?/)[0].quantifier expect(RP.parse(/a+/)[0].quantifier).not_to eq RP.parse(/a{1,}/)[0].quantifier # active options should differentiate expressions expect(RP.parse(/a/)[0]).to eq RP.parse(/a/)[0] expect(RP.parse(/a/i)[0]).not_to eq RP.parse(/a/)[0] expect(RP.parse(/(?i)a/)[1]).not_to eq RP.parse(/a/)[0] expect(RP.parse(/(?i:a)/)[0][0]).not_to eq RP.parse(/a/)[0] # levels should be ignored expect(RP.parse(/([a])/)[0][0][0]).to eq RP.parse(/a/)[0] end end 
ammar-regexp_parser-0494e56/spec/expression/methods/traverse_spec.rb000066400000000000000000000077371433525313500257450ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Subexpression#traverse') do specify('Subexpression#traverse') do root = RP.parse(/a(b(c(d)))|g[h-i]j|klmn/) enters = 0 visits = 0 exits = 0 root.traverse do |event, _exp, _index| enters = (enters + 1) if event == :enter visits = (visits + 1) if event == :visit exits = (exits + 1) if event == :exit end expect(enters).to eq 9 expect(enters).to eq exits expect(visits).to eq 9 end specify('Subexpression#traverse including self') do root = RP.parse(/a(b(c(d)))|g[h-i]j|klmn/) enters = 0 visits = 0 exits = 0 root.traverse(true) do |event, _exp, _index| enters = (enters + 1) if event == :enter visits = (visits + 1) if event == :visit exits = (exits + 1) if event == :exit end expect(enters).to eq 10 expect(enters).to eq exits expect(visits).to eq 9 end specify('Subexpression#traverse without a block') do root = RP.parse(/abc/) enum = root.traverse expect(enum).to be_a(Enumerator) event, expr, idx = enum.next expect(event).to eq(:visit) expect(expr).to be_a(Regexp::Expression::Literal) expect(idx).to eq(0) end specify('Subexpression#walk alias') do root = RP.parse(/abc/) expect(root).to respond_to(:walk) end specify('Subexpression#each_expression') do root = RP.parse(/a(?x:b(c))|g[h-k]/) count = 0 root.each_expression { count += 1 } expect(count).to eq 13 end specify('Subexpression#each_expression including self') do root = RP.parse(/a(?x:b(c))|g[h-k]/) count = 0 root.each_expression(true) { count += 1 } expect(count).to eq 14 end specify('Subexpression#each_expression indices') do root = RP.parse(/a(b)c/) indices = [] root.each_expression { |_exp, index| (indices << index) } expect(indices).to eq [0, 1, 0, 2] end specify('Subexpression#each_expression indices including self') do root = RP.parse(/a(b)c/) indices = [] root.each_expression(true) { |_exp, index| (indices << index) } 
expect(indices).to eq [0, 0, 1, 0, 2] end specify('Subexpression#each_expression without a block') do root = RP.parse(/abc/) enum = root.each_expression expect(enum).to be_a(Enumerator) expr, idx = enum.next expect(expr).to be_a(Regexp::Expression::Literal) expect(idx).to eq(0) end specify('Subexpression#flat_map without block') do root = RP.parse(/a(b([c-e]+))?/) array = root.flat_map expect(array).to be_instance_of(Array) expect(array.length).to eq 8 array.each do |item| expect(item).to be_instance_of(Array) expect(item.length).to eq 2 expect(item.first).to be_a(Regexp::Expression::Base) expect(item.last).to be_a(Integer) end end specify('Subexpression#flat_map without block including self') do root = RP.parse(/a(b([c-e]+))?/) array = root.flat_map(true) expect(array).to be_instance_of(Array) expect(array.length).to eq 9 end specify('Subexpression#flat_map indices') do root = RP.parse(/a(b([c-e]+))?f*g/) indices = root.flat_map { |_exp, index| index } expect(indices).to eq [0, 1, 0, 1, 0, 0, 0, 1, 2, 3] end specify('Subexpression#flat_map indices including self') do root = RP.parse(/a(b([c-e]+))?f*g/) indices = root.flat_map(true) { |_exp, index| index } expect(indices).to eq [0, 0, 1, 0, 1, 0, 0, 0, 1, 2, 3] end specify('Subexpression#flat_map expressions') do root = RP.parse(/a(b(c(d)))/) levels = root.flat_map { |exp, _index| [exp.level, exp.text] if exp.terminal? 
}.compact expect(levels).to eq [[0, 'a'], [1, 'b'], [2, 'c'], [3, 'd']] end specify('Subexpression#flat_map expressions including self') do root = RP.parse(/a(b(c(d)))/) levels = root.flat_map(true) { |exp, _index| [exp.level, exp.to_s] }.compact expect(levels).to eq [[0, 'a(b(c(d)))'], [0, 'a'], [0, '(b(c(d)))'], [1, 'b'], [1, '(c(d))'], [2, 'c'], [2, '(d)'], [3, 'd']] end end ammar-regexp_parser-0494e56/spec/expression/options_spec.rb000066400000000000000000000133131433525313500241250ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Expression::Base#options') do it 'returns a hash of options/flags that affect the expression' do exp = RP.parse(/a/ix)[0] expect(exp).to be_a Literal expect(exp.options).to eq(i: true, x: true) end it 'includes options that are locally enabled via special groups' do exp = RP.parse(/(?x)(?m:a)/i)[1][0] expect(exp).to be_a Literal expect(exp.options).to eq(i: true, m: true, x: true) end it 'excludes locally disabled options' do exp = RP.parse(/(?x)(?-im:a)/i)[1][0] expect(exp).to be_a Literal expect(exp.options).to eq(x: true) end it 'gives correct precedence to negative options' do # Negative options have precedence. E.g. /(?i-i)a/ is case-sensitive. regexp = /(?i-i:a)/ expect(regexp).to match 'a' expect(regexp).not_to match 'A' exp = RP.parse(regexp)[0][0] expect(exp).to be_a Literal expect(exp.options).to eq({}) end it 'correctly handles multiple negative option parts' do regexp = /(?--m--mx--) . /mx expect(regexp).to match ' . ' expect(regexp).not_to match '.' expect(regexp).not_to match "\n" exp = RP.parse(regexp)[2] expect(exp.options).to eq({}) end it 'gives correct precedence when encountering multiple encoding flags' do # Any encoding flag overrides all previous encoding flags. If there are # multiple encoding flags in an options string, the last one wins. # E.g. /(?dau)\w/ matches UTF8 chars but /(?dua)\w/ only ASCII chars. 
regexp1 = /(?dau)\w/ regexp2 = /(?dua)\w/ expect(regexp1).to match 'ü' expect(regexp2).not_to match 'ü' exp1 = RP.parse(regexp1)[1] exp2 = RP.parse(regexp2)[1] expect(exp1.options).to eq(u: true) expect(exp2.options).to eq(a: true) end it 'is accessible via shortcuts' do exp = Root.construct expect { exp.options[:i] = true } .to change { exp.i? }.from(false).to(true) .and change { exp.ignore_case? }.from(false).to(true) .and change { exp.case_insensitive? }.from(false).to(true) expect { exp.options[:m] = true } .to change { exp.m? }.from(false).to(true) .and change { exp.multiline? }.from(false).to(true) expect { exp.options[:x] = true } .to change { exp.x? }.from(false).to(true) .and change { exp.extended? }.from(false).to(true) .and change { exp.free_spacing? }.from(false).to(true) expect { exp.options[:a] = true } .to change { exp.a? }.from(false).to(true) .and change { exp.ascii_classes? }.from(false).to(true) expect { exp.options[:d] = true } .to change { exp.d? }.from(false).to(true) .and change { exp.default_classes? }.from(false).to(true) expect { exp.options[:u] = true } .to change { exp.u? }.from(false).to(true) .and change { exp.unicode_classes? 
}.from(false).to(true) end include_examples 'parse', //i, [] => [:root, i?: true, x?: false] include_examples 'parse', /a/i, [0] => [:literal, i?: true, x?: false] include_examples 'parse', /\A/i, [0] => [:bos, i?: true, x?: false] include_examples 'parse', /\d/i, [0] => [:digit, i?: true, x?: false] include_examples 'parse', /\n/i, [0] => [:newline, i?: true, x?: false] include_examples 'parse', /\K/i, [0] => [:mark, i?: true, x?: false] include_examples 'parse', /./i, [0] => [:dot, i?: true, x?: false] include_examples 'parse', /(a)/i, [0] => [:capture, i?: true, x?: false] include_examples 'parse', /(a)/i, [0, 0] => [:literal, i?: true, x?: false] include_examples 'parse', /(?=a)/i, [0] => [:lookahead, i?: true, x?: false] include_examples 'parse', /(?=a)/i, [0, 0] => [:literal, i?: true, x?: false] include_examples 'parse', /(a|b)/i, [0] => [:capture, i?: true, x?: false] include_examples 'parse', /(a|b)/i, [0, 0] => [:alternation, i?: true, x?: false] include_examples 'parse', /(a|b)/i, [0, 0, 0] => [:sequence, i?: true, x?: false] include_examples 'parse', /(a|b)/i, [0, 0, 0, 0] => [:literal, i?: true, x?: false] include_examples 'parse', /(a)\1/i, [1] => [:number, i?: true, x?: false] include_examples 'parse', /(a)\k<1>/i, [1] => [:number_ref, i?: true, x?: false] include_examples 'parse', /(a)\g<1>/i, [1] => [:number_call, i?: true, x?: false] include_examples 'parse', /[a]/i, [0] => [:character, i?: true, x?: false] include_examples 'parse', /[a]/i, [0, 0] => [:literal, i?: true, x?: false] include_examples 'parse', /[a-z]/i, [0, 0] => [:range, i?: true, x?: false] include_examples 'parse', /[a-z]/i, [0, 0, 0] => [:literal, i?: true, x?: false] include_examples 'parse', /[a&&z]/i, [0, 0] => [:intersection, i?: true, x?: false] include_examples 'parse', /[a&&z]/i, [0, 0, 0, 0] => [:literal, i?: true, x?: false] include_examples 'parse', /[[:ascii:]]/i, [0, 0] => [:ascii, i?: true, x?: false] include_examples 'parse', /\p{word}/i, [0] => [:word, i?: true, 
x?: false] include_examples 'parse', /(a)(?(1)b|c)/i, [1] => [:open, i?: true, x?: false] include_examples 'parse', /(a)(?(1)b|c)/i, [1, 0] => [:condition, i?: true, x?: false] include_examples 'parse', /(a)(?(1)b|c)/i, [1, 1] => [:sequence, i?: true, x?: false] include_examples 'parse', /(a)(?(1)b|c)/i, [1, 1, 0] => [:literal, i?: true, x?: false] end ammar-regexp_parser-0494e56/spec/expression/parts_spec.rb000066400000000000000000000053251433525313500235670ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Expression::Base#parts') do include_examples 'parse', //, [] => [:root, parts: []] include_examples 'parse', /a/, [0] => [:literal, parts: ['a']] include_examples 'parse', /\K/, [0] => [:mark, parts: ['\K']] include_examples 'parse', /\p{any}/, [0] => [:any, parts: ['\p{any}']] include_examples 'parse', /[a]/, [0] => [:character, parts: ['[', s(Literal, 'a'), ']']] include_examples 'parse', /[^a]/, [0] => [:character, parts: ['[^', s(Literal, 'a'), ']']] include_examples 'parse', /(a)/, [0] => [:capture, parts: ['(', s(Literal, 'a'), ')']] include_examples 'parse', /(?>a)/, [0] => [:atomic, parts: ['(?>', s(Literal, 'a'), ')']] include_examples 'parse', /(?=a)/, [0] => [:lookahead, parts: ['(?=', s(Literal, 'a'), ')']] include_examples 'parse', /(?#a)/, [0] => [:comment, parts: ['(?#a)']] include_examples 'parse', /(a(b(c)))/, [0] => [:capture, parts: [ '(', s(Literal, 'a'), s(Group::Capture, '(', s(Literal, 'b'), s(Group::Capture, '(', s(Literal, 'c'), ) ), ')' ]] include_examples 'parse', /a|b|c/, [] => [:root, parts: [ s(Alternation, '|', s(Alternative, nil, s(Literal, 'a')), s(Alternative, nil, s(Literal, 'b')), s(Alternative, nil, s(Literal, 'c')) ) ]], [0] => [:alternation, parts: [ s(Alternative, nil, s(Literal, 'a')), '|', s(Alternative, nil, s(Literal, 'b')), '|', s(Alternative, nil, s(Literal, 'c')) ]] include_examples 'parse', /[a-z]/, [] => [:root, parts: [ s(CharacterSet, '[', s(CharacterSet::Range, '-', s(Literal, 'a'), s(Literal, 
'z')), ) ]], [0] => [:character, parts: [ '[', s(CharacterSet::Range, '-', s(Literal, 'a'), s(Literal, 'z')), ']' ]], [0, 0] => [:range, parts: [ s(Literal, 'a'), '-', s(Literal, 'z') ]] include_examples 'parse', /[a&&b&&c]/, [] => [:root, parts: [ s(CharacterSet, '[', s(CharacterSet::Intersection, '&&', s(Literal, 'a'), s(Literal, 'b'), s(Literal, 'c')), ) ]], [0, 0] => [:intersection, parts: [ s(CharacterSet::IntersectedSequence, nil, s(Literal, 'a')), '&&', s(CharacterSet::IntersectedSequence, nil, s(Literal, 'b')), '&&', s(CharacterSet::IntersectedSequence, nil, s(Literal, 'c')) ]] include_examples 'parse', /(a)(?(1)T|F)/, [1] => [Conditional::Expression, parts: [ '(?', s(Conditional::Condition, '(1)'), s(Conditional::Branch, nil, s(Literal, 'T')), '|', s(Conditional::Branch, nil, s(Literal, 'F')), ')' ]] end ammar-regexp_parser-0494e56/spec/expression/subexpression_spec.rb000066400000000000000000000032521433525313500253440ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Expression::Subexpression) do # check #ts, #te include_examples 'parse', /abcd|ghij|klmn|pqur/, [0] => [Alternation, ts: 0, te: 19], [0, 0] => [Alternative, ts: 0, te: 4], [0, 1] => [Alternative, ts: 5, te: 9], [0, 2] => [Alternative, ts: 10, te: 14], [0, 3] => [Alternative, ts: 15, te: 19] # check #nesting_level include_examples 'parse', /a(b(\d|[ef-g[h]]))/, [0] => [Literal, to_s: 'a', nesting_level: 1], [1, 0] => [Literal, to_s: 'b', nesting_level: 2], [1, 1, 0] => [Alternation, to_s: '\d|[ef-g[h]]', nesting_level: 3], [1, 1, 0, 0] => [Alternative, to_s: '\d', nesting_level: 4], [1, 1, 0, 0, 0] => [CharacterType::Digit, to_s: '\d', nesting_level: 5], [1, 1, 0, 1] => [Alternative, to_s: '[ef-g[h]]', nesting_level: 4], [1, 1, 0, 1, 0] => [CharacterSet, to_s: '[ef-g[h]]', nesting_level: 5], [1, 1, 0, 1, 0, 0] => [Literal, to_s: 'e', nesting_level: 6], [1, 1, 0, 1, 0, 1] => [CharacterSet::Range, to_s: 'f-g', nesting_level: 6], [1, 1, 0, 1, 0, 1, 0] => [Literal, to_s: 
'f', nesting_level: 7], [1, 1, 0, 1, 0, 2, 0] => [Literal, to_s: 'h', nesting_level: 7] specify('#dig') do root = RP.parse(/(((a)))/) expect(root.dig(0).to_s).to eq '(((a)))' expect(root.dig(0, 0, 0, 0).to_s).to eq 'a' expect(root.dig(0, 0, 0, 0, 0)).to be_nil expect(root.dig(3, 7)).to be_nil end end ammar-regexp_parser-0494e56/spec/expression/to_h_spec.rb000066400000000000000000000015501433525313500233630ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Expression::Base#to_h') do include_examples 'parse', /abc/, [] => [Root, to_h: { token: :root, type: :expression, text: 'abc', starts_at: 0, length: 3, quantifier: nil, options: {}, level: 0, set_level: 0, conditional_level: 0, expressions: [ { token: :literal, type: :literal, text: 'abc', starts_at: 0, length: 3, quantifier: nil, options: {}, level: 0, set_level: 0, conditional_level: 0 } ] }] include_examples 'parse', /a{2,4}/, [0, :q] => [Quantifier, to_h: { max: 4, min: 2, mode: :greedy, text: '{2,4}', token: :interval, }] specify('Conditional#to_h') do root = RP.parse('(?a)(?()b|c)') expect { root.to_h }.not_to(raise_error) end end ammar-regexp_parser-0494e56/spec/expression/to_s_spec.rb000066400000000000000000000055701433525313500234040ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Expression::Base#to_s') do def parse_frozen(pattern, ruby_version = nil) IceNine.deep_freeze(RP.parse(pattern, *ruby_version)) end def expect_round_trip(pattern, ruby_version = nil) parsed = parse_frozen(pattern, ruby_version) expect(parsed.to_s).to eql(pattern) end specify('literal alternation') do expect_round_trip('abcd|ghij|klmn|pqur') end specify('quantified alternations') do expect_round_trip('(?:a?[b]+(c){2}|d+[e]*(f)?)|(?:g+[h]?(i){2,3}|j*[k]{3,5}(l)?)') end specify('quantified sets') do expect_round_trip('[abc]+|[^def]{3,6}') end specify('property sets') do expect_round_trip('[\a\b\p{Lu}\P{Z}\c\d]+') end specify('groups') do expect_round_trip("(a(?>b(?:c(?d(?'N'e)??f)+g)*+h)*i)++") end 
specify('assertions') do expect_round_trip('(a+(?=b+(?!c+(?<=d+(?a)(?()b|c)/, 3 => [:conditional, :open, '(?', 7, 9, 0, 0, 0], 4 => [:conditional, :condition, '()', 9, 14, 0, 0, 1], 6 => [:conditional, :separator, '|', 15, 16, 0, 0, 1], 8 => [:conditional, :close, ')', 17, 18, 0, 0, 0] include_examples 'lex', /((?a)(?(?()b|((?()[e-g]|[h-j])))))/, 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:group, :named, '(?', 1, 6, 1, 0, 0], 5 => [:conditional, :open, '(?', 13, 15, 2, 0, 0], 6 => [:conditional, :condition, '()', 15, 20, 2, 0, 1], 8 => [:conditional, :separator, '|', 21, 22, 2, 0, 1], 10 => [:conditional, :open, '(?', 23, 25, 3, 0, 1], 11 => [:conditional, :condition, '()', 25, 30, 3, 0, 2], 12 => [:set, :open, '[', 30, 31, 3, 0, 2], 13 => [:literal, :literal, 'e', 31, 32, 3, 1, 2], 14 => [:set, :range, '-', 32, 33, 3, 1, 2], 15 => [:literal, :literal, 'g', 33, 34, 3, 1, 2], 16 => [:set, :close, ']', 34, 35, 3, 0, 2], 17 => [:conditional, :separator, '|', 35, 36, 3, 0, 2], 23 => [:conditional, :close, ')', 41, 42, 3, 0, 1], 25 => [:conditional, :close, ')', 43, 44, 2, 0, 0], 26 => [:group, :close, ')', 44, 45, 1, 0, 0], 27 => [:group, :close, ')', 45, 46, 0, 0, 0] include_examples 'lex', /(a(b(c)))(?(1)(?(2)(?(3)d|e))|(?(3)(?(2)f|g)|(?(1)f|g)))/, 9 => [:conditional, :open, '(?', 9, 11, 0, 0, 0], 10 => [:conditional, :condition, '(1)', 11, 14, 0, 0, 1], 11 => [:conditional, :open, '(?', 14, 16, 0, 0, 1], 12 => [:conditional, :condition, '(2)', 16, 19, 0, 0, 2], 13 => [:conditional, :open, '(?', 19, 21, 0, 0, 2], 14 => [:conditional, :condition, '(3)', 21, 24, 0, 0, 3], 16 => [:conditional, :separator, '|', 25, 26, 0, 0, 3], 18 => [:conditional, :close, ')', 27, 28, 0, 0, 2], 19 => [:conditional, :close, ')', 28, 29, 0, 0, 1], 20 => [:conditional, :separator, '|', 29, 30, 0, 0, 1], 21 => [:conditional, :open, '(?', 30, 32, 0, 0, 1], 22 => [:conditional, :condition, '(3)', 32, 35, 0, 0, 2], 23 => [:conditional, :open, '(?', 35, 37, 0, 0, 2], 24 => 
[:conditional, :condition, '(2)', 37, 40, 0, 0, 3], 26 => [:conditional, :separator, '|', 41, 42, 0, 0, 3], 28 => [:conditional, :close, ')', 43, 44, 0, 0, 2], 29 => [:conditional, :separator, '|', 44, 45, 0, 0, 2], 30 => [:conditional, :open, '(?', 45, 47, 0, 0, 2], 31 => [:conditional, :condition, '(1)', 47, 50, 0, 0, 3], 33 => [:conditional, :separator, '|', 51, 52, 0, 0, 3], 35 => [:conditional, :close, ')', 53, 54, 0, 0, 2], 36 => [:conditional, :close, ')', 54, 55, 0, 0, 1], 37 => [:conditional, :close, ')', 55, 56, 0, 0, 0] end ammar-regexp_parser-0494e56/spec/lexer/delimiters_spec.rb000066400000000000000000000055111433525313500235140ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Literal delimiter lexing') do include_examples 'lex', '}', 0 => [:literal, :literal, '}', 0, 1, 0, 0, 0] include_examples 'lex', '}}', 0 => [:literal, :literal, '}}', 0, 2, 0, 0, 0] include_examples 'lex', '{', 0 => [:literal, :literal, '{', 0, 1, 0, 0, 0] include_examples 'lex', '{{', 0 => [:literal, :literal, '{{', 0, 2, 0, 0, 0] include_examples 'lex', '{}', 0 => [:literal, :literal, '{}', 0, 2, 0, 0, 0] include_examples 'lex', '}{', 0 => [:literal, :literal, '}{', 0, 2, 0, 0, 0] include_examples 'lex', '}{+', 0 => [:literal, :literal, '}', 0, 1, 0, 0, 0], 1 => [:literal, :literal, '{', 1, 2, 0, 0, 0], 2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0] include_examples 'lex', '{{var}}', 0 => [:literal, :literal, '{{var}}', 0, 7, 0, 0, 0] include_examples 'lex', 'a{b}c', 0 => [:literal, :literal, 'a{b}c', 0, 5, 0, 0, 0] include_examples 'lex', 'a{1,2', 0 => [:literal, :literal, 'a{1,2', 0, 5, 0, 0, 0] include_examples 'lex', '({.+})', 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:literal, :literal, '{', 1, 2, 1, 0, 0], 2 => [:meta, :dot, '.', 2, 3, 1, 0, 0], 3 => [:quantifier, :one_or_more, '+', 3, 4, 1, 0, 0], 4 => [:literal, :literal, '}', 4, 5, 1, 0, 0], 5 => [:group, :close, ')', 5, 6, 0, 0, 0] include_examples 'lex', ']', 0 => [:literal, :literal, 
']', 0, 1, 0, 0, 0] include_examples 'lex', ']]', 0 => [:literal, :literal, ']]', 0, 2, 0, 0, 0] include_examples 'lex', ']\[', 0 => [:literal, :literal, ']', 0, 1, 0, 0, 0], 1 => [:escape, :set_open, '\[', 1, 3, 0, 0, 0] include_examples 'lex', '()', 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:group, :close, ')', 1, 2, 0, 0, 0] include_examples 'lex', '{abc:.+}}}[^}]]}', 0 => [:literal, :literal, '{abc:', 0, 5, 0, 0, 0], 1 => [:meta, :dot, '.', 5, 6, 0, 0, 0], 2 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0], 3 => [:literal, :literal, '}}}', 7, 10, 0, 0, 0], 4 => [:set, :open, '[', 10, 11, 0, 0, 0], 5 => [:set, :negate, '^', 11, 12, 0, 1, 0], 6 => [:literal, :literal, '}', 12, 13, 0, 1, 0], 7 => [:set, :close, ']', 13, 14, 0, 0, 0], 8 => [:literal, :literal, ']}', 14, 16, 0, 0, 0] end ammar-regexp_parser-0494e56/spec/lexer/escapes_spec.rb000066400000000000000000000010321433525313500227700ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Escape lexing') do include_examples 'lex', '\u{62}', 0 => [:escape, :codepoint_list, '\u{62}', 0, 6, 0, 0, 0] include_examples 'lex', '\u{62 63 64}', 0 => [:escape, :codepoint_list, '\u{62 63 64}', 0, 12, 0, 0, 0] include_examples 'lex', '\u{62 63 64}+', 0 => [:escape, :codepoint_list, '\u{62 63}', 0, 9, 0, 0, 0], 1 => [:escape, :codepoint_list, '\u{64}', 9, 15, 0, 0, 0], 2 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0, 0] end ammar-regexp_parser-0494e56/spec/lexer/keep_spec.rb000066400000000000000000000004361433525313500223000ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Keep lexing') do include_examples 'lex', /ab\Kcd/, 1 => [:keep, :mark, '\K', 2, 4, 0, 0, 0] include_examples 'lex', /(a\Kb)|(c\\\Kd)ef/, 2 => [:keep, :mark, '\K', 2, 4, 1, 0, 0], 9 => [:keep, :mark, '\K', 11, 13, 1, 0, 0] end ammar-regexp_parser-0494e56/spec/lexer/literals_spec.rb000066400000000000000000000054241433525313500231750ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Literal lexing') 
do # ascii, single byte characters include_examples 'lex', 'a', 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0] include_examples 'lex', 'ab+', 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0], 1 => [:literal, :literal, 'b', 1, 2, 0, 0, 0], 2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0] # 2 byte wide characters include_examples 'lex', 'äöü+', 0 => [:literal, :literal, 'äö', 0, 2, 0, 0, 0], 1 => [:literal, :literal, 'ü', 2, 3, 0, 0, 0], 2 => [:quantifier, :one_or_more, '+', 3, 4, 0, 0, 0] # 3 byte wide characters, Japanese include_examples 'lex', 'ab?れます+cd', 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0], 1 => [:literal, :literal, 'b', 1, 2, 0, 0, 0], 2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0, 0], 3 => [:literal, :literal, 'れま', 3, 5, 0, 0, 0], 4 => [:literal, :literal, 'す', 5, 6, 0, 0, 0], 5 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0], 6 => [:literal, :literal, 'cd', 7, 9, 0, 0, 0] # 4 byte wide characters, Osmanya include_examples 'lex', '𐒀𐒁?𐒂ab+𐒃', 0 => [:literal, :literal, '𐒀', 0, 1, 0, 0, 0], 1 => [:literal, :literal, '𐒁', 1, 2, 0, 0, 0], 2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0, 0], 3 => [:literal, :literal, '𐒂a', 3, 5, 0, 0, 0], 4 => [:literal, :literal, 'b', 5, 6, 0, 0, 0], 5 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0], 6 => [:literal, :literal, '𐒃', 7, 8, 0, 0, 0] include_examples 'lex', 'mu𝄞?si*𝄫c+', 0 => [:literal, :literal, 'mu', 0, 2, 0, 0, 0], 1 => [:literal, :literal, '𝄞', 2, 3, 0, 0, 0], 2 => [:quantifier, :zero_or_one, '?', 3, 4, 0, 0, 0], 3 => [:literal, :literal, 's', 4, 5, 0, 0, 0], 4 => [:literal, :literal, 'i', 5, 6, 0, 0, 0], 5 => [:quantifier, :zero_or_more, '*', 6, 7, 0, 0, 0], 6 => [:literal, :literal, '𝄫', 7, 8, 0, 0, 0], 7 => [:literal, :literal, 'c', 8, 9, 0, 0, 0], 8 => [:quantifier, :one_or_more, '+', 9, 10, 0, 0, 0] specify('lex single 2 byte char') do tokens = RL.lex("\u0627+") expect(tokens.count).to eq 2 end specify('lex single 3 byte char') do tokens = RL.lex("\u308C+") expect(tokens.count).to eq 2 end 
specify('lex single 4 byte char') do tokens = RL.lex("\u{1D11E}+") expect(tokens.count).to eq 2 end end ammar-regexp_parser-0494e56/spec/lexer/nesting_spec.rb000066400000000000000000000126301433525313500230220ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Nesting lexing') do include_examples 'lex', /(((b)))/, 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:group, :capture, '(', 1, 2, 1, 0, 0], 2 => [:group, :capture, '(', 2, 3, 2, 0, 0], 3 => [:literal, :literal, 'b', 3, 4, 3, 0, 0], 4 => [:group, :close, ')', 4, 5, 2, 0, 0], 5 => [:group, :close, ')', 5, 6, 1, 0, 0], 6 => [:group, :close, ')', 6, 7, 0, 0, 0] include_examples 'lex', /(\((b)\))/, 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:escape, :group_open, '\(', 1, 3, 1, 0, 0], 2 => [:group, :capture, '(', 3, 4, 1, 0, 0], 3 => [:literal, :literal, 'b', 4, 5, 2, 0, 0], 4 => [:group, :close, ')', 5, 6, 1, 0, 0], 5 => [:escape, :group_close, '\)', 6, 8, 1, 0, 0], 6 => [:group, :close, ')', 8, 9, 0, 0, 0] include_examples 'lex', /(?>a(?>b(?>c)))/, 0 => [:group, :atomic, '(?>', 0, 3, 0, 0, 0], 2 => [:group, :atomic, '(?>', 4, 7, 1, 0, 0], 4 => [:group, :atomic, '(?>', 8, 11, 2, 0, 0], 6 => [:group, :close, ')', 12, 13, 2, 0, 0], 7 => [:group, :close, ')', 13, 14, 1, 0, 0], 8 => [:group, :close, ')', 14, 15, 0, 0, 0] include_examples 'lex', /(?:a(?:b(?:c)))/, 0 => [:group, :passive, '(?:', 0, 3, 0, 0, 0], 2 => [:group, :passive, '(?:', 4, 7, 1, 0, 0], 4 => [:group, :passive, '(?:', 8, 11, 2, 0, 0], 6 => [:group, :close, ')', 12, 13, 2, 0, 0], 7 => [:group, :close, ')', 13, 14, 1, 0, 0], 8 => [:group, :close, ')', 14, 15, 0, 0, 0] include_examples 'lex', /(?=a(?!b(?<=c(? [:assertion, :lookahead, '(?=', 0, 3, 0, 0, 0], 2 => [:assertion, :nlookahead, '(?!', 4, 7, 1, 0, 0], 4 => [:assertion, :lookbehind, '(?<=', 8, 12, 2, 0, 0], 6 => [:assertion, :nlookbehind, '(? 
[:group, :close, ')', 18, 19, 3, 0, 0], 9 => [:group, :close, ')', 19, 20, 2, 0, 0], 10 => [:group, :close, ')', 20, 21, 1, 0, 0], 11 => [:group, :close, ')', 21, 22, 0, 0, 0] include_examples 'lex', /((?#a)b(?#c)d(?#e))/, 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:group, :comment, '(?#a)', 1, 6, 1, 0, 0], 3 => [:group, :comment, '(?#c)', 7, 12, 1, 0, 0], 5 => [:group, :comment, '(?#e)', 13, 18, 1, 0, 0], 6 => [:group, :close, ')', 18, 19, 0, 0, 0] include_examples 'lex', /a[b-e]f/, 1 => [:set, :open, '[', 1, 2, 0, 0, 0], 2 => [:literal, :literal, 'b', 2, 3, 0, 1, 0], 3 => [:set, :range, '-', 3, 4, 0, 1, 0], 4 => [:literal, :literal, 'e', 4, 5, 0, 1, 0], 5 => [:set, :close, ']', 5, 6, 0, 0, 0] include_examples 'lex', '[[:word:]&&[^c]z]', 0 => [:set, :open, '[', 0, 1, 0, 0, 0], 1 => [:posixclass, :word, '[:word:]', 1, 9, 0, 1, 0], 2 => [:set, :intersection, '&&', 9, 11, 0, 1, 0], 3 => [:set, :open, '[', 11, 12, 0, 1, 0], 4 => [:set, :negate, '^', 12, 13, 0, 2, 0], 5 => [:literal, :literal, 'c', 13, 14, 0, 2, 0], 6 => [:set, :close, ']', 14, 15, 0, 1, 0], 7 => [:literal, :literal, 'z', 15, 16, 0, 1, 0], 8 => [:set, :close, ']', 16, 17, 0, 0, 0] include_examples 'lex', '[\p{word}&&[^c]z]', 0 => [:set, :open, '[', 0, 1, 0, 0, 0], 1 => [:property, :word, '\p{word}', 1, 9, 0, 1, 0], 2 => [:set, :intersection, '&&', 9, 11, 0, 1, 0], 3 => [:set, :open, '[', 11, 12, 0, 1, 0], 4 => [:set, :negate, '^', 12, 13, 0, 2, 0], 5 => [:literal, :literal, 'c', 13, 14, 0, 2, 0], 6 => [:set, :close, ']', 14, 15, 0, 1, 0], 7 => [:literal, :literal, 'z', 15, 16, 0, 1, 0], 8 => [:set, :close, ']', 16, 17, 0, 0, 0] include_examples 'lex', /[a[b[c[d-g]]]]/, 0 => [:set, :open, '[', 0, 1, 0, 0, 0], 1 => [:literal, :literal, 'a', 1, 2, 0, 1, 0], 2 => [:set, :open, '[', 2, 3, 0, 1, 0], 3 => [:literal, :literal, 'b', 3, 4, 0, 2, 0], 4 => [:set, :open, '[', 4, 5, 0, 2, 0], 5 => [:literal, :literal, 'c', 5, 6, 0, 3, 0], 6 => [:set, :open, '[', 6, 7, 0, 3, 0], 7 => [:literal, :literal, 'd', 
7, 8, 0, 4, 0], 8 => [:set, :range, '-', 8, 9, 0, 4, 0], 9 => [:literal, :literal, 'g', 9, 10, 0, 4, 0], 10 => [:set, :close, ']', 10, 11, 0, 3, 0], 11 => [:set, :close, ']', 11, 12, 0, 2, 0], 12 => [:set, :close, ']', 12, 13, 0, 1, 0], 13 => [:set, :close, ']', 13, 14, 0, 0, 0] end ammar-regexp_parser-0494e56/spec/lexer/refcalls_spec.rb000066400000000000000000000052311433525313500231450ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('RefCall lexing') do # Traditional numerical group back-reference include_examples 'lex', '(abc)\1', 3 => [:backref, :number, '\1', 5, 7, 0, 0, 0] # Group back-references, named, numbered, and relative include_examples 'lex', '(?abc)\k', 3 => [:backref, :name_ref, '\k', 9, 14, 0, 0, 0] include_examples 'lex', "(?abc)\\k'X'", 3 => [:backref, :name_ref, "\\k'X'", 9, 14, 0, 0, 0] include_examples 'lex', '(abc)\k<1>', 3 => [:backref, :number_ref, '\k<1>', 5, 10, 0, 0, 0] include_examples 'lex', "(abc)\\k'1'", 3 => [:backref, :number_ref, "\\k'1'", 5, 10, 0, 0, 0] include_examples 'lex', '(abc)\k<-1>', 3 => [:backref, :number_rel_ref, '\k<-1>', 5, 11, 0, 0, 0] include_examples 'lex', "(abc)\\k'-1'", 3 => [:backref, :number_rel_ref, "\\k'-1'", 5, 11, 0, 0, 0] # Sub-expression invocation, named, numbered, and relative include_examples 'lex', '(?abc)\g', 3 => [:backref, :name_call, '\g', 9, 14, 0, 0, 0] include_examples 'lex', "(?abc)\\g'X'", 3 => [:backref, :name_call, "\\g'X'", 9, 14, 0, 0, 0] include_examples 'lex', '(abc)\g<1>', 3 => [:backref, :number_call, '\g<1>', 5, 10, 0, 0, 0] include_examples 'lex', "(abc)\\g'1'", 3 => [:backref, :number_call, "\\g'1'", 5, 10, 0, 0, 0] include_examples 'lex', '\g<0>', 0 => [:backref, :number_call, '\g<0>', 0, 5, 0, 0, 0] include_examples 'lex', "\\g'0'", 0 => [:backref, :number_call, "\\g'0'", 0, 5, 0, 0, 0] include_examples 'lex', '(abc)\g<-1>', 3 => [:backref, :number_rel_call, '\g<-1>', 5, 11, 0, 0, 0] include_examples 'lex', "(abc)\\g'-1'", 3 => [:backref, :number_rel_call, 
"\\g'-1'", 5, 11, 0, 0, 0] include_examples 'lex', '(abc)\g<+1>', 3 => [:backref, :number_rel_call, '\g<+1>', 5, 11, 0, 0, 0] include_examples 'lex', "(abc)\\g'+1'", 3 => [:backref, :number_rel_call, "\\g'+1'", 5, 11, 0, 0, 0] # Group back-references, with nesting level include_examples 'lex', '(?abc)\k', 3 => [:backref, :name_recursion_ref, '\k', 9, 16, 0, 0, 0] include_examples 'lex', "(?abc)\\k'X-0'", 3 => [:backref, :name_recursion_ref, "\\k'X-0'", 9, 16, 0, 0, 0] include_examples 'lex', '(abc)\k<1-0>', 3 => [:backref, :number_recursion_ref, '\k<1-0>', 5, 12, 0, 0, 0] include_examples 'lex', "(abc)\\k'1-0'", 3 => [:backref, :number_recursion_ref, "\\k'1-0'", 5, 12, 0, 0, 0] end ammar-regexp_parser-0494e56/spec/parser/000077500000000000000000000000001433525313500201675ustar00rootroot00000000000000ammar-regexp_parser-0494e56/spec/parser/all_spec.rb000066400000000000000000000016111433525313500222750ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Parser) do specify('parse returns a root expression') do expect(RP.parse('abc')).to be_instance_of(Root) end specify('parse can be called with block') do expect(RP.parse('abc') { |root| root.class }).to eq Root end specify('parse root contains expressions') do root = RP.parse(/^a.c+[^one]{2,3}\b\d\\\C-C$/) expect(root.expressions).to all(be_a Regexp::Expression::Base) end specify('parse root options mi') do root = RP.parse(/[abc]/mi) expect(root.m?).to be true expect(root.i?).to be true expect(root.x?).to be false end specify('parse no quantifier target raises error') do expect { RP.parse('?abc') }.to raise_error(Regexp::Parser::Error) end specify('parse sequence no quantifier target raises error') do expect { RP.parse('abc|?def') }.to raise_error(Regexp::Parser::Error) end end ammar-regexp_parser-0494e56/spec/parser/alternation_spec.rb000066400000000000000000000027601433525313500240530ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Alternation parsing') do include_examples 
'parse', /a|b/, [0] => [Alternation, text: '|', count: 2], [0, 0] => [Alternative, text: '', count: 1], [0, 0, 0] => [:literal, text: 'a' ], [0, 1] => [Alternative, text: '', count: 1], [0, 1, 0] => [:literal, text: 'b' ] include_examples 'parse', /a|(b)c/, [0] => [Alternation, text: '|', count: 2], [0, 0] => [Alternative, text: '', count: 1], [0, 0, 0] => [:literal, text: 'a' ], [0, 1] => [Alternative, text: '', count: 2], [0, 1, 0] => [:capture, to_s: '(b)' ], [0, 1, 1] => [:literal, text: 'c' ] include_examples 'parse', /(ab??|cd*|ef+)*|(gh|ij|kl)?/, [0] => [Alternation, text: '|', count: 2, quantified?: false], [0, 0] => [Alternative, text: '', count: 1, quantified?: false], [0, 0, 0] => [:capture, count: 1, quantified?: true ], [0, 0, 0, 0] => [Alternation, text: '|', count: 3 ], [0, 0, 0, 0, 0] => [Alternative, text: '', count: 2 ], [0, 0, 0, 0, 0, 0] => [:literal, to_s: 'a' ], [0, 0, 0, 0, 0, 1] => [:literal, to_s: 'b??' ], [0, 1] => [Alternative, text: '', count: 1, quantified?: false], [0, 1, 0] => [:capture, count: 1, quantified?: true ] end ammar-regexp_parser-0494e56/spec/parser/anchors_spec.rb000066400000000000000000000015221433525313500231630ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Anchor parsing') do include_examples 'parse', /^a/, 0 => [:anchor, :bol, Anchor::BOL] include_examples 'parse', /a$/, 1 => [:anchor, :eol, Anchor::EOL] include_examples 'parse', /\Aa/, 0 => [:anchor, :bos, Anchor::BOS] include_examples 'parse', /a\z/, 1 => [:anchor, :eos, Anchor::EOS] include_examples 'parse', /a\Z/, 1 => [:anchor, :eos_ob_eol, Anchor::EOSobEOL] include_examples 'parse', /a\b/, 1 => [:anchor, :word_boundary, Anchor::WordBoundary] include_examples 'parse', /a\B/, 1 => [:anchor, :nonword_boundary, Anchor::NonWordBoundary] include_examples 'parse', /a\G/, 1 => [:anchor, :match_start, Anchor::MatchStart] include_examples 'parse', /\\A/, 0 => [:escape, :backslash, EscapeSequence::Literal] end 
ammar-regexp_parser-0494e56/spec/parser/conditionals_spec.rb000066400000000000000000000057421433525313500242240ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Conditional parsing') do include_examples 'parse', /(?a)(?()T|F)/, [1] => [:conditional, :open, Conditional::Expression, to_s: '(?()T|F)', reference: 'A'], [1, 0] => [:conditional, :condition, Conditional::Condition, to_s: '()', reference: 'A'], [1, 1] => [:expression, :sequence, Conditional::Branch, to_s: 'T'], [1, 1, 0] => [:literal, text: 'T'], [1, 2] => [:expression, :sequence, Conditional::Branch, to_s: 'F'], [1, 2, 0] => [:literal, text: 'F'] include_examples 'parse', /(a)(?(1)T|F)/, [1] => [:conditional, :open, Conditional::Expression, to_s: '(?(1)T|F)', reference: 1], [1, 0] => [:conditional, :condition, Conditional::Condition, to_s: '(1)', reference: 1], [1, 1] => [:expression, :sequence, Conditional::Branch, to_s: 'T'], [1, 1, 0] => [:literal, text: 'T'], [1, 2] => [:expression, :sequence, Conditional::Branch, to_s: 'F'], [1, 2, 0] => [:literal, text: 'F'] include_examples 'parse', /(foo)(?(1)\d+|(\w)){42}/, [1] => [Conditional::Expression, quantified?: true, to_s: '(?(1)\d+|(\w)){42}'], [1, 0] => [Conditional::Condition, quantified?: false], [1, 1] => [Conditional::Branch, quantified?: false], [1, 1, 0] => [:digit, quantified?: true, to_s: '\d+'], [1, 2] => [Conditional::Branch, quantified?: false] # test nested and mixed with alternations include_examples 'parse', <<-EOS.gsub(/\s/, ''), ( (a) | (b) | ( ( ?(2) (c(d|e)+)? 
| ( ?(3) f | ( ?(4) (g|(h)(i)) ) ) ) ) ) EOS [0] => [Group::Capture, count: 1], [0, 0] => [Alternation, count: 3], [0, 0, 2] => [Alternative, count: 1], [0, 0, 2, 0] => [Group::Capture, count: 1], [0, 0, 2, 0, 0] => [Conditional::Expression, count: 3, conditional_level: 0], [0, 0, 2, 0, 0, 0] => [Conditional::Condition, to_s: '(2)', conditional_level: 1], [0, 0, 2, 0, 0, 1] => [Conditional::Branch, to_s: '(c(d|e)+)?', conditional_level: 1], [0, 0, 2, 0, 0, 2] => [Conditional::Branch, to_s: '(?(3)f|(?(4)(g|(h)(i))))', conditional_level: 1], [0, 0, 2, 0, 0, 2, 0] => [Conditional::Expression, count: 3, conditional_level: 1], [0, 0, 2, 0, 0, 2, 0, 0] => [Conditional::Condition, to_s: '(3)', conditional_level: 2], [0, 0, 2, 0, 0, 2, 0, 1] => [Conditional::Branch, count: 1, to_s: 'f', conditional_level: 2], [0, 0, 2, 0, 0, 2, 0, 1, 0] => [Literal, text: 'f', conditional_level: 2] # test empty branch include_examples 'parse', /(?a)(?()T|)/, [1] => [Conditional::Expression, count: 3, to_s: '(?()T|)'], [1, 2] => [Conditional::Branch, to_s: ''] end ammar-regexp_parser-0494e56/spec/parser/errors_spec.rb000066400000000000000000000022031433525313500230370ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Parsing errors') do let(:parser) { Regexp::Parser.new } before { parser.parse(/foo/) } # initializes ivars it('raises UnknownTokenTypeError for unknown token types') do expect { parser.send(:parse_token, Regexp::Token.new(:foo, :bar)) } .to raise_error(Regexp::Parser::UnknownTokenTypeError) end RSpec.shared_examples 'UnknownTokenError' do |type| it "raises for unkown tokens of type #{type}" do expect { parser.send(:parse_token, Regexp::Token.new(type, :foo)) } .to raise_error(Regexp::Parser::UnknownTokenError) end end include_examples 'UnknownTokenError', :anchor include_examples 'UnknownTokenError', :backref include_examples 'UnknownTokenError', :conditional include_examples 'UnknownTokenError', :free_space include_examples 'UnknownTokenError', :group 
include_examples 'UnknownTokenError', :meta include_examples 'UnknownTokenError', :nonproperty include_examples 'UnknownTokenError', :property include_examples 'UnknownTokenError', :quantifier include_examples 'UnknownTokenError', :set include_examples 'UnknownTokenError', :type end ammar-regexp_parser-0494e56/spec/parser/escapes_spec.rb000066400000000000000000000077721433525313500231660ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('EscapeSequence parsing') do es = EscapeSequence include_examples 'parse', /a\ac/, 1 => [:escape, :bell, es::Bell] include_examples 'parse', /a\ec/, 1 => [:escape, :escape, es::AsciiEscape] include_examples 'parse', /a\fc/, 1 => [:escape, :form_feed, es::FormFeed] include_examples 'parse', /a\nc/, 1 => [:escape, :newline, es::Newline] include_examples 'parse', /a\rc/, 1 => [:escape, :carriage, es::Return] include_examples 'parse', /a\tc/, 1 => [:escape, :tab, es::Tab] include_examples 'parse', /a\vc/, 1 => [:escape, :vertical_tab, es::VerticalTab] # meta character escapes include_examples 'parse', /a\.c/, 1 => [:escape, :dot, es::Literal] include_examples 'parse', /a\?c/, 1 => [:escape, :zero_or_one, es::Literal] include_examples 'parse', /a\*c/, 1 => [:escape, :zero_or_more, es::Literal] include_examples 'parse', /a\+c/, 1 => [:escape, :one_or_more, es::Literal] include_examples 'parse', /a\|c/, 1 => [:escape, :alternation, es::Literal] include_examples 'parse', /a\(c/, 1 => [:escape, :group_open, es::Literal] include_examples 'parse', /a\)c/, 1 => [:escape, :group_close, es::Literal] include_examples 'parse', /a\{c/, 1 => [:escape, :interval_open, es::Literal] include_examples 'parse', /a\}c/, 1 => [:escape, :interval_close, es::Literal] # unicode escapes include_examples 'parse', /a\u0640/, 1 => [:escape, :codepoint, es::Codepoint] include_examples 'parse', /a\u{41 1F60D}/, 1 => [:escape, :codepoint_list, es::CodepointList] include_examples 'parse', /a\u{10FFFF}/, 1 => [:escape, :codepoint_list, es::CodepointList] 
# hex escapes include_examples 'parse', /a\xFF/n, 1 => [:escape, :hex, es::Hex] # octal escapes include_examples 'parse', /a\177/n, 1 => [:escape, :octal, es::Octal] # test #char and #codepoint include_examples 'parse', /\n/, 0 => [char: "\n", codepoint: 10 ] include_examples 'parse', /\?/, 0 => [char: '?', codepoint: 63 ] include_examples 'parse', /\101/, 0 => [char: 'A', codepoint: 65 ] include_examples 'parse', /\x42/, 0 => [char: 'B', codepoint: 66 ] include_examples 'parse', /\u0043/, 0 => [char: 'C', codepoint: 67 ] include_examples 'parse', /\u{44 45}/, 0 => [chars: %w[D E], codepoints: [68, 69]] specify('codepoint_list #char and #codepoint raise errors') do exp = RP.parse(/\u{44 45}/)[0] expect { exp.char }.to raise_error(/#chars/) expect { exp.codepoint }.to raise_error(/#codepoints/) end # Meta/control espaces # # After the following fix in Ruby 3.1, a Regexp#source containing meta/control # escapes can only be set with the Regexp::new constructor. # In Regexp literals, these escapes are now pre-processed to hex escapes. 
# # https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 n = ->(regexp_body){ Regexp.new(regexp_body.force_encoding('ascii-8bit')) } include_examples 'parse', n.('\\\\\c2b'), 1 => [es::Control, text: '\c2', char: "\x12", codepoint: 18 ] include_examples 'parse', n.('\d\C-C\w'), 1 => [es::Control, text: '\C-C', char: "\x03", codepoint: 3 ] include_examples 'parse', n.('\Z\M-Z'), 1 => [es::Meta, text: '\M-Z', char: "\u00DA", codepoint: 218] include_examples 'parse', n.('\A\M-\C-X'), 1 => [es::MetaControl, text: '\M-\C-X', char: "\u0098", codepoint: 152] include_examples 'parse', n.('\A\M-\cX'), 1 => [es::MetaControl, text: '\M-\cX', char: "\u0098", codepoint: 152] include_examples 'parse', n.('\A\C-\M-X'), 1 => [es::MetaControl, text: '\C-\M-X', char: "\u0098", codepoint: 152] include_examples 'parse', n.('\A\c\M-X'), 1 => [es::MetaControl, text: '\c\M-X', char: "\u0098", codepoint: 152] end ammar-regexp_parser-0494e56/spec/parser/free_space_spec.rb000066400000000000000000000025511433525313500236250ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('FreeSpace parsing') do include_examples 'parse', /a b c/, [0] => [Literal, text: 'a b c'] include_examples 'parse', /a b c/x, [0] => [Literal, text: 'a'], [1] => [WhiteSpace, text: ' '], [2] => [Literal, text: 'b'], [3] => [WhiteSpace, text: ' '], [4] => [Literal, text: 'c'] include_examples 'parse', /a * b + c/x, [0] => [Literal, to_s: 'a*', quantified?: true], [1] => [WhiteSpace, text: ' '], [2] => [Literal, to_s: 'b+', quantified?: true], [3] => [WhiteSpace, text: ' '], [4] => [Literal, to_s: 'c'] include_examples 'parse', / a ? # One letter b {2,5} # Another one [c-g] + # A set (h|i|j) # A group /x, [1] => [Literal, to_s: 'a?', quantified?: true], [2] => [WhiteSpace], [3] => [Comment, to_s: "# One letter\n"], [7] => [Comment, to_s: "# Another one\n"], [11] => [Comment, to_s: "# A set\n"], [15] => [Comment, to_s: "# A group\n"] include_examples 'parse', / a # comment 1 ? 
( b # comment 2 # comment 3 + ) # comment 4 * /x, [1] => [Literal, to_s: 'a?', quantified?: true], [5] => [Group::Capture, quantified?: true], [5, 1] => [Literal, to_s: 'b+', quantified?: true], [5, 3] => [Comment, to_s: "# comment 2\n"] end ammar-regexp_parser-0494e56/spec/parser/groups_spec.rb000066400000000000000000000131751433525313500230540ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Group parsing') do include_examples 'parse', /(?=abc)(?!def)/, 0 => [:assertion, :lookahead, Assertion::Lookahead], 1 => [:assertion, :nlookahead, Assertion::NegativeLookahead] include_examples 'parse', /(?<=abc)(? [:assertion, :lookbehind, Assertion::Lookbehind], 1 => [:assertion, :nlookbehind, Assertion::NegativeLookbehind] include_examples 'parse', /a(?# is for apple)b(?# for boy)c(?# cat)/, 1 => [:group, :comment, Group::Comment], 3 => [:group, :comment, Group::Comment], 5 => [:group, :comment, Group::Comment] if ruby_version_at_least('2.4.1') include_examples 'parse', 'a(?~b)c(?~d)e', 1 => [:group, :absence, Group::Absence], 3 => [:group, :absence, Group::Absence] end include_examples 'parse', /(?m:a)/, 0 => [:group, :options, Group::Options, options: { m: true }, option_changes: { m: true }] # self-defeating group option include_examples 'parse', /(?m-m:a)/, 0 => [:group, :options, Group::Options, options: {}, option_changes: { m: false }] # activate one option in nested group include_examples 'parse', /(?x-mi:a(?m:b))/, 0 => [:group, :options, Group::Options, options: { x: true }, option_changes: { i: false, m: false, x: true }], [0, 1] => [:group, :options, Group::Options, options: { m: true, x: true }, option_changes: { m: true }] # deactivate one option in nested group include_examples 'parse', /(?ix-m:a(?-i:b))/, 0 => [:group, :options, Group::Options, options: { i: true, x: true }, option_changes: { i: true, m: false, x: true }], [0, 1] => [:group, :options, Group::Options, options: { x: true }, option_changes: { i: false }] # invert all options 
in nested group include_examples 'parse', /(?xi-m:a(?m-ix:b))/, 0 => [:group, :options, Group::Options, options: { i: true, x: true }, option_changes: { i: true, m: false, x: true }], [0, 1] => [:group, :options, Group::Options, options: { m: true }, option_changes: { i: false, m: true, x: false }] # nested options affect literal subexpressions include_examples 'parse', /(?x-mi:a(?m:b))/, [0, 0] => [:literal, :literal, Literal, text: 'a', options: { x: true }], [0, 1, 0] => [:literal, :literal, Literal, text: 'b', options: { m: true, x: true }] # option switching group include_examples 'parse', /a(?i-m)b/m, 0 => [:literal, :literal, Literal, text: 'a', options: { m: true }], 1 => [:group, :options_switch, Group::Options, options: { i: true }, option_changes: { i: true, m: false }], 2 => [:literal, :literal, Literal, text: 'b', options: { i: true }] # option switch in group include_examples 'parse', /(a(?i-m)b)c/m, 0 => [:group, :capture, Group::Capture, options: { m: true }], [0, 0] => [:literal, :literal, Literal, text: 'a', options: { m: true }], [0, 1] => [:group, :options_switch, Group::Options, options: { i: true }, option_changes: { i: true, m: false }], [0, 2] => [:literal, :literal, Literal, text: 'b', options: { i: true }], 1 => [:literal, :literal, Literal, text: 'c', options: { m: true }] # nested option switch in group include_examples 'parse', /((?i-m)(a(?-i)b))/m, [0, 1] => [:group, :capture, Group::Capture, options: { i: true }], [0, 1, 0] => [:literal, :literal, Literal, text: 'a', options: { i: true }], [0, 1, 1] => [:group, :options_switch, Group::Options, options: {}, option_changes: { i: false }], [0, 1, 2] => [:literal, :literal, Literal, text: 'b', options: {}] # options dau include_examples 'parse', /(?dua:abc)/, 0 => [:group, :options, Group::Options, options: { a: true }, option_changes: { a: true }] # nested options dau include_examples 'parse', /(?u:a(?d:b))/, 0 => [:group, :options, Group::Options, options: { u: true }, option_changes: { 
u: true }], [0, 1] => [:group, :options, Group::Options, options: { d: true }, option_changes: { d: true, u: false }], [0, 1, 0] => [:literal, :literal, Literal, text: 'b', options: { d: true }] # nested options da include_examples 'parse', /(?di-xm:a(?da-x:b))/, 0 => [:group, :options, Group::Options, options: { d: true, i:true }], [0, 1] => [:group, :options, Group::Options, options: { a: true, i: true }, option_changes: { a: true, d: false, x: false}], [0, 1, 0] => [:literal, :literal, Literal, text: 'b', options: { a: true, i: true }] specify('parse group number') do root = RP.parse(/(a)(?=b)((?:c)(d|(e)))/) expect(root.dig(0).number).to eq 1 expect(root.dig(1)).not_to respond_to(:number) expect(root.dig(2).number).to eq 2 expect(root.dig(2, 0)).not_to respond_to(:number) expect(root.dig(2, 1).number).to eq 3 expect(root.dig(2, 1, 0, 1, 0).number).to eq 4 end specify('parse group number at level') do root = RP.parse(/(a)(?=b)((?:c)(d|(e)))/) expect(root.dig(0).number_at_level).to eq 1 expect(root.dig(1)).not_to respond_to(:number_at_level) expect(root.dig(2).number_at_level).to eq 2 expect(root.dig(2, 0)).not_to respond_to(:number_at_level) expect(root.dig(2, 1).number_at_level).to eq 1 expect(root.dig(2, 1, 0, 1, 0).number_at_level).to eq 1 end specify('parse invalid option switch quantification') do expect { RP.parse('a(?i)+') }.to raise_error(/Can not quantify/) expect { RP.parse('a(?i)*') }.to raise_error(/Can not quantify/) expect { RP.parse('a(?i)?') }.to raise_error(/Can not quantify/) expect { RP.parse('a(?i){5}') }.to raise_error(/Can not quantify/) end end ammar-regexp_parser-0494e56/spec/parser/keep_spec.rb000066400000000000000000000003531433525313500224530ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Keep parsing') do include_examples 'parse', /ab\Kcd/, 1 => [:keep, :mark, Keep::Mark, text: '\K'] include_examples 'parse', /(a\K)/, [0, 1] => [:keep, :mark, Keep::Mark, text: '\K'] end 
ammar-regexp_parser-0494e56/spec/parser/options_spec.rb000066400000000000000000000014421433525313500232220ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('passing options to parse') do it 'raises if if parsing from a Regexp and options are passed' do expect { RP.parse(/a+/, options: ::Regexp::EXTENDED) }.to raise_error( ArgumentError, 'options cannot be supplied unless parsing a String' ) end it 'sets options if parsing from a String' do root = RP.parse('a+', options: ::Regexp::MULTILINE | ::Regexp::EXTENDED) expect(root.options).to eq(m: true, x: true) end it 'allows options to not be supplied when parsing from a Regexp' do root = RP.parse(/a+/ix) expect(root.options).to eq(i: true, x: true) end it 'has an empty option-hash when parsing from a String and passing no options' do root = RP.parse('a+') expect(root.options).to be_empty end end ammar-regexp_parser-0494e56/spec/parser/posix_classes_spec.rb000066400000000000000000000013651433525313500244120ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('PosixClass parsing') do include_examples 'parse', /[[:word:]]/, [0] => [CharacterSet, count: 1], [0, 0] => [:posixclass, :word, PosixClass, name: 'word', text: '[:word:]', negative?: false] include_examples 'parse', /[[:^word:]]/, [0] => [CharacterSet, count: 1], [0, 0] => [:nonposixclass, :word, PosixClass, name: 'word', text: '[:^word:]', negative?: true] # cases treated as regular subsets by Ruby, not as (invalid) posix classes include_examples 'parse', '[[:ab]c:]', [0, 0] => [CharacterSet, count: 3], [0, 0, 0] => [Literal, text: ':'] include_examples 'parse', '[[:a[b]c:]]', [0, 0] => [CharacterSet, count: 5], [0, 0, 0] => [Literal, text: ':'] end ammar-regexp_parser-0494e56/spec/parser/properties_spec.rb000066400000000000000000000076611433525313500237340ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Property parsing') do # test various notations supported by Ruby include_examples 'parse', '\p{sd}', 0 => 
[:property, :soft_dotted, negative?: false] include_examples 'parse', '\p{SD}', 0 => [:property, :soft_dotted, negative?: false] include_examples 'parse', '\p{Soft Dotted}', 0 => [:property, :soft_dotted, negative?: false] include_examples 'parse', '\p{Soft-Dotted}', 0 => [:property, :soft_dotted, negative?: false] include_examples 'parse', '\p{sOfT_dOtTeD}', 0 => [:property, :soft_dotted, negative?: false] # test ^-negation include_examples 'parse', '\p{^sd}', 0 => [:nonproperty, :soft_dotted, negative?: true] include_examples 'parse', '\p{^SD}', 0 => [:nonproperty, :soft_dotted, negative?: true] include_examples 'parse', '\p{^Soft Dotted}', 0 => [:nonproperty, :soft_dotted, negative?: true] include_examples 'parse', '\p{^Soft-Dotted}', 0 => [:nonproperty, :soft_dotted, negative?: true] include_examples 'parse', '\p{^sOfT_dOtTeD}', 0 => [:nonproperty, :soft_dotted, negative?: true] # test P-negation include_examples 'parse', '\P{sd}', 0 => [:nonproperty, :soft_dotted, negative?: true] include_examples 'parse', '\P{SD}', 0 => [:nonproperty, :soft_dotted, negative?: true] include_examples 'parse', '\P{Soft Dotted}', 0 => [:nonproperty, :soft_dotted, negative?: true] include_examples 'parse', '\P{Soft-Dotted}', 0 => [:nonproperty, :soft_dotted, negative?: true] include_examples 'parse', '\P{sOfT_dOtTeD}', 0 => [:nonproperty, :soft_dotted, negative?: true] # double negation is positive again include_examples 'parse', '\P{^sd}', 0 => [:property, :soft_dotted, negative?: false] include_examples 'parse', '\P{^SD}', 0 => [:property, :soft_dotted, negative?: false] include_examples 'parse', '\P{^Soft Dotted}', 0 => [:property, :soft_dotted, negative?: false] include_examples 'parse', '\P{^Soft-Dotted}', 0 => [:property, :soft_dotted, negative?: false] include_examples 'parse', '\P{^sOfT_dOtTeD}', 0 => [:property, :soft_dotted, negative?: false] # test #shortcut include_examples 'parse', '\p{soft_dotted}', 0 => [:property, :soft_dotted, shortcut: 'sd'] include_examples 
'parse', '\p{sd}', 0 => [:property, :soft_dotted, shortcut: 'sd'] include_examples 'parse', '\p{in_bengali}', 0 => [:property, :in_bengali, shortcut: nil] # test classification include_examples 'parse', '\p{age=5.2}', 0 => [UnicodeProperty::Age] include_examples 'parse', '\p{Math}', 0 => [UnicodeProperty::Derived] include_examples 'parse', '\p{Hiragana}', 0 => [UnicodeProperty::Script] include_examples 'parse', '\p{InArmenian}', 0 => [UnicodeProperty::Block] specify('parse abandoned newline property') do root = RP.parse('\p{newline}', 'ruby/1.9') expect(root.expressions.last).to be_a(UnicodeProperty::Base) expect { RP.parse('\p{newline}', 'ruby/2.0') }.to raise_error(Regexp::Syntax::NotImplementedError) end # cannot test older Rubies because of https://bugs.ruby-lang.org/issues/18686 if ruby_version_at_least('3.2.0') specify('parse all properties of current ruby') do unsupported = RegexpPropertyValues.all_for_current_ruby.reject do |prop| RP.parse("\\p{#{prop}}") rescue false end expect(unsupported).to be_empty end end # Ruby 2.3 supports a short prop name (sterm) without supporting the long name # of the same prop (sentence_terminal). Let's ignore this unique case. 
if ruby_version_at_least('2.4.0') specify('parse only properties of current ruby') do syntax = Regexp::Syntax.for("ruby/#{RUBY_VERSION}") excessive = syntax.features.fetch(:property, []).reject do |prop| begin Regexp.new("\\p{#{prop}}") rescue RegexpError, SyntaxError # error class depends on Ruby version false end end expect(excessive).to be_empty end end end ammar-regexp_parser-0494e56/spec/parser/quantifiers_spec.rb000066400000000000000000000066521433525313500240710ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Quantifier parsing') do include_examples 'parse', /a?b/, [0, :q] => [:zero_or_one, text: '?', mode: :greedy, min: 0, max: 1, ts: 1] include_examples 'parse', /a??b/, [0, :q] => [:zero_or_one, text: '??', mode: :reluctant, min: 0, max: 1, ts: 1] include_examples 'parse', /a?+b/, [0, :q] => [:zero_or_one, text: '?+', mode: :possessive, min: 0, max: 1, ts: 1] include_examples 'parse', /a*b/, [0, :q] => [:zero_or_more, text: '*', mode: :greedy, min: 0, max: -1, ts: 1] include_examples 'parse', /a*?b/, [0, :q] => [:zero_or_more, text: '*?', mode: :reluctant, min: 0, max: -1, ts: 1] include_examples 'parse', /a*+b/, [0, :q] => [:zero_or_more, text: '*+', mode: :possessive, min: 0, max: -1, ts: 1] include_examples 'parse', /a+b/, [0, :q] => [:one_or_more, text: '+', mode: :greedy, min: 1, max: -1, ts: 1] include_examples 'parse', /a+?b/, [0, :q] => [:one_or_more, text: '+?', mode: :reluctant, min: 1, max: -1, ts: 1] include_examples 'parse', /a++b/, [0, :q] => [:one_or_more, text: '++', mode: :possessive, min: 1, max: -1, ts: 1] include_examples 'parse', /a{2,4}b/, [0, :q] => [:interval, text: '{2,4}', mode: :greedy, min: 2, max: 4, ts: 1] include_examples 'parse', /a{2,}b/, [0, :q] => [:interval, text: '{2,}', mode: :greedy, min: 2, max: -1, ts: 1] include_examples 'parse', /a{,3}b/, [0, :q] => [:interval, text: '{,3}', mode: :greedy, min: 0, max: 3, ts: 1] include_examples 'parse', /a{4}b/, [0, :q] => [:interval, text: '{4}', mode: :greedy, 
min: 4, max: 4, ts: 1] include_examples 'parse', /a{004}b/, [0, :q] => [:interval, text: '{004}', mode: :greedy, min: 4, max: 4, ts: 1] # special case: exps with chained quantifiers are wrapped in implicit passive groups include_examples 'parse', /a+{2}{3}/, [0] => [:group, :passive, Group::Passive, implicit?: true, level: 0], [0, :q] => [:quantifier, :interval, Quantifier, text: '{3}', level: 0], [0, 0] => [:group, :passive, Group::Passive, implicit?: true, level: 1], [0, 0, :q] => [:quantifier, :interval, Quantifier, text: '{2}', level: 1], [0, 0, 0] => [:literal, :literal, Literal, text: 'a', level: 2], [0, 0, 0, :q] => [:quantifier, :one_or_more, Quantifier, text: '+', level: 2] # Ruby does not support modes for intervals, following `?` and `+` are read as chained quantifiers include_examples 'parse', /a{2,4}?b/, [0, :q] => [:quantifier, :zero_or_one, Quantifier, text: '?', mode: :greedy, min: 0, max: 1, ts: 6], [0, 0, :q] => [:quantifier, :interval, Quantifier, text: '{2,4}', mode: :greedy, min: 2, max: 4, ts: 1] include_examples 'parse', /a{2,4}+b/, [0, :q] => [:quantifier, :one_or_more, Quantifier, text: '+', mode: :greedy, min: 1, max: -1, ts: 6], [0, 0, :q] => [:quantifier, :interval, Quantifier, text: '{2,4}', mode: :greedy, min: 2, max: 4, ts: 1] specify('mode-checking methods') do exp = RP.parse(/a??/).first expect(exp).to be_reluctant expect(exp).to be_lazy expect(exp).not_to be_greedy expect(exp).not_to be_possessive expect(exp.quantifier).to be_reluctant expect(exp.quantifier).to be_lazy expect(exp.quantifier).not_to be_greedy expect(exp.quantifier).not_to be_possessive end end ammar-regexp_parser-0494e56/spec/parser/refcalls_spec.rb000066400000000000000000000106431433525313500233250ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Refcall parsing') do include_examples 'parse', /(abc)\1/, 1 => [Backreference::Number, reference: 1] include_examples 'parse', /(?abc)\k/, 1 => [Backreference::Name, name: 'X', reference: 'X'] 
include_examples 'parse', /(?abc)\k'X'/, 1 => [Backreference::Name, name: 'X', reference: 'X'] include_examples 'parse', /(abc)\k<1>/, 1 => [Backreference::Number, number: 1, reference: 1] include_examples 'parse', /(abc)\k<-1>/, 1 => [Backreference::NumberRelative, number: -1, reference: 1] include_examples 'parse', /(abc)\k'-1'/, 1 => [Backreference::NumberRelative, number: -1, reference: 1] include_examples 'parse', /(?abc)\g/, 1 => [Backreference::NameCall, reference: 'X'] include_examples 'parse', /(abc)\g<1>/, 1 => [Backreference::NumberCall, reference: 1] include_examples 'parse', '\g<0>', 0 => [Backreference::NumberCall, reference: 0] include_examples 'parse', /(abc)\g<-1>/, 1 => [Backreference::NumberCallRelative, reference: 1] include_examples 'parse', /\g<+1>(abc)/, 0 => [Backreference::NumberCallRelative, reference: 1] include_examples 'parse', /(?abc)\k/, 1 => [Backreference::NameRecursionLevel, name: 'X', recursion_level: 0] include_examples 'parse', /(abc)\k<1-0>/, 1 => [Backreference::NumberRecursionLevel, number: 1, recursion_level: 0] include_examples 'parse', /(abc)\k<1-0>/, 1 => [Backreference::NumberRecursionLevel, number: 1, recursion_level: 0] include_examples 'parse', /(abc)\k<-1+0>/, 1 => [Backreference::NumberRecursionLevel, number: -1, recursion_level: 0] include_examples 'parse', /(abc)\k<1+1>/, 1 => [Backreference::NumberRecursionLevel, number: 1, recursion_level: 1] include_examples 'parse', /(abc)\k<1-1>/, 1 => [Backreference::NumberRecursionLevel, number: 1, recursion_level: -1] # test #effective_number/#reference for complex cases include_examples 'parse', '(abc)(def)\k<-1>(ghi)\k<-3>\k<-1>', 2 => [:number_rel_ref, reference: 2], 4 => [:number_rel_ref, reference: 1], 5 => [:number_rel_ref, reference: 3] include_examples 'parse', '\g<+1>(abc)\g<+2>(def)(ghi)\g<-2>', 0 => [:number_rel_call, reference: 1], 2 => [:number_rel_call, reference: 3], 5 => [:number_rel_call, reference: 2] specify('parse backref referenced_expression') do root 
= RP.parse('(abc)(def)\\k<-1>(ghi)\\k<-3>\\k<-1>') exp1 = root[2] exp2 = root[4] exp3 = root[5] expect([exp1, exp2, exp3]).to all be_instance_of(Backreference::NumberRelative) expect(exp1.referenced_expression).to eq root[1] expect(exp1.referenced_expression.to_s).to eq '(def)' expect(exp2.referenced_expression).to eq root[0] expect(exp2.referenced_expression.to_s).to eq '(abc)' expect(exp3.referenced_expression).to eq root[3] expect(exp3.referenced_expression.to_s).to eq '(ghi)' end specify('parse backref call referenced_expression') do root = RP.parse('\\g<+1>(abc)\\g<+2>(def)(ghi)\\g<-2>') exp1 = root[0] exp2 = root[2] exp3 = root[5] expect([exp1, exp2, exp3]).to all be_instance_of(Backreference::NumberCallRelative) expect(exp1.referenced_expression).to eq root[1] expect(exp1.referenced_expression.to_s).to eq '(abc)' expect(exp2.referenced_expression).to eq root[4] expect(exp2.referenced_expression.to_s).to eq '(ghi)' expect(exp3.referenced_expression).to eq root[3] expect(exp3.referenced_expression.to_s).to eq '(def)' end specify('parse backref call referenced_expression root') do root = RP.parse('\g<0>') expect(root[0].referenced_expression).to eq root end specify('parse invalid reference') do expect { RP.parse('\1') }.to raise_error(/Invalid reference/) expect { RP.parse('(a)\2') }.to raise_error(/Invalid reference/) expect { RP.parse('\k<1>') }.to raise_error(/Invalid reference/) expect { RP.parse('\k<+1>') }.to raise_error(/Invalid reference/) expect { RP.parse('\k<+2>(a)') }.to raise_error(/Invalid reference/) expect { RP.parse('\k<-1>') }.to raise_error(/Invalid reference/) expect { RP.parse('(a)\k<-2>') }.to raise_error(/Invalid reference/) expect { RP.parse('\k<1+1>') }.to raise_error(/Invalid reference/) expect { RP.parse('\k<1-1>') }.to raise_error(/Invalid reference/) expect { RP.parse('\k') }.to raise_error(/Invalid reference/) expect { RP.parse('(?)\k') }.to raise_error(/Invalid reference/) end end 
ammar-regexp_parser-0494e56/spec/parser/set/000077500000000000000000000000001433525313500207625ustar00rootroot00000000000000ammar-regexp_parser-0494e56/spec/parser/set/intersections_spec.rb000066400000000000000000000060511433525313500252140ustar00rootroot00000000000000require 'spec_helper' # edge cases with `...-&&...` and `...&&-...` are checked in ./ranges_spec.rb RSpec.describe('CharacterSet::Intersection parsing') do include_examples 'parse', /[a&&z]/, [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 2], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 0, 0] => [:literal, text: 'a'], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 1, 0] => [:literal, text: 'z'] include_examples 'parse', /[a-z&&[^a]]/, [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 2], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 0, 0] => [CharacterSet::Range, count: 2], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 1, 0] => [CharacterSet, count: 1, negative?: true] include_examples 'parse', /[a&&a-z]/, [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 2], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 0, 0] => [:literal, text: 'a'], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 1, 0] => [CharacterSet::Range, count: 2] include_examples 'parse', /[a&&\w]/, [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 2], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 1, 0] => [:word, text: '\w'] include_examples 'parse', /[\h&&\w&&efg]/, [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 3], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 0, 0] => [:hex, text: '\h'], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 1, 0] => [:word, text: '\w'], [0, 0, 2] => 
[CharacterSet::IntersectedSequence, count: 3], [0, 0, 2, 0] => [:literal, text: 'e'], [0, 0, 2, 1] => [:literal, text: 'f'], [0, 0, 2, 2] => [:literal, text: 'g'] # Some edge-case patterns are evaluated with #match to make sure that # their matching behavior still reflects the way they are parsed. # #capturing_stderr is used to skip any warnings generated by this. specify('intersections behavior remains unchanged') do capturing_stderr do expect(/[a&&z]/).not_to match 'a' expect(/[a&&z]/).not_to match '&' expect(/[a&&z]/).not_to match 'z' expect(/[a-z&&[^a]]/).not_to match 'a' expect(/[a-z&&[^a]]/).not_to match '&' expect(/[a-z&&[^a]]/).to match 'b' expect(/[a&&a-z]/).to match 'a' expect(/[a&&a-z]/).not_to match '&' expect(/[a&&a-z]/).not_to match 'b' expect(/[a&&\w]/).to match 'a' expect(/[a&&\w]/).not_to match '&' expect(/[a&&\w]/).not_to match 'b' expect(/[\h&&\w&&efg]/).to match 'e' expect(/[\h&&\w&&efg]/).to match 'f' expect(/[\h&&\w&&efg]/).not_to match 'a' expect(/[\h&&\w&&efg]/).not_to match 'g' end end end ammar-regexp_parser-0494e56/spec/parser/set/ranges_spec.rb000066400000000000000000000066611433525313500236110ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('CharacterSet::Range parsing') do include_examples 'parse', '[a-z]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Range, count: 2], [0, 0, 0] => [:literal, text: 'a'], [0, 0, 1] => [:literal, text: 'z'] include_examples 'parse', '[\x00-\x22]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Range, count: 2], [0, 0, 0] => [:hex, text: '\x00'], [0, 0, 1] => [:hex, text: '\x22'] include_examples 'parse', '[\u{40 42}-\u1234]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Range, count: 2], [0, 0, 0] => [:codepoint_list, text: '\u{40 42}'], [0, 0, 1] => [:codepoint, text: '\u1234'] include_examples 'parse', '[--z]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Range, count: 2], [0, 0, 0] => [:literal, text: '-'], [0, 0, 1] => [:literal, 
text: 'z'] include_examples 'parse', '[!--]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Range, count: 2], [0, 0, 0] => [:literal, text: '!'], [0, 0, 1] => [:literal, text: '-'] include_examples 'parse', '[!-^]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Range, count: 2], [0, 0, 0] => [:literal, text: '!'], [0, 0, 1] => [:literal, text: '^'] # edge cases that are NOT treated as range include_examples 'parse', '[^-z]', [0] => [CharacterSet, count: 2], [0, 0] => [:literal, text: '-'], [0, 1] => [:literal, text: 'z'] include_examples 'parse', '[[\-ab]&&-bc]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 2], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 0, 0] => [CharacterSet, count: 3], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 3], [0, 0, 1, 0] => [:literal, text: '-'] include_examples 'parse', '[bc-&&[\-ab]]', [0] => [CharacterSet, count: 1], [0, 0] => [CharacterSet::Intersection, count: 2], [0, 0, 0] => [CharacterSet::IntersectedSequence, count: 3], [0, 0, 0, 2] => [:literal, text: '-'], [0, 0, 1] => [CharacterSet::IntersectedSequence, count: 1], [0, 0, 1, 0] => [CharacterSet, count: 3] # Some edge-case patterns are evaluated with #match to make sure that # their matching behavior still reflects the way they are parsed. # #capturing_stderr is used to skip any warnings generated by this. 
specify('ranges behavior remains unchanged') do capturing_stderr do expect(Regexp.new('[\x00-\x22]')).to match "\x11" expect(Regexp.new('[\u{40 42}-\u1234]')).to match "\u0600" expect(Regexp.new('[--z]')).to match 'a' expect(Regexp.new('[!--]')).to match '$' expect(Regexp.new('[!-^]')).to match '$' # edge cases that are NOT treated as ranges expect(Regexp.new('[^-z]')).to match 'a' expect(Regexp.new('[^-z]')).not_to match 'z' expect(Regexp.new('[[\-ab]&&-bc]')).to match '-' expect(Regexp.new('[[\-ab]&&-bc]')).to match 'b' expect(Regexp.new('[[\-ab]&&-bc]')).not_to match 'a' expect(Regexp.new('[[\-ab]&&-bc]')).not_to match 'c' expect(Regexp.new('[bc-&&[\-ab]]')).to match '-' expect(Regexp.new('[bc-&&[\-ab]]')).to match 'b' expect(Regexp.new('[bc-&&[\-ab]]')).not_to match 'a' expect(Regexp.new('[bc-&&[\-ab]]')).not_to match 'c' end end end ammar-regexp_parser-0494e56/spec/parser/sets_spec.rb000066400000000000000000000101341433525313500225030ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('CharacterSet parsing') do include_examples 'parse', /[ab]+/, [0] => [:set, :character, CharacterSet, text: '[', count: 2, quantified?: true], [0, 0] => [:literal, :literal, Literal, text: 'a', set_level: 1], [0, 1] => [:literal, :literal, Literal, text: 'b', set_level: 1] include_examples 'parse', /[a\dc]/, [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 1] => [:type, :digit, CharacterType::Digit] include_examples 'parse', /[a\bc]/, [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 1] => [:escape, :backspace, EscapeSequence::Backspace, text: '\b'] include_examples 'parse', '[a\x20c]', [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 1] => [:escape, :hex, EscapeSequence::Hex, text: '\x20'] include_examples 'parse', '[a\u0640c]', [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 1] => [:escape, :codepoint, EscapeSequence::Codepoint, text: '\u0640'] include_examples 'parse', '[a\u{41 1F60D}c]', [0] => 
[:set, :character, CharacterSet, text: '[', count: 3], [0, 1] => [:escape, :codepoint_list, EscapeSequence::CodepointList, text: '\u{41 1F60D}'] include_examples 'parse', '[[:digit:][:^lower:]]+', [0] => [:set, :character, CharacterSet, text: '[', count: 2], [0, 0] => [:posixclass, :digit, PosixClass, text: '[:digit:]'], [0, 1] => [:nonposixclass, :lower, PosixClass, text: '[:^lower:]'] include_examples 'parse', '[a[b[c]d]e]', [0] => [:set, :character, CharacterSet, text: '[', count: 3, set_level: 0], [0, 0] => [:literal, :literal, Literal, text: 'a', set_level: 1], [0, 1] => [:set, :character, CharacterSet, text: '[', count: 3, set_level: 1], [0, 2] => [:literal, :literal, Literal, text: 'e', set_level: 1], [0, 1, 1] => [:set, :character, CharacterSet, text: '[', count: 1, set_level: 2], [0, 1, 1, 0] => [:literal, :literal, Literal, text: 'c', set_level: 3] include_examples 'parse', '[a[^b[c]]]', [0] => [:set, :character, CharacterSet, text: '[', count: 2, set_level: 0, negative?: false], [0, 0] => [:literal, :literal, Literal, text: 'a', set_level: 1], [0, 1] => [:set, :character, CharacterSet, text: '[', count: 2, set_level: 1, negative?: true], [0, 1, 0] => [:literal, :literal, Literal, text: 'b', set_level: 2], [0, 1, 1] => [:set, :character, CharacterSet, text: '[', count: 1, set_level: 2, negative?: false], [0, 1, 1, 0] => [:literal, :literal, Literal, text: 'c', set_level: 3] include_examples 'parse', '[aaa]', [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 0] => [:literal, :literal, Literal, text: 'a'], [0, 1] => [:literal, :literal, Literal, text: 'a'], [0, 2] => [:literal, :literal, Literal, text: 'a'] include_examples 'parse', '[ ]', [0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 0] => [:literal, :literal, Literal, text: ' '], [0, 1] => [:literal, :literal, Literal, text: ' '], [0, 2] => [:literal, :literal, Literal, text: ' '] include_examples 'parse', '(?x)[ ]', # shouldn't merge whitespace even in x-mode [1] => 
[:set, :character, CharacterSet, text: '[', count: 3], [1, 0] => [:literal, :literal, Literal, text: ' '], [1, 1] => [:literal, :literal, Literal, text: ' '], [1, 2] => [:literal, :literal, Literal, text: ' '] include_examples 'parse', '[[.span-ll.]]', # collating sequences are disabled in Onigmo [0, 0] => [:set, :character, CharacterSet, text: '[', count: 7], [0, 0, 0] => [:literal, :literal, Literal, text: '.'] include_examples 'parse', '[[=e=]]', # character equivalents are disabled in Onigmo [0, 0] => [:set, :character, CharacterSet, text: '[', count: 3], [0, 0, 0] => [:literal, :literal, Literal, text: '='] end ammar-regexp_parser-0494e56/spec/parser/types_spec.rb000066400000000000000000000016351433525313500226770ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('CharacterType parsing') do include_examples 'parse', /a\dc/, 1 => [:type, :digit, CharacterType::Digit] include_examples 'parse', /a\Dc/, 1 => [:type, :nondigit, CharacterType::NonDigit] include_examples 'parse', /a\sc/, 1 => [:type, :space, CharacterType::Space] include_examples 'parse', /a\Sc/, 1 => [:type, :nonspace, CharacterType::NonSpace] include_examples 'parse', /a\hc/, 1 => [:type, :hex, CharacterType::Hex] include_examples 'parse', /a\Hc/, 1 => [:type, :nonhex, CharacterType::NonHex] include_examples 'parse', /a\wc/, 1 => [:type, :word, CharacterType::Word] include_examples 'parse', /a\Wc/, 1 => [:type, :nonword, CharacterType::NonWord] include_examples 'parse', 'a\Rc', 1 => [:type, :linebreak, CharacterType::Linebreak] include_examples 'parse', 'a\Xc', 1 => [:type, :xgrapheme, CharacterType::ExtendedGrapheme] end ammar-regexp_parser-0494e56/spec/scanner/000077500000000000000000000000001433525313500203245ustar00rootroot00000000000000ammar-regexp_parser-0494e56/spec/scanner/all_spec.rb000066400000000000000000000007501433525313500224350ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Scanner) do specify('scanner returns an array') do 
expect(RS.scan('abc')).to be_instance_of(Array) end specify('scanner returns tokens as arrays') do tokens = RS.scan('^abc+[^one]{2,3}\b\d\C-C$') expect(tokens).to all(be_a Array) expect(tokens.map(&:length)).to all(eq 5) end specify('scanner token count') do re = /^(one|two){2,3}([^d\]efm-qz\,\-]*)(ghi)+$/i expect(RS.scan(re).length).to eq 28 end end ammar-regexp_parser-0494e56/spec/scanner/anchors_spec.rb000066400000000000000000000022571433525313500233260ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Anchor scanning') do include_examples 'scan', '^abc', 0 => [:anchor, :bol, '^', 0, 1] include_examples 'scan', 'abc$', 1 => [:anchor, :eol, '$', 3, 4] include_examples 'scan', '\Aabc', 0 => [:anchor, :bos, '\A', 0, 2] include_examples 'scan', 'abc\z', 1 => [:anchor, :eos, '\z', 3, 5] include_examples 'scan', 'abc\Z', 1 => [:anchor, :eos_ob_eol, '\Z', 3, 5] include_examples 'scan', 'a\bc', 1 => [:anchor, :word_boundary, '\b', 1, 3] include_examples 'scan', 'a\Bc', 1 => [:anchor, :nonword_boundary, '\B', 1, 3] include_examples 'scan', 'a\Gc', 1 => [:anchor, :match_start, '\G', 1, 3] include_examples 'scan', "\\\\Ac", 0 => [:escape, :backslash, '\\\\', 0, 2] include_examples 'scan', "a\\\\z", 1 => [:escape, :backslash, '\\\\', 1, 3] include_examples 'scan', "a\\\\Z", 1 => [:escape, :backslash, '\\\\', 1, 3] include_examples 'scan', "a\\\\bc", 1 => [:escape, :backslash, '\\\\', 1, 3] include_examples 'scan', "a\\\\Bc", 1 => [:escape, :backslash, '\\\\', 1, 3] end ammar-regexp_parser-0494e56/spec/scanner/conditionals_spec.rb000066400000000000000000000171661433525313500243640ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Conditional scanning') do include_examples 'scan', /(a)(?(1)T|F)1/, 3 => [:conditional, :open, '(?', 3, 5] include_examples 'scan', /(a)(?(1)T|F)2/, 4 => [:conditional, :condition_open, '(', 5, 6] include_examples 'scan', /(a)(?(1)T|F)3/, 5 => [:conditional, :condition, '1', 6, 7] include_examples 'scan', 
/(a)(?(1)T|F)4/, 6 => [:conditional, :condition_close, ')', 7, 8] include_examples 'scan', /(a)(?(1)T|F)5/, 7 => [:literal, :literal, 'T', 8, 9] include_examples 'scan', /(a)(?(1)T|F)6/, 8 => [:conditional, :separator, '|', 9, 10] include_examples 'scan', /(a)(?(1)T|F)7/, 9 => [:literal, :literal, 'F', 10, 11] include_examples 'scan', /(a)(?(1)T|F)8/, 10 => [:conditional, :close, ')', 11, 12] include_examples 'scan', /(a)(?(1)TRUE)9/, 8 => [:conditional, :close, ')', 12, 13] include_examples 'scan', /(a)(?(1)TRUE|)10/, 8 => [:conditional, :separator, '|', 12, 13] include_examples 'scan', /(a)(?(1)TRUE|)11/, 9 => [:conditional, :close, ')', 13, 14] include_examples 'scan', /(?A)(?()T|F)1/, 5 => [:conditional, :condition, '', 10, 13] include_examples 'scan', /(?'N'A)(?('N')T|F)2/, 5 => [:conditional, :condition, "'N'", 10, 13] include_examples 'scan', /(a(b(c)))(?(1)(?(2)d|(?(3)e|f))|(?(2)(?(1)g|h)))/, 0 => [:group, :capture, '(', 0, 1], 1 => [:literal, :literal, 'a', 1, 2], 2 => [:group, :capture, '(', 2, 3], 3 => [:literal, :literal, 'b', 3, 4], 4 => [:group, :capture, '(', 4, 5], 5 => [:literal, :literal, 'c', 5, 6], 6 => [:group, :close, ')', 6, 7], 7 => [:group, :close, ')', 7, 8], 8 => [:group, :close, ')', 8, 9], 9 => [:conditional, :open, '(?', 9, 11], 10 => [:conditional, :condition_open, '(', 11, 12], 11 => [:conditional, :condition, '1', 12, 13], 12 => [:conditional, :condition_close, ')', 13, 14], 13 => [:conditional, :open, '(?', 14, 16], 14 => [:conditional, :condition_open, '(', 16, 17], 15 => [:conditional, :condition, '2', 17, 18], 16 => [:conditional, :condition_close, ')', 18, 19], 17 => [:literal, :literal, 'd', 19, 20], 18 => [:conditional, :separator, '|', 20, 21], 19 => [:conditional, :open, '(?', 21, 23], 20 => [:conditional, :condition_open, '(', 23, 24], 21 => [:conditional, :condition, '3', 24, 25], 22 => [:conditional, :condition_close, ')', 25, 26], 23 => [:literal, :literal, 'e', 26, 27], 24 => [:conditional, :separator, '|', 27, 28], 25 
=> [:literal, :literal, 'f', 28, 29], 26 => [:conditional, :close, ')', 29, 30], 27 => [:conditional, :close, ')', 30, 31], 28 => [:conditional, :separator, '|', 31, 32], 29 => [:conditional, :open, '(?', 32, 34], 30 => [:conditional, :condition_open, '(', 34, 35], 31 => [:conditional, :condition, '2', 35, 36], 32 => [:conditional, :condition_close, ')', 36, 37], 33 => [:conditional, :open, '(?', 37, 39], 34 => [:conditional, :condition_open, '(', 39, 40], 35 => [:conditional, :condition, '1', 40, 41], 36 => [:conditional, :condition_close, ')', 41, 42], 37 => [:literal, :literal, 'g', 42, 43], 38 => [:conditional, :separator, '|', 43, 44], 39 => [:literal, :literal, 'h', 44, 45], 40 => [:conditional, :close, ')', 45, 46], 41 => [:conditional, :close, ')', 46, 47], 42 => [:conditional, :close, ')', 47, 48] include_examples 'scan', /((a)|(b)|((?(2)(c(d|e)+)?|(?(3)f|(?(4)(g|(h)(i)))))))/, 0 => [:group, :capture, '(', 0, 1], 1 => [:group, :capture, '(', 1, 2], 2 => [:literal, :literal, 'a', 2, 3], 3 => [:group, :close, ')', 3, 4], 4 => [:meta, :alternation, '|', 4, 5], 5 => [:group, :capture, '(', 5, 6], 6 => [:literal, :literal, 'b', 6, 7], 7 => [:group, :close, ')', 7, 8], 8 => [:meta, :alternation, '|', 8, 9], 9 => [:group, :capture, '(', 9, 10], 10 => [:conditional, :open, '(?', 10, 12], 11 => [:conditional, :condition_open, '(', 12, 13], 12 => [:conditional, :condition, '2', 13, 14], 13 => [:conditional, :condition_close, ')', 14, 15], 14 => [:group, :capture, '(', 15, 16], 15 => [:literal, :literal, 'c', 16, 17], 16 => [:group, :capture, '(', 17, 18], 17 => [:literal, :literal, 'd', 18, 19], 18 => [:meta, :alternation, '|', 19, 20], 19 => [:literal, :literal, 'e', 20, 21], 20 => [:group, :close, ')', 21, 22], 21 => [:quantifier, :one_or_more, '+', 22, 23], 22 => [:group, :close, ')', 23, 24], 23 => [:quantifier, :zero_or_one, '?', 24, 25], 24 => [:conditional, :separator, '|', 25, 26], 25 => [:conditional, :open, '(?', 26, 28], 26 => [:conditional, 
:condition_open, '(', 28, 29], 27 => [:conditional, :condition, '3', 29, 30], 28 => [:conditional, :condition_close, ')', 30, 31], 29 => [:literal, :literal, 'f', 31, 32], 30 => [:conditional, :separator, '|', 32, 33], 31 => [:conditional, :open, '(?', 33, 35], 32 => [:conditional, :condition_open, '(', 35, 36], 33 => [:conditional, :condition, '4', 36, 37], 34 => [:conditional, :condition_close, ')', 37, 38], 35 => [:group, :capture, '(', 38, 39], 36 => [:literal, :literal, 'g', 39, 40], 37 => [:meta, :alternation, '|', 40, 41], 38 => [:group, :capture, '(', 41, 42], 39 => [:literal, :literal, 'h', 42, 43], 40 => [:group, :close, ')', 43, 44], 41 => [:group, :capture, '(', 44, 45], 42 => [:literal, :literal, 'i', 45, 46], 43 => [:group, :close, ')', 46, 47], 44 => [:group, :close, ')', 47, 48], 45 => [:conditional, :close, ')', 48, 49], 46 => [:conditional, :close, ')', 49, 50], 47 => [:conditional, :close, ')', 50, 51], 48 => [:group, :close, ')', 51, 52], 49 => [:group, :close, ')', 52, 53] include_examples 'scan', /(a)(?(1)(b|c|d)|(e|f|g))(h)(?(2)(i|j|k)|(l|m|n))|o|p/, 9 => [:meta, :alternation, '|', 10, 11], 11 => [:meta, :alternation, '|', 12, 13], 14 => [:conditional, :separator, '|', 15, 16], 17 => [:meta, :alternation, '|', 18, 19], 19 => [:meta, :alternation, '|', 20, 21], 32 => [:meta, :alternation, '|', 34, 35], 34 => [:meta, :alternation, '|', 36, 37], 37 => [:conditional, :separator, '|', 39, 40], 40 => [:meta, :alternation, '|', 42, 43], 42 => [:meta, :alternation, '|', 44, 45], 46 => [:meta, :alternation, '|', 48, 49], 48 => [:meta, :alternation, '|', 50, 51] end ammar-regexp_parser-0494e56/spec/scanner/delimiters_spec.rb000066400000000000000000000034171433525313500240310ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Literal delimiter scanning') do include_examples 'scan', '}', 0 => [:literal, :literal, '}', 0, 1] include_examples 'scan', '}}', 0 => [:literal, :literal, '}}', 0, 2] include_examples 'scan', '{', 0 => [:literal, 
:literal, '{', 0, 1] include_examples 'scan', '{{', 0 => [:literal, :literal, '{{', 0, 2] include_examples 'scan', '{}', 0 => [:literal, :literal, '{}', 0, 2] include_examples 'scan', '}{', 0 => [:literal, :literal, '}{', 0, 2] include_examples 'scan', '}{+', 0 => [:literal, :literal, '}{', 0, 2] include_examples 'scan', '{{var}}', 0 => [:literal, :literal, '{{var}}', 0, 7] include_examples 'scan', 'a{1,2', 0 => [:literal, :literal, 'a{1,2', 0, 5] include_examples 'scan', '({.+})', 0 => [:group, :capture, '(', 0, 1], 1 => [:literal, :literal, '{', 1, 2], 2 => [:meta, :dot, '.', 2, 3], 3 => [:quantifier, :one_or_more, '+', 3, 4], 4 => [:literal, :literal, '}', 4, 5], 5 => [:group, :close, ')', 5, 6] include_examples 'scan', ']', 0 => [:literal, :literal, ']', 0, 1] include_examples 'scan', ']]', 0 => [:literal, :literal, ']]', 0, 2] include_examples 'scan', ']\[', 0 => [:literal, :literal, ']', 0, 1], 1 => [:escape, :set_open, '\[', 1, 3] include_examples 'scan', '()', 0 => [:group, :capture, '(', 0, 1], 1 => [:group, :close, ')', 1, 2] end ammar-regexp_parser-0494e56/spec/scanner/errors_spec.rb000066400000000000000000000125441433525313500232050ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Scanner) do RSpec.shared_examples 'scan error' do |error, issue, source| it "raises #{error} for #{issue} `#{source}`" do expect { RS.scan(source) }.to raise_error(error) end end include_examples 'scan error', RS::PrematureEndError, 'unbalanced set', '[a' include_examples 'scan error', RS::PrematureEndError, 'unbalanced set', '[[:alpha:]' include_examples 'scan error', RS::PrematureEndError, 'unbalanced group', '(abc' include_examples 'scan error', RS::PrematureEndError, 'eof in property', '\p{asci' include_examples 'scan error', RS::PrematureEndError, 'incomplete property', '\p{ascii abc' include_examples 'scan error', RS::PrematureEndError, 'eof options', '(?mix' include_examples 'scan error', RS::PrematureEndError, 'eof escape', '\\' include_examples 
'scan error', RS::PrematureEndError, 'eof in hex escape', '\x' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u0' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u00' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u000' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u{' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u{00' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u{0000' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u{0000 ' include_examples 'scan error', RS::PrematureEndError, 'eof in cp escape', '\u{0000 0000' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\c' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\c\M' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\c\M-' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\C' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\C-' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\C-\M' include_examples 'scan error', RS::PrematureEndError, 'eof in c-seq', '\C-\M-' include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M' include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M-' include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M-\\' include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M-\c' include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M-\C' include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M-\C-' include_examples 'scan error', RS::InvalidSequenceError, 'invalid hex', '\xZ' include_examples 'scan error', RS::InvalidSequenceError, 'invalid hex', '\xZ0' include_examples 'scan error', 
RS::InvalidSequenceError, 'invalid c-seq', '\cü' include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\c\M-ü' include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\C-ü' include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\C-\M-ü' include_examples 'scan error', RS::InvalidSequenceError, 'invalid m-seq', '\M-ü' include_examples 'scan error', RS::InvalidSequenceError, 'invalid m-seq', '\M-\cü' include_examples 'scan error', RS::InvalidSequenceError, 'invalid m-seq', '\M-\C-ü' include_examples 'scan error', RS::ScannerError, 'invalid c-seq', '\Ca' include_examples 'scan error', RS::ScannerError, 'invalid m-seq', '\Ma' include_examples 'scan error', RS::InvalidGroupError, 'invalid group', "(?'')" include_examples 'scan error', RS::InvalidGroupError, 'invalid group', "(?''empty-name)" include_examples 'scan error', RS::InvalidGroupError, 'invalid group', '(?<>)' include_examples 'scan error', RS::InvalidGroupError, 'invalid group', '(?<>empty-name)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid option', '(?foo)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid option', '(?mix abc)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid option', '(?mix^bc' include_examples 'scan error', RS::InvalidGroupOption, 'invalid option', '(?)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid neg option', '(?-foo)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid neg option', '(?-u)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid neg option', '(?-mixu)' include_examples 'scan error', RS::InvalidBackrefError, 'empty backref', '\k<>' include_examples 'scan error', RS::InvalidBackrefError, 'empty backref', '\k\'\'' include_examples 'scan error', RS::InvalidBackrefError, 'empty refcall', '\g<>' include_examples 'scan error', RS::InvalidBackrefError, 'empty refcall', '\g\'\'' include_examples 'scan error', 
RS::UnknownUnicodePropertyError, 'unknown property', '\p{foobar}' include_examples 'scan error', RS::UnknownPosixClassError, 'unknown POSIX class [::]', '[[::]]' include_examples 'scan error', RS::UnknownPosixClassError, 'unknown POSIX class [:^:]', '[[:^:]]' include_examples 'scan error', RS::UnknownPosixClassError, 'unknown POSIX class [:x:]', '[[:x:]]' include_examples 'scan error', RS::UnknownPosixClassError, 'unknown POSIX class', '[[:^x:]]' include_examples 'scan error', RS::UnknownPosixClassError, 'unknown POSIX class', '[[:WORD:]]' end ammar-regexp_parser-0494e56/spec/scanner/escapes_spec.rb000066400000000000000000000124661433525313500233170ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Escape scanning') do include_examples 'scan', /c\at/, 1 => [:escape, :bell, '\a', 1, 3] # not an escape outside a character set include_examples 'scan', /c\bt/, 1 => [:anchor, :word_boundary, '\b', 1, 3] include_examples 'scan', /c\ft/, 1 => [:escape, :form_feed, '\f', 1, 3] include_examples 'scan', /c\nt/, 1 => [:escape, :newline, '\n', 1, 3] include_examples 'scan', /c\tt/, 1 => [:escape, :tab, '\t', 1, 3] include_examples 'scan', /c\vt/, 1 => [:escape, :vertical_tab, '\v', 1, 3] # ineffectual literal escapes # these cause "Unknown escape" warnings in Ruby for ascii chars, # and simply drop the backslash for non-ascii chars (/\ü/.inspect == '/ü/'). # In terms of matching, Ruby treats them both like non-escaped literals. 
include_examples 'scan', 'c\qt', 1 => [:escape, :literal, '\q', 1, 3] include_examples 'scan', 'a\üc', 1 => [:escape, :literal, '\ü', 1, 3] include_examples 'scan', 'a\😋c', 1 => [:escape, :literal, '\😋', 1, 3] # these incomplete ref/call sequences are treated as literal escapes by Ruby include_examples 'scan', 'c\gt', 1 => [:escape, :literal, '\g', 1, 3] include_examples 'scan', 'c\kt', 1 => [:escape, :literal, '\k', 1, 3] include_examples 'scan', 'a\012c', 1 => [:escape, :octal, '\012', 1, 5] include_examples 'scan', 'a\0124', 1 => [:escape, :octal, '\012', 1, 5] include_examples 'scan', '\712+7', 0 => [:escape, :octal, '\712', 0, 4] include_examples 'scan', 'a\xA', 1 => [:escape, :hex, '\xA', 1, 4] include_examples 'scan', 'a\x24c', 1 => [:escape, :hex, '\x24', 1, 5] include_examples 'scan', 'a\x0640c', 1 => [:escape, :hex, '\x06', 1, 5] include_examples 'scan', 'a\u0640c', 1 => [:escape, :codepoint, '\u0640', 1, 7] include_examples 'scan', 'a\u{640 0641}c', 1 => [:escape, :codepoint_list, '\u{640 0641}', 1, 13] include_examples 'scan', 'a\u{10FFFF}c', 1 => [:escape, :codepoint_list, '\u{10FFFF}', 1, 11] include_examples 'scan', 'ab\\\xcd', 1 => [:escape, :backslash, '\\\\', 2, 4] include_examples 'scan', 'ab\\\0cd', 1 => [:escape, :backslash, '\\\\', 2, 4] include_examples 'scan', 'ab\\\Kcd', 1 => [:escape, :backslash, '\\\\', 2, 4] include_examples 'scan', 'ab\^cd', 1 => [:escape, :bol, '\^', 2, 4] include_examples 'scan', 'ab\$cd', 1 => [:escape, :eol, '\$', 2, 4] include_examples 'scan', 'ab\[cd', 1 => [:escape, :set_open, '\[', 2, 4] # Meta/control espaces # # After the following fix in Ruby 3.1, a Regexp#source containing meta/control # escapes can only be set with the Regexp::new constructor. # In Regexp literals, these escapes are now pre-processed to hex escapes. 
# # https://github.com/ruby/ruby/commit/11ae581a4a7f5d5f5ec6378872eab8f25381b1b9 n = ->(regexp_body){ Regexp.new(regexp_body.force_encoding('ascii-8bit')) } include_examples 'scan', 'a\cBc', 1 => [:escape, :control, '\cB', 1, 4] include_examples 'scan', 'a\c^c', 1 => [:escape, :control, '\c^', 1, 4] include_examples 'scan', 'a\c\n', 1 => [:escape, :control, '\c\n', 1, 5] include_examples 'scan', 'a\c\\\\b', 1 => [:escape, :control, '\c\\\\', 1, 5] include_examples 'scan', 'a\C-bc', 1 => [:escape, :control, '\C-b', 1, 5] include_examples 'scan', 'a\C-^b', 1 => [:escape, :control, '\C-^', 1, 5] include_examples 'scan', 'a\C-\nb', 1 => [:escape, :control, '\C-\n', 1, 6] include_examples 'scan', 'a\C-\\\\b', 1 => [:escape, :control, '\C-\\\\', 1, 6] include_examples 'scan', n.('a\c\M-Bc'), 1 => [:escape, :control, '\c\M-B', 1, 7] include_examples 'scan', n.('a\C-\M-Bc'), 1 => [:escape, :control, '\C-\M-B', 1, 8] include_examples 'scan', n.('a\M-Bc'), 1 => [:escape, :meta_sequence, '\M-B', 1, 5] include_examples 'scan', n.('a\M-\cBc'), 1 => [:escape, :meta_sequence, '\M-\cB', 1, 7] include_examples 'scan', n.('a\M-\c^'), 1 => [:escape, :meta_sequence, '\M-\c^', 1, 7] include_examples 'scan', n.('a\M-\c\n'), 1 => [:escape, :meta_sequence, '\M-\c\n', 1, 8] include_examples 'scan', n.('a\M-\c\\\\'), 1 => [:escape, :meta_sequence, '\M-\c\\\\', 1, 8] include_examples 'scan', n.('a\M-\C-Bc'), 1 => [:escape, :meta_sequence, '\M-\C-B', 1, 8] include_examples 'scan', n.('a\M-\C-\\\\'), 1 => [:escape, :meta_sequence, '\M-\C-\\\\', 1, 9] end ammar-regexp_parser-0494e56/spec/scanner/free_space_spec.rb000066400000000000000000000162701433525313500237650ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('FreeSpace scanning') do describe('scan free space tokens') do let(:tokens) { RS.scan(/ a b ? 
c * d {2,3} e + | f + /x) } 0.upto(24).select(&:even?).each do |i| it "scans #{i} as free space" do expect(tokens[i][0]).to eq :free_space expect(tokens[i][1]).to eq :whitespace end end 0.upto(24).reject(&:even?).each do |i| it "does not scan #{i} as free space" do expect(tokens[i][0]).not_to eq :free_space expect(tokens[i][1]).not_to eq :whitespace end end it 'sets the correct text' do [0, 2, 10, 14].each { |i| expect(tokens[i][2]).to eq "\n " } [4, 6, 8, 12].each { |i| expect(tokens[i][2]).to eq ' ' } end end describe('scan free space comments') do include_examples 'scan', / a + # A + comment b ? # B ? comment c {2,3} # C {2,3} comment d + | e + # D|E comment /x, 5 => [:free_space, :comment, "# A + comment\n", 11, 25], 11 => [:free_space, :comment, "# B ? comment\n", 37, 51], 17 => [:free_space, :comment, "# C {2,3} comment\n", 66, 84], 29 => [:free_space, :comment, "# D|E comment\n", 100, 114] # single line / no trailing newline (c.f. issue #66) include_examples 'scan', /a # b/x, 0 => [:literal, :literal, 'a', 0, 1], 1 => [:free_space, :whitespace, ' ', 1, 2], 2 => [:free_space, :comment, "# b", 2, 5] # without spaces (c.f. 
issue #66) include_examples 'scan', /a#b/x, 0 => [:literal, :literal, 'a', 0, 1], 1 => [:free_space, :comment, "#b", 1, 3] end describe('scan free space inlined') do include_examples 'scan', /a b(?x:c d e)f g/, 0 => [:literal, :literal, 'a b', 0, 3], 1 => [:group, :options, '(?x:', 3, 7], 2 => [:literal, :literal, 'c', 7, 8], 3 => [:free_space, :whitespace, ' ', 8, 9], 4 => [:literal, :literal, 'd', 9, 10], 5 => [:free_space, :whitespace, ' ', 10, 11], 6 => [:literal, :literal, 'e', 11, 12], 7 => [:group, :close, ')', 12, 13], 8 => [:literal, :literal, 'f g', 13, 16] end describe('scan free space nested') do include_examples 'scan', /a b(?x:c d(?-x:e f)g h)i j/, 0 => [:literal, :literal, 'a b', 0, 3], 1 => [:group, :options, '(?x:', 3, 7], 2 => [:literal, :literal, 'c', 7, 8], 3 => [:free_space, :whitespace, ' ', 8, 9], 4 => [:literal, :literal, 'd', 9, 10], 5 => [:group, :options, '(?-x:', 10, 15], 6 => [:literal, :literal, 'e f', 15, 18], 7 => [:group, :close, ')', 18, 19], 8 => [:literal, :literal, 'g', 19, 20], 9 => [:free_space, :whitespace, ' ', 20, 21], 10 => [:literal, :literal, 'h', 21, 22], 11 => [:group, :close, ')', 22, 23], 12 => [:literal, :literal, 'i j', 23, 26] end describe('scan free space nested groups') do include_examples 'scan', /(a (b(?x: (c d) (?-x:(e f) )g) h)i j)/, 0 => [:group, :capture, '(', 0, 1], 1 => [:literal, :literal, 'a ', 1, 3], 2 => [:group, :capture, '(', 3, 4], 3 => [:literal, :literal, 'b', 4, 5], 4 => [:group, :options, '(?x:', 5, 9], 5 => [:free_space, :whitespace, ' ', 9, 10], 6 => [:group, :capture, '(', 10, 11], 7 => [:literal, :literal, 'c', 11, 12], 8 => [:free_space, :whitespace, ' ', 12, 13], 9 => [:literal, :literal, 'd', 13, 14], 10 => [:group, :close, ')', 14, 15], 11 => [:free_space, :whitespace, ' ', 15, 16], 12 => [:group, :options, '(?-x:', 16, 21], 13 => [:group, :capture, '(', 21, 22], 14 => [:literal, :literal, 'e f', 22, 25], 15 => [:group, :close, ')', 25, 26], 16 => [:literal, :literal, ' ', 26, 27], 17 
=> [:group, :close, ')', 27, 28], 18 => [:literal, :literal, 'g', 28, 29], 19 => [:group, :close, ')', 29, 30], 20 => [:literal, :literal, ' h', 30, 32], 21 => [:group, :close, ')', 32, 33], 22 => [:literal, :literal, 'i j', 33, 36], 23 => [:group, :close, ')', 36, 37] end describe('scan free space switch groups') do include_examples 'scan', /(a (b((?x) (c d) ((?-x)(e f) )g) h)i j)/, 0 => [:group, :capture, '(', 0, 1], 1 => [:literal, :literal, 'a ', 1, 3], 2 => [:group, :capture, '(', 3, 4], 3 => [:literal, :literal, 'b', 4, 5], 4 => [:group, :capture, '(', 5, 6], 5 => [:group, :options_switch, '(?x', 6, 9], 6 => [:group, :close, ')', 9, 10], 7 => [:free_space, :whitespace, ' ', 10, 11], 8 => [:group, :capture, '(', 11, 12], 9 => [:literal, :literal, 'c', 12, 13], 10 => [:free_space, :whitespace, ' ', 13, 14], 11 => [:literal, :literal, 'd', 14, 15], 12 => [:group, :close, ')', 15, 16], 13 => [:free_space, :whitespace, ' ', 16, 17], 14 => [:group, :capture, '(', 17, 18], 15 => [:group, :options_switch, '(?-x', 18, 22], 16 => [:group, :close, ')', 22, 23], 17 => [:group, :capture, '(', 23, 24], 18 => [:literal, :literal, 'e f', 24, 27], 19 => [:group, :close, ')', 27, 28], 20 => [:literal, :literal, ' ', 28, 29], 21 => [:group, :close, ')', 29, 30], 22 => [:literal, :literal, 'g', 30, 31], 23 => [:group, :close, ')', 31, 32], 24 => [:literal, :literal, ' h', 32, 34], 25 => [:group, :close, ')', 34, 35], 26 => [:literal, :literal, 'i j', 35, 38], 27 => [:group, :close, ')', 38, 39] end describe('scanning `#` in regular (non-x mode)') do # c.f. 
issue 70 include_examples 'scan', /a#bcd/, 0 => [:literal, :literal, 'a#bcd', 0, 5] include_examples 'scan', /a # bcd/, 0 => [:literal, :literal, 'a # bcd', 0, 7] include_examples 'scan', /a#\d/, 0 => [:literal, :literal, 'a#', 0, 2], 1 => [:type, :digit, '\d', 2, 4] include_examples 'scan', /a # \d/, 0 => [:literal, :literal, 'a # ', 0, 4], 1 => [:type, :digit, '\d', 4, 6] include_examples 'scan', /a#()/, 0 => [:literal, :literal, 'a#', 0, 2], 1 => [:group, :capture, '(', 2, 3] include_examples 'scan', /a # ()/, 0 => [:literal, :literal, 'a # ', 0, 4], 1 => [:group, :capture, '(', 4, 5] end end ammar-regexp_parser-0494e56/spec/scanner/groups_spec.rb000066400000000000000000000122251433525313500232040ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Group scanning') do # Group types include_examples 'scan', '(?>abc)', 0 => [:group, :atomic, '(?>', 0, 3] include_examples 'scan', '(abc)', 0 => [:group, :capture, '(', 0, 1] # Named groups # Names that start with a hyphen or digit (ascii or other) are invalid. # ")" is only allowed as first char of the name. # "!" is allowed anywhere, but ? is treated as a lookbehind by Ruby. 
include_examples 'scan', '(?abc)', 0 => [:group, :named_ab, '(?', 0, 8] include_examples 'scan', "(?'name'abc)", 0 => [:group, :named_sq, "(?'name'", 0, 8] include_examples 'scan', '(?abc)', 0 => [:group, :named_ab, '(?', 0,10] include_examples 'scan', "(?'name_1'abc)", 0 => [:group, :named_sq, "(?'name_1'", 0,10] include_examples 'scan', '(?abc)', 0 => [:group, :named_ab, '(?', 0,10] include_examples 'scan', "(?'name-1'abc)", 0 => [:group, :named_sq, "(?'name-1'", 0,10] include_examples 'scan', "(?abc)", 0 => [:group, :named_ab, "(?", 0,10] include_examples 'scan', "(?'name>1'abc)", 0 => [:group, :named_sq, "(?'name>1'", 0,10] include_examples 'scan', '(?<üüuuüü>abc)', 0 => [:group, :named_ab, '(?<üüuuüü>', 0,10] include_examples 'scan', "(?'üüuuüü'abc)", 0 => [:group, :named_sq, "(?'üüuuüü'", 0,10] include_examples 'scan', "(?<😋1234😋>abc)", 0 => [:group, :named_ab, "(?<😋1234😋>", 0,10] include_examples 'scan', "(?'😋1234😋'abc)", 0 => [:group, :named_sq, "(?'😋1234😋'", 0,10] include_examples 'scan', "(?<)x>y)", 0 => [:group, :named_ab, '(?<)x>', 0, 6] include_examples 'scan', "(?')x'y)", 0 => [:group, :named_sq, "(?')x'", 0, 6] include_examples 'scan', "(?'!x'y)", 0 => [:group, :named_sq, "(?'!x'", 0, 6] # Passive groups include_examples 'scan', '(?:abc)', 0 => [:group, :passive, '(?:', 0, 3] include_examples 'scan', '(?:)', 0 => [:group, :passive, '(?:', 0, 3] include_examples 'scan', '(?::)', 0 => [:group, :passive, '(?:', 0, 3] # Comments include_examples 'scan', '(?#abc)', 0 => [:group, :comment, '(?#abc)', 0, 7] include_examples 'scan', '(?#)', 0 => [:group, :comment, '(?#)', 0, 4] # Assertions include_examples 'scan', '(?=abc)', 0 => [:assertion, :lookahead, '(?=', 0, 3] include_examples 'scan', '(?!abc)', 0 => [:assertion, :nlookahead, '(?!', 0, 3] include_examples 'scan', '(?<=abc)', 0 => [:assertion, :lookbehind, '(?<=', 0, 4] include_examples 'scan', '(? 
[:assertion, :nlookbehind, '(?', 0 => [:assertion, :nlookbehind, '(?)y', 0 => [:assertion, :nlookbehind, '(? [:group, :options, '(?-mix:', 0, 7] include_examples 'scan', '(?m-ix:abc)', 0 => [:group, :options, '(?m-ix:', 0, 7] include_examples 'scan', '(?mi-x:abc)', 0 => [:group, :options, '(?mi-x:', 0, 7] include_examples 'scan', '(?mix:abc)', 0 => [:group, :options, '(?mix:', 0, 6] include_examples 'scan', '(?m:)', 0 => [:group, :options, '(?m:', 0, 4] include_examples 'scan', '(?i:)', 0 => [:group, :options, '(?i:', 0, 4] include_examples 'scan', '(?x:)', 0 => [:group, :options, '(?x:', 0, 4] include_examples 'scan', '(?mix)', 0 => [:group, :options_switch, '(?mix', 0, 5] include_examples 'scan', '(?d-mix:abc)', 0 => [:group, :options, '(?d-mix:', 0, 8] include_examples 'scan', '(?a-mix:abc)', 0 => [:group, :options, '(?a-mix:', 0, 8] include_examples 'scan', '(?u-mix:abc)', 0 => [:group, :options, '(?u-mix:', 0, 8] include_examples 'scan', '(?da-m:abc)', 0 => [:group, :options, '(?da-m:', 0, 7] include_examples 'scan', '(?du-x:abc)', 0 => [:group, :options, '(?du-x:', 0, 7] include_examples 'scan', '(?dau-i:abc)', 0 => [:group, :options, '(?dau-i:', 0, 8] include_examples 'scan', '(?dau:abc)', 0 => [:group, :options, '(?dau:', 0, 6] include_examples 'scan', '(?d:)', 0 => [:group, :options, '(?d:', 0, 4] include_examples 'scan', '(?a:)', 0 => [:group, :options, '(?a:', 0, 4] include_examples 'scan', '(?u:)', 0 => [:group, :options, '(?u:', 0, 4] include_examples 'scan', '(?dau)', 0 => [:group, :options_switch, '(?dau', 0, 5] if ruby_version_at_least('2.4.1') include_examples 'scan', '(?~abc)', 0 => [:group, :absence, '(?~', 0, 3] end end ammar-regexp_parser-0494e56/spec/scanner/keep_spec.rb000066400000000000000000000004051433525313500226060ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Keep scanning') do include_examples 'scan', /ab\Kcd/, 1 => [:keep, :mark, '\K', 2, 4] include_examples 'scan', /(a\Kb)|(c\\\Kd)ef/, 2 => [:keep, :mark, '\K', 2, 
4], 9 => [:keep, :mark, '\K', 11, 13] end ammar-regexp_parser-0494e56/spec/scanner/literals_spec.rb000066400000000000000000000031611433525313500235030ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('UTF8 scanning') do # ascii, single byte characters include_examples 'scan', 'a', 0 => [:literal, :literal, 'a', 0, 1] include_examples 'scan', 'ab+', 0 => [:literal, :literal, 'ab', 0, 2], 1 => [:quantifier, :one_or_more, '+', 2, 3] # 2 byte wide characters include_examples 'scan', 'äöü', 0 => [:literal, :literal, 'äöü', 0, 3] # 3 byte wide characters, Japanese include_examples 'scan', 'ab?れます+cd', 0 => [:literal, :literal, 'ab', 0, 2], 1 => [:quantifier, :zero_or_one, '?', 2, 3], 2 => [:literal, :literal, 'れます', 3, 6], 3 => [:quantifier, :one_or_more, '+', 6, 7], 4 => [:literal, :literal, 'cd', 7, 9] # 4 byte wide characters, Osmanya include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 0 => [:literal, :literal, '𐒀𐒁', 0, 2], 1 => [:quantifier, :zero_or_one, '?', 2, 3], 2 => [:literal, :literal, '𐒂ab', 3, 6], 3 => [:quantifier, :one_or_more, '+', 6, 7], 4 => [:literal, :literal, '𐒃', 7, 8] include_examples 'scan', 'mu𝄞?si*𝄫c+', 0 => [:literal, :literal, 'mu𝄞', 0, 3], 1 => [:quantifier, :zero_or_one, '?', 3, 4], 2 => [:literal, :literal, 'si', 4, 6], 3 => [:quantifier, :zero_or_more, '*', 6, 7], 4 => [:literal, :literal, '𝄫c', 7, 9], 5 => [:quantifier, :one_or_more, '+', 9, 10] end ammar-regexp_parser-0494e56/spec/scanner/meta_spec.rb000066400000000000000000000016061433525313500226140ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Meta scanning') do include_examples 'scan', /abc??|def*+|ghi+/, 0 => [:literal, :literal, 'abc', 0, 3], 1 => [:quantifier, :zero_or_one_reluctant, '??', 3, 5], 2 => [:meta, :alternation, '|', 5, 6], 3 => [:literal, :literal, 'def', 6, 9], 4 => [:quantifier, :zero_or_more_possessive, '*+', 9, 11], 5 => [:meta, :alternation, '|', 11, 12] include_examples 'scan', /(a\|b)|(c|d)\|(e[|]f)/, 2 => [:escape, :alternation, '\|', 2, 
4], 5 => [:meta, :alternation, '|', 6, 7], 8 => [:meta, :alternation, '|', 9, 10], 11 => [:escape, :alternation, '\|', 12, 14], 15 => [:literal, :literal, '|', 17, 18] end ammar-regexp_parser-0494e56/spec/scanner/options_spec.rb000066400000000000000000000017321433525313500233610ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('passing options to scan') do def expect_type_tokens(tokens, type_tokens) expect(tokens.map { |type, token, *| [type, token] }).to eq(type_tokens) end it 'raises if if scanning from a Regexp and options are passed' do expect { RS.scan(/a+/, options: ::Regexp::EXTENDED) }.to raise_error( ArgumentError, 'options cannot be supplied unless scanning a String' ) end it 'sets free_spacing based on options if scanning from a String' do expect_type_tokens( RS.scan('a+#c', options: ::Regexp::MULTILINE | ::Regexp::EXTENDED), [ %i[literal literal], %i[quantifier one_or_more], %i[free_space comment] ] ) end it 'does not set free_spacing if scanning from a String and passing no options' do expect_type_tokens( RS.scan('a+#c'), [ %i[literal literal], %i[quantifier one_or_more], %i[literal literal] ] ) end end ammar-regexp_parser-0494e56/spec/scanner/properties_spec.rb000066400000000000000000000052231433525313500240610ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Property scanning') do RSpec.shared_examples 'scan property' do |text, token| it("scans \\p{#{text}} as property #{token}") do result = RS.scan("\\p{#{text}}")[0] expect(result[0..1]).to eq [:property, token] end it("scans \\P{#{text}} as nonproperty #{token}") do result = RS.scan("\\P{#{text}}")[0] expect(result[0..1]).to eq [:nonproperty, token] end it("scans \\p{^#{text}} as nonproperty #{token}") do result = RS.scan("\\p{^#{text}}")[0] expect(result[0..1]).to eq [:nonproperty, token] end it("scans double-negated \\P{^#{text}} as property #{token}") do result = RS.scan("\\P{^#{text}}")[0] expect(result[0..1]).to eq [:property, token] end end include_examples 
'scan property', 'Alnum', :alnum include_examples 'scan property', 'XPosixPunct', :xposixpunct include_examples 'scan property', 'Newline', :newline include_examples 'scan property', 'Any', :any include_examples 'scan property', 'Assigned', :assigned include_examples 'scan property', 'Age=1.1', :'age=1.1' include_examples 'scan property', 'Age=10.0', :'age=10.0' include_examples 'scan property', 'ahex', :ascii_hex_digit include_examples 'scan property', 'ASCII_Hex_Digit', :ascii_hex_digit # test underscore include_examples 'scan property', 'sd', :soft_dotted include_examples 'scan property', 'Soft-Dotted', :soft_dotted # test dash include_examples 'scan property', 'Egyp', :egyptian_hieroglyphs include_examples 'scan property', 'Egyptian Hieroglyphs', :egyptian_hieroglyphs # test whitespace include_examples 'scan property', 'Linb', :linear_b include_examples 'scan property', 'Linear-B', :linear_b # test dash include_examples 'scan property', 'InArabic', :in_arabic # test block include_examples 'scan property', 'in Arabic', :in_arabic # test block w. whitespace include_examples 'scan property', 'In_Arabic', :in_arabic # test block w. 
underscore include_examples 'scan property', 'Yiii', :yi include_examples 'scan property', 'Yi', :yi include_examples 'scan property', 'Zinh', :inherited include_examples 'scan property', 'Inherited', :inherited include_examples 'scan property', 'Qaai', :inherited include_examples 'scan property', 'Zzzz', :unknown include_examples 'scan property', 'Unknown', :unknown end ammar-regexp_parser-0494e56/spec/scanner/quantifiers_spec.rb000066400000000000000000000032561433525313500242230ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Quantifier scanning') do include_examples 'scan', 'a?', 1 => [:quantifier, :zero_or_one, '?', 1, 2] include_examples 'scan', 'a??', 1 => [:quantifier, :zero_or_one_reluctant, '??', 1, 3] include_examples 'scan', 'a?+', 1 => [:quantifier, :zero_or_one_possessive, '?+', 1, 3] include_examples 'scan', 'a*', 1 => [:quantifier, :zero_or_more, '*', 1, 2] include_examples 'scan', 'a*?', 1 => [:quantifier, :zero_or_more_reluctant, '*?', 1, 3] include_examples 'scan', 'a*+', 1 => [:quantifier, :zero_or_more_possessive, '*+', 1, 3] include_examples 'scan', 'a+', 1 => [:quantifier, :one_or_more, '+', 1, 2] include_examples 'scan', 'a+?', 1 => [:quantifier, :one_or_more_reluctant, '+?', 1, 3] include_examples 'scan', 'a++', 1 => [:quantifier, :one_or_more_possessive, '++', 1, 3] include_examples 'scan', 'a{2}', 1 => [:quantifier, :interval, '{2}', 1, 4] include_examples 'scan', 'a{2,}', 1 => [:quantifier, :interval, '{2,}', 1, 5] include_examples 'scan', 'a{,2}', 1 => [:quantifier, :interval, '{,2}', 1, 5] include_examples 'scan', 'a{2,4}', 1 => [:quantifier, :interval, '{2,4}', 1, 6] # special case: chained quantifiers include_examples 'scan', 'a+{2}{3}', 1 => [:quantifier, :one_or_more, '+', 1, 2] include_examples 'scan', 'a+{2}{3}', 2 => [:quantifier, :interval, '{2}', 2, 5] include_examples 'scan', 'a+{2}{3}', 3 => [:quantifier, :interval, '{3}', 5, 8] end 
ammar-regexp_parser-0494e56/spec/scanner/refcalls_spec.rb000066400000000000000000000070741433525313500234660ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('RefCall scanning') do # Traditional numerical group back-reference include_examples 'scan', '(abc)\1' , 3 => [:backref, :number, '\1', 5, 7] # Group back-references, named, numbered, and relative # # NOTE: only \g supports forward-looking references using '+', e.g. \g<+1> # refers to the next group, but \k<+1> refers to a group named '+1'. # Inversely, only \k supports addition or substraction of a recursion level. # E.g. \k refers to a group named 'x' at the current recursion level, # but \g refers to a a group named 'x+0'. # include_examples 'scan', '(?abc)\k', 3 => [:backref, :name_ref_ab, '\k', 9, 14] include_examples 'scan', "(?abc)\\k'X'", 3 => [:backref, :name_ref_sq, "\\k'X'", 9, 14] include_examples 'scan', '(?<+1>abc)\k<+1>', 3 => [:backref, :name_ref_ab, '\k<+1>', 10, 16] include_examples 'scan', "(?<+1>abc)\\k'+1'", 3 => [:backref, :name_ref_sq, "\\k'+1'", 10, 16] include_examples 'scan', '(abc)\k<1>', 3 => [:backref, :number_ref_ab, '\k<1>', 5, 10] include_examples 'scan', "(abc)\\k'1'", 3 => [:backref, :number_ref_sq, "\\k'1'", 5, 10] include_examples 'scan', '(abc)\k<-1>', 3 => [:backref, :number_rel_ref_ab, '\k<-1>', 5, 11] include_examples 'scan', "(abc)\\k'-1'", 3 => [:backref, :number_rel_ref_sq, "\\k'-1'", 5, 11] # Sub-expression invocation, named, numbered, and relative include_examples 'scan', '(?abc)\g', 3 => [:backref, :name_call_ab, '\g', 9, 14] include_examples 'scan', "(?abc)\\g'X'", 3 => [:backref, :name_call_sq, "\\g'X'", 9, 14] include_examples 'scan', '(?abc)\g', 3 => [:backref, :name_call_ab, '\g', 9, 16] include_examples 'scan', "(?abc)\\g'X-1'", 3 => [:backref, :name_call_sq, "\\g'X-1'", 9, 16] include_examples 'scan', '(abc)\g<1>', 3 => [:backref, :number_call_ab, '\g<1>', 5, 10] include_examples 'scan', "(abc)\\g'1'", 3 => [:backref, :number_call_sq, 
"\\g'1'", 5, 10] include_examples 'scan', 'a(b|\g<0>)', 4 => [:backref, :number_call_ab, '\g<0>', 4, 9] include_examples 'scan', "a(b|\\g'0')", 4 => [:backref, :number_call_sq, "\\g'0'", 4, 9] include_examples 'scan', '(abc)\g<-1>', 3 => [:backref, :number_rel_call_ab, '\g<-1>', 5, 11] include_examples 'scan', "(abc)\\g'-1'", 3 => [:backref, :number_rel_call_sq, "\\g'-1'", 5, 11] include_examples 'scan', '\g<+1>(abc)', 0 => [:backref, :number_rel_call_ab, '\g<+1>', 0, 6] include_examples 'scan', "\\g'+1'(abc)", 0 => [:backref, :number_rel_call_sq, "\\g'+1'", 0, 6] # Group back-references, with recursion level include_examples 'scan', '(?abc)\k', 3 => [:backref, :name_recursion_ref_ab, '\k', 9, 16] include_examples 'scan', "(?abc)\\k'X-0'", 3 => [:backref, :name_recursion_ref_sq, "\\k'X-0'", 9, 16] include_examples 'scan', '(abc)\k<1-0>', 3 => [:backref, :number_recursion_ref_ab, '\k<1-0>', 5, 12] include_examples 'scan', "(abc)\\k'1-0'", 3 => [:backref, :number_recursion_ref_sq, "\\k'1-0'", 5, 12] include_examples 'scan', '(abc)\k<+1-0>', 3 => [:backref, :name_recursion_ref_ab, '\k<+1-0>', 5, 13] include_examples 'scan', "(abc)\\k'+1-0'", 3 => [:backref, :name_recursion_ref_sq, "\\k'+1-0'", 5, 13] end ammar-regexp_parser-0494e56/spec/scanner/sets_spec.rb000066400000000000000000000263371433525313500226540ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Set scanning') do include_examples 'scan', /[a]/, 0 => [:set, :open, '[', 0, 1] include_examples 'scan', /[b]/, 2 => [:set, :close, ']', 2, 3] include_examples 'scan', /[^n]/, 1 => [:set, :negate, '^', 1, 2] include_examples 'scan', /[c]/, 1 => [:literal, :literal, 'c', 1, 2] include_examples 'scan', /[^d]/, 2 => [:literal, :literal, 'd', 2, 3] include_examples 'scan', /[\b]/, 1 => [:escape, :backspace, '\b', 1, 3] include_examples 'scan', /[A\bX]/, 2 => [:escape, :backspace, '\b', 2, 4] include_examples 'scan', /[\a]/, 1 => [:escape, :bell, '\a', 1, 3] include_examples 'scan', /[\e]/, 1 => [:escape, 
:escape, '\e', 1, 3] include_examples 'scan', /[\f]/, 1 => [:escape, :form_feed, '\f', 1, 3] include_examples 'scan', /[\n]/, 1 => [:escape, :newline, '\n', 1, 3] include_examples 'scan', /[\r]/, 1 => [:escape, :carriage, '\r', 1, 3] include_examples 'scan', /[\t]/, 1 => [:escape, :tab, '\t', 1, 3] include_examples 'scan', /[\v]/, 1 => [:escape, :vertical_tab, '\v', 1, 3] include_examples 'scan', /[.]/, 1 => [:literal, :literal, '.', 1, 2] include_examples 'scan', /[?]/, 1 => [:literal, :literal, '?', 1, 2] include_examples 'scan', /[*]/, 1 => [:literal, :literal, '*', 1, 2] include_examples 'scan', /[+]/, 1 => [:literal, :literal, '+', 1, 2] include_examples 'scan', /[{]/, 1 => [:literal, :literal, '{', 1, 2] include_examples 'scan', /[}]/, 1 => [:literal, :literal, '}', 1, 2] include_examples 'scan', /[<]/, 1 => [:literal, :literal, '<', 1, 2] include_examples 'scan', /[>]/, 1 => [:literal, :literal, '>', 1, 2] include_examples 'scan', '[\\\\]', 1 => [:escape, :backslash, '\\\\', 1, 3] include_examples 'scan', '[\u0040]', 1 => [:escape, :codepoint, '\u0040', 1, 7] include_examples 'scan', '[\u{40}]', 1 => [:escape, :codepoint_list, '\u{40}', 1, 7] include_examples 'scan', '[\c2]', 1 => [:escape, :control, '\c2', 1, 4] include_examples 'scan', '[\C-C]', 1 => [:escape, :control, '\C-C', 1, 5] include_examples 'scan', '[\x20]', 1 => [:escape, :hex, '\x20', 1, 5] include_examples 'scan', '[\M-Z]', 1 => [:escape, :meta_sequence, '\M-Z', 1, 5] include_examples 'scan', '[\M-\C-X]', 1 => [:escape, :meta_sequence, '\M-\C-X', 1, 8] include_examples 'scan', '[\\[]', 1 => [:escape, :set_open, '\[', 1, 3] include_examples 'scan', '[\\]]', 1 => [:escape, :set_close, '\]', 1, 3] include_examples 'scan', '[a\-]', 2 => [:escape, :literal, '\-', 2, 4] include_examples 'scan', '[\-c]', 1 => [:escape, :literal, '\-', 1, 3] include_examples 'scan', '[\.]', 1 => [:escape, :literal, '\.', 1, 3] include_examples 'scan', '[\?]', 1 => [:escape, :literal, '\?', 1, 3] include_examples 
'scan', '[\*]', 1 => [:escape, :literal, '\*', 1, 3] include_examples 'scan', '[\+]', 1 => [:escape, :literal, '\+', 1, 3] include_examples 'scan', '[\|]', 1 => [:escape, :literal, '\|', 1, 3] include_examples 'scan', '[\{]', 1 => [:escape, :literal, '\{', 1, 3] include_examples 'scan', '[\}]', 1 => [:escape, :literal, '\}', 1, 3] include_examples 'scan', '[\(]', 1 => [:escape, :literal, '\(', 1, 3] include_examples 'scan', '[\)]', 1 => [:escape, :literal, '\)', 1, 3] include_examples 'scan', '[\!]', 1 => [:escape, :literal, '\!', 1, 3] include_examples 'scan', '[\#]', 1 => [:escape, :literal, '\#', 1, 3] include_examples 'scan', '[\A]', 1 => [:escape, :literal, '\A', 1, 3] include_examples 'scan', '[\z]', 1 => [:escape, :literal, '\z', 1, 3] include_examples 'scan', '[\g]', 1 => [:escape, :literal, '\g', 1, 3] include_examples 'scan', '[\K]', 1 => [:escape, :literal, '\K', 1, 3] include_examples 'scan', '[\R]', 1 => [:escape, :literal, '\R', 1, 3] include_examples 'scan', '[\X]', 1 => [:escape, :literal, '\X', 1, 3] include_examples 'scan', '[\B]', 1 => [:escape, :literal, '\B', 1, 3] include_examples 'scan', /[\d]/, 1 => [:type, :digit, '\d', 1, 3] include_examples 'scan', /[\da-z]/, 1 => [:type, :digit, '\d', 1, 3] include_examples 'scan', /[\D]/, 1 => [:type, :nondigit, '\D', 1, 3] include_examples 'scan', /[\h]/, 1 => [:type, :hex, '\h', 1, 3] include_examples 'scan', /[\H]/, 1 => [:type, :nonhex, '\H', 1, 3] include_examples 'scan', /[\s]/, 1 => [:type, :space, '\s', 1, 3] include_examples 'scan', /[\S]/, 1 => [:type, :nonspace, '\S', 1, 3] include_examples 'scan', /[\w]/, 1 => [:type, :word, '\w', 1, 3] include_examples 'scan', /[\W]/, 1 => [:type, :nonword, '\W', 1, 3] include_examples 'scan', /[a-b]/, 1 => [:literal, :literal, 'a', 1, 2] include_examples 'scan', /[a-c]/, 2 => [:set, :range, '-', 2, 3] include_examples 'scan', /[a-d]/, 3 => [:literal, :literal, 'd', 3, 4] include_examples 'scan', /[a-b-]/, 4 => [:literal, :literal, '-', 4, 5] 
include_examples 'scan', /[-a]/, 1 => [:literal, :literal, '-', 1, 2] include_examples 'scan', /[a-c^]/, 4 => [:literal, :literal, '^', 4, 5] include_examples 'scan', /[a-bd-f]/, 2 => [:set, :range, '-', 2, 3] include_examples 'scan', /[a-cd-f]/, 5 => [:set, :range, '-', 5, 6] # this is a buggy range, it matches only `c`, but not `a`, `b` or `-` include_examples 'scan', /[a-[c]]/, 2 => [:set, :range, '-', 2, 3] # these are not ranges, they match `a`, `c` and `-` (or non-`-` if negated) include_examples 'scan', /[[a]-[c]]/, 4 => [:literal, :literal, '-', 4, 5] include_examples 'scan', /[[a]-c]/, 4 => [:literal, :literal, '-', 4, 5] include_examples 'scan', /[^-c]/, 2 => [:literal, :literal, '-', 2, 3] include_examples 'scan', /[a[:digit:]c]/, 2 => [:posixclass, :digit, '[:digit:]', 2, 11] include_examples 'scan', /[[:digit:][:space:]]/, 2 => [:posixclass, :space, '[:space:]', 10, 19] include_examples 'scan', /[[:^digit:]]/, 1 => [:nonposixclass, :digit, '[:^digit:]', 1, 11] include_examples 'scan', /[a-d&&g-h]/, 4 => [:set, :intersection, '&&', 4, 6] include_examples 'scan', /[a&&]/, 2 => [:set, :intersection, '&&', 2, 4] include_examples 'scan', /[&&z]/, 1 => [:set, :intersection, '&&', 1, 3] include_examples 'scan', /[&&]/, 1 => [:set, :intersection, '&&', 1, 3] include_examples 'scan', '[a\p{digit}c]', 2 => [:property, :digit, '\p{digit}', 2, 11] include_examples 'scan', '[a\P{digit}c]', 2 => [:nonproperty, :digit, '\P{digit}', 2, 11] include_examples 'scan', '[a\p{^digit}c]', 2 => [:nonproperty, :digit, '\p{^digit}', 2, 12] include_examples 'scan', '[a\P{^digit}c]', 2 => [:property, :digit, '\P{^digit}', 2, 12] include_examples 'scan', '[a\p{ALPHA}c]', 2 => [:property, :alpha, '\p{ALPHA}', 2, 11] include_examples 'scan', '[a\p{P}c]', 2 => [:property, :punctuation,'\p{P}', 2, 7] include_examples 'scan', '[a\p{P}\P{P}c]', 3 => [:nonproperty, :punctuation,'\P{P}', 7, 12] include_examples 'scan', '[\x20-\x27]', 1 => [:escape, :hex, '\x20', 1, 5], 2 => [:set, :range, 
'-', 5, 6], 3 => [:escape, :hex, '\x27', 6, 10] include_examples 'scan', '[a-w&&[^c-g]z]', 5 => [:set, :open, '[', 6, 7], 6 => [:set, :negate, '^', 7, 8], 8 => [:set, :range, '-', 9, 10], 10=> [:set, :close, ']', 11, 12] # Collations/collating sequences and character equivalents are not enabled # in Ruby at the moment. If they ever are, enable them in the scanner, # add them to a new syntax version, and handle them in the parser. Until then, # expect them to be scanned as regular subsets containing literals. # include_examples 'scan', /[a[.a-b.]c]/, 2 => [:set, :collation, '[.a-b.]', 2, 9] # include_examples 'scan', /[a[=e=]c]/, 2 => [:set, :equivalent, '[=e=]', 2, 7] include_examples 'scan', '[a[.a-b.]c]', 2 => [:set, :open, '[', 2, 3], 3 => [:literal, :literal, '.', 3, 4], 4 => [:literal, :literal, 'a', 4, 5] include_examples 'scan', '[a[=e=]c]', 2 => [:set, :open, '[', 2, 3], 3 => [:literal, :literal, '=', 3, 4], 4 => [:literal, :literal, 'e', 4, 5] # multi-byte characters should not affect indices include_examples 'scan', /[れます]/, 0 => [:set, :open, '[', 0, 1], 1 => [:literal, :literal, 'れ', 1, 2], 2 => [:literal, :literal, 'ま', 2, 3], 3 => [:literal, :literal, 'す', 3, 4], 4 => [:set, :close, ']', 4, 5] specify('set literal encoding') do text = RS.scan('[a]')[1][2].to_s expect(text).to eq 'a' expect(text.encoding.to_s).to eq 'UTF-8' text = RS.scan("[\u{1F632}]")[1][2].to_s expect(text).to eq "\u{1F632}" expect(text.encoding.to_s).to eq 'UTF-8' end end ammar-regexp_parser-0494e56/spec/scanner/types_spec.rb000066400000000000000000000014161433525313500230310ustar00rootroot00000000000000require 'spec_helper' RSpec.describe('Type scanning') do include_examples 'scan', 'a\dc', 1 => [:type, :digit, '\d', 1, 3] include_examples 'scan', 'a\Dc', 1 => [:type, :nondigit, '\D', 1, 3] include_examples 'scan', 'a\hc', 1 => [:type, :hex, '\h', 1, 3] include_examples 'scan', 'a\Hc', 1 => [:type, :nonhex, '\H', 1, 3] include_examples 'scan', 'a\sc', 1 => [:type, :space, '\s', 1, 
3] include_examples 'scan', 'a\Sc', 1 => [:type, :nonspace, '\S', 1, 3] include_examples 'scan', 'a\wc', 1 => [:type, :word, '\w', 1, 3] include_examples 'scan', 'a\Wc', 1 => [:type, :nonword, '\W', 1, 3] include_examples 'scan', 'a\Rc', 1 => [:type, :linebreak, '\R', 1, 3] include_examples 'scan', 'a\Xc', 1 => [:type, :xgrapheme, '\X', 1, 3] end ammar-regexp_parser-0494e56/spec/spec_helper.rb000066400000000000000000000015451433525313500215160ustar00rootroot00000000000000$VERBOSE = true require 'ice_nine' require 'regexp_property_values' require_relative 'support/capturing_stderr' require_relative 'support/shared_examples' req_warn = capturing_stderr { require('regexp_parser') || fail('pre-required') } req_warn.empty? || fail("requiring parser generated warnings:\n#{req_warn}") RS = Regexp::Scanner RL = Regexp::Lexer RP = Regexp::Parser RE = Regexp::Expression T = Regexp::Syntax::Token include Regexp::Expression def ruby_version_at_least(version) Gem::Version.new(RUBY_VERSION.dup) >= Gem::Version.new(version) end RSpec.configure do |config| config.around(:example) do |example| # treat unexpected warnings as failures expect { example.run }.not_to output.to_stderr end end def s(klass, text = nil, *children) exp = klass.construct(text: text) children.each { |child| exp.expressions << child } exp end ammar-regexp_parser-0494e56/spec/support/000077500000000000000000000000001433525313500204075ustar00rootroot00000000000000ammar-regexp_parser-0494e56/spec/support/capturing_stderr.rb000066400000000000000000000002371433525313500243150ustar00rootroot00000000000000require 'stringio' def capturing_stderr(&block) old_stderr, $stderr = $stderr, StringIO.new block.call $stderr.string ensure $stderr = old_stderr end ammar-regexp_parser-0494e56/spec/support/shared_examples.rb000066400000000000000000000054731433525313500241110ustar00rootroot00000000000000RSpec.shared_examples 'syntax' do |opts| opts[:implements].each do |type, tokens| tokens.each do |token| it("implements #{token} 
#{type}") do expect(described_class.implements?(type, token)).to be true end end end opts[:excludes] && opts[:excludes].each do |type, tokens| tokens.each do |token| it("does not implement #{token} #{type}") do expect(described_class.implements?(type, token)).to be false end end end end RSpec.shared_examples 'scan' do |pattern, checks| context "given the pattern #{pattern}" do before(:all) { @tokens = Regexp::Scanner.scan(pattern) } checks.each do |index, (type, token, text, ts, te)| it "scans token #{index} as #{token} #{type} at #{ts}..#{te}" do result = @tokens.at(index) expect(result[0]).to eq type expect(result[1]).to eq token expect(result[2]).to eq text expect(result[3]).to eq ts expect(result[4]).to eq te end end end end RSpec.shared_examples 'lex' do |pattern, checks| context "given the pattern #{pattern}" do before(:all) { @tokens = Regexp::Lexer.lex(pattern) } checks.each do |index, (type, token, text, ts, te, lvl, set_lvl, cond_lvl)| it "lexes token #{index} as #{token} #{type} at #{lvl}, #{set_lvl}, #{cond_lvl}" do struct = @tokens.at(index) expect(struct.type).to eq type expect(struct.token).to eq token expect(struct.text).to eq text expect(struct.ts).to eq ts expect(struct.te).to eq te expect(struct.level).to eq lvl expect(struct.set_level).to eq set_lvl expect(struct.conditional_level).to eq cond_lvl end end end end RSpec.shared_examples 'parse' do |pattern, checks| context "given the pattern #{pattern}" do before(:all) { @root = Regexp::Parser.parse(pattern, '*') } checks.each do |path, expectations| path = Array(path) inspect_quantifier = path.last == :q && path.pop attributes = expectations.pop if expectations.last.is_a?(Hash) klass = expectations.pop if expectations.last.is_a?(Class) token = expectations.pop type = expectations.pop description = klass || token || type || 'Expression' it "parses expression at #{path} as #{description}" do exp = @root.dig(*path) exp = exp.quantifier if inspect_quantifier klass && 
expect(exp).to(be_instance_of(klass)) type && expect(exp.type).to(eq(type)) token && expect(exp.token).to(eq(token)) attributes && attributes.each do |method, value| actual = exp.send(method) expect(actual).to eq(value), "expected #{description} at #{path} to "\ "have #{method} #{value.inspect}, got #{actual.inspect}" end end end end end ammar-regexp_parser-0494e56/spec/syntax/000077500000000000000000000000001433525313500202215ustar00rootroot00000000000000ammar-regexp_parser-0494e56/spec/syntax/syntax_spec.rb000066400000000000000000000074721433525313500231200ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Syntax) do describe('::for') do it { expect(Regexp::Syntax.for('ruby/1.8.6')).to eq Regexp::Syntax::V1_8_6 } it { expect(Regexp::Syntax.for('ruby/1.8')).to eq Regexp::Syntax::V1_8_6 } it { expect(Regexp::Syntax.for('ruby/1.9.1')).to eq Regexp::Syntax::V1_9_1 } it { expect(Regexp::Syntax.for('ruby/1.9')).to eq Regexp::Syntax::V1_9_3 } it { expect(Regexp::Syntax.for('ruby/2.0.0')).to eq Regexp::Syntax::V2_0_0 } it { expect(Regexp::Syntax.for('ruby/2.0')).to eq Regexp::Syntax::V2_0_0 } it { expect(Regexp::Syntax.for('ruby/2.1')).to eq Regexp::Syntax::V2_0_0 } it { expect(Regexp::Syntax.for('ruby/2.2.0')).to eq Regexp::Syntax::V2_2_0 } it { expect(Regexp::Syntax.for('ruby/2.2.10')).to eq Regexp::Syntax::V2_2_0 } it { expect(Regexp::Syntax.for('ruby/2.2')).to eq Regexp::Syntax::V2_2_0 } it { expect(Regexp::Syntax.for('ruby/2.3.0')).to eq Regexp::Syntax::V2_3_0 } it { expect(Regexp::Syntax.for('ruby/2.3')).to eq Regexp::Syntax::V2_3_0 } it { expect(Regexp::Syntax.for('ruby/2.4.0')).to eq Regexp::Syntax::V2_4_0 } it { expect(Regexp::Syntax.for('ruby/2.4.1')).to eq Regexp::Syntax::V2_4_1 } it { expect(Regexp::Syntax.for('ruby/2.5.0')).to eq Regexp::Syntax::V2_5_0 } it { expect(Regexp::Syntax.for('ruby/2.5')).to eq Regexp::Syntax::V2_5_0 } it { expect(Regexp::Syntax.for('ruby/2.6.0')).to eq Regexp::Syntax::V2_6_0 } it { 
expect(Regexp::Syntax.for('ruby/2.6.2')).to eq Regexp::Syntax::V2_6_2 } it { expect(Regexp::Syntax.for('ruby/2.6.3')).to eq Regexp::Syntax::V2_6_3 } it { expect(Regexp::Syntax.for('ruby/2.6')).to eq Regexp::Syntax::V2_6_3 } it { expect(Regexp::Syntax.for('ruby/3.0.0')).to eq Regexp::Syntax::V2_6_3 } it { expect(Regexp::Syntax.for('ruby/3.0')).to eq Regexp::Syntax::V2_6_3 } it { expect(Regexp::Syntax.for('ruby/3.1.0')).to eq Regexp::Syntax::V3_1_0 } it { expect(Regexp::Syntax.for('ruby/3.1')).to eq Regexp::Syntax::V3_1_0 } it { expect(Regexp::Syntax.for('ruby/3.2.0')).to eq Regexp::Syntax::V3_2_0 } it { expect(Regexp::Syntax.for('ruby/3.2')).to eq Regexp::Syntax::V3_2_0 } it { expect(Regexp::Syntax.for('any')).to eq Regexp::Syntax::Any } it { expect(Regexp::Syntax.for('*')).to eq Regexp::Syntax::Any } it 'warns for future versions' do expect { Regexp::Syntax.for('ruby/5.0') }.to output(/This library .* but you are running .*/).to_stderr end it 'raises for unknown names' do expect { Regexp::Syntax.for('ruby/1.0') }.to raise_error(Regexp::Syntax::UnknownSyntaxNameError) end it 'raises for invalid names' do expect { Regexp::Syntax.version_class('2.0.0') }.to raise_error(Regexp::Syntax::InvalidVersionNameError) expect { Regexp::Syntax.version_class('ruby/20') }.to raise_error(Regexp::Syntax::InvalidVersionNameError) end end specify('::new is a deprecated alias of ::for') do expect { expect(Regexp::Syntax.new('ruby/2.0.0')).to eq Regexp::Syntax::V2_0_0 } .to output(/deprecated/).to_stderr end specify('not implemented') do expect { RP.parse('\p{alpha}', 'ruby/1.8') }.to raise_error(Regexp::Syntax::NotImplementedError) end specify('supported?') do expect(Regexp::Syntax.supported?('ruby/1.1.1')).to be false expect(Regexp::Syntax.supported?('ruby/2.4.3')).to be true expect(Regexp::Syntax.supported?('ruby/2.5')).to be true end specify('raises for unknown constant lookups') do expect { Regexp::Syntax::V1 }.to raise_error(/V1/) end specify('instantiation is deprecated but still 
works') do expect { @instance = Regexp::Syntax::V3_1_0.new } .to output(/deprecated/).to_stderr expect { expect(@instance.implements?(:literal, :literal)).to be true } .to output(/deprecated/).to_stderr end end ammar-regexp_parser-0494e56/spec/syntax/syntax_token_map_spec.rb000066400000000000000000000014021433525313500251400ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Syntax::Token::Map) do let(:map) { Regexp::Syntax::Token::Map } let(:latest_syntax) { Regexp::Syntax::V3_2_0 } specify('is complete') do latest_syntax.features.each do |type, tokens| tokens.each { |token| expect(map[type]).to include(token) } end end specify('contains no duplicate tokens') do latest_syntax.features.each do |_type, tokens| expect(tokens).to eq tokens.uniq end end specify('contains no duplicate type/token combinations') do combinations = map.flat_map do |type, tokens| tokens.map { |token| "#{type} #{token}" } end non_uniq = combinations.group_by { |str| str }.select { |_, v| v.count > 1 } expect(non_uniq.keys).to be_empty end end ammar-regexp_parser-0494e56/spec/syntax/versions/000077500000000000000000000000001433525313500220715ustar00rootroot00000000000000ammar-regexp_parser-0494e56/spec/syntax/versions/1.8.6_spec.rb000066400000000000000000000011671433525313500241070ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Syntax::V1_8_6) do include_examples 'syntax', implements: { assertion: T::Assertion::Lookahead, backref: T::Backreference::Plain, escape: T::Escape::Basic + T::Escape::ASCII + T::Escape::Meta + T::Escape::Control, group: T::Group::V1_8_6, quantifier: T::Quantifier::Greedy + T::Quantifier::Reluctant + T::Quantifier::Interval + T::Quantifier::IntervalReluctant }, excludes: { assertion: T::Assertion::Lookbehind, backref: T::Backreference::All - T::Backreference::Plain + T::SubexpressionCall::All, quantifier: T::Quantifier::Possessive } end 
ammar-regexp_parser-0494e56/spec/syntax/versions/1.9.1_spec.rb000066400000000000000000000004701433525313500240770ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Syntax::V1_9_1) do include_examples 'syntax', implements: { escape: T::Escape::Hex + T::Escape::Octal + T::Escape::Unicode, type: T::CharacterType::Hex, quantifier: T::Quantifier::Greedy + T::Quantifier::Reluctant + T::Quantifier::Possessive } end ammar-regexp_parser-0494e56/spec/syntax/versions/1.9.3_spec.rb000066400000000000000000000004341433525313500241010ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Syntax::V1_9_3) do include_examples 'syntax', implements: { property: T::UnicodeProperty::Script_V1_9_3 + T::UnicodeProperty::Age_V1_9_3, nonproperty: T::UnicodeProperty::Script_V1_9_3 + T::UnicodeProperty::Age_V1_9_3 } end ammar-regexp_parser-0494e56/spec/syntax/versions/2.0.0_spec.rb000066400000000000000000000004371433525313500240710ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Syntax::V2_0_0) do include_examples 'syntax', implements: { property: T::UnicodeProperty::Age_V2_0_0, nonproperty: T::UnicodeProperty::Age_V2_0_0 }, excludes: { property: %i[newline], nonproperty: %i[newline] } end ammar-regexp_parser-0494e56/spec/syntax/versions/2.2.0_spec.rb000066400000000000000000000004341433525313500240700ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Syntax::V2_2_0) do include_examples 'syntax', implements: { property: T::UnicodeProperty::Script_V2_2_0 + T::UnicodeProperty::Age_V2_2_0, nonproperty: T::UnicodeProperty::Script_V2_2_0 + T::UnicodeProperty::Age_V2_2_0 } end ammar-regexp_parser-0494e56/spec/syntax/versions/3.2.0_spec.rb000066400000000000000000000004341433525313500240710ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Syntax::V3_2_0) do include_examples 'syntax', implements: { property: T::UnicodeProperty::Script_V3_2_0 + T::UnicodeProperty::Age_V3_2_0, nonproperty: 
T::UnicodeProperty::Script_V3_2_0 + T::UnicodeProperty::Age_V3_2_0 } end ammar-regexp_parser-0494e56/spec/token/000077500000000000000000000000001433525313500200135ustar00rootroot00000000000000ammar-regexp_parser-0494e56/spec/token/token_spec.rb000066400000000000000000000036321433525313500224760ustar00rootroot00000000000000require 'spec_helper' RSpec.describe(Regexp::Token) do specify('#offset') do regexp = /ab?cd/ tokens = RL.lex(regexp) expect(tokens[1].text).to eq 'b' expect(tokens[1].offset).to eq [1, 2] expect(tokens[2].text).to eq '?' expect(tokens[2].offset).to eq [2, 3] expect(tokens[3].text).to eq 'cd' expect(tokens[3].offset).to eq [3, 5] end specify('#length') do regexp = /abc?def/ tokens = RL.lex(regexp) expect(tokens[0].text).to eq 'ab' expect(tokens[0].length).to eq 2 expect(tokens[1].text).to eq 'c' expect(tokens[1].length).to eq 1 expect(tokens[2].text).to eq '?' expect(tokens[2].length).to eq 1 expect(tokens[3].text).to eq 'def' expect(tokens[3].length).to eq 3 end specify('#to_h') do regexp = /abc?def/ tokens = RL.lex(regexp) expect(tokens[0].text).to eq 'ab' expect(tokens[0].to_h).to eq type: :literal, token: :literal, text: 'ab', ts: 0, te: 2, level: 0, set_level: 0, conditional_level: 0 expect(tokens[2].text).to eq '?' 
expect(tokens[2].to_h).to eq type: :quantifier, token: :zero_or_one, text: '?', ts: 3, te: 4, level: 0, set_level: 0, conditional_level: 0 end specify('#next') do regexp = /a+b?c*d{2,3}/ tokens = RL.lex(regexp) a = tokens.first expect(a.text).to eq 'a' plus = a.next expect(plus.text).to eq '+' b = plus.next expect(b.text).to eq 'b' interval = tokens.last expect(interval.text).to eq '{2,3}' expect(interval.next).to be_nil end specify('#previous') do regexp = /a+b?c*d{2,3}/ tokens = RL.lex(regexp) interval = tokens.last expect(interval.text).to eq '{2,3}' d = interval.previous expect(d.text).to eq 'd' star = d.previous expect(star.text).to eq '*' c = star.previous expect(c.text).to eq 'c' a = tokens.first expect(a.text).to eq 'a' expect(a.previous).to be_nil end end ammar-regexp_parser-0494e56/tasks/000077500000000000000000000000001433525313500170665ustar00rootroot00000000000000ammar-regexp_parser-0494e56/tasks/benchmark.rake000066400000000000000000000015631433525313500216710ustar00rootroot00000000000000BENCHMARKS_DIR = "#{__dir__}/benchmarks" desc 'Run all IPS benchmarks' task :benchmark do Dir["#{BENCHMARKS_DIR}/*.rb"].sort.each { |file| load(file) } end namespace :benchmark do desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md' task :write_to_file do require 'stringio' string_io = StringIO.new with_stdouts(STDOUT, string_io) { Rake.application[:benchmark].invoke } File.write "#{BENCHMARKS_DIR}/log", "Results of rake:benchmark on #{RUBY_DESCRIPTION}\n\n" + string_io.string.gsub(/Warming up.*?Comparison:/m, '') end end def with_stdouts(*ios) old_stdout = $stdout ios.define_singleton_method(:method_missing) { |*args| each { |io| io.send(*args) } } ios.define_singleton_method(:respond_to?) 
{ |*args| IO.respond_to?(*args) } $stdout = ios yield ensure $stdout = old_stdout end ammar-regexp_parser-0494e56/tasks/benchmarks/000077500000000000000000000000001433525313500212035ustar00rootroot00000000000000ammar-regexp_parser-0494e56/tasks/benchmarks/log000066400000000000000000000007661433525313500217200ustar00rootroot00000000000000Results of rake:benchmark on ruby 3.1.0p0 (2021-12-25 revision fb4df44d16) [arm64-darwin21] Parsing a minimal Regexp Scanner::scan: 32710.2 i/s Lexer::lex: 31091.8 i/s - same-ish: difference falls within error Parser::parse: 27097.3 i/s - 1.21x (± 0.00) slower Parsing a complex Regexp (URI.regexp) Scanner::scan: 877.0 i/s Lexer::lex: 545.2 i/s - 1.61x (± 0.00) slower Parser::parse: 294.7 i/s - 2.98x (± 0.00) slower ammar-regexp_parser-0494e56/tasks/benchmarks/minimal_regexp.rb000066400000000000000000000005201433525313500245250ustar00rootroot00000000000000require 'benchmark/ips' require_relative '../../lib/regexp_parser' puts 'Parsing a minimal Regexp' regexp = /./ Benchmark.ips do |x| x.report('Scanner::scan') { Regexp::Scanner.scan(regexp) } x.report('Lexer::lex') { Regexp::Lexer.lex(regexp) } x.report('Parser::parse') { Regexp::Parser.parse(regexp) } x.compare! end ammar-regexp_parser-0494e56/tasks/benchmarks/uri_regexp.rb000066400000000000000000000006071433525313500237040ustar00rootroot00000000000000require 'benchmark/ips' require_relative '../../lib/regexp_parser' puts 'Parsing a complex Regexp (URI.regexp)' require 'uri' regexp = URI::DEFAULT_PARSER.make_regexp Benchmark.ips do |x| x.report('Scanner::scan') { Regexp::Scanner.scan(regexp) } x.report('Lexer::lex') { Regexp::Lexer.lex(regexp) } x.report('Parser::parse') { Regexp::Parser.parse(regexp) } x.compare! 
end ammar-regexp_parser-0494e56/tasks/props.rake000066400000000000000000000016471433525313500211050ustar00rootroot00000000000000namespace :props do desc 'Write new property value hashes for the properties scanner' task :update do require 'regexp_property_values' RegexpPropertyValues.update dir = File.join(__dir__, '../lib/regexp_parser/scanner/properties') write_hash_to_file = ->(hash, path) do File.open(path, 'w') do |f| f.puts '# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT', *hash.sort.map { |pair| pair.join(',') } end puts "Wrote #{hash.count} aliases to `#{path}`" end long_names_to_tokens = RegexpPropertyValues.all.map do |val| [val.identifier, val.full_name.downcase] end write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.csv") short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v| [k.identifier, v.full_name.downcase] end write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.csv") end end ammar-regexp_parser-0494e56/tasks/ragel.rake000066400000000000000000000016651433525313500210340ustar00rootroot00000000000000RAGEL_SOURCE_DIR = File.join(__dir__, '../lib/regexp_parser/scanner') RAGEL_OUTPUT_DIR = File.join(__dir__, '../lib/regexp_parser') RAGEL_SOURCE_FILES = %w[scanner] # scanner.rl imports the other files namespace :ragel do desc 'Process the ragel source files and output ruby code' task :rb do RAGEL_SOURCE_FILES.each do |source_file| output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb" # using faster flat table driven FSM, about 25% larger code, but about 30% faster sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}" contents = File.read(output_file) File.open(output_file, 'r+') do |file| contents = "# -*- warn-indent:false; -*-\n" + contents file.write(contents) end end end desc 'Delete the ragel generated source file(s)' task :clean do RAGEL_SOURCE_FILES.each do |file| sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb" end end end