character_set-1.8.0/0000755000004100000410000000000014620142357014363 5ustar www-datawww-datacharacter_set-1.8.0/bin/0000755000004100000410000000000014620142357015133 5ustar www-datawww-datacharacter_set-1.8.0/bin/setup0000755000004100000410000000020314620142357016214 0ustar www-datawww-data#!/usr/bin/env bash set -euo pipefail IFS=$'\n\t' set -vx bundle install # Do any other automated setup that you need to do here character_set-1.8.0/bin/console0000755000004100000410000000055514620142357016530 0ustar www-datawww-data#!/usr/bin/env ruby require 'bundler/setup' `bundle exec rake compile` require 'character_set' require 'character_set/core_ext' require 'character_set/pure' require 'regexp_property_values' CS = CharacterSet CP = CharacterSet::Pure PV = RegexpPropertyValues require 'benchmark' def m(&block); Benchmark.measure(&block); end require "irb" IRB.start(__FILE__) character_set-1.8.0/.gitignore0000644000004100000410000000057314620142357016360 0ustar www-datawww-data*.bundle *.gem *.iml *.stTheme.cache *.sublime-project *.sublime-workspace *.swp *.tmlanguage.cache *.tmPreferences.cache *~ .byebug_history .DS_Store .idea/ .ruby-gemset .ruby-version .tags .tags1 .tool-versions .vscode bbin/ binstubs/* bundler_stubs/*/.yardoc Gemfile.lock /.bundle/ /_yardoc/ /coverage/ /doc/ /pkg/ /spec/reports/ /tmp/ # rspec failure tracking .rspec_status character_set-1.8.0/tasks/0000755000004100000410000000000014620142357015510 5ustar www-datawww-datacharacter_set-1.8.0/tasks/benchmarks/0000755000004100000410000000000014620142357017625 5ustar www-datawww-datacharacter_set-1.8.0/tasks/benchmarks/delete_in.rb0000644000004100000410000000145614620142357022110 0ustar www-datawww-datarequire_relative './shared' str = 'Lorem ipsum et dolorem' rx = /\s/ trt = "\t\n\v\f\r\s" cs = CharacterSet.whitespace benchmark( caption: 'Removing ASCII whitespace', cases: { 'String#gsub' => -> { str.gsub(rx, '') }, 'String#tr' => -> { str.tr(trt, '') }, 'CharacterSet#delete_in' => -> { 
cs.delete_in(str) }, } ) str = 'Lörem ipsüm ⛷ et dölörem' rx = /[\s\p{emoji}äüö]/ trt = "\t\n\v\f\r\s😀-🙏äüö" cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü'] benchmark( caption: 'Removing whitespace, emoji and umlauts', cases: { 'String#gsub' => -> { str.gsub(rx, '') }, 'String#tr' => -> { str.tr(trt, '') }, 'CharacterSet#delete_in' => -> { cs.delete_in(str) }, } ) character_set-1.8.0/tasks/benchmarks/keep_in.rb0000644000004100000410000000132114620142357021561 0ustar www-datawww-datarequire_relative './shared' str = 'Lorem ipsum et dolorem' rx = /\S/ trt = "\u{0080}-\u{10FFFF}" # approximation cs = CharacterSet.whitespace benchmark( caption: 'Removing non-whitespace', cases: { 'String#gsub' => -> { str.gsub(rx, '') }, 'String#tr' => -> { str.tr(trt, '') }, 'CharacterSet#keep_in' => -> { cs.keep_in(str) }, } ) str = 'Lorem ipsum ⛷ et dolorem' rx = /\p{^emoji}/ trt = "\u0000-\u{1F599}\u{1F650}-\u{10FFFF}" cs = CharacterSet.emoji benchmark( caption: 'Keeping only emoji', cases: { 'String#gsub' => -> { str.gsub(rx, '') }, 'String#tr' => -> { str.tr(trt, '') }, 'CharacterSet#keep_in' => -> { cs.keep_in(str) }, } ) character_set-1.8.0/tasks/benchmarks/z_minmax.rb0000644000004100000410000000037514620142357022001 0ustar www-datawww-datarequire_relative './shared' cs = CharacterSet.new(0..0xFFFF) ss = SortedSet.new(0..0xFFFF) benchmark( caption: 'Getting the min and max', cases: { 'CharacterSet#minmax' => -> { cs.minmax }, 'SortedSet#minmax' => -> { ss.minmax }, } ) character_set-1.8.0/tasks/benchmarks/z_merge.rb0000644000004100000410000000052514620142357021604 0ustar www-datawww-datarequire_relative './shared' cs1 = CharacterSet.new(0...0x88000) cs2 = CharacterSet.new(0x88000..0x10FFFF) ss1 = SortedSet.new(0...0x88000) ss2 = SortedSet.new(0x88000..0x10FFFF) benchmark( caption: 'Merging entries', cases: { 'CharacterSet#merge' => -> { cs1.merge(cs2) }, 'SortedSet#merge' => -> { ss1.merge(ss2) }, } ) 
character_set-1.8.0/tasks/benchmarks/cover.rb0000644000004100000410000000105614620142357021272 0ustar www-datawww-datarequire_relative './shared' str = 'Lorem ipsum et dolorem' rx = /\S/ cs = CharacterSet.whitespace.inversion benchmark( caption: 'Detecting non-whitespace', cases: { 'Regexp#match?' => -> { rx.match?(str) }, 'CharacterSet#cover?' => -> { cs.cover?(str) }, } ) str = 'Lorem ipsum et dolorem' rx = /[^a-z]/i cs = CharacterSet.new('A'..'Z') + CharacterSet.new('a'..'z') benchmark( caption: 'Detecting non-letters', cases: { 'Regexp#match?' => -> { rx.match?(str) }, 'CharacterSet#cover?' => -> { cs.cover?(str) }, } ) character_set-1.8.0/tasks/benchmarks/z_add.rb0000644000004100000410000000035614620142357021237 0ustar www-datawww-datarequire_relative './shared' cs = CharacterSet[] ss = SortedSet[] benchmark( caption: 'Adding entries', cases: { 'CharacterSet#add' => -> { cs.add(rand(0x10FFFF)) }, 'SortedSet#add' => -> { ss.add(rand(0x10FFFF)) }, } ) character_set-1.8.0/tasks/benchmarks/used_by.rb0000644000004100000410000000110214620142357021576 0ustar www-datawww-datarequire_relative './shared' str = 'Lorem ipsum et dolorem' rx = /\s/ cs = CharacterSet.whitespace benchmark( caption: 'Detecting whitespace', cases: { 'Regexp#match?' => -> { rx.match?(str) }, 'CharacterSet#used_by?' => -> { cs.used_by?(str) }, } ) str = 'Lorem ipsum et dolorem' * 20 + '⛷' + 'Lorem ipsum et dolorem' * 20 rx = /\p{emoji}/ cs = CharacterSet.emoji benchmark( caption: 'Detecting emoji in a large string', cases: { 'Regexp#match?' => -> { rx.match?(str) }, 'CharacterSet#used_by?' 
=> -> { cs.used_by?(str) }, } ) character_set-1.8.0/tasks/benchmarks/scan.rb0000644000004100000410000000042114620142357021073 0ustar www-datawww-datarequire_relative './shared' str = 'Lorem ipsum ⛷ et dolorem' rx = /\p{emoji}/ cs = CharacterSet.emoji benchmark( caption: 'Extracting emoji to an Array', cases: { 'String#scan' => -> { str.scan(rx) }, 'CharacterSet#scan' => -> { cs.scan(str) }, } ) character_set-1.8.0/tasks/benchmarks/count_in.rb0000644000004100000410000000043314620142357021770 0ustar www-datawww-datarequire_relative './shared' str = 'Lorem ipsum et dolorem' tr = '^A-Za-z' cs = CharacterSet.non_ascii_letter benchmark( caption: 'Counting non-letters', cases: { 'String#count' => -> { str.count(tr) }, 'CharacterSet#count_in' => -> { cs.count_in(str) }, } ) character_set-1.8.0/tasks/benchmarks/shared.rb0000644000004100000410000000133714620142357021424 0ustar www-datawww-datarequire 'benchmark/ips' require_relative '../../lib/character_set' if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i] require 'sorted_set' else require 'set' end def benchmark(caption: nil, cases: {}) with_stdouts($stdout, string_io = StringIO.new) do puts caption Benchmark.ips do |x| cases.each { |label, callable| x.report(label, &callable) } x.compare! end end ($benchmark_results ||= {})[caption] = string_io.string end def with_stdouts(*ios) old_stdout = $stdout ios.define_singleton_method(:method_missing) { |*args| each { |io| io.send(*args) } } ios.define_singleton_method(:respond_to?) 
{ |*args| IO.respond_to?(*args) } $stdout = ios yield ensure $stdout = old_stdout end character_set-1.8.0/tasks/benchmarks/z_delete.rb0000644000004100000410000000043214620142357021744 0ustar www-datawww-datarequire_relative './shared' cs = CharacterSet.new(0..0x10FFFF) ss = SortedSet.new(0..0x10FFFF) benchmark( caption: 'Removing entries', cases: { 'CharacterSet#delete' => -> { cs.delete(rand(0x10FFFF)) }, 'SortedSet#delete' => -> { ss.delete(rand(0x10FFFF)) }, } ) character_set-1.8.0/tasks/sync_predefined_sets.rake0000644000004100000410000000064414620142357022557 0ustar www-datawww-datadesc 'Update codepoint data for predefined sets, based on Onigmo' task :sync_predefined_sets do %w[assigned emoji whitespace].each do |prop| require 'regexp_property_values' ranges = RegexpPropertyValues[prop].matched_ranges str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase File.write("#{__dir__}/../lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w') end end character_set-1.8.0/tasks/sync_casefold_data.rake0000644000004100000410000000135114620142357022161 0ustar www-datawww-datadesc 'Download unicode casefold data and write new C header file' task :sync_casefold_data do src_path = './CaseFolding.txt' dst_path = "#{__dir__}/../ext/character_set/unicode_casefold_table.h" `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt` mapping = File.foreach(src_path).each_with_object({}) do |line, hash| from, type, to = line.split(/\s*;\s*/).first(3) # type 'C' stands for 'common', excludes mappings to multiple chars hash[from] = to if type == 'C' end.sort content = File.read(dst_path + '.tmpl') .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}") .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n")) File.write(dst_path, content) File.unlink(src_path) end character_set-1.8.0/tasks/sync_ruby_spec.rake0000644000004100000410000000463514620142357021413 0ustar www-datawww-datadesc 'Download relevant ruby/spec tests, adapt to 
CharacterSet and its variants' task :sync_ruby_spec do require 'fileutils' variants = { 'CharacterSet' => "#{__dir__}/../spec/ruby-spec/library/character_set", 'CharacterSet::Pure' => "#{__dir__}/../spec/ruby-spec/library/character_set_pure", } # download fresh specs from ruby/spec repository variants.each do |_, dir| FileUtils.rm_rf(dir) `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}` end # make copies for each CharacterSet variant base = variants.first[1] variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base } # adapt specs to work with CharacterSet variants.each do |class_name, dir| Dir["#{dir}/**/*.rb"].each do |spec| # ignore some tests that do not apply or are covered otherwise if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)} File.delete(spec) next end adapted_content = File.read(spec). # adapt class name gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)). gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2"). # get shared specs from a single shared dir at the parent level gsub(/(require_relative ['"])(shared\/)/, '\1../\2'). # make 'mspec' syntax rspec-compatible gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|'). gsub(/be_(false|true)/, 'be \1'). gsub('stub!', 'stub'). gsub('mock', 'double'). gsub('@method', 'method'). # remove unneeded requires gsub(/require 'set'\n/, ''). gsub(/require.*spec_helper.*\n/, ''). gsub(/\A\n+/, ''). # make examples use Integers/codepoints gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0'). gsub('"one"', '1'). gsub('"two"', '2'). gsub('"three"', '3'). gsub('"four"', '4'). gsub('"five"', '5'). gsub(/x.(size|length) == 3/, 'x != 3'). gsub(/x.(size|length) != 3/, 'x == 3'). 
gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2') File.open(spec, 'w') { |f| f.puts adapted_content } end end # keep only one copy of the shared specs, at the parent level FileUtils.rm_rf(base + '/../shared') FileUtils.mv(base + '/shared', base + '/../') variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') } end character_set-1.8.0/tasks/benchmark.rake0000644000004100000410000000127114620142357020307 0ustar www-datawww-datadesc 'Run all IPS benchmarks' task :benchmark do Dir["#{__dir__}/benchmarks/*.rb"].sort.each { |file| load(file) } end namespace :benchmark do desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md' task :write_to_file do Rake.application[:benchmark].invoke # extract comparison results from reports results = $benchmark_results .map { |caption, report| "```\n#{caption}\n\n#{report[/(?<=Comparison:).+/m].strip}\n```" } .join("\n") .gsub(/ \(±[^)]+\) |(?<=same-ish).*/, '') # remove some noise File.write "#{__dir__}/../BENCHMARK.md", "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}\n\n#{results}\n" end end character_set-1.8.0/BENCHMARK.md0000644000004100000410000000376714620142357016214 0ustar www-datawww-dataResults of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a8520) [arm64-darwin21] ``` Counting non-letters CharacterSet#count_in: 14627506.2 i/s String#count: 3859777.0 i/s - 3.79x slower ``` ``` Detecting non-whitespace CharacterSet#cover?: 17241902.8 i/s Regexp#match?: 12971122.6 i/s - 1.33x slower ``` ``` Detecting non-letters CharacterSet#cover?: 17243472.3 i/s Regexp#match?: 7957626.9 i/s - 2.17x slower ``` ``` Removing ASCII whitespace CharacterSet#delete_in: 6190975.7 i/s String#tr: 4722716.6 i/s - 1.31x slower String#gsub: 214239.5 i/s - 28.90x slower ``` ``` Removing whitespace, emoji and umlauts CharacterSet#delete_in: 5890471.8 i/s String#tr: 348506.8 i/s - 16.90x slower String#gsub: 318268.3 i/s - 18.51x slower ``` ``` Removing non-whitespace CharacterSet#keep_in: 
7396898.0 i/s String#gsub: 208809.7 i/s - 35.42x slower String#tr: 13.1 i/s - 564682.50x slower ``` ``` Keeping only emoji CharacterSet#keep_in: 7022741.1 i/s String#gsub: 180939.6 i/s - 38.81x slower String#tr: 13.1 i/s - 536724.50x slower ``` ``` Extracting emoji to an Array CharacterSet#scan: 3023176.8 i/s String#scan: 893225.8 i/s - 3.38x slower ``` ``` Detecting whitespace CharacterSet#used_by?: 17284025.9 i/s Regexp#match?: 11847064.5 i/s - 1.46x slower ``` ``` Detecting emoji in a large string CharacterSet#used_by?: 341386.1 i/s Regexp#match?: 183121.6 i/s - 1.86x slower ``` ``` Adding entries CharacterSet#add: 4989762.3 i/s SortedSet#add: 1157911.7 i/s - 4.31x slower ``` ``` Removing entries CharacterSet#delete: 4996703.6 i/s SortedSet#delete: 4177401.5 i/s - same-ish ``` ``` Merging entries CharacterSet#merge: 666.7 i/s SortedSet#merge: 4.0 i/s - 167.84x slower ``` ``` Getting the min and max CharacterSet#minmax: 1596470.9 i/s SortedSet#minmax: 866.4 i/s - 1842.74x slower ``` character_set-1.8.0/.github/0000755000004100000410000000000014620142357015723 5ustar www-datawww-datacharacter_set-1.8.0/.github/workflows/0000755000004100000410000000000014620142357017760 5ustar www-datawww-datacharacter_set-1.8.0/.github/workflows/gouteur.yml0000644000004100000410000000060414620142357022175 0ustar www-datawww-dataname: gouteur on: [push, pull_request] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Set up Ruby uses: ruby/setup-ruby@v1 with: ruby-version: 3.3 - name: Prepare run: | bundle install --jobs 4 bundle exec rake compile - name: Test run: bundle exec gouteur character_set-1.8.0/.github/workflows/lint.yml0000644000004100000410000000132314620142357021450 0ustar www-datawww-data# based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml name: rubocop linting on: [push, pull_request] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Set up Ruby uses: ruby/setup-ruby@v1 with: 
ruby-version: 3.3 - name: Cache gems uses: actions/cache@v1 with: path: vendor/bundle key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }} restore-keys: | ${{ runner.os }}-rubocop- - name: Install gems run: | bundle config path vendor/bundle bundle install --jobs 4 --retry 3 - name: Run rubocop run: bundle exec rubocop --lint character_set-1.8.0/.github/workflows/tests.yml0000644000004100000410000000123014620142357021641 0ustar www-datawww-dataname: tests on: push: pull_request: schedule: - cron: '11 11 14 * *' # at 11:11 am on the 14th of every month jobs: build: runs-on: ubuntu-latest strategy: matrix: ruby: [ '2.4', '2.7', '3.0', '3.1', '3.2', '3.3', 'ruby-head', 'jruby-head' ] steps: - uses: actions/checkout@v2 - name: Set up Ruby ${{ matrix.ruby }} uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby }} - name: Install dependencies run: bundle install --jobs 4 - name: Test with Rake run: bundle exec rake - uses: codecov/codecov-action@v3 if: matrix.ruby == '3.2' character_set-1.8.0/lib/0000755000004100000410000000000014620142357015131 5ustar www-datawww-datacharacter_set-1.8.0/lib/character_set.rb0000644000004100000410000000105714620142357020270 0ustar www-datawww-datarequire 'character_set/character' require 'character_set/expression_converter' require 'character_set/parser' require 'character_set/predefined_sets' require 'character_set/set_method_adapters' require 'character_set/shared_methods' require 'character_set/version' require 'character_set/writer' class CharacterSet begin require 'character_set/character_set' rescue LoadError require 'character_set/ruby_fallback' prepend RubyFallback end prepend SetMethodAdapters include Enumerable include SharedMethods extend PredefinedSets end character_set-1.8.0/lib/character_set/0000755000004100000410000000000014620142357017740 5ustar www-datawww-datacharacter_set-1.8.0/lib/character_set/core_ext.rb0000644000004100000410000000016014620142357022072 0ustar www-datawww-datarequire 
'character_set' require 'character_set/core_ext/regexp_ext' require 'character_set/core_ext/string_ext' character_set-1.8.0/lib/character_set/predefined_sets/0000755000004100000410000000000014620142357023103 5ustar www-datawww-datacharacter_set-1.8.0/lib/character_set/predefined_sets/ascii_alnum.cps0000644000004100000410000000002214620142357026070 0ustar www-datawww-data30,39 41,5A 61,7A character_set-1.8.0/lib/character_set/predefined_sets/newline.cps0000644000004100000410000000002414620142357025247 0ustar www-datawww-dataA,D 85,85 2028,2029 character_set-1.8.0/lib/character_set/predefined_sets/url_query.cps0000644000004100000410000000006014620142357025635 0ustar www-datawww-data21,21 24,24 26,3B 3D,3D 3F,5A 5F,5F 61,7A 7E,7E character_set-1.8.0/lib/character_set/predefined_sets/crypt.cps0000644000004100000410000000001414620142357024746 0ustar www-datawww-data2E,5A 61,7A character_set-1.8.0/lib/character_set/predefined_sets/emoji.cps0000644000004100000410000000314414620142357024717 0ustar www-datawww-data23,23 2A,2A 30,39 A9,A9 AE,AE 203C,203C 2049,2049 2122,2122 2139,2139 2194,2199 21A9,21AA 231A,231B 2328,2328 23CF,23CF 23E9,23F3 23F8,23FA 24C2,24C2 25AA,25AB 25B6,25B6 25C0,25C0 25FB,25FE 2600,2604 260E,260E 2611,2611 2614,2615 2618,2618 261D,261D 2620,2620 2622,2623 2626,2626 262A,262A 262E,262F 2638,263A 2640,2640 2642,2642 2648,2653 265F,2660 2663,2663 2665,2666 2668,2668 267B,267B 267E,267F 2692,2697 2699,2699 269B,269C 26A0,26A1 26A7,26A7 26AA,26AB 26B0,26B1 26BD,26BE 26C4,26C5 26C8,26C8 26CE,26CF 26D1,26D1 26D3,26D4 26E9,26EA 26F0,26F5 26F7,26FA 26FD,26FD 2702,2702 2705,2705 2708,270D 270F,270F 2712,2712 2714,2714 2716,2716 271D,271D 2721,2721 2728,2728 2733,2734 2744,2744 2747,2747 274C,274C 274E,274E 2753,2755 2757,2757 2763,2764 2795,2797 27A1,27A1 27B0,27B0 27BF,27BF 2934,2935 2B05,2B07 2B1B,2B1C 2B50,2B50 2B55,2B55 3030,3030 303D,303D 3297,3297 3299,3299 1F004,1F004 1F0CF,1F0CF 1F170,1F171 1F17E,1F17F 1F18E,1F18E 1F191,1F19A 1F1E6,1F1FF 1F201,1F202 
1F21A,1F21A 1F22F,1F22F 1F232,1F23A 1F250,1F251 1F300,1F321 1F324,1F393 1F396,1F397 1F399,1F39B 1F39E,1F3F0 1F3F3,1F3F5 1F3F7,1F4FD 1F4FF,1F53D 1F549,1F54E 1F550,1F567 1F56F,1F570 1F573,1F57A 1F587,1F587 1F58A,1F58D 1F590,1F590 1F595,1F596 1F5A4,1F5A5 1F5A8,1F5A8 1F5B1,1F5B2 1F5BC,1F5BC 1F5C2,1F5C4 1F5D1,1F5D3 1F5DC,1F5DE 1F5E1,1F5E1 1F5E3,1F5E3 1F5E8,1F5E8 1F5EF,1F5EF 1F5F3,1F5F3 1F5FA,1F64F 1F680,1F6C5 1F6CB,1F6D2 1F6D5,1F6D7 1F6DD,1F6E5 1F6E9,1F6E9 1F6EB,1F6EC 1F6F0,1F6F0 1F6F3,1F6FC 1F7E0,1F7EB 1F7F0,1F7F0 1F90C,1F93A 1F93C,1F945 1F947,1F9FF 1FA70,1FA74 1FA78,1FA7C 1FA80,1FA86 1FA90,1FAAC 1FAB0,1FABA 1FAC0,1FAC5 1FAD0,1FAD9 1FAE0,1FAE7 1FAF0,1FAF6 character_set-1.8.0/lib/character_set/predefined_sets/ascii.cps0000644000004100000410000000000514620142357024675 0ustar www-datawww-data0,7F character_set-1.8.0/lib/character_set/predefined_sets/url_fragment.cps0000644000004100000410000000006014620142357026273 0ustar www-datawww-data21,21 24,24 26,3B 3D,3D 3F,5A 5F,5F 61,7A 7E,7E character_set-1.8.0/lib/character_set/predefined_sets/assigned.cps0000644000004100000410000001632214620142357025413 0ustar www-datawww-data0,377 37A,37F 384,38A 38C,38C 38E,3A1 3A3,52F 531,556 559,58A 58D,58F 591,5C7 5D0,5EA 5EF,5F4 600,70D 70F,74A 74D,7B1 7C0,7FA 7FD,82D 830,83E 840,85B 85E,85E 860,86A 870,88E 890,891 898,983 985,98C 98F,990 993,9A8 9AA,9B0 9B2,9B2 9B6,9B9 9BC,9C4 9C7,9C8 9CB,9CE 9D7,9D7 9DC,9DD 9DF,9E3 9E6,9FE A01,A03 A05,A0A A0F,A10 A13,A28 A2A,A30 A32,A33 A35,A36 A38,A39 A3C,A3C A3E,A42 A47,A48 A4B,A4D A51,A51 A59,A5C A5E,A5E A66,A76 A81,A83 A85,A8D A8F,A91 A93,AA8 AAA,AB0 AB2,AB3 AB5,AB9 ABC,AC5 AC7,AC9 ACB,ACD AD0,AD0 AE0,AE3 AE6,AF1 AF9,AFF B01,B03 B05,B0C B0F,B10 B13,B28 B2A,B30 B32,B33 B35,B39 B3C,B44 B47,B48 B4B,B4D B55,B57 B5C,B5D B5F,B63 B66,B77 B82,B83 B85,B8A B8E,B90 B92,B95 B99,B9A B9C,B9C B9E,B9F BA3,BA4 BA8,BAA BAE,BB9 BBE,BC2 BC6,BC8 BCA,BCD BD0,BD0 BD7,BD7 BE6,BFA C00,C0C C0E,C10 C12,C28 C2A,C39 C3C,C44 C46,C48 C4A,C4D C55,C56 C58,C5A C5D,C5D C60,C63 
C66,C6F C77,C8C C8E,C90 C92,CA8 CAA,CB3 CB5,CB9 CBC,CC4 CC6,CC8 CCA,CCD CD5,CD6 CDD,CDE CE0,CE3 CE6,CEF CF1,CF2 D00,D0C D0E,D10 D12,D44 D46,D48 D4A,D4F D54,D63 D66,D7F D81,D83 D85,D96 D9A,DB1 DB3,DBB DBD,DBD DC0,DC6 DCA,DCA DCF,DD4 DD6,DD6 DD8,DDF DE6,DEF DF2,DF4 E01,E3A E3F,E5B E81,E82 E84,E84 E86,E8A E8C,EA3 EA5,EA5 EA7,EBD EC0,EC4 EC6,EC6 EC8,ECD ED0,ED9 EDC,EDF F00,F47 F49,F6C F71,F97 F99,FBC FBE,FCC FCE,FDA 1000,10C5 10C7,10C7 10CD,10CD 10D0,1248 124A,124D 1250,1256 1258,1258 125A,125D 1260,1288 128A,128D 1290,12B0 12B2,12B5 12B8,12BE 12C0,12C0 12C2,12C5 12C8,12D6 12D8,1310 1312,1315 1318,135A 135D,137C 1380,1399 13A0,13F5 13F8,13FD 1400,169C 16A0,16F8 1700,1715 171F,1736 1740,1753 1760,176C 176E,1770 1772,1773 1780,17DD 17E0,17E9 17F0,17F9 1800,1819 1820,1878 1880,18AA 18B0,18F5 1900,191E 1920,192B 1930,193B 1940,1940 1944,196D 1970,1974 1980,19AB 19B0,19C9 19D0,19DA 19DE,1A1B 1A1E,1A5E 1A60,1A7C 1A7F,1A89 1A90,1A99 1AA0,1AAD 1AB0,1ACE 1B00,1B4C 1B50,1B7E 1B80,1BF3 1BFC,1C37 1C3B,1C49 1C4D,1C88 1C90,1CBA 1CBD,1CC7 1CD0,1CFA 1D00,1F15 1F18,1F1D 1F20,1F45 1F48,1F4D 1F50,1F57 1F59,1F59 1F5B,1F5B 1F5D,1F5D 1F5F,1F7D 1F80,1FB4 1FB6,1FC4 1FC6,1FD3 1FD6,1FDB 1FDD,1FEF 1FF2,1FF4 1FF6,1FFE 2000,2064 2066,2071 2074,208E 2090,209C 20A0,20C0 20D0,20F0 2100,218B 2190,2426 2440,244A 2460,2B73 2B76,2B95 2B97,2CF3 2CF9,2D25 2D27,2D27 2D2D,2D2D 2D30,2D67 2D6F,2D70 2D7F,2D96 2DA0,2DA6 2DA8,2DAE 2DB0,2DB6 2DB8,2DBE 2DC0,2DC6 2DC8,2DCE 2DD0,2DD6 2DD8,2DDE 2DE0,2E5D 2E80,2E99 2E9B,2EF3 2F00,2FD5 2FF0,2FFB 3000,303F 3041,3096 3099,30FF 3105,312F 3131,318E 3190,31E3 31F0,321E 3220,A48C A490,A4C6 A4D0,A62B A640,A6F7 A700,A7CA A7D0,A7D1 A7D3,A7D3 A7D5,A7D9 A7F2,A82C A830,A839 A840,A877 A880,A8C5 A8CE,A8D9 A8E0,A953 A95F,A97C A980,A9CD A9CF,A9D9 A9DE,A9FE AA00,AA36 AA40,AA4D AA50,AA59 AA5C,AAC2 AADB,AAF6 AB01,AB06 AB09,AB0E AB11,AB16 AB20,AB26 AB28,AB2E AB30,AB6B AB70,ABED ABF0,ABF9 AC00,D7A3 D7B0,D7C6 D7CB,D7FB D800,FA6D FA70,FAD9 FB00,FB06 FB13,FB17 FB1D,FB36 FB38,FB3C FB3E,FB3E 
FB40,FB41 FB43,FB44 FB46,FBC2 FBD3,FD8F FD92,FDC7 FDCF,FDCF FDF0,FE19 FE20,FE52 FE54,FE66 FE68,FE6B FE70,FE74 FE76,FEFC FEFF,FEFF FF01,FFBE FFC2,FFC7 FFCA,FFCF FFD2,FFD7 FFDA,FFDC FFE0,FFE6 FFE8,FFEE FFF9,FFFD 10000,1000B 1000D,10026 10028,1003A 1003C,1003D 1003F,1004D 10050,1005D 10080,100FA 10100,10102 10107,10133 10137,1018E 10190,1019C 101A0,101A0 101D0,101FD 10280,1029C 102A0,102D0 102E0,102FB 10300,10323 1032D,1034A 10350,1037A 10380,1039D 1039F,103C3 103C8,103D5 10400,1049D 104A0,104A9 104B0,104D3 104D8,104FB 10500,10527 10530,10563 1056F,1057A 1057C,1058A 1058C,10592 10594,10595 10597,105A1 105A3,105B1 105B3,105B9 105BB,105BC 10600,10736 10740,10755 10760,10767 10780,10785 10787,107B0 107B2,107BA 10800,10805 10808,10808 1080A,10835 10837,10838 1083C,1083C 1083F,10855 10857,1089E 108A7,108AF 108E0,108F2 108F4,108F5 108FB,1091B 1091F,10939 1093F,1093F 10980,109B7 109BC,109CF 109D2,10A03 10A05,10A06 10A0C,10A13 10A15,10A17 10A19,10A35 10A38,10A3A 10A3F,10A48 10A50,10A58 10A60,10A9F 10AC0,10AE6 10AEB,10AF6 10B00,10B35 10B39,10B55 10B58,10B72 10B78,10B91 10B99,10B9C 10BA9,10BAF 10C00,10C48 10C80,10CB2 10CC0,10CF2 10CFA,10D27 10D30,10D39 10E60,10E7E 10E80,10EA9 10EAB,10EAD 10EB0,10EB1 10F00,10F27 10F30,10F59 10F70,10F89 10FB0,10FCB 10FE0,10FF6 11000,1104D 11052,11075 1107F,110C2 110CD,110CD 110D0,110E8 110F0,110F9 11100,11134 11136,11147 11150,11176 11180,111DF 111E1,111F4 11200,11211 11213,1123E 11280,11286 11288,11288 1128A,1128D 1128F,1129D 1129F,112A9 112B0,112EA 112F0,112F9 11300,11303 11305,1130C 1130F,11310 11313,11328 1132A,11330 11332,11333 11335,11339 1133B,11344 11347,11348 1134B,1134D 11350,11350 11357,11357 1135D,11363 11366,1136C 11370,11374 11400,1145B 1145D,11461 11480,114C7 114D0,114D9 11580,115B5 115B8,115DD 11600,11644 11650,11659 11660,1166C 11680,116B9 116C0,116C9 11700,1171A 1171D,1172B 11730,11746 11800,1183B 118A0,118F2 118FF,11906 11909,11909 1190C,11913 11915,11916 11918,11935 11937,11938 1193B,11946 11950,11959 119A0,119A7 119AA,119D7 
119DA,119E4 11A00,11A47 11A50,11AA2 11AB0,11AF8 11C00,11C08 11C0A,11C36 11C38,11C45 11C50,11C6C 11C70,11C8F 11C92,11CA7 11CA9,11CB6 11D00,11D06 11D08,11D09 11D0B,11D36 11D3A,11D3A 11D3C,11D3D 11D3F,11D47 11D50,11D59 11D60,11D65 11D67,11D68 11D6A,11D8E 11D90,11D91 11D93,11D98 11DA0,11DA9 11EE0,11EF8 11FB0,11FB0 11FC0,11FF1 11FFF,12399 12400,1246E 12470,12474 12480,12543 12F90,12FF2 13000,1342E 13430,13438 14400,14646 16800,16A38 16A40,16A5E 16A60,16A69 16A6E,16ABE 16AC0,16AC9 16AD0,16AED 16AF0,16AF5 16B00,16B45 16B50,16B59 16B5B,16B61 16B63,16B77 16B7D,16B8F 16E40,16E9A 16F00,16F4A 16F4F,16F87 16F8F,16F9F 16FE0,16FE4 16FF0,16FF1 17000,187F7 18800,18CD5 18D00,18D08 1AFF0,1AFF3 1AFF5,1AFFB 1AFFD,1AFFE 1B000,1B122 1B150,1B152 1B164,1B167 1B170,1B2FB 1BC00,1BC6A 1BC70,1BC7C 1BC80,1BC88 1BC90,1BC99 1BC9C,1BCA3 1CF00,1CF2D 1CF30,1CF46 1CF50,1CFC3 1D000,1D0F5 1D100,1D126 1D129,1D1EA 1D200,1D245 1D2E0,1D2F3 1D300,1D356 1D360,1D378 1D400,1D454 1D456,1D49C 1D49E,1D49F 1D4A2,1D4A2 1D4A5,1D4A6 1D4A9,1D4AC 1D4AE,1D4B9 1D4BB,1D4BB 1D4BD,1D4C3 1D4C5,1D505 1D507,1D50A 1D50D,1D514 1D516,1D51C 1D51E,1D539 1D53B,1D53E 1D540,1D544 1D546,1D546 1D54A,1D550 1D552,1D6A5 1D6A8,1D7CB 1D7CE,1DA8B 1DA9B,1DA9F 1DAA1,1DAAF 1DF00,1DF1E 1E000,1E006 1E008,1E018 1E01B,1E021 1E023,1E024 1E026,1E02A 1E100,1E12C 1E130,1E13D 1E140,1E149 1E14E,1E14F 1E290,1E2AE 1E2C0,1E2F9 1E2FF,1E2FF 1E7E0,1E7E6 1E7E8,1E7EB 1E7ED,1E7EE 1E7F0,1E7FE 1E800,1E8C4 1E8C7,1E8D6 1E900,1E94B 1E950,1E959 1E95E,1E95F 1EC71,1ECB4 1ED01,1ED3D 1EE00,1EE03 1EE05,1EE1F 1EE21,1EE22 1EE24,1EE24 1EE27,1EE27 1EE29,1EE32 1EE34,1EE37 1EE39,1EE39 1EE3B,1EE3B 1EE42,1EE42 1EE47,1EE47 1EE49,1EE49 1EE4B,1EE4B 1EE4D,1EE4F 1EE51,1EE52 1EE54,1EE54 1EE57,1EE57 1EE59,1EE59 1EE5B,1EE5B 1EE5D,1EE5D 1EE5F,1EE5F 1EE61,1EE62 1EE64,1EE64 1EE67,1EE6A 1EE6C,1EE72 1EE74,1EE77 1EE79,1EE7C 1EE7E,1EE7E 1EE80,1EE89 1EE8B,1EE9B 1EEA1,1EEA3 1EEA5,1EEA9 1EEAB,1EEBB 1EEF0,1EEF1 1F000,1F02B 1F030,1F093 1F0A0,1F0AE 1F0B1,1F0BF 1F0C1,1F0CF 1F0D1,1F0F5 1F100,1F1AD 
1F1E6,1F202 1F210,1F23B 1F240,1F248 1F250,1F251 1F260,1F265 1F300,1F6D7 1F6DD,1F6EC 1F6F0,1F6FC 1F700,1F773 1F780,1F7D8 1F7E0,1F7EB 1F7F0,1F7F0 1F800,1F80B 1F810,1F847 1F850,1F859 1F860,1F887 1F890,1F8AD 1F8B0,1F8B1 1F900,1FA53 1FA60,1FA6D 1FA70,1FA74 1FA78,1FA7C 1FA80,1FA86 1FA90,1FAAC 1FAB0,1FABA 1FAC0,1FAC5 1FAD0,1FAD9 1FAE0,1FAE7 1FAF0,1FAF6 1FB00,1FB92 1FB94,1FBCA 1FBF0,1FBF9 20000,2A6DF 2A700,2B738 2B740,2B81D 2B820,2CEA1 2CEB0,2EBE0 2F800,2FA1D 30000,3134A E0001,E0001 E0020,E007F E0100,E01EF F0000,FFFFD 100000,10FFFD character_set-1.8.0/lib/character_set/predefined_sets/whitespace.cps0000644000004100000410000000012214620142357025741 0ustar www-datawww-data9,D 20,20 85,85 A0,A0 1680,1680 2000,200A 2028,2029 202F,202F 205F,205F 3000,3000 character_set-1.8.0/lib/character_set/predefined_sets/ascii_letter.cps0000644000004100000410000000001414620142357026254 0ustar www-datawww-data41,5A 61,7A character_set-1.8.0/lib/character_set/predefined_sets/unicode.cps0000644000004100000410000000002314620142357025233 0ustar www-datawww-data0,D7FF E000,10FFFF character_set-1.8.0/lib/character_set/predefined_sets/url_host.cps0000644000004100000410000000007414620142357025452 0ustar www-datawww-data21,21 24,24 26,2E 30,3B 3D,3D 41,5B 5D,5D 5F,5F 61,7A 7E,7E character_set-1.8.0/lib/character_set/predefined_sets/url_path.cps0000644000004100000410000000005214620142357025425 0ustar www-datawww-data21,21 24,3A 3D,3D 40,5A 5F,5F 61,7A 7E,7E character_set-1.8.0/lib/character_set/predefined_sets/bmp.cps0000644000004100000410000000002114620142357024361 0ustar www-datawww-data0,D7FF E000,FFFF character_set-1.8.0/lib/character_set/predefined_sets/any.cps0000644000004100000410000000001114620142357024371 0ustar www-datawww-data0,10FFFF character_set-1.8.0/lib/character_set/predefined_sets/surrogate.cps0000644000004100000410000000001214620142357025616 0ustar www-datawww-dataD800,DFFF character_set-1.8.0/lib/character_set/shared_methods.rb0000644000004100000410000001360514620142357023263 
0ustar www-datawww-data# # Various methods shared by the pure-Ruby and the extended implementation. # # Many of these methods are hotspots, so they are defined directly on # the including classes for better performance. # class CharacterSet module SharedMethods def self.included(klass) klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1 LoadError = Class.new(::LoadError) class << self def [](*args) new(Array(args)) end def of(*args) args.map do |arg| arg.is_a?(Regexp) ? of_regexp(arg) : of_string(arg) end.reduce(:merge) || new end def parse(string) codepoints = Parser.codepoints_from_bracket_expression(string) result = new(codepoints) string.start_with?('[^') ? result.inversion : result end def of_property(property_name) require_optional_dependency('regexp_property_values', __method__) property = RegexpPropertyValues[property_name.to_s] from_ranges(*property.matched_ranges) end def of_regexp(regexp) require_optional_dependency('regexp_parser', __method__) root = ::Regexp::Parser.parse(regexp) of_expression(root) end def of_expression(expression) ExpressionConverter.convert(expression, self) end def require_optional_dependency(name, method) required_optional_dependencies[name] ||= begin require name true rescue ::LoadError raise LoadError, 'You must install the optional dependency '\ "'\#{name}' to use the method `\#{method}'." 
end end def required_optional_dependencies @required_optional_dependencies ||= {} end end # class << self def initialize(enumerable = []) merge(Parser.codepoints_from_enumerable(enumerable)) end def replace(enum) unless [Array, CharacterSet, Range].include?(enum.class) enum = self.class.new(enum) end clear merge(enum) end # CharacterSet-specific conversion methods def assigned_part self & self.class.assigned end def valid_part self - self.class.surrogate end # CharacterSet-specific stringification methods def to_s(opts = {}, &block) Writer.write(ranges, opts, &block) end def to_s_with_surrogate_ranges Writer.write_surrogate_ranges(bmp_part.ranges, astral_part.ranges) end def to_s_with_surrogate_alternation Writer.write_surrogate_alternation(bmp_part.ranges, astral_part.ranges) end def secure_token(length = 32) CharacterSet.require_optional_dependency('securerandom', __method__) cps = to_a len = cps.count 1.upto(length).map { cps[SecureRandom.random_number(len)] }.pack('U*') end alias random_token secure_token def inspect len = length "#<#{klass.name}: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>" end # C-extension adapter methods. Need overriding in pure fallback. # Parsing kwargs in C is slower, verbose, and kinda deprecated. def inversion(include_surrogates: false, upto: 0x10FFFF) ext_inversion(include_surrogates, upto) end def section(from:, upto: 0x10FFFF) ext_section(from, upto) end def count_in_section(from:, upto: 0x10FFFF) ext_count_in_section(from, upto) end def section?(from:, upto: 0x10FFFF) ext_section?(from, upto) end def section_ratio(from:, upto: 0x10FFFF) ext_section_ratio(from, upto) end # # The following methods are here for `Set` compatibility, but they are # comparatively slow. Prefer others. # def map! block_given? or return enum_for(__method__) { size } arr = [] each { |cp| arr << yield(cp) } replace(arr) end alias collect! map! def reject!(&block) block_given? 
or return enum_for(__method__) { size } old_size = size delete_if(&block) self if size != old_size end def select!(&block) block_given? or return enum_for(__method__) { size } old_size = size keep_if(&block) self if size != old_size end alias filter! select! def classify block_given? or return enum_for(__method__) { size } each_with_object({}) { |cp, h| (h[yield(cp)] ||= self.class.new).add(cp) } end def divide(&func) require 'character_set/ruby_fallback' CharacterSet::RubyFallback::Set.new(to_a).divide(&func) end def join(separator = '') to_a(true).join(separator) end RUBY # CharacterSet-specific section methods { ascii: 0..0x7F, bmp: 0..0xFFFF, astral: 0x10000..0x10FFFF, }.each do |section_name, range| klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{section_name}_part section(from: #{range.begin}, upto: #{range.end}) end def #{section_name}_part? section?(from: #{range.begin}, upto: #{range.end}) end def #{section_name}_only? #{range.begin == 0 ? "!section?(from: #{range.end}, upto: 0x10FFFF)" : "!section?(from: 0, upto: #{range.begin})"} end def #{section_name}_ratio section_ratio(from: #{range.begin}, upto: #{range.end}) end RUBY end end # self.included end # SharedMethods end character_set-1.8.0/lib/character_set/expression_converter.rb0000644000004100000410000000640514620142357024560 0ustar www-datawww-dataclass CharacterSet module ExpressionConverter module_function Error = Class.new(ArgumentError) def convert(expression, to = CharacterSet, acc = []) CharacterSet.require_optional_dependency('regexp_parser', __method__) case expression when Regexp::Expression::CharacterSet content = expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[] acc << (expression.negative? ? 
content.inversion : content) when Regexp::Expression::CharacterSet::Intersection acc << expression.map { |subexp| convert(subexp, to) }.reduce(:&) when Regexp::Expression::CharacterSet::Range start, finish = expression.map { |subexp| convert(subexp, to) } acc << to.new((start.min)..(finish.max)) when Regexp::Expression::Subexpression # root, group, alternation, etc. expression.each { |subexp| convert(subexp, to, acc) } when Regexp::Expression::CharacterType::Any acc << to.unicode when Regexp::Expression::CharacterType::Base /(?non)?(?.+)/ =~ expression.token content = if expression.unicode_classes? # in u-mode, most type shortcuts match the same as \p{} if base_name == 'linebreak' to.from_ranges(10..13, 133..133, 8232..8233) else to.of_property(base_name) end else # in normal mode, types match only ascii chars case base_name.to_sym when :digit then to.from_ranges(48..57) when :hex then to.from_ranges(48..57, 65..70, 97..102) when :linebreak then to.from_ranges(10..13) when :space then to.from_ranges(9..13, 32..32) when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122) else raise Error, "Unsupported CharacterType #{base_name}" end end acc << (negative ? content.inversion : content) when Regexp::Expression::EscapeSequence::CodepointList content = to.new(expression.codepoints) acc << (expression.i? ? content.case_insensitive : content) when Regexp::Expression::EscapeSequence::Base content = to[expression.codepoint] acc << (expression.i? ? content.case_insensitive : content) when Regexp::Expression::Literal content = to[*expression.text.chars] acc << (expression.i? ? content.case_insensitive : content) when Regexp::Expression::UnicodeProperty::Base, Regexp::Expression::PosixClass content = to.of_property(expression.token) if expression.type == :posixclass && expression.ascii_classes? content = content.ascii_part end acc << (expression.negative? ? 
content.inversion : content) when Regexp::Expression::Anchor::Base, Regexp::Expression::Backreference::Base, Regexp::Expression::Keep::Mark, Regexp::Expression::Quantifier # ignore zero-length and repeat expressions when Regexp::Expression::Base raise Error, "Unsupported expression class `#{expression.class}`" else raise Error, 'Pass an expression (result of Regexp::Parser.parse)' end acc.reduce(:+) || to[] end end end character_set-1.8.0/lib/character_set/character.rb0000644000004100000410000000350514620142357022224 0ustar www-datawww-dataclass CharacterSet class Character ENCODING = 'utf-8'.freeze SAFELY_PRINTABLE = (0x21..0x7E).to_a - %w(- / [ \\ ] ^).map(&:ord) attr_accessor :codepoint def initialize(codepoint) case codepoint when Integer then self.codepoint = codepoint when String then self.codepoint = codepoint.ord else raise ArgumentError, 'pass an Integer or String' end end def to_s codepoint.chr(ENCODING) end def hex codepoint.to_s(16).upcase end def escape(opts = {}) return to_s if SAFELY_PRINTABLE.include?(codepoint) && !opts[:escape_all] return yield(self) if block_given? 
# https://billposer.org/Software/ListOfRepresentations.html case opts[:format].to_s.downcase.delete('-_ ') when '', 'default', 'es6', 'esnext', 'rb', 'ruby' default_escape(opts) when 'java', 'javascript', 'js' default_escape(opts, false) when 'capitalizableu', 'c#', 'csharp', 'd', 'python' capitalizable_u_escape when 'u+', 'uplus' u_plus_escape when 'literal', 'raw' to_s else raise ArgumentError, "unsupported format: #{opts[:format].inspect}" end end def plane codepoint / 0x10000 end private def default_escape(opts, support_wide_hex = true) if hex.length <= 2 '\\x' + hex.rjust(2, '0') elsif hex.length <= 4 '\\u' + hex.rjust(4, '0') elsif support_wide_hex '\\u{' + hex + '}' else raise "#{opts[:format]} does not support escaping astral value #{hex}" end end def capitalizable_u_escape if hex.length <= 4 '\\u' + hex.rjust(4, '0') else '\\U' + hex.rjust(8, '0') end end def u_plus_escape 'U+' + hex.rjust(4, '0') end end end character_set-1.8.0/lib/character_set/ruby_fallback/0000755000004100000410000000000014620142357022540 5ustar www-datawww-datacharacter_set-1.8.0/lib/character_set/ruby_fallback/vendored_set_classes.rb0000644000004100000410000001605114620142357027266 0ustar www-datawww-data# set, vendored from https://github.com/ruby/set/blob/master/lib/set.rb, # with comments removed and linted. class CharacterSet::RubyFallback::Set Set = self include Enumerable def self.[](*ary) new(ary) end def initialize(enum = nil, &block) @hash = Hash.new(false) enum.nil? 
and return if block do_with_enum(enum) { |o| add(block[o]) } else merge(enum) end end def do_with_enum(enum, &block) if enum.respond_to?(:each_entry) enum.each_entry(&block) if block elsif enum.respond_to?(:each) enum.each(&block) if block else raise ArgumentError, "value must be enumerable" end end private :do_with_enum def initialize_dup(orig) super @hash = orig.instance_variable_get(:@hash).dup end if Kernel.instance_method(:initialize_clone).arity != 1 def initialize_clone(orig, **options) super @hash = orig.instance_variable_get(:@hash).clone(**options) end else def initialize_clone(orig) super @hash = orig.instance_variable_get(:@hash).clone end end def freeze @hash.freeze super end def size @hash.size end alias length size def empty? @hash.empty? end def clear @hash.clear self end def to_a @hash.keys end def include?(o) @hash[o] end alias member? include? def superset?(set) case when set.instance_of?(self.class) && @hash.respond_to?(:>=) @hash >= set.instance_variable_get(:@hash) when set.is_a?(Set) size >= set.size && set.all? { |o| include?(o) } else raise ArgumentError, "value must be a set" end end alias >= superset? def proper_superset?(set) case when set.instance_of?(self.class) && @hash.respond_to?(:>) @hash > set.instance_variable_get(:@hash) when set.is_a?(Set) size > set.size && set.all? { |o| include?(o) } else raise ArgumentError, "value must be a set" end end alias > proper_superset? def subset?(set) case when set.instance_of?(self.class) && @hash.respond_to?(:<=) @hash <= set.instance_variable_get(:@hash) when set.is_a?(Set) size <= set.size && all? { |o| set.include?(o) } else raise ArgumentError, "value must be a set" end end alias <= subset? def proper_subset?(set) case when set.instance_of?(self.class) && @hash.respond_to?(:<) @hash < set.instance_variable_get(:@hash) when set.is_a?(Set) size < set.size && all? { |o| set.include?(o) } else raise ArgumentError, "value must be a set" end end alias < proper_subset? 
# Compares two sets by inclusion.
#
# Returns nil when +set+ is not a Set, or when neither operand contains the
# other. Otherwise returns -1 if self is a proper subset of +set+, +1 if it
# is a proper superset, and 0 if both are equal.
def <=>(set)
  return unless set.is_a?(Set)

  case size <=> set.size
  when -1 then -1 if proper_subset?(set)
  when +1 then +1 if proper_superset?(set)
  else 0 if self.==(set)
  end
end

# Returns true if self and +set+ have at least one element in common.
#
# +set+ may be a Set or any other Enumerable; anything else raises
# ArgumentError.
def intersect?(set)
  case set
  when Set
    # Enumerate the smaller collection and probe the other one.
    if size < set.size
      any? { |o| set.include?(o) }
    else
      set.any? { |o| include?(o) }
    end
  when Enumerable
    set.any? { |o| include?(o) }
  else
    raise ArgumentError, "value must be enumerable"
  end
end

# Returns true if self and +set+ have no element in common.
# Accepts and rejects the same arguments as #intersect?.
def disjoint?(set)
  !intersect?(set)
end

# Yields every element to the block and returns self.
# Without a block, returns an Enumerator that reports #size lazily.
def each(&block)
  block_given? or return enum_for(__method__) { size }
  @hash.each_key(&block)
  self
end

# Adds +o+ to the set and returns self.
def add(o)
  @hash[o] = true
  self
end
alias << add

# Adds +o+ and returns self, or returns nil if +o+ is already included.
def add?(o)
  add(o) unless include?(o)
end

# Removes +o+ from the set (a no-op if it is absent) and returns self.
def delete(o)
  @hash.delete(o)
  self
end

# Removes +o+ and returns self, or returns nil if +o+ is not included.
def delete?(o)
  delete(o) if include?(o)
end

# Removes every element for which the block returns a truthy value and
# returns self. Without a block, returns a sized Enumerator.
def delete_if
  block_given? or return enum_for(__method__) { size }
  # Collect the matches via #select first, then delete them, so that @hash
  # is never modified while it is being enumerated.
  select { |o| yield o }.each { |o| @hash.delete(o) }
  self
end

# Removes every element for which the block returns a falsy value and
# returns self. Without a block, returns a sized Enumerator.
def keep_if
  block_given? or return enum_for(__method__) { size }
  # Same two-step approach as #delete_if, with the condition inverted.
  reject { |o| yield o }.each { |o| @hash.delete(o) }
  self
end

# Like #delete_if, but returns nil if no element was removed.
def reject!(&block)
  block_given? or return enum_for(__method__) { size }
  n = size
  delete_if(&block)
  self if size != n
end

# Like #keep_if, but returns nil if no element was removed.
def select!(&block)
  block_given? or return enum_for(__method__) { size }
  n = size
  keep_if(&block)
  self if size != n
end
alias filter! select!
def merge(*enums, **_rest) enums.each do |enum| if enum.instance_of?(self.class) @hash.update(enum.instance_variable_get(:@hash)) else do_with_enum(enum) { |o| add(o) } end end self end def subtract(enum) do_with_enum(enum) { |o| delete(o) } self end def |(enum) dup.merge(enum) end alias + | alias union | def -(enum) dup.subtract(enum) end alias difference - def &(enum) n = self.class.new if enum.is_a?(Set) if enum.size > size each { |o| n.add(o) if enum.include?(o) } else enum.each { |o| n.add(o) if include?(o) } end else do_with_enum(enum) { |o| n.add(o) if include?(o) } end n end alias intersection & def ^(enum) n = Set.new(enum) each { |o| n.add(o) unless n.delete?(o) } n end def ==(other) if self.equal?(other) true elsif other.instance_of?(self.class) @hash == other.instance_variable_get(:@hash) elsif other.is_a?(Set) && self.size == other.size other.all? { |o| @hash.include?(o) } else false end end def hash @hash.hash end def eql?(o) return false unless o.is_a?(Set) @hash.eql?(o.instance_variable_get(:@hash)) end alias === include? def classify block_given? 
or return enum_for(__method__) { size } h = {} each { |i| (h[yield(i)] ||= self.class.new).add(i) } h end def divide(&func) func or return enum_for(__method__) { size } if func.arity == 2 require 'tsort' class << dig = {} include TSort alias tsort_each_node each_key def tsort_each_child(node, &block) fetch(node).each(&block) end end each { |u| dig[u] = a = [] each{ |v| func.call(u, v) and a << v } } set = Set.new() dig.each_strongly_connected_component { |css| set.add(self.class.new(css)) } set else Set.new(classify(&func).values) end end end # sorted_set without rbtree dependency, vendored from # https://github.com/ruby/set/blob/72f08c4/lib/set.rb#L731-L800 class CharacterSet::RubyFallback::SortedSet < CharacterSet::RubyFallback::Set def initialize(*args) @keys = nil super end def clear @keys = nil super end def add(o) @keys = nil super end alias << add def delete(o) @keys = nil @hash.delete(o) self end def delete_if block_given? or return enum_for(__method__) { size } n = @hash.size super @keys = nil if @hash.size != n self end def keep_if block_given? or return enum_for(__method__) { size } n = @hash.size super @keys = nil if @hash.size != n self end def merge(enum) @keys = nil super end def each(&block) block or return enum_for(__method__) { size } to_a.each(&block) self end def to_a (@keys = @hash.keys).sort! unless @keys @keys.dup end def freeze to_a super end end character_set-1.8.0/lib/character_set/ruby_fallback/set_methods.rb0000644000004100000410000000535014620142357025406 0ustar www-datawww-dataclass CharacterSet module RubyFallback module SetMethods (Enumerable.instance_methods - %i[include? member? to_a] + %i[empty? hash length size]).each do |mthd| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{mthd}(*args, &block) @__set.#{mthd}(*args, &block) end RUBY end %i[< <= <=> > >= === disjoint? include? intersect? member? proper_subset? proper_superset? subset? 
superset?].each do |mthd| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{mthd}(enum, &block) if enum.is_a?(CharacterSet) || enum.is_a?(CharacterSet::Pure) enum = enum.instance_variable_get(:@__set) end @__set.#{mthd}(enum, &block) end RUBY end %i[<< add add? clear delete delete? delete_if each filter! keep_if reject! select! subtract].each do |mthd| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{mthd}(*args, &block) result = @__set.#{mthd}(*args, &block) result.is_a?(Set) ? self : result end RUBY end %i[& + - ^ | difference intersection union].each do |mthd| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{mthd}(enum, &block) if enum.respond_to?(:map) enum = enum.map { |el| el.is_a?(String) ? el.ord : el } end self.class.new(@__set.#{mthd}(enum, &block).to_a) end RUBY end unless RUBY_PLATFORM[/java/i] def freeze @__set.to_a @__set.freeze super end end def merge(other) raise ArgumentError, 'pass an Enumerable' unless other.respond_to?(:each) # pass through #add to use the checks in SetMethodAdapters other.each { |e| add(e) } self end def ==(other) if equal?(other) true elsif other.instance_of?(self.class) @__set == other.instance_variable_get(:@__set) elsif other.is_a?(CharacterSet) || other.is_a?(CharacterSet::Pure) size == other.size && other.all? { |cp| @__set.include?(cp) } else false end end def eql?(other) return false unless other.is_a?(self.class) @__set.eql?(other.instance_variable_get(:@__set)) end def initialize_dup(orig) super @__set = orig.instance_variable_get(:@__set).dup end def initialize_clone(orig) super @__set = orig.instance_variable_get(:@__set).clone end def to_a(stringify = false) result = @__set.to_a stringify ? 
result.map { |cp| cp.chr('utf-8') } : result end end end end character_set-1.8.0/lib/character_set/ruby_fallback/character_set_methods.rb0000644000004100000410000000673414620142357027431 0ustar www-datawww-dataclass CharacterSet module RubyFallback module CharacterSetMethods module ClassMethods def from_ranges(*ranges) new(Array(ranges).flat_map(&:to_a)) end def of_string(str) raise ArgumentError, 'pass a String' unless str.respond_to?(:codepoints) str.encode('utf-8').each_codepoint.with_object(new) { |cp, set| set << cp } end end def inversion(include_surrogates: false, upto: 0x10FFFF) new_set = self.class.new 0.upto(upto) do |cp| next unless include_surrogates || cp > 0xDFFF || cp < 0xD800 new_set << cp unless include?(cp) end new_set end def case_insensitive new_set = dup each do |cp| swapped_cps = cp.chr('utf-8').swapcase.codepoints swapped_cps.size == 1 && new_set << swapped_cps[0] end new_set end def ranges CharacterSet.require_optional_dependency('range_compressor', __method__) RangeCompressor.compress(self) end def sample(count = nil) count.nil? ? to_a(true).sample : to_a(true).sample(count) end def count_in(string) utf8_str!(string).each_codepoint.count { |cp| include?(cp) } end def cover?(string) utf8_str!(string).each_codepoint { |cp| return false unless include?(cp) } true end def delete_in(string) utf8_str!(string).each_codepoint.with_object('') do |cp, new_str| include?(cp) || (new_str << cp) end.encode(string.encoding) end def delete_in!(string) result = delete_in(string) result.size == string.size ? nil : string.replace(result) end def keep_in(string) utf8_str!(string).each_codepoint.with_object('') do |cp, new_str| include?(cp) && (new_str << cp) end.encode(string.encoding) end def keep_in!(string) result = keep_in(string) result.size == string.size ? 
nil : string.replace(result) end def scan(string) utf8_str!(string).each_codepoint.with_object([]) do |cp, arr| arr.push(cp.chr('utf-8')) if include?(cp) end end def used_by?(string) utf8_str!(string).each_codepoint { |cp| return true if include?(cp) } false end def section(from:, upto: 0x10FFFF) dup.keep_if { |cp| cp >= from && cp <= upto } end def count_in_section(from:, upto: 0x10FFFF) count { |cp| cp >= from && cp <= upto } end def section?(from:, upto: 0x10FFFF) any? { |cp| cp >= from && cp <= upto } end def section_ratio(from:, upto: 0x10FFFF) section(from: from, upto: upto).count / count.to_f end def planes plane_size = 0x10000.to_f inject({}) { |hash, cp| hash.merge((cp / plane_size).floor => 1) }.keys end def plane(num) validate_plane_number(num) section(from: (num * 0x10000), upto: ((num + 1) * 0x10000) - 1) end def member_in_plane?(num) validate_plane_number(num) ((num * 0x10000)...((num + 1) * 0x10000)).any? { |cp| include?(cp) } end private def validate_plane_number(num) num >= 0 && num <= 16 or raise ArgumentError, 'plane must be between 0 and 16' end def utf8_str!(obj) raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints) obj.encode('utf-8') end end end end character_set-1.8.0/lib/character_set/writer.rb0000644000004100000410000000751214620142357021606 0ustar www-datawww-dataclass CharacterSet module Writer class << self def write(codepoint_ranges, opts = {}, &block) content = codepoint_ranges.map do |range| if range.size > 2 && opts[:abbreviate] != false bounds = [range.min, range.max] bounds.map { |cp| write_codepoint(cp, opts, &block) }.join('-') else range.map { |cp| write_codepoint(cp, opts, &block) }.join end end.join opts[:in_brackets] ? 
"[#{content}]" : content end def write_codepoint(codepoint, opts = {}, &block) Character.new(codepoint).escape(opts, &block) end def write_surrogate_ranges(bmp_ranges, astral_ranges) astral_branches = surrogate_range_expressions(astral_ranges) bmp_set_with_alternatives(bmp_ranges, astral_branches) end def write_surrogate_alternation(bmp_ranges, astral_ranges) astral_branches = surrogate_pairs(astral_ranges) bmp_set_with_alternatives(bmp_ranges, astral_branches) end private def surrogate_range_expressions(astral_ranges) compressed_surrogate_range_pairs(astral_ranges).map do |hi_ranges, lo_ranges| [hi_ranges, lo_ranges].map do |ranges| use_brackets = ranges.size > 1 || ranges.first.size > 1 write(ranges, format: :js, in_brackets: use_brackets) end.join end end def compressed_surrogate_range_pairs(astral_ranges) halves = astral_ranges.flat_map { |range| surrogate_half_ranges(range) } # compress high surrogate codepoint ranges with common low range half with_common_lo = halves.group_by(&:last).map do |lo_range, pairs| hi_ranges = pairs.map(&:first) compressed_hi_ranges = hi_ranges.each_with_object([]) do |range, arr| prev = arr.last if prev.nil? 
|| prev.max + 1 < range.min # first or gap arr << range else # continuous codepoints, expand previous range arr[-1] = (prev.min)..(range.max) end end [compressed_hi_ranges, lo_range] end # compress low surrogate codepoint ranges with common high ranges with_common_lo.each_with_object({}) do |(hi_ranges, lo_range), hash| (hash[hi_ranges] ||= []) << lo_range end end def surrogate_half_ranges(astral_range) hi_min, lo_min = surrogate_pair_codepoints(astral_range.min) hi_max, lo_max = surrogate_pair_codepoints(astral_range.max) hi_count = 1 + hi_max - hi_min return [[hi_min..hi_min, lo_min..lo_max]] if hi_count == 1 ranges = [] # first high surrogate might be partially covered (if lo_min > 0xDC00) ranges << [hi_min..hi_min, lo_min..0xDFFF] # any high surrogates in between are fully covered ranges << [(hi_min + 1)..(hi_max - 1), 0xDC00..0xDFFF] if hi_count > 2 # last high surrogate might be partially covered (if lo_max < 0xDFFF) ranges << [hi_max..hi_max, 0xDC00..lo_max] ranges end def surrogate_pair_codepoints(astral_codepoint) base = astral_codepoint - 0x10000 high = base / 1024 + 0xD800 low = base % 1024 + 0xDC00 [high, low] end def bmp_set_with_alternatives(bmp_ranges, alternatives) bmp_set = write(bmp_ranges, format: :js, in_brackets: true) return bmp_set if alternatives.empty? && bmp_ranges.any? "(?:#{((bmp_ranges.any? ? 
[bmp_set] : []) + alternatives).join('|')})" end def surrogate_pairs(astral_ranges) astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } } end def surrogate_pair(astral_codepoint) surrogate_pair_codepoints(astral_codepoint) .map { |half| write_codepoint(half, format: :js) }.join end end end end character_set-1.8.0/lib/character_set/core_ext/0000755000004100000410000000000014620142357021550 5ustar www-datawww-datacharacter_set-1.8.0/lib/character_set/core_ext/string_ext.rb0000644000004100000410000000164014620142357024264 0ustar www-datawww-dataclass CharacterSet module CoreExt module StringExt def character_set CharacterSet.of_string(self) end { count_by_character_set: :count_in, covered_by_character_set?: :cover?, delete_character_set: :delete_in, delete_character_set!: :delete_in!, keep_character_set: :keep_in, keep_character_set!: :keep_in!, scan_by_character_set: :scan, uses_character_set?: :used_by?, }.each do |string_method, set_method| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{string_method}(arg) if arg.instance_of?(Symbol) CharacterSet.__send__(arg).#{set_method}(self) else arg.#{set_method}(self) end end RUBY end end end end ::String.instance_eval { include CharacterSet::CoreExt::StringExt } character_set-1.8.0/lib/character_set/core_ext/regexp_ext.rb0000644000004100000410000000061414620142357024250 0ustar www-datawww-dataclass CharacterSet module CoreExt module RegexpExt def character_set CharacterSet.of_regexp(self) end def covered_by_character_set?(other) other.superset?(character_set) end def uses_character_set?(other) other.intersect?(character_set) end end end end ::Regexp.instance_eval { include CharacterSet::CoreExt::RegexpExt } character_set-1.8.0/lib/character_set/pure.rb0000644000004100000410000000057514620142357021247 0ustar www-datawww-datarequire 'character_set' require 'character_set/ruby_fallback' # CharacterSet::Pure uses only Ruby implementations. # It is equal to CharacterSet if the C ext can't be loaded. 
class CharacterSet class Pure prepend CharacterSet::RubyFallback prepend CharacterSet::SetMethodAdapters include CharacterSet::SharedMethods extend CharacterSet::PredefinedSets end end character_set-1.8.0/lib/character_set/version.rb0000644000004100000410000000005314620142357021750 0ustar www-datawww-dataclass CharacterSet VERSION = '1.8.0' end character_set-1.8.0/lib/character_set/ruby_fallback.rb0000644000004100000410000000216714620142357023073 0ustar www-datawww-datarequire 'character_set/ruby_fallback/set_methods' require 'character_set/ruby_fallback/character_set_methods' class CharacterSet module RubyFallback include CharacterSet::RubyFallback::SetMethods include CharacterSet::RubyFallback::CharacterSetMethods def self.prepended(klass) klass.extend CharacterSet::RubyFallback::CharacterSetMethods::ClassMethods end def initialize(enum = []) @__set = CharacterSet::RubyFallback::SortedSet.new super end end end if RUBY_PLATFORM[/java/i] # JRuby has sorted_set in the stdlib. require 'set' CharacterSet::RubyFallback::Set = ::Set CharacterSet::RubyFallback::SortedSet = ::SortedSet else # For other rubies, set/sorted_set are vendored due to dependency issues: # # - issues with default vs. installed gems such as [#2] # - issues with the sorted_set dependency rb_tree # - long-standing issues in recent versions of sorted_set # # The RubyFallback, and thus these set classes, are only used for testing, # and for exotic rubies which use neither C nor Java. 
require 'character_set/ruby_fallback/vendored_set_classes' end character_set-1.8.0/lib/character_set/predefined_sets.rb0000644000004100000410000000250514620142357023432 0ustar www-datawww-dataclass CharacterSet module PredefinedSets Dir[File.join(__dir__, 'predefined_sets', '*.cps')].each do |path| set_name = File.basename(path, '.cps') class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{set_name} @#{set_name} ||= build_from_cps_file('#{path}').freeze end def non_#{set_name} @non_#{set_name} ||= build_from_cps_file('#{path}').inversion.freeze end RUBY end alias all any alias ascii_letters ascii_letter alias basic_multilingual_plane bmp alias blank whitespace alias invalid surrogate alias valid unicode def build_from_cps_file(path) if defined?(Ractor) && Ractor.current != Ractor.main raise <<-EOS.gsub(/^ */, '') CharacterSet's predefined sets are lazy-loaded. Pre-load them to use them in Ractors. E.g.: CharacterSet.ascii # pre-load Ractor.new { CharacterSet.ascii.size }.take # => 128 Ractor.new { 'abc'.keep_character_set(:ascii) }.take # => 'abc' EOS end File.readlines(path).inject(new) do |set, line| range_start, range_end = line.split(',') set.merge((range_start.to_i(16))..(range_end.to_i(16))) end end end end character_set-1.8.0/lib/character_set/set_method_adapters.rb0000644000004100000410000000236414620142357024310 0ustar www-datawww-dataclass CharacterSet module SetMethodAdapters # Allow some methods to work with String in addition to Integer args # (the internal representation is geared towards codepoint Integers). %w[add add? << delete delete? include? member? ===].each do |method| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{method}(arg) case arg when String super(arg.ord) when Integer if arg < 0 || arg > 0x10FFFF raise ArgumentError, 'pass an Integer between 0 and 0x10FFFF' end super(arg) else raise ArgumentError, 'pass a String or an Integer' end end RUBY end # Allow some methods to take an Enum just as well as another CharacterSet. 
# Tested by ruby-spec. %w[& + - ^ | <=> difference disjoint? intersect? intersection subtract union].each do |method| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{method}(arg) if arg.is_a?(CharacterSet) super(arg) elsif arg.respond_to?(:each) super(self.class.new(arg.to_a)) else raise ArgumentError, 'pass an enumerable' end end RUBY end end end character_set-1.8.0/lib/character_set/parser.rb0000644000004100000410000000310414620142357021557 0ustar www-datawww-dataclass CharacterSet module Parser module_function def codepoints_from_enumerable(object) raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each) # Use #each to check first element (only this works for all Enumerables) object.each do |el| # rubocop:disable Lint/UnreachableLoop if el.is_a?(Integer) && el >= 0 && el < 0x110000 return object elsif el.is_a?(String) && el.length == 1 return object.to_a.join.encode('utf-8').codepoints end raise ArgumentError, "#{el.inspect} is not valid as a codepoint" end end def codepoints_from_bracket_expression(string) raise ArgumentError, 'pass a String' unless string.is_a?(String) raise ArgumentError, 'advanced syntax' if string =~ /\\[^uUx]|[^\\]\[|&&/ content = strip_brackets(string) literal_content = eval_escapes(content) prev_chr = nil in_range = false literal_content.each_char.map do |chr| if chr == '-' && prev_chr && prev_chr != '\\' && prev_chr != '-' in_range = true nil else result = in_range ? 
((prev_chr.ord + 1)..(chr.ord)).to_a : chr.ord in_range = false prev_chr = chr result end end.compact.flatten end def strip_brackets(string) string[/\A\[\^?(.*)\]\z/, 1] || string.dup end def eval_escapes(string) string.gsub(/\\U(\h{8})|\\u(\h{4})|U\+(\h+)|\\x(\h{2})|\\u\{(\h+)\}/) do ($1 || $2 || $3 || $4 || $5).to_i(16).chr('utf-8') end end end end character_set-1.8.0/LICENSE.txt0000644000004100000410000000210014620142357016177 0ustar www-datawww-dataThe MIT License (MIT) Copyright (c) 2018-2023 Janosch Müller Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
character_set-1.8.0/.rspec0000644000004100000410000000006514620142357015501 0ustar www-datawww-data--format documentation --color --require spec_helper character_set-1.8.0/Rakefile0000644000004100000410000000170114620142357016027 0ustar www-datawww-datarequire 'bundler/gem_tasks' require 'rspec/core/rake_task' require 'rubygems/package_task' require 'rake/extensiontask' Dir['tasks/**/*.rake'].each { |file| load(file) } RSpec::Core::RakeTask.new(:spec) task default: :spec namespace :spec do task :quick do ENV['SKIP_MEMSAFETY_SPECS'] = '1' Rake::Task[:spec].invoke end end Rake::ExtensionTask.new('character_set') do |ext| ext.lib_dir = 'lib/character_set' end namespace :java do java_gemspec = eval File.read('./character_set.gemspec') java_gemspec.platform = 'java' java_gemspec.extensions = [] java_gemspec.add_runtime_dependency 'range_compressor', '~> 1.0' Gem::PackageTask.new(java_gemspec) do |pkg| pkg.need_zip = true pkg.need_tar = true pkg.package_dir = 'pkg' end end task package: 'java:gem' unless RUBY_PLATFORM =~ /java/ # recompile before benchmarking or running specs task(:benchmark).enhance([:compile]) task(:spec).enhance([:compile]) end character_set-1.8.0/character_set.gemspec0000644000004100000410000000133714620142357020543 0ustar www-datawww-datalib = File.expand_path('../lib', __FILE__) $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) require 'character_set/version' Gem::Specification.new do |s| s.name = 'character_set' s.version = CharacterSet::VERSION s.authors = ['Janosch Müller'] s.email = ['janosch84@gmail.com'] s.summary = 'Build, read, write and compare sets of Unicode codepoints.' 
s.homepage = 'https://github.com/jaynetics/character_set' s.license = 'MIT' s.files = `git ls-files -z`.split("\x0").reject do |f| f.match(%r{^(test|spec|features)/}) end s.require_paths = ['lib'] s.extensions = %w[ext/character_set/extconf.rb] s.required_ruby_version = '>= 2.1.0' end character_set-1.8.0/.gouteur.yml0000644000004100000410000000006614620142357016660 0ustar www-datawww-datarepos: - uri: https://github.com/jaynetics/js_regex character_set-1.8.0/Gemfile0000644000004100000410000000106414620142357015657 0ustar www-datawww-datasource "https://rubygems.org" git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } # Specify your gem's dependencies in character_set.gemspec gemspec gem 'benchmark-ips', '~> 2.7' gem 'get_process_mem', '~> 0.2.3' gem 'rake', '~> 13.1' gem 'rake-compiler', '~> 1.1' gem 'range_compressor', '~> 1.0' gem 'regexp_parser', '~> 2.9' gem 'regexp_property_values', '~> 1.5' gem 'rspec', '~> 3.8' gem 'warning', '~> 1.3' if RUBY_VERSION.to_f >= 3.0 gem 'gouteur', '~> 1.0.0' gem 'rubocop', '~> 1.59' gem 'simplecov-cobertura', require: false end character_set-1.8.0/ext/0000755000004100000410000000000014620142357015163 5ustar www-datawww-datacharacter_set-1.8.0/ext/character_set/0000755000004100000410000000000014620142357017772 5ustar www-datawww-datacharacter_set-1.8.0/ext/character_set/character_set.c0000644000004100000410000007563314620142357022763 0ustar www-datawww-data#include "ruby.h" #include "ruby/encoding.h" #include "unicode_casefold_table.h" #define UNICODE_PLANE_SIZE 0x10000 #define UNICODE_PLANE_COUNT 17 #define UNICODE_CP_COUNT (UNICODE_PLANE_SIZE * UNICODE_PLANE_COUNT) // start at ascii size #define CS_DEFAULT_INITIAL_LEN 128 typedef char cs_ar; typedef unsigned long cs_cp; struct cs_data { cs_ar *cps; cs_cp len; }; #define CS_MSIZE(len) (sizeof(cs_ar) * (len / 8)) static inline void add_memspace_for_another_plane(struct cs_data *data) { data->cps = ruby_xrealloc(data->cps, CS_MSIZE(data->len + 
UNICODE_PLANE_SIZE)); memset(data->cps + CS_MSIZE(data->len), 0, CS_MSIZE(UNICODE_PLANE_SIZE)); data->len += UNICODE_PLANE_SIZE; } static inline void ensure_memsize_fits(struct cs_data *data, cs_cp target_cp) { while (target_cp >= data->len) { add_memspace_for_another_plane(data); } } static inline void set_cp(struct cs_data *data, cs_cp cp) { ensure_memsize_fits(data, cp); data->cps[cp >> 3] |= (1 << (cp & 0x07)); } static inline int tst_cp(cs_ar *cps, cs_cp len, cs_cp cp) { return ((cp < len) && cps[cp >> 3] & (1 << (cp & 0x07))); } static inline void clr_cp(cs_ar *cps, cs_cp len, cs_cp cp) { if (cp < len) { cps[cp >> 3] &= ~(1 << (cp & 0x07)); } } static void cs_free(void *ptr) { struct cs_data *data = ptr; ruby_xfree(data->cps); ruby_xfree(data); } static size_t cs_memsize(const void *ptr) { const struct cs_data *data = ptr; return sizeof(*data) + CS_MSIZE(data->len); } static const rb_data_type_t cs_type = { .wrap_struct_name = "character_set", .function = { .dmark = NULL, .dfree = cs_free, .dsize = cs_memsize, }, .data = NULL, #ifdef RUBY_TYPED_FROZEN_SHAREABLE .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE, #else .flags = RUBY_TYPED_FREE_IMMEDIATELY, #endif }; static inline VALUE cs_alloc_len(VALUE klass, struct cs_data **data_ptr, cs_cp len) { VALUE cs; struct cs_data *data; cs = TypedData_Make_Struct(klass, struct cs_data, &cs_type, data); data->cps = ruby_xmalloc(CS_MSIZE(len)); memset(data->cps, 0, CS_MSIZE(len)); data->len = len; if (data_ptr) { *data_ptr = data; } return cs; } static inline VALUE cs_alloc(VALUE klass, struct cs_data **data_ptr) { return cs_alloc_len(klass, data_ptr, CS_DEFAULT_INITIAL_LEN); } static inline struct cs_data * cs_fetch_data(VALUE cs) { struct cs_data *data; TypedData_Get_Struct(cs, struct cs_data, &cs_type, data); return data; } static inline cs_ar * cs_fetch_cps(VALUE cs, cs_cp *len_ptr) { struct cs_data *data; data = cs_fetch_data(cs); *len_ptr = data->len; return data->cps; } static VALUE 
cs_method_allocate(VALUE self) { return cs_alloc(self, 0); } #define FOR_EACH_ACTIVE_CODEPOINT(action) \ do \ { \ cs_cp cp, len; \ cs_ar *cps; \ cps = cs_fetch_cps(self, &len); \ for (cp = 0; cp < len; cp++) \ { \ if (tst_cp(cps, len, cp)) \ { \ action; \ } \ } \ } while (0) // *************************** // `Set` compatibility methods // *************************** static inline cs_cp cs_active_cp_count(VALUE self) { cs_cp count; count = 0; FOR_EACH_ACTIVE_CODEPOINT(count++); return count; } static VALUE cs_method_length(VALUE self) { return LONG2FIX(cs_active_cp_count(self)); } static inline VALUE cs_enumerator_length(VALUE self, VALUE args, VALUE eobj) { return LONG2FIX(cs_active_cp_count(self)); } static VALUE cs_method_each(VALUE self) { RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length); FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp))); return self; } // returns an Array of codepoint Integers by default. // returns an Array of Strings of length 1 if passed `true`. static VALUE cs_method_to_a(int argc, VALUE *argv, VALUE self) { VALUE arr; rb_encoding *enc; rb_check_arity(argc, 0, 1); arr = rb_ary_new(); if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse) { FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp))); } else { enc = rb_utf8_encoding(); FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc))); } return arr; } static VALUE cs_method_empty_p(VALUE self) { FOR_EACH_ACTIVE_CODEPOINT(return Qfalse); return Qtrue; } static VALUE cs_method_hash(VALUE self) { cs_cp cp, len, hash, four_byte_value; cs_ar *cps; cps = cs_fetch_cps(self, &len); four_byte_value = 0; hash = 17; for (cp = 0; cp < len; cp++) { if (cp % 32 == 0) { if (cp != 0) { hash = hash * 23 + four_byte_value; } four_byte_value = 0; } if (tst_cp(cps, len, cp)) { four_byte_value++; } } return LONG2FIX(hash); } static inline VALUE cs_delete_if_block_result(VALUE self, int truthy) { VALUE result; rb_need_block(); rb_check_frozen(self); FOR_EACH_ACTIVE_CODEPOINT( result = 
rb_yield(LONG2FIX(cp)); if ((NIL_P(result) || result == Qfalse) != truthy) clr_cp(cps, len, cp);); return self; } static VALUE cs_method_delete_if(VALUE self) { RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length); return cs_delete_if_block_result(self, 1); } static VALUE cs_method_keep_if(VALUE self) { RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length); return cs_delete_if_block_result(self, 0); } static VALUE cs_method_clear(VALUE self) { struct cs_data *data; rb_check_frozen(self); data = cs_fetch_data(self); memset(data->cps, 0, CS_MSIZE(data->len)); return self; } static VALUE cs_method_min(VALUE self) { FOR_EACH_ACTIVE_CODEPOINT(return LONG2FIX(cp)); return Qnil; } static VALUE cs_method_max(VALUE self) { cs_cp len; long reverse_idx; cs_ar *cps; cps = cs_fetch_cps(self, &len); for (reverse_idx = len; reverse_idx >= 0; reverse_idx--) { if (tst_cp(cps, len, reverse_idx)) { return LONG2FIX(reverse_idx); } } return Qnil; } static VALUE cs_method_minmax(VALUE self) { VALUE arr; arr = rb_ary_new2(2); rb_ary_push(arr, cs_method_min(self)); rb_ary_push(arr, cs_method_max(self)); return arr; } #define RETURN_COMBINED_CS(cs_a, cs_b, comp_op) \ do \ { \ VALUE new_cs; \ cs_cp cp, alen, blen; \ cs_ar *acps, *bcps; \ struct cs_data *new_data; \ acps = cs_fetch_cps(cs_a, &alen); \ bcps = cs_fetch_cps(cs_b, &blen); \ new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \ for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \ { \ if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \ { \ set_cp(new_data, cp); \ } \ } \ return new_cs; \ } while (0) static VALUE cs_method_intersection(VALUE self, VALUE other) { RETURN_COMBINED_CS(self, other, &&); } static VALUE cs_method_exclusion(VALUE self, VALUE other) { RETURN_COMBINED_CS(self, other, ^); } static VALUE cs_method_union(VALUE self, VALUE other) { RETURN_COMBINED_CS(self, other, ||); } static VALUE cs_method_difference(VALUE self, VALUE other) { RETURN_COMBINED_CS(self, other, >); } static VALUE 
cs_method_include_p(VALUE self, VALUE num) { cs_ar *cps; cs_cp len; cps = cs_fetch_cps(self, &len); return (tst_cp(cps, len, FIX2ULONG(num)) ? Qtrue : Qfalse); } static inline VALUE cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop) { cs_cp cp, len; cs_ar *cps; struct cs_data *data; rb_check_frozen(cs); data = cs_fetch_data(cs); cps = data->cps; len = data->len; cp = FIX2ULONG(cp_num); if (return_nil_if_noop && tst_cp(cps, len, cp) == on) { return Qnil; } if (on) { set_cp(data, cp); } else { clr_cp(cps, len, cp); } return cs; } static VALUE cs_method_add(VALUE self, VALUE cp_num) { return cs_toggle_codepoint(self, cp_num, 1, 0); } static VALUE cs_method_add_p(VALUE self, VALUE cp_num) { return cs_toggle_codepoint(self, cp_num, 1, 1); } static VALUE cs_method_delete(VALUE self, VALUE cp_num) { return cs_toggle_codepoint(self, cp_num, 0, 0); } static VALUE cs_method_delete_p(VALUE self, VALUE cp_num) { return cs_toggle_codepoint(self, cp_num, 0, 1); } static VALUE cs_method_intersect_p(VALUE self, VALUE other) { cs_cp cp, alen, blen; cs_ar *acps, *bcps; acps = cs_fetch_cps(self, &alen); bcps = cs_fetch_cps(other, &blen); for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { if (tst_cp(acps, alen, cp) && tst_cp(bcps, blen, cp)) { return Qtrue; } } return Qfalse; } static VALUE cs_method_disjoint_p(VALUE self, VALUE other) { return cs_method_intersect_p(self, other) ? 
Qfalse : Qtrue; } static inline int cs_check_type(VALUE obj) { return rb_typeddata_is_kind_of(obj, &cs_type); } static VALUE cs_cps_eql(VALUE cs_a, VALUE cs_b) { cs_cp cp, alen, blen; cs_ar *acps, *bcps; acps = cs_fetch_cps(cs_a, &alen); bcps = cs_fetch_cps(cs_b, &blen); for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { if (tst_cp(acps, alen, cp) != tst_cp(bcps, blen, cp)) { return Qfalse; } } return Qtrue; } static VALUE cs_method_eql_p(VALUE self, VALUE other) { if (!cs_check_type(other)) { return Qfalse; } if (self == other) // same object_id { return Qtrue; } return cs_cps_eql(self, other); } static inline VALUE cs_merge_cs(VALUE recipient, VALUE source) { cs_cp cp, source_len; struct cs_data *data; cs_ar *source_cps; data = cs_fetch_data(recipient); source_cps = cs_fetch_cps(source, &source_len); for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { if (tst_cp(source_cps, source_len, cp)) { set_cp(data, cp); } } return recipient; } static inline cs_cp cs_checked_cp(VALUE object_id) { if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001) { return FIX2ULONG(object_id); } rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF"); } static inline VALUE cs_merge_rb_range(VALUE self, VALUE rb_range) { VALUE from_id, upto_id; cs_cp from_cp, upto_cp, cont_len, rem; int excl; struct cs_data *data; data = cs_fetch_data(self); if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl))) { rb_raise(rb_eArgError, "pass a Range"); } if (excl) { upto_id -= 2; } from_cp = cs_checked_cp(from_id); upto_cp = cs_checked_cp(upto_id); if (upto_cp > from_cp && (upto_cp - from_cp > 6)) { // set bits in preceding partially toggled bytes individually for (/* */; (from_cp <= upto_cp) && (from_cp % 8); from_cp++) { set_cp(data, from_cp); } // memset contiguous bits directly cont_len = upto_cp - from_cp + 1; rem = cont_len % 8; ensure_memsize_fits(data, upto_cp); memset(data->cps + CS_MSIZE(from_cp), 0xFF, CS_MSIZE(cont_len - rem) / 8); from_cp = upto_cp - rem + 1; } 
// set bits in partially toggled bytes individually for (/* */; from_cp <= upto_cp; from_cp++) { set_cp(data, from_cp); } return self; } static inline VALUE cs_merge_rb_array(VALUE self, VALUE rb_array) { VALUE el, array_length, i; struct cs_data *data; Check_Type(rb_array, T_ARRAY); data = cs_fetch_data(self); array_length = RARRAY_LEN(rb_array); for (i = 0; i < array_length; i++) { el = RARRAY_AREF(rb_array, i); set_cp(data, cs_checked_cp(el)); } return self; } static VALUE cs_method_merge(VALUE self, VALUE other) { rb_check_frozen(self); if (cs_check_type(other)) { return cs_merge_cs(self, other); } if (TYPE(other) == T_ARRAY) { return cs_merge_rb_array(self, other); } return cs_merge_rb_range(self, other); } static VALUE cs_method_initialize_copy(VALUE self, VALUE orig) { cs_merge_cs(self, orig); return self; } static VALUE cs_method_subtract(VALUE self, VALUE other) { cs_cp cp, len, other_len; cs_ar *cps, *other_cps; rb_check_frozen(self); cps = cs_fetch_cps(self, &len); other_cps = cs_fetch_cps(other, &other_len); for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { if (tst_cp(other_cps, other_len, cp)) { clr_cp(cps, len, cp); } } return self; } static inline int cs_a_subset_of_b(VALUE cs_a, VALUE cs_b, int *is_proper_ptr) { cs_ar *a, *b; cs_cp cp, alen, blen, count_a, count_b; if (!cs_check_type(cs_a) || !cs_check_type(cs_b)) { rb_raise(rb_eArgError, "pass a CharacterSet"); } a = cs_fetch_cps(cs_a, &alen); b = cs_fetch_cps(cs_b, &blen); count_a = 0; count_b = 0; for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { if (tst_cp(a, alen, cp)) { if (!tst_cp(b, blen, cp)) { return 0; } count_a++; count_b++; } else if (tst_cp(b, blen, cp)) { count_b++; } } if (is_proper_ptr) { *is_proper_ptr = count_b > count_a; } return 1; } static VALUE cs_method_subset_p(VALUE self, VALUE other) { return cs_a_subset_of_b(self, other, NULL) ? 
Qtrue : Qfalse; } static VALUE cs_method_proper_subset_p(VALUE self, VALUE other) { int is_subset, is_proper; is_subset = cs_a_subset_of_b(self, other, &is_proper); return (is_subset && is_proper) ? Qtrue : Qfalse; } static VALUE cs_method_superset_p(VALUE self, VALUE other) { return cs_a_subset_of_b(other, self, NULL) ? Qtrue : Qfalse; } static VALUE cs_method_proper_superset_p(VALUE self, VALUE other) { int is_superset, is_proper; is_superset = cs_a_subset_of_b(other, self, &is_proper); return (is_superset && is_proper) ? Qtrue : Qfalse; } static VALUE cs_method_spaceship_operator(VALUE self, VALUE other) { if (cs_method_eql_p(self, other)) return INT2FIX(0); if (cs_method_proper_subset_p(self, other)) return INT2FIX(-1); if (cs_method_proper_superset_p(self, other)) return INT2FIX(1); return Qnil; } // ******************************* // `CharacterSet`-specific methods // ******************************* static VALUE cs_class_method_from_ranges(VALUE self, VALUE ranges) { VALUE new_cs, range_count, i; new_cs = rb_class_new_instance(0, 0, self); range_count = RARRAY_LEN(ranges); for (i = 0; i < range_count; i++) { cs_merge_rb_range(new_cs, RARRAY_AREF(ranges, i)); } return new_cs; } static VALUE cs_method_ranges(VALUE self) { VALUE ranges, cp_num, previous_cp_num, current_start, current_end; ranges = rb_ary_new(); previous_cp_num = 0; current_start = 0; current_end = 0; FOR_EACH_ACTIVE_CODEPOINT( cp_num = LONG2FIX(cp); if (!previous_cp_num) { current_start = cp_num; } else if (previous_cp_num + 2 != cp_num) { // gap found, finalize previous range rb_ary_push(ranges, rb_range_new(current_start, current_end, 0)); current_start = cp_num; } current_end = cp_num; previous_cp_num = cp_num;); // add final range if (current_start) { rb_ary_push(ranges, rb_range_new(current_start, current_end, 0)); } return ranges; } static VALUE cs_method_sample(int argc, VALUE *argv, VALUE self) { VALUE array, to_a_args[1] = {Qtrue}; rb_check_arity(argc, 0, 1); array = cs_method_to_a(1, 
to_a_args, self); return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0); } static inline VALUE cs_from_section(VALUE set, cs_cp from, cs_cp upto) { VALUE new_cs; cs_ar *cps; cs_cp cp, len; struct cs_data *new_data; new_cs = cs_alloc(RBASIC(set)->klass, &new_data); cps = cs_fetch_cps(set, &len); for (cp = from; cp <= upto; cp++) { if (tst_cp(cps, len, cp)) { set_cp(new_data, cp); } } return new_cs; } static VALUE cs_method_ext_section(VALUE self, VALUE from, VALUE upto) { return cs_from_section(self, FIX2ULONG(from), FIX2ULONG(upto)); } static inline cs_cp cs_active_cp_count_in_section(VALUE set, cs_cp from, cs_cp upto) { cs_ar *cps; cs_cp cp, count, len; cps = cs_fetch_cps(set, &len); for (count = 0, cp = from; cp <= upto; cp++) { if (tst_cp(cps, len, cp)) { count++; } } return count; } static VALUE cs_method_ext_count_in_section(VALUE self, VALUE from, VALUE upto) { cs_cp count; count = cs_active_cp_count_in_section(self, FIX2ULONG(from), FIX2ULONG(upto)); return LONG2FIX(count); } static inline VALUE cs_has_cp_in_section(cs_ar *cps, cs_cp len, cs_cp from, cs_cp upto) { cs_cp cp; for (cp = from; cp <= upto; cp++) { if (tst_cp(cps, len, cp)) { return Qtrue; } } return Qfalse; } static VALUE cs_method_ext_section_p(VALUE self, VALUE from, VALUE upto) { cs_ar *cps; cs_cp len; cps = cs_fetch_cps(self, &len); return cs_has_cp_in_section(cps, len, FIX2ULONG(from), FIX2ULONG(upto)); } static inline VALUE cs_ratio_of_section(VALUE set, cs_cp from, cs_cp upto) { double section_count, total_count; section_count = (double)cs_active_cp_count_in_section(set, from, upto); total_count = (double)cs_active_cp_count(set); return DBL2NUM(section_count / total_count); } static VALUE cs_method_ext_section_ratio(VALUE self, VALUE from, VALUE upto) { return cs_ratio_of_section(self, FIX2ULONG(from), FIX2ULONG(upto)); } #define MAX_CP 0x10FFFF #define MAX_ASCII_CP 0x7F #define MAX_BMP_CP 0xFFFF #define MIN_ASTRAL_CP 0x10000 static inline VALUE cs_has_cp_in_plane(cs_ar 
*cps, cs_cp len, unsigned int plane) { cs_cp plane_beg, plane_end; plane_beg = plane * UNICODE_PLANE_SIZE; plane_end = (plane + 1) * MAX_BMP_CP; return cs_has_cp_in_section(cps, len, plane_beg, plane_end); } static VALUE cs_method_planes(VALUE self) { cs_ar *cps; cs_cp len; unsigned int i; VALUE planes; cps = cs_fetch_cps(self, &len); planes = rb_ary_new(); for (i = 0; i < UNICODE_PLANE_COUNT; i++) { if (cs_has_cp_in_plane(cps, len, i)) { rb_ary_push(planes, INT2FIX(i)); } } return planes; } static inline int cs_valid_plane_num(VALUE num) { int plane; Check_Type(num, T_FIXNUM); plane = FIX2INT(num); if (plane < 0 || plane >= UNICODE_PLANE_COUNT) { rb_raise(rb_eArgError, "plane must be between 0 and %d", UNICODE_PLANE_COUNT - 1); } return plane; } static VALUE cs_method_plane(VALUE self, VALUE plane_num) { cs_cp plane, plane_beg, plane_end; plane = cs_valid_plane_num(plane_num); plane_beg = plane * UNICODE_PLANE_SIZE; plane_end = (plane + 1) * MAX_BMP_CP; return cs_from_section(self, plane_beg, plane_end); } static VALUE cs_method_member_in_plane_p(VALUE self, VALUE plane_num) { cs_ar *cps; cs_cp len; unsigned int plane; plane = cs_valid_plane_num(plane_num); cps = cs_fetch_cps(self, &len); return cs_has_cp_in_plane(cps, len, plane); } #define NON_SURROGATE(cp) (cp > 0xDFFF || cp < 0xD800) static VALUE cs_method_ext_inversion(int argc, VALUE *argv, VALUE self) { int inc_surr; cs_cp upto, cp, len; cs_ar *cps; VALUE new_cs; struct cs_data *new_data; rb_check_arity(argc, 0, 2); cps = cs_fetch_cps(self, &len); inc_surr = argc && argv[0] == Qtrue; new_cs = cs_alloc(RBASIC(self)->klass, &new_data); upto = argc > 1 && FIXNUM_P(argv[1]) ? 
FIX2ULONG(argv[1]) : UNICODE_CP_COUNT; for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { if (cp <= upto && !tst_cp(cps, len, cp) && (inc_surr || NON_SURROGATE(cp))) { set_cp(new_data, cp); } } return new_cs; } typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE memo); static inline int add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo) { set_cp(data, str_cp); return 1; } static VALUE cs_method_case_insensitive(VALUE self) { cs_cp i, len; cs_ar *cps; VALUE new_cs; struct cs_data *new_data; cps = cs_fetch_cps(self, &len); new_cs = cs_alloc(RBASIC(self)->klass, &new_data); cs_merge_cs(new_cs, self); for (i = 0; i < CASEFOLD_COUNT; i++) { casefold_mapping m = unicode_casefold_table[i]; if (tst_cp(cps, len, m.from)) { set_cp(new_data, m.to); } else if (tst_cp(cps, len, m.to)) { set_cp(new_data, m.from); } } return new_cs; // OnigCaseFoldType flags; // rb_encoding *enc; // // enc = rb_utf8_encoding(); // // ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE (not public on ruby < 2.4) // flags = (1<<13) | (1<<14); // // // case_map args: flags, pp, end, to, to_end, enc // enc->case_map(flags, (const OnigUChar**)&cp, ?, ?, ?, enc); } static inline VALUE each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo) { long i, str_len; unsigned int str_cp; str_len = RSTRING_LEN(str); for (i = 0; i < str_len; i++) { str_cp = (RSTRING_PTR(str)[i] & 0xff); if (!(*func)(str_cp, cp_arr, len, data, memo)) { return Qfalse; } } return Qtrue; } static inline VALUE each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo) { int n; unsigned int str_cp; const char *ptr, *end; rb_encoding *utf8; utf8 = rb_utf8_encoding(); if (rb_enc_get(str) == utf8) { str = rb_str_new_frozen(str); } else { str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil); } ptr = RSTRING_PTR(str); end = RSTRING_END(str); while (ptr < end) { str_cp = 
rb_enc_codepoint_len(ptr, end, &n, utf8); if (!(*func)(str_cp, cp_arr, len, data, memo)) { return Qfalse; } ptr += n; } return Qtrue; } // single_byte_optimizable - copied from string.c static inline int single_byte_optimizable(VALUE str) { rb_encoding *enc; if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) { return 1; } enc = rb_enc_get(str); if (rb_enc_mbmaxlen(enc) == 1) { return 1; } return 0; } static inline VALUE each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo) { if (single_byte_optimizable(str)) { return each_sb_cp(str, func, cp_arr, len, data, memo); } return each_mb_cp(str, func, cp_arr, len, data, memo); } static inline void raise_arg_err_unless_string(VALUE val) { if (!RB_TYPE_P(val, T_STRING)) { rb_raise(rb_eArgError, "pass a String"); } } static VALUE cs_class_method_of_string(VALUE self, VALUE string) { VALUE new_cs; struct cs_data *new_data; raise_arg_err_unless_string(string); new_cs = cs_alloc(self, &new_data); each_cp(string, add_str_cp_to_arr, 0, 0, new_data, 0); return new_cs; } static inline int count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo) { if (tst_cp(cp_arr, len, str_cp)) { *((VALUE *)memo) += 1; } return 1; } static VALUE cs_method_count_in(VALUE self, VALUE str) { long count; struct cs_data *data; raise_arg_err_unless_string(str); data = cs_fetch_data(self); count = 0; each_cp(str, count_str_cp, data->cps, data->len, data, (VALUE)&count); return LONG2FIX(count); } static inline int str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo) { return tst_cp(cp_arr, len, str_cp); } static VALUE cs_method_cover_p(VALUE self, VALUE str) { struct cs_data *data; raise_arg_err_unless_string(str); data = cs_fetch_data(self); return each_cp(str, str_cp_in_arr, data->cps, data->len, data, 0); } static inline int add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo) { if 
(tst_cp(cp_arr, len, str_cp)) { rb_ary_push(memo, rb_enc_uint_chr((int)str_cp, rb_utf8_encoding())); } return 1; } static VALUE cs_method_scan(VALUE self, VALUE str) { VALUE memo; struct cs_data *data; raise_arg_err_unless_string(str); data = cs_fetch_data(self); memo = rb_ary_new(); each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo); return memo; } static inline int str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo) { return !tst_cp(cp_arr, len, str_cp); } static VALUE cs_method_used_by_p(VALUE self, VALUE str) { VALUE only_uses_other_cps; struct cs_data *data; raise_arg_err_unless_string(str); data = cs_fetch_data(self); only_uses_other_cps = each_cp(str, str_cp_not_in_arr, data->cps, data->len, data, 0); return only_uses_other_cps == Qfalse ? Qtrue : Qfalse; } // partially based on rb_str_delete_bang static inline VALUE cs_apply_to_str(VALUE set, VALUE str, int delete, int bang) { cs_ar *cps; cs_cp cs_len; VALUE orig_str_len; rb_encoding *orig_enc, *utf8; char *s, *send, *t; int orig_was_utf8, cr; raise_arg_err_unless_string(str); orig_str_len = RSTRING_LEN(str); if (orig_str_len == 0) { return bang ? 
Qnil : str; } orig_enc = rb_enc_get(str); utf8 = rb_utf8_encoding(); orig_was_utf8 = orig_enc == utf8; if (!orig_was_utf8 && orig_enc != rb_usascii_encoding()) { str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil); } else { if (!bang) { str = rb_str_dup(str); } } cps = cs_fetch_cps(set, &cs_len); rb_str_modify(str); s = t = RSTRING_PTR(str); send = RSTRING_END(str); cr = ENC_CODERANGE_7BIT; while (s < send) { unsigned int c; int clen; if ((c = *(unsigned char *)s) < 0x80) { if (tst_cp(cps, cs_len, c) != delete) { if (t != s) *t = c; t++; } s++; } else { c = rb_enc_codepoint_len(s, send, &clen, utf8); if (tst_cp(cps, cs_len, c) != delete) { if (t != s) rb_enc_mbcput(c, t, utf8); t += clen; if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID; } s += clen; } } rb_str_set_len(str, t - RSTRING_PTR(str)); ENC_CODERANGE_SET(str, cr); if (bang && (RSTRING_LEN(str) == (long)orig_str_len)) // string unchanged { return Qnil; } if (!orig_was_utf8) { return rb_str_encode(str, rb_enc_from_encoding(orig_enc), 0, Qnil); } return str; } static VALUE cs_method_delete_in(VALUE self, VALUE str) { return cs_apply_to_str(self, str, 1, 0); } static VALUE cs_method_delete_in_bang(VALUE self, VALUE str) { return cs_apply_to_str(self, str, 1, 1); } static VALUE cs_method_keep_in(VALUE self, VALUE str) { return cs_apply_to_str(self, str, 0, 0); } static VALUE cs_method_keep_in_bang(VALUE self, VALUE str) { return cs_apply_to_str(self, str, 0, 1); } static VALUE cs_method_allocated_length(VALUE self) { return LONG2FIX(cs_fetch_data(self)->len); } // **** // init // **** void Init_character_set() { #ifdef HAVE_RB_EXT_RACTOR_SAFE rb_ext_ractor_safe(true); #endif VALUE cs = rb_define_class("CharacterSet", rb_cObject); rb_define_alloc_func(cs, cs_method_allocate); // `Set` compatibility methods rb_define_method(cs, "each", cs_method_each, 0); rb_define_method(cs, "to_a", cs_method_to_a, -1); rb_define_method(cs, "length", cs_method_length, 0); rb_define_method(cs, "size", 
cs_method_length, 0); rb_define_method(cs, "empty?", cs_method_empty_p, 0); rb_define_method(cs, "hash", cs_method_hash, 0); rb_define_method(cs, "keep_if", cs_method_keep_if, 0); rb_define_method(cs, "delete_if", cs_method_delete_if, 0); rb_define_method(cs, "clear", cs_method_clear, 0); rb_define_method(cs, "min", cs_method_min, 0); rb_define_method(cs, "max", cs_method_max, 0); rb_define_method(cs, "minmax", cs_method_minmax, 0); rb_define_method(cs, "intersection", cs_method_intersection, 1); rb_define_method(cs, "&", cs_method_intersection, 1); rb_define_method(cs, "union", cs_method_union, 1); rb_define_method(cs, "+", cs_method_union, 1); rb_define_method(cs, "|", cs_method_union, 1); rb_define_method(cs, "difference", cs_method_difference, 1); rb_define_method(cs, "-", cs_method_difference, 1); rb_define_method(cs, "^", cs_method_exclusion, 1); rb_define_method(cs, "include?", cs_method_include_p, 1); rb_define_method(cs, "member?", cs_method_include_p, 1); rb_define_method(cs, "===", cs_method_include_p, 1); rb_define_method(cs, "add", cs_method_add, 1); rb_define_method(cs, "<<", cs_method_add, 1); rb_define_method(cs, "add?", cs_method_add_p, 1); rb_define_method(cs, "delete", cs_method_delete, 1); rb_define_method(cs, "delete?", cs_method_delete_p, 1); rb_define_method(cs, "intersect?", cs_method_intersect_p, 1); rb_define_method(cs, "disjoint?", cs_method_disjoint_p, 1); rb_define_method(cs, "eql?", cs_method_eql_p, 1); rb_define_method(cs, "==", cs_method_eql_p, 1); rb_define_method(cs, "merge", cs_method_merge, 1); rb_define_method(cs, "initialize_clone", cs_method_initialize_copy, 1); rb_define_method(cs, "initialize_dup", cs_method_initialize_copy, 1); rb_define_method(cs, "subtract", cs_method_subtract, 1); rb_define_method(cs, "subset?", cs_method_subset_p, 1); rb_define_method(cs, "<=", cs_method_subset_p, 1); rb_define_method(cs, "proper_subset?", cs_method_proper_subset_p, 1); rb_define_method(cs, "<", cs_method_proper_subset_p, 1); 
rb_define_method(cs, "superset?", cs_method_superset_p, 1); rb_define_method(cs, ">=", cs_method_superset_p, 1); rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1); rb_define_method(cs, ">", cs_method_proper_superset_p, 1); rb_define_method(cs, "<=>", cs_method_spaceship_operator, 1); // `CharacterSet`-specific methods rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2); rb_define_singleton_method(cs, "of_string", cs_class_method_of_string, 1); rb_define_method(cs, "ranges", cs_method_ranges, 0); rb_define_method(cs, "sample", cs_method_sample, -1); rb_define_method(cs, "ext_section", cs_method_ext_section, 2); rb_define_method(cs, "ext_count_in_section", cs_method_ext_count_in_section, 2); rb_define_method(cs, "ext_section?", cs_method_ext_section_p, 2); rb_define_method(cs, "ext_section_ratio", cs_method_ext_section_ratio, 2); rb_define_method(cs, "planes", cs_method_planes, 0); rb_define_method(cs, "plane", cs_method_plane, 1); rb_define_method(cs, "member_in_plane?", cs_method_member_in_plane_p, 1); rb_define_method(cs, "ext_inversion", cs_method_ext_inversion, -1); rb_define_method(cs, "case_insensitive", cs_method_case_insensitive, 0); rb_define_method(cs, "count_in", cs_method_count_in, 1); rb_define_method(cs, "cover?", cs_method_cover_p, 1); rb_define_method(cs, "delete_in", cs_method_delete_in, 1); rb_define_method(cs, "delete_in!", cs_method_delete_in_bang, 1); rb_define_method(cs, "keep_in", cs_method_keep_in, 1); rb_define_method(cs, "keep_in!", cs_method_keep_in_bang, 1); rb_define_method(cs, "scan", cs_method_scan, 1); rb_define_method(cs, "used_by?", cs_method_used_by_p, 1); rb_define_method(cs, "allocated_length", cs_method_allocated_length, 0); } character_set-1.8.0/ext/character_set/unicode_casefold_table.h0000644000004100000410000006072514620142357024612 0ustar www-datawww-data// THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT // -*-C-*- typedef struct casefold_mapping { unsigned 
long from; unsigned long to; } casefold_mapping; #define CASEFOLD_COUNT 1426 static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = { {0x0041,0x0061}, {0x0042,0x0062}, {0x0043,0x0063}, {0x0044,0x0064}, {0x0045,0x0065}, {0x0046,0x0066}, {0x0047,0x0067}, {0x0048,0x0068}, {0x0049,0x0069}, {0x004A,0x006A}, {0x004B,0x006B}, {0x004C,0x006C}, {0x004D,0x006D}, {0x004E,0x006E}, {0x004F,0x006F}, {0x0050,0x0070}, {0x0051,0x0071}, {0x0052,0x0072}, {0x0053,0x0073}, {0x0054,0x0074}, {0x0055,0x0075}, {0x0056,0x0076}, {0x0057,0x0077}, {0x0058,0x0078}, {0x0059,0x0079}, {0x005A,0x007A}, {0x00B5,0x03BC}, {0x00C0,0x00E0}, {0x00C1,0x00E1}, {0x00C2,0x00E2}, {0x00C3,0x00E3}, {0x00C4,0x00E4}, {0x00C5,0x00E5}, {0x00C6,0x00E6}, {0x00C7,0x00E7}, {0x00C8,0x00E8}, {0x00C9,0x00E9}, {0x00CA,0x00EA}, {0x00CB,0x00EB}, {0x00CC,0x00EC}, {0x00CD,0x00ED}, {0x00CE,0x00EE}, {0x00CF,0x00EF}, {0x00D0,0x00F0}, {0x00D1,0x00F1}, {0x00D2,0x00F2}, {0x00D3,0x00F3}, {0x00D4,0x00F4}, {0x00D5,0x00F5}, {0x00D6,0x00F6}, {0x00D8,0x00F8}, {0x00D9,0x00F9}, {0x00DA,0x00FA}, {0x00DB,0x00FB}, {0x00DC,0x00FC}, {0x00DD,0x00FD}, {0x00DE,0x00FE}, {0x0100,0x0101}, {0x0102,0x0103}, {0x0104,0x0105}, {0x0106,0x0107}, {0x0108,0x0109}, {0x010A,0x010B}, {0x010C,0x010D}, {0x010E,0x010F}, {0x0110,0x0111}, {0x0112,0x0113}, {0x0114,0x0115}, {0x0116,0x0117}, {0x0118,0x0119}, {0x011A,0x011B}, {0x011C,0x011D}, {0x011E,0x011F}, {0x0120,0x0121}, {0x0122,0x0123}, {0x0124,0x0125}, {0x0126,0x0127}, {0x0128,0x0129}, {0x012A,0x012B}, {0x012C,0x012D}, {0x012E,0x012F}, {0x0132,0x0133}, {0x0134,0x0135}, {0x0136,0x0137}, {0x0139,0x013A}, {0x013B,0x013C}, {0x013D,0x013E}, {0x013F,0x0140}, {0x0141,0x0142}, {0x0143,0x0144}, {0x0145,0x0146}, {0x0147,0x0148}, {0x014A,0x014B}, {0x014C,0x014D}, {0x014E,0x014F}, {0x0150,0x0151}, {0x0152,0x0153}, {0x0154,0x0155}, {0x0156,0x0157}, {0x0158,0x0159}, {0x015A,0x015B}, {0x015C,0x015D}, {0x015E,0x015F}, {0x0160,0x0161}, {0x0162,0x0163}, {0x0164,0x0165}, {0x0166,0x0167}, {0x0168,0x0169}, 
{0x016A,0x016B}, {0x016C,0x016D}, {0x016E,0x016F}, {0x0170,0x0171}, {0x0172,0x0173}, {0x0174,0x0175}, {0x0176,0x0177}, {0x0178,0x00FF}, {0x0179,0x017A}, {0x017B,0x017C}, {0x017D,0x017E}, {0x017F,0x0073}, {0x0181,0x0253}, {0x0182,0x0183}, {0x0184,0x0185}, {0x0186,0x0254}, {0x0187,0x0188}, {0x0189,0x0256}, {0x018A,0x0257}, {0x018B,0x018C}, {0x018E,0x01DD}, {0x018F,0x0259}, {0x0190,0x025B}, {0x0191,0x0192}, {0x0193,0x0260}, {0x0194,0x0263}, {0x0196,0x0269}, {0x0197,0x0268}, {0x0198,0x0199}, {0x019C,0x026F}, {0x019D,0x0272}, {0x019F,0x0275}, {0x01A0,0x01A1}, {0x01A2,0x01A3}, {0x01A4,0x01A5}, {0x01A6,0x0280}, {0x01A7,0x01A8}, {0x01A9,0x0283}, {0x01AC,0x01AD}, {0x01AE,0x0288}, {0x01AF,0x01B0}, {0x01B1,0x028A}, {0x01B2,0x028B}, {0x01B3,0x01B4}, {0x01B5,0x01B6}, {0x01B7,0x0292}, {0x01B8,0x01B9}, {0x01BC,0x01BD}, {0x01C4,0x01C6}, {0x01C5,0x01C6}, {0x01C7,0x01C9}, {0x01C8,0x01C9}, {0x01CA,0x01CC}, {0x01CB,0x01CC}, {0x01CD,0x01CE}, {0x01CF,0x01D0}, {0x01D1,0x01D2}, {0x01D3,0x01D4}, {0x01D5,0x01D6}, {0x01D7,0x01D8}, {0x01D9,0x01DA}, {0x01DB,0x01DC}, {0x01DE,0x01DF}, {0x01E0,0x01E1}, {0x01E2,0x01E3}, {0x01E4,0x01E5}, {0x01E6,0x01E7}, {0x01E8,0x01E9}, {0x01EA,0x01EB}, {0x01EC,0x01ED}, {0x01EE,0x01EF}, {0x01F1,0x01F3}, {0x01F2,0x01F3}, {0x01F4,0x01F5}, {0x01F6,0x0195}, {0x01F7,0x01BF}, {0x01F8,0x01F9}, {0x01FA,0x01FB}, {0x01FC,0x01FD}, {0x01FE,0x01FF}, {0x0200,0x0201}, {0x0202,0x0203}, {0x0204,0x0205}, {0x0206,0x0207}, {0x0208,0x0209}, {0x020A,0x020B}, {0x020C,0x020D}, {0x020E,0x020F}, {0x0210,0x0211}, {0x0212,0x0213}, {0x0214,0x0215}, {0x0216,0x0217}, {0x0218,0x0219}, {0x021A,0x021B}, {0x021C,0x021D}, {0x021E,0x021F}, {0x0220,0x019E}, {0x0222,0x0223}, {0x0224,0x0225}, {0x0226,0x0227}, {0x0228,0x0229}, {0x022A,0x022B}, {0x022C,0x022D}, {0x022E,0x022F}, {0x0230,0x0231}, {0x0232,0x0233}, {0x023A,0x2C65}, {0x023B,0x023C}, {0x023D,0x019A}, {0x023E,0x2C66}, {0x0241,0x0242}, {0x0243,0x0180}, {0x0244,0x0289}, {0x0245,0x028C}, {0x0246,0x0247}, {0x0248,0x0249}, {0x024A,0x024B}, 
{0x024C,0x024D}, {0x024E,0x024F}, {0x0345,0x03B9}, {0x0370,0x0371}, {0x0372,0x0373}, {0x0376,0x0377}, {0x037F,0x03F3}, {0x0386,0x03AC}, {0x0388,0x03AD}, {0x0389,0x03AE}, {0x038A,0x03AF}, {0x038C,0x03CC}, {0x038E,0x03CD}, {0x038F,0x03CE}, {0x0391,0x03B1}, {0x0392,0x03B2}, {0x0393,0x03B3}, {0x0394,0x03B4}, {0x0395,0x03B5}, {0x0396,0x03B6}, {0x0397,0x03B7}, {0x0398,0x03B8}, {0x0399,0x03B9}, {0x039A,0x03BA}, {0x039B,0x03BB}, {0x039C,0x03BC}, {0x039D,0x03BD}, {0x039E,0x03BE}, {0x039F,0x03BF}, {0x03A0,0x03C0}, {0x03A1,0x03C1}, {0x03A3,0x03C3}, {0x03A4,0x03C4}, {0x03A5,0x03C5}, {0x03A6,0x03C6}, {0x03A7,0x03C7}, {0x03A8,0x03C8}, {0x03A9,0x03C9}, {0x03AA,0x03CA}, {0x03AB,0x03CB}, {0x03C2,0x03C3}, {0x03CF,0x03D7}, {0x03D0,0x03B2}, {0x03D1,0x03B8}, {0x03D5,0x03C6}, {0x03D6,0x03C0}, {0x03D8,0x03D9}, {0x03DA,0x03DB}, {0x03DC,0x03DD}, {0x03DE,0x03DF}, {0x03E0,0x03E1}, {0x03E2,0x03E3}, {0x03E4,0x03E5}, {0x03E6,0x03E7}, {0x03E8,0x03E9}, {0x03EA,0x03EB}, {0x03EC,0x03ED}, {0x03EE,0x03EF}, {0x03F0,0x03BA}, {0x03F1,0x03C1}, {0x03F4,0x03B8}, {0x03F5,0x03B5}, {0x03F7,0x03F8}, {0x03F9,0x03F2}, {0x03FA,0x03FB}, {0x03FD,0x037B}, {0x03FE,0x037C}, {0x03FF,0x037D}, {0x0400,0x0450}, {0x0401,0x0451}, {0x0402,0x0452}, {0x0403,0x0453}, {0x0404,0x0454}, {0x0405,0x0455}, {0x0406,0x0456}, {0x0407,0x0457}, {0x0408,0x0458}, {0x0409,0x0459}, {0x040A,0x045A}, {0x040B,0x045B}, {0x040C,0x045C}, {0x040D,0x045D}, {0x040E,0x045E}, {0x040F,0x045F}, {0x0410,0x0430}, {0x0411,0x0431}, {0x0412,0x0432}, {0x0413,0x0433}, {0x0414,0x0434}, {0x0415,0x0435}, {0x0416,0x0436}, {0x0417,0x0437}, {0x0418,0x0438}, {0x0419,0x0439}, {0x041A,0x043A}, {0x041B,0x043B}, {0x041C,0x043C}, {0x041D,0x043D}, {0x041E,0x043E}, {0x041F,0x043F}, {0x0420,0x0440}, {0x0421,0x0441}, {0x0422,0x0442}, {0x0423,0x0443}, {0x0424,0x0444}, {0x0425,0x0445}, {0x0426,0x0446}, {0x0427,0x0447}, {0x0428,0x0448}, {0x0429,0x0449}, {0x042A,0x044A}, {0x042B,0x044B}, {0x042C,0x044C}, {0x042D,0x044D}, {0x042E,0x044E}, {0x042F,0x044F}, {0x0460,0x0461}, 
{0x0462,0x0463}, {0x0464,0x0465}, {0x0466,0x0467}, {0x0468,0x0469}, {0x046A,0x046B}, {0x046C,0x046D}, {0x046E,0x046F}, {0x0470,0x0471}, {0x0472,0x0473}, {0x0474,0x0475}, {0x0476,0x0477}, {0x0478,0x0479}, {0x047A,0x047B}, {0x047C,0x047D}, {0x047E,0x047F}, {0x0480,0x0481}, {0x048A,0x048B}, {0x048C,0x048D}, {0x048E,0x048F}, {0x0490,0x0491}, {0x0492,0x0493}, {0x0494,0x0495}, {0x0496,0x0497}, {0x0498,0x0499}, {0x049A,0x049B}, {0x049C,0x049D}, {0x049E,0x049F}, {0x04A0,0x04A1}, {0x04A2,0x04A3}, {0x04A4,0x04A5}, {0x04A6,0x04A7}, {0x04A8,0x04A9}, {0x04AA,0x04AB}, {0x04AC,0x04AD}, {0x04AE,0x04AF}, {0x04B0,0x04B1}, {0x04B2,0x04B3}, {0x04B4,0x04B5}, {0x04B6,0x04B7}, {0x04B8,0x04B9}, {0x04BA,0x04BB}, {0x04BC,0x04BD}, {0x04BE,0x04BF}, {0x04C0,0x04CF}, {0x04C1,0x04C2}, {0x04C3,0x04C4}, {0x04C5,0x04C6}, {0x04C7,0x04C8}, {0x04C9,0x04CA}, {0x04CB,0x04CC}, {0x04CD,0x04CE}, {0x04D0,0x04D1}, {0x04D2,0x04D3}, {0x04D4,0x04D5}, {0x04D6,0x04D7}, {0x04D8,0x04D9}, {0x04DA,0x04DB}, {0x04DC,0x04DD}, {0x04DE,0x04DF}, {0x04E0,0x04E1}, {0x04E2,0x04E3}, {0x04E4,0x04E5}, {0x04E6,0x04E7}, {0x04E8,0x04E9}, {0x04EA,0x04EB}, {0x04EC,0x04ED}, {0x04EE,0x04EF}, {0x04F0,0x04F1}, {0x04F2,0x04F3}, {0x04F4,0x04F5}, {0x04F6,0x04F7}, {0x04F8,0x04F9}, {0x04FA,0x04FB}, {0x04FC,0x04FD}, {0x04FE,0x04FF}, {0x0500,0x0501}, {0x0502,0x0503}, {0x0504,0x0505}, {0x0506,0x0507}, {0x0508,0x0509}, {0x050A,0x050B}, {0x050C,0x050D}, {0x050E,0x050F}, {0x0510,0x0511}, {0x0512,0x0513}, {0x0514,0x0515}, {0x0516,0x0517}, {0x0518,0x0519}, {0x051A,0x051B}, {0x051C,0x051D}, {0x051E,0x051F}, {0x0520,0x0521}, {0x0522,0x0523}, {0x0524,0x0525}, {0x0526,0x0527}, {0x0528,0x0529}, {0x052A,0x052B}, {0x052C,0x052D}, {0x052E,0x052F}, {0x0531,0x0561}, {0x0532,0x0562}, {0x0533,0x0563}, {0x0534,0x0564}, {0x0535,0x0565}, {0x0536,0x0566}, {0x0537,0x0567}, {0x0538,0x0568}, {0x0539,0x0569}, {0x053A,0x056A}, {0x053B,0x056B}, {0x053C,0x056C}, {0x053D,0x056D}, {0x053E,0x056E}, {0x053F,0x056F}, {0x0540,0x0570}, {0x0541,0x0571}, {0x0542,0x0572}, 
{0x0543,0x0573}, {0x0544,0x0574}, {0x0545,0x0575}, {0x0546,0x0576}, {0x0547,0x0577}, {0x0548,0x0578}, {0x0549,0x0579}, {0x054A,0x057A}, {0x054B,0x057B}, {0x054C,0x057C}, {0x054D,0x057D}, {0x054E,0x057E}, {0x054F,0x057F}, {0x0550,0x0580}, {0x0551,0x0581}, {0x0552,0x0582}, {0x0553,0x0583}, {0x0554,0x0584}, {0x0555,0x0585}, {0x0556,0x0586}, {0x10400,0x10428}, {0x10401,0x10429}, {0x10402,0x1042A}, {0x10403,0x1042B}, {0x10404,0x1042C}, {0x10405,0x1042D}, {0x10406,0x1042E}, {0x10407,0x1042F}, {0x10408,0x10430}, {0x10409,0x10431}, {0x1040A,0x10432}, {0x1040B,0x10433}, {0x1040C,0x10434}, {0x1040D,0x10435}, {0x1040E,0x10436}, {0x1040F,0x10437}, {0x10410,0x10438}, {0x10411,0x10439}, {0x10412,0x1043A}, {0x10413,0x1043B}, {0x10414,0x1043C}, {0x10415,0x1043D}, {0x10416,0x1043E}, {0x10417,0x1043F}, {0x10418,0x10440}, {0x10419,0x10441}, {0x1041A,0x10442}, {0x1041B,0x10443}, {0x1041C,0x10444}, {0x1041D,0x10445}, {0x1041E,0x10446}, {0x1041F,0x10447}, {0x10420,0x10448}, {0x10421,0x10449}, {0x10422,0x1044A}, {0x10423,0x1044B}, {0x10424,0x1044C}, {0x10425,0x1044D}, {0x10426,0x1044E}, {0x10427,0x1044F}, {0x104B0,0x104D8}, {0x104B1,0x104D9}, {0x104B2,0x104DA}, {0x104B3,0x104DB}, {0x104B4,0x104DC}, {0x104B5,0x104DD}, {0x104B6,0x104DE}, {0x104B7,0x104DF}, {0x104B8,0x104E0}, {0x104B9,0x104E1}, {0x104BA,0x104E2}, {0x104BB,0x104E3}, {0x104BC,0x104E4}, {0x104BD,0x104E5}, {0x104BE,0x104E6}, {0x104BF,0x104E7}, {0x104C0,0x104E8}, {0x104C1,0x104E9}, {0x104C2,0x104EA}, {0x104C3,0x104EB}, {0x104C4,0x104EC}, {0x104C5,0x104ED}, {0x104C6,0x104EE}, {0x104C7,0x104EF}, {0x104C8,0x104F0}, {0x104C9,0x104F1}, {0x104CA,0x104F2}, {0x104CB,0x104F3}, {0x104CC,0x104F4}, {0x104CD,0x104F5}, {0x104CE,0x104F6}, {0x104CF,0x104F7}, {0x104D0,0x104F8}, {0x104D1,0x104F9}, {0x104D2,0x104FA}, {0x104D3,0x104FB}, {0x10570,0x10597}, {0x10571,0x10598}, {0x10572,0x10599}, {0x10573,0x1059A}, {0x10574,0x1059B}, {0x10575,0x1059C}, {0x10576,0x1059D}, {0x10577,0x1059E}, {0x10578,0x1059F}, {0x10579,0x105A0}, {0x1057A,0x105A1}, 
{0x1057C,0x105A3}, {0x1057D,0x105A4}, {0x1057E,0x105A5}, {0x1057F,0x105A6}, {0x10580,0x105A7}, {0x10581,0x105A8}, {0x10582,0x105A9}, {0x10583,0x105AA}, {0x10584,0x105AB}, {0x10585,0x105AC}, {0x10586,0x105AD}, {0x10587,0x105AE}, {0x10588,0x105AF}, {0x10589,0x105B0}, {0x1058A,0x105B1}, {0x1058C,0x105B3}, {0x1058D,0x105B4}, {0x1058E,0x105B5}, {0x1058F,0x105B6}, {0x10590,0x105B7}, {0x10591,0x105B8}, {0x10592,0x105B9}, {0x10594,0x105BB}, {0x10595,0x105BC}, {0x10A0,0x2D00}, {0x10A1,0x2D01}, {0x10A2,0x2D02}, {0x10A3,0x2D03}, {0x10A4,0x2D04}, {0x10A5,0x2D05}, {0x10A6,0x2D06}, {0x10A7,0x2D07}, {0x10A8,0x2D08}, {0x10A9,0x2D09}, {0x10AA,0x2D0A}, {0x10AB,0x2D0B}, {0x10AC,0x2D0C}, {0x10AD,0x2D0D}, {0x10AE,0x2D0E}, {0x10AF,0x2D0F}, {0x10B0,0x2D10}, {0x10B1,0x2D11}, {0x10B2,0x2D12}, {0x10B3,0x2D13}, {0x10B4,0x2D14}, {0x10B5,0x2D15}, {0x10B6,0x2D16}, {0x10B7,0x2D17}, {0x10B8,0x2D18}, {0x10B9,0x2D19}, {0x10BA,0x2D1A}, {0x10BB,0x2D1B}, {0x10BC,0x2D1C}, {0x10BD,0x2D1D}, {0x10BE,0x2D1E}, {0x10BF,0x2D1F}, {0x10C0,0x2D20}, {0x10C1,0x2D21}, {0x10C2,0x2D22}, {0x10C3,0x2D23}, {0x10C4,0x2D24}, {0x10C5,0x2D25}, {0x10C7,0x2D27}, {0x10C80,0x10CC0}, {0x10C81,0x10CC1}, {0x10C82,0x10CC2}, {0x10C83,0x10CC3}, {0x10C84,0x10CC4}, {0x10C85,0x10CC5}, {0x10C86,0x10CC6}, {0x10C87,0x10CC7}, {0x10C88,0x10CC8}, {0x10C89,0x10CC9}, {0x10C8A,0x10CCA}, {0x10C8B,0x10CCB}, {0x10C8C,0x10CCC}, {0x10C8D,0x10CCD}, {0x10C8E,0x10CCE}, {0x10C8F,0x10CCF}, {0x10C90,0x10CD0}, {0x10C91,0x10CD1}, {0x10C92,0x10CD2}, {0x10C93,0x10CD3}, {0x10C94,0x10CD4}, {0x10C95,0x10CD5}, {0x10C96,0x10CD6}, {0x10C97,0x10CD7}, {0x10C98,0x10CD8}, {0x10C99,0x10CD9}, {0x10C9A,0x10CDA}, {0x10C9B,0x10CDB}, {0x10C9C,0x10CDC}, {0x10C9D,0x10CDD}, {0x10C9E,0x10CDE}, {0x10C9F,0x10CDF}, {0x10CA0,0x10CE0}, {0x10CA1,0x10CE1}, {0x10CA2,0x10CE2}, {0x10CA3,0x10CE3}, {0x10CA4,0x10CE4}, {0x10CA5,0x10CE5}, {0x10CA6,0x10CE6}, {0x10CA7,0x10CE7}, {0x10CA8,0x10CE8}, {0x10CA9,0x10CE9}, {0x10CAA,0x10CEA}, {0x10CAB,0x10CEB}, {0x10CAC,0x10CEC}, {0x10CAD,0x10CED}, 
{0x10CAE,0x10CEE}, {0x10CAF,0x10CEF}, {0x10CB0,0x10CF0}, {0x10CB1,0x10CF1}, {0x10CB2,0x10CF2}, {0x10CD,0x2D2D}, {0x118A0,0x118C0}, {0x118A1,0x118C1}, {0x118A2,0x118C2}, {0x118A3,0x118C3}, {0x118A4,0x118C4}, {0x118A5,0x118C5}, {0x118A6,0x118C6}, {0x118A7,0x118C7}, {0x118A8,0x118C8}, {0x118A9,0x118C9}, {0x118AA,0x118CA}, {0x118AB,0x118CB}, {0x118AC,0x118CC}, {0x118AD,0x118CD}, {0x118AE,0x118CE}, {0x118AF,0x118CF}, {0x118B0,0x118D0}, {0x118B1,0x118D1}, {0x118B2,0x118D2}, {0x118B3,0x118D3}, {0x118B4,0x118D4}, {0x118B5,0x118D5}, {0x118B6,0x118D6}, {0x118B7,0x118D7}, {0x118B8,0x118D8}, {0x118B9,0x118D9}, {0x118BA,0x118DA}, {0x118BB,0x118DB}, {0x118BC,0x118DC}, {0x118BD,0x118DD}, {0x118BE,0x118DE}, {0x118BF,0x118DF}, {0x13F8,0x13F0}, {0x13F9,0x13F1}, {0x13FA,0x13F2}, {0x13FB,0x13F3}, {0x13FC,0x13F4}, {0x13FD,0x13F5}, {0x16E40,0x16E60}, {0x16E41,0x16E61}, {0x16E42,0x16E62}, {0x16E43,0x16E63}, {0x16E44,0x16E64}, {0x16E45,0x16E65}, {0x16E46,0x16E66}, {0x16E47,0x16E67}, {0x16E48,0x16E68}, {0x16E49,0x16E69}, {0x16E4A,0x16E6A}, {0x16E4B,0x16E6B}, {0x16E4C,0x16E6C}, {0x16E4D,0x16E6D}, {0x16E4E,0x16E6E}, {0x16E4F,0x16E6F}, {0x16E50,0x16E70}, {0x16E51,0x16E71}, {0x16E52,0x16E72}, {0x16E53,0x16E73}, {0x16E54,0x16E74}, {0x16E55,0x16E75}, {0x16E56,0x16E76}, {0x16E57,0x16E77}, {0x16E58,0x16E78}, {0x16E59,0x16E79}, {0x16E5A,0x16E7A}, {0x16E5B,0x16E7B}, {0x16E5C,0x16E7C}, {0x16E5D,0x16E7D}, {0x16E5E,0x16E7E}, {0x16E5F,0x16E7F}, {0x1C80,0x0432}, {0x1C81,0x0434}, {0x1C82,0x043E}, {0x1C83,0x0441}, {0x1C84,0x0442}, {0x1C85,0x0442}, {0x1C86,0x044A}, {0x1C87,0x0463}, {0x1C88,0xA64B}, {0x1C90,0x10D0}, {0x1C91,0x10D1}, {0x1C92,0x10D2}, {0x1C93,0x10D3}, {0x1C94,0x10D4}, {0x1C95,0x10D5}, {0x1C96,0x10D6}, {0x1C97,0x10D7}, {0x1C98,0x10D8}, {0x1C99,0x10D9}, {0x1C9A,0x10DA}, {0x1C9B,0x10DB}, {0x1C9C,0x10DC}, {0x1C9D,0x10DD}, {0x1C9E,0x10DE}, {0x1C9F,0x10DF}, {0x1CA0,0x10E0}, {0x1CA1,0x10E1}, {0x1CA2,0x10E2}, {0x1CA3,0x10E3}, {0x1CA4,0x10E4}, {0x1CA5,0x10E5}, {0x1CA6,0x10E6}, {0x1CA7,0x10E7}, 
{0x1CA8,0x10E8}, {0x1CA9,0x10E9}, {0x1CAA,0x10EA}, {0x1CAB,0x10EB}, {0x1CAC,0x10EC}, {0x1CAD,0x10ED}, {0x1CAE,0x10EE}, {0x1CAF,0x10EF}, {0x1CB0,0x10F0}, {0x1CB1,0x10F1}, {0x1CB2,0x10F2}, {0x1CB3,0x10F3}, {0x1CB4,0x10F4}, {0x1CB5,0x10F5}, {0x1CB6,0x10F6}, {0x1CB7,0x10F7}, {0x1CB8,0x10F8}, {0x1CB9,0x10F9}, {0x1CBA,0x10FA}, {0x1CBD,0x10FD}, {0x1CBE,0x10FE}, {0x1CBF,0x10FF}, {0x1E00,0x1E01}, {0x1E02,0x1E03}, {0x1E04,0x1E05}, {0x1E06,0x1E07}, {0x1E08,0x1E09}, {0x1E0A,0x1E0B}, {0x1E0C,0x1E0D}, {0x1E0E,0x1E0F}, {0x1E10,0x1E11}, {0x1E12,0x1E13}, {0x1E14,0x1E15}, {0x1E16,0x1E17}, {0x1E18,0x1E19}, {0x1E1A,0x1E1B}, {0x1E1C,0x1E1D}, {0x1E1E,0x1E1F}, {0x1E20,0x1E21}, {0x1E22,0x1E23}, {0x1E24,0x1E25}, {0x1E26,0x1E27}, {0x1E28,0x1E29}, {0x1E2A,0x1E2B}, {0x1E2C,0x1E2D}, {0x1E2E,0x1E2F}, {0x1E30,0x1E31}, {0x1E32,0x1E33}, {0x1E34,0x1E35}, {0x1E36,0x1E37}, {0x1E38,0x1E39}, {0x1E3A,0x1E3B}, {0x1E3C,0x1E3D}, {0x1E3E,0x1E3F}, {0x1E40,0x1E41}, {0x1E42,0x1E43}, {0x1E44,0x1E45}, {0x1E46,0x1E47}, {0x1E48,0x1E49}, {0x1E4A,0x1E4B}, {0x1E4C,0x1E4D}, {0x1E4E,0x1E4F}, {0x1E50,0x1E51}, {0x1E52,0x1E53}, {0x1E54,0x1E55}, {0x1E56,0x1E57}, {0x1E58,0x1E59}, {0x1E5A,0x1E5B}, {0x1E5C,0x1E5D}, {0x1E5E,0x1E5F}, {0x1E60,0x1E61}, {0x1E62,0x1E63}, {0x1E64,0x1E65}, {0x1E66,0x1E67}, {0x1E68,0x1E69}, {0x1E6A,0x1E6B}, {0x1E6C,0x1E6D}, {0x1E6E,0x1E6F}, {0x1E70,0x1E71}, {0x1E72,0x1E73}, {0x1E74,0x1E75}, {0x1E76,0x1E77}, {0x1E78,0x1E79}, {0x1E7A,0x1E7B}, {0x1E7C,0x1E7D}, {0x1E7E,0x1E7F}, {0x1E80,0x1E81}, {0x1E82,0x1E83}, {0x1E84,0x1E85}, {0x1E86,0x1E87}, {0x1E88,0x1E89}, {0x1E8A,0x1E8B}, {0x1E8C,0x1E8D}, {0x1E8E,0x1E8F}, {0x1E90,0x1E91}, {0x1E900,0x1E922}, {0x1E901,0x1E923}, {0x1E902,0x1E924}, {0x1E903,0x1E925}, {0x1E904,0x1E926}, {0x1E905,0x1E927}, {0x1E906,0x1E928}, {0x1E907,0x1E929}, {0x1E908,0x1E92A}, {0x1E909,0x1E92B}, {0x1E90A,0x1E92C}, {0x1E90B,0x1E92D}, {0x1E90C,0x1E92E}, {0x1E90D,0x1E92F}, {0x1E90E,0x1E930}, {0x1E90F,0x1E931}, {0x1E910,0x1E932}, {0x1E911,0x1E933}, {0x1E912,0x1E934}, {0x1E913,0x1E935}, 
{0x1E914,0x1E936}, {0x1E915,0x1E937}, {0x1E916,0x1E938}, {0x1E917,0x1E939}, {0x1E918,0x1E93A}, {0x1E919,0x1E93B}, {0x1E91A,0x1E93C}, {0x1E91B,0x1E93D}, {0x1E91C,0x1E93E}, {0x1E91D,0x1E93F}, {0x1E91E,0x1E940}, {0x1E91F,0x1E941}, {0x1E92,0x1E93}, {0x1E920,0x1E942}, {0x1E921,0x1E943}, {0x1E94,0x1E95}, {0x1E9B,0x1E61}, {0x1EA0,0x1EA1}, {0x1EA2,0x1EA3}, {0x1EA4,0x1EA5}, {0x1EA6,0x1EA7}, {0x1EA8,0x1EA9}, {0x1EAA,0x1EAB}, {0x1EAC,0x1EAD}, {0x1EAE,0x1EAF}, {0x1EB0,0x1EB1}, {0x1EB2,0x1EB3}, {0x1EB4,0x1EB5}, {0x1EB6,0x1EB7}, {0x1EB8,0x1EB9}, {0x1EBA,0x1EBB}, {0x1EBC,0x1EBD}, {0x1EBE,0x1EBF}, {0x1EC0,0x1EC1}, {0x1EC2,0x1EC3}, {0x1EC4,0x1EC5}, {0x1EC6,0x1EC7}, {0x1EC8,0x1EC9}, {0x1ECA,0x1ECB}, {0x1ECC,0x1ECD}, {0x1ECE,0x1ECF}, {0x1ED0,0x1ED1}, {0x1ED2,0x1ED3}, {0x1ED4,0x1ED5}, {0x1ED6,0x1ED7}, {0x1ED8,0x1ED9}, {0x1EDA,0x1EDB}, {0x1EDC,0x1EDD}, {0x1EDE,0x1EDF}, {0x1EE0,0x1EE1}, {0x1EE2,0x1EE3}, {0x1EE4,0x1EE5}, {0x1EE6,0x1EE7}, {0x1EE8,0x1EE9}, {0x1EEA,0x1EEB}, {0x1EEC,0x1EED}, {0x1EEE,0x1EEF}, {0x1EF0,0x1EF1}, {0x1EF2,0x1EF3}, {0x1EF4,0x1EF5}, {0x1EF6,0x1EF7}, {0x1EF8,0x1EF9}, {0x1EFA,0x1EFB}, {0x1EFC,0x1EFD}, {0x1EFE,0x1EFF}, {0x1F08,0x1F00}, {0x1F09,0x1F01}, {0x1F0A,0x1F02}, {0x1F0B,0x1F03}, {0x1F0C,0x1F04}, {0x1F0D,0x1F05}, {0x1F0E,0x1F06}, {0x1F0F,0x1F07}, {0x1F18,0x1F10}, {0x1F19,0x1F11}, {0x1F1A,0x1F12}, {0x1F1B,0x1F13}, {0x1F1C,0x1F14}, {0x1F1D,0x1F15}, {0x1F28,0x1F20}, {0x1F29,0x1F21}, {0x1F2A,0x1F22}, {0x1F2B,0x1F23}, {0x1F2C,0x1F24}, {0x1F2D,0x1F25}, {0x1F2E,0x1F26}, {0x1F2F,0x1F27}, {0x1F38,0x1F30}, {0x1F39,0x1F31}, {0x1F3A,0x1F32}, {0x1F3B,0x1F33}, {0x1F3C,0x1F34}, {0x1F3D,0x1F35}, {0x1F3E,0x1F36}, {0x1F3F,0x1F37}, {0x1F48,0x1F40}, {0x1F49,0x1F41}, {0x1F4A,0x1F42}, {0x1F4B,0x1F43}, {0x1F4C,0x1F44}, {0x1F4D,0x1F45}, {0x1F59,0x1F51}, {0x1F5B,0x1F53}, {0x1F5D,0x1F55}, {0x1F5F,0x1F57}, {0x1F68,0x1F60}, {0x1F69,0x1F61}, {0x1F6A,0x1F62}, {0x1F6B,0x1F63}, {0x1F6C,0x1F64}, {0x1F6D,0x1F65}, {0x1F6E,0x1F66}, {0x1F6F,0x1F67}, {0x1FB8,0x1FB0}, {0x1FB9,0x1FB1}, {0x1FBA,0x1F70}, 
{0x1FBB,0x1F71}, {0x1FBE,0x03B9}, {0x1FC8,0x1F72}, {0x1FC9,0x1F73}, {0x1FCA,0x1F74}, {0x1FCB,0x1F75}, {0x1FD8,0x1FD0}, {0x1FD9,0x1FD1}, {0x1FDA,0x1F76}, {0x1FDB,0x1F77}, {0x1FE8,0x1FE0}, {0x1FE9,0x1FE1}, {0x1FEA,0x1F7A}, {0x1FEB,0x1F7B}, {0x1FEC,0x1FE5}, {0x1FF8,0x1F78}, {0x1FF9,0x1F79}, {0x1FFA,0x1F7C}, {0x1FFB,0x1F7D}, {0x2126,0x03C9}, {0x212A,0x006B}, {0x212B,0x00E5}, {0x2132,0x214E}, {0x2160,0x2170}, {0x2161,0x2171}, {0x2162,0x2172}, {0x2163,0x2173}, {0x2164,0x2174}, {0x2165,0x2175}, {0x2166,0x2176}, {0x2167,0x2177}, {0x2168,0x2178}, {0x2169,0x2179}, {0x216A,0x217A}, {0x216B,0x217B}, {0x216C,0x217C}, {0x216D,0x217D}, {0x216E,0x217E}, {0x216F,0x217F}, {0x2183,0x2184}, {0x24B6,0x24D0}, {0x24B7,0x24D1}, {0x24B8,0x24D2}, {0x24B9,0x24D3}, {0x24BA,0x24D4}, {0x24BB,0x24D5}, {0x24BC,0x24D6}, {0x24BD,0x24D7}, {0x24BE,0x24D8}, {0x24BF,0x24D9}, {0x24C0,0x24DA}, {0x24C1,0x24DB}, {0x24C2,0x24DC}, {0x24C3,0x24DD}, {0x24C4,0x24DE}, {0x24C5,0x24DF}, {0x24C6,0x24E0}, {0x24C7,0x24E1}, {0x24C8,0x24E2}, {0x24C9,0x24E3}, {0x24CA,0x24E4}, {0x24CB,0x24E5}, {0x24CC,0x24E6}, {0x24CD,0x24E7}, {0x24CE,0x24E8}, {0x24CF,0x24E9}, {0x2C00,0x2C30}, {0x2C01,0x2C31}, {0x2C02,0x2C32}, {0x2C03,0x2C33}, {0x2C04,0x2C34}, {0x2C05,0x2C35}, {0x2C06,0x2C36}, {0x2C07,0x2C37}, {0x2C08,0x2C38}, {0x2C09,0x2C39}, {0x2C0A,0x2C3A}, {0x2C0B,0x2C3B}, {0x2C0C,0x2C3C}, {0x2C0D,0x2C3D}, {0x2C0E,0x2C3E}, {0x2C0F,0x2C3F}, {0x2C10,0x2C40}, {0x2C11,0x2C41}, {0x2C12,0x2C42}, {0x2C13,0x2C43}, {0x2C14,0x2C44}, {0x2C15,0x2C45}, {0x2C16,0x2C46}, {0x2C17,0x2C47}, {0x2C18,0x2C48}, {0x2C19,0x2C49}, {0x2C1A,0x2C4A}, {0x2C1B,0x2C4B}, {0x2C1C,0x2C4C}, {0x2C1D,0x2C4D}, {0x2C1E,0x2C4E}, {0x2C1F,0x2C4F}, {0x2C20,0x2C50}, {0x2C21,0x2C51}, {0x2C22,0x2C52}, {0x2C23,0x2C53}, {0x2C24,0x2C54}, {0x2C25,0x2C55}, {0x2C26,0x2C56}, {0x2C27,0x2C57}, {0x2C28,0x2C58}, {0x2C29,0x2C59}, {0x2C2A,0x2C5A}, {0x2C2B,0x2C5B}, {0x2C2C,0x2C5C}, {0x2C2D,0x2C5D}, {0x2C2E,0x2C5E}, {0x2C2F,0x2C5F}, {0x2C60,0x2C61}, {0x2C62,0x026B}, {0x2C63,0x1D7D}, 
{0x2C64,0x027D}, {0x2C67,0x2C68}, {0x2C69,0x2C6A}, {0x2C6B,0x2C6C}, {0x2C6D,0x0251}, {0x2C6E,0x0271}, {0x2C6F,0x0250}, {0x2C70,0x0252}, {0x2C72,0x2C73}, {0x2C75,0x2C76}, {0x2C7E,0x023F}, {0x2C7F,0x0240}, {0x2C80,0x2C81}, {0x2C82,0x2C83}, {0x2C84,0x2C85}, {0x2C86,0x2C87}, {0x2C88,0x2C89}, {0x2C8A,0x2C8B}, {0x2C8C,0x2C8D}, {0x2C8E,0x2C8F}, {0x2C90,0x2C91}, {0x2C92,0x2C93}, {0x2C94,0x2C95}, {0x2C96,0x2C97}, {0x2C98,0x2C99}, {0x2C9A,0x2C9B}, {0x2C9C,0x2C9D}, {0x2C9E,0x2C9F}, {0x2CA0,0x2CA1}, {0x2CA2,0x2CA3}, {0x2CA4,0x2CA5}, {0x2CA6,0x2CA7}, {0x2CA8,0x2CA9}, {0x2CAA,0x2CAB}, {0x2CAC,0x2CAD}, {0x2CAE,0x2CAF}, {0x2CB0,0x2CB1}, {0x2CB2,0x2CB3}, {0x2CB4,0x2CB5}, {0x2CB6,0x2CB7}, {0x2CB8,0x2CB9}, {0x2CBA,0x2CBB}, {0x2CBC,0x2CBD}, {0x2CBE,0x2CBF}, {0x2CC0,0x2CC1}, {0x2CC2,0x2CC3}, {0x2CC4,0x2CC5}, {0x2CC6,0x2CC7}, {0x2CC8,0x2CC9}, {0x2CCA,0x2CCB}, {0x2CCC,0x2CCD}, {0x2CCE,0x2CCF}, {0x2CD0,0x2CD1}, {0x2CD2,0x2CD3}, {0x2CD4,0x2CD5}, {0x2CD6,0x2CD7}, {0x2CD8,0x2CD9}, {0x2CDA,0x2CDB}, {0x2CDC,0x2CDD}, {0x2CDE,0x2CDF}, {0x2CE0,0x2CE1}, {0x2CE2,0x2CE3}, {0x2CEB,0x2CEC}, {0x2CED,0x2CEE}, {0x2CF2,0x2CF3}, {0xA640,0xA641}, {0xA642,0xA643}, {0xA644,0xA645}, {0xA646,0xA647}, {0xA648,0xA649}, {0xA64A,0xA64B}, {0xA64C,0xA64D}, {0xA64E,0xA64F}, {0xA650,0xA651}, {0xA652,0xA653}, {0xA654,0xA655}, {0xA656,0xA657}, {0xA658,0xA659}, {0xA65A,0xA65B}, {0xA65C,0xA65D}, {0xA65E,0xA65F}, {0xA660,0xA661}, {0xA662,0xA663}, {0xA664,0xA665}, {0xA666,0xA667}, {0xA668,0xA669}, {0xA66A,0xA66B}, {0xA66C,0xA66D}, {0xA680,0xA681}, {0xA682,0xA683}, {0xA684,0xA685}, {0xA686,0xA687}, {0xA688,0xA689}, {0xA68A,0xA68B}, {0xA68C,0xA68D}, {0xA68E,0xA68F}, {0xA690,0xA691}, {0xA692,0xA693}, {0xA694,0xA695}, {0xA696,0xA697}, {0xA698,0xA699}, {0xA69A,0xA69B}, {0xA722,0xA723}, {0xA724,0xA725}, {0xA726,0xA727}, {0xA728,0xA729}, {0xA72A,0xA72B}, {0xA72C,0xA72D}, {0xA72E,0xA72F}, {0xA732,0xA733}, {0xA734,0xA735}, {0xA736,0xA737}, {0xA738,0xA739}, {0xA73A,0xA73B}, {0xA73C,0xA73D}, {0xA73E,0xA73F}, {0xA740,0xA741}, 
{0xA742,0xA743}, {0xA744,0xA745}, {0xA746,0xA747}, {0xA748,0xA749}, {0xA74A,0xA74B}, {0xA74C,0xA74D}, {0xA74E,0xA74F}, {0xA750,0xA751}, {0xA752,0xA753}, {0xA754,0xA755}, {0xA756,0xA757}, {0xA758,0xA759}, {0xA75A,0xA75B}, {0xA75C,0xA75D}, {0xA75E,0xA75F}, {0xA760,0xA761}, {0xA762,0xA763}, {0xA764,0xA765}, {0xA766,0xA767}, {0xA768,0xA769}, {0xA76A,0xA76B}, {0xA76C,0xA76D}, {0xA76E,0xA76F}, {0xA779,0xA77A}, {0xA77B,0xA77C}, {0xA77D,0x1D79}, {0xA77E,0xA77F}, {0xA780,0xA781}, {0xA782,0xA783}, {0xA784,0xA785}, {0xA786,0xA787}, {0xA78B,0xA78C}, {0xA78D,0x0265}, {0xA790,0xA791}, {0xA792,0xA793}, {0xA796,0xA797}, {0xA798,0xA799}, {0xA79A,0xA79B}, {0xA79C,0xA79D}, {0xA79E,0xA79F}, {0xA7A0,0xA7A1}, {0xA7A2,0xA7A3}, {0xA7A4,0xA7A5}, {0xA7A6,0xA7A7}, {0xA7A8,0xA7A9}, {0xA7AA,0x0266}, {0xA7AB,0x025C}, {0xA7AC,0x0261}, {0xA7AD,0x026C}, {0xA7AE,0x026A}, {0xA7B0,0x029E}, {0xA7B1,0x0287}, {0xA7B2,0x029D}, {0xA7B3,0xAB53}, {0xA7B4,0xA7B5}, {0xA7B6,0xA7B7}, {0xA7B8,0xA7B9}, {0xA7BA,0xA7BB}, {0xA7BC,0xA7BD}, {0xA7BE,0xA7BF}, {0xA7C0,0xA7C1}, {0xA7C2,0xA7C3}, {0xA7C4,0xA794}, {0xA7C5,0x0282}, {0xA7C6,0x1D8E}, {0xA7C7,0xA7C8}, {0xA7C9,0xA7CA}, {0xA7D0,0xA7D1}, {0xA7D6,0xA7D7}, {0xA7D8,0xA7D9}, {0xA7F5,0xA7F6}, {0xAB70,0x13A0}, {0xAB71,0x13A1}, {0xAB72,0x13A2}, {0xAB73,0x13A3}, {0xAB74,0x13A4}, {0xAB75,0x13A5}, {0xAB76,0x13A6}, {0xAB77,0x13A7}, {0xAB78,0x13A8}, {0xAB79,0x13A9}, {0xAB7A,0x13AA}, {0xAB7B,0x13AB}, {0xAB7C,0x13AC}, {0xAB7D,0x13AD}, {0xAB7E,0x13AE}, {0xAB7F,0x13AF}, {0xAB80,0x13B0}, {0xAB81,0x13B1}, {0xAB82,0x13B2}, {0xAB83,0x13B3}, {0xAB84,0x13B4}, {0xAB85,0x13B5}, {0xAB86,0x13B6}, {0xAB87,0x13B7}, {0xAB88,0x13B8}, {0xAB89,0x13B9}, {0xAB8A,0x13BA}, {0xAB8B,0x13BB}, {0xAB8C,0x13BC}, {0xAB8D,0x13BD}, {0xAB8E,0x13BE}, {0xAB8F,0x13BF}, {0xAB90,0x13C0}, {0xAB91,0x13C1}, {0xAB92,0x13C2}, {0xAB93,0x13C3}, {0xAB94,0x13C4}, {0xAB95,0x13C5}, {0xAB96,0x13C6}, {0xAB97,0x13C7}, {0xAB98,0x13C8}, {0xAB99,0x13C9}, {0xAB9A,0x13CA}, {0xAB9B,0x13CB}, {0xAB9C,0x13CC}, {0xAB9D,0x13CD}, 
{0xAB9E,0x13CE}, {0xAB9F,0x13CF}, {0xABA0,0x13D0}, {0xABA1,0x13D1}, {0xABA2,0x13D2}, {0xABA3,0x13D3}, {0xABA4,0x13D4}, {0xABA5,0x13D5}, {0xABA6,0x13D6}, {0xABA7,0x13D7}, {0xABA8,0x13D8}, {0xABA9,0x13D9}, {0xABAA,0x13DA}, {0xABAB,0x13DB}, {0xABAC,0x13DC}, {0xABAD,0x13DD}, {0xABAE,0x13DE}, {0xABAF,0x13DF}, {0xABB0,0x13E0}, {0xABB1,0x13E1}, {0xABB2,0x13E2}, {0xABB3,0x13E3}, {0xABB4,0x13E4}, {0xABB5,0x13E5}, {0xABB6,0x13E6}, {0xABB7,0x13E7}, {0xABB8,0x13E8}, {0xABB9,0x13E9}, {0xABBA,0x13EA}, {0xABBB,0x13EB}, {0xABBC,0x13EC}, {0xABBD,0x13ED}, {0xABBE,0x13EE}, {0xABBF,0x13EF}, {0xFF21,0xFF41}, {0xFF22,0xFF42}, {0xFF23,0xFF43}, {0xFF24,0xFF44}, {0xFF25,0xFF45}, {0xFF26,0xFF46}, {0xFF27,0xFF47}, {0xFF28,0xFF48}, {0xFF29,0xFF49}, {0xFF2A,0xFF4A}, {0xFF2B,0xFF4B}, {0xFF2C,0xFF4C}, {0xFF2D,0xFF4D}, {0xFF2E,0xFF4E}, {0xFF2F,0xFF4F}, {0xFF30,0xFF50}, {0xFF31,0xFF51}, {0xFF32,0xFF52}, {0xFF33,0xFF53}, {0xFF34,0xFF54}, {0xFF35,0xFF55}, {0xFF36,0xFF56}, {0xFF37,0xFF57}, {0xFF38,0xFF58}, {0xFF39,0xFF59}, {0xFF3A,0xFF5A}, }; character_set-1.8.0/ext/character_set/unicode_casefold_table.h.tmpl0000644000004100000410000000042714620142357025556 0ustar www-datawww-data// THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT // -*-C-*- typedef struct casefold_mapping { unsigned long from; unsigned long to; } casefold_mapping; #define CASEFOLD_COUNT 0 static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {}; character_set-1.8.0/ext/character_set/extconf.rb0000644000004100000410000000017514620142357021770 0ustar www-datawww-datarequire 'mkmf' $CFLAGS << ' -Wextra -Wno-unused-parameter -Wall -pedantic ' create_makefile('character_set/character_set') character_set-1.8.0/.gitattributes0000644000004100000410000000015014620142357017252 0ustar www-datawww-data*.cps linguist-detectable=false benchmarks/* linguist-detectable=false spec/* linguist-detectable=false character_set-1.8.0/README.md0000644000004100000410000001666114620142357015654 0ustar www-datawww-data# 
CharacterSet [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set) [![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions) [![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions) [![Coverage](https://codecov.io/gh/jaynetics/character_set/branch/main/graph/badge.svg?token=oY7gcWNbIN)](https://codecov.io/gh/jaynetics/character_set) This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can [read](#parseinitialize) and [write](#write) sets of codepoints in various formats and it implements the stdlib `Set` interface for them. It also offers a [way of scrubbing and scanning characters in Strings](#interact-with-strings) that is more semantic and consistently offers better performance than `Regexp` and `String` methods from the stdlib for this (see [benchmarks](./BENCHMARK.md)). Many parts can be used independently, e.g.: - `CharacterSet::Character` - `CharacterSet::ExpressionConverter` - `CharacterSet::Parser` - `CharacterSet::Writer` ## Usage ### Usage examples ```ruby CharacterSet.url_query.cover?('?a=(b$c;)') # => true CharacterSet.non_ascii.delete_in!(string) CharacterSet.emoji.sample(5) # => ["⛷", "👈", "🌞", "♑", "⛈"] ``` ### Parse/Initialize These all produce a `CharacterSet` containing `a`, `b` and `c`: ```ruby CharacterSet['a', 'b', 'c'] CharacterSet[97, 98, 99] CharacterSet.new('a'..'c') CharacterSet.new(0x61..0x63) CharacterSet.of('abacababa') CharacterSet.parse('[a-c]') CharacterSet.parse('\U00000061-\U00000063') ``` If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` instances and unicode property names can also be read. 
```ruby CharacterSet.of(/./) # => # CharacterSet.of_property('Thai') # => # require 'character_set/core_ext/regexp_ext' /[\D&&[:ascii:]&&\p{emoji}]/.character_set.size # => 2 ``` ### Predefined utility sets `ascii`, `ascii_alnum`, `ascii_letter`, `assigned`, `bmp`, `crypt`, `emoji`, `newline`, `surrogate`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace` ```ruby CharacterSet.ascii # => # # all can be prefixed with `non_`, e.g. CharacterSet.non_ascii ``` ### Interact with Strings `CharacterSet` can replace some types of `String` handling with better performance than the stdlib. `#used_by?` and `#cover?` can replace some `Regexp#match?` calls: ```ruby CharacterSet.ascii.used_by?('Tüür') # => true CharacterSet.ascii.cover?('Tüür') # => false CharacterSet.ascii.cover?('Tr') # => true ``` `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like: ```ruby string = 'Tüür' CharacterSet.ascii.delete_in(string) # => 'üü' CharacterSet.ascii.keep_in(string) # => 'Tr' string # => 'Tüür' CharacterSet.ascii.delete_in!(string) # => 'üü' string # => 'üü' CharacterSet.ascii.keep_in!(string) # => '' string # => '' ``` `#count_in` and `#scan` can replace `String#count` and `String#scan`: ```ruby CharacterSet.non_ascii.count_in('Tüür') # => 2 CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü'] ``` There is also a core extension for String interaction. ```ruby require 'character_set/core_ext/string_ext' "a\rb".character_set & CharacterSet.newline # => CharacterSet["\r"] "a\rb".uses_character_set?(CharacterSet['ä', 'ö', 'ü']) # => false "a\rb".covered_by_character_set?(CharacterSet.newline) # => false # predefined sets can also be referenced via Symbols "a\rb".covered_by_character_set?(:ascii) # => true "a\rb".delete_character_set(:newline) # => 'ab' # etc. ``` ### Manipulate Use [any Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. 
to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members. Where appropriate, methods take both chars and codepoints, e.g.: ```ruby CharacterSet['a'].add('b') # => CharacterSet['a', 'b'] CharacterSet['a'].add(98) # => CharacterSet['a', 'b'] CharacterSet['a'].include?('a') # => true CharacterSet['a'].include?(0x61) # => true ``` `#inversion` can be used to create a `CharacterSet` with all valid Unicode codepoints that are not in the current set: ```ruby non_a = CharacterSet['a'].inversion # => # non_a.include?('a') # => false non_a.include?('ü') # => true # surrogate pair halves are not included by default CharacterSet['a'].inversion(include_surrogates: true) # => # ``` `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented: ```ruby CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a'] ``` ### Write ```ruby set = CharacterSet['a', 'b', 'c', 'j', '-'] # safely printable ASCII chars are not escaped by default set.to_s # => 'a-cj\x2D' set.to_s(escape_all: true) # => '\x61-\x63\x6A\x2D' # brackets may be added set.to_s(in_brackets: true) # => '[a-cj\x2D]' # the default escape format is Ruby/ES6 compatible, others are available set = CharacterSet['a', 'b', 'c', 'ɘ', '🤩'] set.to_s # => 'a-c\u0258\u{1F929}' set.to_s(format: 'U+') # => 'a-cU+0258U+1F929' set.to_s(format: 'Python') # => "a-c\u0258\U0001F929" set.to_s(format: 'raw') # => 'a-cɘ🤩' # or pass a block set.to_s { |char| "[#{char.codepoint}]" } # => "a-c[600][129321]" set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>" # disable abbreviation (grouping of codepoints in ranges) set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}" # astral members require some trickery if we want to target environments # that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript. set = CharacterSet['a', 'b', '🤩', '🤪', '🤫'] # Use #to_s_with_surrogate_ranges e.g. 
for JavaScript: set.to_s_with_surrogate_ranges # => '(?:[ab]|\uD83E[\uDD29-\uDD2B])' # Or use #to_s_with_surrogate_alternation if such surrogate set pairs # don't work in your target environment: set.to_s_with_surrogate_alternation # => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)' ``` ### Other features #### Secure tokens Generate secure random strings of characters from a set: ```ruby CharacterSet.new('a'..'z').secure_token(8) # => "ugwpujmt" CharacterSet.crypt.secure_token # => "8.1w7aBT737/pMfcMoO4y2y8/=0xtmo:" ``` #### Unicode planes There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts: ```Ruby CharacterSet['a', 'ü', '🤩'].ascii_part # => CharacterSet['a'] CharacterSet['a', 'ü', '🤩'].ascii_part? # => true CharacterSet['a', 'ü', '🤩'].ascii_only? # => false CharacterSet['a', 'ü', '🤩'].ascii_ratio # => 0.3333333 CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü'] CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩'] CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666 CharacterSet['a', 'ü', '🤩'].planes # => [0, 1] CharacterSet['a', 'ü', '🤩'].plane(1) # => CharacterSet['🤩'] CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false CharacterSet::Character.new('a').plane # => 0 ``` ## Contributions Feel free to send suggestions, point out issues, or submit pull requests. 
character_set-1.8.0/.rubocop.yml0000644000004100000410000000061414620142357016636 0ustar www-datawww-dataAllCops: Exclude: - '**/doc/*' - '**/pkg/*' - '**/spec/ruby-spec/**/*' - '**/vendor/**/*' # vendored dependencies NewCops: enable RubyInterpreters: - ruby - rake TargetRubyVersion: 2.5 # really 2.1, but 2.5 is lowest supported by rubocop Lint/AmbiguousOperatorPrecedence: Enabled: false Lint/AmbiguousRegexpLiteral: Enabled: false Metrics: Enabled: false character_set-1.8.0/CHANGELOG.md0000644000004100000410000001012414620142357016172 0ustar www-datawww-data# Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [Unreleased] ## [1.8.0] - 2024-01-07 ### Added - support for `#<=>` and `#join`, which were added to `set` in the meantime - support for getting the (overall) character set of a Regexp with multiple expressions - support for global and local case-insensitivity in Regexp inputs - `Regexp#{covered_by_character_set?,uses_character_set?}` methods (if core ext is used) ## [1.7.0] - 2023-05-12 ### Added - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0 ### Fixed - fixed processing of Strings that are not ASCII- or UTF8-encoded - removed dependency on `set` and `sorted_set` - thanks to https://github.com/mikebaldry for reporting a related issue (#2) ## [1.6.0] - 2022-02-16 ### Added - `::of` now supports both `String` and `Regexp` arguments ### Fixed - fixed segfault during `String` manipulation on Ruby 3.2.0-dev - improved performance for `String` manipulation - allow usage in Ractors - predefined sets must be pre-initialized for this, though - e.g. `CharacterSet.ascii`, `keep_character_set(:ascii)` etc. 
- call them once in the main Ractor to trigger initialization ## [1.5.0] - 2021-12-05 ### Added - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.1.0 - latest unicode case-folding data (for `#case_insensitive`) - support for passing any Enumerable to `#disjoint?`, `#intersect?` - this matches recent broadening of these methods in `ruby/set` - new instance method `#secure_token` (see README) - class method `::of` now accepts more than one `String` - `CharacterSet::ExpressionConverter` can now build output of any Set-like class ### Fixed - `CharacterSet::Pure::of_expression` now returns a `CharacterSet::Pure` - it used to return a regular `CharacterSet` ## [1.4.1] - 2020-01-10 ### Fixed - multiple fixes for Ruby 3 - fixed segfault for some `String` manipulation cases - added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works - fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`) ## [1.4.0] - 2019-06-07 ### Added - `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges` - allows for much shorter astral plane representations e.g. in JavaScript - thanks to https://github.com/singpolyma for the suggestion and groundwork (#1) - improved performance for `#to_s` / `Writer` by avoiding bugged `Range#minmax` ### Fixed - '/' is now escaped by default when stringifying so as to work with //-regexp syntax ## [1.3.0] - 2019-04-26 ### Added - improved `String` manipulation speed - improved initialization and `#merge` speed when passing a large `Range` - reduced memory consumption by > 90% for most use cases via dynamic resizing - before, every set instance required 136 KB for codepoints - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc. 
- `#count_in` and `#scan` methods for `String` interaction - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate` - conversion methods `#assigned_part`, `#valid_part` - sectioning methods `#ascii_part`, `#plane(n)` - section test methods `#ascii_part?`, `#ascii_ratio`, `#ascii_only?`, `#astral_only?` ### Fixed - `#count` now supports passing an argument or block as usual - `CharacterSet::Pure#keep_in`, `#delete_in` now preserve the original encoding ## [1.2.0] - 2019-04-02 ### Added - added latest Unicode casefold data (for `#case_insensitive`) ## [1.1.2] - 2018-09-25 ### Fixed - restored `range_compressor` as a runtime dependency for JRuby only ## [1.1.1] - 2018-09-24 ### Fixed - improved messages for missing optional dependencies - made `range_compressor` an optional dependency as it is almost never needed ## [1.1.0] - 2018-09-21 ### Added - added option to reference a predefined set via Symbol in `String` extension methods - added predefined sets `::ascii_alnum` and `::ascii_letter` ## [1.0.0] - 2018-09-02 Initial release.