character-set-1.1.2/0000755000175100017510000000000013373461276013302 5ustar pravipravicharacter-set-1.1.2/Rakefile0000644000175100017510000000760613373461276014760 0ustar pravipravirequire 'bundler/gem_tasks' require 'rspec/core/rake_task' require 'rubygems/package_task' require 'rake/extensiontask' RSpec::Core::RakeTask.new(:spec) task default: :spec Rake::ExtensionTask.new('character_set') do |ext| ext.lib_dir = 'lib/character_set' end namespace :java do java_gemspec = eval File.read('./character_set.gemspec') java_gemspec.platform = 'java' java_gemspec.extensions = [] java_gemspec.add_runtime_dependency 'range_compressor', '~> 1.0' Gem::PackageTask.new(java_gemspec) do |pkg| pkg.need_zip = true pkg.need_tar = true pkg.package_dir = 'pkg' end end task package: 'java:gem' desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants' task :sync_ruby_spec do require 'fileutils' variants = { 'CharacterSet' => './spec/ruby-spec/library/character_set', 'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure', } variants.each do |_, dir| FileUtils.rm_rf(dir) if File.exist?(dir) `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}` end base = variants.first[1] variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base } variants.each.with_index do |(class_name, dir), i| Dir["#{dir}/**/*.rb"].each do |spec| # remove some tests that do not apply or are covered otherwise if spec =~ %r{/(flatten|initialize|pretty_print)} File.delete(spec) next end # some examples w. 
desc 'Download unicode casefold data and write new C header file'
task :sync_casefold_data do
  src_url  = 'https://www.unicode.org/Public/UNIDATA/CaseFolding.txt'
  src_path = './CaseFolding.txt'
  dst_path = './ext/character_set/unicode_casefold_table.h'

  begin
    # `-O` pins the download target: a plain `wget URL` would save the new data
    # as CaseFolding.txt.1 if a stale ./CaseFolding.txt is lying around, and the
    # stale file would then be parsed. `sh` aborts the task if wget fails.
    sh 'wget', '-O', src_path, src_url

    # Each data line looks like `<from>; <type>; <to>; # comment`.
    # NOTE(review): the pairs are sorted as hex *strings* (lexicographically),
    # not numerically - confirm that the C side does not rely on numeric order.
    mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
      from, type, to = line.split(/\s*;\s*/).first(3)
      # type 'C' stands for 'common', excludes mappings to multiple chars
      hash[from] = to if type == 'C'
    end.sort

    File.open(dst_path, 'w') do |f|
      f.puts <<-C
// THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT

typedef struct casefold_mapping {
  unsigned long from;
  unsigned long to;
} casefold_mapping;

#define CASEFOLD_COUNT #{mapping.size}

static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
      C

      mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }

      f.puts '};'
    end
  ensure
    # remove the downloaded source file even if parsing or writing failed
    File.unlink(src_path) if File.exist?(src_path)
  end
end
result.strip.gsub(/(same-ish).*$/, '\1').lines[1..-1], '```' end end end end unless RUBY_PLATFORM =~ /java/ # recompile before benchmarking or running specs task(:benchmark).enhance([:compile]) task(:spec).enhance([:compile]) end character-set-1.1.2/bin/0000755000175100017510000000000013373461276014052 5ustar pravipravicharacter-set-1.1.2/bin/setup0000755000175100017510000000020313373461276015133 0ustar pravipravi#!/usr/bin/env bash set -euo pipefail IFS=$'\n\t' set -vx bundle install # Do any other automated setup that you need to do here character-set-1.1.2/bin/console0000755000175100017510000000052113373461276015440 0ustar pravipravi#!/usr/bin/env ruby require 'bundler/setup' require 'character_set' require 'character_set/core_ext' require 'character_set/pure' require 'regexp_property_values' CS = CharacterSet CP = CharacterSet::Pure PV = RegexpPropertyValues require 'benchmark' def m(&block); Benchmark.measure(&block); end require "irb" IRB.start(__FILE__) character-set-1.1.2/README.md0000644000175100017510000001321513373461276014563 0ustar pravipravi# CharacterSet [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set) [![Build Status](https://travis-ci.org/janosch-x/character_set.svg?branch=master)](https://travis-ci.org/janosch-x/character_set) A gem to build, read, write and compare sets of Unicode codepoints. 
Many parts can be used independently, e.g.: - `CharacterSet::Character` - `CharacterSet::Parser` - `CharacterSet::Writer` - [`RangeCompressor`](https://github.com/janosch-x/range_compressor) ## Usage ### Usage examples ```ruby CharacterSet.url_query.cover?('?a=(b$c;)') # => true CharacterSet.non_ascii.delete_in!(string) CharacterSet.emoji.sample(5) # => ["⛷", "👈", "🌞", "♑", "⛈"] ``` ### Parse/Initialize These all produce a `CharacterSet` containing `a`, `b` and `c`: ```ruby CharacterSet['a', 'b', 'c'] CharacterSet[97, 98, 99] CharacterSet.new('a'..'c') CharacterSet.new(0x61..0x63) CharacterSet.of('abacababa') CharacterSet.parse('[a-c]') CharacterSet.parse('\U00000061-\U00000063') ``` If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/janosch-x/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. ```ruby CharacterSet.of_property('Thai') # => # require 'character_set/core_ext/regexp_ext' /[\D&&[:ascii:]&&\p{emoji}]/.character_set.size # => 2 ``` ### Predefined utility sets `ascii`, `ascii_alnum`, `ascii_letters`, `bmp`, `crypt`, `emoji`, `newline`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace` ```ruby CharacterSet.ascii # => # # all can be prefixed with `non_`, e.g. CharacterSet.non_ascii ``` ### Interact with Strings CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)). 
`#used_by?` and `#cover?` can replace some `Regexp#match?` calls: ```ruby CharacterSet.ascii.used_by?('Tüür') # => true CharacterSet.ascii.cover?('Tüür') # => false CharacterSet.ascii.cover?('Tr') # => true ``` `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like: ```ruby string = 'Tüür' CharacterSet.ascii.delete_in(string) # => 'üü' CharacterSet.ascii.keep_in(string) # => 'Tr' string # => 'Tüür' CharacterSet.ascii.delete_in!(string) # => 'üü' string # => 'üü' CharacterSet.ascii.keep_in!(string) # => '' string # => '' ``` There is also a core extension for String interaction. ```ruby require 'character_set/core_ext/string_ext' "a\rb".character_set & CharacterSet.newline # => CharacterSet["\r"] "a\rb".uses_character_set?(CharacterSet['ä', 'ö', 'ü']) # => false "a\rb".covered_by_character_set?(CharacterSet.newline) # => false # predefined sets can also be referenced via Symbols "a\rb".covered_by_character_set?(:ascii) # => true "a\rb".delete_character_set(:newline) # => 'ab' # etc. ``` ### Manipulate Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members. 
Where appropriate, methods take both chars and codepoints, e.g.: ```ruby CharacterSet['a'].add('b') # => CharacterSet['a', 'b'] CharacterSet['a'].add(98) # => CharacterSet['a', 'b'] CharacterSet['a'].include?('a') # => true CharacterSet['a'].include?(0x61) # => true ``` `#inversion` can be used to create a `CharacterSet` with all valid Unicode codepoints that are not in the current set: ```ruby non_a = CharacterSet['a'].inversion # => # non_a.include?('a') # => false non_a.include?('ü') # => true # surrogate pair halves are not included by default CharacterSet['a'].inversion(include_surrogates: true) # => # ``` `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented: ```ruby CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a'] ``` ### Write ```ruby set = CharacterSet['a', 'b', 'c', 'j', '-'] # safely printable ASCII chars are not escaped by default set.to_s # => 'a-cj\x2D' set.to_s(escape_all: true) # => '\x61-\x63\x6A\x2D' # brackets may be added set.to_s(in_brackets: true) # => '[a-cj\x2D]' # the default escape format is Ruby/ES6 compatible, others are available set = CharacterSet['a', 'b', 'c', 'ɘ', '🤩'] set.to_s # => 'a-c\u0258\u{1F929}' set.to_s(format: 'U+') # => 'a-cU+0258U+1F929' set.to_s(format: 'Python') # => "a-c\u0258\U0001F929" set.to_s(format: 'raw') # => 'a-cɘ🤩' # or pass a block set.to_s { |char| "[#{char.codepoint}]" } # => "a-c[600][129321]" set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>" # disable abbreviation (grouping of codepoints in ranges) set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}" # for full js regex compatibility in case of astral members: set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)' ``` ### Unicode plane methods There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts: ```Ruby CharacterSet['a', 'ü', 
'🤩'].bmp_part # => CharacterSet['a', 'ü'] CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩'] CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666 CharacterSet['a', 'ü', '🤩'].planes # => [0, 1] CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false CharacterSet::Character.new('a').plane # => 0 ``` ### Contributions Feel free to send suggestions, point out issues, or submit pull requests. character-set-1.1.2/.gitignore0000644000175100017510000000054413373461276015275 0ustar pravipravi*.bundle *.gem *.iml *.stTheme.cache *.sublime-project *.sublime-workspace *.swp *.tmlanguage.cache *.tmPreferences.cache *~ .byebug_history .DS_Store .idea/ .ruby-gemset .ruby-version .tags .tags1 bbin/ binstubs/* bundler_stubs/*/.yardoc Gemfile.lock /.bundle/ /_yardoc/ /coverage/ /doc/ /pkg/ /spec/reports/ /tmp/ # rspec failure tracking .rspec_status character-set-1.1.2/.travis.yml0000644000175100017510000000022213373461276015407 0ustar pravipravisudo: false language: ruby rvm: - 2.1 - 2.4 - 2.5 - 2.6 - jruby-9.1.9.0 before_install: - gem update --system - gem install bundler character-set-1.1.2/lib/0000755000175100017510000000000013373461276014050 5ustar pravipravicharacter-set-1.1.2/lib/character_set.rb0000644000175100017510000000105713373461276017207 0ustar pravipravirequire 'character_set/character' require 'character_set/expression_converter' require 'character_set/parser' require 'character_set/predefined_sets' require 'character_set/set_method_adapters' require 'character_set/shared_methods' require 'character_set/version' require 'character_set/writer' class CharacterSet begin require 'character_set/character_set' rescue LoadError require 'character_set/ruby_fallback' prepend RubyFallback end prepend SetMethodAdapters include Enumerable include SharedMethods extend PredefinedSets end character-set-1.1.2/lib/character_set/0000755000175100017510000000000013373461276016657 5ustar 
class CharacterSet
  # Wraps a single codepoint and renders it either as the raw character or
  # in one of several escape notations (see #escape).
  class Character
    ENCODING = 'utf-8'.freeze

    # Codepoints that #escape emits verbatim unless `escape_all` is set:
    # 0x21..0x7E minus `-`, `[`, `\`, `]` and `^`.
    # Frozen because it is a shared lookup table that nothing should mutate.
    SAFELY_PRINTABLE =
      ((0x21..0x7E).to_a - ['-', '[', '\\', ']', '^'].map(&:ord)).freeze

    attr_accessor :codepoint

    # @param codepoint [Integer, String] a codepoint number, or a String whose
    #   first character (String#ord) is used
    # @raise [ArgumentError] if anything else is passed
    def initialize(codepoint)
      case codepoint
      when Integer then self.codepoint = codepoint
      when String  then self.codepoint = codepoint.ord
      else raise ArgumentError, 'pass an Integer or String'
      end
    end

    # @return [String] the character itself, UTF-8 encoded
    def to_s
      codepoint.chr(ENCODING)
    end

    # @return [String] the codepoint as upper-case hex digits, unpadded
    def hex
      codepoint.to_s(16).upcase
    end

    # Returns an escaped representation of the character.
    #
    # Safely printable chars are returned as-is unless `opts[:escape_all]` is
    # truthy. Otherwise a given block takes precedence over `opts[:format]`:
    # it is called with this Character and its result is returned.
    #
    # @param opts [Hash] `:escape_all` and `:format`; the format name is
    #   compared case-insensitively, ignoring `-`, `_` and spaces
    # @raise [ArgumentError] for an unknown format
    # @raise [RuntimeError] for astral codepoints in the java/js formats
    def escape(opts = {})
      return to_s if SAFELY_PRINTABLE.include?(codepoint) && !opts[:escape_all]
      return yield(self) if block_given?

      # https://billposer.org/Software/ListOfRepresentations.html
      case opts[:format].to_s.downcase.delete('-_ ')
      when '', 'default', 'es6', 'esnext', 'rb', 'ruby'
        default_escape(opts)
      when 'java', 'javascript', 'js'
        default_escape(opts, false)
      when 'capitalizableu', 'c#', 'csharp', 'd', 'python'
        capitalizable_u_escape
      when 'u+', 'uplus'
        u_plus_escape
      when 'literal', 'raw'
        to_s
      else
        raise ArgumentError, "unsupported format: #{opts[:format].inspect}"
      end
    end

    # @return [Integer] index of the 0x10000-sized plane holding the codepoint
    def plane
      codepoint / 0x10000
    end

    private

    # \xHH for up to two hex digits, \uHHHH for up to four, \u{H...} beyond
    # that - or an error if the target format has no wide escape.
    def default_escape(opts, support_wide_hex = true)
      if hex.length <= 2
        '\\x' + hex.rjust(2, '0')
      elsif hex.length <= 4
        '\\u' + hex.rjust(4, '0')
      elsif support_wide_hex
        '\\u{' + hex + '}'
      else
        raise "#{opts[:format]} does not support escaping astral value #{hex}"
      end
    end

    # \uHHHH for up to four hex digits, \UHHHHHHHH otherwise.
    def capitalizable_u_escape
      if hex.length <= 4
        '\\u' + hex.rjust(4, '0')
      else
        '\\U' + hex.rjust(8, '0')
      end
    end

    # U+HHHH, zero-padded to at least four hex digits.
    def u_plus_escape
      'U+' + hex.rjust(4, '0')
    end
  end
end
class CharacterSet
  module CoreExt
    # Mixin for String that forwards to the CharacterSet string helpers.
    #
    # Every forwarding method takes either an object responding to the target
    # method, or a Symbol that is sent to CharacterSet to obtain such an object
    # (e.g. `:ascii`).
    module StringExt
      # String method name => method called on the set, with self as argument
      delegations = {
        covered_by_character_set?: :cover?,
        delete_character_set:      :delete_in,
        delete_character_set!:     :delete_in!,
        keep_character_set:        :keep_in,
        keep_character_set!:       :keep_in!,
        uses_character_set?:       :used_by?,
      }

      # Builds a set from the receiver via CharacterSet.of.
      def character_set
        CharacterSet.of(self)
      end

      delegations.each_pair do |own_name, target_name|
        class_eval <<-RUBY, __FILE__, __LINE__ + 1
          def #{own_name}(arg)
            set = arg.instance_of?(Symbol) ? CharacterSet.__send__(arg) : arg
            set.#{target_name}(self)
          end
        RUBY
      end
    end
  end
end

::String.send(:include, CharacterSet::CoreExt::StringExt)
/[a-z]/' end convert(expression[0]) when Regexp::Expression::CharacterSet content = expression.map { |subexp| convert(subexp) }.reduce(:+) expression.negative? ? content.inversion : content when Regexp::Expression::CharacterSet::Intersection expression.map { |subexp| convert(subexp) }.reduce(:&) when Regexp::Expression::CharacterSet::IntersectedSequence expression.map { |subexp| convert(subexp) }.reduce(:+) when Regexp::Expression::CharacterSet::Range start, finish = expression.map { |subexp| convert(subexp) } CharacterSet.from_ranges((start.min)..(finish.max)) when Regexp::Expression::CharacterType::Any CharacterSet.unicode when Regexp::Expression::CharacterType::Digit CharacterSet.from_ranges(48..57) when Regexp::Expression::CharacterType::NonDigit CharacterSet.from_ranges(48..57).inversion when Regexp::Expression::CharacterType::Hex CharacterSet.from_ranges(48..57, 65..70, 97..102) when Regexp::Expression::CharacterType::NonHex CharacterSet.from_ranges(48..57, 65..70, 97..102).inversion when Regexp::Expression::CharacterType::Space CharacterSet["\t", "\n", "\v", "\f", "\r", "\x20"] when Regexp::Expression::CharacterType::NonSpace CharacterSet["\t", "\n", "\v", "\f", "\r", "\x20"].inversion when Regexp::Expression::CharacterType::Word CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122) when Regexp::Expression::CharacterType::NonWord CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122).inversion when Regexp::Expression::EscapeSequence::CodepointList CharacterSet.new(expression.codepoints) when Regexp::Expression::EscapeSequence::Base CharacterSet[expression.codepoint] when Regexp::Expression::Group::Capture, Regexp::Expression::Group::Passive, Regexp::Expression::Group::Named, Regexp::Expression::Group::Atomic, Regexp::Expression::Group::Options case expression.count when 0 then CharacterSet[] when 1 then convert(expression.first) else raise Error, 'Groups must contain exactly one expression, e.g. 
([a-z])' end when Regexp::Expression::Alternation expression.map { |subexp| convert(subexp) }.reduce(:+) when Regexp::Expression::Alternative case expression.count when 0 then CharacterSet[] when 1 then convert(expression.first) else raise Error, 'Alternatives must contain exactly one expression' end when Regexp::Expression::Literal if expression.set_level == 0 && expression.text.size != 1 raise Error, 'Literal runs outside of sets are codepoint *sequences*' end CharacterSet[expression.text.ord] when Regexp::Expression::UnicodeProperty::Base, Regexp::Expression::PosixClass content = CharacterSet.of_property(expression.token) expression.negative? ? content.inversion : content when Regexp::Expression::Base raise Error, "Unsupported expression class `#{expression.class}`" else raise Error, "Pass an expression (result of Regexp::Parser.parse)" end end end end character-set-1.1.2/lib/character_set/shared_methods.rb0000644000175100017510000001163613373461276022204 0ustar pravipravi# # Various methods shared by the pure-Ruby and the extended implementation. # # Many of these methods are hotspots, so they are defined directly on # the including classes for better performance. # class CharacterSet module SharedMethods def self.included(klass) klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1 LoadError = Class.new(::LoadError) class << self def [](*args) new(Array(args)) end def parse(string) codepoints = Parser.codepoints_from_bracket_expression(string) result = new(codepoints) string.start_with?('[^') ? 
# Requires the optional gem `name` on first use and remembers the success in
# `required_optional_dependencies`, so later calls skip the `require`.
#
# If the gem is missing, a LoadError is raised whose message names the gem
# and the outermost caller located under /lib/character_set.
#
# NOTE: this method lives inside a heredoc passed to class_eval, so the
# interpolations in the message below are backslash-escaped on purpose.
def require_optional_dependency(name)
  required_optional_dependencies[name] ||= begin
    require name
    true
  rescue ::LoadError
    # walk the stack from the outside in to find the library entry point
    entry_point = caller_locations.reverse.find do |loc|
      loc.absolute_path.to_s.include?('/lib/character_set')
    end
    method = entry_point && entry_point.label
    raise LoadError, 'You must install the optional dependency '\
      "'\#{name}' to use the method `\#{method}'."
  end
end
class CharacterSet
  module RubyFallback
    # Pure-Ruby implementations of the CharacterSet-specific (non-Set) methods.
    #
    # The including class must provide `include?`, `each`, `<<`, `dup` and
    # `to_a(stringify)`; members are codepoint Integers.
    module CharacterSetMethods
      module ClassMethods
        # Builds a set from the codepoints of all given Ranges.
        def from_ranges(*ranges)
          new(Array(ranges).flat_map(&:to_a))
        end

        # Builds a set from the codepoints used in `string`.
        # @raise [ArgumentError] unless a String is passed
        def of(string)
          raise ArgumentError, 'pass a String' unless string.is_a?(String)
          new(string.codepoints)
        end
      end

      # Returns a new set with all codepoints from 0 to `upto` that are not
      # in this set. Surrogates (0xD800..0xDFFF) are only considered if
      # `include_surrogates` is true.
      def inversion(include_surrogates: false, upto: 0x10FFFF)
        new_set = self.class.new
        0.upto(upto) do |cp|
          next unless include_surrogates || cp > 0xDFFF || cp < 0xD800
          new_set << cp unless include?(cp)
        end
        new_set
      end

      # Returns a copy of this set that also contains the swapped-case
      # variant of every member whose swapped case is a single codepoint.
      def case_insensitive
        new_set = dup
        each do |cp|
          # surrogates can't be turned into a UTF-8 String (Integer#chr raises
          # RangeError for them), so there is nothing to swap
          next if cp >= 0xD800 && cp <= 0xDFFF

          swapped_cps = cp.chr('utf-8').swapcase.codepoints
          new_set << swapped_cps[0] if swapped_cps.size == 1
        end
        new_set
      end

      # Returns the members as an Array of Ranges, built by the optional
      # `range_compressor` gem.
      def ranges
        CharacterSet.require_optional_dependency('range_compressor')
        RangeCompressor.compress(self)
      end

      # Returns one random member as a String, or an Array of `count` members.
      def sample(count = nil)
        count.nil? ? to_a(true).sample : to_a(true).sample(count)
      end

      # True if at least one codepoint of `string` is in this set.
      def used_by?(string)
        str!(string).each_codepoint { |cp| return true if include?(cp) }
        false
      end

      # True if every codepoint of `string` is in this set.
      def cover?(string)
        str!(string).each_codepoint { |cp| return false unless include?(cp) }
        true
      end

      # Returns a new String without the codepoints that are in this set.
      def delete_in(string)
        make_new_str(string) { |cp, new_str| include?(cp) || (new_str << cp) }
      end

      # Like #delete_in, but modifies `string`. Returns nil if nothing changed.
      def delete_in!(string)
        result = delete_in(string)
        result.size == string.size ? nil : string.replace(result)
      end

      # Returns a new String with only the codepoints that are in this set.
      def keep_in(string)
        make_new_str(string) { |cp, new_str| include?(cp) && (new_str << cp) }
      end

      # Like #keep_in, but modifies `string`. Returns nil if nothing changed.
      def keep_in!(string)
        result = keep_in(string)
        result.size == string.size ? nil : string.replace(result)
      end

      private

      # Returns `obj` if it responds to #codepoints, raises otherwise.
      def str!(obj)
        raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints)
        obj
      end

      # Feeds each codepoint of `original` plus a fresh, mutable String to the
      # block and returns that String.
      def make_new_str(original, &block)
        new_string = str!(original).each_codepoint.each_with_object(''.dup, &block)
        # Object#tainted? and #taint were removed in Ruby 3.2; only propagate
        # taint on Rubies that still have the concept.
        if original.respond_to?(:tainted?) && original.tainted?
          new_string.taint
        else
          new_string
        end
      end
    end
  end
end
subtract].each do |mthd| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{mthd}(*args, &block) result = @__set.#{mthd}(*args, &block) result.is_a?(Set) ? self : result end RUBY end %w[& + - ^ | difference intersection union].each do |mthd| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{mthd}(enum, &block) if enum.respond_to?(:map) enum = enum.map { |el| el.is_a?(String) ? el.ord : el } end self.class.new(@__set.#{mthd}(enum, &block).to_a) end RUBY end %w[taint untaint].each do |mthd| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{mthd} @__set.#{mthd} super end RUBY end unless RUBY_PLATFORM[/java/i] def freeze @__set.to_a @__set.freeze super end end def merge(other) raise ArgumentError, 'pass an Enumerable' unless other.respond_to?(:each) # pass through #add to use the checks in SetMethodAdapters other.each { |e| add(e) } self end def ==(other) if equal?(other) true elsif other.instance_of?(self.class) @__set == other.instance_variable_get(:@__set) elsif other.is_a?(self.class) && size == other.size other.all? { |cp| @__set.include?(cp) } else false end end def eql?(other) return false unless other.is_a?(self.class) @__set.eql?(other.instance_variable_get(:@__set)) end def initialize_dup(orig) super @__set = orig.instance_variable_get(:@__set).dup end def initialize_clone(orig) super @__set = orig.instance_variable_get(:@__set).clone end def to_a(stringify = false) result = @__set.to_a stringify ? 
class CharacterSet
  # Turns Enumerables and simple bracket expressions into codepoints.
  module Parser
    module_function

    # Returns the members of `object` as codepoint Integers.
    #
    # Only the first element is inspected: if it is a valid codepoint Integer,
    # `object` itself is returned; if it is a one-character String, all
    # elements are mapped with #ord. An enumerable that yields nothing results
    # in an empty Array.
    #
    # @raise [ArgumentError] if `object` has no #each or its first element is
    #   neither of the above
    def codepoints_from_enumerable(object)
      raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
      # Use #each to check first element (only this works for all Enumerables)
      object.each do |e|
        return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
        return object.map(&:ord) if e.is_a?(String) && e.length == 1
        raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
      end
      # Nothing was yielded. Don't pass on the return value of #each, it is
      # not necessarily enumerable (e.g. the String behind `''.each_char`).
      []
    end

    # Returns the codepoints described by a simple bracket expression such as
    # `[a-c]`, `[^a-c]` or `\u0061-\u0063`. Brackets and a leading `^` are
    # stripped, the `^` itself is not evaluated here.
    #
    # A `-` builds a range from the previous to the next character unless it
    # is the first character or follows a backslash or another literal `-`.
    # A `-` at the very end has no range end and counts as a literal dash.
    #
    # @raise [ArgumentError] for non-Strings, and for syntax beyond this
    #   (escapes other than \u, \U, \x, nested sets, intersections)
    def codepoints_from_bracket_expression(string)
      raise ArgumentError, 'pass a String' unless string.is_a?(String)
      raise ArgumentError, 'advanced syntax' if string =~ /\\[^uUx]|[^\\]\[|&&/

      content = strip_brackets(string)
      literal_content = eval_escapes(content)

      codepoints = []
      prev_chr = nil
      in_range = false

      literal_content.each_char do |chr|
        if chr == '-' && prev_chr && prev_chr != '\\' && prev_chr != '-'
          in_range = true
          next
        end

        if in_range
          # prev_chr itself was already added when it was encountered
          codepoints.concat(((prev_chr.ord + 1)..(chr.ord)).to_a)
        else
          codepoints << chr.ord
        end
        in_range = false
        prev_chr = chr
      end

      # an unterminated range, as in `[a-]`: the dash is a plain member
      codepoints << '-'.ord if in_range
      codepoints
    end

    # Returns the content between `[` or `[^` and the closing `]`, or a copy
    # of the whole String if it is not bracketed that way.
    def strip_brackets(string)
      string[/\A\[\^?(.*)\]\z/, 1] || string.dup
    end

    # Replaces \UHHHHHHHH, \uHHHH, U+H..., \xHH and \u{H...} with the
    # characters they stand for.
    def eval_escapes(string)
      string.gsub(/\\U(\h{8})|\\u(\h{4})|U\+(\h+)|\\x(\h{2})|\\u\{(\h+)\}/) do
        ($1 || $2 || $3 || $4 || $5).to_i(16).chr('utf-8')
      end
    end
  end
end
%w[& + - ^ | difference intersection subtract union].each do |method| class_eval <<-RUBY, __FILE__, __LINE__ + 1 def #{method}(arg) if arg.is_a?(CharacterSet) super elsif arg.respond_to?(:each) super(CharacterSet.new(arg.to_a)) else raise ArgumentError, 'pass an enumerable' end end RUBY end end end character-set-1.1.2/lib/character_set/predefined_sets.rb0000644000175100017510000001403213373461276022347 0ustar pravipraviclass CharacterSet module PredefinedSets def ascii @ascii ||= from_ranges(0..0x7F).freeze end def ascii_alnum @ascii_alnum ||= from_ranges(0x30..0x39, 0x41..0x5A, 0x61..0x7A).freeze end def ascii_letters @ascii_letters ||= from_ranges(0x41..0x5A, 0x61..0x7A).freeze end # basic multilingual plane def bmp @bmp ||= from_ranges(0..0xD7FF, 0xE000..0xFFFF).freeze end # ./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz def crypt @crypt ||= from_ranges(0x2E..0x5A, 0x61..0x7A).freeze end def newline @newline ||= from_ranges(0xA..0xD, 0x85..0x85, 0x2028..0x2029).freeze end def unicode @unicode ||= from_ranges(0..0xD7FF, 0xE000..0x10FFFF).freeze end def url_fragment @url_fragment ||= from_ranges( 0x21..0x21, 0x24..0x24, 0x26..0x3B, 0x3D..0x3D, 0x3F..0x5A, 0x5F..0x5F, 0x61..0x7A, 0x7E..0x7E ).freeze end def url_host @url_host ||= from_ranges( 0x21..0x21, 0x24..0x24, 0x26..0x2E, 0x30..0x3B, 0x3D..0x3D, 0x41..0x5B, 0x5D..0x5D, 0x5F..0x5F, 0x61..0x7A, 0x7E..0x7E ).freeze end def url_path @url_path ||= from_ranges( 0x21..0x21, 0x24..0x3A, 0x3D..0x3D, 0x40..0x5A, 0x5F..0x5F, 0x61..0x7A, 0x7E..0x7E ).freeze end def url_query @url_query ||= from_ranges( 0x21..0x21, 0x24..0x24, 0x26..0x3B, 0x3D..0x3D, 0x3F..0x5A, 0x5F..0x5F, 0x61..0x7A, 0x7E..0x7E ).freeze end def whitespace @whitespace ||= from_ranges( 0x9..0x9, 0xA..0xD, 0x20..0x20, 0x85..0x85, 0xA0..0xA0, 0x1680..0x1680, 0x180E..0x180E, 0x2000..0x200A, 0x2028..0x2029, 0x202F..0x202F, 0x205F..0x205F, 0x3000..0x3000 ).freeze end def emoji @emoji ||= from_ranges( 0x23..0x23, 0x2A..0x2A, 0x30..0x39, 
0xA9..0xA9, 0xAE..0xAE, 0x203C..0x203C, 0x2049..0x2049, 0x2122..0x2122, 0x2139..0x2139, 0x2194..0x2199, 0x21A9..0x21AA, 0x231A..0x231B, 0x2328..0x2328, 0x23CF..0x23CF, 0x23E9..0x23F3, 0x23F8..0x23FA, 0x24C2..0x24C2, 0x25AA..0x25AB, 0x25B6..0x25B6, 0x25C0..0x25C0, 0x25FB..0x25FE, 0x2600..0x2604, 0x260E..0x260E, 0x2611..0x2611, 0x2614..0x2615, 0x2618..0x2618, 0x261D..0x261D, 0x2620..0x2620, 0x2622..0x2623, 0x2626..0x2626, 0x262A..0x262A, 0x262E..0x262F, 0x2638..0x263A, 0x2640..0x2640, 0x2642..0x2642, 0x2648..0x2653, 0x2660..0x2660, 0x2663..0x2663, 0x2665..0x2666, 0x2668..0x2668, 0x267B..0x267B, 0x267F..0x267F, 0x2692..0x2697, 0x2699..0x2699, 0x269B..0x269C, 0x26A0..0x26A1, 0x26AA..0x26AB, 0x26B0..0x26B1, 0x26BD..0x26BE, 0x26C4..0x26C5, 0x26C8..0x26C8, 0x26CE..0x26CF, 0x26D1..0x26D1, 0x26D3..0x26D4, 0x26E9..0x26EA, 0x26F0..0x26F5, 0x26F7..0x26FA, 0x26FD..0x26FD, 0x2702..0x2702, 0x2705..0x2705, 0x2708..0x270D, 0x270F..0x270F, 0x2712..0x2712, 0x2714..0x2714, 0x2716..0x2716, 0x271D..0x271D, 0x2721..0x2721, 0x2728..0x2728, 0x2733..0x2734, 0x2744..0x2744, 0x2747..0x2747, 0x274C..0x274C, 0x274E..0x274E, 0x2753..0x2755, 0x2757..0x2757, 0x2763..0x2764, 0x2795..0x2797, 0x27A1..0x27A1, 0x27B0..0x27B0, 0x27BF..0x27BF, 0x2934..0x2935, 0x2B05..0x2B07, 0x2B1B..0x2B1C, 0x2B50..0x2B50, 0x2B55..0x2B55, 0x3030..0x3030, 0x303D..0x303D, 0x3297..0x3297, 0x3299..0x3299, 0x1F004..0x1F004, 0x1F0CF..0x1F0CF, 0x1F170..0x1F171, 0x1F17E..0x1F17F, 0x1F18E..0x1F18E, 0x1F191..0x1F19A, 0x1F1E6..0x1F1FF, 0x1F201..0x1F202, 0x1F21A..0x1F21A, 0x1F22F..0x1F22F, 0x1F232..0x1F23A, 0x1F250..0x1F251, 0x1F300..0x1F321, 0x1F324..0x1F393, 0x1F396..0x1F397, 0x1F399..0x1F39B, 0x1F39E..0x1F3F0, 0x1F3F3..0x1F3F5, 0x1F3F7..0x1F4FD, 0x1F4FF..0x1F53D, 0x1F549..0x1F54E, 0x1F550..0x1F567, 0x1F56F..0x1F570, 0x1F573..0x1F57A, 0x1F587..0x1F587, 0x1F58A..0x1F58D, 0x1F590..0x1F590, 0x1F595..0x1F596, 0x1F5A4..0x1F5A5, 0x1F5A8..0x1F5A8, 0x1F5B1..0x1F5B2, 0x1F5BC..0x1F5BC, 0x1F5C2..0x1F5C4, 0x1F5D1..0x1F5D3, 0x1F5DC..0x1F5DE, 
0x1F5E1..0x1F5E1, 0x1F5E3..0x1F5E3, 0x1F5E8..0x1F5E8, 0x1F5EF..0x1F5EF, 0x1F5F3..0x1F5F3, 0x1F5FA..0x1F64F, 0x1F680..0x1F6C5, 0x1F6CB..0x1F6D2, 0x1F6E0..0x1F6E5, 0x1F6E9..0x1F6E9, 0x1F6EB..0x1F6EC, 0x1F6F0..0x1F6F0, 0x1F6F3..0x1F6F8, 0x1F910..0x1F93A, 0x1F93C..0x1F93E, 0x1F940..0x1F945, 0x1F947..0x1F94C, 0x1F950..0x1F96B, 0x1F980..0x1F997, 0x1F9C0..0x1F9C0, 0x1F9D0..0x1F9E6 ).freeze end def respond_to_missing?(method_name, include_private = false) (base = method_name[/^non_(.*)/, 1]) && respond_to?(base) || super end def method_missing(method_name, *args, &block) if (base = method_name[/^non_(.*)/, 1]) ivar_name = "@#{method_name}" return instance_variable_get(ivar_name) || instance_variable_set(ivar_name, send(base).inversion.freeze) end super end end end character-set-1.1.2/lib/character_set/pure.rb0000644000175100017510000000057513373461276020166 0ustar pravipravirequire 'character_set' require 'character_set/ruby_fallback' # CharacterSet::Pure uses only Ruby implementations. # It is equal to CharacterSet if the C ext can't be loaded. 
class CharacterSet class Pure prepend CharacterSet::RubyFallback prepend CharacterSet::SetMethodAdapters include CharacterSet::SharedMethods extend CharacterSet::PredefinedSets end end character-set-1.1.2/lib/character_set/version.rb0000644000175100017510000000005313373461276020667 0ustar pravipraviclass CharacterSet VERSION = '1.1.2' end character-set-1.1.2/LICENSE.txt0000644000175100017510000000207313373461276015127 0ustar pravipraviThe MIT License (MIT) Copyright (c) 2018 Janosch Müller Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
character-set-1.1.2/Gemfile0000644000175100017510000000025013373461276014572 0ustar pravipravisource "https://rubygems.org" git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } # Specify your gem's dependencies in character_set.gemspec gemspec character-set-1.1.2/benchmarks/0000755000175100017510000000000013373461276015417 5ustar pravipravicharacter-set-1.1.2/benchmarks/delete_in.rb0000644000175100017510000000116513373461276017677 0ustar pravipravirequire_relative './shared' str = 'Lorem ipsum et dolorem' rx = /\s/ cs = CharacterSet.whitespace benchmark( caption: 'Removing whitespace', cases: { 'String#gsub' => -> { str.gsub(rx, '') }, 'CharacterSet#delete_in' => -> { cs.delete_in(str) }, } ) str = 'Lörem ipsüm ⛷ et dölörem' rx = /[\s\p{emoji}äüö]/ cs = CharacterSet.whitespace + CharacterSet.emoji + CS['ä', 'ü', 'ö'] benchmark( caption: 'Removing whitespace, emoji and umlauts', cases: { 'String#gsub' => -> { str.gsub(rx, '') }, 'CharacterSet#delete_in' => -> { cs.delete_in(str) }, } ) character-set-1.1.2/benchmarks/keep_in.rb0000644000175100017510000000101213373461276017350 0ustar pravipravirequire_relative './shared' str = 'Lorem ipsum et dolorem' rx = /\S/ cs = CharacterSet.whitespace benchmark( caption: 'Removing non-whitespace', cases: { 'String#gsub' => -> { str.gsub(rx, '') }, 'CharacterSet#keep_in' => -> { cs.keep_in(str) }, } ) str = 'Lorem ipsum ⛷ et dolorem' rx = /\p{^emoji}/ cs = CharacterSet.emoji benchmark( caption: 'Extracting emoji', cases: { 'String#gsub' => -> { str.gsub(rx, '') }, 'CharacterSet#keep_in' => -> { cs.keep_in(str) }, } ) character-set-1.1.2/benchmarks/cover.rb0000644000175100017510000000105613373461276017064 0ustar pravipravirequire_relative './shared' str = 'Lorem ipsum et dolorem' rx = /\S/ cs = CharacterSet.whitespace.inversion benchmark( caption: 'Detecting non-whitespace', cases: { 'Regexp#match?' => -> { rx.match?(str) }, 'CharacterSet#cover?' 
=> -> { cs.cover?(str) }, } ) str = 'Lorem ipsum et dolorem' rx = /[^a-z]/i cs = CharacterSet.new('A'..'Z') + CharacterSet.new('a'..'z') benchmark( caption: 'Detecting non-letters', cases: { 'Regexp#match?' => -> { rx.match?(str) }, 'CharacterSet#cover?' => -> { cs.cover?(str) }, } ) character-set-1.1.2/benchmarks/shared.rb0000644000175100017510000000110413373461276017206 0ustar pravipravilib = File.expand_path('../lib', __dir__) $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) require 'benchmark/ips' require 'character_set' def benchmark(caption: nil, cases: {}) puts caption report = Benchmark.ips do |x| cases.each do |label, callable| x.report(label, &callable) end x.compare! end return unless $store_comparison_results old_stdout = $stdout.clone captured_stdout = StringIO.new $stdout = captured_stdout report.run_comparison $store_comparison_results[caption] = captured_stdout.string $stdout = old_stdout end character-set-1.1.2/benchmarks/used_by.rb0000644000175100017510000000110213373461276017370 0ustar pravipravirequire_relative './shared' str = 'Lorem ipsum et dolorem' rx = /\s/ cs = CharacterSet.whitespace benchmark( caption: 'Detecting whitespace', cases: { 'Regexp#match?' => -> { rx.match?(str) }, 'CharacterSet#used_by?' => -> { cs.used_by?(str) }, } ) str = 'Lorem ipsum et dolorem' * 20 + '⛷' + 'Lorem ipsum et dolorem' * 20 rx = /\p{emoji}/ cs = CharacterSet.emoji benchmark( caption: 'Detecting emoji in a large string', cases: { 'Regexp#match?' => -> { rx.match?(str) }, 'CharacterSet#used_by?' => -> { cs.used_by?(str) }, } ) character-set-1.1.2/CHANGELOG.md0000644000175100017510000000135313373461276015115 0ustar pravipravi# Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
## [1.1.2] - 2018-09-25 ### Fixed - restored `range_compressor` as a runtime dependency for JRuby only ## [1.1.1] - 2018-09-24 ### Fixed - improved messages for missing optional dependencies - made `range_compressor` an optional dependency as it is almost never needed ## [1.1.0] - 2018-09-21 ### Added - added option to reference a predefined set via Symbol in `String` extension methods - added predefined sets `::ascii_alnum` and `::ascii_letters` ## [1.0.0] - 2018-09-02 Initial release. character-set-1.1.2/character_set.gemspec0000644000175100017510000000224113373461276017455 0ustar pravipravilib = File.expand_path('../lib', __FILE__) $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) require 'character_set/version' Gem::Specification.new do |s| s.name = 'character_set' s.version = CharacterSet::VERSION s.authors = ['Janosch Müller'] s.email = ['janosch84@gmail.com'] s.summary = 'Build, read, write and compare sets of Unicode codepoints.' s.homepage = 'https://github.com/janosch-x/character_set' s.license = 'MIT' s.files = `git ls-files -z`.split("\x0").reject do |f| f.match(%r{^(test|spec|features)/}) end s.require_paths = ['lib'] s.extensions = %w[ext/character_set/extconf.rb] s.required_ruby_version = '>= 2.1.0' s.add_development_dependency 'benchmark-ips', '~> 2.7' s.add_development_dependency 'bundler', '~> 1.16' s.add_development_dependency 'rake', '~> 12.0' s.add_development_dependency 'rake-compiler', '~> 1.0' s.add_development_dependency 'range_compressor', '~> 1.0' s.add_development_dependency 'regexp_parser', '~> 1.1' s.add_development_dependency 'regexp_property_values', '~> 0.3.4' s.add_development_dependency 'rspec', '~> 3.8' end character-set-1.1.2/.rspec0000644000175100017510000000006513373461276014420 0ustar pravipravi--format documentation --color --require spec_helper character-set-1.1.2/ext/0000755000175100017510000000000013373461276014102 5ustar 
pravipravicharacter-set-1.1.2/ext/character_set/0000755000175100017510000000000013373461276016711 5ustar pravipravicharacter-set-1.1.2/ext/character_set/unicode_casefold_table.h0000644000175100017510000005706313373461276023532 0ustar pravipravi// THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT' typedef struct casefold_mapping { unsigned long from; unsigned long to; } casefold_mapping; #define CASEFOLD_COUNT 1376 static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = { {0x0041,0x0061}, {0x0042,0x0062}, {0x0043,0x0063}, {0x0044,0x0064}, {0x0045,0x0065}, {0x0046,0x0066}, {0x0047,0x0067}, {0x0048,0x0068}, {0x0049,0x0069}, {0x004A,0x006A}, {0x004B,0x006B}, {0x004C,0x006C}, {0x004D,0x006D}, {0x004E,0x006E}, {0x004F,0x006F}, {0x0050,0x0070}, {0x0051,0x0071}, {0x0052,0x0072}, {0x0053,0x0073}, {0x0054,0x0074}, {0x0055,0x0075}, {0x0056,0x0076}, {0x0057,0x0077}, {0x0058,0x0078}, {0x0059,0x0079}, {0x005A,0x007A}, {0x00B5,0x03BC}, {0x00C0,0x00E0}, {0x00C1,0x00E1}, {0x00C2,0x00E2}, {0x00C3,0x00E3}, {0x00C4,0x00E4}, {0x00C5,0x00E5}, {0x00C6,0x00E6}, {0x00C7,0x00E7}, {0x00C8,0x00E8}, {0x00C9,0x00E9}, {0x00CA,0x00EA}, {0x00CB,0x00EB}, {0x00CC,0x00EC}, {0x00CD,0x00ED}, {0x00CE,0x00EE}, {0x00CF,0x00EF}, {0x00D0,0x00F0}, {0x00D1,0x00F1}, {0x00D2,0x00F2}, {0x00D3,0x00F3}, {0x00D4,0x00F4}, {0x00D5,0x00F5}, {0x00D6,0x00F6}, {0x00D8,0x00F8}, {0x00D9,0x00F9}, {0x00DA,0x00FA}, {0x00DB,0x00FB}, {0x00DC,0x00FC}, {0x00DD,0x00FD}, {0x00DE,0x00FE}, {0x0100,0x0101}, {0x0102,0x0103}, {0x0104,0x0105}, {0x0106,0x0107}, {0x0108,0x0109}, {0x010A,0x010B}, {0x010C,0x010D}, {0x010E,0x010F}, {0x0110,0x0111}, {0x0112,0x0113}, {0x0114,0x0115}, {0x0116,0x0117}, {0x0118,0x0119}, {0x011A,0x011B}, {0x011C,0x011D}, {0x011E,0x011F}, {0x0120,0x0121}, {0x0122,0x0123}, {0x0124,0x0125}, {0x0126,0x0127}, {0x0128,0x0129}, {0x012A,0x012B}, {0x012C,0x012D}, {0x012E,0x012F}, {0x0132,0x0133}, {0x0134,0x0135}, {0x0136,0x0137}, {0x0139,0x013A}, {0x013B,0x013C}, {0x013D,0x013E}, 
{0x013F,0x0140}, {0x0141,0x0142}, {0x0143,0x0144}, {0x0145,0x0146}, {0x0147,0x0148}, {0x014A,0x014B}, {0x014C,0x014D}, {0x014E,0x014F}, {0x0150,0x0151}, {0x0152,0x0153}, {0x0154,0x0155}, {0x0156,0x0157}, {0x0158,0x0159}, {0x015A,0x015B}, {0x015C,0x015D}, {0x015E,0x015F}, {0x0160,0x0161}, {0x0162,0x0163}, {0x0164,0x0165}, {0x0166,0x0167}, {0x0168,0x0169}, {0x016A,0x016B}, {0x016C,0x016D}, {0x016E,0x016F}, {0x0170,0x0171}, {0x0172,0x0173}, {0x0174,0x0175}, {0x0176,0x0177}, {0x0178,0x00FF}, {0x0179,0x017A}, {0x017B,0x017C}, {0x017D,0x017E}, {0x017F,0x0073}, {0x0181,0x0253}, {0x0182,0x0183}, {0x0184,0x0185}, {0x0186,0x0254}, {0x0187,0x0188}, {0x0189,0x0256}, {0x018A,0x0257}, {0x018B,0x018C}, {0x018E,0x01DD}, {0x018F,0x0259}, {0x0190,0x025B}, {0x0191,0x0192}, {0x0193,0x0260}, {0x0194,0x0263}, {0x0196,0x0269}, {0x0197,0x0268}, {0x0198,0x0199}, {0x019C,0x026F}, {0x019D,0x0272}, {0x019F,0x0275}, {0x01A0,0x01A1}, {0x01A2,0x01A3}, {0x01A4,0x01A5}, {0x01A6,0x0280}, {0x01A7,0x01A8}, {0x01A9,0x0283}, {0x01AC,0x01AD}, {0x01AE,0x0288}, {0x01AF,0x01B0}, {0x01B1,0x028A}, {0x01B2,0x028B}, {0x01B3,0x01B4}, {0x01B5,0x01B6}, {0x01B7,0x0292}, {0x01B8,0x01B9}, {0x01BC,0x01BD}, {0x01C4,0x01C6}, {0x01C5,0x01C6}, {0x01C7,0x01C9}, {0x01C8,0x01C9}, {0x01CA,0x01CC}, {0x01CB,0x01CC}, {0x01CD,0x01CE}, {0x01CF,0x01D0}, {0x01D1,0x01D2}, {0x01D3,0x01D4}, {0x01D5,0x01D6}, {0x01D7,0x01D8}, {0x01D9,0x01DA}, {0x01DB,0x01DC}, {0x01DE,0x01DF}, {0x01E0,0x01E1}, {0x01E2,0x01E3}, {0x01E4,0x01E5}, {0x01E6,0x01E7}, {0x01E8,0x01E9}, {0x01EA,0x01EB}, {0x01EC,0x01ED}, {0x01EE,0x01EF}, {0x01F1,0x01F3}, {0x01F2,0x01F3}, {0x01F4,0x01F5}, {0x01F6,0x0195}, {0x01F7,0x01BF}, {0x01F8,0x01F9}, {0x01FA,0x01FB}, {0x01FC,0x01FD}, {0x01FE,0x01FF}, {0x0200,0x0201}, {0x0202,0x0203}, {0x0204,0x0205}, {0x0206,0x0207}, {0x0208,0x0209}, {0x020A,0x020B}, {0x020C,0x020D}, {0x020E,0x020F}, {0x0210,0x0211}, {0x0212,0x0213}, {0x0214,0x0215}, {0x0216,0x0217}, {0x0218,0x0219}, {0x021A,0x021B}, {0x021C,0x021D}, {0x021E,0x021F}, 
{0x0220,0x019E}, {0x0222,0x0223}, {0x0224,0x0225}, {0x0226,0x0227}, {0x0228,0x0229}, {0x022A,0x022B}, {0x022C,0x022D}, {0x022E,0x022F}, {0x0230,0x0231}, {0x0232,0x0233}, {0x023A,0x2C65}, {0x023B,0x023C}, {0x023D,0x019A}, {0x023E,0x2C66}, {0x0241,0x0242}, {0x0243,0x0180}, {0x0244,0x0289}, {0x0245,0x028C}, {0x0246,0x0247}, {0x0248,0x0249}, {0x024A,0x024B}, {0x024C,0x024D}, {0x024E,0x024F}, {0x0345,0x03B9}, {0x0370,0x0371}, {0x0372,0x0373}, {0x0376,0x0377}, {0x037F,0x03F3}, {0x0386,0x03AC}, {0x0388,0x03AD}, {0x0389,0x03AE}, {0x038A,0x03AF}, {0x038C,0x03CC}, {0x038E,0x03CD}, {0x038F,0x03CE}, {0x0391,0x03B1}, {0x0392,0x03B2}, {0x0393,0x03B3}, {0x0394,0x03B4}, {0x0395,0x03B5}, {0x0396,0x03B6}, {0x0397,0x03B7}, {0x0398,0x03B8}, {0x0399,0x03B9}, {0x039A,0x03BA}, {0x039B,0x03BB}, {0x039C,0x03BC}, {0x039D,0x03BD}, {0x039E,0x03BE}, {0x039F,0x03BF}, {0x03A0,0x03C0}, {0x03A1,0x03C1}, {0x03A3,0x03C3}, {0x03A4,0x03C4}, {0x03A5,0x03C5}, {0x03A6,0x03C6}, {0x03A7,0x03C7}, {0x03A8,0x03C8}, {0x03A9,0x03C9}, {0x03AA,0x03CA}, {0x03AB,0x03CB}, {0x03C2,0x03C3}, {0x03CF,0x03D7}, {0x03D0,0x03B2}, {0x03D1,0x03B8}, {0x03D5,0x03C6}, {0x03D6,0x03C0}, {0x03D8,0x03D9}, {0x03DA,0x03DB}, {0x03DC,0x03DD}, {0x03DE,0x03DF}, {0x03E0,0x03E1}, {0x03E2,0x03E3}, {0x03E4,0x03E5}, {0x03E6,0x03E7}, {0x03E8,0x03E9}, {0x03EA,0x03EB}, {0x03EC,0x03ED}, {0x03EE,0x03EF}, {0x03F0,0x03BA}, {0x03F1,0x03C1}, {0x03F4,0x03B8}, {0x03F5,0x03B5}, {0x03F7,0x03F8}, {0x03F9,0x03F2}, {0x03FA,0x03FB}, {0x03FD,0x037B}, {0x03FE,0x037C}, {0x03FF,0x037D}, {0x0400,0x0450}, {0x0401,0x0451}, {0x0402,0x0452}, {0x0403,0x0453}, {0x0404,0x0454}, {0x0405,0x0455}, {0x0406,0x0456}, {0x0407,0x0457}, {0x0408,0x0458}, {0x0409,0x0459}, {0x040A,0x045A}, {0x040B,0x045B}, {0x040C,0x045C}, {0x040D,0x045D}, {0x040E,0x045E}, {0x040F,0x045F}, {0x0410,0x0430}, {0x0411,0x0431}, {0x0412,0x0432}, {0x0413,0x0433}, {0x0414,0x0434}, {0x0415,0x0435}, {0x0416,0x0436}, {0x0417,0x0437}, {0x0418,0x0438}, {0x0419,0x0439}, {0x041A,0x043A}, {0x041B,0x043B}, 
{0x041C,0x043C}, {0x041D,0x043D}, {0x041E,0x043E}, {0x041F,0x043F}, {0x0420,0x0440}, {0x0421,0x0441}, {0x0422,0x0442}, {0x0423,0x0443}, {0x0424,0x0444}, {0x0425,0x0445}, {0x0426,0x0446}, {0x0427,0x0447}, {0x0428,0x0448}, {0x0429,0x0449}, {0x042A,0x044A}, {0x042B,0x044B}, {0x042C,0x044C}, {0x042D,0x044D}, {0x042E,0x044E}, {0x042F,0x044F}, {0x0460,0x0461}, {0x0462,0x0463}, {0x0464,0x0465}, {0x0466,0x0467}, {0x0468,0x0469}, {0x046A,0x046B}, {0x046C,0x046D}, {0x046E,0x046F}, {0x0470,0x0471}, {0x0472,0x0473}, {0x0474,0x0475}, {0x0476,0x0477}, {0x0478,0x0479}, {0x047A,0x047B}, {0x047C,0x047D}, {0x047E,0x047F}, {0x0480,0x0481}, {0x048A,0x048B}, {0x048C,0x048D}, {0x048E,0x048F}, {0x0490,0x0491}, {0x0492,0x0493}, {0x0494,0x0495}, {0x0496,0x0497}, {0x0498,0x0499}, {0x049A,0x049B}, {0x049C,0x049D}, {0x049E,0x049F}, {0x04A0,0x04A1}, {0x04A2,0x04A3}, {0x04A4,0x04A5}, {0x04A6,0x04A7}, {0x04A8,0x04A9}, {0x04AA,0x04AB}, {0x04AC,0x04AD}, {0x04AE,0x04AF}, {0x04B0,0x04B1}, {0x04B2,0x04B3}, {0x04B4,0x04B5}, {0x04B6,0x04B7}, {0x04B8,0x04B9}, {0x04BA,0x04BB}, {0x04BC,0x04BD}, {0x04BE,0x04BF}, {0x04C0,0x04CF}, {0x04C1,0x04C2}, {0x04C3,0x04C4}, {0x04C5,0x04C6}, {0x04C7,0x04C8}, {0x04C9,0x04CA}, {0x04CB,0x04CC}, {0x04CD,0x04CE}, {0x04D0,0x04D1}, {0x04D2,0x04D3}, {0x04D4,0x04D5}, {0x04D6,0x04D7}, {0x04D8,0x04D9}, {0x04DA,0x04DB}, {0x04DC,0x04DD}, {0x04DE,0x04DF}, {0x04E0,0x04E1}, {0x04E2,0x04E3}, {0x04E4,0x04E5}, {0x04E6,0x04E7}, {0x04E8,0x04E9}, {0x04EA,0x04EB}, {0x04EC,0x04ED}, {0x04EE,0x04EF}, {0x04F0,0x04F1}, {0x04F2,0x04F3}, {0x04F4,0x04F5}, {0x04F6,0x04F7}, {0x04F8,0x04F9}, {0x04FA,0x04FB}, {0x04FC,0x04FD}, {0x04FE,0x04FF}, {0x0500,0x0501}, {0x0502,0x0503}, {0x0504,0x0505}, {0x0506,0x0507}, {0x0508,0x0509}, {0x050A,0x050B}, {0x050C,0x050D}, {0x050E,0x050F}, {0x0510,0x0511}, {0x0512,0x0513}, {0x0514,0x0515}, {0x0516,0x0517}, {0x0518,0x0519}, {0x051A,0x051B}, {0x051C,0x051D}, {0x051E,0x051F}, {0x0520,0x0521}, {0x0522,0x0523}, {0x0524,0x0525}, {0x0526,0x0527}, {0x0528,0x0529}, 
{0x052A,0x052B}, {0x052C,0x052D}, {0x052E,0x052F}, {0x0531,0x0561}, {0x0532,0x0562}, {0x0533,0x0563}, {0x0534,0x0564}, {0x0535,0x0565}, {0x0536,0x0566}, {0x0537,0x0567}, {0x0538,0x0568}, {0x0539,0x0569}, {0x053A,0x056A}, {0x053B,0x056B}, {0x053C,0x056C}, {0x053D,0x056D}, {0x053E,0x056E}, {0x053F,0x056F}, {0x0540,0x0570}, {0x0541,0x0571}, {0x0542,0x0572}, {0x0543,0x0573}, {0x0544,0x0574}, {0x0545,0x0575}, {0x0546,0x0576}, {0x0547,0x0577}, {0x0548,0x0578}, {0x0549,0x0579}, {0x054A,0x057A}, {0x054B,0x057B}, {0x054C,0x057C}, {0x054D,0x057D}, {0x054E,0x057E}, {0x054F,0x057F}, {0x0550,0x0580}, {0x0551,0x0581}, {0x0552,0x0582}, {0x0553,0x0583}, {0x0554,0x0584}, {0x0555,0x0585}, {0x0556,0x0586}, {0x10400,0x10428}, {0x10401,0x10429}, {0x10402,0x1042A}, {0x10403,0x1042B}, {0x10404,0x1042C}, {0x10405,0x1042D}, {0x10406,0x1042E}, {0x10407,0x1042F}, {0x10408,0x10430}, {0x10409,0x10431}, {0x1040A,0x10432}, {0x1040B,0x10433}, {0x1040C,0x10434}, {0x1040D,0x10435}, {0x1040E,0x10436}, {0x1040F,0x10437}, {0x10410,0x10438}, {0x10411,0x10439}, {0x10412,0x1043A}, {0x10413,0x1043B}, {0x10414,0x1043C}, {0x10415,0x1043D}, {0x10416,0x1043E}, {0x10417,0x1043F}, {0x10418,0x10440}, {0x10419,0x10441}, {0x1041A,0x10442}, {0x1041B,0x10443}, {0x1041C,0x10444}, {0x1041D,0x10445}, {0x1041E,0x10446}, {0x1041F,0x10447}, {0x10420,0x10448}, {0x10421,0x10449}, {0x10422,0x1044A}, {0x10423,0x1044B}, {0x10424,0x1044C}, {0x10425,0x1044D}, {0x10426,0x1044E}, {0x10427,0x1044F}, {0x104B0,0x104D8}, {0x104B1,0x104D9}, {0x104B2,0x104DA}, {0x104B3,0x104DB}, {0x104B4,0x104DC}, {0x104B5,0x104DD}, {0x104B6,0x104DE}, {0x104B7,0x104DF}, {0x104B8,0x104E0}, {0x104B9,0x104E1}, {0x104BA,0x104E2}, {0x104BB,0x104E3}, {0x104BC,0x104E4}, {0x104BD,0x104E5}, {0x104BE,0x104E6}, {0x104BF,0x104E7}, {0x104C0,0x104E8}, {0x104C1,0x104E9}, {0x104C2,0x104EA}, {0x104C3,0x104EB}, {0x104C4,0x104EC}, {0x104C5,0x104ED}, {0x104C6,0x104EE}, {0x104C7,0x104EF}, {0x104C8,0x104F0}, {0x104C9,0x104F1}, {0x104CA,0x104F2}, {0x104CB,0x104F3}, 
{0x104CC,0x104F4}, {0x104CD,0x104F5}, {0x104CE,0x104F6}, {0x104CF,0x104F7}, {0x104D0,0x104F8}, {0x104D1,0x104F9}, {0x104D2,0x104FA}, {0x104D3,0x104FB}, {0x10A0,0x2D00}, {0x10A1,0x2D01}, {0x10A2,0x2D02}, {0x10A3,0x2D03}, {0x10A4,0x2D04}, {0x10A5,0x2D05}, {0x10A6,0x2D06}, {0x10A7,0x2D07}, {0x10A8,0x2D08}, {0x10A9,0x2D09}, {0x10AA,0x2D0A}, {0x10AB,0x2D0B}, {0x10AC,0x2D0C}, {0x10AD,0x2D0D}, {0x10AE,0x2D0E}, {0x10AF,0x2D0F}, {0x10B0,0x2D10}, {0x10B1,0x2D11}, {0x10B2,0x2D12}, {0x10B3,0x2D13}, {0x10B4,0x2D14}, {0x10B5,0x2D15}, {0x10B6,0x2D16}, {0x10B7,0x2D17}, {0x10B8,0x2D18}, {0x10B9,0x2D19}, {0x10BA,0x2D1A}, {0x10BB,0x2D1B}, {0x10BC,0x2D1C}, {0x10BD,0x2D1D}, {0x10BE,0x2D1E}, {0x10BF,0x2D1F}, {0x10C0,0x2D20}, {0x10C1,0x2D21}, {0x10C2,0x2D22}, {0x10C3,0x2D23}, {0x10C4,0x2D24}, {0x10C5,0x2D25}, {0x10C7,0x2D27}, {0x10C80,0x10CC0}, {0x10C81,0x10CC1}, {0x10C82,0x10CC2}, {0x10C83,0x10CC3}, {0x10C84,0x10CC4}, {0x10C85,0x10CC5}, {0x10C86,0x10CC6}, {0x10C87,0x10CC7}, {0x10C88,0x10CC8}, {0x10C89,0x10CC9}, {0x10C8A,0x10CCA}, {0x10C8B,0x10CCB}, {0x10C8C,0x10CCC}, {0x10C8D,0x10CCD}, {0x10C8E,0x10CCE}, {0x10C8F,0x10CCF}, {0x10C90,0x10CD0}, {0x10C91,0x10CD1}, {0x10C92,0x10CD2}, {0x10C93,0x10CD3}, {0x10C94,0x10CD4}, {0x10C95,0x10CD5}, {0x10C96,0x10CD6}, {0x10C97,0x10CD7}, {0x10C98,0x10CD8}, {0x10C99,0x10CD9}, {0x10C9A,0x10CDA}, {0x10C9B,0x10CDB}, {0x10C9C,0x10CDC}, {0x10C9D,0x10CDD}, {0x10C9E,0x10CDE}, {0x10C9F,0x10CDF}, {0x10CA0,0x10CE0}, {0x10CA1,0x10CE1}, {0x10CA2,0x10CE2}, {0x10CA3,0x10CE3}, {0x10CA4,0x10CE4}, {0x10CA5,0x10CE5}, {0x10CA6,0x10CE6}, {0x10CA7,0x10CE7}, {0x10CA8,0x10CE8}, {0x10CA9,0x10CE9}, {0x10CAA,0x10CEA}, {0x10CAB,0x10CEB}, {0x10CAC,0x10CEC}, {0x10CAD,0x10CED}, {0x10CAE,0x10CEE}, {0x10CAF,0x10CEF}, {0x10CB0,0x10CF0}, {0x10CB1,0x10CF1}, {0x10CB2,0x10CF2}, {0x10CD,0x2D2D}, {0x118A0,0x118C0}, {0x118A1,0x118C1}, {0x118A2,0x118C2}, {0x118A3,0x118C3}, {0x118A4,0x118C4}, {0x118A5,0x118C5}, {0x118A6,0x118C6}, {0x118A7,0x118C7}, {0x118A8,0x118C8}, {0x118A9,0x118C9}, 
{0x118AA,0x118CA}, {0x118AB,0x118CB}, {0x118AC,0x118CC}, {0x118AD,0x118CD}, {0x118AE,0x118CE}, {0x118AF,0x118CF}, {0x118B0,0x118D0}, {0x118B1,0x118D1}, {0x118B2,0x118D2}, {0x118B3,0x118D3}, {0x118B4,0x118D4}, {0x118B5,0x118D5}, {0x118B6,0x118D6}, {0x118B7,0x118D7}, {0x118B8,0x118D8}, {0x118B9,0x118D9}, {0x118BA,0x118DA}, {0x118BB,0x118DB}, {0x118BC,0x118DC}, {0x118BD,0x118DD}, {0x118BE,0x118DE}, {0x118BF,0x118DF}, {0x13F8,0x13F0}, {0x13F9,0x13F1}, {0x13FA,0x13F2}, {0x13FB,0x13F3}, {0x13FC,0x13F4}, {0x13FD,0x13F5}, {0x16E40,0x16E60}, {0x16E41,0x16E61}, {0x16E42,0x16E62}, {0x16E43,0x16E63}, {0x16E44,0x16E64}, {0x16E45,0x16E65}, {0x16E46,0x16E66}, {0x16E47,0x16E67}, {0x16E48,0x16E68}, {0x16E49,0x16E69}, {0x16E4A,0x16E6A}, {0x16E4B,0x16E6B}, {0x16E4C,0x16E6C}, {0x16E4D,0x16E6D}, {0x16E4E,0x16E6E}, {0x16E4F,0x16E6F}, {0x16E50,0x16E70}, {0x16E51,0x16E71}, {0x16E52,0x16E72}, {0x16E53,0x16E73}, {0x16E54,0x16E74}, {0x16E55,0x16E75}, {0x16E56,0x16E76}, {0x16E57,0x16E77}, {0x16E58,0x16E78}, {0x16E59,0x16E79}, {0x16E5A,0x16E7A}, {0x16E5B,0x16E7B}, {0x16E5C,0x16E7C}, {0x16E5D,0x16E7D}, {0x16E5E,0x16E7E}, {0x16E5F,0x16E7F}, {0x1C80,0x0432}, {0x1C81,0x0434}, {0x1C82,0x043E}, {0x1C83,0x0441}, {0x1C84,0x0442}, {0x1C85,0x0442}, {0x1C86,0x044A}, {0x1C87,0x0463}, {0x1C88,0xA64B}, {0x1C90,0x10D0}, {0x1C91,0x10D1}, {0x1C92,0x10D2}, {0x1C93,0x10D3}, {0x1C94,0x10D4}, {0x1C95,0x10D5}, {0x1C96,0x10D6}, {0x1C97,0x10D7}, {0x1C98,0x10D8}, {0x1C99,0x10D9}, {0x1C9A,0x10DA}, {0x1C9B,0x10DB}, {0x1C9C,0x10DC}, {0x1C9D,0x10DD}, {0x1C9E,0x10DE}, {0x1C9F,0x10DF}, {0x1CA0,0x10E0}, {0x1CA1,0x10E1}, {0x1CA2,0x10E2}, {0x1CA3,0x10E3}, {0x1CA4,0x10E4}, {0x1CA5,0x10E5}, {0x1CA6,0x10E6}, {0x1CA7,0x10E7}, {0x1CA8,0x10E8}, {0x1CA9,0x10E9}, {0x1CAA,0x10EA}, {0x1CAB,0x10EB}, {0x1CAC,0x10EC}, {0x1CAD,0x10ED}, {0x1CAE,0x10EE}, {0x1CAF,0x10EF}, {0x1CB0,0x10F0}, {0x1CB1,0x10F1}, {0x1CB2,0x10F2}, {0x1CB3,0x10F3}, {0x1CB4,0x10F4}, {0x1CB5,0x10F5}, {0x1CB6,0x10F6}, {0x1CB7,0x10F7}, {0x1CB8,0x10F8}, {0x1CB9,0x10F9}, 
{0x1CBA,0x10FA}, {0x1CBD,0x10FD}, {0x1CBE,0x10FE}, {0x1CBF,0x10FF}, {0x1E00,0x1E01}, {0x1E02,0x1E03}, {0x1E04,0x1E05}, {0x1E06,0x1E07}, {0x1E08,0x1E09}, {0x1E0A,0x1E0B}, {0x1E0C,0x1E0D}, {0x1E0E,0x1E0F}, {0x1E10,0x1E11}, {0x1E12,0x1E13}, {0x1E14,0x1E15}, {0x1E16,0x1E17}, {0x1E18,0x1E19}, {0x1E1A,0x1E1B}, {0x1E1C,0x1E1D}, {0x1E1E,0x1E1F}, {0x1E20,0x1E21}, {0x1E22,0x1E23}, {0x1E24,0x1E25}, {0x1E26,0x1E27}, {0x1E28,0x1E29}, {0x1E2A,0x1E2B}, {0x1E2C,0x1E2D}, {0x1E2E,0x1E2F}, {0x1E30,0x1E31}, {0x1E32,0x1E33}, {0x1E34,0x1E35}, {0x1E36,0x1E37}, {0x1E38,0x1E39}, {0x1E3A,0x1E3B}, {0x1E3C,0x1E3D}, {0x1E3E,0x1E3F}, {0x1E40,0x1E41}, {0x1E42,0x1E43}, {0x1E44,0x1E45}, {0x1E46,0x1E47}, {0x1E48,0x1E49}, {0x1E4A,0x1E4B}, {0x1E4C,0x1E4D}, {0x1E4E,0x1E4F}, {0x1E50,0x1E51}, {0x1E52,0x1E53}, {0x1E54,0x1E55}, {0x1E56,0x1E57}, {0x1E58,0x1E59}, {0x1E5A,0x1E5B}, {0x1E5C,0x1E5D}, {0x1E5E,0x1E5F}, {0x1E60,0x1E61}, {0x1E62,0x1E63}, {0x1E64,0x1E65}, {0x1E66,0x1E67}, {0x1E68,0x1E69}, {0x1E6A,0x1E6B}, {0x1E6C,0x1E6D}, {0x1E6E,0x1E6F}, {0x1E70,0x1E71}, {0x1E72,0x1E73}, {0x1E74,0x1E75}, {0x1E76,0x1E77}, {0x1E78,0x1E79}, {0x1E7A,0x1E7B}, {0x1E7C,0x1E7D}, {0x1E7E,0x1E7F}, {0x1E80,0x1E81}, {0x1E82,0x1E83}, {0x1E84,0x1E85}, {0x1E86,0x1E87}, {0x1E88,0x1E89}, {0x1E8A,0x1E8B}, {0x1E8C,0x1E8D}, {0x1E8E,0x1E8F}, {0x1E90,0x1E91}, {0x1E900,0x1E922}, {0x1E901,0x1E923}, {0x1E902,0x1E924}, {0x1E903,0x1E925}, {0x1E904,0x1E926}, {0x1E905,0x1E927}, {0x1E906,0x1E928}, {0x1E907,0x1E929}, {0x1E908,0x1E92A}, {0x1E909,0x1E92B}, {0x1E90A,0x1E92C}, {0x1E90B,0x1E92D}, {0x1E90C,0x1E92E}, {0x1E90D,0x1E92F}, {0x1E90E,0x1E930}, {0x1E90F,0x1E931}, {0x1E910,0x1E932}, {0x1E911,0x1E933}, {0x1E912,0x1E934}, {0x1E913,0x1E935}, {0x1E914,0x1E936}, {0x1E915,0x1E937}, {0x1E916,0x1E938}, {0x1E917,0x1E939}, {0x1E918,0x1E93A}, {0x1E919,0x1E93B}, {0x1E91A,0x1E93C}, {0x1E91B,0x1E93D}, {0x1E91C,0x1E93E}, {0x1E91D,0x1E93F}, {0x1E91E,0x1E940}, {0x1E91F,0x1E941}, {0x1E92,0x1E93}, {0x1E920,0x1E942}, {0x1E921,0x1E943}, {0x1E94,0x1E95}, 
{0x1E9B,0x1E61}, {0x1EA0,0x1EA1}, {0x1EA2,0x1EA3}, {0x1EA4,0x1EA5}, {0x1EA6,0x1EA7}, {0x1EA8,0x1EA9}, {0x1EAA,0x1EAB}, {0x1EAC,0x1EAD}, {0x1EAE,0x1EAF}, {0x1EB0,0x1EB1}, {0x1EB2,0x1EB3}, {0x1EB4,0x1EB5}, {0x1EB6,0x1EB7}, {0x1EB8,0x1EB9}, {0x1EBA,0x1EBB}, {0x1EBC,0x1EBD}, {0x1EBE,0x1EBF}, {0x1EC0,0x1EC1}, {0x1EC2,0x1EC3}, {0x1EC4,0x1EC5}, {0x1EC6,0x1EC7}, {0x1EC8,0x1EC9}, {0x1ECA,0x1ECB}, {0x1ECC,0x1ECD}, {0x1ECE,0x1ECF}, {0x1ED0,0x1ED1}, {0x1ED2,0x1ED3}, {0x1ED4,0x1ED5}, {0x1ED6,0x1ED7}, {0x1ED8,0x1ED9}, {0x1EDA,0x1EDB}, {0x1EDC,0x1EDD}, {0x1EDE,0x1EDF}, {0x1EE0,0x1EE1}, {0x1EE2,0x1EE3}, {0x1EE4,0x1EE5}, {0x1EE6,0x1EE7}, {0x1EE8,0x1EE9}, {0x1EEA,0x1EEB}, {0x1EEC,0x1EED}, {0x1EEE,0x1EEF}, {0x1EF0,0x1EF1}, {0x1EF2,0x1EF3}, {0x1EF4,0x1EF5}, {0x1EF6,0x1EF7}, {0x1EF8,0x1EF9}, {0x1EFA,0x1EFB}, {0x1EFC,0x1EFD}, {0x1EFE,0x1EFF}, {0x1F08,0x1F00}, {0x1F09,0x1F01}, {0x1F0A,0x1F02}, {0x1F0B,0x1F03}, {0x1F0C,0x1F04}, {0x1F0D,0x1F05}, {0x1F0E,0x1F06}, {0x1F0F,0x1F07}, {0x1F18,0x1F10}, {0x1F19,0x1F11}, {0x1F1A,0x1F12}, {0x1F1B,0x1F13}, {0x1F1C,0x1F14}, {0x1F1D,0x1F15}, {0x1F28,0x1F20}, {0x1F29,0x1F21}, {0x1F2A,0x1F22}, {0x1F2B,0x1F23}, {0x1F2C,0x1F24}, {0x1F2D,0x1F25}, {0x1F2E,0x1F26}, {0x1F2F,0x1F27}, {0x1F38,0x1F30}, {0x1F39,0x1F31}, {0x1F3A,0x1F32}, {0x1F3B,0x1F33}, {0x1F3C,0x1F34}, {0x1F3D,0x1F35}, {0x1F3E,0x1F36}, {0x1F3F,0x1F37}, {0x1F48,0x1F40}, {0x1F49,0x1F41}, {0x1F4A,0x1F42}, {0x1F4B,0x1F43}, {0x1F4C,0x1F44}, {0x1F4D,0x1F45}, {0x1F59,0x1F51}, {0x1F5B,0x1F53}, {0x1F5D,0x1F55}, {0x1F5F,0x1F57}, {0x1F68,0x1F60}, {0x1F69,0x1F61}, {0x1F6A,0x1F62}, {0x1F6B,0x1F63}, {0x1F6C,0x1F64}, {0x1F6D,0x1F65}, {0x1F6E,0x1F66}, {0x1F6F,0x1F67}, {0x1FB8,0x1FB0}, {0x1FB9,0x1FB1}, {0x1FBA,0x1F70}, {0x1FBB,0x1F71}, {0x1FBE,0x03B9}, {0x1FC8,0x1F72}, {0x1FC9,0x1F73}, {0x1FCA,0x1F74}, {0x1FCB,0x1F75}, {0x1FD8,0x1FD0}, {0x1FD9,0x1FD1}, {0x1FDA,0x1F76}, {0x1FDB,0x1F77}, {0x1FE8,0x1FE0}, {0x1FE9,0x1FE1}, {0x1FEA,0x1F7A}, {0x1FEB,0x1F7B}, {0x1FEC,0x1FE5}, {0x1FF8,0x1F78}, {0x1FF9,0x1F79}, 
{0x1FFA,0x1F7C}, {0x1FFB,0x1F7D}, {0x2126,0x03C9}, {0x212A,0x006B}, {0x212B,0x00E5}, {0x2132,0x214E}, {0x2160,0x2170}, {0x2161,0x2171}, {0x2162,0x2172}, {0x2163,0x2173}, {0x2164,0x2174}, {0x2165,0x2175}, {0x2166,0x2176}, {0x2167,0x2177}, {0x2168,0x2178}, {0x2169,0x2179}, {0x216A,0x217A}, {0x216B,0x217B}, {0x216C,0x217C}, {0x216D,0x217D}, {0x216E,0x217E}, {0x216F,0x217F}, {0x2183,0x2184}, {0x24B6,0x24D0}, {0x24B7,0x24D1}, {0x24B8,0x24D2}, {0x24B9,0x24D3}, {0x24BA,0x24D4}, {0x24BB,0x24D5}, {0x24BC,0x24D6}, {0x24BD,0x24D7}, {0x24BE,0x24D8}, {0x24BF,0x24D9}, {0x24C0,0x24DA}, {0x24C1,0x24DB}, {0x24C2,0x24DC}, {0x24C3,0x24DD}, {0x24C4,0x24DE}, {0x24C5,0x24DF}, {0x24C6,0x24E0}, {0x24C7,0x24E1}, {0x24C8,0x24E2}, {0x24C9,0x24E3}, {0x24CA,0x24E4}, {0x24CB,0x24E5}, {0x24CC,0x24E6}, {0x24CD,0x24E7}, {0x24CE,0x24E8}, {0x24CF,0x24E9}, {0x2C00,0x2C30}, {0x2C01,0x2C31}, {0x2C02,0x2C32}, {0x2C03,0x2C33}, {0x2C04,0x2C34}, {0x2C05,0x2C35}, {0x2C06,0x2C36}, {0x2C07,0x2C37}, {0x2C08,0x2C38}, {0x2C09,0x2C39}, {0x2C0A,0x2C3A}, {0x2C0B,0x2C3B}, {0x2C0C,0x2C3C}, {0x2C0D,0x2C3D}, {0x2C0E,0x2C3E}, {0x2C0F,0x2C3F}, {0x2C10,0x2C40}, {0x2C11,0x2C41}, {0x2C12,0x2C42}, {0x2C13,0x2C43}, {0x2C14,0x2C44}, {0x2C15,0x2C45}, {0x2C16,0x2C46}, {0x2C17,0x2C47}, {0x2C18,0x2C48}, {0x2C19,0x2C49}, {0x2C1A,0x2C4A}, {0x2C1B,0x2C4B}, {0x2C1C,0x2C4C}, {0x2C1D,0x2C4D}, {0x2C1E,0x2C4E}, {0x2C1F,0x2C4F}, {0x2C20,0x2C50}, {0x2C21,0x2C51}, {0x2C22,0x2C52}, {0x2C23,0x2C53}, {0x2C24,0x2C54}, {0x2C25,0x2C55}, {0x2C26,0x2C56}, {0x2C27,0x2C57}, {0x2C28,0x2C58}, {0x2C29,0x2C59}, {0x2C2A,0x2C5A}, {0x2C2B,0x2C5B}, {0x2C2C,0x2C5C}, {0x2C2D,0x2C5D}, {0x2C2E,0x2C5E}, {0x2C60,0x2C61}, {0x2C62,0x026B}, {0x2C63,0x1D7D}, {0x2C64,0x027D}, {0x2C67,0x2C68}, {0x2C69,0x2C6A}, {0x2C6B,0x2C6C}, {0x2C6D,0x0251}, {0x2C6E,0x0271}, {0x2C6F,0x0250}, {0x2C70,0x0252}, {0x2C72,0x2C73}, {0x2C75,0x2C76}, {0x2C7E,0x023F}, {0x2C7F,0x0240}, {0x2C80,0x2C81}, {0x2C82,0x2C83}, {0x2C84,0x2C85}, {0x2C86,0x2C87}, {0x2C88,0x2C89}, {0x2C8A,0x2C8B}, 
{0x2C8C,0x2C8D}, {0x2C8E,0x2C8F}, {0x2C90,0x2C91}, {0x2C92,0x2C93}, {0x2C94,0x2C95}, {0x2C96,0x2C97}, {0x2C98,0x2C99}, {0x2C9A,0x2C9B}, {0x2C9C,0x2C9D}, {0x2C9E,0x2C9F}, {0x2CA0,0x2CA1}, {0x2CA2,0x2CA3}, {0x2CA4,0x2CA5}, {0x2CA6,0x2CA7}, {0x2CA8,0x2CA9}, {0x2CAA,0x2CAB}, {0x2CAC,0x2CAD}, {0x2CAE,0x2CAF}, {0x2CB0,0x2CB1}, {0x2CB2,0x2CB3}, {0x2CB4,0x2CB5}, {0x2CB6,0x2CB7}, {0x2CB8,0x2CB9}, {0x2CBA,0x2CBB}, {0x2CBC,0x2CBD}, {0x2CBE,0x2CBF}, {0x2CC0,0x2CC1}, {0x2CC2,0x2CC3}, {0x2CC4,0x2CC5}, {0x2CC6,0x2CC7}, {0x2CC8,0x2CC9}, {0x2CCA,0x2CCB}, {0x2CCC,0x2CCD}, {0x2CCE,0x2CCF}, {0x2CD0,0x2CD1}, {0x2CD2,0x2CD3}, {0x2CD4,0x2CD5}, {0x2CD6,0x2CD7}, {0x2CD8,0x2CD9}, {0x2CDA,0x2CDB}, {0x2CDC,0x2CDD}, {0x2CDE,0x2CDF}, {0x2CE0,0x2CE1}, {0x2CE2,0x2CE3}, {0x2CEB,0x2CEC}, {0x2CED,0x2CEE}, {0x2CF2,0x2CF3}, {0xA640,0xA641}, {0xA642,0xA643}, {0xA644,0xA645}, {0xA646,0xA647}, {0xA648,0xA649}, {0xA64A,0xA64B}, {0xA64C,0xA64D}, {0xA64E,0xA64F}, {0xA650,0xA651}, {0xA652,0xA653}, {0xA654,0xA655}, {0xA656,0xA657}, {0xA658,0xA659}, {0xA65A,0xA65B}, {0xA65C,0xA65D}, {0xA65E,0xA65F}, {0xA660,0xA661}, {0xA662,0xA663}, {0xA664,0xA665}, {0xA666,0xA667}, {0xA668,0xA669}, {0xA66A,0xA66B}, {0xA66C,0xA66D}, {0xA680,0xA681}, {0xA682,0xA683}, {0xA684,0xA685}, {0xA686,0xA687}, {0xA688,0xA689}, {0xA68A,0xA68B}, {0xA68C,0xA68D}, {0xA68E,0xA68F}, {0xA690,0xA691}, {0xA692,0xA693}, {0xA694,0xA695}, {0xA696,0xA697}, {0xA698,0xA699}, {0xA69A,0xA69B}, {0xA722,0xA723}, {0xA724,0xA725}, {0xA726,0xA727}, {0xA728,0xA729}, {0xA72A,0xA72B}, {0xA72C,0xA72D}, {0xA72E,0xA72F}, {0xA732,0xA733}, {0xA734,0xA735}, {0xA736,0xA737}, {0xA738,0xA739}, {0xA73A,0xA73B}, {0xA73C,0xA73D}, {0xA73E,0xA73F}, {0xA740,0xA741}, {0xA742,0xA743}, {0xA744,0xA745}, {0xA746,0xA747}, {0xA748,0xA749}, {0xA74A,0xA74B}, {0xA74C,0xA74D}, {0xA74E,0xA74F}, {0xA750,0xA751}, {0xA752,0xA753}, {0xA754,0xA755}, {0xA756,0xA757}, {0xA758,0xA759}, {0xA75A,0xA75B}, {0xA75C,0xA75D}, {0xA75E,0xA75F}, {0xA760,0xA761}, {0xA762,0xA763}, {0xA764,0xA765}, 
{0xA766,0xA767}, {0xA768,0xA769}, {0xA76A,0xA76B}, {0xA76C,0xA76D}, {0xA76E,0xA76F}, {0xA779,0xA77A}, {0xA77B,0xA77C}, {0xA77D,0x1D79}, {0xA77E,0xA77F}, {0xA780,0xA781}, {0xA782,0xA783}, {0xA784,0xA785}, {0xA786,0xA787}, {0xA78B,0xA78C}, {0xA78D,0x0265}, {0xA790,0xA791}, {0xA792,0xA793}, {0xA796,0xA797}, {0xA798,0xA799}, {0xA79A,0xA79B}, {0xA79C,0xA79D}, {0xA79E,0xA79F}, {0xA7A0,0xA7A1}, {0xA7A2,0xA7A3}, {0xA7A4,0xA7A5}, {0xA7A6,0xA7A7}, {0xA7A8,0xA7A9}, {0xA7AA,0x0266}, {0xA7AB,0x025C}, {0xA7AC,0x0261}, {0xA7AD,0x026C}, {0xA7AE,0x026A}, {0xA7B0,0x029E}, {0xA7B1,0x0287}, {0xA7B2,0x029D}, {0xA7B3,0xAB53}, {0xA7B4,0xA7B5}, {0xA7B6,0xA7B7}, {0xA7B8,0xA7B9}, {0xAB70,0x13A0}, {0xAB71,0x13A1}, {0xAB72,0x13A2}, {0xAB73,0x13A3}, {0xAB74,0x13A4}, {0xAB75,0x13A5}, {0xAB76,0x13A6}, {0xAB77,0x13A7}, {0xAB78,0x13A8}, {0xAB79,0x13A9}, {0xAB7A,0x13AA}, {0xAB7B,0x13AB}, {0xAB7C,0x13AC}, {0xAB7D,0x13AD}, {0xAB7E,0x13AE}, {0xAB7F,0x13AF}, {0xAB80,0x13B0}, {0xAB81,0x13B1}, {0xAB82,0x13B2}, {0xAB83,0x13B3}, {0xAB84,0x13B4}, {0xAB85,0x13B5}, {0xAB86,0x13B6}, {0xAB87,0x13B7}, {0xAB88,0x13B8}, {0xAB89,0x13B9}, {0xAB8A,0x13BA}, {0xAB8B,0x13BB}, {0xAB8C,0x13BC}, {0xAB8D,0x13BD}, {0xAB8E,0x13BE}, {0xAB8F,0x13BF}, {0xAB90,0x13C0}, {0xAB91,0x13C1}, {0xAB92,0x13C2}, {0xAB93,0x13C3}, {0xAB94,0x13C4}, {0xAB95,0x13C5}, {0xAB96,0x13C6}, {0xAB97,0x13C7}, {0xAB98,0x13C8}, {0xAB99,0x13C9}, {0xAB9A,0x13CA}, {0xAB9B,0x13CB}, {0xAB9C,0x13CC}, {0xAB9D,0x13CD}, {0xAB9E,0x13CE}, {0xAB9F,0x13CF}, {0xABA0,0x13D0}, {0xABA1,0x13D1}, {0xABA2,0x13D2}, {0xABA3,0x13D3}, {0xABA4,0x13D4}, {0xABA5,0x13D5}, {0xABA6,0x13D6}, {0xABA7,0x13D7}, {0xABA8,0x13D8}, {0xABA9,0x13D9}, {0xABAA,0x13DA}, {0xABAB,0x13DB}, {0xABAC,0x13DC}, {0xABAD,0x13DD}, {0xABAE,0x13DE}, {0xABAF,0x13DF}, {0xABB0,0x13E0}, {0xABB1,0x13E1}, {0xABB2,0x13E2}, {0xABB3,0x13E3}, {0xABB4,0x13E4}, {0xABB5,0x13E5}, {0xABB6,0x13E6}, {0xABB7,0x13E7}, {0xABB8,0x13E8}, {0xABB9,0x13E9}, {0xABBA,0x13EA}, {0xABBB,0x13EB}, {0xABBC,0x13EC}, {0xABBD,0x13ED}, 
{0xABBE,0x13EE}, {0xABBF,0x13EF}, {0xFF21,0xFF41}, {0xFF22,0xFF42}, {0xFF23,0xFF43}, {0xFF24,0xFF44}, {0xFF25,0xFF45}, {0xFF26,0xFF46}, {0xFF27,0xFF47}, {0xFF28,0xFF48}, {0xFF29,0xFF49}, {0xFF2A,0xFF4A}, {0xFF2B,0xFF4B}, {0xFF2C,0xFF4C}, {0xFF2D,0xFF4D}, {0xFF2E,0xFF4E}, {0xFF2F,0xFF4F}, {0xFF30,0xFF50}, {0xFF31,0xFF51}, {0xFF32,0xFF52}, {0xFF33,0xFF53}, {0xFF34,0xFF54}, {0xFF35,0xFF55}, {0xFF36,0xFF56}, {0xFF37,0xFF57}, {0xFF38,0xFF58}, {0xFF39,0xFF59}, {0xFF3A,0xFF5A}, }; character-set-1.1.2/ext/character_set/extconf.rb0000644000175100017510000000017513373461276020707 0ustar pravipravirequire 'mkmf' $CFLAGS << ' -Wextra -Wno-unused-parameter -Wall -pedantic ' create_makefile('character_set/character_set') character-set-1.1.2/ext/character_set/character_set.c0000644000175100017510000005325213373461276021673 0ustar pravipravi
#include "ruby.h"
#include "ruby/encoding.h"
#include "unicode_casefold_table.h"

/*
 * A CharacterSet is stored as a fixed-size bitmap with one bit per Unicode
 * codepoint (0..0x10FFFF). Bit N of the bitmap is set when codepoint N is a
 * member of the set.
 */

/* unsigned, so that storing bit 7 (0x80) in a byte is well-defined */
typedef unsigned char cp_byte;
typedef unsigned long cp_index;

#define UNICODE_CP_COUNT 0x110000
#define UNICODE_BYTES (UNICODE_CP_COUNT / 8)
#define UNICODE_PLANE_SIZE 0x10000
#define UNICODE_PLANE_COUNT (UNICODE_CP_COUNT / UNICODE_PLANE_SIZE)

/* Bit accessors. Callers must make sure `bit` is < UNICODE_CP_COUNT. */
#define SETBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] |= (cp_byte)(1 << ((bit) & 0x07)))
#define CLRBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] &= (cp_byte)~(1 << ((bit) & 0x07)))
#define TSTBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] & (1 << ((bit) & 0x07)))

/* Releases a bitmap obtained from alloc_codepoints(). */
static void
free_character_set(void* codepoints) {
  xfree(codepoints);
}

/* Reports the (constant) bitmap size to ObjectSpace.memsize_of. */
static size_t
memsize_character_set(const void* codepoints) {
  return sizeof(cp_byte) * UNICODE_BYTES;
}

static const rb_data_type_t character_set_type = {
  .wrap_struct_name = "character_set",
  .function = {
    .dmark = NULL,
    .dfree = free_character_set,
    .dsize = memsize_character_set,
  },
  .data = NULL,
  .flags = RUBY_TYPED_FREE_IMMEDIATELY,
};

/* Raises TypeError (via TypedData_Get_Struct) if `set` is no CharacterSet. */
#define FETCH_CODEPOINTS(set, cps)\
  TypedData_Get_Struct(set, cp_byte, &character_set_type, cps)

#define NEW_CHARACTER_SET(klass, cps)\
  TypedData_Wrap_Struct(klass, &character_set_type, cps)

/*
 * Returns a zero-filled bitmap. xcalloc raises NoMemoryError instead of
 * returning NULL, so callers never dereference a failed allocation.
 */
static inline cp_byte *
alloc_codepoints(void) {
  return xcalloc(UNICODE_BYTES, sizeof(cp_byte));
}

static VALUE
method_allocate(VALUE self) {
  return NEW_CHARACTER_SET(self, alloc_codepoints());
}

/* Raises the ArgumentError used for every invalid member value. */
static void
raise_invalid_cp_error(void) {
  rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
}

/* True if `num` is a Fixnum within 0..0x10FFFF. */
static inline int
is_valid_cp(VALUE num) {
  return FIXNUM_P(num) && FIX2LONG(num) >= 0 && FIX2LONG(num) < UNICODE_CP_COUNT;
}

static inline void
raise_arg_err_unless_valid_as_cp(VALUE object_id) {
  if (!is_valid_cp(object_id)) raise_invalid_cp_error();
}

/*
 * Declares `cp` and `cps` in the current scope, fetches the bitmap of `self`
 * and runs `action` once for every member codepoint, in ascending order.
 */
#define FOR_EACH_ACTIVE_CODEPOINT(action)\
  cp_index cp;\
  cp_byte *cps;\
  FETCH_CODEPOINTS(self, cps);\
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
    if (TSTBIT(cps, cp)) { action; }\
  }

/* *************************** */
/* `Set` compatibility methods */
/* *************************** */

/* Size function for the enumerators returned by each/keep_if/delete_if. */
static inline VALUE
enumerator_length(VALUE self, VALUE args, VALUE eobj) {
  cp_index count;
  count = 0;
  FOR_EACH_ACTIVE_CODEPOINT(count++);
  return LONG2FIX(count);
}

static VALUE
method_length(VALUE self) {
  return enumerator_length(self, 0, 0);
}

/* Yields each member codepoint as an Integer; returns an Enumerator without a block. */
static VALUE
method_each(VALUE self) {
  RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
  FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
  return self;
}

/*
 * Returns an Array of codepoint Integers by default.
 * Returns an Array of UTF-8 Strings of length 1 if passed a truthy argument.
 */
static VALUE
method_to_a(int argc, VALUE *argv, VALUE self) {
  VALUE arr;
  rb_encoding *enc;
  rb_check_arity(argc, 0, 1);

  arr = rb_ary_new();
  if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse) {
    FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
  }
  else {
    enc = rb_utf8_encoding();
    FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
  }

  return arr;
}

static VALUE
method_empty_p(VALUE self) {
  FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
  return Qtrue;
}

/*
 * Folds the member count of each 32-codepoint chunk into a running hash.
 * The count of a chunk is folded in when the next chunk starts, so the very
 * last chunk does not contribute; equal sets still hash equally.
 */
static VALUE
method_hash(VALUE self) {
  cp_index cp, hash, four_byte_value;
  cp_byte *cps;
  FETCH_CODEPOINTS(self, cps);

  hash = 17;
  four_byte_value = 0;
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
    if (cp % 32 == 0) {
      if (cp != 0) { hash = hash * 23 + four_byte_value; }
      four_byte_value = 0;
    }
    if (TSTBIT(cps, cp)) four_byte_value++;
  }

  return LONG2FIX(hash);
}

/*
 * Yields every member and removes those for which the truthiness of the
 * block result equals `truthy` (1 => delete_if, 0 => keep_if).
 */
static inline VALUE
delete_if_block_result(VALUE self, int truthy) {
  VALUE result;
  rb_need_block();
  rb_check_frozen(self);
  FOR_EACH_ACTIVE_CODEPOINT(
    result = rb_yield(LONG2FIX(cp));
    if ((NIL_P(result) || result == Qfalse) != truthy) CLRBIT(cps, cp);
  );
  return self;
}

static VALUE
method_delete_if(VALUE self) {
  RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
  return delete_if_block_result(self, 1);
}

static VALUE
method_keep_if(VALUE self) {
  RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
  return delete_if_block_result(self, 0);
}

static VALUE
method_clear(VALUE self) {
  cp_byte *cps;
  rb_check_frozen(self);
  FETCH_CODEPOINTS(self, cps);
  MEMZERO(cps, cp_byte, UNICODE_BYTES);
  return self;
}

/*
 * Body of the binary set operations. Expects `self` and `other` in scope
 * (`other` may be 0 if `condition` does not use `b`) and returns a new set of
 * the class of `self` holding every codepoint for which `condition` is true.
 * Nothing can raise between allocating and wrapping the new bitmap.
 */
#define RETURN_NEW_SET_BASED_ON(condition)\
  cp_index cp;\
  cp_byte *a, *b = NULL, *new_cps;\
  FETCH_CODEPOINTS(self, a);\
  if (other) FETCH_CODEPOINTS(other, b);\
  new_cps = alloc_codepoints();\
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
    if (condition) SETBIT(new_cps, cp);\
  }\
  return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);\

static VALUE
method_intersection(VALUE self, VALUE other) {
  RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) && TSTBIT(b, cp));
}

static VALUE
method_exclusion(VALUE self, VALUE other) {
  RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) ^ TSTBIT(b, cp));
}

static VALUE
method_union(VALUE self, VALUE other) {
  RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
}

static VALUE
method_difference(VALUE self, VALUE other) {
  RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) > TSTBIT(b, cp));
}

/*
 * Membership test. Anything that is not a valid codepoint Integer cannot be
 * a member, so it yields false instead of indexing outside the bitmap.
 */
static VALUE
method_include_p(VALUE self, VALUE num) {
  cp_byte *cps;
  cp_index cp;
  FETCH_CODEPOINTS(self, cps);
  if (!is_valid_cp(num)) return Qfalse;
  cp = FIX2ULONG(num);
  return (TSTBIT(cps, cp) ? Qtrue : Qfalse);
}

/*
 * Sets (`on` != 0) or clears the bit for `cp_num`.
 * With `check_if_noop`, returns 0 if the set already was in the wanted state.
 * Returns 1 otherwise.
 * Raises ArgumentError when asked to add an invalid value; removing an
 * invalid value is treated like removing any other non-member.
 */
static inline int
toggle_codepoint(VALUE set, VALUE cp_num, unsigned int on, int check_if_noop) {
  cp_index cp;
  cp_byte *cps;
  rb_check_frozen(set);
  FETCH_CODEPOINTS(set, cps);

  if (!is_valid_cp(cp_num)) {
    if (on) raise_invalid_cp_error();
    return check_if_noop ? 0 : 1;
  }

  cp = FIX2ULONG(cp_num);
  if (check_if_noop && (!TSTBIT(cps, cp) == !on)) return 0;

  if (on) { SETBIT(cps, cp); }
  else    { CLRBIT(cps, cp); }
  return 1;
}

static VALUE
method_add(VALUE self, VALUE cp_num) {
  return toggle_codepoint(self, cp_num, 1, 0) ? self : Qnil;
}

static VALUE
method_add_p(VALUE self, VALUE cp_num) {
  return toggle_codepoint(self, cp_num, 1, 1) ? self : Qnil;
}

static VALUE
method_delete(VALUE self, VALUE cp_num) {
  return toggle_codepoint(self, cp_num, 0, 0) ? self : Qnil;
}

static VALUE
method_delete_p(VALUE self, VALUE cp_num) {
  return toggle_codepoint(self, cp_num, 0, 1) ? self : Qnil;
}

/*
 * Declares `cp`, `cps` and `other_cps`, fetches the bitmaps of `self` and
 * `other` and runs `action` for every codepoint, member or not.
 */
#define COMPARE_SETS(action)\
  cp_index cp;\
  cp_byte *cps, *other_cps;\
  FETCH_CODEPOINTS(self, cps);\
  FETCH_CODEPOINTS(other, other_cps);\
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { action; }\

static VALUE
method_intersect_p(VALUE self, VALUE other) {
  COMPARE_SETS(if (TSTBIT(cps, cp) && TSTBIT(other_cps, cp)) return Qtrue);
  return Qfalse;
}

static VALUE
method_disjoint_p(VALUE self, VALUE other) {
  return RTEST(method_intersect_p(self, other)) ? Qfalse : Qtrue;
}

static inline int
is_character_set(VALUE obj) {
  return rb_typeddata_is_kind_of(obj, &character_set_type);
}

static VALUE
method_eql_p(VALUE self, VALUE other) {
  if (!is_character_set(other)) return Qfalse;
  if (self == other) return Qtrue; /* same object_id */

  COMPARE_SETS(if (TSTBIT(cps, cp) != TSTBIT(other_cps, cp)) return Qfalse);

  return Qtrue;
}

static inline VALUE
merge_character_set(VALUE self, VALUE other) {
  COMPARE_SETS(if (TSTBIT(other_cps, cp)) SETBIT(cps, cp));
  return self;
}

/*
 * Adds all codepoints of an Integer Range to the set.
 * Raises ArgumentError unless given a Range whose endpoints are valid
 * codepoints. An exclusive end may be one past the last valid codepoint,
 * and an exclusive end of 0 denotes an empty range.
 */
static inline VALUE
merge_rb_range(VALUE self, VALUE rb_range) {
  VALUE from_id, upto_id;
  int excl;
  long cp, upto;
  cp_byte *cps;
  FETCH_CODEPOINTS(self, cps);

  if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl))) {
    rb_raise(rb_eArgError, "pass a Range");
  }

  raise_arg_err_unless_valid_as_cp(from_id);
  if (!FIXNUM_P(upto_id)) raise_invalid_cp_error();

  /* convert before adjusting, so that `0...0` cannot underflow */
  upto = FIX2LONG(upto_id);
  if (excl) upto--;
  if (upto < (excl ? -1 : 0) || upto >= UNICODE_CP_COUNT) raise_invalid_cp_error();

  for (cp = FIX2LONG(from_id); cp <= upto; cp++) {
    SETBIT(cps, cp);
  }
  return self;
}

/* Adds all elements of an Array of codepoint Integers to the set. */
static inline VALUE
merge_rb_array(VALUE self, VALUE rb_array) {
  VALUE el;
  cp_byte *cps;
  cp_index cp;
  long i;
  FETCH_CODEPOINTS(self, cps);
  Check_Type(rb_array, T_ARRAY);
  for (i = 0; i < RARRAY_LEN(rb_array); i++) {
    el = RARRAY_AREF(rb_array, i);
    raise_arg_err_unless_valid_as_cp(el);
    cp = FIX2ULONG(el);
    SETBIT(cps, cp);
  }
  return self;
}

/* Accepts a CharacterSet, an Array of Integers or a Range of Integers. */
static VALUE
method_merge(VALUE self, VALUE other) {
  rb_check_frozen(self);
  if (is_character_set(other)) {
    return merge_character_set(self, other);
  }
  else if (TYPE(other) == T_ARRAY) {
    return merge_rb_array(self, other);
  }
  return merge_rb_range(self, other);
}

/* Fills the freshly allocated (empty) copy `self` with the members of `other`. */
static VALUE
method_initialize_copy(VALUE self, VALUE other) {
  merge_character_set(self, other);
  return other;
}

static VALUE
method_subtract(VALUE self, VALUE other) {
  rb_check_frozen(self);
  COMPARE_SETS(if (TSTBIT(other_cps, cp)) CLRBIT(cps, cp));
  return self;
}

/*
 * Returns 1 if every member of `set_a` is in `set_b`, else 0.
 * On success, `*is_proper` tells whether `set_b` has additional members.
 * Raises ArgumentError unless both arguments are CharacterSets.
 */
static inline int
a_subset_of_b(VALUE set_a, VALUE set_b, int *is_proper) {
  cp_byte *cps_a, *cps_b;
  cp_index cp, size_a, size_b;

  if (!is_character_set(set_a) || !is_character_set(set_b)) {
    rb_raise(rb_eArgError, "pass a CharacterSet");
  }

  FETCH_CODEPOINTS(set_a, cps_a);
  FETCH_CODEPOINTS(set_b, cps_b);

  *is_proper = 0;
  size_a = 0;
  size_b = 0;

  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
    if (TSTBIT(cps_a, cp)) {
      if (!TSTBIT(cps_b, cp)) return 0;
      size_a++;
      size_b++;
    }
    else if (TSTBIT(cps_b, cp)) size_b++;
  }

  if (size_b > size_a) *is_proper = 1;
  return 1;
}

static VALUE
method_subset_p(VALUE self, VALUE other) {
  int is_proper;
  return a_subset_of_b(self, other, &is_proper) ? Qtrue : Qfalse;
}

static VALUE
method_proper_subset_p(VALUE self, VALUE other) {
  int is, is_proper;
  is = a_subset_of_b(self, other, &is_proper);
  return (is && is_proper) ? Qtrue : Qfalse;
}

static VALUE
method_superset_p(VALUE self, VALUE other) {
  int is_proper;
  return a_subset_of_b(other, self, &is_proper) ? Qtrue : Qfalse;
}

static VALUE
method_proper_superset_p(VALUE self, VALUE other) {
  int is, is_proper;
  is = a_subset_of_b(other, self, &is_proper);
  return (is && is_proper) ? Qtrue : Qfalse;
}

/* ******************************* */
/* `CharacterSet`-specific methods */
/* ******************************* */

/* `ranges` is the rest-args Array (the method is registered with arity -2). */
static VALUE
class_method_from_ranges(VALUE self, VALUE ranges) {
  VALUE new_set;
  long i;
  new_set = rb_class_new_instance(0, 0, self);
  for (i = 0; i < RARRAY_LEN(ranges); i++) {
    merge_rb_range(new_set, RARRAY_AREF(ranges, i));
  }
  return new_set;
}

/* Returns the members as an Array of inclusive Ranges of consecutive codepoints. */
static VALUE
method_ranges(VALUE self) {
  VALUE ranges;
  cp_index range_start, range_end;
  int range_open;

  ranges = rb_ary_new();
  range_start = 0;
  range_end = 0;
  range_open = 0;

  FOR_EACH_ACTIVE_CODEPOINT(
    if (!range_open) {
      range_start = cp;
      range_open = 1;
    }
    else if (range_end + 1 != cp) {
      /* gap found, finalize previous range */
      rb_ary_push(ranges, rb_range_new(LONG2FIX(range_start), LONG2FIX(range_end), 0));
      range_start = cp;
    }
    range_end = cp;
  );

  /* add final range */
  if (range_open) {
    rb_ary_push(ranges, rb_range_new(LONG2FIX(range_start), LONG2FIX(range_end), 0));
  }

  return ranges;
}

/* Delegates to Array#sample on the Array of member Strings. */
static VALUE
method_sample(int argc, VALUE *argv, VALUE self) {
  VALUE to_a_args[1], array;
  rb_check_arity(argc, 0, 1);
  to_a_args[0] = Qtrue;
  array = method_to_a(1, to_a_args, self);
  return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
}

/* Returns a new set with the members of `set` that lie within from..upto. */
static inline VALUE
new_set_from_section(VALUE set, cp_index from, cp_index upto) {
  cp_byte *cps, *new_cps;
  cp_index cp;
  FETCH_CODEPOINTS(set, cps);
  new_cps = alloc_codepoints();
  for (cp = from; cp <= upto; cp++) {
    if (TSTBIT(cps, cp)) SETBIT(new_cps, cp);
  }
  return NEW_CHARACTER_SET(RBASIC(set)->klass, new_cps);
}

static VALUE
method_bmp_part(VALUE self) {
  return new_set_from_section(self, 0, UNICODE_PLANE_SIZE - 1);
}

static VALUE
method_astral_part(VALUE self) {
  return new_set_from_section(self, UNICODE_PLANE_SIZE, UNICODE_CP_COUNT - 1);
}

/* `plane` must be < UNICODE_PLANE_COUNT. Returns Qtrue or Qfalse. */
static inline VALUE
set_has_member_in_plane(VALUE set, unsigned int plane) {
  cp_byte *cps;
  cp_index cp, max_cp;
  FETCH_CODEPOINTS(set, cps);
  cp = plane * UNICODE_PLANE_SIZE;
  max_cp = (plane + 1) * UNICODE_PLANE_SIZE - 1;
  for (/* */; cp <= max_cp; cp++) {
    if (TSTBIT(cps, cp)) return Qtrue;
  }
  return Qfalse;
}

/* Returns the numbers of all planes that contain at least one member. */
static VALUE
method_planes(VALUE self) {
  unsigned int i;
  VALUE planes;
  planes = rb_ary_new();
  for (i = 0; i < UNICODE_PLANE_COUNT; i++) {
    if (RTEST(set_has_member_in_plane(self, i))) rb_ary_push(planes, INT2FIX(i));
  }
  return planes;
}

static VALUE
method_member_in_plane_p(VALUE self, VALUE plane_num) {
  int plane;
  Check_Type(plane_num, T_FIXNUM);
  plane = FIX2INT(plane_num);
  if (plane < 0 || plane >= UNICODE_PLANE_COUNT) {
    rb_raise(rb_eArgError, "plane must be between 0 and 16");
  }
  return set_has_member_in_plane(self, plane);
}

#define NON_SURROGATE(cp) ((cp) > 0xDFFF || (cp) < 0xD800)

/*
 * ext_inversion(include_surrogates = false, upto = nil)
 * Returns the complement of the set, optionally limited to 0..upto and
 * skipping the surrogate range unless `include_surrogates` is `true`.
 */
static VALUE
method_ext_inversion(int argc, VALUE *argv, VALUE self) {
  int include_surrogates;
  cp_index upto;
  VALUE other;
  other = 0;
  rb_check_arity(argc, 0, 2);
  include_surrogates = ((argc > 0) && (argv[0] == Qtrue));
  if ((argc > 1) && FIXNUM_P(argv[1])) {
    upto = FIX2ULONG(argv[1]);
    RETURN_NEW_SET_BASED_ON(
      cp <= upto && !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
    );
  }
  RETURN_NEW_SET_BASED_ON(
    !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
  );
}

/*
 * Returns a new set that additionally contains the case-folding counterpart
 * (per unicode_casefold_table) of every member.
 */
static VALUE
method_case_insensitive(VALUE self) {
  cp_index i;
  cp_byte *new_cps;

  new_cps = alloc_codepoints();

  FOR_EACH_ACTIVE_CODEPOINT(SETBIT(new_cps, cp));

  for (i = 0; i < CASEFOLD_COUNT; i++) {
    casefold_mapping m = unicode_casefold_table[i];

    if      (TSTBIT(cps, m.from)) { SETBIT(new_cps, m.to); }
    else if (TSTBIT(cps, m.to))   { SETBIT(new_cps, m.from); }
  }

  return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
}

/*
 * Callback invoked for each codepoint of a String.
 * Returning 0 stops the iteration early.
 */
typedef int(*str_cp_handler)(unsigned int, cp_byte*);

/* Strings in non-Unicode encodings can yield values beyond 0x10FFFF. */
static inline int
add_str_cp_to_arr(unsigned int str_cp, cp_byte *cp_arr) {
  if (str_cp >= UNICODE_CP_COUNT) raise_invalid_cp_error();
  SETBIT(cp_arr, str_cp);
  return 1;
}

static inline int
str_cp_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
  return str_cp < UNICODE_CP_COUNT && TSTBIT(cp_arr, str_cp);
}

static inline int
str_cp_not_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
  return !str_cp_in_arr(str_cp, cp_arr);
}

/* Iterates a String whose characters are all exactly one byte long. */
static inline VALUE
each_sb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
  long i;
  unsigned int str_cp;

  for (i = 0; i < RSTRING_LEN(str); i++) {
    str_cp = (RSTRING_PTR(str)[i] & 0xff);
    if (!(*func)(str_cp, cp_arr)) return Qfalse;
  }

  return Qtrue;
}

/*
 * Iterates a String that may contain multibyte characters.
 * rb_enc_codepoint_len raises ArgumentError on invalid byte sequences.
 */
static inline VALUE
each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
  int n;
  unsigned int str_cp;
  const char *ptr, *end;
  rb_encoding *enc;

  str = rb_str_new_frozen(str);
  ptr = RSTRING_PTR(str);
  end = RSTRING_END(str);
  enc = rb_enc_get(str);

  while (ptr < end) {
    str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
    if (!(*func)(str_cp, cp_arr)) return Qfalse;
    ptr += n;
  }

  return Qtrue;
}

/* single_byte_optimizable - copied from string.c */
static inline int
single_byte_optimizable(VALUE str) {
  rb_encoding *enc;
  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) return 1;

  enc = rb_enc_get(str);
  if (rb_enc_mbmaxlen(enc) == 1) return 1;

  return 0;
}

/*
 * Calls `func` with each codepoint of `str` and `cp_arr`.
 * Returns Qfalse if `func` stopped the iteration, else Qtrue.
 */
static inline VALUE
each_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
  if (single_byte_optimizable(str)) {
    return each_sb_cp(str, func, cp_arr);
  }
  return each_mb_cp(str, func, cp_arr);
}

static inline void
raise_arg_err_unless_string(VALUE val) {
  if (!RB_TYPE_P(val, T_STRING)) rb_raise(rb_eArgError, "pass a String");
}

/* CharacterSet.of(str) - returns the set of all codepoints used in `str`. */
static VALUE
class_method_of(VALUE self, VALUE str) {
  cp_byte *cp_arr;
  VALUE new_set;
  raise_arg_err_unless_string(str);
  cp_arr = alloc_codepoints();
  /* wrap first: decoding `str` may raise, and the GC must then own cp_arr */
  new_set = NEW_CHARACTER_SET(self, cp_arr);
  each_cp(str, add_str_cp_to_arr, cp_arr);
  return new_set;
}

/* True if at least one character of `str` is a member. */
static VALUE
method_used_by_p(VALUE self, VALUE str) {
  cp_byte *cps;
  VALUE only_uses_other_cps;
  raise_arg_err_unless_string(str);
  FETCH_CODEPOINTS(self, cps);
  only_uses_other_cps = each_cp(str, str_cp_not_in_arr, cps);
  return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
}

/* True if every character of `str` is a member. */
static VALUE
method_cover_p(VALUE self, VALUE str) {
  cp_byte *cps;
  raise_arg_err_unless_string(str);
  FETCH_CODEPOINTS(self, cps);
  return each_cp(str, str_cp_in_arr, cps);
}

/*
 * Builds a copy of `str` without the characters that are members of `set`
 * (`delete` != 0) or without those that are not (`delete` == 0).
 * With `bang`, replaces the content of `str` and returns it, or returns nil
 * if nothing was removed. Without `bang`, returns the new String.
 */
static inline VALUE
apply_to_str(VALUE set, VALUE str, int delete, int bang) {
  cp_byte *cps;
  rb_encoding *str_enc;
  VALUE new_str_buf;
  long orig_len;
  int n, in_set;
  unsigned int str_cp;
  const char *ptr, *end;

  raise_arg_err_unless_string(str);

  FETCH_CODEPOINTS(set, cps);

  orig_len = RSTRING_LEN(str);
  /* len + margin, as done in string.c */
  new_str_buf = rb_str_buf_new(orig_len + 30);
  str_enc = rb_enc_get(str);
  rb_enc_associate(new_str_buf, str_enc);
  ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ?
                    ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

  ptr = RSTRING_PTR(str);
  end = RSTRING_END(str);

  while (ptr < end) {
    str_cp = rb_enc_codepoint_len(ptr, end, &n, str_enc);
    /* values beyond the bitmap (non-Unicode encodings) are never members */
    in_set = str_cp < UNICODE_CP_COUNT && TSTBIT(cps, str_cp);
    if (!in_set != !delete) {
      /* copy the character's own bytes, no temporary String needed */
      rb_enc_str_buf_cat(new_str_buf, ptr, n, str_enc);
    }
    ptr += n;
  }

  if (bang) {
    if (RSTRING_LEN(new_str_buf) == orig_len) return Qnil; /* unchanged */
    rb_str_shared_replace(str, new_str_buf);
  }
  else {
    RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
    /* slightly cumbersome approach needed for compatibility with Ruby < 2.3: */
    RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
    str = new_str_buf;
  }

  return str;
}

static VALUE
method_delete_in(VALUE self, VALUE str) {
  return apply_to_str(self, str, 1, 0);
}

static VALUE
method_delete_in_bang(VALUE self, VALUE str) {
  return apply_to_str(self, str, 1, 1);
}

static VALUE
method_keep_in(VALUE self, VALUE str) {
  return apply_to_str(self, str, 0, 0);
}

static VALUE
method_keep_in_bang(VALUE self, VALUE str) {
  return apply_to_str(self, str, 0, 1);
}

/* **** */
/* init */
/* **** */

/* Extension entry point: defines CharacterSet and registers all C methods. */
void
Init_character_set(void)
{
  VALUE cs = rb_define_class("CharacterSet", rb_cObject);

  rb_define_alloc_func(cs, method_allocate);

  /* `Set` compatibility methods */

  rb_define_method(cs, "each",             method_each, 0);
  rb_define_method(cs, "to_a",             method_to_a, -1);
  rb_define_method(cs, "length",           method_length, 0);
  rb_define_method(cs, "size",             method_length, 0);
  rb_define_method(cs, "count",            method_length, 0);
  rb_define_method(cs, "empty?",           method_empty_p, 0);
  rb_define_method(cs, "hash",             method_hash, 0);
  rb_define_method(cs, "keep_if",          method_keep_if, 0);
  rb_define_method(cs, "delete_if",        method_delete_if, 0);
  rb_define_method(cs, "clear",            method_clear, 0);
  rb_define_method(cs, "intersection",     method_intersection, 1);
  rb_define_method(cs, "&",                method_intersection, 1);
  rb_define_method(cs, "union",            method_union, 1);
  rb_define_method(cs, "+",                method_union, 1);
  rb_define_method(cs, "|",                method_union, 1);
  rb_define_method(cs, "difference",       method_difference, 1);
  rb_define_method(cs, "-",                method_difference, 1);
  rb_define_method(cs, "^",                method_exclusion, 1);
  rb_define_method(cs, "include?",         method_include_p, 1);
  rb_define_method(cs, "member?",          method_include_p, 1);
  rb_define_method(cs, "===",              method_include_p, 1);
  rb_define_method(cs, "add",              method_add, 1);
  rb_define_method(cs, "<<",               method_add, 1);
  rb_define_method(cs, "add?",             method_add_p, 1);
  rb_define_method(cs, "delete",           method_delete, 1);
  rb_define_method(cs, "delete?",          method_delete_p, 1);
  rb_define_method(cs, "intersect?",       method_intersect_p, 1);
  rb_define_method(cs, "disjoint?",        method_disjoint_p, 1);
  rb_define_method(cs, "eql?",             method_eql_p, 1);
  rb_define_method(cs, "==",               method_eql_p, 1);
  rb_define_method(cs, "merge",            method_merge, 1);
  rb_define_method(cs, "initialize_clone", method_initialize_copy, 1);
  rb_define_method(cs, "initialize_dup",   method_initialize_copy, 1);
  rb_define_method(cs, "subtract",         method_subtract, 1);
  rb_define_method(cs, "subset?",          method_subset_p, 1);
  rb_define_method(cs, "<=",               method_subset_p, 1);
  rb_define_method(cs, "proper_subset?",   method_proper_subset_p, 1);
  rb_define_method(cs, "<",                method_proper_subset_p, 1);
  rb_define_method(cs, "superset?",        method_superset_p, 1);
  rb_define_method(cs, ">=",               method_superset_p, 1);
  rb_define_method(cs, "proper_superset?", method_proper_superset_p, 1);
  rb_define_method(cs, ">",                method_proper_superset_p, 1);

  /* `CharacterSet`-specific methods */

  rb_define_singleton_method(cs, "from_ranges", class_method_from_ranges, -2);
  rb_define_singleton_method(cs, "of",          class_method_of, 1);

  rb_define_method(cs, "ranges",           method_ranges, 0);
  rb_define_method(cs, "sample",           method_sample, -1);
  rb_define_method(cs, "bmp_part",         method_bmp_part, 0);
  rb_define_method(cs, "astral_part",      method_astral_part, 0);
  rb_define_method(cs, "planes",           method_planes, 0);
  rb_define_method(cs, "member_in_plane?", method_member_in_plane_p, 1);
  rb_define_method(cs, "ext_inversion",    method_ext_inversion, -1);
  rb_define_method(cs, "case_insensitive", method_case_insensitive, 0);
  rb_define_method(cs, "used_by?",         method_used_by_p, 1);
  rb_define_method(cs, "cover?",           method_cover_p, 1);
  rb_define_method(cs, "delete_in",        method_delete_in, 1);
  rb_define_method(cs, "delete_in!",       method_delete_in_bang, 1);
  rb_define_method(cs, "keep_in",          method_keep_in, 1);
  rb_define_method(cs, "keep_in!",         method_keep_in_bang, 1);
}
character-set-1.1.2/BENCHMARK.md0000644000175100017510000000210613373461276015115 0ustar pravipraviResults of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17] ``` Detecting non-whitespace CharacterSet#cover?: 13244577.7 i/s Regexp#match?: 8027017.5 i/s - 1.65x slower ``` ``` Detecting non-letters CharacterSet#cover?: 13082940.8 i/s Regexp#match?: 5372589.2 i/s - 2.44x slower ``` ``` Removing whitespace CharacterSet#delete_in: 389315.6 i/s String#gsub: 223773.5 i/s - 1.74x slower ``` ``` Removing whitespace, emoji and umlauts CharacterSet#delete_in: 470239.3 i/s String#gsub: 278679.4 i/s - 1.69x slower ``` ``` Removing non-whitespace CharacterSet#keep_in: 1138461.0 i/s String#gsub: 235287.4 i/s - 4.84x slower ``` ``` Extracting emoji CharacterSet#keep_in: 1474472.0 i/s String#gsub: 212269.6 i/s - 6.95x slower ``` ``` Detecting whitespace CharacterSet#used_by?: 13063108.7 i/s Regexp#match?: 7215075.0 i/s - 1.81x slower ``` ``` Detecting emoji in a large string CharacterSet#used_by?: 246527.7 i/s Regexp#match?: 92956.5 i/s - 2.65x slower ```