regexp_property_values-1.0.0/0000755000004100000410000000000013641343655016367 5ustar www-datawww-dataregexp_property_values-1.0.0/.travis.yml0000644000004100000410000000012213641343655020473 0ustar www-datawww-datasudo: false language: ruby rvm: - 2.1 - 2.4 - 2.5 - 2.6 - jruby-9.1.9.0 regexp_property_values-1.0.0/.rspec0000644000004100000410000000006513641343655017505 0ustar www-datawww-data--format documentation --color --require spec_helper regexp_property_values-1.0.0/README.md0000644000004100000410000000322713641343655017652 0ustar www-datawww-data# RegexpPropertyValues [![Gem Version](https://badge.fury.io/rb/regexp_property_values.svg)](http://badge.fury.io/rb/regexp_property_values) [![Build Status](https://travis-ci.org/jaynetics/regexp_property_values.svg?branch=master)](https://travis-ci.org/jaynetics/regexp_property_values) This small library lets you see which property values are supported by the regular expression engine of the Ruby version you are running and directly reads out their codepoint ranges from there. That is, it determines all supported values for `\p{value}` expressions and what they match. ## Usage ##### Browse all property values (supported by any Ruby, ever) ```ruby require 'regexp_property_values' PV = RegexpPropertyValues PV.all # => [, , ...] ``` ##### Browse property values supported by the Ruby you are running ```ruby PV.all_for_current_ruby # => [, , ...] ``` ##### Inspect property values ```ruby PV['alpha'].supported_by_current_ruby? # => true PV['foobar'].supported_by_current_ruby? # => false PV['AHex'].matched_characters # => %w[0 1 2 3 4 5 6 7 8 9 A B C ...] PV['AHex'].matched_codepoints # => [48, 49, 50, ...] PV['AHex'].matched_ranges # => [48..57, 65..70, 97..102] ``` If [`character_set`](https://github.com/jaynetics/character_set) is installed, you can also do this: ```ruby PV['AHex'].character_set # => # ``` ##### Utility methods ```ruby # get a Hash of aliases for property names PV.alias_hash # => { => , ... } # download a list of possible properties for the running Ruby version PV.update ``` regexp_property_values-1.0.0/bin/0000755000004100000410000000000013641343655017137 5ustar www-datawww-dataregexp_property_values-1.0.0/bin/console0000755000004100000410000000060013641343655020523 0ustar www-datawww-data#!/usr/bin/env ruby require "bundler/setup" require "regexp_property_values" # You can add fixtures and/or initialization code here to make experimenting # with your gem easier. You can also use a different console, if you like. # (If you use this, don't forget to add pry to your Gemfile!) # require "pry" # Pry.start PV = RegexpPropertyValues require "irb" IRB.start(__FILE__) regexp_property_values-1.0.0/bin/setup0000755000004100000410000000020313641343655020220 0ustar www-datawww-data#!/usr/bin/env bash set -euo pipefail IFS=$'\n\t' set -vx bundle install # Do any other automated setup that you need to do here regexp_property_values-1.0.0/CHANGELOG.md0000644000004100000410000000124313641343655020200 0ustar www-datawww-data# Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [1.0.0] - 2019-06-16 ### Changed - removed `::by_category`, `::by_matched_codepoints`, `::short_and_long_names` - return values are now always of a custom `Value` class, no longer extended `Strings` - unknown properties now raise `RegexpPropertyValues::Error`, no longer an `ArgumentError` ### Added - `Value#identifier` - `Value#full_name` ### Fixed - better codepoint determination speed for non-C Rubies (still slow) regexp_property_values-1.0.0/.gitignore0000644000004100000410000000055613641343655020365 0ustar www-datawww-data*.bundle *.gem *.iml *.stTheme.cache *.sublime-project *.sublime-workspace *.swp *.tmlanguage.cache *.tmPreferences.cache *~ .byebug_history .DS_Store .idea/ .ruby-gemset .ruby-version .tags .tags1 bbin/ binstubs/* bundler_stubs/*/.yardoc Gemfile.lock /.bundle/ /.vscode/ /_yardoc/ /coverage/ /doc/ /pkg/ /spec/reports/ /tmp/ # rspec failure tracking .rspec_status regexp_property_values-1.0.0/regexp_property_values.gemspec0000644000004100000410000000237713641343655024562 0ustar www-datawww-datalib = File.expand_path("../lib", __FILE__) $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) require 'regexp_property_values/version' Gem::Specification.new do |s| s.name = 'regexp_property_values' s.version = RegexpPropertyValues::VERSION s.authors = ['Janosch Müller'] s.email = ['janosch84@gmail.com'] s.summary = "Inspect property values supported by Ruby's regex engine" s.description = 'This small library lets you see which property values '\ 'are supported by the regular expression engine of the '\ 'Ruby version you are running, and what they match.' s.homepage = 'https://github.com/jaynetics/regexp_property_values' s.license = 'MIT' s.files = `git ls-files -z`.split("\x0").reject do |f| f.match(%r{^(test|spec|features)/}) end s.require_paths = ['lib'] s.extensions = %w[ext/regexp_property_values/extconf.rb] s.required_ruby_version = '>= 2.0.0' s.add_development_dependency 'character_set', '~> 1.4.0' s.add_development_dependency 'rake', '~> 12.0' s.add_development_dependency 'rake-compiler', '~> 1.0' s.add_development_dependency 'range_compressor', '~> 1.0' s.add_development_dependency 'rspec', '~> 3.0' end regexp_property_values-1.0.0/Rakefile0000644000004100000410000000140113641343655020030 0ustar www-datawww-datarequire 'bundler/gem_tasks' require 'rubygems/package_task' require 'rspec/core/rake_task' RSpec::Core::RakeTask.new(:spec) task :default => :spec require 'rake/extensiontask' Rake::ExtensionTask.new('regexp_property_values') do |ext| ext.lib_dir = 'lib/regexp_property_values' end namespace :java do java_gemspec = eval File.read('./regexp_property_values.gemspec') java_gemspec.platform = 'java' java_gemspec.extensions = [] java_gemspec.add_runtime_dependency 'range_compressor', '~> 1.0' Gem::PackageTask.new(java_gemspec) do |pkg| pkg.need_zip = true pkg.need_tar = true pkg.package_dir = 'pkg' end end task package: 'java:gem' if RUBY_PLATFORM !~ /java/i # recompile before running specs task(:spec).enhance([:compile]) end regexp_property_values-1.0.0/lib/0000755000004100000410000000000013641343655017135 5ustar www-datawww-dataregexp_property_values-1.0.0/lib/regexp_property_values.rb0000644000004100000410000000161513641343655024302 0ustar www-datawww-databegin require 'regexp_property_values/regexp_property_values' rescue LoadError warn 'regexp_property_values could not load C extension, using slower Ruby' end require 'regexp_property_values/updater' require 'regexp_property_values/value' require 'regexp_property_values/version' module RegexpPropertyValues Error = Class.new(StandardError) VALUES_PATH = File.join(__dir__, 'values') ALIASES_PATH = File.join(__dir__, 'aliases') def self.[](name) Value.new(name) end def self.all_for_current_ruby @all_for_current_ruby ||= all.select(&:supported_by_current_ruby?) end def self.all @all ||= File.readlines(VALUES_PATH).map { |line| Value.new(line.chomp) } end def self.alias_hash @alias_hash ||= File.readlines(ALIASES_PATH).map do |line| line.chomp.split(';').map { |name| Value.new(name) } end.to_h end def self.update Updater.call end end regexp_property_values-1.0.0/lib/aliases0000644000004100000410000000740013641343655020502 0ustar www-datawww-dataAHex;ASCII_Hex_Digit Adlm;Adlam Aghb;Caucasian_Albanian Arab;Arabic Armi;Imperial_Aramaic Armn;Armenian Avst;Avestan Bali;Balinese Bamu;Bamum Bass;Bassa_Vah Batk;Batak Beng;Bengali Bhks;Bhaiksuki Bidi_C;Bidi_Control Bopo;Bopomofo Brah;Brahmi Brai;Braille Bugi;Buginese Buhd;Buhid C;Other CI;Case_Ignorable CWCF;Changes_When_Casefolded CWCM;Changes_When_Casemapped CWL;Changes_When_Lowercased CWT;Changes_When_Titlecased CWU;Changes_When_Uppercased Cakm;Chakma Cans;Canadian_Aboriginal Cari;Carian Cc;Control Cf;Format Cher;Cherokee Cn;Unassigned Co;Private_Use Combining_Mark;Mark Copt;Coptic Cprt;Cypriot Cs;Surrogate Cyrl;Cyrillic DI;Default_Ignorable_Code_Point Dep;Deprecated Deva;Devanagari Dia;Diacritic Dogr;Dogra Dsrt;Deseret Dupl;Duployan Egyp;Egyptian_Hieroglyphs Elba;Elbasan Elym;Elymaic Ethi;Ethiopic Ext;Extender Geor;Georgian Glag;Glagolitic Gong;Gunjala_Gondi Gonm;Masaram_Gondi Goth;Gothic Gr_Base;Grapheme_Base Gr_Ext;Grapheme_Extend Gr_Link;Grapheme_Link Gran;Grantha Grek;Greek Gujr;Gujarati Guru;Gurmukhi Hang;Hangul Hani;Han Hano;Hanunoo Hatr;Hatran Hebr;Hebrew Hex;Hex_Digit Hira;Hiragana Hluw;Anatolian_Hieroglyphs Hmng;Pahawh_Hmong Hmnp;Nyiakeng_Puachue_Hmong Hung;Old_Hungarian IDC;ID_Continue IDS;ID_Start IDSB;IDS_Binary_Operator IDST;IDS_Trinary_Operator Ideo;Ideographic Ital;Old_Italic Java;Javanese Join_C;Join_Control Kali;Kayah_Li Kana;Katakana Khar;Kharoshthi Khmr;Khmer Khoj;Khojki Knda;Kannada Kthi;Kaithi L;Letter LC;Cased_Letter LOE;Logical_Order_Exception Lana;Tai_Tham Laoo;Lao Latn;Latin Lepc;Lepcha Limb;Limbu Lina;Linear_A Linb;Linear_B Ll;Lowercase_Letter Lm;Modifier_Letter Lo;Other_Letter Lt;Titlecase_Letter Lu;Uppercase_Letter Lyci;Lycian Lydi;Lydian M;Mark Mahj;Mahajani Maka;Makasar Mand;Mandaic Mani;Manichaean Marc;Marchen Mc;Spacing_Mark Me;Enclosing_Mark Medf;Medefaidrin Mend;Mende_Kikakui Merc;Meroitic_Cursive Mero;Meroitic_Hieroglyphs Mlym;Malayalam Mn;Nonspacing_Mark Mong;Mongolian Mroo;Mro Mtei;Meetei_Mayek Mult;Multani Mymr;Myanmar N;Number NChar;Noncharacter_Code_Point Nand;Nandinagari Narb;Old_North_Arabian Nbat;Nabataean Nd;Decimal_Number Nkoo;Nko Nl;Letter_Number No;Other_Number Nshu;Nushu OAlpha;Other_Alphabetic ODI;Other_Default_Ignorable_Code_Point OGr_Ext;Other_Grapheme_Extend OIDC;Other_ID_Continue OIDS;Other_ID_Start OLower;Other_Lowercase OMath;Other_Math OUpper;Other_Uppercase Ogam;Ogham Olck;Ol_Chiki Orkh;Old_Turkic Orya;Oriya Osge;Osage Osma;Osmanya P;Punctuation PCM;Prepended_Concatenation_Mark Palm;Palmyrene Pat_Syn;Pattern_Syntax Pat_WS;Pattern_White_Space Pauc;Pau_Cin_Hau Pc;Connector_Punctuation Pd;Dash_Punctuation Pe;Close_Punctuation Perm;Old_Permic Pf;Final_Punctuation Phag;Phags_Pa Phli;Inscriptional_Pahlavi Phlp;Psalter_Pahlavi Phnx;Phoenician Pi;Initial_Punctuation Plrd;Miao Po;Other_Punctuation Prti;Inscriptional_Parthian Ps;Open_Punctuation QMark;Quotation_Mark Qaac;Coptic Qaai;Inherited RI;Regional_Indicator Rjng;Rejang Rohg;Hanifi_Rohingya Runr;Runic S;Symbol SD;Soft_Dotted STerm;Sentence_Terminal Samr;Samaritan Sarb;Old_South_Arabian Saur;Saurashtra Sc;Currency_Symbol Sgnw;SignWriting Shaw;Shavian Shrd;Sharada Sidd;Siddham Sind;Khudawadi Sinh;Sinhala Sk;Modifier_Symbol Sm;Math_Symbol So;Other_Symbol Sogd;Sogdian Sogo;Old_Sogdian Sora;Sora_Sompeng Soyo;Soyombo Sund;Sundanese Sylo;Syloti_Nagri Syrc;Syriac Tagb;Tagbanwa Takr;Takri Tale;Tai_Le Talu;New_Tai_Lue Taml;Tamil Tang;Tangut Tavt;Tai_Viet Telu;Telugu Term;Terminal_Punctuation Tfng;Tifinagh Tglg;Tagalog Thaa;Thaana Tibt;Tibetan Tirh;Tirhuta UIdeo;Unified_Ideograph Ugar;Ugaritic VS;Variation_Selector Vaii;Vai WSpace;White_Space Wara;Warang_Citi Wcho;Wancho XIDC;XID_Continue XIDS;XID_Start Xpeo;Old_Persian Xsux;Cuneiform Yiii;Yi Z;Separator Zanb;Zanabazar_Square Zinh;Inherited Zl;Line_Separator Zp;Paragraph_Separator Zs;Space_Separator Zyyy;Common Zzzz;Unknownregexp_property_values-1.0.0/lib/regexp_property_values/0000755000004100000410000000000013641343655023752 5ustar www-datawww-dataregexp_property_values-1.0.0/lib/regexp_property_values/version.rb0000644000004100000410000000006413641343655025764 0ustar www-datawww-datamodule RegexpPropertyValues VERSION = '1.0.0' end regexp_property_values-1.0.0/lib/regexp_property_values/value.rb0000644000004100000410000000051113641343655025410 0ustar www-datawww-datamodule RegexpPropertyValues class Value require_relative 'value/shared_methods' include SharedMethods if const_defined?(:OnigRegexpPropertyHelper) require_relative 'value/ext_adapter' include ExtAdapter else require_relative 'value/ruby_fallback' include RubyFallback end end end regexp_property_values-1.0.0/lib/regexp_property_values/updater.rb0000644000004100000410000000716113641343655025750 0ustar www-datawww-datamodule RegexpPropertyValues module Updater module_function require 'fileutils' require 'set' BASE_URL = 'http://www.unicode.org/Public/' UCD_FILES = %w[ Blocks.txt DerivedAge.txt DerivedCoreProperties.txt PropertyAliases.txt PropertyValueAliases.txt PropList.txt Scripts.txt ] EMOJI_FILES = %w[ emoji-data.txt ] TMP_DIR = File.join(__dir__, 'tmp_ucd') def call prepare_tmp_dir download_ucd_files write_values write_aliases remove_tmp_dir print_stats end def prepare_tmp_dir FileUtils.rm_rf(TMP_DIR) if File.exist?(TMP_DIR) FileUtils.mkdir(TMP_DIR) end def download_ucd_files unicode_version = RbConfig::CONFIG.fetch('UNICODE_VERSION') emoji_version = RbConfig::CONFIG.fetch('UNICODE_EMOJI_VERSION') puts 'This will load ucd and emoji data for the CURRENT RUBY '\ "(#{unicode_version} / #{emoji_version}). Run this on the "\ 'latest Ruby version you want to support. Continue? [y/n]' return puts 'download skipped.' unless $stdin.gets =~ /^y/i Dir.chdir(TMP_DIR) do UCD_FILES.each { |f| `wget #{BASE_URL}/#{unicode_version}/ucd/#{f}` } EMOJI_FILES.each { |f| `wget #{BASE_URL}/emoji/#{emoji_version}/#{f}` } end end def write_values @values = Set.new # posix properties @values += %w[ Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word Alnum ASCII XPosixPunct ] # special properties @values += %w[Any Assigned In_No_Block Unknown] # legacy properties @values += %w[Newline] regexp = /^[0-9a-fA-F]+(?:\.\.[0-9a-fA-F]+)? *; (?\w+) +# / %w[ DerivedCoreProperties.txt PropList.txt Scripts.txt emoji-data.txt ].each { |file| scan(file, regexp) { |caps| @values << caps[:prop_name] } } scan('PropertyValueAliases.txt', /^gc ; \w+ *; (?\w+)/) do |caps| @values << caps[:prop_name] end scan('Blocks.txt', /^[\dA-F.]+ *; (?[-\w ]+)/) do |caps| @values << 'In_' + caps[:block_name].gsub(/\W/, '_') end scan('DerivedAge.txt', /^[\dA-F.]+ *; (?[\d.]+)/) do |caps| @values << 'Age=' + caps[:age_num] end File.write(RegexpPropertyValues::VALUES_PATH, @values.sort.join("\n")) end def write_aliases @aliases = Set.new scan('PropertyAliases.txt', /^(?\w+) *; (?\w+)/) do |caps| if in_values?(caps[:name]) && !in_values?(caps[:alias]) @aliases << [caps[:alias], caps[:name]] end end scan('PropertyValueAliases.txt', /^[gs]c ; (?\w+) *; (?\w+)(?: *; (?\w+))?/) do |caps| if in_values?(caps[:name]) && !in_values?(caps[:alias1]) @aliases << [caps[:alias1], caps[:name]] end if in_values?(caps[:name]) && caps[:alias2] && !in_values?(caps[:alias2]) @aliases << [caps[:alias2], caps[:name]] end end File.write(RegexpPropertyValues::ALIASES_PATH, @aliases.sort.map { |pair| pair.join(';') }.join("\n")) end def in_values?(string) @values.any? { |value| value.casecmp?(string) } end def scan(file, pattern) path = File.join(TMP_DIR, file) File.read(path).scan(pattern) { yield(Regexp.last_match) } end def remove_tmp_dir FileUtils.rm_rf(TMP_DIR) end def print_stats print "\nFetched #{@values.size} values and #{@aliases.size} aliases.\n\n" end end end regexp_property_values-1.0.0/lib/regexp_property_values/value/0000755000004100000410000000000013641343655025066 5ustar www-datawww-dataregexp_property_values-1.0.0/lib/regexp_property_values/value/ruby_fallback.rb0000644000004100000410000000115713641343655030217 0ustar www-datawww-datamodule RegexpPropertyValues class Value module RubyFallback def matched_characters matched_codepoints.map { |cp| cp.chr('utf-8') } end def matched_codepoints # turns out scanning one big string is the least slow way to do this @@test_str ||= (0..0xD7FF).map { |cp| cp.chr('utf-8') }.join << (0xE000..0x10FFFF).map { |cp| cp.chr('utf-8') }.join @@test_str.scan(regexp).flat_map(&:codepoints) end def matched_ranges require 'range_compressor' RangeCompressor.compress(matched_codepoints) end end end end regexp_property_values-1.0.0/lib/regexp_property_values/value/shared_methods.rb0000644000004100000410000000261613641343655030411 0ustar www-datawww-datamodule RegexpPropertyValues class Value module SharedMethods attr_reader :name def initialize(name) @name = name end def supported_by_current_ruby? !!regexp rescue false end def ==(other) identifier == other.identifier end alias eql? == def hash @hash ||= identifier.hash end def identifier @identifier ||= name.to_s.downcase.gsub(/[^0-9a-z=.]/, '') end alias to_s identifier def full_name (original = find_original) ? original.name : raise_unknown_error end def character_set require 'character_set' CharacterSet.from_ranges(*matched_ranges) end private def regexp @regexp ||= /\p{#{identifier}}/u rescue RegexpError, SyntaxError raise_unsupported_or_unknown_error end def find_original RegexpPropertyValues.all.find { |orig| orig.eql?(self) } || RegexpPropertyValues.alias_hash[self] end def raise_unsupported_or_unknown_error find_original ? raise_unsupported_error : raise_unknown_error end def raise_unsupported_error raise Error, "Property name `#{name}` is known, but not in this Ruby" end def raise_unknown_error raise Error, "Property name `#{name}` is not known in any Ruby" end end end end regexp_property_values-1.0.0/lib/regexp_property_values/value/ext_adapter.rb0000644000004100000410000000064113641343655027714 0ustar www-datawww-datamodule RegexpPropertyValues class Value module ExtAdapter def matched_characters matched_codepoints.map { |cp| cp.chr('utf-8') } end def matched_codepoints matched_ranges.flat_map(&:to_a) end def matched_ranges OnigRegexpPropertyHelper.matched_ranges(name) rescue ArgumentError raise_unsupported_or_unknown_error end end end end regexp_property_values-1.0.0/lib/values0000644000004100000410000002155213641343655020364 0ustar www-datawww-dataASCII ASCII_Hex_Digit Adlam Age=1.1 Age=10.0 Age=11.0 Age=12.0 Age=12.1 Age=2.0 Age=2.1 Age=3.0 Age=3.1 Age=3.2 Age=4.0 Age=4.1 Age=5.0 Age=5.1 Age=5.2 Age=6.0 Age=6.1 Age=6.2 Age=6.3 Age=7.0 Age=8.0 Age=9.0 Ahom Alnum Alpha Alphabetic Anatolian_Hieroglyphs Any Arabic Armenian Assigned Avestan Balinese Bamum Bassa_Vah Batak Bengali Bhaiksuki Bidi_Control Blank Bopomofo Brahmi Braille Buginese Buhid Canadian_Aboriginal Carian Case_Ignorable Cased Cased_Letter Caucasian_Albanian Chakma Cham Changes_When_Casefolded Changes_When_Casemapped Changes_When_Lowercased Changes_When_Titlecased Changes_When_Uppercased Cherokee Close_Punctuation Cntrl Common Connector_Punctuation Control Coptic Cuneiform Currency_Symbol Cypriot Cyrillic Dash Dash_Punctuation Decimal_Number Default_Ignorable_Code_Point Deprecated Deseret Devanagari Diacritic Digit Dogra Duployan Egyptian_Hieroglyphs Elbasan Elymaic Emoji Emoji_Component Emoji_Modifier Emoji_Modifier_Base Emoji_Presentation Enclosing_Mark Ethiopic Extender Final_Punctuation Format Georgian Glagolitic Gothic Grantha Graph Grapheme_Base Grapheme_Extend Grapheme_Link Greek Gujarati Gunjala_Gondi Gurmukhi Han Hangul Hanifi_Rohingya Hanunoo Hatran Hebrew Hex_Digit Hiragana Hyphen IDS_Binary_Operator IDS_Trinary_Operator ID_Continue ID_Start Ideographic Imperial_Aramaic In_Adlam In_Aegean_Numbers In_Ahom In_Alchemical_Symbols In_Alphabetic_Presentation_Forms In_Anatolian_Hieroglyphs In_Ancient_Greek_Musical_Notation In_Ancient_Greek_Numbers In_Ancient_Symbols In_Arabic In_Arabic_Extended_A In_Arabic_Mathematical_Alphabetic_Symbols In_Arabic_Presentation_Forms_A In_Arabic_Presentation_Forms_B In_Arabic_Supplement In_Armenian In_Arrows In_Avestan In_Balinese In_Bamum In_Bamum_Supplement In_Basic_Latin In_Bassa_Vah In_Batak In_Bengali In_Bhaiksuki In_Block_Elements In_Bopomofo In_Bopomofo_Extended In_Box_Drawing In_Brahmi In_Braille_Patterns In_Buginese In_Buhid In_Byzantine_Musical_Symbols In_CJK_Compatibility In_CJK_Compatibility_Forms In_CJK_Compatibility_Ideographs In_CJK_Compatibility_Ideographs_Supplement In_CJK_Radicals_Supplement In_CJK_Strokes In_CJK_Symbols_and_Punctuation In_CJK_Unified_Ideographs In_CJK_Unified_Ideographs_Extension_A In_CJK_Unified_Ideographs_Extension_B In_CJK_Unified_Ideographs_Extension_C In_CJK_Unified_Ideographs_Extension_D In_CJK_Unified_Ideographs_Extension_E In_CJK_Unified_Ideographs_Extension_F In_Carian In_Caucasian_Albanian In_Chakma In_Cham In_Cherokee In_Cherokee_Supplement In_Chess_Symbols In_Combining_Diacritical_Marks In_Combining_Diacritical_Marks_Extended In_Combining_Diacritical_Marks_Supplement In_Combining_Diacritical_Marks_for_Symbols In_Combining_Half_Marks In_Common_Indic_Number_Forms In_Control_Pictures In_Coptic In_Coptic_Epact_Numbers In_Counting_Rod_Numerals In_Cuneiform In_Cuneiform_Numbers_and_Punctuation In_Currency_Symbols In_Cypriot_Syllabary In_Cyrillic In_Cyrillic_Extended_A In_Cyrillic_Extended_B In_Cyrillic_Extended_C In_Cyrillic_Supplement In_Deseret In_Devanagari In_Devanagari_Extended In_Dingbats In_Dogra In_Domino_Tiles In_Duployan In_Early_Dynastic_Cuneiform In_Egyptian_Hieroglyph_Format_Controls In_Egyptian_Hieroglyphs In_Elbasan In_Elymaic In_Emoticons In_Enclosed_Alphanumeric_Supplement In_Enclosed_Alphanumerics In_Enclosed_CJK_Letters_and_Months In_Enclosed_Ideographic_Supplement In_Ethiopic In_Ethiopic_Extended In_Ethiopic_Extended_A In_Ethiopic_Supplement In_General_Punctuation In_Geometric_Shapes In_Geometric_Shapes_Extended In_Georgian In_Georgian_Extended In_Georgian_Supplement In_Glagolitic In_Glagolitic_Supplement In_Gothic In_Grantha In_Greek_Extended In_Greek_and_Coptic In_Gujarati In_Gunjala_Gondi In_Gurmukhi In_Halfwidth_and_Fullwidth_Forms In_Hangul_Compatibility_Jamo In_Hangul_Jamo In_Hangul_Jamo_Extended_A In_Hangul_Jamo_Extended_B In_Hangul_Syllables In_Hanifi_Rohingya In_Hanunoo In_Hatran In_Hebrew In_High_Private_Use_Surrogates In_High_Surrogates In_Hiragana In_IPA_Extensions In_Ideographic_Description_Characters In_Ideographic_Symbols_and_Punctuation In_Imperial_Aramaic In_Indic_Siyaq_Numbers In_Inscriptional_Pahlavi In_Inscriptional_Parthian In_Javanese In_Kaithi In_Kana_Extended_A In_Kana_Supplement In_Kanbun In_Kangxi_Radicals In_Kannada In_Katakana In_Katakana_Phonetic_Extensions In_Kayah_Li In_Kharoshthi In_Khmer In_Khmer_Symbols In_Khojki In_Khudawadi In_Lao In_Latin_1_Supplement In_Latin_Extended_A In_Latin_Extended_Additional In_Latin_Extended_B In_Latin_Extended_C In_Latin_Extended_D In_Latin_Extended_E In_Lepcha In_Letterlike_Symbols In_Limbu In_Linear_A In_Linear_B_Ideograms In_Linear_B_Syllabary In_Lisu In_Low_Surrogates In_Lycian In_Lydian In_Mahajani In_Mahjong_Tiles In_Makasar In_Malayalam In_Mandaic In_Manichaean In_Marchen In_Masaram_Gondi In_Mathematical_Alphanumeric_Symbols In_Mathematical_Operators In_Mayan_Numerals In_Medefaidrin In_Meetei_Mayek In_Meetei_Mayek_Extensions In_Mende_Kikakui In_Meroitic_Cursive In_Meroitic_Hieroglyphs In_Miao In_Miscellaneous_Mathematical_Symbols_A In_Miscellaneous_Mathematical_Symbols_B In_Miscellaneous_Symbols In_Miscellaneous_Symbols_and_Arrows In_Miscellaneous_Symbols_and_Pictographs In_Miscellaneous_Technical In_Modi In_Modifier_Tone_Letters In_Mongolian In_Mongolian_Supplement In_Mro In_Multani In_Musical_Symbols In_Myanmar In_Myanmar_Extended_A In_Myanmar_Extended_B In_NKo In_Nabataean In_Nandinagari In_New_Tai_Lue In_Newa In_No_Block In_Number_Forms In_Nushu In_Nyiakeng_Puachue_Hmong In_Ogham In_Ol_Chiki In_Old_Hungarian In_Old_Italic In_Old_North_Arabian In_Old_Permic In_Old_Persian In_Old_Sogdian In_Old_South_Arabian In_Old_Turkic In_Optical_Character_Recognition In_Oriya In_Ornamental_Dingbats In_Osage In_Osmanya In_Ottoman_Siyaq_Numbers In_Pahawh_Hmong In_Palmyrene In_Pau_Cin_Hau In_Phags_pa In_Phaistos_Disc In_Phoenician In_Phonetic_Extensions In_Phonetic_Extensions_Supplement In_Playing_Cards In_Private_Use_Area In_Psalter_Pahlavi In_Rejang In_Rumi_Numeral_Symbols In_Runic In_Samaritan In_Saurashtra In_Sharada In_Shavian In_Shorthand_Format_Controls In_Siddham In_Sinhala In_Sinhala_Archaic_Numbers In_Small_Form_Variants In_Small_Kana_Extension In_Sogdian In_Sora_Sompeng In_Soyombo In_Spacing_Modifier_Letters In_Specials In_Sundanese In_Sundanese_Supplement In_Superscripts_and_Subscripts In_Supplemental_Arrows_A In_Supplemental_Arrows_B In_Supplemental_Arrows_C In_Supplemental_Mathematical_Operators In_Supplemental_Punctuation In_Supplemental_Symbols_and_Pictographs In_Supplementary_Private_Use_Area_A In_Supplementary_Private_Use_Area_B In_Sutton_SignWriting In_Syloti_Nagri In_Symbols_and_Pictographs_Extended_A In_Syriac In_Syriac_Supplement In_Tagalog In_Tagbanwa In_Tags In_Tai_Le In_Tai_Tham In_Tai_Viet In_Tai_Xuan_Jing_Symbols In_Takri In_Tamil In_Tamil_Supplement In_Tangut In_Tangut_Components In_Telugu In_Thaana In_Thai In_Tibetan In_Tifinagh In_Tirhuta In_Transport_and_Map_Symbols In_Ugaritic In_Unified_Canadian_Aboriginal_Syllabics In_Unified_Canadian_Aboriginal_Syllabics_Extended In_Vai In_Variation_Selectors In_Variation_Selectors_Supplement In_Vedic_Extensions In_Vertical_Forms In_Wancho In_Warang_Citi In_Yi_Radicals In_Yi_Syllables In_Yijing_Hexagram_Symbols In_Zanabazar_Square Inherited Initial_Punctuation Inscriptional_Pahlavi Inscriptional_Parthian Javanese Join_Control Kaithi Kannada Katakana Kayah_Li Kharoshthi Khmer Khojki Khudawadi Lao Latin Lepcha Letter Letter_Number Limbu Line_Separator Linear_A Linear_B Lisu Logical_Order_Exception Lower Lowercase Lowercase_Letter Lycian Lydian Mahajani Makasar Malayalam Mandaic Manichaean Marchen Mark Masaram_Gondi Math Math_Symbol Medefaidrin Meetei_Mayek Mende_Kikakui Meroitic_Cursive Meroitic_Hieroglyphs Miao Modi Modifier_Letter Modifier_Symbol Mongolian Mro Multani Myanmar Nabataean Nandinagari New_Tai_Lue Newa Newline Nko Noncharacter_Code_Point Nonspacing_Mark Number Nushu Nyiakeng_Puachue_Hmong Ogham Ol_Chiki Old_Hungarian Old_Italic Old_North_Arabian Old_Permic Old_Persian Old_Sogdian Old_South_Arabian Old_Turkic Open_Punctuation Oriya Osage Osmanya Other Other_Alphabetic Other_Default_Ignorable_Code_Point Other_Grapheme_Extend Other_ID_Continue Other_ID_Start Other_Letter Other_Lowercase Other_Math Other_Number Other_Punctuation Other_Symbol Other_Uppercase Pahawh_Hmong Palmyrene Paragraph_Separator Pattern_Syntax Pattern_White_Space Pau_Cin_Hau Phags_Pa Phoenician Prepended_Concatenation_Mark Print Private_Use Psalter_Pahlavi Punct Punctuation Quotation_Mark Radical Regional_Indicator Rejang Runic Samaritan Saurashtra Sentence_Terminal Separator Sharada Shavian Siddham SignWriting Sinhala Soft_Dotted Sogdian Sora_Sompeng Soyombo Space Space_Separator Spacing_Mark Sundanese Surrogate Syloti_Nagri Symbol Syriac Tagalog Tagbanwa Tai_Le Tai_Tham Tai_Viet Takri Tamil Tangut Telugu Terminal_Punctuation Thaana Thai Tibetan Tifinagh Tirhuta Titlecase_Letter Ugaritic Unassigned Unified_Ideograph Unknown Upper Uppercase Uppercase_Letter Vai Variation_Selector Wancho Warang_Citi White_Space Word XDigit XID_Continue XID_Start XPosixPunct Yi Zanabazar_Squareregexp_property_values-1.0.0/Gemfile0000644000004100000410000000026113641343655017661 0ustar www-datawww-datasource "https://rubygems.org" git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } # Specify your gem's dependencies in regexp_property_values.gemspec gemspec regexp_property_values-1.0.0/ext/0000755000004100000410000000000013641343655017167 5ustar www-datawww-dataregexp_property_values-1.0.0/ext/regexp_property_values/0000755000004100000410000000000013641343655024004 5ustar www-datawww-dataregexp_property_values-1.0.0/ext/regexp_property_values/regexp_property_values.c0000644000004100000410000000304013641343655030762 0ustar www-datawww-data#include "ruby.h" #include "ruby/encoding.h" #include "ruby/oniguruma.h" // still in recent rubies f. backwards compatibility static int prop_name_to_ctype(char *name, rb_encoding *enc) { UChar *uname; int ctype; uname = (UChar *)name; ctype = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, uname, uname + strlen(name)); if (ctype < 0) rb_raise(rb_eArgError, "Unknown property name `%s`", name); return ctype; } VALUE onig_ranges_to_rb(const OnigCodePoint *onig_ranges) { unsigned int range_count, i; VALUE result, sub_range; range_count = onig_ranges[0]; result = rb_ary_new2(range_count); // rb_ary_new_capa not avail. in Ruby 2.0 for (i = 0; i < range_count; i++) { sub_range = rb_range_new(INT2FIX(onig_ranges[(i * 2) + 1]), INT2FIX(onig_ranges[(i * 2) + 2]), 0); rb_ary_store(result, i, sub_range); } return result; } VALUE rb_prop_ranges(char *name) { int ctype; const OnigCodePoint *onig_ranges; OnigCodePoint sb_out; rb_encoding *enc; enc = rb_utf8_encoding(); ctype = prop_name_to_ctype(name, enc); ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &onig_ranges); return onig_ranges_to_rb(onig_ranges); } VALUE method_matched_ranges(VALUE self, VALUE arg) { char *prop_name; prop_name = StringValueCStr(arg); return rb_prop_ranges(prop_name); } void Init_regexp_property_values() { VALUE module; module = rb_define_module("OnigRegexpPropertyHelper"); rb_define_singleton_method(module, "matched_ranges", method_matched_ranges, 1); } regexp_property_values-1.0.0/ext/regexp_property_values/extconf.rb0000644000004100000410000000012413641343655025774 0ustar www-datawww-datarequire 'mkmf' name = 'regexp_property_values' create_makefile("#{name}/#{name}") regexp_property_values-1.0.0/LICENSE.txt0000644000004100000410000000207413641343655020215 0ustar www-datawww-dataThe MIT License (MIT) Copyright (c) 2018 Jannosch Müller Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.