pax_global_header00006660000000000000000000000064137371060750014523gustar00rootroot0000000000000052 comment=7b6a723b3d5d9d72e7c445ca690b99607272400a babosa-1.0.4/000077500000000000000000000000001373710607500127545ustar00rootroot00000000000000babosa-1.0.4/.gemtest000066400000000000000000000000001373710607500144130ustar00rootroot00000000000000babosa-1.0.4/.gitignore000066400000000000000000000000461373710607500147440ustar00rootroot00000000000000Gemfile Gemfile.lock coverage pkg .rbxbabosa-1.0.4/.travis.yml000066400000000000000000000001471373710607500150670ustar00rootroot00000000000000language: ruby sudo: false rvm: - 2.6.0 - 2.5.0 - 2.4.0 - jruby branches: only: - master babosa-1.0.4/Changelog.md000066400000000000000000000036221373710607500151700ustar00rootroot00000000000000# Babosa Changelog ## 1.0.4 * Fix nil being cast to frozen string (https://github.com/norman/babosa/pull/52) ## 1.0.3 * Fix Active Support 6 deprecations (https://github.com/norman/babosa/pull/50) ## 1.0.2 * Fix regression in ActiveSupport UTF8 proxy. ## 1.0.1 * Fix error with tidy_bytes on Rubinius. * Simplify Active Support UTF8 proxy. * Fix `allow_bangs` argument to to_ruby_method being silently ignored. * Raise error when generating an impossible Ruby method name. ## 1.0.0 * Adopt semantic versioning. * When using Active Support, require 3.2 or greater. * Require Ruby 2.0 or greater. * Fix Ruby warnings. * Improve support for Ukrainian. * Support some additional punctuation characters used by Chinese and others. * Add Polish spec. * Use native Unicode normalization on Ruby 2.2 in UTF8::DumbProxy. * Invoke Ruby-native upcase/downcase in UTF8::DumbProxy. * Proxy `tidy_bytes` method to Active Support when possible. * Remove SlugString constant. ## 0.3.11 * Add support for Vietnamese. ## 0.3.10 * Fix Macedonian "S/S". Don't `include JRuby` unnecessarily. ## 0.3.9 * Add missing Greek vowels with diaeresis. ## 0.3.8 * Correct and improve Macedonian support. 
## 0.3.7 * Fix compatibility with Ruby 1.8.7. * Add Swedish support. ## 0.3.6 * Allow multiple transliterators. * Add Greek support. ## 0.3.5 * Don't strip underscores from identifiers. ## 0.3.4 * Add Romanian support. ## 0.3.3 * Add Norwegian support. ## 0.3.2 * Improve Macedonian support. ## 0.3.1 * Small fixes to Cyrillic. ## 0.3.0 * Cyrillic support. * Improve support for various Unicode spaces and dashes. ## 0.2.2 * Fix for "smart" quote handling. ## 0.2.1 * Implement #empty? for compatiblity with Active Support's #blank?. ## 0.2.0 * Add support for Danish. * Add method to generate Ruby identifiers. * Improve performance. ## 0.1.1 * Add support for Serbian. ## 0.1.0 * Initial extraction from FriendlyId. babosa-1.0.4/Gemfile000066400000000000000000000002711373710607500142470ustar00rootroot00000000000000# Note that the Gemfile is here so you can run the coverage Rake task, and # easily install gems. However, Babosa does not use Bundler internally. source 'https://rubygems.org' gemspec babosa-1.0.4/MIT-LICENSE000066400000000000000000000020411373710607500144050ustar00rootroot00000000000000Copyright (c) 2010 Norman Clarke Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. babosa-1.0.4/README.md000066400000000000000000000223061373710607500142360ustar00rootroot00000000000000# Babosa [![Build Status](https://travis-ci.org/norman/babosa.png?branch=master)](https://travis-ci.org/norman/babosa) Babosa is a library for creating human-friendly identifiers, aka "slugs". It can also be useful for normalizing and sanitizing data. It is an extraction and improvement of the string code from [FriendlyId](http://github.com/norman/friendly_id). I have released this as a separate library to help developers who want to create libraries similar to FriendlyId. ## Features / Usage ### Transliterate UTF-8 characters to ASCII "Gölcük, Turkey".to_slug.transliterate.to_s #=> "Golcuk, Turkey" ### Locale sensitive transliteration, with support for many languages "Jürgen Müller".to_slug.transliterate.to_s #=> "Jurgen Muller" "Jürgen Müller".to_slug.transliterate(:german).to_s #=> "Juergen Mueller" Currently supported languages include: * Bulgarian * Danish * German * Greek * Macedonian * Norwegian * Romanian * Russian * Serbian * Spanish * Swedish * Ukrainian I'll gladly accept contributions from fluent speakers to support more languages. ### Strip non-ASCII characters "Gölcük, Turkey".to_slug.to_ascii.to_s #=> "Glck, Turkey" ### Truncate by characters "üüü".to_slug.truncate(2).to_s #=> "üü" ### Truncate by bytes This can be useful to ensure the generated slug will fit in a database column whose length is limited by bytes rather than UTF-8 characters. 
"üüü".to_slug.truncate_bytes(2).to_s #=> "ü" ### Remove punctuation chars "this is, um, **really** cool, huh?".to_slug.word_chars.to_s #=> "this is um really cool huh" ### All-in-one "Gölcük, Turkey".to_slug.normalize.to_s #=> "golcuk-turkey" ### Other stuff #### Using Babosa With FriendlyId 4 require "babosa" class Person < ActiveRecord::Base friendly_id :name, use: :slugged def normalize_friendly_id(input) input.to_s.to_slug.normalize(transliterations: :russian).to_s end end #### Pedantic UTF-8 support Babosa goes out of its way to handle [nasty Unicode issues you might never think you would have](https://github.com/norman/enc/blob/master/equivalence.rb) by checking, sanitizing and normalizing your string input. It will automatically use whatever Unicode library you have loaded before Babosa, or fall back to a simple built-in library. Supported Unicode libraries include: * Java (only on JRuby of course) * Active Support * [Unicode](https://github.com/blackwinter/unicode) * Built-in This built-in module is much faster than Active Support but much slower than Java or Unicode. It can only do **very** naive Unicode composition to ensure that, for example, "é" will always be composed to a single codepoint rather than an "e" and a "´" - making it safe to use as a hash key. But seriously - save yourself the headache and install a real Unicode library. If you are using Babosa with a language that uses the Cyrillic alphabet, Babosa requires either Unicode, Active Support or Java. #### Ruby Method Names Babosa can also generate strings for Ruby method names. (Yes, Ruby 1.9 can use UTF-8 chars in method names, but you may not want to): "this is a method".to_slug.to_ruby_method! #=> this_is_a_method "über cool stuff!".to_slug.to_ruby_method! #=> uber_cool_stuff! # You can also disallow trailing punctuation chars "über cool stuff!".to_slug.to_ruby_method(false) #=> uber_cool_stuff #### Easy to Extend You can add custom transliterators for your language with very little code. 
For example here's the transliterator for German: # encoding: utf-8 module Babosa module Transliterator class German < Latin APPROXIMATIONS = { "ä" => "ae", "ö" => "oe", "ü" => "ue", "Ä" => "Ae", "Ö" => "Oe", "Ü" => "Ue" } end end end And a spec (you can use this as a template): # encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::German do let(:t) { described_class.instance } it_behaves_like "a latin transliterator" it "should transliterate Eszett" do t.transliterate("ß").should eql("ss") end it "should transliterate vowels with umlauts" do t.transliterate("üöä").should eql("ueoeae") end end ### Rails 3.x and higher Some of Babosa's functionality was added to Active Support 3.0.0. Babosa now differs from ActiveSupport primarily in that it supports non-Latin strings by default, and has per-locale ASCII transliterations already baked-in. If you are considering using Babosa with Rails, you may want to first take a look at Active Support's [transliterate](http://api.rubyonrails.org/classes/ActiveSupport/Inflector.html#method-i-transliterate) and [parameterize](http://api.rubyonrails.org/classes/ActiveSupport/Inflector.html#method-i-parameterize) to see if they suit your needs. ### Babosa vs. Stringex Babosa provides much of the functionality provided by the [Stringex](https://github.com/rsl/stringex) gem, but in the subjective opinion of the author, is for most use cases a better choice. #### Fewer Features Stringex offers functionality for storing slugs in an Active Record model, like a simple version of [FriendlyId](http://github.com/norman/friendly_id), in addition to string processing. Babosa only does string processing. #### Less Aggressive Unicode Transliteration Stringex uses an aggressive Unicode to ASCII mapping which outputs gibberish for almost anything but Western European languages and Mandarin Chinese. 
Babosa supports only languages for which fluent speakers have provided transliterations, to ensure that the output makes sense to users. #### Unicode Support Stringex does no Unicode normalization or validation before transliterating strings, so if you pass in strings with encoding errors or with different Unicode normalizations, you'll get unpredictable results. #### No Locale Assumptions Babosa avoids making assumptions about locales like Stringex does, so it doesn't offer transliterations like this out of the box: "$12 worth of Ruby power".to_url => "12-dollars-worth-of-ruby-power" This is because the symbol "$" is used in many Latin American countries for the peso. Stringex does this in many places, for example, transliterating all Han characters into Pinyin, effectively treating Japanese text as if it were Mandarin Chinese. ### More info Please see the [API docs](http://rubydoc.info/github/norman/babosa/master/frames) and source code for more info. ## Getting it Babosa can be installed via Rubygems: gem install babosa You can get the source code from its [Github repository](http://github.com/norman/babosa). Babosa is tested to be compatible with Ruby 2.x, JRuby 1.7+, and Rubinius 2.x. It's probably compatible with other Rubies as well. ## Reporting bugs Please use Babosa's [Github issue tracker](http://github.com/norman/babosa/issues). ## Misc "Babosa" means slug in Spanish. ## Author [Norman Clarke](http://njclarke.com) ## Contributors Many thanks to the following people for their help: * [Dmitry A. 
Ilyashevich](https://github.com/dmitry-ilyashevich) - Deprecation fixes * [anhkind](https://github.com/anhkind) - Vietnamese support * [Martins Zakis](https://github.com/martins) - Bug fixes * [Vassilis Rodokanakis](https://github.com/vrodokanakis) - Greek support * [Peco Danajlovski](https://github.com/Vortex) - Macedonian support * [Philip Arndt](https://github.com/parndt) - Bug fixes * [Jonas Forsberg](https://github.com/himynameisjonas) - Swedish support * [Jaroslav Kalistsuk](https://github.com/jarosan) - Greek support * [Steven Heidel](https://github.com/stevenheidel) - Bug fixes * [Edgars Beigarts](https://github.com/ebeigarts) - Support for multiple transliterators * [Tiberiu C. Turbureanu](https://gitorious.org/~tct) - Romanian support * [Kim Joar Bekkelund](https://github.com/kjbekkelund) - Norwegian support * [Alexey Shkolnikov](https://github.com/grlm) - Russian support * [Martin Petrov](https://github.com/martin-petrov) - Bulgarian support * [Molte Emil Strange Andersen](https://github.com/molte) - Danish support * [Milan Dobrota](https://github.com/milandobrota) - Serbian support ## Copyright Copyright (c) 2010-2013 Norman Clarke Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. babosa-1.0.4/Rakefile000066400000000000000000000012251373710607500144210ustar00rootroot00000000000000require "rubygems" require "rake/testtask" require "rake/clean" require "rubygems/package_task" task :default => :spec task :test => :spec CLEAN << "pkg" << "doc" << "coverage" << ".yardoc" begin require "yard" YARD::Rake::YardocTask.new do |t| t.options = ["--output-dir=doc"] end rescue LoadError end begin desc "Run SimpleCov" task :coverage do ENV["COV"] = "true" Rake::Task["spec"].execute end rescue LoadError end gemspec = File.expand_path("../babosa.gemspec", __FILE__) if File.exist? gemspec Gem::PackageTask.new(eval(File.read(gemspec))) { |pkg| } end require 'rspec/core/rake_task' RSpec::Core::RakeTask.new(:spec) babosa-1.0.4/babosa.gemspec000066400000000000000000000021171373710607500155510ustar00rootroot00000000000000require File.expand_path("../lib/babosa/version", __FILE__) spec = Gem::Specification.new do |s| s.name = 'babosa' s.version = Babosa::Version::STRING s.author = 'Norman Clarke' s.email = 'norman@njclarke.com' s.homepage = 'http://github.com/norman/babosa' s.required_ruby_version = '>= 2.0.0' s.summary = 'A library for creating slugs.' s.description = <<-EOM A library for creating slugs. Babosa an extraction and improvement of the string code from FriendlyId, intended to help developers create similar libraries or plugins. 
EOM s.test_files = Dir.glob 'test/**/*_test.rb' s.files = Dir['lib/**/*.rb', 'lib/**/*.rake', '*.md', 'MIT-LICENSE', 'Rakefile', 'init.rb', 'generators/**/*.*', 'spec/**/*.*', '.gemtest'] s.add_development_dependency 'activesupport', '>= 3.2.0' s.add_development_dependency 'rspec', '>= 3.7.0' s.add_development_dependency 'simplecov' s.add_development_dependency 'rake' s.add_development_dependency 'unicode' end babosa-1.0.4/bench.rb000066400000000000000000000011301373710607500143530ustar00rootroot00000000000000# encoding: utf-8 require "benchmark" require "rubygems" require "bundler/setup" require "babosa" def sample "Ja, żołnierz Wojska Polskiego, przysięgam służyć wiernie Rzeczypospolitej Polskiej".to_slug end N = 1000 Benchmark.bmbm do |x| x.report 'Truncate bytes' do N.times do sample.truncate_bytes(20) end end x.report 'Truncate chars' do N.times do sample.truncate(20) end end x.report 'Transliterate' do N.times do sample.transliterate end end x.report 'Strip non-ASCII' do N.times do sample.to_ascii end end end babosa-1.0.4/lib/000077500000000000000000000000001373710607500135225ustar00rootroot00000000000000babosa-1.0.4/lib/babosa.rb000066400000000000000000000004341373710607500152770ustar00rootroot00000000000000module Babosa def self.jruby15? JRUBY_VERSION >= "1.5" rescue false end end class String def to_identifier Babosa::Identifier.new self end alias to_slug to_identifier end require "babosa/transliterator/base" require "babosa/utf8/proxy" require "babosa/identifier" babosa-1.0.4/lib/babosa/000077500000000000000000000000001373710607500147515ustar00rootroot00000000000000babosa-1.0.4/lib/babosa/identifier.rb000066400000000000000000000231171373710607500174240ustar00rootroot00000000000000# encoding: utf-8 module Babosa # Codepoints for characters that will be deleted by +#word_chars!+. 
STRIPPABLE = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 58, 59, 60, 61, 62, 63, 64, 91, 92, 93, 94, 96, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166, 167, 168, 169, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183, 184, 185, 187, 188, 189, 190, 191, 215, 247, 8203, 8204, 8205, 8239, 65279] # This class provides some string-manipulation methods specific to slugs. # # Note that this class includes many "bang methods" such as {#clean!} and # {#normalize!} that perform actions on the string in-place. Each of these # methods has a corresponding "bangless" method (i.e., +Identifier#clean!+ # and +Identifier#clean+) which does not appear in the documentation because # it is generated dynamically. # # All of the bang methods return an instance of String, while the bangless # versions return an instance of Babosa::Identifier, so that calls to methods # specific to this class can be chained: # # string = Identifier.new("hello world") # string.with_separators! # => "hello-world" # string.with_separators # => # # @see http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec Unicode character table class Identifier Error = Class.new(StandardError) attr_reader :wrapped_string alias to_s wrapped_string @@utf8_proxy = if Babosa.jruby15? UTF8::JavaProxy elsif defined? Unicode::VERSION UTF8::UnicodeProxy elsif defined? ActiveSupport UTF8::ActiveSupportProxy else UTF8::DumbProxy end # Return the proxy used for UTF-8 support. # @see Babosa::UTF8::Proxy def self.utf8_proxy @@utf8_proxy end # Set a proxy object used for UTF-8 support. 
# @see Babosa::UTF8::Proxy
def self.utf8_proxy=(obj)
  @@utf8_proxy = obj
end

# Forwards any message this class does not define itself to the wrapped
# String, passing arguments and block through unchanged.
def method_missing(symbol, *args, &block)
  @wrapped_string.__send__(symbol, *args, &block)
end

# Wraps +string+ (converted with +to_s+), then runs {#tidy_bytes!} and
# {#normalize_utf8!} on it.
# @param string [#to_s] The string to use as the basis of the Identifier.
def initialize(string)
  @wrapped_string = string.to_s
  tidy_bytes!
  normalize_utf8!
end

# Loose equality: compares the +to_s+ form of both sides, so an Identifier
# equals a String (or another Identifier) with the same text.
def ==(value)
  @wrapped_string.to_s == value.to_s
end

# Strict equality: compares the wrapped String directly against +value+
# without converting +value+ first.
def eql?(value)
  @wrapped_string == value
end

def empty?
  # included to make this class :respond_to? :empty for compatibility with Active Support's
  # #blank?
  @wrapped_string.empty?
end

# Approximate an ASCII string. With no arguments the +:latin+ transliterator
# is used, which works only for Western strings using characters that are
# Roman-alphabet characters + diacritics. Characters the transliterator has
# no approximation for are left unmodified.
#
#   string = Identifier.new "Łódź, Poland"
#   string.transliterate                 # => "Lodz, Poland"
#   string = Identifier.new "日本"
#   string.transliterate                 # => "日本"
#
# You can pass one or more transliterator names as arguments; each is looked
# up with +Transliterator.get+ and applied in the order given. This allows
# for contextual approximations. Various languages are supported, you can see
# which ones by looking at the source of {Babosa::Transliterator::Base}.
#
#   string = Identifier.new "Jürgen Müller"
#   string.transliterate                 # => "Jurgen Muller"
#   string.transliterate :german         # => "Juergen Mueller"
#   string = Identifier.new "¡Feliz año!"
#   string.transliterate                 # => "¡Feliz ano!"
#   string.transliterate :spanish        # => "¡Feliz anio!"
#
# The approximations are stored in a hash, which you can modify if you choose:
#
#   # Make Spanish use "nh" for "ñ"
#   Babosa::Transliterator::Spanish::APPROXIMATIONS["ñ"] = "nh"
#
# Notice that this method does not simply convert to ASCII; if you want
# to remove non-ASCII characters such as "¡" and "¿", use {#to_ascii!}:
#
#   string.transliterate!(:spanish)       # => "¡Feliz anio!"
#   string.transliterate!                 # => "¡Feliz anio!"
#
# @param kinds [Array<Symbol, nil>] transliterator names; nils are ignored
# @return String
def transliterate!(*kinds)
  kinds.compact!
  kinds = [:latin] if kinds.empty?
  kinds.each do |kind|
    transliterator = Transliterator.get(kind).instance
    @wrapped_string = transliterator.transliterate(@wrapped_string)
  end
  @wrapped_string
end

# Converts dashes to spaces, collapses runs of spaces into a single space,
# and removes leading and trailing whitespace.
# @return String
def clean!
  @wrapped_string = @wrapped_string.gsub("-", " ").squeeze(" ").strip
end

# Remove any non-word characters. For this library's purposes, this means
# deleting every codepoint listed in +Babosa::STRIPPABLE+ (control
# characters and punctuation); letters, numbers, spaces, underscores, line
# feeds and carriage returns are not in that list and are kept.
# @return String
def word_chars!
  @wrapped_string = (unpack("U*") - Babosa::STRIPPABLE).pack("U*")
end

# Normalize the string for use as a URL slug. Note that in this context,
# +normalize+ means, strip, remove non-letters/numbers, downcasing,
# truncating to +:max_length+ bytes and converting whitespace to the
# separator.
#
# Recognized options (merged over {#default_normalize_options}):
# * +:transliterate+ - +true+ to transliterate using +:transliterations+,
#   a kind (or array of kinds) to transliterate with those instead, or a
#   falsy value to skip transliteration.
# * +:transliterations+ - kind(s) used when +:transliterate+ is +true+.
# * +:to_ascii+ - when truthy, non-ASCII characters are deleted.
# * +:max_length+ - maximum size of the result in bytes.
# * +:separator+ - string that replaces each whitespace character.
#
# @param options [Hash, nil]
# @return String
def normalize!(options = nil)
  options = default_normalize_options.merge(options || {})

  translit_option = options[:transliterate]
  if translit_option
    # +true+ means "use the :transliterations option"; anything else truthy
    # is itself the list of kinds.
    kinds = translit_option == true ? options[:transliterations] : translit_option
    transliterate!(*kinds)
  end

  to_ascii! if options[:to_ascii]
  clean!
  word_chars!
  clean!
  downcase!
  truncate_bytes!(options[:max_length])
  with_separators!(options[:separator])
end

# Normalize a string so that it can safely be used as a Ruby method name.
#
# The last character is treated separately from the rest of the string so
# that a trailing "!", "?" or "=" can be preserved when +allow_bangs+ is
# true. Everything before it is transliterated, reduced to ASCII word
# characters, and joined with underscores.
#
# @param allow_bangs [Boolean] keep a trailing "!", "?" or "=" if present
# @return String
# @raise [Error] if nothing is left after sanitizing
def to_ruby_method!(allow_bangs = true)
  stripped = @wrapped_string.strip

  # Slice instead of matching /\A(.+)(.)\z/: that pattern needs at least two
  # characters, so a one-character name such as "a" used to be rejected as
  # impossible even though it is a valid method name.
  leader  = stripped[0...-1].to_s
  trailer = stripped[-1].to_s.downcase

  # A backslash can never be part of a method name, so it is not whitelisted
  # alongside the punctuation Ruby permits at the end of a name.
  disallowed = allow_bangs ? /[^a-z0-9!=?]/ : /[^a-z0-9]/
  trailer = trailer.gsub(disallowed, '')

  id = Identifier.new(leader)
  id.transliterate!
  id.to_ascii!
  id.clean!
  id.word_chars!
  id.clean!

  @wrapped_string = id.to_s + trailer
  if @wrapped_string == ""
    raise Error, "Input generates impossible Ruby method name"
  end
  with_separators!("_")
end

# Delete any non-ascii characters.
# @return String
def to_ascii!
  @wrapped_string = @wrapped_string.gsub(/[^\x00-\x7f]/u, '')
end

# Truncate the string to +max+ characters.
# @example
#   "üéøá".to_identifier.truncate(3) #=> "üéø"
# @param max [Integer] maximum number of characters to keep
# @return String
def truncate!(max)
  @wrapped_string = unpack("U*")[0...max].pack("U*")
end

# Truncate the string to +max+ bytes. This can be useful for ensuring that
# a UTF-8 string will always fit into a database column with a certain max
# byte length. The resulting string may be less than +max+ if the string must
# be truncated at a multibyte character boundary.
# @example
#   "üéøá".to_identifier.truncate_bytes(3) #=> "ü"
# @param max [Integer] maximum number of bytes to keep
# @return String
def truncate_bytes!(max)
  return @wrapped_string if @wrapped_string.bytesize <= max

  used = 0
  kept = []
  unpack("U*").each do |codepoint|
    char = [codepoint].pack("U")
    used += char.bytesize
    # Stop at the first character that does not fit, so a multibyte
    # character is never split and no later character is kept in its place.
    break if used > max
    kept << char
  end
  @wrapped_string = kept.join
end

# Replaces each whitespace character with +char+ (a dash by default).
# @param char [String] the separator to insert
# @return String
def with_separators!(char = "-")
  @wrapped_string = @wrapped_string.gsub(/\s/u, char)
end

# Perform UTF-8 sensitive upcasing.
# @return String
def upcase!
  @wrapped_string = @@utf8_proxy.upcase(@wrapped_string)
end

# Perform UTF-8 sensitive downcasing.
# @return String
def downcase!
  @wrapped_string = @@utf8_proxy.downcase(@wrapped_string)
end

# Perform Unicode composition on the wrapped string.
# @return String
def normalize_utf8!
  @wrapped_string = @@utf8_proxy.normalize_utf8(@wrapped_string)
end

# Attempt to convert characters encoded using CP1252 and ISO-8859-1 to
# UTF-8.
# @return String
def tidy_bytes!
  @wrapped_string = @@utf8_proxy.tidy_bytes(@wrapped_string)
end

# Generate the "bangless" counterpart of each in-place method. Each one
# applies the bang method to a fresh Identifier via +send_to_new_instance+,
# leaving the receiver untouched.
%w[transliterate clean downcase word_chars normalize normalize_utf8
  tidy_bytes to_ascii to_ruby_method truncate truncate_bytes upcase
  with_separators].each do |method|
  class_eval(<<-EOM, __FILE__, __LINE__ + 1)
    def #{method}(*args)
      send_to_new_instance(:#{method}!, *args)
    end
  EOM
end

# An Identifier is already an identifier, so this returns the receiver.
def to_identifier
  self
end

# The default options for {#normalize!}. Override to set your own defaults.
def default_normalize_options {:transliterate => true, :max_length => 255, :separator => "-"} end alias approximate_ascii transliterate alias approximate_ascii! transliterate! alias with_dashes with_separators alias with_dashes! with_separators! alias to_slug to_identifier private # Used as the basis of the bangless methods. def send_to_new_instance(*args) id = Identifier.allocate id.instance_variable_set :@wrapped_string, to_s id.send(*args) id end end end babosa-1.0.4/lib/babosa/transliterator/000077500000000000000000000000001373710607500200265ustar00rootroot00000000000000babosa-1.0.4/lib/babosa/transliterator/base.rb000066400000000000000000000063441373710607500212740ustar00rootroot00000000000000# encoding: utf-8 require 'singleton' module Babosa module Transliterator autoload :Bulgarian, "babosa/transliterator/bulgarian" autoload :Cyrillic, "babosa/transliterator/cyrillic" autoload :Danish, "babosa/transliterator/danish" autoload :German, "babosa/transliterator/german" autoload :Hindi, "babosa/transliterator/hindi" autoload :Latin, "babosa/transliterator/latin" autoload :Macedonian, "babosa/transliterator/macedonian" autoload :Norwegian, "babosa/transliterator/norwegian" autoload :Romanian, "babosa/transliterator/romanian" autoload :Russian, "babosa/transliterator/russian" autoload :Serbian, "babosa/transliterator/serbian" autoload :Spanish, "babosa/transliterator/spanish" autoload :Swedish, "babosa/transliterator/swedish" autoload :Ukrainian, "babosa/transliterator/ukrainian" autoload :Greek, "babosa/transliterator/greek" autoload :Vietnamese, "babosa/transliterator/vietnamese" autoload :Turkish, "babosa/transliterator/turkish" def self.get(symbol) class_name = symbol.to_s.split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join const_get(class_name) end class Base include Singleton APPROXIMATIONS = { "×" => "x", "÷" => "/", "‐" => "-", "‑" => "-", "‒" => "-", "–" => "-", "—" => "-", "―" => "-", "‘" => "'", "‛" => "'", "“" => '"', "”" => '"', "„" => '"', 
"‟" => '"', '’' => "'", ',' => ",", '。' => ".", '!' => "!", '?' => '?', '、' => ',', '(' => '(', ')' => ')', '【' => '[', '】' => ']', ';' => ';', ':' => ':', '《' => '<', '》' => '>', # various kinds of space characters "\xc2\xa0" => " ", "\xe2\x80\x80" => " ", "\xe2\x80\x81" => " ", "\xe2\x80\x82" => " ", "\xe2\x80\x83" => " ", "\xe2\x80\x84" => " ", "\xe2\x80\x85" => " ", "\xe2\x80\x86" => " ", "\xe2\x80\x87" => " ", "\xe2\x80\x88" => " ", "\xe2\x80\x89" => " ", "\xe2\x80\x8a" => " ", "\xe2\x81\x9f" => " ", "\xe3\x80\x80" => " ", }.freeze attr_reader :approximations def initialize if self.class < Base @approximations = self.class.superclass.instance.approximations.dup else @approximations = {} end self.class.const_get(:APPROXIMATIONS).inject(@approximations) do |memo, object| index = object[0].unpack("U").shift value = object[1].unpack("C*") memo[index] = value.length == 1 ? value[0] : value memo end @approximations.freeze end # Accepts a single UTF-8 codepoint and returns the ASCII character code # used as the transliteration value. def [](codepoint) @approximations[codepoint] end # Transliterates a string. 
def transliterate(string) string.unpack("U*").map {|char| self[char] || char}.flatten.pack("U*") end end end end babosa-1.0.4/lib/babosa/transliterator/bulgarian.rb000066400000000000000000000010131373710607500223120ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Bulgarian < Cyrillic APPROXIMATIONS = { "Ж" => "J", "Й" => "I", "Х" => "H", "Ц" => "C", "Щ" => "Sht", "Ъ" => "U", "Ь" => "I", "Ю" => "Iu", "Я" => "Ia", "ж" => "j", "й" => "i", "х" => "h", "ц" => "c", "щ" => "sht", "ъ" => "u", "ь" => "i", "ю" => "iu", "я" => "ia" } end end end babosa-1.0.4/lib/babosa/transliterator/cyrillic.rb000066400000000000000000000043651373710607500221750ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator # Approximations are based on GOST 7.79, System B: # http://en.wikipedia.org/wiki/ISO_9#GOST_7.79 class Cyrillic < Base APPROXIMATIONS = { "Ё" => "Yo", "Ѓ" => "G", "Є" => "Ye", "Ї" => "Yi", "Љ" => "L", "Њ" => "N", "Ќ" => "K", "Ў" => "U", "Џ" => "Dh", "А" => "A", "Б" => "B", "В" => "V", "Г" => "G", "Д" => "D", "Е" => "E", "Ж" => "Zh", "З" => "Z", "И" => "I", "Й" => "J", "К" => "K", "Л" => "L", "М" => "M", "Н" => "N", "О" => "O", "П" => "P", "Р" => "R", "С" => "S", "Т" => "T", "У" => "U", "Ф" => "F", "Х" => "X", "Ц" => "Cz", "Ч" => "Ch", "Ш" => "Sh", "Щ" => "Shh", "Ъ" => "", "Ы" => "Y", "Ь" => "", "Э" => "E", "Ю" => "Yu", "Я" => "Ya", "а" => "a", "б" => "b", "в" => "v", "г" => "g", "д" => "d", "е" => "e", "ж" => "zh", "з" => "z", "и" => "i", "й" => "j", "к" => "k", "л" => "l", "м" => "m", "н" => "n", "о" => "o", "п" => "p", "р" => "r", "с" => "s", "т" => "t", "у" => "u", "ф" => "f", "х" => "x", "ц" => "cz", "ч" => "ch", "ш" => "sh", "щ" => "shh", "ъ" => "", "ы" => "y", "ь" => "", "э" => "e", "ю" => "yu", "я" => "ya", "ё" => "yo", "ѓ" => "g", "є" => "ye", "ї" => "yi", "љ" => "l", "њ" => "n", "ќ" => "k", "ў" => "u", "џ" => "dh", "Ѣ" => "Ye", "ѣ" => "ye", "Ѫ" => "O", "ѫ" => "o", "Ѳ" => "Fh", "ѳ" => "fh", "Ѵ" => "Yh", 
"ѵ" => "yh", "Ґ" => "G", "ґ" => "g", } def transliterate(string) super.gsub(/(c)z([ieyj])/) { "#{$1}#{$2}" } end end end end babosa-1.0.4/lib/babosa/transliterator/danish.rb000066400000000000000000000003621373710607500216220ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Danish < Latin APPROXIMATIONS = { "æ" => "ae", "ø" => "oe", "å" => "aa", "Ø" => "Oe", "Å" => "Aa" } end end end babosa-1.0.4/lib/babosa/transliterator/german.rb000066400000000000000000000004071373710607500216250ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class German < Latin APPROXIMATIONS = { "ä" => "ae", "ö" => "oe", "ü" => "ue", "Ä" => "Ae", "Ö" => "Oe", "Ü" => "Ue" } end end end babosa-1.0.4/lib/babosa/transliterator/greek.rb000066400000000000000000000030331373710607500214470ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Greek < Base APPROXIMATIONS = { "Α" => "A", "Ά" => "A", "α" => "a", "ά" => "a", "Β" => "V", "β" => "v", "Γ" => "G", "γ" => "g", "Δ" => "D", "δ" => "d", "Ε" => "E", "Έ" => "E", "ε" => "e", "έ" => "e", "Ζ" => "Z", "ζ" => "z", "Η" => "I", "Ή" => "i", "η" => "i", "ή" => "i", "Θ" => "TH", "θ" => "th", "Ι" => "I", "Ί" => "Ι", "Î" => "I", "ι" => "i", "ί" => "i", "ϊ" => "i", "ΐ" => "i", "Κ" => "K", "κ" => "k", "Λ" => "L", "λ" => "l", "Μ" => "M", "μ" => "m", "Ν" => "N", "ν" => "n", "Ξ" => "KS", "ξ" => "ks", "Ο" => "O", "Ό" => "O", "ο" => "o", "ό" => "o", "Π" => "P", "π" => "p", "Ρ" => "R", "ρ" => "r", "Σ" => "S", "σ" => "s", "ς" => "s", "Τ" => "T", "τ" => "t", "Υ" => "Y", "Ύ" => "Y", "υ" => "y", "ύ" => "y", "ϋ" => "y", "ΰ" => "y", "Φ" => "F", "φ" => "f", "Χ" => "X", "χ" => "x", "Ψ" => "PS", "ψ" => "ps", "Ω" => "O", "Ώ" => "O", "ω" => "o", "ώ" => "o" } end end endbabosa-1.0.4/lib/babosa/transliterator/hindi.rb000066400000000000000000000057101373710607500214510ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Hindi < Base 
APPROXIMATIONS = { "ऀ" => "n", "ँ" => "n", "ं" => "n", "ः" => "h", "ऄ" => "a", "अ" => "a", "आ" => "aa", "इ" => "i", "ई" => "ii", "उ" => "u", "ऊ" => "uu", "ऋ" => "ri", "ऌ" => "lri", "ऍ" => "e", "ऎ" => "e", "ए" => "e", "ऐ" => "ei", "ऑ" => "o", "ऒ" => "o", "ओ" => "o", "औ" => "ou", "क" => "k", "ख" => "kh", "ग" => "g", "घ" => "gh", "ङ" => "d", "च" => "ch", "छ" => "chh", "ज" => "j", "झ" => "jh", "ञ" => "ny", "ट" => "tt", "ठ" => "tth", "ड" => "dd", "ढ" => "ddh", "ण" => "nn", "त" => "t", "थ" => "th", "द" => "d", "ध" => "dh", "न" => "n", "ऩ" => "nnn", "प" => "p", "फ" => "ph", "ब" => "b", "भ" => "bh", "म" => "m", "य" => "y", "र" => "r", "ऱ" => "rr", "ल" => "l", "ळ" => "ll", "ऴ" => "ll", "व" => "v", "श" => "sh", "ष" => "ss", "स" => "s", "ह" => "h", "ऺ" => "oe", "ऻ" => "ooe", "़" => "", "ऽ" => "-", "ा" => "aa", "ि" => "i", "ी" => "ii", "ु" => "u", "ू" => "uu", "ृ" => "r", "ॄ" => "rr", "ॅ" => "e", "ॆ" => "e", "े" => "e", "ै" => "ai", "ॉ" => "o", "ॊ" => "o", "ो" => "o", "ौ" => "au", "्" => "", "ॎ" => "e", "ॏ" => "aw", "ॐ" => "om", "॑" => "", "॒" => "_", "॓" => "", "॔" => "", "ॕ" => "ee", "ॖ" => "ue", "ॗ" => "uue", "क़" => "q", "ख़" => "khh", "ग़" => "ghh", "ज़" => "za", "ड़" => "dddh", "ढ़" => "rh", "फ़" => "f", "य़" => "yy", "ॠ" => "rri", "ॡ" => "lr", "ॢ" => "l", "ॣ" => "l", "।" => ".", "॥" => "..", "०" => "0", "१" => "1", "२" => "2", "३" => "3", "४" => "4", "५" => "5", "६" => "6", "७" => "7", "८" => "8", "९" => "9", "॰" => ".", "ॱ" => ".", "ॲ" => "a", "ॳ" => "oe", "ॴ" => "ooe", "ॵ" => "aw", "ॶ" => "ue", "ॷ" => "uue", "ॸ" => "dd", "ॹ" => "zh", "ॺ" => "y", "ॻ" => "gg", "ॼ" => "jj", "ॽ" => "?", "ॾ" => "ddd", "ॿ" => "bb" } end end end babosa-1.0.4/lib/babosa/transliterator/latin.rb000066400000000000000000000100201373710607500214530ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Latin < Base APPROXIMATIONS = { "À" => "A", "Á" => "A", "Â" => "A", "Ã" => "A", "Ä" => "A", "Å" => "A", "Æ" => "Ae", "Ç" => "C", "È" => "E", "É" => "E", "Ê" => "E", 
"Ë" => "E", "Ì" => "I", "Í" => "I", "Î" => "I", "Ï" => "I", "Ð" => "D", "Ñ" => "N", "Ò" => "O", "Ó" => "O", "Ô" => "O", "Õ" => "O", "Ö" => "O", "Ø" => "O", "Ù" => "U", "Ú" => "U", "Û" => "U", "Ü" => "U", "Ý" => "Y", "Þ" => "Th", "ß" => "ss", "à" => "a" , "á" => "a", "â" => "a", "ã" => "a", "ä" => "a", "å" => "a", "æ" => "ae", "ç" => "c" , "è" => "e", "é" => "e", "ê" => "e", "ë" => "e", "ì" => "i", "í" => "i", "î" => "i", "ï" => "i", "ð" => "d", "ñ" => "n", "ò" => "o", "ó" => "o", "ô" => "o", "õ" => "o", "ö" => "o", "ø" => "o", "ù" => "u", "ú" => "u", "û" => "u", "ü" => "u", "ý" => "y", "þ" => "th", "ÿ" => "y", "Ā" => "A", "Ă" => "A", "Ą" => "A", "Ć" => "C", "Ĉ" => "C", "Ċ" => "C", "Č" => "C", "Ď" => "D", "Đ" => "D", "Ē" => "E", "Ĕ" => "E", "Ė" => "E", "Ę" => "E", "Ě" => "E", "Ĝ" => "G", "Ğ" => "G", "Ġ" => "G", "Ģ" => "G", "Ĥ" => "H", "Ħ" => "H", "Ĩ" => "I", "Ī" => "I", "Ĭ" => "I", "Į" => "I", "İ" => "I", "IJ" => "Ij", "Ĵ" => "J", "Ķ" => "K", "Ĺ" => "L", "Ļ" => "L", "Ľ" => "L", "Ŀ" => "L", "Ł" => "L", "Ń" => "N", "Ņ" => "N", "Ň" => "N", "Ŋ" => "Ng", "Ō" => "O", "Ŏ" => "O", "Ő" => "O", "Œ" => "OE", "Ŕ" => "R", "Ŗ" => "R", "Ř" => "R", "Ś" => "S", "Ŝ" => "S", "Ş" => "S", "Š" => "S", "Ţ" => "T", "Ť" => "T", "Ŧ" => "T", "Ũ" => "U", "Ū" => "U", "Ŭ" => "U", "Ů" => "U", "Ű" => "U", "Ų" => "U", "Ŵ" => "W", "Ŷ" => "Y", "Ÿ" => "Y", "Ź" => "Z", "Ż" => "Z", "Ž" => "Z", "ā" => "a", "ă" => "a", "ą" => "a", "ć" => "c", "ĉ" => "c", "ċ" => "c", "č" => "c", "ď" => "d", "đ" => "d", "ē" => "e", "ĕ" => "e", "ė" => "e", "ę" => "e", "ě" => "e", "ĝ" => "g", "ğ" => "g", "ġ" => "g", "ģ" => "g", "ĥ" => "h", "ħ" => "h", "ĩ" => "i", "ī" => "i", "ĭ" => "i", "į" => "i", "ı" => "i", "ij" => "ij", "ĵ" => "j", "ķ" => "k", "ĸ" => "k", "ĺ" => "l", "ļ" => "l", "ľ" => "l", "ŀ" => "l", "ł" => "l", "ń" => "n", "ņ" => "n", "ň" => "n", "ʼn" => "n", "ŋ" => "ng", "ō" => "o", "ŏ" => "o", "ő" => "o", "œ" => "oe", "ŕ" => "r", "ŗ" => "r", "ř" => "r", "ś" => "s", "ŝ" => "s", "ş" => "s", "š" => "s", "ţ" => "t", "ť" 
=> "t", "ŧ" => "t", "ũ" => "u", "ū" => "u", "ŭ" => "u", "ů" => "u", "ű" => "u", "ų" => "u", "ŵ" => "w", "ŷ" => "y", "ž" => "z", "ź" => "z", "ż" => "z" } end end end babosa-1.0.4/lib/babosa/transliterator/macedonian.rb000066400000000000000000000010741373710607500224530ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Macedonian < Cyrillic APPROXIMATIONS = { "Ѓ" => "Gj", "Љ" => "Lj", "Њ" => "Nj", "Ќ" => "Kj", "Џ" => "Dzh", "Ж" => "Zh", "Ц" => "C", "Ѕ" => "Z", "Ј" => "J", "Х" => "H", "ѓ" => "gj", "љ" => "lj", "њ" => "nj", "ќ" => "kj", "џ" => "dzh", "ж" => "zh", "ц" => "c", "ѕ" => "z", "ј" => "j", "х" => "h" } end end end babosa-1.0.4/lib/babosa/transliterator/norwegian.rb000066400000000000000000000003371373710607500223470ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Norwegian < Latin APPROXIMATIONS = { "ø" => "oe", "å" => "aa", "Ø" => "Oe", "Å" => "Aa" } end end end babosa-1.0.4/lib/babosa/transliterator/romanian.rb000066400000000000000000000003311373710607500221540ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Romanian < Latin APPROXIMATIONS = { "ș" => "s", "ț" => "t", "Ș" => "S", "Ț" => "T" } end end end babosa-1.0.4/lib/babosa/transliterator/russian.rb000066400000000000000000000006401373710607500220370ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Russian < Cyrillic APPROXIMATIONS = { "Й" => "I", "М" => "M", "Х" => "H", "Ц" => "Ts", "Ш" => "Sh", "Щ" => "Sch", "Ю" => "U", "Я" => "Ya", "й" => "i", "х" => "h", "ц" => "ts", "щ" => "sch", "ю" => "u" } end end end babosa-1.0.4/lib/babosa/transliterator/serbian.rb000066400000000000000000000012701373710607500217760ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Serbian < Latin APPROXIMATIONS = Cyrillic.const_get(:APPROXIMATIONS).merge({ "Ð" => "Dj", "Č" => "Ch", "Š" => "Sh", "č" => "ch", "đ" => "dj", "š" => "sh", "Ћ" => 
"C", "Ц" => "C", "Ч" => "Ch", "Ђ" => "Dj", "Џ" => "Dz", "Х" => "H", "Ј" => "J", "Љ" => "Lj", "Њ" => "Nj", "ц" => "c", "ћ" => "c", "ч" => "ch", "ђ" => "dj", "џ" => "dz", "х" => "h", "ј" => "j", "љ" => "lj", "њ" => "nj" }) end end end babosa-1.0.4/lib/babosa/transliterator/spanish.rb000066400000000000000000000002311373710607500220140ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Spanish < Latin APPROXIMATIONS = {"ñ" => "ni", "Ñ" => "Ni"} end end end babosa-1.0.4/lib/babosa/transliterator/swedish.rb000066400000000000000000000004111373710607500220150ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Swedish < Latin APPROXIMATIONS = { "å" => "aa", "ä" => "ae", "ö" => "oe", "Å" => "Aa", "Ä" => "Ae", "Ö" => "Oe" } end end end babosa-1.0.4/lib/babosa/transliterator/turkish.rb000066400000000000000000000001551373710607500220450ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Turkish < Latin end end end babosa-1.0.4/lib/babosa/transliterator/ukrainian.rb000066400000000000000000000011141373710607500223310ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Ukrainian < Cyrillic APPROXIMATIONS = { "Г" => "H", "г" => "h", "Ґ" => "G", "ґ" => "g", "є" => "ie", "И" => "Y", "и" => "y", "І" => "I", "і" => "i", "ї" => "i", "Й" => "Y", "й" => "i", "Х" => "Kh", "х" => "kh", "Ц" => "Ts", "ц" => 'ts', "Щ" => "Shch", "щ" => "shch", "ю" => "iu", "я" => "ia", "'" => "" } end end endbabosa-1.0.4/lib/babosa/transliterator/vietnamese.rb000066400000000000000000000057361373710607500225260ustar00rootroot00000000000000# encoding: utf-8 module Babosa module Transliterator class Vietnamese < Latin APPROXIMATIONS = { "à" => "a", "á" => "a", "ạ" => "a", "ả" => "a", "ã" => "a", "â" => "a", "ầ" => "a", "ấ" => "a", "ậ" => "a", "ẩ" => "a", "ẫ" => "a", "ă" => "a", "ằ" => "a", "ắ" => "a", "ặ" => "a", "ẳ" => "a", "ẵ" => "a", "À" => "A", "Á" => "A", "Ạ" => 
"A", "Ả" => "A", "Ã" => "A", "Â" => "A", "Ầ" => "A", "Ấ" => "A", "Ậ" => "A", "Ẩ" => "A", "Ẫ" => "A", "Ă" => "A", "Ằ" => "A", "Ắ" => "A", "Ặ" => "A", "Ẳ" => "A", "Ẵ" => "A", "ì" => "i", "í" => "i", "ị" => "i", "ỉ" => "i", "ĩ" => "i", "Ì" => "I", "Í" => "I", "Ị" => "I", "Ỉ" => "I", "Ĩ" => "I", "ù" => "u", "ú" => "u", "ụ" => "u", "ủ" => "u", "ũ" => "u", "ư" => "u", "ừ" => "u", "ứ" => "u", "ự" => "u", "ử" => "u", "ữ" => "u", "Ù" => "U", "Ú" => "U", "Ụ" => "U", "Ủ" => "U", "Ũ" => "U", "Ư" => "U", "Ừ" => "U", "Ứ" => "U", "Ự" => "U", "Ử" => "U", "Ữ" => "U", "è" => "e", "é" => "e", "ẹ" => "e", "ẻ" => "e", "ẽ" => "e", "ê" => "e", "ề" => "e", "ế" => "e", "ệ" => "e", "ể" => "e", "ễ" => "e", "È" => "E", "É" => "E", "Ẹ" => "E", "Ẻ" => "E", "Ẽ" => "E", "Ê" => "E", "Ề" => "E", "Ế" => "E", "Ệ" => "E", "Ể" => "E", "Ễ" => "E", "ò" => "o", "ó" => "o", "ọ" => "o", "ỏ" => "o", "õ" => "o", "ô" => "o", "ồ" => "o", "ố" => "o", "ộ" => "o", "ổ" => "o", "ỗ" => "o", "ơ" => "o", "ờ" => "o", "ớ" => "o", "ợ" => "o", "ở" => "o", "ỡ" => "o", "Ò" => "O", "Ó" => "O", "Ọ" => "O", "Ỏ" => "O", "Õ" => "O", "Ô" => "O", "Ồ" => "O", "Ố" => "O", "Ộ" => "O", "Ổ" => "O", "Ỗ" => "O", "Ơ" => "O", "Ờ" => "O", "Ớ" => "O", "Ợ" => "O", "Ở" => "O", "Ỡ" => "O", "ỳ" => "y", "ý" => "y", "ỵ" => "y", "ỷ" => "y", "ỹ" => "y", "Ỳ" => "Y", "Ý" => "Y", "Ỵ" => "Y", "Ỷ" => "Y", "Ỹ" => "Y", "đ" => "d", "Đ" => "D" } end end endbabosa-1.0.4/lib/babosa/utf8/000077500000000000000000000000001373710607500156375ustar00rootroot00000000000000babosa-1.0.4/lib/babosa/utf8/active_support_proxy.rb000066400000000000000000000016141373710607500224760ustar00rootroot00000000000000require 'active_support' require 'active_support/multibyte/unicode' module Babosa module UTF8 # A UTF-8 proxy using Active Support's multibyte support. 
module ActiveSupportProxy extend ActiveSupport::Multibyte::Unicode extend self def self.normalize_utf8(string) normalize(string, :c) end if ActiveSupport::VERSION::MAJOR == 3 def downcase(string) ActiveSupport::Multibyte::Chars.new(string).downcase.to_s end def upcase(string) ActiveSupport::Multibyte::Chars.new(string).upcase.to_s end elsif ActiveSupport::VERSION::MAJOR >= 6 def self.normalize_utf8(string) string.unicode_normalize(:nfc).to_s end def downcase(string) string.downcase.to_s end def upcase(string) string.upcase.to_s end end end end end babosa-1.0.4/lib/babosa/utf8/dumb_proxy.rb000066400000000000000000000033251373710607500203570ustar00rootroot00000000000000require File.expand_path("../mappings", __FILE__) module Babosa module UTF8 # This module provides fallback UTF-8 support when nothing else is # available. It does case folding for Roman alphabet-based characters # commonly used by Western European languages and little else, making it # useless for Russian, Bulgarian, Greek, etc. If at all possible, Unicode # or ActiveSupport should be used instead because they support the full # UTF-8 character range. module DumbProxy extend Proxy extend self def downcase(string) string.downcase.unpack("U*").map {|char| Mappings::DOWNCASE[char] or char}.flatten.pack("U*") end def upcase(string) string.upcase.unpack("U*").map {|char| Mappings::UPCASE[char] or char}.flatten.pack("U*") end if ''.respond_to?(:unicode_normalize) def normalize_utf8(string) string.unicode_normalize end else # On Ruby 2.2, this uses the native Unicode normalize method. On all # other Rubies, it does a very naive Unicode normalization, which should # work for this library's purposes (i.e., Roman-based codepoints, up to # U+017E). Do not use reuse this as a general solution! Use a real # library like Unicode or ActiveSupport instead. def normalize_utf8(string) codepoints = string.unpack("U*") new = [] until codepoints.empty? 
do if Mappings::COMPOSITION[codepoints[0..1]] new << Mappings::COMPOSITION[codepoints.slice!(0,2)] else new << codepoints.shift end end new.compact.flatten.pack("U*") end end end end end babosa-1.0.4/lib/babosa/utf8/java_proxy.rb000066400000000000000000000007611373710607500203520ustar00rootroot00000000000000module Babosa module UTF8 # A UTF-8 proxy module using Java's built-in Unicode support. Requires JRuby 1.5+. module JavaProxy extend Proxy extend self java_import java.text.Normalizer def downcase(string) string.to_java.to_lower_case.to_s end def upcase(string) string.to_java.to_upper_case.to_s end def normalize_utf8(string) Normalizer.normalize(string, Normalizer::Form::NFC).to_s end end end end babosa-1.0.4/lib/babosa/utf8/mappings.rb000066400000000000000000000133161373710607500200060ustar00rootroot00000000000000module Babosa module UTF8 # A small subset of the mappings provided by Unicode.org, limited to Latin # characters. This is used for Babosa's default "dumb" UTF-8 support. module Mappings DOWNCASE = Hash[65, 97, 66, 98, 67, 99, 68, 100, 69, 101, 70, 102, 71, 103, 72, 104, 73, 105, 74, 106, 75, 107, 76, 108, 77, 109, 78, 110, 79, 111, 80, 112, 81, 113, 82, 114, 83, 115, 84, 116, 85, 117, 86, 118, 87, 119, 88, 120, 89, 121, 90, 122, 181, 956, 192, 224, 193, 225, 194, 226, 195, 227, 196, 228, 197, 229, 198, 230, 199, 231, 200, 232, 201, 233, 202, 234, 203, 235, 204, 236, 205, 237, 206, 238, 207, 239, 208, 240, 209, 241, 210, 242, 211, 243, 212, 244, 213, 245, 214, 246, 216, 248, 217, 249, 218, 250, 219, 251, 220, 252, 221, 253, 222, 254, 223, [115, 115], 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, [105, 775], 306, 307, 308, 309, 310, 311, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, [700, 110], 330, 331, 332, 333, 334, 
335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 255, 377, 378, 379, 380, 381, 382] UPCASE = DOWNCASE.invert COMPOSITION = { [65,768] => 192, [65,769] => 193, [65,770] => 194, [65,771] => 195, [65,776] => 196, [65,778] => 197, [67,807] => 199, [69,768] => 200, [69,769] => 201, [69,770] => 202, [69,776] => 203, [73,768] => 204, [73,769] => 205, [73,770] => 206, [73,776] => 207, [78,771] => 209, [79,768] => 210, [79,769] => 211, [79,770] => 212, [79,771] => 213, [79,776] => 214, [85,768] => 217, [85,769] => 218, [85,770] => 219, [85,776] => 220, [89,769] => 221, [97,768] => 224, [97,769] => 225, [97,770] => 226, [97,771] => 227, [97,776] => 228, [97,778] => 229, [99,807] => 231, [101,768] => 232, [101,769] => 233, [101,770] => 234, [101,776] => 235, [105,768] => 236, [105,769] => 237, [105,770] => 238, [105,776] => 239, [110,771] => 241, [111,768] => 242, [111,769] => 243, [111,770] => 244, [111,771] => 245, [111,776] => 246, [117,768] => 249, [117,769] => 250, [117,770] => 251, [117,776] => 252, [121,769] => 253, [121,776] => 255, [65,772] => 256, [97,772] => 257, [65,774] => 258, [97,774] => 259, [65,808] => 260, [97,808] => 261, [67,769] => 262, [99,769] => 263, [67,770] => 264, [99,770] => 265, [67,775] => 266, [99,775] => 267, [67,780] => 268, [99,780] => 269, [68,780] => 270, [100,780] => 271, [69,772] => 274, [101,772] => 275, [69,774] => 276, [101,774] => 277, [69,775] => 278, [101,775] => 279, [69,808] => 280, [101,808] => 281, [69,780] => 282, [101,780] => 283, [71,770] => 284, [103,770] => 285, [71,774] => 286, [103,774] => 287, [71,775] => 288, [103,775] => 289, [71,807] => 290, [103,807] => 291, [72,770] => 292, [104,770] => 293, [73,771] => 296, [105,771] => 297, [73,772] => 298, [105,772] => 299, [73,774] => 300, [105,774] => 301, [73,808] => 302, [105,808] => 303, [73,775] => 304, [74,770] => 
308, [106,770] => 309, [75,807] => 310, [107,807] => 311, [76,769] => 313, [108,769] => 314, [76,807] => 315, [108,807] => 316, [76,780] => 317, [108,780] => 318, [78,769] => 323, [110,769] => 324, [78,807] => 325, [110,807] => 326, [78,780] => 327, [110,780] => 328, [79,772] => 332, [111,772] => 333, [79,774] => 334, [111,774] => 335, [79,779] => 336, [111,779] => 337, [82,769] => 340, [114,769] => 341, [82,807] => 342, [114,807] => 343, [82,780] => 344, [114,780] => 345, [83,769] => 346, [115,769] => 347, [83,770] => 348, [115,770] => 349, [83,807] => 350, [115,807] => 351, [83,780] => 352, [115,780] => 353, [84,807] => 354, [116,807] => 355, [84,780] => 356, [116,780] => 357, [85,771] => 360, [117,771] => 361, [85,772] => 362, [117,772] => 363, [85,774] => 364, [117,774] => 365, [85,778] => 366, [117,778] => 367, [85,779] => 368, [117,779] => 369, [85,808] => 370, [117,808] => 371, [87,770] => 372, [119,770] => 373, [89,770] => 374, [121,770] => 375, [89,776] => 376, [90,769] => 377, [122,769] => 378, [90,775] => 379, [122,775] => 380, [90,780] => 381, [122,780] => 382 } end end end babosa-1.0.4/lib/babosa/utf8/proxy.rb000066400000000000000000000103451373710607500173500ustar00rootroot00000000000000module Babosa module UTF8 autoload :JavaProxy, "babosa/utf8/java_proxy" autoload :UnicodeProxy, "babosa/utf8/unicode_proxy" autoload :ActiveSupportProxy, "babosa/utf8/active_support_proxy" autoload :DumbProxy, "babosa/utf8/dumb_proxy" # A UTF-8 proxy for Babosa can be any object which responds to the methods in this module. # The following proxies are provided by Babosa: {ActiveSupportProxy}, {DumbProxy}, {JavaProxy}, and {UnicodeProxy}. 
module Proxy CP1252 = { 128 => [226, 130, 172], 129 => nil, 130 => [226, 128, 154], 131 => [198, 146], 132 => [226, 128, 158], 133 => [226, 128, 166], 134 => [226, 128, 160], 135 => [226, 128, 161], 136 => [203, 134], 137 => [226, 128, 176], 138 => [197, 160], 139 => [226, 128, 185], 140 => [197, 146], 141 => nil, 142 => [197, 189], 143 => nil, 144 => nil, 145 => [226, 128, 152], 146 => [226, 128, 153], 147 => [226, 128, 156], 148 => [226, 128, 157], 149 => [226, 128, 162], 150 => [226, 128, 147], 151 => [226, 128, 148], 152 => [203, 156], 153 => [226, 132, 162], 154 => [197, 161], 155 => [226, 128, 186], 156 => [197, 147], 157 => nil, 158 => [197, 190], 159 => [197, 184] } # This is a stub for a method that should return a Unicode-aware # downcased version of the given string. def downcase(string) raise NotImplementedError end # This is a stub for a method that should return a Unicode-aware # upcased version of the given string. def upcase(string) raise NotImplementedError end # This is a stub for a method that should return the Unicode NFC # normalization of the given string. def normalize_utf8(string) raise NotImplementedError end if ''.respond_to?(:scrub) && !defined?(Rubinius) # Attempt to replace invalid UTF-8 bytes with valid ones. This method # naively assumes if you have invalid UTF8 bytes, they are either Windows # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not # always work. def tidy_bytes(string) string.scrub do |bad| tidy_byte(*bad.bytes).flatten.compact.pack('C*').unpack('U*').pack('U*') end end else def tidy_bytes(string) bytes = string.unpack("C*") conts_expected = 0 last_lead = 0 bytes.each_index do |i| byte = bytes[i] is_cont = byte > 127 && byte < 192 is_lead = byte > 191 && byte < 245 is_unused = byte > 240 is_restricted = byte > 244 # Impossible or highly unlikely byte? Clean it. if is_unused || is_restricted bytes[i] = tidy_byte(byte) elsif is_cont # Not expecting contination byte? Clean up. 
Otherwise, now expect one less. conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1 else if conts_expected > 0 # Expected continuation, but got ASCII or leading? Clean backwards up to # the leading byte. (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} conts_expected = 0 end if is_lead # Final byte is leading? Clean it. if i == bytes.length - 1 bytes[i] = tidy_byte(bytes.last) else # Valid leading byte? Expect continuations determined by position of # first zero bit, with max of 3. conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3 last_lead = i end end end end bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") end end private def tidy_byte(byte) byte < 160 ? CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64] end end end end babosa-1.0.4/lib/babosa/utf8/unicode_proxy.rb000066400000000000000000000006631373710607500210600ustar00rootroot00000000000000require 'unicode' module Babosa module UTF8 # A UTF-8 proxy using the Unicode gem. # @see http://github.com/blackwinter/unicode module UnicodeProxy extend Proxy extend self def downcase(string) Unicode.downcase(string) end def upcase(string) Unicode.upcase(string) end def normalize_utf8(string) Unicode.normalize_C(string) end end end end babosa-1.0.4/lib/babosa/version.rb000066400000000000000000000000761373710607500167660ustar00rootroot00000000000000module Babosa module Version STRING = '1.0.4' end end babosa-1.0.4/spec/000077500000000000000000000000001373710607500137065ustar00rootroot00000000000000babosa-1.0.4/spec/babosa_spec.rb000066400000000000000000000121731373710607500165000ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../spec_helper", __FILE__) describe Babosa::Identifier do it "should respond_to :empty?" do expect("".to_slug).to respond_to(:empty?) 
end %w[approximate_ascii clean downcase word_chars normalize to_ascii upcase with_dashes].each do |method| describe "##{method}" do it "should work with invalid UTF-8 strings" do expect {"\x93abc".to_slug.send method}.not_to raise_exception end end end describe "#word_chars" do it "word_chars! should leave only letters and spaces" do string = "a*$%^$@!@b$%^&*()*!c" expect(string.to_slug.word_chars!).to match(/[a-z ]*/i) end end describe "#transliterate" do it "should transliterate to ascii" do (0xC0..0x17E).to_a.each do |codepoint| ss = [codepoint].pack("U*").to_slug expect(ss.approximate_ascii!).to match(/[\x0-\x7f]/) end end it "should transliterate uncomposed utf8" do string = [117, 776].pack("U*") # "ü" as ASCII "u" plus COMBINING DIAERESIS expect(string.to_slug.approximate_ascii).to eql("u") end it "should transliterate using multiple transliterators" do string = "свободное režģis" expect(string.to_slug.approximate_ascii(:latin, :russian)).to eql("svobodnoe rezgis") end end describe "#downcase" do it "should lowercase strings" do expect("FELIZ AÑO".to_slug.downcase).to eql("feliz año") end end describe "#upcase" do it "should uppercase strings" do expect("feliz año".to_slug.upcase).to eql("FELIZ AÑO") end end describe "#normalize" do it "should allow passing locale as key for :transliterate" do expect("ö".to_slug.clean.normalize(:transliterate => :german)).to eql("oe") end it "should replace whitespace with dashes" do expect("a b".to_slug.clean.normalize).to eql("a-b") end it "should replace multiple spaces with 1 dash" do expect("a b".to_slug.clean.normalize).to eql("a-b") end it "should replace multiple dashes with 1 dash" do expect("male - female".to_slug.normalize).to eql("male-female") end it "should strip trailing space" do expect("ab ".to_slug.normalize).to eql("ab") end it "should strip leading space" do expect(" ab".to_slug.normalize).to eql("ab") end it "should strip trailing slashes" do expect("ab-".to_slug.normalize).to eql("ab") end it "should 
strip leading slashes" do expect("-ab".to_slug.normalize).to eql("ab") end it "should not modify valid name strings" do expect("a-b-c-d".to_slug.normalize).to eql("a-b-c-d") end it "should not convert underscores" do expect("hello_world".to_slug.normalize).to eql("hello_world") end it "should work with non roman chars" do expect("検 索".to_slug.normalize).to eql("検-索") end context "with to_ascii option" do it "should approximate and strip non ascii" do ss = "カタカナ: katakana is über cool".to_slug expect(ss.normalize(:to_ascii => true)).to eql("katakana-is-uber-cool") end end end describe "#truncate_bytes" do it "should by byte length" do expect("üa".to_slug.truncate_bytes(2)).to eql("ü") expect("üa".to_slug.truncate_bytes(1)).to eql("") expect("üa".to_slug.truncate_bytes(100)).to eql("üa") expect("üéøá".to_slug.truncate_bytes(3)).to eql("ü") end end describe "#truncate" do it "should truncate by char length" do expect("üa".to_slug.truncate(2)).to eql("üa") expect("üa".to_slug.truncate(1)).to eql("ü") expect("üa".to_slug.truncate(100)).to eql("üa") end end describe "#with_dashes" do it "should not change byte size when replacing spaces" do expect("".to_slug.with_dashes.bytesize).to eql(0) expect(" ".to_slug.with_dashes.bytesize).to eql(1) expect("-abc-".to_slug.with_dashes.bytesize).to eql(5) expect(" abc ".to_slug.with_dashes.bytesize).to eql(5) expect(" a bc ".to_slug.with_dashes.bytesize).to eql(7) end end describe "#to_ruby_method" do it "should get a string suitable for use as a ruby method" do expect("¿¿¿hello... world???".to_slug.to_ruby_method).to eql("hello_world?") expect("カタカナ: katakana is über cool".to_slug.to_ruby_method).to eql("katakana_is_uber_cool") expect("カタカナ: katakana is über cool!".to_slug.to_ruby_method).to eql("katakana_is_uber_cool!") expect("カタカナ: katakana is über cool".to_slug.to_ruby_method(false)).to eql("katakana_is_uber_cool") end it "should optionally remove trailing punctuation" do expect("¿¿¿hello... 
world???".to_slug.to_ruby_method(false)).to eql("hello_world") end it "should raise an error when it would generate an impossible method name" do # "1".to_identifier.to_ruby_method expect {"1".to_identifier.to_ruby_method}.to raise_error(Babosa::Identifier::Error) end it "should raise Babosa::Error error when the string is nil" do expect { "".to_slug.to_ruby_method }.to raise_error(Babosa::Identifier::Error) end end end babosa-1.0.4/spec/spec_helper.rb000066400000000000000000000022061373710607500165240ustar00rootroot00000000000000# coding: utf-8 if ENV['COV'] require 'simplecov' SimpleCov.start end require 'bundler/setup' require 'babosa' shared_examples_for "a latin transliterator" do let(:t) { described_class.instance } it "should transliterate latin characters" do string = (0xC0..0x17E).to_a.pack("U*") expect(t.transliterate(string)).to match(/[\x0-\x7f]/) end end shared_examples_for "a cyrillic transliterator" do let(:t) { described_class.instance } it "should transliterate cyrillic characters" do string = "Славься, Отечество наше свободное" expect(t.transliterate(string)).to match(/[\x0-\x7f]/) end end shared_examples_for "a greek transliterator" do let(:t) { described_class.instance } it "should transliterate greek characters" do string = "Γερμανία" expect(t.transliterate(string)).to match(/[\x0-\x7f]/) end end shared_examples_for "a hindi transliterator" do let(:t) { described_class.instance } it "should transliterate hindi characters" do string = "आदित्य तापड़िया" expect(t.transliterate(string)).to match(/[\x0-\x7f]/) end endbabosa-1.0.4/spec/transliterators/000077500000000000000000000000001373710607500171465ustar00rootroot00000000000000babosa-1.0.4/spec/transliterators/base_spec.rb000066400000000000000000000005751373710607500214260ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Base do let(:t) {Babosa::Transliterator::Base.instance} it "should transliterate 'smart' 
quotes" do expect(t.transliterate("’")).to eql("'") end it "should transliterate non-breaking spaces" do expect(t.transliterate("\xc2\xa0")).to eql(" ") end endbabosa-1.0.4/spec/transliterators/bulgarian_spec.rb000066400000000000000000000010251373710607500224470ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Bulgarian do let(:t) { described_class.instance } it_behaves_like "a cyrillic transliterator" it "should transliterate Cyrillic characters" do examples = { "Ютия" => "Iutiia", "Чушка" => "Chushka", "кьорав" => "kiorav", "Щъркел" => "Shturkel", "полицай" => "policai" } examples.each {|k, v| expect(t.transliterate(k)).to eql(v)} end end babosa-1.0.4/spec/transliterators/danish_spec.rb000066400000000000000000000006501373710607500217540ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Danish do let(:t) { described_class.instance } it_behaves_like "a latin transliterator" it "should transliterate various characters" do examples = { "Ærøskøbing" => "Aeroeskoebing", "Årslev" => "Aarslev" } examples.each {|k, v| expect(t.transliterate(k)).to eql(v)} end endbabosa-1.0.4/spec/transliterators/german_spec.rb000066400000000000000000000006321373710607500217570ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::German do let(:t) { described_class.instance } it_behaves_like "a latin transliterator" it "should transliterate Eszett" do expect(t.transliterate("ß")).to eql("ss") end it "should transliterate vowels with umlauts" do expect(t.transliterate("üöä")).to eql("ueoeae") end endbabosa-1.0.4/spec/transliterators/greek_spec.rb000066400000000000000000000007201373710607500216010ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Greek do 
let(:t) { described_class.instance } it_behaves_like "a greek transliterator" it "should transliterate various characters" do examples = { "Γερμανία" => "Germania", "Αυστρία" => "Aystria", "Ιταλία" => "Italia" } examples.each {|k, v| expect(t.transliterate(k)).to eql(v)} end end babosa-1.0.4/spec/transliterators/hindi_spec.rb000066400000000000000000000010521373710607500215760ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Hindi do let(:t) { described_class.instance } it_behaves_like "a hindi transliterator" it "should transliterate hindi characters" do examples = { "आदित्य" => "aadity", "सबरीमाला करवाना पायसम" => "sbriimaalaa krvaanaa paaysm", "सक्रांति आँख" => "skraanti aankh" } examples.each {|k, v| expect(t.transliterate(k)).to eql(v)} end end babosa-1.0.4/spec/transliterators/latin_spec.rb000066400000000000000000000003151373710607500216130ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Latin do let(:t) { described_class.instance } it_behaves_like "a latin transliterator" end babosa-1.0.4/spec/transliterators/macedonian_spec.rb000066400000000000000000000003251373710607500226030ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Macedonian do let(:t) { described_class.instance } it_behaves_like "a cyrillic transliterator" end babosa-1.0.4/spec/transliterators/norwegian_spec.rb000066400000000000000000000007171373710607500225030ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Norwegian do let(:t) { described_class.instance } it_behaves_like "a latin transliterator" it "should transliterate various characters" do examples = { "Øivind" => "Oeivind", "Bø" => "Boe", "Åre" => "Aare", "Håkon" => "Haakon" } examples.each {|k, v| 
expect(t.transliterate(k)).to eql(v)} end end babosa-1.0.4/spec/transliterators/polish_spec.rb000066400000000000000000000005531373710607500220060ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Romanian do let(:t) { described_class.instance } it_behaves_like "a latin transliterator" it "should transliterate various characters" do expect(t.transliterate("ĄąĆćĘꣳŃńÓóŚśŹźŻż")).to eql("AaCcEeLlNnOoSsZzZz") end end babosa-1.0.4/spec/transliterators/romanian_spec.rb000066400000000000000000000007351373710607500223160ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Romanian do let(:t) { described_class.instance } it_behaves_like "a latin transliterator" it "should transliterate various characters" do examples = { "Iași" => "Iasi", "Mehedinți" => "Mehedinti", "Țară" => "Tara", "Șanț" => "Sant" } examples.each {|k, v| expect(t.transliterate(k)).to eql(v)} end endbabosa-1.0.4/spec/transliterators/russian_spec.rb000066400000000000000000000003221373710607500221660ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Russian do let(:t) { described_class.instance } it_behaves_like "a cyrillic transliterator" end babosa-1.0.4/spec/transliterators/serbian_spec.rb000066400000000000000000000012431373710607500221300ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Serbian do let(:t) { described_class.instance } it_behaves_like "a latin transliterator" it_behaves_like "a cyrillic transliterator" it "should transliterate Latin characters" do examples = { "Ðorđe" => "Djordje", "Inđija" => "Indjija", "Četiri" => "Chetiri", "četiri" => "chetiri", "Škola" => "Shkola", "škola" => "shkola", "Ђорђе" => "Djordje", "Инђија" => "Indjija", "Школа" => "Shkola", } 
examples.each {|k, v| expect(t.transliterate(k)).to eql(v)} end endbabosa-1.0.4/spec/transliterators/spanish_spec.rb000066400000000000000000000004461373710607500221560ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Spanish do let(:t) { described_class.instance } it_behaves_like "a latin transliterator" it "should transliterate ñ" do expect(t.transliterate("ñ")).to eql("ni") end endbabosa-1.0.4/spec/transliterators/swedish_spec.rb000066400000000000000000000007351373710607500221600ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Swedish do let(:t) { described_class.instance } it_behaves_like "a latin transliterator" it "should transliterate various characters" do examples = { "Räksmörgås" => "Raeksmoergaas", "Öre" => "Oere", "Åre" => "Aare", "Älskar" => "Aelskar" } examples.each {|k, v| expect(t.transliterate(k)).to eql(v)} end end babosa-1.0.4/spec/transliterators/turkish_spec.rb000066400000000000000000000013601373710607500221760ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Turkish do let(:t) { described_class.instance } it_behaves_like "a latin transliterator" it "should transliterate various characters" do examples = { "Nâzım" => "Nazim", "sükûnet" => "sukunet", "millîleştirmek" => "millilestirmek", "mêmur" => "memur", "lôkman" => "lokman", "yoğurt" => "yogurt", "şair" => "sair", "İzmir" => "Izmir", "yığın" => "yigin", "çarşı" => "carsi" } examples.each {|k, v| expect(t.transliterate(k)).to eql(v)} end end babosa-1.0.4/spec/transliterators/ukrainian_spec.rb000066400000000000000000000061121373710607500224660ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Ukrainian do let(:t) { described_class.instance } it_behaves_like 
"a cyrillic transliterator" it "should transliterate Cyrillic characters" do examples = { "Алушта" => "Alushta", "Андрій" => "Andrii", "Борщагівка" => "Borshchahivka", "Борисенко" => "Borysenko", "Вінниця" => "Vinnytsia", "Володимир" => "Volodymyr", "Гадяч" => "Hadiach", "Богдан" => "Bohdan", "Ґалаґан" => "Galagan", "Ґорґани" => "Gorgany", "Донецьк" => "Donetsk", "Дмитро" => "Dmytro", "Рівне" => "Rivne", "Олег" => "Oleh", "Есмань" => "Esman", "Єнакієве" => "Yenakiieve", "Гаєвич" => "Haievych", "Короп'є" => "Koropie", "Житомир" => "Zhytomyr", "Жанна" => "Zhanna", "Жежелів" => "Zhezheliv", "Закарпаття" => "Zakarpattia", "Казимирчук" => "Kazymyrchuk", "Медвин" => "Medvyn", "Михайленко" => "Mykhailenko", "Іванків" => "Ivankiv", "Іващенко" => "Ivashchenko", "Їжакевич" => "Yizhakevych", "Кадиївка" => "Kadyivka", "Мар'їне" => "Marine", "Йосипівка" => "Yosypivka", "Стрий" => "Stryi", "Олексій" => "Oleksii", "Київ" => "Kyiv", "Коваленко" => "Kovalenko", "Лебедин" => "Lebedyn", "Леонід" => "Leonid", "Миколаїв" => "Mykolaiv", "Маринич" => "Marynych", "Ніжин" => "Nizhyn", "Наталія" => "Nataliia", "Одеса" => "Odesa", "Онищенко" => "Onyshchenko", "Полтава" => "Poltava", "Петро" => "Petro", "Решетилівка" => "Reshetylivka", "Рибчинський" => "Rybchynskyi", "Суми" => "Sumy", "Соломія" => "Solomiia", "Тернопіль" => "Ternopil", "Троць" => "Trots", "Ужгород" => "Uzhhorod", "Уляна" => "Uliana", "Фастів" => "Fastiv", "Філіпчук" => "Filipchuk", "Харків" => "Kharkiv", "Христина" => "Khrystyna", "Біла Церква" => "Bila Tserkva", "Стеценко" => "Stetsenko", "Чернівці" => "Chernivtsi", "Шевченко" => "Shevchenko", "Шостка" => "Shostka", "Кишеньки" => "Kyshenky", "Щербухи" => "Shcherbukhy", "Гоща" => "Hoshcha", "Гаращенко" => "Harashchenko", "Юрій" => "Yurii", "Корюківка" => "Koriukivka", "Яготин" => "Yahotyn", "Ярошенко" => "Yaroshenko", "Костянтин" => "Kostiantyn", "Знам'янка" => "Znamianka", "Феодосія" => "Feodosiia" } examples.each { |k, v| expect(t.transliterate(k)).to eql(v) } end 
endbabosa-1.0.4/spec/transliterators/vietnamese_spec.rb000066400000000000000000000007101373710607500226430ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../../spec_helper", __FILE__) describe Babosa::Transliterator::Vietnamese do let(:t) { described_class.instance } it_behaves_like "a latin transliterator" it "should transliterate various characters" do examples = { "làm" => "lam", "đàn ông" => "dan ong", "thật" => "that", "khổ" => "kho" } examples.each {|k, v| expect(t.transliterate(k)).to eql(v)} end end babosa-1.0.4/spec/utf8_proxy_spec.rb000066400000000000000000000030641373710607500173770ustar00rootroot00000000000000# encoding: utf-8 require File.expand_path("../spec_helper", __FILE__) PROXIES = [Babosa::UTF8::DumbProxy, Babosa::UTF8::ActiveSupportProxy, Babosa::UTF8::UnicodeProxy] PROXIES << Babosa::UTF8::JavaProxy if Babosa.jruby15? PROXIES.each do |proxy| describe proxy do around do |example| begin old_proxy = Babosa::Identifier.utf8_proxy Babosa::Identifier.utf8_proxy = proxy example.run ensure Babosa::Identifier.utf8_proxy = old_proxy end end describe "#normalize_utf8" do it "should normalize to canonical composed" do # ÅÉÎØÜ uncomposed_bytes = [65, 204, 138, 69, 204, 129, 73, 204, 130, 195, 152, 85, 204, 136] composed_bytes = [195, 133, 195, 137, 195, 142, 195, 152, 195, 156] uncomposed_string = uncomposed_bytes.pack("C*").unpack("U*").pack("U*") expect(proxy.normalize_utf8(uncomposed_string).unpack("C*")).to eql(composed_bytes) end end describe "#upcase" do it "should upcase the string" do expect(proxy.upcase("åéîøü")).to eql("ÅÉÎØÜ") expect("åéîøü".to_identifier.upcase).to eql("ÅÉÎØÜ") end end describe "#downcase" do it "should downcase the string" do expect(proxy.downcase("ÅÉÎØÜ")).to eql("åéîøü") expect("ÅÉÎØÜ".to_identifier.downcase).to eql("åéîøü") end end describe 'tidy_bytes' do it 'should fix invalid UTF-8 strings' do expect(proxy.tidy_bytes("\x93abc")).to eq('“abc') end end end end