pax_global_header00006660000000000000000000000064121211066410014504gustar00rootroot0000000000000052 comment=0b59657b38340f7139f463da0beb4b164d29a954 ruby-levenshtein-0.2.2/000077500000000000000000000000001212110664100150105ustar00rootroot00000000000000ruby-levenshtein-0.2.2/CHANGELOG000066400000000000000000000012361212110664100162240ustar00rootroot000000000000000.2.2 (16-03-2012) * Simplified code. 0.2.1 (11-03-2012) * Better memory handling. * Little speed improvements. * Ruby 1.9 compatible? 0.2.0 (11-07-2009) * Return 0 instead of 0.0 in case of empty strings. * Added specific support for arrays. * Added specific support for arrays of strings. * Added generic support for all (?) kind of sequences. * Moved a lot of code to the C world. 0.1.1 (06-10-2008) * If one of the strings was both the begin and the end of the other string, it would be stripped from both ends. Example: Levenshtein.distance("abracadabra", "abra") resulted in 3 instead of 7. It's fixed now. 0.1.0 (24-05-2008) * First release. ruby-levenshtein-0.2.2/LICENSE000066400000000000000000000012731212110664100160200ustar00rootroot00000000000000# Copyright Erik Veenstra # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License, # version 2, as published by the Free Software Foundation. # # This program is distributed in the hope that it will be # useful, but WITHOUT ANY WARRANTY; without even the implied # warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR # PURPOSE. See the GNU General Public License for more details. # # You should have received a copy of the GNU General Public # License along with this program; if not, write to the Free # Software Foundation, Inc., 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA. ruby-levenshtein-0.2.2/README000066400000000000000000000012411212110664100156660ustar00rootroot00000000000000The Levenshtein distance is a metric for measuring the amount of difference between two sequences (i.e., the so called edit distance). The Levenshtein distance between two sequences is given by the minimum number of operations needed to transform one sequence into the other, where an operation is an insertion, deletion, or substitution of a single element. The two sequences can be two strings, two arrays, or two other objects responding to :each. All sequences are by generic (fast) C code. All objects in the sequences should respond to :hash and :eql?. More information about the Levenshtein distance algorithm: http://en.wikipedia.org/wiki/Levenshtein_distance . ruby-levenshtein-0.2.2/VERSION000066400000000000000000000000061212110664100160540ustar00rootroot000000000000000.2.2 ruby-levenshtein-0.2.2/ext/000077500000000000000000000000001212110664100156105ustar00rootroot00000000000000ruby-levenshtein-0.2.2/ext/levenshtein/000077500000000000000000000000001212110664100201345ustar00rootroot00000000000000ruby-levenshtein-0.2.2/ext/levenshtein/extconf.rb000066400000000000000000000003621212110664100221300ustar00rootroot00000000000000require "mkmf" dir_config("levenshtein") have_library("levenshtein_array") have_library("levenshtein_array_of_strings") have_library("levenshtein_generic") have_library("levenshtein_string") create_makefile("levenshtein/levenshtein_fast") ruby-levenshtein-0.2.2/ext/levenshtein/levenshtein.h000066400000000000000000000003651212110664100226350ustar00rootroot00000000000000#ifdef RARRAY_PTR #else #define RARRAY_PTR(o) (RARRAY(o)->ptr) #define RARRAY_LEN(o) (RARRAY(o)->len) #endif #ifdef RSTRING_PTR #else #define RSTRING_PTR(o) (RSTRING(o)->ptr) #define RSTRING_LEN(o) (RSTRING(o)->len) #endif VALUE mLevenshtein; ruby-levenshtein-0.2.2/ext/levenshtein/levenshtein_fast.c000066400000000000000000000055761212110664100236560ustar00rootroot00000000000000#include "ruby.h" #include "levenshtein.h" VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) { VALUE *p1, *p2; long l1, l2; long col, row; int threshold; int *prev_row, *curr_row, *temp_row; int curr_row_min, result; int value1, value2; /* Be sure that all equivalent objects in rb_o1 and rb_o2 (a.eql?(b) == true) are taken from a pool (a.equal?(b) == true). */ /* This is done in levenshtein.rb by means of Util.pool. */ /* Get the sizes of both arrays. */ l1 = RARRAY_LEN(rb_o1); l2 = RARRAY_LEN(rb_o2); /* Get the pointers of both arrays. */ p1 = RARRAY_PTR(rb_o1); p2 = RARRAY_PTR(rb_o2); /* Convert Ruby's threshold to C's threshold. */ if (!NIL_P(rb_threshold)) { threshold = FIX2INT(rb_threshold); } else { threshold = -1; } /* The Levenshtein algorithm itself. */ /* s1= */ /* ERIK */ /* */ /* 01234 */ /* s2=V 11234 */ /* E 21234 */ /* E 32234 */ /* N 43334 <- prev_row */ /* S 54444 <- curr_row */ /* T 65555 */ /* R 76566 */ /* A 87667 */ /* Allocate memory for both rows */ prev_row = (int*) ALLOC_N(int, (l1+1)); curr_row = (int*) ALLOC_N(int, (l1+1)); /* Initialize the current row. */ for (col=0; col<=l1; col++) { curr_row[col] = col; } for (row=1; row<=l2; row++) { /* Copy the current row to the previous row. */ temp_row = prev_row; prev_row = curr_row; curr_row = temp_row; /* Calculate the values of the current row. */ curr_row[0] = row; curr_row_min = row; for (col=1; col<=l1; col++) { /* Equal (cost=0) or substitution (cost=1). */ value1 = prev_row[col-1] + ((p1[col-1] == p2[row-1]) ? 0 : 1); /* Insertion if it's cheaper than substitution. */ value2 = prev_row[col]+1; if (value2 < value1) { value1 = value2; } /* Deletion if it's cheaper than substitution. */ value2 = curr_row[col-1]+1; if (value2 < value1) { value1 = value2; } /* Keep track of the minimum value on this row. */ if (value1 < curr_row_min) { curr_row_min = value1; } curr_row[col] = value1; } /* Return nil as soon as we exceed the threshold. */ if (threshold > -1 && curr_row_min >= threshold) { free(prev_row); free(curr_row); return Qnil; } } /* The result is the last value on the last row. */ result = curr_row[l1]; free(prev_row); free(curr_row); /* Return the Ruby version of the result. */ return INT2FIX(result); } void Init_levenshtein_fast() { mLevenshtein = rb_const_get(rb_mKernel, rb_intern("Levenshtein")); rb_define_singleton_method(mLevenshtein, "distance_fast" , levenshtein_distance_fast, 3); } ruby-levenshtein-0.2.2/lib/000077500000000000000000000000001212110664100155565ustar00rootroot00000000000000ruby-levenshtein-0.2.2/lib/levenshtein.rb000077500000000000000000000071361212110664100204410ustar00rootroot00000000000000# encoding: UTF-8 require "levenshtein/version" module Levenshtein # Returns the Levenshtein distance as a number between 0.0 and # 1.0. It's basically the Levenshtein distance divided by the # size of the longest sequence. def self.normalized_distance(a1, a2, threshold=nil, options={}) size = [a1.size, a2.size].max if a1.size == 0 and a2.size == 0 0.0 elsif a1.size == 0 a2.size.to_f/size elsif a2.size == 0 a1.size.to_f/size else if threshold if d = self.distance(a1, a2, (threshold*size).to_i+1) d.to_f/size else nil end else self.distance(a1, a2).to_f/size end end end # Returns the Levenshtein distance between two sequences. # # The two sequences can be two strings, two arrays, or two other # objects responding to :each. All sequences are by generic # (fast) C code. # # All objects in the sequences should respond to :hash and :eql?. def self.distance(a1, a2, threshold=nil, options={}) a1, a2 = a1.scan(/./), a2.scan(/./) if String === a1 and String === a2 a1, a2 = Util.pool(a1, a2) # Handle some basic circumstances. return 0 if a1 == a2 return a2.size if a1.empty? return a1.size if a2.empty? if threshold return nil if (a1.size-a2.size) >= threshold return nil if (a2.size-a1.size) >= threshold return nil if (a1-a2).size >= threshold return nil if (a2-a1).size >= threshold end # Remove the common prefix and the common postfix. l1 = a1.size l2 = a2.size offset = 0 no_more_optimizations = true while offset < l1 and offset < l2 and a1[offset].equal?(a2[offset]) offset += 1 no_more_optimizations = false end while offset < l1 and offset < l2 and a1[l1-1].equal?(a2[l2-1]) l1 -= 1 l2 -= 1 no_more_optimizations = false end if no_more_optimizations distance_fast_or_slow(a1, a2, threshold, options) else l1 -= offset l2 -= offset a1 = a1[offset, l1] a2 = a2[offset, l2] distance(a1, a2, threshold, options) end end def self.distance_fast_or_slow(a1, a2, threshold, options) # :nodoc: if respond_to?(:distance_fast) and options[:force_slow] distance_fast(a1, a2, threshold) # Implemented in C. else distance_slow(a1, a2, threshold) # Implemented in Ruby. end end def self.distance_slow(a1, a2, threshold) # :nodoc: crow = (0..a1.size).to_a 1.upto(a2.size) do |y| prow = crow crow = [y] 1.upto(a1.size) do |x| crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[x-1].equal?(a2[y-1]) ? 0 : 1)].min end # Stop analysing this sequence as soon as the best possible # result for this sequence is bigger than the best result so far. # (The minimum value in the next row will be equal to or greater # than the minimum value in this row.) return nil if threshold and crow.min >= threshold end crow[-1] end module Util # :nodoc: def self.pool(*args) # So we can compare pointers instead of objects (equal?() instead of ==()). pool = {} args.collect do |arg| a = [] arg.each do |o| a << pool[o] ||= o end a end end end end begin require "levenshtein/levenshtein_fast" # Compiled by RubyGems. rescue LoadError begin require "levenshtein_fast" # Compiled by the build script. rescue LoadError $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead." end end ruby-levenshtein-0.2.2/lib/levenshtein/000077500000000000000000000000001212110664100201025ustar00rootroot00000000000000ruby-levenshtein-0.2.2/lib/levenshtein/version.rb000077500000000000000000000000761212110664100221220ustar00rootroot00000000000000# encoding: UTF-8 module Levenshtein VERSION = "0.2.2" end ruby-levenshtein-0.2.2/metadata.yml000066400000000000000000000025521212110664100173170ustar00rootroot00000000000000--- !ruby/object:Gem::Specification name: levenshtein version: !ruby/object:Gem::Version version: 0.2.2 prerelease: platform: ruby authors: - Erik Veenstra autorequire: bindir: bin cert_chain: [] date: 2012-03-16 00:00:00.000000000 Z dependencies: [] description: Calculates the Levenshtein distance between two byte strings. email: levenshtein@erikveen.dds.nl executables: [] extensions: - ext/levenshtein/extconf.rb extra_rdoc_files: [] files: - lib/levenshtein/version.rb - lib/levenshtein.rb - ext/levenshtein/levenshtein.h - ext/levenshtein/levenshtein_fast.c - ext/levenshtein/extconf.rb - README - LICENSE - VERSION - CHANGELOG - test/test.rb homepage: http://www.erikveen.dds.nl/levenshtein/index.html licenses: [] post_install_message: rdoc_options: - README - LICENSE - VERSION - CHANGELOG - --title - levenshtein (0.2.2) - --main - README require_paths: - lib required_ruby_version: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: '0' required_rubygems_version: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: '0' requirements: [] rubyforge_project: levenshtein rubygems_version: 1.8.18 signing_key: specification_version: 3 summary: Calculates the Levenshtein distance between two byte strings. test_files: - test/test.rb ruby-levenshtein-0.2.2/test/000077500000000000000000000000001212110664100157675ustar00rootroot00000000000000ruby-levenshtein-0.2.2/test/test.rb000077500000000000000000000130441212110664100173000ustar00rootroot00000000000000#!/usr/bin/env ruby # encoding: UTF-8 require "test/unit" require "levenshtein" module Levenshtein class TestSequence def initialize(o) @sequence = o end def each @sequence.length.times do |pos| yield(@sequence[pos]) end end end class TestElement attr_reader :object def initialize(o) @object = o end def hash @object.hash end def eql?(other) @object.eql?(other.object) end end end class TestLevenshtein < Test::Unit::TestCase def test_erik_veenstra assert_equal(7, Levenshtein.distance("erik", "veenstra")) assert_equal(7, Levenshtein.distance("veenstra", "erik")) assert_in_delta(0.875, Levenshtein.normalized_distance("erik", "veenstra"), 0.01) assert_in_delta(0.875, Levenshtein.normalized_distance("veenstra", "erik"), 0.01) end def test_empty_string assert_equal(0, Levenshtein.distance("", "")) assert_equal(3, Levenshtein.distance("", "foo")) assert_equal(3, Levenshtein.distance("foo", "")) assert_in_delta(0.0, Levenshtein.normalized_distance("", ""), 0.01) assert_in_delta(1.0, Levenshtein.normalized_distance("", "foo"), 0.01) assert_in_delta(1.0, Levenshtein.normalized_distance("foo", ""), 0.01) end def test_same_string assert_equal(0, Levenshtein.distance("", "")) assert_equal(0, Levenshtein.distance("foo", "foo")) assert_in_delta(0.0, Levenshtein.normalized_distance("", ""), 0.01) assert_in_delta(0.0, Levenshtein.normalized_distance("foo", "foo"), 0.01) end def test_threshold assert_equal(3, Levenshtein.distance("foo", "foobar")) assert_equal(3, Levenshtein.distance("foo", "foobar", 4)) assert_equal(nil, Levenshtein.distance("foo", "foobar", 2)) assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar"), 0.01) assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar", 0.66), 0.01) assert_equal(nil, Levenshtein.normalized_distance("foo", "foobar", 0.30)) end def test_same_head_and_or_tail assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd")) assert_equal(3, Levenshtein.distance("ab123", "abxyz")) assert_equal(3, Levenshtein.distance("123cd", "xyzcd")) assert_equal(5, Levenshtein.distance("123cd123", "123")) assert_in_delta(0.42, Levenshtein.normalized_distance("ab123cd", "abxyzcd"), 0.01) assert_in_delta(0.6, Levenshtein.normalized_distance("ab123", "abxyz"), 0.01) assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01) assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01) end def test_interface seq1 = Levenshtein::TestSequence.new("erik".scan(/./).collect{|e| Levenshtein::TestElement.new(e)}) seq2 = Levenshtein::TestSequence.new("veenstra".scan(/./).collect{|e| Levenshtein::TestElement.new(e)}) assert_equal(7, Levenshtein.distance(seq1, seq2)) end end class TestLevenshteinFast < Test::Unit::TestCase def test_erik_veenstra assert_equal(7, Levenshtein.distance("erik", "veenstra", nil, :force_slow=>false)) assert_equal(7, Levenshtein.distance("veenstra", "erik", nil, :force_slow=>false)) end def test_empty_string assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>false)) assert_equal(3, Levenshtein.distance("", "foo", nil, :force_slow=>false)) assert_equal(3, Levenshtein.distance("foo", "", nil, :force_slow=>false)) end def test_same_string assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>false)) assert_equal(0, Levenshtein.distance("foo", "foo", nil, :force_slow=>false)) end def test_threshold assert_equal(3, Levenshtein.distance("foo", "foobar", nil, :force_slow=>false)) assert_equal(3, Levenshtein.distance("foo", "foobar", 4, :force_slow=>false)) assert_equal(nil, Levenshtein.distance("foo", "foobar", 2, :force_slow=>false)) end def test_same_head_and_or_tail assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd", nil, :force_slow=>false)) assert_equal(3, Levenshtein.distance("ab123", "abxyz", nil, :force_slow=>false)) assert_equal(3, Levenshtein.distance("123cd", "xyzcd", nil, :force_slow=>false)) assert_equal(5, Levenshtein.distance("123cd123", "123", nil, :force_slow=>false)) end end class TestLevenshteinSlow < Test::Unit::TestCase def test_erik_veenstra assert_equal(7, Levenshtein.distance("erik", "veenstra", nil, :force_slow=>true)) assert_equal(7, Levenshtein.distance("veenstra", "erik", nil, :force_slow=>true)) end def test_empty_string assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>true)) assert_equal(3, Levenshtein.distance("", "foo", nil, :force_slow=>true)) assert_equal(3, Levenshtein.distance("foo", "", nil, :force_slow=>true)) end def test_same_string assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>true)) assert_equal(0, Levenshtein.distance("foo", "foo", nil, :force_slow=>true)) end def test_threshold assert_equal(3, Levenshtein.distance("foo", "foobar", nil, :force_slow=>true)) assert_equal(3, Levenshtein.distance("foo", "foobar", 4, :force_slow=>true)) assert_equal(nil, Levenshtein.distance("foo", "foobar", 2, :force_slow=>true)) end def test_same_head_and_or_tail assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd", nil, :force_slow=>true)) assert_equal(3, Levenshtein.distance("ab123", "abxyz", nil, :force_slow=>true)) assert_equal(3, Levenshtein.distance("123cd", "xyzcd", nil, :force_slow=>true)) assert_equal(5, Levenshtein.distance("123cd123", "123", nil, :force_slow=>true)) end end