pax_global_header 0000666 0000000 0000000 00000000064 15142757443 0014526 g ustar 00root root 0000000 0000000 52 comment=9d7b6c110fa866e654d5677b94a36a09b9b151df
jaro_winkler-1.7.0/ 0000775 0000000 0000000 00000000000 15142757443 0014221 5 ustar 00root root 0000000 0000000 jaro_winkler-1.7.0/.github/ 0000775 0000000 0000000 00000000000 15142757443 0015561 5 ustar 00root root 0000000 0000000 jaro_winkler-1.7.0/.github/workflows/ 0000775 0000000 0000000 00000000000 15142757443 0017616 5 ustar 00root root 0000000 0000000 jaro_winkler-1.7.0/.github/workflows/test.yml 0000664 0000000 0000000 00000001616 15142757443 0021324 0 ustar 00root root 0000000 0000000 name: Test
on:
push:
pull_request:
jobs:
test:
env:
BUNDLE_WITHOUT: benchmark
continue-on-error: ${{ matrix.fallible }}
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
- ubuntu-24.04
- macos-latest
- macos-15
- windows-latest
- windows-2025
ruby:
- "2.7"
- "3.4"
- head
include:
- fallible: false
- ruby: head
fallible: true
- os: ubuntu-latest
fallible: true
- os: windows-latest
fallible: true
- os: macos-latest
fallible: true
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- uses: ruby/setup-ruby@v1
with:
ruby-version: ${{ matrix.ruby }}
bundler-cache: true
- run: bundle exec rake
jaro_winkler-1.7.0/.gitignore 0000664 0000000 0000000 00000000174 15142757443 0016213 0 ustar 00root root 0000000 0000000 /.bundle/
/.yardoc
/Gemfile.lock
/_yardoc/
/coverage/
/doc/
/pkg/
/spec/reports/
/tmp/
*.bundle
*.so
*.o
*.a
mkmf.log
/tags
jaro_winkler-1.7.0/.rspec 0000664 0000000 0000000 00000000036 15142757443 0015335 0 ustar 00root root 0000000 0000000 --color
--require spec_helper
jaro_winkler-1.7.0/CHANGELOG.md 0000664 0000000 0000000 00000026210 15142757443 0016033 0 ustar 00root root 0000000 0000000 ## 1.6.1 (2025-05-11)
* chore: use "similarity" methods over deprecated "distance" methods for rake tasks ([b8bbec0](https://github.com/tonytonyjan/jaro_winkler/commit/b8bbec0))
* ci: update strategy matrix ([8e4b0ef](https://github.com/tonytonyjan/jaro_winkler/commit/8e4b0ef))
* feat: Use `append_cflags` instead of `CFLAGS` environment varaible ([f8f13a0](https://github.com/tonytonyjan/jaro_winkler/commit/f8f13a0))
## 1.6.0 (2024-08-28)
* chore: chore: bump version to v1.6.0 ([483e7ca](https://github.com/tonytonyjan/jaro_winkler/commit/483e7ca))
* fix: add "similarity" methods and deprecate "distance" methods ([1682f94](https://github.com/tonytonyjan/jaro_winkler/commit/1682f94))
* fix: use `rb_warn` over `rb_category_warn` for Ruby 2.7 ([0fd7b5d](https://github.com/tonytonyjan/jaro_winkler/commit/0fd7b5d))
* test: reduce deprecation warnings ([9ada77e](https://github.com/tonytonyjan/jaro_winkler/commit/9ada77e))
* docs: use "similarity" over "distance" in README ([97c9cbc](https://github.com/tonytonyjan/jaro_winkler/commit/97c9cbc))
## 1.5.6 (2023-05-29)
* chore: bump version to v1.5.6 ([1327330](https://github.com/tonytonyjan/jaro_winkler/commit/1327330))
* ractor safe ([240d42d](https://github.com/tonytonyjan/jaro_winkler/commit/240d42d))
## 1.5.5 (2023-05-22)
* chore: update changelog ([ecf1b26](https://github.com/tonytonyjan/jaro_winkler/commit/ecf1b26))
* chore(release): bump version to v1.5.5 ([4abbfc5](https://github.com/tonytonyjan/jaro_winkler/commit/4abbfc5))
* ci: add ruby 3 to ci ([886be74](https://github.com/tonytonyjan/jaro_winkler/commit/886be74))
* ci: replace travis with github action ([5701ef7](https://github.com/tonytonyjan/jaro_winkler/commit/5701ef7))
* ci: support windows and refine matrix ([84f3a42](https://github.com/tonytonyjan/jaro_winkler/commit/84f3a42))
* fix: Update argc type from size_t to int ([4c6ed4f](https://github.com/tonytonyjan/jaro_winkler/commit/4c6ed4f))
* Add project metadata to the gemspec ([6591989](https://github.com/tonytonyjan/jaro_winkler/commit/6591989))
* Create GitHub Actions workflow, remove dups from Travis ([7821e9d](https://github.com/tonytonyjan/jaro_winkler/commit/7821e9d))
## 1.5.4 (2019-10-30)
* chore: package license file ([dd98482](https://github.com/tonytonyjan/jaro_winkler/commit/dd98482))
* chore(release): bump version to 1.5.4 ([50990da](https://github.com/tonytonyjan/jaro_winkler/commit/50990da)), closes [#33](https://github.com/tonytonyjan/jaro_winkler/issues/33)
* Add the license to the gem for packaging. ([46b9359](https://github.com/tonytonyjan/jaro_winkler/commit/46b9359))
## 1.5.3 (2019-06-25)
* chore(release): bump version to v1.5.3 ([cb9b69e](https://github.com/tonytonyjan/jaro_winkler/commit/cb9b69e))
* Fall back to pure ruby implementation on LoadError ([49f811e](https://github.com/tonytonyjan/jaro_winkler/commit/49f811e))
* Rename Rake tasks for test ([42e0a36](https://github.com/tonytonyjan/jaro_winkler/commit/42e0a36))
* Update Rakefile ([1c05793](https://github.com/tonytonyjan/jaro_winkler/commit/1c05793))
## 1.5.2 (2019-01-04)
* chore: fix benchmark ([4508599](https://github.com/tonytonyjan/jaro_winkler/commit/4508599))
* chore(release): bump version to v1.5.2 ([aa4be61](https://github.com/tonytonyjan/jaro_winkler/commit/aa4be61))
* fix: raises TypeError when input type is not string ([c146491](https://github.com/tonytonyjan/jaro_winkler/commit/c146491)), closes [#24](https://github.com/tonytonyjan/jaro_winkler/issues/24)
* fix(memory): make sure codepoints will be allocated/freed after rb_raise to prevent memory leak ([fe9d784](https://github.com/tonytonyjan/jaro_winkler/commit/fe9d784)), closes [#20](https://github.com/tonytonyjan/jaro_winkler/issues/20)
* ci: fix travis ci ([e1b9add](https://github.com/tonytonyjan/jaro_winkler/commit/e1b9add))
* build: build cross platform gems ([eb091bf](https://github.com/tonytonyjan/jaro_winkler/commit/eb091bf))
* test: test both native and pure ruby implementation ([2ecb24b](https://github.com/tonytonyjan/jaro_winkler/commit/2ecb24b))
## 1.5.1 (2018-06-06)
* chore: add a rake task to measure performance of each version ([6368e2d](https://github.com/tonytonyjan/jaro_winkler/commit/6368e2d))
* chore: remove outdated benchmark output files ([7d590dc](https://github.com/tonytonyjan/jaro_winkler/commit/7d590dc))
* chore(benchmark): dry up benchmark codes ([5ebd36d](https://github.com/tonytonyjan/jaro_winkler/commit/5ebd36d))
* chore(benchmark): enhance the codes for measuring performance between each version ([4b05c43](https://github.com/tonytonyjan/jaro_winkler/commit/4b05c43))
* chore(release): bump version to 1.5.1 ([5d38f8e](https://github.com/tonytonyjan/jaro_winkler/commit/5d38f8e))
* build: build c extension only on MRI platform, and make a dummy Makefile for other platforms ([7e66cbf](https://github.com/tonytonyjan/jaro_winkler/commit/7e66cbf)), closes [#19](https://github.com/tonytonyjan/jaro_winkler/issues/19)
* build: update ci ([868609a](https://github.com/tonytonyjan/jaro_winkler/commit/868609a))
* build: update gems ([536e667](https://github.com/tonytonyjan/jaro_winkler/commit/536e667))
* refactor: convert types explicitly ([2db63d9](https://github.com/tonytonyjan/jaro_winkler/commit/2db63d9))
* style: remote unused variables ([64dd5cc](https://github.com/tonytonyjan/jaro_winkler/commit/64dd5cc))
## 1.5.0 (2017-10-02)
* chore: add a rake task to generate changelog automatically ([a8069f9](https://github.com/tonytonyjan/jaro_winkler/commit/a8069f9))
* chore: ignore tag file ([0ac7fce](https://github.com/tonytonyjan/jaro_winkler/commit/0ac7fce))
* chore: refine benchmark ([e6b93fb](https://github.com/tonytonyjan/jaro_winkler/commit/e6b93fb))
* chore: remove print_time task ([01e1cea](https://github.com/tonytonyjan/jaro_winkler/commit/01e1cea))
* chore(release): bump version to v1.5.0 ([f0e27a4](https://github.com/tonytonyjan/jaro_winkler/commit/f0e27a4))
* docs: fix table format ([fba1b2e](https://github.com/tonytonyjan/jaro_winkler/commit/fba1b2e))
* docs: update benchmark table ([f6f09d2](https://github.com/tonytonyjan/jaro_winkler/commit/f6f09d2))
* docs: update gemspec description ([3054a3e](https://github.com/tonytonyjan/jaro_winkler/commit/3054a3e))
* docs: update README.md ([a6ef904](https://github.com/tonytonyjan/jaro_winkler/commit/a6ef904))
* perf: optimize single byte codepoints computation ([e0cdd51](https://github.com/tonytonyjan/jaro_winkler/commit/e0cdd51))
* feat: support encodings other than utf-8 ([fe72ab4](https://github.com/tonytonyjan/jaro_winkler/commit/fe72ab4)), closes [#7](https://github.com/tonytonyjan/jaro_winkler/issues/7)
* feat: support rubinius ([27090ff](https://github.com/tonytonyjan/jaro_winkler/commit/27090ff))
* ci: add linux and osx to travis ci ([814418e](https://github.com/tonytonyjan/jaro_winkler/commit/814418e))
* ci: refine .travis.yml, update ruby versions ([330e368](https://github.com/tonytonyjan/jaro_winkler/commit/330e368))
* style: normalize codeing style with clang-format ([20865f4](https://github.com/tonytonyjan/jaro_winkler/commit/20865f4))
* style: normalize coding styles with rubocop ([3864897](https://github.com/tonytonyjan/jaro_winkler/commit/3864897))
* fix: free codepoints before returning to prevent memory leak ([8babd4f](https://github.com/tonytonyjan/jaro_winkler/commit/8babd4f))
* fix: remove module functions from JaroWinkler ([af249d5](https://github.com/tonytonyjan/jaro_winkler/commit/af249d5))
* refactor: declare DEFAULT_OPT as a global variable ([554f4cf](https://github.com/tonytonyjan/jaro_winkler/commit/554f4cf))
* refactor: prefer sizeof(variable) over sizeof(type) ([1a37c7e](https://github.com/tonytonyjan/jaro_winkler/commit/1a37c7e))
* refactor: rename code.h to codepoints.h ([106da9c](https://github.com/tonytonyjan/jaro_winkler/commit/106da9c))
* refactor: rename functions, variables and arguments to make them more sense ([71f9e95](https://github.com/tonytonyjan/jaro_winkler/commit/71f9e95))
* refactor: stop using char or int or short or long or unsigned, use int8_t, int16_t, int32_t, int64_t ([ba2b936](https://github.com/tonytonyjan/jaro_winkler/commit/ba2b936))
* refactor: use ruby's built-in hash function ([16883ce](https://github.com/tonytonyjan/jaro_winkler/commit/16883ce)), closes [#14](https://github.com/tonytonyjan/jaro_winkler/issues/14)
* test: use assert_in_delta for comparing floats ([feca1a5](https://github.com/tonytonyjan/jaro_winkler/commit/feca1a5))
* add changelog [ci skip] ([de15eba](https://github.com/tonytonyjan/jaro_winkler/commit/de15eba))
* Add JRuby 9.0.4.0 to the Travis CLI tests ([29fe873](https://github.com/tonytonyjan/jaro_winkler/commit/29fe873))
* add ruby 2.3 test ([bb3fed8](https://github.com/tonytonyjan/jaro_winkler/commit/bb3fed8))
* Add ruby 2.4.0 to .travis.yml ([b49f202](https://github.com/tonytonyjan/jaro_winkler/commit/b49f202))
* fix ci ([f2b959f](https://github.com/tonytonyjan/jaro_winkler/commit/f2b959f))
* fix ci ([71bbdc9](https://github.com/tonytonyjan/jaro_winkler/commit/71bbdc9))
* fix rakefile ([f5cd294](https://github.com/tonytonyjan/jaro_winkler/commit/f5cd294))
* refine adj table ([104c1af](https://github.com/tonytonyjan/jaro_winkler/commit/104c1af))
* support jruby19 ([84c85a9](https://github.com/tonytonyjan/jaro_winkler/commit/84c85a9))
* there has been gem for java platform since 1.4.0 [ci skip] ([093bd09](https://github.com/tonytonyjan/jaro_winkler/commit/093bd09))
* typo ([6b8be41](https://github.com/tonytonyjan/jaro_winkler/commit/6b8be41))
* update benchmark ([ce27575](https://github.com/tonytonyjan/jaro_winkler/commit/ce27575))
* update README ([46e3137](https://github.com/tonytonyjan/jaro_winkler/commit/46e3137))
* update README [ci skip] ([d88d73e](https://github.com/tonytonyjan/jaro_winkler/commit/d88d73e))
* Update ruby version in .travis.yml to 2.3.3/2.2.6/2.1.10 ([6d11b55](https://github.com/tonytonyjan/jaro_winkler/commit/6d11b55))
* use `pragma once` instead of `include guard`, it's been ([2d7f43e](https://github.com/tonytonyjan/jaro_winkler/commit/2d7f43e))
### BREAKING CHANGE
* JaroWinkler no longer supports mixin, use class methods instead, ex.
`JaroWinkler.distance`
## 1.4.0 (2015-12-12)
* add JaroWinkler.jaro_distance ([7347807](https://github.com/tonytonyjan/jaro_winkler/commit/7347807))
* add jruby to ci ([c46d208](https://github.com/tonytonyjan/jaro_winkler/commit/c46d208))
* correct wording ([ead6367](https://github.com/tonytonyjan/jaro_winkler/commit/ead6367))
* fix benchmark ([72e64ca](https://github.com/tonytonyjan/jaro_winkler/commit/72e64ca))
* gems that are used in benchmark should not be defined in gemspec. ([8fd870e](https://github.com/tonytonyjan/jaro_winkler/commit/8fd870e))
* indent ([609a56d](https://github.com/tonytonyjan/jaro_winkler/commit/609a56d))
* more test ([5c9f8b6](https://github.com/tonytonyjan/jaro_winkler/commit/5c9f8b6))
* prevent warning from rspec ([0a94fcb](https://github.com/tonytonyjan/jaro_winkler/commit/0a94fcb))
* reimplement pure ruby version ([8c63b8a](https://github.com/tonytonyjan/jaro_winkler/commit/8c63b8a))
* replace rspec with minitest ([accc6eb](https://github.com/tonytonyjan/jaro_winkler/commit/accc6eb))
* use rake gem packaging tasks instead of bundler ([871c0d7](https://github.com/tonytonyjan/jaro_winkler/commit/871c0d7))
jaro_winkler-1.7.0/Gemfile 0000664 0000000 0000000 00000000172 15142757443 0015514 0 ustar 00root root 0000000 0000000 source 'https://rubygems.org'
gemspec
group :benchmark do
gem 'fuzzy-string-match'
gem 'hotwater'
gem 'amatch'
end jaro_winkler-1.7.0/LICENSE.txt 0000664 0000000 0000000 00000002055 15142757443 0016046 0 ustar 00root root 0000000 0000000 Copyright (c) 2014 Jian Weihang
MIT License
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
jaro_winkler-1.7.0/README.md 0000664 0000000 0000000 00000015454 15142757443 0015511 0 ustar 00root root 0000000 0000000 
[jaro_winkler](https://rubygems.org/gems/jaro_winkler) is an implementation of the [Jaro-Winkler similarity](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It is written as a C extension and falls back to a pure Ruby version on platforms other than MRI/KRI, such as JRuby or Rubinius. **Both the C and the Ruby implementations support any kind of string encoding, such as UTF-8, EUC-JP, Big5, etc.**
# Installation
```
gem install jaro_winkler
```
# Usage
```ruby
require 'jaro_winkler'
# Jaro Winkler Similarity
JaroWinkler.similarity "MARTHA", "MARHTA"
# => 0.9611
JaroWinkler.similarity "MARTHA", "marhta", ignore_case: true
# => 0.9611
JaroWinkler.similarity "MARTHA", "MARHTA", weight: 0.2
# => 0.9778
# Jaro Similarity
JaroWinkler.jaro_similarity "MARTHA", "MARHTA"
# => 0.9444444444444445
```
There is no `JaroWinkler.jaro_winkler_similarity` because that name would be tediously long.
## Options
Name | Type | Default | Note
----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
ignore_case | boolean | false | All lower case characters are converted to upper case prior to the comparison.
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro similarity above the threshold.
adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".
# Adjusting Table
## Default Table
```
['A', 'E'], ['A', 'I'], ['A', 'O'], ['A', 'U'], ['B', 'V'], ['E', 'I'], ['E', 'O'], ['E', 'U'], ['I', 'O'], ['I', 'U'],
['O', 'U'], ['I', 'Y'], ['E', 'Y'], ['C', 'G'], ['E', 'F'], ['W', 'U'], ['W', 'V'], ['X', 'K'], ['S', 'Z'], ['X', 'S'],
['Q', 'C'], ['U', 'V'], ['M', 'N'], ['L', 'I'], ['Q', 'O'], ['P', 'R'], ['I', 'J'], ['2', 'Z'], ['5', 'S'], ['8', 'B'],
['1', 'I'], ['1', 'L'], ['0', 'O'], ['0', 'Q'], ['C', 'K'], ['G', 'J'], ['E', ' '], ['Y', ' '], ['S', ' ']
```
## How does it work?
Original Formula:
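$$
\text{sim} = \begin{cases} 0 & \text{if } m = 0 \\ \frac{1}{3}\left(\frac{m}{|A|} + \frac{m}{|B|} + \frac{m - t}{m}\right) & \text{others} \end{cases}
$$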
where
- `m` is the number of matching characters.
- `t` is half the number of transpositions.
With Adjusting Table:
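$$
\text{sim} = \begin{cases} 0 & \text{if } m = 0 \\ \frac{1}{3}\left(\frac{\frac{s}{10} + m}{|A|} + \frac{\frac{s}{10} + m}{|B|} + \frac{m - t}{m}\right) & \text{others} \end{cases}
$$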
where
- `s` is the number of nonmatching but similar characters.
# Why This?
There is another similar gem named [fuzzy-string-match](https://github.com/kiyoka/fuzzy-string-match), which also provides both C and Ruby versions.
I reinvented this wheel because the naming in `fuzzy-string-match`, such as `getDistance`, breaks Ruby conventions, and it contains some weird code like `a1 = s1.split( // )` (`s1.chars` would be better). Furthermore, it is buggy (see the tables below).
# Compare with other gems
| | jaro_winkler | fuzzystringmatch | hotwater | amatch |
|-----------------|--------------|------------------|----------|---------|
| Encoding Support| **Yes** | Pure Ruby only | No | No |
| Windows Support | **Yes** | ? | No | **Yes** |
| Adjusting Table | **Yes** | No | No | No |
| Native | **Yes** | **Yes** | **Yes** | **Yes** |
| Pure Ruby | **Yes** | **Yes** | No | No |
| Speed | **1st** | 3rd | 2nd | 4th |
I made a table below to compare accuracy between each gem:
str_1 | str_2 | origin | jaro_winkler | fuzzystringmatch | hotwater | amatch
--- | --- | --- | --- | --- | --- | ---
"henka" | "henkan" | 0.9667 | 0.9667 | **0.9722** | 0.9667 | **0.9444**
"al" | "al" | 1.0 | 1.0 | 1.0 | 1.0 | 1.0
"martha" | "marhta" | 0.9611 | 0.9611 | 0.9611 | 0.9611 | **0.9444**
"jones" | "johnson" | 0.8324 | 0.8324 | 0.8324 | 0.8324 | **0.7905**
"abcvwxyz" | "cabvwxyz" | 0.9583 | 0.9583 | 0.9583 | 0.9583 | 0.9583
"dwayne" | "duane" | 0.84 | 0.84 | 0.84 | 0.84 | **0.8222**
"dixon" | "dicksonx" | 0.8133 | 0.8133 | 0.8133 | 0.8133 | **0.7667**
"fvie" | "ten" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
- The "origin" result is from the [original C implementation by the author of the algorithm](http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
- Test data are borrowed from [fuzzy-string-match's rspec file](https://github.com/kiyoka/fuzzy-string-match/blob/master/test/basic_pure_spec.rb).
# Benchmark
```
$ bundle exec rake benchmark
ruby 2.4.1p111 (2017-03-22 revision 58053) [x86_64-darwin16]
# C Extension
Rehearsal --------------------------------------------------------------
jaro_winkler (8c16e09) 0.240000 0.000000 0.240000 ( 0.241347)
fuzzy-string-match (1.0.1) 0.400000 0.010000 0.410000 ( 0.403673)
hotwater (0.1.2) 0.250000 0.000000 0.250000 ( 0.254503)
amatch (0.4.0) 0.870000 0.000000 0.870000 ( 0.875930)
----------------------------------------------------- total: 1.770000sec
user system total real
jaro_winkler (8c16e09) 0.230000 0.000000 0.230000 ( 0.236921)
fuzzy-string-match (1.0.1) 0.380000 0.000000 0.380000 ( 0.381942)
hotwater (0.1.2) 0.250000 0.000000 0.250000 ( 0.254977)
amatch (0.4.0) 0.860000 0.000000 0.860000 ( 0.861207)
# Pure Ruby
Rehearsal --------------------------------------------------------------
jaro_winkler (8c16e09) 0.440000 0.000000 0.440000 ( 0.438470)
fuzzy-string-match (1.0.1) 0.860000 0.000000 0.860000 ( 0.862850)
----------------------------------------------------- total: 1.300000sec
user system total real
jaro_winkler (8c16e09) 0.440000 0.000000 0.440000 ( 0.439237)
fuzzy-string-match (1.0.1) 0.910000 0.010000 0.920000 ( 0.920259)
```
# Todo
- Custom adjusting word table.
jaro_winkler-1.7.0/Rakefile 0000664 0000000 0000000 00000005723 15142757443 0015675 0 ustar 00root root 0000000 0000000 require 'rubygems/package_task'
require 'rake/extensiontask'
require 'rake/testtask'
task default: :test
task test: %w[test:pure_ruby test:compiled]
task benchmark: %w[benchmark:native benchmark:pure]
# Prints the interpreter description followed by one blank line; used as a
# prerequisite of the benchmark tasks so results are labelled with the Ruby used.
task(:print_ruby_version) { puts RUBY_DESCRIPTION, '' }
namespace :benchmark do
  # Runs benchmark/native.rb (C extension vs. other native gems).
  task native: :print_ruby_version do
    puts '# C Extension'
    load File.expand_path("../benchmark/native.rb", __FILE__)
    puts
  end

  # Runs benchmark/pure.rb (pure Ruby implementation vs. fuzzy-string-match).
  task pure: :print_ruby_version do
    puts '# Pure Ruby'
    load File.expand_path("../benchmark/pure.rb", __FILE__)
    puts
  end

  # Checks out each selected tag, rebuilds the extension and runs
  # benchmark/measure.rb, printing CSV to stdout (command noise goes to stderr).
  #
  # Tags come from the comma-separated TAGS environment variable, or default to
  # every git tag of the form v1.<1-9>.<single digit>.
  #
  # NOTE(review): benchmark/measure.rb raises unless JARO_WINKLER_VERSION or a
  # version argument is supplied, and this task passes neither -- confirm how
  # this task is expected to be invoked.
  task :measure do
    tags =
      if ENV['TAGS']
        ENV['TAGS'].split(',')
      else
        # Parenthesised so the regexp literal is not parsed ambiguously.
        `git tag --list`.split.select { |v| v.match?(/\Av1\.[1-9]\.\d\z/) }
      end
    puts 'version,label,utime,stime,cutime,cstime,real'
    tags.each do |tag|
      sh("git checkout -f #{tag} 1>&2")
      # Always benchmark with the current benchmark scripts, not the tag's.
      sh('git checkout master -- benchmark 1>&2')
      sh('bundle exec rake clobber compile 1>&2')
      sh("ruby #{File.expand_path("../benchmark/measure.rb", __FILE__)}")
    end
  end
end
# Prints a Markdown-style table comparing the scores that jaro_winkler,
# fuzzy-string-match, hotwater and amatch give for a fixed list of string pairs.
task compare: :compile do
  # Required lazily: these gems are only needed (and installed) for this task.
  require 'jaro_winkler'
  require 'fuzzystringmatch'
  require 'hotwater'
  require 'amatch'

  pairs = [
    ['henka', 'henkan'], ['al', 'al'], ['martha', 'marhta'], ['jones', 'johnson'],
    ['abcvwxyz', 'cabvwxyz'], ['dwayne', 'duane'], ['dixon', 'dicksonx'], ['fvie', 'ten'],
    ['San Francisco', 'Santa Monica']
  ]
  jarow = FuzzyStringMatch::JaroWinkler.create(:native)

  table = [
    %w[str_1 str_2 jaro_winkler fuzzystringmatch hotwater amatch],
    %w[--- --- --- --- --- ---]
  ]
  pairs.each do |str_1, str_2|
    table << [
      "\"#{str_1}\"",
      "\"#{str_2}\"",
      JaroWinkler.similarity(str_1, str_2).round(4),
      jarow.getDistance(str_1, str_2).round(4),
      Hotwater.jaro_winkler_distance(str_1, str_2).round(4),
      Amatch::Jaro.new(str_1).match(str_2).round(4)
    ]
  end

  # Widest cell of every column, so that all rows can be left-aligned.
  col_len = table.transpose.map { |column| column.map { |cell| cell.to_s.length }.max }
  table.each do |row|
    puts row.each_with_index.map { |cell, i| cell.to_s.ljust(col_len[i]) }.join(' | ')
  end
end
# The C extension is only built when running on the 'ruby' engine; on any other
# engine :compile is a stub that just reports the fallback.
if RUBY_ENGINE != 'ruby'
  task(:compile) { puts 'Can not compile C extension, fallback to pure Ruby version.' }
else
  Rake::ExtensionTask.new('jaro_winkler_ext') do |extension|
    extension.lib_dir = 'lib/jaro_winkler'
    extension.ext_dir = 'ext/jaro_winkler'
  end
end
namespace :test do
  # test:compiled depends on :compile and runs the extension test file;
  # test:pure_ruby runs the pure Ruby test file.
  [
    [{ compiled: :compile }, 'test/test_jaro_winkler.rb'],
    [:pure_ruby, 'test/test_pure_ruby.rb']
  ].each do |task_name, test_file|
    Rake::TestTask.new(task_name) do |t|
      t.libs << 'test'
      t.test_files = FileList[test_file]
      t.verbose = true
    end
  end
end
# Defines gem packaging tasks for both gemspecs that sit next to this Rakefile.
%w[jaro_winkler jaro_winkler.java].each do |name|
  gemspec_path = File.expand_path("../#{name}.gemspec", __FILE__)
  Gem::PackageTask.new(Gem::Specification.load(gemspec_path)).define
end
# Regenerates CHANGELOG.md in place by shelling out to conventional-changelog.
task('CHANGELOG.md') { sh 'conventional-changelog -p angular -i CHANGELOG.md -s' }
jaro_winkler-1.7.0/benchmark/ 0000775 0000000 0000000 00000000000 15142757443 0016153 5 ustar 00root root 0000000 0000000 jaro_winkler-1.7.0/benchmark/env.rb 0000664 0000000 0000000 00000000404 15142757443 0017266 0 ustar 00root root 0000000 0000000 # frozen_string_literal: true
$LOAD_PATH << File.expand_path('../../lib', __FILE__)
require 'bundler'
Bundler.setup(:benchmark)
require File.expand_path('../samples', __FILE__)
# Builds a benchmark label of the form "name (version)" for an already loaded gem.
#
# @param gem [String] name of a gem present in Gem.loaded_specs
# @return [String] the gem name followed by its loaded version in parentheses
def gem_name_with_version(gem)
  spec = Gem.loaded_specs[gem]
  format('%s (%s)', gem, spec.version)
end
jaro_winkler-1.7.0/benchmark/measure.rb 0000664 0000000 0000000 00000002573 15142757443 0020150 0 ustar 00root root 0000000 0000000 # frozen_string_literal: true
require File.expand_path('../samples', __FILE__)
gem 'jaro_winkler', ENV['JARO_WINKLER_VERSION'] || ARGV[0] || raise('missing ENV["JARO_WINKLER_VERSION"]')
require 'benchmark'
require 'csv'

# Benchmarks one installed jaro_winkler version (selected through
# JARO_WINKLER_VERSION or the first command-line argument) and writes one CSV
# row per sample set to stdout: version,label,utime,stime,cutime,cstime,real.
csv = CSV.new($stdout)
n = 100_000
version = Gem::Version.new(Gem.loaded_specs["jaro_winkler"].version)
if version >= Gem::Version.new('1.1.0') && version < Gem::Version.new('1.2.4')
  require 'jaro_winkler.bundle'
else
  require 'jaro_winkler'
end

# The entry point differs between releases, so each job is picked by version.
# The method is called directly (no dynamic dispatch) to keep the measured
# work free of extra overhead.
jobs = {
  ascii: -> { n.times { SAMPLES[:ascii].each { |str1, str2| JaroWinkler.jaro_winkler_distance(str1, str2) } } }
}
if version >= Gem::Version.new('1.1.0')
  jobs[:ascii] = -> { n.times { SAMPLES[:ascii].each { |str1, str2| JaroWinkler.c_distance(str1, str2) } } }
end
if version >= Gem::Version.new('1.2.0')
  jobs[:utf8] = -> { n.times { SAMPLES[:utf8].each { |str1, str2| JaroWinkler.c_distance(str1, str2) } } }
end
if version >= Gem::Version.new('1.4.0')
  if JaroWinkler.respond_to?(:similarity)
    jobs[:ascii] = -> { n.times { SAMPLES[:ascii].each { |str1, str2| JaroWinkler.similarity(str1, str2) } } }
    jobs[:utf8] = -> { n.times { SAMPLES[:utf8].each { |str1, str2| JaroWinkler.similarity(str1, str2) } } }
  else
    # Releases before the "similarity" methods were introduced only expose
    # JaroWinkler.distance; calling similarity there raises NoMethodError.
    jobs[:ascii] = -> { n.times { SAMPLES[:ascii].each { |str1, str2| JaroWinkler.distance(str1, str2) } } }
    jobs[:utf8] = -> { n.times { SAMPLES[:utf8].each { |str1, str2| JaroWinkler.distance(str1, str2) } } }
  end
end

# rehearsal
jobs.each { |label, job| Benchmark.measure(label, &job) }

# take
jobs.each do |label, job|
  GC.start
  tms = Benchmark.measure(label, &job)
  # version,label,utime,stime,cutime,cstime,real
  csv << [version, *tms.to_a]
end
jaro_winkler-1.7.0/benchmark/native.rb 0000664 0000000 0000000 00000001610 15142757443 0017764 0 ustar 00root root 0000000 0000000 # frozen_string_literal: true
require File.expand_path('../env', __FILE__)
require 'benchmark'
require 'jaro_winkler/jaro_winkler_ext'
require 'fuzzystringmatch'
require 'hotwater'
require 'amatch'

# Compares the C extension against other native gems on the ASCII samples.
n = 100_000

Benchmark.bmbm do |bench|
  # Labelled with the short git revision of the working tree.
  bench.report("jaro_winkler (#{`git rev-parse --short HEAD`.chop!})") do
    n.times do
      SAMPLES[:ascii].each { |str1, str2| JaroWinkler.similarity(str1, str2) }
    end
  end

  bench.report(gem_name_with_version('fuzzy-string-match')) do
    jarow = FuzzyStringMatch::JaroWinkler.create(:native)
    n.times do
      SAMPLES[:ascii].each { |str1, str2| jarow.getDistance(str1, str2) }
    end
  end

  bench.report(gem_name_with_version('hotwater')) do
    n.times do
      SAMPLES[:ascii].each { |str1, str2| Hotwater.jaro_winkler_distance(str1, str2) }
    end
  end

  bench.report(gem_name_with_version('amatch')) do
    n.times do
      SAMPLES[:ascii].each { |str1, str2| Amatch::Jaro.new(str1).match(str2) }
    end
  end
end
jaro_winkler-1.7.0/benchmark/pure.rb 0000664 0000000 0000000 00000001074 15142757443 0017455 0 ustar 00root root 0000000 0000000 # frozen_string_literal: true
require File.expand_path('../env', __FILE__)
require 'benchmark'
require 'jaro_winkler/jaro_winkler_pure'
require 'fuzzystringmatch'

# Compares the pure Ruby implementation against fuzzy-string-match's pure mode
# on the ASCII samples.
n = 10_000

Benchmark.bmbm do |bench|
  # Labelled with the short git revision of the working tree.
  bench.report("jaro_winkler (#{`git rev-parse --short HEAD`.chop!})") do
    n.times do
      SAMPLES[:ascii].each { |str1, str2| JaroWinkler.similarity(str1, str2) }
    end
  end

  bench.report(gem_name_with_version('fuzzy-string-match')) do
    jarow = FuzzyStringMatch::JaroWinkler.create(:pure)
    n.times do
      SAMPLES[:ascii].each { |str1, str2| jarow.getDistance(str1, str2) }
    end
  end
end
jaro_winkler-1.7.0/benchmark/samples.rb 0000664 0000000 0000000 00000001031 15142757443 0020137 0 ustar 00root root 0000000 0000000 SAMPLES = {
ascii: [
%w[al al], %w[martha marhta], %w[jones johnson], %w[abcvwxyz cabvwxyz],
%w[dwayne duane], %w[dixon dicksonx], %w[fvie ten]
].freeze,
utf8: [
%w[馬英九 馬英丸], %w[蔡英文 蔡中文], %w[簡煒航 簡偉航], %w[焦玟綾 焦紋綾],
%w[眼球中央電視台 眼球中英電視台], %w[床前明月光 床前日月光],
%w[海水退了就知道誰沒穿褲子 海水退了就知道誰沒穿襪子],
%w[阿里山的姑娘美如水 阿里山的姑娘沒乳水]
].freeze
}.freeze
jaro_winkler-1.7.0/bin/ 0000775 0000000 0000000 00000000000 15142757443 0014771 5 ustar 00root root 0000000 0000000 jaro_winkler-1.7.0/bin/measure 0000775 0000000 0000000 00000000377 15142757443 0016367 0 ustar 00root root 0000000 0000000 #!/bin/sh
# Prints a CSV header, then runs benchmark/measure.rb once for every
# jaro_winkler version listed by `gem search`, in sorted order.
# Version numbers are matched with [0-9] instead of \d, which is not part of
# POSIX basic regular expressions.
echo version,label,utime,stime,cutime,cstime,real
gem search -ear jaro_winkler \
  | grep -o '\((.*)\)$' \
  | tr -d '() ' \
  | tr ',' "\n" \
  | grep -o '[0-9]\.[0-9]\.[0-9]' \
  | sort \
  | xargs -I{} ruby "$(dirname "$0")"/../benchmark/measure.rb '{}'
jaro_winkler-1.7.0/ext/ 0000775 0000000 0000000 00000000000 15142757443 0015021 5 ustar 00root root 0000000 0000000 jaro_winkler-1.7.0/ext/jaro_winkler/ 0000775 0000000 0000000 00000000000 15142757443 0017507 5 ustar 00root root 0000000 0000000 jaro_winkler-1.7.0/ext/jaro_winkler/adj_matrix.c 0000664 0000000 0000000 00000006041 15142757443 0021776 0 ustar 00root root 0000000 0000000 #include "adj_matrix.h"
#include "codepoints.h"
#include "ruby.h"
/*
 * Default table of "similar" character pairs, stored flat: entries 2k and
 * 2k + 1 form one pair. adj_matrix_default() walks the array two entries at
 * a time and registers the first byte of each string of a pair.
 */
const char *DEFAULT_ADJ_TABLE[] = {
"A", "E", "A", "I", "A", "O", "A", "U", "B", "V", "E", "I", "E",
"O", "E", "U", "I", "O", "I", "U", "O", "U", "I", "Y", "E", "Y",
"C", "G", "E", "F", "W", "U", "W", "V", "X", "K", "S", "Z", "X",
"S", "Q", "C", "U", "V", "M", "N", "L", "I", "Q", "O", "P", "R",
"I", "J", "2", "Z", "5", "S", "8", "B", "1", "I", "1", "L", "0",
"O", "0", "Q", "C", "K", "G", "J", "E", " ", "Y", " ", "S", " "};
void node_free(Node *head);
AdjMatrix *adj_matrix_new(uint32_t length) {
AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
matrix->table = malloc(matrix->length * sizeof(Node **));
for (size_t i = 0; i < matrix->length; i++) {
matrix->table[i] = malloc(matrix->length * sizeof(Node *));
for (size_t j = 0; j < matrix->length; j++)
matrix->table[i][j] = NULL;
}
return matrix;
}
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y) {
uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
ADJ_MATRIX_DEFAULT_LENGTH,
h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
ADJ_MATRIX_DEFAULT_LENGTH;
Node *new_node = malloc(sizeof(Node));
new_node->x = h1;
new_node->y = h2;
new_node->next = NULL;
if (matrix->table[h1][h2] == NULL) {
matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
} else {
Node *previous = NULL;
for (Node *i = matrix->table[h1][h2]; i != NULL; i = i->next)
previous = i;
previous->next = new_node;
}
}
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y) {
uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
ADJ_MATRIX_DEFAULT_LENGTH,
h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
ADJ_MATRIX_DEFAULT_LENGTH;
Node *node = matrix->table[h1][h2];
if (node == NULL)
return 0;
else {
for (Node *i = node; i != NULL; i = i->next)
if ((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1))
return 1;
return 0;
}
}
/* Frees every node of the list starting at `head`; a NULL head is a no-op. */
void node_free(Node *head) {
  while (head != NULL) {
    Node *rest = head->next;
    free(head);
    head = rest;
  }
}
/*
 * Releases every node list, every row, the row table and the matrix itself.
 * After a list is freed both the cell and its mirror cell are cleared, so a
 * list that is reachable from the two mirrored cells is only freed once.
 */
void adj_matrix_free(AdjMatrix *matrix) {
  for (size_t row = 0; row < matrix->length; row++) {
    for (size_t col = 0; col < matrix->length; col++) {
      Node *head = matrix->table[row][col];
      if (head == NULL)
        continue;
      node_free(head);
      matrix->table[row][col] = matrix->table[col][row] = NULL;
    }
    free(matrix->table[row]);
  }
  free(matrix->table);
  free(matrix);
}
/*
 * Returns the matrix built from DEFAULT_ADJ_TABLE. It is created on the
 * first call and the same pointer is returned on every later call.
 */
AdjMatrix *adj_matrix_default() {
  static char first_time = 1;
  static AdjMatrix *ret_matrix;
  if (!first_time)
    return ret_matrix;

  ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
  const size_t entries = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char *);
  /* Entries are laid out as consecutive pairs; only the first byte is used. */
  for (size_t k = 0; k < entries; k += 2) {
    uint64_t left = *DEFAULT_ADJ_TABLE[k] & 0xff;
    uint64_t right = *DEFAULT_ADJ_TABLE[k + 1] & 0xff;
    adj_matrix_add(ret_matrix, left, right);
  }
  first_time = 0;
  return ret_matrix;
}
jaro_winkler-1.7.0/ext/jaro_winkler/adj_matrix.h 0000664 0000000 0000000 00000000750 15142757443 0022004 0 ustar 00root root 0000000 0000000 #pragma once
#include "stdint.h"
/* Side length used when adj_matrix_new() is called with 0, and for the default matrix. */
#define ADJ_MATRIX_DEFAULT_LENGTH 958
/* Seed passed to st_hash() when codes are hashed into matrix indices. */
#define ADJ_MATRIX_SEED 9527
/* Node of the singly linked list stored in a matrix cell. */
typedef struct _node {
/* Next node of the same cell, or NULL at the end of the list. */
struct _node *next;
/* The two values recorded for this entry by adj_matrix_add(). */
uint64_t x, y;
} Node;
/* Square table of list heads; unused cells are NULL. */
typedef struct {
/* table[i][j] is the head of the list stored at row i, column j. */
Node ***table;
/* Number of rows, and of columns, in table. */
uint32_t length;
} AdjMatrix;
/* Allocates a matrix with `length` rows and columns (0 selects the default length). */
AdjMatrix *adj_matrix_new(uint32_t length);
/* Registers the pair (x, y); it is reachable from both mirrored cells. */
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y);
/* Returns 1 when a matching entry is found for (x, y), 0 otherwise. */
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y);
/* Frees all nodes, all rows and the matrix itself. */
void adj_matrix_free(AdjMatrix *matrix);
/* Returns the matrix built from the built-in default pair table. */
AdjMatrix *adj_matrix_default();
jaro_winkler-1.7.0/ext/jaro_winkler/codepoints.c 0000664 0000000 0000000 00000003304 15142757443 0022022 0 ustar 00root root 0000000 0000000 #include "codepoints.h"
#include "ruby.h"
#include "ruby/encoding.h"
#include
#include
#include
// this function is copied from string.c
/*
 * Returns 1 when every character of `str` is known to occupy exactly one
 * byte, 0 when that cannot be established cheaply.
 */
static inline int single_byte_optimizable(VALUE str) {
  /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
    return 1;
  /* Otherwise only encodings whose characters are at most one byte wide
   * qualify. Conservative. Possibly single byte: "\xa1" in Shift_JIS for
   * example. */
  return rb_enc_mbmaxlen(rb_enc_get(str)) == 1;
}
/*
 * Decodes the Ruby string `str` into an array of code points stored in
 * `codepoints`. The buffer is obtained with malloc and must be released with
 * codepoints_free.
 *
 * Strings that are single-byte optimizable are copied byte by byte; all
 * others are decoded character by character with their own encoding.
 *
 * Raises NoMemoryError when the buffer cannot be allocated and ArgumentError
 * when `str` contains a byte sequence that is invalid in its encoding. In
 * both cases no memory is left allocated and `codepoints->data` is NULL.
 */
void codepoints_init(CodePoints *codepoints, VALUE str) {
  const char *ptr, *end;
  rb_encoding *enc;

  codepoints->length = 0;
  codepoints->data = NULL;

  if (single_byte_optimizable(str)) {
    const size_t length = RSTRING_LEN(str);
    size_t i;

    ptr = RSTRING_PTR(str);
    /* Always request at least one element so that a NULL result can only
     * mean allocation failure, never a zero-sized request. */
    codepoints->size = length ? length : 1;
    codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
    if (!codepoints->data)
      rb_memerror();
    for (i = 0; i < length; i++)
      codepoints->data[i] = ptr[i] & 0xff;
    codepoints->length = length;
    return;
  }

  /* Take the frozen copy before allocating: it may raise, and nothing must
   * be owned by this function at that point. */
  str = rb_str_new_frozen(str);
  ptr = RSTRING_PTR(str);
  end = RSTRING_END(str);
  enc = rb_enc_get(str);

  codepoints->size = 32;
  codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data));
  if (!codepoints->data)
    rb_memerror();

  while (ptr < end) {
    /* Validate the character ourselves instead of letting
     * rb_enc_codepoint_len raise, so the buffer can be released first. */
    const int mbclen = rb_enc_precise_mbclen(ptr, end, enc);
    if (!MBCLEN_CHARFOUND_P(mbclen)) {
      free(codepoints->data);
      codepoints->data = NULL;
      codepoints->length = 0;
      rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
    }

    if (codepoints->length == codepoints->size) {
      const size_t new_size = codepoints->size * 2;
      uint32_t *grown =
          realloc(codepoints->data, new_size * sizeof(*codepoints->data));
      if (!grown) {
        /* realloc leaves the old block untouched on failure. */
        free(codepoints->data);
        codepoints->data = NULL;
        codepoints->length = 0;
        rb_memerror();
      }
      codepoints->data = grown;
      codepoints->size = new_size;
    }

    codepoints->data[codepoints->length++] =
        (uint32_t)rb_enc_mbc_to_codepoint(ptr, end, enc);
    ptr += MBCLEN_CHARFOUND_LEN(mbclen);
  }
  /* Keep the frozen copy alive while `ptr` still points into it. */
  RB_GC_GUARD(str);
}
/* Releases the code point buffer held by `codepoints`. The struct itself is
 * owned by the caller and is not freed. */
void codepoints_free(CodePoints *codepoints) {
  uint32_t *buffer = codepoints->data;
  free(buffer);
}
jaro_winkler-1.7.0/ext/jaro_winkler/codepoints.h 0000664 0000000 0000000 00000000355 15142757443 0022032 0 ustar 00root root 0000000 0000000 #pragma once
#include "ruby.h"
#include
#include
/* Array of decoded code points filled in by codepoints_init. */
typedef struct {
/* malloc'ed buffer; released by codepoints_free */
uint32_t *data;
/* number of code points stored in data */
size_t length;
/* capacity of data in elements; codepoints_init only sets it on its
 * multi-byte path */
size_t size;
} CodePoints;
/* Fills the struct with the code points of the Ruby string `str`. */
void codepoints_init(CodePoints *, VALUE str);
/* Frees the buffer allocated by codepoints_init. */
void codepoints_free(CodePoints *);
jaro_winkler-1.7.0/ext/jaro_winkler/extconf.rb 0000664 0000000 0000000 00000000172 15142757443 0021502 0 ustar 00root root 0000000 0000000 # frozen_string_literal: true
require 'mkmf'
# Build the extension as C99: the sources declare loop variables inside `for`
# statements and use designated initializers.
append_cflags(['-std=c99'])
create_makefile('jaro_winkler/jaro_winkler_ext') jaro_winkler-1.7.0/ext/jaro_winkler/jaro.c 0000664 0000000 0000000 00000010047 15142757443 0020610 0 ustar 00root root 0000000 0000000 #include "jaro.h"
#include "adj_matrix.h"
#include "codepoints.h"
#include
#include
#include
#if HAVE_ALLOCA_H
# include
#elif defined __GNUC__
# define alloca __builtin_alloca
#elif defined _AIX
# define alloca __alloca
#elif defined _MSC_VER
# include
# define alloca _alloca
#else
# include
# ifdef __cplusplus
extern "C"
# endif
void *alloca (size_t);
#endif
/* Values used to initialise DEFAULT_OPTIONS. */
#define DEFAULT_WEIGHT 0.1
#define DEFAULT_THRESHOLD 0.7
/* Exchanges x and y through a temporary of the given type. The temporary is
 * itself named SWAP and the arguments are expanded without parentheses, so
 * pass plain lvalues only. */
#define SWAP(type, x, y) \
do { \
type SWAP = x; \
x = y; \
y = SWAP; \
} while (0)
/* Option set copied by callers before applying user overrides: default
 * weight and threshold, case-sensitive, adjacency table disabled. */
const Options DEFAULT_OPTIONS = {.weight = DEFAULT_WEIGHT,
.threshold = DEFAULT_THRESHOLD,
.ignore_case = 0,
.adj_table = 0};
/*
 * Computes the Jaro similarity of two code point arrays.
 *
 * codepoints1/len1, codepoints2/len2: the two inputs. When opt->ignore_case
 *   is set, ASCII letters in BOTH arrays are upper-cased in place; callers
 *   rely on that when they compare prefixes afterwards.
 * opt: ignore_case and adj_table are read here.
 *
 * Returns a value in [0, 1]; 0.0 when either input is empty or nothing
 * matches. Raises NoMemoryError if scratch space for long inputs cannot be
 * allocated.
 */
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
                                uint32_t *codepoints2, size_t len2,
                                Options *opt) {
  /* Match flags for inputs up to this combined length live on the stack;
   * longer inputs use the heap so that input size cannot overflow the
   * stack. */
  enum { STACK_FLAG_BYTES = 1024 };
  char stack_flags[STACK_FLAG_BYTES];
  char *heap_flags = NULL;

  if (!len1 || !len2)
    return 0.0;

  /* From here on, "1" is always the shorter input. */
  if (len1 > len2) {
    SWAP(uint32_t *, codepoints1, codepoints2);
    SWAP(size_t, len1, len2);
  }

  if (opt->ignore_case) {
    /* ASCII-only folding. tolower() is undefined for values that do not fit
     * in an unsigned char and depends on the locale. Folding to upper case
     * matches the pure Ruby implementation and the upper-case entries of the
     * adjacency table. */
    for (size_t i = 0; i < len1; i++)
      if (codepoints1[i] >= 'a' && codepoints1[i] <= 'z')
        codepoints1[i] -= 'a' - 'A';
    for (size_t i = 0; i < len2; i++)
      if (codepoints2[i] >= 'a' && codepoints2[i] <= 'z')
        codepoints2[i] -= 'a' - 'A';
  }

  /* Characters match only when they are at most `window` positions apart. */
  size_t window = len2 / 2;
  window = window > 0 ? window - 1 : 0;

  const size_t flag_bytes = len1 + len2;
  char *flags = stack_flags;
  if (flag_bytes > sizeof(stack_flags)) {
    heap_flags = malloc(flag_bytes);
    if (!heap_flags)
      rb_memerror();
    flags = heap_flags;
  }
  memset(flags, 0, flag_bytes);
  char *short_codes_flag = flags;
  char *long_codes_flag = flags + len1;

  // count number of matching characters
  size_t match_count = 0;
  for (size_t i = 0; i < len1; i++) {
    size_t left = (i >= window) ? i - window : 0;
    size_t right = (i + window < len2) ? i + window : len2 - 1;
    for (size_t j = left; j <= right; j++) {
      if (!long_codes_flag[j] && codepoints1[i] == codepoints2[j]) {
        short_codes_flag[i] = long_codes_flag[j] = 1;
        match_count++;
        break;
      }
    }
  }

  if (!match_count) {
    free(heap_flags);
    return 0.0;
  }

  // count number of transpositions
  size_t transposition_count = 0, j = 0, k = 0;
  for (size_t i = 0; i < len1; i++) {
    if (short_codes_flag[i]) {
      /* Advance to the next matched character of the longer input. Both
       * inputs hold the same number of matched characters, so one is always
       * found. */
      for (j = k; j < len2; j++) {
        if (long_codes_flag[j]) {
          k = j + 1;
          break;
        }
      }
      if (codepoints1[i] != codepoints2[j])
        transposition_count++;
    }
  }

  // count similarities in nonmatched characters
  size_t similar_count = 0;
  if (opt->adj_table && len1 > match_count) {
    AdjMatrix *adj_matrix = adj_matrix_default();
    for (size_t i = 0; i < len1; i++) {
      if (short_codes_flag[i])
        continue;
      for (size_t n = 0; n < len2; n++) {
        if (long_codes_flag[n])
          continue;
        if (adj_matrix_find(adj_matrix, codepoints1[i], codepoints2[n])) {
          similar_count += 3;
          break;
        }
      }
    }
  }

  free(heap_flags);

  double m = (double)match_count;
  /* Integer division on purpose: half the out-of-order matches, rounded
   * down. */
  double t = (double)(transposition_count / 2);
  if (opt->adj_table)
    m = similar_count / 10.0 + m;
  return (m / len1 + m / len2 + (m - t) / m) / 3;
}
/*
 * Computes the Jaro-Winkler similarity of two code point arrays: the Jaro
 * similarity, boosted for a shared prefix of up to 4 code points when it
 * reaches opt->threshold.
 *
 * opt: weight (prefix scaling factor) and threshold are read here; the
 *   remaining options are consumed by jaro_distance_from_codes.
 */
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
                                        uint32_t *codepoints2, size_t len2,
                                        Options *opt) {
  double jaro_distance =
      jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);
  if (jaro_distance < opt->threshold)
    return jaro_distance;

  /* jaro_distance_from_codes only swaps its own copies of the arguments, so
   * either input may be the shorter one here. Bound the prefix scan by BOTH
   * lengths so it never reads past the end of the shorter array. */
  size_t max_prefix = len1 < len2 ? len1 : len2;
  if (max_prefix > 4)
    max_prefix = 4;

  size_t prefix = 0;
  while (prefix < max_prefix && codepoints1[prefix] == codepoints2[prefix])
    prefix++;

  return jaro_distance + prefix * opt->weight * (1 - jaro_distance);
}
jaro_winkler-1.7.0/ext/jaro_winkler/jaro.h 0000664 0000000 0000000 00000001026 15142757443 0020612 0 ustar 00root root 0000000 0000000 #pragma once
#include
#include
/* Tuning knobs for the similarity functions.
 * weight:      multiplier applied to the common-prefix bonus.
 * threshold:   Jaro score below which no prefix bonus is added.
 * ignore_case: non-zero to compare case-insensitively.
 * adj_table:   non-zero to credit similar-looking unmatched characters. */
typedef struct {
double weight, threshold;
char ignore_case, adj_table;
} Options;
/* Default option values; copy and override as needed. */
extern const Options DEFAULT_OPTIONS;
/* Jaro similarity of two code point arrays. */
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
uint32_t *codepoints2, size_t len2, Options *);
/* Jaro-Winkler similarity: the Jaro score plus a common-prefix bonus. */
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
uint32_t *codepoints2, size_t len2,
Options *);
jaro_winkler-1.7.0/ext/jaro_winkler/jaro_winkler.c 0000664 0000000 0000000 00000007236 15142757443 0022351 0 ustar 00root root 0000000 0000000 #include "codepoints.h"
#include "jaro.h"
#include "ruby.h"
/* Module and exception classes, assigned in Init_jaro_winkler_ext. */
VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;
/* Ruby-visible singleton methods of JaroWinkler (variadic, arity -1). */
VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self);
VALUE rb_jaro_winkler_similarity(int argc, VALUE *argv, VALUE self);
VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self);
VALUE rb_jaro_similarity(int argc, VALUE *argv, VALUE self);
/* Shared implementation: parses the Ruby arguments and applies distance_fn. */
VALUE distance(int argc, VALUE *argv, VALUE self,
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
uint32_t *codepoints2, size_t len2,
Options *));
/* Extension entry point: defines the JaroWinkler module, its error classes
 * and its four singleton methods. */
void Init_jaro_winkler_ext(void) {
/* Declare the extension Ractor-safe when the Ruby being built against
 * provides rb_ext_ractor_safe. */
#ifdef HAVE_RB_EXT_RACTOR_SAFE
rb_ext_ractor_safe(true);
#endif
rb_mJaroWinkler = rb_define_module("JaroWinkler");
/* JaroWinkler::Error < RuntimeError, JaroWinkler::InvalidWeightError < Error */
rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
rb_eInvalidWeightError =
rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
/* Arity -1: each method receives (argc, argv, self). */
rb_define_singleton_method(rb_mJaroWinkler, "distance",
rb_jaro_winkler_distance, -1);
rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
-1);
rb_define_singleton_method(rb_mJaroWinkler, "similarity",
rb_jaro_winkler_similarity, -1);
rb_define_singleton_method(rb_mJaroWinkler, "jaro_similarity", rb_jaro_similarity,
-1);
}
VALUE distance(int argc, VALUE *argv, VALUE self,
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
uint32_t *codepoints2, size_t len2,
Options *)) {
VALUE s1, s2, opt;
rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
Check_Type(s1, T_STRING);
Check_Type(s2, T_STRING);
Options c_opt = DEFAULT_OPTIONS;
if (TYPE(opt) == T_HASH) {
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
ignore_case = rb_hash_aref(opt, ID2SYM(rb_intern("ignore_case"))),
adj_table = rb_hash_aref(opt, ID2SYM(rb_intern("adj_table")));
if (!NIL_P(weight))
c_opt.weight = NUM2DBL(weight);
if (c_opt.weight > 0.25)
rb_raise(rb_eInvalidWeightError, "Scaling factor should not exceed 0.25, "
"otherwise the distance can become "
"larger than 1.");
if (!NIL_P(threshold))
c_opt.threshold = NUM2DBL(threshold);
if (!NIL_P(ignore_case))
c_opt.ignore_case =
(TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
if (!NIL_P(adj_table))
c_opt.adj_table =
(TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
}
CodePoints cp1, cp2;
codepoints_init(&cp1, s1);
codepoints_init(&cp2, s2);
VALUE ret = rb_float_new(
(*distance_fn)(cp1.data, cp1.length, cp2.data, cp2.length, &c_opt));
codepoints_free(&cp1);
codepoints_free(&cp2);
return ret;
}
/* JaroWinkler.jaro_distance: deprecated alias that warns and then delegates
 * to JaroWinkler.jaro_similarity. */
VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self) {
  VALUE similarity;

  rb_warn("JaroWinkler.jaro_distance is deprecated. Use JaroWinkler.jaro_similarity instead.");
  similarity = rb_jaro_similarity(argc, argv, self);
  return similarity;
}
/* JaroWinkler.jaro_similarity: plain Jaro similarity of two strings. */
VALUE rb_jaro_similarity(int argc, VALUE *argv, VALUE self) {
  VALUE similarity = distance(argc, argv, self, jaro_distance_from_codes);
  return similarity;
}
/* JaroWinkler.distance: deprecated alias that warns and then delegates to
 * JaroWinkler.similarity. */
VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self) {
  VALUE similarity;

  rb_warn("JaroWinkler.distance is deprecated. Use JaroWinkler.similarity instead.");
  similarity = rb_jaro_winkler_similarity(argc, argv, self);
  return similarity;
}
/* JaroWinkler.similarity: Jaro-Winkler similarity of two strings. */
VALUE rb_jaro_winkler_similarity(int argc, VALUE *argv, VALUE self) {
  VALUE similarity =
      distance(argc, argv, self, jaro_winkler_distance_from_codes);
  return similarity;
}
jaro_winkler-1.7.0/jaro_winkler.gemspec 0000664 0000000 0000000 00000003017 15142757443 0020255 0 ustar 00root root 0000000 0000000 # coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'jaro_winkler/version'
# Gem specification for the MRI build, which ships the C extension.
Gem::Specification.new do |spec|
  spec.name = 'jaro_winkler'
  spec.version = JaroWinkler::VERSION
  spec.authors = ['Jian Weihang']
  spec.email = 'tonytonyjan@gmail.com'
  spec.extensions = ['ext/jaro_winkler/extconf.rb']
  # The long texts are built from adjacent literals joined by a backslash
  # placed OUTSIDE the quotes. Inside a single-quoted string a backslash
  # before a newline is kept literally, which would put "\" and line breaks
  # into the published summary and description.
  spec.summary = 'An implementation of Jaro-Winkler distance algorithm written ' \
                 'in C extension which supports any kind of string encoding.'
  spec.description = 'jaro_winkler is an implementation of Jaro-Winkler ' \
                     'distance algorithm which is written in C extension and will fallback to pure ' \
                     'Ruby version in platforms other than MRI/KRI like JRuby or Rubinius. Both of ' \
                     'C and Ruby implementation support any kind of string encoding, such as ' \
                     'UTF-8, EUC-JP, Big5, etc.'
  spec.homepage = 'https://github.com/tonytonyjan/jaro_winkler'
  spec.license = 'MIT'
  spec.metadata = {
    'bug_tracker_uri' => 'https://github.com/tonytonyjan/jaro_winkler/issues',
    'changelog_uri' => "https://github.com/tonytonyjan/jaro_winkler/blob/v#{spec.version}/CHANGELOG.md",
    'documentation_uri' => "https://www.rubydoc.info/gems/jaro_winkler/#{spec.version}",
    'source_code_uri' => "https://github.com/tonytonyjan/jaro_winkler/tree/v#{spec.version}",
  }
  spec.files = Dir['lib/**/*.rb', 'ext/**/*.{h,c}', 'LICENSE.txt']
  spec.add_development_dependency 'rake', '~> 13.0'
  spec.add_development_dependency 'rake-compiler'
  spec.add_development_dependency 'minitest'
end
jaro_winkler-1.7.0/jaro_winkler.java.gemspec 0000664 0000000 0000000 00000003046 15142757443 0021177 0 ustar 00root root 0000000 0000000 # frozen_string_literal: true
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'jaro_winkler/version'
# Gem specification for the `java` platform build, which packages only the
# Ruby sources (no extension).
Gem::Specification.new do |spec|
  spec.name = 'jaro_winkler'
  spec.version = JaroWinkler::VERSION
  spec.authors = ['Jian Weihang']
  spec.email = 'tonytonyjan@gmail.com'
  # The long texts are built from adjacent literals joined by a backslash
  # placed OUTSIDE the quotes. Inside a single-quoted string a backslash
  # before a newline is kept literally, which would put "\" and line breaks
  # into the published summary and description.
  spec.summary = 'An implementation of Jaro-Winkler distance algorithm written ' \
                 'in C extension which supports any kind of string encoding.'
  spec.description = 'jaro_winkler is an implementation of Jaro-Winkler ' \
                     'distance algorithm which is written in C extension and will fallback to pure ' \
                     'Ruby version in platforms other than MRI/KRI like JRuby or Rubinius. Both of ' \
                     'C and Ruby implementation support any kind of string encoding, such as ' \
                     'UTF-8, EUC-JP, Big5, etc.'
  spec.homepage = 'https://github.com/tonytonyjan/jaro_winkler'
  spec.license = 'MIT'
  spec.metadata = {
    'bug_tracker_uri' => 'https://github.com/tonytonyjan/jaro_winkler/issues',
    'changelog_uri' => "https://github.com/tonytonyjan/jaro_winkler/blob/v#{spec.version}/CHANGELOG.md",
    'documentation_uri' => "https://www.rubydoc.info/gems/jaro_winkler/#{spec.version}",
    'source_code_uri' => "https://github.com/tonytonyjan/jaro_winkler/tree/v#{spec.version}",
  }
  spec.files = Dir['lib/**/*.rb', 'LICENSE.txt']
  spec.add_development_dependency 'bundler', '~> 1.7'
  spec.add_development_dependency 'rake', '~> 12.0'
  spec.add_development_dependency 'rake-compiler'
  spec.add_development_dependency 'minitest'
  spec.platform = 'java'
end
jaro_winkler-1.7.0/lib/ 0000775 0000000 0000000 00000000000 15142757443 0014767 5 ustar 00root root 0000000 0000000 jaro_winkler-1.7.0/lib/jaro_winkler.rb 0000664 0000000 0000000 00000000601 15142757443 0017777 0 ustar 00root root 0000000 0000000 # frozen_string_literal: true
require 'jaro_winkler/version'
# Prefer the C extension on MRI. Every other engine, and an MRI install where
# the extension can't be loaded for any reason (e.g. it was never built),
# uses the pure Ruby implementation instead.
extension_loaded =
  if RUBY_ENGINE == 'ruby'
    begin
      require 'jaro_winkler/jaro_winkler_ext'
      true
    rescue LoadError
      false
    end
  end
require 'jaro_winkler/jaro_winkler_pure' unless extension_loaded
jaro_winkler-1.7.0/lib/jaro_winkler/ 0000775 0000000 0000000 00000000000 15142757443 0017455 5 ustar 00root root 0000000 0000000 jaro_winkler-1.7.0/lib/jaro_winkler/adjusting_table.rb 0000664 0000000 0000000 00000001134 15142757443 0023140 0 ustar 00root root 0000000 0000000 # frozen_string_literal: true
module JaroWinkler
  # Symmetric lookup table of characters that are easily mistaken for one
  # another: DEFAULT_ADJ_TABLE[a][b] is +true+ when +a+ and +b+ form such a
  # pair and +nil+ otherwise.
  #
  # Any two keys can be chained safely. An unknown first key yields a shared
  # frozen empty hash instead of creating a new entry, so lookups never grow
  # the table and a miss is never truthy.
  adjacent_pairs = [
    %w[A E], %w[A I], %w[A O], %w[A U], %w[B V], %w[E I], %w[E O], %w[E U], %w[I O],
    %w[I U], %w[O U], %w[I Y], %w[E Y], %w[C G], %w[E F], %w[W U], %w[W V], %w[X K],
    %w[S Z], %w[X S], %w[Q C], %w[U V], %w[M N], %w[L I], %w[Q O], %w[P R], %w[I J],
    %w[2 Z], %w[5 S], %w[8 B], %w[1 I], %w[1 L], %w[0 O], %w[0 Q], %w[C K], %w[G J],
    ['E', ' '], ['Y', ' '], ['S', ' ']
  ]

  adjacency = {}
  adjacent_pairs.each do |s1, s2|
    (adjacency[s1] ||= {})[s2] = true
    (adjacency[s2] ||= {})[s1] = true
  end
  adjacency.each_value(&:freeze)
  # Set the default only after the table is built: while building, `||=`
  # relies on missing keys being nil.
  adjacency.default = {}.freeze

  DEFAULT_ADJ_TABLE = adjacency.freeze
end
jaro_winkler-1.7.0/lib/jaro_winkler/jaro_winkler_pure.rb 0000664 0000000 0000000 00000010016 15142757443 0023521 0 ustar 00root root 0000000 0000000 # frozen_string_literal: true
require 'jaro_winkler/adjusting_table'
# Pure Ruby implementation of the Jaro and Jaro-Winkler similarity measures.
module JaroWinkler
  # Base class of the errors raised by this library.
  class Error < RuntimeError; end
  # Raised when the +weight+ option exceeds 0.25.
  class InvalidWeightError < Error; end

  DEFAULT_WEIGHT = 0.1
  DEFAULT_THRESHOLD = 0.7
  DEFAULT_OPTIONS = {
    jaro: { adj_table: false, ignore_case: false },
    jaro_winkler: { weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD }
  }.freeze

  class << self
    # @deprecated Use {similarity} instead.
    def distance(str1, str2, options = {})
      warn("JaroWinkler.distance is deprecated. Use JaroWinkler.similarity instead.")
      similarity(str1, str2, options)
    end

    # Jaro-Winkler similarity of two strings.
    #
    # @param str1 [String]
    # @param str2 [String]
    # @param options [Hash] :weight, :threshold, :ignore_case, :adj_table;
    #   a +nil+ value means "use the default"
    # @return [Float] a value between 0.0 and 1.0
    # @raise [TypeError] unless both arguments are strings
    # @raise [InvalidWeightError] when :weight exceeds 0.25
    def similarity(str1, str2, options = {})
      validate!(str1, str2)
      _distance str1.codepoints.to_a, str2.codepoints.to_a, options
    end

    # @deprecated Use {jaro_similarity} instead.
    def jaro_distance(str1, str2, options = {})
      warn("JaroWinkler.jaro_distance is deprecated. Use JaroWinkler.jaro_similarity instead.")
      jaro_similarity(str1, str2, options)
    end

    # Jaro similarity of two strings.
    #
    # @param str1 [String]
    # @param str2 [String]
    # @param options [Hash] :ignore_case, :adj_table
    # @return [Float] a value between 0.0 and 1.0
    # @raise [TypeError] unless both arguments are strings
    def jaro_similarity(str1, str2, options = {})
      validate!(str1, str2)
      _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
    end

    private

    # Jaro-Winkler on code point arrays: the Jaro score plus a bonus for a
    # common prefix of up to 4 code points once the score reaches the
    # threshold.
    def _distance(codes1, codes2, options = {})
      # nil option values are treated as "not given", like the C extension.
      options = DEFAULT_OPTIONS[:jaro_winkler].merge(options.compact)
      if options[:weight] > 0.25
        raise InvalidWeightError, 'Scaling factor should not exceed 0.25, ' \
                                  'otherwise the distance can become larger than 1.'
      end

      jaro_distance = _jaro_distance(codes1, codes2, options)
      return jaro_distance if jaro_distance < options[:threshold]

      # _jaro_distance folds case in place when asked to, so the prefix is
      # compared on the folded code points.
      codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
      max_4 = [codes1.length, 4].min
      prefix = 0
      prefix += 1 while prefix < max_4 && codes1[prefix] == codes2[prefix]
      jaro_distance + prefix * options[:weight] * (1 - jaro_distance)
    end

    # Jaro similarity on code point arrays. With :ignore_case both arrays are
    # modified in place (ASCII lower-case letters become upper-case).
    def _jaro_distance(codes1, codes2, options = {})
      options = DEFAULT_OPTIONS[:jaro].merge options
      # From here on, codes1 is always the shorter input.
      codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
      len1 = codes1.length
      len2 = codes2.length
      return 0.0 if len1 == 0 || len2 == 0

      if options[:ignore_case]
        codes1.map! { |c| c.between?(97, 122) ? c - 32 : c }
        codes2.map! { |c| c.between?(97, 122) ? c - 32 : c }
      end

      # Characters match only when they are at most `window` positions apart.
      window = len2 / 2 - 1
      window = 0 if window < 0
      # Integers used as bit sets: bit n is set once position n is matched.
      flags1 = 0
      flags2 = 0

      # count number of matching characters
      match_count = 0
      i = 0
      while i < len1
        left = i >= window ? i - window : 0
        right = i + window <= len2 - 1 ? (i + window) : (len2 - 1)
        j = left
        while j <= right
          if flags2[j] == 0 && codes1[i] == codes2[j]
            flags1 |= (1 << i)
            flags2 |= (1 << j)
            match_count += 1
            break
          end
          j += 1
        end
        i += 1
      end
      return 0.0 if match_count == 0

      # count number of transpositions
      transposition_count = j = k = 0
      i = 0
      while i < len1
        if flags1[i] == 1
          j = k
          while j < len2
            if flags2[j] == 1
              k = j + 1
              break
            end
            j += 1
          end
          transposition_count += 1 if codes1[i] != codes2[j]
        end
        i += 1
      end

      # count similarities in nonmatched characters
      similar_count = 0
      if options[:adj_table] && len1 > match_count
        i = 0
        while i < len1
          if flags1[i] == 0
            j = 0
            while j < len2
              if flags2[j] == 0 && adjacent?(codes1[i], codes2[j])
                similar_count += 3
                break
              end
              j += 1
            end
          end
          i += 1
        end
      end

      m = match_count.to_f
      # Integer division on purpose: half the out-of-order matches, rounded down.
      t = transposition_count / 2
      m = similar_count / 10.0 + m if options[:adj_table]
      (m / len1 + m / len2 + (m - t) / m) / 3
    end

    # True when the two code points form a pair listed in DEFAULT_ADJ_TABLE.
    #
    # Only ASCII code points can be listed, so everything else is rejected up
    # front; that also avoids Integer#chr raising RangeError for code points
    # of non-UTF-8 encodings. Hash#fetch is used so that a table with a
    # default proc is neither grown by the lookup nor able to answer with a
    # truthy placeholder.
    def adjacent?(code1, code2)
      return false if code1 > 0x7f || code2 > 0x7f

      row = DEFAULT_ADJ_TABLE.fetch(code1.chr(Encoding::UTF_8), nil)
      !row.nil? && row.fetch(code2.chr(Encoding::UTF_8), nil) == true
    end

    def validate!(str1, str2)
      raise TypeError unless str1.is_a?(String) && str2.is_a?(String)
    end
  end
end
jaro_winkler-1.7.0/lib/jaro_winkler/version.rb 0000664 0000000 0000000 00000000112 15142757443 0021461 0 ustar 00root root 0000000 0000000 # frozen_string_literal: true
module JaroWinkler
# Gem version; both gemspecs read it for spec.version.
VERSION = '1.7.0'
end
jaro_winkler-1.7.0/test/ 0000775 0000000 0000000 00000000000 15142757443 0015200 5 ustar 00root root 0000000 0000000 jaro_winkler-1.7.0/test/test_jaro_winkler.rb 0000664 0000000 0000000 00000000230 15142757443 0021245 0 ustar 00root root 0000000 0000000 require 'minitest/autorun'
require_relative 'tests'
require 'jaro_winkler/jaro_winkler_ext'
# Runs the shared Tests suite; this file requires the C extension
# (jaro_winkler/jaro_winkler_ext) beforehand.
class TestJaroWinkler < Minitest::Test
include Tests
end
jaro_winkler-1.7.0/test/test_pure_ruby.rb 0000664 0000000 0000000 00000000231 15142757443 0020574 0 ustar 00root root 0000000 0000000 require 'minitest/autorun'
require_relative 'tests'
require 'jaro_winkler/jaro_winkler_pure'
# Runs the shared Tests suite; this file requires the pure Ruby
# implementation (jaro_winkler/jaro_winkler_pure) beforehand.
class TestJaroWinkler < Minitest::Test
include Tests
end
jaro_winkler-1.7.0/test/tests.rb 0000664 0000000 0000000 00000013023 15142757443 0016666 0 ustar 00root root 0000000 0000000 # encoding: utf-8
# Test cases shared by the C extension and pure Ruby test files. Include it
# in a Minitest::Test subclass after loading the implementation under test.
module Tests
  def test_similarity
    assert_similarity 0.9667, 'henka', 'henkan'
    assert_similarity 1.0, 'al', 'al'
    assert_similarity 0.9611, 'martha', 'marhta'
    assert_similarity 0.8324, 'jones', 'johnson'
    assert_similarity 0.9583, 'abcvwxyz', 'cabvwxyz'
    assert_similarity 0.84, 'dwayne', 'duane'
    assert_similarity 0.8133, 'dixon', 'dicksonx'
    assert_similarity 0.0, 'fvie', 'ten'
    assert_similarity 1.0, 'tony', 'tony'
    assert_similarity 1.0, 'tonytonyjan', 'tonytonyjan'
    assert_similarity 1.0, 'x', 'x'
    assert_similarity 0.0, '', ''
    assert_similarity 0.0, 'tony', ''
    assert_similarity 0.0, '', 'tony'
    assert_similarity 0.8727, 'tonytonyjan', 'tony'
    assert_similarity 0.8727, 'tony', 'tonytonyjan'
    assert_similarity 0.9407, 'necessary', 'nessecary'
    assert_similarity 0.9067, 'does_exist', 'doesnt_exist'
    assert_similarity 0.975, '12345678', '12345687'
    assert_similarity 0.975, '12345678', '12345867'
    assert_similarity 0.95, '12345678', '12348567'
  end

  def test_jaro_similarity
    assert_jaro_similarity 0.9444, 'henka', 'henkan'
    assert_jaro_similarity 1.0, 'al', 'al'
    assert_jaro_similarity 0.9444, 'martha', 'marhta'
    assert_jaro_similarity 0.7905, 'jones', 'johnson'
    assert_jaro_similarity 0.9583, 'abcvwxyz', 'cabvwxyz'
    assert_jaro_similarity 0.8222, 'dwayne', 'duane'
    assert_jaro_similarity 0.7667, 'dixon', 'dicksonx'
    assert_jaro_similarity 0.0, 'fvie', 'ten'
    assert_jaro_similarity 1.0, 'tony', 'tony'
    assert_jaro_similarity 1.0, 'tonytonyjan', 'tonytonyjan'
    assert_jaro_similarity 1.0, 'x', 'x'
    assert_jaro_similarity 0.0, '', ''
    assert_jaro_similarity 0.0, 'tony', ''
    assert_jaro_similarity 0.0, '', 'tony'
    assert_jaro_similarity 0.7879, 'tonytonyjan', 'tony'
    assert_jaro_similarity 0.7879, 'tony', 'tonytonyjan'
    assert_jaro_similarity 0.9259, 'necessary', 'nessecary'
    assert_jaro_similarity 0.8444, 'does_exist', 'doesnt_exist'
    assert_jaro_similarity 0.9583, '12345678', '12345687'
    assert_jaro_similarity 0.9583, '12345678', '12345867'
    assert_jaro_similarity 0.9167, '12345678', '12348567'
    assert_jaro_similarity 0.604, 'tonytonyjan', 'janjantony'
  end

  # The deprecated aliases must keep returning the same scores.
  def test_distance
    assert_distance 0.9667, 'henka', 'henkan'
  end

  def test_jaro_distance
    assert_jaro_distance 0.9444, 'henka', 'henkan'
  end

  def test_unicode
    assert_similarity 0.9818, '變形金剛4:絕跡重生', '變形金剛4: 絕跡重生'
    assert_similarity 0.8222, '連勝文', '連勝丼'
    assert_similarity 0.8222, '馬英九', '馬英丸'
    assert_similarity 0.6667, '良い', 'いい'
  end

  def test_ignore_case
    assert_similarity 0.9611, 'MARTHA', 'marhta', ignore_case: true
  end

  def test_weight
    assert_similarity 0.9778, 'MARTHA', 'MARHTA', weight: 0.2
  end

  def test_threshold
    assert_similarity 0.9444, 'MARTHA', 'MARHTA', threshold: 0.99
  end

  def test_adjusting_table
    assert_similarity 0.9667, 'HENKA', 'HENKAN', adj_table: true
    assert_similarity 1.0, 'AL', 'AL', adj_table: true
    assert_similarity 0.9611, 'MARTHA', 'MARHTA', adj_table: true
    assert_similarity 0.8598, 'JONES', 'JOHNSON', adj_table: true
    assert_similarity 0.9583, 'ABCVWXYZ', 'CABVWXYZ', adj_table: true
    assert_similarity 0.8730, 'DWAYNE', 'DUANE', adj_table: true
    assert_similarity 0.8393, 'DIXON', 'DICKSONX', adj_table: true
    assert_similarity 0.0, 'FVIE', 'TEN', adj_table: true
  end

  def test_error
    assert_raises JaroWinkler::InvalidWeightError do
      JaroWinkler.similarity 'MARTHA', 'MARHTA', weight: 0.26
    end
  end

  # Only checks that long inputs are processed without raising.
  def test_long_string
    JaroWinkler.similarity 'haisai' * 20, 'haisai' * 20
  end

  def test_encoding
    assert_encoding '焦玟綾', '焦紋綾', Encoding::Big5
    assert_encoding '簡煒航', '簡偉航', Encoding::Big5_HKSCS
    assert_encoding '西島之', '西鳥志', Encoding::EUCJP
    assert_encoding '松本行弘', '枩本行弘', Encoding::Shift_JIS
    assert_similarity 1.0, "\xe8".force_encoding('iso8859-1'), 'è'
  end

  def test_raises_type_error
    assert_raises(TypeError){ JaroWinkler.similarity 'MARTHA', nil }
    assert_raises(TypeError){ JaroWinkler.similarity nil, 'MARTHA' }
    assert_raises(TypeError){ JaroWinkler.similarity nil, nil }
    assert_raises(TypeError){ JaroWinkler.similarity 'MARTHA', :non_string }
    assert_raises(TypeError){ JaroWinkler.similarity :non_string, 'MARTHA' }
    assert_raises(TypeError){ JaroWinkler.similarity :non_string, :non_string }
  end

  private

  def assert_distance score, str1, str2, **options
    assert_in_delta score, JaroWinkler.distance(str1, str2, **options)
  end

  # Asserts that re-encoding both strings does not change their similarity.
  # The reference score comes from the non-deprecated API, so running this
  # helper emits no deprecation warning.
  def assert_encoding str1, str2, encoding, **options
    expected = JaroWinkler.similarity(str1, str2, **options)
    assert_similarity expected, str1.encode(encoding), str2.encode(encoding), **options
  end

  def assert_jaro_distance score, str1, str2, **options
    assert_in_delta score, JaroWinkler.jaro_distance(str1, str2, **options)
  end

  def assert_similarity score, str1, str2, **options
    assert_in_delta score, JaroWinkler.similarity(str1, str2, **options)
  end

  def assert_jaro_similarity score, str1, str2, **options
    assert_in_delta score, JaroWinkler.jaro_similarity(str1, str2, **options)
  end
end