text-metrics-0.3.3/0000755000000000000000000000000007346545000012336 5ustar0000000000000000text-metrics-0.3.3/CHANGELOG.md0000644000000000000000000000163407346545000014153 0ustar0000000000000000## Text Metrics 0.3.3 * Slightly optimized the `levenshtein` and `levenshteinNorm` functions. [PR 50](https://github.com/mrkkrp/text-metrics/pull/50). ## Text Metrics 0.3.2 * Works with `text-2.0`. ## Text Metrics 0.3.1 * Fixed a bug in the implementation of Jaro-Winkler distance when two strings share a long prefix. [PR 21](https://github.com/mrkkrp/text-metrics/pull/21). * Dropped support for GHC 8.6 and older. ## Text Metrics 0.3.0 * All functions are now implemented in pure Haskell. * All functions return `Int` or `Ratio Int` instead of `Natural` and `Ratio Natural`. * Added `overlap` (returns overlap coefficient) and `jaccard` (returns Jaccard similarity coefficient). ## Text Metrics 0.2.0 * Made the `levenshtein`, `levenshteinNorm`, `damerauLevenshtein`, and `demerauLevenshtein` more efficient. * Added `jaro` and `jaroWinkler` functions. ## Text Metrics 0.1.0 * Initial release. text-metrics-0.3.3/Data/Text/0000755000000000000000000000000007346545000014133 5ustar0000000000000000text-metrics-0.3.3/Data/Text/Metrics.hs0000644000000000000000000003121107346545000016073 0ustar0000000000000000{-# LANGUAGE BangPatterns #-} {-# LANGUAGE MultiWayIf #-} -- | -- Module : Data.Text.Metrics -- Copyright : © 2016–present Mark Karpov -- License : BSD 3 clause -- -- Maintainer : Mark Karpov -- Stability : experimental -- Portability : portable -- -- The module provides efficient implementations of various strings metric -- algorithms. It works with strict 'Text' values. -- -- __Note__: before version /0.3.0/ the package used C implementations of -- the algorithms under the hood. Beginning from version /0.3.0/, the -- implementations are written in Haskell while staying almost as fast, see: -- -- module Data.Text.Metrics ( -- * Levenshtein variants levenshtein, levenshteinNorm, damerauLevenshtein, damerauLevenshteinNorm, -- * Treating inputs like sets overlap, jaccard, -- * Other hamming, jaro, jaroWinkler, ) where import Control.Monad import Control.Monad.ST import Data.Map.Strict (Map) import Data.Map.Strict qualified as M import Data.Primitive qualified as P import Data.Ratio import Data.Text import Data.Text qualified as T import Data.Text.Internal qualified as T import Data.Text.Unsafe qualified as TU import Data.Vector.Unboxed.Mutable qualified as VUM import GHC.Exts (inline) ---------------------------------------------------------------------------- -- Levenshtein variants -- | Return the Levenshtein distance between two 'Text' values. The -- Levenshtein distance between two strings is the minimal number of -- operations necessary to transform one string into another. For the -- Levenshtein distance allowed operations are: deletion, insertion, and -- substitution. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned -- 'Data.Numeric.Natural'. levenshtein :: Text -> Text -> Int levenshtein a b = fst (levenshtein_ a b) -- | Return the normalized Levenshtein distance between two 'Text' values. -- Result is a non-negative rational number (represented as @'Ratio' -- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the -- strings, while 1 means exact match. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned @'Ratio' -- 'Data.Numeric.Natural'@. levenshteinNorm :: Text -> Text -> Ratio Int levenshteinNorm = norm levenshtein_ -- | An internal helper, returns the Levenshtein distance as the first -- element of the tuple and max length of the two inputs as the second -- element of the tuple. levenshtein_ :: Text -> Text -> (Int, Int) levenshtein_ a b | T.null a = (lenb, lenm) | T.null b = (lena, lenm) | otherwise = runST $ do let v_len = lenb + 1 v <- P.newPrimArray (v_len * 2) let gov !i = when (i < v_len) $ do P.writePrimArray v i i gov (i + 1) goi !i !na !v0 !v1 = do let !(TU.Iter ai da) = TU.iter a na goj !j !nb = when (j < lenb) $ do let !(TU.Iter bj db) = TU.iter b nb if ai == bj then do z <- P.readPrimArray v (v0 + j) P.writePrimArray v (v1 + j + 1) z goj (j + 1) (nb + db) else do x <- P.readPrimArray v (v1 + j) y <- P.readPrimArray v (v0 + j + 1) z <- P.readPrimArray v (v0 + j) P.writePrimArray v (v1 + j + 1) (1 + (min x (min y z))) goj (j + 1) (nb + db) when (i < lena) $ do P.writePrimArray v v1 (i + 1) goj 0 0 goi (i + 1) (na + da) v1 v0 gov 0 goi 0 0 0 v_len ld <- P.readPrimArray v (lenb + if even lena then 0 else v_len) return (ld, lenm) where lena = T.length a lenb = T.length b lenm = max lena lenb {-# INLINE levenshtein_ #-} -- | Return the Damerau-Levenshtein distance between two 'Text' values. The -- function works like 'levenshtein', but the collection of allowed -- operations also includes transposition of two /adjacent/ characters. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned -- 'Data.Numeric.Natural'. damerauLevenshtein :: Text -> Text -> Int damerauLevenshtein a b = fst (damerauLevenshtein_ a b) -- | Return the normalized Damerau-Levenshtein distance between two 'Text' -- values. 0 signifies no similarity between the strings, while 1 means -- exact match. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned @'Ratio' -- 'Data.Numeric.Natural'@. damerauLevenshteinNorm :: Text -> Text -> Ratio Int damerauLevenshteinNorm = norm damerauLevenshtein_ -- | An internal helper, returns the Damerau-Levenshtein distance as the -- first element of the tuple and max length of the two inputs as the second -- element of the tuple. damerauLevenshtein_ :: Text -> Text -> (Int, Int) damerauLevenshtein_ a b | T.null a = (lenb, lenm) | T.null b = (lena, lenm) | otherwise = runST $ do let v_len = lenb + 1 v <- VUM.unsafeNew (v_len * 3) let gov !i = when (i < v_len) $ do VUM.unsafeWrite v i i gov (i + 1) goi !i !na !ai_1 !v0 !v1 !v2 = do let !(TU.Iter ai da) = TU.iter a na goj !j !nb !bj_1 = when (j < lenb) $ do let !(TU.Iter bj db) = TU.iter b nb cost = if ai == bj then 0 else 1 x <- (+ 1) <$> VUM.unsafeRead v (v1 + j) y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1) z <- (+ cost) <$> VUM.unsafeRead v (v0 + j) let g = min x (min y z) val <- (+ cost) <$> VUM.unsafeRead v (v2 + j - 1) VUM.unsafeWrite v (v1 + j + 1) $ if i > 0 && j > 0 && ai == bj_1 && ai_1 == bj && val < g then val else g goj (j + 1) (nb + db) bj when (i < lena) $ do VUM.unsafeWrite v v1 (i + 1) goj 0 0 'a' goi (i + 1) (na + da) ai v1 v2 v0 gov 0 goi 0 0 'a' 0 v_len (v_len * 2) ld <- VUM.unsafeRead v (lenb + (lena `mod` 3) * v_len) return (ld, lenm) where lena = T.length a lenb = T.length b lenm = max lena lenb {-# INLINE damerauLevenshtein_ #-} ---------------------------------------------------------------------------- -- Treating inputs like sets -- | Return the overlap coefficient for two 'Text' values. Returned value is -- in the range from 0 (no similarity) to 1 (exact match). Return 1 if both -- 'Text' values are empty. -- -- See also: . -- -- @since 0.3.0 overlap :: Text -> Text -> Ratio Int overlap a b = if d == 0 then 1 % 1 else intersectionSize (mkTextMap a) (mkTextMap b) % d where d = min (T.length a) (T.length b) -- | Return the Jaccard similarity coefficient for two 'Text' values. -- Returned value is in the range from 0 (no similarity) to 1 (exact match). -- Return 1 if both -- -- See also: -- -- @since 0.3.0 jaccard :: Text -> Text -> Ratio Int jaccard a b = if d == 0 then 1 % 1 else intersectionSize ma mb % d where ma = mkTextMap a mb = mkTextMap b d = unionSize ma mb -- | Make a map from 'Char' to 'Int' representing how many times the 'Char' -- appears in the input 'Text'. mkTextMap :: Text -> Map Char Int mkTextMap = T.foldl' f M.empty where f m ch = M.insertWith (+) ch 1 m {-# INLINE mkTextMap #-} -- | Return intersection size between two 'Text'-maps. intersectionSize :: Map Char Int -> Map Char Int -> Int intersectionSize a b = M.foldl' (+) 0 (M.intersectionWith min a b) {-# INLINE intersectionSize #-} -- | Return union size between two 'Text'-maps. unionSize :: Map Char Int -> Map Char Int -> Int unionSize a b = M.foldl' (+) 0 (M.unionWith max a b) {-# INLINE unionSize #-} ---------------------------------------------------------------------------- -- Other -- | /O(n)/ Return the Hamming distance between two 'Text' values. Hamming -- distance is defined as the number of positions at which the corresponding -- symbols are different. The input 'Text' values should be of equal length -- or 'Nothing' will be returned. -- -- See also: . -- -- __Heads up__, before version /0.3.0/ this function returned @'Maybe' -- 'Data.Numeric.Natural'@. hamming :: Text -> Text -> Maybe Int hamming a@(T.Text _ _ len) b = if T.length a == T.length b then Just (go 0 0 0) else Nothing where go !na !nb !r = let !(TU.Iter cha da) = TU.iter a na !(TU.Iter chb db) = TU.iter b nb in if | na == len -> r | cha /= chb -> go (na + da) (nb + db) (r + 1) | otherwise -> go (na + da) (nb + db) r -- | Return the Jaro distance between two 'Text' values. Returned value is -- in the range from 0 (no similarity) to 1 (exact match). -- -- While the algorithm is pretty clear for artificial examples (like those -- from the linked Wikipedia article), for /arbitrary/ strings, it may be -- hard to decide which of two strings should be considered as one having -- “reference” order of characters (order of matching characters in an -- essential part of the definition of the algorithm). This makes us -- consider the first string the “reference” string (with correct order of -- characters). Thus generally, -- -- > jaro a b ≠ jaro b a -- -- This asymmetry can be found in all implementations of the algorithm on -- the internet, AFAIK. -- -- See also: -- -- @since 0.2.0 -- -- __Heads up__, before version /0.3.0/ this function returned @'Ratio' -- 'Data.Numeric.Natural'@. jaro :: Text -> Text -> Ratio Int jaro a b = if T.null a || T.null b then 0 % 1 else runST $ do let lena = T.length a lenb = T.length b d = if lena >= 2 && lenb >= 2 then max lena lenb `quot` 2 - 1 else 0 v <- VUM.replicate lenb (0 :: Int) r <- VUM.replicate 3 (0 :: Int) -- tj, m, t let goi !i !na !fromb = do let !(TU.Iter ai da) = TU.iter a na (from, fromb') = if i >= d then (i - d, fromb + TU.iter_ b fromb) else (0, 0) to = min (i + d + 1) lenb goj !j !nb = when (j < to) $ do let !(TU.Iter bj db) = TU.iter b nb used <- (== 1) <$> VUM.unsafeRead v j if not used && ai == bj then do tj <- VUM.unsafeRead r 0 if j < tj then VUM.unsafeModify r (+ 1) 2 else VUM.unsafeWrite r 0 j VUM.unsafeWrite v j 1 VUM.unsafeModify r (+ 1) 1 else goj (j + 1) (nb + db) when (i < lena) $ do goj from fromb goi (i + 1) (na + da) fromb' goi 0 0 0 m <- VUM.unsafeRead r 1 t <- VUM.unsafeRead r 2 return $ if m == 0 then 0 % 1 else ( (m % lena) + (m % lenb) + ((m - t) % m) ) / 3 -- | Return the Jaro-Winkler distance between two 'Text' values. Returned -- value is in range from 0 (no similarity) to 1 (exact match). -- -- See also: -- -- @since 0.2.0 -- -- __Heads up__, before version /0.3.0/ this function returned @'Ratio' -- 'Data.Numeric.Natural'@. jaroWinkler :: Text -> Text -> Ratio Int jaroWinkler a b = dj + (1 % 10) * l * (1 - dj) where dj = inline (jaro a b) l = fromIntegral (min 4 (commonPrefix a b)) -- | Return the length of the common prefix two 'Text' values have. commonPrefix :: Text -> Text -> Int commonPrefix a b = case T.commonPrefixes a b of Nothing -> 0 Just (pref, _, _) -> T.length pref {-# INLINE commonPrefix #-} ---------------------------------------------------------------------------- -- Helpers norm :: (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int norm f a b = let (r, l) = f a b in if r == 0 then 1 % 1 else 1 % 1 - r % l {-# INLINE norm #-} text-metrics-0.3.3/LICENSE.md0000644000000000000000000000265607346545000013753 0ustar0000000000000000Copyright © 2016–present Mark Karpov All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name Mark Karpov nor the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. text-metrics-0.3.3/README.md0000644000000000000000000000521107346545000013614 0ustar0000000000000000# Text Metrics [![License BSD3](https://img.shields.io/badge/license-BSD3-brightgreen.svg)](http://opensource.org/licenses/BSD-3-Clause) [![Hackage](https://img.shields.io/hackage/v/text-metrics.svg?style=flat)](https://hackage.haskell.org/package/text-metrics) [![Stackage Nightly](http://stackage.org/package/text-metrics/badge/nightly)](http://stackage.org/nightly/package/text-metrics) [![Stackage LTS](http://stackage.org/package/text-metrics/badge/lts)](http://stackage.org/lts/package/text-metrics) ![CI](https://github.com/mrkkrp/text-metrics/workflows/CI/badge.svg?branch=master) The library provides efficient implementations of various strings metric algorithms. It works with strict `Text` values. The current version of the package implements: * [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) * [Normalized Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) * [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) * [Normalized Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) * [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) * [Jaro distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) * [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) * [Overlap coefficient](https://en.wikipedia.org/wiki/Overlap_coefficient) * [Jaccard similarity coefficient](https://en.wikipedia.org/wiki/Jaccard_index) ## Comparison with the `edit-distance` package There is [`edit-distance`](https://hackage.haskell.org/package/edit-distance) package whose scope overlaps with the scope of this package. The differences are: * `edit-distance` allows to specify costs for every operation when calculating Levenshtein distance (insertion, deletion, substitution, and transposition). This is rarely needed though in real-world applications, IMO. * `edit-distance` only provides Levenshtein distance, `text-metrics` aims to provide implementations of most string metrics algorithms. * `edit-distance` works on `Strings`, while `text-metrics` works on strict `Text` values. ## Implementation Although we originally used C for speed, currently all functions are pure Haskell tuned for performance. See [this blog post](https://markkarpov.com/post/migrating-text-metrics.html) for more info. ## Contribution Issues, bugs, and questions may be reported in [the GitHub issue tracker for this project](https://github.com/mrkkrp/text-metrics/issues). Pull requests are also welcome. ## License Copyright © 2016–present Mark Karpov Distributed under BSD 3 clause license. text-metrics-0.3.3/bench/memory/0000755000000000000000000000000007346545000014725 5ustar0000000000000000text-metrics-0.3.3/bench/memory/Main.hs0000644000000000000000000000214307346545000016145 0ustar0000000000000000module Main (main) where import Control.DeepSeq import Control.Monad import Data.Text (Text) import Data.Text qualified as T import Data.Text.Metrics import Weigh main :: IO () main = mainWith $ do setColumns [Case, Allocated, GCs, Max] bmetric "levenshtein" levenshtein bmetric "levenshteinNorm" levenshteinNorm bmetric "damerauLevenshtein" damerauLevenshtein bmetric "damerauLevenshteinNorm" damerauLevenshteinNorm bmetric "overlap" overlap bmetric "jaccard" jaccard bmetric "hamming" hamming bmetric "jaro" jaro bmetric "jaroWinkler" jaroWinkler -- | Perform a series to measurements with the same metric function. bmetric :: (NFData a) => -- | Name of the benchmark group String -> -- | The function to benchmark (Text -> Text -> a) -> Weigh () bmetric name f = forM_ stdSeries $ \n -> func (name ++ "/" ++ show n) (uncurry f) (testData n, testData n) -- | The series of lengths to try with every function as part of 'btmetric'. stdSeries :: [Int] stdSeries = [5, 10, 20, 40, 80, 160] testData :: Int -> Text testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a' .. 'z'] text-metrics-0.3.3/bench/speed/0000755000000000000000000000000007346545000014515 5ustar0000000000000000text-metrics-0.3.3/bench/speed/Main.hs0000644000000000000000000000203707346545000015737 0ustar0000000000000000module Main (main) where import Control.DeepSeq import Criterion.Main import Data.Text (Text) import Data.Text qualified as T import Data.Text.Metrics main :: IO () main = defaultMain [ btmetric "levenshtein" levenshtein, btmetric "levenshteinNorm" levenshteinNorm, btmetric "damerauLevenshtein" damerauLevenshtein, btmetric "damerauLevenshteinNorm" damerauLevenshteinNorm, btmetric "overlap" overlap, btmetric "jaccard" jaccard, btmetric "hamming" hamming, btmetric "jaro" jaro, btmetric "jaroWinkler" jaroWinkler ] -- | Produce benchmark group to test. btmetric :: (NFData a) => String -> (Text -> Text -> a) -> Benchmark btmetric name f = bgroup name (bs <$> stdSeries) where bs n = env (return (testData n, testData n)) (bench (show n) . nf (uncurry f)) -- | The series of lengths to try with every function as part of 'btmetric'. stdSeries :: [Int] stdSeries = [5, 10, 20, 40, 80, 160] testData :: Int -> Text testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a' .. 'z'] text-metrics-0.3.3/tests/0000755000000000000000000000000007346545000013500 5ustar0000000000000000text-metrics-0.3.3/tests/Main.hs0000644000000000000000000001322207346545000014720 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module Main (main) where import Data.Ratio import Data.Text (Text) import Data.Text qualified as T import Data.Text.Metrics import Test.Hspec import Test.QuickCheck instance Arbitrary Text where arbitrary = T.pack <$> arbitrary main :: IO () main = hspec spec spec :: Spec spec = do describe "levenshtein" $ do testSwap levenshtein context "with concrete examples" $ do testPair levenshtein "kitten" "sitting" 3 testPair levenshtein "cake" "drake" 2 testPair levenshtein "saturday" "sunday" 3 testPair levenshtein "red" "wax" 3 testPair levenshtein "a😀c" "abc" 1 testPair levenshtein "lucky" "lucky" 0 testPair levenshtein "" "" 0 describe "levenshteinNorm" $ do testSwap levenshteinNorm testPair levenshteinNorm "kitten" "sitting" (4 % 7) testPair levenshteinNorm "cake" "drake" (3 % 5) testPair levenshteinNorm "saturday" "sunday" (5 % 8) testPair levenshteinNorm "red" "wax" (0 % 1) testPair levenshteinNorm "a😀c" "abc" (2 % 3) testPair levenshteinNorm "lucky" "lucky" (1 % 1) testPair levenshteinNorm "" "" (1 % 1) describe "damerauLevenshtein" $ do testSwap damerauLevenshtein testPair damerauLevenshtein "veryvery long" "very long" 4 testPair damerauLevenshtein "thing" "think" 1 testPair damerauLevenshtein "nose" "ones" 2 testPair damerauLevenshtein "thing" "sign" 3 testPair damerauLevenshtein "red" "wax" 3 testPair damerauLevenshtein "a😀c" "abc" 1 testPair damerauLevenshtein "lucky" "lucky" 0 testPair damerauLevenshtein "" "" 0 describe "damerauLevenshteinNorm" $ do testSwap damerauLevenshteinNorm testPair damerauLevenshteinNorm "veryvery long" "very long" (9 % 13) testPair damerauLevenshteinNorm "thing" "think" (4 % 5) testPair damerauLevenshteinNorm "nose" "ones" (1 % 2) testPair damerauLevenshteinNorm "thing" "sign" (2 % 5) testPair damerauLevenshteinNorm "red" "wax" (0 % 1) testPair damerauLevenshteinNorm "a😀c" "abc" (2 % 3) testPair damerauLevenshteinNorm "lucky" "lucky" (1 % 1) testPair damerauLevenshteinNorm "" "" (1 % 1) describe "hamming" $ do testSwap hamming testPair hamming "karolin" "kathrin" (Just 3) testPair hamming "karolin" "kerstin" (Just 3) testPair hamming "1011101" "1001001" (Just 2) testPair hamming "2173896" "2233796" (Just 3) testPair hamming "toned" "roses" (Just 3) testPair hamming "red" "wax" (Just 3) testPair hamming "a😀c" "abc" (Just 1) testPair hamming "lucky" "lucky" (Just 0) testPair hamming "" "" (Just 0) testPair hamming "small" "big" Nothing describe "jaro" $ do testPair jaro "aa" "a" (5 % 6) testPair jaro "a" "aa" (5 % 6) testPair jaro "martha" "marhta" (17 % 18) testPair jaro "marhta" "martha" (17 % 18) testPair jaro "dwayne" "duane" (37 % 45) testPair jaro "duane" "dwayne" (37 % 45) testPair jaro "dixon" "dicksonx" (23 % 30) testPair jaro "dicksonx" "dixon" (23 % 30) testPair jaro "jones" "johnson" (83 % 105) testPair jaro "johnson" "jones" (83 % 105) testPair jaro "brain" "brian" (14 % 15) testPair jaro "brian" "brain" (14 % 15) testPair jaro "five" "ten" (0 % 1) testPair jaro "ten" "five" (0 % 1) testPair jaro "lucky" "lucky" (1 % 1) testPair jaro "a😀c" "abc" (7 % 9) testPair jaro "" "" (0 % 1) describe "jaroWinkler" $ do testPair jaroWinkler "aa" "a" (17 % 20) testPair jaroWinkler "a" "aa" (17 % 20) testPair jaroWinkler "martha" "marhta" (173 % 180) testPair jaroWinkler "marhta" "martha" (173 % 180) testPair jaroWinkler "dwayne" "duane" (21 % 25) testPair jaroWinkler "duane" "dwayne" (21 % 25) testPair jaroWinkler "dixon" "dicksonx" (61 % 75) testPair jaroWinkler "dicksonx" "dixon" (61 % 75) testPair jaroWinkler "jones" "johnson" (437 % 525) testPair jaroWinkler "johnson" "jones" (437 % 525) testPair jaroWinkler "brain" "brian" (71 % 75) testPair jaroWinkler "brian" "brain" (71 % 75) testPair jaroWinkler "five" "ten" (0 % 1) testPair jaroWinkler "ten" "five" (0 % 1) testPair jaroWinkler "lucky" "lucky" (1 % 1) testPair jaroWinkler "a😀c" "abc" (4 % 5) testPair jaroWinkler "" "" (0 % 1) testPair jaroWinkler "aaaaaaaaaab" "aaaaaaaaaa" (54 % 55) testPair jaroWinkler "aaaaaaaaaaaaaaaaaaaab" "aaaaaaaaaaaaaaaaaaaa" (104 % 105) describe "overlap" $ do testSwap overlap testPair overlap "fly" "butterfly" (1 % 1) testPair overlap "night" "nacht" (3 % 5) testPair overlap "context" "contact" (5 % 7) testPair overlap "red" "wax" (0 % 1) testPair overlap "a😀c" "abc" (2 % 3) testPair overlap "lucky" "lucky" (1 % 1) describe "jaccard" $ do testSwap jaccard testPair jaccard "xxx" "xyx" (1 % 2) testPair jaccard "night" "nacht" (3 % 7) testPair jaccard "context" "contact" (5 % 9) testPair overlap "a😀c" "abc" (2 % 3) testPair jaccard "lucky" "lucky" (1 % 1) -- | Test that given function returns the same results when order of -- arguments is swapped. testSwap :: (Eq a, Show a) => (Text -> Text -> a) -> SpecWith () testSwap f = context "if we swap the arguments" $ it "produces the same result" $ property $ \a b -> f a b === f b a -- | Create spec for given metric function applying it to two 'Text' values -- and comparing the result with expected one. testPair :: (Eq a, Show a) => -- | Function to test (Text -> Text -> a) -> -- | First input Text -> -- | Second input Text -> -- | Expected result a -> SpecWith () testPair f a b r = it ("‘" ++ T.unpack a ++ "’ and ‘" ++ T.unpack b ++ "’") $ f a b `shouldBe` r text-metrics-0.3.3/text-metrics.cabal0000644000000000000000000000526007346545000015755 0ustar0000000000000000cabal-version: 2.4 name: text-metrics version: 0.3.3 license: BSD-3-Clause license-file: LICENSE.md maintainer: Mark Karpov author: Mark Karpov tested-with: ghc ==9.6.3 ghc ==9.8.2 ghc ==9.10.1 homepage: https://github.com/mrkkrp/text-metrics bug-reports: https://github.com/mrkkrp/text-metrics/issues synopsis: Calculate various string metrics efficiently description: Calculate various string metrics efficiently. category: Text, Algorithms build-type: Simple extra-doc-files: CHANGELOG.md README.md source-repository head type: git location: https://github.com/mrkkrp/text-metrics.git flag dev description: Turn on development settings. default: False manual: True library exposed-modules: Data.Text.Metrics default-language: GHC2021 build-depends: base >=4.15 && <5, containers >=0.5 && <0.8, text >=0.2 && <2.2, vector >=0.11 && <0.14, primitive >=0.9 && <0.10 if flag(dev) ghc-options: -Wall -Werror -Wredundant-constraints -Wpartial-fields -Wunused-packages else ghc-options: -O2 -Wall test-suite tests type: exitcode-stdio-1.0 main-is: Main.hs hs-source-dirs: tests default-language: GHC2021 build-depends: QuickCheck >=2.8 && <3, base >=4.15 && <5, hspec >=2.0 && <3, text >=0.2 && <2.2, text-metrics if flag(dev) ghc-options: -Wall -Werror -Wredundant-constraints -Wpartial-fields -Wunused-packages else ghc-options: -O2 -Wall benchmark bench-speed type: exitcode-stdio-1.0 main-is: Main.hs hs-source-dirs: bench/speed default-language: GHC2021 build-depends: base >=4.15 && <5, criterion >=0.6.2.1 && <1.7, deepseq >=1.3 && <1.6, text >=0.2 && <2.2, text-metrics if flag(dev) ghc-options: -Wall -Werror -Wredundant-constraints -Wpartial-fields -Wunused-packages else ghc-options: -O2 -Wall benchmark bench-memory type: exitcode-stdio-1.0 main-is: Main.hs hs-source-dirs: bench/memory default-language: GHC2021 build-depends: base >=4.15 && <5, deepseq >=1.3 && <1.6, text >=0.2 && <2.2, text-metrics, weigh >=0.0.4 if flag(dev) ghc-options: -Wall -Werror -Wredundant-constraints -Wpartial-fields -Wunused-packages else ghc-options: -O2 -Wall