Hello community,
here is the log from the commit of package ghc-text-metrics for openSUSE:Factory checked in at 2017-08-31 21:00:28
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/ghc-text-metrics (Old)
and /work/SRC/openSUSE:Factory/.ghc-text-metrics.new (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "ghc-text-metrics"
Thu Aug 31 21:00:28 2017 rev:3 rq:513514 version:0.3.0
Changes:
--------
--- /work/SRC/openSUSE:Factory/ghc-text-metrics/ghc-text-metrics.changes 2017-06-22 10:39:20.558507940 +0200
+++ /work/SRC/openSUSE:Factory/.ghc-text-metrics.new/ghc-text-metrics.changes 2017-08-31 21:00:30.341340143 +0200
@@ -1,0 +2,5 @@
+Thu Jul 27 14:06:28 UTC 2017 - psimons@suse.com
+
+- Update to version 0.3.0.
+
+-------------------------------------------------------------------
Old:
----
text-metrics-0.2.0.tar.gz
text-metrics.cabal
New:
----
text-metrics-0.3.0.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ ghc-text-metrics.spec ++++++
--- /var/tmp/diff_new_pack.K6p7GD/_old 2017-08-31 21:00:32.157085026 +0200
+++ /var/tmp/diff_new_pack.K6p7GD/_new 2017-08-31 21:00:32.185081092 +0200
@@ -19,17 +19,18 @@
%global pkg_name text-metrics
%bcond_with tests
Name: ghc-%{pkg_name}
-Version: 0.2.0
+Version: 0.3.0
Release: 0
Summary: Calculate various string metrics efficiently
License: BSD-3-Clause
Group: Development/Languages/Other
Url: https://hackage.haskell.org/package/%{pkg_name}
Source0: https://hackage.haskell.org/package/%{pkg_name}-%{version}/%{pkg_name}-%{version}.tar.gz
-Source1: https://hackage.haskell.org/package/%{pkg_name}-%{version}/revision/1.cabal#/%{pkg_name}.cabal
BuildRequires: ghc-Cabal-devel
+BuildRequires: ghc-containers-devel
BuildRequires: ghc-rpm-macros
BuildRequires: ghc-text-devel
+BuildRequires: ghc-vector-devel
BuildRoot: %{_tmppath}/%{name}-%{version}-build
%if %{with tests}
BuildRequires: ghc-QuickCheck-devel
@@ -52,7 +53,6 @@
%prep
%setup -q -n %{pkg_name}-%{version}
-cp -p %{SOURCE1} %{pkg_name}.cabal
%build
%ghc_lib_build
++++++ text-metrics-0.2.0.tar.gz -> text-metrics-0.3.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/CHANGELOG.md new/text-metrics-0.3.0/CHANGELOG.md
--- old/text-metrics-0.2.0/CHANGELOG.md 2016-10-09 16:43:02.000000000 +0200
+++ new/text-metrics-0.3.0/CHANGELOG.md 2017-06-13 10:43:59.000000000 +0200
@@ -1,3 +1,13 @@
+## Text Metrics 0.3.0
+
+* All functions are now implemented in pure Haskell.
+
+* All functions return `Int` or `Ratio Int` instead of `Natural` and `Ratio
+ Natural`.
+
+* Added `overlap` (returns overlap coefficient) and `jaccard` (returns
+ Jaccard similarity coefficient).
+
## Text Metrics 0.2.0
* Made the `levenshtein`, `levenshteinNorm`, `damerauLevenshtein`, and
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/Data/Text/Metrics.hs new/text-metrics-0.3.0/Data/Text/Metrics.hs
--- old/text-metrics-0.2.0/Data/Text/Metrics.hs 2016-10-09 16:43:02.000000000 +0200
+++ new/text-metrics-0.3.0/Data/Text/Metrics.hs 2017-06-13 10:42:49.000000000 +0200
@@ -1,34 +1,24 @@
-- |
-- Module : Data.Text.Metrics
--- Copyright : © 2016 Mark Karpov
+-- Copyright : © 2016–2017 Mark Karpov
-- License : BSD 3 clause
--
--- Maintainer : Mark Karpov
+-- Maintainer : Mark Karpov
-- Stability : experimental
-- Portability : portable
--
--- The module provides efficient implementations of various strings metrics.
--- It works with strict 'Text' values and returns either 'Natural' numbers
--- (because the metrics cannot be negative), or @'Ratio' 'Natural'@ values
--- because returned values are rational non-negative numbers by definition.
---
--- The functions provided here are the fastest implementations available for
--- use in Haskell programs. In fact the functions are implemented in C for
--- maximal efficiency, but this leads to a minor flaw. When we work with
--- 'Text' values in C, they are represented as UTF-16 encoded strings of
--- two-byte values. The algorithms treat the strings as if a character
--- corresponds to one element in such strings, which is true for almost all
--- modern text data. However, there are characters that are represented by
--- two adjoined elements in UTF-16: emoji, historic scripts, less used
--- Chinese ideographs, and some more. If input 'Text' of the functions
--- contains such characters, the functions may return slightly incorrect
--- result. Decide for yourself if this is acceptable for your use case, but
--- chances are you will never run into situations when the functions produce
--- incorrect results.
-
-{-# LANGUAGE CPP #-}
-{-# LANGUAGE ForeignFunctionInterface #-}
-{-# LANGUAGE OverloadedStrings #-}
+-- The module provides efficient implementations of various strings metric
+-- algorithms. It works with strict 'Text' values.
+--
+-- __Note__: before version /0.3.0/ the package used C implementations of
+-- the algorithms under the hood. Beginning from version /0.3.0/, the
+-- implementations are written in Haskell while staying almost as fast, see:
+--
+-- https://markkarpov.com/post/migrating-text-metrics.html
+
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE MultiWayIf #-}
module Data.Text.Metrics
( -- * Levenshtein variants
@@ -36,20 +26,25 @@
, levenshteinNorm
, damerauLevenshtein
, damerauLevenshteinNorm
+ -- * Treating inputs like sets
+ , overlap
+ , jaccard
-- * Other
, hamming
, jaro
, jaroWinkler )
where
+import Control.Monad
+import Control.Monad.ST
+import Data.Map.Strict (Map)
import Data.Ratio
import Data.Text
-import Foreign
-import Foreign.C.Types
-import Numeric.Natural
-import System.IO.Unsafe
-import qualified Data.Text as T
-import qualified Data.Text.Foreign as TF
+import GHC.Exts (inline)
+import qualified Data.Map.Strict as M
+import qualified Data.Text as T
+import qualified Data.Text.Unsafe as TU
+import qualified Data.Vector.Unboxed.Mutable as VUM
#if !MIN_VERSION_base(4,8,0)
import Control.Applicative
@@ -59,82 +54,233 @@
-- Levenshtein variants
-- | Return Levenshtein distance between two 'Text' values. Classic
--- Levenshtein distance between two strings is minimal number of operations
--- necessary to transform one string into another. For Levenshtein distance
--- allowed operations are: deletion, insertion, and substitution.
+-- Levenshtein distance between two strings is the minimal number of
+-- operations necessary to transform one string into another. For
+-- Levenshtein distance allowed operations are: deletion, insertion, and
+-- substitution.
--
-- See also: https://en.wikipedia.org/wiki/Levenshtein_distance.
+--
+-- __Heads up__, before version /0.3.0/ this function returned
+-- 'Data.Numeric.Natural'.
-levenshtein :: Text -> Text -> Natural
-levenshtein = withTwo c_levenshtein
-
-foreign import ccall unsafe "tmetrics_levenshtein"
- c_levenshtein :: CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt
+levenshtein :: Text -> Text -> Int
+levenshtein a b = fst (levenshtein_ a b)
-- | Return normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
--- 'Natural'@), where 0 signifies no similarity between the strings, while 1
--- means exact match. The operation is virtually as fast as 'levenshtein'.
+-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
+-- strings, while 1 means exact match.
--
-- See also: https://en.wikipedia.org/wiki/Levenshtein_distance.
+--
+-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
+-- 'Data.Numeric.Natural'@.
+
+levenshteinNorm :: Text -> Text -> Ratio Int
+levenshteinNorm = norm levenshtein_
-levenshteinNorm :: Text -> Text -> Ratio Natural
-levenshteinNorm = norm levenshtein
-{-# INLINE levenshteinNorm #-}
+-- | An internal helper, returns Levenshtein distance as the first element
+-- of the tuple and max length of the two inputs as the second element of
+-- the tuple.
+
+levenshtein_ :: Text -> Text -> (Int, Int)
+levenshtein_ a b
+ | T.null a = (lenb, lenm)
+ | T.null b = (lena, lenm)
+ | otherwise = runST $ do
+ let v_len = lenb + 1
+ v <- VUM.unsafeNew (v_len * 2)
+ let gov !i =
+ when (i < v_len) $ do
+ VUM.unsafeWrite v i i
+ gov (i + 1)
+ goi !i !na !v0 !v1 = do
+ let !(TU.Iter ai da) = TU.iter a na
+ goj !j !nb =
+ when (j < lenb) $ do
+ let !(TU.Iter bj db) = TU.iter b nb
+ cost = if ai == bj then 0 else 1
+ x <- (+ 1) <$> VUM.unsafeRead v (v1 + j)
+ y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1)
+ z <- (+ cost) <$> VUM.unsafeRead v (v0 + j)
+ VUM.unsafeWrite v (v1 + j + 1) (min x (min y z))
+ goj (j + 1) (nb + db)
+ when (i < lena) $ do
+ VUM.unsafeWrite v v1 (i + 1)
+ goj 0 0
+ goi (i + 1) (na + da) v1 v0
+ gov 0
+ goi 0 0 0 v_len
+ ld <- VUM.unsafeRead v (lenb + if even lena then 0 else v_len)
+ return (ld, lenm)
+ where
+ lena = T.length a
+ lenb = T.length b
+ lenm = max lena lenb
+{-# INLINE levenshtein_ #-}
-- | Return Damerau-Levenshtein distance between two 'Text' values. The
-- function works like 'levenshtein', but the collection of allowed
--- operations also includes transposition of two /adjacent/ characters. The
--- function is about 20% slower than 'levenshtein', but still pretty fast.
+-- operations also includes transposition of two /adjacent/ characters.
--
-- See also: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance.
+--
+-- __Heads up__, before version /0.3.0/ this function returned
+-- 'Data.Numeric.Natural'.
-damerauLevenshtein :: Text -> Text -> Natural
-damerauLevenshtein = withTwo c_damerau_levenshtein
-
-foreign import ccall unsafe "tmetrics_damerau_levenshtein"
- c_damerau_levenshtein :: CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt
+damerauLevenshtein :: Text -> Text -> Int
+damerauLevenshtein a b = fst (damerauLevenshtein_ a b)
-- | Return normalized Damerau-Levenshtein distance between two 'Text'
--- values. Result is a non-negative rational number (represented as @'Ratio'
--- 'Natural'@), where 0 signifies no similarity between the strings, while 1
--- means exact match. The operation is virtually as fast as
--- 'damerauLevenshtein'.
+-- values. 0 signifies no similarity between the strings, while 1 means
+-- exact match.
--
-- See also: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance.
+--
+-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
+-- 'Data.Numeric.Natural'@.
+
+damerauLevenshteinNorm :: Text -> Text -> Ratio Int
+damerauLevenshteinNorm = norm damerauLevenshtein_
+
+-- | An internal helper, returns Damerau-Levenshtein distance as the first
+-- element of the tuple and max length of the two inputs as the second
+-- element of the tuple.
+
+damerauLevenshtein_ :: Text -> Text -> (Int, Int)
+damerauLevenshtein_ a b
+ | T.null a = (lenb, lenm)
+ | T.null b = (lena, lenm)
+ | otherwise = runST $ do
+ let v_len = lenb + 1
+ v <- VUM.unsafeNew (v_len * 3)
+ let gov !i =
+ when (i < v_len) $ do
+ VUM.unsafeWrite v i i
+ gov (i + 1)
+ goi !i !na !ai_1 !v0 !v1 !v2 = do
+ let !(TU.Iter ai da) = TU.iter a na
+ goj !j !nb !bj_1 =
+ when (j < lenb) $ do
+ let !(TU.Iter bj db) = TU.iter b nb
+ cost = if ai == bj then 0 else 1
+ x <- (+ 1) <$> VUM.unsafeRead v (v1 + j)
+ y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1)
+ z <- (+ cost) <$> VUM.unsafeRead v (v0 + j)
+ let g = min x (min y z)
+ val <- (+ cost) <$> VUM.unsafeRead v (v2 + j - 1)
+ VUM.unsafeWrite v (v1 + j + 1) $
+ if i > 0 && j > 0 && ai == bj_1 && ai_1 == bj && val < g
+ then val
+ else g
+ goj (j + 1) (nb + db) bj
+ when (i < lena) $ do
+ VUM.unsafeWrite v v1 (i + 1)
+ goj 0 0 'a'
+ goi (i + 1) (na + da) ai v1 v2 v0
+ gov 0
+ goi 0 0 'a' 0 v_len (v_len * 2)
+ ld <- VUM.unsafeRead v (lenb + (lena `mod` 3) * v_len)
+ return (ld, lenm)
+ where
+ lena = T.length a
+ lenb = T.length b
+ lenm = max lena lenb
+{-# INLINE damerauLevenshtein_ #-}
+
+----------------------------------------------------------------------------
+-- Treating inputs like sets
+
+-- | Return overlap coefficient for two 'Text' values. Returned value is in
+-- the range from 0 (no similarity) to 1 (exact match). Return 1 if both
+-- 'Text' values are empty.
+--
+-- See also: https://en.wikipedia.org/wiki/Overlap_coefficient.
+--
+-- @since 0.3.0
+
+overlap :: Text -> Text -> Ratio Int
+overlap a b =
+ if d == 0
+ then 1 % 1
+ else intersectionSize (mkTextMap a) (mkTextMap b) % d
+ where
+ d = min (T.length a) (T.length b)
+
+-- | Return Jaccard similarity coefficient for two 'Text' values. Returned
+-- value is in the range from 0 (no similarity) to 1 (exact match). Return 1
+-- if both
+--
+-- See also: https://en.wikipedia.org/wiki/Jaccard_index
+--
+-- @since 0.3.0
+
+jaccard :: Text -> Text -> Ratio Int
+jaccard a b =
+ if d == 0
+ then 1 % 1
+ else intersectionSize ma mb % d
+ where
+ ma = mkTextMap a
+ mb = mkTextMap b
+ d = unionSize ma mb
+
+-- | Make a map from 'Char' to 'Int' representing how many times the 'Char'
+-- appears in the input 'Text'.
+
+mkTextMap :: Text -> Map Char Int
+mkTextMap = T.foldl' f M.empty
+ where
+ f m ch = M.insertWith (+) ch 1 m
+{-# INLINE mkTextMap #-}
+
+-- | Return intersection size between two 'Text'-maps.
+
+intersectionSize :: Map Char Int -> Map Char Int -> Int
+intersectionSize a b = M.foldl' (+) 0 (M.intersectionWith min a b)
+{-# INLINE intersectionSize #-}
+
+-- | Return union size between two 'Text'-maps.
-damerauLevenshteinNorm :: Text -> Text -> Ratio Natural
-damerauLevenshteinNorm = norm damerauLevenshtein
-{-# INLINE damerauLevenshteinNorm #-}
+unionSize :: Map Char Int -> Map Char Int -> Int
+unionSize a b = M.foldl' (+) 0 (M.unionWith max a b)
+{-# INLINE unionSize #-}
----------------------------------------------------------------------------
-- Other
-- | /O(n)/ Return Hamming distance between two 'Text' values. Hamming
--- distance is defined as number of positions at which the corresponding
+-- distance is defined as the number of positions at which the corresponding
-- symbols are different. The input 'Text' values should be of equal length
-- or 'Nothing' will be returned.
--
-- See also: https://en.wikipedia.org/wiki/Hamming_distance.
+--
+-- __Heads up__, before version /0.3.0/ this function returned @'Maybe'
+-- 'Data.Numeric.Natural'@.
-hamming :: Text -> Text -> Maybe Natural
+hamming :: Text -> Text -> Maybe Int
hamming a b =
if T.length a == T.length b
- then Just . unsafePerformIO . TF.useAsPtr a $ \aptr size ->
- TF.useAsPtr b $ \bptr _ ->
- fromIntegral <$> c_hamming (fromIntegral size) aptr bptr
+ then Just (go 0 0 0)
else Nothing
-
-foreign import ccall unsafe "tmetrics_hamming"
- c_hamming :: CUInt -> Ptr Word16 -> Ptr Word16 -> IO CUInt
+ where
+ go !na !nb !r =
+ let !(TU.Iter cha da) = TU.iter a na
+ !(TU.Iter chb db) = TU.iter b nb
+ in if | na == len -> r
+ | cha /= chb -> go (na + da) (nb + db) (r + 1)
+ | otherwise -> go (na + da) (nb + db) r
+ len = TU.lengthWord16 a
-- | Return Jaro distance between two 'Text' values. Returned value is in
--- range from 0 (no similarity) to 1 (exact match).
+-- the range from 0 (no similarity) to 1 (exact match).
--
-- While the algorithm is pretty clear for artificial examples (like those
-- from the linked Wikipedia article), for /arbitrary/ strings, it may be
-- hard to decide which of two strings should be considered as one having
--- “reference” order of characters (since order of matching characters in an
+-- “reference” order of characters (order of matching characters in an
-- essential part of the definition of the algorithm). This makes us
-- consider the first string the “reference” string (with correct order of
-- characters). Thus generally,
@@ -144,71 +290,98 @@
-- This asymmetry can be found in all implementations of the algorithm on
-- the internet, AFAIK.
--
--- See also: http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
+-- See also: https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
--
-- @since 0.2.0
+--
+-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
+-- 'Data.Numeric.Natural'@.
-jaro :: Text -> Text -> Ratio Natural
-jaro = jaroCommon (\_ _ _ _ x -> return x)
-
-jaroCommon :: (CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> Ratio Natural -> IO (Ratio Natural)) -> Text -> Text -> Ratio Natural
-jaroCommon f a b = unsafePerformIO $ alloca $ \m' -> alloca $ \t' ->
- TF.useAsPtr a $ \aptr asize ->
- TF.useAsPtr b $ \bptr bsize ->
- if asize == 0 || bsize == 0
- then return (1 % 1)
- else do
- let asize' = fromIntegral asize
- bsize' = fromIntegral bsize
- c_jaro m' t' asize' aptr bsize' bptr
- m <- fromIntegral <$> peek m'
- t <- fromIntegral <$> peek t'
- f asize' aptr bsize' bptr $
- if m == 0
- then 0
- else ((m % fromIntegral asize) +
- (m % fromIntegral bsize) +
- ((m - t) % m)) / 3
-{-# INLINE jaroCommon #-}
-
-foreign import ccall unsafe "tmetrics_jaro"
- c_jaro :: Ptr CUInt -> Ptr CUInt -> CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO ()
+jaro :: Text -> Text -> Ratio Int
+jaro a b =
+ if T.null a || T.null b
+ then 0 % 1
+ else runST $ do
+ let lena = T.length a
+ lenb = T.length b
+ d =
+ if lena >= 2 && lenb >= 2
+ then max lena lenb `quot` 2 - 1
+ else 0
+ v <- VUM.replicate lenb (0 :: Int)
+ r <- VUM.replicate 3 (0 :: Int) -- tj, m, t
+ let goi !i !na !fromb = do
+ let !(TU.Iter ai da) = TU.iter a na
+ (from, fromb') =
+ if i >= d
+ then (i - d, fromb + TU.iter_ b fromb)
+ else (0, 0)
+ to = min (i + d + 1) lenb
+ goj !j !nb =
+ when (j < to) $ do
+ let !(TU.Iter bj db) = TU.iter b nb
+ used <- (== 1) <$> VUM.unsafeRead v j
+ if not used && ai == bj
+ then do
+ tj <- VUM.unsafeRead r 0
+ if j < tj
+ then VUM.unsafeModify r (+ 1) 2
+ else VUM.unsafeWrite r 0 j
+ VUM.unsafeWrite v j 1
+ VUM.unsafeModify r (+ 1) 1
+ else goj (j + 1) (nb + db)
+ when (i < lena) $ do
+ goj from fromb
+ goi (i + 1) (na + da) fromb'
+ goi 0 0 0
+ m <- VUM.unsafeRead r 1
+ t <- VUM.unsafeRead r 2
+ return $
+ if m == 0
+ then 0 % 1
+ else ((m % lena) +
+ (m % lenb) +
+ ((m - t) % m)) / 3
-- | Return Jaro-Winkler distance between two 'Text' values. Returned value
-- is in range from 0 (no similarity) to 1 (exact match).
--
--- See also: http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
+-- See also: https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
--
-- @since 0.2.0
+--
+-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
+-- 'Data.Numeric.Natural'@.
-jaroWinkler :: Text -> Text -> Ratio Natural
-jaroWinkler = jaroCommon g
+jaroWinkler :: Text -> Text -> Ratio Int
+jaroWinkler a b = dj + (1 % 10) * l * (1 - dj)
where
- g asize aptr bsize bptr dj = do
- l <- fromIntegral <$> c_common_prefix asize aptr bsize bptr
- return (dj + (1 % 10) * l * (1 - dj))
+ dj = inline (jaro a b)
+ l = fromIntegral (commonPrefix a b)
-foreign import ccall unsafe "tmetrics_common_prefix"
- c_common_prefix :: CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt
+-- | Return length of common prefix two 'Text' values have.
+
+commonPrefix :: Text -> Text -> Int
+commonPrefix a b = go 0 0 0
+ where
+ go !na !nb !r =
+ let !(TU.Iter cha da) = TU.iter a na
+ !(TU.Iter chb db) = TU.iter b nb
+ in if | na == lena -> r
+ | nb == lenb -> r
+ | cha == chb -> go (na + da) (nb + db) (r + 1)
+ | otherwise -> r
+ lena = TU.lengthWord16 a
+ lenb = TU.lengthWord16 b
+{-# INLINE commonPrefix #-}
----------------------------------------------------------------------------
-- Helpers
-withTwo
- :: (CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt)
- -> Text
- -> Text
- -> Natural
-withTwo f a b =
- unsafePerformIO . TF.useAsPtr a $ \aptr asize ->
- TF.useAsPtr b $ \bptr bsize ->
- fromIntegral <$> f (fromIntegral asize) aptr (fromIntegral bsize) bptr
-{-# INLINE withTwo #-}
-
-norm :: (Text -> Text -> Natural) -> Text -> Text -> Ratio Natural
+norm :: (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int
norm f a b =
- let r = f a b
+ let (r, l) = f a b
in if r == 0
then 1 % 1
- else 1 % 1 - r % fromIntegral (max (T.length a) (T.length b))
+ else 1 % 1 - r % l
{-# INLINE norm #-}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/LICENSE.md new/text-metrics-0.3.0/LICENSE.md
--- old/text-metrics-0.2.0/LICENSE.md 2016-01-03 14:37:56.000000000 +0100
+++ new/text-metrics-0.3.0/LICENSE.md 2017-06-02 17:36:51.000000000 +0200
@@ -1,4 +1,4 @@
-Copyright © 2016 Mark Karpov
+Copyright © 2016–2017 Mark Karpov
All rights reserved.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/README.md new/text-metrics-0.3.0/README.md
--- old/text-metrics-0.2.0/README.md 2016-10-09 16:43:02.000000000 +0200
+++ new/text-metrics-0.3.0/README.md 2017-06-13 10:42:49.000000000 +0200
@@ -7,76 +7,44 @@
[![Build Status](https://travis-ci.org/mrkkrp/text-metrics.svg?branch=master)](https://travis-ci.org/mrkkrp/text-metrics)
[![Coverage Status](https://coveralls.io/repos/mrkkrp/text-metrics/badge.svg?branch=master&service=github)](https://coveralls.io/github/mrkkrp/text-metrics?branch=master)
-The library provides efficient implementations of various strings metrics.
-It works with strict `Text` values and returns either `Natural` numbers
-(because the metrics cannot be negative), or `Ratio Natural` values because
-returned values are rational non-negative numbers by definition.
-
-The functions provided here are the fastest implementations available for
-use in Haskell programs. In fact the functions are implemented in C for
-maximal efficiency, but this leads to a minor flaw. When we work with `Text`
-values in C, they are represented as UTF-16 encoded strings of two-byte
-values. The algorithms treat the strings as if a character corresponds to
-one element in such strings, which is true for almost all modern text data.
-However, there are characters that are represented by two adjoined elements
-in UTF-16: emoji, historic scripts, less used Chinese ideographs, and some
-more. If input `Text` of the functions contains such characters, the
-functions may return slightly incorrect result. Decide for yourself if this
-is acceptable for your use case, but chances are you will never run into
-situations when the functions produce incorrect results.
+The library provides efficient implementations of various strings metric
+algorithms. It works with strict `Text` values.
The current version of the package implements:
-* [Levenshtein distance](http://en.wikipedia.org/wiki/Levenshtein_distance)
-* [Normalized Levenshtein distance](http://en.wikipedia.org/wiki/Levenshtein_distance)
-* [Damerau-Levenshtein distance](http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
-* [Normalized Damerau-Levenshtein distance](http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
-* [Hamming distance](http://en.wikipedia.org/wiki/Hamming_distance)
-* [Jaro distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
-* [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
-
-TODO list:
-
-* [Overlap coefficient](http://en.wikipedia.org/wiki/Overlap_coefficient)
-* [Jaccard similarity coefficient](http://en.wikipedia.org/wiki/Jaccard_index)
+* [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
+* [Normalized Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
+* [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
+* [Normalized Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
+* [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance)
+* [Jaro distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
+* [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
+* [Overlap coefficient](https://en.wikipedia.org/wiki/Overlap_coefficient)
+* [Jaccard similarity coefficient](https://en.wikipedia.org/wiki/Jaccard_index)
## Comparison with the `edit-distance` package
-There
-is [`edit-distance`](https://hackage.haskell.org/package/edit-distance)
-package whose scope overlaps with the scope of this package. The differences
-are:
+There is [`edit-distance`](https://hackage.haskell.org/package/edit-distance) package whose scope overlaps with the scope of
+this package. The differences are:
* `edit-distance` allows to specify costs for every operation when
calculating Levenshtein distance (insertion, deletion, substitution, and
transposition). This is rarely needed though in real-world applications,
IMO.
-* `edit-distance` only provides single Levenshtein distance, `text-metrics`
- aims to provide implementations of most string metrics algorithms.
+* `edit-distance` only provides Levenshtein distance, `text-metrics` aims to
+ provide implementations of most string metrics algorithms.
* `edit-distance` works on `Strings`, while `text-metrics` works on strict
`Text` values.
-* As `README.md` of `edit-distance` states, “[the algorithms] have been
- fairly heavily optimized”, which is apparently true, yet the
- `text-metrics` is faster for short strings (under 64 characters) and even
- faster for longer strings (scales better). How much faster? For short
- strings more than ×3, and about ×4 for longer strings.
-
## Implementation
-All “meat” of the algorithms is written in C in a rather straightforward
-way. Levenshtein variants are based on the “iterative algorithm with two
-matrix rows” from Wikipedia with additional improvement that we do not copy
-current row of distances into previous row, but just swap the pointers
-(which is OK, since the arrays have equal length and current row will be
-overwritten in the next iteration anyway).
-
-Normalized versions are defined as thin (inlined) Haskell wrappers.
+Although we originally used C for speed, currently all functions are pure
+Haskell tuned for performance. See [this blog post](https://markkarpov.com/post/migrating-text-metrics.html) for more info.
## License
-Copyright © 2016 Mark Karpov
+Copyright © 2016–2017 Mark Karpov
Distributed under BSD 3 clause license.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/bench/Main.hs new/text-metrics-0.3.0/bench/Main.hs
--- old/text-metrics-0.2.0/bench/Main.hs 2016-10-09 16:43:02.000000000 +0200
+++ new/text-metrics-0.3.0/bench/Main.hs 1970-01-01 01:00:00.000000000 +0100
@@ -1,64 +0,0 @@
---
--- Benchmarks for the ‘text-metrics’ package.
---
--- Copyright © 2016 Mark Karpov
---
--- Redistribution and use in source and binary forms, with or without
--- modification, are permitted provided that the following conditions are
--- met:
---
--- * Redistributions of source code must retain the above copyright notice,
--- this list of conditions and the following disclaimer.
---
--- * Redistributions in binary form must reproduce the above copyright
--- notice, this list of conditions and the following disclaimer in the
--- documentation and/or other materials provided with the distribution.
---
--- * Neither the name Mark Karpov nor the names of contributors may be used
--- to endorse or promote products derived from this software without
--- specific prior written permission.
---
--- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY
--- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
--- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
--- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
--- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
--- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
--- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
--- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
--- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
--- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
--- POSSIBILITY OF SUCH DAMAGE.
-
-module Main (main) where
-
-import Control.DeepSeq
-import Criterion.Main
-import Data.Text (Text)
-import Data.Text.Metrics
-import qualified Data.Text as T
-
-main :: IO ()
-main = defaultMain
- [ btmetric "levenshtein" levenshtein
- , btmetric "levenshteinNorm" levenshteinNorm
- , btmetric "damerauLevenshtein" damerauLevenshtein
- , btmetric "damerauLevenshteinNorm" damerauLevenshteinNorm
- , btmetric "hamming" hamming
- , btmetric "jaro" jaro
- , btmetric "jaroWinkler" jaroWinkler ]
-
--- | Produce benchmark group to test
-
-btmetric :: NFData a => String -> (Text -> Text -> a) -> Benchmark
-btmetric name f = bgroup name (bs <$> stdSeries)
- where
- bs n = env (return (testData n, testData n)) (bench (show n) . nf (uncurry f))
-
--- | The series of lengths to try with every function as part of 'btmetric'.
-
-stdSeries :: [Int]
-stdSeries = [5,10,20,40,80,160]
-
-testData :: Int -> Text
-testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a'..'z']
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/bench-memory/Main.hs new/text-metrics-0.3.0/bench-memory/Main.hs
--- old/text-metrics-0.2.0/bench-memory/Main.hs 1970-01-01 01:00:00.000000000 +0100
+++ new/text-metrics-0.3.0/bench-memory/Main.hs 2017-06-13 10:42:49.000000000 +0200
@@ -0,0 +1,38 @@
+module Main (main) where
+
+import Control.DeepSeq
+import Control.Monad
+import Data.Text (Text)
+import Data.Text.Metrics
+import Weigh
+import qualified Data.Text as T
+
+main :: IO ()
+main = mainWith $ do
+ setColumns [Case, Allocated, GCs, Max]
+ bmetric "levenshtein" levenshtein
+ bmetric "levenshteinNorm" levenshteinNorm
+ bmetric "damerauLevenshtein" damerauLevenshtein
+ bmetric "damerauLevenshteinNorm" damerauLevenshteinNorm
+ bmetric "overlap" overlap
+ bmetric "jaccard" jaccard
+ bmetric "hamming" hamming
+ bmetric "jaro" jaro
+ bmetric "jaroWinkler" jaroWinkler
+
+-- | Perform a series to measurements with the same metric function.
+
+bmetric :: NFData a
+ => String -- ^ Name of the benchmark group
+ -> (Text -> Text -> a) -- ^ The function to benchmark
+ -> Weigh ()
+bmetric name f = forM_ stdSeries $ \n ->
+ func (name ++ "/" ++ show n) (uncurry f) (testData n, testData n)
+
+-- | The series of lengths to try with every function as part of 'btmetric'.
+
+stdSeries :: [Int]
+stdSeries = [5,10,20,40,80,160]
+
+testData :: Int -> Text
+testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a'..'z']
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/bench-speed/Main.hs new/text-metrics-0.3.0/bench-speed/Main.hs
--- old/text-metrics-0.2.0/bench-speed/Main.hs 1970-01-01 01:00:00.000000000 +0100
+++ new/text-metrics-0.3.0/bench-speed/Main.hs 2017-06-13 10:42:49.000000000 +0200
@@ -0,0 +1,34 @@
+module Main (main) where
+
+import Control.DeepSeq
+import Criterion.Main
+import Data.Text (Text)
+import Data.Text.Metrics
+import qualified Data.Text as T
+
+main :: IO ()
+main = defaultMain
+ [ btmetric "levenshtein" levenshtein
+ , btmetric "levenshteinNorm" levenshteinNorm
+ , btmetric "damerauLevenshtein" damerauLevenshtein
+ , btmetric "damerauLevenshteinNorm" damerauLevenshteinNorm
+ , btmetric "overlap" overlap
+ , btmetric "jaccard" jaccard
+ , btmetric "hamming" hamming
+ , btmetric "jaro" jaro
+ , btmetric "jaroWinkler" jaroWinkler ]
+
+-- | Produce benchmark group to test.
+
+btmetric :: NFData a => String -> (Text -> Text -> a) -> Benchmark
+btmetric name f = bgroup name (bs <$> stdSeries)
+ where
+ bs n = env (return (testData n, testData n)) (bench (show n) . nf (uncurry f))
+
+-- | The series of lengths to try with every function as part of 'btmetric'.
+
+stdSeries :: [Int]
+stdSeries = [5,10,20,40,80,160]
+
+testData :: Int -> Text
+testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a'..'z']
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/cbits/text_metrics.c new/text-metrics-0.3.0/cbits/text_metrics.c
--- old/text-metrics-0.2.0/cbits/text_metrics.c 2016-10-09 16:43:02.000000000 +0200
+++ new/text-metrics-0.3.0/cbits/text_metrics.c 1970-01-01 01:00:00.000000000 +0100
@@ -1,207 +0,0 @@
-/*
- * This file is part of ‘text-metrics’ package.
- *
- * Copyright © 2016 Mark Karpov
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * * Neither the name Mark Karpov nor the names of contributors may be used to
- * endorse or promote products derived from this software without specific
- * prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
- * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
- * NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "text_metrics.h"
-
-/* Levenshtein variants */
-
-unsigned int tmetrics_levenshtein (unsigned int la, uint16_t *a, unsigned int lb, uint16_t *b)
-{
- if (la == 0) return lb;
- if (lb == 0) return la;
-
- unsigned int v_len = lb + 1, *v0, *v1, i, j;
-
- if (v_len > VLEN_MAX)
- {
- v0 = malloc(sizeof(unsigned int) * v_len);
- v1 = malloc(sizeof(unsigned int) * v_len);
- }
- else
- {
- v0 = alloca(sizeof(unsigned int) * v_len);
- v1 = alloca(sizeof(unsigned int) * v_len);
- }
-
- for (i = 0; i < v_len; i++)
- v0[i] = i;
-
- for (i = 0; i < la; i++)
- {
- v1[0] = i + 1;
-
- for (j = 0; j < lb; j++)
- {
- unsigned int cost = *(a + i) == *(b + j) ? 0 : 1;
- unsigned int x = *(v1 + j) + 1;
- unsigned int y = *(v0 + j + 1) + 1;
- unsigned int z = *(v0 + j) + cost;
- *(v1 + j + 1) = MIN(x, MIN(y, z));
- }
-
- unsigned int *ptr = v0;
- v0 = v1;
- v1 = ptr;
- }
-
- unsigned int result = *(v0 + lb);
-
- if (v_len > VLEN_MAX)
- {
- free(v0);
- free(v1);
- }
-
- return result;
-}
-
-unsigned int tmetrics_damerau_levenshtein (unsigned int la, uint16_t *a, unsigned int lb, uint16_t *b)
-{
- if (la == 0) return lb;
- if (lb == 0) return la;
-
- unsigned int v_len = lb + 1, *v0, *v1, *v2, i, j;
-
- if (v_len > VLEN_MAX)
- {
- v0 = malloc(sizeof(unsigned int) * v_len);
- v1 = malloc(sizeof(unsigned int) * v_len);
- v2 = malloc(sizeof(unsigned int) * v_len);
- }
- else
- {
- v0 = alloca(sizeof(unsigned int) * v_len);
- v1 = alloca(sizeof(unsigned int) * v_len);
- v2 = alloca(sizeof(unsigned int) * v_len);
- }
-
- for (i = 0; i < v_len; i++)
- v0[i] = i;
-
- for (i = 0; i < la; i++)
- {
- v1[0] = i + 1;
-
- for (j = 0; j < lb; j++)
- {
- unsigned int cost = *(a + i) == *(b + j) ? 0 : 1;
- unsigned int x = *(v1 + j) + 1;
- unsigned int y = *(v0 + j + 1) + 1;
- unsigned int z = *(v0 + j) + cost;
- *(v1 + j + 1) = MIN(x, MIN(y, z));
- unsigned int val = *(v2 + j - 1) + cost;
- if ( i > 0 &&
- j > 0 &&
- *(a + i) == *(b + j - 1) &&
- *(a + i - 1) == *(b + j) &&
- val < *(v1 + j + 1) )
- *(v1 + j + 1) = val;
- }
-
- unsigned int *ptr = v0;
- v0 = v1;
- v1 = v2;
- v2 = ptr;
- }
-
- unsigned int result = *(v0 + lb);
-
- if (v_len > VLEN_MAX)
- {
- free(v0);
- free(v1);
- free(v2);
- }
-
- return result;
-}
-
-/* Other */
-
-unsigned int tmetrics_hamming (unsigned int len, uint16_t *a, uint16_t *b)
-{
- unsigned int acc = 0, i;
- for (i = 0; i < len; i++)
- {
- if (*(a + i) != *(b + i)) acc++;
- }
- return acc;
-}
-
-void tmetrics_jaro (unsigned int *m, unsigned int *t, unsigned int la, uint16_t *a, unsigned int lb, uint16_t *b)
-{
- unsigned int d = 0, i, j, tj = 0, from, to;
- char *v;
-
- *m = 0, *t = 0;
-
- if (la >= 2 && lb >= 2)
- d = MAX(lb, la) / 2 - 1;
-
- if (lb > VLEN_MAX) v = malloc(sizeof(char) * lb);
- else v = alloca(sizeof(char) * lb);
-
- for (i = 0; i < lb; i++) *(v + i) = 0;
-
- for (i = 0; i < la; i++)
- {
- from = i < d ? 0 : i - d;
- to = MIN(i + d + 1, lb);
- for (j = from; j < to; j++)
- {
- if (*(v + j)) continue;
-
- if (*(a + i) == *(b + j))
- {
- if (j < tj) (*t)++;
- else tj = j;
- *(v + j) = 1;
- (*m)++;
- break;
- }
- }
- }
-
- if (lb > VLEN_MAX) free(v);
-}
-
-unsigned int tmetrics_common_prefix (unsigned int la, uint16_t *a, unsigned int lb, uint16_t *b)
-{
- unsigned int acc = 0, i, l = MIN(la, lb);
- for (i = 0; i < l; i++)
- {
- if (*(a + i) == *(b + i)) acc++;
- else break;
- }
- return acc;
-}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/cbits/text_metrics.h new/text-metrics-0.3.0/cbits/text_metrics.h
--- old/text-metrics-0.2.0/cbits/text_metrics.h 2016-10-09 16:43:02.000000000 +0200
+++ new/text-metrics-0.3.0/cbits/text_metrics.h 1970-01-01 01:00:00.000000000 +0100
@@ -1,56 +0,0 @@
-/*
- * This file is part of ‘text-metrics’ package.
- *
- * Copyright © 2016 Mark Karpov
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * * Neither the name Mark Karpov nor the names of contributors may be used to
- * endorse or promote products derived from this software without specific
- * prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
- * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
- * NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef TEXT_METRICS_H
-#define TEXT_METRICS_H
-
-#include
-#include
-
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-#define VLEN_MAX 255 /* Up to this length we use alloca. */
-
-/* Levenshein variants */
-
-unsigned int tmetrics_levenshtein (unsigned int, uint16_t *, unsigned int, uint16_t *);
-unsigned int tmetrics_damerau_levenshtein (unsigned int, uint16_t *, unsigned int, uint16_t *);
-
-/* Other */
-
-unsigned int tmetrics_hamming (unsigned int, uint16_t *, uint16_t *);
-void tmetrics_jaro (unsigned int *, unsigned int *, unsigned int, uint16_t *, unsigned int, uint16_t *);
-unsigned int tmetrics_common_prefix (unsigned int, uint16_t *, unsigned int, uint16_t *);
-
-#endif /* TEXT_METRICS_H */
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/tests/Main.hs new/text-metrics-0.3.0/tests/Main.hs
--- old/text-metrics-0.2.0/tests/Main.hs 2016-10-09 16:43:02.000000000 +0200
+++ new/text-metrics-0.3.0/tests/Main.hs 2017-06-13 10:42:49.000000000 +0200
@@ -1,35 +1,3 @@
---
--- Tests for the ‘text-metrics’ package.
---
--- Copyright © 2016 Mark Karpov
---
--- Redistribution and use in source and binary forms, with or without
--- modification, are permitted provided that the following conditions are
--- met:
---
--- * Redistributions of source code must retain the above copyright notice,
--- this list of conditions and the following disclaimer.
---
--- * Redistributions in binary form must reproduce the above copyright
--- notice, this list of conditions and the following disclaimer in the
--- documentation and/or other materials provided with the distribution.
---
--- * Neither the name Mark Karpov nor the names of contributors may be used
--- to endorse or promote products derived from this software without
--- specific prior written permission.
---
--- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY
--- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
--- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
--- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
--- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
--- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
--- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
--- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
--- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
--- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
--- POSSIBILITY OF SUCH DAMAGE.
-
{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
@@ -62,6 +30,9 @@
testPair levenshtein "cake" "drake" 2
testPair levenshtein "saturday" "sunday" 3
testPair levenshtein "red" "wax" 3
+#if __GLASGOW_HASKELL__ >= 710
+ testPair levenshtein "a😀c" "abc" 1
+#endif
testPair levenshtein "lucky" "lucky" 0
testPair levenshtein "" "" 0
describe "levenshteinNorm" $ do
@@ -70,6 +41,9 @@
testPair levenshteinNorm "cake" "drake" (3 % 5)
testPair levenshteinNorm "saturday" "sunday" (5 % 8)
testPair levenshteinNorm "red" "wax" (0 % 1)
+#if __GLASGOW_HASKELL__ >= 710
+ testPair levenshteinNorm "a😀c" "abc" (2 % 3)
+#endif
testPair levenshteinNorm "lucky" "lucky" (1 % 1)
testPair levenshteinNorm "" "" (1 % 1)
describe "damerauLevenshtein" $ do
@@ -79,6 +53,9 @@
testPair damerauLevenshtein "nose" "ones" 2
testPair damerauLevenshtein "thing" "sign" 3
testPair damerauLevenshtein "red" "wax" 3
+#if __GLASGOW_HASKELL__ >= 710
+ testPair damerauLevenshtein "a😀c" "abc" 1
+#endif
testPair damerauLevenshtein "lucky" "lucky" 0
testPair damerauLevenshtein "" "" 0
describe "damerauLevenshteinNorm" $ do
@@ -88,6 +65,9 @@
testPair damerauLevenshteinNorm "nose" "ones" (1 % 2)
testPair damerauLevenshteinNorm "thing" "sign" (2 % 5)
testPair damerauLevenshteinNorm "red" "wax" (0 % 1)
+#if __GLASGOW_HASKELL__ >= 710
+ testPair damerauLevenshteinNorm "a😀c" "abc" (2 % 3)
+#endif
testPair damerauLevenshteinNorm "lucky" "lucky" (1 % 1)
testPair damerauLevenshteinNorm "" "" (1 % 1)
describe "hamming" $ do
@@ -98,6 +78,9 @@
testPair hamming "2173896" "2233796" (Just 3)
testPair hamming "toned" "roses" (Just 3)
testPair hamming "red" "wax" (Just 3)
+#if __GLASGOW_HASKELL__ >= 710
+ testPair hamming "a😀c" "abc" (Just 1)
+#endif
testPair hamming "lucky" "lucky" (Just 0)
testPair hamming "" "" (Just 0)
testPair hamming "small" "big" Nothing
@@ -117,7 +100,10 @@
testPair jaro "five" "ten" (0 % 1)
testPair jaro "ten" "five" (0 % 1)
testPair jaro "lucky" "lucky" (1 % 1)
- testPair jaro "" "" (1 % 1)
+#if __GLASGOW_HASKELL__ >= 710
+ testPair jaro "a😀c" "abc" (7 % 9)
+#endif
+ testPair jaro "" "" (0 % 1)
describe "jaroWinkler" $ do
testPair jaroWinkler "aa" "a" (17 % 20)
testPair jaroWinkler "a" "aa" (17 % 20)
@@ -134,7 +120,29 @@
testPair jaroWinkler "five" "ten" (0 % 1)
testPair jaroWinkler "ten" "five" (0 % 1)
testPair jaroWinkler "lucky" "lucky" (1 % 1)
- testPair jaroWinkler "" "" (1 % 1)
+#if __GLASGOW_HASKELL__ >= 710
+ testPair jaroWinkler "a😀c" "abc" (4 % 5)
+#endif
+ testPair jaroWinkler "" "" (0 % 1)
+ describe "overlap" $ do
+ testSwap overlap
+ testPair overlap "fly" "butterfly" (1 % 1)
+ testPair overlap "night" "nacht" (3 % 5)
+ testPair overlap "context" "contact" (5 % 7)
+ testPair overlap "red" "wax" (0 % 1)
+#if __GLASGOW_HASKELL__ >= 710
+ testPair overlap "a😀c" "abc" (2 % 3)
+#endif
+ testPair overlap "lucky" "lucky" (1 % 1)
+ describe "jaccard" $ do
+ testSwap jaccard
+ testPair jaccard "xxx" "xyx" (1 % 2)
+ testPair jaccard "night" "nacht" (3 % 7)
+ testPair jaccard "context" "contact" (5 % 9)
+#if __GLASGOW_HASKELL__ >= 710
+ testPair overlap "a😀c" "abc" (2 % 3)
+#endif
+ testPair jaccard "lucky" "lucky" (1 % 1)
-- | Test that given function returns the same results when order of
-- arguments is swapped.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/text-metrics.cabal new/text-metrics-0.3.0/text-metrics.cabal
--- old/text-metrics-0.2.0/text-metrics.cabal 2016-10-09 16:43:36.000000000 +0200
+++ new/text-metrics-0.3.0/text-metrics.cabal 2017-06-13 12:04:50.000000000 +0200
@@ -1,42 +1,11 @@
---
--- Cabal configuration for ‘text-metrics’ package.
---
--- Copyright © 2016 Mark Karpov
---
--- Redistribution and use in source and binary forms, with or without
--- modification, are permitted provided that the following conditions are
--- met:
---
--- * Redistributions of source code must retain the above copyright notice,
--- this list of conditions and the following disclaimer.
---
--- * Redistributions in binary form must reproduce the above copyright
--- notice, this list of conditions and the following disclaimer in the
--- documentation and/or other materials provided with the distribution.
---
--- * Neither the name Mark Karpov nor the names of contributors may be used
--- to endorse or promote products derived from this software without
--- specific prior written permission.
---
--- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY
--- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
--- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
--- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
--- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
--- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
--- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
--- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
--- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
--- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
--- POSSIBILITY OF SUCH DAMAGE.
-
name: text-metrics
-version: 0.2.0
+version: 0.3.0
cabal-version: >= 1.10
+tested-with: GHC==7.8.4, GHC==7.10.3, GHC==8.0.2, GHC==8.2.1
license: BSD3
license-file: LICENSE.md
-author: Mark Karpov
-maintainer: Mark Karpov
+author: Mark Karpov
+maintainer: Mark Karpov
homepage: https://github.com/mrkkrp/text-metrics
bug-reports: https://github.com/mrkkrp/text-metrics/issues
category: Text, Algorithms
@@ -45,7 +14,6 @@
description: Calculate various string metrics efficiently.
extra-doc-files: CHANGELOG.md
, README.md
-extra-source-files: cbits/*.h
source-repository head
type: git
@@ -58,12 +26,10 @@
library
build-depends: base >= 4.7 && < 5.0
+ , containers >= 0.5.6.2 && < 0.6
, text >= 0.2 && < 1.3
- if !impl(ghc >= 7.10)
- build-depends: nats == 1.*
+ , vector >= 0.11 && < 0.13
exposed-modules: Data.Text.Metrics
- c-sources: cbits/text_metrics.c
- include-dirs: cbits
if flag(dev)
ghc-options: -Wall -Werror
else
@@ -78,28 +44,39 @@
, base >= 4.7 && < 5.0
, hspec >= 2.0 && < 3.0
, text >= 0.2 && < 1.3
- , text-metrics >= 0.2.0
- if !impl(ghc >= 7.10)
- build-depends: nats == 1.*
+ , text-metrics
if flag(dev)
ghc-options: -Wall -Werror
else
ghc-options: -O2 -Wall
default-language: Haskell2010
-benchmark bench
+benchmark bench-speed
main-is: Main.hs
- hs-source-dirs: bench
+ hs-source-dirs: bench-speed
type: exitcode-stdio-1.0
- build-depends: base >= 4.7 && < 5.0
- , criterion >= 0.6.2.1 && < 1.2
- , deepseq >= 1.4 && < 1.5
+ build-depends: base >= 4.7 && < 5.0
+ , criterion >= 0.6.2.1 && < 1.3
+ , deepseq >= 1.4 && < 1.5
, text >= 0.2 && < 1.3
- , text-metrics >= 0.2.0
- if !impl(ghc >= 7.10)
- build-depends: nats == 1.*
+ , text-metrics
if flag(dev)
- ghc-options: -Wall -Werror
+ ghc-options: -O2 -Wall -Werror
+ else
+ ghc-options: -O2 -Wall
+ default-language: Haskell2010
+
+benchmark bench-memory
+ main-is: Main.hs
+ hs-source-dirs: bench-memory
+ type: exitcode-stdio-1.0
+ build-depends: base >= 4.7 && < 5.0
+ , deepseq >= 1.4 && < 1.5
+ , text >= 0.2 && < 1.3
+ , text-metrics
+ , weigh >= 0.0.4
+ if flag(dev)
+ ghc-options: -O2 -Wall -Werror
else
ghc-options: -O2 -Wall
default-language: Haskell2010