about summary refs log tree commit diff
path: root/dump.scm
diff options
context:
space:
mode:
Diffstat (limited to 'dump.scm')
-rwxr-xr-xdump.scm35
1 files changed, 1 insertions, 34 deletions
diff --git a/dump.scm b/dump.scm
index fb6b85f..51abdeb 100755
--- a/dump.scm
+++ b/dump.scm
@@ -11,6 +11,7 @@
              (ice-9 string-fun)
              (sxml simple)
              (dump sql)
+             (dump string-similarity)
              (dump utils))
 
 
@@ -568,40 +569,6 @@ case-insensitive."
             color-scheme
             (1+ (min (floor-log1024 bytes) 3)))))
 
-(define (trigrams str)
-  "Return all trigrams in STR."
-  (if (< (string-length str) 3)
-      '()
-      (map (lambda (start)
-             (substring str start (+ start 3)))
-           (iota (- (string-length str) 2)))))
-
-(define (jaccard-index set1 set2)
-  "Return the Jaccard similarity coefficient between lists SET1 and
-SET2. Similarity between null sets is defined to be 0."
-  (if (and (null? set1)
-           (null? set2))
-      0
-      (let ((length-of-intersection (length (lset-intersection equal? set1 set2))))
-        (exact->inexact
-         (/ length-of-intersection
-            (- (+ (length set1) (length set2))
-               length-of-intersection))))))
-
-(define (jaccard-string-similarity str1 str2)
-  "Return the trigram similarity between strings STR1 and STR2 as
-defined by the Jaccard index."
-  (jaccard-index (trigrams (string-downcase str1))
-                 (trigrams (string-downcase str2))))
-
-(define (jaccard-string-similar? str1 str2)
-  "Return #t if STR1 and STR2 have a trigram similarity greater than
-0.8. Else, return #f. The Jaccard index is used as the similarity
-metric."
-  (let ((similarity-threshold 0.8))
-    (> (jaccard-string-similarity str1 str2)
-       similarity-threshold)))
-
 (define (sxml->xml-string tree)
   "Serialize sxml TREE to a string. Return the serialized string."
   (call-with-output-string