diff options
Diffstat (limited to 'dump/string-similarity.scm')
-rw-r--r-- | dump/string-similarity.scm | 37 |
1 files changed, 0 insertions, 37 deletions
;;; dump/string-similarity.scm --- trigram-based fuzzy string comparison.
;;;
;;; Provenance: recovered from the diff that deleted this file
;;; (git blob 4bcdf7c, mode 100644, 37 lines removed).

(define-module (dump string-similarity)
  #:use-module (srfi srfi-1)
  #:export (jaccard-string-similar?))

(define (trigrams str)
  "Return the list of all trigrams (substrings of length 3) in STR, in
order of occurrence.  The list may contain duplicates.  Strings shorter
than 3 characters have no trigrams, so the empty list is returned."
  (if (< (string-length str) 3)
      '()
      (map (lambda (start)
             (substring str start (+ start 3)))
           ;; A string of length N has N - 2 trigram start positions.
           (iota (- (string-length str) 2)))))

(define (jaccard-index set1 set2)
  "Return the Jaccard similarity coefficient |A n B| / |A u B| between
the lists SET1 and SET2, treated as sets (duplicates are ignored).  The
result is an inexact number in [0, 1], except that the similarity
between two null sets is defined to be the exact integer 0."
  ;; The inputs are plain lists.  Without deduplication, repeated
  ;; elements inflate the intersection count (lset-intersection keeps
  ;; every element of its first list that occurs in the second), which
  ;; made the index exceed 1 and depend on argument order.
  (let ((set1 (delete-duplicates set1 equal?))
        (set2 (delete-duplicates set2 equal?)))
    (if (and (null? set1)
             (null? set2))
        0
        (let ((length-of-intersection
               (length (lset-intersection equal? set1 set2))))
          (exact->inexact
           (/ length-of-intersection
              ;; |A u B| = |A| + |B| - |A n B|
              (- (+ (length set1) (length set2))
                 length-of-intersection)))))))

(define (jaccard-string-similarity str1 str2)
  "Return the trigram similarity between strings STR1 and STR2 as
defined by the Jaccard index.  The comparison is case-insensitive: both
strings are downcased before their trigrams are extracted."
  (jaccard-index (trigrams (string-downcase str1))
                 (trigrams (string-downcase str2))))

(define (jaccard-string-similar? str1 str2)
  "Return #t if STR1 and STR2 have a trigram similarity strictly greater
than 0.8.  Else, return #f.  The Jaccard index is used as the similarity
metric, so strings shorter than 3 characters are never similar to
anything."
  (let ((similarity-threshold 0.8))
    (> (jaccard-string-similarity str1 str2)
       similarity-threshold)))