diff options
author | Munyoki Kilyungi | 2023-08-21 15:03:20 +0300 |
---|---|---|
committer | Munyoki Kilyungi | 2023-08-21 15:06:06 +0300 |
commit | 8e1e4cceab516afab46ccced63ca9edab663ca11 (patch) | |
tree | cad625c3ecf0a555d7b56b777cdade535cb30d07 /transform/string-similarity.scm | |
parent | 51b3c0548c98e0bc05e11a89cbf6b75d31b9f8d5 (diff) | |
download | gn-transform-databases-8e1e4cceab516afab46ccced63ca9edab663ca11.tar.gz |
Rename dump -> transform
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
Diffstat (limited to 'transform/string-similarity.scm')
-rw-r--r-- | transform/string-similarity.scm | 37 |
1 files changed, 37 insertions, 0 deletions
;;; transform/string-similarity.scm  (new file, mode 100644, blob c715856)
;;;
;;; Trigram-based string similarity using the Jaccard index.

(define-module (transform string-similarity)
  #:use-module (srfi srfi-1)
  #:export (jaccard-string-similar?))

(define (trigrams str)
  "Return the list of all trigrams (3-character substrings) in STR, in
order of occurrence.  Repeated trigrams appear once per occurrence.
Strings shorter than 3 characters have no trigrams, so '() is
returned for them."
  (if (< (string-length str) 3)
      '()
      (map (lambda (start)
             (substring str start (+ start 3)))
           ;; A string of length N has N - 2 trigram start positions.
           (iota (- (string-length str) 2)))))

(define (jaccard-index set1 set2)
  "Return the Jaccard similarity coefficient between lists SET1 and
SET2: the size of their intersection divided by the size of their
union.  Elements are compared with equal?.  Duplicate elements within
either list are ignored, so the result is symmetric in its arguments
and never exceeds 1.  Similarity between null sets is defined to be
0."
  ;; Treat the lists as true sets.  Without deduplication,
  ;; lset-intersection keeps every repeated element of SET1 that occurs
  ;; in SET2, which inflates the intersection count and can yield an
  ;; index above 1 that also depends on argument order.
  (let ((set1 (delete-duplicates set1 equal?))
        (set2 (delete-duplicates set2 equal?)))
    (if (and (null? set1)
             (null? set2))
        0
        (let ((length-of-intersection
               (length (lset-intersection equal? set1 set2))))
          (exact->inexact
           (/ length-of-intersection
              ;; |A u B| = |A| + |B| - |A n B|
              (- (+ (length set1) (length set2))
                 length-of-intersection)))))))

(define (jaccard-string-similarity str1 str2)
  "Return the trigram similarity between strings STR1 and STR2 as
defined by the Jaccard index.  Both strings are downcased first, so
the comparison is case-insensitive."
  (jaccard-index (trigrams (string-downcase str1))
                 (trigrams (string-downcase str2))))

(define (jaccard-string-similar? str1 str2)
  "Return #t if STR1 and STR2 have a trigram similarity greater than
0.8.  Else, return #f.  The Jaccard index is used as the similarity
metric."
  (let ((similarity-threshold 0.8))
    (> (jaccard-string-similarity str1 str2)
       similarity-threshold)))