aboutsummaryrefslogtreecommitdiff
path: root/transform/string-similarity.scm
diff options
context:
space:
mode:
authorMunyoki Kilyungi2023-08-21 15:03:20 +0300
committerMunyoki Kilyungi2023-08-21 15:06:06 +0300
commit8e1e4cceab516afab46ccced63ca9edab663ca11 (patch)
treecad625c3ecf0a555d7b56b777cdade535cb30d07 /transform/string-similarity.scm
parent51b3c0548c98e0bc05e11a89cbf6b75d31b9f8d5 (diff)
downloadgn-transform-databases-8e1e4cceab516afab46ccced63ca9edab663ca11.tar.gz
Rename dump -> transform
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
Diffstat (limited to 'transform/string-similarity.scm')
-rw-r--r--transform/string-similarity.scm37
1 files changed, 37 insertions, 0 deletions
diff --git a/transform/string-similarity.scm b/transform/string-similarity.scm
new file mode 100644
index 0000000..c715856
--- /dev/null
+++ b/transform/string-similarity.scm
@@ -0,0 +1,37 @@
+(define-module (transform string-similarity)
+ #:use-module (srfi srfi-1)
+ #:export (jaccard-string-similar?))
+
+(define (trigrams str)
+  "Return the list of every 3-character substring of STR, in order of
+occurrence.  A string shorter than three characters yields '()."
+  (let loop ((end (string-length str)) (acc '()))
+    (if (< end 3)
+        acc
+        (loop (- end 1) (cons (substring str (- end 3) end) acc)))))
+
+(define (jaccard-index set1 set2)
+  "Return the Jaccard similarity coefficient between lists SET1 and
+SET2.  Duplicate elements are ignored, so the result is symmetric and
+lies within [0, 1].  Similarity between null sets is defined to be 0."
+  (let* ((s1 (delete-duplicates set1 equal?))
+         (s2 (delete-duplicates set2 equal?))
+         (common (length (lset-intersection equal? s1 s2)))
+         (total (- (+ (length s1) (length s2)) common)))
+    (if (zero? total)
+        0
+        (exact->inexact (/ common total)))))
+
+(define (jaccard-string-similarity str1 str2)
+  "Return the Jaccard index of the trigram lists of STR1 and STR2,
+compared case-insensitively (both strings are downcased first)."
+  (let ((downcased-trigrams (lambda (s) (trigrams (string-downcase s)))))
+    (jaccard-index (downcased-trigrams str1) (downcased-trigrams str2))))
+
+(define (jaccard-string-similar? str1 str2)
+  "Predicate: #t when the Jaccard trigram similarity of STR1 and STR2
+strictly exceeds 0.8, and #f otherwise (a score of exactly 0.8 is not
+considered similar)."
+  (define minimum-score 0.8)
+  (define score (jaccard-string-similarity str1 str2))
+  (< minimum-score score))