;;; dump/string-similarity.scm --- trigram-based string similarity
;;; (Jaccard index).
;;;
;;; Source blob: 4bcdf7c1426b1c8053c4c3cae293f7a539a2aebd
;; Approximate string matching: strings are compared by the Jaccard
;; index of their trigram lists.
(define-module (dump string-similarity)
  ;; SRFI-1 provides lset-intersection, used to intersect trigram lists.
  #:use-module (srfi srfi-1)
  ;; Only the predicate is exported; the other definitions in this
  ;; module are not.
  #:export (jaccard-string-similar?))

(define (trigrams str)
  "Return the list of every three-character substring of STR, in order
of starting position.  Strings shorter than three characters yield the
empty list."
  ;; Walk the start positions from last to first, consing as we go, so
  ;; the accumulated list comes out in left-to-right order without a
  ;; final reverse.  When STR has fewer than three characters the first
  ;; position is already negative and the empty list is returned.
  (let loop ((position (- (string-length str) 3))
             (collected '()))
    (if (negative? position)
        collected
        (loop (- position 1)
              (cons (substring str position (+ position 3))
                    collected)))))

(define (jaccard-index set1 set2)
  "Return the Jaccard similarity coefficient between lists SET1 and
SET2, that is, the size of their intersection divided by the size of
their union.  The lists are treated as sets: elements are compared
with equal? and an element that occurs several times counts once.
The result is an inexact number, except that the similarity between
two null sets is defined to be the exact integer 0."
  ;; Deduplicate before counting.  lset-intersection keeps every
  ;; element of its first list that also occurs in the second,
  ;; duplicates included, so on raw lists the coefficient would depend
  ;; on argument order and could exceed 1.
  (let ((unique1 (delete-duplicates set1 equal?))
        (unique2 (delete-duplicates set2 equal?)))
    (if (and (null? unique1)
             (null? unique2))
        0
        (let ((length-of-intersection
               (length (lset-intersection equal? unique1 unique2))))
          (exact->inexact
           (/ length-of-intersection
              ;; Size of the union by inclusion-exclusion.
              (- (+ (length unique1) (length unique2))
                 length-of-intersection)))))))

(define (jaccard-string-similarity str1 str2)
  "Return the Jaccard index of the trigram lists of strings STR1 and
STR2.  Both strings are downcased first, so the comparison ignores
letter case."
  (let* ((lowered1 (string-downcase str1))
         (lowered2 (string-downcase str2))
         (trigrams1 (trigrams lowered1))
         (trigrams2 (trigrams lowered2)))
    (jaccard-index trigrams1 trigrams2)))

(define* (jaccard-string-similar? str1 str2
                                  #:optional (similarity-threshold 0.8))
  "Return #t if STR1 and STR2 have a trigram similarity strictly
greater than SIMILARITY-THRESHOLD. Else, return #f. The Jaccard index
is used as the similarity metric.

SIMILARITY-THRESHOLD is optional and defaults to 0.8, so existing
two-argument calls behave as before."
  (> (jaccard-string-similarity str1 str2)
     similarity-threshold))