aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Arun Isaac 2021-12-15 17:27:48 +0530
committer Arun Isaac 2021-12-15 17:27:48 +0530
commit 1aad2a2f50dc51b1cd89eb2cac0944b7c87e6c61 (patch)
tree 57565ae75b8e653f309f69675a0dd2a5d37acf64
parent b2541ba75174826abf27a49ea15c96975caf6c68 (diff)
download gn-transform-databases-1aad2a2f50dc51b1cd89eb2cac0944b7c87e6c61.tar.gz
Move string similarity functions to separate module.
* dump.scm: Use (dump string-similarity).
(trigrams, jaccard-index, jaccard-string-similarity, jaccard-string-similar?): Move to ...
* dump/string-similarity.scm: ... here.
-rwxr-xr-x dump.scm 35
-rw-r--r-- dump/string-similarity.scm 37
2 files changed, 38 insertions, 34 deletions
diff --git a/dump.scm b/dump.scm
index fb6b85f..51abdeb 100755
--- a/dump.scm
+++ b/dump.scm
@@ -11,6 +11,7 @@
(ice-9 string-fun)
(sxml simple)
(dump sql)
+ (dump string-similarity)
(dump utils))
@@ -568,40 +569,6 @@ case-insensitive."
color-scheme
(1+ (min (floor-log1024 bytes) 3)))))
-(define (trigrams str)
- "Return all trigrams in STR."
- (if (< (string-length str) 3)
- '()
- (map (lambda (start)
- (substring str start (+ start 3)))
- (iota (- (string-length str) 2)))))
-
-(define (jaccard-index set1 set2)
- "Return the Jaccard similarity coefficient between lists SET1 and
-SET2. Similarity between null sets is defined to be 0."
- (if (and (null? set1)
- (null? set2))
- 0
- (let ((length-of-intersection (length (lset-intersection equal? set1 set2))))
- (exact->inexact
- (/ length-of-intersection
- (- (+ (length set1) (length set2))
- length-of-intersection))))))
-
-(define (jaccard-string-similarity str1 str2)
- "Return the trigram similarity between strings STR1 and STR2 as
-defined by the Jaccard index."
- (jaccard-index (trigrams (string-downcase str1))
- (trigrams (string-downcase str2))))
-
-(define (jaccard-string-similar? str1 str2)
- "Return #t if STR1 and STR2 have a trigram similarity greater than
-0.8. Else, return #f. The Jaccard index is used as the similarity
-metric."
- (let ((similarity-threshold 0.8))
- (> (jaccard-string-similarity str1 str2)
- similarity-threshold)))
-
(define (sxml->xml-string tree)
"Serialize sxml TREE to a string. Return the serialized string."
(call-with-output-string
diff --git a/dump/string-similarity.scm b/dump/string-similarity.scm
new file mode 100644
index 0000000..4bcdf7c
--- /dev/null
+++ b/dump/string-similarity.scm
@@ -0,0 +1,37 @@
+(define-module (dump string-similarity)
+ #:use-module (srfi srfi-1)
+ #:export (jaccard-string-similar?))
+
+(define (trigrams str)
+  "Return the list of all three-character substrings of STR, in order.
+Return the empty list if STR is shorter than three characters."
+  (let ((len (string-length str)))
+    (if (< len 3)
+        '()
+        (map (lambda (i) (substring str i (+ i 3))) (iota (- len 2))))))
+
+(define (jaccard-index set1 set2)
+  "Return the Jaccard similarity coefficient between lists SET1 and
+SET2. Duplicate elements are counted once, so the result is symmetric
+and never exceeds 1. Similarity between null sets is defined to be 0."
+  (let* ((unique1 (delete-duplicates set1))
+         (unique2 (delete-duplicates set2))
+         (common (length (lset-intersection equal? unique1 unique2))))
+    (if (and (null? unique1) (null? unique2))
+        0
+        (exact->inexact
+         (/ common (- (+ (length unique1) (length unique2)) common))))))
+
+(define (jaccard-string-similarity str1 str2)
+  "Return the Jaccard index of the trigram lists of STR1 and STR2. The
+comparison is case-insensitive: both strings are downcased first."
+  (let ((lowercase-trigrams (lambda (str) (trigrams (string-downcase str)))))
+    (jaccard-index (lowercase-trigrams str1) (lowercase-trigrams str2))))
+
+(define (jaccard-string-similar? str1 str2)
+  "Return #t if the trigram similarity of STR1 and STR2, measured by
+the Jaccard index, is strictly greater than 0.8. Return #f
+otherwise."
+  (let ((similarity (jaccard-string-similarity str1 str2))
+        (similarity-threshold 0.8))
+    (< similarity-threshold similarity)))