diff options
author | Arun Isaac | 2021-12-15 17:27:48 +0530 |
---|---|---|
committer | Arun Isaac | 2021-12-15 17:27:48 +0530 |
commit | 1aad2a2f50dc51b1cd89eb2cac0944b7c87e6c61 (patch) | |
tree | 57565ae75b8e653f309f69675a0dd2a5d37acf64 | |
parent | b2541ba75174826abf27a49ea15c96975caf6c68 (diff) | |
download | gn-transform-databases-1aad2a2f50dc51b1cd89eb2cac0944b7c87e6c61.tar.gz |
Move string similarity functions to separate module.
* dump.scm: Use (dump string-similarity).
(trigrams, jaccard-index, jaccard-string-similarity,
jaccard-string-similar?): Move to ...
* dump/string-similarity.scm: ... here.
-rwxr-xr-x | dump.scm | 35 | ||||
-rw-r--r-- | dump/string-similarity.scm | 37 |
2 files changed, 38 insertions, 34 deletions
```diff
@@ -11,6 +11,7 @@
              (ice-9 string-fun)
              (sxml simple)
              (dump sql)
+             (dump string-similarity)
              (dump utils))
@@ -568,40 +569,6 @@ case-insensitive."
                  color-scheme
                  (1+ (min (floor-log1024 bytes) 3)))))
 
-(define (trigrams str)
-  "Return all trigrams in STR."
-  (if (< (string-length str) 3)
-      '()
-      (map (lambda (start)
-             (substring str start (+ start 3)))
-           (iota (- (string-length str) 2)))))
-
-(define (jaccard-index set1 set2)
-  "Return the Jaccard similarity coefficient between lists SET1 and
-SET2. Similarity between null sets is defined to be 0."
-  (if (and (null? set1)
-           (null? set2))
-      0
-      (let ((length-of-intersection (length (lset-intersection equal? set1 set2))))
-        (exact->inexact
-         (/ length-of-intersection
-            (- (+ (length set1) (length set2))
-               length-of-intersection))))))
-
-(define (jaccard-string-similarity str1 str2)
-  "Return the trigram similarity between strings STR1 and STR2 as
-defined by the Jaccard index."
-  (jaccard-index (trigrams (string-downcase str1))
-                 (trigrams (string-downcase str2))))
-
-(define (jaccard-string-similar? str1 str2)
-  "Return #t if STR1 and STR2 have a trigram similarity greater than
-0.8. Else, return #f. The Jaccard index is used as the similarity
-metric."
-  (let ((similarity-threshold 0.8))
-    (> (jaccard-string-similarity str1 str2)
-       similarity-threshold)))
-
 (define (sxml->xml-string tree)
   "Serialize sxml TREE to a string. Return the serialized string."
   (call-with-output-string
diff --git a/dump/string-similarity.scm b/dump/string-similarity.scm
new file mode 100644
index 0000000..4bcdf7c
--- /dev/null
+++ b/dump/string-similarity.scm
@@ -0,0 +1,37 @@
+(define-module (dump string-similarity)
+  #:use-module (srfi srfi-1)
+  #:export (jaccard-string-similar?))
+
+(define (trigrams str)
+  "Return all trigrams in STR."
+  (if (< (string-length str) 3)
+      '()
+      (map (lambda (start)
+             (substring str start (+ start 3)))
+           (iota (- (string-length str) 2)))))
+
+(define (jaccard-index set1 set2)
+  "Return the Jaccard similarity coefficient between lists SET1 and
+SET2. Similarity between null sets is defined to be 0."
+  (if (and (null? set1)
+           (null? set2))
+      0
+      (let ((length-of-intersection (length (lset-intersection equal? set1 set2))))
+        (exact->inexact
+         (/ length-of-intersection
+            (- (+ (length set1) (length set2))
+               length-of-intersection))))))
+
+(define (jaccard-string-similarity str1 str2)
+  "Return the trigram similarity between strings STR1 and STR2 as
+defined by the Jaccard index."
+  (jaccard-index (trigrams (string-downcase str1))
+                 (trigrams (string-downcase str2))))
+
+(define (jaccard-string-similar? str1 str2)
+  "Return #t if STR1 and STR2 have a trigram similarity greater than
+0.8. Else, return #f. The Jaccard index is used as the similarity
+metric."
+  (let ((similarity-threshold 0.8))
+    (> (jaccard-string-similarity str1 str2)
+       similarity-threshold)))
```