diff options
Diffstat (limited to 'transform')
| -rw-r--r-- | transform/schema.scm (renamed from transform/schema-dump.scm) | 20 | ||||
| -rw-r--r-- | transform/special-forms.scm | 70 | ||||
| -rw-r--r-- | transform/sql.scm | 19 | ||||
| -rw-r--r-- | transform/strings.scm | 23 | ||||
| -rw-r--r-- | transform/triples.scm | 24 | ||||
| -rw-r--r-- | transform/uuid.scm | 234 |
6 files changed, 88 insertions, 302 deletions
diff --git a/transform/schema-dump.scm b/transform/schema.scm index 18df5da..f3896a7 100644 --- a/transform/schema-dump.scm +++ b/transform/schema.scm @@ -4,7 +4,13 @@ #:use-module (transform sql) #:use-module (transform triples) #:use-module (transform strings) - #:use-module (transform table)) + #:use-module (transform table) + #:export (table-fields + get-tables-from-comments + schema-annotations + tables + schema + data-table)) (define (table-fields db table) @@ -47,7 +53,7 @@ (for-each (cut table-fields db <>) (get-tables-from-comments db))))) -(define (tables db) +(define (tables connection-settings db) "Return list of all tables in DB. Each element of the returned list is a <table> object." (map (lambda (table) @@ -68,7 +74,7 @@ is a <table> object." (information_schema.tables data_length)) (information_schema.tables) (format #f "WHERE table_schema = '~a'" - (assq-ref %connection-settings 'sql-database)))))) + (assq-ref connection-settings 'sql-database)))))) (define (schema db) (let ((tables (tables db))) @@ -83,14 +89,14 @@ is a <table> object." (table-name table))))) (triple table-id 'rdf:type 'gn:sqlTable) (triple table-id 'gn:name (table-name table)) - (triple table-id 'gn:hasSize (table-size table)) + (triple table-id 'gn:has_size (table-size table)) (for-each (lambda (column) (let ((column-id (column-id (table-name table) (column-name column)))) - (triple column-id 'rdf:type 'gn:sqlTableField) + (triple column-id 'rdf:type 'gn:sql_table_field) (triple column-id 'gn:name (column-name column)) - (triple column-id 'gn:sqlFieldType (column-type column)) - (triple table-id 'gn:hasField column-id))) + (triple column-id 'gn:sql_field_type (column-type column)) + (triple table-id 'gn:has_field column-id))) (table-columns table)))) tables))) diff --git a/transform/special-forms.scm b/transform/special-forms.scm index 99b30df..ddb3180 100644 --- a/transform/special-forms.scm +++ b/transform/special-forms.scm @@ -537,40 +537,42 @@ The above query results to triples that have the form: (call-with-target-database connection (lambda (db) - (with-output-to-file ; - doc-path - (lambda () - (format #t "# ~a" name) - (for-each - (lambda (proc) - (proc db - #:metadata? #f - #:data? #f - #:documentation? - (lambda () (for-each - (match-lambda - ((k v) - (begin - (prefix k v #f)))) - prefixes)))) - inputs)) - #:encoding "UTF-8") + (when doc-path + (with-output-to-file ; + doc-path + (lambda () + (format #t "# ~a" name) + (for-each + (lambda (proc) + (proc db + #:metadata? #f + #:data? #f + #:documentation? + (lambda () (for-each + (match-lambda + ((k v) + (begin + (prefix k v #f)))) + prefixes)))) + inputs)) + #:encoding "UTF-8")) ;; Dumping the actual data - (with-output-to-file - rdf-path - (lambda () - ;; Add the prefixes - (for-each - (match-lambda - ((k v) - (begin - (prefix k v)))) - prefixes) - (newline) - (for-each - (lambda (proc) - (proc db #:metadata? table-metadata?)) - inputs)) - #:encoding "UTF-8"))))))) + (when rdf-path + (with-output-to-file + rdf-path + (lambda () + ;; Add the prefixes + (for-each + (match-lambda + ((k v) + (begin + (prefix k v)))) + prefixes) + (newline) + (for-each + (lambda (proc) + (proc db #:metadata? table-metadata?)) + inputs)) + #:encoding "UTF-8")))))))) diff --git a/transform/sql.scm b/transform/sql.scm index a8962c8..daedf97 100644 --- a/transform/sql.scm +++ b/transform/sql.scm @@ -102,13 +102,14 @@ (dbi-get_row db)) (define (call-with-target-database connection-settings proc) - (call-with-database "mysql" (string-join - (list (assq-ref connection-settings 'sql-username) - (assq-ref connection-settings 'sql-password) - (assq-ref connection-settings 'sql-database) - "tcp" - (assq-ref connection-settings 'sql-host) - (number->string - (assq-ref connection-settings 'sql-port))) - ":") + (call-with-database "mysql" (string-append (string-join + (list (assq-ref connection-settings 'sql-username) + (assq-ref connection-settings 'sql-password) + (assq-ref connection-settings 'sql-database) + "tcp" + (assq-ref connection-settings 'sql-host) + (number->string + (assq-ref connection-settings 'sql-port))) + ":") + "?charset=utf8") proc)) diff --git a/transform/strings.scm b/transform/strings.scm index 7545f62..aba554a 100644 --- a/transform/strings.scm +++ b/transform/strings.scm @@ -11,11 +11,11 @@ delete-substrings replace-substrings remove-duplicates - remap-species-identifiers str sanitize-rdf-string snake->lower-camel lower-case-and-replace-spaces - string-capitalize-first)) + string-capitalize-first + normalize-string-field)) (define (lower-case-and-replace-spaces str) (string-map @@ -121,13 +121,12 @@ association list mapping substrings to their replacements." ((memq (car lst) result) (loop (cdr lst) result)) (else (loop (cdr lst) (cons (car lst) result)))))) - -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" - (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) +(define (normalize-string-field field) + (let ((field (string-trim-both field))) + (match field + ((? string? field) + (if (or (string-blank? field) + (string=? (string-downcase field) "none")) + "" + field)) + (_ "")))) diff --git a/transform/triples.scm b/transform/triples.scm index 9775d36..13758e5 100644 --- a/transform/triples.scm +++ b/transform/triples.scm @@ -8,8 +8,19 @@ triple scm->triples annotate-field + remap-species-identifiers string->binomial-name)) +(define (remap-species-identifiers str) + "This procedure remaps identifiers to standard binominal. Obviously this should + be sorted by correcting the database!" + (match str + ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] + ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] + ["Macaca mulatta" "Macaca nemestrina"] + ["Bat (Glossophaga soricina)" "Glossophaga soricina"] + [str str])) + (define (annotate-field field schema) (let ([schema (cond ((symbol? schema) (symbol->string schema)) @@ -28,7 +39,7 @@ #:optional #:key (ontology "gn:") (separator "") - (proc string-capitalize-first)) + (proc (lambda (x) x))) "Convert STR to a turtle identifier after replacing illegal characters with an underscore and prefixing with gn:PREFIX." (if (or (and (string? str) (string-null? str)) @@ -40,11 +51,12 @@ characters with an underscore and prefixing with gn:PREFIX." (lambda (c) (eq? c #\))) (string-map (lambda (c) - (case c - ((#\/ #\< #\> #\+ #\( #\space #\@) #\_) - (else c))) - (proc - (string-trim-right str #\.)))))))) + (if (or (char-alphabetic? c) + (char-numeric? c) + (char=? c #\_)) + c + #\_)) + (proc str))))))) (define* (prefix prefix iri #:optional (ttl? #t)) diff --git a/transform/uuid.scm b/transform/uuid.scm deleted file mode 100644 index be0e592..0000000 --- a/transform/uuid.scm +++ /dev/null @@ -1,234 +0,0 @@ -;; CREDIT: https://lists.gnu.org/archive/html/guile-user/2018-01/msg00019.html -(define-module (transform uuid) - #:use-module (srfi srfi-1) - #:use-module (srfi srfi-11) - #:use-module (rnrs bytevectors) - #:use-module (ice-9 iconv) - #:export (bytevector->md5 - make-version-3-uuid)) - -(define (bytevector->md5 bytevector) - "Convert BYTEVECTOR to a bytevector containing the MD5 hash of -BYTEVECTOR." - ;; Implemented along RFC 1321. It should be easy to verify that - ;; this procedure performs the operations specified therein. - (define (append-padding-bits bytevector) - "Makes a list from BYTEVECTOR with padding as per RFC 1321 3.1." - (let* ((length-in-bits (* 8 (bytevector-length bytevector))) - (padding-bits (- 512 (modulo (- length-in-bits 448) 512)))) - (append (bytevector->u8-list bytevector) - '(128) ; #*10000000 - (iota - (- (/ padding-bits 8) 1) - 0 0)))) - (define (append-length msg-list message-length) - "Append MESSAGE-LENGTH as 8 byte values from a uint64 to MSG-LIST." - (append msg-list - ;; For numbers too large for an uint64, only the low-order - ;; bytes are returned. - (bytevector->u8-list (u64vector - (modulo - (* message-length 8) ; bits - (1+ #xffffffffffffffff)))))) - (let hash ((AA #x67452301) - (BB #xefcdab89) - (CC #x98badcfe) - (DD #x10325476) - (to-digest - (append-length - (append-padding-bits - bytevector) - (bytevector-length bytevector)))) - (define (F X Y Z) - (logior (logand X Y) (logand (lognot X) Z))) - (define (G X Y Z) - (logior (logand X Z) (logand Y (lognot Z)))) - (define (H X Y Z) - (logxor X Y Z)) - (define (I X Y Z) - (logxor Y (logior X (lognot Z)))) - (define (T i) - (inexact->exact (floor (* 4294967296 (abs (sin i)))))) - (define (number->u32 n) - "Cut off all bits that do not fit in a uint32." - (bit-extract n 0 32)) - (define (lsh32 n count) - (number->u32 (logior (ash n count) - (bit-extract n (- 32 count) 32)))) - (if (not (null? to-digest)) - (let* ((block (u8-list->bytevector - (list-head to-digest (/ 512 8)))) - (X (lambda (j) (bytevector-u32-ref - block (* 4 j) (endianness little)))) - (do-round1 - (lambda (A B C D) - (define (operation a b c d k s i) - (number->u32 - (+ b (lsh32 (+ a (F b c d) (X k) (T i)) s)))) - (let* ((A (operation A B C D 0 7 1)) - (D (operation D A B C 1 12 2)) - (C (operation C D A B 2 17 3)) - (B (operation B C D A 3 22 4)) - (A (operation A B C D 4 7 5)) - (D (operation D A B C 5 12 6)) - (C (operation C D A B 6 17 7)) - (B (operation B C D A 7 22 8)) - (A (operation A B C D 8 7 9)) - (D (operation D A B C 9 12 10)) - (C (operation C D A B 10 17 11)) - (B (operation B C D A 11 22 12)) - (A (operation A B C D 12 7 13)) - (D (operation D A B C 13 12 14)) - (C (operation C D A B 14 17 15)) - (B (operation B C D A 15 22 16))) - (values A B C D)))) - (do-round2 - (lambda (A B C D) - (define (operation a b c d k s i) - (number->u32 - (+ b (lsh32 (+ a (G b c d) (X k) (T i)) s)))) - (let* ((A (operation A B C D 1 5 17)) - (D (operation D A B C 6 9 18)) - (C (operation C D A B 11 14 19)) - (B (operation B C D A 0 20 20)) - (A (operation A B C D 5 5 21)) - (D (operation D A B C 10 9 22)) - (C (operation C D A B 15 14 23)) - (B (operation B C D A 4 20 24)) - (A (operation A B C D 9 5 25)) - (D (operation D A B C 14 9 26)) - (C (operation C D A B 3 14 27)) - (B (operation B C D A 8 20 28)) - (A (operation A B C D 13 5 29)) - (D (operation D A B C 2 9 30)) - (C (operation C D A B 7 14 31)) - (B (operation B C D A 12 20 32))) - (values A B C D)))) - (do-round3 - (lambda (A B C D) - (define (operation a b c d k s i) - (number->u32 - (+ b (lsh32 (+ a (H b c d) (X k) (T i)) s)))) - (let* ((A (operation A B C D 5 4 33)) - (D (operation D A B C 8 11 34)) - (C (operation C D A B 11 16 35)) - (B (operation B C D A 14 23 36)) - (A (operation A B C D 1 4 37)) - (D (operation D A B C 4 11 38)) - (C (operation C D A B 7 16 39)) - (B (operation B C D A 10 23 40)) - (A (operation A B C D 13 4 41)) - (D (operation D A B C 0 11 42)) - (C (operation C D A B 3 16 43)) - (B (operation B C D A 6 23 44)) - (A (operation A B C D 9 4 45)) - (D (operation D A B C 12 11 46)) - (C (operation C D A B 15 16 47)) - (B (operation B C D A 2 23 48))) - (values A B C D)))) - (do-round4 - (lambda (A B C D) - (define (operation a b c d k s i) - (number->u32 - (+ b (lsh32 (+ a (I b c d) (X k) (T i)) s)))) - (let* ((A (operation A B C D 0 6 49)) - (D (operation D A B C 7 10 50)) - (C (operation C D A B 14 15 51)) - (B (operation B C D A 5 21 52)) - (A (operation A B C D 12 6 53)) - (D (operation D A B C 3 10 54)) - (C (operation C D A B 10 15 55)) - (B (operation B C D A 1 21 56)) - (A (operation A B C D 8 6 57)) - (D (operation D A B C 15 10 58)) - (C (operation C D A B 6 15 59)) - (B (operation B C D A 13 21 60)) - (A (operation A B C D 4 6 61)) - (D (operation D A B C 11 10 62)) - (C (operation C D A B 2 15 63)) - (B (operation B C D A 9 21 64))) - (values A B C D))))) - (let*-values (((A B C D) (values AA BB CC DD)) - ((A B C D) (do-round1 A B C D)) - ((A B C D) (do-round2 A B C D)) - ((A B C D) (do-round3 A B C D)) - ((A B C D) (do-round4 A B C D))) - (hash (number->u32 (+ A AA)) - (number->u32 (+ B BB)) - (number->u32 (+ C CC)) - (number->u32 (+ D DD)) - (list-tail to-digest (/ 512 8))))) - ;; we’re done: - (u8-list->bytevector - (append - (bytevector->u8-list (u32vector AA)) - (bytevector->u8-list (u32vector BB)) - (bytevector->u8-list (u32vector CC)) - (bytevector->u8-list (u32vector DD))))))) - -(define* (make-version-3-uuid namespace-uuid str #:optional (prefix "urn:uuid:")) - "Generates a UUID string by computing the MD5 hash of NAMESPACE-UUID -and STR. NAMESPACE-UUID must be a bytevector consisting of the UUID’s -bytes, *not* the UUID’s string representation." - (define (half-byte->hex-char number) - "Returns the corresponding hexadecimal digit for a number NUMBER -between 0 and 15." - (case number - ((0) #\0) - ((1) #\1) - ((2) #\2) - ((3) #\3) - ((4) #\4) - ((5) #\5) - ((6) #\6) - ((7) #\7) - ((8) #\8) - ((9) #\9) - ((10) #\a) - ((11) #\b) - ((12) #\c) - ((13) #\d) - ((14) #\e) - ((15) #\f))) - (define (byte->hex-string bv index) - "Convert the byte at INDEX of bytevector BV to a hex string." - (let ((byte (bytevector-u8-ref bv index))) - (string (half-byte->hex-char (quotient byte 16)) - (half-byte->hex-char (modulo byte 16))))) - (let ((md5 (bytevector->md5 - (u8-list->bytevector - (append (bytevector->u8-list namespace-uuid) - (bytevector->u8-list (string->utf8 str))))))) - (string-append prefix - ;; time_low field: - (byte->hex-string md5 0) - (byte->hex-string md5 1) - (byte->hex-string md5 2) - (byte->hex-string md5 3) - "-" - ;; time_mid field: - (byte->hex-string md5 4) - (byte->hex-string md5 5) - "-" - ;; time_hi_and_version field: - (let ((byte (bytevector-u8-ref md5 6))) - (string (half-byte->hex-char 3) ; UUID version 3 - (half-byte->hex-char (modulo byte 16)))) - (byte->hex-string md5 7) - "-" - ;; clock_seq_hi_and_reserved field: - (let ((byte (bytevector-u8-ref md5 8))) - (string (half-byte->hex-char - (logior #b1000 ; most significant bits are 10 - (bit-extract (quotient byte 16) 0 2))) - (half-byte->hex-char (modulo byte 16)))) - ;; clock_seq_low field: - (byte->hex-string md5 9) - "-" - ;; node field: - (byte->hex-string md5 10) - (byte->hex-string md5 11) - (byte->hex-string md5 12) - (byte->hex-string md5 13) - (byte->hex-string md5 14) - (byte->hex-string md5 15)))) |
