about summary refs log tree commit diff
diff options
context:
space:
mode:
authorArun Isaac2021-12-04 14:25:50 +0530
committerArun Isaac2021-12-04 14:32:14 +0530
commitb53d4c2e14f94f442c80c2cd1a12227d78719d06 (patch)
tree76798cb42d7ebf43ce3cf14ed24464507c155865
parentc0ecec48f4d5ab97e213c8a249c47b78eb448fbf (diff)
downloadgn-transform-databases-b53d4c2e14f94f442c80c2cd1a12227d78719d06.tar.gz
Use the map-alist DSL.
* dump.scm: Import (dump utils).
(string-blank?): New function.
(scm->triples): Filter out triples with #f or blank string objects.
(process-metadata-alist): Delete function.
(default-metadata-proc): New function.
(dump-species, dump-strain, mapping-method-name->id, dump-inbred-set,
dump-phenotype, dump-publication, dump-publish-xref, dump-tissue,
dump-investigators, dump-avg-method, dump-gene-chip, dump-info-files):
Use map-alist.
-rwxr-xr-xdump.scm355
1 files changed, 147 insertions, 208 deletions
diff --git a/dump.scm b/dump.scm
index 3701fbd..7b86b8f 100755
--- a/dump.scm
+++ b/dump.scm
@@ -8,7 +8,8 @@
              (srfi srfi-26)
              (ice-9 match)
              (ice-9 string-fun)
-             (dump sql))
+             (dump sql)
+             (dump utils))
 
 
 ;;; GeneNetwork database connection parameters and dump path
@@ -110,22 +111,29 @@ characters with an underscore and prefixing with gn:PREFIX."
   (string-append (string-downcase (substring str 0 1))
                  (substring str 1)))
 
+(define (string-blank? str)
+  "Return non-#f if STR consists only of whitespace characters."
+  (string-every char-set:whitespace str))
+
 (define (scm->triples alist id)
   (for-each (match-lambda
               ((predicate . object)
-               (triple id predicate object)))
+               (when (cond
+                      ((string? object)
+                       (not (string-blank? object)))
+                      (else object))
+                 (triple id predicate object))))
             alist))
 
-(define (process-metadata-alist alist)
-  (filter-map (match-lambda
-                ((key . "") #f)
-                ((key . value)
-                 (cons (string->symbol
-                        (string-append
-                         "gn:" (camel->lower-camel
-                                (snake->lower-camel key))))
-                       value)))
-              alist))
+(define default-metadata-proc
+  (match-lambda
+    ((key . value)
+     (cons (string->symbol
+            (string-append
+             "gn:" (camel->lower-camel
+                    (snake->lower-camel key))))
+           value))
+    (x (error "malformed alist element" x))))
 
 (define (triple subject predicate object)
   (format #t "~a ~a ~s .~%" subject predicate object))
@@ -134,48 +142,30 @@ characters with an underscore and prefixing with gn:PREFIX."
   (cut string->identifier "species" <>))
 
 (define (dump-species db)
-  (sql-for-each (lambda (alist)
-                  (match alist
-                    (((_ . common-name)
-                      (_ . menu-name)
-                      (_ . binomial-name))
-                     (let ((id (binomial-name->species-id binomial-name)))
-                       (triple id 'rdf:type 'gn:species)
-                       ;; Common name
-                       (triple id 'gn:name common-name)
-                       ;; Menu name (TODO: Maybe, drop this
-                       ;; field. It can be inferred from the
-                       ;; common name.)
-                       (triple id 'gn:menuName menu-name)
-                       ;; Binomial name
-                       (triple id 'gn:binomialName binomial-name)))))
+  (sql-for-each (lambda (row)
+                  (scm->triples (map-alist row
+                                  (set rdf:type 'gn:species)
+                                  ;; Common name
+                                  (set gn:name (key "SpeciesName"))
+                                  ;; Menu name (TODO: Maybe, drop this field. It can
+                                  ;; be inferred from the common name.)
+                                  (set gn:menuName (key "MenuName"))
+                                  (set gn:binomialname (key "FullName")))
+                                (binomial-name->species-id (assoc-ref row "FullName"))))
                 db
                 "SELECT SpeciesName, MenuName, FullName FROM Species"))
 
 (define (dump-strain db)
-  (sql-for-each (lambda (alist)
-                  (match alist
-                    (((_ . binomial-name)
-                      (_ . name)
-                      (_ . name2)
-                      (_ . symbol)
-                      (_ . alias))
-                     (let ((id (string->identifier "strain" name)))
-                       (triple id 'rdf:type 'gn:strain)
-                       ;; The species this is a strain of
-                       (triple id 'gn:strainOfSpecies
-                               (binomial-name->species-id binomial-name))
-                       ;; Name
-                       (triple id 'gn:name name)
-                       ;; A second name, if there is one
-                       (unless (string=? name name2)
-                         (triple id 'gn:name name2))
-                       ;; Symbol, if there is one
-                       (unless (string-null? symbol)
-                         (triple id 'gn:symbol symbol))
-                       ;; Alias, if there is one
-                       (unless (string-null? alias)
-                         (triple id 'gn:alias alias))))))
+  (sql-for-each (lambda (row)
+                  (scm->triples (map-alist row
+                                  (set rdf:type 'gn:strain)
+                                  (set gn:strainOfSpecies
+                                       (binomial-name->species-id (key "FullName")))
+                                  ;; Name, and maybe a second name
+                                  (set gn:name (key "Name"))
+                                  (set gn:name (key "Name2"))
+                                  (set gn:alias (key "Alias")))
+                                (string->identifier "strain" (assoc-ref row "Name"))))
                 db
                 "SELECT Species.FullName, Strain.Name, Strain.Name2, Strain.Symbol, Strain.Alias FROM Strain JOIN Species ON Strain.SpeciesId = Species.SpeciesId"))
 
@@ -185,10 +175,10 @@ characters with an underscore and prefixing with gn:PREFIX."
 
 ;; TODO: This function is unused. Remove if not required.
 (define (dump-mapping-method db)
-  (sql-for-each (match-lambda
-                  (((_ . name))
-                   (triple (string-append "gn:mappingMethod" name)
-                           'rdf:type 'gn:mappingMethod)))
+  (sql-for-each (lambda (row)
+                  (scm->triples (map-alist row
+                                  (set rdf:type 'gn:mappingMethod))
+                                (string-append "gn:mappingMethod" (assoc-ref row "Name"))))
                 db
                 "SELECT Name FROM MappingMethod"))
 
@@ -196,17 +186,13 @@ characters with an underscore and prefixing with gn:PREFIX."
   (cut string->identifier "inbredSet" <>))
 
 (define (dump-inbred-set db)
-  (sql-for-each (lambda (alist)
-                  (let ((id (inbred-set-name->id (assoc-ref alist "Name"))))
-                    (triple id 'rdf:type 'gn:phenotype)
-                    (scm->triples
-                     (filter-map (match-lambda
-                                   (('gn:binomialName . binomial-name)
-                                    (cons 'gn:inbredSetOfSpecies
-                                          (binomial-name->species-id binomial-name)))
-                                   (x x))
-                                 (process-metadata-alist alist))
-                     id)))
+  (sql-for-each (lambda (row)
+                  (scm->triples (map-alist row
+                                  (set rdf:type 'gn:phenotype)
+                                  (set gn:inbredSetOfSpecies
+                                       (binomial-name->species-id (key "BinomialName")))
+                                  (else=> default-metadata-proc))
+                                (inbred-set-name->id (assoc-ref row "Name"))))
                 db
                 "SELECT InbredSet.Name, InbredSet.FullName, GeneticType, Family,
 Species.FullName AS BinomialName
@@ -217,54 +203,42 @@ INNER JOIN Species USING (SpeciesId)"))
   (string->identifier "phenotype" (number->string id)))
 
 (define (dump-phenotype db)
-  (sql-for-each (lambda (alist)
-                  (let ((id (phenotype-id->id (assoc-ref alist "Id"))))
-                    (triple id 'rdf:type 'gn:phenotype)
-                    (scm->triples
-                     (filter (match-lambda
-                               (('gn:id . _) #f)
-                               (('gn:units . value)
-                                (string-ci=? value "unknown"))
-                               (_ #t))
-                             (process-metadata-alist alist))
-                     id)))
+  (sql-for-each (lambda (row)
+                  (scm->triples (map-alist row
+                                  (delete "Id")
+                                  (set rdf:type 'gn:phenotype)
+                                  (set gn:units (and (string-ci=? (key "Units") "unknown")
+                                                     (key "Units")))
+                                  (else=> default-metadata-proc))
+                                (phenotype-id->id (assoc-ref row "Id"))))
                 db
                 "SELECT Id, Pre_publication_description, Post_publication_description,
 Original_description, Units, Pre_publication_abbreviation, Post_publication_abbreviation,
 Lab_code, Submitter, Owner, Authorized_Users FROM Phenotype"))
 
 (define (dump-publication db)
-  (sql-for-each (lambda (alist)
-                  (let ((id (string-append "gn:publication"
-                                           (number->string (assoc-ref alist "Id")))))
-                    (triple id 'rdf:type 'gn:publication)
-                    (scm->triples
-                     (append-map (match-lambda
-                                   (('gn:id . _) '())
-                                   ;; The authors field is a comma
-                                   ;; separated list. Split it.
-                                   (('gn:authors . authors)
-                                    (map (lambda (author-name)
-                                           (cons 'gn:author (string-trim author-name)))
-                                         (string-split authors #\,)))
-                                   (('gn:abstract . abstract)
-                                    ;; TODO: Handle unprintable
-                                    ;; characters better.
-                                    (list (cons 'gn:abstract
-                                                (delete-substrings abstract "\x01"))))
-                                   (x (list x)))
-                                 (process-metadata-alist alist))
-                     id)))
+  (sql-for-each (lambda (row)
+                  (scm->triples (map-alist row
+                                  (delete "Id")
+                                  (set rdf:type 'gn:publication)
+                                  (multiset gn:authors
+                                            ;; The authors field is a comma
+                                            ;; separated list. Split it.
+                                            (map string-trim (string-split (key "Authors") #\,)))
+                                  (set gn:abstract
+                                       ;; TODO: Why are there unprintable characters?
+                                       (delete-substrings (key "Abstract") "\x01"))
+                                  (else=> default-metadata-proc))
+                                (string-append "gn:publication"
+                                               (number->string (assoc-ref row "Id")))))
                 db
                 "SELECT Id, PubMed_ID, Abstract, Authors, Title, Journal, Volume, Pages, Month, Year FROM Publication"))
 
 (define (dump-publish-xref db)
-  (sql-for-each (match-lambda
-                  (((_ . inbred-set-name)
-                    (_ . phenotype-id))
-                   (triple (phenotype-id->id phenotype-id)
-                           'gn:phenotypeOfSpecies
-                           (inbred-set-name->id inbred-set-name))))
+  (sql-for-each (lambda (row)
+                  (scm->triples (map-alist row
+                                  (set gn:phenotypeOfSpecies (inbred-set-name->id (key "Name"))))
+                                (phenotype-id->id (assoc-ref row "PhenotypeId"))))
                 db
                 "SELECT Name, PhenotypeId
 FROM PublishXRef
@@ -276,14 +250,15 @@ INNER JOIN InbredSet USING (InbredSetId)"))
 (define (dump-tissue db)
   ;; The Name and TissueName fields seem to be identical. BIRN_lex_ID
   ;; and BIRN_lex_Name are mostly NULL.
-  (sql-for-each (match-lambda
-                  (((_ . name)
-                    (_ . short-name))
-                   ;; Hopefully the Short_Name field is distinct and
-                   ;; can be used as an identifier.
-                   (let ((id (tissue-short-name->id short-name)))
-                     (triple id 'rdf:type 'gn:tissue)
-                     (triple id 'gn:name name))))
+  (sql-for-each (lambda (row)
+                  (scm->triples (map-alist row
+                                  (delete "Short_Name")
+                                  (set rdf:type 'gn:tissue)
+                                  (set gn:name (key "Name")))
+                                ;; Hopefully the Short_Name field is
+                                ;; distinct and can be used as an
+                                ;; identifier.
+                                (tissue-short-name->id (assoc-ref row "Short_Name"))))
                 db
                 "SELECT Name, Short_Name FROM Tissue"))
 
@@ -303,29 +278,19 @@ INNER JOIN InbredSet USING (InbredSetId)"))
                                    "_")))
 
 (define (dump-investigators db)
-  (sql-for-each (lambda (alist)
-                  (let ((id (investigator-attributes->id (assoc-ref alist "FirstName")
-                                                         (assoc-ref alist "LastName")
-                                                         (assoc-ref alist "Email"))))
-                    (triple id 'rdf:type 'foaf:Person)
-                    (scm->triples
-                     (cons (cons 'foaf:name (string-append
-                                             (assoc-ref alist "FirstName")
-                                             " " (assoc-ref alist "LastName")))
-                           (map (match-lambda
-                                  (('gn:firstName . first-name)
-                                   (cons 'foaf:givenName first-name))
-                                  (('gn:lastName . last-name)
-                                   (cons 'foaf:familyName last-name))
-                                  (('gn:phone . phone)
-                                   (cons 'foaf:phone phone))
-                                  (('gn:email . email)
-                                   (cons 'foaf:mbox (fix-email-id email)))
-                                  (('gn:url . url)
-                                   (cons 'foaf:homepage url))
-                                  (x x))
-                                (process-metadata-alist alist)))
-                     id)))
+  (sql-for-each (lambda (row)
+                  (scm->triples (map-alist row
+                                  (set rdf:type 'foaf:Person)
+                                  (set foaf:name (string-append (key "FirstName") " " (key "LastName")))
+                                  (set foaf:givenName (key "FirstName"))
+                                  (set foaf:familyName (key "LastName"))
+                                  (set foaf:phone (key "Phone"))
+                                  (set foaf:mbox (fix-email-id (key "Email")))
+                                  (set foaf:homepage (key "Url"))
+                                  (else=> default-metadata-proc))
+                                (investigator-attributes->id (assoc-ref row "FirstName")
+                                                             (assoc-ref row "LastName")
+                                                             (assoc-ref row "Email"))))
                 db
                 ;; There are a few duplicate entries. We group by
                 ;; email to deduplicate.
@@ -336,11 +301,11 @@ GROUP BY Email"))
   (cut string->identifier "avgmethod" <>))
 
 (define (dump-avg-method db)
-  (sql-for-each (match-lambda
-                  (((_ . name))
-                   (let ((id (avg-method-name->id name)))
-                     (triple id 'rdf:type 'gn:avgMethod)
-                     (triple id 'gn:name name))))
+  (sql-for-each (lambda (row)
+                  (scm->triples (map-alist row
+                                  (set rdf:type 'gn:avgMethod)
+                                  (set gn:name (key "Name")))
+                                (avg-method-name->id (assoc-ref row "Name"))))
                 db
                 ;; The Name and Normalization fields seem to be the
                 ;; same. Dump only the Name field.
@@ -353,77 +318,50 @@ GROUP BY Email"))
   (cut string->identifier "platform" <>))
 
 (define (dump-gene-chip db)
-  (sql-for-each (match-lambda
-                  (((_ . gene-chip-name)
-                    (_ . name))
-                   (let ((id (gene-chip-name->id name)))
-                     (triple id 'rdf:type 'gn:platform)
-                     (triple id 'gn:name gene-chip-name))))
+  (sql-for-each (lambda (row)
+                  (scm->triples (map-alist row
+                                  (delete "Name")
+                                  (set rdf:type 'gn:platform)
+                                  (set gn:name (key "GeneChipName")))
+                                (gene-chip-name->id (assoc-ref row "Name"))))
                 db
                 "SELECT GeneChipName, Name FROM GeneChip"))
 
 (define (dump-info-files db)
-  (sql-for-each (lambda (alist)
-                  (let ((id (string-append "gn:dataset"
-                                           (number->string
-                                            (assoc-ref alist "GN_AccesionId")))))
-                    (triple id 'rdf:type 'gn:dataset)
-                    (scm->triples
-                     (cons (cons 'gn:datasetOfInvestigator
-                                 (investigator-attributes->id (assoc-ref alist "FirstName")
-                                                              (assoc-ref alist "LastName")
-                                                              (assoc-ref alist "Email")))
-                           (filter-map (match-lambda
-                                         (('gn:gNAccesionId . accession-id)
-                                          (cons 'gn:accessionId
-                                                (string-append "GN" (number->string accession-id))))
-                                         (('gn:datasetStatusName . status)
-                                          (cons 'gn:datasetStatus
-                                                (string-downcase status)))
-                                         (('gn:binomialName . binomial-name)
-                                          (cons 'gn:datasetOfSpecies
-                                                (binomial-name->species-id binomial-name)))
-                                         (('gn:inbredSetName . inbred-set-name)
-                                          (cons 'gn:datasetOfInbredSet
-                                                (inbred-set-name->id inbred-set-name)))
-                                         (('gn:shortName . short-name)
-                                          (cons 'gn:datasetOfTissue
-                                                (tissue-short-name->id short-name)))
-                                         ;; Remove first name, last name and
-                                         ;; email. We are using it outside
-                                         ;; this filter-map.
-                                         (('gn:firstName . first-name) #f)
-                                         (('gn:lastName . last-name) #f)
-                                         (('gn:email . email) #f)
-                                         (('gn:avgMethodId . avg-method-id)
-                                          ;; If avg-method-id is 0, a
-                                          ;; non-existent method, assume
-                                          ;; N/A.
-                                          (and (zero? avg-method-id)
-                                               (cons 'gn:normalization
-                                                     (avg-method-name->id "N/A"))))
-                                         (('gn:avgMethodName . avg-method-name)
-                                          (cons 'gn:normalization
-                                                (avg-method-name->id avg-method-name)))
-                                         (('gn:geneChip . name)
-                                          (cons 'gn:datasetOfPlatform
-                                                (gene-chip-name->id name)))
-                                         (('gn:summary . summary)
-                                          ;; TODO: Why are there unprintable
-                                          ;; characters in the summary?
-                                          (cons 'gn:summary
-                                                (delete-substrings summary "\x01" "\x03")))
-                                         (('gn:aboutTissue . about-tissue)
-                                          ;; TODO: Why are there unprintable
-                                          ;; characters in the summary?
-                                          (cons 'gn:aboutTissue
-                                                (delete-substrings about-tissue "\x01" "\x03")))
-                                         (('gn:geoSeries . geo-series)
-                                          (and (not (string-prefix-ci? "no geo series" geo-series))
-                                               (cons 'gn:geoSeries geo-series)))
-                                         (x x))
-                                       (process-metadata-alist alist)))
-                     id)))
+  (sql-for-each (lambda (row)
+                  (scm->triples
+                   (map-alist row
+                     (set rdf:type 'gn:dataset)
+                     (set gn:datasetOfInvestigator
+                          (investigator-attributes->id (key "FirstName")
+                                                       (key "LastName")
+                                                       (key "Email")))
+                     (set gn:accessionId (string-append "GN" (number->string (key "GN_AccesionId"))))
+                     (set gn:datasetStatusName (string-downcase (key "DatasetStatusName")))
+                     (set gn:datasetOfSpecies (binomial-name->species-id (key "BinomialName")))
+                     (set gn:datasetOfInbredSet (inbred-set-name->id (key "InbredSetName")))
+                     (set gn:datasetOfTissue (tissue-short-name->id (key "Short_Name")))
+                     (set gn:normalization
+                          (avg-method-name->id
+                           ;; If AvgMethodName is NULL, assume N/A.
+                           (if (string-blank? (key "AvgMethodName"))
+                               "N/A" (key "AvgMethodName"))))
+                     (set gn:datasetOfPlatform (gene-chip-name->id (key "GeneChip")))
+                     (set gn:summary
+                          ;; TODO: Why are there unprintable characters?
+                          (delete-substrings (key "Summary")
+                                             "\x01" "\x03"))
+                     (set gn:aboutTissue
+                          ;; TODO: Why are there unprintable characters?
+                          (delete-substrings (key "AboutTissue")
+                                             "\x01" "\x03"))
+                     (set gn:geoSeries
+                          (and (not (string-prefix-ci? "no geo series"
+                                                       (key "GeoSeries")))
+                               (key "GeoSeries")))
+                     (else=> default-metadata-proc))
+                   (string-append "gn:dataset"
+                                  (number->string (assoc-ref row "GN_AccesionId")))))
                 db
                 ;; TODO: Double check Platforms. It doesn't seem to
                 ;; match up.
@@ -437,7 +375,7 @@ Species.FullName AS BinomialName,
 InbredSet.Name AS InbredSetName,
 Tissue.Short_Name,
 Investigators.FirstName, Investigators.LastName, Investigators.Email,
-AvgMethodId, AvgMethod.Name AS AvgMethodName,
+AvgMethod.Name AS AvgMethodName,
 GeneChip.Name AS GeneChip
 FROM InfoFiles
 LEFT JOIN Datasets USING (DatasetId)
@@ -447,7 +385,8 @@ LEFT JOIN InbredSet USING (InbredSetId)
 LEFT JOIN Tissue USING (TissueId)
 LEFT JOIN Investigators USING (InvestigatorId)
 LEFT JOIN AvgMethod USING (AvgMethodId)
-LEFT JOIN GeneChip USING (GeneChipId)"))
+LEFT JOIN GeneChip USING (GeneChipId)
+WHERE GN_AccesionId IS NOT NULL"))
 
 (define (dump-data-table db table-name data-field)
   (let ((dump-directory (string-append %dump-directory "/" table-name))