about summary refs log tree commit diff
diff options
context:
space:
mode:
authorArun Isaac2021-12-16 15:49:47 +0530
committerArun Isaac2021-12-16 16:21:38 +0530
commit61d3cd3d9c51af80f5f0f8f92b42da56d6398a71 (patch)
tree39e42bbf0b409057fe62205a9432677c53163b8a
parenta182c3d3702f1d65ed36dbb3c747d32ffb1f2791 (diff)
downloadgn-transform-databases-61d3cd3d9c51af80f5f0f8f92b42da56d6398a71.tar.gz
Make define-dump syntax more concise.
* dump.scm (field->key, field->assoc-ref, collect-fields): New
functions.
(define-dump): Redefine with more concise syntax.
* dump.scm (dump-species, dump-strain, dump-mapping-method,
dump-inbred-set, dump-phenotype, dump-publication, dump-publish-xref,
dump-tissue, dump-investigators, dump-avg-method, dump-gene-chip,
dump-info-files): Use new define-dump syntax.
(default-metadata-proc): Delete function.
* .dir-locals.el (scheme-mode): Indent triples form correctly.
-rw-r--r--.dir-locals.el3
-rwxr-xr-xdump.scm456
2 files changed, 210 insertions, 249 deletions
diff --git a/.dir-locals.el b/.dir-locals.el
index f0a8de9..2fe8b32 100644
--- a/.dir-locals.el
+++ b/.dir-locals.el
@@ -5,4 +5,5 @@
   (indent-tabs-mode))
  (scheme-mode
   (eval put 'map-alist 'scheme-indent-function 1)
-  (eval put 'set-table-columns 'scheme-indent-function 1)))
+  (eval put 'set-table-columns 'scheme-indent-function 1)
+  (eval put 'triples 'scheme-indent-function 1)))
diff --git a/dump.scm b/dump.scm
index 51abdeb..93b8c7f 100755
--- a/dump.scm
+++ b/dump.scm
@@ -143,72 +143,91 @@ characters with an underscore and prefixing with gn:PREFIX."
                  (triple id predicate object))))
             alist))
 
-(define default-metadata-proc
-  (match-lambda
-    ((key . value)
-     (cons (string->symbol
-            (string-append "gn:" (snake->lower-camel key)))
-           value))
-    (x (error "malformed alist element" x))))
-
 (define (triple subject predicate object)
   (format #t "~a ~a ~s .~%" subject predicate object))
 
+(define (field->key x)
+  (translate-forms 'field
+                   (lambda (x)
+                     #`(key #,(symbol->string
+                               (syntax->datum
+                                ((syntax-rules (field)
+                                   ((field table column) column)
+                                   ((field table column alias) alias))
+                                 x)))))
+                   x))
+
+(define (field->assoc-ref alist x)
+  "Recursively translate field references in source X to (assoc-ref
+ALIST field-name) forms."
+  (translate-forms 'field
+                   (lambda (x)
+                     #`(assoc-ref #,alist
+                                  #,(symbol->string
+                                     (syntax->datum
+                                      ((syntax-rules (field)
+                                         ((field table column) column)
+                                         ((field table column alias) alias))
+                                       x)))))
+                   x))
+
+(define (collect-fields x)
+  (map (syntax-rules (field)
+         ((field reference ...)
+          (reference ...)))
+       (collect-forms 'field x)))
+
 (define %dumped '())
 
 (define-syntax define-dump
   (lambda (x)
-    (syntax-case x (select-query)
-      ((_ name (select-query (fields ...) tables raw-forms ...) proc)
-       #`(begin
-           (set! %dumped
-                 (append (list #,@(filter-map (lambda (field)
-                                                (syntax-case field (distinct)
-                                                  (distinct #f)
-                                                  ((table column _ ...) #'(cons 'table 'column))
-                                                  (field-spec (error "Invalid field specification" #'field-spec))))
-                                              #'(fields ...)))
-                         %dumped))
-           (define (name db)
-             (sql-for-each proc db (select-query (fields ...) tables raw-forms ...))))))))
+    (syntax-case x (tables triples)
+      ((_ name
+          (tables tables-spec raw ...)
+          (triples subject predicates ...))
+       (let ((fields (collect-fields #'(subject predicates ...))))
+         #`(begin
+             (set! %dumped
+                   (append (list #,@(filter-map (lambda (field)
+                                                  (syntax-case field (distinct)
+                                                    (distinct #f)
+                                                    ((table column _ ...) #'(cons 'table 'column))
+                                                    (field-spec (error "Invalid field specification" #'field-spec))))
+                                                fields))
+                           %dumped))
+             (define (name db)
+               (sql-for-each
+                (lambda (row)
+                  (scm->triples
+                   (map-alist row #,@(field->key #'(predicates ...)))
+                   #,(field->assoc-ref #'row #'subject)))
+                db
+                (select-query #,fields tables-spec raw ...))))))
+      (_ (error "Invalid define-dump syntax:" (syntax->datum x))))))
 
 (define binomial-name->species-id
   (cut string->identifier "species" <>))
 
 (define-dump dump-species
-  (select-query ((Species SpeciesName)
-                 (Species MenuName)
-                 (Species FullName))
-                (Species))
-  (lambda (row)
-    (scm->triples (map-alist row
-                    (set rdf:type 'gn:species)
-                    ;; Common name
-                    (set gn:name (key "SpeciesName"))
-                    ;; Menu name (TODO: Maybe, drop this field. It can
-                    ;; be inferred from the common name.)
-                    (set gn:menuName (key "MenuName"))
-                    (set gn:binomialName (key "FullName")))
-                  (binomial-name->species-id (assoc-ref row "FullName")))))
+  (tables (Species))
+  (triples (binomial-name->species-id (field Species FullName))
+    (set rdf:type 'gn:species)
+    (set gn:name (field Species SpeciesName))
+    (set gn:menuName (field Species MenuName))
+    (set gn:binomialName (field Species FullName))))
 
 (define-dump dump-strain
-  (select-query ((Species FullName)
-                 (Strain Name)
-                 (Strain Name2)
-                 (Strain Symbol)
-                 (Strain Alias))
-                (Strain
-                 (join Species "ON Strain.SpeciesId = Species.SpeciesId")))
-  (lambda (row)
-    (scm->triples (map-alist row
-                    (set rdf:type 'gn:strain)
-                    (set gn:strainOfSpecies
-                         (binomial-name->species-id (key "FullName")))
-                    ;; Name, and maybe a second name
-                    (set gn:name (key "Name"))
-                    (set gn:name (key "Name2"))
-                    (set gn:alias (key "Alias")))
-                  (string->identifier "strain" (assoc-ref row "Name")))))
+  (tables (Strain
+           (join Species "ON Strain.SpeciesId = Species.SpeciesId")))
+  (triples (string->identifier "strain" (field Strain Name))
+    (set rdf:type 'gn:strain)
+    (set gn:strainOfSpecies
+         (binomial-name->species-id (field Species FullName)))
+    ;; Name, and maybe a second name
+    (set gn:name (field Strain Name))
+    (set gn:name (field Strain Name2))
+    (set gn:alias (field Strain Alias))
+    (set gn:symbol (field Strain Symbol))))
 
 ;; TODO: This function is unused. Remove if not required.
 (define mapping-method-name->id
@@ -216,93 +235,69 @@ characters with an underscore and prefixing with gn:PREFIX."
 
 ;; TODO: This function is unused. Remove if not required.
 (define-dump dump-mapping-method
-  (select-query ((MappingMethod Name))
-                (MappingMethod))
-  (lambda (row)
-    (scm->triples (map-alist row
-                    (set rdf:type 'gn:mappingMethod))
-                  (string->identifier "mappingMethod" (assoc-ref row "Name")))))
+  (tables (MappingMethod))
+  (triples (string->identifier "mappingMethod" (field MappingMethod Name))
+    (set rdf:type 'gn:mappingMethod)))
 
 (define inbred-set-name->id
   (cut string->identifier "inbredSet" <>))
 
 (define-dump dump-inbred-set
-  (select-query ((InbredSet Name)
-                 (InbredSet FullName)
-                 (InbredSet GeneticType)
-                 (InbredSet Family)
-                 (Species FullName BinomialName))
-                (InbredSet
-                 (inner-join Species "USING (SpeciesId)")))
-  (lambda (row)
-    (scm->triples (map-alist row
-                    (set rdf:type 'gn:phenotype)
-                    (set gn:inbredSetOfSpecies
-                         (binomial-name->species-id (key "BinomialName")))
-                    (else=> default-metadata-proc))
-                  (inbred-set-name->id (assoc-ref row "Name")))))
+  (tables (InbredSet
+           (inner-join Species "USING (SpeciesId)")))
+  (triples (inbred-set-name->id (field InbredSet Name))
+    (set rdf:type 'gn:inbredSet)
+    (set gn:fullName (field InbredSet FullName))
+    (set gn:geneticType (field InbredSet GeneticType))
+    (set gn:family (field InbredSet Family))
+    (set gn:inbredSetOfSpecies
+         (binomial-name->species-id (field Species FullName BinomialName)))))
 
 (define (phenotype-id->id id)
   (string->identifier "phenotype" (number->string id)))
 
 (define-dump dump-phenotype
-  (select-query ((Phenotype Id)
-                 (Phenotype Pre_publication_description)
-                 (Phenotype Post_publication_description)
-                 (Phenotype Original_description)
-                 (Phenotype Units)
-                 (Phenotype Pre_publication_abbreviation)
-                 (Phenotype Post_publication_abbreviation)
-                 (Phenotype Lab_code)
-                 (Phenotype Submitter)
-                 (Phenotype Owner)
-                 (Phenotype Authorized_Users))
-                (Phenotype))
-  (lambda (row)
-    (scm->triples (map-alist row
-                    (delete "Id")
-                    (set rdf:type 'gn:phenotype)
-                    (set gn:units (and (string-ci=? (key "Units") "unknown")
-                                       (key "Units")))
-                    (else=> default-metadata-proc))
-                  (phenotype-id->id (assoc-ref row "Id")))))
+  (tables (Phenotype))
+  (triples (phenotype-id->id (field Phenotype Id))
+    (set rdf:type 'gn:phenotype)
+    (set gn:prePublicationDescription (field Phenotype Pre_publication_description))
+    (set gn:postPublicationDescription (field Phenotype Post_publication_description))
+    (set gn:originalDescription (field Phenotype Original_description))
+    (set gn:prePublicationDescription (field Phenotype Pre_publication_abbreviation))
+    (set gn:postPublicationDescription (field Phenotype Post_publication_abbreviation))
+    (set gn:labCode (field Phenotype Lab_code))
+    (set gn:submitter (field Phenotype Submitter))
+    (set gn:owner (field Phenotype Owner))
+    (set gn:authorizedUsers (field Phenotype Authorized_Users))
+    (set gn:units (and (string-ci=? (field Phenotype Units) "unknown")
+                       (field Phenotype Units)))))
 
 (define-dump dump-publication
-  (select-query ((Publication Id)
-                 (Publication PubMed_ID)
-                 (Publication Abstract)
-                 (Publication Authors)
-                 (Publication Title)
-                 (Publication Journal)
-                 (Publication Volume)
-                 (Publication Pages)
-                 (Publication Month)
-                 (Publication Year))
-                (Publication))
-  (lambda (row)
-    (scm->triples (map-alist row
-                    (delete "Id")
-                    (set rdf:type 'gn:publication)
-                    (multiset gn:authors
-                              ;; The authors field is a comma
-                              ;; separated list. Split it.
-                              (map string-trim (string-split (key "Authors") #\,)))
-                    (set gn:abstract
-                         ;; TODO: Why are there unprintable characters?
-                         (delete-substrings (key "Abstract") "\x01"))
-                    (else=> default-metadata-proc))
-                  (string->identifier "publication"
-                                      (number->string (assoc-ref row "Id"))))))
+  (tables (Publication))
+  (triples (string->identifier "publication"
+                               (number->string (field Publication Id)))
+    (set rdf:type 'gn:publication)
+    (set gn:pubMedId (field Publication PubMed_ID))
+    (set gn:title (field Publication Title))
+    (set gn:journal (field Publication Journal))
+    (set gn:volume (field Publication Volume))
+    (set gn:pages (field Publication Pages))
+    (set gn:month (field Publication Month))
+    (set gn:year (field Publication Year))
+    (multiset gn:authors
+              ;; The authors field is a comma
+              ;; separated list. Split it.
+              (map string-trim (string-split (field Publication Authors) #\,)))
+    (set gn:abstract
+         ;; TODO: Why are there unprintable characters?
+         (delete-substrings (field Publication Abstract)
+                            "\x01"))))
 
 (define-dump dump-publish-xref
-  (select-query ((InbredSet Name)
-                 (PublishXRef PhenotypeId))
-                (PublishXRef
-                 (inner-join InbredSet "USING (InbredSetId)")))
-  (lambda (row)
-    (scm->triples (map-alist row
-                    (set gn:phenotypeOfSpecies (inbred-set-name->id (key "Name"))))
-                  (phenotype-id->id (assoc-ref row "PhenotypeId")))))
+  (tables (PublishXRef
+           (inner-join InbredSet "USING (InbredSetId)")))
+  (triples (phenotype-id->id (field PublishXRef PhenotypeId))
+    (set gn:phenotypeOfSpecies (inbred-set-name->id (field InbredSet Name)))))
 
 (define tissue-short-name->id
   (cut string->identifier "tissue" <>))
@@ -310,18 +305,12 @@ characters with an underscore and prefixing with gn:PREFIX."
 (define-dump dump-tissue
   ;; The Name and TissueName fields seem to be identical. BIRN_lex_ID
   ;; and BIRN_lex_Name are mostly NULL.
-  (select-query ((Tissue Name)
-                 (Tissue Short_Name))
-                (Tissue))
-  (lambda (row)
-    (scm->triples (map-alist row
-                    (delete "Short_Name")
-                    (set rdf:type 'gn:tissue)
-                    (set gn:name (key "Name")))
-                  ;; Hopefully the Short_Name field is
-                  ;; distinct and can be used as an
-                  ;; identifier.
-                  (tissue-short-name->id (assoc-ref row "Short_Name")))))
+  (tables (Tissue))
+  ;; Hopefully the Short_Name field is distinct and can be used as an
+  ;; identifier.
+  (triples (tissue-short-name->id (field Tissue Short_Name))
+    (set rdf:type 'gn:tissue)
+    (set gn:name (field Tissue Name))))
 
 ;; One email ID in the Investigators table has spaces in it. This
 ;; function fixes that.
@@ -341,31 +330,23 @@ characters with an underscore and prefixing with gn:PREFIX."
 (define-dump dump-investigators
   ;; There are a few duplicate entries. We group by email to
   ;; deduplicate.
-  (select-query ((Investigators FirstName)
-                 (Investigators LastName)
-                 (Investigators Address)
-                 (Investigators City)
-                 (Investigators State)
-                 (Investigators ZipCode)
-                 (Investigators Phone)
-                 (Investigators Email)
-                 (Investigators Country)
-                 (Investigators Url))
-                (Investigators)
-                "GROUP BY Email")
-  (lambda (row)
-    (scm->triples (map-alist row
-                    (set rdf:type 'foaf:Person)
-                    (set foaf:name (string-append (key "FirstName") " " (key "LastName")))
-                    (set foaf:givenName (key "FirstName"))
-                    (set foaf:familyName (key "LastName"))
-                    (set foaf:phone (key "Phone"))
-                    (set foaf:mbox (fix-email-id (key "Email")))
-                    (set foaf:homepage (key "Url"))
-                    (else=> default-metadata-proc))
-                  (investigator-attributes->id (assoc-ref row "FirstName")
-                                               (assoc-ref row "LastName")
-                                               (assoc-ref row "Email")))))
+  (tables (Investigators)
+          "GROUP BY Email")
+  (triples (investigator-attributes->id (field Investigators FirstName)
+                                        (field Investigators LastName)
+                                        (field Investigators Email))
+    (set rdf:type 'foaf:Person)
+    (set foaf:name (string-append (field Investigators FirstName) " " (field Investigators LastName)))
+    (set foaf:givenName (field Investigators FirstName))
+    (set foaf:familyName (field Investigators LastName))
+    (set foaf:phone (field Investigators Phone))
+    (set foaf:mbox (fix-email-id (field Investigators Email)))
+    (set foaf:homepage (field Investigators Url))
+    (set gn:address (field Investigators Address))
+    (set gn:city (field Investigators City))
+    (set gn:state (field Investigators State))
+    (set gn:zipCode (field Investigators ZipCode))
+    (set gn:country (field Investigators Country))))
 
 (define avg-method-name->id
   (cut string->identifier "avgmethod" <>))
@@ -375,99 +356,78 @@ characters with an underscore and prefixing with gn:PREFIX."
   ;; the Name field.
   ;;
   ;; There are two records with Name as "N/A". Deduplicate.
-  (select-query (distinct (AvgMethod Name))
-                (AvgMethod))
-  (lambda (row)
-    (scm->triples (map-alist row
-                    (set rdf:type 'gn:avgMethod)
-                    (set gn:name (key "Name")))
-                  (avg-method-name->id (assoc-ref row "Name")))))
+  (tables (AvgMethod)
+          "GROUP BY Name")
+  (triples (avg-method-name->id (field AvgMethod Name))
+    (set rdf:type 'gn:avgMethod)
+    (set gn:name (field AvgMethod Name))))
 
 (define gene-chip-name->id
   (cut string->identifier "platform" <>))
 
 (define-dump dump-gene-chip
-  (select-query ((GeneChip GeneChipName)
-                 (GeneChip Name))
-                (GeneChip))
-  (lambda (row)
-    (scm->triples (map-alist row
-                    (delete "Name")
-                    (set rdf:type 'gn:platform)
-                    (set gn:name (key "GeneChipName")))
-                  (gene-chip-name->id (assoc-ref row "Name")))))
+  (tables (GeneChip))
+  (triples (gene-chip-name->id (field GeneChip Name))
+    (set rdf:type 'gn:platform)
+    (set gn:name (field GeneChip GeneChipName))))
 
+;; TODO: Double check Platforms. It doesn't seem to match up.
 (define-dump dump-info-files
-  ;; TODO: Double check Platforms. It doesn't seem to match up.
-  (select-query ((InfoFiles GN_AccesionId)
-                 (InfoFiles InfoFileTitle Name)
-                 (InfoFiles Title)
-                 (InfoFiles Specifics)
-                 (DatasetStatus DatasetStatusName)
-                 (Datasets DatasetName DatasetGroup)
-                 (Datasets Summary)
-                 (Datasets GeoSeries)
-                 (Datasets AboutCases)
-                 (Datasets AboutPlatform)
-                 (Datasets AboutTissue)
-                 (Datasets AboutDataProcessing)
-                 (Datasets Notes)
-                 (Datasets ExperimentDesign)
-                 (Datasets Contributors)
-                 (Datasets Citation)
-                 (Datasets Acknowledgment)
-                 (Species FullName BinomialName)
-                 (InbredSet Name InbredSetName)
-                 (Tissue Short_Name)
-                 (Investigators FirstName)
-                 (Investigators LastName)
-                 (Investigators Email)
-                 (AvgMethod Name AvgMethodName)
-                 (GeneChip Name GeneChip))
-                (InfoFiles
-                 (left-join Datasets "USING (DatasetId)")
-                 (left-join DatasetStatus "USING (DatasetStatusId)")
-                 (left-join Species "USING (SpeciesId)")
-                 (left-join InbredSet "USING (InbredSetId)")
-                 (left-join Tissue "USING (TissueId)")
-                 (left-join Investigators "USING (InvestigatorId)")
-                 (left-join AvgMethod "USING (AvgMethodId)")
-                 (left-join GeneChip "USING (GeneChipId)"))
-                "WHERE GN_AccesionId IS NOT NULL")
-  (lambda (row)
-    (scm->triples
-     (map-alist row
-       (set rdf:type 'gn:dataset)
-       (set gn:datasetOfInvestigator
-            (investigator-attributes->id (key "FirstName")
-                                         (key "LastName")
-                                         (key "Email")))
-       (set gn:accessionId (string-append "GN" (number->string (key "GN_AccesionId"))))
-       (set gn:datasetStatusName (string-downcase (key "DatasetStatusName")))
-       (set gn:datasetOfSpecies (binomial-name->species-id (key "BinomialName")))
-       (set gn:datasetOfInbredSet (inbred-set-name->id (key "InbredSetName")))
-       (set gn:datasetOfTissue (tissue-short-name->id (key "Short_Name")))
-       (set gn:normalization
-            (avg-method-name->id
-             ;; If AvgMethodName is NULL, assume N/A.
-             (if (string-blank? (key "AvgMethodName"))
-                 "N/A" (key "AvgMethodName"))))
-       (set gn:datasetOfPlatform (gene-chip-name->id (key "GeneChip")))
-       (set gn:summary
-            ;; TODO: Why are there unprintable characters?
-            (delete-substrings (key "Summary")
-                               "\x01" "\x03"))
-       (set gn:aboutTissue
-            ;; TODO: Why are there unprintable characters?
-            (delete-substrings (key "AboutTissue")
-                               "\x01" "\x03"))
-       (set gn:geoSeries
-            (and (not (string-prefix-ci? "no geo series"
-                                         (key "GeoSeries")))
-                 (key "GeoSeries")))
-       (else=> default-metadata-proc))
-     (string->identifier "dataset"
-                         (number->string (assoc-ref row "GN_AccesionId"))))))
+  (tables (InfoFiles
+           (left-join Datasets "USING (DatasetId)")
+           (left-join DatasetStatus "USING (DatasetStatusId)")
+           (left-join Species "USING (SpeciesId)")
+           (left-join InbredSet "USING (InbredSetId)")
+           (left-join Tissue "USING (TissueId)")
+           (left-join Investigators "USING (InvestigatorId)")
+           (left-join AvgMethod "USING (AvgMethodId)")
+           (left-join GeneChip "USING (GeneChipId)"))
+          "WHERE GN_AccesionId IS NOT NULL")
+  (triples (string->identifier "dataset"
+                               (number->string (field InfoFiles GN_AccesionId)))
+    (set rdf:type 'gn:dataset)
+    (set gn:datasetOfInvestigator
+         (investigator-attributes->id (field Investigators FirstName)
+                                      (field Investigators LastName)
+                                      (field Investigators Email)))
+    (set gn:accessionId (string-append "GN" (number->string
+                                             (field InfoFiles GN_AccesionId))))
+    (set gn:datasetStatusName (string-downcase
+                               (field DatasetStatus DatasetStatusName)))
+    (set gn:datasetOfSpecies (binomial-name->species-id
+                              (field Species FullName BinomialName)))
+    (set gn:datasetOfInbredSet (inbred-set-name->id (field InbredSet Name InbredSetName)))
+    (set gn:datasetOfTissue (tissue-short-name->id (field Tissue Short_Name)))
+    (set gn:normalization
+         (avg-method-name->id
+          ;; If AvgMethodName is NULL, assume N/A.
+          (if (string-blank? (field AvgMethod Name AvgMethodName))
+              "N/A" (field AvgMethod Name AvgMethodName))))
+    (set gn:datasetOfPlatform (gene-chip-name->id (field GeneChip Name GeneChip)))
+    (set gn:summary
+         ;; TODO: Why are there unprintable characters?
+         (delete-substrings (field Datasets Summary)
+                            "\x01" "\x03"))
+    (set gn:aboutTissue
+         ;; TODO: Why are there unprintable characters?
+         (delete-substrings (field Datasets AboutTissue)
+                            "\x01" "\x03"))
+    (set gn:geoSeries
+         (and (not (string-prefix-ci? "no geo series"
+                                      (field Datasets GeoSeries)))
+              (field Datasets GeoSeries)))
+    (set gn:name (field InfoFiles InfoFileTitle Name))
+    (set gn:title (field InfoFiles Title))
+    (set gn:specifics (field InfoFiles Specifics))
+    (set gn:datasetGroup (field Datasets DatasetName DatasetGroup))
+    (set gn:aboutCases (field Datasets AboutCases))
+    (set gn:aboutPlatform (field Datasets AboutPlatform))
+    (set gn:aboutDataProcessing (field Datasets AboutDataProcessing))
+    (set gn:notes (field Datasets Notes))
+    (set gn:experimentDesign (field Datasets ExperimentDesign))
+    (set gn:contributors (field Datasets Contributors))
+    (set gn:citation (field Datasets Citation))
+    (set gn:acknowledgment (field Datasets Acknowledgment))))
 
 (define (dump-data-table db table-name data-field)
   (let ((dump-directory (string-append %dump-directory "/" table-name))