about summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x dump.scm 519
1 files changed, 256 insertions, 263 deletions
diff --git a/dump.scm b/dump.scm
index dfa0471..665dee8 100755
--- a/dump.scm
+++ b/dump.scm
@@ -143,164 +143,163 @@ characters with an underscore and prefixing with gn:PREFIX."
 (define (triple subject predicate object)
   (format #t "~a ~a ~s .~%" subject predicate object))
 
+(define-syntax define-dump              ; usage: (define-dump NAME (select-query ...) PROC)
+  (lambda (x)
+    (syntax-case x (select-query)       ; `select-query' is matched as a literal keyword
+      ((_ name (select-query (fields ...) tables raw-forms ...) proc)
+       #'(define (name db)               ; NAME takes DB; hands PROC, DB and the query to sql-for-each
+           (sql-for-each proc db (select-query (fields ...) tables raw-forms ...)))))))
+
 (define binomial-name->species-id
   (cut string->identifier "species" <>))
 
-(define (dump-species db)
-  (sql-for-each (lambda (row)
-                  (scm->triples (map-alist row
-                                  (set rdf:type 'gn:species)
-                                  ;; Common name
-                                  (set gn:name (key "SpeciesName"))
-                                  ;; Menu name (TODO: Maybe, drop this field. It can
-                                  ;; be inferred from the common name.)
-                                  (set gn:menuName (key "MenuName"))
-                                  (set gn:binomialname (key "FullName")))
-                                (binomial-name->species-id (assoc-ref row "FullName"))))
-                db
-                (select-query ((Species SpeciesName)
-                               (Species MenuName)
-                               (Species FullName))
-                              (Species))))
-
-(define (dump-strain db)
-  (sql-for-each (lambda (row)
-                  (scm->triples (map-alist row
-                                  (set rdf:type 'gn:strain)
-                                  (set gn:strainOfSpecies
-                                       (binomial-name->species-id (key "FullName")))
-                                  ;; Name, and maybe a second name
-                                  (set gn:name (key "Name"))
-                                  (set gn:name (key "Name2"))
-                                  (set gn:alias (key "Alias")))
-                                (string->identifier "strain" (assoc-ref row "Name"))))
-                db
-                (select-query ((Species FullName)
-                               (Strain Name)
-                               (Strain Name2)
-                               (Strain Symbol)
-                               (Strain Alias))
-                              (Strain
-                               (join Species "ON Strain.SpeciesId = Species.SpeciesId")))))
+(define-dump dump-species
+  (select-query ((Species SpeciesName)
+                 (Species MenuName)
+                 (Species FullName))
+                (Species))
+  (lambda (row)
+    (scm->triples (map-alist row
+                    (set rdf:type 'gn:species)
+                    ;; Common name
+                    (set gn:name (key "SpeciesName"))
+                    ;; Menu name (TODO: Maybe, drop this field. It can
+                    ;; be inferred from the common name.)
+                    (set gn:menuName (key "MenuName"))
+                    (set gn:binomialname (key "FullName")))
+                  (binomial-name->species-id (assoc-ref row "FullName")))))
+
+(define-dump dump-strain
+  (select-query ((Species FullName)
+                 (Strain Name)
+                 (Strain Name2)
+                 (Strain Symbol)
+                 (Strain Alias))
+                (Strain
+                 (join Species "ON Strain.SpeciesId = Species.SpeciesId")))
+  (lambda (row)
+    (scm->triples (map-alist row
+                    (set rdf:type 'gn:strain)
+                    (set gn:strainOfSpecies
+                         (binomial-name->species-id (key "FullName")))
+                    ;; Name, and maybe a second name
+                    (set gn:name (key "Name"))
+                    (set gn:name (key "Name2"))
+                    (set gn:alias (key "Alias")))
+                  (string->identifier "strain" (assoc-ref row "Name")))))
 
 ;; TODO: This function is unused. Remove if not required.
 (define mapping-method-name->id
   (cut string->identifier "mappingMethod" <>))
 
 ;; TODO: This function is unused. Remove if not required.
-(define (dump-mapping-method db)
-  (sql-for-each (lambda (row)
-                  (scm->triples (map-alist row
-                                  (set rdf:type 'gn:mappingMethod))
-                                (string->identifier "mappingMethod" (assoc-ref row "Name"))))
-                db
-                (select-query ((MappingMethod Name))
-                              (MappingMethod))))
+(define-dump dump-mapping-method
+  (select-query ((MappingMethod Name))
+                (MappingMethod))
+  (lambda (row)
+    (scm->triples (map-alist row
+                    (set rdf:type 'gn:mappingMethod))
+                  (string->identifier "mappingMethod" (assoc-ref row "Name")))))
 
 (define inbred-set-name->id
   (cut string->identifier "inbredSet" <>))
 
-(define (dump-inbred-set db)
-  (sql-for-each (lambda (row)
-                  (scm->triples (map-alist row
-                                  (set rdf:type 'gn:phenotype)
-                                  (set gn:inbredSetOfSpecies
-                                       (binomial-name->species-id (key "BinomialName")))
-                                  (else=> default-metadata-proc))
-                                (inbred-set-name->id (assoc-ref row "Name"))))
-                db
-                (select-query ((InbredSet Name)
-                               (InbredSet FullName)
-                               (InbredSet GeneticType)
-                               (InbredSet Family)
-                               (Species FullName BinomialName))
-                              (InbredSet
-                               (inner-join Species "USING (SpeciesId)")))))
+(define-dump dump-inbred-set
+  (select-query ((InbredSet Name)
+                 (InbredSet FullName)
+                 (InbredSet GeneticType)
+                 (InbredSet Family)
+                 (Species FullName BinomialName))
+                (InbredSet
+                 (inner-join Species "USING (SpeciesId)")))
+  (lambda (row)
+    (scm->triples (map-alist row
+                    (set rdf:type 'gn:inbredSet) ; the subject is an inbred set, not a phenotype
+                    (set gn:inbredSetOfSpecies
+                         (binomial-name->species-id (key "BinomialName")))
+                    (else=> default-metadata-proc))
+                  (inbred-set-name->id (assoc-ref row "Name")))))
 
 (define (phenotype-id->id id)
   (string->identifier "phenotype" (number->string id)))
 
-(define (dump-phenotype db)
-  (sql-for-each (lambda (row)
-                  (scm->triples (map-alist row
-                                  (delete "Id")
-                                  (set rdf:type 'gn:phenotype)
-                                  (set gn:units (and (string-ci=? (key "Units") "unknown")
-                                                     (key "Units")))
-                                  (else=> default-metadata-proc))
-                                (phenotype-id->id (assoc-ref row "Id"))))
-                db
-                (select-query ((Phenotype Id)
-                               (Phenotype Pre_publication_description)
-                               (Phenotype Post_publication_description)
-                               (Phenotype Original_description)
-                               (Phenotype Units)
-                               (Phenotype Pre_publication_abbreviation)
-                               (Phenotype Post_publication_abbreviation)
-                               (Phenotype Lab_code)
-                               (Phenotype Submitter)
-                               (Phenotype Owner)
-                               (Phenotype Authorized_Users))
-                              (Phenotype))))
-
-(define (dump-publication db)
-  (sql-for-each (lambda (row)
-                  (scm->triples (map-alist row
-                                  (delete "Id")
-                                  (set rdf:type 'gn:publication)
-                                  (multiset gn:authors
-                                            ;; The authors field is a comma
-                                            ;; separated list. Split it.
-                                            (map string-trim (string-split (key "Authors") #\,)))
-                                  (set gn:abstract
-                                       ;; TODO: Why are there unprintable characters?
-                                       (delete-substrings (key "Abstract") "\x01"))
-                                  (else=> default-metadata-proc))
-                                (string->identifier "publication"
-                                                    (number->string (assoc-ref row "Id")))))
-                db
-                (select-query ((Publication Id)
-                               (Publication PubMed_ID)
-                               (Publication Abstract)
-                               (Publication Authors)
-                               (Publication Title)
-                               (Publication Journal)
-                               (Publication Volume)
-                               (Publication Pages)
-                               (Publication Month)
-                               (Publication Year))
-                              (Publication))))
-
-(define (dump-publish-xref db)
-  (sql-for-each (lambda (row)
-                  (scm->triples (map-alist row
-                                  (set gn:phenotypeOfSpecies (inbred-set-name->id (key "Name"))))
-                                (phenotype-id->id (assoc-ref row "PhenotypeId"))))
-                db
-                (select-query ((InbredSet Name)
-                               (PublishXRef PhenotypeId))
-                              (PublishXRef
-                               (inner-join InbredSet "USING (InbredSetId)")))))
+(define-dump dump-phenotype
+  (select-query ((Phenotype Id)
+                 (Phenotype Pre_publication_description)
+                 (Phenotype Post_publication_description)
+                 (Phenotype Original_description)
+                 (Phenotype Units)
+                 (Phenotype Pre_publication_abbreviation)
+                 (Phenotype Post_publication_abbreviation)
+                 (Phenotype Lab_code)
+                 (Phenotype Submitter)
+                 (Phenotype Owner)
+                 (Phenotype Authorized_Users))
+                (Phenotype))
+  (lambda (row)
+    (scm->triples (map-alist row
+                    (delete "Id")
+                    (set rdf:type 'gn:phenotype)
+                    (set gn:units (and (not (string-ci=? (key "Units") "unknown"))
+                                       (key "Units"))) ; #f when Units is the placeholder "unknown"
+                    (else=> default-metadata-proc))
+                  (phenotype-id->id (assoc-ref row "Id")))))
+
+(define-dump dump-publication
+  (select-query ((Publication Id)
+                 (Publication PubMed_ID)
+                 (Publication Abstract)
+                 (Publication Authors)
+                 (Publication Title)
+                 (Publication Journal)
+                 (Publication Volume)
+                 (Publication Pages)
+                 (Publication Month)
+                 (Publication Year))
+                (Publication))
+  (lambda (row)
+    (scm->triples (map-alist row
+                    (delete "Id")
+                    (set rdf:type 'gn:publication)
+                    (multiset gn:authors
+                              ;; The authors field is a comma separated list. Split
+                              ;; it and strip whitespace on both sides of each name.
+                              (map string-trim-both (string-split (key "Authors") #\,)))
+                    (set gn:abstract
+                         ;; TODO: Why are there unprintable characters?
+                         (delete-substrings (key "Abstract") "\x01"))
+                    (else=> default-metadata-proc))
+                  (string->identifier "publication"
+                                      (number->string (assoc-ref row "Id"))))))
+
+(define-dump dump-publish-xref
+  (select-query ((InbredSet Name)
+                 (PublishXRef PhenotypeId))
+                (PublishXRef
+                 (inner-join InbredSet "USING (InbredSetId)")))
+  (lambda (row)
+    (scm->triples (map-alist row
+                    (set gn:phenotypeOfSpecies (inbred-set-name->id (key "Name"))))
+                  (phenotype-id->id (assoc-ref row "PhenotypeId")))))
 
 (define tissue-short-name->id
   (cut string->identifier "tissue" <>))
 
-(define (dump-tissue db)
+(define-dump dump-tissue
   ;; The Name and TissueName fields seem to be identical. BIRN_lex_ID
   ;; and BIRN_lex_Name are mostly NULL.
-  (sql-for-each (lambda (row)
-                  (scm->triples (map-alist row
-                                  (delete "Short_Name")
-                                  (set rdf:type 'gn:tissue)
-                                  (set gn:name (key "Name")))
-                                ;; Hopefully the Short_Name field is
-                                ;; distinct and can be used as an
-                                ;; identifier.
-                                (tissue-short-name->id (assoc-ref row "Short_Name"))))
-                db
-                (select-query ((Tissue Name)
-                               (Tissue Short_Name))
-                              (Tissue))))
+  (select-query ((Tissue Name)
+                 (Tissue Short_Name))
+                (Tissue))
+  (lambda (row)
+    (scm->triples (map-alist row
+                    (delete "Short_Name")
+                    (set rdf:type 'gn:tissue)
+                    (set gn:name (key "Name")))
+                  ;; Hopefully the Short_Name field is
+                  ;; distinct and can be used as an
+                  ;; identifier.
+                  (tissue-short-name->id (assoc-ref row "Short_Name")))))
 
 ;; One email ID in the Investigators table has spaces in it. This
 ;; function fixes that.
@@ -317,142 +316,136 @@ characters with an underscore and prefixing with gn:PREFIX."
                       (string-join (list first-name last-name (fix-email-id email))
                                    "_")))
 
-(define (dump-investigators db)
-  (sql-for-each (lambda (row)
-                  (scm->triples (map-alist row
-                                  (set rdf:type 'foaf:Person)
-                                  (set foaf:name (string-append (key "FirstName") " " (key "LastName")))
-                                  (set foaf:givenName (key "FirstName"))
-                                  (set foaf:familyName (key "LastName"))
-                                  (set foaf:phone (key "Phone"))
-                                  (set foaf:mbox (fix-email-id (key "Email")))
-                                  (set foaf:homepage (key "Url"))
-                                  (else=> default-metadata-proc))
-                                (investigator-attributes->id (assoc-ref row "FirstName")
-                                                             (assoc-ref row "LastName")
-                                                             (assoc-ref row "Email"))))
-                db
-                ;; There are a few duplicate entries. We group by
-                ;; email to deduplicate.
-                (select-query ((Investigators FirstName)
-                               (Investigators LastName)
-                               (Investigators Address)
-                               (Investigators City)
-                               (Investigators State)
-                               (Investigators ZipCode)
-                               (Investigators Phone)
-                               (Investigators Email)
-                               (Investigators Country)
-                               (Investigators Url))
-                              (Investigators)
-                              "GROUP BY Email")))
+(define-dump dump-investigators
+  ;; There are a few duplicate entries. We group by email to
+  ;; deduplicate.
+  (select-query ((Investigators FirstName)
+                 (Investigators LastName)
+                 (Investigators Address)
+                 (Investigators City)
+                 (Investigators State)
+                 (Investigators ZipCode)
+                 (Investigators Phone)
+                 (Investigators Email)
+                 (Investigators Country)
+                 (Investigators Url))
+                (Investigators)
+                "GROUP BY Email")
+  (lambda (row)
+    (scm->triples (map-alist row
+                    (set rdf:type 'foaf:Person)
+                    (set foaf:name (string-append (key "FirstName") " " (key "LastName")))
+                    (set foaf:givenName (key "FirstName"))
+                    (set foaf:familyName (key "LastName"))
+                    (set foaf:phone (key "Phone"))
+                    (set foaf:mbox (fix-email-id (key "Email")))
+                    (set foaf:homepage (key "Url"))
+                    (else=> default-metadata-proc))
+                  (investigator-attributes->id (assoc-ref row "FirstName")
+                                               (assoc-ref row "LastName")
+                                               (assoc-ref row "Email")))))
 
 (define avg-method-name->id
   (cut string->identifier "avgmethod" <>))
 
-(define (dump-avg-method db)
-  (sql-for-each (lambda (row)
-                  (scm->triples (map-alist row
-                                  (set rdf:type 'gn:avgMethod)
-                                  (set gn:name (key "Name")))
-                                (avg-method-name->id (assoc-ref row "Name"))))
-                db
-                ;; The Name and Normalization fields seem to be the
-                ;; same. Dump only the Name field.
-                ;;
-                ;; There are two records with Name as
-                ;; "N/A". Deduplicate.
-                (select-query (distinct (AvgMethod Name))
-                              (AvgMethod))))
+(define-dump dump-avg-method
+  ;; The Name and Normalization fields seem to be the same. Dump only
+  ;; the Name field.
+  ;;
+  ;; There are two records with Name as "N/A". Deduplicate.
+  (select-query (distinct (AvgMethod Name))
+                (AvgMethod))
+  (lambda (row)
+    (scm->triples (map-alist row
+                    (set rdf:type 'gn:avgMethod)
+                    (set gn:name (key "Name")))
+                  (avg-method-name->id (assoc-ref row "Name")))))
 
 (define gene-chip-name->id
   (cut string->identifier "platform" <>))
 
-(define (dump-gene-chip db)
-  (sql-for-each (lambda (row)
-                  (scm->triples (map-alist row
-                                  (delete "Name")
-                                  (set rdf:type 'gn:platform)
-                                  (set gn:name (key "GeneChipName")))
-                                (gene-chip-name->id (assoc-ref row "Name"))))
-                db
-                (select-query ((GeneChip GeneChipName)
-                               (GeneChip Name))
-                              (GeneChip))))
-
-(define (dump-info-files db)
-  (sql-for-each (lambda (row)
-                  (scm->triples
-                   (map-alist row
-                     (set rdf:type 'gn:dataset)
-                     (set gn:datasetOfInvestigator
-                          (investigator-attributes->id (key "FirstName")
-                                                       (key "LastName")
-                                                       (key "Email")))
-                     (set gn:accessionId (string-append "GN" (number->string (key "GN_AccesionId"))))
-                     (set gn:datasetStatusName (string-downcase (key "DatasetStatusName")))
-                     (set gn:datasetOfSpecies (binomial-name->species-id (key "BinomialName")))
-                     (set gn:datasetOfInbredSet (inbred-set-name->id (key "InbredSetName")))
-                     (set gn:datasetOfTissue (tissue-short-name->id (key "Short_Name")))
-                     (set gn:normalization
-                          (avg-method-name->id
-                           ;; If AvgMethodName is NULL, assume N/A.
-                           (if (string-blank? (key "AvgMethodName"))
-                               "N/A" (key "AvgMethodName"))))
-                     (set gn:datasetOfPlatform (gene-chip-name->id (key "GeneChip")))
-                     (set gn:summary
-                          ;; TODO: Why are there unprintable characters?
-                          (delete-substrings (key "Summary")
-                                             "\x01" "\x03"))
-                     (set gn:aboutTissue
-                          ;; TODO: Why are there unprintable characters?
-                          (delete-substrings (key "AboutTissue")
-                                             "\x01" "\x03"))
-                     (set gn:geoSeries
-                          (and (not (string-prefix-ci? "no geo series"
-                                                       (key "GeoSeries")))
-                               (key "GeoSeries")))
-                     (else=> default-metadata-proc))
-                   (string->identifier "dataset"
-                                       (number->string (assoc-ref row "GN_AccesionId")))))
-                db
-                ;; TODO: Double check Platforms. It doesn't seem to
-                ;; match up.
-                (select-query ((InfoFiles GN_AccesionId)
-                               (InfoFiles InfoFileTitle Name)
-                               (InfoFiles Title)
-                               (InfoFiles Specifics)
-                               (DatasetStatus DatasetStatusName)
-                               (Datasets DatasetName DatasetGroup)
-                               (Datasets Summary)
-                               (Datasets GeoSeries)
-                               (Datasets AboutCases)
-                               (Datasets AboutPlatform)
-                               (Datasets AboutTissue)
-                               (Datasets AboutDataProcessing)
-                               (Datasets Notes)
-                               (Datasets ExperimentDesign)
-                               (Datasets Contributors)
-                               (Datasets Citation)
-                               (Datasets Acknowledgment)
-                               (Species FullName BinomialName)
-                               (InbredSet Name InbredSetName)
-                               (Tissue Short_Name)
-                               (Investigators FirstName)
-                               (Investigators LastName)
-                               (Investigators Email)
-                               (AvgMethod Name AvgMethodName)
-                               (GeneChip Name GeneChip))
-                              (InfoFiles
-                               (left-join Datasets "USING (DatasetId)")
-                               (left-join DatasetStatus "USING (DatasetStatusId)")
-                               (left-join Species "USING (SpeciesId)")
-                               (left-join InbredSet "USING (InbredSetId)")
-                               (left-join Tissue "USING (TissueId)")
-                               (left-join Investigators "USING (InvestigatorId)")
-                               (left-join AvgMethod "USING (AvgMethodId)")
-                               (left-join GeneChip "USING (GeneChipId)"))
-                              "WHERE GN_AccesionId IS NOT NULL")))
+(define-dump dump-gene-chip
+  (select-query ((GeneChip GeneChipName)
+                 (GeneChip Name))
+                (GeneChip))
+  (lambda (row)
+    (scm->triples (map-alist row
+                    (delete "Name")
+                    (set rdf:type 'gn:platform)
+                    (set gn:name (key "GeneChipName")))
+                  (gene-chip-name->id (assoc-ref row "Name")))))
+
+(define-dump dump-info-files
+  ;; TODO: Double check Platforms. It doesn't seem to match up.
+  (select-query ((InfoFiles GN_AccesionId)
+                 (InfoFiles InfoFileTitle Name)
+                 (InfoFiles Title)
+                 (InfoFiles Specifics)
+                 (DatasetStatus DatasetStatusName)
+                 (Datasets DatasetName DatasetGroup)
+                 (Datasets Summary)
+                 (Datasets GeoSeries)
+                 (Datasets AboutCases)
+                 (Datasets AboutPlatform)
+                 (Datasets AboutTissue)
+                 (Datasets AboutDataProcessing)
+                 (Datasets Notes)
+                 (Datasets ExperimentDesign)
+                 (Datasets Contributors)
+                 (Datasets Citation)
+                 (Datasets Acknowledgment)
+                 (Species FullName BinomialName)
+                 (InbredSet Name InbredSetName)
+                 (Tissue Short_Name)
+                 (Investigators FirstName)
+                 (Investigators LastName)
+                 (Investigators Email)
+                 (AvgMethod Name AvgMethodName)
+                 (GeneChip Name GeneChip))
+                (InfoFiles
+                 (left-join Datasets "USING (DatasetId)")
+                 (left-join DatasetStatus "USING (DatasetStatusId)")
+                 (left-join Species "USING (SpeciesId)")
+                 (left-join InbredSet "USING (InbredSetId)")
+                 (left-join Tissue "USING (TissueId)")
+                 (left-join Investigators "USING (InvestigatorId)")
+                 (left-join AvgMethod "USING (AvgMethodId)")
+                 (left-join GeneChip "USING (GeneChipId)"))
+                "WHERE GN_AccesionId IS NOT NULL")
+  (lambda (row)
+    (scm->triples
+     (map-alist row
+       (set rdf:type 'gn:dataset)
+       (set gn:datasetOfInvestigator
+            (investigator-attributes->id (key "FirstName")
+                                         (key "LastName")
+                                         (key "Email")))
+       (set gn:accessionId (string-append "GN" (number->string (key "GN_AccesionId"))))
+       (set gn:datasetStatusName (string-downcase (key "DatasetStatusName")))
+       (set gn:datasetOfSpecies (binomial-name->species-id (key "BinomialName")))
+       (set gn:datasetOfInbredSet (inbred-set-name->id (key "InbredSetName")))
+       (set gn:datasetOfTissue (tissue-short-name->id (key "Short_Name")))
+       (set gn:normalization
+            (avg-method-name->id
+             ;; If AvgMethodName is NULL, assume N/A.
+             (if (string-blank? (key "AvgMethodName"))
+                 "N/A" (key "AvgMethodName"))))
+       (set gn:datasetOfPlatform (gene-chip-name->id (key "GeneChip")))
+       (set gn:summary
+            ;; TODO: Why are there unprintable characters?
+            (delete-substrings (key "Summary")
+                               "\x01" "\x03"))
+       (set gn:aboutTissue
+            ;; TODO: Why are there unprintable characters?
+            (delete-substrings (key "AboutTissue")
+                               "\x01" "\x03"))
+       (set gn:geoSeries
+            (and (not (string-prefix-ci? "no geo series"
+                                         (key "GeoSeries")))
+                 (key "GeoSeries")))
+       (else=> default-metadata-proc))
+     (string->identifier "dataset"
+                         (number->string (assoc-ref row "GN_AccesionId"))))))
 
 (define (dump-data-table db table-name data-field)
   (let ((dump-directory (string-append %dump-directory "/" table-name))