aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xdump.scm519
1 files changed, 256 insertions, 263 deletions
diff --git a/dump.scm b/dump.scm
index dfa0471..665dee8 100755
--- a/dump.scm
+++ b/dump.scm
@@ -143,164 +143,163 @@ characters with an underscore and prefixing with gn:PREFIX."
;; Write one statement to the current output port in the form
;; "SUBJECT PREDICATE OBJECT ." followed by a newline. SUBJECT and
;; PREDICATE are printed with ~a (as `display' would); OBJECT is
;; printed with ~s (as `write' would), so a string object is quoted.
(define (triple subject predicate object)
  (format (current-output-port) "~a ~a ~s .~%"
          subject predicate object))
+;; Define NAME as a procedure of one argument, DB, whose body passes
+;; PROC, DB and the given select-query form to sql-for-each.
+;;
+;;   (define-dump NAME
+;;     (select-query (FIELDS ...) TABLES RAW-FORMS ...)
+;;     PROC)
+;;
+;; select-query is matched as a literal, so any other form in that
+;; position is a syntax error at expansion time.
+(define-syntax define-dump
+  (lambda (x)
+    (syntax-case x (select-query)
+      ((_ name (select-query (fields ...) tables raw-forms ...) proc)
+       ;; A syntax-case clause must evaluate to a syntax object, so
+       ;; the template is wrapped in #'. Without it the pattern
+       ;; variables are referenced outside a syntax form and the
+       ;; macro cannot expand.
+       #'(define (name db)
+           (sql-for-each proc db (select-query (fields ...) tables raw-forms ...)))))))
+
;; Build the identifier of a species from its binomial name by passing
;; it to string->identifier under the "species" prefix.
(define (binomial-name->species-id binomial-name)
  (string->identifier "species" binomial-name))
-(define (dump-species db)
- (sql-for-each (lambda (row)
- (scm->triples (map-alist row
- (set rdf:type 'gn:species)
- ;; Common name
- (set gn:name (key "SpeciesName"))
- ;; Menu name (TODO: Maybe, drop this field. It can
- ;; be inferred from the common name.)
- (set gn:menuName (key "MenuName"))
- (set gn:binomialname (key "FullName")))
- (binomial-name->species-id (assoc-ref row "FullName"))))
- db
- (select-query ((Species SpeciesName)
- (Species MenuName)
- (Species FullName))
- (Species))))
-
-(define (dump-strain db)
- (sql-for-each (lambda (row)
- (scm->triples (map-alist row
- (set rdf:type 'gn:strain)
- (set gn:strainOfSpecies
- (binomial-name->species-id (key "FullName")))
- ;; Name, and maybe a second name
- (set gn:name (key "Name"))
- (set gn:name (key "Name2"))
- (set gn:alias (key "Alias")))
- (string->identifier "strain" (assoc-ref row "Name"))))
- db
- (select-query ((Species FullName)
- (Strain Name)
- (Strain Name2)
- (Strain Symbol)
- (Strain Alias))
- (Strain
- (join Species "ON Strain.SpeciesId = Species.SpeciesId")))))
+;; For each row of the Species query, build an alist with map-alist
+;; and pass it to scm->triples. The subject is the species identifier
+;; derived from the row's FullName column.
+(define-dump dump-species
+  (select-query ((Species SpeciesName)
+                 (Species MenuName)
+                 (Species FullName))
+                (Species))
+  (lambda (species)
+    (let ((subject (binomial-name->species-id
+                    (assoc-ref species "FullName"))))
+      (scm->triples
+       (map-alist species
+         (set rdf:type 'gn:species)
+         ;; Common name
+         (set gn:name (key "SpeciesName"))
+         ;; Menu name (TODO: Maybe, drop this field. It can be
+         ;; inferred from the common name.)
+         (set gn:menuName (key "MenuName"))
+         (set gn:binomialname (key "FullName")))
+       subject))))
+
+;; For each Strain row (joined with Species on SpeciesId), build an
+;; alist with map-alist and pass it to scm->triples. The subject is
+;; the "strain" identifier derived from the row's Name column.
+(define-dump dump-strain
+  (select-query ((Species FullName)
+                 (Strain Name)
+                 (Strain Name2)
+                 (Strain Symbol)
+                 (Strain Alias))
+                (Strain
+                 (join Species "ON Strain.SpeciesId = Species.SpeciesId")))
+  (lambda (strain)
+    (let ((subject (string->identifier "strain" (assoc-ref strain "Name"))))
+      (scm->triples
+       (map-alist strain
+         (set rdf:type 'gn:strain)
+         (set gn:strainOfSpecies
+              (binomial-name->species-id (key "FullName")))
+         ;; Name, and maybe a second name
+         (set gn:name (key "Name"))
+         (set gn:name (key "Name2"))
+         (set gn:alias (key "Alias")))
+       subject))))
;; TODO: This function is unused. Remove if not required.
;;
;; Build the identifier of a mapping method from its name by passing
;; it to string->identifier under the "mappingMethod" prefix.
(define (mapping-method-name->id name)
  (string->identifier "mappingMethod" name))
;; TODO: This function is unused. Remove if not required.
-(define (dump-mapping-method db)
- (sql-for-each (lambda (row)
- (scm->triples (map-alist row
- (set rdf:type 'gn:mappingMethod))
- (string->identifier "mappingMethod" (assoc-ref row "Name"))))
- db
- (select-query ((MappingMethod Name))
- (MappingMethod))))
+;; For each MappingMethod row, pass an alist that only sets rdf:type
+;; to gn:mappingMethod to scm->triples. The subject is the
+;; "mappingMethod" identifier derived from the row's Name column.
+(define-dump dump-mapping-method
+  (select-query ((MappingMethod Name))
+                (MappingMethod))
+  (lambda (method)
+    (let ((subject (string->identifier "mappingMethod"
+                                       (assoc-ref method "Name"))))
+      (scm->triples
+       (map-alist method
+         (set rdf:type 'gn:mappingMethod))
+       subject))))
;; Build the identifier of an inbred set from its name by passing it
;; to string->identifier under the "inbredSet" prefix.
(define (inbred-set-name->id name)
  (string->identifier "inbredSet" name))
-(define (dump-inbred-set db)
- (sql-for-each (lambda (row)
- (scm->triples (map-alist row
- (set rdf:type 'gn:phenotype)
- (set gn:inbredSetOfSpecies
- (binomial-name->species-id (key "BinomialName")))
- (else=> default-metadata-proc))
- (inbred-set-name->id (assoc-ref row "Name"))))
- db
- (select-query ((InbredSet Name)
- (InbredSet FullName)
- (InbredSet GeneticType)
- (InbredSet Family)
- (Species FullName BinomialName))
- (InbredSet
- (inner-join Species "USING (SpeciesId)")))))
+;; For each InbredSet row (inner-joined with Species on SpeciesId),
+;; build an alist with map-alist and pass it to scm->triples. The
+;; subject is the inbred set identifier derived from the Name column.
+;; Columns with no explicit clause go through default-metadata-proc.
+(define-dump dump-inbred-set
+  (select-query ((InbredSet Name)
+                 (InbredSet FullName)
+                 (InbredSet GeneticType)
+                 (InbredSet Family)
+                 (Species FullName BinomialName))
+                (InbredSet
+                 (inner-join Species "USING (SpeciesId)")))
+  (lambda (row)
+    (scm->triples (map-alist row
+                    ;; An inbred set is not a phenotype. Give it its
+                    ;; own type so that it cannot be confused with
+                    ;; the subjects emitted by dump-phenotype.
+                    (set rdf:type 'gn:inbredSet)
+                    (set gn:inbredSetOfSpecies
+                         (binomial-name->species-id (key "BinomialName")))
+                    (else=> default-metadata-proc))
+                  (inbred-set-name->id (assoc-ref row "Name")))))
;; Build the identifier of a phenotype from its numeric ID. The number
;; is converted to a string and passed to string->identifier under
;; the "phenotype" prefix.
(define (phenotype-id->id id)
  (let ((id-string (number->string id)))
    (string->identifier "phenotype" id-string)))
-(define (dump-phenotype db)
- (sql-for-each (lambda (row)
- (scm->triples (map-alist row
- (delete "Id")
- (set rdf:type 'gn:phenotype)
- (set gn:units (and (string-ci=? (key "Units") "unknown")
- (key "Units")))
- (else=> default-metadata-proc))
- (phenotype-id->id (assoc-ref row "Id"))))
- db
- (select-query ((Phenotype Id)
- (Phenotype Pre_publication_description)
- (Phenotype Post_publication_description)
- (Phenotype Original_description)
- (Phenotype Units)
- (Phenotype Pre_publication_abbreviation)
- (Phenotype Post_publication_abbreviation)
- (Phenotype Lab_code)
- (Phenotype Submitter)
- (Phenotype Owner)
- (Phenotype Authorized_Users))
- (Phenotype))))
-
-(define (dump-publication db)
- (sql-for-each (lambda (row)
- (scm->triples (map-alist row
- (delete "Id")
- (set rdf:type 'gn:publication)
- (multiset gn:authors
- ;; The authors field is a comma
- ;; separated list. Split it.
- (map string-trim (string-split (key "Authors") #\,)))
- (set gn:abstract
- ;; TODO: Why are there unprintable characters?
- (delete-substrings (key "Abstract") "\x01"))
- (else=> default-metadata-proc))
- (string->identifier "publication"
- (number->string (assoc-ref row "Id")))))
- db
- (select-query ((Publication Id)
- (Publication PubMed_ID)
- (Publication Abstract)
- (Publication Authors)
- (Publication Title)
- (Publication Journal)
- (Publication Volume)
- (Publication Pages)
- (Publication Month)
- (Publication Year))
- (Publication))))
-
-(define (dump-publish-xref db)
- (sql-for-each (lambda (row)
- (scm->triples (map-alist row
- (set gn:phenotypeOfSpecies (inbred-set-name->id (key "Name"))))
- (phenotype-id->id (assoc-ref row "PhenotypeId"))))
- db
- (select-query ((InbredSet Name)
- (PublishXRef PhenotypeId))
- (PublishXRef
- (inner-join InbredSet "USING (InbredSetId)")))))
+;; For each Phenotype row, build an alist with map-alist and pass it
+;; to scm->triples. The subject is the phenotype identifier derived
+;; from the Id column; the Id column itself is deleted from the alist.
+;; Columns with no explicit clause go through default-metadata-proc.
+(define-dump dump-phenotype
+  (select-query ((Phenotype Id)
+                 (Phenotype Pre_publication_description)
+                 (Phenotype Post_publication_description)
+                 (Phenotype Original_description)
+                 (Phenotype Units)
+                 (Phenotype Pre_publication_abbreviation)
+                 (Phenotype Post_publication_abbreviation)
+                 (Phenotype Lab_code)
+                 (Phenotype Submitter)
+                 (Phenotype Owner)
+                 (Phenotype Authorized_Users))
+                (Phenotype))
+  (lambda (row)
+    (scm->triples (map-alist row
+                    (delete "Id")
+                    (set rdf:type 'gn:phenotype)
+                    ;; Keep the units only when they are something
+                    ;; other than the placeholder "unknown" (in any
+                    ;; letter case); otherwise the value is #f.
+                    (set gn:units (and (not (string-ci=? (key "Units") "unknown"))
+                                       (key "Units")))
+                    (else=> default-metadata-proc))
+                  (phenotype-id->id (assoc-ref row "Id")))))
+
+;; For each Publication row, build an alist with map-alist and pass it
+;; to scm->triples. The subject is the "publication" identifier
+;; derived from the numeric Id column; the Id column itself is deleted
+;; from the alist. Columns with no explicit clause go through
+;; default-metadata-proc.
+(define-dump dump-publication
+  (select-query ((Publication Id)
+                 (Publication PubMed_ID)
+                 (Publication Abstract)
+                 (Publication Authors)
+                 (Publication Title)
+                 (Publication Journal)
+                 (Publication Volume)
+                 (Publication Pages)
+                 (Publication Month)
+                 (Publication Year))
+                (Publication))
+  (lambda (publication)
+    (let ((subject (string->identifier
+                    "publication"
+                    (number->string (assoc-ref publication "Id")))))
+      (scm->triples
+       (map-alist publication
+         (delete "Id")
+         (set rdf:type 'gn:publication)
+         (multiset gn:authors
+                   ;; The authors field is a comma separated
+                   ;; list. Split it.
+                   (map string-trim (string-split (key "Authors") #\,)))
+         (set gn:abstract
+              ;; TODO: Why are there unprintable characters?
+              (delete-substrings (key "Abstract") "\x01"))
+         (else=> default-metadata-proc))
+       subject))))
+
+;; For each PublishXRef row (inner-joined with InbredSet on
+;; InbredSetId), set gn:phenotypeOfSpecies on the phenotype identifier
+;; derived from PhenotypeId. The value is the inbred set identifier
+;; derived from the Name column.
+;; NOTE(review): the predicate says "Species" but the value is built
+;; with inbred-set-name->id -- confirm that this naming is intended.
+(define-dump dump-publish-xref
+  (select-query ((InbredSet Name)
+                 (PublishXRef PhenotypeId))
+                (PublishXRef
+                 (inner-join InbredSet "USING (InbredSetId)")))
+  (lambda (xref)
+    (let ((subject (phenotype-id->id (assoc-ref xref "PhenotypeId"))))
+      (scm->triples
+       (map-alist xref
+         (set gn:phenotypeOfSpecies (inbred-set-name->id (key "Name"))))
+       subject))))
;; Build the identifier of a tissue from its short name by passing it
;; to string->identifier under the "tissue" prefix.
(define (tissue-short-name->id short-name)
  (string->identifier "tissue" short-name))
-(define (dump-tissue db)
+;; For each Tissue row, build an alist with map-alist and pass it to
+;; scm->triples. The subject is the tissue identifier derived from the
+;; Short_Name column; that column itself is deleted from the alist.
+(define-dump dump-tissue
;; The Name and TissueName fields seem to be identical. BIRN_lex_ID
;; and BIRN_lex_Name are mostly NULL.
- (sql-for-each (lambda (row)
- (scm->triples (map-alist row
- (delete "Short_Name")
- (set rdf:type 'gn:tissue)
- (set gn:name (key "Name")))
- ;; Hopefully the Short_Name field is
- ;; distinct and can be used as an
- ;; identifier.
- (tissue-short-name->id (assoc-ref row "Short_Name"))))
- db
- (select-query ((Tissue Name)
- (Tissue Short_Name))
- (Tissue))))
+  (select-query ((Tissue Name)
+                 (Tissue Short_Name))
+                (Tissue))
+  (lambda (tissue)
+    ;; Hopefully the Short_Name field is distinct and can be used as
+    ;; an identifier.
+    (let ((subject (tissue-short-name->id (assoc-ref tissue "Short_Name"))))
+      (scm->triples
+       (map-alist tissue
+         (delete "Short_Name")
+         (set rdf:type 'gn:tissue)
+         (set gn:name (key "Name")))
+       subject))))
;; One email ID in the Investigators table has spaces in it. This
;; function fixes that.
@@ -317,142 +316,136 @@ characters with an underscore and prefixing with gn:PREFIX."
(string-join (list first-name last-name (fix-email-id email))
"_")))
-(define (dump-investigators db)
- (sql-for-each (lambda (row)
- (scm->triples (map-alist row
- (set rdf:type 'foaf:Person)
- (set foaf:name (string-append (key "FirstName") " " (key "LastName")))
- (set foaf:givenName (key "FirstName"))
- (set foaf:familyName (key "LastName"))
- (set foaf:phone (key "Phone"))
- (set foaf:mbox (fix-email-id (key "Email")))
- (set foaf:homepage (key "Url"))
- (else=> default-metadata-proc))
- (investigator-attributes->id (assoc-ref row "FirstName")
- (assoc-ref row "LastName")
- (assoc-ref row "Email"))))
- db
- ;; There are a few duplicate entries. We group by
- ;; email to deduplicate.
- (select-query ((Investigators FirstName)
- (Investigators LastName)
- (Investigators Address)
- (Investigators City)
- (Investigators State)
- (Investigators ZipCode)
- (Investigators Phone)
- (Investigators Email)
- (Investigators Country)
- (Investigators Url))
- (Investigators)
- "GROUP BY Email")))
+;; For each Investigators row, build a foaf:Person alist with
+;; map-alist and pass it to scm->triples. The subject is built by
+;; investigator-attributes->id from the FirstName, LastName and Email
+;; columns. Columns with no explicit clause go through
+;; default-metadata-proc.
+(define-dump dump-investigators
+  ;; There are a few duplicate entries. We group by email to
+  ;; deduplicate.
+  (select-query ((Investigators FirstName)
+                 (Investigators LastName)
+                 (Investigators Address)
+                 (Investigators City)
+                 (Investigators State)
+                 (Investigators ZipCode)
+                 (Investigators Phone)
+                 (Investigators Email)
+                 (Investigators Country)
+                 (Investigators Url))
+                (Investigators)
+                "GROUP BY Email")
+  (lambda (investigator)
+    (let ((subject (investigator-attributes->id
+                    (assoc-ref investigator "FirstName")
+                    (assoc-ref investigator "LastName")
+                    (assoc-ref investigator "Email"))))
+      (scm->triples
+       (map-alist investigator
+         (set rdf:type 'foaf:Person)
+         (set foaf:name (string-append (key "FirstName") " " (key "LastName")))
+         (set foaf:givenName (key "FirstName"))
+         (set foaf:familyName (key "LastName"))
+         (set foaf:phone (key "Phone"))
+         (set foaf:mbox (fix-email-id (key "Email")))
+         (set foaf:homepage (key "Url"))
+         (else=> default-metadata-proc))
+       subject))))
;; Build the identifier of an averaging method from its name by
;; passing it to string->identifier under the "avgmethod" prefix.
(define (avg-method-name->id name)
  (string->identifier "avgmethod" name))
-(define (dump-avg-method db)
- (sql-for-each (lambda (row)
- (scm->triples (map-alist row
- (set rdf:type 'gn:avgMethod)
- (set gn:name (key "Name")))
- (avg-method-name->id (assoc-ref row "Name"))))
- db
- ;; The Name and Normalization fields seem to be the
- ;; same. Dump only the Name field.
- ;;
- ;; There are two records with Name as
- ;; "N/A". Deduplicate.
- (select-query (distinct (AvgMethod Name))
- (AvgMethod))))
+;; For each distinct AvgMethod name, build an alist with map-alist and
+;; pass it to scm->triples. The subject is the averaging method
+;; identifier derived from the Name column.
+(define-dump dump-avg-method
+  ;; The Name and Normalization fields seem to be the same. Dump only
+  ;; the Name field.
+  ;;
+  ;; There are two records with Name as "N/A". Deduplicate.
+  (select-query (distinct (AvgMethod Name))
+                (AvgMethod))
+  (lambda (avg-method)
+    (let ((subject (avg-method-name->id (assoc-ref avg-method "Name"))))
+      (scm->triples
+       (map-alist avg-method
+         (set rdf:type 'gn:avgMethod)
+         (set gn:name (key "Name")))
+       subject))))
;; Build the identifier of a gene chip from its name by passing it to
;; string->identifier under the "platform" prefix.
(define (gene-chip-name->id name)
  (string->identifier "platform" name))
-(define (dump-gene-chip db)
- (sql-for-each (lambda (row)
- (scm->triples (map-alist row
- (delete "Name")
- (set rdf:type 'gn:platform)
- (set gn:name (key "GeneChipName")))
- (gene-chip-name->id (assoc-ref row "Name"))))
- db
- (select-query ((GeneChip GeneChipName)
- (GeneChip Name))
- (GeneChip))))
-
-(define (dump-info-files db)
- (sql-for-each (lambda (row)
- (scm->triples
- (map-alist row
- (set rdf:type 'gn:dataset)
- (set gn:datasetOfInvestigator
- (investigator-attributes->id (key "FirstName")
- (key "LastName")
- (key "Email")))
- (set gn:accessionId (string-append "GN" (number->string (key "GN_AccesionId"))))
- (set gn:datasetStatusName (string-downcase (key "DatasetStatusName")))
- (set gn:datasetOfSpecies (binomial-name->species-id (key "BinomialName")))
- (set gn:datasetOfInbredSet (inbred-set-name->id (key "InbredSetName")))
- (set gn:datasetOfTissue (tissue-short-name->id (key "Short_Name")))
- (set gn:normalization
- (avg-method-name->id
- ;; If AvgMethodName is NULL, assume N/A.
- (if (string-blank? (key "AvgMethodName"))
- "N/A" (key "AvgMethodName"))))
- (set gn:datasetOfPlatform (gene-chip-name->id (key "GeneChip")))
- (set gn:summary
- ;; TODO: Why are there unprintable characters?
- (delete-substrings (key "Summary")
- "\x01" "\x03"))
- (set gn:aboutTissue
- ;; TODO: Why are there unprintable characters?
- (delete-substrings (key "AboutTissue")
- "\x01" "\x03"))
- (set gn:geoSeries
- (and (not (string-prefix-ci? "no geo series"
- (key "GeoSeries")))
- (key "GeoSeries")))
- (else=> default-metadata-proc))
- (string->identifier "dataset"
- (number->string (assoc-ref row "GN_AccesionId")))))
- db
- ;; TODO: Double check Platforms. It doesn't seem to
- ;; match up.
- (select-query ((InfoFiles GN_AccesionId)
- (InfoFiles InfoFileTitle Name)
- (InfoFiles Title)
- (InfoFiles Specifics)
- (DatasetStatus DatasetStatusName)
- (Datasets DatasetName DatasetGroup)
- (Datasets Summary)
- (Datasets GeoSeries)
- (Datasets AboutCases)
- (Datasets AboutPlatform)
- (Datasets AboutTissue)
- (Datasets AboutDataProcessing)
- (Datasets Notes)
- (Datasets ExperimentDesign)
- (Datasets Contributors)
- (Datasets Citation)
- (Datasets Acknowledgment)
- (Species FullName BinomialName)
- (InbredSet Name InbredSetName)
- (Tissue Short_Name)
- (Investigators FirstName)
- (Investigators LastName)
- (Investigators Email)
- (AvgMethod Name AvgMethodName)
- (GeneChip Name GeneChip))
- (InfoFiles
- (left-join Datasets "USING (DatasetId)")
- (left-join DatasetStatus "USING (DatasetStatusId)")
- (left-join Species "USING (SpeciesId)")
- (left-join InbredSet "USING (InbredSetId)")
- (left-join Tissue "USING (TissueId)")
- (left-join Investigators "USING (InvestigatorId)")
- (left-join AvgMethod "USING (AvgMethodId)")
- (left-join GeneChip "USING (GeneChipId)"))
- "WHERE GN_AccesionId IS NOT NULL")))
+;; For each GeneChip row, build a gn:platform alist with map-alist and
+;; pass it to scm->triples. The subject is the gene chip identifier
+;; derived from the Name column; that column itself is deleted from
+;; the alist and GeneChipName is emitted as gn:name instead.
+(define-dump dump-gene-chip
+  (select-query ((GeneChip GeneChipName)
+                 (GeneChip Name))
+                (GeneChip))
+  (lambda (gene-chip)
+    (let ((subject (gene-chip-name->id (assoc-ref gene-chip "Name"))))
+      (scm->triples
+       (map-alist gene-chip
+         (delete "Name")
+         (set rdf:type 'gn:platform)
+         (set gn:name (key "GeneChipName")))
+       subject))))
+
+;; For each InfoFiles row that has a GN_AccesionId (left-joined with
+;; the dataset, status, species, inbred set, tissue, investigator,
+;; averaging method and gene chip tables), build a gn:dataset alist
+;; with map-alist and pass it to scm->triples. The subject is the
+;; "dataset" identifier derived from the numeric GN_AccesionId.
+;; Columns with no explicit clause go through default-metadata-proc.
+(define-dump dump-info-files
+  ;; TODO: Double check Platforms. It doesn't seem to match up.
+  (select-query ((InfoFiles GN_AccesionId)
+                 (InfoFiles InfoFileTitle Name)
+                 (InfoFiles Title)
+                 (InfoFiles Specifics)
+                 (DatasetStatus DatasetStatusName)
+                 (Datasets DatasetName DatasetGroup)
+                 (Datasets Summary)
+                 (Datasets GeoSeries)
+                 (Datasets AboutCases)
+                 (Datasets AboutPlatform)
+                 (Datasets AboutTissue)
+                 (Datasets AboutDataProcessing)
+                 (Datasets Notes)
+                 (Datasets ExperimentDesign)
+                 (Datasets Contributors)
+                 (Datasets Citation)
+                 (Datasets Acknowledgment)
+                 (Species FullName BinomialName)
+                 (InbredSet Name InbredSetName)
+                 (Tissue Short_Name)
+                 (Investigators FirstName)
+                 (Investigators LastName)
+                 (Investigators Email)
+                 (AvgMethod Name AvgMethodName)
+                 (GeneChip Name GeneChip))
+                (InfoFiles
+                 (left-join Datasets "USING (DatasetId)")
+                 (left-join DatasetStatus "USING (DatasetStatusId)")
+                 (left-join Species "USING (SpeciesId)")
+                 (left-join InbredSet "USING (InbredSetId)")
+                 (left-join Tissue "USING (TissueId)")
+                 (left-join Investigators "USING (InvestigatorId)")
+                 (left-join AvgMethod "USING (AvgMethodId)")
+                 (left-join GeneChip "USING (GeneChipId)"))
+                "WHERE GN_AccesionId IS NOT NULL")
+  (lambda (info-file)
+    (let ((subject (string->identifier
+                    "dataset"
+                    (number->string (assoc-ref info-file "GN_AccesionId")))))
+      (scm->triples
+       (map-alist info-file
+         (set rdf:type 'gn:dataset)
+         (set gn:datasetOfInvestigator
+              (investigator-attributes->id (key "FirstName")
+                                           (key "LastName")
+                                           (key "Email")))
+         (set gn:accessionId (string-append "GN" (number->string (key "GN_AccesionId"))))
+         (set gn:datasetStatusName (string-downcase (key "DatasetStatusName")))
+         (set gn:datasetOfSpecies (binomial-name->species-id (key "BinomialName")))
+         (set gn:datasetOfInbredSet (inbred-set-name->id (key "InbredSetName")))
+         (set gn:datasetOfTissue (tissue-short-name->id (key "Short_Name")))
+         (set gn:normalization
+              (avg-method-name->id
+               ;; If AvgMethodName is NULL, assume N/A.
+               (if (string-blank? (key "AvgMethodName"))
+                   "N/A"
+                   (key "AvgMethodName"))))
+         (set gn:datasetOfPlatform (gene-chip-name->id (key "GeneChip")))
+         (set gn:summary
+              ;; TODO: Why are there unprintable characters?
+              (delete-substrings (key "Summary")
+                                 "\x01" "\x03"))
+         (set gn:aboutTissue
+              ;; TODO: Why are there unprintable characters?
+              (delete-substrings (key "AboutTissue")
+                                 "\x01" "\x03"))
+         ;; Keep the GEO series only when it does not start with the
+         ;; placeholder text "no geo series" (in any letter case).
+         (set gn:geoSeries
+              (and (not (string-prefix-ci? "no geo series"
+                                           (key "GeoSeries")))
+                   (key "GeoSeries")))
+         (else=> default-metadata-proc))
+       subject))))
(define (dump-data-table db table-name data-field)
(let ((dump-directory (string-append %dump-directory "/" table-name))