diff options
Diffstat (limited to 'dump.scm')
-rwxr-xr-x | dump.scm | 519 |
1 files changed, 256 insertions, 263 deletions
@@ -143,164 +143,163 @@ characters with an underscore and prefixing with gn:PREFIX." (define (triple subject predicate object) (format #t "~a ~a ~s .~%" subject predicate object)) +(define-syntax define-dump + (lambda (x) + (syntax-case x (select-query) + ((_ name (select-query (fields ...) tables raw-forms ...) proc) + (define (name db) + (sql-for-each proc db (select-query (fields ...) tables raw-forms ...))))))) + (define binomial-name->species-id (cut string->identifier "species" <>)) -(define (dump-species db) - (sql-for-each (lambda (row) - (scm->triples (map-alist row - (set rdf:type 'gn:species) - ;; Common name - (set gn:name (key "SpeciesName")) - ;; Menu name (TODO: Maybe, drop this field. It can - ;; be inferred from the common name.) - (set gn:menuName (key "MenuName")) - (set gn:binomialname (key "FullName"))) - (binomial-name->species-id (assoc-ref row "FullName")))) - db - (select-query ((Species SpeciesName) - (Species MenuName) - (Species FullName)) - (Species)))) - -(define (dump-strain db) - (sql-for-each (lambda (row) - (scm->triples (map-alist row - (set rdf:type 'gn:strain) - (set gn:strainOfSpecies - (binomial-name->species-id (key "FullName"))) - ;; Name, and maybe a second name - (set gn:name (key "Name")) - (set gn:name (key "Name2")) - (set gn:alias (key "Alias"))) - (string->identifier "strain" (assoc-ref row "Name")))) - db - (select-query ((Species FullName) - (Strain Name) - (Strain Name2) - (Strain Symbol) - (Strain Alias)) - (Strain - (join Species "ON Strain.SpeciesId = Species.SpeciesId"))))) +(define-dump dump-species + (select-query ((Species SpeciesName) + (Species MenuName) + (Species FullName)) + (Species)) + (lambda (row) + (scm->triples (map-alist row + (set rdf:type 'gn:species) + ;; Common name + (set gn:name (key "SpeciesName")) + ;; Menu name (TODO: Maybe, drop this field. It can + ;; be inferred from the common name.) + (set gn:menuName (key "MenuName")) + (set gn:binomialname (key "FullName"))) + (binomial-name->species-id (assoc-ref row "FullName"))))) + +(define-dump dump-strain + (select-query ((Species FullName) + (Strain Name) + (Strain Name2) + (Strain Symbol) + (Strain Alias)) + (Strain + (join Species "ON Strain.SpeciesId = Species.SpeciesId"))) + (lambda (row) + (scm->triples (map-alist row + (set rdf:type 'gn:strain) + (set gn:strainOfSpecies + (binomial-name->species-id (key "FullName"))) + ;; Name, and maybe a second name + (set gn:name (key "Name")) + (set gn:name (key "Name2")) + (set gn:alias (key "Alias"))) + (string->identifier "strain" (assoc-ref row "Name"))))) ;; TODO: This function is unused. Remove if not required. (define mapping-method-name->id (cut string->identifier "mappingMethod" <>)) ;; TODO: This function is unused. Remove if not required. -(define (dump-mapping-method db) - (sql-for-each (lambda (row) - (scm->triples (map-alist row - (set rdf:type 'gn:mappingMethod)) - (string->identifier "mappingMethod" (assoc-ref row "Name")))) - db - (select-query ((MappingMethod Name)) - (MappingMethod)))) +(define-dump dump-mapping-method + (select-query ((MappingMethod Name)) + (MappingMethod)) + (lambda (row) + (scm->triples (map-alist row + (set rdf:type 'gn:mappingMethod)) + (string->identifier "mappingMethod" (assoc-ref row "Name"))))) (define inbred-set-name->id (cut string->identifier "inbredSet" <>)) -(define (dump-inbred-set db) - (sql-for-each (lambda (row) - (scm->triples (map-alist row - (set rdf:type 'gn:phenotype) - (set gn:inbredSetOfSpecies - (binomial-name->species-id (key "BinomialName"))) - (else=> default-metadata-proc)) - (inbred-set-name->id (assoc-ref row "Name")))) - db - (select-query ((InbredSet Name) - (InbredSet FullName) - (InbredSet GeneticType) - (InbredSet Family) - (Species FullName BinomialName)) - (InbredSet - (inner-join Species "USING (SpeciesId)"))))) +(define-dump dump-inbred-set + (select-query ((InbredSet Name) + (InbredSet FullName) + (InbredSet GeneticType) + (InbredSet Family) + (Species FullName BinomialName)) + (InbredSet + (inner-join Species "USING (SpeciesId)"))) + (lambda (row) + (scm->triples (map-alist row + (set rdf:type 'gn:phenotype) + (set gn:inbredSetOfSpecies + (binomial-name->species-id (key "BinomialName"))) + (else=> default-metadata-proc)) + (inbred-set-name->id (assoc-ref row "Name"))))) (define (phenotype-id->id id) (string->identifier "phenotype" (number->string id))) -(define (dump-phenotype db) - (sql-for-each (lambda (row) - (scm->triples (map-alist row - (delete "Id") - (set rdf:type 'gn:phenotype) - (set gn:units (and (string-ci=? (key "Units") "unknown") - (key "Units"))) - (else=> default-metadata-proc)) - (phenotype-id->id (assoc-ref row "Id")))) - db - (select-query ((Phenotype Id) - (Phenotype Pre_publication_description) - (Phenotype Post_publication_description) - (Phenotype Original_description) - (Phenotype Units) - (Phenotype Pre_publication_abbreviation) - (Phenotype Post_publication_abbreviation) - (Phenotype Lab_code) - (Phenotype Submitter) - (Phenotype Owner) - (Phenotype Authorized_Users)) - (Phenotype)))) - -(define (dump-publication db) - (sql-for-each (lambda (row) - (scm->triples (map-alist row - (delete "Id") - (set rdf:type 'gn:publication) - (multiset gn:authors - ;; The authors field is a comma - ;; separated list. Split it. - (map string-trim (string-split (key "Authors") #\,))) - (set gn:abstract - ;; TODO: Why are there unprintable characters? - (delete-substrings (key "Abstract") "\x01")) - (else=> default-metadata-proc)) - (string->identifier "publication" - (number->string (assoc-ref row "Id"))))) - db - (select-query ((Publication Id) - (Publication PubMed_ID) - (Publication Abstract) - (Publication Authors) - (Publication Title) - (Publication Journal) - (Publication Volume) - (Publication Pages) - (Publication Month) - (Publication Year)) - (Publication)))) - -(define (dump-publish-xref db) - (sql-for-each (lambda (row) - (scm->triples (map-alist row - (set gn:phenotypeOfSpecies (inbred-set-name->id (key "Name")))) - (phenotype-id->id (assoc-ref row "PhenotypeId")))) - db - (select-query ((InbredSet Name) - (PublishXRef PhenotypeId)) - (PublishXRef - (inner-join InbredSet "USING (InbredSetId)"))))) +(define-dump dump-phenotype + (select-query ((Phenotype Id) + (Phenotype Pre_publication_description) + (Phenotype Post_publication_description) + (Phenotype Original_description) + (Phenotype Units) + (Phenotype Pre_publication_abbreviation) + (Phenotype Post_publication_abbreviation) + (Phenotype Lab_code) + (Phenotype Submitter) + (Phenotype Owner) + (Phenotype Authorized_Users)) + (Phenotype)) + (lambda (row) + (scm->triples (map-alist row + (delete "Id") + (set rdf:type 'gn:phenotype) + (set gn:units (and (string-ci=? (key "Units") "unknown") + (key "Units"))) + (else=> default-metadata-proc)) + (phenotype-id->id (assoc-ref row "Id"))))) + +(define-dump dump-publication + (select-query ((Publication Id) + (Publication PubMed_ID) + (Publication Abstract) + (Publication Authors) + (Publication Title) + (Publication Journal) + (Publication Volume) + (Publication Pages) + (Publication Month) + (Publication Year)) + (Publication)) + (lambda (row) + (scm->triples (map-alist row + (delete "Id") + (set rdf:type 'gn:publication) + (multiset gn:authors + ;; The authors field is a comma + ;; separated list. Split it. + (map string-trim (string-split (key "Authors") #\,))) + (set gn:abstract + ;; TODO: Why are there unprintable characters? + (delete-substrings (key "Abstract") "\x01")) + (else=> default-metadata-proc)) + (string->identifier "publication" + (number->string (assoc-ref row "Id")))))) + +(define-dump dump-publish-xref + (select-query ((InbredSet Name) + (PublishXRef PhenotypeId)) + (PublishXRef + (inner-join InbredSet "USING (InbredSetId)"))) + (lambda (row) + (scm->triples (map-alist row + (set gn:phenotypeOfSpecies (inbred-set-name->id (key "Name")))) + (phenotype-id->id (assoc-ref row "PhenotypeId"))))) (define tissue-short-name->id (cut string->identifier "tissue" <>)) -(define (dump-tissue db) +(define-dump dump-tissue ;; The Name and TissueName fields seem to be identical. BIRN_lex_ID ;; and BIRN_lex_Name are mostly NULL. - (sql-for-each (lambda (row) - (scm->triples (map-alist row - (delete "Short_Name") - (set rdf:type 'gn:tissue) - (set gn:name (key "Name"))) - ;; Hopefully the Short_Name field is - ;; distinct and can be used as an - ;; identifier. - (tissue-short-name->id (assoc-ref row "Short_Name")))) - db - (select-query ((Tissue Name) - (Tissue Short_Name)) - (Tissue)))) + (select-query ((Tissue Name) + (Tissue Short_Name)) + (Tissue)) + (lambda (row) + (scm->triples (map-alist row + (delete "Short_Name") + (set rdf:type 'gn:tissue) + (set gn:name (key "Name"))) + ;; Hopefully the Short_Name field is + ;; distinct and can be used as an + ;; identifier. + (tissue-short-name->id (assoc-ref row "Short_Name"))))) ;; One email ID in the Investigators table has spaces in it. This ;; function fixes that. @@ -317,142 +316,136 @@ characters with an underscore and prefixing with gn:PREFIX." (string-join (list first-name last-name (fix-email-id email)) "_"))) -(define (dump-investigators db) - (sql-for-each (lambda (row) - (scm->triples (map-alist row - (set rdf:type 'foaf:Person) - (set foaf:name (string-append (key "FirstName") " " (key "LastName"))) - (set foaf:givenName (key "FirstName")) - (set foaf:familyName (key "LastName")) - (set foaf:phone (key "Phone")) - (set foaf:mbox (fix-email-id (key "Email"))) - (set foaf:homepage (key "Url")) - (else=> default-metadata-proc)) - (investigator-attributes->id (assoc-ref row "FirstName") - (assoc-ref row "LastName") - (assoc-ref row "Email")))) - db - ;; There are a few duplicate entries. We group by - ;; email to deduplicate. - (select-query ((Investigators FirstName) - (Investigators LastName) - (Investigators Address) - (Investigators City) - (Investigators State) - (Investigators ZipCode) - (Investigators Phone) - (Investigators Email) - (Investigators Country) - (Investigators Url)) - (Investigators) - "GROUP BY Email"))) +(define-dump dump-investigators + ;; There are a few duplicate entries. We group by email to + ;; deduplicate. + (select-query ((Investigators FirstName) + (Investigators LastName) + (Investigators Address) + (Investigators City) + (Investigators State) + (Investigators ZipCode) + (Investigators Phone) + (Investigators Email) + (Investigators Country) + (Investigators Url)) + (Investigators) + "GROUP BY Email") + (lambda (row) + (scm->triples (map-alist row + (set rdf:type 'foaf:Person) + (set foaf:name (string-append (key "FirstName") " " (key "LastName"))) + (set foaf:givenName (key "FirstName")) + (set foaf:familyName (key "LastName")) + (set foaf:phone (key "Phone")) + (set foaf:mbox (fix-email-id (key "Email"))) + (set foaf:homepage (key "Url")) + (else=> default-metadata-proc)) + (investigator-attributes->id (assoc-ref row "FirstName") + (assoc-ref row "LastName") + (assoc-ref row "Email"))))) (define avg-method-name->id (cut string->identifier "avgmethod" <>)) -(define (dump-avg-method db) - (sql-for-each (lambda (row) - (scm->triples (map-alist row - (set rdf:type 'gn:avgMethod) - (set gn:name (key "Name"))) - (avg-method-name->id (assoc-ref row "Name")))) - db - ;; The Name and Normalization fields seem to be the - ;; same. Dump only the Name field. - ;; - ;; There are two records with Name as - ;; "N/A". Deduplicate. - (select-query (distinct (AvgMethod Name)) - (AvgMethod)))) +(define-dump dump-avg-method + ;; The Name and Normalization fields seem to be the same. Dump only + ;; the Name field. + ;; + ;; There are two records with Name as "N/A". Deduplicate. + (select-query (distinct (AvgMethod Name)) + (AvgMethod)) + (lambda (row) + (scm->triples (map-alist row + (set rdf:type 'gn:avgMethod) + (set gn:name (key "Name"))) + (avg-method-name->id (assoc-ref row "Name"))))) (define gene-chip-name->id (cut string->identifier "platform" <>)) -(define (dump-gene-chip db) - (sql-for-each (lambda (row) - (scm->triples (map-alist row - (delete "Name") - (set rdf:type 'gn:platform) - (set gn:name (key "GeneChipName"))) - (gene-chip-name->id (assoc-ref row "Name")))) - db - (select-query ((GeneChip GeneChipName) - (GeneChip Name)) - (GeneChip)))) - -(define (dump-info-files db) - (sql-for-each (lambda (row) - (scm->triples - (map-alist row - (set rdf:type 'gn:dataset) - (set gn:datasetOfInvestigator - (investigator-attributes->id (key "FirstName") - (key "LastName") - (key "Email"))) - (set gn:accessionId (string-append "GN" (number->string (key "GN_AccesionId")))) - (set gn:datasetStatusName (string-downcase (key "DatasetStatusName"))) - (set gn:datasetOfSpecies (binomial-name->species-id (key "BinomialName"))) - (set gn:datasetOfInbredSet (inbred-set-name->id (key "InbredSetName"))) - (set gn:datasetOfTissue (tissue-short-name->id (key "Short_Name"))) - (set gn:normalization - (avg-method-name->id - ;; If AvgMethodName is NULL, assume N/A. - (if (string-blank? (key "AvgMethodName")) - "N/A" (key "AvgMethodName")))) - (set gn:datasetOfPlatform (gene-chip-name->id (key "GeneChip"))) - (set gn:summary - ;; TODO: Why are there unprintable characters? - (delete-substrings (key "Summary") - "\x01" "\x03")) - (set gn:aboutTissue - ;; TODO: Why are there unprintable characters? - (delete-substrings (key "AboutTissue") - "\x01" "\x03")) - (set gn:geoSeries - (and (not (string-prefix-ci? "no geo series" - (key "GeoSeries"))) - (key "GeoSeries"))) - (else=> default-metadata-proc)) - (string->identifier "dataset" - (number->string (assoc-ref row "GN_AccesionId"))))) - db - ;; TODO: Double check Platforms. It doesn't seem to - ;; match up. - (select-query ((InfoFiles GN_AccesionId) - (InfoFiles InfoFileTitle Name) - (InfoFiles Title) - (InfoFiles Specifics) - (DatasetStatus DatasetStatusName) - (Datasets DatasetName DatasetGroup) - (Datasets Summary) - (Datasets GeoSeries) - (Datasets AboutCases) - (Datasets AboutPlatform) - (Datasets AboutTissue) - (Datasets AboutDataProcessing) - (Datasets Notes) - (Datasets ExperimentDesign) - (Datasets Contributors) - (Datasets Citation) - (Datasets Acknowledgment) - (Species FullName BinomialName) - (InbredSet Name InbredSetName) - (Tissue Short_Name) - (Investigators FirstName) - (Investigators LastName) - (Investigators Email) - (AvgMethod Name AvgMethodName) - (GeneChip Name GeneChip)) - (InfoFiles - (left-join Datasets "USING (DatasetId)") - (left-join DatasetStatus "USING (DatasetStatusId)") - (left-join Species "USING (SpeciesId)") - (left-join InbredSet "USING (InbredSetId)") - (left-join Tissue "USING (TissueId)") - (left-join Investigators "USING (InvestigatorId)") - (left-join AvgMethod "USING (AvgMethodId)") - (left-join GeneChip "USING (GeneChipId)")) - "WHERE GN_AccesionId IS NOT NULL"))) +(define-dump dump-gene-chip + (select-query ((GeneChip GeneChipName) + (GeneChip Name)) + (GeneChip)) + (lambda (row) + (scm->triples (map-alist row + (delete "Name") + (set rdf:type 'gn:platform) + (set gn:name (key "GeneChipName"))) + (gene-chip-name->id (assoc-ref row "Name"))))) + +(define-dump dump-info-files + ;; TODO: Double check Platforms. It doesn't seem to match up. + (select-query ((InfoFiles GN_AccesionId) + (InfoFiles InfoFileTitle Name) + (InfoFiles Title) + (InfoFiles Specifics) + (DatasetStatus DatasetStatusName) + (Datasets DatasetName DatasetGroup) + (Datasets Summary) + (Datasets GeoSeries) + (Datasets AboutCases) + (Datasets AboutPlatform) + (Datasets AboutTissue) + (Datasets AboutDataProcessing) + (Datasets Notes) + (Datasets ExperimentDesign) + (Datasets Contributors) + (Datasets Citation) + (Datasets Acknowledgment) + (Species FullName BinomialName) + (InbredSet Name InbredSetName) + (Tissue Short_Name) + (Investigators FirstName) + (Investigators LastName) + (Investigators Email) + (AvgMethod Name AvgMethodName) + (GeneChip Name GeneChip)) + (InfoFiles + (left-join Datasets "USING (DatasetId)") + (left-join DatasetStatus "USING (DatasetStatusId)") + (left-join Species "USING (SpeciesId)") + (left-join InbredSet "USING (InbredSetId)") + (left-join Tissue "USING (TissueId)") + (left-join Investigators "USING (InvestigatorId)") + (left-join AvgMethod "USING (AvgMethodId)") + (left-join GeneChip "USING (GeneChipId)")) + "WHERE GN_AccesionId IS NOT NULL") + (lambda (row) + (scm->triples + (map-alist row + (set rdf:type 'gn:dataset) + (set gn:datasetOfInvestigator + (investigator-attributes->id (key "FirstName") + (key "LastName") + (key "Email"))) + (set gn:accessionId (string-append "GN" (number->string (key "GN_AccesionId")))) + (set gn:datasetStatusName (string-downcase (key "DatasetStatusName"))) + (set gn:datasetOfSpecies (binomial-name->species-id (key "BinomialName"))) + (set gn:datasetOfInbredSet (inbred-set-name->id (key "InbredSetName"))) + (set gn:datasetOfTissue (tissue-short-name->id (key "Short_Name"))) + (set gn:normalization + (avg-method-name->id + ;; If AvgMethodName is NULL, assume N/A. + (if (string-blank? (key "AvgMethodName")) + "N/A" (key "AvgMethodName")))) + (set gn:datasetOfPlatform (gene-chip-name->id (key "GeneChip"))) + (set gn:summary + ;; TODO: Why are there unprintable characters? + (delete-substrings (key "Summary") + "\x01" "\x03")) + (set gn:aboutTissue + ;; TODO: Why are there unprintable characters? + (delete-substrings (key "AboutTissue") + "\x01" "\x03")) + (set gn:geoSeries + (and (not (string-prefix-ci? "no geo series" + (key "GeoSeries"))) + (key "GeoSeries"))) + (else=> default-metadata-proc)) + (string->identifier "dataset" + (number->string (assoc-ref row "GN_AccesionId")))))) (define (dump-data-table db table-name data-field) (let ((dump-directory (string-append %dump-directory "/" table-name)) |