From afaa6b096ec0df7055faafbe7f89917f734b3897 Mon Sep 17 00:00:00 2001
From: Munyoki Kilyungi
Date: Mon, 15 May 2023 21:20:50 +0300
Subject: Replace dump.scm with separete dumps for several tables

Signed-off-by: Munyoki Kilyungi
---
 examples/dump-dataset-metadata.scm | 263 +++++++++++++++++++++++++++++++++++++
 1 file changed, 263 insertions(+)
 create mode 100755 examples/dump-dataset-metadata.scm

(limited to 'examples/dump-dataset-metadata.scm')

diff --git a/examples/dump-dataset-metadata.scm b/examples/dump-dataset-metadata.scm
new file mode 100755
index 0000000..aa7a5f2
--- /dev/null
+++ b/examples/dump-dataset-metadata.scm
@@ -0,0 +1,263 @@
+#! /usr/bin/env guile
+!#
+
+;;; Dump dataset metadata (the InfoFiles table and the tables joined to
+;;; it) and the Investigators table as RDF triples into a single .ttl file.
+;;;
+;;; Usage: dump-dataset-metadata.scm CONNECTION-SETTINGS-FILE DUMP-DIRECTORY
+
+(use-modules (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 regex)
+             (dump strings)
+             (dump sql)
+             (dump triples)
+             (dump special-forms))
+
+
+
+;; Fail early with a usage message instead of an out-of-range error from
+;; `list-ref' when an argument is missing.
+(when (< (length (command-line)) 3)
+  (display "Usage: dump-dataset-metadata.scm CONNECTION-SETTINGS-FILE DUMP-DIRECTORY"
+           (current-error-port))
+  (newline (current-error-port))
+  (exit 1))
+
+;; Datum read from the file named by the first command-line argument; it
+;; is passed unchanged to `call-with-target-database' below.
+(define %connection-settings
+  (call-with-input-file (list-ref (command-line) 1)
+    read))
+
+;; Second command-line argument: the directory the dump file is written to.
+(define %dump-directory
+  (list-ref (command-line) 2))
+
+;; Return the path of FILE-NAME inside %dump-directory.  A "/" is
+;; inserted only when the directory is non-empty and does not already end
+;; with one, so both "out" and "out/" name the same directory.
+(define (dump-file-path file-name)
+  (if (or (string-null? %dump-directory)
+          (string-suffix? "/" %dump-directory))
+      (string-append %dump-directory file-name)
+      (string-append %dump-directory "/" file-name)))
+
+
+
+;; One email ID in the Investigators table has spaces in it. This
+;; function fixes that.
+(define (fix-email-id email)
+  (string-delete #\space email))
+
+;; Build the identifier of an investigator from FIRST-NAME, LAST-NAME and
+;; EMAIL: the three parts are joined with "_" and passed to
+;; `string->identifier' together with the tag "investigator".
+(define (investigator-attributes->id first-name last-name email)
+  ;; There is just one record corresponding to "Evan Williams" which
+  ;; does not have an email ID. To accommodate that record, we
+  ;; construct the investigator ID from not just the email ID, but
+  ;; also the first and the last names. It would be preferable to just
+  ;; find Evan Williams' email ID and insert it into the database.
+  (string->identifier "investigator"
+                      (string-join
+                       ;; Add special case for Yohan Bossé whose name
+                       ;; has unprintable characters.
+                       ;; TODO: Fix Yohan Bossé's name in the database.
+                       (let ((last-name (if (string=? first-name "Yohan")
+                                            "Bosse"
+                                            last-name)))
+                         (list first-name last-name (fix-email-id email)))
+                       "_")))
+
+;; Describe each selected Investigators row as a foaf:Person identified
+;; by `investigator-attributes->id', with name, contact and address
+;; predicates taken from the corresponding columns.
+(define-dump dump-investigators
+  ;; There are a few duplicate entries. We group by email to
+  ;; deduplicate.
+  (tables (Investigators)
+          "GROUP BY Email")
+  (schema-triples
+   ;; TODO: Are ranges required for FOAF predicates? Can they not be
+   ;; obtained from the FOAF spec?
+   (foaf:name rdfs:range rdfs:Literal)
+   (foaf:givenName rdfs:range rdfs:Literal)
+   (foaf:familyName rdfs:range rdfs:Literal)
+   (foaf:phone rdfs:range rdfs:Literal)
+   (foaf:mbox rdfs:range rdfs:Literal)
+   (foaf:homepage rdfs:range rdfs:Literal)
+   (gn:address rdfs:range rdfs:Literal)
+   (gn:city rdfs:range rdfs:Literal)
+   (gn:state rdfs:range rdfs:Literal)
+   (gn:zipCode rdfs:range rdfs:Literal)
+   (gn:country rdfs:range rdfs:Literal))
+  (triples (investigator-attributes->id (field Investigators FirstName)
+                                        (field Investigators LastName)
+                                        (field Investigators Email))
+    (set rdf:type 'foaf:Person)
+    ;; Special case Yohan Bossé's name since the last name has
+    ;; unprintable characters.
+    (set foaf:name (string-append (field Investigators FirstName) " "
+                                  (if (string=? (field Investigators FirstName) "Yohan")
+                                      "Bossé"
+                                      (field Investigators LastName))))
+    (set foaf:givenName (field Investigators FirstName))
+    ;; Special case Yohan Bossé's name since the last name has
+    ;; unprintable characters.
+    (set foaf:familyName (if (string=? (field Investigators FirstName) "Yohan")
+                             "Bossé"
+                             (field Investigators LastName)))
+    (set foaf:phone (field Investigators Phone))
+    (set foaf:mbox (fix-email-id (field Investigators Email)))
+    (set foaf:homepage (field Investigators Url))
+    (set gn:address (field Investigators Address))
+    (set gn:city (field Investigators City))
+    (set gn:state (field Investigators State))
+    (set gn:zipCode (field Investigators ZipCode))
+    (set gn:country (field Investigators Country))))
+
+;; Dump dataset metadata from InfoFiles and the tables joined to it.
+;; Only rows with a GN accession ID are selected.  The subject is built
+;; from InfoPageName; its rdf:type is chosen in SQL depending on which of
+;; GenoFreeze / PublishFreeze matched the page name.
+(define-dump dump-info-files
+  (tables (InfoFiles
+           (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
+           (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
+           (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
+           (left-join Datasets "USING (DatasetId)")
+           (left-join DatasetStatus "USING (DatasetStatusId)")
+           (left-join Species "USING (SpeciesId)")
+           (left-join Tissue "USING (TissueId)")
+           (left-join Investigators "USING (InvestigatorId)")
+           (left-join AvgMethod "USING (AvgMethodId)")
+           (left-join GeneChip "USING (GeneChipId)"))
+          "WHERE GN_AccesionId IS NOT NULL")
+  (schema-triples
+   (gn:datasetOfInvestigator rdfs:domain gn:dataset)
+   (gn:datasetOfInvestigator rdfs:range foaf:Person)
+   (gn:datasetOfSpecies rdfs:domain gn:dataset)
+   (gn:datasetOfSpecies rdfs:range gn:species)
+   (gn:datasetOfInbredSet rdfs:domain gn:dataset)
+   (gn:datasetOfInbredSet rdfs:range gn:inbredSet)
+   (gn:datasetOfTissue rdfs:domain gn:dataset)
+   (gn:datasetOfTissue rdfs:range gn:tissue)
+   (gn:normalization rdfs:domain gn:dataset)
+   (gn:normalization rdfs:range gn:avgMethod)
+   (gn:datasetOfPlatform rdfs:domain gn:dataset)
+   (gn:datasetOfPlatform rdfs:range gn:geneChip)
+   (gn:accessionId rdfs:range rdfs:Literal)
+   (gn:datasetStatusName rdfs:range rdfs:Literal)
+   (gn:summary rdfs:range rdfs:Literal)
+   (gn:aboutTissue rdfs:range rdfs:Literal)
+   (gn:geoSeries rdfs:range rdfs:Literal)
+   (gn:name rdfs:range rdfs:Literal)
+   (gn:title rdfs:range rdfs:Literal)
+   (gn:specifics rdfs:range rdfs:Literal)
+   (gn:datasetGroup rdfs:range rdfs:Literal)
+   (gn:aboutCases rdfs:range rdfs:Literal)
+   (gn:aboutPlatform rdfs:range rdfs:Literal)
+   (gn:aboutDataProcessing rdfs:range rdfs:Literal)
+   (gn:notes rdfs:range rdfs:Literal)
+   (gn:experimentDesign rdfs:range rdfs:Literal)
+   (gn:contributors rdfs:range rdfs:Literal)
+   (gn:citation rdfs:range rdfs:Literal)
+   (gn:acknowledgment rdfs:range rdfs:Literal))
+  (triples (string->identifier "dataset"
+                               (field InfoFiles InfoPageName))
+    ;; Add GeneChipName and GeoPlatform:
+    ;; GeneChip.GeneChipName AS gene_chip_name
+    ;; GeneChip.GeoPlatform AS geo_platform
+    (set rdf:type (string->symbol
+                   (field ("IF(GenoFreeze.Id IS NOT NULL, 'gn:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gn:phenotypeDataset', 'gn:dataset'))"
+                           rdfType))))
+    (set gn:name (field InfoFiles InfoPageName))
+    ;; Creation time of the first non-NULL of GenoFreeze, PublishFreeze
+    ;; and ProbeSetFreeze, or the empty string when all three are NULL.
+    (set dct:created
+         (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
+                 createTimeGenoFreeze)))
+    (set gn:datasetOfInvestigator
+         (investigator-attributes->id (field Investigators FirstName)
+                                      (field Investigators LastName)
+                                      (field Investigators Email)))
+    (set gn:accessionId (string-append "GN" (number->string
+                                             (field InfoFiles GN_AccesionId))))
+    (set gn:datasetStatusName (string-downcase
+                               (field DatasetStatus DatasetStatusName)))
+    (set gn:datasetOfSpecies (string->identifier "species"
+                                                 (field Species FullName BinomialName)))
+    (set gn:datasetOfTissue (string->identifier "tissue"
+                                                (field Tissue Short_Name)))
+    (set gn:normalization
+         (string->identifier "avgmethod"
+                             ;; If AvgMethodName is NULL, assume N/A.
+                             (if (string-blank? (field AvgMethod Name AvgMethodName))
+                                 "N/A" (field AvgMethod Name AvgMethodName))))
+    (set gn:datasetOfPlatform
+         (string->identifier "platform"
+                             (field GeneChip Name GeneChip)))
+    (set gn:summary
+         (sanitize-rdf-string (field Datasets Summary)))
+    (set gn:aboutTissue
+         (sanitize-rdf-string (field Datasets AboutTissue)))
+    ;; The value is #f rather than the column text when the column starts
+    ;; with "no geo series" (compared case-insensitively).
+    (set gn:geoSeries
+         (and (not (string-prefix-ci? "no geo series"
+                                      (field Datasets GeoSeries)))
+              (field Datasets GeoSeries)))
+    (set gn:title (field InfoFiles Title))
+    (set gn:specifics (sanitize-rdf-string (field InfoFiles Specifics)))
+    (set gn:datasetGroup (field Datasets DatasetName DatasetGroup))
+    (set gn:aboutCases (sanitize-rdf-string (field Datasets AboutCases)))
+    (set gn:aboutPlatform (sanitize-rdf-string (field Datasets AboutPlatform)))
+    (set gn:aboutDataProcessing (sanitize-rdf-string
+                                 (field Datasets AboutDataProcessing)))
+    (set gn:notes (sanitize-rdf-string (field Datasets Notes)))
+    (set gn:experimentDesign (sanitize-rdf-string
+                              (field Datasets ExperimentDesign)))
+    (set gn:contributors (sanitize-rdf-string (field Datasets Contributors)))
+    (set gn:citation (sanitize-rdf-string (field Datasets Citation)))
+    (set gn:acknowledgment (sanitize-rdf-string
+                            (field Datasets Acknowledgment)))))
+
+
+
+
+;; Connect to the database and write both dumps, preceded by the prefix
+;; declarations, to dump-info-pages.ttl inside the dump directory.
+;; NOTE(review): every namespace IRI below is the empty string in this
+;; copy of the file -- confirm against the upstream source.
+(call-with-target-database
+ %connection-settings
+ (lambda (db)
+   (with-output-to-file (dump-file-path "dump-info-pages.ttl")
+     (lambda ()
+       (prefix "chebi:" "")
+       (prefix "dct:" "")
+       (prefix "foaf:" "")
+       (prefix "generif:" "")
+       (prefix "gn:" "")
+       (prefix "hgnc:" "")
+       (prefix "homologene:" "")
+       (prefix "kegg:" "")
+       (prefix "molecularTrait:" "")
+       (prefix "nuccore:" "")
+       (prefix "omim:" "")
+       (prefix "owl:" "")
+       (prefix "phenotype:" "")
+       (prefix "pubchem:" "")
+       (prefix "pubmed:" "")
+       (prefix "rdf:" "")
+       (prefix "rdfs:" "")
+       (prefix "taxon:" "")
+       (prefix "uniprot:" "")
+       (prefix "up:" "")
+       (prefix "xsd:" "")
+       (newline)
+       (dump-info-files db)
+       (dump-investigators db))
+     #:encoding "utf8")))
--
cgit v1.2.3