#! /usr/bin/env guile
!#

;;; Dump investigator and dataset info-page metadata as RDF triples.
;;;
;;; Usage: SCRIPT CONNECTION-SETTINGS-FILE DUMP-DIRECTORY
;;;
;;; CONNECTION-SETTINGS-FILE is read with `read' and handed to
;;; `call-with-target-database'.  The output is written to
;;; "dump-info-pages.ttl" inside DUMP-DIRECTORY.

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 match)
             (ice-9 regex)
             (dump strings)
             (dump sql)
             (dump triples)
             (dump special-forms))

;; Return the command-line argument at position INDEX (0 is the
;; program name).  If it is missing, print a usage message naming the
;; missing argument (NAME) on the error port and exit with status 1
;; instead of failing with an out-of-range error from `list-ref'.
(define (command-line-argument index name)
  (let ((arguments (command-line)))
    (if (< index (length arguments))
        (list-ref arguments index)
        (begin
          (format (current-error-port)
                  "Usage: ~a CONNECTION-SETTINGS-FILE DUMP-DIRECTORY~%Missing argument: ~a~%"
                  (if (pair? arguments) (car arguments) "guile")
                  name)
          (exit 1)))))

;; Return the name of FILE inside DIRECTORY.  A "/" is inserted only
;; when DIRECTORY is non-empty and does not already end in one, so
;; directories given with a trailing slash (and the empty string)
;; produce exactly the same name as plain concatenation would.
(define (in-dump-directory directory file)
  (if (or (string-null? directory)
          (string-suffix? "/" directory))
      (string-append directory file)
      (string-append directory "/" file)))

;; Datum read from the file named by the first command-line argument.
(define %connection-settings
  (call-with-input-file
      (command-line-argument 1 "CONNECTION-SETTINGS-FILE")
    read))

;; Directory (second command-line argument) that receives the dump.
(define %dump-directory
  (command-line-argument 2 "DUMP-DIRECTORY"))

;; One email ID in the Investigators table has spaces in it. This
;; function fixes that by deleting every space character from EMAIL.
(define (fix-email-id email)
  (string-delete #\space email))

;; Build the identifier for an investigator from FIRST-NAME, LAST-NAME
;; and EMAIL.  The three parts are joined with "_" and passed to
;; `string->identifier' with the "investigator" prefix.
(define (investigator-attributes->id first-name last-name email)
  ;; There is just one record corresponding to "Evan Williams" which
  ;; does not have an email ID. To accommodate that record, we
  ;; construct the investigator ID from not just the email ID, but
  ;; also the first and the last names. It would be preferable to just
  ;; find Evan Williams' email ID and insert it into the database.
  (string->identifier "investigator"
                      (string-join
                       ;; Add special case for Yohan Bossé whose name
                       ;; has unprintable characters.  Note that the
                       ;; test is on the first name only.
                       ;; TODO: Fix Yohan Bossé's name in the database.
                       (let ((last-name (if (string=? first-name "Yohan")
                                            "Bosse"
                                            last-name)))
                         (list first-name
                               last-name
                               (fix-email-id email)))
                       "_")))

;; Dump one foaf:Person per row of the Investigators table.  The
;; subject is the identifier built by `investigator-attributes->id'.
;; NOTE(review): `define-dump' comes from (dump special-forms); it is
;; used below as a procedure of one argument, the database handle.
(define-dump dump-investigators
  ;; There are a few duplicate entries. We group by email to
  ;; deduplicate.
  (tables (Investigators)
          "GROUP BY Email")
  (schema-triples
   ;; TODO: Are ranges required for FOAF predicates? Can they not be
   ;; obtained from the FOAF spec?
   (foaf:name rdfs:range rdfs:Literal)
   (foaf:givenName rdfs:range rdfs:Literal)
   (foaf:familyName rdfs:range rdfs:Literal)
   (foaf:homepage rdfs:range rdfs:Literal)
   (gn:address rdfs:range rdfs:Literal)
   (gn:city rdfs:range rdfs:Literal)
   (gn:state rdfs:range rdfs:Literal)
   (gn:zipCode rdfs:range rdfs:Literal)
   (gn:country rdfs:range rdfs:Literal))
  (triples (investigator-attributes->id (field Investigators FirstName)
                                        (field Investigators LastName)
                                        (field Investigators Email))
    (set rdf:type 'foaf:Person)
    ;; Same "Yohan" special case as in `investigator-attributes->id',
    ;; but here the correctly spelled last name is emitted.
    (set foaf:name (string-append (field Investigators FirstName) " "
                                  (if (string=? (field Investigators FirstName) "Yohan")
                                      "Bossé"
                                      (field Investigators LastName))))
    ;; The given and family names are re-encoded on the SQL side
    ;; (latin1 bytes reinterpreted as utf8).
    (set foaf:givenName
         (field ("CAST(CONVERT(BINARY CONVERT(FirstName USING latin1) USING utf8) AS VARCHAR(100))" FirstName)))
    (set foaf:familyName
         (field ("CAST(CONVERT(BINARY CONVERT(LastName USING latin1) USING utf8) AS VARCHAR(100))" LastName)))
    (set foaf:homepage (field Investigators Url))
    (set gn:address (field Investigators Address))
    (set gn:city (field Investigators City))
    (set gn:state (field Investigators State))
    (set gn:zipCode (field Investigators ZipCode))
    (set gn:country (field Investigators Country))))

;; Dump one dataset resource per InfoFiles row that has a
;; GN_AccesionId.  InfoFiles is left-joined with the freeze tables and
;; with the dataset, status, species, tissue, investigator, averaging
;; method and gene chip tables to fill in the dataset's properties.
(define-dump dump-info-files
  (tables (InfoFiles
           (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join Datasets "USING (DatasetId)")
           (left-join DatasetStatus "USING (DatasetStatusId)")
           (left-join Species "USING (SpeciesId)")
           (left-join Tissue "USING (TissueId)")
           (left-join Investigators "USING (InvestigatorId)")
           (left-join AvgMethod "USING (AvgMethodId)")
           (left-join GeneChip "USING (GeneChipId)"))
          "WHERE GN_AccesionId IS NOT NULL")
  (schema-triples
   (gn:datasetOfInvestigator rdfs:domain gn:dataset)
   (gn:datasetOfInvestigator rdfs:range foaf:Person)
   (gn:datasetOfSpecies rdfs:domain gn:dataset)
   (gn:datasetOfSpecies rdfs:range gn:species)
   (gn:datasetOfInbredSet rdfs:domain gn:dataset)
   (gn:datasetOfInbredSet rdfs:range gn:inbredSet)
   (gn:datasetOfTissue rdfs:domain gn:dataset)
   (gn:datasetOfTissue rdfs:range gn:tissue)
   (gn:normalization rdfs:domain gn:dataset)
   (gn:normalization rdfs:range gn:avgMethod)
   (gn:datasetOfPlatform rdfs:domain gn:dataset)
   (gn:datasetOfPlatform rdfs:range gn:geneChip)
   (gn:accessionId rdfs:range rdfs:Literal)
   (gn:datasetStatusName rdfs:range rdfs:Literal)
   (gn:summary rdfs:range rdfs:Literal)
   (gn:aboutTissue rdfs:range rdfs:Literal)
   (gn:geoSeries rdfs:range rdfs:Literal)
   (gn:name rdfs:range rdfs:Literal)
   (gn:title rdfs:range rdfs:Literal)
   (gn:specifics rdfs:range rdfs:Literal)
   (gn:datasetGroup rdfs:range rdfs:Literal)
   (gn:aboutCases rdfs:range rdfs:Literal)
   (gn:aboutPlatform rdfs:range rdfs:Literal)
   (gn:aboutDataProcessing rdfs:range rdfs:Literal)
   (gn:notes rdfs:range rdfs:Literal)
   (gn:experimentDesign rdfs:range rdfs:Literal)
   (gn:contributors rdfs:range rdfs:Literal)
   (gn:citation rdfs:range rdfs:Literal)
   (gn:acknowledgment rdfs:range rdfs:Literal))
  ;; The subject is the info page name with every character outside
  ;; [A-Za-z0-9:] replaced by "_".
  (triples (ontology 'dataset:
                     (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                               (field InfoFiles InfoPageName)
                                               'pre "_" 'post))
    ;; Add GeneChipName and GeoPlatform:
    ;; GeneChip.GeneChipName AS gene_chip_name
    ;; GeneChip.GeoPlatform AS geo_platform
    ;;
    ;; The RDF type is chosen in SQL: a GenoFreeze match wins over a
    ;; PublishFreeze match, and gn:dataset is the fallback.
    (set rdf:type (string->symbol
                   (field ("IF(GenoFreeze.Id IS NOT NULL, 'gn:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gn:phenotypeDataset', 'gn:dataset'))"
                           rdfType))))
    (set gn:name (field InfoFiles InfoPageName))
    ;; First non-NULL creation time among the three freeze tables,
    ;; or the empty string when all are NULL.
    (set dct:created
         (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
                 createTimeGenoFreeze)))
    (set gn:datasetOfInvestigator
         (investigator-attributes->id (field Investigators FirstName)
                                      (field Investigators LastName)
                                      (field Investigators Email)))
    (set gn:accessionId (string-append "GN" (number->string
                                             (field InfoFiles GN_AccesionId))))
    (set gn:datasetStatusName (string-downcase
                               (field DatasetStatus DatasetStatusName)))
    (set gn:datasetOfSpecies (string->identifier "species"
                                                 (field Species FullName BinomialName)))
    (set gn:datasetOfTissue (string->identifier "tissue"
                                                (field Tissue Short_Name)))
    (set gn:normalization
         (string->identifier "avgmethod"
                             ;; If AvgMethodName is NULL, assume N/A.
                             (if (string-blank? (field AvgMethod Name AvgMethodName))
                                 "N/A"
                                 (field AvgMethod Name AvgMethodName))))
    (set gn:datasetOfPlatform
         (string->identifier "platform"
                             (field GeneChip Name GeneChip)))
    (set gn:summary
         (sanitize-rdf-string (field Datasets Summary)))
    (set gn:aboutTissue
         (sanitize-rdf-string (field Datasets AboutTissue)))
    ;; Evaluates to #f when the value starts with "no geo series"
    ;; (case-insensitively), otherwise to the value itself.
    (set gn:geoSeries
         (and (not (string-prefix-ci? "no geo series"
                                      (field Datasets GeoSeries)))
              (field Datasets GeoSeries)))
    (set gn:title (field InfoFiles Title))
    (set gn:specifics (sanitize-rdf-string (field InfoFiles Specifics)))
    (set gn:datasetGroup (field Datasets DatasetName DatasetGroup))
    (set gn:aboutCases
         (sanitize-rdf-string (field Datasets AboutCases)))
    (set gn:aboutPlatform
         (sanitize-rdf-string (field Datasets AboutPlatform)))
    (set gn:aboutDataProcessing
         (sanitize-rdf-string (field Datasets AboutDataProcessing)))
    (set gn:notes
         (sanitize-rdf-string (field Datasets Notes)))
    (set gn:experimentDesign
         (sanitize-rdf-string (field Datasets ExperimentDesign)))
    (set gn:contributors
         (sanitize-rdf-string (field Datasets Contributors)))
    (set gn:citation
         (sanitize-rdf-string (field Datasets Citation)))
    (set gn:acknowledgment
         (sanitize-rdf-string (field Datasets Acknowledgment)))))

;; Entry point: connect to the database, then write the prefix
;; declarations followed by both dumps to "dump-info-pages.ttl" in the
;; dump directory, encoded as UTF-8.
(call-with-target-database
 %connection-settings
 (lambda (db)
   (with-output-to-file (in-dump-directory %dump-directory "dump-info-pages.ttl")
     (lambda ()
       ;; NOTE(review): every prefix below is declared with an empty
       ;; IRI string.  This looks like the "<...>" IRIs were lost at
       ;; some point -- confirm against the canonical prefix list
       ;; before relying on the generated file.
       (prefix "chebi:" "")
       (prefix "dct:" "")
       (prefix "foaf:" "")
       (prefix "generif:" "")
       (prefix "gn:" "")
       (prefix "hgnc:" "")
       (prefix "homologene:" "")
       (prefix "kegg:" "")
       (prefix "molecularTrait:" "")
       (prefix "nuccore:" "")
       (prefix "omim:" "")
       (prefix "owl:" "")
       (prefix "phenotype:" "")
       (prefix "pubchem:" "")
       (prefix "pubmed:" "")
       (prefix "rdf:" "")
       (prefix "rdfs:" "")
       (prefix "taxon:" "")
       (prefix "uniprot:" "")
       (prefix "up:" "")
       (prefix "xsd:" "")
       (prefix "dataset:" "")
       (newline)
       (dump-info-files db)
       (dump-investigators db))
     #:encoding "utf8")))