#! /usr/bin/env guile
!#

;;; Transformer definitions that turn investigator, gene chip and
;;; dataset metadata rows into RDF triples, followed by the
;;; command-line driver that runs them.

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms))



(define (remap-species-identifiers str)
  "Remap the species name STR to a standard binomial name.  Names that
are not listed below are returned unchanged.  Obviously this should be
sorted by correcting the database!"
  (match str
    ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"]
    ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"]
    ;; NOTE(review): this maps one binomial onto a different species
    ;; name rather than normalising spelling -- confirm it is
    ;; intentional.
    ["Macaca mulatta" "Macaca nemestrina"]
    ["Bat (Glossophaga soricina)" "Glossophaga soricina"]
    ;; Catch-all: anything else passes through untouched.
    [str str]))

;; One email ID in the Investigators table has spaces in it. This
;; function fixes that by deleting every space character from EMAIL.
(define (fix-email-id email)
  (string-delete #\space email))

;; Build the investigator identifier from FIRST-NAME, LAST-NAME and
;; EMAIL: the three parts are joined with "_" (after removing spaces
;; from the email) and passed to string->identifier with the
;; "investigator" prefix.
(define (investigator-attributes->id first-name last-name email)
  ;; There is just one record corresponding to "Evan Williams" which
  ;; does not have an email ID. To accommodate that record, we
  ;; construct the investigator ID from not just the email ID, but
  ;; also the first and the last names. It would be preferable to just
  ;; find Evan Williams' email ID and insert it into the database.
  (string->identifier "investigator"
                      (string-join (list first-name
                                         last-name
                                         (fix-email-id email))
                                   "_")))

;; One foaf:Person per row of the Investigators table.
(define-transformer investigators
  ;; There are a few duplicate entries. We group by email to
  ;; deduplicate.
  (tables (Investigators)
          "GROUP BY Email")
  (triples (investigator-attributes->id (field Investigators FirstName)
                                        (field Investigators LastName)
                                        (field Investigators Email))
    (set rdf:type 'foaf:Person)
    (set foaf:name (string-append (field Investigators FirstName) " "
                                  (field Investigators LastName)))
    (set foaf:givenName (field Investigators FirstName))
    (set foaf:familyName (field Investigators LastName))
    (set foaf:homepage (field Investigators Url))
    (set v:adr (field Investigators Address))
    (set v:locality (field Investigators City))
    (set v:region (field Investigators State))
    (set v:postal-code (field Investigators ZipCode))
    (set v:country-name (field Investigators Country))))

;; One gnc:geneChip per row of GeneChip, left-joined with Species so
;; that the species the chip belongs to can be emitted.
(define-transformer gene-chip
  (tables (GeneChip
           (left-join Species "USING (SpeciesId)")))
  (schema-triples
   (gnc:geneChip a skos:Concept)
   (gnc:geneChip skos:description "This is a set of controlled terms that are used to describe a given gene chip/platform")
   ;; NOTE(review): gnt:hasGeoSeriesId is given the domain
   ;; gnc:platform here and gnc:geneChip a few lines below -- confirm
   ;; both are wanted.
   (gnt:hasGeoSeriesId rdfs:domain gnc:platform)
   (gnt:belongsToSpecies a owl:ObjectProperty)
   (gnt:belongsToSpecies skos:definition "This resource belongs to this given species")
   (gnt:belongsToSpecies rdfs:domain gnc:geneChip)
   (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip)
   (gnt:hasGOTreeValue a owl:ObjectProperty)
   (gnt:hasGOTreeValue skos:definition "This resource has the following GO tree value")
   (gnt:hasGOTreeValue rdfs:domain gnc:geneChip))
  (triples (string->identifier "platform" (field GeneChip Name))
    (set rdf:type 'gnc:geneChip)
    (set rdfs:label (field GeneChip GeneChipName))
    (set skos:prefLabel (field GeneChip Name))
    ;; The SQL expression yields Title only when it differs from
    ;; GeneChipName, and NULL otherwise.
    (set skos:altLabel
         (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)"
                 Title)))
    (set gnt:hasGOTreeValue (field GeneChip Go_tree_value))
    (set gnt:belongsToSpecies
         (string->identifier ""
                             (remap-species-identifiers
                              (field Species Fullname))
                             #:separator ""
                             #:proc string-capitalize-first))
    (set gnt:hasGeoSeriesId
         (ontology 'geoSeries:
                   (string-trim-both (field GeneChip GeoPlatform))))))

;; Dataset metadata taken from InfoFiles rows that have a
;; GN_AccesionId, left-joined with the freeze tables and the lookup
;; tables needed for the predicates below.
(define-transformer info-files
  (tables (InfoFiles
           (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId")
           (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId")
           (left-join Datasets "USING (DatasetId)")
           (left-join DatasetStatus "USING (DatasetStatusId)")
           (left-join Tissue "USING (TissueId)")
           (left-join Investigators "USING (InvestigatorId)")
           (left-join AvgMethod "USING (AvgMethodId)")
           (left-join Organizations "USING (OrganizationId)")
           (left-join GeneChip "USING (GeneChipId)"))
          "WHERE GN_AccesionId IS NOT NULL")
  (schema-triples
   (gnc:dataset rdf:type gdmt:Dataset)
   (gnc:genotypeDataset rdfs:subClassOf gnc:dataset)
   (gnc:phenotypeDataset rdfs:subClassOf gnc:dataset)
   (gnc:probesetDataset rdfs:subClassOf gnc:dataset)
   (gnt:belongsToSet rdfs:domain gnc:dataset)
   (gnt:belongsToSet a owl:ObjectProperty)
   (gnt:belongsToSet skos:definition "The InbredSet this resource belongs to")
   (gnt:hasTissue rdfs:domain gnc:dataset)
   (gnt:hasTissue a owl:ObjectProperty)
   (gnt:hasTissue skos:definition "Tissues this resource has")
   (gnt:hasTissueInfo rdfs:domain gnc:dataset)
   (gnt:hasTissueInfo a owl:ObjectProperty)
   (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource")
   (gnt:usesNormalization rdfs:domain gnc:dataset)
   (gnt:usesNormalization a owl:ObjectProperty)
   (gnt:usesNormalization skos:definition "Normalization techniques this resource has")
   (gnt:usesPlatform rdfs:domain gnc:dataset)
   (gnt:usesPlatform a owl:ObjectProperty)
   (gnt:usesPlatform skos:definition "The Platform this resource uses")
   (gnt:hasGeoSeriesId rdfs:domain gnc:dataset)
   (gnt:hasGeoSeriesId a owl:ObjectProperty)
   (gnt:hasGeoSeriesId skos:definition "id of record in NCBI database")
   (gnt:hasExperimentDesignInfo rdfs:domain gnc:dataset)
   (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design")
   (gnt:hasExperimentDesignInfo a owl:ObjectProperty)
   (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed")
   (gnt:hasNotes rdfs:domain gnc:dataset)
   (gnt:hasNotes a owl:ObjectProperty)
   (gnt:hasNotes rdfs:label "Notes")
   (gnt:hasNotes skos:definition "Extra Notes about this dataset")
   (gnt:hasDataProcessingInfo rdfs:domain gnc:dataset)
   (gnt:hasDataProcessingInfo rdfs:label "About Data Processing")
   (gnt:hasDataProcessingInfo a owl:ObjectProperty)
   (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed")
   (gnt:hasPlatformInfo rdfs:domain gnc:dataset)
   (gnt:hasPlatformInfo a owl:ObjectProperty)
   (gnt:hasPlatformInfo rdfs:label "About Platform")
   (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset")
   (gnt:hasCaseInfo rdfs:domain gnc:dataset)
   (gnt:hasCaseInfo rdfs:label "About Case")
   (gnt:hasCaseInfo a owl:ObjectProperty)
   (gnt:hasCaseInfo skos:definition "Information about the cases used in this platform")
   (gnt:hasAcknowledgement rdfs:domain gnc:dataset)
   (gnt:hasAcknowledgement rdfs:label "Acknowledgement")
   (gnt:hasAcknowledgement a owl:ObjectProperty)
   (gnt:hasAcknowledgement skos:definition "People to acknowledge"))
  (triples
      ;; Subject: InfoPageName with every character outside
      ;; [A-Za-z0-9:] replaced by "_".
      (string->identifier ""
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post)
                          #:separator ""
                          #:proc string-capitalize-first)
    ;; The SQL expression picks the dataset class name according to
    ;; which freeze table the left joins found a row in, falling back
    ;; to 'gnc:dataset'.
    (set rdf:type
         (string->symbol
          (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotypeDataset', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:probesetDataset', 'gnc:dataset')))"
                  rdfType))))
    ;; A value that is exactly "None"/"none" is turned into the empty
    ;; string; the same idiom is used for several fields below.
    (set rdfs:label
         (regexp-substitute/global #f "^[Nn]one$"
                                   (field InfoFiles InfoPageName)
                                   ""))
    ;; NOTE(review): skos:prefLabel is set twice, from two different
    ;; sources -- confirm both are intended.
    (set skos:prefLabel
         (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
                 DatasetFullName)))
    (set skos:prefLabel (field Datasets DatasetName DatasetGroup))
    (set gdmt:hasTitleInfo
         (regexp-substitute/global #f "^[Nn]one$"
                                   (field InfoFiles InfoFileTitle)
                                   ""))
    ;; This is the published title
    (set dct:title
         (regexp-substitute/global #f "^[Nn]one$"
                                   (field Datasets PublicationTitle)
                                   ""))
    (set dct:created
         (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
                 createTimeGenoFreeze)))
    (set gdmt:hasCreatorInfo
         (investigator-attributes->id (field Investigators FirstName)
                                      (field Investigators LastName)
                                      (field Investigators Email)))
    (set gdmt:hasCreatorAffiliation
         (field Organizations OrganizationName))
    (set gdmt:hasDatasetIdentifierSubType
         (format #f "GN~a" (field InfoFiles GN_AccesionId)))
    (set gdmt:hasRightsInfo
         (string-downcase (field DatasetStatus DatasetStatusName)))
    (set gnt:belongsToSet
         (string->identifier "set"
                             (field InbredSet Name)
                             #:separator ""
                             #:proc string-capitalize-first))
    (set gnt:hasTissue
         (string->identifier "tissue"
                             (field Tissue Short_Name)))
    (set gnt:usesNormalization
         (string->identifier "avgmethod"
                             ;; If AvgMethodName is NULL, assume N/A.
                             (if (string-blank? (field AvgMethod Name AvgMethodName))
                                 "N/A"
                                 (field AvgMethod Name AvgMethodName))))
    (set gnt:usesPlatform
         (string->identifier "platform"
                             (field GeneChip Name GeneChip)))
    (set gdmt:isDescribedBy
         (sanitize-rdf-string (field Datasets Summary)))
    ;; Only the first "GSE<digits>" substring of GeoSeries is kept;
    ;; when there is none the value is the empty string.
    (set gnt:hasGeoSeriesId
         (let ((s (string-match "GSE[0-9]*"
                                (field ("IFNULL(Datasets.GeoSeries, '')"
                                        GeoSeries)))))
           (if s
               (ontology 'geoSeries: (match:substring s))
               "")))
    (set gnt:hasTissueInfo
         (sanitize-rdf-string (field Datasets AboutTissue)))
    (set gnt:hasContentInfo
         (sanitize-rdf-string (field InfoFiles Specifics)))
    (set gnt:hasCaseInfo
         (sanitize-rdf-string (field Datasets AboutCases)))
    (set gnt:hasPlatformInfo
         (sanitize-rdf-string (field Datasets AboutPlatform)))
    (set gnt:hasDataProcessingInfo
         (sanitize-rdf-string (field Datasets AboutDataProcessing)))
    (set gnt:hasNotes
         (sanitize-rdf-string (field Datasets Notes)))
    (set gnt:hasExperimentDesignInfo
         (sanitize-rdf-string (field Datasets ExperimentDesign)))
    (set gdmt:hasContributorInfo
         (sanitize-rdf-string (field Datasets Contributors)))
    (set gdmt:IsCitedBy
         (sanitize-rdf-string
          (regexp-substitute/global #f "^[Nn]one$"
                                    (field Datasets Citation)
                                    "")))
    ;; NOTE(review): gnt:hasAcknowledgement is set twice, once from
    ;; InfoFiles and once from Datasets -- confirm both are intended.
    (set gnt:hasAcknowledgement
         (sanitize-rdf-string
          (string-trim-both
           (regexp-substitute/global #f "^[Nn]one$"
                                     (field InfoFiles Data_Source_Acknowledge)
                                     ""))))
    (set gnt:hasAcknowledgement
         (sanitize-rdf-string (field Datasets Acknowledgment)))))

;; These are phenotype datasets that don't have Infofile metadata
;; (the left join on InfoFiles found no row), restricted to public,
;; non-confidential PublishFreeze rows.
(define-transformer publishfreeze
  (tables (PublishFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
          "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")
  (triples
      (string->identifier ""
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field PublishFreeze Name)
                           'pre "_" 'post)
                          #:separator ""
                          #:proc string-capitalize-first)
    (set rdf:type 'gnc:phenotypeDataset)
    (set rdfs:label (field PublishFreeze Name))
    (set skos:prefLabel (field PublishFreeze FullName))
    (set skos:altLabel (field PublishFreeze ShortName))
    (set dct:created
         (annotate-field (field PublishFreeze CreateTime)
                         '^^xsd:date))
    (set gnt:belongsToSet
         (string->identifier "set"
                             (field InbredSet Name)
                             #:separator ""
                             #:proc string-capitalize-first))))

;; Genotype datasets without a matching InfoFiles row, restricted to
;; public, non-confidential GenoFreeze rows.
(define-transformer genofreeze
  (tables (GenoFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
          "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
  (triples
      ;; A single substitution pass is enough: the replacement "_"
      ;; would only be replaced by "_" again on a second pass.
      (string->identifier ""
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field GenoFreeze Name)
                           'pre "_" 'post)
                          #:separator ""
                          #:proc string-capitalize-first)
    (set rdf:type 'gnc:genotypeDataset)
    (set rdfs:label (field GenoFreeze Name))
    (set skos:prefLabel (field GenoFreeze FullName))
    (set skos:altLabel (field GenoFreeze ShortName))
    (set dct:created
         (annotate-field (field GenoFreeze CreateTime)
                         '^^xsd:date))
    (set gnt:belongsToSet
         (string->identifier "set"
                             (field InbredSet Name)
                             #:separator ""
                             #:proc string-capitalize-first))))

;; Molecular Traits are also referred to as ProbeSets
;; This transformer covers public ProbeSetFreeze rows without a
;; matching InfoFiles row, grouped by ProbeFreeze.Id.
(define-transformer probesetfreeze
  (tables (ProbeSetFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join ProbeFreeze "USING (ProbeFreezeId)")
           (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID")
           (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id")
           (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId"))
          "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id")
  (schema-triples
   ;; NOTE(review): the domain here is gnc:probeset while the
   ;; resources below are typed gnc:probesetDataset -- confirm.
   (gnt:usesNormalization rdfs:domain gnc:probeset)
   (gnt:usesDataScale rdfs:domain gnc:probeset)
   (gnt:usesDataScale a owl:ObjectProperty)
   (gnt:usesDataScale skos:definition "The data scale this resource uses"))
  (triples
      (string->identifier ""
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field ProbeSetFreeze Name)
                           'pre "_" 'post)
                          #:separator ""
                          #:proc string-capitalize-first)
    (set rdf:type 'gnc:probesetDataset)
    (set gnt:usesNormalization
         (string->identifier "avgmethod"
                             ;; If AvgMethodName is NULL, assume N/A.
                             (if (string-blank? (field AvgMethod Name AvgMethodName))
                                 "N/A"
                                 (field AvgMethod Name AvgMethodName))))
    (set dct:title (field ProbeSetFreeze FullName))
    (set rdfs:label (field ProbeSetFreeze ShortName))
    (set skos:prefLabel (field ProbeSetFreeze Name))
    (set skos:altLabel (field ProbeSetFreeze Name2))
    ;; NOTE(review): the XML Schema type is spelled xsd:dateTime;
    ;; the lowercase spelling is kept as-is here -- confirm what the
    ;; consumers of this output expect.
    (set dct:created
         (annotate-field (field ProbeSetFreeze CreateTime)
                         '^^xsd:datetime))
    (set gnt:usesDataScale (field ProbeSetFreeze DataScale))
    (set gnt:hasTissue
         (string->identifier "tissue"
                             (field Tissue Short_Name)))
    (set gnt:belongsToSet
         (string->identifier "set"
                             (field InbredSet Name)
                             #:separator ""
                             #:proc string-capitalize-first))))



;; Command-line driver.
;;   -s, --settings FILE       file whose first datum is read and used
;;                             as the connection settings (required)
;;   -o, --output VALUE        passed on as the #:rdf output
;;   -d, --documentation VALUE passed on as the #:documentation output
(let* ((option-spec
        ;; --settings is marked required so that a missing option is
        ;; reported by getopt-long instead of failing later when #f is
        ;; handed to call-with-input-file.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (call-with-input-file settings
          read)))
  (with-documentation
   (name "Info files / Investigators Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   (prefixes
    '(("v:" "")
      ("foaf:" "")
      ("gdmt:" "")
      ("skos:" "")
      ("geoSeries:" "")
      ("gnt:" "")
      ("gn:" "")
      ("gnc:" "")
      ("rdf:" "")
      ("owl:" "")
      ("rdfs:" "")
      ("taxon:" "")
      ("dct:" "")))
   (inputs
    (list info-files
          publishfreeze
          genofreeze
          probesetfreeze
          investigators
          gene-chip))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))