#! /usr/bin/env guile
!#

;;; Transform dataset metadata (InfoFiles, the *Freeze tables,
;;; Investigators and GeneChip) from the SQL database into RDF triples.
;;;
;;; Usage: <script> -s SETTINGS-FILE -o RDF-OUTPUT -d DOCUMENTATION-OUTPUT

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms))


;;;
;;; Helpers
;;;

(define (remap-species-identifiers str)
  "Map the species name STR to its standard binomial name.  Names that
have no special case are returned unchanged.  This is a workaround;
the names should really be corrected in the database."
  (match str
    ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"]
    ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"]
    ;; NOTE(review): this maps one macaque species onto a different
    ;; one; kept as-is, but confirm that it is intentional.
    ["Macaca mulatta" "Macaca nemestrina"]
    ["Bat (Glossophaga soricina)" "Glossophaga soricina"]
    [str str]))

;; One email ID in the Investigators table has spaces in it.  This
;; function fixes that.
(define (fix-email-id email)
  "Return EMAIL with every space character removed."
  (string-delete #\space email))

(define (investigator-attributes->id first-name last-name email)
  "Build the RDF identifier of an investigator from FIRST-NAME,
LAST-NAME and EMAIL."
  ;; There is just one record corresponding to "Evan Williams" which
  ;; does not have an email ID.  To accommodate that record, we
  ;; construct the investigator ID from not just the email ID, but
  ;; also the first and the last names.  It would be preferable to just
  ;; find Evan Williams' email ID and insert it into the database.
  (string->identifier "investigator"
                      (string-join
                       (list first-name last-name (fix-email-id email))
                       "_")))

(define (sanitize-dataset-name name)
  "Return NAME with every character other than an ASCII letter, a
digit or a colon replaced by an underscore."
  (regexp-substitute/global #f "[^A-Za-z0-9:]" name 'pre "_" 'post))

(define (dataset-doc-link info-page-name document content)
  "Return, as a symbol, the bracketed gn-docs URL of DOCUMENT (a file
name without its \".rtf\" extension) for the dataset INFO-PAGE-NAME.
Return the empty string instead when CONTENT, the database column
backing that document, is null or blank."
  (if (or (null? content) (string-blank? content))
      ""
      (string->symbol
       (format #f
               "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/~a.rtf>"
               (string-capitalize-first
                (sanitize-dataset-name info-page-name))
               document))))

(define (first-non-blank-string . candidates)
  "Return the first of CANDIDATES that is a non-blank string, or the
empty string when there is none."
  ;; A plain `or' cannot express this fallback: in Scheme every
  ;; string, including "", is a true value.
  (or (find (lambda (candidate)
              (and (string? candidate)
                   (not (string-blank? candidate))))
            candidates)
      ""))


;;;
;;; Transformers
;;;

(define-transformer investigators
  ;; There are a few duplicate entries.  We group by email to
  ;; deduplicate.
  (tables (Investigators)
          "GROUP BY Email")
  (triples (investigator-attributes->id (field Investigators FirstName)
                                        (field Investigators LastName)
                                        (field Investigators Email))
    (set rdf:type 'foaf:Person)
    (set foaf:name (string-append (field Investigators FirstName) " "
                                  (field Investigators LastName)))
    (set foaf:givenName (field Investigators FirstName))
    (set foaf:familyName (field Investigators LastName))
    (set foaf:homepage (field Investigators Url))
    (set v:adr (field Investigators Address))
    (set v:locality (field Investigators City))
    (set v:region (field Investigators State))
    (set v:postal-code (field Investigators ZipCode))
    (set v:country-name (field Investigators Country))))

(define-transformer gene-chip
  (tables (GeneChip
           (left-join Species "USING (SpeciesId)")))
  (schema-triples
   (gnc:geneChip a skos:Concept)
   (gnc:geneChip skos:description "This is a set of controlled terms that are used to describe a given gene chip/platform")
   (gnt:hasGeoSeriesId rdfs:domain gnc:platform)
   (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip)
   (gnt:hasGOTreeValue a owl:ObjectProperty)
   (gnt:hasGOTreeValue skos:definition "This resource the following GO tree value")
   (gnt:hasGOTreeValue rdfs:domain gnc:geneChip))
  (triples (string->identifier "platform" (field GeneChip Name))
    (set rdf:type 'gnc:geneChip)
    (set rdfs:label (field GeneChip GeneChipName))
    (set skos:prefLabel (field GeneChip Name))
    ;; The title is only an alternative label when it differs from
    ;; the chip name.
    (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)" Title)))
    (set gnt:hasGOTreeValue (field GeneChip Go_tree_value))
    (set xkos:classifiedUnder
         (string->identifier ""
                             (remap-species-identifiers (field Species Fullname))
                             #:separator ""
                             #:proc string-capitalize-first))
    (set gnt:hasGeoSeriesId
         (ontology 'geoSeries:
                   (string-trim-both (field GeneChip GeoPlatform))))))

(define-transformer info-files
  (tables (InfoFiles
           (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId")
           (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId")
           (left-join Datasets "USING (DatasetId)")
           (left-join DatasetStatus "USING (DatasetStatusId)")
           (left-join Tissue "USING (TissueId)")
           (left-join Investigators "USING (InvestigatorId)")
           (left-join AvgMethod "USING (AvgMethodId)")
           (left-join Organizations "USING (OrganizationId)")
           (left-join GeneChip "USING (GeneChipId)"))
          ;; XXXX: There are datasets that don't have the InbredSetId
          ;; in the Infofiles table.  This clause allows us to check
          ;; if they exist in the (Publish/Geno)Freeze tables.
          "LEFT JOIN InbredSet PublishInbredSet ON PublishFreeze.InbredSetId = PublishInbredSet.InbredSetId LEFT JOIN InbredSet GenoInbredSet ON GenoFreeze.InbredSetId = GenoInbredSet.InbredSetId WHERE GN_AccesionId IS NOT NULL")
  (schema-triples
   (gnt:hasTissue rdfs:domain dcat:Dataset)
   (gnt:hasTissue a owl:ObjectProperty)
   (gnt:hasTissue skos:definition "Tissues this resource has")
   (gnt:usesNormalization rdfs:domain dcat:Dataset)
   (gnt:usesNormalization a owl:ObjectProperty)
   (gnt:usesNormalization skos:definition "Normalization techniques this resource has")
   (gnt:usesPlatform rdfs:domain dcat:Dataset)
   (gnt:usesPlatform a owl:ObjectProperty)
   (gnt:usesPlatform skos:definition "The Platform this resource uses")
   (gnt:hasGeoSeriesId rdfs:domain dcat:Dataset)
   (gnt:hasGeoSeriesId a owl:ObjectProperty)
   (gnt:hasGeoSeriesId skos:definition "id of record in NCBI database")
   (gnt:hasExperimentType rdfs:domain dcat:Dataset)
   (gnt:hasExperimentType a owl:ObjectProperty)
   (gnt:hasExperimentType rdfs:label "Experiment Type Metadata")
   (gnt:hasExperimentType skos:definition "Information about the experiment type")
   (gnt:hasTissueInfo rdfs:domain dcat:Dataset)
   (gnt:hasTissueInfo a owl:ObjectProperty)
   (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource")
   (gnt:hasExperimentDesignInfo rdfs:domain dcat:Dataset)
   (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design")
   (gnt:hasExperimentDesignInfo a owl:ObjectProperty)
   (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed")
   (gnt:hasNotes rdfs:domain dcat:Dataset)
   (gnt:hasNotes a owl:ObjectProperty)
   (gnt:hasNotes rdfs:label "Notes")
   (gnt:hasNotes skos:definition "Extra Notes about this dataset")
   (gnt:hasDataProcessingInfo rdfs:domain dcat:Dataset)
   (gnt:hasDataProcessingInfo rdfs:label "About Data Processing")
   (gnt:hasDataProcessingInfo a owl:ObjectProperty)
   (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed")
   (gnt:hasPlatformInfo rdfs:domain dcat:Dataset)
   (gnt:hasPlatformInfo a owl:ObjectProperty)
   (gnt:hasPlatformInfo rdfs:label "About Platform")
   (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset")
   (gnt:hasCaseInfo rdfs:domain dcat:Dataset)
   (gnt:hasCaseInfo rdfs:label "About Case")
   (gnt:hasCaseInfo a owl:ObjectProperty)
   (gnt:hasCaseInfo skos:definition "Information about the cases used in this platform")
   (gnt:hasSummary rdfs:domain dcat:Dataset)
   (gnt:hasSummary rdfs:label "Summary")
   (gnt:hasSummary a owl:ObjectProperty)
   (gnt:hasSummary skos:definition "Summary information about dataset")
   (gnt:hasCitation rdfs:domain dcat:Dataset)
   (gnt:hasCitation rdfs:label "Citation")
   (gnt:hasCitation a owl:ObjectProperty)
   (gnt:hasCitation skos:definition "Citation for this dataset")
   (gnt:hasContributors rdfs:domain dcat:Dataset)
   (gnt:hasContributors rdfs:label "Contributors")
   (gnt:hasContributors a owl:ObjectProperty)
   (gnt:hasContributors skos:definition "Contributors of this resource")
   ;; These four describe the predicate emitted by the
   ;; `gnt:hasExperimentDesign' clause below; they used to be
   ;; declared under the misspelt name "gnt:hashasExperimentDesign".
   (gnt:hasExperimentDesign rdfs:domain dcat:Dataset)
   (gnt:hasExperimentDesign rdfs:label "Experiment Design")
   (gnt:hasExperimentDesign a owl:ObjectProperty)
   (gnt:hasExperimentDesign skos:definition "Experiment Design for this resource")
   (gnt:hasTissueInfo rdfs:domain dcat:Dataset)
   (gnt:hasTissueInfo rdfs:label "Tissue Information")
   (gnt:hasTissueInfo a owl:ObjectProperty)
   (gnt:hasTissueInfo skos:definition "Tissue information about dataset")
   (gnt:hasExperimentType skos:definition "Information about the experiment type")
   (gnt:hasAcknowledgement rdfs:domain dcat:Dataset)
   (gnt:hasAcknowledgement rdfs:label "Acknowledgement")
   (gnt:hasAcknowledgement a owl:ObjectProperty)
   (gnt:hasAcknowledgement skos:definition "People to acknowledge"))
  (triples (string->identifier
            ""
            (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                      (field InfoFiles InfoPageName)
                                      'pre "_" 'post))
    (set rdf:type 'dcat:Dataset)
    ;; The dataset type is derived from whichever *Freeze table the
    ;; info page name matched; it is "" when none matched.
    (set xkos:classifiedUnder
         (let ([dataset-type
                (string-trim-both
                 (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:Genotype', IF(PublishFreeze.Id IS NOT NULL, 'gnc:Phenotype', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:Probeset', '')))"
                         DatasetType)))])
           (if (not (string-null? dataset-type))
               (string->symbol dataset-type)
               "")))
    ;; A label that is literally "None"/"none" is replaced by "".
    (set rdfs:label
         (regexp-substitute/global #f "^[Nn]one$"
                                   (field InfoFiles InfoPageName)
                                   ""))
    (set skos:prefLabel
         (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
                 DatasetFullName)))
    (set skos:altLabel (field Datasets DatasetName DatasetGroup))
    ;; Prefer the publication title unless it is blank or the
    ;; placeholder "Unpublished"; otherwise fall back to the info file
    ;; title.  A resulting "None"/"none" is replaced by "".
    (set dct:title
         (regexp-substitute/global
          #f "^[Nn]one$"
          (first-non-blank-string
           (regexp-substitute/global #f "^Unpublished$"
                                     (field Datasets PublicationTitle)
                                     "")
           (field InfoFiles InfoFileTitle))
          ""))
    (set dct:created
         (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
                 createTimeGenoFreeze)))
    (set dcat:contactPoint
         (investigator-attributes->id (field Investigators FirstName)
                                      (field Investigators LastName)
                                      (field Investigators Email)))
    (set foaf:Organization (field Organizations OrganizationName))
    (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId)))
    (set dct:accessRights
         (string-downcase (field DatasetStatus DatasetStatusName)))
    ;; Pass the same keyword arguments as the publishfreeze,
    ;; genofreeze and probesetfreeze transformers, so that all four
    ;; build the group identifier in the same way.
    (set gnt:belongsToGroup
         (string->identifier
          "set"
          (field ("IFNULL(InbredSet.Name, IFNULL(PublishInbredSet.Name, GenoInbredSet.Name))"
                  InbredSetName))
          #:separator ""
          #:proc string-capitalize-first))
    (set gnt:hasTissue (string->identifier "tissue" (field Tissue Short_Name)))
    (set gnt:usesNormalization
         (string->identifier "avgMethod"
                             ;; If AvgMethodName is NULL, assume N/A.
                             (if (string-blank? (field AvgMethod Name AvgMethodName))
                                 "N/A"
                                 (field AvgMethod Name AvgMethodName))))
    ;; Each clause below links to the dataset's document in gn-docs,
    ;; and yields "" when the backing database column is null or blank.
    (set gnt:hasSummary
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "summary"
                           (field InfoFiles Summary)))
    (set gnt:hasTissueInfo
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "tissue"
                           (field Datasets AboutTissue)))
    (set gnt:hasCitation
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "citation"
                           (field Datasets Citation)))
    (set gnt:hasSpecifics
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "specifics"
                           (field InfoFiles Specifics)))
    (set gnt:hasCaseInfo
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "cases"
                           (field Datasets AboutCases)))
    (set gnt:hasPlatformInfo
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "platform"
                           (field Datasets AboutPlatform)))
    (set gnt:hasDataProcessingInfo
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "processing"
                           (field Datasets AboutDataProcessing)))
    (set gnt:hasNotes
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "notes"
                           (field Datasets Notes)))
    (set gnt:hasExperimentType
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "experiment-type"
                           (field InfoFiles Experiment_Type)))
    (set gnt:hasExperimentDesign
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "experiment-design"
                           (field Datasets ExperimentDesign)))
    (set gnt:hasContributors
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "contributors"
                           (field Datasets Contributors)))
    (set gnt:hasAcknowledgement
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "acknowledgment"
                           (field Datasets Acknowledgment)))
    (set gnt:usesPlatform
         (string->identifier "platform" (field GeneChip Name GeneChip)))
    ;; Only the leading "GSE<digits>" accession is kept from the
    ;; GeoSeries column.
    (set gnt:hasGeoSeriesId
         (let ((s (string-match "GSE[0-9]*"
                                (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries)))))
           (if s
               (ontology 'geoSeries: (match:substring s))
               "")))))

;; These are phenotype datasets that don't have Infofile metadata
(define-transformer publishfreeze
  (tables (PublishFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
          "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")
  (triples (string->identifier
            ""
            (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                      (field PublishFreeze Name)
                                      'pre "_" 'post))
    (set rdf:type 'dcat:Dataset)
    (set xkos:classifiedUnder 'gnc:Phenotype)
    (set dct:title (field PublishFreeze FullName))
    (set rdfs:label (field PublishFreeze Name))
    (set skos:altLabel (field PublishFreeze ShortName))
    (set dct:created
         (annotate-field (field PublishFreeze CreateTime) '^^xsd:date))
    (set gnt:belongsToGroup
         (string->identifier "set"
                             (field InbredSet Name InbredSetName)
                             #:separator ""
                             #:proc string-capitalize-first))))

;; These are genotype datasets that don't have Infofile metadata
(define-transformer genofreeze
  (tables (GenoFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
          "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
  ;; The substitution is idempotent, so applying it once is enough.
  (triples (string->identifier
            ""
            (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                      (field GenoFreeze Name)
                                      'pre "_" 'post))
    (set rdf:type 'dcat:Dataset)
    (set xkos:classifiedUnder 'gnc:Genotype)
    (set rdfs:label (field GenoFreeze Name))
    (set dct:title (field GenoFreeze FullName))
    (set skos:altLabel (field GenoFreeze ShortName))
    (set dct:created
         (annotate-field (field GenoFreeze CreateTime) '^^xsd:date))
    (set gnt:belongsToGroup
         (string->identifier "set"
                             (field InbredSet Name InbredSetName)
                             #:separator ""
                             #:proc string-capitalize-first))))

;; Molecular Traits are also referred to as ProbeSets
(define-transformer probesetfreeze
  (tables (ProbeSetFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join ProbeFreeze "USING (ProbeFreezeId)")
           (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID")
           (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id")
           (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId"))
          "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id")
  (schema-triples
   (gnt:usesNormalization rdfs:domain gnc:probeset)
   (gnt:usesDataScale rdfs:domain gnc:probeset)
   (gnt:usesDataScale a owl:ObjectProperty)
   (gnt:usesDataScale skos:definition "Thi data scale this resource uses"))
  (triples (string->identifier
            ""
            (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                      (field ProbeSetFreeze Name)
                                      'pre "_" 'post))
    (set rdf:type 'dcat:Dataset)
    (set xkos:classifiedUnder 'gnc:Probeset)
    (set gnt:usesNormalization
         (string->identifier "avgMethod"
                             ;; If AvgMethodName is NULL, assume N/A.
                             (if (string-blank? (field AvgMethod Name AvgMethodName))
                                 "N/A"
                                 (field AvgMethod Name AvgMethodName))))
    (set dct:title (field ProbeSetFreeze FullName))
    (set rdfs:label (field ProbeSetFreeze ShortName))
    (set skos:prefLabel (field ProbeSetFreeze Name))
    (set skos:altLabel (field ProbeSetFreeze Name2))
    ;; NOTE(review): the XSD datatype is spelt "dateTime"; kept as-is
    ;; because consumers may already rely on this spelling -- confirm.
    (set dct:created
         (annotate-field (field ProbeSetFreeze CreateTime) '^^xsd:datetime))
    (set gnt:usesDataScale (field ProbeSetFreeze DataScale))
    (set gnt:hasTissue (string->identifier "tissue" (field Tissue Short_Name)))
    (set gnt:belongsToGroup
         (string->identifier "set"
                             (field InbredSet Name InbredSetName)
                             #:separator ""
                             #:proc string-capitalize-first))))


;;;
;;; Entry point
;;;

(let* ((option-spec
        ;; The settings file is read unconditionally below, so let
        ;; getopt-long report a missing --settings option instead of
        ;; failing later inside `call-with-input-file'.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       ;; The settings file holds a single datum that `read' can parse.
       (%connection-settings
        (call-with-input-file settings read)))
  (with-documentation
   (name "Info files / Investigators Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   (prefixes
    '(("v:" "<http://www.w3.org/2006/vcard/ns#>")
      ("foaf:" "<http://xmlns.com/foaf/0.1/>")
      ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
      ("dcat:" "<http://www.w3.org/ns/dcat#>")
      ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
      ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>")
      ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>")
      ("gnt:" "<http://genenetwork.org/term/>")
      ("gn:" "<http://genenetwork.org/id/>")
      ("gnc:" "<http://genenetwork.org/category/>")
      ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
      ("owl:" "<http://www.w3.org/2002/07/owl#>")
      ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
      ("taxon:" "<http://purl.uniprot.org/taxonomy/>")
      ("dct:" "<http://purl.org/dc/terms/>")))
   (inputs
    (list info-files
          publishfreeze
          genofreeze
          probesetfreeze
          investigators
          gene-chip))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))