#! /usr/bin/env guile
!#

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms))

;; One email ID in the Investigators table has spaces in it. This
;; function fixes that.
(define (fix-email-id email)
  "Return EMAIL with every space character removed."
  (string-delete #\space email))

(define (investigator-attributes->id first-name last-name email)
  "Return the identifier for an investigator.  FIRST-NAME, LAST-NAME and
the space-stripped EMAIL are joined with underscores and handed to
string->identifier with the prefix investigator and separator _."
  ;; There is just one record corresponding to "Evan Williams" which
  ;; does not have an email ID. To accommodate that record, we
  ;; construct the investigator ID from not just the email ID, but
  ;; also the first and the last names. It would be preferable to just
  ;; find Evan Williams' email ID and insert it into the database.
  (string->identifier "investigator"
                      (string-join
                       (list first-name last-name (fix-email-id email))
                       "_")
                      #:separator "_"))

;; Investigators as foaf:Person resources with name, homepage and
;; address (v:*) predicates.
(define-transformer investigators
  ;; There are a few duplicate entries. We group by email to
  ;; deduplicate.
  (tables (Investigators)
          "GROUP BY Email")
  ;; The email argument is the empty string both here and in the
  ;; dcat:contactPoint of info-files, so the two call sites build the
  ;; identifier from the same inputs (first and last name only).
  (triples (investigator-attributes->id (field Investigators FirstName)
                                        (field Investigators LastName)
                                        "")
    (set rdf:type 'foaf:Person)
    (set foaf:name (string-append (field Investigators FirstName) " "
                                  (field Investigators LastName)))
    (set foaf:givenName (field Investigators FirstName))
    (set foaf:familyName (field Investigators LastName))
    (set foaf:homepage (field Investigators Url))
    (set v:adr (field Investigators Address))
    (set v:locality (field Investigators City))
    (set v:region (field Investigators State))
    (set v:postal-code (field Investigators ZipCode))
    (set v:country-name (field Investigators Country))))

;; GeneChip rows (left-joined with Species) as gnc:gene_chip resources.
(define-transformer gene-chip
  (tables (GeneChip
           (left-join Species "USING (SpeciesId)")))
  (schema-triples
   (gnc:gene_chip a skos:Concept)
   (gnc:gene_chip skos:description "This is a set of controlled terms that are used to describe a given gene chip/platform")
   (gnt:has_geo_series_id rdfs:domain gnc:platform)
   (gnt:has_geo_series_id rdfs:domain gnc:gene_chip)
   (gnt:has_go_tree_value a owl:ObjectProperty)
   (gnt:has_go_tree_value skos:definition "This resource has the following GO tree value")
   (gnt:has_go_tree_value rdfs:domain gnc:gene_chip))
  (triples (string->identifier "platform" (field GeneChip Name) #:separator "_")
    (set rdf:type 'gnc:gene_chip)
    (set rdfs:label (field GeneChip GeneChipName))
    (set skos:prefLabel (field GeneChip Name))
    ;; The SQL expression yields Title only when it differs from
    ;; GeneChipName, and NULL otherwise.
    (set skos:altLabel
         (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)"
                 Title)))
    (set gnt:has_go_tree_value (field GeneChip Go_tree_value))
    (set xkos:classifiedUnder
         (string->identifier ""
                             (remap-species-identifiers (field Species Fullname))
                             #:separator ""))
    (set gnt:has_geo_series_id
         (ontology 'geoSeries:
                   (string-trim-both (field GeneChip GeoPlatform))))))

;; Dataset metadata taken from InfoFiles and the tables joined below.
;; Only rows whose GN_AccesionId is not NULL are selected.
(define-transformer info-files
  (tables (InfoFiles
           (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId")
           (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId")
           (left-join Datasets "USING (DatasetId)")
           (left-join DatasetStatus "USING (DatasetStatusId)")
           (left-join Tissue "USING (TissueId)")
           (left-join Investigators "USING (InvestigatorId)")
           (left-join AvgMethod "USING (AvgMethodId)")
           (left-join Organizations "USING (OrganizationId)")
           (left-join GeneChip "USING (GeneChipId)"))
          ;; XXXX: There are datasets that don't have the InbredSetId
          ;; in the Infofiles table. This clause allows us to check
          ;; if they exist in the (Publish/Geno)Freeze tables.
          "LEFT JOIN InbredSet PublishInbredSet ON PublishFreeze.InbredSetId = PublishInbredSet.InbredSetId LEFT JOIN InbredSet GenoInbredSet ON GenoFreeze.InbredSetId = GenoInbredSet.InbredSetId WHERE GN_AccesionId IS NOT NULL")
  ;; NOTE(review): gnt:has_tissue_info is declared twice below with two
  ;; different skos:definition texts, and the skos:definition of
  ;; gnt:has_experiment_type is listed twice -- confirm which are wanted.
  (schema-triples
   (gnt:has_tissue rdfs:domain dcat:Dataset)
   (gnt:has_tissue a owl:ObjectProperty)
   (gnt:has_tissue skos:definition "Tissues this resource has")
   (gnt:uses_normalization rdfs:domain dcat:Dataset)
   (gnt:uses_normalization a owl:ObjectProperty)
   (gnt:uses_normalization skos:definition "Normalization techniques this resource has")
   (gnt:uses_platform rdfs:domain dcat:Dataset)
   (gnt:uses_platform a owl:ObjectProperty)
   (gnt:uses_platform skos:definition "The Platform this resource uses")
   (gnt:has_geo_series_id rdfs:domain dcat:Dataset)
   (gnt:has_geo_series_id a owl:ObjectProperty)
   (gnt:has_geo_series_id skos:definition "id of record in NCBI database")
   (gnt:has_experiment_type rdfs:domain dcat:Dataset)
   (gnt:has_experiment_type a owl:ObjectProperty)
   (gnt:has_experiment_type rdfs:label "Experiment Type Metadata")
   (gnt:has_experiment_type skos:definition "Information about the experiment type")
   (gnt:has_tissue_info rdfs:domain dcat:Dataset)
   (gnt:has_tissue_info a owl:ObjectProperty)
   (gnt:has_tissue_info skos:definition "Metadata about Tissue for this resource")
   (gnt:has_experiment_design_info rdfs:domain dcat:Dataset)
   (gnt:has_experiment_design_info rdfs:label "Experiment Design")
   (gnt:has_experiment_design_info a owl:ObjectProperty)
   (gnt:has_experiment_design_info skos:definition "Information about how the experiment was designed")
   (gnt:has_notes rdfs:domain dcat:Dataset)
   (gnt:has_notes a owl:ObjectProperty)
   (gnt:has_notes rdfs:label "Notes")
   (gnt:has_notes skos:definition "Extra Notes about this dataset")
   (gnt:has_data_processing_info rdfs:domain dcat:Dataset)
   (gnt:has_data_processing_info rdfs:label "About Data Processing")
   (gnt:has_data_processing_info a owl:ObjectProperty)
   (gnt:has_data_processing_info skos:definition "Information about how this dataset was processed")
   (gnt:has_platform_info rdfs:domain dcat:Dataset)
   (gnt:has_platform_info a owl:ObjectProperty)
   (gnt:has_platform_info rdfs:label "About Platform")
   (gnt:has_platform_info skos:definition "Information about the platform that was used with this dataset")
   (gnt:has_case_info rdfs:domain dcat:Dataset)
   (gnt:has_case_info rdfs:label "About Case")
   (gnt:has_case_info a owl:ObjectProperty)
   (gnt:has_case_info skos:definition "Information about the cases used in this platform")
   (gnt:has_summary rdfs:domain dcat:Dataset)
   (gnt:has_summary rdfs:label "Summary")
   (gnt:has_summary a owl:ObjectProperty)
   (gnt:has_summary skos:definition "Summary information about dataset")
   (gnt:has_citation rdfs:domain dcat:Dataset)
   (gnt:has_citation rdfs:label "Citation")
   (gnt:has_citation a owl:ObjectProperty)
   (gnt:has_citation skos:definition "Citation for this dataset")
   (gnt:has_contributors rdfs:domain dcat:Dataset)
   (gnt:has_contributors rdfs:label "Contributors")
   (gnt:has_contributors a owl:ObjectProperty)
   (gnt:has_contributors skos:definition "Contributors of this resource")
   (gnt:has_experiment_design rdfs:domain dcat:Dataset)
   (gnt:has_experiment_design rdfs:label "Experiment Design")
   (gnt:has_experiment_design a owl:ObjectProperty)
   (gnt:has_experiment_design skos:definition "Experiment Design for this resource")
   (gnt:has_tissue_info rdfs:domain dcat:Dataset)
   (gnt:has_tissue_info rdfs:label "Tissue Information")
   (gnt:has_tissue_info a owl:ObjectProperty)
   (gnt:has_tissue_info skos:definition "Tissue information about dataset")
   (gnt:has_experiment_type skos:definition "Information about the experiment type")
   (gnt:has_acknowledgement rdfs:domain dcat:Dataset)
   (gnt:has_acknowledgement rdfs:label "Acknowledgement")
   (gnt:has_acknowledgement a owl:ObjectProperty)
   (gnt:has_acknowledgement skos:definition "People to acknowledge"))
  ;; The subject is built from InfoPageName, except that a page name
  ;; equal to "none" (after trimming, case-insensitively) falls back to
  ;; the Title column.
  (triples (string->identifier
            ""
            (let ((info-page-name (field InfoFiles InfoPageName))
                  (info-title (field InfoFiles Title)))
              (format #f "~a"
                      (if (and (string? info-page-name)
                               (string=? (string-downcase
                                          (string-trim-both info-page-name))
                                         "none"))
                          info-title
                          info-page-name))))
    (set rdf:type 'dcat:Dataset)
    ;; The SQL expression picks the class name from whichever freeze
    ;; table matched; an empty result yields "" instead of a symbol.
    (set xkos:classifiedUnder
         (let ([dataset-type
                (string-trim-both
                 (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotype', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotype', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:probeset', '')))"
                         DatasetType)))])
           (if (not (string-null? dataset-type))
               (string->symbol dataset-type)
               "")))
    (set rdfs:label (normalize-string-field (field InfoFiles InfoPageName)))
    (set skos:prefLabel
         (normalize-string-field
          (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
                  DatasetFullName))))
    (set skos:altLabel (field Datasets DatasetName DatasetGroup))
    (set dct:title (normalize-string-field (field Datasets PublicationTitle)))
    (set dct:created
         (normalize-string-field
          (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
                  createTimeGenoFreeze))))
    ;; Same arguments as the subject of the investigators transformer.
    (set dcat:contactPoint
         (investigator-attributes->id (field Investigators FirstName)
                                      (field Investigators LastName)
                                      ""))
    (set foaf:Organization (field Organizations OrganizationName))
    (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId)))
    (set dct:accessRights
         (string-downcase (field DatasetStatus DatasetStatusName)))
    (set gnt:belongs_to_group
         (string->identifier
          "set"
          (field ("IFNULL(InbredSet.Name, IFNULL(PublishInbredSet.Name, GenoInbredSet.Name))"
                  InbredSetName))
          #:separator "_"))
    (set gnt:has_tissue
         (string->identifier "tissue" (field Tissue Short_Name) #:separator "_"))
    (set gnt:uses_normalization
         (let ((avg-method (normalize-string-field
                            (field AvgMethod Name AvgMethodName))))
           (if (not (string-blank? avg-method))
               (string->identifier "avg_method" avg-method #:separator "_")
               "")))
    ;; Each predicate below follows one pattern: when the associated text
    ;; column is null? or blank the object is "", otherwise it is a
    ;; symbol made from a formatted link.  The argument handed to format
    ;; is InfoPageName with every character outside [A-Za-z0-9:] replaced
    ;; by "_" and then passed through string-capitalize-first.
    ;; NOTE(review): every format template below is the empty string, so
    ;; it has no ~a directive to receive that argument -- confirm the
    ;; intended link templates.
    (set gnt:has_summary
         (let* ((summary-link
                 (format #f ""
                         (string-capitalize-first
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post))))
                (summary (field InfoFiles Summary)))
           (if (or (null? summary) (string-blank? summary))
               ""
               (string->symbol summary-link))))
    (set gnt:has_tissue_info
         (let* ((tissue-info-link
                 (format #f ""
                         (string-capitalize-first
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post))))
                (tissue-info (field Datasets AboutTissue)))
           (if (or (null? tissue-info) (string-blank? tissue-info))
               ""
               (string->symbol tissue-info-link))))
    (set gnt:has_citation
         (let* ((citation-link
                 (format #f ""
                         (string-capitalize-first
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post))))
                (citation (field Datasets Citation)))
           (if (or (null? citation) (string-blank? citation))
               ""
               (string->symbol citation-link))))
    (set gnt:hasSpecifics
         (let* ((specifics-link
                 (format #f ""
                         (string-capitalize-first
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post))))
                (specifics (field InfoFiles Specifics)))
           (if (or (null? specifics) (string-blank? specifics))
               ""
               (string->symbol specifics-link))))
    (set gnt:has_case_info
         (let* ((cases-link
                 (format #f ""
                         (string-capitalize-first
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post))))
                (cases (field Datasets AboutCases)))
           (if (or (null? cases) (string-blank? cases))
               ""
               (string->symbol cases-link))))
    (set gnt:has_platform_info
         (let* ((platform-link
                 (format #f ""
                         (string-capitalize-first
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post))))
                (platform (field Datasets AboutPlatform)))
           (if (or (null? platform) (string-blank? platform))
               ""
               (string->symbol platform-link))))
    (set gnt:has_data_processing_info
         (let* ((processing-link
                 (format #f ""
                         (string-capitalize-first
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post))))
                (processing (field Datasets AboutDataProcessing)))
           (if (or (null? processing) (string-blank? processing))
               ""
               (string->symbol processing-link))))
    (set gnt:has_notes
         (let* ((notes-link
                 (format #f ""
                         (string-capitalize-first
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post))))
                (notes (field Datasets Notes)))
           (if (or (null? notes) (string-blank? notes))
               ""
               (string->symbol notes-link))))
    (set gnt:has_experiment_type
         (let* ((experiment-type-link
                 (format #f ""
                         (string-capitalize-first
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post))))
                (experiment-type (field InfoFiles Experiment_Type)))
           (if (or (null? experiment-type) (string-blank? experiment-type))
               ""
               (string->symbol experiment-type-link))))
    (set gnt:has_experiment_design
         (let* ((experiment-design-link
                 (format #f ""
                         (string-capitalize-first
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post))))
                (experiment-design (field Datasets ExperimentDesign)))
           (if (or (null? experiment-design) (string-blank? experiment-design))
               ""
               (string->symbol experiment-design-link))))
    (set gnt:has_contributors
         (let* ((contributors-link
                 (format #f ""
                         (string-capitalize-first
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post))))
                (contributors (field Datasets Contributors)))
           (if (or (null? contributors) (string-blank? contributors))
               ""
               (string->symbol contributors-link))))
    (set gnt:has_acknowledgement
         (let* ((acknowledgment-link
                 (format #f ""
                         (string-capitalize-first
                          (regexp-substitute/global
                           #f "[^A-Za-z0-9:]"
                           (field InfoFiles InfoPageName)
                           'pre "_" 'post))))
                (acknowledgment (field Datasets Acknowledgment)))
           (if (or (null? acknowledgment) (string-blank? acknowledgment))
               ""
               (string->symbol acknowledgment-link))))
    (set gnt:uses_platform
         (string->identifier "platform"
                             (field GeneChip Name GeneChip)
                             #:separator "_"))
    ;; Only the first "GSE<digits>" run found in GeoSeries is used; when
    ;; nothing matches, the object is "".
    (set gnt:has_geo_series_id
         (let ((s (string-match
                   "GSE[0-9]*"
                   (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries)))))
           (if s
               (ontology 'geoSeries: (match:substring s))
               "")))))

;; These are phenotype datasets that don't have Infofile metadata
(define-transformer publishfreeze
  (tables (PublishFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
          "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")
  (triples (string->identifier "" (field PublishFreeze Name))
    (set rdf:type 'dcat:Dataset)
    (set xkos:classifiedUnder 'gnc:phenotype)
    (set dct:title (field PublishFreeze FullName))
    (set rdfs:label (field PublishFreeze Name))
    (set skos:altLabel (field PublishFreeze ShortName))
    (set dct:created
         (annotate-field (field PublishFreeze CreateTime) '^^xsd:date))
    (set gnt:belongs_to_group
         (string->identifier "set"
                             (field InbredSet Name InbredSetName)
                             #:separator "_"))))

;; Genotype datasets for which the join found no InfoFiles row.
(define-transformer genofreeze
  (tables (GenoFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
          "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
  (triples (string->identifier "" (field GenoFreeze Name))
    (set rdf:type 'dcat:Dataset)
    (set xkos:classifiedUnder 'gnc:genotype)
    (set rdfs:label (field GenoFreeze Name))
    (set dct:title (field GenoFreeze FullName))
    (set skos:altLabel (field GenoFreeze ShortName))
    (set dct:created
         (annotate-field (field GenoFreeze CreateTime) '^^xsd:date))
    ;; NOTE(review): unlike publishfreeze and probesetfreeze, this call
    ;; passes an identity #:proc -- confirm the resulting group
    ;; identifiers are meant to differ from theirs.
    (set gnt:belongs_to_group
         (string->identifier "set"
                             (field InbredSet Name InbredSetName)
                             #:separator "_"
                             #:proc (lambda (x) x)))))

;; Molecular Traits are also referred to as ProbeSets
(define-transformer probesetfreeze
  (tables (ProbeSetFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join ProbeFreeze "USING (ProbeFreezeId)")
           (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID")
           (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id")
           (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId"))
          "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id")
  (schema-triples
   (gnt:uses_normalization rdfs:domain gnc:probeset)
   (gnt:uses_data_scale rdfs:domain gnc:probeset)
   (gnt:uses_data_scale a owl:ObjectProperty)
   (gnt:uses_data_scale skos:definition "The data scale this resource uses"))
  (triples (string->identifier "" (field ProbeSetFreeze Name))
    (set rdf:type 'dcat:Dataset)
    (set xkos:classifiedUnder 'gnc:probeset)
    ;; NOTE(review): info-files emits an avg_method identifier (or "")
    ;; for this predicate, whereas this emits the raw name (or #f) --
    ;; confirm the difference is intended.
    (set gnt:uses_normalization
         (let ((avg-method (field AvgMethod Name AvgMethodName)))
           (if (string-blank? avg-method) #f avg-method)))
    (set dct:title (field ProbeSetFreeze FullName))
    (set rdfs:label (field ProbeSetFreeze ShortName))
    (set skos:prefLabel (field ProbeSetFreeze Name))
    (set skos:altLabel (field ProbeSetFreeze Name2))
    ;; NOTE(review): the sibling transformers annotate CreateTime with
    ;; ^^xsd:date, and the XSD datatype is spelled xsd:dateTime -- confirm
    ;; which annotation this column should carry.
    (set dct:created
         (annotate-field (field ProbeSetFreeze CreateTime) '^^xsd:datetime))
    (set gnt:uses_data_scale (field ProbeSetFreeze DataScale))
    (set gnt:has_tissue
         (string->identifier "tissue" (field Tissue Short_Name) #:separator "_"))
    (set gnt:belongs_to_group
         (string->identifier "set"
                             (field InbredSet Name InbredSetName)
                             #:separator "_"))))

;; Entry point.  Options:
;;   -s, --settings FILE        file whose first datum is read as the
;;                              connection settings (required)
;;   -o, --output VALUE         passed on as the #:rdf output
;;   -d, --documentation VALUE  passed on as the #:documentation output
(let* ((option-spec
        ;; --settings is required: its value is handed straight to
        ;; call-with-input-file, which cannot accept the #f default.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (call-with-input-file settings read)))
  (with-documentation
   (name "Info files / Investigators Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every prefix below maps to an empty string --
   ;; confirm the intended namespace IRIs.
   (prefixes
    '(("v:" "")
      ("foaf:" "")
      ("xsd:" "")
      ("dcat:" "")
      ("skos:" "")
      ("xkos:" "")
      ("geoSeries:" "")
      ("gnt:" "")
      ("gn:" "")
      ("gnc:" "")
      ("rdf:" "")
      ("owl:" "")
      ("rdfs:" "")
      ("taxon:" "")
      ("dct:" "")))
   (inputs
    (list info-files
          publishfreeze
          genofreeze
          probesetfreeze
          investigators
          gene-chip))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))