diff options
Diffstat (limited to 'examples/dataset-metadata.scm')
| -rwxr-xr-x | examples/dataset-metadata.scm | 437 |
1 files changed, 0 insertions, 437 deletions
;; diff --git a/examples/dataset-metadata.scm b/examples/dataset-metadata.scm deleted file mode 100755 index 32dba7a..0000000 --- a/examples/dataset-metadata.scm +++ /dev/null @@ -1,437 +0,0 @@
#! /usr/bin/env guile
!#

;;; Transform dataset metadata into RDF triples.
;;;
;;; Four transformers are defined: `info-files' (rows of InfoFiles that
;;; have a GN accession id) and `publishfreeze', `genofreeze',
;;; `probesetfreeze' (rows of the respective *Freeze tables that have no
;;; matching InfoFiles row).  The `let*' form at the bottom parses the
;;; command line and hands the transformers to `with-documentation'.

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms))


(define (dataset-doc-link info-page-name file-stem content)
  "Return, as a symbol, the angle-bracketed gn-docs URL of
FILE-STEM.rtf under the directory derived from INFO-PAGE-NAME, or the
empty string when CONTENT is the empty list or a blank string.

The directory name is INFO-PAGE-NAME with every character outside
[A-Za-z0-9:] replaced by an underscore, then passed through
`string-capitalize-first'."
  ;; NOTE(review): callers rely on "" being treated as "no value" by
  ;; `set'; that handling lives in (transform special-forms) -- confirm.
  (if (or (null? content) (string-blank? content))
      ""
      (string->symbol
       (format
        #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/~a.rtf>"
        (string-capitalize-first
         (regexp-substitute/global
          #f "[^A-Za-z0-9:]"
          info-page-name
          'pre "_" 'post))
        file-stem))))


;; Datasets described in the InfoFiles table.
(define-transformer info-files
  (tables (InfoFiles
           (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId")
           (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId")
           (left-join Datasets "USING (DatasetId)")
           (left-join DatasetStatus "USING (DatasetStatusId)")
           (left-join Tissue "USING (TissueId)")
           (left-join Investigators "USING (InvestigatorId)")
           (left-join AvgMethod "USING (AvgMethodId)")
           (left-join Organizations "USING (OrganizationId)")
           (left-join GeneChip "USING (GeneChipId)"))
          ;; XXXX: There are datasets that don't have the InbredSetId
          ;; in the Infofiles table.  This clause allows us to check
          ;; if they exist in the (Publish/Geno)Freeze tables.
          "LEFT JOIN InbredSet PublishInbredSet ON PublishFreeze.InbredSetId = PublishInbredSet.InbredSetId LEFT JOIN InbredSet GenoInbredSet ON GenoFreeze.InbredSetId = GenoInbredSet.InbredSetId WHERE GN_AccesionId IS NOT NULL")
  (schema-triples
   (gnt:has_tissue rdfs:domain dcat:Dataset)
   (gnt:has_tissue a owl:ObjectProperty)
   (gnt:has_tissue skos:definition "Tissues this resource has")
   (gnt:uses_normalization rdfs:domain dcat:Dataset)
   (gnt:uses_normalization a owl:ObjectProperty)
   (gnt:uses_normalization skos:definition "Normalization techniques this resource has")
   (gnt:uses_platform rdfs:domain dcat:Dataset)
   (gnt:uses_platform a owl:ObjectProperty)
   (gnt:uses_platform skos:definition "The Platform this resource uses")
   (gnt:has_geo_series_id rdfs:domain dcat:Dataset)
   (gnt:has_geo_series_id a owl:ObjectProperty)
   (gnt:has_geo_series_id skos:definition "id of record in NCBI database")
   (gnt:has_experiment_type rdfs:domain dcat:Dataset)
   (gnt:has_experiment_type a owl:ObjectProperty)
   (gnt:has_experiment_type rdfs:label "Experiment Type Metadata")
   (gnt:has_experiment_type skos:definition "Information about the experiment type")
   (gnt:has_tissue_info rdfs:domain dcat:Dataset)
   (gnt:has_tissue_info a owl:ObjectProperty)
   (gnt:has_tissue_info skos:definition "Metadata about Tissue for this resource")
   (gnt:has_experiment_design_info rdfs:domain dcat:Dataset)
   (gnt:has_experiment_design_info rdfs:label "Experiment Design")
   (gnt:has_experiment_design_info a owl:ObjectProperty)
   (gnt:has_experiment_design_info skos:definition "Information about how the experiment was designed")
   (gnt:has_notes rdfs:domain dcat:Dataset)
   (gnt:has_notes a owl:ObjectProperty)
   (gnt:has_notes rdfs:label "Notes")
   (gnt:has_notes skos:definition "Extra Notes about this dataset")
   (gnt:has_data_processing_info rdfs:domain dcat:Dataset)
   (gnt:has_data_processing_info rdfs:label "About Data Processing")
   (gnt:has_data_processing_info a owl:ObjectProperty)
   (gnt:has_data_processing_info skos:definition "Information about how this dataset was processed")
   (gnt:has_platform_info rdfs:domain dcat:Dataset)
   (gnt:has_platform_info a owl:ObjectProperty)
   (gnt:has_platform_info rdfs:label "About Platform")
   (gnt:has_platform_info skos:definition "Information about the platform that was used with this dataset")
   (gnt:has_case_info rdfs:domain dcat:Dataset)
   (gnt:has_case_info rdfs:label "About Case")
   (gnt:has_case_info a owl:ObjectProperty)
   (gnt:has_case_info skos:definition "Information about the cases used in this platform")
   (gnt:has_summary rdfs:domain dcat:Dataset)
   (gnt:has_summary rdfs:label "Summary")
   (gnt:has_summary a owl:ObjectProperty)
   (gnt:has_summary skos:definition "Summary information about dataset")
   (gnt:has_citation rdfs:domain dcat:Dataset)
   (gnt:has_citation rdfs:label "Citation")
   (gnt:has_citation a owl:ObjectProperty)
   (gnt:has_citation skos:definition "Citation for this dataset")
   (gnt:has_contributors rdfs:domain dcat:Dataset)
   (gnt:has_contributors rdfs:label "Contributors")
   (gnt:has_contributors a owl:ObjectProperty)
   (gnt:has_contributors skos:definition "Contributors of this resource")
   (gnt:has_experiment_design rdfs:domain dcat:Dataset)
   (gnt:has_experiment_design rdfs:label "Experiment Design")
   (gnt:has_experiment_design a owl:ObjectProperty)
   (gnt:has_experiment_design skos:definition "Experiment Design for this resource")
   ;; The rdfs:domain and owl:ObjectProperty triples for
   ;; gnt:has_tissue_info, and the skos:definition of
   ;; gnt:has_experiment_type, are already stated earlier in this list;
   ;; only the statements that add information are kept here.
   (gnt:has_tissue_info rdfs:label "Tissue Information")
   (gnt:has_tissue_info skos:definition "Tissue information about dataset")
   (gnt:has_acknowledgement rdfs:domain dcat:Dataset)
   (gnt:has_acknowledgement rdfs:label "Acknowledgement")
   (gnt:has_acknowledgement a owl:ObjectProperty)
   (gnt:has_acknowledgement skos:definition "People to acknowledge"))
  (triples
      ;; Subject: built from InfoPageName, falling back to Title when
      ;; the page name is the literal string "none" (case-insensitive,
      ;; surrounding whitespace ignored).
      (string->identifier
       "" (let ((info-page-name (field InfoFiles InfoPageName))
                (info-title (field InfoFiles Title)))
            (format #f "~a"
                    (if (and (string? info-page-name)
                             (string=? (string-downcase (string-trim-both info-page-name))
                                       "none"))
                        info-title info-page-name))))
    (set rdf:type 'dcat:Dataset)
    ;; The category is computed in SQL from whichever *Freeze table
    ;; matched; an empty string is passed through when none did.
    (set xkos:classifiedUnder
         (let ([dataset-type
                (string-trim-both
                 (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotype', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotype', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:probeset', '')))"
                         DatasetType)))])
           (if (not (string-null? dataset-type))
               (string->symbol
                dataset-type)
               "")))
    (set rdfs:label (normalize-string-field (field InfoFiles InfoPageName)))
    (set skos:prefLabel
         (normalize-string-field
          (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
                  DatasetFullName))))
    (set skos:altLabel (field Datasets DatasetName DatasetGroup))
    (set dct:title (normalize-string-field (field Datasets PublicationTitle)))
    (set dct:created
         (normalize-string-field
          (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
                  createTimeGenoFreeze))))
    (set dcat:contactPoint
         (investigator-attributes->id (field Investigators FirstName)
                                      (field Investigators LastName)
                                      ""))
    (set foaf:Organization
         (field Organizations OrganizationName))
    (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId)))
    (set dct:accessRights (string-downcase
                           (field DatasetStatus DatasetStatusName)))
    (set gnt:has_strain
         (string->identifier
          "set"
          (field ("IFNULL(InbredSet.Name, IFNULL(PublishInbredSet.Name, GenoInbredSet.Name))"
                  InbredSetName))
          #:separator "_"))
    (set gnt:has_tissue (string->identifier "tissue"
                                            (field Tissue Short_Name)
                                            #:separator "_"))
    (set gnt:uses_normalization
         (let ((avg-method (normalize-string-field (field AvgMethod Name AvgMethodName))))
           (if (not (string-blank? avg-method))
               (string->identifier "avg_method" avg-method #:separator "_")
               "")))
    ;; Each of the following points at an .rtf document in gn-docs, and
    ;; is only given a link when the corresponding database column has
    ;; non-blank content (see `dataset-doc-link').
    (set gnt:has_summary
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "summary"
                           (field InfoFiles Summary)))
    (set gnt:has_tissue_info
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "tissue"
                           (field Datasets AboutTissue)))
    (set gnt:has_citation
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "citation"
                           (field Datasets Citation)))
    (set gnt:hasSpecifics
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "specifics"
                           (field InfoFiles Specifics)))
    (set gnt:has_case_info
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "cases"
                           (field Datasets AboutCases)))
    (set gnt:has_platform_info
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "platform"
                           (field Datasets AboutPlatform)))
    (set gnt:has_data_processing_info
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "processing"
                           (field Datasets AboutDataProcessing)))
    (set gnt:has_notes
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "notes"
                           (field Datasets Notes)))
    (set gnt:has_experiment_type
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "experiment-type"
                           (field InfoFiles Experiment_Type)))
    (set gnt:has_experiment_design
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "experiment-design"
                           (field Datasets ExperimentDesign)))
    (set gnt:has_contributors
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "contributors"
                           (field Datasets Contributors)))
    (set gnt:has_acknowledgement
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "acknowledgment"
                           (field Datasets Acknowledgment)))
    (set gnt:uses_platform
         (string->identifier "platform"
                             (field GeneChip Name GeneChip)
                             #:separator "_"))
    ;; Only the first "GSE<digits>" run of Datasets.GeoSeries is used.
    (set gnt:has_geo_series_id
         (let ((s
                (string-match "GSE[0-9]*"
                              (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries)))))
           (if s (ontology
                  'geoSeries: (match:substring s))
               "")))))

;; These are phenotype datasets that don't have Infofile metadata
(define-transformer publishfreeze
  (tables (PublishFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
          "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")
  (triples
      (string->identifier "" (field PublishFreeze Name))
    (set rdf:type 'dcat:Dataset)
    (set xkos:classifiedUnder 'gnc:phenotype)
    (set dct:title (field PublishFreeze FullName))
    (set rdfs:label (field PublishFreeze Name))
    (set skos:altLabel (field PublishFreeze ShortName))
    (set dct:created (annotate-field
                      (field PublishFreeze CreateTime)
                      '^^xsd:date))
    (set gnt:has_strain
         (string->identifier
          "set" (field InbredSet Name InbredSetName)
          #:separator "_"))))

;; Genotype datasets that have no matching InfoFiles row.
(define-transformer genofreeze
  (tables (GenoFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
          "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
  (triples
      (string->identifier "" (field GenoFreeze Name))
    (set rdf:type 'dcat:Dataset)
    (set xkos:classifiedUnder 'gnc:genotype)
    (set rdfs:label (field GenoFreeze Name))
    (set dct:title (field GenoFreeze FullName))
    (set skos:altLabel (field GenoFreeze ShortName))
    (set dct:created (annotate-field
                      (field GenoFreeze CreateTime)
                      '^^xsd:date))
    (set gnt:has_strain
         (string->identifier
          "set" (field InbredSet Name InbredSetName)
          #:separator "_"
          ;; NOTE(review): identity #:proc is passed only here, not in
          ;; the sibling transformers -- confirm this is intentional.
          #:proc (lambda (x) x)))))

;; Probeset datasets that have no matching InfoFiles row.
(define-transformer probesetfreeze
  (tables (ProbeSetFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join ProbeFreeze "USING (ProbeFreezeId)")
           (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID")
           (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id")
           (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId"))
          "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id")
  (schema-triples
   (gnt:uses_normalization rdfs:domain gnc:probeset)
   (gnt:uses_data_scale rdfs:domain gnc:probeset)
   (gnt:uses_data_scale a owl:ObjectProperty)
   (gnt:uses_data_scale skos:definition "The data scale this resource uses"))
  (triples
      (string->identifier "" (field ProbeSetFreeze Name))
    (set rdf:type 'dcat:Dataset)
    (set xkos:classifiedUnder 'gnc:probeset)
    (set gnt:uses_normalization
         (let ((avg-method (field AvgMethod Name AvgMethodName)))
           (if (string-blank? avg-method)
               #f
               avg-method)))
    (set dct:title (field ProbeSetFreeze FullName))
    (set rdfs:label (field ProbeSetFreeze ShortName))
    (set skos:prefLabel (field ProbeSetFreeze Name))
    (set skos:altLabel (field ProbeSetFreeze Name2))
    ;; `xsd:datetime' is not an XML Schema datatype (the type is spelled
    ;; `xsd:dateTime'); use `xsd:date' like the publishfreeze and
    ;; genofreeze transformers do for their CreateTime columns.
    (set dct:created (annotate-field
                      (field ProbeSetFreeze CreateTime)
                      '^^xsd:date))
    (set gnt:uses_data_scale (field ProbeSetFreeze DataScale))
    (set gnt:has_tissue (string->identifier "tissue" (field Tissue Short_Name) #:separator "_"))
    (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_"))))



;; Entry point.  Options:
;;   -s/--settings FILE        file whose first datum is `read' and used
;;                             as the connection settings
;;   -o/--output FILE          passed on as the #:rdf output
;;   -d/--documentation FILE   passed on as the #:documentation output
(let* ((option-spec
        '((settings (single-char #\s) (value #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (begin
          ;; Fail with a clear message instead of handing #f to
          ;; `call-with-input-file'.
          (unless settings
            (error "Missing required option: --settings FILE"))
          (call-with-input-file settings
            read))))
  (with-documentation
   (name "Info files / Investigators Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   (prefixes
    '(("v:" "<http://www.w3.org/2006/vcard/ns#>")
      ("foaf:" "<http://xmlns.com/foaf/0.1/#term_>")
      ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
      ("dcat:" "<http://www.w3.org/ns/dcat#>")
      ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
      ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>")
      ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>")
      ("gnt:" "<http://rdf.genenetwork.org/v1/term/>")
      ("gn:" "<http://rdf.genenetwork.org/v1/id/>")
      ("gnc:" "<http://rdf.genenetwork.org/v1/category/>")
      ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
      ("owl:" "<http://www.w3.org/2002/07/owl#>")
      ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
      ("taxon:" "<http://purl.uniprot.org/taxonomy/>")
      ("dct:" "<http://purl.org/dc/terms/>")))
   ;; NOTE(review): the original list also named `gene-chip', which is
   ;; not defined anywhere in this file; it is left out here on the
   ;; assumption that none of the imported (transform ...) modules
   ;; exports it -- confirm, and define or restore it if that
   ;; transformer is wanted.
   (inputs
    (list info-files
          publishfreeze
          genofreeze
          probesetfreeze))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))
