From 9fc5193ada9ade78a4e0364138326b17a765f179 Mon Sep 17 00:00:00 2001
From: Munyoki Kilyungi
Date: Thu, 29 Jan 2026 22:05:25 +0300
Subject: Add datasets.scm

Signed-off-by: Munyoki Kilyungi
---
 examples/datasets.scm         | 146 ++++++++++++++++++++++++++++++++++++++++++
 examples/molecular-traits.scm | 117 ++-------------------------------
 2 files changed, 151 insertions(+), 112 deletions(-)
 create mode 100755 examples/datasets.scm

(limited to 'examples')

diff --git a/examples/datasets.scm b/examples/datasets.scm
new file mode 100755
index 0000000..d8b3a00
--- /dev/null
+++ b/examples/datasets.scm
@@ -0,0 +1,146 @@
+#! /usr/bin/env guile
+!#
+
+(use-modules (rnrs programs)
+             (rnrs io ports)
+             (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 getopt-long)
+             (ice-9 match)
+             (ice-9 regex)
+             (transform strings)
+             (transform sql)
+             (transform triples)
+             (transform special-forms))
+
+
+;; Return "" when TEXT is null or blank; otherwise return TEXT passed
+;; through sanitize-rdf-string.
+(define (sanitized-or-blank text)
+  (if (or (null? text) (string-blank? text))
+      ""
+      (sanitize-rdf-string text)))
+
+
+;; Describe each Datasets row (inner-joined to InfoFiles, grouped by
+;; DatasetId) as a dcat:Dataset whose subject is derived from
+;; InfoFiles.InfoPageName, carrying the free-text gnt:has_* metadata.
+(define-transformer gn:dataset->metadata
+  (tables (Datasets
+           (inner-join InfoFiles "ON InfoFiles.DatasetId = Datasets.DatasetId"))
+          ;; Skip monkey datasets
+          "WHERE InfoFiles.InfoPageName NOT LIKE 'INIA_MacFas_%'"
+          "GROUP BY Datasets.DatasetId")
+  (schema-triples
+   (gnt:has_case_info a owl:ObjectProperty)
+   (gnt:has_case_info rdfs:comment "Information about the cases used in this platform")
+   (gnt:has_case_info rdfs:domain dcat:Dataset)
+   (gnt:has_case_info rdfs:label "About Case")
+   (gnt:has_citation a owl:ObjectProperty)
+   (gnt:has_citation rdfs:comment "Citation for this dataset")
+   (gnt:has_citation rdfs:domain dcat:Dataset)
+   (gnt:has_citation rdfs:label "Citation")
+   (gnt:has_contributors a owl:ObjectProperty)
+   (gnt:has_contributors rdfs:comment "Contributors of this resource")
+   (gnt:has_contributors rdfs:domain dcat:Dataset)
+   (gnt:has_contributors rdfs:label "Contributors")
+   (gnt:has_data_processing_info a owl:ObjectProperty)
+   (gnt:has_data_processing_info rdfs:comment "Information about how this dataset was processed")
+   (gnt:has_data_processing_info rdfs:domain dcat:Dataset)
+   (gnt:has_data_processing_info rdfs:label "About Data Processing")
+   (gnt:has_experiment_design a owl:ObjectProperty)
+   (gnt:has_experiment_design rdfs:comment "Experiment Design for this resource")
+   (gnt:has_experiment_design rdfs:domain dcat:Dataset)
+   (gnt:has_experiment_design rdfs:label "Experiment Design")
+   (gnt:has_experiment_design_info a owl:ObjectProperty)
+   (gnt:has_experiment_design_info rdfs:comment "Information about how the experiment was designed")
+   (gnt:has_experiment_design_info rdfs:domain dcat:Dataset)
+   (gnt:has_experiment_design_info rdfs:label "Experiment Design")
+   (gnt:has_experiment_type a owl:ObjectProperty)
+   (gnt:has_experiment_type rdfs:comment "Information about the experiment type")
+   (gnt:has_experiment_type rdfs:domain dcat:Dataset)
+   (gnt:has_experiment_type rdfs:label "Experiment Type Metadata")
+   (gnt:has_platform_info a owl:ObjectProperty)
+   (gnt:has_platform_info rdfs:comment "Information about the platform that was used with this dataset")
+   (gnt:has_platform_info rdfs:domain dcat:Dataset)
+   (gnt:has_platform_info rdfs:label "About Platform")
+   (gnt:has_samples a owl:ObjectProperty)
+   (gnt:has_samples rdfs:domain dcat:Dataset)
+   (gnt:has_samples rdfs:label "Samples")
+   (gnt:has_specifics a owl:ObjectProperty)
+   (gnt:has_specifics rdfs:comment "Has specifics")
+   (gnt:has_specifics rdfs:domain dcat:Dataset)
+   (gnt:has_specifics rdfs:label "Specifics")
+   (gnt:has_summary a owl:ObjectProperty)
+   (gnt:has_summary rdfs:comment "Summary information about dataset")
+   (gnt:has_summary rdfs:domain dcat:Dataset)
+   (gnt:has_summary rdfs:label "Summary")
+   (gnt:has_tissue_info a owl:ObjectProperty)
+   (gnt:has_tissue_info rdfs:domain dcat:Dataset)
+   (gnt:has_tissue_info rdfs:label "Metadata about Tissue for this resource"))
+  (triples (string->identifier "dataset" (field InfoFiles InfoPageName) #:separator "_")
+   (set rdf:type 'dcat:Dataset)
+   (set dct:title
+        (normalize-string-field (field InfoFiles InfoPageName)))
+   (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId)))
+   (set gnt:has_experiment_type
+        (sanitized-or-blank (field InfoFiles Experiment_Type)))
+   (set gnt:has_tissue_info
+        (sanitized-or-blank (field Datasets AboutTissue)))
+   (set gnt:has_summary
+        (sanitized-or-blank (field Datasets Summary)))
+   (set gnt:has_citation
+        (sanitized-or-blank (field Datasets Citation)))
+   (set gnt:has_samples
+        (sanitized-or-blank (field InfoFiles samples)))
+   (set gnt:has_specifics
+        (sanitized-or-blank (field InfoFiles Specifics)))
+   (set gnt:has_case_info
+        (sanitized-or-blank (field Datasets AboutCases)))
+   (set gnt:has_platform_info
+        (sanitized-or-blank (field Datasets AboutPlatform)))
+   (set gnt:has_data_processing_info
+        (sanitized-or-blank (field Datasets AboutDataProcessing)))
+   (set gnt:has_experiment_design
+        (sanitized-or-blank (field Datasets ExperimentDesign)))
+   (set gnt:has_contributors
+        (sanitized-or-blank (field Datasets Contributors)))))
+
+
+;; Script entry point: parse -s/--settings, -o/--output and
+;; -d/--documentation, read the settings file, and run the transformer.
+(let* ((option-spec
+        ;; --settings is required: the file it names is read below.
+        '((settings (single-char #\s) (required? #t) (value #t))
+          (output (single-char #\o) (value #t))
+          (documentation (single-char #\d) (value #t))))
+       (options (getopt-long (command-line) option-spec))
+       (settings (option-ref options 'settings #f))
+       (output (option-ref options 'output #f))
+       (documentation (option-ref options 'documentation #f))
+       (%connection-settings
+        (call-with-input-file settings
+          read)))
+  (with-documentation
+   (name "Datasets Metadata")
+   (connection %connection-settings)
+   (table-metadata? #f)
+   (prefixes
+    ;; NOTE(review): every namespace IRI below is an empty string --
+    ;; confirm against the repository that this is intended.
+    '(("dct:" "")
+      ("dcat:" "")
+      ("gn:" "")
+      ("gnc:" "")
+      ("gnt:" "")
+      ("rdf:" "")
+      ("rdfs:" "")
+      ("owl:" "")
+      ("skos:" "")
+      ("xkos:" "")
+      ("xsd:" "")))
+   (inputs
+    (list gn:dataset->metadata))
+   (outputs
+    `(#:documentation ,documentation
+      #:rdf ,output))))
diff --git a/examples/molecular-traits.scm b/examples/molecular-traits.scm
index da33950..f338693 100755
--- a/examples/molecular-traits.scm
+++ b/examples/molecular-traits.scm
@@ -83,66 +83,15 @@
            (left-join GeneChip "ON GeneChip.Id = InfoFiles.GeneChipId"))
           "WHERE ProbeSetFreeze.public > 0 AND Species.Name != 'monkey'")
   (schema-triples
-   (gnt:has_case_info a owl:ObjectProperty)
-   (gnt:has_case_info rdfs:comment "Information about the cases used in this platform")
-   (gnt:has_case_info rdfs:domain dcat:Dataset)
-   (gnt:has_case_info rdfs:label "About Case")
-   (gnt:has_citation a owl:ObjectProperty)
-   (gnt:has_citation rdfs:comment "Citation for this dataset")
-   (gnt:has_citation rdfs:domain dcat:Dataset)
-   (gnt:has_citation rdfs:label "Citation")
-   (gnt:has_contributors a owl:ObjectProperty)
-   (gnt:has_contributors rdfs:comment "Contributors of this resource")
-   (gnt:has_contributors rdfs:comment "Contributors of this resource")
-   (gnt:has_contributors rdfs:domain dcat:Dataset)
-   (gnt:has_contributors rdfs:label "Contributors")
-   (gnt:has_data_processing_info a owl:ObjectProperty)
-   (gnt:has_data_processing_info rdfs:comment "Information about how this dataset was processed")
-   (gnt:has_data_processing_info rdfs:domain dcat:Dataset)
-   (gnt:has_data_processing_info rdfs:label "About Data Processing")
-   (gnt:has_experiment_design a owl:ObjectProperty)
-   (gnt:has_experiment_design rdfs:comment "Experiment Design for this resource")
-   (gnt:has_experiment_design rdfs:domain dcat:Dataset)
-   (gnt:has_experiment_design rdfs:label "Experiment Design")
-   (gnt:has_experiment_design_info a owl:ObjectProperty)
-   (gnt:has_experiment_design_info rdfs:comment "Information about how the experiment was designed")
-   (gnt:has_experiment_design_info rdfs:domain dcat:Dataset)
-   (gnt:has_experiment_design_info rdfs:label "Experiment Design")
-   (gnt:has_experiment_type a owl:ObjectProperty)
-   (gnt:has_experiment_type rdfs:comment "Information about the experiment type")
-   (gnt:has_experiment_type rdfs:domain dcat:Dataset)
-   (gnt:has_experiment_type rdfs:label "Experiment Type Metadata")
-   (gnt:has_platform_info a owl:ObjectProperty)
-   (gnt:has_platform_info rdfs:comment "Information about the platform that was used with this dataset")
-   (gnt:has_platform_info rdfs:domain dcat:Dataset)
-   (gnt:has_platform_info rdfs:label "About Platform")
-   (gnt:has_samples a owl:ObjectProperty)
-   (gnt:has_samples rdfs:domain dcat:Dataset)
-   (gnt:has_samples rdfs:label "Samples")
-   (gnt:has_specifics a owl:ObjectProperty)
-   (gnt:has_specifics rdfs:comment "Has specifics")
-   (gnt:has_specifics rdfs:domain dcat:Dataset)
-   (gnt:has_specifics rdfs:label "Specifics")
-   (gnt:has_summary a owl:ObjectProperty)
-   (gnt:has_summary rdfs:comment "Summary information about dataset")
-   (gnt:has_summary rdfs:domain dcat:Dataset)
-   (gnt:has_summary rdfs:label "Summary")
-   (gnt:has_tissue_info a owl:ObjectProperty)
-   (gnt:has_tissue_info rdfs:domain dcat:Dataset)
-   (gnt:has_tissue_info rdfs:label "Metadata about Tissue for this resource")
    (gnt:uses_genechip a owl:ObjectProperty)
    (gnt:uses_genechip rdfs:domain dcat:Dataset)
-   (gnt:uses_genechip skos:definition "The Platform this resource uses")
-   (gnt:uses_normalization_method rdfs:comment "The method used to map genetic or experimental data for this resource.")
+   (gnt:uses_genechip skos:definition "The Platform this resource uses for its molecular traits.")
+   (gnt:uses_normalization_method a owl:ObjectProperty)
+   (gnt:uses_normalization_method rdfs:comment "The normalization method used for the molecular traits in this dataset")
    (gnt:uses_normalization_method rdfs:domain dcat:Dataset)
-   (gnt:uses_normalization_method rdfs:label "Averaging method")
+   (gnt:uses_normalization_method rdfs:label "Averaging method used for the molecular traits in this dataset.")
    (gnt:uses_normalization_method rdfs:range gnc:avg_method))
-  (gnt:uses_normalization_method a owl:ObjectProperty)
   (triples (string->identifier "dataset" (field ProbeSetFreeze Name) #:separator "_")
-   (set rdf:type 'dcat:Dataset)
-   (set skos:prefLabel (field ProbeSetFreeze Name))
-   (set dct:title (normalize-string-field (field InfoFiles InfoPageName)))
-   (set rdfs:label (normalize-string-field (field InfoFiles InfoPageName)))
    (set dct:created (annotate-field (field ProbeSetFreeze CreateTime) '^^xsd:datetime))
    (set gnt:uses_normalization_method
         (string->identifier "avg_method" (field AvgMethod Name AvgMethodName) #:separator "_"))
@@ -153,63 +102,7 @@
    (set gnt:has_molecular_trait
         (string->identifier "trait" (field Tissue Short_Name) #:separator "_"))
    (set gnt:uses_genechip
-        (string->identifier "platform" (field GeneChip Name) #:separator "_"))
-   (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId)))
-   (set gnt:has_experiment_type
-        (let ((experiment-type
-               (field InfoFiles Experiment_Type)))
-          (if (or (null? experiment-type) (string-blank? experiment-type))
-              "" (sanitize-rdf-string experiment-type))))
-   (set gnt:has_tissue_info
-        (let ((tissue-info
-               (field Datasets AboutTissue)))
-          (if (or (null? tissue-info) (string-blank? tissue-info))
-              "" (sanitize-rdf-string tissue-info))))
-   (set gnt:has_summary
-        (let* ((summary
-                (field Datasets Summary)))
-          (if (or (null? summary) (string-blank? summary))
-              "" (sanitize-rdf-string summary))))
-   (set gnt:has_citation
-        (let ((citation
-               (field Datasets Citation)))
-          (if (or (null? citation) (string-blank? citation))
-              "" (sanitize-rdf-string citation))))
-   (set gnt:has_samples
-        (let ((samples
-               (field InfoFiles samples)))
-          (if (or (null? samples) (string-blank? samples))
-              "" (sanitize-rdf-string samples))))
-   (set gnt:has_specifics
-        (let* ((specifics
-                (field InfoFiles Specifics)))
-          (if (or (null? specifics) (string-blank? specifics))
-              "" (sanitize-rdf-string specifics))))
-   (set gnt:has_case_info
-        (let ((cases
-               (field Datasets AboutCases)))
-          (if (or (null? cases) (string-blank? cases))
-              "" (sanitize-rdf-string cases))))
-   (set gnt:has_platform_info
-        (let* ((platform
-                (field Datasets AboutPlatform)))
-          (if (or (null? platform) (string-blank? platform))
-              "" (sanitize-rdf-string platform))))
-   (set gnt:has_data_processing_info
-        (let* ((processing
-                (field Datasets AboutDataProcessing)))
-          (if (or (null? processing) (string-blank? processing))
-              "" (sanitize-rdf-string processing))))
-   (set gnt:has_experiment_design
-        (let ((experiment-design
-               (field Datasets ExperimentDesign)))
-          (if (or (null? experiment-design) (string-blank? experiment-design))
-              "" (sanitize-rdf-string experiment-design))))
-   (set gnt:has_contributors
-        (let ((contributors
-               (field Datasets Contributors)))
-          (if (or (null? contributors) (string-blank? contributors))
-              "" (sanitize-rdf-string contributors))))))
+        (string->identifier "platform" (field GeneChip Name) #:separator "_"))))
 
 
 (let* ((option-spec
-- 
cgit 1.4.1