diff options
Diffstat (limited to 'examples')
| -rwxr-xr-x | examples/classification.scm | 175 | ||||
| -rwxr-xr-x | examples/dataset-metadata-git.scm | 94 | ||||
| -rwxr-xr-x | examples/dataset-metadata.scm | 505 | ||||
| -rwxr-xr-x | examples/datasets.scm | 120 | ||||
| -rwxr-xr-x | examples/gene-chip.scm | 76 | ||||
| -rwxr-xr-x | examples/genotype-datasets.scm | 87 | ||||
| -rwxr-xr-x | examples/genotype.scm | 88 | ||||
| -rwxr-xr-x | examples/investigators.scm | 93 | ||||
| -rwxr-xr-x | examples/molecular-traits-datasets.scm | 100 | ||||
| -rwxr-xr-x | examples/molecular-traits.scm | 63 | ||||
| -rwxr-xr-x | examples/ontology.scm | 272 | ||||
| -rwxr-xr-x | examples/phenotype-datasets.scm | 109 | ||||
| -rwxr-xr-x | examples/phenotype.scm | 167 | ||||
| -rwxr-xr-x | examples/publication.scm | 4 | ||||
| -rwxr-xr-x | examples/schema.scm | 17 |
15 files changed, 1023 insertions, 947 deletions
diff --git a/examples/classification.scm b/examples/classification.scm index 7f89be8..130bec8 100755 --- a/examples/classification.scm +++ b/examples/classification.scm @@ -14,69 +14,24 @@ ;; Classification Scheme -(define-transformer classification-scheme-species - (tables (Species)) - (schema-triples - (gnc:resource_classification_scheme a skos:ConceptScheme) - (gnc:resource_classification_scheme skos:prefLabel "GeneNetwork Resource Classification Scheme") - (gnc:resource_classification_scheme skos:definition "A hierarchical classification scheme for organizing GeneNetwork resources by dataset type, resource set (inbredset group), or species.") - (gnc:resource_classification_scheme xkos:numberOfLevels "4") - (gnc:resource_classification_scheme xkos:levels gnc:taxonomic_family) - (gnc:resource_classification_scheme xkos:levels gnc:species) - (gnc:resource_classification_scheme xkos:levels gnc:set) - (gnc:resource_classification_scheme xkos:levels gnc:population_category) - (gnc:population_category a xkos:ClassificationLevel) - (gnc:population_category skos:inScheme gnc:resource_classification_scheme) - (gnc:population_category xkos:nextLevel gnc:set) - (gnc:population_category skos:prefLabel "Species") - (gnc:population_category rdfs:label "Population Category") - (gnc:population_category xkos:depth "4") - (gnt:population_category skos:definition "Classification of genetic populations by breeding design and data aggregation.") - (gnc:species a xkos:ClassificationLevel) - (gnc:species skos:inScheme gnc:resource_classification_scheme) - (gnc:species xkos:previousLevel gnc:taxonomic_family) - (gnc:species xkos:nextLevel gnc:set) - (gnc:species skos:prefLabel "Species") - (gnc:species skos:definition "A classification level that that associates a given resource to a species in GeneNetwork.") - (gnc:species xkos:depth "2")) +(define-transformer gnc:species->gn:species + (tables (Species) + "WHERE Name != 'monkey'") (triples "gnc:species" (set skos:member (string->identifier "" (remap-species-identifiers (field Species Fullname)))))) -(define-transformer classification-scheme-set +(define-transformer gnc:set->gn:set (tables (InbredSet) - "WHERE public > 0") - (schema-triples - (gnc:set a xkos:ClassificationLevel) - (gnc:set skos:inScheme gnc:resource_classification_scheme) - (gnc:set xkos:nextLevel gnc:population_category) - (gnc:set xkos:previousLevel gnc:species) - (gnc:set skos:prefLabel "InbredSet Group") - (gnc:set skos:definition "A category representing groups of genetically related strains or individuals (inbred sets, recombinant inbred lines, etc.).") - (gnc:set xkos:depth "3")) + "WHERE public > 0 AND FullName NOT LIKE '%monkey%'") (triples "gnc:set" (set skos:member (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) -(define-transformer species - (tables (Species)) - (schema-triples - (gnt:has_uniprot_taxon_id a owl:ObjectProperty) - (gnt:has_uniprot_taxon_id rdfs:label "has uniprot taxonomic id") - (gnt:has_taxonomic_family a owl:ObjectProperty) - (gnt:has_taxonomic_family rdfs:label "has family") - (gnt:has_taxonomic_family skos:definition "Links a species to its taxonomic family") - (gnt:has_taxonomic_family schema:domainIncludes gnc:species) - (gnt:has_taxonomic_family schema:domainIncludes gnc:set) - (gnt:short_name a owl:DatatypeProperty) - (gnt:short_name rdfs:label "has short name") - (gnt:short_name rdfs:domain gnc:species) - (gnt:short_name skos:definition "The short name of a given resource") - (gnt:has_species a owl:ObjectProperty) - (gnt:has_species rdf:comment "This resource belongs to this species") - (gnt:has_species rdfs:label "belongs to species") - (gnt:has_species rdfs:range gnc:species)) +(define-transformer gnc:species->metadata + (tables (Species) + "WHERE Name != 'monkey'") (triples (string->identifier "" (remap-species-identifiers (field Species Fullname))) (set rdf:type 'gnc:species) @@ -89,33 +44,17 @@ 'taxon: (field Species TaxonomyId))))) -(define-transformer species-fan-out +(define-transformer gnc:species->gn:set (tables (InbredSet (left-join Species "ON InbredSet.SpeciesId=Species.Id")) - "WHERE public > 0") - (schema-triples - (gnt:has_strain a owl:ObjectProperty) - (gnt:has_strain rdfs:range gnc:set) - (gnt:has_strain rdfs:domain gnc:species) - (gnt:has_strain rdfs:label "this resource belongs to this strain.") - (gnt:has_strain skos:definition "Lists all strains that belong to this resource.")) + "WHERE public > 0 AND Species.Name != 'monkey'") (triples (string->identifier "" (remap-species-identifiers (field Species Fullname))) (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) - -(define-transformer species-families-list +(define-transformer gn:family->gn:species/metadata (tables (Species) - "GROUP BY FAMILY") - (schema-triples - (gnc:taxonomic_family a xkos:ClassificationLevel) - (gnc:taxonomic_family skos:inScheme gnc:resource_classification_scheme) - (gnc:taxonomic_family skos:prefLabel "Family") - (gnc:taxonomic_family skos:definition "An organizational classification level used in GeneNetwork to group resources into families.") - (gnc:taxonomic_family xkos:depth "1") - (gnc:taxonomic_family xkos:nextLevel gnc:species) - (gnt:has_family_order_id a owl:DatatypeProperty) - (gnt:has_family_order_id rdfs:range xsd:integer)) + "WHERE Name != 'monkey' GROUP BY FAMILY") (triples (string->identifier "family" (field Species Family) #:separator "_") (set gnt:has_species (string->identifier "" (remap-species-identifiers (field Species Fullname)))) @@ -124,49 +63,26 @@ (annotate-field (field Species OrderId) '^^xsd:integer)))) -(define-transformer species-families-fanout - (tables (Species)) - (schema-triples - (gnt:has_family_order_id a owl:DatatypeProperty)) +(define-transformer gn:family->gn:species + (tables (Species) + "WHERE Name != 'monkey'") (triples (string->identifier "family" (field Species Family) #:separator "_") (set gnt:has_species (string->identifier "" (remap-species-identifiers (field Species Fullname)))))) -(define-transformer inbred-set +(define-transformer gn:set->metadata (tables (InbredSet (left-join Species "ON InbredSet.SpeciesId=Species.Id") (left-join MappingMethod "ON InbredSet.MappingMethodId=MappingMethod.Id")) - "WHERE public > 0") - (schema-triples - (gnt:genetic_type a owl:DatatypeProperty) - (gnt:genetic_type rdfs:label "has genetic type") - (gnt:genetic_type skos:definition "Describes the genetic architecture of a resource set (e.g., intercross, riset).") - (gnt:genetic_type rdfs:domain gnc:set) - (gnt:genetic_type rdfs:range xsd:string) - (gnt:has_set_code a owl:DatatypeProperty) - (gnt:has_set_code rdfs:label "has set code") - (gnt:has_set_code skos:definition "Provides a unique identifier code for a resource set.") - (gnt:has_set_code rdfs:domain gnc:set) - (gnt:has_set_code rdfs:range xsd:string) - (gnt:uses_mapping_method a owl:ObjectProperty) - (gnt:uses_mapping_method rdfs:label "mapping method") - (gnt:uses_mapping_method rdfs:domain gnc:set) - (gnt:uses_mapping_method rdfs:range gnc:mapping_method) - (gnt:uses_mapping_method rdfs:comment "The method used to map genetic or experimental data for this resource.") - (gnt:has_strain a owl:ObjectProperty) - (gnt:has_strain rdf:comment "Indicates the group the resources belongs to") - (gnt:has_strain schema:domainIncludes dcat:Dataset) - (gnt:has_strain schema:domainIncludes gnc:species) - (gnt:has_strain rdfs:range gnc:set) - (gnt:has_strain rdfs:label "belongs-to-group")) + "WHERE public > 0 AND Species.Name != 'monkey'") (triples (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_") (set rdf:type 'gnc:set) (set rdfs:label (field InbredSet FullName)) (set skos:prefLabel (field InbredSet Name InbredSetName)) (set gnt:genetic_type (field InbredSet GeneticType)) - (set dct:description (annotate-field (field InbredSet description) + (set dct:description (annotate-field (sanitize-rdf-string (field InbredSet description)) '^^rdf:HTML)) (set gnt:uses_mapping_method (string->identifier "mapping_method" (field MappingMethod Name) #:separator "_")) @@ -174,9 +90,9 @@ (set gnt:has_species (string->identifier "" (remap-species-identifiers (field Species Fullname)))))) -(define-transformer inbredset-population* +(define-transformer gn:set->gn:population (tables (InbredSet) - "WHERE Family IS NOT NULL") + "WHERE Family IS NOT NULL AND FullName NOT LIKE '%monkey%'") (schema-triples (gnt:has_reference_population rdfs:domain gnc:set) (gnt:has_reference_population a owl:ObjectProperty) @@ -186,14 +102,9 @@ (set gnt:has_reference_population (string->identifier "population" (field InbredSet Family) #:separator "_")))) -(define-transformer inbredset-population-list +(define-transformer gn:population->metadata (tables (InbredSet) - "WHERE Family IS NOT NULL GROUP BY Family") - (schema-triples - (gnc:reference_population a skos:Concept) - (gnc:reference_population skos:inScheme gnc:population_category) - (gnc:reference_population skos:prefLabel "Reference population") - (gnc:reference_population skos:definition "A genetic population")) + "WHERE Family IS NOT NULL AND FullName NOT LIKE '%monkey%' GROUP BY Family") (triples (string->identifier "population" (field InbredSet Family) #:separator "_") (set rdf:type 'gnc:reference_population) (set rdfs:label (field InbredSet Family)) @@ -202,27 +113,23 @@ (annotate-field (field InbredSet FamilyOrder) '^^xsd:integer)))) -(define-transformer inbredset-population-fanout +(define-transformer gn:population->gn:set (tables (InbredSet) - "WHERE Family IS NOT NULL") + "WHERE Family IS NOT NULL AND FullName NOT LIKE '%monkey%'") (triples (string->identifier "population" (field InbredSet Family) #:separator "_") (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) -(define-transformer population-category-inbredset +(define-transformer gnc:population_category->gn:population (tables (InbredSet) - "WHERE public > 0 GROUP BY Family") + "WHERE public > 0 AND FullName NOT LIKE '%monkey%' GROUP BY Family") (triples "gnc:population_category" (set gnt:has_reference_population (string->identifier "population" (field InbredSet Family) #:separator "_")))) -(define-transformer family-category-species +(define-transformer gnc:taxonomic_family->gn:family (tables (Species) - "GROUP BY Family") - (schema-triples - (gnt:assigned_species rdfs:domain gnc:set) - (gnt:assigned_species a owl:ObjectProperty) - (gnt:assigned_species rdfs:label "These families have been assigned to these species")) + "WHERE Name != 'monkey' GROUP BY Family") (triples "gnc:taxonomic_family" (set gnt:has_taxonomic_family (string->identifier "family" (field Species Family) #:separator "_")))) @@ -242,7 +149,7 @@ read))) (with-documentation - (name "Species Metadata") + (name "GN Classification Hierarchy") (connection %connection-settings) (table-metadata? #f) (prefixes @@ -260,18 +167,18 @@ ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("taxon:" "<http://purl.uniprot.org/taxonomy/>"))) (inputs - (list classification-scheme-species - classification-scheme-set - species - species-fan-out - inbred-set - inbredset-population* - species-families-list - species-families-fanout - inbredset-population-list - inbredset-population-fanout - population-category-inbredset - family-category-species)) + (list gnc:species->gn:species + gnc:set->gn:set + gnc:species->metadata + gnc:species->gn:set + gn:family->gn:species/metadata + gn:family->gn:species + gn:set->metadata + gn:set->gn:population + gn:population->metadata + gn:population->gn:set + gnc:population_category->gn:population + gnc:taxonomic_family->gn:family)) (outputs `(#:documentation ,documentation #:rdf ,output)))) diff --git a/examples/dataset-metadata-git.scm b/examples/dataset-metadata-git.scm deleted file mode 100755 index c9ea59b..0000000 --- a/examples/dataset-metadata-git.scm +++ /dev/null @@ -1,94 +0,0 @@ -#! /usr/bin/env guile - -!# -(use-modules - (ice-9 getopt-long) - (srfi srfi-26) - ((ice-9 regex) #:select (regexp-substitute/global)) - ((transform strings) #:select (string-blank? string-capitalize-first)) - ((transform sql) #:select (call-with-target-database sql-for-each))) - -(define (save-file file result) - (when result - (let ((dir-name (dirname file))) - (unless (file-exists? dir-name) - (mkdir dir-name)) - (with-output-to-file file - (lambda () - (format #t "~a" result)))))) - -(define (infopages/sql->rtf result) - (let* ((get (cut assoc-ref result <>)) - (get* (compose (lambda (str) - (if (or (string-blank? str) - (string-ci=? - (string-trim-both str) "None")) - #f - str)) - get)) - (identifier - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (get "InfoPageName") - 'pre "_" 'post))) - (dir-name "/export/data/genenetwork/gn-docs/general/datasets/") - (file-name (cut string-append dir-name <>)) - (summary (get* "Summary")) - (tissue (get* "AboutTissue")) - (specifics (get* "Specifics")) - (contributors (get* "Contributors")) - (cases (get* "AboutCases")) - (platform (get* "AboutPlatform")) - (processing (get* "AboutDataProcessing")) - (notes (get* "Notes")) - (citation (get* "Citation")) - (experiment-type (get* "Experiment_Type")) - (experiment-design (get* "ExperimentDesign")) - (acknowledgment (get* "Acknowledgment"))) - (for-each (lambda (x) - (save-file - (string-append (file-name identifier) - "/" - (car x)) - (cdr x))) - `(("summary.rtf" . ,summary) - ("tissue.rtf" . ,tissue) - ("citation.rtf" . ,citation) - ("specifics.rtf" . ,specifics) - ("cases.rtf" . ,cases) - ("platform.rtf" . ,platform) - ("processing.rtf" . ,processing) - ("notes.rtf" . ,notes) - ("experiment-design.rtf" . ,experiment-design) - ("experiment-type.rtf" . ,experiment-type) - ("contributors.rtf" . ,contributors) - ("acknowledgment.rtf" . ,acknowledgment))))) - - -(let* ((option-spec - '((settings (single-char #\s) (value #t)))) - (options (getopt-long (command-line) option-spec)) - (settings (option-ref options 'settings #f)) - (query "SELECT InfoPageName, Datasets.Summary, Datasets.AboutTissue, InfoFiles.Specifics, -Datasets.AboutCases, Datasets.AboutPlatform, Datasets.AboutDataProcessing, InfoFiles.Experiment_Type, -Datasets.Notes, Datasets.ExperimentDesign, Datasets.Acknowledgment, Datasets.Contributors, Datasets.Citation -FROM InfoFiles LEFT JOIN Datasets USING (DatasetId)") - (%connection-settings - (call-with-input-file settings - read))) - (call-with-target-database - %connection-settings - (lambda (db) - (let ((dir "/export/data/genenetwork/gn-docs/")) - (chdir dir) - (system "git reset --hard origin") - (system "git pull") - ;; Clear directory so that we can re-do the dump again from the db. - (system "rm -rf general/datasets/*/") - (sql-for-each infopages/sql->rtf - db - query) - (system "git add general/datasets") - (system (format #f "git commit -m ~s" "Update dataset RTF Files.")) - (system "git push origin master"))))) diff --git a/examples/dataset-metadata.scm b/examples/dataset-metadata.scm deleted file mode 100755 index 53ff2d7..0000000 --- a/examples/dataset-metadata.scm +++ /dev/null @@ -1,505 +0,0 @@ -#! /usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 getopt-long) - (ice-9 match) - (ice-9 regex) - (transform strings) - (transform sql) - (transform triples) - (transform special-forms)) - - - -;; One email ID in the Investigators table has spaces in it. This -;; function fixes that. -(define (fix-email-id email) - (string-delete #\space email)) - -(define (investigator-attributes->id first-name last-name email) - ;; There is just one record corresponding to "Evan Williams" which - ;; does not have an email ID. To accommodate that record, we - ;; construct the investigator ID from not just the email ID, but - ;; also the first and the last names. It would be preferable to just - ;; find Evan Williams' email ID and insert it into the database. - (string->identifier "investigator" - (string-join - (list first-name last-name (fix-email-id email)) - "_") - #:separator "_")) - -(define-transformer investigators - ;; There are a few duplicate entries. We group by email to - ;; deduplicate. - (tables (Investigators) - "GROUP BY Email") - (triples (investigator-attributes->id (field Investigators FirstName) - (field Investigators LastName) - "") - (set rdf:type 'foaf:Person) - (set foaf:name (string-append (field Investigators FirstName) " " - (field Investigators LastName))) - (set foaf:givenName - (field Investigators FirstName)) - (set foaf:familyName - (field Investigators LastName)) - (set foaf:homepage (field Investigators Url)) - (set v:adr (field Investigators Address)) - (set v:locality (field Investigators City)) - (set v:region (field Investigators State)) - (set v:postal-code (field Investigators ZipCode)) - (set v:country-name (field Investigators Country)))) - -(define-transformer gene-chip - (tables (GeneChip - (left-join Species "USING (SpeciesId)"))) - (schema-triples - (gnc:gene_chip a skos:Concept) - (gnc:gene_chip - skos:description - "This is a set of controlled terms that are used to describe a given gene chip/platform") - (gnt:has_geo_series_id rdfs:domain gnc:platform) - (gnt:has_geo_series_id rdfs:domain gnc:gene_chip) - (gnt:has_go_tree_value a owl:ObjectProperty) - (gnt:has_go_tree_value skos:definition "This resource the following GO tree value") - (gnt:has_go_tree_value rdfs:domain gnc:gene_chip)) - (triples (string->identifier "platform" (field GeneChip Name) #:separator "_") - (set rdf:type 'gnc:gene_chip) - (set rdfs:label (field GeneChip GeneChipName)) - (set skos:prefLabel (field GeneChip Name)) - (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)" - Title))) - (set gnt:has_go_tree_value (field GeneChip Go_tree_value)) - (set xkos:classifiedUnder - (string->identifier "" (remap-species-identifiers (field Species Fullname)) #:separator "")) - (set gnt:has_geo_series_id - (ontology 'geoSeries: - (string-trim-both (field GeneChip GeoPlatform)))))) - -(define-transformer info-files - (tables (InfoFiles - (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name") - (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name") - (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name") - (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId") - (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId") - (left-join Datasets "USING (DatasetId)") - (left-join DatasetStatus "USING (DatasetStatusId)") - (left-join Tissue "USING (TissueId)") - (left-join Investigators "USING (InvestigatorId)") - (left-join AvgMethod "USING (AvgMethodId)") - (left-join Organizations "USING (OrganizationId)") - (left-join GeneChip "USING (GeneChipId)")) - ;; XXXX: There are datasets that don't have the InbredSetId - ;; in the Infofiles table. This clause allows us to check - ;; if they exist in the (Publish/Geno)Freeze tables. - "LEFT JOIN InbredSet PublishInbredSet ON PublishFreeze.InbredSetId = PublishInbredSet.InbredSetId LEFT JOIN InbredSet GenoInbredSet ON GenoFreeze.InbredSetId = GenoInbredSet.InbredSetId WHERE GN_AccesionId IS NOT NULL") - (schema-triples - (gnt:has_tissue rdfs:domain dcat:Dataset) - (gnt:has_tissue a owl:ObjectProperty) - (gnt:has_tissue skos:definition "Tissues this resource has") - (gnt:uses_normalization rdfs:domain dcat:Dataset) - (gnt:uses_normalization a owl:ObjectProperty) - (gnt:uses_normalization skos:definition "Normalization techniques this resource has") - (gnt:uses_platform rdfs:domain dcat:Dataset) - (gnt:uses_platform a owl:ObjectProperty) - (gnt:uses_platform skos:definition "The Platform this resource uses") - (gnt:has_geo_series_id rdfs:domain dcat:Dataset) - (gnt:has_geo_series_id a owl:ObjectProperty) - (gnt:has_geo_series_id skos:definition "id of record in NCBI database") - (gnt:has_experiment_type rdfs:domain dcat:Dataset) - (gnt:has_experiment_type a owl:ObjectProperty) - (gnt:has_experiment_type rdfs:label "Experiment Type Metadata") - (gnt:has_experiment_type skos:definition "Information about the experiment type") - (gnt:has_tissue_info rdfs:domain dcat:Dataset) - (gnt:has_tissue_info a owl:ObjectProperty) - (gnt:has_tissue_info skos:definition "Metadata about Tissue for this resource") - (gnt:has_experiment_design_info rdfs:domain dcat:Dataset) - (gnt:has_experiment_design_info rdfs:label "Experiment Design") - (gnt:has_experiment_design_info a owl:ObjectProperty) - (gnt:has_experiment_design_info skos:definition "Information about how the experiment was designed") - (gnt:has_notes rdfs:domain dcat:Dataset) - (gnt:has_notes a owl:ObjectProperty) - (gnt:has_notes rdfs:label "Notes") - (gnt:has_notes skos:definition "Extra Notes about this dataset") - (gnt:has_data_processing_info rdfs:domain dcat:Dataset) - (gnt:has_data_processing_info rdfs:label "About Data Processing") - (gnt:has_data_processing_info a owl:ObjectProperty) - (gnt:has_data_processing_info skos:definition "Information about how this dataset was processed") - (gnt:has_platform_info rdfs:domain dcat:Dataset) - (gnt:has_platform_info a owl:ObjectProperty) - (gnt:has_platform_info rdfs:label "About Platform") - (gnt:has_platform_info skos:definition "Information about the platform that was used with this dataset") - (gnt:has_case_info rdfs:domain dcat:Dataset) - (gnt:has_case_info rdfs:label "About Case") - (gnt:has_case_info a owl:ObjectProperty) - (gnt:has_case_info skos:definition "Information about the cases used in this platform") - (gnt:has_summary rdfs:domain dcat:Dataset) - (gnt:has_summary rdfs:label "Summary") - (gnt:has_summary a owl:ObjectProperty) - (gnt:has_summary skos:definition "Summary information about dataset") - (gnt:has_citation rdfs:domain dcat:Dataset) - (gnt:has_citation rdfs:label "Citation") - (gnt:has_citation a owl:ObjectProperty) - (gnt:has_citation skos:definition "Citation for this dataset") - (gnt:has_contributors rdfs:domain dcat:Dataset) - (gnt:has_contributors rdfs:label "Contributors") - (gnt:has_contributors a owl:ObjectProperty) - (gnt:has_contributors skos:definition "Contributors of this resource") - (gnt:has_experiment_design rdfs:domain dcat:Dataset) - (gnt:has_experiment_design rdfs:label "Experiment Design") - (gnt:has_experiment_design a owl:ObjectProperty) - (gnt:has_experiment_design skos:definition "Experiment Design for this resource") - (gnt:has_tissue_info rdfs:domain dcat:Dataset) - (gnt:has_tissue_info rdfs:label "Tissue Information") - (gnt:has_tissue_info a owl:ObjectProperty) - (gnt:has_tissue_info skos:definition "Tissue information about dataset") - (gnt:has_experiment_type skos:definition "Information about the experiment type") - (gnt:has_acknowledgement rdfs:domain dcat:Dataset) - (gnt:has_acknowledgement rdfs:label "Acknowledgement") - (gnt:has_acknowledgement a owl:ObjectProperty) - (gnt:has_acknowledgement skos:definition "People to acknowledge")) - (triples - (string->identifier - "" (let ((info-page-name (field InfoFiles InfoPageName)) - (info-title (field InfoFiles Title))) - (format #f "~a" - (if (and (string? info-page-name) - (string=? (string-downcase (string-trim-both info-page-name)) - "none")) - info-title info-page-name)))) - (set rdf:type 'dcat:Dataset) - (set xkos:classifiedUnder - (let ([dataset-type - (string-trim-both - (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotype', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotype', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:probeset', '')))" - DatasetType)))]) - (if (not (string-null? dataset-type)) - (string->symbol - dataset-type) - ""))) - (set rdfs:label (normalize-string-field (field InfoFiles InfoPageName))) - (set skos:prefLabel - (normalize-string-field - (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))" - DatasetFullName)))) - (set skos:altLabel (field Datasets DatasetName DatasetGroup)) - (set dct:title (normalize-string-field (field Datasets PublicationTitle))) - (set dct:created - (normalize-string-field - (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))" - createTimeGenoFreeze)))) - (set dcat:contactPoint - (investigator-attributes->id (field Investigators FirstName) - (field Investigators LastName) - "")) - (set foaf:Organization - (field Organizations OrganizationName)) - (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId))) - (set dct:accessRights (string-downcase - (field DatasetStatus DatasetStatusName))) - (set gnt:has_strain - (string->identifier - "set" - (field ("IFNULL(InbredSet.Name, IFNULL(PublishInbredSet.Name, GenoInbredSet.Name))" - InbredSetName)) - #:separator "_")) - (set gnt:has_tissue (string->identifier "tissue" - (field Tissue Short_Name) - #:separator "_")) - (set gnt:uses_normalization - (let ((avg-method (normalize-string-field (field AvgMethod Name AvgMethodName)))) - (if (not (string-blank? avg-method)) - (string->identifier "avg_method" avg-method #:separator "_") - ""))) - (set gnt:has_summary - (let* ((summary-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/summary.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (summary - (field InfoFiles Summary))) - (if (or (null? summary) (string-blank? summary)) - "" (string->symbol summary-link)))) - (set gnt:has_tissue_info - (let* ((tissue-info-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/tissue.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (tissue-info - (field Datasets AboutTissue))) - (if (or (null? tissue-info) (string-blank? tissue-info)) - "" (string->symbol tissue-info-link)))) - (set gnt:has_citation - (let* ((citation-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/citation.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (citation - (field Datasets Citation))) - (if (or (null? citation) (string-blank? citation)) - "" (string->symbol citation-link)))) - (set gnt:hasSpecifics - (let* ((specifics-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/specifics.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (specifics - (field InfoFiles Specifics))) - (if (or (null? specifics) (string-blank? specifics)) - "" (string->symbol specifics-link)))) - (set gnt:has_case_info - (let* ((cases-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/cases.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (cases - (field Datasets AboutCases))) - (if (or (null? cases) (string-blank? cases)) - "" (string->symbol cases-link)))) - (set gnt:has_platform_info - (let* ((platform-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/platform.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (platform - (field Datasets AboutPlatform))) - (if (or (null? platform) (string-blank? platform)) - "" (string->symbol platform-link)))) - (set gnt:has_data_processing_info - (let* ((processing-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/processing.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (processing - (field Datasets AboutDataProcessing))) - (if (or (null? processing) (string-blank? processing)) - "" (string->symbol processing-link)))) - (set gnt:has_notes - (let* ((notes-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/notes.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (notes - (field Datasets Notes))) - (if (or (null? notes) (string-blank? notes)) - "" (string->symbol notes-link)))) - (set gnt:has_experiment_type - (let* ((experiment-type-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/experiment-type.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (experiment-type - (field InfoFiles Experiment_Type))) - (if (or (null? experiment-type) (string-blank? experiment-type)) - "" (string->symbol experiment-type-link)))) - (set gnt:has_experiment_design - (let* ((experiment-design-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/experiment-design.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (experiment-design - (field Datasets ExperimentDesign))) - (if (or (null? experiment-design) (string-blank? experiment-design)) - "" (string->symbol experiment-design-link)))) - (set gnt:has_contributors - (let* ((contributors-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/contributors.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (contributors - (field Datasets Contributors))) - (if (or (null? contributors) (string-blank? contributors)) - "" (string->symbol contributors-link)))) - (set gnt:has_acknowledgement - (let* ((acknowledgment-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/acknowledgment.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (acknowledgment - (field Datasets Acknowledgment))) - (if (or (null? acknowledgment) (string-blank? acknowledgment)) - "" (string->symbol acknowledgment-link)))) - (set gnt:uses_platform - (string->identifier "platform" - (field GeneChip Name GeneChip) - #:separator "_")) - (set gnt:has_geo_series_id - (let ((s - (string-match "GSE[0-9]*" - (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries))))) - (if s (ontology - 'geoSeries: (match:substring s)) - ""))))) - -;; These are phenotype datasets that don't have Infofile metadata -(define-transformer publishfreeze - (tables (PublishFreeze - (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name") - (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId")) - "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL") - (triples - (string->identifier "" (field PublishFreeze Name)) - (set rdf:type 'dcat:Dataset) - (set xkos:classifiedUnder 'gnc:phenotype) - (set dct:title (field PublishFreeze FullName)) - (set rdfs:label (field PublishFreeze Name)) - (set skos:altLabel (field PublishFreeze ShortName)) - (set dct:created (annotate-field - (field PublishFreeze CreateTime) - '^^xsd:date)) - (set gnt:has_strain - (string->identifier - "set" (field InbredSet Name InbredSetName) - #:separator "_")))) - -(define-transformer genofreeze - (tables (GenoFreeze - (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name") - (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId")) - "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL") - (triples - (string->identifier "" (field GenoFreeze Name)) - (set rdf:type 'dcat:Dataset) - (set xkos:classifiedUnder 'gnc:genotype) - (set rdfs:label (field GenoFreeze Name)) - (set dct:title (field GenoFreeze FullName)) - (set skos:altLabel (field GenoFreeze ShortName)) - (set dct:created (annotate-field - (field GenoFreeze CreateTime) - '^^xsd:date)) - (set gnt:has_strain - (string->identifier - "set" (field InbredSet Name InbredSetName) - #:separator "_" - #:proc (lambda (x) x))))) - -;; Molecular Traits are also referred to as ProbeSets -(define-transformer probesetfreeze - (tables (ProbeSetFreeze - (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name") - (left-join ProbeFreeze "USING (ProbeFreezeId)") - (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID") - (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id") - (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId")) - "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id") - (schema-triples - (gnt:uses_normalization rdfs:domain gnc:probeset) - (gnt:uses_data_scale rdfs:domain gnc:probeset) - (gnt:uses_data_scale a owl:ObjectProperty) - (gnt:uses_data_scale skos:definition "Thi data scale this resource uses")) - (triples - (string->identifier "" (field ProbeSetFreeze Name)) - (set rdf:type 'dcat:Dataset) - (set xkos:classifiedUnder 'gnc:probeset) - (set gnt:uses_normalization - (let ((avg-method (field AvgMethod Name AvgMethodName))) - (if (string-blank? avg-method) - #f - avg-method))) - (set dct:title (field ProbeSetFreeze FullName)) - (set rdfs:label (field ProbeSetFreeze ShortName)) - (set skos:prefLabel (field ProbeSetFreeze Name)) - (set skos:altLabel (field ProbeSetFreeze Name2)) - (set dct:created (annotate-field - (field ProbeSetFreeze CreateTime) - '^^xsd:datetime)) - (set gnt:uses_data_scale (field ProbeSetFreeze DataScale)) - (set gnt:has_tissue (string->identifier "tissue" (field Tissue Short_Name) #:separator "_")) - (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) - - - -(let* ((option-spec - '((settings (single-char #\s) (value #t)) - (output (single-char #\o) (value #t)) - (documentation (single-char #\d) (value #t)))) - (options (getopt-long (command-line) option-spec)) - (settings (option-ref options 'settings #f)) - (output (option-ref options 'output #f)) - (documentation (option-ref options 'documentation #f)) - (%connection-settings - (call-with-input-file settings - read))) - (with-documentation - (name "Info files / Investigators Metadata") - (connection %connection-settings) - (table-metadata? #f) - (prefixes - '(("v:" "<http://www.w3.org/2006/vcard/ns#>") - ("foaf:" "<http://xmlns.com/foaf/0.1/#term_>") - ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") - ("dcat:" "<http://www.w3.org/ns/dcat#>") - ("skos:" "<http://www.w3.org/2004/02/skos/core#>") - ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") - ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>") - ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") - ("gn:" "<http://rdf.genenetwork.org/v1/id/>") - ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") - ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") - ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") - ("taxon:" "<http://purl.uniprot.org/taxonomy/>") - ("dct:" "<http://purl.org/dc/terms/>"))) - (inputs - (list info-files - publishfreeze - genofreeze - probesetfreeze - investigators - gene-chip)) - (outputs - `(#:documentation ,documentation - #:rdf ,output)))) - - diff --git a/examples/datasets.scm b/examples/datasets.scm new file mode 100755 index 0000000..85a5aee --- /dev/null +++ b/examples/datasets.scm @@ -0,0 +1,120 @@ +#! /usr/bin/env guile +!# + +(use-modules (rnrs programs) + (rnrs io ports) + (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms)) + + +(define-transformer gn:dataset->metadata + (tables (Datasets + (inner-join InfoFiles "ON InfoFiles.DatasetId = Datasets.DatasetId") + (inner-join InbredSet "ON InbredSet.Id = InfoFiles.InbredSetId")) + ;; Skip monkey datasets + "WHERE InfoFiles.InfoPageName NOT LIKE 'INIA_MacFas_%'" + "GROUP BY Datasets.DatasetId") + (triples (string->identifier "dataset" (field InfoFiles InfoPageName) #:separator "_") + (set rdf:type 'dcat:Dataset) + (set dct:title (normalize-string-field (field InfoFiles InfoPageName))) + (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId))) + (set gnt:has_genotype_files (string->symbol (format #f "gn-files:GN~a%2F" (field InfoFiles GN_AccesionId)))) + (set gnt:has_strain + (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")) + (set gnt:has_experiment_type + (let ((experiment-type + (field InfoFiles Experiment_Type))) + (if (or (null? experiment-type) (string-blank? experiment-type)) + "" (sanitize-rdf-string experiment-type)))) + (set gnt:has_tissue_info + (let ((tissue-info + (field Datasets AboutTissue))) + (if (or (null? tissue-info) (string-blank? tissue-info)) + "" (sanitize-rdf-string tissue-info)))) + (set gnt:has_summary + (let* ((summary + (field Datasets Summary))) + (if (or (null? summary) (string-blank? summary)) + "" (sanitize-rdf-string summary)))) + (set gnt:has_citation + (let ((citation + (field Datasets Citation))) + (if (or (null? citation) (string-blank? citation)) + "" (sanitize-rdf-string citation)))) + (set gnt:has_samples + (let ((samples + (field InfoFiles samples))) + (if (or (null? samples) (string-blank? samples)) + "" (sanitize-rdf-string samples)))) + (set gnt:has_specifics + (let* ((specifics + (field InfoFiles Specifics))) + (if (or (null? specifics) (string-blank? specifics)) + "" (sanitize-rdf-string specifics)))) + (set gnt:has_case_info + (let ((cases + (field Datasets AboutCases))) + (if (or (null? cases) (string-blank? cases)) + "" (sanitize-rdf-string cases)))) + (set gnt:has_platform_info + (let* ((platform + (field Datasets AboutPlatform))) + (if (or (null? platform) (string-blank? platform)) + "" (sanitize-rdf-string platform)))) + (set gnt:has_data_processing_info + (let* ((processing + (field Datasets AboutDataProcessing))) + (if (or (null? processing) (string-blank? processing)) + "" (sanitize-rdf-string processing)))) + (set gnt:has_experiment_design + (let ((experiment-design + (field Datasets ExperimentDesign))) + (if (or (null? experiment-design) (string-blank? experiment-design)) + "" (sanitize-rdf-string experiment-design)))) + (set gnt:has_contributors + (let ((contributors + (field Datasets Contributors))) + (if (or (null? contributors) (string-blank? contributors)) + "" (sanitize-rdf-string contributors)))))) + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (with-documentation + (name "Datasets Metadata") + (connection %connection-settings) + (table-metadata? #f) + (prefixes + '(("dct:" "<http://purl.org/dc/terms/>") + ("dcat:" "<http://www.w3.org/ns/dcat#>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("gn-files:" "<http://files.genenetwork.org/current/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>"))) + (inputs + (list gn:dataset->metadata)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))) diff --git a/examples/gene-chip.scm b/examples/gene-chip.scm new file mode 100755 index 0000000..eec17b8 --- /dev/null +++ b/examples/gene-chip.scm @@ -0,0 +1,76 @@ +#! /usr/bin/env guile +!# + +(use-modules (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms)) + + +(define-transformer gn:platform->metadata + (tables (GeneChip + (left-join Species "USING (SpeciesId)"))) + (schema-triples + (gnc:gene_chip a skos:ConceptScheme) + (gnc:gene_chip skos:prefLabel "Gene Chip Vocabulary") + (gnc:gene_chip skos:definition "A controlled vocabulary used to describe gene chip and microarray platforms.") + (gnt:has_geo_series_id rdf:type owl:ObjectProperty) + (gnt:has_geo_series_id rdf:label "has GEO Series ID") + (gnt:has_geo_series_id rdfs:domain skos:Concept) + (gnt:has_go_tree_value a owl:ObjectProperty) + (gnt:has_go_tree_value rdfs:label "has GO tree value") + (gnt:has_go_tree_value + rdfs:comment + "Associates a gene chip concept with a Gene Ontology term used for categorization.") + (gnt:has_go_tree_value rdfs:domain skos:Concept) + (gnt:has_go_tree_value rdfs:range xsd:string)) + (triples (string->identifier "platform" (field GeneChip Name) #:separator "_") + (set rdf:type 'skos:Concept) + (set skos:inScheme (field GeneChip GeneChipName)) + (set skos:prefLabel (field GeneChip Name)) + (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)" + Title))) + (set gnt:has_go_tree_value (field GeneChip Go_tree_value)) + (set gnt:has_species + (string->identifier "" (remap-species-identifiers (field Species Fullname)))) + (set gnt:has_geo_series_id + (ontology 'geoSeries: + (string-trim-both (field GeneChip GeoPlatform)))))) + + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (with-documentation + (name "GeneChip Metadata") + (connection %connection-settings) + (table-metadata? #f) + (prefixes + '(("xsd:" "<http://www.w3.org/2001/XMLSchema#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>"))) + (inputs + (list gn:platform->metadata)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))) diff --git a/examples/genotype-datasets.scm b/examples/genotype-datasets.scm new file mode 100755 index 0000000..ebe2349 --- /dev/null +++ b/examples/genotype-datasets.scm @@ -0,0 +1,87 @@ +#! /usr/bin/env guile +!# + +(use-modules (rnrs programs) + (rnrs io ports) + (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms)) + + +(define-transformer gn:set->gn:dataset + (tables (Species + (inner-join InbredSet "ON InbredSet.SpeciesId = Species.Id") + (inner-join GenoFreeze "ON GenoFreeze.InbredSetId = InbredSet.Id")) + "WHERE GenoFreeze.public > 0 AND Species.Name != 'monkey' GROUP BY Species.Name, GenoFreeze.ShortName") + (triples (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_") + (multiset gnt:has_genotype_data + (map (cut string->identifier "dataset" <> #:separator "_") + (string-split + (field ("GROUP_CONCAT(GenoFreeze.Name SEPARATOR ',')" + dataset_name)) + #\,))))) + +(define-transformer gn:dataset->metadata + (tables (GenoFreeze + (inner-join InbredSet "ON InbredSet.Id = GenoFreeze.InbredSetId") + (inner-join Species "ON InbredSet.SpeciesId = Species.Id")) + "WHERE GenoFreeze.public > 0 AND Species.Name != 'monkey'") + (triples (string->identifier "dataset" (field GenoFreeze Name) #:separator "_") + (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")) + (set dct:created (annotate-field (field GenoFreeze CreateTime) '^^xsd:datetime)))) + +(define-transformer gn:dataset->marker/snp-count + (tables (GenoFreeze + (inner-join InbredSet "ON InbredSet.Id = GenoFreeze.InbredSetId") + (inner-join Species "ON InbredSet.SpeciesId = Species.Id") + (inner-join Geno "ON Geno.SpeciesId = Species.Id")) + "WHERE GenoFreeze.public > 0 AND Species.Name != 'monkey' GROUP BY GenoFreeze.Name") + (triples (string->identifier "dataset" (field GenoFreeze Name) #:separator "_") + (set gnt:has_marker_count + (string->symbol + (format #f "'~s'^^xsd:integer" + (field + ("COUNT(DISTINCT Geno.Marker_Name)" MarkerCount))))))) + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (with-documentation + (name "Genotype Datasets") + (connection %connection-settings) + (table-metadata? #f) + (prefixes + '(("dcat:" "<http://www.w3.org/ns/dcat#>") + ("dct:" "<http://purl.org/dc/terms/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>"))) + (inputs + (list + gn:set->gn:dataset + gn:dataset->metadata + gn:dataset->marker/snp-count)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))) diff --git a/examples/genotype.scm b/examples/genotype.scm index e2ac782..4a91b63 100755 --- a/examples/genotype.scm +++ b/examples/genotype.scm @@ -12,66 +12,22 @@ (transform sql) (transform triples) (transform special-forms)) - - -(define-transformer genotypes +(define-transformer gn:markers/snps->metadata (tables (Geno - (left-join Species "USING (SpeciesId)"))) - (schema-triples - (gnt:chr a owl:ObjectProperty) - (gnt:chr skos:description "This resource is located on a given chromosome") - (gnt:chr rdfs:domain gnc:genotype) - (gnt:mb a owl:ObjectProperty) - (gnt:mb skos:definition "The size of this resource in Mb") - (gnt:mb rdfs:domain gnc:genotype) - (gnt:mb_mm8 a owl:ObjectProperty) - (gnt:mb_mm8 skos:definition "TODO") - (gnt:mb_mm8 rdfs:domain gnc:genotype) - (gnt:mb2016 a owl:ObjectProperty) - (gnt:mb2016 skos:definition "TODO") - (gnt:mb2016 rdfs:domain gnc:genotype) - (gnt:has_sequence a owl:ObjectProperty) - (gnt:has_sequence skos:definition "This resource has a given sequence") - (gnt:has_sequence rdfs:domain gnc:genotype) - (gnt:has_source a owl:ObjectProperty) - (gnt:has_source rdfs:domain gnc:genotype) - (gnt:has_source skos:definition "This resource was obtained from this given source") - (gnt:has_alt_source_name a owl:ObjectProperty) - (gnt:has_alt_source_name rdfs:domain gnc:genotype) - (gnt:has_alt_source_name - skos:definition - "The alternative name this resource was obtained from") - (gnt:chr_num a owl:ObjectProperty) - (gnt:chr_num rdfs:domain gnc:genotype) - (gnt:chr_num skos:definition "The chromosome number for this resource")) - (triples - (string->identifier "" (field Geno Name)) - (set rdf:type 'gnc:genotype) - (set rdfs:label (sanitize-rdf-string (field Geno Name))) - (set gnt:chr (field Geno Chr)) - (set gnt:mb (annotate-field - (field ("IFNULL(Geno.Mb, '')" Mb)) '^^xsd:double)) - (set gnt:mb_mm8 (annotate-field (field ("IFNULL(Geno.Mb_mm8, '')" Mb_mm8)) - '^^xsd:double)) - (set gnt:mb2016 - (annotate-field (field ("IFNULL(Geno.Mb_2016, '')" Mb_2016)) - '^^xsd:double)) - (set gnt:has_sequence (field Geno Sequence)) - (set gnt:has_source (field Geno Source)) - ;; Only transform Source2 if it differs from Source - (set gnt:has_alt_source_name - (field ("IF((Source2 = Source), NULL, Source2)" - Source2))) + (inner-join Species "ON Geno.SpeciesId = Species.Id")) + "WHERE Species.Name != 'monkey'") + (triples (string->identifier "marker" (field Geno Name) #:separator "_") (set gnt:has_species - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "_" - #:proc string-downcase)) - (set gnt:chr_num - (annotate-field - (field Geno chr_num) - '^^xsd:int)) - (set rdfs:comments (field Geno Comments)))) + (string->identifier "" (remap-species-identifiers (field Species Fullname)))) + (set rdf:type 'gnc:dna_marker) + (set skos:prefLabel (field Geno Name)) + (set skos:altLabel (field Geno Marker_Name)) + (set gnt:chr (field Geno Chr)) + (set gnt:mb (annotate-field (field Geno Mb) '^^xsd:doubleg)) + (set gnt:sequence (field Geno Sequence)) + (set gnt:source (field Geno Source)) + (set rdfs:comment (field Geno Comments)))) @@ -87,22 +43,28 @@ (call-with-input-file settings read))) (with-documentation - (name "Genotype Metadata") + (name "Phenotypes Metadata") (connection %connection-settings) (table-metadata? #f) (prefixes - '(("dct:" "<http://purl.org/dc/terms/>") + '(("dcat:" "<http://www.w3.org/ns/dcat#>") + ("dct:" "<http://purl.org/dc/terms/>") ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnd:" "<https://cd.genenetwork.org/lmdb/v1/data/traits/>") ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") - ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") + ("qb:" "<http://purl.org/linked-data/cube#>") ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") - ("xsd:" "<http://www.w3.org/2001/XMLSchema#>"))) + ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>"))) (inputs - (list genotypes)) + (list gn:markers/snps->metadata)) (outputs `(#:documentation ,documentation #:rdf ,output)))) + diff --git a/examples/investigators.scm b/examples/investigators.scm new file mode 100755 index 0000000..8d31974 --- /dev/null +++ b/examples/investigators.scm @@ -0,0 +1,93 @@ +#! /usr/bin/env guile +!# + +(use-modules (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms)) + + +;; One email ID in the Investigators table has spaces in it. This +;; function fixes that. +(define (fix-email-id email) + (string-delete #\space email)) + +(define (investigator-attributes->id first-name last-name email) + ;; There is just one record corresponding to "Evan Williams" which + ;; does not have an email ID. To accommodate that record, we + ;; construct the investigator ID from not just the email ID, but + ;; also the first and the last names. It would be preferable to just + ;; find Evan Williams' email ID and insert it into the database. + (string->identifier "investigator" + (string-join + (list first-name last-name (fix-email-id email)) + "_") + #:separator "_")) + + +(define-transformer investigators + ;; There are a few duplicate entries. We group by email to + ;; deduplicate. + (tables (Investigators) + "GROUP BY Email") + (triples (investigator-attributes->id (field Investigators FirstName) + (field Investigators LastName) + "") + (set rdf:type 'foaf:Person) + (set foaf:name (string-append (field Investigators FirstName) " " + (field Investigators LastName))) + (set foaf:givenName + (field Investigators FirstName)) + (set foaf:familyName + (field Investigators LastName)) + (set foaf:homepage (field Investigators Url)) + (set v:adr (field Investigators Address)) + (set v:locality (field Investigators City)) + (set v:region (field Investigators State)) + (set v:postal-code (field Investigators ZipCode)) + (set v:country-name (field Investigators Country)))) + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (with-documentation + (name "Info files / Investigators Metadata") + (connection %connection-settings) + (table-metadata? #f) + (prefixes + '( + ("dcat:" "<http://www.w3.org/ns/dcat#>") + ("dct:" "<http://purl.org/dc/terms/>") + ("foaf:" "<http://xmlns.com/foaf/0.1/#term_>") + ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("taxon:" "<http://purl.uniprot.org/taxonomy/>") + ("v:" "<http://www.w3.org/2006/vcard/ns#>") + ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") + )) + (inputs + (list investigators)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))) diff --git a/examples/molecular-traits-datasets.scm b/examples/molecular-traits-datasets.scm new file mode 100755 index 0000000..34ddf3a --- /dev/null +++ b/examples/molecular-traits-datasets.scm @@ -0,0 +1,100 @@ +#! /usr/bin/env guile +!# + +(use-modules (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms)) + + +(define-transformer gn:molecular-trait->gn:dataset + (tables (Tissue)) + (triples (string->identifier "trait" (field Tissue Short_Name) #:separator "_") + (set rdf:type 'gnc:molecular_trait) + (set skos:prefLabel (field Tissue Name)) + (set skos:altLabel (field Tissue Short_Name)))) + +(define-transformer gnc:molecular_trait->gn:molecular_trait + (tables (Tissue)) + (triples "gnc:molecular_trait" + (set skos:member (string->identifier "trait" (field Tissue Short_Name) #:separator "_")))) + +(define-transformer gn:set->gn:dataset + (tables (Species + (inner-join InbredSet "ON InbredSet.SpeciesId = Species.Id") + (inner-join ProbeFreeze "ON ProbeFreeze.InbredSetId = InbredSet.Id") + (inner-join ProbeSetFreeze "ON ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id") + (inner-join Tissue "ON ProbeFreeze.TissueId = Tissue.Id")) + "WHERE ProbeSetFreeze.public > 0 AND Species.Name != 'monkey' GROUP BY Species.Name, Tissue.Short_Name") + (triples (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_") + (multiset gnt:has_probeset_data + (map (cut string->identifier "dataset" <> #:separator "_") + (string-split + (field ("GROUP_CONCAT(ProbeSetFreeze.Name SEPARATOR ',')" + dataset_name)) + #\,))))) + +(define-transformer gn:dataset->metadata + (tables (ProbeSetFreeze + (inner-join ProbeFreeze "ON ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id") + (inner-join InbredSet "ON InbredSet.Id = ProbeFreeze.InbredSetId") + (inner-join Species "ON InbredSet.SpeciesId = Species.Id") + (inner-join Tissue "ON ProbeFreeze.TissueId = Tissue.Id") + (inner-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID") + (inner-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name") + (inner-join Datasets "ON InfoFiles.DatasetId = Datasets.DatasetId") + (left-join GeneChip "ON GeneChip.Id = InfoFiles.GeneChipId")) + "WHERE ProbeSetFreeze.public > 0 AND Species.Name != 'monkey'") + (triples (string->identifier "dataset" (field ProbeSetFreeze Name) #:separator "_") + (set dct:created (annotate-field (field ProbeSetFreeze CreateTime) '^^xsd:datetime)) + (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")) + (set gnt:uses_normalization_method + (string->identifier "avg_method" (field AvgMethod Name AvgMethodName) #:separator "_")) + (set gnt:has_molecular_trait + (string->identifier "trait" (field Tissue Short_Name) #:separator "_")) + (set gnt:uses_genechip + (string->identifier "platform" (field GeneChip Name) #:separator "_")))) + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (with-documentation + (name "Molecular Trait Datasets") + (connection %connection-settings) + (table-metadata? #f) + (prefixes + '(("dcat:" "<http://www.w3.org/ns/dcat#>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("obo:" "<http://purl.obolibrary.org/obo/>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") + ("dct:" "<http://purl.org/dc/terms/>") + ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>"))) + (inputs + (list + gn:dataset->metadata + gn:molecular-trait->gn:dataset + gn:set->gn:dataset + gnc:molecular_trait->gn:molecular_trait)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))) diff --git a/examples/molecular-traits.scm b/examples/molecular-traits.scm deleted file mode 100755 index 737c0b0..0000000 --- a/examples/molecular-traits.scm +++ /dev/null @@ -1,63 +0,0 @@ -#! /usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 getopt-long) - (ice-9 match) - (ice-9 regex) - (transform strings) - (transform sql) - (transform triples) - (transform special-forms)) - - -(define-transformer tissue - ;; The Name and TissueName fields seem to be identical. BIRN_lex_ID - ;; and BIRN_lex_Name are mostly NULL. - (tables (Tissue)) - (schema-triples - (gnc:tissue a owl:Class) - (gnc:tissue rdfs:subClassOf obo:UBERON_0000479) ; Anatomical Entity - (gnc:tissue rdfs:label "Tissue (GN)") - (gnt:has_tissue rdf:type owl:ObjectProperty) - (gnt:has_tissue rdfs:domain gnc:molecular_traits) - (gnt:has_tissue rdfs:range gnc:tissue) - (gnt:has_tissue rdfs:label "has tissue")) - (triples (string->identifier "tissue" (field Tissue Short_Name) - #:separator "_") - (set rdf:type 'gnc:tissue) - (set rdfs:label (field Tissue Name)))) - - - -(let* ((option-spec - '((settings (single-char #\s) (value #t)) - (output (single-char #\o) (value #t)) - (documentation (single-char #\d) (value #t)))) - (options (getopt-long (command-line) option-spec)) - (settings (option-ref options 'settings #f)) - (output (option-ref options 'output #f)) - (documentation (option-ref options 'documentation #f)) - (%connection-settings - (call-with-input-file settings - read))) - (with-documentation - (name "Tissue Metadata") - (connection %connection-settings) - (table-metadata? #f) - (prefixes - '(("gn:" "<http://rdf.genenetwork.org/v1/id/>") - ("obo:" "<http://purl.obolibrary.org/obo/>") - ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") - ("skos:" "<http://www.w3.org/2004/02/skos/core#>") - ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") - ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") - ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>"))) - (inputs - (list tissue)) - (outputs - `(#:documentation ,documentation - #:rdf ,output)))) -;; http://purl.obolibrary.org/obo/UBERON_0000479 diff --git a/examples/ontology.scm b/examples/ontology.scm new file mode 100755 index 0000000..724a75a --- /dev/null +++ b/examples/ontology.scm @@ -0,0 +1,272 @@ +#! /usr/bin/env guile +!# + +(use-modules (ice-9 getopt-long) + (transform triples) + (transform schema) + (transform special-forms)) + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings (call-with-input-file settings read))) + (with-output-to-file output + (lambda () + ;; Define all GN ontology in one place. + (prefix "dcat:" "<http://www.w3.org/ns/dcat#>") + (prefix "dct:" "<http://purl.org/dc/terms/>") + (prefix "gn:" "<http://rdf.genenetwork.org/v1/id/>") + (prefix "owl:" "<http://www.w3.org/2002/07/owl#>") + (prefix "gnc:" "<http://rdf.genenetwork.org/v1/category/>") + (prefix "gnt:" "<http://rdf.genenetwork.org/v1/term/>") + (prefix "obo:" "<http://purl.obolibrary.org/obo/>") + (prefix "sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>") + (prefix "skos:" "<http://www.w3.org/2004/02/skos/core#>") + (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>") + (prefix "qb:" "<http://purl.org/linked-data/cube#>") + (prefix "xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>") + (prefix "schema:" "<https://schema.org/>") + (newline) + (triple 'gnc:population_category 'a 'xkos:ClassificationLevel) + (triple 'gnc:population_category 'rdfs:label "Population Category") + (triple 'gnc:population_category 'skos:inScheme 'gnc:resource_classification_scheme) + (triple 'gnc:population_category 'skos:prefLabel "Population Category") + (triple 'gnc:population_category 'xkos:depth "3") + (triple 'gnc:population_category 'xkos:nextLevel 'gnc:set) + (triple 'gnc:population_category 'xkos:previousLevel 'gnc:species) + (triple 'gnc:reference_population 'a 'skos:Concept) + (triple 'gnc:reference_population 'skos:definition "A genetic population") + (triple 'gnc:reference_population 'skos:inScheme 'gnc:population_category) + (triple 'gnc:reference_population 'skos:prefLabel "Reference population") + (triple 'gnc:resource_classification_scheme 'a 'skos:ConceptScheme) + (triple 'gnc:resource_classification_scheme 'skos:definition "A hierarchical classification scheme for organizing GeneNetwork resources by dataset type, resource set (inbredset group), or species.") + (triple 'gnc:resource_classification_scheme 'skos:prefLabel "GeneNetwork Resource Classification Scheme") + (triple 'gnc:resource_classification_scheme 'xkos:levels 'gnc:population_category) + (triple 'gnc:resource_classification_scheme 'xkos:levels 'gnc:set) + (triple 'gnc:resource_classification_scheme 'xkos:levels 'gnc:species) + (triple 'gnc:resource_classification_scheme 'xkos:levels 'gnc:taxonomic_family) + (triple 'gnc:resource_classification_scheme 'xkos:numberOfLevels "4") + (triple 'gnc:set 'a 'xkos:ClassificationLevel) + (triple 'gnc:set 'skos:definition "A category representing groups of genetically related strains or individuals (inbred sets, recombinant inbred lines, etc.).") + (triple 'gnc:set 'skos:inScheme 'gnc:resource_classification_scheme) + (triple 'gnc:set 'skos:prefLabel "InbredSet Group") + (triple 'gnc:set 'xkos:depth "4") + (triple 'gnc:set 'xkos:previousLevel 'gnc:population_category) + (triple 'gnc:species 'a 'xkos:ClassificationLevel) + (triple 'gnc:species 'skos:definition "A classification level that that associates a given resource to a species in GeneNetwork.") + (triple 'gnc:species 'skos:inScheme 'gnc:resource_classification_scheme) + (triple 'gnc:species 'skos:prefLabel "Species") + (triple 'gnc:species 'xkos:depth "2") + (triple 'gnc:species 'xkos:nextLevel 'gnc:population_category) + (triple 'gnc:species 'xkos:previousLevel 'gnc:taxonomic_family) + (triple 'gnc:taxonomic_family 'a 'xkos:ClassificationLevel) + (triple 'gnc:taxonomic_family 'skos:definition "An organizational classification level used in GeneNetwork to group resources into families.") + (triple 'gnc:taxonomic_family 'skos:inScheme 'gnc:resource_classification_scheme) + (triple 'gnc:taxonomic_family 'skos:prefLabel "Family") + (triple 'gnc:taxonomic_family 'xkos:depth "1") + (triple 'gnc:taxonomic_family 'xkos:nextLevel 'gnc:species) + (triple 'gnt:assigned_species 'a 'owl:ObjectProperty) + (triple 'gnt:assigned_species 'rdfs:domain 'gnc:set) + (triple 'gnt:assigned_species 'rdfs:label "These families have been assigned to these species") + (triple 'gnt:genetic_type 'a 'owl:DatatypeProperty) + (triple 'gnt:genetic_type 'rdfs:domain 'gnc:set) + (triple 'gnt:genetic_type 'rdfs:label "has genetic type") + (triple 'gnt:genetic_type 'rdfs:range 'xsd:string) + (triple 'gnt:genetic_type 'skos:definition "Describes the genetic architecture of a resource set (e.g., intercross, riset).") + (triple 'gnt:has_family_order_id 'a 'owl:DatatypeProperty) + (triple 'gnt:has_family_order_id 'a 'owl:DatatypeProperty) + (triple 'gnt:has_family_order_id 'rdfs:range 'xsd:integer) + (triple 'gnt:has_set_code 'a 'owl:DatatypeProperty) + (triple 'gnt:has_set_code 'rdfs:domain 'gnc:set) + (triple 'gnt:has_set_code 'rdfs:label "has set code") + (triple 'gnt:has_set_code 'rdfs:range 'xsd:string) + (triple 'gnt:has_set_code 'skos:definition "Provides a unique identifier code for a resource set.") + (triple 'gnt:has_species 'a 'owl:ObjectProperty) + (triple 'gnt:has_species 'rdf:comment "This resource belongs to this species") + (triple 'gnt:has_species 'rdfs:label "belongs to species") + (triple 'gnt:has_species 'rdfs:range 'gnc:species) + (triple 'gnt:has_strain 'a 'owl:ObjectProperty) + (triple 'gnt:has_strain 'rdf:comment "Indicates the group the resources belongs to") + (triple 'gnt:has_strain 'rdfs:domain 'gnc:species) + (triple 'gnt:has_strain 'rdfs:label "this resource belongs to this strain.") + (triple 'gnt:has_strain 'rdfs:range 'gnc:set) + (triple 'gnt:has_strain 'schema:domainIncludes 'dcat:Dataset) + (triple 'gnt:has_strain 'schema:domainIncludes 'gnc:species) + (triple 'gnt:has_strain 'skos:definition "Lists all strains that belong to this resource.") + (triple 'gnt:has_taxonomic_family 'a 'owl:ObjectProperty) + (triple 'gnt:has_taxonomic_family 'rdfs:label "has family") + (triple 'gnt:has_taxonomic_family 'schema:domainIncludes 'gnc:set) + (triple 'gnt:has_taxonomic_family 'schema:domainIncludes 'gnc:species) + (triple 'gnt:has_taxonomic_family 'skos:definition "Links a species to its taxonomic family") + (triple 'gnt:has_uniprot_taxon_id 'a 'owl:ObjectProperty) + (triple 'gnt:has_uniprot_taxon_id 'rdfs:label "has uniprot taxonomic id") + (triple 'gnt:population_category 'skos:definition "Classification of genetic populations by breeding design and data aggregation.") + (triple 'gnt:short_name 'a 'owl:DatatypeProperty) + (triple 'gnt:short_name 'rdfs:domain 'gnc:species) + (triple 'gnt:short_name 'rdfs:label "has short name") + (triple 'gnt:short_name 'skos:definition "The short name of a given resource") + (triple 'gnt:uses_mapping_method 'a 'owl:ObjectProperty) + (triple 'gnt:uses_mapping_method 'rdfs:comment "The method used to map genetic or experimental data for this resource.") + (triple 'gnt:uses_mapping_method 'rdfs:domain 'gnc:set) + (triple 'gnt:uses_mapping_method 'rdfs:label "mapping method") + (triple 'gnt:uses_mapping_method 'rdfs:range 'gnc:mapping_method) + + ;; Describing Datasets + (triple 'gnc:molecular_trait 'a 'owl:Class) + (triple 'gnc:molecular_trait 'a 'skos:Concept) + (triple 'gnc:molecular_trait 'rdfs:label "Molecular Trait. This describes a melecular trait of a given species. We combine the species name and the tissue name in order to differentiate the traits across different inbredset groups.") + (triple 'gnc:molecular_trait 'rdfs:subClassOf 'obo:UBERON_0000479) + (triple 'gnt:has_case_info 'a 'owl:ObjectProperty) + (triple 'gnt:has_case_info 'rdfs:comment "Information about the cases used in this platform") + (triple 'gnt:has_case_info 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_case_info 'rdfs:label "About Case") + (triple 'gnt:has_citation 'a 'owl:ObjectProperty) + (triple 'gnt:has_citation 'rdfs:comment "Citation for this dataset") + (triple 'gnt:has_citation 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_citation 'rdfs:label "Citation") + (triple 'gnt:has_contributors 'a 'owl:ObjectProperty) + (triple 'gnt:has_contributors 'rdfs:comment "Contributors of this resource") + (triple 'gnt:has_contributors 'rdfs:comment "Contributors of this resource") + (triple 'gnt:has_contributors 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_contributors 'rdfs:label "Contributors") + (triple 'gnt:has_data_processing_info 'a 'owl:ObjectProperty) + (triple 'gnt:has_data_processing_info 'rdfs:comment "Information about how this dataset was processed") + (triple 'gnt:has_data_processing_info 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_data_processing_info 'rdfs:label "About Data Processing") + (triple 'gnt:has_experiment_design 'a 'owl:ObjectProperty) + (triple 'gnt:has_experiment_design 'rdfs:comment "Experiment Design for this resource") + (triple 'gnt:has_experiment_design 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_experiment_design 'rdfs:label "Experiment Design") + (triple 'gnt:has_experiment_design_info 'a 'owl:ObjectProperty) + (triple 'gnt:has_experiment_design_info 'rdfs:comment "Information about how the experiment was designed") + (triple 'gnt:has_experiment_design_info 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_experiment_design_info 'rdfs:label "Experiment Design") + (triple 'gnt:has_experiment_type 'a 'owl:ObjectProperty) + (triple 'gnt:has_experiment_type 'rdfs:comment "Information about the experiment type") + (triple 'gnt:has_experiment_type 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_experiment_type 'rdfs:label "Experiment Type Metadata") + (triple 'gnt:has_molecular_trait 'rdf:type 'owl:ObjectProperty) + (triple 'gnt:has_molecular_trait 'rdfs:domain 'gnc:set) + (triple 'gnt:has_molecular_trait 'rdfs:label "has molecular trait") + (triple 'gnt:has_molecular_trait 'rdfs:range 'gnc:molecular_trait) + (triple 'gnt:has_phenotype_data 'rdf:type 'owl:ObjectProperty) + (triple 'gnt:has_phenotype_data 'rdfs:comment "Associates a resource with its phenotype data.") + (triple 'gnt:has_phenotype_data 'rdfs:domain 'gnc:set) + (triple 'gnt:has_phenotype_data 'rdfs:label "this resources has this phenotype data.") + (triple 'gnt:has_phenotype_data 'rdfs:range 'dcat:Dataset) + (triple 'gnt:has_phenotype_data 'rdfs:subPropertyOf 'dct:relation) + (triple 'gnt:has_platform_info 'a 'owl:ObjectProperty) + (triple 'gnt:has_platform_info 'rdfs:comment "Information about the platform that was used with this dataset") + (triple 'gnt:has_platform_info 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_platform_info 'rdfs:label "About Platform") + (triple 'gnt:has_probeset_data 'rdf:type 'owl:ObjectProperty) + (triple 'gnt:has_probeset_data 'rdfs:comment "Associates a resource with this probeset data.") + (triple 'gnt:has_probeset_data 'rdfs:domain 'gnc:set) + (triple 'gnt:has_probeset_data 'rdfs:label "this resources has this probeset data.") + (triple 'gnt:has_probeset_data 'rdfs:range 'gnc:molecular_trait) + (triple 'gnt:has_probeset_data 'rdfs:subPropertyOf 'dct:relation) + (triple 'gnt:has_samples 'a 'owl:ObjectProperty) + (triple 'gnt:has_samples 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_samples 'rdfs:label "Samples") + (triple 'gnt:has_specifics 'a 'owl:ObjectProperty) + (triple 'gnt:has_specifics 'rdfs:comment "Has specifics") + (triple 'gnt:has_specifics 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_specifics 'rdfs:label "Specifics") + (triple 'gnt:has_summary 'a 'owl:ObjectProperty) + (triple 'gnt:has_summary 'rdfs:comment "Summary information about dataset") + (triple 'gnt:has_summary 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_summary 'rdfs:label "Summary") + (triple 'gnt:has_tissue_info 'a 'owl:ObjectProperty) + (triple 'gnt:has_tissue_info 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_tissue_info 'rdfs:label "Metadata about Tissue for this resource") + (triple 'gnt:uses_genechip 'a 'owl:ObjectProperty) + (triple 'gnt:uses_genechip 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:uses_genechip 'skos:definition "The Platform this resource uses..") + (triple 'gnt:uses_normalization_method 'rdfs:comment "The normalization method used for the molecular traits in this dataset") + (triple 'gnt:uses_normalization_method 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:uses_normalization_method 'rdfs:label "Averaging method used for the molecular traits in this dataset.") + (triple 'gnt:uses_normalization_method 'rdfs:range 'gnc:avg_method) + + ;; Describing phenotypes + (triple 'gnc:phenotype 'a 'owl:Class) + (triple 'gnc:phenotype 'a 'skos:Concept) + (triple 'gnc:phenotype 'rdfs:label "A phenotype.") + (triple 'gnc:phenotype_trait 'a 'owl:Class) + (triple 'gnc:phenotype_trait 'a 'skos:Concept) + (triple 'gnc:phenotype_trait 'rdfs:label "A phenotype trait.") + (triple 'gnt:abbreviation 'a 'owl:ObjectProperty) + (triple 'gnt:abbreviation 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:abbreviation 'skos:definition "The abbreviation used for this resource") + (triple 'gnt:additive 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:additive 'rdfs:range 'xsd:double) + (triple 'gnt:lab_code 'a 'owl:ObjectProperty) + (triple 'gnt:lab_code 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:locus 'a 'qb:MeasureProperty) + (triple 'gnt:locus 'a 'rdf:Property) + (triple 'gnt:locus 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:locus 'rdfs:range 'rdfs:Literal) + (triple 'gnt:locus 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:lod_score 'a 'qb:MeasureProperty) + (triple 'gnt:lod_score 'a 'rdf:Property) + (triple 'gnt:lod_score 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:lod_score 'rdfs:label "Peak -logP") + (triple 'gnt:lod_score 'rdfs:range 'xsd:double) + (triple 'gnt:lod_score 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:lod_score 'skos:definition "Statistical measurement assessing the likelihood of genetic linkage between traits or genetic markers.") + (triple 'gnt:mean 'a 'qb:MeasureProperty) + (triple 'gnt:mean 'a 'rdf:Property) + (triple 'gnt:mean 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:mean 'rdfs:range 'xsd:double) + (triple 'gnt:mean 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:sequence 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:sequence 'rdfs:range 'xsd:integer) + (triple 'gnt:submitter 'a 'owl:ObjectProperty) + (triple 'gnt:submitter 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:submitter 'skos:definition "A person who submitted this resource to GN") + (triple 'gnt:submitter 'skos:definition "A person who submitted this resource to GN") + (triple 'gnt:has_phenotype_data 'a 'owl:ObjectProperty) + (triple 'gnt:has_phenotype_data 'rdfs:domain 'gnc:set) + (triple 'gnt:has_phenotype_data 'skos:definition "This resource has phenotype data.") + + ;; Genotypes + (triple 'gnc:dna_marker 'a 'owl:Class) + (triple 'gnc:dna_marker 'a 'skos:Concept) + (triple 'gnc:dna_marker 'rdfs:label "A DNA Marker or SNP") + (triple 'gnt:has_genotype_files 'rdfs:label "This resource has these genotype files") + (triple 'gnt:has_genotype_files 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_genotype_data 'rdf:type 'owl:ObjectProperty) + (triple 'gnt:has_genotype_data 'rdfs:label "this resources has genotype data.") + (triple 'gnt:has_genotype_data 'rdfs:comment "Associates a resource with its genotype data.") + (triple 'gnt:has_genotype_data 'rdfs:domain 'gnc:set) + (triple 'gnt:has_genotype_data 'rdfs:range 'dcat:Dataset) + (triple 'gnt:has_genotype_data 'rdfs:subPropertyOf 'dct:relation) + (triple 'gnt:has_marker_count 'rdf:type 'owl:ObjectProperty) + (triple 'gnt:has_marker_count 'rdfs:label "this resources has N number of dna markers/SNPs.") + (triple 'gnt:has_marker_count 'rdfs:domain 'xsd:integer) + (triple 'gnt:has_marker_count 'rdfs:range 'dcat:Dataset) + (triple 'gnt:chr 'a 'qb:MeasureProperty) + (triple 'gnt:chr 'a 'rdf:Property) + (triple 'gnt:chr 'rdfs:label "Chromosome") + (triple 'gnt:chr 'rdfs:domain 'gnc:marker) + (triple 'gnt:chr 'rdfs:range 'rdfs:Literal) + (triple 'gnt:chr 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:mb 'rdfs:label "Megabase") + (triple 'gnt:mb 'rdfs:domain 'gnc:marker) + (triple 'gnt:mb 'rdfs:range 'rdfs:Literal) + (triple 'gnt:mb 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:sequence 'rdfs:label "Sequence") + (triple 'gnt:sequence 'rdfs:domain 'gnc:marker) + (triple 'gnt:sequence 'rdfs:range 'rdfs:Literal) + (triple 'gnt:sequence 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:source 'rdfs:label "Source") + (triple 'gnt:source 'rdfs:domain 'gnc:marker) + (triple 'gnt:source 'rdfs:range 'rdfs:Literal) + (triple 'gnt:source 'rdfs:subPropertyOf 'sdmx-measure:obsValue)))) diff --git a/examples/phenotype-datasets.scm b/examples/phenotype-datasets.scm new file mode 100755 index 0000000..4819627 --- /dev/null +++ b/examples/phenotype-datasets.scm @@ -0,0 +1,109 @@ +#! /usr/bin/env guile +!# + +(use-modules (rnrs programs) + (rnrs io ports) + (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms)) + + +(define-transformer gn:set->gn:dataset + (tables (Species + (inner-join InbredSet "ON InbredSet.SpeciesId = Species.Id") + (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id")) + "WHERE PublishFreeze.public > 0 AND Species.Name != 'monkey' GROUP BY Species.Name, PublishFreeze.ShortName") + (triples (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_") + (multiset gnt:has_phenotype_data + (map (cut string->identifier "dataset" <> #:separator "_") + (string-split + (field ("GROUP_CONCAT(PublishFreeze.Name SEPARATOR ',')" + dataset_name)) + #\,))))) + +(define-transformer gn:dataset->gn:set + (tables (Datasets + (inner-join InfoFiles "ON InfoFiles.DatasetId = Datasets.DatasetId") + (inner-join InbredSet "ON InbredSet.Id = InfoFiles.InbredSetId") + (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id")) + "WHERE PublishFreeze.public > 0 GROUP BY Datasets.DatasetId") + (triples (string->identifier "dataset" (field PublishFreeze Name) #:separator "_") + (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) + +(define-transformer gn:dataset->metadata + (tables (PublishXRef + (inner-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") + (inner-join Species "ON InbredSet.SpeciesId = Species.Id") + (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id") + (inner-join Publication "ON Publication.Id = PublishXRef.PublicationId") + (inner-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId")) + "WHERE InbredSet.public > 0 GROUP BY Species.Name, PublishFreeze.Name") + (triples (string->identifier "dataset" (field PublishFreeze Name) #:separator "_") + (set dct:created (annotate-field (field PublishFreeze CreateTime) '^^xsd:datetime)) + (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) + +(define-transformer gn:dataset->gn:trait + (tables (PublishXRef + (inner-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") + (inner-join Species "ON InbredSet.SpeciesId = Species.Id") + (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id") + (inner-join Publication "ON Publication.Id = PublishXRef.PublicationId") + (inner-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId")) + "WHERE InbredSet.public > 0") + (triples (string->identifier "dataset" (field PublishFreeze Name) #:separator "_") + (set gnt:has_phenotype_trait + (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) + (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) + (post-desc (blank-p (field Phenotype Post_publication_description))) + (pre-desc (blank-p (field Phenotype Post_publication_description)))) + (string->identifier + "trait" + (format #f "~a_~a" (field PublishFreeze Name) + (or post-abbrev pre-abbrev post-desc pre-desc)) + #:separator "_"))) + (set dct:created (annotate-field (field PublishFreeze CreateTime) '^^xsd:datetime)) + (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (with-documentation + (name "Phenotype Datasets") + (connection %connection-settings) + (table-metadata? #f) + (prefixes + '(("dcat:" "<http://www.w3.org/ns/dcat#>") + ("dct:" "<http://purl.org/dc/terms/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>"))) + (inputs + (list + gn:set->gn:dataset + gn:dataset->gn:set + gn:dataset->metadata + gn:dataset->gn:trait)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))) diff --git a/examples/phenotype.scm b/examples/phenotype.scm index 03eec45..834ff5a 100755 --- a/examples/phenotype.scm +++ b/examples/phenotype.scm @@ -14,92 +14,112 @@ (transform special-forms)) -(define-transformer phenotypes - (tables (PublishXRef - (left-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") - (left-join Publication "ON Publication.Id = PublishXRef.PublicationId") - (left-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId"))) - (schema-triples - (gnt:trait_id a owl:ObjectProperty) - (gnt:trait_id rdfs:domain gnc:phenotype) - (gnt:trait_id skos:definition "This is the unique trait id assigned from GeneNetwork") - (gnt:abbreviation a owl:ObjectProperty) - (gnt:abbreviation rdfs:domain gnc:phenotype) - (gnt:abbreviation skos:definition "The abbreviation used for this resource") - (gnt:labCode a owl:ObjectProperty) - (gnt:labCode rdfs:domain gnc:phenotype) - (gnt:submitter a owl:ObjectProperty) - (gnt:submitter rdfs:domain gnc:phenotype) - (gnt:submitter skos:definition "A person who submitted this resource to GN") - (gnt:mean a rdf:Property) - (gnt:mean a qb:MeasureProperty) - (gnt:mean rdfs:subPropertyOf sdmx-measure:obsValue) - (gnt:mean rdfs:domain gnc:phenotype) - (gnt:mean rdfs:range xsd:double) - (gnt:lod_score a rdf:Property) - (gnt:lod_score a qb:MeasureProperty) - (gnt:lod_score rdfs:subPropertyOf sdmx-measure:obsValue) - (gnt:lod_score rdfs:domain gnc:phenotype) - (gnt:lod_score rdfs:range xsd:double) - (gnt:lod_score rdfs:label "Peak -logP") - (gnt:lod_score skos:definition "Statistical measurement assessing the likelihood of genetic linkage between traits or genetic markers.") - (gnt:locus a rdf:Property) - (gnt:locus a qb:MeasureProperty) - (gnt:locus rdfs:subPropertyOf sdmx-measure:obsValue) - (gnt:locus rdfs:domain gnc:phenotype) - (gnt:locus rdfs:range rdfs:Literal) - (gnt:additive rdfs:domain gnc:phenotype) - (gnt:additive rdfs:range xsd:double) - (gnt:sequence rdfs:domain gnc:phenotype) - (gnt:sequence rdfs:range xsd:integer)) - (triples (string->identifier - "trait" - (field ("CONCAT(IFNULL(InbredSet.InbredSetCode, PublishXRef.InbredSetId), '_', PublishXRef.Id)" - Phenotype)) - #:separator "_") + + + + + + + +(define-transformer gnc:phenotype->gn:phenotype + (tables (Phenotype)) + (triples "gnc:phenotype" + (set skos:member + (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) + (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) + (post-desc (blank-p (field Phenotype Post_publication_description))) + (pre-desc (blank-p (field Phenotype Post_publication_description)))) + (string->identifier + "phenotype" + (or post-abbrev pre-abbrev post-desc pre-desc) + #:separator "_"))))) + +(define-transformer gn:phenotype->metadata + (tables (Phenotype)) + (triples (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) + (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) + (post-desc (blank-p (field Phenotype Post_publication_description))) + (pre-desc (blank-p (field Phenotype Post_publication_description)))) + (string->identifier + "phenotype" + (or post-abbrev pre-abbrev post-desc pre-desc) + #:separator "_")) (set rdf:type 'gnc:phenotype) - (set gnt:has_strain - (string->identifier - "set" (field InbredSet Name InbredSetName) - #:separator "_")) - ;; This is the trait's name - (set gnt:trait_id - (let ((trait-id (field PublishXRef Id))) - (if (number? trait-id) - (number->string trait-id) - trait-id))) - (set skos:altLabel - (field ("CONCAT(IFNULL(InbredSet.InbredSetCode, PublishXRef.InbredSetId), '_', PublishXRef.Id)" - Phenotype))) ;; All phenotypes have a post-publication description (set dct:description (sanitize-rdf-string (field Phenotype Post_publication_description))) ;; All phenotypes have a post-publication abbreviation - (set gnt:abbreviation (field Phenotype Post_publication_abbreviation)) - (set gnt:labCode (field Phenotype Lab_code)) + (set gnt:abbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation))) + (set gnt:has_lab_code (field Phenotype Lab_code)) (set gnt:submitter (sanitize-rdf-string (field Phenotype Submitter))) (set dct:contributor (sanitize-rdf-string (field Phenotype Owner))) + (set skos:member + (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) + (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) + (post-desc (blank-p (field Phenotype Post_publication_description))) + (pre-desc (blank-p (field Phenotype Post_publication_description)))) + (string->identifier + "phenotype" + (or post-abbrev pre-abbrev post-desc pre-desc) + #:separator "_"))))) + +(define-transformer gn:trait->gn:phenotype + (tables (PublishXRef + (left-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") + (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id") + (left-join Publication "ON Publication.Id = PublishXRef.PublicationId") + (left-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId")) + "WHERE InbredSet.public > 0") + (triples (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) + (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) + (post-desc (blank-p (field Phenotype Post_publication_description))) + (pre-desc (blank-p (field Phenotype Post_publication_description)))) + (string->identifier + "trait" + (format #f "~a_~a" (field PublishFreeze Name) + (or post-abbrev pre-abbrev post-desc pre-desc)) + #:separator "_")) + (set rdf:type 'gnc:phenotype_trait) + (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")) + (set owl:equivalentClass + (field ("CONCAT(IFNULL(InbredSet.InbredSetCode, PublishXRef.InbredSetId), '_', PublishXRef.Id)" + Phenotype))) + (set dcat:distribution + (string->symbol + (format #f "gnd:~a" + (field ("CONCAT(PublishFreeze.Name, '_', PublishXRef.Id)" + Phenotype)))) ) + (set dct:references + (let ((pmid (field + ("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))" + pmid))) + (publication-id (field Publication Id))) + (if (string-null? pmid) + (string->identifier "unpublished" + (number->string publication-id)) + (ontology 'pubmed: pmid)))) + (set gnt:has_phenotype + (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) + (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) + (post-desc (blank-p (field Phenotype Post_publication_description))) + (pre-desc (blank-p (field Phenotype Post_publication_description)))) + (string->identifier + "phenotype" + (or post-abbrev pre-abbrev post-desc pre-desc) + #:separator "_"))) (set gnt:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean)) '^^xsd:double)) (set gnt:locus (sanitize-rdf-string (field PublishXRef Locus))) (set gnt:lod_score (annotate-field - (field ("IFNULL((PublishXRef.LRS/4.604), '')" lrs)) - '^^xsd:double)) + (field ("IFNULL((PublishXRef.LRS/4.604), '')" lrs)) + '^^xsd:double)) (set gnt:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:double)) (set gnt:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:integer)) - (set dct:isReferencedBy - (let ((pmid (field - ("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))" - pmid))) - (publication-id (field Publication Id PublicationId))) - (if (string-null? pmid) - (string->identifier "unpublished" - (number->string publication-id)) - (ontology 'pubmed: pmid)))))) + (set rdfs:comment (sanitize-rdf-string (field PublishXRef comments))))) @@ -119,10 +139,12 @@ (connection %connection-settings) (table-metadata? #f) (prefixes - '(("dct:" "<http://purl.org/dc/terms/>") + '(("dcat:" "<http://www.w3.org/ns/dcat#>") + ("dct:" "<http://purl.org/dc/terms/>") ("gn:" "<http://rdf.genenetwork.org/v1/id/>") ("owl:" "<http://www.w3.org/2002/07/owl#>") ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnd:" "<https://cd.genenetwork.org/lmdb/v1/data/traits/>") ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") @@ -133,8 +155,9 @@ ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>"))) (inputs - (list - phenotypes)) + (list gnc:phenotype->gn:phenotype + gn:phenotype->metadata + gn:trait->gn:phenotype)) (outputs `(#:documentation ,documentation #:rdf ,output)))) diff --git a/examples/publication.scm b/examples/publication.scm index 6b57856..c411af6 100755 --- a/examples/publication.scm +++ b/examples/publication.scm @@ -13,7 +13,7 @@ -(define-transformer publication +(define-transformer publication->metadata (tables (Publication)) (triples (let ((pmid (field @@ -81,7 +81,7 @@ ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>"))) (inputs - (list publication)) + (list publication->metadata)) (outputs `(#:documentation ,documentation #:rdf ,output)))) diff --git a/examples/schema.scm b/examples/schema.scm index 4bde895..c4ff082 100755 --- a/examples/schema.scm +++ b/examples/schema.scm @@ -8,18 +8,7 @@ (transform sql) (transform table)) -(define (call-with-genenetwork-database connection-settings proc) - (call-with-database "mysql" (string-join - (list (assq-ref connection-settings 'sql-username) - (assq-ref connection-settings 'sql-password) - (assq-ref connection-settings 'sql-database) - "tcp" - (assq-ref connection-settings 'sql-host) - (number->string - (assq-ref connection-settings 'sql-port))) - ":") - proc)) - + (define (transform-table-schema connection-settings db) (let ((tables (tables connection-settings db))) (for-each (lambda (table) @@ -44,7 +33,7 @@ (table-columns table)))) tables))) - + (let* ((option-spec '((settings (single-char #\s) (value #t)) (output (single-char #\o) (value #t)) @@ -54,7 +43,7 @@ (output (option-ref options 'output #f)) (documentation (option-ref options 'documentation #f)) (%connection-settings (call-with-input-file settings read))) - (call-with-genenetwork-database + (call-with-target-database %connection-settings (lambda (db) (with-output-to-file output |
