diff options
35 files changed, 1918 insertions, 1895 deletions
diff --git a/README.md b/README.md index 246e6d6..c8efad2 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,12 @@ guile -s examples/phenotype.scm \ which does the same thing, but has the potential to be confusing due to the two `-s` options: the first `-s` option is to guile while the second is to the script itself. +There's an extra script that loops through all the scheme files in examples and runs them. To run it: + +```sh +./generate-ttl-files.scm -s conn.scm -o <ttl-output-directory> -d <docs-output-directory> +``` + ## Validate and load dump Then, validate the dumped RDF using `rapper`: diff --git a/examples/classification.scm b/examples/classification.scm index 3024af6..130bec8 100755 --- a/examples/classification.scm +++ b/examples/classification.scm @@ -13,118 +13,126 @@ -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" - (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) - ;; Classification Scheme -(define-transformer classification-scheme-species - (tables (Species)) - (schema-triples - (gnc:ResourceClassificationScheme a skos:ConceptScheme) - (gnc:ResourceClassificationScheme skos:prefLabel "GeneNetwork Classification Scheme For Resources") - (gnc:ResourceClassificationScheme xkos:numberOfLevels "3") - (gnc:ResourceClassificationScheme xkos:levels "( gnc:DatasetType gnc:Set gnc:Species )") - (gnc:DatasetType a xkos:ClassificationLevel) - (gnc:DatasetType skos:prefLabel "The Type of a Dataset which can be a ProbeSet, Genotype, or Phenotype") - (gnc:DatasetType xkos:depth "1") - (gnc:DatasetType skos:member gnc:Probeset) - (gnc:DatasetType skos:member gnc:Genotype) - (gnc:DatasetType skos:member gnc:Phenotype) - (gnc:Probeset skos:prefLabel "mRNA 
Assay Datasets") - (gnc:Probeset skos:altLabel "ProbeSet") - (gnc:Genotype skos:prefLabel "Genotype") - (gnc:Genotype skos:altLabel "DNA Markers and SNPs") - (gnc:Phenotype skos:prefLabel "Phenotype") - (gnc:Phenotype skos:altLabel "Traits and Cofactors") - (gnc:Species a xkos:ClassificationLevel) - (gnc:Species skos:prefLabel "The species in which this resource belongs") - (gnc:Species xkos:depth "3") - (gnc:Species xkos:specializes gnc:Set)) - (triples "gnc:Species" +(define-transformer gnc:species->gn:species + (tables (Species) + "WHERE Name != 'monkey'") + (triples "gnc:species" (set skos:member - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)))) + (string->identifier "" (remap-species-identifiers (field Species Fullname)))))) -(define-transformer classification-scheme-set - (tables (InbredSet)) - (schema-triples - (gnc:Set a xkos:ClassificationLevel) - (gnc:Set skos:prefLabel "The Type of Set, Ie InbredSet/OutbredSet that a resource can belong to") - (gnc:Set xkos:depth "2") - (gnc:Set xkos:generalizes gnc:Species)) - (triples "gnc:Set" +(define-transformer gnc:set->gn:set + (tables (InbredSet) + "WHERE public > 0 AND FullName NOT LIKE '%monkey%'") + (triples "gnc:set" (set skos:member (string->identifier - "set" (field InbredSet Name InbredSetName) - #:separator "" - #:proc string-capitalize-first)))) + "set" (field InbredSet Name InbredSetName) #:separator "_")))) -(define-transformer species - (tables (Species)) - (schema-triples - (gnt:family a owl:ObjectProperty) - (gnt:family rdfs:domain gnc:Species) - (gnt:family skos:definition "This resource belongs to this family") - (gnt:shortName a owl:ObjectProperty) - (gnt:shortName rdfs:domain gnc:Species) - (gnt:shortName skos:definition "The short name of a given resource") - (gnt:belongsToSpecies a rdf:property) - (gnt:belongsToSpecies rdf:comment "This resource given to this species") - (gnt:belongsToSpecies rdf:label 
"belongsToSpecies")) +(define-transformer gnc:species->metadata + (tables (Species) + "WHERE Name != 'monkey'") (triples - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first) - (set skos:inScheme 'gnc:ResourceClassificationScheme) + (string->identifier "" (remap-species-identifiers (field Species Fullname))) + (set rdf:type 'gnc:species) (set rdfs:label (remap-species-identifiers (field Species Fullname))) (set skos:prefLabel (field Species MenuName)) (set skos:altLabel (field Species SpeciesName)) - (set gnt:shortName (field Species Name)) - (set gnt:family (field Species Family)) - (set skos:notation (ontology - 'taxon: - (field Species TaxonomyId))))) + (set gnt:short_name (field Species Name)) + (set gnt:has_taxonomic_family (string->identifier "family" (field Species Family) #:separator "_")) + (set gnt:has_uniprot_taxon_id (ontology + 'taxon: + (field Species TaxonomyId))))) + +(define-transformer gnc:species->gn:set + (tables (InbredSet + (left-join Species "ON InbredSet.SpeciesId=Species.Id")) + "WHERE public > 0 AND Species.Name != 'monkey'") + (triples (string->identifier "" (remap-species-identifiers (field Species Fullname))) + (set gnt:has_strain + (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) + +(define-transformer gn:family->gn:species/metadata + (tables (Species) + "WHERE Name != 'monkey' GROUP BY FAMILY") + (triples (string->identifier "family" (field Species Family) #:separator "_") + (set gnt:has_species + (string->identifier "" (remap-species-identifiers (field Species Fullname)))) + (set rdfs:label (field Species Family)) + (set gnt:has_family_order_id + (annotate-field (field Species OrderId) + '^^xsd:integer)))) + +(define-transformer gn:family->gn:species + (tables (Species) + "WHERE Name != 'monkey'") + (triples (string->identifier "family" (field Species Family) #:separator "_") + (set gnt:has_species + (string->identifier "" 
(remap-species-identifiers (field Species Fullname)))))) -(define-transformer inbred-set + +(define-transformer gn:set->metadata (tables (InbredSet (left-join Species "ON InbredSet.SpeciesId=Species.Id") (left-join MappingMethod - "ON InbredSet.MappingMethodId=MappingMethod.Id"))) - (schema-triples - (gnt:geneticType a owl:ObjectProperty) - (gnt:geneticType rdfs:domain gnc:set) - (gnt:code a owl:ObjectProperty) - (gnt:code rdfs:domain gnc:set) - ;; Already defined as an owl prop in species - (gnt:family rdfs:domain gnc:Set) - (gnt:mappingMethod a owl:ObjectProperty) - (gnt:mappingMethod rdfs:domain gnc:set) - (gnt:belongsToGroup a rdf:property) - (gnt:belongsToGroup rdf:comment "This resource given to this group") - (gnt:belongsToGroup rdf:label "belongsToGroup")) - (triples (string->identifier - "set" (field InbredSet Name InbredSetName) - #:separator "" - #:proc string-capitalize-first) - (set skos:inScheme 'gnc:ResourceClassificationScheme) + "ON InbredSet.MappingMethodId=MappingMethod.Id")) + "WHERE public > 0 AND Species.Name != 'monkey'") + (triples (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_") + (set rdf:type 'gnc:set) (set rdfs:label (field InbredSet FullName)) (set skos:prefLabel (field InbredSet Name InbredSetName)) - (set gnt:geneticType (field InbredSet GeneticType)) - (set gnt:family (field InbredSet Family)) - (set gnt:mappingMethod (field MappingMethod Name)) - (set gnt:code (field InbredSet InbredSetCode)) - (set xkos:generalizes - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)))) + (set gnt:genetic_type (field InbredSet GeneticType)) + (set dct:description (annotate-field (sanitize-rdf-string (field InbredSet description)) + '^^rdf:HTML)) + (set gnt:uses_mapping_method + (string->identifier "mapping_method" (field MappingMethod Name) #:separator "_")) + (set gnt:has_set_code (field InbredSet InbredSetCode)) + (set gnt:has_species + 
(string->identifier "" (remap-species-identifiers (field Species Fullname)))))) + +(define-transformer gn:set->gn:population + (tables (InbredSet) + "WHERE Family IS NOT NULL AND FullName NOT LIKE '%monkey%'") + (schema-triples + (gnt:has_reference_population rdfs:domain gnc:set) + (gnt:has_reference_population a owl:ObjectProperty) + (gnt:has_reference_population rdfs:comment "This group belongs to this population category.") + (gnt:has_reference_population rdfs:label "belongs to population category.")) + (triples (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_") + (set gnt:has_reference_population + (string->identifier "population" (field InbredSet Family) #:separator "_")))) + +(define-transformer gn:population->metadata + (tables (InbredSet) + "WHERE Family IS NOT NULL AND FullName NOT LIKE '%monkey%' GROUP BY Family") + (triples (string->identifier "population" (field InbredSet Family) #:separator "_") + (set rdf:type 'gnc:reference_population) + (set rdfs:label (field InbredSet Family)) + (set skos:member 'gnc:population_category) + (set gnt:has_population_order_id + (annotate-field (field InbredSet FamilyOrder) + '^^xsd:integer)))) + +(define-transformer gn:population->gn:set + (tables (InbredSet) + "WHERE Family IS NOT NULL AND FullName NOT LIKE '%monkey%'") + (triples (string->identifier "population" (field InbredSet Family) #:separator "_") + (set gnt:has_strain + (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) + +(define-transformer gnc:population_category->gn:population + (tables (InbredSet) + "WHERE public > 0 AND FullName NOT LIKE '%monkey%' GROUP BY Family") + (triples "gnc:population_category" + (set gnt:has_reference_population + (string->identifier "population" (field InbredSet Family) #:separator "_")))) + +(define-transformer gnc:taxonomic_family->gn:family + (tables (Species) + "WHERE Name != 'monkey' GROUP BY Family") + (triples "gnc:taxonomic_family" + (set 
gnt:has_taxonomic_family + (string->identifier "family" (field Species Family) #:separator "_")))) @@ -141,24 +149,36 @@ read))) (with-documentation - (name "Species Metadata") + (name "GN Classification Hierarchy") (connection %connection-settings) (table-metadata? #f) (prefixes - '(("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") + '(("dcat:" "<http://www.w3.org/ns/dcat#>") + ("dct:" "<http://purl.org/dc/terms/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("gnt:" "<http://genenetwork.org/term/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("schema:" "<https://schema.org/>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("taxon:" "<http://purl.uniprot.org/taxonomy/>"))) (inputs - (list classification-scheme-species - classification-scheme-set - species - inbred-set)) + (list gnc:species->gn:species + gnc:set->gn:set + gnc:species->metadata + gnc:species->gn:set + gn:family->gn:species/metadata + gn:family->gn:species + gn:set->metadata + gn:set->gn:population + gn:population->metadata + gn:population->gn:set + gnc:population_category->gn:population + gnc:taxonomic_family->gn:family)) (outputs `(#:documentation ,documentation #:rdf ,output)))) diff --git a/examples/dataset-metadata-git.scm b/examples/dataset-metadata-git.scm deleted file mode 100755 index c9ea59b..0000000 --- a/examples/dataset-metadata-git.scm +++ /dev/null @@ -1,94 +0,0 @@ -#! /usr/bin/env guile - -!# -(use-modules - (ice-9 getopt-long) - (srfi srfi-26) - ((ice-9 regex) #:select (regexp-substitute/global)) - ((transform strings) #:select (string-blank? 
string-capitalize-first)) - ((transform sql) #:select (call-with-target-database sql-for-each))) - -(define (save-file file result) - (when result - (let ((dir-name (dirname file))) - (unless (file-exists? dir-name) - (mkdir dir-name)) - (with-output-to-file file - (lambda () - (format #t "~a" result)))))) - -(define (infopages/sql->rtf result) - (let* ((get (cut assoc-ref result <>)) - (get* (compose (lambda (str) - (if (or (string-blank? str) - (string-ci=? - (string-trim-both str) "None")) - #f - str)) - get)) - (identifier - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (get "InfoPageName") - 'pre "_" 'post))) - (dir-name "/export/data/genenetwork/gn-docs/general/datasets/") - (file-name (cut string-append dir-name <>)) - (summary (get* "Summary")) - (tissue (get* "AboutTissue")) - (specifics (get* "Specifics")) - (contributors (get* "Contributors")) - (cases (get* "AboutCases")) - (platform (get* "AboutPlatform")) - (processing (get* "AboutDataProcessing")) - (notes (get* "Notes")) - (citation (get* "Citation")) - (experiment-type (get* "Experiment_Type")) - (experiment-design (get* "ExperimentDesign")) - (acknowledgment (get* "Acknowledgment"))) - (for-each (lambda (x) - (save-file - (string-append (file-name identifier) - "/" - (car x)) - (cdr x))) - `(("summary.rtf" . ,summary) - ("tissue.rtf" . ,tissue) - ("citation.rtf" . ,citation) - ("specifics.rtf" . ,specifics) - ("cases.rtf" . ,cases) - ("platform.rtf" . ,platform) - ("processing.rtf" . ,processing) - ("notes.rtf" . ,notes) - ("experiment-design.rtf" . ,experiment-design) - ("experiment-type.rtf" . ,experiment-type) - ("contributors.rtf" . ,contributors) - ("acknowledgment.rtf" . 
,acknowledgment))))) - - -(let* ((option-spec - '((settings (single-char #\s) (value #t)))) - (options (getopt-long (command-line) option-spec)) - (settings (option-ref options 'settings #f)) - (query "SELECT InfoPageName, Datasets.Summary, Datasets.AboutTissue, InfoFiles.Specifics, -Datasets.AboutCases, Datasets.AboutPlatform, Datasets.AboutDataProcessing, InfoFiles.Experiment_Type, -Datasets.Notes, Datasets.ExperimentDesign, Datasets.Acknowledgment, Datasets.Contributors, Datasets.Citation -FROM InfoFiles LEFT JOIN Datasets USING (DatasetId)") - (%connection-settings - (call-with-input-file settings - read))) - (call-with-target-database - %connection-settings - (lambda (db) - (let ((dir "/export/data/genenetwork/gn-docs/")) - (chdir dir) - (system "git reset --hard origin") - (system "git pull") - ;; Clear directory so that we can re-do the dump again from the db. - (system "rm -rf general/datasets/*/") - (sql-for-each infopages/sql->rtf - db - query) - (system "git add general/datasets") - (system (format #f "git commit -m ~s" "Update dataset RTF Files.")) - (system "git push origin master"))))) diff --git a/examples/dataset-metadata.scm b/examples/dataset-metadata.scm deleted file mode 100755 index 9c30180..0000000 --- a/examples/dataset-metadata.scm +++ /dev/null @@ -1,541 +0,0 @@ -#! /usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 getopt-long) - (ice-9 match) - (ice-9 regex) - (transform strings) - (transform sql) - (transform triples) - (transform special-forms)) - - -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" 
- (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) - -;; One email ID in the Investigators table has spaces in it. This -;; function fixes that. -(define (fix-email-id email) - (string-delete #\space email)) - -(define (investigator-attributes->id first-name last-name email) - ;; There is just one record corresponding to "Evan Williams" which - ;; does not have an email ID. To accommodate that record, we - ;; construct the investigator ID from not just the email ID, but - ;; also the first and the last names. It would be preferable to just - ;; find Evan Williams' email ID and insert it into the database. - (string->identifier "investigator" - (string-join - (list first-name last-name (fix-email-id email)) - "_"))) - -(define-transformer investigators - ;; There are a few duplicate entries. We group by email to - ;; deduplicate. 
- (tables (Investigators) - "GROUP BY Email") - (triples (investigator-attributes->id (field Investigators FirstName) - (field Investigators LastName) - (field Investigators Email)) - (set rdf:type 'foaf:Person) - (set foaf:name (string-append (field Investigators FirstName) " " - (field Investigators LastName))) - (set foaf:givenName - (field Investigators FirstName)) - (set foaf:familyName - (field Investigators LastName)) - (set foaf:homepage (field Investigators Url)) - (set v:adr (field Investigators Address)) - (set v:locality (field Investigators City)) - (set v:region (field Investigators State)) - (set v:postal-code (field Investigators ZipCode)) - (set v:country-name (field Investigators Country)))) - -(define-transformer gene-chip - (tables (GeneChip - (left-join Species "USING (SpeciesId)"))) - (schema-triples - (gnc:geneChip a skos:Concept) - (gnc:geneChip - skos:description - "This is a set of controlled terms that are used to describe a given gene chip/platform") - (gnt:hasGeoSeriesId rdfs:domain gnc:platform) - (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip) - (gnt:hasGOTreeValue a owl:ObjectProperty) - (gnt:hasGOTreeValue skos:definition "This resource the following GO tree value") - (gnt:hasGOTreeValue rdfs:domain gnc:geneChip)) - (triples (string->identifier "platform" (field GeneChip Name)) - (set rdf:type 'gnc:geneChip) - (set rdfs:label (field GeneChip GeneChipName)) - (set skos:prefLabel (field GeneChip Name)) - (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)" - Title))) - (set gnt:hasGOTreeValue (field GeneChip Go_tree_value)) - (set xkos:classifiedUnder - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) - (set gnt:hasGeoSeriesId - (ontology 'geoSeries: - (string-trim-both (field GeneChip GeoPlatform)))))) - -(define-transformer info-files - (tables (InfoFiles - (left-join PublishFreeze "ON InfoFiles.InfoPageName = 
PublishFreeze.Name") - (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name") - (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name") - (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId") - (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId") - (left-join Datasets "USING (DatasetId)") - (left-join DatasetStatus "USING (DatasetStatusId)") - (left-join Tissue "USING (TissueId)") - (left-join Investigators "USING (InvestigatorId)") - (left-join AvgMethod "USING (AvgMethodId)") - (left-join Organizations "USING (OrganizationId)") - (left-join GeneChip "USING (GeneChipId)")) - ;; XXXX: There are datasets that don't have the InbredSetId - ;; in the Infofiles table. This clause allows us to check - ;; if they exist in the (Publish/Geno)Freeze tables. - "LEFT JOIN InbredSet PublishInbredSet ON PublishFreeze.InbredSetId = PublishInbredSet.InbredSetId LEFT JOIN InbredSet GenoInbredSet ON GenoFreeze.InbredSetId = GenoInbredSet.InbredSetId WHERE GN_AccesionId IS NOT NULL") - (schema-triples - (gnt:hasTissue rdfs:domain dcat:Dataset) - (gnt:hasTissue a owl:ObjectProperty) - (gnt:hasTissue skos:definition "Tissues this resource has") - (gnt:usesNormalization rdfs:domain dcat:Dataset) - (gnt:usesNormalization a owl:ObjectProperty) - (gnt:usesNormalization skos:definition "Normalization techniques this resource has") - (gnt:usesPlatform rdfs:domain dcat:Dataset) - (gnt:usesPlatform a owl:ObjectProperty) - (gnt:usesPlatform skos:definition "The Platform this resource uses") - (gnt:hasGeoSeriesId rdfs:domain dcat:Dataset) - (gnt:hasGeoSeriesId a owl:ObjectProperty) - (gnt:hasGeoSeriesId skos:definition "id of record in NCBI database") - (gnt:hasExperimentType rdfs:domain dcat:Dataset) - (gnt:hasExperimentType a owl:ObjectProperty) - (gnt:hasExperimentType rdfs:label "Experiment Type Metadata") - (gnt:hasExperimentType skos:definition "Information about the experiment type") - (gnt:hasTissueInfo rdfs:domain 
dcat:Dataset) - (gnt:hasTissueInfo a owl:ObjectProperty) - (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource") - (gnt:hasExperimentDesignInfo rdfs:domain dcat:Dataset) - (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design") - (gnt:hasExperimentDesignInfo a owl:ObjectProperty) - (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed") - (gnt:hasNotes rdfs:domain dcat:Dataset) - (gnt:hasNotes a owl:ObjectProperty) - (gnt:hasNotes rdfs:label "Notes") - (gnt:hasNotes skos:definition "Extra Notes about this dataset") - (gnt:hasDataProcessingInfo rdfs:domain dcat:Dataset) - (gnt:hasDataProcessingInfo rdfs:label "About Data Processing") - (gnt:hasDataProcessingInfo a owl:ObjectProperty) - (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed") - (gnt:hasPlatformInfo rdfs:domain dcat:Dataset) - (gnt:hasPlatformInfo a owl:ObjectProperty) - (gnt:hasPlatformInfo rdfs:label "About Platform") - (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset") - (gnt:hasCaseInfo rdfs:domain dcat:Dataset) - (gnt:hasCaseInfo rdfs:label "About Case") - (gnt:hasCaseInfo a owl:ObjectProperty) - (gnt:hasCaseInfo skos:definition "Information about the cases used in this platform") - (gnt:hasSummary rdfs:domain dcat:Dataset) - (gnt:hasSummary rdfs:label "Summary") - (gnt:hasSummary a owl:ObjectProperty) - (gnt:hasSummary skos:definition "Summary information about dataset") - (gnt:hasCitation rdfs:domain dcat:Dataset) - (gnt:hasCitation rdfs:label "Citation") - (gnt:hasCitation a owl:ObjectProperty) - (gnt:hasCitation skos:definition "Citation for this dataset") - (gnt:hasContributors rdfs:domain dcat:Dataset) - (gnt:hasContributors rdfs:label "Contributors") - (gnt:hasContributors a owl:ObjectProperty) - (gnt:hasContributors skos:definition "Contributors of this resource") - (gnt:hashasExperimentDesign rdfs:domain dcat:Dataset) - 
(gnt:hashasExperimentDesign rdfs:label "Experiment Design") - (gnt:hashasExperimentDesign a owl:ObjectProperty) - (gnt:hashasExperimentDesign skos:definition "Experiment Design for this resource") - (gnt:hasTissueInfo rdfs:domain dcat:Dataset) - (gnt:hasTissueInfo rdfs:label "Tissue Information") - (gnt:hasTissueInfo a owl:ObjectProperty) - (gnt:hasTissueInfo skos:definition "Tissue information about dataset") - (gnt:hasExperimentType skos:definition "Information about the experiment type") - (gnt:hasAcknowledgement rdfs:domain dcat:Dataset) - (gnt:hasAcknowledgement rdfs:label "Acknowledgement") - (gnt:hasAcknowledgement a owl:ObjectProperty) - (gnt:hasAcknowledgement skos:definition "People to acknowledge")) - (triples (string->identifier - "" (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)) - (set rdf:type 'dcat:Dataset) - (set xkos:classifiedUnder - (let ([dataset-type - (string-trim-both - (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:Genotype', IF(PublishFreeze.Id IS NOT NULL, 'gnc:Phenotype', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:Probeset', '')))" - DatasetType)))]) - (if (not (string-null? 
dataset-type)) - (string->symbol - dataset-type) - ""))) - (set rdfs:label (regexp-substitute/global - #f "^[Nn]one$" - (field InfoFiles InfoPageName) - "")) - (set skos:prefLabel - (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))" - DatasetFullName))) - (set skos:altLabel (field Datasets DatasetName DatasetGroup)) - (set dct:title - (regexp-substitute/global - #f "^[Nn]one$" - (or - (regexp-substitute/global - #f "^Unpublished$" (field Datasets PublicationTitle) "") - (field InfoFiles InfoFileTitle) - "") - "")) - (set dct:created - (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))" - createTimeGenoFreeze))) - (set dcat:contactPoint - (investigator-attributes->id (field Investigators FirstName) - (field Investigators LastName) - (field Investigators Email))) - (set foaf:Organization - (field Organizations OrganizationName)) - (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId))) - (set dct:accessRights (string-downcase - (field DatasetStatus DatasetStatusName))) - (set gnt:belongsToGroup - (string->identifier - "set" - (field ("IFNULL(InbredSet.Name, IFNULL(PublishInbredSet.Name, GenoInbredSet.Name))" - InbredSetName)))) - (set gnt:hasTissue (string->identifier "tissue" - (field Tissue Short_Name))) - (set gnt:usesNormalization - (string->identifier "avgMethod" - ;; If AvgMethodName is NULL, assume N/A. - (if (string-blank? (field AvgMethod Name AvgMethodName)) - "N/A" (field AvgMethod Name AvgMethodName)))) - (set gnt:hasSummary - (let* ((summary-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/summary.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (summary - (field InfoFiles Summary))) - (if (or (null? summary) (string-blank? 
summary)) - "" (string->symbol summary-link)))) - (set gnt:hasTissueInfo - (let* ((tissue-info-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/tissue.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (tissue-info - (field Datasets AboutTissue))) - (if (or (null? tissue-info) (string-blank? tissue-info)) - "" (string->symbol tissue-info-link)))) - (set gnt:hasCitation - (let* ((citation-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/citation.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (citation - (field Datasets Citation))) - (if (or (null? citation) (string-blank? citation)) - "" (string->symbol citation-link)))) - (set gnt:hasSpecifics - (let* ((specifics-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/specifics.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (specifics - (field InfoFiles Specifics))) - (if (or (null? specifics) (string-blank? specifics)) - "" (string->symbol specifics-link)))) - (set gnt:hasCaseInfo - (let* ((cases-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/cases.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (cases - (field Datasets AboutCases))) - (if (or (null? cases) (string-blank? 
cases)) - "" (string->symbol cases-link)))) - (set gnt:hasPlatformInfo - (let* ((platform-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/platform.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (platform - (field Datasets AboutPlatform))) - (if (or (null? platform) (string-blank? platform)) - "" (string->symbol platform-link)))) - (set gnt:hasDataProcessingInfo - (let* ((processing-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/processing.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (processing - (field Datasets AboutDataProcessing))) - (if (or (null? processing) (string-blank? processing)) - "" (string->symbol processing-link)))) - (set gnt:hasNotes - (let* ((notes-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/notes.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (notes - (field Datasets Notes))) - (if (or (null? notes) (string-blank? notes)) - "" (string->symbol notes-link)))) - (set gnt:hasExperimentType - (let* ((experiment-type-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/experiment-type.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (experiment-type - (field InfoFiles Experiment_Type))) - (if (or (null? experiment-type) (string-blank? 
experiment-type)) - "" (string->symbol experiment-type-link)))) - (set gnt:hasExperimentDesign - (let* ((experiment-design-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/experiment-design.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (experiment-design - (field Datasets ExperimentDesign))) - (if (or (null? experiment-design) (string-blank? experiment-design)) - "" (string->symbol experiment-design-link)))) - (set gnt:hasContributors - (let* ((contributors-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/contributors.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (contributors - (field Datasets Contributors))) - (if (or (null? contributors) (string-blank? contributors)) - "" (string->symbol contributors-link)))) - (set gnt:hasAcknowledgement - (let* ((acknowledgment-link - (format - #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/acknowledgment.rtf>" - (string-capitalize-first - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)))) - (acknowledgment - (field Datasets Acknowledgment))) - (if (or (null? acknowledgment) (string-blank? 
acknowledgment)) - "" (string->symbol acknowledgment-link)))) - (set gnt:usesPlatform - (string->identifier "platform" - (field GeneChip Name GeneChip))) - (set gnt:hasGeoSeriesId - (let ((s - (string-match "GSE[0-9]*" - (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries))))) - (if s (ontology - 'geoSeries: (match:substring s)) - ""))))) - -;; These are phenotype datasets that don't have Infofile metadata -(define-transformer publishfreeze - (tables (PublishFreeze - (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name") - (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId")) - "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL") - (triples - (string->identifier - "" - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field PublishFreeze Name) - 'pre "_" 'post)) - (set rdf:type 'dcat:Dataset) - (set xkos:classifiedUnder 'gnc:Phenotype) - (set dct:title (field PublishFreeze FullName)) - (set rdfs:label (field PublishFreeze Name)) - (set skos:altLabel (field PublishFreeze ShortName)) - (set dct:created (annotate-field - (field PublishFreeze CreateTime) - '^^xsd:date)) - (set gnt:belongsToGroup - (string->identifier - "set" (field InbredSet Name InbredSetName) - #:separator "" - #:proc string-capitalize-first)))) - -(define-transformer genofreeze - (tables (GenoFreeze - (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name") - (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId")) - "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL") - (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field GenoFreeze Name) - 'pre "_" 'post) - 'pre "_" 'post)) - (set rdf:type 'dcat:Dataset) - (set xkos:classifiedUnder 'gnc:Genotype) - (set rdfs:label (field GenoFreeze Name)) - (set dct:title (field GenoFreeze FullName)) - (set 
skos:altLabel (field GenoFreeze ShortName)) - (set dct:created (annotate-field - (field GenoFreeze CreateTime) - '^^xsd:date)) - (set gnt:belongsToGroup - (string->identifier - "set" (field InbredSet Name InbredSetName) - #:separator "" - #:proc string-capitalize-first)))) - -;; Molecular Traits are also referred to as ProbeSets -(define-transformer probesetfreeze - (tables (ProbeSetFreeze - (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name") - (left-join ProbeFreeze "USING (ProbeFreezeId)") - (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID") - (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id") - (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId")) - "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id") - (schema-triples - (gnt:usesNormalization rdfs:domain gnc:probeset) - (gnt:usesDataScale rdfs:domain gnc:probeset) - (gnt:usesDataScale a owl:ObjectProperty) - (gnt:usesDataScale skos:definition "Thi data scale this resource uses")) - (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ProbeSetFreeze Name) - 'pre "_" 'post)) - (set rdf:type 'dcat:Dataset) - (set xkos:classifiedUnder 'gnc:Probeset) - (set gnt:usesNormalization - (string->identifier "avgMethod" - ;; If AvgMethodName is NULL, assume N/A. - (if (string-blank? 
(field AvgMethod Name AvgMethodName)) - "N/A" (field AvgMethod Name AvgMethodName)))) - (set dct:title (field ProbeSetFreeze FullName)) - (set rdfs:label (field ProbeSetFreeze ShortName)) - (set skos:prefLabel (field ProbeSetFreeze Name)) - (set skos:altLabel (field ProbeSetFreeze Name2)) - (set dct:created (annotate-field - (field ProbeSetFreeze CreateTime) - '^^xsd:datetime)) - (set gnt:usesDataScale (field ProbeSetFreeze DataScale)) - (set gnt:hasTissue - (string->identifier - "tissue" - (field Tissue Short_Name))) - (set gnt:belongsToGroup - (string->identifier - "set" (field InbredSet Name InbredSetName) - #:separator "" - #:proc string-capitalize-first)))) - - - -(let* ((option-spec - '((settings (single-char #\s) (value #t)) - (output (single-char #\o) (value #t)) - (documentation (single-char #\d) (value #t)))) - (options (getopt-long (command-line) option-spec)) - (settings (option-ref options 'settings #f)) - (output (option-ref options 'output #f)) - (documentation (option-ref options 'documentation #f)) - (%connection-settings - (call-with-input-file settings - read))) - (with-documentation - (name "Info files / Investigators Metadata") - (connection %connection-settings) - (table-metadata? 
#f) - (prefixes - '(("v:" "<http://www.w3.org/2006/vcard/ns#>") - ("foaf:" "<http://xmlns.com/foaf/0.1/>") - ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") - ("dcat:" "<http://www.w3.org/ns/dcat#>") - ("skos:" "<http://www.w3.org/2004/02/skos/core#>") - ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") - ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>") - ("gnt:" "<http://genenetwork.org/term/>") - ("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") - ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") - ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") - ("taxon:" "<http://purl.uniprot.org/taxonomy/>") - ("dct:" "<http://purl.org/dc/terms/>"))) - (inputs - (list info-files - publishfreeze - genofreeze - probesetfreeze - investigators - gene-chip)) - (outputs - `(#:documentation ,documentation - #:rdf ,output)))) - - diff --git a/examples/datasets.scm b/examples/datasets.scm new file mode 100755 index 0000000..85a5aee --- /dev/null +++ b/examples/datasets.scm @@ -0,0 +1,120 @@ +#! 
/usr/bin/env guile +!# + +(use-modules (rnrs programs) + (rnrs io ports) + (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms)) + + +(define-transformer gn:dataset->metadata + (tables (Datasets + (inner-join InfoFiles "ON InfoFiles.DatasetId = Datasets.DatasetId") + (inner-join InbredSet "ON InbredSet.Id = InfoFiles.InbredSetId")) + ;; Skip monkey datasets + "WHERE InfoFiles.InfoPageName NOT LIKE 'INIA_MacFas_%'" + "GROUP BY Datasets.DatasetId") + (triples (string->identifier "dataset" (field InfoFiles InfoPageName) #:separator "_") + (set rdf:type 'dcat:Dataset) + (set dct:title (normalize-string-field (field InfoFiles InfoPageName))) + (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId))) + (set gnt:has_genotype_files (string->symbol (format #f "gn-files:GN~a%2F" (field InfoFiles GN_AccesionId)))) + (set gnt:has_strain + (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")) + (set gnt:has_experiment_type + (let ((experiment-type + (field InfoFiles Experiment_Type))) + (if (or (null? experiment-type) (string-blank? experiment-type)) + "" (sanitize-rdf-string experiment-type)))) + (set gnt:has_tissue_info + (let ((tissue-info + (field Datasets AboutTissue))) + (if (or (null? tissue-info) (string-blank? tissue-info)) + "" (sanitize-rdf-string tissue-info)))) + (set gnt:has_summary + (let* ((summary + (field Datasets Summary))) + (if (or (null? summary) (string-blank? summary)) + "" (sanitize-rdf-string summary)))) + (set gnt:has_citation + (let ((citation + (field Datasets Citation))) + (if (or (null? citation) (string-blank? citation)) + "" (sanitize-rdf-string citation)))) + (set gnt:has_samples + (let ((samples + (field InfoFiles samples))) + (if (or (null? samples) (string-blank? 
samples)) + "" (sanitize-rdf-string samples)))) + (set gnt:has_specifics + (let* ((specifics + (field InfoFiles Specifics))) + (if (or (null? specifics) (string-blank? specifics)) + "" (sanitize-rdf-string specifics)))) + (set gnt:has_case_info + (let ((cases + (field Datasets AboutCases))) + (if (or (null? cases) (string-blank? cases)) + "" (sanitize-rdf-string cases)))) + (set gnt:has_platform_info + (let* ((platform + (field Datasets AboutPlatform))) + (if (or (null? platform) (string-blank? platform)) + "" (sanitize-rdf-string platform)))) + (set gnt:has_data_processing_info + (let* ((processing + (field Datasets AboutDataProcessing))) + (if (or (null? processing) (string-blank? processing)) + "" (sanitize-rdf-string processing)))) + (set gnt:has_experiment_design + (let ((experiment-design + (field Datasets ExperimentDesign))) + (if (or (null? experiment-design) (string-blank? experiment-design)) + "" (sanitize-rdf-string experiment-design)))) + (set gnt:has_contributors + (let ((contributors + (field Datasets Contributors))) + (if (or (null? contributors) (string-blank? contributors)) + "" (sanitize-rdf-string contributors)))))) + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (with-documentation + (name "Datasets Metadata") + (connection %connection-settings) + (table-metadata? 
#f) + (prefixes + '(("dct:" "<http://purl.org/dc/terms/>") + ("dcat:" "<http://www.w3.org/ns/dcat#>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("gn-files:" "<http://files.genenetwork.org/current/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>"))) + (inputs + (list gn:dataset->metadata)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))) diff --git a/examples/genbank.scm b/examples/genbank.scm index 391cff0..d09b30f 100755 --- a/examples/genbank.scm +++ b/examples/genbank.scm @@ -10,35 +10,22 @@ (transform strings) (transform sql) (transform triples) - (transform special-forms) - (transform uuid)) + (transform special-forms)) -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" 
- (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) - (define-transformer genbank (tables (Genbank (left-join Species "USING (SpeciesId)"))) (schema-triples (gnc:nucleotide a skos:Concept) - (gnt:hasSequence rdfs:domain gnc:nucleotide)) + (gnt:has_sequence rdfs:domain gnc:nucleotide)) (triples (ontology 'genbank: (field Genbank Id)) - (set gnt:hasSequence (field Genbank Sequence)) - (set gnt:belongsToSpecies - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)))) + (set gnt:has_sequence (field Genbank Sequence)) + (set gnt:has_species + (string->identifier "" (remap-species-identifiers (field Species Fullname)))))) @@ -63,11 +50,11 @@ ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") - ("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") - ("gnt:" "<http://genenetwork.org/term/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("dct:" "<http://purl.org/dc/terms/>") - ("foaf:" "<http://xmlns.com/foaf/0.1/>") + ("foaf:" "<http://xmlns.com/foaf/0.1/#term_>") ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>") ("ncbiTaxon:" "<https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=>") ("generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>") diff --git a/examples/gene-chip.scm b/examples/gene-chip.scm new file mode 100755 index 0000000..eec17b8 --- /dev/null +++ b/examples/gene-chip.scm @@ -0,0 +1,76 @@ +#! 
/usr/bin/env guile +!# + +(use-modules (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms)) + + +(define-transformer gn:platform->metadata + (tables (GeneChip + (left-join Species "USING (SpeciesId)"))) + (schema-triples + (gnc:gene_chip a skos:ConceptScheme) + (gnc:gene_chip skos:prefLabel "Gene Chip Vocabulary") + (gnc:gene_chip skos:definition "A controlled vocabulary used to describe gene chip and microarray platforms.") + (gnt:has_geo_series_id rdf:type owl:ObjectProperty) + (gnt:has_geo_series_id rdf:label "has GEO Series ID") + (gnt:has_geo_series_id rdfs:domain skos:Concept) + (gnt:has_go_tree_value a owl:ObjectProperty) + (gnt:has_go_tree_value rdfs:label "has GO tree value") + (gnt:has_go_tree_value + rdfs:comment + "Associates a gene chip concept with a Gene Ontology term used for categorization.") + (gnt:has_go_tree_value rdfs:domain skos:Concept) + (gnt:has_go_tree_value rdfs:range xsd:string)) + (triples (string->identifier "platform" (field GeneChip Name) #:separator "_") + (set rdf:type 'skos:Concept) + (set skos:inScheme (field GeneChip GeneChipName)) + (set skos:prefLabel (field GeneChip Name)) + (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)" + Title))) + (set gnt:has_go_tree_value (field GeneChip Go_tree_value)) + (set gnt:has_species + (string->identifier "" (remap-species-identifiers (field Species Fullname)))) + (set gnt:has_geo_series_id + (ontology 'geoSeries: + (string-trim-both (field GeneChip GeoPlatform)))))) + + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + 
(%connection-settings + (call-with-input-file settings + read))) + (with-documentation + (name "GeneChip Metadata") + (connection %connection-settings) + (table-metadata? #f) + (prefixes + '(("xsd:" "<http://www.w3.org/2001/XMLSchema#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>"))) + (inputs + (list gn:platform->metadata)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))) diff --git a/examples/genelist.scm b/examples/genelist.scm index 9c1ced0..5048bf2 100755 --- a/examples/genelist.scm +++ b/examples/genelist.scm @@ -18,73 +18,72 @@ (tables (GeneList (left-join Species "USING (SpeciesId)"))) (schema-triples - (gnt:gene rdfs:domain gnc:GeneSymbol) - (gnt:belongsToSpecies rdfs:domain gnc:GeneSymbol) - (gnc:Gene a rdfs:Class) - (gnc:Gene rdfs:label "Gene") - (gnt:hasGeneId a owl:ObjectProperty) - (gnt:hasGeneId rdfs:domain gnc:NCBIWikiEntry) - (gnt:hasGeneId skos:definition "The GeneId of this this resource") - (gnc:transcript rdfs:domain gnc:GeneSymbol) + (gnc:gene_symbol a rdfs:Class) + (gnc:gene_symbol rdfs:label "A gene symbol") + (gnt:gene rdfs:domain gnc:gene_symbol) + (gnt:has_species rdfs:domain gnc:gene_symbol) + (gnc:gene a rdfs:Class) + (gnc:gene rdfs:label "Gene") + (gnt:has_gene_id a owl:ObjectProperty) + (gnt:has_gene_id rdfs:domain gnc:ncbi_wiki_entry) + (gnt:has_gene_id skos:definition "The GeneId of this this resource") + (gnc:transcript rdfs:domain gnc:gene_symbol) (gnt:transcript a owl:ObjectProperty) (gnc:transcript rdfs:comments "The gene transcript of this resource") - (gnc:ebiGwasLink rdfs:Class gnc:ResourceLink) - (gnc:ebiGwasLink 
rdfs:label "EBI GWAS") - (gnc:ebiGwasLink rdfs:comments "EBI GWAS") - (gnc:proteinAtlasLink rdfs:Class gnc:ResourceLink) - (gnc:proteinAtlasLink rdfs:label "Protein Atlas") - (gnc:proteinAtlasLink rdfs:comments "Human Protein Atlas") - (gnc:genemaniaLink rdfs:Class gnc:ResourceLink) - (gnc:genemaniaLink rdfs:label "GeneMANIA") - (gnc:genemaniaLink rdfs:comments "GeneMANIA") - (gnc:gemmaLink rdfs:Class gnc:ResourceLink) - (gnc:gemmaLink rdfs:label "Gemma") - (gnc:gemmaLink rdfs:comments "Meta-analysis of gene expression data") - (gnc:biogpsLink rdfs:Class gnc:ResourceLink) - (gnc:biogpsLink rdfs:label "BioGPS") - (gnc:biogpsLink rdfs:comments "Expression across many tissues and cell types") - (gnc:abaLink rdfs:Class gnc:ResourceLink) - (gnc:abaLink rdfs:label "ABA") - (gnc:abaLink rdfs:comments "Allen Brain Atlas") - (gnc:pantherLink rdfs:Class gnc:ResourceLink) - (gnc:pantherLink rdfs:label "PANTHER") - (gnc:pantherLink rdfs:comments "Gene and protein data resources from Celera-ABI") - (gnc:stringLink rdfs:Class gnc:ResourceLink) - (gnc:stringLink rdfs:label "STRING") - (gnc:stringLink rdfs:comments "Protein interactions: known and inferred") - (gnc:gtexLink rdfs:Class gnc:ResourceLink) - (gnc:gtexLink rdfs:label "GTEx Portal") - (gnc:gtexLink rdfs:comments "GTEx Portal") - (gnc:rgdLink rdfs:Class gnc:ResourceLink) - (gnc:rgdLink rdfs:label "Rat Genome DB") - (gnc:rgdLink rdfs:comments "Rat Genome DB") - (gnc:hasKgID rdfs:domain gnc:GeneSymbol) - (gnt:hasKgID a owl:ObjectProperty) - (gnc:hasKgID rdfs:comments "The kgID of this resource") - (gnc:hasUnigenID rdfs:domain gnc:GeneSymbol) - (gnt:hasUnigenID a owl:ObjectProperty) - (gnc:hasUnigenID rdfs:comments "The UnigenID of this resource") - (gnc:hasProteinID rdfs:domain gnc:GeneSymbol) - (gnt:hasProteinID a owl:ObjectProperty) - (gnc:hasProteinID rdfs:comments "The ProteinID of this resource") - (gnc:hasAlignID rdfs:domain gnc:GeneSymbol) - (gnt:hasAlignID a owl:ObjectProperty) - (gnc:hasAlignID rdfs:comments "The 
AlignID of this resource") - (gnt:TxEnd rdfs:range xsd:double) - (gnt:TxStart rdfs:range xsd:double) - (gnt:hasTargetSeq rdfs:domain gnc:Probeset)) + (gnc:ebi_gwas_link rdfs:Class gnc:ResourceLink) + (gnc:ebi_gwas_link rdfs:label "EBI GWAS") + (gnc:ebi_gwas_link rdfs:comments "EBI GWAS") + (gnc:protein_atlas_link rdfs:Class gnc:ResourceLink) + (gnc:protein_atlas_link rdfs:label "Protein Atlas") + (gnc:protein_atlas_link rdfs:comments "Human Protein Atlas") + (gnc:genemania_link rdfs:Class gnc:ResourceLink) + (gnc:genemania_link rdfs:label "GeneMANIA") + (gnc:genemania_link rdfs:comments "GeneMANIA") + (gnc:gemma_link rdfs:Class gnc:ResourceLink) + (gnc:gemma_link rdfs:label "Gemma") + (gnc:gemma_link rdfs:comments "Meta-analysis of gene expression data") + (gnc:biogps_link rdfs:Class gnc:ResourceLink) + (gnc:biogps_link rdfs:label "BioGPS") + (gnc:biogps_link rdfs:comments "Expression across many tissues and cell types") + (gnc:aba_link rdfs:Class gnc:ResourceLink) + (gnc:aba_link rdfs:label "ABA") + (gnc:aba_link rdfs:comments "Allen Brain Atlas") + (gnc:panther_link rdfs:Class gnc:ResourceLink) + (gnc:panther_link rdfs:label "PANTHER") + (gnc:panther_link rdfs:comments "Gene and protein data resources from Celera-ABI") + (gnc:panther_link rdfs:Class gnc:ResourceLink) + (gnc:panther_link rdfs:label "STRING") + (gnc:panther_link rdfs:comments "Protein interactions: known and inferred") + (gnc:gtex_link rdfs:Class gnc:ResourceLink) + (gnc:gtex_link rdfs:label "GTEx Portal") + (gnc:gtex_link rdfs:comments "GTEx Portal") + (gnc:rgd_link rdfs:Class gnc:ResourceLink) + (gnc:rgd_link rdfs:label "Rat Genome DB") + (gnc:rgd_link rdfs:comments "Rat Genome DB") + (gnc:has_kg_id rdfs:domain gnc:gene_symbol) + (gnc:has_kg_id a owl:ObjectProperty) + (gnc:has_kg_id rdfs:comments "The kgID of this resource") + (gnc:has_unigen_id rdfs:domain gnc:gene_symbol) + (gnc:has_unigen_id a owl:ObjectProperty) + (gnc:has_unigen_id rdfs:comments "The UnigenID of this resource") + 
(gnc:has_protein_id rdfs:domain gnc:gene_symbol) + (gnt:has_protein_id a owl:ObjectProperty) + (gnc:has_protein_id rdfs:comments "The ProteinID of this resource") + (gnc:has_align_id rdfs:domain gnc:gene_symbol) + (gnt:has_align_id a owl:ObjectProperty) + (gnc:has_align_id rdfs:comments "The AlignID of this resource") + (gnt:tx_end rdfs:range xsd:double) + (gnt:tx_start rdfs:range xsd:double) + (gnt:has_target_seq rdfs:domain gnc:probeset)) (triples (string->identifier - "gene" (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (string-trim-both - (field ("CONCAT_WS('_', GeneSymbol, GeneID, AlignID)" GENE_UID))) - 'pre "_" 'post) - #:proc (lambda (x) x)) - (set rdf:type 'gnc:Gene) - (set gnt:geneSymbol (field GeneList GeneSymbol)) + "gene" (normalize-string-field (string-trim-both + (field ("CONCAT_WS('_', GeneSymbol, GeneID, AlignID)" GENE_UID)))) + #:separator "_") + (set rdf:type 'gnc:gene) + (set gnt:gene_symbol (field GeneList GeneSymbol)) (set dct:description (sanitize-rdf-string (field GeneList GeneDescription))) - (set gnt:hasGeneId (ontology 'gene: (field GeneList GeneId))) + (set gnt:has_gene_id (ontology 'gene: (field GeneList GeneId))) (set dct:references (let ((symbol (field GeneList GeneSymbol))) (if (not (string-blank? symbol)) @@ -94,7 +93,7 @@ "https://www.ebi.ac.uk/gwas/search?query=" (uri-encode (string-trim-both symbol)) - "a gnc:ebiGwasLink")) + "a gnc:ebi_gwas_link")) ""))) (set dct:references (let ((symbol (field GeneList GeneSymbol)) @@ -107,7 +106,7 @@ (string->symbol (format #f "<~0@*~a> .~%<~0@*~a> ~1@*~a" "http://mouse.brain-map.org/search/show?search_type=gene&search_term=" - "a gnc:abaLink" + "a gnc:aba_link" (if (string=? 
species "mouse") (uri-encode (string-trim-both symbol)) @@ -129,7 +128,7 @@ (string-trim-both symbol)) "&category=Gene&species=" (string-capitalize species) - "a gnc:rgdLink")) + "a gnc:rgd_link")) ""))) (set dct:references (let ((geneId (field GeneList GeneID)) @@ -147,7 +146,7 @@ species "#goto=genereport&id=" geneId - "a gnc:biogpsLink")) + "a gnc:biogps_link")) ""))) (set dct:references (let ((geneId (field GeneList GeneID))) @@ -157,7 +156,7 @@ "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a" "http://www.chibi.ubc.ca/Gemma/gene/showGene.html?ncbiid=" geneId - "a gnc:gemmaLink")) + "a gnc:gemma_link")) ""))) (set dct:references (let ((symbol (field GeneList GeneSymbol)) @@ -175,7 +174,7 @@ species (uri-encode (string-trim-both symbol)) - "a gnc:genemaniaLink")) + "a gnc:genemania_link")) ""))) (set dct:references (let ((symbol (field GeneList GeneSymbol))) @@ -186,7 +185,7 @@ "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue=" (uri-encode (string-trim-both symbol)) - "a gnc:pantherLink")) + "a gnc:panther_link")) ""))) (set dct:references (let ((symbol (field GeneList GeneSymbol))) @@ -197,7 +196,7 @@ "http://string-db.org/newstring_cgi/show_network_section.pl?identifier=" (uri-encode (string-trim-both symbol)) - "a gnc:stringLink")) + "a gnc:panther_link")) ""))) (set dct:references (let ((symbol (field GeneList GeneSymbol))) @@ -208,7 +207,7 @@ "https://www.gtexportal.org/home/gene/" (uri-encode (string-trim-both symbol)) - "a gnc:gtexLink")) + "a gnc:gtex_link")) ""))) (set dct:references (let ((symbol (field GeneList GeneSymbol))) @@ -219,33 +218,27 @@ "http://www.proteinatlas.org/search/" (uri-encode (string-trim-both symbol)) - "a gnc:proteinAtlasLink")) + "a gnc:protein_atlas_link")) ""))) (set gnt:chromosome (field GeneList Chromosome)) - (set gnt:TxStart (annotate-field + (set gnt:tx_start (annotate-field (field GeneList TxStart) '^^xsd:double)) - (set gnt:TxEnd (annotate-field + (set gnt:tx_end 
(annotate-field (field GeneList TxEnd) '^^xsd:double)) - (set gnt:Strand (string-trim-both (field GeneList Strand))) + (set gnt:strand (string-trim-both (field GeneList Strand))) (set - gnt:belongsToSpecies - (string->identifier - "" - (remap-species-identifiers - (string-trim-both (field Species Name))) - #:separator "" - #:proc string-capitalize-first)) + gnt:has_species (string->identifier "" (remap-species-identifiers (field Species Fullname)))) (set gnt:transcript (ontology 'transcript: (string-trim-both (field GeneList NM_ID)))) - (set gnt:hasKgID (string-trim-both (field GeneList kgID))) - (set gnt:hasUnigenID (string-trim-both (field GeneList UnigenID))) - (set gnt:hasProteinID (string-trim-both (field GeneList ProteinID))) - (set gnt:hasAlignID (string-trim-both (field GeneList AlignID))) - (set gnt:hasRgdID + (set gnc:has_kg_id (string-trim-both (field GeneList kgID))) + (set gnc:has_unigen_id (string-trim-both (field GeneList UnigenID))) + (set gnt:has_protein_id (string-trim-both (field GeneList ProteinID))) + (set gnt:has_align_id (string-trim-both (field GeneList AlignID))) + (set gnt:has_rgd_id (field ("IFNULL(RGD_ID, '')" RGD_ID))))) (define-transformer genelist-rn33 @@ -257,25 +250,26 @@ (if (number? 
gene-uid) (number->string gene-uid) - gene-uid))) - (set rdf:type 'gnc:Gene) - (set gnt:belongsToSpecies 'gn:Rattus_norvegicus) - (set gnt:geneSymbol (string-trim-both (field GeneList_rn33 geneSymbol))) + gene-uid) + #:separator "_")) + (set rdf:type 'gnc:gene) + (set gnt:has_species 'gn:Rattus_norvegicus) + (set gnt:gene_symbol (string-trim-both (field GeneList_rn33 geneSymbol))) (set gnt:chromosome (field GeneList_rn33 chromosome)) - (set gnt:TxStart (annotate-field + (set gnt:tx_start (annotate-field (field GeneList_rn33 txStart) '^^xsd:double)) - (set gnt:TxEnd (annotate-field + (set gnt:tx_end (annotate-field (field GeneList_rn33 txEnd) '^^xsd:double)) - (set gnt:Strand (string-trim-both (field GeneList_rn33 strand))) + (set gnt:strand (string-trim-both (field GeneList_rn33 strand))) (set gnt:transcript (ontology 'transcript: (string-trim-both (field GeneList_rn33 NM_ID)))) (set - gnt:hasKgID + gnc:has_kg_id (string-trim-both (field GeneList_rn33 kgID))) (set dct:references (let ((symbol (field GeneList_rn33 geneSymbol))) @@ -295,7 +289,7 @@ "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a" "https://www.ebi.ac.uk/gwas/search?query=" (string-trim-both symbol) - "a gnc:ebiGwasLink")) + "a gnc:ebi_gwas_link")) ""))) (set dct:references (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol)))) @@ -306,7 +300,7 @@ "http://string-db.org/newstring_cgi/show_network_section.pl?identifier=" (uri-encode (string-trim-both symbol)) - "a gnc:stringLink")) + "a gnc:panther_link")) ""))) (set dct:references (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol)))) @@ -317,7 +311,7 @@ "https://www.gtexportal.org/home/gene/" (uri-encode (string-trim-both symbol)) - "a gnc:gtexLink")) + "a gnc:gtex_link")) ""))) (set dct:references (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol)))) @@ -328,7 +322,7 @@ "http://www.proteinatlas.org/search/" (uri-encode (string-trim-both symbol)) - "a gnc:proteinAtlasLink")) + "a gnc:protein_atlas_link")) ""))))) @@ 
-349,10 +343,10 @@ (connection %connection-settings) (table-metadata? #f) (prefixes - '(("gn:" "<http://genenetwork.org/id/>") - ("probeset:" "<http://genenetwork.org/probeset/>") - ("gnc:" "<http://genenetwork.org/category/>") - ("gnt:" "<http://genenetwork.org/term/>") + '(("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("probeset:" "<http://rdf.genenetwork.org/v1/probeset/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("dct:" "<http://purl.org/dc/terms/>") diff --git a/examples/generif.scm b/examples/generif.scm index 1809737..3b794fa 100755 --- a/examples/generif.scm +++ b/examples/generif.scm @@ -11,196 +11,124 @@ (transform strings) (transform sql) (transform triples) - (transform special-forms) - (transform uuid)) + (transform special-forms)) -(define (fix-email-id email) - (string-delete #\space email)) - -(define (investigator-attributes->id first-name last-name email) - ;; There is just one record corresponding to "Evan Williams" which - ;; does not have an email ID. To accommodate that record, we - ;; construct the investigator ID from not just the email ID, but - ;; also the first and the last names. It would be preferable to just - ;; find Evan Williams' email ID and insert it into the database. - (string->identifier "investigator" - (string-join - (list first-name last-name (fix-email-id email)) - "_"))) - - - -(define-transformer genewiki-symbols - (tables (GeneRIF_BASIC) - "GROUP BY BINARY symbol") - (triples - (string->identifier - "symbol" - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field GeneRIF_BASIC symbol) - 'pre "_" 'post) - #:proc (lambda (x) x)) - (set rdfs:label - (field GeneRIF_BASIC symbol)))) - -;; Some symbols exist in the RIF table that don't exist in the GeneRIF -;; table. 
-(define-transformer generif-symbols - (tables (GeneRIF) - "WHERE symbol NOT IN (SELECT symbol from GeneRIF_BASIC) GROUP BY BINARY symbol") - (triples - (string->identifier - "symbol" - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field GeneRIF symbol) - 'pre "_" 'post) - #:proc (lambda (x) x)) - (set rdfs:label - (field GeneRIF symbol)))) - (define-transformer gn-genewiki-entries (tables (GeneRIF (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId") (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id") - (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id") - (left-join Investigators "ON Investigators.Email = GeneRIF.email")) - "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, BINARY GeneRIF.symbol") + (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")) + "WHERE GeneRIF.display > 0 AND GeneRIF.comment IS NOT NULL +GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol") (schema-triples - (gnc:GeneWikiEntry a rdfs:Class) - (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry) - (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork") - (gnt:geneSymbol rdfs:domain gnc:GNWikiEntry)) + (gnc:gene_wiki_entry a rdfs:Class) + (gnc:gn_wiki_entry rdfs:subClassOf gnc:gene_wiki_entry) + (gnt:initial a owl:ObjectProperty) + (gnt:initial rdfs:domain gnc:gene_wiki_entry) + (gnt:initial skos:definition "Optional user or project code or your initials") + (gnt:reason a owl:ObjectProperty) + (gnt:reason rdfs:domain gnc:gene_wiki_entry) + (gnt:reason skos:definition "The reason why this resource was modified") + (gnc:gn_wiki_entry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork") + (gnt:gene_symbol rdfs:domain gnc:gn_wiki_entry)) (triples (string->identifier - "symbol" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field GeneRIF symbol) - 'pre "_" 'post) - #:proc (lambda (x) x)) - (set rdfs:comment - (let* 
([generif-comment (sanitize-rdf-string (field GeneRIF comment))] - [create-time (field GeneRIF createtime EntryCreateTime)] - [pmid (field GeneRIF PubMed_ID PMID)] - [web-url (field GeneRIF weburl)] - [species (string->identifier - "" - (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)] - [version-id (field GeneRIF versionId)] - [identifier (field GeneRIF Id)] - [categories - (remove (lambda (x) - (or (eq? x #f) - (and (string? x) - (string-null? x)))) - (remove-duplicates - (string-split-substring - (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')" - GeneCategory)) - "$$")))]) - (string->symbol - (string-append - "[ " - (format #f "rdf:type gnc:GNWikiEntry ; ") - (if (string? species) - "" - (format #f "gnt:belongsToSpecies ~a ; " - species)) - (format #f "rdfs:comment ~s^^xsd:string ; " - generif-comment) - (if (string? create-time) - "" - (format #f "dct:created ~s^^xsd:datetime ; " - (time-unix->string - create-time "~5"))) - (if (and (string? pmid) (not (string-null? pmid))) - (format #f - "~{dct:references pubmed:~a ; ~}" - (string-split pmid #\space)) - "") - (if (and (not (string-null? - (string-trim-both (field GeneRIF email)))) - (not (string-null? (field Investigators Email)))) - (format #f "dct:creator ~a ; " - (investigator-attributes->id - (field Investigators FirstName) - (field Investigators LastName) - (field Investigators Email))) - "") - (format #f "dct:identifier ~s ; " identifier) - (format #f "dct:hasVersion \"~s\"^^xsd:int ; " version-id) - (if (not (null? categories)) - (format #f - "~{gnt:belongsToCategory ~s ; ~}" - categories) - "") - (if (and (string? web-url) (not (string-null? 
web-url))) - (format #f "foaf:homepage ~s ; " - web-url) - "") - " ] ")))))) + "wiki" (format #f "~a_~a" + (field GeneRIF Id) + (field GeneRIF versionId)) + #:separator "_") + (set rdfs:label (string->symbol + (format #f "'~a'@en" + (replace-substrings + (sanitize-rdf-string + (field GeneRIF comment)) + '(("'" . "\\'")))))) + (set rdf:type 'gnc:gn_wiki_entry) + (set gnt:symbol (field GeneRIF symbol)) + (set gnt:has_species (string->identifier "" (remap-species-identifiers (field Species Fullname)))) + (set dct:created + (string->symbol + (format #f "~s^^xsd:datetime " + (field + ("CAST(createtime AS CHAR)" EntryCreateTime))))) + (multiset dct:references + (map (lambda (pmid) + (match pmid + ((? string-blank? p) "") + (p (string->symbol + (format #f "pubmed:~a" (string-trim-both pmid)))))) + (string-split (field GeneRIF PubMed_ID PMID) + #\space))) + (set foaf:mbox + (match (sanitize-rdf-string (field GeneRIF email)) + ((? string-blank? mbox) "") + (mbox (string->symbol + (format #f "<~a>" mbox))))) + (set dct:identifier (annotate-field (format #f "~s" (field GeneRIF Id)) + '^^xsd:integer)) + (set foaf:homepage + (match (sanitize-rdf-string (field GeneRIF weburl)) + ((? string-blank? 
homepage) "") + (homepage (string->symbol + (format #f "<~a>" homepage))))) + (set dct:hasVersion (annotate-field (format #f "~s" (field GeneRIF versionId)) + '^^xsd:integer)) + (set gnt:initial (sanitize-rdf-string (field GeneRIF initial))) + (set gnt:reason (field GeneRIF reason)) + (multiset gnt:belongs_to_category + (string-split + (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR ';')" + GeneCategory)) + #\;)))) (define-transformer ncbi-genewiki-entries (tables (GeneRIF_BASIC - (left-join Species "USING (SpeciesId)")) - "WHERE GeneRIF_BASIC.comment IS NOT NULL AND TRIM(GeneRIF_BASIC.comment) != '' AND TRIM(GeneRIF_BASIC.symbol) != '' GROUP BY GeneRIF_BASIC.comment, GeneRIF_BASIC.createtime, GeneRIF_BASIC.VersionId, GeneRIF_BASIC.SpeciesId, GeneRIF_BASIC.TaxID") + (left-join Species "USING (SpeciesId)"))) (schema-triples - (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry) - (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI") - (gnt:hasVersionId a owl:ObjectProperty) - (gnt:hasVersionId rdfs:domain gnc:NCBIWikiEntry) - (gnt:hasVersionId skos:definition "The VersionId of this this resource")) + (gnc:ncbi_wiki_entry rdfs:subClassOf gnc:gene_wiki_entry) + (gnc:ncbi_wiki_entry rdfs:comment "Represents GeneRIF Entries obtained from NCBI")) (triples (string->identifier - "symbol" - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field GeneRIF_BASIC symbol GeneRIFSymbol) - 'pre "_" 'post) - #:proc (lambda (x) x)) - (set rdfs:comment - (let ([ncbi-comment (sanitize-rdf-string (field GeneRIF_BASIC comment))] - [species-name - (string->identifier - "" - (remap-species-identifiers (field Species Fullname SpeciesFullName)) - #:separator "" - #:proc string-capitalize-first)] - [taxonomic-id (field GeneRIF_BASIC TaxID TaxonomicId)] - [create-time (field GeneRIF_BASIC createtime EntryCreateTime)] - [pmid (field GeneRIF_BASIC PubMed_ID PMID)] - [gene-id (field GeneRIF_BASIC GeneId)] - [version-id (field GeneRIF_BASIC VersionId)]) + 
"rif" (format #f "~a_~a_~a_~a" + (field GeneRIF_BASIC GeneId) + (field GeneRIF_BASIC PubMed_ID) + (field ("DATE_FORMAT(createtime, '%Y-%m-%dT%T')" CreateTime)) + (field GeneRIF_BASIC VersionId)) + #:separator "_") + (set rdf:type + (let* ((comment (format #f "'~a'@en" + (replace-substrings + (sanitize-rdf-string + (field GeneRIF_BASIC comment)) + '(("\\" . "\\\\") + ("\n" . "\\n") + ("\r" . "\\r") + ("'" . "\\'"))))) + (create-time (format #f "~s^^xsd:datetime" + (field + ("CAST(createtime AS CHAR)" EntryCreateTime)))) + (symbol (field GeneRIF_BASIC symbol)) + (species (string->identifier "" (remap-species-identifiers (field Species Fullname)))) + (gene-id (field GeneRIF_BASIC GeneId)) + (taxon-id (field GeneRIF_BASIC TaxID TaxonomicId)) + (pmid (field GeneRIF_BASIC PubMed_ID)) + (version-id (field GeneRIF_BASIC versionId))) (string->symbol (string-append - "[ " - (format #f "rdf:type gnc:NCBIWikiEntry ; ") - (format #f "rdfs:comment ~s^^xsd:string ; " - ncbi-comment) - (format #f "gnt:belongsToSpecies ~a ; " - species-name) - (if (eq? #f taxonomic-id) - "" - (format #f "skos:notation taxon:~a ; " - taxonomic-id)) - (format #f "gnt:hasGeneId generif:~a ; " - gene-id) - (format #f "dct:hasVersion '~a'^^xsd:int ; " - version-id) - (if (and (string? pmid) (not (string-null? pmid))) - (format #f - "~{dct:references pubmed:~a ; ~}" - (string-split pmid #\space)) - "") - (if (string? create-time) - "" - (format #f "dct:created ~s^^xsd:datetime ; " - (time-unix->string - create-time "~5"))) - " ]")))))) + (format #f "gnc:ncbi_wiki_entry ;\n") + (format #f "\trdfs:label ~a ;\n" comment) + (format #f "\tgnt:has_species ~a ;\n" species) + (format #f "\tgnt:symbol ~s ;\n" symbol) + (format #f "\tgnt:has_gene_id generif:~a ;\n" gene-id) + (match taxon-id + ((? number? 
x) + (format #f "\tskos:notation taxon:~a ;\n" taxon-id)) + (else "")) + (format #f "\tdct:hasVersion \"~a\"^^xsd:integer ;\n" version-id) + (format #f "\tdct:references pubmed:~a ;\n" pmid) + (format #f "\tdct:created ~a" create-time))))))) @@ -225,11 +153,11 @@ ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") - ("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") - ("gnt:" "<http://genenetwork.org/term/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("dct:" "<http://purl.org/dc/terms/>") - ("foaf:" "<http://xmlns.com/foaf/0.1/>") + ("foaf:" "<http://xmlns.com/foaf/0.1/#term_>") ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>") ("taxon:" "<https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=>") ("generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>") @@ -237,10 +165,9 @@ ("owl:" "<http://www.w3.org/2002/07/owl#>"))) (inputs (list - genewiki-symbols - generif-symbols - gn-genewiki-entries - ncbi-genewiki-entries)) + ;; gn-genewiki-entries + ncbi-genewiki-entries + )) (outputs `(#:documentation ,documentation #:rdf ,output)))) diff --git a/examples/genotype-datasets.scm b/examples/genotype-datasets.scm new file mode 100755 index 0000000..ebe2349 --- /dev/null +++ b/examples/genotype-datasets.scm @@ -0,0 +1,87 @@ +#! 
/usr/bin/env guile +!# + +(use-modules (rnrs programs) + (rnrs io ports) + (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms)) + + +(define-transformer gn:set->gn:dataset + (tables (Species + (inner-join InbredSet "ON InbredSet.SpeciesId = Species.Id") + (inner-join GenoFreeze "ON GenoFreeze.InbredSetId = InbredSet.Id")) + "WHERE GenoFreeze.public > 0 AND Species.Name != 'monkey' GROUP BY Species.Name, GenoFreeze.ShortName") + (triples (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_") + (multiset gnt:has_genotype_data + (map (cut string->identifier "dataset" <> #:separator "_") + (string-split + (field ("GROUP_CONCAT(GenoFreeze.Name SEPARATOR ',')" + dataset_name)) + #\,))))) + +(define-transformer gn:dataset->metadata + (tables (GenoFreeze + (inner-join InbredSet "ON InbredSet.Id = GenoFreeze.InbredSetId") + (inner-join Species "ON InbredSet.SpeciesId = Species.Id")) + "WHERE GenoFreeze.public > 0 AND Species.Name != 'monkey'") + (triples (string->identifier "dataset" (field GenoFreeze Name) #:separator "_") + (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")) + (set dct:created (annotate-field (field GenoFreeze CreateTime) '^^xsd:datetime)))) + +(define-transformer gn:dataset->marker/snp-count + (tables (GenoFreeze + (inner-join InbredSet "ON InbredSet.Id = GenoFreeze.InbredSetId") + (inner-join Species "ON InbredSet.SpeciesId = Species.Id") + (inner-join Geno "ON Geno.SpeciesId = Species.Id")) + "WHERE GenoFreeze.public > 0 AND Species.Name != 'monkey' GROUP BY GenoFreeze.Name") + (triples (string->identifier "dataset" (field GenoFreeze Name) #:separator "_") + (set gnt:has_marker_count + (string->symbol + (format #f "'~s'^^xsd:integer" + (field + ("COUNT(DISTINCT Geno.Marker_Name)" MarkerCount))))))) + + +(let* ((option-spec + '((settings (single-char #\s) 
(value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (with-documentation + (name "Genotype Datasets") + (connection %connection-settings) + (table-metadata? #f) + (prefixes + '(("dcat:" "<http://www.w3.org/ns/dcat#>") + ("dct:" "<http://purl.org/dc/terms/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>"))) + (inputs + (list + gn:set->gn:dataset + gn:dataset->metadata + gn:dataset->marker/snp-count)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))) diff --git a/examples/genotype.scm b/examples/genotype.scm index 7e72cf8..4a91b63 100755 --- a/examples/genotype.scm +++ b/examples/genotype.scm @@ -12,74 +12,22 @@ (transform sql) (transform triples) (transform special-forms)) - - -(define-transformer genotypes +(define-transformer gn:markers/snps->metadata (tables (Geno - (left-join Species "USING (SpeciesId)"))) - (schema-triples - (gnt:chr a owl:ObjectProperty) - (gnt:chr skos:description "This resource is located on a given chromosome") - (gnt:chr rdfs:domain gnc:Genotype) - (gnt:mb a owl:ObjectProperty) - (gnt:mb skos:definition "The size of this resource in Mb") - (gnt:mb rdfs:domain gnc:Genotype) - (gnt:mbMm8 a owl:ObjectProperty) - (gnt:mbMm8 skos:definition "TODO") - (gnt:mbMm8 rdfs:domain gnc:Genotype) - 
(gnt:mb2016 a owl:ObjectProperty) - (gnt:mb2016 skos:definition "TODO") - (gnt:mb2016 rdfs:domain gnc:Genotype) - (gnt:hasSequence a owl:ObjectProperty) - (gnt:hasSequence skos:definition "This resource has a given sequence") - (gnt:hasSequence rdfs:domain gnc:Genotype) - (gnt:hasSource a owl:ObjectProperty) - (gnt:hasSource rdfs:domain gnc:Genotype) - (gnt:hasSource skos:definition "This resource was obtained from this given source") - (gnt:hasAltSourceName a owl:ObjectProperty) - (gnt:hasAltSourceName rdfs:domain gnc:Genotype) - (gnt:hasAltSourceName - skos:definition - "The alternative name this resource was obtained from") - (gnt:chrNum a owl:ObjectProperty) - (gnt:chrNum rdfs:domain gnc:Genotype) - (gnt:chrNum skos:definition "The chromosome number for this resource")) - (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field Geno Name) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type 'gnc:Genotype) - (set rdfs:label (sanitize-rdf-string (field Geno Name))) + (inner-join Species "ON Geno.SpeciesId = Species.Id")) + "WHERE Species.Name != 'monkey'") + (triples (string->identifier "marker" (field Geno Name) #:separator "_") + (set gnt:has_species + (string->identifier "" (remap-species-identifiers (field Species Fullname)))) + (set rdf:type 'gnc:dna_marker) + (set skos:prefLabel (field Geno Name)) + (set skos:altLabel (field Geno Marker_Name)) (set gnt:chr (field Geno Chr)) - (set gnt:mb (annotate-field - (field ("IFNULL(Geno.Mb, '')" Mb)) '^^xsd:double)) - (set gnt:mbMm8 (annotate-field (field ("IFNULL(Geno.Mb_mm8, '')" Mb_mm8)) - '^^xsd:double)) - (set gnt:mb2016 - (annotate-field (field ("IFNULL(Geno.Mb_2016, '')" Mb_2016)) - '^^xsd:double)) - (set gnt:hasSequence (field Geno Sequence)) - (set gnt:hasSource (field Geno Source)) - ;; Only transform Source2 if it differs from Source - (set gnt:hasAltSourceName - (field ("IF((Source2 = Source), NULL, Source2)" - Source2))) - (set 
gnt:belongsToSpecies - (string->identifier - "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) - (set gnt:chrNum - (annotate-field - (field Geno chr_num) - '^^xsd:int)) - (set rdfs:comments (field Geno Comments)))) + (set gnt:mb (annotate-field (field Geno Mb) '^^xsd:doubleg)) + (set gnt:sequence (field Geno Sequence)) + (set gnt:source (field Geno Source)) + (set rdfs:comment (field Geno Comments)))) @@ -95,22 +43,28 @@ (call-with-input-file settings read))) (with-documentation - (name "Genotype Metadata") + (name "Phenotypes Metadata") (connection %connection-settings) (table-metadata? #f) (prefixes - '(("dct:" "<http://purl.org/dc/terms/>") - ("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") - ("gnt:" "<http://genenetwork.org/term/>") - ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") - ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + '(("dcat:" "<http://www.w3.org/ns/dcat#>") + ("dct:" "<http://purl.org/dc/terms/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnd:" "<https://cd.genenetwork.org/lmdb/v1/data/traits/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") + ("qb:" "<http://purl.org/linked-data/cube#>") ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") - ("xsd:" "<http://www.w3.org/2001/XMLSchema#>"))) + ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>"))) (inputs - (list genotypes)) + (list gn:markers/snps->metadata)) (outputs `(#:documentation ,documentation #:rdf ,output)))) + diff --git a/examples/investigators.scm b/examples/investigators.scm new 
file mode 100755 index 0000000..8d31974 --- /dev/null +++ b/examples/investigators.scm @@ -0,0 +1,93 @@ +#! /usr/bin/env guile +!# + +(use-modules (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms)) + + +;; One email ID in the Investigators table has spaces in it. This +;; function fixes that. +(define (fix-email-id email) + (string-delete #\space email)) + +(define (investigator-attributes->id first-name last-name email) + ;; There is just one record corresponding to "Evan Williams" which + ;; does not have an email ID. To accommodate that record, we + ;; construct the investigator ID from not just the email ID, but + ;; also the first and the last names. It would be preferable to just + ;; find Evan Williams' email ID and insert it into the database. + (string->identifier "investigator" + (string-join + (list first-name last-name (fix-email-id email)) + "_") + #:separator "_")) + + +(define-transformer investigators + ;; There are a few duplicate entries. We group by email to + ;; deduplicate. 
+ (tables (Investigators) + "GROUP BY Email") + (triples (investigator-attributes->id (field Investigators FirstName) + (field Investigators LastName) + "") + (set rdf:type 'foaf:Person) + (set foaf:name (string-append (field Investigators FirstName) " " + (field Investigators LastName))) + (set foaf:givenName + (field Investigators FirstName)) + (set foaf:familyName + (field Investigators LastName)) + (set foaf:homepage (field Investigators Url)) + (set v:adr (field Investigators Address)) + (set v:locality (field Investigators City)) + (set v:region (field Investigators State)) + (set v:postal-code (field Investigators ZipCode)) + (set v:country-name (field Investigators Country)))) + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (with-documentation + (name "Info files / Investigators Metadata") + (connection %connection-settings) + (table-metadata? 
#f) + (prefixes + '( + ("dcat:" "<http://www.w3.org/ns/dcat#>") + ("dct:" "<http://purl.org/dc/terms/>") + ("foaf:" "<http://xmlns.com/foaf/0.1/#term_>") + ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("taxon:" "<http://purl.uniprot.org/taxonomy/>") + ("v:" "<http://www.w3.org/2006/vcard/ns#>") + ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") + )) + (inputs + (list investigators)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))) diff --git a/examples/molecular-traits-datasets.scm b/examples/molecular-traits-datasets.scm new file mode 100755 index 0000000..34ddf3a --- /dev/null +++ b/examples/molecular-traits-datasets.scm @@ -0,0 +1,100 @@ +#! 
/usr/bin/env guile +!# + +(use-modules (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms)) + + +(define-transformer gn:molecular-trait->gn:dataset + (tables (Tissue)) + (triples (string->identifier "trait" (field Tissue Short_Name) #:separator "_") + (set rdf:type 'gnc:molecular_trait) + (set skos:prefLabel (field Tissue Name)) + (set skos:altLabel (field Tissue Short_Name)))) + +(define-transformer gnc:molecular_trait->gn:molecular_trait + (tables (Tissue)) + (triples "gnc:molecular_trait" + (set skos:member (string->identifier "trait" (field Tissue Short_Name) #:separator "_")))) + +(define-transformer gn:set->gn:dataset + (tables (Species + (inner-join InbredSet "ON InbredSet.SpeciesId = Species.Id") + (inner-join ProbeFreeze "ON ProbeFreeze.InbredSetId = InbredSet.Id") + (inner-join ProbeSetFreeze "ON ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id") + (inner-join Tissue "ON ProbeFreeze.TissueId = Tissue.Id")) + "WHERE ProbeSetFreeze.public > 0 AND Species.Name != 'monkey' GROUP BY Species.Name, Tissue.Short_Name") + (triples (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_") + (multiset gnt:has_probeset_data + (map (cut string->identifier "dataset" <> #:separator "_") + (string-split + (field ("GROUP_CONCAT(ProbeSetFreeze.Name SEPARATOR ',')" + dataset_name)) + #\,))))) + +(define-transformer gn:dataset->metadata + (tables (ProbeSetFreeze + (inner-join ProbeFreeze "ON ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id") + (inner-join InbredSet "ON InbredSet.Id = ProbeFreeze.InbredSetId") + (inner-join Species "ON InbredSet.SpeciesId = Species.Id") + (inner-join Tissue "ON ProbeFreeze.TissueId = Tissue.Id") + (inner-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID") + (inner-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name") + (inner-join Datasets "ON InfoFiles.DatasetId = 
Datasets.DatasetId") + (left-join GeneChip "ON GeneChip.Id = InfoFiles.GeneChipId")) + "WHERE ProbeSetFreeze.public > 0 AND Species.Name != 'monkey'") + (triples (string->identifier "dataset" (field ProbeSetFreeze Name) #:separator "_") + (set dct:created (annotate-field (field ProbeSetFreeze CreateTime) '^^xsd:datetime)) + (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")) + (set gnt:uses_normalization_method + (string->identifier "avg_method" (field AvgMethod Name AvgMethodName) #:separator "_")) + (set gnt:has_molecular_trait + (string->identifier "trait" (field Tissue Short_Name) #:separator "_")) + (set gnt:uses_genechip + (string->identifier "platform" (field GeneChip Name) #:separator "_")))) + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (with-documentation + (name "Molecular Trait Datasets") + (connection %connection-settings) + (table-metadata? 
#f) + (prefixes + '(("dcat:" "<http://www.w3.org/ns/dcat#>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("obo:" "<http://purl.obolibrary.org/obo/>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") + ("dct:" "<http://purl.org/dc/terms/>") + ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>"))) + (inputs + (list + gn:dataset->metadata + gn:molecular-trait->gn:dataset + gn:set->gn:dataset + gnc:molecular_trait->gn:molecular_trait)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))) diff --git a/examples/ontology.scm b/examples/ontology.scm new file mode 100755 index 0000000..724a75a --- /dev/null +++ b/examples/ontology.scm @@ -0,0 +1,272 @@ +#! /usr/bin/env guile +!# + +(use-modules (ice-9 getopt-long) + (transform triples) + (transform schema) + (transform special-forms)) + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings (call-with-input-file settings read))) + (with-output-to-file output + (lambda () + ;; Define all GN ontology in one place. 
+ (prefix "dcat:" "<http://www.w3.org/ns/dcat#>") + (prefix "dct:" "<http://purl.org/dc/terms/>") + (prefix "gn:" "<http://rdf.genenetwork.org/v1/id/>") + (prefix "owl:" "<http://www.w3.org/2002/07/owl#>") + (prefix "gnc:" "<http://rdf.genenetwork.org/v1/category/>") + (prefix "gnt:" "<http://rdf.genenetwork.org/v1/term/>") + (prefix "obo:" "<http://purl.obolibrary.org/obo/>") + (prefix "sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>") + (prefix "skos:" "<http://www.w3.org/2004/02/skos/core#>") + (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>") + (prefix "qb:" "<http://purl.org/linked-data/cube#>") + (prefix "xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>") + (prefix "schema:" "<https://schema.org/>") + (newline) + (triple 'gnc:population_category 'a 'xkos:ClassificationLevel) + (triple 'gnc:population_category 'rdfs:label "Population Category") + (triple 'gnc:population_category 'skos:inScheme 'gnc:resource_classification_scheme) + (triple 'gnc:population_category 'skos:prefLabel "Population Category") + (triple 'gnc:population_category 'xkos:depth "3") + (triple 'gnc:population_category 'xkos:nextLevel 'gnc:set) + (triple 'gnc:population_category 'xkos:previousLevel 'gnc:species) + (triple 'gnc:reference_population 'a 'skos:Concept) + (triple 'gnc:reference_population 'skos:definition "A genetic population") + (triple 'gnc:reference_population 'skos:inScheme 'gnc:population_category) + (triple 'gnc:reference_population 'skos:prefLabel "Reference population") + (triple 'gnc:resource_classification_scheme 'a 'skos:ConceptScheme) + (triple 'gnc:resource_classification_scheme 'skos:definition "A hierarchical classification scheme for organizing GeneNetwork resources by dataset type, resource set (inbredset group), or species.") + (triple 
'gnc:resource_classification_scheme 'skos:prefLabel "GeneNetwork Resource Classification Scheme") + (triple 'gnc:resource_classification_scheme 'xkos:levels 'gnc:population_category) + (triple 'gnc:resource_classification_scheme 'xkos:levels 'gnc:set) + (triple 'gnc:resource_classification_scheme 'xkos:levels 'gnc:species) + (triple 'gnc:resource_classification_scheme 'xkos:levels 'gnc:taxonomic_family) + (triple 'gnc:resource_classification_scheme 'xkos:numberOfLevels "4") + (triple 'gnc:set 'a 'xkos:ClassificationLevel) + (triple 'gnc:set 'skos:definition "A category representing groups of genetically related strains or individuals (inbred sets, recombinant inbred lines, etc.).") + (triple 'gnc:set 'skos:inScheme 'gnc:resource_classification_scheme) + (triple 'gnc:set 'skos:prefLabel "InbredSet Group") + (triple 'gnc:set 'xkos:depth "4") + (triple 'gnc:set 'xkos:previousLevel 'gnc:population_category) + (triple 'gnc:species 'a 'xkos:ClassificationLevel) + (triple 'gnc:species 'skos:definition "A classification level that that associates a given resource to a species in GeneNetwork.") + (triple 'gnc:species 'skos:inScheme 'gnc:resource_classification_scheme) + (triple 'gnc:species 'skos:prefLabel "Species") + (triple 'gnc:species 'xkos:depth "2") + (triple 'gnc:species 'xkos:nextLevel 'gnc:population_category) + (triple 'gnc:species 'xkos:previousLevel 'gnc:taxonomic_family) + (triple 'gnc:taxonomic_family 'a 'xkos:ClassificationLevel) + (triple 'gnc:taxonomic_family 'skos:definition "An organizational classification level used in GeneNetwork to group resources into families.") + (triple 'gnc:taxonomic_family 'skos:inScheme 'gnc:resource_classification_scheme) + (triple 'gnc:taxonomic_family 'skos:prefLabel "Family") + (triple 'gnc:taxonomic_family 'xkos:depth "1") + (triple 'gnc:taxonomic_family 'xkos:nextLevel 'gnc:species) + (triple 'gnt:assigned_species 'a 'owl:ObjectProperty) + (triple 'gnt:assigned_species 'rdfs:domain 'gnc:set) + (triple 
'gnt:assigned_species 'rdfs:label "These families have been assigned to these species") + (triple 'gnt:genetic_type 'a 'owl:DatatypeProperty) + (triple 'gnt:genetic_type 'rdfs:domain 'gnc:set) + (triple 'gnt:genetic_type 'rdfs:label "has genetic type") + (triple 'gnt:genetic_type 'rdfs:range 'xsd:string) + (triple 'gnt:genetic_type 'skos:definition "Describes the genetic architecture of a resource set (e.g., intercross, riset).") + (triple 'gnt:has_family_order_id 'a 'owl:DatatypeProperty) + (triple 'gnt:has_family_order_id 'a 'owl:DatatypeProperty) + (triple 'gnt:has_family_order_id 'rdfs:range 'xsd:integer) + (triple 'gnt:has_set_code 'a 'owl:DatatypeProperty) + (triple 'gnt:has_set_code 'rdfs:domain 'gnc:set) + (triple 'gnt:has_set_code 'rdfs:label "has set code") + (triple 'gnt:has_set_code 'rdfs:range 'xsd:string) + (triple 'gnt:has_set_code 'skos:definition "Provides a unique identifier code for a resource set.") + (triple 'gnt:has_species 'a 'owl:ObjectProperty) + (triple 'gnt:has_species 'rdf:comment "This resource belongs to this species") + (triple 'gnt:has_species 'rdfs:label "belongs to species") + (triple 'gnt:has_species 'rdfs:range 'gnc:species) + (triple 'gnt:has_strain 'a 'owl:ObjectProperty) + (triple 'gnt:has_strain 'rdf:comment "Indicates the group the resources belongs to") + (triple 'gnt:has_strain 'rdfs:domain 'gnc:species) + (triple 'gnt:has_strain 'rdfs:label "this resource belongs to this strain.") + (triple 'gnt:has_strain 'rdfs:range 'gnc:set) + (triple 'gnt:has_strain 'schema:domainIncludes 'dcat:Dataset) + (triple 'gnt:has_strain 'schema:domainIncludes 'gnc:species) + (triple 'gnt:has_strain 'skos:definition "Lists all strains that belong to this resource.") + (triple 'gnt:has_taxonomic_family 'a 'owl:ObjectProperty) + (triple 'gnt:has_taxonomic_family 'rdfs:label "has family") + (triple 'gnt:has_taxonomic_family 'schema:domainIncludes 'gnc:set) + (triple 'gnt:has_taxonomic_family 'schema:domainIncludes 'gnc:species) + (triple 
'gnt:has_taxonomic_family 'skos:definition "Links a species to its taxonomic family") + (triple 'gnt:has_uniprot_taxon_id 'a 'owl:ObjectProperty) + (triple 'gnt:has_uniprot_taxon_id 'rdfs:label "has uniprot taxonomic id") + (triple 'gnt:population_category 'skos:definition "Classification of genetic populations by breeding design and data aggregation.") + (triple 'gnt:short_name 'a 'owl:DatatypeProperty) + (triple 'gnt:short_name 'rdfs:domain 'gnc:species) + (triple 'gnt:short_name 'rdfs:label "has short name") + (triple 'gnt:short_name 'skos:definition "The short name of a given resource") + (triple 'gnt:uses_mapping_method 'a 'owl:ObjectProperty) + (triple 'gnt:uses_mapping_method 'rdfs:comment "The method used to map genetic or experimental data for this resource.") + (triple 'gnt:uses_mapping_method 'rdfs:domain 'gnc:set) + (triple 'gnt:uses_mapping_method 'rdfs:label "mapping method") + (triple 'gnt:uses_mapping_method 'rdfs:range 'gnc:mapping_method) + + ;; Describing Datasets + (triple 'gnc:molecular_trait 'a 'owl:Class) + (triple 'gnc:molecular_trait 'a 'skos:Concept) + (triple 'gnc:molecular_trait 'rdfs:label "Molecular Trait. This describes a melecular trait of a given species. 
We combine the species name and the tissue name in order to differentiate the traits across different inbredset groups.") + (triple 'gnc:molecular_trait 'rdfs:subClassOf 'obo:UBERON_0000479) + (triple 'gnt:has_case_info 'a 'owl:ObjectProperty) + (triple 'gnt:has_case_info 'rdfs:comment "Information about the cases used in this platform") + (triple 'gnt:has_case_info 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_case_info 'rdfs:label "About Case") + (triple 'gnt:has_citation 'a 'owl:ObjectProperty) + (triple 'gnt:has_citation 'rdfs:comment "Citation for this dataset") + (triple 'gnt:has_citation 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_citation 'rdfs:label "Citation") + (triple 'gnt:has_contributors 'a 'owl:ObjectProperty) + (triple 'gnt:has_contributors 'rdfs:comment "Contributors of this resource") + (triple 'gnt:has_contributors 'rdfs:comment "Contributors of this resource") + (triple 'gnt:has_contributors 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_contributors 'rdfs:label "Contributors") + (triple 'gnt:has_data_processing_info 'a 'owl:ObjectProperty) + (triple 'gnt:has_data_processing_info 'rdfs:comment "Information about how this dataset was processed") + (triple 'gnt:has_data_processing_info 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_data_processing_info 'rdfs:label "About Data Processing") + (triple 'gnt:has_experiment_design 'a 'owl:ObjectProperty) + (triple 'gnt:has_experiment_design 'rdfs:comment "Experiment Design for this resource") + (triple 'gnt:has_experiment_design 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_experiment_design 'rdfs:label "Experiment Design") + (triple 'gnt:has_experiment_design_info 'a 'owl:ObjectProperty) + (triple 'gnt:has_experiment_design_info 'rdfs:comment "Information about how the experiment was designed") + (triple 'gnt:has_experiment_design_info 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_experiment_design_info 'rdfs:label "Experiment Design") + (triple 'gnt:has_experiment_type 'a 'owl:ObjectProperty) + 
(triple 'gnt:has_experiment_type 'rdfs:comment "Information about the experiment type") + (triple 'gnt:has_experiment_type 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_experiment_type 'rdfs:label "Experiment Type Metadata") + (triple 'gnt:has_molecular_trait 'rdf:type 'owl:ObjectProperty) + (triple 'gnt:has_molecular_trait 'rdfs:domain 'gnc:set) + (triple 'gnt:has_molecular_trait 'rdfs:label "has molecular trait") + (triple 'gnt:has_molecular_trait 'rdfs:range 'gnc:molecular_trait) + (triple 'gnt:has_phenotype_data 'rdf:type 'owl:ObjectProperty) + (triple 'gnt:has_phenotype_data 'rdfs:comment "Associates a resource with its phenotype data.") + (triple 'gnt:has_phenotype_data 'rdfs:domain 'gnc:set) + (triple 'gnt:has_phenotype_data 'rdfs:label "this resources has this phenotype data.") + (triple 'gnt:has_phenotype_data 'rdfs:range 'dcat:Dataset) + (triple 'gnt:has_phenotype_data 'rdfs:subPropertyOf 'dct:relation) + (triple 'gnt:has_platform_info 'a 'owl:ObjectProperty) + (triple 'gnt:has_platform_info 'rdfs:comment "Information about the platform that was used with this dataset") + (triple 'gnt:has_platform_info 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_platform_info 'rdfs:label "About Platform") + (triple 'gnt:has_probeset_data 'rdf:type 'owl:ObjectProperty) + (triple 'gnt:has_probeset_data 'rdfs:comment "Associates a resource with this probeset data.") + (triple 'gnt:has_probeset_data 'rdfs:domain 'gnc:set) + (triple 'gnt:has_probeset_data 'rdfs:label "this resources has this probeset data.") + (triple 'gnt:has_probeset_data 'rdfs:range 'gnc:molecular_trait) + (triple 'gnt:has_probeset_data 'rdfs:subPropertyOf 'dct:relation) + (triple 'gnt:has_samples 'a 'owl:ObjectProperty) + (triple 'gnt:has_samples 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_samples 'rdfs:label "Samples") + (triple 'gnt:has_specifics 'a 'owl:ObjectProperty) + (triple 'gnt:has_specifics 'rdfs:comment "Has specifics") + (triple 'gnt:has_specifics 'rdfs:domain 'dcat:Dataset) + (triple 
'gnt:has_specifics 'rdfs:label "Specifics") + (triple 'gnt:has_summary 'a 'owl:ObjectProperty) + (triple 'gnt:has_summary 'rdfs:comment "Summary information about dataset") + (triple 'gnt:has_summary 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_summary 'rdfs:label "Summary") + (triple 'gnt:has_tissue_info 'a 'owl:ObjectProperty) + (triple 'gnt:has_tissue_info 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_tissue_info 'rdfs:label "Metadata about Tissue for this resource") + (triple 'gnt:uses_genechip 'a 'owl:ObjectProperty) + (triple 'gnt:uses_genechip 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:uses_genechip 'skos:definition "The Platform this resource uses..") + (triple 'gnt:uses_normalization_method 'rdfs:comment "The normalization method used for the molecular traits in this dataset") + (triple 'gnt:uses_normalization_method 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:uses_normalization_method 'rdfs:label "Averaging method used for the molecular traits in this dataset.") + (triple 'gnt:uses_normalization_method 'rdfs:range 'gnc:avg_method) + + ;; Describing phenotypes + (triple 'gnc:phenotype 'a 'owl:Class) + (triple 'gnc:phenotype 'a 'skos:Concept) + (triple 'gnc:phenotype 'rdfs:label "A phenotype.") + (triple 'gnc:phenotype_trait 'a 'owl:Class) + (triple 'gnc:phenotype_trait 'a 'skos:Concept) + (triple 'gnc:phenotype_trait 'rdfs:label "A phenotype trait.") + (triple 'gnt:abbreviation 'a 'owl:ObjectProperty) + (triple 'gnt:abbreviation 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:abbreviation 'skos:definition "The abbreviation used for this resource") + (triple 'gnt:additive 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:additive 'rdfs:range 'xsd:double) + (triple 'gnt:lab_code 'a 'owl:ObjectProperty) + (triple 'gnt:lab_code 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:locus 'a 'qb:MeasureProperty) + (triple 'gnt:locus 'a 'rdf:Property) + (triple 'gnt:locus 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:locus 'rdfs:range 'rdfs:Literal) + (triple 'gnt:locus 
'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:lod_score 'a 'qb:MeasureProperty) + (triple 'gnt:lod_score 'a 'rdf:Property) + (triple 'gnt:lod_score 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:lod_score 'rdfs:label "Peak -logP") + (triple 'gnt:lod_score 'rdfs:range 'xsd:double) + (triple 'gnt:lod_score 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:lod_score 'skos:definition "Statistical measurement assessing the likelihood of genetic linkage between traits or genetic markers.") + (triple 'gnt:mean 'a 'qb:MeasureProperty) + (triple 'gnt:mean 'a 'rdf:Property) + (triple 'gnt:mean 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:mean 'rdfs:range 'xsd:double) + (triple 'gnt:mean 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:sequence 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:sequence 'rdfs:range 'xsd:integer) + (triple 'gnt:submitter 'a 'owl:ObjectProperty) + (triple 'gnt:submitter 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:submitter 'skos:definition "A person who submitted this resource to GN") + (triple 'gnt:submitter 'skos:definition "A person who submitted this resource to GN") + (triple 'gnt:has_phenotype_data 'a 'owl:ObjectProperty) + (triple 'gnt:has_phenotype_data 'rdfs:domain 'gnc:set) + (triple 'gnt:has_phenotype_data 'skos:definition "This resource has phenotype data.") + + ;; Genotypes + (triple 'gnc:dna_marker 'a 'owl:Class) + (triple 'gnc:dna_marker 'a 'skos:Concept) + (triple 'gnc:dna_marker 'rdfs:label "A DNA Marker or SNP") + (triple 'gnt:has_genotype_files 'rdfs:label "This resource has these genotype files") + (triple 'gnt:has_genotype_files 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_genotype_data 'rdf:type 'owl:ObjectProperty) + (triple 'gnt:has_genotype_data 'rdfs:label "this resources has genotype data.") + (triple 'gnt:has_genotype_data 'rdfs:comment "Associates a resource with its genotype data.") + (triple 'gnt:has_genotype_data 'rdfs:domain 'gnc:set) + (triple 'gnt:has_genotype_data 'rdfs:range 
'dcat:Dataset) + (triple 'gnt:has_genotype_data 'rdfs:subPropertyOf 'dct:relation) + (triple 'gnt:has_marker_count 'rdf:type 'owl:ObjectProperty) + (triple 'gnt:has_marker_count 'rdfs:label "this resources has N number of dna markers/SNPs.") + (triple 'gnt:has_marker_count 'rdfs:domain 'xsd:integer) + (triple 'gnt:has_marker_count 'rdfs:range 'dcat:Dataset) + (triple 'gnt:chr 'a 'qb:MeasureProperty) + (triple 'gnt:chr 'a 'rdf:Property) + (triple 'gnt:chr 'rdfs:label "Chromosome") + (triple 'gnt:chr 'rdfs:domain 'gnc:marker) + (triple 'gnt:chr 'rdfs:range 'rdfs:Literal) + (triple 'gnt:chr 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:mb 'rdfs:label "Megabase") + (triple 'gnt:mb 'rdfs:domain 'gnc:marker) + (triple 'gnt:mb 'rdfs:range 'rdfs:Literal) + (triple 'gnt:mb 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:sequence 'rdfs:label "Sequence") + (triple 'gnt:sequence 'rdfs:domain 'gnc:marker) + (triple 'gnt:sequence 'rdfs:range 'rdfs:Literal) + (triple 'gnt:sequence 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:source 'rdfs:label "Source") + (triple 'gnt:source 'rdfs:domain 'gnc:marker) + (triple 'gnt:source 'rdfs:range 'rdfs:Literal) + (triple 'gnt:source 'rdfs:subPropertyOf 'sdmx-measure:obsValue)))) diff --git a/examples/phenotype-datasets.scm b/examples/phenotype-datasets.scm new file mode 100755 index 0000000..4819627 --- /dev/null +++ b/examples/phenotype-datasets.scm @@ -0,0 +1,109 @@ +#! 
/usr/bin/env guile +!# + +(use-modules (rnrs programs) + (rnrs io ports) + (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms)) + + +(define-transformer gn:set->gn:dataset + (tables (Species + (inner-join InbredSet "ON InbredSet.SpeciesId = Species.Id") + (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id")) + "WHERE PublishFreeze.public > 0 AND Species.Name != 'monkey' GROUP BY Species.Name, PublishFreeze.ShortName") + (triples (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_") + (multiset gnt:has_phenotype_data + (map (cut string->identifier "dataset" <> #:separator "_") + (string-split + (field ("GROUP_CONCAT(PublishFreeze.Name SEPARATOR ',')" + dataset_name)) + #\,))))) + +(define-transformer gn:dataset->gn:set + (tables (Datasets + (inner-join InfoFiles "ON InfoFiles.DatasetId = Datasets.DatasetId") + (inner-join InbredSet "ON InbredSet.Id = InfoFiles.InbredSetId") + (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id")) + "WHERE PublishFreeze.public > 0 GROUP BY Datasets.DatasetId") + (triples (string->identifier "dataset" (field PublishFreeze Name) #:separator "_") + (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) + +(define-transformer gn:dataset->metadata + (tables (PublishXRef + (inner-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") + (inner-join Species "ON InbredSet.SpeciesId = Species.Id") + (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id") + (inner-join Publication "ON Publication.Id = PublishXRef.PublicationId") + (inner-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId")) + "WHERE InbredSet.public > 0 GROUP BY Species.Name, PublishFreeze.Name") + (triples (string->identifier "dataset" (field PublishFreeze Name) #:separator "_") + (set dct:created 
(annotate-field (field PublishFreeze CreateTime) '^^xsd:dateTime)) + (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) + +(define-transformer gn:dataset->gn:trait + (tables (PublishXRef + (inner-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") + (inner-join Species "ON InbredSet.SpeciesId = Species.Id") + (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id") + (inner-join Publication "ON Publication.Id = PublishXRef.PublicationId") + (inner-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId")) + "WHERE InbredSet.public > 0") + (triples (string->identifier "dataset" (field PublishFreeze Name) #:separator "_") + (set gnt:has_phenotype_trait + (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) + (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) + (post-desc (blank-p (field Phenotype Post_publication_description))) + (pre-desc (blank-p (field Phenotype Pre_publication_description)))) + (string->identifier + "trait" + (format #f "~a_~a" (field PublishFreeze Name) + (or post-abbrev pre-abbrev post-desc pre-desc)) + #:separator "_"))) + (set dct:created (annotate-field (field PublishFreeze CreateTime) '^^xsd:dateTime)) + (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (with-documentation + (name "Phenotype Datasets") + (connection %connection-settings) + (table-metadata? 
#f) + (prefixes + '(("dcat:" "<http://www.w3.org/ns/dcat#>") + ("dct:" "<http://purl.org/dc/terms/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>") + ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>"))) + (inputs + (list + gn:set->gn:dataset + gn:dataset->gn:set + gn:dataset->metadata + gn:dataset->gn:trait)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))) diff --git a/examples/phenotype.scm b/examples/phenotype.scm index aa1e9c5..37bbd59 100755 --- a/examples/phenotype.scm +++ b/examples/phenotype.scm @@ -14,100 +14,112 @@ (transform special-forms)) -(define-transformer phenotypes - (tables (PublishXRef - (left-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") - (left-join Publication "ON Publication.Id = PublishXRef.PublicationId") - (left-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId"))) - (schema-triples - (gnt:traitId a owl:ObjectProperty) - (gnt:traitId rdfs:domain gnc:Phenotype) - (gnt:traitId skos:definition "This is the unique trait id assigned from GeneNetwork") - (gnt:abbreviation a owl:ObjectProperty) - (gnt:abbreviation rdfs:domain gnc:Phenotype) - (gnt:abbreviation skos:definition "The abbreviation used for this resource") - (gnt:labCode a owl:ObjectProperty) - (gnt:labCode rdfs:domain gnc:Phenotype) - (gnt:submitter a owl:ObjectProperty) - (gnt:submitter rdfs:domain gnc:Phenotype) - (gnt:submitter skos:definition "A person who submitted this resource to GN") - (gnt:mean a rdf:Property) - (gnt:mean a qb:MeasureProperty) - (gnt:mean rdfs:subPropertyOf sdmx-measure:obsValue) - (gnt:mean rdfs:domain gnc:Phenotype) - (gnt:mean rdfs:range 
xsd:double) - (gnt:lodScore a rdf:Property) - (gnt:lodScore a qb:MeasureProperty) - (gnt:lodScore rdfs:subPropertyOf sdmx-measure:obsValue) - (gnt:lodScore rdfs:domain gnc:Phenotype) - (gnt:lodScore rdfs:range xsd:double) - (gnt:lodScore rdfs:label "Peak -logP") - (gnt:lodScore skos:definition "Statistical measurement assessing the likelihood of genetic linkage between traits or genetic markers.") - (gnt:locus a rdf:Property) - (gnt:locus a qb:MeasureProperty) - (gnt:locus rdfs:subPropertyOf sdmx-measure:obsValue) - (gnt:locus rdfs:domain gnc:Phenotype) - (gnt:locus rdfs:range rdfs:Literal) - (gnt:additive rdfs:domain gnc:Phenotype) - (gnt:additive rdfs:range xsd:double) - (gnt:sequence rdfs:domain gnc:Phenotype) - (gnt:sequence rdfs:range xsd:integer)) - (triples (string->identifier - "trait" - (field ("CONCAT(IFNULL(InbredSet.InbredSetCode, PublishXRef.InbredSetId), '_', PublishXRef.Id)" - Phenotype))) - (set rdf:type 'gnc:Phenotype) - (set gnt:belongsToGroup - (string->identifier - "set" (field InbredSet Name InbredSetName) - #:separator "" - #:proc string-capitalize-first)) - ;; This is the trait's name - (set gnt:traitId - (let ((trait-id (field PublishXRef Id))) - (if (number? 
trait-id) - (number->string trait-id) - trait-id))) - (set skos:altLabel - (field ("CONCAT(IFNULL(InbredSet.InbredSetCode, PublishXRef.InbredSetId), '_', PublishXRef.Id)" - Phenotype))) + + + + + + + +(define-transformer gnc:phenotype->gn:phenotype + (tables (Phenotype)) + (triples "gnc:phenotype" + (set skos:member + (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) + (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) + (post-desc (blank-p (field Phenotype Post_publication_description))) + (pre-desc (blank-p (field Phenotype Pre_publication_description)))) + (string->identifier + "phenotype" + (or post-abbrev pre-abbrev post-desc pre-desc) + #:separator "_"))))) + +(define-transformer gn:phenotype->metadata + (tables (Phenotype)) + (triples (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) + (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) + (post-desc (blank-p (field Phenotype Post_publication_description))) + (pre-desc (blank-p (field Phenotype Pre_publication_description)))) + (string->identifier + "phenotype" + (or post-abbrev pre-abbrev post-desc pre-desc) + #:separator "_")) + (set rdf:type 'gnc:phenotype) ;; All phenotypes have a post-publication description (set dct:description (sanitize-rdf-string (field Phenotype Post_publication_description))) ;; All phenotypes have a post-publication abbreviation - (set gnt:abbreviation (field Phenotype Post_publication_abbreviation)) - (set gnt:labCode (field Phenotype Lab_code)) + (set gnt:abbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation))) + (set gnt:has_lab_code (field Phenotype Lab_code)) (set gnt:submitter (sanitize-rdf-string (field Phenotype Submitter))) (set dct:contributor (sanitize-rdf-string (field Phenotype Owner))) - (set gnt:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean)) - '^^xsd:double)) - (set gnt:locus - (string->identifier - "" - 
(regexp-substitute/global - #f "[^A-Za-z0-9:]" - (sanitize-rdf-string (field PublishXRef Locus)) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first)) - (set gnt:lodScore (annotate-field - (field ("IFNULL((PublishXRef.LRS/4.604), '')" lrs)) - '^^xsd:double)) - (set gnt:additive - (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) - '^^xsd:double)) - (set gnt:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:integer)) - (set dct:isReferencedBy + (set skos:member + (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) + (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) + (post-desc (blank-p (field Phenotype Post_publication_description))) + (pre-desc (blank-p (field Phenotype Pre_publication_description)))) + (string->identifier + "phenotype" + (or post-abbrev pre-abbrev post-desc pre-desc) + #:separator "_"))))) + +(define-transformer gn:trait->gn:phenotype + (tables (PublishXRef + (left-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") + (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id") + (left-join Publication "ON Publication.Id = PublishXRef.PublicationId") + (left-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId")) + "WHERE InbredSet.public > 0") + (triples (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) + (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) + (post-desc (blank-p (field Phenotype Post_publication_description))) + (pre-desc (blank-p (field Phenotype Pre_publication_description)))) + (string->identifier + "trait" + (format #f "~a_~a" (field PublishFreeze Name) + (or post-abbrev pre-abbrev post-desc pre-desc)) + #:separator "_")) + (set rdf:type 'gnc:phenotype_trait) + (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")) + (set owl:equivalentClass + (field ("CONCAT(PublishFreeze.Name, '_', PublishXRef.Id)" + 
PublishFreeze))) + (set dcat:distribution + (string->symbol + (format #f "gnd:~a" + (field ("CONCAT(PublishFreeze.Name, '_', PublishXRef.Id)" + PublishFreeze)))) ) + (set dct:references (let ((pmid (field ("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))" pmid))) - (publication-id (field Publication Id PublicationId))) + (publication-id (field Publication Id))) (if (string-null? pmid) (string->identifier "unpublished" (number->string publication-id)) - (ontology 'pubmed: pmid)))))) + (ontology 'pubmed: pmid)))) + (set gnt:has_phenotype + (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) + (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) + (post-desc (blank-p (field Phenotype Post_publication_description))) + (pre-desc (blank-p (field Phenotype Pre_publication_description)))) + (string->identifier + "phenotype" + (or post-abbrev pre-abbrev post-desc pre-desc) + #:separator "_"))) + (set gnt:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean)) + '^^xsd:double)) + (set gnt:locus (sanitize-rdf-string (field PublishXRef Locus))) + (set gnt:lod_score (annotate-field + (field ("IFNULL((PublishXRef.LRS/4.604), '')" lrs)) + '^^xsd:double)) + (set gnt:additive + (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) + '^^xsd:double)) + (set gnt:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:integer)) + (set rdfs:comment (sanitize-rdf-string (field PublishXRef comments))))) @@ -127,11 +139,13 @@ (connection %connection-settings) (table-metadata? 
#f) (prefixes - '(("dct:" "<http://purl.org/dc/terms/>") - ("gn:" "<http://genenetwork.org/id/>") + '(("dcat:" "<http://www.w3.org/ns/dcat#>") + ("dct:" "<http://purl.org/dc/terms/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("gnc:" "<http://genenetwork.org/category/>") - ("gnt:" "<http://genenetwork.org/term/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnd:" "<https://cd.genenetwork.org/api3/lmdb/v1/data/traits/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") @@ -141,8 +155,9 @@ ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>"))) (inputs - (list - phenotypes)) + (list gnc:phenotype->gn:phenotype + gn:phenotype->metadata + gn:trait->gn:phenotype)) (outputs `(#:documentation ,documentation #:rdf ,output)))) diff --git a/examples/probeset.scm b/examples/probeset.scm deleted file mode 100755 index 9f694af..0000000 --- a/examples/probeset.scm +++ /dev/null @@ -1,203 +0,0 @@ -#! 
/usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 format) - (ice-9 getopt-long) - (ice-9 match) - (ice-9 regex) - (transform strings) - (transform sql) - (transform triples) - (transform special-forms) - (web uri)) - - -(define-transformer probeset - (tables (ProbeSet - (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId") - (left-join Species "ON GeneChip.SpeciesId = Species.Id")) - "WHERE ProbeSet.Name IS NOT NULL") - (schema-triples - (gnc:omimLink rdfs:Class gnc:ResourceLink) - (gnc:omimLink rdfs:label "OMIM") - (gnc:omimLink rdfs:comments "Summary from On Mendelion Inheritance in Man") - (gnc:homologeneLink rdfs:Class gnc:ResourceLink) - (gnc:homologeneLink rdfs:label "HomoloGene") - (gnc:homologeneLink rdfs:comments "Find similar genes in other species") - (gnc:uniprot a owl:ObjectProperty) - (gnc:uniprot rdfs:label "UniProt") - (gnc:uniprot rdfs:comments "UniProt resource") - (gnt:hasChip a owl:ObjectProperty) - (gnt:hasChip rdfs:domain gnc:Probeset) - (gnt:hasTargetId a owl:ObjectProperty) - (gnt:hasTargetId rdfs:domain gnc:Probeset) - (gnt:geneSymbol rdfs:domain gnc:Probeset) - (gnt:location rdfs:domain gnc:ProbeSet) - (gnt:location a owl:ObjectProperty) - (gnt:strandPosition rdfs:domain gnc:ProbeSet) - (gnt:strandPosition a owl:ObjectProperty) - (gnt:targetsRegion a owl:ObjectProperty) - (gnt:targetsRegion rdfs:domain gnc:Probeset) - (gnt:chr rdfs:domain gnc:Probeset) - (gnt:mb rdfs:domain gnc:Probeset) - (gnt:hasSpecificity a owl:ObjectProperty) - (gnt:hasSpecificity rdfs:domain gnc:Probeset) - (gnt:hasBlatScore a owl:ObjectProperty) - (gnt:hasBlatScore rdfs:domain gnc:Probeset) - (gnt:hasBlatMbStart a owl:ObjectProperty) - (gnt:hasBlatMbStart rdfs:domain gnc:Probeset) - (gnt:hasBlatMbEnd a owl:ObjectProperty) - (gnt:hasBlatMbEnd rdfs:domain gnc:Probeset) - (gnt:hasBlatSeq a owl:ObjectProperty) - (gnt:hasBlatSeq rdfs:domain gnc:Probeset) - (gnt:hasTargetSeq a owl:ObjectProperty) - (gnt:hasTargetSeq rdfs:domain 
gnc:Probeset)) - (triples - (let ((id (field ("IF(NULLIF(TRIM(ProbeSet.Name), '') IS NULL, '', TRIM(ProbeSet.Name))" - ProbeSetIdName))) - (probeset-id (field ProbeSet Id))) - (string->identifier - "probeset" - (if (string-null? id) - (number->string probeset-id) - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - id - 'pre "_" 'post)))) - (set rdf:type 'gnc:Probeset) - (set rdfs:label (field ProbeSet Name)) - (set skos:altLabel - (replace-substrings - (field ProbeSet alias) - '(("\r\n" . "; ")))) - (set gnt:hasChip - (string->identifier - "platform" - (field ("IFNULL(GeneChip.Name, '')" GeneChipName)))) - (set gnt:hasTargetId - (field ("NULLIF(TRIM(ProbeSet.TargetId), '')" - TargetId))) - (set gnt:geneSymbol - (field ProbeSet Symbol)) - (set dct:description (sanitize-rdf-string (field ProbeSet description))) - (set gnt:targetsRegion - (sanitize-rdf-string - (field ("NULLIF(TRIM(ProbeSet.Probe_set_target_region), '')" - Probe_set_target_region)))) - (set gnt:chr (field ProbeSet Chr)) - (set gnt:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) - (set gnt:location - (let* ((mb (field ProbeSet Mb)) - (chr (field ProbeSet Chr)) - (strand-probe (field ProbeSet Strand_Probe)) - (location (list chr mb))) - (match location - (("Un" mb) - (format #f "Not available")) - ((chr "") - (if (string-blank? chr) - (format #f "Not available") - (format #f "Chr ~a @ Unknown position ~a~:[~;~a~]" - chr mb - (and (string? strand-probe) (or (string=? "+" strand-probe) - (string=? "-" strand-probe))) - (cond ((string=? "+" strand-probe) - "on the plus strand") - ((string=? "-" strand-probe) - "on the minus strand") - (else ""))))) - (_ - (format #f "Chr ~a @ ~a Mb ~:[~;~a~]" - chr mb - (and (string? strand-probe) (or (string=? "+" strand-probe) - (string=? "-" strand-probe))) - (cond ((string=? "+" strand-probe) - "on the plus strand") - ((string=? 
"-" strand-probe) - "on the minus strand") - (else ""))))))) - (set gnt:hasGeneId - (ontology 'gene: - (string-trim-both (field ProbeSet GeneId)))) - ;; OMIM Link - (set dct:references - (let ((omim (field ProbeSet OMIM))) - (if (not (string-blank? omim)) - (string->symbol - (format #f - "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a" - "http://www.ncbi.nlm.nih.gov/omim/" - (uri-encode omim) - "a gnc:omimLink")) - ""))) - ;; Homologene Link - (set dct:references - (let ((homologene (field ProbeSet HomoloGeneID))) - (if (not (string-blank? homologene)) - (string->symbol - (format #f - "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a" - "http://www.ncbi.nlm.nih.gov/homologene/?term=" - (uri-encode homologene) - "a gnc:homologeneLink")) - ""))) - (set gnt:uniprot - (ontology 'uniprot: (field ProbeSet UniProtID))) - (set gnt:strandProbe - (field ProbeSet Strand_Probe)) - (set gnt:hasSpecificity - (field ("IFNULL(ProbeSet.Probe_set_specificity, '')" - Probe_set_specificity))) - (set gnt:hasBlatScore - (field ("IFNULL(ProbeSet.Probe_set_BLAT_score, '')" - Probe_set_BLAT_score))) - (set gnt:hasBlatMbStart - (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_start, '')" - Probe_set_Blat_Mb_start)) - '^^xsd:double)) - (set gnt:hasBlatMbEnd - (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_end, '')" - Probe_set_Blat_Mb_end)) - '^^xsd:double)) - (set gnt:hasBlatSeq (sanitize-rdf-string (field ProbeSet BlatSeq))) - (set gnt:hasTargetSeq (sanitize-rdf-string (field ProbeSet TargetSeq))))) - - - - -(let* ((option-spec - '((settings (single-char #\s) (value #t)) - (output (single-char #\o) (value #t)) - (documentation (single-char #\d) (value #t)))) - (options (getopt-long (command-line) option-spec)) - (settings (option-ref options 'settings #f)) - (output (option-ref options 'output #f)) - (documentation (option-ref options 'documentation #f)) - (%connection-settings - (call-with-input-file settings - read))) - (with-documentation - (name "ProbeSet Metadata") - (connection 
%connection-settings) - (table-metadata? #f) - (prefixes - '(("gn:" "<http://genenetwork.org/id/>") - ("probeset:" "<http://genenetwork.org/probeset/>") - ("gnc:" "<http://genenetwork.org/category/>") - ("gene:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>") - ("gnt:" "<http://genenetwork.org/term/>") - ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") - ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") - ("dct:" "<http://purl.org/dc/terms/>") - ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") - ("qb:" "<http://purl.org/linked-data/cube#>") - ("sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>") - ("skos:" "<http://www.w3.org/2004/02/skos/core#>"))) - (inputs - (list probeset)) - (outputs - `(#:documentation ,documentation - #:rdf ,output)))) diff --git a/examples/publication.scm b/examples/publication.scm index eab4da7..c411af6 100755 --- a/examples/publication.scm +++ b/examples/publication.scm @@ -13,7 +13,7 @@ -(define-transformer publication +(define-transformer publication->metadata (tables (Publication)) (triples (let ((pmid (field @@ -70,18 +70,18 @@ (connection %connection-settings) (table-metadata? 
#f) (prefixes - '(("gnt:" "<http://genenetwork.org/term/>") + '(("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("fabio:" "<http://purl.org/spar/fabio/>") ("dct:" "<http://purl.org/dc/terms/>") ("prism:" "<http://prismstandard.org/namespaces/basic/2.0/>") - ("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>") ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>"))) (inputs - (list publication)) + (list publication->metadata)) (outputs `(#:documentation ,documentation #:rdf ,output)))) diff --git a/examples/schema.scm b/examples/schema.scm new file mode 100755 index 0000000..c4ff082 --- /dev/null +++ b/examples/schema.scm @@ -0,0 +1,59 @@ +#! /usr/bin/env guile +!# + +(use-modules (ice-9 getopt-long) + (transform triples) + (transform schema) + (transform special-forms) + (transform sql) + (transform table)) + + +(define (transform-table-schema connection-settings db) + (let ((tables (tables connection-settings db))) + (for-each (lambda (table) + (let ((table-id (string->identifier + "table" + ;; We downcase table names in + ;; identifiers. So, we distinguish + ;; between the user and User tables. + (if (string=? 
(table-name table) "User") + "user2" + (table-name table))))) + (triple table-id 'rdf:type 'gn:sqlTable) + (triple table-id 'gn:name (table-name table)) + (triple table-id 'gn:has_size (string->symbol (format #f "~a" (table-size table)))) + (for-each (lambda (column) + (let ((column-id (column-id (table-name table) + (column-name column)))) + (triple column-id 'rdf:type 'gn:sql_table_field) + (triple column-id 'gn:name (column-name column)) + (triple column-id 'gn:sql_field_type (column-type column)) + (triple table-id 'gn:has_field column-id))) + (table-columns table)))) + tables))) + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings (call-with-input-file settings read))) + (call-with-target-database + %connection-settings + (lambda (db) + (with-output-to-file output + (lambda () + (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + (prefix "gn:" "<http://rdf.genenetwork.org/v1/id/>") + (prefix "gnc:" "<http://rdf.genenetwork.org/v1/category/>") + (prefix "gnt:" "<http://rdf.genenetwork.org/v1/term/>") + (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>") + (prefix "owl:" "<http://www.w3.org/2002/07/owl#>") + (newline) + (transform-table-schema %connection-settings db)))))) diff --git a/examples/strains.scm b/examples/strains.scm index 2e1e24f..cc98d71 100755 --- a/examples/strains.scm +++ b/examples/strains.scm @@ -11,15 +11,6 @@ (transform triples) (transform special-forms)) -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" 
- (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) #! @@ -69,45 +60,56 @@ At this point it is not very clear how Name, Name2, Symbol and Alias are used. (schema-triples (gnt:alias rdfs:domain gnc:strain) (gnt:alias a owl:ObjectProperty) - (gnt:geneSymbol rdfs:domain gnc:strain) - (gnt:geneSymbol a owl:ObjectProperty)) + (gnt:gene_symbol rdfs:domain gnc:strain) + (gnt:gene_symbol a owl:ObjectProperty)) (triples (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field Strain Name) - 'pre "_" 'post)) + "strain" + (field Strain Name) + #:separator "_") (set rdf:type 'gnc:strain) - (set gnt:belongsToSpecies - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) + (set gnt:has_species (string->identifier "" (remap-species-identifiers (field Species Fullname)))) ;; Name, and maybe a second name (set rdfs:label (sanitize-rdf-string (field Strain Name))) (set skos:altLabel (sanitize-rdf-string (field ("IF ((Strain.Name2 != Strain.Name), Strain.Name2, '')" Name2)))) (set gnt:alias (sanitize-rdf-string (field ("IF ((Strain.Alias != Strain.Name), Strain.Alias, '')" Alias)))) - (set gnt:geneSymbol (field Strain Symbol)))) + (set gnt:gene_symbol (field Strain Symbol)))) (define-transformer mapping-method (tables (MappingMethod)) (schema-triples - (gnc:mappingMethod a skos:Concept) - (gnc:mappingMethod skos:definition "Terms that decribe mapping methods used on this resource")) + (gnc:mapping_method a skos:ConceptScheme) + (gnc:mapping_method skos:prefLabel "Mapping Method Vocabulary") + (gnc:mapping_method skos:definition "Controlled vocabulary describing statistical/computational methods used for mapping in GeneNetwork.")) (triples - (string->identifier "mappingMethod" (field 
MappingMethod Name)) - (set rdf:type 'gnc:mappingMethod) - (set rdfs:label (field MappingMethod Name)))) + (string->identifier "mapping_method" (field MappingMethod Name) #:separator "_") + (set rdf:type 'skos:Concept) + (set skos:inScheme 'gnc:mapping_method) + (set skos:prefLabel (field MappingMethod Name)))) + +(define-transformer mapping-method-fan-out + (tables (MappingMethod)) + (triples + 'gnc:mapping_method + (set skos:member (string->identifier "mapping_method" (field MappingMethod Name) #:separator "_")))) + +(define-transformer avg-method-fan-out + (tables (AvgMethod)) + (triples + 'gnc:avg_method + (set skos:member (string->identifier "avg_method" (field AvgMethod Name AvgMethodName) #:separator "_")))) (define-transformer avg-method ;; The Name and Normalization fields seem to be the same. Dump only ;; the Name field. (tables (AvgMethod)) (schema-triples - (gnc:avgMethod a skos:Concept) - (gnc:avgMethod skos:definition "Terms that decribe normalization methods used on this resource")) - (triples (string->identifier "avgMethod" (field AvgMethod Name AvgMethodName)) - (set rdf:type 'gnc:avgMethod) - (set rdfs:label (field AvgMethod Normalization)))) + (gnc:avg_method a skos:ConceptScheme) + (gnc:avg_method skos:prefLabel "Normalization and Averaging Method Vocabulary") + (gnc:avg_method skos:definition "Controlled vocabulary describing normalization, transformation, and summarization methods applied in GeneNetwork.")) + (triples (string->identifier "avg_method" (field AvgMethod Name AvgMethodName) #:separator "_") + (set rdf:type 'skos:Concept) + (set skos:inScheme 'gnc:avg_method) + (set skos:prefLabel (field AvgMethod Normalization)))) @@ -124,22 +126,21 @@ At this point it is not very clear how Name, Name2, Symbol and Alias are used. read))) (with-documentation - (name "Species Metadata") + (name "Strain Metadata") (connection %connection-settings) (table-metadata? 
#f) (prefixes - '(("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") + '(("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("gnt:" "<http://genenetwork.org/term/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("taxon:" "<http://purl.uniprot.org/taxonomy/>"))) (inputs - (list strain mapping-method avg-method)) + (list strain mapping-method avg-method mapping-method-fan-out avg-method-fan-out)) (outputs `(#:documentation ,documentation #:rdf ,output)))) - diff --git a/examples/tissue.scm b/examples/tissue.scm deleted file mode 100755 index 2659b66..0000000 --- a/examples/tissue.scm +++ /dev/null @@ -1,55 +0,0 @@ -#! /usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 getopt-long) - (ice-9 match) - (ice-9 regex) - (transform strings) - (transform sql) - (transform triples) - (transform special-forms)) - - -(define-transformer tissue - ;; The Name and TissueName fields seem to be identical. BIRN_lex_ID - ;; and BIRN_lex_Name are mostly NULL. - (tables (Tissue)) - (schema-triples - (gnc:tissue a skos:Concept)) - ;; Hopefully the Short_Name field is distinct and can be used as an - ;; identifier. 
- (triples (string->identifier "tissue" (field Tissue Short_Name)) - (set rdf:type 'gnc:tissue) - (set rdfs:label (field Tissue Name)))) - - - -(let* ((option-spec - '((settings (single-char #\s) (value #t)) - (output (single-char #\o) (value #t)) - (documentation (single-char #\d) (value #t)))) - (options (getopt-long (command-line) option-spec)) - (settings (option-ref options 'settings #f)) - (output (option-ref options 'output #f)) - (documentation (option-ref options 'documentation #f)) - (%connection-settings - (call-with-input-file settings - read))) - (with-documentation - (name "Tissue Metadata") - (connection %connection-settings) - (table-metadata? #f) - (prefixes - '(("gn:" "<http://genenetwork.org/id/>") - ("gnt:" "<http://genenetwork.org/term/>") - ("skos:" "<http://www.w3.org/2004/02/skos/core#>") - ("gnc:" "<http://genenetwork.org/category/>") - ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") - ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>"))) - (inputs - (list tissue)) - (outputs - `(#:documentation ,documentation - #:rdf ,output)))) diff --git a/generate-ttl-files.scm b/generate-ttl-files.scm index 65db03f..28be496 100755 --- a/generate-ttl-files.scm +++ b/generate-ttl-files.scm @@ -1,60 +1,127 @@ -#! ./pre-inst-env +#! /usr/bin/env guile !# + (use-modules (ice-9 format) - (ice-9 futures) (ice-9 getopt-long) - (ice-9 ftw)) + (ice-9 ftw) + (ice-9 regex) + (srfi srfi-26) + (srfi srfi-34) + (srfi srfi-35)) + + +;; Copied over from GNU/Guix source tree. +(define (file-name-predicate regexp) + "Return a predicate that returns true when passed a file name whose base +name matches REGEXP." + (let ((file-rx (if (regexp? regexp) + regexp + (make-regexp regexp)))) + (lambda (file stat) + (regexp-exec file-rx (basename file))))) + +(define* (find-files dir #:optional (pred (const #t)) + #:key (stat lstat) + directories? + fail-on-error?) + "Return the lexicographically sorted list of files under DIR for which PRED +returns true. 
PRED is passed two arguments: the absolute file name, and its +stat buffer; the default predicate always returns true. PRED can also be a +regular expression, in which case it is equivalent to (file-name-predicate +PRED). STAT is used to obtain file information; using 'lstat' means that +symlinks are not followed. If DIRECTORIES? is true, then directories will +also be included. If FAIL-ON-ERROR? is true, raise an exception upon error." + (let ((pred (if (procedure? pred) + pred + (file-name-predicate pred)))) + ;; Sort the result to get deterministic results. + (sort (file-system-fold (const #t) + (lambda (file stat result) ; leaf + (if (pred file stat) + (cons file result) + result)) + (lambda (dir stat result) ; down + (if (and directories? + (pred dir stat)) + (cons dir result) + result)) + (lambda (dir stat result) ; up + result) + (lambda (file stat result) ; skip + result) + (lambda (file stat errno result) + (format (current-error-port) "find-files: ~a: ~a~%" + file (strerror errno)) + (when fail-on-error? + (error "find-files failed")) + result) + '() + dir + stat) + string<?))) + +(define-syntax-rule (warn-on-error expr file) + (catch 'system-error + (lambda () + expr) + (lambda args + (format (current-error-port) + "warning: failed to delete ~a: ~a~%" + file (strerror + (system-error-errno args)))))) + +(define* (delete-file-recursively dir + #:key follow-mounts?) + "Delete DIR recursively, like `rm -rf', without following symlinks. Don't +follow mount points either, unless FOLLOW-MOUNTS? is true. Report but ignore +errors." + (let ((dev (stat:dev (lstat dir)))) + (file-system-fold (lambda (dir stat result) ; enter? + (or follow-mounts? 
+ (= dev (stat:dev stat)))) + (lambda (file stat result) ; leaf + (warn-on-error (delete-file file) file)) + (const #t) ; down + (lambda (dir stat result) ; up + (warn-on-error (rmdir dir) dir)) + (const #t) ; skip + (lambda (file stat errno result) + (format (current-error-port) + "warning: failed to delete ~a: ~a~%" + file (strerror errno))) + #t + dir + + ;; Don't follow symlinks. + lstat))) (let* ((option-spec '((settings (single-char #\s) (value #t)) - (output (single-char #\o) (value #t)) - (documentation (single-char #\d) (value #t)))) + (documentation (single-char #\d) (value #t)) + (output (single-char #\o) (value #t)))) (options (getopt-long (command-line) option-spec)) (settings (option-ref options 'settings #f)) (output (option-ref options 'output #f)) - (documentation (option-ref options 'documentation #f))) - (define (enter? name stat result) - stat result ;ignore - ;; Skip version control directories if any. - (not (member (basename name) '(".git" ".svn" "CVS")))) - - (define (leaf name stat result) - stat result ;ignore - (when (string-suffix? ".scm" name) - (let* ((base-file-name (basename name ".scm")) - (cmd (format #f " ~a --settings ~a --output ~a --documentation ~a" - name - settings - (string-append output "/" base-file-name ".ttl") - (string-append documentation "/" base-file-name ".md")))) - (touch - (future - (begin - (display (format #f "Running ~a" cmd)) - (display "\n") - (system cmd))))))) - - (define (down name stat result) - name stat ;ignore - result) - - (define (up name stat result) - name stat ;ignore - result) - - (define (skip name stat result) - name stat ;ignore - result) - - ;; Ignore unreadable files/directories but warn the user. - (define (error name stat errno result) - stat ;ignore - (format (current-error-port) "warning: ~a: ~a~%" - name (strerror errno)) - result) - - (file-system-fold enter? 
leaf down up skip error - 0 ;initial counter is zero bytes - "./examples")) - - + (documentation (option-ref options 'documentation #f)) + (%source-dir (dirname (current-filename)))) + (unless (file-exists? output) + (mkdir output)) + ;; Transform data to RDF + (for-each (lambda (file) + (let* ((base-file-name (basename file ".scm")) + (ttl-file (string-append output "/" base-file-name ".ttl"))) + ;; Ignore dataset-metadata-git.scm because TODO + (unless (string=? base-file-name "dataset-metadata-git") + (system* "guile" "-L" (dirname (current-filename)) file + "--settings" settings "--output" ttl-file)))) + (find-files "./examples" ".scm")) + ;; Copy hand-woven ttl files. + (for-each (lambda (file) + (copy-file + file (format #f "~a/~a" output (basename file)))) + (find-files "./schema" ".ttl")) + ;; Validate transformed turtle files + (for-each (lambda (file) + (system* "rapper" "--input" "turtle" "--count" file)) + (append (find-files output ".ttl") + (find-files "./schema" ".ttl")))) diff --git a/json-to-ttl.scm b/json-to-ttl.scm index 0a054c5..8fc4002 100755 --- a/json-to-ttl.scm +++ b/json-to-ttl.scm @@ -59,7 +59,7 @@ inside it." (with-output-to-file (string-append %directory "/sampledata.ttl") (lambda () - (prefix "gn:" "<http://genenetwork.org/>") + (prefix "gn:" "<http://rdf.genenetwork.org/v1/>") (newline) (run-proc-on-files %data-directory diff --git a/load-rdf.scm b/load-rdf.scm index aaf1b00..2ef79ac 100755 --- a/load-rdf.scm +++ b/load-rdf.scm @@ -16,7 +16,7 @@ (web uri)) (define %graph-uri - "http://genenetwork.org") + "http://rdf.genenetwork.org/v1") (define (call-with-pipe proc mode program . args) "Execute PROGRAM ARGS ... 
in a subprocess with a pipe of MODE to @@ -82,6 +82,43 @@ CHECKPOINT; OPEN_WRITE "isql")) +(define (set-global-namespaces port password) + "Set the global namespaces" + (call-with-pipe + (lambda (out) + (format out + "SET DSN=localhost:~a; +SET PWD=~s; +DB.DBA.XML_SET_NS_DECL ('dcat', 'http://www.w3.org/ns/dcat#', 2); +DB.DBA.XML_SET_NS_DECL ('dct', 'http://purl.org/dc/terms/', 2); +DB.DBA.XML_SET_NS_DECL ('fabio', 'http://purl.org/spar/fabio/', 2); +DB.DBA.XML_SET_NS_DECL ('genbank', 'https://bioregistry.io/reference/genbank:', 2); +DB.DBA.XML_SET_NS_DECL ('gene', 'http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=', 2); +DB.DBA.XML_SET_NS_DECL ('generif', 'http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=', 2); +DB.DBA.XML_SET_NS_DECL ('geoSeries', 'http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=', 2); +DB.DBA.XML_SET_NS_DECL ('gn', 'http://rdf.genenetwork.org/v1/id/', 2); +DB.DBA.XML_SET_NS_DECL ('gnc', 'http://rdf.genenetwork.org/v1/category/', 2); +DB.DBA.XML_SET_NS_DECL ('gnt', 'http://rdf.genenetwork.org/v1/term/', 2); +DB.DBA.XML_SET_NS_DECL ('ncbiTaxon', 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=', 2); +DB.DBA.XML_SET_NS_DECL ('prism', 'http://prismstandard.org/namespaces/basic/2.0/', 2); +DB.DBA.XML_SET_NS_DECL ('probeset', 'http://rdf.genenetwork.org/v1/probeset/', 2); +DB.DBA.XML_SET_NS_DECL ('pubmed', 'http://rdf.ncbi.nlm.nih.gov/pubmed/', 2); +DB.DBA.XML_SET_NS_DECL ('qb', 'http://purl.org/linked-data/cube#', 2); +DB.DBA.XML_SET_NS_DECL ('sdmx-measure', 'http://purl.org/linked-data/sdmx/2009/measure#', 2); +DB.DBA.XML_SET_NS_DECL ('taxon', 'http://purl.uniprot.org/taxonomy/', 2); +DB.DBA.XML_SET_NS_DECL ('transcript', 'https://portals.broadinstitute.org/gpp/public/trans/details?transName=', 2); +DB.DBA.XML_SET_NS_DECL ('v', 'http://www.w3.org/2006/vcard/ns#', 2); +DB.DBA.XML_SET_NS_DECL ('xkos', 'http://rdf-vocabulary.ddialliance.org/xkos#', 2); +DB.DBA.XML_SET_NS_DECL ('schema', 
'https://schema.org/', 2); +DB.DBA.XML_SET_NS_DECL ('foaf', 'http://xmlns.com/foaf/0.1/#term_', 2); +DB.DBA.XML_SET_NS_DECL ('gnd', 'https://cd.genenetwork.org/api3/lmdb/v1/data/traits/', 2); +DB.DBA.XML_SET_NS_DECL ('gn-files', 'http://files.genenetwork.org/current/', 2); +" + port + password)) + OPEN_WRITE + "isql")) + (define (index-data port password) "Index all text data for quicker search" (call-with-pipe @@ -120,6 +157,13 @@ quit; (assq-ref connection-settings 'virtuoso-port) (assq-ref connection-settings 'virtuoso-password) %graph-uri))) + ;; Update global namespaces + (format (current-output-port) + "Global namespaces set in ~a seconds~%" + (time-thunk + (cut set-global-namespaces + (assq-ref connection-settings 'virtuoso-port) + (assq-ref connection-settings 'virtuoso-password)))) ;; Delete the load queue (format (current-output-port) "Existing DB.LOAD queue deleted in ~a seconds~%" diff --git a/manifest.scm b/manifest.scm index 63e9bd7..d736e51 100644 --- a/manifest.scm +++ b/manifest.scm @@ -6,8 +6,7 @@ (use-modules (gnu packages autotools) ((gnu packages base) #:select (gnu-make)) - ((gnu packages bioinformatics) #:select (ccwl)) - ((gnu packages databases) #:select (virtuoso-ose mariadb)) + (gnu packages databases) (gnu packages graphviz) (gnu packages guile) ((gnu packages guile-xyz) #:select (guile-sparql) #:prefix guix:) @@ -60,5 +59,6 @@ guile-json-4 guile-dsv ;; We abuse (ccwl graphviz) as a library to visualize the database ;; schema. Hence we need ccwl and guile-libyaml. - ccwl graphviz guile-hashing guile-libyaml guile-sparql + ;; ccwl graphviz + guile-hashing guile-libyaml guile-sparql raptor2 run64 virtuoso-ose mariadb)) diff --git a/schema/gn-curation-metadata.ttl b/schema/gn-curation-metadata.ttl new file mode 100644 index 0000000..1286453 --- /dev/null +++ b/schema/gn-curation-metadata.ttl @@ -0,0 +1,19 @@ +@prefix gn: <http://rdf.genenetwork.org/v1/id/> . +@prefix gnt: <http://rdf.genenetwork.org/v1/term/> . 
+@prefix skos: <http://www.w3.org/2004/02/skos/core#> . +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix owl: <http://www.w3.org/2002/07/owl#> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . + +gnt:editorial_status rdf:type owl:DatatypeProperty ; + rdfs:label "editorial status" ; + rdfs:comment "Curation and lifecycle status of a GN entity." . + +gnt:obsolete rdf:type skos:Concept ; + skos:prefLabel "obsolete" ; + skos:definition "Data retained for historical reasons but known to be incomplete, low-quality, or not scientifically useful." . + +gnt:deprecated rdf:type skos:Concept ; + skos:prefLabel "deprecated" ; + skos:definition "Data that should no longer be exposed in primary user interfaces." . diff --git a/schema/mapping.ttl b/schema/mapping.ttl new file mode 100644 index 0000000..5249526 --- /dev/null +++ b/schema/mapping.ttl @@ -0,0 +1,164 @@ +@prefix foaf: <http://xmlns.com/foaf/0.1/#term_> . +@prefix gn: <http://rdf.genenetwork.org/v1/id/> . +@prefix gnc: <http://rdf.genenetwork.org/v1/category/> . +@prefix gnt: <http://rdf.genenetwork.org/v1/term/> . +@prefix skos: <http://www.w3.org/2004/02/skos/core#> . +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . + +gn:mapping_method_qtlreaper + skos:definition "Rapidly scan microarray expression data for QTLs" ; + foaf:homepage <https://github.com/genenetwork/QTLReaper> . + +gn:mapping_method_Rqtl + skos:definition "R/qtl is an extension library for the R statistics system. It is used to analyze experimental crosses for identifying genes contributing to variation in quantitative traits (so-called quantitative trait loci, QTLs). Using a hidden Markov model, R/qtl estimates genetic maps, to identify genotyping errors, and to perform single-QTL and two-QTL, two-dimensional genome scans." ; + foaf:homepage <https://rqtl.org/> . 
+ +gn:mapping_method_HappyR + skos:definition "Haplotype-based QTL mapping method, often used for multiparent populations." . + +gn:mapping_method_PLINK + skos:definition + "Genome-wide association and population-based linkage analysis tool." . + +gn:mapping_method_FastMap + skos:definition + "Efficient mapping algorithm optimized for large datasets." . + +################################################################# +# Normalization / Averaging Method Concept Scheme +################################################################# + +gn:avg_method_MAS5 + skos:definition + "Affymetrix Microarray Suite 5 normalization method." . + +gn:avg_method_PDNN + skos:definition + "Probe-level normalization method that models sequence-dependent hybridization effects in microarray data." . + +gn:avg_method_RMA + skos:definition + "Robust Multi-array Average normalization for microarray data." . + +gn:avg_method_dChip + skos:definition + "Model-based normalization and expression summarization method for microarray data using probe-level intensity modeling." . + +gn:avg_method_GCRMA + skos:definition + "GC-content adjusted RMA normalization method." . + +gn:avg_method_Herit + skos:definition + "Method that estimates heritability by partitioning phenotypic variance into genetic and environmental components." . + +gn:avg_method_Rank + skos:definition + "Normalization method that transforms values based on their rank order within a dataset." . + +gn:avg_method_RankInv + skos:definition + "Normalization method that applies an inverse rank transformation to enforce a uniform distribution of values." . + +gn:avg_method_LOESS + skos:definition + "Locally estimated scatterplot smoothing normalization." . + +gn:avg_method_LOESS_NB + skos:definition + "LOESS-based normalization method that corrects systematic biases without assuming a parametric noise model." . 
+ + +gn:avg_method_QUANT + skos:definition + "Quantile normalization method that forces all samples to share the same empirical distribution." . + +gn:avg_method_QUANT_NB + skos:definition + "Quantile normalization method applied without background correction or parametric noise assumptions." . + +gn:avg_method_RSN + skos:definition + "Robust spline normalization method that adjusts microarray data to remove systematic technical variation." . + +gn:avg_method_RSN_NB + skos:definition + "Robust spline normalization method applied without background correction or parametric noise assumptions." . + +gn:avg_method_Sscore + skos:definition + "Statistical scoring method for identifying significant gene expression changes in microarray data." . + +gn:avg_method_mlratio + skos:definition + "Method calculating the log-ratio of expression values between two conditions for microarray analysis." . + +gn:avg_method_VST + skos:definition + "Variance stabilizing transformation for RNA-seq count data to reduce heteroscedasticity." . + +gn:avg_method_RPN + skos:definition + "Robust Probe Normalization method for microarray data to adjust probe-level intensities." . + + +gn:TPM_Log2 + skos:definition + "Transcripts Per Million normalized expression values, log2 transformed." . + +gn:avg_method_RPKM + skos:definition + "Reads Per Kilobase Million normalization for RNA-seq data." . + +gn:avg_method_RNA_seq + skos:definition + "Normalization pipeline applied to RNA sequencing datasets." . + +gn:avg_method_SRM + skos:definition + "Normalization method for targeted proteomics using Selected Reaction Monitoring." . + +gn:avg_method_SWATH + skos:definition + "Normalization and quantification method for SWATH-MS proteomics data." . + +gn:avg_method_RPKM_log2 + skos:definition + "Log2-transformed Reads Per Kilobase per Million mapped reads for RNA-seq data." . + +gn:avg_method_Sesame + skos:definition + "Normalization method for Illumina methylation arrays using the Sesame pipeline." . 
+ +gn:avg_method_TPM_Log2 + skos:definition + "Log2-transformed Transcripts Per Million for RNA-seq expression quantification." . + +gn:avg_method_rlog + skos:definition + "Regularized log transformation for RNA-seq count data to stabilize variance." . + +gn:avg_method_edgeR + skos:definition + "Normalization and differential expression analysis method for RNA-seq using edgeR." . + +gn:avg_method_minfi + skos:definition + "Normalization pipeline for Illumina methylation arrays using the minfi Bioconductor package." . + +gn:avg_method_2Z_8 + skos:definition + "Microarray normalization method that applies a 2Z+8 transformation to expression values." . + +gn:avg_method_Log2 + skos:definition + "Simple log2 transformation applied to expression values to reduce skew and stabilize variance." . + +gn:avg_method_DESeq2_rlog2 + skos:definition + "Regularized log transformation from the DESeq2 RNA-seq workflow." . + +gn:avg_method_N_A + skos:definition + "Indicates that no normalization or averaging method applies." . diff --git a/schema/species.ttl b/schema/species.ttl index f0d5207..c66dda0 100644 --- a/schema/species.ttl +++ b/schema/species.ttl @@ -2,48 +2,58 @@ @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix gn: <http://rdf.genenetwork.org/v1/id/> . +@prefix gnt: <http://rdf.genenetwork.org/v1/term/> . +@prefix owl: <http://www.w3.org/2002/07/owl#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix skos: <http://www.w3.org/2004/02/skos/core#> . @prefix wd: <http://www.wikidata.org/entity/> . -@prefix gn: <http://genenetwork.org/id/> . -@prefix gnt: <http://genenetwork.org/term/> . + + +gnt:has_wikidata_link a owl:ObjectProperty ; + rdfs:label "has Wikidata link" ; + rdfs:comment "Associates a GeneNetwork resource with its corresponding canonical entity in Wikidata." ; + rdfs:subPropertyOf skos:exactMatch ; + rdfs:range owl:Thing . 
# sorted on short names: # -gn:Arabidopsis_thaliana rdf:isDefinedBy wd:Q158695 ; +gn:Arabidopsis_thaliana gnt:has_wikidata_link wd:Q158695 ; gnt:shortName "arabidopsis" . -gn:Hordeum_vulgare rdf:isDefinedBy wd:Q11577 ; +gn:Hordeum_vulgare gnt:has_wikidata_link wd:Q11577 ; gnt:shortName "barley" . -gn:Glossophaga_soricina rdf:isDefinedBy wd:Q304929 ; +gn:Glossophaga_soricina gnt:has_wikidata_link wd:Q304929 ; gnt:shortName "bat" . -gn:Drosophila_melanogaster rdf:isDefinedBy wd:Q130888 ; +gn:Drosophila_melanogaster gnt:has_wikidata_link wd:Q130888 ; gnt:shortName "drosophila" . -gn:Homo_sapiens rdf:isDefinedBy wd:Q15978631 ; +gn:Homo_sapiens gnt:has_wikidata_link wd:Q15978631 ; gnt:shortName "human" . # Monkey -# gn:Macaca_mulatta rdf:isDefinedBy wd:Q156606 ; +# gn:Macaca_mulatta gnt:has_wikidata_link wd:Q156606 ; # gnt:shortName "macaca" . -gn:Macaca_nemestrina rdf:isDefinedBy wd:Q618026 ; +gn:Macaca_nemestrina gnt:has_wikidata_link wd:Q618026 ; gnt:shortName "macaca" . -gn:Mus_musculus rdf:isDefinedBy wd:Q83310 ; +gn:Mus_musculus gnt:has_wikidata_link wd:Q83310 ; gnt:shortName "mouse" . -gn:Oryzias_latipes rdf:isDefinedBy wd:Q1142975 ; +gn:Oryzias_latipes gnt:has_wikidata_link wd:Q1142975 ; gnt:shortName "medaka" . -gn:Populus_trichocarpa rdf:isDefinedBy wd:Q149382 ; +gn:Populus_trichocarpa gnt:has_wikidata_link wd:Q149382 ; gnt:shortName "poplar" . -gn:Rattus_norvegicus rdf:isDefinedBy wd:Q184224 ; +gn:Rattus_norvegicus gnt:has_wikidata_link wd:Q184224 ; gnt:shortName "rat" . -gn:Glycine_max rdf:isDefinedBy wd:Q11006 ; +gn:Glycine_max gnt:has_wikidata_link wd:Q11006 ; gnt:shortName "soybean" . -gn:Solanum_lycopersicum rdf:isDefinedBy wd:Q23501 ; +gn:Solanum_lycopersicum gnt:has_wikidata_link wd:Q23501 ; gnt:shortName "tomato" . 
diff --git a/transform/schema-dump.scm b/transform/schema.scm index 18df5da..f3896a7 100644 --- a/transform/schema-dump.scm +++ b/transform/schema.scm @@ -4,7 +4,13 @@ #:use-module (transform sql) #:use-module (transform triples) #:use-module (transform strings) - #:use-module (transform table)) + #:use-module (transform table) + #:export (table-fields + get-tables-from-comments + schema-annotations + tables + schema + data-table)) (define (table-fields db table) @@ -47,7 +53,7 @@ (for-each (cut table-fields db <>) (get-tables-from-comments db))))) -(define (tables db) +(define (tables connection-settings db) "Return list of all tables in DB. Each element of the returned list is a <table> object." (map (lambda (table) @@ -68,7 +74,7 @@ is a <table> object." (information_schema.tables data_length)) (information_schema.tables) (format #f "WHERE table_schema = '~a'" - (assq-ref %connection-settings 'sql-database)))))) + (assq-ref connection-settings 'sql-database)))))) (define (schema db) (let ((tables (tables db))) @@ -83,14 +89,14 @@ is a <table> object." 
(table-name table))))) (triple table-id 'rdf:type 'gn:sqlTable) (triple table-id 'gn:name (table-name table)) - (triple table-id 'gn:hasSize (table-size table)) + (triple table-id 'gn:has_size (table-size table)) (for-each (lambda (column) (let ((column-id (column-id (table-name table) (column-name column)))) - (triple column-id 'rdf:type 'gn:sqlTableField) + (triple column-id 'rdf:type 'gn:sql_table_field) (triple column-id 'gn:name (column-name column)) - (triple column-id 'gn:sqlFieldType (column-type column)) - (triple table-id 'gn:hasField column-id))) + (triple column-id 'gn:sql_field_type (column-type column)) + (triple table-id 'gn:has_field column-id))) (table-columns table)))) tables))) diff --git a/transform/special-forms.scm b/transform/special-forms.scm index 99b30df..ddb3180 100644 --- a/transform/special-forms.scm +++ b/transform/special-forms.scm @@ -537,40 +537,42 @@ The above query results to triples that have the form: (call-with-target-database connection (lambda (db) - (with-output-to-file ; - doc-path - (lambda () - (format #t "# ~a" name) - (for-each - (lambda (proc) - (proc db - #:metadata? #f - #:data? #f - #:documentation? - (lambda () (for-each - (match-lambda - ((k v) - (begin - (prefix k v #f)))) - prefixes)))) - inputs)) - #:encoding "UTF-8") + (when doc-path + (with-output-to-file ; + doc-path + (lambda () + (format #t "# ~a" name) + (for-each + (lambda (proc) + (proc db + #:metadata? #f + #:data? #f + #:documentation? + (lambda () (for-each + (match-lambda + ((k v) + (begin + (prefix k v #f)))) + prefixes)))) + inputs)) + #:encoding "UTF-8")) ;; Dumping the actual data - (with-output-to-file - rdf-path - (lambda () - ;; Add the prefixes - (for-each - (match-lambda - ((k v) - (begin - (prefix k v)))) - prefixes) - (newline) - (for-each - (lambda (proc) - (proc db #:metadata? 
table-metadata?)) - inputs)) - #:encoding "UTF-8"))))))) + (when rdf-path + (with-output-to-file + rdf-path + (lambda () + ;; Add the prefixes + (for-each + (match-lambda + ((k v) + (begin + (prefix k v)))) + prefixes) + (newline) + (for-each + (lambda (proc) + (proc db #:metadata? table-metadata?)) + inputs)) + #:encoding "UTF-8")))))))) diff --git a/transform/sql.scm b/transform/sql.scm index a8962c8..daedf97 100644 --- a/transform/sql.scm +++ b/transform/sql.scm @@ -102,13 +102,14 @@ (dbi-get_row db)) (define (call-with-target-database connection-settings proc) - (call-with-database "mysql" (string-join - (list (assq-ref connection-settings 'sql-username) - (assq-ref connection-settings 'sql-password) - (assq-ref connection-settings 'sql-database) - "tcp" - (assq-ref connection-settings 'sql-host) - (number->string - (assq-ref connection-settings 'sql-port))) - ":") + (call-with-database "mysql" (string-append (string-join + (list (assq-ref connection-settings 'sql-username) + (assq-ref connection-settings 'sql-password) + (assq-ref connection-settings 'sql-database) + "tcp" + (assq-ref connection-settings 'sql-host) + (number->string + (assq-ref connection-settings 'sql-port))) + ":") + "?charset=utf8") proc)) diff --git a/transform/strings.scm b/transform/strings.scm index 7545f62..7b62349 100644 --- a/transform/strings.scm +++ b/transform/strings.scm @@ -11,19 +11,25 @@ delete-substrings replace-substrings remove-duplicates - remap-species-identifiers str sanitize-rdf-string snake->lower-camel lower-case-and-replace-spaces - string-capitalize-first)) + string-capitalize-first + normalize-string-field + fix-email-id + blank-p + investigator-attributes->id)) + +(define (blank-p str) + (if (string-blank? str) #f str)) (define (lower-case-and-replace-spaces str) (string-map - (lambda (c) - (if (char=? c #\space) - #\- ; replace space with hyphen - c)) ; convert character to lower case - (string-downcase str))) + (lambda (c) + (if (char=? 
c #\space) + #\- ; replace space with hyphen + c)) ; convert character to lower case + (string-downcase str))) (define (time-unix->string seconds . maybe-format) "Given an integer saying the number of seconds since the Unix @@ -121,13 +127,12 @@ association list mapping substrings to their replacements." ((memq (car lst) result) (loop (cdr lst) result)) (else (loop (cdr lst) (cons (car lst) result)))))) - -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" - (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) +(define (normalize-string-field field) + (let ((field (string-trim-both field))) + (match field + ((? string? field) + (if (or (string-blank? field) + (string=? (string-downcase field) "none")) + "" + field)) + (_ "")))) diff --git a/transform/triples.scm b/transform/triples.scm index 9775d36..13758e5 100644 --- a/transform/triples.scm +++ b/transform/triples.scm @@ -8,8 +8,19 @@ triple scm->triples annotate-field + remap-species-identifiers string->binomial-name)) +(define (remap-species-identifiers str) + "This procedure remaps identifiers to standard binominal. Obviously this should + be sorted by correcting the database!" + (match str + ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] + ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] + ["Macaca mulatta" "Macaca nemestrina"] + ["Bat (Glossophaga soricina)" "Glossophaga soricina"] + [str str])) + (define (annotate-field field schema) (let ([schema (cond ((symbol? 
schema) (symbol->string schema)) @@ -28,7 +39,7 @@ #:optional #:key (ontology "gn:") (separator "") - (proc string-capitalize-first)) + (proc (lambda (x) x))) "Convert STR to a turtle identifier after replacing illegal characters with an underscore and prefixing with gn:PREFIX." (if (or (and (string? str) (string-null? str)) @@ -40,11 +51,12 @@ characters with an underscore and prefixing with gn:PREFIX." (lambda (c) (eq? c #\))) (string-map (lambda (c) - (case c - ((#\/ #\< #\> #\+ #\( #\space #\@) #\_) - (else c))) - (proc - (string-trim-right str #\.)))))))) + (if (or (char-alphabetic? c) + (char-numeric? c) + (char=? c #\_)) + c + #\_)) + (proc str))))))) (define* (prefix prefix iri #:optional (ttl? #t)) diff --git a/transform/uuid.scm b/transform/uuid.scm deleted file mode 100644 index be0e592..0000000 --- a/transform/uuid.scm +++ /dev/null @@ -1,234 +0,0 @@ -;; CREDIT: https://lists.gnu.org/archive/html/guile-user/2018-01/msg00019.html -(define-module (transform uuid) - #:use-module (srfi srfi-1) - #:use-module (srfi srfi-11) - #:use-module (rnrs bytevectors) - #:use-module (ice-9 iconv) - #:export (bytevector->md5 - make-version-3-uuid)) - -(define (bytevector->md5 bytevector) - "Convert BYTEVECTOR to a bytevector containing the MD5 hash of -BYTEVECTOR." - ;; Implemented along RFC 1321. It should be easy to verify that - ;; this procedure performs the operations specified therein. - (define (append-padding-bits bytevector) - "Makes a list from BYTEVECTOR with padding as per RFC 1321 3.1." - (let* ((length-in-bits (* 8 (bytevector-length bytevector))) - (padding-bits (- 512 (modulo (- length-in-bits 448) 512)))) - (append (bytevector->u8-list bytevector) - '(128) ; #*10000000 - (iota - (- (/ padding-bits 8) 1) - 0 0)))) - (define (append-length msg-list message-length) - "Append MESSAGE-LENGTH as 8 byte values from a uint64 to MSG-LIST." - (append msg-list - ;; For numbers too large for an uint64, only the low-order - ;; bytes are returned. 
- (bytevector->u8-list (u64vector - (modulo - (* message-length 8) ; bits - (1+ #xffffffffffffffff)))))) - (let hash ((AA #x67452301) - (BB #xefcdab89) - (CC #x98badcfe) - (DD #x10325476) - (to-digest - (append-length - (append-padding-bits - bytevector) - (bytevector-length bytevector)))) - (define (F X Y Z) - (logior (logand X Y) (logand (lognot X) Z))) - (define (G X Y Z) - (logior (logand X Z) (logand Y (lognot Z)))) - (define (H X Y Z) - (logxor X Y Z)) - (define (I X Y Z) - (logxor Y (logior X (lognot Z)))) - (define (T i) - (inexact->exact (floor (* 4294967296 (abs (sin i)))))) - (define (number->u32 n) - "Cut off all bits that do not fit in a uint32." - (bit-extract n 0 32)) - (define (lsh32 n count) - (number->u32 (logior (ash n count) - (bit-extract n (- 32 count) 32)))) - (if (not (null? to-digest)) - (let* ((block (u8-list->bytevector - (list-head to-digest (/ 512 8)))) - (X (lambda (j) (bytevector-u32-ref - block (* 4 j) (endianness little)))) - (do-round1 - (lambda (A B C D) - (define (operation a b c d k s i) - (number->u32 - (+ b (lsh32 (+ a (F b c d) (X k) (T i)) s)))) - (let* ((A (operation A B C D 0 7 1)) - (D (operation D A B C 1 12 2)) - (C (operation C D A B 2 17 3)) - (B (operation B C D A 3 22 4)) - (A (operation A B C D 4 7 5)) - (D (operation D A B C 5 12 6)) - (C (operation C D A B 6 17 7)) - (B (operation B C D A 7 22 8)) - (A (operation A B C D 8 7 9)) - (D (operation D A B C 9 12 10)) - (C (operation C D A B 10 17 11)) - (B (operation B C D A 11 22 12)) - (A (operation A B C D 12 7 13)) - (D (operation D A B C 13 12 14)) - (C (operation C D A B 14 17 15)) - (B (operation B C D A 15 22 16))) - (values A B C D)))) - (do-round2 - (lambda (A B C D) - (define (operation a b c d k s i) - (number->u32 - (+ b (lsh32 (+ a (G b c d) (X k) (T i)) s)))) - (let* ((A (operation A B C D 1 5 17)) - (D (operation D A B C 6 9 18)) - (C (operation C D A B 11 14 19)) - (B (operation B C D A 0 20 20)) - (A (operation A B C D 5 5 21)) - (D (operation D A B 
C 10 9 22)) - (C (operation C D A B 15 14 23)) - (B (operation B C D A 4 20 24)) - (A (operation A B C D 9 5 25)) - (D (operation D A B C 14 9 26)) - (C (operation C D A B 3 14 27)) - (B (operation B C D A 8 20 28)) - (A (operation A B C D 13 5 29)) - (D (operation D A B C 2 9 30)) - (C (operation C D A B 7 14 31)) - (B (operation B C D A 12 20 32))) - (values A B C D)))) - (do-round3 - (lambda (A B C D) - (define (operation a b c d k s i) - (number->u32 - (+ b (lsh32 (+ a (H b c d) (X k) (T i)) s)))) - (let* ((A (operation A B C D 5 4 33)) - (D (operation D A B C 8 11 34)) - (C (operation C D A B 11 16 35)) - (B (operation B C D A 14 23 36)) - (A (operation A B C D 1 4 37)) - (D (operation D A B C 4 11 38)) - (C (operation C D A B 7 16 39)) - (B (operation B C D A 10 23 40)) - (A (operation A B C D 13 4 41)) - (D (operation D A B C 0 11 42)) - (C (operation C D A B 3 16 43)) - (B (operation B C D A 6 23 44)) - (A (operation A B C D 9 4 45)) - (D (operation D A B C 12 11 46)) - (C (operation C D A B 15 16 47)) - (B (operation B C D A 2 23 48))) - (values A B C D)))) - (do-round4 - (lambda (A B C D) - (define (operation a b c d k s i) - (number->u32 - (+ b (lsh32 (+ a (I b c d) (X k) (T i)) s)))) - (let* ((A (operation A B C D 0 6 49)) - (D (operation D A B C 7 10 50)) - (C (operation C D A B 14 15 51)) - (B (operation B C D A 5 21 52)) - (A (operation A B C D 12 6 53)) - (D (operation D A B C 3 10 54)) - (C (operation C D A B 10 15 55)) - (B (operation B C D A 1 21 56)) - (A (operation A B C D 8 6 57)) - (D (operation D A B C 15 10 58)) - (C (operation C D A B 6 15 59)) - (B (operation B C D A 13 21 60)) - (A (operation A B C D 4 6 61)) - (D (operation D A B C 11 10 62)) - (C (operation C D A B 2 15 63)) - (B (operation B C D A 9 21 64))) - (values A B C D))))) - (let*-values (((A B C D) (values AA BB CC DD)) - ((A B C D) (do-round1 A B C D)) - ((A B C D) (do-round2 A B C D)) - ((A B C D) (do-round3 A B C D)) - ((A B C D) (do-round4 A B C D))) - (hash (number->u32 
(+ A AA)) - (number->u32 (+ B BB)) - (number->u32 (+ C CC)) - (number->u32 (+ D DD)) - (list-tail to-digest (/ 512 8))))) - ;; we’re done: - (u8-list->bytevector - (append - (bytevector->u8-list (u32vector AA)) - (bytevector->u8-list (u32vector BB)) - (bytevector->u8-list (u32vector CC)) - (bytevector->u8-list (u32vector DD))))))) - -(define* (make-version-3-uuid namespace-uuid str #:optional (prefix "urn:uuid:")) - "Generates a UUID string by computing the MD5 hash of NAMESPACE-UUID -and STR. NAMESPACE-UUID must be a bytevector consisting of the UUID’s -bytes, *not* the UUID’s string representation." - (define (half-byte->hex-char number) - "Returns the corresponding hexadecimal digit for a number NUMBER -between 0 and 15." - (case number - ((0) #\0) - ((1) #\1) - ((2) #\2) - ((3) #\3) - ((4) #\4) - ((5) #\5) - ((6) #\6) - ((7) #\7) - ((8) #\8) - ((9) #\9) - ((10) #\a) - ((11) #\b) - ((12) #\c) - ((13) #\d) - ((14) #\e) - ((15) #\f))) - (define (byte->hex-string bv index) - "Convert the byte at INDEX of bytevector BV to a hex string." 
- (let ((byte (bytevector-u8-ref bv index))) - (string (half-byte->hex-char (quotient byte 16)) - (half-byte->hex-char (modulo byte 16))))) - (let ((md5 (bytevector->md5 - (u8-list->bytevector - (append (bytevector->u8-list namespace-uuid) - (bytevector->u8-list (string->utf8 str))))))) - (string-append prefix - ;; time_low field: - (byte->hex-string md5 0) - (byte->hex-string md5 1) - (byte->hex-string md5 2) - (byte->hex-string md5 3) - "-" - ;; time_mid field: - (byte->hex-string md5 4) - (byte->hex-string md5 5) - "-" - ;; time_hi_and_version field: - (let ((byte (bytevector-u8-ref md5 6))) - (string (half-byte->hex-char 3) ; UUID version 3 - (half-byte->hex-char (modulo byte 16)))) - (byte->hex-string md5 7) - "-" - ;; clock_seq_hi_and_reserved field: - (let ((byte (bytevector-u8-ref md5 8))) - (string (half-byte->hex-char - (logior #b1000 ; most significant bits are 10 - (bit-extract (quotient byte 16) 0 2))) - (half-byte->hex-char (modulo byte 16)))) - ;; clock_seq_low field: - (byte->hex-string md5 9) - "-" - ;; node field: - (byte->hex-string md5 10) - (byte->hex-string md5 11) - (byte->hex-string md5 12) - (byte->hex-string md5 13) - (byte->hex-string md5 14) - (byte->hex-string md5 15)))) diff --git a/visualize-schema.scm b/visualize-schema.scm index 92f9272..13448cc 100755 --- a/visualize-schema.scm +++ b/visualize-schema.scm @@ -22,7 +22,7 @@ (prefix "http://www.w3.org/1999/02/22-rdf-syntax-ns#")) (define gn - (prefix "http://genenetwork.org/")) + (prefix "http://rdf.genenetwork.org/v1/")) (define graph (@@ (ccwl graphviz) graph)) (define graph-node (@@ (ccwl graphviz) graph-node)) @@ -149,17 +149,17 @@ is a <table> object." (map (cut string=? <> "1") (string-split field-transformed #\,)))))) (sparql-query-records - "PREFIX gn: <http://genenetwork.org/> -SELECT SAMPLE(?tablename) SAMPLE(?size) GROUP_CONCAT(?fieldname ; separator=\",\") GROUP_CONCAT(?fieldtype ; separator=\",\") GROUP_CONCAT(EXISTS{ ?transform rdf:type gn:transform . 
?transform gn:dependsOn ?field .} ; separator=\",\") + "PREFIX gn: <http://rdf.genenetwork.org/v1/> +SELECT SAMPLE(?tablename) SAMPLE(?size) GROUP_CONCAT(?fieldname ; separator=\",\") GROUP_CONCAT(?fieldtype ; separator=\",\") GROUP_CONCAT(EXISTS{ ?transform rdf:type gn:transform . ?transform gn:depends_on ?field .} ; separator=\",\") WHERE { ?table rdf:type gn:sqlTable ; gn:name ?tablename ; - gn:hasSize ?size ; - gn:hasField ?field . - ?field rdf:type gn:sqlTableField ; + gn:has_size ?size ; + gn:has_field ?field . + ?field rdf:type gn:sql_table_field ; gn:name ?fieldname ; - gn:sqlFieldType ?fieldtype . + gn:sql_field_type ?fieldtype . } GROUP BY ?table"))) (define (foreign-key-graphviz-edges tables) @@ -233,20 +233,20 @@ properties." (string-split fields #\,)) ", ")))))) (sparql-query-records - "PREFIX gn: <http://genenetwork.org/> + "PREFIX gn: <http://rdf.genenetwork.org/v1/> SELECT ?type ?predicate GROUP_CONCAT(?tablename ; separator=\",\") GROUP_CONCAT(?fieldname ; separator=\",\") WHERE { ?predicate rdfs:domain ?type ; rdfs:range rdfs:Literal . ?transform rdf:type gn:transform ; - gn:createsPredicate ?predicate ; - gn:forSubjectType ?type ; - gn:dependsOn ?field . - ?field rdf:type gn:sqlTableField ; + gn:creates_predicate ?predicate ; + gn:for_subject_type ?type ; + gn:depends_on ?field . + ?field rdf:type gn:sql_table_field ; gn:name ?fieldname . ?table rdf:type gn:sqlTable ; - gn:hasField ?field ; + gn:has_field ?field ; gn:name ?tablename . } GROUP BY ?type ?predicate "))) |
