diff options
| -rwxr-xr-x | examples/genelist.scm | 22 | ||||
| -rwxr-xr-x | examples/generif.scm | 87 | ||||
| -rwxr-xr-x | examples/genotype-datasets.scm | 6 | ||||
| -rwxr-xr-x | examples/ontology.scm | 173 | ||||
| -rwxr-xr-x | examples/phenotype-datasets.scm | 27 | ||||
| -rwxr-xr-x | examples/phenotype.scm | 7 | ||||
| -rwxr-xr-x | examples/probesets-experiment-metadata.scm | 110 | ||||
| -rwxr-xr-x | examples/probesets.scm | 133 | ||||
| -rwxr-xr-x | load-rdf.scm | 10 | ||||
| -rw-r--r-- | manifest.scm | 3 | ||||
| -rw-r--r-- | transform/special-forms.scm | 356 | ||||
| -rw-r--r-- | transform/strings.scm | 24 | ||||
| -rw-r--r-- | transform/triples.scm | 3 |
13 files changed, 687 insertions, 274 deletions
diff --git a/examples/genelist.scm b/examples/genelist.scm index 5048bf2..ecd5cad 100755 --- a/examples/genelist.scm +++ b/examples/genelist.scm @@ -30,34 +30,34 @@ (gnc:transcript rdfs:domain gnc:gene_symbol) (gnt:transcript a owl:ObjectProperty) (gnc:transcript rdfs:comments "The gene transcript of this resource") - (gnc:ebi_gwas_link rdfs:Class gnc:ResourceLink) + (gnc:ebi_gwas_link rdfs:Class gnc:resource_link) (gnc:ebi_gwas_link rdfs:label "EBI GWAS") (gnc:ebi_gwas_link rdfs:comments "EBI GWAS") - (gnc:protein_atlas_link rdfs:Class gnc:ResourceLink) + (gnc:protein_atlas_link rdfs:Class gnc:resource_link) (gnc:protein_atlas_link rdfs:label "Protein Atlas") (gnc:protein_atlas_link rdfs:comments "Human Protein Atlas") - (gnc:genemania_link rdfs:Class gnc:ResourceLink) + (gnc:genemania_link rdfs:Class gnc:resource_link) (gnc:genemania_link rdfs:label "GeneMANIA") (gnc:genemania_link rdfs:comments "GeneMANIA") - (gnc:gemma_link rdfs:Class gnc:ResourceLink) + (gnc:gemma_link rdfs:Class gnc:resource_link) (gnc:gemma_link rdfs:label "Gemma") (gnc:gemma_link rdfs:comments "Meta-analysis of gene expression data") - (gnc:biogps_link rdfs:Class gnc:ResourceLink) + (gnc:biogps_link rdfs:Class gnc:resource_link) (gnc:biogps_link rdfs:label "BioGPS") (gnc:biogps_link rdfs:comments "Expression across many tissues and cell types") - (gnc:aba_link rdfs:Class gnc:ResourceLink) + (gnc:aba_link rdfs:Class gnc:resource_link) (gnc:aba_link rdfs:label "ABA") (gnc:aba_link rdfs:comments "Allen Brain Atlas") - (gnc:panther_link rdfs:Class gnc:ResourceLink) + (gnc:panther_link rdfs:Class gnc:resource_link) (gnc:panther_link rdfs:label "PANTHER") (gnc:panther_link rdfs:comments "Gene and protein data resources from Celera-ABI") - (gnc:panther_link rdfs:Class gnc:ResourceLink) + (gnc:panther_link rdfs:Class gnc:resource_link) (gnc:panther_link rdfs:label "STRING") (gnc:panther_link rdfs:comments "Protein interactions: known and inferred") - (gnc:gtex_link rdfs:Class gnc:ResourceLink) + (gnc:gtex_link rdfs:Class gnc:resource_link) (gnc:gtex_link rdfs:label "GTEx Portal") (gnc:gtex_link rdfs:comments "GTEx Portal") - (gnc:rgd_link rdfs:Class gnc:ResourceLink) + (gnc:rgd_link rdfs:Class gnc:resource_link) (gnc:rgd_link rdfs:label "Rat Genome DB") (gnc:rgd_link rdfs:comments "Rat Genome DB") (gnc:has_kg_id rdfs:domain gnc:gene_symbol) @@ -279,7 +279,7 @@ "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a" "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue=" (string-trim-both symbol) - "a gnc:PantherLink")) + "a gnc:panther_link")) ""))) (set dct:references (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol)))) diff --git a/examples/generif.scm b/examples/generif.scm index a4a2e4b..a8a8460 100755 --- a/examples/generif.scm +++ b/examples/generif.scm @@ -20,14 +20,16 @@ (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId") (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id") (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")) - "WHERE GeneRIF.display > 0 AND GeneRIF.comment IS NOT NULL -GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol") + "WHERE GeneRIF.display > 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol") (triples - (string->identifier - "wiki" (format #f "~a_~a" - (field GeneRIF Id) - (field GeneRIF versionId)) - #:separator "_") + (string->identifier "" + (gn-uuid (format #f "~a.~a.~a?type=wikii" + (field GeneRIF Id) + (field GeneRIF versionId) + (field GeneRIF createtime))) + #:url-char #\-) + (set dct:identifier (gn-uuid (format #f "~a?type=wiki" + (field GeneRIF Id)))) (set rdfs:label (string->symbol (format #f "'~a'@en" (replace-substrings @@ -56,8 +58,6 @@ GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol") ;; ((? string-blank? mbox) "") ;; (mbox (string->symbol ;; (format #f "<~a>" mbox))))) - (set dct:identifier (annotate-field (format #f "~s" (field GeneRIF Id)) - '^^xsd:integer)) (set foaf:homepage (match (sanitize-rdf-string (field GeneRIF weburl)) ((? string-blank? homepage) "") @@ -78,44 +78,32 @@ GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol") (left-join Species "USING (SpeciesId)"))) (triples (string->identifier - "rif" (format #f "~a_~a_~a_~a" - (field GeneRIF_BASIC GeneId) - (field GeneRIF_BASIC PubMed_ID) - (field ("DATE_FORMAT(createtime, '%Y-%m-%dT%T')" CreateTime)) - (field GeneRIF_BASIC VersionId)) - #:separator "_") - (set rdf:type - (let* ((comment (format #f "'~a'@en" - (replace-substrings - (sanitize-rdf-string - (field GeneRIF_BASIC comment)) - '(("\\" . "\\\\") - ("\n" . "\\n") - ("\r" . "\\r") - ("'" . "\\'"))))) - (create-time (format #f "~s^^xsd:datetime" - (field - ("CAST(createtime AS CHAR)" EntryCreateTime)))) - (symbol (field GeneRIF_BASIC symbol)) - (species (string->identifier "" (remap-species-identifiers (field Species Fullname)))) - (gene-id (field GeneRIF_BASIC GeneId)) - (taxon-id (field GeneRIF_BASIC TaxID TaxonomicId)) - (pmid (field GeneRIF_BASIC PubMed_ID)) - (version-id (field GeneRIF_BASIC versionId))) - (string->symbol - (string-append - (format #f "gnc:ncbi_wiki_entry ;\n") - (format #f "\trdfs:label ~a ;\n" comment) - (format #f "\tgnt:has_species ~a ;\n" species) - (format #f "\tgnt:symbol ~s ;\n" symbol) - (format #f "\tgnt:has_gene_id generif:~a ;\n" gene-id) - (match taxon-id - ((? number? x) - (format #f "\tskos:notation taxon:~a ;\n" taxon-id)) - (else "")) - (format #f "\tdct:hasVersion \"~a\"^^xsd:integer ;\n" version-id) - (format #f "\tdct:references pubmed:~a ;\n" pmid) - (format #f "\tdct:created ~a" create-time))))))) + "" (gn-uuid (format #f "~a_~a_~a_~a" + (field GeneRIF_BASIC GeneId) + (field GeneRIF_BASIC PubMed_ID) + (field ("DATE_FORMAT(createtime, '%Y-%m-%dT%T')" CreateTime)) + (field GeneRIF_BASIC VersionId))) + #:url-char #\-) + (set rdf:type 'gnc:ncbi_wiki_entry) + (set rdfs:label (format #f "'~a'@en" + (replace-substrings + (sanitize-rdf-string + (field GeneRIF_BASIC comment)) + '(("\\" . "\\\\") + ("\n" . "\\n") + ("\r" . "\\r") + ("'" . "\\'"))))) + (set gnt:symbol (field GeneRIF_BASIC symbol)) + (set gnt:has_species (string->identifier "" (remap-species-identifiers (field Species Fullname)))) + (set skos:notation (ontology 'taxon: (field GeneRIF_BASIC TaxID TaxonomicId))) + (set dct:hasVersion (annotate-field (field GeneRIF_BASIC versionId) '^^xsd:integer)) + (set gnt:has_gene_id (ontology 'generif: (field GeneRIF_BASIC GeneId))) + (set dct:references (ontology 'pubmed: (field GeneRIF_BASIC PubMed_ID))) + (set dct:created + (string->symbol + (format #f "~s^^xsd:datetime" + (field + ("CAST(createtime AS CHAR)" EntryCreateTime))))))) @@ -152,9 +140,8 @@ GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol") ("owl:" "<http://www.w3.org/2002/07/owl#>"))) (inputs (list - ;; gn-genewiki-entries - ncbi-genewiki-entries - )) + gn-genewiki-entries + ncbi-genewiki-entries)) (outputs `(#:documentation ,documentation #:rdf ,output)))) diff --git a/examples/genotype-datasets.scm b/examples/genotype-datasets.scm index ebe2349..38d524b 100755 --- a/examples/genotype-datasets.scm +++ b/examples/genotype-datasets.scm @@ -18,7 +18,7 @@ (tables (Species (inner-join InbredSet "ON InbredSet.SpeciesId = Species.Id") (inner-join GenoFreeze "ON GenoFreeze.InbredSetId = InbredSet.Id")) - "WHERE GenoFreeze.public > 0 AND Species.Name != 'monkey' GROUP BY Species.Name, GenoFreeze.ShortName") + "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND Species.Name != 'monkey' GROUP BY Species.Name, GenoFreeze.ShortName") (triples (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_") (multiset gnt:has_genotype_data (map (cut string->identifier "dataset" <> #:separator "_") @@ -31,7 +31,7 @@ (tables (GenoFreeze (inner-join InbredSet "ON InbredSet.Id = GenoFreeze.InbredSetId") (inner-join Species "ON InbredSet.SpeciesId = Species.Id")) - "WHERE GenoFreeze.public > 0 AND Species.Name != 'monkey'") + "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND Species.Name != 'monkey'") (triples (string->identifier "dataset" (field GenoFreeze Name) #:separator "_") (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")) (set dct:created (annotate-field (field GenoFreeze CreateTime) '^^xsd:datetime)))) @@ -41,7 +41,7 @@ (inner-join InbredSet "ON InbredSet.Id = GenoFreeze.InbredSetId") (inner-join Species "ON InbredSet.SpeciesId = Species.Id") (inner-join Geno "ON Geno.SpeciesId = Species.Id")) - "WHERE GenoFreeze.public > 0 AND Species.Name != 'monkey' GROUP BY GenoFreeze.Name") + "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND Species.Name != 'monkey' GROUP BY GenoFreeze.Name") (triples (string->identifier "dataset" (field GenoFreeze Name) #:separator "_") (set gnt:has_marker_count (string->symbol diff --git a/examples/ontology.scm b/examples/ontology.scm index f2b54cc..7ea9c4f 100755 --- a/examples/ontology.scm +++ b/examples/ontology.scm @@ -25,6 +25,7 @@ (prefix "gnc:" "<http://rdf.genenetwork.org/v1/category/>") (prefix "gnt:" "<http://rdf.genenetwork.org/v1/term/>") (prefix "obo:" "<http://purl.obolibrary.org/obo/>") + (prefix "bfo:" "<http://purl.obolibrary.org/obo/BFO_>") (prefix "sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>") (prefix "skos:" "<http://www.w3.org/2004/02/skos/core#>") (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") @@ -35,6 +36,28 @@ (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>") (prefix "schema:" "<https://schema.org/>") (newline) + (triple 'gnt:has_trait_page 'a 'owl:ObjectProperty) + (triple 'gnt:has_trait_page 'rdfs:label "has traits page") + (triple 'gnt:has_trait_page 'rdfs:comment "Links a trait resource to its GeneNetwork web interface page for interactive exploration.") + (triple 'gnt:has_trait_page 'skos:definition "Provides a resolvable HTTP link to the GeneNetwork trait interface for a given phenotype trait or dataset entry.") + (triple 'gnt:has_trait_page 'rdfs:domain 'gnc:phenotype_trait) + (triple 'gnt:has_trait_page 'rdfs:range 'gnc:resource_link) + (triple 'gnt:has_trait_page 'schema:domainIncludes 'gnc:phenotype) + (triple 'gnt:has_trait_page 'schema:domainIncludes 'dcat:Dataset) + (triple 'gnt:has_trait_page 'rdfs:subPropertyOf 'schema:url) + ;; Minimal BFO bridge for GN terms. + (triple 'gnc:resource_entity 'a 'owl:Class) + (triple 'gnc:resource_entity 'rdfs:label "GeneNetwork resource entity") + (triple 'gnc:resource_entity 'rdfs:subClassOf 'bfo:0000001) + (triple 'gnc:material_resource 'a 'owl:Class) + (triple 'gnc:material_resource 'rdfs:label "GeneNetwork material resource") + (triple 'gnc:material_resource 'rdfs:subClassOf 'bfo:0000040) + (triple 'gnc:material_resource 'rdfs:subClassOf 'gnc:resource_entity) + (triple 'gnc:information_resource 'a 'owl:Class) + (triple 'gnc:information_resource 'rdfs:label "GeneNetwork information resource") + (triple 'gnc:information_resource 'rdfs:subClassOf 'bfo:0000031) + (triple 'gnc:information_resource 'rdfs:subClassOf 'gnc:resource_entity) + (triple 'gnc:population_category 'a 'xkos:ClassificationLevel) (triple 'gnc:population_category 'rdfs:label "Population Category") (triple 'gnc:population_category 'skos:inScheme 'gnc:resource_classification_scheme) @@ -73,6 +96,10 @@ (triple 'gnc:taxonomic_family 'skos:prefLabel "Family") (triple 'gnc:taxonomic_family 'xkos:depth "1") (triple 'gnc:taxonomic_family 'xkos:nextLevel 'gnc:species) + (triple 'gnc:strain 'a 'owl:Class) + (triple 'gnc:strain 'rdfs:subClassOf 'gnc:material_resource) + (triple 'gnc:mapping_method 'a 'skos:ConceptScheme) + (triple 'gnc:avg_method 'a 'skos:ConceptScheme) (triple 'gnt:assigned_species 'a 'owl:ObjectProperty) (triple 'gnt:assigned_species 'rdfs:domain 'gnc:set) (triple 'gnt:assigned_species 'rdfs:label "These families have been assigned to these species") @@ -82,7 +109,6 @@ (triple 'gnt:genetic_type 'rdfs:range 'xsd:string) (triple 'gnt:genetic_type 'skos:definition "Describes the genetic architecture of a resource set (e.g., intercross, riset).") (triple 'gnt:has_family_order_id 'a 'owl:DatatypeProperty) - (triple 'gnt:has_family_order_id 'a 'owl:DatatypeProperty) (triple 'gnt:has_family_order_id 'rdfs:range 'xsd:integer) (triple 'gnt:has_set_code 'a 'owl:DatatypeProperty) (triple 'gnt:has_set_code 'rdfs:domain 'gnc:set) @@ -118,12 +144,27 @@ (triple 'gnt:uses_mapping_method 'rdfs:domain 'gnc:set) (triple 'gnt:uses_mapping_method 'rdfs:label "mapping method") (triple 'gnt:uses_mapping_method 'rdfs:range 'gnc:mapping_method) + (triple 'gnt:has_reference_population 'a 'owl:ObjectProperty) + (triple 'gnt:has_reference_population 'schema:domainIncludes 'gnc:set) + (triple 'gnt:has_reference_population 'schema:domainIncludes 'gnc:population_category) + (triple 'gnt:has_reference_population 'rdfs:range 'gnc:reference_population) + (triple 'gnt:has_population_order_id 'a 'owl:DatatypeProperty) + (triple 'gnt:has_population_order_id 'rdfs:domain 'gnc:reference_population) + (triple 'gnt:has_population_order_id 'rdfs:range 'xsd:integer) + (triple 'gnt:alias 'a 'owl:DatatypeProperty) + (triple 'gnt:alias 'rdfs:domain 'gnc:strain) + (triple 'gnt:gene 'a 'owl:DatatypeProperty) + (triple 'gnt:gene 'rdfs:domain 'gnc:gene_symbol) ;; Describing Datasets (triple 'gnc:molecular_trait 'a 'owl:Class) (triple 'gnc:molecular_trait 'a 'skos:Concept) (triple 'gnc:molecular_trait 'rdfs:label "Molecular Trait. This describes a melecular trait of a given species. We combine the species name and the tissue name in order to differentiate the traits across different inbredset groups.") (triple 'gnc:molecular_trait 'rdfs:subClassOf 'obo:UBERON_0000479) + (triple 'gnc:molecular_trait 'rdfs:subClassOf 'gnc:information_resource) + (triple 'gnc:molecular_trait_metadata 'a 'owl:Class) + (triple 'gnc:molecular_trait_metadata 'rdfs:subClassOf 'gnc:information_resource) + (triple 'gnc:gene_chip 'a 'skos:ConceptScheme) (triple 'gnt:has_case_info 'a 'owl:ObjectProperty) (triple 'gnt:has_case_info 'rdfs:comment "Information about the cases used in this platform") (triple 'gnt:has_case_info 'rdfs:domain 'dcat:Dataset) @@ -163,6 +204,9 @@ (triple 'gnt:has_phenotype_data 'rdfs:label "this resources has this phenotype data.") (triple 'gnt:has_phenotype_data 'rdfs:range 'dcat:Dataset) (triple 'gnt:has_phenotype_data 'rdfs:subPropertyOf 'dct:relation) + (triple 'gnt:has_phenotype_trait 'a 'owl:ObjectProperty) + (triple 'gnt:has_phenotype_trait 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_phenotype_trait 'rdfs:range 'gnc:phenotype_trait) (triple 'gnt:has_platform_info 'a 'owl:ObjectProperty) (triple 'gnt:has_platform_info 'rdfs:comment "Information about the platform that was used with this dataset") (triple 'gnt:has_platform_info 'rdfs:domain 'dcat:Dataset) @@ -194,21 +238,32 @@ (triple 'gnt:uses_normalization_method 'rdfs:domain 'dcat:Dataset) (triple 'gnt:uses_normalization_method 'rdfs:label "Averaging method used for the molecular traits in this dataset.") (triple 'gnt:uses_normalization_method 'rdfs:range 'gnc:avg_method) + (triple 'gnt:has_probeset 'a 'owl:ObjectProperty) + (triple 'gnt:has_probeset 'rdfs:domain 'gnc:molecular_trait_metadata) + (triple 'gnt:has_probeset 'rdfs:range 'gnc:probeset) ;; Describing phenotypes (triple 'gnc:phenotype 'a 'owl:Class) (triple 'gnc:phenotype 'a 'skos:Concept) (triple 'gnc:phenotype 'rdfs:label "A phenotype.") + (triple 'gnc:phenotype 'rdfs:subClassOf 'gnc:information_resource) (triple 'gnc:phenotype_trait 'a 'owl:Class) (triple 'gnc:phenotype_trait 'a 'skos:Concept) (triple 'gnc:phenotype_trait 'rdfs:label "A phenotype trait.") - (triple 'gnt:abbreviation 'a 'owl:ObjectProperty) + (triple 'gnc:phenotype_trait 'rdfs:subClassOf 'gnc:information_resource) + (triple 'gnt:abbreviation 'a 'owl:DatatypeProperty) (triple 'gnt:abbreviation 'rdfs:domain 'gnc:phenotype) (triple 'gnt:abbreviation 'skos:definition "The abbreviation used for this resource") + (triple 'gnt:has_phenotype 'a 'owl:ObjectProperty) + (triple 'gnt:has_phenotype 'rdfs:domain 'gnc:phenotype_trait) + (triple 'gnt:has_phenotype 'rdfs:range 'gnc:phenotype) (triple 'gnt:additive 'rdfs:domain 'gnc:phenotype) (triple 'gnt:additive 'rdfs:range 'xsd:double) - (triple 'gnt:lab_code 'a 'owl:ObjectProperty) + (triple 'gnt:lab_code 'a 'owl:DatatypeProperty) (triple 'gnt:lab_code 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:has_lab_code 'a 'owl:DatatypeProperty) + (triple 'gnt:has_lab_code 'rdfs:domain 'gnc:phenotype) + (triple 'gnt:has_lab_code 'rdfs:subPropertyOf 'gnt:lab_code) (triple 'gnt:locus 'a 'qb:MeasureProperty) (triple 'gnt:locus 'a 'rdf:Property) (triple 'gnt:locus 'rdfs:domain 'gnc:phenotype) @@ -226,9 +281,24 @@ (triple 'gnt:mean 'rdfs:domain 'gnc:phenotype) (triple 'gnt:mean 'rdfs:range 'xsd:double) (triple 'gnt:mean 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:se 'a 'qb:MeasureProperty) + (triple 'gnt:se 'a 'rdf:Property) + (triple 'gnt:se 'rdfs:domain 'gnc:molecular_trait_metadata) + (triple 'gnt:se 'rdfs:range 'xsd:double) + (triple 'gnt:se 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:pvalue 'a 'qb:MeasureProperty) + (triple 'gnt:pvalue 'a 'rdf:Property) + (triple 'gnt:pvalue 'rdfs:domain 'gnc:molecular_trait_metadata) + (triple 'gnt:pvalue 'rdfs:range 'xsd:double) + (triple 'gnt:pvalue 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:h2 'a 'qb:MeasureProperty) + (triple 'gnt:h2 'a 'rdf:Property) + (triple 'gnt:h2 'rdfs:domain 'gnc:molecular_trait_metadata) + (triple 'gnt:h2 'rdfs:range 'xsd:double) + (triple 'gnt:h2 'rdfs:subPropertyOf 'sdmx-measure:obsValue) (triple 'gnt:sequence 'rdfs:domain 'gnc:phenotype) (triple 'gnt:sequence 'rdfs:range 'xsd:integer) - (triple 'gnt:submitter 'a 'owl:ObjectProperty) + (triple 'gnt:submitter 'a 'owl:DatatypeProperty) (triple 'gnt:submitter 'rdfs:domain 'gnc:phenotype) (triple 'gnt:submitter 'skos:definition "A person who submitted this resource to GN") (triple 'gnt:submitter 'skos:definition "A person who submitted this resource to GN") @@ -240,6 +310,9 @@ (triple 'gnc:dna_marker 'a 'owl:Class) (triple 'gnc:dna_marker 'a 'skos:Concept) (triple 'gnc:dna_marker 'rdfs:label "A DNA Marker or SNP") + (triple 'gnc:dna_marker 'rdfs:subClassOf 'gnc:material_resource) + (triple 'gnc:marker 'a 'owl:Class) + (triple 'gnc:marker 'rdfs:subClassOf 'gnc:dna_marker) (triple 'gnt:has_genotype_files 'rdfs:label "This resource has these genotype files") (triple 'gnt:has_genotype_files 'rdfs:domain 'dcat:Dataset) (triple 'gnt:has_genotype_data 'rdf:type 'owl:ObjectProperty) @@ -248,16 +321,19 @@ (triple 'gnt:has_genotype_data 'rdfs:domain 'gnc:set) (triple 'gnt:has_genotype_data 'rdfs:range 'dcat:Dataset) (triple 'gnt:has_genotype_data 'rdfs:subPropertyOf 'dct:relation) - (triple 'gnt:has_marker_count 'rdf:type 'owl:ObjectProperty) + (triple 'gnt:has_marker_count 'rdf:type 'owl:DatatypeProperty) (triple 'gnt:has_marker_count 'rdfs:label "this resources has N number of dna markers/SNPs.") - (triple 'gnt:has_marker_count 'rdfs:domain 'xsd:integer) - (triple 'gnt:has_marker_count 'rdfs:range 'dcat:Dataset) + (triple 'gnt:has_marker_count 'rdfs:domain 'dcat:Dataset) + (triple 'gnt:has_marker_count 'rdfs:range 'xsd:integer) (triple 'gnt:chr 'a 'qb:MeasureProperty) (triple 'gnt:chr 'a 'rdf:Property) (triple 'gnt:chr 'rdfs:label "Chromosome") (triple 'gnt:chr 'rdfs:domain 'gnc:marker) (triple 'gnt:chr 'rdfs:range 'rdfs:Literal) (triple 'gnt:chr 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnt:chromosome 'a 'owl:DatatypeProperty) + (triple 'gnt:chromosome 'rdfs:subPropertyOf 'gnt:chr) + (triple 'gnt:chromosome 'rdfs:range 'rdfs:Literal) (triple 'gnt:mb 'rdfs:label "Megabase") (triple 'gnt:mb 'rdfs:domain 'gnc:marker) (triple 'gnt:mb 'rdfs:range 'rdfs:Literal) @@ -270,31 +346,39 @@ (triple 'gnt:source 'rdfs:domain 'gnc:marker) (triple 'gnt:source 'rdfs:range 'rdfs:Literal) (triple 'gnt:source 'rdfs:subPropertyOf 'sdmx-measure:obsValue) + (triple 'gnc:nucleotide 'a 'owl:Class) + (triple 'gnc:nucleotide 'rdfs:subClassOf 'gnc:material_resource) + (triple 'gnt:has_sequence 'a 'owl:DatatypeProperty) + (triple 'gnt:has_sequence 'rdfs:domain 'gnc:nucleotide) + (triple 'gnt:has_sequence 'rdfs:range 'xsd:string) ;; Probesets (triple 'gnc:probeset 'a 'owl:Class) (triple 'gnc:probeset 'a 'skos:Concept) (triple 'gnc:probeset 'rdfs:label "A probeset") + (triple 'gnc:probeset 'rdfs:subClassOf 'gnc:material_resource) (triple 'gnt:has_target_id 'a 'owl:ObjectProperty) (triple 'gnt:has_target_id 'rdfs:label "The target id for this probeset") (triple 'gnt:has_target_id 'rdfs:domain 'gnc:probeset) - (triple 'gnt:symbol 'a 'owl:ObjectProperty) + (triple 'gnt:symbol 'a 'owl:DatatypeProperty) (triple 'gnt:symbol 'rdfs:domain 'gnc:probeset) - (triple 'gnt:targets_region 'a 'owl:ObjectProperty) + (triple 'gnt:targets_region 'a 'owl:DatatypeProperty) (triple 'gnt:targets_region 'rdfs:label "The target region") (triple 'gnt:targets_region 'rdfs:domain 'gnc:probeset) + (triple 'gnt:mb_mm8 'a 'owl:DatatypeProperty) (triple 'gnt:mb_mm8 'rdfs:domain 'gnc:probeset) - (triple 'gnt:has_specificity 'a 'owl:ObjectProperty) + (triple 'gnt:mb_mm8 'rdfs:range 'xsd:double) + (triple 'gnt:has_specificity 'a 'owl:DatatypeProperty) (triple 'gnt:has_specificity 'rdfs:domain 'gnc:probeset) - (triple 'gnt:has_blat_score 'a 'owl:ObjectProperty) + (triple 'gnt:has_blat_score 'a 'owl:DatatypeProperty) (triple 'gnt:has_blat_score 'rdfs:domain 'gnc:probeset) - (triple 'gnt:has_blat_mb_start 'a 'owl:ObjectProperty) + (triple 'gnt:has_blat_mb_start 'a 'owl:DatatypeProperty) (triple 'gnt:has_blat_mb_start 'rdfs:domain 'gnc:probeset) - (triple 'gnt:has_blat_mb_end 'a 'owl:ObjectProperty) + (triple 'gnt:has_blat_mb_end 'a 'owl:DatatypeProperty) (triple 'gnt:has_blat_mb_end 'rdfs:domain 'gnc:probeset) - (triple 'gnt:has_blat_seq 'a 'owl:ObjectProperty) + (triple 'gnt:has_blat_seq 'a 'owl:DatatypeProperty) (triple 'gnt:has_blat_seq 'rdfs:domain 'gnc:probeset) - (triple 'gnt:has_target_seq 'a 'owl:ObjectProperty) + (triple 'gnt:has_target_seq 'a 'owl:DatatypeProperty) (triple 'gnt:has_target_seq 'rdfs:domain 'gnc:probeset) (triple 'gnt:has_homologene_id 'a 'owl:ObjectProperty) (triple 'gnt:has_homologene_id 'rdfs:domain 'gnc:probeset) @@ -310,15 +394,70 @@ (triple 'gnt:has_chebi_id 'rdfs:domain 'gnc:probeset) ;; RIF + (triple 'gnc:gene 'a 'rdfs:Class) + (triple 'gnc:gene 'rdfs:subClassOf 'gnc:material_resource) + (triple 'gnc:gene_symbol 'a 'rdfs:Class) + (triple 'gnc:gene_symbol 'rdfs:subClassOf 'gnc:information_resource) + (triple 'gnc:transcript 'a 'rdfs:Class) + (triple 'gnc:transcript 'rdfs:subClassOf 'gnc:information_resource) + (triple 'gnc:resource_link 'a 'rdfs:Class) + (triple 'gnc:aba_link 'rdfs:subClassOf 'gnc:resource_link) + (triple 'gnc:biogps_link 'rdfs:subClassOf 'gnc:resource_link) + (triple 'gnc:ebi_gwas_link 'rdfs:subClassOf 'gnc:resource_link) + (triple 'gnc:gemma_link 'rdfs:subClassOf 'gnc:resource_link) + (triple 'gnc:genemania_link 'rdfs:subClassOf 'gnc:resource_link) + (triple 'gnc:gtex_link 'rdfs:subClassOf 'gnc:resource_link) + (triple 'gnc:panther_link 'rdfs:subClassOf 'gnc:resource_link) + (triple 'gnc:protein_atlas_link 'rdfs:subClassOf 'gnc:resource_link) + (triple 'gnc:rgd_link 'rdfs:subClassOf 'gnc:resource_link) + (triple 'gnc:has_kg_id 'a 'owl:DatatypeProperty) + (triple 'gnc:has_kg_id 'rdfs:domain 'gnc:gene) + (triple 'gnc:has_unigen_id 'a 'owl:DatatypeProperty) + (triple 'gnc:has_unigen_id 'rdfs:domain 'gnc:gene) + (triple 'gnc:has_protein_id 'a 'owl:DatatypeProperty) + (triple 'gnc:has_protein_id 'rdfs:domain 'gnc:gene) + (triple 'gnc:has_align_id 'a 'owl:DatatypeProperty) + (triple 'gnc:has_align_id 'rdfs:domain 'gnc:gene) (triple 'gnc:gene_wiki_entry 'a 'rdfs:Class) + (triple 'gnc:gene_wiki_entry 'rdfs:subClassOf 'gnc:information_resource) (triple 'gnc:gn_wiki_entry 'rdfs:subClassOf 'gnc:gene_wiki_entry) - (triple 'gnt:initial 'a 'owl:ObjectProperty) + (triple 'gnt:initial 'a 'owl:DatatypeProperty) (triple 'gnt:initial 'rdfs:domain 'gnc:gene_wiki_entry) (triple 'gnt:initial 'skos:definition "Optional user or project code or your initials") - (triple 'gnt:reason 'a 'owl:ObjectProperty) + (triple 'gnt:reason 'a 'owl:DatatypeProperty) (triple 'gnt:reason 'rdfs:domain 'gnc:gene_wiki_entry) (triple 'gnt:reason 'skos:definition "The reason why this resource was modified") + (triple 'gnt:belongs_to_category 'a 'owl:DatatypeProperty) + (triple 'gnt:belongs_to_category 'rdfs:domain 'gnc:gene_wiki_entry) + (triple 'gnt:has_gene_id 'a 'owl:ObjectProperty) + (triple 'gnt:has_gene_id 'schema:domainIncludes 'gnc:gene) + (triple 'gnt:has_gene_id 'schema:domainIncludes 'gnc:ncbi_wiki_entry) + (triple 'gnt:gene_symbol 'a 'owl:DatatypeProperty) + (triple 'gnt:gene_symbol 'rdfs:domain 'gnc:gene) (triple 'gnc:gn_wiki_entry 'rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork") (triple 'gnt:gene_symbol 'rdfs:domain 'gnc:gn_wiki_entry) + (triple 'gnt:transcript 'a 'owl:ObjectProperty) + (triple 'gnt:transcript 'rdfs:domain 'gnc:gene) + (triple 'gnt:transcript 'rdfs:range 'gnc:transcript) + (triple 'gnt:strand 'a 'owl:DatatypeProperty) + (triple 'gnt:strand 'rdfs:domain 'gnc:gene) + (triple 'gnt:strand 'rdfs:range 'xsd:string) + (triple 'gnt:tx_start 'a 'owl:DatatypeProperty) + (triple 'gnt:tx_start 'rdfs:domain 'gnc:gene) + (triple 'gnt:tx_start 'rdfs:range 'xsd:double) + (triple 'gnt:tx_end 'a 'owl:DatatypeProperty) + (triple 'gnt:tx_end 'rdfs:domain 'gnc:gene) + (triple 'gnt:tx_end 'rdfs:range 'xsd:double) + (triple 'gnt:has_align_id 'a 'owl:DatatypeProperty) + (triple 'gnt:has_align_id 'rdfs:domain 'gnc:gene) + (triple 'gnt:has_protein_id 'a 'owl:DatatypeProperty) + (triple 'gnt:has_protein_id 'rdfs:domain 'gnc:gene) + (triple 'gnt:has_rgd_id 'a 'owl:DatatypeProperty) + (triple 'gnt:has_rgd_id 'rdfs:domain 'gnc:gene) + (triple 'gnt:has_geo_series_id 'a 'owl:ObjectProperty) + (triple 'gnt:has_geo_series_id 'rdfs:domain 'skos:Concept) + (triple 'gnt:has_go_tree_value 'a 'owl:DatatypeProperty) + (triple 'gnt:has_go_tree_value 'rdfs:domain 'skos:Concept) + (triple 'gnt:has_go_tree_value 'rdfs:range 'xsd:string) (triple 'gnc:ncbi_wiki_entry 'rdfs:subClassOf 'gnc:gene_wiki_entry) (triple 'gnc:ncbi_wiki_entry 'rdfs:comment "Represents GeneRIF Entries obtained from NCBI")))) diff --git a/examples/phenotype-datasets.scm b/examples/phenotype-datasets.scm index 4819627..c005621 100755 --- a/examples/phenotype-datasets.scm +++ b/examples/phenotype-datasets.scm @@ -18,7 +18,7 @@ (tables (Species (inner-join InbredSet "ON InbredSet.SpeciesId = Species.Id") (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id")) - "WHERE PublishFreeze.public > 0 AND Species.Name != 'monkey' GROUP BY Species.Name, PublishFreeze.ShortName") + "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND Species.Name != 'monkey' GROUP BY Species.Name, PublishFreeze.ShortName") (triples (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_") (multiset gnt:has_phenotype_data (map (cut string->identifier "dataset" <> #:separator "_") @@ -27,27 +27,6 @@ dataset_name)) #\,))))) -(define-transformer gn:dataset->gn:set - (tables (Datasets - (inner-join InfoFiles "ON InfoFiles.DatasetId = Datasets.DatasetId") - (inner-join InbredSet "ON InbredSet.Id = InfoFiles.InbredSetId") - (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id")) - "WHERE PublishFreeze.public > 0 GROUP BY Datasets.DatasetId") - (triples (string->identifier "dataset" (field PublishFreeze Name) #:separator "_") - (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) - -(define-transformer gn:dataset->metadata - (tables (PublishXRef - (inner-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") - (inner-join Species "ON InbredSet.SpeciesId = Species.Id") - (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id") - (inner-join Publication "ON Publication.Id = PublishXRef.PublicationId") - (inner-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId")) - "WHERE InbredSet.public > 0 GROUP BY Species.Name, PublishFreeze.Name") - (triples (string->identifier "dataset" (field PublishFreeze Name) #:separator "_") - (set dct:created (annotate-field (field PublishFreeze CreateTime) '^^xsd:datetime)) - (set gnt:has_strain (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) - (define-transformer gn:dataset->gn:trait (tables (PublishXRef (inner-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") @@ -55,7 +34,7 @@ (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id") (inner-join Publication "ON Publication.Id = PublishXRef.PublicationId") (inner-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId")) - "WHERE InbredSet.public > 0") + "WHERE InbredSet.public > 0 AND PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1") (triples (string->identifier "dataset" (field PublishFreeze Name) #:separator "_") (set gnt:has_phenotype_trait (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) @@ -101,8 +80,6 @@ (inputs (list gn:set->gn:dataset - gn:dataset->gn:set - gn:dataset->metadata gn:dataset->gn:trait)) (outputs `(#:documentation ,documentation diff --git a/examples/phenotype.scm b/examples/phenotype.scm index c2564b6..70deed7 100755 --- a/examples/phenotype.scm +++ b/examples/phenotype.scm @@ -71,7 +71,7 @@ (inner-join PublishFreeze "ON PublishFreeze.InbredSetId = InbredSet.Id") (left-join Publication "ON Publication.Id = PublishXRef.PublicationId") (left-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId")) - "WHERE InbredSet.public > 0") + "WHERE InbredSet.public > 0 AND PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1") (triples (let ((post-abbrev (blank-p (field Phenotype Post_publication_abbreviation))) (pre-abbrev (blank-p (field Phenotype Pre_publication_abbreviation))) (post-desc (blank-p (field Phenotype Post_publication_description))) @@ -86,6 +86,11 @@ (set owl:equivalentClass (field ("CONCAT(PublishFreeze.Name, '_', PublishXRef.Id)" PublishFreeze))) + (set gnt:has_trait_page + (string->symbol + (format #f "<https://genenetwork.org/show_trait?trait_id=~a&dataset=~a>" + (field PublishXRef Id) + (field PublishFreeze Name)))) (set dcat:distribution (string->symbol (format #f "gnd:~a.json" diff --git a/examples/probesets-experiment-metadata.scm b/examples/probesets-experiment-metadata.scm new file mode 100755 index 0000000..4bab425 --- /dev/null +++ b/examples/probesets-experiment-metadata.scm @@ -0,0 +1,110 @@ +#! /usr/bin/env guile +!# + +(use-modules (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms) + (web uri)) + + +(define-transformer probesetxref->metadata + (tables (ProbeSetXRef + (inner-join ProbeSetFreeze "ON ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id") + (inner-join ProbeSet "ON ProbeSet.Id = ProbeSetXRef.ProbeSetId")) + "WHERE ProbeSetFreeze.public > 0 AND ProbeSetFreeze.confidentiality < 1") + (triples (string->identifier + "probeset_data" + (uri-encode + (format #f "~a_~a" (field ProbeSetFreeze Name ProbeSetFreezeName) (field ProbeSet Name ProbeSetName))) #:separator "_") + (set rdf:type 'gnc:molecular_trait_metadata) + ;; KLUDGE: Agree with Alex on how we want to name this. + ;; (set dcat:distribution + ;; (string->symbol + ;; (sanitize-rdf-string + ;; (format #f "gnd:~a.json" + ;; (field ("CONCAT(ProbeSetFreeze.Name, '_', ProbeSet.Name)" + ;; PublishFreeze))))) ) + (set gnt:has_trait_page + (string->symbol + (format #f "<https://genenetwork.org/show_trait?trait_id=~a&dataset=~a>" + (field ProbeSet Name) + ;; GTEx_Lung _0414 + (uri-encode + (field ProbeSetFreeze Name ProbeSetFreezeName))))) + (set gnt:has_probeset (string->identifier "probeset" (field ProbeSet Name ProbeSetName))) + (set dcat:isPartOf (string->identifier "dataset" (field ProbeSetFreeze Name ProbeSetFreezeName) + #:separator "_")) + (set gnt:mean (annotate-field (field ("IFNULL(ProbeSetXRef.mean, '')" mean)) + '^^xsd:double)) + (set gnt:se (annotate-field (field ("IFNULL(ProbeSetXRef.se, '')" se)) + '^^xsd:double)) + (set gnt:locus (sanitize-rdf-string (field ProbeSetXRef Locus))) + (set gnt:lod_score (annotate-field + (field ("IFNULL((ProbeSetXRef.LRS/4.604), '')" lrs)) + '^^xsd:double)) + (set gnt:pvalue (annotate-field + (field ("IFNULL((ProbeSetXRef.pValue), '')" pValue)) + '^^xsd:double)) + (set gnt:additive (annotate-field + (field ("IFNULL((ProbeSetXRef.additive), '')" additive)) + '^^xsd:double)) + (set gnt:h2 (annotate-field + (field ("IFNULL((ProbeSetXRef.h2), '')" h2)) + '^^xsd:double)))) + + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (call-with-target-database + %connection-settings + (lambda (db) + (with-documentation + (name "ProbeSet Experiments Metadata") + (connection %connection-settings) + (table-metadata? #f) + (total-rows (assoc-ref + (sql-find db "SELECT count(*) AS count from ProbeSetXRef") + "count")) + (rows-per-chunk 1000000) + (prefixes + '(("dcat:" "<http://www.w3.org/ns/dcat#>") + ("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("gnd:" "<https://cd.genenetwork.org/api3/lmdb/v1/data/traits/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("kegg:" "<http://bio2rdf.org/ns/kegg#>") + ("pubchem:" "<https://pubchem.ncbi.nlm.nih.gov/>") + ("omim:" "<https://www.omim.org/entry/>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("uniprot:" "<http://purl.uniprot.org/uniprot/>") + ("chebi:" "<http://purl.obolibrary.org/obo/CHEBI_>") + ("dcat:" "<http://www.w3.org/ns/dcat#>") + ("dct:" "<http://purl.org/dc/terms/>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("homologene:" "<https://bio2rdf.org/homologene:>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") + ("qb:" "<http://purl.org/linked-data/cube#>") + ("sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>"))) + (inputs + (list probesetxref->metadata)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))))) diff --git a/examples/probesets.scm b/examples/probesets.scm new file mode 100755 index 0000000..97e5753 --- /dev/null +++ b/examples/probesets.scm @@ -0,0 +1,133 @@ +#! /usr/bin/env guile +!# + +(use-modules (srfi srfi-1) + (srfi srfi-26) + (ice-9 getopt-long) + (ice-9 match) + (ice-9 regex) + (transform strings) + (transform sql) + (transform triples) + (transform special-forms) + (web uri)) + +(define-transformer probeset->metadata + (tables (ProbeSet + (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")) + "WHERE ProbeSet.Name IS NOT NULL AND TRIM(ProbeSet.Name) != ''") + (triples + (string->identifier "probeset" (field ProbeSet Name)) + (set rdf:type 'gnc:probeset) + (set skos:prefLabel (field ProbeSet Name)) + (multiset skos:altLabel + (map string-trim-both + (string-split (sanitize-rdf-string (field ProbeSet alias)) #\;))) + (set gnt:uses_genechip (string->identifier "platform" (field GeneChip Name) #:separator "_")) + (set gnt:has_target_id (string-trim-both (sanitize-rdf-string (field ProbeSet TargetId)))) + (set gnt:symbol (string-trim-both (field ProbeSet Symbol))) + (set dct:description (sanitize-rdf-string (field ProbeSet description))) + (set gnt:targets_region (string-trim-both (sanitize-rdf-string (field ProbeSet Probe_set_target_region)))) + (set gnt:chr (field ProbeSet Chr)) + (set gnt:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) + (set gnt:mb_mm8 (annotate-field (field ("IFNULL(ProbeSet.Mb_mm8, '')" Mb_mm8)) + '^^xsd:double)) + (set gnt:has_specificity + (field ("IFNULL(ProbeSet.Probe_set_specificity, '')" + Probe_set_specificity))) + (set gnt:has_blat_score + (field ("IFNULL(ProbeSet.Probe_set_BLAT_score, '')" + Probe_set_BLAT_score))) + (set gnt:has_blat_mb_start + (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_start, '')" + Probe_set_Blat_Mb_start)) + '^^xsd:double)) + (set gnt:has_blat_mb_end + (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_end, '')" + Probe_set_Blat_Mb_end)) + '^^xsd:double)) + (set gnt:has_blat_seq (sanitize-rdf-string (field ProbeSet BlatSeq))) + (set gnt:has_target_seq (sanitize-rdf-string (field ProbeSet TargetSeq))) + (set gnt:has_homologene_id (ontology 'homologene: + (uri-encode + (field ("IFNULL(ProbeSet.HomoloGeneID, '')" + HomoloGeneID))))) + (set gnt:has_uniprot_id (ontology 'uniprot: + (uri-encode + (field ("IFNULL(ProbeSet.UniProtID, '')" + UniProtID))))) + (set gnt:has_pub_chem_id (ontology + 'pubchem: + (uri-encode + (field ("IFNULL(ProbeSet.PubChem_ID, '')" + PubChem_ID))))) + (set gnt:has_kegg_id (ontology + 'kegg: + (uri-encode + (field ("IFNULL(ProbeSet.KEGG_ID, '')" + KEGG_ID))))) + (set gnt:has_omim_id (ontology + 'omim: + (uri-encode + (let ((omim (field ("IFNULL(ProbeSet.OMIM, '')" + OMIM)))) + (if (number? omim) + omim + (regexp-substitute/global + #f "[^0-9]" + omim + 'pre "" 'post)))))) + (set gnt:has_chebi_id (ontology + 'chebi: + (uri-encode + (field ("IFNULL(ProbeSet.ChEBI_ID, '')" + ChEBI_ID))))))) + + + + +(let* ((option-spec + '((settings (single-char #\s) (value #t)) + (output (single-char #\o) (value #t)) + (documentation (single-char #\d) (value #t)))) + (options (getopt-long (command-line) option-spec)) + (settings (option-ref options 'settings #f)) + (output (option-ref options 'output #f)) + (documentation (option-ref options 'documentation #f)) + (%connection-settings + (call-with-input-file settings + read))) + (call-with-target-database + %connection-settings + (lambda (db) + (with-documentation + (name "ProbeSet Metadata") + (connection %connection-settings) + (table-metadata? #f) + (total-rows (assoc-ref + (sql-find db "SELECT count(*) AS count from ProbeSet") + "count")) + (rows-per-chunk 1000000) + (prefixes + '(("gn:" "<http://rdf.genenetwork.org/v1/id/>") + ("gnc:" "<http://rdf.genenetwork.org/v1/category/>") + ("gnt:" "<http://rdf.genenetwork.org/v1/term/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("kegg:" "<http://bio2rdf.org/ns/kegg#>") + ("pubchem:" "<https://pubchem.ncbi.nlm.nih.gov/>") + ("omim:" "<https://www.omim.org/entry/>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("uniprot:" "<http://purl.uniprot.org/uniprot/>") + ("chebi:" "<http://purl.obolibrary.org/obo/CHEBI_>") + ("dct:" "<http://purl.org/dc/terms/>") + ("owl:" "<http://www.w3.org/2002/07/owl#>") + ("homologene:" "<https://bio2rdf.org/homologene:>") + ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") + ("qb:" "<http://purl.org/linked-data/cube#>") + ("sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>") + ("skos:" "<http://www.w3.org/2004/02/skos/core#>"))) + (inputs + (list probeset->metadata)) + (outputs + `(#:documentation ,documentation + #:rdf ,output)))))) diff --git a/load-rdf.scm b/load-rdf.scm index 2ef79ac..4acce8a 100755 --- a/load-rdf.scm +++ b/load-rdf.scm @@ -45,7 +45,8 @@ authenticating as the dba user with PASSWORD." (format out "SET DSN=localhost:~a; SET PWD=~s; -DELETE FROM rdf_quad WHERE g = iri_to_id ('~a');" +DELETE FROM rdf_quad WHERE g = iri_to_id ('~a'); +CHECKPOINT;" port password graph)) @@ -59,7 +60,8 @@ DELETE FROM rdf_quad WHERE g = iri_to_id ('~a');" (format out "SET DSN=localhost:~a; SET PWD=~s; -DELETE FROM DB.DBA.load_list;" +DELETE FROM DB.DBA.load_list; +CHECKPOINT;" port password)) OPEN_WRITE @@ -101,7 +103,6 @@ DB.DBA.XML_SET_NS_DECL ('gnc', 'http://rdf.genenetwork.org/v1/category/', 2); DB.DBA.XML_SET_NS_DECL ('gnt', 'http://rdf.genenetwork.org/v1/term/', 2); DB.DBA.XML_SET_NS_DECL ('ncbiTaxon', 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=', 2); DB.DBA.XML_SET_NS_DECL ('prism', 'http://prismstandard.org/namespaces/basic/2.0/', 2); -DB.DBA.XML_SET_NS_DECL ('probeset', 'http://rdf.genenetwork.org/v1/probeset/', 2); DB.DBA.XML_SET_NS_DECL ('pubmed', 'http://rdf.ncbi.nlm.nih.gov/pubmed/', 2); DB.DBA.XML_SET_NS_DECL ('qb', 'http://purl.org/linked-data/cube#', 2); DB.DBA.XML_SET_NS_DECL ('sdmx-measure', 'http://purl.org/linked-data/sdmx/2009/measure#', 2); @@ -111,8 +112,10 @@ DB.DBA.XML_SET_NS_DECL ('v', 'http://www.w3.org/2006/vcard/ns#', 2); DB.DBA.XML_SET_NS_DECL ('xkos', 'http://rdf-vocabulary.ddialliance.org/xkos#', 2); DB.DBA.XML_SET_NS_DECL ('schema', 'https://schema.org/', 2); DB.DBA.XML_SET_NS_DECL ('foaf', 'http://xmlns.com/foaf/0.1/#term_', 2); +DB.DBA.XML_SET_NS_DECL ('wd', 'http://www.wikidata.org/entity/', 2); DB.DBA.XML_SET_NS_DECL ('gnd', 'https://cd.genenetwork.org/api3/lmdb/v1/data/traits/', 2); DB.DBA.XML_SET_NS_DECL ('gn-files', 'http://files.genenetwork.org/current/', 2); +CHECKPOINT; " port password)) @@ -128,6 +131,7 @@ DB.DBA.XML_SET_NS_DECL ('gn-files', 'http://files.genenetwork.org/current/', 2); SET PWD=~s; DB.DBA.RDF_OBJ_FT_RULE_ADD (null, null, 'All'); DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ(); +CHECKPOINT; quit; " port diff --git a/manifest.scm b/manifest.scm index d736e51..2905b6f 100644 --- a/manifest.scm +++ b/manifest.scm @@ -15,6 +15,7 @@ guile-dsv guile-hashing guile-libyaml + guile-uuid guile-dbd-mysql)) ((gnu packages rdf) #:select (raptor2)) (guix build-system gnu) @@ -55,7 +56,7 @@ (license license:gpl3+))) (packages->manifest - (list gnu-make guile-3.0 guile-dbi guile-dbd-mysql guile-zlib + (list gnu-make guile-3.0 guile-dbi guile-dbd-mysql guile-zlib guile-uuid guile-json-4 guile-dsv ;; We abuse (ccwl graphviz) as a library to visualize the database ;; schema. Hence we need ccwl and guile-libyaml. diff --git a/transform/special-forms.scm b/transform/special-forms.scm index 8de4966..0c07a0a 100644 --- a/transform/special-forms.scm +++ b/transform/special-forms.scm @@ -6,6 +6,7 @@ #:use-module (transform sql) #:use-module (transform table) #:use-module (transform triples) + #:use-module (transform strings) #:export (translate-forms collect-forms collect-keys @@ -22,36 +23,47 @@ emit-short-turtle define-transformer)) +(define (emittable-object? o) + (cond + ((null? o) #f) + ((not o) #f) + ((and (string? o) (string-blank? o)) #f) + (else #t))) + (define (emit-short-turtle subject po-alist) (let loop ((pairs po-alist) (first? #t)) (match pairs (((p . o) rest ...) - ;; subject only on first line - (when first? - (format #t "~a " subject)) - (when (not first?) - (format #t "\t")) ; indent following lines - - (match o - ((? symbol?) - (format #t "~a ~a" p (symbol->string o))) - ((or (? (lambda (el) (and (string? el) - (string-match "^\\(.*\\)$" el)))) - (? (lambda (el) (and (string? el) - (string-match "^\\[.*\\]$" el))))) - (format #t "~a ~s" p o)) - (_ - (format #t "~a \"~a\"" p o))) - - (if (null? rest) - (format #t " .~%") ; last triple - (format #t " ;~%")) ; continuation - - (loop rest #f)) - + (if (not (emittable-object? o)) + (loop rest first?) ; skip malformed or empty objects + (begin + ;; subject only once + (when first? + (format #t "~a " subject)) + (when (not first?) + (format #t "\t")) + + ;; emit predicate–object + (match o + ((? symbol?) + (format #t "~a ~a" p (symbol->string o))) + ((? string?) + (format #t "~a \"~a\"" p o)) + (_ + (format #t "~a ~s" p o))) + + ;; separator depends on *remaining emittable pairs* + (if (any (match-lambda + ((p . o) (emittable-object? o))) + rest) + (format #t " ;~%") + (format #t " .~%")) + + (loop rest #f)))) (() #f)))) + (define (key->assoc-ref alist x) "Recursively translate (key k) forms in source X to (assoc-ref ALIST k) forms." @@ -407,57 +419,68 @@ must be remedied." #`(define* (name db #:key (metadata? #f) (data? #t) - (documentation? #f)) - (when metadata? - #,@(let ((table (symbol->string (syntax->datum #'primary-table))) - (subject-type (any (lambda (predicate) - (syntax-case predicate (rdf:type) - ((_ rdf:type type) #'type) - (_ #f))) - #'(predicate-clauses ...)))) - (map (lambda (predicate-clause) - (syntax-case predicate-clause () - ((_ predicate _) - ;; Dump metadata about the transform itself. - #`(begin - (scm->triples - (map-alist '() - (set rdf:type 'gn-id:transform) - (set gn-term:createsPredicate 'predicate) - (filter-set gn-term:forSubjectType #,subject-type) - (multiset gn-term:dependsOn - '#,(map (lambda (field) - (match (syntax->datum field) - ((table-name column-name _ ...) - (datum->syntax - x (column-id (symbol->string table-name) - (symbol->string column-name)))) - (((query alias)) - (datum->syntax - x (column-id query (symbol->string alias)))))) - (collect-fields predicate-clause)))) - #,(id table (syntax->datum #'predicate))) - ;; Automatically create domain triples - ;; for predicates. - (when #,subject-type - (triple 'predicate 'rdfs:domain #,subject-type)))) - (_ (error "Invalid predicate clause:" predicate-clause)))) - #'(predicate-clauses ...)))) - (when documentation? - (format #t "~%## '~a'~%~%" (syntax->datum #'name)) - #,(syntax-case #'schema-triples-clause (schema-triples) - ((schema-triples (triple-subject triple-predicate triple-object) ...) - #`(begin - (when (not (list 'triple-subject ...)) - (format #t "## Schema Triples:~%~%```text~%") - (for-each (lambda (s p o) - (format #t "~a -> ~a -> ~a~%" s p o)) - (list 'triple-subject ...) - (list 'triple-predicate ...) - (list 'triple-object ...)) - (format #t "```")))) - (_ (error "Invalid schema triples clause:" #'schema-triples-clause))) - (format #t "## Generated Triples: + (documentation? #f) + (limit #f) + (offset #f)) + (let* ((base-sql + (select-query #,(collect-fields #'(subject predicate-clauses ...)) + (primary-table other-tables ...) + tables-raw ...)) + (sql + (if (and limit offset) + (format #f "~a LIMIT ~a OFFSET ~a" + base-sql limit offset) + base-sql))) + (when metadata? + #,@(let ((table (symbol->string (syntax->datum #'primary-table))) + (subject-type (any (lambda (predicate) + (syntax-case predicate (rdf:type) + ((_ rdf:type type) #'type) + (_ #f))) + #'(predicate-clauses ...)))) + (map (lambda (predicate-clause) + (syntax-case predicate-clause () + ((_ predicate _) + ;; Dump metadata about the transform itself. + #`(begin + (scm->triples + (map-alist '() + (set rdf:type 'gn-id:transform) + (set gn-term:createsPredicate 'predicate) + (filter-set gn-term:forSubjectType #,subject-type) + (multiset gn-term:dependsOn + '#,(map (lambda (field) + (match (syntax->datum field) + ((table-name column-name _ ...) + (datum->syntax + x (column-id (symbol->string table-name) + (symbol->string column-name)))) + (((query alias)) + (datum->syntax + x (column-id query (symbol->string alias)))))) + (collect-fields predicate-clause)))) + #,(id table (syntax->datum #'predicate))) + ;; Automatically create domain triples + ;; for predicates. + (when #,subject-type + (triple 'predicate 'rdfs:domain #,subject-type)))) + (_ (error "Invalid predicate clause:" predicate-clause)))) + #'(predicate-clauses ...)))) + (when documentation? + (format #t "~%## '~a'~%~%" (syntax->datum #'name)) + #,(syntax-case #'schema-triples-clause (schema-triples) + ((schema-triples (triple-subject triple-predicate triple-object) ...) + #`(begin + (when (not (list 'triple-subject ...)) + (format #t "## Schema Triples:~%~%```text~%") + (for-each (lambda (s p o) + (format #t "~a -> ~a -> ~a~%" s p o)) + (list 'triple-subject ...) + (list 'triple-predicate ...) + (list 'triple-object ...)) + (format #t "```")))) + (_ (error "Invalid schema triples clause:" #'schema-triples-clause))) + (format #t "## Generated Triples: The following SQL query was executed: @@ -469,67 +492,64 @@ The above query results to triples that have the form: ```text " - (select-query #,(collect-fields #'(subject predicate-clauses ...)) - (primary-table other-tables ...) - tables-raw ...)) - (for-each (match-lambda - ((predicate . object) - (format #t "~a -> ~a -> ~a ~%" - (if (symbol? #,(field->datum #'subject)) - (symbol->string #,(field->datum #'subject)) - #,(field->datum #'subject)) - predicate - (if (symbol? object) - (symbol->string object) - object)))) - (map-alist - '() - #,@(field->datum #'(predicate-clauses ...)))) - (format #t "```~%Here's an example query:~%~%```sparql~%") - (documentation?) - (newline) - (let* ((result - (map-alist (sql-find - db - (format #f "~a LIMIT 1" - (select-query #,(collect-fields #'(subject predicate-clauses ...)) - (primary-table other-tables ...) - tables-raw ...))) - #,@(field->key #'(predicate-clauses ...)))) - (first-n (list-head result - (let ((n - (min 4 (truncate - (+ (exact-integer-sqrt (length result)) 1))))) - (if (< n 3) - (length result) - n))))) - (format #t "SELECT * WHERE { ~%") + (select-query #,(collect-fields #'(subject predicate-clauses ...)) + (primary-table other-tables ...) + tables-raw ...)) (for-each (match-lambda ((predicate . object) - (match object - ((or (? symbol? object) - (? (lambda (el) (string-match "^\\[ .* \\]$" el)) object)) - (format #t " ?s ~a ~a .~%" predicate object)) - ((and (? string? object) - (? (lambda (el) (not (string-null? el))) object)) - (format #t " ?s ~a \"~a\" .~%" predicate object)) - (_ "")))) - first-n) - (format #t " ?s ?p ?o .~%}~%```~%")) - (format #t "~%Expected Result:~%~%```rdf~%") - (sql-for-each (lambda (row) - (scm->triples - (map-alist row #,@(field->key #'(predicate-clauses ...))) - #,(field->assoc-ref #'row #'subject) - (lambda (s p o) - (triple s p o)))) - db - (format #f "~a LIMIT 1" - (select-query #,(collect-fields #'(subject predicate-clauses ...)) - (primary-table other-tables ...) - tables-raw ...))) - (format #t "```~%~%")) - (when data? + (format #t "~a -> ~a -> ~a ~%" + (if (symbol? #,(field->datum #'subject)) + (symbol->string #,(field->datum #'subject)) + #,(field->datum #'subject)) + predicate + (if (symbol? object) + (symbol->string object) + object)))) + (map-alist + '() + #,@(field->datum #'(predicate-clauses ...)))) + (format #t "```~%Here's an example query:~%~%```sparql~%") + (documentation?) + (newline) + (let* ((result + (map-alist (sql-find + db + (format #f "~a LIMIT 1" + (select-query #,(collect-fields #'(subject predicate-clauses ...)) + (primary-table other-tables ...) + tables-raw ...))) + #,@(field->key #'(predicate-clauses ...)))) + (first-n (list-head result + (let ((n + (min 4 (truncate + (+ (exact-integer-sqrt (length result)) 1))))) + (if (< n 3) + (length result) + n))))) + (format #t "SELECT * WHERE { ~%") + (for-each (match-lambda + ((predicate . object) + (match object + ((or (? symbol? object) + (? (lambda (el) (string-match "^\\[ .* \\]$" el)) object)) + (format #t " ?s ~a ~a .~%" predicate object)) + ((and (? string? object) + (? (lambda (el) (not (string-null? el))) object)) + (format #t " ?s ~a \"~a\" .~%" predicate object)) + (_ "")))) + first-n) + (format #t " ?s ?p ?o .~%}~%```~%")) + (format #t "~%Expected Result:~%~%```rdf~%") + (sql-for-each (lambda (row) + (scm->triples + (map-alist row #,@(field->key #'(predicate-clauses ...))) + #,(field->assoc-ref #'row #'subject) + (lambda (s p o) + (triple s p o)))) + db + (format #f "~a LIMIT 1" base-sql)) + (format #t "```~%~%")) + (when data? #,(syntax-case #'schema-triples-clause (schema-triples) ((schema-triples (triple-subject triple-predicate triple-object) ...) #`(for-each triple @@ -537,16 +557,14 @@ The above query results to triples that have the form: (list 'triple-predicate ...) (list 'triple-object ...))) (_ (error "Invalid schema triples clause:" #'schema-triples-clause))) - (sql-for-each (lambda (row) - (let* ((subject-val #,(field->assoc-ref #'row #'subject)) - (po-alist - (map-alist row #,@(field->key #'(predicate-clauses ...))))) - (emit-short-turtle subject-val po-alist))) - db - (select-query #,(collect-fields #'(subject predicate-clauses ...)) - (primary-table other-tables ...) - tables-raw ...))) - ))) + (sql-for-each + (lambda (row) + (let* ((subject-val #,(field->assoc-ref #'row #'subject)) + (po-alist + (map-alist row #,@(field->key #'(predicate-clauses ...))))) + (emit-short-turtle subject-val po-alist))) + db + sql)))))) (_ (error "Invalid define-transformer syntax:" (syntax->datum x)))))) (define (get-keyword-value args keyword default) @@ -565,8 +583,14 @@ The above query results to triples that have the form: (prefixes (assoc-ref alist 'prefixes)) (inputs (assoc-ref alist 'inputs)) (outputs (assoc-ref alist 'outputs)) - (rdf-path (get-keyword-value outputs #:rdf "")) - (doc-path (get-keyword-value outputs #:documentation ""))) + (total-rows (assoc-ref alist 'total-rows)) + (rows-per-chunk (assoc-ref alist 'rows-per-chunk)) + (chunking? (and total-rows rows-per-chunk)) + (chunks (if chunking? + (ceiling (/ total-rows rows-per-chunk)) + 1)) + (rdf-path (get-keyword-value outputs #:rdf #f)) + (doc-path (get-keyword-value outputs #:documentation #f))) (call-with-target-database connection (lambda (db) @@ -592,20 +616,30 @@ The above query results to triples that have the form: ;; Dumping the actual data (when rdf-path - (with-output-to-file - rdf-path - (lambda () - ;; Add the prefixes - (for-each - (match-lambda - ((k v) - (begin - (prefix k v)))) - prefixes) - (newline) - (for-each - (lambda (proc) - (proc db #:metadata? table-metadata?)) - inputs)) - #:encoding "UTF-8")))))))) + (do ((i 0 (+ i 1))) + ((>= i chunks)) + (let* ((offset (* i (or rows-per-chunk 0))) + (out-file + (if (= chunks 1) + rdf-path + (string-append (path-without-extension rdf-path) + "." (number->string (+ i 1)) ".ttl")))) + (with-output-to-file + out-file + (lambda () + ;; Add the prefixes + (for-each + (match-lambda + ((k v) + (begin + (prefix k v)))) + prefixes) + (newline) + (for-each + (lambda (proc) + (proc db #:metadata? table-metadata? + #:limit rows-per-chunk + #:offset offset)) + inputs)) + #:encoding "UTF-8")))))))))) diff --git a/transform/strings.scm b/transform/strings.scm index 7b62349..c0f02e5 100644 --- a/transform/strings.scm +++ b/transform/strings.scm @@ -1,7 +1,13 @@ (define-module (transform strings) #:use-module (srfi srfi-1) #:use-module (srfi srfi-19) + #:use-module (rnrs bytevectors) + #:use-module (uuid generate) + #:use-module (uuid utils) + #:use-module (uuid well-known) + #:use-module (ice-9 iconv) #:use-module (ice-9 match) + #:use-module (ice-9 rdelim) #:use-module (ice-9 string-fun) #:use-module (ice-9 textual-ports) #:export (string-blank? @@ -18,11 +24,27 @@ normalize-string-field fix-email-id blank-p - investigator-attributes->id)) + investigator-attributes->id + path-without-extension + gn-uuid)) + +(define (gn-uuid string) + (generate-string-uuid + 'uuidv5 + (string->bytevector string "UTF-8"))) (define (blank-p str) (if (string-blank? str) #f str)) +(define (path-without-extension path) + (let* ((dir (dirname path)) ; directory part + (base (basename path)) ; filename part + (dot-pos (string-rindex base #\.))) ; last dot position + (string-append dir "/" ; reconstruct path + (if dot-pos + (substring base 0 dot-pos) ; strip extension + base)))) + (define (lower-case-and-replace-spaces str) (string-map (lambda (c) diff --git a/transform/triples.scm b/transform/triples.scm index 13758e5..7f96eea 100644 --- a/transform/triples.scm +++ b/transform/triples.scm @@ -39,6 +39,7 @@ #:optional #:key (ontology "gn:") (separator "") + (url-char #\_) (proc (lambda (x) x))) "Convert STR to a turtle identifier after replacing illegal characters with an underscore and prefixing with gn:PREFIX." @@ -55,7 +56,7 @@ characters with an underscore and prefixing with gn:PREFIX." (char-numeric? c) (char=? c #\_)) c - #\_)) + url-char)) (proc str))))))) |
