From 51b3c0548c98e0bc05e11a89cbf6b75d31b9f8d5 Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Mon, 21 Aug 2023 14:54:21 +0300 Subject: Remove "dump-" prefix Signed-off-by: Munyoki Kilyungi --- examples/dataset-metadata.scm | 387 +++++++++++++++++++++++++++++++++++++ examples/dump-dataset-metadata.scm | 387 ------------------------------------- examples/dump-generif.scm | 150 -------------- examples/dump-genotype.scm | 124 ------------ examples/dump-phenotype.scm | 125 ------------ examples/dump-probeset-data.scm | 106 ---------- examples/dump-probeset.scm | 184 ------------------ examples/dump-publication.scm | 81 -------- examples/dump-species-metadata.scm | 226 ---------------------- examples/dump-tissue.scm | 50 ----- examples/generif.scm | 150 ++++++++++++++ examples/genotype.scm | 124 ++++++++++++ examples/phenotype.scm | 125 ++++++++++++ examples/probeset-data.scm | 98 ++++++++++ examples/probeset.scm | 184 ++++++++++++++++++ examples/publication.scm | 81 ++++++++ examples/species-metadata.scm | 226 ++++++++++++++++++++++ examples/tissue.scm | 50 +++++ 18 files changed, 1425 insertions(+), 1433 deletions(-) create mode 100755 examples/dataset-metadata.scm delete mode 100755 examples/dump-dataset-metadata.scm delete mode 100755 examples/dump-generif.scm delete mode 100755 examples/dump-genotype.scm delete mode 100755 examples/dump-phenotype.scm delete mode 100755 examples/dump-probeset-data.scm delete mode 100755 examples/dump-probeset.scm delete mode 100755 examples/dump-publication.scm delete mode 100755 examples/dump-species-metadata.scm delete mode 100755 examples/dump-tissue.scm create mode 100755 examples/generif.scm create mode 100755 examples/genotype.scm create mode 100755 examples/phenotype.scm create mode 100755 examples/probeset-data.scm create mode 100755 examples/probeset.scm create mode 100755 examples/publication.scm create mode 100755 examples/species-metadata.scm create mode 100755 examples/tissue.scm (limited to 'examples') diff --git 
a/examples/dataset-metadata.scm b/examples/dataset-metadata.scm new file mode 100755 index 0000000..5680a2b --- /dev/null +++ b/examples/dataset-metadata.scm @@ -0,0 +1,387 @@ +#! /usr/bin/env guile +!# + +(use-modules (srfi srfi-1) + (srfi srfi-26) + (ice-9 match) + (ice-9 regex) + (dump strings) + (dump sql) + (dump triples) + (dump special-forms)) + + + +(define %connection-settings + (call-with-input-file (list-ref (command-line) 1) + read)) + + + +;; One email ID in the Investigators table has spaces in it. This +;; function fixes that. +(define (fix-email-id email) + (string-delete #\space email)) + +(define (investigator-attributes->id first-name last-name email) + ;; There is just one record corresponding to "Evan Williams" which + ;; does not have an email ID. To accommodate that record, we + ;; construct the investigator ID from not just the email ID, but + ;; also the first and the last names. It would be preferable to just + ;; find Evan Williams' email ID and insert it into the database. + (string->identifier "investigator" + (string-join + (list first-name last-name (fix-email-id email)) + "_"))) + +(define-transformer investigators + ;; There are a few duplicate entries. We group by email to + ;; deduplicate. 
+ (tables (Investigators) + "GROUP BY Email") + (triples (investigator-attributes->id (field Investigators FirstName) + (field Investigators LastName) + (field Investigators Email)) + (set rdf:type 'foaf:Person) + (set foaf:name (string-append (field Investigators FirstName) " " + (field Investigators LastName))) + (set foaf:givenName + (field Investigators FirstName)) + (set foaf:familyName + (field Investigators LastName)) + (set foaf:homepage (field Investigators Url)) + (set v:adr (field Investigators Address)) + (set v:locality (field Investigators City)) + (set v:region (field Investigators State)) + (set v:postal-code (field Investigators ZipCode)) + (set v:country-name (field Investigators Country)))) + +(define-transformer gene-chip + (tables (GeneChip + (left-join Species "USING (SpeciesId)"))) + (schema-triples + (gnc:geneChip a skos:Concept) + (gnc:geneChip + skos:description + "This is a set of controlled terms that are used to describe a given gene chip/platform") + (gnt:hasGeoSeriesId rdfs:domain gnc:platform) + (gnt:belongsToSpecies a owl:ObjectProperty) + (gnt:belongsToSpecies skos:definition "This resource belongs to this given species") + (gnt:belongsToSpecies rdfs:domain gnc:geneChip) + (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip) + (gnt:hasGOTreeValue a owl:ObjectProperty) + (gnt:hasGOTreeValue skos:definition "This resource the following GO tree value") + (gnt:hasGOTreeValue rdfs:domain gnc:geneChip)) + (triples (string->identifier "platform" (field GeneChip Name)) + (set rdf:type 'gnc:geneChip) + (set rdfs:label (field GeneChip GeneChipName)) + (set skos:prefLabel (field GeneChip Name)) + (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)" + Title))) + (set gnt:hasGOTreeValue (field GeneChip Go_tree_value)) + (set gnt:belongsToSpecies + (string->identifier "" (remap-species-identifiers (field Species Fullname)) + #:separator "" + #:proc string-capitalize-first)) + (set gnt:hasGeoSeriesId + (ontology 
'geoSeries: + (string-trim-both (field GeneChip GeoPlatform)))))) + +(define-transformer info-files + (tables (InfoFiles + (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name") + (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name") + (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name") + (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId") + (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId") + (left-join Datasets "USING (DatasetId)") + (left-join DatasetStatus "USING (DatasetStatusId)") + (left-join Tissue "USING (TissueId)") + (left-join Investigators "USING (InvestigatorId)") + (left-join AvgMethod "USING (AvgMethodId)") + (left-join Organizations "USING (OrganizationId)") + (left-join GeneChip "USING (GeneChipId)")) + "WHERE GN_AccesionId IS NOT NULL") + (schema-triples + (gnc:dataset rdf:type gdmt:Dataset) + (gnc:genotypeDataset rdfs:subClassOf gnc:dataset) + (gnc:phenotypeDataset rdfs:subClassOf gnc:dataset) + (gnc:probesetDataset rdfs:subClassOf gnc:dataset) + (gnt:belongsToSet rdfs:domain gnc:dataset) + (gnt:belongsToSet a owl:ObjectProperty) + (gnt:belongsToSet skos:definition "The InbredSet this resource belongs to") + (gnt:hasTissue rdfs:domain gnc:dataset) + (gnt:hasTissue a owl:ObjectProperty) + (gnt:hasTissue skos:definition "Tissues this resource has") + (gnt:hasTissueInfo rdfs:domain gnc:dataset) + (gnt:hasTissueInfo a owl:ObjectProperty) + (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource") + (gnt:usesNormalization rdfs:domain gnc:dataset) + (gnt:usesNormalization a owl:ObjectProperty) + (gnt:usesNormalization skos:definition "Normalization techniques this resource has") + (gnt:usesPlatform rdfs:domain gnc:dataset) + (gnt:usesPlatform a owl:ObjectProperty) + (gnt:usesPlatform skos:definition "The Platform this resource uses") + (gnt:hasGeoSeriesId rdfs:domain gnc:dataset) + (gnt:hasGeoSeriesId a owl:ObjectProperty) + 
(gnt:hasGeoSeriesId skos:definition "id of record in NCBI database") + (gnt:hasExperimentDesignInfo rdfs:domain gnc:dataset) + (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design") + (gnt:hasExperimentDesignInfo a owl:ObjectProperty) + (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed") + (gnt:hasNotes rdfs:domain gnc:dataset) + (gnt:hasNotes a owl:ObjectProperty) + (gnt:hasNotes rdfs:label "Notes") + (gnt:hasNotes skos:definition "Extra Notes about this dataset") + (gnt:hasDataProcessingInfo rdfs:domain gnc:dataset) + (gnt:hasDataProcessingInfo rdfs:label "About Data Processing") + (gnt:hasDataProcessingInfo a owl:ObjectProperty) + (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed") + (gnt:hasPlatformInfo rdfs:domain gnc:dataset) + (gnt:hasPlatformInfo a owl:ObjectProperty) + (gnt:hasPlatformInfo rdfs:label "About Platfoorm") + (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset") + (gnt:hasCaseInfo rdfs:domain gnc:dataset) + (gnt:hasCaseInfo rdfs:label "About Case") + (gnt:hasCaseInfo a owl:ObjectProperty) + (gnt:hasCaseInfo skos:definition "Information about the cases used in this platform") + (gnt:hasAcknowledgement rdfs:domain gnc:dataset) + (gnt:hasAcknowledgement rdfs:label "Acknowledgement") + (gnt:hasAcknowledgement a owl:ObjectProperty) + (gnt:hasAcknowledgement skos:definition "People to acknowledge")) + (triples (string->identifier + "" (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post) + #:separator "" + #:proc string-capitalize-first) + (set rdf:type (string->symbol + (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotypeDataset', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:probesetDataset', 'gnc:dataset')))" + rdfType)))) + (set rdfs:label (regexp-substitute/global + #f "^[Nn]one$" + (field InfoFiles 
InfoPageName) + "")) + (set skos:prefLabel + (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))" + DatasetFullName))) + (set skos:prefLabel (field Datasets DatasetName DatasetGroup)) + (set gdmt:hasTitleInfo + (regexp-substitute/global + #f "^[Nn]one$" + (field InfoFiles InfoFileTitle) + "")) + ;; This is the published title + (set dct:title + (regexp-substitute/global + #f "^[Nn]one$" + (field Datasets PublicationTitle) + "")) + (set dct:created + (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))" + createTimeGenoFreeze))) + (set gdmt:hasCreatorInfo + (investigator-attributes->id (field Investigators FirstName) + (field Investigators LastName) + (field Investigators Email))) + (set gdmt:hasCreatorAffiliation + (field Organizations OrganizationName)) + (set gdmt:hasDatasetIdentifierSubType (format #f "GN~a" (field InfoFiles GN_AccesionId))) + (set gdmt:hasRightsInfo (string-downcase + (field DatasetStatus DatasetStatusName))) + (set gnt:belongsToSet + (string->identifier + "set" (field InbredSet Name) + #:separator "" + #:proc string-capitalize-first)) + (set gnt:hasTissue (string->identifier "tissue" + (field Tissue Short_Name))) + (set gnt:usesNormalization + (string->identifier "avgmethod" + ;; If AvgMethodName is NULL, assume N/A. + (if (string-blank? 
(field AvgMethod Name AvgMethodName)) + "N/A" (field AvgMethod Name AvgMethodName)))) + (set gnt:usesPlatform + (string->identifier "platform" + (field GeneChip Name GeneChip))) + (set gdmt:isDescribedBy + (sanitize-rdf-string (field Datasets Summary))) + (set gnt:hasGeoSeriesId + (let ((s + (string-match "GSE[0-9]*" + (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries))))) + (if s (ontology + 'geoSeries: (match:substring s)) + ""))) + (set gnt:hasTissueInfo + (sanitize-rdf-string (field Datasets AboutTissue))) + (set gnt:hasContentInfo (sanitize-rdf-string (field InfoFiles Specifics))) + (set gnt:hasCaseInfo + (sanitize-rdf-string + (field Datasets AboutCases))) + (set gnt:hasPlatformInfo + (sanitize-rdf-string + (field Datasets AboutPlatform))) + (set gnt:hasDataProcessingInfo + (sanitize-rdf-string + (field Datasets AboutDataProcessing))) + (set gnt:hasNotes + (sanitize-rdf-string + (field Datasets Notes))) + (set gnt:hasExperimentDesignInfo + (sanitize-rdf-string + (field Datasets ExperimentDesign))) + (set gdmt:hasContributorInfo + (sanitize-rdf-string + (field Datasets Contributors))) + (set gdmt:IsCitedBy + (sanitize-rdf-string + (regexp-substitute/global + #f "^[Nn]one$" + (field Datasets Citation) + ""))) + (set gnt:hasAcknowledgement + (sanitize-rdf-string + (string-trim-both + (regexp-substitute/global + #f "^[Nn]one$" + (field InfoFiles Data_Source_Acknowledge) + "")))) + (set gnt:hasAcknowledgement (sanitize-rdf-string + (field Datasets Acknowledgment))))) + +;; These are phenotype datasets that don't have Infofile metadata +(define-transformer publishfreeze + (tables (PublishFreeze + (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name") + (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId")) + "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL") + (triples + (string->identifier + "" + (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field PublishFreeze Name) + 
'pre "_" 'post) + #:separator "" + #:proc string-capitalize-first) + (set rdf:type 'gnc:phenotypeDataset) + (set rdfs:label (field PublishFreeze Name)) + (set skos:prefLabel (field PublishFreeze FullName)) + (set skos:altLabel (field PublishFreeze ShortName)) + (set dct:created (annotate-field + (field PublishFreeze CreateTime) + '^^xsd:date)) + (set gnt:belongsToSet + (string->identifier + "set" (field InbredSet Name) + #:separator "" + #:proc string-capitalize-first)))) + +(define-transformer genofreeze + (tables (GenoFreeze + (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name") + (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId")) + "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL") + (triples + (string->identifier + "" + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field GenoFreeze Name) + 'pre "_" 'post) + 'pre "_" 'post) + #:separator "" + #:proc string-capitalize-first) + (set rdf:type 'gnc:genotypeDataset) + (set rdfs:label (field GenoFreeze Name)) + (set skos:prefLabel (field GenoFreeze FullName)) + (set skos:altLabel (field GenoFreeze ShortName)) + (set dct:created (annotate-field + (field GenoFreeze CreateTime) + '^^xsd:date)) + (set gnt:belongsToSet + (string->identifier + "set" (field InbredSet Name) + #:separator "" + #:proc string-capitalize-first)))) + +;; Molecular Traits are also referred to as ProbeSets +(define-transformer probesetfreeze + (tables (ProbeSetFreeze + (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name") + (left-join ProbeFreeze "USING (ProbeFreezeId)") + (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID") + (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id") + (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId")) + "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id") + (schema-triples + 
(gnt:usesNormalization rdfs:domain gnc:probeset) + (gnt:usesDataScale rdfs:domain gnc:probeset) + (gnt:usesDataScale a owl:ObjectProperty) + (gnt:usesDataScale skos:definition "Thi data scale this resource uses")) + (triples + (string->identifier + "" + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field ProbeSetFreeze Name) + 'pre "_" 'post) + #:separator "" + #:proc string-capitalize-first) + (set rdf:type 'gnc:probesetDataset) + (set gnt:usesNormalization + (string->identifier "avgmethod" + ;; If AvgMethodName is NULL, assume N/A. + (if (string-blank? (field AvgMethod Name AvgMethodName)) + "N/A" (field AvgMethod Name AvgMethodName)))) + (set dct:title (field ProbeSetFreeze FullName)) + (set rdfs:label (field ProbeSetFreeze ShortName)) + (set skos:prefLabel (field ProbeSetFreeze Name)) + (set skos:altLabel (field ProbeSetFreeze Name2)) + (set dct:created (annotate-field + (field ProbeSetFreeze CreateTime) + '^^xsd:datetime)) + (set gnt:usesDataScale (field ProbeSetFreeze DataScale)) + (set gnt:hasTissue + (string->identifier + "tissue" + (field Tissue Short_Name))) + (set gnt:belongsToSet + (string->identifier + "set" (field InbredSet Name) + #:separator "" + #:proc string-capitalize-first)))) + + + +(with-documentation + (name "Info files / Investigators Metadata") + (connection %connection-settings) + (table-metadata? #f) + (prefixes + '(("v:" "") + ("foaf:" "") + ("gdmt:" "") + ("skos:" "") + ("geoSeries:" "") + ("gnt:" "") + ("gn:" "") + ("gnc:" "") + ("rdf:" "") + ("owl:" "") + ("rdfs:" "") + ("taxon:" "") + ("dct:" ""))) + (inputs + (list info-files + publishfreeze + genofreeze + probesetfreeze + investigators + gene-chip)) + (outputs + '(#:documentation "./docs/info-pages.md" + #:rdf "/export/data/genenetwork-virtuoso/info-pages.ttl"))) + diff --git a/examples/dump-dataset-metadata.scm b/examples/dump-dataset-metadata.scm deleted file mode 100755 index 6173201..0000000 --- a/examples/dump-dataset-metadata.scm +++ /dev/null @@ -1,387 +0,0 @@ -#! 
/usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 match) - (ice-9 regex) - (dump strings) - (dump sql) - (dump triples) - (dump special-forms)) - - - -(define %connection-settings - (call-with-input-file (list-ref (command-line) 1) - read)) - - - -;; One email ID in the Investigators table has spaces in it. This -;; function fixes that. -(define (fix-email-id email) - (string-delete #\space email)) - -(define (investigator-attributes->id first-name last-name email) - ;; There is just one record corresponding to "Evan Williams" which - ;; does not have an email ID. To accommodate that record, we - ;; construct the investigator ID from not just the email ID, but - ;; also the first and the last names. It would be preferable to just - ;; find Evan Williams' email ID and insert it into the database. - (string->identifier "investigator" - (string-join - (list first-name last-name (fix-email-id email)) - "_"))) - -(define-transformer dump-investigators - ;; There are a few duplicate entries. We group by email to - ;; deduplicate. 
- (tables (Investigators) - "GROUP BY Email") - (triples (investigator-attributes->id (field Investigators FirstName) - (field Investigators LastName) - (field Investigators Email)) - (set rdf:type 'foaf:Person) - (set foaf:name (string-append (field Investigators FirstName) " " - (field Investigators LastName))) - (set foaf:givenName - (field Investigators FirstName)) - (set foaf:familyName - (field Investigators LastName)) - (set foaf:homepage (field Investigators Url)) - (set v:adr (field Investigators Address)) - (set v:locality (field Investigators City)) - (set v:region (field Investigators State)) - (set v:postal-code (field Investigators ZipCode)) - (set v:country-name (field Investigators Country)))) - -(define-transformer dump-gene-chip - (tables (GeneChip - (left-join Species "USING (SpeciesId)"))) - (schema-triples - (gnc:geneChip a skos:Concept) - (gnc:geneChip - skos:description - "This is a set of controlled terms that are used to describe a given gene chip/platform") - (gnt:hasGeoSeriesId rdfs:domain gnc:platform) - (gnt:belongsToSpecies a owl:ObjectProperty) - (gnt:belongsToSpecies skos:definition "This resource belongs to this given species") - (gnt:belongsToSpecies rdfs:domain gnc:geneChip) - (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip) - (gnt:hasGOTreeValue a owl:ObjectProperty) - (gnt:hasGOTreeValue skos:definition "This resource the following GO tree value") - (gnt:hasGOTreeValue rdfs:domain gnc:geneChip)) - (triples (string->identifier "platform" (field GeneChip Name)) - (set rdf:type 'gnc:geneChip) - (set rdfs:label (field GeneChip GeneChipName)) - (set skos:prefLabel (field GeneChip Name)) - (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)" - Title))) - (set gnt:hasGOTreeValue (field GeneChip Go_tree_value)) - (set gnt:belongsToSpecies - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) - (set gnt:hasGeoSeriesId - (ontology 
'geoSeries: - (string-trim-both (field GeneChip GeoPlatform)))))) - -(define-transformer dump-info-files - (tables (InfoFiles - (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name") - (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name") - (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name") - (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId") - (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId") - (left-join Datasets "USING (DatasetId)") - (left-join DatasetStatus "USING (DatasetStatusId)") - (left-join Tissue "USING (TissueId)") - (left-join Investigators "USING (InvestigatorId)") - (left-join AvgMethod "USING (AvgMethodId)") - (left-join Organizations "USING (OrganizationId)") - (left-join GeneChip "USING (GeneChipId)")) - "WHERE GN_AccesionId IS NOT NULL") - (schema-triples - (gnc:dataset rdf:type gdmt:Dataset) - (gnc:genotypeDataset rdfs:subClassOf gnc:dataset) - (gnc:phenotypeDataset rdfs:subClassOf gnc:dataset) - (gnc:probesetDataset rdfs:subClassOf gnc:dataset) - (gnt:belongsToSet rdfs:domain gnc:dataset) - (gnt:belongsToSet a owl:ObjectProperty) - (gnt:belongsToSet skos:definition "The InbredSet this resource belongs to") - (gnt:hasTissue rdfs:domain gnc:dataset) - (gnt:hasTissue a owl:ObjectProperty) - (gnt:hasTissue skos:definition "Tissues this resource has") - (gnt:hasTissueInfo rdfs:domain gnc:dataset) - (gnt:hasTissueInfo a owl:ObjectProperty) - (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource") - (gnt:usesNormalization rdfs:domain gnc:dataset) - (gnt:usesNormalization a owl:ObjectProperty) - (gnt:usesNormalization skos:definition "Normalization techniques this resource has") - (gnt:usesPlatform rdfs:domain gnc:dataset) - (gnt:usesPlatform a owl:ObjectProperty) - (gnt:usesPlatform skos:definition "The Platform this resource uses") - (gnt:hasGeoSeriesId rdfs:domain gnc:dataset) - (gnt:hasGeoSeriesId a owl:ObjectProperty) - 
(gnt:hasGeoSeriesId skos:definition "id of record in NCBI database") - (gnt:hasExperimentDesignInfo rdfs:domain gnc:dataset) - (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design") - (gnt:hasExperimentDesignInfo a owl:ObjectProperty) - (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed") - (gnt:hasNotes rdfs:domain gnc:dataset) - (gnt:hasNotes a owl:ObjectProperty) - (gnt:hasNotes rdfs:label "Notes") - (gnt:hasNotes skos:definition "Extra Notes about this dataset") - (gnt:hasDataProcessingInfo rdfs:domain gnc:dataset) - (gnt:hasDataProcessingInfo rdfs:label "About Data Processing") - (gnt:hasDataProcessingInfo a owl:ObjectProperty) - (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed") - (gnt:hasPlatformInfo rdfs:domain gnc:dataset) - (gnt:hasPlatformInfo a owl:ObjectProperty) - (gnt:hasPlatformInfo rdfs:label "About Platfoorm") - (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset") - (gnt:hasCaseInfo rdfs:domain gnc:dataset) - (gnt:hasCaseInfo rdfs:label "About Case") - (gnt:hasCaseInfo a owl:ObjectProperty) - (gnt:hasCaseInfo skos:definition "Information about the cases used in this platform") - (gnt:hasAcknowledgement rdfs:domain gnc:dataset) - (gnt:hasAcknowledgement rdfs:label "Acknowledgement") - (gnt:hasAcknowledgement a owl:ObjectProperty) - (gnt:hasAcknowledgement skos:definition "People to acknowledge")) - (triples (string->identifier - "" (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type (string->symbol - (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotypeDataset', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:probesetDataset', 'gnc:dataset')))" - rdfType)))) - (set rdfs:label (regexp-substitute/global - #f "^[Nn]one$" - (field InfoFiles 
InfoPageName) - "")) - (set skos:prefLabel - (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))" - DatasetFullName))) - (set skos:prefLabel (field Datasets DatasetName DatasetGroup)) - (set gdmt:hasTitleInfo - (regexp-substitute/global - #f "^[Nn]one$" - (field InfoFiles InfoFileTitle) - "")) - ;; This is the published title - (set dct:title - (regexp-substitute/global - #f "^[Nn]one$" - (field Datasets PublicationTitle) - "")) - (set dct:created - (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))" - createTimeGenoFreeze))) - (set gdmt:hasCreatorInfo - (investigator-attributes->id (field Investigators FirstName) - (field Investigators LastName) - (field Investigators Email))) - (set gdmt:hasCreatorAffiliation - (field Organizations OrganizationName)) - (set gdmt:hasDatasetIdentifierSubType (format #f "GN~a" (field InfoFiles GN_AccesionId))) - (set gdmt:hasRightsInfo (string-downcase - (field DatasetStatus DatasetStatusName))) - (set gnt:belongsToSet - (string->identifier - "set" (field InbredSet Name) - #:separator "" - #:proc string-capitalize-first)) - (set gnt:hasTissue (string->identifier "tissue" - (field Tissue Short_Name))) - (set gnt:usesNormalization - (string->identifier "avgmethod" - ;; If AvgMethodName is NULL, assume N/A. - (if (string-blank? 
(field AvgMethod Name AvgMethodName)) - "N/A" (field AvgMethod Name AvgMethodName)))) - (set gnt:usesPlatform - (string->identifier "platform" - (field GeneChip Name GeneChip))) - (set gdmt:isDescribedBy - (sanitize-rdf-string (field Datasets Summary))) - (set gnt:hasGeoSeriesId - (let ((s - (string-match "GSE[0-9]*" - (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries))))) - (if s (ontology - 'geoSeries: (match:substring s)) - ""))) - (set gnt:hasTissueInfo - (sanitize-rdf-string (field Datasets AboutTissue))) - (set gnt:hasContentInfo (sanitize-rdf-string (field InfoFiles Specifics))) - (set gnt:hasCaseInfo - (sanitize-rdf-string - (field Datasets AboutCases))) - (set gnt:hasPlatformInfo - (sanitize-rdf-string - (field Datasets AboutPlatform))) - (set gnt:hasDataProcessingInfo - (sanitize-rdf-string - (field Datasets AboutDataProcessing))) - (set gnt:hasNotes - (sanitize-rdf-string - (field Datasets Notes))) - (set gnt:hasExperimentDesignInfo - (sanitize-rdf-string - (field Datasets ExperimentDesign))) - (set gdmt:hasContributorInfo - (sanitize-rdf-string - (field Datasets Contributors))) - (set gdmt:IsCitedBy - (sanitize-rdf-string - (regexp-substitute/global - #f "^[Nn]one$" - (field Datasets Citation) - ""))) - (set gnt:hasAcknowledgement - (sanitize-rdf-string - (string-trim-both - (regexp-substitute/global - #f "^[Nn]one$" - (field InfoFiles Data_Source_Acknowledge) - "")))) - (set gnt:hasAcknowledgement (sanitize-rdf-string - (field Datasets Acknowledgment))))) - -;; These are phenotype datasets that don't have Infofile metadata -(define-transformer dump-publishfreeze - (tables (PublishFreeze - (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name") - (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId")) - "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL") - (triples - (string->identifier - "" - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field PublishFreeze 
Name) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type 'gnc:phenotypeDataset) - (set rdfs:label (field PublishFreeze Name)) - (set skos:prefLabel (field PublishFreeze FullName)) - (set skos:altLabel (field PublishFreeze ShortName)) - (set dct:created (annotate-field - (field PublishFreeze CreateTime) - '^^xsd:date)) - (set gnt:belongsToSet - (string->identifier - "set" (field InbredSet Name) - #:separator "" - #:proc string-capitalize-first)))) - -(define-transformer dump-genofreeze - (tables (GenoFreeze - (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name") - (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId")) - "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL") - (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field GenoFreeze Name) - 'pre "_" 'post) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type 'gnc:genotypeDataset) - (set rdfs:label (field GenoFreeze Name)) - (set skos:prefLabel (field GenoFreeze FullName)) - (set skos:altLabel (field GenoFreeze ShortName)) - (set dct:created (annotate-field - (field GenoFreeze CreateTime) - '^^xsd:date)) - (set gnt:belongsToSet - (string->identifier - "set" (field InbredSet Name) - #:separator "" - #:proc string-capitalize-first)))) - -;; Molecular Traits are also referred to as ProbeSets -(define-transformer dump-probesetfreeze - (tables (ProbeSetFreeze - (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name") - (left-join ProbeFreeze "USING (ProbeFreezeId)") - (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID") - (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id") - (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId")) - "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id") - 
(schema-triples - (gnt:usesNormalization rdfs:domain gnc:probeset) - (gnt:usesDataScale rdfs:domain gnc:probeset) - (gnt:usesDataScale a owl:ObjectProperty) - (gnt:usesDataScale skos:definition "Thi data scale this resource uses")) - (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ProbeSetFreeze Name) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type 'gnc:probesetDataset) - (set gnt:usesNormalization - (string->identifier "avgmethod" - ;; If AvgMethodName is NULL, assume N/A. - (if (string-blank? (field AvgMethod Name AvgMethodName)) - "N/A" (field AvgMethod Name AvgMethodName)))) - (set dct:title (field ProbeSetFreeze FullName)) - (set rdfs:label (field ProbeSetFreeze ShortName)) - (set skos:prefLabel (field ProbeSetFreeze Name)) - (set skos:altLabel (field ProbeSetFreeze Name2)) - (set dct:created (annotate-field - (field ProbeSetFreeze CreateTime) - '^^xsd:datetime)) - (set gnt:usesDataScale (field ProbeSetFreeze DataScale)) - (set gnt:hasTissue - (string->identifier - "tissue" - (field Tissue Short_Name))) - (set gnt:belongsToSet - (string->identifier - "set" (field InbredSet Name) - #:separator "" - #:proc string-capitalize-first)))) - - - -(dump-with-documentation - (name "Info files / Investigators Metadata") - (connection %connection-settings) - (table-metadata? 
#f) - (prefixes - '(("v:" "") - ("foaf:" "") - ("gdmt:" "") - ("skos:" "") - ("geoSeries:" "") - ("gnt:" "") - ("gn:" "") - ("gnc:" "") - ("rdf:" "") - ("owl:" "") - ("rdfs:" "") - ("taxon:" "") - ("dct:" ""))) - (inputs - (list dump-info-files - dump-publishfreeze - dump-genofreeze - dump-probesetfreeze - dump-investigators - dump-gene-chip)) - (outputs - '(#:documentation "./docs/dump-info-pages.md" - #:rdf "/export/data/genenetwork-virtuoso/dump-info-pages.ttl"))) - diff --git a/examples/dump-generif.scm b/examples/dump-generif.scm deleted file mode 100755 index f754274..0000000 --- a/examples/dump-generif.scm +++ /dev/null @@ -1,150 +0,0 @@ -#! /usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 match) - (ice-9 regex) - (dump strings) - (dump sql) - (dump triples) - (dump special-forms)) - - - -(define %connection-settings - (call-with-input-file (list-ref (command-line) 1) - read)) - - - -(define-transformer dump-genewiki-symbols - (tables (GeneRIF_BASIC - (left-join Species "USING (SpeciesId)")) - "GROUP BY GeneId ORDER BY BINARY symbol") - (schema-triples - (gnt:symbol rdfs:domain gn-term:geneWikiEntry) - (gnt:wikiEntryOfSpecies rdfs:range gn:species) - (gnt:taxid rdfs:domain gn-term:geneWikiEntry)) - (triples (ontology 'generif: (field GeneRIF_BASIC GeneId)) - (multiset gnt:symbol (string-split (field ("GROUP_CONCAT(DISTINCT symbol)" symbol)) - #\,)) - (multiset gnt:wikiEntryOfSpecies - (string-split - (field ("GROUP_CONCAT(DISTINCT Species.SpeciesName)" species)) - #\,)) - (multiset gnt:taxId (map (cut ontology 'ncbiTaxon: <>) - (string-split (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId)) - #\,))))) - -(define-transformer dump-gn-genewiki-entries - (tables (GeneRIF - (left-join GeneRIF_BASIC "USING (symbol)") - (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId") - (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id") - (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")) - "WHERE 
GeneRIF.display > 0 AND GeneRIF.VersionId = 0 GROUP BY GeneRIF.symbol") - (schema-triples - (gnt:geneWikiEntry a rdfs:Class) - (gnt:geneWikiEntry a owl:Class) - (gnt:geneWikiEntry rdfs:comment "Represents GeneRIF Entries") - (gnt:geneCategory rdfs:domain gn:geneWikiEntry) - (gnt:geneWikiEntryOfGn rdfs:domain gn:geneWikiEntry) - (gnt:geneWikiEntry rdfs:domain gn:geneWikiEntry)) - (triples - (let ([geneid (field GeneRIF_BASIC GeneId)]) - (if (eq? geneid 0) - (ontology 'gnt:anonSymbol_ - (field GeneRIF symbol)) - (ontology 'generif: - geneid))) - (set rdf:type - (if (string-null? (field ("IFNULL(GeneRIF_BASIC.GeneId, '')" geneWikiEntryP))) - "" - 'gn:geneWikiEntry)) - (set gnt:wikiEntryOfSpecies - (string->binomial-name (field Species FullName))) - ;; This only dumps symbols not present in the GeneRIF_BASIC table - (set gnt:symbol (let ([geneid (field GeneRIF_BASIC GeneId)]) - (if (eq? geneid 0) - (field GeneRIF symbol) - ""))) - (multiset gnt:geneWikiEntryOfGn - (let* ([entries - (sanitize-rdf-string - (field - ("GROUP_CONCAT(DISTINCT CONCAT_WS('::::', IFNULL(GeneCategory.Name, ''), IFNULL(GeneRIF.PubMed_ID, ''), GeneRIF.email, CAST(CONVERT(BINARY CONVERT(GeneRIF.comment USING latin1) USING utf8) AS VARCHAR(15000)), GeneRIF.createtime, IFNULL(weburl, '')) SEPARATOR';;;;;')" - wikientry)))] - [comments (string-split-substring entries ";;;;;")]) - (map - (match-lambda - ((genecategory pmid email text createtime weburl) - (blank-node - (set gnt:geneCategory genecategory) - (multiset dct:source - (map (lambda (el) (if (string-null? 
el) - "" - (ontology 'pubmed: el))) - (string-split pmid #\space))) - (set dct:creator (regexp-substitute/global #f "@.*$" - email - 'pre - "" - 'post)) - (set gnt:geneWikiEntry - (annotate-field text '^^xsd:string)) - (set dct:created (annotate-field - createtime - '^^xsd:datetime)) - (set foaf:homepage weburl)))) - (map - (cut string-split-substring <> "::::") - comments)))))) - -(define-transformer dump-ncbi-genewiki-entries - (tables (GeneRIF_BASIC) - "GROUP BY GeneId, comment, createtime") - (schema-triples - (gnt:geneWikiEntryofNCBI rdfs:domain gn:geneWikiEntry)) - (triples (ontology 'generif: - (field GeneRIF_BASIC GeneId)) - (set gnt:geneWikiEntryOfNCBI - (blank-node - (set gnt:geneWikiEntry - (annotate-field (field GeneRIF_BASIC comment) - '^^xsd:string)) - (multiset dct:source (map (lambda (el) (if (string-null? el) - "" - (ontology 'pubmed: el))) - (string-split (field ("GROUP_CONCAT(PubMed_ID)" pmids)) - #\,))) - (set dct:created (annotate-field (time-unix->string - (field GeneRIF_BASIC createtime) "~5") - '^^xsd:datetime)))))) - - - -(dump-with-documentation - (name "GeneRIF Metadata") - (connection %connection-settings) - (table-metadata? #f) - (prefixes - '(("rdf:" "") - ("rdfs:" "") - ("gn:" "") - ("gnc:" "") - ("gnt:" "") - ("dct:" "") - ("pubmed:" "") - ("ncbiTaxon:" "") - ("generif:" "") - ("xsd:" "") - ("owl:" ""))) - (inputs - (list ;; dump-genewiki-symbols - dump-gn-genewiki-entries - ;; dump-ncbi-genewiki-entries - )) - (outputs - '(#:documentation "./docs/dump-generif.md" - #:rdf "./verified-data/dump-generif.ttl"))) diff --git a/examples/dump-genotype.scm b/examples/dump-genotype.scm deleted file mode 100755 index a055039..0000000 --- a/examples/dump-genotype.scm +++ /dev/null @@ -1,124 +0,0 @@ -#! 
/usr/bin/env guile -!# - -(use-modules (rnrs programs) - (rnrs io ports) - (srfi srfi-1) - (srfi srfi-26) - (ice-9 match) - (ice-9 regex) - (dump strings) - (dump sql) - (dump triples) - (dump special-forms)) - - - -(define %connection-settings - (call-with-input-file (list-ref (command-line) 1) - read)) - - - -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" - (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) - -(define-transformer dump-genotypes - (tables (Geno - (left-join Species "USING (SpeciesId)"))) - (schema-triples - (gnc:genotype a skos:Concept) - (gnc:genotype - skos:description - "This is a set of controlled terms that are used to describe a given genotype") - (gnt:chr a owl:ObjectProperty) - (gnt:chr skos:description "This resource is located on a given chromosome") - (gnt:chr rdfs:domain gnc:genotype) - (gnt:mb a owl:ObjectProperty) - (gnt:mb skos:definition "The size of this resource in Mb") - (gnt:mb rdfs:domain gnc:genotype) - (gnt:mbMm8 a owl:ObjectProperty) - (gnt:mbMm8 skos:definition "TODO") - (gnt:mbMm8 rdfs:domain gnc:genotype) - (gnt:mb2016 a owl:ObjectProperty) - (gnt:mb2016 skos:definition "TODO") - (gnt:mb2016 rdfs:domain gnc:genotype) - (gnt:hasSequence a owl:ObjectProperty) - (gnt:hasSequence skos:definition "This resource has a given sequence") - (gnt:hasSequence rdfs:domain gnc:genotype) - (gnt:hasSource a owl:ObjectProperty) - (gnt:hasSource rdfs:domain gnc:genotype) - (gnt:hasSource skos:definition "This resource was obtained from this given source") - (gnt:hasAltSourceName a owl:ObjectProperty) - (gnt:hasAltSourceName rdfs:domain gnc:genotype) - (gnt:hasAltSourceName - skos:definition - "The alternative name this resource was 
obtained from") - (gnt:chrNum a owl:ObjectProperty) - (gnt:chrNum rdfs:domain gnc:genotype) - (gnt:chrNum skos:definition "The chromosome number for this resource") - (gnt:chrNum skos:definition "The chromosome number for this resource")) - (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field Geno Name) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type 'gnc:genotype) - (set skos:prefLabel (sanitize-rdf-string (field Geno Name))) - (set gnt:chr (field Geno Chr)) - (set gnt:mb (annotate-field - (field ("IFNULL(Geno.Mb, '')" Mb)) '^^xsd:double)) - (set gnt:mbMm8 (annotate-field (field ("IFNULL(Geno.Mb_mm8, '')" Mb_mm8)) - '^^xsd:double)) - (set gnt:mb2016 - (annotate-field (field ("IFNULL(Geno.Mb_2016, '')" Mb_2016)) - '^^xsd:double)) - (set gnt:hasSequence (field Geno Sequence)) - (set gnt:hasSource (field Geno Source)) - ;; Only dump Source2 if it differs from Source - (set gnt:hasAltSourceName - (field ("IF((Source2 = Source), NULL, Source2)" - Source2))) - (set gnt:belongsToSpecies - (string->identifier - "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) - (set gnt:chrNum - (annotate-field - (field Geno chr_num) - '^^xsd:int)) - (set rdfs:comments (field Geno Comments)))) - - - -(dump-with-documentation - (name "Genotype Metadata") - (connection %connection-settings) - (table-metadata? #f) - (prefixes - '(("dct:" "") - ("gn:" "") - ("gnc:" "") - ("gnt:" "") - ("rdf:" "") - ("rdfs:" "") - ("owl:" "") - ("skos:" "") - ("xsd:" ""))) - (inputs - (list dump-genotypes)) - (outputs - '(#:documentation "./docs/dump-genotype.md" - #:rdf "/export/data/genenetwork-virtuoso/dump-genotype.ttl"))) diff --git a/examples/dump-phenotype.scm b/examples/dump-phenotype.scm deleted file mode 100755 index b7ae003..0000000 --- a/examples/dump-phenotype.scm +++ /dev/null @@ -1,125 +0,0 @@ -#! 
/usr/bin/env guile -!# - -(use-modules (rnrs programs) - (rnrs io ports) - (srfi srfi-1) - (srfi srfi-26) - (ice-9 match) - (ice-9 regex) - (dump strings) - (dump sql) - (dump triples) - (dump special-forms)) - - - -(define %connection-settings - (call-with-input-file (list-ref (command-line) 1) - read)) - - -(define-transformer dump-phenotypes - (tables (PublishXRef - (left-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") - (left-join Publication "ON Publication.Id = PublishXRef.PublicationId") - (left-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId")) - "WHERE PublishXRef.InbredSetId IN (SELECT PublishFreeze.InbredSetId FROM PublishFreeze)") - (schema-triples - (gnc:phenotype a skos:Concept) - (gnc:phenotype skos:description "This is a set of controlled terms that are used to describe a given phenotype") - (gnt:abbreviation a owl:ObjectProperty) - (gnt:abbreviation rdfs:domain gnc:phenotype) - (gnt:abbreviation skos:definition "The abbreviation used for this resource") - (gnt:traitName a owl:ObjectProperty) - (gnt:traitName rdfs:domain gnc:phenotype) - (gnt:traitName skos:definition "The trait Name of this resource") - (gnt:labCode a owl:ObjectProperty) - (gnt:labCode rdfs:domain gnc:phenotype) - (gnt:submitter a owl:ObjectProperty) - (gnt:submitter rdfs:domain gnc:phenotype) - (gnt:submitter skos:definition "A person who submitted this resource to GN") - (gnt:mean rdfs:domain gnc:phenotype) - (gnt:mean rdfs:range xsd:double) - (gnt:LRS rdfs:domain gnc:phenotype) - (gnt:LRS rdfs:range xsd:double) - (gnt:locus rdfs:domain gnc:phenotype) - (gnt:locus rdfs:range rdfs:Literal) - (gnt:additive rdfs:domain gnc:phenotype) - (gnt:additive rdfs:range xsd:double) - (gnt:sequence rdfs:domain gnc:phenotype) - (gnt:sequence rdfs:range xsd:integer)) - (triples (string->identifier - "trait" - (field ("CONCAT(IFNULL(InbredSet.Name, PublishXRef.InbredSetId), '_', PublishXRef.Id)" - Phenotype))) - (set rdf:type 'gnc:phenotype) - (set gnt:belongsToSet 
- (string->identifier - "set" (field InbredSet Name) - #:separator "" - #:proc string-capitalize-first)) - (set gnt:traitName - (let ((trait-id (field PublishXRef Id))) - (if (number? trait-id) - (number->string trait-id) - trait-id))) - (set rdfs:label - (field ("CONCAT(IFNULL(InbredSet.Name, PublishXRef.InbredSetId), '_', PublishXRef.Id)" - Phenotype))) - ;; All phenotypes have a post-publication description - (set dct:description - (sanitize-rdf-string - (field Phenotype Post_publication_description))) - ;; All phenotypes have a post-publication abbreviation - (set gnt:abbreviation (field Phenotype Post_publication_abbreviation)) - (set gnt:labCode (field Phenotype Lab_code)) - (set gnt:submitter - (sanitize-rdf-string (field Phenotype Submitter))) - (multiset dct:contributor - (string-split - (sanitize-rdf-string (field Phenotype Owner)) - #\,)) - (set gnt:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean)) - '^^xsd:double)) - (set gnt:locus (field PublishXRef Locus)) - (set gnt:LRS (annotate-field - (field ("IFNULL(PublishXRef.LRS, '')" lrs)) - '^^xsd:double)) - (set gnt:additive - (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) - '^^xsd:double)) - (set gnt:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:integer)) - (set dct:isReferencedBy - (let ((pmid (field - ("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))" - pmid))) - (publication-id (field Publication Id))) - (if (string-null? pmid) - (string->identifier "unpublished" - (number->string publication-id)) - (ontology 'pubmed: pmid)))))) - - - -(dump-with-documentation - (name "Phenotypes Metadata") - (connection %connection-settings) - (table-metadata? 
#f) - (prefixes - '(("dct:" "") - ("gn:" "") - ("owl:" "") - ("gnc:" "") - ("gnt:" "") - ("skos:" "") - ("rdf:" "") - ("rdfs:" "") - ("xsd:" "") - ("pubmed:" ""))) - (inputs - (list - dump-phenotypes)) - (outputs - '(#:documentation "./docs/dump-phenotype.md" - #:rdf "/export/data/genenetwork-virtuoso/dump-phenotype.ttl"))) diff --git a/examples/dump-probeset-data.scm b/examples/dump-probeset-data.scm deleted file mode 100755 index 55f3f4b..0000000 --- a/examples/dump-probeset-data.scm +++ /dev/null @@ -1,106 +0,0 @@ -#! /usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 match) - (ice-9 regex) - (dump strings) - (dump sql) - (dump triples) - (dump special-forms)) - - - -(define %connection-settings - (call-with-input-file (list-ref (command-line) 1) - read)) - - - -(define-transformer dump-probeset-data - (tables (ProbeSetXRef - (left-join ProbeSet "ON ProbeSetXRef.ProbeSetId = ProbeSet.Id") - (left-join ProbeSetFreeze "ON ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id")) - "WHERE ProbeSetFreeze.public > 0 AND ProbeSetFreeze.confidentiality < 1") - (schema-triples - (gnc:probesetStatistics a skos:Concept) - (gnc:probesetStatistics - skos:description - "This is a set of controlled terms that are used to describe a given probeset's statistics") - (gnt:mean rdfs:domain gnc:probeset) - (gnt:locus rdfs:domain gnc:probeset) - (gnt:LRS rdfs:domain gnc:probeset) - (gnt:stdErr rdfs:domain gnc:probeset) - (gnt:stdErr rdfs:range xsd:double) - (gnt:pValue rdfs:domain gnc:probeset) - (gnt:pValue rdfs:range xsd:double) - (gnt:h2 rdfs:domain gnc:probeset) - (gnt:h2 rdfs:range xsd:double)) - (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field - ("CONCAT(ProbeSetFreeze.Name, '_', IF(NULLIF(TRIM(ProbeSet.Name), ProbeSet.Id) IS NULL, '', TRIM(ProbeSet.Name)))" - probesetData)) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type 'gnc:probesetStatistics) - (set gnt:hasProbeSet (let 
((id (field ("IF(NULLIF(TRIM(ProbeSet.Name), '') IS NULL, '', TRIM(ProbeSet.Name))" - ProbeSetIdName))) - (probeset-id (field ProbeSet Id))) - (string->identifier - "probeset" - (if (string-null? id) - (number->string probeset-id) - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - id - 'pre "_" 'post))))) - (set gnt:mean (annotate-field (field ("IFNULL(ProbeSetXRef.mean, '')" mean)) - '^^xsd:double)) - (set gnt:locus (field ProbeSetXRef Locus)) - (set gnt:LRS (annotate-field - (field ("IFNULL(ProbeSetXRef.LRS, '')" lrs)) - '^^xsd:double)) - (set gnt:additive - (annotate-field (field ("IFNULL(ProbeSetXRef.additive, '')" additive)) - '^^xsd:double)) - (set gnt:stdErr (annotate-field (field ("IFNULL(ProbeSetXRef.se, '')" stdErr)) - '^^xsd:double)) - (set gnt:pValue (annotate-field (field ("IFNULL(ProbeSetXRef.pValue, '')" pValue)) - '^^xsd:double)) - (set gnt:h2 (annotate-field (field ("IFNULL(ProbeSetXRef.h2, '')" h2)) - '^^xsd:double)) - (set gnt:belongsToDataset - (string->identifier - "" - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field ProbeSetFreeze Name) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first)))) - - - -(dump-with-documentation - (name "Probeset Summary Statistics") - (connection %connection-settings) - (table-metadata? #f) - (prefixes - '(("gn:" "") - ("gnc:" "") - ("gnt:" "") - ("skos:" "") - ("owl:" "") - ("rdf:" "") - ("rdfs:" "") - ("xsd:" ""))) - (inputs - (list dump-probeset-data)) - (outputs - '(#:documentation "./docs/dump-probeset-summary-stats.md" - #:rdf "./verified-data/dump-probeset-summary-stats.ttl"))) diff --git a/examples/dump-probeset.scm b/examples/dump-probeset.scm deleted file mode 100755 index 3a55506..0000000 --- a/examples/dump-probeset.scm +++ /dev/null @@ -1,184 +0,0 @@ -#! 
/usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 match) - (ice-9 regex) - (dump strings) - (dump sql) - (dump triples) - (dump special-forms)) - - - -(define %connection-settings - (call-with-input-file (list-ref (command-line) 1) - read)) - - -(define-transformer dump-probeset - (tables (ProbeSet - (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))) - (schema-triples - (gnc:probeset a skos:Concept) - (gnc:probeset - skos:description - "This is a set of controlled terms that are used to describe a given probeset") - (gnt:hasChip a owl:ObjectProperty) - (gnt:hasChip rdfs:domain gnc:probeset) - (gnt:hasTargetId a owl:ObjectProperty) - (gnt:hasTargetId rdfs:domain gnc:probeset) - (gnt:symbol rdfs:domain gnc:probeset) - (gnt:targetsRegion a owl:ObjectProperty) - (gnt:targetsRegion rdfs:domain gnc:probeset) - (gnt:chr rdfs:domain gnc:probeset) - (gnt:mb rdfs:domain gnc:probeset) - (gnt:mbMm8 rdfs:domain gnc:probeset) - (gnt:mb2016 rdfs:domain gnc:probeset) - (gnt:hasSpecificity a owl:ObjectProperty) - (gnt:hasSpecificity rdfs:domain gnc:probeset) - (gnt:hasBlatScore a owl:ObjectProperty) - (gnt:hasBlatScore rdfs:domain gnc:probeset) - (gnt:hasBlatMbStart a owl:ObjectProperty) - (gnt:hasBlatMbStart rdfs:domain gnc:probeset) - (gnt:hasBlatMbStart2016 a owl:ObjectProperty) - (gnt:hasBlatMbStart2016 rdfs:domain gnc:probeset) - (gnt:hasBlatMbEnd a owl:ObjectProperty) - (gnt:hasBlatMbEnd rdfs:domain gnc:probeset) - (gnt:hasBlatMbEnd2016 a owl:ObjectProperty) - (gnt:hasBlatMbEnd2016 rdfs:domain gnc:probeset) - (gnt:hasBlatSeq a owl:ObjectProperty) - (gnt:hasBlatSeq rdfs:domain gnc:probeset) - (gnt:hasTargetSeq a owl:ObjectProperty) - (gnt:hasTargetSeq rdfs:domain gnc:probeset) - (gnt:hasHomologeneId a owl:ObjectProperty) - (gnt:hasHomologeneId rdfs:domain gnc:probeset) - (gnt:hasPubChemId a owl:ObjectProperty) - (gnt:hasPubChemId rdfs:domain gnc:probeset) - (gnt:hasKeggId a owl:ObjectProperty) - (gnt:hasKeggId rdfs:domain gnc:probeset) - 
(gnt:hasOmimId a owl:ObjectProperty) - (gnt:hasOmimId rdfs:domain gnc:probeset) - (gnt:hasChebiId a owl:ObjectProperty) - (gnt:hasChebiId rdfs:domain gnc:probeset)) - (triples - (let ((id (field ("IF(NULLIF(TRIM(ProbeSet.Name), '') IS NULL, '', TRIM(ProbeSet.Name))" - ProbeSetIdName))) - (probeset-id (field ProbeSet Id))) - (string->identifier - "probeset" - (if (string-null? id) - (number->string probeset-id) - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - id - 'pre "_" 'post)))) - (set rdf:type 'gnc:probeset) - (set rdfs:label (field ProbeSet Name)) - (set skos:altLabel - (replace-substrings - (field ProbeSet alias) - '(("\r\n" . "; ")))) - (set gnt:hasChip - (string->identifier - "platform" - (field ("IFNULL(GeneChip.Name, '')" GeneChipName)))) - (set gnt:hasTargetId - (field ("NULLIF(TRIM(ProbeSet.TargetId), '')" - TargetId))) - (set gnt:symbol (field ProbeSet Symbol)) - (set dct:description (sanitize-rdf-string (field ProbeSet description))) - (set gnt:targetsRegion - (sanitize-rdf-string - (field ("NULLIF(TRIM(ProbeSet.Probe_set_target_region), '')" - Probe_set_target_region)))) - (set gnt:chr (field ProbeSet Chr)) - (set gnt:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) - (set gnt:mbMm8 (annotate-field (field ("IFNULL(ProbeSet.Mb_mm8, '')" Mb_mm8)) - '^^xsd:double)) - (set gnt:mb2016 - (annotate-field (field ("IFNULL(ProbeSet.Mb_2016, '')" Mb_2016)) - '^^xsd:double)) - (set gnt:hasSpecificity - (field ("IFNULL(ProbeSet.Probe_set_specificity, '')" - Probe_set_specificity))) - (set gnt:hasBlatScore - (field ("IFNULL(ProbeSet.Probe_set_BLAT_score, '')" - Probe_set_BLAT_score))) - (set gnt:hasBlatMbStart - (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_start, '')" - Probe_set_Blat_Mb_start)) - '^^xsd:double)) - (set gnt:hasBlatMbStart2016 - (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_start_2016, '')" - Probe_set_Blat_Mb_start_2016)) - '^^xsd:double)) - (set gnt:hasBlatMbEnd - (annotate-field (field 
("IFNULL(ProbeSet.Probe_set_Blat_Mb_end, '')" - Probe_set_Blat_Mb_end)) - '^^xsd:double)) - (set gnt:hasBlatMbEnd2016 - (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_start_2016, '')" - Probe_set_Blat_Mb_start_2016)) - '^^xsd:double)) - (set gnt:hasBlatSeq (sanitize-rdf-string (field ProbeSet BlatSeq))) - (set gnt:hasTargetSeq (sanitize-rdf-string (field ProbeSet TargetSeq))) - (set gnt:hasHomologeneId (ontology 'homologene: - (field ("IFNULL(ProbeSet.HomoloGeneID, '')" - HomoloGeneID)))) - (set gnt:hasUniprotId (ontology 'uniprot: - (field ("IFNULL(ProbeSet.UniProtID, '')" - UniProtID)))) - (set gnt:hasPubChemId (ontology - 'pubchem: - (field ("IFNULL(ProbeSet.PubChem_ID, '')" - PubChem_ID)))) - (set gnt:hasKeggId (ontology - 'kegg: - (field ("IFNULL(ProbeSet.KEGG_ID, '')" - KEGG_ID)))) - (set gnt:hasOmimId (ontology - 'omim: - (let ((omim (field ("IFNULL(ProbeSet.OMIM, '')" - OMIM)))) - (if (number? omim) - omim - (regexp-substitute/global - #f "[^0-9]" - omim - 'pre "" 'post))))) - (set gnt:hasChebiId (ontology - 'chebi: - (field ("IFNULL(ProbeSet.ChEBI_ID, '')" - ChEBI_ID)))))) - - - - -(dump-with-documentation - (name "ProbeSet Metadata") - (connection %connection-settings) - (table-metadata? #f) - (prefixes - '(("gn:" "") - ("probeset:" "") - ("gnc:" "") - ("gnt:" "") - ("rdf:" "") - ("kegg:" "") - ("pubchem:" "") - ("omim:" "") - ("rdfs:" "") - ("uniprot:" "") - ("chebi:" "") - ("dct:" "") - ("owl:" "") - ("homologene:" "") - ("xsd:" "") - ("skos:" ""))) - (inputs - (list dump-probeset)) - (outputs - '(#:documentation "./docs/dump-probeset.md" - #:rdf "./verified-data/dump-probeset.ttl"))) diff --git a/examples/dump-publication.scm b/examples/dump-publication.scm deleted file mode 100755 index 1881872..0000000 --- a/examples/dump-publication.scm +++ /dev/null @@ -1,81 +0,0 @@ -#! 
/usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 match) - (ice-9 regex) - (dump strings) - (dump sql) - (dump triples) - (dump special-forms)) - - - -(define %connection-settings - (call-with-input-file (list-ref (command-line) 1) - read)) - - - -(define-transformer dump-publication - (tables (Publication)) - (triples - (let ((pmid (field - ("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))" - pmid))) - (publication-id (field Publication Id))) - (if (string-null? pmid) - (string->identifier "unpublished" - (number->string publication-id)) - (ontology 'pubmed: pmid))) - (set rdf:type 'fabio:ResearchPaper) - (set fabio:hasPubMedId - (ontology 'pubmed: (field ("IFNULL(PubMed_ID, '')" pubmedId)))) - (set dct:title (delete-substrings (field Publication Title) - "Unknown")) - (set fabio:Journal (delete-substrings (field Publication Journal) - "Unknown")) - (set prism:volume (delete-substrings (field Publication Volume) - "Unknown")) - (set fabio:page (delete-substrings (field Publication Pages) - "Unknown")) - (set prism:publicationDate (annotate-field - (delete-substrings (field Publication Month) - "Unknown") - '^^xsd:gMonth)) - (set fabio:hasPublicationYear - (annotate-field - (field - ("IF(Publication.Year = 0, NULL, Publication.Year)" Year)) - '^^xsd:gYear)) - (multiset dct:creator - ;; The authors field is a comma - ;; separated list. Split it. - (map string-trim (string-split (sanitize-rdf-string (field Publication Authors)) #\,))) - (set dct:abstract - (sanitize-rdf-string - (field Publication Abstract))))) - - - -(dump-with-documentation - (name "Publications Metadata") - (connection %connection-settings) - (table-metadata? 
#f) - (prefixes - '(("gnt:" "") - ("fabio:" "") - ("dct:" "") - ("prism:" "") - ("gn:" "") - ("gnc:" "") - ("pubmed:" "") - ("rdfs:" "") - ("xsd:" "") - ("rdf:" ""))) - (inputs - (list dump-publication)) - (outputs - '(#:documentation "./docs/dump-publication.md" - #:rdf "./verified-data/dump-publication.ttl"))) diff --git a/examples/dump-species-metadata.scm b/examples/dump-species-metadata.scm deleted file mode 100755 index b0ac6f8..0000000 --- a/examples/dump-species-metadata.scm +++ /dev/null @@ -1,226 +0,0 @@ -#! /usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 match) - (ice-9 regex) - (dump strings) - (dump sql) - (dump triples) - (dump special-forms)) - - - -(define %connection-settings - (call-with-input-file (list-ref (command-line) 1) - read)) - - - -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" - (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) - -(define-transformer dump-species - (tables (Species)) - (schema-triples - (gnc:species a skos:Concept) - (gnc:species skos:description "This is a set of controlled terms that are used to describe a given species") - (gnc:species skos:broader gnc:family) - (gnt:binomialName a owl:ObjectProperty) - (gnt:binomialName rdfs:domain gnc:species) - (gnt:family a owl:ObjectProperty) - (gnt:family rdfs:domain gnc:species) - (gnt:family skos:definition "This resource belongs to this family") - (gnt:organism a owl:ObjectProperty) - (gnt:organism rdfs:domain gnc:species) - (gnt:shortName a owl:ObjectProperty) - (gnt:shortName rdfs:domain gnc:species)) - (triples - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc 
string-capitalize-first) - (set rdf:type 'gnc:species) - (set skos:label (field Species SpeciesName)) - (set skos:altLabel (field Species Name)) - (set rdfs:label (field Species MenuName)) - (set gnt:binomialName (field Species FullName)) - (set gnt:family (field Species Family)) - (set gnt:organism (ontology 'taxon: (field Species TaxonomyId))))) - -#! - -The ProbeData table contains StrainID. - -MariaDB [db_webqtl]> select * from ProbeData limit 2; -+--------+----------+---------+ -| Id | StrainId | value | -+--------+----------+---------+ -| 503636 | 42 | 11.6906 | -| 503636 | 43 | 11.4205 | -+--------+----------+---------+ - -Likewise - -MariaDB [db_webqtl]> select * from ProbeSetData wher limit 2; -+----+----------+-------+ -| Id | StrainId | value | -+----+----------+-------+ -| 1 | 1 | 5.742 | -| 1 | 2 | 5.006 | -+----+----------+-------+ - -To get at the strain use - -MariaDB [db_webqtl]> select * from Strain where Id=1 limit 15; -+----+--------+--------+-----------+--------+-------+ -| Id | Name | Name2 | SpeciesId | Symbol | Alias | -+----+--------+--------+-----------+--------+-------+ -| 1 | B6D2F1 | B6D2F1 | 1 | NULL | NULL | -+----+--------+--------+-----------+--------+-------+ - -A typical query may look like - -SELECT Strain.Name, Strain.Id FROM Strain, Species -WHERE Strain.Name IN f{create_in_clause(self.samplelist)} -AND Strain.SpeciesId=Species.Id -AND Species.name = %s, (self.group.species,) - -At this point it is not very clear how Name, Name2, Symbol and Alias are used. 
- -!# - -(define-transformer dump-strain - (tables (Strain - (left-join Species "ON Strain.SpeciesId = Species.SpeciesId"))) - (schema-triples - (gnc:strain skos:broader gnc:species) - (gnt:belongsToSpecies rdfs:domain gnc:strain) - (gnt:belongsToSpecies skos:definition "This resource belongs to this species") - (gnt:belongsToSpecies a owl:ObjectProperty) - (gnt:belongsToSpecies skos:definition "This resource belongs to this species") - (gnt:alias rdfs:domain gnc:strain) - (gnt:alias a owl:ObjectProperty) - (gnt:symbol rdfs:domain gnc:strain) - (gnt:symbol a owl:ObjectProperty)) - (triples (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field Strain Name) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type 'gnc:strain) - (set gnt:belongsToSpecies - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) - ;; Name, and maybe a second name - (set rdfs:label (sanitize-rdf-string (field Strain Name))) - (set rdfs:label (sanitize-rdf-string (field ("IF ((Strain.Name2 != Strain.Name), Strain.Name2, '')" Name2)))) - (set gnt:alias (sanitize-rdf-string (field ("IF ((Strain.Alias != Strain.Name), Strain.Alias, '')" Alias)))) - (set gnt:symbol (field ("IF ((Strain.Symbol != Strain.Name), Strain.Symbol, '')" Symbol))))) - -(define-transformer dump-mapping-method - (tables (MappingMethod)) - (schema-triples - (gnc:mappingMethod a skos:Concept) - (gnc:mappingMethod skos:definition "Terms that decribe mapping/normalization methods used in GeneNetwork")) - (triples - (string->identifier "mappingMethod" (field MappingMethod Name)) - (set rdf:type 'gnc:mappingMethod) - (set rdfs:label (field MappingMethod Name)))) - - -(define-transformer dump-inbred-set - (tables (InbredSet - (left-join Species "ON InbredSet.SpeciesId=Species.Id") - (left-join MappingMethod - "ON InbredSet.MappingMethodId=MappingMethod.Id"))) - (schema-triples - (gnc:set 
skos:broader gnc:species) - (gnc:set skos:definition "A set of terms used to describe an set, which can be inbredSet, outbredSet etc etc.") - (gnt:geneticType a owl:ObjectProperty) - (gnt:geneticType rdfs:domain gnc:set) - (gnt:code a owl:ObjectProperty) - (gnt:code rdfs:domain gnc:set) - ;; Already defined as an owl prop in dump-species - (gnt:family rdfs:domain gnc:set) - (gnt:phenotype a owl:ObjectProperty) - (gnt:phenotype rdfs:domain gnc:set) - (gnt:genotype a owl:ObjectProperty) - (gnt:genotype rdfs:domain gnt:inbredSet) - (gnt:mappingMethod a owl:ObjectProperty) - (gnt:mappingMethod rdfs:domain gnc:set)) - (triples (string->identifier - "set" (field InbredSet Name) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type 'gnc:set) - (set rdfs:label (field InbredSet FullName)) - (set skos:altLabel (field InbredSet Name)) - (set gnt:geneticType (field InbredSet GeneticType)) - (set gnt:family (field InbredSet Family)) - (set gnt:mappingMethod (field MappingMethod Name)) - (set gnt:code (field InbredSet InbredSetCode)) - (set gnt:belongsToSpecies - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) - (set gnt:genotype - (field ("IF ((SELECT PublishFreeze.Name FROM PublishFreeze WHERE PublishFreeze.InbredSetId = InbredSet.Id LIMIT 1) IS NOT NULL, 'Traits and Cofactors', '')" genotypeP))) - (set gnt:phenotype - (field ("IF ((SELECT GenoFreeze.Name FROM GenoFreeze WHERE GenoFreeze.InbredSetId = InbredSet.Id LIMIT 1) IS NOT NULL, 'DNA Markers and SNPs', '')" phenotypeP))) - (multiset gnt:hasTissue - (map - (lambda (x) - (string->identifier "tissue" - x)) - (string-split-substring - (field ("(SELECT GROUP_CONCAT(DISTINCT Tissue.Short_Name SEPARATOR'||') AS MolecularTraits FROM ProbeFreeze, ProbeSetFreeze, InbredSet, Tissue, Species WHERE ProbeFreeze.TissueId = Tissue.Id AND ProbeFreeze.InbredSetId = InbredSet.Id AND ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id ORDER BY 
Tissue.Name)" - molecularTrait)) - "||"))))) - -(define-transformer dump-avg-method - ;; The Name and Normalization fields seem to be the same. Dump only - ;; the Name field. - (tables (AvgMethod)) - (schema-triples - (gnc:avgMethod rdf:type owl:Class)) - (triples (string->identifier "avgmethod" (field AvgMethod Name)) - (set rdf:type 'gnc:avgMethod) - (set rdfs:label (field AvgMethod Normalization)))) - - - -(dump-with-documentation - (name "Species Metadata") - (connection %connection-settings) - (table-metadata? #f) - (prefixes - '(("gn:" "") - ("gnc:" "") - ("owl:" "") - ("gnt:" "") - ("skos:" "") - ("rdf:" "") - ("rdfs:" "") - ("taxon:" ""))) - (inputs - (list - dump-inbred-set - dump-species - dump-strain - dump-mapping-method - dump-avg-method)) - (outputs - '(#:documentation "./docs/dump-species-metadata.md" - #:rdf "/export/data/genenetwork-virtuoso/dump-species-metadata.ttl"))) diff --git a/examples/dump-tissue.scm b/examples/dump-tissue.scm deleted file mode 100755 index 3658a26..0000000 --- a/examples/dump-tissue.scm +++ /dev/null @@ -1,50 +0,0 @@ -#! /usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 match) - (ice-9 regex) - (dump strings) - (dump sql) - (dump triples) - (dump special-forms)) - - - -(define %connection-settings - (call-with-input-file (list-ref (command-line) 1) - read)) - - - -(define-transformer dump-tissue - ;; The Name and TissueName fields seem to be identical. BIRN_lex_ID - ;; and BIRN_lex_Name are mostly NULL. - (tables (Tissue)) - (schema-triples - (gnc:tissue a skos:Concept)) - ;; Hopefully the Short_Name field is distinct and can be used as an - ;; identifier. - (triples (string->identifier "tissue" (field Tissue Short_Name)) - (set rdf:type 'gnc:tissue) - (set rdfs:label (field Tissue Name)))) - - - -(dump-with-documentation - (name "Tissue Metadata") - (connection %connection-settings) - (table-metadata? 
#f)
-  (prefixes
-   '(("gn:" "")
-     ("gnt:" "")
-     ("skos:" "")
-     ("gnc:" "")
-     ("rdf:" "")
-     ("rdfs:" "")))
-  (inputs
-   (list dump-tissue))
-  (outputs
-   '(#:documentation "./docs/dump-tissue.md"
-     #:rdf "./verified-data/dump-tissue.ttl")))
diff --git a/examples/generif.scm b/examples/generif.scm
new file mode 100755
index 0000000..0b3c8e4
--- /dev/null
+++ b/examples/generif.scm
@@ -0,0 +1,150 @@
+#! /usr/bin/env guile
+!#
+;;; generif.scm -- describe the GeneRIF (gene wiki) tables as RDF triples.
+(use-modules (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 regex)
+             (dump strings)
+             (dump sql)
+             (dump triples)
+             (dump special-forms))
+
+
+;; Connection settings: a single datum read from the file named by the first command-line argument.
+(define %connection-settings
+  (call-with-input-file (list-ref (command-line) 1)
+    read))
+
+
+;; One subject per GeneId carrying its distinct symbols, species names and taxon ids.
+(define-transformer genewiki-symbols
+  (tables (GeneRIF_BASIC
+           (left-join Species "USING (SpeciesId)"))
+          "GROUP BY GeneId ORDER BY BINARY symbol")
+  (schema-triples
+   (gnt:symbol rdfs:domain gn:geneWikiEntry)
+   (gnt:wikiEntryOfSpecies rdfs:range gn:species)
+   (gnt:taxId rdfs:domain gn:geneWikiEntry))
+  (triples (ontology 'generif: (field GeneRIF_BASIC GeneId))
+    (multiset gnt:symbol (string-split (field ("GROUP_CONCAT(DISTINCT symbol)" symbol))
+                                       #\,))
+    (multiset gnt:wikiEntryOfSpecies
+              (string-split
+               (field ("GROUP_CONCAT(DISTINCT Species.SpeciesName)" species))
+               #\,))
+    (multiset gnt:taxId (map (cut ontology 'ncbiTaxon: <>)
+                             (string-split (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId))
+                                           #\,)))))
+
+(define-transformer gn-genewiki-entries ; displayed GeneRIF rows with VersionId = 0, grouped by symbol
+  (tables (GeneRIF
+           (left-join GeneRIF_BASIC "USING (symbol)")
+           (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId")
+           (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
+           (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id"))
+          "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 GROUP BY GeneRIF.symbol")
+  (schema-triples
+   (gnt:geneWikiEntry a rdfs:Class)
+   (gnt:geneWikiEntry a owl:Class)
+   (gnt:geneWikiEntry rdfs:comment "Represents GeneRIF Entries")
+   (gnt:geneCategory rdfs:domain gn:geneWikiEntry)
+   (gnt:geneWikiEntryOfGn rdfs:domain gn:geneWikiEntry)
+   (gnt:geneWikiEntry rdfs:domain gn:geneWikiEntry))
+  (triples
+      (let ([geneid (field GeneRIF_BASIC GeneId)])
+        (if (eqv? geneid 0)             ; eqv?, not eq?: numeric comparison
+            (ontology 'gnt:anonSymbol_
+                      (field GeneRIF symbol))
+            (ontology 'generif:
+                      geneid)))
+    (set rdf:type
+         (if (string-null? (field ("IFNULL(GeneRIF_BASIC.GeneId, '')" geneWikiEntryP)))
+             ""
+             'gn:geneWikiEntry))
+    (set gnt:wikiEntryOfSpecies
+         (string->binomial-name (field Species FullName)))
+    ;; This only dumps symbols not present in the GeneRIF_BASIC table
+    (set gnt:symbol (let ([geneid (field GeneRIF_BASIC GeneId)])
+                      (if (eqv? geneid 0)
+                          (field GeneRIF symbol)
+                          "")))
+    (multiset gnt:geneWikiEntryOfGn     ; entries joined with ';;;;;', their fields with '::::'
+              (let* ([entries
+                      (sanitize-rdf-string
+                       (field
+                        ("GROUP_CONCAT(DISTINCT CONCAT_WS('::::', IFNULL(GeneCategory.Name, ''), IFNULL(GeneRIF.PubMed_ID, ''), GeneRIF.email, CAST(CONVERT(BINARY CONVERT(GeneRIF.comment USING latin1) USING utf8) AS VARCHAR(15000)), GeneRIF.createtime, IFNULL(weburl, '')) SEPARATOR';;;;;')"
+                         wikientry)))]
+                     [comments (string-split-substring entries ";;;;;")])
+                (map
+                 (match-lambda
+                   ((genecategory pmid email text createtime weburl)
+                    (blank-node
+                     (set gnt:geneCategory genecategory)
+                     (multiset dct:source
+                               (map (lambda (el) (if (string-null? el)
+                                                     ""
+                                                     (ontology 'pubmed: el)))
+                                    (string-split pmid #\space)))
+                     (set dct:creator (regexp-substitute/global #f "@.*$" ; keep only the part before "@"
+                                                                email
+                                                                'pre
+                                                                ""
+                                                                'post))
+                     (set gnt:geneWikiEntry
+                          (annotate-field text '^^xsd:string))
+                     (set dct:created (annotate-field
+                                       createtime
+                                       '^^xsd:dateTime)) ; the XSD datatype name is case-sensitive
+                     (set foaf:homepage weburl))))
+                 (map
+                  (cut string-split-substring <> "::::")
+                  comments))))))
+
+(define-transformer ncbi-genewiki-entries ; one blank node per (GeneId, comment, createtime) group
+  (tables (GeneRIF_BASIC)
+          "GROUP BY GeneId, comment, createtime")
+  (schema-triples
+   (gnt:geneWikiEntryOfNCBI rdfs:domain gn:geneWikiEntry))
+  (triples (ontology 'generif:
+                     (field GeneRIF_BASIC GeneId))
+    (set gnt:geneWikiEntryOfNCBI
+         (blank-node
+          (set gnt:geneWikiEntry
+               (annotate-field (field GeneRIF_BASIC comment)
+                               '^^xsd:string))
+          (multiset dct:source (map (lambda (el) (if (string-null? el)
+                                                     ""
+                                                     (ontology 'pubmed: el)))
+                                    (string-split (field ("GROUP_CONCAT(PubMed_ID)" pmids))
+                                                  #\,)))
+          (set dct:created (annotate-field (time-unix->string
+                                            (field GeneRIF_BASIC createtime) "~5")
+                                           '^^xsd:dateTime))))))
+
+
+;; Entry point: only gn-genewiki-entries is enabled; the other two transformers are commented out.
+(with-documentation
+ (name "GeneRIF Metadata")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ (prefixes
+  '(("rdf:" "")
+    ("rdfs:" "")
+    ("gn:" "")
+    ("gnc:" "")
+    ("gnt:" "")
+    ("dct:" "")
+    ("pubmed:" "")
+    ("ncbiTaxon:" "")
+    ("generif:" "")
+    ("xsd:" "")
+    ("owl:" "")))  ; NOTE(review): foaf:homepage is emitted above but foaf: has no entry here
+ (inputs
+  (list ;; genewiki-symbols
+   gn-genewiki-entries
+   ;; ncbi-genewiki-entries
+   ))
+ (outputs
+  '(#:documentation "./docs/generif.md"
+    #:rdf "./verified-data/generif.ttl")))
diff --git a/examples/genotype.scm b/examples/genotype.scm
new file mode 100755
index 0000000..63b85a7
--- /dev/null
+++ b/examples/genotype.scm
@@ -0,0 +1,124 @@
+#! 
/usr/bin/env guile
+!#
+;;; genotype.scm -- describe rows of the Geno table (joined with Species) as RDF triples.
+(use-modules (rnrs programs)
+             (rnrs io ports)
+             (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 regex)
+             (dump strings)
+             (dump sql)
+             (dump triples)
+             (dump special-forms))
+
+
+;; Connection settings: a single datum read from the file named by the first command-line argument.
+(define %connection-settings
+  (call-with-input-file (list-ref (command-line) 1)
+    read))
+
+
+
+(define (remap-species-identifiers str)
+  "This procedure remaps identifiers to standard binominal. Obviously this should
+  be sorted by correcting the database!"
+  (match str
+    ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"]
+    ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"]
+    ["Macaca mulatta" "Macaca nemestrina"] ; NOTE(review): maps one species name onto another -- confirm this is intended
+    ["Bat (Glossophaga soricina)" "Glossophaga soricina"]
+    [str str]))                         ; anything else is returned unchanged
+
+(define-transformer genotypes           ; one subject per Geno row, named after the sanitised Geno.Name
+  (tables (Geno
+           (left-join Species "USING (SpeciesId)")))
+  (schema-triples
+   (gnc:genotype a skos:Concept)
+   (gnc:genotype
+    skos:definition
+    "This is a set of controlled terms that are used to describe a given genotype")
+   (gnt:chr a owl:ObjectProperty)
+   (gnt:chr skos:definition "This resource is located on a given chromosome")
+   (gnt:chr rdfs:domain gnc:genotype)
+   (gnt:mb a owl:ObjectProperty)
+   (gnt:mb skos:definition "The size of this resource in Mb")
+   (gnt:mb rdfs:domain gnc:genotype)
+   (gnt:mbMm8 a owl:ObjectProperty)
+   (gnt:mbMm8 skos:definition "TODO")
+   (gnt:mbMm8 rdfs:domain gnc:genotype)
+   (gnt:mb2016 a owl:ObjectProperty)
+   (gnt:mb2016 skos:definition "TODO")
+   (gnt:mb2016 rdfs:domain gnc:genotype)
+   (gnt:hasSequence a owl:ObjectProperty)
+   (gnt:hasSequence skos:definition "This resource has a given sequence")
+   (gnt:hasSequence rdfs:domain gnc:genotype)
+   (gnt:hasSource a owl:ObjectProperty)
+   (gnt:hasSource rdfs:domain gnc:genotype)
+   (gnt:hasSource skos:definition "This resource was obtained from this given source")
+   (gnt:hasAltSourceName a owl:ObjectProperty)
+   (gnt:hasAltSourceName rdfs:domain gnc:genotype)
+   (gnt:hasAltSourceName
+    skos:definition
+    "The alternative name this resource was obtained from")
+   (gnt:chrNum a owl:ObjectProperty)
+   (gnt:chrNum rdfs:domain gnc:genotype)
+   (gnt:chrNum skos:definition
+               "The chromosome number for this resource"))
+  (triples
+      (string->identifier
+       ""
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]"              ; every character outside this set becomes "_"
+        (field Geno Name)
+        'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:genotype)
+    (set skos:prefLabel (sanitize-rdf-string (field Geno Name)))
+    (set gnt:chr (field Geno Chr))
+    (set gnt:mb (annotate-field
+                 (field ("IFNULL(Geno.Mb, '')" Mb)) '^^xsd:double))
+    (set gnt:mbMm8 (annotate-field (field ("IFNULL(Geno.Mb_mm8, '')" Mb_mm8))
+                                   '^^xsd:double))
+    (set gnt:mb2016
+         (annotate-field (field ("IFNULL(Geno.Mb_2016, '')" Mb_2016))
+                         '^^xsd:double))
+    (set gnt:hasSequence (field Geno Sequence))
+    (set gnt:hasSource (field Geno Source))
+    ;; Only dump Source2 if it differs from Source
+    (set gnt:hasAltSourceName
+         (field ("IF((Source2 = Source), NULL, Source2)"
+                 Source2)))
+    (set gnt:belongsToSpecies
+         (string->identifier
+          "" (remap-species-identifiers (field Species Fullname))
+          #:separator ""
+          #:proc string-capitalize-first))
+    (set gnt:chrNum
+         (annotate-field
+          (field Geno chr_num)
+          '^^xsd:int))
+    (set rdfs:comment (field Geno Comments))))
+
+
+;; Entry point: runs the genotypes transformer with the outputs listed below.
+(with-documentation
+ (name "Genotype Metadata")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ (prefixes
+  '(("dct:" "")
+    ("gn:" "")
+    ("gnc:" "")
+    ("gnt:" "")
+    ("rdf:" "")
+    ("rdfs:" "")
+    ("owl:" "")
+    ("skos:" "")
+    ("xsd:" "")))
+ (inputs
+  (list genotypes))
+ (outputs
+  '(#:documentation "./docs/genotype.md"
+    #:rdf "/export/data/genenetwork-virtuoso/genotype.ttl")))
diff --git a/examples/phenotype.scm b/examples/phenotype.scm
new file mode 100755
index 0000000..1c68159
--- /dev/null
+++ b/examples/phenotype.scm
@@ -0,0 +1,125 @@
+#! 
/usr/bin/env guile
+!#
+;;; phenotype.scm -- describe published phenotype traits (PublishXRef and joined tables) as RDF triples.
+(use-modules (rnrs programs)
+             (rnrs io ports)
+             (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 regex)
+             (dump strings)
+             (dump sql)
+             (dump triples)
+             (dump special-forms))
+
+
+;; Connection settings: a single datum read from the file named by the first command-line argument.
+(define %connection-settings
+  (call-with-input-file (list-ref (command-line) 1)
+    read))
+
+;; Only rows whose InbredSetId also appears in PublishFreeze are selected (see the WHERE clause).
+(define-transformer phenotypes
+  (tables (PublishXRef
+           (left-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId")
+           (left-join Publication "ON Publication.Id = PublishXRef.PublicationId")
+           (left-join Phenotype "ON Phenotype.Id = PublishXRef.PhenotypeId"))
+          "WHERE PublishXRef.InbredSetId IN (SELECT PublishFreeze.InbredSetId FROM PublishFreeze)")
+  (schema-triples
+   (gnc:phenotype a skos:Concept)
+   (gnc:phenotype skos:definition "This is a set of controlled terms that are used to describe a given phenotype")
+   (gnt:abbreviation a owl:ObjectProperty)
+   (gnt:abbreviation rdfs:domain gnc:phenotype)
+   (gnt:abbreviation skos:definition "The abbreviation used for this resource")
+   (gnt:traitName a owl:ObjectProperty)
+   (gnt:traitName rdfs:domain gnc:phenotype)
+   (gnt:traitName skos:definition "The trait Name of this resource")
+   (gnt:labCode a owl:ObjectProperty)
+   (gnt:labCode rdfs:domain gnc:phenotype)
+   (gnt:submitter a owl:ObjectProperty)
+   (gnt:submitter rdfs:domain gnc:phenotype)
+   (gnt:submitter skos:definition "A person who submitted this resource to GN")
+   (gnt:mean rdfs:domain gnc:phenotype)
+   (gnt:mean rdfs:range xsd:double)
+   (gnt:LRS rdfs:domain gnc:phenotype)
+   (gnt:LRS rdfs:range xsd:double)
+   (gnt:locus rdfs:domain gnc:phenotype)
+   (gnt:locus rdfs:range rdfs:Literal)
+   (gnt:additive rdfs:domain gnc:phenotype)
+   (gnt:additive rdfs:range xsd:double)
+   (gnt:sequence rdfs:domain gnc:phenotype)
+   (gnt:sequence rdfs:range xsd:integer))
+  (triples (string->identifier
+            "trait"
+            (field ("CONCAT(IFNULL(InbredSet.Name, PublishXRef.InbredSetId), '_', PublishXRef.Id)"
+                    Phenotype)))
+    (set rdf:type 'gnc:phenotype)
+    (set gnt:belongsToSet
+         (string->identifier
+          "set" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))
+    (set gnt:traitName
+         (let ((trait-id (field PublishXRef Id)))
+           (if (number? trait-id)       ; emit the id as a string either way
+               (number->string trait-id)
+               trait-id)))
+    (set rdfs:label
+         (field ("CONCAT(IFNULL(InbredSet.Name, PublishXRef.InbredSetId), '_', PublishXRef.Id)"
+                 Phenotype)))
+    ;; All phenotypes have a post-publication description
+    (set dct:description
+         (sanitize-rdf-string
+          (field Phenotype Post_publication_description)))
+    ;; All phenotypes have a post-publication abbreviation
+    (set gnt:abbreviation (field Phenotype Post_publication_abbreviation))
+    (set gnt:labCode (field Phenotype Lab_code))
+    (set gnt:submitter
+         (sanitize-rdf-string (field Phenotype Submitter)))
+    (multiset dct:contributor
+              (string-split
+               (sanitize-rdf-string (field Phenotype Owner))
+               #\,))
+    (set gnt:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean))
+                                  '^^xsd:double))
+    (set gnt:locus (field PublishXRef Locus))
+    (set gnt:LRS (annotate-field
+                  (field ("IFNULL(PublishXRef.LRS, '')" lrs))
+                  '^^xsd:double))
+    (set gnt:additive
+         (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive))
+                         '^^xsd:double))
+    (set gnt:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:integer))
+    (set dct:isReferencedBy
+         (let ((pmid (field
+                      ("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))"
+                       pmid)))
+               (publication-id (field Publication Id)))
+           (if (string-null? pmid)      ; no PubMed id: fall back to an "unpublished" identifier
+               (string->identifier "unpublished"
+                                   (number->string publication-id))
+               (ontology 'pubmed: pmid))))))
+
+
+;; Entry point: runs the phenotypes transformer with the outputs listed below.
+(with-documentation
+ (name "Phenotypes Metadata")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ (prefixes
+  '(("dct:" "")
+    ("gn:" "")
+    ("owl:" "")
+    ("gnc:" "")
+    ("gnt:" "")
+    ("skos:" "")
+    ("rdf:" "")
+    ("rdfs:" "")
+    ("xsd:" "")
+    ("pubmed:" "")))
+ (inputs
+  (list
+   phenotypes))
+ (outputs
+  '(#:documentation "./docs/phenotype.md"
+    #:rdf "/export/data/genenetwork-virtuoso/phenotype.ttl")))
diff --git a/examples/probeset-data.scm b/examples/probeset-data.scm
new file mode 100755
index 0000000..d46bcda
--- /dev/null
+++ b/examples/probeset-data.scm
@@ -0,0 +1,98 @@
+#! /usr/bin/env guile
+!#
+;;; probeset-data.scm -- describe per-dataset probeset summary statistics (ProbeSetXRef) as RDF triples.
+(use-modules (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 regex)
+             (dump strings)
+             (dump sql)
+             (dump triples)
+             (dump special-forms))
+
+
+;; Connection settings: a single datum read from the file named by the first command-line argument.
+(define %connection-settings
+  (call-with-input-file (list-ref (command-line) 1)
+    read))
+
+
+;; Restricted to datasets with ProbeSetFreeze.public > 0 and confidentiality < 1.
+(define-transformer probeset-data
+  (tables (ProbeSetXRef
+           (left-join ProbeSet "ON ProbeSetXRef.ProbeSetId = ProbeSet.Id")
+           (left-join ProbeSetFreeze "ON ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id"))
+          "WHERE ProbeSetFreeze.public > 0 AND ProbeSetFreeze.confidentiality < 1")
+  (schema-triples
+   (gnc:probesetStatistics a skos:Concept)
+   (gnc:probesetStatistics
+    skos:definition
+    "This is a set of controlled terms that are used to describe a given probeset's statistics")
+   (gnt:mean rdfs:domain gnc:probeset) ; NOTE(review): subjects below are typed gnc:probesetStatistics -- confirm the intended domain
+   (gnt:locus rdfs:domain gnc:probeset)
+   (gnt:LRS rdfs:domain gnc:probeset)
+   (gnt:stdErr rdfs:domain gnc:probeset)
+   (gnt:stdErr rdfs:range xsd:double)
+   (gnt:pValue rdfs:domain gnc:probeset)
+   (gnt:pValue rdfs:range xsd:double)
+   (gnt:h2 rdfs:domain gnc:probeset)
+   (gnt:h2 rdfs:range xsd:double))
+  (triples
+      (string->identifier
+       ""
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]"
+        (field                          ; NOTE(review): compares the name with ProbeSet.Id here but with '' below -- confirm
+         ("CONCAT(ProbeSetFreeze.Name, '_', IF(NULLIF(TRIM(ProbeSet.Name), ProbeSet.Id) IS NULL, '', TRIM(ProbeSet.Name)))"
+          probesetData))
+        'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:probesetStatistics)
+    (set gnt:hasProbeSet (let ((id (field ("IF(NULLIF(TRIM(ProbeSet.Name), '') IS NULL, '', TRIM(ProbeSet.Name))"
+                                           ProbeSetIdName)))
+                               (probeset-id (field ProbeSet Id)))
+                           (string->identifier
+                            "probeset"
+                            (if (string-null? id) ; blank name: fall back to the numeric Id
+                                (number->string probeset-id)
+                                (regexp-substitute/global
+                                 #f "[^A-Za-z0-9:]"
+                                 id
+                                 'pre "_" 'post)))))
+    (set gnt:mean (annotate-field (field ("IFNULL(ProbeSetXRef.mean, '')" mean))
+                                  '^^xsd:double))
+    (set gnt:locus (field ProbeSetXRef Locus))
+    (set gnt:LRS (annotate-field
+                  (field ("IFNULL(ProbeSetXRef.LRS, '')" lrs))
+                  '^^xsd:double))
+    (set gnt:additive
+         (annotate-field (field ("IFNULL(ProbeSetXRef.additive, '')" additive))
+                         '^^xsd:double))
+    (set gnt:stdErr (annotate-field (field ("IFNULL(ProbeSetXRef.se, '')" stdErr))
+                                    '^^xsd:double))
+    (set gnt:pValue (annotate-field (field ("IFNULL(ProbeSetXRef.pValue, '')" pValue))
+                                    '^^xsd:double))
+    (set gnt:h2 (annotate-field (field ("IFNULL(ProbeSetXRef.h2, '')" h2))
+                                '^^xsd:double))))
+
+
+;; Entry point: runs the probeset-data transformer with the outputs listed below.
+(with-documentation
+ (name "Probeset Summary Statistics")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ (prefixes
+  '(("gn:" "")
+    ("gnc:" "")
+    ("gnt:" "")
+    ("skos:" "")
+    ("owl:" "")
+    ("rdf:" "")
+    ("rdfs:" "")
+    ("xsd:" "")))
+ (inputs
+  (list probeset-data))
+ (outputs
+  '(#:documentation "./docs/probeset-summary-stats.md"
+    #:rdf "./verified-data/probeset-summary-stats.ttl")))
diff --git a/examples/probeset.scm b/examples/probeset.scm
new file mode 100755
index 0000000..68ddb59
--- /dev/null
+++ b/examples/probeset.scm
@@ -0,0 +1,184 @@
+#! 
/usr/bin/env guile
+!#
+;;; probeset.scm -- describe rows of the ProbeSet table (joined with GeneChip) as RDF triples.
+(use-modules (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 regex)
+             (dump strings)
+             (dump sql)
+             (dump triples)
+             (dump special-forms))
+
+
+;; Connection settings: a single datum read from the file named by the first command-line argument.
+(define %connection-settings
+  (call-with-input-file (list-ref (command-line) 1)
+    read))
+
+;; Subject is "probeset" plus the sanitised ProbeSet.Name, or the numeric Id when the name is blank.
+(define-transformer probeset
+  (tables (ProbeSet
+           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")))
+  (schema-triples
+   (gnc:probeset a skos:Concept)
+   (gnc:probeset
+    skos:definition
+    "This is a set of controlled terms that are used to describe a given probeset")
+   (gnt:hasChip a owl:ObjectProperty)
+   (gnt:hasChip rdfs:domain gnc:probeset)
+   (gnt:hasTargetId a owl:ObjectProperty)
+   (gnt:hasTargetId rdfs:domain gnc:probeset)
+   (gnt:symbol rdfs:domain gnc:probeset)
+   (gnt:targetsRegion a owl:ObjectProperty)
+   (gnt:targetsRegion rdfs:domain gnc:probeset)
+   (gnt:chr rdfs:domain gnc:probeset)
+   (gnt:mb rdfs:domain gnc:probeset)
+   (gnt:mbMm8 rdfs:domain gnc:probeset)
+   (gnt:mb2016 rdfs:domain gnc:probeset)
+   (gnt:hasSpecificity a owl:ObjectProperty)
+   (gnt:hasSpecificity rdfs:domain gnc:probeset)
+   (gnt:hasBlatScore a owl:ObjectProperty)
+   (gnt:hasBlatScore rdfs:domain gnc:probeset)
+   (gnt:hasBlatMbStart a owl:ObjectProperty)
+   (gnt:hasBlatMbStart rdfs:domain gnc:probeset)
+   (gnt:hasBlatMbStart2016 a owl:ObjectProperty)
+   (gnt:hasBlatMbStart2016 rdfs:domain gnc:probeset)
+   (gnt:hasBlatMbEnd a owl:ObjectProperty)
+   (gnt:hasBlatMbEnd rdfs:domain gnc:probeset)
+   (gnt:hasBlatMbEnd2016 a owl:ObjectProperty)
+   (gnt:hasBlatMbEnd2016 rdfs:domain gnc:probeset)
+   (gnt:hasBlatSeq a owl:ObjectProperty)
+   (gnt:hasBlatSeq rdfs:domain gnc:probeset)
+   (gnt:hasTargetSeq a owl:ObjectProperty)
+   (gnt:hasTargetSeq rdfs:domain gnc:probeset)
+   (gnt:hasHomologeneId a owl:ObjectProperty)
+   (gnt:hasHomologeneId rdfs:domain gnc:probeset)
+   (gnt:hasPubChemId a owl:ObjectProperty)
+   (gnt:hasPubChemId rdfs:domain gnc:probeset)
+   (gnt:hasKeggId a owl:ObjectProperty)
+   (gnt:hasKeggId rdfs:domain gnc:probeset)
+   (gnt:hasOmimId a owl:ObjectProperty)
+   (gnt:hasOmimId rdfs:domain gnc:probeset)
+   (gnt:hasChebiId a owl:ObjectProperty)
+   (gnt:hasChebiId rdfs:domain gnc:probeset))
+  (triples
+      (let ((id (field ("IF(NULLIF(TRIM(ProbeSet.Name), '') IS NULL, '', TRIM(ProbeSet.Name))"
+                        ProbeSetIdName)))
+            (probeset-id (field ProbeSet Id)))
+        (string->identifier
+         "probeset"
+         (if (string-null? id)
+             (number->string probeset-id)
+             (regexp-substitute/global
+              #f "[^A-Za-z0-9:]"
+              id
+              'pre "_" 'post))))
+    (set rdf:type 'gnc:probeset)
+    (set rdfs:label (field ProbeSet Name))
+    (set skos:altLabel
+         (replace-substrings
+          (field ProbeSet alias)
+          '(("\r\n" . "; "))))
+    (set gnt:hasChip
+         (string->identifier
+          "platform"
+          (field ("IFNULL(GeneChip.Name, '')" GeneChipName))))
+    (set gnt:hasTargetId
+         (field ("NULLIF(TRIM(ProbeSet.TargetId), '')"
+                 TargetId)))
+    (set gnt:symbol (field ProbeSet Symbol))
+    (set dct:description (sanitize-rdf-string (field ProbeSet description)))
+    (set gnt:targetsRegion
+         (sanitize-rdf-string
+          (field ("NULLIF(TRIM(ProbeSet.Probe_set_target_region), '')"
+                  Probe_set_target_region))))
+    (set gnt:chr (field ProbeSet Chr))
+    (set gnt:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
+    (set gnt:mbMm8 (annotate-field (field ("IFNULL(ProbeSet.Mb_mm8, '')" Mb_mm8))
+                                   '^^xsd:double))
+    (set gnt:mb2016
+         (annotate-field (field ("IFNULL(ProbeSet.Mb_2016, '')" Mb_2016))
+                         '^^xsd:double))
+    (set gnt:hasSpecificity
+         (field ("IFNULL(ProbeSet.Probe_set_specificity, '')"
+                 Probe_set_specificity)))
+    (set gnt:hasBlatScore
+         (field ("IFNULL(ProbeSet.Probe_set_BLAT_score, '')"
+                 Probe_set_BLAT_score)))
+    (set gnt:hasBlatMbStart
+         (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_start, '')"
+                                 Probe_set_Blat_Mb_start))
+                         '^^xsd:double))
+    (set gnt:hasBlatMbStart2016
+         (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_start_2016, '')"
+                                 Probe_set_Blat_Mb_start_2016))
+                         '^^xsd:double))
+    (set gnt:hasBlatMbEnd
+         (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_end, '')"
+                                 Probe_set_Blat_Mb_end))
+                         '^^xsd:double))
+    (set gnt:hasBlatMbEnd2016           ; reads the *end* column; NOTE(review): confirm the column name against the schema
+         (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_end_2016, '')"
+                                 Probe_set_Blat_Mb_end_2016))
+                         '^^xsd:double))
+    (set gnt:hasBlatSeq (sanitize-rdf-string (field ProbeSet BlatSeq)))
+    (set gnt:hasTargetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
+    (set gnt:hasHomologeneId (ontology 'homologene:
+                                       (field ("IFNULL(ProbeSet.HomoloGeneID, '')"
+                                               HomoloGeneID))))
+    (set gnt:hasUniprotId (ontology 'uniprot: ; NOTE(review): gnt:hasUniprotId has no schema-triples entry above
+                                    (field ("IFNULL(ProbeSet.UniProtID, '')"
+                                            UniProtID))))
+    (set gnt:hasPubChemId (ontology
+                           'pubchem:
+                           (field ("IFNULL(ProbeSet.PubChem_ID, '')"
+                                   PubChem_ID))))
+    (set gnt:hasKeggId (ontology
+                        'kegg:
+                        (field ("IFNULL(ProbeSet.KEGG_ID, '')"
+                                KEGG_ID))))
+    (set gnt:hasOmimId (ontology
+                        'omim:
+                        (let ((omim (field ("IFNULL(ProbeSet.OMIM, '')"
+                                            OMIM))))
+                          (if (number? omim)
+                              omim
+                              (regexp-substitute/global ; strip every non-digit character
+                               #f "[^0-9]"
+                               omim
+                               'pre "" 'post)))))
+    (set gnt:hasChebiId (ontology
+                         'chebi:
+                         (field ("IFNULL(ProbeSet.ChEBI_ID, '')"
+                                 ChEBI_ID))))))
+
+
+
+;; Entry point: runs the probeset transformer with the outputs listed below.
+(with-documentation
+ (name "ProbeSet Metadata")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ (prefixes
+  '(("gn:" "")
+    ("probeset:" "")
+    ("gnc:" "")
+    ("gnt:" "")
+    ("rdf:" "")
+    ("kegg:" "")
+    ("pubchem:" "")
+    ("omim:" "")
+    ("rdfs:" "")
+    ("uniprot:" "")
+    ("chebi:" "")
+    ("dct:" "")
+    ("owl:" "")
+    ("homologene:" "")
+    ("xsd:" "")
+    ("skos:" "")))
+ (inputs
+  (list probeset))
+ (outputs
+  '(#:documentation "./docs/probeset.md"
+    #:rdf "./verified-data/probeset.ttl")))
diff --git a/examples/publication.scm b/examples/publication.scm
new file mode 100755
index 0000000..313ee96
--- /dev/null
+++ b/examples/publication.scm
@@ -0,0 +1,81 @@
+#! 
/usr/bin/env guile
+!#
+;;; publication.scm -- describe rows of the Publication table as RDF triples.
+(use-modules (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 regex)
+             (dump strings)
+             (dump sql)
+             (dump triples)
+             (dump special-forms))
+
+
+;; Connection settings: a single datum read from the file named by the first command-line argument.
+(define %connection-settings
+  (call-with-input-file (list-ref (command-line) 1)
+    read))
+
+
+;; Subject is the pubmed: resource when PubMed_ID is set, otherwise an "unpublished" identifier.
+(define-transformer publication
+  (tables (Publication))
+  (triples
+      (let ((pmid (field
+                   ("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))"
+                    pmid)))
+            (publication-id (field Publication Id)))
+        (if (string-null? pmid)
+            (string->identifier "unpublished"
+                                (number->string publication-id))
+            (ontology 'pubmed: pmid)))
+    (set rdf:type 'fabio:ResearchPaper)
+    (set fabio:hasPubMedId
+         (ontology 'pubmed: (field ("IFNULL(PubMed_ID, '')" pubmedId))))
+    (set dct:title (delete-substrings (field Publication Title)
+                                      "Unknown"))
+    (set fabio:Journal (delete-substrings (field Publication Journal)
+                                          "Unknown"))
+    (set prism:volume (delete-substrings (field Publication Volume)
+                                         "Unknown"))
+    (set fabio:page (delete-substrings (field Publication Pages)
+                                       "Unknown"))
+    (set prism:publicationDate (annotate-field
+                                (delete-substrings (field Publication Month)
+                                                   "Unknown")
+                                '^^xsd:gMonth))
+    (set fabio:hasPublicationYear
+         (annotate-field
+          (field
+           ("IF(Publication.Year = 0, NULL, Publication.Year)" Year))
+          '^^xsd:gYear))
+    (multiset dct:creator
+              ;; The authors field is a comma
+              ;; separated list. Split it.
+              (map string-trim (string-split (sanitize-rdf-string (field Publication Authors)) #\,)))
+    (set dct:abstract
+         (sanitize-rdf-string
+          (field Publication Abstract)))))
+
+
+;; Entry point: runs the publication transformer with the outputs listed below.
+(with-documentation
+ (name "Publications Metadata")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ (prefixes
+  '(("gnt:" "")
+    ("fabio:" "")
+    ("dct:" "")
+    ("prism:" "")
+    ("gn:" "")
+    ("gnc:" "")
+    ("pubmed:" "")
+    ("rdfs:" "")
+    ("xsd:" "")
+    ("rdf:" "")))
+ (inputs
+  (list publication))
+ (outputs
+  '(#:documentation "./docs/publication.md"
+    #:rdf "./verified-data/publication.ttl")))
diff --git a/examples/species-metadata.scm b/examples/species-metadata.scm
new file mode 100755
index 0000000..f3794b8
--- /dev/null
+++ b/examples/species-metadata.scm
@@ -0,0 +1,226 @@
+#! /usr/bin/env guile
+!#
+;;; species-metadata.scm -- describe species, strains, mapping methods, sets and averaging methods as RDF triples.
+(use-modules (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 regex)
+             (dump strings)
+             (dump sql)
+             (dump triples)
+             (dump special-forms))
+
+
+;; Connection settings: a single datum read from the file named by the first command-line argument.
+(define %connection-settings
+  (call-with-input-file (list-ref (command-line) 1)
+    read))
+
+
+
+(define (remap-species-identifiers str)
+  "This procedure remaps identifiers to standard binominal. Obviously this should
+  be sorted by correcting the database!"
+  (match str
+    ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"]
+    ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"]
+    ["Macaca mulatta" "Macaca nemestrina"] ; NOTE(review): maps one species name onto another -- confirm this is intended
+    ["Bat (Glossophaga soricina)" "Glossophaga soricina"]
+    [str str]))                         ; anything else is returned unchanged
+
+(define-transformer species             ; one subject per Species row, named after the remapped full name
+  (tables (Species))
+  (schema-triples
+   (gnc:species a skos:Concept)
+   (gnc:species skos:definition "This is a set of controlled terms that are used to describe a given species")
+   (gnc:species skos:broader gnc:family)
+   (gnt:binomialName a owl:ObjectProperty)
+   (gnt:binomialName rdfs:domain gnc:species)
+   (gnt:family a owl:ObjectProperty)
+   (gnt:family rdfs:domain gnc:species)
+   (gnt:family skos:definition "This resource belongs to this family")
+   (gnt:organism a owl:ObjectProperty)
+   (gnt:organism rdfs:domain gnc:species)
+   (gnt:shortName a owl:ObjectProperty)
+   (gnt:shortName rdfs:domain gnc:species))
+  (triples
+      (string->identifier "" (remap-species-identifiers (field Species Fullname))
+                          #:separator ""
+                          #:proc string-capitalize-first)
+    (set rdf:type 'gnc:species)
+    (set skos:label (field Species SpeciesName)) ; NOTE(review): SKOS defines prefLabel/altLabel/hiddenLabel, not label -- confirm
+    (set skos:altLabel (field Species Name))
+    (set rdfs:label (field Species MenuName))
+    (set gnt:binomialName (field Species FullName))
+    (set gnt:family (field Species Family))
+    (set gnt:organism (ontology 'taxon: (field Species TaxonomyId)))))
+
+#!
+
+The ProbeData table contains StrainID.
+
+MariaDB [db_webqtl]> select * from ProbeData limit 2;
++--------+----------+---------+
+| Id     | StrainId | value   |
++--------+----------+---------+
+| 503636 |       42 | 11.6906 |
+| 503636 |       43 | 11.4205 |
++--------+----------+---------+
+
+Likewise
+
+MariaDB [db_webqtl]> select * from ProbeSetData limit 2;
++----+----------+-------+
+| Id | StrainId | value |
++----+----------+-------+
+|  1 |        1 | 5.742 |
+|  1 |        2 | 5.006 |
++----+----------+-------+
+
+To get at the strain use
+
+MariaDB [db_webqtl]> select * from Strain where Id=1 limit 15;
++----+--------+--------+-----------+--------+-------+
+| Id | Name   | Name2  | SpeciesId | Symbol | Alias |
++----+--------+--------+-----------+--------+-------+
+|  1 | B6D2F1 | B6D2F1 |         1 | NULL   | NULL  |
++----+--------+--------+-----------+--------+-------+
+
+A typical query may look like
+
+SELECT Strain.Name, Strain.Id FROM Strain, Species
+WHERE Strain.Name IN f{create_in_clause(self.samplelist)}
+AND Strain.SpeciesId=Species.Id
+AND Species.name = %s, (self.group.species,)
+
+At this point it is not very clear how Name, Name2, Symbol and Alias are used.
+
+!#
+
+(define-transformer strain              ; one subject per Strain row, named after the sanitised Strain.Name
+  (tables (Strain
+           (left-join Species "ON Strain.SpeciesId = Species.SpeciesId")))
+  (schema-triples
+   (gnc:strain skos:broader gnc:species)
+   (gnt:belongsToSpecies rdfs:domain gnc:strain)
+   (gnt:belongsToSpecies a owl:ObjectProperty)
+   (gnt:belongsToSpecies skos:definition
+                         "This resource belongs to this species")
+   (gnt:alias rdfs:domain gnc:strain)
+   (gnt:alias a owl:ObjectProperty)
+   (gnt:symbol rdfs:domain gnc:strain)
+   (gnt:symbol a owl:ObjectProperty))
+  (triples (string->identifier
+            ""
+            (regexp-substitute/global
+             #f "[^A-Za-z0-9:]"
+             (field Strain Name)
+             'pre "_" 'post)
+            #:separator ""
+            #:proc string-capitalize-first)
+    (set rdf:type 'gnc:strain)
+    (set gnt:belongsToSpecies
+         (string->identifier "" (remap-species-identifiers (field Species Fullname))
+                             #:separator ""
+                             #:proc string-capitalize-first))
+    ;; Name, and maybe a second name
+    (set rdfs:label (sanitize-rdf-string (field Strain Name)))
+    (set rdfs:label (sanitize-rdf-string (field ("IF ((Strain.Name2 != Strain.Name), Strain.Name2, '')" Name2))))
+    (set gnt:alias (sanitize-rdf-string (field ("IF ((Strain.Alias != Strain.Name), Strain.Alias, '')" Alias))))
+    (set gnt:symbol (field ("IF ((Strain.Symbol != Strain.Name), Strain.Symbol, '')" Symbol)))))
+
+(define-transformer mapping-method      ; one subject per MappingMethod row
+  (tables (MappingMethod))
+  (schema-triples
+   (gnc:mappingMethod a skos:Concept)
+   (gnc:mappingMethod skos:definition "Terms that decribe mapping/normalization methods used in GeneNetwork"))
+  (triples
+      (string->identifier "mappingMethod" (field MappingMethod Name))
+    (set rdf:type 'gnc:mappingMethod)
+    (set rdfs:label (field MappingMethod Name))))
+
+;; gnt:genotype is derived from GenoFreeze and gnt:phenotype from PublishFreeze.
+(define-transformer inbred-set
+  (tables (InbredSet
+           (left-join Species "ON InbredSet.SpeciesId=Species.Id")
+           (left-join MappingMethod
+                      "ON InbredSet.MappingMethodId=MappingMethod.Id")))
+  (schema-triples
+   (gnc:set skos:broader gnc:species)
+   (gnc:set skos:definition "A set of terms used to describe an set, which can be inbredSet, outbredSet etc etc.")
+   (gnt:geneticType a owl:ObjectProperty)
+   (gnt:geneticType rdfs:domain gnc:set)
+   (gnt:code a owl:ObjectProperty)
+   (gnt:code rdfs:domain gnc:set)
+   ;; Already defined as an owl prop in species
+   (gnt:family rdfs:domain gnc:set)
+   (gnt:phenotype a owl:ObjectProperty)
+   (gnt:phenotype rdfs:domain gnc:set)
+   (gnt:genotype a owl:ObjectProperty)
+   (gnt:genotype rdfs:domain gnc:set)
+   (gnt:mappingMethod a owl:ObjectProperty)
+   (gnt:mappingMethod rdfs:domain gnc:set))
+  (triples (string->identifier
+            "set" (field InbredSet Name)
+            #:separator ""
+            #:proc string-capitalize-first)
+    (set rdf:type 'gnc:set)
+    (set rdfs:label (field InbredSet FullName))
+    (set skos:altLabel (field InbredSet Name))
+    (set gnt:geneticType (field InbredSet GeneticType))
+    (set gnt:family (field InbredSet Family))
+    (set gnt:mappingMethod (field MappingMethod Name))
+    (set gnt:code (field InbredSet InbredSetCode))
+    (set gnt:belongsToSpecies
+         (string->identifier "" (remap-species-identifiers (field Species Fullname))
+                             #:separator ""
+                             #:proc string-capitalize-first))
+    (set gnt:genotype
+         (field ("IF ((SELECT GenoFreeze.Name FROM GenoFreeze WHERE GenoFreeze.InbredSetId = InbredSet.Id LIMIT 1) IS NOT NULL, 'DNA Markers and SNPs', '')" genotypeP)))
+    (set gnt:phenotype
+         (field ("IF ((SELECT PublishFreeze.Name FROM PublishFreeze WHERE PublishFreeze.InbredSetId = InbredSet.Id LIMIT 1) IS NOT NULL, 'Traits and Cofactors', '')" phenotypeP)))
+    (multiset gnt:hasTissue             ; NOTE(review): the subquery lists InbredSet in its own FROM, so it may not be tied to the outer row -- confirm
+              (map
+               (lambda (x)
+                 (string->identifier "tissue"
+                                     x))
+               (string-split-substring
+                (field ("(SELECT GROUP_CONCAT(DISTINCT Tissue.Short_Name SEPARATOR'||') AS MolecularTraits FROM ProbeFreeze, ProbeSetFreeze, InbredSet, Tissue, Species WHERE ProbeFreeze.TissueId = Tissue.Id AND ProbeFreeze.InbredSetId = InbredSet.Id AND ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id ORDER BY Tissue.Name)"
+                        molecularTrait))
+                "||")))))
+
+(define-transformer avg-method
+  ;; The Name and Normalization fields seem to be the same. Name is used
+  ;; for the identifier and Normalization for the label.
+  (tables (AvgMethod))
+  (schema-triples
+   (gnc:avgMethod rdf:type owl:Class))
+  (triples (string->identifier "avgmethod" (field AvgMethod Name))
+    (set rdf:type 'gnc:avgMethod)
+    (set rdfs:label (field AvgMethod Normalization))))
+
+
+;; Entry point: runs all five transformers defined above with the outputs listed below.
+(with-documentation
+ (name "Species Metadata")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ (prefixes
+  '(("gn:" "")
+    ("gnc:" "")
+    ("owl:" "")
+    ("gnt:" "")
+    ("skos:" "")
+    ("rdf:" "")
+    ("rdfs:" "")
+    ("taxon:" "")))
+ (inputs
+  (list
+   inbred-set
+   species
+   strain
+   mapping-method
+   avg-method))
+ (outputs
+  '(#:documentation "./docs/species-metadata.md"
+    #:rdf "/export/data/genenetwork-virtuoso/species-metadata.ttl")))
diff --git a/examples/tissue.scm b/examples/tissue.scm
new file mode 100755
index 0000000..8ce96c8
--- /dev/null
+++ b/examples/tissue.scm
@@ -0,0 +1,50 @@
+#! /usr/bin/env guile
+!#
+;;; tissue.scm -- describe rows of the Tissue table as RDF triples.
+(use-modules (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 regex)
+             (dump strings)
+             (dump sql)
+             (dump triples)
+             (dump special-forms))
+
+
+;; Connection settings: a single datum read from the file named by the first command-line argument.
+(define %connection-settings
+  (call-with-input-file (list-ref (command-line) 1)
+    read))
+
+
+
+(define-transformer tissue
+  ;; The Name and TissueName fields seem to be identical. BIRN_lex_ID
+  ;; and BIRN_lex_Name are mostly NULL.
+  (tables (Tissue))
+  (schema-triples
+   (gnc:tissue a skos:Concept))
+  ;; Hopefully the Short_Name field is distinct and can be used as an
+  ;; identifier.
+  (triples (string->identifier "tissue" (field Tissue Short_Name))
+    (set rdf:type 'gnc:tissue)
+    (set rdfs:label (field Tissue Name))))
+
+
+;; Entry point: runs the tissue transformer with the outputs listed below.
+(with-documentation
+ (name "Tissue Metadata")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ (prefixes
+  '(("gn:" "")
+    ("gnt:" "")
+    ("skos:" "")
+    ("gnc:" "")
+    ("rdf:" "")
+    ("rdfs:" "")))
+ (inputs
+  (list tissue))
+ (outputs
+  '(#:documentation "./docs/tissue.md"
+    #:rdf "./verified-data/tissue.ttl")))
-- 
cgit v1.2.3