aboutsummaryrefslogtreecommitdiff
path: root/examples/dataset-metadata.scm
diff options
context:
space:
mode:
authorMunyoki Kilyungi2023-08-21 14:54:21 +0300
committerMunyoki Kilyungi2023-08-21 14:56:57 +0300
commit51b3c0548c98e0bc05e11a89cbf6b75d31b9f8d5 (patch)
treeab3d7c6f589ed8480f0a9d451566681bcfd8eaaf /examples/dataset-metadata.scm
parent849874fdfe11003f05abe5f82efde974a8c8a388 (diff)
downloadgn-transform-databases-51b3c0548c98e0bc05e11a89cbf6b75d31b9f8d5.tar.gz
Remove "dump-" prefix
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
Diffstat (limited to 'examples/dataset-metadata.scm')
-rwxr-xr-xexamples/dataset-metadata.scm387
1 files changed, 387 insertions, 0 deletions
diff --git a/examples/dataset-metadata.scm b/examples/dataset-metadata.scm
new file mode 100755
index 0000000..5680a2b
--- /dev/null
+++ b/examples/dataset-metadata.scm
@@ -0,0 +1,387 @@
+#! /usr/bin/env guile
+!#
+
+(use-modules (srfi srfi-1)
+ (srfi srfi-26)
+ (ice-9 match)
+ (ice-9 regex)
+ (dump strings)
+ (dump sql)
+ (dump triples)
+ (dump special-forms))
+
+
+
+;; Database connection settings, read as a single Scheme datum from
+;; the file named by the first command-line argument.
+(define %connection-settings
+ (match (command-line)
+ ;; Element 0 is skipped; element 1 is the settings file path.
+ ((_ settings-file . _)
+ (call-with-input-file settings-file read))
+ ;; Fail with an actionable message instead of an opaque list-ref
+ ;; out-of-range error when the argument is missing.
+ (_
+ (error "Usage: dataset-metadata.scm CONNECTION-SETTINGS-FILE"))))
+
+
+
+;; Return EMAIL with every space character removed. One email ID in
+;; the Investigators table has spaces in it; this normalizes it.
+(define (fix-email-id email)
+ (string-filter (lambda (char) (not (char=? char #\space)))
+ email))
+
+(define (investigator-attributes->id first-name last-name email)
+ ;; The identifier is built from the first name, the last name and
+ ;; the space-stripped email ID, joined with underscores, rather than
+ ;; from the email ID alone: the single record for "Evan Williams"
+ ;; has no email ID. It would be preferable to find Evan Williams'
+ ;; email ID and insert it into the database.
+ (let ((cleaned-email (fix-email-id email)))
+ (string->identifier
+ "investigator"
+ (string-append first-name "_" last-name "_" cleaned-email))))
+
+;; Transformer over the Investigators table: describes each selected
+;; row as a foaf:Person with name, homepage and vCard (v:) address
+;; properties.
+(define-transformer investigators
+ ;; There are a few duplicate entries. We group by email to
+ ;; deduplicate.
+ (tables (Investigators)
+ "GROUP BY Email")
+ ;; Subject: identifier derived from first name, last name and email
+ ;; (see investigator-attributes->id).
+ (triples (investigator-attributes->id (field Investigators FirstName)
+ (field Investigators LastName)
+ (field Investigators Email))
+ (set rdf:type 'foaf:Person)
+ ;; Full name is "FirstName LastName", separated by a single space.
+ (set foaf:name (string-append (field Investigators FirstName) " "
+ (field Investigators LastName)))
+ (set foaf:givenName
+ (field Investigators FirstName))
+ (set foaf:familyName
+ (field Investigators LastName))
+ (set foaf:homepage (field Investigators Url))
+ ;; Postal address parts, one column per property.
+ (set v:adr (field Investigators Address))
+ (set v:locality (field Investigators City))
+ (set v:region (field Investigators State))
+ (set v:postal-code (field Investigators ZipCode))
+ (set v:country-name (field Investigators Country))))
+
+;; Transformer over the GeneChip table, left-joined with Species:
+;; emits resources typed gnc:geneChip whose identifier is built from
+;; GeneChip.Name with the "platform" prefix.
+(define-transformer gene-chip
+ (tables (GeneChip
+ (left-join Species "USING (SpeciesId)")))
+ (schema-triples
+ (gnc:geneChip a skos:Concept)
+ (gnc:geneChip
+ skos:description
+ "This is a set of controlled terms that are used to describe a given gene chip/platform")
+ ;; NOTE(review): gnt:hasGeoSeriesId is given two domains in this
+ ;; block -- gnc:platform here and gnc:geneChip further down -- while
+ ;; the resources emitted below are only typed gnc:geneChip. Confirm
+ ;; that gnc:platform is still intended.
+ (gnt:hasGeoSeriesId rdfs:domain gnc:platform)
+ (gnt:belongsToSpecies a owl:ObjectProperty)
+ (gnt:belongsToSpecies skos:definition "This resource belongs to this given species")
+ (gnt:belongsToSpecies rdfs:domain gnc:geneChip)
+ (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip)
+ (gnt:hasGOTreeValue a owl:ObjectProperty)
+ ;; The definition text previously lacked its verb ("This resource
+ ;; the following ...").
+ (gnt:hasGOTreeValue skos:definition "This resource has the following GO tree value")
+ (gnt:hasGOTreeValue rdfs:domain gnc:geneChip))
+ (triples (string->identifier "platform" (field GeneChip Name))
+ (set rdf:type 'gnc:geneChip)
+ (set rdfs:label (field GeneChip GeneChipName))
+ (set skos:prefLabel (field GeneChip Name))
+ ;; Title is only selected as an alternative label when it differs
+ ;; from GeneChipName; otherwise the SQL expression yields NULL.
+ (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)"
+ Title)))
+ (set gnt:hasGOTreeValue (field GeneChip Go_tree_value))
+ ;; Species identifier: remapped full name, no prefix, no separator,
+ ;; passed through string-capitalize-first.
+ (set gnt:belongsToSpecies
+ (string->identifier "" (remap-species-identifiers (field Species Fullname))
+ #:separator ""
+ #:proc string-capitalize-first))
+ ;; GeoPlatform is trimmed of surrounding whitespace before being
+ ;; placed under the geoSeries: prefix.
+ (set gnt:hasGeoSeriesId
+ (ontology 'geoSeries:
+ (string-trim-both (field GeneChip GeoPlatform))))))
+
+;; Transformer over InfoFiles, left-joined with the freeze tables and
+;; the Datasets metadata tables: emits one dataset description per
+;; selected row, together with the schema triples for the gnt:/gnc:
+;; terms used to describe datasets.
+(define-transformer info-files
+ (tables (InfoFiles
+ (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
+ (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
+ (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
+ (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId")
+ (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId")
+ (left-join Datasets "USING (DatasetId)")
+ (left-join DatasetStatus "USING (DatasetStatusId)")
+ (left-join Tissue "USING (TissueId)")
+ (left-join Investigators "USING (InvestigatorId)")
+ (left-join AvgMethod "USING (AvgMethodId)")
+ (left-join Organizations "USING (OrganizationId)")
+ (left-join GeneChip "USING (GeneChipId)"))
+ ;; Only rows that carry an accession id are selected.
+ "WHERE GN_AccesionId IS NOT NULL")
+ (schema-triples
+ (gnc:dataset rdf:type gdmt:Dataset)
+ (gnc:genotypeDataset rdfs:subClassOf gnc:dataset)
+ (gnc:phenotypeDataset rdfs:subClassOf gnc:dataset)
+ (gnc:probesetDataset rdfs:subClassOf gnc:dataset)
+ (gnt:belongsToSet rdfs:domain gnc:dataset)
+ (gnt:belongsToSet a owl:ObjectProperty)
+ (gnt:belongsToSet skos:definition "The InbredSet this resource belongs to")
+ (gnt:hasTissue rdfs:domain gnc:dataset)
+ (gnt:hasTissue a owl:ObjectProperty)
+ (gnt:hasTissue skos:definition "Tissues this resource has")
+ (gnt:hasTissueInfo rdfs:domain gnc:dataset)
+ (gnt:hasTissueInfo a owl:ObjectProperty)
+ (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource")
+ (gnt:usesNormalization rdfs:domain gnc:dataset)
+ (gnt:usesNormalization a owl:ObjectProperty)
+ (gnt:usesNormalization skos:definition "Normalization techniques this resource has")
+ (gnt:usesPlatform rdfs:domain gnc:dataset)
+ (gnt:usesPlatform a owl:ObjectProperty)
+ (gnt:usesPlatform skos:definition "The Platform this resource uses")
+ (gnt:hasGeoSeriesId rdfs:domain gnc:dataset)
+ (gnt:hasGeoSeriesId a owl:ObjectProperty)
+ (gnt:hasGeoSeriesId skos:definition "id of record in NCBI database")
+ (gnt:hasExperimentDesignInfo rdfs:domain gnc:dataset)
+ (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design")
+ (gnt:hasExperimentDesignInfo a owl:ObjectProperty)
+ (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed")
+ (gnt:hasNotes rdfs:domain gnc:dataset)
+ (gnt:hasNotes a owl:ObjectProperty)
+ (gnt:hasNotes rdfs:label "Notes")
+ (gnt:hasNotes skos:definition "Extra Notes about this dataset")
+ (gnt:hasDataProcessingInfo rdfs:domain gnc:dataset)
+ (gnt:hasDataProcessingInfo rdfs:label "About Data Processing")
+ (gnt:hasDataProcessingInfo a owl:ObjectProperty)
+ (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed")
+ (gnt:hasPlatformInfo rdfs:domain gnc:dataset)
+ (gnt:hasPlatformInfo a owl:ObjectProperty)
+ ;; Label text previously read "About Platfoorm".
+ (gnt:hasPlatformInfo rdfs:label "About Platform")
+ (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset")
+ (gnt:hasCaseInfo rdfs:domain gnc:dataset)
+ (gnt:hasCaseInfo rdfs:label "About Case")
+ (gnt:hasCaseInfo a owl:ObjectProperty)
+ (gnt:hasCaseInfo skos:definition "Information about the cases used in this platform")
+ (gnt:hasAcknowledgement rdfs:domain gnc:dataset)
+ (gnt:hasAcknowledgement rdfs:label "Acknowledgement")
+ (gnt:hasAcknowledgement a owl:ObjectProperty)
+ (gnt:hasAcknowledgement skos:definition "People to acknowledge"))
+ ;; Subject: InfoPageName with every character outside [A-Za-z0-9:]
+ ;; replaced by an underscore, no prefix and no separator, passed
+ ;; through string-capitalize-first.
+ (triples (string->identifier
+ "" (regexp-substitute/global #f "[^A-Za-z0-9:]"
+ (field InfoFiles InfoPageName)
+ 'pre "_" 'post)
+ #:separator ""
+ #:proc string-capitalize-first)
+ ;; The class is chosen in SQL from whichever freeze table matched:
+ ;; GenoFreeze first, then PublishFreeze, then ProbeSetFreeze, and
+ ;; plain gnc:dataset when none did.
+ (set rdf:type (string->symbol
+ (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotypeDataset', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:probesetDataset', 'gnc:dataset')))"
+ rdfType))))
+ ;; A value that is exactly "None"/"none" is replaced by the empty
+ ;; string; the same pattern is applied to several fields below.
+ (set rdfs:label (regexp-substitute/global
+ #f "^[Nn]one$"
+ (field InfoFiles InfoPageName)
+ ""))
+ ;; NOTE(review): skos:prefLabel is set twice, here and immediately
+ ;; below -- confirm that both values are meant to share this
+ ;; predicate.
+ (set skos:prefLabel
+ (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
+ DatasetFullName)))
+ (set skos:prefLabel (field Datasets DatasetName DatasetGroup))
+ (set gdmt:hasTitleInfo
+ (regexp-substitute/global
+ #f "^[Nn]one$"
+ (field InfoFiles InfoFileTitle)
+ ""))
+ ;; This is the published title
+ (set dct:title
+ (regexp-substitute/global
+ #f "^[Nn]one$"
+ (field Datasets PublicationTitle)
+ ""))
+ ;; First non-NULL CreateTime among GenoFreeze, PublishFreeze and
+ ;; ProbeSetFreeze, or the empty string.
+ (set dct:created
+ (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
+ createTimeGenoFreeze)))
+ ;; Same identifier construction as the investigators transformer
+ ;; uses for its subjects.
+ (set gdmt:hasCreatorInfo
+ (investigator-attributes->id (field Investigators FirstName)
+ (field Investigators LastName)
+ (field Investigators Email)))
+ (set gdmt:hasCreatorAffiliation
+ (field Organizations OrganizationName))
+ ;; Accession id rendered with a "GN" prefix.
+ (set gdmt:hasDatasetIdentifierSubType (format #f "GN~a" (field InfoFiles GN_AccesionId)))
+ (set gdmt:hasRightsInfo (string-downcase
+ (field DatasetStatus DatasetStatusName)))
+ (set gnt:belongsToSet
+ (string->identifier
+ "set" (field InbredSet Name)
+ #:separator ""
+ #:proc string-capitalize-first))
+ (set gnt:hasTissue (string->identifier "tissue"
+ (field Tissue Short_Name)))
+ (set gnt:usesNormalization
+ (string->identifier "avgmethod"
+ ;; If AvgMethodName is NULL, assume N/A.
+ (if (string-blank? (field AvgMethod Name AvgMethodName))
+ "N/A" (field AvgMethod Name AvgMethodName))))
+ (set gnt:usesPlatform
+ (string->identifier "platform"
+ (field GeneChip Name GeneChip)))
+ (set gdmt:isDescribedBy
+ (sanitize-rdf-string (field Datasets Summary)))
+ ;; Only the first "GSE<digits>" substring of GeoSeries is kept; when
+ ;; there is no such substring the value is the empty string.
+ (set gnt:hasGeoSeriesId
+ (let ((s
+ (string-match "GSE[0-9]*"
+ (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries)))))
+ (if s (ontology
+ 'geoSeries: (match:substring s))
+ "")))
+ ;; Free-text metadata columns, each passed through
+ ;; sanitize-rdf-string.
+ (set gnt:hasTissueInfo
+ (sanitize-rdf-string (field Datasets AboutTissue)))
+ (set gnt:hasContentInfo (sanitize-rdf-string (field InfoFiles Specifics)))
+ (set gnt:hasCaseInfo
+ (sanitize-rdf-string
+ (field Datasets AboutCases)))
+ (set gnt:hasPlatformInfo
+ (sanitize-rdf-string
+ (field Datasets AboutPlatform)))
+ (set gnt:hasDataProcessingInfo
+ (sanitize-rdf-string
+ (field Datasets AboutDataProcessing)))
+ (set gnt:hasNotes
+ (sanitize-rdf-string
+ (field Datasets Notes)))
+ (set gnt:hasExperimentDesignInfo
+ (sanitize-rdf-string
+ (field Datasets ExperimentDesign)))
+ (set gdmt:hasContributorInfo
+ (sanitize-rdf-string
+ (field Datasets Contributors)))
+ (set gdmt:IsCitedBy
+ (sanitize-rdf-string
+ (regexp-substitute/global
+ #f "^[Nn]one$"
+ (field Datasets Citation)
+ "")))
+ ;; Acknowledgements come from two columns:
+ ;; InfoFiles.Data_Source_Acknowledge and Datasets.Acknowledgment.
+ (set gnt:hasAcknowledgement
+ (sanitize-rdf-string
+ (string-trim-both
+ (regexp-substitute/global
+ #f "^[Nn]one$"
+ (field InfoFiles Data_Source_Acknowledge)
+ ""))))
+ (set gnt:hasAcknowledgement (sanitize-rdf-string
+ (field Datasets Acknowledgment)))))
+
+;; These are phenotype datasets that don't have Infofile metadata
+(define-transformer publishfreeze
+ (tables (PublishFreeze
+ (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")
+ (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
+ ;; Keep rows with public > 0 and confidentiality < 1 for which the
+ ;; left join found no InfoFiles row.
+ "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")
+ (triples
+ ;; Subject: PublishFreeze.Name with every character outside
+ ;; [A-Za-z0-9:] replaced by an underscore, no prefix and no
+ ;; separator, passed through string-capitalize-first.
+ (string->identifier
+ ""
+ (regexp-substitute/global #f "[^A-Za-z0-9:]"
+ (field PublishFreeze Name)
+ 'pre "_" 'post)
+ #:separator ""
+ #:proc string-capitalize-first)
+ (set rdf:type 'gnc:phenotypeDataset)
+ (set rdfs:label (field PublishFreeze Name))
+ (set skos:prefLabel (field PublishFreeze FullName))
+ (set skos:altLabel (field PublishFreeze ShortName))
+ ;; CreateTime is annotated as an xsd:date literal.
+ (set dct:created (annotate-field
+ (field PublishFreeze CreateTime)
+ '^^xsd:date))
+ ;; InbredSet identifier: "set" prefix, no separator, passed through
+ ;; string-capitalize-first.
+ (set gnt:belongsToSet
+ (string->identifier
+ "set" (field InbredSet Name)
+ #:separator ""
+ #:proc string-capitalize-first))))
+
+;; Transformer over GenoFreeze rows that have no matching InfoFiles
+;; row: emits resources typed gnc:genotypeDataset.
+(define-transformer genofreeze
+ (tables (GenoFreeze
+ (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name")
+ (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
+ ;; Keep rows with public > 0 and confidentiality < 1 for which the
+ ;; left join found no InfoFiles row.
+ "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
+ (triples
+ ;; Subject: GenoFreeze.Name with every character outside
+ ;; [A-Za-z0-9:] replaced by an underscore. One global substitution
+ ;; replaces every match; the former second, nested pass with the
+ ;; same pattern could only turn underscores into underscores, so it
+ ;; was dropped to match the publishfreeze and probesetfreeze
+ ;; transformers.
+ (string->identifier
+ ""
+ (regexp-substitute/global
+ #f "[^A-Za-z0-9:]"
+ (field GenoFreeze Name)
+ 'pre "_" 'post)
+ #:separator ""
+ #:proc string-capitalize-first)
+ (set rdf:type 'gnc:genotypeDataset)
+ (set rdfs:label (field GenoFreeze Name))
+ (set skos:prefLabel (field GenoFreeze FullName))
+ (set skos:altLabel (field GenoFreeze ShortName))
+ ;; CreateTime is annotated as an xsd:date literal.
+ (set dct:created (annotate-field
+ (field GenoFreeze CreateTime)
+ '^^xsd:date))
+ ;; InbredSet identifier: "set" prefix, no separator, passed through
+ ;; string-capitalize-first.
+ (set gnt:belongsToSet
+ (string->identifier
+ "set" (field InbredSet Name)
+ #:separator ""
+ #:proc string-capitalize-first))))
+
+;; Molecular Traits are also referred to as ProbeSets
+(define-transformer probesetfreeze
+ (tables (ProbeSetFreeze
+ (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
+ (left-join ProbeFreeze "USING (ProbeFreezeId)")
+ (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID")
+ (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id")
+ (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId"))
+ ;; Keep public rows for which the left join found no InfoFiles row.
+ ;; NOTE(review): the grouping key is ProbeFreeze.Id, not
+ ;; ProbeSetFreeze.Id -- presumably ProbeSetFreeze rows that share a
+ ;; ProbeFreeze collapse into one; confirm that this is intended.
+ "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id")
+ (schema-triples
+ ;; NOTE(review): the domains below name gnc:probeset while the
+ ;; resources emitted by this transformer are typed
+ ;; gnc:probesetDataset -- confirm which class is meant.
+ (gnt:usesNormalization rdfs:domain gnc:probeset)
+ (gnt:usesDataScale rdfs:domain gnc:probeset)
+ (gnt:usesDataScale a owl:ObjectProperty)
+ ;; Definition text previously began with "Thi".
+ (gnt:usesDataScale skos:definition "The data scale this resource uses"))
+ (triples
+ ;; Subject: ProbeSetFreeze.Name with every character outside
+ ;; [A-Za-z0-9:] replaced by an underscore, no prefix and no
+ ;; separator, passed through string-capitalize-first.
+ (string->identifier
+ ""
+ (regexp-substitute/global
+ #f "[^A-Za-z0-9:]"
+ (field ProbeSetFreeze Name)
+ 'pre "_" 'post)
+ #:separator ""
+ #:proc string-capitalize-first)
+ (set rdf:type 'gnc:probesetDataset)
+ (set gnt:usesNormalization
+ (string->identifier "avgmethod"
+ ;; If AvgMethodName is NULL, assume N/A.
+ (if (string-blank? (field AvgMethod Name AvgMethodName))
+ "N/A" (field AvgMethod Name AvgMethodName))))
+ (set dct:title (field ProbeSetFreeze FullName))
+ (set rdfs:label (field ProbeSetFreeze ShortName))
+ (set skos:prefLabel (field ProbeSetFreeze Name))
+ (set skos:altLabel (field ProbeSetFreeze Name2))
+ ;; NOTE(review): this annotation was '^^xsd:datetime, which is not
+ ;; an XML Schema datatype name (the standard spelling is
+ ;; xsd:dateTime). It now matches the publishfreeze and genofreeze
+ ;; transformers, which annotate their CreateTime as xsd:date --
+ ;; confirm that ProbeSetFreeze.CreateTime holds a plain date.
+ (set dct:created (annotate-field
+ (field ProbeSetFreeze CreateTime)
+ '^^xsd:date))
+ (set gnt:usesDataScale (field ProbeSetFreeze DataScale))
+ (set gnt:hasTissue
+ (string->identifier
+ "tissue"
+ (field Tissue Short_Name)))
+ ;; InbredSet identifier: "set" prefix, no separator, passed through
+ ;; string-capitalize-first.
+ (set gnt:belongsToSet
+ (string->identifier
+ "set" (field InbredSet Name)
+ #:separator ""
+ #:proc string-capitalize-first))))
+
+
+
+;; Entry point: runs the transformers listed under `inputs' against
+;; the database described by %connection-settings, with the prefix
+;; table and output paths given below.
+(with-documentation
+ (name "Info files / Investigators Metadata")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ (prefixes
+ '(("v:" "<http://www.w3.org/2006/vcard/ns#>")
+ ("foaf:" "<http://xmlns.com/foaf/0.1/>")
+ ("gdmt:" "<http://vocab.fairdatacollective.org/gdmt/>")
+ ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
+ ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>")
+ ("gnt:" "<http://genenetwork.org/term/>")
+ ("gn:" "<http://genenetwork.org/id/>")
+ ("gnc:" "<http://genenetwork.org/category/>")
+ ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+ ("owl:" "<http://www.w3.org/2002/07/owl#>")
+ ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
+ ;; The freeze transformers annotate dct:created with ^^xsd:
+ ;; datatypes, so the xsd: prefix is declared here as well.
+ ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
+ ("taxon:" "<http://purl.uniprot.org/taxonomy/>")
+ ("dct:" "<http://purl.org/dc/terms/>")))
+ (inputs
+ (list info-files
+ publishfreeze
+ genofreeze
+ probesetfreeze
+ investigators
+ gene-chip))
+ (outputs
+ '(#:documentation "./docs/info-pages.md"
+ #:rdf "/export/data/genenetwork-virtuoso/info-pages.ttl")))
+