about summary refs log tree commit diff
path: root/examples/dataset-metadata.scm
diff options
context:
space:
mode:
Diffstat (limited to 'examples/dataset-metadata.scm')
-rwxr-xr-x examples/dataset-metadata.scm 387
1 files changed, 387 insertions, 0 deletions
diff --git a/examples/dataset-metadata.scm b/examples/dataset-metadata.scm
new file mode 100755
index 0000000..5680a2b
--- /dev/null
+++ b/examples/dataset-metadata.scm
@@ -0,0 +1,387 @@
+#! /usr/bin/env guile
+!#
+
+(use-modules (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 regex)
+             (dump strings)
+             (dump sql)
+             (dump triples)
+             (dump special-forms))
+
+
+
+;; Connection settings, parsed with `read' as a single Scheme datum
+;; from the file named by the first command-line argument.
+(define %connection-settings
+  (let ((settings-file (list-ref (command-line) 1)))
+    (call-with-input-file settings-file
+      (lambda (port)
+        (read port)))))
+
+
+
+;; Return EMAIL with every space character removed. One email ID in
+;; the Investigators table contains spaces; this cleans it up.
+(define (fix-email-id email)
+  (string-filter (lambda (ch) (not (char=? ch #\space))) email))
+
+(define (investigator-attributes->id first-name last-name email)
+  ;; Build the investigator identifier from FIRST-NAME, LAST-NAME and
+  ;; the space-stripped EMAIL, joined with "_". The names are part of
+  ;; the ID because exactly one record ("Evan Williams") has no email
+  ;; ID; the email alone would not identify that record. It would be
+  ;; preferable to find Evan Williams' email ID and insert it into the
+  ;; database.
+  (let* ((clean-email (fix-email-id email))
+         (id-parts (list first-name last-name clean-email))
+         (joined (string-join id-parts "_")))
+    (string->identifier "investigator" joined)))
+
+(define-transformer investigators  ; describes Investigators rows as foaf:Person resources
+  ;; There are a few duplicate entries. We group by email to
+  ;; deduplicate.
+  (tables (Investigators)
+          "GROUP BY Email")  ; SQL clause string given after the table list
+  (triples (investigator-attributes->id (field Investigators FirstName)  ; subject: ID built from first name, last name and email
+                                        (field Investigators LastName)
+                                        (field Investigators Email))
+    (set rdf:type 'foaf:Person)
+    (set foaf:name (string-append (field Investigators FirstName) " "  ; full name is "FirstName LastName"
+                                  (field Investigators LastName)))
+    (set foaf:givenName
+         (field Investigators FirstName))
+    (set foaf:familyName
+         (field Investigators LastName))
+    (set foaf:homepage (field Investigators Url))
+    (set v:adr (field Investigators Address))  ; the address columns below use the "v:" prefix
+    (set v:locality (field Investigators City))
+    (set v:region (field Investigators State))
+    (set v:postal-code (field Investigators ZipCode))
+    (set v:country-name (field Investigators Country))))
+
+;; Describe each GeneChip row, left-joined with Species on SpeciesId,
+;; as a gnc:geneChip resource.
+(define-transformer gene-chip
+  (tables (GeneChip
+           (left-join Species "USING (SpeciesId)")))
+  (schema-triples
+   (gnc:geneChip a skos:Concept)
+   (gnc:geneChip
+    skos:description
+    "This is a set of controlled terms that are used to describe a given gene chip/platform")
+   ;; NOTE(review): gnt:hasGeoSeriesId gets two rdfs:domain
+   ;; declarations here (gnc:platform and, below, gnc:geneChip), and
+   ;; gnc:platform is not otherwise declared in this file -- confirm
+   ;; that both are intended.
+   (gnt:hasGeoSeriesId rdfs:domain gnc:platform)
+   (gnt:belongsToSpecies a owl:ObjectProperty)
+   (gnt:belongsToSpecies skos:definition "This resource belongs to this given species")
+   (gnt:belongsToSpecies rdfs:domain gnc:geneChip)
+   (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip)
+   (gnt:hasGOTreeValue a owl:ObjectProperty)
+   (gnt:hasGOTreeValue skos:definition "This resource has the following GO tree value")
+   (gnt:hasGOTreeValue rdfs:domain gnc:geneChip))
+  ;; Subject ID: "platform" combined with GeneChip.Name, the same
+  ;; construction info-files uses for gnt:usesPlatform.
+  (triples (string->identifier "platform" (field GeneChip Name))
+    (set rdf:type 'gnc:geneChip)
+    (set rdfs:label (field GeneChip GeneChipName))
+    (set skos:prefLabel (field GeneChip Name))
+    ;; The SQL expression yields Title only when it differs from
+    ;; GeneChipName, and NULL otherwise.
+    (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)"
+                               Title)))
+    (set gnt:hasGOTreeValue (field GeneChip Go_tree_value))
+    (set gnt:belongsToSpecies
+         (string->identifier "" (remap-species-identifiers (field Species Fullname))
+                             #:separator ""
+                             #:proc string-capitalize-first))
+    ;; GeoPlatform is trimmed of surrounding whitespace before being
+    ;; placed under the geoSeries: prefix.
+    (set gnt:hasGeoSeriesId
+         (ontology 'geoSeries:
+                   (string-trim-both (field GeneChip GeoPlatform))))))
+
+;; Describe each InfoFiles row that has a GN accession ID as a
+;; dataset resource. InfoFiles is left-joined to the three freeze
+;; tables by name and to the lookup tables named below.
+(define-transformer info-files
+  (tables (InfoFiles
+           (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
+           (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
+           (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
+           (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId")
+           (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId")
+           (left-join Datasets "USING (DatasetId)")
+           (left-join DatasetStatus "USING (DatasetStatusId)")
+           (left-join Tissue "USING (TissueId)")
+           (left-join Investigators "USING (InvestigatorId)")
+           (left-join AvgMethod "USING (AvgMethodId)")
+           (left-join Organizations "USING (OrganizationId)")
+           (left-join GeneChip "USING (GeneChipId)"))
+          "WHERE GN_AccesionId IS NOT NULL")
+  (schema-triples
+   (gnc:dataset rdf:type gdmt:Dataset)
+   (gnc:genotypeDataset rdfs:subClassOf gnc:dataset)
+   (gnc:phenotypeDataset rdfs:subClassOf gnc:dataset)
+   (gnc:probesetDataset rdfs:subClassOf gnc:dataset)
+   (gnt:belongsToSet rdfs:domain gnc:dataset)
+   (gnt:belongsToSet a owl:ObjectProperty)
+   (gnt:belongsToSet skos:definition "The InbredSet this resource belongs to")
+   (gnt:hasTissue rdfs:domain gnc:dataset)
+   (gnt:hasTissue a owl:ObjectProperty)
+   (gnt:hasTissue skos:definition "Tissues this resource has")
+   (gnt:hasTissueInfo rdfs:domain gnc:dataset)
+   (gnt:hasTissueInfo a owl:ObjectProperty)
+   (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource")
+   (gnt:usesNormalization rdfs:domain gnc:dataset)
+   (gnt:usesNormalization a owl:ObjectProperty)
+   (gnt:usesNormalization skos:definition "Normalization techniques this resource has")
+   (gnt:usesPlatform rdfs:domain gnc:dataset)
+   (gnt:usesPlatform a owl:ObjectProperty)
+   (gnt:usesPlatform skos:definition "The Platform this resource uses")
+   (gnt:hasGeoSeriesId rdfs:domain gnc:dataset)
+   (gnt:hasGeoSeriesId a owl:ObjectProperty)
+   (gnt:hasGeoSeriesId skos:definition "id of record in NCBI database")
+   (gnt:hasExperimentDesignInfo rdfs:domain gnc:dataset)
+   (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design")
+   (gnt:hasExperimentDesignInfo a owl:ObjectProperty)
+   (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed")
+   (gnt:hasNotes rdfs:domain gnc:dataset)
+   (gnt:hasNotes a owl:ObjectProperty)
+   (gnt:hasNotes rdfs:label "Notes")
+   (gnt:hasNotes skos:definition "Extra Notes about this dataset")
+   (gnt:hasDataProcessingInfo rdfs:domain gnc:dataset)
+   (gnt:hasDataProcessingInfo rdfs:label "About Data Processing")
+   (gnt:hasDataProcessingInfo a owl:ObjectProperty)
+   (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed")
+   (gnt:hasPlatformInfo rdfs:domain gnc:dataset)
+   (gnt:hasPlatformInfo a owl:ObjectProperty)
+   (gnt:hasPlatformInfo rdfs:label "About Platform")
+   (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset")
+   (gnt:hasCaseInfo rdfs:domain gnc:dataset)
+   (gnt:hasCaseInfo rdfs:label "About Case")
+   (gnt:hasCaseInfo a owl:ObjectProperty)
+   (gnt:hasCaseInfo skos:definition "Information about the cases used in this platform")
+   (gnt:hasAcknowledgement rdfs:domain gnc:dataset)
+   (gnt:hasAcknowledgement rdfs:label "Acknowledgement")
+   (gnt:hasAcknowledgement a owl:ObjectProperty)
+   (gnt:hasAcknowledgement skos:definition "People to acknowledge"))
+  ;; Subject ID: InfoPageName with every character outside
+  ;; [A-Za-z0-9:] replaced by "_".
+  (triples (string->identifier
+            "" (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                         (field InfoFiles InfoPageName)
+                                         'pre "_" 'post)
+            #:separator ""
+            #:proc string-capitalize-first)
+    ;; The SQL expression picks the class name: genotype if a
+    ;; GenoFreeze row matched, else phenotype if a PublishFreeze row
+    ;; matched, else probeset if a ProbeSetFreeze row matched, else
+    ;; the generic gnc:dataset.
+    (set rdf:type (string->symbol
+                   (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotypeDataset', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:probesetDataset', 'gnc:dataset')))"
+                           rdfType))))
+    ;; Here and below, a value that is exactly "None" or "none" is
+    ;; replaced by the empty string; any other value passes through
+    ;; unchanged.
+    (set rdfs:label (regexp-substitute/global
+                     #f "^[Nn]one$"
+                     (field InfoFiles InfoPageName)
+                     ""))
+    ;; NOTE(review): skos:prefLabel is set twice, once from the freeze
+    ;; tables' FullName and once from Datasets.DatasetName -- confirm
+    ;; that both are intended.
+    (set skos:prefLabel
+         (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
+                 DatasetFullName)))
+    (set skos:prefLabel (field Datasets DatasetName DatasetGroup))
+    (set gdmt:hasTitleInfo
+         (regexp-substitute/global
+          #f "^[Nn]one$"
+          (field InfoFiles InfoFileTitle)
+          ""))
+    ;; This is the published title
+    (set dct:title
+         (regexp-substitute/global
+          #f "^[Nn]one$"
+          (field Datasets PublicationTitle)
+          ""))
+    ;; First non-NULL CreateTime among GenoFreeze, PublishFreeze and
+    ;; ProbeSetFreeze, or the empty string if all are NULL.
+    (set dct:created
+         (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
+                 createTimeGenoFreeze)))
+    ;; Same ID construction as the subject of the investigators
+    ;; transformer.
+    (set gdmt:hasCreatorInfo
+         (investigator-attributes->id (field Investigators FirstName)
+                                      (field Investigators LastName)
+                                      (field Investigators Email)))
+    (set gdmt:hasCreatorAffiliation
+         (field Organizations OrganizationName))
+    (set gdmt:hasDatasetIdentifierSubType (format #f "GN~a" (field InfoFiles GN_AccesionId)))
+    (set gdmt:hasRightsInfo (string-downcase
+                             (field DatasetStatus DatasetStatusName)))
+    (set gnt:belongsToSet
+         (string->identifier
+          "set" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))
+    (set gnt:hasTissue (string->identifier "tissue"
+                                           (field Tissue Short_Name)))
+    (set gnt:usesNormalization
+         (string->identifier "avgmethod"
+                             ;; If AvgMethodName is NULL, assume N/A.
+                             (if (string-blank? (field AvgMethod Name AvgMethodName))
+                                 "N/A" (field AvgMethod Name AvgMethodName))))
+    (set gnt:usesPlatform
+         (string->identifier "platform"
+                             (field GeneChip Name GeneChip)))
+    (set gdmt:isDescribedBy
+         (sanitize-rdf-string (field Datasets Summary)))
+    ;; Take the first "GSE<digits>" substring of Datasets.GeoSeries,
+    ;; if there is one; otherwise the value is the empty string.
+    (set gnt:hasGeoSeriesId
+         (let ((s
+                (string-match "GSE[0-9]*"
+                              (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries)))))
+           (if s (ontology
+                  'geoSeries: (match:substring s))
+               "")))
+    (set gnt:hasTissueInfo
+         (sanitize-rdf-string (field Datasets AboutTissue)))
+    (set gnt:hasContentInfo (sanitize-rdf-string (field InfoFiles Specifics)))
+    (set gnt:hasCaseInfo
+         (sanitize-rdf-string
+          (field Datasets AboutCases)))
+    (set gnt:hasPlatformInfo
+         (sanitize-rdf-string
+          (field Datasets AboutPlatform)))
+    (set gnt:hasDataProcessingInfo
+         (sanitize-rdf-string
+          (field Datasets AboutDataProcessing)))
+    (set gnt:hasNotes
+         (sanitize-rdf-string
+          (field Datasets Notes)))
+    (set gnt:hasExperimentDesignInfo
+         (sanitize-rdf-string
+          (field Datasets ExperimentDesign)))
+    (set gdmt:hasContributorInfo
+         (sanitize-rdf-string
+          (field Datasets Contributors)))
+    (set gdmt:IsCitedBy
+         (sanitize-rdf-string
+          (regexp-substitute/global
+           #f "^[Nn]one$"
+           (field Datasets Citation)
+           "")))
+    ;; NOTE(review): gnt:hasAcknowledgement is set twice, from
+    ;; InfoFiles.Data_Source_Acknowledge and from
+    ;; Datasets.Acknowledgment -- confirm that both are intended.
+    (set gnt:hasAcknowledgement
+         (sanitize-rdf-string
+          (string-trim-both
+           (regexp-substitute/global
+            #f "^[Nn]one$"
+            (field InfoFiles Data_Source_Acknowledge)
+            ""))))
+    (set gnt:hasAcknowledgement (sanitize-rdf-string
+                                 (field Datasets Acknowledgment)))))
+
+;; These are phenotype datasets (PublishFreeze rows) that don't have InfoFiles metadata
+(define-transformer publishfreeze
+  (tables (PublishFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")
+           (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
+          "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")  ; keeps only rows for which the InfoFiles left join found no match
+  (triples
+      (string->identifier  ; subject ID built from the sanitized PublishFreeze.Name
+       ""
+       (regexp-substitute/global #f "[^A-Za-z0-9:]"  ; every character outside [A-Za-z0-9:] becomes "_"
+                                 (field PublishFreeze Name)
+                                 'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:phenotypeDataset)
+    (set rdfs:label (field PublishFreeze Name))
+    (set skos:prefLabel (field PublishFreeze FullName))
+    (set skos:altLabel (field PublishFreeze ShortName))
+    (set dct:created (annotate-field  ; CreateTime is annotated with ^^xsd:date
+                      (field PublishFreeze CreateTime)
+                      '^^xsd:date))
+    (set gnt:belongsToSet
+         (string->identifier  ; same "set" + InbredSet.Name construction as in info-files
+          "set" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))))
+
+;; Genotype datasets (GenoFreeze rows) for which the InfoFiles left
+;; join finds no row with a matching InfoPageName.
+(define-transformer genofreeze
+  (tables (GenoFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name")
+           (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
+          "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
+  (triples
+      ;; Subject ID: GenoFreeze.Name with every character outside
+      ;; [A-Za-z0-9:] replaced by "_". One substitution pass is
+      ;; enough: after it, the only characters the pattern can still
+      ;; match are the "_" just inserted, and replacing "_" with "_"
+      ;; changes nothing.
+      (string->identifier
+       ""
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]"
+        (field GenoFreeze Name)
+        'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:genotypeDataset)
+    (set rdfs:label (field GenoFreeze Name))
+    (set skos:prefLabel (field GenoFreeze FullName))
+    (set skos:altLabel (field GenoFreeze ShortName))
+    (set dct:created (annotate-field
+                      (field GenoFreeze CreateTime)
+                      '^^xsd:date))
+    (set gnt:belongsToSet
+         (string->identifier
+          "set" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))))
+
+;; Molecular Traits are also referred to as ProbeSets. This
+;; transformer covers public ProbeSetFreeze rows for which the
+;; InfoFiles left join finds no row with a matching InfoPageName.
+(define-transformer probesetfreeze
+  (tables (ProbeSetFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
+           (left-join ProbeFreeze "USING (ProbeFreezeId)")
+           (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID")
+           (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id")
+           (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId"))
+          "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id")
+  (schema-triples
+   (gnt:usesNormalization rdfs:domain gnc:probeset)
+   (gnt:usesDataScale rdfs:domain gnc:probeset)
+   (gnt:usesDataScale a owl:ObjectProperty)
+   (gnt:usesDataScale skos:definition "The data scale this resource uses"))
+  (triples
+      ;; Subject ID: ProbeSetFreeze.Name with every character outside
+      ;; [A-Za-z0-9:] replaced by "_".
+      (string->identifier
+       ""
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]"
+        (field ProbeSetFreeze Name)
+        'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:probesetDataset)
+    (set gnt:usesNormalization
+         (string->identifier "avgmethod"
+                             ;; If AvgMethodName is NULL, assume N/A.
+                             (if (string-blank? (field AvgMethod Name AvgMethodName))
+                                 "N/A" (field AvgMethod Name AvgMethodName))))
+    (set dct:title (field ProbeSetFreeze FullName))
+    (set rdfs:label (field ProbeSetFreeze ShortName))
+    (set skos:prefLabel (field ProbeSetFreeze Name))
+    (set skos:altLabel (field ProbeSetFreeze Name2))
+    ;; NOTE(review): the XSD datatype is spelled xsd:dateTime (capital
+    ;; T), and the publishfreeze and genofreeze transformers annotate
+    ;; their CreateTime with xsd:date. Confirm the column type and
+    ;; pick the matching datatype; left unchanged here.
+    (set dct:created (annotate-field
+                      (field ProbeSetFreeze CreateTime)
+                      '^^xsd:datetime))
+    (set gnt:usesDataScale (field ProbeSetFreeze DataScale))
+    (set gnt:hasTissue
+         (string->identifier
+          "tissue"
+          (field Tissue Short_Name)))
+    (set gnt:belongsToSet
+         (string->identifier
+          "set" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))))
+
+
+
+;; Entry point: hands the six transformers defined in this file to
+;; with-documentation, together with the connection settings, the
+;; prefix table and the output paths for the documentation and the
+;; RDF file.
+(with-documentation
+ (name "Info files / Investigators Metadata")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ (prefixes
+  '(("v:" "<http://www.w3.org/2006/vcard/ns#>")
+    ("foaf:" "<http://xmlns.com/foaf/0.1/>")
+    ("gdmt:" "<http://vocab.fairdatacollective.org/gdmt/>")
+    ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
+    ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>")
+    ("gnt:" "<http://genenetwork.org/term/>")
+    ("gn:" "<http://genenetwork.org/id/>")
+    ("gnc:" "<http://genenetwork.org/category/>")
+    ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+    ("owl:" "<http://www.w3.org/2002/07/owl#>")
+    ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
+    ;; The freeze transformers annotate dct:created with ^^xsd:date /
+    ;; ^^xsd:datetime, so the xsd: prefix is declared here too.
+    ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
+    ("taxon:" "<http://purl.uniprot.org/taxonomy/>")
+    ("dct:" "<http://purl.org/dc/terms/>")))
+ (inputs
+  (list info-files
+        publishfreeze
+        genofreeze
+        probesetfreeze
+        investigators
+        gene-chip))
+ (outputs
+  '(#:documentation "./docs/info-pages.md"
+    #:rdf "/export/data/genenetwork-virtuoso/info-pages.ttl")))
+