about summary refs log tree commit diff
path: root/examples/dump-dataset-metadata.scm
diff options
context:
space:
mode:
Diffstat (limited to 'examples/dump-dataset-metadata.scm')
-rwxr-xr-xexamples/dump-dataset-metadata.scm140
1 files changed, 139 insertions, 1 deletions
diff --git a/examples/dump-dataset-metadata.scm b/examples/dump-dataset-metadata.scm
index e732772..8f381b7 100755
--- a/examples/dump-dataset-metadata.scm
+++ b/examples/dump-dataset-metadata.scm
@@ -56,6 +56,37 @@
     (set v:postal-code (field Investigators ZipCode))
     (set v:country-name (field Investigators Country))))
 
+;; Dump GeneChip rows (left-joined with Species on SpeciesId) as
+;; resources typed gnc:geneChip, together with the schema triples that
+;; describe the predicates used below.
+(define-dump dump-gene-chip
+  (tables (GeneChip
+           (left-join Species "USING (SpeciesId)")))
+  (schema-triples
+   (gnc:geneChip a skos:Concept)
+   (gnc:geneChip
+    skos:description
+    "This is a set of controlled terms that are used to describe a given gene chip/platform")
+   ;; NOTE(review): gnt:hasGeoSeriesId is given two rdfs:domain values
+   ;; (gnc:platform here, gnc:geneChip further down) -- confirm that
+   ;; both are intended.
+   (gnt:hasGeoSeriesId rdfs:domain gnc:platform)
+   (gnt:belongsToSpecies a owl:ObjectProperty)
+   (gnt:belongsToSpecies skos:definition "This resource belongs to this given species")
+   (gnt:belongsToSpecies rdfs:domain gnc:geneChip)
+   (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip)
+   (gnt:hasGOTreeValue a owl:ObjectProperty)
+   (gnt:hasGOTreeValue skos:definition "This resource has the following GO tree value")
+   (gnt:hasGOTreeValue rdfs:domain gnc:geneChip))
+  ;; The subject identifier is built from the "platform" prefix and
+  ;; GeneChip.Name.
+  (triples (string->identifier "platform" (field GeneChip Name))
+    (set rdf:type 'gnc:geneChip)
+    (set rdfs:label (field GeneChip GeneChipName))
+    (set skos:prefLabel (field GeneChip Name))
+    ;; The SQL expression yields Title only when it differs from
+    ;; GeneChipName, and NULL otherwise.
+    (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)"
+                               Title)))
+    (set gnt:hasGOTreeValue (field GeneChip Go_tree_value))
+    (set gnt:belongsToSpecies
+         (string->identifier "" (remap-species-identifiers (field Species Fullname))
+                             #:separator ""
+                             #:proc string-capitalize-first))
+    ;; GeoPlatform is stripped of leading/trailing whitespace before it
+    ;; is placed under the geoSeries: prefix.
+    (set gnt:hasGeoSeriesId
+         (ontology 'geoSeries:
+                   (string-trim-both (field GeneChip GeoPlatform))))))
+
 (define-dump dump-info-files
   (tables (InfoFiles
            (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
@@ -219,6 +250,109 @@
     (set gnt:hasAcknowledgement (sanitize-rdf-string
                                  (field Datasets Acknowledgment)))))
 
+;; Phenotype datasets: PublishFreeze rows for which the left join on
+;; InfoFiles finds no matching row (InfoFiles.InfoFileId IS NULL),
+;; i.e. datasets without Infofile metadata.
+(define-dump dump-publishfreeze
+  (tables
+   (PublishFreeze
+    (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")
+    (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
+   "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")
+  (triples
+      ;; Subject: PublishFreeze.Name with every character outside
+      ;; A-Z, a-z, 0-9 and ":" replaced by "_".
+      (string->identifier
+       ""
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]" (field PublishFreeze Name)
+        'pre "_" 'post)
+       #:separator "" #:proc string-capitalize-first)
+    (set rdf:type 'gnc:phenotypeDataset)
+    (set rdfs:label (field PublishFreeze Name))
+    (set skos:prefLabel (field PublishFreeze FullName))
+    (set skos:altLabel (field PublishFreeze ShortName))
+    ;; CreateTime is annotated as an xsd:date literal.
+    (set dct:created
+         (annotate-field (field PublishFreeze CreateTime) '^^xsd:date))
+    ;; Link to the InbredSet resource, identified by the "inbredSet"
+    ;; prefix plus InbredSet.Name.
+    (set gnt:belongsToSet
+         (string->identifier "inbredSet" (field InbredSet Name)
+                             #:separator "" #:proc string-capitalize-first))))
+
+;; Genotype datasets: GenoFreeze rows for which the left join on
+;; InfoFiles finds no matching row (InfoFiles.InfoPageName IS NULL).
+(define-dump dump-genofreeze
+  (tables (GenoFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name")
+           (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
+          "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
+  (triples
+      ;; Subject: GenoFreeze.Name with every character outside A-Z,
+      ;; a-z, 0-9 and ":" replaced by "_".  A single substitution pass
+      ;; is enough: running the same pattern/replacement a second time
+      ;; over its own output cannot change the result.
+      (string->identifier
+       ""
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]"
+        (field GenoFreeze Name)
+        'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:genotypeDataset)
+    (set rdfs:label (field GenoFreeze Name))
+    (set skos:prefLabel (field GenoFreeze FullName))
+    (set skos:altLabel (field GenoFreeze ShortName))
+    ;; CreateTime is annotated as an xsd:date literal.
+    (set dct:created (annotate-field
+                      (field GenoFreeze CreateTime)
+                      '^^xsd:date))
+    ;; Link to the InbredSet resource, identified by the "inbredSet"
+    ;; prefix plus InbredSet.Name.
+    (set gnt:belongsToSet
+         (string->identifier
+            "inbredSet" (field InbredSet Name)
+            #:separator ""
+            #:proc string-capitalize-first))))
+
+;; Molecular traits are also referred to as ProbeSets.  This dump
+;; covers ProbeSetFreeze rows for which the left join on InfoFiles
+;; finds no matching row (InfoFiles.InfoPageName IS NULL).
+(define-dump dump-probesetfreeze
+  (tables (ProbeSetFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
+           (left-join ProbeFreeze "USING (ProbeFreezeId)")
+           (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID")
+           ;; NOTE(review): the sibling dumps join on
+           ;; InbredSet.InbredSetId; this one uses InbredSet.Id --
+           ;; confirm the column.
+           (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id")
+           (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId"))
+          ;; NOTE(review): GROUP BY ProbeFreeze.Id may collapse several
+          ;; ProbeSetFreeze rows that share one ProbeFreeze into a
+          ;; single row -- confirm this is intended.
+          "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id")
+  ;; NOTE(review): the domains below name gnc:probeset while the
+  ;; resources emitted by this dump are typed gnc:probesetDataset --
+  ;; confirm which class is meant.
+  (schema-triples
+   (gnt:usesNormalization rdfs:domain gnc:probeset)
+   (gnt:usesDataScale rdfs:domain gnc:probeset)
+   (gnt:usesDataScale a owl:ObjectProperty)
+   (gnt:usesDataScale skos:definition "The data scale this resource uses"))
+  (triples
+      ;; Subject: ProbeSetFreeze.Name with every character outside
+      ;; A-Z, a-z, 0-9 and ":" replaced by "_".
+      (string->identifier
+       ""
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]"
+        (field ProbeSetFreeze Name)
+        'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:probesetDataset)
+    (set gnt:usesNormalization
+         (string->identifier "avgmethod"
+                             ;; A blank AvgMethod name (e.g. no joined
+                             ;; AvgMethod row) falls back to "N/A".
+                             (if (string-blank? (field AvgMethod Name AvgMethodName))
+                                 "N/A" (field AvgMethod Name AvgMethodName))))
+    (set dct:title (field ProbeSetFreeze FullName))
+    (set rdfs:label (field ProbeSetFreeze ShortName))
+    (set skos:prefLabel (field ProbeSetFreeze Name))
+    (set skos:altLabel (field ProbeSetFreeze Name2))
+    ;; NOTE(review): XSD datatype names are case-sensitive and the
+    ;; standard spelling is xsd:dateTime; the sibling dumps annotate
+    ;; CreateTime as xsd:date.  Confirm the intended datatype.
+    (set dct:created (annotate-field
+                      (field ProbeSetFreeze CreateTime)
+                      '^^xsd:datetime))
+    (set gnt:usesDataScale (field ProbeSetFreeze DataScale))
+    (set gnt:hasTissue
+         (string->identifier
+          "tissue"
+          (field Tissue Short_Name)))
+    ;; Link to the InbredSet resource, identified by the "inbredSet"
+    ;; prefix plus InbredSet.Name.
+    (set gnt:belongsToSet
+         (string->identifier
+          "inbredSet" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))))
 
 
 
@@ -242,7 +376,11 @@
     ("dct:" "<http://purl.org/dc/terms/>")))
  (inputs
   (list dump-info-files
-        dump-investigators))
+        dump-publishfreeze
+        dump-genofreeze
+        dump-probesetfreeze
+        dump-investigators
+        dump-gene-chip))
  (outputs
   '(#:documentation "./docs/dump-info-pages.md"
     #:rdf "/export/data/genenetwork-virtuoso/dump-info-pages.ttl")))