aboutsummaryrefslogtreecommitdiff
path: root/examples/dump-probesetfreeze.scm
blob: fab41b4c6062a417f8aa5ff2b2c2a4b21df33c62 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#! /usr/bin/env guile
!#

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 match)
             (ice-9 regex)
             (dump strings)
             (dump sql)
             (dump triples)
             (dump special-forms))



;; Database connection settings, read as a single datum from the file named by
;; the first command-line argument.  Any further arguments are ignored.
(define %connection-settings
  (match (command-line)
    [(_ settings-file . _)
     (call-with-input-file settings-file read)]
    [_
     ;; Without this clause a missing argument shows up as an opaque
     ;; out-of-range error raised by `list-ref'.
     (error "Missing argument: path to the connection settings file")]))


(define (remap-species-identifiers str)
  "Return the species name that STR is remapped to, or STR itself when it has
   no entry in the remapping table.  This is a stop-gap: the names should
   really be corrected in the database."
  (let ([remappings
         '(("Fly (Drosophila melanogaster dm6)" . "Drosophila melanogaster")
           ("Oryzias latipes (Japanese medaka)" . "Oryzias latipes")
           ;; NOTE(review): this entry maps one binomial onto a different
           ;; one -- confirm that it is intentional.
           ("Macaca mulatta" . "Macaca nemestrina")
           ("Bat (Glossophaga soricina)" . "Glossophaga soricina"))])
    ;; `assoc-ref' compares keys with `equal?' and yields #f on a miss, in
    ;; which case the input is handed back untouched.
    (or (assoc-ref remappings str) str)))


;; Dump the GeneChip table, left-joined to Species on SpeciesId, with each
;; subject typed as gnc:geneChip.
(define-dump dump-gene-chip
  (tables (GeneChip
           (left-join Species "USING (SpeciesId)")))
  (schema-triples
   (gnc:geneChip a skos:Concept)
   ;; SKOS has no `description' property; `definition' is what the other
   ;; schema triples in this file use for explanatory text.
   (gnc:geneChip
    skos:definition
    "This is a set of controlled terms that are used to describe a given gene chip/platform")
   (gnt:belongsToSpecies a owl:ObjectProperty)
   (gnt:belongsToSpecies skos:definition "This resource belongs to this given species")
   (gnt:belongsToSpecies rdfs:domain gnc:geneChip)
   ;; gnc:geneChip is the only domain declared for gnt:hasGeoSeriesId; it is
   ;; the rdf:type assigned to the subjects in the triples section.
   (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip)
   (gnt:hasGOTreeValue a owl:ObjectProperty)
   (gnt:hasGOTreeValue skos:definition "This resource has the following GO tree value")
   (gnt:hasGOTreeValue rdfs:domain gnc:geneChip))
  ;; Subject: `string->identifier' applied to "platform" and GeneChip.Name.
  (triples (string->identifier "platform" (field GeneChip Name))
    (set rdf:type 'gnc:geneChip)
    (set rdfs:label (field GeneChip GeneChipName))
    (set skos:prefLabel (field GeneChip Name))
    ;; The SQL expression yields Title only when it differs from
    ;; GeneChipName, and NULL otherwise.
    (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)"
                               Title)))
    (set gnt:hasGOTreeValue (field GeneChip Go_tree_value))
    ;; The species name goes through `remap-species-identifiers' before being
    ;; turned into an identifier.
    (set gnt:belongsToSpecies
         (string->identifier "" (remap-species-identifiers (field Species Fullname))
                             #:separator ""
                             #:proc string-capitalize-first))
    ;; GeoPlatform is stripped of surrounding whitespace before use.
    (set gnt:hasGeoSeriesId
         (ontology 'geoSeries:
                   (string-trim-both (field GeneChip GeoPlatform))))))

;; Molecular Traits are also referred to as ProbeSets
;; Dump the ProbeSetFreeze table, left-joined to InfoFiles, ProbeFreeze,
;; AvgMethod, InbredSet and Tissue, with each subject typed as
;; gnc:probesetDataset.
(define-dump dump-probesetfreeze
  (tables (ProbeSetFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join ProbeFreeze "USING (ProbeFreezeId)")
           (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID")
           (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id")
           (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId"))
          ;; Keep only public rows (public > 0) that have no matching
          ;; InfoFiles row (InfoPageName IS NULL after the left join),
          ;; grouped by ProbeFreeze.Id.
          "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id")
  ;; NOTE(review): the domains below name gnc:probeset while the subjects are
  ;; typed gnc:probesetDataset -- confirm which class is intended.
  (schema-triples
   (gnt:usesNormalization rdfs:domain gnc:probeset)
   (gnt:usesDataScale rdfs:domain gnc:probeset)
   (gnt:usesDataScale a owl:ObjectProperty)
   (gnt:usesDataScale skos:definition "The data scale this resource uses"))
  (triples
      ;; Subject: ProbeSetFreeze.Name with every character other than ASCII
      ;; letters, digits and ":" replaced by "_", then passed to
      ;; `string->identifier'.
      (string->identifier
       ""
       (regexp-substitute/global
        #f "[^A-Za-z0-9:]"
        (field ProbeSetFreeze Name)
        'pre "_" 'post)
       #:separator ""
       #:proc string-capitalize-first)
    (set rdf:type 'gnc:probesetDataset)
    (set gnt:usesNormalization
         (string->identifier "avgmethod"
                             ;; If AvgMethodName is NULL, assume N/A.
                             (if (string-blank? (field AvgMethod Name AvgMethodName))
                                 "N/A" (field AvgMethod Name AvgMethodName))))
    (set dct:title (field ProbeSetFreeze FullName))
    (set rdfs:label (field ProbeSetFreeze ShortName))
    (set skos:prefLabel (field ProbeSetFreeze Name))
    (set skos:altLabel (field ProbeSetFreeze Name2))
    ;; XSD datatype names are case-sensitive: the datatype is xsd:dateTime.
    ;; NOTE(review): if CreateTime carries no time component, xsd:date may be
    ;; the better annotation -- confirm against the column type.
    (set dct:created (annotate-field
                      (field ProbeSetFreeze CreateTime)
                      '^^xsd:dateTime))
    (set gnt:usesDataScale (field ProbeSetFreeze DataScale))
    (set gnt:hasTissue
         (string->identifier
          "tissue"
          (field Tissue Short_Name)))
    (set gnt:belongsToInbredSet
         (string->identifier
            "inbredSet" (field InbredSet Name)
            #:separator ""
            #:proc string-capitalize-first))))



;; Entry point: hands the two dumps defined in this file to
;; `dump-with-documentation', together with the connection settings read from
;; the command line, the prefix table and the output paths.
(dump-with-documentation
 (name "Probeset freeze metadata")
 (connection %connection-settings)
 (table-metadata? #f)
 ;; Prefix / namespace pairs; every prefix used by the dumps in this file
 ;; (gnc:, gnt:, skos:, owl:, rdf:, rdfs:, dct:, xsd:, geoSeries:) is listed.
 (prefixes
  '(("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>")
    ("gn:" "<http://genenetwork.org/id/>")
    ("gnc:" "<http://genenetwork.org/category/>")
    ("dct:" "<http://purl.org/dc/terms/>")
    ("owl:" "<http://www.w3.org/2002/07/owl#>")
    ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
    ("gnt:" "<http://genenetwork.org/term/>")
    ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
    ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
    ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")))
 (inputs
  (list dump-gene-chip
        dump-probesetfreeze))
 ;; NOTE(review): the documentation path is named after dump-gene-chip while
 ;; the RDF path is named after dump-probesetfreeze -- confirm that this is
 ;; intended and does not clobber another script's documentation file.
 (outputs
  '(#:documentation "./docs/dump-gene-chip.md"
    #:rdf "./verified-data/dump-probesetfreeze.ttl")))