aboutsummaryrefslogtreecommitdiff
path: root/examples/dump-genotype.scm
blob: ed23e805424ea7cfa2fee09dda3c5349112e6c3e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#! /usr/bin/env guile
!#

(use-modules (rnrs programs)
             (rnrs io ports)
             (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 match)
             (ice-9 regex)
             (dump strings)
             (dump sql)
             (dump triples)
             (dump special-forms))



;; Database connection settings, read as a single Scheme datum from the
;; file named by the first command-line argument.  If that argument is
;; missing, print a usage line to stderr and exit with a non-zero
;; status instead of failing with an out-of-range error from list-ref.
(define %connection-settings
  (match (command-line)
    ((_ settings-file . _)
     (call-with-input-file settings-file read))
    ((program . _)
     (format (current-error-port)
             "Usage: ~a CONNECTION-SETTINGS-FILE~%" program)
     (exit 1))))



;; Dump one RDF resource per GenoFreeze row (a genotype dataset).
;;
;; InfoFiles is left-joined on the dataset name and the WHERE clause
;; then keeps only rows with no matching InfoFiles entry
;; (InfoFiles.InfoPageName IS NULL), restricted further to rows with
;; public > 0 and confidentiality < 1.
(define-dump dump-genofreeze
  (tables (GenoFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
          "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
  (triples
      ;; Subject: the dataset name with every character outside
      ;; [A-Za-z0-9:] replaced by "_".  A single substitution pass is
      ;; sufficient: "_" maps to itself, so repeating the same pass
      ;; cannot change the result.
      (string->identifier
       ""
       (regexp-substitute/global
        #f "[^A-Za-z0-9:]"
        (field GenoFreeze Name)
        'pre "_" 'post)
       #:separator ""
       #:proc string-capitalize-first)
    (set rdf:type 'gnc:genotypeDataset)
    (set rdfs:label (field GenoFreeze Name))
    (set skos:prefLabel (field GenoFreeze FullName))
    (set skos:altLabel (field GenoFreeze ShortName))
    (set dct:created (annotate-field
                      (field GenoFreeze CreateTime)
                      '^^xsd:date))
    ;; InbredSet.Name is selected under the alias InbredSetName.
    (set gnt:belongsToInbredSet
         (string->identifier "" (field InbredSet Name InbredSetName)))))

;; Dump one RDF resource per Geno row (a genotype marker), together with
;; the schema triples describing the gnc:genotype concept and the gnt:
;; predicates used on it.
;;
;; Geno is left-joined through GenoXRef to GenoFreeze (and on to
;; InbredSet and InfoFiles), so a marker may appear without a dataset;
;; the SQL expressions below guard against the resulting NULLs.
(define-dump dump-genotypes
  (tables (Geno
           (left-join GenoXRef "ON Geno.Id = GenoXRef.GenoId")
           (left-join GenoFreeze "ON GenoFreeze.Id = GenoXRef.GenoFreezeId")
           (left-join InbredSet "ON InbredSet.InbredSetId = GenoFreeze.InbredSetId")
           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name")))
  (schema-triples
   ;; SKOS defines skos:definition but no "skos:description"; every
   ;; descriptive text here uses skos:definition.
   (gnc:genotype a skos:Concept)
   (gnc:genotype
    skos:definition
    "This is a set of controlled terms that are used to describe a given genotype")
   (gnt:chr a owl:ObjectProperty)
   (gnt:chr skos:definition "This resource is located on a given chromosome")
   (gnt:chr rdfs:domain gnc:genotype)
   (gnt:mb a owl:ObjectProperty)
   (gnt:mb skos:definition "The size of this resource in Mb")
   (gnt:mb rdfs:domain gnc:genotype)
   (gnt:mbMm8 a owl:ObjectProperty)
   (gnt:mbMm8 skos:definition "TODO")
   (gnt:mbMm8 rdfs:domain gnc:genotype)
   (gnt:mb2016 a owl:ObjectProperty)
   (gnt:mb2016 skos:definition "TODO")
   (gnt:mb2016 rdfs:domain gnc:genotype)
   (gnt:hasSequence a owl:ObjectProperty)
   (gnt:hasSequence skos:definition "This resource has a given sequence")
   (gnt:hasSequence rdfs:domain gnc:genotype)
   (gnt:hasSource a owl:ObjectProperty)
   (gnt:hasSource rdfs:domain gnc:genotype)
   (gnt:hasSource skos:definition "This resource was obtained from this given source")
   (gnt:hasAltSourceName a owl:ObjectProperty)
   (gnt:hasAltSourceName rdfs:domain gnc:genotype)
   (gnt:hasAltSourceName
    skos:definition
    "The alternative name this resource was obtained from")
   (gnt:chrNum a owl:ObjectProperty)
   (gnt:chrNum rdfs:domain gnc:genotype)
   (gnt:chrNum skos:definition "The chromosome number for this resource")
   (gnt:cM a owl:ObjectProperty)
   (gnt:cM rdfs:domain gnc:genotype)
   (gnt:cM skos:definition "The centimorgan for this resource")
   (gnt:usedForMapping a owl:ObjectProperty)
   (gnt:usedForMapping rdfs:domain gnc:genotype)
   (gnt:usedForMapping
    skos:definition "This indicates whether this resource is used for mapping"))
  (triples
      ;; Subject: "<GenoFreeze.Name>_<Geno.Name>", or just Geno.Name
      ;; when the marker has no dataset, with every character outside
      ;; [A-Za-z0-9:] replaced by "_".
      (string->identifier
       ""
       (regexp-substitute/global
        #f "[^A-Za-z0-9:]"
        (field ("CONCAT(IF(GenoFreeze.Name IS NULL, '', CONCAT(GenoFreeze.Name, '_')), Geno.Name)" abbrev))
        'pre "_" 'post)
       #:separator ""
       #:proc string-capitalize-first)
    (set rdf:type 'gnc:genotype)
    (set skos:prefLabel (sanitize-rdf-string (field Geno Name)))
    (set gnt:chr (field Geno Chr))
    ;; NULL positions are selected as '' via IFNULL.
    (set gnt:mb (annotate-field
                 (field ("IFNULL(Geno.Mb, '')" Mb)) '^^xsd:double))
    (set gnt:mbMm8 (annotate-field (field ("IFNULL(Geno.Mb_mm8, '')" Mb_mm8))
                                   '^^xsd:double))
    (set gnt:mb2016
         (annotate-field (field ("IFNULL(Geno.Mb_2016, '')" Mb_2016))
                         '^^xsd:double))
    (set gnt:hasSequence (field Geno Sequence))
    (set gnt:hasSource (field Geno Source))
    ;; Only dump Source2 if it differs from Source
    (set gnt:hasAltSourceName
         (field ("IF((Source2 = Source), NULL, Source2)"
                 Source2)))
    ;; Dataset identifier built with the same character substitution as
    ;; the subject; a missing dataset name is selected as ''.
    (set gnt:belongsToDataset
         (string->identifier
          ""
          (regexp-substitute/global
           #f "[^A-Za-z0-9:]"
           (field ("IFNULL(GenoFreeze.Name, '')" DatasetName))
           'pre "_" 'post)
          #:separator ""
          #:proc string-capitalize-first))
    (set gnt:chrNum
         (annotate-field
          (field Geno chr_num)
          '^^xsd:int))
    ;; The RDFS annotation property is rdfs:comment (singular); RDFS
    ;; defines no "rdfs:comments".
    (set rdfs:comment (field Geno Comments))
    ;; NOTE(review): cM is typed as xsd:int here; if GenoXRef.cM holds
    ;; fractional values this should presumably be xsd:double -- confirm
    ;; against the table schema.
    (set gnt:cM
         (annotate-field
          (field GenoXRef cM)
          '^^xsd:int))))



;; Entry point: run the two dumps defined in this file against the
;; database described by %connection-settings.
(dump-with-documentation
 (name "Genotype Metadata")
 (connection %connection-settings)
 (table-metadata? #f)
 ;; Namespace prefixes, given as (prefix IRI) pairs.
 (prefixes
  '(("dct:" "<http://purl.org/dc/terms/>")
    ("gn:" "<http://genenetwork.org/id/>")
    ("gnc:" "<http://genenetwork.org/category/>")
    ("gnt:" "<http://genenetwork.org/term/>")
    ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
    ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
    ("owl:" "<http://www.w3.org/2002/07/owl#>")
    ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
    ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")))
 ;; Dumps to run, as defined by the define-dump forms in this file.
 (inputs
  (list dump-genofreeze
        dump-genotypes))
 ;; Output locations are relative paths: a Markdown documentation file
 ;; and a Turtle (.ttl) RDF file.
 (outputs
  '(#:documentation "./docs/dump-genotype.md"
    #:rdf "./verified-data/dump-genotype.ttl")))