aboutsummaryrefslogtreecommitdiff
path: root/examples/dump-genotype.scm
blob: 04f1af02fabc0ac8db4499efcef183a1cc8be448 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#! /usr/bin/env guile
!#

(use-modules (rnrs programs)
             (rnrs io ports)
             (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 match)
             (ice-9 regex)
             (dump strings)
             (dump sql)
             (dump triples)
             (dump special-forms))



;; Connection settings for the dump, obtained by reading a single Scheme
;; datum from the file named by the first command-line argument.
(define %connection-settings
  (let ((settings-file (list-ref (command-line) 1)))
    (call-with-input-file settings-file
      (lambda (port)
        (read port)))))



(define (remap-species-identifiers str)
  "Return the replacement name for the species identifier STR when STR is
one of the known non-standard identifiers listed below; otherwise return
STR unchanged.  This is a workaround: the identifiers should really be
corrected in the database itself."
  ;; NOTE(review): \"Macaca mulatta\" is remapped to \"Macaca nemestrina\",
  ;; which is a different binomial name -- confirm this is intended.
  (let ((replacement
         (assoc-ref
          '(("Fly (Drosophila melanogaster dm6)" . "Drosophila melanogaster")
            ("Oryzias latipes (Japanese medaka)" . "Oryzias latipes")
            ("Macaca mulatta" . "Macaca nemestrina")
            ("Bat (Glossophaga soricina)" . "Glossophaga soricina"))
          str)))
    ;; Every value in the table is a string, so a #f lookup result can
    ;; only mean "no entry": fall back to the input as given.
    (or replacement str)))

;; Dump metadata triples for genotype datasets (rows of GenoFreeze).
;; The WHERE clause restricts the dump to rows with public > 0 and
;; confidentiality < 1 for which the left join found no InfoFiles row
;; (InfoFiles.InfoPageName IS NULL).
(define-dump dump-genofreeze
  (tables (GenoFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
          "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
  (triples
      ;; Subject: derived from the dataset name, with every character
      ;; outside [A-Za-z0-9:] replaced by "_".  A single pass suffices:
      ;; the substitution is idempotent, because "_" is itself outside
      ;; the character class and would only be replaced by "_" again.
      (string->identifier
       ""
       (regexp-substitute/global
        #f "[^A-Za-z0-9:]"
        (field GenoFreeze Name)
        'pre "_" 'post)
       #:separator ""
       #:proc string-capitalize-first)
    (set rdf:type 'gnc:genotypeDataset)
    ;; The three name columns map to label, preferred label and
    ;; alternative label respectively.
    (set rdfs:label (field GenoFreeze Name))
    (set skos:prefLabel (field GenoFreeze FullName))
    (set skos:altLabel (field GenoFreeze ShortName))
    ;; CreateTime is annotated with the xsd:date datatype.
    (set dct:created (annotate-field
                      (field GenoFreeze CreateTime)
                      '^^xsd:date))
    ;; Link to the inbred set, identified by "inbredSet" followed by the
    ;; InbredSet name.
    (set gnt:belongsToSet
         (string->identifier
            "inbredSet" (field InbredSet Name)
            #:separator ""
            #:proc string-capitalize-first))))


;; Dump one resource per row of Geno (left-joined with Species on
;; SpeciesId), together with schema triples describing the gnc:genotype
;; concept and the gnt: properties used on those resources.
(define-dump dump-genotypes
  (tables (Geno
           (left-join Species "USING (SpeciesId)")))
  (schema-triples
   (gnc:genotype a skos:Concept)
   (gnc:genotype
    skos:description
    "This is a set of controlled terms that are used to describe a given genotype")
   (gnt:chr a owl:ObjectProperty)
   (gnt:chr skos:description "This resource is located on a given chromosome")
   (gnt:chr rdfs:domain gnc:genotype)
   (gnt:mb a owl:ObjectProperty)
   (gnt:mb skos:definition "The size of this resource in Mb")
   (gnt:mb rdfs:domain gnc:genotype)
   (gnt:mbMm8 a owl:ObjectProperty)
   (gnt:mbMm8 skos:definition "TODO")
   (gnt:mbMm8 rdfs:domain gnc:genotype)
   (gnt:mb2016 a owl:ObjectProperty)
   (gnt:mb2016 skos:definition "TODO")
   (gnt:mb2016 rdfs:domain gnc:genotype)
   (gnt:hasSequence a owl:ObjectProperty)
   (gnt:hasSequence skos:definition "This resource has a given sequence")
   (gnt:hasSequence rdfs:domain gnc:genotype)
   (gnt:hasSource a owl:ObjectProperty)
   (gnt:hasSource rdfs:domain gnc:genotype)
   (gnt:hasSource skos:definition "This resource was obtained from this given source")
   (gnt:hasAltSourceName a owl:ObjectProperty)
   (gnt:hasAltSourceName rdfs:domain gnc:genotype)
   (gnt:hasAltSourceName
    skos:definition
    "The alternative name this resource was obtained from")
   (gnt:chrNum a owl:ObjectProperty)
   (gnt:chrNum rdfs:domain gnc:genotype)
   ;; Stated once only; the definition triple used to be listed twice.
   (gnt:chrNum skos:definition "The chromosome number for this resource"))
  (triples
      ;; Subject: derived from the genotype name, with every character
      ;; outside [A-Za-z0-9:] replaced by "_".
      (string->identifier
       ""
       (regexp-substitute/global
        #f "[^A-Za-z0-9:]"
        (field Geno Name)
        'pre "_" 'post)
       #:separator ""
       #:proc string-capitalize-first)
    (set rdf:type 'gnc:genotype)
    (set skos:prefLabel (sanitize-rdf-string (field Geno Name)))
    (set gnt:chr (field Geno Chr))
    ;; The three position columns are selected through IFNULL(..., '') so
    ;; that a NULL comes back as an empty string, then annotated as
    ;; xsd:double.
    (set gnt:mb (annotate-field
                 (field ("IFNULL(Geno.Mb, '')" Mb)) '^^xsd:double))
    (set gnt:mbMm8 (annotate-field (field ("IFNULL(Geno.Mb_mm8, '')" Mb_mm8))
                                   '^^xsd:double))
    (set gnt:mb2016
         (annotate-field (field ("IFNULL(Geno.Mb_2016, '')" Mb_2016))
                         '^^xsd:double))
    (set gnt:hasSequence (field Geno Sequence))
    (set gnt:hasSource (field Geno Source))
    ;; Only dump Source2 if it differs from Source
    (set gnt:hasAltSourceName
         (field ("IF((Source2 = Source), NULL, Source2)"
                 Source2)))
    ;; Species names pass through remap-species-identifiers before the
    ;; identifier is built.
    (set gnt:belongsToSpecies
         (string->identifier
          "" (remap-species-identifiers (field Species Fullname))
          #:separator ""
          #:proc string-capitalize-first))
    (set gnt:chrNum
         (annotate-field
          (field Geno chr_num)
          '^^xsd:int))
    ;; The RDF Schema property is rdfs:comment (singular); the previously
    ;; emitted "rdfs:comments" is not a term of that vocabulary.
    (set rdfs:comment (field Geno Comments))))



;; Script entry point: invoke `dump-with-documentation' for the dump named
;; "Genotype Metadata", passing %connection-settings as the connection,
;; the prefix/IRI pairs below, and the dumps dump-genofreeze and
;; dump-genotypes as inputs.
;; NOTE(review): the precise meaning of each clause (for instance
;; `table-metadata?') is defined by the `dump-with-documentation' form,
;; which is not defined in this file -- presumably it comes from
;; (dump special-forms); confirm there before changing clause order.
(dump-with-documentation
 (name "Genotype Metadata")
 (connection %connection-settings)
 (table-metadata? #f)
 ;; Prefix / namespace IRI pairs; these cover every prefix used by the
 ;; triples of the two dumps listed under `inputs'.
 (prefixes
  '(("dct:" "<http://purl.org/dc/terms/>")
    ("gn:" "<http://genenetwork.org/id/>")
    ("gnc:" "<http://genenetwork.org/category/>")
    ("gnt:" "<http://genenetwork.org/term/>")
    ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
    ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
    ("owl:" "<http://www.w3.org/2002/07/owl#>")
    ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
    ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")))
 (inputs
  (list dump-genofreeze
        dump-genotypes))
 ;; Output locations, keyed by #:documentation and #:rdf.
 ;; NOTE(review): the #:rdf path is absolute and looks
 ;; deployment-specific -- confirm it exists on the target machine.
 (outputs
  '(#:documentation "./docs/dump-genotype.md"
    #:rdf "/export/data/genenetwork-virtuoso/dump-genotype.ttl")))