aboutsummaryrefslogtreecommitdiff
path: root/examples/genotype.scm
blob: 830da0aacf2a988dccf01820f2f506f115846ee2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#! /usr/bin/env guile
!#

(use-modules (rnrs programs)
             (rnrs io ports)
             (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms))



(define (remap-species-identifiers str)
  "Translate the species label STR into its standard binomial name.
   Labels that have no entry in the lookup table are returned unchanged.
   This is a stop-gap; the proper fix is to correct the labels in the
   database itself."
  (let ((binomial-names
         '(("Fly (Drosophila melanogaster dm6)" . "Drosophila melanogaster")
           ("Oryzias latipes (Japanese medaka)" . "Oryzias latipes")
           ;; NOTE(review): maps one macaque species onto a different one;
           ;; kept exactly as found -- confirm this is intentional.
           ("Macaca mulatta" . "Macaca nemestrina")
           ("Bat (Glossophaga soricina)" . "Glossophaga soricina"))))
    ;; `assoc' compares keys with `equal?', so labels are matched by content.
    (cond ((assoc str binomial-names) => cdr)
          (else str))))

;; Transformer for genotype metadata.  Rows come from the Geno table
;; left-joined with Species on SpeciesId.  The `schema-triples' section lists
;; the vocabulary terms (gnc:/gnt:) used below together with their
;; descriptions; the `triples' section gives the subject expression for a row
;; followed by the predicate/object pairs set on it.
;;
;; NOTE(review): every gnt: property below is declared an owl:ObjectProperty,
;; yet the objects set in `triples' are column values (several typed as
;; xsd:double / xsd:int).  owl:DatatypeProperty may be the intended class --
;; left unchanged here, confirm with the ontology owners.
(define-transformer genotypes
  (tables (Geno
           (left-join Species "USING (SpeciesId)")))
  (schema-triples
   (gnc:genotype a skos:Concept)
   ;; skos:definition is the SKOS documentation property; SKOS defines no
   ;; skos:description term.
   (gnc:genotype
    skos:definition
    "This is a set of controlled terms that are used to describe a given genotype")
   (gnt:chr a owl:ObjectProperty)
   (gnt:chr skos:definition "This resource is located on a given chromosome")
   (gnt:chr rdfs:domain gnc:genotype)
   (gnt:mb a owl:ObjectProperty)
   (gnt:mb skos:definition "The size of this resource in Mb")
   (gnt:mb rdfs:domain gnc:genotype)
   (gnt:mbMm8 a owl:ObjectProperty)
   (gnt:mbMm8 skos:definition "TODO")
   (gnt:mbMm8 rdfs:domain gnc:genotype)
   (gnt:mb2016 a owl:ObjectProperty)
   (gnt:mb2016 skos:definition "TODO")
   (gnt:mb2016 rdfs:domain gnc:genotype)
   (gnt:hasSequence a owl:ObjectProperty)
   (gnt:hasSequence skos:definition "This resource has a given sequence")
   (gnt:hasSequence rdfs:domain gnc:genotype)
   (gnt:hasSource a owl:ObjectProperty)
   (gnt:hasSource rdfs:domain gnc:genotype)
   (gnt:hasSource skos:definition "This resource was obtained from this given source")
   (gnt:hasAltSourceName a owl:ObjectProperty)
   (gnt:hasAltSourceName rdfs:domain gnc:genotype)
   (gnt:hasAltSourceName
    skos:definition
    "The alternative name this resource was obtained from")
   (gnt:chrNum a owl:ObjectProperty)
   (gnt:chrNum rdfs:domain gnc:genotype)
   ;; This definition was previously listed twice; one copy is enough.
   (gnt:chrNum skos:definition "The chromosome number for this resource"))
  (triples
    ;; Subject: the genotype name with every character outside
    ;; [A-Za-z0-9:] replaced by "_", passed to string->identifier with an
    ;; empty prefix and separator.
    (string->identifier
     ""
     (regexp-substitute/global
      #f "[^A-Za-z0-9:]"
      (field Geno Name)
      'pre "_" 'post)
     #:separator ""
     #:proc string-capitalize-first)
    (set rdf:type 'gnc:genotype)
    (set skos:prefLabel (sanitize-rdf-string (field Geno Name)))
    (set gnt:chr (field Geno Chr))
    ;; The three Mb columns are selected through IFNULL so that a NULL comes
    ;; back as the empty string, then tagged as xsd:double.
    (set gnt:mb (annotate-field
                 (field ("IFNULL(Geno.Mb, '')" Mb)) '^^xsd:double))
    (set gnt:mbMm8 (annotate-field (field ("IFNULL(Geno.Mb_mm8, '')" Mb_mm8))
                                   '^^xsd:double))
    (set gnt:mb2016
         (annotate-field (field ("IFNULL(Geno.Mb_2016, '')" Mb_2016))
                         '^^xsd:double))
    (set gnt:hasSequence (field Geno Sequence))
    (set gnt:hasSource (field Geno Source))
    ;; Only transform Source2 if it differs from Source
    (set gnt:hasAltSourceName
         (field ("IF((Source2 = Source), NULL, Source2)"
                 Source2)))
    ;; Species labels are normalised by remap-species-identifiers before
    ;; being turned into an identifier.
    (set gnt:belongsToSpecies
         (string->identifier
          "" (remap-species-identifiers (field Species Fullname))
          #:separator ""
          #:proc string-capitalize-first))
    (set gnt:chrNum
         (annotate-field
          (field Geno chr_num)
          '^^xsd:int))
    ;; The RDFS term is rdfs:comment (singular); rdfs:comments is not defined
    ;; in the RDFS namespace.
    (set rdfs:comment (field Geno Comments))))



;; Script entry point.  Command-line options:
;;   -s, --settings FILE        file holding the connection settings; it is
;;                              opened and a single datum is `read' from it
;;   -o, --output FILE          passed to with-documentation as #:rdf
;;   -d, --documentation FILE   passed to with-documentation as #:documentation
(let* ((option-spec
        ;; --settings is mandatory: its value is handed straight to
        ;; call-with-input-file below, so let getopt-long report a missing
        ;; option instead of failing later on #f.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       ;; output and documentation default to #f when not given.
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (call-with-input-file settings
          read)))
  (with-documentation
   (name "Genotype Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; Prefix/namespace pairs for the vocabularies used by the transformer.
   (prefixes
    '(("dct:" "<http://purl.org/dc/terms/>")
      ("gn:" "<http://genenetwork.org/id/>")
      ("gnc:" "<http://genenetwork.org/category/>")
      ("gnt:" "<http://genenetwork.org/term/>")
      ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
      ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
      ("owl:" "<http://www.w3.org/2002/07/owl#>")
      ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
      ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")))
   (inputs
    (list genotypes))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))