aboutsummaryrefslogtreecommitdiff
path: root/examples/dump-species-metadata.scm
blob: 5c9ef494fe4ce2c520b463c8cb6b8911106d489d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#! /usr/bin/env guile
!#

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 match)
             (ice-9 regex)
             (dump strings)
             (dump sql)
             (dump triples)
             (dump special-forms))



;; Database connection settings, read as a single Scheme datum from the
;; file named by the first command-line argument.  When that argument is
;; missing, print a usage message and exit with a non-zero status rather
;; than failing with an opaque list-ref range error.
(define %connection-settings
  (match (command-line)
    ;; (PROGRAM SETTINGS-FILE . REST) -- any further arguments are ignored,
    ;; as they were before.
    ((_ settings-file . _)
     (call-with-input-file settings-file read))
    (_
     (display "Usage: dump-species-metadata.scm CONNECTION-SETTINGS-FILE"
              (current-error-port))
     (newline (current-error-port))
     (exit 1))))



(define (remap-species-identifiers str)
  "Return the standard binomial name for STR when STR is one of the
known non-standard species labels; return STR unchanged otherwise.
This is a stop-gap: the labels should really be corrected in the
database."
  (let ((known (assoc str
                      '(("Fly (Drosophila melanogaster dm6)"
                         . "Drosophila melanogaster")
                        ("Oryzias latipes (Japanese medaka)"
                         . "Oryzias latipes")
                        ("Monkey (Macaca nemestrina)"
                         . "Macaca nemestrina")
                        ("Bat (Glossophaga soricina)"
                         . "Glossophaga soricina")))))
    ;; assoc compares with equal?, the same comparison match applies to
    ;; literal patterns, so unknown inputs fall through untouched.
    (if known
        (cdr known)
        str)))

(define-dump dump-species
  ;; Emit one RDF node for every row of the Species table.
  (tables (Species))
  (schema-triples
   (gnt:name rdfs:range rdfs:Literal)
   (gnt:displayName rdfs:range rdfs:Literal)
   (gnt:binomialName rdfs:range rdfs:Literal)
   (gnt:family rdfs:range rdfs:Literal))
  ;; The subject is minted with the same expression dump-strain uses for
  ;; the object of its strainOfSpecies triple (remap-species-identifiers
  ;; followed by string->identifier), so that a strain's species link
  ;; names the node emitted here.
  (triples
      (string->identifier "" (remap-species-identifiers (field Species FullName))
                          #:separator ""
                          #:proc string-capitalize-first)
    (set rdf:type 'gnc:species)
    (set gnt:name (field Species SpeciesName))
    (set gnt:displayName (field Species MenuName))
    ;; NOTE(review): the literal is still the raw FullName column, which
    ;; for the labels handled by remap-species-identifiers is not a bare
    ;; binomial -- confirm whether it should be remapped as well.
    (set gnt:binomialName (field Species FullName))
    (set gnt:family (field Species Family))
    (set gnt:organism (ontology 'taxon: (field Species TaxonomyId)))))

(define-dump dump-strain
  ;; Emit one RDF node per Strain row, linked to its species through the
  ;; left join on Species.
  (tables (Strain
           (left-join Species "ON Strain.SpeciesId = Species.SpeciesId")))
  ;; All terms use the gnt: prefix, which is declared in the prefix table
  ;; of the dump-with-documentation form in this file; gn-term: is not
  ;; declared there.  Domain and range name the gnc: classes that the
  ;; species and strain nodes are actually typed with.
  (schema-triples
   (gnt:strainOfSpecies rdfs:domain gnc:strain)
   (gnt:strainOfSpecies rdfs:range gnc:species)
   (gnt:name rdfs:range rdfs:Literal)
   (gnt:alias rdfs:range rdfs:Literal)
   (gnt:symbol rdfs:range rdfs:Literal))
  ;; Subject: the strain name with every character other than ASCII
  ;; letters, digits and ':' replaced by '_'.
  (triples (string->identifier
            ""
            (regexp-substitute/global
             #f "[^A-Za-z0-9:]"
             ;; The SQL converts the name to latin1, takes the raw bytes
             ;; and decodes them as utf8 -- presumably to repair UTF-8
             ;; text stored in a latin1 column; confirm against the schema.
             (field ("CAST(CONVERT(BINARY CONVERT(Strain.Name USING latin1) USING utf8) AS VARCHAR(15000))" StrainName))
             'pre "_" 'post)
            #:separator ""
            #:proc string-capitalize-first)
    (set rdf:type 'gnc:strain)
    (set gnt:strainOfSpecies
         (string->identifier "" (remap-species-identifiers (field Species FullName))
                          #:separator ""
                          #:proc string-capitalize-first))
    ;; Name, and maybe a second name
    (set gnt:name (sanitize-rdf-string (field Strain Name)))
    (set gnt:name2 (sanitize-rdf-string (field Strain Name2)))
    (set gnt:alias (sanitize-rdf-string (field Strain Alias)))
    (set gnt:symbol (field Strain Symbol))))

;; Emit one node per row of the MappingMethod table.  The subject is
;; built by string->identifier from the "mappingMethod" prefix and the
;; row's Name column; the only triple recorded is its rdf:type,
;; gnc:mappingMethod.
(define-dump dump-mapping-method
  (tables (MappingMethod))
  (triples
      (string->identifier "mappingMethod" (field MappingMethod Name))
    (set rdf:type 'gnc:mappingMethod)))

(define-dump dump-inbred-set
  ;; Emit one RDF node per InbredSet row, together with its species and
  ;; mapping method (both reached through left joins).
  (tables (InbredSet
           (left-join Species "ON InbredSet.SpeciesId=Species.Id")
           (left-join MappingMethod
                       "ON InbredSet.MappingMethodId=MappingMethod.Id")))
  ;; All terms use the gnt: prefix, which is declared in the prefix table
  ;; of the dump-with-documentation form in this file; gn-term: is not
  ;; declared there.  Class-valued ranges name the gnc: classes that the
  ;; species and mapping-method nodes are typed with.
  (schema-triples
   (gnt:fullName rdfs:range rdfs:Literal)
   (gnt:geneticType rdfs:range rdfs:Literal)
   (gnt:inbredSetCode rdfs:range rdfs:Literal)
   (gnt:inbredFamily rdfs:range rdfs:Literal)
   (gnt:inbredSetOfSpecies rdfs:range gnc:species)
   (gnt:inbredSetType rdfs:range rdfs:Literal)
   (gnt:phenotype rdfs:range gnt:inbredSetType)
   (gnt:genotype rdfs:range gnt:inbredSetType)
   (gnt:inbredSetOfMappingMethod rdfs:range gnc:mappingMethod))
  (triples (string->identifier
            "" (field InbredSet Name)
            #:separator ""
            #:proc string-capitalize-first)
    (set rdf:type 'gnc:inbredSet)
    ;; InbredSet.FullName is the set's full name, matching the fullName
    ;; term declared in the schema triples above.
    (set gnt:fullName (field InbredSet FullName))
    (set gnt:geneticType (field InbredSet GeneticType))
    (set gnt:inbredFamily (field InbredSet Family))
    ;; NOTE(review): the object is the raw MappingMethod name, not the
    ;; identifier dump-mapping-method mints for that row -- confirm
    ;; whether this should be a node reference instead.
    (set gnt:inbredSetOfMappingMethod (field MappingMethod Name))
    (set gnt:inbredSetCode (field InbredSet InbredSetCode))
    ;; Same identifier construction dump-strain uses for species
    ;; references.  The BinomialName alias keeps this column apart from
    ;; InbredSet.FullName selected above.
    (set gnt:inbredSetOfSpecies
         (string->identifier "" (remap-species-identifiers
                                 (field Species FullName BinomialName))
                             #:separator ""
                             #:proc string-capitalize-first))
    ;; A row in PublishFreeze yields 'Traits and Cofactors', which is
    ;; phenotype data.
    (set gnt:phenotype
         (field ("IF ((SELECT PublishFreeze.Name FROM PublishFreeze WHERE PublishFreeze.InbredSetId = InbredSet.Id LIMIT 1) IS NOT NULL, 'Traits and Cofactors', '')" phenotypeP)))
    ;; A row in GenoFreeze yields 'DNA Markers and SNPs', which is
    ;; genotype data.
    (set gnt:genotype
         (field ("IF ((SELECT GenoFreeze.Name FROM GenoFreeze WHERE GenoFreeze.InbredSetId = InbredSet.Id LIMIT 1) IS NOT NULL, 'DNA Markers and SNPs', '')" genotypeP)))))

(define-dump dump-avg-method
  ;; The Name and Normalization fields seem to be the same.  Name is
  ;; used to build the node identifier; Normalization is emitted as the
  ;; literal value of gnt:normalization.
  (tables (AvgMethod))
  ;; gnt: is the term prefix declared in the prefix table of the
  ;; dump-with-documentation form in this file; gn-term: is not declared
  ;; there.
  (schema-triples
   (gnt:normalization rdfs:range rdfs:Literal))
  (triples (string->identifier "avgmethod" (field AvgMethod Name))
    (set rdf:type 'gnc:avgMethod)
    (set gnt:normalization (field AvgMethod Normalization))))



;; Entry point of the script: hand the dumps defined in this file to
;; dump-with-documentation together with the connection settings, the
;; prefix table and the output locations.
(dump-with-documentation
 (name "Species Metadata")
 ;; Settings read from the file named on the command line.
 (connection %connection-settings)
 (table-metadata? #f)
 ;; Prefix / IRI pairs.  Every prefix used by a listed dump has to
 ;; appear here.
 (prefixes
  '(("gn:" "<http://genenetwork.org/id/>")
    ("gnc:" "<http://genenetwork.org/category/>")
    ("gnt:" "<http://genenetwork.org/term/>")
    ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
    ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
    ("taxon:" "<http://purl.uniprot.org/taxonomy/>")))
 ;; NOTE(review): dump-inbred-set is defined in this file but is not
 ;; listed here -- confirm whether the omission is intentional.
 (inputs
  (list dump-species
        dump-strain
        dump-mapping-method
        dump-avg-method
	))
 ;; Presumably the paths the generated documentation and RDF are
 ;; written to -- confirm against dump-with-documentation.
 (outputs
  '(#:documentation "./docs/dump-species-metadata.md"
    #:rdf "./verified-data/dump-species-metadata.ttl")))