aboutsummaryrefslogtreecommitdiff
path: root/examples/dump-dataset-metadata.scm
blob: e099bac98ffc16d01696af5a65e5e63b929c3603 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#! /usr/bin/env guile
!#

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 match)
             (ice-9 regex)
             (dump strings)
             (dump sql)
             (dump triples)
             (dump special-forms))



;; Connection settings, read as a single datum from the file named by
;; the first command-line argument.
(define %connection-settings
  (let ((settings-file (list-ref (command-line) 1)))
    (call-with-input-file settings-file read)))



;; Return EMAIL with every space character removed. One email ID in
;; the Investigators table has spaces in it; this normalizes it.
(define (fix-email-id email)
  (let ((space? (lambda (chr) (char=? chr #\space))))
    (string-delete space? email)))

(define (investigator-attributes->id first-name last-name email)
  ;; The identifier is built from the first name, the last name and
  ;; the space-stripped email ID, joined with underscores. The email
  ;; ID alone is not used because there is just one record, "Evan
  ;; Williams", that has no email ID. It would be preferable to find
  ;; Evan Williams' email ID and insert it into the database.
  (let* ((clean-email (fix-email-id email))
         (name-parts (list first-name last-name clean-email))
         (joined (string-join name-parts "_")))
    (string->identifier "investigator" joined)))

;; Dump the Investigators table. Each selected row yields a
;; foaf:Person whose subject is built by investigator-attributes->id
;; from the first name, last name and email, followed by name and
;; address properties.
(define-dump dump-investigators
  ;; There are a few duplicate entries. We group by email to
  ;; deduplicate.
  (tables (Investigators)
          "GROUP BY Email")
  (triples (investigator-attributes->id (field Investigators FirstName)
                                        (field Investigators LastName)
                                        (field Investigators Email))
    (set rdf:type 'foaf:Person)
    ;; Full name is "FirstName LastName", separated by a single space.
    (set foaf:name (string-append (field Investigators FirstName) " "
                                  (field Investigators LastName)))
    (set foaf:givenName
         (field Investigators FirstName))
    (set foaf:familyName
         (field Investigators LastName))
    (set foaf:homepage (field Investigators Url))
    ;; Postal address parts, mapped onto the "v:" prefixed terms.
    (set v:adr (field Investigators Address))
    (set v:locality (field Investigators City))
    (set v:region (field Investigators State))
    (set v:postal-code (field Investigators ZipCode))
    (set v:country-name (field Investigators Country))))

;; Dump dataset metadata. Every InfoFiles row that has a
;; GN_AccesionId becomes one dataset resource. The row is left-joined
;; against the *Freeze tables (by InfoPageName) and against the
;; lookup tables that describe the dataset (inbred set, species,
;; tissue, investigator, normalization method, organization,
;; platform).
(define-dump dump-info-files
  (tables (InfoFiles
           (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId")
           (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId")
           (left-join Datasets "USING (DatasetId)")
           (left-join DatasetStatus "USING (DatasetStatusId)")
           (left-join Tissue "USING (TissueId)")
           (left-join Investigators "USING (InvestigatorId)")
           (left-join AvgMethod "USING (AvgMethodId)")
           (left-join Organizations "USING (OrganizationId)")
           (left-join GeneChip "USING (GeneChipId)"))
          "WHERE GN_AccesionId IS NOT NULL")
  ;; Schema: the dataset class hierarchy, then domain, type, label and
  ;; definition for the gnt: properties used below.
  (schema-triples
   (gnc:dataset rdf:type gdmt:Dataset)
   (gnc:genotypeDataset rdfs:subClassOf gnc:dataset)
   (gnc:phenotypeDataset rdfs:subClassOf gnc:dataset)
   (gnc:probesetDataset rdfs:subClassOf gnc:dataset)
   (gnt:belongsToInbredSet rdfs:domain gnc:dataset)
   (gnt:belongsToInbredSet a owl:ObjectProperty)
   (gnt:belongsToInbredSet skos:definition "The InbredSet this resource belongs to")
   (gnt:hasTissue rdfs:domain gnc:dataset)
   (gnt:hasTissue a owl:ObjectProperty)
   (gnt:hasTissue skos:definition "Tissues this resource has")
   (gnt:hasTissueInfo rdfs:domain gnc:dataset)
   (gnt:hasTissueInfo a owl:ObjectProperty)
   (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource")
   (gnt:usesNormalization rdfs:domain gnc:dataset)
   (gnt:usesNormalization a owl:ObjectProperty)
   (gnt:usesNormalization skos:definition "Normalization techniques this resource has")
   (gnt:usesPlatform rdfs:domain gnc:dataset)
   (gnt:usesPlatform a owl:ObjectProperty)
   (gnt:usesPlatform skos:definition "The Platform this resource uses")
   (gnt:hasGeoSeriesId rdfs:domain gnc:dataset)
   (gnt:hasGeoSeriesId a owl:ObjectProperty)
   (gnt:hasGeoSeriesId skos:definition "id of record in NCBI database")
   (gnt:hasExperimentDesignInfo rdfs:domain gnc:dataset)
   (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design")
   (gnt:hasExperimentDesignInfo a owl:ObjectProperty)
   (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed")
   (gnt:hasNotes rdfs:domain gnc:dataset)
   (gnt:hasNotes a owl:ObjectProperty)
   (gnt:hasNotes rdfs:label "Notes")
   (gnt:hasNotes skos:definition "Extra Notes about this dataset")
   (gnt:hasDataProcessingInfo rdfs:domain gnc:dataset)
   (gnt:hasDataProcessingInfo rdfs:label "About Data Processing")
   (gnt:hasDataProcessingInfo a owl:ObjectProperty)
   (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed")
   (gnt:hasPlatformInfo rdfs:domain gnc:dataset)
   (gnt:hasPlatformInfo a owl:ObjectProperty)
   (gnt:hasPlatformInfo rdfs:label "About Platform")
   (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset")
   (gnt:hasCaseInfo rdfs:domain gnc:dataset)
   (gnt:hasCaseInfo rdfs:label "About Case")
   (gnt:hasCaseInfo a owl:ObjectProperty)
   (gnt:hasCaseInfo skos:definition "Information about the cases used in this platform")
   (gnt:hasAcknowledgement rdfs:domain gnc:dataset)
   (gnt:hasAcknowledgement rdfs:label "Acknowledgement")
   (gnt:hasAcknowledgement a owl:ObjectProperty)
   (gnt:hasAcknowledgement skos:definition "People to acknowledge"))
  ;; The subject is derived from InfoPageName: every character outside
  ;; [A-Za-z0-9:] is replaced with an underscore before the name is
  ;; handed to string->identifier.
  (triples (string->identifier
            "" (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                         (field InfoFiles InfoPageName)
                                         'pre "_" 'post)
            #:separator ""
            #:proc string-capitalize-first)
    ;; The SQL expression picks the most specific class: genotype if a
    ;; GenoFreeze row matched, else phenotype if a PublishFreeze row
    ;; matched, else probeset if a ProbeSetFreeze row matched, else the
    ;; generic gnc:dataset.
    (set rdf:type (string->symbol
                   (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotypeDataset', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:probesetDataset', 'gnc:dataset')))"
                           rdfType))))
    ;; A value that is exactly "None" or "none" is replaced with the
    ;; empty string; anything else passes through unchanged. The same
    ;; substitution is applied to several fields below.
    (set rdfs:label (regexp-substitute/global
                     #f "^[Nn]one$"
                     (field InfoFiles InfoPageName)
                     ""))
    (set skos:prefLabel
         (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
                 DatasetFullName)))
    ;; NOTE(review): skos:prefLabel is set a second time here, from
    ;; Datasets.DatasetName -- confirm that two preferred labels per
    ;; dataset are intended.
    (set skos:prefLabel (field Datasets DatasetName DatasetGroup))
    (set gdmt:hasTitleInfo
         (regexp-substitute/global
          #f "^[Nn]one$"
          (field InfoFiles InfoFileTitle)
          ""))
    ;; This is the published title
    (set dct:title
         (regexp-substitute/global
          #f "^[Nn]one$"
          (field Datasets PublicationTitle)
          ""))
    ;; First non-NULL CreateTime among GenoFreeze, PublishFreeze and
    ;; ProbeSetFreeze, or the empty string if all are NULL.
    (set dct:created
         (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
                 createTimeGenoFreeze)))
    ;; Same identifier construction as the subjects emitted by
    ;; dump-investigators, so the two dumps link up.
    (set gdmt:hasCreatorInfo
         (investigator-attributes->id (field Investigators FirstName)
                                      (field Investigators LastName)
                                      (field Investigators Email)))
    (set gdmt:hasCreatorAffiliation
         (field Organizations OrganizationName))
    ;; Accession IDs are rendered with a "GN" prefix.
    (set gdmt:hasDatasetIdentifierSubType (format #f "GN~a" (field InfoFiles GN_AccesionId)))
    (set gdmt:hasRightsInfo (string-downcase
                             (field DatasetStatus DatasetStatusName)))
    (set gnt:belongsToInbredSet
         (string->identifier "inbredSet" (field InbredSet Name InbredSetName)))
    (set gnt:hasTissue (string->identifier "tissue"
                                           (field Tissue Short_Name)))
    (set gnt:usesNormalization
         (string->identifier "avgmethod"
                             ;; If AvgMethodName is NULL, assume N/A.
                             (if (string-blank? (field AvgMethod Name AvgMethodName))
                                 "N/A" (field AvgMethod Name AvgMethodName))))
    (set gnt:usesPlatform
         (string->identifier "platform"
                             (field GeneChip Name GeneChip)))
    (set gdmt:isDescribedBy
         (sanitize-rdf-string (field Datasets Summary)))
    ;; Extract the first GEO series accession from the GeoSeries
    ;; field. At least one digit is required after "GSE" so that a
    ;; bare "GSE" (for instance inside another word) never produces a
    ;; geoSeries: IRI with no accession number.
    (set gnt:hasGeoSeriesId
         (let ((s
                (string-match "GSE[0-9]+"
                              (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries)))))
           (if s (ontology
                  'geoSeries: (match:substring s))
               "")))
    (set gnt:hasTissueInfo
         (sanitize-rdf-string (field Datasets AboutTissue)))
    (set gnt:hasContentInfo (sanitize-rdf-string (field InfoFiles Specifics)))
    (set gnt:hasCaseInfo
         (sanitize-rdf-string
          (field Datasets AboutCases)))
    (set gnt:hasPlatformInfo
         (sanitize-rdf-string
          (field Datasets AboutPlatform)))
    (set gnt:hasDataProcessingInfo
         (sanitize-rdf-string
          (field Datasets AboutDataProcessing)))
    (set gnt:hasNotes
         (sanitize-rdf-string
          (field Datasets Notes)))
    (set gnt:hasExperimentDesignInfo
         (sanitize-rdf-string
          (field Datasets ExperimentDesign)))
    (set gdmt:hasContributorInfo
         (sanitize-rdf-string
          (field Datasets Contributors)))
    (set gdmt:IsCitedBy
         (sanitize-rdf-string
          (regexp-substitute/global
           #f "^[Nn]one$"
           (field Datasets Citation)
           "")))
    ;; Acknowledgements are taken from two sources: the InfoFiles
    ;; column (trimmed, with "None" blanked) and the Datasets column.
    (set gnt:hasAcknowledgement
         (sanitize-rdf-string
          (string-trim-both
           (regexp-substitute/global
            #f "^[Nn]one$"
            (field InfoFiles Data_Source_Acknowledge)
            ""))))
    (set gnt:hasAcknowledgement (sanitize-rdf-string
                                 (field Datasets Acknowledgment)))))




;; Top-level invocation: passes the two dumps defined in this file to
;; dump-with-documentation together with the connection settings, the
;; namespace prefix table and the output file paths.
(dump-with-documentation
 (name "Info files / Investigators Metadata")
 (connection %connection-settings)
 (table-metadata? #f)
 ;; Prefix -> namespace IRI pairs for the prefixed names used by the
 ;; dumps.
 (prefixes
  '(("v:" "<http://www.w3.org/2006/vcard/ns#>")
    ("foaf:" "<http://xmlns.com/foaf/0.1/>")
    ("gdmt:" "<http://vocab.fairdatacollective.org/gdmt/>")
    ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
    ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>")
    ("gnt:" "<http://genenetwork.org/term/>")
    ("gn:" "<http://genenetwork.org/id/>")
    ("gnc:" "<http://genenetwork.org/category/>")
    ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
    ("owl:" "<http://www.w3.org/2002/07/owl#>")
    ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
    ("taxon:" "<http://purl.uniprot.org/taxonomy/>")
    ("dct:" "<http://purl.org/dc/terms/>")))
 ;; Dataset metadata is listed first, then investigators.
 (inputs
  (list dump-info-files
        dump-investigators))
 ;; Paths are relative, so they resolve against the current working
 ;; directory of the process.
 (outputs
  '(#:documentation "./docs/dump-info-pages.md"
    #:rdf "./verified-data/dump-info-pages.ttl")))