1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
|
#! /usr/bin/env guile
!#
(use-modules (srfi srfi-1)
(srfi srfi-26)
(ice-9 getopt-long)
(ice-9 match)
(ice-9 regex)
(transform strings)
(transform sql)
(transform triples)
(transform special-forms))
;; Transformer over the Investigators table: each selected row is
;; described as a foaf:Person with name, homepage and vCard-style
;; address properties.
(define-transformer investigators
  ;; The table contains a few duplicate entries; grouping on Email
  ;; deduplicates them.
  (tables (Investigators)
          "GROUP BY Email")
  (triples
      ;; Subject: identifier built from the first and last name.  The
      ;; third argument is always the empty string here.
      (investigator-attributes->id
       (field Investigators FirstName)
       (field Investigators LastName)
       "")
    (set rdf:type 'foaf:Person)
    ;; Full name is "<FirstName> <LastName>".
    (set foaf:name
         (string-append (field Investigators FirstName)
                        " "
                        (field Investigators LastName)))
    (set foaf:givenName (field Investigators FirstName))
    (set foaf:familyName (field Investigators LastName))
    (set foaf:homepage (field Investigators Url))
    ;; Postal address, one vCard property per column.
    (set v:adr (field Investigators Address))
    (set v:locality (field Investigators City))
    (set v:region (field Investigators State))
    (set v:postal-code (field Investigators ZipCode))
    (set v:country-name (field Investigators Country))))
;; Transformer over GeneChip (left-joined with Species): each row is
;; described as a gnc:gene_chip resource identified by
;; "platform" + "_" + GeneChip.Name.
(define-transformer gene-chip
  (tables (GeneChip
           (left-join Species "USING (SpeciesId)")))
  (schema-triples
   (gnc:gene_chip a skos:Concept)
   (gnc:gene_chip
    skos:description
    "This is a set of controlled terms that are used to describe a given gene chip/platform")
   ;; NOTE(review): gnt:has_geo_series_id is given two domains here
   ;; (gnc:platform and gnc:gene_chip) -- confirm both are intended.
   (gnt:has_geo_series_id rdfs:domain gnc:platform)
   (gnt:has_geo_series_id rdfs:domain gnc:gene_chip)
   (gnt:has_go_tree_value a owl:ObjectProperty)
   ;; The definition text previously lacked its verb ("This resource
   ;; the following ..."); it is emitted verbatim, so it is fixed here.
   (gnt:has_go_tree_value skos:definition "This resource has the following GO tree value")
   (gnt:has_go_tree_value rdfs:domain gnc:gene_chip))
  (triples
      (string->identifier "platform" (field GeneChip Name) #:separator "_")
    (set rdf:type 'gnc:gene_chip)
    (set rdfs:label (field GeneChip GeneChipName))
    (set skos:prefLabel (field GeneChip Name))
    ;; The SQL expression yields Title only when it differs from
    ;; GeneChipName, and NULL otherwise.
    (set skos:altLabel
         (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)"
                 Title)))
    (set gnt:has_go_tree_value (field GeneChip Go_tree_value))
    ;; Species identifier: remapped full name, no prefix, no separator.
    (set xkos:classifiedUnder
         (string->identifier
          ""
          (remap-species-identifiers (field Species Fullname))
          #:separator ""))
    ;; GEO platform accession, whitespace-trimmed, under geoSeries:.
    (set gnt:has_geo_series_id
         (ontology 'geoSeries:
                   (string-trim-both (field GeneChip GeoPlatform))))))
;; Helper for the `info-files' transformer.
;;
;; Returns the object for one of the "documentation link" predicates
;; (summary, citation, notes, ...):
;;
;;   INFO-PAGE-NAME  value of InfoFiles.InfoPageName for the row; every
;;                   character outside [A-Za-z0-9:] is replaced by "_"
;;                   and the result goes through `string-capitalize-first'
;;                   to form the dataset directory name.
;;   DOC-NAME        base name of the .rtf file inside that directory,
;;                   e.g. "summary".
;;   CONTENT         the column that holds the corresponding text; it is
;;                   used only to decide whether a link is emitted.
;;
;; The result is the empty string when CONTENT is the empty list or
;; blank (per `string-blank?'), otherwise a symbol spelling the
;; angle-bracketed gn-docs URL.  The link is formatted before CONTENT
;; is inspected, exactly as the inlined code this replaces did.
;; `string-capitalize-first' and `string-blank?' are not defined in this
;; file -- presumably they come from (transform strings).
(define (dataset-doc-link info-page-name doc-name content)
  (let ((link
         (format
          #f "<https://git.genenetwork.org/gn-docs/tree/general/datasets/~a/~a.rtf>"
          (string-capitalize-first
           (regexp-substitute/global
            #f "[^A-Za-z0-9:]"
            info-page-name
            'pre "_" 'post))
          doc-name)))
    (if (or (null? content) (string-blank? content))
        ""
        (string->symbol link))))

;; Transformer over InfoFiles and the tables joined to it: every row
;; with a non-NULL GN_AccesionId is described as a dcat:Dataset.
(define-transformer info-files
  (tables (InfoFiles
           (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId")
           (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId")
           (left-join Datasets "USING (DatasetId)")
           (left-join DatasetStatus "USING (DatasetStatusId)")
           (left-join Tissue "USING (TissueId)")
           (left-join Investigators "USING (InvestigatorId)")
           (left-join AvgMethod "USING (AvgMethodId)")
           (left-join Organizations "USING (OrganizationId)")
           (left-join GeneChip "USING (GeneChipId)"))
          ;; XXXX: There are datasets that don't have the InbredSetId
          ;; in the InfoFiles table.  The extra joins below let us look
          ;; it up through the (Publish/Geno)Freeze tables instead.
          "LEFT JOIN InbredSet PublishInbredSet ON PublishFreeze.InbredSetId = PublishInbredSet.InbredSetId LEFT JOIN InbredSet GenoInbredSet ON GenoFreeze.InbredSetId = GenoInbredSet.InbredSetId WHERE GN_AccesionId IS NOT NULL")
  ;; NOTE(review): gnt:has_tissue_info is declared twice below with two
  ;; different definitions, the gnt:has_experiment_type definition is
  ;; repeated, gnt:has_experiment_design_info is declared but never set
  ;; in `triples', and gnt:hasSpecifics is set but never declared.  The
  ;; list is left as it was; confirm which entries are intended.
  (schema-triples
   (gnt:has_tissue rdfs:domain dcat:Dataset)
   (gnt:has_tissue a owl:ObjectProperty)
   (gnt:has_tissue skos:definition "Tissues this resource has")
   (gnt:uses_normalization rdfs:domain dcat:Dataset)
   (gnt:uses_normalization a owl:ObjectProperty)
   (gnt:uses_normalization skos:definition "Normalization techniques this resource has")
   (gnt:uses_platform rdfs:domain dcat:Dataset)
   (gnt:uses_platform a owl:ObjectProperty)
   (gnt:uses_platform skos:definition "The Platform this resource uses")
   (gnt:has_geo_series_id rdfs:domain dcat:Dataset)
   (gnt:has_geo_series_id a owl:ObjectProperty)
   (gnt:has_geo_series_id skos:definition "id of record in NCBI database")
   (gnt:has_experiment_type rdfs:domain dcat:Dataset)
   (gnt:has_experiment_type a owl:ObjectProperty)
   (gnt:has_experiment_type rdfs:label "Experiment Type Metadata")
   (gnt:has_experiment_type skos:definition "Information about the experiment type")
   (gnt:has_tissue_info rdfs:domain dcat:Dataset)
   (gnt:has_tissue_info a owl:ObjectProperty)
   (gnt:has_tissue_info skos:definition "Metadata about Tissue for this resource")
   (gnt:has_experiment_design_info rdfs:domain dcat:Dataset)
   (gnt:has_experiment_design_info rdfs:label "Experiment Design")
   (gnt:has_experiment_design_info a owl:ObjectProperty)
   (gnt:has_experiment_design_info skos:definition "Information about how the experiment was designed")
   (gnt:has_notes rdfs:domain dcat:Dataset)
   (gnt:has_notes a owl:ObjectProperty)
   (gnt:has_notes rdfs:label "Notes")
   (gnt:has_notes skos:definition "Extra Notes about this dataset")
   (gnt:has_data_processing_info rdfs:domain dcat:Dataset)
   (gnt:has_data_processing_info rdfs:label "About Data Processing")
   (gnt:has_data_processing_info a owl:ObjectProperty)
   (gnt:has_data_processing_info skos:definition "Information about how this dataset was processed")
   (gnt:has_platform_info rdfs:domain dcat:Dataset)
   (gnt:has_platform_info a owl:ObjectProperty)
   (gnt:has_platform_info rdfs:label "About Platform")
   (gnt:has_platform_info skos:definition "Information about the platform that was used with this dataset")
   (gnt:has_case_info rdfs:domain dcat:Dataset)
   (gnt:has_case_info rdfs:label "About Case")
   (gnt:has_case_info a owl:ObjectProperty)
   (gnt:has_case_info skos:definition "Information about the cases used in this platform")
   (gnt:has_summary rdfs:domain dcat:Dataset)
   (gnt:has_summary rdfs:label "Summary")
   (gnt:has_summary a owl:ObjectProperty)
   (gnt:has_summary skos:definition "Summary information about dataset")
   (gnt:has_citation rdfs:domain dcat:Dataset)
   (gnt:has_citation rdfs:label "Citation")
   (gnt:has_citation a owl:ObjectProperty)
   (gnt:has_citation skos:definition "Citation for this dataset")
   (gnt:has_contributors rdfs:domain dcat:Dataset)
   (gnt:has_contributors rdfs:label "Contributors")
   (gnt:has_contributors a owl:ObjectProperty)
   (gnt:has_contributors skos:definition "Contributors of this resource")
   (gnt:has_experiment_design rdfs:domain dcat:Dataset)
   (gnt:has_experiment_design rdfs:label "Experiment Design")
   (gnt:has_experiment_design a owl:ObjectProperty)
   (gnt:has_experiment_design skos:definition "Experiment Design for this resource")
   (gnt:has_tissue_info rdfs:domain dcat:Dataset)
   (gnt:has_tissue_info rdfs:label "Tissue Information")
   (gnt:has_tissue_info a owl:ObjectProperty)
   (gnt:has_tissue_info skos:definition "Tissue information about dataset")
   (gnt:has_experiment_type skos:definition "Information about the experiment type")
   (gnt:has_acknowledgement rdfs:domain dcat:Dataset)
   (gnt:has_acknowledgement rdfs:label "Acknowledgement")
   (gnt:has_acknowledgement a owl:ObjectProperty)
   (gnt:has_acknowledgement skos:definition "People to acknowledge"))
  (triples
      ;; Subject: the info page name, except that a name which is the
      ;; string "none" (ignoring case and surrounding whitespace) is
      ;; replaced by the InfoFiles title.
      (string->identifier
       "" (let ((info-page-name (field InfoFiles InfoPageName))
                (info-title (field InfoFiles Title)))
            (format #f "~a"
                    (if (and (string? info-page-name)
                             (string=? (string-downcase (string-trim-both info-page-name))
                                       "none"))
                        info-title info-page-name))))
    (set rdf:type 'dcat:Dataset)
    ;; Dataset category, decided in SQL from whichever freeze table
    ;; matched; an empty result is passed through as "".
    (set xkos:classifiedUnder
         (let ([dataset-type
                (string-trim-both
                 (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotype', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotype', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:probeset', '')))"
                         DatasetType)))])
           (if (not (string-null? dataset-type))
               (string->symbol
                dataset-type)
               "")))
    (set rdfs:label (normalize-string-field (field InfoFiles InfoPageName)))
    ;; NOTE(review): unlike dct:created below, this expression does not
    ;; fall back to ProbeSetFreeze -- confirm that is intended.
    (set skos:prefLabel
         (normalize-string-field
          (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
                  DatasetFullName))))
    (set skos:altLabel (field Datasets DatasetName DatasetGroup))
    (set dct:title (normalize-string-field (field Datasets PublicationTitle)))
    (set dct:created
         (normalize-string-field
          (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
                  createTimeGenoFreeze))))
    ;; Same identifier construction as the `investigators' transformer
    ;; uses for its subjects.
    (set dcat:contactPoint
         (investigator-attributes->id (field Investigators FirstName)
                                      (field Investigators LastName)
                                      ""))
    (set foaf:Organization
         (field Organizations OrganizationName))
    (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId)))
    (set dct:accessRights (string-downcase
                           (field DatasetStatus DatasetStatusName)))
    ;; InbredSet name, taken from InfoFiles' own join first and then
    ;; from the PublishFreeze / GenoFreeze side joins.
    (set gnt:has_strain
         (string->identifier
          "set"
          (field ("IFNULL(InbredSet.Name, IFNULL(PublishInbredSet.Name, GenoInbredSet.Name))"
                  InbredSetName))
          #:separator "_"))
    (set gnt:has_tissue (string->identifier "tissue"
                                            (field Tissue Short_Name)
                                            #:separator "_"))
    (set gnt:uses_normalization
         (let ((avg-method (normalize-string-field (field AvgMethod Name AvgMethodName))))
           (if (not (string-blank? avg-method))
               (string->identifier "avg_method" avg-method #:separator "_")
               "")))
    ;; Documentation links.  Each predicate points at
    ;; <dataset directory>/<doc-name>.rtf in gn-docs and is only given a
    ;; link when the named column has content; see `dataset-doc-link'.
    (set gnt:has_summary
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "summary"
                           (field InfoFiles Summary)))
    (set gnt:has_tissue_info
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "tissue"
                           (field Datasets AboutTissue)))
    (set gnt:has_citation
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "citation"
                           (field Datasets Citation)))
    (set gnt:hasSpecifics
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "specifics"
                           (field InfoFiles Specifics)))
    (set gnt:has_case_info
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "cases"
                           (field Datasets AboutCases)))
    (set gnt:has_platform_info
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "platform"
                           (field Datasets AboutPlatform)))
    (set gnt:has_data_processing_info
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "processing"
                           (field Datasets AboutDataProcessing)))
    (set gnt:has_notes
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "notes"
                           (field Datasets Notes)))
    (set gnt:has_experiment_type
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "experiment-type"
                           (field InfoFiles Experiment_Type)))
    (set gnt:has_experiment_design
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "experiment-design"
                           (field Datasets ExperimentDesign)))
    (set gnt:has_contributors
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "contributors"
                           (field Datasets Contributors)))
    (set gnt:has_acknowledgement
         (dataset-doc-link (field InfoFiles InfoPageName)
                           "acknowledgment"
                           (field Datasets Acknowledgment)))
    (set gnt:uses_platform
         (string->identifier "platform"
                             (field GeneChip Name GeneChip)
                             #:separator "_"))
    ;; Only the first "GSE<digits>" match inside Datasets.GeoSeries is
    ;; kept; rows without a match get "".
    (set gnt:has_geo_series_id
         (let ((s
                (string-match "GSE[0-9]*"
                              (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries)))))
           (if s (ontology
                  'geoSeries: (match:substring s))
               "")))))
;; These are phenotype datasets that don't have InfoFiles metadata:
;; the WHERE clause keeps PublishFreeze rows that are public, have
;; confidentiality below 1, and found no partner row in InfoFiles.
(define-transformer publishfreeze
  (tables (PublishFreeze
           (left-join InfoFiles
                      "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join InbredSet
                      "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
          "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")
  (triples
      ;; Subject: identifier derived from the freeze name, no prefix.
      (string->identifier "" (field PublishFreeze Name))
    (set rdf:type 'dcat:Dataset)
    (set xkos:classifiedUnder 'gnc:phenotype)
    (set dct:title (field PublishFreeze FullName))
    (set rdfs:label (field PublishFreeze Name))
    (set skos:altLabel (field PublishFreeze ShortName))
    ;; Creation time, annotated with the xsd:date datatype.
    (set dct:created
         (annotate-field (field PublishFreeze CreateTime)
                         '^^xsd:date))
    ;; Link to the InbredSet the freeze belongs to ("set_<name>").
    (set gnt:has_strain
         (string->identifier "set"
                             (field InbredSet Name InbredSetName)
                             #:separator "_"))))
;; Genotype datasets without InfoFiles metadata: the WHERE clause keeps
;; GenoFreeze rows that are public, have confidentiality below 1, and
;; found no partner row in InfoFiles.
(define-transformer genofreeze
  (tables (GenoFreeze
           (left-join InfoFiles
                      "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join InbredSet
                      "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
          "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
  (triples
      ;; Subject: identifier derived from the freeze name, no prefix.
      (string->identifier "" (field GenoFreeze Name))
    (set rdf:type 'dcat:Dataset)
    (set xkos:classifiedUnder 'gnc:genotype)
    (set rdfs:label (field GenoFreeze Name))
    (set dct:title (field GenoFreeze FullName))
    (set skos:altLabel (field GenoFreeze ShortName))
    ;; Creation time, annotated with the xsd:date datatype.
    (set dct:created
         (annotate-field (field GenoFreeze CreateTime)
                         '^^xsd:date))
    ;; Link to the InbredSet the freeze belongs to.  The identity
    ;; procedure is passed as #:proc.
    ;; NOTE(review): the other transformers in this file call
    ;; string->identifier for "set" without #:proc -- confirm the
    ;; resulting identifiers are meant to differ.
    (set gnt:has_strain
         (string->identifier "set"
                             (field InbredSet Name InbredSetName)
                             #:separator "_"
                             #:proc identity))))
;; Molecular Traits are also referred to as ProbeSets.
;;
;; Transformer over ProbeSetFreeze for datasets without InfoFiles
;; metadata: the WHERE clause keeps public rows that found no partner
;; row in InfoFiles.
;; NOTE(review): the query groups by ProbeFreeze.Id, i.e. it returns
;; one row per ProbeFreeze rather than per ProbeSetFreeze -- confirm
;; that collapsing is intended.
(define-transformer probesetfreeze
  (tables (ProbeSetFreeze
           (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join ProbeFreeze "USING (ProbeFreezeId)")
           (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID")
           (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id")
           (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId"))
          "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id")
  (schema-triples
   (gnt:uses_normalization rdfs:domain gnc:probeset)
   (gnt:uses_data_scale rdfs:domain gnc:probeset)
   (gnt:uses_data_scale a owl:ObjectProperty)
   ;; Definition text corrected from "Thi data scale ...".
   (gnt:uses_data_scale skos:definition "The data scale this resource uses"))
  (triples
      ;; Subject: identifier derived from the freeze name, no prefix.
      (string->identifier "" (field ProbeSetFreeze Name))
    (set rdf:type 'dcat:Dataset)
    (set xkos:classifiedUnder 'gnc:probeset)
    ;; The averaging method name is passed through unchanged; a blank
    ;; name becomes #f.
    (set gnt:uses_normalization
         (let ((avg-method (field AvgMethod Name AvgMethodName)))
           (if (string-blank? avg-method)
               #f
               avg-method)))
    (set dct:title (field ProbeSetFreeze FullName))
    (set rdfs:label (field ProbeSetFreeze ShortName))
    (set skos:prefLabel (field ProbeSetFreeze Name))
    (set skos:altLabel (field ProbeSetFreeze Name2))
    ;; XML Schema datatype names are case-sensitive and there is no
    ;; `xsd:datetime'.  Use xsd:date, the annotation the publishfreeze
    ;; and genofreeze transformers apply to their CreateTime column.
    ;; NOTE(review): assumes ProbeSetFreeze.CreateTime holds a plain
    ;; date like its siblings; if it carries a time of day, switch to
    ;; xsd:dateTime -- confirm against the schema.
    (set dct:created
         (annotate-field (field ProbeSetFreeze CreateTime)
                         '^^xsd:date))
    (set gnt:uses_data_scale (field ProbeSetFreeze DataScale))
    (set gnt:has_tissue
         (string->identifier "tissue"
                             (field Tissue Short_Name)
                             #:separator "_"))
    (set gnt:has_strain
         (string->identifier "set"
                             (field InbredSet Name InbredSetName)
                             #:separator "_"))))
;; Script entry point.
;;
;; Command line:
;;   -s, --settings FILE        file whose first datum is `read' and used
;;                              as the connection settings (required)
;;   -o, --output VALUE         passed on as the #:rdf output
;;   -d, --documentation VALUE  passed on as the #:documentation output
;;
;; Runs the six transformers defined in this file through
;; `with-documentation' with the prefix table below.
(let* ((option-spec
        ;; --settings is marked required so that getopt-long itself
        ;; rejects a command line without it; previously the missing
        ;; value (#f) was handed straight to `call-with-input-file'.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (call-with-input-file settings
          read)))
  (with-documentation
   (name "Info files / Investigators Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   (prefixes
    '(("v:" "<http://www.w3.org/2006/vcard/ns#>")
      ;; FOAF terms live directly under http://xmlns.com/foaf/0.1/ ;
      ;; the former value ended in "#term_", which is the anchor prefix
      ;; of the HTML specification, not the vocabulary namespace.
      ("foaf:" "<http://xmlns.com/foaf/0.1/>")
      ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
      ("dcat:" "<http://www.w3.org/ns/dcat#>")
      ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
      ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>")
      ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>")
      ("gnt:" "<http://rdf.genenetwork.org/v1/term/>")
      ("gn:" "<http://rdf.genenetwork.org/v1/id/>")
      ("gnc:" "<http://rdf.genenetwork.org/v1/category/>")
      ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
      ("owl:" "<http://www.w3.org/2002/07/owl#>")
      ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
      ("taxon:" "<http://purl.uniprot.org/taxonomy/>")
      ("dct:" "<http://purl.org/dc/terms/>")))
   (inputs
    (list info-files
          publishfreeze
          genofreeze
          probesetfreeze
          investigators
          gene-chip))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))
|