about summary refs log tree commit diff
diff options
context:
space:
mode:
author Munyoki Kilyungi 2023-07-19 16:57:20 +0300
committer BonfaceKilz 2023-07-30 12:29:56 +0300
commit d9e8b0ee01d4cdef99d5e23f53bcb34b8cd63d88 (patch)
tree 6119a18b2da33ab9939894bbac68ae50d46baa85
parent bfeeefcd6b6383a5df317441f7e885a4631e5458 (diff)
download gn-transform-databases-d9e8b0ee01d4cdef99d5e23f53bcb34b8cd63d88.tar.gz
Use "gn:" and "gn-term:" when dumping phenotypes
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-x examples/dump-phenotype.scm 98
1 files changed, 50 insertions, 48 deletions
diff --git a/examples/dump-phenotype.scm b/examples/dump-phenotype.scm
index 33577ce..924ec9a 100755
--- a/examples/dump-phenotype.scm
+++ b/examples/dump-phenotype.scm
@@ -18,9 +18,6 @@
(call-with-input-file (list-ref (command-line) 1)
read))
-(define %dump-directory
- (list-ref (command-line) 2))
-
;; Only dump publish freeze entries that were not dumped from the InfoFiles page
@@ -30,25 +27,28 @@
(left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
"WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
(schema-triples
- (gn:datasetOfInbredSet rdfs:range gn:inbredSet)
- (gn:name rdfs:range rdfs:Literal)
- (gn:fullName rdfs:range rdfs:Literal)
- (gn:shortName rdfs:range rdfs:Literal)
- (gn:createTime rdfs:range rdfs:Literal)
+ (gn-term:datasetOfInbredSet rdfs:range gn:inbredSet)
+ (gn-term:name rdfs:range rdfs:Literal)
+ (gn-term:fullName rdfs:range rdfs:Literal)
+ (gn-term:shortName rdfs:range rdfs:Literal)
+ (gn-term:createTime rdfs:range rdfs:Literal)
(gn:phenotypeDataset rdf:subClassOf gn:dataset))
(triples
- (ontology 'dataset:
- (regexp-substitute/global #f "[^A-Za-z0-9:]"
- (field PublishFreeze Name)
- 'pre "_" 'post))
+ (string->identifier
+ ""
+ (regexp-substitute/global #f "[^A-Za-z0-9:]"
+ (field PublishFreeze Name)
+ 'pre "_" 'post)
+ #:separator ""
+ #:proc string-capitalize-first)
(set rdf:type 'gn:phenotypeDataset)
- (set gn:name (field PublishFreeze Name))
- (set gn:fullName (field PublishFreeze FullName))
- (set gn:shortName (field PublishFreeze ShortName))
- (set dct:created (annotate-field
+ (set gn-term:name (field PublishFreeze Name))
+ (set gn-term:fullName (field PublishFreeze FullName))
+ (set gn-term:shortName (field PublishFreeze ShortName))
+ (set dc-termt:created (annotate-field
(field PublishFreeze CreateTime)
'^^xsd:date))
- (set gn:datasetOfInbredSet
+ (set gn-term:datasetOfInbredSet
(string->identifier "inbredSet" (field InbredSet Name InbredSetName)))))
(define-dump dump-phenotypes
@@ -59,48 +59,52 @@
(left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")))
(schema-triples
(gn:phenotypeDataset rdfs:subPropertyOf gn:dataset))
- (triples (ontology 'phenotype:
- (regexp-substitute/global #f "[^A-Za-z0-9:]"
- (field ("CONCAT(IF(PublishFreeze.Name IS NULL, '', CONCAT(PublishFreeze.Name, ':')), IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation))" abbrev))
- 'pre "_" 'post))
+ (triples (string->identifier
+ ""
+ (regexp-substitute/global #f "[^A-Za-z0-9:]"
+ (field ("CONCAT(IF(PublishFreeze.Name IS NULL, '', CONCAT(PublishFreeze.Name, '_')), IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation))" abbrev))
+ 'pre "_" 'post)
+ #:separator ""
+ #:proc string-capitalize-first)
(set rdf:type 'gn:phenotype)
- (set gn:name (sanitize-rdf-string
+ (set gn-term:name (sanitize-rdf-string
(field
("CAST(CONVERT(BINARY CONVERT(IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation) USING latin1) USING utf8) AS VARCHAR(100))"
PhenotypeName))))
;; There is no row with an empty post-publication description so
;; use this field as the main publication description
- (set gn:publicationDescription
+ (set gn-term:publicationDescription
(sanitize-rdf-string
(field ("CAST(CONVERT(BINARY CONVERT(Phenotype.Post_publication_description USING latin1) USING utf8) AS CHAR(10000))"
postPubDescr))))
- (set gn:originalDescription (sanitize-rdf-string
+ (set gn-term:originalDescription (sanitize-rdf-string
(delete-substrings
(field Phenotype Original_description)
"Original post publication description: ")))
- (set gn:prePublicationDescription
+ (set gn-term:prePublicationDescription
(sanitize-rdf-string
(field
("CAST(CONVERT(BINARY CONVERT(Phenotype.Pre_publication_description USING latin1) USING utf8) AS VARCHAR(15000))"
prePubDesc))))
- (set gn:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation)))
- (set gn:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation)))
- (set gn:labCode (field Phenotype Lab_code))
- (set gn:submitter (sanitize-rdf-string (field Phenotype Submitter)))
- (set gn:owner (sanitize-rdf-string (field Phenotype Owner)))
- (set gn:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean))
+ (set gn-term:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation)))
+ (set gn-term:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation)))
+ (set gn-term:labCode (field Phenotype Lab_code))
+ (set gn-term:submitter (sanitize-rdf-string (field Phenotype Submitter)))
+ (set gn-term:owner (sanitize-rdf-string (field Phenotype Owner)))
+ (set gn-term:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean))
'^^xsd:double))
- (set gn:locus (field PublishXRef Locus))
- (set gn:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:float))
- (set gn:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:decimal))
- (set gn:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:int))
- (set gn:phenotypeOfDataset
- (ontology 'dataset:
- (regexp-substitute/global
- #f "[^A-Za-z0-9:]"
- (field ("IFNULL(InfoFiles.InfoPageName, IFNULL(PublishFreeze.Name, ''))" DatasetName))
- 'pre "_" 'post)))
- (set gn:phenotypeOfPublication
+ (set gn-term:locus (field PublishXRef Locus))
+ (set gn-term:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:float))
+ (set gn-term:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:decimal))
+ (set gn-term:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:int))
+ (set gn-term:phenotypeOfDataset
+ (string->identifier
+ ""
+ (field
+ ("IFNULL(InfoFiles.InfoPageName, IFNULL(PublishFreeze.Name, ''))" DatasetName))
+ #:separator ""
+ #:proc string-capitalize-first))
+ (set gn-term:phenotypeOfPublication
(let ((pmid (field
("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))"
pmid)))
@@ -108,7 +112,7 @@
(if (string-null? pmid)
(string->identifier "unpublished"
(number->string publication-id))
- (ontology 'publication: pmid))))))
+ (ontology 'pubmed: pmid))))))
(dump-with-documentation
@@ -116,17 +120,15 @@
(connection %connection-settings)
(table-metadata? #f)
(prefixes
- '(("gn-id:" "<http://genenetwork.org/terms/>")
+ '(("gn:" "<http://genenetwork.org/id/>")
("gn-term:" "<http://genenetwork.org/terms/>")
- ("phenotype:" "<http://genenetwork.org/phenotype/>")
("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
- ("dataset:" "<http://genenetwork.org/dataset/>")
- ("publication:" "<http://genenetwork.org/publication/>")))
+ ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")))
(inputs
(list dump-publishfreeze
- dump-phenotype))
+ dump-phenotypes))
(outputs
'(#:documentation "./docs/dump-phenotype.md"
#:rdf "./verified-data/dump-phenotype.ttl")))