From 50fd5b4a9f2b4c687a59ac94260ab31789aceb00 Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Wed, 19 Jul 2023 16:57:20 +0300 Subject: Use "gn:" and "gn-term:" when dumping phenotypes Signed-off-by: Munyoki Kilyungi --- examples/dump-phenotype.scm | 98 +++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 48 deletions(-) (limited to 'examples') diff --git a/examples/dump-phenotype.scm b/examples/dump-phenotype.scm index 33577ce..924ec9a 100755 --- a/examples/dump-phenotype.scm +++ b/examples/dump-phenotype.scm @@ -18,9 +18,6 @@ (call-with-input-file (list-ref (command-line) 1) read)) -(define %dump-directory - (list-ref (command-line) 2)) - ;; Only dump publish freeze entries that were not dumped from the InfoFiles page @@ -30,25 +27,28 @@ (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId")) "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL") (schema-triples - (gn:datasetOfInbredSet rdfs:range gn:inbredSet) - (gn:name rdfs:range rdfs:Literal) - (gn:fullName rdfs:range rdfs:Literal) - (gn:shortName rdfs:range rdfs:Literal) - (gn:createTime rdfs:range rdfs:Literal) + (gn-term:datasetOfInbredSet rdfs:range gn:inbredSet) + (gn-term:name rdfs:range rdfs:Literal) + (gn-term:fullName rdfs:range rdfs:Literal) + (gn-term:shortName rdfs:range rdfs:Literal) + (gn-term:createTime rdfs:range rdfs:Literal) (gn:phenotypeDataset rdf:subClassOf gn:dataset)) (triples - (ontology 'dataset: - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field PublishFreeze Name) - 'pre "_" 'post)) + (string->identifier + "" + (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field PublishFreeze Name) + 'pre "_" 'post) + #:separator "" + #:proc string-capitalize-first) (set rdf:type 'gn:phenotypeDataset) - (set gn:name (field PublishFreeze Name)) - (set gn:fullName (field PublishFreeze FullName)) - (set gn:shortName (field PublishFreeze ShortName)) - (set dct:created (annotate-field + (set gn-term:name (field PublishFreeze Name)) + (set gn-term:fullName (field PublishFreeze FullName)) + (set gn-term:shortName (field PublishFreeze ShortName)) + (set dc-termt:created (annotate-field (field PublishFreeze CreateTime) '^^xsd:date)) - (set gn:datasetOfInbredSet + (set gn-term:datasetOfInbredSet (string->identifier "inbredSet" (field InbredSet Name InbredSetName))))) (define-dump dump-phenotypes @@ -59,48 +59,52 @@ (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name"))) (schema-triples (gn:phenotypeDataset rdfs:subPropertyOf gn:dataset)) - (triples (ontology 'phenotype: - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field ("CONCAT(IF(PublishFreeze.Name IS NULL, '', CONCAT(PublishFreeze.Name, ':')), IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation))" abbrev)) - 'pre "_" 'post)) + (triples (string->identifier + "" + (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field ("CONCAT(IF(PublishFreeze.Name IS NULL, '', CONCAT(PublishFreeze.Name, '_')), IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation))" abbrev)) + 'pre "_" 'post) + #:separator "" + #:proc string-capitalize-first) (set rdf:type 'gn:phenotype) - (set gn:name (sanitize-rdf-string + (set gn-term:name (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation) USING latin1) USING utf8) AS VARCHAR(100))" PhenotypeName)))) ;; There is no row with an empty post-publication description so ;; use this field as the main publication description - (set gn:publicationDescription + (set gn-term:publicationDescription (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Phenotype.Post_publication_description USING latin1) USING utf8) AS CHAR(10000))" postPubDescr)))) - (set gn:originalDescription (sanitize-rdf-string + (set gn-term:originalDescription (sanitize-rdf-string (delete-substrings (field Phenotype Original_description) "Original post publication description: "))) - (set gn:prePublicationDescription + (set gn-term:prePublicationDescription (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Phenotype.Pre_publication_description USING latin1) USING utf8) AS VARCHAR(15000))" prePubDesc)))) - (set gn:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation))) - (set gn:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation))) - (set gn:labCode (field Phenotype Lab_code)) - (set gn:submitter (sanitize-rdf-string (field Phenotype Submitter))) - (set gn:owner (sanitize-rdf-string (field Phenotype Owner))) - (set gn:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean)) + (set gn-term:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation))) + (set gn-term:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation))) + (set gn-term:labCode (field Phenotype Lab_code)) + (set gn-term:submitter (sanitize-rdf-string (field Phenotype Submitter))) + (set gn-term:owner (sanitize-rdf-string (field Phenotype Owner))) + (set gn-term:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean)) '^^xsd:double)) - (set gn:locus (field PublishXRef Locus)) - (set gn:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:float)) - (set gn:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:decimal)) - (set gn:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:int)) - (set gn:phenotypeOfDataset - (ontology 'dataset: - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ("IFNULL(InfoFiles.InfoPageName, IFNULL(PublishFreeze.Name, ''))" DatasetName)) - 'pre "_" 'post))) - (set gn:phenotypeOfPublication + (set gn-term:locus (field PublishXRef Locus)) + (set gn-term:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:float)) + (set gn-term:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:decimal)) + (set gn-term:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:int)) + (set gn-term:phenotypeOfDataset + (string->identifier + "" + (field + ("IFNULL(InfoFiles.InfoPageName, IFNULL(PublishFreeze.Name, ''))" DatasetName)) + #:separator "" + #:proc string-capitalize-first)) + (set gn-term:phenotypeOfPublication (let ((pmid (field ("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))" pmid))) @@ -108,7 +112,7 @@ (if (string-null? pmid) (string->identifier "unpublished" (number->string publication-id)) - (ontology 'publication: pmid)))))) + (ontology 'pubmed: pmid)))))) (dump-with-documentation @@ -116,17 +120,15 @@ (connection %connection-settings) (table-metadata? #f) (prefixes - '(("gn-id:" "") + '(("gn:" "") ("gn-term:" "") - ("phenotype:" "") ("rdf:" "") ("rdfs:" "") ("xsd:" "") - ("dataset:" "") - ("publication:" ""))) + ("pubmed:" ""))) (inputs (list dump-publishfreeze - dump-phenotype)) + dump-phenotypes)) (outputs '(#:documentation "./docs/dump-phenotype.md" #:rdf "./verified-data/dump-phenotype.ttl"))) -- cgit v1.2.3