From f02867825c04e13b23415a4855d732286032fad5 Mon Sep 17 00:00:00 2001
From: Munyoki Kilyungi
Date: Tue, 9 May 2023 13:09:21 +0300
Subject: Dump phenotypes

Signed-off-by: Munyoki Kilyungi
---
 dump.scm | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/dump.scm b/dump.scm
index 5cfdd72..4a3f37f 100755
--- a/dump.scm
+++ b/dump.scm
@@ -726,6 +726,86 @@ must be remedied."
     (set gn:citation (field Datasets Citation))
     (set gn:acknowledgment (field Datasets Acknowledgment))))
 
+;; Dump phenotypes.  Rows come from Phenotype left-joined to PublishXRef,
+;; Publication, PublishFreeze and InfoFiles.  Each subject is placed in the
+;; phenotype: namespace and is built from the PublishFreeze name plus the
+;; post-publication abbreviation (falling back to the pre-publication
+;; abbreviation, then to Phenotype.Id), with every character outside
+;; [A-Za-z0-9:] replaced by "_".
+;; NOTE(review): this dump was described as covering only phenotypes "not
+;; present in the InfoFiles tables", but unlike dump-genotypes there is no
+;; WHERE clause filtering on InfoFiles here -- confirm which is intended.
+(define-dump dump-phenotypes
+  (tables (Phenotype
+           (left-join PublishXRef "ON Phenotype.Id = PublishXRef.PhenotypeId")
+           (left-join Publication "ON Publication.Id = PublishXRef.PublicationId")
+           (left-join PublishFreeze "ON PublishFreeze.InbredSetId = PublishXRef.InbredSetId")
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")))
+  (schema-triples
+   (gn:phenotypeDataset rdfs:subPropertyOf gn:dataset))
+  (triples (ontology 'phenotype:
+                     (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                               (field ("CONCAT(IF(PublishFreeze.Name IS NULL, '', CONCAT(PublishFreeze.Name, ':')), IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation))" abbrev))
+                                               'pre "_" 'post))
+    (set rdf:type 'gn:phenotype)
+    (set gn:name (sanitize-rdf-string
+                  (field
+                   ("CAST(CONVERT(BINARY CONVERT(CONCAT(IF(PublishFreeze.Name IS NULL, '', CONCAT(PublishFreeze.Name, '-')), IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation)) USING latin1) USING utf8) AS VARCHAR(10000))"
+                    abbrev))))
+    ;; There is no row with an empty post-publication description so
+    ;; use this field as the main publication description
+    (set gn:publicationDescription
+         (sanitize-rdf-string
+          (field ("CAST(CONVERT(BINARY CONVERT(Phenotype.Post_publication_description USING latin1) USING utf8) AS CHAR(10000))"
+                  postPubDescr))))
+    (set gn:originalDescription (sanitize-rdf-string
+                                 (delete-substrings
+                                  (field Phenotype Original_description)
+                                  "Original post publication description: ")))
+    (set gn:prePublicationDescription
+         (sanitize-rdf-string
+          (field
+           ("CAST(CONVERT(BINARY CONVERT(Phenotype.Pre_publication_description USING latin1) USING utf8) AS VARCHAR(15000))"
+            prePubDesc))))
+    (set gn:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation)))
+    (set gn:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation)))
+    (set gn:labCode (field Phenotype Lab_code))
+    (set gn:submitter (sanitize-rdf-string (field Phenotype Submitter)))
+    (set gn:owner (sanitize-rdf-string (field Phenotype Owner)))
+    (set gn:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean))
+                                 '^^xsd:float))
+    (set gn:locus (field PublishXRef Locus))
+    (set gn:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:float))
+    (set gn:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:decimal))
+    (set gn:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:int))
+    (set gn:phenotypeOfDataset (string->identifier "dataset" (field PublishFreeze Name)))
+    (set gn:phenotypeOfPublication
+         (let ((pmid (field
+                      ("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))"
+                       pmid)))
+               (publication-id (field Publication Id)))
+           (if (string-null? pmid)
+               (string->identifier "publication"
+                                   (number->string publication-id))
+               (ontology 'pubmed: pmid))))))
+
+;; Dump the GenoFreeze rows whose Name does not appear in
+;; InfoFiles.InfoPageName, each typed as a gn:genotypeDataset.
+(define-dump dump-genotypes
+  (tables (GenoFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name"))
+          "WHERE GenoFreeze.Name NOT IN (SELECT DISTINCT InfoFiles.InfoPageName FROM InfoFiles)")
+  (schema-triples
+   (gn:genotypeDataset rdfs:subPropertyOf gn:dataset))
+  (triples (string->identifier "dataset"
+                               (field GenoFreeze Name))
+    (set rdf:type 'gn:genotypeDataset)
+    (set gn:name (field GenoFreeze FullName))
+    (set dct:created (annotate-field
+                      (field GenoFreeze CreateTime)
+                      '^^xsd:dateTime))))
+
+
 (define (dump-data-table db table-name data-field)
   (let ((dump-directory (string-append %dump-directory "/" table-name))
         (port #f)
@@ -960,6 +1040,7 @@ is a object."
        (prefix "generif:" "")
        (prefix "xsd:" "")
        (prefix "owl:" "")
+       (prefix "phenotype:" "")
        (newline)
        (dump-genewiki-symbols db)
        (dump-gn-genewiki-entries db)
-- 
cgit v1.2.3