diff options
author | Munyoki Kilyungi | 2023-07-19 16:57:20 +0300 |
---|---|---|
committer | Munyoki Kilyungi | 2023-07-21 14:36:42 +0300 |
commit | 50fd5b4a9f2b4c687a59ac94260ab31789aceb00 (patch) | |
tree | 6119a18b2da33ab9939894bbac68ae50d46baa85 | |
parent | 19620d231d13ba93a81382313aa45730afd09acd (diff) | |
download | gn-transform-databases-50fd5b4a9f2b4c687a59ac94260ab31789aceb00.tar.gz |
Use "gn:" and "gn-term:" when dumping phenotypes
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-x | examples/dump-phenotype.scm | 98 |
1 file changed, 50 insertions, 48 deletions
diff --git a/examples/dump-phenotype.scm b/examples/dump-phenotype.scm index 33577ce..924ec9a 100755 --- a/examples/dump-phenotype.scm +++ b/examples/dump-phenotype.scm @@ -18,9 +18,6 @@ (call-with-input-file (list-ref (command-line) 1) read)) -(define %dump-directory - (list-ref (command-line) 2)) - ;; Only dump publish freeze entries that were not dumped from the InfoFiles page @@ -30,25 +27,28 @@ (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId")) "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL") (schema-triples - (gn:datasetOfInbredSet rdfs:range gn:inbredSet) - (gn:name rdfs:range rdfs:Literal) - (gn:fullName rdfs:range rdfs:Literal) - (gn:shortName rdfs:range rdfs:Literal) - (gn:createTime rdfs:range rdfs:Literal) + (gn-term:datasetOfInbredSet rdfs:range gn:inbredSet) + (gn-term:name rdfs:range rdfs:Literal) + (gn-term:fullName rdfs:range rdfs:Literal) + (gn-term:shortName rdfs:range rdfs:Literal) + (gn-term:createTime rdfs:range rdfs:Literal) (gn:phenotypeDataset rdf:subClassOf gn:dataset)) (triples - (ontology 'dataset: - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field PublishFreeze Name) - 'pre "_" 'post)) + (string->identifier + "" + (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field PublishFreeze Name) + 'pre "_" 'post) + #:separator "" + #:proc string-capitalize-first) (set rdf:type 'gn:phenotypeDataset) - (set gn:name (field PublishFreeze Name)) - (set gn:fullName (field PublishFreeze FullName)) - (set gn:shortName (field PublishFreeze ShortName)) - (set dct:created (annotate-field + (set gn-term:name (field PublishFreeze Name)) + (set gn-term:fullName (field PublishFreeze FullName)) + (set gn-term:shortName (field PublishFreeze ShortName)) + (set dc-termt:created (annotate-field (field PublishFreeze CreateTime) '^^xsd:date)) - (set gn:datasetOfInbredSet + (set gn-term:datasetOfInbredSet (string->identifier "inbredSet" (field InbredSet Name InbredSetName))))) 
(define-dump dump-phenotypes @@ -59,48 +59,52 @@ (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name"))) (schema-triples (gn:phenotypeDataset rdfs:subPropertyOf gn:dataset)) - (triples (ontology 'phenotype: - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field ("CONCAT(IF(PublishFreeze.Name IS NULL, '', CONCAT(PublishFreeze.Name, ':')), IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation))" abbrev)) - 'pre "_" 'post)) + (triples (string->identifier + "" + (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field ("CONCAT(IF(PublishFreeze.Name IS NULL, '', CONCAT(PublishFreeze.Name, '_')), IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation))" abbrev)) + 'pre "_" 'post) + #:separator "" + #:proc string-capitalize-first) (set rdf:type 'gn:phenotype) - (set gn:name (sanitize-rdf-string + (set gn-term:name (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation) USING latin1) USING utf8) AS VARCHAR(100))" PhenotypeName)))) ;; There is no row with an empty post-publication description so ;; use this field as the main publication description - (set gn:publicationDescription + (set gn-term:publicationDescription (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Phenotype.Post_publication_description USING latin1) USING utf8) AS CHAR(10000))" postPubDescr)))) - (set gn:originalDescription (sanitize-rdf-string + (set gn-term:originalDescription (sanitize-rdf-string (delete-substrings (field Phenotype Original_description) "Original post publication description: "))) - (set 
gn:prePublicationDescription + (set gn-term:prePublicationDescription (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Phenotype.Pre_publication_description USING latin1) USING utf8) AS VARCHAR(15000))" prePubDesc)))) - (set gn:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation))) - (set gn:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation))) - (set gn:labCode (field Phenotype Lab_code)) - (set gn:submitter (sanitize-rdf-string (field Phenotype Submitter))) - (set gn:owner (sanitize-rdf-string (field Phenotype Owner))) - (set gn:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean)) + (set gn-term:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation))) + (set gn-term:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation))) + (set gn-term:labCode (field Phenotype Lab_code)) + (set gn-term:submitter (sanitize-rdf-string (field Phenotype Submitter))) + (set gn-term:owner (sanitize-rdf-string (field Phenotype Owner))) + (set gn-term:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean)) '^^xsd:double)) - (set gn:locus (field PublishXRef Locus)) - (set gn:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:float)) - (set gn:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:decimal)) - (set gn:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:int)) - (set gn:phenotypeOfDataset - (ontology 'dataset: - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ("IFNULL(InfoFiles.InfoPageName, IFNULL(PublishFreeze.Name, ''))" DatasetName)) - 'pre "_" 'post))) - (set gn:phenotypeOfPublication + (set gn-term:locus (field PublishXRef Locus)) + (set gn-term:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:float)) + (set gn-term:additive (annotate-field (field ("IFNULL(PublishXRef.additive, 
'')" additive)) '^^xsd:decimal)) + (set gn-term:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:int)) + (set gn-term:phenotypeOfDataset + (string->identifier + "" + (field + ("IFNULL(InfoFiles.InfoPageName, IFNULL(PublishFreeze.Name, ''))" DatasetName)) + #:separator "" + #:proc string-capitalize-first)) + (set gn-term:phenotypeOfPublication (let ((pmid (field ("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))" pmid))) @@ -108,7 +112,7 @@ (if (string-null? pmid) (string->identifier "unpublished" (number->string publication-id)) - (ontology 'publication: pmid)))))) + (ontology 'pubmed: pmid)))))) (dump-with-documentation @@ -116,17 +120,15 @@ (connection %connection-settings) (table-metadata? #f) (prefixes - '(("gn-id:" "<http://genenetwork.org/terms/>") + '(("gn:" "<http://genenetwork.org/id/>") ("gn-term:" "<http://genenetwork.org/terms/>") - ("phenotype:" "<http://genenetwork.org/phenotype/>") ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") ("xsd:" "<http://www.w3.org/2001/XMLSchema#>") - ("dataset:" "<http://genenetwork.org/dataset/>") - ("publication:" "<http://genenetwork.org/publication/>"))) + ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>"))) (inputs (list dump-publishfreeze - dump-phenotype)) + dump-phenotypes)) (outputs '(#:documentation "./docs/dump-phenotype.md" #:rdf "./verified-data/dump-phenotype.ttl"))) |