From 1cda2a508f4766d503bd6fed97add9e9885c25ef Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Wed, 9 Aug 2023 18:08:00 +0300 Subject: Update how phenotypes are dumped Signed-off-by: Munyoki Kilyungi --- examples/dump-phenotype.scm | 88 +++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 34 deletions(-) (limited to 'examples') diff --git a/examples/dump-phenotype.scm b/examples/dump-phenotype.scm index cd6ca95..19a8892 100755 --- a/examples/dump-phenotype.scm +++ b/examples/dump-phenotype.scm @@ -20,12 +20,12 @@ -;; Only dump publish freeze entries that were not dumped from the InfoFiles page +;; These are phenotype datasets that don't have Infofile metadata (define-dump dump-publishfreeze (tables (PublishFreeze (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name") (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId")) - "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL") + "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL") (triples (string->identifier "" @@ -51,27 +51,32 @@ (tables (Phenotype (left-join PublishXRef "ON Phenotype.Id = PublishXRef.PhenotypeId") (left-join Publication "ON Publication.Id = PublishXRef.PublicationId") + ;; We need this join so as to construct the trait's skos:altLabel + (left-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") (left-join PublishFreeze "ON PublishFreeze.InbredSetId = PublishXRef.InbredSetId") - (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name"))) + (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")) + ;; Only dump public traits; Ignore "hanging" traits + ;; I.e. 
traits that have no associated vectors + "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND PublishFreeze.Id IS NOT NULL") (schema-triples - (gnt:originalDescription a owl:ObjectProperty) - (gnt:originalDescription rdfs:domain gnc:phenotype) - (gnt:originalDescription skos:definition "The original description of this resource") - (gnt:prePublicationDescription a owl:ObjectProperty) - (gnt:prePublicationDescription rdfs:domain gnc:phenotype) - (gnt:prePublicationDescription skos:definition "The pre publication details of this resource") (gnt:abbreviation a owl:ObjectProperty) (gnt:abbreviation rdfs:domain gnc:phenotype) (gnt:abbreviation skos:definition "The abbreviation used for this resource") - (gnt:labCode rdfs:range rdfs:Literal) - (gnt:submitter rdfs:range rdfs:Literal) - (gnt:owner rdfs:range rdfs:Literal) + (gnt:labCode a owl:ObjectProperty) + (gnt:labCode rdfs:domain gnc:phenotype) + (gnt:submitter a owl:ObjectProperty) + (gnt:submitter rdfs:domain gnc:phenotype) + (gnt:submitter skos:definition "A person who submitted this resource to GN") + (gnt:mean rdfs:domain gnc:phenotype) (gnt:mean rdfs:range xsd:double) - (gnt:LRS rdfs:range xsd:float) + (gnt:LRS rdfs:domain gnc:phenotype) + (gnt:LRS rdfs:range xsd:double) + (gnt:locus rdfs:domain gnc:phenotype) (gnt:locus rdfs:range rdfs:Literal) - (gnt:additive rdfs:range xsd:decimal) - (gnt:sequence rdfs:range rdfs:Literal) - (gnt:phenotypeOfPublication rdfs:range gn-term:pubMedId)) + (gnt:additive rdfs:domain gnc:phenotype) + (gnt:additive rdfs:range xsd:double) + (gnt:sequence rdfs:domain gnc:phenotype) + (gnt:sequence rdfs:range xsd:integer)) (triples (string->identifier "" (regexp-substitute/global #f "[^A-Za-z0-9:]" @@ -80,32 +85,44 @@ #:separator "" #:proc string-capitalize-first) (set rdf:type 'gnc:phenotype) - (set rdfs:label (sanitize-rdf-string - (field - ("IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, 
Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation)" - PhenotypeName)))) - ;; There is no row with an empty post-publication description so - ;; use this field as the main publication description + (set skos:prefLabel (sanitize-rdf-string + (field + ("IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation)" + PhenotypeName)))) + ;; Add an alternative name for this resource. This is how GN + ;; currently labels phenotypes + (set skos:altLabel (field + ("CONCAT(InbredSet.Name, '_', PublishXRef.Id)" + phenotypeAltName))) + ;; All phenotypes have a post-publication description (set dct:description (sanitize-rdf-string - (field Phenotype Post_publication_description))) - (set gnt:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation))) - (set gnt:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation))) + (field Phenotype Post_publication_description))) + ;; All phenotypes have a post-publication abbreviation + (set gnt:abbreviation (field Phenotype Post_publication_abbreviation)) (set gnt:labCode (field Phenotype Lab_code)) - (set gdmt:hasDistributorInfo + (set gnt:submitter (sanitize-rdf-string (field Phenotype Submitter))) - (set gnt:owner (sanitize-rdf-string (field Phenotype Owner))) + (set dct:contributor (sanitize-rdf-string (field Phenotype Owner))) + (multiset dct:contributor (string-split + (sanitize-rdf-string (field Phenotype Owner)) + #\,)) (set gnt:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean)) '^^xsd:double)) (set gnt:locus (field PublishXRef Locus)) - (set gnt:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:double)) - (set gnt:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:double)) - (set gnt:sequence (annotate-field (field 
PublishXRef Sequence) '^^xsd:int)) + (set gnt:LRS (annotate-field + (field ("IFNULL(PublishXRef.LRS, '')" lrs)) + '^^xsd:double)) + (set gnt:additive + (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) + '^^xsd:double)) + (set gnt:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:integer)) (set gnt:belongsToDataset (string->identifier "" - (field - ("IFNULL(InfoFiles.InfoPageName, IFNULL(PublishFreeze.Name, ''))" DatasetName)) + (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field InfoFiles InfoPageName) + 'pre "_" 'post) #:separator "" #:proc string-capitalize-first)) (set dct:isReferencedBy @@ -126,6 +143,7 @@ (prefixes '(("dct:" "") ("gn:" "") + ("owl:" "") ("gnc:" "") ("gnt:" "") ("skos:" "") @@ -134,8 +152,10 @@ ("xsd:" "") ("pubmed:" ""))) (inputs - (list dump-publishfreeze - dump-phenotypes)) + (list + ;; dump-publishfreeze + dump-phenotypes + )) (outputs '(#:documentation "./docs/dump-phenotype.md" #:rdf "./verified-data/dump-phenotype.ttl"))) -- cgit v1.2.3