aboutsummaryrefslogtreecommitdiff
path: root/examples
diff options
context:
space:
mode:
author Munyoki Kilyungi 2023-08-09 18:08:00 +0300
committer Munyoki Kilyungi 2023-08-09 18:08:00 +0300
commit 1cda2a508f4766d503bd6fed97add9e9885c25ef (patch)
tree 0b9e672bc362d6475b4ba7855d2fcf3efbd1d9a6 /examples
parent 9b5ea8f0d7f4d968c7dd4feedcde0288d554f9f0 (diff)
download gn-transform-databases-1cda2a508f4766d503bd6fed97add9e9885c25ef.tar.gz
Update how phenotypes are dumped
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
Diffstat (limited to 'examples')
-rwxr-xr-x examples/dump-phenotype.scm 88
1 files changed, 54 insertions, 34 deletions
diff --git a/examples/dump-phenotype.scm b/examples/dump-phenotype.scm
index cd6ca95..19a8892 100755
--- a/examples/dump-phenotype.scm
+++ b/examples/dump-phenotype.scm
@@ -20,12 +20,12 @@
-;; Only dump publish freeze entries that were not dumped from the InfoFiles page
+;; These are phenotype datasets that don't have Infofile metadata
(define-dump dump-publishfreeze
(tables (PublishFreeze
(left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")
(left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
- "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
+ "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")
(triples
(string->identifier
""
@@ -51,27 +51,32 @@
(tables (Phenotype
(left-join PublishXRef "ON Phenotype.Id = PublishXRef.PhenotypeId")
(left-join Publication "ON Publication.Id = PublishXRef.PublicationId")
+ ;; We need this join so as to construct the trait's skos:altLabel
+ (left-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId")
(left-join PublishFreeze "ON PublishFreeze.InbredSetId = PublishXRef.InbredSetId")
- (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")))
+ (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name"))
+ ;; Only dump public traits; ignore "hanging" traits,
+ ;; i.e. traits that have no associated vectors
+ "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND PublishFreeze.Id IS NOT NULL")
(schema-triples
- (gnt:originalDescription a owl:ObjectProperty)
- (gnt:originalDescription rdfs:domain gnc:phenotype)
- (gnt:originalDescription skos:definition "The original description of this resource")
- (gnt:prePublicationDescription a owl:ObjectProperty)
- (gnt:prePublicationDescription rdfs:domain gnc:phenotype)
- (gnt:prePublicationDescription skos:definition "The pre publication details of this resource")
(gnt:abbreviation a owl:ObjectProperty)
(gnt:abbreviation rdfs:domain gnc:phenotype)
(gnt:abbreviation skos:definition "The abbreviation used for this resource")
- (gnt:labCode rdfs:range rdfs:Literal)
- (gnt:submitter rdfs:range rdfs:Literal)
- (gnt:owner rdfs:range rdfs:Literal)
+ (gnt:labCode a owl:ObjectProperty)
+ (gnt:labCode rdfs:domain gnc:phenotype)
+ (gnt:submitter a owl:ObjectProperty)
+ (gnt:submitter rdfs:domain gnc:phenotype)
+ (gnt:submitter skos:definition "A person who submitted this resource to GN")
+ (gnt:mean rdfs:domain gnc:phenotype)
(gnt:mean rdfs:range xsd:double)
- (gnt:LRS rdfs:range xsd:float)
+ (gnt:LRS rdfs:domain gnc:phenotype)
+ (gnt:LRS rdfs:range xsd:double)
+ (gnt:locus rdfs:domain gnc:phenotype)
(gnt:locus rdfs:range rdfs:Literal)
- (gnt:additive rdfs:range xsd:decimal)
- (gnt:sequence rdfs:range rdfs:Literal)
- (gnt:phenotypeOfPublication rdfs:range gn-term:pubMedId))
+ (gnt:additive rdfs:domain gnc:phenotype)
+ (gnt:additive rdfs:range xsd:double)
+ (gnt:sequence rdfs:domain gnc:phenotype)
+ (gnt:sequence rdfs:range xsd:integer))
(triples (string->identifier
""
(regexp-substitute/global #f "[^A-Za-z0-9:]"
@@ -80,32 +85,44 @@
#:separator ""
#:proc string-capitalize-first)
(set rdf:type 'gnc:phenotype)
- (set rdfs:label (sanitize-rdf-string
- (field
- ("IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation)"
- PhenotypeName))))
- ;; There is no row with an empty post-publication description so
- ;; use this field as the main publication description
+ (set skos:prefLabel (sanitize-rdf-string
+ (field
+ ("IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation)"
+ PhenotypeName))))
+ ;; Add an alternative name for this resource. This is how GN
+ ;; currently labels phenotypes
+ (set skos:altLabel (field
+ ("CONCAT(InbredSet.Name, '_', PublishXRef.Id)"
+ phenotypeAltName)))
+ ;; All phenotypes have a post-publication description
(set dct:description
(sanitize-rdf-string
- (field Phenotype Post_publication_description)))
- (set gnt:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation)))
- (set gnt:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation)))
+ (field Phenotype Post_publication_description)))
+ ;; All phenotypes have a post-publication abbreviation
+ (set gnt:abbreviation (field Phenotype Post_publication_abbreviation))
(set gnt:labCode (field Phenotype Lab_code))
- (set gdmt:hasDistributorInfo
+ (set gnt:submitter
(sanitize-rdf-string (field Phenotype Submitter)))
- (set gnt:owner (sanitize-rdf-string (field Phenotype Owner)))
+ (set dct:contributor (sanitize-rdf-string (field Phenotype Owner)))
+ (multiset dct:contributor (string-split
+ (sanitize-rdf-string (field Phenotype Owner))
+ #\,))
(set gnt:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean))
'^^xsd:double))
(set gnt:locus (field PublishXRef Locus))
- (set gnt:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:double))
- (set gnt:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:double))
- (set gnt:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:int))
+ (set gnt:LRS (annotate-field
+ (field ("IFNULL(PublishXRef.LRS, '')" lrs))
+ '^^xsd:double))
+ (set gnt:additive
+ (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive))
+ '^^xsd:double))
+ (set gnt:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:integer))
(set gnt:belongsToDataset
(string->identifier
""
- (field
- ("IFNULL(InfoFiles.InfoPageName, IFNULL(PublishFreeze.Name, ''))" DatasetName))
+ (regexp-substitute/global #f "[^A-Za-z0-9:]"
+ (field InfoFiles InfoPageName)
+ 'pre "_" 'post)
#:separator ""
#:proc string-capitalize-first))
(set dct:isReferencedBy
@@ -126,6 +143,7 @@
(prefixes
'(("dct:" "<http://purl.org/dc/terms/>")
("gn:" "<http://genenetwork.org/id/>")
+ ("owl:" "<http://www.w3.org/2002/07/owl#>")
("gnc:" "<http://genenetwork.org/category/>")
("gnt:" "<http://genenetwork.org/terms/>")
("skos:" "<http://www.w3.org/2004/02/skos/core#>")
@@ -134,8 +152,10 @@
("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")))
(inputs
- (list dump-publishfreeze
- dump-phenotypes))
+ (list
+ ;; dump-publishfreeze
+ dump-phenotypes
+ ))
(outputs
'(#:documentation "./docs/dump-phenotype.md"
#:rdf "./verified-data/dump-phenotype.ttl")))