From 877906dc164add37bfff07ebc1d0684ab6a3a333 Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Mon, 14 Aug 2023 18:05:44 +0300 Subject: Update how genotypes are dumped Signed-off-by: Munyoki Kilyungi --- examples/dump-genotype.scm | 91 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 27 deletions(-) diff --git a/examples/dump-genotype.scm b/examples/dump-genotype.scm index 50cafb6..4ac836d 100755 --- a/examples/dump-genotype.scm +++ b/examples/dump-genotype.scm @@ -25,10 +25,6 @@ (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name") (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId")) "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL") - (schema-triples - (gnt:datasetOfInbredSet rdfs:subPropertyOf gnc:inbredSet) - (gnc:genotypeDataset rdfs:subPropertyOf gnc:dataset) - (gnt:shortName rdfs:subPropertyOf gnc:genotypeDataset)) (triples (string->identifier "" @@ -41,59 +37,98 @@ 'pre "_" 'post) #:separator "" #:proc string-capitalize-first) - (set rdf:type 'gnc:genotypeDataset) - (set gnt:name (field GenoFreeze Name)) - (set gnt:fullName (field GenoFreeze FullName)) - (set gnt:shortName (field GenoFreeze ShortName)) + (set rdf:type 'gnc:genotype) + (set rdfs:label (field GenoFreeze Name)) + (set skos:prefLabel (field GenoFreeze FullName)) + (set skos:altLabel (field GenoFreeze ShortName)) (set dct:created (annotate-field (field GenoFreeze CreateTime) '^^xsd:date)) - (set gnt:datasetOfInbredSet + (set gnt:belongsToInbredSet (string->identifier "" (field InbredSet Name InbredSetName))))) (define-dump dump-genotypes (tables (Geno (left-join GenoXRef "ON Geno.Id = GenoXRef.GenoId") (left-join GenoFreeze "ON GenoFreeze.Id = GenoXRef.GenoFreezeId") + (left-join InbredSet "ON InbredSet.InbredSetId = GenoFreeze.InbredSetId") (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name"))) (schema-triples - (gnc:genotype rdfs:range rdfs:Literal) - 
(gnt:genotypeDataset rdfs:subPropertyOf gn:dataset)) + (gnt:chr a owl:ObjectProperty) + (gnt:chr skos:definition "This resource is located on a given chromosome") + (gnt:chr rdfs:domain gnc:genotype) + (gnt:mb a owl:ObjectProperty) + (gnt:mb skos:definition "The size of this resource in Mb") + (gnt:mb rdfs:domain gnc:genotype) + (gnt:mbMm8 a owl:ObjectProperty) + (gnt:mbMm8 skos:definition "TODO") + (gnt:mbMm8 rdfs:domain gnc:genotype) + (gnt:mb2016 a owl:ObjectProperty) + (gnt:mb2016 skos:definition "TODO") + (gnt:mb2016 rdfs:domain gnc:genotype) + (gnt:hasSequence a owl:ObjectProperty) + (gnt:hasSequence skos:definition "This resource has a given sequence") + (gnt:hasSequence rdfs:domain gnc:genotype) + (gnt:hasSource a owl:ObjectProperty) + (gnt:hasSource rdfs:domain gnc:genotype) + (gnt:hasSource skos:definition "This resource was obtained from this given source") + (gnt:hasAltSourceName a owl:ObjectProperty) + (gnt:hasAltSourceName rdfs:domain gnc:genotype) + (gnt:hasAltSourceName + skos:definition + "The alternative name this resource was obtained from") + (gnt:chrNum a owl:ObjectProperty) + (gnt:chrNum rdfs:domain gnc:genotype) + (gnt:chrNum skos:definition "The chromosome number for this resource") + (gnt:cM a owl:ObjectProperty) + (gnt:cM rdfs:domain gnc:genotype) + (gnt:cM skos:definition "The centimorgan for this resource") + (gnt:usedForMapping a owl:ObjectProperty) + (gnt:usedForMapping rdfs:domain gnc:genotype) + (gnt:usedForMapping + skos:definition "This indicates whether this resource is used for mapping")) (triples (string->identifier "" (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field ("CONCAT(IF(GenoFreeze.Name IS NULL, '', CONCAT(GenoFreeze.Name, ':')), Geno.Name)" abbrev)) + (field ("CONCAT(IF(GenoFreeze.Name IS NULL, '', CONCAT(GenoFreeze.Name, '_')), Geno.Name)" abbrev)) 'pre "_" 'post) #:separator "" #:proc string-capitalize-first) (set rdf:type 'gnc:genotype) - (set gnt:name (sanitize-rdf-string (field Geno Name))) - (set 
gnt:markerName (sanitize-rdf-string (field Geno Marker_Name))) + (set skos:prefLabel (sanitize-rdf-string (field Geno Name))) (set gnt:chr (field Geno Chr)) - (set gnt:mb (annotate-field (field ("IFNULL(Geno.Mb, '')" Mb)) '^^xsd:double)) - (set gnt:sequence (field Geno Sequence)) - (set gnt:source (field Geno Source)) - (set gnt:source2 (field Geno Source2)) - (set gnt:genotypeOfDataset + (set gnt:mb (annotate-field + (field ("IFNULL(Geno.Mb, '')" Mb)) '^^xsd:double)) + (set gnt:mbMm8 (annotate-field (field ("IFNULL(Geno.Mb_mm8, '')" Mb_mm8)) + '^^xsd:double)) + (set gnt:mb2016 + (annotate-field (field ("IFNULL(Geno.Mb_2016, '')" Mb_2016)) + '^^xsd:double)) + (set gnt:hasSequence (field Geno Sequence)) + (set gnt:hasSource (field Geno Source)) + ;; Only dump Source2 if it differs from Source + (set gnt:hasAltSourceName + (field ("IF((Source2 = Source), NULL, Source2)" + Source2))) + (set gnt:belongsToDataset (string->identifier "" (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ("IFNULL(GenoFreeze.Name, '')" DatasetName)) - 'pre "_" 'post) + #f "[^A-Za-z0-9:]" + (field ("IFNULL(GenoFreeze.Name, '')" DatasetName)) + 'pre "_" 'post) #:separator "" - #:proc string-capitalize-first) - ) + #:proc string-capitalize-first)) (set gnt:chrNum (annotate-field - (field ("IFNULL(Geno.chr_num, '')" chr_num)) + (field Geno chr_num) '^^xsd:int)) - (set gn:comments (field ("CAST(CONVERT(BINARY CONVERT(Geno.Comments USING latin1) USING utf8) AS VARCHAR(255))" Comments))) + (set rdfs:comment (field Geno Comments)) (set gnt:cM (annotate-field - (field ("IFNULL(GenoXRef.cM, '')" Chr_mm8)) + (field GenoXRef cM) '^^xsd:int)))) @@ -109,6 +144,8 @@ ("gnt:" "") ("rdf:" "") ("rdfs:" "") + ("owl:" "") + ("skos:" "") ("xsd:" ""))) (inputs (list dump-genofreeze -- cgit v1.2.3