From 82de3420a0c269d79e8942cb18abe247747877dc Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Tue, 19 Dec 2023 20:34:31 +0300 Subject: Refactor gene metadata RDF transform. In the case of the GeneList_rn33 table, the table id is used, since there is no other way to uniquely identify a gene using the other fields. See the following for more details: https://issues.genenetwork.org/issues/transform-genelist-to-rdf Signed-off-by: Munyoki Kilyungi --- examples/genelist.scm | 171 ++++++++++++++++---------------------------------- 1 file changed, 55 insertions(+), 116 deletions(-) (limited to 'examples') diff --git a/examples/genelist.scm b/examples/genelist.scm index 6b8c3e5..7bee085 100755 --- a/examples/genelist.scm +++ b/examples/genelist.scm @@ -3,6 +3,7 @@ (use-modules (srfi srfi-1) (srfi srfi-26) + (ice-9 format) (ice-9 getopt-long) (ice-9 match) (ice-9 regex) @@ -91,8 +92,7 @@ (set dct:description (sanitize-rdf-string (field GeneList GeneDescription))) (set gnt:hasGeneId (ontology 'gene: (field GeneList GeneId))) (set dct:references - (let ((symbol (string-trim-both - (field GeneList GeneSymbol)))) + (let ((symbol (field GeneList GeneSymbol))) (if (not (string-blank? symbol)) (string->symbol (format #f @@ -103,8 +103,7 @@ "a gnc:ebiGwasLink")) ""))) (set dct:references - (let ((symbol (string-trim-both - (field GeneList GeneSymbol))) + (let ((symbol (field GeneList GeneSymbol)) (geneId (field GeneList GeneID)) (species (field Species Name))) (if (and (not (string-blank? symbol)) @@ -121,8 +120,7 @@ geneId))) ""))) (set dct:references - (let ((symbol (string-trim-both - (field GeneList GeneSymbol))) + (let ((symbol (field GeneList GeneSymbol)) (species (field Species Name))) (if (and (not (string-blank? symbol)) (not (string-blank? species)) @@ -168,7 +166,7 @@ "a gnc:gemmaLink")) ""))) (set dct:references - (let ((symbol (field GeneList GeneID)) + (let ((symbol (field GeneList GeneSymbol)) (species (lower-case-and-replace-spaces (field Species FullName)))) (if (and (not (string-blank? symbol)) @@ -186,8 +184,7 @@ "a gnc:genemaniaLink")) ""))) (set dct:references - (let ((symbol (string-trim-both - (field GeneList GeneSymbol)))) + (let ((symbol (field GeneList GeneSymbol))) (if (not (string-blank? symbol)) (string->symbol (format #f @@ -198,8 +195,7 @@ "a gnc:PantherLink")) ""))) (set dct:references - (let ((symbol (string-trim-both - (field GeneList GeneSymbol)))) + (let ((symbol (field GeneList GeneSymbol))) (if (not (string-blank? symbol)) (string->symbol (format #f @@ -210,8 +206,7 @@ "a gnc:stringLink")) ""))) (set dct:references - (let ((symbol (string-trim-both - (field GeneList GeneSymbol)))) + (let ((symbol (field GeneList GeneSymbol))) (if (not (string-blank? symbol)) (string->symbol (format #f @@ -222,8 +217,7 @@ "a gnc:gtexLink")) ""))) (set dct:references - (let ((symbol (string-trim-both - (field GeneList GeneSymbol)))) + (let ((symbol (field GeneList GeneSymbol))) (if (not (string-blank? symbol)) (string->symbol (format #f @@ -241,74 +235,53 @@ (field GeneList TxEnd) '^^xsd:double)) (set gnt:Strand (string-trim-both (field GeneList Strand))) - (multiset + (set gnt:belongsToSpecies - (map - (lambda (species) - (string->identifier - "" - (remap-species-identifiers - (string-trim-both species)) - #:separator "" - #:proc string-capitalize-first)) - (string-split - (sanitize-rdf-string - (field ("GROUP_CONCAT( DISTINCT Species.Name )" SpeciesName))) - #\,))) - (multiset + (string->identifier + "" + (remap-species-identifiers + (string-trim-both (field Species Name))) + #:separator "" + #:proc string-capitalize-first)) + (set gnt:transcript - (map - (lambda (transcript) - (ontology 'transcript: - (string-trim-both transcript))) - (string-split - (sanitize-rdf-string - (field ("GROUP_CONCAT( DISTINCT NM_ID )" NMID))) - #\,))) - (multiset - gnt:hasKgID - (map string-trim-both - (string-split - (sanitize-rdf-string - (field ("GROUP_CONCAT( DISTINCT kgID )" kgID))) - #\,))) - (multiset - gnt:hasUnigenID - (map string-trim-both - (string-split - (sanitize-rdf-string - (field ("GROUP_CONCAT( DISTINCT UnigenID )" UnigenID))) - #\,))) - (multiset - gnt:hasProteinID - (map string-trim-both - (string-split - (sanitize-rdf-string - (field ("GROUP_CONCAT( DISTINCT ProteinID )" ProteinID))) - #\,))) - (multiset - gnt:hasAlignID - (map string-trim-both - (string-split - (sanitize-rdf-string - (field ("GROUP_CONCAT( DISTINCT AlignID )" AlignID))) - #\,))) - (multiset - gnt:hasRgdID - (map string-trim-both - (string-split - (sanitize-rdf-string - (field ("GROUP_CONCAT( DISTINCT RGD_ID )" RgdID))) - #\,))))) + (ontology 'transcript: + (string-trim-both (field GeneList NM_ID)))) + (set gnt:hasKgID (string-trim-both (field GeneList kgID))) + (set gnt:hasUnigenID (string-trim-both (field GeneList UnigenID))) + (set gnt:hasProteinID (string-trim-both (field GeneList ProteinID))) + (set gnt:hasAlignID (string-trim-both (field GeneList AlignID))) + (set gnt:hasRgdID (field GeneList RGD_ID)))) (define-transformer genelist-rn33 (tables (GeneList_rn33)) (triples - (string->identifier - "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]" - (string-trim-both - (field GeneList_rn33 geneSymbol)) - 'pre "_" 'post)) + (let ([gene-uid (field GeneList_rn33 id GENE_UID)]) + (string->identifier + "gene_rn33" + (if (number? gene-uid) + (number->string + gene-uid) + gene-uid))) + (set rdf:type 'gnc:Gene) + (set gnt:belongsToSpecies 'gn:Rattus_norvegicus) + (set gnt:geneSymbol (string-trim-both (field GeneList_rn33 geneSymbol))) + (set gnt:chromosome (field GeneList_rn33 chromosome)) + (set gnt:TxStart (annotate-field + (field GeneList_rn33 txStart) + '^^xsd:double)) + (set gnt:TxEnd (annotate-field + (field GeneList_rn33 txEnd) + '^^xsd:double)) + (set gnt:Strand (string-trim-both (field GeneList_rn33 strand))) + (set + gnt:transcript + (ontology + 'transcript: + (string-trim-both (field GeneList_rn33 NM_ID)))) + (set + gnt:hasKgID + (string-trim-both (field GeneList_rn33 kgID))) (set dct:references (let ((symbol (field GeneList_rn33 geneSymbol))) (if (not (string-blank? symbol)) @@ -320,8 +293,7 @@ "a gnc:PantherLink")) ""))) (set dct:references - (let ((symbol (string-trim-both - (field GeneList_rn33 geneSymbol)))) + (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol)))) (if (not (string-blank? symbol)) (string->symbol (format #f @@ -331,8 +303,7 @@ "a gnc:ebiGwasLink")) ""))) (set dct:references - (let ((symbol (string-trim-both - (field GeneList_rn33 geneSymbol)))) + (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol)))) (if (not (string-blank? symbol)) (string->symbol (format #f @@ -343,8 +314,7 @@ "a gnc:stringLink")) ""))) (set dct:references - (let ((symbol (string-trim-both - (field GeneList_rn33 geneSymbol)))) + (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol)))) (if (not (string-blank? symbol)) (string->symbol (format #f @@ -355,8 +325,7 @@ "a gnc:gtexLink")) ""))) (set dct:references - (let ((symbol (string-trim-both - (field GeneList_rn33 geneSymbol)))) + (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol)))) (if (not (string-blank? symbol)) (string->symbol (format #f @@ -365,37 +334,7 @@ (uri-encode (string-trim-both symbol)) "a gnc:proteinAtlasLink")) - ""))) - (set rdf:type 'gnc:GeneSymbol) - (set rdfs:label (string-trim-both - (string-trim-both - (field GeneList_rn33 geneSymbol)))) - (set gnt:chromosome (field GeneList_rn33 chromosome)) - (set gnt:TxStart (annotate-field - (field GeneList_rn33 txStart) - '^^xsd:double)) - (set gnt:TxEnd (annotate-field - (field GeneList_rn33 txEnd) - '^^xsd:double)) - (set gnt:Strand (string-trim-both (field GeneList_rn33 strand))) - (set gnt:belongsToSpecies 'gn:Rattus_norvegicus) - (multiset - gnt:transcript - (map - (lambda (transcript) - (ontology 'transcript: - (string-trim-both transcript))) - (string-split - (sanitize-rdf-string - (field ("GROUP_CONCAT( DISTINCT NM_ID )" NMID))) - #\,))) - (multiset - gnt:hasKgID - (map string-trim-both - (string-split - (sanitize-rdf-string - (field ("GROUP_CONCAT( DISTINCT kgID )" kgID))) - #\,))))) + ""))))) -- cgit v1.2.3