aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Munyoki Kilyungi 2023-12-19 20:34:31 +0300
committer Munyoki Kilyungi 2023-12-22 11:53:22 +0300
commit 82de3420a0c269d79e8942cb18abe247747877dc (patch)
tree e75a8ec09782b51774aa1d2fef3793bec61cadc8
parent 71a9553bd12c848f76fdab63c77a6b00ec3e62e7 (diff)
download gn-transform-databases-82de3420a0c269d79e8942cb18abe247747877dc.tar.gz
Refactor gene metadata RDF transform.
In the case of the GeneList_rn33 table, the table's id column is used to build the gene identifier, since there is no other way to uniquely identify a gene using the other fields. See the following for more details:

https://issues.genenetwork.org/issues/transform-genelist-to-rdf

Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-x examples/genelist.scm 171
1 file changed, 55 insertions, 116 deletions
diff --git a/examples/genelist.scm b/examples/genelist.scm
index 6b8c3e5..7bee085 100755
--- a/examples/genelist.scm
+++ b/examples/genelist.scm
@@ -3,6 +3,7 @@
(use-modules (srfi srfi-1)
(srfi srfi-26)
+ (ice-9 format)
(ice-9 getopt-long)
(ice-9 match)
(ice-9 regex)
@@ -91,8 +92,7 @@
(set dct:description (sanitize-rdf-string (field GeneList GeneDescription)))
(set gnt:hasGeneId (ontology 'gene: (field GeneList GeneId)))
(set dct:references
- (let ((symbol (string-trim-both
- (field GeneList GeneSymbol))))
+ (let ((symbol (field GeneList GeneSymbol)))
(if (not (string-blank? symbol))
(string->symbol
(format #f
@@ -103,8 +103,7 @@
"a gnc:ebiGwasLink"))
"")))
(set dct:references
- (let ((symbol (string-trim-both
- (field GeneList GeneSymbol)))
+ (let ((symbol (field GeneList GeneSymbol))
(geneId (field GeneList GeneID))
(species (field Species Name)))
(if (and (not (string-blank? symbol))
@@ -121,8 +120,7 @@
geneId)))
"")))
(set dct:references
- (let ((symbol (string-trim-both
- (field GeneList GeneSymbol)))
+ (let ((symbol (field GeneList GeneSymbol))
(species (field Species Name)))
(if (and (not (string-blank? symbol))
(not (string-blank? species))
@@ -168,7 +166,7 @@
"a gnc:gemmaLink"))
"")))
(set dct:references
- (let ((symbol (field GeneList GeneID))
+ (let ((symbol (field GeneList GeneSymbol))
(species (lower-case-and-replace-spaces
(field Species FullName))))
(if (and (not (string-blank? symbol))
@@ -186,8 +184,7 @@
"a gnc:genemaniaLink"))
"")))
(set dct:references
- (let ((symbol (string-trim-both
- (field GeneList GeneSymbol))))
+ (let ((symbol (field GeneList GeneSymbol)))
(if (not (string-blank? symbol))
(string->symbol
(format #f
@@ -198,8 +195,7 @@
"a gnc:PantherLink"))
"")))
(set dct:references
- (let ((symbol (string-trim-both
- (field GeneList GeneSymbol))))
+ (let ((symbol (field GeneList GeneSymbol)))
(if (not (string-blank? symbol))
(string->symbol
(format #f
@@ -210,8 +206,7 @@
"a gnc:stringLink"))
"")))
(set dct:references
- (let ((symbol (string-trim-both
- (field GeneList GeneSymbol))))
+ (let ((symbol (field GeneList GeneSymbol)))
(if (not (string-blank? symbol))
(string->symbol
(format #f
@@ -222,8 +217,7 @@
"a gnc:gtexLink"))
"")))
(set dct:references
- (let ((symbol (string-trim-both
- (field GeneList GeneSymbol))))
+ (let ((symbol (field GeneList GeneSymbol)))
(if (not (string-blank? symbol))
(string->symbol
(format #f
@@ -241,74 +235,53 @@
(field GeneList TxEnd)
'^^xsd:double))
(set gnt:Strand (string-trim-both (field GeneList Strand)))
- (multiset
+ (set
gnt:belongsToSpecies
- (map
- (lambda (species)
- (string->identifier
- ""
- (remap-species-identifiers
- (string-trim-both species))
- #:separator ""
- #:proc string-capitalize-first))
- (string-split
- (sanitize-rdf-string
- (field ("GROUP_CONCAT( DISTINCT Species.Name )" SpeciesName)))
- #\,)))
- (multiset
+ (string->identifier
+ ""
+ (remap-species-identifiers
+ (string-trim-both (field Species Name)))
+ #:separator ""
+ #:proc string-capitalize-first))
+ (set
gnt:transcript
- (map
- (lambda (transcript)
- (ontology 'transcript:
- (string-trim-both transcript)))
- (string-split
- (sanitize-rdf-string
- (field ("GROUP_CONCAT( DISTINCT NM_ID )" NMID)))
- #\,)))
- (multiset
- gnt:hasKgID
- (map string-trim-both
- (string-split
- (sanitize-rdf-string
- (field ("GROUP_CONCAT( DISTINCT kgID )" kgID)))
- #\,)))
- (multiset
- gnt:hasUnigenID
- (map string-trim-both
- (string-split
- (sanitize-rdf-string
- (field ("GROUP_CONCAT( DISTINCT UnigenID )" UnigenID)))
- #\,)))
- (multiset
- gnt:hasProteinID
- (map string-trim-both
- (string-split
- (sanitize-rdf-string
- (field ("GROUP_CONCAT( DISTINCT ProteinID )" ProteinID)))
- #\,)))
- (multiset
- gnt:hasAlignID
- (map string-trim-both
- (string-split
- (sanitize-rdf-string
- (field ("GROUP_CONCAT( DISTINCT AlignID )" AlignID)))
- #\,)))
- (multiset
- gnt:hasRgdID
- (map string-trim-both
- (string-split
- (sanitize-rdf-string
- (field ("GROUP_CONCAT( DISTINCT RGD_ID )" RgdID)))
- #\,)))))
+ (ontology 'transcript:
+ (string-trim-both (field GeneList NM_ID))))
+ (set gnt:hasKgID (string-trim-both (field GeneList kgID)))
+ (set gnt:hasUnigenID (string-trim-both (field GeneList UnigenID)))
+ (set gnt:hasProteinID (string-trim-both (field GeneList ProteinID)))
+ (set gnt:hasAlignID (string-trim-both (field GeneList AlignID)))
+ (set gnt:hasRgdID (field GeneList RGD_ID))))
(define-transformer genelist-rn33
(tables (GeneList_rn33))
(triples
- (string->identifier
- "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]"
- (string-trim-both
- (field GeneList_rn33 geneSymbol))
- 'pre "_" 'post))
+ (let ([gene-uid (field GeneList_rn33 id GENE_UID)])
+ (string->identifier
+ "gene_rn33"
+ (if (number? gene-uid)
+ (number->string
+ gene-uid)
+ gene-uid)))
+ (set rdf:type 'gnc:Gene)
+ (set gnt:belongsToSpecies 'gn:Rattus_norvegicus)
+ (set gnt:geneSymbol (string-trim-both (field GeneList_rn33 geneSymbol)))
+ (set gnt:chromosome (field GeneList_rn33 chromosome))
+ (set gnt:TxStart (annotate-field
+ (field GeneList_rn33 txStart)
+ '^^xsd:double))
+ (set gnt:TxEnd (annotate-field
+ (field GeneList_rn33 txEnd)
+ '^^xsd:double))
+ (set gnt:Strand (string-trim-both (field GeneList_rn33 strand)))
+ (set
+ gnt:transcript
+ (ontology
+ 'transcript:
+ (string-trim-both (field GeneList_rn33 NM_ID))))
+ (set
+ gnt:hasKgID
+ (string-trim-both (field GeneList_rn33 kgID)))
(set dct:references
(let ((symbol (field GeneList_rn33 geneSymbol)))
(if (not (string-blank? symbol))
@@ -320,8 +293,7 @@
"a gnc:PantherLink"))
"")))
(set dct:references
- (let ((symbol (string-trim-both
- (field GeneList_rn33 geneSymbol))))
+ (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
(if (not (string-blank? symbol))
(string->symbol
(format #f
@@ -331,8 +303,7 @@
"a gnc:ebiGwasLink"))
"")))
(set dct:references
- (let ((symbol (string-trim-both
- (field GeneList_rn33 geneSymbol))))
+ (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
(if (not (string-blank? symbol))
(string->symbol
(format #f
@@ -343,8 +314,7 @@
"a gnc:stringLink"))
"")))
(set dct:references
- (let ((symbol (string-trim-both
- (field GeneList_rn33 geneSymbol))))
+ (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
(if (not (string-blank? symbol))
(string->symbol
(format #f
@@ -355,8 +325,7 @@
"a gnc:gtexLink"))
"")))
(set dct:references
- (let ((symbol (string-trim-both
- (field GeneList_rn33 geneSymbol))))
+ (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
(if (not (string-blank? symbol))
(string->symbol
(format #f
@@ -365,37 +334,7 @@
(uri-encode
(string-trim-both symbol))
"a gnc:proteinAtlasLink"))
- "")))
- (set rdf:type 'gnc:GeneSymbol)
- (set rdfs:label (string-trim-both
- (string-trim-both
- (field GeneList_rn33 geneSymbol))))
- (set gnt:chromosome (field GeneList_rn33 chromosome))
- (set gnt:TxStart (annotate-field
- (field GeneList_rn33 txStart)
- '^^xsd:double))
- (set gnt:TxEnd (annotate-field
- (field GeneList_rn33 txEnd)
- '^^xsd:double))
- (set gnt:Strand (string-trim-both (field GeneList_rn33 strand)))
- (set gnt:belongsToSpecies 'gn:Rattus_norvegicus)
- (multiset
- gnt:transcript
- (map
- (lambda (transcript)
- (ontology 'transcript:
- (string-trim-both transcript)))
- (string-split
- (sanitize-rdf-string
- (field ("GROUP_CONCAT( DISTINCT NM_ID )" NMID)))
- #\,)))
- (multiset
- gnt:hasKgID
- (map string-trim-both
- (string-split
- (sanitize-rdf-string
- (field ("GROUP_CONCAT( DISTINCT kgID )" kgID)))
- #\,)))))
+ "")))))