about summary refs log tree commit diff
diff options
context:
space:
mode:
author Munyoki Kilyungi 2023-12-19 20:34:31 +0300
committer Munyoki Kilyungi 2023-12-22 11:53:22 +0300
commit 82de3420a0c269d79e8942cb18abe247747877dc (patch)
tree e75a8ec09782b51774aa1d2fef3793bec61cadc8
parent 71a9553bd12c848f76fdab63c77a6b00ec3e62e7 (diff)
download gn-transform-databases-82de3420a0c269d79e8942cb18abe247747877dc.tar.gz
Refactor gene metadata RDF transform.
In the case of the GeneList_rn33 table, the table's id column is used
to build the gene identifier, since no combination of the other
fields uniquely identifies a gene.  See the following for more details:

    https://issues.genenetwork.org/issues/transform-genelist-to-rdf

Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-x examples/genelist.scm 171
1 files changed, 55 insertions, 116 deletions
diff --git a/examples/genelist.scm b/examples/genelist.scm
index 6b8c3e5..7bee085 100755
--- a/examples/genelist.scm
+++ b/examples/genelist.scm
@@ -3,6 +3,7 @@
 
 (use-modules (srfi srfi-1)
              (srfi srfi-26)
+             (ice-9 format)
              (ice-9 getopt-long)
              (ice-9 match)
              (ice-9 regex)
@@ -91,8 +92,7 @@
     (set dct:description (sanitize-rdf-string (field GeneList GeneDescription)))
     (set gnt:hasGeneId (ontology 'gene: (field GeneList GeneId)))
     (set dct:references
-         (let ((symbol (string-trim-both
-                        (field GeneList GeneSymbol))))
+         (let ((symbol (field GeneList GeneSymbol)))
            (if (not (string-blank? symbol))
                (string->symbol
                 (format #f
@@ -103,8 +103,7 @@
                         "a gnc:ebiGwasLink"))
                "")))
     (set dct:references
-         (let ((symbol (string-trim-both
-                        (field GeneList GeneSymbol)))
+         (let ((symbol (field GeneList GeneSymbol))
                (geneId (field GeneList GeneID))
                (species (field Species Name)))
            (if (and (not (string-blank? symbol))
@@ -121,8 +120,7 @@
                             geneId)))
                "")))
     (set dct:references
-         (let ((symbol (string-trim-both
-                        (field GeneList GeneSymbol)))
+         (let ((symbol (field GeneList GeneSymbol))
                (species (field Species Name)))
            (if (and (not (string-blank? symbol))
                     (not (string-blank? species))
@@ -168,7 +166,7 @@
                         "a gnc:gemmaLink"))
                "")))
     (set dct:references
-         (let ((symbol (field GeneList GeneID))
+         (let ((symbol (field GeneList GeneSymbol))
                (species (lower-case-and-replace-spaces
                          (field Species FullName))))
            (if (and (not (string-blank? symbol))
@@ -186,8 +184,7 @@
                         "a gnc:genemaniaLink"))
                "")))
     (set dct:references
-         (let ((symbol (string-trim-both
-                        (field GeneList GeneSymbol))))
+         (let ((symbol (field GeneList GeneSymbol)))
            (if (not (string-blank? symbol))
                (string->symbol
                 (format #f
@@ -198,8 +195,7 @@
                         "a gnc:PantherLink"))
                "")))
     (set dct:references
-         (let ((symbol (string-trim-both
-                        (field GeneList GeneSymbol))))
+         (let ((symbol (field GeneList GeneSymbol)))
            (if (not (string-blank? symbol))
                (string->symbol
                 (format #f
@@ -210,8 +206,7 @@
                         "a gnc:stringLink"))
                "")))
     (set dct:references
-         (let ((symbol (string-trim-both
-                        (field GeneList GeneSymbol))))
+         (let ((symbol (field GeneList GeneSymbol)))
            (if (not (string-blank? symbol))
                (string->symbol
                 (format #f
@@ -222,8 +217,7 @@
                         "a gnc:gtexLink"))
                "")))
     (set dct:references
-         (let ((symbol (string-trim-both
-                        (field GeneList GeneSymbol))))
+         (let ((symbol (field GeneList GeneSymbol)))
            (if (not (string-blank? symbol))
                (string->symbol
                 (format #f
@@ -241,74 +235,53 @@
                     (field GeneList TxEnd)
                     '^^xsd:double))
     (set gnt:Strand (string-trim-both (field GeneList Strand)))
-    (multiset
+    (set
      gnt:belongsToSpecies
-     (map
-      (lambda (species)
-        (string->identifier
-         ""
-         (remap-species-identifiers
-          (string-trim-both species))
-         #:separator ""
-         #:proc string-capitalize-first))
-      (string-split
-       (sanitize-rdf-string
-        (field ("GROUP_CONCAT( DISTINCT Species.Name )" SpeciesName)))
-       #\,)))
-    (multiset
+     (string->identifier
+      ""
+      (remap-species-identifiers
+       (string-trim-both (field Species Name)))
+      #:separator ""
+      #:proc string-capitalize-first))
+    (set
      gnt:transcript
-     (map
-      (lambda (transcript)
-        (ontology 'transcript:
-                  (string-trim-both transcript)))
-      (string-split
-       (sanitize-rdf-string
-        (field ("GROUP_CONCAT( DISTINCT NM_ID )" NMID)))
-       #\,)))
-    (multiset
-     gnt:hasKgID
-     (map string-trim-both
-          (string-split
-           (sanitize-rdf-string
-            (field ("GROUP_CONCAT( DISTINCT kgID )" kgID)))
-           #\,)))
-    (multiset
-     gnt:hasUnigenID
-     (map string-trim-both
-          (string-split
-           (sanitize-rdf-string
-            (field ("GROUP_CONCAT( DISTINCT UnigenID )" UnigenID)))
-           #\,)))
-    (multiset
-     gnt:hasProteinID
-     (map string-trim-both
-          (string-split
-           (sanitize-rdf-string
-            (field ("GROUP_CONCAT( DISTINCT ProteinID )" ProteinID)))
-           #\,)))
-    (multiset
-     gnt:hasAlignID
-     (map string-trim-both
-          (string-split
-           (sanitize-rdf-string
-            (field ("GROUP_CONCAT( DISTINCT AlignID )" AlignID)))
-           #\,)))
-    (multiset
-     gnt:hasRgdID
-     (map string-trim-both
-          (string-split
-           (sanitize-rdf-string
-            (field ("GROUP_CONCAT( DISTINCT RGD_ID )" RgdID)))
-           #\,)))))
+     (ontology 'transcript:
+               (string-trim-both (field GeneList NM_ID))))
+    (set gnt:hasKgID (string-trim-both (field GeneList kgID)))
+    (set gnt:hasUnigenID (string-trim-both (field GeneList UnigenID)))
+    (set gnt:hasProteinID (string-trim-both (field GeneList ProteinID)))
+    (set gnt:hasAlignID (string-trim-both (field GeneList AlignID)))
+    (set gnt:hasRgdID (field GeneList RGD_ID))))
 
 (define-transformer genelist-rn33
   (tables (GeneList_rn33))
   (triples
-      (string->identifier
-       "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]"
-                                        (string-trim-both
-                                         (field GeneList_rn33 geneSymbol))
-                                        'pre "_" 'post))
+      (let ([gene-uid (field GeneList_rn33 id GENE_UID)])
+        (string->identifier
+         "gene_rn33"
+         (if (number? gene-uid)
+             (number->string
+              gene-uid)
+             gene-uid)))
+    (set rdf:type 'gnc:Gene)
+    (set gnt:belongsToSpecies 'gn:Rattus_norvegicus)
+    (set gnt:geneSymbol (string-trim-both (field GeneList_rn33 geneSymbol)))
+    (set gnt:chromosome (field GeneList_rn33 chromosome))
+    (set gnt:TxStart (annotate-field
+                      (field GeneList_rn33 txStart)
+                      '^^xsd:double))
+    (set gnt:TxEnd (annotate-field
+                    (field GeneList_rn33 txEnd)
+                    '^^xsd:double))
+    (set gnt:Strand (string-trim-both (field GeneList_rn33 strand)))
+    (set
+     gnt:transcript
+     (ontology
+      'transcript:
+      (string-trim-both (field GeneList_rn33 NM_ID))))
+    (set
+     gnt:hasKgID
+     (string-trim-both (field GeneList_rn33 kgID)))
     (set dct:references
          (let ((symbol (field GeneList_rn33 geneSymbol)))
            (if (not (string-blank? symbol))
@@ -320,8 +293,7 @@
                         "a gnc:PantherLink"))
                "")))
     (set dct:references
-         (let ((symbol (string-trim-both
-                        (field GeneList_rn33 geneSymbol))))
+         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
            (if (not (string-blank? symbol))
                (string->symbol
                 (format #f
@@ -331,8 +303,7 @@
                         "a gnc:ebiGwasLink"))
                "")))
     (set dct:references
-         (let ((symbol (string-trim-both
-                        (field GeneList_rn33 geneSymbol))))
+         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
            (if (not (string-blank? symbol))
                (string->symbol
                 (format #f
@@ -343,8 +314,7 @@
                         "a gnc:stringLink"))
                "")))
     (set dct:references
-         (let ((symbol (string-trim-both
-                        (field GeneList_rn33 geneSymbol))))
+         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
            (if (not (string-blank? symbol))
                (string->symbol
                 (format #f
@@ -355,8 +325,7 @@
                         "a gnc:gtexLink"))
                "")))
     (set dct:references
-         (let ((symbol (string-trim-both
-                        (field GeneList_rn33 geneSymbol))))
+         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
            (if (not (string-blank? symbol))
                (string->symbol
                 (format #f
@@ -365,37 +334,7 @@
                         (uri-encode
                          (string-trim-both symbol))
                         "a gnc:proteinAtlasLink"))
-               "")))
-    (set rdf:type 'gnc:GeneSymbol)
-    (set rdfs:label (string-trim-both
-                     (string-trim-both
-                      (field GeneList_rn33 geneSymbol))))
-    (set gnt:chromosome (field GeneList_rn33 chromosome))
-    (set gnt:TxStart (annotate-field
-                      (field GeneList_rn33 txStart)
-                      '^^xsd:double))
-    (set gnt:TxEnd (annotate-field
-                    (field GeneList_rn33 txEnd)
-                    '^^xsd:double))
-    (set gnt:Strand (string-trim-both (field GeneList_rn33 strand)))
-    (set gnt:belongsToSpecies 'gn:Rattus_norvegicus)
-    (multiset
-     gnt:transcript
-     (map
-      (lambda (transcript)
-        (ontology 'transcript:
-                  (string-trim-both transcript)))
-      (string-split
-       (sanitize-rdf-string
-        (field ("GROUP_CONCAT( DISTINCT NM_ID )" NMID)))
-       #\,)))
-    (multiset
-     gnt:hasKgID
-     (map string-trim-both
-          (string-split
-           (sanitize-rdf-string
-            (field ("GROUP_CONCAT( DISTINCT kgID )" kgID)))
-           #\,)))))
+               "")))))