Chunk probeset dump

The ProbeSet table has many columns and about 5 million rows, so the dump can be huge. One problem with such a large dump is that rapper fails with an out-of-memory error. This commit splits the dump into chunks to make linting and uploading the data more manageable.

Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
author: Munyoki Kilyungi 2023-06-19 12:07:02 +0300
committer: Munyoki Kilyungi 2023-06-19 12:07:02 +0300
commit: 3beb94e591b2d739bd50f1ceb831bb19784a5a2e (patch)
tree: 11cf6dfc65e1c4162b8676b9f82826c275afae3f /examples
parent: fc29dfad2078b7e9d5616ac0f8594344471c1758 (diff)
download: gn-transform-databases-3beb94e591b2d739bd50f1ceb831bb19784a5a2e.tar.gz
1 files changed, 115 insertions, 23 deletions
diff --git a/examples/dump-probeset.scm b/examples/dump-probeset.scm
index 6b1b7a8..0a6e07b 100755
--- a/examples/dump-probeset.scm
+++ b/examples/dump-probeset.scm
@@ -20,51 +20,143 @@
   (list-ref (command-line) 2))
 
 
-(define-dump dump-probeset
+(define-dump dump-probeset-0
   (tables (ProbeSet
-           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")))
+           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
+          "LIMIT 2000000 OFFSET 0")
   (schema-triples
    (gn:name rdfs:range rdfs:Literal)
    (gn:probeset rdfs:range rdfs:Literal))
   (triples (ontology
             'probeset:
-            (regexp-substitute/global
-             #f "[^A-Za-z0-9:]"
-             (field ("IFNULL(ProbeSet.Name, ProbeSet.Id)"
-                     name))
-             'pre "_" 'post))
+            (string-trim-both
+             (regexp-substitute/global
+              #f "[^A-Za-z0-9:]"
+              (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)"
+                      name))
+              'pre "_" 'post)))
     (set rdf:type 'gn:probeset)
     (set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
     (set gn:name (field ProbeSet Name))
-    (set gn:symbol (field ProbeSet Symbol))
+    (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
     (set gn:description (sanitize-rdf-string
                          (field ProbeSet description)))
     (set gn:chr (field ProbeSet Chr))
     (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
-    (multiset gn:tissue (map string-trim-both
-                             (string-split
-                              (field ("IFNULL(ProbeSet.Tissue, '')" Tissue))
-                              #\,)))
-    (multiset gn:alias (map string-trim-both
-                            (string-split (sanitize-rdf-string (field ProbeSet alias))
-                                          #\;)))
-    (set gn:unigene (field ProbeSet UniGeneId))
-    (set gn:generif (ontology 'generif: (field ProbeSet GeneId)))
-    (set gn:genbank (field ProbeSet GenbankId))
+    ;; For now have the tissue, and alias as one line without
+    ;; splitting to make the dump faster
+    ;; (set gn:tissue (field ("IFNULL(ProbeSet.Tissue, '')" Tissue)))
+    ;; (set gn:alias (field ProbeSet alias))
+    ;; (set gn:generif (ontology 'generif: (field ProbeSet GeneId)))
     (set gn:blatSeq (sanitize-rdf-string
                      (string-trim-both (field ProbeSet BlatSeq))))
     (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
-    (set gn:omim (sanitize-rdf-string (string-trim-both (field ProbeSet OMIM))))
-    (set gn:RefSeq_TranscriptId (field ProbeSet RefSeq_TranscriptId))
+    ;; (set gn:unigene (field ProbeSet UniGeneId))
+    ;; (set gn:genbank (field ProbeSet GenbankId))
+    ;; (set gn:omim (sanitize-rdf-string (string-trim-both (field ProbeSet OMIM))))
+    ;; (set gn:RefSeq_TranscriptId (field ProbeSet RefSeq_TranscriptId))
     (set gn:uniProtReference (ontology 'uniprot:
-                                        (field ProbeSet UniProtID)))))
+                                       (field ProbeSet UniProtID)))))
+
+(define-dump dump-probeset-1
+  (tables (ProbeSet
+           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
+          "LIMIT 2000000 OFFSET 2000000")
+  (schema-triples
+   (gn:name rdfs:range rdfs:Literal)
+   (gn:probeset rdfs:range rdfs:Literal))
+  (triples (ontology
+            'probeset:
+             (string-trim-both
+             (regexp-substitute/global
+              #f "[^A-Za-z0-9:]"
+              (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)"
+                      name))
+              'pre "_" 'post)))
+    (set rdf:type 'gn:probeset)
+    (set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
+    (set gn:name (field ProbeSet Name))
+    (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
+    (set gn:description (sanitize-rdf-string
+                         (field ProbeSet description)))
+    (set gn:chr (field ProbeSet Chr))
+    (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
+    (set gn:blatSeq (sanitize-rdf-string
+                     (string-trim-both (field ProbeSet BlatSeq))))
+    (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
+    (set gn:uniProtReference (ontology 'uniprot:
+                                       (field ProbeSet UniProtID)))))
+(define-dump dump-probeset-2
+  (tables (ProbeSet
+           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
+          "WHERE ProbeSet.Name IS NOT NULL LIMIT 2000000 OFFSET 4000000")
+  (schema-triples
+   (gn:name rdfs:range rdfs:Literal)
+   (gn:probeset rdfs:range rdfs:Literal))
+  (triples (ontology
+            'probeset:
+            (string-trim-both
+             (regexp-substitute/global
+              #f "[^A-Za-z0-9:]"
+              (field ("IFNULL(ProbeSet.Name, ProbeSet.Id)"
+                      name))
+              'pre "_" 'post)))
+    (set rdf:type 'gn:probeset)
+    (set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
+    (set gn:name (field ProbeSet Name))
+    (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
+    (set gn:description (sanitize-rdf-string
+                         (field ProbeSet description)))
+    (set gn:chr (field ProbeSet Chr))
+    (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
+    (set gn:blatSeq (sanitize-rdf-string
+                     (string-trim-both (field ProbeSet BlatSeq))))
+    (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
+    (set gn:uniProtReference (ontology 'uniprot:
+                                       (field ProbeSet UniProtID)))))
 
 
 
 (call-with-target-database
  %connection-settings
  (lambda (db)
-   (with-output-to-file (string-append %dump-directory "dump-probeset.ttl")
+   (with-output-to-file (string-append %dump-directory "dump-probeset-0.ttl")
+     (lambda ()
+       (prefix "dct:" "<http://purl.org/dc/terms/>")
+       (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>")
+       (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
+       (prefix "gn:" "<http://genenetwork.org/>")
+       (prefix "owl:" "<http://www.w3.org/2002/07/owl#>")
+       (prefix "phenotype:" "<http://genenetwork.org/phenotype/>")
+       (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
+       (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+       (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
+       (prefix "uniprot:" "<http://purl.uniprot.org/uniprot/>")
+       (prefix "up:" "<http://purl.uniprot.org/core/>")
+       (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
+       (prefix "probeset:" "<http://genenetwork.org/probeset/>")
+       (newline)
+       (dump-probeset-0 db))
+     #:encoding "utf8")
+   (with-output-to-file (string-append %dump-directory "dump-probeset-1.ttl")
+     (lambda ()
+       (prefix "dct:" "<http://purl.org/dc/terms/>")
+       (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>")
+       (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
+       (prefix "gn:" "<http://genenetwork.org/>")
+       (prefix "owl:" "<http://www.w3.org/2002/07/owl#>")
+       (prefix "phenotype:" "<http://genenetwork.org/phenotype/>")
+       (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
+       (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+       (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
+       (prefix "uniprot:" "<http://purl.uniprot.org/uniprot/>")
+       (prefix "up:" "<http://purl.uniprot.org/core/>")
+       (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
+       (prefix "probeset:" "<http://genenetwork.org/probeset/>")
+       (newline)
+       (dump-probeset-1 db))
+     #:encoding "utf8")
+   (with-output-to-file (string-append %dump-directory "dump-probeset-2.ttl")
      (lambda ()
        (prefix "dct:" "<http://purl.org/dc/terms/>")
        (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>")
@@ -80,5 +172,5 @@
        (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
        (prefix "probeset:" "<http://genenetwork.org/probeset/>")
        (newline)
-       (dump-probeset db))
+       (dump-probeset-2 db))
      #:encoding "utf8")))
author	Munyoki Kilyungi	2023-06-19 12:07:02 +0300
committer	Munyoki Kilyungi	2023-06-19 12:07:02 +0300
commit	3beb94e591b2d739bd50f1ceb831bb19784a5a2e (patch)
tree	11cf6dfc65e1c4162b8676b9f82826c275afae3f /examples
parent	fc29dfad2078b7e9d5616ac0f8594344471c1758 (diff)
download	gn-transform-databases-3beb94e591b2d739bd50f1ceb831bb19784a5a2e.tar.gz