From 3beb94e591b2d739bd50f1ceb831bb19784a5a2e Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Mon, 19 Jun 2023 12:07:02 +0300 Subject: Chunk probeset dump The probeset table has many columns, with about 5Million rows. As such, the dump can be huge. One problem with the dump is that rapper fails with an out-of-memory error. This commit chunks the data to make linting and uploading data more manageable. Signed-off-by: Munyoki Kilyungi --- examples/dump-probeset.scm | 138 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 115 insertions(+), 23 deletions(-) diff --git a/examples/dump-probeset.scm b/examples/dump-probeset.scm index 6b1b7a8..0a6e07b 100755 --- a/examples/dump-probeset.scm +++ b/examples/dump-probeset.scm @@ -20,51 +20,143 @@ (list-ref (command-line) 2)) -(define-dump dump-probeset +(define-dump dump-probeset-0 (tables (ProbeSet - (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))) + (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")) + "LIMIT 2000000 OFFSET 0") (schema-triples (gn:name rdfs:range rdfs:Literal) (gn:probeset rdfs:range rdfs:Literal)) (triples (ontology 'probeset: - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ("IFNULL(ProbeSet.Name, ProbeSet.Id)" - name)) - 'pre "_" 'post)) + (string-trim-both + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)" + name)) + 'pre "_" 'post))) (set rdf:type 'gn:probeset) (set gn:chipOf (string->identifier "platform" (field GeneChip Name))) (set gn:name (field ProbeSet Name)) - (set gn:symbol (field ProbeSet Symbol)) + (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\"")) (set gn:description (sanitize-rdf-string (field ProbeSet description))) (set gn:chr (field ProbeSet Chr)) (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) - (multiset gn:tissue (map string-trim-both - (string-split - (field ("IFNULL(ProbeSet.Tissue, '')" Tissue)) - #\,))) - (multiset gn:alias (map 
string-trim-both - (string-split (sanitize-rdf-string (field ProbeSet alias)) - #\;))) - (set gn:unigene (field ProbeSet UniGeneId)) - (set gn:generif (ontology 'generif: (field ProbeSet GeneId))) - (set gn:genbank (field ProbeSet GenbankId)) + ;; For now have the tissue, and alias as one line without + ;; splitting to make the dump faster + ;; (set gn:tissue (field ("IFNULL(ProbeSet.Tissue, '')" Tissue))) + ;; (set gn:alias (field ProbeSet alias)) + ;; (set gn:generif (ontology 'generif: (field ProbeSet GeneId))) (set gn:blatSeq (sanitize-rdf-string (string-trim-both (field ProbeSet BlatSeq)))) (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq))) - (set gn:omim (sanitize-rdf-string (string-trim-both (field ProbeSet OMIM)))) - (set gn:RefSeq_TranscriptId (field ProbeSet RefSeq_TranscriptId)) + ;; (set gn:unigene (field ProbeSet UniGeneId)) + ;; (set gn:genbank (field ProbeSet GenbankId)) + ;; (set gn:omim (sanitize-rdf-string (string-trim-both (field ProbeSet OMIM)))) + ;; (set gn:RefSeq_TranscriptId (field ProbeSet RefSeq_TranscriptId)) (set gn:uniProtReference (ontology 'uniprot: - (field ProbeSet UniProtID))))) + (field ProbeSet UniProtID))))) + +(define-dump dump-probeset-1 + (tables (ProbeSet + (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")) + "LIMIT 2000000 OFFSET 2000000") + (schema-triples + (gn:name rdfs:range rdfs:Literal) + (gn:probeset rdfs:range rdfs:Literal)) + (triples (ontology + 'probeset: + (string-trim-both + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)" + name)) + 'pre "_" 'post))) + (set rdf:type 'gn:probeset) + (set gn:chipOf (string->identifier "platform" (field GeneChip Name))) + (set gn:name (field ProbeSet Name)) + (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\"")) + (set gn:description (sanitize-rdf-string + (field ProbeSet description))) + (set gn:chr (field ProbeSet Chr)) + (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" 
Mb)) '^^xsd:double))
+   (set gn:blatSeq (sanitize-rdf-string
+                    (string-trim-both (field ProbeSet BlatSeq))))
+   (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
+   (set gn:uniProtReference (ontology 'uniprot:
+                                      (field ProbeSet UniProtID)))))
+(define-dump dump-probeset-2 ; third 2000000-row chunk of the ProbeSet dump
+  (tables (ProbeSet ; NOTE(review): no ORDER BY in any chunk; paging relies on a stable scan order -- confirm
+           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
+          "LIMIT 2000000 OFFSET 4000000") ; no WHERE filter: the offset must count the same row set as chunks 0 and 1
+  (schema-triples
+   (gn:name rdfs:range rdfs:Literal)
+   (gn:probeset rdfs:range rdfs:Literal))
+  (triples (ontology
+            'probeset:
+            (string-trim-both
+             (regexp-substitute/global
+              #f "[^A-Za-z0-9:]"
+              (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)" ; NULL or blank names fall back to the Id, as in chunks 0 and 1
+                      name))
+              'pre "_" 'post)))
+   (set rdf:type 'gn:probeset)
+   (set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
+   (set gn:name (field ProbeSet Name))
+   (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
+   (set gn:description (sanitize-rdf-string
+                        (field ProbeSet description)))
+   (set gn:chr (field ProbeSet Chr))
+   (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
+   (set gn:blatSeq (sanitize-rdf-string
+                    (string-trim-both (field ProbeSet BlatSeq))))
+   (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
+   (set gn:uniProtReference (ontology 'uniprot:
+                                      (field ProbeSet UniProtID)))))
 
 (call-with-target-database
  %connection-settings
  (lambda (db)
-   (with-output-to-file (string-append %dump-directory "dump-probeset.ttl")
+   (with-output-to-file (string-append %dump-directory "dump-probeset-0.ttl")
+     (lambda ()
+       (prefix "dct:" "")
+       (prefix "foaf:" "")
+       (prefix "generif:" "")
+       (prefix "gn:" "")
+       (prefix "owl:" "")
+       (prefix "phenotype:" "")
+       (prefix "pubmed:" "")
+       (prefix "rdf:" "")
+       (prefix "rdfs:" "")
+       (prefix "uniprot:" "")
+       (prefix "up:" "")
+       (prefix "xsd:" "")
+       (prefix "probeset:" "")
+       (newline)
+       (dump-probeset-0 db))
+     #:encoding "utf8")
+   (with-output-to-file (string-append %dump-directory
"dump-probeset-1.ttl") + (lambda () + (prefix "dct:" "") + (prefix "foaf:" "") + (prefix "generif:" "") + (prefix "gn:" "") + (prefix "owl:" "") + (prefix "phenotype:" "") + (prefix "pubmed:" "") + (prefix "rdf:" "") + (prefix "rdfs:" "") + (prefix "uniprot:" "") + (prefix "up:" "") + (prefix "xsd:" "") + (prefix "probeset:" "") + (newline) + (dump-probeset-1 db)) + #:encoding "utf8") + (with-output-to-file (string-append %dump-directory "dump-probeset-2.ttl") (lambda () (prefix "dct:" "") (prefix "foaf:" "") @@ -80,5 +172,5 @@ (prefix "xsd:" "") (prefix "probeset:" "") (newline) - (dump-probeset db)) + (dump-probeset-2 db)) #:encoding "utf8"))) -- cgit v1.2.3