aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMunyoki Kilyungi2023-06-19 12:07:02 +0300
committerMunyoki Kilyungi2023-06-19 12:07:02 +0300
commit3beb94e591b2d739bd50f1ceb831bb19784a5a2e (patch)
tree11cf6dfc65e1c4162b8676b9f82826c275afae3f
parentfc29dfad2078b7e9d5616ac0f8594344471c1758 (diff)
downloadgn-transform-databases-3beb94e591b2d739bd50f1ceb831bb19784a5a2e.tar.gz
Chunk probeset dump
The ProbeSet table has many columns and about 5 million rows, so the dump can be huge. One problem with such a large dump is that rapper fails on it with an out-of-memory error. This commit splits the dump into chunks to make linting and uploading the data more manageable. Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-xexamples/dump-probeset.scm138
1 files changed, 115 insertions, 23 deletions
diff --git a/examples/dump-probeset.scm b/examples/dump-probeset.scm
index 6b1b7a8..0a6e07b 100755
--- a/examples/dump-probeset.scm
+++ b/examples/dump-probeset.scm
@@ -20,51 +20,143 @@
(list-ref (command-line) 2))
-(define-dump dump-probeset
+(define-dump dump-probeset-0 ; chunk 0 of the ProbeSet dump (rows selected by the LIMIT/OFFSET below)
(tables (ProbeSet
- (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")))
+ (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
+ "LIMIT 2000000 OFFSET 0") ; at most 2,000,000 rows starting at offset 0 -- NOTE(review): no ORDER BY, so row order across chunks is not pinned, confirm
(schema-triples
(gn:name rdfs:range rdfs:Literal)
(gn:probeset rdfs:range rdfs:Literal))
(triples (ontology
'probeset:
- (regexp-substitute/global
- #f "[^A-Za-z0-9:]"
- (field ("IFNULL(ProbeSet.Name, ProbeSet.Id)"
- name))
- 'pre "_" 'post))
+ (string-trim-both ; trims the sanitized identifier at both ends
+ (regexp-substitute/global
+ #f "[^A-Za-z0-9:]" ; matches every character other than A-Z, a-z, 0-9 and the colon
+ (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)" ; Name, or Id when Name is NULL or empty after TRIM
+ name))
+ 'pre "_" 'post))) ; each match is replaced with an underscore
(set rdf:type 'gn:probeset)
(set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
(set gn:name (field ProbeSet Name))
- (set gn:symbol (field ProbeSet Symbol))
+ (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
(set gn:description (sanitize-rdf-string
(field ProbeSet description)))
(set gn:chr (field ProbeSet Chr))
(set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
- (multiset gn:tissue (map string-trim-both
- (string-split
- (field ("IFNULL(ProbeSet.Tissue, '')" Tissue))
- #\,)))
- (multiset gn:alias (map string-trim-both
- (string-split (sanitize-rdf-string (field ProbeSet alias))
- #\;)))
- (set gn:unigene (field ProbeSet UniGeneId))
- (set gn:generif (ontology 'generif: (field ProbeSet GeneId)))
- (set gn:genbank (field ProbeSet GenbankId))
+ ;; For now, keep tissue and alias as single, unsplit values (instead of
+ ;; splitting them into several triples) to make the dump faster.
+ ;; (set gn:tissue (field ("IFNULL(ProbeSet.Tissue, '')" Tissue)))
+ ;; (set gn:alias (field ProbeSet alias))
+ ;; (set gn:generif (ontology 'generif: (field ProbeSet GeneId)))
(set gn:blatSeq (sanitize-rdf-string
(string-trim-both (field ProbeSet BlatSeq))))
(set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
- (set gn:omim (sanitize-rdf-string (string-trim-both (field ProbeSet OMIM))))
- (set gn:RefSeq_TranscriptId (field ProbeSet RefSeq_TranscriptId))
+ ;; (set gn:unigene (field ProbeSet UniGeneId))
+ ;; (set gn:genbank (field ProbeSet GenbankId))
+ ;; (set gn:omim (sanitize-rdf-string (string-trim-both (field ProbeSet OMIM))))
+ ;; (set gn:RefSeq_TranscriptId (field ProbeSet RefSeq_TranscriptId))
(set gn:uniProtReference (ontology 'uniprot:
- (field ProbeSet UniProtID)))))
+ (field ProbeSet UniProtID)))))
+
+(define-dump dump-probeset-1 ; chunk 1 of the ProbeSet dump (rows selected by the LIMIT/OFFSET below)
+ (tables (ProbeSet
+ (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
+ "LIMIT 2000000 OFFSET 2000000") ; at most 2,000,000 rows after skipping the first 2,000,000 -- NOTE(review): no ORDER BY, so row order across chunks is not pinned, confirm
+ (schema-triples
+ (gn:name rdfs:range rdfs:Literal)
+ (gn:probeset rdfs:range rdfs:Literal))
+ (triples (ontology
+ 'probeset:
+ (string-trim-both ; trims the sanitized identifier at both ends
+ (regexp-substitute/global
+ #f "[^A-Za-z0-9:]" ; matches every character other than A-Z, a-z, 0-9 and the colon
+ (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)" ; Name, or Id when Name is NULL or empty after TRIM
+ name))
+ 'pre "_" 'post))) ; each match is replaced with an underscore
+ (set rdf:type 'gn:probeset)
+ (set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
+ (set gn:name (field ProbeSet Name))
+ (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
+ (set gn:description (sanitize-rdf-string
+ (field ProbeSet description)))
+ (set gn:chr (field ProbeSet Chr))
+ (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
+ (set gn:blatSeq (sanitize-rdf-string
+ (string-trim-both (field ProbeSet BlatSeq))))
+ (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
+ (set gn:uniProtReference (ontology 'uniprot:
+ (field ProbeSet UniProtID)))))
+(define-dump dump-probeset-2 ; chunk 2 of the ProbeSet dump (rows selected by the LIMIT/OFFSET below)
+ (tables (ProbeSet
+ (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
+ "LIMIT 2000000 OFFSET 4000000") ; no WHERE filter here: the offset has to count rows of the same unfiltered join that chunks 0 and 1 page through, otherwise rows are skipped or dropped
+ (schema-triples
+ (gn:name rdfs:range rdfs:Literal)
+ (gn:probeset rdfs:range rdfs:Literal))
+ (triples (ontology
+ 'probeset:
+ (string-trim-both ; trims the sanitized identifier at both ends
+ (regexp-substitute/global
+ #f "[^A-Za-z0-9:]" ; matches every character other than A-Z, a-z, 0-9 and the colon
+ (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)" ; same key as chunks 0 and 1: Name, or Id when Name is NULL or empty after TRIM
+ name))
+ 'pre "_" 'post))) ; each match is replaced with an underscore
+ (set rdf:type 'gn:probeset)
+ (set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
+ (set gn:name (field ProbeSet Name))
+ (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
+ (set gn:description (sanitize-rdf-string
+ (field ProbeSet description)))
+ (set gn:chr (field ProbeSet Chr))
+ (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
+ (set gn:blatSeq (sanitize-rdf-string
+ (string-trim-both (field ProbeSet BlatSeq))))
+ (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
+ (set gn:uniProtReference (ontology 'uniprot:
+ (field ProbeSet UniProtID)))))
(call-with-target-database ; NOTE(review): presumably connects using %connection-settings and hands the connection to the lambda as db -- confirm
%connection-settings
(lambda (db)
- (with-output-to-file (string-append %dump-directory "dump-probeset.ttl")
+ (with-output-to-file (string-append %dump-directory "dump-probeset-0.ttl") ; chunk 0 gets its own file under %dump-directory
+ (lambda ()
+ (prefix "dct:" "<http://purl.org/dc/terms/>") ; the same prefix declarations are repeated for every chunk file
+ (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>")
+ (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
+ (prefix "gn:" "<http://genenetwork.org/>")
+ (prefix "owl:" "<http://www.w3.org/2002/07/owl#>")
+ (prefix "phenotype:" "<http://genenetwork.org/phenotype/>")
+ (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
+ (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+ (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
+ (prefix "uniprot:" "<http://purl.uniprot.org/uniprot/>")
+ (prefix "up:" "<http://purl.uniprot.org/core/>")
+ (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
+ (prefix "probeset:" "<http://genenetwork.org/probeset/>")
+ (newline)
+ (dump-probeset-0 db)) ; runs the chunk 0 dump inside this file's output thunk
+ #:encoding "utf8")
+ (with-output-to-file (string-append %dump-directory "dump-probeset-1.ttl") ; chunk 1 gets its own file under %dump-directory
+ (lambda ()
+ (prefix "dct:" "<http://purl.org/dc/terms/>")
+ (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>")
+ (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
+ (prefix "gn:" "<http://genenetwork.org/>")
+ (prefix "owl:" "<http://www.w3.org/2002/07/owl#>")
+ (prefix "phenotype:" "<http://genenetwork.org/phenotype/>")
+ (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
+ (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+ (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
+ (prefix "uniprot:" "<http://purl.uniprot.org/uniprot/>")
+ (prefix "up:" "<http://purl.uniprot.org/core/>")
+ (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
+ (prefix "probeset:" "<http://genenetwork.org/probeset/>")
+ (newline)
+ (dump-probeset-1 db)) ; runs the chunk 1 dump inside this file's output thunk
+ #:encoding "utf8")
+ (with-output-to-file (string-append %dump-directory "dump-probeset-2.ttl") ; chunk 2 gets its own file under %dump-directory
(lambda ()
(prefix "dct:" "<http://purl.org/dc/terms/>")
(prefix "foaf:" "<http://xmlns.com/foaf/0.1/>")
@@ -80,5 +172,5 @@
(prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
(prefix "probeset:" "<http://genenetwork.org/probeset/>")
(newline)
- (dump-probeset db))
+ (dump-probeset-2 db)) ; runs the chunk 2 dump inside this file's output thunk
#:encoding "utf8")))