aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xexamples/dump-probeset.scm138
1 files changed, 115 insertions, 23 deletions
diff --git a/examples/dump-probeset.scm b/examples/dump-probeset.scm
index 6b1b7a8..0a6e07b 100755
--- a/examples/dump-probeset.scm
+++ b/examples/dump-probeset.scm
@@ -20,51 +20,143 @@
(list-ref (command-line) 2))
-(define-dump dump-probeset
+(define-dump dump-probeset-0
(tables (ProbeSet
- (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")))
+ (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
+ "LIMIT 2000000 OFFSET 0")
(schema-triples
(gn:name rdfs:range rdfs:Literal)
(gn:probeset rdfs:range rdfs:Literal))
(triples (ontology
'probeset:
- (regexp-substitute/global
- #f "[^A-Za-z0-9:]"
- (field ("IFNULL(ProbeSet.Name, ProbeSet.Id)"
- name))
- 'pre "_" 'post))
+ (string-trim-both
+ (regexp-substitute/global
+ #f "[^A-Za-z0-9:]"
+ (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)"
+ name))
+ 'pre "_" 'post)))
(set rdf:type 'gn:probeset)
(set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
(set gn:name (field ProbeSet Name))
- (set gn:symbol (field ProbeSet Symbol))
+ (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
(set gn:description (sanitize-rdf-string
(field ProbeSet description)))
(set gn:chr (field ProbeSet Chr))
(set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
- (multiset gn:tissue (map string-trim-both
- (string-split
- (field ("IFNULL(ProbeSet.Tissue, '')" Tissue))
- #\,)))
- (multiset gn:alias (map string-trim-both
- (string-split (sanitize-rdf-string (field ProbeSet alias))
- #\;)))
- (set gn:unigene (field ProbeSet UniGeneId))
- (set gn:generif (ontology 'generif: (field ProbeSet GeneId)))
- (set gn:genbank (field ProbeSet GenbankId))
+ ;; For now have the tissue, and alias as one line without
+ ;; splitting to make the dump faster
+ ;; (set gn:tissue (field ("IFNULL(ProbeSet.Tissue, '')" Tissue)))
+ ;; (set gn:alias (field ProbeSet alias))
+ ;; (set gn:generif (ontology 'generif: (field ProbeSet GeneId)))
(set gn:blatSeq (sanitize-rdf-string
(string-trim-both (field ProbeSet BlatSeq))))
(set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
- (set gn:omim (sanitize-rdf-string (string-trim-both (field ProbeSet OMIM))))
- (set gn:RefSeq_TranscriptId (field ProbeSet RefSeq_TranscriptId))
+ ;; (set gn:unigene (field ProbeSet UniGeneId))
+ ;; (set gn:genbank (field ProbeSet GenbankId))
+ ;; (set gn:omim (sanitize-rdf-string (string-trim-both (field ProbeSet OMIM))))
+ ;; (set gn:RefSeq_TranscriptId (field ProbeSet RefSeq_TranscriptId))
(set gn:uniProtReference (ontology 'uniprot:
- (field ProbeSet UniProtID)))))
+ (field ProbeSet UniProtID)))))
+
+(define-dump dump-probeset-1
+ (tables (ProbeSet
+ (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
+ "LIMIT 2000000 OFFSET 2000000")
+ (schema-triples
+ (gn:name rdfs:range rdfs:Literal)
+ (gn:probeset rdfs:range rdfs:Literal))
+ (triples (ontology
+ 'probeset:
+ (string-trim-both
+ (regexp-substitute/global
+ #f "[^A-Za-z0-9:]"
+ (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)"
+ name))
+ 'pre "_" 'post)))
+ (set rdf:type 'gn:probeset)
+ (set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
+ (set gn:name (field ProbeSet Name))
+ (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
+ (set gn:description (sanitize-rdf-string
+ (field ProbeSet description)))
+ (set gn:chr (field ProbeSet Chr))
+ (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
+ (set gn:blatSeq (sanitize-rdf-string
+ (string-trim-both (field ProbeSet BlatSeq))))
+ (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
+ (set gn:uniProtReference (ontology 'uniprot:
+ (field ProbeSet UniProtID)))))
+(define-dump dump-probeset-2
+ (tables (ProbeSet
+ (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
+ "WHERE ProbeSet.Name IS NOT NULL LIMIT 2000000 OFFSET 4000000")
+ (schema-triples
+ (gn:name rdfs:range rdfs:Literal)
+ (gn:probeset rdfs:range rdfs:Literal))
+ (triples (ontology
+ 'probeset:
+ (string-trim-both
+ (regexp-substitute/global
+ #f "[^A-Za-z0-9:]"
+ (field ("IFNULL(ProbeSet.Name, ProbeSet.Id)"
+ name))
+ 'pre "_" 'post)))
+ (set rdf:type 'gn:probeset)
+ (set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
+ (set gn:name (field ProbeSet Name))
+ (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
+ (set gn:description (sanitize-rdf-string
+ (field ProbeSet description)))
+ (set gn:chr (field ProbeSet Chr))
+ (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
+ (set gn:blatSeq (sanitize-rdf-string
+ (string-trim-both (field ProbeSet BlatSeq))))
+ (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
+ (set gn:uniProtReference (ontology 'uniprot:
+ (field ProbeSet UniProtID)))))
(call-with-target-database
%connection-settings
(lambda (db)
- (with-output-to-file (string-append %dump-directory "dump-probeset.ttl")
+ (with-output-to-file (string-append %dump-directory "dump-probeset-0.ttl")
+ (lambda ()
+ (prefix "dct:" "<http://purl.org/dc/terms/>")
+ (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>")
+ (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
+ (prefix "gn:" "<http://genenetwork.org/>")
+ (prefix "owl:" "<http://www.w3.org/2002/07/owl#>")
+ (prefix "phenotype:" "<http://genenetwork.org/phenotype/>")
+ (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
+ (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+ (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
+ (prefix "uniprot:" "<http://purl.uniprot.org/uniprot/>")
+ (prefix "up:" "<http://purl.uniprot.org/core/>")
+ (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
+ (prefix "probeset:" "<http://genenetwork.org/probeset/>")
+ (newline)
+ (dump-probeset-0 db))
+ #:encoding "utf8")
+ (with-output-to-file (string-append %dump-directory "dump-probeset-1.ttl")
+ (lambda ()
+ (prefix "dct:" "<http://purl.org/dc/terms/>")
+ (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>")
+ (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
+ (prefix "gn:" "<http://genenetwork.org/>")
+ (prefix "owl:" "<http://www.w3.org/2002/07/owl#>")
+ (prefix "phenotype:" "<http://genenetwork.org/phenotype/>")
+ (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
+ (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+ (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
+ (prefix "uniprot:" "<http://purl.uniprot.org/uniprot/>")
+ (prefix "up:" "<http://purl.uniprot.org/core/>")
+ (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
+ (prefix "probeset:" "<http://genenetwork.org/probeset/>")
+ (newline)
+ (dump-probeset-1 db))
+ #:encoding "utf8")
+ (with-output-to-file (string-append %dump-directory "dump-probeset-2.ttl")
(lambda ()
(prefix "dct:" "<http://purl.org/dc/terms/>")
(prefix "foaf:" "<http://xmlns.com/foaf/0.1/>")
@@ -80,5 +172,5 @@
(prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
(prefix "probeset:" "<http://genenetwork.org/probeset/>")
(newline)
- (dump-probeset db))
+ (dump-probeset-2 db))
#:encoding "utf8")))