From 30525673f58ace73f9ccc84de570d6967e79958e Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Tue, 18 Jul 2023 17:26:50 +0300 Subject: Dump probeset metadata with documentation Signed-off-by: Munyoki Kilyungi --- examples/dump-probeset.scm | 174 ++++++++------------------------------------- 1 file changed, 31 insertions(+), 143 deletions(-) diff --git a/examples/dump-probeset.scm b/examples/dump-probeset.scm index 0a6e07b..be09b48 100755 --- a/examples/dump-probeset.scm +++ b/examples/dump-probeset.scm @@ -16,17 +16,13 @@ (call-with-input-file (list-ref (command-line) 1) read)) -(define %dump-directory - (list-ref (command-line) 2)) - -(define-dump dump-probeset-0 +(define-dump dump-probeset (tables (ProbeSet - (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")) - "LIMIT 2000000 OFFSET 0") + (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))) (schema-triples - (gn:name rdfs:range rdfs:Literal) - (gn:probeset rdfs:range rdfs:Literal)) + (gn-term:name rdfs:range rdfs:Literal) + (gn-term:probeset rdfs:range rdfs:Literal)) (triples (ontology 'probeset: (string-trim-both @@ -35,142 +31,34 @@ (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)" name)) 'pre "_" 'post))) - (set rdf:type 'gn:probeset) - (set gn:chipOf (string->identifier "platform" (field GeneChip Name))) - (set gn:name (field ProbeSet Name)) - (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\"")) - (set gn:description (sanitize-rdf-string - (field ProbeSet description))) - (set gn:chr (field ProbeSet Chr)) - (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) - ;; For now have the tissue, and alias as one line without - ;; splitting to make the dump faster - ;; (set gn:tissue (field ("IFNULL(ProbeSet.Tissue, '')" Tissue))) - ;; (set gn:alias (field ProbeSet alias)) - ;; (set gn:generif (ontology 'generif: (field ProbeSet GeneId))) - (set gn:blatSeq (sanitize-rdf-string - (string-trim-both (field ProbeSet BlatSeq)))) - (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq))) - ;; (set gn:unigene (field ProbeSet UniGeneId)) - ;; (set gn:genbank (field ProbeSet GenbankId)) - ;; (set gn:omim (sanitize-rdf-string (string-trim-both (field ProbeSet OMIM)))) - ;; (set gn:RefSeq_TranscriptId (field ProbeSet RefSeq_TranscriptId)) - (set gn:uniProtReference (ontology 'uniprot: - (field ProbeSet UniProtID))))) + (set rdf:type 'gn-id:probeset) + (set gn-term:chipOf (string->identifier "platform" (field GeneChip Name))) + (set gn-term:name (field ProbeSet Name)) + (set gn-term:symbol (delete-substrings (field ProbeSet Symbol) "\"")) + (set gn-term:description (sanitize-rdf-string + (field ProbeSet description))) + (set gn-term:chr (field ProbeSet Chr)) + (set gn-term:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) + (set gn-term:blatSeq (sanitize-rdf-string + (string-trim-both (field ProbeSet BlatSeq)))) + (set gn-term:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq))) + (set gn-term:uniProtReference (ontology 'uniprot: + (field ProbeSet UniProtID))))) -(define-dump dump-probeset-1 - (tables (ProbeSet - (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")) - "LIMIT 2000000 OFFSET 2000000") - (schema-triples - (gn:name rdfs:range rdfs:Literal) - (gn:probeset rdfs:range rdfs:Literal)) - (triples (ontology - 'probeset: - (string-trim-both - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)" - name)) - 'pre "_" 'post))) - (set rdf:type 'gn:probeset) - (set gn:chipOf (string->identifier "platform" (field GeneChip Name))) - (set gn:name (field ProbeSet Name)) - (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\"")) - (set gn:description (sanitize-rdf-string - (field ProbeSet description))) - (set gn:chr (field ProbeSet Chr)) - (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) - (set gn:blatSeq (sanitize-rdf-string - (string-trim-both (field ProbeSet BlatSeq)))) - (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq))) - (set gn:uniProtReference (ontology 'uniprot: - (field ProbeSet UniProtID))))) -(define-dump dump-probeset-2 - (tables (ProbeSet - (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")) - "WHERE ProbeSet.Name IS NOT NULL LIMIT 2000000 OFFSET 4000000") - (schema-triples - (gn:name rdfs:range rdfs:Literal) - (gn:probeset rdfs:range rdfs:Literal)) - (triples (ontology - 'probeset: - (string-trim-both - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ("IFNULL(ProbeSet.Name, ProbeSet.Id)" - name)) - 'pre "_" 'post))) - (set rdf:type 'gn:probeset) - (set gn:chipOf (string->identifier "platform" (field GeneChip Name))) - (set gn:name (field ProbeSet Name)) - (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\"")) - (set gn:description (sanitize-rdf-string - (field ProbeSet description))) - (set gn:chr (field ProbeSet Chr)) - (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) - (set gn:blatSeq (sanitize-rdf-string - (string-trim-both (field ProbeSet BlatSeq)))) - (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq))) - (set gn:uniProtReference (ontology 'uniprot: - (field ProbeSet UniProtID))))) -(call-with-target-database - %connection-settings - (lambda (db) - (with-output-to-file (string-append %dump-directory "dump-probeset-0.ttl") - (lambda () - (prefix "dct:" "") - (prefix "foaf:" "") - (prefix "generif:" "") - (prefix "gn:" "") - (prefix "owl:" "") - (prefix "phenotype:" "") - (prefix "pubmed:" "") - (prefix "rdf:" "") - (prefix "rdfs:" "") - (prefix "uniprot:" "") - (prefix "up:" "") - (prefix "xsd:" "") - (prefix "probeset:" "") - (newline) - (dump-probeset-0 db)) - #:encoding "utf8") - (with-output-to-file (string-append %dump-directory "dump-probeset-1.ttl") - (lambda () - (prefix "dct:" "") - (prefix "foaf:" "") - (prefix "generif:" "") - (prefix "gn:" "") - (prefix "owl:" "") - (prefix "phenotype:" "") - (prefix "pubmed:" "") - (prefix "rdf:" "") - (prefix "rdfs:" "") - (prefix "uniprot:" "") - (prefix "up:" "") - (prefix "xsd:" "") - (prefix "probeset:" "") - (newline) - (dump-probeset-1 db)) - #:encoding "utf8") - (with-output-to-file (string-append %dump-directory "dump-probeset-2.ttl") - (lambda () - (prefix "dct:" "") - (prefix "foaf:" "") - (prefix "generif:" "") - (prefix "gn:" "") - (prefix "owl:" "") - (prefix "phenotype:" "") - (prefix "pubmed:" "") - (prefix "rdf:" "") - (prefix "rdfs:" "") - (prefix "uniprot:" "") - (prefix "up:" "") - (prefix "xsd:" "") - (prefix "probeset:" "") - (newline) - (dump-probeset-2 db)) - #:encoding "utf8"))) +(dump-with-documentation + (name "ProbeSet Metadata") + (connection %connection-settings) + (table-metadata? #f) + (prefixes + '(("probeset:" "") + ("rdf:" "") + ("rdfs:" ""))) + (inputs + (list dump-probeset)) + (outputs + '(#:documentation "./docs/dump-probeset.md" + #:rdf "./verified-data/dump-probeset.ttl"))) -- cgit v1.2.3