diff options
author | Munyoki Kilyungi | 2023-07-18 17:26:50 +0300 |
---|---|---|
committer | Munyoki Kilyungi | 2023-07-21 14:36:41 +0300 |
commit | e9d40395375e3b24174626736c2b53ad41317c1e (patch) | |
tree | c18dafb484bbc7d85f52b781730d117a00e69ad4 | |
parent | 7b8f47c1d45a584b136358f6d5ac91795ad1c443 (diff) | |
download | gn-transform-databases-e9d40395375e3b24174626736c2b53ad41317c1e.tar.gz |
Dump probeset metadata with documentation
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-x | examples/dump-probeset.scm | 174 |
1 files changed, 31 insertions, 143 deletions
diff --git a/examples/dump-probeset.scm b/examples/dump-probeset.scm index 0a6e07b..be09b48 100755 --- a/examples/dump-probeset.scm +++ b/examples/dump-probeset.scm @@ -16,17 +16,13 @@ (call-with-input-file (list-ref (command-line) 1) read)) -(define %dump-directory - (list-ref (command-line) 2)) - -(define-dump dump-probeset-0 +(define-dump dump-probeset (tables (ProbeSet - (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")) - "LIMIT 2000000 OFFSET 0") + (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))) (schema-triples - (gn:name rdfs:range rdfs:Literal) - (gn:probeset rdfs:range rdfs:Literal)) + (gn-term:name rdfs:range rdfs:Literal) + (gn-term:probeset rdfs:range rdfs:Literal)) (triples (ontology 'probeset: (string-trim-both @@ -35,142 +31,34 @@ (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)" name)) 'pre "_" 'post))) - (set rdf:type 'gn:probeset) - (set gn:chipOf (string->identifier "platform" (field GeneChip Name))) - (set gn:name (field ProbeSet Name)) - (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\"")) - (set gn:description (sanitize-rdf-string - (field ProbeSet description))) - (set gn:chr (field ProbeSet Chr)) - (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) - ;; For now have the tissue, and alias as one line without - ;; splitting to make the dump faster - ;; (set gn:tissue (field ("IFNULL(ProbeSet.Tissue, '')" Tissue))) - ;; (set gn:alias (field ProbeSet alias)) - ;; (set gn:generif (ontology 'generif: (field ProbeSet GeneId))) - (set gn:blatSeq (sanitize-rdf-string - (string-trim-both (field ProbeSet BlatSeq)))) - (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq))) - ;; (set gn:unigene (field ProbeSet UniGeneId)) - ;; (set gn:genbank (field ProbeSet GenbankId)) - ;; (set gn:omim (sanitize-rdf-string (string-trim-both (field ProbeSet OMIM)))) - ;; (set gn:RefSeq_TranscriptId (field ProbeSet RefSeq_TranscriptId)) - (set gn:uniProtReference (ontology 'uniprot: - 
(field ProbeSet UniProtID))))) + (set rdf:type 'gn-id:probeset) + (set gn-term:chipOf (string->identifier "platform" (field GeneChip Name))) + (set gn-term:name (field ProbeSet Name)) + (set gn-term:symbol (delete-substrings (field ProbeSet Symbol) "\"")) + (set gn-term:description (sanitize-rdf-string + (field ProbeSet description))) + (set gn-term:chr (field ProbeSet Chr)) + (set gn-term:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) + (set gn-term:blatSeq (sanitize-rdf-string + (string-trim-both (field ProbeSet BlatSeq)))) + (set gn-term:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq))) + (set gn-term:uniProtReference (ontology 'uniprot: + (field ProbeSet UniProtID))))) -(define-dump dump-probeset-1 - (tables (ProbeSet - (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")) - "LIMIT 2000000 OFFSET 2000000") - (schema-triples - (gn:name rdfs:range rdfs:Literal) - (gn:probeset rdfs:range rdfs:Literal)) - (triples (ontology - 'probeset: - (string-trim-both - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)" - name)) - 'pre "_" 'post))) - (set rdf:type 'gn:probeset) - (set gn:chipOf (string->identifier "platform" (field GeneChip Name))) - (set gn:name (field ProbeSet Name)) - (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\"")) - (set gn:description (sanitize-rdf-string - (field ProbeSet description))) - (set gn:chr (field ProbeSet Chr)) - (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) - (set gn:blatSeq (sanitize-rdf-string - (string-trim-both (field ProbeSet BlatSeq)))) - (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq))) - (set gn:uniProtReference (ontology 'uniprot: - (field ProbeSet UniProtID))))) -(define-dump dump-probeset-2 - (tables (ProbeSet - (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")) - "WHERE ProbeSet.Name IS NOT NULL LIMIT 2000000 OFFSET 4000000") - (schema-triples - (gn:name 
rdfs:range rdfs:Literal) - (gn:probeset rdfs:range rdfs:Literal)) - (triples (ontology - 'probeset: - (string-trim-both - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ("IFNULL(ProbeSet.Name, ProbeSet.Id)" - name)) - 'pre "_" 'post))) - (set rdf:type 'gn:probeset) - (set gn:chipOf (string->identifier "platform" (field GeneChip Name))) - (set gn:name (field ProbeSet Name)) - (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\"")) - (set gn:description (sanitize-rdf-string - (field ProbeSet description))) - (set gn:chr (field ProbeSet Chr)) - (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double)) - (set gn:blatSeq (sanitize-rdf-string - (string-trim-both (field ProbeSet BlatSeq)))) - (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq))) - (set gn:uniProtReference (ontology 'uniprot: - (field ProbeSet UniProtID))))) -(call-with-target-database - %connection-settings - (lambda (db) - (with-output-to-file (string-append %dump-directory "dump-probeset-0.ttl") - (lambda () - (prefix "dct:" "<http://purl.org/dc/terms/>") - (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>") - (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>") - (prefix "gn:" "<http://genenetwork.org/>") - (prefix "owl:" "<http://www.w3.org/2002/07/owl#>") - (prefix "phenotype:" "<http://genenetwork.org/phenotype/>") - (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>") - (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") - (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") - (prefix "uniprot:" "<http://purl.uniprot.org/uniprot/>") - (prefix "up:" "<http://purl.uniprot.org/core/>") - (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>") - (prefix "probeset:" "<http://genenetwork.org/probeset/>") - (newline) - (dump-probeset-0 db)) - #:encoding "utf8") - (with-output-to-file (string-append %dump-directory "dump-probeset-1.ttl") - (lambda () - (prefix "dct:" 
"<http://purl.org/dc/terms/>") - (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>") - (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>") - (prefix "gn:" "<http://genenetwork.org/>") - (prefix "owl:" "<http://www.w3.org/2002/07/owl#>") - (prefix "phenotype:" "<http://genenetwork.org/phenotype/>") - (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>") - (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") - (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") - (prefix "uniprot:" "<http://purl.uniprot.org/uniprot/>") - (prefix "up:" "<http://purl.uniprot.org/core/>") - (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>") - (prefix "probeset:" "<http://genenetwork.org/probeset/>") - (newline) - (dump-probeset-1 db)) - #:encoding "utf8") - (with-output-to-file (string-append %dump-directory "dump-probeset-2.ttl") - (lambda () - (prefix "dct:" "<http://purl.org/dc/terms/>") - (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>") - (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>") - (prefix "gn:" "<http://genenetwork.org/>") - (prefix "owl:" "<http://www.w3.org/2002/07/owl#>") - (prefix "phenotype:" "<http://genenetwork.org/phenotype/>") - (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>") - (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") - (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") - (prefix "uniprot:" "<http://purl.uniprot.org/uniprot/>") - (prefix "up:" "<http://purl.uniprot.org/core/>") - (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>") - (prefix "probeset:" "<http://genenetwork.org/probeset/>") - (newline) - (dump-probeset-2 db)) - #:encoding "utf8"))) +(dump-with-documentation + (name "ProbeSet Metadata") + (connection %connection-settings) + (table-metadata? 
#f) + (prefixes + '(("probeset:" "<http://genenetwork.org/probeset/>") + ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") + ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") + ("uniprot:" "<http://purl.uniprot.org/uniprot/>"))) + (inputs + (list dump-probeset)) + (outputs + '(#:documentation "./docs/dump-probeset.md" + #:rdf "./verified-data/dump-probeset.ttl"))) |