about summary refs log tree commit diff
diff options
context:
space:
mode:
authorMunyoki Kilyungi2023-07-18 17:26:50 +0300
committerMunyoki Kilyungi2023-07-21 14:36:41 +0300
commite9d40395375e3b24174626736c2b53ad41317c1e (patch)
treec18dafb484bbc7d85f52b781730d117a00e69ad4
parent7b8f47c1d45a584b136358f6d5ac91795ad1c443 (diff)
downloadgn-transform-databases-e9d40395375e3b24174626736c2b53ad41317c1e.tar.gz
Dump probeset metadata with documentation
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-xexamples/dump-probeset.scm174
1 files changed, 31 insertions, 143 deletions
diff --git a/examples/dump-probeset.scm b/examples/dump-probeset.scm
index 0a6e07b..be09b48 100755
--- a/examples/dump-probeset.scm
+++ b/examples/dump-probeset.scm
@@ -16,17 +16,13 @@
   (call-with-input-file (list-ref (command-line) 1)
     read))
 
-(define %dump-directory
-  (list-ref (command-line) 2))
-
 
-(define-dump dump-probeset-0
+(define-dump dump-probeset
   (tables (ProbeSet
-           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
-          "LIMIT 2000000 OFFSET 0")
+           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")))
   (schema-triples
-   (gn:name rdfs:range rdfs:Literal)
-   (gn:probeset rdfs:range rdfs:Literal))
+   (gn-term:name rdfs:range rdfs:Literal)
+   (gn-term:probeset rdfs:range rdfs:Literal))
   (triples (ontology
             'probeset:
             (string-trim-both
@@ -35,142 +31,34 @@
               (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)"
                       name))
               'pre "_" 'post)))
-    (set rdf:type 'gn:probeset)
-    (set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
-    (set gn:name (field ProbeSet Name))
-    (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
-    (set gn:description (sanitize-rdf-string
-                         (field ProbeSet description)))
-    (set gn:chr (field ProbeSet Chr))
-    (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
-    ;; For now have the tissue, and alias as one line without
-    ;; splitting to make the dump faster
-    ;; (set gn:tissue (field ("IFNULL(ProbeSet.Tissue, '')" Tissue)))
-    ;; (set gn:alias (field ProbeSet alias))
-    ;; (set gn:generif (ontology 'generif: (field ProbeSet GeneId)))
-    (set gn:blatSeq (sanitize-rdf-string
-                     (string-trim-both (field ProbeSet BlatSeq))))
-    (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
-    ;; (set gn:unigene (field ProbeSet UniGeneId))
-    ;; (set gn:genbank (field ProbeSet GenbankId))
-    ;; (set gn:omim (sanitize-rdf-string (string-trim-both (field ProbeSet OMIM))))
-    ;; (set gn:RefSeq_TranscriptId (field ProbeSet RefSeq_TranscriptId))
-    (set gn:uniProtReference (ontology 'uniprot:
-                                       (field ProbeSet UniProtID)))))
+    (set rdf:type 'gn-id:probeset)
+    (set gn-term:chipOf (string->identifier "platform" (field GeneChip Name)))
+    (set gn-term:name (field ProbeSet Name))
+    (set gn-term:symbol (delete-substrings (field ProbeSet Symbol) "\""))
+    (set gn-term:description (sanitize-rdf-string
+                              (field ProbeSet description)))
+    (set gn-term:chr (field ProbeSet Chr))
+    (set gn-term:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
+    (set gn-term:blatSeq (sanitize-rdf-string
+                          (string-trim-both (field ProbeSet BlatSeq))))
+    (set gn-term:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
+    (set gn-term:uniProtReference (ontology 'uniprot:
+                                            (field ProbeSet UniProtID)))))
 
-(define-dump dump-probeset-1
-  (tables (ProbeSet
-           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
-          "LIMIT 2000000 OFFSET 2000000")
-  (schema-triples
-   (gn:name rdfs:range rdfs:Literal)
-   (gn:probeset rdfs:range rdfs:Literal))
-  (triples (ontology
-            'probeset:
-             (string-trim-both
-             (regexp-substitute/global
-              #f "[^A-Za-z0-9:]"
-              (field ("IFNULL(NULLIF(TRIM(ProbeSet.Name), ''), ProbeSet.Id)"
-                      name))
-              'pre "_" 'post)))
-    (set rdf:type 'gn:probeset)
-    (set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
-    (set gn:name (field ProbeSet Name))
-    (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
-    (set gn:description (sanitize-rdf-string
-                         (field ProbeSet description)))
-    (set gn:chr (field ProbeSet Chr))
-    (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
-    (set gn:blatSeq (sanitize-rdf-string
-                     (string-trim-both (field ProbeSet BlatSeq))))
-    (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
-    (set gn:uniProtReference (ontology 'uniprot:
-                                       (field ProbeSet UniProtID)))))
-(define-dump dump-probeset-2
-  (tables (ProbeSet
-           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId"))
-          "WHERE ProbeSet.Name IS NOT NULL LIMIT 2000000 OFFSET 4000000")
-  (schema-triples
-   (gn:name rdfs:range rdfs:Literal)
-   (gn:probeset rdfs:range rdfs:Literal))
-  (triples (ontology
-            'probeset:
-            (string-trim-both
-             (regexp-substitute/global
-              #f "[^A-Za-z0-9:]"
-              (field ("IFNULL(ProbeSet.Name, ProbeSet.Id)"
-                      name))
-              'pre "_" 'post)))
-    (set rdf:type 'gn:probeset)
-    (set gn:chipOf (string->identifier "platform" (field GeneChip Name)))
-    (set gn:name (field ProbeSet Name))
-    (set gn:symbol (delete-substrings (field ProbeSet Symbol) "\""))
-    (set gn:description (sanitize-rdf-string
-                         (field ProbeSet description)))
-    (set gn:chr (field ProbeSet Chr))
-    (set gn:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
-    (set gn:blatSeq (sanitize-rdf-string
-                     (string-trim-both (field ProbeSet BlatSeq))))
-    (set gn:targetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))
-    (set gn:uniProtReference (ontology 'uniprot:
-                                       (field ProbeSet UniProtID)))))
 
 
 
-(call-with-target-database
- %connection-settings
- (lambda (db)
-   (with-output-to-file (string-append %dump-directory "dump-probeset-0.ttl")
-     (lambda ()
-       (prefix "dct:" "<http://purl.org/dc/terms/>")
-       (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>")
-       (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
-       (prefix "gn:" "<http://genenetwork.org/>")
-       (prefix "owl:" "<http://www.w3.org/2002/07/owl#>")
-       (prefix "phenotype:" "<http://genenetwork.org/phenotype/>")
-       (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
-       (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
-       (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
-       (prefix "uniprot:" "<http://purl.uniprot.org/uniprot/>")
-       (prefix "up:" "<http://purl.uniprot.org/core/>")
-       (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
-       (prefix "probeset:" "<http://genenetwork.org/probeset/>")
-       (newline)
-       (dump-probeset-0 db))
-     #:encoding "utf8")
-   (with-output-to-file (string-append %dump-directory "dump-probeset-1.ttl")
-     (lambda ()
-       (prefix "dct:" "<http://purl.org/dc/terms/>")
-       (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>")
-       (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
-       (prefix "gn:" "<http://genenetwork.org/>")
-       (prefix "owl:" "<http://www.w3.org/2002/07/owl#>")
-       (prefix "phenotype:" "<http://genenetwork.org/phenotype/>")
-       (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
-       (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
-       (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
-       (prefix "uniprot:" "<http://purl.uniprot.org/uniprot/>")
-       (prefix "up:" "<http://purl.uniprot.org/core/>")
-       (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
-       (prefix "probeset:" "<http://genenetwork.org/probeset/>")
-       (newline)
-       (dump-probeset-1 db))
-     #:encoding "utf8")
-   (with-output-to-file (string-append %dump-directory "dump-probeset-2.ttl")
-     (lambda ()
-       (prefix "dct:" "<http://purl.org/dc/terms/>")
-       (prefix "foaf:" "<http://xmlns.com/foaf/0.1/>")
-       (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
-       (prefix "gn:" "<http://genenetwork.org/>")
-       (prefix "owl:" "<http://www.w3.org/2002/07/owl#>")
-       (prefix "phenotype:" "<http://genenetwork.org/phenotype/>")
-       (prefix "pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
-       (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
-       (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
-       (prefix "uniprot:" "<http://purl.uniprot.org/uniprot/>")
-       (prefix "up:" "<http://purl.uniprot.org/core/>")
-       (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
-       (prefix "probeset:" "<http://genenetwork.org/probeset/>")
-       (newline)
-       (dump-probeset-2 db))
-     #:encoding "utf8")))
+(dump-with-documentation
+ (name "ProbeSet Metadata")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ (prefixes
+  '(("probeset:" "<http://genenetwork.org/probeset/>")
+    ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+    ("rdfs:" "<http://www.w3.org/2000/01/rdf-r")
+    ("uniprot:" "<http://purl.uniprot.org/uniprot/>")))
+ (inputs
+  (list dump-probeset))
+ (outputs
+  '(#:documentation "./docs/dump-probeset.md"
+    #:rdf "./verified-data/dump-probeset.ttl")))