From 50fd5b4a9f2b4c687a59ac94260ab31789aceb00 Mon Sep 17 00:00:00 2001
From: Munyoki Kilyungi
Date: Wed, 19 Jul 2023 16:57:20 +0300
Subject: Use "gn:" and "gn-term:" when dumping phenotypes

Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
---
 examples/dump-phenotype.scm | 98 +++++++++++++++++++++++----------------------
 1 file changed, 50 insertions(+), 48 deletions(-)

diff --git a/examples/dump-phenotype.scm b/examples/dump-phenotype.scm
index 33577ce..924ec9a 100755
--- a/examples/dump-phenotype.scm
+++ b/examples/dump-phenotype.scm
@@ -18,9 +18,6 @@
   (call-with-input-file (list-ref (command-line) 1)
     read))
 
-(define %dump-directory
-  (list-ref (command-line) 2))
-
 
 
 ;; Only dump publish freeze entries that were not dumped from the InfoFiles page
@@ -30,25 +27,28 @@
            (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
           "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
   (schema-triples
-   (gn:datasetOfInbredSet rdfs:range gn:inbredSet)
-   (gn:name rdfs:range rdfs:Literal)
-   (gn:fullName rdfs:range rdfs:Literal)
-   (gn:shortName rdfs:range rdfs:Literal)
-   (gn:createTime rdfs:range rdfs:Literal)
+   (gn-term:datasetOfInbredSet rdfs:range gn:inbredSet)
+   (gn-term:name rdfs:range rdfs:Literal)
+   (gn-term:fullName rdfs:range rdfs:Literal)
+   (gn-term:shortName rdfs:range rdfs:Literal)
+   (gn-term:createTime rdfs:range rdfs:Literal)
    (gn:phenotypeDataset rdf:subClassOf gn:dataset))
   (triples
-      (ontology 'dataset:
-                (regexp-substitute/global #f "[^A-Za-z0-9:]"
-                                          (field PublishFreeze Name)
-                                          'pre "_" 'post))
+      (string->identifier
+       ""
+       (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                 (field PublishFreeze Name)
+                                 'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
     (set rdf:type 'gn:phenotypeDataset)
-    (set gn:name (field PublishFreeze Name))
-    (set gn:fullName (field PublishFreeze FullName))
-    (set gn:shortName (field PublishFreeze ShortName))
-    (set dct:created (annotate-field
+    (set gn-term:name (field PublishFreeze Name))
+    (set gn-term:fullName (field PublishFreeze FullName))
+    (set gn-term:shortName (field PublishFreeze ShortName))
+    (set dct:created (annotate-field
                       (field PublishFreeze CreateTime)
                       '^^xsd:date))
-    (set gn:datasetOfInbredSet
+    (set gn-term:datasetOfInbredSet
          (string->identifier "inbredSet" (field InbredSet Name InbredSetName)))))
 
 (define-dump dump-phenotypes
@@ -59,48 +59,52 @@
            (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")))
   (schema-triples
    (gn:phenotypeDataset rdfs:subPropertyOf gn:dataset))
-  (triples (ontology 'phenotype:
-                     (regexp-substitute/global #f "[^A-Za-z0-9:]"
-                                               (field ("CONCAT(IF(PublishFreeze.Name IS NULL, '', CONCAT(PublishFreeze.Name, ':')), IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation))" abbrev))
-                                               'pre "_" 'post))
+  (triples (string->identifier
+            ""
+            (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                        (field ("CONCAT(IF(PublishFreeze.Name IS NULL, '', CONCAT(PublishFreeze.Name, '_')), IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation))" abbrev))
+                                        'pre "_" 'post)
+            #:separator ""
+            #:proc string-capitalize-first)
     (set rdf:type 'gn:phenotype)
-    (set gn:name (sanitize-rdf-string
+    (set gn-term:name (sanitize-rdf-string
                   (field
                    ("CAST(CONVERT(BINARY CONVERT(IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation) USING latin1) USING utf8) AS VARCHAR(100))"
                     PhenotypeName))))
     ;; There is no row with an empty post-publication description so
     ;; use this field as the main publication description
-    (set gn:publicationDescription
+    (set gn-term:publicationDescription
          (sanitize-rdf-string
           (field ("CAST(CONVERT(BINARY CONVERT(Phenotype.Post_publication_description USING latin1) USING utf8) AS CHAR(10000))"
                   postPubDescr))))
-    (set gn:originalDescription (sanitize-rdf-string
+    (set gn-term:originalDescription (sanitize-rdf-string
                                  (delete-substrings
                                   (field Phenotype Original_description)
                                   "Original post publication description: ")))
-    (set gn:prePublicationDescription
+    (set gn-term:prePublicationDescription
          (sanitize-rdf-string
           (field
            ("CAST(CONVERT(BINARY CONVERT(Phenotype.Pre_publication_description USING latin1) USING utf8) AS VARCHAR(15000))"
             prePubDesc))))
-    (set gn:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation)))
-    (set gn:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation)))
-    (set gn:labCode (field Phenotype Lab_code))
-    (set gn:submitter (sanitize-rdf-string (field Phenotype Submitter)))
-    (set gn:owner (sanitize-rdf-string (field Phenotype Owner)))
-    (set gn:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean))
+    (set gn-term:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation)))
+    (set gn-term:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation)))
+    (set gn-term:labCode (field Phenotype Lab_code))
+    (set gn-term:submitter (sanitize-rdf-string (field Phenotype Submitter)))
+    (set gn-term:owner (sanitize-rdf-string (field Phenotype Owner)))
+    (set gn-term:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean))
                                  '^^xsd:double))
-    (set gn:locus (field PublishXRef Locus))
-    (set gn:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:float))
-    (set gn:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:decimal))
-    (set gn:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:int))
-    (set gn:phenotypeOfDataset
-         (ontology 'dataset:
-                   (regexp-substitute/global
-                    #f "[^A-Za-z0-9:]"
-                    (field ("IFNULL(InfoFiles.InfoPageName, IFNULL(PublishFreeze.Name, ''))" DatasetName))
-                    'pre "_" 'post)))
-    (set gn:phenotypeOfPublication
+    (set gn-term:locus (field PublishXRef Locus))
+    (set gn-term:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:float))
+    (set gn-term:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:decimal))
+    (set gn-term:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:int))
+    (set gn-term:phenotypeOfDataset
+         (string->identifier
+          ""
+          (field
+           ("IFNULL(InfoFiles.InfoPageName, IFNULL(PublishFreeze.Name, ''))" DatasetName))
+          #:separator ""
+          #:proc string-capitalize-first))
+    (set gn-term:phenotypeOfPublication
          (let ((pmid (field
                       ("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))"
                        pmid)))
@@ -108,7 +112,7 @@
            (if (string-null? pmid)
                (string->identifier "unpublished"
                                    (number->string publication-id))
-               (ontology 'publication: pmid))))))
+               (ontology 'pubmed: pmid))))))
 
 
 (dump-with-documentation
@@ -116,17 +120,15 @@
  (connection %connection-settings)
  (table-metadata? #f)
  (prefixes
-  '(("gn-id:" "<http://genenetwork.org/terms/>")
+  '(("gn:" "<http://genenetwork.org/id/>")
     ("gn-term:" "<http://genenetwork.org/terms/>")
-    ("phenotype:" "<http://genenetwork.org/phenotype/>")
     ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
     ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
     ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
-    ("dataset:" "<http://genenetwork.org/dataset/>")
-    ("publication:" "<http://genenetwork.org/publication/>")))
+    ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")))
  (inputs
   (list dump-publishfreeze
-        dump-phenotype))
+        dump-phenotypes))
  (outputs
   '(#:documentation "./docs/dump-phenotype.md"
     #:rdf "./verified-data/dump-phenotype.ttl")))
-- 
cgit v1.2.3