about summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-xdump.scm151
1 files changed, 59 insertions, 92 deletions
diff --git a/dump.scm b/dump.scm
index 9a8cc07..1a1d900 100755
--- a/dump.scm
+++ b/dump.scm
@@ -859,105 +859,72 @@ is a <table> object."
     (set gn:binomialName (field InbredSet fullName))
     (set gn:species (field Species Name))))
 
-(define-dump dump-generif
-  (tables (GeneRIF
-	   (left-join Species "USING (SpeciesId)")
-	   (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
-	   (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id"))
-          "WHERE GeneRIF.display > 0")
-  (schema-triples
-   (gn:versionId rdfs:range rdfs:Literal)
-   (gn:symbol rdfs:range rdfs:Literal)
-   (gn:pubMedId rdfs:range rdfs:Literal)
-   (gn:geneRIFOfSpecies rdfs:range gn:species)
-   (gn:comment rdfs:range rdfs:Literal)
-   (gn:weburl rdfs:range rdfs:Literal)
-   (gn:createTime rdfs:range xsd:datetime)
-   (gn:createTime rdfs:range rdfs:Literal)
-   (gn:reason rdfs:range rdfs:Literal)
-   (gn:geneRIFOFGenenetwork rdfs:range gn:geneRIF)
-   (gn:geneCategory rdfs:range gn:geneRIF)
-   (gn:initial rdfs:range rdfs:Literal))
-  (triples (string->identifier
-            "geneRIF"
-            (number->string (field GeneRIF Id)))
-    (set rdf:type 'gn:geneRIFOfGenenetwork)
-    (set gn:versionId (field GeneRIF versionId))
-    (set gn:symbol (field GeneRIF symbol))
-    (set gn:geneCategory (field GeneCategory Name))
-    (set gn:pubMedId (field GeneRIF PubMed_ID))
-    (set gn:geneRIFOfSpecies
-         (binomial-name->species-id
-          (field Species FullName)))
-    (set gn:comment
-         (format #f "(~a) (~a) ~a"
-                 (time-unix->string (field GeneRIF createtime) "~5")
-                 (field GeneRIF email)
-                 (replace-substrings
-                  (field GeneRIF comment)
-                  '(("\xa0" . " ")
-                    ("â\x81„" . "/")
-                    ("â€\x9d" . #\")
-                    ("’" . #\')
-                    ("\x02" . "")
-                    ("\x01" . "")
-                    ("β" . "β")
-                    ("α-Â\xad" . "α")
-                    ("Â\xad" . "")
-                    ("α" . "α")
-                    ("–" . "-")))))
-    (set gn:createTime
-         (annotate-field
-          (time-unix->string
-           (field GeneRIF createtime) "~5")
-          '^^xsd:datetime))
-    (set gn:weburl (field GeneRIF weburl))
-    (set gn:reason (field GeneRIF reason))
-    (set gn:initial (field GeneRIF initial))))
-
-;; GeneRIF data from NCBI
-(define-dump dump-generif-basic
+;; GeneRIF metadata
+(define-dump dump-gn-genewiki-entries
   (tables (GeneRIF_BASIC
-           (left-join Species "USING (SpeciesId)"))
-          "GROUP BY SpeciesId, symbol, GeneId, VersionId")
+           (left-join GeneRIF "USING (symbol)")
+           (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
+           (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id"))
+          "WHERE GeneRIF.display > 0 and GeneRIF.VersionId = 0 AND GeneRIF.Id != 2322 GROUP BY GeneRIF.symbol")
   (schema-triples
-   (gn:taxId rdfs:range rdfs:Literal)
-   (gn:geneId rdfs:range rdfs:Literal)
-   (gn:pubMedId rdfs:range rdfs:Literal)
-   (pubmed:pmid rdfs:range rdfs:Literal)
-   (gn:comment rdfs:range rdfs:Literal)
-   (gn:symbol rdfs:range rdfs:Literal)
-   (gn:geneRIFOfSpecies rdfs:range gn:species)
-   (gn:versionId rdfs:range rdfs:Literal))
-  (triples
-      (string->identifier
-       "geneRIF"
-       (number->string (field GeneRIF_BASIC GeneId)))
-    (set rdf:type 'gn:geneRIFOfNcbi)
-    (set gn:geneRIFOfSpecies
-         (binomial-name->species-id
-          (field Species FullName)))
-    (set gn:taxId (ontology 'taxon: (field GeneRIF_BASIC TaxID)))
-    (set gn:geneId (ontology 'generif: (field GeneRIF_BASIC GeneId)))
-    (set gn:symbol (field GeneRIF_BASIC symbol))
-    (set gn:comment (field GeneRIF_BASIC comment))
-    (multiset gn:pubMedId
-              (map (compose
-                    (cut ontology 'pubmed: <>)
-                    string-trim)
-                   (string-split (field GeneRIF_BASIC
-                                        PubMed_ID
-                                        GROUP_CONCAT
-                                        PubMedID)
-                                 #\,)))
-    (set gn:versionId (field GeneRIF_BASIC VersionId))))
-
+   (gn:geneWikiEntryOfGN rdfs:domain gn:geneWiki)
+   (gn:weburl rdfs:domain gn:geneWiki)
+   (gn:versionId rdfs:domain gn:geneWiki)
+   (gn:category rdfs:domain gn:geneWiki)
+   (gn:pubMedId rdfs:domain rdfs:Literal)
+   (gn:createTime rdfs:range xsd:datetime))
+  (triples (ontology 'generif:
+                     (field GeneRIF_BASIC GeneId))
+    (set rdf:type 'gn:geneWikiEntry)
+    (set gn:symbol (field GeneRIF symbol))
+    (multiset gn:geneWikiEntryOfGn
+              (let* ([entries (replace-substrings (field
+                                                   ("GROUP_CONCAT(DISTINCT CONCAT_WS('::::', IFNULL(GeneCategory.Name, ''), IFNULL(GeneRIF.PubMed_ID, ''), GeneRIF.email, GeneRIF.comment, GeneRIF.createtime, IFNULL(weburl, '')) SEPARATOR';;;;;')"
+                                                    wikientry))
+                                                  '(("\x28" . "")
+                                                    ("\x29" . "")
+                                                    ("\xa0" . " ")
+                                                    ("â\x81„" . "/")
+                                                    ("â€\x9d" . #\")
+                                                    ("’" . #\')
+                                                    ("\x02" . "")
+                                                    ("\x01" . "")
+                                                    ("β" . "β")
+                                                    ("α-Â\xad" . "α")
+                                                    ("Â\xad" . "")
+                                                    ("α" . "α")
+                                                    ("–" . "-")))
+                              ]
+                     [comments (string-split-substring entries ";;;;;")])
+                (map
+                 (match-lambda
+                   ;; annotate pubmed id properly
+                   ((genecategory pmid email text createtime weburl)
+                    (blank-node
+                     (gn:category genecategory)
+                     (multiset
+                      gn:pubMedId
+                      (string-split
+                       (ontology 'pubmed: pmid)
+                       #\space))
+                     ;; TODO: Truncate mail to '@'
+                     (gn:email email)
+                     (gn:comment
+                      (annotate-field text '^^xsd:string))
+                     (gn:createTime (annotate-field
+                                     createtime
+                                     ;; (time-unix->string
+                                     ;;  createtime)
+                                     '^^xsd:datetime))
+                     (gn:weburl weburl))))
+                 (map
+                  (cut string-split-substring <> "::::")
+                  comments))))))
 
 ;; Import GeneRIF
 
 ;; Download GeneRIF data from
 ;; https://ftp.ncbi.nih.gov/gene/GeneRIF/generifs_basic.gz
-
 (define decode-html-entities
   (cut regexp-substitute/global
        #f