aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Munyoki Kilyungi 2023-03-31 15:44:35 +0300
committer BonfaceKilz 2023-04-05 16:17:11 +0300
commit 69fb07b21f85f7c8f270082756d40ff529cc3777 (patch)
tree 73a850f5b4669cc900f34f4978e395c283beed48
parent a78a5002a734feb7a4daaf52ace3d9ce33d19560 (diff)
download gn-transform-databases-69fb07b21f85f7c8f270082756d40ff529cc3777.tar.gz
Separate NCBI and GN wikientries in dump
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-x dump.scm 131
1 files changed, 70 insertions, 61 deletions
diff --git a/dump.scm b/dump.scm
index 639b5a0..d27810b 100755
--- a/dump.scm
+++ b/dump.scm
@@ -861,84 +861,92 @@ is a <table> object."
;; GeneRIF metadata
;;
;; dump-gn-genewiki-entries emits the GeneNetwork-side wiki entries.
;; After this change the query starts from GeneRIF (displayed rows with
;; VersionId 0, grouped by symbol) and left-joins GeneRIF_BASIC, Species,
;; GeneRIFXRef and GeneCategory.  All entries for a symbol arrive in one
;; GROUP_CONCAT string: fields are joined with "::::" and entries with
;; ";;;;;".  That string is passed through replace-substrings (which drops
;; parentheses and \x01/\x02 and rewrites a fixed list of mis-encoded
;; sequences), split back apart, and each entry becomes one blank node
;; under gn:geneWikiEntryOfGn.
;;
;; NOTE(review): MySQL truncates GROUP_CONCAT output at
;; group_concat_max_len -- confirm the session raises it, otherwise the
;; last entry of a long group may be cut short.
(define-dump dump-gn-genewiki-entries
- (tables (GeneRIF_BASIC
- (left-join GeneRIF "USING (symbol)")
+ (tables (GeneRIF ; GeneRIF is listed first; the remaining tables are left-joined
+ (left-join GeneRIF_BASIC "USING (symbol)")
+ (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId")
 (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
 (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id"))
- "WHERE GeneRIF.display > 0 and GeneRIF.VersionId = 0 GROUP BY GeneRIF.symbol")
+ "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 GROUP BY GeneRIF.symbol") ; one result row per symbol
 (schema-triples
- (gn:geneWikiEntry rdfs:domain gn:geneWiki)
- (gn:geneWikiEntryOfGN rdfs:domain gn:geneWiki)
- (gn:geneWikiEntryofNCBI rdfs:domain gn:geneWiki)
- (gn:weburl rdfs:domain gn:geneWiki)
- (gn:versionId rdfs:domain gn:geneWiki)
- (gn:category rdfs:domain gn:geneWiki)
- (gn:pubMedId rdfs:domain rdfs:Literal)
- (gn:createTime rdfs:range xsd:datetime))
- (triples (ontology 'generif:
- (field GeneRIF_BASIC GeneId))
- (set rdf:type 'gn:geneWikiEntry)
- (set gn:symbol (field GeneRIF symbol))
+ (gn:geneCategory rdfs:domain gn:geneWikiEntry)
+ (gn:geneWikiEntryOfGn rdfs:domain gn:geneWikiEntry)
+ (gn:geneWikiEntry rdfs:domain gn:geneWikiEntry))
+ (triples
+ (let ([geneid (field GeneRIF_BASIC GeneId)]) ; GeneId comes from the left-joined GeneRIF_BASIC
+ (if (eq? geneid 0) ; NOTE(review): a LEFT JOIN miss is normally NULL, not 0 -- confirm what (field ...) yields here
+ (ontology 'gn:anonSymbol_ ; geneid is 0: subject is built from the symbol
+ (field GeneRIF symbol))
+ (ontology 'generif: ; otherwise: subject is built from the gene id
+ geneid)))
+ (set gn:species (let ([geneid (field GeneRIF_BASIC GeneId)]) ; species name only when geneid is 0, "" otherwise
+ (if (eq? geneid 0)
+ (field Species SpeciesName)
+ "")))
 (multiset gn:geneWikiEntryOfGn
- (let* ([entries (replace-substrings (field
- ("GROUP_CONCAT(DISTINCT CONCAT_WS('::::', IFNULL(GeneCategory.Name, ''), IFNULL(GeneRIF.PubMed_ID, ''), GeneRIF.email, GeneRIF.comment, GeneRIF.createtime, IFNULL(weburl, '')) SEPARATOR';;;;;')"
- wikientry))
- '(("\x28" . "")
- ("\x29" . "")
- ("\xa0" . " ")
- ("â\x81„" . "/")
- ("â€\x9d" . #\")
- ("’" . #\')
- ("\x02" . "")
- ("\x01" . "")
- ("β" . "β")
- ("α-Â\xad" . "α")
- ("Â\xad" . "")
- ("α" . "α")
- ("–" . "-")))]
+ (let* ([entries ; every entry for this symbol, in one cleaned-up string
+ (replace-substrings
+ (field
+ ("GROUP_CONCAT(DISTINCT CONCAT_WS('::::', IFNULL(GeneCategory.Name, ''), IFNULL(GeneRIF.PubMed_ID, ''), GeneRIF.email, GeneRIF.comment, GeneRIF.createtime, IFNULL(weburl, '')) SEPARATOR';;;;;')"
+ wikientry))
+ '(("\x28" . "")
+ ("\x29" . "")
+ ("\xa0" . " ")
+ ("â\x81„" . "/")
+ ("â€\x9d" . #\")
+ ("’" . #\')
+ ("\x02" . "")
+ ("\x01" . "")
+ ("β" . "β")
+ ("α-Â\xad" . "α")
+ ("Â\xad" . "")
+ ("α" . "α")
+ ("–" . "-")))]
 [comments (string-split-substring entries ";;;;;")]) ; one string per entry
 (map
 (match-lambda
 ((genecategory pmid email text createtime weburl) ; NOTE(review): only clause -- a record that does not split into exactly six fields has no fallback
 (blank-node
- (set gn:category genecategory)
- (multiset gn:pubMedId
+ (set gn:geneCategory genecategory)
+ (multiset dct:source
 (map (lambda (el) (if (string-null? el) ; an empty PubMed token stays ""
 ""
 (ontology 'pubmed: el)))
 (string-split pmid #\space)))
- (set gn:author (regexp-substitute/global #f "@.*$"
- email
- 'pre
- ""
- 'post))
+ (set dct:creator (regexp-substitute/global #f "@.*$" ; keep only the part of the e-mail before "@"
+ email
+ 'pre
+ ""
+ 'post))
 (set gn:geneWikiEntry
 (annotate-field text '^^xsd:string))
- (set gn:createTime (annotate-field
- createtime
- '^^xsd:datetime))
- (set gn:weburl weburl))))
- (map
- (cut string-split-substring <> "::::")
- comments))))
- (multiset gn:geneWikiEntryOfNCBI
- (let* ([entries (field
- ("GROUP_CONCAT(DISTINCT CONCAT_WS('::::', IFNULL(GeneRIF_BASIC.PubMed_ID, ''), IFNULL(GeneRIF_BASIC.comment, '')) SEPARATOR'|||||')"
- ncbientry))
- ]
- [ncbi-comments (string-split-substring entries "|||||")])
- (map
- (match-lambda
- ((pmid text)
- (blank-node
- (set gn:geneWikiEntry (annotate-field text '^^xsd:string))
- (set gn:pubMedId (ontology 'pubmed: pmid))))
- (_ (display (string-split-substring ncbi-comments "::::"))
- (error "error")))
+ (set dct:created (annotate-field
+ createtime
+ '^^xsd:datetime))
+ (set foaf:homepage weburl))))
 (map
 (cut string-split-substring <> "::::") ; split each entry into its fields
- ncbi-comments))))
- ))
+ comments))))))
+
+;; NCBI GeneRIF entries, dumped separately from the GeneNetwork ones.
+;;
+;; Reads GeneRIF_BASIC grouped by (GeneId, comment, createtime).  Each
+;; group attaches one blank node to the generif:<GeneId> subject via
+;; gn:geneWikiEntryOfNCBI, carrying the comment text, the PubMed sources
+;; collected for the group, and the creation time.
+(define-dump dump-ncbi-genewiki-entries
+  (tables (GeneRIF_BASIC)
+          "GROUP BY GeneId, comment, createtime")
+  (schema-triples
+   ;; Spelled exactly like the predicate emitted below.  IRIs are
+   ;; case-sensitive, so the previous "...ofNCBI" spelling declared a
+   ;; domain for a predicate that never appears in the dump.
+   (gn:geneWikiEntryOfNCBI rdfs:domain gn:geneWikiEntry))
+  (triples (ontology 'generif:
+                     (field GeneRIF_BASIC GeneId))
+    (set gn:geneWikiEntryOfNCBI
+         (blank-node
+          (set gn:geneWikiEntry
+               (annotate-field (field GeneRIF_BASIC comment)
+                               '^^xsd:string))
+          ;; GROUP_CONCAT joins the group's PubMed ids with its default
+          ;; "," separator; an empty token is passed through as "".
+          (multiset dct:source (map (lambda (el) (if (string-null? el)
+                                                     ""
+                                                     (ontology 'pubmed: el)))
+                                    (string-split (field ("GROUP_CONCAT(PubMed_ID)" pmids))
+                                                  #\,)))
+          ;; NOTE(review): the XSD datatype is spelled xsd:dateTime; the
+          ;; lower-case form is kept here only to stay consistent with
+          ;; dump-gn-genewiki-entries -- fix both together.
+          (set dct:created (annotate-field (time-unix->string
+                                            (field GeneRIF_BASIC createtime) "~5")
+                                           '^^xsd:datetime))))))
;; Main function
@@ -960,6 +968,7 @@ is a <table> object."
(prefix "owl:" "<http://www.w3.org/2002/07/owl#>")
(newline)
(dump-gn-genewiki-entries db)
+ (dump-ncbi-genewiki-entries db)
(dump-species db)
(dump-strain db)
(dump-mapping-method db)