From e7aafff4225965effcf7415d31c558a78b7c0cca Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Mon, 27 Mar 2023 22:52:49 +0300 Subject: Redefine how genewiki is dumped * dump.scm (dump-generif, dump-generif-basic): Replace with ... (dump-gn-genewiki-entries): ... this. Signed-off-by: Munyoki Kilyungi --- dump.scm | 151 +++++++++++++++++++++++++-------------------------------------- 1 file changed, 59 insertions(+), 92 deletions(-) diff --git a/dump.scm b/dump.scm index 9a8cc07..1a1d900 100755 --- a/dump.scm +++ b/dump.scm @@ -859,105 +859,72 @@ is a object." (set gn:binomialName (field InbredSet fullName)) (set gn:species (field Species Name)))) -(define-dump dump-generif - (tables (GeneRIF - (left-join Species "USING (SpeciesId)") - (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id") - (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")) - "WHERE GeneRIF.display > 0") - (schema-triples - (gn:versionId rdfs:range rdfs:Literal) - (gn:symbol rdfs:range rdfs:Literal) - (gn:pubMedId rdfs:range rdfs:Literal) - (gn:geneRIFOfSpecies rdfs:range gn:species) - (gn:comment rdfs:range rdfs:Literal) - (gn:weburl rdfs:range rdfs:Literal) - (gn:createTime rdfs:range xsd:datetime) - (gn:createTime rdfs:range rdfs:Literal) - (gn:reason rdfs:range rdfs:Literal) - (gn:geneRIFOFGenenetwork rdfs:range gn:geneRIF) - (gn:geneCategory rdfs:range gn:geneRIF) - (gn:initial rdfs:range rdfs:Literal)) - (triples (string->identifier - "geneRIF" - (number->string (field GeneRIF Id))) - (set rdf:type 'gn:geneRIFOfGenenetwork) - (set gn:versionId (field GeneRIF versionId)) - (set gn:symbol (field GeneRIF symbol)) - (set gn:geneCategory (field GeneCategory Name)) - (set gn:pubMedId (field GeneRIF PubMed_ID)) - (set gn:geneRIFOfSpecies - (binomial-name->species-id - (field Species FullName))) - (set gn:comment - (format #f "(~a) (~a) ~a" - (time-unix->string (field GeneRIF createtime) "~5") - (field GeneRIF email) - (replace-substrings - (field GeneRIF comment) - '(("\xa0" . " ") - ("â\x81„" . "/") - ("â€\x9d" . #\") - ("’" . #\') - ("\x02" . "") - ("\x01" . "") - ("β" . "β") - ("α-Â\xad" . "α") - ("Â\xad" . "") - ("α" . "α") - ("–" . "-"))))) - (set gn:createTime - (annotate-field - (time-unix->string - (field GeneRIF createtime) "~5") - '^^xsd:datetime)) - (set gn:weburl (field GeneRIF weburl)) - (set gn:reason (field GeneRIF reason)) - (set gn:initial (field GeneRIF initial)))) - -;; GeneRIF data from NCBI -(define-dump dump-generif-basic +;; GeneRIF metadata +(define-dump dump-gn-genewiki-entries (tables (GeneRIF_BASIC - (left-join Species "USING (SpeciesId)")) - "GROUP BY SpeciesId, symbol, GeneId, VersionId") + (left-join GeneRIF "USING (symbol)") + (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id") + (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")) + "WHERE GeneRIF.display > 0 and GeneRIF.VersionId = 0 AND GeneRIF.Id != 2322 GROUP BY GeneRIF.symbol") (schema-triples - (gn:taxId rdfs:range rdfs:Literal) - (gn:geneId rdfs:range rdfs:Literal) - (gn:pubMedId rdfs:range rdfs:Literal) - (pubmed:pmid rdfs:range rdfs:Literal) - (gn:comment rdfs:range rdfs:Literal) - (gn:symbol rdfs:range rdfs:Literal) - (gn:geneRIFOfSpecies rdfs:range gn:species) - (gn:versionId rdfs:range rdfs:Literal)) - (triples - (string->identifier - "geneRIF" - (number->string (field GeneRIF_BASIC GeneId))) - (set rdf:type 'gn:geneRIFOfNcbi) - (set gn:geneRIFOfSpecies - (binomial-name->species-id - (field Species FullName))) - (set gn:taxId (ontology 'taxon: (field GeneRIF_BASIC TaxID))) - (set gn:geneId (ontology 'generif: (field GeneRIF_BASIC GeneId))) - (set gn:symbol (field GeneRIF_BASIC symbol)) - (set gn:comment (field GeneRIF_BASIC comment)) - (multiset gn:pubMedId - (map (compose - (cut ontology 'pubmed: <>) - string-trim) - (string-split (field GeneRIF_BASIC - PubMed_ID - GROUP_CONCAT - PubMedID) - #\,))) - (set gn:versionId (field GeneRIF_BASIC VersionId)))) - + (gn:geneWikiEntryOfGN rdfs:domain gn:geneWiki) + (gn:weburl rdfs:domain gn:geneWiki) + (gn:versionId rdfs:domain gn:geneWiki) + (gn:category rdfs:domain gn:geneWiki) + (gn:pubMedId rdfs:domain rdfs:Literal) + (gn:createTime rdfs:range xsd:datetime)) + (triples (ontology 'generif: + (field GeneRIF_BASIC GeneId)) + (set rdf:type 'gn:geneWikiEntry) + (set gn:symbol (field GeneRIF symbol)) + (multiset gn:geneWikiEntryOfGn + (let* ([entries (replace-substrings (field + ("GROUP_CONCAT(DISTINCT CONCAT_WS('::::', IFNULL(GeneCategory.Name, ''), IFNULL(GeneRIF.PubMed_ID, ''), GeneRIF.email, GeneRIF.comment, GeneRIF.createtime, IFNULL(weburl, '')) SEPARATOR';;;;;')" + wikientry)) + '(("\x28" . "") + ("\x29" . "") + ("\xa0" . " ") + ("â\x81„" . "/") + ("â€\x9d" . #\") + ("’" . #\') + ("\x02" . "") + ("\x01" . "") + ("β" . "β") + ("α-Â\xad" . "α") + ("Â\xad" . "") + ("α" . "α") + ("–" . "-"))) + ] + [comments (string-split-substring entries ";;;;;")]) + (map + (match-lambda + ;; annotate pubmed id properly + ((genecategory pmid email text createtime weburl) + (blank-node + (gn:category genecategory) + (multiset + gn:pubMedId + (string-split + (ontology 'pubmed: pmid) + #\space)) + ;; TODO: Truncate mail to '@' + (gn:email email) + (gn:comment + (annotate-field text '^^xsd:string)) + (gn:createTime (annotate-field + createtime + ;; (time-unix->string + ;; createtime) + '^^xsd:datetime)) + (gn:weburl weburl)))) + (map + (cut string-split-substring <> "::::") + comments)))))) ;; Import GeneRIF ;; Download GeneRIF data from ;; https://ftp.ncbi.nih.gov/gene/GeneRIF/generifs_basic.gz - (define decode-html-entities (cut regexp-substitute/global #f -- cgit v1.2.3