aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMunyoki Kilyungi2023-03-27 22:52:49 +0300
committerBonfaceKilz2023-04-05 16:17:11 +0300
commite7aafff4225965effcf7415d31c558a78b7c0cca (patch)
tree490d8c58072e15c6896f217072fa0257ddd4a7e5
parent510a6056fa73b537481abca9b8027675b3d03f94 (diff)
downloadgn-transform-databases-e7aafff4225965effcf7415d31c558a78b7c0cca.tar.gz
Redefine how genewiki is dumped
* dump.scm (dump-generif, dump-generif-basic): Replace with ... (dump-gn-genewiki-entries): ... this. Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-xdump.scm151
1 files changed, 59 insertions, 92 deletions
diff --git a/dump.scm b/dump.scm
index 9a8cc07..1a1d900 100755
--- a/dump.scm
+++ b/dump.scm
@@ -859,105 +859,72 @@ is a <table> object."
(set gn:binomialName (field InbredSet fullName))
(set gn:species (field Species Name))))
-(define-dump dump-generif
- (tables (GeneRIF
- (left-join Species "USING (SpeciesId)")
- (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
- (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id"))
- "WHERE GeneRIF.display > 0")
- (schema-triples
- (gn:versionId rdfs:range rdfs:Literal)
- (gn:symbol rdfs:range rdfs:Literal)
- (gn:pubMedId rdfs:range rdfs:Literal)
- (gn:geneRIFOfSpecies rdfs:range gn:species)
- (gn:comment rdfs:range rdfs:Literal)
- (gn:weburl rdfs:range rdfs:Literal)
- (gn:createTime rdfs:range xsd:datetime)
- (gn:createTime rdfs:range rdfs:Literal)
- (gn:reason rdfs:range rdfs:Literal)
- (gn:geneRIFOFGenenetwork rdfs:range gn:geneRIF)
- (gn:geneCategory rdfs:range gn:geneRIF)
- (gn:initial rdfs:range rdfs:Literal))
- (triples (string->identifier
- "geneRIF"
- (number->string (field GeneRIF Id)))
- (set rdf:type 'gn:geneRIFOfGenenetwork)
- (set gn:versionId (field GeneRIF versionId))
- (set gn:symbol (field GeneRIF symbol))
- (set gn:geneCategory (field GeneCategory Name))
- (set gn:pubMedId (field GeneRIF PubMed_ID))
- (set gn:geneRIFOfSpecies
- (binomial-name->species-id
- (field Species FullName)))
- (set gn:comment
- (format #f "(~a) (~a) ~a"
- (time-unix->string (field GeneRIF createtime) "~5")
- (field GeneRIF email)
- (replace-substrings
- (field GeneRIF comment)
- '(("\xa0" . " ")
- ("â\x81„" . "/")
- ("â€\x9d" . #\")
- ("’" . #\')
- ("\x02" . "")
- ("\x01" . "")
- ("β" . "β")
- ("α-Â\xad" . "α")
- ("Â\xad" . "")
- ("α" . "α")
- ("–" . "-")))))
- (set gn:createTime
- (annotate-field
- (time-unix->string
- (field GeneRIF createtime) "~5")
- '^^xsd:datetime))
- (set gn:weburl (field GeneRIF weburl))
- (set gn:reason (field GeneRIF reason))
- (set gn:initial (field GeneRIF initial))))
-
-;; GeneRIF data from NCBI
-(define-dump dump-generif-basic
+;; GeneRIF metadata
+(define-dump dump-gn-genewiki-entries
(tables (GeneRIF_BASIC
- (left-join Species "USING (SpeciesId)"))
- "GROUP BY SpeciesId, symbol, GeneId, VersionId")
+ (left-join GeneRIF "USING (symbol)")
+ (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
+ (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id"))
+ "WHERE GeneRIF.display > 0 and GeneRIF.VersionId = 0 AND GeneRIF.Id != 2322 GROUP BY GeneRIF.symbol")
(schema-triples
- (gn:taxId rdfs:range rdfs:Literal)
- (gn:geneId rdfs:range rdfs:Literal)
- (gn:pubMedId rdfs:range rdfs:Literal)
- (pubmed:pmid rdfs:range rdfs:Literal)
- (gn:comment rdfs:range rdfs:Literal)
- (gn:symbol rdfs:range rdfs:Literal)
- (gn:geneRIFOfSpecies rdfs:range gn:species)
- (gn:versionId rdfs:range rdfs:Literal))
- (triples
- (string->identifier
- "geneRIF"
- (number->string (field GeneRIF_BASIC GeneId)))
- (set rdf:type 'gn:geneRIFOfNcbi)
- (set gn:geneRIFOfSpecies
- (binomial-name->species-id
- (field Species FullName)))
- (set gn:taxId (ontology 'taxon: (field GeneRIF_BASIC TaxID)))
- (set gn:geneId (ontology 'generif: (field GeneRIF_BASIC GeneId)))
- (set gn:symbol (field GeneRIF_BASIC symbol))
- (set gn:comment (field GeneRIF_BASIC comment))
- (multiset gn:pubMedId
- (map (compose
- (cut ontology 'pubmed: <>)
- string-trim)
- (string-split (field GeneRIF_BASIC
- PubMed_ID
- GROUP_CONCAT
- PubMedID)
- #\,)))
- (set gn:versionId (field GeneRIF_BASIC VersionId))))
-
+ (gn:geneWikiEntryOfGN rdfs:domain gn:geneWiki)
+ (gn:weburl rdfs:domain gn:geneWiki)
+ (gn:versionId rdfs:domain gn:geneWiki)
+ (gn:category rdfs:domain gn:geneWiki)
+ (gn:pubMedId rdfs:domain rdfs:Literal)
+ (gn:createTime rdfs:range xsd:datetime))
+ (triples (ontology 'generif:
+ (field GeneRIF_BASIC GeneId))
+ (set rdf:type 'gn:geneWikiEntry)
+ (set gn:symbol (field GeneRIF symbol))
+ (multiset gn:geneWikiEntryOfGn
+ (let* ([entries (replace-substrings (field
+ ("GROUP_CONCAT(DISTINCT CONCAT_WS('::::', IFNULL(GeneCategory.Name, ''), IFNULL(GeneRIF.PubMed_ID, ''), GeneRIF.email, GeneRIF.comment, GeneRIF.createtime, IFNULL(weburl, '')) SEPARATOR';;;;;')"
+ wikientry))
+ '(("\x28" . "")
+ ("\x29" . "")
+ ("\xa0" . " ")
+ ("â\x81„" . "/")
+ ("â€\x9d" . #\")
+ ("’" . #\')
+ ("\x02" . "")
+ ("\x01" . "")
+ ("β" . "β")
+ ("α-Â\xad" . "α")
+ ("Â\xad" . "")
+ ("α" . "α")
+ ("–" . "-")))
+ ]
+ [comments (string-split-substring entries ";;;;;")])
+ (map
+ (match-lambda
+ ;; annotate pubmed id properly
+ ((genecategory pmid email text createtime weburl)
+ (blank-node
+ (gn:category genecategory)
+ (multiset
+ gn:pubMedId
+ (string-split
+ (ontology 'pubmed: pmid)
+ #\space))
+ ;; TODO: Truncate mail to '@'
+ (gn:email email)
+ (gn:comment
+ (annotate-field text '^^xsd:string))
+ (gn:createTime (annotate-field
+ createtime
+ ;; (time-unix->string
+ ;; createtime)
+ '^^xsd:datetime))
+ (gn:weburl weburl))))
+ (map
+ (cut string-split-substring <> "::::")
+ comments))))))
;; Import GeneRIF
;; Download GeneRIF data from
;; https://ftp.ncbi.nih.gov/gene/GeneRIF/generifs_basic.gz
-
(define decode-html-entities
(cut regexp-substitute/global
#f