aboutsummaryrefslogtreecommitdiff
path: root/examples/generif.scm
diff options
context:
space:
mode:
author Munyoki Kilyungi 2023-08-23 17:06:46 +0300
committer Munyoki Kilyungi 2023-08-23 18:02:19 +0300
commit 51f6fa8064ab5ac7d1db0b8dc1edabfb8381c3ce (patch)
tree b4d316cb8f909edd69260b976b12de60b2ef0df8 /examples/generif.scm
parent 200231f282c5e20a4dd1789adec070f87d437532 (diff)
download gn-transform-databases-51f6fa8064ab5ac7d1db0b8dc1edabfb8381c3ce.tar.gz
Update how gn-specific genewiki entries are transformed
Each comment is identified using a unique UUID so that we can identify its metadata, i.e. time, author, symbol, etc.
Diffstat (limited to 'examples/generif.scm')
-rwxr-xr-x examples/generif.scm 173
1 files changed, 111 insertions, 62 deletions
diff --git a/examples/generif.scm b/examples/generif.scm
index 170cf0c..e960104 100755
--- a/examples/generif.scm
+++ b/examples/generif.scm
@@ -3,13 +3,41 @@
(use-modules (srfi srfi-1)
(srfi srfi-26)
+ (rnrs bytevectors)
(ice-9 getopt-long)
(ice-9 match)
(ice-9 regex)
(transform strings)
(transform sql)
(transform triples)
- (transform special-forms))
+ (transform special-forms)
+ (transform uuid))
+
+
+
+(define (remap-species-identifiers str)
+ "This procedure remaps identifiers to standard binominal. Obviously this should
+ be sorted by correcting the database!"
+ (match str
+ ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"]
+ ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"]
+ ["Macaca mulatta" "Macaca nemestrina"]
+ ["Bat (Glossophaga soricina)" "Glossophaga soricina"]
+ [str str]))
+
+(define (fix-email-id email)
+ (string-delete #\space email))
+
+(define (investigator-attributes->id first-name last-name email)
+ ;; There is just one record corresponding to "Evan Williams" which
+ ;; does not have an email ID. To accommodate that record, we
+ ;; construct the investigator ID from not just the email ID, but
+ ;; also the first and the last names. It would be preferable to just
+ ;; find Evan Williams' email ID and insert it into the database.
+ (string->identifier "investigator"
+ (string-join
+ (list first-name last-name (fix-email-id email))
+ "_")))
@@ -23,79 +51,30 @@
(gnt:taxid rdfs:domain gn-term:geneWikiEntry))
(triples (ontology 'generif: (field GeneRIF_BASIC GeneId))
(multiset gnt:symbol (string-split (field ("GROUP_CONCAT(DISTINCT symbol)" symbol))
- #\,))
+ #\,))
(multiset gnt:wikiEntryOfSpecies
(string-split
(field ("GROUP_CONCAT(DISTINCT Species.SpeciesName)" species))
#\,))
(multiset gnt:taxId (map (cut ontology 'ncbiTaxon: <>)
- (string-split (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId))
- #\,)))))
+ (string-split (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId))
+ #\,)))))
(define-transformer gn-genewiki-entries
(tables (GeneRIF
- (left-join GeneRIF_BASIC "USING (symbol)")
(left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId")
(left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
- (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id"))
- "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 GROUP BY GeneRIF.symbol")
+ (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")
+ (left-join Investigators "ON Investigators.Email = GeneRIF.email"))
+ "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, GeneRIF.createtime")
(schema-triples
- (gnt:geneWikiEntry a rdfs:Class)
- (gnt:geneWikiEntry a owl:Class)
- (gnt:geneWikiEntry rdfs:comment "Represents GeneRIF Entries")
- (gnt:geneCategory rdfs:domain gn:geneWikiEntry)
- (gnt:geneWikiEntryOfGn rdfs:domain gn:geneWikiEntry)
- (gnt:geneWikiEntry rdfs:domain gn:geneWikiEntry))
+ (gnc:GeneWikiEntry a rdfs:Class)
+ (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
+ (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork")
+ (gnt:belongsToCategory rdfs:domain gnc:GNWikiEntry)
+ (gnt:belongsToSpecies rdfs:domain gnc:GNWikiEntry)
+ (gnt:symbol rdfs:domain gnc:GNWikiEntry))
(triples
- (let ([geneid (field GeneRIF_BASIC GeneId)])
- (if (eq? geneid 0)
- (ontology 'gnt:anonSymbol_
- (field GeneRIF symbol))
- (ontology 'generif:
- geneid)))
- (set rdf:type
- (if (string-null? (field ("IFNULL(GeneRIF_BASIC.GeneId, '')" geneWikiEntryP)))
- ""
- 'gn:geneWikiEntry))
- (set gnt:wikiEntryOfSpecies
- (string->binomial-name (field Species FullName)))
- ;; This only transforms symbols not present in the GeneRIF_BASIC table
- (set gnt:symbol (let ([geneid (field GeneRIF_BASIC GeneId)])
- (if (eq? geneid 0)
- (field GeneRIF symbol)
- "")))
- (multiset gnt:geneWikiEntryOfGn
- (let* ([entries
- (sanitize-rdf-string
- (field
- ("GROUP_CONCAT(DISTINCT CONCAT_WS('::::', IFNULL(GeneCategory.Name, ''), IFNULL(GeneRIF.PubMed_ID, ''), GeneRIF.email, CAST(CONVERT(BINARY CONVERT(GeneRIF.comment USING latin1) USING utf8) AS VARCHAR(15000)), GeneRIF.createtime, IFNULL(weburl, '')) SEPARATOR';;;;;')"
- wikientry)))]
- [comments (string-split-substring entries ";;;;;")])
- (map
- (match-lambda
- ((genecategory pmid email text createtime weburl)
- (blank-node
- (set gnt:geneCategory genecategory)
- (multiset dct:source
- (map (lambda (el) (if (string-null? el)
- ""
- (ontology 'pubmed: el)))
- (string-split pmid #\space)))
- (set dct:creator (regexp-substitute/global #f "@.*$"
- email
- 'pre
- ""
- 'post))
- (set gnt:geneWikiEntry
- (annotate-field text '^^xsd:string))
- (set dct:created (annotate-field
- createtime
- '^^xsd:datetime))
- (set foaf:homepage weburl))))
- (map
- (cut string-split-substring <> "::::")
- comments))))))
-
(define-transformer ncbi-genewiki-entries
(tables (GeneRIF_BASIC)
"GROUP BY GeneId, comment, createtime")
@@ -116,6 +95,76 @@
(set dct:created (annotate-field (time-unix->string
(field GeneRIF_BASIC createtime) "~5")
'^^xsd:datetime))))))
+ (string->identifier
+ "generif"
+ (make-version-3-uuid
+ (u8-list->bytevector
+ ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8
+ '(107 167 184 17 157 173 17 209 128 180
+ 0 192 79 212 48 200))
+ (format #f "~a~a~a~a"
+ (field Species FullName)
+ (field GeneRIF comment)
+ (field GeneRIF symbol)
+ (field GeneRIF createtime))
+ ""))
+ (string->identifier
+ "generif"
+ (make-version-3-uuid
+ (u8-list->bytevector
+ ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8
+ '(107 167 184 17 157 173 17 209 128 180
+ 0 192 79 212 48 200))
+ (format #f "~a~a~a~a"
+ (field Species FullName)
+ (field GeneRIF comment)
+ (field GeneRIF symbol)
+ (field GeneRIF createtime))
+ ""))
+ (set rdf:type 'gnc:GNWikiEntry)
+ (set rdfs:label (sanitize-rdf-string (field GeneRIF comment)))
+ (set gnt:symbol (field GeneRIF symbol))
+ (multiset gnt:belongsToCategory
+ (remove-duplicates
+ (string-split-substring
+ (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')"
+ GeneCategory))
+ "$$")))
+ (set gnt:belongsToSpecies
+ (string->identifier
+ ""
+ (remap-species-identifiers (field Species Fullname))
+ #:separator ""
+ #:proc string-capitalize-first))
+ (multiset dct:references
+ (map (lambda (x)
+ (ontology 'pubmed: x))
+ (string-split
+ (let ((pmid (field
+ ("IFNULL(GeneRIF.PubMed_ID, '')"
+ PubMed_ID))))
+ (if (number? pmid)
+ (number->string pmid)
+ pmid))
+ #\space)))
+ (set dct:created
+ (let ((createtime (field GeneRIF createtime)))
+ (if (string? createtime)
+ ""
+ (annotate-field
+ (time-unix->string
+ createtime "~5")
+ '^^xsd:datetime))))
+ (set dct:creator
+ (if (and (not (string-null?
+ (string-trim-both (field GeneRIF email))))
+ (not (string-null? (field Investigators Email))))
+ (investigator-attributes->id
+ (field Investigators FirstName)
+ (field Investigators LastName)
+ (field Investigators Email))
+ ""))
+ (set foaf:homepage (field GeneRIF weburl))))