From 51f6fa8064ab5ac7d1db0b8dc1edabfb8381c3ce Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Wed, 23 Aug 2023 17:06:46 +0300 Subject: Update how gn-specific genewiki entries are transformed Each comment is identified using a unique uuid so that we can identify its metadata, i.e. time, author, symbol, etc. --- examples/generif.scm | 173 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 111 insertions(+), 62 deletions(-) (limited to 'examples/generif.scm') diff --git a/examples/generif.scm b/examples/generif.scm index 170cf0c..e960104 100755 --- a/examples/generif.scm +++ b/examples/generif.scm @@ -3,13 +3,41 @@ (use-modules (srfi srfi-1) (srfi srfi-26) + (rnrs bytevectors) (ice-9 getopt-long) (ice-9 match) (ice-9 regex) (transform strings) (transform sql) (transform triples) - (transform special-forms)) + (transform special-forms) + (transform uuid)) + + + +(define (remap-species-identifiers str) + "This procedure remaps identifiers to standard binominal. Obviously this should + be sorted by correcting the database!" + (match str + ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] + ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] + ["Macaca mulatta" "Macaca nemestrina"] + ["Bat (Glossophaga soricina)" "Glossophaga soricina"] + [str str])) + +(define (fix-email-id email) + (string-delete #\space email)) + +(define (investigator-attributes->id first-name last-name email) + ;; There is just one record corresponding to "Evan Williams" which + ;; does not have an email ID. To accommodate that record, we + ;; construct the investigator ID from not just the email ID, but + ;; also the first and the last names. It would be preferable to just + ;; find Evan Williams' email ID and insert it into the database. 
+ (string->identifier "investigator" + (string-join + (list first-name last-name (fix-email-id email)) + "_"))) @@ -23,79 +51,30 @@ (gnt:taxid rdfs:domain gn-term:geneWikiEntry)) (triples (ontology 'generif: (field GeneRIF_BASIC GeneId)) (multiset gnt:symbol (string-split (field ("GROUP_CONCAT(DISTINCT symbol)" symbol)) - #\,)) + #\,)) (multiset gnt:wikiEntryOfSpecies (string-split (field ("GROUP_CONCAT(DISTINCT Species.SpeciesName)" species)) #\,)) (multiset gnt:taxId (map (cut ontology 'ncbiTaxon: <>) - (string-split (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId)) - #\,))))) + (string-split (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId)) + #\,))))) (define-transformer gn-genewiki-entries (tables (GeneRIF - (left-join GeneRIF_BASIC "USING (symbol)") (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId") (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id") - (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")) - "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 GROUP BY GeneRIF.symbol") + (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id") + (left-join Investigators "ON Investigators.Email = GeneRIF.email")) + "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, GeneRIF.createtime") (schema-triples - (gnt:geneWikiEntry a rdfs:Class) - (gnt:geneWikiEntry a owl:Class) - (gnt:geneWikiEntry rdfs:comment "Represents GeneRIF Entries") - (gnt:geneCategory rdfs:domain gn:geneWikiEntry) - (gnt:geneWikiEntryOfGn rdfs:domain gn:geneWikiEntry) - (gnt:geneWikiEntry rdfs:domain gn:geneWikiEntry)) + (gnc:GeneWikiEntry a rdfs:Class) + (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry) + (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork") + (gnt:belongsToCategory rdfs:domain gnc:GNWikiEntry) + (gnt:belongsToSpecies rdfs:domain gnc:GNWikiEntry) + (gnt:symbol rdfs:domain gnc:GNWikiEntry)) (triples - (let ([geneid (field 
GeneRIF_BASIC GeneId)]) - (if (eq? geneid 0) - (ontology 'gnt:anonSymbol_ - (field GeneRIF symbol)) - (ontology 'generif: - geneid))) - (set rdf:type - (if (string-null? (field ("IFNULL(GeneRIF_BASIC.GeneId, '')" geneWikiEntryP))) - "" - 'gn:geneWikiEntry)) - (set gnt:wikiEntryOfSpecies - (string->binomial-name (field Species FullName))) - ;; This only transforms symbols not present in the GeneRIF_BASIC table - (set gnt:symbol (let ([geneid (field GeneRIF_BASIC GeneId)]) - (if (eq? geneid 0) - (field GeneRIF symbol) - ""))) - (multiset gnt:geneWikiEntryOfGn - (let* ([entries - (sanitize-rdf-string - (field - ("GROUP_CONCAT(DISTINCT CONCAT_WS('::::', IFNULL(GeneCategory.Name, ''), IFNULL(GeneRIF.PubMed_ID, ''), GeneRIF.email, CAST(CONVERT(BINARY CONVERT(GeneRIF.comment USING latin1) USING utf8) AS VARCHAR(15000)), GeneRIF.createtime, IFNULL(weburl, '')) SEPARATOR';;;;;')" - wikientry)))] - [comments (string-split-substring entries ";;;;;")]) - (map - (match-lambda - ((genecategory pmid email text createtime weburl) - (blank-node - (set gnt:geneCategory genecategory) - (multiset dct:source - (map (lambda (el) (if (string-null? 
el) - "" - (ontology 'pubmed: el))) - (string-split pmid #\space))) - (set dct:creator (regexp-substitute/global #f "@.*$" - email - 'pre - "" - 'post)) - (set gnt:geneWikiEntry - (annotate-field text '^^xsd:string)) - (set dct:created (annotate-field - createtime - '^^xsd:datetime)) - (set foaf:homepage weburl)))) - (map - (cut string-split-substring <> "::::") - comments)))))) - (define-transformer ncbi-genewiki-entries (tables (GeneRIF_BASIC) "GROUP BY GeneId, comment, createtime") @@ -116,6 +95,76 @@ (set dct:created (annotate-field (time-unix->string (field GeneRIF_BASIC createtime) "~5") '^^xsd:datetime)))))) + (string->identifier + "generif" + (make-version-3-uuid + (u8-list->bytevector + ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8 + '(107 167 184 17 157 173 17 209 128 180 + 0 192 79 212 48 200)) + (format #f "~a~a~a~a" + (field Species FullName) + (field GeneRIF comment) + (field GeneRIF symbol) + (field GeneRIF createtime)) + "")) + (string->identifier + "generif" + (make-version-3-uuid + (u8-list->bytevector + ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8 + '(107 167 184 17 157 173 17 209 128 180 + 0 192 79 212 48 200)) + (format #f "~a~a~a~a" + (field Species FullName) + (field GeneRIF comment) + (field GeneRIF symbol) + (field GeneRIF createtime)) + "")) + (set rdf:type 'gnc:GNWikiEntry) + (set rdfs:label (sanitize-rdf-string (field GeneRIF comment))) + (set gnt:symbol (field GeneRIF symbol)) + (multiset gnt:belongsToCategory + (remove-duplicates + (string-split-substring + (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')" + GeneCategory)) + "$$"))) + (set gnt:belongsToSpecies + (string->identifier + "" + (remap-species-identifiers (field Species Fullname)) + #:separator "" + #:proc string-capitalize-first)) + (multiset dct:references + (map (lambda (x) + (ontology 'pubmed: x)) + (string-split + (let ((pmid (field + ("IFNULL(GeneRIF.PubMed_ID, '')" + PubMed_ID)))) + (if (number? 
pmid) + (number->string pmid) + pmid)) + #\space))) + (set dct:created + (let ((createtime (field GeneRIF createtime))) + (if (string? createtime) + "" + (annotate-field + (time-unix->string + createtime "~5") + '^^xsd:datetime)))) + (set dct:creator + (if (and (not (string-null? + (string-trim-both (field GeneRIF email)))) + (not (string-null? (field Investigators Email)))) + (investigator-attributes->id + (field Investigators FirstName) + (field Investigators LastName) + (field Investigators Email)) + "")) + (set foaf:homepage (field GeneRIF weburl)))) -- cgit v1.2.3