From b212b91f7f0454d64c86b85693d37783a42d5bc3 Mon Sep 17 00:00:00 2001
From: Munyoki Kilyungi
Date: Wed, 30 Aug 2023 18:05:46 +0300
Subject: Remodel how GeneRIF metadata is transformed

* examples/generif.scm: Import (ice-9 format).
(genewiki-symbols): Transform symbols and their names only.  This way
there's no need to transform the symbol names in the other names
thereby preventing duplication.
(gn-genewiki-entries): Use format strings to create the comment
blank-node.
(ncbi-genewiki-entries): Ditto.

Signed-off-by: Munyoki Kilyungi
---
 examples/generif.scm | 284 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 165 insertions(+), 119 deletions(-)

(limited to 'examples/generif.scm')

diff --git a/examples/generif.scm b/examples/generif.scm
index a5930ad..89e7fce 100755
--- a/examples/generif.scm
+++ b/examples/generif.scm
@@ -4,6 +4,7 @@
 (use-modules (srfi srfi-1)
              (srfi srfi-26)
              (rnrs bytevectors)
+             (ice-9 format)
              (ice-9 getopt-long)
              (ice-9 match)
              (ice-9 regex)
@@ -42,23 +43,22 @@
 
 
 (define-transformer genewiki-symbols
-  (tables (GeneRIF_BASIC
-           (left-join Species "USING (SpeciesId)"))
-          "GROUP BY GeneId ORDER BY BINARY symbol")
-  (schema-triples
-   (gnt:symbol rdfs:domain gnc:NCBIWikiEntry))
-  (triples (ontology 'generif: (field GeneRIF_BASIC GeneId))
-    (multiset gnt:symbol (string-split (field ("GROUP_CONCAT(DISTINCT symbol)" symbol))
-                                       #\,))
-    (multiset xkos:classifiedUnder
-              (string-split
-               (field ("GROUP_CONCAT(DISTINCT Species.SpeciesName)" species))
-               #\,))
-    (multiset dct:relation
-              (map
-               (cut ontology 'ncbiTaxon: <>)
-               (string-split (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId))
-                             #\,)))))
+  ;; One resource per symbol group ("GROUP BY BINARY symbol"), labelled with
+  ;; the raw symbol.  Entry-level metadata is attached to these same
+  ;; identifiers by gn-genewiki-entries and ncbi-genewiki-entries.
+  (tables (GeneRIF_BASIC)
+          "GROUP BY BINARY symbol")
+  (triples
+      ;; Replace every character outside [A-Za-z0-9:] with "_" before
+      ;; handing the symbol to string->identifier.
+      (string->identifier
+       "symbol"
+       (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                 (field GeneRIF_BASIC symbol)
+                                 'pre "_" 'post)
+       #:proc (lambda (x) x))
+    (set rdfs:label
+         (field GeneRIF_BASIC symbol))))
 
 (define-transformer gn-genewiki-entries
   (tables (GeneRIF
@@ -66,115 +66,161 @@
           (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
           (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")
           (left-join Investigators "ON Investigators.Email = GeneRIF.email"))
-          "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, GeneRIF.createtime")
+          "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, BINARY GeneRIF.symbol")
   (schema-triples
    (gnc:GeneWikiEntry a rdfs:Class)
    (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
    (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork")
    (gnt:symbol rdfs:domain gnc:GNWikiEntry))
   (triples
-      (string->identifier
-       "generif"
-       (make-version-3-uuid
-        (u8-list->bytevector
-         ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8
-         '(107 167 184 17 157 173 17 209 128 180
-               0 192 79 212 48 200))
-        (format #f "~a~a~a~a"
-                (field Species FullName)
-                (field GeneRIF comment)
-                (field GeneRIF symbol)
-                (field GeneRIF createtime))
-        ""))
-    (set rdf:type 'gnc:GNWikiEntry)
-    (set rdfs:label (sanitize-rdf-string (field GeneRIF comment)))
-    (set gnt:symbol (field GeneRIF symbol))
-    (multiset gnt:belongsToCategory
-              (remove-duplicates
-               (string-split-substring
-                (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')"
-                        GeneCategory))
-                "$$")))
-    (set xkos:classifiedUnder
-         (string->identifier
-          ""
-          (remap-species-identifiers (field Species Fullname))
-          #:separator ""
-          #:proc string-capitalize-first))
-    (multiset dct:references
-              (map (lambda (x)
-                     (ontology 'pubmed: x))
-                   (string-split
-                    (let ((pmid (field
-                                 ("IFNULL(GeneRIF.PubMed_ID, '')"
-                                  PubMed_ID))))
-                      (if (number? pmid)
-                          (number->string pmid)
-                          pmid))
-                    #\space)))
-    (set dct:created
-         (let ((createtime (field GeneRIF createtime)))
-           (if (string? createtime)
-               ""
-               (annotate-field
-                (time-unix->string
-                 createtime "~5")
-                '^^xsd:datetime))))
-    (set dct:creator
-         (if (and (not (string-null?
-                        (string-trim-both (field GeneRIF email))))
-                  (not (string-null? (field Investigators Email))))
-             (investigator-attributes->id
-              (field Investigators FirstName)
-              (field Investigators LastName)
-              (field Investigators Email))
-             ""))
-    (set foaf:homepage (field GeneRIF weburl))))
+   ;; Subject: the gene symbol with every character outside
+   ;; [A-Za-z0-9:] replaced by "_", the same form genewiki-symbols uses.
+   (string->identifier
+    "symbol"
+    (regexp-substitute/global
+     #f "[^A-Za-z0-9:]"
+     (field GeneRIF symbol)
+     'pre "_" 'post)
+    #:proc (lambda (x) x))
+   ;; Object: the text of a "[ ... ]" blank node built with format
+   ;; strings; optional parts contribute "" when their value is absent.
+   (set rdfs:comment
+        (let* ([generif-comment (sanitize-rdf-string (field GeneRIF comment))]
+               [create-time (field GeneRIF createtime EntryCreateTime)]
+               ;; The query may hand back PubMed_ID as a number (the code
+               ;; this replaces checked for that); normalise to a string so
+               ;; the references below are not silently dropped.
+               [pmid (let ([raw-pmid (field GeneRIF PubMed_ID PMID)])
+                       (if (number? raw-pmid)
+                           (number->string raw-pmid)
+                           raw-pmid))]
+               [web-url (field GeneRIF weburl)]
+               [species (string->identifier
+                         ""
+                         (remap-species-identifiers (field Species Fullname))
+                         #:separator ""
+                         #:proc string-capitalize-first)]
+               [categories
+                (remove (lambda (x)
+                          (or (eq? x #f)
+                              (and (string? x)
+                                   (string-null? x))))
+                        (remove-duplicates
+                         (string-split-substring
+                          (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')"
+                                  GeneCategory))
+                          "$$")))])
+          (string->symbol
+           (string-append
+            "[ "
+            (format #f "rdf:type gnc:GNWikiEntry ; ")
+            (if (string? species)
+                ""
+                (format #f "xkos:classifiedUnder ~a ; "
+                        species))
+            (format #f "rdfs:comment ~s^^xsd:string ; "
+                    generif-comment)
+            (if (string? create-time)
+                ""
+                (format #f "dct:created ~s^^xsd:datetime ; "
+                        (time-unix->string
+                         create-time "~5")))
+            (if (and (string? pmid) (not (string-null? pmid)))
+                (format #f
+                        "~{dct:references pubmed:~a ; ~}"
+                        (string-split pmid #\space))
+                "")
+            (if (and (not (string-null?
+                           (string-trim-both (field GeneRIF email))))
+                     (not (string-null? (field Investigators Email))))
+                (format #f "dct:creator ~a ; "
+                        (investigator-attributes->id
+                         (field Investigators FirstName)
+                         (field Investigators LastName)
+                         (field Investigators Email)))
+                "")
+            (if (not (null? categories))
+                (format #f
+                        "~{gnt:belongsToCategory ~s ; ~}"
+                        categories)
+                "")
+            (if (and (string? web-url) (not (string-null? web-url)))
+                (format #f "foaf:homepage ~s ; "
+                        web-url)
+                "")
+            " ] "))))))
 
 (define-transformer ncbi-genewiki-entries
-  (tables (GeneRIF_BASIC)
-          "GROUP BY GeneId, comment, createtime")
-  (schema-triples
-   (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
-   (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI"))
-  (triples
-      (string->identifier
-       "generif"
-       (make-version-3-uuid
-        (u8-list->bytevector
-         ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8
-         '(107 167 184 17 157 173 17 209 128 180
-               0 192 79 212 48 200))
-        (format #f "~a~a~a~a~a"
-                (field GeneRIF_BASIC GeneId)
-                (field GeneRIF_BASIC VersionId)
-                (field GeneRIF_BASIC comment)
-                (field GeneRIF_BASIC symbol)
-                (field GeneRIF_BASIC createtime))
-        ""))
-    (set rdf:type 'gnc:NCBIWikiEntry)
-    (set rdfs:label
-         (annotate-field (field GeneRIF_BASIC comment)
-                         '^^xsd:string))
-    (set gnt:symbol (field GeneRIF_BASIC symbol))
-    (multiset dct:references
-              (map
-               (lambda (el)
-                 (if (string-null? el)
-                     ""
-                     (ontology 'pubmed: el)))
-               (string-split (field ("GROUP_CONCAT(PubMed_ID)" pmids))
-                             #\,)))
-    (set gnt:hasVersion
-         (format #f "~a" (field GeneRIF_BASIC VersionId)))
-    (set dct:created
-         (let ((createtime (field GeneRIF_BASIC createtime)))
-           (if (string? createtime)
-               ""
-               (annotate-field
-                (time-unix->string
-                 createtime "~5")
-                '^^xsd:datetime))))))
+  (tables (GeneRIF_BASIC
+           (left-join Species "USING (SpeciesId)"))
+          "WHERE GeneRIF_BASIC.comment IS NOT NULL GROUP BY GeneRIF_BASIC.comment, GeneRIF_BASIC.createtime, GeneRIF_BASIC.VersionId, GeneRIF_BASIC.SpeciesId, GeneRIF_BASIC.TaxID")
+  (schema-triples
+   (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
+   (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI")
+   (gnt:hasGeneId a owl:ObjectProperty)
+   (gnt:hasGeneId rdfs:domain gnc:NCBIWikiEntry)
+   (gnt:hasGeneId skos:definition "The GeneId of this resource")
+   (gnt:hasVersionId a owl:ObjectProperty)
+   (gnt:hasVersionId rdfs:domain gnc:NCBIWikiEntry)
+   (gnt:hasVersionId skos:definition "The VersionId of this resource"))
+  (triples
+   (string->identifier
+    "symbol"
+    (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                              (field GeneRIF_BASIC symbol)
+                              'pre "_" 'post)
+    #:proc (lambda (x) x))
+   ;; Object: the text of a "[ ... ]" blank node built with format
+   ;; strings, in the same shape gn-genewiki-entries produces.
+   (set rdfs:comment
+        (let* ([ncbi-comment (field GeneRIF_BASIC comment)]
+               [species
+                (string->identifier
+                 ""
+                 (remap-species-identifiers (field Species Fullname))
+                 #:separator ""
+                 #:proc string-capitalize-first)]
+               [taxonomic-id (field GeneRIF_BASIC TaxID)]
+               [create-time (field GeneRIF_BASIC createtime EntryCreateTime)]
+               ;; Normalise a numeric PubMed_ID to a string so the
+               ;; dct:references clause below still fires for it.
+               [pmid (let ([raw-pmid (field GeneRIF_BASIC PubMed_ID PMID)])
+                       (if (number? raw-pmid)
+                           (number->string raw-pmid)
+                           raw-pmid))])
+          (string->symbol
+           (string-append
+            "[ "
+            (format #f "rdf:type gnc:NCBIWikiEntry ; ")
+            ;; Same guard as gn-genewiki-entries: the LEFT JOIN can leave
+            ;; Species columns empty, so skip the triple when species is a string.
+            (if (string? species)
+                ""
+                (format #f "xkos:classifiedUnder ~a ; "
+                        species))
+            ;; The entry text itself; the WHERE clause guarantees it is
+            ;; not NULL.
+            (format #f "rdfs:comment ~s^^xsd:string ; "
+                    ncbi-comment)
+            (if (eq? #f taxonomic-id)
+                ""
+                (format #f "skos:notation taxon:~a ; "
+                        taxonomic-id))
+            (format #f "gnt:hasGeneId generif:~a ; "
+                    (field GeneRIF_BASIC GeneId))
+            (format #f "gnt:hasVersionId '~a'^^xsd:integer ; "
+                    (field GeneRIF_BASIC VersionId))
+            (if (and (string? pmid) (not (string-null? pmid)))
+                (format #f
+                        "~{dct:references pubmed:~a ; ~}"
+                        (string-split pmid #\space))
+                "")
+            (if (string? create-time)
+                ""
+                (format #f "dct:created ~s^^xsd:datetime ; "
+                        (time-unix->string
+                         create-time "~5")))
+            " ]"))))))
 
 
 
@@ -205,7 +251,7 @@
      ("dct:" "")
      ("foaf:" "")
      ("pubmed:" "")
-     ("ncbiTaxon:" "")
+     ("taxon:" "")
      ("generif:" "")
      ("xsd:" "")
      ("owl:" "")))
-- 
cgit v1.2.3