From 685b9a6bfc273c8060a1d0d0daae8cc4e294c75b Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Fri, 6 Sep 2024 23:55:36 +0300 Subject: Use predicateObject Lists with a blank-node to model GeneRIF. Signed-off-by: Munyoki Kilyungi --- examples/generif.scm | 120 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 48 deletions(-) diff --git a/examples/generif.scm b/examples/generif.scm index 894b766..8dcc201 100755 --- a/examples/generif.scm +++ b/examples/generif.scm @@ -50,8 +50,7 @@ (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id") (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")) "WHERE GeneRIF.display > 0 AND GeneRIF.comment IS NOT NULL -GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol, GeneRIF.SpeciesId, -GeneRIF.createtime, GeneRIF.reason") +GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol") (schema-triples (gnc:GeneWikiEntry a rdfs:Class) (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry) @@ -63,53 +62,78 @@ GeneRIF.createtime, GeneRIF.reason") (gnt:reason skos:definition "The reason why this resource was modified") (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork") (gnt:geneSymbol rdfs:domain gnc:GNWikiEntry)) - ;; Here we use the Id and VersionId to uniquely identify comments. - ;; We could use blank-nodes here; however, querying blank nodes - ;; E.g. getting the latest versionId is very complicated. Prefer - ;; normal triplets over blank-nodes. + ;; We want to avoid manually generating a unique identifier for each + ;; comment. As such we use a blank node (that has the comment) as + ;; the subject of the triples produced by matching the + ;; predicateObjectList production: + ;; (triples - (format #f "gn:wiki-~a-~a" - (field GeneRIF Id) - (field GeneRIF versionId)) - (set rdfs:comment (sanitize-rdf-string (field GeneRIF comment))) - (set rdf:type 'gnc:GNWikiEntry) - (set gnt:symbol - (string->identifier - "symbol" - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field GeneRIF symbol) - 'pre "_" 'post) - #:proc (lambda (x) x))) - (set dct:created - (let ((create-time (field GeneRIF createtime EntryCreateTime))) - (if (string? create-time) - "" - (annotate-field - (time-unix->string - create-time - "~5") - '^^xsd:datetime)))) - (multiset dct:references - (string-split (field GeneRIF PubMed_ID PMID) - #\space)) - (set foaf:homepage (field GeneRIF weburl)) - (set gnt:belongsToSpecies (string->identifier - "" - (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) - (set dct:hasVersion (annotate-field (format #f "~s" (field GeneRIF versionId)) - '^^xsd:int)) - (set dct:identifier (annotate-field (format #f "~s" (field GeneRIF Id)) - '^^xsd:int)) - (set gnt:initial (sanitize-rdf-string (field GeneRIF initial))) - (set gnt:reason (field GeneRIF reason)) - (set foaf:mbox (sanitize-rdf-string (field GeneRIF email))) - (multiset gnt:belongsToCategory - (string-split - (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR ';')" - GeneCategory)) - #\;)))) + (format #f "[ rdfs:comment '''~a'''@en] " + (field GeneRIF comment)) + (set rdf:type + (let* ((create-time (field + ("CAST(createtime AS CHAR)" EntryCreateTime))) + (pmid (field GeneRIF PubMed_ID PMID)) + (web-url (field GeneRIF weburl)) + (species (string->identifier + "" + (remap-species-identifiers (field Species Fullname)) + #:separator "" + #:proc string-capitalize-first)) + (version-id (field GeneRIF versionId)) + (identifier (field GeneRIF Id)) + (initial (sanitize-rdf-string (field GeneRIF initial))) + (reason (field GeneRIF reason)) + (email (sanitize-rdf-string (field GeneRIF email))) + (category + (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR ';')" + GeneCategory)))) + (string->symbol + (string-append + (format #f "gnc:GNWikiEntry ;\n") + (if (string? species) + "" + (format #f "\tgnt:belongsToSpecies ~a ;\n" + species)) + (format #f "\tdct:created ~s^^xsd:datetime ;\n" + create-time) + (if (and (string? pmid) (not (string-null? pmid))) + (format #f + "\tdct:references ( ~{pubmed:~a ~}) ;\n" + (string-split pmid #\space)) + "\tdct:references rdf:nil ;\n") + (if (string-blank? email) + "" + (format #f "\tfoaf:mbox <~a> ;\n" email)) + (format #f "\tdct:identifier \"~s\"^^xsd:integer ;\n" identifier) + (if (and (string? web-url) (not (string-null? web-url))) + (format #f "\tfoaf:homepage <~a> ;\n" + web-url) + "") + (format #f "\tdct:hasVersion \"~s\"^^xsd:integer ;\n" version-id) + (if (or (null? initial) + (string-blank? initial)) + "" (format #f "\tgnt:initial ~s ;\n" initial)) + (if (string-blank? reason) + "" + (format #f "\tgnt:reason ~s ;\n" reason)) + (if (string-blank? category) + "\tgnt:belongsToCategory rdf:nil ;\n" + (format #f + "\tgnt:belongsToCategory ( ~{~s ~}) ;\n" + (string-split category #\;))) + ;; We have this symbol at the very end of this transform + ;; because we have a strong guarantee that it will be a + ;; non-null value hence always terminating this triple + ;; properly with a "." + (format + #f "\tgnt:symbol ~a" + (string->identifier + "symbol" + (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field GeneRIF symbol) + 'pre "_" 'post) + #:proc (lambda (x) x))))))))) (define-transformer ncbi-genewiki-entries (tables (GeneRIF_BASIC -- cgit v1.2.3