about | summary | refs | log | tree | commit | diff
path: root/examples/generif.scm
diff options
context:
space:
mode:
author: Munyoki Kilyungi, 2023-08-30 18:05:46 +0300
committer: Munyoki Kilyungi, 2023-08-30 18:16:10 +0300
commit: b212b91f7f0454d64c86b85693d37783a42d5bc3 (patch)
tree: b043930a7cff3e88973b94989df54f7139478b03 /examples/generif.scm
parent: f3ede362e1d7d00022a6f9f74d7ca304014f07fe (diff)
download: gn-transform-databases-b212b91f7f0454d64c86b85693d37783a42d5bc3.tar.gz
Remodel how GeneRIF metadata is transformed

* examples/generif.scm: Import (ice-9 format).
(genewiki-symbols): Transform symbols and their names only. This way there is no need to transform the symbol names in the other transformers, thereby preventing duplication.
(gn-genewiki-entries): Use format strings to create the comment blank-node.
(ncbi-genewiki-entries): Ditto.

Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
Diffstat (limited to 'examples/generif.scm')
-rwxr-xr-x  examples/generif.scm  254
1 file changed, 135 insertions, 119 deletions
diff --git a/examples/generif.scm b/examples/generif.scm
index a5930ad..89e7fce 100755
--- a/examples/generif.scm
+++ b/examples/generif.scm
@@ -4,6 +4,7 @@
(use-modules (srfi srfi-1)
(srfi srfi-26)
(rnrs bytevectors)
+ (ice-9 format)
(ice-9 getopt-long)
(ice-9 match)
(ice-9 regex)
@@ -42,23 +43,17 @@
(define-transformer genewiki-symbols
- (tables (GeneRIF_BASIC
- (left-join Species "USING (SpeciesId)"))
- "GROUP BY GeneId ORDER BY BINARY symbol")
- (schema-triples
- (gnt:symbol rdfs:domain gnc:NCBIWikiEntry))
- (triples (ontology 'generif: (field GeneRIF_BASIC GeneId))
- (multiset gnt:symbol (string-split (field ("GROUP_CONCAT(DISTINCT symbol)" symbol))
- #\,))
- (multiset xkos:classifiedUnder
- (string-split
- (field ("GROUP_CONCAT(DISTINCT Species.SpeciesName)" species))
- #\,))
- (multiset dct:relation
- (map
- (cut ontology 'ncbiTaxon: <>)
- (string-split (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId))
- #\,)))))
+ (tables (GeneRIF_BASIC)
+ "GROUP BY BINARY symbol")
+ (triples
+ (string->identifier
+ "symbol"
+ (regexp-substitute/global #f "[^A-Za-z0-9:]"
+ (field GeneRIF_BASIC symbol)
+ 'pre "_" 'post)
+ #:proc (lambda (x) x))
+ (set rdfs:label
+ (field GeneRIF_BASIC symbol))))
(define-transformer gn-genewiki-entries
(tables (GeneRIF
@@ -66,115 +61,136 @@
(left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
(left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")
(left-join Investigators "ON Investigators.Email = GeneRIF.email"))
- "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, GeneRIF.createtime")
+ "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, BINARY GeneRIF.symbol")
(schema-triples
(gnc:GeneWikiEntry a rdfs:Class)
(gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
(gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork")
(gnt:symbol rdfs:domain gnc:GNWikiEntry))
(triples
- (string->identifier
- "generif"
- (make-version-3-uuid
- (u8-list->bytevector
- ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8
- '(107 167 184 17 157 173 17 209 128 180
- 0 192 79 212 48 200))
- (format #f "~a~a~a~a"
- (field Species FullName)
- (field GeneRIF comment)
- (field GeneRIF symbol)
- (field GeneRIF createtime))
- ""))
- (set rdf:type 'gnc:GNWikiEntry)
- (set rdfs:label (sanitize-rdf-string (field GeneRIF comment)))
- (set gnt:symbol (field GeneRIF symbol))
- (multiset gnt:belongsToCategory
- (remove-duplicates
- (string-split-substring
- (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')"
- GeneCategory))
- "$$")))
- (set xkos:classifiedUnder
- (string->identifier
- ""
- (remap-species-identifiers (field Species Fullname))
- #:separator ""
- #:proc string-capitalize-first))
- (multiset dct:references
- (map (lambda (x)
- (ontology 'pubmed: x))
- (string-split
- (let ((pmid (field
- ("IFNULL(GeneRIF.PubMed_ID, '')"
- PubMed_ID))))
- (if (number? pmid)
- (number->string pmid)
- pmid))
- #\space)))
- (set dct:created
- (let ((createtime (field GeneRIF createtime)))
- (if (string? createtime)
- ""
- (annotate-field
- (time-unix->string
- createtime "~5")
- '^^xsd:datetime))))
- (set dct:creator
- (if (and (not (string-null?
- (string-trim-both (field GeneRIF email))))
- (not (string-null? (field Investigators Email))))
- (investigator-attributes->id
- (field Investigators FirstName)
- (field Investigators LastName)
- (field Investigators Email))
- ""))
- (set foaf:homepage (field GeneRIF weburl))))
+ (string->identifier
+ "symbol"
+ (regexp-substitute/global
+ #f "[^A-Za-z0-9:]"
+ (field GeneRIF symbol)
+ 'pre "_" 'post)
+ #:proc (lambda (x) x))
+ (set rdfs:comment
+ (let* ([generif-comment (sanitize-rdf-string (field GeneRIF comment))]
+ [create-time (field GeneRIF createtime EntryCreateTime)]
+ [pmid (field GeneRIF PubMed_ID PMID)]
+ [web-url (field GeneRIF weburl)]
+ [species (string->identifier
+ ""
+ (remap-species-identifiers (field Species Fullname))
+ #:separator ""
+ #:proc string-capitalize-first)]
+ [categories
+ (remove (lambda (x)
+ (or (eq? x #f)
+ (and (string? x)
+ (string-null? x))))
+ (remove-duplicates
+ (string-split-substring
+ (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')"
+ GeneCategory))
+ "$$")))])
+ (string->symbol
+ (string-append
+ "[ "
+ (format #f "rdf:type gnc:GNWikiEntry ; ")
+ (if (string? species)
+ ""
+ (format #f "xkos:classifiedUnder ~a ; "
+ species))
+ (format #f "rdfs:comment ~s^^xsd:string ; "
+ generif-comment)
+ (if (string? create-time)
+ ""
+ (format #f "dct:created ~s^^xsd:datetime ; "
+ (time-unix->string
+ create-time "~5")))
+ (if (and (string? pmid) (not (string-null? pmid)))
+ (format #f
+ "~{dct:references pubmed:~a ; ~}"
+ (string-split pmid #\space))
+ "")
+ (if (and (not (string-null?
+ (string-trim-both (field GeneRIF email))))
+ (not (string-null? (field Investigators Email))))
+ (format #f "dct:creator ~a ; "
+ (investigator-attributes->id
+ (field Investigators FirstName)
+ (field Investigators LastName)
+ (field Investigators Email)))
+ "")
+ (if (not (null? categories))
+ (format #f
+ "~{gnt:belongsToCategory ~s ; ~}"
+ categories)
+ "")
+ (if (and (string? web-url) (not (string-null? web-url)))
+ (format #f "foaf:homepage ~s ; "
+ web-url)
+ "")
+ " ] "))))))
(define-transformer ncbi-genewiki-entries
- (tables (GeneRIF_BASIC)
- "GROUP BY GeneId, comment, createtime")
- (schema-triples
- (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
- (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI"))
- (triples
- (string->identifier
- "generif"
- (make-version-3-uuid
- (u8-list->bytevector
- ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8
- '(107 167 184 17 157 173 17 209 128 180
- 0 192 79 212 48 200))
- (format #f "~a~a~a~a~a"
- (field GeneRIF_BASIC GeneId)
- (field GeneRIF_BASIC VersionId)
- (field GeneRIF_BASIC comment)
- (field GeneRIF_BASIC symbol)
- (field GeneRIF_BASIC createtime))
- ""))
- (set rdf:type 'gnc:NCBIWikiEntry)
- (set rdfs:label
- (annotate-field (field GeneRIF_BASIC comment)
- '^^xsd:string))
- (set gnt:symbol (field GeneRIF_BASIC symbol))
- (multiset dct:references
- (map
- (lambda (el)
- (if (string-null? el)
- ""
- (ontology 'pubmed: el)))
- (string-split (field ("GROUP_CONCAT(PubMed_ID)" pmids))
- #\,)))
- (set gnt:hasVersion
- (format #f "~a" (field GeneRIF_BASIC VersionId)))
- (set dct:created
- (let ((createtime (field GeneRIF_BASIC createtime)))
- (if (string? createtime)
- ""
- (annotate-field
- (time-unix->string
- createtime "~5")
- '^^xsd:datetime))))))
+ (tables (GeneRIF_BASIC
+ (left-join Species "USING (SpeciesId)"))
+ "WHERE GeneRIF_BASIC.comment IS NOT NULL GROUP BY GeneRIF_BASIC.comment, GeneRIF_BASIC.createtime, GeneRIF_BASIC.VersionId, GeneRIF_BASIC.SpeciesId, GeneRIF_BASIC.TaxID")
+ (schema-triples
+ (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
+ (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI")
+ (gnt:hasGeneId a owl:ObjectProperty)
+ (gnt:hasGeneId rdfs:domain gnc:NCBIWikiEntry)
+ (gnt:hasGeneId skos:definition "The GeneId of this this resource")
+ (gnt:hasVersionId a owl:ObjectProperty)
+ (gnt:hasVersionId rdfs:domain gnc:NCBIWikiEntry)
+ (gnt:hasVersionId skos:definition "The VersionId of this this resource"))
+ (triples
+ (string->identifier
+ "symbol"
+ (regexp-substitute/global #f "[^A-Za-z0-9:]"
+ (field GeneRIF_BASIC symbol)
+ 'pre "_" 'post)
+ #:proc (lambda (x) x))
+ (set rdfs:comment
+ (let* ([ncbi-comment (field GeneRIF_BASIC comment)]
+ [species
+ (string->identifier
+ ""
+ (remap-species-identifiers (field Species Fullname))
+ #:separator ""
+ #:proc string-capitalize-first)]
+ [taxonomic-id (field GeneRIF_BASIC TaxID)]
+ [create-time (field GeneRIF_BASIC createtime EntryCreateTime)]
+ [pmid (field GeneRIF_BASIC PubMed_ID PMID)])
+ (string->symbol
+ (string-append
+ "[ "
+ (format #f "rdf:type gnc:NCBIWikiEntry ; ")
+ (format #f "xkos:classifiedUnder ~a ; "
+ species)
+ (if (eq? #f taxonomic-id)
+ ""
+ (format #f "skos:notation taxon:~a ; "
+ (field GeneRIF_BASIC TaxID)))
+ (format #f "gnt:hasGeneId generif:~a ; "
+ (field GeneRIF_BASIC GeneId))
+ (format #f "gnt:hasVersionId '~a'^^xsd:integer ; "
+ (field GeneRIF_BASIC VersionId))
+ (if (and (string? pmid) (not (string-null? pmid)))
+ (format #f
+ "~{dct:references pubmed:~a ; ~}"
+ (string-split pmid #\space))
+ "")
+ (if (string? create-time)
+ ""
+ (format #f "dct:created ~s^^xsd:datetime ; "
+ (time-unix->string
+ create-time "~5")))
+ " ]"))))))
@@ -205,7 +221,7 @@
("dct:" "<http://purl.org/dc/terms/>")
("foaf:" "<http://xmlns.com/foaf/0.1/>")
("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
- ("ncbiTaxon:" "<https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=>")
+ ("taxon:" "<https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=>")
("generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
("owl:" "<http://www.w3.org/2002/07/owl#>")))