aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Munyoki Kilyungi 2024-10-03 23:00:03 +0300
committer Munyoki Kilyungi 2024-10-08 13:26:39 +0300
commit 53ab3481a430de39fa110ad3760660e14c720987 (patch)
tree 3ae90198189bb4aa9ecde554dff440d2714f5bf7
parent 7460e6733b9bd672befeb254a7c3644a758d9864 (diff)
download gn-transform-databases-53ab3481a430de39fa110ad3760660e14c720987.tar.gz
Re-model NCBI transform.
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-x examples/generif.scm 91
1 file changed, 39 insertions, 52 deletions
diff --git a/examples/generif.scm b/examples/generif.scm
index 1dfd224..7a60214 100755
--- a/examples/generif.scm
+++ b/examples/generif.scm
@@ -86,61 +86,48 @@ GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol")
(define-transformer ncbi-genewiki-entries
(tables (GeneRIF_BASIC
- (left-join Species "USING (SpeciesId)"))
- "WHERE GeneRIF_BASIC.comment IS NOT NULL AND TRIM(GeneRIF_BASIC.comment) != '' AND TRIM(GeneRIF_BASIC.symbol) != '' GROUP BY GeneRIF_BASIC.comment, GeneRIF_BASIC.createtime, GeneRIF_BASIC.VersionId, GeneRIF_BASIC.SpeciesId, GeneRIF_BASIC.TaxID")
+ (left-join Species "USING (SpeciesId)")))
(schema-triples
(gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
- (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI")
- (gnt:hasVersionId a owl:ObjectProperty)
- (gnt:hasVersionId rdfs:domain gnc:NCBIWikiEntry)
- (gnt:hasVersionId skos:definition "The VersionId of this this resource"))
+ (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI"))
(triples
- (string->identifier
- "symbol"
- (regexp-substitute/global #f "[^A-Za-z0-9:]"
- (field GeneRIF_BASIC symbol GeneRIFSymbol)
- 'pre "_" 'post)
- #:proc (lambda (x) x))
- (set rdfs:comment
- (let ([ncbi-comment (sanitize-rdf-string (field GeneRIF_BASIC comment))]
- [species-name
- (string->identifier
- ""
- (remap-species-identifiers (field Species Fullname SpeciesFullName))
- #:separator ""
- #:proc string-capitalize-first)]
- [taxonomic-id (field GeneRIF_BASIC TaxID TaxonomicId)]
- [create-time (field GeneRIF_BASIC createtime EntryCreateTime)]
- [pmid (field GeneRIF_BASIC PubMed_ID PMID)]
- [gene-id (field GeneRIF_BASIC GeneId)]
- [version-id (field GeneRIF_BASIC VersionId)])
- (string->symbol
- (string-append
- "[ "
- (format #f "rdf:type gnc:NCBIWikiEntry ; ")
- (format #f "rdfs:comment ~s^^xsd:string ; "
- ncbi-comment)
- (format #f "gnt:belongsToSpecies ~a ; "
- species-name)
- (if (eq? #f taxonomic-id)
- ""
- (format #f "skos:notation taxon:~a ; "
- taxonomic-id))
- (format #f "gnt:hasGeneId generif:~a ; "
- gene-id)
- (format #f "dct:hasVersion '~a'^^xsd:int ; "
- version-id)
- (if (and (string? pmid) (not (string-null? pmid)))
- (format #f
- "~{dct:references pubmed:~a ; ~}"
- (string-split pmid #\space))
- "")
- (if (string? create-time)
- ""
- (format #f "dct:created ~s^^xsd:datetime ; "
- (time-unix->string
- create-time "~5")))
- " ]"))))))
+ (format
+ #f "gn:rif-~a-~a-~a-~a"
+ (field GeneRIF_BASIC GeneId)
+ (field GeneRIF_BASIC PubMed_ID)
+ (field
+ ("DATE_FORMAT(createtime, '%Y-%m-%dT%T')" CreateTime))
+ (field GeneRIF_BASIC VersionId))
+ (set rdf:type 'gnc:NCBIWikiEntry)
+ (set gnt:symbol (field GeneRIF_BASIC symbol))
+ (set rdfs:label
+ (let* ((comment
+ (format #f "'~a'@en"
+ (replace-substrings
+ (field GeneRIF_BASIC comment)
+ '(("\\" . "\\\\")
+ ("\n" . "\\n")
+ ("\r" . "\\r")
+ ("'" . "\\'"))))))
+ (string->symbol comment)))
+ (set dct:created
+ (string->symbol
+ (format #f "~s^^xsd:datetime "
+ (field
+ ("CAST(createtime AS CHAR)" EntryCreateTime)))))
+ (set gnt:belongsToSpecies (string->identifier
+ ""
+ (remap-species-identifiers (field Species Fullname))
+ #:separator ""
+ #:proc string-capitalize-first))
+ (set gnt:hasGeneId (string->symbol (format #f "generif:~a" (field GeneRIF_BASIC GeneId))))
+ (set skos:notation (match (field GeneRIF_BASIC TaxID TaxonomicId)
+ ((? number? x)
+ (string->symbol (format #f "taxon:~a" x)))
+ (else "")))
+ (set dct:hasVersion (annotate-field (format #f "~s" (field GeneRIF_BASIC versionId))
+ '^^xsd:integer))
+ (set dct:references (string->symbol (format #f "pubmed:~a" (field GeneRIF_BASIC PubMed_ID))))))