aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-x examples/generif.scm 159
-rwxr-xr-x examples/probeset.scm 10
-rwxr-xr-x generate-ttl-files.scm 60
3 files changed, 139 insertions, 90 deletions
diff --git a/examples/generif.scm b/examples/generif.scm
index fb3208a..894b766 100755
--- a/examples/generif.scm
+++ b/examples/generif.scm
@@ -16,22 +16,6 @@
-(define (fix-email-id email)
- (string-delete #\space email))
-
-(define (investigator-attributes->id first-name last-name email)
- ;; There is just one record corresponding to "Evan Williams" which
- ;; does not have an email ID. To accommodate that record, we
- ;; construct the investigator ID from not just the email ID, but
- ;; also the first and the last names. It would be preferable to just
- ;; find Evan Williams' email ID and insert it into the database.
- (string->identifier "investigator"
- (string-join
- (list first-name last-name (fix-email-id email))
- "_")))
-
-
-
(define-transformer genewiki-symbols
(tables (GeneRIF_BASIC)
"GROUP BY BINARY symbol")
@@ -45,85 +29,87 @@
(set rdfs:label
(field GeneRIF_BASIC symbol))))
+;; Some symbols exist in the GeneRIF table that don't exist in the
+;; GeneRIF_BASIC table; this transformer sets rdfs:label for those.
+(define-transformer generif-symbols
+ (tables (GeneRIF)
+ "WHERE symbol NOT IN (SELECT symbol from GeneRIF_BASIC) GROUP BY BINARY symbol")
+ (triples
+ (string->identifier
+ "symbol"
+ (regexp-substitute/global #f "[^A-Za-z0-9:]" ; match each char outside A-Z, a-z, 0-9 and ":"
+ (field GeneRIF symbol)
+ 'pre "_" 'post) ; ... and replace every such match with "_"
+ #:proc (lambda (x) x)) ; #:proc is given the identity procedure
+ (set rdfs:label
+ (field GeneRIF symbol)))) ; the label is the symbol field itself, not the substituted form
+
(define-transformer gn-genewiki-entries
(tables (GeneRIF
(left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId")
(left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
- (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")
- (left-join Investigators "ON Investigators.Email = GeneRIF.email"))
- "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, BINARY GeneRIF.symbol")
+ (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id"))
+ "WHERE GeneRIF.display > 0 AND GeneRIF.comment IS NOT NULL
+GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol, GeneRIF.SpeciesId,
+GeneRIF.createtime, GeneRIF.reason")
(schema-triples
(gnc:GeneWikiEntry a rdfs:Class)
(gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
+ (gnt:initial a owl:ObjectProperty)
+ (gnt:initial rdfs:domain gnc:GeneWikiEntry)
+ (gnt:initial skos:definition "Optional user or project code or your initials")
+ (gnt:reason a owl:ObjectProperty)
+ (gnt:reason rdfs:domain gnc:GeneWikiEntry)
+ (gnt:reason skos:definition "The reason why this resource was modified")
(gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork")
(gnt:geneSymbol rdfs:domain gnc:GNWikiEntry))
+ ;; Here we use the Id and versionId to uniquely identify comments.
+ ;; We could use blank nodes here; however, querying blank nodes
+ ;; (e.g. getting the latest versionId) is very complicated, so we
+ ;; prefer normal triples over blank nodes.
(triples
- (string->identifier
- "symbol"
- (regexp-substitute/global
- #f "[^A-Za-z0-9:]"
- (field GeneRIF symbol)
- 'pre "_" 'post)
- #:proc (lambda (x) x))
- (set rdfs:comment
- (let* ([generif-comment (sanitize-rdf-string (field GeneRIF comment))]
- [create-time (field GeneRIF createtime EntryCreateTime)]
- [pmid (field GeneRIF PubMed_ID PMID)]
- [web-url (field GeneRIF weburl)]
- [species (string->identifier
- ""
- (remap-species-identifiers (field Species Fullname))
- #:separator ""
- #:proc string-capitalize-first)]
- [categories
- (remove (lambda (x)
- (or (eq? x #f)
- (and (string? x)
- (string-null? x))))
- (remove-duplicates
- (string-split-substring
- (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')"
- GeneCategory))
- "$$")))])
- (string->symbol
- (string-append
- "[ "
- (format #f "rdf:type gnc:GNWikiEntry ; ")
- (if (string? species)
- ""
- (format #f "gnt:belongsToSpecies ~a ; "
- species))
- (format #f "rdfs:comment ~s^^xsd:string ; "
- generif-comment)
- (if (string? create-time)
- ""
- (format #f "dct:created ~s^^xsd:datetime ; "
- (time-unix->string
- create-time "~5")))
- (if (and (string? pmid) (not (string-null? pmid)))
- (format #f
- "~{dct:references pubmed:~a ; ~}"
- (string-split pmid #\space))
- "")
- (if (and (not (string-null?
- (string-trim-both (field GeneRIF email))))
- (not (string-null? (field Investigators Email))))
- (format #f "dct:creator ~a ; "
- (investigator-attributes->id
- (field Investigators FirstName)
- (field Investigators LastName)
- (field Investigators Email)))
- "")
- (if (not (null? categories))
- (format #f
- "~{gnt:belongsToCategory ~s ; ~}"
- categories)
- "")
- (if (and (string? web-url) (not (string-null? web-url)))
- (format #f "foaf:homepage ~s ; "
- web-url)
- "")
- " ] "))))))
+ (format #f "gn:wiki-~a-~a"
+ (field GeneRIF Id)
+ (field GeneRIF versionId))
+ (set rdfs:comment (sanitize-rdf-string (field GeneRIF comment)))
+ (set rdf:type 'gnc:GNWikiEntry)
+ (set gnt:symbol
+ (string->identifier
+ "symbol"
+ (regexp-substitute/global #f "[^A-Za-z0-9:]"
+ (field GeneRIF symbol)
+ 'pre "_" 'post)
+ #:proc (lambda (x) x)))
+ (set dct:created
+ (let ((create-time (field GeneRIF createtime EntryCreateTime)))
+ (if (string? create-time)
+ ""
+ (annotate-field
+ (time-unix->string
+ create-time
+ "~5")
+ '^^xsd:datetime))))
+ (multiset dct:references
+ (string-split (field GeneRIF PubMed_ID PMID)
+ #\space))
+ (set foaf:homepage (field GeneRIF weburl))
+ (set gnt:belongsToSpecies (string->identifier
+ ""
+ (remap-species-identifiers (field Species Fullname))
+ #:separator ""
+ #:proc string-capitalize-first))
+ (set dct:hasVersion (annotate-field (format #f "~s" (field GeneRIF versionId))
+ '^^xsd:int))
+ (set dct:identifier (annotate-field (format #f "~s" (field GeneRIF Id))
+ '^^xsd:int))
+ (set gnt:initial (sanitize-rdf-string (field GeneRIF initial)))
+ (set gnt:reason (field GeneRIF reason))
+ (set foaf:mbox (sanitize-rdf-string (field GeneRIF email)))
+ (multiset gnt:belongsToCategory
+ (string-split
+ (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR ';')"
+ GeneCategory))
+ #\;))))
(define-transformer ncbi-genewiki-entries
(tables (GeneRIF_BASIC
@@ -169,7 +155,7 @@
taxonomic-id))
(format #f "gnt:hasGeneId generif:~a ; "
gene-id)
- (format #f "gnt:hasVersionId '~a'^^xsd:integer ; "
+ (format #f "dct:hasVersion '~a'^^xsd:int ; "
version-id)
(if (and (string? pmid) (not (string-null? pmid)))
(format #f
@@ -219,6 +205,7 @@
(inputs
(list
genewiki-symbols
+ generif-symbols
gn-genewiki-entries
ncbi-genewiki-entries))
(outputs
diff --git a/examples/probeset.scm b/examples/probeset.scm
index 9f694af..caf81aa 100755
--- a/examples/probeset.scm
+++ b/examples/probeset.scm
@@ -79,8 +79,10 @@
(set gnt:hasTargetId
(field ("NULLIF(TRIM(ProbeSet.TargetId), '')"
TargetId)))
- (set gnt:geneSymbol
- (field ProbeSet Symbol))
+ (multiset gnt:geneSymbol
+ (map string-trim (string-split
+ (field ProbeSet Symbol)
+ #\,)))
(set dct:description (sanitize-rdf-string (field ProbeSet description)))
(set gnt:targetsRegion
(sanitize-rdf-string
@@ -119,8 +121,7 @@
"on the minus strand")
(else "")))))))
(set gnt:hasGeneId
- (ontology 'gene:
- (string-trim-both (field ProbeSet GeneId))))
+ (field ProbeSet GeneId))
;; OMIM Link
(set dct:references
(let ((omim (field ProbeSet OMIM)))
@@ -191,6 +192,7 @@
("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
("dct:" "<http://purl.org/dc/terms/>")
+ ("uniprot:" "<http://purl.uniprot.org/uniprot/>")
("owl:" "<http://www.w3.org/2002/07/owl#>")
("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
("qb:" "<http://purl.org/linked-data/cube#>")
diff --git a/generate-ttl-files.scm b/generate-ttl-files.scm
new file mode 100755
index 0000000..65db03f
--- /dev/null
+++ b/generate-ttl-files.scm
@@ -0,0 +1,60 @@
+#! ./pre-inst-env
+!#
+(use-modules (ice-9 format)
+ (ice-9 futures)
+ (ice-9 getopt-long)
+ (ice-9 ftw))
+
+(let* ((option-spec
+        ;; All three options are mandatory; getopt-long rejects a missing one.
+        '((settings (single-char #\s) (value #t) (required? #t))
+          (output (single-char #\o) (value #t) (required? #t))
+          (documentation (single-char #\d) (value #t) (required? #t))))
+       (options (getopt-long (command-line) option-spec))
+       (settings (option-ref options 'settings #f))
+       (output (option-ref options 'output #f))
+       (documentation (option-ref options 'documentation #f)))
+  ;; Descend into every directory except version control metadata.
+  (define (enter? name stat result)
+    (not (member (basename name) '(".git" ".svn" "CVS"))))

+  ;; For each ".scm" file, start a future that runs it with the given
+  ;; settings and BASE.ttl / BASE.md targets, and cons that future onto
+  ;; RESULT.  Futures are touched only after the whole tree is walked,
+  ;; so the commands are no longer forced to run one at a time.
+  (define (leaf name stat result)
+    (if (string-suffix? ".scm" name)
+        (let* ((base-file-name (basename name ".scm"))
+               (cmd (format #f " ~a --settings ~a --output ~a --documentation ~a"
+                            name
+                            settings
+                            (string-append output "/" base-file-name ".ttl")
+                            (string-append documentation "/" base-file-name ".md"))))
+          (cons (future
+                 (begin
+                   (format #t "Running ~a~%" cmd)
+                   (cons cmd (system cmd))))
+                result))
+        result))

+  ;; Directory entry/exit and skipped directories leave RESULT untouched.
+  (define (down name stat result) result)
+  (define (up name stat result) result)
+  (define (skip name stat result) result)

+  ;; Ignore unreadable files/directories but warn the user.
+  (define (error name stat errno result)
+    (format (current-error-port) "warning: ~a: ~a~%"
+            name (strerror errno))
+    result)

+  ;; Wait for every command and warn about those exiting unsuccessfully.
+  (for-each (lambda (pending)
+              (let ((outcome (touch pending)))
+                (unless (eqv? 0 (status:exit-val (cdr outcome)))
+                  (format (current-error-port) "warning: command failed:~a~%" ; CMD starts with a space
+                          (car outcome)))))
+            (file-system-fold enter? leaf down up skip error
+                              '() "./examples")))   ; seed: no futures started yet
+
+