aboutsummaryrefslogtreecommitdiff
path: root/examples/generif-old.scm
diff options
context:
space:
mode:
Diffstat (limited to 'examples/generif-old.scm')
-rwxr-xr-xexamples/generif-old.scm241
1 files changed, 241 insertions, 0 deletions
diff --git a/examples/generif-old.scm b/examples/generif-old.scm
new file mode 100755
index 0000000..ede5a28
--- /dev/null
+++ b/examples/generif-old.scm
@@ -0,0 +1,241 @@
+#! /usr/bin/env guile
+!#
+
+(use-modules (srfi srfi-1)
+ (srfi srfi-26)
+ (rnrs bytevectors)
+ (ice-9 format)
+ (ice-9 getopt-long)
+ (ice-9 match)
+ (ice-9 regex)
+ (transform strings)
+ (transform sql)
+ (transform triples)
+ (transform special-forms))
+
+
+
+;; Return a copy of EMAIL in which every space character has been
+;; dropped; all other characters are kept in their original order.
+(define (fix-email-id email)
+  (let ((space? (lambda (ch) (char=? ch #\space))))
+    (string-delete space? email)))
+
+;; Build the "investigator" identifier for one investigator.
+;;
+;; The identifier is derived from FIRST-NAME, LAST-NAME and EMAIL
+;; (with spaces stripped from EMAIL by `fix-email-id'), joined by
+;; underscores and handed to `string->identifier'.
+;;
+;; The names take part in the identifier because of a single record,
+;; "Evan Williams", that has no email ID; the email alone would not
+;; identify that record.  Finding Evan Williams' email ID and storing
+;; it in the database would be the preferable fix.
+(define (investigator-attributes->id first-name last-name email)
+  (let ((cleaned-email (fix-email-id email)))
+    (string->identifier
+     "investigator"
+     (string-append first-name "_" last-name "_" cleaned-email))))
+
+
+
+;; Transformer that emits one labelled "symbol" resource per symbol
+;; found in the GeneRIF_BASIC table.  Rows are grouped with
+;; `GROUP BY BINARY symbol' (presumably MySQL, where BINARY makes the
+;; grouping byte-wise and therefore case-sensitive -- TODO confirm).
+(define-transformer genewiki-symbols
+ (tables (GeneRIF_BASIC)
+ "GROUP BY BINARY symbol")
+ (triples
+ ;; Subject: identifier with the "symbol" prefix.  Every character of
+ ;; the symbol outside [A-Za-z0-9:] is replaced by "_" first; #:proc
+ ;; is the identity function, so the sanitised text is passed on
+ ;; unchanged.
+ (string->identifier
+ "symbol"
+ (regexp-substitute/global #f "[^A-Za-z0-9:]"
+ (field GeneRIF_BASIC symbol)
+ 'pre "_" 'post)
+ #:proc (lambda (x) x))
+ ;; The label keeps the symbol exactly as stored in the table.
+ (set rdfs:label
+ (field GeneRIF_BASIC symbol))))
+
+;; Some symbols exist in the GeneRIF table that don't exist in the
+;; GeneRIF_BASIC table.  This transformer emits a labelled "symbol"
+;; resource for exactly those: it reads GeneRIF and keeps only rows
+;; whose symbol is NOT IN (SELECT symbol from GeneRIF_BASIC), grouped
+;; by `BINARY symbol'.
+(define-transformer generif-symbols
+ (tables (GeneRIF)
+ "WHERE symbol NOT IN (SELECT symbol from GeneRIF_BASIC) GROUP BY BINARY symbol")
+ (triples
+ ;; Subject: identifier with the "symbol" prefix.  Every character of
+ ;; the symbol outside [A-Za-z0-9:] is replaced by "_" first; #:proc
+ ;; is the identity function, so the sanitised text is passed on
+ ;; unchanged.
+ (string->identifier
+ "symbol"
+ (regexp-substitute/global #f "[^A-Za-z0-9:]"
+ (field GeneRIF symbol)
+ 'pre "_" 'post)
+ #:proc (lambda (x) x))
+ ;; The label keeps the symbol exactly as stored in the table.
+ (set rdfs:label
+ (field GeneRIF symbol))))
+
+;; Transformer for the GeneRIF entries that were entered from
+;; GeneNetwork.
+;;
+;; Rows come from GeneRIF left-joined with Species, GeneRIFXRef,
+;; GeneCategory and Investigators.  Only rows with display > 0,
+;; VersionId = 0 and a non-NULL comment are kept, grouped by comment
+;; and by `BINARY symbol'.
+;;
+;; For each row the "symbol" identifier receives an rdfs:comment whose
+;; object is a blank node ("[ ... ]") assembled as text.  The text is
+;; converted with `string->symbol' (presumably so that it is written
+;; out verbatim rather than as a quoted string -- TODO confirm).
+(define-transformer gn-genewiki-entries
+  (tables (GeneRIF
+           (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId")
+           (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
+           (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")
+           (left-join Investigators "ON Investigators.Email = GeneRIF.email"))
+          "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, BINARY GeneRIF.symbol")
+  (schema-triples
+   (gnc:GeneWikiEntry a rdfs:Class)
+   (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
+   (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork")
+   (gnt:geneSymbol rdfs:domain gnc:GNWikiEntry))
+  (triples
+      ;; Subject: "symbol" identifier; characters outside
+      ;; [A-Za-z0-9:] are replaced by "_" and #:proc is the identity.
+      (string->identifier
+       "symbol"
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]"
+        (field GeneRIF symbol)
+        'pre "_" 'post)
+       #:proc (lambda (x) x))
+    (set rdfs:comment
+         (let* ([generif-comment (sanitize-rdf-string (field GeneRIF comment))]
+                [create-time (field GeneRIF createtime EntryCreateTime)]
+                [pmid (field GeneRIF PubMed_ID PMID)]
+                [web-url (field GeneRIF weburl)]
+                [species (string->identifier
+                          ""
+                          (remap-species-identifiers (field Species Fullname))
+                          #:separator ""
+                          #:proc string-capitalize-first)]
+                ;; All category names of the entry arrive in a single
+                ;; column joined by "$$"; split them, drop duplicates,
+                ;; then drop #f and empty strings.
+                [categories
+                 (remove (lambda (x)
+                           (or (eq? x #f)
+                               (and (string? x)
+                                    (string-null? x))))
+                         (remove-duplicates
+                          (string-split-substring
+                           (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')"
+                                   GeneCategory))
+                           "$$")))])
+           (string->symbol
+            (string-append
+             "[ "
+             (format #f "rdf:type gnc:GNWikiEntry ; ")
+             ;; The species is emitted only when it is not a string
+             ;; (presumably `string->identifier' yields a non-string
+             ;; identifier on success -- TODO confirm).
+             (if (string? species)
+                 ""
+                 (format #f "gnt:belongsToSpecies ~a ; "
+                         species))
+             (format #f "rdfs:comment ~s^^xsd:string ; "
+                     generif-comment)
+             ;; A string create-time is skipped; anything else is
+             ;; formatted by `time-unix->string'.  The datatype is
+             ;; spelled xsd:dateTime: XML Schema datatype names are
+             ;; case-sensitive and "xsd:datetime" does not exist.
+             (if (string? create-time)
+                 ""
+                 (format #f "dct:created ~s^^xsd:dateTime ; "
+                         (time-unix->string
+                          create-time "~5")))
+             ;; PMID holds space-separated PubMed IDs; one
+             ;; dct:references statement is written per ID.
+             (if (and (string? pmid) (not (string-null? pmid)))
+                 (format #f
+                         "~{dct:references pubmed:~a ; ~}"
+                         (string-split pmid #\space))
+                 "")
+             ;; A creator is written only when the entry's email is
+             ;; non-blank and the joined Investigators row has a
+             ;; non-empty email.
+             ;; NOTE(review): both tests call string procedures on the
+             ;; field values directly -- confirm that `field' never
+             ;; yields #f for a NULL column here.
+             (if (and (not (string-null?
+                            (string-trim-both (field GeneRIF email))))
+                      (not (string-null? (field Investigators Email))))
+                 (format #f "dct:creator ~a ; "
+                         (investigator-attributes->id
+                          (field Investigators FirstName)
+                          (field Investigators LastName)
+                          (field Investigators Email)))
+                 "")
+             (if (not (null? categories))
+                 (format #f
+                         "~{gnt:belongsToCategory ~s ; ~}"
+                         categories)
+                 "")
+             (if (and (string? web-url) (not (string-null? web-url)))
+                 (format #f "foaf:homepage ~s ; "
+                         web-url)
+                 "")
+             " ] "))))))
+
+;; Transformer for the GeneRIF entries obtained from NCBI.
+;;
+;; Rows come from GeneRIF_BASIC left-joined with Species on
+;; SpeciesId.  Rows with a NULL or blank comment, or a blank symbol,
+;; are excluded; the rest is grouped by comment, createtime,
+;; VersionId, SpeciesId and TaxID.
+;;
+;; For each row the "symbol" identifier receives an rdfs:comment whose
+;; object is a blank node ("[ ... ]") assembled as text.  The text is
+;; converted with `string->symbol' (presumably so that it is written
+;; out verbatim rather than as a quoted string -- TODO confirm).
+(define-transformer ncbi-genewiki-entries
+  (tables (GeneRIF_BASIC
+           (left-join Species "USING (SpeciesId)"))
+          "WHERE GeneRIF_BASIC.comment IS NOT NULL AND TRIM(GeneRIF_BASIC.comment) != '' AND TRIM(GeneRIF_BASIC.symbol) != '' GROUP BY GeneRIF_BASIC.comment, GeneRIF_BASIC.createtime, GeneRIF_BASIC.VersionId, GeneRIF_BASIC.SpeciesId, GeneRIF_BASIC.TaxID")
+  (schema-triples
+   (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
+   (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI")
+   ;; gnt:hasVersionId is written below with an xsd:integer literal as
+   ;; its value, so it is declared as a datatype property; an object
+   ;; property would relate two resources instead.
+   (gnt:hasVersionId a owl:DatatypeProperty)
+   (gnt:hasVersionId rdfs:domain gnc:NCBIWikiEntry)
+   (gnt:hasVersionId skos:definition "The VersionId of this resource"))
+  (triples
+      ;; Subject: "symbol" identifier; characters outside
+      ;; [A-Za-z0-9:] are replaced by "_" and #:proc is the identity.
+      (string->identifier
+       "symbol"
+       (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                 (field GeneRIF_BASIC symbol GeneRIFSymbol)
+                                 'pre "_" 'post)
+       #:proc (lambda (x) x))
+    (set rdfs:comment
+         (let ([ncbi-comment (sanitize-rdf-string (field GeneRIF_BASIC comment))]
+               [species-name
+                (string->identifier
+                 ""
+                 (remap-species-identifiers (field Species Fullname SpeciesFullName))
+                 #:separator ""
+                 #:proc string-capitalize-first)]
+               [taxonomic-id (field GeneRIF_BASIC TaxID TaxonomicId)]
+               [create-time (field GeneRIF_BASIC createtime EntryCreateTime)]
+               [pmid (field GeneRIF_BASIC PubMed_ID PMID)]
+               [gene-id (field GeneRIF_BASIC GeneId)]
+               [version-id (field GeneRIF_BASIC VersionId)])
+           (string->symbol
+            (string-append
+             "[ "
+             (format #f "rdf:type gnc:NCBIWikiEntry ; ")
+             (format #f "rdfs:comment ~s^^xsd:string ; "
+                     ncbi-comment)
+             (format #f "gnt:belongsToSpecies ~a ; "
+                     species-name)
+             ;; The taxon notation is left out when TaxID is #f.
+             (if (eq? #f taxonomic-id)
+                 ""
+                 (format #f "skos:notation taxon:~a ; "
+                         taxonomic-id))
+             (format #f "gnt:hasGeneId generif:~a ; "
+                     gene-id)
+             (format #f "gnt:hasVersionId '~a'^^xsd:integer ; "
+                     version-id)
+             ;; PMID holds space-separated PubMed IDs; one
+             ;; dct:references statement is written per ID.
+             (if (and (string? pmid) (not (string-null? pmid)))
+                 (format #f
+                         "~{dct:references pubmed:~a ; ~}"
+                         (string-split pmid #\space))
+                 "")
+             ;; A string create-time is skipped; anything else is
+             ;; formatted by `time-unix->string'.  The datatype is
+             ;; spelled xsd:dateTime: XML Schema datatype names are
+             ;; case-sensitive and "xsd:datetime" does not exist.
+             (if (string? create-time)
+                 ""
+                 (format #f "dct:created ~s^^xsd:dateTime ; "
+                         (time-unix->string
+                          create-time "~5")))
+             " ]"))))))
+
+
+
+;; Script entry point.
+;;
+;; Command-line options:
+;;   -s, --settings FILE        file whose first datum is `read' and
+;;                              used as the connection settings
+;;                              (mandatory)
+;;   -o, --output FILE          passed on as the #:rdf output
+;;   -d, --documentation FILE   passed on as the #:documentation output
+;;
+;; The four transformers defined in this file are then run through
+;; `with-documentation' with the prefix table below.
+(let* ((option-spec
+        ;; --settings is marked as required so that getopt-long
+        ;; reports a missing option itself; previously the #f default
+        ;; reached `call-with-input-file' and failed with a wrong-type
+        ;; error that did not mention the option.
+        '((settings (single-char #\s) (value #t) (required? #t))
+          (output (single-char #\o) (value #t))
+          (documentation (single-char #\d) (value #t))))
+       (options (getopt-long (command-line) option-spec))
+       (settings (option-ref options 'settings #f))
+       (output (option-ref options 'output #f))
+       (documentation (option-ref options 'documentation #f))
+       (%connection-settings
+        (call-with-input-file settings
+          read)))
+
+  (with-documentation
+   (name "GeneRIF Metadata")
+   (connection %connection-settings)
+   (table-metadata? #f)
+   (prefixes
+    '(("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+      ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
+      ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
+      ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>")
+      ("gn:" "<http://genenetwork.org/id/>")
+      ("gnc:" "<http://genenetwork.org/category/>")
+      ("gnt:" "<http://genenetwork.org/term/>")
+      ("dct:" "<http://purl.org/dc/terms/>")
+      ("foaf:" "<http://xmlns.com/foaf/0.1/>")
+      ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
+      ("taxon:" "<https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=>")
+      ("generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
+      ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
+      ("owl:" "<http://www.w3.org/2002/07/owl#>")))
+   (inputs
+    (list
+     genewiki-symbols
+     generif-symbols
+     gn-genewiki-entries
+     ncbi-genewiki-entries))
+   (outputs
+    `(#:documentation ,documentation
+      #:rdf ,output))))