#! /usr/bin/env guile
!#
(use-modules (srfi srfi-1)
(srfi srfi-26)
(ice-9 match)
(ice-9 regex)
(dump strings)
(dump sql)
(dump triples)
(dump special-forms))
;; Connection settings: the first datum `read' from the file named by
;; the first command-line argument.
(define %connection-settings
  (let ((settings-file (list-ref (command-line) 1)))
    (call-with-input-file settings-file
      (lambda (port)
        (read port)))))

;; Directory for the dump output, taken verbatim from the second
;; command-line argument.
(define %dump-directory
  (let ((arguments (command-line)))
    (list-ref arguments 2)))
;; Return EMAIL with every space character removed.  One email ID in
;; the Investigators table has spaces in it; this function repairs it.
(define (fix-email-id email)
  (string-filter (lambda (ch)
                   (not (char=? ch #\space)))
                 email))
(define (investigator-attributes->id first-name last-name email)
  ;; Build the "investigator" identifier from FIRST-NAME, LAST-NAME and
  ;; EMAIL (with its spaces removed), joined by underscores.
  ;;
  ;; The email ID alone is not used because exactly one record, "Evan
  ;; Williams", has no email ID; including both names keeps that record
  ;; identifiable.  It would be preferable to find Evan Williams' email
  ;; ID and insert it into the database.
  (let* (;; Special case for Yohan Bossé, whose name has unprintable
         ;; characters in the database: use a plain spelling of the last
         ;; name whenever the first name is "Yohan".
         ;; TODO: Fix Yohan Bossé's name in the database.
         (id-last-name (if (string=? first-name "Yohan")
                           "Bosse"
                           last-name))
         (id-parts (list first-name id-last-name (fix-email-id email))))
    (string->identifier "investigator"
                        (string-join id-parts "_"))))
;; Dump one foaf:Person per row of the Investigators table, together
;; with the schema triples for the predicates used.
(define-dump dump-investigators
  ;; The table contains a few duplicate entries; grouping by email
  ;; collapses them.
  (tables (Investigators)
          "GROUP BY Email")
  (schema-triples
   ;; TODO: Are ranges required for FOAF predicates? Can they not be
   ;; obtained from the FOAF spec?
   (foaf:name rdfs:range rdfs:Literal)
   (foaf:givenName rdfs:range rdfs:Literal)
   (foaf:familyName rdfs:range rdfs:Literal)
   (foaf:homepage rdfs:range rdfs:Literal)
   (gn:address rdfs:range rdfs:Literal)
   (gn:city rdfs:range rdfs:Literal)
   (gn:state rdfs:range rdfs:Literal)
   (gn:zipCode rdfs:range rdfs:Literal)
   (gn:country rdfs:range rdfs:Literal))
  ;; The subject is the identifier built by
  ;; `investigator-attributes->id' from the name and email columns.
  (triples (investigator-attributes->id
            (field Investigators FirstName)
            (field Investigators LastName)
            (field Investigators Email))
    (set rdf:type 'foaf:Person)
    ;; Full name is "<first> <last>"; rows whose first name is "Yohan"
    ;; get the hard-coded last name "Bossé" instead of the stored one.
    (set foaf:name
         (string-join
          (list (field Investigators FirstName)
                (if (string=? (field Investigators FirstName) "Yohan")
                    "Bossé"
                    (field Investigators LastName)))
          " "))
    ;; Given and family names are passed through a latin1 -> BINARY ->
    ;; utf8 conversion in SQL before being emitted.
    (set foaf:givenName
         (field ("CAST(CONVERT(BINARY CONVERT(FirstName USING latin1) USING utf8) AS VARCHAR(100))" FirstName)))
    (set foaf:familyName
         (field ("CAST(CONVERT(BINARY CONVERT(LastName USING latin1) USING utf8) AS VARCHAR(100))" LastName)))
    (set foaf:homepage (field Investigators Url))
    ;; Postal details are emitted as stored.
    (set gn:address (field Investigators Address))
    (set gn:city (field Investigators City))
    (set gn:state (field Investigators State))
    (set gn:zipCode (field Investigators ZipCode))
    (set gn:country (field Investigators Country))))
;; Dump the metadata of every InfoFiles row that has a GN accession
;; ID, joined with the freeze, group, tissue, investigator, platform
;; and descriptive tables listed below.
(define-dump dump-info-files
  (tables (InfoFiles
           ;; The three *Freeze tables are matched on the info page
           ;; name; they determine rdf:type, gn:fullName and
           ;; dct:created below.
           (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId")
           ;; NOTE(review): no triple below reads a Species column;
           ;; confirm whether a gn:datasetOfSpecies triple is missing
           ;; or this join is unnecessary.
           (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId")
           (left-join Datasets "USING (DatasetId)")
           (left-join DatasetStatus "USING (DatasetStatusId)")
           (left-join Tissue "USING (TissueId)")
           (left-join Investigators "USING (InvestigatorId)")
           (left-join AvgMethod "USING (AvgMethodId)")
           (left-join Organizations "USING (OrganizationId)")
           (left-join GeneChip "USING (GeneChipId)"))
          "WHERE GN_AccesionId IS NOT NULL")
  (schema-triples
   (gn:dataset rdfs:range rdfs:Literal)
   (gn:datasetOfInvestigator rdfs:domain gn:dataset)
   (gn:datasetOfOrganization rdfs:domain gn:dataset)
   (gn:datasetOfInvestigator rdfs:range foaf:Person)
   (gn:datasetOfInbredSet rdfs:domain gn:dataset)
   (gn:datasetOfInbredSet rdfs:range gn:inbredSet)
   (gn:datasetOfSpecies rdfs:domain gn:dataset)
   ;; The range used to be gn:inbredSet, copied from the
   ;; gn:datasetOfInbredSet declaration above; a species predicate
   ;; must range over species.
   (gn:datasetOfSpecies rdfs:range gn:species)
   (gn:datasetOfTissue rdfs:domain gn:dataset)
   (gn:datasetOfTissue rdfs:range gn:tissue)
   (gn:normalization rdfs:domain gn:dataset)
   (gn:normalization rdfs:range gn:avgMethod)
   (gn:datasetOfPlatform rdfs:domain gn:dataset)
   (gn:datasetOfPlatform rdfs:range gn:geneChip)
   (gn:accessionId rdfs:range rdfs:Literal)
   (gn:datasetStatusName rdfs:range rdfs:Literal)
   (gn:summary rdfs:range rdfs:Literal)
   (gn:aboutTissue rdfs:range rdfs:Literal)
   (gn:geoSeries rdfs:range rdfs:Literal)
   (gn:name rdfs:range rdfs:Literal)
   (gn:title rdfs:range rdfs:Literal)
   (gn:publicationTitle rdfs:range rdfs:Literal)
   (gn:specifics rdfs:range rdfs:Literal)
   (gn:datasetGroup rdfs:range rdfs:Literal)
   (gn:aboutCases rdfs:range rdfs:Literal)
   (gn:aboutPlatform rdfs:range rdfs:Literal)
   (gn:aboutDataProcessing rdfs:range rdfs:Literal)
   (gn:notes rdfs:range rdfs:Literal)
   (gn:experimentDesign rdfs:range rdfs:Literal)
   (gn:contributors rdfs:range rdfs:Literal)
   (gn:citation rdfs:range rdfs:Literal)
   (gn:acknowledgment rdfs:range rdfs:Literal))
  ;; Subject: the info page name in the dataset: namespace, with every
  ;; character other than ASCII letters, digits and ":" replaced by "_".
  (triples (ontology 'dataset:
                     (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                               (field InfoFiles InfoPageName)
                                               'pre "_" 'post))
    ;; The type is chosen in SQL: gn:genotypeDataset when a GenoFreeze
    ;; row matched, else gn:phenotypeDataset when a PublishFreeze row
    ;; matched, else plain gn:dataset.
    (set rdf:type (string->symbol
                   (field ("IF(GenoFreeze.Id IS NOT NULL, 'gn:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gn:phenotypeDataset', 'gn:dataset'))"
                           rdfType))))
    ;; A value that is exactly "None" or "none" becomes the empty
    ;; string.  The pattern is anchored at both ends, so a match covers
    ;; the whole value and no 'pre/'post items are needed.  The same
    ;; idiom is used for the title, publication title, citation and
    ;; data source acknowledgment below.
    (set gn:name (regexp-substitute/global
                  #f "^[Nn]one$"
                  (field InfoFiles InfoPageName)
                  ""))
    (set gn:fullName
         (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
                 DatasetFullName)))
    ;; First non-NULL creation time among GenoFreeze, PublishFreeze and
    ;; ProbeSetFreeze, or the empty string.
    (set dct:created
         (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
                 createTimeGenoFreeze)))
    ;; Same identifier construction as the subject of the investigator
    ;; dump, so the two dumps refer to the same node.
    (set gn:datasetOfInvestigator
         (investigator-attributes->id (field Investigators FirstName)
                                      (field Investigators LastName)
                                      (field Investigators Email)))
    (set gn:datasetOfOrganization
         (field ("CAST(CONVERT(BINARY CONVERT(Organizations.OrganizationName USING latin1) USING utf8) AS VARCHAR(1500))" Organizations)))
    ;; Accession IDs are emitted as "GN" followed by the numeric ID.
    (set gn:accessionId (string-append "GN" (number->string
                                             (field InfoFiles GN_AccesionId))))
    (set gn:datasetStatusName (string-downcase
                               (field DatasetStatus DatasetStatusName)))
    (set gn:datasetOfInbredSet
         (string->identifier "inbredSet" (field InbredSet Name InbredSetName)))
    (set gn:datasetOfTissue (string->identifier "tissue"
                                                (field Tissue Short_Name)))
    (set gn:normalization
         (string->identifier "avgmethod"
                             ;; If AvgMethodName is NULL, assume N/A.
                             (if (string-blank? (field AvgMethod Name AvgMethodName))
                                 "N/A" (field AvgMethod Name AvgMethodName))))
    (set gn:datasetOfPlatform
         (string->identifier "platform"
                             (field GeneChip Name GeneChip)))
    (set gn:summary
         (sanitize-rdf-string (field Datasets Summary)))
    (set gn:aboutTissue
         (sanitize-rdf-string (field Datasets AboutTissue)))
    ;; Keep only the first "GSE<digits>" token of the GeoSeries column
    ;; and emit it in the geoSeries: namespace; emit the empty string
    ;; when the column holds no such token.
    (set gn:geoSeries
         (let ((s
                (string-match "GSE[0-9]*"
                              (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries)))))
           (if s (ontology
                  'geoSeries: (match:substring s))
               "")))
    (set gn:title
         (regexp-substitute/global
          #f "^[Nn]one$"
          (field InfoFiles InfoFileTitle)
          ""))
    (set gn:publicationTitle
         (regexp-substitute/global
          #f "^[Nn]one$"
          (field Datasets PublicationTitle)
          ""))
    (set gn:specifics (sanitize-rdf-string (field InfoFiles Specifics)))
    (set gn:datasetGroup (field Datasets DatasetName DatasetGroup))
    ;; The free-text columns below are passed through a latin1 ->
    ;; BINARY -> utf8 conversion in SQL (presumably to repair text
    ;; stored with the wrong encoding) and then through
    ;; `sanitize-rdf-string'.
    (set gn:aboutCases
         (sanitize-rdf-string
          (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutCases USING latin1) USING utf8) AS VARCHAR(10000))" AboutCases))))
    (set gn:aboutPlatform
         (sanitize-rdf-string
          (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutPlatform USING latin1) USING utf8) AS VARCHAR(1500))"
                  AboutPlatform))))
    (set gn:aboutDataProcessing
         (sanitize-rdf-string
          (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutDataProcessing USING latin1) USING utf8) AS VARCHAR(1500))"
                  AboutDataProcessing))))
    (set gn:notes
         (sanitize-rdf-string
          (field ("CAST(CONVERT(BINARY CONVERT(Datasets.Notes USING latin1) USING utf8) AS VARCHAR(1500))"
                  GNNotes))))
    (set gn:experimentDesign
         (sanitize-rdf-string
          (field ("CAST(CONVERT(BINARY CONVERT(Datasets.ExperimentDesign USING latin1) USING utf8) AS VARCHAR(1500))"
                  ExperimentDesign))))
    (set gn:contributors
         (sanitize-rdf-string
          (field ("CAST(CONVERT(BINARY CONVERT(Datasets.Contributors USING latin1) USING utf8) AS VARCHAR(1500))"
                  Contributors))))
    (set gn:citation
         (sanitize-rdf-string
          (regexp-substitute/global
           #f "^[Nn]one$"
           (field
            ("CAST(CONVERT(BINARY CONVERT(Datasets.Citation USING latin1) USING utf8) AS VARCHAR(1500))"
             Citation))
           "")))
    (set gn:dataSourceAcknowledgment
         (sanitize-rdf-string
          (string-trim-both
           (regexp-substitute/global
            #f "^[Nn]one$"
            (field ("CAST(CONVERT(BINARY CONVERT(InfoFiles.Data_Source_Acknowledge USING latin1) USING utf8) AS VARCHAR(1500))"
                    Data_Source_Acknowledge))
            ""))))
    (set gn:acknowledgment (sanitize-rdf-string
                            (field Datasets Acknowledgment)))))
;; Entry point: connect using %connection-settings and write the
;; info-file and investigator dumps to "dump-info-pages.ttl" inside
;; %dump-directory, encoded as UTF-8.
(call-with-target-database
 %connection-settings
 (lambda (db)
   ;; `in-vicinity' inserts a file name separator only when
   ;; %dump-directory does not already end in one, so the output lands
   ;; inside the directory whether or not the caller passed a trailing
   ;; slash.  Plain `string-append' produced "<dir>dump-info-pages.ttl"
   ;; for a directory given without one.
   (with-output-to-file (in-vicinity %dump-directory "dump-info-pages.ttl")
     (lambda ()
       ;; NOTE(review): every prefix below is bound to the empty
       ;; string.  Presumably the second argument should be the
       ;; namespace IRI for that prefix -- confirm against `prefix'
       ;; in the dump modules and fill these in.
       (prefix "dct:" "")
       (prefix "foaf:" "")
       (prefix "generif:" "")
       (prefix "geoSeries:" "")
       (prefix "gn:" "")
       (prefix "owl:" "")
       (prefix "phenotype:" "")
       (prefix "pubmed:" "")
       (prefix "rdf:" "")
       (prefix "rdfs:" "")
       (prefix "uniprot:" "")
       (prefix "up:" "")
       (prefix "xsd:" "")
       (prefix "probeset:" "")
       (prefix "dataset:" "")
       (newline)
       ;; Datasets are dumped first, then the investigators they refer
       ;; to.
       (dump-info-files db)
       (dump-investigators db))
     #:encoding "utf8")))