#! /usr/bin/env guile
!#
(use-modules (srfi srfi-1)
(srfi srfi-26)
(ice-9 match)
(ice-9 regex)
(dump strings)
(dump sql)
(dump triples)
(dump special-forms))
;; Connection settings for the dump. They are obtained by reading a
;; single datum (with `read') from the file named by the element at
;; index 1 of (command-line), i.e. the first argument after the
;; program name.
(define %connection-settings
  (let ((settings-file (list-ref (command-line) 1)))
    (call-with-input-file settings-file read)))
;; Return EMAIL with every space character removed. One email ID in
;; the Investigators table contains spaces; this function repairs it.
;; Only #\space is removed; other whitespace is left untouched.
(define (fix-email-id email)
  (string-delete (lambda (c) (char=? c #\space))
                 email))
(define (investigator-attributes->id first-name last-name email)
  ;; Build the "investigator" identifier from FIRST-NAME, LAST-NAME and
  ;; EMAIL (with spaces stripped by fix-email-id), joined with "_".
  ;;
  ;; The names take part in the ID, and not just the email ID, because
  ;; there is exactly one record ("Evan Williams") that has no email
  ;; ID. It would be preferable to find Evan Williams' email ID and
  ;; insert it into the database.
  ;;
  ;; Special case: when FIRST-NAME is "Yohan", the last name used in
  ;; the ID is the plain spelling "Bosse", because Yohan Bossé's name
  ;; has unprintable characters in the database.
  ;; TODO: Fix Yohan Bossé's name in the database.
  (let* ((id-last-name (if (string=? first-name "Yohan")
                           "Bosse"
                           last-name))
         (id-parts (list first-name id-last-name (fix-email-id email))))
    (string->identifier "investigator"
                        (string-join id-parts "_"))))
;; Dump description for the Investigators table. Each subject is
;; identified via investigator-attributes->id and typed foaf:Person;
;; name, homepage and postal-address columns are attached as
;; predicates.
(define-dump dump-investigators
;; There are a few duplicate entries. We group by email to
;; deduplicate.
(tables (Investigators)
"GROUP BY Email")
;; Range declarations for every predicate set in the triples clause
;; below.
(schema-triples
;; TODO: Are ranges required for FOAF predicates? Can they not be
;; obtained from the FOAF spec?
(foaf:name rdfs:range rdfs:Literal)
(foaf:givenName rdfs:range rdfs:Literal)
(foaf:familyName rdfs:range rdfs:Literal)
(foaf:homepage rdfs:range rdfs:Literal)
(gnt:address rdfs:range rdfs:Literal)
(gnt:city rdfs:range rdfs:Literal)
(gnt:state rdfs:range rdfs:Literal)
(gnt:zipCode rdfs:range rdfs:Literal)
(gnt:country rdfs:range rdfs:Literal))
;; Subject: identifier built from first name, last name and email.
(triples (investigator-attributes->id (field Investigators FirstName)
(field Investigators LastName)
(field Investigators Email))
(set rdf:type 'foaf:Person)
;; Full name is "<FirstName> <LastName>". When the first name is
;; "Yohan" the last name is hard-coded to "Bossé" instead of being
;; taken from the database (same special case as in
;; investigator-attributes->id, which uses "Bosse" for the ID).
(set foaf:name (string-append (field Investigators FirstName) " "
(if (string=? (field Investigators FirstName) "Yohan")
"Bossé"
(field Investigators LastName))))
;; The SQL expression converts the column to latin1, takes the result
;; as BINARY, converts that to utf8 and casts it to VARCHAR(100).
;; NOTE(review): presumably this repairs text stored with the wrong
;; encoding, and the symbol after the SQL string is presumably the
;; column alias -- confirm against the `field' special form.
(set foaf:givenName
(field ("CAST(CONVERT(BINARY CONVERT(FirstName USING latin1) USING utf8) AS VARCHAR(100))" FirstName)))
(set foaf:familyName
(field ("CAST(CONVERT(BINARY CONVERT(LastName USING latin1) USING utf8) AS VARCHAR(100))" LastName)))
;; Remaining predicates are copied from the corresponding columns.
(set foaf:homepage (field Investigators Url))
(set gnt:address (field Investigators Address))
(set gnt:city (field Investigators City))
(set gnt:state (field Investigators State))
(set gnt:zipCode (field Investigators ZipCode))
(set gnt:country (field Investigators Country))))
;; Dump description for dataset metadata held in InfoFiles and the
;; tables joined to it. Only rows whose GN_AccesionId is not NULL are
;; selected. Each subject is an identifier derived from
;; InfoFiles.InfoPageName and is typed gn:genotypeDataset,
;; gn:phenotypeDataset or gn:dataset depending on which freeze table
;; matched.
(define-dump dump-info-files
  (tables (InfoFiles
           ;; The three freeze tables are matched on the info page name.
           (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
           (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
           (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
           (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId")
           ;; NOTE(review): Species is joined but no triple below reads
           ;; a Species column or sets gnt:datasetOfSpecies -- confirm
           ;; whether a (set gnt:datasetOfSpecies ...) is missing.
           (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId")
           (left-join Datasets "USING (DatasetId)")
           (left-join DatasetStatus "USING (DatasetStatusId)")
           (left-join Tissue "USING (TissueId)")
           (left-join Investigators "USING (InvestigatorId)")
           (left-join AvgMethod "USING (AvgMethodId)")
           (left-join Organizations "USING (OrganizationId)")
           (left-join GeneChip "USING (GeneChipId)"))
          "WHERE GN_AccesionId IS NOT NULL")
  ;; Domain/range declarations for the predicates of this dump.
  (schema-triples
   (gnt:dataset rdfs:range rdfs:Literal)
   (gnt:datasetOfInvestigator rdfs:domain gn:dataset)
   (gnt:datasetOfOrganization rdfs:domain gn:dataset)
   (gnt:datasetOfInvestigator rdfs:range foaf:Person)
   (gnt:datasetOfInbredSet rdfs:domain gn:dataset)
   (gnt:datasetOfInbredSet rdfs:range gn:inbredSet)
   (gnt:datasetOfSpecies rdfs:domain gn:dataset)
   ;; The range of datasetOfSpecies is a species. It previously
   ;; repeated gn:inbredSet from the datasetOfInbredSet declaration
   ;; above (copy-paste slip).
   (gnt:datasetOfSpecies rdfs:range gn:species)
   (gnt:datasetOfTissue rdfs:domain gn:dataset)
   (gnt:datasetOfTissue rdfs:range gn:tissue)
   (gnt:normalization rdfs:domain gn:dataset)
   (gnt:normalization rdfs:range gn:avgMethod)
   (gnt:datasetOfPlatform rdfs:domain gn:dataset)
   (gnt:datasetOfPlatform rdfs:range gn:geneChip)
   (gnt:accessionId rdfs:range rdfs:Literal)
   (gnt:datasetStatusName rdfs:range rdfs:Literal)
   (gnt:summary rdfs:range rdfs:Literal)
   (gnt:aboutTissue rdfs:range rdfs:Literal)
   (gnt:geoSeries rdfs:range rdfs:Literal)
   (gnt:name rdfs:range rdfs:Literal)
   (gnt:title rdfs:range rdfs:Literal)
   (gnt:publicationTitle rdfs:range rdfs:Literal)
   (gnt:specifics rdfs:range rdfs:Literal)
   (gnt:datasetGroup rdfs:range rdfs:Literal)
   (gnt:aboutCases rdfs:range rdfs:Literal)
   (gnt:aboutPlatform rdfs:range rdfs:Literal)
   (gnt:aboutDataProcessing rdfs:range rdfs:Literal)
   (gnt:notes rdfs:range rdfs:Literal)
   (gnt:experimentDesign rdfs:range rdfs:Literal)
   (gnt:contributors rdfs:range rdfs:Literal)
   (gnt:citation rdfs:range rdfs:Literal)
   (gnt:acknowledgment rdfs:range rdfs:Literal))
  ;; Subject: InfoPageName with every character outside
  ;; [A-Za-z0-9:] replaced by "_", passed to string->identifier with
  ;; an empty prefix, an empty separator and string-capitalize-first
  ;; as the #:proc argument.
  (triples (string->identifier
            "" (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                         (field InfoFiles InfoPageName)
                                         'pre "_" 'post)
            #:separator ""
            #:proc string-capitalize-first)
    ;; The SQL IF picks the type name: a GenoFreeze match wins, then a
    ;; PublishFreeze match, otherwise the generic gn:dataset.
    (set rdf:type (string->symbol
                   (field ("IF(GenoFreeze.Id IS NOT NULL, 'gn:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gn:phenotypeDataset', 'gn:dataset'))"
                           rdfType))))
    ;; A value that is exactly "None" or "none" becomes the empty
    ;; string; any other value passes through unchanged. The same
    ;; idiom is used for the title, publication title, citation and
    ;; data source acknowledgment below.
    (set gnt:name (regexp-substitute/global
                   #f "^[Nn]one$"
                   (field InfoFiles InfoPageName)
                   ""))
    ;; First non-NULL of GenoFreeze.FullName and
    ;; PublishFreeze.FullName, else the empty string.
    (set gnt:fullName
         (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
                 DatasetFullName)))
    ;; First non-NULL CreateTime among GenoFreeze, PublishFreeze and
    ;; ProbeSetFreeze, else the empty string.
    (set dct:created
         (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
                 createTimeGenoFreeze)))
    ;; Same identifier construction as the subject of
    ;; dump-investigators, so the two dumps link up.
    (set gnt:datasetOfInvestigator
         (investigator-attributes->id (field Investigators FirstName)
                                      (field Investigators LastName)
                                      (field Investigators Email)))
    ;; The CAST(CONVERT(BINARY CONVERT(... USING latin1) USING utf8)
    ;; ...) expressions here and below convert the column to latin1,
    ;; take the result as BINARY and convert that to utf8.
    ;; NOTE(review): presumably this repairs text stored with the
    ;; wrong encoding -- confirm.
    (set gnt:datasetOfOrganization
         (field ("CAST(CONVERT(BINARY CONVERT(Organizations.OrganizationName USING latin1) USING utf8) AS VARCHAR(1500))" Organizations)))
    ;; Accession IDs are rendered as "GN" followed by GN_AccesionId.
    (set gnt:accessionId (format #f "GN~a" (field InfoFiles GN_AccesionId)))
    (set gnt:datasetStatusName (string-downcase
                                (field DatasetStatus DatasetStatusName)))
    (set gnt:datasetOfInbredSet
         (string->identifier "inbredSet" (field InbredSet Name InbredSetName)))
    (set gnt:datasetOfTissue (string->identifier "tissue"
                                                 (field Tissue Short_Name)))
    (set gnt:normalization
         (string->identifier "avgmethod"
                             ;; If AvgMethodName is NULL, assume N/A.
                             (if (string-blank? (field AvgMethod Name AvgMethodName))
                                 "N/A" (field AvgMethod Name AvgMethodName))))
    (set gnt:datasetOfPlatform
         (string->identifier "platform"
                             (field GeneChip Name GeneChip)))
    (set gnt:summary
         (sanitize-rdf-string (field Datasets Summary)))
    (set gnt:aboutTissue
         (sanitize-rdf-string (field Datasets AboutTissue)))
    ;; Use the first "GSE<digits>" substring of GeoSeries (NULL is
    ;; read as the empty string) as a geoSeries: term; when nothing
    ;; matches, the value is the empty string.
    ;; NOTE(review): "[0-9]*" also matches a bare "GSE" with no
    ;; digits -- confirm whether "[0-9]+" was intended.
    (set gnt:geoSeries
         (let ((s
                (string-match "GSE[0-9]*"
                              (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries)))))
           (if s (ontology
                  'geoSeries: (match:substring s))
               "")))
    (set gnt:title
         (regexp-substitute/global
          #f "^[Nn]one$"
          (field InfoFiles InfoFileTitle)
          ""))
    (set gnt:publicationTitle
         (regexp-substitute/global
          #f "^[Nn]one$"
          (field Datasets PublicationTitle)
          ""))
    (set gnt:specifics (sanitize-rdf-string (field InfoFiles Specifics)))
    (set gnt:datasetGroup (field Datasets DatasetName DatasetGroup))
    (set gnt:aboutCases
         (sanitize-rdf-string
          (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutCases USING latin1) USING utf8) AS VARCHAR(10000))" AboutCases))))
    (set gnt:aboutPlatform
         (sanitize-rdf-string
          (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutPlatform USING latin1) USING utf8) AS VARCHAR(1500))"
                  AboutPlatform))))
    (set gnt:aboutDataProcessing
         (sanitize-rdf-string
          (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutDataProcessing USING latin1) USING utf8) AS VARCHAR(1500))"
                  AboutDataProcessing))))
    (set gnt:notes
         (sanitize-rdf-string
          (field ("CAST(CONVERT(BINARY CONVERT(Datasets.Notes USING latin1) USING utf8) AS VARCHAR(1500))"
                  GNNotes))))
    (set gnt:experimentDesign
         (sanitize-rdf-string
          (field ("CAST(CONVERT(BINARY CONVERT(Datasets.ExperimentDesign USING latin1) USING utf8) AS VARCHAR(1500))"
                  ExperimentDesign))))
    (set gnt:contributors
         (sanitize-rdf-string
          (field ("CAST(CONVERT(BINARY CONVERT(Datasets.Contributors USING latin1) USING utf8) AS VARCHAR(1500))"
                  Contributors))))
    (set gnt:citation
         (sanitize-rdf-string
          (regexp-substitute/global
           #f "^[Nn]one$"
           (field
            ("CAST(CONVERT(BINARY CONVERT(Datasets.Citation USING latin1) USING utf8) AS VARCHAR(1500))"
             Citation))
           "")))
    ;; NOTE(review): gnt:dataSourceAcknowledgment, gnt:fullName and
    ;; dct:created have no entry in schema-triples above -- confirm
    ;; whether that is intentional.
    (set gnt:dataSourceAcknowledgment
         (sanitize-rdf-string
          (string-trim-both
           (regexp-substitute/global
            #f "^[Nn]one$"
            (field ("CAST(CONVERT(BINARY CONVERT(InfoFiles.Data_Source_Acknowledge USING latin1) USING utf8) AS VARCHAR(1500))"
                    Data_Source_Acknowledge))
            ""))))
    (set gnt:acknowledgment (sanitize-rdf-string
                             (field Datasets Acknowledgment)))))
;; Top-level invocation: hands the two dumps defined in this file to
;; dump-with-documentation together with the connection settings,
;; the prefix list and the output file paths.
(dump-with-documentation
(name "Info files / Investigators Metadata")
;; Settings read from the file named on the command line.
(connection %connection-settings)
(table-metadata? #f)
;; Each entry pairs a prefix with a string.
;; NOTE(review): every string here is empty -- presumably each should
;; hold the namespace IRI for its prefix; confirm the intended values.
;; NOTE(review): no triple in this file uses the "taxon:" prefix.
(prefixes
'(("foaf:" "")
("geoSeries:" "")
("gnt:" "")
("gn:" "")
("rdf:" "")
("rdfs:" "")
("taxon:" "")
("dct:" "")))
;; The dumps to run, in this order.
(inputs
(list dump-info-files
dump-investigators))
;; Paths of the generated documentation (Markdown) and RDF (Turtle)
;; files.
(outputs
'(#:documentation "./docs/dump-info-pages.md"
#:rdf "./verified-data/dump-info-pages.ttl")))