aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArun Isaac2021-12-16 15:49:47 +0530
committerArun Isaac2021-12-16 16:21:38 +0530
commit61d3cd3d9c51af80f5f0f8f92b42da56d6398a71 (patch)
tree39e42bbf0b409057fe62205a9432677c53163b8a
parenta182c3d3702f1d65ed36dbb3c747d32ffb1f2791 (diff)
downloadgn-transform-databases-61d3cd3d9c51af80f5f0f8f92b42da56d6398a71.tar.gz
Make define-dump syntax more concise.
* dump.scm (field->key, field->assoc-ref, collect-fields): New functions. (define-dump): Redefine with more concise syntax. * dump.scm (dump-species, dump-strain, dump-mapping-method, dump-inbred-set, dump-phenotype, dump-publication, dump-publish-xref, dump-tissue, dump-investigators, dump-avg-method, dump-gene-chip, dump-info-files): Use new define-dump syntax. (default-metadata-proc): Delete function. * .dir-locals.el (scheme-mode): Indent triples form correctly.
-rw-r--r--.dir-locals.el3
-rwxr-xr-xdump.scm456
2 files changed, 210 insertions, 249 deletions
diff --git a/.dir-locals.el b/.dir-locals.el
index f0a8de9..2fe8b32 100644
--- a/.dir-locals.el
+++ b/.dir-locals.el
@@ -5,4 +5,5 @@
(indent-tabs-mode))
(scheme-mode
(eval put 'map-alist 'scheme-indent-function 1)
- (eval put 'set-table-columns 'scheme-indent-function 1)))
+ (eval put 'set-table-columns 'scheme-indent-function 1)
+ (eval put 'triples 'scheme-indent-function 1)))
diff --git a/dump.scm b/dump.scm
index 51abdeb..93b8c7f 100755
--- a/dump.scm
+++ b/dump.scm
@@ -143,72 +143,91 @@ characters with an underscore and prefixing with gn:PREFIX."
(triple id predicate object))))
alist))
-(define default-metadata-proc
- (match-lambda
- ((key . value)
- (cons (string->symbol
- (string-append "gn:" (snake->lower-camel key)))
- value))
- (x (error "malformed alist element" x))))
-
(define (triple subject predicate object)
(format #t "~a ~a ~s .~%" subject predicate object))
+(define (field->key x)
+ (translate-forms 'field
+ (lambda (x)
+ #`(key #,(symbol->string
+ (syntax->datum
+ ((syntax-rules (field)
+ ((field table column) column)
+ ((field table column alias) alias))
+ x)))))
+ x))
+
+(define (field->assoc-ref alist x)
+ "Recursively translate field references in source X to (assoc-ref
+ALIST field-name) forms."
+ (translate-forms 'field
+ (lambda (x)
+ #`(assoc-ref #,alist
+ #,(symbol->string
+ (syntax->datum
+ ((syntax-rules (field)
+ ((field table column) column)
+ ((field table column alias) alias))
+ x)))))
+ x))
+
+(define (collect-fields x)
+ (map (syntax-rules (field)
+ ((field reference ...)
+ (reference ...)))
+ (collect-forms 'field x)))
+
(define %dumped '())
(define-syntax define-dump
(lambda (x)
- (syntax-case x (select-query)
- ((_ name (select-query (fields ...) tables raw-forms ...) proc)
- #`(begin
- (set! %dumped
- (append (list #,@(filter-map (lambda (field)
- (syntax-case field (distinct)
- (distinct #f)
- ((table column _ ...) #'(cons 'table 'column))
- (field-spec (error "Invalid field specification" #'field-spec))))
- #'(fields ...)))
- %dumped))
- (define (name db)
- (sql-for-each proc db (select-query (fields ...) tables raw-forms ...))))))))
+ (syntax-case x (tables triples)
+ ((_ name
+ (tables tables-spec raw ...)
+ (triples subject predicates ...))
+ (let ((fields (collect-fields #'(subject predicates ...))))
+ #`(begin
+ (set! %dumped
+ (append (list #,@(filter-map (lambda (field)
+ (syntax-case field (distinct)
+ (distinct #f)
+ ((table column _ ...) #'(cons 'table 'column))
+ (field-spec (error "Invalid field specification" #'field-spec))))
+ fields))
+ %dumped))
+ (define (name db)
+ (sql-for-each
+ (lambda (row)
+ (scm->triples
+ (map-alist row #,@(field->key #'(predicates ...)))
+ #,(field->assoc-ref #'row #'subject)))
+ db
+ (select-query #,fields tables-spec raw ...))))))
+ (_ (error "Invalid define-dump syntax:" (syntax->datum x))))))
(define binomial-name->species-id
(cut string->identifier "species" <>))
(define-dump dump-species
- (select-query ((Species SpeciesName)
- (Species MenuName)
- (Species FullName))
- (Species))
- (lambda (row)
- (scm->triples (map-alist row
- (set rdf:type 'gn:species)
- ;; Common name
- (set gn:name (key "SpeciesName"))
- ;; Menu name (TODO: Maybe, drop this field. It can
- ;; be inferred from the common name.)
- (set gn:menuName (key "MenuName"))
- (set gn:binomialName (key "FullName")))
- (binomial-name->species-id (assoc-ref row "FullName")))))
+ (tables (Species))
+ (triples (binomial-name->species-id (field Species FullName))
+ (set rdf:type 'gn:species)
+ (set gn:name (field Species SpeciesName))
+ (set gn:menuName (field Species MenuName))
+ (set gn:binomialName (field Species FullName))))
(define-dump dump-strain
- (select-query ((Species FullName)
- (Strain Name)
- (Strain Name2)
- (Strain Symbol)
- (Strain Alias))
- (Strain
- (join Species "ON Strain.SpeciesId = Species.SpeciesId")))
- (lambda (row)
- (scm->triples (map-alist row
- (set rdf:type 'gn:strain)
- (set gn:strainOfSpecies
- (binomial-name->species-id (key "FullName")))
- ;; Name, and maybe a second name
- (set gn:name (key "Name"))
- (set gn:name (key "Name2"))
- (set gn:alias (key "Alias")))
- (string->identifier "strain" (assoc-ref row "Name")))))
+ (tables (Strain
+ (join Species "ON Strain.SpeciesId = Species.SpeciesId")))
+ (triples (string->identifier "strain" (field Strain Name))
+ (set rdf:type 'gn:strain)
+ (set gn:strainOfSpecies
+ (binomial-name->species-id (field Species FullName)))
+ ;; Name, and maybe a second name
+ (set gn:name (field Strain Name))
+ (set gn:name (field Strain Name2))
+ (set gn:alias (field Strain Alias))
+ (set gn:symbol (field Strain Symbol))))
;; TODO: This function is unused. Remove if not required.
(define mapping-method-name->id
@@ -216,93 +235,69 @@ characters with an underscore and prefixing with gn:PREFIX."
;; TODO: This function is unused. Remove if not required.
(define-dump dump-mapping-method
- (select-query ((MappingMethod Name))
- (MappingMethod))
- (lambda (row)
- (scm->triples (map-alist row
- (set rdf:type 'gn:mappingMethod))
- (string->identifier "mappingMethod" (assoc-ref row "Name")))))
+ (tables (MappingMethod))
+ (triples (string->identifier "mappingMethod" (field MappingMethod Name))
+ (set rdf:type 'gn:mappingMethod)))
(define inbred-set-name->id
(cut string->identifier "inbredSet" <>))
(define-dump dump-inbred-set
- (select-query ((InbredSet Name)
- (InbredSet FullName)
- (InbredSet GeneticType)
- (InbredSet Family)
- (Species FullName BinomialName))
- (InbredSet
- (inner-join Species "USING (SpeciesId)")))
- (lambda (row)
- (scm->triples (map-alist row
- (set rdf:type 'gn:phenotype)
- (set gn:inbredSetOfSpecies
- (binomial-name->species-id (key "BinomialName")))
- (else=> default-metadata-proc))
- (inbred-set-name->id (assoc-ref row "Name")))))
+ (tables (InbredSet
+ (inner-join Species "USING (SpeciesId)")))
+ (triples (inbred-set-name->id (field InbredSet Name))
+ (set rdf:type 'gn:inbredSet)
+ (set gn:fullName (field InbredSet FullName))
+ (set gn:geneticType (field InbredSet GeneticType))
+ (set gn:family (field InbredSet Family))
+ (set gn:inbredSetOfSpecies
+ (binomial-name->species-id (field Species FullName BinomialName)))))
(define (phenotype-id->id id)
(string->identifier "phenotype" (number->string id)))
(define-dump dump-phenotype
- (select-query ((Phenotype Id)
- (Phenotype Pre_publication_description)
- (Phenotype Post_publication_description)
- (Phenotype Original_description)
- (Phenotype Units)
- (Phenotype Pre_publication_abbreviation)
- (Phenotype Post_publication_abbreviation)
- (Phenotype Lab_code)
- (Phenotype Submitter)
- (Phenotype Owner)
- (Phenotype Authorized_Users))
- (Phenotype))
- (lambda (row)
- (scm->triples (map-alist row
- (delete "Id")
- (set rdf:type 'gn:phenotype)
- (set gn:units (and (string-ci=? (key "Units") "unknown")
- (key "Units")))
- (else=> default-metadata-proc))
- (phenotype-id->id (assoc-ref row "Id")))))
+ (tables (Phenotype))
+ (triples (phenotype-id->id (field Phenotype Id))
+ (set rdf:type 'gn:phenotype)
+ (set gn:prePublicationDescription (field Phenotype Pre_publication_description))
+ (set gn:postPublicationDescription (field Phenotype Post_publication_description))
+ (set gn:originalDescription (field Phenotype Original_description))
+ (set gn:prePublicationDescription (field Phenotype Pre_publication_abbreviation))
+ (set gn:postPublicationDescription (field Phenotype Post_publication_abbreviation))
+ (set gn:labCode (field Phenotype Lab_code))
+ (set gn:submitter (field Phenotype Submitter))
+ (set gn:owner (field Phenotype Owner))
+ (set gn:authorizedUsers (field Phenotype Authorized_Users))
+ (set gn:units (and (string-ci=? (field Phenotype Units) "unknown")
+ (field Phenotype Units)))))
(define-dump dump-publication
- (select-query ((Publication Id)
- (Publication PubMed_ID)
- (Publication Abstract)
- (Publication Authors)
- (Publication Title)
- (Publication Journal)
- (Publication Volume)
- (Publication Pages)
- (Publication Month)
- (Publication Year))
- (Publication))
- (lambda (row)
- (scm->triples (map-alist row
- (delete "Id")
- (set rdf:type 'gn:publication)
- (multiset gn:authors
- ;; The authors field is a comma
- ;; separated list. Split it.
- (map string-trim (string-split (key "Authors") #\,)))
- (set gn:abstract
- ;; TODO: Why are there unprintable characters?
- (delete-substrings (key "Abstract") "\x01"))
- (else=> default-metadata-proc))
- (string->identifier "publication"
- (number->string (assoc-ref row "Id"))))))
+ (tables (Publication))
+ (triples (string->identifier "publication"
+ (number->string (field Publication Id)))
+ (set rdf:type 'gn:publication)
+ (set gn:pubMedId (field Publication PubMed_ID))
+ (set gn:title (field Publication Title))
+ (set gn:journal (field Publication Journal))
+ (set gn:volume (field Publication Volume))
+ (set gn:pages (field Publication Pages))
+ (set gn:month (field Publication Month))
+ (set gn:year (field Publication Year))
+ (multiset gn:authors
+ ;; The authors field is a comma
+ ;; separated list. Split it.
+ (map string-trim (string-split (field Publication Authors) #\,)))
+ (set gn:abstract
+ ;; TODO: Why are there unprintable characters?
+ (delete-substrings (field Publication Abstract)
+ "\x01"))))
(define-dump dump-publish-xref
- (select-query ((InbredSet Name)
- (PublishXRef PhenotypeId))
- (PublishXRef
- (inner-join InbredSet "USING (InbredSetId)")))
- (lambda (row)
- (scm->triples (map-alist row
- (set gn:phenotypeOfSpecies (inbred-set-name->id (key "Name"))))
- (phenotype-id->id (assoc-ref row "PhenotypeId")))))
+ (tables (PublishXRef
+ (inner-join InbredSet "USING (InbredSetId)")))
+ (triples (phenotype-id->id (field PublishXRef PhenotypeId))
+ (set gn:phenotypeOfSpecies (inbred-set-name->id (field InbredSet Name)))))
(define tissue-short-name->id
(cut string->identifier "tissue" <>))
@@ -310,18 +305,12 @@ characters with an underscore and prefixing with gn:PREFIX."
(define-dump dump-tissue
;; The Name and TissueName fields seem to be identical. BIRN_lex_ID
;; and BIRN_lex_Name are mostly NULL.
- (select-query ((Tissue Name)
- (Tissue Short_Name))
- (Tissue))
- (lambda (row)
- (scm->triples (map-alist row
- (delete "Short_Name")
- (set rdf:type 'gn:tissue)
- (set gn:name (key "Name")))
- ;; Hopefully the Short_Name field is
- ;; distinct and can be used as an
- ;; identifier.
- (tissue-short-name->id (assoc-ref row "Short_Name")))))
+ (tables (Tissue))
+ ;; Hopefully the Short_Name field is distinct and can be used as an
+ ;; identifier.
+ (triples (tissue-short-name->id (field Tissue Short_Name))
+ (set rdf:type 'gn:tissue)
+ (set gn:name (field Tissue Name))))
;; One email ID in the Investigators table has spaces in it. This
;; function fixes that.
@@ -341,31 +330,23 @@ characters with an underscore and prefixing with gn:PREFIX."
(define-dump dump-investigators
;; There are a few duplicate entries. We group by email to
;; deduplicate.
- (select-query ((Investigators FirstName)
- (Investigators LastName)
- (Investigators Address)
- (Investigators City)
- (Investigators State)
- (Investigators ZipCode)
- (Investigators Phone)
- (Investigators Email)
- (Investigators Country)
- (Investigators Url))
- (Investigators)
- "GROUP BY Email")
- (lambda (row)
- (scm->triples (map-alist row
- (set rdf:type 'foaf:Person)
- (set foaf:name (string-append (key "FirstName") " " (key "LastName")))
- (set foaf:givenName (key "FirstName"))
- (set foaf:familyName (key "LastName"))
- (set foaf:phone (key "Phone"))
- (set foaf:mbox (fix-email-id (key "Email")))
- (set foaf:homepage (key "Url"))
- (else=> default-metadata-proc))
- (investigator-attributes->id (assoc-ref row "FirstName")
- (assoc-ref row "LastName")
- (assoc-ref row "Email")))))
+ (tables (Investigators)
+ "GROUP BY Email")
+ (triples (investigator-attributes->id (field Investigators FirstName)
+ (field Investigators LastName)
+ (field Investigators Email))
+ (set rdf:type 'foaf:Person)
+ (set foaf:name (string-append (field Investigators FirstName) " " (field Investigators LastName)))
+ (set foaf:givenName (field Investigators FirstName))
+ (set foaf:familyName (field Investigators LastName))
+ (set foaf:phone (field Investigators Phone))
+ (set foaf:mbox (fix-email-id (field Investigators Email)))
+ (set foaf:homepage (field Investigators Url))
+ (set gn:address (field Investigators Address))
+ (set gn:city (field Investigators City))
+ (set gn:state (field Investigators State))
+ (set gn:zipCode (field Investigators ZipCode))
+ (set gn:country (field Investigators Country))))
(define avg-method-name->id
(cut string->identifier "avgmethod" <>))
@@ -375,99 +356,78 @@ characters with an underscore and prefixing with gn:PREFIX."
;; the Name field.
;;
;; There are two records with Name as "N/A". Deduplicate.
- (select-query (distinct (AvgMethod Name))
- (AvgMethod))
- (lambda (row)
- (scm->triples (map-alist row
- (set rdf:type 'gn:avgMethod)
- (set gn:name (key "Name")))
- (avg-method-name->id (assoc-ref row "Name")))))
+ (tables (AvgMethod)
+ "GROUP BY Name")
+ (triples (avg-method-name->id (field AvgMethod Name))
+ (set rdf:type 'gn:avgMethod)
+ (set gn:name (field AvgMethod Name))))
(define gene-chip-name->id
(cut string->identifier "platform" <>))
(define-dump dump-gene-chip
- (select-query ((GeneChip GeneChipName)
- (GeneChip Name))
- (GeneChip))
- (lambda (row)
- (scm->triples (map-alist row
- (delete "Name")
- (set rdf:type 'gn:platform)
- (set gn:name (key "GeneChipName")))
- (gene-chip-name->id (assoc-ref row "Name")))))
+ (tables (GeneChip))
+ (triples (gene-chip-name->id (field GeneChip Name))
+ (set rdf:type 'gn:platform)
+ (set gn:name (field GeneChip GeneChipName))))
+;; TODO: Double check Platforms. It doesn't seem to match up.
(define-dump dump-info-files
- ;; TODO: Double check Platforms. It doesn't seem to match up.
- (select-query ((InfoFiles GN_AccesionId)
- (InfoFiles InfoFileTitle Name)
- (InfoFiles Title)
- (InfoFiles Specifics)
- (DatasetStatus DatasetStatusName)
- (Datasets DatasetName DatasetGroup)
- (Datasets Summary)
- (Datasets GeoSeries)
- (Datasets AboutCases)
- (Datasets AboutPlatform)
- (Datasets AboutTissue)
- (Datasets AboutDataProcessing)
- (Datasets Notes)
- (Datasets ExperimentDesign)
- (Datasets Contributors)
- (Datasets Citation)
- (Datasets Acknowledgment)
- (Species FullName BinomialName)
- (InbredSet Name InbredSetName)
- (Tissue Short_Name)
- (Investigators FirstName)
- (Investigators LastName)
- (Investigators Email)
- (AvgMethod Name AvgMethodName)
- (GeneChip Name GeneChip))
- (InfoFiles
- (left-join Datasets "USING (DatasetId)")
- (left-join DatasetStatus "USING (DatasetStatusId)")
- (left-join Species "USING (SpeciesId)")
- (left-join InbredSet "USING (InbredSetId)")
- (left-join Tissue "USING (TissueId)")
- (left-join Investigators "USING (InvestigatorId)")
- (left-join AvgMethod "USING (AvgMethodId)")
- (left-join GeneChip "USING (GeneChipId)"))
- "WHERE GN_AccesionId IS NOT NULL")
- (lambda (row)
- (scm->triples
- (map-alist row
- (set rdf:type 'gn:dataset)
- (set gn:datasetOfInvestigator
- (investigator-attributes->id (key "FirstName")
- (key "LastName")
- (key "Email")))
- (set gn:accessionId (string-append "GN" (number->string (key "GN_AccesionId"))))
- (set gn:datasetStatusName (string-downcase (key "DatasetStatusName")))
- (set gn:datasetOfSpecies (binomial-name->species-id (key "BinomialName")))
- (set gn:datasetOfInbredSet (inbred-set-name->id (key "InbredSetName")))
- (set gn:datasetOfTissue (tissue-short-name->id (key "Short_Name")))
- (set gn:normalization
- (avg-method-name->id
- ;; If AvgMethodName is NULL, assume N/A.
- (if (string-blank? (key "AvgMethodName"))
- "N/A" (key "AvgMethodName"))))
- (set gn:datasetOfPlatform (gene-chip-name->id (key "GeneChip")))
- (set gn:summary
- ;; TODO: Why are there unprintable characters?
- (delete-substrings (key "Summary")
- "\x01" "\x03"))
- (set gn:aboutTissue
- ;; TODO: Why are there unprintable characters?
- (delete-substrings (key "AboutTissue")
- "\x01" "\x03"))
- (set gn:geoSeries
- (and (not (string-prefix-ci? "no geo series"
- (key "GeoSeries")))
- (key "GeoSeries")))
- (else=> default-metadata-proc))
- (string->identifier "dataset"
- (number->string (assoc-ref row "GN_AccesionId"))))))
+ (tables (InfoFiles
+ (left-join Datasets "USING (DatasetId)")
+ (left-join DatasetStatus "USING (DatasetStatusId)")
+ (left-join Species "USING (SpeciesId)")
+ (left-join InbredSet "USING (InbredSetId)")
+ (left-join Tissue "USING (TissueId)")
+ (left-join Investigators "USING (InvestigatorId)")
+ (left-join AvgMethod "USING (AvgMethodId)")
+ (left-join GeneChip "USING (GeneChipId)"))
+ "WHERE GN_AccesionId IS NOT NULL")
+ (triples (string->identifier "dataset"
+ (number->string (field InfoFiles GN_AccesionId)))
+ (set rdf:type 'gn:dataset)
+ (set gn:datasetOfInvestigator
+ (investigator-attributes->id (field Investigators FirstName)
+ (field Investigators LastName)
+ (field Investigators Email)))
+ (set gn:accessionId (string-append "GN" (number->string
+ (field InfoFiles GN_AccesionId))))
+ (set gn:datasetStatusName (string-downcase
+ (field DatasetStatus DatasetStatusName)))
+ (set gn:datasetOfSpecies (binomial-name->species-id
+ (field Species FullName BinomialName)))
+ (set gn:datasetOfInbredSet (inbred-set-name->id (field InbredSet Name InbredSetName)))
+ (set gn:datasetOfTissue (tissue-short-name->id (field Tissue Short_Name)))
+ (set gn:normalization
+ (avg-method-name->id
+ ;; If AvgMethodName is NULL, assume N/A.
+ (if (string-blank? (field AvgMethod Name AvgMethodName))
+ "N/A" (field AvgMethod Name AvgMethodName))))
+ (set gn:datasetOfPlatform (gene-chip-name->id (field GeneChip Name GeneChip)))
+ (set gn:summary
+ ;; TODO: Why are there unprintable characters?
+ (delete-substrings (field Datasets Summary)
+ "\x01" "\x03"))
+ (set gn:aboutTissue
+ ;; TODO: Why are there unprintable characters?
+ (delete-substrings (field Datasets AboutTissue)
+ "\x01" "\x03"))
+ (set gn:geoSeries
+ (and (not (string-prefix-ci? "no geo series"
+ (field Datasets GeoSeries)))
+ (field Datasets GeoSeries)))
+ (set gn:name (field InfoFiles InfoFileTitle Name))
+ (set gn:title (field InfoFiles Title))
+ (set gn:specifics (field InfoFiles Specifics))
+ (set gn:datasetGroup (field Datasets DatasetName DatasetGroup))
+ (set gn:aboutCases (field Datasets AboutCases))
+ (set gn:aboutPlatform (field Datasets AboutPlatform))
+ (set gn:aboutDataProcessing (field Datasets AboutDataProcessing))
+ (set gn:notes (field Datasets Notes))
+ (set gn:experimentDesign (field Datasets ExperimentDesign))
+ (set gn:contributors (field Datasets Contributors))
+ (set gn:citation (field Datasets Citation))
+ (set gn:acknowledgment (field Datasets Acknowledgment))))
(define (dump-data-table db table-name data-field)
(let ((dump-directory (string-append %dump-directory "/" table-name))