aboutsummaryrefslogtreecommitdiff
path: root/gn/packages/bioinformatics.scm
diff options
context:
space:
mode:
author Efraim Flashner 2021-09-05 00:30:57 +0300
committer Efraim Flashner 2021-09-05 00:30:57 +0300
commit b773e0e0a2869b2d370891696e0dd1382dacf9f4 (patch)
tree fc6a2fa132d520ee140b2ae4f055a312c40769d1 /gn/packages/bioinformatics.scm
parent 6b358542f5832376be16a165b897ea43270e2323 (diff)
download guix-bioinformatics-b773e0e0a2869b2d370891696e0dd1382dacf9f4.tar.gz
WIP braker
Diffstat (limited to 'gn/packages/bioinformatics.scm')
-rw-r--r-- gn/packages/bioinformatics.scm 435
1 files changed, 435 insertions, 0 deletions
diff --git a/gn/packages/bioinformatics.scm b/gn/packages/bioinformatics.scm
index 91901a6..677de1d 100644
--- a/gn/packages/bioinformatics.scm
+++ b/gn/packages/bioinformatics.scm
@@ -12,12 +12,14 @@
#:use-module (guix build-system cmake)
#:use-module (guix build-system gnu)
#:use-module (guix build-system meson)
+ #:use-module (guix build-system perl)
#:use-module (guix build-system python)
#:use-module (guix build-system trivial)
#:use-module (guix build-system waf)
#:use-module (gnu packages)
#:use-module (gn packages crates-io)
#:use-module (gn packages java)
+ #:use-module (gn packages perl)
#:use-module (gn packages python)
#:use-module (gn packages twint)
#:use-module (gnu packages algebra)
@@ -2407,3 +2409,436 @@ To run the bundled rtg-tools software you will also need java. The
@code{icedtea:jdk} output should work nicely.")
(license (list license:expat ; bundled jsoncpp, klib
license:bsd-2))))
+
+;; BRAKER gene-prediction pipeline.  Nothing is compiled: the Perl and Python
+;; scripts under "scripts/" are copied to bin/ and wrapped so that they find
+;; their Perl modules and the external tools they call.
+;; WIP: augustus, genemark, genomethreader, gushr, makehub and the UCSC tools
+;; are not packaged yet, so their wrapper entries remain commented out.
+(define-public braker
+  (package
+    (name "braker")
+    (version "2.1.6")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/Gaius-Augustus/BRAKER")
+             (commit (string-append "v" version))))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32
+         "0jx9ycyh8s42yb4va84mcjyh7m95wzvf42skc3p4pvc5r7c5ns41"))))
+    (build-system perl-build-system)
+    (arguments
+     `(#:modules ((srfi srfi-26)        ; provides 'cut'
+                  (guix build perl-build-system)
+                  (guix build utils))
+       #:phases
+       (modify-phases %standard-phases
+         ;; The 'configure and 'build phases are not used by this package.
+         (delete 'configure)
+         (delete 'build)
+         (replace 'check
+           (lambda* (#:key tests? #:allow-other-keys)
+             (when tests?
+               (invoke "prove" "-l" "t"))))
+         (replace 'install
+           (lambda* (#:key inputs outputs #:allow-other-keys)
+             (let* ((out (assoc-ref outputs "out"))
+                    (bin (string-append out "/bin"))
+                    ;; Return the bin/ directory of the input named LABEL.
+                    (input-bin
+                     (lambda (label)
+                       (string-append (assoc-ref inputs label) "/bin")))
+                    ;; The installed .pm files live in BIN next to the
+                    ;; scripts, so BIN is added to PERL5LIB as well.
+                    (perl5lib
+                     `("PERL5LIB" ":" prefix (,(getenv "PERL5LIB") ,bin))))
+               (with-directory-excursion "scripts"
+                 (for-each (cut install-file <> bin)
+                           (find-files "." "\\.(py|pl|pm)$"))
+                 (for-each (cut wrap-script <> perl5lib)
+                           ;; braker.pl is the entry point so wrap it separately.
+                           (delete (string-append bin "/braker.pl")
+                                   (find-files bin "\\.pl$")))
+                 ;; The position argument of wrap-script is one of the
+                 ;; symbols '=', 'prefix' or 'suffix', not a string.
+                 (wrap-script (string-append bin "/braker.pl")
+                   perl5lib
+                   ;; PATH entries must be bin/ directories, not store roots.
+                   `("PATH" ":" prefix
+                     (;,(input-bin "augustus")
+                      ;,(input-bin "genemark")
+                      ;,(input-bin "genomethreader")
+                      ;,(input-bin "gushr")
+                      ;,(input-bin "ucsc")
+                      ,(input-bin "bamtools")
+                      ,(input-bin "samtools")
+                      ,(input-bin "prothint")
+                      ,(input-bin "spaln")
+                      ,(input-bin "exonerate")
+                      ,(input-bin "ncbi-blast")
+                      ,(input-bin "diamond")
+                      ,(input-bin "cdbfasta")))
+                   ;`("GENEMARK_PATH" = (,(input-bin "genemark")))
+                   ;`("AUGUSTUS_BIN_PATH" = (,(input-bin "augustus")))
+                   ;`("AUGUSTUS_AUGUSTUS_SCRIPTS_PATH" = (,(string-append (input-bin "augustus") "/augustus_scripts")))
+                   ;`("MAKEHUB_PATH" = (,(input-bin "makehub")))
+                   `("PYTHON3_PATH" = (,(input-bin "python")))
+                   `("BAMTOOLS_PATH" = (,(input-bin "bamtools")))
+                   `("DIAMOND_PATH" = (,(input-bin "diamond")))
+                   ;; The blast+ package is registered under the
+                   ;; "ncbi-blast" input label.
+                   `("BLAST_PATH" = (,(input-bin "ncbi-blast")))
+                   `("PROTHINT_PATH" = (,(input-bin "prothint")))
+                   `("SAMTOOLS_PATH" = (,(input-bin "samtools")))
+                   `("CDBTOOLS_PATH" = (,(input-bin "cdbfasta")))
+                   ;; NOTE(review): the two values below are joined with ":";
+                   ;; confirm against BRAKER whether this variable should
+                   ;; instead name a single directory.
+                   `("ALIGNMENT_TOOL_PATH" =
+                     (,(string-append (input-bin "spaln") "/spaln")
+                      ,(string-append (input-bin "exonerate") "/exonerate"))))))))))
+    (inputs
+     `(("guile" ,guile-3.0)             ; for wrap-script
+       ;("augustus" ,augustus)
+       ;("genemark" ,genemark)
+       ("bamtools" ,bamtools)
+       ("samtools" ,samtools)
+       ("prothint" ,prothint)
+       ;("genomethreader" ,genomethreader)
+       ("spaln" ,spaln)
+       ("exonerate" ,exonerate)
+       ("ncbi-blast" ,blast+)
+       ("diamond" ,diamond)
+       ("cdbfasta" ,cdbfasta)           ; provides cdbfasta and cdbyank
+       ;("gushr" ,gushr)
+       ;("makehub" ,makehub)
+       ;("ucsc" ,ucsc-genome-browser)   ; provides bin/twoBitInfo and bin/faToTwoBit
+       ("perl" ,perl)
+       ("perl-hash-merge" ,perl-hash-merge)
+       ("perl-math-utils" ,perl-math-utils)
+       ("perl-mce" ,perl-mce)
+       ("perl-module-load-conditional" ,perl-module-load-conditional)
+       ("perl-parallel-forkmanager" ,perl-parallel-forkmanager)
+       ("perl-yaml" ,perl-yaml)
+       ("python" ,python)))
+    (home-page "https://github.com/Gaius-Augustus/BRAKER")
+    (synopsis
+     "Pipeline for fully automated prediction of protein coding gene structures")
+    (description "The rapidly growing number of sequenced genomes requires
+fully automated methods for accurate gene structure annotation. With this goal
+in mind, we have developed BRAKER1, a combination of GeneMark-ET and AUGUSTUS
+that uses genomic and RNA-Seq data to automatically generate full gene structure
+annotations in novel genome.
+However, the quality of RNA-Seq data that is available for annotating a novel
+genome is variable, and in some cases, RNA-Seq data is not available, at all.
+BRAKER2 is an extension of BRAKER1 which allows for fully automated training of
+the gene prediction tools GeneMark-EX and AUGUSTUS from RNA-Seq and/or protein
+homology information, and that integrates the extrinsic evidence from RNA-Seq
+and protein homology information into the prediction.
+In contrast to other available methods that rely on protein homology
+information, BRAKER2 reaches high gene prediction accuracy even in the absence
+of the annotation of very closely related species and in the absence of
+RNA-Seq data.")
+    (license license:artistic2.0)))
+
+;; ProtHint protein-hint pipeline.  The scripts in "bin/" are installed
+;; verbatim; spalnBatch.sh is patched to call the packaged spaln and
+;; spaln_boundary_scorer instead of the bundled executables, which the
+;; source snippet deletes.
+(define-public prothint
+  (package
+    (name "prothint")
+    (version "2.6.0")
+    (source
+     (origin
+       ;; Tests not included in release.
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/gatech-genemark/ProtHint")
+             (commit (string-append "v" version))))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32
+         "0fwz8rxlmfxn5im2ck0jnqi3qaps1rnpkbh0l8k0waw2di5982zf"))
+       (modules '((guix build utils)))
+       (snippet
+        '(begin
+           ;; Let's at least remove the executables ...
+           (delete-file "dependencies/diamond")
+           (delete-file "dependencies/probuild")
+           (delete-file "dependencies/spaln")
+           (delete-file "dependencies/spaln_boundary_scorer")))))
+    (build-system perl-build-system)
+    (arguments
+     `(#:tests? #f          ; TODO: Test suite fails, or packaging is wrong?
+       ;; (srfi srfi-26) provides 'cut', used by the 'install phase.
+       #:modules ((srfi srfi-26)
+                  (guix build perl-build-system)
+                  (guix build utils))
+       #:phases
+       (modify-phases %standard-phases
+         (add-after 'unpack 'adjust-source
+           (lambda* (#:key inputs #:allow-other-keys)
+             ;; Redirect the references to bundled dependencies to the
+             ;; corresponding store items.
+             (substitute* "bin/spalnBatch.sh"
+               (("\\$binDir/\\.\\./dependencies/spaln\\\"")
+                (string-append (assoc-ref inputs "spaln") "/bin/spaln\""))
+               (("\\$binDir/\\.\\./dependencies/spaln_boundary_scorer\\\"")
+                (string-append (assoc-ref inputs "spaln-boundary-scorer")
+                               "/bin/spaln_boundary_scorer\""))
+               (("\\$binDir/\\.\\./dependencies/spaln_table\\\"")
+                (string-append (assoc-ref inputs "spaln")
+                               "/share/spaln/table\"")))))
+         (delete 'configure)
+         (delete 'build)
+         (replace 'check
+           (lambda* (#:key tests? #:allow-other-keys)
+             (when tests?
+               (substitute* "tests/common.py"
+                 (("/bin/bash") (which "bash")))
+               (invoke "./tests/test_ProtHint.py")
+               (invoke "./tests/test_Spaln.py")
+               (invoke "./tests/test_iter.py"))))
+         (replace 'install
+           (lambda* (#:key inputs outputs #:allow-other-keys)
+             (let* ((out (assoc-ref outputs "out"))
+                    (bin (string-append out "/bin"))
+                    ;; Return the bin/ directory of the input named LABEL.
+                    (input-bin
+                     (lambda (label)
+                       (string-append (assoc-ref inputs label) "/bin"))))
+               ;; The other phases reference "bin/" and "tests/" relative to
+               ;; the source root, so the scripts are collected from "bin"
+               ;; there rather than from inside a "scripts" directory.
+               (for-each (cut install-file <> bin)
+                         (find-files "bin" "\\.(pl|py|sh)$"))
+               (for-each
+                (cut wrap-script <>
+                     `("PERL5LIB" ":" prefix (,(getenv "PERL5LIB"))))
+                (find-files bin "\\.pl$"))
+               ;; PATH entries must be bin/ directories, not store roots.
+               ;; TODO: add genemark once it is packaged.
+               (for-each
+                (cut wrap-script <>
+                     `("PATH" ":" prefix (,(input-bin "coreutils")
+                                          ,(input-bin "diamond")
+                                          ,(input-bin "grep")
+                                          ,(input-bin "spaln"))))
+                (find-files bin "\\.py$"))))))))
+    (inputs
+     `(("guile" ,guile-3.0)             ; for wrap-script
+       ;("genemark" ,genemark)
+       ("spaln" ,spaln)
+       ("spaln-boundary-scorer" ,spaln-boundary-scorer)
+       ("diamond" ,diamond)
+       ("grep" ,grep)
+       ("perl" ,perl)
+       ("perl-mce" ,perl-mce)
+       ("perl-math-utils" ,perl-math-utils)
+       ("perl-yaml" ,perl-yaml)
+       ("python" ,python)))
+    (home-page "https://github.com/gatech-genemark/ProtHint")
+    (synopsis
+     "Protein hint generation pipeline for gene finding in eukaryotic genomes")
+    (description "ProtHint is a pipeline for predicting and scoring hints (in
+the form of introns, start and stop codons) in the genome of interest by mapping
+and spliced aligning predicted genes to a database of reference protein
+sequences.")
+    ;; Licensee may use the Product solely for Licensee's own internal research purposes.
+    ;; NOTE(review): 'license:non-copyleft' is normally applied to a license
+    ;; URI; confirm the intended value before this leaves WIP.
+    (license license:non-copyleft)))
+
+;; cdbfasta/cdbyank, built from an untagged upstream snapshot.  There is no
+;; configure script, and the two resulting executables are copied into bin/
+;; by a custom 'install phase.
+(define-public cdbfasta
+  (let ((commit "014498c66eebdd59b6f1b97e8aad0fcedbdd20b1") ; Oct 5, 2018
+        (revision "4"))
+    (package
+      (name "cdbfasta")
+      (version (git-version "0.0.0" revision commit))
+      (source
+       (origin
+         (method git-fetch)
+         (uri (git-reference
+               (url "https://github.com/gpertea/cdbfasta")
+               (commit commit)))
+         (file-name (git-file-name name version))
+         (sha256
+          (base32
+           "1gqgv3q9qwdq6lqnxd9xnfc7bzkxkiz4crr0vmywafsvwvg6nghk"))))
+      (build-system gnu-build-system)
+      (arguments
+       '(#:tests? #f                    ; TODO: run test suite.
+         #:phases
+         (modify-phases %standard-phases
+           ;; No configure script.
+           (delete 'configure)
+           ;; Only runs when tests are enabled; the two trailing arguments
+           ;; are still placeholders.
+           (replace 'check
+             (lambda* (#:key tests? #:allow-other-keys)
+               (when tests?
+                 (invoke "perl" "perltest.pl" "<fasta.cidx>" "<key>"))))
+           ;; Copy both executables into the output's bin/ directory.
+           (replace 'install
+             (lambda* (#:key outputs #:allow-other-keys)
+               (let* ((prefix (assoc-ref outputs "out"))
+                      (destination (string-append prefix "/bin/")))
+                 (for-each (lambda (program)
+                             (install-file program destination))
+                           '("cdbfasta" "cdbyank"))
+                 #t))))))
+      (inputs
+       `(("zlib" ,zlib)))
+      (native-inputs
+       `(("perl" ,perl)))
+      (home-page "https://github.com/gpertea/cdbfasta")
+      (synopsis "CDB (Constant DataBase) indexing and retrieval tools for FASTA files")
+      (description
+       "This package provides platform independent file-based hashing tools
+(cdbfasta and cdbyank) that can be used for creating indices for quick retrieval
+of any particular sequences from large multi-FASTA files.")
+      (license license:artistic2.0))))
+
+;; Spaln spliced-alignment tool.  The build runs inside "src/", which has
+;; its own configure script that takes non-standard flags.  The working
+;; directory is restored to the source root before the license files are
+;; installed.
+(define-public spaln
+  (package
+    (name "spaln")
+    (version "2.4.11")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/ogotoh/spaln")
+             (commit (string-append "ver." version))))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32
+         "049jmaf06v8h8b7fxnwf713dg1hkpb1nf3v4glkvxmq9jwv3d90y"))))
+    (build-system gnu-build-system)
+    (arguments
+     `(#:configure-flags
+       (let ((out (assoc-ref %outputs "out")))
+         (list "--use_zlib=1"
+               (string-append "--exec_prefix=" out "/bin")
+               (string-append "--table_dir=" out "/share/spaln/table")
+               (string-append "--alndbs_dir=" out "/share/spaln/seqdb")))
+       #:test-target "test"
+       #:tests? #f                      ; test.sh not included in source
+       #:phases
+       (modify-phases %standard-phases
+         (add-after 'unpack 'chdir
+           (lambda _
+             (chdir "src")))
+         (add-before 'configure 'adjust-source
+           (lambda _
+             (substitute* "Makefile.in"
+               ;; Append to CFLAGS rather than assigning it.
+               (("CFLAGS =") "CFLAGS +=")
+               ;; Make the mdm_* tables writable around the makmdm step
+               ;; and read-only again afterwards.
+               (("(\\s+)\\./makmdm.*" all space)
+                (string-append space "chmod +w $(table_dir)/mdm_*\n" all
+                               space "chmod -w $(table_dir)/mdm_*\n")))))
+         (replace 'configure
+           (lambda* (#:key configure-flags #:allow-other-keys)
+             ;; The custom configure script doesn't recognize some common flags.
+             (apply invoke "./configure" configure-flags)))
+         ;; Move 'check phase to after 'install.
+         (delete 'check)
+         (add-after 'install 'check
+           (assoc-ref %standard-phases 'check))
+         (add-after 'install 'make-tarballs-writable
+           (lambda* (#:key outputs #:allow-other-keys)
+             (let ((out (assoc-ref outputs "out")))
+               (for-each make-file-writable (find-files out "\\.gz$")))))
+         (add-after 'install 'install-manpages
+           (lambda* (#:key outputs #:allow-other-keys)
+             (let ((man1 (string-append (assoc-ref outputs "out")
+                                        "/share/man/man1")))
+               (mkdir-p man1)
+               ;; The man pages sit one level above "src/".
+               (install-file "../spaln.1" man1)
+               (install-file "../sortgrcd.1" man1))))
+         ;; Given its own name so that it does not collide with the
+         ;; 'chdir phase added after 'unpack.
+         (add-before 'install-license-files 'return-to-source-root
+           (lambda _
+             (chdir "../"))))))
+    (inputs
+     `(("zlib" ,zlib)))
+    (home-page "https://github.com/ogotoh/spaln")
+    (synopsis
+     "Genome mapping and spliced alignment of cDNA or amino acid sequences")
+    (description "@acronym{Spaln, space-efficient spliced alignment} is a
+stand-alone program that maps and aligns a set of cDNA or protein sequences onto
+a whole genomic sequence in a single job. Spaln also performs spliced or
+ordinary alignment after rapid similarity search against a protein sequence
+database, if a genomic segment or an amino acid sequence is given as a query.")
+    (license license:gpl2+)))
+
+;; spaln_boundary_scorer, built from an untagged upstream commit.  There is
+;; no configure script; the single executable is installed by hand.
+(define-public spaln-boundary-scorer
+  (let ((commit "b48977154a75a8559ff0398b8858dc2a51768632")
+        (revision "1"))
+    (package
+      (name "spaln-boundary-scorer")
+      ;; Derive the version from REVISION and COMMIT, as cdbfasta does, so
+      ;; that the snapshot being built is identifiable.
+      (version (git-version "0.0.0" revision commit))
+      (source
+       (origin
+         (method git-fetch)
+         (uri (git-reference
+               (url "https://github.com/gatech-genemark/spaln-boundary-scorer")
+               (commit commit)))
+         (file-name (git-file-name name version))
+         (sha256
+          (base32
+           "1f2q37kgw1vk0g3xqq7dkq2wvl6vsfbc58g9g487mq7prjb7nkqc"))))
+      (build-system gnu-build-system)
+      (arguments
+       `(#:test-target "test"
+         #:phases
+         (modify-phases %standard-phases
+           (delete 'configure)
+           (replace 'check
+             (lambda* (#:key tests? test-target #:allow-other-keys)
+               (when tests?
+                 ;; Run the make test target, then the test executable.
+                 (invoke "make" test-target)
+                 (invoke "./test/t_spaln_boundary_scorer"))))
+           (replace 'install
+             (lambda* (#:key outputs #:allow-other-keys)
+               (let ((bin (string-append (assoc-ref outputs "out") "/bin")))
+                 (install-file "spaln_boundary_scorer" bin)))))))
+      (home-page "https://github.com/gatech-genemark/spaln-boundary-scorer")
+      (synopsis
+       "Parse introns, starts, stops and exons from Spaln's alignment output and scores them")
+      (description "Spaln boundary scorer parses introns, starts, stops and
+exons from Spaln's alignment output and scores them.  Introns, starts and stops
+are scored based on local alignment quality around their boundaries.")
+      ;; https://github.com/gatech-genemark/spaln-boundary-scorer/issues/1
+      ;; NOTE(review): 'license:non-copyleft' is normally applied to a license
+      ;; URI; confirm the intended value once the issue above is resolved.
+      (license license:non-copyleft))))
+
+;; Not currently working
+;; GeneMark, taken from a tarball that already sits in the store.  The
+;; bundled ProtHint copy is removed by the source snippet, a license key is
+;; written to $HOME/.gm_key, and the upstream check_install.bash script
+;; serves as the test suite.
+;; TODO: there is no working 'install phase yet, and the synopsis and
+;; description are still empty.
+(define-public genemark
+  (package
+    (name "genemark")
+    (version "4.68")
+    (source
+     (origin
+       (method url-fetch)
+       (uri "file:///gnu/store/1xjfwgqz6kif8a9dy0hlc3a67vjrpipx-gmes_linux_64.tar.gz")
+       (file-name "gmes_linux_64.tar.gz")
+       (sha256
+        (base32
+         "11j7fr3jmammicvdphq69sh8wmi56aipxiimb6vdfc9z3lafz308"))
+       (modules '((guix build utils)))
+       (snippet
+        '(begin
+           (delete-file-recursively "ProtHint")))))
+    (build-system gnu-build-system)
+    (arguments
+     `(#:phases
+       (modify-phases %standard-phases
+         (delete 'configure)
+         (delete 'build)
+         (add-after 'unpack 'add-license-key
+           (lambda _
+             ;; The key file is written to $HOME/.gm_key, with HOME
+             ;; pointed at the build directory.
+             (setenv "HOME" (getcwd))
+             ;; Provided key good until ~ 2022-09-04
+             (with-output-to-file ".gm_key"
+               (lambda _
+                 (format #t
+                         "AACCCACTCACGGGCCAGGGTTTGAGCACTCGAGCTAGTGCAGCATGTTTTTTTTTTTTTTTTCATATGAGCGCCAGTGG~@
+ 454168340~%")))))
+         ;; This doesn't work, it's statically linked
+         ;(add-after 'unpack 'set-interpreter
+         ;  (lambda* (#:key inputs #:allow-other-keys)
+         ;    (invoke "patchelf" "--set-interpreter" (car (find-files (assoc-ref inputs "libc") "ld-linux.*\\.so")) "gmhmme3")))
+         (replace 'check
+           ;; Honour #:tests? like the other packages in this file.
+           (lambda* (#:key tests? #:allow-other-keys)
+             (when tests?
+               (invoke "bash" "check_install.bash")))))))
+    (inputs
+     `(("perl" ,perl)
+       ("perl-hash-merge" ,perl-hash-merge)
+       ("perl-mce" ,perl-mce)
+       ("perl-math-utils" ,perl-math-utils)
+       ("perl-parallel-forkmanager" ,perl-parallel-forkmanager)
+       ("perl-yaml" ,perl-yaml)
+       ("python" ,python)))
+    (native-inputs
+     `(;("patchelf" ,patchelf)
+       ))
+    (home-page "http://exon.gatech.edu/GeneMark/license_download.cgi")
+    (synopsis "")
+    (description "")
+    ;; Precompiled binary
+    (supported-systems '("x86_64-linux"))
+    ;; Licensee may use the Product solely for Licensee's own internal research purposes.
+    ;; NOTE(review): 'license:non-copyleft' is normally applied to a license
+    ;; URI; confirm the intended value before this leaves WIP.
+    (license license:non-copyleft)))