diff options
author | Efraim Flashner | 2021-09-05 00:30:57 +0300 |
---|---|---|
committer | Efraim Flashner | 2021-09-05 00:30:57 +0300 |
commit | b773e0e0a2869b2d370891696e0dd1382dacf9f4 (patch) | |
tree | fc6a2fa132d520ee140b2ae4f055a312c40769d1 /gn/packages/bioinformatics.scm | |
parent | 6b358542f5832376be16a165b897ea43270e2323 (diff) | |
download | guix-bioinformatics-b773e0e0a2869b2d370891696e0dd1382dacf9f4.tar.gz |
WIP braker
Diffstat (limited to 'gn/packages/bioinformatics.scm')
-rw-r--r-- | gn/packages/bioinformatics.scm | 435 |
1 files changed, 435 insertions, 0 deletions
diff --git a/gn/packages/bioinformatics.scm b/gn/packages/bioinformatics.scm
index 91901a6..677de1d 100644
--- a/gn/packages/bioinformatics.scm
+++ b/gn/packages/bioinformatics.scm
@@ -12,12 +12,14 @@
   #:use-module (guix build-system cmake)
   #:use-module (guix build-system gnu)
   #:use-module (guix build-system meson)
+  #:use-module (guix build-system perl)
   #:use-module (guix build-system python)
   #:use-module (guix build-system trivial)
   #:use-module (guix build-system waf)
   #:use-module (gnu packages)
   #:use-module (gn packages crates-io)
   #:use-module (gn packages java)
+  #:use-module (gn packages perl)
   #:use-module (gn packages python)
   #:use-module (gn packages twint)
   #:use-module (gnu packages algebra)
@@ -2407,3 +2409,436 @@ To run the bundled rtg-tools software you will also need java. The
 @code{icedtea:jdk} output should work nicely.")
     (license (list license:expat       ; bundled jsoncpp, klib
                    license:bsd-2))))
+
+(define-public braker
+  (package
+    (name "braker")
+    (version "2.1.6")
+    (source
+      (origin
+        (method git-fetch)
+        (uri (git-reference
+               (url "https://github.com/Gaius-Augustus/BRAKER")
+               (commit (string-append "v" version))))
+        (file-name (git-file-name name version))
+        (sha256
+          (base32
+            "0jx9ycyh8s42yb4va84mcjyh7m95wzvf42skc3p4pvc5r7c5ns41"))))
+    (build-system perl-build-system)
+    (arguments
+     `(#:modules ((srfi srfi-26)
+                  (guix build perl-build-system)
+                  (guix build utils))
+       #:phases
+       (modify-phases %standard-phases
+         (delete 'configure)
+         (delete 'build)
+         (replace 'check
+           (lambda* (#:key tests? #:allow-other-keys)
+             (when tests?
+               (invoke "prove" "-l" "t"))))
+         (replace 'install
+           (lambda* (#:key inputs outputs #:allow-other-keys)
+             (let* ((out (assoc-ref outputs "out"))
+                    (bin (string-append out "/bin")))
+               (with-directory-excursion "scripts"
+                 (for-each (cut install-file <> bin)
+                           (find-files "." "\\.(py|pl|pm)$"))
+                 (for-each
+                   (cut wrap-script <>
+                        `("PERL5LIB" ":" prefix (,(getenv "PERL5LIB")
+                                                 ,bin)))
+                   ;; braker.pl is the entry point so wrap it separately.
+                   (delete (string-append bin "/braker.pl")
+                           (find-files bin "\\.pl$")))
+                 (wrap-script (string-append bin "/braker.pl")
+                   `("PERL5LIB" ":" prefix (,(getenv "PERL5LIB")
+                                            ,bin))
+                   `("PATH" ":" prefix (;,(string-append (assoc-ref inputs "augustus") "/bin")
+                                        ;,(string-append (assoc-ref inputs "genemark") "/bin")
+                                        ,(string-append (assoc-ref inputs "bamtools") "/bin") ; PATH needs the bin directories, not the store roots
+                                        ,(string-append (assoc-ref inputs "samtools") "/bin")
+                                        ,(string-append (assoc-ref inputs "prothint") "/bin")
+                                        ;,(string-append (assoc-ref inputs "genomethreader") "/bin")
+                                        ,(string-append (assoc-ref inputs "spaln") "/bin")
+                                        ,(string-append (assoc-ref inputs "exonerate") "/bin")
+                                        ,(string-append (assoc-ref inputs "ncbi-blast") "/bin")
+                                        ,(string-append (assoc-ref inputs "diamond") "/bin")
+                                        ,(string-append (assoc-ref inputs "cdbfasta") "/bin")
+                                        ;,(string-append (assoc-ref inputs "gushr") "/bin")
+                                        ;,(string-append (assoc-ref inputs "ucsc") "/bin")
+                                        ))
+                   ;`("GENEMARK_PATH" = (,(string-append (assoc-ref inputs "genemark") "/bin")))
+                   ;`("AUGUSTUS_BIN_PATH" = (,(string-append (assoc-ref inputs "augustus") "/bin")))
+                   ;`("AUGUSTUS_AUGUSTUS_SCRIPTS_PATH" = (,(string-append (assoc-ref inputs "augustus") "/bin/augustus_scripts")))
+                   `("PYTHON3_PATH" = (,(string-append (assoc-ref inputs "python") "/bin"))) ; the position must be the symbol =, not the string "="
+                   `("BAMTOOLS_PATH" = (,(string-append (assoc-ref inputs "bamtools") "/bin")))
+                   `("DIAMOND_PATH" = (,(string-append (assoc-ref inputs "diamond") "/bin")))
+                   `("BLAST_PATH" = (,(string-append (assoc-ref inputs "ncbi-blast") "/bin"))) ; the input label is "ncbi-blast"
+                   `("PROTHINT_PATH" = (,(string-append (assoc-ref inputs "prothint") "/bin")))
+                   `("SAMTOOLS_PATH" = (,(string-append (assoc-ref inputs "samtools") "/bin")))
+                   `("CDBTOOLS_PATH" = (,(string-append (assoc-ref inputs "cdbfasta") "/bin")))
+                   `("ALIGNMENT_TOOL_PATH" = (,(string-append (assoc-ref inputs "spaln") "/bin/spaln")
+                                              ,(string-append (assoc-ref inputs "exonerate") "/bin/exonerate"))) ; NOTE(review): the two values are joined with ":" -- confirm that is intended
+                   ;`("MAKEHUB_PATH" = (,(string-append (assoc-ref inputs "makehub") "/bin")))
+                   )
+                 ))))
+         )))
+    (inputs
+     `(
+       ("guile" ,guile-3.0) ; for wrap-script
+
+       ;("augustus" ,augustus)
+       ;("genemark" ,genemark)
+       ("bamtools" ,bamtools)
+       ("samtools" ,samtools)
+       ("prothint" ,prothint)
+       ;("genometracker" ,genometracker)
+       ("spaln" ,spaln)
+       ("exonerate" ,exonerate)
+       ("ncbi-blast" ,blast+)
+       ("diamond" ,diamond)
+       ("cdbfasta" ,cdbfasta) ; provides cdbfasta and cdbyank
+       ;("gushr" ,gushr)
+       ;("makehub" ,makehub)
+       ;("ucsc" ,ucsc-genome-browser) ; provides bin/twoBitInfo and bin/faToTwoBit
+
+       ("perl" ,perl)
+       ("perl-hash-merge" ,perl-hash-merge)
+       ("perl-math-utils" ,perl-math-utils)
+       ("perl-mce" ,perl-mce)
+       ("perl-module-load-conditional" ,perl-module-load-conditional)
+       ("perl-parallel-forkmanager" ,perl-parallel-forkmanager)
+       ("perl-yaml" ,perl-yaml)
+       ("python" ,python)
+       ))
+    (native-inputs
+     `(
+       ))
+    (home-page "https://github.com/Gaius-Augustus/BRAKER")
+    (synopsis
+      "Pipeline for fully automated prediction of protein coding gene structures")
+    (description "The rapidly growing number of sequenced genomes requires
+fully automated methods for accurate gene structure annotation. With this goal
+in mind, we have developed BRAKER1, a combination of GeneMark-ET and AUGUSTUS
+that uses genomic and RNA-Seq data to automatically generate full gene structure
+annotations in novel genome.
+However, the quality of RNA-Seq data that is available for annotating a novel
+genome is variable, and in some cases, RNA-Seq data is not available, at all.
+BRAKER2 is an extension of BRAKER1 which allows for fully automated training of
+the gene prediction tools GeneMark-EX and AUGUSTUS from RNA-Seq and/or protein
+homology information, and that integrates the extrinsic evidence from RNA-Seq
+and protein homology information into the prediction.
+In contrast to other available methods that rely on protein homology
+information, BRAKER2 reaches high gene prediction accuracy even in the absence
+of the annotation of very closely related species and in the absence of
+RNA-Seq data.")
+    (license license:artistic2.0)))
+
+(define-public prothint
+  (package
+    (name "prothint")
+    (version "2.6.0")
+    (source
+      (origin
+        ;; Tests not included in release.
+        (method git-fetch)
+        (uri (git-reference
+               (url "https://github.com/gatech-genemark/ProtHint")
+               (commit (string-append "v" version))))
+        (file-name (git-file-name name version))
+        (sha256
+          (base32
+            "0fwz8rxlmfxn5im2ck0jnqi3qaps1rnpkbh0l8k0waw2di5982zf"))
+        (modules '((guix build utils)))
+        (snippet
+          '(begin
+             ;; Lets at least remove the executables ...
+             (delete-file "dependencies/diamond")
+             (delete-file "dependencies/probuild")
+             (delete-file "dependencies/spaln")
+             (delete-file "dependencies/spaln_boundary_scorer")))))
+    (build-system perl-build-system)
+    (arguments
+     `(#:tests? #f ; TODO: Test suite fails, or packaging is wrong?
+       #:phases
+       (modify-phases %standard-phases
+         (add-after 'unpack 'adjust-source
+           (lambda* (#:key inputs #:allow-other-keys)
+             (substitute* "bin/spalnBatch.sh"
+               (("\\$binDir/\\.\\./dependencies/spaln\\\"")
+                (string-append (assoc-ref inputs "spaln") "/bin/spaln\""))
+               (("\\$binDir/\\.\\./dependencies/spaln_boundary_scorer\\\"")
+                (string-append (assoc-ref inputs "spaln-boundary-scorer")
+                               "/bin/spaln_boundary_scorer\""))
+               (("\\$binDir/\\.\\./dependencies/spaln_table\\\"")
+                (string-append (assoc-ref inputs "spaln")
+                               "/share/spaln/table\""))
+               )))
+         (delete 'configure)
+         (delete 'build)
+         (replace 'check
+           (lambda* (#:key tests? #:allow-other-keys)
+             (when tests?
+               (substitute* "tests/common.py"
+                 (("/bin/bash") (which "bash")))
+               (invoke "./tests/test_ProtHint.py")
+               (invoke "./tests/test_Spaln.py")
+               (invoke "./tests/test_iter.py"))))
+         (replace 'install
+           (lambda* (#:key inputs outputs #:allow-other-keys)
+             (let* ((out (assoc-ref outputs "out"))
+                    (bin (string-append out "/bin")))
+               (with-directory-excursion "scripts" ; NOTE(review): 'adjust-source edits bin/ from the source root -- confirm scripts/bin exists
+                 (for-each (lambda (file) (install-file file bin)) ; plain lambda: (srfi srfi-26) is not imported here, so `cut' is unbound
+                           (find-files "bin" "\\.(pl|py|sh)$"))
+                 (for-each
+                   (lambda (script) (wrap-script script
+                        `("PERL5LIB" ":" prefix (,(getenv "PERL5LIB")))))
+                   (find-files bin "\\.pl$"))
+                 (for-each
+                   (lambda (script) (wrap-script script
+                        `("PATH" ":" prefix (,(string-append (assoc-ref inputs "coreutils") "/bin") ; PATH needs the bin directories
+                                             ,(string-append (assoc-ref inputs "diamond") "/bin")
+                                             ;,(string-append (assoc-ref inputs "genemark") "/bin")
+                                             ,(string-append (assoc-ref inputs "grep") "/bin")
+                                             ,(string-append (assoc-ref inputs "spaln") "/bin")))))
+                   (find-files bin "\\.py$")))))))))
+    (inputs
+     `(("guile" ,guile-3.0) ; for wrap-script
+
+       ;("genemark" ,genemark)
+       ("spaln" ,spaln)
+       ("spaln-boundary-scorer" ,spaln-boundary-scorer)
+       ("diamond" ,diamond)
+       ("grep" ,grep)
+
+       ("perl" ,perl)
+       ("perl-mce" ,perl-mce)
+       ("perl-math-utils" ,perl-math-utils)
+       ("perl-yaml" ,perl-yaml)
+       ("python" ,python)))
+    (home-page "https://github.com/gatech-genemark/ProtHint")
+    (synopsis
+      "Protein hint generation pipeline for gene finding in eukaryotic genomes")
+    (description "ProtHint is a pipeline for predicting and scoring hints (in
+the form of introns, start and stop codons) in the genome of interest by mapping
+and spliced aligning predicted genes to a database of reference protein
+sequences.")
+    ;; Licensee may use the Product solely for Licensee's own internal research purposes.
+    (license license:non-copyleft)))
+
+(define-public cdbfasta
+  (let ((commit "014498c66eebdd59b6f1b97e8aad0fcedbdd20b1") ; Oct 5, 2018
+        (revision "4"))
+    (package
+      (name "cdbfasta")
+      (version (git-version "0.0.0" revision commit))
+      (source (origin
+                (method git-fetch)
+                (uri (git-reference
+                       (url "https://github.com/gpertea/cdbfasta")
+                       (commit commit)))
+                (file-name (git-file-name name version))
+                (sha256
+                 (base32
+                  "1gqgv3q9qwdq6lqnxd9xnfc7bzkxkiz4crr0vmywafsvwvg6nghk"))))
+      (build-system gnu-build-system)
+      (arguments
+       '(#:tests? #f ; TODO: run test suite.
+         #:phases
+         (modify-phases %standard-phases
+           (delete 'configure) ; No configure script.
+           (replace 'check
+             (lambda* (#:key tests? #:allow-other-keys)
+               (when tests?
+                 (invoke "perl" "perltest.pl" "<fasta.cidx>" "<key>"))))
+           (replace 'install
+             (lambda* (#:key outputs #:allow-other-keys)
+               (let ((bin (string-append (assoc-ref outputs "out")
+                                         "/bin/")))
+                 (install-file "cdbfasta" bin)
+                 (install-file "cdbyank" bin)
+                 #t))))))
+      (inputs
+       `(("zlib" ,zlib)))
+      (native-inputs
+       `(("perl" ,perl)))
+      (home-page "https://github.com/gpertea/cdbfasta")
+      (synopsis "CDB (Constant DataBase) indexing and retrieval tools for FASTA files")
+      (description
+       "This package provides platform independent file-based hashing tools
+(cdbfasta and cdbyank) that can be used for creating indices for quick retrieval
+of any particular sequences from large multi-FASTA files.")
+      (license license:artistic2.0))))
+
+(define-public spaln
+  (package
+    (name "spaln")
+    (version "2.4.11")
+    (source
+      (origin
+        (method git-fetch)
+        (uri (git-reference
+               (url "https://github.com/ogotoh/spaln")
+               (commit (string-append "ver." version))))
+        (file-name (git-file-name name version))
+        (sha256
+         (base32
+          "049jmaf06v8h8b7fxnwf713dg1hkpb1nf3v4glkvxmq9jwv3d90y"))))
+    (build-system gnu-build-system)
+    (arguments
+     `(#:configure-flags
+       (let ((out (assoc-ref %outputs "out")))
+         (list "--use_zlib=1"
+               (string-append "--exec_prefix=" out "/bin")
+               (string-append "--table_dir=" out "/share/spaln/table")
+               (string-append "--alndbs_dir=" out "/share/spaln/seqdb")))
+       #:test-target "test"
+       #:tests? #f ; test.sh not included in source
+       #:phases
+       (modify-phases %standard-phases
+         (add-after 'unpack 'chdir
+           (lambda _
+             (chdir "src")))
+         (add-before 'configure 'adjust-source
+           (lambda* (#:key outputs #:allow-other-keys)
+             (substitute* "Makefile.in"
+               (("CFLAGS =") "CFLAGS +=")
+               (("(\\s+)\\./makmdm.*" all space)
+                (string-append space "chmod +w $(table_dir)/mdm_*\n" all
+                               space "chmod -w $(table_dir)/mdm_*\n")))))
+         (replace 'configure
+           (lambda* (#:key configure-flags #:allow-other-keys)
+             ;; The custom configure script doesn't recognize some common flags.
+             (apply invoke "./configure" configure-flags)))
+         ;; Move 'check phase to after 'install.
+         (delete 'check)
+         (add-after 'install 'check
+           (assoc-ref %standard-phases 'check))
+         (add-after 'install 'make-tarballs-writable
+           (lambda* (#:key outputs #:allow-other-keys)
+             (let ((out (assoc-ref outputs "out")))
+               (for-each make-file-writable (find-files out "\\.gz$")))))
+         (add-after 'install 'install-manpages
+           (lambda* (#:key outputs #:allow-other-keys)
+             (let ((man1 (string-append (assoc-ref outputs "out")
+                                        "/share/man/man1")))
+               (mkdir-p man1)
+               (install-file "../spaln.1" man1)
+               (install-file "../sortgrcd.1" man1))))
+         (add-before 'install-license-files 'chdir-to-source-root ; distinct name: a 'chdir phase already exists above
+           (lambda _
+             (chdir "../"))))))
+    (inputs
+     `(("zlib" ,zlib)))
+    (home-page "https://github.com/ogotoh/spaln")
+    (synopsis
+      "Genome mapping and spliced alignment of cDNA or amino acid sequences")
+    (description "@acronym{Spaln, space-efficient spliced alignment} is a
+stand-alone program that maps and aligns a set of cDNA or protein sequences onto
+a whole genomic sequence in a single job. Spaln also performs spliced or
+ordinary alignment after rapid similarity search against a protein sequence
+database, if a genomic segment or an amino acid sequence is given as a query.")
+    (license license:gpl2+)))
+
+(define-public spaln-boundary-scorer
+  (let ((commit "b48977154a75a8559ff0398b8858dc2a51768632")
+        (revision "1"))
+    (package
+      (name "spaln-boundary-scorer")
+      (version (git-version "0.0.0" revision commit)) ; encode the revision and pinned commit in the version
+      (source
+        (origin
+          (method git-fetch)
+          (uri (git-reference
+                 (url "https://github.com/gatech-genemark/spaln-boundary-scorer")
+                 (commit commit)))
+          (file-name (git-file-name name version))
+          (sha256
+           (base32
+            "1f2q37kgw1vk0g3xqq7dkq2wvl6vsfbc58g9g487mq7prjb7nkqc"))))
+      (build-system gnu-build-system)
+      (arguments
+       `(#:test-target "test"
+         #:phases
+         (modify-phases %standard-phases
+           (delete 'configure)
+           (replace 'check
+             (lambda* (#:key tests? test-target #:allow-other-keys)
+               (when tests?
+                 (invoke "make" test-target)
+                 (invoke "./test/t_spaln_boundary_scorer"))))
+           (replace 'install
+             (lambda* (#:key outputs #:allow-other-keys)
+               (let ((bin (string-append (assoc-ref outputs "out") "/bin")))
+                 (install-file "spaln_boundary_scorer" bin)))))))
+      (home-page "https://github.com/gatech-genemark/spaln-boundary-scorer")
+      (synopsis
+        "Parse introns, starts, stops and exons from Spaln's alignment output and scores them")
+      (description "Spaln boundary scorer parses introns, starts, stops and
+exons from Spaln's alignment output and scores them. Introns, starts and stops
+are scored based on local alignment quality around their boundaries.")
+      ;; https://github.com/gatech-genemark/spaln-boundary-scorer/issues/1
+      (license license:non-copyleft))))
+
+;; Not currently working
+(define-public genemark
+  (package
+    (name "genemark")
+    (version "4.68")
+    (source
+      (origin
+        (method url-fetch)
+        (uri "file:///gnu/store/1xjfwgqz6kif8a9dy0hlc3a67vjrpipx-gmes_linux_64.tar.gz")
+        (file-name "gmes_linux_64.tar.gz")
+        (sha256
+         (base32
+          "11j7fr3jmammicvdphq69sh8wmi56aipxiimb6vdfc9z3lafz308"))
+        (modules '((guix build utils)))
+        (snippet
+         '(begin
+            (delete-file-recursively "ProtHint")))))
+    (build-system gnu-build-system)
+    (arguments
+     `(
+       #:phases
+       (modify-phases %standard-phases
+         (delete 'configure)
+         (delete 'build)
+         (add-after 'unpack 'add-license-key
+           (lambda _
+             (setenv "HOME" (getcwd))
+             ;; Provided key good until ~ 2022-09-04
+             (with-output-to-file ".gm_key"
+               (lambda _
+                 (format #t
+                         "AACCCACTCACGGGCCAGGGTTTGAGCACTCGAGCTAGTGCAGCATGTTTTTTTTTTTTTTTTCATATGAGCGCCAGTGG~@
+                         454168340~%")))))
+         ;; This doesn't work, it's statically linked
+         ;(add-after 'unpack 'set-interpreter
+         ;  (lambda* (#:key inputs #:allow-other-keys)
+         ;    (invoke "patchelf" "--set-interpreter" (car (find-files (assoc-ref inputs "libc") "ld-linux.*\\.so")) "gmhmme3")))
+         (replace 'check
+           (lambda _
+             (invoke "bash" "check_install.bash")))
+         )))
+    (inputs
+     `(("perl" ,perl)
+       ("perl-hash-merge" ,perl-hash-merge)
+       ("perl-mce" ,perl-mce)
+       ("perl-math-utils" ,perl-math-utils)
+       ("perl-parallel-forkmanager" ,perl-parallel-forkmanager)
+       ("perl-yaml" ,perl-yaml)
+       ("python" ,python)))
+    (native-inputs
+     `(
+       ;("patchelf" ,patchelf)
+       ))
+    (home-page "http://exon.gatech.edu/GeneMark/license_download.cgi")
+    (synopsis "")
+    (description "")
+    ;; Precompiled binary
+    (supported-systems '("x86_64-linux"))
+    ;; Licensee may use the Product solely for Licensee's own internal research purposes.
+    (license license:non-copyleft)))