about summary refs log tree commit diff
path: root/gn/packages/bioinformatics.scm
diff options
context:
space:
mode:
authorShepherd on Tux022024-06-20 07:26:21 -0500
committerShepherd on Tux022024-06-20 07:26:21 -0500
commitf0f8dc84eca7088177797f84db05314537615c77 (patch)
tree5a7408a19c92b0c4d536916269425d883c4f06ca /gn/packages/bioinformatics.scm
parent951e77b10c84889f29c1f2322087c796cedb375a (diff)
parent48af9393cf186230e08b0fa6f7f443bc818408d2 (diff)
downloadguix-bioinformatics-f0f8dc84eca7088177797f84db05314537615c77.tar.gz
Merge branch 'master' of https://gitlab.com/genenetwork/guix-bioinformatics
Diffstat (limited to 'gn/packages/bioinformatics.scm')
-rw-r--r--gn/packages/bioinformatics.scm444
1 files changed, 402 insertions, 42 deletions
diff --git a/gn/packages/bioinformatics.scm b/gn/packages/bioinformatics.scm
index 2d1b55f..db420b7 100644
--- a/gn/packages/bioinformatics.scm
+++ b/gn/packages/bioinformatics.scm
@@ -23,7 +23,6 @@
   #:use-module (gn packages java)
   #:use-module (gn packages ocaml)
   #:use-module (gn packages python)
-  #:use-module (gn packages twint)
   #:use-module (gnu packages algebra)
   #:use-module (gnu packages assembly)
   #:use-module (gnu packages autotools)
@@ -33,6 +32,7 @@
   #:use-module (gnu packages bioinformatics)
   #:use-module (gnu packages boost)
   #:use-module (gnu packages bootstrap)
+  #:use-module (gnu packages c)
   #:use-module (gnu packages check)
   #:use-module (gnu packages cmake)
   #:use-module (gnu packages compression)
@@ -54,6 +54,7 @@
   #:use-module (gnu packages java)
   #:use-module (gnu packages jemalloc)
   #:use-module (gnu packages linux)
+  #:use-module (gnu packages llvm)
   #:use-module (gnu packages machine-learning)
   #:use-module (gnu packages maths)
   #:use-module (gnu packages mpi)
@@ -460,7 +461,7 @@ reads.")
     (description "Variant detection in massively parallel sequencing data.")
     ;; Free for non-commercial use by academic, government, and
     ;; non-profit/not-for-profit institutions
-    (license license:non-copyleft)))
+    (license (license:non-copyleft "file:///LICENSE"))))
 
 (define-public edirect-gn
   (deprecated-package "edirect-gn" edirect))
@@ -487,7 +488,7 @@ reads.")
     (arguments
      `(#:install-source? #f
        #:cargo-inputs
-       (("rust-clap" ,rust-clap-3.1)
+       (("rust-clap" ,rust-clap-3)
         ("rust-rustc-hash" ,rust-rustc-hash-1)
         ("rust-regex" ,rust-regex-1)
         ("rust-handlegraph" ,rust-handlegraph-0.7)
@@ -510,6 +511,305 @@ reads.")
 collapses them into a non-redundant graph structure.")
     (license license:expat)))
 
+(define-public gafpack
+  (let ((commit "ad31875b6914d964c6fd72d1bf334f0843538fb6")     ; November 10, 2022
+        (revision "1"))
+    (package
+      (name "gafpack")
+      (version (git-version "0.0.0" revision commit))
+      (source
+        (origin
+          (method git-fetch)
+          (uri (git-reference
+                 (url "https://github.com/ekg/gafpack")
+                 (commit commit)))
+          (file-name (git-file-name name version))
+          (sha256
+           (base32 "0di2psh0ls7jlbnqs7k71p55f73pn23a09k1h3ril7gwjcrzr3rk"))))
+      (build-system cargo-build-system)
+      (arguments
+       `(#:install-source? #f
+         #:cargo-inputs
+         (("rust-clap" ,rust-clap-4)
+          ("rust-gfa" ,rust-gfa-0.10))))
+      (home-page "https://github.com/ekg/gafpack")
+      (synopsis "Convert variation graph alignments to coverage maps over nodes")
+      (description
+       "Gafpack converts alignments to pangenome variation graphs to coverage
+maps useful in haplotype-based genotyping.")
+      (license license:expat))))
+
+(define-public agc-for-pgr-tk
+  (let ((commit "453c0afdc54b4aa00fa8e97a63f196931fdb81c4") ; April 26, 2022
+        (revision "1"))
+    (package
+      (name "agc")
+      (version (git-version "2.0" revision commit))
+      (source
+        (origin
+          (method git-fetch)
+          (uri (git-reference
+                 (url "https://github.com/cschin/agc")
+                 (commit commit)))
+          (file-name (git-file-name name version))
+          (sha256
+           (base32 "1v5s79rl38dcyy5h1lykbp6clcbqq9winn533j54y49q1jp8chix"))
+          (snippet
+           #~(begin
+               (use-modules (guix build utils))
+               ;; Copy the two radul files we can't find a replacement for:
+               ;; https://github.com/refresh-bio/RADULS
+               (mkdir "keep-libs")
+               (rename-file "libs/raduls.h" "keep-libs/raduls.h")
+               (rename-file "libs/libraduls.a" "keep-libs/libraduls.a")
+               (delete-file-recursively "libs")
+               (rename-file "keep-libs" "libs")
+
+               (delete-file-recursively "py_agc_api/pybind11-2.8.1")
+               (substitute* '("makefile" "makefile.release")
+                 (("-mavx") "")
+                 (("-m64") "")
+                 (("\\$\\(AGC_LIBS_DIR)\\/mimalloc/\\$\\(LIB_ALLOC\\)")
+                  "$(pkg-config --cflags --libs mimalloc) /usr/lib/libmimalloc.so")
+                 (("\\$\\(AGC_LIBS_DIR)\\/\\$\\(LIB_ZLIB\\)")
+                  "$(pkg-config --cflags --libs zlib) /usr/lib/libz.so")
+                 (("\\$\\(AGC_LIBS_DIR)\\/\\$\\(LIB_ZSTD\\)")
+                  "$(pkg-config --cflags --libs libzstd) /usr/lib/libzstd.so")
+                 (("^PYBIND11_LIB = .*") "PYBIND11_LIB = /usr/include/pybind11")
+                 (("\\$\\(PYBIND11_LIB\\)/include") "$(PYBIND11_LIB)"))
+               (substitute* (find-files "src" "\\.(h|cpp)$")
+                 (("../../libs/ketopt.h") "ketopt.h")
+                 (("../../libs/zlib.h") "zlib.h")
+                 (("../../libs/zstd.h") "zstd.h"))))))
+      (build-system gnu-build-system)
+      (arguments
+       `(#:tests? #f                    ; No tests.
+         #:phases
+         (modify-phases %standard-phases
+           (delete 'configure)          ; No configure script.
+           (add-after 'unpack 'adjust-sources
+             (lambda* (#:key inputs #:allow-other-keys)
+               (let ((mimalloc (assoc-ref inputs "mimalloc")))
+                 (substitute* '("makefile" "makefile.release")
+                   (("/usr/include/pybind11")
+                    (search-input-directory inputs "/include/pybind11"))
+                   (("/usr/lib/libmimalloc.so")
+                    (search-input-file inputs "/lib/libmimalloc.so"))
+                   (("/usr/lib/libz.so")
+                    (search-input-file inputs "/lib/libz.so"))
+                   (("/usr/lib/libzstd.so")
+                    (search-input-file inputs "/lib/libzstd.so"))
+                   (("pkg-config") ,(pkg-config-for-target))))))
+           (replace 'install
+             (lambda* (#:key outputs #:allow-other-keys)
+               (let* ((out (assoc-ref outputs "out"))
+                      (include (string-append out "/include/")))
+                 (install-file "agc" (string-append out "/bin"))
+                 (install-file "libagc.so" (string-append out "/lib"))
+                 (mkdir-p (string-append include "app"))
+                 (mkdir-p (string-append include "core"))
+                 (mkdir-p (string-append include "lib-cxx"))
+                 (with-directory-excursion "src"
+                   (for-each
+                     (lambda (file)
+                       (copy-file file (string-append include file)))
+                     (find-files "." "\\.h$")))))))))
+      (native-inputs
+       (list minimap2                   ; for ketopt.h
+             pkg-config))
+      (inputs
+       (list mimalloc
+             python
+             pybind11
+             zlib
+             (list zstd "lib")))
+      (home-page "https://github.com/cschin/agc")
+      (synopsis "Assembled Genomes Compressor")
+      (description
+       "@acronym{Assembled Genomes Compressor, AGC} is a tool designed to
+compress collections of de-novo assembled genomes.  It can be used for various
+types of datasets: short genomes (viruses) as well as long (humans).")
+      (license license:expat))))
+
+(define-public pgr-tk
+  (package
+    (name "pgr-tk")
+    (version "0.3.6")
+    (source
+      (origin
+        (method git-fetch)
+        (uri (git-reference
+               (url "https://github.com/Sema4-Research/pgr-tk")
+               (commit (string-append "v" version))))
+        (file-name (git-file-name name version))
+        (sha256
+         (base32 "0vm1k63v91zd0pfbg2zmwskajylz8xg83m63qxwaiwny5f4y6f1j"))
+        (snippet
+         #~(begin
+             (use-modules (guix build utils))
+             (substitute* (find-files "." "Cargo.toml")
+               ;; Only use the major+minor version to decrease the number of
+               ;; special version crates.
+               (("(.*= \")([[:digit:]]+\\.[[:digit:]]+)\\.[[:digit:]]+(\".*)"
+                 _ name version tail)
+                (string-append name version tail))
+               ;; Then fix the version string for the actual package.
+               (("^version = \".*")
+                (string-append "version = \"" #$version "\"\n")))))))
+    (build-system cargo-build-system)
+    (arguments
+     `(#:install-source? #f
+       #:cargo-test-flags
+       (list "--release" "--"
+             "--skip=get_aln_segements"
+             "--skip=get_shmmr_dots"
+             "--skip=AGCFile"
+             "--skip=SeqIndexDB")
+       #:cargo-inputs
+       (("rust-bindgen" ,rust-bindgen-0.58)
+        ("rust-bgzip" ,rust-bgzip-0.2)
+        ("rust-byteorder" ,rust-byteorder-1)
+        ("rust-clap" ,rust-clap-3)
+        ("rust-cuckoofilter" ,rust-cuckoofilter-0.5)
+        ("rust-flate2" ,rust-flate2-1)
+        ("rust-libc" ,rust-libc-0.2)
+        ("rust-log" ,rust-log-0.4)
+        ("rust-petgraph" ,rust-petgraph-0.6)
+        ("rust-pyo3" ,rust-pyo3-0.14)
+        ("rust-rayon" ,rust-rayon-1)
+        ("rust-regex" ,rust-regex-1)
+        ("rust-rustc-hash" ,rust-rustc-hash-1)
+        ("rust-serde" ,rust-serde-1)
+        ("rust-serde-json" ,rust-serde-json-1)
+        ("rust-simple-logger" ,rust-simple-logger-1))
+       #:phases
+       (modify-phases %standard-phases
+         (add-after 'unpack 'insert-wfa-source
+           (lambda* (#:key inputs #:allow-other-keys)
+             (copy-recursively (assoc-ref inputs "wfa-src")
+                               "rs-wfa/WFA")))
+         (add-after 'unpack 'adjust-source
+           (lambda* (#:key inputs #:allow-other-keys)
+             (substitute* '("pgr-bin/build.rs"
+                            "pgr-db/build.rs"
+                            "pgr-tk/build.rs")
+               (("git") "ls")
+               (("bioconda") "Guix"))
+             ;; Build with zlib, not zlib-ng
+             (substitute* '("pgr-bin/Cargo.toml"
+                            "pgr-db/Cargo.toml")
+               (("zlib-ng-compat") "zlib"))
+             ;; Don't look for agc to be bundled.
+             (substitute* "pgr-db/wrapper.h"
+               (("../agc/src/lib-cxx/agc-api.h") "lib-cxx/agc-api.h"))
+             (substitute* "pgr-db/build.rs"
+               ((".*panic!\\(\"Error.*") ""))
+             (mkdir-p "target/release")
+             (symlink (search-input-file inputs "/bin/agc")
+                      "target/release/agc")
+             (symlink (search-input-file inputs "/lib/libagc.so")
+                      "target/release/libagc")))
+         (replace 'install
+           (lambda* (#:key outputs #:allow-other-keys)
+             (let ((out (assoc-ref outputs "out")))
+               (with-directory-excursion "target/release"
+                 (install-file "libpgrtk.so" (string-append out "/lib"))
+                 (for-each
+                   (lambda (file)
+                     (install-file file (string-append out "/bin")))
+                   (list "pgr-filter"
+                         "pgr-mdb"
+                         "pgr-multifilter"
+                         "pgr-probe-match"
+                         "pgr-shmmr-pair-count")))))))))
+    (inputs
+     (list agc-for-pgr-tk
+           clang
+           python
+           zlib
+           (list zstd "lib")))
+    (native-inputs
+     `(("pkg-config" ,pkg-config)
+       ("wfa-src"
+        ,(origin
+           (method git-fetch)
+           (uri (git-reference
+                  ;; forPYO3 branch, 14-03-2021
+                  (url "https://github.com/cschin/WFA")
+                  (commit "1f8c8d2905ed482cd2d306a1676d60c2a45cb098")))
+           (file-name "wfa-for-pgr-tk")
+           (sha256
+            (base32 "19h1cjp2bdlcfq5c6rsbk8bc0f8zn64b471dhj4xlfxd1prv2dpk"))))))
+    (home-page "https://github.com/Sema4-Research/pgr-tk")
+    (synopsis "Pangenome Research Tool Kit")
+    (description
+     "PGR-TK provides pangenome assembly management, query and
+@acronym{Minimizer Anchored Pangenome, MAP} Graph Generation.  It is a project
+to provide Python and Rust libraries to facilitate pangenomics analysis.
+Several algorithms and data structures used for the Peregrine Genome Assembler
+are useful for Pangenomics analysis as well.  This repo takes those algorithms
+and data structure, combining other handy 3rd party tools to expose them as a
+library in Python (with Rust code for those computing parts that need
+performance.)")
+    (license (license:non-copyleft
+               "file:///LICENSE"
+               "CC-BY-NC-SA 4.0"))))
+
+(define-public graph-genotyper
+  (let ((commit "e7cc6b43a5b1f389d76bf9aac7f2ee02f92caeaf") ; October 17, 2022
+        (revision "13"))
+    (package
+      (name "graph-genotyper")
+      (version (git-version "0.0.0" revision commit))
+      (source (origin
+        (method git-fetch)
+        (uri (git-reference
+               (url "https://github.com/davidebolo1993/graph_genotyper")
+               (commit commit)))
+        (file-name (git-file-name name version))
+        (sha256
+         (base32 "1l8yjpkqamiqr1q5i7vr5z04aba7skpbcwyc9dx5fiklvljjfhcx"))))
+      (build-system copy-build-system)
+      (arguments
+       `(#:install-plan
+         '(("genotype.py" "bin/")
+           ("genotype.sh" "bin/"))
+         #:phases
+         (modify-phases %standard-phases
+           (add-after 'install 'wrap-genotype
+             (lambda* (#:key inputs outputs #:allow-other-keys)
+               (let ((out (assoc-ref outputs "out")))
+                 (wrap-script (string-append out "/bin/genotype.sh")
+                  `("GUIX_PYTHONPATH" ":" prefix (,(getenv "GUIX_PYTHONPATH")))
+                  `("PATH" ":" prefix
+                    ,(map (lambda (file-name)
+                            (string-append (assoc-ref inputs file-name) "/bin"))
+                          (list "gafpack"
+                                "odgi"
+                                "python"
+                                "samtools"
+                                "vg"))))))))))
+      (inputs
+       (list gafpack
+             guile-3.0
+             odgi
+             python
+             python-numpy
+             python-pandas
+             python-scipy
+             samtools
+             vg))
+      (home-page "https://bitbucket.org/jana_ebler")
+      (synopsis "Genotyping based on k-mers and pangenome graphs")
+      (description
+       "This package provides a genotyper for various types of genetic variants
+(such as SNPs, indels and structural variants).  Genotypes are computed based on
+read k-mer counts and a panel of known haplotypes.  A description of the method
+can be found @url{https://www.biorxiv.org/content/10.1101/2020.11.11.378133v1,
+here}.")
+      (license (license:non-copyleft
+                 "No license listed")))))
+
 (define-public pangenie
   (let ((commit "e779076827022d1416ab9fabf99a03d8f4725956") ; September 2, 2021 from phasing-tests branch
         (revision "2"))
@@ -1490,20 +1790,9 @@ reads, also called read-based phasing or haplotype assembly.  It is especially
 suitable for long reads, but works also well with short reads.")
     (license license:expat)))
 
-(define-public python-pytest-runner-2
-  (package
-    (inherit python-pytest-runner)
-    (version "2.12.2")
-    (source (origin
-              (method url-fetch)
-              (uri (pypi-uri "pytest-runner" version))
-              (sha256
-               (base32
-                "11ivjj9hfphkv4yfb2g74av4yy86y8gcbf7gbif0p1hcdfnxg3w6"))))))
-
 (define-public bh20-seq-resource
-  (let ((commit "ae4cb3c2cf7103bbc84f52618bb755d7ce25775b")
-        (revision "3"))
+  (let ((commit "2ae71911cd87ce4f2eabdff21e538267b3270d45")
+        (revision "4"))
     (package
       (name "bh20-seq-resource")
       (version (git-version "1.0" revision commit))
@@ -1514,36 +1803,45 @@ suitable for long reads, but works also well with short reads.")
                        (commit commit)))
                 (file-name (git-file-name name version))
                 (sha256
-                 (base32 "1k0gsz4yc8l5znanzd094g2jp40ksbpa9667zr31ayrjx6labz02"))
+                 (base32 "1k6cc88hrcm77jwpdk2084q0zirv2vlbz3c07nmpbhk1lhqk5x0n"))
                 (modules '((guix build utils)))
                 (snippet
                  '(begin
-                    (substitute* "setup.py"
-                      (("py-dateutil") "python-dateutil"))
-                    #t))))
+                    (delete-file "gittaggers.py")))))
       (build-system python-build-system)
+      (arguments
+       (list
+         #:tests? #f    ; Tests can't find pytest
+         #:phases
+         #~(modify-phases %standard-phases
+             (add-after 'unpack 'patch-program-calls
+               (lambda* (#:key inputs #:allow-other-keys)
+                 (substitute* "bh20sequploader/qc_fasta.py"
+                   (("\"minimap2\"")
+                    (string-append "\"" (search-input-file
+                                          inputs "/bin/minimap2")
+                                   "\""))))))))
       (propagated-inputs
-       `(("python-arvados-python-client" ,python-arvados-python-client)
-         ("python-dateutil" ,python-dateutil)
-         ("python-flask" ,python-flask)
-         ("python-magic" ,python-magic)
-         ("python-pyyaml" ,python-pyyaml)
-         ("python-pycurl" ,python-pycurl)
-         ("python-pyshex" ,python-pyshex)
-         ("python-redis" ,python-redis)
-         ("python-ruaml.yaml" ,python38-ruaml.yaml-0.15.76)
-         ("clustalw" ,clustalw)
-         ("python-schema-salad" ,python-schema-salad)
-         ("python-twint" ,python-twint)
-         ;; and for the service
-         ("python" ,python)
-         ("gunicorn" ,gunicorn)))
+       (list python-arvados-python-client
+             python-schema-salad
+             python-magic
+             python-pyshex
+             python-pyshexc-0.7
+             python-py-dateutil
+
+             ;; for the web
+             python-flask
+             python-pyyaml
+             python-redis
+
+             ;; and for the service
+             python
+             gunicorn))
+      (inputs
+       (list minimap2))
       (native-inputs
-       `(("git" ,(@ (gnu packages version-control) git))
-         ("python-oauth2client" ,python-oauth2client)
-         ("python-pytest" ,python-pytest-4)
-         ("python-pytest-runner" ,python-pytest-runner-2)
-         ("python-uritemplate" ,python-uritemplate)))
+       (list python-pytest-4                ; < 6
+             python-pytest-runner-4))       ; < 5
       (home-page "https://github.com/pubseq/bh20-seq-resource")
       (synopsis
        "Tool to upload SARS-CoV-19 sequences and service to kick off analysis")
@@ -1553,6 +1851,18 @@ it to upload the genomes of SARS-CoV-2 samples to make them publicly and freely
 available to other researchers.")
       (license license:asl2.0))))
 
+;; This version has no profile collisions.
+(define-public bh20-seq-resource-for-service
+  (package
+    ;(inherit (fix-profile-collisions-for-bh20 bh20-seq-resource))
+    (inherit
+      ((package-input-rewriting/spec
+        `(("python-google-api-core" . ,(const python-google-api-core-1))
+          ("python-google-auth" . ,(const python-google-auth-1))
+          ("python-pyparsing" . ,(const python-pyparsing-2.4.7))))
+       bh20-seq-resource))
+    (properties `((hidden? . #t)))))
+
 (define-public python-scanpy-git
   (let ((commit "590d42309f9ed6550d7b887039990edfc1ac7648") ; April 22, 2020
         (revision "1"))
@@ -1593,6 +1903,56 @@ available to other researchers.")
                  (delete-file "scanpy/tests/test_pca.py")
                  #t)))))))))
 
+;; TODO: Unbundle everything
+(define-public odgi
+  (package
+    (name "odgi")
+    (version "0.8.1")
+    (source (origin
+              (method url-fetch)
+              (uri (string-append "https://github.com/pangenome/odgi/releases"
+                                  "/download/v" version
+                                  "/odgi-v" version ".tar.gz"))
+              (sha256
+               (base32 "175083pb9hp0vn9a00hbxlayyk5a5j8p52yq5qfmbnfvndisbmbv"))
+              (snippet
+               #~(begin
+                   (use-modules (guix build utils))
+                   (substitute* "CMakeLists.txt"
+                     (("-march=native") "")
+                     (("-msse4\\.2") ""))
+                   (delete-file-recursively "deps/pybind11")
+                   (delete-file-recursively "deps/sdsl-lite")))))
+    (build-system cmake-build-system)
+    (native-inputs
+     (list pkg-config))
+    (inputs
+     (list jemalloc
+           libdivsufsort
+           pybind11
+           python
+           sdsl-lite))
+    (home-page "https://github.com/vgteam/odgi")
+    (synopsis "Optimized Dynamic Genome/Graph Implementation")
+    (description "@acronym{Optimized Dynamic Genome/Graph Implementation, odgi}
+provides an efficient and succinct dynamic DNA sequence graph model, as well as
+a host of algorithms that allow the use of such graphs in bioinformatic
+analyses.
+
+Careful encoding of graph entities allows odgi to efficiently compute and
+transform pangenomes with minimal overheads.  @command{odgi} implements a
+dynamic data structure that leveraged multi-core CPUs and can be updated on the
+fly.
+
+The edges and path steps are recorded as deltas between the current node id and
+the target node id, where the node id corresponds to the rank in the global
+array of nodes.  Graphs built from biological data sets tend to have local
+partial order and, when sorted, the deltas be small.  This allows them to be
+compressed with a variable length integer representation, resulting in a small
+in-memory footprint at the cost of packing and unpacking.")
+    (properties '((tunable? . #t)))
+    (license license:expat)))
+
 (define-public vg
   (package
     (name "vg")
@@ -1826,7 +2186,7 @@ available to other researchers.")
     (inputs
      `(("boost" ,boost)
        ("cairo" ,cairo)
-       ("curl" ,curl-minimal)
+       ("curl" ,curl)
        ("elfutils" ,elfutils)
        ("fastahack" ,fastahack)
        ("htslib" ,htslib)
@@ -2055,7 +2415,7 @@ The Genome Browser itself does not draw conclusions; rather, it collates all
 relevant information in one location, leaving the exploration and interpretation
 to the user.")
     (license (list
-               license:bsd-0    ; kent/src/{utils,lib,inc,tabStorm,parasol,hg/ausoSql,hg/autoXml}
+               ;; license:bsd-0    ; kent/src/{utils,lib,inc,tabStorm,parasol,hg/ausoSql,hg/autoXml}
                license:bsd-3    ; these two for bundled htslib-1.3
                license:expat
                (license:non-copyleft