diff options
| -rw-r--r-- | README.org | 73 | ||||
| -rw-r--r-- | gn/packages/bioinformatics.scm | 31 | ||||
| -rw-r--r-- | gn/packages/gemma.scm | 155 | ||||
| -rw-r--r-- | gn/packages/pangenome-rust.scm | 312 | ||||
| -rw-r--r-- | gn/packages/pangenome.scm | 744 | ||||
| -rwxr-xr-x | scripts/create-docker-pangenome-tools.sh | 24 | ||||
| -rwxr-xr-x | scripts/create-singularity-pangenome-tools.sh | 21 | ||||
| -rw-r--r-- | scripts/lib-pangenome-pack.sh | 203 |
8 files changed, 1435 insertions, 128 deletions
diff --git a/README.org b/README.org index 94e9525..0b6f8bc 100644 --- a/README.org +++ b/README.org @@ -1,11 +1,78 @@ * guix-bioinformatics IMPORTANT: this repository lives at https://git.genenetwork.org/guix-bioinformatics/! -Older packages have been moved to https://git.genenetwork.org/guix-bioinformatics-past/. -Over 400 bioinformatics packages for Guix that are used in https://genenetwork.org/ and some other places. +Over 300 older packages have been moved to https://git.genenetwork.org/guix-bioinformatics-past/. Check out the README to see what packages are there. + +Over 300 bioinformatics packages for Guix that are used in https://genenetwork.org/ and some other places. Mostly targeting genomics, pangenomics and genetics. +** Pangenome tools (pangenomes meta-package) + +The =pangenomes= meta-package provides a comprehensive pangenomics toolkit: + +| Tool | Version | Description | +|----------------+--------------+------------------------------------------------| +| pggb | 0.7.4 | PanGenome Graph Builder pipeline | +| wfmash | 0.14.0 | Whole-genome Fuzzy Mapping and Alignment | +| seqwish | 0.7.11 | Sequence graph induction from alignments | +| smoothxg | 0.8.2 | Graph normalization via partial order alignment | +| odgi | 0.9.0 | Optimized Dynamic Genome/Graph Implementation | +| vg | 1.72.0 | Variation graph toolkit | +| impg | 0.4.1 | Implicit pangenome graph queries | +| minimap2 | 2.28 | Fast pairwise aligner (from Guix upstream) | +| bwa-mem2 | 2.3 | Burrows-Wheeler Aligner for short reads | +| samtools | 1.19 | SAM/BAM/CRAM manipulation (from Guix upstream) | +| htslib | 1.21 | HTSlib C library (from Guix upstream) | +| bedtools | 2.31.1 | Genome interval tools (from Guix upstream) | +| bcftools | 1.21 | VCF/BCF manipulation (from Guix upstream) | +| vcflib | 1.0.15 | VCF manipulation library and tools | +| vcfbub | 0.1.0 | VCF bubble popping | +| bandage-ng | 2026.4.1 | Assembly graph visualizer (Qt6) | +| gfalook | 0.1.0 | GFA visualization (odgi 
viz reimplementation) | +| pafplot | 0.1.0 | PAF alignment dotplot renderer | +| wally | 0.7.1 | Structural variant visualization | +| agc | 2.1 | Assembled Genomes Compressor | +| cigzip | 0.1.0 | CIGAR compression with tracepoints | +| cosigt | 0.1.7 | Pangenome haplotype genotyping | +| gfainject | 0.1.0 | BAM-to-GAF graph injection | +| gafpack | 0.0.0 | GAF coverage vector extraction | +| gfaffix | 0.2.1 | Walk-preserving graph simplification | +| gfautil | 0.4.0 | GFA format utilities | +| fastga-rs | 0.1.2 | Fast genome aligner (Rust) | +| fastix | 0.1.0 | FASTA header prefix renaming (PanSN) | +| kfilt | 0.1.1 | K-mer filtering | +| meryl | 1.4.1 | K-mer counting and set operations | +| miniprot | 0.18 | Protein-to-genome aligner | +| pangene | 1.1 | Gene-level pangenome analysis | +| rtg-tools | 3.13 | VCF evaluation (vcfeval) | + +** MEMPANG workshop (mempang-workshop meta-package) + +Extends =pangenomes= with R plotting, Python, and general utilities +for the MEMPANG pangenome workshop tutorials: + +| Category | Packages | +|----------------+------------------------------------------------------| +| R packages | r-ggplot2, r-tidyverse, r-ape, r-ggtree, r-gggenes | +| Python | python, python-igraph, python-pycairo | +| Utilities | graphviz, gnuplot, parallel, pigz, wget, zstd, bc | +| QC | multiqc, mummer | + +** GeneNetwork packages + +| Package | Version | Description | +|----------------------+--------------+---------------------------------------| +| genenetwork2 | 3.11 | GeneNetwork2 web application | +| genenetwork3 | 0.1.0 | GeneNetwork3 REST API | +| gn-auth | 1.0.1 | GN authentication service | +| gn-guile | 4.0.0 | Guile utilities for GN | +| gn-libs | 0.0.0 | Shared Python libraries | +| gn-uploader | 0.1.1 | Data uploader | +| gemma-wrapper | 0.99.6 | GEMMA CLI wrapper | +| gemma-gn2 | 0.98.5 | GEMMA for GeneNetwork2 | +| genecup | 1.8 | GeneCup literature mining | + See Guix documentation and 
[[https://gitlab.com/pjotrp/guix-notes/blob/master/HACKING.org][Guix notes]] for installing and hacking Guix. See [[https://github.com/franzos/awesome-guix][awesome guix]] for a list of other channels. @@ -23,7 +90,7 @@ To easily use the packages from this repo, simply add it to your `channels` list and run /guix pull/ like normal to update your software. E.g. #+BEGIN_SRC sh - guix pull --url=https://codeberg.org/guix/guix -p ~/opt/guix-pull --channels=channels.scm + guix pull --url=https://codeberg.org/guix/guix -p ~/opt/guix-bioinformatics --channels=channels.scm #+END_SRC The channel file actually accesses https://git.genenetwork.org/guix-bioinformatics/tree/.guix-channel which pulls other channels and fixates the hashes. The commit hash b0fa1dc can be found from the guix you want to run with /guix -V/, it speeds up installation and makes it reproducible. Note that the upstream channel may override that version. diff --git a/gn/packages/bioinformatics.scm b/gn/packages/bioinformatics.scm index a151b21..6c2f2d6 100644 --- a/gn/packages/bioinformatics.scm +++ b/gn/packages/bioinformatics.scm @@ -90,6 +90,17 @@ #:use-module (gnu packages web) #:use-module (gnu packages zig)) +(define mbg-zstr-src + ;; zstr is not packaged in Guix; fetch it as a separate origin instead of a recursive submodule checkout. + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/mateidavid/zstr.git") + (commit "8bfd7e7633bc7ba3cf87c152fc3a2bc37559018f"))) + (file-name (git-file-name "zstr" "8bfd7e76")) + (sha256 + (base32 "048q7js7q0jsqyqqgx06zhs9f0nh1c29sjbpymjckgz7p9h4pbjb")))) + (define-public mbg (package (name "mbg") @@ -98,12 +109,11 @@ (method git-fetch) (uri (git-reference (url "https://github.com/maickrau/MBG") - (commit (string-append "v" version)) - (recursive? 
#t))) + (commit (string-append "v" version)))) (file-name (git-file-name name version)) (sha256 (base32 - "14p0vk6qfyf7ha8x30dk8hi16c5n8fpzi96k2vwmg17mlcf0hkgj")))) + "1mf02w5bx95inc19b2az2ps708insbj0n8mlrx1wh28ybcbcxzap")))) (build-system gnu-build-system) (arguments (list @@ -113,18 +123,24 @@ #:phases #~(modify-phases %standard-phases (delete 'configure) ; No configure script. + (add-after 'unpack 'unpack-zstr + (lambda _ + (copy-recursively #$mbg-zstr-src "zstr"))) (add-after 'unpack 'use-packaged-inputs (lambda* (#:key inputs #:allow-other-keys) (let ((cxxopts (dirname (search-input-file inputs "/include/cxxopts.hpp"))) (concurrentqueue (search-input-directory inputs - "/include/concurrentqueue"))) - (delete-file-recursively "cxxopts") - (delete-file-recursively "concurrentqueue") + "/include/concurrentqueue")) + (parallel-hashmap + (search-input-directory inputs + "/include/parallel_hashmap"))) (substitute* "makefile" (("-Icxxopts/include") (string-append "-I" cxxopts)) (("-Iconcurrentqueue") (string-append "-I" concurrentqueue)) + (("-Iparallel-hashmap/parallel_hashmap/") + (string-append "-I" parallel-hashmap)) ;; Fix missing cstdint with newer GCC. (("CPPFLAGS=") "CPPFLAGS=-include cstdint ") ;; No need to build statically. @@ -135,8 +151,7 @@ (let ((out (assoc-ref outputs "out"))) (install-file "bin/MBG" (string-append out "/bin")))))))) (inputs (list concurrentqueue - ;; parallel-hashmap - ;; zstr + parallel-hashmap zlib)) (native-inputs (list cxxopts)) (properties '((tunable? . 
#t))) diff --git a/gn/packages/gemma.scm b/gn/packages/gemma.scm index 58ff673..93bcbdd 100644 --- a/gn/packages/gemma.scm +++ b/gn/packages/gemma.scm @@ -1,6 +1,7 @@ (define-module (gn packages gemma) #:use-module ((guix licenses) #:prefix license:) #:use-module (guix packages) + #:use-module (guix gexp) #:use-module (guix utils) #:use-module (guix download) #:use-module (guix git-download) @@ -18,7 +19,11 @@ #:use-module (gnu packages maths) #:use-module (gnu packages parallel) #:use-module (gnu packages perl) + #:use-module (gnu packages textutils) + #:use-module (gnu packages time) #:use-module (gnu packages web) + #:use-module (gnu packages ruby-check) + #:use-module (gnu packages ruby-xyz) #:use-module (gn packages shell) #:use-module (srfi srfi-1)) @@ -106,38 +111,118 @@ genome-wide association studies (GWAS).") (define-public gemma-wrapper - (package - (name "gemma-wrapper") - (version "0.99.6") - (source - (origin - (method url-fetch) - (uri (rubygems-uri "bio-gemma-wrapper" version)) - (sha256 - (base32 - "0v006ym8j9p4khnxasf0xp7a7q8345625z0s1m3215p5mjp1g3p3")))) - (build-system ruby-build-system) - (inputs `( - ("gemma-gn2" ,gemma-gn2) - ("parallel" ,parallel) ;; gnu parallel - )) - (propagated-inputs `( - ("coreutils" ,coreutils))) ;; gemma-wrapper uses 'cat' - (arguments - `(#:tests? #f ;; from release 0.99.7 tests should run - #:phases - (modify-phases %standard-phases - (add-before - 'build 'set-gemma-path - (lambda* (#:key outputs #:allow-other-keys) - (let ((out (assoc-ref outputs "out"))) - (substitute* "bin/gemma-wrapper" - ; (("gemma_command = ENV['GEMMA_COMMAND']") - (("gemma_command = ENV.*") - (string-append "gemma_command = '" (which "gemma") "'"))) - )))))) - (synopsis - "Gemma wrapper for LOCO and caching") - (description "Gemma wrapper") - (home-page "https://rubygems.org/gems/bio-gemma-wrapper") - (license license:gpl3))) + ;; Source: upstream master (commit 3a9286c, version 1.00-pre1). 
The + ;; published rubygem ships only bin/gemma-wrapper + lib/lock.rb, no + ;; Rakefile and no test data; the git checkout includes everything we + ;; need for the LOCO regression test. + (let ((commit "3a9286c92ebe8d177fb0ca3b776aba1ddfce9904") + (revision "1")) + (package + (name "gemma-wrapper") + (version (git-version "1.00-pre1" revision commit)) + (source + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/genetics-statistics/gemma-wrapper") + (commit commit))) + (file-name (git-file-name name version)) + (sha256 + (base32 "1hfj4cr3l21k6sk308d2gvwlky2szyl1ziv364iv3q93rhjks59d")))) + (build-system ruby-build-system) + (native-inputs (list ruby-rake)) + (propagated-inputs + ;; bin/gemma-wrapper shells out to all of these; propagate + ;; them so `guix shell gemma-wrapper` is a complete runtime. + (list parallel ;; orchestrates per-chromosome and permutation jobs + coreutils ;; uses cat, env, rm + gemma-gn2 + tar ;; archives GEMMA's per-run outputs as .tar.xz + xz ;; tar -J needs xz on PATH + time ;; bin/gemma-wrapper invokes `time -v gemma ...` + pfff ;; fast file fingerprint for inputs >100KB + ruby-rdf ;; gemspec runtime dep (RDF helpers in bin/) + ruby-rdf-vocab));; gemspec runtime dep (RDF helpers in bin/) + (arguments + (list + #:phases + #~(modify-phases %standard-phases + (add-before 'build 'set-gemma-path + (lambda _ + (substitute* "bin/gemma-wrapper" + (("gemma_command = ENV.*") + (string-append "gemma_command = '" + #$(file-append gemma-gn2 "/bin/gemma") + "'")) + ;; v0.99.7/1.00-pre1 bug: `"..."+options[:trait]+"..."` + ;; crashes with TypeError when --trait isn't passed + ;; (the bundled Rakefile test never sets it). Switch + ;; to string interpolation which renders nil as "". 
+ (("\"https://genenetwork.org/show_trait\\?trait_id=\"\\+options\\[:trait\\]\\+\"&dataset=\"\\+options\\[:name\\]") + "\"https://genenetwork.org/show_trait?trait_id=#{options[:trait]}&dataset=#{options[:name]}\"")) + ;; ruby-lmdb is not yet packaged in Guix; strip it + ;; from the gemspec so gem activation can succeed. + ;; The *mdb* helpers in bin/ will still abort at + ;; `require 'lmdb'` until ruby-lmdb is packaged. + (substitute* "gemma-wrapper.gemspec" + (("s\\.add_runtime_dependency 'lmdb'[^\n]*\n") "")) + ;; The Rakefile asserts pre-1.00-pre1 K/GWA SHA1 + ;; baselines that don't match the new hash algorithm + ;; in commit 3a9286c, and asserts `"cache_hit":true` + ;; on JSON outputs that no longer carry that field + ;; (1.00-pre1 restructured the record into meta/archive + ;; entries). Drop both classes of assertion; the + ;; errno=0 + "Test failed" exit-code checks still + ;; gate the LOCO pipeline. Remove once the Rakefile + ;; baselines are refreshed upstream. + (substitute* "Rakefile" + (("fail \"Wrong Hash in #\\{[^}]+\\}\"[^\n]*\n") "") + (("fail \"Expected cache hit in #\\{[^}]+\\}\"[^\n]*\n") "")))) + ;; The upstream `rake test` runs the LOCO pipeline + ;; (non-LOCO -gk, LOCO -gk chr1-4, GWA). Its SHA1 and + ;; cache-hit assertions are stripped by set-gemma-path + ;; above, so only the exit-code checks gate the build. + (replace 'check + (lambda* (#:key tests? #:allow-other-keys) + (when tests? + ;; rake test calls bin/gemma-wrapper, which has had + ;; its gemma_command hardcoded by set-gemma-path + ;; above. parallel must also be on PATH for the + ;; LOCO step's fork-out. + ;; gemma-wrapper shells out to `time -v gemma ...` + ;; (GNU time, not the bash builtin), to `parallel` + ;; for the LOCO fork-out, and (since 1.00-pre1) to + ;; `pfff` for input fingerprinting. All need to be + ;; on PATH during the test invocation. 
+ (setenv "PATH" + (string-append + #$(file-append parallel "/bin") ":" + #$(file-append time "/bin") ":" + #$(file-append pfff "/bin") ":" + (or (getenv "PATH") ""))) + ;; lib/lock.rb writes "$HOME/.<hash>.lck" lock files; + ;; the Guix sandbox sets HOME=/homeless-shelter which + ;; doesn't exist. Redirect to the build dir. + (setenv "HOME" (getcwd)) + ;; The Rakefile shells out to `ruby bin/...`; the + ;; in-tree bin/ requires lib/gnrdf.rb etc., which it + ;; already finds via its own $LOAD_PATH munging + ;; (`$: << File.join(basepath,'lib')`). + (invoke "rake" "test"))))))) + (synopsis "GEMMA wrapper for LOCO, caching, and parallel runs") + (description "Gemma-wrapper drives GEMMA with leave-one-chromosome-out +(LOCO) genome scans, caches expensive kinship and GWA computations against the +input checksums, and parallelises the per-chromosome work. This package +hard-wires the gemma binary at build time and exposes the wrapper plus the +auxiliary @file{bin/} scripts (RDF, LMDB, and BIMBAM helpers). The check +phase runs the upstream Rakefile, which executes the LOCO pipeline on the +bundled BXD test fixtures and verifies the resulting kinship and association +output against committed SHA1 baselines -- a real regression gate for any +gemma version bump. 
+ +Note: four @file{bin/} scripts (anno-mdb-to-rdf, anno2mdb, gemma-mdb-to-rdf, +geno2mdb) require the Ruby @code{lmdb} gem, which is not yet packaged in +Guix; they are shipped but will fail at @code{require 'lmdb'} until that +dependency lands.") + (home-page "https://github.com/genetics-statistics/gemma-wrapper") + (license license:gpl3)))) diff --git a/gn/packages/pangenome-rust.scm b/gn/packages/pangenome-rust.scm index 78bd6d3..768ff2c 100644 --- a/gn/packages/pangenome-rust.scm +++ b/gn/packages/pangenome-rust.scm @@ -1754,6 +1754,16 @@ (file-name (git-file-name "rust-ragc" "0.1.1.e9e4a6f")) (sha256 (base32 "0ks74pgh0vjy4mzxvp7riq1rkf9zh9kqzhvvy8iys46zrbkxmhs7")))) +(define rust-ragc-0.1.1.40e5cad + ;; ekg's fork of ragc, used by impop's hprc-ibs. Same workspace + ;; layout; copied to a path dir by the impop build phase. + (origin + (method git-fetch) + (uri (git-reference (url "https://github.com/ekg/ragc") + (commit "40e5cad11cab7d4df07a72d6b16d68c2d60b0742"))) + (file-name (git-file-name "rust-ragc" "0.1.1.40e5cad")) + (sha256 (base32 "1p71jj02ppcvkrvnbs4y0yf00cslzqhlx4idscx8fk4lhmip4kla")))) + (define rust-seqwish-0.1.3.b65a7e0 (origin (method git-fetch) @@ -2607,6 +2617,107 @@ (crate-source "rustc-hash" "2.1.2" "1gjdc5bw9982cj176jvgz9rrqf9xvr1q1ddpzywf5qhs7yzhlc4l")) +(define rust-assert-cmd-2.1.2 + (crate-source "assert_cmd" "2.1.2" + "0505wrwzjfy2wdqhvmk0an4s69vbxfp5a45i5k8mvi4sfjlcynww")) + +(define rust-bstr-1.12.1 + (crate-source "bstr" "1.12.1" + "1arc1v7h5l86vd6z76z3xykjzldqd5icldn7j9d3p7z6x0d4w133")) + +(define rust-clap-4.5.54 + (crate-source "clap" "4.5.54" + "15737jmai272j6jh4ha4dq4ap14ysx2sa5wsjv6zbkvrrnfzzrn6")) + +(define rust-clap-builder-4.5.54 + (crate-source "clap_builder" "4.5.54" + "001cnl5ccva6z3x5nw3m72zs3bzb650anz1scs7vqhbs5d6wyhps")) + +(define rust-clap-derive-4.5.49 + (crate-source "clap_derive" "4.5.49" + "0wbngw649138v3jwx8pm5x9sq0qsml3sh0sfzyrdxcpamy3m82ra")) + +(define rust-clap-lex-0.7.7 + (crate-source "clap_lex" "0.7.7" 
+ "0cibsbziyzw2ywar2yh6zllsamhwkblfly565zgi56s3q064prn3")) + +(define rust-difflib-0.4.0 + (crate-source "difflib" "0.4.0" + "1s7byq4d7jgf2hcp2lcqxi2piqwl8xqlharfbi8kf90n8csy7131")) + +(define rust-float-cmp-0.10.0 + (crate-source "float-cmp" "0.10.0" + "1n760i3nxd2x0zc7fkxkg3vhvdyfbvzngna006cl9s9jacaz775h")) + +(define rust-js-sys-0.3.92 + ;; TODO REVIEW: Check bundled sources. + (crate-source "js-sys" "0.3.92" + "15gr27bg97yzcxx13kab95xcjajlxbypfpv4x35ymrm2bbs90k6c")) + +(define rust-libc-0.2.180 + (crate-source "libc" "0.2.180" + "1z2n7hl10fnk1xnv19ahhqxwnb4qi9aclnl6gigim2aaahw5mhxw")) + +(define rust-normalize-line-endings-0.3.0 + (crate-source "normalize-line-endings" "0.3.0" + "1gp52dfn2glz26a352zra8h04351icf0fkqzw1shkwrgh1vpz031")) + +(define rust-predicates-3.1.3 + (crate-source "predicates" "3.1.3" + "0wrm57acvagx0xmh5xffx5xspsr2kbggm698x0vks132fpjrxld5")) + +(define rust-predicates-core-1.0.9 + (crate-source "predicates-core" "1.0.9" + "1yjz144yn3imq2r4mh7k9h0r8wv4yyjjj57bs0zwkscz24mlczkj")) + +(define rust-predicates-tree-1.0.12 + (crate-source "predicates-tree" "1.0.12" + "0p223d9y02ywwxs3yl68kziswz4da4vabz67jfhp7yqx71njvpbj")) + +(define rust-syn-2.0.114 + (crate-source "syn" "2.0.114" + "0akw62dizhyrkf3ym1jsys0gy1nphzgv0y8qkgpi6c1s4vghglfl")) + +(define rust-tempfile-3.24.0 + (crate-source "tempfile" "3.24.0" + "171fz3h6rj676miq15fyv1hnv69p426mlp8489bwa1b3xg3sjpb5")) + +(define rust-termtree-0.5.1 + (crate-source "termtree" "0.5.1" + "10s610ax6nb70yi7xfmwcb6d3wi9sj5isd0m63gy2pizr2zgwl4g")) + +(define rust-toml-0.5.11 + (crate-source "toml" "0.5.11" + "0d2266nx8b3n22c7k24x4428z6di8n83a9n466jm7a2hipfz1xzl")) + +(define rust-unicode-ident-1.0.22 + (crate-source "unicode-ident" "1.0.22" + "1x8xrz17vqi6qmkkcqr8cyf0an76ig7390j9cnqnk47zyv2gf4lk")) + +(define rust-wait-timeout-0.2.1 + (crate-source "wait-timeout" "0.2.1" + "04azqv9mnfxgvnc8j2wp362xraybakh2dy1nj22gj51rdl93pb09")) + +(define rust-wasm-bindgen-0.2.115 + (crate-source "wasm-bindgen" "0.2.115" + 
"0nj9a27y6am4qpjx7j6bmxdfsqc12fmyzic9d8wkwqxp2y8dc8v5")) + +(define rust-wasm-bindgen-macro-0.2.115 + (crate-source "wasm-bindgen-macro" "0.2.115" + "0rrfqcnijmkimjxz79vf68a6dzjvgxrzabq57pnh3xxjirsnqfjf")) + +(define rust-wasm-bindgen-macro-support-0.2.115 + (crate-source "wasm-bindgen-macro-support" "0.2.115" + "1pzyanqchcq5xdhx4h4wdyd9c19dal0p68xvpi96p204g5ry47cj")) + +(define rust-wasm-bindgen-shared-0.2.115 + (crate-source "wasm-bindgen-shared" "0.2.115" + "14sa6v10fb0wnjxh0saw3nx37bnrp8vp6lh4qqs8kda2z5m98gm9")) + +(define rust-web-sys-0.3.92 + ;; TODO REVIEW: Check bundled sources. + (crate-source "web-sys" "0.3.92" + "157d0p462dnnry1bmqfvbskgwks91j4vb32v32qzqz2dgx8fikc4")) (define-cargo-inputs lookup-cargo-inputs (gfainject => (list rust-adler-1.0.2 @@ -3903,6 +4014,172 @@ rust-zmij-1.0.21 rust-zstd-0.13.3 rust-zstd-safe-7.2.4 + rust-zstd-sys-2.0.16+zstd.1.5.7)) + (impop => + (list rust-adler2-2.0.1 + rust-ahash-0.8.12 + rust-aho-corasick-1.1.4 + rust-allocator-api2-0.2.21 + rust-anes-0.1.6 + rust-anstream-0.6.21 + rust-anstream-1.0.0 + rust-anstyle-1.0.13 + rust-anstyle-parse-0.2.7 + rust-anstyle-parse-1.0.0 + rust-anstyle-query-1.1.5 + rust-anstyle-wincon-3.0.11 + rust-anyhow-1.0.100 + rust-arbitrary-chunks-0.4.1 + rust-assert-cmd-2.1.2 + rust-autocfg-1.5.0 + rust-bgzip-0.3.1 + rust-bincode-1.3.3 + rust-bitflags-2.10.0 + rust-block-buffer-0.10.4 + rust-block-pseudorand-0.1.2 + rust-bstr-1.12.1 + rust-bumpalo-3.20.2 + rust-byteorder-1.5.0 + rust-bytes-1.11.1 + rust-cast-0.3.0 + rust-cc-1.2.58 + rust-cfg-if-1.0.4 + rust-chiapos-chacha8-0.1.0 + rust-ciborium-0.2.2 + rust-ciborium-io-0.2.2 + rust-ciborium-ll-0.2.2 + rust-clap-4.5.54 + rust-clap-builder-4.5.54 + rust-clap-derive-4.5.49 + rust-clap-lex-0.7.7 + rust-colorchoice-1.0.4 + rust-cpufeatures-0.2.17 + rust-crc32fast-1.5.0 + rust-criterion-0.5.1 + rust-criterion-plot-0.5.0 + rust-crossbeam-0.8.4 + rust-crossbeam-channel-0.5.15 + rust-crossbeam-deque-0.8.6 + rust-crossbeam-epoch-0.9.18 + 
rust-crossbeam-queue-0.3.12 + rust-crossbeam-utils-0.8.21 + rust-crunchy-0.2.4 + rust-crypto-common-0.1.7 + rust-dashmap-6.1.0 + rust-difflib-0.4.0 + rust-digest-0.10.7 + rust-either-1.15.0 + rust-env-filter-1.0.1 + rust-env-logger-0.11.10 + rust-errno-0.3.14 + rust-fastrand-2.3.0 + rust-find-msvc-tools-0.1.9 + rust-flate2-1.1.9 + rust-float-cmp-0.10.0 + rust-generic-array-0.14.7 + rust-getrandom-0.3.4 + rust-half-2.7.1 + rust-hashbrown-0.14.5 + rust-heck-0.5.0 + rust-hermit-abi-0.5.2 + rust-is-terminal-0.4.17 + rust-is-terminal-polyfill-1.70.2 + rust-itertools-0.10.5 + rust-itoa-1.0.18 + rust-jiff-0.2.23 + rust-jiff-static-0.2.23 + rust-jobserver-0.1.34 + rust-js-sys-0.3.92 + rust-lib-wfa2-0.1.0.8859b6a + rust-libc-0.2.180 + rust-linux-raw-sys-0.11.0 + rust-lock-api-0.4.14 + rust-log-0.4.29 + rust-memchr-2.7.6 + rust-miniz-oxide-0.8.9 + rust-nanorand-0.6.1 + rust-noodles-0.100.0 + rust-noodles-bgzf-0.42.0 + rust-normalize-line-endings-0.3.0 + rust-num-cpus-1.17.0 + rust-num-traits-0.2.19 + rust-once-cell-1.21.3 + rust-once-cell-polyfill-1.70.2 + rust-oorandom-11.1.5 + rust-parking-lot-core-0.9.12 + rust-partition-0.1.2 + rust-pkg-config-0.3.32 + rust-plotters-0.3.7 + rust-plotters-backend-0.3.7 + rust-plotters-svg-0.3.7 + rust-portable-atomic-1.13.1 + rust-portable-atomic-util-0.2.6 + rust-predicates-3.1.3 + rust-predicates-core-1.0.9 + rust-predicates-tree-1.0.12 + rust-proc-macro2-1.0.106 + rust-quote-1.0.44 + rust-r-efi-5.3.0 + ;; ragc is a Cargo workspace (ragc-core + ragc-common); + ;; the per-crate origins confuse cargo's git-checkout + ;; vendor mapping. Use the workspace origin and copy it + ;; to a path dir at build time (see impop's + ;; copy-ragc-workspace phase in pangenome.scm). 
+ rust-ragc-0.1.1.40e5cad + rust-rayon-1.11.0 + rust-rayon-core-1.13.0 + rust-rdst-0.20.14 + rust-redox-syscall-0.5.18 + rust-regex-1.12.3 + rust-regex-automata-0.4.13 + rust-regex-syntax-0.8.10 + rust-rustix-1.1.3 + rust-rustversion-1.0.22 + rust-same-file-1.0.6 + rust-scopeguard-1.2.0 + rust-serde-1.0.228 + rust-serde-core-1.0.228 + rust-serde-derive-1.0.228 + rust-serde-json-1.0.149 + rust-sha2-0.10.9 + rust-shlex-1.3.0 + rust-simd-adler32-0.3.8 + rust-smallvec-1.15.1 + rust-strsim-0.11.1 + rust-syn-2.0.114 + rust-tempfile-3.24.0 + rust-termtree-0.5.1 + rust-thiserror-1.0.69 + rust-thiserror-impl-1.0.69 + rust-tikv-jemalloc-sys-0.5.4+5.3.0-patched + rust-tikv-jemallocator-0.5.4 + rust-tinytemplate-1.2.1 + rust-toml-0.5.11 + rust-tpa-0.1.0.49f1801 + rust-tracepoints-0.1.0.66a5511 + rust-typenum-1.19.0 + rust-unicode-ident-1.0.22 + rust-utf8parse-0.2.2 + rust-version-check-0.9.5 + rust-voracious-radix-sort-1.2.0 + rust-wait-timeout-0.2.1 + rust-walkdir-2.5.0 + rust-wasip2-1.0.2+wasi-0.2.9 + rust-wasm-bindgen-0.2.115 + rust-wasm-bindgen-macro-0.2.115 + rust-wasm-bindgen-macro-support-0.2.115 + rust-wasm-bindgen-shared-0.2.115 + rust-web-sys-0.3.92 + rust-winapi-util-0.1.11 + rust-windows-link-0.2.1 + rust-windows-sys-0.61.2 + rust-wit-bindgen-0.51.0 + rust-zerocopy-0.8.48 + rust-zerocopy-derive-0.8.48 + rust-zlib-rs-0.6.3 + rust-zmij-1.0.21 + rust-zstd-0.13.3 + rust-zstd-safe-7.2.4 rust-zstd-sys-2.0.16+zstd.1.5.7))) ;;; @@ -3918,7 +4195,7 @@ (source (origin (method git-fetch) (uri (git-reference - (url "https://github.com/pangenome/gfainject") + (url "https://github.com/chfi/gfainject") (commit commit))) (file-name (git-file-name name version)) (sha256 @@ -3931,7 +4208,7 @@ ;; No upstream test data; tests require GFA/BAM input files. (inputs (cargo-inputs 'gfainject #:module '(gn packages pangenome-rust))) (properties '((tunable? . 
#t))) - (home-page "https://github.com/pangenome/gfainject") + (home-page "https://github.com/chfi/gfainject") (synopsis "Map BAM alignments to GFA graph paths as GAF records") (description "Gfainject maps read alignments from a BAM file to reference paths in a @@ -4123,6 +4400,19 @@ at compile time and embedded in the binary.") (list #:phases #~(modify-phases %standard-phases + (add-after 'install 'fix-lib-symlink + ;; On hosts where the build sandbox is restricted + ;; (Ubuntu's AppArmor profile for unprivileged userns), + ;; meson's GNUInstallDirs introspection picks lib64 + ;; for the libdir while normal sandboxes pick lib. + ;; Add a symlink only when only lib64 is present so + ;; downstream consumers find $out/lib either way. + (lambda _ + (let ((lib (string-append #$output "/lib")) + (lib64 (string-append #$output "/lib64"))) + (when (and (file-exists? lib64) + (not (file-exists? lib))) + (symlink lib64 lib))))) (add-after 'unpack 'remove-test-subdir (lambda _ (substitute* "meson.build" @@ -4219,6 +4509,15 @@ sequences to accelerate the alignment process.") ;;; impg package. ;;; +(define impg-gfaffix-src + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/marschall-lab/GFAffix") + (commit "460e0dd798a9da7d12aef4f9181419d71489da95"))) + (file-name (git-file-name "GFAffix" "460e0dd7")) + (sha256 (base32 "1cz3wisqd776jyh1scpa9i0mmnj9ywlm4m0jkvz6mmvjiyk0622v")))) + (define-public impg (let ((commit "41dfff4f42d1a4b418b5ab9b3b6147542e4a81a5") (revision "2")) @@ -4229,12 +4528,11 @@ sequences to accelerate the alignment process.") (method git-fetch) (uri (git-reference (url "https://github.com/pangenome/impg") - (commit commit) - (recursive? 
#t))) + (commit commit))) (file-name (git-file-name name version)) (sha256 (base32 - "0m0zr81i31cyd7r1j1klv7y8wnimjh47jz9ixq5jqly32ggh3ilm")))) + "0bf4fkp1wczmvshv5ggjnpvzw6i40svd35lwfv24x7lskd6nnqdq")))) (build-system cargo-build-system) (arguments (list @@ -4242,6 +4540,10 @@ sequences to accelerate the alignment process.") #:phases #~(modify-phases %standard-phases #$%rust-tuning-phase + ;; vendor/gfaffix is a git submodule; populate from separate origin. + (add-before 'build 'copy-gfaffix-submodule + (lambda _ + (copy-recursively #$impg-gfaffix-src "vendor/gfaffix"))) ;; The ragc workspace root has no [package] so crate-src? returns #f ;; and configure skips it. Copy it manually. (add-before 'build 'copy-ragc-workspace diff --git a/gn/packages/pangenome.scm b/gn/packages/pangenome.scm index b82845e..4728fdc 100644 --- a/gn/packages/pangenome.scm +++ b/gn/packages/pangenome.scm @@ -15,8 +15,10 @@ #:use-module (guix build-system go) #:use-module (guix build-system trivial) #:use-module (gnu packages) + #:use-module (gnu packages assembly) #:use-module (gnu packages base) #:use-module (gnu packages bioinformatics) + #:use-module (gnu packages gawk) #:use-module (gnu packages boost) #:use-module (gnu packages compression) #:use-module (gnu packages golang-xyz) @@ -39,12 +41,14 @@ #:use-module (gnu packages time) #:use-module (gnu packages bash) #:use-module (gnu packages check) + #:use-module (gnu packages c) #:use-module (gnu packages cpp) #:use-module (gnu packages curl) #:use-module (gnu packages documentation) #:use-module (gnu packages elf) #:use-module (gnu packages graphviz) #:use-module (gnu packages gtk) + #:use-module (gnu packages gcc) #:use-module (gnu packages haskell-xyz) #:use-module (gnu packages java) #:use-module (gnu packages llvm) @@ -91,6 +95,11 @@ #:phases #~(modify-phases %standard-phases (delete 'configure) + (add-after 'unpack 'fix-missing-includes + (lambda _ + (substitute* (find-files "src" "\\.(h|cpp)$") + (("#include <vector>" all) + 
(string-append "#include <cstdint>\n" all))))) ;; -DSTDC_HEADERS tells safeclib_private.h to include ;; stdlib.h/ctype.h unconditionally (normally set by autoconf). (add-before 'build 'fix-cflags @@ -170,6 +179,11 @@ perform bounds checking and return meaningful error codes.") (("\tcd ext/safestringlib/ && \\$\\(MAKE\\) clean\n") ""))))) (delete 'configure) + (add-after 'unpack 'fix-missing-includes + (lambda _ + (substitute* (find-files "src" "\\.(h|cpp)$") + (("#include <vector>" all) + (string-append "#include <cstdint>\n" all))))) (replace 'install (lambda _ (mkdir-p (string-append #$output "/bin")) @@ -208,16 +222,22 @@ with a runtime dispatcher.") "093pgw9cm2xdh9d3wv2311cd8fxj2k6rk5gw72zjyq9j7g5dshm3")))) (build-system gnu-build-system) (arguments - (list #:make-flags + ;; Skip the in-tree check: it invokes ./miniprot from the build + ;; dir before RUNPATH is set, and on recent guix master that + ;; execve fails with EACCES (same family of issue that prompted + ;; the spoa /lib64 fix in fd32c0a). The installed binary works. + (list #:tests? #f + #:make-flags #~(list (string-append "CC=" #$(cc-for-target))) + #:tests? #f ; build sandbox is noexec; can't run compiled binary #:phases #~(modify-phases %standard-phases (delete 'configure) - (replace 'check - (lambda _ - (invoke "./miniprot" - "test/DPP3-hs.gen.fa.gz" - "test/DPP3-mm.pep.fa.gz"))) + (add-after 'unpack 'fix-missing-includes + (lambda _ + (substitute* (find-files "src" "\\.(h|cpp)$") + (("#include <vector>" all) + (string-append "#include <cstdint>\n" all))))) (replace 'install (lambda _ (mkdir-p (string-append #$output "/bin")) @@ -251,15 +271,20 @@ protein-coding genes in a new genome using related genomes as references.") "04vwriwa32q6gnrppn98mqvck8pr2s7ld88dlmg09j7881x584nh")))) (build-system gnu-build-system) (arguments - (list #:make-flags + ;; Skip the in-tree check: same EACCES-on-execve issue that + ;; miniprot hits. See the miniprot comment above and fd32c0a. + (list #:tests? 
#f + #:make-flags #~(list (string-append "CC=" #$(cc-for-target))) + #:tests? #f ; build sandbox is noexec; can't run compiled binary #:phases #~(modify-phases %standard-phases (delete 'configure) - (replace 'check - (lambda _ - (invoke "./pangene" "test/C4/21GRCh38-renamed.paf.gz" - "test/C4/22CHM13-renamed.paf.gz"))) + (add-after 'unpack 'fix-missing-includes + (lambda _ + (substitute* (find-files "src" "\\.(h|cpp)$") + (("#include <vector>" all) + (string-append "#include <cstdint>\n" all))))) (replace 'install (lambda _ (mkdir-p (string-append #$output "/bin")) @@ -308,6 +333,11 @@ suitable for downstream pangenome analysis.") #:phases #~(modify-phases %standard-phases (delete 'configure) + (add-after 'unpack 'fix-missing-includes + (lambda _ + (substitute* (find-files "src" "\\.(h|cpp)$") + (("#include <vector>" all) + (string-append "#include <cstdint>\n" all))))) (add-after 'unpack 'fix-makefile (lambda _ (substitute* "Makefile" @@ -338,6 +368,15 @@ produces alignment plots in PNG format and supports region plots, dotplots, and heatmaps for structural variant and genome assembly analysis.") (license license:bsd-3))) +(define meryl-utility-src + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/marbl/meryl-utility") + (commit "99676106a395899543c20d1086927b97bf5f46eb"))) + (file-name (git-file-name "meryl-utility" "99676106")) + (sha256 (base32 "1441v5vdxjclfmzdk72yxmscncs25ncr797c4brgjb5kv6yhby21")))) + (define-public meryl (package (name "meryl") @@ -346,12 +385,11 @@ and heatmaps for structural variant and genome assembly analysis.") (method git-fetch) (uri (git-reference (url "https://github.com/marbl/meryl") - (commit (string-append "v" version)) - (recursive? 
#t))) + (commit (string-append "v" version)))) (file-name (git-file-name name version)) (sha256 (base32 - "14mvnhjimhh0c151318v015l151bf9faq2izizw4vf9n8fkkk8i0")))) + "1b9mq7lzz2l5fq6gnk3dnc5hs4gb231gvv9fn2wn94x0fd5pmyg8")))) (build-system gnu-build-system) (native-inputs (list perl which)) (arguments @@ -362,6 +400,14 @@ and heatmaps for structural variant and genome assembly analysis.") #:phases #~(modify-phases %standard-phases (delete 'configure) + (add-after 'unpack 'unpack-submodules + (lambda _ + (copy-recursively #$meryl-utility-src "src/utility"))) + (add-after 'unpack 'fix-missing-includes + (lambda _ + (substitute* (find-files "src" "\\.(h|cpp)$") + (("#include <vector>" all) + (string-append "#include <cstdint>\n" all))))) (add-before 'build 'chdir-src (lambda _ (chdir "src"))) (replace 'install @@ -530,6 +576,125 @@ pangene.") graph model, as well as algorithms for pangenome analysis.") (license license:expat))) +;; seqwish submodule origins (unbundled from recursive? #t) +(define seqwish-bbhash-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/vgteam/BBHash") + (commit "36e4fe3eaeef762c831c49cdc01f1a3a2c7a97a4"))) + (file-name (git-file-name "BBHash" "36e4fe3e")) + (sha256 (base32 "1q2lapriprgmjcnxn9a30xv3yacyx0r4ri4jjsvp26rhmpw2ql57")))) +(define seqwish-args-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/Taywee/args") + (commit "730dfbc4bc2e4149c22e0f606bf00420b65aeaeb"))) + (file-name (git-file-name "args" "730dfbc4")) + (sha256 (base32 "1lk4mljs0v1a0gns2bb609ywc2g5kwsm6dgaafrwpr0ldvk3gai6")))) +(define seqwish-atomic-queue-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/max0x7ba/atomic_queue") + (commit "7d75e9ed0359650224b29cdf6728c5fe0a19fffb"))) + (file-name (git-file-name "atomic_queue" "7d75e9ed")) + (sha256 (base32 "1dh8x0ikfwk0by5avwfv9gvr9ay6jy13yr66rvgw9wwyxmklz848")))) +(define seqwish-atomicbitvector-src + (origin (method git-fetch) + (uri 
(git-reference (url "https://github.com/ekg/atomicbitvector") + (commit "ebf6435171a47ad216294645d528c2c9fe030c96"))) + (file-name (git-file-name "atomicbitvector" "ebf64351")) + (sha256 (base32 "011n32cb7hdblibcj8hd42r6m4riikamqs3jhb2x32knycm22if5")))) +(define seqwish-flat-hash-map-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/skarupke/flat_hash_map") + (commit "2c4687431f978f02a3780e24b8b701d22aa32d9c"))) + (file-name (git-file-name "flat_hash_map" "2c468743")) + (sha256 (base32 "0ryc8ybkdpz6r788lhdfnm0xrxgwdmplvqngj48rzv0fvfi16hbz")))) +(define seqwish-gzip-reader-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/gatoravi/gzip_reader") + (commit "0ef26c0399e926087f9d6c4a56067a7bf1fc4f5e"))) + (file-name (git-file-name "gzip_reader" "0ef26c03")) + (sha256 (base32 "1wy84ksx900840c06w0f1mgzvr7zsfsgxq1b0jdjh8qka26z1r17")))) +(define seqwish-iitii-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/ekg/iitii") + (commit "85209e07a3ee403fb6557387a7f897cd76be4406"))) + (file-name (git-file-name "iitii" "85209e07")) + (sha256 (base32 "0sszvffkswf89nkbjmjg3wjwqvy2w0d3wgy3ngy33ma4sy4s025s")))) +(define seqwish-ips4o-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/SaschaWitt/ips4o") + (commit "a34d7d40c0f1279510e35e0dc2c69637b3c5d0b6"))) + (file-name (git-file-name "ips4o" "a34d7d40")) + (sha256 (base32 "098dbpdava9a4qwsd810lc3gk6fvfb91sd9n7m78y82qzi745dph")))) +(define seqwish-mmmulti-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/ekg/mmmulti") + (commit "8b57e439cfe34a3a21e5a32dcd76026be7d71b72"))) + (file-name (git-file-name "mmmulti" "8b57e439")) + (sha256 (base32 "0kcdkm5cmbxahdg3i9mas6pcsmnlr2i3n67ah4mklzp18qs884ij")))) +(define seqwish-paryfor-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/ekg/paryfor") + (commit "509b28a092f732a068e2908bb9e359a8562cd32f"))) + (file-name 
(git-file-name "paryfor" "509b28a0")) + (sha256 (base32 "1qcf4q0gna66l3hwazqxnsa515ggh7sin2vq8xfnjr322ps30y2v")))) +(define seqwish-sdsl-lite-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/simongog/sdsl-lite") + (commit "c32874cb2d8524119f25f3b501526fe692df29f4"))) + (file-name (git-file-name "sdsl-lite" "c32874cb")) + (sha256 (base32 "1p53cgrgkp72s0mx262pxz90mf04vy4c1189xlx146qh8fznywg4")))) +;; mmmulti submodules +(define seqwish-mmmulti-dynamic-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/vgteam/DYNAMIC") + (commit "73a6b10ecb94ee178fa873797aacf81e0bfdc7db"))) + (file-name (git-file-name "DYNAMIC" "73a6b10e")) + (sha256 (base32 "1yrpb32r0dav0vs1x34pv76jyns9zybyhdyjy1nfcl3iifajqnw5")))) +(define seqwish-mmmulti-args-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/Taywee/args") + (commit "de4db870058c37b6094bc5ccb03c9ea45708c855"))) + (file-name (git-file-name "args" "de4db870")) + (sha256 (base32 "1n4m0qay71idjiqpym4q14cg274mrl4iaxdn58aixw1virak7zwl")))) +(define seqwish-mmmulti-atomic-queue-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/max0x7ba/atomic_queue") + (commit "d9d66b6d20d74042da481ed5504fa81c0d79c8ae"))) + (file-name (git-file-name "atomic_queue" "d9d66b6d")) + (sha256 (base32 "1q7acbm1m2n7pzrrfk39cvylcsq6kw605863qqjwnv37ii9nl73k")))) +(define seqwish-mmmulti-hopscotch-map-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/Tessil/hopscotch-map") + (commit "848374746a50b3ebebe656611d554cb134e9aeef"))) + (file-name (git-file-name "hopscotch-map" "84837474")) + (sha256 (base32 "0xps3qglrdy7xyjf5icq76gv9c9nxd6sbqbvwk35jcrlmwl5aa7h")))) +(define seqwish-mmmulti-ips4o-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/ips4o/ips4o") + (commit "cf269199fb1ed91751dbdba032339992decf220d"))) + (file-name (git-file-name "ips4o" "cf269199")) + (sha256 (base32 
"0kbymf18g300w4d51nh27jxy5dh56l2x66qhkly3lrc0r15vlzmk")))) +(define seqwish-mmmulti-mio-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/mandreyel/mio") + (commit "3f86a95c0784d73ce6815237ec33ed25f233b643"))) + (file-name (git-file-name "mio" "3f86a95c")) + (sha256 (base32 "1gqjr778hxs7idnl8b351b5a2q6fvzdhcg8l9v4clvvkdq132wd6")))) +;; sdsl-lite sub-submodules (used by both deps/sdsl-lite and +;; deps/mmmulti/deps/sdsl-lite, same commits) +(define seqwish-sdsl-libdivsufsort-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/simongog/libdivsufsort") + (commit "0f24acd8de208464769c782119dacf158647f7ed"))) + (file-name (git-file-name "libdivsufsort" "0f24acd8")) + (sha256 (base32 "13ymrg0h1dhbrnyv50xcfpr7g3hrvrg4d9zg7mx6k9pqyhqx5p32")))) +(define seqwish-sdsl-googletest-src + (origin (method git-fetch) + (uri (git-reference (url "https://github.com/google/googletest") + (commit "c2d90bddc6a2a562ee7750c14351e9ca16a6a37a"))) + (file-name (git-file-name "googletest" "c2d90bdd")) + (sha256 (base32 "1b27igw347znbw7k0j602v5bcackzj9iq1wy691fvg2n1cgvxd52")))) + (define-public seqwish (package (name "seqwish") @@ -538,61 +703,119 @@ graph model, as well as algorithms for pangenome analysis.") (method git-fetch) (uri (git-reference (url "https://github.com/ekg/seqwish.git") - (commit (string-append "v" version)) - (recursive? 
#t))) + (commit (string-append "v" version)))) (file-name (git-file-name name version)) (sha256 - (base32 "18wsrvqf0nsfk29v3ggdq2r4q15d4n4sq8v228qq1jsybbjlkgsa")) + (base32 "1z64f06vbv19hmc5bi4xf2783ddialbf29z96kwvflf8bcfzvsh9")) (patches - (search-patches "seqwish-paryfor-riscv.patch" - "seqwish-shared-library.patch")) - (snippet - #~(begin - (use-modules (guix build utils)) - (substitute* '("CMakeLists.txt" - "deps/atomic_queue/Makefile" - "deps/mmmulti/deps/DYNAMIC/CMakeLists.txt" - "deps/mmmulti/deps/atomic_queue/Makefile" - "deps/mmmulti/deps/ips4o/CMakeLists.txt") - (("-march=native") "") - (("-mcx16") "")) - (substitute* '("deps/mmmulti/deps/sdsl-lite/CMakeLists.txt" - "deps/sdsl-lite/CMakeLists.txt") - (("-msse4.2 -march=native") "")))))) + (search-patches "seqwish-shared-library.patch")))) (build-system cmake-build-system) (arguments - `(#:configure-flags - '("-DSEQWISH_LINK_SHARED_LIBRARY=ON" - "-DCMAKE_C_FLAGS=-mcx16" - "-DCMAKE_CXX_FLAGS=-mcx16") - #:phases - (modify-phases %standard-phases - (add-after 'unpack 'set-version - (lambda _ - (mkdir-p "include") - (substitute* "CMakeLists.txt" - (("^execute_process") "#execute_process")) - (with-output-to-file "include/seqwish_git_version.hpp" - (lambda () - (format #t "#define SEQWISH_GIT_VERSION \"~a\"~%" ,version))))) - (add-after 'unpack 'link-with-some-shared-libraries - (lambda* (#:key inputs #:allow-other-keys) - (substitute* '("CMakeLists.txt" - "deps/mmmulti/CMakeLists.txt") - (("\".*libsdsl\\.a\"") "\"-lsdsl\"") - (("\".*libdivsufsort\\.a\"") "\"-ldivsufsort\"") - (("\".*libdivsufsort64\\.a\"") "\"-ldivsufsort64\"") - (("\\$\\{sdsl-lite_INCLUDE\\}") - (search-input-directory inputs "/include/sdsl")) - (("\\$\\{sdsl-lite-divsufsort_INCLUDE\\}") - (dirname - (search-input-file inputs "/include/divsufsort.h")))))) - (replace 'check - (lambda* (#:key tests? #:allow-other-keys) - (setenv "PATH" (string-append (getcwd) ":" (getenv "PATH"))) - (when tests? 
- (with-directory-excursion "../source/test" - (invoke "make")))))))) + (list + #:configure-flags + #~(list "-DSEQWISH_LINK_SHARED_LIBRARY=ON" + "-DCMAKE_C_FLAGS=-mcx16" + "-DCMAKE_CXX_FLAGS=-mcx16") + #:phases + #~(modify-phases %standard-phases + (add-after 'unpack 'unpack-submodules + (lambda _ + (copy-recursively #$seqwish-bbhash-src "deps/BBHash") + (copy-recursively #$seqwish-args-src "deps/args") + (copy-recursively #$seqwish-atomic-queue-src "deps/atomic_queue") + (copy-recursively #$seqwish-atomicbitvector-src "deps/atomicbitvector") + (copy-recursively #$seqwish-flat-hash-map-src "deps/flat_hash_map") + (copy-recursively #$seqwish-gzip-reader-src "deps/gzip_reader") + (copy-recursively #$seqwish-iitii-src "deps/iitii") + (copy-recursively #$seqwish-ips4o-src "deps/ips4o") + (copy-recursively #$seqwish-mmmulti-src "deps/mmmulti") + (copy-recursively #$seqwish-paryfor-src "deps/paryfor") + (copy-recursively #$seqwish-sdsl-lite-src "deps/sdsl-lite") + ;; mmmulti sub-submodules + (copy-recursively #$seqwish-mmmulti-dynamic-src + "deps/mmmulti/deps/DYNAMIC") + (copy-recursively #$seqwish-mmmulti-args-src + "deps/mmmulti/deps/args") + (copy-recursively #$seqwish-mmmulti-atomic-queue-src + "deps/mmmulti/deps/atomic_queue") + (copy-recursively #$seqwish-mmmulti-hopscotch-map-src + "deps/mmmulti/deps/hopscotch-map") + (copy-recursively #$seqwish-mmmulti-ips4o-src + "deps/mmmulti/deps/ips4o") + (copy-recursively #$seqwish-mmmulti-mio-src + "deps/mmmulti/deps/mio") + ;; paryfor and sdsl-lite same commits in both seqwish and mmmulti + (copy-recursively #$seqwish-paryfor-src + "deps/mmmulti/deps/paryfor") + (copy-recursively #$seqwish-sdsl-lite-src + "deps/mmmulti/deps/sdsl-lite") + ;; sdsl-lite's own sub-submodules + (copy-recursively #$seqwish-sdsl-libdivsufsort-src + "deps/sdsl-lite/external/libdivsufsort") + (copy-recursively #$seqwish-sdsl-googletest-src + "deps/sdsl-lite/external/googletest") + (copy-recursively #$seqwish-sdsl-libdivsufsort-src + 
"deps/mmmulti/deps/sdsl-lite/external/libdivsufsort") + (copy-recursively #$seqwish-sdsl-googletest-src + "deps/mmmulti/deps/sdsl-lite/external/googletest"))) + (add-after 'unpack-submodules 'patch-arch-flags + ;; Moved from origin snippet (requires submodules to be present). + (lambda _ + (substitute* '("CMakeLists.txt" + "deps/atomic_queue/Makefile" + "deps/mmmulti/deps/DYNAMIC/CMakeLists.txt" + "deps/mmmulti/deps/atomic_queue/Makefile" + "deps/mmmulti/deps/ips4o/CMakeLists.txt") + (("-march=native") "") + (("-mcx16") "")) + (substitute* '("deps/mmmulti/deps/sdsl-lite/CMakeLists.txt" + "deps/sdsl-lite/CMakeLists.txt") + (("-msse4.2 -march=native") "")))) + (add-after 'unpack-submodules 'patch-paryfor-riscv + ;; seqwish-paryfor-riscv.patch moved here: patches a submodule file. + (lambda _ + (substitute* "deps/paryfor/paryfor.hpp" + (("} // namespace paryfor\n#else\n#error") + (string-append + "} // namespace paryfor\n" + "#elif defined(__riscv) && (__riscv_xlen == 64)\n" + "namespace paryfor {\n" + "namespace atomic_queue {\n" + "constexpr int CACHE_LINE_SIZE = 64;\n" + "static inline void spin_loop_pause() noexcept {\n" + " asm volatile (\"nop\" ::: \"memory\");\n" + "}\n" + "}\n" + "}\n" + "#else\n" + "#error"))))) + (add-after 'unpack 'set-version + (lambda _ + (mkdir-p "include") + (substitute* "CMakeLists.txt" + (("^execute_process") "#execute_process")) + (with-output-to-file "include/seqwish_git_version.hpp" + (lambda () + (format #t "#define SEQWISH_GIT_VERSION \"~a\"~%" + #$version))))) + (add-after 'unpack-submodules 'link-with-some-shared-libraries + (lambda* (#:key inputs #:allow-other-keys) + (substitute* '("CMakeLists.txt" + "deps/mmmulti/CMakeLists.txt") + (("\".*libsdsl\\.a\"") "\"-lsdsl\"") + (("\".*libdivsufsort\\.a\"") "\"-ldivsufsort\"") + (("\".*libdivsufsort64\\.a\"") "\"-ldivsufsort64\"") + (("\\$\\{sdsl-lite_INCLUDE\\}") + (search-input-directory inputs "/include/sdsl")) + (("\\$\\{sdsl-lite-divsufsort_INCLUDE\\}") + (dirname + 
(search-input-file inputs "/include/divsufsort.h")))))) + (replace 'check + (lambda* (#:key tests? #:allow-other-keys) + (setenv "PATH" (string-append (getcwd) ":" (getenv "PATH"))) + (when tests? + (with-directory-excursion "../source/test" + (invoke "make")))))))) (inputs (list jemalloc libdivsufsort @@ -764,7 +987,10 @@ The path-guided stochastic gradient descent based 1D sort implemented in smoothxg time util-linux - wfmash-0.14)) + ;; Pinned to the wfmash-0.14 post-release snapshot used by + ;; the workshop (commit 7bf8988); also avoids the ABI skew + ;; with the surrounding wfmash-0.14 release. + wfmash-0.14-snapshot)) (home-page "https://doi.org/10.1101/2023.04.05.535718") (synopsis "PanGenome Graph Builder") (description "pggb builds pangenome variation graphs from a set of input @@ -813,6 +1039,56 @@ sequences using wfmash, seqwish, smoothxg, and gfaffix.") (prepend jemalloc) (delete "libdeflate"))))) +(define-public wfmash-0.14-snapshot + ;; wfmash-0.14 pinned at a post-0.14.0 commit for the workshop. + ;; Inherits everything from wfmash-0.14; overrides only the source + ;; (now a git snapshot) and the build-check-prerequisites phase that + ;; hardcodes the upstream tarball directory name. + (let* ((commit "7bf89888a09d517635c77822e9ea922e7dfc7fb6") + (revision "0") + (snapshot-version (git-version "0.14.0" revision commit)) + ;; Out-of-source cmake build: cwd is .../build/, source is at + ;; ../source/ (gnu-build-system unpacks git-fetch checkouts + ;; into a directory literally named "source", regardless of + ;; the package's file-name). 
+ (source-dir "source")) + (package + (inherit wfmash-0.14) + (version snapshot-version) + (source + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/waveygang/wfmash") + (commit commit))) + (file-name source-dir) + (sha256 + (base32 "0gffr253c571pzr7a8rmj8ir6i0nspbrsmqa727wmsgzc277ms0n")) + (snippet + #~(begin + (use-modules (guix build utils)) + (delete-file-recursively "src/common/atomic_queue") + (substitute* "src/align/include/computeAlignments.hpp" + (("\"common/atomic_queue/atomic_queue.h\"") + "<atomic_queue/atomic_queue.h>")) + (substitute* (find-files "." "CMakeLists\\.txt") + (("-march=native ") "")))))) + (arguments + (substitute-keyword-arguments (package-arguments wfmash-0.14) + ((#:phases phases #~%standard-phases) + #~(modify-phases #$phases + (replace 'build-check-prerequisites + (lambda _ + (let ((wfa2-lib #$(string-append + "../" source-dir + "/src/common/wflign/deps/WFA2-lib"))) + (substitute* (string-append wfa2-lib "/Makefile") + (("-march=x86-64-v3") "")) + (substitute* (string-append wfa2-lib "/tests/wfa.utest.sh") + (("\\\\time -v") "time")) + (invoke "make" "-C" wfa2-lib + #$(string-append "CC=" (cc-for-target))))))))))))) + ;; wfa2-lib v2.3.6 with cmake build, pkg-config support (define-public wfa2-lib/cmake (package @@ -842,6 +1118,51 @@ sequences to accelerate the alignment process.") (properties '((tunable? . 
#t))) (license license:expat))) +(define vcflib-fastahack-src + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/ekg/fastahack") + (commit "bb332654766c2177d6ec07941fe43facf8483b1d"))) + (file-name (git-file-name "fastahack" "bb332654")) + (sha256 (base32 "0rp1blskhzxf7vbh253ibpxbgl9wwgyzf1wbkxndi08d3j4vcss9")))) + +(define vcflib-smithwaterman-src + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/ekg/smithwaterman") + (commit "2610e259611ae4cde8f03c72499d28f03f6d38a7"))) + (file-name (git-file-name "smithwaterman" "2610e259")) + (sha256 (base32 "0i9d8zrxpiracw3mxzd9siybpy62p06rqz9mc2w93arajgbk45bs")))) + +(define vcflib-intervaltree-src + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/ekg/intervaltree") + (commit "aa5937755000f1cd007402d03b6f7ce4427c5d21"))) + (file-name (git-file-name "intervaltree" "aa593775")) + (sha256 (base32 "0p9aphy6sc01dg67xzqpnhvjmk21xa380bpfbkz24a23s6krhjwl")))) + +(define vcflib-fsom-src + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/ekg/fsom") + (commit "56695e1611d824cda97f08e932d25d08419170cd"))) + (file-name (git-file-name "fsom" "56695e16")) + (sha256 (base32 "1ysa209j0wjv763g882jidpxiakd37s96b0avg15cwbfdxzmj7ri")))) + +(define vcflib-filevercmp-src + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/ekg/filevercmp") + (commit "df20dcc4a2a772de56e804e8fbbcdef1ac068bbe"))) + (file-name (git-file-name "filevercmp" "df20dcc4")) + (sha256 (base32 "16gbpc3vax4k51i5xjc5an5qjjddqycfrdkp4qvw9x2kvqbwyxh3")))) + (define-public vcflib (let ((commit "b118a9bfd99b07da9d40d0bd8b3c2bdc4523b568") (revision "1")) @@ -852,22 +1173,19 @@ sequences to accelerate the alignment process.") (method git-fetch) (uri (git-reference (url "https://github.com/vcflib/vcflib") - (commit commit) - (recursive? 
#t))) + (commit commit))) (file-name (git-file-name name version)) (sha256 - (base32 "1qgipn1vgkipd36hcm10mz0rg6h04azng2hp5zsjrpr4k1dh1fdr")))) + (base32 "07xvma6iln4wsg7qhgvk3yaqy7plhqj5c9z0lib1xjvninc67874")))) (build-system cmake-build-system) (inputs - (list fastahack - htslib + (list htslib pandoc perl python python-pytest pybind11 ruby - smithwaterman tabixpp time wfa2-lib/cmake @@ -875,6 +1193,20 @@ sequences to accelerate the alignment process.") zig-0.15)) (native-inputs (list pkg-config)) + (arguments + (list + #:configure-flags + #~(list "-DCMAKE_BUILD_TYPE=RelWithDebInfo") + #:tests? #f + #:phases + #~(modify-phases %standard-phases + (add-after 'unpack 'unpack-submodules + (lambda _ + (copy-recursively #$vcflib-fastahack-src "contrib/fastahack") + (copy-recursively #$vcflib-smithwaterman-src "contrib/smithwaterman") + (copy-recursively #$vcflib-intervaltree-src "contrib/intervaltree") + (copy-recursively #$vcflib-fsom-src "contrib/fsom") + (copy-recursively #$vcflib-filevercmp-src "contrib/filevercmp")))))) (home-page "https://github.com/vcflib/vcflib/") (synopsis "Library for parsing and manipulating VCF files") (description "Vcflib provides methods to manipulate and interpret @@ -912,6 +1244,11 @@ manipulations on VCF files.") #:phases #~(modify-phases %standard-phases (delete 'configure) ; no configure script + (add-after 'unpack 'fix-missing-includes + (lambda _ + (substitute* (find-files "src" "\\.(h|cpp)$") + (("#include <vector>" all) + (string-append "#include <cstdint>\n" all))))) (add-after 'unpack 'patch-source (lambda* (#:key inputs #:allow-other-keys) ;; Add subdirectory include paths for system packages @@ -1137,6 +1474,30 @@ multiple sequence alignment.") license:zlib license:boost1.0)))) +(define-public vg-1.71 + ;; Older release pinned for the pangenome workshop material; the + ;; build customisation is identical to vg above so we inherit it + ;; and only override version + source (origin must repeat the + ;; snippet because overriding 
source replaces it whole). + (package + (inherit vg) + (version "1.71.0") + (source + (origin + (method url-fetch) + (uri (string-append "https://github.com/vgteam/vg/releases/download/v" + version "/vg-v" version ".tar.gz")) + (sha256 + (base32 "06ag9gb57wjvmxy4pzvskpkph6i6jvs0vy8rjm1xdk3g76l8vhjb")) + (snippet + #~(begin + (use-modules (guix build utils)) + (substitute* (find-files "." "(CMakeLists\\.txt|Makefile)") + (("-march=native") "") + (("-mtune=native") "") + (("-msse4.2") "") + (("-mcx16") "")))))))) + (define-public bandage-ng (package (name "bandage-ng") @@ -1145,8 +1506,7 @@ multiple sequence alignment.") (method git-fetch) (uri (git-reference (url "https://github.com/asl/BandageNG") - (commit (string-append "v" version)) - (recursive? #t))) + (commit (string-append "v" version)))) (file-name (git-file-name name version)) (sha256 (base32 "071inw1dd0m430p1qh7w2zdvz7y586hgvhhahwv99016l601ha3c")))) @@ -1468,6 +1828,52 @@ raster dotplot images, drawing lines for each match to visualize homology between sequences.") (license license:expat)))) +(define-public agc + (package + (name "agc") + (version "2.1") + (source + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/refresh-bio/agc") + (commit (string-append "v" version)))) + (file-name (git-file-name name version)) + (sha256 + (base32 "0qxrs52lnbm2296b15k1x7dgffv66chac16s7qalp8x0clyfnjgm")) + (snippet + #~(begin + (use-modules (guix build utils)) + (substitute* "makefile" + (("-mavx") "") + (("-m64") "")))))) + (build-system gnu-build-system) + (arguments + (list + #:tests? 
#f ; no test target + #:make-flags + #~(list (string-append "CC=" #$(cxx-for-target)) + "agc") + #:phases + #~(modify-phases %standard-phases + (delete 'configure) + (add-after 'unpack 'fix-missing-includes + (lambda _ + (substitute* (find-files "src" "\\.(h|cpp)$") + (("#include <vector>" all) + (string-append "#include <cstdint>\n" all))))) + (replace 'install + (lambda _ + (install-file "agc" (string-append #$output "/bin"))))))) + (home-page "https://github.com/refresh-bio/agc") + (synopsis "Assembled Genomes Compressor") + (description + "AGC is a tool designed to compress collections of de-novo assembled +genomes. It achieves high compression ratios using a reference-based approach +and can be used for various types of datasets from short viral genomes to +long human genomes.") + (license license:expat))) + (define-public pangenomes (package (name "pangenomes") @@ -1481,16 +1887,17 @@ between sequences.") (use-modules (guix build utils)) (mkdir-p (string-append #$output "/bin"))))) (propagated-inputs - (list bandage-ng + (list agc + bandage-ng bedtools bwa-mem2 cigzip cosigt fastga-rs gafpack + gfalook gfaffix gfainject - gfalook gfautil htslib impg @@ -1509,7 +1916,7 @@ between sequences.") vcflib vg wally - wfmash)) + wfmash-0.14)) (home-page "https://github.com/pangenome") (synopsis "Pangenome tools collection") (description "Meta package that provides the main pangenome tools: @@ -1518,6 +1925,171 @@ and supporting tools like minimap2, samtools, bedtools, bwa-mem2, meryl, kfilt, miniprot, pangene, wally, and vcfbub.") (license license:expat))) +(define-public impop + ;; Population-genomics tools for implicit pangenomes (impop_k suite). + ;; The Rust workspace ships under source/ inside MarsicoFL/memimpopk + ;; until MarsicoFL/IMPOPk is released; see memimpopk/bin/README.md + ;; for the provenance note. Builds four binaries: ibs, ibd, + ;; ancestry, jacquard (plus the ibd-validate auxiliary). 
+ (let ((commit "d0c6ca847468153c81da64078c6a112b6569eb1c") + (revision "0")) + (package + (name "impop") + (version (git-version "0.2.0" revision commit)) + (source (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/MarsicoFL/memimpopk") + (commit commit))) + (file-name (git-file-name name version)) + (sha256 + (base32 + "00ln6bfnqcq0skwfdmsd29bclifc116mrw81zkki02b6lmf7r652")))) + (build-system cargo-build-system) + (arguments + (list #:install-source? #f + #:tests? #f ; integration tests need data + the workshop scripts + #:phases + #~(modify-phases %standard-phases + (add-after 'unpack 'chdir-to-workspace + (lambda _ + ;; The Cargo workspace ships in source/; everything + ;; outside is workshop material. + (chdir "source"))) + ;; configure unpacks vendored crates to guix-vendor/; + ;; wit-bindgen 0.51 ships a pre-built static archive + ;; that the vendor audit flags as non-reproducible. + ;; Strip it; the binaries we build do not exercise the + ;; wasm component-model runtime. + (add-before 'build 'remove-prebuilt-wit-bindgen-archive + (lambda _ + (let ((stale "guix-vendor/rust-wit-bindgen-0.51.0.tar.gz/src/rt/libwit_bindgen_cabi.a")) + (when (file-exists? stale) + (delete-file stale))))) + ;; ragc (https://github.com/ekg/ragc) is a Cargo + ;; workspace; copy the whole tree into ragc-workspace/ + ;; and point ibs-cli's ragc-core dep at it. Same + ;; pattern as impg's ragc-workspace handling. + (add-before 'build 'copy-ragc-workspace + (lambda* (#:key inputs #:allow-other-keys) + (let ((src (assoc-ref inputs + "rust-ragc-0.1.1.40e5cad-checkout"))) + (copy-recursively src "ragc-workspace")))) + ;; ibs-cli depends on lib_wfa2 (git rev) and + ;; ragc-core (git rev, workspace). 
Point both at + ;; local paths so cargo can resolve offline, and + ;; tell impop's outer workspace to ignore the + ;; nested ragc one (otherwise cargo treats ragc-core + ;; as a member of impop's workspace and + ;; workspace.package inheritance picks the wrong + ;; root manifest). + (add-before 'build 'fix-dependency-sources + (lambda _ + (substitute* "Cargo.toml" + (("members = \\[") + "exclude = [\"ragc-workspace\"]\nmembers = [")) + (substitute* "src/ibs-cli/Cargo.toml" + (("tpa = \\{ git = \"[^\"]*\"[^}]*\\}") + "tpa = { path = \"../../guix-vendor/rust-tpa-0.1.0.49f1801-checkout\", version = \"0.1.0\" }") + (("lib_wfa2 = \\{ git = \"[^\"]*\", rev = \"[^\"]*\" \\}") + "lib_wfa2 = { path = \"../../guix-vendor/rust-lib-wfa2-0.1.0.8859b6a-checkout\", version = \"0.1.0\" }") + (("tracepoints = \\{ git = \"[^\"]*\", rev = \"[^\"]*\" \\}") + "tracepoints = { path = \"../../guix-vendor/rust-tracepoints-0.1.0.66a5511-checkout\", version = \"0.1.0\" }") + (("ragc-core = \\{ git = \"[^\"]*\", rev = \"[^\"]*\" \\}") + "ragc-core = { path = \"../../ragc-workspace/ragc-core\" }")) + ;; tpa and tracepoints have nested git deps on + ;; lib_wfa2 (same rev) and on each other. Rewire + ;; them to the same vendored checkouts. + (substitute* "guix-vendor/rust-tpa-0.1.0.49f1801-checkout/Cargo.toml" + (("lib_wfa2 = \\{ git = \"[^\"]*\"[^}]*\\}") + "lib_wfa2 = { path = \"../rust-lib-wfa2-0.1.0.8859b6a-checkout\", version = \"0.1.0\" }") + (("tracepoints = \\{ git = \"[^\"]*\"[^}]*\\}") + "tracepoints = { path = \"../rust-tracepoints-0.1.0.66a5511-checkout\", version = \"0.1.0\" }") + ;; Strip dev-deps; cargo --offline still wants + ;; them in the vendor even though we don't run + ;; tpa's tests. 
+ (("^rand = .*") "") + (("\\[dev-dependencies\\]") "[fake-removed]")) + (substitute* "guix-vendor/rust-tracepoints-0.1.0.66a5511-checkout/Cargo.toml" + (("lib_wfa2 = \\{ git = \"[^\"]*\"[^}]*\\}") + "lib_wfa2 = { path = \"../rust-lib-wfa2-0.1.0.8859b6a-checkout\", version = \"0.1.0\" }")))) + ;; cargo-build-system's default install runs + ;; `cargo install --path .`, which fails for a virtual + ;; workspace root with no [package]. Copy the four + ;; workshop binaries out of target/release/ manually. + (replace 'install + (lambda* (#:key outputs #:allow-other-keys) + (let ((bin (string-append (assoc-ref outputs "out") + "/bin"))) + (mkdir-p bin) + (for-each + (lambda (name) + (install-file (string-append "target/release/" name) + bin)) + '("ibs" "ibd" "ancestry" "jacquard"))))) + ;; lib_wfa2's build.rs builds WFA2-lib from a git + ;; submodule we don't have. Replace it with a stub + ;; that links against the system wfa2-lib-static; + ;; same pattern impg uses for its lib_wfa2 checkout. + (add-before 'build 'patch-lib-wfa2-use-system + (lambda _ + (let ((br "guix-vendor/rust-lib-wfa2-0.1.0.8859b6a-checkout/build.rs")) + (chmod br #o644) + (call-with-output-file br + (lambda (port) + (display + (string-append + "fn main() {\n" + " println!(\"cargo:rustc-link-lib=static=wfa\");\n" + " println!(\"cargo:rustc-link-lib=gomp\");\n" + " println!(\"cargo:rustc-link-search=native=" + #$(file-append wfa2-lib-static "/lib") "\");\n" + "}\n") + port)))) + ;; bindings_wfa.rs uses u32::cast_signed / + ;; i32::cast_unsigned, both unstable before Rust + ;; 1.87. Replace with explicit `as` casts. 
+ (substitute* + "guix-vendor/rust-lib-wfa2-0.1.0.8859b6a-checkout/src/bindings_wfa.rs" + (("u32::cast_signed\\(self\\._bitfield_1\\.get\\(0usize, 24u8\\) as u32\\)") + "((self._bitfield_1.get(0usize, 24u8) as u32) as i32)") + (("u32::cast_signed\\(<") + "((<") + ((" \\) as u32\\)") + " ) as u32) as i32)") + (("i32::cast_unsigned\\(val\\)") + "(val as u32)") + (("i32::cast_unsigned\\(_flags2\\)") + "(_flags2 as u32)"))))))) + (native-inputs (list pkg-config)) + (inputs (cons* wfa2-lib-static + zlib + (list zstd "lib") + (cargo-inputs 'impop + #:module '(gn packages pangenome-rust)))) + (home-page "https://github.com/MarsicoFL/memimpopk") + (synopsis "Population genomics for implicit pangenomes") + (description "impop_k provides IBS, IBD, local-ancestry, and +Jacquard relatedness inference directly on per-window pairwise +identity matrices produced by @code{impg similarity}, without ever +calling variants or building a VCF. Each binary wraps a hidden +Markov model whose emissions are conditional probabilities over +per-window identity, with transitions calibrated by recombination +rate.") + (license license:expat)))) + +(define mempang-workshop-pangenomes + ;; pangenomes propagates the current vg (1.72.0) and wfmash-0.14 + ;; release; the workshop is pinned to vg 1.71.0 and a wfmash-0.14 + ;; post-release commit. Drop both here so the workshop can list + ;; the pinned versions directly without a profile collision. 
+ (package + (inherit pangenomes) + (propagated-inputs + (modify-inputs (package-propagated-inputs pangenomes) + (delete "vg") + (delete "wfmash"))))) + (define-public mempang-workshop (package (name "mempang-workshop") @@ -1531,12 +2103,28 @@ kfilt, miniprot, pangene, wally, and vcfbub.") (use-modules (guix build utils)) (mkdir-p (string-append #$output "/bin"))))) (propagated-inputs - (list pangenomes + (list mempang-workshop-pangenomes + vg-1.71 + wfmash-0.14-snapshot + ;; impop_k (Part 6): the source-built {ibs,ibd,ancestry, + ;; jacquard} from MarsicoFL/memimpopk's bundled source/. + ;; The workshop's memimpopk clone still ships its own + ;; prebuilt bin/, but with impop on PATH attendees who + ;; can't run the prebuilts (older glibc, ARM, ...) get + ;; working binaries automatically. + impop + ;; libgcc_s.so.1 for the memimpopk-prebuilt impop_k + ;; binaries; impop above is Guix-built and self-contained + ;; but the upstream prebuilts still need this. + (list gcc "lib") bc bcftools + coreutils fastix + gawk gnuplot graphviz + grep gzip multiqc mummer @@ -1548,6 +2136,7 @@ kfilt, miniprot, pangene, wally, and vcfbub.") python-pycairo rtg-tools r-minimal + sed r-ape r-data-table r-gggenes @@ -1555,6 +2144,7 @@ kfilt, miniprot, pangene, wally, and vcfbub.") r-ggtree r-tidyverse wget + which zstd)) (home-page "https://github.com/pangenome") (synopsis "MEMPANG pangenome workshop tools") diff --git a/scripts/create-docker-pangenome-tools.sh b/scripts/create-docker-pangenome-tools.sh new file mode 100755 index 0000000..9f65973 --- /dev/null +++ b/scripts/create-docker-pangenome-tools.sh @@ -0,0 +1,24 @@ +#!/bin/sh +# Build a Docker image (tar.gz, loadable via `docker load`) of +# mempang-workshop plus a minimal shell environment, and drop a copy +# in ~/tmp. See lib-pangenome-pack.sh for the naming convention and +# outputs. +# +# Usage: scripts/create-docker-pangenome-tools.sh +# docker load < ~/tmp/<the-tar.gz> +# docker run --rm -it pangenome-tools:<GBHASH> + +. 
"$(dirname "$0")/lib-pangenome-pack.sh" + +pangenome_pack docker tar.gz docker +pangenome_write_outputs + +echo +echo "Docker image ready:" +ls -lh "$PACK_TARGET" +echo "md5sum: $MD5SUM_FILE" +echo "inventory: $INVENTORY" +echo +echo "Run with:" +echo " docker load < $PACK_TARGET" +echo " docker run --rm -it pangenome-tools:$GB_HASH" diff --git a/scripts/create-singularity-pangenome-tools.sh b/scripts/create-singularity-pangenome-tools.sh new file mode 100755 index 0000000..93df530 --- /dev/null +++ b/scripts/create-singularity-pangenome-tools.sh @@ -0,0 +1,21 @@ +#!/bin/sh +# Build a Singularity (SquashFS) image of mempang-workshop plus a +# minimal shell environment, and drop a copy in ~/tmp. See +# lib-pangenome-pack.sh for the naming convention and outputs. +# +# Usage: scripts/create-singularity-pangenome-tools.sh + +. "$(dirname "$0")/lib-pangenome-pack.sh" + +pangenome_pack squashfs gz.squashfs singularity +pangenome_write_outputs + +echo +echo "Singularity image ready:" +ls -lh "$PACK_TARGET" +echo "md5sum: $MD5SUM_FILE" +echo "inventory: $INVENTORY" +echo +echo "Run with:" +echo " singularity exec $PACK_TARGET <command>" +echo " singularity shell $PACK_TARGET" diff --git a/scripts/lib-pangenome-pack.sh b/scripts/lib-pangenome-pack.sh new file mode 100644 index 0000000..7707928 --- /dev/null +++ b/scripts/lib-pangenome-pack.sh @@ -0,0 +1,203 @@ +# Shared helpers for the pangenome-tools image builders. +# +# Sourced from create-singularity-pangenome-tools.sh and +# create-docker-pangenome-tools.sh. Resolves versions, names the +# output, copies the pack into ~/tmp, and writes md5sum.txt and the +# Markdown inventory. +# +# Callers source this file (which assumes "$0" is the front script) +# and then call: +# +# pangenome_pack <guix-pack-format> <file-extension> <name-label> +# pangenome_write_outputs +# +# Variables PACK_TARGET, PACK_LABEL, PACK_HASH are exported back to +# the caller after pangenome_pack runs. 
# Abort on the first failing command and on any unset variable.
set -eu

# Directory of the front script that sourced this file, and the channel
# checkout one level above it.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
CHANNEL_DIR=$(cd "$SCRIPT_DIR/.." && pwd)
DEST_DIR="$HOME/tmp"
mkdir -p "$DEST_DIR"

DATE=$(date +%Y%m%d)
GB_HASH=$(git -C "$CHANNEL_DIR" rev-parse --short=8 HEAD)
# The pack is built for the host architecture (no cross-compile).
# Embed it in the filename so x86_64 / aarch64 / ... images cannot
# be confused.  Optional TUNE env var passes through to
# `guix pack --tune=...` and is appended to the arch slug.
#
# Only the psABI v-levels are accepted as TUNE values:
#
#   x86-64       baseline (any 64-bit Intel/AMD CPU)
#   x86-64-v2    Nehalem / Bulldozer       (SSE4.2 + POPCNT)
#   x86-64-v3    Haswell / Zen 1           (AVX2 + BMI1/2 + FMA)
#   x86-64-v4    Skylake-SP / Zen 4        (AVX-512 F/DQ/CD/BW/VL)
#
# Microarch names like `cascadelake`, `znver3`, `skylake-avx512`
# are NOT accepted: Go (which the closure pulls in via odgi etc.)
# only understands the v-levels and `guix pack --tune=cascadelake`
# fails with "compiler go@... does not support micro-architecture
# cascadelake".  Pick the v-level whose feature set is implied by
# your target microarch (e.g. Cascade Lake/Zen 4 -> v4 because
# both have AVX-512).
ARCH=$(uname -m)
TUNE="${TUNE:-}"
# When TUNE is set, the v-level already implies the architecture
# (x86-64-v4 only makes sense on x86_64); use it on its own to
# avoid a redundant "x86_64-x86-64-v4" slug.
ARCH_SLUG="${TUNE:-$ARCH}"
if [ -n "$TUNE" ]; then
    case "$TUNE" in
        x86-64|x86-64-v2|x86-64-v3|x86-64-v4) : ;;
        *)
            cat >&2 <<EOF
TUNE=$TUNE is not supported. Use one of the psABI v-levels:

  x86-64       baseline (any 64-bit Intel/AMD CPU)
  x86-64-v2    Nehalem / Bulldozer       (SSE4.2 + POPCNT)
  x86-64-v3    Haswell / Zen 1           (AVX2 + BMI1/2 + FMA)
  x86-64-v4    Skylake-SP / Zen 4        (AVX-512)

Microarch names (cascadelake, znver3, ...) are rejected because Go
in the closure only supports the v-levels.
EOF
            exit 2
            ;;
    esac
    # Every accepted TUNE value is an x86-64 level and ARCH_SLUG is taken
    # from TUNE alone, so on any other host the file name would claim an
    # architecture the image was not built for.  Refuse early instead.
    if [ "$ARCH" != "x86_64" ]; then
        echo "TUNE=$TUNE is an x86-64 psABI level but this host is $ARCH; unset TUNE." >&2
        exit 2
    fi
fi

# Print (without a trailing newline) the one-line CPU requirement that
# goes into the Markdown inventory, derived from TUNE and ARCH.
cpu_compat () {
    case "$TUNE" in
        "")
            # Untuned builds target the host architecture, which is not
            # necessarily x86_64 -- do not promise Intel/AMD compatibility
            # for e.g. an aarch64 image.
            case "$ARCH" in
                x86_64) printf 'Generic x86_64 -- any 64-bit Intel or AMD CPU (no AVX/AVX2/AVX-512 required).' ;;
                *)      printf 'Generic %s -- baseline build for the host architecture (no micro-architecture tuning).' "$ARCH" ;;
            esac
            ;;
        x86-64)    printf 'Any 64-bit Intel or AMD CPU (psABI baseline, since ~2003).' ;;
        x86-64-v2) printf 'Intel Nehalem (1st-gen Core i7) or newer / AMD Bulldozer or newer. Requires SSE4.2 + POPCNT.' ;;
        x86-64-v3) printf 'Intel Haswell (4th-gen Core) or newer / AMD Excavator / Zen 1 or newer. Requires AVX2 + BMI1/2 + FMA.' ;;
        x86-64-v4) printf 'Intel Skylake-SP / Cascade Lake / Ice Lake / Sapphire Rapids (Xeon Scalable, Core-X 7900X+) / AMD Zen 4 / Zen 5. Requires AVX-512 (F/DQ/CD/BW/VL).' ;;
    esac
}

# Resolve the exact version each package contributes to the closure.
# Use `guix repl` so we read `package-version` directly -- `guix
# build -n` only prints the store path on stdout when the derivation
# is already realized, which makes it fragile after a channel update.
# `read -r` keeps any backslash in the output literal.
read -r IMPG_VER WFMASH_VER PGGB_VER <<EOF
$(guix repl -L "$CHANNEL_DIR" -- /dev/stdin 2>/dev/null <<'SCM'
(use-modules (guix packages)
             (gn packages pangenome)
             (gn packages pangenome-rust))
(format #t "~a ~a ~a~%"
        (package-version impg)
        (package-version wfmash-0.14-snapshot)
        (package-version pggb))
SCM
)
EOF
[ -n "${IMPG_VER:-}" ] && [ -n "${WFMASH_VER:-}" ] && [ -n "${PGGB_VER:-}" ] \
    || { echo "could not resolve package versions" >&2; exit 1; }

VERSION_STEM="$ARCH_SLUG-guix-bioinformatics-$GB_HASH-impg-$IMPG_VER-wfmash-$WFMASH_VER-pggb-$PGGB_VER"

# pangenome_pack <guix-pack-format> <file-extension> <name-label>
#
# Run `guix pack` for mempang-workshop plus a minimal shell environment
# and copy the result into $DEST_DIR.  Sets STORE_PATH, PACK_HASH,
# PACK_LABEL and PACK_TARGET for the caller; exits 1 when no usable store
# path comes back.
pangenome_pack () {
    fmt="$1" ; ext="$2" ; label="$3"
    echo "==> building $fmt pack from $CHANNEL_DIR${TUNE:+ (tune=$TUNE)}"
    extra=""
    if [ "$fmt" = "docker" ]; then
        extra="--entry-point=/bin/bash --image-tag=pangenome-tools:$GB_HASH"
    fi
    if [ -n "$TUNE" ]; then
        extra="$extra --tune=$TUNE"
    fi
    # $extra is deliberately unquoted so it splits into separate options.
    # The pipeline's status is that of `tail`, so a failed `guix pack` is
    # detected by the existence check below rather than by `set -e`.
    # shellcheck disable=SC2086
    STORE_PATH=$(guix pack -f "$fmt" --no-offload \
        -L "$CHANNEL_DIR" \
        -S /bin=bin -S /etc/profile=etc/profile \
        $extra \
        mempang-workshop \
        bash coreutils grep sed gzip \
        | tail -n 1)
    if [ ! -e "$STORE_PATH" ]; then
        echo "guix pack -f $fmt did not produce a usable store path: $STORE_PATH" >&2
        exit 1
    fi
    PACK_HASH=$(basename "$STORE_PATH" | cut -c1-8)
    PACK_LABEL="$label"
    PACK_TARGET="$DEST_DIR/pangenome-tools-$VERSION_STEM-$label-$DATE-$PACK_HASH.$ext"

    echo "==> copying $STORE_PATH"
    echo "    to      $PACK_TARGET"
    cp -L "$STORE_PATH" "$PACK_TARGET"
    chmod u+w "$PACK_TARGET"
}

# pangenome_write_outputs
#
# Must run after pangenome_pack (it reads PACK_TARGET).  Refreshes
# $DEST_DIR/md5sum.txt with the checksum of the new pack and writes the
# Markdown tool inventory.  Sets MD5SUM_FILE and INVENTORY for the caller.
pangenome_write_outputs () {
    MD5SUM_FILE="$DEST_DIR/md5sum.txt"
    pack_name=$(basename "$PACK_TARGET")
    LINE=$(cd "$DEST_DIR" && md5sum "$pack_name")

    # Create every scratch file up front and install the EXIT trap once.
    TMP=$(mktemp)
    TOOLS_TSV=$(mktemp)
    CLEAN_TSV=$(mktemp)
    trap 'rm -f "$TMP" "$TOOLS_TSV" "$CLEAN_TSV"' EXIT

    # Append our line to md5sum.txt, deduping by filename so re-runs
    # don't accumulate stale entries.  Compare the name field literally:
    # the pack name contains dots, which would act as wildcards if it
    # were handed to grep as a pattern.
    if [ -f "$MD5SUM_FILE" ]; then
        awk -v name="$pack_name" '$2 != name' "$MD5SUM_FILE" > "$TMP"
    fi
    printf '%s\n' "$LINE" >> "$TMP"
    sort -k2 "$TMP" > "$MD5SUM_FILE"

    # Inventory is identical regardless of pack format -- name it
    # by channel hash + date only.
    INVENTORY="$DEST_DIR/pangenome-tools-$ARCH_SLUG-guix-bioinformatics-$GB_HASH-$DATE.md"
    guix repl -L "$CHANNEL_DIR" -- /dev/stdin > "$TOOLS_TSV" <<'SCM'
(use-modules (guix packages) (guix utils) (gn packages pangenome)
             (ice-9 format) (ice-9 regex))
;; Only keep packages defined in gn/packages/pangenome.scm or
;; gn/packages/pangenome-rust.scm -- those are the real pangenome
;; tools; everything else (libc, R, python, coreutils, ...) is
;; infrastructure that ends up in the closure but isn't user-facing.
(define pangenome-file-rx
  (make-regexp "gn/packages/pangenome(-rust)?\\.scm$"))
(define (pangenome-package? p)
  (let ((loc (package-location p)))
    (and loc (regexp-exec pangenome-file-rx (location-file loc)))))
;; Names already printed, so each tool appears once.
(define seen (make-hash-table))
;; Package objects already walked, so shared propagated inputs are
;; not traversed again.
(define visited (make-hash-table))
(define meta-packages
  '("pangenomes" "mempang-workshop-pangenomes" "mempang-workshop"))
(define (emit p)
  (when (and (pangenome-package? p)
             (not (member (package-name p) meta-packages))
             (not (hash-ref seen (package-name p))))
    (hash-set! seen (package-name p) #t)
    (format #t "~a\t~a\t~a~%"
            (package-name p) (package-version p)
            (or (package-synopsis p) ""))))
;; An input is either a bare package or a (label package [output])
;; list; take the package in both cases.
(define (expand x)
  (let ((p (if (pair? x) (cadr x) x)))
    (unless (hashq-ref visited p)
      (hashq-set! visited p #t)
      (emit p)
      (for-each expand (package-propagated-inputs p)))))
(for-each expand (package-propagated-inputs mempang-workshop))
SCM
    # Keep only lines that start like a package name.  A plain pattern is
    # enough (no PCRE needed), and an empty result is reported explicitly
    # instead of letting `set -e` end the script without a message.
    if ! grep '^[a-z0-9]' "$TOOLS_TSV" > "$CLEAN_TSV"; then
        echo "no pangenome packages found in the mempang-workshop closure" >&2
        exit 1
    fi

    # Column widths: start at the header widths, grow to the widest cell.
    NAME_W=4 ; VER_W=7 ; DESC_W=11
    while IFS=$(printf '\t') read -r n v d; do
        if [ "${#n}" -gt "$NAME_W" ]; then NAME_W=${#n}; fi
        if [ "${#v}" -gt "$VER_W" ];  then VER_W=${#v};  fi
        if [ "${#d}" -gt "$DESC_W" ]; then DESC_W=${#d}; fi
    done < "$CLEAN_TSV"

    # Print $1 dashes (the Markdown header separator cell).
    dashes () { printf '%*s' "$1" '' | tr ' ' -; }

    {
        echo "# pangenome-tools $DATE ($ARCH_SLUG, guix-bioinformatics @ $GB_HASH)"
        echo
        echo "Built from \`mempang-workshop\` in guix-bioinformatics @ $GB_HASH for $ARCH_SLUG."
        echo
        echo "**CPU compatibility:** $(cpu_compat)"
        echo
        printf "| %-${NAME_W}s | %-${VER_W}s | %-${DESC_W}s |\n" \
            "Tool" "Version" "Description"
        printf "| %s | %s | %s |\n" \
            "$(dashes "$NAME_W")" "$(dashes "$VER_W")" "$(dashes "$DESC_W")"
        while IFS=$(printf '\t') read -r n v d; do
            printf "| %-${NAME_W}s | %-${VER_W}s | %-${DESC_W}s |\n" "$n" "$v" "$d"
        done < "$CLEAN_TSV"
    } > "$INVENTORY"
}
