aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Efraim Flashner 2023-09-19 19:16:01 +0300
committer Efraim Flashner 2023-09-19 19:16:01 +0300
commit 90d0c931a585ec862151a11b6a88fce678670b9d (patch)
tree 09454b41ba468cbcbfef765c47b13da7c39448a6
parent 8885768936858286babae788a5dcfb01c2cad0a8 (diff)
download guix-bioinformatics-90d0c931a585ec862151a11b6a88fce678670b9d.tar.gz
Add vcfbub
-rw-r--r-- gn/packages/bioinformatics.scm 49
-rw-r--r-- gn/packages/crates-io.scm 26
2 files changed, 75 insertions, 0 deletions
diff --git a/gn/packages/bioinformatics.scm b/gn/packages/bioinformatics.scm
index 63934fc..2b1d514 100644
--- a/gn/packages/bioinformatics.scm
+++ b/gn/packages/bioinformatics.scm
@@ -499,6 +499,55 @@ reads.")
collapses them into a non-redundant graph structure.")
(license license:expat)))
+(define-public vcfbub ; Guix package definition for the vcfbub command-line tool
+ (package
+ (name "vcfbub")
+ (version "0.1.0")
+ (source
+ (origin
+ (method git-fetch) ; source comes from a git repository rather than a tarball
+ (uri (git-reference
+ (url "https://github.com/pangenome/vcfbub")
+ (commit (string-append "v" version)))) ; evaluates to "v0.1.0"; presumably the upstream release tag
+ (file-name (git-file-name name version)) ; file name for the checkout, built from name and version
+ (sha256
+ (base32 "0sk2ab22z6qa00j1w8a8f5kbb7q2xb10fhd32zy4lh351v3mqmyg")))) ; expected content hash, base32-encoded
+ (build-system cargo-build-system)
+ (arguments
+ `(#:install-source? #f ; do not install the crate source; presumably because this is a tool, not a library crate
+ #:cargo-inputs ; (label ,package) pairs naming the Rust crates passed to the build
+ (("rust-clap" ,rust-clap-2)
+ ("rust-flate2" ,rust-flate2-1)
+ ("rust-vcf" ,rust-vcf-0.6)))) ; rust-vcf-0.6 is defined in gn/packages/crates-io.scm
+ (home-page "https://github.com/pangenome/vcfbub")
+ (synopsis "Popping bubbles in vg deconstruct VCFs")
+ (description ; the string below uses Texinfo markup (@command, @code, @enumerate/@item)
+ "The VCF output produced by a command like @command{vg deconstruct -e -a
+-H '#' ...} includes information about the nesting of variants. With @code{-a},
+@code{--all-snarls}, we obtain not just the top level bubbles, but all nested
+ones. This exposed snarl tree information can be used to filter the VCF to
+obtain a set of non-overlapping sites (n.b. \"snarl\" is a generic model of
+graph bubbles including tips and loops).
+@code{vcfbub} lets us do two common operations on these VCFs:
+@enumerate
+@item We can filter sites by maximum level in the snarl tree. For instance,
+@code{--max-level 0} would keep only sites with @code{LV=0}. In practice, vg's
+snarl finder ensures that these are sites rooted on the main linear axis of the
+pangenome graph. Those at higher levels occur within larger variants.
+@item We can filter sites by maximum allele size, either for the reference
+allele or any allele. In this case, @code{--max-ref-length 10000} would keep
+only sites where the reference allele is less than 10kb long. Setting
+@code{--max-ref-length} or @code{--max-allele-length} additionally ensures that
+the output contains the bubbles nested inside of any popped bubble, even if
+they are at greater than @code{--max-level}.
+@end enumerate
+@code{vcfbub} accomplishes a simple task: we keep sites that are the children
+of those which we \"pop\" due to their size. These occur around complex large
+SVs, such as multi-Mbp inversions and segmental duplications. We often need to
+remove these, as they provide little information for many downstream
+applications, such as haplotype panels or other imputation references.")
+ (license license:expat))) ; license:expat is Guix's identifier for the Expat (MIT) license
+
(define-public gafpack
(let ((commit "ad31875b6914d964c6fd72d1bf334f0843538fb6") ; November 10, 2022
(revision "1"))
diff --git a/gn/packages/crates-io.scm b/gn/packages/crates-io.scm
index a0e31ce..79832bc 100644
--- a/gn/packages/crates-io.scm
+++ b/gn/packages/crates-io.scm
@@ -575,6 +575,32 @@ or any combination.")
(description "Yet another format-preserving TOML parser.")
(license (list license:expat license:asl2.0))))
+(define-public rust-vcf-0.6 ; Guix package for the "vcf" crate, 0.6.x series
+ (package
+ (name "rust-vcf")
+ (version "0.6.1")
+ (source (origin
+ (method url-fetch) ; source is a single downloaded archive
+ (uri (crate-uri "vcf" version)) ; download URI built from the crate name "vcf" and this version
+ (file-name (string-append name "-" version ".tar.gz")) ; evaluates to "rust-vcf-0.6.1.tar.gz"
+ (sha256
+ (base32
+ "0dc0p00a19rpmhrqcshrn2qg5l716b5s1fy8vpd3p32bw77vpbs0")))) ; expected content hash, base32-encoded
+ (build-system cargo-build-system)
+ (arguments
+ `(#:tests? #f ; Not all files included
+ #:cargo-inputs ; (label ,package) pairs for the crate's regular dependencies
+ (("rust-nom" ,rust-nom-7)
+ ("rust-once-cell" ,rust-once-cell-1)
+ ("rust-thiserror" ,rust-thiserror-1))
+ #:cargo-development-inputs ; development-only dependencies of the crate
+ (("rust-clap" ,rust-clap-2)
+ ("rust-flate2" ,rust-flate2-1))))
+ (home-page "https://github.com/informationsea/vcf-rs")
+ (synopsis "Rust implementation of VCF parser")
+ (description "This package provides a rust implementation of a VCF parser.")
+ (license license:asl2.0)))
+
(define-public rust-gsl-sys
(package
(name "rust-gsl-sys")