about summary refs log tree commit diff
diff options
context:
space:
mode:
author Efraim Flashner 2023-09-19 19:16:01 +0300
committer Efraim Flashner 2023-09-19 19:16:01 +0300
commit 90d0c931a585ec862151a11b6a88fce678670b9d (patch)
tree 09454b41ba468cbcbfef765c47b13da7c39448a6
parent 8885768936858286babae788a5dcfb01c2cad0a8 (diff)
download guix-bioinformatics-90d0c931a585ec862151a11b6a88fce678670b9d.tar.gz
Add vcfbub
-rw-r--r-- gn/packages/bioinformatics.scm 49
-rw-r--r-- gn/packages/crates-io.scm 26
2 files changed, 75 insertions, 0 deletions
diff --git a/gn/packages/bioinformatics.scm b/gn/packages/bioinformatics.scm
index 63934fc..2b1d514 100644
--- a/gn/packages/bioinformatics.scm
+++ b/gn/packages/bioinformatics.scm
@@ -499,6 +499,55 @@ reads.")
 collapses them into a non-redundant graph structure.")
     (license license:expat)))
 
+(define-public vcfbub                   ; tool that filters `vg deconstruct' VCF output (see description)
+  (package
+    (name "vcfbub")
+    (version "0.1.0")
+    (source
+      (origin
+        (method git-fetch)
+        (uri (git-reference
+               (url "https://github.com/pangenome/vcfbub")
+               (commit (string-append "v" version))))   ; evaluates to "v0.1.0"
+        (file-name (git-file-name name version))
+        (sha256
+         (base32 "0sk2ab22z6qa00j1w8a8f5kbb7q2xb10fhd32zy4lh351v3mqmyg"))))
+    (build-system cargo-build-system)
+    (arguments
+     `(#:install-source? #f             ; NOTE(review): presumably skips installing the crate sources -- confirm in cargo-build-system docs
+       #:cargo-inputs
+       (("rust-clap" ,rust-clap-2)
+        ("rust-flate2" ,rust-flate2-1)
+        ("rust-vcf" ,rust-vcf-0.6))))   ; rust-vcf-0.6 is defined in gn/packages/crates-io.scm
+    (home-page "https://github.com/pangenome/vcfbub")
+    (synopsis "Popping bubbles in vg deconstruct VCFs")
+    (description
+     "The VCF output produced by a command like @command{vg deconstruct -e -a
+-H '#' ...} includes information about the nesting of variants.  With @code{-a},
+@code{--all-snarls}, we obtain not just the top level bubbles, but all nested
+ones.  This exposed snarl tree information can be used to filter the VCF to
+obtain a set of non-overlapping sites (n.b. \"snarl\" is a generic model of
+graph bubbles including tips and loops).
+@code{vcfbub} lets us do two common operations on these VCFs:
+@enumerate
+@item We can filter sites by maximum level in the snarl tree.  For instance,
+@code{--max-level 0} would keep only sites with @code{LV=0}.  In practice, vg's
+snarl finder ensures that these are sites rooted on the main linear axis of the
+pangenome graph.  Those at higher levels occur within larger variants.
+@item We can filter sites by maximum allele size, either for the reference
+allele or any allele.  In this case, @code{--max-ref-length 10000} would keep
+only sites where the reference allele is less than 10kb long.  Setting
+@code{--max-ref-length} or @code{--max-allele-length} additionally ensures that
+the output contains the bubbles nested inside of any popped bubble, even if
+they are at greater than @code{--max-level}.
+@end enumerate
+@code{vcfbub} accomplishes a simple task: we keep sites that are the children
+of those which we \"pop\" due to their size.  These occur around complex large
+SVs, such as multi-Mbp inversions and segmental duplications.  We often need to
+remove these, as they provide little information for many downstream
+applications, such as haplotype panels or other imputation references.")
+    (license license:expat)))
+
 (define-public gafpack
   (let ((commit "ad31875b6914d964c6fd72d1bf334f0843538fb6")     ; November 10, 2022
         (revision "1"))
diff --git a/gn/packages/crates-io.scm b/gn/packages/crates-io.scm
index a0e31ce..79832bc 100644
--- a/gn/packages/crates-io.scm
+++ b/gn/packages/crates-io.scm
@@ -575,6 +575,32 @@ or any combination.")
     (description "Yet another format-preserving TOML parser.")
     (license (list license:expat license:asl2.0))))
 
+(define-public rust-vcf-0.6             ; VCF parser crate; a cargo input of vcfbub
+  (package
+    (name "rust-vcf")
+    (version "0.6.1")
+    (source (origin
+              (method url-fetch)
+              (uri (crate-uri "vcf" version))
+              (file-name (string-append name "-" version ".tar.gz")) ; "rust-vcf-0.6.1.tar.gz"
+              (sha256
+               (base32
+                "0dc0p00a19rpmhrqcshrn2qg5l716b5s1fy8vpd3p32bw77vpbs0"))))
+    (build-system cargo-build-system)
+    (arguments
+     `(#:tests? #f      ; Not all files included
+       #:cargo-inputs
+       (("rust-nom" ,rust-nom-7)
+        ("rust-once-cell" ,rust-once-cell-1)
+        ("rust-thiserror" ,rust-thiserror-1))
+       #:cargo-development-inputs       ; kept for reference even though tests are disabled above
+       (("rust-clap" ,rust-clap-2)
+        ("rust-flate2" ,rust-flate2-1))))
+    (home-page "https://github.com/informationsea/vcf-rs")
+    (synopsis "Rust implementation of a VCF parser")
+    (description "This package provides a Rust implementation of a VCF parser.")
+    (license license:asl2.0)))
+
 (define-public rust-gsl-sys
   (package
     (name "rust-gsl-sys")