about summary refs log tree commit diff
path: root/scripts/lib-pangenome-pack.sh
diff options
context:
space:
mode:
authorpjotr2026-05-24 09:37:14 +0000
committerpjotr2026-05-29 08:59:13 +0000
commit7069ddc63d1b462fbdb994055dd0006a4f4bc17b (patch)
tree8dbab6a9b59c56060757f6d8ce3267ec461c703a /scripts/lib-pangenome-pack.sh
parent2554e69813dd9ad06b2014913e15608be9d88e3d (diff)
downloadguix-bioinformatics-7069ddc63d1b462fbdb994055dd0006a4f4bc17b.tar.gz
Summary of the refactor:
  - scripts/lib-pangenome-pack.sh — shared (sourced): resolves versions, builds the pack, writes md5sum.txt and the Markdown inventory. Exposes
   pangenome_pack <fmt> <ext> <label> and pangenome_write_outputs.
  - scripts/create-singularity-pangenome-tools.sh — 22 lines: sources the lib, calls pangenome_pack squashfs gz.squashfs singularity.
  - scripts/create-docker-pangenome-tools.sh — 24 lines: sources the lib, calls pangenome_pack docker tar.gz docker.

  md5sum.txt deduplicates by filename across runs and sorts by filename, so running both scripts produces a single combined manifest. The
  inventory .md has identical content for both formats and is named by <GBHASH>-<DATE> (no per-image hash, since contents are the same).
Diffstat (limited to 'scripts/lib-pangenome-pack.sh')
-rw-r--r--scripts/lib-pangenome-pack.sh143
1 files changed, 143 insertions, 0 deletions
diff --git a/scripts/lib-pangenome-pack.sh b/scripts/lib-pangenome-pack.sh
new file mode 100644
index 0000000..35c50f7
--- /dev/null
+++ b/scripts/lib-pangenome-pack.sh
@@ -0,0 +1,143 @@
+# Shared helpers for the pangenome-tools image builders.
+#
+# Sourced from create-singularity-pangenome-tools.sh and
+# create-docker-pangenome-tools.sh.  Resolves versions, names the
+# output, copies the pack into ~/tmp, and writes md5sum.txt and the
+# Markdown inventory.
+#
+# Callers source this file (which assumes "$0" is the front script)
+# and then call:
+#
+#   pangenome_pack <guix-pack-format> <file-extension> <name-label>
+#   pangenome_write_outputs
+#
+# Variables PACK_TARGET, PACK_LABEL, PACK_HASH are left set as shell
+# globals (not exported) for the caller after pangenome_pack runs.
+
+# Abort on the first failing command and on use of an unset variable.
+# NOTE: this file is sourced, so these options also take effect in the
+# calling script.
+set -eu
+
+# Directory of the front script ("$0") and the channel checkout one
+# level above it.  CDPATH is emptied for the cd calls: when it is set,
+# cd may print the directory it resolved, and that output would be
+# captured together with pwd's.  "--" protects paths starting with "-".
+SCRIPT_DIR=$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)
+CHANNEL_DIR=$(CDPATH='' cd -- "$SCRIPT_DIR/.." && pwd)
+
+# Packs, md5sum.txt and the inventory are all written below this directory.
+DEST_DIR="$HOME/tmp"
+mkdir -p "$DEST_DIR"
+
+# Build date (YYYYMMDD) and abbreviated (--short=8) HEAD commit of the
+# channel checkout; both end up in the output file names.
+DATE=$(date +%Y%m%d)
+GB_HASH=$(git -C "$CHANNEL_DIR" rev-parse --short=8 HEAD)
+
+# Resolve the exact version each package contributes to the closure.
+# `guix package -A` is regex-on-name and can be ambiguous (multiple
+# wfmash variants), so go through `guix build -e ... -n` and parse
+# the store basename, which always carries the full version.
+#
+# Arguments: $1 - Scheme expression evaluating to the package
+#            $2 - package name as it appears in the store item name
+# Globals:   CHANNEL_DIR (read)
+# Outputs:   the version string on stdout
+# Exits:     with status 1 (message on stderr) when guix prints no
+#            path or the store item is not named <hash>-<name>-<version>.
+#            Meant to be called inside $(...), so the exit ends the
+#            substitution and `set -e` stops the caller.
+resolve_version () {
+    rv_expr="$1" ; rv_name="$2"
+    # guix's stderr is discarded here (presumably to hide its progress
+    # chatter); only the last stdout line is kept as the store path.
+    rv_path=$(guix build --no-offload -L "$CHANNEL_DIR" -e "$rv_expr" -n 2>/dev/null \
+              | tail -n 1)
+    [ -n "$rv_path" ] || { echo "could not resolve $rv_name" >&2; exit 1; }
+    rv_base=$(basename "$rv_path")
+    rv_version=$(printf '%s\n' "$rv_base" | sed -E "s/^[a-z0-9]+-${rv_name}-//")
+    # If the prefix did not match, sed hands back the basename untouched.
+    # Refuse that instead of letting a whole store item name leak into
+    # the output file names as a "version".
+    if [ -z "$rv_version" ] || [ "$rv_version" = "$rv_base" ]; then
+        echo "could not resolve $rv_name: unexpected store item $rv_base" >&2
+        exit 1
+    fi
+    printf '%s\n' "$rv_version"
+}
+
+# Versions of the three headline tools, looked up with resolve_version.
+# wfmash is evaluated through the wfmash-0.14-snapshot variable but
+# matched against the store name "wfmash".
+IMPG_VER=$(resolve_version '(@ (gn packages pangenome-rust) impg)' impg)
+WFMASH_VER=$(resolve_version \
+    '(@ (gn packages pangenome) wfmash-0.14-snapshot)' wfmash)
+PGGB_VER=$(resolve_version '(@ (gn packages pangenome) pggb)' pggb)
+
+# File name stem shared by every output: channel commit first, then
+# each tool name followed by its version.
+VERSION_STEM="guix-bioinformatics-${GB_HASH}"
+VERSION_STEM="${VERSION_STEM}-impg-${IMPG_VER}"
+VERSION_STEM="${VERSION_STEM}-wfmash-${WFMASH_VER}"
+VERSION_STEM="${VERSION_STEM}-pggb-${PGGB_VER}"
+
+# Build a `guix pack` of mempang-workshop (plus bash, coreutils, grep,
+# sed and gzip) and copy the result into $DEST_DIR.
+#
+# Arguments: $1 - guix pack format (e.g. squashfs, docker)
+#            $2 - file extension for the copied pack
+#            $3 - label embedded in the copied pack's file name
+# Globals read:    CHANNEL_DIR, DEST_DIR, GB_HASH, VERSION_STEM, DATE
+# Globals written: STORE_PATH, PACK_HASH, PACK_LABEL, PACK_TARGET
+# Exits:     with status 1 when guix pack yields no existing store path.
+pangenome_pack () {
+    fmt="$1" ; ext="$2" ; label="$3"
+    echo "==> building $fmt pack from $CHANNEL_DIR"
+    # The arguments are saved above, so the positional parameters can be
+    # reused as a real argument list for the format-specific options.
+    # Unlike an unquoted string this does not depend on IFS or globbing.
+    set --
+    if [ "$fmt" = "docker" ]; then
+        set -- --entry-point=/bin/bash "--image-tag=pangenome-tools:$GB_HASH"
+    fi
+    # Only the last line printed by guix pack is taken as the store path.
+    STORE_PATH=$(guix pack -f "$fmt" --no-offload \
+                          -L "$CHANNEL_DIR" \
+                          -S /bin=bin -S /etc/profile=etc/profile \
+                          "$@" \
+                          mempang-workshop \
+                          bash coreutils grep sed gzip \
+                  | tail -n 1)
+    # The pipeline's status is tail's, so a failed guix pack is only
+    # noticed here, through the missing or empty path.
+    if [ ! -e "$STORE_PATH" ]; then
+        echo "guix pack -f $fmt did not produce a usable store path: $STORE_PATH" >&2
+        exit 1
+    fi
+    # First 8 characters of the store item name identify this image.
+    PACK_HASH=$(basename "$STORE_PATH" | cut -c1-8)
+    PACK_LABEL="$label"
+    PACK_TARGET="$DEST_DIR/pangenome-tools-$VERSION_STEM-$label-$DATE-$PACK_HASH.$ext"
+
+    echo "==> copying $STORE_PATH"
+    echo "    to $PACK_TARGET"
+    # -L copies the file a symlink points at rather than the link itself;
+    # the copy is then made writable by its owner.
+    cp -L "$STORE_PATH" "$PACK_TARGET"
+    chmod u+w "$PACK_TARGET"
+}
+
+# Record the pack named by $PACK_TARGET in $DEST_DIR/md5sum.txt and
+# write the Markdown inventory of pangenome tools.
+#
+# Globals read:    DEST_DIR, PACK_TARGET, CHANNEL_DIR, GB_HASH, DATE
+# Globals written: MD5SUM_FILE, INVENTORY (plus scratch variables)
+# Files written:   $DEST_DIR/md5sum.txt
+#                  $DEST_DIR/pangenome-tools-guix-bioinformatics-<GB_HASH>-<DATE>.md
+# Side effect:     installs an EXIT trap that removes the scratch
+#                  directory; it replaces any EXIT trap set earlier.
+# Exits:           with status 1 when no tool rows could be collected.
+pangenome_write_outputs () {
+    MD5SUM_FILE="$DEST_DIR/md5sum.txt"
+    PACK_NAME=$(basename "$PACK_TARGET")
+
+    # One scratch directory and a single trap, so no temporary file can
+    # be orphaned between creating it and registering its cleanup.
+    WORK_DIR=$(mktemp -d)
+    trap 'rm -rf "$WORK_DIR"' EXIT
+    TMP="$WORK_DIR/md5sum"
+    TOOLS_TSV="$WORK_DIR/tools.tsv"
+    CLEAN_TSV="$WORK_DIR/clean.tsv"
+
+    # Append our line to md5sum.txt, deduping by filename so re-runs
+    # don't accumulate stale entries.  The comparison is a literal
+    # suffix match: used as a regex, the dots in the file name would
+    # match any character.
+    LINE=$(cd "$DEST_DIR" && md5sum "$PACK_NAME")
+    if [ -f "$MD5SUM_FILE" ]; then
+        awk -v suffix="  $PACK_NAME" '
+            length($0) < length(suffix) ||
+            substr($0, length($0) - length(suffix) + 1) != suffix
+        ' "$MD5SUM_FILE" > "$TMP"
+    fi
+    printf '%s\n' "$LINE" >> "$TMP"
+    # Byte-wise collation keeps the manifest order independent of the
+    # builder's locale.
+    LC_ALL=C sort -k2 "$TMP" > "$MD5SUM_FILE"
+
+    # Inventory is identical regardless of pack format -- name it
+    # by channel hash + date only.
+    INVENTORY="$DEST_DIR/pangenome-tools-guix-bioinformatics-$GB_HASH-$DATE.md"
+    # The Scheme program below prints one "name<TAB>version<TAB>synopsis"
+    # line per package; the quoted delimiter keeps the shell from
+    # expanding anything inside it.
+    guix repl -L "$CHANNEL_DIR" -- /dev/stdin > "$TOOLS_TSV" <<'SCM'
+(use-modules (guix packages) (guix utils) (gn packages pangenome)
+             (ice-9 format) (ice-9 regex))
+;; Only keep packages defined in gn/packages/pangenome.scm or
+;; gn/packages/pangenome-rust.scm -- those are the real pangenome
+;; tools; everything else (libc, R, python, coreutils, ...) is
+;; infrastructure that ends up in the closure but isn't user-facing.
+(define pangenome-file-rx
+  (make-regexp "gn/packages/pangenome(-rust)?\\.scm$"))
+(define (pangenome-package? p)
+  (let ((loc (package-location p)))
+    (and loc (regexp-exec pangenome-file-rx (location-file loc)))))
+(define seen (make-hash-table))
+(define meta-packages
+  '("pangenomes" "mempang-workshop-pangenomes" "mempang-workshop"))
+(define (emit p)
+  (when (and (pangenome-package? p)
+             (not (member (package-name p) meta-packages))
+             (not (hash-ref seen (package-name p))))
+    (hash-set! seen (package-name p) #t)
+    (format #t "~a\t~a\t~a~%"
+            (package-name p) (package-version p)
+            (or (package-synopsis p) ""))))
+(define (expand x)
+  (let ((p (if (pair? x) (cadr x) x)))
+    (emit p)
+    (for-each expand (package-propagated-inputs p))))
+(for-each expand (package-propagated-inputs mempang-workshop))
+SCM
+    # Keep only lines that start like a package name.  A basic regex is
+    # enough (no PCRE needed), and an empty result is reported instead
+    # of letting `set -e` end the script without a word.
+    if ! LC_ALL=C grep '^[a-z0-9]' "$TOOLS_TSV" > "$CLEAN_TSV"; then
+        echo "no pangenome tools listed for mempang-workshop; not writing $INVENTORY" >&2
+        exit 1
+    fi
+
+    # First pass: column widths, starting from the header widths
+    # ("Tool", "Version", "Description").
+    TAB=$(printf '\t')
+    NAME_W=4 ; VER_W=7 ; DESC_W=11
+    while IFS="$TAB" read -r n v d; do
+        if [ "${#n}" -gt "$NAME_W" ]; then NAME_W=${#n}; fi
+        if [ "${#v}" -gt "$VER_W" ]; then VER_W=${#v}; fi
+        if [ "${#d}" -gt "$DESC_W" ]; then DESC_W=${#d}; fi
+    done < "$CLEAN_TSV"
+
+    # Print $1 dashes (the separator row of the Markdown table).
+    dashes () { printf '%*s' "$1" '' | tr ' ' -; }
+
+    # Second pass: heading, table header, separator and one row per tool.
+    {
+        echo "# pangenome-tools $DATE (guix-bioinformatics @ $GB_HASH)"
+        echo
+        echo "Built from \`mempang-workshop\` in guix-bioinformatics @ $GB_HASH."
+        echo
+        printf '| %-*s | %-*s | %-*s |\n' \
+               "$NAME_W" "Tool" "$VER_W" "Version" "$DESC_W" "Description"
+        printf "| %s | %s | %s |\n" \
+               "$(dashes "$NAME_W")" "$(dashes "$VER_W")" "$(dashes "$DESC_W")"
+        while IFS="$TAB" read -r n v d; do
+            printf '| %-*s | %-*s | %-*s |\n' \
+                   "$NAME_W" "$n" "$VER_W" "$v" "$DESC_W" "$d"
+        done < "$CLEAN_TSV"
+    } > "$INVENTORY"
+}