about summary refs log tree commit diff
path: root/scripts/lib-pangenome-pack.sh
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/lib-pangenome-pack.sh')
-rw-r--r-- scripts/lib-pangenome-pack.sh 203
1 files changed, 203 insertions, 0 deletions
diff --git a/scripts/lib-pangenome-pack.sh b/scripts/lib-pangenome-pack.sh
new file mode 100644
index 0000000..7707928
--- /dev/null
+++ b/scripts/lib-pangenome-pack.sh
@@ -0,0 +1,203 @@
+# Shared helpers for the pangenome-tools image builders.
+#
+# Sourced from create-singularity-pangenome-tools.sh and
+# create-docker-pangenome-tools.sh.  Resolves versions, names the
+# output, copies the pack into ~/tmp, and writes md5sum.txt and the
+# Markdown inventory.
+#
+# Callers source this file (which assumes "$0" is the front script)
+# and then call:
+#
+#   pangenome_pack <guix-pack-format> <file-extension> <name-label>
+#   pangenome_write_outputs
+#
+# pangenome_pack sets the shell variables PACK_TARGET, PACK_LABEL and
+# PACK_HASH for the caller (plain globals, not exported to the environment).
+
+set -eu
+
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+CHANNEL_DIR=$(cd "$SCRIPT_DIR/.." && pwd)
+DEST_DIR="$HOME/tmp"
+mkdir -p "$DEST_DIR"
+
+DATE=$(date +%Y%m%d)
+GB_HASH=$(git -C "$CHANNEL_DIR" rev-parse --short=8 HEAD)
+# The pack is built for the host architecture (no cross-compile).
+# Embed it in the filename so x86_64 / aarch64 / ... images cannot
+# be confused.  Optional TUNE env var passes through to
+# `guix pack --tune=...` and, when set, replaces the arch slug.
+#
+# Only the psABI v-levels are accepted as TUNE values:
+#
+#     x86-64      baseline (any 64-bit Intel/AMD CPU)
+#     x86-64-v2   Nehalem / Bulldozer (SSE4.2 + POPCNT)
+#     x86-64-v3   Haswell / Zen 1     (AVX2 + BMI1/2 + FMA)
+#     x86-64-v4   Skylake-SP / Zen 4  (AVX-512 F/DQ/CD/BW/VL)
+#
+# Microarch names like `cascadelake`, `znver3`, `skylake-avx512`
+# are NOT accepted: Go (which the closure pulls in via odgi etc.)
+# only understands the v-levels and `guix pack --tune=cascadelake`
+# fails with "compiler go@... does not support micro-architecture
+# cascadelake".  Pick the v-level whose feature set is implied by
+# your target microarch (e.g. Cascade Lake/Zen 4 -> v4 because
+# both have AVX-512).
+ARCH=$(uname -m)
+TUNE="${TUNE:-}"
+# When TUNE is set, the v-level already implies the architecture
+# (x86-64-v4 only makes sense on x86_64); use it on its own to
+# avoid a redundant "x86_64-x86-64-v4" slug.
+ARCH_SLUG="${TUNE:-$ARCH}"
+if [ -n "$TUNE" ]; then
+    case "$TUNE" in
+        x86-64|x86-64-v2|x86-64-v3|x86-64-v4) : ;;
+        *)
+            cat >&2 <<EOF
+TUNE=$TUNE is not supported.  Use one of the psABI v-levels:
+
+    x86-64      baseline (any 64-bit Intel/AMD CPU)
+    x86-64-v2   Nehalem / Bulldozer (SSE4.2 + POPCNT)
+    x86-64-v3   Haswell / Zen 1     (AVX2 + BMI1/2 + FMA)
+    x86-64-v4   Skylake-SP / Zen 4  (AVX-512)
+
+Microarch names (cascadelake, znver3, ...) are rejected because Go
+in the closure only supports the v-levels.
+EOF
+            exit 2
+            ;;
+    esac
+    # Every accepted value is an x86-64 feature level, and the slug above
+    # drops the host architecture once TUNE is set.  Refuse the combination
+    # on any other host so a pack built for, say, aarch64 can never be
+    # named (or tuned) as x86-64-vN.
+    if [ "$ARCH" != "x86_64" ]; then
+        echo "TUNE=$TUNE is an x86-64 psABI level but this host is $ARCH; unset TUNE to build an untuned $ARCH pack." >&2
+        exit 2
+    fi
+fi
+
+cpu_compat () {
+    case "$TUNE" in
+        "")        printf 'Generic x86_64 -- any 64-bit Intel or AMD CPU (no AVX/AVX2/AVX-512 required).' ;;
+        x86-64)    printf 'Any 64-bit Intel or AMD CPU (psABI baseline, since ~2003).' ;;
+        x86-64-v2) printf 'Intel Nehalem (1st-gen Core i7) or newer / AMD Bulldozer or newer.  Requires SSE4.2 + POPCNT.' ;;
+        x86-64-v3) printf 'Intel Haswell (4th-gen Core) or newer / AMD Excavator / Zen 1 or newer.  Requires AVX2 + BMI1/2 + FMA.' ;;
+        x86-64-v4) printf 'Intel Skylake-SP / Cascade Lake / Ice Lake / Sapphire Rapids (Xeon Scalable, Core-X 7900X+) / AMD Zen 4 / Zen 5.  Requires AVX-512 (F/DQ/CD/BW/VL).' ;;
+    esac
+}
+
+# Resolve the exact version each package contributes to the closure.
+# Use `guix repl` so we read `package-version` directly -- `guix
+# build -n` only prints the store path on stdout when the derivation
+# is already realized, which makes it fragile after a channel update.
+read IMPG_VER WFMASH_VER PGGB_VER <<EOF
+$(guix repl -L "$CHANNEL_DIR" -- /dev/stdin 2>/dev/null <<'SCM'
+(use-modules (guix packages)
+             (gn packages pangenome)
+             (gn packages pangenome-rust))
+(format #t "~a ~a ~a~%"
+        (package-version impg)
+        (package-version wfmash-0.14-snapshot)
+        (package-version pggb))
+SCM
+)
+EOF
+[ -n "${IMPG_VER:-}" ] && [ -n "${WFMASH_VER:-}" ] && [ -n "${PGGB_VER:-}" ] \
+    || { echo "could not resolve package versions" >&2; exit 1; }
+
+VERSION_STEM="$ARCH_SLUG-guix-bioinformatics-$GB_HASH-impg-$IMPG_VER-wfmash-$WFMASH_VER-pggb-$PGGB_VER"
+
+pangenome_pack () {
+    fmt="$1" ; ext="$2" ; label="$3"
+    echo "==> building $fmt pack from $CHANNEL_DIR${TUNE:+ (tune=$TUNE)}"
+    extra=""
+    [ "$fmt" = "docker" ] && extra="--entry-point=/bin/bash --image-tag=pangenome-tools:$GB_HASH"
+    [ -n "$TUNE" ] && extra="$extra --tune=$TUNE"
+    # shellcheck disable=SC2086
+    STORE_PATH=$(guix pack -f "$fmt" --no-offload \
+                          -L "$CHANNEL_DIR" \
+                          -S /bin=bin -S /etc/profile=etc/profile \
+                          $extra \
+                          mempang-workshop \
+                          bash coreutils grep sed gzip \
+                  | tail -n 1)
+    if [ ! -e "$STORE_PATH" ]; then
+        echo "guix pack -f $fmt did not produce a usable store path: $STORE_PATH" >&2
+        exit 1
+    fi
+    PACK_HASH=$(basename "$STORE_PATH" | cut -c1-8)
+    PACK_LABEL="$label"
+    PACK_TARGET="$DEST_DIR/pangenome-tools-$VERSION_STEM-$label-$DATE-$PACK_HASH.$ext"
+
+    echo "==> copying $STORE_PATH"
+    echo "    to $PACK_TARGET"
+    cp -L "$STORE_PATH" "$PACK_TARGET"
+    chmod u+w "$PACK_TARGET"
+}
+
+pangenome_write_outputs () {
+    # Append our line to md5sum.txt, deduping by filename so re-runs
+    # don't accumulate stale entries.
+    MD5SUM_FILE="$DEST_DIR/md5sum.txt"
+    LINE=$(cd "$DEST_DIR" && md5sum "$(basename "$PACK_TARGET")")
+    TMP=$(mktemp)
+    trap 'rm -f "$TMP"' EXIT
+    [ -f "$MD5SUM_FILE" ] && grep -v "  $(basename "$PACK_TARGET")\$" \
+                                  "$MD5SUM_FILE" > "$TMP" || true
+    printf '%s\n' "$LINE" >> "$TMP"
+    sort -k2 "$TMP" > "$MD5SUM_FILE"
+
+    # Inventory is identical regardless of pack format -- name it
+    # by channel hash + date only.
+    INVENTORY="$DEST_DIR/pangenome-tools-$ARCH_SLUG-guix-bioinformatics-$GB_HASH-$DATE.md"
+    TOOLS_TSV=$(mktemp)
+    CLEAN_TSV=$(mktemp)
+    trap 'rm -f "$TMP" "$TOOLS_TSV" "$CLEAN_TSV"' EXIT
+    guix repl -L "$CHANNEL_DIR" -- /dev/stdin > "$TOOLS_TSV" <<'SCM'
+(use-modules (guix packages) (guix utils) (gn packages pangenome)
+             (ice-9 format) (ice-9 regex))
+;; Only keep packages defined in gn/packages/pangenome.scm or
+;; gn/packages/pangenome-rust.scm -- those are the real pangenome
+;; tools; everything else (libc, R, python, coreutils, ...) is
+;; infrastructure that ends up in the closure but isn't user-facing.
+(define pangenome-file-rx
+  (make-regexp "gn/packages/pangenome(-rust)?\\.scm$"))
+(define (pangenome-package? p)
+  (let ((loc (package-location p)))
+    (and loc (regexp-exec pangenome-file-rx (location-file loc)))))
+(define seen (make-hash-table))
+(define meta-packages
+  '("pangenomes" "mempang-workshop-pangenomes" "mempang-workshop"))
+(define (emit p)
+  (when (and (pangenome-package? p)
+             (not (member (package-name p) meta-packages))
+             (not (hash-ref seen (package-name p))))
+    (hash-set! seen (package-name p) #t)
+    (format #t "~a\t~a\t~a~%"
+            (package-name p) (package-version p)
+            (or (package-synopsis p) ""))))
+(define (expand x)
+  (let ((p (if (pair? x) (cadr x) x)))
+    (emit p)
+    (for-each expand (package-propagated-inputs p))))
+(for-each expand (package-propagated-inputs mempang-workshop))
+SCM
+    grep -P '^[a-z0-9]' "$TOOLS_TSV" > "$CLEAN_TSV"
+
+    NAME_W=4 ; VER_W=7 ; DESC_W=11
+    while IFS=$(printf '\t') read -r n v d; do
+        [ ${#n} -gt $NAME_W ] && NAME_W=${#n}
+        [ ${#v} -gt $VER_W ]  && VER_W=${#v}
+        [ ${#d} -gt $DESC_W ] && DESC_W=${#d}
+    done < "$CLEAN_TSV"
+
+    dashes () { printf '%*s' "$1" '' | tr ' ' -; }
+
+    {
+        echo "# pangenome-tools $DATE ($ARCH_SLUG, guix-bioinformatics @ $GB_HASH)"
+        echo
+        echo "Built from \`mempang-workshop\` in guix-bioinformatics @ $GB_HASH for $ARCH_SLUG."
+        echo
+        echo "**CPU compatibility:** $(cpu_compat)"
+        echo
+        printf "| %-${NAME_W}s | %-${VER_W}s | %-${DESC_W}s |\n" \
+               "Tool" "Version" "Description"
+        printf "| %s | %s | %s |\n" \
+               "$(dashes "$NAME_W")" "$(dashes "$VER_W")" "$(dashes "$DESC_W")"
+        while IFS=$(printf '\t') read -r n v d; do
+            printf "| %-${NAME_W}s | %-${VER_W}s | %-${DESC_W}s |\n" "$n" "$v" "$d"
+        done < "$CLEAN_TSV"
+    } > "$INVENTORY"
+}