about summary refs log tree commit diff
path: root/scripts/lib-pangenome-pack.sh
blob: 7707928559f16ed64e2d93938fb0b185b0de02fa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# Shared helpers for the pangenome-tools image builders.
#
# Sourced from create-singularity-pangenome-tools.sh and
# create-docker-pangenome-tools.sh.  Resolves versions, names the
# output, copies the pack into ~/tmp, and writes md5sum.txt and the
# Markdown inventory.
#
# Callers source this file (which assumes "$0" is the front script)
# and then call:
#
#   pangenome_pack <guix-pack-format> <file-extension> <name-label>
#   pangenome_write_outputs
#
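# pangenome_pack leaves its results in the shell variables
# PACK_TARGET, PACK_LABEL and PACK_HASH.  Because this file is
# sourced they are visible to the caller; they are not exported to
# the environment of child processes.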

set -eu

# Directory holding the front script.  This file is sourced, so "$0"
# names the front script rather than this library.  CDPATH is cleared
# for the `cd` so that a user-set CDPATH can neither redirect a
# relative path to some other directory nor make `cd` print the
# directory name into the captured output; `--` guards against a path
# that begins with a dash.
SCRIPT_DIR=$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)
# Root of the channel checkout: the parent of the scripts directory.
CHANNEL_DIR=$(CDPATH='' cd -- "$SCRIPT_DIR/.." && pwd)
# Destination for the copied image, md5sum.txt and the inventory.
DEST_DIR="$HOME/tmp"
mkdir -p -- "$DEST_DIR"

# Build stamp (YYYYMMDD) and the abbreviated commit (at least 8 hex
# digits) of the channel checkout; both end up in the output names.
DATE="$(date '+%Y%m%d')"
GB_HASH="$(git -C "${CHANNEL_DIR}" rev-parse --short=8 HEAD)"
# The pack is built for the host architecture (no cross-compile).
# Embed it in the filename so x86_64 / aarch64 / ... images cannot
# be confused.  Optional TUNE env var passes through to
# `guix pack --tune=...` and is appended to the arch slug.
#
# Only the psABI v-levels are accepted as TUNE values:
#
#     x86-64      baseline (any 64-bit Intel/AMD CPU)
#     x86-64-v2   Nehalem / Bulldozer (SSE4.2 + POPCNT)
#     x86-64-v3   Haswell / Zen 1     (AVX2 + BMI1/2 + FMA)
#     x86-64-v4   Skylake-SP / Zen 4  (AVX-512 F/DQ/CD/BW/VL)
#
# Microarch names like `cascadelake`, `znver3`, `skylake-avx512`
# are NOT accepted: Go (which the closure pulls in via odgi etc.)
# only understands the v-levels and `guix pack --tune=cascadelake`
# fails with "compiler go@... does not support micro-architecture
# cascadelake".  Pick the v-level whose feature set is implied by
# your target microarch (e.g. Cascade Lake/Zen 4 -> v4 because
# both have AVX-512).
ARCH=$(uname -m)
TUNE="${TUNE:-}"

# _pangenome_check_tune <tune> <host-arch>
#
# Returns 0 when <tune> is empty or is a psABI v-level that can be
# built on <host-arch>.  Otherwise explains the problem on stderr and
# returns 2.  The pack is always built for the host architecture, so
# an x86-64 v-level on a non-x86_64 host is rejected up front instead
# of producing a mislabelled file name and a late `guix pack` error.
_pangenome_check_tune () {
    case "$1" in
        "")
            return 0
            ;;
        x86-64|x86-64-v2|x86-64-v3|x86-64-v4)
            : ;;
        *)
            cat >&2 <<EOF
TUNE=$1 is not supported.  Use one of the psABI v-levels:

    x86-64      baseline (any 64-bit Intel/AMD CPU)
    x86-64-v2   Nehalem / Bulldozer (SSE4.2 + POPCNT)
    x86-64-v3   Haswell / Zen 1     (AVX2 + BMI1/2 + FMA)
    x86-64-v4   Skylake-SP / Zen 4  (AVX-512)

Microarch names (cascadelake, znver3, ...) are rejected because Go
in the closure only supports the v-levels.
EOF
            return 2
            ;;
    esac
    case "$2" in
        x86_64|amd64)
            return 0
            ;;
        *)
            echo "TUNE=$1 needs an x86_64 host, but this machine is $2 (the pack is built for the host architecture)." >&2
            return 2
            ;;
    esac
}

_pangenome_check_tune "$TUNE" "$ARCH" || exit 2

# When TUNE is set, the v-level already implies the architecture
# (x86-64-v4 only makes sense on x86_64); use it on its own to
# avoid a redundant "x86_64-x86-64-v4" slug.
ARCH_SLUG="${TUNE:-$ARCH}"

# cpu_compat
#
# Prints (without a trailing newline) a one-sentence description of
# the CPUs able to run the pack, chosen from $TUNE.  With TUNE empty
# the build is untuned, so the sentence names the host architecture
# ($ARCH, falling back to `uname -m`) instead of always claiming
# x86_64.  Prints nothing for a TUNE value it does not know.
cpu_compat () {
    # Use the function's own positional parameter as scratch space so
    # no extra global variable is created.
    set -- "${ARCH:-$(uname -m)}"
    case "${TUNE:-}" in
        "")
            case "$1" in
                x86_64|amd64)
                    printf '%s' 'Generic x86_64 -- any 64-bit Intel or AMD CPU (no AVX/AVX2/AVX-512 required).' ;;
                *)
                    printf 'Generic %s -- any %s CPU (baseline build, no --tune).' "$1" "$1" ;;
            esac
            ;;
        x86-64)    printf '%s' 'Any 64-bit Intel or AMD CPU (psABI baseline, since ~2003).' ;;
        x86-64-v2) printf '%s' 'Intel Nehalem (1st-gen Core i7) or newer / AMD Bulldozer or newer.  Requires SSE4.2 + POPCNT.' ;;
        x86-64-v3) printf '%s' 'Intel Haswell (4th-gen Core) or newer / AMD Excavator / Zen 1 or newer.  Requires AVX2 + BMI1/2 + FMA.' ;;
        x86-64-v4) printf '%s' 'Intel Skylake-SP / Cascade Lake / Ice Lake / Sapphire Rapids (Xeon Scalable, Core-X 7900X+) / AMD Zen 4 / Zen 5.  Requires AVX-512 (F/DQ/CD/BW/VL).' ;;
    esac
}

# Ask Guix which version of each headline package the channel
# provides.  `guix repl` lets us read `package-version` directly;
# `guix build -n` is avoided because it only prints the store path on
# stdout once the derivation is realized, which is fragile right
# after a channel update.
#
# The repl's stderr is discarded and its exit status ignored; a
# missing answer is caught by the emptiness check further down.
PANGENOME_VERSIONS=$(guix repl -L "$CHANNEL_DIR" -- /dev/stdin 2>/dev/null <<'SCM'
(use-modules (guix packages)
             (gn packages pangenome)
             (gn packages pangenome-rust))
(format #t "~a ~a ~a~%"
        (package-version impg)
        (package-version wfmash-0.14-snapshot)
        (package-version pggb))
SCM
) || true

# Split the first output line into the three version variables.
read IMPG_VER WFMASH_VER PGGB_VER <<EOF
$PANGENOME_VERSIONS
EOF
unset PANGENOME_VERSIONS

if [ -z "${IMPG_VER:-}" ] || [ -z "${WFMASH_VER:-}" ] || [ -z "${PGGB_VER:-}" ]; then
    echo "could not resolve package versions" >&2
    exit 1
fi

# Common part of every output file name.
VERSION_STEM="${ARCH_SLUG}-guix-bioinformatics-${GB_HASH}-impg-${IMPG_VER}-wfmash-${WFMASH_VER}-pggb-${PGGB_VER}"

# pangenome_pack <guix-pack-format> <file-extension> <name-label>
#
# Runs `guix pack` for mempang-workshop plus a small userland (bash,
# coreutils, grep, sed, gzip) and copies the result into $DEST_DIR.
#
#   $1  format passed to `guix pack -f`; "docker" additionally gets
#       an entry point and an image tag derived from $GB_HASH
#   $2  extension of the copied file
#   $3  label embedded in the copied file's name
#
# Reads: CHANNEL_DIR DEST_DIR TUNE GB_HASH VERSION_STEM DATE
# Sets:  STORE_PATH PACK_HASH PACK_LABEL PACK_TARGET
# Exits 1 when `guix pack` fails or when its last stdout line does
# not name an existing path.
pangenome_pack () {
    fmt="$1" ; ext="$2" ; label="$3"
    echo "==> building $fmt pack from $CHANNEL_DIR${TUNE:+ (tune=$TUNE)}"

    # Build the argument vector in this function's positional
    # parameters (restored on return) rather than in a string that
    # must be word-split: no dependence on IFS, no globbing.
    set -- pack -f "$fmt" --no-offload \
           -L "$CHANNEL_DIR" \
           -S /bin=bin -S /etc/profile=etc/profile
    if [ "$fmt" = "docker" ]; then
        set -- "$@" --entry-point=/bin/bash "--image-tag=pangenome-tools:$GB_HASH"
    fi
    if [ -n "$TUNE" ]; then
        set -- "$@" "--tune=$TUNE"
    fi
    set -- "$@" mempang-workshop bash coreutils grep sed gzip

    # Capture first, trim afterwards: piping guix straight into
    # `tail` would throw away guix's exit status.
    pack_output=$(guix "$@") || {
        echo "guix pack -f $fmt failed" >&2
        exit 1
    }
    # The store path is the last line guix prints on stdout.
    STORE_PATH=$(printf '%s\n' "$pack_output" | tail -n 1)
    if [ ! -e "$STORE_PATH" ]; then
        echo "guix pack -f $fmt did not produce a usable store path: $STORE_PATH" >&2
        exit 1
    fi

    # First 8 characters of the store item's base name.
    PACK_HASH=$(basename "$STORE_PATH" | cut -c1-8)
    PACK_LABEL="$label"
    PACK_TARGET="$DEST_DIR/pangenome-tools-$VERSION_STEM-$label-$DATE-$PACK_HASH.$ext"

    echo "==> copying $STORE_PATH"
    echo "    to $PACK_TARGET"
    # -L: copy the file itself if the store path is a symlink.
    cp -L -- "$STORE_PATH" "$PACK_TARGET"
    # Make the copy writable by its owner so it can be replaced later.
    chmod u+w -- "$PACK_TARGET"
}

# pangenome_write_outputs
#
# Records the image named by $PACK_TARGET in $DEST_DIR:
#
#   md5sum.txt   one "<md5>  <file>" line per image, sorted by file
#                name; an existing line for the same file is replaced
#   pangenome-tools-<arch>-guix-bioinformatics-<hash>-<date>.md
#                Markdown table (tool, version, description) of the
#                pangenome packages propagated by mempang-workshop
#
# Reads: DEST_DIR PACK_TARGET CHANNEL_DIR ARCH_SLUG GB_HASH DATE, and
#        calls cpu_compat.
# Sets:  MD5SUM_FILE LINE TMP INVENTORY TOOLS_TSV CLEAN_TSV
# Installs an EXIT trap that removes its temporary files; this
# replaces any EXIT trap already set in the calling shell.
# Exits 1 when guix reports no pangenome tool at all.
pangenome_write_outputs () {
    MD5SUM_FILE="$DEST_DIR/md5sum.txt"
    pack_name=$(basename "$PACK_TARGET")
    LINE=$(cd "$DEST_DIR" && md5sum "$pack_name")
    TMP=$(mktemp)
    trap 'rm -f "$TMP"' EXIT

    # Carry over every existing entry except the one for this file.
    # The quoted part of the case pattern is matched literally, so
    # dots and other regex characters in the name cannot make the
    # filter drop entries that belong to a different image.
    if [ -f "$MD5SUM_FILE" ]; then
        while IFS= read -r entry || [ -n "$entry" ]; do
            case "$entry" in
                *"  $pack_name") ;;
                *) printf '%s\n' "$entry" ;;
            esac
        done < "$MD5SUM_FILE" > "$TMP"
    fi
    printf '%s\n' "$LINE" >> "$TMP"
    sort -k2 "$TMP" > "$MD5SUM_FILE"

    # Inventory is identical regardless of pack format -- name it
    # by channel hash + date only.
    INVENTORY="$DEST_DIR/pangenome-tools-$ARCH_SLUG-guix-bioinformatics-$GB_HASH-$DATE.md"
    TOOLS_TSV=$(mktemp)
    CLEAN_TSV=$(mktemp)
    trap 'rm -f "$TMP" "$TOOLS_TSV" "$CLEAN_TSV"' EXIT
    guix repl -L "$CHANNEL_DIR" -- /dev/stdin > "$TOOLS_TSV" <<'SCM'
(use-modules (guix packages) (guix utils) (gn packages pangenome)
             (ice-9 format) (ice-9 regex))
;; Only keep packages defined in gn/packages/pangenome.scm or
;; gn/packages/pangenome-rust.scm -- those are the real pangenome
;; tools; everything else (libc, R, python, coreutils, ...) is
;; infrastructure that ends up in the closure but isn't user-facing.
(define pangenome-file-rx
  (make-regexp "gn/packages/pangenome(-rust)?\\.scm$"))
(define (pangenome-package? p)
  (let ((loc (package-location p)))
    (and loc (regexp-exec pangenome-file-rx (location-file loc)))))
(define seen (make-hash-table))
(define meta-packages
  '("pangenomes" "mempang-workshop-pangenomes" "mempang-workshop"))
(define (emit p)
  (when (and (pangenome-package? p)
             (not (member (package-name p) meta-packages))
             (not (hash-ref seen (package-name p))))
    (hash-set! seen (package-name p) #t)
    (format #t "~a\t~a\t~a~%"
            (package-name p) (package-version p)
            (or (package-synopsis p) ""))))
(define (expand x)
  (let ((p (if (pair? x) (cadr x) x)))
    (emit p)
    (for-each expand (package-propagated-inputs p))))
(for-each expand (package-propagated-inputs mempang-workshop))
SCM

    # Keep only lines that start like a package name.  A plain basic
    # regex in the C locale is enough; grep -P would require a grep
    # built with PCRE support.  Report an empty result explicitly
    # instead of letting `set -e` abort without a message.
    if ! LC_ALL=C grep '^[a-z0-9]' "$TOOLS_TSV" > "$CLEAN_TSV"; then
        echo "guix repl listed no pangenome tools for mempang-workshop" >&2
        exit 1
    fi

    tab=$(printf '\t')

    # Column widths: at least as wide as the header text, widened to
    # the longest cell in each column.
    NAME_W=4 ; VER_W=7 ; DESC_W=11
    while IFS=$tab read -r n v d; do
        if [ "${#n}" -gt "$NAME_W" ]; then NAME_W=${#n}; fi
        if [ "${#v}" -gt "$VER_W" ];  then VER_W=${#v};  fi
        if [ "${#d}" -gt "$DESC_W" ]; then DESC_W=${#d}; fi
    done < "$CLEAN_TSV"

    # dashes <n>: print <n> hyphens (the Markdown header rule).
    dashes () { printf '%*s' "$1" '' | tr ' ' -; }

    {
        echo "# pangenome-tools $DATE ($ARCH_SLUG, guix-bioinformatics @ $GB_HASH)"
        echo
        echo "Built from \`mempang-workshop\` in guix-bioinformatics @ $GB_HASH for $ARCH_SLUG."
        echo
        echo "**CPU compatibility:** $(cpu_compat)"
        echo
        printf "| %-${NAME_W}s | %-${VER_W}s | %-${DESC_W}s |\n" \
               "Tool" "Version" "Description"
        printf "| %s | %s | %s |\n" \
               "$(dashes "$NAME_W")" "$(dashes "$VER_W")" "$(dashes "$DESC_W")"
        while IFS=$tab read -r n v d; do
            printf "| %-${NAME_W}s | %-${VER_W}s | %-${DESC_W}s |\n" "$n" "$v" "$d"
        done < "$CLEAN_TSV"
    } > "$INVENTORY"

    # Remove the scratch files now so repeated calls do not leak
    # them; the EXIT trap still covers the early-exit paths.
    rm -f "$TMP" "$TOOLS_TSV" "$CLEAN_TSV"
}