#!/usr/bin/env bash
set -euo pipefail

cat <<'COMMANDS'
# Reading note: this file is a command map, not a requirement to profile now.
# Read each command as "which layer of the stack would this tool explain?"

# Edit these placeholders only if you later turn this reading lab into a real
# profiling session.
TRAIN_SCRIPT="train.py"
KERNEL_BENCH="kernel_bench.py"
CONFIG="config.yaml"
NPROC=8

# 1. torchrun starts the workload shape. In a DDP/FSDP job, this is where the
# process count, rank environment variables and communication group topology
# enter the program.
torchrun --nproc-per-node=${NPROC} ${TRAIN_SCRIPT} --config ${CONFIG}

# 2. Nsight Systems answers timeline questions: Are there CPU gaps? Are kernels
# serialized? Are H2D/D2H copies visible? Is NCCL overlapping with compute?
nsys profile \
  --trace=cuda,nvtx,osrt \
  --output=reports/nsys_baseline \
  python ${TRAIN_SCRIPT} --config ${CONFIG}

# 3. Nsight Compute answers kernel questions: Is this kernel memory-bound,
# compute-bound, low-occupancy, or stalled on memory dependencies?
ncu --set full \
  --target-processes all \
  python ${KERNEL_BENCH}

# 4. The method is baseline -> hypothesis -> one change -> remeasure. Changing
# several variables at once makes the trace hard to explain.
COMMANDS

echo
echo "This starter only prints command templates. Edit variables and run commands manually."
