model:
  # name_or_path identifies the model under test. In a reading lab, treat it
  # as the source of weight size, layer count, hidden shape and tokenizer
  # behavior. Replace the placeholder before running.
  name_or_path: "REPLACE_WITH_MODEL"
  # The tokenizer is configured separately from the weights.
  # NOTE(review): presumably this should be the tokenizer that belongs to
  # name_or_path, otherwise token counts will not match -- confirm.
  tokenizer: "REPLACE_WITH_TOKENIZER"
  # dtype changes weight memory and often the kernel path. Do not assume a
  # lower precision helps unless the serving engine has optimized kernels.
  # NOTE(review): some engines spell this value "bfloat16" rather than
  # "bf16" -- confirm what the chosen engine accepts.
  dtype: "bf16"
  # No quantization scheme is selected.
  # NOTE(review): presumably null means weights are served at `dtype` as-is;
  # verify how the harness interprets null.
  quantization: null

engine:
  # This does not assume vLLM is installed. The fields represent common serving
  # knobs: engine choice, tensor parallel degree, context limit and memory cap.
  # Replace the placeholder with the engine actually being measured.
  name: "REPLACE_WITH_ENGINE"
  # Tensor parallel degree.
  # NOTE(review): presumably the number of devices the model is sharded
  # across; 1 would then mean a single device -- confirm for the chosen engine.
  tensor_parallel_size: 1
  # Context limit in tokens.
  # NOTE(review): in vLLM-style engines this bounds prompt plus generated
  # tokens per request, so every workload's input_tokens + output_tokens
  # should stay at or below it -- confirm for the engine actually used.
  max_model_len: 4096
  # Memory cap.
  # NOTE(review): presumably the fraction of each device's memory the engine
  # may claim for weights plus KV Cache -- confirm the engine's definition.
  gpu_memory_utilization: 0.90

workloads:
  # Each entry describes one traffic shape: prompt length, generation length
  # and a list of concurrency levels (presumably swept one at a time --
  # confirm against the harness). Keep input_tokens + output_tokens within
  # engine.max_model_len, otherwise the request cannot fit the context limit.
  #
  # input_tokens mostly affects prefill and TTFT because the model processes the
  # prompt tokens to create initial KV Cache entries.
  - name: "short-short"
    input_tokens: 128
    output_tokens: 64
    concurrency: [1, 4, 8, 16]
  # long-short stresses prefill and KV capacity more than decode length.
  # input_tokens is 4032 rather than 4096 so that prompt plus output
  # (4032 + 64 = 4096) fits the engine.max_model_len of 4096 set in this file.
  # Lower it further if the engine adds special or template tokens to prompts.
  - name: "long-short"
    input_tokens: 4032
    output_tokens: 64
    concurrency: [1, 4, 8]
  # short-long stresses decode scheduling because each output token repeatedly
  # reads prior KV Cache and advances the batch one token at a time.
  - name: "short-long"
    input_tokens: 128
    output_tokens: 1024
    concurrency: [1, 4, 8]

metrics:
  # Read these as different views of serving behavior, not interchangeable
  # "speed" numbers. TTFT is first-token latency; TPOT is decode pace; QPS is
  # request throughput; tokens_per_second is token throughput; tails show user
  # experience under queueing and variance.
  - ttft_ms
  - tpot_ms
  - qps
  - tokens_per_second
  # Percentile latencies in milliseconds (median and tails).
  # NOTE(review): this file does not say which latency they summarize
  # (end-to-end, TTFT or TPOT) -- confirm against the harness.
  - p50_ms
  - p95_ms
  - p99_ms
  # NOTE(review): presumably peak device memory in GB during the run --
  # confirm whether this is per device or summed across devices.
  - memory_peak_gb
  # NOTE(review): presumably the load point at which the run hits
  # out-of-memory -- confirm how the harness defines and records it.
  - oom_boundary

notes:
  # Run controls intended to keep results comparable between runs.
  # NOTE(review): how the harness enforces these two flags is not visible in
  # this file -- confirm.
  fixed_sampling_parameters: true
  fixed_prompt_distribution: true
  # NOTE(review): presumably warmup requests are issued first and excluded
  # from reported metrics, and measurement_requests is the sample size per
  # workload/concurrency point -- confirm. If so, with 200 samples p99 is
  # set by roughly the two slowest requests, so treat it as noisy.
  warmup_requests: 20
  measurement_requests: 200
