# KV Cache Step-by-Step
# Annotated reading material. Running this file is optional.
# Source-of-truth focus: Read prefill and decode as two phases of the same cache contract.

# Prefill phase: cache one K/V entry per prompt token, in prompt order.
kv_cache = []
for prompt_token in ("A", "B", "C"):
    kv_cache += [f"K,V({prompt_token})"]  # in-place extend, same list object

# Decode phase: exactly one new token is processed.
new_token = "D"
query = f"Q({new_token})"
# `context` is built before the new entry is stored, so the count it reports
# covers only the entries already cached (the three prompt tokens).
context = f"attention({query}, cached={len(kv_cache)} tokens)"
# Store the new token's K/V last, so the cache ends with four entries.
kv_cache += [f"K,V({new_token})"]

# What to explain while reading:
# - prefill writes K/V for the prompt.
# - decode_step appends one new K/V per layer (this sketch keeps a single
#   list, so that is one new entry per decoded token).
# - The new query reads all cached K/V instead of recomputing history.
#
# Common traps (each bullet states the correct view, not the mistake):
# - KV cache is mainly an inference serving concern.
# - Weight quantization does not automatically shrink KV cache.