From 5f59ddb9cfc2d2f82f7832671108e1564c574235 Mon Sep 17 00:00:00 2001 From: Yiou Li Date: Thu, 9 Apr 2026 18:19:28 -0700 Subject: [PATCH] update eval-driven-dev skill (#1352) * update eval-driven-dev skill * small refinement of skill description * address review, rerun npm start. --- docs/README.skills.md | 2 +- skills/eval-driven-dev/SKILL.md | 343 ++--------- .../references/1-a-entry-point.md | 68 +++ .../references/1-b-eval-criteria.md | 82 +++ .../references/2-wrap-and-trace.md | 260 +++++++++ .../references/3-define-evaluators.md | 161 ++++++ .../references/4-build-dataset.md | 228 ++++++++ .../eval-driven-dev/references/5-run-tests.md | 79 +++ .../{investigation.md => 6-investigate.md} | 80 ++- .../references/dataset-generation.md | 235 -------- .../eval-driven-dev/references/eval-tests.md | 241 -------- .../eval-driven-dev/references/evaluators.md | 531 ++++++++++++++++++ .../references/instrumentation.md | 174 ------ .../eval-driven-dev/references/pixie-api.md | 257 --------- .../references/run-harness-patterns.md | 281 --------- .../eval-driven-dev/references/testing-api.md | 367 ++++++++++++ .../references/understanding-app.md | 201 ------- skills/eval-driven-dev/references/wrap-api.md | 255 +++++++++ skills/eval-driven-dev/resources/setup.sh | 43 ++ 19 files changed, 2180 insertions(+), 1708 deletions(-) create mode 100644 skills/eval-driven-dev/references/1-a-entry-point.md create mode 100644 skills/eval-driven-dev/references/1-b-eval-criteria.md create mode 100644 skills/eval-driven-dev/references/2-wrap-and-trace.md create mode 100644 skills/eval-driven-dev/references/3-define-evaluators.md create mode 100644 skills/eval-driven-dev/references/4-build-dataset.md create mode 100644 skills/eval-driven-dev/references/5-run-tests.md rename skills/eval-driven-dev/references/{investigation.md => 6-investigate.md} (62%) delete mode 100644 skills/eval-driven-dev/references/dataset-generation.md delete mode 100644 
skills/eval-driven-dev/references/eval-tests.md create mode 100644 skills/eval-driven-dev/references/evaluators.md delete mode 100644 skills/eval-driven-dev/references/instrumentation.md delete mode 100644 skills/eval-driven-dev/references/pixie-api.md delete mode 100644 skills/eval-driven-dev/references/run-harness-patterns.md create mode 100644 skills/eval-driven-dev/references/testing-api.md delete mode 100644 skills/eval-driven-dev/references/understanding-app.md create mode 100644 skills/eval-driven-dev/references/wrap-api.md create mode 100755 skills/eval-driven-dev/resources/setup.sh diff --git a/docs/README.skills.md b/docs/README.skills.md index 521fb077..780afeb8 100644 --- a/docs/README.skills.md +++ b/docs/README.skills.md @@ -131,7 +131,7 @@ See [CONTRIBUTING.md](../CONTRIBUTING.md#adding-skills) for guidelines on how to | [ef-core](../skills/ef-core/SKILL.md) | Get best practices for Entity Framework Core | None | | [email-drafter](../skills/email-drafter/SKILL.md) | Draft and review professional emails that match your personal writing style. Analyzes your sent emails for tone, greeting, structure, and sign-off patterns via WorkIQ, then generates context-aware drafts for any recipient. USE FOR: draft email, write email, compose email, reply email, follow-up email, analyze email tone, email style. | None | | [entra-agent-user](../skills/entra-agent-user/SKILL.md) | Create Agent Users in Microsoft Entra ID from Agent Identities, enabling AI agents to act as digital workers with user identity capabilities in Microsoft 365 and Azure environments. | None | -| [eval-driven-dev](../skills/eval-driven-dev/SKILL.md) | Set up eval-based QA for Python LLM applications: instrument the app, build golden datasets, write and run eval tests, and iterate on failures. 
ALWAYS USE THIS SKILL when the user asks to set up QA, add tests, add evals, evaluate, benchmark, fix wrong behaviors, improve quality, or do quality assurance for any Python project that calls an LLM model. | `references/dataset-generation.md`
`references/eval-tests.md`
`references/instrumentation.md`
`references/investigation.md`
`references/pixie-api.md`
`references/run-harness-patterns.md`
`references/understanding-app.md` | +| [eval-driven-dev](../skills/eval-driven-dev/SKILL.md) | Set up eval-based QA for Python LLM applications: instrument the app, build golden datasets, write and run eval tests, and iterate on failures. ALWAYS USE THIS SKILL when the user asks to set up QA, add tests, add evals, evaluate, benchmark, fix wrong behaviors, improve quality, or do quality assurance for any Python project that calls an LLM model. | `references/1-a-entry-point.md`
`references/1-b-eval-criteria.md`
`references/2-wrap-and-trace.md`
`references/3-define-evaluators.md`
`references/4-build-dataset.md`
`references/5-run-tests.md`
`references/6-investigate.md`
`references/evaluators.md`
`references/testing-api.md`
`references/wrap-api.md`
`resources` | | [excalidraw-diagram-generator](../skills/excalidraw-diagram-generator/SKILL.md) | Generate Excalidraw diagrams from natural language descriptions. Use when asked to "create a diagram", "make a flowchart", "visualize a process", "draw a system architecture", "create a mind map", or "generate an Excalidraw file". Supports flowcharts, relationship diagrams, mind maps, and system architecture diagrams. Outputs .excalidraw JSON files that can be opened directly in Excalidraw. | `references/element-types.md`
`references/excalidraw-schema.md`
`scripts/.gitignore`
`scripts/README.md`
`scripts/add-arrow.py`
`scripts/add-icon-to-diagram.py`
`scripts/split-excalidraw-library.py`
`templates` | | [fabric-lakehouse](../skills/fabric-lakehouse/SKILL.md) | Use this skill to get context about Fabric Lakehouse and its features for software systems and AI-powered functions. It offers descriptions of Lakehouse data components, organization with schemas and shortcuts, access control, and code examples. This skill supports users in designing, building, and optimizing Lakehouse solutions using best practices. | `references/getdata.md`
`references/pyspark.md` | | [fedora-linux-triage](../skills/fedora-linux-triage/SKILL.md) | Triage and resolve Fedora issues with dnf, systemd, and SELinux-aware guidance. | None | diff --git a/skills/eval-driven-dev/SKILL.md b/skills/eval-driven-dev/SKILL.md index 498bca26..71da823c 100644 --- a/skills/eval-driven-dev/SKILL.md +++ b/skills/eval-driven-dev/SKILL.md @@ -1,14 +1,16 @@ --- name: eval-driven-dev description: > - Set up eval-based QA for Python LLM applications: instrument the app, build golden datasets, - write and run eval tests, and iterate on failures. + Set up eval-based QA for Python LLM applications: instrument the app, + build golden datasets, write and run eval tests, and iterate on failures. ALWAYS USE THIS SKILL when the user asks to set up QA, add tests, add evals, evaluate, benchmark, fix wrong behaviors, improve quality, or do quality assurance for any Python project that calls an LLM model. license: MIT compatibility: Python 3.11+ metadata: - version: 0.2.0 + version: 0.6.1 + pixie-qa-version: ">=0.6.1,<0.7.0" + pixie-qa-source: https://github.com/yiouli/pixie-qa/ --- # Eval-Driven Development for Python LLM Applications @@ -17,7 +19,7 @@ You're building an **automated QA pipeline** that tests a Python application end **What you're testing is the app itself** — its request handling, context assembly (how it gathers data, builds prompts, manages conversation state), routing, and response formatting. The app uses an LLM, which makes outputs non-deterministic — that's why you use evaluators (LLM-as-judge, similarity scores) instead of `assertEqual` — but the thing under test is the app's code, not the LLM. -**What's in scope**: the app's entire code path from entry point to response — never mock or skip any part of it. **What's out of scope**: external data sources the app reads from (databases, caches, third-party APIs, voice streams) — mock these to control inputs and reduce flakiness. 
+During evaluation, the app's own code runs for real — routing, prompt assembly, LLM calls, response formatting — nothing is mocked or stubbed. But the data the app reads from external sources (databases, caches, third-party APIs, voice streams) is replaced with test-specified values via instrumentation. This means each test case controls exactly what data the app sees, while still exercising the full application code path. **The deliverable is a working `pixie test` run with real scores** — not a plan, not just instrumentation, not just a dataset. @@ -27,352 +29,119 @@ This skill is about doing the work, not describing it. Read code, edit files, ru ## Before you start -Run the following to keep the skill and package up to date. If any command fails or is blocked by the environment, continue — do not let failures here block the rest of the workflow. - -**Update the skill:** - -```bash -npx skills update -``` - -**Upgrade the `pixie-qa` package** - -Make sure the python virtual environment is active and use the project's package manager: - -```bash -# uv project (uv.lock exists): -uv add pixie-qa --upgrade - -# poetry project (poetry.lock exists): -poetry add pixie-qa@latest - -# pip / no lock file: -pip install --upgrade pixie-qa -``` +**First, activate the virtual environment.** Identify the correct virtual environment for the project and activate it. Once it is active, run the setup.sh script included in the skill's resources. +The script updates the `eval-driven-dev` skill and the `pixie-qa` Python package to the latest versions, initializes the pixie working directory if it's not already initialized, and starts a web server in the background to show updates to the user. If the skill or package update fails, continue — do not let these failures block the rest of the workflow. --- ## The workflow -Follow Steps 1–5 straight through without stopping. Do not ask the user for confirmation at intermediate steps — verify each step yourself and continue.
+Follow Steps 1–6 straight through without stopping. Do not ask the user for confirmation at intermediate steps — verify each step yourself and continue. -**Two modes:** +**How to work — read this before doing anything else:** -- **Setup** ("set up evals", "add tests", "set up QA"): Complete Steps 1–5. After the test run, report results and ask whether to iterate. -- **Iteration** ("fix", "improve", "debug"): Complete Steps 1–5 if not already done, then do one round of Step 6. +- **One step at a time.** Read only the current step's instructions. Do NOT read Steps 2–6 while working on Step 1. +- **Read references only when a step tells you to.** Each step names a specific reference file. Read it when you reach that step — not before. +- **Create artifacts immediately.** After reading code for a sub-step, write the output file for that sub-step before moving on. Don't accumulate understanding across multiple sub-steps before writing anything. +- **Verify, then move on.** Each step has a checkpoint. Verify it, then proceed to the next step. Don't plan future steps while verifying the current one. -If ambiguous: default to setup. +**Run Steps 1–6 in sequence.** If the user's prompt makes it clear that earlier steps are already done (e.g., "run the existing tests", "re-run evals"), skip to the appropriate step. When in doubt, start from Step 1. --- ### Step 1: Understand the app and define eval criteria -Read the source code to understand: +**First, check the user's prompt for specific requirements.** Before reading app code, examine what the user asked for: -1. **How it runs** — entry point, startup, config/env vars -2. **The real entry point** — how a real user invokes the app (HTTP endpoint, CLI, function call). This is what the eval must exercise — not an inner function that bypasses the request pipeline. -3. **The request pipeline** — trace the full path from entry point to response. 
What middleware, routing, state management, prompt assembly, retrieval, or formatting happens along the way? All of this is under test. -4. **External dependencies (both directions)** — identify every external system the app talks to (databases, APIs, caches, queues, file systems, speech services). For each, understand: - - **Data flowing IN** (external → app): what data does the app read from this system? What shapes, types, realistic values? You'll make up this data for eval scenarios. - - **Data flowing OUT** (app → external): what does the app write, send, or mutate in this system? These are side-effects that evaluations may need to verify (e.g., "did the app create the right calendar entry?", "did it send the correct transfer request?"). - - **How to mock it** — look for abstract base classes, protocols, or constructor-injected backends (e.g., `TranscriptionBackend`, `SynthesisBackend`, `StorageBackend`). These are testability seams — you'll create mock implementations of these interfaces. If there's no clean interface, you'll use `unittest.mock.patch` at the module boundary. -5. **Use cases** — distinct scenarios, what good/bad output looks like +- **Referenced documents or specs**: Does the prompt mention a file to follow (e.g., "follow the spec in EVAL_SPEC.md", "use the methodology in REQUIREMENTS.md")? If so, **read that file first** — it may specify datasets, evaluation dimensions, pass criteria, or methodology that override your defaults. +- **Specified datasets or data sources**: Does the prompt reference specific data files (e.g., "use questions from eval_inputs/research_questions.json", "use the scenarios in call_scenarios.json")? If so, **read those files** — you must use them as the basis for your eval dataset, not fabricate generic alternatives. 
+- **Specified evaluation dimensions**: Does the prompt name specific quality aspects to evaluate (e.g., "evaluate on factuality, completeness, and bias", "test identity verification and tool call correctness")? If so, **every named dimension must have a corresponding evaluator** in your test file. -Read `references/understanding-app.md` for detailed guidance on mapping data flows and the MEMORY.md template. +If the prompt specifies any of the above, they take priority. Read and incorporate them before proceeding. -Write your findings to `pixie_qa/MEMORY.md` before moving on. Include: +Step 1 has two sub-steps. Each reads its own reference file and produces its own output file. **Complete each sub-step fully before starting the next.** -- The entry point and the full request pipeline -- Every external dependency, what it provides/receives, and how you'll mock it -- The testability seams (pluggable interfaces, patchable module-level objects) +#### Sub-step 1a: Entry point & execution flow -Determine **high-level, application-specific eval criteria**: +> **Reference**: Read `references/1-a-entry-point.md` now. -**Good criteria are specific to the app's purpose.** Examples: +Read the source code to understand how the app starts and how a real user invokes it. Write your findings to `pixie_qa/01-entry-point.md` before moving on. -- Voice customer support agent: "Does the agent verify the caller's identity before transferring?", "Are responses concise enough for phone conversation (under 3 sentences)?", "Does the agent route to the correct department based on the caller's request?" -- Research report generator: "Does the report address all sub-questions in the query?", "Are claims supported by the retrieved sources?", "Is the report structured with clear sections?" -- RAG chatbot: "Are answers grounded in the retrieved context?", "Does it say 'I don't know' when the context doesn't contain the answer?" 
+> **Checkpoint**: `pixie_qa/01-entry-point.md` written with entry point, execution flow, user-facing interface, and env requirements. -**Bad criteria are generic evaluator names dressed up as requirements.** Don't say "Factual accuracy" or "Response relevance" — say what factual accuracy or relevance means for THIS app. +#### Sub-step 1b: Eval criteria -At this stage, don't pick evaluator classes or thresholds. That comes later in Step 5, after you've seen the real data shape. +> **Reference**: Read `references/1-b-eval-criteria.md` now. -Record the criteria in `pixie_qa/MEMORY.md` and continue. +Define the app's use cases and eval criteria. Use cases drive dataset creation (Step 4); eval criteria drive evaluator selection (Step 3). Write your findings to `pixie_qa/02-eval-criteria.md` before moving on. -> **Checkpoint**: MEMORY.md written with app understanding + eval criteria. Proceed to Step 2. +> **Checkpoint**: `pixie_qa/02-eval-criteria.md` written with use cases, eval criteria, and their applicability scope. Do NOT read Step 2 instructions yet. --- -### Step 2: Instrument and observe a real run +### Step 2: Instrument with `wrap` and capture a reference trace -**Why this step**: You need to see the actual data flowing through the app before you can build anything. This step serves two goals: +> **Reference**: Read `references/2-wrap-and-trace.md` now for the detailed sub-steps. -1. **Learn the data shapes** — what data flows in from external dependencies, and what side-effects flow out? What types, structures, realistic values? You'll need to make up this data for eval scenarios later. -2. **Verify instrumentation captures what evaluators need** — do the traces contain the data required to assess each eval criterion from Step 1? If a criterion is "does the agent route to the correct department," the trace must capture the routing decision. +**Goal**: Make the app testable by controlling its external data and capturing its outputs. 
`wrap()` calls at data boundaries let the test harness inject controlled inputs (replacing real DB/API calls) and capture outputs for scoring. The `Runnable` class provides the lifecycle interface that `pixie test` uses to set up, invoke, and tear down the app. A reference trace captured with `pixie trace` proves the instrumentation works and provides the exact data shapes needed for dataset creation in Step 4. -**This is a normal app run with instrumentation — no mocks, no patches.** - -#### 2a. Decide what to instrument - -This is a reasoning step, not a coding step. Look at your eval criteria from Step 1 and your understanding of the codebase, and determine what data the evaluators will need: - -- **For each eval criterion**, ask: what observable data would prove this criterion is met or violated? -- **Map that data to code locations** — which functions produce, consume, or transform that data? -- **Those functions need `@observe`** — so their inputs and outputs are captured in traces. - -Examples: - -| Eval criterion | Data needed | What to instrument | -| ------------------------------------------ | -------------------------------------------------- | ------------------------------------------------------------ | -| "Routes to correct department" | The routing decision (which department was chosen) | The routing/dispatch function | -| "Responses grounded in retrieved context" | The retrieved documents + the final response | The retrieval function AND the response function | -| "Verifies caller identity before transfer" | Whether identity check happened, transfer decision | The identity verification function AND the transfer function | -| "Concise phone-friendly responses" | The final response text | The function that produces the LLM response | - -**LLM provider calls (OpenAI, Anthropic, etc.) 
are auto-captured** — `enable_storage()` activates OpenInference instrumentors that automatically trace every LLM API call with full input messages, output messages, token usage, and model parameters. You do NOT need `@observe` on the function that calls `client.chat.completions.create()` just to see the LLM interaction. - -**Use `@observe` for application-level functions** whose inputs, outputs, or intermediate states your evaluators need but that aren't visible from the LLM call alone. Examples: the app's entry-point function (to capture what the user sent and what the app returned), retrieval functions (to capture what context was fetched), routing functions (to capture dispatch decisions). - -`enable_storage()` goes at application startup. Read `references/instrumentation.md` for the full rules, code patterns, and anti-patterns for adding instrumentation. - -#### 2b. Add instrumentation and run the app - -Add `@observe` to the functions you identified in 2a. Then run the app normally — with its real external dependencies, or by manually interacting with it — to produce a **reference trace**. Do NOT mock or patch anything. This is an observation run. - -If the app can't run without infrastructure you don't have (a real database, third-party service credentials, etc.), use the simplest possible approach to get it running — a local Docker container, a test account, or ask the user for help. The goal is one real trace. - -```bash -uv run pixie trace list -uv run pixie trace last -``` - -#### 2c. Examine the reference trace - -Study the trace data carefully. This is your blueprint for everything that follows. Document: - -1. **Data from external dependencies (inbound)** — What did the app read from databases, APIs, caches? What are the shapes, types, and realistic value ranges? This is what you'll make up in eval_input for the dataset. -2. **Side-effects (outbound)** — What did the app write to, send to, or mutate in external systems? 
These need to be captured by mocks and may be part of eval_output for verification. -3. **Intermediate states** — What did the instrumentation capture beyond the final output? Tool calls, retrieved documents, routing decisions? Are these sufficient to evaluate every criterion from Step 1? -4. **The eval_input / eval_output structure** — What does the `@observe`-decorated function receive as input and produce as output? Note the exact field names, types, and nesting. - -**Check instrumentation completeness**: For each eval criterion from Step 1, verify the trace contains the data needed to evaluate it. If not, add more `@observe` decorators and re-run. - -**Do not proceed until you understand the data shape and have confirmed the traces capture everything your evaluators need.** - -> **Checkpoint**: Instrumentation added based on eval criteria. Reference trace captured with real data. For each criterion, confirm the trace contains the data needed to evaluate it. Proceed to Step 3. +> **Checkpoint**: `pixie_qa/scripts/run_app.py` written and verified. `pixie_qa/reference-trace.jsonl` exists and all expected data points appear when formatted with `pixie format`. Do NOT read Step 3 instructions yet. --- -### Step 3: Write a utility function to run the full app end-to-end +### Step 3: Define evaluators -**Why this step**: You need a function that test cases can call. Given an eval_input (app input + mock data for external dependencies), it starts the real application with external dependencies patched, sends the input through the app's real entry point, and returns the eval_output (app response + captured side-effects). +> **Reference**: Read `references/3-define-evaluators.md` now for the detailed sub-steps. -#### The contract +**Goal**: Turn the qualitative eval criteria from Step 1b into concrete, runnable scoring functions. Each criterion maps to either a built-in evaluator or a custom one you implement. 
The evaluator mapping artifact bridges between criteria and the dataset, ensuring every quality dimension has a scorer. -``` -run_app(eval_input) → eval_output -``` - -- **eval_input** = application input (what the user sends) + data from external dependencies (what databases/APIs would return) -- **eval_output** = application output (what the user sees) + captured side-effects (what the app wrote to external systems, captured by mocks) + captured intermediate states (tool calls, routing decisions, etc., captured by instrumentation) - -#### How to implement - -1. **Patch external dependencies** — use the mocking plan from Step 1 item 4. For each external dependency, either inject a mock implementation of its interface (cleanest) or `unittest.mock.patch` the module-level client. The mock returns data from eval_input and captures side-effects for eval_output. - -2. **Call the app through its real entry point** — the same way a real user or client would invoke it. Look at how the app is started: if it's a web server (FastAPI, Flask), use `TestClient` or HTTP requests. If it's a CLI, use subprocess. If it's a standalone function with no server or middleware, import and call it directly. - -3. **Collect the response** — the app's output becomes eval_output, along with any side-effects captured by mock objects. - -Read `references/run-harness-patterns.md` for concrete examples of entry point invocation for different app types. - -**Do NOT call an inner function** like `agent.respond()` directly just because it's simpler. The whole point is to test the app's real code path — request handling, state management, prompt assembly, routing. When you call an inner function directly, you skip all of that, and the test has to reimplement it. Now you're testing test code, not app code. - -#### Verify - -Take the eval_input from your Step 2 reference trace and feed it to the utility function. 
The outputs won't match word-for-word (non-deterministic), but verify: - -- **Same structure** — same fields present, same types, same nesting -- **Same code path** — same routing decisions, same intermediate states captured -- **Sensible values** — eval_output fields have real, meaningful data (not null, not empty, not error messages) - -**If it fails after two attempts**, stop and ask the user for help. - -> **Checkpoint**: Utility function implemented and verified. When fed the reference trace's eval_input, it produces eval_output with the same structure and exercises the same code path. Proceed to Step 4. +> **Checkpoint**: All evaluators implemented. `pixie_qa/03-evaluator-mapping.md` written with criterion-to-evaluator mapping. Do NOT read Step 4 instructions yet. --- ### Step 4: Build the dataset -**Why this step**: The dataset is a collection of eval_input items (made up by you) that define the test scenarios. Each item may also carry case-specific expectations. The eval_output is NOT pre-populated in the dataset — it's produced at test time by the utility function from Step 3. +> **Reference**: Read `references/4-build-dataset.md` now for the detailed sub-steps. -#### 4a. Determine verification and expectations +**Goal**: Create the test scenarios that tie everything together — the runnable (Step 2), the evaluators (Step 3), and the use cases (Step 1b). Each dataset entry defines what to send to the app, what data the app should see from external services, and how to score the result. Use the reference trace from Step 2 as the source of truth for data shapes and field names. -Before generating data, decide how each eval criterion from Step 1 will be checked. - -**Examine the reference trace from Step 2** and identify: - -- **Structural constraints** you can verify with code — JSON schema, required fields, value types, enum ranges, string length bounds. These become validation checks on your generated eval_inputs. 
-- **Semantic constraints** that require judgment — "the mock customer profile should be realistic", "the conversation history should be topically coherent". Apply these yourself when crafting the data. -- **Which criteria are universal vs. case-specific**: - - **Universal criteria** apply to ALL test cases the same way → implement in the test function (e.g., "responses must be under 3 sentences", "must not hallucinate information not in context") - - **Case-specific criteria** vary per test case → carry as `expected_output` in the dataset item (e.g., "should mention the caller's appointment on Tuesday", "should route to billing department") - -#### 4b. Generate eval_input items - -Create eval_input items that match the data shape from the reference trace: - -- **Application inputs** (user queries, requests) — make these up to cover the scenarios you identified in Step 1 -- **External dependency data** (database records, API responses, cache entries) — make these up in the exact shape you observed in the reference trace - -Each dataset item contains: - -- `eval_input`: the made-up input data (app input + external dependency data) -- `expected_output`: case-specific expectation text (optional — only for test cases with expectations beyond the universal criteria). This is a reference for evaluation, not an exact expected answer. - -At test time, `eval_output` is produced by the utility function from Step 3 and is not stored in the dataset itself. -Read `references/dataset-generation.md` for the dataset creation API, data shape matching, expected_output strategy, and validation checklist. - -#### 4c. Validate the dataset - -After building: - -1. **Execute `build_dataset.py`** — don't just write it, run it -2. **Verify structural constraints** — each eval_input matches the reference trace's schema (same fields, same types) -3. **Verify diversity** — items have meaningfully different inputs, not just minor variations -4. 
**Verify case-specific expectations** — `expected_output` values are specific and testable, not vague -5. For conversational apps, include items with conversation history - -> **Checkpoint**: Dataset created with diverse eval_inputs matching the reference trace's data shape. Proceed to Step 5. +> **Checkpoint**: Dataset JSON created at `pixie_qa/datasets/.json` with diverse entries covering all use cases. Do NOT read Step 5 instructions yet. --- -### Step 5: Write and run eval tests +### Step 5: Run evaluation-based tests -**Why this step**: With the utility function built and the dataset ready, writing tests is straightforward — wire up the function, choose evaluators for each criterion, and run. +> **Reference**: Read `references/5-run-tests.md` now for the detailed sub-steps. -#### 5a. Map criteria to evaluators +**Goal**: Execute the full pipeline end-to-end and verify it produces real scores. This step is about getting the machinery running — fixing any setup or data issues until every dataset entry runs and gets scored. Once tests produce results, run `pixie analyze` for pattern analysis. -For each eval criterion from Step 1, decide how to evaluate it: - -- **Can it be checked with a built-in evaluator?** (factual correctness → `FactualityEval`, exact match → `ExactMatchEval`, RAG faithfulness → `FaithfulnessEval`) -- **Does it need a custom evaluator?** Most app-specific criteria do — use `create_llm_evaluator` with a prompt that operationalizes the criterion. -- **Is it universal or case-specific?** Universal criteria go in the test function. Case-specific criteria use `expected_output` from the dataset. - -For open-ended LLM text, **never** use `ExactMatchEval` — LLM outputs are non-deterministic. - -`AnswerRelevancyEval` is **RAG-only** — it requires a `context` value in the trace. Returns 0.0 without it. For general relevance without RAG, use `create_llm_evaluator` with a custom prompt. 
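To make the custom-evaluator path concrete: below is a minimal sketch of the two deterministic halves of an LLM-as-judge evaluator — building a judging prompt that operationalizes one app-specific criterion, and parsing the judge's reply into a score. This is illustrative plain Python, not the pixie `create_llm_evaluator` API; the actual judge model call is omitted and would use whatever LLM client the project already has.

```python
import json
import re


def build_judge_prompt(criterion: str, eval_input: str, eval_output: str) -> str:
    """Turn an app-specific criterion into an explicit judging instruction."""
    return (
        "You are grading an application's output.\n"
        f"Criterion: {criterion}\n"
        f"Input: {eval_input}\n"
        f"Output: {eval_output}\n"
        'Reply with JSON: {"score": <0.0-1.0>, "reason": "<one sentence>"}'
    )


def parse_judge_reply(reply: str) -> float:
    """Extract the numeric score from the judge's JSON reply; default to 0.0."""
    match = re.search(r"\{.*\}", reply, re.DOTALL)
    if not match:
        return 0.0
    try:
        return float(json.loads(match.group(0)).get("score", 0.0))
    except (ValueError, json.JSONDecodeError):
        return 0.0
```

Note that a malformed or score-less judge reply scores 0.0 rather than raising, so a flaky judge fails loudly in the scorecard instead of silently passing.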
- -Read `references/eval-tests.md` for the evaluator catalog, custom evaluator examples, and the test file boilerplate. - -#### 5b. Write the test file and run - -The test file wires together: a `runnable` (calls your utility function from Step 3), a reference to the dataset, and the evaluators you chose. - -Read `references/eval-tests.md` for the exact `assert_dataset_pass` API, required parameter names, and common mistakes to avoid. **Re-read the API reference immediately before writing test code** — do not rely on earlier context. - -Run with `pixie test` — not `pytest`: - -```bash -uv run pixie test pixie_qa/tests/ -v -``` - -**After running, verify the scorecard:** - -1. Shows "N/M tests passed" with real numbers -2. Does NOT say "No assert_pass / assert_dataset_pass calls recorded" (that means missing `await`) -3. Per-evaluator scores appear with real values - -A test that passes with no recorded evaluations is worse than a failing test — it gives false confidence. Debug until real scores appear. - -> **Checkpoint**: Tests run and produce real scores. +> **Checkpoint**: Tests run and produce real scores. Analysis generated. > -> - **Setup mode**: Report results ("QA setup is complete. Tests show N/M passing.") and ask: "Want me to investigate the failures and iterate?" Stop here unless the user says yes. -> - **Iteration mode**: Proceed directly to Step 6. +> If the test errors out, that's a setup bug — fix and re-run. But if tests produce real pass/fail scores, that's the deliverable. > -> If the test errors out (import failures, missing keys), that's a setup bug — fix and re-run. But if tests produce real pass/fail scores, that's the deliverable. +> **STOP GATE — read this before doing anything else after tests produce scores:** +> +> - If the user's original prompt asks only for setup ("set up QA", "add tests", "add evals", "set up evaluations"), **STOP HERE**. Report the test results to the user: "QA setup is complete. Tests show N/M passing. 
[brief summary]. Want me to investigate the failures and iterate?" Do NOT proceed to Step 6. +> - If the user's original prompt explicitly asks for iteration ("fix", "improve", "debug", "iterate", "investigate failures", "make tests pass"), proceed to Step 6. --- ### Step 6: Investigate and iterate -**Iteration mode only, or after the user confirmed in setup mode.** - -When tests fail, understand _why_ — don't just adjust thresholds until things pass. - -Read `references/investigation.md` for procedures and root-cause patterns. - -The cycle: investigate root cause → fix (prompt, code, or eval config) → rebuild dataset if needed → re-run tests → repeat. +> **Reference**: Read `references/6-investigate.md` now — it has the stop/continue decision, analysis review, root-cause patterns, and investigation procedures. **Follow its instructions before doing any investigation work.** --- -## Quick reference +## Web Server Management -### Imports +pixie-qa runs a web server in the background for displaying context, traces, and eval results to the user. It's automatically started by the setup script (via `pixie start`, which launches a detached background process and returns immediately). -```python -from pixie import enable_storage, observe, assert_dataset_pass, ScoreThreshold, last_llm_call -from pixie import FactualityEval, ClosedQAEval, create_llm_evaluator -``` - -Only `from pixie import ...` — never subpackages (`pixie.storage`, `pixie.evals`, etc.). There is no `pixie.qa` module. 
- -### CLI commands +When the user is done with the eval-driven-dev workflow, inform them the web server is still running and you can clean it up with: ```bash -uv run pixie test pixie_qa/tests/ -v # Run eval tests (NOT pytest) -uv run pixie trace list # List captured traces -uv run pixie trace last # Show most recent trace -uv run pixie trace show --verbose # Show specific trace -uv run pixie dataset create # Create a new dataset +pixie stop ``` -### Directory layout +IMPORTANT: after the web server is stopped, the web UI becomes inaccessible. So only stop the server if the user confirms they're done with all web UI features. If they want to keep using the web UI, do NOT stop the server. -``` -pixie_qa/ - MEMORY.md # your understanding and eval plan - datasets/ # golden datasets (JSON) - tests/ # eval test files (test_*.py) - scripts/ # run_app.py, build_dataset.py -``` - -All pixie files go here — not at the project root, not in a top-level `tests/` directory. - -### Key concepts - -- **eval_input** = application input + data from external dependencies -- **eval_output** = application output + captured side-effects + captured intermediate states (produced at test time by the utility function, NOT pre-populated in the dataset) -- **expected_output** = case-specific evaluation reference (optional per dataset item) -- **test function** = utility function (produces eval_output) + evaluators (check criteria) - -### Evaluator selection - -| Output type | Evaluator | Notes | -| ------------------------------------- | ----------------------------------------------------- | ---------------------------------------------------------------- | -| Open-ended text with reference answer | `FactualityEval`, `ClosedQAEval` | Best default for most apps | -| Open-ended text, no reference | `AnswerRelevancyEval` | **RAG only** — needs `context` in trace. Returns 0.0 without it. 
| -| Deterministic output | `ExactMatchEval`, `JSONDiffEval` | Never use for open-ended LLM text | -| RAG with retrieved context | `FaithfulnessEval`, `ContextRelevancyEval` | Requires context capture in instrumentation | -| Domain-specific quality | `create_llm_evaluator(name=..., prompt_template=...)` | Custom LLM-as-judge — use for app-specific criteria | - -### What goes where: SKILL.md vs references - -**This file** (SKILL.md) is loaded for the entire session. It contains the _what_ and _why_ — the reasoning, decision-making process, goals, and checkpoints for each step. - -**Reference files** are loaded when executing a specific step. They contain the _how_ — tactical API usage, code patterns, anti-patterns, troubleshooting, and ready-to-adapt examples. - -When in doubt: if it's about _deciding what to do_, it's in SKILL.md. If it's about _how to implement that decision_, it's in a reference file. - -### Reference files - -| Reference | When to read | -| ------------------------------------ | ---------------------------------------------------------------------------------- | -| `references/understanding-app.md` | Step 1 — investigating the codebase, MEMORY.md template | -| `references/instrumentation.md` | Step 2 — `@observe` and `enable_storage` rules, code patterns, anti-patterns | -| `references/run-harness-patterns.md` | Step 3 — examples of how to invoke different app types (web server, CLI, function) | -| `references/dataset-generation.md` | Step 4 — crafting eval_input items, expected_output strategy, validation | -| `references/eval-tests.md` | Step 5 — evaluator selection, test file pattern, assert_dataset_pass API | -| `references/investigation.md` | Step 6 — failure analysis, root-cause patterns | -| `references/pixie-api.md` | Any step — full CLI and Python API reference | +And whenever you restart the workflow, always run the setup.sh script in resources again to ensure the web server is running: diff --git 
a/skills/eval-driven-dev/references/1-a-entry-point.md b/skills/eval-driven-dev/references/1-a-entry-point.md new file mode 100644 index 00000000..c5576333 --- /dev/null +++ b/skills/eval-driven-dev/references/1-a-entry-point.md @@ -0,0 +1,68 @@ +# Step 1a: Entry Point & Execution Flow + +Identify how the application starts and how a real user invokes it. + +--- + +## What to investigate + +### 1. How the software runs + +What is the entry point? How do you start it? Is it a CLI, a server, a library function? What are the required arguments, config files, or environment variables? + +Look for: + +- `if __name__ == "__main__"` blocks +- Framework entry points (FastAPI `app`, Flask `app`, Django `manage.py`) +- CLI entry points in `pyproject.toml` (`[project.scripts]`) +- Docker/compose configs that reveal startup commands + +### 2. The real user entry point + +How does a real user or client invoke the app? This is what the eval must exercise — not an inner function that bypasses the request pipeline. + +- **Web server**: Which HTTP endpoints accept user input? What methods (GET/POST)? What request body shape? +- **CLI**: What command-line arguments does the user provide? +- **Library/function**: What function does the caller import and call? What arguments? + +### 3. Environment and configuration + +- What env vars does the app require? (API keys, database URLs, feature flags) +- What config files does it read? +- What has sensible defaults vs. what must be explicitly set? + +--- + +## Output: `pixie_qa/01-entry-point.md` + +Write your findings to this file. Keep it focused — only entry point and execution flow. + +### Template + +```markdown +# Entry Point & Execution Flow + +## How to run + + + +## Entry point + +- **File**: +- **Type**: +- **Framework**: + +## User-facing endpoints / interface + + + +- **Endpoint / command**: +- **Input format**: +- **Output format**: + +## Environment requirements + +| Variable | Purpose | Required? 
| Default | +| -------- | ------- | --------- | ------- | +| ... | ... | ... | ... | +``` diff --git a/skills/eval-driven-dev/references/1-b-eval-criteria.md b/skills/eval-driven-dev/references/1-b-eval-criteria.md new file mode 100644 index 00000000..0550c568 --- /dev/null +++ b/skills/eval-driven-dev/references/1-b-eval-criteria.md @@ -0,0 +1,82 @@ +# Step 1b: Eval Criteria + +Define what quality dimensions matter for this app — based on the entry point (`01-entry-point.md`) you've already documented. + +This document serves two purposes: + +1. **Dataset creation (Step 4)**: The use cases tell you what kinds of items to generate — each use case should have representative items in the dataset. +2. **Evaluator selection (Step 3)**: The eval criteria tell you what evaluators to choose and how to map them. + +Keep this concise — it's a planning artifact, not a comprehensive spec. + +--- + +## What to define + +### 1. Use cases + +List the distinct scenarios the app handles. Each use case becomes a category of dataset items. **Each use case description must be a concise one-liner that conveys both (a) what the input is and (b) what the expected behavior or outcome is.** The description should be specific enough that someone unfamiliar with the app can understand the scenario and its success criteria. + +**Good use case descriptions:** + +- "Reroute to human agent on account lookup difficulties" +- "Answer billing question using customer's plan details from CRM" +- "Decline to answer questions outside the support domain" +- "Summarize research findings including all queried sub-topics" + +**Bad use case descriptions (too vague):** + +- "Handle billing questions" +- "Edge case" +- "Error handling" + +### 2. Eval criteria + +Define **high-level, application-specific eval criteria** — quality dimensions that matter for THIS app. Each criterion will map to an evaluator in Step 3. 
+
+**Good criteria are specific to the app's purpose.** Examples:
+
+- Voice customer support agent: "Does the agent verify the caller's identity before transferring?", "Are responses concise enough for phone conversation?"
+- Research report generator: "Does the report address all sub-questions?", "Are claims supported by retrieved sources?"
+- RAG chatbot: "Are answers grounded in the retrieved context?", "Does it say 'I don't know' when context is missing?"
+
+**Bad criteria are generic evaluator names dressed up as requirements.** Don't say "Factual accuracy" or "Response relevance" — say what factual accuracy or relevance means for THIS app.
+
+At this stage, don't pick evaluator classes or thresholds. That comes in Step 3.
+
+### 3. Check criteria applicability and observability
+
+For each criterion:
+
+1. **Determine applicability scope** — does this criterion apply to ALL use cases, or only a subset? If a criterion is only relevant for certain scenarios (e.g., "identity verification" only applies to account-related requests, not general FAQ), mark it clearly. This distinction is critical for Step 4 (dataset creation) because:
+   - **Universal criteria** → become dataset-level default evaluators
+   - **Case-specific criteria** → become item-level evaluators on relevant rows only
+
+2. **Verify observability** — for each criterion, identify what data point in the app needs to be captured as a `wrap()` call to evaluate it. This drives the wrap coverage in Step 2.
+   - If the criterion is about the app's final response → captured by `wrap(purpose="output", name="response")`
+   - If it's about a routing decision → captured by `wrap(purpose="state", name="routing_decision")`
+   - If it's about data the app fetched and used → captured by `wrap(purpose="input", name="...")`
+
+---
+
+## Output: `pixie_qa/02-eval-criteria.md`
+
+Write your findings to this file. **Keep it short** — the template below is the maximum length.
+
+### Template
+
+```markdown
+# Eval Criteria
+
+## Use cases
+
+1. :
+2. ...
+
+## Eval criteria
+
+| # | Criterion | Applies to | Data to capture |
+| --- | --------- | ------------- | --------------- |
+| 1 | ... | All | wrap name: ... |
+| 2 | ... | Use case 1, 3 | wrap name: ... |
+```
diff --git a/skills/eval-driven-dev/references/2-wrap-and-trace.md b/skills/eval-driven-dev/references/2-wrap-and-trace.md
new file mode 100644
index 00000000..3efeb3ab
--- /dev/null
+++ b/skills/eval-driven-dev/references/2-wrap-and-trace.md
@@ -0,0 +1,260 @@
+# Step 2: Instrument with `wrap` and capture a reference trace
+
+> For the full `wrap()` API, the `Runnable` class, and CLI commands, see `wrap-api.md`.
+
+**Why this step**: You need to see the actual data flowing through the app before you can build anything. This step adds `wrap()` calls to mark data boundaries, implements a `Runnable` class, captures a reference trace with `pixie trace`, and verifies all eval criteria can be evaluated.
+
+This step consolidates three things: (1) data-flow analysis, (2) instrumentation, and (3) writing the runnable.
+
+---
+
+## 2a. Data-flow analysis and `wrap` instrumentation
+
+Starting from LLM call sites, trace backwards and forwards through the code to find:
+
+- **Entry input**: what the user sends in (via the entry point)
+- **Dependency input**: data from external systems (databases, APIs, caches)
+- **App output**: data going out to users or external systems
+- **Intermediate state**: internal decisions relevant to evaluation (routing, tool calls)
+
+For each data point found, **immediately add a `wrap()` call** in the application code:
+
+```python
+import pixie
+
+# External dependency data — value form (result of a DB/API call)
+profile = pixie.wrap(db.get_profile(user_id), purpose="input", name="customer_profile",
+                     description="Customer profile fetched from database")
+
+# External dependency data — function form (for lazy evaluation / avoiding the call)
+history = pixie.wrap(redis.get_history, purpose="input", name="conversation_history",
+                     description="Conversation history from Redis")(session_id)
+
+# App output — what the user receives
+response = pixie.wrap(response_text, purpose="output", name="response",
+                      description="The assistant's response to the user")
+
+# Intermediate state — internal decision relevant to evaluation
+selected_agent = pixie.wrap(selected_agent, purpose="state", name="routing_decision",
+                            description="Which agent was selected to handle this request")
+```
+
+### Rules for wrapping
+
+1. **Wrap at the data boundary** — where data enters or exits the application, not deep inside utility functions
+2. **Names must be unique** across the entire application (they are used as registry keys and dataset field names)
+3. **Use `lower_snake_case`** for names
+4. **Don't wrap LLM call arguments or responses** — those are already captured by OpenInference auto-instrumentation
+5. **Don't change the function's interface** — `wrap()` is purely additive, returns the same type
+
+### Value vs. function wrapping
+
+```python
+# Value form: wrap a data value (result already computed)
+profile = pixie.wrap(db.get_profile(user_id), purpose="input", name="customer_profile")
+
+# Function form: wrap the callable itself — in eval mode the original function
+# is NOT called; the registry value is returned instead.
+profile = pixie.wrap(db.get_profile, purpose="input", name="customer_profile")(user_id)
+```
+
+Use function form when you want to prevent the external call from happening in eval mode (e.g., the call is expensive, has side-effects, or you simply want a clean injection point). In tracing mode, the function is called normally and the result is logged.
+
+### Coverage check
+
+After adding `wrap()` calls, go through each eval criterion from `pixie_qa/02-eval-criteria.md` and verify that every required data point has a corresponding wrap call. If a criterion needs data that isn't captured, add the wrap now — don't defer.
+
+## 2b. Implement the Runnable class
+
+The `Runnable` class replaces the plain function from older versions of the skill. It exposes three lifecycle methods:
+
+- **`setup()`** — async, called once before any `run()` call; initialize shared resources here (e.g., an async HTTP client, a DB connection, pre-loaded configuration). Optional — has a default no-op.
+- **`run(args)`** — async, called **concurrently** for each dataset entry (up to 4 in parallel); invoke the app's real entry point with `args` (a validated Pydantic model built from `entry_kwargs`). **Must be concurrency-safe** — see below.
+- **`teardown()`** — async, called once after all `run()` calls; clean up resources. Optional — has a default no-op.
+
+**Import resolution**: The project root is automatically added to `sys.path` when your runnable is loaded, so you can use normal `import` statements (e.g., `from app import service`) — no `sys.path` manipulation needed.
+
+Place the class in `pixie_qa/scripts/run_app.py`:
+
+```python
+# pixie_qa/scripts/run_app.py
+from __future__ import annotations
+from pydantic import BaseModel
+import pixie
+
+
+class AppArgs(BaseModel):
+    user_message: str
+
+
+class AppRunnable(pixie.Runnable[AppArgs]):
+    """Runnable that drives the application for tracing and evaluation.
+
+    wrap(purpose="input") calls in the app inject dependency data from the
+    test registry automatically. wrap(purpose="output"/"state") calls
+    capture data for evaluation. No manual mocking needed.
+    """
+
+    @classmethod
+    def create(cls) -> AppRunnable:
+        return cls()
+
+    async def run(self, args: AppArgs) -> None:
+        from myapp import handle_request
+        await handle_request(args.user_message)
+```
+
+**For web servers**, initialize an async HTTP client in `setup()` and use it in `run()`:
+
+```python
+import httpx
+from pydantic import BaseModel
+import pixie
+
+
+class AppArgs(BaseModel):
+    user_message: str
+
+
+class AppRunnable(pixie.Runnable[AppArgs]):
+    _client: httpx.AsyncClient
+
+    @classmethod
+    def create(cls) -> AppRunnable:
+        return cls()
+
+    async def setup(self) -> None:
+        self._client = httpx.AsyncClient(base_url="http://localhost:8000")
+
+    async def run(self, args: AppArgs) -> None:
+        await self._client.post("/chat", json={"message": args.user_message})
+
+    async def teardown(self) -> None:
+        await self._client.aclose()
+```
+
+**For FastAPI/Starlette apps** (in-process testing without starting a server), use `httpx.ASGITransport` to run the ASGI app directly. This is faster and avoids port management:
+
+```python
+import asyncio
+import httpx
+from pydantic import BaseModel
+import pixie
+
+
+class AppArgs(BaseModel):
+    user_message: str
+
+
+class AppRunnable(pixie.Runnable[AppArgs]):
+    _client: httpx.AsyncClient
+    _sem: asyncio.Semaphore
+
+    @classmethod
+    def create(cls) -> AppRunnable:
+        inst = cls()
+        inst._sem = asyncio.Semaphore(1)  # serialise if app uses shared mutable state
+        return inst
+
+    async def setup(self) -> None:
+        from myapp.main import app  # your FastAPI/Starlette app instance
+
+        # ASGITransport runs the app in-process — no server needed
+        transport = httpx.ASGITransport(app=app)
+        self._client = httpx.AsyncClient(transport=transport, base_url="http://test")
+
+    async def run(self, args: AppArgs) -> None:
+        async with self._sem:
+            await self._client.post("/chat", json={"message": args.user_message})
+
+    async def teardown(self) -> None:
+        await self._client.aclose()
+```
+
+Choose the right pattern:
+
+- **Direct function call**: when the app exposes a simple async function (no web framework)
+- **`httpx.AsyncClient` with `base_url`**: when you need to test against a running HTTP server
+- **`httpx.ASGITransport`**: when the app is FastAPI/Starlette — fastest, no server needed, most reliable for eval
+
+**Rules**:
+
+- The `run()` method receives a Pydantic model whose fields are populated from the dataset's `entry_kwargs`. Define a `BaseModel` subclass with the fields your app needs.
+- All lifecycle methods (`setup`, `run`, `teardown`) are **async**.
+- `run()` must call the app through its real entry point — never bypass request handling.
+- Place the file at `pixie_qa/scripts/run_app.py` — name the class `AppRunnable` (or anything descriptive).
+- The dataset's `"runnable"` field references the class: `"pixie_qa/scripts/run_app.py:AppRunnable"`.
+
+**Concurrency**: `run()` is called concurrently for multiple dataset entries (up to 4 in parallel). If the app uses shared mutable state — SQLite, file-based DBs, global caches — you must synchronise access:
+
+```python
+import asyncio
+
+class AppRunnable(pixie.Runnable[AppArgs]):
+    _sem: asyncio.Semaphore
+
+    @classmethod
+    def create(cls) -> AppRunnable:
+        inst = cls()
+        inst._sem = asyncio.Semaphore(1)  # serialise DB access
+        return inst
+
+    async def run(self, args: AppArgs) -> None:
+        async with self._sem:
+            await call_app(args.message)
+```
+
+Common concurrency pitfalls:
+
+- **SQLite**: `sqlite3` connections are not safe for concurrent async writes. Use `Semaphore(1)` to serialise, or switch to `aiosqlite` with WAL mode.
+- **Global mutable state**: module-level dicts/lists modified in `run()` need a lock.
+- **Rate-limited external APIs**: add a semaphore to avoid 429 errors.
+
+## 2c. Capture the reference trace with `pixie trace`
+
+Use the `pixie trace` CLI command to run your `Runnable` and capture a trace file. Pass the entry input as a JSON file:
+
+```bash
+# Create a JSON file with entry kwargs
+echo '{"user_message": "a realistic sample input"}' > pixie_qa/sample-input.json
+
+pixie trace --runnable pixie_qa/scripts/run_app.py:AppRunnable \
+    --input pixie_qa/sample-input.json \
+    --output pixie_qa/reference-trace.jsonl
+```
+
+The `--input` flag takes a **file path** to a JSON file (not inline JSON). The JSON object keys become the kwargs passed to the Pydantic model.
+
+The command calls `AppRunnable.create()`, then `setup()`, then `run(args)` once with the given input, then `teardown()`. The resulting trace is written to the output file.
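Rather than eyeballing the captured JSONL by hand, the coverage check against `pixie_qa/02-eval-criteria.md` can be scripted. A minimal sketch — it assumes only the per-line record shape documented for trace files (a `type` field, plus `name`/`purpose` on wrap events); the sample records and wrap names below are fabricated stand-ins, not real trace output or pixie API:

```python
import json

# Stand-in records mirroring the documented trace shapes; a real run would
# read pixie_qa/reference-trace.jsonl instead.
sample_jsonl = "\n".join(json.dumps(r) for r in [
    {"type": "kwargs", "value": {"user_message": "What are your hours?"}},
    {"type": "wrap", "name": "customer_profile", "purpose": "input", "data": {"tier": "gold"}},
    {"type": "llm_span", "request_model": "gpt-4o"},
    {"type": "wrap", "name": "response", "purpose": "output", "data": "Our hours are..."},
])

def wrap_coverage(jsonl_text: str) -> dict:
    """Group wrap-event names by purpose, ignoring kwargs and llm_span records."""
    coverage: dict = {}
    for line in jsonl_text.splitlines():
        record = json.loads(line)
        if record.get("type") == "wrap":
            coverage.setdefault(record["purpose"], set()).add(record["name"])
    return coverage

coverage = wrap_coverage(sample_jsonl)
# Fail fast if a criterion's data point was never captured
assert coverage == {"input": {"customer_profile"}, "output": {"response"}}
```

Comparing the resulting name sets against the "Data to capture" column of the criteria table makes missing `wrap()` calls obvious before any dataset work begins.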
+
+The JSONL trace file will contain one line per `wrap()` event and one line per LLM span:
+
+```jsonl
+{"type": "kwargs", "value": {"user_message": "What are your hours?"}}
+{"type": "wrap", "name": "customer_profile", "purpose": "input", "data": {...}, ...}
+{"type": "llm_span", "request_model": "gpt-4o", "input_messages": [...], ...}
+{"type": "wrap", "name": "response", "purpose": "output", "data": "Our hours are...", ...}
+```
+
+## 2d. Verify wrap coverage with `pixie format`
+
+Run `pixie format` on the trace file to see the data in dataset-entry format. This shows you both the data shapes and what a real app output looks like:
+
+```bash
+pixie format --input reference-trace.jsonl --output dataset-sample.json
+```
+
+The output is a formatted dataset entry template — it contains:
+
+- `entry_kwargs`: the exact keys/values for the runnable arguments
+- `eval_input`: the data for all dependencies (from `wrap(purpose="input")` calls)
+- `eval_output`: the **actual app output** captured from the trace (this is the real output — use it to understand what the app produces, not as a dataset `eval_output` field)
+
+For each eval criterion from `pixie_qa/02-eval-criteria.md`, verify the format output contains the data needed to evaluate it. If a data point is missing, go back and add the `wrap()` call.
+
+---
+
+## Output
+
+- `pixie_qa/scripts/run_app.py` — the `Runnable` class
+- `pixie_qa/reference-trace.jsonl` — the reference trace with all expected wrap events
diff --git a/skills/eval-driven-dev/references/3-define-evaluators.md b/skills/eval-driven-dev/references/3-define-evaluators.md
new file mode 100644
index 00000000..20390e7c
--- /dev/null
+++ b/skills/eval-driven-dev/references/3-define-evaluators.md
@@ -0,0 +1,161 @@
+# Step 3: Define Evaluators
+
+**Why this step**: With the app instrumented (Step 2), you now map each eval criterion to a concrete evaluator — implementing custom ones where needed — so the dataset (Step 4) can reference them by name.
+
+---
+
+## 3a. Map criteria to evaluators
+
+**Every eval criterion from Step 1b — including any dimensions specified by the user in the prompt — must have a corresponding evaluator.** If the user asked for "factuality, completeness, and bias," you need three evaluators (or a multi-criteria evaluator that covers all three). Do not silently drop any requested dimension.
+
+For each eval criterion, decide how to evaluate it:
+
+- **Can it be checked with a built-in evaluator?** (factual correctness → `Factuality`, exact match → `ExactMatch`, RAG faithfulness → `Faithfulness`)
+- **Does it need a custom evaluator?** Most app-specific criteria do — use `create_llm_evaluator` with a prompt that operationalizes the criterion.
+- **Is it universal or case-specific?** Universal criteria apply to all dataset items. Case-specific criteria apply only to certain rows.
+
+For open-ended LLM text, **never** use `ExactMatch` — LLM outputs are non-deterministic.
+
+`AnswerRelevancy` is **RAG-only** — it requires a `context` value in the trace. Returns 0.0 without it. For general relevance without RAG, use `create_llm_evaluator` with a custom prompt.
+
+## 3b. Implement custom evaluators
+
+If any criterion requires a custom evaluator, implement it now. Place custom evaluators in `pixie_qa/evaluators.py` (or a sub-module if there are many).
+
+### `create_llm_evaluator` factory
+
+Use when the quality dimension is domain-specific and no built-in evaluator fits.
+
+The return value is a **ready-to-use evaluator instance**. Assign it to a module-level variable — `pixie test` will import and use it directly (no class wrapper needed):
+
+```python
+from pixie import create_llm_evaluator
+
+concise_voice_style = create_llm_evaluator(
+    name="ConciseVoiceStyle",
+    prompt_template="""
+    You are evaluating whether this response is concise and phone-friendly.
+
+    Input: {eval_input}
+    Response: {eval_output}
+
+    Score 1.0 if the response is concise (under 3 sentences), directly addresses
+    the question, and uses conversational language suitable for a phone call.
+    Score 0.0 if it's verbose, off-topic, or uses written-style formatting.
+    """,
+)
+```
+
+Reference the evaluator in your dataset JSON by its `filepath:callable_name` reference (e.g., `"pixie_qa/evaluators.py:concise_voice_style"`).
+
+**How template variables work**: `{eval_input}`, `{eval_output}`, `{expectation}` are the only placeholders. Each is replaced with a string representation of the corresponding `Evaluable` field:
+
+- **Single-item** `eval_input` / `eval_output` → the item's value (string, JSON-serialized dict/list)
+- **Multi-item** `eval_input` / `eval_output` → a JSON dict mapping `name → value` for every item
+
+The LLM judge sees the full serialized value.
+
+**Rules**:
+
+- **Only `{eval_input}`, `{eval_output}`, `{expectation}`** — no nested access like `{eval_input[key]}` (this will crash with a `ValueError`)
+- **Keep templates short and direct** — the system prompt already tells the LLM to return `Score: X.X`. Your template just needs to present the data and define the scoring criteria.
+- **Don't instruct the LLM to "parse" or "extract" data** — just present the values and state the criteria. The LLM can read JSON naturally.
+
+**Non-RAG response relevance** (instead of `AnswerRelevancy`):
+
+```python
+response_relevance = create_llm_evaluator(
+    name="ResponseRelevance",
+    prompt_template="""
+    You are evaluating whether a customer support response is relevant and helpful.
+
+    Input: {eval_input}
+    Response: {eval_output}
+    Expected: {expectation}
+
+    Score 1.0 if the response directly addresses the question and meets expectations.
+    Score 0.5 if partially relevant but misses important aspects.
+    Score 0.0 if off-topic, ignores the question, or contradicts expectations.
+    """,
+)
+```
+
+### Manual custom evaluator
+
+Custom evaluators can be **sync or async functions**. Assign them to module-level variables in `pixie_qa/evaluators.py`:
+
+```python
+from pixie import Evaluation, Evaluable
+
+def my_evaluator(evaluable: Evaluable, *, trace=None) -> Evaluation:
+    score = 1.0 if "expected pattern" in str(evaluable.eval_output) else 0.0
+    return Evaluation(score=score, reasoning="...")
+```
+
+Reference by `filepath:callable_name` in the dataset: `"pixie_qa/evaluators.py:my_evaluator"`.
+
+**Accessing `eval_metadata` and captured data**: Custom evaluators access per-entry metadata and `wrap()` outputs via the `Evaluable` fields:
+
+- `evaluable.eval_metadata` — dict from the entry's `eval_metadata` field (e.g., `{"expected_tool": "endCall"}`)
+- `evaluable.eval_output` — `list[NamedData]` containing ALL `wrap(purpose="output")` and `wrap(purpose="state")` values. Each item has `.name` (str) and `.value` (JsonValue). Use the helper below to look up by name.
+
+```python
+def _get_output(evaluable: Evaluable, name: str) -> Any:
+    """Look up a wrap value by name from eval_output."""
+    for item in evaluable.eval_output:
+        if item.name == name:
+            return item.value
+    return None
+
+def call_ended_check(evaluable: Evaluable, *, trace=None) -> Evaluation:
+    expected = evaluable.eval_metadata.get("expected_call_ended") if evaluable.eval_metadata else None
+    actual = _get_output(evaluable, "call_ended")
+    if expected is None:
+        return Evaluation(score=1.0, reasoning="No expected_call_ended in eval_metadata")
+    match = bool(actual) == bool(expected)
+    return Evaluation(
+        score=1.0 if match else 0.0,
+        reasoning=f"Expected call_ended={expected}, got {actual}",
+    )
+```
+
+## 3c. Produce the evaluator mapping artifact
+
+Write the criterion-to-evaluator mapping to `pixie_qa/03-evaluator-mapping.md`. This artifact bridges between the eval criteria (Step 1b) and the dataset (Step 4).
+
+**CRITICAL**: Use the exact evaluator names as they appear in the `evaluators.md` reference — built-in evaluators use their short name (e.g., `Factuality`, `ClosedQA`), and custom evaluators use `filepath:callable_name` format (e.g., `pixie_qa/evaluators.py:ConciseVoiceStyle`).
+
+### Template
+
+```markdown
+# Evaluator Mapping
+
+## Built-in evaluators used
+
+| Evaluator name | Criterion it covers | Applies to |
+| -------------- | ------------------- | -------------------------- |
+| Factuality | Factual accuracy | All items |
+| ClosedQA | Answer correctness | Items with expected_output |
+
+## Custom evaluators
+
+| Evaluator name | Criterion it covers | Applies to | Source file |
+| ---------------------------------------- | ------------------- | ---------- | ---------------------- |
+| pixie_qa/evaluators.py:ConciseVoiceStyle | Phone-friendly tone | All items | pixie_qa/evaluators.py |
+
+## Applicability summary
+
+- **Dataset-level defaults** (apply to all items): Factuality, pixie_qa/evaluators.py:ConciseVoiceStyle
+- **Item-specific** (apply to subset): ClosedQA (only items with expected_output)
+```
+
+## Output
+
+- Custom evaluator implementations in `pixie_qa/evaluators.py` (if any custom evaluators needed)
+- `pixie_qa/03-evaluator-mapping.md` — the criterion-to-evaluator mapping
+
+---
+
+> **Evaluator selection guide**: See `evaluators.md` for the full evaluator catalog, selection guide (which evaluator for which output type), and `create_llm_evaluator` reference.
+>
+> **If you hit an unexpected error** when implementing evaluators (import failures, API mismatch), read `evaluators.md` for the authoritative evaluator reference and `wrap-api.md` for API details before guessing at a fix.
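Because custom evaluators are plain functions, their scoring logic can be smoke-tested in isolation before any `pixie test` run. A sketch of that idea — the dataclasses below are throwaway stand-ins for pixie's real `NamedData`/`Evaluable`/`Evaluation` types (which come from `pixie`), so only the control flow carries over:

```python
from dataclasses import dataclass, field
from typing import Any

# Throwaway stand-ins for pixie's types, used only to exercise the
# lookup-by-name scoring pattern outside a pixie test run.
@dataclass
class NamedData:
    name: str
    value: Any

@dataclass
class Evaluable:
    eval_output: list
    eval_metadata: dict = field(default_factory=dict)

@dataclass
class Evaluation:
    score: float
    reasoning: str

def _get_output(evaluable: Evaluable, name: str) -> Any:
    # Same helper shape as in 3b: linear scan of eval_output by name
    for item in evaluable.eval_output:
        if item.name == name:
            return item.value
    return None

def call_ended_check(evaluable: Evaluable) -> Evaluation:
    expected = (evaluable.eval_metadata or {}).get("expected_call_ended")
    if expected is None:
        return Evaluation(score=1.0, reasoning="No expected_call_ended in eval_metadata")
    actual = _get_output(evaluable, "call_ended")
    match = bool(actual) == bool(expected)
    return Evaluation(score=1.0 if match else 0.0,
                      reasoning=f"Expected call_ended={expected}, got {actual}")

# A matching case scores 1.0; a mismatch scores 0.0
hit = call_ended_check(Evaluable([NamedData("call_ended", True)], {"expected_call_ended": True}))
miss = call_ended_check(Evaluable([NamedData("call_ended", False)], {"expected_call_ended": True}))
assert (hit.score, miss.score) == (1.0, 0.0)
```

Catching an inverted comparison or a bad metadata key here is much cheaper than discovering it through a full eval run.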
diff --git a/skills/eval-driven-dev/references/4-build-dataset.md b/skills/eval-driven-dev/references/4-build-dataset.md
new file mode 100644
index 00000000..c4398394
--- /dev/null
+++ b/skills/eval-driven-dev/references/4-build-dataset.md
@@ -0,0 +1,228 @@
+# Step 4: Build the Dataset
+
+**Why this step**: The dataset ties everything together — the runnable (Step 2), the evaluators (Step 3), and the use cases (Step 1b) — into concrete test scenarios. At test time, `pixie test` calls the runnable with `entry_kwargs`, the wrap registry is populated with `eval_input`, and evaluators score the resulting captured outputs.
+
+---
+
+## Understanding `entry_kwargs`, `eval_input`, and `expectation`
+
+Before building the dataset, understand what these terms mean:
+
+- **`entry_kwargs`** = the kwargs passed to `Runnable.run()` as a Pydantic model. These are the entry-point inputs (user message, request body, CLI args). The keys must match the fields of the Pydantic model defined for `run(args: T)`.
+
+- **`eval_input`** = a list of `{"name": ..., "value": ...}` objects corresponding to `wrap(purpose="input")` calls in the app. At test time, these are injected automatically by the wrap registry; `wrap(purpose="input")` calls in the app return the registry value instead of calling the real external dependency.
+
+  **CRITICAL**: `eval_input` must have **at least one item** (enforced by `min_length=1` validation). If the app has no `wrap(purpose="input")` calls, you must still include at least one `eval_input` item — use the primary entry-point argument as a synthetic input:
+
+  ```json
+  "eval_input": [
+    { "name": "user_input", "value": "What are your business hours?" }
+  ]
+  ```
+
+  Each item is a `NamedData` object with `name` (str) and `value` (any JSON-serializable value).
+
+- **`expectation`** (optional) = case-specific evaluation reference. What a correct output should look like for this scenario. Used by evaluators that compare output against a reference (e.g., `Factuality`, `ClosedQA`). Not needed for output-quality evaluators that don't require a reference.
+
+- **eval output** = what the app actually produces, captured at runtime by `wrap(purpose="output")` and `wrap(purpose="state")` calls. **Not stored in the dataset** — it's produced when `pixie test` runs the app.
+
+The **reference trace** at `pixie_qa/reference-trace.jsonl` is your primary source for data shapes:
+
+- Filter it to see the exact serialized format for `eval_input` values
+- Read the `kwargs` record to understand the `entry_kwargs` structure
+- Read `purpose="output"/"state"` events to understand what outputs the app produces, so you can write meaningful `expectation` values
+
+---
+
+## 4a. Derive evaluator assignments
+
+The eval criteria artifact (`pixie_qa/02-eval-criteria.md`) maps each criterion to use cases. The evaluator mapping artifact (`pixie_qa/03-evaluator-mapping.md`) maps each criterion to a concrete evaluator name. Combine these:
+
+1. **Dataset-level default evaluators**: Criteria marked as applying to "All" use cases → their evaluator names go in the top-level `"evaluators"` array.
+2. **Item-level evaluators**: Criteria that apply to only a subset → their evaluator names go in `"evaluators"` on the relevant rows only, using `"..."` to also include the defaults.
+
+## 4b. Inspect data shapes with `pixie format`
+
+Use `pixie format` on the reference trace to see the exact data shapes **and** the real app output in dataset-entry format:
+
+```bash
+pixie format --input reference-trace.jsonl --output dataset-sample.json
+```
+
+The output looks like:
+
+```json
+{
+  "entry_kwargs": {
+    "user_message": "What are your business hours?"
+  },
+  "eval_input": [
+    {
+      "name": "customer_profile",
+      "value": { "name": "Alice", "tier": "gold" }
+    },
+    {
+      "name": "conversation_history",
+      "value": [{ "role": "user", "content": "What are your hours?" }]
+    }
+  ],
+  "expectation": null,
+  "eval_output": {
+    "response": "Our business hours are Monday to Friday, 9am to 5pm..."
+  }
+}
+```
+
+**Important**: The `eval_output` in this template is the **full real output** produced by the running app. Do NOT copy `eval_output` into your dataset entries — it would make tests trivially pass by giving evaluators the real answer. Instead:
+
+- Use `entry_kwargs` and `eval_input` as exact templates for data keys and format
+- Look at `eval_output` to understand what the app produces — then write a **concise `expectation` description** that captures the key quality criteria for each scenario
+
+**Example**: if `eval_output.response` is `"Our business hours are Monday to Friday, 9 AM to 5 PM, and Saturday 10 AM to 2 PM."`, write `expectation` as `"Should mention weekday hours (Mon–Fri 9am–5pm) and Saturday hours"` — a short description a human or LLM evaluator can compare against.
+
+## 4c. Generate dataset items
+
+Create diverse entries guided by the reference trace and use cases:
+
+- **`entry_kwargs` keys** must match the fields of the Pydantic model used in `Runnable.run(args: T)`
+- **`eval_input`** must be a list of `{"name": ..., "value": ...}` objects matching the `name` values of `wrap(purpose="input")` calls in the app
+- **Cover each use case** from `pixie_qa/02-eval-criteria.md` — at least one entry per use case, with meaningfully diverse inputs across entries
+
+**If the user specified a dataset or data source in the prompt** (e.g., a JSON file with research questions or conversation scenarios), read that file, adapt each entry to the `entry_kwargs` / `eval_input` shape, and incorporate them into the dataset. Do NOT ignore specified data.
+
+## 4d.
Build the dataset JSON file + +Create the dataset at `pixie_qa/datasets/.json`: + +```json +{ + "name": "qa-golden-set", + "runnable": "pixie_qa/scripts/run_app.py:AppRunnable", + "evaluators": ["Factuality", "pixie_qa/evaluators.py:concise_voice_style"], + "entries": [ + { + "entry_kwargs": { + "user_message": "What are your business hours?" + }, + "description": "Customer asks about business hours with gold tier account", + "eval_input": [ + { + "name": "customer_profile", + "value": { "name": "Alice Johnson", "tier": "gold" } + } + ], + "expectation": "Should mention Mon-Fri 9am-5pm and Sat 10am-2pm" + }, + { + "entry_kwargs": { + "user_message": "I want to change something" + }, + "description": "Ambiguous change request from basic tier customer", + "eval_input": [ + { + "name": "customer_profile", + "value": { "name": "Bob Smith", "tier": "basic" } + } + ], + "expectation": "Should ask for clarification", + "evaluators": ["...", "ClosedQA"] + }, + { + "entry_kwargs": { + "user_message": "I want to end this call" + }, + "description": "User requests call end after failed verification", + "eval_input": [ + { + "name": "customer_profile", + "value": { "name": "Charlie Brown", "tier": "basic" } + } + ], + "expectation": "Agent should call endCall tool and end the conversation", + "eval_metadata": { + "expected_tool": "endCall", + "expected_call_ended": true + }, + "evaluators": ["...", "pixie_qa/evaluators.py:tool_call_check"] + } + ] +} +``` + +### Key fields + +**Entry structure** — all fields are top-level on each entry (flat structure — no nesting): + +``` +entry: + ├── entry_kwargs (required) — args for Runnable.run() + ├── eval_input (required) — list of {"name": ..., "value": ...} objects + ├── description (required) — human-readable label for the test case + ├── expectation (optional) — reference for comparison-based evaluators + ├── eval_metadata (optional) — extra per-entry data for custom evaluators + └── evaluators (optional) — evaluator names for THIS 
entry +``` + +**Top-level fields:** + +- **`runnable`** (required): `filepath:ClassName` reference to the `Runnable` class from Step 2 (e.g., `"pixie_qa/scripts/run_app.py:AppRunnable"`). Path is relative to the project root. +- **`evaluators`** (dataset-level, optional): Default evaluator names applied to every entry — the evaluators for criteria that apply to ALL use cases. + +**Per-entry fields (all top-level on each entry):** + +- **`entry_kwargs`** (required): Keys match the Pydantic model fields for `Runnable.run(args: T)`. These are the app's entry-point inputs. +- **`eval_input`** (required): List of `{"name": ..., "value": ...}` objects. Names match `wrap(purpose="input")` names in the app. +- **`description`** (required): Use case one-liner from `pixie_qa/02-eval-criteria.md`. +- **`expectation`** (optional): Case-specific expectation text for evaluators that need a reference. +- **`eval_metadata`** (optional): Extra per-entry data for custom evaluators — e.g., expected tool names, boolean flags, thresholds. Accessible in evaluators as `evaluable.eval_metadata`. +- **`evaluators`** (optional): Row-level evaluator override. + +### Evaluator assignment rules + +1. Evaluators that apply to ALL items go in the top-level `"evaluators"` array. +2. Items that need **additional** evaluators use `"evaluators": ["...", "ExtraEval"]` — `"..."` expands to defaults. +3. Items that need a **completely different** set use `"evaluators": ["OnlyThis"]` without `"..."`. +4. Items using only defaults: omit the `"evaluators"` field. + +--- + +## Dataset Creation Reference + +### Using `eval_input` values + +The `eval_input` values are `{"name": ..., "value": ...}` objects. 
Use the reference trace as templates — copy the `"data"` field from the relevant `purpose="input"` event and adapt the values: + +**Simple dict**: + +```json +{ "name": "customer_profile", "value": { "name": "Alice", "tier": "gold" } } +``` + +**List of dicts** (e.g., conversation history): + +```json +{ + "name": "conversation_history", + "value": [ + { "role": "user", "content": "Hello" }, + { "role": "assistant", "content": "Hi there!" } + ] +} +``` + +**Important**: The exact format depends on what the `wrap(purpose="input")` call captures. Always copy from the reference trace rather than constructing from scratch. + +### Crafting diverse eval scenarios + +Cover different aspects of each use case: + +- Different user phrasings of the same request +- Edge cases (ambiguous input, missing information, error conditions) +- Entries that stress-test specific eval criteria +- At least one entry per use case from Step 1b + +--- + +## Output + +`pixie_qa/datasets/.json` — the dataset file. diff --git a/skills/eval-driven-dev/references/5-run-tests.md b/skills/eval-driven-dev/references/5-run-tests.md new file mode 100644 index 00000000..a8172a6f --- /dev/null +++ b/skills/eval-driven-dev/references/5-run-tests.md @@ -0,0 +1,79 @@ +# Step 5: Run Evaluation-Based Tests + +**Why this step**: Run `pixie test` and fix any dataset quality issues — `WrapRegistryMissError`, `WrapTypeMismatchError`, bad `eval_input` data, or import failures — until real evaluator scores are produced for every entry. + +--- + +## 5a. Run tests + +```bash +pixie test +``` + +For verbose output with per-case scores and evaluator reasoning: + +```bash +pixie test -v +``` + +`pixie test` automatically loads the `.env` file before running tests. + +The test runner now: + +1. Resolves the `Runnable` class from the dataset's `runnable` field +2. Calls `Runnable.create()` to construct an instance, then `setup()` once +3. Runs all dataset entries **concurrently** (up to 4 in parallel): + a. 
Reads `entry_kwargs` and `eval_input` from the entry + b. Populates the wrap input registry with `eval_input` data + c. Initializes the capture registry + d. Validates `entry_kwargs` into the Pydantic model and calls `Runnable.run(args)` + e. `wrap(purpose="input")` calls in the app return registry values instead of calling external services + f. `wrap(purpose="output"/"state")` calls capture data for evaluation + g. Builds `Evaluable` from captured data + h. Runs evaluators +4. Calls `Runnable.teardown()` once + +Because entries run concurrently, the Runnable's `run()` method must be concurrency-safe. If you see `sqlite3.OperationalError`, `"database is locked"`, or similar errors, add a `Semaphore(1)` to your Runnable (see the concurrency section in Step 2 reference). + +## 5b. Fix dataset/harness issues + +**Data validation errors** (registry miss, type mismatch, deserialization failure) are reported per-entry with clear messages pointing to the specific `wrap` name and dataset field. This step is about fixing **what you did wrong in Step 4** — bad data, wrong format, missing fields — not about evaluating the app's quality. 
+ +| Error | Cause | Fix | +| ------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | +| `WrapRegistryMissError: name=''` | Dataset entry missing an `eval_input` item with the `name` that the app's `wrap(purpose="input", name="")` expects | Add the missing `{"name": "", "value": ...}` to `eval_input` in every affected entry | +| `WrapTypeMismatchError` | Deserialized type doesn't match what the app expects | Fix the value in the dataset | +| Runnable resolution failure | `runnable` path or class name is wrong, or the class doesn't implement the `Runnable` protocol | Fix `filepath:ClassName` in the dataset; ensure the class has `create()` and `run()` methods | +| Import error | Module path or syntax error in runnable/evaluator | Fix the referenced file | +| `ModuleNotFoundError: pixie_qa` | `pixie_qa/` directory missing `__init__.py` | Run `pixie init` to recreate it | +| `TypeError: ... is not callable` | Evaluator name points to a non-callable attribute | Evaluators must be functions, classes, or callable instances | +| `sqlite3.OperationalError` | Concurrent `run()` calls sharing a SQLite connection | Add `asyncio.Semaphore(1)` to the Runnable (see Step 2 concurrency section) | + +Iterate — fix errors, re-run, fix the next error — until `pixie test` runs cleanly with real evaluator scores for all entries. + +### When to stop iterating on evaluator results + +Once the dataset runs without errors and produces real scores, assess the results: + +- **Custom function evaluators** (deterministic checks): If they fail, the issue is in the dataset data or evaluator logic. Fix and re-run — these should converge quickly. +- **LLM-as-judge evaluators** (e.g., `Factuality`, `ClosedQA`, custom LLM evaluators): These have inherent variance across runs. 
If scores fluctuate between runs without code changes, the issue is evaluator prompt quality, not app behavior. **Do not spend more than one revision cycle on LLM evaluator prompts.** Run 2–3 times, assess variance, and accept the results if they are directionally correct. +- **General rule**: Stop iterating when all custom function evaluators pass consistently and LLM evaluators produce reasonable scores (most passing). Perfect LLM evaluator scores are not the goal — the goal is a working QA pipeline that catches real regressions. + +## 5c. Run analysis + +Once tests complete without setup errors and produce real scores, run analysis: + +```bash +pixie analyze +``` + +Where `` is the test run identifier printed by `pixie test` (e.g., `20250615-120000`). This generates LLM-powered markdown analysis for each dataset, identifying patterns in successes and failures. + +## Output + +- Test results at `{PIXIE_ROOT}/results//result.json` +- Analysis files at `{PIXIE_ROOT}/results//dataset-.md` (after `pixie analyze`) + +--- + +> **If you hit an unexpected error** when running tests (wrong parameter names, import failures, API mismatch), read `wrap-api.md`, `evaluators.md`, or `testing-api.md` for the authoritative API reference before guessing at a fix. diff --git a/skills/eval-driven-dev/references/investigation.md b/skills/eval-driven-dev/references/6-investigate.md similarity index 62% rename from skills/eval-driven-dev/references/investigation.md rename to skills/eval-driven-dev/references/6-investigate.md index a6221c73..15e6a2cb 100644 --- a/skills/eval-driven-dev/references/investigation.md +++ b/skills/eval-driven-dev/references/6-investigate.md @@ -4,28 +4,42 @@ This reference covers Step 6 of the eval-driven-dev process: investigating test --- -## When to use this +## STOP — check before proceeding -Only proceed with investigation if the user asked for it (iteration intent) or confirmed after seeing setup results. 
If the user's intent was "set up evals," stop after reporting test results and ask before investigating. +**Before doing any investigation or iteration work, you must decide whether to continue or stop and ask the user.** + +**Continue immediately** if the user's original prompt explicitly asked for iteration — look for words like "fix", "improve", "debug", "iterate", "investigate failures", or "make tests pass". In this case, proceed to the investigation steps below. + +**Otherwise, STOP here.** Report the test results to the user: + +> "QA setup is complete. Tests show N/M passing. [brief summary of failures if any]. Want me to investigate the failures and iterate?" + +**Do not proceed with investigation until the user confirms.** This is the default — most prompts like "set up evals", "add tests", "set up QA", or "add evaluations" are asking for setup only, not iteration. --- ## Step-by-step investigation -### 1. Get detailed test output +When the user has confirmed (or their original prompt was explicitly about iteration), proceed: + +### 1. Read the analysis + +Start by reading the analysis generated in Step 5. The analysis files are at `{PIXIE_ROOT}/results//dataset-.md`. These contain LLM-generated insights about patterns in successes and failures across your test run. Use the analysis to prioritize which failures to investigate first and to understand systemic issues. + +### 2. Get detailed test output ```bash -pixie test pixie_qa/tests/ -v # shows score and reasoning per case +pixie test -v # shows score and reasoning per case ``` Capture the full verbose output. For each failing case, note: -- The `eval_input` (what was sent) -- The `eval_output` (what the app produced) +- The `entry_kwargs` (what was sent) +- The captured output (what the app produced) - The `expected_output` (what was expected, if applicable) - The evaluator score and reasoning -### 2. Inspect the trace data +### 3. 
Inspect the trace data For each failing case, look up the full trace to see what happened inside the app: @@ -53,7 +67,7 @@ async def inspect(trace_id: str): asyncio.run(inspect("the-trace-id-here")) ``` -### 3. Root-cause analysis +### 4. Root-cause analysis Walk through the trace and identify exactly where the failure originates. Common patterns: @@ -77,22 +91,22 @@ Walk through the trace and identify exactly where the failure originates. Common For non-LLM failures: note them in the investigation log and recommend the code fix, but **do not adjust eval expectations or thresholds to accommodate bugs in non-LLM code**. The eval test should measure LLM quality assuming the rest of the system works correctly. -### 4. Document findings in MEMORY.md +### 5. Document findings -**Every failure investigation must be documented in `pixie_qa/MEMORY.md`** under the Investigation Log section: +**Every failure investigation should be documented** alongside the fix. Include: ````markdown -### failure +### — failure investigation -**Test**: `test_faq_factuality` in `pixie_qa/tests/test_customer_service.py` -**Result**: 3/5 cases passed (60%), threshold was 80% ≥ 0.7 +**Dataset**: `qa-golden-set` +**Result**: 3/5 cases passed (60%) #### Failing case 1: "What rows have extra legroom?" -- **eval_input**: `{"user_message": "What rows have extra legroom?"}` -- **eval_output**: "I'm sorry, I don't have the exact row numbers for extra legroom..." +- **entry_kwargs**: `{"user_message": "What rows have extra legroom?"}` +- **captured output**: "I'm sorry, I don't have the exact row numbers for extra legroom..." - **expected_output**: "rows 5-8 Economy Plus with extra legroom" -- **Evaluator score**: 0.1 (FactualityEval) +- **Evaluator score**: 0.1 (Factuality) - **Evaluator reasoning**: "The output claims not to know the answer while the reference clearly states rows 5-8..." **Trace analysis**: @@ -118,29 +132,33 @@ not an eval/prompt change. 
**Verification**: After fix, re-run: ```bash -python pixie_qa/scripts/build_dataset.py # refresh dataset -pixie test pixie_qa/tests/ -k faq -v # verify +pixie test -v # verify ``` ```` -```` ### 6. Fix and re-run -### 5. Fix and re-run - -Make the targeted change, rebuild the dataset if needed, and re-run. Always finish by giving the user the exact commands to verify: +Make the targeted change, update the dataset if needed, and re-run: ```bash -pixie test pixie_qa/tests/test_.py -v -```` +pixie test -v +``` + +After fixes stabilize, run analysis again to see if the patterns have changed: + +```bash +pixie analyze +``` --- ## The iteration cycle -1. Run tests → identify failures -2. Investigate each failure → classify as LLM vs. non-LLM -3. For LLM failures: adjust prompts, model, or eval criteria -4. For non-LLM failures: recommend or apply code fix -5. Rebuild dataset if the fix changed app behavior -6. Re-run tests -7. Repeat until passing or user is satisfied +1. Read analysis from Step 5 → prioritize failures +2. Run tests verbose → identify specific failures +3. Investigate each failure → classify as LLM vs. non-LLM +4. For LLM failures: adjust prompts, model, or eval criteria +5. For non-LLM failures: recommend or apply code fix +6. Update dataset if the fix changed app behavior +7. Re-run tests and analysis +8. Repeat until passing or user is satisfied diff --git a/skills/eval-driven-dev/references/dataset-generation.md b/skills/eval-driven-dev/references/dataset-generation.md deleted file mode 100644 index cbdfebad..00000000 --- a/skills/eval-driven-dev/references/dataset-generation.md +++ /dev/null @@ -1,235 +0,0 @@ -# Dataset Generation - -This reference covers Step 4 of the eval-driven-dev process: creating the eval dataset. - -For full `DatasetStore`, `Evaluable`, and CLI command signatures, see `references/pixie-api.md` (Dataset Python API and CLI Commands sections). 
- ---- - -## What a dataset contains - -A dataset is a collection of `Evaluable` items. Each item has: - -- **`eval_input`**: Made-up application input + data from external dependencies. This is what the utility function from Step 3 feeds into the app at test time. -- **`expected_output`**: Case-specific evaluation reference (optional). The meaning depends on the evaluator — it could be an exact answer, a factual reference, or quality criteria text. -- **`eval_output`**: **NOT stored in the dataset.** Produced at test time when the utility function replays the eval_input through the real app. - -The dataset is made up by you based on the data shapes observed in the reference trace from Step 2. You are NOT extracting data from traces — you are crafting realistic test scenarios. - ---- - -## Creating the dataset - -### CLI - -```bash -pixie dataset create -pixie dataset list # verify it exists -``` - -### Python API - -```python -from pixie import DatasetStore, Evaluable - -store = DatasetStore() -store.create("qa-golden-set", items=[ - Evaluable( - eval_input={"user_message": "What are your hours?", "customer_profile": {"name": "Alice", "tier": "gold"}}, - expected_output="Response should mention Monday-Friday 9am-5pm and Saturday 10am-2pm", - ), - Evaluable( - eval_input={"user_message": "I need to cancel my order", "customer_profile": {"name": "Bob", "tier": "basic"}}, - expected_output="Should confirm which order and explain the cancellation policy", - ), -]) -``` - -Or build incrementally: - -```python -store = DatasetStore() -store.create("qa-golden-set") -for item in items: - store.append("qa-golden-set", item) -``` - ---- - -## Crafting eval_input items - -Each eval_input must match the **exact data shape** from the reference trace. Look at what the `@observe`-decorated function received as input in Step 2 — same field names, same types, same nesting. 
- -### What goes into eval_input - -| Data category | Example | Source | -| ------------------------ | ------------------------------------------------- | --------------------------------------------------- | -| Application input | User message, query, request body | What a real user would send | -| External dependency data | Customer profile, retrieved documents, DB records | Made up to match the shape from the reference trace | -| Conversation history | Previous messages in a chat | Made up to set up the scenario | -| Configuration / context | Feature flags, session state | Whatever the function expects as arguments | - -### Matching the reference trace shape - -From the reference trace (`pixie trace last`), note: - -1. **Field names** — use the exact same keys (e.g., `user_message` not `message`, `customer_profile` not `profile`) -2. **Types** — if the trace shows a list, use a list; if it shows a nested dict, use a nested dict -3. **Realistic values** — the data should look like something the app would actually receive. Don't use placeholder text like "test input" or "lorem ipsum" - -**Example**: If the reference trace shows the function received: - -```json -{ - "user_message": "I'd like to reschedule my appointment", - "customer_profile": { - "name": "Jane Smith", - "account_id": "A12345", - "tier": "premium" - }, - "conversation_history": [ - { "role": "assistant", "content": "Welcome! How can I help you today?" } - ] -} -``` - -Then every eval_input you make up must have `user_message` (string), `customer_profile` (dict with `name`, `account_id`, `tier`), and `conversation_history` (list of message dicts). - ---- - -## Setting expected_output - -`expected_output` is a **reference for evaluation** — its meaning depends on which evaluator will consume it. 
- -### When to set it - -| Scenario | expected_output value | Evaluator it pairs with | -| ------------------------------------------- | -------------------------------------------------------------------------------------- | ---------------------------------------------------------- | -| Deterministic answer exists | The exact answer: `"Paris"` | `ExactMatchEval`, `FactualityEval`, `ClosedQAEval` | -| Open-ended but has quality criteria | Description of good output: `"Should mention Saturday hours and be under 2 sentences"` | `create_llm_evaluator` with `{expected_output}` in prompt | -| Truly open-ended, no case-specific criteria | Leave as `"UNSET"` or omit | Standalone evaluators (`PossibleEval`, `FaithfulnessEval`) | - -### Universal vs. case-specific criteria - -- **Universal criteria** apply to ALL test cases → implement in the test function's evaluators (e.g., "responses must be concise", "must not hallucinate"). These don't need expected_output. -- **Case-specific criteria** vary per test case → carry as `expected_output` in the dataset item (e.g., "should mention the caller's Tuesday appointment", "should route to billing"). - -### Anti-patterns - -- **Don't generate both eval_output and expected_output from the same source.** If they're identical and you use `ExactMatchEval`, the test is circular and catches zero regressions. -- **Don't use comparison evaluators (`FactualityEval`, `ClosedQAEval`, `ExactMatchEval`) on items without expected_output.** They produce meaningless scores. -- **Don't mix expected_output semantics in one dataset.** If some items use expected_output as a factual answer and others as style guidance, evaluators can't handle both. Split into separate datasets or use separate test functions. - ---- - -## Validating the dataset - -After creating the dataset, check: - -### 1. 
Structural validation - -Every eval_input must match the reference trace's schema: - -- Same fields present -- Same types (string, int, list, dict) -- Same nesting depth -- No extra or missing fields compared to what the function expects - -### 2. Semantic validation - -- **Realistic values** — names, messages, and data look like real-world inputs, not test placeholders -- **Coherent scenarios** — if there's conversation history, it should make topical sense with the user message -- **External dependency data makes sense** — customer profiles have realistic account IDs, retrieved documents are plausible - -### 3. Diversity validation - -- Items have **meaningfully different** inputs — different user intents, different customer types, different edge cases -- Not just minor variations of the same scenario (e.g., don't have 5 items that are all "What are your hours?" with different names) -- Cover: normal cases, edge cases, things the app might plausibly get wrong - -### 4. Expected_output validation - -- case-specific `expected_output` values are specific and testable, not vague -- Items where expected_output is universal don't redundantly carry expected_output - -### 5. Verify by listing - -```bash -pixie dataset list -``` - -Or in the build script: - -```python -ds = store.get("qa-golden-set") -print(f"Dataset has {len(ds.items)} items") -for i, item in enumerate(ds.items): - print(f" [{i}] input keys: {list(item.eval_input.keys()) if isinstance(item.eval_input, dict) else type(item.eval_input)}") - print(f" expected_output: {item.expected_output[:80] if item.expected_output != 'UNSET' else 'UNSET'}...") -``` - ---- - -## Recommended build_dataset.py structure - -Put the build script at `pixie_qa/scripts/build_dataset.py`: - -```python -"""Build the eval dataset with made-up scenarios. - -Each eval_input matches the data shape from the reference trace (Step 2). -Run this script to create/recreate the dataset. 
-""" -from pixie import DatasetStore, Evaluable - -DATASET_NAME = "qa-golden-set" - -def build() -> None: - store = DatasetStore() - - # Recreate fresh - try: - store.delete(DATASET_NAME) - except FileNotFoundError: - pass - store.create(DATASET_NAME) - - items = [ - # Normal case — straightforward question - Evaluable( - eval_input={ - "user_message": "What are your business hours?", - "customer_profile": {"name": "Alice Johnson", "account_id": "C100", "tier": "gold"}, - }, - expected_output="Should mention Mon-Fri 9am-5pm and Sat 10am-2pm", - ), - # Edge case — ambiguous request - Evaluable( - eval_input={ - "user_message": "I want to change something", - "customer_profile": {"name": "Bob Smith", "account_id": "C200", "tier": "basic"}, - }, - expected_output="Should ask for clarification about what to change", - ), - # ... more items covering different scenarios - ] - - for item in items: - store.append(DATASET_NAME, item) - - # Verify - ds = store.get(DATASET_NAME) - print(f"Dataset '{DATASET_NAME}' has {len(ds.items)} items") - for i, entry in enumerate(ds.items): - keys = list(entry.eval_input.keys()) if isinstance(entry.eval_input, dict) else type(entry.eval_input) - print(f" [{i}] input keys: {keys}") - -if __name__ == "__main__": - build() -``` - ---- - -## The cardinal rule - -**`eval_output` is always produced at test time, never stored in the dataset.** The dataset contains `eval_input` (made-up input matching the reference trace shape) and optionally `expected_output` (the reference to judge against). The test's `runnable` function produces `eval_output` by replaying `eval_input` through the real app. 
diff --git a/skills/eval-driven-dev/references/eval-tests.md b/skills/eval-driven-dev/references/eval-tests.md deleted file mode 100644 index dcf046e3..00000000 --- a/skills/eval-driven-dev/references/eval-tests.md +++ /dev/null @@ -1,241 +0,0 @@ -# Eval Tests: Evaluator Selection and Test Writing - -This reference covers Step 5 of the eval-driven-dev process: choosing evaluators, writing the test file, and running `pixie test`. - -**Before writing any test code, re-read `references/pixie-api.md`** (Eval Runner API and Evaluator catalog sections) for exact parameter names and current evaluator signatures — these change when the package is updated. - ---- - -## Evaluator selection - -Choose evaluators based on the **output type** and your eval criteria from Step 1, not the app type. - -### Decision table - -| Output type | Evaluator category | Examples | -| ----------------------------------------------------------- | ----------------------------------------------------------------------- | ----------------------------------------- | -| Deterministic (classification labels, yes/no, fixed-format) | Heuristic: `ExactMatchEval`, `JSONDiffEval`, `ValidJSONEval` | Label classification, JSON extraction | -| Open-ended text with a reference answer | LLM-as-judge: `FactualityEval`, `ClosedQAEval`, `AnswerCorrectnessEval` | Chatbot responses, QA, summaries | -| Text with expected context/grounding | RAG evaluators: `FaithfulnessEval`, `ContextRelevancyEval` | RAG pipelines, context-grounded responses | -| Text with style/format requirements | Custom LLM-as-judge via `create_llm_evaluator` | Voice-friendly responses, tone checks | -| Multi-aspect quality | Multiple evaluators combined | Factuality + relevance + tone | - -### Critical rules - -- For open-ended LLM text, **never** use `ExactMatchEval`. LLM outputs are non-deterministic — exact match will either always fail or always pass (if comparing against the same output). Use LLM-as-judge evaluators instead. 
-- `AnswerRelevancyEval` is **RAG-only** — it requires a `context` value in the trace. Returns 0.0 without it. For general relevance without RAG, use `create_llm_evaluator` with a custom prompt. -- Do NOT use comparison evaluators (`FactualityEval`, `ClosedQAEval`, `ExactMatchEval`) on items without `expected_output` — they produce meaningless scores. - -### When `expected_output` IS available - -Use comparison-based evaluators: - -| Evaluator | Use when | -| ----------------------- | ---------------------------------------------------------- | -| `FactualityEval` | Output is factually correct compared to reference | -| `ClosedQAEval` | Output matches the expected answer | -| `ExactMatchEval` | Exact string match (structured/deterministic outputs only) | -| `AnswerCorrectnessEval` | Answer is correct vs reference | - -### When `expected_output` is NOT available - -Use standalone evaluators that judge quality without a reference: - -| Evaluator | Use when | Note | -| ---------------------- | ------------------------------------- | ---------------------------------------------------------------- | -| `FaithfulnessEval` | Response faithful to provided context | RAG pipelines | -| `ContextRelevancyEval` | Retrieved context relevant to query | RAG pipelines | -| `AnswerRelevancyEval` | Answer addresses the question | **RAG only** — needs `context` in trace. Returns 0.0 without it. | -| `PossibleEval` | Output is plausible / feasible | General purpose | -| `ModerationEval` | Output is safe and appropriate | Content safety | -| `SecurityEval` | No security vulnerabilities | Security check | - -For non-RAG apps needing response relevance, write a `create_llm_evaluator` instead. 
- ---- - -## Custom evaluators - -### `create_llm_evaluator` factory - -Use when the quality dimension is domain-specific and no built-in evaluator fits: - -```python -from pixie import create_llm_evaluator - -concise_voice_style = create_llm_evaluator( - name="ConciseVoiceStyle", - prompt_template=""" - You are evaluating whether this response is concise and phone-friendly. - - Input: {eval_input} - Response: {eval_output} - - Score 1.0 if the response is concise (under 3 sentences), directly addresses - the question, and uses conversational language suitable for a phone call. - Score 0.0 if it's verbose, off-topic, or uses written-style formatting. - """, -) -``` - -**How template variables work**: `{eval_input}`, `{eval_output}`, `{expected_output}` are the only placeholders. Each is replaced with a string representation of the corresponding `Evaluable` field — if the field is a dict or list, it becomes a JSON string. The LLM judge sees the full serialized value. - -**Rules**: - -- **Only `{eval_input}`, `{eval_output}`, `{expected_output}`** — no nested access like `{eval_input[key]}` (this will crash with a `TypeError`) -- **Keep templates short and direct** — the system prompt already tells the LLM to return `Score: X.X`. Your template just needs to present the data and define the scoring criteria. -- **Don't instruct the LLM to "parse" or "extract" data** — just present the values and state the criteria. The LLM can read JSON naturally. - -**Non-RAG response relevance** (instead of `AnswerRelevancyEval`): - -```python -response_relevance = create_llm_evaluator( - name="ResponseRelevance", - prompt_template=""" - You are evaluating whether a customer support response is relevant and helpful. - - Input: {eval_input} - Response: {eval_output} - Expected: {expected_output} - - Score 1.0 if the response directly addresses the question and meets expectations. - Score 0.5 if partially relevant but misses important aspects. 
-    Score 0.0 if off-topic, ignores the question, or contradicts expectations.
-    """,
-)
-```
-
-### Manual custom evaluator
-
-```python
-from pixie import Evaluation, Evaluable
-
-async def my_evaluator(evaluable: Evaluable, *, trace=None) -> Evaluation:
-    # evaluable.eval_input — what was passed to the observed function
-    # evaluable.eval_output — what the function returned
-    # evaluable.expected_output — reference answer (UNSET if not provided)
-    score = 1.0 if "expected pattern" in str(evaluable.eval_output) else 0.0
-    return Evaluation(score=score, reasoning="...")
-```
-
----
-
-## Writing the test file
-
-Create `pixie_qa/tests/test_<name>.py`. The pattern: a `runnable` adapter that calls the app's production function, plus `async` test functions that `await` `assert_dataset_pass`.
-
-**Before writing any test code, re-read the `assert_dataset_pass` API reference below.** The exact parameter names matter — using `dataset=` instead of `dataset_name=`, or omitting `await`, will cause failures that are hard to debug. Do not rely on memory from earlier in the conversation.
-
-### Test file template
-
-```python
-from pixie import enable_storage, assert_dataset_pass, FactualityEval, ScoreThreshold, last_llm_call
-
-from myapp import answer_question
-
-
-def runnable(eval_input):
-    """Replays one dataset item through the app.
-
-    Calls the same function the production app uses.
-    enable_storage() here ensures traces are captured during eval runs.
- """ - enable_storage() - answer_question(**eval_input) - - -async def test_answer_quality(): - await assert_dataset_pass( - runnable=runnable, - dataset_name="qa-golden-set", - evaluators=[FactualityEval()], - pass_criteria=ScoreThreshold(threshold=0.7, pct=0.8), - from_trace=last_llm_call, - ) -``` - -### `assert_dataset_pass` API — exact parameter names - -```python -await assert_dataset_pass( - runnable=runnable, # callable that takes eval_input dict - dataset_name="my-dataset", # NOT dataset_path — name of dataset created in Step 4 - evaluators=[...], # list of evaluator instances - pass_criteria=ScoreThreshold( # NOT thresholds — ScoreThreshold object - threshold=0.7, # minimum score to count as passing - pct=0.8, # fraction of items that must pass - ), - from_trace=last_llm_call, # which span to extract eval data from -) -``` - -### Common mistakes that break tests - -| Mistake | Symptom | Fix | -| ------------------------ | ------------------------------------------------------------------- | --------------------------------------------- | -| `def test_...():` (sync) | RuntimeWarning "coroutine was never awaited", test passes vacuously | Use `async def test_...():` | -| No `await` | Same: "coroutine was never awaited" | Add `await` before `assert_dataset_pass(...)` | -| `dataset_path="..."` | TypeError: unexpected keyword argument | Use `dataset_name="..."` | -| `thresholds={...}` | TypeError: unexpected keyword argument | Use `pass_criteria=ScoreThreshold(...)` | -| Omitting `from_trace` | Evaluator may not find the right span | Add `from_trace=last_llm_call` | - -**If `pixie test` shows "No assert_pass / assert_dataset_pass calls recorded"**, the test passed vacuously because `assert_dataset_pass` was never awaited. Fix the async signature and await immediately. 
- -### Multiple test functions - -Split into separate test functions when you have different evaluator sets: - -```python -async def test_factual_answers(): - """Test items that have deterministic expected outputs.""" - await assert_dataset_pass( - runnable=runnable, - dataset_name="qa-deterministic", - evaluators=[FactualityEval()], - pass_criteria=ScoreThreshold(threshold=0.7, pct=0.8), - from_trace=last_llm_call, - ) - -async def test_response_style(): - """Test open-ended quality criteria.""" - await assert_dataset_pass( - runnable=runnable, - dataset_name="qa-open-ended", - evaluators=[concise_voice_style], - pass_criteria=ScoreThreshold(threshold=0.6, pct=0.8), - from_trace=last_llm_call, - ) -``` - -### Key points - -- `enable_storage()` belongs inside the `runnable`, not at module level — it needs to fire on each invocation so the trace is captured for that specific run. -- The `runnable` imports and calls the **same function** that production uses — the app's entry point, going through the utility function from Step 3. -- If the `runnable` calls a different function than what the utility function calls, something is wrong. -- The `eval_input` dict should contain **only the semantic arguments** the function needs (e.g., `question`, `messages`, `context`). The `@observe` decorator automatically strips `self` and `cls`. -- **Choose evaluators that match your data.** If dataset items have `expected_output`, use comparison evaluators. If not, use standalone evaluators. - ---- - -## Running tests - -The test runner is `pixie test` (not `pytest`): - -```bash -uv run pixie test # run all test_*.py in current directory -uv run pixie test pixie_qa/tests/ # specify path -uv run pixie test -k factuality # filter by name -uv run pixie test -v # verbose: shows per-case scores and reasoning -``` - -`pixie test` automatically loads the `.env` file before running tests, so API keys do not need to be exported in the shell. No `sys.path` hacks are needed in test files. 
- -The `-v` flag is important: it shows per-case scores and evaluator reasoning, which makes it much easier to see what's passing and what isn't. - -### After running, verify the scorecard - -1. Shows "N/M tests passed" with real numbers -2. Does NOT say "No assert_pass / assert_dataset_pass calls recorded" (that means missing `await`) -3. Per-evaluator scores appear with real values - -A test that passes with no recorded evaluations is worse than a failing test — it gives false confidence. Debug until real scores appear. diff --git a/skills/eval-driven-dev/references/evaluators.md b/skills/eval-driven-dev/references/evaluators.md new file mode 100644 index 00000000..4e9cce89 --- /dev/null +++ b/skills/eval-driven-dev/references/evaluators.md @@ -0,0 +1,531 @@ +# Built-in Evaluators + +> Auto-generated from pixie source code docstrings. +> Do not edit by hand — regenerate from the upstream [pixie-qa](https://github.com/yiouli/pixie-qa) source repository. + +Autoevals adapters — pre-made evaluators wrapping `autoevals` scorers. + +This module provides :class:`AutoevalsAdapter`, which bridges the +autoevals `Scorer` interface to pixie's `Evaluator` protocol, and +a set of factory functions for common evaluation tasks. + +Public API (all are also re-exported from `pixie.evals`): + +**Core adapter:** - :class:`AutoevalsAdapter` — generic wrapper for any autoevals `Scorer`. + +**Heuristic scorers (no LLM required):** - :func:`LevenshteinMatch` — edit-distance string similarity. - :func:`ExactMatch` — exact value comparison. - :func:`NumericDiff` — normalised numeric difference. - :func:`JSONDiff` — structural JSON comparison. - :func:`ValidJSON` — JSON syntax / schema validation. - :func:`ListContains` — overlap between two string lists. + +**Embedding scorer:** - :func:`EmbeddingSimilarity` — cosine similarity via embeddings. 
+ +**LLM-as-judge scorers:** - :func:`Factuality`, :func:`ClosedQA`, :func:`Battle`, +:func:`Humor`, :func:`Security`, :func:`Sql`, +:func:`Summary`, :func:`Translation`, :func:`Possible`. + +**Moderation:** - :func:`Moderation` — OpenAI content-moderation check. + +**RAGAS metrics:** - :func:`ContextRelevancy`, :func:`Faithfulness`, +:func:`AnswerRelevancy`, :func:`AnswerCorrectness`. + +## Evaluator Selection Guide + +Choose evaluators based on the **output type** and eval criteria: + +| Output type | Evaluator category | Examples | +| -------------------------------------------- | ----------------------------------------------------------- | ------------------------------------- | +| Deterministic (labels, yes/no, fixed-format) | Heuristic: `ExactMatch`, `JSONDiff`, `ValidJSON` | Label classification, JSON extraction | +| Open-ended text with a reference answer | LLM-as-judge: `Factuality`, `ClosedQA`, `AnswerCorrectness` | Chatbot responses, QA, summaries | +| Text with expected context/grounding | RAG: `Faithfulness`, `ContextRelevancy` | RAG pipelines | +| Text with style/format requirements | Custom via `create_llm_evaluator` | Voice-friendly responses, tone checks | +| Multi-aspect quality | Multiple evaluators combined | Factuality + relevance + tone | + +Critical rules: + +- For open-ended LLM text, **never** use `ExactMatch` — LLM outputs are + non-deterministic. +- `AnswerRelevancy` is **RAG-only** — requires `context` in the trace. + Returns 0.0 without it. For general relevance, use `create_llm_evaluator`. +- Do NOT use comparison evaluators (`Factuality`, `ClosedQA`, + `ExactMatch`) on items without `expected_output` — they produce + meaningless scores. + +--- + +## Evaluator Reference + +### `AnswerCorrectness` + +```python +AnswerCorrectness(*, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Answer correctness evaluator (RAGAS). 
+ +Judges whether `eval_output` is correct compared to +`expected_output`, combining factual similarity and semantic +similarity. + +**When to use**: QA scenarios in RAG pipelines where you have a +reference answer and want a comprehensive correctness score. + +**Requires `expected_output`**: Yes. +**Requires `eval_metadata["context"]`**: Optional (improves accuracy). + +Args: +client: OpenAI client instance. + +### `AnswerRelevancy` + +```python +AnswerRelevancy(*, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Answer relevancy evaluator (RAGAS). + +Judges whether `eval_output` directly addresses the question in +`eval_input`. + +**When to use**: RAG pipelines only — requires `context` in the +trace. Returns 0.0 without it. For general (non-RAG) response +relevance, use `create_llm_evaluator` with a custom prompt instead. + +**Requires `expected_output`**: No. +**Requires `eval_metadata["context"]`**: Yes — **RAG pipelines only**. + +Args: +client: OpenAI client instance. + +### `Battle` + +```python +Battle(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Head-to-head comparison evaluator (LLM-as-judge). + +Uses an LLM to compare `eval_output` against `expected_output` +and determine which is better given the instructions in `eval_input`. + +**When to use**: A/B testing scenarios, comparing model outputs, +or ranking alternative responses. + +**Requires `expected_output`**: Yes. + +Args: +model: LLM model name. +client: OpenAI client instance. + +### `ClosedQA` + +```python +ClosedQA(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Closed-book question-answering evaluator (LLM-as-judge). + +Uses an LLM to judge whether `eval_output` correctly answers the +question in `eval_input` compared to `expected_output`. Optionally +forwards `eval_metadata["criteria"]` for custom grading criteria. + +**When to use**: QA scenarios where the answer should match a reference — +e.g. 
customer support answers, knowledge-base queries. + +**Requires `expected_output`**: Yes — do NOT use on items without +`expected_output`; produces meaningless scores. + +Args: +model: LLM model name. +client: OpenAI client instance. + +### `ContextRelevancy` + +```python +ContextRelevancy(*, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Context relevancy evaluator (RAGAS). + +Judges whether the retrieved context is relevant to the query. +Forwards `eval_metadata["context"]` to the underlying scorer. + +**When to use**: RAG pipelines — evaluating retrieval quality. + +**Requires `expected_output`**: Yes. +**Requires `eval_metadata["context"]`**: Yes (RAG pipelines only). + +Args: +client: OpenAI client instance. + +### `EmbeddingSimilarity` + +```python +EmbeddingSimilarity(*, prefix: 'str | None' = None, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Embedding-based semantic similarity evaluator. + +Computes cosine similarity between embedding vectors of `eval_output` +and `expected_output`. + +**When to use**: Comparing semantic meaning of two texts when exact +wording doesn't matter. More robust than Levenshtein for paraphrased +content but less nuanced than LLM-as-judge evaluators. + +**Requires `expected_output`**: Yes. + +Args: +prefix: Optional text to prepend for domain context. +model: Embedding model name. +client: OpenAI client instance. + +### `ExactMatch` + +```python +ExactMatch() -> 'AutoevalsAdapter' +``` + +Exact value comparison evaluator. + +Returns 1.0 if `eval_output` exactly equals `expected_output`, +0.0 otherwise. + +**When to use**: Deterministic, structured outputs (classification labels, +yes/no answers, fixed-format strings). **Never** use for open-ended LLM +text — LLM outputs are non-deterministic, so exact match will almost always +fail. + +**Requires `expected_output`**: Yes. 
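The all-or-nothing scoring is worth internalising. A rough sketch of the comparison it performs (illustrative only; the real adapter delegates to the autoevals `ExactMatch` scorer):

```python
def exact_match_score(eval_output, expected_output) -> float:
    # 1.0 only on exact equality; any variation at all scores 0.0,
    # which is why this evaluator is unsuitable for open-ended LLM text.
    return 1.0 if eval_output == expected_output else 0.0

exact_match_score("billing", "billing")                 # deterministic label -> 1.0
exact_match_score("The answer is billing.", "billing")  # paraphrase -> 0.0
```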
+ +### `Factuality` + +```python +Factuality(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Factual accuracy evaluator (LLM-as-judge). + +Uses an LLM to judge whether `eval_output` is factually consistent +with `expected_output` given the `eval_input` context. + +**When to use**: Open-ended text where factual correctness matters +(chatbot responses, QA answers, summaries). Preferred over +`ExactMatch` for LLM-generated text. + +**Requires `expected_output`**: Yes — do NOT use on items without +`expected_output`; produces meaningless scores. + +Args: +model: LLM model name. +client: OpenAI client instance. + +### `Faithfulness` + +```python +Faithfulness(*, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Faithfulness evaluator (RAGAS). + +Judges whether `eval_output` is faithful to (i.e. supported by) +the provided context. Forwards `eval_metadata["context"]`. + +**When to use**: RAG pipelines — ensuring the answer doesn't +hallucinate beyond what the retrieved context supports. + +**Requires `expected_output`**: No. +**Requires `eval_metadata["context"]`**: Yes (RAG pipelines only). + +Args: +client: OpenAI client instance. + +### `Humor` + +```python +Humor(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Humor quality evaluator (LLM-as-judge). + +Uses an LLM to judge the humor quality of `eval_output` against +`expected_output`. + +**When to use**: Evaluating humor in creative writing, chatbot +personality, or entertainment applications. + +**Requires `expected_output`**: Yes. + +Args: +model: LLM model name. +client: OpenAI client instance. + +### `JSONDiff` + +```python +JSONDiff(*, string_scorer: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Structural JSON comparison evaluator. + +Recursively compares two JSON structures and produces a similarity +score. Handles nested objects, arrays, and mixed types. 
+ +**When to use**: Structured JSON outputs where field-level comparison +is needed (e.g. extracted data, API response schemas, tool call arguments). + +**Requires `expected_output`**: Yes. + +Args: +string_scorer: Optional pairwise scorer for string fields. + +### `LevenshteinMatch` + +```python +LevenshteinMatch() -> 'AutoevalsAdapter' +``` + +Edit-distance string similarity evaluator. + +Computes a normalised Levenshtein distance between `eval_output` and +`expected_output`. Returns 1.0 for identical strings and decreasing +scores as edit distance grows. + +**When to use**: Deterministic or near-deterministic outputs where small +textual variations are acceptable (e.g. formatting differences, minor +spelling). Not suitable for open-ended LLM text — use an LLM-as-judge +evaluator instead. + +**Requires `expected_output`**: Yes. + +### `ListContains` + +```python +ListContains(*, pairwise_scorer: 'Any' = None, allow_extra_entities: 'bool' = False) -> 'AutoevalsAdapter' +``` + +List overlap evaluator. + +Checks whether `eval_output` contains all items from +`expected_output`. Scores based on overlap ratio. + +**When to use**: Outputs that produce a list of items where completeness +matters (e.g. extracted entities, search results, recommendations). + +**Requires `expected_output`**: Yes. + +Args: +pairwise_scorer: Optional scorer for pairwise element comparison. +allow_extra_entities: If True, extra items in output are not penalised. + +### `Moderation` + +```python +Moderation(*, threshold: 'float | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Content moderation evaluator. + +Uses the OpenAI moderation API to check `eval_output` for unsafe +content (hate speech, violence, self-harm, etc.). + +**When to use**: Any application where output safety is a concern — +chatbots, content generation, user-facing AI. + +**Requires `expected_output`**: No. + +Args: +threshold: Custom flagging threshold. +client: OpenAI client instance. 
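If the moderation endpoint is unavailable during local runs, a crude keyword check can mimic the same pass/fail scoring shape. Purely illustrative (the blocklist and function name are invented here) and not a substitute for the real `Moderation` evaluator:

```python
BLOCKLIST = {"violence", "self-harm"}  # illustrative categories only

def crude_safety_score(eval_output: str) -> float:
    # 1.0 = safe, 0.0 = flagged; mirrors Moderation's pass/fail shape
    text = eval_output.lower()
    return 0.0 if any(term in text for term in BLOCKLIST) else 1.0

crude_safety_score("Here is your refund status.")  # -> 1.0
```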
+ +### `NumericDiff` + +```python +NumericDiff() -> 'AutoevalsAdapter' +``` + +Normalised numeric difference evaluator. + +Computes a normalised numeric distance between `eval_output` and +`expected_output`. Returns 1.0 for identical numbers and decreasing +scores as the difference grows. + +**When to use**: Numeric outputs where approximate equality is acceptable +(e.g. price calculations, scores, measurements). + +**Requires `expected_output`**: Yes. + +### `Possible` + +```python +Possible(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Feasibility / plausibility evaluator (LLM-as-judge). + +Uses an LLM to judge whether `eval_output` is a plausible or +feasible response. + +**When to use**: General-purpose quality check when you want to +verify outputs are reasonable without a specific reference answer. + +**Requires `expected_output`**: No. + +Args: +model: LLM model name. +client: OpenAI client instance. + +### `Security` + +```python +Security(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Security vulnerability evaluator (LLM-as-judge). + +Uses an LLM to check `eval_output` for security vulnerabilities +based on the instructions in `eval_input`. + +**When to use**: Code generation, SQL output, or any scenario +where output must be checked for injection or vulnerability risks. + +**Requires `expected_output`**: No. + +Args: +model: LLM model name. +client: OpenAI client instance. + +### `Sql` + +```python +Sql(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +SQL equivalence evaluator (LLM-as-judge). + +Uses an LLM to judge whether `eval_output` SQL is semantically +equivalent to `expected_output` SQL. + +**When to use**: Text-to-SQL applications where the generated SQL +should be functionally equivalent to a reference query. + +**Requires `expected_output`**: Yes. + +Args: +model: LLM model name. +client: OpenAI client instance. 
+ +### `Summary` + +```python +Summary(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Summarisation quality evaluator (LLM-as-judge). + +Uses an LLM to judge the quality of `eval_output` as a summary +compared to the reference summary in `expected_output`. + +**When to use**: Summarisation tasks where the output must capture +key information from the source material. + +**Requires `expected_output`**: Yes. + +Args: +model: LLM model name. +client: OpenAI client instance. + +### `Translation` + +```python +Translation(*, language: 'str | None' = None, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter' +``` + +Translation quality evaluator (LLM-as-judge). + +Uses an LLM to judge the translation quality of `eval_output` +compared to `expected_output` in the target language. + +**When to use**: Machine translation or multilingual output scenarios. + +**Requires `expected_output`**: Yes. + +Args: +language: Target language (e.g. `"Spanish"`). +model: LLM model name. +client: OpenAI client instance. + +### `ValidJSON` + +```python +ValidJSON(*, schema: 'Any' = None) -> 'AutoevalsAdapter' +``` + +JSON syntax and schema validation evaluator. + +Returns 1.0 if `eval_output` is valid JSON (and optionally matches +the provided schema), 0.0 otherwise. + +**When to use**: Outputs that must be valid JSON — optionally conforming +to a specific schema (e.g. tool call responses, structured extraction). + +**Requires `expected_output`**: No. + +Args: +schema: Optional JSON Schema to validate against. + +--- + +## Custom Evaluators: `create_llm_evaluator` + +Factory for custom LLM-as-judge evaluators from prompt templates. + +Usage:: + + from pixie import create_llm_evaluator + + concise_voice_style = create_llm_evaluator( + name="ConciseVoiceStyle", + prompt_template=""" + You are evaluating whether a voice agent response is concise and + phone-friendly. 
+ + User said: {eval_input} + Agent responded: {eval_output} + Expected behavior: {expectation} + + Score 1.0 if the response is concise (under 3 sentences), directly + addresses the question, and uses conversational language suitable for + a phone call. Score 0.0 if it's verbose, off-topic, or uses + written-style formatting. + """, + ) + +### `create_llm_evaluator` + +```python +create_llm_evaluator(name: 'str', prompt_template: 'str', *, model: 'str' = 'gpt-4o-mini', client: 'Any | None' = None) -> '_LLMEvaluator' +``` + +Create a custom LLM-as-judge evaluator from a prompt template. + +The template may reference these variables (populated from the +:class:`~pixie.storage.evaluable.Evaluable` fields): + +- `{eval_input}` — the evaluable's input data. Single-item lists expand + to that item's value; multi-item lists expand to a JSON dict of + `name → value` pairs. +- `{eval_output}` — the evaluable's output data (same rule as + `eval_input`). +- `{expectation}` — the evaluable's expected output + +Args: +name: Display name for the evaluator (shown in scorecard). +prompt_template: A string template with `{eval_input}`, +`{eval_output}`, and/or `{expectation}` placeholders. +model: OpenAI model name (default: `gpt-4o-mini`). +client: Optional pre-configured OpenAI client instance. + +Returns: +An evaluator callable satisfying the `Evaluator` protocol. + +Raises: +ValueError: If the template uses nested field access like +`{eval_input[key]}` (only top-level placeholders are supported). diff --git a/skills/eval-driven-dev/references/instrumentation.md b/skills/eval-driven-dev/references/instrumentation.md deleted file mode 100644 index 9f8deef0..00000000 --- a/skills/eval-driven-dev/references/instrumentation.md +++ /dev/null @@ -1,174 +0,0 @@ -# Instrumentation - -This reference covers the tactical implementation of instrumentation in Step 2: how to use `@observe`, `enable_storage()`, and `start_observation` correctly. 
- -For full API signatures and all available parameters, see `references/pixie-api.md` (Instrumentation API section). - -For guidance on **what** to instrument (which functions, based on your eval criteria), see Step 2a in the main skill instructions. - ---- - -## Adding `enable_storage()` at application startup - -Call `enable_storage()` once at the beginning of the application's startup code — inside `main()`, or at the top of a server's initialization. **Never at module level** (top of a file outside any function), because that causes storage setup to trigger on import. - -Good places: - -- Inside `if __name__ == "__main__":` blocks -- In a FastAPI `lifespan` or `on_startup` handler -- At the top of `main()` / `run()` functions -- Inside the `runnable` function in test files - -```python -# ✅ CORRECT — at application startup -async def main(): - enable_storage() - ... - -# ✅ CORRECT — in a runnable for tests -def runnable(eval_input): - enable_storage() - my_function(**eval_input) - -# ❌ WRONG — at module level, runs on import -from pixie import enable_storage -enable_storage() # this runs when any file imports this module! -``` - ---- - -## Wrapping functions with `@observe` or `start_observation` - -Instrument the **existing function** that the app actually calls during normal operation. The `@observe` decorator or `start_observation` context manager goes on the production code path — not on new helper functions created for testing. - -```python -# ✅ CORRECT — decorating the existing production function -from pixie import observe - -@observe(name="answer_question") -def answer_question(question: str, context: str) -> str: # existing function - ... 
# existing code, unchanged -``` - -```python -# ✅ CORRECT — decorating a class method (works exactly the same way) -from pixie import observe - -class OpenAIAgent: - def __init__(self, model: str = "gpt-4o-mini"): - self.client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) - self.model = model - - @observe(name="openai_agent_respond") - def respond(self, user_message: str, conversation_history: list | None = None) -> str: - # existing code, unchanged — @observe handles `self` automatically - messages = [{"role": "system", "content": SYSTEM_PROMPT}] - if conversation_history: - messages.extend(conversation_history) - messages.append({"role": "user", "content": user_message}) - response = self.client.chat.completions.create(model=self.model, messages=messages) - return response.choices[0].message.content or "" -``` - -**`@observe` handles `self` and `cls` automatically** — it strips them from the captured input so only the meaningful arguments appear in traces. Do NOT create wrapper methods or call unbound methods to work around this. Just decorate the existing method directly. - -```python -# ✅ CORRECT — context manager inside an existing function -from pixie import start_observation - -async def main(): # existing function - ... - with start_observation(input={"user_input": user_input}, name="handle_turn") as obs: - result = await Runner.run(current_agent, input_items, context=context) - # ... existing response handling ... - obs.set_output(response_text) - ... -``` - ---- - -## Anti-patterns to avoid - -### Creating new wrapper functions - -```python -# ❌ WRONG — creating a new function that duplicates logic from main() -@observe(name="run_for_eval") -async def run_for_eval(user_messages: list[str]) -> str: - # This duplicates what main() does, creating a separate code path - # that diverges from production. Don't do this. - ... 
-``` - -### Creating wrapper methods instead of decorating the existing method - -```python -# ❌ WRONG — creating a new _respond_observed wrapper method -class OpenAIAgent: - def respond(self, user_message, conversation_history=None): - result = self._respond_observed({ - 'user_message': user_message, - 'conversation_history': conversation_history, - }) - return result['result'] - - @observe - def _respond_observed(self, args): - # WRONG: creates a separate code path, changes the interface, - # and breaks when called as an unbound method. - ... - -# ✅ CORRECT — just decorate the existing method directly -class OpenAIAgent: - @observe(name="openai_agent_respond") - def respond(self, user_message, conversation_history=None): - ... # existing code, unchanged -``` - -### Bypassing the app by calling the LLM directly - -```python -# ❌ WRONG — calling the LLM directly instead of calling the app's function -@observe(name="agent_answer_question") -def answer_question(question: str) -> str: - # This bypasses the entire app and calls OpenAI directly. - # You're testing a script you just wrote, not the user's app. - response = client.responses.create( - model="gpt-4.1", - input=[{"role": "user", "content": question}], - ) - return response.output_text -``` - ---- - -## Rules - -- **Never add new wrapper functions** to the application code for eval purposes. -- **Never bypass the app by calling the LLM provider directly** — if you find yourself writing `client.responses.create(...)` or `openai.ChatCompletion.create(...)` in a test or utility function, you're not testing the app. Import and call the app's own function instead. -- **Never change the function's interface** (arguments, return type, behavior). -- **Never duplicate production logic** into a separate "testable" function. -- The instrumentation is purely additive — if you removed all pixie imports and decorators, the app would work identically. 
-- After instrumentation, call `flush()` at the end of runs to make sure all spans are written. -- For interactive apps (CLI loops, chat interfaces), instrument the **per-turn processing** function — the one that takes user input and produces a response. The eval `runnable` should call this same function. - -**Import rule**: All pixie symbols are importable from the top-level `pixie` package. Never import from submodules (`pixie.instrumentation`, `pixie.evals`, `pixie.storage.evaluable`, etc.) — always use `from pixie import ...`. - ---- - -## What to instrument based on eval criteria - -**LLM provider calls are auto-captured.** When you call `enable_storage()`, pixie activates OpenInference instrumentors that automatically trace every LLM API call (OpenAI, Anthropic, Google, etc.) with full input/output messages, token usage, and model parameters. You do NOT need `@observe` on a function just because it contains an LLM call — the LLM call is already instrumented. - -**Use `@observe` for application-level functions** whose inputs, outputs, or intermediate states your evaluators need but that aren't visible from the LLM call alone: - -| What your evaluator needs | What to instrument with `@observe` | -| ---------------------------------------------------------- | ------------------------------------------------------------------------ | -| App-level input/output (what user sent, what app returned) | The app's entry-point or per-turn processing function | -| Retrieved context (for faithfulness/grounding checks) | The retrieval function — captures what documents were fetched | -| Routing/dispatch decisions | The routing function — captures which tool/agent/department was selected | -| Side-effects sent to external systems | The function that writes to the external system — captures what was sent | -| Conversation history handling | The per-turn processing function — captures how history is assembled | -| Intermediate processing stages | Each intermediate function — 
captures each stage | - -If your eval criteria can be fully assessed from the auto-captured LLM inputs and outputs alone, you may not need `@observe` at all. But typically you need at least one `@observe` on the app's entry-point function to capture the application-level input/output shape that the dataset and evaluators work with. diff --git a/skills/eval-driven-dev/references/pixie-api.md b/skills/eval-driven-dev/references/pixie-api.md deleted file mode 100644 index 279cce49..00000000 --- a/skills/eval-driven-dev/references/pixie-api.md +++ /dev/null @@ -1,257 +0,0 @@ -# pixie API Reference - -> This file is auto-generated by `generate_api_doc` from the -> live pixie-qa package. Do not edit by hand — run -> `generate_api_doc` to regenerate after updating pixie-qa. - -## Configuration - -All settings read from environment variables at call time. By default, -every artefact lives inside a single `pixie_qa` project directory: - -| Variable | Default | Description | -| ------------------- | -------------------------- | ---------------------------------- | -| `PIXIE_ROOT` | `pixie_qa` | Root directory for all artefacts | -| `PIXIE_DB_PATH` | `pixie_qa/observations.db` | SQLite database file path | -| `PIXIE_DB_ENGINE` | `sqlite` | Database engine (currently sqlite) | -| `PIXIE_DATASET_DIR` | `pixie_qa/datasets` | Directory for dataset JSON files | - ---- - -## Instrumentation API (`pixie`) - -```python -from pixie import enable_storage, observe, start_observation, flush, init, add_handler -``` - -| Function / Decorator | Signature | Notes | -| -------------------- | ------------------------------------------------------------ | --------------------------------------------------------------------------------------------------- | -| `observe` | `observe(name: 'str | None' = None) -> 'Callable[[Callable[P, T]], Callable[P, T]]'` | Wraps a sync or async function. Captures all kwargs as `eval_input`, return value as `eval_output`. 
| -| `enable_storage` | `enable_storage() -> 'StorageHandler'` | Idempotent. Creates DB, registers handler. Call at app startup. | -| `start_observation` | `start_observation(*, input: 'JsonValue', name: 'str | None' = None) -> 'Generator[ObservationContext, None, None]'` | Manual span. Call `obs.set_output(v)` and `obs.set_metadata(key, value)` inside. | -| `flush` | `flush(timeout_seconds: 'float' = 5.0) -> 'bool'` | Drains the queue. Call after a run before using CLI commands. | -| `init` | `init(*, capture_content: 'bool' = True, queue_size: 'int' = 1000) -> 'None'` | Called internally by `enable_storage`. Idempotent. | -| `add_handler` | `add_handler(handler: 'InstrumentationHandler') -> 'None'` | Register a custom handler (must call `init()` first). | -| `remove_handler` | `remove_handler(handler: 'InstrumentationHandler') -> 'None'` | Unregister a previously added handler. | - ---- - -## CLI Commands - -```bash -# Trace inspection -pixie trace list [--limit N] [--errors] # show recent traces -pixie trace show [--verbose] [--json] # show span tree for a trace -pixie trace last [--json] # show most recent trace (verbose) - -# Dataset management -pixie dataset create -pixie dataset list -pixie dataset save # root span (default) -pixie dataset save --select last_llm_call # last LLM call -pixie dataset save --select by_name --span-name -pixie dataset save --notes "some note" -echo '"expected value"' | pixie dataset save --expected-output - -# Run eval tests -pixie test [path] [-k filter_substring] [-v] -``` - -### `pixie trace` commands - -**`pixie trace list`** — show recent traces with summary info (trace ID, root span, timestamp, span count, errors). - -- `--limit N` (default 10) — number of traces to show -- `--errors` — show only traces with errors - -**`pixie trace show `** — show the span tree for a specific trace. 
- -- Default (compact): span names, types, timing -- `--verbose` / `-v`: full input/output data for each span -- `--json`: machine-readable JSON output -- Trace ID accepts prefix match (first 8+ characters) - -**`pixie trace last`** — shortcut to show the most recent trace in verbose mode. This is the primary command to use after running the harness. - -- `--json`: machine-readable JSON output - -**`pixie dataset save` selection modes:** - -- `root` (default) — the outermost `@observe` or `start_observation` span -- `last_llm_call` — the most recent LLM API call span in the trace -- `by_name` — a span matching the `--span-name` argument (takes the last matching span) - ---- - -## Dataset Python API - -```python -from pixie import DatasetStore, Evaluable -``` - -```python -store = DatasetStore() # reads PIXIE_DATASET_DIR -store.append(...) # add one or more items -store.create(...) # create empty / create with items -store.delete(...) # delete entirely -store.get(...) # returns Dataset -store.list(...) # list names -store.list_details(...) # list names with metadata -store.remove(...) # remove by index -``` - -**`Evaluable` fields:** - -- `eval_input`: the input (what `@observe` captured as function kwargs) -- `eval_output`: the output (return value of the observed function) -- `eval_metadata`: dict of extra info (trace_id, span_id, provider, token counts, etc.) 
— always includes `trace_id` and `span_id` -- `expected_output`: reference answer for comparison (`UNSET` if not provided) - ---- - -## ObservationStore Python API - -```python -from pixie import ObservationStore - -store = ObservationStore() # reads PIXIE_DB_PATH -await store.create_tables() -``` - -```python -await store.create_tables(self) -> 'None' -await store.get_by_name(self, name: 'str', trace_id: 'str | None' = None) -> 'list[ObserveSpan | LLMSpan]' # → list of spans -await store.get_by_type(self, span_kind: 'str', trace_id: 'str | None' = None) -> 'list[ObserveSpan | LLMSpan]' # → list of spans filtered by kind -await store.get_errors(self, trace_id: 'str | None' = None) -> 'list[ObserveSpan | LLMSpan]' # → list of error spans -await store.get_last_llm(self, trace_id: 'str') -> 'LLMSpan | None' # → most recent LLMSpan -await store.get_root(self, trace_id: 'str') -> 'ObserveSpan' # → root ObserveSpan -await store.get_trace(self, trace_id: 'str') -> 'list[ObservationNode]' # → list[ObservationNode] (tree) -await store.get_trace_flat(self, trace_id: 'str') -> 'list[ObserveSpan | LLMSpan]' # → flat list of all spans -await store.list_traces(self, limit: 'int' = 50, offset: 'int' = 0) -> 'list[dict[str, Any]]' # → list of trace summaries -await store.save(self, span: 'ObserveSpan | LLMSpan') -> 'None' # persist a single span -await store.save_many(self, spans: 'list[ObserveSpan | LLMSpan]') -> 'None' # persist multiple spans - -# ObservationNode -node.to_text() # pretty-print span tree -node.find(name) # find a child span by name -node.children # list of child ObservationNode -node.span # the underlying span (ObserveSpan or LLMSpan) -``` - ---- - -## Eval Runner API - -### `assert_dataset_pass` - -```python -await assert_dataset_pass(runnable: 'Callable[..., Any]', dataset_name: 'str', evaluators: 'list[Callable[..., Any]]', *, dataset_dir: 'str | None' = None, passes: 'int' = 1, pass_criteria: 'Callable[[list[list[list[Evaluation]]]], tuple[bool, str]] | 
None' = None, from_trace: 'Callable[[list[ObservationNode]], Evaluable] | None' = None) -> 'None' -``` - -**Parameters:** - -- `runnable` — callable that takes `eval_input` and runs the app -- `dataset_name` — name of the dataset to load (NOT `dataset_path`) -- `evaluators` — list of evaluator instances -- `pass_criteria` — `ScoreThreshold(threshold=..., pct=...)` (NOT `thresholds`) -- `from_trace` — span selector: use `last_llm_call` or `root` -- `dataset_dir` — override dataset directory (default: reads from config) -- `passes` — number of times to run the full matrix (default: 1) - -### `ScoreThreshold` - -```python -ScoreThreshold(threshold: 'float' = 0.5, pct: 'float' = 1.0) -> None - -# threshold: minimum per-item score to count as passing (0.0–1.0) -# pct: fraction of items that must pass (0.0–1.0, default=1.0) -``` - -### Trace helpers - -```python -from pixie import last_llm_call, root - -# Pass one of these as the from_trace= argument: -from_trace=last_llm_call # extract eval data from the most recent LLM call span -from_trace=root # extract eval data from the root @observe span -``` - ---- - -## Evaluator catalog - -Import any evaluator directly from `pixie`: - -```python -from pixie import FactualityEval, ClosedQAEval, create_llm_evaluator -``` - -### Heuristic (no LLM required) - -| Evaluator | Signature | Use when | Needs `expected_output`? 
| -| --- | --- | --- | --- | -| `ExactMatchEval() -> 'AutoevalsAdapter'` | Output must exactly equal the expected string | **Yes** | -| `LevenshteinMatch() -> 'AutoevalsAdapter'` | Partial string similarity (edit distance) | **Yes** | -| `NumericDiffEval() -> 'AutoevalsAdapter'` | Normalised numeric difference | **Yes** | -| `JSONDiffEval(*, string_scorer: 'Any' = None) -> 'AutoevalsAdapter'` | Structural JSON comparison | **Yes** | -| `ValidJSONEval(*, schema: 'Any' = None) -> 'AutoevalsAdapter'` | Output is valid JSON (optionally matching a schema) | No | -| `ListContainsEval(*, pairwise_scorer: 'Any' = None, allow_extra_entities: 'bool' = False) -> 'AutoevalsAdapter'` | Output list contains expected items | **Yes** | - -### LLM-as-judge (require OpenAI key or compatible client) - -| Evaluator | Signature | Use when | Needs `expected_output`? | -| --- | --- | --- | --- | -| `FactualityEval(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter'` | Output is factually accurate vs reference | **Yes** | -| `ClosedQAEval(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter'` | Closed-book QA comparison | **Yes** | -| `SummaryEval(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter'` | Summarisation quality | **Yes** | -| `TranslationEval(*, language: 'str | None' = None, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter'` | Translation quality | **Yes** | -| `PossibleEval(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter'` | Output is feasible / plausible | No | -| `SecurityEval(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter'` | No security vulnerabilities in output | No | -| `ModerationEval(*, threshold: 'float | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter'` | Content moderation | No | -| `BattleEval(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter'` | Head-to-head comparison | **Yes** | -| 
`HumorEval(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter'` | Humor quality evaluation | **Yes** | -| `EmbeddingSimilarityEval(*, prefix: 'str | None' = None, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter'` | Embedding-based semantic similarity | **Yes** | - -### RAG / retrieval - -| Evaluator | Signature | Use when | Needs `expected_output`? | -| --- | --- | --- | --- | -| `ContextRelevancyEval(*, client: 'Any' = None) -> 'AutoevalsAdapter'` | Retrieved context is relevant to query | **Yes** | -| `FaithfulnessEval(*, client: 'Any' = None) -> 'AutoevalsAdapter'` | Answer is faithful to the provided context | No | -| `AnswerRelevancyEval(*, client: 'Any' = None) -> 'AutoevalsAdapter'` | Answer addresses the question (⚠️ requires `context` in trace — **RAG pipelines only**) | No | -| `AnswerCorrectnessEval(*, client: 'Any' = None) -> 'AutoevalsAdapter'` | Answer is correct vs reference | **Yes** | - -### Other evaluators - -| Evaluator | Signature | Needs `expected_output`? 
| -| --- | --- | --- | -| `SqlEval(*, model: 'str | None' = None, client: 'Any' = None) -> 'AutoevalsAdapter'` | No | - ---- - -## Custom evaluator — `create_llm_evaluator` factory - -```python -from pixie import create_llm_evaluator - -my_eval = create_llm_evaluator(name: 'str', prompt_template: 'str', *, model: 'str' = 'gpt-4o-mini', client: 'Any | None' = None) -> '_LLMEvaluator' -``` - -- Returns a callable satisfying the `Evaluator` protocol -- Template variables: `{eval_input}`, `{eval_output}`, `{expected_output}` — populated from `Evaluable` fields -- No nested field access — include any needed metadata in `eval_input` when building the dataset -- Score parsing extracts a 0–1 float from the LLM response - -### Custom evaluator — manual template - -```python -from pixie import Evaluation, Evaluable - -async def my_evaluator(evaluable: Evaluable, *, trace=None) -> Evaluation: - # evaluable.eval_input — what was passed to the observed function - # evaluable.eval_output — what the function returned - # evaluable.expected_output — reference answer (UNSET if not provided) - score = 1.0 if "expected pattern" in str(evaluable.eval_output) else 0.0 - return Evaluation(score=score, reasoning="...") -``` diff --git a/skills/eval-driven-dev/references/run-harness-patterns.md b/skills/eval-driven-dev/references/run-harness-patterns.md deleted file mode 100644 index ff995d32..00000000 --- a/skills/eval-driven-dev/references/run-harness-patterns.md +++ /dev/null @@ -1,281 +0,0 @@ -# Running the App from Its Entry Point — Examples by App Type - -This reference shows concrete examples of how to write the utility function from Step 3 — the function that runs the full application end-to-end with external dependencies mocked. Each example demonstrates what an "entry point" looks like for a different kind of application and how to invoke it. - -For `enable_storage()` and `observe` API details, see `references/pixie-api.md` (Instrumentation API section). 
- -## What entry point to use - -Look at how a real user or client invokes the app, and do the same thing in your utility function: - -| App type | Entry point example | How to invoke it | -| -------------------------------------------------- | ----------------------- | ---------------------------------------------------- | -| **Web server** (FastAPI, Flask) | HTTP/WebSocket endpoint | `TestClient`, `httpx`, or subprocess + HTTP requests | -| **CLI application** | Command-line invocation | `subprocess.run()` | -| **Standalone function** (no server, no middleware) | Python function | Import and call directly | - -**Do NOT call an inner function** like `agent.respond()` directly just because it's simpler. Between the entry point and that inner function, the app does request handling, state management, prompt assembly, routing — all of which is under test. When you call an inner function, you skip all of that and end up reimplementing it in your test. Now your test is testing test code, not app code. - -Mock only external dependencies (databases, speech services, third-party APIs) — everything you identified and planned in Step 1. - ---- - -## Example: FastAPI / Web Server with External Services - -**When your app is a web server** (FastAPI, Flask, etc.) with external service dependencies (Redis, Twilio, speech services, databases). **This is the most common case** — most production apps are web servers. - -**Approach**: Mock external dependencies, then drive the app through its HTTP/WebSocket interface. Two sub-approaches: - -- **Subprocess approach**: Launch the patched server as a subprocess, wait for health, then send HTTP/WebSocket requests with `httpx`. Best when the app has complex startup or uses `uvicorn.run()`. -- **In-process approach**: Use FastAPI's `TestClient` (or `httpx.AsyncClient` with `ASGITransport`) to drive the app in-process. Simpler — no subprocess management, no ports. Best when you can import the `app` object directly. 
- -Both approaches exercise the full request pipeline: routing → middleware → state management → business logic → response assembly. - -### Step 1: Identify pluggable interfaces and write mock backends - -Look for abstract base classes, protocols, or constructor-injected backends in the codebase. These are the app's testability seams — the places where external services can be swapped out. Create mock implementations that satisfy the interface but don't call external services. - -```python -# pixie_qa/scripts/mock_backends.py -from myapp.services.transcription import TranscriptionBackend -from myapp.services.voice_synthesis import SynthesisBackend - -class MockTranscriptionBackend(TranscriptionBackend): - """Decodes UTF-8 text instead of calling real STT service.""" - async def transcribe_chunk(self, audio_data: bytes) -> str | None: - try: - return audio_data.decode("utf-8") - except UnicodeDecodeError: - return None - -class MockSynthesisBackend(SynthesisBackend): - """Encodes text as bytes instead of calling real TTS service.""" - async def synthesize(self, text: str) -> bytes: - return text.encode("utf-8") -``` - -### Step 2: Write the patched server launcher - -Monkey-patch the app's module-level dependencies before starting the server: - -```python -# pixie_qa/scripts/demo_server.py -import uvicorn -from pixie_qa.scripts.mock_backends import ( - MockTranscriptionBackend, - MockSynthesisBackend, -) - -# Patch module-level backends BEFORE uvicorn imports the ASGI app -import myapp.app as the_app -the_app.transcription_backend = MockTranscriptionBackend() -the_app.synthesis_backend = MockSynthesisBackend() - -if __name__ == "__main__": - uvicorn.run(the_app.app, host="127.0.0.1", port=8000) -``` - -### Step 3: Write the utility function - -Launch the server subprocess, wait for health, send real requests, collect responses: - -```python -# pixie_qa/scripts/run_app.py -import subprocess -import sys -import time -import httpx - -BASE_URL = "http://127.0.0.1:8000" 
- -def wait_for_server(timeout: float = 30.0) -> None: - start = time.time() - while time.time() - start < timeout: - try: - resp = httpx.get(f"{BASE_URL}/health", timeout=2) - if resp.status_code == 200: - return - except httpx.ConnectError: - pass - time.sleep(0.5) - raise TimeoutError(f"Server did not start within {timeout}s") - -def main() -> None: - # Launch patched server - server = subprocess.Popen( - [sys.executable, "-m", "pixie_qa.scripts.demo_server"], - ) - try: - wait_for_server() - # Drive the app with real inputs - resp = httpx.post(f"{BASE_URL}/api/chat", json={ - "message": "What are your business hours?" - }) - print(resp.json()) - finally: - server.terminate() - server.wait() - -if __name__ == "__main__": - main() -``` - -**Run**: `uv run python -m pixie_qa.scripts.run_app` - -### Alternative: In-process with TestClient (simpler) - -If the app's `app` object can be imported directly, skip the subprocess and use FastAPI's `TestClient`: - -```python -# pixie_qa/scripts/run_app.py -from unittest.mock import patch -from fastapi.testclient import TestClient -from pixie import enable_storage, observe - -from pixie_qa.scripts.mock_backends import ( - MockTranscriptionBackend, - MockSynthesisBackend, -) - -@observe -def run_app(eval_input: dict) -> dict: - """Run the voice agent through its real FastAPI app layer.""" - enable_storage() - # Patch external dependencies before importing the app - with patch("myapp.app.transcription_backend", MockTranscriptionBackend()), \ - patch("myapp.app.synthesis_backend", MockSynthesisBackend()), \ - patch("myapp.app.call_state_store", MockCallStateStore()): - - from myapp.app import app - client = TestClient(app) - - # Drive through the real HTTP/WebSocket endpoints - resp = client.post("/api/chat", json={ - "message": eval_input["user_message"], - "call_sid": eval_input.get("call_sid", "test-call-001"), - }) - return {"response": resp.json()["response"]} -``` - -This approach is simpler (no subprocess, no port 
management) and equally valid. Both approaches exercise the full request pipeline. - -**Run**: `uv run python -m pixie_qa.scripts.run_app` - ---- - -## Example: CLI / Command-Line App - -**When your app is invoked from the command line** (e.g., `python -m myapp`, a CLI tool). - -**Approach**: Invoke the app's entry point via `subprocess.run()`, capture stdout/stderr, parse results. - -```python -# pixie_qa/scripts/run_app.py -import subprocess -import sys -import json - -def run_app(user_input: str) -> str: - """Run the CLI app with the given input and return its output.""" - result = subprocess.run( - [sys.executable, "-m", "myapp", "--query", user_input], - capture_output=True, - text=True, - timeout=120, - ) - if result.returncode != 0: - raise RuntimeError(f"App failed: {result.stderr}") - return result.stdout.strip() - -def main() -> None: - inputs = [ - "What are your business hours?", - "How do I reset my password?", - "Tell me about your return policy", - ] - for user_input in inputs: - output = run_app(user_input) - print(f"Input: {user_input}") - print(f"Output: {output}") - print("---") - -if __name__ == "__main__": - main() -``` - -If the CLI app needs external dependencies mocked, create a wrapper script that patches them before invoking the entry point: - -```python -# pixie_qa/scripts/patched_app.py -"""Entry point that patches DB/cache before running the real app.""" -import myapp.config as config -config.redis_url = "mock://localhost" # or use a mock implementation - -from myapp.main import main -main() -``` - -**Run**: `uv run python -m pixie_qa.scripts.run_app` - ---- - -## Example: Standalone Function (No Infrastructure) - -**When your app is a single function or module** with no server, no database, no external services. - -**Approach**: Import the function directly and call it. This is the simplest case. 
- -```python -# pixie_qa/scripts/run_app.py -from pixie import enable_storage, observe - -# Enable trace capture -enable_storage() - -from myapp.agent import answer_question - -@observe -def run_agent(question: str) -> str: - """Wrapper that captures traces for the agent call.""" - return answer_question(question) - -def main() -> None: - inputs = [ - "What are your business hours?", - "How do I reset my password?", - "Tell me about your return policy", - ] - for q in inputs: - result = run_agent(q) - print(f"Q: {q}") - print(f"A: {result}") - print("---") - -if __name__ == "__main__": - main() -``` - -If the function depends on something that needs mocking (e.g., a vector store client), patch it before calling: - -```python -from unittest.mock import MagicMock -import myapp.retriever as retriever - -# Mock the vector store with a simple keyword search -retriever.vector_client = MagicMock() -retriever.vector_client.search.return_value = [ - {"text": "Business hours: Mon-Fri 9am-5pm", "score": 0.95} -] -``` - -**Run**: `uv run python -m pixie_qa.scripts.run_app` - ---- - -## Key Rules - -1. **Always call through the real entry point** — the same way a real user or client would -2. **Mock only external dependencies** — the ones you identified in Step 1 -3. **Use `uv run python -m `** to run scripts — never `python ` -4. **Add `enable_storage()` and `@observe`** in the utility function so traces are captured -5. **After running, verify traces**: `uv run pixie trace list` then `uv run pixie trace show --verbose` diff --git a/skills/eval-driven-dev/references/testing-api.md b/skills/eval-driven-dev/references/testing-api.md new file mode 100644 index 00000000..29a091d7 --- /dev/null +++ b/skills/eval-driven-dev/references/testing-api.md @@ -0,0 +1,367 @@ +# Testing API Reference + +> Auto-generated from pixie source code docstrings. +> Do not edit by hand — regenerate from the upstream [pixie-qa](https://github.com/yiouli/pixie-qa) source repository. 
+
+pixie.evals — evaluation harness for LLM applications.
+
+Public API:
+
+- `Evaluation` — result dataclass for a single evaluator run
+- `Evaluator` — protocol for evaluation callables
+- `evaluate` — run one evaluator against one evaluable
+- `run_and_evaluate` — evaluate spans from a MemoryTraceHandler
+- `assert_pass` — batch evaluation with pass/fail criteria
+- `assert_dataset_pass` — load a dataset and run assert_pass
+- `EvalAssertionError` — raised when assert_pass fails
+- `capture_traces` — context manager for in-memory trace capture
+- `MemoryTraceHandler` — InstrumentationHandler that collects spans
+- `ScoreThreshold` — configurable pass criteria
+- `last_llm_call` / `root` — trace-to-evaluable helpers
+- `DatasetEntryResult` — evaluation results for a single dataset entry
+- `DatasetScorecard` — per-dataset scorecard with non-uniform evaluators
+- `generate_dataset_scorecard_html` — render a scorecard as HTML
+- `save_dataset_scorecard` — write scorecard HTML to disk
+
+Pre-made evaluators (autoevals adapters):
+
+- `AutoevalsAdapter` — generic wrapper for any autoevals `Scorer`
+- `LevenshteinMatch` — edit-distance string similarity
+- `ExactMatch` — exact value comparison
+- `NumericDiff` — normalised numeric difference
+- `JSONDiff` — structural JSON comparison
+- `ValidJSON` — JSON syntax / schema validation
+- `ListContains` — list overlap
+- `EmbeddingSimilarity` — embedding cosine similarity
+- `Factuality` — LLM factual accuracy check
+- `ClosedQA` — closed-book QA evaluation
+- `Battle` — head-to-head comparison
+- `Humor` — humor detection
+- `Security` — security vulnerability check
+- `Sql` — SQL equivalence
+- `Summary` — summarisation quality
+- `Translation` — translation quality
+- `Possible` — feasibility check
+- `Moderation` — content moderation
+- `ContextRelevancy` — RAGAS context relevancy
+- `Faithfulness` — RAGAS faithfulness
+- `AnswerRelevancy` — RAGAS answer relevancy
+- `AnswerCorrectness` — RAGAS answer correctness
+
+## Dataset JSON Format
+
+The dataset is a
JSON object with these top-level fields: + +```json +{ + "name": "customer-faq", + "runnable": "pixie_qa/scripts/run_app.py:AppRunnable", + "evaluators": ["Factuality"], + "entries": [ + { + "entry_kwargs": { "question": "Hello" }, + "description": "Basic greeting", + "eval_input": [{ "name": "input", "value": "Hello" }], + "expectation": "A friendly greeting that offers to help", + "evaluators": ["...", "ClosedQA"] + } + ] +} +``` + +### Entry structure + +All fields are top-level on each entry (flat structure — no nesting): + +``` +entry: + ├── entry_kwargs (required) — args for Runnable.run() + ├── eval_input (required) — list of {"name": ..., "value": ...} objects + ├── description (required) — human-readable label for the test case + ├── expectation (optional) — reference for comparison-based evaluators + ├── eval_metadata (optional) — extra per-entry data for custom evaluators + └── evaluators (optional) — evaluator names for THIS entry +``` + +### Field reference + +- `runnable` (required): `filepath:ClassName` reference to the `Runnable` + subclass that drives the app during evaluation. +- `evaluators` (dataset-level, optional): Default evaluator names — applied to + every entry that does not declare its own `evaluators`. +- `entries[].entry_kwargs` (required): Kwargs passed to `Runnable.run()` as a + Pydantic model. Keys must match the fields of the Pydantic model used in + `run(args: T)`. +- `entries[].description` (required): Human-readable label for the test case. +- `entries[].eval_input` (required): List of `{"name": ..., "value": ...}` + objects. Used to populate the wrap input registry — `wrap(purpose="input")` + calls in the app return registry values keyed by `name`. +- `entries[].expectation` (optional): Concise expectation description + for comparison-based evaluators. Should describe what a correct output looks + like, **not** copy the verbatim output. 
Use `pixie format` on the trace to + see the real output shape, then write a shorter description. +- `entries[].eval_metadata` (optional): Extra per-entry data for custom + evaluators — e.g., expected tool names, boolean flags, thresholds. Accessed in + evaluators as `evaluable.eval_metadata`. +- `entries[].evaluators` (optional): Row-level evaluator override. Rules: + - Omit → entry inherits dataset-level `evaluators`. + - `["...", "ClosedQA"]` → dataset defaults **plus** ClosedQA. + - `["OnlyThis"]` (no `"..."`) → **only** OnlyThis, no defaults. + +## Evaluator Name Resolution + +In dataset JSON, evaluator names are resolved as follows: + +- **Built-in names** (bare names like `"Factuality"`, `"ExactMatch"`) are + resolved to `pixie.{Name}` automatically. +- **Custom evaluators** use `filepath:callable_name` format + (e.g. `"pixie_qa/evaluators.py:my_evaluator"`). +- Custom evaluator references point to module-level callables — classes + (instantiated automatically), factory functions (called if zero-arg), + evaluator functions (used as-is), or pre-instantiated callables (e.g. + `create_llm_evaluator` results — used as-is). 
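The inheritance and override rules above can be sketched as plain Python. This is an illustration only; `resolve_evaluators` is a hypothetical helper, not part of the pixie API:

```python
from typing import Optional

def resolve_evaluators(
    dataset_defaults: list[str],
    entry_evaluators: Optional[list[str]] = None,
) -> list[str]:
    """Apply the row-level override rules from the dataset format:
    omitted -> inherit the dataset-level defaults;
    contains "..." -> defaults plus the extra names;
    otherwise -> exactly the listed names, no defaults."""
    if entry_evaluators is None:
        return list(dataset_defaults)
    if "..." in entry_evaluators:
        # "..." is a placeholder for the dataset-level defaults
        extras = [name for name in entry_evaluators if name != "..."]
        return list(dataset_defaults) + extras
    return list(entry_evaluators)

print(resolve_evaluators(["Factuality"], None))                 # → ['Factuality']
print(resolve_evaluators(["Factuality"], ["...", "ClosedQA"]))  # → ['Factuality', 'ClosedQA']
print(resolve_evaluators(["Factuality"], ["OnlyThis"]))         # → ['OnlyThis']
```

The `"..."` sentinel is what distinguishes "extend the defaults" from "replace the defaults entirely".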
+
+## CLI Commands
+
+| Command | Description |
+| ------------------------------------------- | ------------------------------------- |
+| `pixie test [path] [-v] [--no-open]` | Run eval tests on dataset files |
+| `pixie dataset create <name>` | Create a new empty dataset |
+| `pixie dataset list` | List all datasets |
+| `pixie dataset save <name> [--select MODE]` | Save a span to a dataset |
+| `pixie dataset validate [path]` | Validate dataset JSON files |
+| `pixie analyze <dataset>` | Generate analysis and recommendations |
+
+---
+
+## Types
+
+### `Evaluable`
+
+```python
+class Evaluable(TestCase):
+    eval_output: list[NamedData]  # wrap(purpose="output") + wrap(purpose="state") values
+    # Inherited from TestCase:
+    # eval_input: list[NamedData]  # from eval_input in dataset entry
+    # expectation: JsonValue | _Unset  # from expectation in dataset entry
+    # eval_metadata: dict[str, JsonValue] | None  # from eval_metadata in dataset entry
+    # description: str | None
+```
+
+Data carrier for evaluators. Extends `TestCase` with actual output.
+
+- `eval_input` — `list[NamedData]` populated from the entry's `eval_input` field. **Must have at least one item** (`min_length=1`).
+- `eval_output` — `list[NamedData]` containing ALL `wrap(purpose="output")` and `wrap(purpose="state")` values captured during the run. Each item has `.name` (str) and `.value` (JsonValue). Use `_get_output(evaluable, "name")` to look up by name.
+- `eval_metadata` — `dict[str, JsonValue] | None` from the entry's `eval_metadata` field.
+- `expected_output` — expectation text from the dataset entry. Defaults to `UNSET` (not provided); may be explicitly set to `None` to indicate "there is no expected output".
+
+### How `wrap()` maps to `Evaluable` fields at test time
+
+When `pixie test` runs a dataset entry, `wrap()` calls in the app populate the `Evaluable` that evaluators receive:
+
+| `wrap()` call in app code | Evaluable field | Type | How to access in evaluator |
+| ---------------------------------------- | ----------------- | ----------------- | ---------------------------------------------------- |
+| `wrap(data, purpose="input", name="X")` | `eval_input` | `list[NamedData]` | Pre-populated from `eval_input` in the dataset entry |
+| `wrap(data, purpose="output", name="X")` | `eval_output` | `list[NamedData]` | `_get_output(evaluable, "X")` — see helper below |
+| `wrap(data, purpose="state", name="X")` | `eval_output` | `list[NamedData]` | `_get_output(evaluable, "X")` — same list as output |
+| (from dataset entry `expectation`) | `expected_output` | `str \| None` | `evaluable.expected_output` |
+| (from dataset entry `eval_metadata`) | `eval_metadata` | `dict \| None` | `evaluable.eval_metadata` |
+
+**Key insight**: Both `purpose="output"` and `purpose="state"` wrap values end up in `eval_output` as `NamedData` items. There is no separate `captured_output` or `captured_state` dict. Use the helper function below to look up values by wrap name:
+
+```python
+def _get_output(evaluable: Evaluable, name: str) -> Any:
+    """Look up a wrap value by name from eval_output."""
+    for item in evaluable.eval_output:
+        if item.name == name:
+            return item.value
+    return None
+```
+
+**`eval_metadata`** is for passing extra per-entry data to evaluators that isn't an app input or output — e.g., expected tool names, boolean flags, thresholds. Defined as a top-level field on the entry, accessed as `evaluable.eval_metadata`.
+
+**Complete custom evaluator example** (tool call check + dataset entry):
+
+```python
+from typing import Any
+
+from pixie import Evaluation, Evaluable
+
+def _get_output(evaluable: Evaluable, name: str) -> Any:
+    """Look up a wrap value by name from eval_output."""
+    for item in evaluable.eval_output:
+        if item.name == name:
+            return item.value
+    return None
+
+def tool_call_check(evaluable: Evaluable, *, trace=None) -> Evaluation:
+    expected = evaluable.eval_metadata.get("expected_tool") if evaluable.eval_metadata else None
+    actual = _get_output(evaluable, "function_called")
+    if expected is None:
+        return Evaluation(score=1.0, reasoning="No expected_tool specified")
+    match = str(actual) == str(expected)
+    return Evaluation(
+        score=1.0 if match else 0.0,
+        reasoning=f"Expected {expected}, got {actual}",
+    )
+```
+
+Corresponding dataset entry:
+
+```json
+{
+  "entry_kwargs": { "user_message": "I want to end this call" },
+  "description": "User requests call end after failed verification",
+  "eval_input": [{ "name": "user_input", "value": "I want to end this call" }],
+  "expectation": "Agent should call endCall tool",
+  "eval_metadata": {
+    "expected_tool": "endCall",
+    "expected_call_ended": true
+  },
+  "evaluators": ["...", "pixie_qa/evaluators.py:tool_call_check"]
+}
+```
+
+### `Evaluation`
+
+```python
+Evaluation(score: 'float', reasoning: 'str', details: 'dict[str, Any]' = <factory>) -> None
+```
+
+The result of a single evaluator applied to a single test case.
+
+Attributes:
+score: Evaluation score between 0.0 and 1.0.
+reasoning: Human-readable explanation (required).
+details: Arbitrary JSON-serializable metadata.
+
+### `ScoreThreshold`
+
+```python
+ScoreThreshold(threshold: 'float' = 0.5, pct: 'float' = 1.0) -> None
+```
+
+Pass criteria: _pct_ fraction of inputs must score >= _threshold_ on all evaluators.
+
+Attributes:
+threshold: Minimum score an individual evaluation must reach.
+pct: Fraction of test-case inputs (0.0–1.0) that must pass.
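As a worked illustration of the default pass criteria — a re-implementation sketch of the documented rule, not the library code:

```python
def score_threshold_pass(results: list[list[float]], threshold: float = 0.5, pct: float = 1.0) -> tuple[bool, str]:
    # results mirrors the [eval_inputs][evaluators] score matrix;
    # an input passes only if every evaluator's score reaches the threshold
    passing = sum(1 for row in results if all(score >= threshold for score in row))
    fraction = passing / len(results) if results else 1.0
    return fraction >= pct, f"{passing}/{len(results)} inputs passed (need {pct:.0%})"
```

With the defaults (`threshold=0.5`, `pct=1.0`), a single evaluator score below 0.5 on any input fails the whole run.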
+
+## Eval Functions
+
+### `pixie.run_and_evaluate`
+
+```python
+pixie.run_and_evaluate(evaluator: 'Callable[..., Any]', runnable: 'Callable[..., Any]', eval_input: 'Any', *, expected_output: 'Any' = UNSET, from_trace: 'Callable[[list[ObservationNode]], Evaluable] | None' = None) -> 'Evaluation'
+```
+
+Run _runnable(eval_input)_ while capturing traces, then evaluate.
+
+Convenience wrapper combining `_run_and_capture` and `evaluate`.
+The runnable is called exactly once.
+
+Args:
+evaluator: An evaluator callable (sync or async).
+runnable: The application function to test.
+eval_input: The single input passed to _runnable_.
+expected_output: Optional expected value merged into the
+evaluable.
+from_trace: Optional callable to select a specific span from
+the trace tree for evaluation.
+
+Returns:
+The `Evaluation` result.
+
+Raises:
+ValueError: If no spans were captured during execution.
+
+### `pixie.assert_pass`
+
+```python
+pixie.assert_pass(runnable: 'Callable[..., Any]', eval_inputs: 'list[Any]', evaluators: 'list[Callable[..., Any]]', *, evaluables: 'list[Evaluable] | None' = None, pass_criteria: 'Callable[[list[list[Evaluation]]], tuple[bool, str]] | None' = None, from_trace: 'Callable[[list[ObservationNode]], Evaluable] | None' = None) -> 'None'
+```
+
+Run evaluators against a runnable over multiple inputs.
+
+For each input, runs the runnable once via `_run_and_capture`,
+then evaluates with every evaluator concurrently via
+`asyncio.gather`.
+
+The results matrix has shape `[eval_inputs][evaluators]`.
+If the pass criteria are not met, raises :class:`EvalAssertionError`
+carrying the matrix.
+
+When `evaluables` is provided, behaviour depends on whether each
+item already has `eval_output` populated:
+
+- **eval_output is None** — the `runnable` is called via
+  `run_and_evaluate` to produce an output from traces, and
+  `expected_output` from the evaluable is merged into the result.
+- **eval_output is not None** — the evaluable is used directly
+  (the runnable is not called for that item).
+
+Args:
+runnable: The application function to test.
+eval_inputs: List of inputs, each passed to _runnable_.
+evaluators: List of evaluator callables.
+evaluables: Optional list of `Evaluable` items, one per input.
+When provided, their `expected_output` is forwarded to
+`run_and_evaluate`. Must have the same length as
+_eval_inputs_.
+pass_criteria: Receives the results matrix, returns
+`(passed, message)`. Defaults to `ScoreThreshold()`.
+from_trace: Optional span selector forwarded to
+`run_and_evaluate`.
+
+Raises:
+EvalAssertionError: When pass criteria are not met.
+ValueError: When _evaluables_ length does not match _eval_inputs_.
+
+### `pixie.assert_dataset_pass`
+
+```python
+pixie.assert_dataset_pass(runnable: 'Callable[..., Any]', dataset_name: 'str', evaluators: 'list[Callable[..., Any]]', *, dataset_dir: 'str | None' = None, pass_criteria: 'Callable[[list[list[Evaluation]]], tuple[bool, str]] | None' = None, from_trace: 'Callable[[list[ObservationNode]], Evaluable] | None' = None) -> 'None'
+```
+
+Load a dataset by name, then run `assert_pass` with its items.
+
+This is a convenience wrapper that:
+
+1. Loads the dataset from the `DatasetStore`.
+2. Extracts `eval_input` from each item as the runnable inputs.
+3. Uses the full `Evaluable` items (which carry `expected_output`)
+   as the evaluables.
+4. Delegates to `assert_pass`.
+
+Args:
+runnable: The application function to test.
+dataset_name: Name of the dataset to load.
+evaluators: List of evaluator callables.
+dataset_dir: Override directory for the dataset store.
+When `None`, reads from `PixieConfig.dataset_dir`.
+pass_criteria: Receives the results matrix, returns
+`(passed, message)`.
+from_trace: Optional span selector forwarded to
+`assert_pass`.
+
+Raises:
+FileNotFoundError: If no dataset with _dataset_name_ exists.
+EvalAssertionError: When pass criteria are not met.
+ +## Trace Helpers + +### `pixie.last_llm_call` + +```python +pixie.last_llm_call(trace: 'list[ObservationNode]') -> 'Evaluable' +``` + +Find the `LLMSpan` with the latest `ended_at` in the trace tree. + +Args: +trace: The trace tree (list of root `ObservationNode` instances). + +Returns: +An `Evaluable` wrapping the most recently ended `LLMSpan`. + +Raises: +ValueError: If no `LLMSpan` exists in the trace. + +### `pixie.root` + +```python +pixie.root(trace: 'list[ObservationNode]') -> 'Evaluable' +``` + +Return the first root node's span as `Evaluable`. + +Args: +trace: The trace tree (list of root `ObservationNode` instances). + +Returns: +An `Evaluable` wrapping the first root node's span. + +Raises: +ValueError: If the trace is empty. + +### `pixie.capture_traces` + +```python +pixie.capture_traces() -> 'Generator[MemoryTraceHandler, None, None]' +``` + +Context manager that installs a `MemoryTraceHandler` and yields it. + +Calls `init()` (no-op if already initialised) then registers the +handler via `add_handler()`. On exit the handler is removed and +the delivery queue is flushed so that all spans are available on +`handler.spans`. diff --git a/skills/eval-driven-dev/references/understanding-app.md b/skills/eval-driven-dev/references/understanding-app.md deleted file mode 100644 index e7c8a47a..00000000 --- a/skills/eval-driven-dev/references/understanding-app.md +++ /dev/null @@ -1,201 +0,0 @@ -# Understanding the Application - -This reference covers Step 1 of the eval-driven-dev process in detail: how to read the codebase, map the data flows, and document your findings. - ---- - -## What to investigate - -Before touching any code, spend time actually reading the source. The code will tell you more than asking the user would. - -### 1. How the software runs - -What is the entry point? How do you start it? Is it a CLI, a server, a library function? What are the required arguments, config files, or environment variables? - -### 2. 
Find where the LLM provider client is called - -Locate every place in the codebase where an LLM provider client is invoked (e.g., `openai.ChatCompletion.create()`, `client.chat.completions.create()`, `anthropic.messages.create()`). These are the anchor points for your analysis. For each LLM call site, record: - -- The file and function where the call lives -- Which LLM provider/client is used -- The exact arguments being passed (model, messages, tools, etc.) - -### 3. Track backwards: external data dependencies flowing IN - -Starting from each LLM call site, trace **backwards** through the code to find every piece of data that feeds into the LLM prompt. Categorize each data source: - -**Application inputs** (from the user / caller): - -- User messages, queries, uploaded files -- Configuration or feature flags - -**External dependency data** (from systems outside the app): - -- Database lookups (conversation history from Redis, user profiles from Postgres, etc.) -- Retrieved context (RAG chunks from a vector DB, search results from an API) -- Cache reads -- Third-party API responses - -For each external data dependency, document: - -- What system it comes from -- What the data shape looks like (types, fields, structure) -- What realistic values look like -- Whether it requires real credentials or can be mocked - -**In-code data** (assembled by the application itself): - -- System prompts (hardcoded or templated) -- Tool definitions and function schemas -- Prompt-building logic that combines the above - -### 4. 
Track forwards: external side-effects flowing OUT - -Starting from each LLM call site, trace **forwards** through the code to find every side-effect the application causes in external systems based on the LLM's output: - -- Database writes (saving conversation history, updating records) -- API calls to third-party services (sending emails, creating calendar entries, initiating transfers) -- Messages sent to other systems (queues, webhooks, notifications) -- File system writes - -For each side-effect, document: - -- What system is affected -- What data is written/sent -- Whether this side-effect is something evaluations should verify (e.g., "did the agent route to the correct department?") - -### 5. Identify intermediate states to capture - -Along the paths between input and output, identify intermediate states that are necessary for proper evaluation but aren't visible in the final output: - -- Tool call decisions and results (which tools were called, what they returned) -- Agent routing / handoff decisions -- Intermediate LLM calls (e.g., summarization before final answer) -- Retrieval results (what context was fetched) -- Any branching logic that determines the code path - -These are things that evaluators will need to check criteria like "did the agent verify identity before transferring?" or "did it use the correct tool?" - -### 6. Use cases and expected behaviors - -What are the distinct things the app is supposed to handle? For each use case, what does a "good" response look like? What would constitute a failure? - ---- - -## Writing MEMORY.md - -Write your findings to `pixie_qa/MEMORY.md`. This is the primary working document for the eval effort. It should be human-readable and detailed enough that someone unfamiliar with the project can understand the application and the eval strategy. - -**MEMORY.md documents your understanding of the existing application code. 
It must NOT contain references to pixie commands, instrumentation code you plan to add, or scripts/functions that don't exist yet.** Those belong in later steps, only after they've been implemented. - -### Template - -```markdown -# Eval Notes: - -## How the application works - -### Entry point and execution flow - - - -### LLM call sites - - - -- Where it is in the code (file + function name) -- Which LLM provider/client is used -- What arguments are passed - -### External data dependencies (data flowing IN to LLM) - - - -- **System**: -- **What data**: -- **Data shape**: -- **Code path**: -- **Credentials needed**: - -### External side-effects (data flowing OUT from LLM output) - - - -- **System**: -- **What happens**: -- **Code path**: -- **Eval-relevant?**: - -### Pluggable/injectable interfaces (testability seams) - - - -- **Interface**: -- **Defined in**: -- **What it wraps**: -- **How it's injected**: -- **Mock strategy**: - -These are the primary testability seams. In Step 3, you'll write mock implementations of these interfaces. - -### Mocking plan summary - - - -| Dependency | Mock approach | What mock provides (IN) | What mock captures (OUT) | -| ------------------- | ------------------------------ | -------------------------------------- | ------------------------ | -| | | | | -| | | | | - -### Intermediate states to capture - - - -- -- Include code pointers (file:line) for each - -### Final output - - - -### Use cases - - - -1. : - - Input example: ... - - Good output: ... - - Bad output: ... - -## Evaluation plan - -### What to evaluate and why - - - -### Evaluators and criteria - - - -| Criterion | Evaluator | Dataset | Pass criteria | Rationale | -| --------- | --------- | ------- | ------------- | --------- | -| ... | ... | ... | ... | ... | - -### Data needed for evaluation - - - -## Datasets - -| Dataset | Items | Purpose | -| ------- | ----- | ------- | -| ... | ... | ... 
|
-
-## Investigation log
-
-### failure
-
-
-```
-
-If something is genuinely unclear from the code, ask the user — but most questions answer themselves once you've read the code carefully.
diff --git a/skills/eval-driven-dev/references/wrap-api.md b/skills/eval-driven-dev/references/wrap-api.md
new file mode 100644
index 00000000..574ffadd
--- /dev/null
+++ b/skills/eval-driven-dev/references/wrap-api.md
@@ -0,0 +1,255 @@
+# Wrap API Reference
+
+> Auto-generated from pixie source code docstrings.
+> Do not edit by hand — regenerate from the upstream [pixie-qa](https://github.com/yiouli/pixie-qa) source repository.
+
+`pixie.wrap` — data-oriented observation API.
+
+`wrap()` observes a data value or callable at a named point in the
+processing pipeline. Its behavior depends on the active mode:
+
+- **No-op** (tracing disabled, no eval registry): returns `data` unchanged.
+- **Tracing** (during `pixie trace`): writes to the trace file and emits an
+  OTel event (via span event if a span is active, or via OTel logger
+  otherwise) and returns `data` unchanged (or wraps a callable so the
+  event fires on call).
+- **Eval** (eval registry active): injects dependency data for
+  `purpose="input"`, captures output/state for `purpose="output"`/
+  `purpose="state"`.
+
+---
+
+## CLI Commands
+
+| Command | Description |
+| ----------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
+| `pixie trace --runnable <module:Class> --input <input.json> --output <trace.jsonl>` | Run the Runnable once with kwargs from the JSON file and write a trace file. `--input` is a **file path** (not inline JSON). |
+| `pixie format <trace.jsonl>` | Convert a trace file to a formatted dataset entry template. Shows `entry_kwargs`, `eval_input`, and `eval_output` (the real captured output). |
| `pixie trace filter <trace.jsonl> --purpose input` | Print only wrap events matching the given purposes. Outputs one JSON line per matching event. |
+
+---
+
+## Classes
+
+### `pixie.Runnable`
+
+```python
+class pixie.Runnable(Protocol[T]):
+    @classmethod
+    def create(cls) -> Runnable[Any]: ...
+    async def setup(self) -> None: ...
+    async def run(self, args: T) -> None: ...
+    async def teardown(self) -> None: ...
+```
+
+Protocol for structured runnables used by the dataset runner. `T` is a
+`pydantic.BaseModel` subclass whose fields match the `entry_kwargs` keys
+in the dataset JSON.
+
+Lifecycle:
+
+1. `create()` — class method to construct and return a runnable instance.
+2. `setup()` — **async**, called **once** before the first `run()` call.
+   Initialize shared resources here (e.g., `TestClient`, database connections).
+   Optional — has a default no-op implementation.
+3. `run(args)` — **async**, called **concurrently for each dataset entry**
+   (up to 4 entries in parallel). `args` is a validated Pydantic model
+   built from `entry_kwargs`. Invoke the application's real entry point.
+4. `teardown()` — **async**, called **once** after the last `run()` call.
+   Release any resources acquired in `setup()`.
+   Optional — has a default no-op implementation.
+
+**Concurrency**: `run()` is called concurrently via `asyncio.gather`. Your
+implementation **must be concurrency-safe**.
If it uses shared mutable state
+(e.g., a SQLite connection, an in-memory cache, a file handle), protect it
+with `asyncio.Semaphore` or `asyncio.Lock`:
+
+```python
+import asyncio
+
+import pixie
+
+# AppArgs (the pydantic args model) and call_app (the app entry point)
+# are defined elsewhere, as in the full example below.
+class AppRunnable(pixie.Runnable[AppArgs]):
+    _sem: asyncio.Semaphore
+
+    @classmethod
+    def create(cls) -> AppRunnable:
+        inst = cls()
+        inst._sem = asyncio.Semaphore(1)  # serialise DB access
+        return inst
+
+    async def run(self, args: AppArgs) -> None:
+        async with self._sem:
+            await call_app(args.message)
+```
+
+Common concurrency pitfalls:
+
+- **SQLite**: not safe for concurrent writes — use `Semaphore(1)` or `aiosqlite` with WAL mode.
+- **Global mutable state**: module-level dicts/lists modified in `run()` need protection.
+- **Rate-limited APIs**: add a semaphore to avoid 429 errors.
+
+**Import resolution**: The project root directory (where `pixie test` / `pixie trace`
+is invoked) is automatically added to `sys.path` before loading runnables and
+evaluators. This means your runnable can use normal `import` statements to
+reference project modules (e.g., `from app import service`).
+ +**Example**: + +```python +# pixie_qa/scripts/run_app.py +from __future__ import annotations +from pydantic import BaseModel +import pixie + +class AppArgs(BaseModel): + user_message: str + +class AppRunnable(pixie.Runnable[AppArgs]): + @classmethod + def create(cls) -> AppRunnable: + return cls() + + async def run(self, args: AppArgs) -> None: + from myapp import handle_request + await handle_request(args.user_message) +``` + +**Web server example** (using an async HTTP client): + +```python +import httpx +from pydantic import BaseModel +import pixie + +class AppArgs(BaseModel): + user_message: str + +class AppRunnable(pixie.Runnable[AppArgs]): + _client: httpx.AsyncClient + + @classmethod + def create(cls) -> AppRunnable: + return cls() + + async def setup(self) -> None: + self._client = httpx.AsyncClient(base_url="http://localhost:8000") + + async def run(self, args: AppArgs) -> None: + await self._client.post("/chat", json={"message": args.user_message}) + + async def teardown(self) -> None: + await self._client.aclose() +``` + +--- + +## Functions + +### `pixie.wrap` + +```python +pixie.wrap(data: 'T', *, purpose: "Literal['input', 'output', 'state']", name: 'str', description: 'str | None' = None) -> 'T' +``` + +Observe a data value or data-provider callable at a point in the processing pipeline. + +`data` can be either a plain value or a callable that produces a value. +In both cases the return type is `T` — the caller gets back exactly the +same type it passed in when in no-op or tracing modes. + +In eval mode with `purpose="input"`, the returned value (or callable) is +replaced with the deserialized registry value. When `data` is callable +the returned wrapper ignores the original function and returns the injected +value on every call; in all other modes the returned callable wraps the +original and adds tracing or capture behaviour. + +Args: +data: A data value or a data-provider callable. 
+purpose: Classification of the data point:
+- "input": data from external dependencies (DB records, API responses)
+- "output": data going out to external systems or users
+- "state": intermediate state for evaluation (routing decisions, etc.)
+name: Unique identifier for this data point. Used as the key in the
+eval registry and in trace logs.
+description: Optional human-readable description of what this data is.
+
+Returns:
+The original data unchanged (tracing / no-op modes), or the
+registry value (eval mode with purpose="input"). When `data`
+is callable the return value is also callable.
+
+---
+
+## Error Types
+
+### `WrapRegistryMissError`
+
+```python
+WrapRegistryMissError(name: 'str') -> 'None'
+```
+
+Raised when a wrap(purpose="input") name is not found in the eval registry.
+
+### `WrapTypeMismatchError`
+
+```python
+WrapTypeMismatchError(name: 'str', expected_type: 'type', actual_type: 'type') -> 'None'
+```
+
+Raised when deserialized registry value doesn't match expected type.
+
+---
+
+## Trace File Utilities
+
+Pydantic model for wrap log entries and JSONL loading utilities.
+
+`WrapLogEntry` is the typed representation of a single `wrap()` event
+as recorded in a JSONL trace file. Multiple places in the codebase load
+these objects — the `pixie trace filter` CLI, the dataset loader, and
+the verification scripts — so they share this single model.
+
+### `pixie.WrapLogEntry`
+
+```python
+pixie.WrapLogEntry(*, type: str = 'wrap', name: str, purpose: str, data: Any, description: str | None = None, trace_id: str | None = None, span_id: str | None = None) -> None
+```
+
+A single wrap() event as logged to a JSONL trace file.
+
+Attributes:
+type: Always `"wrap"` for wrap events.
+name: The wrap point name (matches `wrap(name=...)`).
+purpose: One of `"input"`, `"output"`, `"state"`.
+data: The serialized data (jsonpickle string).
+description: Optional human-readable description.
+trace_id: OTel trace ID (if available).
+span_id: OTel span ID (if available). + +### `pixie.load_wrap_log_entries` + +```python +pixie.load_wrap_log_entries(jsonl_path: 'str | Path') -> 'list[WrapLogEntry]' +``` + +Load all wrap log entries from a JSONL file. + +Skips non-wrap lines (e.g. `type=llm_span`) and malformed lines. + +Args: +jsonl_path: Path to a JSONL trace file. + +Returns: +List of :class:`WrapLogEntry` objects. + +### `pixie.filter_by_purpose` + +```python +pixie.filter_by_purpose(entries: 'list[WrapLogEntry]', purposes: 'set[str]') -> 'list[WrapLogEntry]' +``` + +Filter wrap log entries by purpose. + +Args: +entries: List of wrap log entries. +purposes: Set of purpose values to include. + +Returns: +Filtered list. diff --git a/skills/eval-driven-dev/resources/setup.sh b/skills/eval-driven-dev/resources/setup.sh new file mode 100755 index 00000000..57236678 --- /dev/null +++ b/skills/eval-driven-dev/resources/setup.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +# Setup script for eval-driven-dev skill. +# Updates the skill, installs/upgrades pixie-qa[all], initializes the +# pixie working directory, and starts the web UI server in the background. +# Failures are non-fatal — the workflow continues even if a step here is +# blocked by the environment. 
+set -u + +echo "=== Updating skill ===" +npx skills update || echo "(skill update skipped)" + +echo "" +echo "=== Installing / upgrading pixie-qa[all] ===" +if [ -f uv.lock ]; then + uv add "pixie-qa[all]>=0.6.1,<0.7.0" --upgrade +elif [ -f poetry.lock ]; then + poetry add "pixie-qa[all]>=0.6.1,<0.7.0" +else + pip install --upgrade "pixie-qa[all]>=0.6.1,<0.7.0" +fi + +echo "" +echo "=== Initializing pixie working directory ===" +if [ -f uv.lock ]; then + uv run pixie init +elif [ -f poetry.lock ]; then + poetry run pixie init +else + pixie init +fi + +echo "" +echo "=== Starting web UI server (background) ===" +if [ -f uv.lock ]; then + uv run pixie start +elif [ -f poetry.lock ]; then + poetry run pixie start +else + pixie start +fi + +echo "" +echo "=== Setup complete ==="