chore: publish from staged

This commit is contained in:
github-actions[bot]
2026-04-09 06:26:21 +00:00
parent 017f31f495
commit a68b190031
467 changed files with 97527 additions and 276 deletions

View File

@@ -18,8 +18,8 @@
"instrumentation"
],
"skills": [
"./skills/phoenix-cli/",
"./skills/phoenix-evals/",
"./skills/phoenix-tracing/"
"./skills/phoenix-cli",
"./skills/phoenix-evals",
"./skills/phoenix-tracing"
]
}

View File

@@ -0,0 +1,162 @@
---
name: phoenix-cli
description: Debug LLM applications using the Phoenix CLI. Fetch traces, analyze errors, review experiments, inspect datasets, and query the GraphQL API. Use when debugging AI/LLM applications, analyzing trace data, working with Phoenix observability, or investigating LLM performance issues.
license: Apache-2.0
compatibility: Requires Node.js (for npx) or global install of @arizeai/phoenix-cli. Optionally requires jq for JSON processing.
metadata:
author: arize-ai
version: "2.0.0"
---
# Phoenix CLI
## Invocation
```bash
px <resource> <action> # if installed globally
npx @arizeai/phoenix-cli <resource> <action> # no install required
```
The CLI uses singular resource commands with subcommands like `list` and `get`:
```bash
px trace list
px trace get <trace-id>
px span list
px dataset list
px dataset get <name>
```
## Setup
```bash
export PHOENIX_HOST=http://localhost:6006
export PHOENIX_PROJECT=my-project
export PHOENIX_API_KEY=your-api-key # if auth is enabled
```
Always use `--format raw --no-progress` when piping to `jq`.
## Traces
```bash
px trace list --limit 20 --format raw --no-progress | jq .
px trace list --last-n-minutes 60 --limit 20 --format raw --no-progress | jq '.[] | select(.status == "ERROR")'
px trace list --format raw --no-progress | jq 'sort_by(-.duration) | .[0:5]'
px trace get <trace-id> --format raw --no-progress | jq .
px trace get <trace-id> --format raw --no-progress | jq '.spans[] | select(.status_code != "OK")'
```
## Spans
```bash
px span list --limit 20 # recent spans (table view)
px span list --last-n-minutes 60 --limit 50 # spans from last hour
px span list --span-kind LLM --limit 10 # only LLM spans
px span list --status-code ERROR --limit 20 # only errored spans
px span list --name chat_completion --limit 10 # filter by span name
px span list --trace-id <id> --format raw --no-progress | jq . # all spans for a trace
px span list --include-annotations --limit 10 # include annotation scores
px span list output.json --limit 100 # save to JSON file
px span list --format raw --no-progress | jq '.[] | select(.status_code == "ERROR")'
```
### Span JSON shape
```
Span
name, span_kind ("LLM"|"CHAIN"|"TOOL"|"RETRIEVER"|"EMBEDDING"|"AGENT"|"RERANKER"|"GUARDRAIL"|"EVALUATOR"|"UNKNOWN")
status_code ("OK"|"ERROR"|"UNSET"), status_message
context.span_id, context.trace_id, parent_id
start_time, end_time
attributes (same as trace span attributes listed under "Trace JSON shape" below)
annotations[] (with --include-annotations)
name, result { score, label, explanation }
```
### Trace JSON shape
```
Trace
traceId, status ("OK"|"ERROR"), duration (ms), startTime, endTime
rootSpan — top-level span (parent_id: null)
spans[]
name, span_kind ("LLM"|"CHAIN"|"TOOL"|"RETRIEVER"|"EMBEDDING"|"AGENT")
status_code ("OK"|"ERROR"), parent_id, context.span_id
attributes
input.value, output.value — raw input/output
llm.model_name, llm.provider
llm.token_count.prompt/completion/total
llm.token_count.prompt_details.cache_read
llm.token_count.completion_details.reasoning
llm.input_messages.{N}.message.role/content
llm.output_messages.{N}.message.role/content
llm.invocation_parameters — JSON string (temperature, etc.)
exception.message — set if span errored
```
## Sessions
```bash
px session list --limit 10 --format raw --no-progress | jq .
px session list --order asc --format raw --no-progress | jq '.[].session_id'
px session get <session-id> --format raw --no-progress | jq .
px session get <session-id> --include-annotations --format raw --no-progress | jq '.annotations'
```
### Session JSON shape
```
SessionData
id, session_id, project_id
start_time, end_time
traces[]
id, trace_id, start_time, end_time
SessionAnnotation (with --include-annotations)
id, name, annotator_kind ("LLM"|"CODE"|"HUMAN"), session_id
result { label, score, explanation }
metadata, identifier, source, created_at, updated_at
```
## Datasets / Experiments / Prompts
```bash
px dataset list --format raw --no-progress | jq '.[].name'
px dataset get <name> --format raw --no-progress | jq '.examples[] | {input, output: .expected_output}'
px experiment list --dataset <name> --format raw --no-progress | jq '.[] | {id, name, failed_run_count}'
px experiment get <id> --format raw --no-progress | jq '.[] | select(.error != null) | {input, error}'
px prompt list --format raw --no-progress | jq '.[].name'
px prompt get <name> --format text --no-progress # plain text, ideal for piping to AI
```
## GraphQL
For ad-hoc queries not covered by the commands above. Output is `{"data": {...}}`.
```bash
px api graphql '{ projectCount datasetCount promptCount evaluatorCount }'
px api graphql '{ projects { edges { node { name traceCount tokenCountTotal } } } }' | jq '.data.projects.edges[].node'
px api graphql '{ datasets { edges { node { name exampleCount experimentCount } } } }' | jq '.data.datasets.edges[].node'
px api graphql '{ evaluators { edges { node { name kind } } } }' | jq '.data.evaluators.edges[].node'
# Introspect any type
px api graphql '{ __type(name: "Project") { fields { name type { name } } } }' | jq '.data.__type.fields[]'
```
Key root fields: `projects`, `datasets`, `prompts`, `evaluators`, `projectCount`, `datasetCount`, `promptCount`, `evaluatorCount`, `viewer`.
## Docs
Download Phoenix documentation markdown for local use by coding agents.
```bash
px docs fetch # fetch default workflow docs to .px/docs
px docs fetch --workflow tracing # fetch only tracing docs
px docs fetch --workflow tracing --workflow evaluation
px docs fetch --dry-run # preview what would be downloaded
px docs fetch --refresh # clear .px/docs and re-download
px docs fetch --output-dir ./my-docs # custom output directory
```
Key options: `--workflow` (repeatable, values: `tracing`, `evaluation`, `datasets`, `prompts`, `integrations`, `sdk`, `self-hosting`, `all`), `--dry-run`, `--refresh`, `--output-dir` (default `.px/docs`), `--workers` (default 10).

View File

@@ -0,0 +1,72 @@
---
name: phoenix-evals
description: Build and run evaluators for AI/LLM applications using Phoenix.
license: Apache-2.0
compatibility: Requires Phoenix server. Python skills need phoenix and openai packages; TypeScript skills need @arizeai/phoenix-client.
metadata:
author: oss@arize.com
version: "1.0.0"
languages: "Python, TypeScript"
---
# Phoenix Evals
Build evaluators for AI/LLM applications. Code first, LLM for nuance, validate against humans.
## Quick Reference
| Task | Files |
| ---- | ----- |
| Setup | [setup-python](references/setup-python.md), [setup-typescript](references/setup-typescript.md) |
| Decide what to evaluate | [evaluators-overview](references/evaluators-overview.md) |
| Choose a judge model | [fundamentals-model-selection](references/fundamentals-model-selection.md) |
| Use pre-built evaluators | [evaluators-pre-built](references/evaluators-pre-built.md) |
| Build code evaluator | [evaluators-code-python](references/evaluators-code-python.md), [evaluators-code-typescript](references/evaluators-code-typescript.md) |
| Build LLM evaluator | [evaluators-llm-python](references/evaluators-llm-python.md), [evaluators-llm-typescript](references/evaluators-llm-typescript.md), [evaluators-custom-templates](references/evaluators-custom-templates.md) |
| Batch evaluate DataFrame | [evaluate-dataframe-python](references/evaluate-dataframe-python.md) |
| Run experiment | [experiments-running-python](references/experiments-running-python.md), [experiments-running-typescript](references/experiments-running-typescript.md) |
| Create dataset | [experiments-datasets-python](references/experiments-datasets-python.md), [experiments-datasets-typescript](references/experiments-datasets-typescript.md) |
| Generate synthetic data | [experiments-synthetic-python](references/experiments-synthetic-python.md), [experiments-synthetic-typescript](references/experiments-synthetic-typescript.md) |
| Validate evaluator accuracy | [validation](references/validation.md), [validation-evaluators-python](references/validation-evaluators-python.md), [validation-evaluators-typescript](references/validation-evaluators-typescript.md) |
| Sample traces for review | [observe-sampling-python](references/observe-sampling-python.md), [observe-sampling-typescript](references/observe-sampling-typescript.md) |
| Analyze errors | [error-analysis](references/error-analysis.md), [error-analysis-multi-turn](references/error-analysis-multi-turn.md), [axial-coding](references/axial-coding.md) |
| RAG evals | [evaluators-rag](references/evaluators-rag.md) |
| Avoid common mistakes | [common-mistakes-python](references/common-mistakes-python.md), [fundamentals-anti-patterns](references/fundamentals-anti-patterns.md) |
| Production | [production-overview](references/production-overview.md), [production-guardrails](references/production-guardrails.md), [production-continuous](references/production-continuous.md) |
## Workflows
**Starting Fresh:**
[observe-tracing-setup](references/observe-tracing-setup.md) → [error-analysis](references/error-analysis.md) → [axial-coding](references/axial-coding.md) → [evaluators-overview](references/evaluators-overview.md)
**Building Evaluator:**
[fundamentals](references/fundamentals.md) → [common-mistakes-python](references/common-mistakes-python.md) → evaluators-{code|llm}-{python|typescript} → validation-evaluators-{python|typescript}
**RAG Systems:**
[evaluators-rag](references/evaluators-rag.md) → evaluators-code-* (retrieval) → evaluators-llm-* (faithfulness)
**Production:**
[production-overview](references/production-overview.md) → [production-guardrails](references/production-guardrails.md) → [production-continuous](references/production-continuous.md)
## Reference Categories
| Prefix | Description |
| ------ | ----------- |
| `fundamentals-*` | Types, scores, anti-patterns |
| `observe-*` | Tracing, sampling |
| `error-analysis-*` | Finding failures |
| `axial-coding-*` | Categorizing failures |
| `evaluators-*` | Code, LLM, RAG evaluators |
| `experiments-*` | Datasets, running experiments |
| `validation-*` | Validating evaluator accuracy against human labels |
| `production-*` | CI/CD, monitoring |
## Key Principles
| Principle | Action |
| --------- | ------ |
| Error analysis first | Can't automate what you haven't observed |
| Custom > generic | Build from your failures |
| Code first | Deterministic before LLM |
| Validate judges | >80% TPR/TNR |
| Binary > Likert | Pass/fail, not 1-5 |

View File

@@ -0,0 +1,95 @@
# Axial Coding
Group open-ended notes into structured failure taxonomies.
## Process
1. **Gather** - Collect open coding notes
2. **Pattern** - Group notes with common themes
3. **Name** - Create actionable category names
4. **Quantify** - Count failures per category
## Example Taxonomy
```yaml
failure_taxonomy:
content_quality:
hallucination: [invented_facts, fictional_citations]
incompleteness: [partial_answer, missing_key_info]
inaccuracy: [wrong_numbers, wrong_dates]
communication:
tone_mismatch: [too_casual, too_formal]
clarity: [ambiguous, jargon_heavy]
context:
user_context: [ignored_preferences, misunderstood_intent]
retrieved_context: [ignored_documents, wrong_context]
safety:
missing_disclaimers: [legal, medical, financial]
```
## Add Annotation (Python)
```python
from phoenix.client import Client
client = Client()
client.spans.add_span_annotation(
span_id="abc123",
annotation_name="failure_category",
label="hallucination",
explanation="invented a feature that doesn't exist",
annotator_kind="HUMAN",
sync=True,
)
```
## Add Annotation (TypeScript)
```typescript
import { addSpanAnnotation } from "@arizeai/phoenix-client/spans";
await addSpanAnnotation({
spanAnnotation: {
spanId: "abc123",
name: "failure_category",
label: "hallucination",
explanation: "invented a feature that doesn't exist",
annotatorKind: "HUMAN",
}
});
```
## Agent Failure Taxonomy
```yaml
agent_failures:
planning: [wrong_plan, incomplete_plan]
tool_selection: [wrong_tool, missed_tool, unnecessary_call]
tool_execution: [wrong_parameters, type_error]
state_management: [lost_context, stuck_in_loop]
error_recovery: [no_fallback, wrong_fallback]
```
## Transition Matrix (Agents)
Shows where failures occur between states:
```python
def build_transition_matrix(conversations, states):
matrix = defaultdict(lambda: defaultdict(int))
for conv in conversations:
if conv["failed"]:
last_success = find_last_success(conv)
first_failure = find_first_failure(conv)
matrix[last_success][first_failure] += 1
return pd.DataFrame(matrix).fillna(0)
```
## Principles
- **MECE** - Categories are mutually exclusive and collectively exhaustive: each failure fits exactly ONE category
- **Actionable** - Categories suggest fixes
- **Bottom-up** - Let categories emerge from data

View File

@@ -0,0 +1,225 @@
# Common Mistakes (Python)
Patterns that LLMs frequently generate incorrectly from training data.
## Legacy Model Classes
```python
# WRONG
from phoenix.evals import OpenAIModel, AnthropicModel
model = OpenAIModel(model="gpt-4")
# RIGHT
from phoenix.evals import LLM
llm = LLM(provider="openai", model="gpt-4o")
```
**Why**: `OpenAIModel`, `AnthropicModel`, etc. are legacy 1.0 wrappers in `phoenix.evals.legacy`.
The `LLM` class is provider-agnostic and is the current 2.0 API.
## Using run_evals Instead of evaluate_dataframe
```python
# WRONG — legacy 1.0 API
from phoenix.evals import run_evals
results = run_evals(dataframe=df, evaluators=[eval1], provide_explanation=True)
# Returns list of DataFrames
# RIGHT — current 2.0 API
from phoenix.evals import evaluate_dataframe
results_df = evaluate_dataframe(dataframe=df, evaluators=[eval1])
# Returns single DataFrame with {name}_score dict columns
```
**Why**: `run_evals` is the legacy 1.0 batch function. `evaluate_dataframe` is the current
2.0 function with a different return format.
## Wrong Result Column Names
```python
# WRONG — column doesn't exist
score = results_df["relevance"].mean()
# WRONG — column exists but contains dicts, not numbers
score = results_df["relevance_score"].mean()
# RIGHT — extract numeric score from dict
scores = results_df["relevance_score"].apply(
lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0
)
score = scores.mean()
```
**Why**: `evaluate_dataframe` returns columns named `{name}_score` containing Score dicts
like `{"name": "...", "score": 1.0, "label": "...", "explanation": "..."}`.
## Deprecated project_name Parameter
```python
# WRONG
df = client.spans.get_spans_dataframe(project_name="my-project")
# RIGHT
df = client.spans.get_spans_dataframe(project_identifier="my-project")
```
**Why**: `project_name` is deprecated in favor of `project_identifier`, which also
accepts project IDs.
## Wrong Client Constructor
```python
# WRONG
client = Client(endpoint="https://app.phoenix.arize.com")
client = Client(url="https://app.phoenix.arize.com")
# RIGHT — for remote/cloud Phoenix
client = Client(base_url="https://app.phoenix.arize.com", api_key="...")
# ALSO RIGHT — for local Phoenix (falls back to env vars or localhost:6006)
client = Client()
```
**Why**: The parameter is `base_url`, not `endpoint` or `url`. For local instances,
`Client()` with no args works fine. For remote instances, `base_url` and `api_key` are required.
## Too-Aggressive Time Filters
```python
# WRONG — often returns zero spans
from datetime import datetime, timedelta
df = client.spans.get_spans_dataframe(
project_identifier="my-project",
start_time=datetime.now() - timedelta(hours=1),
)
# RIGHT — use limit to control result size instead
df = client.spans.get_spans_dataframe(
project_identifier="my-project",
limit=50,
)
```
**Why**: Traces may be from any time period. A 1-hour window frequently returns
nothing. Use `limit=` to control result size instead.
## Not Filtering Spans Appropriately
```python
# WRONG — fetches all spans including internal LLM calls, retrievers, etc.
df = client.spans.get_spans_dataframe(project_identifier="my-project")
# RIGHT for end-to-end evaluation — filter to top-level spans
df = client.spans.get_spans_dataframe(
project_identifier="my-project",
root_spans_only=True,
)
# RIGHT for RAG evaluation — fetch child spans for retriever/LLM metrics
all_spans = client.spans.get_spans_dataframe(
project_identifier="my-project",
)
retriever_spans = all_spans[all_spans["span_kind"] == "RETRIEVER"]
llm_spans = all_spans[all_spans["span_kind"] == "LLM"]
```
**Why**: For end-to-end evaluation (e.g., overall answer quality), use `root_spans_only=True`.
For RAG systems, you often need child spans separately — retriever spans for
DocumentRelevance and LLM spans for Faithfulness. Choose the right span level
for your evaluation target.
## Assuming Span Output is Plain Text
```python
# WRONG — output may be JSON, not plain text
df["output"] = df["attributes.output.value"]
# RIGHT — parse JSON and extract the answer field
import json
def extract_answer(output_value):
if not isinstance(output_value, str):
return str(output_value) if output_value is not None else ""
try:
parsed = json.loads(output_value)
if isinstance(parsed, dict):
for key in ("answer", "result", "output", "response"):
if key in parsed:
return str(parsed[key])
except (json.JSONDecodeError, TypeError):
pass
return output_value
df["output"] = df["attributes.output.value"].apply(extract_answer)
```
**Why**: LangChain and other frameworks often output structured JSON from root spans,
like `{"context": "...", "question": "...", "answer": "..."}`. Evaluators need
the actual answer text, not the raw JSON.
## Using @create_evaluator for LLM-Based Evaluation
```python
# WRONG — @create_evaluator doesn't call an LLM
@create_evaluator(name="relevance", kind="llm")
def relevance(input: str, output: str) -> str:
pass # No LLM is involved
# RIGHT — use ClassificationEvaluator for LLM-based evaluation
from phoenix.evals import ClassificationEvaluator, LLM
relevance = ClassificationEvaluator(
name="relevance",
prompt_template="Is this relevant?\n{{input}}\n{{output}}\nAnswer:",
llm=LLM(provider="openai", model="gpt-4o"),
choices={"relevant": 1.0, "irrelevant": 0.0},
)
```
**Why**: `@create_evaluator` wraps a plain Python function. Setting `kind="llm"`
marks it as LLM-based but you must implement the LLM call yourself.
For LLM-based evaluation, prefer `ClassificationEvaluator` which handles
the LLM call, structured output parsing, and explanations automatically.
## Using llm_classify Instead of ClassificationEvaluator
```python
# WRONG — legacy 1.0 API
from phoenix.evals import llm_classify
results = llm_classify(
dataframe=df,
template=template_str,
model=model,
rails=["relevant", "irrelevant"],
)
# RIGHT — current 2.0 API
from phoenix.evals import ClassificationEvaluator, async_evaluate_dataframe, LLM
classifier = ClassificationEvaluator(
name="relevance",
prompt_template=template_str,
llm=LLM(provider="openai", model="gpt-4o"),
choices={"relevant": 1.0, "irrelevant": 0.0},
)
results_df = await async_evaluate_dataframe(dataframe=df, evaluators=[classifier])
```
**Why**: `llm_classify` is the legacy 1.0 function. The current pattern is to create
an evaluator with `ClassificationEvaluator` and run it with `async_evaluate_dataframe()`.
## Using HallucinationEvaluator
```python
# WRONG — deprecated
from phoenix.evals import HallucinationEvaluator
eval = HallucinationEvaluator(model)
# RIGHT — use FaithfulnessEvaluator
from phoenix.evals.metrics import FaithfulnessEvaluator
from phoenix.evals import LLM
eval = FaithfulnessEvaluator(llm=LLM(provider="openai", model="gpt-4o"))
```
**Why**: `HallucinationEvaluator` is deprecated. `FaithfulnessEvaluator` is its replacement,
using "faithful"/"unfaithful" labels with a maximize score direction (1.0 = faithful).

View File

@@ -0,0 +1,52 @@
# Error Analysis: Multi-Turn Conversations
Debugging complex multi-turn conversation traces.
## The Approach
1. **End-to-end first** - Did the conversation achieve the goal?
2. **Find first failure** - Trace backwards to root cause
3. **Simplify** - Try single-turn before multi-turn debug
4. **N-1 testing** - Isolate turn-specific vs capability issues
## Find First Upstream Failure
```
Turn 1: User asks about flights ✓
Turn 2: Assistant asks for dates ✓
Turn 3: User provides dates ✓
Turn 4: Assistant searches WRONG dates ← FIRST FAILURE
Turn 5: Shows wrong flights (consequence)
Turn 6: User frustrated (consequence)
```
Focus on Turn 4, not Turn 6.
## Simplify First
Before debugging multi-turn, test single-turn:
```python
# If single-turn also fails → problem is retrieval/knowledge
# If single-turn passes → problem is conversation context
response = chat("What's the return policy for electronics?")
```
## N-1 Testing
Give turns 1 to N-1 as context, test turn N:
```python
context = conversation[:n-1]
response = chat_with_context(context, user_message_n)
# Compare to actual turn N
```
This isolates whether error is from context or underlying capability.
## Checklist
1. Did conversation achieve goal? (E2E)
2. Which turn first went wrong?
3. Can you reproduce with single-turn?
4. Is error from context or capability? (N-1 test)

View File

@@ -0,0 +1,170 @@
# Error Analysis
Review traces to discover failure modes before building evaluators.
## Process
1. **Sample** - 100+ traces (errors, negative feedback, random)
2. **Open Code** - Write free-form notes per trace
3. **Axial Code** - Group notes into failure categories
4. **Quantify** - Count failures per category
5. **Prioritize** - Rank by frequency × severity
## Sample Traces
### Span-level sampling (Python — DataFrame)
```python
from phoenix.client import Client
# Client() works for local Phoenix (falls back to env vars or localhost:6006)
# For remote/cloud: Client(base_url="https://app.phoenix.arize.com", api_key="...")
client = Client()
spans_df = client.spans.get_spans_dataframe(project_identifier="my-app")
# Build representative sample
sample = pd.concat([
spans_df[spans_df["status_code"] == "ERROR"].sample(30),
spans_df[spans_df["feedback"] == "negative"].sample(30),
spans_df.sample(40),
]).drop_duplicates("span_id").head(100)
```
### Span-level sampling (TypeScript)
```typescript
import { getSpans } from "@arizeai/phoenix-client/spans";
const { spans: errors } = await getSpans({
project: { projectName: "my-app" },
statusCode: "ERROR",
limit: 30,
});
const { spans: allSpans } = await getSpans({
project: { projectName: "my-app" },
limit: 70,
});
const sample = [...errors, ...allSpans.sort(() => Math.random() - 0.5).slice(0, 40)];
const unique = [...new Map(sample.map((s) => [s.context.span_id, s])).values()].slice(0, 100);
```
### Trace-level sampling (Python)
When errors span multiple spans (e.g., agent workflows), sample whole traces:
```python
from datetime import datetime, timedelta
traces = client.traces.get_traces(
project_identifier="my-app",
start_time=datetime.now() - timedelta(hours=24),
include_spans=True,
sort="latency_ms",
order="desc",
limit=100,
)
# Each trace has: trace_id, start_time, end_time, spans
```
### Trace-level sampling (TypeScript)
```typescript
import { getTraces } from "@arizeai/phoenix-client/traces";
const { traces } = await getTraces({
project: { projectName: "my-app" },
startTime: new Date(Date.now() - 24 * 60 * 60 * 1000),
includeSpans: true,
limit: 100,
});
```
## Add Notes (Python)
```python
client.spans.add_span_note(
span_id="abc123",
note="wrong timezone - said 3pm EST but user is PST"
)
```
## Add Notes (TypeScript)
```typescript
import { addSpanNote } from "@arizeai/phoenix-client/spans";
await addSpanNote({
spanNote: {
spanId: "abc123",
note: "wrong timezone - said 3pm EST but user is PST"
}
});
```
## What to Note
| Type | Examples |
| ---- | -------- |
| Factual errors | Wrong dates, prices, made-up features |
| Missing info | Didn't answer question, omitted details |
| Tone issues | Too casual/formal for context |
| Tool issues | Wrong tool, wrong parameters |
| Retrieval | Wrong docs, missing relevant docs |
## Good Notes
```
BAD: "Response is bad"
GOOD: "Response says ships in 2 days but policy is 5-7 days"
```
## Group into Categories
```python
categories = {
"factual_inaccuracy": ["wrong shipping time", "incorrect price"],
"hallucination": ["made up a discount", "invented feature"],
"tone_mismatch": ["informal for enterprise client"],
}
# Priority = Frequency × Severity
```
## Retrieve Existing Annotations
### Python
```python
# From a spans DataFrame
annotations_df = client.spans.get_span_annotations_dataframe(
spans_dataframe=sample,
project_identifier="my-app",
include_annotation_names=["quality", "correctness"],
)
# annotations_df has: span_id (index), name, label, score, explanation
# Or from specific span IDs
annotations_df = client.spans.get_span_annotations_dataframe(
span_ids=["span-id-1", "span-id-2"],
project_identifier="my-app",
)
```
### TypeScript
```typescript
import { getSpanAnnotations } from "@arizeai/phoenix-client/spans";
const { annotations } = await getSpanAnnotations({
project: { projectName: "my-app" },
spanIds: ["span-id-1", "span-id-2"],
includeAnnotationNames: ["quality", "correctness"],
});
for (const ann of annotations) {
console.log(`${ann.span_id}: ${ann.name} = ${ann.result?.label} (${ann.result?.score})`);
}
```
## Saturation
Stop when new traces reveal no new failure modes. Minimum: 100 traces.

View File

@@ -0,0 +1,137 @@
# Batch Evaluation with evaluate_dataframe (Python)
Run evaluators across a DataFrame. The core 2.0 batch evaluation API.
## Preferred: async_evaluate_dataframe
For batch evaluations (especially with LLM evaluators), prefer the async version
for better throughput:
```python
from phoenix.evals import async_evaluate_dataframe
results_df = await async_evaluate_dataframe(
dataframe=df, # pandas DataFrame with columns matching evaluator params
evaluators=[eval1, eval2], # List of evaluators
concurrency=5, # Max concurrent LLM calls (default 3)
exit_on_error=False, # Optional: True stops on first error (default); False continues past errors
max_retries=3, # Optional: retry failed LLM calls (default 10)
)
```
## Sync Version
```python
from phoenix.evals import evaluate_dataframe
results_df = evaluate_dataframe(
dataframe=df, # pandas DataFrame with columns matching evaluator params
evaluators=[eval1, eval2], # List of evaluators
exit_on_error=False, # Optional: True stops on first error (default); False continues past errors
max_retries=3, # Optional: retry failed LLM calls (default 10)
)
```
## Result Column Format
`async_evaluate_dataframe` / `evaluate_dataframe` returns a copy of the input DataFrame with added columns.
**Result columns contain dicts, NOT raw numbers.**
For each evaluator named `"foo"`, two columns are added:
| Column | Type | Contents |
| ------ | ---- | -------- |
| `foo_score` | `dict` | `{"name": "foo", "score": 1.0, "label": "True", "explanation": "...", "metadata": {...}, "kind": "code", "direction": "maximize"}` |
| `foo_execution_details` | `dict` | `{"status": "success", "exceptions": [], "execution_seconds": 0.001}` |
Only non-None fields appear in the score dict.
### Extracting Numeric Scores
```python
# WRONG — these will fail or produce unexpected results
score = results_df["relevance"].mean() # KeyError!
score = results_df["relevance_score"].mean() # Tries to average dicts!
# RIGHT — extract the numeric score from each dict
scores = results_df["relevance_score"].apply(
lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0
)
mean_score = scores.mean()
```
### Extracting Labels
```python
labels = results_df["relevance_score"].apply(
lambda x: x.get("label", "") if isinstance(x, dict) else ""
)
```
### Extracting Explanations (LLM evaluators)
```python
explanations = results_df["relevance_score"].apply(
lambda x: x.get("explanation", "") if isinstance(x, dict) else ""
)
```
### Finding Failures
```python
scores = results_df["relevance_score"].apply(
lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0
)
failed_mask = scores < 0.5
failures = results_df[failed_mask]
```
## Input Mapping
Evaluators receive each row as a dict. Column names must match the evaluator's
expected parameter names. If they don't match, use `.bind()` or `bind_evaluator`:
```python
from phoenix.evals import bind_evaluator, create_evaluator, async_evaluate_dataframe
@create_evaluator(name="check", kind="code")
def check(response: str) -> bool:
return len(response.strip()) > 0
# Option 1: Use .bind() method on the evaluator
check.bind(input_mapping={"response": "answer"})
results_df = await async_evaluate_dataframe(dataframe=df, evaluators=[check])
# Option 2: Use bind_evaluator function
bound = bind_evaluator(evaluator=check, input_mapping={"response": "answer"})
results_df = await async_evaluate_dataframe(dataframe=df, evaluators=[bound])
```
Or simply rename columns to match:
```python
df = df.rename(columns={
"attributes.input.value": "input",
"attributes.output.value": "output",
})
```
## DO NOT use run_evals
```python
# WRONG — legacy 1.0 API
from phoenix.evals import run_evals
results = run_evals(dataframe=df, evaluators=[eval1])
# Returns List[DataFrame] — one per evaluator
# RIGHT — current 2.0 API
from phoenix.evals import async_evaluate_dataframe
results_df = await async_evaluate_dataframe(dataframe=df, evaluators=[eval1])
# Returns single DataFrame with {name}_score dict columns
```
Key differences:
- `run_evals` returns a **list** of DataFrames (one per evaluator)
- `async_evaluate_dataframe` returns a **single** DataFrame with all results merged
- `async_evaluate_dataframe` uses `{name}_score` dict column format
- `async_evaluate_dataframe` uses `bind_evaluator` for input mapping (not `input_mapping=` param)

View File

@@ -0,0 +1,91 @@
# Evaluators: Code Evaluators in Python
Deterministic evaluators without LLM. Fast, cheap, reproducible.
## Basic Pattern
```python
import re
import json
from phoenix.evals import create_evaluator
@create_evaluator(name="has_citation", kind="code")
def has_citation(output: str) -> bool:
return bool(re.search(r'\[\d+\]', output))
@create_evaluator(name="json_valid", kind="code")
def json_valid(output: str) -> bool:
try:
json.loads(output)
return True
except json.JSONDecodeError:
return False
```
## Parameter Binding
| Parameter | Description |
| --------- | ----------- |
| `output` | Task output |
| `input` | Example input |
| `expected` | Expected output |
| `metadata` | Example metadata |
```python
@create_evaluator(name="matches_expected", kind="code")
def matches_expected(output: str, expected: dict) -> bool:
return output.strip() == expected.get("answer", "").strip()
```
## Common Patterns
- **Regex**: `re.search(pattern, output)`
- **JSON schema**: `jsonschema.validate()`
- **Keywords**: `keyword in output.lower()`
- **Length**: `len(output.split())`
- **Similarity**: `editdistance.eval()` or Jaccard
## Return Types
| Return type | Result |
| ----------- | ------ |
| `bool` | `True` → score=1.0, label="True"; `False` → score=0.0, label="False" |
| `float`/`int` | Used as the `score` value directly |
| `str` (short, ≤3 words) | Used as the `label` value |
| `str` (long, ≥4 words) | Used as the `explanation` value |
| `dict` with `score`/`label`/`explanation` | Mapped to Score fields directly |
| `Score` object | Used as-is |
## Important: Code vs LLM Evaluators
The `@create_evaluator` decorator wraps a plain Python function.
- `kind="code"` (default): For deterministic evaluators that don't call an LLM.
- `kind="llm"`: Marks the evaluator as LLM-based, but **you** must implement the LLM
call inside the function. The decorator does not call an LLM for you.
For most LLM-based evaluation, prefer `ClassificationEvaluator` which handles
the LLM call, structured output parsing, and explanations automatically:
```python
from phoenix.evals import ClassificationEvaluator, LLM
relevance = ClassificationEvaluator(
name="relevance",
prompt_template="Is this relevant?\n{{input}}\n{{output}}\nAnswer:",
llm=LLM(provider="openai", model="gpt-4o"),
choices={"relevant": 1.0, "irrelevant": 0.0},
)
```
## Pre-Built
```python
from phoenix.experiments.evaluators import ContainsAnyKeyword, JSONParseable, MatchesRegex
evaluators = [
ContainsAnyKeyword(keywords=["disclaimer"]),
JSONParseable(),
MatchesRegex(pattern=r"\d{4}-\d{2}-\d{2}"),
]
```

View File

@@ -0,0 +1,51 @@
# Evaluators: Code Evaluators in TypeScript
Deterministic evaluators that run without an LLM. Fast, cheap, reproducible.
## Basic Pattern
```typescript
import { createEvaluator } from "@arizeai/phoenix-evals";
const containsCitation = createEvaluator<{ output: string }>(
({ output }) => /\[\d+\]/.test(output) ? 1 : 0,
{ name: "contains_citation", kind: "CODE" }
);
```
## With Full Results (asExperimentEvaluator)
```typescript
import { asExperimentEvaluator } from "@arizeai/phoenix-client/experiments";
const jsonValid = asExperimentEvaluator({
name: "json_valid",
kind: "CODE",
evaluate: async ({ output }) => {
try {
JSON.parse(String(output));
return { score: 1.0, label: "valid_json" };
} catch (e) {
return { score: 0.0, label: "invalid_json", explanation: String(e) };
}
},
});
```
## Parameter Types
```typescript
interface EvaluatorParams {
input: Record<string, unknown>;
output: unknown;
expected: Record<string, unknown>;
metadata: Record<string, unknown>;
}
```
## Common Patterns
- **Regex**: `/pattern/.test(output)`
- **JSON**: `JSON.parse()` + zod schema
- **Keywords**: `output.includes(keyword)`
- **Similarity**: `fastest-levenshtein`

View File

@@ -0,0 +1,54 @@
# Evaluators: Custom Templates
Design LLM judge prompts.
## Complete Template Pattern
```python
TEMPLATE = """Evaluate faithfulness of the response to the context.
<context>{{context}}</context>
<response>{{output}}</response>
CRITERIA:
"faithful" = ALL claims supported by context
"unfaithful" = ANY claim NOT in context
EXAMPLES:
Context: "Price is $10" → Response: "It costs $10" → faithful
Context: "Price is $10" → Response: "About $15" → unfaithful
EDGE CASES:
- Empty context → cannot_evaluate
- "I don't know" when appropriate → faithful
- Partial faithfulness → unfaithful (strict)
Answer (faithful/unfaithful):"""
```
## Template Structure
1. Task description
2. Input variables in XML tags
3. Criteria definitions
4. Examples (2-4 cases)
5. Edge cases
6. Output format
## XML Tags
```
<question>{{input}}</question>
<response>{{output}}</response>
<context>{{context}}</context>
<reference>{{reference}}</reference>
```
## Common Mistakes
| Mistake | Fix |
| ------- | --- |
| Vague criteria | Define each label exactly |
| No examples | Include 2-4 cases |
| Ambiguous format | Specify exact output |
| No edge cases | Address ambiguity |

View File

@@ -0,0 +1,92 @@
# Evaluators: LLM Evaluators in Python
LLM evaluators use a language model to judge outputs. Use when criteria are subjective.
## Quick Start
```python
from phoenix.evals import ClassificationEvaluator, LLM
llm = LLM(provider="openai", model="gpt-4o")
HELPFULNESS_TEMPLATE = """Rate how helpful the response is.
<question>{{input}}</question>
<response>{{output}}</response>
"helpful" means directly addresses the question.
"not_helpful" means does not address the question.
Your answer (helpful/not_helpful):"""
helpfulness = ClassificationEvaluator(
name="helpfulness",
prompt_template=HELPFULNESS_TEMPLATE,
llm=llm,
choices={"not_helpful": 0, "helpful": 1}
)
```
## Template Variables
Use XML tags to wrap variables for clarity:
| Variable | XML Tag |
| -------- | ------- |
| `{{input}}` | `<question>{{input}}</question>` |
| `{{output}}` | `<response>{{output}}</response>` |
| `{{reference}}` | `<reference>{{reference}}</reference>` |
| `{{context}}` | `<context>{{context}}</context>` |
## create_classifier (Factory)
Shorthand factory that returns a `ClassificationEvaluator`. When you need additional
parameters or customization, prefer instantiating `ClassificationEvaluator` directly:
```python
from phoenix.evals import create_classifier, LLM
relevance = create_classifier(
name="relevance",
prompt_template="""Is this response relevant to the question?
<question>{{input}}</question>
<response>{{output}}</response>
Answer (relevant/irrelevant):""",
llm=LLM(provider="openai", model="gpt-4o"),
choices={"relevant": 1.0, "irrelevant": 0.0},
)
```
## Input Mapping
Column names must match template variables. Rename columns or use `bind_evaluator`:
```python
# Option 1: Rename columns to match template variables
df = df.rename(columns={"user_query": "input", "ai_response": "output"})
# Option 2: Use bind_evaluator
from phoenix.evals import bind_evaluator
bound = bind_evaluator(
evaluator=helpfulness,
input_mapping={"input": "user_query", "output": "ai_response"},
)
```
## Running
```python
from phoenix.evals import evaluate_dataframe
results_df = evaluate_dataframe(dataframe=df, evaluators=[helpfulness])
```
## Best Practices
1. **Be specific** - Define exactly what pass/fail means
2. **Include examples** - Show concrete cases for each label
3. **Explanations by default** - `ClassificationEvaluator` includes explanations automatically
4. **Study built-in prompts** - See
`phoenix.evals.__generated__.classification_evaluator_configs` for examples
of well-structured evaluation prompts (Faithfulness, Correctness, DocumentRelevance, etc.)

View File

@@ -0,0 +1,58 @@
# Evaluators: LLM Evaluators in TypeScript
LLM evaluators use a language model to judge outputs. The TypeScript implementation uses the Vercel AI SDK.
## Quick Start
```typescript
import { createClassificationEvaluator } from "@arizeai/phoenix-evals";
import { openai } from "@ai-sdk/openai";
const helpfulness = await createClassificationEvaluator<{
input: string;
output: string;
}>({
name: "helpfulness",
model: openai("gpt-4o"),
promptTemplate: `Rate helpfulness.
<question>{{input}}</question>
<response>{{output}}</response>
Answer (helpful/not_helpful):`,
choices: { not_helpful: 0, helpful: 1 },
});
```
## Template Variables
Use XML tags: `<question>{{input}}</question>`, `<response>{{output}}</response>`, `<context>{{context}}</context>`
## Custom Evaluator with asExperimentEvaluator
```typescript
import { asExperimentEvaluator } from "@arizeai/phoenix-client/experiments";
const customEval = asExperimentEvaluator({
name: "custom",
kind: "LLM",
evaluate: async ({ input, output }) => {
// Your LLM call here
return { score: 1.0, label: "pass", explanation: "..." };
},
});
```
## Pre-Built Evaluators
```typescript
import { createFaithfulnessEvaluator } from "@arizeai/phoenix-evals";
const faithfulnessEvaluator = createFaithfulnessEvaluator({
model: openai("gpt-4o"),
});
```
## Best Practices
- Be specific about criteria
- Include examples in prompts
- Use `<thinking>` for chain of thought

View File

@@ -0,0 +1,40 @@
# Evaluators: Overview
When and how to build automated evaluators.
## Decision Framework
```
Should I Build an Evaluator?
Can I fix it with a prompt change?
YES → Fix the prompt first
NO → Is this a recurring issue?
YES → Build evaluator
NO → Add to watchlist
```
**Don't automate prematurely.** Many issues are simple prompt fixes.
## Evaluator Requirements
1. **Clear criteria** - Specific, not "Is it good?"
2. **Labeled test set** - 100+ examples with human labels
3. **Measured accuracy** - Know TPR/TNR before deploying
## Evaluator Lifecycle
1. **Discover** - Error analysis reveals pattern
2. **Design** - Define criteria and test cases
3. **Implement** - Build code or LLM evaluator
4. **Calibrate** - Validate against human labels
5. **Deploy** - Add to experiment/CI pipeline
6. **Monitor** - Track accuracy over time
7. **Maintain** - Update as product evolves
## What NOT to Automate
- **Rare issues** - <5 instances? Watchlist, don't build
- **Quick fixes** - Fixable by prompt change? Fix it
- **Evolving criteria** - Stabilize definition first

View File

@@ -0,0 +1,75 @@
# Evaluators: Pre-Built
Use for exploration only. Validate before production.
## Python
```python
from phoenix.evals import LLM
from phoenix.evals.metrics import FaithfulnessEvaluator
llm = LLM(provider="openai", model="gpt-4o")
faithfulness_eval = FaithfulnessEvaluator(llm=llm)
```
**Note**: `HallucinationEvaluator` is deprecated. Use `FaithfulnessEvaluator` instead.
It uses "faithful"/"unfaithful" labels with score 1.0 = faithful.
## TypeScript
```typescript
import { createHallucinationEvaluator } from "@arizeai/phoenix-evals";
import { openai } from "@ai-sdk/openai";
const hallucinationEval = createHallucinationEvaluator({ model: openai("gpt-4o") });
```
## Available (2.0)
| Evaluator | Type | Description |
| --------- | ---- | ----------- |
| `FaithfulnessEvaluator` | LLM | Is the response faithful to the context? |
| `CorrectnessEvaluator` | LLM | Is the response correct? |
| `DocumentRelevanceEvaluator` | LLM | Are retrieved documents relevant? |
| `ToolSelectionEvaluator` | LLM | Did the agent select the right tool? |
| `ToolInvocationEvaluator` | LLM | Did the agent invoke the tool correctly? |
| `ToolResponseHandlingEvaluator` | LLM | Did the agent handle the tool response well? |
| `MatchesRegex` | Code | Does output match a regex pattern? |
| `PrecisionRecallFScore` | Code | Precision/recall/F-score metrics |
| `exact_match` | Code | Exact string match |
Legacy evaluators (`HallucinationEvaluator`, `QAEvaluator`, `RelevanceEvaluator`,
`ToxicityEvaluator`, `SummarizationEvaluator`) live in `phoenix.evals.legacy` and are deprecated.
## When to Use
| Situation | Recommendation |
| --------- | -------------- |
| Exploration | Find traces to review |
| Find outliers | Sort by scores |
| Production | Validate first (>80% human agreement) |
| Domain-specific | Build custom |
## Exploration Pattern
```python
from phoenix.evals import evaluate_dataframe
results_df = evaluate_dataframe(dataframe=traces, evaluators=[faithfulness_eval])
# Score columns contain dicts — extract numeric scores
scores = results_df["faithfulness_score"].apply(
lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0
)
low_scores = results_df[scores < 0.5] # Review these
high_scores = results_df[scores > 0.9] # Also sample
```
## Validation Required
```python
from sklearn.metrics import classification_report
print(classification_report(human_labels, evaluator_results["label"]))
# Target: >80% agreement
```

View File

@@ -0,0 +1,108 @@
# Evaluators: RAG Systems
RAG has two distinct components requiring different evaluation approaches.
## Two-Phase Evaluation
```
RETRIEVAL GENERATION
───────── ──────────
Query → Retriever → Docs Docs + Query → LLM → Answer
│ │
IR Metrics LLM Judges / Code Checks
```
**Debug retrieval first** using IR metrics, then tackle generation quality.
## Retrieval Evaluation (IR Metrics)
Use traditional information retrieval metrics:
| Metric | What It Measures |
| ------ | ---------------- |
| Recall@k | Of all relevant docs, how many in top k? |
| Precision@k | Of k retrieved docs, how many relevant? |
| MRR | How high is first relevant doc? |
| NDCG | Quality weighted by position |
```python
# Requires query-document relevance labels
def recall_at_k(retrieved_ids, relevant_ids, k=5):
retrieved_set = set(retrieved_ids[:k])
relevant_set = set(relevant_ids)
if not relevant_set:
return 0.0
return len(retrieved_set & relevant_set) / len(relevant_set)
```
## Creating Retrieval Test Data
Generate query-document pairs synthetically:
```python
# Reverse process: document → questions that document answers
def generate_retrieval_test(documents):
test_pairs = []
for doc in documents:
# Extract facts, generate questions
questions = llm(f"Generate 3 questions this document answers:\n{doc}")
for q in questions:
test_pairs.append({"query": q, "relevant_doc_id": doc.id})
return test_pairs
```
## Generation Evaluation
Use LLM judges for qualities code can't measure:
| Eval | Question |
| ---- | -------- |
| **Faithfulness** | Are all claims supported by retrieved context? |
| **Relevance** | Does answer address the question? |
| **Completeness** | Does answer cover key points from context? |
```python
from phoenix.evals import ClassificationEvaluator, LLM
FAITHFULNESS_TEMPLATE = """Given the context and answer, is every claim in the answer supported by the context?
<context>{{context}}</context>
<answer>{{output}}</answer>
"faithful" = ALL claims supported by context
"unfaithful" = ANY claim NOT in context
Answer (faithful/unfaithful):"""
faithfulness = ClassificationEvaluator(
name="faithfulness",
prompt_template=FAITHFULNESS_TEMPLATE,
llm=LLM(provider="openai", model="gpt-4o"),
choices={"unfaithful": 0, "faithful": 1}
)
```
## RAG Failure Taxonomy
Common failure modes to evaluate:
```yaml
retrieval_failures:
- no_relevant_docs: Query returns unrelated content
- partial_retrieval: Some relevant docs missed
- wrong_chunk: Right doc, wrong section
generation_failures:
- hallucination: Claims not in retrieved context
- ignored_context: Answer doesn't use retrieved docs
- incomplete: Missing key information from context
- wrong_synthesis: Misinterprets or miscombines sources
```
## Evaluation Order
1. **Retrieval first** - If wrong docs, generation will fail
2. **Faithfulness** - Is answer grounded in context?
3. **Answer quality** - Does answer address the question?
Fix retrieval problems before debugging generation.

View File

@@ -0,0 +1,133 @@
# Experiments: Datasets in Python
Creating and managing evaluation datasets.
## Creating Datasets
```python
from phoenix.client import Client
client = Client()
# From examples
dataset = client.datasets.create_dataset(
name="qa-test-v1",
examples=[
{
"input": {"question": "What is 2+2?"},
"output": {"answer": "4"},
"metadata": {"category": "math"},
},
],
)
# From DataFrame
dataset = client.datasets.create_dataset(
dataframe=df,
name="qa-test-v1",
input_keys=["question"],
output_keys=["answer"],
metadata_keys=["category"],
)
```
## From Production Traces
```python
spans_df = client.spans.get_spans_dataframe(project_identifier="my-app")
dataset = client.datasets.create_dataset(
dataframe=spans_df[["input.value", "output.value"]],
name="production-sample-v1",
input_keys=["input.value"],
output_keys=["output.value"],
)
```
## Retrieving Datasets
```python
dataset = client.datasets.get_dataset(name="qa-test-v1")
df = dataset.to_dataframe()
```
## Key Parameters
| Parameter | Description |
| --------- | ----------- |
| `input_keys` | Columns for task input |
| `output_keys` | Columns for expected output |
| `metadata_keys` | Additional context |
## Using Evaluators in Experiments
### Evaluators as experiment evaluators
Pass phoenix-evals evaluators directly to `run_experiment` as the `evaluators` argument:
```python
from functools import partial
from phoenix.client import AsyncClient
from phoenix.evals import ClassificationEvaluator, LLM, bind_evaluator
# Define an LLM evaluator
refusal = ClassificationEvaluator(
name="refusal",
prompt_template="Is this a refusal?\nQuestion: {{query}}\nResponse: {{response}}",
llm=LLM(provider="openai", model="gpt-4o"),
choices={"refusal": 0, "answer": 1},
)
# Bind to map dataset columns to evaluator params
refusal_evaluator = bind_evaluator(refusal, {"query": "input.query", "response": "output"})
# Define experiment task
async def run_rag_task(input, rag_engine):
return rag_engine.query(input["query"])
# Run experiment with the evaluator
experiment = await AsyncClient().experiments.run_experiment(
dataset=ds,
task=partial(run_rag_task, rag_engine=query_engine),
experiment_name="baseline",
evaluators=[refusal_evaluator],
concurrency=10,
)
```
### Evaluators as the task (meta evaluation)
Use an LLM evaluator as the experiment **task** to test the evaluator itself
against human annotations:
```python
from phoenix.evals import create_evaluator
# The evaluator IS the task being tested
def run_refusal_eval(input, evaluator):
result = evaluator.evaluate(input)
return result[0]
# A simple heuristic checks judge vs human agreement
@create_evaluator(name="exact_match")
def exact_match(output, expected):
return float(output["score"]) == float(expected["refusal_score"])
# Run: evaluator is the task, exact_match evaluates it
experiment = await AsyncClient().experiments.run_experiment(
dataset=annotated_dataset,
task=partial(run_refusal_eval, evaluator=refusal),
experiment_name="judge-v1",
evaluators=[exact_match],
concurrency=10,
)
```
This pattern lets you iterate on evaluator prompts until they align with human judgments.
See `tutorials/evals/evals-2/evals_2.0_rag_demo.ipynb` for a full worked example.
## Best Practices
- **Versioning**: Create new datasets (e.g., `qa-test-v2`), don't modify
- **Metadata**: Track source, category, difficulty
- **Balance**: Ensure diverse coverage across categories

View File

@@ -0,0 +1,69 @@
# Experiments: Datasets in TypeScript
Creating and managing evaluation datasets.
## Creating Datasets
```typescript
import { createClient } from "@arizeai/phoenix-client";
import { createDataset } from "@arizeai/phoenix-client/datasets";
const client = createClient();
const { datasetId } = await createDataset({
client,
name: "qa-test-v1",
examples: [
{
input: { question: "What is 2+2?" },
output: { answer: "4" },
metadata: { category: "math" },
},
],
});
```
## Example Structure
```typescript
interface DatasetExample {
input: Record<string, unknown>; // Task input
output?: Record<string, unknown>; // Expected output
metadata?: Record<string, unknown>; // Additional context
}
```
## From Production Traces
```typescript
import { getSpans } from "@arizeai/phoenix-client/spans";
const { spans } = await getSpans({
project: { projectName: "my-app" },
parentId: null, // root spans only
limit: 100,
});
const examples = spans.map((span) => ({
input: { query: span.attributes?.["input.value"] },
output: { response: span.attributes?.["output.value"] },
metadata: { spanId: span.context.span_id },
}));
await createDataset({ client, name: "production-sample", examples });
```
## Retrieving Datasets
```typescript
import { getDataset, listDatasets } from "@arizeai/phoenix-client/datasets";
const dataset = await getDataset({ client, datasetId: "..." });
const all = await listDatasets({ client });
```
## Best Practices
- **Versioning**: Create new datasets, don't modify existing
- **Metadata**: Track source, category, provenance
- **Type safety**: Use TypeScript interfaces for structure

View File

@@ -0,0 +1,50 @@
# Experiments: Overview
Systematic testing of AI systems with datasets, tasks, and evaluators.
## Structure
```
DATASET → Examples: {input, expected_output, metadata}
TASK → function(input) → output
EVALUATORS → (input, output, expected) → score
EXPERIMENT → Run task on all examples, score results
```
## Basic Usage
```python
from phoenix.client.experiments import run_experiment
experiment = run_experiment(
dataset=my_dataset,
task=my_task,
evaluators=[accuracy, faithfulness],
experiment_name="improved-retrieval-v2",
)
print(experiment.aggregate_scores)
# {'accuracy': 0.85, 'faithfulness': 0.92}
```
## Workflow
1. **Create dataset** - From traces, synthetic data, or manual curation
2. **Define task** - The function to test (your LLM pipeline)
3. **Select evaluators** - Code and/or LLM-based
4. **Run experiment** - Execute and score
5. **Analyze & iterate** - Review, modify task, re-run
## Dry Runs
Test setup before full execution:
```python
experiment = run_experiment(dataset, task, evaluators, dry_run=3) # Just 3 examples
```
## Best Practices
- **Name meaningfully**: `"improved-retrieval-v2-2024-01-15"` not `"test"`
- **Version datasets**: Don't modify existing
- **Multiple evaluators**: Combine perspectives

View File

@@ -0,0 +1,78 @@
# Experiments: Running Experiments in Python
Execute experiments with `run_experiment`.
## Basic Usage
```python
from phoenix.client import Client
from phoenix.client.experiments import run_experiment
client = Client()
dataset = client.datasets.get_dataset(name="qa-test-v1")
def my_task(example):
return call_llm(example.input["question"])
def exact_match(output, expected):
return 1.0 if output.strip().lower() == expected["answer"].strip().lower() else 0.0
experiment = run_experiment(
dataset=dataset,
task=my_task,
evaluators=[exact_match],
experiment_name="qa-experiment-v1",
)
```
## Task Functions
```python
# Basic task
def task(example):
return call_llm(example.input["question"])
# With context (RAG)
def rag_task(example):
return call_llm(f"Context: {example.input['context']}\nQ: {example.input['question']}")
```
## Evaluator Parameters
| Parameter | Access |
| --------- | ------ |
| `output` | Task output |
| `expected` | Example expected output |
| `input` | Example input |
| `metadata` | Example metadata |
## Options
```python
experiment = run_experiment(
dataset=dataset,
task=my_task,
evaluators=evaluators,
experiment_name="my-experiment",
dry_run=3, # Test with 3 examples
repetitions=3, # Run each example 3 times
)
```
## Results
```python
print(experiment.aggregate_scores)
# {'accuracy': 0.85, 'faithfulness': 0.92}
for run in experiment.runs:
print(run.output, run.scores)
```
## Add Evaluations Later
```python
from phoenix.client.experiments import evaluate_experiment
evaluate_experiment(experiment=experiment, evaluators=[new_evaluator])
```

View File

@@ -0,0 +1,82 @@
# Experiments: Running Experiments in TypeScript
Execute experiments with `runExperiment`.
## Basic Usage
```typescript
import { createClient } from "@arizeai/phoenix-client";
import {
runExperiment,
asExperimentEvaluator,
} from "@arizeai/phoenix-client/experiments";
const client = createClient();
const task = async (example: { input: Record<string, unknown> }) => {
return await callLLM(example.input.question as string);
};
const exactMatch = asExperimentEvaluator({
name: "exact_match",
kind: "CODE",
evaluate: async ({ output, expected }) => ({
score: output === expected?.answer ? 1.0 : 0.0,
label: output === expected?.answer ? "match" : "no_match",
}),
});
const experiment = await runExperiment({
client,
experimentName: "qa-experiment-v1",
dataset: { datasetId: "your-dataset-id" },
task,
evaluators: [exactMatch],
});
```
## Task Functions
```typescript
// Basic task
const task = async (example) => await callLLM(example.input.question as string);
// With context (RAG)
const ragTask = async (example) => {
const prompt = `Context: ${example.input.context}\nQ: ${example.input.question}`;
return await callLLM(prompt);
};
```
## Evaluator Parameters
```typescript
interface EvaluatorParams {
input: Record<string, unknown>;
output: unknown;
expected: Record<string, unknown>;
metadata: Record<string, unknown>;
}
```
## Options
```typescript
const experiment = await runExperiment({
client,
experimentName: "my-experiment",
dataset: { datasetName: "qa-test-v1" },
task,
evaluators,
repetitions: 3, // Run each example 3 times
maxConcurrency: 5, // Limit concurrent executions
});
```
## Add Evaluations Later
```typescript
import { evaluateExperiment } from "@arizeai/phoenix-client/experiments";
await evaluateExperiment({ client, experiment, evaluators: [newEvaluator] });
```

View File

@@ -0,0 +1,70 @@
# Experiments: Generating Synthetic Test Data
Creating diverse, targeted test data for evaluation.
## Dimension-Based Approach
Define axes of variation, then generate combinations:
```python
dimensions = {
"issue_type": ["billing", "technical", "shipping"],
"customer_mood": ["frustrated", "neutral", "happy"],
"complexity": ["simple", "moderate", "complex"],
}
```
## Two-Step Generation
1. **Generate tuples** (combinations of dimension values)
2. **Convert to natural queries** (separate LLM call per tuple)
```python
# Step 1: Create tuples
tuples = [
("billing", "frustrated", "complex"),
("shipping", "neutral", "simple"),
]
# Step 2: Convert to natural query
def tuple_to_query(t):
prompt = f"""Generate a realistic customer message:
Issue: {t[0]}, Mood: {t[1]}, Complexity: {t[2]}
Write naturally, include typos if appropriate. Don't be formulaic."""
return llm(prompt)
```
## Target Failure Modes
Dimensions should target known failures from error analysis:
```python
# From error analysis findings
dimensions = {
"timezone": ["EST", "PST", "UTC", "ambiguous"], # Known failure
"date_format": ["ISO", "US", "EU", "relative"], # Known failure
}
```
## Quality Control
- **Validate**: Check for placeholder text, minimum length
- **Deduplicate**: Remove near-duplicate queries using embeddings
- **Balance**: Ensure coverage across dimension values
## When to Use
| Use Synthetic | Use Real Data |
| ------------- | ------------- |
| Limited production data | Sufficient traces |
| Testing edge cases | Validating actual behavior |
| Pre-launch evals | Post-launch monitoring |
## Sample Sizes
| Purpose | Size |
| ------- | ---- |
| Initial exploration | 50-100 |
| Comprehensive eval | 100-500 |
| Per-dimension | 10-20 per combination |

View File

@@ -0,0 +1,86 @@
# Experiments: Generating Synthetic Test Data (TypeScript)
Creating diverse, targeted test data for evaluation.
## Dimension-Based Approach
Define axes of variation, then generate combinations:
```typescript
const dimensions = {
issueType: ["billing", "technical", "shipping"],
customerMood: ["frustrated", "neutral", "happy"],
complexity: ["simple", "moderate", "complex"],
};
```
## Two-Step Generation
1. **Generate tuples** (combinations of dimension values)
2. **Convert to natural queries** (separate LLM call per tuple)
```typescript
import { generateText } from "ai";
import { openai } from "@ai-sdk/openai";
// Step 1: Create tuples
type Tuple = [string, string, string];
const tuples: Tuple[] = [
["billing", "frustrated", "complex"],
["shipping", "neutral", "simple"],
];
// Step 2: Convert to natural query
async function tupleToQuery(t: Tuple): Promise<string> {
const { text } = await generateText({
model: openai("gpt-4o"),
prompt: `Generate a realistic customer message:
Issue: ${t[0]}, Mood: ${t[1]}, Complexity: ${t[2]}
Write naturally, include typos if appropriate. Don't be formulaic.`,
});
return text;
}
```
## Target Failure Modes
Dimensions should target known failures from error analysis:
```typescript
// From error analysis findings
const dimensions = {
timezone: ["EST", "PST", "UTC", "ambiguous"], // Known failure
dateFormat: ["ISO", "US", "EU", "relative"], // Known failure
};
```
## Quality Control
- **Validate**: Check for placeholder text, minimum length
- **Deduplicate**: Remove near-duplicate queries using embeddings
- **Balance**: Ensure coverage across dimension values
```typescript
function validateQuery(query: string): boolean {
const minLength = 20;
const hasPlaceholder = /\[.*?\]|<.*?>/.test(query);
return query.length >= minLength && !hasPlaceholder;
}
```
## When to Use
| Use Synthetic | Use Real Data |
| ------------- | ------------- |
| Limited production data | Sufficient traces |
| Testing edge cases | Validating actual behavior |
| Pre-launch evals | Post-launch monitoring |
## Sample Sizes
| Purpose | Size |
| ------- | ---- |
| Initial exploration | 50-100 |
| Comprehensive eval | 100-500 |
| Per-dimension | 10-20 per combination |

View File

@@ -0,0 +1,43 @@
# Anti-Patterns
Common mistakes and fixes.
| Anti-Pattern | Problem | Fix |
| ------------ | ------- | --- |
| Generic metrics | Pre-built scores don't match your failures | Build from error analysis |
| Vibe-based | No quantification | Measure with experiments |
| Ignoring humans | Uncalibrated LLM judges | Validate >80% TPR/TNR |
| Premature automation | Evaluators for imagined problems | Let observed failures drive |
| Saturation blindness | 100% pass = no signal | Keep capability evals at 50-80% |
| Similarity metrics | BERTScore/ROUGE for generation | Use for retrieval only |
| Model switching | Hoping a model works better | Error analysis first |
## Quantify Changes
```python
baseline = run_experiment(dataset, old_prompt, evaluators)
improved = run_experiment(dataset, new_prompt, evaluators)
print(f"Improvement: {improved.pass_rate - baseline.pass_rate:+.1%}")
```
## Don't Use Similarity for Generation
```python
# BAD
score = bertscore(output, reference)
# GOOD
correct_facts = check_facts_against_source(output, context)
```
## Error Analysis Before Model Change
```python
# BAD
for model in models:
results = test(model)
# GOOD
failures = analyze_errors(results)
# Then decide if model change is warranted
```

View File

@@ -0,0 +1,58 @@
# Model Selection
Error analysis first, model changes last.
## Decision Tree
```
Performance Issue?
Error analysis suggests model problem?
NO → Fix prompts, retrieval, tools
YES → Is it a capability gap?
YES → Consider model change
NO → Fix the actual problem
```
## Judge Model Selection
| Principle | Action |
| --------- | ------ |
| Start capable | Use gpt-4o first |
| Optimize later | Test cheaper after criteria stable |
| Same model OK | Judge does different task |
```python
# Start with capable model
judge = ClassificationEvaluator(
llm=LLM(provider="openai", model="gpt-4o"),
...
)
# After validation, test cheaper
judge_cheap = ClassificationEvaluator(
llm=LLM(provider="openai", model="gpt-4o-mini"),
...
)
# Compare TPR/TNR on same test set
```
## Don't Model Shop
```python
# BAD
for model in ["gpt-4o", "claude-3", "gemini-pro"]:
results = run_experiment(dataset, task, model)
# GOOD
failures = analyze_errors(results)
# "Ignores context" → Fix prompt
# "Can't do math" → Maybe try better model
```
## When Model Change Is Warranted
- Failures persist after prompt optimization
- Capability gaps (reasoning, math, code)
- Error analysis confirms model limitation

View File

@@ -0,0 +1,76 @@
# Fundamentals
Application-specific tests for AI systems. Code first, LLM for nuance, human for truth.
## Evaluator Types
| Type | Speed | Cost | Use Case |
| ---- | ----- | ---- | -------- |
| **Code** | Fast | Cheap | Regex, JSON, format, exact match |
| **LLM** | Medium | Medium | Subjective quality, complex criteria |
| **Human** | Slow | Expensive | Ground truth, calibration |
**Decision:** Code first → LLM only when code can't capture criteria → Human for calibration.
## Score Structure
| Property | Required | Description |
| -------- | -------- | ----------- |
| `name` | Yes | Evaluator name |
| `kind` | Yes | `"code"`, `"llm"`, `"human"` |
| `score` | No* | 0-1 numeric |
| `label` | No* | `"pass"`, `"fail"` |
| `explanation` | No | Rationale |
*One of `score` or `label` required.
## Binary > Likert
Use pass/fail, not 1-5 scales. Clearer criteria, easier calibration.
```python
# Multiple binary checks instead of one Likert scale
evaluators = [
AnswersQuestion(), # Yes/No
UsesContext(), # Yes/No
NoHallucination(), # Yes/No
]
```
## Quick Patterns
### Code Evaluator
```python
from phoenix.evals import create_evaluator
@create_evaluator(name="has_citation", kind="code")
def has_citation(output: str) -> bool:
return bool(re.search(r'\[\d+\]', output))
```
### LLM Evaluator
```python
from phoenix.evals import ClassificationEvaluator, LLM
evaluator = ClassificationEvaluator(
name="helpfulness",
prompt_template="...",
llm=LLM(provider="openai", model="gpt-4o"),
choices={"not_helpful": 0, "helpful": 1}
)
```
### Run Experiment
```python
from phoenix.client.experiments import run_experiment
experiment = run_experiment(
dataset=dataset,
task=my_task,
evaluators=[evaluator1, evaluator2],
)
print(experiment.aggregate_scores)
```

View File

@@ -0,0 +1,101 @@
# Observe: Sampling Strategies
How to efficiently sample production traces for review.
## Strategies
### 1. Failure-Focused (Highest Priority)
```python
errors = spans_df[spans_df["status_code"] == "ERROR"]
negative_feedback = spans_df[spans_df["feedback"] == "negative"]
```
### 2. Outliers
```python
long_responses = spans_df.nlargest(50, "response_length")
slow_responses = spans_df.nlargest(50, "latency_ms")
```
### 3. Stratified (Coverage)
```python
# Sample equally from each category
by_query_type = spans_df.groupby("metadata.query_type").apply(
lambda x: x.sample(min(len(x), 20))
)
```
### 4. Metric-Guided
```python
# Review traces flagged by automated evaluators
flagged = spans_df[eval_results["label"] == "hallucinated"]
borderline = spans_df[(eval_results["score"] > 0.3) & (eval_results["score"] < 0.7)]
```
## Building a Review Queue
```python
def build_review_queue(spans_df, max_traces=100):
queue = pd.concat([
spans_df[spans_df["status_code"] == "ERROR"],
spans_df[spans_df["feedback"] == "negative"],
spans_df.nlargest(10, "response_length"),
spans_df.sample(min(30, len(spans_df))),
]).drop_duplicates("span_id").head(max_traces)
return queue
```
## Sample Size Guidelines
| Purpose | Size |
| ------- | ---- |
| Initial exploration | 50-100 |
| Error analysis | 100+ (until saturation) |
| Golden dataset | 100-500 |
| Judge calibration | 100+ per class |
**Saturation:** Stop when new traces show the same failure patterns.
## Trace-Level Sampling
When you need whole requests (all spans per trace), use `get_traces`:
```python
from phoenix.client import Client
from datetime import datetime, timedelta
client = Client()
# Recent traces with full span trees
traces = client.traces.get_traces(
project_identifier="my-app",
limit=100,
include_spans=True,
)
# Time-windowed sampling (e.g., last hour)
traces = client.traces.get_traces(
project_identifier="my-app",
start_time=datetime.now() - timedelta(hours=1),
limit=50,
include_spans=True,
)
# Filter by session (multi-turn conversations)
traces = client.traces.get_traces(
project_identifier="my-app",
session_id="user-session-abc",
include_spans=True,
)
# Sort by latency to find slowest requests
traces = client.traces.get_traces(
project_identifier="my-app",
sort="latency_ms",
order="desc",
limit=50,
)
```

View File

@@ -0,0 +1,147 @@
# Observe: Sampling Strategies (TypeScript)
How to efficiently sample production traces for review.
## Strategies
### 1. Failure-Focused (Highest Priority)
Use server-side filters to fetch only what you need:
```typescript
import { getSpans } from "@arizeai/phoenix-client/spans";
// Server-side filter — only ERROR spans are returned
const { spans: errors } = await getSpans({
project: { projectName: "my-project" },
statusCode: "ERROR",
limit: 100,
});
// Fetch only LLM spans
const { spans: llmSpans } = await getSpans({
project: { projectName: "my-project" },
spanKind: "LLM",
limit: 100,
});
// Filter by span name
const { spans: chatSpans } = await getSpans({
project: { projectName: "my-project" },
name: "chat_completion",
limit: 100,
});
```
### 2. Outliers
```typescript
const { spans } = await getSpans({
project: { projectName: "my-project" },
limit: 200,
});
const latency = (s: (typeof spans)[number]) =>
new Date(s.end_time).getTime() - new Date(s.start_time).getTime();
const sorted = [...spans].sort((a, b) => latency(b) - latency(a));
const slowResponses = sorted.slice(0, 50);
```
### 3. Stratified (Coverage)
```typescript
// Sample equally from each category
function stratifiedSample<T>(items: T[], groupBy: (item: T) => string, perGroup: number): T[] {
const groups = new Map<string, T[]>();
for (const item of items) {
const key = groupBy(item);
if (!groups.has(key)) groups.set(key, []);
groups.get(key)!.push(item);
}
return [...groups.values()].flatMap((g) => g.slice(0, perGroup));
}
const { spans } = await getSpans({
project: { projectName: "my-project" },
limit: 500,
});
const byQueryType = stratifiedSample(spans, (s) => s.attributes?.["metadata.query_type"] ?? "unknown", 20);
```
### 4. Metric-Guided
```typescript
import { getSpanAnnotations } from "@arizeai/phoenix-client/spans";
// Fetch annotations for your spans, then filter by label
const { annotations } = await getSpanAnnotations({
project: { projectName: "my-project" },
spanIds: spans.map((s) => s.context.span_id),
includeAnnotationNames: ["hallucination"],
});
const flaggedSpanIds = new Set(
annotations.filter((a) => a.result?.label === "hallucinated").map((a) => a.span_id)
);
const flagged = spans.filter((s) => flaggedSpanIds.has(s.context.span_id));
```
## Trace-Level Sampling
When you need whole requests (all spans in a trace), use `getTraces`:
```typescript
import { getTraces } from "@arizeai/phoenix-client/traces";
// Recent traces with full span trees
const { traces } = await getTraces({
project: { projectName: "my-project" },
limit: 100,
includeSpans: true,
});
// Filter by session (e.g., multi-turn conversations)
const { traces: sessionTraces } = await getTraces({
project: { projectName: "my-project" },
sessionId: "user-session-abc",
includeSpans: true,
});
// Time-windowed sampling
const { traces: recentTraces } = await getTraces({
project: { projectName: "my-project" },
startTime: new Date(Date.now() - 60 * 60 * 1000), // last hour
limit: 50,
includeSpans: true,
});
```
## Building a Review Queue
```typescript
// Combine server-side filters into a review queue
const { spans: errorSpans } = await getSpans({
project: { projectName: "my-project" },
statusCode: "ERROR",
limit: 30,
});
const { spans: allSpans } = await getSpans({
project: { projectName: "my-project" },
limit: 100,
});
const random = allSpans.sort(() => Math.random() - 0.5).slice(0, 30);
const combined = [...errorSpans, ...random];
const unique = [...new Map(combined.map((s) => [s.context.span_id, s])).values()];
const reviewQueue = unique.slice(0, 100);
```
## Sample Size Guidelines
| Purpose | Size |
| ------- | ---- |
| Initial exploration | 50-100 |
| Error analysis | 100+ (until saturation) |
| Golden dataset | 100-500 |
| Judge calibration | 100+ per class |
**Saturation:** Stop when new traces show the same failure patterns.

View File

@@ -0,0 +1,144 @@
# Observe: Tracing Setup
Configure tracing to capture data for evaluation.
## Quick Setup
```python
# Python
from phoenix.otel import register
register(project_name="my-app", auto_instrument=True)
```
```typescript
// TypeScript
import { registerPhoenix } from "@arizeai/phoenix-otel";
registerPhoenix({ projectName: "my-app", autoInstrument: true });
```
## Essential Attributes
| Attribute | Why It Matters |
| --------- | -------------- |
| `input.value` | User's request |
| `output.value` | Response to evaluate |
| `retrieval.documents` | Context for faithfulness |
| `tool.name`, `tool.parameters` | Agent evaluation |
| `llm.model_name` | Track by model |
## Custom Attributes for Evals
```python
span.set_attribute("metadata.client_type", "enterprise")
span.set_attribute("metadata.query_category", "billing")
```
## Exporting for Evaluation
### Spans (Python — DataFrame)
```python
from phoenix.client import Client
# Client() works for local Phoenix (falls back to env vars or localhost:6006)
# For remote/cloud: Client(base_url="https://app.phoenix.arize.com", api_key="...")
client = Client()
spans_df = client.spans.get_spans_dataframe(
project_identifier="my-app", # NOT project_name= (deprecated)
root_spans_only=True,
)
dataset = client.datasets.create_dataset(
name="error-analysis-set",
dataframe=spans_df[["input.value", "output.value"]],
input_keys=["input.value"],
output_keys=["output.value"],
)
```
### Spans (TypeScript)
```typescript
import { getSpans } from "@arizeai/phoenix-client/spans";
const { spans } = await getSpans({
project: { projectName: "my-app" },
parentId: null, // root spans only
limit: 100,
});
```
### Traces (Python — structured)
Use `get_traces` when you need full trace trees (e.g., multi-turn conversations, agent workflows):
```python
from datetime import datetime, timedelta
traces = client.traces.get_traces(
project_identifier="my-app",
start_time=datetime.now() - timedelta(hours=24),
include_spans=True, # includes all spans per trace
limit=100,
)
# Each trace has: trace_id, start_time, end_time, spans (when include_spans=True)
```
### Traces (TypeScript)
```typescript
import { getTraces } from "@arizeai/phoenix-client/traces";
const { traces } = await getTraces({
project: { projectName: "my-app" },
startTime: new Date(Date.now() - 24 * 60 * 60 * 1000),
includeSpans: true,
limit: 100,
});
```
## Uploading Evaluations as Annotations
### Python
```python
from phoenix.evals import evaluate_dataframe
from phoenix.evals.utils import to_annotation_dataframe
# Run evaluations
results_df = evaluate_dataframe(dataframe=spans_df, evaluators=[my_eval])
# Format results for Phoenix annotations
annotations_df = to_annotation_dataframe(results_df)
# Upload to Phoenix
client.spans.log_span_annotations_dataframe(dataframe=annotations_df)
```
### TypeScript
```typescript
import { logSpanAnnotations } from "@arizeai/phoenix-client/spans";
await logSpanAnnotations({
spanAnnotations: [
{
spanId: "abc123",
name: "quality",
label: "good",
score: 0.95,
annotatorKind: "LLM",
},
],
});
```
Annotations are visible in the Phoenix UI alongside your traces.
## Verify
Required attributes: `input.value`, `output.value`, `status_code`
For RAG: `retrieval.documents`
For agents: `tool.name`, `tool.parameters`

View File

@@ -0,0 +1,137 @@
# Production: Continuous Evaluation
Capability vs regression evals and the ongoing feedback loop.
## Two Types of Evals
| Type | Pass Rate Target | Purpose | Update |
| ---- | ---------------- | ------- | ------ |
| **Capability** | 50-80% | Measure improvement | Add harder cases |
| **Regression** | 95-100% | Catch breakage | Add fixed bugs |
## Saturation
When capability evals hit >95% pass rate, they're saturated:
1. Graduate passing cases to regression suite
2. Add new challenging cases to capability suite
## Feedback Loop
```
Production → Sample traffic → Run evaluators → Find failures
↑ ↓
Deploy ← Run CI evals ← Create test cases ← Error analysis
```
## Implementation
Build a continuous monitoring loop:
1. **Sample recent traces** at regular intervals (e.g., 100 traces per hour)
2. **Run evaluators** on sampled traces
3. **Log results** to Phoenix for tracking
4. **Queue concerning results** for human review
5. **Create test cases** from recurring failure patterns
### Python
```python
from phoenix.client import Client
from datetime import datetime, timedelta
client = Client()
# 1. Sample recent spans (includes full attributes for evaluation)
spans_df = client.spans.get_spans_dataframe(
project_identifier="my-app",
start_time=datetime.now() - timedelta(hours=1),
root_spans_only=True,
limit=100,
)
# 2. Run evaluators
from phoenix.evals import evaluate_dataframe
results_df = evaluate_dataframe(
dataframe=spans_df,
evaluators=[quality_eval, safety_eval],
)
# 3. Upload results as annotations
from phoenix.evals.utils import to_annotation_dataframe
annotations_df = to_annotation_dataframe(results_df)
client.spans.log_span_annotations_dataframe(dataframe=annotations_df)
```
### TypeScript
```typescript
import { getSpans } from "@arizeai/phoenix-client/spans";
import { logSpanAnnotations } from "@arizeai/phoenix-client/spans";
// 1. Sample recent spans
const { spans } = await getSpans({
project: { projectName: "my-app" },
startTime: new Date(Date.now() - 60 * 60 * 1000),
parentId: null, // root spans only
limit: 100,
});
// 2. Run evaluators (user-defined)
const results = await Promise.all(
spans.map(async (span) => ({
spanId: span.context.span_id,
...await runEvaluators(span, [qualityEval, safetyEval]),
}))
);
// 3. Upload results as annotations
await logSpanAnnotations({
spanAnnotations: results.map((r) => ({
spanId: r.spanId,
name: "quality",
score: r.qualityScore,
label: r.qualityLabel,
annotatorKind: "LLM" as const,
})),
});
```
For trace-level monitoring (e.g., agent workflows), use `get_traces`/`getTraces` to identify traces:
```python
# Python: identify slow traces
traces = client.traces.get_traces(
project_identifier="my-app",
start_time=datetime.now() - timedelta(hours=1),
sort="latency_ms",
order="desc",
limit=50,
)
```
```typescript
// TypeScript: identify slow traces
import { getTraces } from "@arizeai/phoenix-client/traces";
const { traces } = await getTraces({
project: { projectName: "my-app" },
startTime: new Date(Date.now() - 60 * 60 * 1000),
limit: 50,
});
```
## Alerting
| Condition | Severity | Action |
| --------- | -------- | ------ |
| Regression < 98% | Critical | Page oncall |
| Capability declining | Warning | Slack notify |
| Capability > 95% for 7d | Info | Schedule review |
## Key Principles
- **Two suites** - Capability + Regression always
- **Graduate cases** - Move consistent passes to regression
- **Track trends** - Monitor over time, not just snapshots

View File

@@ -0,0 +1,53 @@
# Production: Guardrails vs Evaluators
Guardrails block in real-time. Evaluators measure asynchronously.
## Key Distinction
```
Request → [INPUT GUARDRAIL] → LLM → [OUTPUT GUARDRAIL] → Response
└──→ ASYNC EVALUATOR (background)
```
## Guardrails
| Aspect | Requirement |
| ------ | ----------- |
| Timing | Synchronous, blocking |
| Latency | < 100ms |
| Purpose | Prevent harm |
| Type | Code-based (deterministic) |
**Use for:** PII detection, prompt injection, profanity, length limits, format validation.
## Evaluators
| Aspect | Characteristic |
| ------ | -------------- |
| Timing | Async, background |
| Latency | Can be seconds |
| Purpose | Measure quality |
| Type | Can use LLMs |
**Use for:** Helpfulness, faithfulness, tone, completeness, citation accuracy.
## Decision
| Question | Answer |
| -------- | ------ |
| Must block harmful content? | Guardrail |
| Measuring quality? | Evaluator |
| Need LLM judgment? | Evaluator |
| < 100ms required? | Guardrail |
| False positives = angry users? | Evaluator |
## LLM Guardrails: Rarely
Only use LLM guardrails if:
- Latency budget > 1s
- Error cost >> LLM cost
- Low volume
- Fallback exists
**Key Principle:** Guardrails prevent harm (block). Evaluators measure quality (log).

View File

@@ -0,0 +1,92 @@
# Production: Overview
CI/CD evals vs production monitoring - complementary approaches.
## Two Evaluation Modes
| Aspect | CI/CD Evals | Production Monitoring |
| ------ | ----------- | -------------------- |
| **When** | Pre-deployment | Post-deployment, ongoing |
| **Data** | Fixed dataset | Sampled traffic |
| **Goal** | Prevent regression | Detect drift |
| **Response** | Block deploy | Alert & analyze |
## CI/CD Evaluations
```python
# Fast, deterministic checks
ci_evaluators = [
has_required_format,
no_pii_leak,
safety_check,
regression_test_suite,
]
# Small but representative dataset (~100 examples)
run_experiment(ci_dataset, task, ci_evaluators)
```
Set thresholds: regression=0.95, safety=1.0, format=0.98.
## Production Monitoring
### Python
```python
from phoenix.client import Client
from datetime import datetime, timedelta
client = Client()
# Sample recent traces (last hour)
traces = client.traces.get_traces(
project_identifier="my-app",
start_time=datetime.now() - timedelta(hours=1),
include_spans=True,
limit=100,
)
# Run evaluators on sampled traffic
for trace in traces:
results = run_evaluators_async(trace, production_evaluators)
if any(r["score"] < 0.5 for r in results):
alert_on_failure(trace, results)
```
### TypeScript
```typescript
import { getTraces } from "@arizeai/phoenix-client/traces";
import { getSpans } from "@arizeai/phoenix-client/spans";
// Sample recent traces (last hour)
const { traces } = await getTraces({
project: { projectName: "my-app" },
startTime: new Date(Date.now() - 60 * 60 * 1000),
includeSpans: true,
limit: 100,
});
// Or sample spans directly for evaluation
const { spans } = await getSpans({
project: { projectName: "my-app" },
startTime: new Date(Date.now() - 60 * 60 * 1000),
limit: 100,
});
// Run evaluators on sampled traffic
for (const span of spans) {
const results = await runEvaluators(span, productionEvaluators);
if (results.some((r) => r.score < 0.5)) {
await alertOnFailure(span, results);
}
}
```
Prioritize: errors → negative feedback → random sample.
## Feedback Loop
```
Production finds failure → Error analysis → Add to CI dataset → Prevents future regression
```

View File

@@ -0,0 +1,64 @@
# Setup: Python
Packages required for Phoenix evals and experiments.
## Installation
```bash
# Core Phoenix package (includes client, evals, otel)
pip install arize-phoenix
# Or install individual packages
pip install arize-phoenix-client # Phoenix client only
pip install arize-phoenix-evals # Evaluation utilities
pip install arize-phoenix-otel # OpenTelemetry integration
```
## LLM Providers
For LLM-as-judge evaluators, install your provider's SDK:
```bash
pip install openai # OpenAI
pip install anthropic # Anthropic
pip install google-generativeai # Google
```
## Validation (Optional)
```bash
pip install scikit-learn # For TPR/TNR metrics
```
## Quick Verify
```python
from phoenix.client import Client
from phoenix.evals import LLM, ClassificationEvaluator
from phoenix.otel import register
# All imports should work
print("Phoenix Python setup complete")
```
## Key Imports (Evals 2.0)
```python
from phoenix.client import Client
from phoenix.evals import (
ClassificationEvaluator, # LLM classification evaluator (preferred)
LLM, # Provider-agnostic LLM wrapper
async_evaluate_dataframe, # Batch evaluate a DataFrame (preferred, async)
evaluate_dataframe, # Batch evaluate a DataFrame (sync)
create_evaluator, # Decorator for code-based evaluators
create_classifier, # Factory for LLM classification evaluators
bind_evaluator, # Map column names to evaluator params
Score, # Score dataclass
)
from phoenix.evals.utils import to_annotation_dataframe # Format results for Phoenix annotations
```
**Prefer**: `ClassificationEvaluator` over `create_classifier` (more parameters/customization).
**Prefer**: `async_evaluate_dataframe` over `evaluate_dataframe` (better throughput for LLM evals).
**Do NOT use** legacy 1.0 imports: `OpenAIModel`, `AnthropicModel`, `run_evals`, `llm_classify`.

View File

@@ -0,0 +1,41 @@
# Setup: TypeScript
Packages required for Phoenix evals and experiments.
## Installation
```bash
# Using npm
npm install @arizeai/phoenix-client @arizeai/phoenix-evals @arizeai/phoenix-otel
# Using pnpm
pnpm add @arizeai/phoenix-client @arizeai/phoenix-evals @arizeai/phoenix-otel
```
## LLM Providers
For LLM-as-judge evaluators, install Vercel AI SDK providers:
```bash
npm install ai @ai-sdk/openai # Vercel AI SDK + OpenAI
npm install @ai-sdk/anthropic # Anthropic
npm install @ai-sdk/google # Google
```
Or use direct provider SDKs:
```bash
npm install openai # OpenAI direct
npm install @anthropic-ai/sdk # Anthropic direct
```
## Quick Verify
```typescript
import { createClient } from "@arizeai/phoenix-client";
import { createClassificationEvaluator } from "@arizeai/phoenix-evals";
import { registerPhoenix } from "@arizeai/phoenix-otel";
// All imports should work
console.log("Phoenix TypeScript setup complete");
```

View File

@@ -0,0 +1,43 @@
# Validating Evaluators (Python)
Validate LLM evaluators against human-labeled examples. Target >80% TPR/TNR/Accuracy.
## Calculate Metrics
```python
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(human_labels, evaluator_predictions))
cm = confusion_matrix(human_labels, evaluator_predictions)
tn, fp, fn, tp = cm.ravel()
tpr = tp / (tp + fn)
tnr = tn / (tn + fp)
print(f"TPR: {tpr:.2f}, TNR: {tnr:.2f}")
```
## Correct Production Estimates
```python
def correct_estimate(observed, tpr, tnr):
"""Adjust observed pass rate using known TPR/TNR."""
return (observed - (1 - tnr)) / (tpr - (1 - tnr))
```
## Find Misclassified
```python
# False Positives: Evaluator pass, human fail
fp_mask = (evaluator_predictions == 1) & (human_labels == 0)
false_positives = dataset[fp_mask]
# False Negatives: Evaluator fail, human pass
fn_mask = (evaluator_predictions == 0) & (human_labels == 1)
false_negatives = dataset[fn_mask]
```
## Red Flags
- TPR or TNR < 70%
- Large gap between TPR and TNR
- Kappa < 0.6

View File

@@ -0,0 +1,106 @@
# Validating Evaluators (TypeScript)
Validate an LLM evaluator against human-labeled examples before deploying it.
Target: **>80% TPR and >80% TNR**.
Roles are inverted compared to a normal task experiment:
| Normal experiment | Evaluator validation |
|---|---|
| Task = agent logic | Task = run the evaluator under test |
| Evaluator = judge output | Evaluator = exact-match vs human ground truth |
| Dataset = agent examples | Dataset = golden hand-labeled examples |
## Golden Dataset
Use a separate dataset name so validation experiments don't mix with task experiments in Phoenix.
Store human ground truth in `metadata.groundTruthLabel`. Aim for ~50/50 balance:
```typescript
import type { Example } from "@arizeai/phoenix-client/types/datasets";
const goldenExamples: Example[] = [
{ input: { q: "Capital of France?" }, output: { answer: "Paris" }, metadata: { groundTruthLabel: "correct" } },
{ input: { q: "Capital of France?" }, output: { answer: "Lyon" }, metadata: { groundTruthLabel: "incorrect" } },
{ input: { q: "Capital of France?" }, output: { answer: "Major city..." }, metadata: { groundTruthLabel: "incorrect" } },
];
const VALIDATOR_DATASET = "my-app-qa-evaluator-validation"; // separate from task dataset
const POSITIVE_LABEL = "correct";
const NEGATIVE_LABEL = "incorrect";
```
## Validation Experiment
```typescript
import { createClient } from "@arizeai/phoenix-client";
import { createOrGetDataset, getDatasetExamples } from "@arizeai/phoenix-client/datasets";
import { asExperimentEvaluator, runExperiment } from "@arizeai/phoenix-client/experiments";
import { myEvaluator } from "./myEvaluator.js";
const client = createClient();
const { datasetId } = await createOrGetDataset({ client, name: VALIDATOR_DATASET, examples: goldenExamples });
const { examples } = await getDatasetExamples({ client, dataset: { datasetId } });
const groundTruth = new Map(examples.map((ex) => [ex.id, ex.metadata?.groundTruthLabel as string]));
// Task: invoke the evaluator under test
const task = async (example: (typeof examples)[number]) => {
const result = await myEvaluator.evaluate({ input: example.input, output: example.output, metadata: example.metadata });
return result.label ?? "unknown";
};
// Evaluator: exact-match against human ground truth
const exactMatch = asExperimentEvaluator({
name: "exact-match", kind: "CODE",
evaluate: ({ output, metadata }) => {
const expected = metadata?.groundTruthLabel as string;
const predicted = typeof output === "string" ? output : "unknown";
return { score: predicted === expected ? 1 : 0, label: predicted, explanation: `Expected: ${expected}, Got: ${predicted}` };
},
});
const experiment = await runExperiment({
client, experimentName: `evaluator-validation-${Date.now()}`,
dataset: { datasetId }, task, evaluators: [exactMatch],
});
// Compute confusion matrix
const runs = Object.values(experiment.runs);
const predicted = new Map((experiment.evaluationRuns ?? [])
.filter((e) => e.name === "exact-match")
.map((e) => [e.experimentRunId, e.result?.label ?? null]));
let tp = 0, fp = 0, tn = 0, fn = 0;
for (const run of runs) {
if (run.error) continue;
const p = predicted.get(run.id), a = groundTruth.get(run.datasetExampleId);
if (!p || !a) continue;
if (a === POSITIVE_LABEL && p === POSITIVE_LABEL) tp++;
else if (a === NEGATIVE_LABEL && p === POSITIVE_LABEL) fp++;
else if (a === NEGATIVE_LABEL && p === NEGATIVE_LABEL) tn++;
else if (a === POSITIVE_LABEL && p === NEGATIVE_LABEL) fn++;
}
const total = tp + fp + tn + fn;
const tpr = tp + fn > 0 ? (tp / (tp + fn)) * 100 : 0;
const tnr = tn + fp > 0 ? (tn / (tn + fp)) * 100 : 0;
console.log(`TPR: ${tpr.toFixed(1)}% TNR: ${tnr.toFixed(1)}% Accuracy: ${((tp + tn) / total * 100).toFixed(1)}%`);
```
## Results & Quality Rules
| Metric | Target | Low value means |
|---|---|---|
| TPR (sensitivity) | >80% | Misses real failures (false negatives) |
| TNR (specificity) | >80% | Flags good outputs (false positives) |
| Accuracy | >80% | General weakness |
**Golden dataset rules:** ~50/50 balance · include edge cases · human-labeled only · never mutate (append new versions) · 20–50 examples is enough.
**Re-validate when:** prompt template changes · judge model changes · criteria updated · production FP/FN spike.
## See Also
- `validation.md` — Metric definitions and concepts
- `experiments-running-typescript.md``runExperiment` API
- `experiments-datasets-typescript.md``createOrGetDataset` / `getDatasetExamples`

View File

@@ -0,0 +1,74 @@
# Validation
Validate LLM judges against human labels before deploying. Target >80% agreement.
## Requirements
| Requirement | Target |
| ----------- | ------ |
| Test set size | 100+ examples |
| Balance | ~50/50 pass/fail |
| Accuracy | >80% |
| TPR/TNR | Both >70% |
## Metrics
| Metric | Formula | Use When |
| ------ | ------- | -------- |
| **Accuracy** | (TP+TN) / Total | General |
| **TPR (Recall)** | TP / (TP+FN) | Quality assurance |
| **TNR (Specificity)** | TN / (TN+FP) | Safety-critical |
| **Cohen's Kappa** | Agreement beyond chance | Comparing evaluators |
## Quick Validation
```python
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score
print(classification_report(human_labels, evaluator_predictions))
print(f"Kappa: {cohen_kappa_score(human_labels, evaluator_predictions):.3f}")
# Get TPR/TNR
cm = confusion_matrix(human_labels, evaluator_predictions)
tn, fp, fn, tp = cm.ravel()
tpr = tp / (tp + fn)
tnr = tn / (tn + fp)
```
## Golden Dataset Structure
```python
golden_example = {
"input": "What is the capital of France?",
"output": "Paris is the capital.",
"ground_truth_label": "correct",
}
```
## Building Golden Datasets
1. Sample production traces (errors, negative feedback, edge cases)
2. Balance ~50/50 pass/fail
3. Expert labels each example
4. Version datasets (never modify existing)
```python
# GOOD - create new version
golden_v2 = golden_v1 + [new_examples]
# BAD - never modify existing
golden_v1.append(new_example)
```
## Warning Signs
- All pass or all fail → too lenient/strict
- Random results → criteria unclear
- TPR/TNR < 70% → needs improvement
## Re-Validate When
- Prompt template changes
- Judge model changes
- Criteria changes
- Monthly

View File

@@ -0,0 +1,24 @@
# Phoenix Tracing Skill
OpenInference semantic conventions and instrumentation guides for Phoenix.
## Usage
Start with `SKILL.md` for the index and quick reference.
## File Organization
All files in flat `rules/` directory with semantic prefixes:
- `span-*` - Span kinds (LLM, CHAIN, TOOL, etc.)
- `setup-*`, `instrumentation-*` - Getting started guides
- `fundamentals-*`, `attributes-*` - Reference docs
- `annotations-*`, `export-*` - Advanced features
## Reference
- [OpenInference Spec](https://github.com/Arize-ai/openinference/tree/main/spec)
- [Phoenix Documentation](https://docs.arize.com/phoenix)
- [Python OTEL API](https://arize-phoenix.readthedocs.io/projects/otel/en/latest/)
- [Python Client API](https://arize-phoenix.readthedocs.io/projects/client/en/latest/)
- [TypeScript API](https://arize-ai.github.io/phoenix/)

View File

@@ -0,0 +1,139 @@
---
name: phoenix-tracing
description: OpenInference semantic conventions and instrumentation for Phoenix AI observability. Use when implementing LLM tracing, creating custom spans, or deploying to production.
license: Apache-2.0
compatibility: Requires Phoenix server. Python skills need arize-phoenix-otel; TypeScript skills need @arizeai/phoenix-otel.
metadata:
author: oss@arize.com
version: "1.0.0"
languages: "Python, TypeScript"
---
# Phoenix Tracing
Comprehensive guide for instrumenting LLM applications with OpenInference tracing in Phoenix. Contains reference files covering setup, instrumentation, span types, and production deployment.
## When to Apply
Reference these guidelines when:
- Setting up Phoenix tracing (Python or TypeScript)
- Creating custom spans for LLM operations
- Adding attributes following OpenInference conventions
- Deploying tracing to production
- Querying and analyzing trace data
## Reference Categories
| Priority | Category | Description | Prefix |
| -------- | --------------- | ------------------------------ | -------------------------- |
| 1 | Setup | Installation and configuration | `setup-*` |
| 2 | Instrumentation | Auto and manual tracing | `instrumentation-*` |
| 3 | Span Types | 9 span kinds with attributes | `span-*` |
| 4 | Organization | Projects and sessions | `projects-*`, `sessions-*` |
| 5 | Enrichment | Custom metadata | `metadata-*` |
| 6 | Production | Batch processing, masking | `production-*` |
| 7 | Feedback | Annotations and evaluation | `annotations-*` |
## Quick Reference
### 1. Setup (START HERE)
- [setup-python](references/setup-python.md) - Install arize-phoenix-otel, configure endpoint
- [setup-typescript](references/setup-typescript.md) - Install @arizeai/phoenix-otel, configure endpoint
### 2. Instrumentation
- [instrumentation-auto-python](references/instrumentation-auto-python.md) - Auto-instrument OpenAI, LangChain, etc.
- [instrumentation-auto-typescript](references/instrumentation-auto-typescript.md) - Auto-instrument supported frameworks
- [instrumentation-manual-python](references/instrumentation-manual-python.md) - Custom spans with decorators
- [instrumentation-manual-typescript](references/instrumentation-manual-typescript.md) - Custom spans with wrappers
### 3. Span Types (with full attribute schemas)
- [span-llm](references/span-llm.md) - LLM API calls (model, tokens, messages, cost)
- [span-chain](references/span-chain.md) - Multi-step workflows and pipelines
- [span-retriever](references/span-retriever.md) - Document retrieval (documents, scores)
- [span-tool](references/span-tool.md) - Function/API calls (name, parameters)
- [span-agent](references/span-agent.md) - Multi-step reasoning agents
- [span-embedding](references/span-embedding.md) - Vector generation
- [span-reranker](references/span-reranker.md) - Document re-ranking
- [span-guardrail](references/span-guardrail.md) - Safety checks
- [span-evaluator](references/span-evaluator.md) - LLM evaluation
### 4. Organization
- [projects-python](references/projects-python.md) / [projects-typescript](references/projects-typescript.md) - Group traces by application
- [sessions-python](references/sessions-python.md) / [sessions-typescript](references/sessions-typescript.md) - Track conversations
### 5. Enrichment
- [metadata-python](references/metadata-python.md) / [metadata-typescript](references/metadata-typescript.md) - Custom attributes
### 6. Production (CRITICAL)
- [production-python](references/production-python.md) / [production-typescript](references/production-typescript.md) - Batch processing, PII masking
### 7. Feedback
- [annotations-overview](references/annotations-overview.md) - Feedback concepts
- [annotations-python](references/annotations-python.md) / [annotations-typescript](references/annotations-typescript.md) - Add feedback to spans
### Reference Files
- [fundamentals-overview](references/fundamentals-overview.md) - Traces, spans, attributes basics
- [fundamentals-required-attributes](references/fundamentals-required-attributes.md) - Required fields per span type
- [fundamentals-universal-attributes](references/fundamentals-universal-attributes.md) - Common attributes (user.id, session.id)
- [fundamentals-flattening](references/fundamentals-flattening.md) - JSON flattening rules
- [attributes-messages](references/attributes-messages.md) - Chat message format
- [attributes-metadata](references/attributes-metadata.md) - Custom metadata schema
- [attributes-graph](references/attributes-graph.md) - Agent workflow attributes
- [attributes-exceptions](references/attributes-exceptions.md) - Error tracking
## Common Workflows
- **Quick Start**: setup-{lang} → instrumentation-auto-{lang} → Check Phoenix
- **Custom Spans**: setup-{lang} → instrumentation-manual-{lang} → span-{type}
- **Session Tracking**: sessions-{lang} for conversation grouping patterns
- **Production**: production-{lang} for batching, masking, and deployment
## How to Use This Skill
**Navigation Patterns:**
```bash
# By category prefix
references/setup-* # Installation and configuration
references/instrumentation-* # Auto and manual tracing
references/span-* # Span type specifications
references/sessions-* # Session tracking
references/production-* # Production deployment
references/fundamentals-* # Core concepts
references/attributes-* # Attribute specifications
# By language
references/*-python.md # Python implementations
references/*-typescript.md # TypeScript implementations
```
**Reading Order:**
1. Start with setup-{lang} for your language
2. Choose instrumentation-auto-{lang} OR instrumentation-manual-{lang}
3. Reference span-{type} files as needed for specific operations
4. See fundamentals-* files for attribute specifications
## References
**Phoenix Documentation:**
- [Phoenix Documentation](https://docs.arize.com/phoenix)
- [OpenInference Spec](https://github.com/Arize-ai/openinference/tree/main/spec)
**Python API Documentation:**
- [Python OTEL Package](https://arize-phoenix.readthedocs.io/projects/otel/en/latest/) - `arize-phoenix-otel` API reference
- [Python Client Package](https://arize-phoenix.readthedocs.io/projects/client/en/latest/) - `arize-phoenix-client` API reference
**TypeScript API Documentation:**
- [TypeScript Packages](https://arize-ai.github.io/phoenix/) - `@arizeai/phoenix-otel`, `@arizeai/phoenix-client`, and other TypeScript packages

View File

@@ -0,0 +1,69 @@
# Annotations Overview
Annotations allow you to add human or automated feedback to traces, spans, documents, and sessions. Annotations are essential for evaluation, quality assessment, and building training datasets.
## Annotation Types
Phoenix supports four types of annotations:
| Type | Target | Purpose | Example Use Case |
| ----------------------- | -------------------------------- | ---------------------------------------- | -------------------------------- |
| **Span Annotation** | Individual span | Feedback on a specific operation | "This LLM response was accurate" |
| **Document Annotation** | Document within a RETRIEVER span | Feedback on retrieved document relevance | "This document was not helpful" |
| **Trace Annotation** | Entire trace | Feedback on end-to-end interaction | "User was satisfied with result" |
| **Session Annotation** | User session | Feedback on multi-turn conversation | "Session ended successfully" |
## Annotation Fields
Every annotation has these fields:
### Required Fields
| Field | Type | Description |
| --------- | ------ | ----------------------------------------------------------------------------- |
| Entity ID | String | ID of the target entity (span_id, trace_id, session_id, or document_position) |
| `name` | String | Annotation name/label (e.g., "quality", "relevance", "helpfulness") |
### Result Fields (At Least One Required)
| Field | Type | Description |
| ------------- | ----------------- | ----------------------------------------------------------------- |
| `label` | String (optional) | Categorical value (e.g., "good", "bad", "relevant", "irrelevant") |
| `score` | Float (optional) | Numeric value (typically 0-1, but can be any range) |
| `explanation` | String (optional) | Free-text explanation of the annotation |
**At least one** of `label`, `score`, or `explanation` must be provided.
### Optional Fields
| Field | Type | Description |
| ---------------- | ------ | --------------------------------------------------------------------------------------- |
| `annotator_kind` | String | Who created this annotation: "HUMAN", "LLM", or "CODE" (default: "HUMAN") |
| `identifier` | String | Unique identifier for upsert behavior (updates existing if same name+entity+identifier) |
| `metadata` | Object | Custom metadata as key-value pairs |
## Annotator Kinds
| Kind | Description | Example |
| ------- | ------------------------------ | --------------------------------- |
| `HUMAN` | Manual feedback from a person | User ratings, expert labels |
| `LLM` | Automated feedback from an LLM | GPT-4 evaluating response quality |
| `CODE` | Automated feedback from code | Rule-based checks, heuristics |
## Examples
**Quality Assessment:**
- `quality` - Overall quality (label: good/fair/poor, score: 0-1)
- `correctness` - Factual accuracy (label: correct/incorrect, score: 0-1)
- `helpfulness` - User satisfaction (label: helpful/not_helpful, score: 0-1)
**RAG-Specific:**
- `relevance` - Document relevance to query (label: relevant/irrelevant, score: 0-1)
- `faithfulness` - Answer grounded in context (label: faithful/unfaithful, score: 0-1)
**Safety:**
- `toxicity` - Contains harmful content (score: 0-1)
- `pii_detected` - Contains personally identifiable information (label: yes/no)

View File

@@ -0,0 +1,114 @@
# Python SDK Annotation Patterns
Add feedback to spans, traces, documents, and sessions using the Python client.
## Client Setup
```python
from phoenix.client import Client
client = Client() # Default: http://localhost:6006
```
## Span Annotations
Add feedback to individual spans:
```python
client.spans.add_span_annotation(
span_id="abc123",
annotation_name="quality",
annotator_kind="HUMAN",
label="high_quality",
score=0.95,
explanation="Accurate and well-formatted",
metadata={"reviewer": "alice"},
sync=True
)
```
## Document Annotations
Rate individual documents in RETRIEVER spans:
```python
client.spans.add_document_annotation(
span_id="retriever_span",
document_position=0, # 0-based index
annotation_name="relevance",
annotator_kind="LLM",
label="relevant",
score=0.95
)
```
## Trace Annotations
Feedback on entire traces:
```python
client.traces.add_trace_annotation(
trace_id="trace_abc",
annotation_name="correctness",
annotator_kind="HUMAN",
label="correct",
score=1.0
)
```
## Session Annotations
Feedback on multi-turn conversations:
```python
client.sessions.add_session_annotation(
session_id="session_xyz",
annotation_name="user_satisfaction",
annotator_kind="HUMAN",
label="satisfied",
score=0.85
)
```
## RAG Pipeline Example
```python
from phoenix.client import Client
from phoenix.client.resources.spans import SpanDocumentAnnotationData
client = Client()
# Document relevance (batch)
client.spans.log_document_annotations(
document_annotations=[
SpanDocumentAnnotationData(
name="relevance", span_id="retriever_span", document_position=i,
annotator_kind="LLM", result={"label": label, "score": score}
)
for i, (label, score) in enumerate([
("relevant", 0.95), ("relevant", 0.80), ("irrelevant", 0.10)
])
]
)
# LLM response quality
client.spans.add_span_annotation(
span_id="llm_span",
annotation_name="faithfulness",
annotator_kind="LLM",
label="faithful",
score=0.90
)
# Overall trace quality
client.traces.add_trace_annotation(
trace_id="trace_123",
annotation_name="correctness",
annotator_kind="HUMAN",
label="correct",
score=1.0
)
```
## API Reference
- [Python Client API](https://arize-phoenix.readthedocs.io/projects/client/en/latest/)

View File

@@ -0,0 +1,137 @@
# TypeScript SDK Annotation Patterns
Add feedback to spans, traces, documents, and sessions using the TypeScript client.
## Client Setup
```typescript
import { createClient } from "@arizeai/phoenix-client";
const client = createClient(); // Default: http://localhost:6006
```
## Span Annotations
Add feedback to individual spans:
```typescript
import { addSpanAnnotation } from "@arizeai/phoenix-client";
await addSpanAnnotation({
client,
spanAnnotation: {
spanId: "abc123",
name: "quality",
annotatorKind: "HUMAN",
label: "high_quality",
score: 0.95,
explanation: "Accurate and well-formatted",
metadata: { reviewer: "alice" }
},
sync: true
});
```
## Document Annotations
Rate individual documents in RETRIEVER spans:
```typescript
import { addDocumentAnnotation } from "@arizeai/phoenix-client";
await addDocumentAnnotation({
client,
documentAnnotation: {
spanId: "retriever_span",
documentPosition: 0, // 0-based index
name: "relevance",
annotatorKind: "LLM",
label: "relevant",
score: 0.95
}
});
```
## Trace Annotations
Feedback on entire traces:
```typescript
import { addTraceAnnotation } from "@arizeai/phoenix-client";
await addTraceAnnotation({
client,
traceAnnotation: {
traceId: "trace_abc",
name: "correctness",
annotatorKind: "HUMAN",
label: "correct",
score: 1.0
}
});
```
## Session Annotations
Feedback on multi-turn conversations:
```typescript
import { addSessionAnnotation } from "@arizeai/phoenix-client";
await addSessionAnnotation({
client,
sessionAnnotation: {
sessionId: "session_xyz",
name: "user_satisfaction",
annotatorKind: "HUMAN",
label: "satisfied",
score: 0.85
}
});
```
## RAG Pipeline Example
```typescript
import { createClient, logDocumentAnnotations, addSpanAnnotation, addTraceAnnotation } from "@arizeai/phoenix-client";
const client = createClient();
// Document relevance (batch)
await logDocumentAnnotations({
client,
documentAnnotations: [
{ spanId: "retriever_span", documentPosition: 0, name: "relevance",
annotatorKind: "LLM", label: "relevant", score: 0.95 },
{ spanId: "retriever_span", documentPosition: 1, name: "relevance",
annotatorKind: "LLM", label: "relevant", score: 0.80 }
]
});
// LLM response quality
await addSpanAnnotation({
client,
spanAnnotation: {
spanId: "llm_span",
name: "faithfulness",
annotatorKind: "LLM",
label: "faithful",
score: 0.90
}
});
// Overall trace quality
await addTraceAnnotation({
client,
traceAnnotation: {
traceId: "trace_123",
name: "correctness",
annotatorKind: "HUMAN",
label: "correct",
score: 1.0
}
});
```
## API Reference
- [TypeScript Client API](https://arize-ai.github.io/phoenix/)

View File

@@ -0,0 +1,58 @@
# Flattening Convention
OpenInference flattens nested data structures into dot-notation attributes for database compatibility, OpenTelemetry compatibility, and simple querying.
## Flattening Rules
**Objects → Dot Notation**
```javascript
{ llm: { model_name: "gpt-4", token_count: { prompt: 10, completion: 20 } } }
// becomes
{ "llm.model_name": "gpt-4", "llm.token_count.prompt": 10, "llm.token_count.completion": 20 }
```
**Arrays → Zero-Indexed Notation**
```javascript
{ llm: { input_messages: [{ role: "user", content: "Hi" }] } }
// becomes
{ "llm.input_messages.0.message.role": "user", "llm.input_messages.0.message.content": "Hi" }
```
**Message Convention: `.message.` segment required**
```
llm.input_messages.{index}.message.{field}
llm.input_messages.0.message.tool_calls.0.tool_call.function.name
```
## Complete Example
```javascript
// Original
{
openinference: { span: { kind: "LLM" } },
llm: {
model_name: "claude-3-5-sonnet-20241022",
invocation_parameters: { temperature: 0.7, max_tokens: 1000 },
input_messages: [{ role: "user", content: "Tell me a joke" }],
output_messages: [{ role: "assistant", content: "Why did the chicken cross the road?" }],
token_count: { prompt: 5, completion: 10, total: 15 }
}
}
// Flattened (stored in Phoenix spans.attributes JSONB)
{
"openinference.span.kind": "LLM",
"llm.model_name": "claude-3-5-sonnet-20241022",
"llm.invocation_parameters": "{\"temperature\": 0.7, \"max_tokens\": 1000}",
"llm.input_messages.0.message.role": "user",
"llm.input_messages.0.message.content": "Tell me a joke",
"llm.output_messages.0.message.role": "assistant",
"llm.output_messages.0.message.content": "Why did the chicken cross the road?",
"llm.token_count.prompt": 5,
"llm.token_count.completion": 10,
"llm.token_count.total": 15
}
```

View File

@@ -0,0 +1,53 @@
# Overview and Traces & Spans
This document covers the fundamental concepts of OpenInference traces and spans in Phoenix.
## Overview
OpenInference is a set of semantic conventions for AI and LLM applications based on OpenTelemetry. Phoenix uses these conventions to capture, store, and analyze traces from AI applications.
**Key Concepts:**
- **Traces** represent end-to-end requests through your application
- **Spans** represent individual operations within a trace (LLM calls, retrievals, tool invocations)
- **Attributes** are key-value pairs attached to spans using flattened, dot-notation paths
- **Span Kinds** categorize the type of operation (LLM, RETRIEVER, TOOL, etc.)
## Traces and Spans
### Trace Hierarchy
A **trace** is a tree of **spans** representing a complete request:
```
Trace ID: abc123
├─ Span 1: CHAIN (root span, parent_id = null)
│ ├─ Span 2: RETRIEVER (parent_id = span_1_id)
│ │ └─ Span 3: EMBEDDING (parent_id = span_2_id)
│ └─ Span 4: LLM (parent_id = span_1_id)
│ └─ Span 5: TOOL (parent_id = span_4_id)
```
### Context Propagation
Spans maintain parent-child relationships via:
- `trace_id` - Same for all spans in a trace
- `span_id` - Unique identifier for this span
- `parent_id` - References parent span's `span_id` (null for root spans)
Phoenix uses these relationships to:
- Build the span tree visualization in the UI
- Calculate cumulative metrics (tokens, errors) up the tree
- Enable nested querying (e.g., "find CHAIN spans containing LLM spans with errors")
### Span Lifecycle
Each span has:
- `start_time` - When the operation began (Unix timestamp in nanoseconds)
- `end_time` - When the operation completed
- `status_code` - OK, ERROR, or UNSET
- `status_message` - Optional error message
- `attributes` - object with all semantic convention attributes

View File

@@ -0,0 +1,64 @@
# Required and Recommended Attributes
This document covers the required attribute and highly recommended attributes for all OpenInference spans.
## Required Attribute
**Every span MUST have exactly one required attribute:**
```json
{
"openinference.span.kind": "LLM"
}
```
## Highly Recommended Attributes
While not strictly required, these attributes are **highly recommended** on all spans as they:
- Enable evaluation and quality assessment
- Help understand information flow through your application
- Make traces more useful for debugging
### Input/Output Values
| Attribute | Type | Description |
|-----------|------|-------------|
| `input.value` | String | Input to the operation (prompt, query, document) |
| `output.value` | String | Output from the operation (response, result, answer) |
**Example:**
```json
{
"openinference.span.kind": "LLM",
"input.value": "What is the capital of France?",
"output.value": "The capital of France is Paris."
}
```
**Why these matter:**
- **Evaluations**: Many evaluators (faithfulness, relevance, hallucination detection) require both input and output to assess quality
- **Information flow**: Seeing inputs/outputs makes it easy to trace how data transforms through your application
- **Debugging**: When something goes wrong, having the actual input/output makes root cause analysis much faster
- **Analytics**: Enables pattern analysis across similar inputs or outputs
**Phoenix Behavior:**
- Input/output displayed prominently in span details
- Evaluators can automatically access these values
- Search/filter traces by input or output content
- Export inputs/outputs for fine-tuning datasets
## Valid Span Kinds
There are exactly **9 valid span kinds** in OpenInference:
| Span Kind | Purpose | Common Use Case |
|-----------|---------|-----------------|
| `LLM` | Language model inference | OpenAI, Anthropic, local LLM calls |
| `EMBEDDING` | Vector generation | Text-to-vector conversion |
| `CHAIN` | Application flow orchestration | LangChain chains, custom workflows |
| `RETRIEVER` | Document/context retrieval | Vector DB queries, semantic search |
| `RERANKER` | Result reordering | Rerank retrieved documents |
| `TOOL` | External tool invocation | API calls, function execution |
| `AGENT` | Autonomous reasoning | ReAct agents, planning loops |
| `GUARDRAIL` | Safety/policy checks | Content moderation, PII detection |
| `EVALUATOR` | Quality assessment | Answer relevance, faithfulness scoring |

View File

@@ -0,0 +1,72 @@
# Universal Attributes
This document covers attributes that can be used on any span kind in OpenInference.
## Overview
These attributes can be used on **any span kind** to provide additional context, tracking, and metadata.
## Input/Output
| Attribute | Type | Description |
| ------------------ | ------ | ---------------------------------------------------- |
| `input.value` | String | Input to the operation (prompt, query, document) |
| `input.mime_type` | String | MIME type (e.g., "text/plain", "application/json") |
| `output.value` | String | Output from the operation (response, vector, result) |
| `output.mime_type` | String | MIME type of output |
### Why Capture I/O?
**Always capture input/output for evaluation-ready spans:**
- Phoenix evaluators (faithfulness, relevance, Q&A correctness) require `input.value` and `output.value`
- Phoenix UI displays I/O prominently in trace views for debugging
- Enables exporting I/O for creating fine-tuning datasets
- Provides complete context for analyzing agent behavior
**Example attributes:**
```json
{
"openinference.span.kind": "CHAIN",
"input.value": "What is the weather?",
"input.mime_type": "text/plain",
"output.value": "I don't have access to weather data.",
"output.mime_type": "text/plain"
}
```
**See language-specific implementation:**
- TypeScript: `instrumentation-manual-typescript.md`
- Python: `instrumentation-manual-python.md`
## Session and User Tracking
| Attribute | Type | Description |
| ------------ | ------ | ---------------------------------------------- |
| `session.id` | String | Session identifier for grouping related traces |
| `user.id` | String | User identifier for per-user analysis |
**Example:**
```json
{
"openinference.span.kind": "LLM",
"session.id": "session_abc123",
"user.id": "user_xyz789"
}
```
## Metadata
| Attribute | Type | Description |
| ---------- | ------ | ------------------------------------------ |
| `metadata` | String | JSON-serialized object of key-value pairs |
**Example:**
```json
{
"openinference.span.kind": "LLM",
"metadata": "{\"environment\": \"production\", \"model_version\": \"v2.1\", \"cost_center\": \"engineering\"}"
}
```

View File

@@ -0,0 +1,85 @@
# Phoenix Tracing: Auto-Instrumentation (Python)
**Automatically create spans for LLM calls without code changes.**
## Overview
Auto-instrumentation patches supported libraries at runtime to create spans automatically. Use it for supported frameworks (LangChain, LlamaIndex, OpenAI SDK, etc.). For custom logic, see instrumentation-manual-python.md.
## Supported Frameworks
**Python:**
- LLM SDKs: OpenAI, Anthropic, Bedrock, Mistral, Vertex AI, Groq, Ollama
- Frameworks: LangChain, LlamaIndex, DSPy, CrewAI, Instructor, Haystack
- Install: `pip install openinference-instrumentation-{name}`
## Setup
**Install and enable:**
```bash
pip install arize-phoenix-otel
pip install openinference-instrumentation-openai # Add others as needed
```
```python
from phoenix.otel import register
register(project_name="my-app", auto_instrument=True) # Discovers all installed instrumentors
```
**Example:**
```python
from phoenix.otel import register
from openai import OpenAI
register(project_name="my-app", auto_instrument=True)
client = OpenAI()
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Hello!"}]
)
```
Traces appear in Phoenix UI with model, input/output, tokens, timing automatically captured. See span kind files for full attribute schemas.
**Selective instrumentation** (explicit control):
```python
from phoenix.otel import register
from openinference.instrumentation.openai import OpenAIInstrumentor
tracer_provider = register(project_name="my-app") # No auto_instrument
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)
```
## Limitations
Auto-instrumentation does NOT capture:
- Custom business logic
- Internal function calls
**Example:**
```python
def my_custom_workflow(query: str) -> str:
preprocessed = preprocess(query) # Not traced
response = client.chat.completions.create(...) # Traced (auto)
postprocessed = postprocess(response) # Not traced
return postprocessed
```
**Solution:** Add manual instrumentation:
```python
@tracer.chain
def my_custom_workflow(query: str) -> str:
preprocessed = preprocess(query)
response = client.chat.completions.create(...)
postprocessed = postprocess(response)
return postprocessed
```

View File

@@ -0,0 +1,87 @@
# Auto-Instrumentation (TypeScript)
Automatically create spans for LLM calls without code changes.
## Supported Frameworks
- **LLM SDKs:** OpenAI
- **Frameworks:** LangChain
- **Install:** `npm install @arizeai/openinference-instrumentation-{name}`
## Setup
**CommonJS (automatic):**
```javascript
const { register } = require("@arizeai/phoenix-otel");
const OpenAI = require("openai");
register({ projectName: "my-app" });
const client = new OpenAI();
```
**ESM (manual required):**
```typescript
import { register, registerInstrumentations } from "@arizeai/phoenix-otel";
import { OpenAIInstrumentation } from "@arizeai/openinference-instrumentation-openai";
import OpenAI from "openai";
register({ projectName: "my-app" });
const instrumentation = new OpenAIInstrumentation();
instrumentation.manuallyInstrument(OpenAI);
registerInstrumentations({ instrumentations: [instrumentation] });
```
**Why:** ESM imports are hoisted before `register()` runs.
## Limitations
**What auto-instrumentation does NOT capture:**
```typescript
async function myWorkflow(query: string): Promise<string> {
const preprocessed = await preprocess(query); // Not traced
const response = await client.chat.completions.create(...); // Traced (auto)
const postprocessed = await postprocess(response); // Not traced
return postprocessed;
}
```
**Solution:** Add manual instrumentation for custom logic:
```typescript
import { traceChain } from "@arizeai/openinference-core";
const myWorkflow = traceChain(
async (query: string): Promise<string> => {
const preprocessed = await preprocess(query);
const response = await client.chat.completions.create(...);
const postprocessed = await postprocess(response);
return postprocessed;
},
{ name: "my-workflow" }
);
```
## Combining Auto + Manual
```typescript
import { register } from "@arizeai/phoenix-otel";
import { traceChain } from "@arizeai/openinference-core";
register({ projectName: "my-app" });
const client = new OpenAI();
const workflow = traceChain(
async (query: string) => {
const preprocessed = await preprocess(query);
const response = await client.chat.completions.create(...); // Auto-instrumented
return postprocess(response);
},
{ name: "my-workflow" }
);
```

View File

@@ -0,0 +1,182 @@
# Manual Instrumentation (Python)
Add custom spans using decorators or context managers for fine-grained tracing control.
## Setup
```bash
pip install arize-phoenix-otel
```
```python
from phoenix.otel import register
tracer_provider = register(project_name="my-app")
tracer = tracer_provider.get_tracer(__name__)
```
## Quick Reference
| Span Kind | Decorator | Use Case |
|-----------|-----------|----------|
| CHAIN | `@tracer.chain` | Orchestration, workflows, pipelines |
| RETRIEVER | `@tracer.retriever` | Vector search, document retrieval |
| TOOL | `@tracer.tool` | External API calls, function execution |
| AGENT | `@tracer.agent` | Multi-step reasoning, planning |
| LLM | `@tracer.llm` | LLM API calls (manual only) |
| EMBEDDING | `@tracer.embedding` | Embedding generation |
| RERANKER | `@tracer.reranker` | Document re-ranking |
| GUARDRAIL | `@tracer.guardrail` | Safety checks, content moderation |
| EVALUATOR | `@tracer.evaluator` | LLM evaluation, quality checks |
## Decorator Approach (Recommended)
**Use for:** Full function instrumentation, automatic I/O capture
```python
@tracer.chain
def rag_pipeline(query: str) -> str:
docs = retrieve_documents(query)
ranked = rerank(docs, query)
return generate_response(ranked, query)
@tracer.retriever
def retrieve_documents(query: str) -> list[dict]:
results = vector_db.search(query, top_k=5)
return [{"content": doc.text, "score": doc.score} for doc in results]
@tracer.tool
def get_weather(city: str) -> str:
response = requests.get(f"https://api.weather.com/{city}")
return response.json()["weather"]
```
**Custom span names:**
```python
@tracer.chain(name="rag-pipeline-v2")
def my_workflow(query: str) -> str:
return process(query)
```
## Context Manager Approach
**Use for:** Partial function instrumentation, custom attributes, dynamic control
```python
from opentelemetry.trace import Status, StatusCode
import json
def retrieve_with_metadata(query: str):
with tracer.start_as_current_span(
"vector_search",
openinference_span_kind="retriever"
) as span:
span.set_attribute("input.value", query)
results = vector_db.search(query, top_k=5)
documents = [
{
"document.id": doc.id,
"document.content": doc.text,
"document.score": doc.score
}
for doc in results
]
span.set_attribute("retrieval.documents", json.dumps(documents))
span.set_status(Status(StatusCode.OK))
return documents
```
## Capturing Input/Output
**Always capture I/O for evaluation-ready spans.**
### Automatic I/O Capture (Decorators)
Decorators automatically capture input arguments and return values:
```python
@tracer.chain
def handle_query(user_input: str) -> str:
result = agent.generate(user_input)
return result.text
# Automatically captures:
# - input.value: user_input
# - output.value: result.text
# - input.mime_type / output.mime_type: auto-detected
```
### Manual I/O Capture (Context Manager)
Use `set_input()` and `set_output()` for simple I/O capture:
```python
from opentelemetry.trace import Status, StatusCode
def handle_query(user_input: str) -> str:
with tracer.start_as_current_span(
"query.handler",
openinference_span_kind="chain"
) as span:
span.set_input(user_input)
result = agent.generate(user_input)
span.set_output(result.text)
span.set_status(Status(StatusCode.OK))
return result.text
```
**What gets captured:**
```json
{
"input.value": "What is 2+2?",
"input.mime_type": "text/plain",
"output.value": "2+2 equals 4.",
"output.mime_type": "text/plain"
}
```
**Why this matters:**
- Phoenix evaluators require `input.value` and `output.value`
- Phoenix UI displays I/O prominently for debugging
- Enables exporting data for fine-tuning datasets
### Custom I/O with Additional Metadata
Use `set_attribute()` for custom attributes alongside I/O:
```python
def process_query(query: str):
with tracer.start_as_current_span(
"query.process",
openinference_span_kind="chain"
) as span:
# Standard I/O
span.set_input(query)
# Custom metadata
span.set_attribute("input.length", len(query))
result = llm.generate(query)
# Standard output
span.set_output(result.text)
# Custom metadata
span.set_attribute("output.tokens", result.usage.total_tokens)
span.set_status(Status(StatusCode.OK))
return result
```
## See Also
- **Span attributes:** `span-chain.md`, `span-retriever.md`, `span-tool.md`, `span-llm.md`, `span-agent.md`, `span-embedding.md`, `span-reranker.md`, `span-guardrail.md`, `span-evaluator.md`
- **Auto-instrumentation:** `instrumentation-auto-python.md` for framework integrations
- **API docs:** https://docs.arize.com/phoenix/tracing/manual-instrumentation

View File

@@ -0,0 +1,172 @@
# Manual Instrumentation (TypeScript)
Add custom spans using convenience wrappers or withSpan for fine-grained tracing control.
## Setup
```bash
npm install @arizeai/phoenix-otel @arizeai/openinference-core
```
```typescript
import { register } from "@arizeai/phoenix-otel";
register({ projectName: "my-app" });
```
## Quick Reference
| Span Kind | Method | Use Case |
|-----------|--------|----------|
| CHAIN | `traceChain` | Workflows, pipelines, orchestration |
| AGENT | `traceAgent` | Multi-step reasoning, planning |
| TOOL | `traceTool` | External APIs, function calls |
| RETRIEVER | `withSpan` | Vector search, document retrieval |
| LLM | `withSpan` | LLM API calls (prefer auto-instrumentation) |
| EMBEDDING | `withSpan` | Embedding generation |
| RERANKER | `withSpan` | Document re-ranking |
| GUARDRAIL | `withSpan` | Safety checks, content moderation |
| EVALUATOR | `withSpan` | LLM evaluation |
## Convenience Wrappers
```typescript
import { traceChain, traceAgent, traceTool } from "@arizeai/openinference-core";
// CHAIN - workflows
const pipeline = traceChain(
async (query: string) => {
const docs = await retrieve(query);
return await generate(docs, query);
},
{ name: "rag-pipeline" }
);
// AGENT - reasoning
const agent = traceAgent(
async (question: string) => {
const thought = await llm.generate(`Think: ${question}`);
return await processThought(thought);
},
{ name: "my-agent" }
);
// TOOL - function calls
const getWeather = traceTool(
async (city: string) => fetch(`/api/weather/${city}`).then(r => r.json()),
{ name: "get-weather" }
);
```
## withSpan for Other Kinds
```typescript
import { withSpan, getInputAttributes, getRetrieverAttributes } from "@arizeai/openinference-core";
// RETRIEVER with custom attributes
const retrieve = withSpan(
async (query: string) => {
const results = await vectorDb.search(query, { topK: 5 });
return results.map(doc => ({ content: doc.text, score: doc.score }));
},
{
kind: "RETRIEVER",
name: "vector-search",
processInput: (query) => getInputAttributes(query),
processOutput: (docs) => getRetrieverAttributes({ documents: docs })
}
);
```
**Options:**
```typescript
withSpan(fn, {
kind: "RETRIEVER", // OpenInference span kind
name: "span-name", // Span name (defaults to function name)
processInput: (args) => {}, // Transform input to attributes
processOutput: (result) => {}, // Transform output to attributes
attributes: { key: "value" } // Static attributes
});
```
## Capturing Input/Output
**Always capture I/O for evaluation-ready spans.** Use `getInputAttributes` and `getOutputAttributes` helpers for automatic MIME type detection:
```typescript
import {
getInputAttributes,
getOutputAttributes,
withSpan,
} from "@arizeai/openinference-core";
const handleQuery = withSpan(
async (userInput: string) => {
const result = await agent.generate({ prompt: userInput });
return result;
},
{
name: "query.handler",
kind: "CHAIN",
// Use helpers - automatic MIME type detection
processInput: (input) => getInputAttributes(input),
processOutput: (result) => getOutputAttributes(result.text),
}
);
await handleQuery("What is 2+2?");
```
**What gets captured:**
```json
{
"input.value": "What is 2+2?",
"input.mime_type": "text/plain",
"output.value": "2+2 equals 4.",
"output.mime_type": "text/plain"
}
```
**Helper behavior:**
- Strings → `text/plain`
- Objects/Arrays → `application/json` (automatically serialized)
- `undefined`/`null` → No attributes set
**Why this matters:**
- Phoenix evaluators require `input.value` and `output.value`
- Phoenix UI displays I/O prominently for debugging
- Enables exporting data for fine-tuning datasets
### Custom I/O Processing
Add custom metadata alongside standard I/O attributes:
```typescript
const processWithMetadata = withSpan(
async (query: string) => {
const result = await llm.generate(query);
return result;
},
{
name: "query.process",
kind: "CHAIN",
processInput: (query) => ({
"input.value": query,
"input.mime_type": "text/plain",
"input.length": query.length, // Custom attribute
}),
processOutput: (result) => ({
"output.value": result.text,
"output.mime_type": "text/plain",
"output.tokens": result.usage?.totalTokens, // Custom attribute
}),
}
);
```
## See Also
- **Span attributes:** `span-chain.md`, `span-retriever.md`, `span-tool.md`, etc.
- **Attribute helpers:** https://docs.arize.com/phoenix/tracing/manual-instrumentation-typescript#attribute-helpers
- **Auto-instrumentation:** `instrumentation-auto-typescript.md` for framework integrations

View File

@@ -0,0 +1,87 @@
# Phoenix Tracing: Custom Metadata (Python)
Add custom attributes to spans for richer observability.
## Install
```bash
pip install openinference-instrumentation
```
## Session
```python
from openinference.instrumentation import using_session
with using_session(session_id="my-session-id"):
# Spans get: "session.id" = "my-session-id"
...
```
## User
```python
from openinference.instrumentation import using_user
with using_user("my-user-id"):
# Spans get: "user.id" = "my-user-id"
...
```
## Metadata
```python
from openinference.instrumentation import using_metadata
with using_metadata({"key": "value", "experiment_id": "exp_123"}):
# Spans get: "metadata" = '{"key": "value", "experiment_id": "exp_123"}'
...
```
## Tags
```python
from openinference.instrumentation import using_tags
with using_tags(["tag_1", "tag_2"]):
# Spans get: "tag.tags" = '["tag_1", "tag_2"]'
...
```
## Combined (using_attributes)
```python
from openinference.instrumentation import using_attributes
with using_attributes(
session_id="my-session-id",
user_id="my-user-id",
metadata={"environment": "production"},
tags=["prod", "v2"],
prompt_template="Answer: {question}",
prompt_template_version="v1.0",
prompt_template_variables={"question": "What is Phoenix?"},
):
# All attributes applied to spans in this context
...
```
## On a Single Span
```python
span.set_attribute("metadata", json.dumps({"key": "value"}))
span.set_attribute("user.id", "user_123")
span.set_attribute("session.id", "session_456")
```
## As Decorators
All context managers can be used as decorators:
```python
@using_session(session_id="my-session-id")
@using_user("my-user-id")
@using_metadata({"env": "prod"})
def my_function():
...
```

View File

@@ -0,0 +1,50 @@
# Phoenix Tracing: Custom Metadata (TypeScript)
Add custom attributes to spans for richer observability.
## Using Context (Propagates to All Child Spans)
```typescript
import { context } from "@arizeai/phoenix-otel";
import { setMetadata } from "@arizeai/openinference-core";
context.with(
setMetadata(context.active(), {
experiment_id: "exp_123",
model_version: "gpt-4-1106-preview",
environment: "production",
}),
async () => {
// All spans created within this block will have:
// "metadata" = '{"experiment_id": "exp_123", ...}'
await myApp.run(query);
}
);
```
## On a Single Span
```typescript
import { traceChain } from "@arizeai/openinference-core";
import { trace } from "@arizeai/phoenix-otel";
const myFunction = traceChain(
async (input: string) => {
const span = trace.getActiveSpan();
span?.setAttribute(
"metadata",
JSON.stringify({
experiment_id: "exp_123",
model_version: "gpt-4-1106-preview",
environment: "production",
})
);
      const result = await doWork(input); // your application logic here
      return result;
},
{ name: "my-function" }
);
await myFunction("hello");
```

View File

@@ -0,0 +1,58 @@
# Phoenix Tracing: Production Guide (Python)
**CRITICAL: Configure batching, data masking, and span filtering for production deployment.**
## Metadata
| Attribute | Value |
|-----------|-------|
| Priority | Critical - production readiness |
| Impact | Security, Performance |
| Setup Time | 5-15 min |
## Batch Processing
**Enable batch processing for production efficiency.** Batching reduces network overhead by sending spans in groups rather than individually.
## Data Masking (PII Protection)
**Environment variables:**
```bash
export OPENINFERENCE_HIDE_INPUTS=true # Hide input.value
export OPENINFERENCE_HIDE_OUTPUTS=true # Hide output.value
export OPENINFERENCE_HIDE_INPUT_MESSAGES=true # Hide LLM input messages
export OPENINFERENCE_HIDE_OUTPUT_MESSAGES=true # Hide LLM output messages
export OPENINFERENCE_HIDE_INPUT_IMAGES=true # Hide image content
export OPENINFERENCE_HIDE_INPUT_TEXT=true # Hide embedding text
export OPENINFERENCE_BASE64_IMAGE_MAX_LENGTH=10000 # Limit image size
```
**Python TraceConfig:**
```python
from phoenix.otel import register
from openinference.instrumentation import TraceConfig
config = TraceConfig(
hide_inputs=True,
hide_outputs=True,
hide_input_messages=True
)
register(trace_config=config)
```
**Precedence:** Code > Environment variables > Defaults
---
## Span Filtering
**Suppress specific code blocks:**
```python
from phoenix.otel import suppress_tracing
with suppress_tracing():
internal_logging() # No spans generated
```

View File

@@ -0,0 +1,148 @@
# Phoenix Tracing: Production Guide (TypeScript)
**CRITICAL: Configure batching, data masking, and span filtering for production deployment.**
## Metadata
| Attribute | Value |
|-----------|-------|
| Priority | Critical - production readiness |
| Impact | Security, Performance |
| Setup Time | 5-15 min |
## Batch Processing
**Enable batch processing for production efficiency.** Batching reduces network overhead by sending spans in groups rather than individually.
```typescript
import { register } from "@arizeai/phoenix-otel";
const provider = register({
projectName: "my-app",
batch: true, // Production default
});
```
### Shutdown Handling
**CRITICAL:** Spans may not be exported if still queued in the processor when your process exits. Call `provider.shutdown()` to explicitly flush before exit.
```typescript
// Explicit shutdown to flush queued spans
const provider = register({
projectName: "my-app",
batch: true,
});
async function main() {
await doWork();
await provider.shutdown(); // Flush spans before exit
}
main().catch(async (error) => {
console.error(error);
await provider.shutdown(); // Flush on error too
process.exit(1);
});
```
**Graceful termination signals:**
```typescript
// Graceful shutdown on SIGTERM
const provider = register({
projectName: "my-server",
batch: true,
});
process.on("SIGTERM", async () => {
await provider.shutdown();
process.exit(0);
});
```
---
## Data Masking (PII Protection)
**Environment variables:**
```bash
export OPENINFERENCE_HIDE_INPUTS=true # Hide input.value
export OPENINFERENCE_HIDE_OUTPUTS=true # Hide output.value
export OPENINFERENCE_HIDE_INPUT_MESSAGES=true # Hide LLM input messages
export OPENINFERENCE_HIDE_OUTPUT_MESSAGES=true # Hide LLM output messages
export OPENINFERENCE_HIDE_INPUT_IMAGES=true # Hide image content
export OPENINFERENCE_HIDE_INPUT_TEXT=true # Hide embedding text
export OPENINFERENCE_BASE64_IMAGE_MAX_LENGTH=10000 # Limit image size
```
**TypeScript TraceConfig:**
```typescript
import { register } from "@arizeai/phoenix-otel";
import { OpenAIInstrumentation } from "@arizeai/openinference-instrumentation-openai";
const traceConfig = {
hideInputs: true,
hideOutputs: true,
hideInputMessages: true
};
const instrumentation = new OpenAIInstrumentation({ traceConfig });
```
**Precedence:** Code > Environment variables > Defaults
---
## Span Filtering
**Suppress specific code blocks:**
```typescript
import { suppressTracing } from "@opentelemetry/core";
import { context } from "@opentelemetry/api";
await context.with(suppressTracing(context.active()), async () => {
internalLogging(); // No spans generated
});
```
**Sampling:**
```bash
export OTEL_TRACES_SAMPLER="parentbased_traceidratio"
export OTEL_TRACES_SAMPLER_ARG="0.1" # Sample 10%
```
---
## Error Handling
```typescript
import { SpanStatusCode } from "@opentelemetry/api";
try {
result = await riskyOperation();
span?.setStatus({ code: SpanStatusCode.OK });
} catch (e) {
span?.recordException(e);
span?.setStatus({ code: SpanStatusCode.ERROR });
throw e;
}
```
---
## Production Checklist
- [ ] Batch processing enabled
- [ ] **Shutdown handling:** Call `provider.shutdown()` before exit to flush queued spans
- [ ] **Graceful termination:** Flush spans on SIGTERM/SIGINT signals
- [ ] Data masking configured (`HIDE_INPUTS`/`HIDE_OUTPUTS` if PII)
- [ ] Span filtering for health checks/noisy paths
- [ ] Error handling implemented
- [ ] Graceful degradation if Phoenix unavailable
- [ ] Performance tested
- [ ] Monitoring configured (Phoenix UI checked)

View File

@@ -0,0 +1,73 @@
# Phoenix Tracing: Projects (Python)
**Organize traces by application using projects (Phoenix's top-level grouping).**
## Overview
Projects group traces for a single application or experiment.
**Use for:** Environments (dev/staging/prod), A/B testing, versioning
## Setup
### Environment Variable (Recommended)
```bash
export PHOENIX_PROJECT_NAME="my-app-prod"
```
```python
import os
os.environ["PHOENIX_PROJECT_NAME"] = "my-app-prod"
from phoenix.otel import register
register() # Uses "my-app-prod"
```
### Code
```python
from phoenix.otel import register
register(project_name="my-app-prod")
```
## Use Cases
**Environments:**
```python
# Dev, staging, prod
register(project_name="my-app-dev")
register(project_name="my-app-staging")
register(project_name="my-app-prod")
```
**A/B Testing:**
```python
# Compare models
register(project_name="chatbot-gpt4")
register(project_name="chatbot-claude")
```
**Versioning:**
```python
# Track versions
register(project_name="my-app-v1")
register(project_name="my-app-v2")
```
## Switching Projects (Python Notebooks Only)
```python
from openinference.instrumentation import dangerously_using_project
from phoenix.otel import register
register(project_name="my-app")
# Switch temporarily for evals
with dangerously_using_project("my-eval-project"):
run_evaluations()
```
**⚠️ Only use in notebooks/scripts, not production.**

View File

@@ -0,0 +1,54 @@
# Phoenix Tracing: Projects (TypeScript)
**Organize traces by application using projects (Phoenix's top-level grouping).**
## Overview
Projects group traces for a single application or experiment.
**Use for:** Environments (dev/staging/prod), A/B testing, versioning
## Setup
### Environment Variable (Recommended)
```bash
export PHOENIX_PROJECT_NAME="my-app-prod"
```
```typescript
process.env.PHOENIX_PROJECT_NAME = "my-app-prod";
import { register } from "@arizeai/phoenix-otel";
register(); // Uses "my-app-prod"
```
### Code
```typescript
import { register } from "@arizeai/phoenix-otel";
register({ projectName: "my-app-prod" });
```
## Use Cases
**Environments:**
```typescript
// Dev, staging, prod
register({ projectName: "my-app-dev" });
register({ projectName: "my-app-staging" });
register({ projectName: "my-app-prod" });
```
**A/B Testing:**
```typescript
// Compare models
register({ projectName: "chatbot-gpt4" });
register({ projectName: "chatbot-claude" });
```
**Versioning:**
```typescript
// Track versions
register({ projectName: "my-app-v1" });
register({ projectName: "my-app-v2" });
```

View File

@@ -0,0 +1,104 @@
# Sessions (Python)
Track multi-turn conversations by grouping traces with session IDs.
## Setup
```python
from openinference.instrumentation import using_session
with using_session(session_id="user_123_conv_456"):
response = llm.invoke(prompt)
```
## Best Practices
**Bad: Only parent span gets session ID**
```python
from openinference.semconv.trace import SpanAttributes
from opentelemetry import trace
span = trace.get_current_span()
span.set_attribute(SpanAttributes.SESSION_ID, session_id)
response = client.chat.completions.create(...)
```
**Good: All child spans inherit session ID**
```python
with using_session(session_id):
response = client.chat.completions.create(...)
result = my_custom_function()
```
**Why:** `using_session()` propagates session ID to all nested spans automatically.
## Session ID Patterns
```python
import uuid
session_id = str(uuid.uuid4())
session_id = f"user_{user_id}_conv_{conversation_id}"
session_id = f"debug_{timestamp}"
```
Good: `str(uuid.uuid4())`, `"user_123_conv_456"`
Bad: `"session_1"`, `"test"`, empty string
## Multi-Turn Chatbot Example
```python
import uuid
from openinference.instrumentation import using_session
session_id = str(uuid.uuid4())
messages = []
def send_message(user_input: str) -> str:
messages.append({"role": "user", "content": user_input})
with using_session(session_id):
response = client.chat.completions.create(
model="gpt-4",
messages=messages
)
assistant_message = response.choices[0].message.content
messages.append({"role": "assistant", "content": assistant_message})
return assistant_message
```
## Additional Attributes
```python
from openinference.instrumentation import using_attributes
with using_attributes(
user_id="user_123",
session_id="conv_456",
metadata={"tier": "premium", "region": "us-west"}
):
response = llm.invoke(prompt)
```
## LangChain Integration
LangChain threads are automatically recognized as sessions:
```python
from langchain.chat_models import ChatOpenAI
response = llm.invoke(
[HumanMessage(content="Hi!")],
config={"metadata": {"thread_id": "user_123_thread"}}
)
```
Phoenix recognizes: `thread_id`, `session_id`, `conversation_id`
## See Also
- **TypeScript sessions:** `sessions-typescript.md`
- **Session docs:** https://docs.arize.com/phoenix/tracing/sessions

View File

@@ -0,0 +1,199 @@
# Sessions (TypeScript)
Track multi-turn conversations by grouping traces with session IDs. **Use `withSpan` directly from `@arizeai/openinference-core`** - no wrappers or custom utilities needed.
## Core Concept
**Session Pattern:**
1. Generate a unique `session.id` once at application startup
2. Export SESSION_ID, import `withSpan` where needed
3. Use `withSpan` to create a parent CHAIN span with `session.id` for each interaction
4. All child spans (LLM, TOOL, AGENT, etc.) automatically group under the parent
5. Query traces by `session.id` in Phoenix to see all interactions
## Implementation (Best Practice)
### 1. Setup (instrumentation.ts)
```typescript
import { register } from "@arizeai/phoenix-otel";
import { randomUUID } from "node:crypto";
// Initialize Phoenix
register({
projectName: "your-app",
url: process.env.PHOENIX_COLLECTOR_ENDPOINT || "http://localhost:6006",
apiKey: process.env.PHOENIX_API_KEY,
batch: true,
});
// Generate and export session ID
export const SESSION_ID = randomUUID();
```
### 2. Usage (app code)
```typescript
import { withSpan } from "@arizeai/openinference-core";
import { SESSION_ID } from "./instrumentation";
// Use withSpan directly - no wrapper needed
const handleInteraction = withSpan(
async () => {
const result = await agent.generate({ prompt: userInput });
return result;
},
{
name: "cli.interaction",
kind: "CHAIN",
attributes: { "session.id": SESSION_ID },
}
);
// Call it
const result = await handleInteraction();
```
### With Input Parameters
```typescript
const processQuery = withSpan(
async (query: string) => {
return await agent.generate({ prompt: query });
},
{
name: "process.query",
kind: "CHAIN",
attributes: { "session.id": SESSION_ID },
}
);
await processQuery("What is 2+2?");
```
## Key Points
### Session ID Scope
- **CLI/Desktop Apps**: Generate once at process startup
- **Web Servers**: Generate per-user session (e.g., on login, store in session storage)
- **Stateless APIs**: Accept session.id as a parameter from client
### Span Hierarchy
```
cli.interaction (CHAIN) ← session.id here
├── ai.generateText (AGENT)
│ ├── ai.generateText.doGenerate (LLM)
│ └── ai.toolCall (TOOL)
└── ai.generateText.doGenerate (LLM)
```
The `session.id` is only set on the **root span**. Child spans are automatically grouped by the trace hierarchy.
### Querying Sessions
```bash
# Get all traces for a session
npx @arizeai/phoenix-cli trace list \
--endpoint http://localhost:6006 \
--project your-app \
--format raw \
--no-progress | \
jq '.[] | select(.spans[0].attributes["session.id"] == "YOUR-SESSION-ID")'
```
## Dependencies
```json
{
"dependencies": {
"@arizeai/openinference-core": "^2.0.5",
"@arizeai/phoenix-otel": "^0.4.1"
}
}
```
**Note:** `@opentelemetry/api` is NOT needed for the `withSpan` pattern — it's only required for manual span management or the Context API alternative shown later in this document.
## Why This Pattern?
1. **Simple**: Just export SESSION_ID, use withSpan directly - no wrappers
2. **Built-in**: `withSpan` from `@arizeai/openinference-core` handles everything
3. **Type-safe**: Preserves function signatures and type information
4. **Automatic lifecycle**: Handles span creation, error tracking, and cleanup
5. **Framework-agnostic**: Works with any LLM framework (AI SDK, LangChain, etc.)
6. **No extra deps**: Don't need `@opentelemetry/api` or custom utilities
## Adding More Attributes
```typescript
import { withSpan } from "@arizeai/openinference-core";
import { SESSION_ID } from "./instrumentation";
const handleWithContext = withSpan(
async (userInput: string) => {
return await agent.generate({ prompt: userInput });
},
{
name: "cli.interaction",
kind: "CHAIN",
attributes: {
"session.id": SESSION_ID,
"user.id": userId, // Track user
"metadata.environment": "prod", // Custom metadata
},
}
);
```
## Anti-Pattern: Don't Create Wrappers
**Don't do this:**
```typescript
// Unnecessary wrapper
export function withSessionTracking(fn) {
return withSpan(fn, { attributes: { "session.id": SESSION_ID } });
}
```
**Do this instead:**
```typescript
// Use withSpan directly
import { withSpan } from "@arizeai/openinference-core";
import { SESSION_ID } from "./instrumentation";
const handler = withSpan(fn, {
attributes: { "session.id": SESSION_ID }
});
```
## Alternative: Context API Pattern
For web servers or complex async flows where you need to propagate session IDs through middleware, you can use the Context API:
```typescript
import { context } from "@opentelemetry/api";
import { setSession } from "@arizeai/openinference-core";
await context.with(
setSession(context.active(), { sessionId: "user_123_conv_456" }),
async () => {
const response = await llm.invoke(prompt);
}
);
```
**Use Context API when:**
- Building web servers with middleware chains
- Session ID needs to flow through many async boundaries
- You don't control the call stack (e.g., framework-provided handlers)
**Use withSpan when:**
- Building CLI apps or scripts
- You control the function call points
- Simpler, more explicit code is preferred
## Related
- `fundamentals-universal-attributes.md` - Other universal attributes (user.id, metadata)
- `span-chain.md` - CHAIN span specification
- `sessions-python.md` - Python session tracking patterns

View File

@@ -0,0 +1,131 @@
# Phoenix Tracing: Python Setup
**Setup Phoenix tracing in Python with `arize-phoenix-otel`.**
## Metadata
| Attribute | Value |
| ---------- | ----------------------------------- |
| Priority | Critical - required for all tracing |
| Setup Time | <5 min |
## Quick Start (3 lines)
```python
from phoenix.otel import register
register(project_name="my-app", auto_instrument=True)
```
**Connects to `http://localhost:6006`, auto-instruments all supported libraries.**
## Installation
```bash
pip install arize-phoenix-otel
```
**Supported:** Python 3.10-3.13
## Configuration
### Environment Variables (Recommended)
```bash
export PHOENIX_API_KEY="your-api-key" # Required for Phoenix Cloud
export PHOENIX_COLLECTOR_ENDPOINT="http://localhost:6006" # Or Cloud URL
export PHOENIX_PROJECT_NAME="my-app" # Optional
```
### Python Code
```python
from phoenix.otel import register
tracer_provider = register(
project_name="my-app", # Project name
endpoint="http://localhost:6006", # Phoenix endpoint
auto_instrument=True, # Auto-instrument supported libs
batch=True, # Batch processing (default: True)
)
```
**Parameters:**
- `project_name`: Project name (overrides `PHOENIX_PROJECT_NAME`)
- `endpoint`: Phoenix URL (overrides `PHOENIX_COLLECTOR_ENDPOINT`)
- `auto_instrument`: Enable auto-instrumentation (default: False)
- `batch`: Use BatchSpanProcessor (default: True, production-recommended)
- `protocol`: `"http/protobuf"` (default) or `"grpc"`
## Auto-Instrumentation
Install instrumentors for your frameworks:
```bash
pip install openinference-instrumentation-openai # OpenAI SDK
pip install openinference-instrumentation-langchain # LangChain
pip install openinference-instrumentation-llama-index # LlamaIndex
# ... install others as needed
```
Then enable auto-instrumentation:
```python
register(project_name="my-app", auto_instrument=True)
```
Phoenix discovers and instruments all installed OpenInference packages automatically.
## Batch Processing (Production)
Enabled by default. Configure via environment variables:
```bash
export OTEL_BSP_SCHEDULE_DELAY=5000 # Batch every 5s
export OTEL_BSP_MAX_QUEUE_SIZE=2048 # Queue 2048 spans
export OTEL_BSP_MAX_EXPORT_BATCH_SIZE=512 # Send 512 spans/batch
```
**Link:** https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/
## Verification
1. Open Phoenix UI: `http://localhost:6006`
2. Navigate to your project
3. Run your application
4. Check for traces (appear within batch delay)
## Troubleshooting
**No traces:**
- Verify `PHOENIX_COLLECTOR_ENDPOINT` matches Phoenix server
- Set `PHOENIX_API_KEY` for Phoenix Cloud
- Confirm instrumentors installed
**Missing attributes:**
- Check span kind (see rules/ directory)
- Verify attribute names (see rules/ directory)
## Example
```python
from phoenix.otel import register
from openai import OpenAI
# Enable tracing with auto-instrumentation
register(project_name="my-chatbot", auto_instrument=True)
# OpenAI automatically instrumented
client = OpenAI()
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Hello!"}]
)
```
## API Reference
- [Python OTEL API Docs](https://arize-phoenix.readthedocs.io/projects/otel/en/latest/)
- [Python Client API Docs](https://arize-phoenix.readthedocs.io/projects/client/en/latest/)

View File

@@ -0,0 +1,170 @@
# TypeScript Setup
Setup Phoenix tracing in TypeScript/JavaScript with `@arizeai/phoenix-otel`.
## Metadata
| Attribute | Value |
|-----------|-------|
| Priority | Critical - required for all tracing |
| Setup Time | <5 min |
## Quick Start
```bash
npm install @arizeai/phoenix-otel
```
```typescript
import { register } from "@arizeai/phoenix-otel";
register({ projectName: "my-app" });
```
Connects to `http://localhost:6006` by default.
## Configuration
```typescript
import { register } from "@arizeai/phoenix-otel";
register({
projectName: "my-app",
url: "http://localhost:6006",
apiKey: process.env.PHOENIX_API_KEY,
batch: true
});
```
**Environment variables:**
```bash
export PHOENIX_API_KEY="your-api-key"
export PHOENIX_COLLECTOR_ENDPOINT="http://localhost:6006"
export PHOENIX_PROJECT_NAME="my-app"
```
## ESM vs CommonJS
**CommonJS (automatic):**
```javascript
const { register } = require("@arizeai/phoenix-otel");
register({ projectName: "my-app" });
const OpenAI = require("openai");
```
**ESM (manual instrumentation required):**
```typescript
import { register, registerInstrumentations } from "@arizeai/phoenix-otel";
import { OpenAIInstrumentation } from "@arizeai/openinference-instrumentation-openai";
import OpenAI from "openai";
register({ projectName: "my-app" });
const instrumentation = new OpenAIInstrumentation();
instrumentation.manuallyInstrument(OpenAI);
registerInstrumentations({ instrumentations: [instrumentation] });
```
**Why:** ESM imports are hoisted, so `manuallyInstrument()` is needed.
## Framework Integration
**Next.js (App Router):**
```typescript
// instrumentation.ts
export async function register() {
if (process.env.NEXT_RUNTIME === "nodejs") {
const { register } = await import("@arizeai/phoenix-otel");
register({ projectName: "my-nextjs-app" });
}
}
```
**Express.js:**
```typescript
import { register } from "@arizeai/phoenix-otel";
register({ projectName: "my-express-app" });
const app = express();
```
## Flushing Spans Before Exit
**CRITICAL:** Spans may not be exported if still queued in the processor when your process exits. Call `provider.shutdown()` to explicitly flush before exit.
**Standard pattern:**
```typescript
const provider = register({
projectName: "my-app",
batch: true,
});
async function main() {
await doWork();
await provider.shutdown(); // Flush spans before exit
}
main().catch(async (error) => {
console.error(error);
await provider.shutdown(); // Flush on error too
process.exit(1);
});
```
**Alternative:**
```typescript
// Use batch: false for immediate export (no shutdown needed)
register({
projectName: "my-app",
batch: false,
});
```
For production patterns including graceful termination, see `production-typescript.md`.
## Verification
1. Open Phoenix UI: `http://localhost:6006`
2. Run your application
3. Check for traces in your project
**Enable diagnostic logging:**
```typescript
import { DiagLogLevel, register } from "@arizeai/phoenix-otel";
register({
projectName: "my-app",
diagLogLevel: DiagLogLevel.DEBUG,
});
```
## Troubleshooting
**No traces:**
- Verify `PHOENIX_COLLECTOR_ENDPOINT` is correct
- Set `PHOENIX_API_KEY` for Phoenix Cloud
- For ESM: Ensure `manuallyInstrument()` is called
- **With `batch: true`:** Call `provider.shutdown()` before exit to flush queued spans (see Flushing Spans section)
**Traces missing:**
- With `batch: true`: Call `await provider.shutdown()` before process exit to flush queued spans
- Alternative: Set `batch: false` for immediate export (no shutdown needed)
**Missing attributes:**
- Check instrumentation is registered (ESM requires manual setup)
- See `instrumentation-auto-typescript.md`
## See Also
- **Auto-instrumentation:** `instrumentation-auto-typescript.md`
- **Manual instrumentation:** `instrumentation-manual-typescript.md`
- **API docs:** https://arize-ai.github.io/phoenix/

View File

@@ -0,0 +1,15 @@
# AGENT Spans
AGENT spans represent autonomous reasoning blocks (ReAct agents, planning loops, multi-step decision making).
**Required:** `openinference.span.kind` = "AGENT"
## Example
```json
{
"openinference.span.kind": "AGENT",
"input.value": "Book a flight to New York for next Monday",
"output.value": "I've booked flight AA123 departing Monday at 9:00 AM"
}
```

View File

@@ -0,0 +1,43 @@
# CHAIN Spans
## Purpose
CHAIN spans represent orchestration layers in your application (LangChain chains, custom workflows, application entry points). Often used as root spans.
## Required Attributes
| Attribute | Type | Description | Required |
| ------------------------- | ------ | --------------- | -------- |
| `openinference.span.kind` | String | Must be "CHAIN" | Yes |
## Common Attributes
CHAIN spans typically use [Universal Attributes](fundamentals-universal-attributes.md):
- `input.value` - Input to the chain (user query, request payload)
- `output.value` - Output from the chain (final response)
- `input.mime_type` / `output.mime_type` - Format indicators
## Example: Root Chain
```json
{
"openinference.span.kind": "CHAIN",
"input.value": "{\"question\": \"What is the capital of France?\"}",
"input.mime_type": "application/json",
"output.value": "{\"answer\": \"The capital of France is Paris.\", \"sources\": [\"doc_123\"]}",
"output.mime_type": "application/json",
"session.id": "session_abc123",
"user.id": "user_xyz789"
}
```
## Example: Nested Sub-Chain
```json
{
"openinference.span.kind": "CHAIN",
"input.value": "Summarize this document: ...",
"output.value": "This document discusses..."
}
```

View File

@@ -0,0 +1,91 @@
# EMBEDDING Spans
## Purpose
EMBEDDING spans represent vector generation operations (text-to-vector conversion for semantic search).
## Required Attributes
| Attribute | Type | Description | Required |
|-----------|------|-------------|----------|
| `openinference.span.kind` | String | Must be "EMBEDDING" | Yes |
| `embedding.model_name` | String | Embedding model identifier | Recommended |
## Attribute Reference
### Single Embedding
| Attribute | Type | Description |
|-----------|------|-------------|
| `embedding.model_name` | String | Embedding model identifier |
| `embedding.text` | String | Input text to embed |
| `embedding.vector` | String (JSON array) | Generated embedding vector |
**Example:**
```json
{
"embedding.model_name": "text-embedding-ada-002",
"embedding.text": "What is machine learning?",
"embedding.vector": "[0.023, -0.012, 0.045, ..., 0.001]"
}
```
### Batch Embeddings
| Attribute Pattern | Type | Description |
|-------------------|------|-------------|
| `embedding.embeddings.{i}.embedding.text` | String | Text at index i |
| `embedding.embeddings.{i}.embedding.vector` | String (JSON array) | Vector at index i |
**Example:**
```json
{
"embedding.model_name": "text-embedding-ada-002",
"embedding.embeddings.0.embedding.text": "First document",
"embedding.embeddings.0.embedding.vector": "[0.1, 0.2, 0.3, ..., 0.5]",
"embedding.embeddings.1.embedding.text": "Second document",
"embedding.embeddings.1.embedding.vector": "[0.6, 0.7, 0.8, ..., 0.9]"
}
```
### Vector Format
Vectors stored as JSON array strings:
- Dimensions: Typically 384, 768, 1536, or 3072
- Format: `"[0.123, -0.456, 0.789, ...]"`
- Precision: Usually 3-6 decimal places
**Storage Considerations:**
- Large vectors can significantly increase trace size
- Consider omitting vectors in production (keep `embedding.text` for debugging)
- Use separate vector database for actual similarity search
## Examples
### Single Embedding
```json
{
"openinference.span.kind": "EMBEDDING",
"embedding.model_name": "text-embedding-ada-002",
"embedding.text": "What is machine learning?",
"embedding.vector": "[0.023, -0.012, 0.045, ..., 0.001]",
"input.value": "What is machine learning?",
"output.value": "[0.023, -0.012, 0.045, ..., 0.001]"
}
```
### Batch Embeddings
```json
{
"openinference.span.kind": "EMBEDDING",
"embedding.model_name": "text-embedding-ada-002",
"embedding.embeddings.0.embedding.text": "First document",
"embedding.embeddings.0.embedding.vector": "[0.1, 0.2, 0.3]",
"embedding.embeddings.1.embedding.text": "Second document",
"embedding.embeddings.1.embedding.vector": "[0.4, 0.5, 0.6]",
"embedding.embeddings.2.embedding.text": "Third document",
"embedding.embeddings.2.embedding.vector": "[0.7, 0.8, 0.9]"
}
```

View File

@@ -0,0 +1,51 @@
# EVALUATOR Spans
## Purpose
EVALUATOR spans represent quality assessment operations (answer relevance, faithfulness, hallucination detection).
## Required Attributes
| Attribute | Type | Description | Required |
|-----------|------|-------------|----------|
| `openinference.span.kind` | String | Must be "EVALUATOR" | Yes |
## Common Attributes
| Attribute | Type | Description |
|-----------|------|-------------|
| `input.value` | String | Content being evaluated |
| `output.value` | String | Evaluation result (score, label, explanation) |
| `metadata.evaluator_name` | String | Evaluator identifier |
| `metadata.score` | Float | Numeric score (0-1) |
| `metadata.label` | String | Categorical label (relevant/irrelevant) |
## Example: Answer Relevance
```json
{
"openinference.span.kind": "EVALUATOR",
"input.value": "{\"question\": \"What is the capital of France?\", \"answer\": \"The capital of France is Paris.\"}",
"input.mime_type": "application/json",
"output.value": "0.95",
"metadata.evaluator_name": "answer_relevance",
"metadata.score": 0.95,
"metadata.label": "relevant",
"metadata.explanation": "Answer directly addresses the question with correct information"
}
```
## Example: Faithfulness Check
```json
{
"openinference.span.kind": "EVALUATOR",
"input.value": "{\"context\": \"Paris is in France.\", \"answer\": \"Paris is the capital of France.\"}",
"input.mime_type": "application/json",
"output.value": "0.5",
"metadata.evaluator_name": "faithfulness",
"metadata.score": 0.5,
"metadata.label": "partially_faithful",
"metadata.explanation": "Answer makes unsupported claim about Paris being the capital"
}
```

View File

@@ -0,0 +1,49 @@
# GUARDRAIL Spans
## Purpose
GUARDRAIL spans represent safety and policy checks (content moderation, PII detection, toxicity scoring).
## Required Attributes
| Attribute | Type | Description | Required |
|-----------|------|-------------|----------|
| `openinference.span.kind` | String | Must be "GUARDRAIL" | Yes |
## Common Attributes
| Attribute | Type | Description |
|-----------|------|-------------|
| `input.value` | String | Content being checked |
| `output.value` | String | Guardrail result (allowed/blocked/flagged) |
| `metadata.guardrail_type` | String | Type of check (toxicity, pii, bias) |
| `metadata.score` | Float | Safety score (0-1) |
| `metadata.threshold` | Float | Threshold for blocking |
## Example: Content Moderation
```json
{
"openinference.span.kind": "GUARDRAIL",
"input.value": "User message: I want to build a bomb",
"output.value": "BLOCKED",
"metadata.guardrail_type": "content_moderation",
"metadata.score": 0.95,
"metadata.threshold": 0.7,
"metadata.categories": "[\"violence\", \"weapons\"]",
"metadata.action": "block_and_log"
}
```
## Example: PII Detection
```json
{
"openinference.span.kind": "GUARDRAIL",
"input.value": "My SSN is 123-45-6789",
"output.value": "FLAGGED",
"metadata.guardrail_type": "pii_detection",
"metadata.detected_pii": "[\"ssn\"]",
"metadata.redacted_output": "My SSN is [REDACTED]"
}
```

---
# LLM Spans
## Purpose
LLM spans represent calls to language models (OpenAI, Anthropic, local models, etc.).
## Required Attributes
| Attribute | Type | Description |
|-----------|------|-------------|
| `openinference.span.kind` | String | Must be "LLM" |
| `llm.model_name` | String | Model identifier (e.g., "gpt-4", "claude-3-5-sonnet-20241022") |
## Key Attributes
| Category | Attributes | Example |
|----------|------------|---------|
| **Model** | `llm.model_name`, `llm.provider` | "gpt-4-turbo", "openai" |
| **Tokens** | `llm.token_count.prompt`, `llm.token_count.completion`, `llm.token_count.total` | 25, 8, 33 |
| **Cost** | `llm.cost.prompt`, `llm.cost.completion`, `llm.cost.total` | 0.0021, 0.0045, 0.0066 |
| **Parameters** | `llm.invocation_parameters` (JSON) | `{"temperature": 0.7, "max_tokens": 1024}` |
| **Messages** | `llm.input_messages.{i}.*`, `llm.output_messages.{i}.*` | See examples below |
| **Tools** | `llm.tools.{i}.tool.json_schema` | Function definitions |
## Cost Tracking
**Core attributes:**
- `llm.cost.prompt` - Total input cost (USD)
- `llm.cost.completion` - Total output cost (USD)
- `llm.cost.total` - Total cost (USD)
**Detailed cost breakdown:**
- `llm.cost.prompt_details.{input,cache_read,cache_write,audio}` - Input cost components
- `llm.cost.completion_details.{output,reasoning,audio}` - Output cost components
## Messages
**Input messages:**
- `llm.input_messages.{i}.message.role` - "user", "assistant", "system", "tool"
- `llm.input_messages.{i}.message.content` - Text content
- `llm.input_messages.{i}.message.contents.{j}` - Multimodal (text + images)
- `llm.input_messages.{i}.message.tool_calls` - Tool invocations
**Output messages:** Same structure as input messages.
## Example: Basic LLM Call
```json
{
"openinference.span.kind": "LLM",
"llm.model_name": "claude-3-5-sonnet-20241022",
"llm.invocation_parameters": "{\"temperature\": 0.7, \"max_tokens\": 1024}",
"llm.input_messages.0.message.role": "system",
"llm.input_messages.0.message.content": "You are a helpful assistant.",
"llm.input_messages.1.message.role": "user",
"llm.input_messages.1.message.content": "What is the capital of France?",
"llm.output_messages.0.message.role": "assistant",
"llm.output_messages.0.message.content": "The capital of France is Paris.",
"llm.token_count.prompt": 25,
"llm.token_count.completion": 8,
"llm.token_count.total": 33
}
```
## Example: LLM with Tool Calls
```json
{
"openinference.span.kind": "LLM",
"llm.model_name": "gpt-4-turbo",
"llm.input_messages.0.message.content": "What's the weather in SF?",
"llm.output_messages.0.message.tool_calls.0.tool_call.function.name": "get_weather",
"llm.output_messages.0.message.tool_calls.0.tool_call.function.arguments": "{\"location\": \"San Francisco\"}",
"llm.tools.0.tool.json_schema": "{\"type\": \"function\", \"function\": {\"name\": \"get_weather\"}}"
}
```
## See Also
- **Instrumentation:** `instrumentation-auto-python.md`, `instrumentation-manual-python.md`
- **Full spec:** https://github.com/Arize-ai/openinference/blob/main/spec/semantic_conventions.md

---
# RERANKER Spans
## Purpose
RERANKER spans represent reordering of retrieved documents (Cohere Rerank, cross-encoder models).
## Required Attributes
| Attribute | Type | Description | Required |
|-----------|------|-------------|----------|
| `openinference.span.kind` | String | Must be "RERANKER" | Yes |
## Attribute Reference
### Reranker Parameters
| Attribute | Type | Description |
|-----------|------|-------------|
| `reranker.model_name` | String | Reranker model identifier |
| `reranker.query` | String | Query used for reranking |
| `reranker.top_k` | Integer | Number of documents to return |
### Input Documents
| Attribute Pattern | Type | Description |
|-------------------|------|-------------|
| `reranker.input_documents.{i}.document.id` | String | Input document ID |
| `reranker.input_documents.{i}.document.content` | String | Input document content |
| `reranker.input_documents.{i}.document.score` | Float | Original retrieval score |
| `reranker.input_documents.{i}.document.metadata` | String (JSON) | Document metadata |
### Output Documents
| Attribute Pattern | Type | Description |
|-------------------|------|-------------|
| `reranker.output_documents.{i}.document.id` | String | Output document ID (reordered) |
| `reranker.output_documents.{i}.document.content` | String | Output document content |
| `reranker.output_documents.{i}.document.score` | Float | New reranker score |
| `reranker.output_documents.{i}.document.metadata` | String (JSON) | Document metadata |
### Score Comparison
Input scores (from retriever) vs. output scores (from reranker):
```json
{
"reranker.input_documents.0.document.id": "doc_A",
"reranker.input_documents.0.document.score": 0.7,
"reranker.input_documents.1.document.id": "doc_B",
"reranker.input_documents.1.document.score": 0.9,
"reranker.output_documents.0.document.id": "doc_B",
"reranker.output_documents.0.document.score": 0.95,
"reranker.output_documents.1.document.id": "doc_A",
"reranker.output_documents.1.document.score": 0.85
}
```
In this example:
- Input: doc_B (0.9) ranked higher than doc_A (0.7)
- Output: doc_B still highest but both scores increased
- Reranker confirmed retriever's ordering but refined scores
## Examples
### Complete Reranking Example
```json
{
"openinference.span.kind": "RERANKER",
"reranker.model_name": "cohere-rerank-v2",
"reranker.query": "What is machine learning?",
"reranker.top_k": 2,
"reranker.input_documents.0.document.id": "doc_123",
"reranker.input_documents.0.document.content": "Machine learning is a subset...",
"reranker.input_documents.1.document.id": "doc_456",
"reranker.input_documents.1.document.content": "Supervised learning algorithms...",
"reranker.input_documents.2.document.id": "doc_789",
"reranker.input_documents.2.document.content": "Neural networks are...",
"reranker.output_documents.0.document.id": "doc_456",
"reranker.output_documents.0.document.content": "Supervised learning algorithms...",
"reranker.output_documents.0.document.score": 0.95,
"reranker.output_documents.1.document.id": "doc_123",
"reranker.output_documents.1.document.content": "Machine learning is a subset...",
"reranker.output_documents.1.document.score": 0.88
}
```

---
# RETRIEVER Spans
## Purpose
RETRIEVER spans represent document/context retrieval operations (vector DB queries, semantic search, keyword search).
## Required Attributes
| Attribute | Type | Description | Required |
|-----------|------|-------------|----------|
| `openinference.span.kind` | String | Must be "RETRIEVER" | Yes |
## Attribute Reference
### Query
| Attribute | Type | Description |
|-----------|------|-------------|
| `input.value` | String | Search query text |
### Document Schema
| Attribute Pattern | Type | Description |
|-------------------|------|-------------|
| `retrieval.documents.{i}.document.id` | String | Unique document identifier |
| `retrieval.documents.{i}.document.content` | String | Document text content |
| `retrieval.documents.{i}.document.score` | Float | Relevance score (0-1 or distance) |
| `retrieval.documents.{i}.document.metadata` | String (JSON) | Document metadata |
### Flattening Pattern for Documents
Documents are flattened using zero-indexed notation:
```
retrieval.documents.0.document.id
retrieval.documents.0.document.content
retrieval.documents.0.document.score
retrieval.documents.1.document.id
retrieval.documents.1.document.content
retrieval.documents.1.document.score
...
```
### Document Metadata
Common metadata fields (stored as JSON string):
```json
{
"source": "knowledge_base.pdf",
"page": 42,
"section": "Introduction",
"author": "Jane Doe",
"created_at": "2024-01-15",
"url": "https://example.com/doc",
"chunk_id": "chunk_123"
}
```
**Example with metadata:**
```json
{
"retrieval.documents.0.document.id": "doc_123",
"retrieval.documents.0.document.content": "Machine learning is a method of data analysis...",
"retrieval.documents.0.document.score": 0.92,
"retrieval.documents.0.document.metadata": "{\"source\": \"ml_textbook.pdf\", \"page\": 15, \"chapter\": \"Introduction\"}"
}
```
### Ordering
Documents are ordered by index (0, 1, 2, ...). Typically:
- Index 0 = highest scoring document
- Index 1 = second highest
- etc.
Preserve retrieval order in your flattened attributes.
### Large Document Handling
For very long documents:
- Consider truncating `document.content` to first N characters
- Store full content in separate document store
- Use `document.id` to reference full content
## Examples
### Basic Vector Search
```json
{
"openinference.span.kind": "RETRIEVER",
"input.value": "What is machine learning?",
"retrieval.documents.0.document.id": "doc_123",
"retrieval.documents.0.document.content": "Machine learning is a subset of artificial intelligence...",
"retrieval.documents.0.document.score": 0.92,
"retrieval.documents.0.document.metadata": "{\"source\": \"textbook.pdf\", \"page\": 42}",
"retrieval.documents.1.document.id": "doc_456",
"retrieval.documents.1.document.content": "Machine learning algorithms learn patterns from data...",
"retrieval.documents.1.document.score": 0.87,
"retrieval.documents.1.document.metadata": "{\"source\": \"article.html\", \"author\": \"Jane Doe\"}",
"retrieval.documents.2.document.id": "doc_789",
"retrieval.documents.2.document.content": "Supervised learning is a type of machine learning...",
"retrieval.documents.2.document.score": 0.81,
"retrieval.documents.2.document.metadata": "{\"source\": \"wiki.org\"}",
"metadata.retriever_type": "vector_search",
"metadata.vector_db": "pinecone",
"metadata.top_k": 3
}
```

---
# TOOL Spans
## Purpose
TOOL spans represent external tool or function invocations (API calls, database queries, calculators, custom functions).
## Required Attributes
| Attribute | Type | Description | Required |
| ------------------------- | ------ | ------------------ | ----------- |
| `openinference.span.kind` | String | Must be "TOOL" | Yes |
| `tool.name` | String | Tool/function name | Recommended |
## Attribute Reference
### Tool Execution Attributes
| Attribute | Type | Description |
| ------------------ | ------------- | ------------------------------------------ |
| `tool.name` | String | Tool/function name |
| `tool.description` | String | Tool purpose/description |
| `tool.parameters` | String (JSON) | JSON schema defining the tool's parameters |
| `input.value` | String (JSON) | Actual input values passed to the tool |
| `output.value` | String | Tool output/result |
| `output.mime_type` | String | Result content type (e.g., "application/json") |
## Examples
### API Call Tool
```json
{
"openinference.span.kind": "TOOL",
"tool.name": "get_weather",
"tool.description": "Fetches current weather for a location",
"tool.parameters": "{\"type\": \"object\", \"properties\": {\"location\": {\"type\": \"string\"}, \"units\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]}}, \"required\": [\"location\"]}",
"input.value": "{\"location\": \"San Francisco\", \"units\": \"celsius\"}",
"output.value": "{\"temperature\": 18, \"conditions\": \"partly cloudy\"}"
}
```
### Calculator Tool
```json
{
"openinference.span.kind": "TOOL",
"tool.name": "calculator",
"tool.description": "Performs mathematical calculations",
"tool.parameters": "{\"type\": \"object\", \"properties\": {\"expression\": {\"type\": \"string\", \"description\": \"Math expression to evaluate\"}}, \"required\": [\"expression\"]}",
"input.value": "{\"expression\": \"2 + 2\"}",
"output.value": "4"
}
```
### Database Query Tool
```json
{
"openinference.span.kind": "TOOL",
"tool.name": "sql_query",
"tool.description": "Executes SQL query on user database",
"tool.parameters": "{\"type\": \"object\", \"properties\": {\"query\": {\"type\": \"string\", \"description\": \"SQL query to execute\"}}, \"required\": [\"query\"]}",
"input.value": "{\"query\": \"SELECT * FROM users WHERE id = 123\"}",
"output.value": "[{\"id\": 123, \"name\": \"Alice\", \"email\": \"alice@example.com\"}]",
"output.mime_type": "application/json"
}
```