chore: publish from staged

This commit is contained in:
github-actions[bot]
2026-04-09 06:26:21 +00:00
parent 017f31f495
commit a68b190031
467 changed files with 97527 additions and 276 deletions

View File

@@ -18,8 +18,8 @@
"instrumentation"
],
"skills": [
"./skills/phoenix-cli/",
"./skills/phoenix-evals/",
"./skills/phoenix-tracing/"
"./skills/phoenix-cli",
"./skills/phoenix-evals",
"./skills/phoenix-tracing"
]
}

View File

@@ -0,0 +1,162 @@
---
name: phoenix-cli
description: Debug LLM applications using the Phoenix CLI. Fetch traces, analyze errors, review experiments, inspect datasets, and query the GraphQL API. Use when debugging AI/LLM applications, analyzing trace data, working with Phoenix observability, or investigating LLM performance issues.
license: Apache-2.0
compatibility: Requires Node.js (for npx) or global install of @arizeai/phoenix-cli. Optionally requires jq for JSON processing.
metadata:
author: arize-ai
version: "2.0.0"
---
# Phoenix CLI
## Invocation
```bash
px <resource> <action> # if installed globally
npx @arizeai/phoenix-cli <resource> <action> # no install required
```
The CLI uses singular resource commands with subcommands like `list` and `get`:
```bash
px trace list
px trace get <trace-id>
px span list
px dataset list
px dataset get <name>
```
## Setup
```bash
export PHOENIX_HOST=http://localhost:6006
export PHOENIX_PROJECT=my-project
export PHOENIX_API_KEY=your-api-key # if auth is enabled
```
Always use `--format raw --no-progress` when piping to `jq`.
## Traces
```bash
px trace list --limit 20 --format raw --no-progress | jq .
px trace list --last-n-minutes 60 --limit 20 --format raw --no-progress | jq '.[] | select(.status == "ERROR")'
px trace list --format raw --no-progress | jq 'sort_by(-.duration) | .[0:5]'
px trace get <trace-id> --format raw --no-progress | jq .
px trace get <trace-id> --format raw --no-progress | jq '.spans[] | select(.status_code != "OK")'
```
## Spans
```bash
px span list --limit 20 # recent spans (table view)
px span list --last-n-minutes 60 --limit 50 # spans from last hour
px span list --span-kind LLM --limit 10 # only LLM spans
px span list --status-code ERROR --limit 20 # only errored spans
px span list --name chat_completion --limit 10 # filter by span name
px span list --trace-id <id> --format raw --no-progress | jq . # all spans for a trace
px span list --include-annotations --limit 10 # include annotation scores
px span list output.json --limit 100 # save to JSON file
px span list --format raw --no-progress | jq '.[] | select(.status_code == "ERROR")'
```
### Span JSON shape
```
Span
name, span_kind ("LLM"|"CHAIN"|"TOOL"|"RETRIEVER"|"EMBEDDING"|"AGENT"|"RERANKER"|"GUARDRAIL"|"EVALUATOR"|"UNKNOWN")
status_code ("OK"|"ERROR"|"UNSET"), status_message
context.span_id, context.trace_id, parent_id
start_time, end_time
attributes (same as trace span attributes listed under "Trace JSON shape" below)
annotations[] (with --include-annotations)
name, result { score, label, explanation }
```
### Trace JSON shape
```
Trace
traceId, status ("OK"|"ERROR"), duration (ms), startTime, endTime
rootSpan — top-level span (parent_id: null)
spans[]
name, span_kind ("LLM"|"CHAIN"|"TOOL"|"RETRIEVER"|"EMBEDDING"|"AGENT")
status_code ("OK"|"ERROR"), parent_id, context.span_id
attributes
input.value, output.value — raw input/output
llm.model_name, llm.provider
llm.token_count.prompt/completion/total
llm.token_count.prompt_details.cache_read
llm.token_count.completion_details.reasoning
llm.input_messages.{N}.message.role/content
llm.output_messages.{N}.message.role/content
llm.invocation_parameters — JSON string (temperature, etc.)
exception.message — set if span errored
```
## Sessions
```bash
px session list --limit 10 --format raw --no-progress | jq .
px session list --order asc --format raw --no-progress | jq '.[].session_id'
px session get <session-id> --format raw --no-progress | jq .
px session get <session-id> --include-annotations --format raw --no-progress | jq '.annotations'
```
### Session JSON shape
```
SessionData
id, session_id, project_id
start_time, end_time
traces[]
id, trace_id, start_time, end_time
SessionAnnotation (with --include-annotations)
id, name, annotator_kind ("LLM"|"CODE"|"HUMAN"), session_id
result { label, score, explanation }
metadata, identifier, source, created_at, updated_at
```
## Datasets / Experiments / Prompts
```bash
px dataset list --format raw --no-progress | jq '.[].name'
px dataset get <name> --format raw --no-progress | jq '.examples[] | {input, output: .expected_output}'
px experiment list --dataset <name> --format raw --no-progress | jq '.[] | {id, name, failed_run_count}'
px experiment get <id> --format raw --no-progress | jq '.[] | select(.error != null) | {input, error}'
px prompt list --format raw --no-progress | jq '.[].name'
px prompt get <name> --format text --no-progress # plain text, ideal for piping to AI
```
## GraphQL
For ad-hoc queries not covered by the commands above. Output is `{"data": {...}}`.
```bash
px api graphql '{ projectCount datasetCount promptCount evaluatorCount }'
px api graphql '{ projects { edges { node { name traceCount tokenCountTotal } } } }' | jq '.data.projects.edges[].node'
px api graphql '{ datasets { edges { node { name exampleCount experimentCount } } } }' | jq '.data.datasets.edges[].node'
px api graphql '{ evaluators { edges { node { name kind } } } }' | jq '.data.evaluators.edges[].node'
# Introspect any type
px api graphql '{ __type(name: "Project") { fields { name type { name } } } }' | jq '.data.__type.fields[]'
```
Key root fields: `projects`, `datasets`, `prompts`, `evaluators`, `projectCount`, `datasetCount`, `promptCount`, `evaluatorCount`, `viewer`.
## Docs
Download Phoenix documentation markdown for local use by coding agents.
```bash
px docs fetch # fetch default workflow docs to .px/docs
px docs fetch --workflow tracing # fetch only tracing docs
px docs fetch --workflow tracing --workflow evaluation
px docs fetch --dry-run # preview what would be downloaded
px docs fetch --refresh # clear .px/docs and re-download
px docs fetch --output-dir ./my-docs # custom output directory
```
Key options: `--workflow` (repeatable, values: `tracing`, `evaluation`, `datasets`, `prompts`, `integrations`, `sdk`, `self-hosting`, `all`), `--dry-run`, `--refresh`, `--output-dir` (default `.px/docs`), `--workers` (default 10).

View File

@@ -0,0 +1,72 @@
---
name: phoenix-evals
description: Build and run evaluators for AI/LLM applications using Phoenix.
license: Apache-2.0
compatibility: Requires Phoenix server. Python skills need phoenix and openai packages; TypeScript skills need @arizeai/phoenix-client.
metadata:
author: oss@arize.com
version: "1.0.0"
languages: "Python, TypeScript"
---
# Phoenix Evals
Build evaluators for AI/LLM applications. Code first, LLM for nuance, validate against humans.
## Quick Reference
| Task | Files |
| ---- | ----- |
| Setup | [setup-python](references/setup-python.md), [setup-typescript](references/setup-typescript.md) |
| Decide what to evaluate | [evaluators-overview](references/evaluators-overview.md) |
| Choose a judge model | [fundamentals-model-selection](references/fundamentals-model-selection.md) |
| Use pre-built evaluators | [evaluators-pre-built](references/evaluators-pre-built.md) |
| Build code evaluator | [evaluators-code-python](references/evaluators-code-python.md), [evaluators-code-typescript](references/evaluators-code-typescript.md) |
| Build LLM evaluator | [evaluators-llm-python](references/evaluators-llm-python.md), [evaluators-llm-typescript](references/evaluators-llm-typescript.md), [evaluators-custom-templates](references/evaluators-custom-templates.md) |
| Batch evaluate DataFrame | [evaluate-dataframe-python](references/evaluate-dataframe-python.md) |
| Run experiment | [experiments-running-python](references/experiments-running-python.md), [experiments-running-typescript](references/experiments-running-typescript.md) |
| Create dataset | [experiments-datasets-python](references/experiments-datasets-python.md), [experiments-datasets-typescript](references/experiments-datasets-typescript.md) |
| Generate synthetic data | [experiments-synthetic-python](references/experiments-synthetic-python.md), [experiments-synthetic-typescript](references/experiments-synthetic-typescript.md) |
| Validate evaluator accuracy | [validation](references/validation.md), [validation-evaluators-python](references/validation-evaluators-python.md), [validation-evaluators-typescript](references/validation-evaluators-typescript.md) |
| Sample traces for review | [observe-sampling-python](references/observe-sampling-python.md), [observe-sampling-typescript](references/observe-sampling-typescript.md) |
| Analyze errors | [error-analysis](references/error-analysis.md), [error-analysis-multi-turn](references/error-analysis-multi-turn.md), [axial-coding](references/axial-coding.md) |
| RAG evals | [evaluators-rag](references/evaluators-rag.md) |
| Avoid common mistakes | [common-mistakes-python](references/common-mistakes-python.md), [fundamentals-anti-patterns](references/fundamentals-anti-patterns.md) |
| Production | [production-overview](references/production-overview.md), [production-guardrails](references/production-guardrails.md), [production-continuous](references/production-continuous.md) |
## Workflows
**Starting Fresh:**
[observe-tracing-setup](references/observe-tracing-setup.md) → [error-analysis](references/error-analysis.md) → [axial-coding](references/axial-coding.md) → [evaluators-overview](references/evaluators-overview.md)
**Building Evaluator:**
[fundamentals](references/fundamentals.md) → [common-mistakes-python](references/common-mistakes-python.md) → evaluators-{code|llm}-{python|typescript} → validation-evaluators-{python|typescript}
**RAG Systems:**
[evaluators-rag](references/evaluators-rag.md) → evaluators-code-* (retrieval) → evaluators-llm-* (faithfulness)
**Production:**
[production-overview](references/production-overview.md) → [production-guardrails](references/production-guardrails.md) → [production-continuous](references/production-continuous.md)
## Reference Categories
| Prefix | Description |
| ------ | ----------- |
| `fundamentals-*` | Types, scores, anti-patterns |
| `observe-*` | Tracing, sampling |
| `error-analysis-*` | Finding failures |
| `axial-coding-*` | Categorizing failures |
| `evaluators-*` | Code, LLM, RAG evaluators |
| `experiments-*` | Datasets, running experiments |
| `validation-*` | Validating evaluator accuracy against human labels |
| `production-*` | CI/CD, monitoring |
## Key Principles
| Principle | Action |
| --------- | ------ |
| Error analysis first | Can't automate what you haven't observed |
| Custom > generic | Build from your failures |
| Code first | Deterministic before LLM |
| Validate judges | >80% TPR/TNR |
| Binary > Likert | Pass/fail, not 1-5 |

View File

@@ -0,0 +1,95 @@
# Axial Coding
Group open-ended notes into structured failure taxonomies.
## Process
1. **Gather** - Collect open coding notes
2. **Pattern** - Group notes with common themes
3. **Name** - Create actionable category names
4. **Quantify** - Count failures per category
## Example Taxonomy
```yaml
failure_taxonomy:
content_quality:
hallucination: [invented_facts, fictional_citations]
incompleteness: [partial_answer, missing_key_info]
inaccuracy: [wrong_numbers, wrong_dates]
communication:
tone_mismatch: [too_casual, too_formal]
clarity: [ambiguous, jargon_heavy]
context:
user_context: [ignored_preferences, misunderstood_intent]
retrieved_context: [ignored_documents, wrong_context]
safety:
missing_disclaimers: [legal, medical, financial]
```
## Add Annotation (Python)
```python
from phoenix.client import Client
client = Client()
client.spans.add_span_annotation(
span_id="abc123",
annotation_name="failure_category",
label="hallucination",
explanation="invented a feature that doesn't exist",
annotator_kind="HUMAN",
sync=True,
)
```
## Add Annotation (TypeScript)
```typescript
import { addSpanAnnotation } from "@arizeai/phoenix-client/spans";
await addSpanAnnotation({
spanAnnotation: {
spanId: "abc123",
name: "failure_category",
label: "hallucination",
explanation: "invented a feature that doesn't exist",
annotatorKind: "HUMAN",
}
});
```
## Agent Failure Taxonomy
```yaml
agent_failures:
planning: [wrong_plan, incomplete_plan]
tool_selection: [wrong_tool, missed_tool, unnecessary_call]
tool_execution: [wrong_parameters, type_error]
state_management: [lost_context, stuck_in_loop]
error_recovery: [no_fallback, wrong_fallback]
```
## Transition Matrix (Agents)
Shows where failures occur between states:
```python
def build_transition_matrix(conversations, states):
matrix = defaultdict(lambda: defaultdict(int))
for conv in conversations:
if conv["failed"]:
last_success = find_last_success(conv)
first_failure = find_first_failure(conv)
matrix[last_success][first_failure] += 1
return pd.DataFrame(matrix).fillna(0)
```
## Principles
- **MECE** - Categories are mutually exclusive and collectively exhaustive: each failure fits exactly ONE category
- **Actionable** - Categories suggest fixes
- **Bottom-up** - Let categories emerge from data

View File

@@ -0,0 +1,225 @@
# Common Mistakes (Python)
Patterns that LLMs frequently generate incorrectly from training data.
## Legacy Model Classes
```python
# WRONG
from phoenix.evals import OpenAIModel, AnthropicModel
model = OpenAIModel(model="gpt-4")
# RIGHT
from phoenix.evals import LLM
llm = LLM(provider="openai", model="gpt-4o")
```
**Why**: `OpenAIModel`, `AnthropicModel`, etc. are legacy 1.0 wrappers in `phoenix.evals.legacy`.
The `LLM` class is provider-agnostic and is the current 2.0 API.
## Using run_evals Instead of evaluate_dataframe
```python
# WRONG — legacy 1.0 API
from phoenix.evals import run_evals
results = run_evals(dataframe=df, evaluators=[eval1], provide_explanation=True)
# Returns list of DataFrames
# RIGHT — current 2.0 API
from phoenix.evals import evaluate_dataframe
results_df = evaluate_dataframe(dataframe=df, evaluators=[eval1])
# Returns single DataFrame with {name}_score dict columns
```
**Why**: `run_evals` is the legacy 1.0 batch function. `evaluate_dataframe` is the current
2.0 function with a different return format.
## Wrong Result Column Names
```python
# WRONG — column doesn't exist
score = results_df["relevance"].mean()
# WRONG — column exists but contains dicts, not numbers
score = results_df["relevance_score"].mean()
# RIGHT — extract numeric score from dict
scores = results_df["relevance_score"].apply(
lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0
)
score = scores.mean()
```
**Why**: `evaluate_dataframe` returns columns named `{name}_score` containing Score dicts
like `{"name": "...", "score": 1.0, "label": "...", "explanation": "..."}`.
## Deprecated project_name Parameter
```python
# WRONG
df = client.spans.get_spans_dataframe(project_name="my-project")
# RIGHT
df = client.spans.get_spans_dataframe(project_identifier="my-project")
```
**Why**: `project_name` is deprecated in favor of `project_identifier`, which also
accepts project IDs.
## Wrong Client Constructor
```python
# WRONG
client = Client(endpoint="https://app.phoenix.arize.com")
client = Client(url="https://app.phoenix.arize.com")
# RIGHT — for remote/cloud Phoenix
client = Client(base_url="https://app.phoenix.arize.com", api_key="...")
# ALSO RIGHT — for local Phoenix (falls back to env vars or localhost:6006)
client = Client()
```
**Why**: The parameter is `base_url`, not `endpoint` or `url`. For local instances,
`Client()` with no args works fine. For remote instances, `base_url` and `api_key` are required.
## Too-Aggressive Time Filters
```python
# WRONG — often returns zero spans
from datetime import datetime, timedelta
df = client.spans.get_spans_dataframe(
project_identifier="my-project",
start_time=datetime.now() - timedelta(hours=1),
)
# RIGHT — use limit to control result size instead
df = client.spans.get_spans_dataframe(
project_identifier="my-project",
limit=50,
)
```
**Why**: Traces may be from any time period. A 1-hour window frequently returns
nothing. Use `limit=` to control result size instead.
## Not Filtering Spans Appropriately
```python
# WRONG — fetches all spans including internal LLM calls, retrievers, etc.
df = client.spans.get_spans_dataframe(project_identifier="my-project")
# RIGHT for end-to-end evaluation — filter to top-level spans
df = client.spans.get_spans_dataframe(
project_identifier="my-project",
root_spans_only=True,
)
# RIGHT for RAG evaluation — fetch child spans for retriever/LLM metrics
all_spans = client.spans.get_spans_dataframe(
project_identifier="my-project",
)
retriever_spans = all_spans[all_spans["span_kind"] == "RETRIEVER"]
llm_spans = all_spans[all_spans["span_kind"] == "LLM"]
```
**Why**: For end-to-end evaluation (e.g., overall answer quality), use `root_spans_only=True`.
For RAG systems, you often need child spans separately — retriever spans for
DocumentRelevance and LLM spans for Faithfulness. Choose the right span level
for your evaluation target.
## Assuming Span Output is Plain Text
```python
# WRONG — output may be JSON, not plain text
df["output"] = df["attributes.output.value"]
# RIGHT — parse JSON and extract the answer field
import json
def extract_answer(output_value):
if not isinstance(output_value, str):
return str(output_value) if output_value is not None else ""
try:
parsed = json.loads(output_value)
if isinstance(parsed, dict):
for key in ("answer", "result", "output", "response"):
if key in parsed:
return str(parsed[key])
except (json.JSONDecodeError, TypeError):
pass
return output_value
df["output"] = df["attributes.output.value"].apply(extract_answer)
```
**Why**: LangChain and other frameworks often output structured JSON from root spans,
like `{"context": "...", "question": "...", "answer": "..."}`. Evaluators need
the actual answer text, not the raw JSON.
## Using @create_evaluator for LLM-Based Evaluation
```python
# WRONG — @create_evaluator doesn't call an LLM
@create_evaluator(name="relevance", kind="llm")
def relevance(input: str, output: str) -> str:
pass # No LLM is involved
# RIGHT — use ClassificationEvaluator for LLM-based evaluation
from phoenix.evals import ClassificationEvaluator, LLM
relevance = ClassificationEvaluator(
name="relevance",
prompt_template="Is this relevant?\n{{input}}\n{{output}}\nAnswer:",
llm=LLM(provider="openai", model="gpt-4o"),
choices={"relevant": 1.0, "irrelevant": 0.0},
)
```
**Why**: `@create_evaluator` wraps a plain Python function. Setting `kind="llm"`
marks it as LLM-based but you must implement the LLM call yourself.
For LLM-based evaluation, prefer `ClassificationEvaluator` which handles
the LLM call, structured output parsing, and explanations automatically.
## Using llm_classify Instead of ClassificationEvaluator
```python
# WRONG — legacy 1.0 API
from phoenix.evals import llm_classify
results = llm_classify(
dataframe=df,
template=template_str,
model=model,
rails=["relevant", "irrelevant"],
)
# RIGHT — current 2.0 API
from phoenix.evals import ClassificationEvaluator, async_evaluate_dataframe, LLM
classifier = ClassificationEvaluator(
name="relevance",
prompt_template=template_str,
llm=LLM(provider="openai", model="gpt-4o"),
choices={"relevant": 1.0, "irrelevant": 0.0},
)
results_df = await async_evaluate_dataframe(dataframe=df, evaluators=[classifier])
```
**Why**: `llm_classify` is the legacy 1.0 function. The current pattern is to create
an evaluator with `ClassificationEvaluator` and run it with `async_evaluate_dataframe()`.
## Using HallucinationEvaluator
```python
# WRONG — deprecated
from phoenix.evals import HallucinationEvaluator
eval = HallucinationEvaluator(model)
# RIGHT — use FaithfulnessEvaluator
from phoenix.evals.metrics import FaithfulnessEvaluator
from phoenix.evals import LLM
eval = FaithfulnessEvaluator(llm=LLM(provider="openai", model="gpt-4o"))
```
**Why**: `HallucinationEvaluator` is deprecated. `FaithfulnessEvaluator` is its replacement,
using "faithful"/"unfaithful" labels with a maximize score direction (1.0 = faithful).

View File

@@ -0,0 +1,52 @@
# Error Analysis: Multi-Turn Conversations
Debugging complex multi-turn conversation traces.
## The Approach
1. **End-to-end first** - Did the conversation achieve the goal?
2. **Find first failure** - Trace backwards to root cause
3. **Simplify** - Try single-turn before multi-turn debug
4. **N-1 testing** - Isolate turn-specific vs capability issues
## Find First Upstream Failure
```
Turn 1: User asks about flights ✓
Turn 2: Assistant asks for dates ✓
Turn 3: User provides dates ✓
Turn 4: Assistant searches WRONG dates ← FIRST FAILURE
Turn 5: Shows wrong flights (consequence)
Turn 6: User frustrated (consequence)
```
Focus on Turn 4, not Turn 6.
## Simplify First
Before debugging multi-turn, test single-turn:
```python
# If single-turn also fails → problem is retrieval/knowledge
# If single-turn passes → problem is conversation context
response = chat("What's the return policy for electronics?")
```
## N-1 Testing
Give turns 1 to N-1 as context, test turn N:
```python
context = conversation[:n-1]
response = chat_with_context(context, user_message_n)
# Compare to actual turn N
```
This isolates whether error is from context or underlying capability.
## Checklist
1. Did conversation achieve goal? (E2E)
2. Which turn first went wrong?
3. Can you reproduce with single-turn?
4. Is error from context or capability? (N-1 test)

View File

@@ -0,0 +1,170 @@
# Error Analysis
Review traces to discover failure modes before building evaluators.
## Process
1. **Sample** - 100+ traces (errors, negative feedback, random)
2. **Open Code** - Write free-form notes per trace
3. **Axial Code** - Group notes into failure categories
4. **Quantify** - Count failures per category
5. **Prioritize** - Rank by frequency × severity
## Sample Traces
### Span-level sampling (Python — DataFrame)
```python
from phoenix.client import Client
# Client() works for local Phoenix (falls back to env vars or localhost:6006)
# For remote/cloud: Client(base_url="https://app.phoenix.arize.com", api_key="...")
client = Client()
spans_df = client.spans.get_spans_dataframe(project_identifier="my-app")
# Build representative sample
sample = pd.concat([
spans_df[spans_df["status_code"] == "ERROR"].sample(30),
spans_df[spans_df["feedback"] == "negative"].sample(30),
spans_df.sample(40),
]).drop_duplicates("span_id").head(100)
```
### Span-level sampling (TypeScript)
```typescript
import { getSpans } from "@arizeai/phoenix-client/spans";
const { spans: errors } = await getSpans({
project: { projectName: "my-app" },
statusCode: "ERROR",
limit: 30,
});
const { spans: allSpans } = await getSpans({
project: { projectName: "my-app" },
limit: 70,
});
const sample = [...errors, ...allSpans.sort(() => Math.random() - 0.5).slice(0, 40)];
const unique = [...new Map(sample.map((s) => [s.context.span_id, s])).values()].slice(0, 100);
```
### Trace-level sampling (Python)
When errors span multiple spans (e.g., agent workflows), sample whole traces:
```python
from datetime import datetime, timedelta
traces = client.traces.get_traces(
project_identifier="my-app",
start_time=datetime.now() - timedelta(hours=24),
include_spans=True,
sort="latency_ms",
order="desc",
limit=100,
)
# Each trace has: trace_id, start_time, end_time, spans
```
### Trace-level sampling (TypeScript)
```typescript
import { getTraces } from "@arizeai/phoenix-client/traces";
const { traces } = await getTraces({
project: { projectName: "my-app" },
startTime: new Date(Date.now() - 24 * 60 * 60 * 1000),
includeSpans: true,
limit: 100,
});
```
## Add Notes (Python)
```python
client.spans.add_span_note(
span_id="abc123",
note="wrong timezone - said 3pm EST but user is PST"
)
```
## Add Notes (TypeScript)
```typescript
import { addSpanNote } from "@arizeai/phoenix-client/spans";
await addSpanNote({
spanNote: {
spanId: "abc123",
note: "wrong timezone - said 3pm EST but user is PST"
}
});
```
## What to Note
| Type | Examples |
| ---- | -------- |
| Factual errors | Wrong dates, prices, made-up features |
| Missing info | Didn't answer question, omitted details |
| Tone issues | Too casual/formal for context |
| Tool issues | Wrong tool, wrong parameters |
| Retrieval | Wrong docs, missing relevant docs |
## Good Notes
```
BAD: "Response is bad"
GOOD: "Response says ships in 2 days but policy is 5-7 days"
```
## Group into Categories
```python
categories = {
"factual_inaccuracy": ["wrong shipping time", "incorrect price"],
"hallucination": ["made up a discount", "invented feature"],
"tone_mismatch": ["informal for enterprise client"],
}
# Priority = Frequency × Severity
```
## Retrieve Existing Annotations
### Python
```python
# From a spans DataFrame
annotations_df = client.spans.get_span_annotations_dataframe(
spans_dataframe=sample,
project_identifier="my-app",
include_annotation_names=["quality", "correctness"],
)
# annotations_df has: span_id (index), name, label, score, explanation
# Or from specific span IDs
annotations_df = client.spans.get_span_annotations_dataframe(
span_ids=["span-id-1", "span-id-2"],
project_identifier="my-app",
)
```
### TypeScript
```typescript
import { getSpanAnnotations } from "@arizeai/phoenix-client/spans";
const { annotations } = await getSpanAnnotations({
project: { projectName: "my-app" },
spanIds: ["span-id-1", "span-id-2"],
includeAnnotationNames: ["quality", "correctness"],
});
for (const ann of annotations) {
console.log(`${ann.span_id}: ${ann.name} = ${ann.result?.label} (${ann.result?.score})`);
}
```
## Saturation
Stop when new traces reveal no new failure modes. Minimum: 100 traces.

View File

@@ -0,0 +1,137 @@
# Batch Evaluation with evaluate_dataframe (Python)
Run evaluators across a DataFrame. The core 2.0 batch evaluation API.
## Preferred: async_evaluate_dataframe
For batch evaluations (especially with LLM evaluators), prefer the async version
for better throughput:
```python
from phoenix.evals import async_evaluate_dataframe
results_df = await async_evaluate_dataframe(
dataframe=df, # pandas DataFrame with columns matching evaluator params
evaluators=[eval1, eval2], # List of evaluators
concurrency=5, # Max concurrent LLM calls (default 3)
exit_on_error=False, # Optional: True stops on first error (default); False continues past errors
max_retries=3, # Optional: retry failed LLM calls (default 10)
)
```
## Sync Version
```python
from phoenix.evals import evaluate_dataframe
results_df = evaluate_dataframe(
dataframe=df, # pandas DataFrame with columns matching evaluator params
evaluators=[eval1, eval2], # List of evaluators
exit_on_error=False, # Optional: True stops on first error (default); False continues past errors
max_retries=3, # Optional: retry failed LLM calls (default 10)
)
```
## Result Column Format
`async_evaluate_dataframe` / `evaluate_dataframe` returns a copy of the input DataFrame with added columns.
**Result columns contain dicts, NOT raw numbers.**
For each evaluator named `"foo"`, two columns are added:
| Column | Type | Contents |
| ------ | ---- | -------- |
| `foo_score` | `dict` | `{"name": "foo", "score": 1.0, "label": "True", "explanation": "...", "metadata": {...}, "kind": "code", "direction": "maximize"}` |
| `foo_execution_details` | `dict` | `{"status": "success", "exceptions": [], "execution_seconds": 0.001}` |
Only non-None fields appear in the score dict.
### Extracting Numeric Scores
```python
# WRONG — these will fail or produce unexpected results
score = results_df["relevance"].mean() # KeyError!
score = results_df["relevance_score"].mean() # Tries to average dicts!
# RIGHT — extract the numeric score from each dict
scores = results_df["relevance_score"].apply(
lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0
)
mean_score = scores.mean()
```
### Extracting Labels
```python
labels = results_df["relevance_score"].apply(
lambda x: x.get("label", "") if isinstance(x, dict) else ""
)
```
### Extracting Explanations (LLM evaluators)
```python
explanations = results_df["relevance_score"].apply(
lambda x: x.get("explanation", "") if isinstance(x, dict) else ""
)
```
### Finding Failures
```python
scores = results_df["relevance_score"].apply(
lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0
)
failed_mask = scores < 0.5
failures = results_df[failed_mask]
```
## Input Mapping
Evaluators receive each row as a dict. Column names must match the evaluator's
expected parameter names. If they don't match, use `.bind()` or `bind_evaluator`:
```python
from phoenix.evals import bind_evaluator, create_evaluator, async_evaluate_dataframe
@create_evaluator(name="check", kind="code")
def check(response: str) -> bool:
return len(response.strip()) > 0
# Option 1: Use .bind() method on the evaluator
check.bind(input_mapping={"response": "answer"})
results_df = await async_evaluate_dataframe(dataframe=df, evaluators=[check])
# Option 2: Use bind_evaluator function
bound = bind_evaluator(evaluator=check, input_mapping={"response": "answer"})
results_df = await async_evaluate_dataframe(dataframe=df, evaluators=[bound])
```
Or simply rename columns to match:
```python
df = df.rename(columns={
"attributes.input.value": "input",
"attributes.output.value": "output",
})
```
## DO NOT use run_evals
```python
# WRONG — legacy 1.0 API
from phoenix.evals import run_evals
results = run_evals(dataframe=df, evaluators=[eval1])
# Returns List[DataFrame] — one per evaluator
# RIGHT — current 2.0 API
from phoenix.evals import async_evaluate_dataframe
results_df = await async_evaluate_dataframe(dataframe=df, evaluators=[eval1])
# Returns single DataFrame with {name}_score dict columns
```
Key differences:
- `run_evals` returns a **list** of DataFrames (one per evaluator)
- `async_evaluate_dataframe` returns a **single** DataFrame with all results merged
- `async_evaluate_dataframe` uses `{name}_score` dict column format
- `async_evaluate_dataframe` uses `bind_evaluator` for input mapping (not `input_mapping=` param)

View File

@@ -0,0 +1,91 @@
# Evaluators: Code Evaluators in Python
Deterministic evaluators without LLM. Fast, cheap, reproducible.
## Basic Pattern
```python
import re
import json
from phoenix.evals import create_evaluator
@create_evaluator(name="has_citation", kind="code")
def has_citation(output: str) -> bool:
return bool(re.search(r'\[\d+\]', output))
@create_evaluator(name="json_valid", kind="code")
def json_valid(output: str) -> bool:
try:
json.loads(output)
return True
except json.JSONDecodeError:
return False
```
## Parameter Binding
| Parameter | Description |
| --------- | ----------- |
| `output` | Task output |
| `input` | Example input |
| `expected` | Expected output |
| `metadata` | Example metadata |
```python
@create_evaluator(name="matches_expected", kind="code")
def matches_expected(output: str, expected: dict) -> bool:
return output.strip() == expected.get("answer", "").strip()
```
## Common Patterns
- **Regex**: `re.search(pattern, output)`
- **JSON schema**: `jsonschema.validate()`
- **Keywords**: `keyword in output.lower()`
- **Length**: `len(output.split())`
- **Similarity**: `editdistance.eval()` or Jaccard
## Return Types
| Return type | Result |
| ----------- | ------ |
| `bool` | `True` → score=1.0, label="True"; `False` → score=0.0, label="False" |
| `float`/`int` | Used as the `score` value directly |
| `str` (short, ≤3 words) | Used as the `label` value |
| `str` (long, ≥4 words) | Used as the `explanation` value |
| `dict` with `score`/`label`/`explanation` | Mapped to Score fields directly |
| `Score` object | Used as-is |
## Important: Code vs LLM Evaluators
The `@create_evaluator` decorator wraps a plain Python function.
- `kind="code"` (default): For deterministic evaluators that don't call an LLM.
- `kind="llm"`: Marks the evaluator as LLM-based, but **you** must implement the LLM
call inside the function. The decorator does not call an LLM for you.
For most LLM-based evaluation, prefer `ClassificationEvaluator` which handles
the LLM call, structured output parsing, and explanations automatically:
```python
from phoenix.evals import ClassificationEvaluator, LLM
relevance = ClassificationEvaluator(
name="relevance",
prompt_template="Is this relevant?\n{{input}}\n{{output}}\nAnswer:",
llm=LLM(provider="openai", model="gpt-4o"),
choices={"relevant": 1.0, "irrelevant": 0.0},
)
```
## Pre-Built
```python
from phoenix.experiments.evaluators import ContainsAnyKeyword, JSONParseable, MatchesRegex
evaluators = [
ContainsAnyKeyword(keywords=["disclaimer"]),
JSONParseable(),
MatchesRegex(pattern=r"\d{4}-\d{2}-\d{2}"),
]
```

View File

@@ -0,0 +1,51 @@
# Evaluators: Code Evaluators in TypeScript
Deterministic evaluators that run without an LLM. Fast, cheap, reproducible.
## Basic Pattern
```typescript
import { createEvaluator } from "@arizeai/phoenix-evals";
const containsCitation = createEvaluator<{ output: string }>(
({ output }) => /\[\d+\]/.test(output) ? 1 : 0,
{ name: "contains_citation", kind: "CODE" }
);
```
## With Full Results (asExperimentEvaluator)
```typescript
import { asExperimentEvaluator } from "@arizeai/phoenix-client/experiments";
const jsonValid = asExperimentEvaluator({
name: "json_valid",
kind: "CODE",
evaluate: async ({ output }) => {
try {
JSON.parse(String(output));
return { score: 1.0, label: "valid_json" };
} catch (e) {
return { score: 0.0, label: "invalid_json", explanation: String(e) };
}
},
});
```
## Parameter Types
```typescript
interface EvaluatorParams {
input: Record<string, unknown>;
output: unknown;
expected: Record<string, unknown>;
metadata: Record<string, unknown>;
}
```
## Common Patterns
- **Regex**: `/pattern/.test(output)`
- **JSON**: `JSON.parse()` + zod schema
- **Keywords**: `output.includes(keyword)`
- **Similarity**: `fastest-levenshtein`

View File

@@ -0,0 +1,54 @@
# Evaluators: Custom Templates
Design LLM judge prompts.
## Complete Template Pattern
```python
TEMPLATE = """Evaluate faithfulness of the response to the context.
<context>{{context}}</context>
<response>{{output}}</response>
CRITERIA:
"faithful" = ALL claims supported by context
"unfaithful" = ANY claim NOT in context
EXAMPLES:
Context: "Price is $10" → Response: "It costs $10" → faithful
Context: "Price is $10" → Response: "About $15" → unfaithful
EDGE CASES:
- Empty context → cannot_evaluate
- "I don't know" when appropriate → faithful
- Partial faithfulness → unfaithful (strict)
Answer (faithful/unfaithful):"""
```
## Template Structure
1. Task description
2. Input variables in XML tags
3. Criteria definitions
4. Examples (2-4 cases)
5. Edge cases
6. Output format
## XML Tags
```
<question>{{input}}</question>
<response>{{output}}</response>
<context>{{context}}</context>
<reference>{{reference}}</reference>
```
## Common Mistakes
| Mistake | Fix |
| ------- | --- |
| Vague criteria | Define each label exactly |
| No examples | Include 2-4 cases |
| Ambiguous format | Specify exact output |
| No edge cases | Address ambiguity |

View File

@@ -0,0 +1,92 @@
# Evaluators: LLM Evaluators in Python
LLM evaluators use a language model to judge outputs. Use when criteria are subjective.
## Quick Start
```python
from phoenix.evals import ClassificationEvaluator, LLM
llm = LLM(provider="openai", model="gpt-4o")
HELPFULNESS_TEMPLATE = """Rate how helpful the response is.
<question>{{input}}</question>
<response>{{output}}</response>
"helpful" means directly addresses the question.
"not_helpful" means does not address the question.
Your answer (helpful/not_helpful):"""
helpfulness = ClassificationEvaluator(
name="helpfulness",
prompt_template=HELPFULNESS_TEMPLATE,
llm=llm,
choices={"not_helpful": 0, "helpful": 1}
)
```
## Template Variables
Use XML tags to wrap variables for clarity:
| Variable | XML Tag |
| -------- | ------- |
| `{{input}}` | `<question>{{input}}</question>` |
| `{{output}}` | `<response>{{output}}</response>` |
| `{{reference}}` | `<reference>{{reference}}</reference>` |
| `{{context}}` | `<context>{{context}}</context>` |
## create_classifier (Factory)
Shorthand factory that returns a `ClassificationEvaluator`. When you need additional
parameters or customization, prefer instantiating `ClassificationEvaluator` directly:
```python
from phoenix.evals import create_classifier, LLM
relevance = create_classifier(
name="relevance",
prompt_template="""Is this response relevant to the question?
<question>{{input}}</question>
<response>{{output}}</response>
Answer (relevant/irrelevant):""",
llm=LLM(provider="openai", model="gpt-4o"),
choices={"relevant": 1.0, "irrelevant": 0.0},
)
```
## Input Mapping
Column names must match template variables. Rename columns or use `bind_evaluator`:
```python
# Option 1: Rename columns to match template variables
df = df.rename(columns={"user_query": "input", "ai_response": "output"})
# Option 2: Use bind_evaluator
from phoenix.evals import bind_evaluator
bound = bind_evaluator(
evaluator=helpfulness,
input_mapping={"input": "user_query", "output": "ai_response"},
)
```
## Running
```python
from phoenix.evals import evaluate_dataframe
results_df = evaluate_dataframe(dataframe=df, evaluators=[helpfulness])
```
## Best Practices
1. **Be specific** - Define exactly what pass/fail means
2. **Include examples** - Show concrete cases for each label
3. **Explanations by default** - `ClassificationEvaluator` includes explanations automatically
4. **Study built-in prompts** - See
`phoenix.evals.__generated__.classification_evaluator_configs` for examples
of well-structured evaluation prompts (Faithfulness, Correctness, DocumentRelevance, etc.)

View File

@@ -0,0 +1,58 @@
# Evaluators: LLM Evaluators in TypeScript
LLM evaluators use a language model to judge outputs. The TypeScript implementation uses the Vercel AI SDK.
## Quick Start
```typescript
import { createClassificationEvaluator } from "@arizeai/phoenix-evals";
import { openai } from "@ai-sdk/openai";
const helpfulness = await createClassificationEvaluator<{
input: string;
output: string;
}>({
name: "helpfulness",
model: openai("gpt-4o"),
promptTemplate: `Rate helpfulness.
<question>{{input}}</question>
<response>{{output}}</response>
Answer (helpful/not_helpful):`,
choices: { not_helpful: 0, helpful: 1 },
});
```
## Template Variables
Use XML tags: `<question>{{input}}</question>`, `<response>{{output}}</response>`, `<context>{{context}}</context>`
## Custom Evaluator with asExperimentEvaluator
```typescript
import { asExperimentEvaluator } from "@arizeai/phoenix-client/experiments";
const customEval = asExperimentEvaluator({
name: "custom",
kind: "LLM",
evaluate: async ({ input, output }) => {
// Your LLM call here
return { score: 1.0, label: "pass", explanation: "..." };
},
});
```
## Pre-Built Evaluators
```typescript
import { createFaithfulnessEvaluator } from "@arizeai/phoenix-evals";
const faithfulnessEvaluator = createFaithfulnessEvaluator({
model: openai("gpt-4o"),
});
```
## Best Practices
- Be specific about criteria
- Include examples in prompts
- Use `<thinking>` for chain of thought

View File

@@ -0,0 +1,40 @@
# Evaluators: Overview
When and how to build automated evaluators.
## Decision Framework
```
Should I Build an Evaluator?
Can I fix it with a prompt change?
YES → Fix the prompt first
NO → Is this a recurring issue?
YES → Build evaluator
NO → Add to watchlist
```
**Don't automate prematurely.** Many issues are simple prompt fixes.
## Evaluator Requirements
1. **Clear criteria** - Specific, not "Is it good?"
2. **Labeled test set** - 100+ examples with human labels
3. **Measured accuracy** - Know TPR/TNR before deploying
## Evaluator Lifecycle
1. **Discover** - Error analysis reveals pattern
2. **Design** - Define criteria and test cases
3. **Implement** - Build code or LLM evaluator
4. **Calibrate** - Validate against human labels
5. **Deploy** - Add to experiment/CI pipeline
6. **Monitor** - Track accuracy over time
7. **Maintain** - Update as product evolves
## What NOT to Automate
- **Rare issues** - <5 instances? Watchlist, don't build
- **Quick fixes** - Fixable by prompt change? Fix it
- **Evolving criteria** - Stabilize definition first

View File

@@ -0,0 +1,75 @@
# Evaluators: Pre-Built
Use for exploration only. Validate before production.
## Python
```python
from phoenix.evals import LLM
from phoenix.evals.metrics import FaithfulnessEvaluator
llm = LLM(provider="openai", model="gpt-4o")
faithfulness_eval = FaithfulnessEvaluator(llm=llm)
```
**Note**: `HallucinationEvaluator` is deprecated. Use `FaithfulnessEvaluator` instead.
It uses "faithful"/"unfaithful" labels with score 1.0 = faithful.
## TypeScript
```typescript
import { createHallucinationEvaluator } from "@arizeai/phoenix-evals";
import { openai } from "@ai-sdk/openai";
const hallucinationEval = createHallucinationEvaluator({ model: openai("gpt-4o") });
```
## Available (2.0)
| Evaluator | Type | Description |
| --------- | ---- | ----------- |
| `FaithfulnessEvaluator` | LLM | Is the response faithful to the context? |
| `CorrectnessEvaluator` | LLM | Is the response correct? |
| `DocumentRelevanceEvaluator` | LLM | Are retrieved documents relevant? |
| `ToolSelectionEvaluator` | LLM | Did the agent select the right tool? |
| `ToolInvocationEvaluator` | LLM | Did the agent invoke the tool correctly? |
| `ToolResponseHandlingEvaluator` | LLM | Did the agent handle the tool response well? |
| `MatchesRegex` | Code | Does output match a regex pattern? |
| `PrecisionRecallFScore` | Code | Precision/recall/F-score metrics |
| `exact_match` | Code | Exact string match |
Legacy evaluators (`HallucinationEvaluator`, `QAEvaluator`, `RelevanceEvaluator`,
`ToxicityEvaluator`, `SummarizationEvaluator`) live in `phoenix.evals.legacy` and are deprecated.
## When to Use
| Situation | Recommendation |
| --------- | -------------- |
| Exploration | Find traces to review |
| Find outliers | Sort by scores |
| Production | Validate first (>80% human agreement) |
| Domain-specific | Build custom |
## Exploration Pattern
```python
from phoenix.evals import evaluate_dataframe
results_df = evaluate_dataframe(dataframe=traces, evaluators=[faithfulness_eval])
# Score columns contain dicts — extract numeric scores
scores = results_df["faithfulness_score"].apply(
lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0
)
low_scores = results_df[scores < 0.5] # Review these
high_scores = results_df[scores > 0.9] # Also sample
```
## Validation Required
```python
from sklearn.metrics import classification_report
print(classification_report(human_labels, evaluator_results["label"]))
# Target: >80% agreement
```

View File

@@ -0,0 +1,108 @@
# Evaluators: RAG Systems
RAG has two distinct components requiring different evaluation approaches.
## Two-Phase Evaluation
```
RETRIEVAL GENERATION
───────── ──────────
Query → Retriever → Docs Docs + Query → LLM → Answer
│ │
IR Metrics LLM Judges / Code Checks
```
**Debug retrieval first** using IR metrics, then tackle generation quality.
## Retrieval Evaluation (IR Metrics)
Use traditional information retrieval metrics:
| Metric | What It Measures |
| ------ | ---------------- |
| Recall@k | Of all relevant docs, how many in top k? |
| Precision@k | Of k retrieved docs, how many relevant? |
| MRR | How high is first relevant doc? |
| NDCG | Quality weighted by position |
```python
# Requires query-document relevance labels
def recall_at_k(retrieved_ids, relevant_ids, k=5):
retrieved_set = set(retrieved_ids[:k])
relevant_set = set(relevant_ids)
if not relevant_set:
return 0.0
return len(retrieved_set & relevant_set) / len(relevant_set)
```
## Creating Retrieval Test Data
Generate query-document pairs synthetically:
```python
# Reverse process: document → questions that document answers
def generate_retrieval_test(documents):
test_pairs = []
for doc in documents:
# Extract facts, generate questions
questions = llm(f"Generate 3 questions this document answers:\n{doc}")
for q in questions:
test_pairs.append({"query": q, "relevant_doc_id": doc.id})
return test_pairs
```
## Generation Evaluation
Use LLM judges for qualities code can't measure:
| Eval | Question |
| ---- | -------- |
| **Faithfulness** | Are all claims supported by retrieved context? |
| **Relevance** | Does answer address the question? |
| **Completeness** | Does answer cover key points from context? |
```python
from phoenix.evals import ClassificationEvaluator, LLM
FAITHFULNESS_TEMPLATE = """Given the context and answer, is every claim in the answer supported by the context?
<context>{{context}}</context>
<answer>{{output}}</answer>
"faithful" = ALL claims supported by context
"unfaithful" = ANY claim NOT in context
Answer (faithful/unfaithful):"""
faithfulness = ClassificationEvaluator(
name="faithfulness",
prompt_template=FAITHFULNESS_TEMPLATE,
llm=LLM(provider="openai", model="gpt-4o"),
choices={"unfaithful": 0, "faithful": 1}
)
```
## RAG Failure Taxonomy
Common failure modes to evaluate:
```yaml
retrieval_failures:
- no_relevant_docs: Query returns unrelated content
- partial_retrieval: Some relevant docs missed
- wrong_chunk: Right doc, wrong section
generation_failures:
- hallucination: Claims not in retrieved context
- ignored_context: Answer doesn't use retrieved docs
- incomplete: Missing key information from context
- wrong_synthesis: Misinterprets or miscombines sources
```
## Evaluation Order
1. **Retrieval first** - If wrong docs, generation will fail
2. **Faithfulness** - Is answer grounded in context?
3. **Answer quality** - Does answer address the question?
Fix retrieval problems before debugging generation.

View File

@@ -0,0 +1,133 @@
# Experiments: Datasets in Python
Creating and managing evaluation datasets.
## Creating Datasets
```python
from phoenix.client import Client
client = Client()
# From examples
dataset = client.datasets.create_dataset(
name="qa-test-v1",
examples=[
{
"input": {"question": "What is 2+2?"},
"output": {"answer": "4"},
"metadata": {"category": "math"},
},
],
)
# From DataFrame
dataset = client.datasets.create_dataset(
dataframe=df,
name="qa-test-v1",
input_keys=["question"],
output_keys=["answer"],
metadata_keys=["category"],
)
```
## From Production Traces
```python
spans_df = client.spans.get_spans_dataframe(project_identifier="my-app")
dataset = client.datasets.create_dataset(
dataframe=spans_df[["input.value", "output.value"]],
name="production-sample-v1",
input_keys=["input.value"],
output_keys=["output.value"],
)
```
## Retrieving Datasets
```python
dataset = client.datasets.get_dataset(name="qa-test-v1")
df = dataset.to_dataframe()
```
## Key Parameters
| Parameter | Description |
| --------- | ----------- |
| `input_keys` | Columns for task input |
| `output_keys` | Columns for expected output |
| `metadata_keys` | Additional context |
## Using Evaluators in Experiments
### Evaluators as experiment evaluators
Pass phoenix-evals evaluators directly to `run_experiment` as the `evaluators` argument:
```python
from functools import partial
from phoenix.client import AsyncClient
from phoenix.evals import ClassificationEvaluator, LLM, bind_evaluator
# Define an LLM evaluator
refusal = ClassificationEvaluator(
name="refusal",
prompt_template="Is this a refusal?\nQuestion: {{query}}\nResponse: {{response}}",
llm=LLM(provider="openai", model="gpt-4o"),
choices={"refusal": 0, "answer": 1},
)
# Bind to map dataset columns to evaluator params
refusal_evaluator = bind_evaluator(refusal, {"query": "input.query", "response": "output"})
# Define experiment task
async def run_rag_task(input, rag_engine):
return rag_engine.query(input["query"])
# Run experiment with the evaluator
experiment = await AsyncClient().experiments.run_experiment(
dataset=ds,
task=partial(run_rag_task, rag_engine=query_engine),
experiment_name="baseline",
evaluators=[refusal_evaluator],
concurrency=10,
)
```
### Evaluators as the task (meta evaluation)
Use an LLM evaluator as the experiment **task** to test the evaluator itself
against human annotations:
```python
from phoenix.evals import create_evaluator
# The evaluator IS the task being tested
def run_refusal_eval(input, evaluator):
result = evaluator.evaluate(input)
return result[0]
# A simple heuristic checks judge vs human agreement
@create_evaluator(name="exact_match")
def exact_match(output, expected):
return float(output["score"]) == float(expected["refusal_score"])
# Run: evaluator is the task, exact_match evaluates it
experiment = await AsyncClient().experiments.run_experiment(
dataset=annotated_dataset,
task=partial(run_refusal_eval, evaluator=refusal),
experiment_name="judge-v1",
evaluators=[exact_match],
concurrency=10,
)
```
This pattern lets you iterate on evaluator prompts until they align with human judgments.
See `tutorials/evals/evals-2/evals_2.0_rag_demo.ipynb` for a full worked example.
## Best Practices
- **Versioning**: Create new datasets (e.g., `qa-test-v2`), don't modify
- **Metadata**: Track source, category, difficulty
- **Balance**: Ensure diverse coverage across categories

View File

@@ -0,0 +1,69 @@
# Experiments: Datasets in TypeScript
Creating and managing evaluation datasets.
## Creating Datasets
```typescript
import { createClient } from "@arizeai/phoenix-client";
import { createDataset } from "@arizeai/phoenix-client/datasets";
const client = createClient();
const { datasetId } = await createDataset({
client,
name: "qa-test-v1",
examples: [
{
input: { question: "What is 2+2?" },
output: { answer: "4" },
metadata: { category: "math" },
},
],
});
```
## Example Structure
```typescript
interface DatasetExample {
input: Record<string, unknown>; // Task input
output?: Record<string, unknown>; // Expected output
metadata?: Record<string, unknown>; // Additional context
}
```
## From Production Traces
```typescript
import { getSpans } from "@arizeai/phoenix-client/spans";
const { spans } = await getSpans({
project: { projectName: "my-app" },
parentId: null, // root spans only
limit: 100,
});
const examples = spans.map((span) => ({
input: { query: span.attributes?.["input.value"] },
output: { response: span.attributes?.["output.value"] },
metadata: { spanId: span.context.span_id },
}));
await createDataset({ client, name: "production-sample", examples });
```
## Retrieving Datasets
```typescript
import { getDataset, listDatasets } from "@arizeai/phoenix-client/datasets";
const dataset = await getDataset({ client, datasetId: "..." });
const all = await listDatasets({ client });
```
## Best Practices
- **Versioning**: Create new datasets, don't modify existing
- **Metadata**: Track source, category, provenance
- **Type safety**: Use TypeScript interfaces for structure

View File

@@ -0,0 +1,50 @@
# Experiments: Overview
Systematic testing of AI systems with datasets, tasks, and evaluators.
## Structure
```
DATASET → Examples: {input, expected_output, metadata}
TASK → function(input) → output
EVALUATORS → (input, output, expected) → score
EXPERIMENT → Run task on all examples, score results
```
## Basic Usage
```python
from phoenix.client.experiments import run_experiment
experiment = run_experiment(
dataset=my_dataset,
task=my_task,
evaluators=[accuracy, faithfulness],
experiment_name="improved-retrieval-v2",
)
print(experiment.aggregate_scores)
# {'accuracy': 0.85, 'faithfulness': 0.92}
```
## Workflow
1. **Create dataset** - From traces, synthetic data, or manual curation
2. **Define task** - The function to test (your LLM pipeline)
3. **Select evaluators** - Code and/or LLM-based
4. **Run experiment** - Execute and score
5. **Analyze & iterate** - Review, modify task, re-run
## Dry Runs
Test setup before full execution:
```python
experiment = run_experiment(dataset, task, evaluators, dry_run=3) # Just 3 examples
```
## Best Practices
- **Name meaningfully**: `"improved-retrieval-v2-2024-01-15"` not `"test"`
- **Version datasets**: Don't modify existing
- **Multiple evaluators**: Combine perspectives

View File

@@ -0,0 +1,78 @@
# Experiments: Running Experiments in Python
Execute experiments with `run_experiment`.
## Basic Usage
```python
from phoenix.client import Client
from phoenix.client.experiments import run_experiment
client = Client()
dataset = client.datasets.get_dataset(name="qa-test-v1")
def my_task(example):
return call_llm(example.input["question"])
def exact_match(output, expected):
return 1.0 if output.strip().lower() == expected["answer"].strip().lower() else 0.0
experiment = run_experiment(
dataset=dataset,
task=my_task,
evaluators=[exact_match],
experiment_name="qa-experiment-v1",
)
```
## Task Functions
```python
# Basic task
def task(example):
return call_llm(example.input["question"])
# With context (RAG)
def rag_task(example):
return call_llm(f"Context: {example.input['context']}\nQ: {example.input['question']}")
```
## Evaluator Parameters
| Parameter | Access |
| --------- | ------ |
| `output` | Task output |
| `expected` | Example expected output |
| `input` | Example input |
| `metadata` | Example metadata |
## Options
```python
experiment = run_experiment(
dataset=dataset,
task=my_task,
evaluators=evaluators,
experiment_name="my-experiment",
dry_run=3, # Test with 3 examples
repetitions=3, # Run each example 3 times
)
```
## Results
```python
print(experiment.aggregate_scores)
# {'accuracy': 0.85, 'faithfulness': 0.92}
for run in experiment.runs:
print(run.output, run.scores)
```
## Add Evaluations Later
```python
from phoenix.client.experiments import evaluate_experiment
evaluate_experiment(experiment=experiment, evaluators=[new_evaluator])
```

View File

@@ -0,0 +1,82 @@
# Experiments: Running Experiments in TypeScript
Execute experiments with `runExperiment`.
## Basic Usage
```typescript
import { createClient } from "@arizeai/phoenix-client";
import {
runExperiment,
asExperimentEvaluator,
} from "@arizeai/phoenix-client/experiments";
const client = createClient();
const task = async (example: { input: Record<string, unknown> }) => {
return await callLLM(example.input.question as string);
};
const exactMatch = asExperimentEvaluator({
name: "exact_match",
kind: "CODE",
evaluate: async ({ output, expected }) => ({
score: output === expected?.answer ? 1.0 : 0.0,
label: output === expected?.answer ? "match" : "no_match",
}),
});
const experiment = await runExperiment({
client,
experimentName: "qa-experiment-v1",
dataset: { datasetId: "your-dataset-id" },
task,
evaluators: [exactMatch],
});
```
## Task Functions
```typescript
// Basic task
const task = async (example) => await callLLM(example.input.question as string);
// With context (RAG)
const ragTask = async (example) => {
const prompt = `Context: ${example.input.context}\nQ: ${example.input.question}`;
return await callLLM(prompt);
};
```
## Evaluator Parameters
```typescript
interface EvaluatorParams {
input: Record<string, unknown>;
output: unknown;
expected: Record<string, unknown>;
metadata: Record<string, unknown>;
}
```
## Options
```typescript
const experiment = await runExperiment({
client,
experimentName: "my-experiment",
dataset: { datasetName: "qa-test-v1" },
task,
evaluators,
repetitions: 3, // Run each example 3 times
maxConcurrency: 5, // Limit concurrent executions
});
```
## Add Evaluations Later
```typescript
import { evaluateExperiment } from "@arizeai/phoenix-client/experiments";
await evaluateExperiment({ client, experiment, evaluators: [newEvaluator] });
```

View File

@@ -0,0 +1,70 @@
# Experiments: Generating Synthetic Test Data
Creating diverse, targeted test data for evaluation.
## Dimension-Based Approach
Define axes of variation, then generate combinations:
```python
dimensions = {
"issue_type": ["billing", "technical", "shipping"],
"customer_mood": ["frustrated", "neutral", "happy"],
"complexity": ["simple", "moderate", "complex"],
}
```
## Two-Step Generation
1. **Generate tuples** (combinations of dimension values)
2. **Convert to natural queries** (separate LLM call per tuple)
```python
# Step 1: Create tuples
tuples = [
("billing", "frustrated", "complex"),
("shipping", "neutral", "simple"),
]
# Step 2: Convert to natural query
def tuple_to_query(t):
prompt = f"""Generate a realistic customer message:
Issue: {t[0]}, Mood: {t[1]}, Complexity: {t[2]}
Write naturally, include typos if appropriate. Don't be formulaic."""
return llm(prompt)
```
## Target Failure Modes
Dimensions should target known failures from error analysis:
```python
# From error analysis findings
dimensions = {
"timezone": ["EST", "PST", "UTC", "ambiguous"], # Known failure
"date_format": ["ISO", "US", "EU", "relative"], # Known failure
}
```
## Quality Control
- **Validate**: Check for placeholder text, minimum length
- **Deduplicate**: Remove near-duplicate queries using embeddings
- **Balance**: Ensure coverage across dimension values
## When to Use
| Use Synthetic | Use Real Data |
| ------------- | ------------- |
| Limited production data | Sufficient traces |
| Testing edge cases | Validating actual behavior |
| Pre-launch evals | Post-launch monitoring |
## Sample Sizes
| Purpose | Size |
| ------- | ---- |
| Initial exploration | 50-100 |
| Comprehensive eval | 100-500 |
| Per-dimension | 10-20 per combination |

View File

@@ -0,0 +1,86 @@
# Experiments: Generating Synthetic Test Data (TypeScript)
Creating diverse, targeted test data for evaluation.
## Dimension-Based Approach
Define axes of variation, then generate combinations:
```typescript
const dimensions = {
issueType: ["billing", "technical", "shipping"],
customerMood: ["frustrated", "neutral", "happy"],
complexity: ["simple", "moderate", "complex"],
};
```
## Two-Step Generation
1. **Generate tuples** (combinations of dimension values)
2. **Convert to natural queries** (separate LLM call per tuple)
```typescript
import { generateText } from "ai";
import { openai } from "@ai-sdk/openai";
// Step 1: Create tuples
type Tuple = [string, string, string];
const tuples: Tuple[] = [
["billing", "frustrated", "complex"],
["shipping", "neutral", "simple"],
];
// Step 2: Convert to natural query
async function tupleToQuery(t: Tuple): Promise<string> {
const { text } = await generateText({
model: openai("gpt-4o"),
prompt: `Generate a realistic customer message:
Issue: ${t[0]}, Mood: ${t[1]}, Complexity: ${t[2]}
Write naturally, include typos if appropriate. Don't be formulaic.`,
});
return text;
}
```
## Target Failure Modes
Dimensions should target known failures from error analysis:
```typescript
// From error analysis findings
const dimensions = {
timezone: ["EST", "PST", "UTC", "ambiguous"], // Known failure
dateFormat: ["ISO", "US", "EU", "relative"], // Known failure
};
```
## Quality Control
- **Validate**: Check for placeholder text, minimum length
- **Deduplicate**: Remove near-duplicate queries using embeddings
- **Balance**: Ensure coverage across dimension values
```typescript
function validateQuery(query: string): boolean {
const minLength = 20;
const hasPlaceholder = /\[.*?\]|<.*?>/.test(query);
return query.length >= minLength && !hasPlaceholder;
}
```
## When to Use
| Use Synthetic | Use Real Data |
| ------------- | ------------- |
| Limited production data | Sufficient traces |
| Testing edge cases | Validating actual behavior |
| Pre-launch evals | Post-launch monitoring |
## Sample Sizes
| Purpose | Size |
| ------- | ---- |
| Initial exploration | 50-100 |
| Comprehensive eval | 100-500 |
| Per-dimension | 10-20 per combination |

View File

@@ -0,0 +1,43 @@
# Anti-Patterns
Common mistakes and fixes.
| Anti-Pattern | Problem | Fix |
| ------------ | ------- | --- |
| Generic metrics | Pre-built scores don't match your failures | Build from error analysis |
| Vibe-based | No quantification | Measure with experiments |
| Ignoring humans | Uncalibrated LLM judges | Validate >80% TPR/TNR |
| Premature automation | Evaluators for imagined problems | Let observed failures drive |
| Saturation blindness | 100% pass = no signal | Keep capability evals at 50-80% |
| Similarity metrics | BERTScore/ROUGE for generation | Use for retrieval only |
| Model switching | Hoping a model works better | Error analysis first |
## Quantify Changes
```python
baseline = run_experiment(dataset, old_prompt, evaluators)
improved = run_experiment(dataset, new_prompt, evaluators)
print(f"Improvement: {improved.pass_rate - baseline.pass_rate:+.1%}")
```
## Don't Use Similarity for Generation
```python
# BAD
score = bertscore(output, reference)
# GOOD
correct_facts = check_facts_against_source(output, context)
```
## Error Analysis Before Model Change
```python
# BAD
for model in models:
results = test(model)
# GOOD
failures = analyze_errors(results)
# Then decide if model change is warranted
```

View File

@@ -0,0 +1,58 @@
# Model Selection
Error analysis first, model changes last.
## Decision Tree
```
Performance Issue?
Error analysis suggests model problem?
NO → Fix prompts, retrieval, tools
YES → Is it a capability gap?
YES → Consider model change
NO → Fix the actual problem
```
## Judge Model Selection
| Principle | Action |
| --------- | ------ |
| Start capable | Use gpt-4o first |
| Optimize later | Test cheaper after criteria stable |
| Same model OK | Judge does different task |
```python
# Start with capable model
judge = ClassificationEvaluator(
llm=LLM(provider="openai", model="gpt-4o"),
...
)
# After validation, test cheaper
judge_cheap = ClassificationEvaluator(
llm=LLM(provider="openai", model="gpt-4o-mini"),
...
)
# Compare TPR/TNR on same test set
```
## Don't Model Shop
```python
# BAD
for model in ["gpt-4o", "claude-3", "gemini-pro"]:
results = run_experiment(dataset, task, model)
# GOOD
failures = analyze_errors(results)
# "Ignores context" → Fix prompt
# "Can't do math" → Maybe try better model
```
## When Model Change Is Warranted
- Failures persist after prompt optimization
- Capability gaps (reasoning, math, code)
- Error analysis confirms model limitation

View File

@@ -0,0 +1,76 @@
# Fundamentals
Application-specific tests for AI systems. Code first, LLM for nuance, human for truth.
## Evaluator Types
| Type | Speed | Cost | Use Case |
| ---- | ----- | ---- | -------- |
| **Code** | Fast | Cheap | Regex, JSON, format, exact match |
| **LLM** | Medium | Medium | Subjective quality, complex criteria |
| **Human** | Slow | Expensive | Ground truth, calibration |
**Decision:** Code first → LLM only when code can't capture criteria → Human for calibration.
## Score Structure
| Property | Required | Description |
| -------- | -------- | ----------- |
| `name` | Yes | Evaluator name |
| `kind` | Yes | `"code"`, `"llm"`, `"human"` |
| `score` | No* | 0-1 numeric |
| `label` | No* | `"pass"`, `"fail"` |
| `explanation` | No | Rationale |
*One of `score` or `label` required.
## Binary > Likert
Use pass/fail, not 1-5 scales. Clearer criteria, easier calibration.
```python
# Multiple binary checks instead of one Likert scale
evaluators = [
AnswersQuestion(), # Yes/No
UsesContext(), # Yes/No
NoHallucination(), # Yes/No
]
```
## Quick Patterns
### Code Evaluator
```python
from phoenix.evals import create_evaluator
@create_evaluator(name="has_citation", kind="code")
def has_citation(output: str) -> bool:
return bool(re.search(r'\[\d+\]', output))
```
### LLM Evaluator
```python
from phoenix.evals import ClassificationEvaluator, LLM
evaluator = ClassificationEvaluator(
name="helpfulness",
prompt_template="...",
llm=LLM(provider="openai", model="gpt-4o"),
choices={"not_helpful": 0, "helpful": 1}
)
```
### Run Experiment
```python
from phoenix.client.experiments import run_experiment
experiment = run_experiment(
dataset=dataset,
task=my_task,
evaluators=[evaluator1, evaluator2],
)
print(experiment.aggregate_scores)
```

View File

@@ -0,0 +1,101 @@
# Observe: Sampling Strategies
How to efficiently sample production traces for review.
## Strategies
### 1. Failure-Focused (Highest Priority)
```python
errors = spans_df[spans_df["status_code"] == "ERROR"]
negative_feedback = spans_df[spans_df["feedback"] == "negative"]
```
### 2. Outliers
```python
long_responses = spans_df.nlargest(50, "response_length")
slow_responses = spans_df.nlargest(50, "latency_ms")
```
### 3. Stratified (Coverage)
```python
# Sample equally from each category
by_query_type = spans_df.groupby("metadata.query_type").apply(
lambda x: x.sample(min(len(x), 20))
)
```
### 4. Metric-Guided
```python
# Review traces flagged by automated evaluators
flagged = spans_df[eval_results["label"] == "hallucinated"]
borderline = spans_df[(eval_results["score"] > 0.3) & (eval_results["score"] < 0.7)]
```
## Building a Review Queue
```python
def build_review_queue(spans_df, max_traces=100):
queue = pd.concat([
spans_df[spans_df["status_code"] == "ERROR"],
spans_df[spans_df["feedback"] == "negative"],
spans_df.nlargest(10, "response_length"),
spans_df.sample(min(30, len(spans_df))),
]).drop_duplicates("span_id").head(max_traces)
return queue
```
## Sample Size Guidelines
| Purpose | Size |
| ------- | ---- |
| Initial exploration | 50-100 |
| Error analysis | 100+ (until saturation) |
| Golden dataset | 100-500 |
| Judge calibration | 100+ per class |
**Saturation:** Stop when new traces show the same failure patterns.
## Trace-Level Sampling
When you need whole requests (all spans per trace), use `get_traces`:
```python
from phoenix.client import Client
from datetime import datetime, timedelta
client = Client()
# Recent traces with full span trees
traces = client.traces.get_traces(
project_identifier="my-app",
limit=100,
include_spans=True,
)
# Time-windowed sampling (e.g., last hour)
traces = client.traces.get_traces(
project_identifier="my-app",
start_time=datetime.now() - timedelta(hours=1),
limit=50,
include_spans=True,
)
# Filter by session (multi-turn conversations)
traces = client.traces.get_traces(
project_identifier="my-app",
session_id="user-session-abc",
include_spans=True,
)
# Sort by latency to find slowest requests
traces = client.traces.get_traces(
project_identifier="my-app",
sort="latency_ms",
order="desc",
limit=50,
)
```

View File

@@ -0,0 +1,147 @@
# Observe: Sampling Strategies (TypeScript)
How to efficiently sample production traces for review.
## Strategies
### 1. Failure-Focused (Highest Priority)
Use server-side filters to fetch only what you need:
```typescript
import { getSpans } from "@arizeai/phoenix-client/spans";
// Server-side filter — only ERROR spans are returned
const { spans: errors } = await getSpans({
project: { projectName: "my-project" },
statusCode: "ERROR",
limit: 100,
});
// Fetch only LLM spans
const { spans: llmSpans } = await getSpans({
project: { projectName: "my-project" },
spanKind: "LLM",
limit: 100,
});
// Filter by span name
const { spans: chatSpans } = await getSpans({
project: { projectName: "my-project" },
name: "chat_completion",
limit: 100,
});
```
### 2. Outliers
```typescript
const { spans } = await getSpans({
project: { projectName: "my-project" },
limit: 200,
});
const latency = (s: (typeof spans)[number]) =>
new Date(s.end_time).getTime() - new Date(s.start_time).getTime();
const sorted = [...spans].sort((a, b) => latency(b) - latency(a));
const slowResponses = sorted.slice(0, 50);
```
### 3. Stratified (Coverage)
```typescript
// Sample equally from each category
function stratifiedSample<T>(items: T[], groupBy: (item: T) => string, perGroup: number): T[] {
const groups = new Map<string, T[]>();
for (const item of items) {
const key = groupBy(item);
if (!groups.has(key)) groups.set(key, []);
groups.get(key)!.push(item);
}
return [...groups.values()].flatMap((g) => g.slice(0, perGroup));
}
const { spans } = await getSpans({
project: { projectName: "my-project" },
limit: 500,
});
const byQueryType = stratifiedSample(spans, (s) => s.attributes?.["metadata.query_type"] ?? "unknown", 20);
```
### 4. Metric-Guided
```typescript
import { getSpanAnnotations } from "@arizeai/phoenix-client/spans";
// Fetch annotations for your spans, then filter by label
const { annotations } = await getSpanAnnotations({
project: { projectName: "my-project" },
spanIds: spans.map((s) => s.context.span_id),
includeAnnotationNames: ["hallucination"],
});
const flaggedSpanIds = new Set(
annotations.filter((a) => a.result?.label === "hallucinated").map((a) => a.span_id)
);
const flagged = spans.filter((s) => flaggedSpanIds.has(s.context.span_id));
```
## Trace-Level Sampling
When you need whole requests (all spans in a trace), use `getTraces`:
```typescript
import { getTraces } from "@arizeai/phoenix-client/traces";
// Recent traces with full span trees
const { traces } = await getTraces({
project: { projectName: "my-project" },
limit: 100,
includeSpans: true,
});
// Filter by session (e.g., multi-turn conversations)
const { traces: sessionTraces } = await getTraces({
project: { projectName: "my-project" },
sessionId: "user-session-abc",
includeSpans: true,
});
// Time-windowed sampling
const { traces: recentTraces } = await getTraces({
project: { projectName: "my-project" },
startTime: new Date(Date.now() - 60 * 60 * 1000), // last hour
limit: 50,
includeSpans: true,
});
```
## Building a Review Queue
```typescript
// Combine server-side filters into a review queue
const { spans: errorSpans } = await getSpans({
project: { projectName: "my-project" },
statusCode: "ERROR",
limit: 30,
});
const { spans: allSpans } = await getSpans({
project: { projectName: "my-project" },
limit: 100,
});
const random = allSpans.sort(() => Math.random() - 0.5).slice(0, 30);
const combined = [...errorSpans, ...random];
const unique = [...new Map(combined.map((s) => [s.context.span_id, s])).values()];
const reviewQueue = unique.slice(0, 100);
```
## Sample Size Guidelines
| Purpose | Size |
| ------- | ---- |
| Initial exploration | 50-100 |
| Error analysis | 100+ (until saturation) |
| Golden dataset | 100-500 |
| Judge calibration | 100+ per class |
**Saturation:** Stop when new traces show the same failure patterns.

View File

@@ -0,0 +1,144 @@
# Observe: Tracing Setup
Configure tracing to capture data for evaluation.
## Quick Setup
```python
# Python
from phoenix.otel import register
register(project_name="my-app", auto_instrument=True)
```
```typescript
// TypeScript
import { registerPhoenix } from "@arizeai/phoenix-otel";
registerPhoenix({ projectName: "my-app", autoInstrument: true });
```
## Essential Attributes
| Attribute | Why It Matters |
| --------- | -------------- |
| `input.value` | User's request |
| `output.value` | Response to evaluate |
| `retrieval.documents` | Context for faithfulness |
| `tool.name`, `tool.parameters` | Agent evaluation |
| `llm.model_name` | Track by model |
## Custom Attributes for Evals
```python
span.set_attribute("metadata.client_type", "enterprise")
span.set_attribute("metadata.query_category", "billing")
```
## Exporting for Evaluation
### Spans (Python — DataFrame)
```python
from phoenix.client import Client
# Client() works for local Phoenix (falls back to env vars or localhost:6006)
# For remote/cloud: Client(base_url="https://app.phoenix.arize.com", api_key="...")
client = Client()
spans_df = client.spans.get_spans_dataframe(
project_identifier="my-app", # NOT project_name= (deprecated)
root_spans_only=True,
)
dataset = client.datasets.create_dataset(
name="error-analysis-set",
dataframe=spans_df[["input.value", "output.value"]],
input_keys=["input.value"],
output_keys=["output.value"],
)
```
### Spans (TypeScript)
```typescript
import { getSpans } from "@arizeai/phoenix-client/spans";
const { spans } = await getSpans({
project: { projectName: "my-app" },
parentId: null, // root spans only
limit: 100,
});
```
### Traces (Python — structured)
Use `get_traces` when you need full trace trees (e.g., multi-turn conversations, agent workflows):
```python
from datetime import datetime, timedelta
traces = client.traces.get_traces(
project_identifier="my-app",
start_time=datetime.now() - timedelta(hours=24),
include_spans=True, # includes all spans per trace
limit=100,
)
# Each trace has: trace_id, start_time, end_time, spans (when include_spans=True)
```
### Traces (TypeScript)
```typescript
import { getTraces } from "@arizeai/phoenix-client/traces";
const { traces } = await getTraces({
project: { projectName: "my-app" },
startTime: new Date(Date.now() - 24 * 60 * 60 * 1000),
includeSpans: true,
limit: 100,
});
```
## Uploading Evaluations as Annotations
### Python
```python
from phoenix.evals import evaluate_dataframe
from phoenix.evals.utils import to_annotation_dataframe
# Run evaluations
results_df = evaluate_dataframe(dataframe=spans_df, evaluators=[my_eval])
# Format results for Phoenix annotations
annotations_df = to_annotation_dataframe(results_df)
# Upload to Phoenix
client.spans.log_span_annotations_dataframe(dataframe=annotations_df)
```
### TypeScript
```typescript
import { logSpanAnnotations } from "@arizeai/phoenix-client/spans";
await logSpanAnnotations({
spanAnnotations: [
{
spanId: "abc123",
name: "quality",
label: "good",
score: 0.95,
annotatorKind: "LLM",
},
],
});
```
Annotations are visible in the Phoenix UI alongside your traces.
## Verify
Required attributes: `input.value`, `output.value`, `status_code`
For RAG: `retrieval.documents`
For agents: `tool.name`, `tool.parameters`

View File

@@ -0,0 +1,137 @@
# Production: Continuous Evaluation
Capability vs regression evals and the ongoing feedback loop.
## Two Types of Evals
| Type | Pass Rate Target | Purpose | Update |
| ---- | ---------------- | ------- | ------ |
| **Capability** | 50-80% | Measure improvement | Add harder cases |
| **Regression** | 95-100% | Catch breakage | Add fixed bugs |
## Saturation
When capability evals hit >95% pass rate, they're saturated:
1. Graduate passing cases to regression suite
2. Add new challenging cases to capability suite
## Feedback Loop
```
Production → Sample traffic → Run evaluators → Find failures
↑ ↓
Deploy ← Run CI evals ← Create test cases ← Error analysis
```
## Implementation
Build a continuous monitoring loop:
1. **Sample recent traces** at regular intervals (e.g., 100 traces per hour)
2. **Run evaluators** on sampled traces
3. **Log results** to Phoenix for tracking
4. **Queue concerning results** for human review
5. **Create test cases** from recurring failure patterns
### Python
```python
from phoenix.client import Client
from datetime import datetime, timedelta
client = Client()
# 1. Sample recent spans (includes full attributes for evaluation)
spans_df = client.spans.get_spans_dataframe(
project_identifier="my-app",
start_time=datetime.now() - timedelta(hours=1),
root_spans_only=True,
limit=100,
)
# 2. Run evaluators
from phoenix.evals import evaluate_dataframe
results_df = evaluate_dataframe(
dataframe=spans_df,
evaluators=[quality_eval, safety_eval],
)
# 3. Upload results as annotations
from phoenix.evals.utils import to_annotation_dataframe
annotations_df = to_annotation_dataframe(results_df)
client.spans.log_span_annotations_dataframe(dataframe=annotations_df)
```
### TypeScript
```typescript
import { getSpans } from "@arizeai/phoenix-client/spans";
import { logSpanAnnotations } from "@arizeai/phoenix-client/spans";
// 1. Sample recent spans
const { spans } = await getSpans({
project: { projectName: "my-app" },
startTime: new Date(Date.now() - 60 * 60 * 1000),
parentId: null, // root spans only
limit: 100,
});
// 2. Run evaluators (user-defined)
const results = await Promise.all(
spans.map(async (span) => ({
spanId: span.context.span_id,
...await runEvaluators(span, [qualityEval, safetyEval]),
}))
);
// 3. Upload results as annotations
await logSpanAnnotations({
spanAnnotations: results.map((r) => ({
spanId: r.spanId,
name: "quality",
score: r.qualityScore,
label: r.qualityLabel,
annotatorKind: "LLM" as const,
})),
});
```
For trace-level monitoring (e.g., agent workflows), use `get_traces`/`getTraces` to identify traces:
```python
# Python: identify slow traces
traces = client.traces.get_traces(
project_identifier="my-app",
start_time=datetime.now() - timedelta(hours=1),
sort="latency_ms",
order="desc",
limit=50,
)
```
```typescript
// TypeScript: identify slow traces
import { getTraces } from "@arizeai/phoenix-client/traces";
const { traces } = await getTraces({
project: { projectName: "my-app" },
startTime: new Date(Date.now() - 60 * 60 * 1000),
limit: 50,
});
```
## Alerting
| Condition | Severity | Action |
| --------- | -------- | ------ |
| Regression < 98% | Critical | Page oncall |
| Capability declining | Warning | Slack notify |
| Capability > 95% for 7d | Info | Schedule review |
## Key Principles
- **Two suites** - Capability + Regression always
- **Graduate cases** - Move consistent passes to regression
- **Track trends** - Monitor over time, not just snapshots

View File

@@ -0,0 +1,53 @@
# Production: Guardrails vs Evaluators
Guardrails block in real-time. Evaluators measure asynchronously.
## Key Distinction
```
Request → [INPUT GUARDRAIL] → LLM → [OUTPUT GUARDRAIL] → Response
└──→ ASYNC EVALUATOR (background)
```
## Guardrails
| Aspect | Requirement |
| ------ | ----------- |
| Timing | Synchronous, blocking |
| Latency | < 100ms |
| Purpose | Prevent harm |
| Type | Code-based (deterministic) |
**Use for:** PII detection, prompt injection, profanity, length limits, format validation.
## Evaluators
| Aspect | Characteristic |
| ------ | -------------- |
| Timing | Async, background |
| Latency | Can be seconds |
| Purpose | Measure quality |
| Type | Can use LLMs |
**Use for:** Helpfulness, faithfulness, tone, completeness, citation accuracy.
## Decision
| Question | Answer |
| -------- | ------ |
| Must block harmful content? | Guardrail |
| Measuring quality? | Evaluator |
| Need LLM judgment? | Evaluator |
| < 100ms required? | Guardrail |
| False positives = angry users? | Evaluator |
## LLM Guardrails: Rarely
Only use LLM guardrails if:
- Latency budget > 1s
- Error cost >> LLM cost
- Low volume
- Fallback exists
**Key Principle:** Guardrails prevent harm (block). Evaluators measure quality (log).

View File

@@ -0,0 +1,92 @@
# Production: Overview
CI/CD evals vs production monitoring - complementary approaches.
## Two Evaluation Modes
| Aspect | CI/CD Evals | Production Monitoring |
| ------ | ----------- | -------------------- |
| **When** | Pre-deployment | Post-deployment, ongoing |
| **Data** | Fixed dataset | Sampled traffic |
| **Goal** | Prevent regression | Detect drift |
| **Response** | Block deploy | Alert & analyze |
## CI/CD Evaluations
```python
# Fast, deterministic checks
ci_evaluators = [
has_required_format,
no_pii_leak,
safety_check,
regression_test_suite,
]
# Small but representative dataset (~100 examples)
run_experiment(ci_dataset, task, ci_evaluators)
```
Set thresholds: regression=0.95, safety=1.0, format=0.98.
## Production Monitoring
### Python
```python
from phoenix.client import Client
from datetime import datetime, timedelta
client = Client()
# Sample recent traces (last hour)
traces = client.traces.get_traces(
project_identifier="my-app",
start_time=datetime.now() - timedelta(hours=1),
include_spans=True,
limit=100,
)
# Run evaluators on sampled traffic
for trace in traces:
results = run_evaluators_async(trace, production_evaluators)
if any(r["score"] < 0.5 for r in results):
alert_on_failure(trace, results)
```
### TypeScript
```typescript
import { getTraces } from "@arizeai/phoenix-client/traces";
import { getSpans } from "@arizeai/phoenix-client/spans";
// Sample recent traces (last hour)
const { traces } = await getTraces({
project: { projectName: "my-app" },
startTime: new Date(Date.now() - 60 * 60 * 1000),
includeSpans: true,
limit: 100,
});
// Or sample spans directly for evaluation
const { spans } = await getSpans({
project: { projectName: "my-app" },
startTime: new Date(Date.now() - 60 * 60 * 1000),
limit: 100,
});
// Run evaluators on sampled traffic
for (const span of spans) {
const results = await runEvaluators(span, productionEvaluators);
if (results.some((r) => r.score < 0.5)) {
await alertOnFailure(span, results);
}
}
```
Prioritize: errors → negative feedback → random sample.
## Feedback Loop
```
Production finds failure → Error analysis → Add to CI dataset → Prevents future regression
```

View File

@@ -0,0 +1,64 @@
# Setup: Python
Packages required for Phoenix evals and experiments.
## Installation
```bash
# Core Phoenix package (includes client, evals, otel)
pip install arize-phoenix
# Or install individual packages
pip install arize-phoenix-client # Phoenix client only
pip install arize-phoenix-evals # Evaluation utilities
pip install arize-phoenix-otel # OpenTelemetry integration
```
## LLM Providers
For LLM-as-judge evaluators, install your provider's SDK:
```bash
pip install openai # OpenAI
pip install anthropic # Anthropic
pip install google-generativeai # Google
```
## Validation (Optional)
```bash
pip install scikit-learn # For TPR/TNR metrics
```
## Quick Verify
```python
from phoenix.client import Client
from phoenix.evals import LLM, ClassificationEvaluator
from phoenix.otel import register
# All imports should work
print("Phoenix Python setup complete")
```
## Key Imports (Evals 2.0)
```python
from phoenix.client import Client
from phoenix.evals import (
ClassificationEvaluator, # LLM classification evaluator (preferred)
LLM, # Provider-agnostic LLM wrapper
async_evaluate_dataframe, # Batch evaluate a DataFrame (preferred, async)
evaluate_dataframe, # Batch evaluate a DataFrame (sync)
create_evaluator, # Decorator for code-based evaluators
create_classifier, # Factory for LLM classification evaluators
bind_evaluator, # Map column names to evaluator params
Score, # Score dataclass
)
from phoenix.evals.utils import to_annotation_dataframe # Format results for Phoenix annotations
```
**Prefer**: `ClassificationEvaluator` over `create_classifier` (more parameters/customization).
**Prefer**: `async_evaluate_dataframe` over `evaluate_dataframe` (better throughput for LLM evals).
**Do NOT use** legacy 1.0 imports: `OpenAIModel`, `AnthropicModel`, `run_evals`, `llm_classify`.

View File

@@ -0,0 +1,41 @@
# Setup: TypeScript
Packages required for Phoenix evals and experiments.
## Installation
```bash
# Using npm
npm install @arizeai/phoenix-client @arizeai/phoenix-evals @arizeai/phoenix-otel
# Using pnpm
pnpm add @arizeai/phoenix-client @arizeai/phoenix-evals @arizeai/phoenix-otel
```
## LLM Providers
For LLM-as-judge evaluators, install Vercel AI SDK providers:
```bash
npm install ai @ai-sdk/openai # Vercel AI SDK + OpenAI
npm install @ai-sdk/anthropic # Anthropic
npm install @ai-sdk/google # Google
```
Or use direct provider SDKs:
```bash
npm install openai # OpenAI direct
npm install @anthropic-ai/sdk # Anthropic direct
```
## Quick Verify
```typescript
import { createClient } from "@arizeai/phoenix-client";
import { createClassificationEvaluator } from "@arizeai/phoenix-evals";
import { registerPhoenix } from "@arizeai/phoenix-otel";
// All imports should work
console.log("Phoenix TypeScript setup complete");
```

View File

@@ -0,0 +1,43 @@
# Validating Evaluators (Python)
Validate LLM evaluators against human-labeled examples. Target >80% TPR/TNR/Accuracy.
## Calculate Metrics
```python
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(human_labels, evaluator_predictions))
cm = confusion_matrix(human_labels, evaluator_predictions)
tn, fp, fn, tp = cm.ravel()
tpr = tp / (tp + fn)
tnr = tn / (tn + fp)
print(f"TPR: {tpr:.2f}, TNR: {tnr:.2f}")
```
## Correct Production Estimates
```python
def correct_estimate(observed, tpr, tnr):
"""Adjust observed pass rate using known TPR/TNR."""
return (observed - (1 - tnr)) / (tpr - (1 - tnr))
```
## Find Misclassified
```python
# False Positives: Evaluator pass, human fail
fp_mask = (evaluator_predictions == 1) & (human_labels == 0)
false_positives = dataset[fp_mask]
# False Negatives: Evaluator fail, human pass
fn_mask = (evaluator_predictions == 0) & (human_labels == 1)
false_negatives = dataset[fn_mask]
```
## Red Flags
- TPR or TNR < 70%
- Large gap between TPR and TNR
- Kappa < 0.6

View File

@@ -0,0 +1,106 @@
# Validating Evaluators (TypeScript)
Validate an LLM evaluator against human-labeled examples before deploying it.
Target: **>80% TPR and >80% TNR**.
Roles are inverted compared to a normal task experiment:
| Normal experiment | Evaluator validation |
|---|---|
| Task = agent logic | Task = run the evaluator under test |
| Evaluator = judge output | Evaluator = exact-match vs human ground truth |
| Dataset = agent examples | Dataset = golden hand-labeled examples |
## Golden Dataset
Use a separate dataset name so validation experiments don't mix with task experiments in Phoenix.
Store human ground truth in `metadata.groundTruthLabel`. Aim for ~50/50 balance:
```typescript
import type { Example } from "@arizeai/phoenix-client/types/datasets";
const goldenExamples: Example[] = [
{ input: { q: "Capital of France?" }, output: { answer: "Paris" }, metadata: { groundTruthLabel: "correct" } },
{ input: { q: "Capital of France?" }, output: { answer: "Lyon" }, metadata: { groundTruthLabel: "incorrect" } },
{ input: { q: "Capital of France?" }, output: { answer: "Major city..." }, metadata: { groundTruthLabel: "incorrect" } },
];
const VALIDATOR_DATASET = "my-app-qa-evaluator-validation"; // separate from task dataset
const POSITIVE_LABEL = "correct";
const NEGATIVE_LABEL = "incorrect";
```
## Validation Experiment
```typescript
import { createClient } from "@arizeai/phoenix-client";
import { createOrGetDataset, getDatasetExamples } from "@arizeai/phoenix-client/datasets";
import { asExperimentEvaluator, runExperiment } from "@arizeai/phoenix-client/experiments";
import { myEvaluator } from "./myEvaluator.js";
const client = createClient();
const { datasetId } = await createOrGetDataset({ client, name: VALIDATOR_DATASET, examples: goldenExamples });
const { examples } = await getDatasetExamples({ client, dataset: { datasetId } });
const groundTruth = new Map(examples.map((ex) => [ex.id, ex.metadata?.groundTruthLabel as string]));
// Task: invoke the evaluator under test
const task = async (example: (typeof examples)[number]) => {
const result = await myEvaluator.evaluate({ input: example.input, output: example.output, metadata: example.metadata });
return result.label ?? "unknown";
};
// Evaluator: exact-match against human ground truth
const exactMatch = asExperimentEvaluator({
name: "exact-match", kind: "CODE",
evaluate: ({ output, metadata }) => {
const expected = metadata?.groundTruthLabel as string;
const predicted = typeof output === "string" ? output : "unknown";
return { score: predicted === expected ? 1 : 0, label: predicted, explanation: `Expected: ${expected}, Got: ${predicted}` };
},
});
const experiment = await runExperiment({
client, experimentName: `evaluator-validation-${Date.now()}`,
dataset: { datasetId }, task, evaluators: [exactMatch],
});
// Compute confusion matrix
const runs = Object.values(experiment.runs);
const predicted = new Map((experiment.evaluationRuns ?? [])
.filter((e) => e.name === "exact-match")
.map((e) => [e.experimentRunId, e.result?.label ?? null]));
let tp = 0, fp = 0, tn = 0, fn = 0;
for (const run of runs) {
if (run.error) continue;
const p = predicted.get(run.id), a = groundTruth.get(run.datasetExampleId);
if (!p || !a) continue;
if (a === POSITIVE_LABEL && p === POSITIVE_LABEL) tp++;
else if (a === NEGATIVE_LABEL && p === POSITIVE_LABEL) fp++;
else if (a === NEGATIVE_LABEL && p === NEGATIVE_LABEL) tn++;
else if (a === POSITIVE_LABEL && p === NEGATIVE_LABEL) fn++;
}
const total = tp + fp + tn + fn;
const tpr = tp + fn > 0 ? (tp / (tp + fn)) * 100 : 0;
const tnr = tn + fp > 0 ? (tn / (tn + fp)) * 100 : 0;
console.log(`TPR: ${tpr.toFixed(1)}% TNR: ${tnr.toFixed(1)}% Accuracy: ${((tp + tn) / total * 100).toFixed(1)}%`);
```
## Results & Quality Rules
| Metric | Target | Low value means |
|---|---|---|
| TPR (sensitivity) | >80% | Misses real failures (false negatives) |
| TNR (specificity) | >80% | Flags good outputs (false positives) |
| Accuracy | >80% | General weakness |
**Golden dataset rules:** ~50/50 balance · include edge cases · human-labeled only · never mutate (append new versions) · 20–50 examples is enough.
**Re-validate when:** prompt template changes · judge model changes · criteria updated · production FP/FN spike.
## See Also
- `validation.md` — Metric definitions and concepts
- `experiments-running-typescript.md``runExperiment` API
- `experiments-datasets-typescript.md``createOrGetDataset` / `getDatasetExamples`

View File

@@ -0,0 +1,74 @@
# Validation
Validate LLM judges against human labels before deploying. Target >80% agreement.
## Requirements
| Requirement | Target |
| ----------- | ------ |
| Test set size | 100+ examples |
| Balance | ~50/50 pass/fail |
| Accuracy | >80% |
| TPR/TNR | Both >70% |
## Metrics
| Metric | Formula | Use When |
| ------ | ------- | -------- |
| **Accuracy** | (TP+TN) / Total | General |
| **TPR (Recall)** | TP / (TP+FN) | Quality assurance |
| **TNR (Specificity)** | TN / (TN+FP) | Safety-critical |
| **Cohen's Kappa** | Agreement beyond chance | Comparing evaluators |
## Quick Validation
```python
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score
print(classification_report(human_labels, evaluator_predictions))
print(f"Kappa: {cohen_kappa_score(human_labels, evaluator_predictions):.3f}")
# Get TPR/TNR
cm = confusion_matrix(human_labels, evaluator_predictions)
tn, fp, fn, tp = cm.ravel()
tpr = tp / (tp + fn)
tnr = tn / (tn + fp)
```
## Golden Dataset Structure
```python
golden_example = {
"input": "What is the capital of France?",
"output": "Paris is the capital.",
"ground_truth_label": "correct",
}
```
## Building Golden Datasets
1. Sample production traces (errors, negative feedback, edge cases)
2. Balance ~50/50 pass/fail
3. Expert labels each example
4. Version datasets (never modify existing)
```python
# GOOD - create new version
golden_v2 = golden_v1 + [new_examples]
# BAD - never modify existing
golden_v1.append(new_example)
```
## Warning Signs
- All pass or all fail → too lenient/strict
- Random results → criteria unclear
- TPR/TNR < 70% → needs improvement
## Re-Validate When
- Prompt template changes
- Judge model changes
- Criteria changes
- Monthly

View File

@@ -0,0 +1,24 @@
# Phoenix Tracing Skill
OpenInference semantic conventions and instrumentation guides for Phoenix.
## Usage
Start with `SKILL.md` for the index and quick reference.
## File Organization
All files in flat `rules/` directory with semantic prefixes:
- `span-*` - Span kinds (LLM, CHAIN, TOOL, etc.)
- `setup-*`, `instrumentation-*` - Getting started guides
- `fundamentals-*`, `attributes-*` - Reference docs
- `annotations-*`, `export-*` - Advanced features
## Reference
- [OpenInference Spec](https://github.com/Arize-ai/openinference/tree/main/spec)
- [Phoenix Documentation](https://docs.arize.com/phoenix)
- [Python OTEL API](https://arize-phoenix.readthedocs.io/projects/otel/en/latest/)
- [Python Client API](https://arize-phoenix.readthedocs.io/projects/client/en/latest/)
- [TypeScript API](https://arize-ai.github.io/phoenix/)

View File

@@ -0,0 +1,139 @@
---
name: phoenix-tracing
description: OpenInference semantic conventions and instrumentation for Phoenix AI observability. Use when implementing LLM tracing, creating custom spans, or deploying to production.
license: Apache-2.0
compatibility: Requires Phoenix server. Python skills need arize-phoenix-otel; TypeScript skills need @arizeai/phoenix-otel.
metadata:
author: oss@arize.com
version: "1.0.0"
languages: "Python, TypeScript"
---
# Phoenix Tracing
Comprehensive guide for instrumenting LLM applications with OpenInference tracing in Phoenix. Contains reference files covering setup, instrumentation, span types, and production deployment.
## When to Apply
Reference these guidelines when:
- Setting up Phoenix tracing (Python or TypeScript)
- Creating custom spans for LLM operations
- Adding attributes following OpenInference conventions
- Deploying tracing to production
- Querying and analyzing trace data
## Reference Categories
| Priority | Category | Description | Prefix |
| -------- | --------------- | ------------------------------ | -------------------------- |
| 1 | Setup | Installation and configuration | `setup-*` |
| 2 | Instrumentation | Auto and manual tracing | `instrumentation-*` |
| 3 | Span Types | 9 span kinds with attributes | `span-*` |
| 4 | Organization | Projects and sessions | `projects-*`, `sessions-*` |
| 5 | Enrichment | Custom metadata | `metadata-*` |
| 6 | Production | Batch processing, masking | `production-*` |
| 7 | Feedback | Annotations and evaluation | `annotations-*` |
## Quick Reference
### 1. Setup (START HERE)
- [setup-python](references/setup-python.md) - Install arize-phoenix-otel, configure endpoint
- [setup-typescript](references/setup-typescript.md) - Install @arizeai/phoenix-otel, configure endpoint
### 2. Instrumentation
- [instrumentation-auto-python](references/instrumentation-auto-python.md) - Auto-instrument OpenAI, LangChain, etc.
- [instrumentation-auto-typescript](references/instrumentation-auto-typescript.md) - Auto-instrument supported frameworks
- [instrumentation-manual-python](references/instrumentation-manual-python.md) - Custom spans with decorators
- [instrumentation-manual-typescript](references/instrumentation-manual-typescript.md) - Custom spans with wrappers
### 3. Span Types (with full attribute schemas)
- [span-llm](references/span-llm.md) - LLM API calls (model, tokens, messages, cost)
- [span-chain](references/span-chain.md) - Multi-step workflows and pipelines
- [span-retriever](references/span-retriever.md) - Document retrieval (documents, scores)
- [span-tool](references/span-tool.md) - Function/API calls (name, parameters)
- [span-agent](references/span-agent.md) - Multi-step reasoning agents
- [span-embedding](references/span-embedding.md) - Vector generation
- [span-reranker](references/span-reranker.md) - Document re-ranking
- [span-guardrail](references/span-guardrail.md) - Safety checks
- [span-evaluator](references/span-evaluator.md) - LLM evaluation
### 4. Organization
- [projects-python](references/projects-python.md) / [projects-typescript](references/projects-typescript.md) - Group traces by application
- [sessions-python](references/sessions-python.md) / [sessions-typescript](references/sessions-typescript.md) - Track conversations
### 5. Enrichment
- [metadata-python](references/metadata-python.md) / [metadata-typescript](references/metadata-typescript.md) - Custom attributes
### 6. Production (CRITICAL)
- [production-python](references/production-python.md) / [production-typescript](references/production-typescript.md) - Batch processing, PII masking
### 7. Feedback
- [annotations-overview](references/annotations-overview.md) - Feedback concepts
- [annotations-python](references/annotations-python.md) / [annotations-typescript](references/annotations-typescript.md) - Add feedback to spans
### Reference Files
- [fundamentals-overview](references/fundamentals-overview.md) - Traces, spans, attributes basics
- [fundamentals-required-attributes](references/fundamentals-required-attributes.md) - Required fields per span type
- [fundamentals-universal-attributes](references/fundamentals-universal-attributes.md) - Common attributes (user.id, session.id)
- [fundamentals-flattening](references/fundamentals-flattening.md) - JSON flattening rules
- [attributes-messages](references/attributes-messages.md) - Chat message format
- [attributes-metadata](references/attributes-metadata.md) - Custom metadata schema
- [attributes-graph](references/attributes-graph.md) - Agent workflow attributes
- [attributes-exceptions](references/attributes-exceptions.md) - Error tracking
## Common Workflows
- **Quick Start**: setup-{lang} → instrumentation-auto-{lang} → Check Phoenix
- **Custom Spans**: setup-{lang} → instrumentation-manual-{lang} → span-{type}
- **Session Tracking**: sessions-{lang} for conversation grouping patterns
- **Production**: production-{lang} for batching, masking, and deployment
## How to Use This Skill
**Navigation Patterns:**
```bash
# By category prefix
references/setup-* # Installation and configuration
references/instrumentation-* # Auto and manual tracing
references/span-* # Span type specifications
references/sessions-* # Session tracking
references/production-* # Production deployment
references/fundamentals-* # Core concepts
references/attributes-* # Attribute specifications
# By language
references/*-python.md # Python implementations
references/*-typescript.md # TypeScript implementations
```
**Reading Order:**
1. Start with setup-{lang} for your language
2. Choose instrumentation-auto-{lang} OR instrumentation-manual-{lang}
3. Reference span-{type} files as needed for specific operations
4. See fundamentals-* files for attribute specifications
## References
**Phoenix Documentation:**
- [Phoenix Documentation](https://docs.arize.com/phoenix)
- [OpenInference Spec](https://github.com/Arize-ai/openinference/tree/main/spec)
**Python API Documentation:**
- [Python OTEL Package](https://arize-phoenix.readthedocs.io/projects/otel/en/latest/) - `arize-phoenix-otel` API reference
- [Python Client Package](https://arize-phoenix.readthedocs.io/projects/client/en/latest/) - `arize-phoenix-client` API reference
**TypeScript API Documentation:**
- [TypeScript Packages](https://arize-ai.github.io/phoenix/) - `@arizeai/phoenix-otel`, `@arizeai/phoenix-client`, and other TypeScript packages

View File

@@ -0,0 +1,69 @@
# Annotations Overview
Annotations allow you to add human or automated feedback to traces, spans, documents, and sessions. Annotations are essential for evaluation, quality assessment, and building training datasets.
## Annotation Types
Phoenix supports four types of annotations:
| Type | Target | Purpose | Example Use Case |
| ----------------------- | -------------------------------- | ---------------------------------------- | -------------------------------- |
| **Span Annotation** | Individual span | Feedback on a specific operation | "This LLM response was accurate" |
| **Document Annotation** | Document within a RETRIEVER span | Feedback on retrieved document relevance | "This document was not helpful" |
| **Trace Annotation** | Entire trace | Feedback on end-to-end interaction | "User was satisfied with result" |
| **Session Annotation** | User session | Feedback on multi-turn conversation | "Session ended successfully" |
## Annotation Fields
Every annotation has these fields:
### Required Fields
| Field | Type | Description |
| --------- | ------ | ----------------------------------------------------------------------------- |
| Entity ID | String | ID of the target entity (span_id, trace_id, session_id, or document_position) |
| `name` | String | Annotation name/label (e.g., "quality", "relevance", "helpfulness") |
### Result Fields (At Least One Required)
| Field | Type | Description |
| ------------- | ----------------- | ----------------------------------------------------------------- |
| `label` | String (optional) | Categorical value (e.g., "good", "bad", "relevant", "irrelevant") |
| `score` | Float (optional) | Numeric value (typically 0-1, but can be any range) |
| `explanation` | String (optional) | Free-text explanation of the annotation |
**At least one** of `label`, `score`, or `explanation` must be provided.
### Optional Fields
| Field | Type | Description |
| ---------------- | ------ | --------------------------------------------------------------------------------------- |
| `annotator_kind` | String | Who created this annotation: "HUMAN", "LLM", or "CODE" (default: "HUMAN") |
| `identifier` | String | Unique identifier for upsert behavior (updates existing if same name+entity+identifier) |
| `metadata` | Object | Custom metadata as key-value pairs |
## Annotator Kinds
| Kind | Description | Example |
| ------- | ------------------------------ | --------------------------------- |
| `HUMAN` | Manual feedback from a person | User ratings, expert labels |
| `LLM` | Automated feedback from an LLM | GPT-4 evaluating response quality |
| `CODE` | Automated feedback from code | Rule-based checks, heuristics |
## Examples
**Quality Assessment:**
- `quality` - Overall quality (label: good/fair/poor, score: 0-1)
- `correctness` - Factual accuracy (label: correct/incorrect, score: 0-1)
- `helpfulness` - User satisfaction (label: helpful/not_helpful, score: 0-1)
**RAG-Specific:**
- `relevance` - Document relevance to query (label: relevant/irrelevant, score: 0-1)
- `faithfulness` - Answer grounded in context (label: faithful/unfaithful, score: 0-1)
**Safety:**
- `toxicity` - Contains harmful content (score: 0-1)
- `pii_detected` - Contains personally identifiable information (label: yes/no)

View File

@@ -0,0 +1,114 @@
# Python SDK Annotation Patterns
Add feedback to spans, traces, documents, and sessions using the Python client.
## Client Setup
```python
from phoenix.client import Client
client = Client() # Default: http://localhost:6006
```
## Span Annotations
Add feedback to individual spans:
```python
client.spans.add_span_annotation(
span_id="abc123",
annotation_name="quality",
annotator_kind="HUMAN",
label="high_quality",
score=0.95,
explanation="Accurate and well-formatted",
metadata={"reviewer": "alice"},
sync=True
)
```
## Document Annotations
Rate individual documents in RETRIEVER spans:
```python
client.spans.add_document_annotation(
span_id="retriever_span",
document_position=0, # 0-based index
annotation_name="relevance",
annotator_kind="LLM",
label="relevant",
score=0.95
)
```
## Trace Annotations
Feedback on entire traces:
```python
client.traces.add_trace_annotation(
trace_id="trace_abc",
annotation_name="correctness",
annotator_kind="HUMAN",
label="correct",
score=1.0
)
```
## Session Annotations
Feedback on multi-turn conversations:
```python
client.sessions.add_session_annotation(
session_id="session_xyz",
annotation_name="user_satisfaction",
annotator_kind="HUMAN",
label="satisfied",
score=0.85
)
```
## RAG Pipeline Example
```python
from phoenix.client import Client
from phoenix.client.resources.spans import SpanDocumentAnnotationData
client = Client()
# Document relevance (batch)
client.spans.log_document_annotations(
document_annotations=[
SpanDocumentAnnotationData(
name="relevance", span_id="retriever_span", document_position=i,
annotator_kind="LLM", result={"label": label, "score": score}
)
for i, (label, score) in enumerate([
("relevant", 0.95), ("relevant", 0.80), ("irrelevant", 0.10)
])
]
)
# LLM response quality
client.spans.add_span_annotation(
span_id="llm_span",
annotation_name="faithfulness",
annotator_kind="LLM",
label="faithful",
score=0.90
)
# Overall trace quality
client.traces.add_trace_annotation(
trace_id="trace_123",
annotation_name="correctness",
annotator_kind="HUMAN",
label="correct",
score=1.0
)
```
## API Reference
- [Python Client API](https://arize-phoenix.readthedocs.io/projects/client/en/latest/)

View File

@@ -0,0 +1,137 @@
# TypeScript SDK Annotation Patterns
Add feedback to spans, traces, documents, and sessions using the TypeScript client.
## Client Setup
```typescript
import { createClient } from "@arizeai/phoenix-client";
const client = createClient(); // Default: http://localhost:6006
```
## Span Annotations
Add feedback to individual spans:
```typescript
import { addSpanAnnotation } from "@arizeai/phoenix-client";
await addSpanAnnotation({
client,
spanAnnotation: {
spanId: "abc123",
name: "quality",
annotatorKind: "HUMAN",
label: "high_quality",
score: 0.95,
explanation: "Accurate and well-formatted",
metadata: { reviewer: "alice" }
},
sync: true
});
```
## Document Annotations
Rate individual documents in RETRIEVER spans:
```typescript
import { addDocumentAnnotation } from "@arizeai/phoenix-client";
await addDocumentAnnotation({
client,
documentAnnotation: {
spanId: "retriever_span",
documentPosition: 0, // 0-based index
name: "relevance",
annotatorKind: "LLM",
label: "relevant",
score: 0.95
}
});
```
## Trace Annotations
Feedback on entire traces:
```typescript
import { addTraceAnnotation } from "@arizeai/phoenix-client";
await addTraceAnnotation({
client,
traceAnnotation: {
traceId: "trace_abc",
name: "correctness",
annotatorKind: "HUMAN",
label: "correct",
score: 1.0
}
});
```
## Session Annotations
Feedback on multi-turn conversations:
```typescript
import { addSessionAnnotation } from "@arizeai/phoenix-client";
await addSessionAnnotation({
client,
sessionAnnotation: {
sessionId: "session_xyz",
name: "user_satisfaction",
annotatorKind: "HUMAN",
label: "satisfied",
score: 0.85
}
});
```
## RAG Pipeline Example
```typescript
import { createClient, logDocumentAnnotations, addSpanAnnotation, addTraceAnnotation } from "@arizeai/phoenix-client";
const client = createClient();
// Document relevance (batch)
await logDocumentAnnotations({
client,
documentAnnotations: [
{ spanId: "retriever_span", documentPosition: 0, name: "relevance",
annotatorKind: "LLM", label: "relevant", score: 0.95 },
{ spanId: "retriever_span", documentPosition: 1, name: "relevance",
annotatorKind: "LLM", label: "relevant", score: 0.80 }
]
});
// LLM response quality
await addSpanAnnotation({
client,
spanAnnotation: {
spanId: "llm_span",
name: "faithfulness",
annotatorKind: "LLM",
label: "faithful",
score: 0.90
}
});
// Overall trace quality
await addTraceAnnotation({
client,
traceAnnotation: {
traceId: "trace_123",
name: "correctness",
annotatorKind: "HUMAN",
label: "correct",
score: 1.0
}
});
```
## API Reference
- [TypeScript Client API](https://arize-ai.github.io/phoenix/)

View File

@@ -0,0 +1,58 @@
# Flattening Convention
OpenInference flattens nested data structures into dot-notation attributes for database compatibility, OpenTelemetry compatibility, and simple querying.
## Flattening Rules
**Objects → Dot Notation**
```javascript
{ llm: { model_name: "gpt-4", token_count: { prompt: 10, completion: 20 } } }
// becomes
{ "llm.model_name": "gpt-4", "llm.token_count.prompt": 10, "llm.token_count.completion": 20 }
```
**Arrays → Zero-Indexed Notation**
```javascript
{ llm: { input_messages: [{ role: "user", content: "Hi" }] } }
// becomes
{ "llm.input_messages.0.message.role": "user", "llm.input_messages.0.message.content": "Hi" }
```
**Message Convention: `.message.` segment required**
```
llm.input_messages.{index}.message.{field}
llm.input_messages.0.message.tool_calls.0.tool_call.function.name
```
## Complete Example
```javascript
// Original
{
openinference: { span: { kind: "LLM" } },
llm: {
model_name: "claude-3-5-sonnet-20241022",
invocation_parameters: { temperature: 0.7, max_tokens: 1000 },
input_messages: [{ role: "user", content: "Tell me a joke" }],
output_messages: [{ role: "assistant", content: "Why did the chicken cross the road?" }],
token_count: { prompt: 5, completion: 10, total: 15 }
}
}
// Flattened (stored in Phoenix spans.attributes JSONB)
{
"openinference.span.kind": "LLM",
"llm.model_name": "claude-3-5-sonnet-20241022",
"llm.invocation_parameters": "{\"temperature\": 0.7, \"max_tokens\": 1000}",
"llm.input_messages.0.message.role": "user",
"llm.input_messages.0.message.content": "Tell me a joke",
"llm.output_messages.0.message.role": "assistant",
"llm.output_messages.0.message.content": "Why did the chicken cross the road?",
"llm.token_count.prompt": 5,
"llm.token_count.completion": 10,
"llm.token_count.total": 15
}
```

View File

@@ -0,0 +1,53 @@
# Overview and Traces & Spans
This document covers the fundamental concepts of OpenInference traces and spans in Phoenix.
## Overview
OpenInference is a set of semantic conventions for AI and LLM applications based on OpenTelemetry. Phoenix uses these conventions to capture, store, and analyze traces from AI applications.
**Key Concepts:**
- **Traces** represent end-to-end requests through your application
- **Spans** represent individual operations within a trace (LLM calls, retrievals, tool invocations)
- **Attributes** are key-value pairs attached to spans using flattened, dot-notation paths
- **Span Kinds** categorize the type of operation (LLM, RETRIEVER, TOOL, etc.)
## Traces and Spans
### Trace Hierarchy
A **trace** is a tree of **spans** representing a complete request:
```
Trace ID: abc123
├─ Span 1: CHAIN (root span, parent_id = null)
│ ├─ Span 2: RETRIEVER (parent_id = span_1_id)
│ │ └─ Span 3: EMBEDDING (parent_id = span_2_id)
│ └─ Span 4: LLM (parent_id = span_1_id)
│ └─ Span 5: TOOL (parent_id = span_4_id)
```
### Context Propagation
Spans maintain parent-child relationships via:
- `trace_id` - Same for all spans in a trace
- `span_id` - Unique identifier for this span
- `parent_id` - References parent span's `span_id` (null for root spans)
Phoenix uses these relationships to:
- Build the span tree visualization in the UI
- Calculate cumulative metrics (tokens, errors) up the tree
- Enable nested querying (e.g., "find CHAIN spans containing LLM spans with errors")
### Span Lifecycle
Each span has:
- `start_time` - When the operation began (Unix timestamp in nanoseconds)
- `end_time` - When the operation completed
- `status_code` - OK, ERROR, or UNSET
- `status_message` - Optional error message
- `attributes` - object with all semantic convention attributes

View File

@@ -0,0 +1,64 @@
# Required and Recommended Attributes
This document covers the required attribute and highly recommended attributes for all OpenInference spans.
## Required Attribute
**Every span MUST have exactly one required attribute:**
```json
{
"openinference.span.kind": "LLM"
}
```
## Highly Recommended Attributes
While not strictly required, these attributes are **highly recommended** on all spans as they:
- Enable evaluation and quality assessment
- Help understand information flow through your application
- Make traces more useful for debugging
### Input/Output Values
| Attribute | Type | Description |
|-----------|------|-------------|
| `input.value` | String | Input to the operation (prompt, query, document) |
| `output.value` | String | Output from the operation (response, result, answer) |
**Example:**
```json
{
"openinference.span.kind": "LLM",
"input.value": "What is the capital of France?",
"output.value": "The capital of France is Paris."
}
```
**Why these matter:**
- **Evaluations**: Many evaluators (faithfulness, relevance, hallucination detection) require both input and output to assess quality
- **Information flow**: Seeing inputs/outputs makes it easy to trace how data transforms through your application
- **Debugging**: When something goes wrong, having the actual input/output makes root cause analysis much faster
- **Analytics**: Enables pattern analysis across similar inputs or outputs
**Phoenix Behavior:**
- Input/output displayed prominently in span details
- Evaluators can automatically access these values
- Search/filter traces by input or output content
- Export inputs/outputs for fine-tuning datasets
## Valid Span Kinds
There are exactly **9 valid span kinds** in OpenInference:
| Span Kind | Purpose | Common Use Case |
|-----------|---------|-----------------|
| `LLM` | Language model inference | OpenAI, Anthropic, local LLM calls |
| `EMBEDDING` | Vector generation | Text-to-vector conversion |
| `CHAIN` | Application flow orchestration | LangChain chains, custom workflows |
| `RETRIEVER` | Document/context retrieval | Vector DB queries, semantic search |
| `RERANKER` | Result reordering | Rerank retrieved documents |
| `TOOL` | External tool invocation | API calls, function execution |
| `AGENT` | Autonomous reasoning | ReAct agents, planning loops |
| `GUARDRAIL` | Safety/policy checks | Content moderation, PII detection |
| `EVALUATOR` | Quality assessment | Answer relevance, faithfulness scoring |

View File

@@ -0,0 +1,72 @@
# Universal Attributes
This document covers attributes that can be used on any span kind in OpenInference.
## Overview
These attributes can be used on **any span kind** to provide additional context, tracking, and metadata.
## Input/Output
| Attribute | Type | Description |
| ------------------ | ------ | ---------------------------------------------------- |
| `input.value` | String | Input to the operation (prompt, query, document) |
| `input.mime_type` | String | MIME type (e.g., "text/plain", "application/json") |
| `output.value` | String | Output from the operation (response, vector, result) |
| `output.mime_type` | String | MIME type of output |
### Why Capture I/O?
**Always capture input/output for evaluation-ready spans:**
- Phoenix evaluators (faithfulness, relevance, Q&A correctness) require `input.value` and `output.value`
- Phoenix UI displays I/O prominently in trace views for debugging
- Enables exporting I/O for creating fine-tuning datasets
- Provides complete context for analyzing agent behavior
**Example attributes:**
```json
{
"openinference.span.kind": "CHAIN",
"input.value": "What is the weather?",
"input.mime_type": "text/plain",
"output.value": "I don't have access to weather data.",
"output.mime_type": "text/plain"
}
```
**See language-specific implementation:**
- TypeScript: `instrumentation-manual-typescript.md`
- Python: `instrumentation-manual-python.md`
## Session and User Tracking
| Attribute | Type | Description |
| ------------ | ------ | ---------------------------------------------- |
| `session.id` | String | Session identifier for grouping related traces |
| `user.id` | String | User identifier for per-user analysis |
**Example:**
```json
{
"openinference.span.kind": "LLM",
"session.id": "session_abc123",
"user.id": "user_xyz789"
}
```
## Metadata
| Attribute | Type | Description |
| ---------- | ------ | ------------------------------------------ |
| `metadata` | String | JSON-serialized object of key-value pairs |
**Example:**
```json
{
"openinference.span.kind": "LLM",
"metadata": "{\"environment\": \"production\", \"model_version\": \"v2.1\", \"cost_center\": \"engineering\"}"
}
```

View File

@@ -0,0 +1,85 @@
# Phoenix Tracing: Auto-Instrumentation (Python)
**Automatically create spans for LLM calls without code changes.**
## Overview
Auto-instrumentation patches supported libraries at runtime to create spans automatically. Use it for supported frameworks (LangChain, LlamaIndex, OpenAI SDK, etc.). For custom logic, see instrumentation-manual-python.md.
## Supported Frameworks
**Python:**
- LLM SDKs: OpenAI, Anthropic, Bedrock, Mistral, Vertex AI, Groq, Ollama
- Frameworks: LangChain, LlamaIndex, DSPy, CrewAI, Instructor, Haystack
- Install: `pip install openinference-instrumentation-{name}`
## Setup
**Install and enable:**
```bash
pip install arize-phoenix-otel
pip install openinference-instrumentation-openai # Add others as needed
```
```python
from phoenix.otel import register
register(project_name="my-app", auto_instrument=True) # Discovers all installed instrumentors
```
**Example:**
```python
from phoenix.otel import register
from openai import OpenAI
register(project_name="my-app", auto_instrument=True)
client = OpenAI()
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Hello!"}]
)
```
Traces appear in Phoenix UI with model, input/output, tokens, timing automatically captured. See span kind files for full attribute schemas.
**Selective instrumentation** (explicit control):
```python
from phoenix.otel import register
from openinference.instrumentation.openai import OpenAIInstrumentor
tracer_provider = register(project_name="my-app") # No auto_instrument
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)
```
## Limitations
Auto-instrumentation does NOT capture:
- Custom business logic
- Internal function calls
**Example:**
```python
def my_custom_workflow(query: str) -> str:
preprocessed = preprocess(query) # Not traced
response = client.chat.completions.create(...) # Traced (auto)
postprocessed = postprocess(response) # Not traced
return postprocessed
```
**Solution:** Add manual instrumentation:
```python
@tracer.chain
def my_custom_workflow(query: str) -> str:
preprocessed = preprocess(query)
response = client.chat.completions.create(...)
postprocessed = postprocess(response)
return postprocessed
```

View File

@@ -0,0 +1,87 @@
# Auto-Instrumentation (TypeScript)
Automatically create spans for LLM calls without code changes.
## Supported Frameworks
- **LLM SDKs:** OpenAI
- **Frameworks:** LangChain
- **Install:** `npm install @arizeai/openinference-instrumentation-{name}`
## Setup
**CommonJS (automatic):**
```javascript
const { register } = require("@arizeai/phoenix-otel");
const OpenAI = require("openai");
register({ projectName: "my-app" });
const client = new OpenAI();
```
**ESM (manual required):**
```typescript
import { register, registerInstrumentations } from "@arizeai/phoenix-otel";
import { OpenAIInstrumentation } from "@arizeai/openinference-instrumentation-openai";
import OpenAI from "openai";
register({ projectName: "my-app" });
const instrumentation = new OpenAIInstrumentation();
instrumentation.manuallyInstrument(OpenAI);
registerInstrumentations({ instrumentations: [instrumentation] });
```
**Why:** ESM imports are hoisted before `register()` runs.
## Limitations
**What auto-instrumentation does NOT capture:**
```typescript
async function myWorkflow(query: string): Promise<string> {
const preprocessed = await preprocess(query); // Not traced
const response = await client.chat.completions.create(...); // Traced (auto)
const postprocessed = await postprocess(response); // Not traced
return postprocessed;
}
```
**Solution:** Add manual instrumentation for custom logic:
```typescript
import { traceChain } from "@arizeai/openinference-core";
const myWorkflow = traceChain(
async (query: string): Promise<string> => {
const preprocessed = await preprocess(query);
const response = await client.chat.completions.create(...);
const postprocessed = await postprocess(response);
return postprocessed;
},
{ name: "my-workflow" }
);
```
## Combining Auto + Manual
```typescript
import { register } from "@arizeai/phoenix-otel";
import { traceChain } from "@arizeai/openinference-core";
register({ projectName: "my-app" });
const client = new OpenAI();
const workflow = traceChain(
async (query: string) => {
const preprocessed = await preprocess(query);
const response = await client.chat.completions.create(...); // Auto-instrumented
return postprocess(response);
},
{ name: "my-workflow" }
);
```

View File

@@ -0,0 +1,182 @@
# Manual Instrumentation (Python)
Add custom spans using decorators or context managers for fine-grained tracing control.
## Setup
```bash
pip install arize-phoenix-otel
```
```python
from phoenix.otel import register
tracer_provider = register(project_name="my-app")
tracer = tracer_provider.get_tracer(__name__)
```
## Quick Reference
| Span Kind | Decorator | Use Case |
|-----------|-----------|----------|
| CHAIN | `@tracer.chain` | Orchestration, workflows, pipelines |
| RETRIEVER | `@tracer.retriever` | Vector search, document retrieval |
| TOOL | `@tracer.tool` | External API calls, function execution |
| AGENT | `@tracer.agent` | Multi-step reasoning, planning |
| LLM | `@tracer.llm` | LLM API calls (manual only) |
| EMBEDDING | `@tracer.embedding` | Embedding generation |
| RERANKER | `@tracer.reranker` | Document re-ranking |
| GUARDRAIL | `@tracer.guardrail` | Safety checks, content moderation |
| EVALUATOR | `@tracer.evaluator` | LLM evaluation, quality checks |
## Decorator Approach (Recommended)
**Use for:** Full function instrumentation, automatic I/O capture
```python
@tracer.chain
def rag_pipeline(query: str) -> str:
docs = retrieve_documents(query)
ranked = rerank(docs, query)
return generate_response(ranked, query)
@tracer.retriever
def retrieve_documents(query: str) -> list[dict]:
results = vector_db.search(query, top_k=5)
return [{"content": doc.text, "score": doc.score} for doc in results]
@tracer.tool
def get_weather(city: str) -> str:
response = requests.get(f"https://api.weather.com/{city}")
return response.json()["weather"]
```
**Custom span names:**
```python
@tracer.chain(name="rag-pipeline-v2")
def my_workflow(query: str) -> str:
return process(query)
```
## Context Manager Approach
**Use for:** Partial function instrumentation, custom attributes, dynamic control
```python
from opentelemetry.trace import Status, StatusCode
import json
def retrieve_with_metadata(query: str):
with tracer.start_as_current_span(
"vector_search",
openinference_span_kind="retriever"
) as span:
span.set_attribute("input.value", query)
results = vector_db.search(query, top_k=5)
documents = [
{
"document.id": doc.id,
"document.content": doc.text,
"document.score": doc.score
}
for doc in results
]
span.set_attribute("retrieval.documents", json.dumps(documents))
span.set_status(Status(StatusCode.OK))
return documents
```
## Capturing Input/Output
**Always capture I/O for evaluation-ready spans.**
### Automatic I/O Capture (Decorators)
Decorators automatically capture input arguments and return values:
```python
@tracer.chain
def handle_query(user_input: str) -> str:
result = agent.generate(user_input)
return result.text
# Automatically captures:
# - input.value: user_input
# - output.value: result.text
# - input.mime_type / output.mime_type: auto-detected
```
### Manual I/O Capture (Context Manager)
Use `set_input()` and `set_output()` for simple I/O capture:
```python
from opentelemetry.trace import Status, StatusCode
def handle_query(user_input: str) -> str:
with tracer.start_as_current_span(
"query.handler",
openinference_span_kind="chain"
) as span:
span.set_input(user_input)
result = agent.generate(user_input)
span.set_output(result.text)
span.set_status(Status(StatusCode.OK))
return result.text
```
**What gets captured:**
```json
{
"input.value": "What is 2+2?",
"input.mime_type": "text/plain",
"output.value": "2+2 equals 4.",
"output.mime_type": "text/plain"
}
```
**Why this matters:**
- Phoenix evaluators require `input.value` and `output.value`
- Phoenix UI displays I/O prominently for debugging
- Enables exporting data for fine-tuning datasets
### Custom I/O with Additional Metadata
Use `set_attribute()` for custom attributes alongside I/O:
```python
def process_query(query: str):
with tracer.start_as_current_span(
"query.process",
openinference_span_kind="chain"
) as span:
# Standard I/O
span.set_input(query)
# Custom metadata
span.set_attribute("input.length", len(query))
result = llm.generate(query)
# Standard output
span.set_output(result.text)
# Custom metadata
span.set_attribute("output.tokens", result.usage.total_tokens)
span.set_status(Status(StatusCode.OK))
return result
```
## See Also
- **Span attributes:** `span-chain.md`, `span-retriever.md`, `span-tool.md`, `span-llm.md`, `span-agent.md`, `span-embedding.md`, `span-reranker.md`, `span-guardrail.md`, `span-evaluator.md`
- **Auto-instrumentation:** `instrumentation-auto-python.md` for framework integrations
- **API docs:** https://docs.arize.com/phoenix/tracing/manual-instrumentation

View File

@@ -0,0 +1,172 @@
# Manual Instrumentation (TypeScript)
Add custom spans using convenience wrappers or withSpan for fine-grained tracing control.
## Setup
```bash
npm install @arizeai/phoenix-otel @arizeai/openinference-core
```
```typescript
import { register } from "@arizeai/phoenix-otel";
register({ projectName: "my-app" });
```
## Quick Reference
| Span Kind | Method | Use Case |
|-----------|--------|----------|
| CHAIN | `traceChain` | Workflows, pipelines, orchestration |
| AGENT | `traceAgent` | Multi-step reasoning, planning |
| TOOL | `traceTool` | External APIs, function calls |
| RETRIEVER | `withSpan` | Vector search, document retrieval |
| LLM | `withSpan` | LLM API calls (prefer auto-instrumentation) |
| EMBEDDING | `withSpan` | Embedding generation |
| RERANKER | `withSpan` | Document re-ranking |
| GUARDRAIL | `withSpan` | Safety checks, content moderation |
| EVALUATOR | `withSpan` | LLM evaluation |
## Convenience Wrappers
```typescript
import { traceChain, traceAgent, traceTool } from "@arizeai/openinference-core";
// CHAIN - workflows
const pipeline = traceChain(
async (query: string) => {
const docs = await retrieve(query);
return await generate(docs, query);
},
{ name: "rag-pipeline" }
);
// AGENT - reasoning
const agent = traceAgent(
async (question: string) => {
const thought = await llm.generate(`Think: ${question}`);
return await processThought(thought);
},
{ name: "my-agent" }
);
// TOOL - function calls
const getWeather = traceTool(
async (city: string) => fetch(`/api/weather/${city}`).then(r => r.json()),
{ name: "get-weather" }
);
```
## withSpan for Other Kinds
```typescript
import { withSpan, getInputAttributes, getRetrieverAttributes } from "@arizeai/openinference-core";
// RETRIEVER with custom attributes
const retrieve = withSpan(
async (query: string) => {
const results = await vectorDb.search(query, { topK: 5 });
return results.map(doc => ({ content: doc.text, score: doc.score }));
},
{
kind: "RETRIEVER",
name: "vector-search",
processInput: (query) => getInputAttributes(query),
processOutput: (docs) => getRetrieverAttributes({ documents: docs })
}
);
```
**Options:**
```typescript
withSpan(fn, {
kind: "RETRIEVER", // OpenInference span kind
name: "span-name", // Span name (defaults to function name)
processInput: (args) => {}, // Transform input to attributes
processOutput: (result) => {}, // Transform output to attributes
attributes: { key: "value" } // Static attributes
});
```
## Capturing Input/Output
**Always capture I/O for evaluation-ready spans.** Use `getInputAttributes` and `getOutputAttributes` helpers for automatic MIME type detection:
```typescript
import {
getInputAttributes,
getOutputAttributes,
withSpan,
} from "@arizeai/openinference-core";
const handleQuery = withSpan(
async (userInput: string) => {
const result = await agent.generate({ prompt: userInput });
return result;
},
{
name: "query.handler",
kind: "CHAIN",
// Use helpers - automatic MIME type detection
processInput: (input) => getInputAttributes(input),
processOutput: (result) => getOutputAttributes(result.text),
}
);
await handleQuery("What is 2+2?");
```
**What gets captured:**
```json
{
"input.value": "What is 2+2?",
"input.mime_type": "text/plain",
"output.value": "2+2 equals 4.",
"output.mime_type": "text/plain"
}
```
**Helper behavior:**
- Strings → `text/plain`
- Objects/Arrays → `application/json` (automatically serialized)
- `undefined`/`null` → No attributes set
**Why this matters:**
- Phoenix evaluators require `input.value` and `output.value`
- Phoenix UI displays I/O prominently for debugging
- Enables exporting data for fine-tuning datasets
### Custom I/O Processing
Add custom metadata alongside standard I/O attributes:
```typescript
const processWithMetadata = withSpan(
async (query: string) => {
const result = await llm.generate(query);
return result;
},
{
name: "query.process",
kind: "CHAIN",
processInput: (query) => ({
"input.value": query,
"input.mime_type": "text/plain",
"input.length": query.length, // Custom attribute
}),
processOutput: (result) => ({
"output.value": result.text,
"output.mime_type": "text/plain",
"output.tokens": result.usage?.totalTokens, // Custom attribute
}),
}
);
```
## See Also
- **Span attributes:** `span-chain.md`, `span-retriever.md`, `span-tool.md`, etc.
- **Attribute helpers:** https://docs.arize.com/phoenix/tracing/manual-instrumentation-typescript#attribute-helpers
- **Auto-instrumentation:** `instrumentation-auto-typescript.md` for framework integrations

View File

@@ -0,0 +1,87 @@
# Phoenix Tracing: Custom Metadata (Python)
Add custom attributes to spans for richer observability.
## Install
```bash
pip install openinference-instrumentation
```
## Session
```python
from openinference.instrumentation import using_session
with using_session(session_id="my-session-id"):
# Spans get: "session.id" = "my-session-id"
...
```
## User
```python
from openinference.instrumentation import using_user
with using_user("my-user-id"):
# Spans get: "user.id" = "my-user-id"
...
```
## Metadata
```python
from openinference.instrumentation import using_metadata
with using_metadata({"key": "value", "experiment_id": "exp_123"}):
# Spans get: "metadata" = '{"key": "value", "experiment_id": "exp_123"}'
...
```
## Tags
```python
from openinference.instrumentation import using_tags
with using_tags(["tag_1", "tag_2"]):
# Spans get: "tag.tags" = '["tag_1", "tag_2"]'
...
```
## Combined (using_attributes)
```python
from openinference.instrumentation import using_attributes
with using_attributes(
session_id="my-session-id",
user_id="my-user-id",
metadata={"environment": "production"},
tags=["prod", "v2"],
prompt_template="Answer: {question}",
prompt_template_version="v1.0",
prompt_template_variables={"question": "What is Phoenix?"},
):
# All attributes applied to spans in this context
...
```
## On a Single Span
```python
span.set_attribute("metadata", json.dumps({"key": "value"}))
span.set_attribute("user.id", "user_123")
span.set_attribute("session.id", "session_456")
```
## As Decorators
All context managers can be used as decorators:
```python
@using_session(session_id="my-session-id")
@using_user("my-user-id")
@using_metadata({"env": "prod"})
def my_function():
...
```

View File

@@ -0,0 +1,50 @@
# Phoenix Tracing: Custom Metadata (TypeScript)
Add custom attributes to spans for richer observability.
## Using Context (Propagates to All Child Spans)
```typescript
import { context } from "@arizeai/phoenix-otel";
import { setMetadata } from "@arizeai/openinference-core";
context.with(
setMetadata(context.active(), {
experiment_id: "exp_123",
model_version: "gpt-4-1106-preview",
environment: "production",
}),
async () => {
// All spans created within this block will have:
// "metadata" = '{"experiment_id": "exp_123", ...}'
await myApp.run(query);
}
);
```
## On a Single Span
```typescript
import { traceChain } from "@arizeai/openinference-core";
import { trace } from "@arizeai/phoenix-otel";
const myFunction = traceChain(
async (input: string) => {
const span = trace.getActiveSpan();
span?.setAttribute(
"metadata",
JSON.stringify({
experiment_id: "exp_123",
model_version: "gpt-4-1106-preview",
environment: "production",
})
);
      const result = await doWork(input); // your application logic here
      return result;
},
{ name: "my-function" }
);
await myFunction("hello");
```

View File

@@ -0,0 +1,58 @@
# Phoenix Tracing: Production Guide (Python)
**CRITICAL: Configure batching, data masking, and span filtering for production deployment.**
## Metadata
| Attribute | Value |
|-----------|-------|
| Priority | Critical - production readiness |
| Impact | Security, Performance |
| Setup Time | 5-15 min |
## Batch Processing
**Enable batch processing for production efficiency.** Batching reduces network overhead by sending spans in groups rather than individually.
## Data Masking (PII Protection)
**Environment variables:**
```bash
export OPENINFERENCE_HIDE_INPUTS=true # Hide input.value
export OPENINFERENCE_HIDE_OUTPUTS=true # Hide output.value
export OPENINFERENCE_HIDE_INPUT_MESSAGES=true # Hide LLM input messages
export OPENINFERENCE_HIDE_OUTPUT_MESSAGES=true # Hide LLM output messages
export OPENINFERENCE_HIDE_INPUT_IMAGES=true # Hide image content
export OPENINFERENCE_HIDE_INPUT_TEXT=true # Hide embedding text
export OPENINFERENCE_BASE64_IMAGE_MAX_LENGTH=10000 # Limit image size
```
**Python TraceConfig:**
```python
from phoenix.otel import register
from openinference.instrumentation import TraceConfig
config = TraceConfig(
hide_inputs=True,
hide_outputs=True,
hide_input_messages=True
)
register(trace_config=config)
```
**Precedence:** Code > Environment variables > Defaults
---
## Span Filtering
**Suppress specific code blocks:**
```python
from phoenix.otel import suppress_tracing
with suppress_tracing():
internal_logging() # No spans generated
```

View File

@@ -0,0 +1,148 @@
# Phoenix Tracing: Production Guide (TypeScript)
**CRITICAL: Configure batching, data masking, and span filtering for production deployment.**
## Metadata
| Attribute | Value |
|-----------|-------|
| Priority | Critical - production readiness |
| Impact | Security, Performance |
| Setup Time | 5-15 min |
## Batch Processing
**Enable batch processing for production efficiency.** Batching reduces network overhead by sending spans in groups rather than individually.
```typescript
import { register } from "@arizeai/phoenix-otel";
const provider = register({
projectName: "my-app",
batch: true, // Production default
});
```
### Shutdown Handling
**CRITICAL:** Spans may not be exported if still queued in the processor when your process exits. Call `provider.shutdown()` to explicitly flush before exit.
```typescript
// Explicit shutdown to flush queued spans
const provider = register({
projectName: "my-app",
batch: true,
});
async function main() {
await doWork();
await provider.shutdown(); // Flush spans before exit
}
main().catch(async (error) => {
console.error(error);
await provider.shutdown(); // Flush on error too
process.exit(1);
});
```
**Graceful termination signals:**
```typescript
// Graceful shutdown on SIGTERM
const provider = register({
projectName: "my-server",
batch: true,
});
process.on("SIGTERM", async () => {
await provider.shutdown();
process.exit(0);
});
```
---
## Data Masking (PII Protection)
**Environment variables:**
```bash
export OPENINFERENCE_HIDE_INPUTS=true # Hide input.value
export OPENINFERENCE_HIDE_OUTPUTS=true # Hide output.value
export OPENINFERENCE_HIDE_INPUT_MESSAGES=true # Hide LLM input messages
export OPENINFERENCE_HIDE_OUTPUT_MESSAGES=true # Hide LLM output messages
export OPENINFERENCE_HIDE_INPUT_IMAGES=true # Hide image content
export OPENINFERENCE_HIDE_INPUT_TEXT=true # Hide embedding text
export OPENINFERENCE_BASE64_IMAGE_MAX_LENGTH=10000 # Limit image size
```
**TypeScript TraceConfig:**
```typescript
import { register } from "@arizeai/phoenix-otel";
import { OpenAIInstrumentation } from "@arizeai/openinference-instrumentation-openai";
const traceConfig = {
hideInputs: true,
hideOutputs: true,
hideInputMessages: true
};
const instrumentation = new OpenAIInstrumentation({ traceConfig });
```
**Precedence:** Code > Environment variables > Defaults
---
## Span Filtering
**Suppress specific code blocks:**
```typescript
import { suppressTracing } from "@opentelemetry/core";
import { context } from "@opentelemetry/api";
await context.with(suppressTracing(context.active()), async () => {
internalLogging(); // No spans generated
});
```
**Sampling:**
```bash
export OTEL_TRACES_SAMPLER="parentbased_traceidratio"
export OTEL_TRACES_SAMPLER_ARG="0.1" # Sample 10%
```
---
## Error Handling
```typescript
import { SpanStatusCode } from "@opentelemetry/api";
try {
result = await riskyOperation();
span?.setStatus({ code: SpanStatusCode.OK });
} catch (e) {
span?.recordException(e);
span?.setStatus({ code: SpanStatusCode.ERROR });
throw e;
}
```
---
## Production Checklist
- [ ] Batch processing enabled
- [ ] **Shutdown handling:** Call `provider.shutdown()` before exit to flush queued spans
- [ ] **Graceful termination:** Flush spans on SIGTERM/SIGINT signals
- [ ] Data masking configured (`HIDE_INPUTS`/`HIDE_OUTPUTS` if PII)
- [ ] Span filtering for health checks/noisy paths
- [ ] Error handling implemented
- [ ] Graceful degradation if Phoenix unavailable
- [ ] Performance tested
- [ ] Monitoring configured (Phoenix UI checked)

View File

@@ -0,0 +1,73 @@
# Phoenix Tracing: Projects (Python)
**Organize traces by application using projects (Phoenix's top-level grouping).**
## Overview
Projects group traces for a single application or experiment.
**Use for:** Environments (dev/staging/prod), A/B testing, versioning
## Setup
### Environment Variable (Recommended)
```bash
export PHOENIX_PROJECT_NAME="my-app-prod"
```
```python
import os
os.environ["PHOENIX_PROJECT_NAME"] = "my-app-prod"
from phoenix.otel import register
register() # Uses "my-app-prod"
```
### Code
```python
from phoenix.otel import register
register(project_name="my-app-prod")
```
## Use Cases
**Environments:**
```python
# Dev, staging, prod
register(project_name="my-app-dev")
register(project_name="my-app-staging")
register(project_name="my-app-prod")
```
**A/B Testing:**
```python
# Compare models
register(project_name="chatbot-gpt4")
register(project_name="chatbot-claude")
```
**Versioning:**
```python
# Track versions
register(project_name="my-app-v1")
register(project_name="my-app-v2")
```
## Switching Projects (Python Notebooks Only)
```python
from openinference.instrumentation import dangerously_using_project
from phoenix.otel import register
register(project_name="my-app")
# Switch temporarily for evals
with dangerously_using_project("my-eval-project"):
run_evaluations()
```
**⚠️ Only use in notebooks/scripts, not production.**

View File

@@ -0,0 +1,54 @@
# Phoenix Tracing: Projects (TypeScript)
**Organize traces by application using projects (Phoenix's top-level grouping).**
## Overview
Projects group traces for a single application or experiment.
**Use for:** Environments (dev/staging/prod), A/B testing, versioning
## Setup
### Environment Variable (Recommended)
```bash
export PHOENIX_PROJECT_NAME="my-app-prod"
```
```typescript
process.env.PHOENIX_PROJECT_NAME = "my-app-prod";
import { register } from "@arizeai/phoenix-otel";
register(); // Uses "my-app-prod"
```
### Code
```typescript
import { register } from "@arizeai/phoenix-otel";
register({ projectName: "my-app-prod" });
```
## Use Cases
**Environments:**
```typescript
// Dev, staging, prod
register({ projectName: "my-app-dev" });
register({ projectName: "my-app-staging" });
register({ projectName: "my-app-prod" });
```
**A/B Testing:**
```typescript
// Compare models
register({ projectName: "chatbot-gpt4" });
register({ projectName: "chatbot-claude" });
```
**Versioning:**
```typescript
// Track versions
register({ projectName: "my-app-v1" });
register({ projectName: "my-app-v2" });
```

View File

@@ -0,0 +1,104 @@
# Sessions (Python)
Track multi-turn conversations by grouping traces with session IDs.
## Setup
```python
from openinference.instrumentation import using_session
with using_session(session_id="user_123_conv_456"):
response = llm.invoke(prompt)
```
## Best Practices
**Bad: Only parent span gets session ID**
```python
from openinference.semconv.trace import SpanAttributes
from opentelemetry import trace
span = trace.get_current_span()
span.set_attribute(SpanAttributes.SESSION_ID, session_id)
response = client.chat.completions.create(...)
```
**Good: All child spans inherit session ID**
```python
with using_session(session_id):
response = client.chat.completions.create(...)
result = my_custom_function()
```
**Why:** `using_session()` propagates session ID to all nested spans automatically.
## Session ID Patterns
```python
import uuid
session_id = str(uuid.uuid4())
session_id = f"user_{user_id}_conv_{conversation_id}"
session_id = f"debug_{timestamp}"
```
Good: `str(uuid.uuid4())`, `"user_123_conv_456"`
Bad: `"session_1"`, `"test"`, empty string
## Multi-Turn Chatbot Example
```python
import uuid
from openinference.instrumentation import using_session
session_id = str(uuid.uuid4())
messages = []
def send_message(user_input: str) -> str:
messages.append({"role": "user", "content": user_input})
with using_session(session_id):
response = client.chat.completions.create(
model="gpt-4",
messages=messages
)
assistant_message = response.choices[0].message.content
messages.append({"role": "assistant", "content": assistant_message})
return assistant_message
```
## Additional Attributes
```python
from openinference.instrumentation import using_attributes
with using_attributes(
user_id="user_123",
session_id="conv_456",
metadata={"tier": "premium", "region": "us-west"}
):
response = llm.invoke(prompt)
```
## LangChain Integration
LangChain threads are automatically recognized as sessions:
```python
from langchain.chat_models import ChatOpenAI
response = llm.invoke(
[HumanMessage(content="Hi!")],
config={"metadata": {"thread_id": "user_123_thread"}}
)
```
Phoenix recognizes: `thread_id`, `session_id`, `conversation_id`
## See Also
- **TypeScript sessions:** `sessions-typescript.md`
- **Session docs:** https://docs.arize.com/phoenix/tracing/sessions

View File

@@ -0,0 +1,199 @@
# Sessions (TypeScript)
Track multi-turn conversations by grouping traces with session IDs. **Use `withSpan` directly from `@arizeai/openinference-core`** - no wrappers or custom utilities needed.
## Core Concept
**Session Pattern:**
1. Generate a unique `session.id` once at application startup
2. Export SESSION_ID, import `withSpan` where needed
3. Use `withSpan` to create a parent CHAIN span with `session.id` for each interaction
4. All child spans (LLM, TOOL, AGENT, etc.) automatically group under the parent
5. Query traces by `session.id` in Phoenix to see all interactions
## Implementation (Best Practice)
### 1. Setup (instrumentation.ts)
```typescript
import { register } from "@arizeai/phoenix-otel";
import { randomUUID } from "node:crypto";
// Initialize Phoenix
register({
projectName: "your-app",
url: process.env.PHOENIX_COLLECTOR_ENDPOINT || "http://localhost:6006",
apiKey: process.env.PHOENIX_API_KEY,
batch: true,
});
// Generate and export session ID
export const SESSION_ID = randomUUID();
```
### 2. Usage (app code)
```typescript
import { withSpan } from "@arizeai/openinference-core";
import { SESSION_ID } from "./instrumentation";
// Use withSpan directly - no wrapper needed
const handleInteraction = withSpan(
async () => {
const result = await agent.generate({ prompt: userInput });
return result;
},
{
name: "cli.interaction",
kind: "CHAIN",
attributes: { "session.id": SESSION_ID },
}
);
// Call it
const result = await handleInteraction();
```
### With Input Parameters
```typescript
const processQuery = withSpan(
async (query: string) => {
return await agent.generate({ prompt: query });
},
{
name: "process.query",
kind: "CHAIN",
attributes: { "session.id": SESSION_ID },
}
);
await processQuery("What is 2+2?");
```
## Key Points
### Session ID Scope
- **CLI/Desktop Apps**: Generate once at process startup
- **Web Servers**: Generate per-user session (e.g., on login, store in session storage)
- **Stateless APIs**: Accept session.id as a parameter from client
### Span Hierarchy
```
cli.interaction (CHAIN) ← session.id here
├── ai.generateText (AGENT)
│ ├── ai.generateText.doGenerate (LLM)
│ └── ai.toolCall (TOOL)
└── ai.generateText.doGenerate (LLM)
```
The `session.id` is only set on the **root span**. Child spans are automatically grouped by the trace hierarchy.
### Querying Sessions
```bash
# Get all traces for a session
npx @arizeai/phoenix-cli trace list \
--endpoint http://localhost:6006 \
--project your-app \
--format raw \
--no-progress | \
jq '.[] | select(.spans[0].attributes["session.id"] == "YOUR-SESSION-ID")'
```
## Dependencies
```json
{
"dependencies": {
"@arizeai/openinference-core": "^2.0.5",
"@arizeai/phoenix-otel": "^0.4.1"
}
}
```
**Note:** `@opentelemetry/api` is NOT needed for the `withSpan` pattern — it's only required for manual span management or the Context API alternative shown later in this document.
## Why This Pattern?
1. **Simple**: Just export SESSION_ID, use withSpan directly - no wrappers
2. **Built-in**: `withSpan` from `@arizeai/openinference-core` handles everything
3. **Type-safe**: Preserves function signatures and type information
4. **Automatic lifecycle**: Handles span creation, error tracking, and cleanup
5. **Framework-agnostic**: Works with any LLM framework (AI SDK, LangChain, etc.)
6. **No extra deps**: Don't need `@opentelemetry/api` or custom utilities
## Adding More Attributes
```typescript
import { withSpan } from "@arizeai/openinference-core";
import { SESSION_ID } from "./instrumentation";
const handleWithContext = withSpan(
async (userInput: string) => {
return await agent.generate({ prompt: userInput });
},
{
name: "cli.interaction",
kind: "CHAIN",
attributes: {
"session.id": SESSION_ID,
"user.id": userId, // Track user
"metadata.environment": "prod", // Custom metadata
},
}
);
```
## Anti-Pattern: Don't Create Wrappers
**Don't do this:**
```typescript
// Unnecessary wrapper
export function withSessionTracking(fn) {
return withSpan(fn, { attributes: { "session.id": SESSION_ID } });
}
```
**Do this instead:**
```typescript
// Use withSpan directly
import { withSpan } from "@arizeai/openinference-core";
import { SESSION_ID } from "./instrumentation";
const handler = withSpan(fn, {
attributes: { "session.id": SESSION_ID }
});
```
## Alternative: Context API Pattern
For web servers or complex async flows where you need to propagate session IDs through middleware, you can use the Context API:
```typescript
import { context } from "@opentelemetry/api";
import { setSession } from "@arizeai/openinference-core";
await context.with(
setSession(context.active(), { sessionId: "user_123_conv_456" }),
async () => {
const response = await llm.invoke(prompt);
}
);
```
**Use Context API when:**
- Building web servers with middleware chains
- Session ID needs to flow through many async boundaries
- You don't control the call stack (e.g., framework-provided handlers)
**Use withSpan when:**
- Building CLI apps or scripts
- You control the function call points
- Simpler, more explicit code is preferred
## Related
- `fundamentals-universal-attributes.md` - Other universal attributes (user.id, metadata)
- `span-chain.md` - CHAIN span specification
- `sessions-python.md` - Python session tracking patterns

View File

@@ -0,0 +1,131 @@
# Phoenix Tracing: Python Setup
**Setup Phoenix tracing in Python with `arize-phoenix-otel`.**
## Metadata
| Attribute | Value |
| ---------- | ----------------------------------- |
| Priority | Critical - required for all tracing |
| Setup Time | <5 min |
## Quick Start (3 lines)
```python
from phoenix.otel import register
register(project_name="my-app", auto_instrument=True)
```
**Connects to `http://localhost:6006`, auto-instruments all supported libraries.**
## Installation
```bash
pip install arize-phoenix-otel
```
**Supported:** Python 3.10-3.13
## Configuration
### Environment Variables (Recommended)
```bash
export PHOENIX_API_KEY="your-api-key" # Required for Phoenix Cloud
export PHOENIX_COLLECTOR_ENDPOINT="http://localhost:6006" # Or Cloud URL
export PHOENIX_PROJECT_NAME="my-app" # Optional
```
### Python Code
```python
from phoenix.otel import register
tracer_provider = register(
project_name="my-app", # Project name
endpoint="http://localhost:6006", # Phoenix endpoint
auto_instrument=True, # Auto-instrument supported libs
batch=True, # Batch processing (default: True)
)
```
**Parameters:**
- `project_name`: Project name (overrides `PHOENIX_PROJECT_NAME`)
- `endpoint`: Phoenix URL (overrides `PHOENIX_COLLECTOR_ENDPOINT`)
- `auto_instrument`: Enable auto-instrumentation (default: False)
- `batch`: Use BatchSpanProcessor (default: True, production-recommended)
- `protocol`: `"http/protobuf"` (default) or `"grpc"`
## Auto-Instrumentation
Install instrumentors for your frameworks:
```bash
pip install openinference-instrumentation-openai # OpenAI SDK
pip install openinference-instrumentation-langchain # LangChain
pip install openinference-instrumentation-llama-index # LlamaIndex
# ... install others as needed
```
Then enable auto-instrumentation:
```python
register(project_name="my-app", auto_instrument=True)
```
Phoenix discovers and instruments all installed OpenInference packages automatically.
## Batch Processing (Production)
Enabled by default. Configure via environment variables:
```bash
export OTEL_BSP_SCHEDULE_DELAY=5000 # Batch every 5s
export OTEL_BSP_MAX_QUEUE_SIZE=2048 # Queue 2048 spans
export OTEL_BSP_MAX_EXPORT_BATCH_SIZE=512 # Send 512 spans/batch
```
**Link:** https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/
## Verification
1. Open Phoenix UI: `http://localhost:6006`
2. Navigate to your project
3. Run your application
4. Check for traces (appear within batch delay)
## Troubleshooting
**No traces:**
- Verify `PHOENIX_COLLECTOR_ENDPOINT` matches Phoenix server
- Set `PHOENIX_API_KEY` for Phoenix Cloud
- Confirm instrumentors installed
**Missing attributes:**
- Check span kind (see rules/ directory)
- Verify attribute names (see rules/ directory)
## Example
```python
from phoenix.otel import register
from openai import OpenAI
# Enable tracing with auto-instrumentation
register(project_name="my-chatbot", auto_instrument=True)
# OpenAI automatically instrumented
client = OpenAI()
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Hello!"}]
)
```
## API Reference
- [Python OTEL API Docs](https://arize-phoenix.readthedocs.io/projects/otel/en/latest/)
- [Python Client API Docs](https://arize-phoenix.readthedocs.io/projects/client/en/latest/)

View File

@@ -0,0 +1,170 @@
# TypeScript Setup
Setup Phoenix tracing in TypeScript/JavaScript with `@arizeai/phoenix-otel`.
## Metadata
| Attribute | Value |
|-----------|-------|
| Priority | Critical - required for all tracing |
| Setup Time | <5 min |
## Quick Start
```bash
npm install @arizeai/phoenix-otel
```
```typescript
import { register } from "@arizeai/phoenix-otel";
register({ projectName: "my-app" });
```
Connects to `http://localhost:6006` by default.
## Configuration
```typescript
import { register } from "@arizeai/phoenix-otel";
register({
projectName: "my-app",
url: "http://localhost:6006",
apiKey: process.env.PHOENIX_API_KEY,
batch: true
});
```
**Environment variables:**
```bash
export PHOENIX_API_KEY="your-api-key"
export PHOENIX_COLLECTOR_ENDPOINT="http://localhost:6006"
export PHOENIX_PROJECT_NAME="my-app"
```
## ESM vs CommonJS
**CommonJS (automatic):**
```javascript
const { register } = require("@arizeai/phoenix-otel");
register({ projectName: "my-app" });
const OpenAI = require("openai");
```
**ESM (manual instrumentation required):**
```typescript
import { register, registerInstrumentations } from "@arizeai/phoenix-otel";
import { OpenAIInstrumentation } from "@arizeai/openinference-instrumentation-openai";
import OpenAI from "openai";
register({ projectName: "my-app" });
const instrumentation = new OpenAIInstrumentation();
instrumentation.manuallyInstrument(OpenAI);
registerInstrumentations({ instrumentations: [instrumentation] });
```
**Why:** ESM imports are hoisted, so `manuallyInstrument()` is needed.
## Framework Integration
**Next.js (App Router):**
```typescript
// instrumentation.ts
export async function register() {
if (process.env.NEXT_RUNTIME === "nodejs") {
const { register } = await import("@arizeai/phoenix-otel");
register({ projectName: "my-nextjs-app" });
}
}
```
**Express.js:**
```typescript
import { register } from "@arizeai/phoenix-otel";
register({ projectName: "my-express-app" });
const app = express();
```
## Flushing Spans Before Exit
**CRITICAL:** Spans may not be exported if still queued in the processor when your process exits. Call `provider.shutdown()` to explicitly flush before exit.
**Standard pattern:**
```typescript
const provider = register({
projectName: "my-app",
batch: true,
});
async function main() {
await doWork();
await provider.shutdown(); // Flush spans before exit
}
main().catch(async (error) => {
console.error(error);
await provider.shutdown(); // Flush on error too
process.exit(1);
});
```
**Alternative:**
```typescript
// Use batch: false for immediate export (no shutdown needed)
register({
projectName: "my-app",
batch: false,
});
```
For production patterns including graceful termination, see `production-typescript.md`.
## Verification
1. Open Phoenix UI: `http://localhost:6006`
2. Run your application
3. Check for traces in your project
**Enable diagnostic logging:**
```typescript
import { DiagLogLevel, register } from "@arizeai/phoenix-otel";
register({
projectName: "my-app",
diagLogLevel: DiagLogLevel.DEBUG,
});
```
## Troubleshooting
**No traces:**
- Verify `PHOENIX_COLLECTOR_ENDPOINT` is correct
- Set `PHOENIX_API_KEY` for Phoenix Cloud
- For ESM: Ensure `manuallyInstrument()` is called
- **With `batch: true`:** Call `provider.shutdown()` before exit to flush queued spans (see Flushing Spans section)
**Traces missing:**
- With `batch: true`: Call `await provider.shutdown()` before process exit to flush queued spans
- Alternative: Set `batch: false` for immediate export (no shutdown needed)
**Missing attributes:**
- Check instrumentation is registered (ESM requires manual setup)
- See `instrumentation-auto-typescript.md`
## See Also
- **Auto-instrumentation:** `instrumentation-auto-typescript.md`
- **Manual instrumentation:** `instrumentation-manual-typescript.md`
- **API docs:** https://arize-ai.github.io/phoenix/

View File

@@ -0,0 +1,15 @@
# AGENT Spans
AGENT spans represent autonomous reasoning blocks (ReAct agents, planning loops, multi-step decision making).
**Required:** `openinference.span.kind` = "AGENT"
## Example
```json
{
"openinference.span.kind": "AGENT",
"input.value": "Book a flight to New York for next Monday",
"output.value": "I've booked flight AA123 departing Monday at 9:00 AM"
}
```

View File

@@ -0,0 +1,43 @@
# CHAIN Spans
## Purpose
CHAIN spans represent orchestration layers in your application (LangChain chains, custom workflows, application entry points). Often used as root spans.
## Required Attributes
| Attribute | Type | Description | Required |
| ------------------------- | ------ | --------------- | -------- |
| `openinference.span.kind` | String | Must be "CHAIN" | Yes |
## Common Attributes
CHAIN spans typically use [Universal Attributes](fundamentals-universal-attributes.md):
- `input.value` - Input to the chain (user query, request payload)
- `output.value` - Output from the chain (final response)
- `input.mime_type` / `output.mime_type` - Format indicators
## Example: Root Chain
```json
{
"openinference.span.kind": "CHAIN",
"input.value": "{\"question\": \"What is the capital of France?\"}",
"input.mime_type": "application/json",
"output.value": "{\"answer\": \"The capital of France is Paris.\", \"sources\": [\"doc_123\"]}",
"output.mime_type": "application/json",
"session.id": "session_abc123",
"user.id": "user_xyz789"
}
```
## Example: Nested Sub-Chain
```json
{
"openinference.span.kind": "CHAIN",
"input.value": "Summarize this document: ...",
"output.value": "This document discusses..."
}
```

View File

@@ -0,0 +1,91 @@
# EMBEDDING Spans
## Purpose
EMBEDDING spans represent vector generation operations (text-to-vector conversion for semantic search).
## Required Attributes
| Attribute | Type | Description | Required |
|-----------|------|-------------|----------|
| `openinference.span.kind` | String | Must be "EMBEDDING" | Yes |
| `embedding.model_name` | String | Embedding model identifier | Recommended |
## Attribute Reference
### Single Embedding
| Attribute | Type | Description |
|-----------|------|-------------|
| `embedding.model_name` | String | Embedding model identifier |
| `embedding.text` | String | Input text to embed |
| `embedding.vector` | String (JSON array) | Generated embedding vector |
**Example:**
```json
{
"embedding.model_name": "text-embedding-ada-002",
"embedding.text": "What is machine learning?",
"embedding.vector": "[0.023, -0.012, 0.045, ..., 0.001]"
}
```
### Batch Embeddings
| Attribute Pattern | Type | Description |
|-------------------|------|-------------|
| `embedding.embeddings.{i}.embedding.text` | String | Text at index i |
| `embedding.embeddings.{i}.embedding.vector` | String (JSON array) | Vector at index i |
**Example:**
```json
{
"embedding.model_name": "text-embedding-ada-002",
"embedding.embeddings.0.embedding.text": "First document",
"embedding.embeddings.0.embedding.vector": "[0.1, 0.2, 0.3, ..., 0.5]",
"embedding.embeddings.1.embedding.text": "Second document",
"embedding.embeddings.1.embedding.vector": "[0.6, 0.7, 0.8, ..., 0.9]"
}
```
### Vector Format
Vectors stored as JSON array strings:
- Dimensions: Typically 384, 768, 1536, or 3072
- Format: `"[0.123, -0.456, 0.789, ...]"`
- Precision: Usually 3-6 decimal places
**Storage Considerations:**
- Large vectors can significantly increase trace size
- Consider omitting vectors in production (keep `embedding.text` for debugging)
- Use separate vector database for actual similarity search
## Examples
### Single Embedding
```json
{
"openinference.span.kind": "EMBEDDING",
"embedding.model_name": "text-embedding-ada-002",
"embedding.text": "What is machine learning?",
"embedding.vector": "[0.023, -0.012, 0.045, ..., 0.001]",
"input.value": "What is machine learning?",
"output.value": "[0.023, -0.012, 0.045, ..., 0.001]"
}
```
### Batch Embeddings
```json
{
"openinference.span.kind": "EMBEDDING",
"embedding.model_name": "text-embedding-ada-002",
"embedding.embeddings.0.embedding.text": "First document",
"embedding.embeddings.0.embedding.vector": "[0.1, 0.2, 0.3]",
"embedding.embeddings.1.embedding.text": "Second document",
"embedding.embeddings.1.embedding.vector": "[0.4, 0.5, 0.6]",
"embedding.embeddings.2.embedding.text": "Third document",
"embedding.embeddings.2.embedding.vector": "[0.7, 0.8, 0.9]"
}
```

View File

@@ -0,0 +1,51 @@
# EVALUATOR Spans
## Purpose
EVALUATOR spans represent quality assessment operations (answer relevance, faithfulness, hallucination detection).
## Required Attributes
| Attribute | Type | Description | Required |
|-----------|------|-------------|----------|
| `openinference.span.kind` | String | Must be "EVALUATOR" | Yes |
## Common Attributes
| Attribute | Type | Description |
|-----------|------|-------------|
| `input.value` | String | Content being evaluated |
| `output.value` | String | Evaluation result (score, label, explanation) |
| `metadata.evaluator_name` | String | Evaluator identifier |
| `metadata.score` | Float | Numeric score (0-1) |
| `metadata.label` | String | Categorical label (relevant/irrelevant) |
## Example: Answer Relevance
```json
{
"openinference.span.kind": "EVALUATOR",
"input.value": "{\"question\": \"What is the capital of France?\", \"answer\": \"The capital of France is Paris.\"}",
"input.mime_type": "application/json",
"output.value": "0.95",
"metadata.evaluator_name": "answer_relevance",
"metadata.score": 0.95,
"metadata.label": "relevant",
"metadata.explanation": "Answer directly addresses the question with correct information"
}
```
## Example: Faithfulness Check
```json
{
"openinference.span.kind": "EVALUATOR",
"input.value": "{\"context\": \"Paris is in France.\", \"answer\": \"Paris is the capital of France.\"}",
"input.mime_type": "application/json",
"output.value": "0.5",
"metadata.evaluator_name": "faithfulness",
"metadata.score": 0.5,
"metadata.label": "partially_faithful",
"metadata.explanation": "Answer makes unsupported claim about Paris being the capital"
}
```

View File

@@ -0,0 +1,49 @@
# GUARDRAIL Spans
## Purpose
GUARDRAIL spans represent safety and policy checks (content moderation, PII detection, toxicity scoring).
## Required Attributes
| Attribute | Type | Description | Required |
|-----------|------|-------------|----------|
| `openinference.span.kind` | String | Must be "GUARDRAIL" | Yes |
## Common Attributes
| Attribute | Type | Description |
|-----------|------|-------------|
| `input.value` | String | Content being checked |
| `output.value` | String | Guardrail result (allowed/blocked/flagged) |
| `metadata.guardrail_type` | String | Type of check (toxicity, pii, bias) |
| `metadata.score` | Float | Safety score (0-1) |
| `metadata.threshold` | Float | Threshold for blocking |
## Example: Content Moderation
```json
{
"openinference.span.kind": "GUARDRAIL",
"input.value": "User message: I want to build a bomb",
"output.value": "BLOCKED",
"metadata.guardrail_type": "content_moderation",
"metadata.score": 0.95,
"metadata.threshold": 0.7,
"metadata.categories": "[\"violence\", \"weapons\"]",
"metadata.action": "block_and_log"
}
```
## Example: PII Detection
```json
{
"openinference.span.kind": "GUARDRAIL",
"input.value": "My SSN is 123-45-6789",
"output.value": "FLAGGED",
"metadata.guardrail_type": "pii_detection",
"metadata.detected_pii": "[\"ssn\"]",
"metadata.redacted_output": "My SSN is [REDACTED]"
}
```

---
# LLM Spans
## Purpose
LLM spans represent calls to language models (OpenAI, Anthropic, local models, etc.).
## Required Attributes
| Attribute | Type | Description |
|-----------|------|-------------|
| `openinference.span.kind` | String | Must be "LLM" |
| `llm.model_name` | String | Model identifier (e.g., "gpt-4", "claude-3-5-sonnet-20241022") |
## Key Attributes
| Category | Attributes | Example |
|----------|------------|---------|
| **Model** | `llm.model_name`, `llm.provider` | "gpt-4-turbo", "openai" |
| **Tokens** | `llm.token_count.prompt`, `llm.token_count.completion`, `llm.token_count.total` | 25, 8, 33 |
| **Cost** | `llm.cost.prompt`, `llm.cost.completion`, `llm.cost.total` | 0.0021, 0.0045, 0.0066 |
| **Parameters** | `llm.invocation_parameters` (JSON) | `{"temperature": 0.7, "max_tokens": 1024}` |
| **Messages** | `llm.input_messages.{i}.*`, `llm.output_messages.{i}.*` | See examples below |
| **Tools** | `llm.tools.{i}.tool.json_schema` | Function definitions |
## Cost Tracking
**Core attributes:**
- `llm.cost.prompt` - Total input cost (USD)
- `llm.cost.completion` - Total output cost (USD)
- `llm.cost.total` - Total cost (USD)
**Detailed cost breakdown:**
- `llm.cost.prompt_details.{input,cache_read,cache_write,audio}` - Input cost components
- `llm.cost.completion_details.{output,reasoning,audio}` - Output cost components
## Messages
**Input messages:**
- `llm.input_messages.{i}.message.role` - "user", "assistant", "system", "tool"
- `llm.input_messages.{i}.message.content` - Text content
- `llm.input_messages.{i}.message.contents.{j}` - Multimodal (text + images)
- `llm.input_messages.{i}.message.tool_calls` - Tool invocations
**Output messages:** Same structure as input messages.
## Example: Basic LLM Call
```json
{
"openinference.span.kind": "LLM",
"llm.model_name": "claude-3-5-sonnet-20241022",
"llm.invocation_parameters": "{\"temperature\": 0.7, \"max_tokens\": 1024}",
"llm.input_messages.0.message.role": "system",
"llm.input_messages.0.message.content": "You are a helpful assistant.",
"llm.input_messages.1.message.role": "user",
"llm.input_messages.1.message.content": "What is the capital of France?",
"llm.output_messages.0.message.role": "assistant",
"llm.output_messages.0.message.content": "The capital of France is Paris.",
"llm.token_count.prompt": 25,
"llm.token_count.completion": 8,
"llm.token_count.total": 33
}
```
## Example: LLM with Tool Calls
```json
{
"openinference.span.kind": "LLM",
"llm.model_name": "gpt-4-turbo",
"llm.input_messages.0.message.content": "What's the weather in SF?",
"llm.output_messages.0.message.tool_calls.0.tool_call.function.name": "get_weather",
"llm.output_messages.0.message.tool_calls.0.tool_call.function.arguments": "{\"location\": \"San Francisco\"}",
"llm.tools.0.tool.json_schema": "{\"type\": \"function\", \"function\": {\"name\": \"get_weather\"}}"
}
```
## See Also
- **Instrumentation:** `instrumentation-auto-python.md`, `instrumentation-manual-python.md`
- **Full spec:** https://github.com/Arize-ai/openinference/blob/main/spec/semantic_conventions.md

---
# RERANKER Spans
## Purpose
RERANKER spans represent reordering of retrieved documents (Cohere Rerank, cross-encoder models).
## Required Attributes
| Attribute | Type | Description | Required |
|-----------|------|-------------|----------|
| `openinference.span.kind` | String | Must be "RERANKER" | Yes |
## Attribute Reference
### Reranker Parameters
| Attribute | Type | Description |
|-----------|------|-------------|
| `reranker.model_name` | String | Reranker model identifier |
| `reranker.query` | String | Query used for reranking |
| `reranker.top_k` | Integer | Number of documents to return |
### Input Documents
| Attribute Pattern | Type | Description |
|-------------------|------|-------------|
| `reranker.input_documents.{i}.document.id` | String | Input document ID |
| `reranker.input_documents.{i}.document.content` | String | Input document content |
| `reranker.input_documents.{i}.document.score` | Float | Original retrieval score |
| `reranker.input_documents.{i}.document.metadata` | String (JSON) | Document metadata |
### Output Documents
| Attribute Pattern | Type | Description |
|-------------------|------|-------------|
| `reranker.output_documents.{i}.document.id` | String | Output document ID (reordered) |
| `reranker.output_documents.{i}.document.content` | String | Output document content |
| `reranker.output_documents.{i}.document.score` | Float | New reranker score |
| `reranker.output_documents.{i}.document.metadata` | String (JSON) | Document metadata |
### Score Comparison
Input scores (from retriever) vs. output scores (from reranker):
```json
{
"reranker.input_documents.0.document.id": "doc_A",
"reranker.input_documents.0.document.score": 0.7,
"reranker.input_documents.1.document.id": "doc_B",
"reranker.input_documents.1.document.score": 0.9,
"reranker.output_documents.0.document.id": "doc_B",
"reranker.output_documents.0.document.score": 0.95,
"reranker.output_documents.1.document.id": "doc_A",
"reranker.output_documents.1.document.score": 0.85
}
```
In this example:
- Input: doc_B (0.9) ranked higher than doc_A (0.7)
- Output: doc_B still highest but both scores increased
- Reranker confirmed retriever's ordering but refined scores
## Examples
### Complete Reranking Example
```json
{
"openinference.span.kind": "RERANKER",
"reranker.model_name": "cohere-rerank-v2",
"reranker.query": "What is machine learning?",
"reranker.top_k": 2,
"reranker.input_documents.0.document.id": "doc_123",
"reranker.input_documents.0.document.content": "Machine learning is a subset...",
"reranker.input_documents.1.document.id": "doc_456",
"reranker.input_documents.1.document.content": "Supervised learning algorithms...",
"reranker.input_documents.2.document.id": "doc_789",
"reranker.input_documents.2.document.content": "Neural networks are...",
"reranker.output_documents.0.document.id": "doc_456",
"reranker.output_documents.0.document.content": "Supervised learning algorithms...",
"reranker.output_documents.0.document.score": 0.95,
"reranker.output_documents.1.document.id": "doc_123",
"reranker.output_documents.1.document.content": "Machine learning is a subset...",
"reranker.output_documents.1.document.score": 0.88
}
```

---
# RETRIEVER Spans
## Purpose
RETRIEVER spans represent document/context retrieval operations (vector DB queries, semantic search, keyword search).
## Required Attributes
| Attribute | Type | Description | Required |
|-----------|------|-------------|----------|
| `openinference.span.kind` | String | Must be "RETRIEVER" | Yes |
## Attribute Reference
### Query
| Attribute | Type | Description |
|-----------|------|-------------|
| `input.value` | String | Search query text |
### Document Schema
| Attribute Pattern | Type | Description |
|-------------------|------|-------------|
| `retrieval.documents.{i}.document.id` | String | Unique document identifier |
| `retrieval.documents.{i}.document.content` | String | Document text content |
| `retrieval.documents.{i}.document.score` | Float | Relevance score (0-1 or distance) |
| `retrieval.documents.{i}.document.metadata` | String (JSON) | Document metadata |
### Flattening Pattern for Documents
Documents are flattened using zero-indexed notation:
```
retrieval.documents.0.document.id
retrieval.documents.0.document.content
retrieval.documents.0.document.score
retrieval.documents.1.document.id
retrieval.documents.1.document.content
retrieval.documents.1.document.score
...
```
### Document Metadata
Common metadata fields (stored as JSON string):
```json
{
"source": "knowledge_base.pdf",
"page": 42,
"section": "Introduction",
"author": "Jane Doe",
"created_at": "2024-01-15",
"url": "https://example.com/doc",
"chunk_id": "chunk_123"
}
```
**Example with metadata:**
```json
{
"retrieval.documents.0.document.id": "doc_123",
"retrieval.documents.0.document.content": "Machine learning is a method of data analysis...",
"retrieval.documents.0.document.score": 0.92,
"retrieval.documents.0.document.metadata": "{\"source\": \"ml_textbook.pdf\", \"page\": 15, \"chapter\": \"Introduction\"}"
}
```
### Ordering
Documents are ordered by index (0, 1, 2, ...). Typically:
- Index 0 = highest scoring document
- Index 1 = second highest
- etc.
Preserve retrieval order in your flattened attributes.
### Large Document Handling
For very long documents:
- Consider truncating `document.content` to first N characters
- Store full content in separate document store
- Use `document.id` to reference full content
## Examples
### Basic Vector Search
```json
{
"openinference.span.kind": "RETRIEVER",
"input.value": "What is machine learning?",
"retrieval.documents.0.document.id": "doc_123",
"retrieval.documents.0.document.content": "Machine learning is a subset of artificial intelligence...",
"retrieval.documents.0.document.score": 0.92,
"retrieval.documents.0.document.metadata": "{\"source\": \"textbook.pdf\", \"page\": 42}",
"retrieval.documents.1.document.id": "doc_456",
"retrieval.documents.1.document.content": "Machine learning algorithms learn patterns from data...",
"retrieval.documents.1.document.score": 0.87,
"retrieval.documents.1.document.metadata": "{\"source\": \"article.html\", \"author\": \"Jane Doe\"}",
"retrieval.documents.2.document.id": "doc_789",
"retrieval.documents.2.document.content": "Supervised learning is a type of machine learning...",
"retrieval.documents.2.document.score": 0.81,
"retrieval.documents.2.document.metadata": "{\"source\": \"wiki.org\"}",
"metadata.retriever_type": "vector_search",
"metadata.vector_db": "pinecone",
"metadata.top_k": 3
}
```

---
# TOOL Spans
## Purpose
TOOL spans represent external tool or function invocations (API calls, database queries, calculators, custom functions).
## Required Attributes
| Attribute | Type | Description | Required |
| ------------------------- | ------ | ------------------ | ----------- |
| `openinference.span.kind` | String | Must be "TOOL" | Yes |
| `tool.name` | String | Tool/function name | Recommended |
## Attribute Reference
### Tool Execution Attributes
| Attribute | Type | Description |
| ------------------ | ------------- | ------------------------------------------ |
| `tool.name` | String | Tool/function name |
| `tool.description` | String | Tool purpose/description |
| `tool.parameters` | String (JSON) | JSON schema defining the tool's parameters |
| `input.value` | String (JSON) | Actual input values passed to the tool |
| `output.value` | String | Tool output/result |
| `output.mime_type` | String | Result content type (e.g., "application/json") |
## Examples
### API Call Tool
```json
{
"openinference.span.kind": "TOOL",
"tool.name": "get_weather",
"tool.description": "Fetches current weather for a location",
"tool.parameters": "{\"type\": \"object\", \"properties\": {\"location\": {\"type\": \"string\"}, \"units\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]}}, \"required\": [\"location\"]}",
"input.value": "{\"location\": \"San Francisco\", \"units\": \"celsius\"}",
"output.value": "{\"temperature\": 18, \"conditions\": \"partly cloudy\"}"
}
```
### Calculator Tool
```json
{
"openinference.span.kind": "TOOL",
"tool.name": "calculator",
"tool.description": "Performs mathematical calculations",
"tool.parameters": "{\"type\": \"object\", \"properties\": {\"expression\": {\"type\": \"string\", \"description\": \"Math expression to evaluate\"}}, \"required\": [\"expression\"]}",
"input.value": "{\"expression\": \"2 + 2\"}",
"output.value": "4"
}
```
### Database Query Tool
```json
{
"openinference.span.kind": "TOOL",
"tool.name": "sql_query",
"tool.description": "Executes SQL query on user database",
"tool.parameters": "{\"type\": \"object\", \"properties\": {\"query\": {\"type\": \"string\", \"description\": \"SQL query to execute\"}}, \"required\": [\"query\"]}",
"input.value": "{\"query\": \"SELECT * FROM users WHERE id = 123\"}",
"output.value": "[{\"id\": 123, \"name\": \"Alice\", \"email\": \"alice@example.com\"}]",
"output.mime_type": "application/json"
}
```