mirror of
https://github.com/github/awesome-copilot.git
synced 2026-04-11 18:55:55 +00:00
chore: publish from staged
This commit is contained in:
6
plugins/phoenix/.github/plugin/plugin.json
vendored
6
plugins/phoenix/.github/plugin/plugin.json
vendored
@@ -18,8 +18,8 @@
|
||||
"instrumentation"
|
||||
],
|
||||
"skills": [
|
||||
"./skills/phoenix-cli/",
|
||||
"./skills/phoenix-evals/",
|
||||
"./skills/phoenix-tracing/"
|
||||
"./skills/phoenix-cli",
|
||||
"./skills/phoenix-evals",
|
||||
"./skills/phoenix-tracing"
|
||||
]
|
||||
}
|
||||
|
||||
162
plugins/phoenix/skills/phoenix-cli/SKILL.md
Normal file
162
plugins/phoenix/skills/phoenix-cli/SKILL.md
Normal file
@@ -0,0 +1,162 @@
|
||||
---
|
||||
name: phoenix-cli
|
||||
description: Debug LLM applications using the Phoenix CLI. Fetch traces, analyze errors, review experiments, inspect datasets, and query the GraphQL API. Use when debugging AI/LLM applications, analyzing trace data, working with Phoenix observability, or investigating LLM performance issues.
|
||||
license: Apache-2.0
|
||||
compatibility: Requires Node.js (for npx) or global install of @arizeai/phoenix-cli. Optionally requires jq for JSON processing.
|
||||
metadata:
|
||||
author: arize-ai
|
||||
version: "2.0.0"
|
||||
---
|
||||
|
||||
# Phoenix CLI
|
||||
|
||||
## Invocation
|
||||
|
||||
```bash
|
||||
px <resource> <action> # if installed globally
|
||||
npx @arizeai/phoenix-cli <resource> <action> # no install required
|
||||
```
|
||||
|
||||
The CLI uses singular resource commands with subcommands like `list` and `get`:
|
||||
|
||||
```bash
|
||||
px trace list
|
||||
px trace get <trace-id>
|
||||
px span list
|
||||
px dataset list
|
||||
px dataset get <name>
|
||||
```
|
||||
|
||||
## Setup
|
||||
|
||||
```bash
|
||||
export PHOENIX_HOST=http://localhost:6006
|
||||
export PHOENIX_PROJECT=my-project
|
||||
export PHOENIX_API_KEY=your-api-key # if auth is enabled
|
||||
```
|
||||
|
||||
Always use `--format raw --no-progress` when piping to `jq`.
|
||||
|
||||
## Traces
|
||||
|
||||
```bash
|
||||
px trace list --limit 20 --format raw --no-progress | jq .
|
||||
px trace list --last-n-minutes 60 --limit 20 --format raw --no-progress | jq '.[] | select(.status == "ERROR")'
|
||||
px trace list --format raw --no-progress | jq 'sort_by(-.duration) | .[0:5]'
|
||||
px trace get <trace-id> --format raw | jq .
|
||||
px trace get <trace-id> --format raw | jq '.spans[] | select(.status_code != "OK")'
|
||||
```
|
||||
|
||||
## Spans
|
||||
|
||||
```bash
|
||||
px span list --limit 20 # recent spans (table view)
|
||||
px span list --last-n-minutes 60 --limit 50 # spans from last hour
|
||||
px span list --span-kind LLM --limit 10 # only LLM spans
|
||||
px span list --status-code ERROR --limit 20 # only errored spans
|
||||
px span list --name chat_completion --limit 10 # filter by span name
|
||||
px span list --trace-id <id> --format raw --no-progress | jq . # all spans for a trace
|
||||
px span list --include-annotations --limit 10 # include annotation scores
|
||||
px span list output.json --limit 100 # save to JSON file
|
||||
px span list --format raw --no-progress | jq '.[] | select(.status_code == "ERROR")'
|
||||
```
|
||||
|
||||
### Span JSON shape
|
||||
|
||||
```
|
||||
Span
|
||||
name, span_kind ("LLM"|"CHAIN"|"TOOL"|"RETRIEVER"|"EMBEDDING"|"AGENT"|"RERANKER"|"GUARDRAIL"|"EVALUATOR"|"UNKNOWN")
|
||||
status_code ("OK"|"ERROR"|"UNSET"), status_message
|
||||
context.span_id, context.trace_id, parent_id
|
||||
start_time, end_time
|
||||
attributes (same as trace span attributes described in the Trace JSON shape section below)
|
||||
annotations[] (with --include-annotations)
|
||||
name, result { score, label, explanation }
|
||||
```
|
||||
|
||||
### Trace JSON shape
|
||||
|
||||
```
|
||||
Trace
|
||||
traceId, status ("OK"|"ERROR"), duration (ms), startTime, endTime
|
||||
rootSpan — top-level span (parent_id: null)
|
||||
spans[]
|
||||
name, span_kind ("LLM"|"CHAIN"|"TOOL"|"RETRIEVER"|"EMBEDDING"|"AGENT")
|
||||
status_code ("OK"|"ERROR"), parent_id, context.span_id
|
||||
attributes
|
||||
input.value, output.value — raw input/output
|
||||
llm.model_name, llm.provider
|
||||
llm.token_count.prompt/completion/total
|
||||
llm.token_count.prompt_details.cache_read
|
||||
llm.token_count.completion_details.reasoning
|
||||
llm.input_messages.{N}.message.role/content
|
||||
llm.output_messages.{N}.message.role/content
|
||||
llm.invocation_parameters — JSON string (temperature, etc.)
|
||||
exception.message — set if span errored
|
||||
```
|
||||
|
||||
## Sessions
|
||||
|
||||
```bash
|
||||
px session list --limit 10 --format raw --no-progress | jq .
|
||||
px session list --order asc --format raw --no-progress | jq '.[].session_id'
|
||||
px session get <session-id> --format raw | jq .
|
||||
px session get <session-id> --include-annotations --format raw | jq '.annotations'
|
||||
```
|
||||
|
||||
### Session JSON shape
|
||||
|
||||
```
|
||||
SessionData
|
||||
id, session_id, project_id
|
||||
start_time, end_time
|
||||
traces[]
|
||||
id, trace_id, start_time, end_time
|
||||
|
||||
SessionAnnotation (with --include-annotations)
|
||||
id, name, annotator_kind ("LLM"|"CODE"|"HUMAN"), session_id
|
||||
result { label, score, explanation }
|
||||
metadata, identifier, source, created_at, updated_at
|
||||
```
|
||||
|
||||
## Datasets / Experiments / Prompts
|
||||
|
||||
```bash
|
||||
px dataset list --format raw --no-progress | jq '.[].name'
|
||||
px dataset get <name> --format raw | jq '.examples[] | {input, output: .expected_output}'
|
||||
px experiment list --dataset <name> --format raw --no-progress | jq '.[] | {id, name, failed_run_count}'
|
||||
px experiment get <id> --format raw --no-progress | jq '.[] | select(.error != null) | {input, error}'
|
||||
px prompt list --format raw --no-progress | jq '.[].name'
|
||||
px prompt get <name> --format text --no-progress # plain text, ideal for piping to AI
|
||||
```
|
||||
|
||||
## GraphQL
|
||||
|
||||
For ad-hoc queries not covered by the commands above. Output is `{"data": {...}}`.
|
||||
|
||||
```bash
|
||||
px api graphql '{ projectCount datasetCount promptCount evaluatorCount }'
|
||||
px api graphql '{ projects { edges { node { name traceCount tokenCountTotal } } } }' | jq '.data.projects.edges[].node'
|
||||
px api graphql '{ datasets { edges { node { name exampleCount experimentCount } } } }' | jq '.data.datasets.edges[].node'
|
||||
px api graphql '{ evaluators { edges { node { name kind } } } }' | jq '.data.evaluators.edges[].node'
|
||||
|
||||
# Introspect any type
|
||||
px api graphql '{ __type(name: "Project") { fields { name type { name } } } }' | jq '.data.__type.fields[]'
|
||||
```
|
||||
|
||||
Key root fields: `projects`, `datasets`, `prompts`, `evaluators`, `projectCount`, `datasetCount`, `promptCount`, `evaluatorCount`, `viewer`.
|
||||
|
||||
## Docs
|
||||
|
||||
Download Phoenix documentation markdown for local use by coding agents.
|
||||
|
||||
```bash
|
||||
px docs fetch # fetch default workflow docs to .px/docs
|
||||
px docs fetch --workflow tracing # fetch only tracing docs
|
||||
px docs fetch --workflow tracing --workflow evaluation
|
||||
px docs fetch --dry-run # preview what would be downloaded
|
||||
px docs fetch --refresh # clear .px/docs and re-download
|
||||
px docs fetch --output-dir ./my-docs # custom output directory
|
||||
```
|
||||
|
||||
Key options: `--workflow` (repeatable, values: `tracing`, `evaluation`, `datasets`, `prompts`, `integrations`, `sdk`, `self-hosting`, `all`), `--dry-run`, `--refresh`, `--output-dir` (default `.px/docs`), `--workers` (default 10).
|
||||
72
plugins/phoenix/skills/phoenix-evals/SKILL.md
Normal file
72
plugins/phoenix/skills/phoenix-evals/SKILL.md
Normal file
@@ -0,0 +1,72 @@
|
||||
---
|
||||
name: phoenix-evals
|
||||
description: Build and run evaluators for AI/LLM applications using Phoenix.
|
||||
license: Apache-2.0
|
||||
compatibility: Requires Phoenix server. Python skills need phoenix and openai packages; TypeScript skills need @arizeai/phoenix-client.
|
||||
metadata:
|
||||
author: oss@arize.com
|
||||
version: "1.0.0"
|
||||
languages: "Python, TypeScript"
|
||||
---
|
||||
|
||||
# Phoenix Evals
|
||||
|
||||
Build evaluators for AI/LLM applications. Code first, LLM for nuance, validate against humans.
|
||||
|
||||
## Quick Reference
|
||||
|
||||
| Task | Files |
|
||||
| ---- | ----- |
|
||||
| Setup | [setup-python](references/setup-python.md), [setup-typescript](references/setup-typescript.md) |
|
||||
| Decide what to evaluate | [evaluators-overview](references/evaluators-overview.md) |
|
||||
| Choose a judge model | [fundamentals-model-selection](references/fundamentals-model-selection.md) |
|
||||
| Use pre-built evaluators | [evaluators-pre-built](references/evaluators-pre-built.md) |
|
||||
| Build code evaluator | [evaluators-code-python](references/evaluators-code-python.md), [evaluators-code-typescript](references/evaluators-code-typescript.md) |
|
||||
| Build LLM evaluator | [evaluators-llm-python](references/evaluators-llm-python.md), [evaluators-llm-typescript](references/evaluators-llm-typescript.md), [evaluators-custom-templates](references/evaluators-custom-templates.md) |
|
||||
| Batch evaluate DataFrame | [evaluate-dataframe-python](references/evaluate-dataframe-python.md) |
|
||||
| Run experiment | [experiments-running-python](references/experiments-running-python.md), [experiments-running-typescript](references/experiments-running-typescript.md) |
|
||||
| Create dataset | [experiments-datasets-python](references/experiments-datasets-python.md), [experiments-datasets-typescript](references/experiments-datasets-typescript.md) |
|
||||
| Generate synthetic data | [experiments-synthetic-python](references/experiments-synthetic-python.md), [experiments-synthetic-typescript](references/experiments-synthetic-typescript.md) |
|
||||
| Validate evaluator accuracy | [validation](references/validation.md), [validation-evaluators-python](references/validation-evaluators-python.md), [validation-evaluators-typescript](references/validation-evaluators-typescript.md) |
|
||||
| Sample traces for review | [observe-sampling-python](references/observe-sampling-python.md), [observe-sampling-typescript](references/observe-sampling-typescript.md) |
|
||||
| Analyze errors | [error-analysis](references/error-analysis.md), [error-analysis-multi-turn](references/error-analysis-multi-turn.md), [axial-coding](references/axial-coding.md) |
|
||||
| RAG evals | [evaluators-rag](references/evaluators-rag.md) |
|
||||
| Avoid common mistakes | [common-mistakes-python](references/common-mistakes-python.md), [fundamentals-anti-patterns](references/fundamentals-anti-patterns.md) |
|
||||
| Production | [production-overview](references/production-overview.md), [production-guardrails](references/production-guardrails.md), [production-continuous](references/production-continuous.md) |
|
||||
|
||||
## Workflows
|
||||
|
||||
**Starting Fresh:**
|
||||
[observe-tracing-setup](references/observe-tracing-setup.md) → [error-analysis](references/error-analysis.md) → [axial-coding](references/axial-coding.md) → [evaluators-overview](references/evaluators-overview.md)
|
||||
|
||||
**Building Evaluator:**
|
||||
[fundamentals](references/fundamentals.md) → [common-mistakes-python](references/common-mistakes-python.md) → evaluators-{code|llm}-{python|typescript} → validation-evaluators-{python|typescript}
|
||||
|
||||
**RAG Systems:**
|
||||
[evaluators-rag](references/evaluators-rag.md) → evaluators-code-* (retrieval) → evaluators-llm-* (faithfulness)
|
||||
|
||||
**Production:**
|
||||
[production-overview](references/production-overview.md) → [production-guardrails](references/production-guardrails.md) → [production-continuous](references/production-continuous.md)
|
||||
|
||||
## Reference Categories
|
||||
|
||||
| Prefix | Description |
|
||||
| ------ | ----------- |
|
||||
| `fundamentals-*` | Types, scores, anti-patterns |
|
||||
| `observe-*` | Tracing, sampling |
|
||||
| `error-analysis-*` | Finding failures |
|
||||
| `axial-coding-*` | Categorizing failures |
|
||||
| `evaluators-*` | Code, LLM, RAG evaluators |
|
||||
| `experiments-*` | Datasets, running experiments |
|
||||
| `validation-*` | Validating evaluator accuracy against human labels |
|
||||
| `production-*` | CI/CD, monitoring |
|
||||
|
||||
## Key Principles
|
||||
|
||||
| Principle | Action |
|
||||
| --------- | ------ |
|
||||
| Error analysis first | Can't automate what you haven't observed |
|
||||
| Custom > generic | Build from your failures |
|
||||
| Code first | Deterministic before LLM |
|
||||
| Validate judges | >80% TPR/TNR |
|
||||
| Binary > Likert | Pass/fail, not 1-5 |
|
||||
@@ -0,0 +1,95 @@
|
||||
# Axial Coding
|
||||
|
||||
Group open-ended notes into structured failure taxonomies.
|
||||
|
||||
## Process
|
||||
|
||||
1. **Gather** - Collect open coding notes
|
||||
2. **Pattern** - Group notes with common themes
|
||||
3. **Name** - Create actionable category names
|
||||
4. **Quantify** - Count failures per category
|
||||
|
||||
## Example Taxonomy
|
||||
|
||||
```yaml
|
||||
failure_taxonomy:
|
||||
content_quality:
|
||||
hallucination: [invented_facts, fictional_citations]
|
||||
incompleteness: [partial_answer, missing_key_info]
|
||||
inaccuracy: [wrong_numbers, wrong_dates]
|
||||
|
||||
communication:
|
||||
tone_mismatch: [too_casual, too_formal]
|
||||
clarity: [ambiguous, jargon_heavy]
|
||||
|
||||
context:
|
||||
user_context: [ignored_preferences, misunderstood_intent]
|
||||
retrieved_context: [ignored_documents, wrong_context]
|
||||
|
||||
safety:
|
||||
missing_disclaimers: [legal, medical, financial]
|
||||
```
|
||||
|
||||
## Add Annotation (Python)
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
|
||||
client = Client()
|
||||
client.spans.add_span_annotation(
|
||||
span_id="abc123",
|
||||
annotation_name="failure_category",
|
||||
label="hallucination",
|
||||
explanation="invented a feature that doesn't exist",
|
||||
annotator_kind="HUMAN",
|
||||
sync=True,
|
||||
)
|
||||
```
|
||||
|
||||
## Add Annotation (TypeScript)
|
||||
|
||||
```typescript
|
||||
import { addSpanAnnotation } from "@arizeai/phoenix-client/spans";
|
||||
|
||||
await addSpanAnnotation({
|
||||
spanAnnotation: {
|
||||
spanId: "abc123",
|
||||
name: "failure_category",
|
||||
label: "hallucination",
|
||||
explanation: "invented a feature that doesn't exist",
|
||||
annotatorKind: "HUMAN",
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
## Agent Failure Taxonomy
|
||||
|
||||
```yaml
|
||||
agent_failures:
|
||||
planning: [wrong_plan, incomplete_plan]
|
||||
tool_selection: [wrong_tool, missed_tool, unnecessary_call]
|
||||
tool_execution: [wrong_parameters, type_error]
|
||||
state_management: [lost_context, stuck_in_loop]
|
||||
error_recovery: [no_fallback, wrong_fallback]
|
||||
```
|
||||
|
||||
## Transition Matrix (Agents)
|
||||
|
||||
Shows where failures occur between states:
|
||||
|
||||
```python
|
||||
def build_transition_matrix(conversations, states):
|
||||
matrix = defaultdict(lambda: defaultdict(int))
|
||||
for conv in conversations:
|
||||
if conv["failed"]:
|
||||
last_success = find_last_success(conv)
|
||||
first_failure = find_first_failure(conv)
|
||||
matrix[last_success][first_failure] += 1
|
||||
return pd.DataFrame(matrix).fillna(0)
|
||||
```
|
||||
|
||||
## Principles
|
||||
|
||||
- **MECE** - Each failure fits ONE category
|
||||
- **Actionable** - Categories suggest fixes
|
||||
- **Bottom-up** - Let categories emerge from data
|
||||
@@ -0,0 +1,225 @@
|
||||
# Common Mistakes (Python)
|
||||
|
||||
Patterns that LLMs frequently generate incorrectly from training data.
|
||||
|
||||
## Legacy Model Classes
|
||||
|
||||
```python
|
||||
# WRONG
|
||||
from phoenix.evals import OpenAIModel, AnthropicModel
|
||||
model = OpenAIModel(model="gpt-4")
|
||||
|
||||
# RIGHT
|
||||
from phoenix.evals import LLM
|
||||
llm = LLM(provider="openai", model="gpt-4o")
|
||||
```
|
||||
|
||||
**Why**: `OpenAIModel`, `AnthropicModel`, etc. are legacy 1.0 wrappers in `phoenix.evals.legacy`.
|
||||
The `LLM` class is provider-agnostic and is the current 2.0 API.
|
||||
|
||||
## Using run_evals Instead of evaluate_dataframe
|
||||
|
||||
```python
|
||||
# WRONG — legacy 1.0 API
|
||||
from phoenix.evals import run_evals
|
||||
results = run_evals(dataframe=df, evaluators=[eval1], provide_explanation=True)
|
||||
# Returns list of DataFrames
|
||||
|
||||
# RIGHT — current 2.0 API
|
||||
from phoenix.evals import evaluate_dataframe
|
||||
results_df = evaluate_dataframe(dataframe=df, evaluators=[eval1])
|
||||
# Returns single DataFrame with {name}_score dict columns
|
||||
```
|
||||
|
||||
**Why**: `run_evals` is the legacy 1.0 batch function. `evaluate_dataframe` is the current
|
||||
2.0 function with a different return format.
|
||||
|
||||
## Wrong Result Column Names
|
||||
|
||||
```python
|
||||
# WRONG — column doesn't exist
|
||||
score = results_df["relevance"].mean()
|
||||
|
||||
# WRONG — column exists but contains dicts, not numbers
|
||||
score = results_df["relevance_score"].mean()
|
||||
|
||||
# RIGHT — extract numeric score from dict
|
||||
scores = results_df["relevance_score"].apply(
|
||||
lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0
|
||||
)
|
||||
score = scores.mean()
|
||||
```
|
||||
|
||||
**Why**: `evaluate_dataframe` returns columns named `{name}_score` containing Score dicts
|
||||
like `{"name": "...", "score": 1.0, "label": "...", "explanation": "..."}`.
|
||||
|
||||
## Deprecated project_name Parameter
|
||||
|
||||
```python
|
||||
# WRONG
|
||||
df = client.spans.get_spans_dataframe(project_name="my-project")
|
||||
|
||||
# RIGHT
|
||||
df = client.spans.get_spans_dataframe(project_identifier="my-project")
|
||||
```
|
||||
|
||||
**Why**: `project_name` is deprecated in favor of `project_identifier`, which also
|
||||
accepts project IDs.
|
||||
|
||||
## Wrong Client Constructor
|
||||
|
||||
```python
|
||||
# WRONG
|
||||
client = Client(endpoint="https://app.phoenix.arize.com")
|
||||
client = Client(url="https://app.phoenix.arize.com")
|
||||
|
||||
# RIGHT — for remote/cloud Phoenix
|
||||
client = Client(base_url="https://app.phoenix.arize.com", api_key="...")
|
||||
|
||||
# ALSO RIGHT — for local Phoenix (falls back to env vars or localhost:6006)
|
||||
client = Client()
|
||||
```
|
||||
|
||||
**Why**: The parameter is `base_url`, not `endpoint` or `url`. For local instances,
|
||||
`Client()` with no args works fine. For remote instances, `base_url` and `api_key` are required.
|
||||
|
||||
## Too-Aggressive Time Filters
|
||||
|
||||
```python
|
||||
# WRONG — often returns zero spans
|
||||
from datetime import datetime, timedelta
|
||||
df = client.spans.get_spans_dataframe(
|
||||
project_identifier="my-project",
|
||||
start_time=datetime.now() - timedelta(hours=1),
|
||||
)
|
||||
|
||||
# RIGHT — use limit to control result size instead
|
||||
df = client.spans.get_spans_dataframe(
|
||||
project_identifier="my-project",
|
||||
limit=50,
|
||||
)
|
||||
```
|
||||
|
||||
**Why**: Traces may be from any time period. A 1-hour window frequently returns
|
||||
nothing. Use `limit=` to control result size instead.
|
||||
|
||||
## Not Filtering Spans Appropriately
|
||||
|
||||
```python
|
||||
# WRONG — fetches all spans including internal LLM calls, retrievers, etc.
|
||||
df = client.spans.get_spans_dataframe(project_identifier="my-project")
|
||||
|
||||
# RIGHT for end-to-end evaluation — filter to top-level spans
|
||||
df = client.spans.get_spans_dataframe(
|
||||
project_identifier="my-project",
|
||||
root_spans_only=True,
|
||||
)
|
||||
|
||||
# RIGHT for RAG evaluation — fetch child spans for retriever/LLM metrics
|
||||
all_spans = client.spans.get_spans_dataframe(
|
||||
project_identifier="my-project",
|
||||
)
|
||||
retriever_spans = all_spans[all_spans["span_kind"] == "RETRIEVER"]
|
||||
llm_spans = all_spans[all_spans["span_kind"] == "LLM"]
|
||||
```
|
||||
|
||||
**Why**: For end-to-end evaluation (e.g., overall answer quality), use `root_spans_only=True`.
|
||||
For RAG systems, you often need child spans separately — retriever spans for
|
||||
DocumentRelevance and LLM spans for Faithfulness. Choose the right span level
|
||||
for your evaluation target.
|
||||
|
||||
## Assuming Span Output is Plain Text
|
||||
|
||||
```python
|
||||
# WRONG — output may be JSON, not plain text
|
||||
df["output"] = df["attributes.output.value"]
|
||||
|
||||
# RIGHT — parse JSON and extract the answer field
|
||||
import json
|
||||
|
||||
def extract_answer(output_value):
|
||||
if not isinstance(output_value, str):
|
||||
return str(output_value) if output_value is not None else ""
|
||||
try:
|
||||
parsed = json.loads(output_value)
|
||||
if isinstance(parsed, dict):
|
||||
for key in ("answer", "result", "output", "response"):
|
||||
if key in parsed:
|
||||
return str(parsed[key])
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
pass
|
||||
return output_value
|
||||
|
||||
df["output"] = df["attributes.output.value"].apply(extract_answer)
|
||||
```
|
||||
|
||||
**Why**: LangChain and other frameworks often output structured JSON from root spans,
|
||||
like `{"context": "...", "question": "...", "answer": "..."}`. Evaluators need
|
||||
the actual answer text, not the raw JSON.
|
||||
|
||||
## Using @create_evaluator for LLM-Based Evaluation
|
||||
|
||||
```python
|
||||
# WRONG — @create_evaluator doesn't call an LLM
|
||||
@create_evaluator(name="relevance", kind="llm")
|
||||
def relevance(input: str, output: str) -> str:
|
||||
pass # No LLM is involved
|
||||
|
||||
# RIGHT — use ClassificationEvaluator for LLM-based evaluation
|
||||
from phoenix.evals import ClassificationEvaluator, LLM
|
||||
|
||||
relevance = ClassificationEvaluator(
|
||||
name="relevance",
|
||||
prompt_template="Is this relevant?\n{{input}}\n{{output}}\nAnswer:",
|
||||
llm=LLM(provider="openai", model="gpt-4o"),
|
||||
choices={"relevant": 1.0, "irrelevant": 0.0},
|
||||
)
|
||||
```
|
||||
|
||||
**Why**: `@create_evaluator` wraps a plain Python function. Setting `kind="llm"`
|
||||
marks it as LLM-based but you must implement the LLM call yourself.
|
||||
For LLM-based evaluation, prefer `ClassificationEvaluator` which handles
|
||||
the LLM call, structured output parsing, and explanations automatically.
|
||||
|
||||
## Using llm_classify Instead of ClassificationEvaluator
|
||||
|
||||
```python
|
||||
# WRONG — legacy 1.0 API
|
||||
from phoenix.evals import llm_classify
|
||||
results = llm_classify(
|
||||
dataframe=df,
|
||||
template=template_str,
|
||||
model=model,
|
||||
rails=["relevant", "irrelevant"],
|
||||
)
|
||||
|
||||
# RIGHT — current 2.0 API
|
||||
from phoenix.evals import ClassificationEvaluator, async_evaluate_dataframe, LLM
|
||||
|
||||
classifier = ClassificationEvaluator(
|
||||
name="relevance",
|
||||
prompt_template=template_str,
|
||||
llm=LLM(provider="openai", model="gpt-4o"),
|
||||
choices={"relevant": 1.0, "irrelevant": 0.0},
|
||||
)
|
||||
results_df = await async_evaluate_dataframe(dataframe=df, evaluators=[classifier])
|
||||
```
|
||||
|
||||
**Why**: `llm_classify` is the legacy 1.0 function. The current pattern is to create
|
||||
an evaluator with `ClassificationEvaluator` and run it with `async_evaluate_dataframe()`.
|
||||
|
||||
## Using HallucinationEvaluator
|
||||
|
||||
```python
|
||||
# WRONG — deprecated
|
||||
from phoenix.evals import HallucinationEvaluator
|
||||
eval = HallucinationEvaluator(model)
|
||||
|
||||
# RIGHT — use FaithfulnessEvaluator
|
||||
from phoenix.evals.metrics import FaithfulnessEvaluator
|
||||
from phoenix.evals import LLM
|
||||
eval = FaithfulnessEvaluator(llm=LLM(provider="openai", model="gpt-4o"))
|
||||
```
|
||||
|
||||
**Why**: `HallucinationEvaluator` is deprecated. `FaithfulnessEvaluator` is its replacement,
|
||||
using "faithful"/"unfaithful" labels with maximized score (1.0 = faithful).
|
||||
@@ -0,0 +1,52 @@
|
||||
# Error Analysis: Multi-Turn Conversations
|
||||
|
||||
Debugging complex multi-turn conversation traces.
|
||||
|
||||
## The Approach
|
||||
|
||||
1. **End-to-end first** - Did the conversation achieve the goal?
|
||||
2. **Find first failure** - Trace backwards to root cause
|
||||
3. **Simplify** - Try single-turn before multi-turn debug
|
||||
4. **N-1 testing** - Isolate turn-specific vs capability issues
|
||||
|
||||
## Find First Upstream Failure
|
||||
|
||||
```
|
||||
Turn 1: User asks about flights ✓
|
||||
Turn 2: Assistant asks for dates ✓
|
||||
Turn 3: User provides dates ✓
|
||||
Turn 4: Assistant searches WRONG dates ← FIRST FAILURE
|
||||
Turn 5: Shows wrong flights (consequence)
|
||||
Turn 6: User frustrated (consequence)
|
||||
```
|
||||
|
||||
Focus on Turn 4, not Turn 6.
|
||||
|
||||
## Simplify First
|
||||
|
||||
Before debugging multi-turn, test single-turn:
|
||||
|
||||
```python
|
||||
# If single-turn also fails → problem is retrieval/knowledge
|
||||
# If single-turn passes → problem is conversation context
|
||||
response = chat("What's the return policy for electronics?")
|
||||
```
|
||||
|
||||
## N-1 Testing
|
||||
|
||||
Give turns 1 to N-1 as context, test turn N:
|
||||
|
||||
```python
|
||||
context = conversation[:n-1]
|
||||
response = chat_with_context(context, user_message_n)
|
||||
# Compare to actual turn N
|
||||
```
|
||||
|
||||
This isolates whether error is from context or underlying capability.
|
||||
|
||||
## Checklist
|
||||
|
||||
1. Did conversation achieve goal? (E2E)
|
||||
2. Which turn first went wrong?
|
||||
3. Can you reproduce with single-turn?
|
||||
4. Is error from context or capability? (N-1 test)
|
||||
@@ -0,0 +1,170 @@
|
||||
# Error Analysis
|
||||
|
||||
Review traces to discover failure modes before building evaluators.
|
||||
|
||||
## Process
|
||||
|
||||
1. **Sample** - 100+ traces (errors, negative feedback, random)
|
||||
2. **Open Code** - Write free-form notes per trace
|
||||
3. **Axial Code** - Group notes into failure categories
|
||||
4. **Quantify** - Count failures per category
|
||||
5. **Prioritize** - Rank by frequency × severity
|
||||
|
||||
## Sample Traces
|
||||
|
||||
### Span-level sampling (Python — DataFrame)
|
||||
|
||||
```python
|
||||
import pandas as pd

from phoenix.client import Client
|
||||
|
||||
# Client() works for local Phoenix (falls back to env vars or localhost:6006)
|
||||
# For remote/cloud: Client(base_url="https://app.phoenix.arize.com", api_key="...")
|
||||
client = Client()
|
||||
spans_df = client.spans.get_spans_dataframe(project_identifier="my-app")
|
||||
|
||||
# Build representative sample
|
||||
sample = pd.concat([
|
||||
spans_df[spans_df["status_code"] == "ERROR"].sample(30),
|
||||
spans_df[spans_df["feedback"] == "negative"].sample(30),
|
||||
spans_df.sample(40),
|
||||
]).drop_duplicates("span_id").head(100)
|
||||
```
|
||||
|
||||
### Span-level sampling (TypeScript)
|
||||
|
||||
```typescript
|
||||
import { getSpans } from "@arizeai/phoenix-client/spans";
|
||||
|
||||
const { spans: errors } = await getSpans({
|
||||
project: { projectName: "my-app" },
|
||||
statusCode: "ERROR",
|
||||
limit: 30,
|
||||
});
|
||||
const { spans: allSpans } = await getSpans({
|
||||
project: { projectName: "my-app" },
|
||||
limit: 70,
|
||||
});
|
||||
const sample = [...errors, ...allSpans.sort(() => Math.random() - 0.5).slice(0, 40)];
|
||||
const unique = [...new Map(sample.map((s) => [s.context.span_id, s])).values()].slice(0, 100);
|
||||
```
|
||||
|
||||
### Trace-level sampling (Python)
|
||||
|
||||
When errors span multiple spans (e.g., agent workflows), sample whole traces:
|
||||
|
||||
```python
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
traces = client.traces.get_traces(
|
||||
project_identifier="my-app",
|
||||
start_time=datetime.now() - timedelta(hours=24),
|
||||
include_spans=True,
|
||||
sort="latency_ms",
|
||||
order="desc",
|
||||
limit=100,
|
||||
)
|
||||
# Each trace has: trace_id, start_time, end_time, spans
|
||||
```
|
||||
|
||||
### Trace-level sampling (TypeScript)
|
||||
|
||||
```typescript
|
||||
import { getTraces } from "@arizeai/phoenix-client/traces";
|
||||
|
||||
const { traces } = await getTraces({
|
||||
project: { projectName: "my-app" },
|
||||
startTime: new Date(Date.now() - 24 * 60 * 60 * 1000),
|
||||
includeSpans: true,
|
||||
limit: 100,
|
||||
});
|
||||
```
|
||||
|
||||
## Add Notes (Python)
|
||||
|
||||
```python
|
||||
client.spans.add_span_note(
|
||||
span_id="abc123",
|
||||
note="wrong timezone - said 3pm EST but user is PST"
|
||||
)
|
||||
```
|
||||
|
||||
## Add Notes (TypeScript)
|
||||
|
||||
```typescript
|
||||
import { addSpanNote } from "@arizeai/phoenix-client/spans";
|
||||
|
||||
await addSpanNote({
|
||||
spanNote: {
|
||||
spanId: "abc123",
|
||||
note: "wrong timezone - said 3pm EST but user is PST"
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
## What to Note
|
||||
|
||||
| Type | Examples |
|
||||
| ---- | -------- |
|
||||
| Factual errors | Wrong dates, prices, made-up features |
|
||||
| Missing info | Didn't answer question, omitted details |
|
||||
| Tone issues | Too casual/formal for context |
|
||||
| Tool issues | Wrong tool, wrong parameters |
|
||||
| Retrieval | Wrong docs, missing relevant docs |
|
||||
|
||||
## Good Notes
|
||||
|
||||
```
|
||||
BAD: "Response is bad"
|
||||
GOOD: "Response says ships in 2 days but policy is 5-7 days"
|
||||
```
|
||||
|
||||
## Group into Categories
|
||||
|
||||
```python
|
||||
categories = {
|
||||
"factual_inaccuracy": ["wrong shipping time", "incorrect price"],
|
||||
"hallucination": ["made up a discount", "invented feature"],
|
||||
"tone_mismatch": ["informal for enterprise client"],
|
||||
}
|
||||
# Priority = Frequency × Severity
|
||||
```
|
||||
|
||||
## Retrieve Existing Annotations
|
||||
|
||||
### Python
|
||||
|
||||
```python
|
||||
# From a spans DataFrame
|
||||
annotations_df = client.spans.get_span_annotations_dataframe(
|
||||
spans_dataframe=sample,
|
||||
project_identifier="my-app",
|
||||
include_annotation_names=["quality", "correctness"],
|
||||
)
|
||||
# annotations_df has: span_id (index), name, label, score, explanation
|
||||
|
||||
# Or from specific span IDs
|
||||
annotations_df = client.spans.get_span_annotations_dataframe(
|
||||
span_ids=["span-id-1", "span-id-2"],
|
||||
project_identifier="my-app",
|
||||
)
|
||||
```
|
||||
|
||||
### TypeScript
|
||||
|
||||
```typescript
|
||||
import { getSpanAnnotations } from "@arizeai/phoenix-client/spans";
|
||||
|
||||
const { annotations } = await getSpanAnnotations({
|
||||
project: { projectName: "my-app" },
|
||||
spanIds: ["span-id-1", "span-id-2"],
|
||||
includeAnnotationNames: ["quality", "correctness"],
|
||||
});
|
||||
|
||||
for (const ann of annotations) {
|
||||
console.log(`${ann.span_id}: ${ann.name} = ${ann.result?.label} (${ann.result?.score})`);
|
||||
}
|
||||
```
|
||||
|
||||
## Saturation
|
||||
|
||||
Stop when new traces reveal no new failure modes. Minimum: 100 traces.
|
||||
@@ -0,0 +1,137 @@
|
||||
# Batch Evaluation with evaluate_dataframe (Python)
|
||||
|
||||
Run evaluators across a DataFrame. The core 2.0 batch evaluation API.
|
||||
|
||||
## Preferred: async_evaluate_dataframe
|
||||
|
||||
For batch evaluations (especially with LLM evaluators), prefer the async version
|
||||
for better throughput:
|
||||
|
||||
```python
|
||||
from phoenix.evals import async_evaluate_dataframe
|
||||
|
||||
results_df = await async_evaluate_dataframe(
|
||||
dataframe=df, # pandas DataFrame with columns matching evaluator params
|
||||
evaluators=[eval1, eval2], # List of evaluators
|
||||
concurrency=5, # Max concurrent LLM calls (default 3)
|
||||
exit_on_error=False,       # Optional: False continues past errors; default True stops on the first error
|
||||
max_retries=3, # Optional: retry failed LLM calls (default 10)
|
||||
)
|
||||
```
|
||||
|
||||
## Sync Version
|
||||
|
||||
```python
|
||||
from phoenix.evals import evaluate_dataframe
|
||||
|
||||
results_df = evaluate_dataframe(
|
||||
dataframe=df, # pandas DataFrame with columns matching evaluator params
|
||||
evaluators=[eval1, eval2], # List of evaluators
|
||||
exit_on_error=False,       # Optional: False continues past errors; default True stops on the first error
|
||||
max_retries=3, # Optional: retry failed LLM calls (default 10)
|
||||
)
|
||||
```
|
||||
|
||||
## Result Column Format
|
||||
|
||||
`async_evaluate_dataframe` / `evaluate_dataframe` returns a copy of the input DataFrame with added columns.
|
||||
**Result columns contain dicts, NOT raw numbers.**
|
||||
|
||||
For each evaluator named `"foo"`, two columns are added:
|
||||
|
||||
| Column | Type | Contents |
|
||||
| ------ | ---- | -------- |
|
||||
| `foo_score` | `dict` | `{"name": "foo", "score": 1.0, "label": "True", "explanation": "...", "metadata": {...}, "kind": "code", "direction": "maximize"}` |
|
||||
| `foo_execution_details` | `dict` | `{"status": "success", "exceptions": [], "execution_seconds": 0.001}` |
|
||||
|
||||
Only non-None fields appear in the score dict.
|
||||
|
||||
### Extracting Numeric Scores
|
||||
|
||||
```python
|
||||
# WRONG — these will fail or produce unexpected results
|
||||
score = results_df["relevance"].mean() # KeyError!
|
||||
score = results_df["relevance_score"].mean() # Tries to average dicts!
|
||||
|
||||
# RIGHT — extract the numeric score from each dict
|
||||
scores = results_df["relevance_score"].apply(
|
||||
lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0
|
||||
)
|
||||
mean_score = scores.mean()
|
||||
```
|
||||
|
||||
### Extracting Labels
|
||||
|
||||
```python
|
||||
labels = results_df["relevance_score"].apply(
|
||||
lambda x: x.get("label", "") if isinstance(x, dict) else ""
|
||||
)
|
||||
```
|
||||
|
||||
### Extracting Explanations (LLM evaluators)
|
||||
|
||||
```python
|
||||
explanations = results_df["relevance_score"].apply(
|
||||
lambda x: x.get("explanation", "") if isinstance(x, dict) else ""
|
||||
)
|
||||
```
|
||||
|
||||
### Finding Failures
|
||||
|
||||
```python
|
||||
scores = results_df["relevance_score"].apply(
|
||||
lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0
|
||||
)
|
||||
failed_mask = scores < 0.5
|
||||
failures = results_df[failed_mask]
|
||||
```
|
||||
|
||||
## Input Mapping
|
||||
|
||||
Evaluators receive each row as a dict. Column names must match the evaluator's
|
||||
expected parameter names. If they don't match, use `.bind()` or `bind_evaluator`:
|
||||
|
||||
```python
|
||||
from phoenix.evals import bind_evaluator, create_evaluator, async_evaluate_dataframe
|
||||
|
||||
@create_evaluator(name="check", kind="code")
|
||||
def check(response: str) -> bool:
|
||||
return len(response.strip()) > 0
|
||||
|
||||
# Option 1: Use .bind() method on the evaluator
|
||||
check = check.bind(input_mapping={"response": "answer"})  # .bind() returns a bound evaluator; assign the result
|
||||
results_df = await async_evaluate_dataframe(dataframe=df, evaluators=[check])
|
||||
|
||||
# Option 2: Use bind_evaluator function
|
||||
bound = bind_evaluator(evaluator=check, input_mapping={"response": "answer"})
|
||||
results_df = await async_evaluate_dataframe(dataframe=df, evaluators=[bound])
|
||||
```
|
||||
|
||||
Or simply rename columns to match:
|
||||
|
||||
```python
|
||||
df = df.rename(columns={
|
||||
"attributes.input.value": "input",
|
||||
"attributes.output.value": "output",
|
||||
})
|
||||
```
|
||||
|
||||
## DO NOT use run_evals
|
||||
|
||||
```python
|
||||
# WRONG — legacy 1.0 API
|
||||
from phoenix.evals import run_evals
|
||||
results = run_evals(dataframe=df, evaluators=[eval1])
|
||||
# Returns List[DataFrame] — one per evaluator
|
||||
|
||||
# RIGHT — current 2.0 API
|
||||
from phoenix.evals import async_evaluate_dataframe
|
||||
results_df = await async_evaluate_dataframe(dataframe=df, evaluators=[eval1])
|
||||
# Returns single DataFrame with {name}_score dict columns
|
||||
```
|
||||
|
||||
Key differences:
|
||||
- `run_evals` returns a **list** of DataFrames (one per evaluator)
|
||||
- `async_evaluate_dataframe` returns a **single** DataFrame with all results merged
|
||||
- `async_evaluate_dataframe` uses `{name}_score` dict column format
|
||||
- `async_evaluate_dataframe` uses `bind_evaluator` for input mapping (not `input_mapping=` param)
|
||||
@@ -0,0 +1,91 @@
|
||||
# Evaluators: Code Evaluators in Python
|
||||
|
||||
Deterministic evaluators that run without an LLM. Fast, cheap, reproducible.
|
||||
|
||||
## Basic Pattern
|
||||
|
||||
```python
|
||||
import re
|
||||
import json
|
||||
from phoenix.evals import create_evaluator
|
||||
|
||||
@create_evaluator(name="has_citation", kind="code")
|
||||
def has_citation(output: str) -> bool:
|
||||
return bool(re.search(r'\[\d+\]', output))
|
||||
|
||||
@create_evaluator(name="json_valid", kind="code")
|
||||
def json_valid(output: str) -> bool:
|
||||
try:
|
||||
json.loads(output)
|
||||
return True
|
||||
except json.JSONDecodeError:
|
||||
return False
|
||||
```
|
||||
|
||||
## Parameter Binding
|
||||
|
||||
| Parameter | Description |
|
||||
| --------- | ----------- |
|
||||
| `output` | Task output |
|
||||
| `input` | Example input |
|
||||
| `expected` | Expected output |
|
||||
| `metadata` | Example metadata |
|
||||
|
||||
```python
|
||||
@create_evaluator(name="matches_expected", kind="code")
|
||||
def matches_expected(output: str, expected: dict) -> bool:
|
||||
return output.strip() == expected.get("answer", "").strip()
|
||||
```
|
||||
|
||||
## Common Patterns
|
||||
|
||||
- **Regex**: `re.search(pattern, output)`
|
||||
- **JSON schema**: `jsonschema.validate()`
|
||||
- **Keywords**: `keyword in output.lower()`
|
||||
- **Length**: `len(output.split())`
|
||||
- **Similarity**: `editdistance.eval()` or Jaccard
|
||||
|
||||
## Return Types
|
||||
|
||||
| Return type | Result |
|
||||
| ----------- | ------ |
|
||||
| `bool` | `True` → score=1.0, label="True"; `False` → score=0.0, label="False" |
|
||||
| `float`/`int` | Used as the `score` value directly |
|
||||
| `str` (short, ≤3 words) | Used as the `label` value |
|
||||
| `str` (long, ≥4 words) | Used as the `explanation` value |
|
||||
| `dict` with `score`/`label`/`explanation` | Mapped to Score fields directly |
|
||||
| `Score` object | Used as-is |
|
||||
|
||||
## Important: Code vs LLM Evaluators
|
||||
|
||||
The `@create_evaluator` decorator wraps a plain Python function.
|
||||
|
||||
- `kind="code"` (default): For deterministic evaluators that don't call an LLM.
|
||||
- `kind="llm"`: Marks the evaluator as LLM-based, but **you** must implement the LLM
|
||||
call inside the function. The decorator does not call an LLM for you.
|
||||
|
||||
For most LLM-based evaluation, prefer `ClassificationEvaluator` which handles
|
||||
the LLM call, structured output parsing, and explanations automatically:
|
||||
|
||||
```python
|
||||
from phoenix.evals import ClassificationEvaluator, LLM
|
||||
|
||||
relevance = ClassificationEvaluator(
|
||||
name="relevance",
|
||||
prompt_template="Is this relevant?\n{{input}}\n{{output}}\nAnswer:",
|
||||
llm=LLM(provider="openai", model="gpt-4o"),
|
||||
choices={"relevant": 1.0, "irrelevant": 0.0},
|
||||
)
|
||||
```
|
||||
|
||||
## Pre-Built
|
||||
|
||||
```python
|
||||
from phoenix.experiments.evaluators import ContainsAnyKeyword, JSONParseable, MatchesRegex
|
||||
|
||||
evaluators = [
|
||||
ContainsAnyKeyword(keywords=["disclaimer"]),
|
||||
JSONParseable(),
|
||||
MatchesRegex(pattern=r"\d{4}-\d{2}-\d{2}"),
|
||||
]
|
||||
```
|
||||
@@ -0,0 +1,51 @@
|
||||
# Evaluators: Code Evaluators in TypeScript
|
||||
|
||||
Deterministic evaluators that run without an LLM. Fast, cheap, reproducible.
|
||||
|
||||
## Basic Pattern
|
||||
|
||||
```typescript
|
||||
import { createEvaluator } from "@arizeai/phoenix-evals";
|
||||
|
||||
const containsCitation = createEvaluator<{ output: string }>(
|
||||
({ output }) => /\[\d+\]/.test(output) ? 1 : 0,
|
||||
{ name: "contains_citation", kind: "CODE" }
|
||||
);
|
||||
```
|
||||
|
||||
## With Full Results (asExperimentEvaluator)
|
||||
|
||||
```typescript
|
||||
import { asExperimentEvaluator } from "@arizeai/phoenix-client/experiments";
|
||||
|
||||
const jsonValid = asExperimentEvaluator({
|
||||
name: "json_valid",
|
||||
kind: "CODE",
|
||||
evaluate: async ({ output }) => {
|
||||
try {
|
||||
JSON.parse(String(output));
|
||||
return { score: 1.0, label: "valid_json" };
|
||||
} catch (e) {
|
||||
return { score: 0.0, label: "invalid_json", explanation: String(e) };
|
||||
}
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
## Parameter Types
|
||||
|
||||
```typescript
|
||||
interface EvaluatorParams {
|
||||
input: Record<string, unknown>;
|
||||
output: unknown;
|
||||
expected: Record<string, unknown>;
|
||||
metadata: Record<string, unknown>;
|
||||
}
|
||||
```
|
||||
|
||||
## Common Patterns
|
||||
|
||||
- **Regex**: `/pattern/.test(output)`
|
||||
- **JSON**: `JSON.parse()` + zod schema
|
||||
- **Keywords**: `output.includes(keyword)`
|
||||
- **Similarity**: `fastest-levenshtein`
|
||||
@@ -0,0 +1,54 @@
|
||||
# Evaluators: Custom Templates
|
||||
|
||||
Design LLM judge prompts.
|
||||
|
||||
## Complete Template Pattern
|
||||
|
||||
```python
|
||||
TEMPLATE = """Evaluate faithfulness of the response to the context.
|
||||
|
||||
<context>{{context}}</context>
|
||||
<response>{{output}}</response>
|
||||
|
||||
CRITERIA:
|
||||
"faithful" = ALL claims supported by context
|
||||
"unfaithful" = ANY claim NOT in context
|
||||
|
||||
EXAMPLES:
|
||||
Context: "Price is $10" → Response: "It costs $10" → faithful
|
||||
Context: "Price is $10" → Response: "About $15" → unfaithful
|
||||
|
||||
EDGE CASES:
|
||||
- Empty context → cannot_evaluate
|
||||
- "I don't know" when appropriate → faithful
|
||||
- Partial faithfulness → unfaithful (strict)
|
||||
|
||||
Answer (faithful/unfaithful):"""
|
||||
```
|
||||
|
||||
## Template Structure
|
||||
|
||||
1. Task description
|
||||
2. Input variables in XML tags
|
||||
3. Criteria definitions
|
||||
4. Examples (2-4 cases)
|
||||
5. Edge cases
|
||||
6. Output format
|
||||
|
||||
## XML Tags
|
||||
|
||||
```
|
||||
<question>{{input}}</question>
|
||||
<response>{{output}}</response>
|
||||
<context>{{context}}</context>
|
||||
<reference>{{reference}}</reference>
|
||||
```
|
||||
|
||||
## Common Mistakes
|
||||
|
||||
| Mistake | Fix |
|
||||
| ------- | --- |
|
||||
| Vague criteria | Define each label exactly |
|
||||
| No examples | Include 2-4 cases |
|
||||
| Ambiguous format | Specify exact output |
|
||||
| No edge cases | Address ambiguity |
|
||||
@@ -0,0 +1,92 @@
|
||||
# Evaluators: LLM Evaluators in Python
|
||||
|
||||
LLM evaluators use a language model to judge outputs. Use when criteria are subjective.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```python
|
||||
from phoenix.evals import ClassificationEvaluator, LLM
|
||||
|
||||
llm = LLM(provider="openai", model="gpt-4o")
|
||||
|
||||
HELPFULNESS_TEMPLATE = """Rate how helpful the response is.
|
||||
|
||||
<question>{{input}}</question>
|
||||
<response>{{output}}</response>
|
||||
|
||||
"helpful" means directly addresses the question.
|
||||
"not_helpful" means does not address the question.
|
||||
|
||||
Your answer (helpful/not_helpful):"""
|
||||
|
||||
helpfulness = ClassificationEvaluator(
|
||||
name="helpfulness",
|
||||
prompt_template=HELPFULNESS_TEMPLATE,
|
||||
llm=llm,
|
||||
choices={"not_helpful": 0, "helpful": 1}
|
||||
)
|
||||
```
|
||||
|
||||
## Template Variables
|
||||
|
||||
Use XML tags to wrap variables for clarity:
|
||||
|
||||
| Variable | XML Tag |
|
||||
| -------- | ------- |
|
||||
| `{{input}}` | `<question>{{input}}</question>` |
|
||||
| `{{output}}` | `<response>{{output}}</response>` |
|
||||
| `{{reference}}` | `<reference>{{reference}}</reference>` |
|
||||
| `{{context}}` | `<context>{{context}}</context>` |
|
||||
|
||||
## create_classifier (Factory)
|
||||
|
||||
Shorthand factory that returns a `ClassificationEvaluator`. Prefer direct
|
||||
`ClassificationEvaluator` instantiation for more parameters/customization:
|
||||
|
||||
```python
|
||||
from phoenix.evals import create_classifier, LLM
|
||||
|
||||
relevance = create_classifier(
|
||||
name="relevance",
|
||||
prompt_template="""Is this response relevant to the question?
|
||||
<question>{{input}}</question>
|
||||
<response>{{output}}</response>
|
||||
Answer (relevant/irrelevant):""",
|
||||
llm=LLM(provider="openai", model="gpt-4o"),
|
||||
choices={"relevant": 1.0, "irrelevant": 0.0},
|
||||
)
|
||||
```
|
||||
|
||||
## Input Mapping
|
||||
|
||||
Column names must match template variables. Rename columns or use `bind_evaluator`:
|
||||
|
||||
```python
|
||||
# Option 1: Rename columns to match template variables
|
||||
df = df.rename(columns={"user_query": "input", "ai_response": "output"})
|
||||
|
||||
# Option 2: Use bind_evaluator
|
||||
from phoenix.evals import bind_evaluator
|
||||
|
||||
bound = bind_evaluator(
|
||||
evaluator=helpfulness,
|
||||
input_mapping={"input": "user_query", "output": "ai_response"},
|
||||
)
|
||||
```
|
||||
|
||||
## Running
|
||||
|
||||
```python
|
||||
from phoenix.evals import evaluate_dataframe
|
||||
|
||||
results_df = evaluate_dataframe(dataframe=df, evaluators=[helpfulness])
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Be specific** - Define exactly what pass/fail means
|
||||
2. **Include examples** - Show concrete cases for each label
|
||||
3. **Explanations by default** - `ClassificationEvaluator` includes explanations automatically
|
||||
4. **Study built-in prompts** - See
|
||||
`phoenix.evals.__generated__.classification_evaluator_configs` for examples
|
||||
of well-structured evaluation prompts (Faithfulness, Correctness, DocumentRelevance, etc.)
|
||||
@@ -0,0 +1,58 @@
|
||||
# Evaluators: LLM Evaluators in TypeScript
|
||||
|
||||
LLM evaluators use a language model to judge outputs. Uses Vercel AI SDK.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```typescript
|
||||
import { createClassificationEvaluator } from "@arizeai/phoenix-evals";
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
|
||||
const helpfulness = await createClassificationEvaluator<{
|
||||
input: string;
|
||||
output: string;
|
||||
}>({
|
||||
name: "helpfulness",
|
||||
model: openai("gpt-4o"),
|
||||
promptTemplate: `Rate helpfulness.
|
||||
<question>{{input}}</question>
|
||||
<response>{{output}}</response>
|
||||
Answer (helpful/not_helpful):`,
|
||||
choices: { not_helpful: 0, helpful: 1 },
|
||||
});
|
||||
```
|
||||
|
||||
## Template Variables
|
||||
|
||||
Use XML tags: `<question>{{input}}</question>`, `<response>{{output}}</response>`, `<context>{{context}}</context>`
|
||||
|
||||
## Custom Evaluator with asExperimentEvaluator
|
||||
|
||||
```typescript
|
||||
import { asExperimentEvaluator } from "@arizeai/phoenix-client/experiments";
|
||||
|
||||
const customEval = asExperimentEvaluator({
|
||||
name: "custom",
|
||||
kind: "LLM",
|
||||
evaluate: async ({ input, output }) => {
|
||||
// Your LLM call here
|
||||
return { score: 1.0, label: "pass", explanation: "..." };
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
## Pre-Built Evaluators
|
||||
|
||||
```typescript
|
||||
import { createFaithfulnessEvaluator } from "@arizeai/phoenix-evals";
|
||||
|
||||
const faithfulnessEvaluator = createFaithfulnessEvaluator({
|
||||
model: openai("gpt-4o"),
|
||||
});
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
- Be specific about criteria
|
||||
- Include examples in prompts
|
||||
- Use `<thinking>` for chain of thought
|
||||
@@ -0,0 +1,40 @@
|
||||
# Evaluators: Overview
|
||||
|
||||
When and how to build automated evaluators.
|
||||
|
||||
## Decision Framework
|
||||
|
||||
```
|
||||
Should I Build an Evaluator?
|
||||
│
|
||||
▼
|
||||
Can I fix it with a prompt change?
|
||||
YES → Fix the prompt first
|
||||
NO → Is this a recurring issue?
|
||||
YES → Build evaluator
|
||||
NO → Add to watchlist
|
||||
```
|
||||
|
||||
**Don't automate prematurely.** Many issues are simple prompt fixes.
|
||||
|
||||
## Evaluator Requirements
|
||||
|
||||
1. **Clear criteria** - Specific, not "Is it good?"
|
||||
2. **Labeled test set** - 100+ examples with human labels
|
||||
3. **Measured accuracy** - Know TPR/TNR before deploying
|
||||
|
||||
## Evaluator Lifecycle
|
||||
|
||||
1. **Discover** - Error analysis reveals pattern
|
||||
2. **Design** - Define criteria and test cases
|
||||
3. **Implement** - Build code or LLM evaluator
|
||||
4. **Calibrate** - Validate against human labels
|
||||
5. **Deploy** - Add to experiment/CI pipeline
|
||||
6. **Monitor** - Track accuracy over time
|
||||
7. **Maintain** - Update as product evolves
|
||||
|
||||
## What NOT to Automate
|
||||
|
||||
- **Rare issues** - <5 instances? Watchlist, don't build
|
||||
- **Quick fixes** - Fixable by prompt change? Fix it
|
||||
- **Evolving criteria** - Stabilize definition first
|
||||
@@ -0,0 +1,75 @@
|
||||
# Evaluators: Pre-Built
|
||||
|
||||
Use for exploration only. Validate before production.
|
||||
|
||||
## Python
|
||||
|
||||
```python
|
||||
from phoenix.evals import LLM
|
||||
from phoenix.evals.metrics import FaithfulnessEvaluator
|
||||
|
||||
llm = LLM(provider="openai", model="gpt-4o")
|
||||
faithfulness_eval = FaithfulnessEvaluator(llm=llm)
|
||||
```
|
||||
|
||||
**Note**: `HallucinationEvaluator` is deprecated. Use `FaithfulnessEvaluator` instead.
|
||||
It uses "faithful"/"unfaithful" labels with score 1.0 = faithful.
|
||||
|
||||
## TypeScript
|
||||
|
||||
```typescript
|
||||
import { createHallucinationEvaluator } from "@arizeai/phoenix-evals";
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
|
||||
const hallucinationEval = createHallucinationEvaluator({ model: openai("gpt-4o") });
|
||||
```
|
||||
|
||||
## Available (2.0)
|
||||
|
||||
| Evaluator | Type | Description |
|
||||
| --------- | ---- | ----------- |
|
||||
| `FaithfulnessEvaluator` | LLM | Is the response faithful to the context? |
|
||||
| `CorrectnessEvaluator` | LLM | Is the response correct? |
|
||||
| `DocumentRelevanceEvaluator` | LLM | Are retrieved documents relevant? |
|
||||
| `ToolSelectionEvaluator` | LLM | Did the agent select the right tool? |
|
||||
| `ToolInvocationEvaluator` | LLM | Did the agent invoke the tool correctly? |
|
||||
| `ToolResponseHandlingEvaluator` | LLM | Did the agent handle the tool response well? |
|
||||
| `MatchesRegex` | Code | Does output match a regex pattern? |
|
||||
| `PrecisionRecallFScore` | Code | Precision/recall/F-score metrics |
|
||||
| `exact_match` | Code | Exact string match |
|
||||
|
||||
Legacy evaluators (`HallucinationEvaluator`, `QAEvaluator`, `RelevanceEvaluator`,
|
||||
`ToxicityEvaluator`, `SummarizationEvaluator`) are in `phoenix.evals.legacy` and deprecated.
|
||||
|
||||
## When to Use
|
||||
|
||||
| Situation | Recommendation |
|
||||
| --------- | -------------- |
|
||||
| Exploration | Find traces to review |
|
||||
| Find outliers | Sort by scores |
|
||||
| Production | Validate first (>80% human agreement) |
|
||||
| Domain-specific | Build custom |
|
||||
|
||||
## Exploration Pattern
|
||||
|
||||
```python
|
||||
from phoenix.evals import evaluate_dataframe
|
||||
|
||||
results_df = evaluate_dataframe(dataframe=traces, evaluators=[faithfulness_eval])
|
||||
|
||||
# Score columns contain dicts — extract numeric scores
|
||||
scores = results_df["faithfulness_score"].apply(
|
||||
lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0
|
||||
)
|
||||
low_scores = results_df[scores < 0.5] # Review these
|
||||
high_scores = results_df[scores > 0.9] # Also sample
|
||||
```
|
||||
|
||||
## Validation Required
|
||||
|
||||
```python
|
||||
from sklearn.metrics import classification_report
|
||||
|
||||
print(classification_report(human_labels, evaluator_results["label"]))
|
||||
# Target: >80% agreement
|
||||
```
|
||||
@@ -0,0 +1,108 @@
|
||||
# Evaluators: RAG Systems
|
||||
|
||||
RAG has two distinct components requiring different evaluation approaches.
|
||||
|
||||
## Two-Phase Evaluation
|
||||
|
||||
```
|
||||
RETRIEVAL GENERATION
|
||||
───────── ──────────
|
||||
Query → Retriever → Docs Docs + Query → LLM → Answer
|
||||
│ │
|
||||
IR Metrics LLM Judges / Code Checks
|
||||
```
|
||||
|
||||
**Debug retrieval first** using IR metrics, then tackle generation quality.
|
||||
|
||||
## Retrieval Evaluation (IR Metrics)
|
||||
|
||||
Use traditional information retrieval metrics:
|
||||
|
||||
| Metric | What It Measures |
|
||||
| ------ | ---------------- |
|
||||
| Recall@k | Of all relevant docs, how many in top k? |
|
||||
| Precision@k | Of k retrieved docs, how many relevant? |
|
||||
| MRR | How high is first relevant doc? |
|
||||
| NDCG | Quality weighted by position |
|
||||
|
||||
```python
|
||||
# Requires query-document relevance labels
|
||||
def recall_at_k(retrieved_ids, relevant_ids, k=5):
|
||||
retrieved_set = set(retrieved_ids[:k])
|
||||
relevant_set = set(relevant_ids)
|
||||
if not relevant_set:
|
||||
return 0.0
|
||||
return len(retrieved_set & relevant_set) / len(relevant_set)
|
||||
```
|
||||
|
||||
## Creating Retrieval Test Data
|
||||
|
||||
Generate query-document pairs synthetically:
|
||||
|
||||
```python
|
||||
# Reverse process: document → questions that document answers
|
||||
def generate_retrieval_test(documents):
|
||||
test_pairs = []
|
||||
for doc in documents:
|
||||
# Extract facts, generate questions
|
||||
questions = llm(f"Generate 3 questions this document answers:\n{doc}")
|
||||
for q in questions:
|
||||
test_pairs.append({"query": q, "relevant_doc_id": doc.id})
|
||||
return test_pairs
|
||||
```
|
||||
|
||||
## Generation Evaluation
|
||||
|
||||
Use LLM judges for qualities code can't measure:
|
||||
|
||||
| Eval | Question |
|
||||
| ---- | -------- |
|
||||
| **Faithfulness** | Are all claims supported by retrieved context? |
|
||||
| **Relevance** | Does answer address the question? |
|
||||
| **Completeness** | Does answer cover key points from context? |
|
||||
|
||||
```python
|
||||
from phoenix.evals import ClassificationEvaluator, LLM
|
||||
|
||||
FAITHFULNESS_TEMPLATE = """Given the context and answer, is every claim in the answer supported by the context?
|
||||
|
||||
<context>{{context}}</context>
|
||||
<answer>{{output}}</answer>
|
||||
|
||||
"faithful" = ALL claims supported by context
|
||||
"unfaithful" = ANY claim NOT in context
|
||||
|
||||
Answer (faithful/unfaithful):"""
|
||||
|
||||
faithfulness = ClassificationEvaluator(
|
||||
name="faithfulness",
|
||||
prompt_template=FAITHFULNESS_TEMPLATE,
|
||||
llm=LLM(provider="openai", model="gpt-4o"),
|
||||
choices={"unfaithful": 0, "faithful": 1}
|
||||
)
|
||||
```
|
||||
|
||||
## RAG Failure Taxonomy
|
||||
|
||||
Common failure modes to evaluate:
|
||||
|
||||
```yaml
|
||||
retrieval_failures:
|
||||
- no_relevant_docs: Query returns unrelated content
|
||||
- partial_retrieval: Some relevant docs missed
|
||||
- wrong_chunk: Right doc, wrong section
|
||||
|
||||
generation_failures:
|
||||
- hallucination: Claims not in retrieved context
|
||||
- ignored_context: Answer doesn't use retrieved docs
|
||||
- incomplete: Missing key information from context
|
||||
- wrong_synthesis: Misinterprets or miscombines sources
|
||||
```
|
||||
|
||||
## Evaluation Order
|
||||
|
||||
1. **Retrieval first** - If wrong docs, generation will fail
|
||||
2. **Faithfulness** - Is answer grounded in context?
|
||||
3. **Answer quality** - Does answer address the question?
|
||||
|
||||
Fix retrieval problems before debugging generation.
|
||||
@@ -0,0 +1,133 @@
|
||||
# Experiments: Datasets in Python
|
||||
|
||||
Creating and managing evaluation datasets.
|
||||
|
||||
## Creating Datasets
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
|
||||
client = Client()
|
||||
|
||||
# From examples
|
||||
dataset = client.datasets.create_dataset(
|
||||
name="qa-test-v1",
|
||||
examples=[
|
||||
{
|
||||
"input": {"question": "What is 2+2?"},
|
||||
"output": {"answer": "4"},
|
||||
"metadata": {"category": "math"},
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
# From DataFrame
|
||||
dataset = client.datasets.create_dataset(
|
||||
dataframe=df,
|
||||
name="qa-test-v1",
|
||||
input_keys=["question"],
|
||||
output_keys=["answer"],
|
||||
metadata_keys=["category"],
|
||||
)
|
||||
```
|
||||
|
||||
## From Production Traces
|
||||
|
||||
```python
|
||||
spans_df = client.spans.get_spans_dataframe(project_identifier="my-app")
|
||||
|
||||
dataset = client.datasets.create_dataset(
|
||||
dataframe=spans_df[["input.value", "output.value"]],
|
||||
name="production-sample-v1",
|
||||
input_keys=["input.value"],
|
||||
output_keys=["output.value"],
|
||||
)
|
||||
```
|
||||
|
||||
## Retrieving Datasets
|
||||
|
||||
```python
|
||||
dataset = client.datasets.get_dataset(name="qa-test-v1")
|
||||
df = dataset.to_dataframe()
|
||||
```
|
||||
|
||||
## Key Parameters
|
||||
|
||||
| Parameter | Description |
|
||||
| --------- | ----------- |
|
||||
| `input_keys` | Columns for task input |
|
||||
| `output_keys` | Columns for expected output |
|
||||
| `metadata_keys` | Additional context |
|
||||
|
||||
## Using Evaluators in Experiments
|
||||
|
||||
### Evaluators as experiment evaluators
|
||||
|
||||
Pass phoenix-evals evaluators directly to `run_experiment` as the `evaluators` argument:
|
||||
|
||||
```python
|
||||
from functools import partial
|
||||
from phoenix.client import AsyncClient
|
||||
from phoenix.evals import ClassificationEvaluator, LLM, bind_evaluator
|
||||
|
||||
# Define an LLM evaluator
|
||||
refusal = ClassificationEvaluator(
|
||||
name="refusal",
|
||||
prompt_template="Is this a refusal?\nQuestion: {{query}}\nResponse: {{response}}",
|
||||
llm=LLM(provider="openai", model="gpt-4o"),
|
||||
choices={"refusal": 0, "answer": 1},
|
||||
)
|
||||
|
||||
# Bind to map dataset columns to evaluator params
|
||||
refusal_evaluator = bind_evaluator(refusal, {"query": "input.query", "response": "output"})
|
||||
|
||||
# Define experiment task
|
||||
async def run_rag_task(input, rag_engine):
|
||||
return rag_engine.query(input["query"])
|
||||
|
||||
# Run experiment with the evaluator
|
||||
experiment = await AsyncClient().experiments.run_experiment(
|
||||
dataset=ds,
|
||||
task=partial(run_rag_task, rag_engine=query_engine),
|
||||
experiment_name="baseline",
|
||||
evaluators=[refusal_evaluator],
|
||||
concurrency=10,
|
||||
)
|
||||
```
|
||||
|
||||
### Evaluators as the task (meta evaluation)
|
||||
|
||||
Use an LLM evaluator as the experiment **task** to test the evaluator itself
|
||||
against human annotations:
|
||||
|
||||
```python
|
||||
from phoenix.evals import create_evaluator
|
||||
|
||||
# The evaluator IS the task being tested
|
||||
def run_refusal_eval(input, evaluator):
|
||||
result = evaluator.evaluate(input)
|
||||
return result[0]
|
||||
|
||||
# A simple heuristic checks judge vs human agreement
|
||||
@create_evaluator(name="exact_match")
|
||||
def exact_match(output, expected):
|
||||
return float(output["score"]) == float(expected["refusal_score"])
|
||||
|
||||
# Run: evaluator is the task, exact_match evaluates it
|
||||
experiment = await AsyncClient().experiments.run_experiment(
|
||||
dataset=annotated_dataset,
|
||||
task=partial(run_refusal_eval, evaluator=refusal),
|
||||
experiment_name="judge-v1",
|
||||
evaluators=[exact_match],
|
||||
concurrency=10,
|
||||
)
|
||||
```
|
||||
|
||||
This pattern lets you iterate on evaluator prompts until they align with human judgments.
|
||||
See `tutorials/evals/evals-2/evals_2.0_rag_demo.ipynb` for a full worked example.
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Versioning**: Create new datasets (e.g., `qa-test-v2`), don't modify
|
||||
- **Metadata**: Track source, category, difficulty
|
||||
- **Balance**: Ensure diverse coverage across categories
|
||||
@@ -0,0 +1,69 @@
|
||||
# Experiments: Datasets in TypeScript
|
||||
|
||||
Creating and managing evaluation datasets.
|
||||
|
||||
## Creating Datasets
|
||||
|
||||
```typescript
|
||||
import { createClient } from "@arizeai/phoenix-client";
|
||||
import { createDataset } from "@arizeai/phoenix-client/datasets";
|
||||
|
||||
const client = createClient();
|
||||
|
||||
const { datasetId } = await createDataset({
|
||||
client,
|
||||
name: "qa-test-v1",
|
||||
examples: [
|
||||
{
|
||||
input: { question: "What is 2+2?" },
|
||||
output: { answer: "4" },
|
||||
metadata: { category: "math" },
|
||||
},
|
||||
],
|
||||
});
|
||||
```
|
||||
|
||||
## Example Structure
|
||||
|
||||
```typescript
|
||||
interface DatasetExample {
|
||||
input: Record<string, unknown>; // Task input
|
||||
output?: Record<string, unknown>; // Expected output
|
||||
metadata?: Record<string, unknown>; // Additional context
|
||||
}
|
||||
```
|
||||
|
||||
## From Production Traces
|
||||
|
||||
```typescript
|
||||
import { getSpans } from "@arizeai/phoenix-client/spans";
|
||||
|
||||
const { spans } = await getSpans({
|
||||
project: { projectName: "my-app" },
|
||||
parentId: null, // root spans only
|
||||
limit: 100,
|
||||
});
|
||||
|
||||
const examples = spans.map((span) => ({
|
||||
input: { query: span.attributes?.["input.value"] },
|
||||
output: { response: span.attributes?.["output.value"] },
|
||||
metadata: { spanId: span.context.span_id },
|
||||
}));
|
||||
|
||||
await createDataset({ client, name: "production-sample", examples });
|
||||
```
|
||||
|
||||
## Retrieving Datasets
|
||||
|
||||
```typescript
|
||||
import { getDataset, listDatasets } from "@arizeai/phoenix-client/datasets";
|
||||
|
||||
const dataset = await getDataset({ client, datasetId: "..." });
|
||||
const all = await listDatasets({ client });
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Versioning**: Create new datasets, don't modify existing
|
||||
- **Metadata**: Track source, category, provenance
|
||||
- **Type safety**: Use TypeScript interfaces for structure
|
||||
@@ -0,0 +1,50 @@
|
||||
# Experiments: Overview
|
||||
|
||||
Systematic testing of AI systems with datasets, tasks, and evaluators.
|
||||
|
||||
## Structure
|
||||
|
||||
```
|
||||
DATASET → Examples: {input, expected_output, metadata}
|
||||
TASK → function(input) → output
|
||||
EVALUATORS → (input, output, expected) → score
|
||||
EXPERIMENT → Run task on all examples, score results
|
||||
```
|
||||
|
||||
## Basic Usage
|
||||
|
||||
```python
|
||||
from phoenix.client.experiments import run_experiment
|
||||
|
||||
experiment = run_experiment(
|
||||
dataset=my_dataset,
|
||||
task=my_task,
|
||||
evaluators=[accuracy, faithfulness],
|
||||
experiment_name="improved-retrieval-v2",
|
||||
)
|
||||
|
||||
print(experiment.aggregate_scores)
|
||||
# {'accuracy': 0.85, 'faithfulness': 0.92}
|
||||
```
|
||||
|
||||
## Workflow
|
||||
|
||||
1. **Create dataset** - From traces, synthetic data, or manual curation
|
||||
2. **Define task** - The function to test (your LLM pipeline)
|
||||
3. **Select evaluators** - Code and/or LLM-based
|
||||
4. **Run experiment** - Execute and score
|
||||
5. **Analyze & iterate** - Review, modify task, re-run
|
||||
|
||||
## Dry Runs
|
||||
|
||||
Test setup before full execution:
|
||||
|
||||
```python
|
||||
experiment = run_experiment(dataset, task, evaluators, dry_run=3) # Just 3 examples
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Name meaningfully**: `"improved-retrieval-v2-2024-01-15"` not `"test"`
|
||||
- **Version datasets**: Don't modify existing
|
||||
- **Multiple evaluators**: Combine perspectives
|
||||
@@ -0,0 +1,78 @@
|
||||
# Experiments: Running Experiments in Python
|
||||
|
||||
Execute experiments with `run_experiment`.
|
||||
|
||||
## Basic Usage
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
from phoenix.client.experiments import run_experiment
|
||||
|
||||
client = Client()
|
||||
dataset = client.datasets.get_dataset(name="qa-test-v1")
|
||||
|
||||
def my_task(example):
|
||||
return call_llm(example.input["question"])
|
||||
|
||||
def exact_match(output, expected):
|
||||
return 1.0 if output.strip().lower() == expected["answer"].strip().lower() else 0.0
|
||||
|
||||
experiment = run_experiment(
|
||||
dataset=dataset,
|
||||
task=my_task,
|
||||
evaluators=[exact_match],
|
||||
experiment_name="qa-experiment-v1",
|
||||
)
|
||||
```
|
||||
|
||||
## Task Functions
|
||||
|
||||
```python
|
||||
# Basic task
|
||||
def task(example):
|
||||
return call_llm(example.input["question"])
|
||||
|
||||
# With context (RAG)
|
||||
def rag_task(example):
|
||||
return call_llm(f"Context: {example.input['context']}\nQ: {example.input['question']}")
|
||||
```
|
||||
|
||||
## Evaluator Parameters
|
||||
|
||||
| Parameter | Access |
|
||||
| --------- | ------ |
|
||||
| `output` | Task output |
|
||||
| `expected` | Example expected output |
|
||||
| `input` | Example input |
|
||||
| `metadata` | Example metadata |
|
||||
|
||||
## Options
|
||||
|
||||
```python
|
||||
experiment = run_experiment(
|
||||
dataset=dataset,
|
||||
task=my_task,
|
||||
evaluators=evaluators,
|
||||
experiment_name="my-experiment",
|
||||
dry_run=3, # Test with 3 examples
|
||||
repetitions=3, # Run each example 3 times
|
||||
)
|
||||
```
|
||||
|
||||
## Results
|
||||
|
||||
```python
|
||||
print(experiment.aggregate_scores)
|
||||
# {'accuracy': 0.85, 'faithfulness': 0.92}
|
||||
|
||||
for run in experiment.runs:
|
||||
print(run.output, run.scores)
|
||||
```
|
||||
|
||||
## Add Evaluations Later
|
||||
|
||||
```python
|
||||
from phoenix.client.experiments import evaluate_experiment
|
||||
|
||||
evaluate_experiment(experiment=experiment, evaluators=[new_evaluator])
|
||||
```
|
||||
@@ -0,0 +1,82 @@
|
||||
# Experiments: Running Experiments in TypeScript
|
||||
|
||||
Execute experiments with `runExperiment`.
|
||||
|
||||
## Basic Usage
|
||||
|
||||
```typescript
|
||||
import { createClient } from "@arizeai/phoenix-client";
|
||||
import {
|
||||
runExperiment,
|
||||
asExperimentEvaluator,
|
||||
} from "@arizeai/phoenix-client/experiments";
|
||||
|
||||
const client = createClient();
|
||||
|
||||
const task = async (example: { input: Record<string, unknown> }) => {
|
||||
return await callLLM(example.input.question as string);
|
||||
};
|
||||
|
||||
const exactMatch = asExperimentEvaluator({
|
||||
name: "exact_match",
|
||||
kind: "CODE",
|
||||
evaluate: async ({ output, expected }) => ({
|
||||
score: output === expected?.answer ? 1.0 : 0.0,
|
||||
label: output === expected?.answer ? "match" : "no_match",
|
||||
}),
|
||||
});
|
||||
|
||||
const experiment = await runExperiment({
|
||||
client,
|
||||
experimentName: "qa-experiment-v1",
|
||||
dataset: { datasetId: "your-dataset-id" },
|
||||
task,
|
||||
evaluators: [exactMatch],
|
||||
});
|
||||
```
|
||||
|
||||
## Task Functions
|
||||
|
||||
```typescript
|
||||
// Basic task
|
||||
const task = async (example) => await callLLM(example.input.question as string);
|
||||
|
||||
// With context (RAG)
|
||||
const ragTask = async (example) => {
|
||||
const prompt = `Context: ${example.input.context}\nQ: ${example.input.question}`;
|
||||
return await callLLM(prompt);
|
||||
};
|
||||
```
|
||||
|
||||
## Evaluator Parameters
|
||||
|
||||
```typescript
|
||||
interface EvaluatorParams {
|
||||
input: Record<string, unknown>;
|
||||
output: unknown;
|
||||
expected: Record<string, unknown>;
|
||||
metadata: Record<string, unknown>;
|
||||
}
|
||||
```
|
||||
|
||||
## Options
|
||||
|
||||
```typescript
|
||||
const experiment = await runExperiment({
|
||||
client,
|
||||
experimentName: "my-experiment",
|
||||
dataset: { datasetName: "qa-test-v1" },
|
||||
task,
|
||||
evaluators,
|
||||
repetitions: 3, // Run each example 3 times
|
||||
maxConcurrency: 5, // Limit concurrent executions
|
||||
});
|
||||
```
|
||||
|
||||
## Add Evaluations Later
|
||||
|
||||
```typescript
|
||||
import { evaluateExperiment } from "@arizeai/phoenix-client/experiments";
|
||||
|
||||
await evaluateExperiment({ client, experiment, evaluators: [newEvaluator] });
|
||||
```
|
||||
@@ -0,0 +1,70 @@
|
||||
# Experiments: Generating Synthetic Test Data
|
||||
|
||||
Creating diverse, targeted test data for evaluation.
|
||||
|
||||
## Dimension-Based Approach
|
||||
|
||||
Define axes of variation, then generate combinations:
|
||||
|
||||
```python
|
||||
dimensions = {
|
||||
"issue_type": ["billing", "technical", "shipping"],
|
||||
"customer_mood": ["frustrated", "neutral", "happy"],
|
||||
"complexity": ["simple", "moderate", "complex"],
|
||||
}
|
||||
```
|
||||
|
||||
## Two-Step Generation
|
||||
|
||||
1. **Generate tuples** (combinations of dimension values)
|
||||
2. **Convert to natural queries** (separate LLM call per tuple)
|
||||
|
||||
```python
|
||||
# Step 1: Create tuples
|
||||
tuples = [
|
||||
("billing", "frustrated", "complex"),
|
||||
("shipping", "neutral", "simple"),
|
||||
]
|
||||
|
||||
# Step 2: Convert to natural query
|
||||
def tuple_to_query(t):
|
||||
prompt = f"""Generate a realistic customer message:
|
||||
Issue: {t[0]}, Mood: {t[1]}, Complexity: {t[2]}
|
||||
|
||||
Write naturally, include typos if appropriate. Don't be formulaic."""
|
||||
return llm(prompt)
|
||||
```
|
||||
|
||||
## Target Failure Modes
|
||||
|
||||
Dimensions should target known failures from error analysis:
|
||||
|
||||
```python
|
||||
# From error analysis findings
|
||||
dimensions = {
|
||||
"timezone": ["EST", "PST", "UTC", "ambiguous"], # Known failure
|
||||
"date_format": ["ISO", "US", "EU", "relative"], # Known failure
|
||||
}
|
||||
```
|
||||
|
||||
## Quality Control
|
||||
|
||||
- **Validate**: Check for placeholder text, minimum length
|
||||
- **Deduplicate**: Remove near-duplicate queries using embeddings
|
||||
- **Balance**: Ensure coverage across dimension values
|
||||
|
||||
## When to Use
|
||||
|
||||
| Use Synthetic | Use Real Data |
|
||||
| ------------- | ------------- |
|
||||
| Limited production data | Sufficient traces |
|
||||
| Testing edge cases | Validating actual behavior |
|
||||
| Pre-launch evals | Post-launch monitoring |
|
||||
|
||||
## Sample Sizes
|
||||
|
||||
| Purpose | Size |
|
||||
| ------- | ---- |
|
||||
| Initial exploration | 50-100 |
|
||||
| Comprehensive eval | 100-500 |
|
||||
| Per-dimension | 10-20 per combination |
|
||||
@@ -0,0 +1,86 @@
|
||||
# Experiments: Generating Synthetic Test Data (TypeScript)
|
||||
|
||||
Creating diverse, targeted test data for evaluation.
|
||||
|
||||
## Dimension-Based Approach
|
||||
|
||||
Define axes of variation, then generate combinations:
|
||||
|
||||
```typescript
|
||||
const dimensions = {
|
||||
issueType: ["billing", "technical", "shipping"],
|
||||
customerMood: ["frustrated", "neutral", "happy"],
|
||||
complexity: ["simple", "moderate", "complex"],
|
||||
};
|
||||
```
|
||||
|
||||
## Two-Step Generation
|
||||
|
||||
1. **Generate tuples** (combinations of dimension values)
|
||||
2. **Convert to natural queries** (separate LLM call per tuple)
|
||||
|
||||
```typescript
|
||||
import { generateText } from "ai";
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
|
||||
// Step 1: Create tuples
|
||||
type Tuple = [string, string, string];
|
||||
const tuples: Tuple[] = [
|
||||
["billing", "frustrated", "complex"],
|
||||
["shipping", "neutral", "simple"],
|
||||
];
|
||||
|
||||
// Step 2: Convert to natural query
|
||||
async function tupleToQuery(t: Tuple): Promise<string> {
|
||||
const { text } = await generateText({
|
||||
model: openai("gpt-4o"),
|
||||
prompt: `Generate a realistic customer message:
|
||||
Issue: ${t[0]}, Mood: ${t[1]}, Complexity: ${t[2]}
|
||||
|
||||
Write naturally, include typos if appropriate. Don't be formulaic.`,
|
||||
});
|
||||
return text;
|
||||
}
|
||||
```
|
||||
|
||||
## Target Failure Modes
|
||||
|
||||
Dimensions should target known failures from error analysis:
|
||||
|
||||
```typescript
|
||||
// From error analysis findings
|
||||
const dimensions = {
|
||||
timezone: ["EST", "PST", "UTC", "ambiguous"], // Known failure
|
||||
dateFormat: ["ISO", "US", "EU", "relative"], // Known failure
|
||||
};
|
||||
```
|
||||
|
||||
## Quality Control
|
||||
|
||||
- **Validate**: Check for placeholder text, minimum length
|
||||
- **Deduplicate**: Remove near-duplicate queries using embeddings
|
||||
- **Balance**: Ensure coverage across dimension values
|
||||
|
||||
```typescript
|
||||
function validateQuery(query: string): boolean {
|
||||
const minLength = 20;
|
||||
const hasPlaceholder = /\[.*?\]|<.*?>/.test(query);
|
||||
return query.length >= minLength && !hasPlaceholder;
|
||||
}
|
||||
```
|
||||
|
||||
## When to Use
|
||||
|
||||
| Use Synthetic | Use Real Data |
|
||||
| ------------- | ------------- |
|
||||
| Limited production data | Sufficient traces |
|
||||
| Testing edge cases | Validating actual behavior |
|
||||
| Pre-launch evals | Post-launch monitoring |
|
||||
|
||||
## Sample Sizes
|
||||
|
||||
| Purpose | Size |
|
||||
| ------- | ---- |
|
||||
| Initial exploration | 50-100 |
|
||||
| Comprehensive eval | 100-500 |
|
||||
| Per-dimension | 10-20 per combination |
|
||||
@@ -0,0 +1,43 @@
|
||||
# Anti-Patterns
|
||||
|
||||
Common mistakes and fixes.
|
||||
|
||||
| Anti-Pattern | Problem | Fix |
|
||||
| ------------ | ------- | --- |
|
||||
| Generic metrics | Pre-built scores don't match your failures | Build from error analysis |
|
||||
| Vibe-based | No quantification | Measure with experiments |
|
||||
| Ignoring humans | Uncalibrated LLM judges | Validate >80% TPR/TNR |
|
||||
| Premature automation | Evaluators for imagined problems | Let observed failures drive |
|
||||
| Saturation blindness | 100% pass = no signal | Keep capability evals at 50-80% |
|
||||
| Similarity metrics | BERTScore/ROUGE for generation | Use for retrieval only |
|
||||
| Model switching | Hoping a model works better | Error analysis first |
|
||||
|
||||
## Quantify Changes
|
||||
|
||||
```python
|
||||
baseline = run_experiment(dataset, old_prompt, evaluators)
|
||||
improved = run_experiment(dataset, new_prompt, evaluators)
|
||||
print(f"Improvement: {improved.pass_rate - baseline.pass_rate:+.1%}")
|
||||
```
|
||||
|
||||
## Don't Use Similarity for Generation
|
||||
|
||||
```python
|
||||
# BAD
|
||||
score = bertscore(output, reference)
|
||||
|
||||
# GOOD
|
||||
correct_facts = check_facts_against_source(output, context)
|
||||
```
|
||||
|
||||
## Error Analysis Before Model Change
|
||||
|
||||
```python
|
||||
# BAD
|
||||
for model in models:
|
||||
results = test(model)
|
||||
|
||||
# GOOD
|
||||
failures = analyze_errors(results)
|
||||
# Then decide if model change is warranted
|
||||
```
|
||||
@@ -0,0 +1,58 @@
|
||||
# Model Selection
|
||||
|
||||
Error analysis first, model changes last.
|
||||
|
||||
## Decision Tree
|
||||
|
||||
```
|
||||
Performance Issue?
|
||||
│
|
||||
▼
|
||||
Error analysis suggests model problem?
|
||||
NO → Fix prompts, retrieval, tools
|
||||
YES → Is it a capability gap?
|
||||
YES → Consider model change
|
||||
NO → Fix the actual problem
|
||||
```
|
||||
|
||||
## Judge Model Selection
|
||||
|
||||
| Principle | Action |
|
||||
| --------- | ------ |
|
||||
| Start capable | Use gpt-4o first |
|
||||
| Optimize later | Test cheaper after criteria stable |
|
||||
| Same model OK | Judge does different task |
|
||||
|
||||
```python
|
||||
# Start with capable model
|
||||
judge = ClassificationEvaluator(
|
||||
llm=LLM(provider="openai", model="gpt-4o"),
|
||||
...
|
||||
)
|
||||
|
||||
# After validation, test cheaper
|
||||
judge_cheap = ClassificationEvaluator(
|
||||
llm=LLM(provider="openai", model="gpt-4o-mini"),
|
||||
...
|
||||
)
|
||||
# Compare TPR/TNR on same test set
|
||||
```
|
||||
|
||||
## Don't Model Shop
|
||||
|
||||
```python
|
||||
# BAD
|
||||
for model in ["gpt-4o", "claude-3", "gemini-pro"]:
|
||||
results = run_experiment(dataset, task, model)
|
||||
|
||||
# GOOD
|
||||
failures = analyze_errors(results)
|
||||
# "Ignores context" → Fix prompt
|
||||
# "Can't do math" → Maybe try better model
|
||||
```
|
||||
|
||||
## When Model Change Is Warranted
|
||||
|
||||
- Failures persist after prompt optimization
|
||||
- Capability gaps (reasoning, math, code)
|
||||
- Error analysis confirms model limitation
|
||||
@@ -0,0 +1,76 @@
|
||||
# Fundamentals
|
||||
|
||||
Application-specific tests for AI systems. Code first, LLM for nuance, human for truth.
|
||||
|
||||
## Evaluator Types
|
||||
|
||||
| Type | Speed | Cost | Use Case |
|
||||
| ---- | ----- | ---- | -------- |
|
||||
| **Code** | Fast | Cheap | Regex, JSON, format, exact match |
|
||||
| **LLM** | Medium | Medium | Subjective quality, complex criteria |
|
||||
| **Human** | Slow | Expensive | Ground truth, calibration |
|
||||
|
||||
**Decision:** Code first → LLM only when code can't capture criteria → Human for calibration.
|
||||
|
||||
## Score Structure
|
||||
|
||||
| Property | Required | Description |
|
||||
| -------- | -------- | ----------- |
|
||||
| `name` | Yes | Evaluator name |
|
||||
| `kind` | Yes | `"code"`, `"llm"`, `"human"` |
|
||||
| `score` | No* | 0-1 numeric |
|
||||
| `label` | No* | `"pass"`, `"fail"` |
|
||||
| `explanation` | No | Rationale |
|
||||
|
||||
*One of `score` or `label` required.
|
||||
|
||||
## Binary > Likert
|
||||
|
||||
Use pass/fail, not 1-5 scales. Clearer criteria, easier calibration.
|
||||
|
||||
```python
|
||||
# Multiple binary checks instead of one Likert scale
|
||||
evaluators = [
|
||||
AnswersQuestion(), # Yes/No
|
||||
UsesContext(), # Yes/No
|
||||
NoHallucination(), # Yes/No
|
||||
]
|
||||
```
|
||||
|
||||
## Quick Patterns
|
||||
|
||||
### Code Evaluator
|
||||
|
||||
```python
|
||||
from phoenix.evals import create_evaluator
|
||||
|
||||
@create_evaluator(name="has_citation", kind="code")
|
||||
def has_citation(output: str) -> bool:
|
||||
return bool(re.search(r'\[\d+\]', output))
|
||||
```
|
||||
|
||||
### LLM Evaluator
|
||||
|
||||
```python
|
||||
from phoenix.evals import ClassificationEvaluator, LLM
|
||||
|
||||
evaluator = ClassificationEvaluator(
|
||||
name="helpfulness",
|
||||
prompt_template="...",
|
||||
llm=LLM(provider="openai", model="gpt-4o"),
|
||||
choices={"not_helpful": 0, "helpful": 1}
|
||||
)
|
||||
```
|
||||
|
||||
### Run Experiment
|
||||
|
||||
```python
|
||||
from phoenix.client.experiments import run_experiment
|
||||
|
||||
experiment = run_experiment(
|
||||
dataset=dataset,
|
||||
task=my_task,
|
||||
evaluators=[evaluator1, evaluator2],
|
||||
)
|
||||
print(experiment.aggregate_scores)
|
||||
```
|
||||
@@ -0,0 +1,101 @@
|
||||
# Observe: Sampling Strategies
|
||||
|
||||
How to efficiently sample production traces for review.
|
||||
|
||||
## Strategies
|
||||
|
||||
### 1. Failure-Focused (Highest Priority)
|
||||
|
||||
```python
|
||||
errors = spans_df[spans_df["status_code"] == "ERROR"]
|
||||
negative_feedback = spans_df[spans_df["feedback"] == "negative"]
|
||||
```
|
||||
|
||||
### 2. Outliers
|
||||
|
||||
```python
|
||||
long_responses = spans_df.nlargest(50, "response_length")
|
||||
slow_responses = spans_df.nlargest(50, "latency_ms")
|
||||
```
|
||||
|
||||
### 3. Stratified (Coverage)
|
||||
|
||||
```python
|
||||
# Sample equally from each category
|
||||
by_query_type = spans_df.groupby("metadata.query_type").apply(
|
||||
lambda x: x.sample(min(len(x), 20))
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Metric-Guided
|
||||
|
||||
```python
|
||||
# Review traces flagged by automated evaluators
|
||||
flagged = spans_df[eval_results["label"] == "hallucinated"]
|
||||
borderline = spans_df[(eval_results["score"] > 0.3) & (eval_results["score"] < 0.7)]
|
||||
```
|
||||
|
||||
## Building a Review Queue
|
||||
|
||||
```python
|
||||
def build_review_queue(spans_df, max_traces=100):
|
||||
queue = pd.concat([
|
||||
spans_df[spans_df["status_code"] == "ERROR"],
|
||||
spans_df[spans_df["feedback"] == "negative"],
|
||||
spans_df.nlargest(10, "response_length"),
|
||||
spans_df.sample(min(30, len(spans_df))),
|
||||
]).drop_duplicates("span_id").head(max_traces)
|
||||
return queue
|
||||
```
|
||||
|
||||
## Sample Size Guidelines
|
||||
|
||||
| Purpose | Size |
|
||||
| ------- | ---- |
|
||||
| Initial exploration | 50-100 |
|
||||
| Error analysis | 100+ (until saturation) |
|
||||
| Golden dataset | 100-500 |
|
||||
| Judge calibration | 100+ per class |
|
||||
|
||||
**Saturation:** Stop when new traces show the same failure patterns.
|
||||
|
||||
## Trace-Level Sampling
|
||||
|
||||
When you need whole requests (all spans per trace), use `get_traces`:
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
client = Client()
|
||||
|
||||
# Recent traces with full span trees
|
||||
traces = client.traces.get_traces(
|
||||
project_identifier="my-app",
|
||||
limit=100,
|
||||
include_spans=True,
|
||||
)
|
||||
|
||||
# Time-windowed sampling (e.g., last hour)
|
||||
traces = client.traces.get_traces(
|
||||
project_identifier="my-app",
|
||||
start_time=datetime.now() - timedelta(hours=1),
|
||||
limit=50,
|
||||
include_spans=True,
|
||||
)
|
||||
|
||||
# Filter by session (multi-turn conversations)
|
||||
traces = client.traces.get_traces(
|
||||
project_identifier="my-app",
|
||||
session_id="user-session-abc",
|
||||
include_spans=True,
|
||||
)
|
||||
|
||||
# Sort by latency to find slowest requests
|
||||
traces = client.traces.get_traces(
|
||||
project_identifier="my-app",
|
||||
sort="latency_ms",
|
||||
order="desc",
|
||||
limit=50,
|
||||
)
|
||||
```
|
||||
@@ -0,0 +1,147 @@
|
||||
# Observe: Sampling Strategies (TypeScript)
|
||||
|
||||
How to efficiently sample production traces for review.
|
||||
|
||||
## Strategies
|
||||
|
||||
### 1. Failure-Focused (Highest Priority)
|
||||
|
||||
Use server-side filters to fetch only what you need:
|
||||
|
||||
```typescript
|
||||
import { getSpans } from "@arizeai/phoenix-client/spans";
|
||||
|
||||
// Server-side filter — only ERROR spans are returned
|
||||
const { spans: errors } = await getSpans({
|
||||
project: { projectName: "my-project" },
|
||||
statusCode: "ERROR",
|
||||
limit: 100,
|
||||
});
|
||||
|
||||
// Fetch only LLM spans
|
||||
const { spans: llmSpans } = await getSpans({
|
||||
project: { projectName: "my-project" },
|
||||
spanKind: "LLM",
|
||||
limit: 100,
|
||||
});
|
||||
|
||||
// Filter by span name
|
||||
const { spans: chatSpans } = await getSpans({
|
||||
project: { projectName: "my-project" },
|
||||
name: "chat_completion",
|
||||
limit: 100,
|
||||
});
|
||||
```
|
||||
|
||||
### 2. Outliers
|
||||
|
||||
```typescript
|
||||
const { spans } = await getSpans({
|
||||
project: { projectName: "my-project" },
|
||||
limit: 200,
|
||||
});
|
||||
const latency = (s: (typeof spans)[number]) =>
|
||||
new Date(s.end_time).getTime() - new Date(s.start_time).getTime();
|
||||
const sorted = [...spans].sort((a, b) => latency(b) - latency(a));
|
||||
const slowResponses = sorted.slice(0, 50);
|
||||
```
|
||||
|
||||
### 3. Stratified (Coverage)
|
||||
|
||||
```typescript
|
||||
// Sample equally from each category
|
||||
function stratifiedSample<T>(items: T[], groupBy: (item: T) => string, perGroup: number): T[] {
|
||||
const groups = new Map<string, T[]>();
|
||||
for (const item of items) {
|
||||
const key = groupBy(item);
|
||||
if (!groups.has(key)) groups.set(key, []);
|
||||
groups.get(key)!.push(item);
|
||||
}
|
||||
return [...groups.values()].flatMap((g) => g.slice(0, perGroup));
|
||||
}
|
||||
|
||||
const { spans } = await getSpans({
|
||||
project: { projectName: "my-project" },
|
||||
limit: 500,
|
||||
});
|
||||
const byQueryType = stratifiedSample(spans, (s) => s.attributes?.["metadata.query_type"] ?? "unknown", 20);
|
||||
```
|
||||
|
||||
### 4. Metric-Guided
|
||||
|
||||
```typescript
|
||||
import { getSpanAnnotations } from "@arizeai/phoenix-client/spans";
|
||||
|
||||
// Fetch annotations for your spans, then filter by label
|
||||
const { annotations } = await getSpanAnnotations({
|
||||
project: { projectName: "my-project" },
|
||||
spanIds: spans.map((s) => s.context.span_id),
|
||||
includeAnnotationNames: ["hallucination"],
|
||||
});
|
||||
|
||||
const flaggedSpanIds = new Set(
|
||||
annotations.filter((a) => a.result?.label === "hallucinated").map((a) => a.span_id)
|
||||
);
|
||||
const flagged = spans.filter((s) => flaggedSpanIds.has(s.context.span_id));
|
||||
```
|
||||
|
||||
## Trace-Level Sampling
|
||||
|
||||
When you need whole requests (all spans in a trace), use `getTraces`:
|
||||
|
||||
```typescript
|
||||
import { getTraces } from "@arizeai/phoenix-client/traces";
|
||||
|
||||
// Recent traces with full span trees
|
||||
const { traces } = await getTraces({
|
||||
project: { projectName: "my-project" },
|
||||
limit: 100,
|
||||
includeSpans: true,
|
||||
});
|
||||
|
||||
// Filter by session (e.g., multi-turn conversations)
|
||||
const { traces: sessionTraces } = await getTraces({
|
||||
project: { projectName: "my-project" },
|
||||
sessionId: "user-session-abc",
|
||||
includeSpans: true,
|
||||
});
|
||||
|
||||
// Time-windowed sampling
|
||||
const { traces: recentTraces } = await getTraces({
|
||||
project: { projectName: "my-project" },
|
||||
startTime: new Date(Date.now() - 60 * 60 * 1000), // last hour
|
||||
limit: 50,
|
||||
includeSpans: true,
|
||||
});
|
||||
```
|
||||
|
||||
## Building a Review Queue
|
||||
|
||||
```typescript
|
||||
// Combine server-side filters into a review queue
|
||||
const { spans: errorSpans } = await getSpans({
|
||||
project: { projectName: "my-project" },
|
||||
statusCode: "ERROR",
|
||||
limit: 30,
|
||||
});
|
||||
const { spans: allSpans } = await getSpans({
|
||||
project: { projectName: "my-project" },
|
||||
limit: 100,
|
||||
});
|
||||
const random = allSpans.sort(() => Math.random() - 0.5).slice(0, 30);
|
||||
|
||||
const combined = [...errorSpans, ...random];
|
||||
const unique = [...new Map(combined.map((s) => [s.context.span_id, s])).values()];
|
||||
const reviewQueue = unique.slice(0, 100);
|
||||
```
|
||||
|
||||
## Sample Size Guidelines
|
||||
|
||||
| Purpose | Size |
|
||||
| ------- | ---- |
|
||||
| Initial exploration | 50-100 |
|
||||
| Error analysis | 100+ (until saturation) |
|
||||
| Golden dataset | 100-500 |
|
||||
| Judge calibration | 100+ per class |
|
||||
|
||||
**Saturation:** Stop when new traces show the same failure patterns.
|
||||
@@ -0,0 +1,144 @@
|
||||
# Observe: Tracing Setup
|
||||
|
||||
Configure tracing to capture data for evaluation.
|
||||
|
||||
## Quick Setup
|
||||
|
||||
```python
|
||||
# Python
|
||||
from phoenix.otel import register
|
||||
|
||||
register(project_name="my-app", auto_instrument=True)
|
||||
```
|
||||
|
||||
```typescript
|
||||
// TypeScript
|
||||
import { registerPhoenix } from "@arizeai/phoenix-otel";
|
||||
|
||||
registerPhoenix({ projectName: "my-app", autoInstrument: true });
|
||||
```
|
||||
|
||||
## Essential Attributes
|
||||
|
||||
| Attribute | Why It Matters |
|
||||
| --------- | -------------- |
|
||||
| `input.value` | User's request |
|
||||
| `output.value` | Response to evaluate |
|
||||
| `retrieval.documents` | Context for faithfulness |
|
||||
| `tool.name`, `tool.parameters` | Agent evaluation |
|
||||
| `llm.model_name` | Track by model |
|
||||
|
||||
## Custom Attributes for Evals
|
||||
|
||||
```python
|
||||
span.set_attribute("metadata.client_type", "enterprise")
|
||||
span.set_attribute("metadata.query_category", "billing")
|
||||
```
|
||||
|
||||
## Exporting for Evaluation
|
||||
|
||||
### Spans (Python — DataFrame)
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
|
||||
# Client() works for local Phoenix (falls back to env vars or localhost:6006)
|
||||
# For remote/cloud: Client(base_url="https://app.phoenix.arize.com", api_key="...")
|
||||
client = Client()
|
||||
spans_df = client.spans.get_spans_dataframe(
|
||||
project_identifier="my-app", # NOT project_name= (deprecated)
|
||||
root_spans_only=True,
|
||||
)
|
||||
|
||||
dataset = client.datasets.create_dataset(
|
||||
name="error-analysis-set",
|
||||
dataframe=spans_df[["input.value", "output.value"]],
|
||||
input_keys=["input.value"],
|
||||
output_keys=["output.value"],
|
||||
)
|
||||
```
|
||||
|
||||
### Spans (TypeScript)
|
||||
|
||||
```typescript
|
||||
import { getSpans } from "@arizeai/phoenix-client/spans";
|
||||
|
||||
const { spans } = await getSpans({
|
||||
project: { projectName: "my-app" },
|
||||
parentId: null, // root spans only
|
||||
limit: 100,
|
||||
});
|
||||
```
|
||||
|
||||
### Traces (Python — structured)
|
||||
|
||||
Use `get_traces` when you need full trace trees (e.g., multi-turn conversations, agent workflows):
|
||||
|
||||
```python
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
traces = client.traces.get_traces(
|
||||
project_identifier="my-app",
|
||||
start_time=datetime.now() - timedelta(hours=24),
|
||||
include_spans=True, # includes all spans per trace
|
||||
limit=100,
|
||||
)
|
||||
# Each trace has: trace_id, start_time, end_time, spans (when include_spans=True)
|
||||
```
|
||||
|
||||
### Traces (TypeScript)
|
||||
|
||||
```typescript
|
||||
import { getTraces } from "@arizeai/phoenix-client/traces";
|
||||
|
||||
const { traces } = await getTraces({
|
||||
project: { projectName: "my-app" },
|
||||
startTime: new Date(Date.now() - 24 * 60 * 60 * 1000),
|
||||
includeSpans: true,
|
||||
limit: 100,
|
||||
});
|
||||
```
|
||||
|
||||
## Uploading Evaluations as Annotations
|
||||
|
||||
### Python
|
||||
|
||||
```python
|
||||
from phoenix.evals import evaluate_dataframe
|
||||
from phoenix.evals.utils import to_annotation_dataframe
|
||||
|
||||
# Run evaluations
|
||||
results_df = evaluate_dataframe(dataframe=spans_df, evaluators=[my_eval])
|
||||
|
||||
# Format results for Phoenix annotations
|
||||
annotations_df = to_annotation_dataframe(results_df)
|
||||
|
||||
# Upload to Phoenix
|
||||
client.spans.log_span_annotations_dataframe(dataframe=annotations_df)
|
||||
```
|
||||
|
||||
### TypeScript
|
||||
|
||||
```typescript
|
||||
import { logSpanAnnotations } from "@arizeai/phoenix-client/spans";
|
||||
|
||||
await logSpanAnnotations({
|
||||
spanAnnotations: [
|
||||
{
|
||||
spanId: "abc123",
|
||||
name: "quality",
|
||||
label: "good",
|
||||
score: 0.95,
|
||||
annotatorKind: "LLM",
|
||||
},
|
||||
],
|
||||
});
|
||||
```
|
||||
|
||||
Annotations are visible in the Phoenix UI alongside your traces.
|
||||
|
||||
## Verify
|
||||
|
||||
Required attributes: `input.value`, `output.value`, `status_code`
|
||||
For RAG: `retrieval.documents`
|
||||
For agents: `tool.name`, `tool.parameters`
|
||||
@@ -0,0 +1,137 @@
|
||||
# Production: Continuous Evaluation
|
||||
|
||||
Capability vs regression evals and the ongoing feedback loop.
|
||||
|
||||
## Two Types of Evals
|
||||
|
||||
| Type | Pass Rate Target | Purpose | Update |
|
||||
| ---- | ---------------- | ------- | ------ |
|
||||
| **Capability** | 50-80% | Measure improvement | Add harder cases |
|
||||
| **Regression** | 95-100% | Catch breakage | Add fixed bugs |
|
||||
|
||||
## Saturation
|
||||
|
||||
When capability evals hit >95% pass rate, they're saturated:
|
||||
1. Graduate passing cases to regression suite
|
||||
2. Add new challenging cases to capability suite
|
||||
|
||||
## Feedback Loop
|
||||
|
||||
```
|
||||
Production → Sample traffic → Run evaluators → Find failures
|
||||
↑ ↓
|
||||
Deploy ← Run CI evals ← Create test cases ← Error analysis
|
||||
```
|
||||
|
||||
## Implementation
|
||||
|
||||
Build a continuous monitoring loop:
|
||||
|
||||
1. **Sample recent traces** at regular intervals (e.g., 100 traces per hour)
|
||||
2. **Run evaluators** on sampled traces
|
||||
3. **Log results** to Phoenix for tracking
|
||||
4. **Queue concerning results** for human review
|
||||
5. **Create test cases** from recurring failure patterns
|
||||
|
||||
### Python
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
client = Client()
|
||||
|
||||
# 1. Sample recent spans (includes full attributes for evaluation)
|
||||
spans_df = client.spans.get_spans_dataframe(
|
||||
project_identifier="my-app",
|
||||
start_time=datetime.now() - timedelta(hours=1),
|
||||
root_spans_only=True,
|
||||
limit=100,
|
||||
)
|
||||
|
||||
# 2. Run evaluators
|
||||
from phoenix.evals import evaluate_dataframe
|
||||
|
||||
results_df = evaluate_dataframe(
|
||||
dataframe=spans_df,
|
||||
evaluators=[quality_eval, safety_eval],
|
||||
)
|
||||
|
||||
# 3. Upload results as annotations
|
||||
from phoenix.evals.utils import to_annotation_dataframe
|
||||
|
||||
annotations_df = to_annotation_dataframe(results_df)
|
||||
client.spans.log_span_annotations_dataframe(dataframe=annotations_df)
|
||||
```
|
||||
|
||||
### TypeScript
|
||||
|
||||
```typescript
|
||||
import { getSpans } from "@arizeai/phoenix-client/spans";
|
||||
import { logSpanAnnotations } from "@arizeai/phoenix-client/spans";
|
||||
|
||||
// 1. Sample recent spans
|
||||
const { spans } = await getSpans({
|
||||
project: { projectName: "my-app" },
|
||||
startTime: new Date(Date.now() - 60 * 60 * 1000),
|
||||
parentId: null, // root spans only
|
||||
limit: 100,
|
||||
});
|
||||
|
||||
// 2. Run evaluators (user-defined)
|
||||
const results = await Promise.all(
|
||||
spans.map(async (span) => ({
|
||||
spanId: span.context.span_id,
|
||||
...await runEvaluators(span, [qualityEval, safetyEval]),
|
||||
}))
|
||||
);
|
||||
|
||||
// 3. Upload results as annotations
|
||||
await logSpanAnnotations({
|
||||
spanAnnotations: results.map((r) => ({
|
||||
spanId: r.spanId,
|
||||
name: "quality",
|
||||
score: r.qualityScore,
|
||||
label: r.qualityLabel,
|
||||
annotatorKind: "LLM" as const,
|
||||
})),
|
||||
});
|
||||
```
|
||||
|
||||
For trace-level monitoring (e.g., agent workflows), use `get_traces`/`getTraces` to identify traces:
|
||||
|
||||
```python
|
||||
# Python: identify slow traces
|
||||
traces = client.traces.get_traces(
|
||||
project_identifier="my-app",
|
||||
start_time=datetime.now() - timedelta(hours=1),
|
||||
sort="latency_ms",
|
||||
order="desc",
|
||||
limit=50,
|
||||
)
|
||||
```
|
||||
|
||||
```typescript
|
||||
// TypeScript: identify slow traces
|
||||
import { getTraces } from "@arizeai/phoenix-client/traces";
|
||||
|
||||
const { traces } = await getTraces({
|
||||
project: { projectName: "my-app" },
|
||||
startTime: new Date(Date.now() - 60 * 60 * 1000),
|
||||
limit: 50,
|
||||
});
|
||||
```
|
||||
|
||||
## Alerting
|
||||
|
||||
| Condition | Severity | Action |
|
||||
| --------- | -------- | ------ |
|
||||
| Regression < 98% | Critical | Page oncall |
|
||||
| Capability declining | Warning | Slack notify |
|
||||
| Capability > 95% for 7d | Info | Schedule review |
|
||||
|
||||
## Key Principles
|
||||
|
||||
- **Two suites** - Capability + Regression always
|
||||
- **Graduate cases** - Move consistent passes to regression
|
||||
- **Track trends** - Monitor over time, not just snapshots
|
||||
@@ -0,0 +1,53 @@
|
||||
# Production: Guardrails vs Evaluators
|
||||
|
||||
Guardrails block in real-time. Evaluators measure asynchronously.
|
||||
|
||||
## Key Distinction
|
||||
|
||||
```
|
||||
Request → [INPUT GUARDRAIL] → LLM → [OUTPUT GUARDRAIL] → Response
|
||||
│
|
||||
└──→ ASYNC EVALUATOR (background)
|
||||
```
|
||||
|
||||
## Guardrails
|
||||
|
||||
| Aspect | Requirement |
|
||||
| ------ | ----------- |
|
||||
| Timing | Synchronous, blocking |
|
||||
| Latency | < 100ms |
|
||||
| Purpose | Prevent harm |
|
||||
| Type | Code-based (deterministic) |
|
||||
|
||||
**Use for:** PII detection, prompt injection, profanity, length limits, format validation.
|
||||
|
||||
## Evaluators
|
||||
|
||||
| Aspect | Characteristic |
|
||||
| ------ | -------------- |
|
||||
| Timing | Async, background |
|
||||
| Latency | Can be seconds |
|
||||
| Purpose | Measure quality |
|
||||
| Type | Can use LLMs |
|
||||
|
||||
**Use for:** Helpfulness, faithfulness, tone, completeness, citation accuracy.
|
||||
|
||||
## Decision
|
||||
|
||||
| Question | Answer |
|
||||
| -------- | ------ |
|
||||
| Must block harmful content? | Guardrail |
|
||||
| Measuring quality? | Evaluator |
|
||||
| Need LLM judgment? | Evaluator |
|
||||
| < 100ms required? | Guardrail |
|
||||
| False positives = angry users? | Evaluator |
|
||||
|
||||
## LLM Guardrails: Rarely
|
||||
|
||||
Only use LLM guardrails if:
|
||||
- Latency budget > 1s
|
||||
- Error cost >> LLM cost
|
||||
- Low volume
|
||||
- Fallback exists
|
||||
|
||||
**Key Principle:** Guardrails prevent harm (block). Evaluators measure quality (log).
|
||||
@@ -0,0 +1,92 @@
|
||||
# Production: Overview
|
||||
|
||||
CI/CD evals vs production monitoring - complementary approaches.
|
||||
|
||||
## Two Evaluation Modes
|
||||
|
||||
| Aspect | CI/CD Evals | Production Monitoring |
|
||||
| ------ | ----------- | -------------------- |
|
||||
| **When** | Pre-deployment | Post-deployment, ongoing |
|
||||
| **Data** | Fixed dataset | Sampled traffic |
|
||||
| **Goal** | Prevent regression | Detect drift |
|
||||
| **Response** | Block deploy | Alert & analyze |
|
||||
|
||||
## CI/CD Evaluations
|
||||
|
||||
```python
|
||||
# Fast, deterministic checks
|
||||
ci_evaluators = [
|
||||
has_required_format,
|
||||
no_pii_leak,
|
||||
safety_check,
|
||||
regression_test_suite,
|
||||
]
|
||||
|
||||
# Small but representative dataset (~100 examples)
|
||||
run_experiment(ci_dataset, task, ci_evaluators)
|
||||
```
|
||||
|
||||
Set thresholds: regression=0.95, safety=1.0, format=0.98.
|
||||
|
||||
## Production Monitoring
|
||||
|
||||
### Python
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
client = Client()
|
||||
|
||||
# Sample recent traces (last hour)
|
||||
traces = client.traces.get_traces(
|
||||
project_identifier="my-app",
|
||||
start_time=datetime.now() - timedelta(hours=1),
|
||||
include_spans=True,
|
||||
limit=100,
|
||||
)
|
||||
|
||||
# Run evaluators on sampled traffic
|
||||
for trace in traces:
|
||||
results = run_evaluators_async(trace, production_evaluators)
|
||||
if any(r["score"] < 0.5 for r in results):
|
||||
alert_on_failure(trace, results)
|
||||
```
|
||||
|
||||
### TypeScript
|
||||
|
||||
```typescript
|
||||
import { getTraces } from "@arizeai/phoenix-client/traces";
|
||||
import { getSpans } from "@arizeai/phoenix-client/spans";
|
||||
|
||||
// Sample recent traces (last hour)
|
||||
const { traces } = await getTraces({
|
||||
project: { projectName: "my-app" },
|
||||
startTime: new Date(Date.now() - 60 * 60 * 1000),
|
||||
includeSpans: true,
|
||||
limit: 100,
|
||||
});
|
||||
|
||||
// Or sample spans directly for evaluation
|
||||
const { spans } = await getSpans({
|
||||
project: { projectName: "my-app" },
|
||||
startTime: new Date(Date.now() - 60 * 60 * 1000),
|
||||
limit: 100,
|
||||
});
|
||||
|
||||
// Run evaluators on sampled traffic
|
||||
for (const span of spans) {
|
||||
const results = await runEvaluators(span, productionEvaluators);
|
||||
if (results.some((r) => r.score < 0.5)) {
|
||||
await alertOnFailure(span, results);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Prioritize: errors → negative feedback → random sample.
|
||||
|
||||
## Feedback Loop
|
||||
|
||||
```
|
||||
Production finds failure → Error analysis → Add to CI dataset → Prevents future regression
|
||||
```
|
||||
@@ -0,0 +1,64 @@
|
||||
# Setup: Python
|
||||
|
||||
Packages required for Phoenix evals and experiments.
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
# Core Phoenix package (includes client, evals, otel)
|
||||
pip install arize-phoenix
|
||||
|
||||
# Or install individual packages
|
||||
pip install arize-phoenix-client # Phoenix client only
|
||||
pip install arize-phoenix-evals # Evaluation utilities
|
||||
pip install arize-phoenix-otel # OpenTelemetry integration
|
||||
```
|
||||
|
||||
## LLM Providers
|
||||
|
||||
For LLM-as-judge evaluators, install your provider's SDK:
|
||||
|
||||
```bash
|
||||
pip install openai # OpenAI
|
||||
pip install anthropic # Anthropic
|
||||
pip install google-generativeai # Google
|
||||
```
|
||||
|
||||
## Validation (Optional)
|
||||
|
||||
```bash
|
||||
pip install scikit-learn # For TPR/TNR metrics
|
||||
```
|
||||
|
||||
## Quick Verify
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
from phoenix.evals import LLM, ClassificationEvaluator
|
||||
from phoenix.otel import register
|
||||
|
||||
# All imports should work
|
||||
print("Phoenix Python setup complete")
|
||||
```
|
||||
|
||||
## Key Imports (Evals 2.0)
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
from phoenix.evals import (
|
||||
ClassificationEvaluator, # LLM classification evaluator (preferred)
|
||||
LLM, # Provider-agnostic LLM wrapper
|
||||
async_evaluate_dataframe, # Batch evaluate a DataFrame (preferred, async)
|
||||
evaluate_dataframe, # Batch evaluate a DataFrame (sync)
|
||||
create_evaluator, # Decorator for code-based evaluators
|
||||
create_classifier, # Factory for LLM classification evaluators
|
||||
bind_evaluator, # Map column names to evaluator params
|
||||
Score, # Score dataclass
|
||||
)
|
||||
from phoenix.evals.utils import to_annotation_dataframe # Format results for Phoenix annotations
|
||||
```
|
||||
|
||||
**Prefer**: `ClassificationEvaluator` over `create_classifier` (more parameters/customization).
|
||||
**Prefer**: `async_evaluate_dataframe` over `evaluate_dataframe` (better throughput for LLM evals).
|
||||
|
||||
**Do NOT use** legacy 1.0 imports: `OpenAIModel`, `AnthropicModel`, `run_evals`, `llm_classify`.
|
||||
@@ -0,0 +1,41 @@
|
||||
# Setup: TypeScript
|
||||
|
||||
Packages required for Phoenix evals and experiments.
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
# Using npm
|
||||
npm install @arizeai/phoenix-client @arizeai/phoenix-evals @arizeai/phoenix-otel
|
||||
|
||||
# Using pnpm
|
||||
pnpm add @arizeai/phoenix-client @arizeai/phoenix-evals @arizeai/phoenix-otel
|
||||
```
|
||||
|
||||
## LLM Providers
|
||||
|
||||
For LLM-as-judge evaluators, install Vercel AI SDK providers:
|
||||
|
||||
```bash
|
||||
npm install ai @ai-sdk/openai # Vercel AI SDK + OpenAI
|
||||
npm install @ai-sdk/anthropic # Anthropic
|
||||
npm install @ai-sdk/google # Google
|
||||
```
|
||||
|
||||
Or use direct provider SDKs:
|
||||
|
||||
```bash
|
||||
npm install openai # OpenAI direct
|
||||
npm install @anthropic-ai/sdk # Anthropic direct
|
||||
```
|
||||
|
||||
## Quick Verify
|
||||
|
||||
```typescript
|
||||
import { createClient } from "@arizeai/phoenix-client";
|
||||
import { createClassificationEvaluator } from "@arizeai/phoenix-evals";
|
||||
import { registerPhoenix } from "@arizeai/phoenix-otel";
|
||||
|
||||
// All imports should work
|
||||
console.log("Phoenix TypeScript setup complete");
|
||||
```
|
||||
@@ -0,0 +1,43 @@
|
||||
# Validating Evaluators (Python)
|
||||
|
||||
Validate LLM evaluators against human-labeled examples. Target >80% TPR/TNR/Accuracy.
|
||||
|
||||
## Calculate Metrics
|
||||
|
||||
```python
|
||||
from sklearn.metrics import classification_report, confusion_matrix
|
||||
|
||||
print(classification_report(human_labels, evaluator_predictions))
|
||||
|
||||
cm = confusion_matrix(human_labels, evaluator_predictions)
|
||||
tn, fp, fn, tp = cm.ravel()
|
||||
tpr = tp / (tp + fn)
|
||||
tnr = tn / (tn + fp)
|
||||
print(f"TPR: {tpr:.2f}, TNR: {tnr:.2f}")
|
||||
```
|
||||
|
||||
## Correct Production Estimates
|
||||
|
||||
```python
|
||||
def correct_estimate(observed, tpr, tnr):
|
||||
"""Adjust observed pass rate using known TPR/TNR."""
|
||||
return (observed - (1 - tnr)) / (tpr - (1 - tnr))
|
||||
```
|
||||
|
||||
## Find Misclassified
|
||||
|
||||
```python
|
||||
# False Positives: Evaluator pass, human fail
|
||||
fp_mask = (evaluator_predictions == 1) & (human_labels == 0)
|
||||
false_positives = dataset[fp_mask]
|
||||
|
||||
# False Negatives: Evaluator fail, human pass
|
||||
fn_mask = (evaluator_predictions == 0) & (human_labels == 1)
|
||||
false_negatives = dataset[fn_mask]
|
||||
```
|
||||
|
||||
## Red Flags
|
||||
|
||||
- TPR or TNR < 70%
|
||||
- Large gap between TPR and TNR
|
||||
- Kappa < 0.6
|
||||
@@ -0,0 +1,106 @@
|
||||
# Validating Evaluators (TypeScript)
|
||||
|
||||
Validate an LLM evaluator against human-labeled examples before deploying it.
|
||||
Target: **>80% TPR and >80% TNR**.
|
||||
|
||||
Roles are inverted compared to a normal task experiment:
|
||||
|
||||
| Normal experiment | Evaluator validation |
|
||||
|---|---|
|
||||
| Task = agent logic | Task = run the evaluator under test |
|
||||
| Evaluator = judge output | Evaluator = exact-match vs human ground truth |
|
||||
| Dataset = agent examples | Dataset = golden hand-labeled examples |
|
||||
|
||||
## Golden Dataset
|
||||
|
||||
Use a separate dataset name so validation experiments don't mix with task experiments in Phoenix.
|
||||
Store human ground truth in `metadata.groundTruthLabel`. Aim for ~50/50 balance:
|
||||
|
||||
```typescript
|
||||
import type { Example } from "@arizeai/phoenix-client/types/datasets";
|
||||
|
||||
const goldenExamples: Example[] = [
|
||||
{ input: { q: "Capital of France?" }, output: { answer: "Paris" }, metadata: { groundTruthLabel: "correct" } },
|
||||
{ input: { q: "Capital of France?" }, output: { answer: "Lyon" }, metadata: { groundTruthLabel: "incorrect" } },
|
||||
{ input: { q: "Capital of France?" }, output: { answer: "Major city..." }, metadata: { groundTruthLabel: "incorrect" } },
|
||||
];
|
||||
|
||||
const VALIDATOR_DATASET = "my-app-qa-evaluator-validation"; // separate from task dataset
|
||||
const POSITIVE_LABEL = "correct";
|
||||
const NEGATIVE_LABEL = "incorrect";
|
||||
```
|
||||
|
||||
## Validation Experiment
|
||||
|
||||
```typescript
|
||||
import { createClient } from "@arizeai/phoenix-client";
|
||||
import { createOrGetDataset, getDatasetExamples } from "@arizeai/phoenix-client/datasets";
|
||||
import { asExperimentEvaluator, runExperiment } from "@arizeai/phoenix-client/experiments";
|
||||
import { myEvaluator } from "./myEvaluator.js";
|
||||
|
||||
const client = createClient();
|
||||
|
||||
const { datasetId } = await createOrGetDataset({ client, name: VALIDATOR_DATASET, examples: goldenExamples });
|
||||
const { examples } = await getDatasetExamples({ client, dataset: { datasetId } });
|
||||
const groundTruth = new Map(examples.map((ex) => [ex.id, ex.metadata?.groundTruthLabel as string]));
|
||||
|
||||
// Task: invoke the evaluator under test
|
||||
const task = async (example: (typeof examples)[number]) => {
|
||||
const result = await myEvaluator.evaluate({ input: example.input, output: example.output, metadata: example.metadata });
|
||||
return result.label ?? "unknown";
|
||||
};
|
||||
|
||||
// Evaluator: exact-match against human ground truth
|
||||
const exactMatch = asExperimentEvaluator({
|
||||
name: "exact-match", kind: "CODE",
|
||||
evaluate: ({ output, metadata }) => {
|
||||
const expected = metadata?.groundTruthLabel as string;
|
||||
const predicted = typeof output === "string" ? output : "unknown";
|
||||
return { score: predicted === expected ? 1 : 0, label: predicted, explanation: `Expected: ${expected}, Got: ${predicted}` };
|
||||
},
|
||||
});
|
||||
|
||||
const experiment = await runExperiment({
|
||||
client, experimentName: `evaluator-validation-${Date.now()}`,
|
||||
dataset: { datasetId }, task, evaluators: [exactMatch],
|
||||
});
|
||||
|
||||
// Compute confusion matrix
|
||||
const runs = Object.values(experiment.runs);
|
||||
const predicted = new Map((experiment.evaluationRuns ?? [])
|
||||
.filter((e) => e.name === "exact-match")
|
||||
.map((e) => [e.experimentRunId, e.result?.label ?? null]));
|
||||
|
||||
let tp = 0, fp = 0, tn = 0, fn = 0;
|
||||
for (const run of runs) {
|
||||
if (run.error) continue;
|
||||
const p = predicted.get(run.id), a = groundTruth.get(run.datasetExampleId);
|
||||
if (!p || !a) continue;
|
||||
if (a === POSITIVE_LABEL && p === POSITIVE_LABEL) tp++;
|
||||
else if (a === NEGATIVE_LABEL && p === POSITIVE_LABEL) fp++;
|
||||
else if (a === NEGATIVE_LABEL && p === NEGATIVE_LABEL) tn++;
|
||||
else if (a === POSITIVE_LABEL && p === NEGATIVE_LABEL) fn++;
|
||||
}
|
||||
const total = tp + fp + tn + fn;
|
||||
const tpr = tp + fn > 0 ? (tp / (tp + fn)) * 100 : 0;
|
||||
const tnr = tn + fp > 0 ? (tn / (tn + fp)) * 100 : 0;
|
||||
console.log(`TPR: ${tpr.toFixed(1)}% TNR: ${tnr.toFixed(1)}% Accuracy: ${((tp + tn) / total * 100).toFixed(1)}%`);
|
||||
```
|
||||
|
||||
## Results & Quality Rules
|
||||
|
||||
| Metric | Target | Low value means |
|
||||
|---|---|---|
|
||||
| TPR (sensitivity) | >80% | Misses real failures (false negatives) |
|
||||
| TNR (specificity) | >80% | Flags good outputs (false positives) |
|
||||
| Accuracy | >80% | General weakness |
|
||||
|
||||
**Golden dataset rules:** ~50/50 balance · include edge cases · human-labeled only · never mutate (append new versions) · 20–50 examples is enough.
|
||||
|
||||
**Re-validate when:** prompt template changes · judge model changes · criteria updated · production FP/FN spike.
|
||||
|
||||
## See Also
|
||||
|
||||
- `validation.md` — Metric definitions and concepts
|
||||
- `experiments-running-typescript.md` — `runExperiment` API
|
||||
- `experiments-datasets-typescript.md` — `createOrGetDataset` / `getDatasetExamples`
|
||||
@@ -0,0 +1,74 @@
|
||||
# Validation
|
||||
|
||||
Validate LLM judges against human labels before deploying. Target >80% agreement.
|
||||
|
||||
## Requirements
|
||||
|
||||
| Requirement | Target |
|
||||
| ----------- | ------ |
|
||||
| Test set size | 100+ examples |
|
||||
| Balance | ~50/50 pass/fail |
|
||||
| Accuracy | >80% |
|
||||
| TPR/TNR | Both >70% |
|
||||
|
||||
## Metrics
|
||||
|
||||
| Metric | Formula | Use When |
|
||||
| ------ | ------- | -------- |
|
||||
| **Accuracy** | (TP+TN) / Total | General |
|
||||
| **TPR (Recall)** | TP / (TP+FN) | Quality assurance |
|
||||
| **TNR (Specificity)** | TN / (TN+FP) | Safety-critical |
|
||||
| **Cohen's Kappa** | Agreement beyond chance | Comparing evaluators |
|
||||
|
||||
## Quick Validation
|
||||
|
||||
```python
|
||||
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score
|
||||
|
||||
print(classification_report(human_labels, evaluator_predictions))
|
||||
print(f"Kappa: {cohen_kappa_score(human_labels, evaluator_predictions):.3f}")
|
||||
|
||||
# Get TPR/TNR
|
||||
cm = confusion_matrix(human_labels, evaluator_predictions)
|
||||
tn, fp, fn, tp = cm.ravel()
|
||||
tpr = tp / (tp + fn)
|
||||
tnr = tn / (tn + fp)
|
||||
```
|
||||
|
||||
## Golden Dataset Structure
|
||||
|
||||
```python
|
||||
golden_example = {
|
||||
"input": "What is the capital of France?",
|
||||
"output": "Paris is the capital.",
|
||||
"ground_truth_label": "correct",
|
||||
}
|
||||
```
|
||||
|
||||
## Building Golden Datasets
|
||||
|
||||
1. Sample production traces (errors, negative feedback, edge cases)
|
||||
2. Balance ~50/50 pass/fail
|
||||
3. Have a domain expert label each example
|
||||
4. Version datasets (never modify existing)
|
||||
|
||||
```python
|
||||
# GOOD - create new version
|
||||
golden_v2 = golden_v1 + [new_examples]
|
||||
|
||||
# BAD - never modify existing
|
||||
golden_v1.append(new_example)
|
||||
```
|
||||
|
||||
## Warning Signs
|
||||
|
||||
- All pass or all fail → too lenient/strict
|
||||
- Random results → criteria unclear
|
||||
- TPR/TNR < 70% → needs improvement
|
||||
|
||||
## Re-Validate When
|
||||
|
||||
- Prompt template changes
|
||||
- Judge model changes
|
||||
- Criteria changes
|
||||
- Monthly
|
||||
24
plugins/phoenix/skills/phoenix-tracing/README.md
Normal file
24
plugins/phoenix/skills/phoenix-tracing/README.md
Normal file
@@ -0,0 +1,24 @@
|
||||
# Phoenix Tracing Skill
|
||||
|
||||
OpenInference semantic conventions and instrumentation guides for Phoenix.
|
||||
|
||||
## Usage
|
||||
|
||||
Start with `SKILL.md` for the index and quick reference.
|
||||
|
||||
## File Organization
|
||||
|
||||
All files in flat `rules/` directory with semantic prefixes:
|
||||
|
||||
- `span-*` - Span kinds (LLM, CHAIN, TOOL, etc.)
|
||||
- `setup-*`, `instrumentation-*` - Getting started guides
|
||||
- `fundamentals-*`, `attributes-*` - Reference docs
|
||||
- `annotations-*`, `export-*` - Advanced features
|
||||
|
||||
## Reference
|
||||
|
||||
- [OpenInference Spec](https://github.com/Arize-ai/openinference/tree/main/spec)
|
||||
- [Phoenix Documentation](https://docs.arize.com/phoenix)
|
||||
- [Python OTEL API](https://arize-phoenix.readthedocs.io/projects/otel/en/latest/)
|
||||
- [Python Client API](https://arize-phoenix.readthedocs.io/projects/client/en/latest/)
|
||||
- [TypeScript API](https://arize-ai.github.io/phoenix/)
|
||||
139
plugins/phoenix/skills/phoenix-tracing/SKILL.md
Normal file
139
plugins/phoenix/skills/phoenix-tracing/SKILL.md
Normal file
@@ -0,0 +1,139 @@
|
||||
---
|
||||
name: phoenix-tracing
|
||||
description: OpenInference semantic conventions and instrumentation for Phoenix AI observability. Use when implementing LLM tracing, creating custom spans, or deploying to production.
|
||||
license: Apache-2.0
|
||||
compatibility: Requires Phoenix server. Python skills need arize-phoenix-otel; TypeScript skills need @arizeai/phoenix-otel.
|
||||
metadata:
|
||||
author: oss@arize.com
|
||||
version: "1.0.0"
|
||||
languages: "Python, TypeScript"
|
||||
---
|
||||
|
||||
# Phoenix Tracing
|
||||
|
||||
Comprehensive guide for instrumenting LLM applications with OpenInference tracing in Phoenix. Contains reference files covering setup, instrumentation, span types, and production deployment.
|
||||
|
||||
## When to Apply
|
||||
|
||||
Reference these guidelines when:
|
||||
|
||||
- Setting up Phoenix tracing (Python or TypeScript)
|
||||
- Creating custom spans for LLM operations
|
||||
- Adding attributes following OpenInference conventions
|
||||
- Deploying tracing to production
|
||||
- Querying and analyzing trace data
|
||||
|
||||
## Reference Categories
|
||||
|
||||
| Priority | Category | Description | Prefix |
|
||||
| -------- | --------------- | ------------------------------ | -------------------------- |
|
||||
| 1 | Setup | Installation and configuration | `setup-*` |
|
||||
| 2 | Instrumentation | Auto and manual tracing | `instrumentation-*` |
|
||||
| 3 | Span Types | 9 span kinds with attributes | `span-*` |
|
||||
| 4 | Organization | Projects and sessions | `projects-*`, `sessions-*` |
|
||||
| 5 | Enrichment | Custom metadata | `metadata-*` |
|
||||
| 6 | Production | Batch processing, masking | `production-*` |
|
||||
| 7 | Feedback | Annotations and evaluation | `annotations-*` |
|
||||
|
||||
## Quick Reference
|
||||
|
||||
### 1. Setup (START HERE)
|
||||
|
||||
- [setup-python](references/setup-python.md) - Install arize-phoenix-otel, configure endpoint
|
||||
- [setup-typescript](references/setup-typescript.md) - Install @arizeai/phoenix-otel, configure endpoint
|
||||
|
||||
### 2. Instrumentation
|
||||
|
||||
- [instrumentation-auto-python](references/instrumentation-auto-python.md) - Auto-instrument OpenAI, LangChain, etc.
|
||||
- [instrumentation-auto-typescript](references/instrumentation-auto-typescript.md) - Auto-instrument supported frameworks
|
||||
- [instrumentation-manual-python](references/instrumentation-manual-python.md) - Custom spans with decorators
|
||||
- [instrumentation-manual-typescript](references/instrumentation-manual-typescript.md) - Custom spans with wrappers
|
||||
|
||||
### 3. Span Types (with full attribute schemas)
|
||||
|
||||
- [span-llm](references/span-llm.md) - LLM API calls (model, tokens, messages, cost)
|
||||
- [span-chain](references/span-chain.md) - Multi-step workflows and pipelines
|
||||
- [span-retriever](references/span-retriever.md) - Document retrieval (documents, scores)
|
||||
- [span-tool](references/span-tool.md) - Function/API calls (name, parameters)
|
||||
- [span-agent](references/span-agent.md) - Multi-step reasoning agents
|
||||
- [span-embedding](references/span-embedding.md) - Vector generation
|
||||
- [span-reranker](references/span-reranker.md) - Document re-ranking
|
||||
- [span-guardrail](references/span-guardrail.md) - Safety checks
|
||||
- [span-evaluator](references/span-evaluator.md) - LLM evaluation
|
||||
|
||||
### 4. Organization
|
||||
|
||||
- [projects-python](references/projects-python.md) / [projects-typescript](references/projects-typescript.md) - Group traces by application
|
||||
- [sessions-python](references/sessions-python.md) / [sessions-typescript](references/sessions-typescript.md) - Track conversations
|
||||
|
||||
### 5. Enrichment
|
||||
|
||||
- [metadata-python](references/metadata-python.md) / [metadata-typescript](references/metadata-typescript.md) - Custom attributes
|
||||
|
||||
### 6. Production (CRITICAL)
|
||||
|
||||
- [production-python](references/production-python.md) / [production-typescript](references/production-typescript.md) - Batch processing, PII masking
|
||||
|
||||
### 7. Feedback
|
||||
|
||||
- [annotations-overview](references/annotations-overview.md) - Feedback concepts
|
||||
- [annotations-python](references/annotations-python.md) / [annotations-typescript](references/annotations-typescript.md) - Add feedback to spans
|
||||
|
||||
### Reference Files
|
||||
|
||||
- [fundamentals-overview](references/fundamentals-overview.md) - Traces, spans, attributes basics
|
||||
- [fundamentals-required-attributes](references/fundamentals-required-attributes.md) - Required fields per span type
|
||||
- [fundamentals-universal-attributes](references/fundamentals-universal-attributes.md) - Common attributes (user.id, session.id)
|
||||
- [fundamentals-flattening](references/fundamentals-flattening.md) - JSON flattening rules
|
||||
- [attributes-messages](references/attributes-messages.md) - Chat message format
|
||||
- [attributes-metadata](references/attributes-metadata.md) - Custom metadata schema
|
||||
- [attributes-graph](references/attributes-graph.md) - Agent workflow attributes
|
||||
- [attributes-exceptions](references/attributes-exceptions.md) - Error tracking
|
||||
|
||||
## Common Workflows
|
||||
|
||||
- **Quick Start**: setup-{lang} → instrumentation-auto-{lang} → Check Phoenix
|
||||
- **Custom Spans**: setup-{lang} → instrumentation-manual-{lang} → span-{type}
|
||||
- **Session Tracking**: sessions-{lang} for conversation grouping patterns
|
||||
- **Production**: production-{lang} for batching, masking, and deployment
|
||||
|
||||
## How to Use This Skill
|
||||
|
||||
**Navigation Patterns:**
|
||||
|
||||
```bash
|
||||
# By category prefix
|
||||
references/setup-* # Installation and configuration
|
||||
references/instrumentation-* # Auto and manual tracing
|
||||
references/span-* # Span type specifications
|
||||
references/sessions-* # Session tracking
|
||||
references/production-* # Production deployment
|
||||
references/fundamentals-* # Core concepts
|
||||
references/attributes-* # Attribute specifications
|
||||
|
||||
# By language
|
||||
references/*-python.md # Python implementations
|
||||
references/*-typescript.md # TypeScript implementations
|
||||
```
|
||||
|
||||
**Reading Order:**
|
||||
1. Start with setup-{lang} for your language
|
||||
2. Choose instrumentation-auto-{lang} OR instrumentation-manual-{lang}
|
||||
3. Reference span-{type} files as needed for specific operations
|
||||
4. See fundamentals-* files for attribute specifications
|
||||
|
||||
## References
|
||||
|
||||
**Phoenix Documentation:**
|
||||
|
||||
- [Phoenix Documentation](https://docs.arize.com/phoenix)
|
||||
- [OpenInference Spec](https://github.com/Arize-ai/openinference/tree/main/spec)
|
||||
|
||||
**Python API Documentation:**
|
||||
|
||||
- [Python OTEL Package](https://arize-phoenix.readthedocs.io/projects/otel/en/latest/) - `arize-phoenix-otel` API reference
|
||||
- [Python Client Package](https://arize-phoenix.readthedocs.io/projects/client/en/latest/) - `arize-phoenix-client` API reference
|
||||
|
||||
**TypeScript API Documentation:**
|
||||
|
||||
- [TypeScript Packages](https://arize-ai.github.io/phoenix/) - `@arizeai/phoenix-otel`, `@arizeai/phoenix-client`, and other TypeScript packages
|
||||
@@ -0,0 +1,69 @@
|
||||
# Annotations Overview
|
||||
|
||||
Annotations allow you to add human or automated feedback to traces, spans, documents, and sessions. Annotations are essential for evaluation, quality assessment, and building training datasets.
|
||||
|
||||
## Annotation Types
|
||||
|
||||
Phoenix supports four types of annotations:
|
||||
|
||||
| Type | Target | Purpose | Example Use Case |
|
||||
| ----------------------- | -------------------------------- | ---------------------------------------- | -------------------------------- |
|
||||
| **Span Annotation** | Individual span | Feedback on a specific operation | "This LLM response was accurate" |
|
||||
| **Document Annotation** | Document within a RETRIEVER span | Feedback on retrieved document relevance | "This document was not helpful" |
|
||||
| **Trace Annotation** | Entire trace | Feedback on end-to-end interaction | "User was satisfied with result" |
|
||||
| **Session Annotation** | User session | Feedback on multi-turn conversation | "Session ended successfully" |
|
||||
|
||||
## Annotation Fields
|
||||
|
||||
Every annotation has these fields:
|
||||
|
||||
### Required Fields
|
||||
|
||||
| Field | Type | Description |
|
||||
| --------- | ------ | ----------------------------------------------------------------------------- |
|
||||
| Entity ID | String | ID of the target entity (span_id, trace_id, session_id, or document_position) |
|
||||
| `name` | String | Annotation name/label (e.g., "quality", "relevance", "helpfulness") |
|
||||
|
||||
### Result Fields (At Least One Required)
|
||||
|
||||
| Field | Type | Description |
|
||||
| ------------- | ----------------- | ----------------------------------------------------------------- |
|
||||
| `label` | String (optional) | Categorical value (e.g., "good", "bad", "relevant", "irrelevant") |
|
||||
| `score` | Float (optional) | Numeric value (typically 0-1, but can be any range) |
|
||||
| `explanation` | String (optional) | Free-text explanation of the annotation |
|
||||
|
||||
**At least one** of `label`, `score`, or `explanation` must be provided.
|
||||
|
||||
### Optional Fields
|
||||
|
||||
| Field | Type | Description |
|
||||
| ---------------- | ------ | --------------------------------------------------------------------------------------- |
|
||||
| `annotator_kind` | String | Who created this annotation: "HUMAN", "LLM", or "CODE" (default: "HUMAN") |
|
||||
| `identifier` | String | Unique identifier for upsert behavior (updates existing if same name+entity+identifier) |
|
||||
| `metadata` | Object | Custom metadata as key-value pairs |
|
||||
|
||||
## Annotator Kinds
|
||||
|
||||
| Kind | Description | Example |
|
||||
| ------- | ------------------------------ | --------------------------------- |
|
||||
| `HUMAN` | Manual feedback from a person | User ratings, expert labels |
|
||||
| `LLM` | Automated feedback from an LLM | GPT-4 evaluating response quality |
|
||||
| `CODE` | Automated feedback from code | Rule-based checks, heuristics |
|
||||
|
||||
## Examples
|
||||
|
||||
**Quality Assessment:**
|
||||
|
||||
- `quality` - Overall quality (label: good/fair/poor, score: 0-1)
|
||||
- `correctness` - Factual accuracy (label: correct/incorrect, score: 0-1)
|
||||
- `helpfulness` - User satisfaction (label: helpful/not_helpful, score: 0-1)
|
||||
|
||||
**RAG-Specific:**
|
||||
|
||||
- `relevance` - Document relevance to query (label: relevant/irrelevant, score: 0-1)
|
||||
- `faithfulness` - Answer grounded in context (label: faithful/unfaithful, score: 0-1)
|
||||
|
||||
**Safety:**
|
||||
|
||||
- `toxicity` - Contains harmful content (score: 0-1)
|
||||
- `pii_detected` - Contains personally identifiable information (label: yes/no)
|
||||
@@ -0,0 +1,114 @@
|
||||
# Python SDK Annotation Patterns
|
||||
|
||||
Add feedback to spans, traces, documents, and sessions using the Python client.
|
||||
|
||||
## Client Setup
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
client = Client() # Default: http://localhost:6006
|
||||
```
|
||||
|
||||
## Span Annotations
|
||||
|
||||
Add feedback to individual spans:
|
||||
|
||||
```python
|
||||
client.spans.add_span_annotation(
|
||||
span_id="abc123",
|
||||
annotation_name="quality",
|
||||
annotator_kind="HUMAN",
|
||||
label="high_quality",
|
||||
score=0.95,
|
||||
explanation="Accurate and well-formatted",
|
||||
metadata={"reviewer": "alice"},
|
||||
sync=True
|
||||
)
|
||||
```
|
||||
|
||||
## Document Annotations
|
||||
|
||||
Rate individual documents in RETRIEVER spans:
|
||||
|
||||
```python
|
||||
client.spans.add_document_annotation(
|
||||
span_id="retriever_span",
|
||||
document_position=0, # 0-based index
|
||||
annotation_name="relevance",
|
||||
annotator_kind="LLM",
|
||||
label="relevant",
|
||||
score=0.95
|
||||
)
|
||||
```
|
||||
|
||||
## Trace Annotations
|
||||
|
||||
Feedback on entire traces:
|
||||
|
||||
```python
|
||||
client.traces.add_trace_annotation(
|
||||
trace_id="trace_abc",
|
||||
annotation_name="correctness",
|
||||
annotator_kind="HUMAN",
|
||||
label="correct",
|
||||
score=1.0
|
||||
)
|
||||
```
|
||||
|
||||
## Session Annotations
|
||||
|
||||
Feedback on multi-turn conversations:
|
||||
|
||||
```python
|
||||
client.sessions.add_session_annotation(
|
||||
session_id="session_xyz",
|
||||
annotation_name="user_satisfaction",
|
||||
annotator_kind="HUMAN",
|
||||
label="satisfied",
|
||||
score=0.85
|
||||
)
|
||||
```
|
||||
|
||||
## RAG Pipeline Example
|
||||
|
||||
```python
|
||||
from phoenix.client import Client
|
||||
from phoenix.client.resources.spans import SpanDocumentAnnotationData
|
||||
|
||||
client = Client()
|
||||
|
||||
# Document relevance (batch)
|
||||
client.spans.log_document_annotations(
|
||||
document_annotations=[
|
||||
SpanDocumentAnnotationData(
|
||||
name="relevance", span_id="retriever_span", document_position=i,
|
||||
annotator_kind="LLM", result={"label": label, "score": score}
|
||||
)
|
||||
for i, (label, score) in enumerate([
|
||||
("relevant", 0.95), ("relevant", 0.80), ("irrelevant", 0.10)
|
||||
])
|
||||
]
|
||||
)
|
||||
|
||||
# LLM response quality
|
||||
client.spans.add_span_annotation(
|
||||
span_id="llm_span",
|
||||
annotation_name="faithfulness",
|
||||
annotator_kind="LLM",
|
||||
label="faithful",
|
||||
score=0.90
|
||||
)
|
||||
|
||||
# Overall trace quality
|
||||
client.traces.add_trace_annotation(
|
||||
trace_id="trace_123",
|
||||
annotation_name="correctness",
|
||||
annotator_kind="HUMAN",
|
||||
label="correct",
|
||||
score=1.0
|
||||
)
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
- [Python Client API](https://arize-phoenix.readthedocs.io/projects/client/en/latest/)
|
||||
@@ -0,0 +1,137 @@
|
||||
# TypeScript SDK Annotation Patterns
|
||||
|
||||
Add feedback to spans, traces, documents, and sessions using the TypeScript client.
|
||||
|
||||
## Client Setup
|
||||
|
||||
```typescript
|
||||
import { createClient } from "phoenix-client";
|
||||
const client = createClient(); // Default: http://localhost:6006
|
||||
```
|
||||
|
||||
## Span Annotations
|
||||
|
||||
Add feedback to individual spans:
|
||||
|
||||
```typescript
|
||||
import { addSpanAnnotation } from "phoenix-client";
|
||||
|
||||
await addSpanAnnotation({
|
||||
client,
|
||||
spanAnnotation: {
|
||||
spanId: "abc123",
|
||||
name: "quality",
|
||||
annotatorKind: "HUMAN",
|
||||
label: "high_quality",
|
||||
score: 0.95,
|
||||
explanation: "Accurate and well-formatted",
|
||||
metadata: { reviewer: "alice" }
|
||||
},
|
||||
sync: true
|
||||
});
|
||||
```
|
||||
|
||||
## Document Annotations
|
||||
|
||||
Rate individual documents in RETRIEVER spans:
|
||||
|
||||
```typescript
|
||||
import { addDocumentAnnotation } from "phoenix-client";
|
||||
|
||||
await addDocumentAnnotation({
|
||||
client,
|
||||
documentAnnotation: {
|
||||
spanId: "retriever_span",
|
||||
documentPosition: 0, // 0-based index
|
||||
name: "relevance",
|
||||
annotatorKind: "LLM",
|
||||
label: "relevant",
|
||||
score: 0.95
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
## Trace Annotations
|
||||
|
||||
Feedback on entire traces:
|
||||
|
||||
```typescript
|
||||
import { addTraceAnnotation } from "phoenix-client";
|
||||
|
||||
await addTraceAnnotation({
|
||||
client,
|
||||
traceAnnotation: {
|
||||
traceId: "trace_abc",
|
||||
name: "correctness",
|
||||
annotatorKind: "HUMAN",
|
||||
label: "correct",
|
||||
score: 1.0
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
## Session Annotations
|
||||
|
||||
Feedback on multi-turn conversations:
|
||||
|
||||
```typescript
|
||||
import { addSessionAnnotation } from "phoenix-client";
|
||||
|
||||
await addSessionAnnotation({
|
||||
client,
|
||||
sessionAnnotation: {
|
||||
sessionId: "session_xyz",
|
||||
name: "user_satisfaction",
|
||||
annotatorKind: "HUMAN",
|
||||
label: "satisfied",
|
||||
score: 0.85
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
## RAG Pipeline Example
|
||||
|
||||
```typescript
|
||||
import { createClient, logDocumentAnnotations, addSpanAnnotation, addTraceAnnotation } from "phoenix-client";
|
||||
|
||||
const client = createClient();
|
||||
|
||||
// Document relevance (batch)
|
||||
await logDocumentAnnotations({
|
||||
client,
|
||||
documentAnnotations: [
|
||||
{ spanId: "retriever_span", documentPosition: 0, name: "relevance",
|
||||
annotatorKind: "LLM", label: "relevant", score: 0.95 },
|
||||
{ spanId: "retriever_span", documentPosition: 1, name: "relevance",
|
||||
annotatorKind: "LLM", label: "relevant", score: 0.80 }
|
||||
]
|
||||
});
|
||||
|
||||
// LLM response quality
|
||||
await addSpanAnnotation({
|
||||
client,
|
||||
spanAnnotation: {
|
||||
spanId: "llm_span",
|
||||
name: "faithfulness",
|
||||
annotatorKind: "LLM",
|
||||
label: "faithful",
|
||||
score: 0.90
|
||||
}
|
||||
});
|
||||
|
||||
// Overall trace quality
|
||||
await addTraceAnnotation({
|
||||
client,
|
||||
traceAnnotation: {
|
||||
traceId: "trace_123",
|
||||
name: "correctness",
|
||||
annotatorKind: "HUMAN",
|
||||
label: "correct",
|
||||
score: 1.0
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
- [TypeScript Client API](https://arize-ai.github.io/phoenix/)
|
||||
@@ -0,0 +1,58 @@
|
||||
# Flattening Convention
|
||||
|
||||
OpenInference flattens nested data structures into dot-notation attributes for database compatibility, OpenTelemetry compatibility, and simple querying.
|
||||
|
||||
## Flattening Rules
|
||||
|
||||
**Objects → Dot Notation**
|
||||
|
||||
```javascript
|
||||
{ llm: { model_name: "gpt-4", token_count: { prompt: 10, completion: 20 } } }
|
||||
// becomes
|
||||
{ "llm.model_name": "gpt-4", "llm.token_count.prompt": 10, "llm.token_count.completion": 20 }
|
||||
```
|
||||
|
||||
**Arrays → Zero-Indexed Notation**
|
||||
|
||||
```javascript
|
||||
{ llm: { input_messages: [{ role: "user", content: "Hi" }] } }
|
||||
// becomes
|
||||
{ "llm.input_messages.0.message.role": "user", "llm.input_messages.0.message.content": "Hi" }
|
||||
```
|
||||
|
||||
**Message Convention: `.message.` segment required**
|
||||
|
||||
```
|
||||
llm.input_messages.{index}.message.{field}
|
||||
llm.input_messages.0.message.tool_calls.0.tool_call.function.name
|
||||
```
|
||||
|
||||
## Complete Example
|
||||
|
||||
```javascript
|
||||
// Original
|
||||
{
|
||||
openinference: { span: { kind: "LLM" } },
|
||||
llm: {
|
||||
model_name: "claude-3-5-sonnet-20241022",
|
||||
invocation_parameters: { temperature: 0.7, max_tokens: 1000 },
|
||||
input_messages: [{ role: "user", content: "Tell me a joke" }],
|
||||
output_messages: [{ role: "assistant", content: "Why did the chicken cross the road?" }],
|
||||
token_count: { prompt: 5, completion: 10, total: 15 }
|
||||
}
|
||||
}
|
||||
|
||||
// Flattened (stored in Phoenix spans.attributes JSONB)
|
||||
{
|
||||
"openinference.span.kind": "LLM",
|
||||
"llm.model_name": "claude-3-5-sonnet-20241022",
|
||||
"llm.invocation_parameters": "{\"temperature\": 0.7, \"max_tokens\": 1000}",
|
||||
"llm.input_messages.0.message.role": "user",
|
||||
"llm.input_messages.0.message.content": "Tell me a joke",
|
||||
"llm.output_messages.0.message.role": "assistant",
|
||||
"llm.output_messages.0.message.content": "Why did the chicken cross the road?",
|
||||
"llm.token_count.prompt": 5,
|
||||
"llm.token_count.completion": 10,
|
||||
"llm.token_count.total": 15
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,53 @@
|
||||
# Overview and Traces & Spans
|
||||
|
||||
This document covers the fundamental concepts of OpenInference traces and spans in Phoenix.
|
||||
|
||||
## Overview
|
||||
|
||||
OpenInference is a set of semantic conventions for AI and LLM applications based on OpenTelemetry. Phoenix uses these conventions to capture, store, and analyze traces from AI applications.
|
||||
|
||||
**Key Concepts:**
|
||||
|
||||
- **Traces** represent end-to-end requests through your application
|
||||
- **Spans** represent individual operations within a trace (LLM calls, retrievals, tool invocations)
|
||||
- **Attributes** are key-value pairs attached to spans using flattened, dot-notation paths
|
||||
- **Span Kinds** categorize the type of operation (LLM, RETRIEVER, TOOL, etc.)
|
||||
|
||||
## Traces and Spans
|
||||
|
||||
### Trace Hierarchy
|
||||
|
||||
A **trace** is a tree of **spans** representing a complete request:
|
||||
|
||||
```
|
||||
Trace ID: abc123
|
||||
├─ Span 1: CHAIN (root span, parent_id = null)
|
||||
│ ├─ Span 2: RETRIEVER (parent_id = span_1_id)
|
||||
│ │ └─ Span 3: EMBEDDING (parent_id = span_2_id)
|
||||
│ └─ Span 4: LLM (parent_id = span_1_id)
|
||||
│ └─ Span 5: TOOL (parent_id = span_4_id)
|
||||
```
|
||||
|
||||
### Context Propagation
|
||||
|
||||
Spans maintain parent-child relationships via:
|
||||
|
||||
- `trace_id` - Same for all spans in a trace
|
||||
- `span_id` - Unique identifier for this span
|
||||
- `parent_id` - References parent span's `span_id` (null for root spans)
|
||||
|
||||
Phoenix uses these relationships to:
|
||||
|
||||
- Build the span tree visualization in the UI
|
||||
- Calculate cumulative metrics (tokens, errors) up the tree
|
||||
- Enable nested querying (e.g., "find CHAIN spans containing LLM spans with errors")
|
||||
|
||||
### Span Lifecycle
|
||||
|
||||
Each span has:
|
||||
|
||||
- `start_time` - When the operation began (Unix timestamp in nanoseconds)
|
||||
- `end_time` - When the operation completed
|
||||
- `status_code` - OK, ERROR, or UNSET
|
||||
- `status_message` - Optional error message
|
||||
- `attributes` - Object containing all semantic convention attributes
|
||||
@@ -0,0 +1,64 @@
|
||||
# Required and Recommended Attributes
|
||||
|
||||
This document covers the required attribute and highly recommended attributes for all OpenInference spans.
|
||||
|
||||
## Required Attribute
|
||||
|
||||
**Every span MUST have exactly one required attribute:**
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "LLM"
|
||||
}
|
||||
```
|
||||
|
||||
## Highly Recommended Attributes
|
||||
|
||||
While not strictly required, these attributes are **highly recommended** on all spans as they:
|
||||
- Enable evaluation and quality assessment
|
||||
- Help understand information flow through your application
|
||||
- Make traces more useful for debugging
|
||||
|
||||
### Input/Output Values
|
||||
|
||||
| Attribute | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `input.value` | String | Input to the operation (prompt, query, document) |
|
||||
| `output.value` | String | Output from the operation (response, result, answer) |
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "LLM",
|
||||
"input.value": "What is the capital of France?",
|
||||
"output.value": "The capital of France is Paris."
|
||||
}
|
||||
```
|
||||
|
||||
**Why these matter:**
|
||||
- **Evaluations**: Many evaluators (faithfulness, relevance, hallucination detection) require both input and output to assess quality
|
||||
- **Information flow**: Seeing inputs/outputs makes it easy to trace how data transforms through your application
|
||||
- **Debugging**: When something goes wrong, having the actual input/output makes root cause analysis much faster
|
||||
- **Analytics**: Enables pattern analysis across similar inputs or outputs
|
||||
|
||||
**Phoenix Behavior:**
|
||||
- Input/output displayed prominently in span details
|
||||
- Evaluators can automatically access these values
|
||||
- Search/filter traces by input or output content
|
||||
- Export inputs/outputs for fine-tuning datasets
|
||||
|
||||
## Valid Span Kinds
|
||||
|
||||
There are exactly **9 valid span kinds** in OpenInference:
|
||||
|
||||
| Span Kind | Purpose | Common Use Case |
|
||||
|-----------|---------|-----------------|
|
||||
| `LLM` | Language model inference | OpenAI, Anthropic, local LLM calls |
|
||||
| `EMBEDDING` | Vector generation | Text-to-vector conversion |
|
||||
| `CHAIN` | Application flow orchestration | LangChain chains, custom workflows |
|
||||
| `RETRIEVER` | Document/context retrieval | Vector DB queries, semantic search |
|
||||
| `RERANKER` | Result reordering | Rerank retrieved documents |
|
||||
| `TOOL` | External tool invocation | API calls, function execution |
|
||||
| `AGENT` | Autonomous reasoning | ReAct agents, planning loops |
|
||||
| `GUARDRAIL` | Safety/policy checks | Content moderation, PII detection |
|
||||
| `EVALUATOR` | Quality assessment | Answer relevance, faithfulness scoring |
|
||||
@@ -0,0 +1,72 @@
|
||||
# Universal Attributes
|
||||
|
||||
This document covers attributes that can be used on any span kind in OpenInference.
|
||||
|
||||
## Overview
|
||||
|
||||
These attributes can be used on **any span kind** to provide additional context, tracking, and metadata.
|
||||
|
||||
## Input/Output
|
||||
|
||||
| Attribute | Type | Description |
|
||||
| ------------------ | ------ | ---------------------------------------------------- |
|
||||
| `input.value` | String | Input to the operation (prompt, query, document) |
|
||||
| `input.mime_type` | String | MIME type (e.g., "text/plain", "application/json") |
|
||||
| `output.value` | String | Output from the operation (response, vector, result) |
|
||||
| `output.mime_type` | String | MIME type of output |
|
||||
|
||||
### Why Capture I/O?
|
||||
|
||||
**Always capture input/output for evaluation-ready spans:**
|
||||
- Phoenix evaluators (faithfulness, relevance, Q&A correctness) require `input.value` and `output.value`
|
||||
- Phoenix UI displays I/O prominently in trace views for debugging
|
||||
- Enables exporting I/O for creating fine-tuning datasets
|
||||
- Provides complete context for analyzing agent behavior
|
||||
|
||||
**Example attributes:**
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "CHAIN",
|
||||
"input.value": "What is the weather?",
|
||||
"input.mime_type": "text/plain",
|
||||
"output.value": "I don't have access to weather data.",
|
||||
"output.mime_type": "text/plain"
|
||||
}
|
||||
```
|
||||
|
||||
**See language-specific implementation:**
|
||||
- TypeScript: `instrumentation-manual-typescript.md`
|
||||
- Python: `instrumentation-manual-python.md`
|
||||
|
||||
## Session and User Tracking
|
||||
|
||||
| Attribute | Type | Description |
|
||||
| ------------ | ------ | ---------------------------------------------- |
|
||||
| `session.id` | String | Session identifier for grouping related traces |
|
||||
| `user.id` | String | User identifier for per-user analysis |
|
||||
|
||||
**Example:**
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "LLM",
|
||||
"session.id": "session_abc123",
|
||||
"user.id": "user_xyz789"
|
||||
}
|
||||
```
|
||||
|
||||
## Metadata
|
||||
|
||||
| Attribute | Type | Description |
|
||||
| ---------- | ------ | ------------------------------------------ |
|
||||
| `metadata` | String | JSON-serialized object of key-value pairs |
|
||||
|
||||
**Example:**
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "LLM",
|
||||
"metadata": "{\"environment\": \"production\", \"model_version\": \"v2.1\", \"cost_center\": \"engineering\"}"
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,85 @@
|
||||
# Phoenix Tracing: Auto-Instrumentation (Python)
|
||||
|
||||
**Automatically create spans for LLM calls without code changes.**
|
||||
|
||||
## Overview
|
||||
|
||||
Auto-instrumentation patches supported libraries at runtime to create spans automatically. Use for supported frameworks (LangChain, LlamaIndex, OpenAI SDK, etc.). For custom logic, see manual-instrumentation-python.md.
|
||||
|
||||
## Supported Frameworks
|
||||
|
||||
**Python:**
|
||||
|
||||
- LLM SDKs: OpenAI, Anthropic, Bedrock, Mistral, Vertex AI, Groq, Ollama
|
||||
- Frameworks: LangChain, LlamaIndex, DSPy, CrewAI, Instructor, Haystack
|
||||
- Install: `pip install openinference-instrumentation-{name}`
|
||||
|
||||
## Setup
|
||||
|
||||
**Install and enable:**
|
||||
|
||||
```bash
|
||||
pip install arize-phoenix-otel
|
||||
pip install openinference-instrumentation-openai # Add others as needed
|
||||
```
|
||||
|
||||
```python
|
||||
from phoenix.otel import register
|
||||
|
||||
register(project_name="my-app", auto_instrument=True) # Discovers all installed instrumentors
|
||||
```
|
||||
|
||||
**Example:**
|
||||
|
||||
```python
|
||||
from phoenix.otel import register
|
||||
from openai import OpenAI
|
||||
|
||||
register(project_name="my-app", auto_instrument=True)
|
||||
|
||||
client = OpenAI()
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-4",
|
||||
messages=[{"role": "user", "content": "Hello!"}]
|
||||
)
|
||||
```
|
||||
|
||||
Traces appear in Phoenix UI with model, input/output, tokens, timing automatically captured. See span kind files for full attribute schemas.
|
||||
|
||||
**Selective instrumentation** (explicit control):
|
||||
|
||||
```python
|
||||
from phoenix.otel import register
|
||||
from openinference.instrumentation.openai import OpenAIInstrumentor
|
||||
|
||||
tracer_provider = register(project_name="my-app") # No auto_instrument
|
||||
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
Auto-instrumentation does NOT capture:
|
||||
|
||||
- Custom business logic
|
||||
- Internal function calls
|
||||
|
||||
**Example:**
|
||||
|
||||
```python
|
||||
def my_custom_workflow(query: str) -> str:
|
||||
preprocessed = preprocess(query) # Not traced
|
||||
response = client.chat.completions.create(...) # Traced (auto)
|
||||
postprocessed = postprocess(response) # Not traced
|
||||
return postprocessed
|
||||
```
|
||||
|
||||
**Solution:** Add manual instrumentation:
|
||||
|
||||
```python
|
||||
@tracer.chain
|
||||
def my_custom_workflow(query: str) -> str:
|
||||
preprocessed = preprocess(query)
|
||||
response = client.chat.completions.create(...)
|
||||
postprocessed = postprocess(response)
|
||||
return postprocessed
|
||||
```
|
||||
@@ -0,0 +1,87 @@
|
||||
# Auto-Instrumentation (TypeScript)
|
||||
|
||||
Automatically create spans for LLM calls without code changes.
|
||||
|
||||
## Supported Frameworks
|
||||
|
||||
- **LLM SDKs:** OpenAI
|
||||
- **Frameworks:** LangChain
|
||||
- **Install:** `npm install @arizeai/openinference-instrumentation-{name}`
|
||||
|
||||
## Setup
|
||||
|
||||
**CommonJS (automatic):**
|
||||
|
||||
```javascript
|
||||
const { register } = require("@arizeai/phoenix-otel");
|
||||
const OpenAI = require("openai");
|
||||
|
||||
register({ projectName: "my-app" });
|
||||
|
||||
const client = new OpenAI();
|
||||
```
|
||||
|
||||
**ESM (manual required):**
|
||||
|
||||
```typescript
|
||||
import { register, registerInstrumentations } from "@arizeai/phoenix-otel";
|
||||
import { OpenAIInstrumentation } from "@arizeai/openinference-instrumentation-openai";
|
||||
import OpenAI from "openai";
|
||||
|
||||
register({ projectName: "my-app" });
|
||||
|
||||
const instrumentation = new OpenAIInstrumentation();
|
||||
instrumentation.manuallyInstrument(OpenAI);
|
||||
registerInstrumentations({ instrumentations: [instrumentation] });
|
||||
```
|
||||
|
||||
**Why:** ESM imports are hoisted before `register()` runs.
|
||||
|
||||
## Limitations
|
||||
|
||||
**What auto-instrumentation does NOT capture:**
|
||||
|
||||
```typescript
|
||||
async function myWorkflow(query: string): Promise<string> {
|
||||
const preprocessed = await preprocess(query); // Not traced
|
||||
const response = await client.chat.completions.create(...); // Traced (auto)
|
||||
const postprocessed = await postprocess(response); // Not traced
|
||||
return postprocessed;
|
||||
}
|
||||
```
|
||||
|
||||
**Solution:** Add manual instrumentation for custom logic:
|
||||
|
||||
```typescript
|
||||
import { traceChain } from "@arizeai/openinference-core";
|
||||
|
||||
const myWorkflow = traceChain(
|
||||
async (query: string): Promise<string> => {
|
||||
const preprocessed = await preprocess(query);
|
||||
const response = await client.chat.completions.create(...);
|
||||
const postprocessed = await postprocess(response);
|
||||
return postprocessed;
|
||||
},
|
||||
{ name: "my-workflow" }
|
||||
);
|
||||
```
|
||||
|
||||
## Combining Auto + Manual
|
||||
|
||||
```typescript
|
||||
import { register } from "@arizeai/phoenix-otel";
|
||||
import { traceChain } from "@arizeai/openinference-core";
|
||||
|
||||
register({ projectName: "my-app" });
|
||||
|
||||
const client = new OpenAI();
|
||||
|
||||
const workflow = traceChain(
|
||||
async (query: string) => {
|
||||
const preprocessed = await preprocess(query);
|
||||
const response = await client.chat.completions.create(...); // Auto-instrumented
|
||||
return postprocess(response);
|
||||
},
|
||||
{ name: "my-workflow" }
|
||||
);
|
||||
```
|
||||
@@ -0,0 +1,182 @@
|
||||
# Manual Instrumentation (Python)
|
||||
|
||||
Add custom spans using decorators or context managers for fine-grained tracing control.
|
||||
|
||||
## Setup
|
||||
|
||||
```bash
|
||||
pip install arize-phoenix-otel
|
||||
```
|
||||
|
||||
```python
|
||||
from phoenix.otel import register
|
||||
tracer_provider = register(project_name="my-app")
|
||||
tracer = tracer_provider.get_tracer(__name__)
|
||||
```
|
||||
|
||||
## Quick Reference
|
||||
|
||||
| Span Kind | Decorator | Use Case |
|
||||
|-----------|-----------|----------|
|
||||
| CHAIN | `@tracer.chain` | Orchestration, workflows, pipelines |
|
||||
| RETRIEVER | `@tracer.retriever` | Vector search, document retrieval |
|
||||
| TOOL | `@tracer.tool` | External API calls, function execution |
|
||||
| AGENT | `@tracer.agent` | Multi-step reasoning, planning |
|
||||
| LLM | `@tracer.llm` | LLM API calls (manual only) |
|
||||
| EMBEDDING | `@tracer.embedding` | Embedding generation |
|
||||
| RERANKER | `@tracer.reranker` | Document re-ranking |
|
||||
| GUARDRAIL | `@tracer.guardrail` | Safety checks, content moderation |
|
||||
| EVALUATOR | `@tracer.evaluator` | LLM evaluation, quality checks |
|
||||
|
||||
## Decorator Approach (Recommended)
|
||||
|
||||
**Use for:** Full function instrumentation, automatic I/O capture
|
||||
|
||||
```python
|
||||
@tracer.chain
|
||||
def rag_pipeline(query: str) -> str:
|
||||
docs = retrieve_documents(query)
|
||||
ranked = rerank(docs, query)
|
||||
return generate_response(ranked, query)
|
||||
|
||||
@tracer.retriever
|
||||
def retrieve_documents(query: str) -> list[dict]:
|
||||
results = vector_db.search(query, top_k=5)
|
||||
return [{"content": doc.text, "score": doc.score} for doc in results]
|
||||
|
||||
@tracer.tool
|
||||
def get_weather(city: str) -> str:
|
||||
response = requests.get(f"https://api.weather.com/{city}")
|
||||
return response.json()["weather"]
|
||||
```
|
||||
|
||||
**Custom span names:**
|
||||
|
||||
```python
|
||||
@tracer.chain(name="rag-pipeline-v2")
|
||||
def my_workflow(query: str) -> str:
|
||||
return process(query)
|
||||
```
|
||||
|
||||
## Context Manager Approach
|
||||
|
||||
**Use for:** Partial function instrumentation, custom attributes, dynamic control
|
||||
|
||||
```python
|
||||
from opentelemetry.trace import Status, StatusCode
|
||||
import json
|
||||
|
||||
def retrieve_with_metadata(query: str):
|
||||
with tracer.start_as_current_span(
|
||||
"vector_search",
|
||||
openinference_span_kind="retriever"
|
||||
) as span:
|
||||
span.set_attribute("input.value", query)
|
||||
|
||||
results = vector_db.search(query, top_k=5)
|
||||
|
||||
documents = [
|
||||
{
|
||||
"document.id": doc.id,
|
||||
"document.content": doc.text,
|
||||
"document.score": doc.score
|
||||
}
|
||||
for doc in results
|
||||
]
|
||||
span.set_attribute("retrieval.documents", json.dumps(documents))
|
||||
span.set_status(Status(StatusCode.OK))
|
||||
|
||||
return documents
|
||||
```
|
||||
|
||||
## Capturing Input/Output
|
||||
|
||||
**Always capture I/O for evaluation-ready spans.**
|
||||
|
||||
### Automatic I/O Capture (Decorators)
|
||||
|
||||
Decorators automatically capture input arguments and return values:
|
||||
|
||||
```python
|
||||
@tracer.chain
|
||||
def handle_query(user_input: str) -> str:
|
||||
result = agent.generate(user_input)
|
||||
return result.text
|
||||
|
||||
# Automatically captures:
|
||||
# - input.value: user_input
|
||||
# - output.value: result.text
|
||||
# - input.mime_type / output.mime_type: auto-detected
|
||||
```
|
||||
|
||||
### Manual I/O Capture (Context Manager)
|
||||
|
||||
Use `set_input()` and `set_output()` for simple I/O capture:
|
||||
|
||||
```python
|
||||
from opentelemetry.trace import Status, StatusCode
|
||||
|
||||
def handle_query(user_input: str) -> str:
|
||||
with tracer.start_as_current_span(
|
||||
"query.handler",
|
||||
openinference_span_kind="chain"
|
||||
) as span:
|
||||
span.set_input(user_input)
|
||||
|
||||
result = agent.generate(user_input)
|
||||
|
||||
span.set_output(result.text)
|
||||
span.set_status(Status(StatusCode.OK))
|
||||
|
||||
return result.text
|
||||
```
|
||||
|
||||
**What gets captured:**
|
||||
|
||||
```json
|
||||
{
|
||||
"input.value": "What is 2+2?",
|
||||
"input.mime_type": "text/plain",
|
||||
"output.value": "2+2 equals 4.",
|
||||
"output.mime_type": "text/plain"
|
||||
}
|
||||
```
|
||||
|
||||
**Why this matters:**
|
||||
- Phoenix evaluators require `input.value` and `output.value`
|
||||
- Phoenix UI displays I/O prominently for debugging
|
||||
- Enables exporting data for fine-tuning datasets
|
||||
|
||||
### Custom I/O with Additional Metadata
|
||||
|
||||
Use `set_attribute()` for custom attributes alongside I/O:
|
||||
|
||||
```python
|
||||
def process_query(query: str):
|
||||
with tracer.start_as_current_span(
|
||||
"query.process",
|
||||
openinference_span_kind="chain"
|
||||
) as span:
|
||||
# Standard I/O
|
||||
span.set_input(query)
|
||||
|
||||
# Custom metadata
|
||||
span.set_attribute("input.length", len(query))
|
||||
|
||||
result = llm.generate(query)
|
||||
|
||||
# Standard output
|
||||
span.set_output(result.text)
|
||||
|
||||
# Custom metadata
|
||||
span.set_attribute("output.tokens", result.usage.total_tokens)
|
||||
span.set_status(Status(StatusCode.OK))
|
||||
|
||||
return result
|
||||
```
|
||||
|
||||
## See Also
|
||||
|
||||
- **Span attributes:** `span-chain.md`, `span-retriever.md`, `span-tool.md`, `span-llm.md`, `span-agent.md`, `span-embedding.md`, `span-reranker.md`, `span-guardrail.md`, `span-evaluator.md`
|
||||
- **Auto-instrumentation:** `instrumentation-auto-python.md` for framework integrations
|
||||
- **API docs:** https://docs.arize.com/phoenix/tracing/manual-instrumentation
|
||||
@@ -0,0 +1,172 @@
|
||||
# Manual Instrumentation (TypeScript)
|
||||
|
||||
Add custom spans using convenience wrappers or withSpan for fine-grained tracing control.
|
||||
|
||||
## Setup
|
||||
|
||||
```bash
|
||||
npm install @arizeai/phoenix-otel @arizeai/openinference-core
|
||||
```
|
||||
|
||||
```typescript
|
||||
import { register } from "@arizeai/phoenix-otel";
|
||||
register({ projectName: "my-app" });
|
||||
```
|
||||
|
||||
## Quick Reference
|
||||
|
||||
| Span Kind | Method | Use Case |
|
||||
|-----------|--------|----------|
|
||||
| CHAIN | `traceChain` | Workflows, pipelines, orchestration |
|
||||
| AGENT | `traceAgent` | Multi-step reasoning, planning |
|
||||
| TOOL | `traceTool` | External APIs, function calls |
|
||||
| RETRIEVER | `withSpan` | Vector search, document retrieval |
|
||||
| LLM | `withSpan` | LLM API calls (prefer auto-instrumentation) |
|
||||
| EMBEDDING | `withSpan` | Embedding generation |
|
||||
| RERANKER | `withSpan` | Document re-ranking |
|
||||
| GUARDRAIL | `withSpan` | Safety checks, content moderation |
|
||||
| EVALUATOR | `withSpan` | LLM evaluation |
|
||||
|
||||
## Convenience Wrappers
|
||||
|
||||
```typescript
|
||||
import { traceChain, traceAgent, traceTool } from "@arizeai/openinference-core";
|
||||
|
||||
// CHAIN - workflows
|
||||
const pipeline = traceChain(
|
||||
async (query: string) => {
|
||||
const docs = await retrieve(query);
|
||||
return await generate(docs, query);
|
||||
},
|
||||
{ name: "rag-pipeline" }
|
||||
);
|
||||
|
||||
// AGENT - reasoning
|
||||
const agent = traceAgent(
|
||||
async (question: string) => {
|
||||
const thought = await llm.generate(`Think: ${question}`);
|
||||
return await processThought(thought);
|
||||
},
|
||||
{ name: "my-agent" }
|
||||
);
|
||||
|
||||
// TOOL - function calls
|
||||
const getWeather = traceTool(
|
||||
async (city: string) => fetch(`/api/weather/${city}`).then(r => r.json()),
|
||||
{ name: "get-weather" }
|
||||
);
|
||||
```
|
||||
|
||||
## withSpan for Other Kinds
|
||||
|
||||
```typescript
|
||||
import { withSpan, getInputAttributes, getRetrieverAttributes } from "@arizeai/openinference-core";
|
||||
|
||||
// RETRIEVER with custom attributes
|
||||
const retrieve = withSpan(
|
||||
async (query: string) => {
|
||||
const results = await vectorDb.search(query, { topK: 5 });
|
||||
return results.map(doc => ({ content: doc.text, score: doc.score }));
|
||||
},
|
||||
{
|
||||
kind: "RETRIEVER",
|
||||
name: "vector-search",
|
||||
processInput: (query) => getInputAttributes(query),
|
||||
processOutput: (docs) => getRetrieverAttributes({ documents: docs })
|
||||
}
|
||||
);
|
||||
```
|
||||
|
||||
**Options:**
|
||||
|
||||
```typescript
|
||||
withSpan(fn, {
|
||||
kind: "RETRIEVER", // OpenInference span kind
|
||||
name: "span-name", // Span name (defaults to function name)
|
||||
processInput: (args) => {}, // Transform input to attributes
|
||||
processOutput: (result) => {}, // Transform output to attributes
|
||||
attributes: { key: "value" } // Static attributes
|
||||
});
|
||||
```
|
||||
|
||||
## Capturing Input/Output
|
||||
|
||||
**Always capture I/O for evaluation-ready spans.** Use `getInputAttributes` and `getOutputAttributes` helpers for automatic MIME type detection:
|
||||
|
||||
```typescript
|
||||
import {
|
||||
getInputAttributes,
|
||||
getOutputAttributes,
|
||||
withSpan,
|
||||
} from "@arizeai/openinference-core";
|
||||
|
||||
const handleQuery = withSpan(
|
||||
async (userInput: string) => {
|
||||
const result = await agent.generate({ prompt: userInput });
|
||||
return result;
|
||||
},
|
||||
{
|
||||
name: "query.handler",
|
||||
kind: "CHAIN",
|
||||
// Use helpers - automatic MIME type detection
|
||||
processInput: (input) => getInputAttributes(input),
|
||||
processOutput: (result) => getOutputAttributes(result.text),
|
||||
}
|
||||
);
|
||||
|
||||
await handleQuery("What is 2+2?");
|
||||
```
|
||||
|
||||
**What gets captured:**
|
||||
|
||||
```json
|
||||
{
|
||||
"input.value": "What is 2+2?",
|
||||
"input.mime_type": "text/plain",
|
||||
"output.value": "2+2 equals 4.",
|
||||
"output.mime_type": "text/plain"
|
||||
}
|
||||
```
|
||||
|
||||
**Helper behavior:**
|
||||
- Strings → `text/plain`
|
||||
- Objects/Arrays → `application/json` (automatically serialized)
|
||||
- `undefined`/`null` → No attributes set
|
||||
|
||||
**Why this matters:**
|
||||
- Phoenix evaluators require `input.value` and `output.value`
|
||||
- Phoenix UI displays I/O prominently for debugging
|
||||
- Enables exporting data for fine-tuning datasets
|
||||
|
||||
### Custom I/O Processing
|
||||
|
||||
Add custom metadata alongside standard I/O attributes:
|
||||
|
||||
```typescript
|
||||
const processWithMetadata = withSpan(
|
||||
async (query: string) => {
|
||||
const result = await llm.generate(query);
|
||||
return result;
|
||||
},
|
||||
{
|
||||
name: "query.process",
|
||||
kind: "CHAIN",
|
||||
processInput: (query) => ({
|
||||
"input.value": query,
|
||||
"input.mime_type": "text/plain",
|
||||
"input.length": query.length, // Custom attribute
|
||||
}),
|
||||
processOutput: (result) => ({
|
||||
"output.value": result.text,
|
||||
"output.mime_type": "text/plain",
|
||||
"output.tokens": result.usage?.totalTokens, // Custom attribute
|
||||
}),
|
||||
}
|
||||
);
|
||||
```
|
||||
|
||||
## See Also
|
||||
|
||||
- **Span attributes:** `span-chain.md`, `span-retriever.md`, `span-tool.md`, etc.
|
||||
- **Attribute helpers:** https://docs.arize.com/phoenix/tracing/manual-instrumentation-typescript#attribute-helpers
|
||||
- **Auto-instrumentation:** `instrumentation-auto-typescript.md` for framework integrations
|
||||
@@ -0,0 +1,87 @@
|
||||
# Phoenix Tracing: Custom Metadata (Python)
|
||||
|
||||
Add custom attributes to spans for richer observability.
|
||||
|
||||
## Install
|
||||
|
||||
```bash
|
||||
pip install openinference-instrumentation
|
||||
```
|
||||
|
||||
## Session
|
||||
|
||||
```python
|
||||
from openinference.instrumentation import using_session
|
||||
|
||||
with using_session(session_id="my-session-id"):
|
||||
# Spans get: "session.id" = "my-session-id"
|
||||
...
|
||||
```
|
||||
|
||||
## User
|
||||
|
||||
```python
|
||||
from openinference.instrumentation import using_user
|
||||
|
||||
with using_user("my-user-id"):
|
||||
# Spans get: "user.id" = "my-user-id"
|
||||
...
|
||||
```
|
||||
|
||||
## Metadata
|
||||
|
||||
```python
|
||||
from openinference.instrumentation import using_metadata
|
||||
|
||||
with using_metadata({"key": "value", "experiment_id": "exp_123"}):
|
||||
# Spans get: "metadata" = '{"key": "value", "experiment_id": "exp_123"}'
|
||||
...
|
||||
```
|
||||
|
||||
## Tags
|
||||
|
||||
```python
|
||||
from openinference.instrumentation import using_tags
|
||||
|
||||
with using_tags(["tag_1", "tag_2"]):
|
||||
# Spans get: "tag.tags" = '["tag_1", "tag_2"]'
|
||||
...
|
||||
```
|
||||
|
||||
## Combined (using_attributes)
|
||||
|
||||
```python
|
||||
from openinference.instrumentation import using_attributes
|
||||
|
||||
with using_attributes(
|
||||
session_id="my-session-id",
|
||||
user_id="my-user-id",
|
||||
metadata={"environment": "production"},
|
||||
tags=["prod", "v2"],
|
||||
prompt_template="Answer: {question}",
|
||||
prompt_template_version="v1.0",
|
||||
prompt_template_variables={"question": "What is Phoenix?"},
|
||||
):
|
||||
# All attributes applied to spans in this context
|
||||
...
|
||||
```
|
||||
|
||||
## On a Single Span
|
||||
|
||||
```python
import json

span.set_attribute("metadata", json.dumps({"key": "value"}))
|
||||
span.set_attribute("user.id", "user_123")
|
||||
span.set_attribute("session.id", "session_456")
|
||||
```
|
||||
|
||||
## As Decorators
|
||||
|
||||
All context managers can be used as decorators:
|
||||
|
||||
```python
|
||||
@using_session(session_id="my-session-id")
|
||||
@using_user("my-user-id")
|
||||
@using_metadata({"env": "prod"})
|
||||
def my_function():
|
||||
...
|
||||
```
|
||||
@@ -0,0 +1,50 @@
|
||||
# Phoenix Tracing: Custom Metadata (TypeScript)
|
||||
|
||||
Add custom attributes to spans for richer observability.
|
||||
|
||||
## Using Context (Propagates to All Child Spans)
|
||||
|
||||
```typescript
|
||||
import { context } from "@opentelemetry/api";
|
||||
import { setMetadata } from "@arizeai/openinference-core";
|
||||
|
||||
context.with(
|
||||
setMetadata(context.active(), {
|
||||
experiment_id: "exp_123",
|
||||
model_version: "gpt-4-1106-preview",
|
||||
environment: "production",
|
||||
}),
|
||||
async () => {
|
||||
// All spans created within this block will have:
|
||||
// "metadata" = '{"experiment_id": "exp_123", ...}'
|
||||
await myApp.run(query);
|
||||
}
|
||||
);
|
||||
```
|
||||
|
||||
## On a Single Span
|
||||
|
||||
```typescript
|
||||
import { traceChain } from "@arizeai/openinference-core";
|
||||
import { trace } from "@arizeai/phoenix-otel";
|
||||
|
||||
const myFunction = traceChain(
|
||||
  async (input: string) => {
    const span = trace.getActiveSpan();

    span?.setAttribute(
      "metadata",
      JSON.stringify({
        experiment_id: "exp_123",
        model_version: "gpt-4-1106-preview",
        environment: "production",
      })
    );

    const result = await doWork(input); // your application logic
    return result;
  },
|
||||
{ name: "my-function" }
|
||||
);
|
||||
|
||||
await myFunction("hello");
|
||||
```
|
||||
@@ -0,0 +1,58 @@
|
||||
# Phoenix Tracing: Production Guide (Python)
|
||||
|
||||
**CRITICAL: Configure batching, data masking, and span filtering for production deployment.**
|
||||
|
||||
## Metadata
|
||||
|
||||
| Attribute | Value |
|
||||
|-----------|-------|
|
||||
| Priority | Critical - production readiness |
|
||||
| Impact | Security, Performance |
|
||||
| Setup Time | 5-15 min |
|
||||
|
||||
## Batch Processing
|
||||
|
||||
**Enable batch processing for production efficiency.** Batching reduces network overhead by sending spans in groups rather than individually.
|
||||
|
||||
## Data Masking (PII Protection)
|
||||
|
||||
**Environment variables:**
|
||||
|
||||
```bash
|
||||
export OPENINFERENCE_HIDE_INPUTS=true # Hide input.value
|
||||
export OPENINFERENCE_HIDE_OUTPUTS=true # Hide output.value
|
||||
export OPENINFERENCE_HIDE_INPUT_MESSAGES=true # Hide LLM input messages
|
||||
export OPENINFERENCE_HIDE_OUTPUT_MESSAGES=true # Hide LLM output messages
|
||||
export OPENINFERENCE_HIDE_INPUT_IMAGES=true # Hide image content
|
||||
export OPENINFERENCE_HIDE_INPUT_TEXT=true # Hide embedding text
|
||||
export OPENINFERENCE_BASE64_IMAGE_MAX_LENGTH=10000 # Limit image size
|
||||
```
|
||||
|
||||
**Python TraceConfig:**
|
||||
|
||||
```python
|
||||
from phoenix.otel import register
|
||||
from openinference.instrumentation import TraceConfig
|
||||
|
||||
config = TraceConfig(
|
||||
hide_inputs=True,
|
||||
hide_outputs=True,
|
||||
hide_input_messages=True
|
||||
)
|
||||
register(trace_config=config)
|
||||
```
|
||||
|
||||
**Precedence:** Code > Environment variables > Defaults
|
||||
|
||||
---
|
||||
|
||||
## Span Filtering
|
||||
|
||||
**Suppress specific code blocks:**
|
||||
|
||||
```python
|
||||
from phoenix.otel import suppress_tracing
|
||||
|
||||
with suppress_tracing():
|
||||
internal_logging() # No spans generated
|
||||
```
|
||||
@@ -0,0 +1,148 @@
|
||||
# Phoenix Tracing: Production Guide (TypeScript)
|
||||
|
||||
**CRITICAL: Configure batching, data masking, and span filtering for production deployment.**
|
||||
|
||||
## Metadata
|
||||
|
||||
| Attribute | Value |
|
||||
|-----------|-------|
|
||||
| Priority | Critical - production readiness |
|
||||
| Impact | Security, Performance |
|
||||
| Setup Time | 5-15 min |
|
||||
|
||||
## Batch Processing
|
||||
|
||||
**Enable batch processing for production efficiency.** Batching reduces network overhead by sending spans in groups rather than individually.
|
||||
|
||||
```typescript
|
||||
import { register } from "@arizeai/phoenix-otel";
|
||||
|
||||
const provider = register({
|
||||
projectName: "my-app",
|
||||
batch: true, // Production default
|
||||
});
|
||||
```
|
||||
|
||||
### Shutdown Handling
|
||||
|
||||
**CRITICAL:** Spans may not be exported if still queued in the processor when your process exits. Call `provider.shutdown()` to explicitly flush before exit.
|
||||
|
||||
```typescript
|
||||
// Explicit shutdown to flush queued spans
|
||||
const provider = register({
|
||||
projectName: "my-app",
|
||||
batch: true,
|
||||
});
|
||||
|
||||
async function main() {
|
||||
await doWork();
|
||||
await provider.shutdown(); // Flush spans before exit
|
||||
}
|
||||
|
||||
main().catch(async (error) => {
|
||||
console.error(error);
|
||||
await provider.shutdown(); // Flush on error too
|
||||
process.exit(1);
|
||||
});
|
||||
```
|
||||
|
||||
**Graceful termination signals:**
|
||||
|
||||
```typescript
|
||||
// Graceful shutdown on SIGTERM
|
||||
const provider = register({
|
||||
projectName: "my-server",
|
||||
batch: true,
|
||||
});
|
||||
|
||||
process.on("SIGTERM", async () => {
|
||||
await provider.shutdown();
|
||||
process.exit(0);
|
||||
});
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Data Masking (PII Protection)
|
||||
|
||||
**Environment variables:**
|
||||
|
||||
```bash
|
||||
export OPENINFERENCE_HIDE_INPUTS=true # Hide input.value
|
||||
export OPENINFERENCE_HIDE_OUTPUTS=true # Hide output.value
|
||||
export OPENINFERENCE_HIDE_INPUT_MESSAGES=true # Hide LLM input messages
|
||||
export OPENINFERENCE_HIDE_OUTPUT_MESSAGES=true # Hide LLM output messages
|
||||
export OPENINFERENCE_HIDE_INPUT_IMAGES=true # Hide image content
|
||||
export OPENINFERENCE_HIDE_INPUT_TEXT=true # Hide embedding text
|
||||
export OPENINFERENCE_BASE64_IMAGE_MAX_LENGTH=10000 # Limit image size
|
||||
```
|
||||
|
||||
**TypeScript TraceConfig:**
|
||||
|
||||
```typescript
|
||||
import { register } from "@arizeai/phoenix-otel";
|
||||
import { OpenAIInstrumentation } from "@arizeai/openinference-instrumentation-openai";
|
||||
|
||||
const traceConfig = {
|
||||
hideInputs: true,
|
||||
hideOutputs: true,
|
||||
hideInputMessages: true
|
||||
};
|
||||
|
||||
const instrumentation = new OpenAIInstrumentation({ traceConfig });
|
||||
```
|
||||
|
||||
**Precedence:** Code > Environment variables > Defaults
|
||||
|
||||
---
|
||||
|
||||
## Span Filtering
|
||||
|
||||
**Suppress specific code blocks:**
|
||||
|
||||
```typescript
|
||||
import { suppressTracing } from "@opentelemetry/core";
|
||||
import { context } from "@opentelemetry/api";
|
||||
|
||||
await context.with(suppressTracing(context.active()), async () => {
|
||||
internalLogging(); // No spans generated
|
||||
});
|
||||
```
|
||||
|
||||
**Sampling:**
|
||||
|
||||
```bash
|
||||
export OTEL_TRACES_SAMPLER="parentbased_traceidratio"
|
||||
export OTEL_TRACES_SAMPLER_ARG="0.1" # Sample 10%
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Error Handling
|
||||
|
||||
```typescript
|
||||
import { SpanStatusCode } from "@opentelemetry/api";
|
||||
|
||||
try {
|
||||
result = await riskyOperation();
|
||||
span?.setStatus({ code: SpanStatusCode.OK });
|
||||
} catch (e) {
|
||||
span?.recordException(e);
|
||||
span?.setStatus({ code: SpanStatusCode.ERROR });
|
||||
throw e;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Production Checklist
|
||||
|
||||
- [ ] Batch processing enabled
|
||||
- [ ] **Shutdown handling:** Call `provider.shutdown()` before exit to flush queued spans
|
||||
- [ ] **Graceful termination:** Flush spans on SIGTERM/SIGINT signals
|
||||
- [ ] Data masking configured (`HIDE_INPUTS`/`HIDE_OUTPUTS` if PII)
|
||||
- [ ] Span filtering for health checks/noisy paths
|
||||
- [ ] Error handling implemented
|
||||
- [ ] Graceful degradation if Phoenix unavailable
|
||||
- [ ] Performance tested
|
||||
- [ ] Monitoring configured (Phoenix UI checked)
|
||||
@@ -0,0 +1,73 @@
|
||||
# Phoenix Tracing: Projects (Python)
|
||||
|
||||
**Organize traces by application using projects (Phoenix's top-level grouping).**
|
||||
|
||||
## Overview
|
||||
|
||||
Projects group traces for a single application or experiment.
|
||||
|
||||
**Use for:** Environments (dev/staging/prod), A/B testing, versioning
|
||||
|
||||
## Setup
|
||||
|
||||
### Environment Variable (Recommended)
|
||||
|
||||
```bash
|
||||
export PHOENIX_PROJECT_NAME="my-app-prod"
|
||||
```
|
||||
|
||||
```python
|
||||
import os
|
||||
os.environ["PHOENIX_PROJECT_NAME"] = "my-app-prod"
|
||||
from phoenix.otel import register
|
||||
register() # Uses "my-app-prod"
|
||||
```
|
||||
|
||||
### Code
|
||||
|
||||
```python
|
||||
from phoenix.otel import register
|
||||
register(project_name="my-app-prod")
|
||||
```
|
||||
|
||||
## Use Cases
|
||||
|
||||
**Environments:**
|
||||
|
||||
```python
|
||||
# Dev, staging, prod
|
||||
register(project_name="my-app-dev")
|
||||
register(project_name="my-app-staging")
|
||||
register(project_name="my-app-prod")
|
||||
```
|
||||
|
||||
**A/B Testing:**
|
||||
|
||||
```python
|
||||
# Compare models
|
||||
register(project_name="chatbot-gpt4")
|
||||
register(project_name="chatbot-claude")
|
||||
```
|
||||
|
||||
**Versioning:**
|
||||
|
||||
```python
|
||||
# Track versions
|
||||
register(project_name="my-app-v1")
|
||||
register(project_name="my-app-v2")
|
||||
```
|
||||
|
||||
## Switching Projects (Python Notebooks Only)
|
||||
|
||||
```python
|
||||
from openinference.instrumentation import dangerously_using_project
|
||||
from phoenix.otel import register
|
||||
|
||||
register(project_name="my-app")
|
||||
|
||||
# Switch temporarily for evals
|
||||
with dangerously_using_project("my-eval-project"):
|
||||
run_evaluations()
|
||||
```
|
||||
|
||||
**⚠️ Only use in notebooks/scripts, not production.**
|
||||
@@ -0,0 +1,54 @@
|
||||
# Phoenix Tracing: Projects (TypeScript)
|
||||
|
||||
**Organize traces by application using projects (Phoenix's top-level grouping).**
|
||||
|
||||
## Overview
|
||||
|
||||
Projects group traces for a single application or experiment.
|
||||
|
||||
**Use for:** Environments (dev/staging/prod), A/B testing, versioning
|
||||
|
||||
## Setup
|
||||
|
||||
### Environment Variable (Recommended)
|
||||
|
||||
```bash
|
||||
export PHOENIX_PROJECT_NAME="my-app-prod"
|
||||
```
|
||||
|
||||
```typescript
|
||||
process.env.PHOENIX_PROJECT_NAME = "my-app-prod";
|
||||
import { register } from "@arizeai/phoenix-otel";
|
||||
register(); // Uses "my-app-prod"
|
||||
```
|
||||
|
||||
### Code
|
||||
|
||||
```typescript
|
||||
import { register } from "@arizeai/phoenix-otel";
|
||||
register({ projectName: "my-app-prod" });
|
||||
```
|
||||
|
||||
## Use Cases
|
||||
|
||||
**Environments:**
|
||||
```typescript
|
||||
// Dev, staging, prod
|
||||
register({ projectName: "my-app-dev" });
|
||||
register({ projectName: "my-app-staging" });
|
||||
register({ projectName: "my-app-prod" });
|
||||
```
|
||||
|
||||
**A/B Testing:**
|
||||
```typescript
|
||||
// Compare models
|
||||
register({ projectName: "chatbot-gpt4" });
|
||||
register({ projectName: "chatbot-claude" });
|
||||
```
|
||||
|
||||
**Versioning:**
|
||||
```typescript
|
||||
// Track versions
|
||||
register({ projectName: "my-app-v1" });
|
||||
register({ projectName: "my-app-v2" });
|
||||
```
|
||||
@@ -0,0 +1,104 @@
|
||||
# Sessions (Python)
|
||||
|
||||
Track multi-turn conversations by grouping traces with session IDs.
|
||||
|
||||
## Setup
|
||||
|
||||
```python
|
||||
from openinference.instrumentation import using_session
|
||||
|
||||
with using_session(session_id="user_123_conv_456"):
|
||||
response = llm.invoke(prompt)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
**Bad: Only parent span gets session ID**
|
||||
|
||||
```python
|
||||
from openinference.semconv.trace import SpanAttributes
|
||||
from opentelemetry import trace
|
||||
|
||||
span = trace.get_current_span()
|
||||
span.set_attribute(SpanAttributes.SESSION_ID, session_id)
|
||||
response = client.chat.completions.create(...)
|
||||
```
|
||||
|
||||
**Good: All child spans inherit session ID**
|
||||
|
||||
```python
|
||||
with using_session(session_id):
|
||||
response = client.chat.completions.create(...)
|
||||
result = my_custom_function()
|
||||
```
|
||||
|
||||
**Why:** `using_session()` propagates session ID to all nested spans automatically.
|
||||
|
||||
## Session ID Patterns
|
||||
|
||||
```python
|
||||
import uuid
|
||||
|
||||
session_id = str(uuid.uuid4())
|
||||
session_id = f"user_{user_id}_conv_{conversation_id}"
|
||||
session_id = f"debug_{timestamp}"
|
||||
```
|
||||
|
||||
Good: `str(uuid.uuid4())`, `"user_123_conv_456"`
|
||||
Bad: `"session_1"`, `"test"`, empty string
|
||||
|
||||
## Multi-Turn Chatbot Example
|
||||
|
||||
```python
|
||||
import uuid
|
||||
from openinference.instrumentation import using_session
|
||||
|
||||
session_id = str(uuid.uuid4())
|
||||
messages = []
|
||||
|
||||
def send_message(user_input: str) -> str:
|
||||
messages.append({"role": "user", "content": user_input})
|
||||
|
||||
with using_session(session_id):
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-4",
|
||||
messages=messages
|
||||
)
|
||||
|
||||
assistant_message = response.choices[0].message.content
|
||||
messages.append({"role": "assistant", "content": assistant_message})
|
||||
return assistant_message
|
||||
```
|
||||
|
||||
## Additional Attributes
|
||||
|
||||
```python
|
||||
from openinference.instrumentation import using_attributes
|
||||
|
||||
with using_attributes(
|
||||
user_id="user_123",
|
||||
session_id="conv_456",
|
||||
metadata={"tier": "premium", "region": "us-west"}
|
||||
):
|
||||
response = llm.invoke(prompt)
|
||||
```
|
||||
|
||||
## LangChain Integration
|
||||
|
||||
LangChain threads are automatically recognized as sessions:
|
||||
|
||||
```python
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

llm = ChatOpenAI()
response = llm.invoke(
    [HumanMessage(content="Hi!")],
    config={"metadata": {"thread_id": "user_123_thread"}}
)
|
||||
```
|
||||
|
||||
Phoenix recognizes: `thread_id`, `session_id`, `conversation_id`
|
||||
|
||||
## See Also
|
||||
|
||||
- **TypeScript sessions:** `sessions-typescript.md`
|
||||
- **Session docs:** https://docs.arize.com/phoenix/tracing/sessions
|
||||
@@ -0,0 +1,199 @@
|
||||
# Sessions (TypeScript)
|
||||
|
||||
Track multi-turn conversations by grouping traces with session IDs. **Use `withSpan` directly from `@arizeai/openinference-core`** - no wrappers or custom utilities needed.
|
||||
|
||||
## Core Concept
|
||||
|
||||
**Session Pattern:**
|
||||
1. Generate a unique `session.id` once at application startup
|
||||
2. Export SESSION_ID, import `withSpan` where needed
|
||||
3. Use `withSpan` to create a parent CHAIN span with `session.id` for each interaction
|
||||
4. All child spans (LLM, TOOL, AGENT, etc.) automatically group under the parent
|
||||
5. Query traces by `session.id` in Phoenix to see all interactions
|
||||
|
||||
## Implementation (Best Practice)
|
||||
|
||||
### 1. Setup (instrumentation.ts)
|
||||
|
||||
```typescript
|
||||
import { register } from "@arizeai/phoenix-otel";
|
||||
import { randomUUID } from "node:crypto";
|
||||
|
||||
// Initialize Phoenix
|
||||
register({
|
||||
projectName: "your-app",
|
||||
url: process.env.PHOENIX_COLLECTOR_ENDPOINT || "http://localhost:6006",
|
||||
apiKey: process.env.PHOENIX_API_KEY,
|
||||
batch: true,
|
||||
});
|
||||
|
||||
// Generate and export session ID
|
||||
export const SESSION_ID = randomUUID();
|
||||
```
|
||||
|
||||
### 2. Usage (app code)
|
||||
|
||||
```typescript
|
||||
import { withSpan } from "@arizeai/openinference-core";
|
||||
import { SESSION_ID } from "./instrumentation";
|
||||
|
||||
// Use withSpan directly - no wrapper needed
|
||||
const handleInteraction = withSpan(
|
||||
async () => {
|
||||
const result = await agent.generate({ prompt: userInput });
|
||||
return result;
|
||||
},
|
||||
{
|
||||
name: "cli.interaction",
|
||||
kind: "CHAIN",
|
||||
attributes: { "session.id": SESSION_ID },
|
||||
}
|
||||
);
|
||||
|
||||
// Call it
|
||||
const result = await handleInteraction();
|
||||
```
|
||||
|
||||
### With Input Parameters
|
||||
|
||||
```typescript
|
||||
const processQuery = withSpan(
|
||||
async (query: string) => {
|
||||
return await agent.generate({ prompt: query });
|
||||
},
|
||||
{
|
||||
name: "process.query",
|
||||
kind: "CHAIN",
|
||||
attributes: { "session.id": SESSION_ID },
|
||||
}
|
||||
);
|
||||
|
||||
await processQuery("What is 2+2?");
|
||||
```
|
||||
|
||||
## Key Points
|
||||
|
||||
### Session ID Scope
|
||||
- **CLI/Desktop Apps**: Generate once at process startup
|
||||
- **Web Servers**: Generate per-user session (e.g., on login, store in session storage)
|
||||
- **Stateless APIs**: Accept session.id as a parameter from client
|
||||
|
||||
### Span Hierarchy
|
||||
```
|
||||
cli.interaction (CHAIN) ← session.id here
|
||||
├── ai.generateText (AGENT)
|
||||
│ ├── ai.generateText.doGenerate (LLM)
|
||||
│ └── ai.toolCall (TOOL)
|
||||
└── ai.generateText.doGenerate (LLM)
|
||||
```
|
||||
|
||||
The `session.id` is only set on the **root span**. Child spans are automatically grouped by the trace hierarchy.
|
||||
|
||||
### Querying Sessions
|
||||
|
||||
```bash
|
||||
# Get all traces for a session
|
||||
npx @arizeai/phoenix-cli trace list \
|
||||
--endpoint http://localhost:6006 \
|
||||
--project your-app \
|
||||
--format raw \
|
||||
--no-progress | \
|
||||
jq '.[] | select(.spans[0].attributes["session.id"] == "YOUR-SESSION-ID")'
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
```json
|
||||
{
|
||||
"dependencies": {
|
||||
"@arizeai/openinference-core": "^2.0.5",
|
||||
"@arizeai/phoenix-otel": "^0.4.1"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Note:** `@opentelemetry/api` is NOT needed - it's only for manual span management.
|
||||
|
||||
## Why This Pattern?
|
||||
|
||||
1. **Simple**: Just export SESSION_ID, use withSpan directly - no wrappers
|
||||
2. **Built-in**: `withSpan` from `@arizeai/openinference-core` handles everything
|
||||
3. **Type-safe**: Preserves function signatures and type information
|
||||
4. **Automatic lifecycle**: Handles span creation, error tracking, and cleanup
|
||||
5. **Framework-agnostic**: Works with any LLM framework (AI SDK, LangChain, etc.)
|
||||
6. **No extra deps**: Don't need `@opentelemetry/api` or custom utilities
|
||||
|
||||
## Adding More Attributes
|
||||
|
||||
```typescript
|
||||
import { withSpan } from "@arizeai/openinference-core";
|
||||
import { SESSION_ID } from "./instrumentation";
|
||||
|
||||
const handleWithContext = withSpan(
|
||||
async (userInput: string) => {
|
||||
return await agent.generate({ prompt: userInput });
|
||||
},
|
||||
{
|
||||
name: "cli.interaction",
|
||||
kind: "CHAIN",
|
||||
attributes: {
|
||||
"session.id": SESSION_ID,
|
||||
"user.id": userId, // Track user
|
||||
"metadata.environment": "prod", // Custom metadata
|
||||
},
|
||||
}
|
||||
);
|
||||
```
|
||||
|
||||
## Anti-Pattern: Don't Create Wrappers
|
||||
|
||||
❌ **Don't do this:**
|
||||
```typescript
|
||||
// Unnecessary wrapper
|
||||
export function withSessionTracking(fn) {
|
||||
return withSpan(fn, { attributes: { "session.id": SESSION_ID } });
|
||||
}
|
||||
```
|
||||
|
||||
✅ **Do this instead:**
|
||||
```typescript
|
||||
// Use withSpan directly
|
||||
import { withSpan } from "@arizeai/openinference-core";
|
||||
import { SESSION_ID } from "./instrumentation";
|
||||
|
||||
const handler = withSpan(fn, {
|
||||
attributes: { "session.id": SESSION_ID }
|
||||
});
|
||||
```
|
||||
|
||||
## Alternative: Context API Pattern
|
||||
|
||||
For web servers or complex async flows where you need to propagate session IDs through middleware, you can use the Context API:
|
||||
|
||||
```typescript
|
||||
import { context } from "@opentelemetry/api";
|
||||
import { setSession } from "@arizeai/openinference-core";
|
||||
|
||||
await context.with(
|
||||
setSession(context.active(), { sessionId: "user_123_conv_456" }),
|
||||
async () => {
|
||||
const response = await llm.invoke(prompt);
|
||||
}
|
||||
);
|
||||
```
|
||||
|
||||
**Use Context API when:**
|
||||
- Building web servers with middleware chains
|
||||
- Session ID needs to flow through many async boundaries
|
||||
- You don't control the call stack (e.g., framework-provided handlers)
|
||||
|
||||
**Use withSpan when:**
|
||||
- Building CLI apps or scripts
|
||||
- You control the function call points
|
||||
- Simpler, more explicit code is preferred
|
||||
|
||||
## Related
|
||||
|
||||
- `fundamentals-universal-attributes.md` - Other universal attributes (user.id, metadata)
|
||||
- `span-chain.md` - CHAIN span specification
|
||||
- `sessions-python.md` - Python session tracking patterns
|
||||
@@ -0,0 +1,131 @@
|
||||
# Phoenix Tracing: Python Setup
|
||||
|
||||
**Setup Phoenix tracing in Python with `arize-phoenix-otel`.**
|
||||
|
||||
## Metadata
|
||||
|
||||
| Attribute | Value |
|
||||
| ---------- | ----------------------------------- |
|
||||
| Priority | Critical - required for all tracing |
|
||||
| Setup Time | <5 min |
|
||||
|
||||
## Quick Start (3 lines)
|
||||
|
||||
```python
|
||||
from phoenix.otel import register
|
||||
register(project_name="my-app", auto_instrument=True)
|
||||
```
|
||||
|
||||
**Connects to `http://localhost:6006`, auto-instruments all supported libraries.**
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install arize-phoenix-otel
|
||||
```
|
||||
|
||||
**Supported:** Python 3.10-3.13
|
||||
|
||||
## Configuration
|
||||
|
||||
### Environment Variables (Recommended)
|
||||
|
||||
```bash
|
||||
export PHOENIX_API_KEY="your-api-key" # Required for Phoenix Cloud
|
||||
export PHOENIX_COLLECTOR_ENDPOINT="http://localhost:6006" # Or Cloud URL
|
||||
export PHOENIX_PROJECT_NAME="my-app" # Optional
|
||||
```
|
||||
|
||||
### Python Code
|
||||
|
||||
```python
|
||||
from phoenix.otel import register
|
||||
|
||||
tracer_provider = register(
|
||||
project_name="my-app", # Project name
|
||||
endpoint="http://localhost:6006", # Phoenix endpoint
|
||||
auto_instrument=True, # Auto-instrument supported libs
|
||||
batch=True, # Batch processing (default: True)
|
||||
)
|
||||
```
|
||||
|
||||
**Parameters:**
|
||||
|
||||
- `project_name`: Project name (overrides `PHOENIX_PROJECT_NAME`)
|
||||
- `endpoint`: Phoenix URL (overrides `PHOENIX_COLLECTOR_ENDPOINT`)
|
||||
- `auto_instrument`: Enable auto-instrumentation (default: False)
|
||||
- `batch`: Use BatchSpanProcessor (default: True, production-recommended)
|
||||
- `protocol`: `"http/protobuf"` (default) or `"grpc"`
|
||||
|
||||
## Auto-Instrumentation
|
||||
|
||||
Install instrumentors for your frameworks:
|
||||
|
||||
```bash
|
||||
pip install openinference-instrumentation-openai # OpenAI SDK
|
||||
pip install openinference-instrumentation-langchain # LangChain
|
||||
pip install openinference-instrumentation-llama-index # LlamaIndex
|
||||
# ... install others as needed
|
||||
```
|
||||
|
||||
Then enable auto-instrumentation:
|
||||
|
||||
```python
|
||||
register(project_name="my-app", auto_instrument=True)
|
||||
```
|
||||
|
||||
Phoenix discovers and instruments all installed OpenInference packages automatically.
|
||||
|
||||
## Batch Processing (Production)
|
||||
|
||||
Enabled by default. Configure via environment variables:
|
||||
|
||||
```bash
|
||||
export OTEL_BSP_SCHEDULE_DELAY=5000 # Batch every 5s
|
||||
export OTEL_BSP_MAX_QUEUE_SIZE=2048 # Queue 2048 spans
|
||||
export OTEL_BSP_MAX_EXPORT_BATCH_SIZE=512 # Send 512 spans/batch
|
||||
```
|
||||
|
||||
**Link:** https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/
|
||||
|
||||
## Verification
|
||||
|
||||
1. Open Phoenix UI: `http://localhost:6006`
|
||||
2. Navigate to your project
|
||||
3. Run your application
|
||||
4. Check for traces (appear within batch delay)
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**No traces:**
|
||||
|
||||
- Verify `PHOENIX_COLLECTOR_ENDPOINT` matches Phoenix server
|
||||
- Set `PHOENIX_API_KEY` for Phoenix Cloud
|
||||
- Confirm instrumentors installed
|
||||
|
||||
**Missing attributes:**
|
||||
|
||||
- Check span kind (see rules/ directory)
|
||||
- Verify attribute names (see rules/ directory)
|
||||
|
||||
## Example
|
||||
|
||||
```python
|
||||
from phoenix.otel import register
|
||||
from openai import OpenAI
|
||||
|
||||
# Enable tracing with auto-instrumentation
|
||||
register(project_name="my-chatbot", auto_instrument=True)
|
||||
|
||||
# OpenAI automatically instrumented
|
||||
client = OpenAI()
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-4",
|
||||
messages=[{"role": "user", "content": "Hello!"}]
|
||||
)
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
- [Python OTEL API Docs](https://arize-phoenix.readthedocs.io/projects/otel/en/latest/)
|
||||
- [Python Client API Docs](https://arize-phoenix.readthedocs.io/projects/client/en/latest/)
|
||||
@@ -0,0 +1,170 @@
|
||||
# TypeScript Setup
|
||||
|
||||
Setup Phoenix tracing in TypeScript/JavaScript with `@arizeai/phoenix-otel`.
|
||||
|
||||
## Metadata
|
||||
|
||||
| Attribute | Value |
|
||||
|-----------|-------|
|
||||
| Priority | Critical - required for all tracing |
|
||||
| Setup Time | <5 min |
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
npm install @arizeai/phoenix-otel
|
||||
```
|
||||
|
||||
```typescript
|
||||
import { register } from "@arizeai/phoenix-otel";
|
||||
register({ projectName: "my-app" });
|
||||
```
|
||||
|
||||
Connects to `http://localhost:6006` by default.
|
||||
|
||||
## Configuration
|
||||
|
||||
```typescript
|
||||
import { register } from "@arizeai/phoenix-otel";
|
||||
|
||||
register({
|
||||
projectName: "my-app",
|
||||
url: "http://localhost:6006",
|
||||
apiKey: process.env.PHOENIX_API_KEY,
|
||||
batch: true
|
||||
});
|
||||
```
|
||||
|
||||
**Environment variables:**
|
||||
|
||||
```bash
|
||||
export PHOENIX_API_KEY="your-api-key"
|
||||
export PHOENIX_COLLECTOR_ENDPOINT="http://localhost:6006"
|
||||
export PHOENIX_PROJECT_NAME="my-app"
|
||||
```
|
||||
|
||||
## ESM vs CommonJS
|
||||
|
||||
**CommonJS (automatic):**
|
||||
|
||||
```javascript
|
||||
const { register } = require("@arizeai/phoenix-otel");
|
||||
register({ projectName: "my-app" });
|
||||
|
||||
const OpenAI = require("openai");
|
||||
```
|
||||
|
||||
**ESM (manual instrumentation required):**
|
||||
|
||||
```typescript
|
||||
import { register, registerInstrumentations } from "@arizeai/phoenix-otel";
|
||||
import { OpenAIInstrumentation } from "@arizeai/openinference-instrumentation-openai";
|
||||
import OpenAI from "openai";
|
||||
|
||||
register({ projectName: "my-app" });
|
||||
|
||||
const instrumentation = new OpenAIInstrumentation();
|
||||
instrumentation.manuallyInstrument(OpenAI);
|
||||
registerInstrumentations({ instrumentations: [instrumentation] });
|
||||
```
|
||||
|
||||
**Why:** ESM imports are hoisted, so `manuallyInstrument()` is needed.
|
||||
|
||||
## Framework Integration
|
||||
|
||||
**Next.js (App Router):**
|
||||
|
||||
```typescript
|
||||
// instrumentation.ts
|
||||
export async function register() {
|
||||
if (process.env.NEXT_RUNTIME === "nodejs") {
|
||||
const { register } = await import("@arizeai/phoenix-otel");
|
||||
register({ projectName: "my-nextjs-app" });
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Express.js:**
|
||||
|
||||
```typescript
|
||||
import { register } from "@arizeai/phoenix-otel";
|
||||
|
||||
register({ projectName: "my-express-app" });
|
||||
|
||||
const app = express();
|
||||
```
|
||||
|
||||
## Flushing Spans Before Exit
|
||||
|
||||
**CRITICAL:** Spans may not be exported if still queued in the processor when your process exits. Call `provider.shutdown()` to explicitly flush before exit.
|
||||
|
||||
**Standard pattern:**
|
||||
|
||||
```typescript
|
||||
const provider = register({
|
||||
projectName: "my-app",
|
||||
batch: true,
|
||||
});
|
||||
|
||||
async function main() {
|
||||
await doWork();
|
||||
await provider.shutdown(); // Flush spans before exit
|
||||
}
|
||||
|
||||
main().catch(async (error) => {
|
||||
console.error(error);
|
||||
await provider.shutdown(); // Flush on error too
|
||||
process.exit(1);
|
||||
});
|
||||
```
|
||||
|
||||
**Alternative:**
|
||||
|
||||
```typescript
|
||||
// Use batch: false for immediate export (no shutdown needed)
|
||||
register({
|
||||
projectName: "my-app",
|
||||
batch: false,
|
||||
});
|
||||
```
|
||||
|
||||
For production patterns including graceful termination, see `production-typescript.md`.
|
||||
|
||||
## Verification
|
||||
|
||||
1. Open Phoenix UI: `http://localhost:6006`
|
||||
2. Run your application
|
||||
3. Check for traces in your project
|
||||
|
||||
**Enable diagnostic logging:**
|
||||
|
||||
```typescript
|
||||
import { DiagLogLevel, register } from "@arizeai/phoenix-otel";
|
||||
|
||||
register({
|
||||
projectName: "my-app",
|
||||
diagLogLevel: DiagLogLevel.DEBUG,
|
||||
});
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**No traces:**
|
||||
- Verify `PHOENIX_COLLECTOR_ENDPOINT` is correct
|
||||
- Set `PHOENIX_API_KEY` for Phoenix Cloud
|
||||
- For ESM: Ensure `manuallyInstrument()` is called
|
||||
- **With `batch: true`:** Call `provider.shutdown()` before exit to flush queued spans (see Flushing Spans section)
|
||||
|
||||
**Traces missing:**
|
||||
- With `batch: true`: Call `await provider.shutdown()` before process exit to flush queued spans
|
||||
- Alternative: Set `batch: false` for immediate export (no shutdown needed)
|
||||
|
||||
**Missing attributes:**
|
||||
- Check instrumentation is registered (ESM requires manual setup)
|
||||
- See `instrumentation-auto-typescript.md`
|
||||
|
||||
## See Also
|
||||
|
||||
- **Auto-instrumentation:** `instrumentation-auto-typescript.md`
|
||||
- **Manual instrumentation:** `instrumentation-manual-typescript.md`
|
||||
- **API docs:** https://arize-ai.github.io/phoenix/
|
||||
@@ -0,0 +1,15 @@
|
||||
# AGENT Spans
|
||||
|
||||
AGENT spans represent autonomous reasoning blocks (ReAct agents, planning loops, multi-step decision making).
|
||||
|
||||
**Required:** `openinference.span.kind` = "AGENT"
|
||||
|
||||
## Example
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "AGENT",
|
||||
"input.value": "Book a flight to New York for next Monday",
|
||||
"output.value": "I've booked flight AA123 departing Monday at 9:00 AM"
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,43 @@
|
||||
# CHAIN Spans
|
||||
|
||||
## Purpose
|
||||
|
||||
CHAIN spans represent orchestration layers in your application (LangChain chains, custom workflows, application entry points). Often used as root spans.
|
||||
|
||||
## Required Attributes
|
||||
|
||||
| Attribute | Type | Description | Required |
|
||||
| ------------------------- | ------ | --------------- | -------- |
|
||||
| `openinference.span.kind` | String | Must be "CHAIN" | Yes |
|
||||
|
||||
## Common Attributes
|
||||
|
||||
CHAIN spans typically use [Universal Attributes](fundamentals-universal-attributes.md):
|
||||
|
||||
- `input.value` - Input to the chain (user query, request payload)
|
||||
- `output.value` - Output from the chain (final response)
|
||||
- `input.mime_type` / `output.mime_type` - Format indicators
|
||||
|
||||
## Example: Root Chain
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "CHAIN",
|
||||
"input.value": "{\"question\": \"What is the capital of France?\"}",
|
||||
"input.mime_type": "application/json",
|
||||
"output.value": "{\"answer\": \"The capital of France is Paris.\", \"sources\": [\"doc_123\"]}",
|
||||
"output.mime_type": "application/json",
|
||||
"session.id": "session_abc123",
|
||||
"user.id": "user_xyz789"
|
||||
}
|
||||
```
|
||||
|
||||
## Example: Nested Sub-Chain
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "CHAIN",
|
||||
"input.value": "Summarize this document: ...",
|
||||
"output.value": "This document discusses..."
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,91 @@
|
||||
# EMBEDDING Spans
|
||||
|
||||
## Purpose
|
||||
|
||||
EMBEDDING spans represent vector generation operations (text-to-vector conversion for semantic search).
|
||||
|
||||
## Required Attributes
|
||||
|
||||
| Attribute | Type | Description | Required |
|
||||
|-----------|------|-------------|----------|
|
||||
| `openinference.span.kind` | String | Must be "EMBEDDING" | Yes |
|
||||
| `embedding.model_name` | String | Embedding model identifier | Recommended |
|
||||
|
||||
## Attribute Reference
|
||||
|
||||
### Single Embedding
|
||||
|
||||
| Attribute | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `embedding.model_name` | String | Embedding model identifier |
|
||||
| `embedding.text` | String | Input text to embed |
|
||||
| `embedding.vector` | String (JSON array) | Generated embedding vector |
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"embedding.model_name": "text-embedding-ada-002",
|
||||
"embedding.text": "What is machine learning?",
|
||||
"embedding.vector": "[0.023, -0.012, 0.045, ..., 0.001]"
|
||||
}
|
||||
```
|
||||
|
||||
### Batch Embeddings
|
||||
|
||||
| Attribute Pattern | Type | Description |
|
||||
|-------------------|------|-------------|
|
||||
| `embedding.embeddings.{i}.embedding.text` | String | Text at index i |
|
||||
| `embedding.embeddings.{i}.embedding.vector` | String (JSON array) | Vector at index i |
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"embedding.model_name": "text-embedding-ada-002",
|
||||
"embedding.embeddings.0.embedding.text": "First document",
|
||||
"embedding.embeddings.0.embedding.vector": "[0.1, 0.2, 0.3, ..., 0.5]",
|
||||
"embedding.embeddings.1.embedding.text": "Second document",
|
||||
"embedding.embeddings.1.embedding.vector": "[0.6, 0.7, 0.8, ..., 0.9]"
|
||||
}
|
||||
```
|
||||
|
||||
### Vector Format
|
||||
|
||||
Vectors stored as JSON array strings:
|
||||
- Dimensions: Typically 384, 768, 1536, or 3072
|
||||
- Format: `"[0.123, -0.456, 0.789, ...]"`
|
||||
- Precision: Usually 3-6 decimal places
|
||||
|
||||
**Storage Considerations:**
|
||||
- Large vectors can significantly increase trace size
|
||||
- Consider omitting vectors in production (keep `embedding.text` for debugging)
|
||||
- Use separate vector database for actual similarity search
|
||||
|
||||
## Examples
|
||||
|
||||
### Single Embedding
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "EMBEDDING",
|
||||
"embedding.model_name": "text-embedding-ada-002",
|
||||
"embedding.text": "What is machine learning?",
|
||||
"embedding.vector": "[0.023, -0.012, 0.045, ..., 0.001]",
|
||||
"input.value": "What is machine learning?",
|
||||
"output.value": "[0.023, -0.012, 0.045, ..., 0.001]"
|
||||
}
|
||||
```
|
||||
|
||||
### Batch Embeddings
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "EMBEDDING",
|
||||
"embedding.model_name": "text-embedding-ada-002",
|
||||
"embedding.embeddings.0.embedding.text": "First document",
|
||||
"embedding.embeddings.0.embedding.vector": "[0.1, 0.2, 0.3]",
|
||||
"embedding.embeddings.1.embedding.text": "Second document",
|
||||
"embedding.embeddings.1.embedding.vector": "[0.4, 0.5, 0.6]",
|
||||
"embedding.embeddings.2.embedding.text": "Third document",
|
||||
"embedding.embeddings.2.embedding.vector": "[0.7, 0.8, 0.9]"
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,51 @@
|
||||
# EVALUATOR Spans
|
||||
|
||||
## Purpose
|
||||
|
||||
EVALUATOR spans represent quality assessment operations (answer relevance, faithfulness, hallucination detection).
|
||||
|
||||
## Required Attributes
|
||||
|
||||
| Attribute | Type | Description | Required |
|
||||
|-----------|------|-------------|----------|
|
||||
| `openinference.span.kind` | String | Must be "EVALUATOR" | Yes |
|
||||
|
||||
## Common Attributes
|
||||
|
||||
| Attribute | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `input.value` | String | Content being evaluated |
|
||||
| `output.value` | String | Evaluation result (score, label, explanation) |
|
||||
| `metadata.evaluator_name` | String | Evaluator identifier |
|
||||
| `metadata.score` | Float | Numeric score (0-1) |
|
||||
| `metadata.label` | String | Categorical label (relevant/irrelevant) |
|
||||
|
||||
## Example: Answer Relevance
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "EVALUATOR",
|
||||
"input.value": "{\"question\": \"What is the capital of France?\", \"answer\": \"The capital of France is Paris.\"}",
|
||||
"input.mime_type": "application/json",
|
||||
"output.value": "0.95",
|
||||
"metadata.evaluator_name": "answer_relevance",
|
||||
"metadata.score": 0.95,
|
||||
"metadata.label": "relevant",
|
||||
"metadata.explanation": "Answer directly addresses the question with correct information"
|
||||
}
|
||||
```
|
||||
|
||||
## Example: Faithfulness Check
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "EVALUATOR",
|
||||
"input.value": "{\"context\": \"Paris is in France.\", \"answer\": \"Paris is the capital of France.\"}",
|
||||
"input.mime_type": "application/json",
|
||||
"output.value": "0.5",
|
||||
"metadata.evaluator_name": "faithfulness",
|
||||
"metadata.score": 0.5,
|
||||
"metadata.label": "partially_faithful",
|
||||
"metadata.explanation": "Answer makes unsupported claim about Paris being the capital"
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,49 @@
|
||||
# GUARDRAIL Spans
|
||||
|
||||
## Purpose
|
||||
|
||||
GUARDRAIL spans represent safety and policy checks (content moderation, PII detection, toxicity scoring).
|
||||
|
||||
## Required Attributes
|
||||
|
||||
| Attribute | Type | Description | Required |
|
||||
|-----------|------|-------------|----------|
|
||||
| `openinference.span.kind` | String | Must be "GUARDRAIL" | Yes |
|
||||
|
||||
## Common Attributes
|
||||
|
||||
| Attribute | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `input.value` | String | Content being checked |
|
||||
| `output.value` | String | Guardrail result (allowed/blocked/flagged) |
|
||||
| `metadata.guardrail_type` | String | Type of check (toxicity, pii, bias) |
|
||||
| `metadata.score` | Float | Safety score (0-1) |
|
||||
| `metadata.threshold` | Float | Threshold for blocking |
|
||||
|
||||
## Example: Content Moderation
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "GUARDRAIL",
|
||||
"input.value": "User message: I want to build a bomb",
|
||||
"output.value": "BLOCKED",
|
||||
"metadata.guardrail_type": "content_moderation",
|
||||
"metadata.score": 0.95,
|
||||
"metadata.threshold": 0.7,
|
||||
"metadata.categories": "[\"violence\", \"weapons\"]",
|
||||
"metadata.action": "block_and_log"
|
||||
}
|
||||
```
|
||||
|
||||
## Example: PII Detection
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "GUARDRAIL",
|
||||
"input.value": "My SSN is 123-45-6789",
|
||||
"output.value": "FLAGGED",
|
||||
"metadata.guardrail_type": "pii_detection",
|
||||
"metadata.detected_pii": "[\"ssn\"]",
|
||||
"metadata.redacted_output": "My SSN is [REDACTED]"
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,79 @@
|
||||
# LLM Spans
|
||||
|
||||
Represent calls to language models (OpenAI, Anthropic, local models, etc.).
|
||||
|
||||
## Required Attributes
|
||||
|
||||
| Attribute | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `openinference.span.kind` | String | Must be "LLM" |
|
||||
| `llm.model_name` | String | Model identifier (e.g., "gpt-4", "claude-3-5-sonnet-20241022") |
|
||||
|
||||
## Key Attributes
|
||||
|
||||
| Category | Attributes | Example |
|
||||
|----------|------------|---------|
|
||||
| **Model** | `llm.model_name`, `llm.provider` | "gpt-4-turbo", "openai" |
|
||||
| **Tokens** | `llm.token_count.prompt`, `llm.token_count.completion`, `llm.token_count.total` | 25, 8, 33 |
|
||||
| **Cost** | `llm.cost.prompt`, `llm.cost.completion`, `llm.cost.total` | 0.0021, 0.0045, 0.0066 |
|
||||
| **Parameters** | `llm.invocation_parameters` (JSON) | `{"temperature": 0.7, "max_tokens": 1024}` |
|
||||
| **Messages** | `llm.input_messages.{i}.*`, `llm.output_messages.{i}.*` | See examples below |
|
||||
| **Tools** | `llm.tools.{i}.tool.json_schema` | Function definitions |
|
||||
|
||||
## Cost Tracking
|
||||
|
||||
**Core attributes:**
|
||||
- `llm.cost.prompt` - Total input cost (USD)
|
||||
- `llm.cost.completion` - Total output cost (USD)
|
||||
- `llm.cost.total` - Total cost (USD)
|
||||
|
||||
**Detailed cost breakdown:**
|
||||
- `llm.cost.prompt_details.{input,cache_read,cache_write,audio}` - Input cost components
|
||||
- `llm.cost.completion_details.{output,reasoning,audio}` - Output cost components
|
||||
|
||||
## Messages
|
||||
|
||||
**Input messages:**
|
||||
- `llm.input_messages.{i}.message.role` - "user", "assistant", "system", "tool"
|
||||
- `llm.input_messages.{i}.message.content` - Text content
|
||||
- `llm.input_messages.{i}.message.contents.{j}` - Multimodal (text + images)
|
||||
- `llm.input_messages.{i}.message.tool_calls` - Tool invocations
|
||||
|
||||
**Output messages:** Same structure as input messages.
|
||||
|
||||
## Example: Basic LLM Call
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "LLM",
|
||||
"llm.model_name": "claude-3-5-sonnet-20241022",
|
||||
"llm.invocation_parameters": "{\"temperature\": 0.7, \"max_tokens\": 1024}",
|
||||
"llm.input_messages.0.message.role": "system",
|
||||
"llm.input_messages.0.message.content": "You are a helpful assistant.",
|
||||
"llm.input_messages.1.message.role": "user",
|
||||
"llm.input_messages.1.message.content": "What is the capital of France?",
|
||||
"llm.output_messages.0.message.role": "assistant",
|
||||
"llm.output_messages.0.message.content": "The capital of France is Paris.",
|
||||
"llm.token_count.prompt": 25,
|
||||
"llm.token_count.completion": 8,
|
||||
"llm.token_count.total": 33
|
||||
}
|
||||
```
|
||||
|
||||
## Example: LLM with Tool Calls
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "LLM",
|
||||
"llm.model_name": "gpt-4-turbo",
|
||||
"llm.input_messages.0.message.content": "What's the weather in SF?",
|
||||
"llm.output_messages.0.message.tool_calls.0.tool_call.function.name": "get_weather",
|
||||
"llm.output_messages.0.message.tool_calls.0.tool_call.function.arguments": "{\"location\": \"San Francisco\"}",
|
||||
"llm.tools.0.tool.json_schema": "{\"type\": \"function\", \"function\": {\"name\": \"get_weather\"}}"
|
||||
}
|
||||
```
|
||||
|
||||
## See Also
|
||||
|
||||
- **Instrumentation:** `instrumentation-auto-python.md`, `instrumentation-manual-python.md`
|
||||
- **Full spec:** https://github.com/Arize-ai/openinference/blob/main/spec/semantic_conventions.md
|
||||
@@ -0,0 +1,86 @@
|
||||
# RERANKER Spans
|
||||
|
||||
## Purpose
|
||||
|
||||
RERANKER spans represent reordering of retrieved documents (Cohere Rerank, cross-encoder models).
|
||||
|
||||
## Required Attributes
|
||||
|
||||
| Attribute | Type | Description | Required |
|
||||
|-----------|------|-------------|----------|
|
||||
| `openinference.span.kind` | String | Must be "RERANKER" | Yes |
|
||||
|
||||
## Attribute Reference
|
||||
|
||||
### Reranker Parameters
|
||||
|
||||
| Attribute | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `reranker.model_name` | String | Reranker model identifier |
|
||||
| `reranker.query` | String | Query used for reranking |
|
||||
| `reranker.top_k` | Integer | Number of documents to return |
|
||||
|
||||
### Input Documents
|
||||
|
||||
| Attribute Pattern | Type | Description |
|
||||
|-------------------|------|-------------|
|
||||
| `reranker.input_documents.{i}.document.id` | String | Input document ID |
|
||||
| `reranker.input_documents.{i}.document.content` | String | Input document content |
|
||||
| `reranker.input_documents.{i}.document.score` | Float | Original retrieval score |
|
||||
| `reranker.input_documents.{i}.document.metadata` | String (JSON) | Document metadata |
|
||||
|
||||
### Output Documents
|
||||
|
||||
| Attribute Pattern | Type | Description |
|
||||
|-------------------|------|-------------|
|
||||
| `reranker.output_documents.{i}.document.id` | String | Output document ID (reordered) |
|
||||
| `reranker.output_documents.{i}.document.content` | String | Output document content |
|
||||
| `reranker.output_documents.{i}.document.score` | Float | New reranker score |
|
||||
| `reranker.output_documents.{i}.document.metadata` | String (JSON) | Document metadata |
|
||||
|
||||
### Score Comparison
|
||||
|
||||
Input scores (from retriever) vs. output scores (from reranker):
|
||||
|
||||
```json
|
||||
{
|
||||
"reranker.input_documents.0.document.id": "doc_A",
|
||||
"reranker.input_documents.0.document.score": 0.7,
|
||||
"reranker.input_documents.1.document.id": "doc_B",
|
||||
"reranker.input_documents.1.document.score": 0.9,
|
||||
"reranker.output_documents.0.document.id": "doc_B",
|
||||
"reranker.output_documents.0.document.score": 0.95,
|
||||
"reranker.output_documents.1.document.id": "doc_A",
|
||||
"reranker.output_documents.1.document.score": 0.85
|
||||
}
|
||||
```
|
||||
|
||||
In this example:
|
||||
- Input: doc_B (0.9) ranked higher than doc_A (0.7)
|
||||
- Output: doc_B still highest but both scores increased
|
||||
- Reranker confirmed retriever's ordering but refined scores
|
||||
|
||||
## Examples
|
||||
|
||||
### Complete Reranking Example
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "RERANKER",
|
||||
"reranker.model_name": "cohere-rerank-v2",
|
||||
"reranker.query": "What is machine learning?",
|
||||
"reranker.top_k": 2,
|
||||
"reranker.input_documents.0.document.id": "doc_123",
|
||||
"reranker.input_documents.0.document.content": "Machine learning is a subset...",
|
||||
"reranker.input_documents.1.document.id": "doc_456",
|
||||
"reranker.input_documents.1.document.content": "Supervised learning algorithms...",
|
||||
"reranker.input_documents.2.document.id": "doc_789",
|
||||
"reranker.input_documents.2.document.content": "Neural networks are...",
|
||||
"reranker.output_documents.0.document.id": "doc_456",
|
||||
"reranker.output_documents.0.document.content": "Supervised learning algorithms...",
|
||||
"reranker.output_documents.0.document.score": 0.95,
|
||||
"reranker.output_documents.1.document.id": "doc_123",
|
||||
"reranker.output_documents.1.document.content": "Machine learning is a subset...",
|
||||
"reranker.output_documents.1.document.score": 0.88
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,110 @@
|
||||
# RETRIEVER Spans
|
||||
|
||||
## Purpose
|
||||
|
||||
RETRIEVER spans represent document/context retrieval operations (vector DB queries, semantic search, keyword search).
|
||||
|
||||
## Required Attributes
|
||||
|
||||
| Attribute | Type | Description | Required |
|
||||
|-----------|------|-------------|----------|
|
||||
| `openinference.span.kind` | String | Must be "RETRIEVER" | Yes |
|
||||
|
||||
## Attribute Reference
|
||||
|
||||
### Query
|
||||
|
||||
| Attribute | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `input.value` | String | Search query text |
|
||||
|
||||
### Document Schema
|
||||
|
||||
| Attribute Pattern | Type | Description |
|
||||
|-------------------|------|-------------|
|
||||
| `retrieval.documents.{i}.document.id` | String | Unique document identifier |
|
||||
| `retrieval.documents.{i}.document.content` | String | Document text content |
|
||||
| `retrieval.documents.{i}.document.score` | Float | Relevance score (0-1 or distance) |
|
||||
| `retrieval.documents.{i}.document.metadata` | String (JSON) | Document metadata |
|
||||
|
||||
### Flattening Pattern for Documents
|
||||
|
||||
Documents are flattened using zero-indexed notation:
|
||||
|
||||
```
|
||||
retrieval.documents.0.document.id
|
||||
retrieval.documents.0.document.content
|
||||
retrieval.documents.0.document.score
|
||||
retrieval.documents.1.document.id
|
||||
retrieval.documents.1.document.content
|
||||
retrieval.documents.1.document.score
|
||||
...
|
||||
```
|
||||
|
||||
### Document Metadata
|
||||
|
||||
Common metadata fields (stored as JSON string):
|
||||
|
||||
```json
|
||||
{
|
||||
"source": "knowledge_base.pdf",
|
||||
"page": 42,
|
||||
"section": "Introduction",
|
||||
"author": "Jane Doe",
|
||||
"created_at": "2024-01-15",
|
||||
"url": "https://example.com/doc",
|
||||
"chunk_id": "chunk_123"
|
||||
}
|
||||
```
|
||||
|
||||
**Example with metadata:**
|
||||
```json
|
||||
{
|
||||
"retrieval.documents.0.document.id": "doc_123",
|
||||
"retrieval.documents.0.document.content": "Machine learning is a method of data analysis...",
|
||||
"retrieval.documents.0.document.score": 0.92,
|
||||
"retrieval.documents.0.document.metadata": "{\"source\": \"ml_textbook.pdf\", \"page\": 15, \"chapter\": \"Introduction\"}"
|
||||
}
|
||||
```
|
||||
|
||||
### Ordering
|
||||
|
||||
Documents are ordered by index (0, 1, 2, ...). Typically:
|
||||
- Index 0 = highest scoring document
|
||||
- Index 1 = second highest
|
||||
- etc.
|
||||
|
||||
Preserve retrieval order in your flattened attributes.
|
||||
|
||||
### Large Document Handling
|
||||
|
||||
For very long documents:
|
||||
- Consider truncating `document.content` to first N characters
|
||||
- Store full content in separate document store
|
||||
- Use `document.id` to reference full content
|
||||
|
||||
## Examples
|
||||
|
||||
### Basic Vector Search
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "RETRIEVER",
|
||||
"input.value": "What is machine learning?",
|
||||
"retrieval.documents.0.document.id": "doc_123",
|
||||
"retrieval.documents.0.document.content": "Machine learning is a subset of artificial intelligence...",
|
||||
"retrieval.documents.0.document.score": 0.92,
|
||||
"retrieval.documents.0.document.metadata": "{\"source\": \"textbook.pdf\", \"page\": 42}",
|
||||
"retrieval.documents.1.document.id": "doc_456",
|
||||
"retrieval.documents.1.document.content": "Machine learning algorithms learn patterns from data...",
|
||||
"retrieval.documents.1.document.score": 0.87,
|
||||
"retrieval.documents.1.document.metadata": "{\"source\": \"article.html\", \"author\": \"Jane Doe\"}",
|
||||
"retrieval.documents.2.document.id": "doc_789",
|
||||
"retrieval.documents.2.document.content": "Supervised learning is a type of machine learning...",
|
||||
"retrieval.documents.2.document.score": 0.81,
|
||||
"retrieval.documents.2.document.metadata": "{\"source\": \"wiki.org\"}",
|
||||
"metadata.retriever_type": "vector_search",
|
||||
"metadata.vector_db": "pinecone",
|
||||
"metadata.top_k": 3
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,67 @@
|
||||
# TOOL Spans
|
||||
|
||||
## Purpose
|
||||
|
||||
TOOL spans represent external tool or function invocations (API calls, database queries, calculators, custom functions).
|
||||
|
||||
## Required Attributes
|
||||
|
||||
| Attribute | Type | Description | Required |
|
||||
| ------------------------- | ------ | ------------------ | ----------- |
|
||||
| `openinference.span.kind` | String | Must be "TOOL" | Yes |
|
||||
| `tool.name` | String | Tool/function name | Recommended |
|
||||
|
||||
## Attribute Reference
|
||||
|
||||
### Tool Execution Attributes
|
||||
|
||||
| Attribute | Type | Description |
|
||||
| ------------------ | ------------- | ------------------------------------------ |
|
||||
| `tool.name` | String | Tool/function name |
|
||||
| `tool.description` | String | Tool purpose/description |
|
||||
| `tool.parameters` | String (JSON) | JSON schema defining the tool's parameters |
|
||||
| `input.value` | String (JSON) | Actual input values passed to the tool |
|
||||
| `output.value` | String | Tool output/result |
|
||||
| `output.mime_type` | String | Result content type (e.g., "application/json") |
|
||||
|
||||
## Examples
|
||||
|
||||
### API Call Tool
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "TOOL",
|
||||
"tool.name": "get_weather",
|
||||
"tool.description": "Fetches current weather for a location",
|
||||
"tool.parameters": "{\"type\": \"object\", \"properties\": {\"location\": {\"type\": \"string\"}, \"units\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]}}, \"required\": [\"location\"]}",
|
||||
"input.value": "{\"location\": \"San Francisco\", \"units\": \"celsius\"}",
|
||||
"output.value": "{\"temperature\": 18, \"conditions\": \"partly cloudy\"}"
|
||||
}
|
||||
```
|
||||
|
||||
### Calculator Tool
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "TOOL",
|
||||
"tool.name": "calculator",
|
||||
"tool.description": "Performs mathematical calculations",
|
||||
"tool.parameters": "{\"type\": \"object\", \"properties\": {\"expression\": {\"type\": \"string\", \"description\": \"Math expression to evaluate\"}}, \"required\": [\"expression\"]}",
|
||||
"input.value": "{\"expression\": \"2 + 2\"}",
|
||||
"output.value": "4"
|
||||
}
|
||||
```
|
||||
|
||||
### Database Query Tool
|
||||
|
||||
```json
|
||||
{
|
||||
"openinference.span.kind": "TOOL",
|
||||
"tool.name": "sql_query",
|
||||
"tool.description": "Executes SQL query on user database",
|
||||
"tool.parameters": "{\"type\": \"object\", \"properties\": {\"query\": {\"type\": \"string\", \"description\": \"SQL query to execute\"}}, \"required\": [\"query\"]}",
|
||||
"input.value": "{\"query\": \"SELECT * FROM users WHERE id = 123\"}",
|
||||
"output.value": "[{\"id\": 123, \"name\": \"Alice\", \"email\": \"alice@example.com\"}]",
|
||||
"output.mime_type": "application/json"
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user