mirror of
https://github.com/github/awesome-copilot.git
synced 2026-04-11 10:45:56 +00:00
1.6 KiB
1.6 KiB
Experiments: Running Experiments in Python
Execute experiments with run_experiment.
Basic Usage
from phoenix.client import Client
from phoenix.client.experiments import run_experiment
client = Client()
dataset = client.datasets.get_dataset(name="qa-test-v1")
def my_task(example):
return call_llm(example.input["question"])
def exact_match(output, expected):
return 1.0 if output.strip().lower() == expected["answer"].strip().lower() else 0.0
experiment = run_experiment(
dataset=dataset,
task=my_task,
evaluators=[exact_match],
experiment_name="qa-experiment-v1",
)
Task Functions
# Basic task
def task(example):
return call_llm(example.input["question"])
# With context (RAG)
def rag_task(example):
return call_llm(f"Context: {example.input['context']}\nQ: {example.input['question']}")
Evaluator Parameters
| Parameter | Access |
|---|---|
output |
Task output |
expected |
Example expected output |
input |
Example input |
metadata |
Example metadata |
Options
experiment = run_experiment(
dataset=dataset,
task=my_task,
evaluators=evaluators,
experiment_name="my-experiment",
dry_run=3, # Test with 3 examples
repetitions=3, # Run each example 3 times
)
Results
print(experiment.aggregate_scores)
# {'accuracy': 0.85, 'faithfulness': 0.92}
for run in experiment.runs:
print(run.output, run.scores)
Add Evaluations Later
from phoenix.client.experiments import evaluate_experiment
evaluate_experiment(experiment=experiment, evaluators=[new_evaluator])