Add agentic-eval skill for agent evaluation patterns

skills/agentic-eval/SKILL.md | 189 (new file)

---
name: agentic-eval
description: |
  Patterns and techniques for evaluating and improving AI agent outputs. Use this skill when:
  - Implementing self-critique and reflection loops
  - Building evaluator-optimizer pipelines for quality-critical generation
  - Creating test-driven code refinement workflows
  - Designing rubric-based or LLM-as-judge evaluation systems
  - Adding iterative improvement to agent outputs (code, reports, analysis)
  - Measuring and improving agent response quality
---

# Agentic Evaluation Patterns

Patterns for self-improvement through iterative evaluation and refinement.

## Overview

Evaluation patterns enable agents to assess and improve their own outputs, moving beyond single-shot generation to iterative refinement loops.

```
Generate → Evaluate → Critique → Refine → Output
    ↑                              │
    └──────────────────────────────┘
```

## When to Use

- **Quality-critical generation**: Code, reports, analysis requiring high accuracy
- **Tasks with clear evaluation criteria**: Defined success metrics exist
- **Content requiring specific standards**: Style guides, compliance, formatting

---
## Pattern 1: Basic Reflection

Agent evaluates and improves its own output through self-critique.

```python
import json

# Assumes an `llm(prompt) -> str` helper that calls your model and returns its text response.

def reflect_and_refine(task: str, criteria: list[str], max_iterations: int = 3) -> str:
    """Generate with reflection loop."""
    output = llm(f"Complete this task:\n{task}")

    for _ in range(max_iterations):
        # Self-critique: judge the draft against each criterion
        critique = llm(f"""
        Evaluate this output against criteria: {criteria}
        Output: {output}
        Rate each criterion as JSON: {{"<criterion>": {{"status": "PASS" or "FAIL", "feedback": "..."}}}}
        """)

        critique_data = json.loads(critique)
        all_pass = all(c["status"] == "PASS" for c in critique_data.values())
        if all_pass:
            return output

        # Refine based on critique, addressing only the failed criteria
        failed = {k: v["feedback"] for k, v in critique_data.items() if v["status"] == "FAIL"}
        output = llm(f"Improve to address: {failed}\nOriginal: {output}")

    return output
```

**Key insight**: Use structured JSON output for reliable parsing of critique results.
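
As a rough usage sketch, the loop above can be driven with a task string and a short list of checkable criteria; the task and criteria here are hypothetical examples, and `llm` is the same assumed helper as in the pattern.

```python
# Hypothetical example: criteria are phrased so PASS/FAIL judgments are unambiguous.
summary = reflect_and_refine(
    task="Summarize the incident report below in under 150 words.\n<report text here>",
    criteria=[
        "Stays under 150 words",
        "States the root cause and the remediation",
        "Adds no speculation beyond the report",
    ],
)
print(summary)
```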
---
## Pattern 2: Evaluator-Optimizer

Separate generation and evaluation into distinct components for clearer responsibilities.

```python
# Assumes `import json` and the same `llm(prompt) -> str` helper as in Pattern 1.

class EvaluatorOptimizer:
    def __init__(self, score_threshold: float = 0.8):
        self.score_threshold = score_threshold

    def generate(self, task: str) -> str:
        return llm(f"Complete: {task}")

    def evaluate(self, output: str, task: str) -> dict:
        return json.loads(llm(f"""
        Evaluate output for task: {task}
        Output: {output}
        Return JSON: {{"overall_score": 0-1, "dimensions": {{"accuracy": ..., "clarity": ...}}}}
        """))

    def optimize(self, output: str, feedback: dict) -> str:
        return llm(f"Improve based on feedback: {feedback}\nOutput: {output}")

    def run(self, task: str, max_iterations: int = 3) -> str:
        output = self.generate(task)
        for _ in range(max_iterations):
            evaluation = self.evaluate(output, task)
            if evaluation["overall_score"] >= self.score_threshold:
                break
            output = self.optimize(output, evaluation)
        return output
```
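
A brief usage sketch of the class above; the task text and threshold are illustrative, not prescriptive.

```python
# Raise the threshold for quality-critical output; 3 iterations is the default cap.
optimizer = EvaluatorOptimizer(score_threshold=0.85)
report = optimizer.run("Draft a one-paragraph summary of this quarter's support ticket trends.")
print(report)
```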
---
## Pattern 3: Code-Specific Reflection

Test-driven refinement loop for code generation.

```python
# Assumes the same `llm` helper. `run_tests(code, tests)` executes the generated tests
# against the code and returns {"success": bool, "error": str}; a sketch follows below.

class CodeReflector:
    def reflect_and_fix(self, spec: str, max_iterations: int = 3) -> str:
        code = llm(f"Write Python code for: {spec}")
        tests = llm(f"Generate pytest tests for: {spec}\nCode: {code}")

        for _ in range(max_iterations):
            result = run_tests(code, tests)
            if result["success"]:
                return code
            code = llm(f"Fix error: {result['error']}\nCode: {code}")
        return code
```
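
The loop above relies on a `run_tests(code, tests)` helper that is not defined in this skill. A minimal sketch, assuming pytest is available and that the generated tests import the code under test from a module named `solution` (both names are illustrative):

```python
import subprocess
import tempfile
from pathlib import Path

def run_tests(code: str, tests: str) -> dict:
    """Write the code and tests to a temp dir, run pytest, and report the outcome."""
    with tempfile.TemporaryDirectory() as tmp:
        Path(tmp, "solution.py").write_text(code)
        Path(tmp, "test_solution.py").write_text(tests)
        proc = subprocess.run(
            ["python", "-m", "pytest", tmp, "-q"],
            capture_output=True,
            text=True,
            timeout=60,
        )
    return {"success": proc.returncode == 0, "error": proc.stdout + proc.stderr}
```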
---
## Evaluation Strategies

### Outcome-Based
Evaluate whether output achieves the expected result.

```python
def evaluate_outcome(task: str, output: str, expected: str) -> str:
    return llm(f"Does output achieve expected outcome? Task: {task}, Expected: {expected}, Output: {output}")
```

### LLM-as-Judge
Use LLM to compare and rank outputs.

```python
def llm_judge(output_a: str, output_b: str, criteria: str) -> str:
    return llm(f"Compare outputs A and B on {criteria}. Which is better and why?\nA: {output_a}\nB: {output_b}")
```

### Rubric-Based
Score outputs against weighted dimensions.

```python
RUBRIC = {
    "accuracy": {"weight": 0.4},
    "clarity": {"weight": 0.3},
    "completeness": {"weight": 0.3}
}

def evaluate_with_rubric(output: str, rubric: dict) -> float:
    scores = json.loads(llm(f"Rate 1-5 for each dimension: {list(rubric.keys())}\nOutput: {output}"))
    return sum(scores[d] * rubric[d]["weight"] for d in rubric) / 5
```
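
As a worked example with the rubric above: hypothetical scores of accuracy 4, clarity 5, and completeness 4 give (4 × 0.4 + 5 × 0.3 + 4 × 0.3) / 5 = 4.3 / 5 = 0.86.

```python
# Hypothetical output text; RUBRIC and evaluate_with_rubric are defined above.
score = evaluate_with_rubric("...generated report text...", RUBRIC)
print(score)  # e.g. 0.86 for scores {"accuracy": 4, "clarity": 5, "completeness": 4}
```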
---
## Best Practices

| Practice | Rationale |
|----------|-----------|
| **Clear criteria** | Define specific, measurable evaluation criteria upfront |
| **Iteration limits** | Set max iterations (3-5) to prevent infinite loops |
| **Convergence check** | Stop if output score isn't improving between iterations (see the sketch below) |
| **Log history** | Keep full trajectory for debugging and analysis |
| **Structured output** | Use JSON for reliable parsing of evaluation results |
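
The convergence check from the table can be a small guard inside the refinement loop. A minimal sketch, assuming evaluations expose an `overall_score` as in Pattern 2; the `min_improvement` threshold is an illustrative choice:

```python
def has_converged(score_history: list[float], min_improvement: float = 0.01) -> bool:
    """Stop refining once the score is no longer improving meaningfully."""
    if len(score_history) < 2:
        return False
    return score_history[-1] - score_history[-2] < min_improvement

# Inside the refinement loop (sketch):
#   score_history.append(evaluation["overall_score"])
#   if has_converged(score_history):
#       break
```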
---
## Quick Start Checklist

```markdown
## Evaluation Implementation Checklist

### Setup
- [ ] Define evaluation criteria/rubric
- [ ] Set score threshold for "good enough"
- [ ] Configure max iterations (default: 3)

### Implementation
- [ ] Implement generate() function
- [ ] Implement evaluate() function with structured output
- [ ] Implement optimize() function
- [ ] Wire up the refinement loop

### Safety
- [ ] Add convergence detection
- [ ] Log all iterations for debugging
- [ ] Handle evaluation parse failures gracefully
```
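
For the checklist's last item, a small wrapper can handle evaluation parse failures: re-ask once with a stricter instruction, then fall back to a conservative default so the loop refines again rather than accepting unvalidated output. A minimal sketch, assuming the same `llm` helper and the evaluator JSON shape used in Pattern 2:

```python
import json

def safe_evaluate(prompt: str, retries: int = 1) -> dict:
    """Parse the evaluator's JSON output, retrying once before falling back."""
    for _ in range(retries + 1):
        raw = llm(prompt)
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            prompt = f"{prompt}\n\nReturn ONLY valid JSON, with no surrounding prose."
    # Conservative fallback: a zero score forces another refinement pass.
    return {"overall_score": 0.0, "dimensions": {}}
```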