"""
Clawrity — RAG Evaluator
Lightweight Groq-based evaluation (no OpenAI dependency, no full Ragas install).
Four metrics: faithfulness, answer_relevancy, context_precision, context_recall.
Single Groq call with structured JSON output.
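
Example usage (illustrative; assumes Groq settings are configured and the
chunk/response values below are placeholders):

    evaluator = RAGEvaluator()
    result = evaluator.evaluate(
        query="What is retrieval-augmented generation?",
        chunks=[{"text": "RAG combines retrieval with generation...", "similarity": 0.91}],
        response="RAG augments an LLM with retrieved context...",
    )
    print(result.overall, result.notes)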
"""
import json
import logging
from dataclasses import dataclass
from typing import Dict, List

from groq import Groq

from config.settings import get_settings

logger = logging.getLogger(__name__)

EVAL_PROMPT = """Evaluate this RAG-augmented response on four criteria.
## User Query
{query}
## Retrieved Context Chunks
{chunks}
## Generated Response
{response}
## Evaluation Criteria (score each 0.0 to 1.0)
1. **Faithfulness**: Does the response ONLY contain information from the retrieved chunks? No hallucination?
2. **Answer Relevancy**: Does the response directly address the user's question?
3. **Context Precision**: Were the retrieved chunks actually relevant to the question?
4. **Context Recall**: Did the retrieval capture enough context to answer the question fully?
Return ONLY a JSON object:
{{
  "faithfulness": <float>,
  "answer_relevancy": <float>,
  "context_precision": <float>,
  "context_recall": <float>,
  "overall": <float (average of all four)>,
  "notes": "<brief explanation>"
}}"""


@dataclass
class EvalResult:
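    """Per-metric scores in [0.0, 1.0], plus an overall average and judge notes."""
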
faithfulness: float = 0.0
answer_relevancy: float = 0.0
context_precision: float = 0.0
context_recall: float = 0.0
overall: float = 0.0
    notes: str = ""


class RAGEvaluator:
"""Evaluates RAG pipeline quality using Groq LLM."""
def __init__(self):
settings = get_settings()
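        # Use the globally configured Groq model as the evaluation judge.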
self.client = Groq(api_key=settings.groq_api_key)
        self.model = settings.llm_model

def evaluate(
self,
query: str,
chunks: List[Dict],
response: str,
) -> EvalResult:
"""Evaluate a RAG response."""
        if chunks:
            chunks_text = "\n".join(
                f"{i + 1}. {c.get('text', '')} (similarity: {c.get('similarity', 0):.2f})"
                for i, c in enumerate(chunks)
            )
        else:
            chunks_text = "No chunks retrieved."
prompt = EVAL_PROMPT.format(
query=query,
chunks=chunks_text,
response=response,
)
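        # Single chat call; the system prompt pins the model to JSON-only output.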
try:
result = self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "You are a RAG evaluation expert. Return only valid JSON."},
{"role": "user", "content": prompt},
],
temperature=0.1,
max_tokens=512,
)
raw = result.choices[0].message.content.strip()
return self._parse(raw)
        except Exception as e:
            logger.error(f"RAG evaluation failed: {e}")
            return EvalResult(notes=f"Evaluation error: {e}")

def _parse(self, raw: str) -> EvalResult:
"""Parse JSON evaluation response."""
try:
cleaned = raw.strip()
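            # Strip Markdown code fences (``` or ```json) that models sometimes
            # emit around JSON despite instructions.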
if cleaned.startswith("```"):
cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else cleaned[3:]
if cleaned.endswith("```"):
cleaned = cleaned[:-3]
data = json.loads(cleaned.strip())
return EvalResult(
faithfulness=float(data.get("faithfulness", 0)),
answer_relevancy=float(data.get("answer_relevancy", 0)),
context_precision=float(data.get("context_precision", 0)),
context_recall=float(data.get("context_recall", 0)),
overall=float(data.get("overall", 0)),
notes=data.get("notes", ""),
)
except Exception as e:
logger.warning(f"Could not parse evaluation: {e}")
return EvalResult(notes="Parse error")