prototype

2026-05-16 19:35:21 +00:00 · 2026-05-04 22:00:38 +05:30
commit 711d691870
48 changed files with 5093 additions and 0 deletions
@@ -0,0 +1,123 @@
+"""
+Clawrity — RAG Evaluator
+
+Lightweight Groq-based evaluation (no OpenAI, no full RAGAs).
+Four metrics: faithfulness, answer_relevancy, context_precision, context_recall.
+Single Groq call with structured JSON output.
+"""
+
+import json
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+from groq import Groq
+
+from config.settings import get_settings
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Prompt template sent as the user message of the evaluation request.
+# It is filled in with str.format(), which substitutes the {query}, {chunks}
+# and {response} placeholders. The doubled braces ({{ and }}) around the JSON
+# example are str.format() escapes and reach the model as single literal braces.
+EVAL_PROMPT = """Evaluate this RAG-augmented response on four criteria.
+
+## User Query
+{query}
+
+## Retrieved Context Chunks
+{chunks}
+
+## Generated Response
+{response}
+
+## Evaluation Criteria (score each 0.0 to 1.0)
+
+1. **Faithfulness**: Does the response ONLY contain information from the retrieved chunks? No hallucination?
+2. **Answer Relevancy**: Does the response directly address the user's question?
+3. **Context Precision**: Were the retrieved chunks actually relevant to the question?
+4. **Context Recall**: Did the retrieval capture enough context to answer the question fully?
+
+Return ONLY a JSON object:
+{{
+    "faithfulness": <float>,
+    "answer_relevancy": <float>,
+    "context_precision": <float>,
+    "context_recall": <float>,
+    "overall": <float (average of all four)>,
+    "notes": "<brief explanation>"
+}}"""
+
+
+@dataclass
+class EvalResult:
+    """Scores produced by one RAG evaluation.
+
+    The four metric fields and ``overall`` mirror the keys of the JSON object
+    the evaluation prompt asks the model to return (each requested on a
+    0.0 to 1.0 scale). Every field has a default, so ``EvalResult()`` is an
+    all-zero result; the evaluator returns such a result, with ``notes``
+    describing the problem, when the request or the parsing fails.
+    """
+
+    # Does the response only contain information from the retrieved chunks?
+    faithfulness: float = 0.0
+    # Does the response directly address the user's question?
+    answer_relevancy: float = 0.0
+    # Were the retrieved chunks relevant to the question?
+    context_precision: float = 0.0
+    # Did retrieval capture enough context to answer the question fully?
+    context_recall: float = 0.0
+    # Requested from the model as the average of the four metrics.
+    overall: float = 0.0
+    # Brief explanation from the model, or an error description on failure.
+    notes: str = ""
+
+
+class RAGEvaluator:
+    """Evaluates RAG pipeline quality using Groq LLM."""
+
+    def __init__(self):
+        settings = get_settings()
+        self.client = Groq(api_key=settings.groq_api_key)
+        self.model = settings.llm_model
+
+    def evaluate(
+        self,
+        query: str,
+        chunks: List[Dict],
+        response: str,
+    ) -> EvalResult:
+        """Evaluate a RAG response."""
+        chunks_text = "\n".join(
+            f"{i+1}. {c.get('text', '')} (similarity: {c.get('similarity', 0):.2f})"
+            for i, c in enumerate(chunks)
+        ) if chunks else "No chunks retrieved."
+
+        prompt = EVAL_PROMPT.format(
+            query=query,
+            chunks=chunks_text,
+            response=response,
+        )
+
+        try:
+            result = self.client.chat.completions.create(
+                model=self.model,
+                messages=[
+                    {"role": "system", "content": "You are a RAG evaluation expert. Return only valid JSON."},
+                    {"role": "user", "content": prompt},
+                ],
+                temperature=0.1,
+                max_tokens=512,
+            )
+
+            raw = result.choices[0].message.content.strip()
+            return self._parse(raw)
+
+        except Exception as e:
+            logger.error(f"RAG evaluation failed: {e}")
+            return EvalResult(notes=f"Evaluation error: {str(e)}")
+
+    def _parse(self, raw: str) -> EvalResult:
+        """Parse JSON evaluation response."""
+        try:
+            cleaned = raw.strip()
+            if cleaned.startswith("```"):
+                cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else cleaned[3:]
+            if cleaned.endswith("```"):
+                cleaned = cleaned[:-3]
+
+            data = json.loads(cleaned.strip())
+            return EvalResult(
+                faithfulness=float(data.get("faithfulness", 0)),
+                answer_relevancy=float(data.get("answer_relevancy", 0)),
+                context_precision=float(data.get("context_precision", 0)),
+                context_recall=float(data.get("context_recall", 0)),
+                overall=float(data.get("overall", 0)),
+                notes=data.get("notes", ""),
+            )
+        except Exception as e:
+            logger.warning(f"Could not parse evaluation: {e}")
+            return EvalResult(notes="Parse error")