prototype

This commit is contained in:
2026-05-04 22:00:38 +05:30
commit 711d691870
48 changed files with 5093 additions and 0 deletions
View File
+287
View File
@@ -0,0 +1,287 @@
"""
Clawrity — RAG Chunker
Aggregation-based semantic chunking — NOT fixed-size, NOT sliding window.
Source is structured tabular data. We aggregate rows into business-meaningful
units and write natural language narratives.
Three chunk types:
1. branch_weekly — GROUP BY branch, country, week
2. channel_monthly — GROUP BY channel, country, month
3. trend_qoq — GROUP BY branch, country, quarter (QoQ delta COMPUTED)
Plus Faker-generated narrative summaries reflecting real patterns.
"""
import hashlib
import logging
from dataclasses import dataclass, field
from typing import Dict, List, Optional
import numpy as np
import pandas as pd
from faker import Faker
logger = logging.getLogger(__name__)
fake = Faker()
@dataclass
class Chunk:
"""A single RAG chunk."""
id: str
client_id: str
chunk_type: str
text: str
metadata: Dict
def to_dict(self) -> Dict:
return {
"id": self.id,
"client_id": self.client_id,
"chunk_type": self.chunk_type,
"text": self.text,
"metadata": self.metadata,
}
def generate_chunks(df: pd.DataFrame, client_id: str) -> List[Chunk]:
"""Generate all chunk types from preprocessed data."""
chunks = []
df = df.copy()
df["date"] = pd.to_datetime(df["date"])
chunks.extend(_branch_weekly(df, client_id))
chunks.extend(_channel_monthly(df, client_id))
chunks.extend(_trend_qoq(df, client_id))
chunks.extend(_faker_narratives(df, client_id))
logger.info(f"Generated {len(chunks)} total chunks for {client_id}")
return chunks
def _chunk_id(client_id: str, chunk_type: str, *parts) -> str:
"""Generate a deterministic chunk ID."""
raw = f"{client_id}:{chunk_type}:" + ":".join(str(p) for p in parts)
return hashlib.md5(raw.encode()).hexdigest()[:16]
# ---------------------------------------------------------------------------
# Chunk Type 1: Branch Weekly
# ---------------------------------------------------------------------------
def _branch_weekly(df: pd.DataFrame, client_id: str) -> List[Chunk]:
"""GROUP BY branch, country, week. One chunk per branch per week."""
chunks = []
df = df.copy()
df["week"] = df["date"].dt.isocalendar().week.astype(int)
df["month"] = df["date"].dt.month_name()
df["year"] = df["date"].dt.year
grouped = df.groupby(["branch", "country", "year", "week", "month"]).agg(
spend=("spend", "sum"),
revenue=("revenue", "sum"),
leads=("leads", "sum"),
conversions=("conversions", "sum"),
).reset_index()
for _, row in grouped.iterrows():
spend = row["spend"]
revenue = row["revenue"]
roi = round(revenue / spend, 2) if spend > 0 else 0
conv_rate = round(row["conversions"] / row["leads"] * 100, 1) if row["leads"] > 0 else 0
text = (
f"{row['branch']} ({row['country']}) in week {row['week']} of "
f"{row['month']} {row['year']}: spent ${spend:,.0f}, earned "
f"${revenue:,.0f}, ROI {roi}x, {row['leads']} leads, "
f"{conv_rate}% conversion rate."
)
chunks.append(Chunk(
id=_chunk_id(client_id, "branch_weekly", row["branch"], row["year"], row["week"]),
client_id=client_id,
chunk_type="branch_weekly",
text=text,
metadata={
"branch": row["branch"],
"country": row["country"],
"week": int(row["week"]),
"month": row["month"],
"year": int(row["year"]),
"roi": roi,
},
))
logger.info(f"Generated {len(chunks)} branch_weekly chunks")
return chunks
# ---------------------------------------------------------------------------
# Chunk Type 2: Channel Monthly
# ---------------------------------------------------------------------------
def _channel_monthly(df: pd.DataFrame, client_id: str) -> List[Chunk]:
"""GROUP BY channel, country, month, quarter."""
chunks = []
df = df.copy()
df["month"] = df["date"].dt.month_name()
df["quarter"] = "Q" + df["date"].dt.quarter.astype(str)
df["year"] = df["date"].dt.year
grouped = df.groupby(["channel", "country", "year", "month", "quarter"]).agg(
spend=("spend", "sum"),
revenue=("revenue", "sum"),
leads=("leads", "sum"),
conversions=("conversions", "sum"),
).reset_index()
for _, row in grouped.iterrows():
spend = row["spend"]
revenue = row["revenue"]
roi = round(revenue / spend, 2) if spend > 0 else 0
text = (
f"{row['channel']} in {row['country']} during {row['month']} "
f"({row['quarter']}) {row['year']}: ${spend:,.0f} spent, "
f"${revenue:,.0f} revenue, ROI {roi}x."
)
chunks.append(Chunk(
id=_chunk_id(client_id, "channel_monthly", row["channel"], row["country"], row["year"], row["month"]),
client_id=client_id,
chunk_type="channel_monthly",
text=text,
metadata={
"channel": row["channel"],
"country": row["country"],
"month": row["month"],
"quarter": row["quarter"],
"year": int(row["year"]),
"roi": roi,
},
))
logger.info(f"Generated {len(chunks)} channel_monthly chunks")
return chunks
# ---------------------------------------------------------------------------
# Chunk Type 3: QoQ Trend (Most Important)
# ---------------------------------------------------------------------------
def _trend_qoq(df: pd.DataFrame, client_id: str) -> List[Chunk]:
"""GROUP BY branch, country, quarter. Compute quarter-over-quarter delta."""
chunks = []
df = df.copy()
df["quarter"] = df["date"].dt.to_period("Q").astype(str)
grouped = df.groupby(["branch", "country", "quarter"]).agg(
spend=("spend", "sum"),
revenue=("revenue", "sum"),
).reset_index()
# Sort for QoQ calculation
grouped = grouped.sort_values(["branch", "country", "quarter"])
for (branch, country), group in grouped.groupby(["branch", "country"]):
group = group.sort_values("quarter").reset_index(drop=True)
for i in range(1, len(group)):
prev = group.iloc[i - 1]
curr = group.iloc[i]
prev_rev = prev["revenue"]
curr_rev = curr["revenue"]
if prev_rev > 0:
delta = round((curr_rev - prev_rev) / prev_rev * 100, 1)
else:
delta = 0
direction = "grew" if delta > 0 else "declined"
text = (
f"{branch} ({country}) revenue {direction} {abs(delta)}% "
f"in {curr['quarter']} vs {prev['quarter']}. "
f"Total spend: ${curr['spend']:,.0f}, revenue: ${curr_rev:,.0f}."
)
chunks.append(Chunk(
id=_chunk_id(client_id, "trend_qoq", branch, country, curr["quarter"]),
client_id=client_id,
chunk_type="trend_qoq",
text=text,
metadata={
"branch": branch,
"country": country,
"quarter": curr["quarter"],
"prev_quarter": prev["quarter"],
"delta_pct": delta,
},
))
logger.info(f"Generated {len(chunks)} trend_qoq chunks")
return chunks
# ---------------------------------------------------------------------------
# Faker Narrative Chunks
# ---------------------------------------------------------------------------
def _faker_narratives(df: pd.DataFrame, client_id: str) -> List[Chunk]:
"""Generate plausible narrative chunks reflecting real data patterns."""
chunks = []
df = df.copy()
df["quarter"] = df["date"].dt.to_period("Q").astype(str)
# Find top and bottom performers per quarter
quarterly = df.groupby(["branch", "country", "quarter"]).agg(
revenue=("revenue", "sum"),
spend=("spend", "sum"),
leads=("leads", "sum"),
).reset_index()
templates = [
"{branch} branch demonstrated strong {quarter} performance driven by {channel} efficiency, outperforming regional averages.",
"In {quarter}, {branch} ({country}) showed {trend} momentum with revenue reaching ${revenue:,.0f}, primarily through {channel} campaigns.",
"{branch} branch in {country} maintained steady growth in {quarter}, with lead generation up and conversion rates holding above {conv_rate:.1f}%.",
"Cost efficiency at {branch} ({country}) improved in {quarter}, with spend-to-revenue ratio tightening to {ratio:.2f}x.",
]
channels = df["channel"].dropna().unique().tolist() or ["Paid Search", "Social Media", "Email"]
for _, row in quarterly.iterrows():
roi = row["revenue"] / row["spend"] if row["spend"] > 0 else 0
conv_rate = np.random.uniform(5, 20)
trend = "positive" if roi > 1.5 else "moderate" if roi > 1 else "challenging"
channel = np.random.choice(channels)
template = np.random.choice(templates)
text = template.format(
branch=row["branch"],
country=row["country"],
quarter=row["quarter"],
channel=channel,
revenue=row["revenue"],
trend=trend,
conv_rate=conv_rate,
ratio=1 / roi if roi > 0 else 0,
)
chunks.append(Chunk(
id=_chunk_id(client_id, "narrative", row["branch"], row["country"], row["quarter"]),
client_id=client_id,
chunk_type="narrative",
text=text,
metadata={
"branch": row["branch"],
"country": row["country"],
"quarter": row["quarter"],
"source": "generated_narrative",
},
))
logger.info(f"Generated {len(chunks)} narrative chunks")
return chunks
+123
View File
@@ -0,0 +1,123 @@
"""
Clawrity — RAG Evaluator
Lightweight Groq-based evaluation (no OpenAI, no full RAGAs).
Four metrics: faithfulness, answer_relevancy, context_precision, context_recall.
Single Groq call with structured JSON output.
"""
import json
import logging
from dataclasses import dataclass
from typing import Dict, List, Optional
from groq import Groq
from config.settings import get_settings
logger = logging.getLogger(__name__)
EVAL_PROMPT = """Evaluate this RAG-augmented response on four criteria.
## User Query
{query}
## Retrieved Context Chunks
{chunks}
## Generated Response
{response}
## Evaluation Criteria (score each 0.0 to 1.0)
1. **Faithfulness**: Does the response ONLY contain information from the retrieved chunks? No hallucination?
2. **Answer Relevancy**: Does the response directly address the user's question?
3. **Context Precision**: Were the retrieved chunks actually relevant to the question?
4. **Context Recall**: Did the retrieval capture enough context to answer the question fully?
Return ONLY a JSON object:
{{
"faithfulness": <float>,
"answer_relevancy": <float>,
"context_precision": <float>,
"context_recall": <float>,
"overall": <float (average of all four)>,
"notes": "<brief explanation>"
}}"""
@dataclass
class EvalResult:
faithfulness: float = 0.0
answer_relevancy: float = 0.0
context_precision: float = 0.0
context_recall: float = 0.0
overall: float = 0.0
notes: str = ""
class RAGEvaluator:
"""Evaluates RAG pipeline quality using Groq LLM."""
def __init__(self):
settings = get_settings()
self.client = Groq(api_key=settings.groq_api_key)
self.model = settings.llm_model
def evaluate(
self,
query: str,
chunks: List[Dict],
response: str,
) -> EvalResult:
"""Evaluate a RAG response."""
chunks_text = "\n".join(
f"{i+1}. {c.get('text', '')} (similarity: {c.get('similarity', 0):.2f})"
for i, c in enumerate(chunks)
) if chunks else "No chunks retrieved."
prompt = EVAL_PROMPT.format(
query=query,
chunks=chunks_text,
response=response,
)
try:
result = self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "You are a RAG evaluation expert. Return only valid JSON."},
{"role": "user", "content": prompt},
],
temperature=0.1,
max_tokens=512,
)
raw = result.choices[0].message.content.strip()
return self._parse(raw)
except Exception as e:
logger.error(f"RAG evaluation failed: {e}")
return EvalResult(notes=f"Evaluation error: {str(e)}")
def _parse(self, raw: str) -> EvalResult:
"""Parse JSON evaluation response."""
try:
cleaned = raw.strip()
if cleaned.startswith("```"):
cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else cleaned[3:]
if cleaned.endswith("```"):
cleaned = cleaned[:-3]
data = json.loads(cleaned.strip())
return EvalResult(
faithfulness=float(data.get("faithfulness", 0)),
answer_relevancy=float(data.get("answer_relevancy", 0)),
context_precision=float(data.get("context_precision", 0)),
context_recall=float(data.get("context_recall", 0)),
overall=float(data.get("overall", 0)),
notes=data.get("notes", ""),
)
except Exception as e:
logger.warning(f"Could not parse evaluation: {e}")
return EvalResult(notes="Parse error")
+105
View File
@@ -0,0 +1,105 @@
"""
Clawrity — RAG Monitoring
Logs every interaction to JSONL and provides aggregated stats.
Exposes data for /admin/stats/{client_id} endpoint.
"""
import json
import logging
import os
from datetime import datetime
from typing import Dict, Optional
from config.settings import get_settings
logger = logging.getLogger(__name__)
def _log_path(client_id: str) -> str:
"""Get the JSONL log file path for a client."""
logs_dir = get_settings().logs_dir
os.makedirs(logs_dir, exist_ok=True)
return os.path.join(logs_dir, f"{client_id}_interactions.jsonl")
def log_interaction(
client_id: str,
query: str,
num_chunks: int,
chunk_types_used: list,
qa_score: float,
qa_passed: bool,
retries: int,
response_length: int,
elapsed_seconds: float = 0.0,
):
"""Log an interaction to JSONL."""
entry = {
"timestamp": datetime.utcnow().isoformat(),
"client_id": client_id,
"query": query,
"num_chunks": num_chunks,
"chunk_types_used": chunk_types_used,
"qa_score": qa_score,
"qa_passed": qa_passed,
"retries": retries,
"response_length": response_length,
"elapsed_seconds": elapsed_seconds,
}
try:
path = _log_path(client_id)
with open(path, "a") as f:
f.write(json.dumps(entry) + "\n")
except Exception as e:
logger.error(f"Failed to log interaction: {e}")
def get_stats(client_id: str) -> Dict:
"""
Get aggregated monitoring stats for a client.
Returns:
Dict with: total_queries, pass_rate, avg_qa_score, avg_retries,
queries_needing_retry
"""
path = _log_path(client_id)
if not os.path.exists(path):
return {
"client_id": client_id,
"total_queries": 0,
"pass_rate": 0.0,
"avg_qa_score": 0.0,
"avg_retries": 0.0,
"queries_needing_retry": 0,
}
entries = []
try:
with open(path, "r") as f:
for line in f:
line = line.strip()
if line:
entries.append(json.loads(line))
except Exception as e:
logger.error(f"Error reading log file: {e}")
return {"error": str(e)}
if not entries:
return {"client_id": client_id, "total_queries": 0}
total = len(entries)
passed = sum(1 for e in entries if e.get("qa_passed", False))
scores = [e.get("qa_score", 0) for e in entries]
retries = [e.get("retries", 0) for e in entries]
retry_queries = sum(1 for r in retries if r > 0)
return {
"client_id": client_id,
"total_queries": total,
"pass_rate": round(passed / total * 100, 1) if total > 0 else 0,
"avg_qa_score": round(sum(scores) / total, 3) if total > 0 else 0,
"avg_retries": round(sum(retries) / total, 2) if total > 0 else 0,
"queries_needing_retry": retry_queries,
}
+72
View File
@@ -0,0 +1,72 @@
"""
Clawrity — RAG Preprocessor
Fetches data from PostgreSQL, cleans it for RAG chunking:
- Removes nulls, outliers > 3 std devs, duplicates
- Normalises string columns
"""
import logging
from typing import Optional
import pandas as pd
from etl.normaliser import remove_outliers
from skills.postgres_connector import get_connector
logger = logging.getLogger(__name__)
def preprocess_for_rag(
client_id: str,
days: int = 365,
) -> pd.DataFrame:
"""
Fetch and preprocess data for RAG chunking.
Args:
client_id: Client to fetch data for
days: Number of days of data to fetch (default 365)
Returns:
Clean DataFrame ready for chunking
"""
db = get_connector()
sql = """
SELECT date, country, branch, channel, spend, revenue, leads, conversions
FROM spend_data
WHERE client_id = %s AND date >= CURRENT_DATE - INTERVAL '%s days'
ORDER BY date
"""
# Can't parameterise interval directly, use string formatting for days
safe_sql = f"""
SELECT date, country, branch, channel, spend, revenue, leads, conversions
FROM spend_data
WHERE client_id = %s AND date >= CURRENT_DATE - INTERVAL '{int(days)} days'
ORDER BY date
"""
df = db.execute_query(safe_sql, (client_id,))
logger.info(f"Fetched {len(df)} rows for RAG preprocessing")
if df.empty:
logger.warning(f"No data found for client {client_id}")
return df
# Remove rows with critical nulls
critical_cols = ["date", "branch", "country", "revenue"]
df = df.dropna(subset=[c for c in critical_cols if c in df.columns])
# Remove outliers on numeric columns
df = remove_outliers(df, ["spend", "revenue", "leads", "conversions"])
# Clean strings
for col in ["country", "branch", "channel"]:
if col in df.columns:
df[col] = df[col].astype(str).str.strip().str.title()
# Remove duplicates
df = df.drop_duplicates()
logger.info(f"Preprocessed: {len(df)} rows ready for chunking")
return df
+95
View File
@@ -0,0 +1,95 @@
"""
Clawrity — RAG Retriever
Detects query intent → selects chunk_type → searches pgvector.
Intent detection based on keywords:
- "should/recommend/allocate/shift" → trend_qoq
- "channel/paid/email/social" → channel_monthly
- everything else → branch_weekly
"""
import logging
import re
from typing import List, Dict, Optional
from rag.vector_store import search
logger = logging.getLogger(__name__)
# Intent → chunk_type mapping based on keywords
INTENT_PATTERNS = {
"trend_qoq": [
"should", "recommend", "allocate", "shift", "increase", "decrease",
"budget", "realloc", "invest", "optimize", "growth", "trend",
"quarter", "qoq", "forecast", "predict",
],
"channel_monthly": [
"channel", "paid", "email", "social", "search", "display",
"organic", "referral", "campaign", "marketing", "roi",
"spend", "advertising",
],
}
class Retriever:
"""RAG retriever with intent-based chunk type filtering."""
def retrieve(
self,
query: str,
client_id: str,
top_k: int = 5,
chunk_type_override: Optional[str] = None,
) -> List[Dict]:
"""
Retrieve relevant chunks based on query intent.
Args:
query: User's natural language query
client_id: Client to search within
top_k: Number of chunks to retrieve
chunk_type_override: Force a specific chunk type
Returns:
List of dicts with text, metadata, similarity
"""
if chunk_type_override:
chunk_type = chunk_type_override
else:
chunk_type = self._detect_intent(query)
logger.info(f"Detected intent → chunk_type: {chunk_type}")
results = search(
query=query,
client_id=client_id,
chunk_type=chunk_type,
top_k=top_k,
)
# If no results with the detected type, fall back to all types
if not results:
logger.info(f"No results for {chunk_type}, falling back to all types")
results = search(
query=query,
client_id=client_id,
chunk_type=None,
top_k=top_k,
)
return results
def _detect_intent(self, query: str) -> str:
"""Detect query intent from keywords."""
query_lower = query.lower()
scores = {}
for chunk_type, keywords in INTENT_PATTERNS.items():
score = sum(1 for kw in keywords if kw in query_lower)
scores[chunk_type] = score
# Return the chunk type with highest score, default to branch_weekly
if max(scores.values()) > 0:
return max(scores, key=scores.get)
return "branch_weekly"
+135
View File
@@ -0,0 +1,135 @@
"""
Clawrity — RAG Vector Store
Embeds chunks using sentence-transformers all-MiniLM-L6-v2 (CPU, 384 dims).
Stores and searches via pgvector in PostgreSQL.
"""
import logging
from typing import List, Optional
import numpy as np
from rag.chunker import Chunk
from skills.postgres_connector import get_connector
logger = logging.getLogger(__name__)
_model = None
def _get_embedding_model():
"""Lazy-load the embedding model (CPU only, ~90MB)."""
global _model
if _model is None:
from sentence_transformers import SentenceTransformer
_model = SentenceTransformer("all-MiniLM-L6-v2")
logger.info("Loaded embedding model: all-MiniLM-L6-v2 (384 dims)")
return _model
def embed_texts(texts: List[str], batch_size: int = 100) -> np.ndarray:
"""
Embed a list of texts using MiniLM.
Args:
texts: List of text strings to embed
batch_size: Batch size for encoding (default 100)
Returns:
numpy array of shape (len(texts), 384)
"""
model = _get_embedding_model()
embeddings = model.encode(
texts,
batch_size=batch_size,
show_progress_bar=len(texts) > 100,
normalize_embeddings=True,
)
logger.info(f"Embedded {len(texts)} texts → shape {embeddings.shape}")
return embeddings
def embed_query(query: str) -> np.ndarray:
"""Embed a single query string."""
model = _get_embedding_model()
return model.encode(query, normalize_embeddings=True)
def store_chunks(chunks: List[Chunk], embeddings: np.ndarray):
"""
Upsert chunks + embeddings into pgvector.
Uses ON CONFLICT DO UPDATE for safe nightly re-indexing.
"""
seen = set()
unique_chunks = []
unique_embeddings = []
for chunk, emb in zip(chunks, embeddings):
if chunk.id not in seen:
seen.add(chunk.id)
unique_chunks.append(chunk)
unique_embeddings.append(emb)
chunks = unique_chunks
embeddings = unique_embeddings
db = get_connector()
data = []
for chunk, embedding in zip(chunks, embeddings):
data.append({
"id": chunk.id,
"client_id": chunk.client_id,
"chunk_type": chunk.chunk_type,
"text": chunk.text,
"metadata": chunk.metadata,
"embedding": embedding.tolist(),
})
# Batch upsert
batch_size = 100
for i in range(0, len(data), batch_size):
batch = data[i:i + batch_size]
db.upsert_embeddings(batch)
logger.info(f"Stored {len(data)} chunks in pgvector")
# Try to create IVFFlat index (needs enough rows)
try:
db.create_vector_index()
except Exception:
pass
def search(
query: str,
client_id: str,
chunk_type: Optional[str] = None,
top_k: int = 5,
) -> List[dict]:
"""
Search pgvector for similar chunks.
Args:
query: Natural language query
client_id: Client to search within
chunk_type: Optional filter (branch_weekly, channel_monthly, trend_qoq)
top_k: Number of results
Returns:
List of dicts with text, metadata, similarity
"""
query_embedding = embed_query(query)
db = get_connector()
results = db.search_embeddings(
query_embedding=query_embedding,
client_id=client_id,
chunk_type=chunk_type,
top_k=top_k,
)
logger.info(
f"Vector search: query='{query[:50]}...', "
f"chunk_type={chunk_type}, results={len(results)}"
)
return results