prototype

2026-05-16 19:35:21 +00:00 · 2026-05-04 22:00:38 +05:30
commit 711d691870
48 changed files with 5093 additions and 0 deletions
@@ -0,0 +1,287 @@
+"""
+Clawrity — RAG Chunker
+
+Aggregation-based semantic chunking — NOT fixed-size, NOT sliding window.
+Source is structured tabular data. We aggregate rows into business-meaningful
+units and write natural language narratives.
+
+Three chunk types:
+  1. branch_weekly   — GROUP BY branch, country, week
+  2. channel_monthly — GROUP BY channel, country, month
+  3. trend_qoq       — GROUP BY branch, country, quarter (QoQ delta COMPUTED)
+
+Plus Faker-generated narrative summaries reflecting real patterns.
+"""
+
+import hashlib
+import logging
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+import numpy as np
+import pandas as pd
+from faker import Faker
+
+logger = logging.getLogger(__name__)
+fake = Faker()
+
+
+@dataclass
+class Chunk:
+    """A single RAG chunk."""
+    id: str
+    client_id: str
+    chunk_type: str
+    text: str
+    metadata: Dict
+
+    def to_dict(self) -> Dict:
+        return {
+            "id": self.id,
+            "client_id": self.client_id,
+            "chunk_type": self.chunk_type,
+            "text": self.text,
+            "metadata": self.metadata,
+        }
+
+
+def generate_chunks(df: pd.DataFrame, client_id: str) -> List[Chunk]:
+    """Generate all chunk types from preprocessed data."""
+    chunks = []
+
+    df = df.copy()
+    df["date"] = pd.to_datetime(df["date"])
+
+    chunks.extend(_branch_weekly(df, client_id))
+    chunks.extend(_channel_monthly(df, client_id))
+    chunks.extend(_trend_qoq(df, client_id))
+    chunks.extend(_faker_narratives(df, client_id))
+
+    logger.info(f"Generated {len(chunks)} total chunks for {client_id}")
+    return chunks
+
+
+def _chunk_id(client_id: str, chunk_type: str, *parts) -> str:
+    """Generate a deterministic chunk ID."""
+    raw = f"{client_id}:{chunk_type}:" + ":".join(str(p) for p in parts)
+    return hashlib.md5(raw.encode()).hexdigest()[:16]
+
+
+# ---------------------------------------------------------------------------
+# Chunk Type 1: Branch Weekly
+# ---------------------------------------------------------------------------
+
+def _branch_weekly(df: pd.DataFrame, client_id: str) -> List[Chunk]:
+    """GROUP BY branch, country, week. One chunk per branch per week."""
+    chunks = []
+    df = df.copy()
+    df["week"] = df["date"].dt.isocalendar().week.astype(int)
+    df["month"] = df["date"].dt.month_name()
+    df["year"] = df["date"].dt.year
+
+    grouped = df.groupby(["branch", "country", "year", "week", "month"]).agg(
+        spend=("spend", "sum"),
+        revenue=("revenue", "sum"),
+        leads=("leads", "sum"),
+        conversions=("conversions", "sum"),
+    ).reset_index()
+
+    for _, row in grouped.iterrows():
+        spend = row["spend"]
+        revenue = row["revenue"]
+        roi = round(revenue / spend, 2) if spend > 0 else 0
+        conv_rate = round(row["conversions"] / row["leads"] * 100, 1) if row["leads"] > 0 else 0
+
+        text = (
+            f"{row['branch']} ({row['country']}) in week {row['week']} of "
+            f"{row['month']} {row['year']}: spent ${spend:,.0f}, earned "
+            f"${revenue:,.0f}, ROI {roi}x, {row['leads']} leads, "
+            f"{conv_rate}% conversion rate."
+        )
+
+        chunks.append(Chunk(
+            id=_chunk_id(client_id, "branch_weekly", row["branch"], row["year"], row["week"]),
+            client_id=client_id,
+            chunk_type="branch_weekly",
+            text=text,
+            metadata={
+                "branch": row["branch"],
+                "country": row["country"],
+                "week": int(row["week"]),
+                "month": row["month"],
+                "year": int(row["year"]),
+                "roi": roi,
+            },
+        ))
+
+    logger.info(f"Generated {len(chunks)} branch_weekly chunks")
+    return chunks
+
+
+# ---------------------------------------------------------------------------
+# Chunk Type 2: Channel Monthly
+# ---------------------------------------------------------------------------
+
+def _channel_monthly(df: pd.DataFrame, client_id: str) -> List[Chunk]:
+    """GROUP BY channel, country, month, quarter."""
+    chunks = []
+    df = df.copy()
+    df["month"] = df["date"].dt.month_name()
+    df["quarter"] = "Q" + df["date"].dt.quarter.astype(str)
+    df["year"] = df["date"].dt.year
+
+    grouped = df.groupby(["channel", "country", "year", "month", "quarter"]).agg(
+        spend=("spend", "sum"),
+        revenue=("revenue", "sum"),
+        leads=("leads", "sum"),
+        conversions=("conversions", "sum"),
+    ).reset_index()
+
+    for _, row in grouped.iterrows():
+        spend = row["spend"]
+        revenue = row["revenue"]
+        roi = round(revenue / spend, 2) if spend > 0 else 0
+
+        text = (
+            f"{row['channel']} in {row['country']} during {row['month']} "
+            f"({row['quarter']}) {row['year']}: ${spend:,.0f} spent, "
+            f"${revenue:,.0f} revenue, ROI {roi}x."
+        )
+
+        chunks.append(Chunk(
+            id=_chunk_id(client_id, "channel_monthly", row["channel"], row["country"], row["year"], row["month"]),
+            client_id=client_id,
+            chunk_type="channel_monthly",
+            text=text,
+            metadata={
+                "channel": row["channel"],
+                "country": row["country"],
+                "month": row["month"],
+                "quarter": row["quarter"],
+                "year": int(row["year"]),
+                "roi": roi,
+            },
+        ))
+
+    logger.info(f"Generated {len(chunks)} channel_monthly chunks")
+    return chunks
+
+
+# ---------------------------------------------------------------------------
+# Chunk Type 3: QoQ Trend (Most Important)
+# ---------------------------------------------------------------------------
+
+def _trend_qoq(df: pd.DataFrame, client_id: str) -> List[Chunk]:
+    """GROUP BY branch, country, quarter. Compute quarter-over-quarter delta."""
+    chunks = []
+    df = df.copy()
+    df["quarter"] = df["date"].dt.to_period("Q").astype(str)
+
+    grouped = df.groupby(["branch", "country", "quarter"]).agg(
+        spend=("spend", "sum"),
+        revenue=("revenue", "sum"),
+    ).reset_index()
+
+    # Sort for QoQ calculation
+    grouped = grouped.sort_values(["branch", "country", "quarter"])
+
+    for (branch, country), group in grouped.groupby(["branch", "country"]):
+        group = group.sort_values("quarter").reset_index(drop=True)
+
+        for i in range(1, len(group)):
+            prev = group.iloc[i - 1]
+            curr = group.iloc[i]
+
+            prev_rev = prev["revenue"]
+            curr_rev = curr["revenue"]
+
+            if prev_rev > 0:
+                delta = round((curr_rev - prev_rev) / prev_rev * 100, 1)
+            else:
+                delta = 0
+
+            direction = "grew" if delta > 0 else "declined"
+
+            text = (
+                f"{branch} ({country}) revenue {direction} {abs(delta)}% "
+                f"in {curr['quarter']} vs {prev['quarter']}. "
+                f"Total spend: ${curr['spend']:,.0f}, revenue: ${curr_rev:,.0f}."
+            )
+
+            chunks.append(Chunk(
+                id=_chunk_id(client_id, "trend_qoq", branch, country, curr["quarter"]),
+                client_id=client_id,
+                chunk_type="trend_qoq",
+                text=text,
+                metadata={
+                    "branch": branch,
+                    "country": country,
+                    "quarter": curr["quarter"],
+                    "prev_quarter": prev["quarter"],
+                    "delta_pct": delta,
+                },
+            ))
+
+    logger.info(f"Generated {len(chunks)} trend_qoq chunks")
+    return chunks
+
+
+# ---------------------------------------------------------------------------
+# Faker Narrative Chunks
+# ---------------------------------------------------------------------------
+
+def _faker_narratives(df: pd.DataFrame, client_id: str) -> List[Chunk]:
+    """Generate plausible narrative chunks reflecting real data patterns."""
+    chunks = []
+    df = df.copy()
+    df["quarter"] = df["date"].dt.to_period("Q").astype(str)
+
+    # Find top and bottom performers per quarter
+    quarterly = df.groupby(["branch", "country", "quarter"]).agg(
+        revenue=("revenue", "sum"),
+        spend=("spend", "sum"),
+        leads=("leads", "sum"),
+    ).reset_index()
+
+    templates = [
+        "{branch} branch demonstrated strong {quarter} performance driven by {channel} efficiency, outperforming regional averages.",
+        "In {quarter}, {branch} ({country}) showed {trend} momentum with revenue reaching ${revenue:,.0f}, primarily through {channel} campaigns.",
+        "{branch} branch in {country} maintained steady growth in {quarter}, with lead generation up and conversion rates holding above {conv_rate:.1f}%.",
+        "Cost efficiency at {branch} ({country}) improved in {quarter}, with spend-to-revenue ratio tightening to {ratio:.2f}x.",
+    ]
+
+    channels = df["channel"].dropna().unique().tolist() or ["Paid Search", "Social Media", "Email"]
+
+    for _, row in quarterly.iterrows():
+        roi = row["revenue"] / row["spend"] if row["spend"] > 0 else 0
+        conv_rate = np.random.uniform(5, 20)
+        trend = "positive" if roi > 1.5 else "moderate" if roi > 1 else "challenging"
+        channel = np.random.choice(channels)
+
+        template = np.random.choice(templates)
+        text = template.format(
+            branch=row["branch"],
+            country=row["country"],
+            quarter=row["quarter"],
+            channel=channel,
+            revenue=row["revenue"],
+            trend=trend,
+            conv_rate=conv_rate,
+            ratio=1 / roi if roi > 0 else 0,
+        )
+
+        chunks.append(Chunk(
+            id=_chunk_id(client_id, "narrative", row["branch"], row["country"], row["quarter"]),
+            client_id=client_id,
+            chunk_type="narrative",
+            text=text,
+            metadata={
+                "branch": row["branch"],
+                "country": row["country"],
+                "quarter": row["quarter"],
+                "source": "generated_narrative",
+            },
+        ))
+
+    logger.info(f"Generated {len(chunks)} narrative chunks")
+    return chunks