prototype

2026-05-16 19:35:21 +00:00 · 2026-05-04 22:00:38 +05:30
commit 711d691870
48 changed files with 5093 additions and 0 deletions
@@ -0,0 +1,72 @@
+"""
+Clawrity — RAG Preprocessor
+
+Fetches data from PostgreSQL, cleans it for RAG chunking:
+  - Removes nulls, outliers > 3 std devs, duplicates
+  - Normalises string columns
+"""
+
+import logging
+from typing import Optional
+
+import pandas as pd
+
+from etl.normaliser import remove_outliers
+from skills.postgres_connector import get_connector
+
+logger = logging.getLogger(__name__)
+
+
+def preprocess_for_rag(
+    client_id: str,
+    days: int = 365,
+) -> pd.DataFrame:
+    """
+    Fetch and preprocess data for RAG chunking.
+
+    Args:
+        client_id: Client to fetch data for
+        days: Number of days of data to fetch (default 365)
+
+    Returns:
+        Clean DataFrame ready for chunking
+    """
+    db = get_connector()
+
+    sql = """
+        SELECT date, country, branch, channel, spend, revenue, leads, conversions
+        FROM spend_data
+        WHERE client_id = %s AND date >= CURRENT_DATE - INTERVAL '%s days'
+        ORDER BY date
+    """
+    # Can't parameterise interval directly, use string formatting for days
+    safe_sql = f"""
+        SELECT date, country, branch, channel, spend, revenue, leads, conversions
+        FROM spend_data
+        WHERE client_id = %s AND date >= CURRENT_DATE - INTERVAL '{int(days)} days'
+        ORDER BY date
+    """
+    df = db.execute_query(safe_sql, (client_id,))
+    logger.info(f"Fetched {len(df)} rows for RAG preprocessing")
+
+    if df.empty:
+        logger.warning(f"No data found for client {client_id}")
+        return df
+
+    # Remove rows with critical nulls
+    critical_cols = ["date", "branch", "country", "revenue"]
+    df = df.dropna(subset=[c for c in critical_cols if c in df.columns])
+
+    # Remove outliers on numeric columns
+    df = remove_outliers(df, ["spend", "revenue", "leads", "conversions"])
+
+    # Clean strings
+    for col in ["country", "branch", "channel"]:
+        if col in df.columns:
+            df[col] = df[col].astype(str).str.strip().str.title()
+
+    # Remove duplicates
+    df = df.drop_duplicates()
+
+    logger.info(f"Preprocessed: {len(df)} rows ready for chunking")
+    return df