prototype

2026-05-04 22:00:38 +05:30
commit 711d691870
48 changed files with 5093 additions and 0 deletions
@@ -0,0 +1,140 @@
"""
Clawrity — NL-to-SQL Engine
Converts natural language questions into valid PostgreSQL SELECT queries.
Uses the LLM at temperature 0.1 for near-deterministic SQL generation.
Safety: Only SELECT queries allowed. INSERT/UPDATE/DELETE/DROP rejected.
"""
import re
import logging
from typing import Optional
from config.llm_client import get_llm_client, get_model_name
logger = logging.getLogger(__name__)
# Dangerous SQL keywords: any match causes the generated query to be rejected.
# (Requiring the query to start with SELECT is enforced separately in _validate_sql.)
UNSAFE_PATTERNS = re.compile(
r"\b(INSERT|UPDATE|DELETE|DROP|ALTER|TRUNCATE|CREATE|GRANT|REVOKE|EXEC)\b",
re.IGNORECASE
)
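
# Illustrative behaviour of the gate (comments only, not executed):
#   UNSAFE_PATTERNS.search("SELECT * FROM spend_data")  -> None  (allowed)
#   UNSAFE_PATTERNS.search("DROP TABLE spend_data")     -> match (rejected)
# Keywords inside string literals also match; that is an acceptable false
# positive for a read-only gate.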
SYSTEM_PROMPT = """You are a PostgreSQL SQL generator. Generate ONLY a valid SELECT query.
Return ONLY the raw SQL — no markdown, no explanation, no code fences.
Table: spend_data
Columns:
- id: SERIAL PRIMARY KEY
- date: DATE
- country: VARCHAR(100)
- branch: VARCHAR(100)
- channel: VARCHAR(100)
- spend: FLOAT
- revenue: FLOAT
- leads: INT
- conversions: INT
- client_id: VARCHAR(100)
Available countries: {countries}
Available branches (sample): {branches}
Available channels: {channels}
Date range: {date_min} to {date_max}
RULES:
1. ALWAYS include WHERE client_id = '{client_id}' in your queries
2. Use standard PostgreSQL syntax
3. For date ranges, use DATE type comparisons
4. For "last N days", use: date >= CURRENT_DATE - INTERVAL 'N days' (substituting the requested N)
5. For "last month", use: date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month')
6. Return meaningful aggregations with GROUP BY when appropriate
7. Use aliases for computed columns (e.g., SUM(revenue) AS total_revenue)
8. LIMIT results to 50 rows maximum unless the user asks for all
9. For "bottom N" use ASC ordering, for "top N" use DESC ordering
"""
class NLToSQL:
"""Natural language to SQL converter using LLM."""
def __init__(self):
self.client = get_llm_client()
self.model = get_model_name()
def generate_sql(
self,
question: str,
client_id: str,
schema_metadata: dict,
) -> Optional[str]:
"""
Convert a natural language question to a PostgreSQL SELECT query.
Args:
question: User's natural language question
client_id: Client ID for filtering
schema_metadata: Dict with countries, branches, channels, date_min, date_max
Returns:
Valid SQL SELECT string, or None on failure
"""
# Build the system prompt with schema context
system = SYSTEM_PROMPT.format(
countries=", ".join(schema_metadata.get("countries", [])[:20]),
branches=", ".join(schema_metadata.get("branches", [])[:20]),
channels=", ".join(schema_metadata.get("channels", [])),
date_min=schema_metadata.get("date_min", "unknown"),
date_max=schema_metadata.get("date_max", "unknown"),
            client_id=client_id,
        )
try:
response = self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": system},
{"role": "user", "content": question},
],
temperature=0.1,
max_tokens=1024,
)
raw_sql = response.choices[0].message.content.strip()
sql = self._clean_sql(raw_sql)
if not self._validate_sql(sql):
logger.warning(f"Generated SQL failed validation: {sql}")
return None
logger.info(f"Generated SQL: {sql}")
return sql
except Exception as e:
logger.error(f"NL-to-SQL generation failed: {e}")
return None
def _clean_sql(self, raw: str) -> str:
"""Extract SQL from LLM response, stripping markdown code fences."""
# Remove markdown code blocks
cleaned = re.sub(r"```(?:sql)?\s*", "", raw)
cleaned = re.sub(r"```\s*$", "", cleaned)
cleaned = cleaned.strip().rstrip(";") + ";"
return cleaned
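
    # For example:
    #   self._clean_sql('```sql\nSELECT 1\n```')  ->  'SELECT 1;'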
def _validate_sql(self, sql: str) -> bool:
"""Validate that the SQL is a safe SELECT query."""
if not sql or len(sql) < 10:
return False
# Must start with SELECT
if not sql.strip().upper().startswith("SELECT"):
logger.warning("SQL does not start with SELECT")
return False
# Must not contain dangerous operations
if UNSAFE_PATTERNS.search(sql):
logger.warning("SQL contains unsafe operations")
return False
return True
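
# A minimal usage sketch (hypothetical client ID and question; the connector
# import path is assumed, adjust to wherever PostgresConnector lives):
if __name__ == "__main__":
    from db.postgres_connector import get_connector  # hypothetical path

    converter = NLToSQL()
    metadata = get_connector().get_spend_data_schema("acme")
    sql = converter.generate_sql(
        question="Top 5 channels by revenue last month",
        client_id="acme",
        schema_metadata=metadata,
    )
    print(sql)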
@@ -0,0 +1,384 @@
"""
Clawrity — PostgreSQL + pgvector Connector
Connection pool management, schema initialization, and query execution.
Single database handles both structured queries (NL-to-SQL) and vector search (pgvector).
"""
import logging
import time
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extras
from pgvector.psycopg2 import register_vector
from config.settings import get_settings
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Schema DDL
# ---------------------------------------------------------------------------
INIT_SCHEMA_SQL = """
-- Enable pgvector extension
CREATE EXTENSION IF NOT EXISTS vector;
-- Structured business data (replaces BigQuery)
CREATE TABLE IF NOT EXISTS spend_data (
id SERIAL PRIMARY KEY,
date DATE,
country VARCHAR(100),
branch VARCHAR(100),
channel VARCHAR(100),
spend FLOAT,
revenue FLOAT,
leads INT,
conversions INT,
client_id VARCHAR(100)
);
-- Vector embeddings (replaces ChromaDB)
CREATE TABLE IF NOT EXISTS embeddings (
id VARCHAR(200) PRIMARY KEY,
client_id VARCHAR(100),
chunk_type VARCHAR(50),
text TEXT,
metadata JSONB,
embedding vector(384)
);
-- Forecast cache
CREATE TABLE IF NOT EXISTS forecasts (
id SERIAL PRIMARY KEY,
client_id VARCHAR(100),
branch VARCHAR(100),
country VARCHAR(100),
horizon_months INT,
forecast_data JSONB,
computed_at TIMESTAMP DEFAULT NOW()
);
-- Indexes
CREATE INDEX IF NOT EXISTS idx_spend_data_client
ON spend_data (client_id);
CREATE INDEX IF NOT EXISTS idx_spend_data_date
ON spend_data (client_id, date);
CREATE INDEX IF NOT EXISTS idx_embeddings_client_type
ON embeddings (client_id, chunk_type);
CREATE INDEX IF NOT EXISTS idx_forecasts_client
ON forecasts (client_id, branch, country);
"""
# An IVFFlat index clusters the rows that exist at build time, so it is created separately after data load
IVFFLAT_INDEX_SQL = """
CREATE INDEX IF NOT EXISTS idx_embeddings_cosine
ON embeddings USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);
"""
class PostgresConnector:
"""PostgreSQL + pgvector connection manager."""
def __init__(self, database_url: Optional[str] = None):
self.database_url = database_url or get_settings().database_url
self._conn: Optional[psycopg2.extensions.connection] = None
    def _get_connection(self) -> psycopg2.extensions.connection:
        """Get or create a database connection with retry logic."""
        if self._conn is None or self._conn.closed:
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    self._conn = psycopg2.connect(self.database_url)
                    register_vector(self._conn)
                    logger.info("Connected to PostgreSQL with pgvector support")
                    return self._conn
                except psycopg2.OperationalError as e:
                    if attempt == max_retries - 1:
                        raise ConnectionError(
                            f"Failed to connect to PostgreSQL after {max_retries} attempts"
                        ) from e
                    wait = 2**attempt
                    logger.warning(
                        f"DB connection attempt {attempt + 1}/{max_retries} failed: {e}. "
                        f"Retrying in {wait}s..."
                    )
                    time.sleep(wait)
        return self._conn
def close(self):
"""Close the database connection."""
if self._conn and not self._conn.closed:
self._conn.close()
logger.info("PostgreSQL connection closed")
def init_schema(self):
"""Create tables and extensions if they don't exist."""
conn = self._get_connection()
try:
with conn.cursor() as cur:
cur.execute(INIT_SCHEMA_SQL)
conn.commit()
logger.info("Database schema initialized successfully")
except Exception as e:
conn.rollback()
logger.error(f"Schema initialization failed: {e}")
raise
def create_vector_index(self):
"""Create IVFFlat index — call AFTER data has been loaded into embeddings."""
conn = self._get_connection()
try:
with conn.cursor() as cur:
cur.execute(IVFFLAT_INDEX_SQL)
conn.commit()
logger.info("IVFFlat vector index created")
except Exception as e:
conn.rollback()
logger.warning(f"Could not create IVFFlat index (may need more rows): {e}")
# ------------------------------------------------------------------
# Query execution
# ------------------------------------------------------------------
def execute_query(self, sql: str, params: Optional[tuple] = None) -> pd.DataFrame:
"""
Execute a SELECT query and return results as a DataFrame.
Args:
sql: SQL query string (must be SELECT only)
params: Query parameters for parameterised queries
Returns:
pandas DataFrame with query results
"""
conn = self._get_connection()
try:
            df = pd.read_sql_query(sql, conn, params=params)
            # Roll back to close the implicit read transaction so the
            # connection never lingers "idle in transaction".
            conn.rollback()
logger.debug(f"Query returned {len(df)} rows")
return df
except Exception as e:
logger.error(f"Query execution failed: {e}")
conn.rollback()
raise
def execute_raw(self, sql: str, params: Optional[tuple] = None) -> List[Dict]:
"""Execute a query and return raw dictionaries."""
conn = self._get_connection()
try:
with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
cur.execute(sql, params)
                if cur.description:
                    results = [dict(row) for row in cur.fetchall()]
                    # Read path: roll back to end the implicit transaction.
                    conn.rollback()
                    return results
                # Statement produced no result set, so commit instead.
                conn.commit()
                return []
except Exception as e:
conn.rollback()
logger.error(f"Raw query execution failed: {e}")
raise
def execute_write(self, sql: str, params: Optional[tuple] = None):
"""Execute an INSERT/UPDATE/DELETE statement."""
conn = self._get_connection()
try:
with conn.cursor() as cur:
cur.execute(sql, params)
conn.commit()
except Exception as e:
conn.rollback()
logger.error(f"Write execution failed: {e}")
raise
def execute_batch(self, sql: str, data: List[tuple], page_size: int = 1000):
"""Execute a batch INSERT using execute_values for performance."""
conn = self._get_connection()
try:
with conn.cursor() as cur:
psycopg2.extras.execute_values(cur, sql, data, page_size=page_size)
conn.commit()
logger.info(f"Batch insert: {len(data)} rows")
except Exception as e:
conn.rollback()
logger.error(f"Batch execution failed: {e}")
raise
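
    # Example (hypothetical rows):
    #   get_connector().execute_batch(
    #       "INSERT INTO spend_data (date, country, branch, channel, spend, "
    #       "revenue, leads, conversions, client_id) VALUES %s",
    #       [("2025-01-01", "India", "Mumbai", "Google Ads", 120.0, 540.0, 30, 4, "acme")],
    #   )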
# ------------------------------------------------------------------
# pgvector operations
# ------------------------------------------------------------------
def upsert_embeddings(self, embeddings_data: List[Dict[str, Any]]):
"""
Upsert embedding records into the embeddings table.
Args:
embeddings_data: List of dicts with keys:
id, client_id, chunk_type, text, metadata, embedding
"""
conn = self._get_connection()
sql = """
INSERT INTO embeddings (id, client_id, chunk_type, text, metadata, embedding)
VALUES %s
ON CONFLICT (id) DO UPDATE SET
text = EXCLUDED.text,
metadata = EXCLUDED.metadata,
embedding = EXCLUDED.embedding
"""
data = [
(
d["id"],
d["client_id"],
d["chunk_type"],
d["text"],
psycopg2.extras.Json(d["metadata"]),
np.array(d["embedding"]),
)
for d in embeddings_data
]
try:
with conn.cursor() as cur:
psycopg2.extras.execute_values(cur, sql, data, page_size=100)
conn.commit()
logger.info(f"Upserted {len(data)} embeddings")
except Exception as e:
conn.rollback()
logger.error(f"Embedding upsert failed: {e}")
raise
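
    # Expected record shape (hypothetical values; the embedding must be
    # 384-dimensional to match the vector(384) column):
    #   {
    #       "id": "acme:report:0",
    #       "client_id": "acme",
    #       "chunk_type": "report",
    #       "text": "Q1 spend summary ...",
    #       "metadata": {"page": 1},
    #       "embedding": [0.01] * 384,
    #   }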
def search_embeddings(
self,
query_embedding: np.ndarray,
client_id: str,
chunk_type: Optional[str] = None,
top_k: int = 5,
) -> List[Dict]:
"""
Search for similar embeddings using pgvector cosine similarity.
Args:
query_embedding: Query vector (384 dims)
client_id: Filter by client
chunk_type: Optional filter by chunk type
top_k: Number of results to return
Returns:
List of dicts with text, metadata, and similarity score
"""
conn = self._get_connection()
query_vec = np.array(query_embedding)
if chunk_type:
sql = """
SELECT text, metadata, 1 - (embedding <=> %s) AS similarity
FROM embeddings
WHERE client_id = %s AND chunk_type = %s
ORDER BY embedding <=> %s
LIMIT %s
"""
params = (query_vec, client_id, chunk_type, query_vec, top_k)
else:
sql = """
SELECT text, metadata, 1 - (embedding <=> %s) AS similarity
FROM embeddings
WHERE client_id = %s
ORDER BY embedding <=> %s
LIMIT %s
"""
params = (query_vec, client_id, query_vec, top_k)
try:
with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
cur.execute(sql, params)
results = [dict(row) for row in cur.fetchall()]
logger.debug(f"Vector search returned {len(results)} results")
return results
except Exception as e:
logger.error(f"Vector search failed: {e}")
raise
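
    # `<=>` is pgvector's cosine-distance operator; `1 - (embedding <=> q)`
    # converts it to cosine similarity (1.0 = identical direction). Ordering
    # by the raw distance, rather than the derived similarity, keeps the
    # expression index-friendly.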
# ------------------------------------------------------------------
# Utility
# ------------------------------------------------------------------
def get_table_count(self, table: str, client_id: Optional[str] = None) -> int:
"""Get row count for a table, optionally filtered by client_id."""
conn = self._get_connection()
try:
with conn.cursor() as cur:
if client_id:
cur.execute(
f"SELECT COUNT(*) FROM {table} WHERE client_id = %s",
(client_id,),
)
else:
cur.execute(f"SELECT COUNT(*) FROM {table}")
return cur.fetchone()[0]
except Exception as e:
logger.error(f"Count query failed: {e}")
return 0
def get_spend_data_schema(self, client_id: str) -> Dict:
"""Get metadata about available data for a client — used by NL-to-SQL."""
conn = self._get_connection()
try:
with conn.cursor() as cur:
cur.execute(
"SELECT DISTINCT country FROM spend_data WHERE client_id = %s ORDER BY country",
(client_id,),
)
countries = [row[0] for row in cur.fetchall()]
cur.execute(
"SELECT DISTINCT branch FROM spend_data WHERE client_id = %s ORDER BY branch",
(client_id,),
)
branches = [row[0] for row in cur.fetchall()]
cur.execute(
"SELECT DISTINCT channel FROM spend_data WHERE client_id = %s ORDER BY channel",
(client_id,),
)
channels = [row[0] for row in cur.fetchall()]
cur.execute(
"SELECT MIN(date), MAX(date) FROM spend_data WHERE client_id = %s",
(client_id,),
)
date_range = cur.fetchone()
return {
"countries": countries,
"branches": branches,
"channels": channels,
"date_min": str(date_range[0]) if date_range[0] else None,
"date_max": str(date_range[1]) if date_range[1] else None,
}
except Exception as e:
logger.error(f"Schema metadata query failed: {e}")
return {
"countries": [],
"branches": [],
"channels": [],
"date_min": None,
"date_max": None,
}
# ---------------------------------------------------------------------------
# Module-level singleton
# ---------------------------------------------------------------------------
_connector: Optional[PostgresConnector] = None
def get_connector() -> PostgresConnector:
"""Get the shared PostgresConnector singleton."""
global _connector
if _connector is None:
_connector = PostgresConnector()
return _connector
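
# A minimal end-to-end sketch (assumes DATABASE_URL is configured in settings):
if __name__ == "__main__":
    connector = get_connector()
    connector.init_schema()
    print("spend_data rows:", connector.get_table_count("spend_data"))
    # Only after embeddings have been bulk-loaded:
    # connector.create_vector_index()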
@@ -0,0 +1,139 @@
"""
Clawrity — Web Search Skill
Primary: Tavily API (clean, summarised results built for LLM agents)
Fallback: duckduckgo-search (free, no API key required)
Auto-fallback: if Tavily errors or returns no results, switch to DuckDuckGo (the switch is logged).
"""
import logging
from datetime import datetime, timedelta
from typing import List, Dict, Optional
from config.settings import get_settings
logger = logging.getLogger(__name__)
def web_search(
query: str,
max_results: int = 5,
lookback_days: int = 1,
) -> List[Dict]:
"""
Search the web using Tavily (primary) or DuckDuckGo (fallback).
Args:
query: Search query string
max_results: Maximum number of results
        lookback_days: Only keep dated results from the last N days (undated results are always kept)
Returns:
List of dicts with: title, url, content, date
"""
results = _tavily_search(query, max_results)
if not results:
logger.info("Tavily returned no results, falling back to DuckDuckGo")
results = _ddg_search(query, max_results)
# Filter by recency
if lookback_days > 0:
results = _filter_recent(results, lookback_days)
return results
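
# Example (hypothetical query):
#   hits = web_search("pgvector ivfflat tuning", max_results=3, lookback_days=7)
#   for hit in hits:
#       print(hit["source"], hit["title"], hit["url"])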
def _tavily_search(query: str, max_results: int = 5) -> List[Dict]:
"""Search using Tavily API."""
settings = get_settings()
if not settings.tavily_api_key:
logger.info("Tavily API key not configured, skipping")
return []
try:
from tavily import TavilyClient
client = TavilyClient(api_key=settings.tavily_api_key)
response = client.search(
query=query,
search_depth="advanced",
max_results=max_results,
)
results = []
for item in response.get("results", []):
results.append({
"title": item.get("title", ""),
"url": item.get("url", ""),
"content": item.get("content", ""),
"date": item.get("published_date", ""),
"source": "tavily",
})
logger.info(f"Tavily returned {len(results)} results for: {query[:50]}")
return results
except Exception as e:
logger.warning(f"Tavily search failed: {e}")
return []
def _ddg_search(query: str, max_results: int = 5) -> List[Dict]:
"""Search using DuckDuckGo (fallback — no API key needed)."""
try:
from duckduckgo_search import DDGS
results = []
with DDGS() as ddgs:
for r in ddgs.text(query, max_results=max_results):
results.append({
"title": r.get("title", ""),
"url": r.get("href", ""),
"content": r.get("body", ""),
"date": "",
"source": "duckduckgo",
})
logger.info(f"DuckDuckGo returned {len(results)} results for: {query[:50]}")
return results
except Exception as e:
logger.warning(f"DuckDuckGo search failed: {e}")
return []
def _filter_recent(results: List[Dict], lookback_days: int) -> List[Dict]:
"""Filter results to only include items from the last N days."""
if not results:
return results
    # Naive UTC timestamp; the dates parsed below are also naive, so the
    # comparison stays consistent.
    cutoff = datetime.utcnow() - timedelta(days=lookback_days)
filtered = []
for r in results:
date_str = r.get("date", "")
if not date_str:
# No date info — include it (benefit of the doubt)
filtered.append(r)
continue
try:
            # Try common date formats; [:19] trims timezone suffixes from
            # ISO timestamps so the first pattern can match.
            for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", "%B %d, %Y"):
try:
dt = datetime.strptime(date_str[:19], fmt)
if dt >= cutoff:
filtered.append(r)
break
except ValueError:
continue
else:
# Can't parse date, include it
filtered.append(r)
except Exception:
filtered.append(r)
return filtered
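
# A small behavioural sketch (hypothetical data; assumes it runs well after
# January 2024): undated items are kept, dated items only if recent enough.
if __name__ == "__main__":
    sample = [
        {"title": "dated", "date": "2024-01-05T10:00:00"},
        {"title": "undated", "date": ""},
    ]
    print([r["title"] for r in _filter_recent(sample, lookback_days=1)])
    # -> ['undated']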