mirror of
https://github.com/Manoj-HV30/clawrity.git
synced 2026-05-16 19:35:21 +00:00
prototype
This commit is contained in:
@@ -0,0 +1,26 @@
|
|||||||
|
# =============================================================================
|
||||||
|
# Clawrity — Environment Variables
|
||||||
|
# Copy this file to .env and fill in your values.
|
||||||
|
# NEVER commit .env to git.
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# --- Groq API (free at https://console.groq.com) ---
|
||||||
|
GROQ_API_KEY=
|
||||||
|
|
||||||
|
# --- PostgreSQL + pgvector (docker-compose handles this if using defaults) ---
|
||||||
|
DATABASE_URL=postgresql://user:pass@localhost:5432/clawrity
|
||||||
|
|
||||||
|
# --- Slack Bot (Socket Mode) ---
|
||||||
|
# 1. Create app at https://api.slack.com/apps
|
||||||
|
# 2. Enable Socket Mode → generate App-Level Token (xapp-...)
|
||||||
|
# 3. OAuth & Permissions → install to workspace → copy Bot Token (xoxb-...)
|
||||||
|
# 4. Basic Information → Signing Secret
|
||||||
|
SLACK_BOT_TOKEN=
|
||||||
|
SLACK_APP_TOKEN=
|
||||||
|
SLACK_SIGNING_SECRET=
|
||||||
|
|
||||||
|
# --- Tavily Web Search (free at https://app.tavily.com) ---
|
||||||
|
TAVILY_API_KEY=
|
||||||
|
|
||||||
|
# --- Slack Webhook for digest delivery ---
|
||||||
|
ACME_SLACK_WEBHOOK=
|
||||||
+43
@@ -0,0 +1,43 @@
|
|||||||
|
# === Environment & Secrets ===
|
||||||
|
.env
|
||||||
|
*.env
|
||||||
|
|
||||||
|
# === Dataset files — never commit raw or processed data ===
|
||||||
|
data/raw/
|
||||||
|
data/processed/
|
||||||
|
|
||||||
|
# === Python ===
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
*.egg
|
||||||
|
|
||||||
|
# === Virtual Environment ===
|
||||||
|
venv/
|
||||||
|
.venv/
|
||||||
|
env/
|
||||||
|
|
||||||
|
# === IDE ===
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# === OS ===
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
# === Logs ===
|
||||||
|
logs/
|
||||||
|
*.log
|
||||||
|
*.jsonl
|
||||||
|
|
||||||
|
# === Docker ===
|
||||||
|
pg_data/
|
||||||
|
|
||||||
|
# === Model Cache ===
|
||||||
|
.cache/
|
||||||
+23
@@ -0,0 +1,23 @@
|
|||||||
|
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies for psycopg2 and Prophet
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
# Copying requirements.txt before the source keeps this layer cached
# across code-only changes, so rebuilds skip pip install.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy project
COPY . .

# Create necessary directories
RUN mkdir -p data/raw data/processed logs

# Port served by uvicorn in the CMD below
EXPOSE 8000

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
@@ -0,0 +1,213 @@
|
|||||||
|
# Clawrity
|
||||||
|
|
||||||
|
**Multi-channel AI business intelligence agent.** Enterprise clients interact via Slack (or Teams) and get data-grounded answers, daily digests, budget recommendations, ROI forecasts, and competitor/sector intelligence — all specific to their business data.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
Built on the **OpenClaw pattern**:
|
||||||
|
- **ProtocolAdapter** — normalises messages from any channel (Slack, Teams, etc.)
|
||||||
|
- **SOUL.md** — per-client personality, rules, and business context
|
||||||
|
- **HEARTBEAT.md** — autonomous daily digest scheduling
|
||||||
|
|
||||||
|
All intelligence lives in the Clawrity backend. OpenClaw layer has zero business logic.
|
||||||
|
|
||||||
|
## Tech Stack
|
||||||
|
|
||||||
|
| Component | Tool |
|
||||||
|
|---|---|
|
||||||
|
| Language | Python 3.11 |
|
||||||
|
| API Framework | FastAPI + uvicorn |
|
||||||
|
| LLM | Groq API — llama-3.3-70b-versatile |
|
||||||
|
| Embeddings | sentence-transformers all-MiniLM-L6-v2 (CPU, 384d) |
|
||||||
|
| Database | PostgreSQL + pgvector |
|
||||||
|
| Channel (dev) | Slack Bolt SDK (Socket Mode) |
|
||||||
|
| Channel (demo) | Microsoft Teams Bot Framework SDK |
|
||||||
|
| Scheduler | APScheduler AsyncIOScheduler |
|
||||||
|
| Web Search | Tavily API + DuckDuckGo fallback |
|
||||||
|
| Forecasting | Prophet |
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### 1. Prerequisites
|
||||||
|
|
||||||
|
- Python 3.11+
|
||||||
|
- Docker & Docker Compose
|
||||||
|
- Groq API key (free: https://console.groq.com)
|
||||||
|
- Tavily API key (free: https://app.tavily.com)
|
||||||
|
|
||||||
|
### 2. Environment Setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
# Fill in your API keys in .env
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Start PostgreSQL + pgvector
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose up -d postgres
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Install Dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m venv venv
|
||||||
|
source venv/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Download Kaggle Datasets
|
||||||
|
|
||||||
|
Download these two datasets and place them in `data/raw/`:
|
||||||
|
|
||||||
|
1. **Global Superstore**: https://kaggle.com/datasets/apoorvaappz/global-super-store-dataset
|
||||||
|
2. **Marketing Campaign Performance**: https://kaggle.com/datasets/manishabhatt22/marketing-campaign-performance-dataset
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir -p data/raw data/processed
|
||||||
|
# Place downloaded files in data/raw/
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Seed Demo Data
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/seed_demo_data.py --client_id acme_corp \
|
||||||
|
--superstore data/raw/Global_Superstore2.csv \
|
||||||
|
--marketing data/raw/marketing_campaign_dataset.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7. Run RAG Pipeline
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/run_rag_pipeline.py --client_id acme_corp
|
||||||
|
```
|
||||||
|
|
||||||
|
### 8. Start the API
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uvicorn main:app --reload --port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Slack Bot Setup (Socket Mode)
|
||||||
|
|
||||||
|
### Step 1: Create Slack App
|
||||||
|
|
||||||
|
1. Go to https://api.slack.com/apps
|
||||||
|
2. Click **Create New App** → **From scratch**
|
||||||
|
3. Name it `Clawrity` and select your workspace
|
||||||
|
|
||||||
|
### Step 2: Enable Socket Mode
|
||||||
|
|
||||||
|
1. In the left sidebar, click **Socket Mode**
|
||||||
|
2. Toggle **Enable Socket Mode** to ON
|
||||||
|
3. Click **Generate Token** — name it `clawrity-socket`
|
||||||
|
4. Copy the `xapp-...` token → paste into `.env` as `SLACK_APP_TOKEN`
|
||||||
|
|
||||||
|
### Step 3: Configure Bot Token
|
||||||
|
|
||||||
|
1. Go to **OAuth & Permissions**
|
||||||
|
2. Under **Bot Token Scopes**, add:
|
||||||
|
- `app_mentions:read`
|
||||||
|
- `chat:write`
|
||||||
|
- `channels:history`
|
||||||
|
- `channels:read`
|
||||||
|
3. Click **Install to Workspace**
|
||||||
|
4. Copy the `xoxb-...` token → paste into `.env` as `SLACK_BOT_TOKEN`
|
||||||
|
|
||||||
|
### Step 4: Enable Events
|
||||||
|
|
||||||
|
1. Go to **Event Subscriptions**
|
||||||
|
2. Toggle **Enable Events** to ON (no Request URL needed in Socket Mode)
|
||||||
|
3. Under **Subscribe to bot events**, add:
|
||||||
|
- `app_mention`
|
||||||
|
- `message.channels`
|
||||||
|
4. Click **Save Changes**
|
||||||
|
|
||||||
|
### Step 5: Get Signing Secret
|
||||||
|
|
||||||
|
1. Go to **Basic Information**
|
||||||
|
2. Under **App Credentials**, copy **Signing Secret**
|
||||||
|
3. Paste into `.env` as `SLACK_SIGNING_SECRET`
|
||||||
|
|
||||||
|
### Step 6: Invite Bot to Channel
|
||||||
|
|
||||||
|
In Slack, go to your desired channel and type:
|
||||||
|
```
|
||||||
|
/invite @Clawrity
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|------|-------------|
|
||||||
|
| POST | `/chat` | Send message → get AI response |
|
||||||
|
| POST | `/slack/events` | Slack webhook fallback |
|
||||||
|
| POST | `/compare` | Side-by-side RAG vs no-RAG |
|
||||||
|
| POST | `/forecast/run/{client_id}` | Trigger Prophet forecasting |
|
||||||
|
| GET | `/forecast/{client_id}/{branch}` | Get cached forecast |
|
||||||
|
| GET | `/admin/stats/{client_id}` | RAG monitoring stats |
|
||||||
|
| GET | `/health` | System status |
|
||||||
|
|
||||||
|
## Adding a New Client
|
||||||
|
|
||||||
|
1. Create `config/clients/client_newclient.yaml` (copy from `client_acme.yaml`)
|
||||||
|
2. Create `soul/newclient_soul.md`
|
||||||
|
3. Create `heartbeat/newclient_heartbeat.md`
|
||||||
|
4. Place data in `data/raw/` and run seed + RAG scripts
|
||||||
|
5. Restart — zero code changes required
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
clawrity/
|
||||||
|
├── main.py # FastAPI application
|
||||||
|
├── config/ # Configuration
|
||||||
|
│ ├── settings.py # pydantic-settings from .env
|
||||||
|
│ ├── client_loader.py # YAML client config loader
|
||||||
|
│ └── clients/client_acme.yaml # Per-client config
|
||||||
|
├── soul/ # Per-client personality
|
||||||
|
│ ├── soul_loader.py
|
||||||
|
│ └── acme_soul.md
|
||||||
|
├── heartbeat/ # Autonomous digest scheduling
|
||||||
|
│ ├── heartbeat_loader.py
|
||||||
|
│ ├── scheduler.py
|
||||||
|
│ └── acme_heartbeat.md
|
||||||
|
├── agents/ # AI agents
|
||||||
|
│ ├── gen_agent.py # Response generation
|
||||||
|
│ ├── qa_agent.py # Quality assurance
|
||||||
|
│ ├── orchestrator.py # Pipeline coordinator
|
||||||
|
│ └── scout_agent.py # Competitor intelligence
|
||||||
|
├── skills/ # Capabilities
|
||||||
|
│ ├── postgres_connector.py # DB connection pool
|
||||||
|
│ ├── nl_to_sql.py # Natural language → SQL
|
||||||
|
│ └── web_search.py # Tavily + DuckDuckGo
|
||||||
|
├── channels/ # Message channels
|
||||||
|
│ ├── protocol_adapter.py # OpenClaw normalisation
|
||||||
|
│ ├── slack_handler.py # Slack Socket Mode
|
||||||
|
│ └── teams_handler.py # Teams stub
|
||||||
|
├── rag/ # Retrieval-augmented generation
|
||||||
|
│ ├── preprocessor.py
|
||||||
|
│ ├── chunker.py
|
||||||
|
│ ├── vector_store.py
|
||||||
|
│ ├── retriever.py
|
||||||
|
│ ├── evaluator.py
|
||||||
|
│ └── monitoring.py
|
||||||
|
├── forecasting/
|
||||||
|
│ └── prophet_engine.py
|
||||||
|
├── connectors/
|
||||||
|
│ ├── base_connector.py
|
||||||
|
│ └── csv_connector.py
|
||||||
|
├── etl/
|
||||||
|
│ └── normaliser.py
|
||||||
|
└── scripts/
|
||||||
|
├── seed_demo_data.py
|
||||||
|
└── run_rag_pipeline.py
|
||||||
|
```
|
||||||
@@ -0,0 +1,184 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — Gen Agent
|
||||||
|
|
||||||
|
Generates newsletter-style, data-grounded responses using LLM.
|
||||||
|
Supports NVIDIA NIM and Groq via OpenAI-compatible API.
|
||||||
|
Temperature 0.7 (reduced by 0.2 on each retry).
|
||||||
|
Augmented with SOUL.md + live query results + RAG chunks (Phase 2).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import List, Optional, Dict
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from config.llm_client import get_llm_client, get_model_name
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class GenAgent:
    """Response generation agent using LLM (NVIDIA NIM or Groq).

    Builds an augmented prompt out of SOUL.md content, SQL query results,
    optional RAG chunks, and optional benchmark data, then calls the
    OpenAI-compatible chat-completions API. Each retry lowers the sampling
    temperature by 0.2 (floored at 0.1) so output stays closer to the data.
    """

    def __init__(self):
        # Client and model name are resolved from environment configuration.
        self.client = get_llm_client()
        self.model = get_model_name()
        # Starting temperature; reduced by 0.2 on each retry.
        self.base_temperature = 0.7

    def generate(
        self,
        question: str,
        soul_content: str,
        data_context: Optional[pd.DataFrame] = None,
        rag_chunks: Optional[List[Dict]] = None,
        retry_issues: Optional[List[str]] = None,
        retry_count: int = 0,
        strict_data_instruction: Optional[str] = None,
        supplementary_context: Optional[pd.DataFrame] = None,
    ) -> str:
        """
        Generate a data-grounded response.

        Args:
            question: User's original question
            soul_content: SOUL.md content for personality/rules
            data_context: DataFrame from PostgreSQL query results
            rag_chunks: Retrieved chunks with similarity scores (Phase 2)
            retry_issues: QA Agent issues from previous attempt
            retry_count: Current retry number (0-2)
            strict_data_instruction: Anti-hallucination directive injected on retry
            supplementary_context: Benchmark rows (top performers) for comparison

        Returns:
            Markdown-formatted response string, or a generic error message
            if the LLM call fails.
        """
        # Each retry lowers the temperature so the model sticks to the data.
        temperature = max(0.1, self.base_temperature - (retry_count * 0.2))

        prompt = self._build_prompt(
            question, soul_content, data_context, rag_chunks, retry_issues,
            strict_data_instruction, supplementary_context,
        )

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": soul_content},
                    {"role": "user", "content": prompt},
                ],
                temperature=temperature,
                max_tokens=2048,
            )
            result = response.choices[0].message.content.strip()
            # Lazy %-args: the message is only built when INFO logging is on.
            logger.info(
                "Gen Agent produced %d chars (temp=%s, retry=%d)",
                len(result), temperature, retry_count,
            )
            return result

        except Exception as e:
            logger.error("Gen Agent failed: %s", e)
            # Fix: was an f-string with no placeholders.
            return "I encountered an error generating your response. Please try again."

    def generate_digest(
        self,
        soul_content: str,
        data_context: pd.DataFrame,
        rag_chunks: Optional[List[Dict]] = None,
    ) -> str:
        """Generate a daily digest newsletter.

        Args:
            soul_content: SOUL.md content used as the system prompt
            data_context: Last-7-days performance rows (may be empty or None)
            rag_chunks: Optional historical context chunks with similarity scores

        Returns:
            Newsletter-style markdown digest, or a short error notice on failure.
        """
        prompt = f"""Generate a professional daily business intelligence digest.

## Performance Data (Last 7 Days)
{data_context.to_markdown(index=False) if data_context is not None and len(data_context) > 0 else "No data available."}

"""
        if rag_chunks:
            prompt += "## Historical Context\n"
            for i, chunk in enumerate(rag_chunks, 1):
                sim = chunk.get("similarity", 0)
                prompt += f"{i}. {chunk['text']} (relevance: {sim:.2f})\n"
            prompt += "\n"

        prompt += """Format as a newsletter with:
1. **Executive Summary** — key highlights in 2-3 sentences
2. **Top Performers** — best performing branches
3. **Attention Required** — bottom 3 branches by revenue (ALWAYS include this)
4. **Channel Insights** — spending efficiency across channels
5. **Recommendations** — specific, data-backed suggestions

Use bullet points, bold key numbers, and keep it concise."""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": soul_content},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.7,
                max_tokens=3000,
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            logger.error("Digest generation failed: %s", e)
            return "Daily digest generation encountered an error."

    def _build_prompt(
        self,
        question: str,
        soul_content: str,
        data_context: Optional[pd.DataFrame],
        rag_chunks: Optional[List[Dict]],
        retry_issues: Optional[List[str]],
        strict_data_instruction: Optional[str] = None,
        supplementary_context: Optional[pd.DataFrame] = None,
    ) -> str:
        """Build the augmented prompt for response generation.

        Sections are appended in a fixed order: strict instruction (retry
        only), data context, benchmark data, RAG chunks, retry issues,
        then the user question and a closing grounding instruction.
        """
        parts = []

        # Strict data instruction (on retry — prevents hallucination)
        if strict_data_instruction:
            parts.append(f"## ⚠️ STRICT REQUIREMENT\n{strict_data_instruction}\n")

        # Data context
        if data_context is not None and len(data_context) > 0:
            parts.append("## Data Context (query results for the user's question)")
            parts.append(data_context.to_markdown(index=False))
        else:
            parts.append("## Data Context\nNo query results available.")

        # Supplementary context (top performers for comparison)
        if supplementary_context is not None and len(supplementary_context) > 0:
            parts.append("\n## Benchmark Data (top-performing branches for comparison)")
            parts.append(supplementary_context.to_markdown(index=False))
            parts.append(
                "\nUse this benchmark data to compare the queried branch's performance "
                "against top performers. Identify which channels and strategies work "
                "best, and recommend specific, actionable improvements based on what "
                "top-performing branches are doing differently."
            )

        # RAG chunks (Phase 2)
        if rag_chunks:
            parts.append("\n## Historical Business Context (retrieved from intelligence layer)")
            if strict_data_instruction:
                parts.append("⚠️ ONLY use historical context that is about branches/entities in the Data Context above. IGNORE any historical context about other branches.")
            for i, chunk in enumerate(rag_chunks, 1):
                sim = chunk.get("similarity", 0)
                parts.append(f"{i}. {chunk['text']} (relevance: {sim:.2f})")
            parts.append("\nBase suggestions on historical context. Cite specific data points.")

        # Retry instructions
        if retry_issues:
            parts.append("\n## IMPORTANT — Previous Response Issues")
            parts.append("Your previous response had these problems. Fix them:")
            for issue in retry_issues:
                parts.append(f"- {issue}")
            parts.append("Be more precise. Only state facts supported by the data above.")
            parts.append("Do NOT introduce any new branches, cities, or figures that are not in the Data Context.")

        # User question
        parts.append(f"\n## User Question\n{question}")

        parts.append("\nProvide a professional, data-grounded response. Cite specific numbers from the data.")

        return "\n".join(parts)
|
||||||
@@ -0,0 +1,294 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — Orchestrator
|
||||||
|
|
||||||
|
Coordinates the full message pipeline:
|
||||||
|
NormalisedMessage → NL-to-SQL → PostgreSQL → (RAG Retriever) → Gen Agent → QA Agent → Response
|
||||||
|
|
||||||
|
Max 2 retries per query. Returns best attempt with confidence warning after max retries.
|
||||||
|
|
||||||
|
Context enrichment: when a query returns sparse data (≤3 rows) and the question
|
||||||
|
asks for recommendations, automatically pulls top-performing branches as comparison
|
||||||
|
context so the Gen Agent can give actionable suggestions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from typing import Dict, Optional, List
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from agents.gen_agent import GenAgent
|
||||||
|
from agents.qa_agent import QAAgent
|
||||||
|
from channels.protocol_adapter import NormalisedMessage
|
||||||
|
from config.client_loader import ClientConfig
|
||||||
|
from skills.nl_to_sql import NLToSQL
|
||||||
|
from skills.postgres_connector import get_connector
|
||||||
|
from soul.soul_loader import load_soul
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MAX_RETRIES = 2
|
||||||
|
|
||||||
|
# Keywords that signal the user wants recommendations, not just raw data
|
||||||
|
_RECOMMENDATION_KEYWORDS = re.compile(
|
||||||
|
r"\b(improve|increase|boost|grow|fix|help|recommend|suggest|advice|strategy|"
|
||||||
|
r"what (should|can|do)|how (to|can|do|should))\b",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class Orchestrator:
    """Pipeline orchestrator — the central brain of Clawrity.

    Runs: NL-to-SQL → PostgreSQL → optional context enrichment →
    optional RAG retrieval → Gen Agent → QA Agent, with up to
    MAX_RETRIES regeneration attempts when QA fails.
    """

    def __init__(self):
        self.nl_to_sql = NLToSQL()
        self.gen_agent = GenAgent()
        self.qa_agent = QAAgent()
        self.retriever = None  # Set in Phase 2 via set_retriever()

    def set_retriever(self, retriever):
        """Attach the RAG retriever (Phase 2)."""
        self.retriever = retriever

    async def process(
        self,
        message: NormalisedMessage,
        client_config: ClientConfig,
    ) -> Dict:
        """
        Process a user message through the full pipeline.

        Args:
            message: Channel-normalised user message
            client_config: Per-client configuration (client_id, thresholds)

        Returns:
            Dict with: response, qa_score, qa_passed, retries, sql,
            data_rows, rag_chunks_used, elapsed_seconds
        """
        start_time = time.time()
        db = get_connector()

        # Load SOUL — per-client personality/rules used as the system prompt
        soul_content = load_soul(client_config)

        # Step 1: NL-to-SQL
        schema_meta = db.get_spend_data_schema(client_config.client_id)
        sql = self.nl_to_sql.generate_sql(
            question=message.text,
            client_id=client_config.client_id,
            schema_metadata=schema_meta,
        )

        # Step 2: Execute SQL — any failure degrades to an empty frame
        # rather than aborting the pipeline.
        data_context = pd.DataFrame()
        if sql:
            try:
                data_context = db.execute_query(sql)
                logger.info("SQL returned %d rows", len(data_context))
            except Exception as e:
                logger.error("SQL execution failed: %s", e)
                data_context = pd.DataFrame()

        # Step 2b: Context enrichment for sparse results.
        # When data is sparse and the user wants recommendations, pull
        # top performers and channel benchmarks as supplementary context.
        supplementary_context = None
        if self._needs_enrichment(message.text, data_context):
            supplementary_context = self._enrich_context(
                db, client_config.client_id, message.text, data_context
            )
            if supplementary_context is not None:
                logger.info(
                    "Context enriched: %d supplementary rows",
                    len(supplementary_context),
                )

        # Step 3: RAG Retrieval (Phase 2) — best-effort, never fatal
        rag_chunks = None
        if self.retriever:
            try:
                rag_chunks = self.retriever.retrieve(
                    query=message.text,
                    client_id=client_config.client_id,
                )
            except Exception as e:
                logger.warning("RAG retrieval failed: %s", e)

        # Step 4: Gen Agent → QA Agent loop (max MAX_RETRIES retries).
        # When supplementary context is provided (enrichment mode), use a
        # relaxed QA threshold since the response naturally references
        # broader benchmark data.
        qa_threshold = client_config.hallucination_threshold
        if supplementary_context is not None and len(supplementary_context) > 0:
            qa_threshold = min(qa_threshold, 0.5)
            logger.info("Using relaxed QA threshold (%s) for enriched context", qa_threshold)

        best_response = None
        best_score = 0.0
        qa_result = {"score": 0, "passed": False, "issues": []}
        retries = 0

        for attempt in range(MAX_RETRIES + 1):
            retry_issues = qa_result["issues"] if attempt > 0 else None

            # On retry, add explicit data-only instruction to prevent hallucination
            strict_data_instruction = None
            if attempt > 0:
                if supplementary_context is not None and len(supplementary_context) > 0:
                    strict_data_instruction = (
                        "CRITICAL: Only use data from the Data Context and Benchmark Data "
                        "sections provided. Do NOT invent figures or branch names that are "
                        "not present in either of those sections. You MAY reference benchmark "
                        "branches for comparison and recommendations."
                    )
                else:
                    strict_data_instruction = (
                        "CRITICAL: Do NOT mention any branches, figures, or historical data "
                        "that are not in the SQL query result provided. Stick strictly to the "
                        "data. If historical context from RAG is about different branches than "
                        "what the query returned, IGNORE that context entirely."
                    )

            response = self.gen_agent.generate(
                question=message.text,
                soul_content=soul_content,
                data_context=data_context,
                rag_chunks=rag_chunks,
                retry_issues=retry_issues,
                retry_count=attempt,
                strict_data_instruction=strict_data_instruction,
                supplementary_context=supplementary_context,
            )

            qa_result = self.qa_agent.evaluate(
                response=response,
                data_context=data_context,
                threshold=qa_threshold,
                supplementary_context=supplementary_context,
                user_question=message.text,
            )

            # Track best response (prefer longer, richer responses over "no data" stubs)
            if qa_result["score"] > best_score or (
                qa_result["score"] == best_score
                and best_response is not None
                and len(response) > len(best_response)
            ):
                best_score = qa_result["score"]
                best_response = response

            if qa_result["passed"]:
                logger.info("QA passed on attempt %d", attempt + 1)
                break
            retries += 1
            logger.warning(
                "QA failed on attempt %d: score=%.2f, issues=%s",
                attempt + 1, qa_result["score"], qa_result["issues"],
            )

        # If max retries exceeded, use best response with confidence warning
        final_response = best_response or response
        if not qa_result["passed"] and retries >= MAX_RETRIES:
            # Fix: trailing segments were f-strings with no placeholders.
            final_response += (
                "\n\n---\n"
                f"⚠️ *Confidence: {best_score:.0%} — "
                "This response may contain approximations. "
                "Please verify critical numbers against your source data.*"
            )

        elapsed = time.time() - start_time

        result = {
            "response": final_response,
            "qa_score": best_score,
            "qa_passed": qa_result["passed"],
            "retries": retries,
            "sql": sql,
            "data_rows": len(data_context) if data_context is not None else 0,
            "rag_chunks_used": len(rag_chunks) if rag_chunks else 0,
            "elapsed_seconds": round(elapsed, 2),
        }

        # Log interaction
        self._log_interaction(message, client_config, result)

        return result

    def _needs_enrichment(
        self,
        question: str,
        data_context: Optional[pd.DataFrame],
    ) -> bool:
        """Check if the query result is too sparse for a recommendation question."""
        # Only enrich if data is sparse (≤3 rows; None counts as sparse)
        if data_context is not None and len(data_context) > 3:
            return False

        # Only enrich if user is asking for recommendations/improvement
        return bool(_RECOMMENDATION_KEYWORDS.search(question))

    def _enrich_context(
        self,
        db,
        client_id: str,
        question: str,
        data_context: Optional[pd.DataFrame],
    ) -> Optional[pd.DataFrame]:
        """
        Pull supplementary context: top-performing branches and channel
        benchmarks to help Gen Agent give actionable recommendations.

        Returns None on any failure or when no benchmark rows exist.
        """
        try:
            # Get top branches by ROI (last 90 days) for comparison
            enrichment_sql = """
                SELECT branch, country, channel,
                       SUM(spend) as total_spend,
                       SUM(revenue) as total_revenue,
                       SUM(leads) as total_leads,
                       SUM(conversions) as total_conversions,
                       ROUND((SUM(revenue)/NULLIF(SUM(spend),0))::numeric, 2) as roi
                FROM spend_data
                WHERE client_id = %s
                  AND date >= CURRENT_DATE - INTERVAL '90 days'
                GROUP BY branch, country, channel
                HAVING SUM(spend) > 0
                ORDER BY roi DESC
                LIMIT 10
            """
            top_performers = db.execute_query(enrichment_sql, (client_id,))

            if top_performers is not None and len(top_performers) > 0:
                logger.info("Enrichment: fetched %d top performer rows", len(top_performers))
                return top_performers

        except Exception as e:
            logger.warning("Context enrichment failed: %s", e)

        return None

    def _log_interaction(
        self,
        message: NormalisedMessage,
        client_config: ClientConfig,
        result: Dict,
    ):
        """Log interaction for monitoring (best-effort; never raises)."""
        try:
            # Imported lazily so monitoring stays optional.
            from rag.monitoring import log_interaction
            log_interaction(
                client_id=client_config.client_id,
                query=message.text,
                num_chunks=result.get("rag_chunks_used", 0),
                chunk_types_used=[],  # Populated when retriever provides this info
                qa_score=result.get("qa_score", 0),
                qa_passed=result.get("qa_passed", False),
                retries=result.get("retries", 0),
                response_length=len(result.get("response", "")),
                elapsed_seconds=result.get("elapsed_seconds", 0),
            )
        except Exception as e:
            logger.debug("Monitoring log failed: %s", e)

        logger.info(
            "[%s] Query processed: score=%.2f, passed=%s, retries=%s, time=%ss",
            client_config.client_id, result["qa_score"], result["qa_passed"],
            result["retries"], result["elapsed_seconds"],
        )
|
||||||
@@ -0,0 +1,165 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — QA Agent
|
||||||
|
|
||||||
|
Evaluates Gen Agent responses for faithfulness against data context.
|
||||||
|
Uses Groq LLM at temperature 0.1 for strict, deterministic evaluation.
|
||||||
|
Returns JSON: { score, passed, issues }
|
||||||
|
Threshold from client YAML hallucination_threshold (default 0.75).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from typing import Optional, List, Dict
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from config.llm_client import get_llm_client, get_model_name
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
EVAL_PROMPT = """You are a strict quality assurance evaluator for business intelligence responses.
|
||||||
|
|
||||||
|
Your job: verify that the response ONLY contains claims supported by the provided data.
|
||||||
|
|
||||||
|
## Data Context (ground truth)
|
||||||
|
{data_context}
|
||||||
|
|
||||||
|
## Response to Evaluate
|
||||||
|
{response}
|
||||||
|
|
||||||
|
## Evaluation Criteria
|
||||||
|
|
||||||
|
### 1. Branch Name Validation (CRITICAL)
|
||||||
|
- Extract ALL branch/city names mentioned in the response
|
||||||
|
- Compare against the branch names in the Data Context above
|
||||||
|
- If ANY branch name appears in the response but NOT in the Data Context, this is a HALLUCINATION
|
||||||
|
- Deduct 0.3 from score for EACH unrelated branch mentioned
|
||||||
|
|
||||||
|
### 2. Numerical Accuracy (CRITICAL)
|
||||||
|
- ALL revenue, spend, lead, conversion, and ROI figures in the response must match the Data Context EXACTLY
|
||||||
|
- If a number is mentioned that does not appear in the Data Context, deduct 0.2 from score
|
||||||
|
- Rounded numbers are acceptable only if clearly approximate (e.g., "~$1.2M")
|
||||||
|
|
||||||
|
### 3. Historical Context Relevance
|
||||||
|
- If the response includes historical context or trends, it is acceptable ONLY if it directly supports the answer about branches/entities present in the Data Context
|
||||||
|
- Historical context about branches NOT in the current Data Context must be penalized: deduct 0.3 from score
|
||||||
|
- Example: If Data Context shows Toronto, Vancouver, Dubai but response mentions "Lawton showed 16436% growth" — this is IRRELEVANT historical context and must be penalized
|
||||||
|
|
||||||
|
### 4. Completeness
|
||||||
|
- Does the response address the user's question?
|
||||||
|
- Are key data points from the Data Context included?
|
||||||
|
|
||||||
|
### 5. Appropriate Hedging
|
||||||
|
- Does the response use uncertain language for inferences?
|
||||||
|
- Recommendations should be clearly marked as suggestions, not facts
|
||||||
|
|
||||||
|
## Scoring
|
||||||
|
Start at 1.0 and deduct points per the rules above. Minimum score is 0.0.
|
||||||
|
|
||||||
|
Return a JSON object with exactly this structure:
|
||||||
|
{{
|
||||||
|
"score": <float between 0.0 and 1.0>,
|
||||||
|
"passed": <true if score >= {threshold}>,
|
||||||
|
"issues": [<list of specific issues found, empty if none>]
|
||||||
|
}}
|
||||||
|
|
||||||
|
IMPORTANT: If score < {threshold}, include in issues list exactly which branches, figures, or historical data were mentioned that do NOT appear in the Data Context. Format as:
|
||||||
|
"Mentioned branches/figures not in current query result: [list them]"
|
||||||
|
|
||||||
|
Return ONLY the JSON. No other text."""
|
||||||
|
|
||||||
|
|
||||||
|
class QAAgent:
    """Quality assurance agent for validating Gen Agent responses.

    Sends the response plus its ground-truth data context to the LLM
    (temperature 0.1 for near-deterministic judging) and parses the
    returned JSON verdict: ``{ score, passed, issues }``. Both evaluation
    and parsing fail OPEN (passed=True, score=0.5) so an unavailable or
    misbehaving judge never blocks user responses.
    """

    def __init__(self):
        # Shared Groq client/model — see config.llm_client.
        self.client = get_llm_client()
        self.model = get_model_name()

    def evaluate(
        self,
        response: str,
        data_context: Optional[pd.DataFrame] = None,
        threshold: float = 0.75,
        supplementary_context: Optional[pd.DataFrame] = None,
        user_question: str = "",
    ) -> Dict:
        """
        Evaluate a response for faithfulness against its data context.

        Args:
            response: Gen Agent's response text
            data_context: The data the response should be grounded in
            threshold: Minimum score to pass (from client YAML)
            supplementary_context: Benchmark data (top performers) that is also valid ground truth
            user_question: The user's original question (entities mentioned here are valid context)

        Returns:
            Dict with score (float), passed (bool), issues (list[str])
        """
        # Build the ground-truth block the judge compares against.
        if data_context is not None and len(data_context) > 0:
            data_str = data_context.to_markdown(index=False)
        else:
            data_str = "No structured data available."

        # Benchmark rows count as valid ground truth too.
        if supplementary_context is not None and len(supplementary_context) > 0:
            data_str += "\n\n### Benchmark Data (also valid ground truth)\n"
            data_str += supplementary_context.to_markdown(index=False)

        # Entities the user themselves named are fair game for the
        # response to reference, even if absent from the query result.
        if user_question:
            data_str += f"\n\n### User Question Context\nThe user asked: \"{user_question}\"\nBranch/entity names mentioned in the user's question are valid to reference in the response."

        prompt = EVAL_PROMPT.format(
            data_context=data_str,
            response=response,
            threshold=threshold,
        )

        try:
            result = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a strict QA evaluator. Return only valid JSON. Pay special attention to branch names and figures that appear in the response but NOT in the data context — these are hallucinations."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,  # low temperature for stable judging
                max_tokens=512,
            )

            raw = result.choices[0].message.content.strip()
            evaluation = self._parse_response(raw, threshold)
            logger.info(
                f"QA evaluation: score={evaluation['score']:.2f}, "
                f"passed={evaluation['passed']}, issues={len(evaluation['issues'])}"
            )
            return evaluation

        except Exception as e:
            logger.error(f"QA evaluation failed: {e}")
            # On failure, pass with warning (fail open).
            return {"score": 0.5, "passed": True, "issues": [f"QA evaluation error: {str(e)}"]}

    def _parse_response(self, raw: str, threshold: float) -> Dict:
        """Parse the JSON verdict returned by the QA LLM call.

        Robust to markdown code fences (``` or ```json), out-of-range
        scores, and JSON payloads that are valid but not an object. On
        any parse failure, fails open with a diagnostic issue entry.
        """
        try:
            cleaned = raw.strip()
            # Strip markdown code fences if present; splitting off the
            # first line also discards a "```json" language tag.
            if cleaned.startswith("```"):
                cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else cleaned[3:]
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
            cleaned = cleaned.strip()

            data = json.loads(cleaned)
            # Clamp to [0, 1]: the model occasionally emits out-of-range
            # scores, which would otherwise distort threshold comparisons
            # and downstream reporting.
            score = max(0.0, min(1.0, float(data.get("score", 0.5))))
            issues = data.get("issues", [])
            if not isinstance(issues, list):
                issues = [str(issues)]
            return {
                "score": score,
                # Recompute pass/fail locally rather than trusting the
                # model's own "passed" field.
                "passed": score >= threshold,
                "issues": issues,
            }
        except (json.JSONDecodeError, ValueError, TypeError, AttributeError) as e:
            # TypeError/AttributeError cover valid-but-non-object JSON
            # (e.g. a bare list or string), where .get()/float() would
            # otherwise raise uncaught through this method.
            logger.warning(f"Could not parse QA response: {e}. Raw: {raw[:200]}")
            return {"score": 0.5, "passed": True, "issues": ["QA response parsing failed"]}
|
||||||
@@ -0,0 +1,214 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — Scout Agent
|
||||||
|
|
||||||
|
Fetches real-time competitor updates and sector-specific news.
|
||||||
|
Runs inside HEARTBEAT digest job ONLY — never on ad-hoc /chat queries.
|
||||||
|
Appends "Market Intelligence" section to morning digest.
|
||||||
|
|
||||||
|
If nothing relevant is found, the section is omitted entirely — no filler.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from config.llm_client import get_llm_client, get_model_name
|
||||||
|
from config.client_loader import ClientConfig
|
||||||
|
from config.settings import get_settings
|
||||||
|
from skills.web_search import web_search
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Digest-time prompt: summarise raw web-search results into a
# "Market Intelligence" section. Placeholders: {client_name}, {sector},
# {competitors}, {lookback}, {search_results}. The NO_RELEVANT_NEWS
# sentinel lets callers detect (and omit) an empty section — no filler.
SCOUT_PROMPT = """You are a business intelligence scout for {client_name}.
Their sector: {sector}
Their competitors: {competitors}

Below are web search results from the last {lookback} day(s).
Extract ONLY what is directly relevant to this client's business.
Ignore anything generic or unrelated to their sector.
If nothing is relevant, respond with exactly: NO_RELEVANT_NEWS

Format relevant findings as a clean "Market Intelligence" section with bullet points.
Each bullet should summarize one key finding with its source.

Results:
{search_results}"""


# Ad-hoc query prompt (used by the /scout endpoint): same contract and
# NO_RELEVANT_NEWS sentinel as SCOUT_PROMPT, but scoped to one user
# question. Placeholders: {client_name}, {sector}, {competitors},
# {query}, {search_results}.
QUERY_PROMPT = """You are a business intelligence scout for {client_name}.
Sector: {sector}
Competitors: {competitors}

The user asked: "{query}"

Below are web search results. Extract ONLY what is directly relevant to the
user's question and this client's business context. Ignore generic or unrelated content.
If nothing is relevant, respond with exactly: NO_RELEVANT_NEWS

Format findings as concise bullet points with sources.

Results:
{search_results}"""
|
||||||
|
|
||||||
|
|
||||||
|
class ScoutAgent:
    """Competitor and sector intelligence agent.

    Gathers recent web-search results (competitor names + sector
    keywords), then asks the LLM to distil only client-relevant findings.
    The LLM signals "nothing useful" with the NO_RELEVANT_NEWS sentinel,
    in which case callers omit the section entirely.
    """

    def __init__(self):
        # Shared Groq client/model — see config.llm_client.
        self.client = get_llm_client()
        self.model = get_model_name()

    # ------------------------------------------------------------------
    # Internal helpers shared by the digest and ad-hoc query paths
    # ------------------------------------------------------------------

    @staticmethod
    def _dedupe(results):
        """Drop duplicate search hits, keeping the first occurrence per URL."""
        seen_urls = set()
        unique_results = []
        for r in results:
            if r["url"] not in seen_urls:
                seen_urls.add(r["url"])
                unique_results.append(r)
        return unique_results

    @staticmethod
    def _format_results(results):
        """Render search hits as markdown blocks for the LLM prompt."""
        return "\n\n".join(
            f"**{r['title']}** ({r['url']})\n{r['content']}"
            for r in results
        )

    def _summarise(self, prompt: str) -> Optional[str]:
        """Run one LLM summarisation call.

        Returns the summary text, or None when the model answered with
        the NO_RELEVANT_NEWS sentinel. Exceptions propagate to the caller,
        which decides how to log them.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a business intelligence scout."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
            max_tokens=1024,
        )
        result = response.choices[0].message.content.strip()
        if result == "NO_RELEVANT_NEWS":
            return None
        return result

    async def gather_intelligence(
        self,
        client_config: ClientConfig,
    ) -> Optional[str]:
        """
        Fetch and summarize competitor/sector news for digest.

        Args:
            client_config: Client config with scout section

        Returns:
            Formatted "Market Intelligence" markdown section, or None if nothing relevant
        """
        scout_config = client_config.scout
        if not scout_config.sector and not scout_config.competitors:
            logger.info(f"[{client_config.client_id}] No scout config — skipping")
            return None

        lookback = scout_config.news_lookback_days
        today = datetime.now().strftime("%Y-%m-%d")

        # Gather search results.
        # NOTE(review): web_search is called synchronously inside this
        # async method and will block the event loop while searching —
        # acceptable in the HEARTBEAT job, but confirm before reusing
        # this on a latency-sensitive path.
        all_results = []

        # One search per competitor name.
        for competitor in scout_config.competitors:
            query = f"{competitor} latest news"
            results = web_search(query, max_results=3, lookback_days=lookback)
            all_results.extend(results)

        # One search per sector keyword.
        for keyword in scout_config.keywords[:3]:  # Limit to 3 keywords
            query = f"{keyword} news {today}"
            results = web_search(query, max_results=3, lookback_days=lookback)
            all_results.extend(results)

        if not all_results:
            logger.info(f"[{client_config.client_id}] No search results found")
            return None

        # Deduplicate by URL (matches search_query behaviour):
        # overlapping competitor/keyword searches often return the same
        # article, which would waste prompt tokens and bias the summary.
        all_results = self._dedupe(all_results)

        # Summarize with Groq
        prompt = SCOUT_PROMPT.format(
            client_name=client_config.client_name,
            sector=scout_config.sector,
            competitors=", ".join(scout_config.competitors),
            lookback=lookback,
            search_results=self._format_results(all_results),
        )

        try:
            result = self._summarise(prompt)
            if result is None:
                logger.info(f"[{client_config.client_id}] Scout: no relevant news found")
                return None

            section = f"## 🔭 Market Intelligence\n\n{result}"
            logger.info(f"[{client_config.client_id}] Scout: generated intelligence section")
            return section

        except Exception as e:
            logger.error(f"Scout Agent failed: {e}")
            return None

    async def search_query(
        self,
        client_config: ClientConfig,
        query: str,
    ) -> Optional[str]:
        """
        Run a targeted scout search for a specific user query.

        Used by the /scout endpoint for ad-hoc competitor/news queries.

        Args:
            client_config: Client config with scout section
            query: User's specific question about competitors/market

        Returns:
            Formatted intelligence summary, or None if nothing relevant
        """
        scout_config = client_config.scout

        # Search with the user's query directly
        results = web_search(query, max_results=5, lookback_days=scout_config.news_lookback_days)

        # Also search with competitor names if they appear in the query
        for competitor in scout_config.competitors:
            if competitor.lower() in query.lower():
                extra = web_search(f"{competitor} latest news", max_results=3, lookback_days=scout_config.news_lookback_days)
                results.extend(extra)

        if not results:
            logger.info(f"[{client_config.client_id}] Scout query returned no results")
            return None

        prompt = QUERY_PROMPT.format(
            client_name=client_config.client_name,
            sector=scout_config.sector,
            competitors=", ".join(scout_config.competitors),
            query=query,
            search_results=self._format_results(self._dedupe(results)),
        )

        try:
            # None here means the model returned the NO_RELEVANT_NEWS sentinel.
            return self._summarise(prompt)
        except Exception as e:
            logger.error(f"Scout query failed: {e}")
            return None
|
||||||
@@ -0,0 +1,121 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — Protocol Adapter (OpenClaw Pattern)
|
||||||
|
|
||||||
|
Normalises messages from any channel into a unified NormalisedMessage.
|
||||||
|
Maps workspace/team IDs → client_id. Strips bot mentions.
|
||||||
|
Interface: any channel handler produces NormalisedMessage — adding Teams,
|
||||||
|
WhatsApp, etc. requires zero pipeline changes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
from config.client_loader import ClientConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class NormalisedMessage:
    """Unified message format — channel-agnostic.

    Every channel handler (Slack, Teams, direct API) produces this shape,
    so the downstream pipeline never touches channel-specific payloads.
    """

    # The user's message text, with any bot-mention markup stripped.
    text: str
    # Channel/conversation ID (Slack channel, Teams channelId, or "api").
    channel: str
    # Sender's user ID in the source channel ("api_user" for API calls).
    user_id: str
    # Clawrity client this message belongs to ("unknown" if unresolvable).
    client_id: str
    # NOTE(review): datetime.utcnow produces a NAIVE timestamp and is
    # deprecated since Python 3.12 — presumably consumers expect naive
    # UTC; confirm before switching to datetime.now(timezone.utc).
    timestamp: datetime = field(default_factory=datetime.utcnow)
    # Originating channel type: "slack", "teams", "api".
    source: str = "unknown"
    # Raw upstream event payload, kept for debugging/auditing.
    raw_event: Optional[Dict] = None
||||||
|
|
||||||
|
|
||||||
|
# Pattern to match Slack bot mentions like <@U1234567890>.
# Also consumes any whitespace immediately following the mention so the
# remaining text starts cleanly after substitution.
SLACK_MENTION_PATTERN = re.compile(r"<@[A-Z0-9]+>\s*")
||||||
|
|
||||||
|
|
||||||
|
class ProtocolAdapter:
    """Normalises raw channel events into NormalisedMessages.

    One adapter instance serves all channels: each normalise_* method maps
    a channel-specific payload onto the shared NormalisedMessage shape and
    resolves the owning client, so the rest of the pipeline stays
    channel-agnostic.
    """

    def __init__(self, client_configs: Dict[str, ClientConfig]):
        """
        Args:
            client_configs: Dict of client_id → ClientConfig
        """
        self.client_configs = client_configs
        # Build workspace → client_id lookup from each client's declared
        # Slack workspace IDs.
        self._workspace_map: Dict[str, str] = {
            ws_id: cid
            for cid, config in client_configs.items()
            for ws_id in config.slack_workspace_ids
        }
        # Single-tenant convenience: with exactly one client configured,
        # unresolved events fall back to it instead of "unknown".
        self._default_client_id = (
            next(iter(client_configs)) if len(client_configs) == 1 else None
        )

    def normalise_slack(self, event: dict, team_id: Optional[str] = None) -> NormalisedMessage:
        """
        Normalise a Slack event into a NormalisedMessage.

        Args:
            event: Raw Slack event dict (from Bolt SDK)
            team_id: Slack workspace/team ID

        Returns:
            NormalisedMessage
        """
        text = event.get("text", "")
        # Strip bot mention tags (e.g. "<@U123> hello" → "hello").
        text = SLACK_MENTION_PATTERN.sub("", text).strip()

        channel = event.get("channel", "")
        user_id = event.get("user", "")

        # Map workspace to client
        client_id = self._resolve_client_id(team_id)

        return NormalisedMessage(
            text=text,
            channel=channel,
            user_id=user_id,
            client_id=client_id,
            source="slack",
            raw_event=event,
        )

    def normalise_api(self, client_id: str, message: str) -> NormalisedMessage:
        """Normalise a direct API call (POST /chat).

        The caller supplies the client_id explicitly, so no workspace
        resolution is needed.
        """
        return NormalisedMessage(
            text=message,
            channel="api",
            user_id="api_user",
            client_id=client_id,
            source="api",
        )

    def normalise_teams(self, activity: dict) -> NormalisedMessage:
        """
        Normalise a Microsoft Teams Bot Framework activity.

        # TODO: Implement full Teams normalisation when Teams handler is wired up.
        """
        text = activity.get("text", "")
        # Strip Teams bot mention (usually <at>BotName</at>)
        text = re.sub(r"<at>.*?</at>\s*", "", text).strip()

        return NormalisedMessage(
            text=text,
            channel=activity.get("channelId", "teams"),
            user_id=activity.get("from", {}).get("id", ""),
            # Teams→client mapping is not implemented yet; fall back to
            # the single-tenant default.
            client_id=self._default_client_id or "unknown",
            source="teams",
            raw_event=activity,
        )

    def _resolve_client_id(self, workspace_id: Optional[str]) -> str:
        """Resolve a workspace/team ID to a client_id.

        Resolution order: explicit workspace mapping → single-tenant
        default → "unknown" (logged, handled downstream).
        """
        if workspace_id and workspace_id in self._workspace_map:
            return self._workspace_map[workspace_id]
        if self._default_client_id:
            return self._default_client_id
        logger.warning(f"Could not resolve client for workspace: {workspace_id}")
        return "unknown"
|
||||||
@@ -0,0 +1,263 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — Slack Handler (Socket Mode)
|
||||||
|
|
||||||
|
Listens for app_mention and message events via Slack Bolt SDK.
|
||||||
|
Runs in a background thread to not block FastAPI.
|
||||||
|
|
||||||
|
=== SETUP REQUIRED ===
|
||||||
|
Before running, configure these in your .env file:
|
||||||
|
|
||||||
|
SLACK_BOT_TOKEN=xoxb-... ← OAuth & Permissions → Install to Workspace
|
||||||
|
SLACK_APP_TOKEN=xapp-... ← Socket Mode → Generate App-Level Token
|
||||||
|
SLACK_SIGNING_SECRET=... ← Basic Information → App Credentials
|
||||||
|
|
||||||
|
See README.md for detailed Slack app setup instructions.
|
||||||
|
=======================
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from typing import Dict, Optional, Set
|
||||||
|
|
||||||
|
from config.settings import get_settings
|
||||||
|
from config.client_loader import ClientConfig
|
||||||
|
from channels.protocol_adapter import ProtocolAdapter, NormalisedMessage
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Thread pool for running the LLM pipeline without blocking event handlers.
# Slack expects a fast ack from Socket Mode listeners; the heavy
# orchestrator work is submitted here instead.
_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="clawrity-slack")

# Module-level guard: only one SlackHandler should be active at a time —
# a second live Socket Mode connection for the same app would receive
# (and answer) every event twice.
_active_handler: Optional["SlackHandler"] = None
|
||||||
|
|
||||||
|
|
||||||
|
class SlackHandler:
    """Slack Bot using Socket Mode via Bolt SDK.

    Lifecycle: start() validates tokens, builds the Bolt App, registers
    event listeners, then runs the Socket Mode connection in a daemon
    thread so FastAPI is never blocked. Event listeners ack immediately
    and hand the heavy pipeline work to the module-level thread pool.
    """

    def __init__(
        self,
        protocol_adapter: ProtocolAdapter,
        client_configs: Dict[str, ClientConfig],
        orchestrator,  # agents.orchestrator.Orchestrator
    ):
        # Stores collaborators and reads Slack credentials from settings;
        # no network connection is made here — that happens in start().
        self.adapter = protocol_adapter
        self.client_configs = client_configs
        self.orchestrator = orchestrator
        # Background thread running the Socket Mode loop (set in start()).
        self._thread: Optional[threading.Thread] = None

        settings = get_settings()

        # ---------------------------------------------------------------
        # Bot Token (xoxb-...) — from .env SLACK_BOT_TOKEN
        # This is the OAuth token installed to your workspace.
        # ---------------------------------------------------------------
        self.bot_token = settings.slack_bot_token

        # ---------------------------------------------------------------
        # App-Level Token (xapp-...) — from .env SLACK_APP_TOKEN
        # Required for Socket Mode. Generated in Slack app settings.
        # ---------------------------------------------------------------
        self.app_token = settings.slack_app_token

        # ---------------------------------------------------------------
        # Signing Secret — from .env SLACK_SIGNING_SECRET
        # Used to verify incoming requests from Slack.
        # ---------------------------------------------------------------
        self.signing_secret = settings.slack_signing_secret

        # Bolt App and SocketModeHandler — created lazily in _setup_app().
        self.app = None
        self.handler = None

        # Deduplication: track recently processed event keys.
        # Slack retries events if the handler is slow to ack — this
        # prevents the same user message being answered twice.
        self._processed_events: Set[str] = set()
        self._processed_lock = threading.Lock()

    def _validate_tokens(self) -> bool:
        """Check that all required Slack tokens are configured.

        Returns True when both the bot token and the app-level token are
        present; False (with a warning log) otherwise. The signing secret
        is not required here — it is passed through as optional.
        """
        if not self.bot_token:
            logger.warning(
                "SLACK_BOT_TOKEN not set. Slack bot will not start. "
                "See README.md → Slack Bot Setup for instructions."
            )
            return False
        if not self.app_token:
            logger.warning(
                "SLACK_APP_TOKEN not set. Socket Mode requires an app-level token. "
                "Go to your Slack app → Socket Mode → Generate Token."
            )
            return False
        return True

    def _is_duplicate_event(self, event: dict) -> bool:
        """Return True if this event was already processed (Slack retry dedup).

        Slack redelivers events when the handler does not ack fast enough,
        and the same user message can arrive via both the app_mention and
        message event types. Several candidate keys are built so dedup
        works across both delivery paths:
          - client_msg_id: unique per user message (message events only,
            NOT present on app_mention events)
          - event_ts: present on both event types
          - type + ts + user: composite fallback
        All keys are registered together so matching ANY one of them on a
        later delivery marks it as a duplicate.
        """
        msg_id = event.get("client_msg_id")
        event_ts = event.get("event_ts") or event.get("ts", "")
        user = event.get("user", "")

        # Build candidate keys
        keys = set()
        if msg_id:
            keys.add(f"msg:{msg_id}")
        if event_ts:
            keys.add(f"ts:{event_ts}")
        # Fallback: combine event type + ts + user for events without client_msg_id
        event_type = event.get("type", "")
        if event_ts and user:
            keys.add(f"evt:{event_type}:{event_ts}:{user}")

        if not keys:
            # No identifying fields at all — process rather than drop.
            return False

        with self._processed_lock:
            # Check ALL keys — if any match, it's a duplicate
            for key in keys:
                if key in self._processed_events:
                    logger.debug(f"Skipping duplicate event (matched key: {key})")
                    return True

            # Register ALL keys so cross-event-type dedup works
            # (app_mention and message for the same user message share event_ts)
            self._processed_events.update(keys)

            # Prune old entries (keep set from growing indefinitely).
            # NOTE(review): set iteration order is arbitrary, so this keeps
            # 200 *arbitrary* entries, not the most recent — acceptable as
            # a size cap, but not an LRU.
            if len(self._processed_events) > 500:
                self._processed_events = set(list(self._processed_events)[-200:])

        return False

    def _setup_app(self):
        """Initialize Slack Bolt App and register event handlers.

        slack_bolt is imported lazily so the dependency is only needed
        when the bot actually starts.
        """
        from slack_bolt import App
        from slack_bolt.adapter.socket_mode import SocketModeHandler

        self.app = App(
            token=self.bot_token,
            signing_secret=self.signing_secret if self.signing_secret else None,
        )

        # Track bot's own user ID to prevent self-response loops
        self._bot_user_id = None
        try:
            auth = self.app.client.auth_test()
            self._bot_user_id = auth.get("user_id", "")
            logger.info(f"Bot user ID: {self._bot_user_id}")
        except Exception as e:
            # Non-fatal: without the ID we still filter on bot_id/subtype.
            logger.warning(f"Could not fetch bot user ID: {e}")

        # --- Event: Bot mentioned in a channel ---
        @self.app.event("app_mention")
        def handle_mention(event, say, context):
            # Return IMMEDIATELY so Slack gets ack — process in background
            if self._is_duplicate_event(event):
                return
            _executor.submit(self._handle_event, event, say, context)

        # --- Event: Direct message to bot ---
        @self.app.event("message")
        def handle_message(event, say, context):
            # Ignore bot's own messages and message_changed events
            if event.get("subtype") in (
                "bot_message",
                "message_changed",
                "message_deleted",
            ):
                return
            if event.get("bot_id"):
                return
            # Ignore if this is from the bot itself
            if self._bot_user_id and event.get("user") == self._bot_user_id:
                return
            # Skip channel messages that contain a bot mention —
            # those are handled by the app_mention handler above.
            # Only process DMs here (channel_type == "im").
            channel_type = event.get("channel_type", "")
            if channel_type != "im":
                return
            if self._is_duplicate_event(event):
                return
            # Return IMMEDIATELY — process in background
            _executor.submit(self._handle_event, event, say, context)

        self.handler = SocketModeHandler(self.app, self.app_token)

    def _handle_event(self, event: dict, say, context):
        """Process an incoming Slack event (runs in a worker thread).

        Normalises the event, resolves the client, runs the async
        orchestrator pipeline on a fresh event loop (worker threads have
        no running loop), and replies via `say`. All failures are caught
        so a broken request never kills the worker thread.
        """
        try:
            team_id = context.get("team_id", None) if context else None
            message = self.adapter.normalise_slack(event, team_id=team_id)

            if not message.text:
                # Nothing left after stripping the mention — ignore.
                return

            if message.client_id == "unknown":
                say("⚠️ Could not identify your workspace. Please contact support.")
                return

            client_config = self.client_configs.get(message.client_id)
            if not client_config:
                say(f"⚠️ No configuration found for client: {message.client_id}")
                return

            # Run the orchestrator pipeline (async in sync context)
            loop = asyncio.new_event_loop()
            try:
                result = loop.run_until_complete(
                    self.orchestrator.process(message, client_config)
                )
                say(result["response"])
            finally:
                loop.close()

        except Exception as e:
            logger.error(f"Slack event handler error: {e}", exc_info=True)
            say(
                "❌ I encountered an error processing your request. "
                "Please try again or contact support."
            )

    def start(self):
        """Start the Slack bot in a background (daemon) thread.

        No-op when tokens are missing. Stops any previously active handler
        first: two live Socket Mode connections for the same app would
        each receive — and answer — every event.
        """
        global _active_handler

        if not self._validate_tokens():
            logger.info("Slack bot not started — missing tokens")
            return

        # Stop any existing handler to prevent duplicate Socket Mode connections
        if _active_handler is not None:
            logger.info("Stopping previous Slack handler before starting new one")
            _active_handler.stop()
            _active_handler = None

        try:
            self._setup_app()

            def _run():
                logger.info("Starting Slack bot (Socket Mode)...")
                self.handler.start()

            # Daemon thread: the process can exit without joining it.
            self._thread = threading.Thread(target=_run, daemon=True)
            self._thread.start()
            _active_handler = self
            logger.info("Slack bot started in background thread")

        except Exception as e:
            logger.error(f"Failed to start Slack bot: {e}")

    def stop(self):
        """Stop the Slack bot's Socket Mode connection (best-effort)."""
        if self.handler:
            try:
                self.handler.close()
                logger.info("Slack bot stopped")
            except Exception as e:
                logger.warning(f"Error stopping Slack bot: {e}")
|
||||||
@@ -0,0 +1,124 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — Microsoft Teams Handler (STUB)
|
||||||
|
|
||||||
|
Skeleton implementation of the Bot Framework adapter for Microsoft Teams.
|
||||||
|
Proves the multi-channel architecture is real — any channel handler produces
|
||||||
|
NormalisedMessage via ProtocolAdapter, so the entire pipeline works unchanged.
|
||||||
|
|
||||||
|
# TODO: Wire up Azure Bot credentials when ready for Teams demo.
|
||||||
|
# Required: MICROSOFT_APP_ID, MICROSOFT_APP_PASSWORD
|
||||||
|
# Package: botbuilder-core, botbuilder-schema
|
||||||
|
|
||||||
|
Status: NOT IMPLEMENTED — Slack is the priority for development.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
from channels.protocol_adapter import ProtocolAdapter, NormalisedMessage
|
||||||
|
from config.client_loader import ClientConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TeamsHandler:
    """
    Microsoft Teams bot handler stub.

    Architecture:
        Teams Activity → ProtocolAdapter.normalise_teams() → Orchestrator → Response

    The same pipeline used by Slack — zero business logic in this layer.
    Status: NOT IMPLEMENTED — Slack is the development priority; this class
    exists to prove the multi-channel architecture.
    """

    def __init__(
        self,
        protocol_adapter: ProtocolAdapter,
        client_configs: Dict[str, ClientConfig],
        orchestrator,  # agents.orchestrator.Orchestrator
    ):
        self.adapter = protocol_adapter
        self.client_configs = client_configs
        self.orchestrator = orchestrator
        # TODO: Wire up Azure Bot credentials from .env
        # (MICROSOFT_APP_ID / MICROSOFT_APP_PASSWORD → self.app_id / self.app_password)

    async def handle_activity(self, activity: dict) -> str:
        """
        Process an incoming Teams Bot Framework activity.

        Flow: normalise the raw activity via the ProtocolAdapter, look up the
        client's config, and hand the message to the Orchestrator; the
        orchestrator's response text is returned to Teams.

        Args:
            activity: Raw Bot Framework activity dict

        Returns:
            Response text to send back to Teams
        """
        msg = self.adapter.normalise_teams(activity)

        cfg = self.client_configs.get(msg.client_id)
        if not cfg:
            return f"No configuration found for client: {msg.client_id}"

        outcome = await self.orchestrator.process(msg, cfg)
        return outcome["response"]

    def setup_routes(self, app):
        """
        Register Teams webhook endpoint with FastAPI.

        Currently a stub: logs how to enable Teams and registers nothing.
        A real implementation needs the botbuilder-core package, a
        BotFrameworkAdapter built from app_id + app_password, and a
        POST /api/teams/messages route that deserializes each Activity and
        feeds it through handle_activity() via the Bot Framework turn context.
        """
        logger.info(
            "Teams handler stub loaded. "
            "To enable Teams: install botbuilder-core, set Azure Bot credentials."
        )
|
||||||
@@ -0,0 +1,158 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — Client Configuration Loader
|
||||||
|
|
||||||
|
Scans config/clients/ for YAML files and parses each into a ClientConfig model.
|
||||||
|
Supports ${ENV_VAR} interpolation in YAML values.
|
||||||
|
New client = new YAML file. Zero code changes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import glob
|
||||||
|
import logging
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from config.settings import get_settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Pydantic models for client YAML structure
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class DataSourceConfig(BaseModel):
    """Where a client's raw data lives and which connector loads it."""

    # Connector type; "csv" is handled by CSVConnector (covers CSV and Excel).
    type: str = "csv"
    # Path to the data file, relative to the repo root.
    path: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class DatabaseConfig(BaseModel):
    """Per-client PostgreSQL connection details."""

    # Full connection URL; typically "${DATABASE_URL}" interpolated from .env.
    url: str = ""
    # Called schema_name because 'schema' is a Pydantic reserved attr;
    # load_client_configs() maps the YAML key 'schema' onto this field.
    schema_name: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class ScoutConfig(BaseModel):
    """Settings for the Scout (market-intelligence) agent."""

    # Industry sector used to scope news searches.
    sector: str = ""
    # Competitor names to track in news results.
    competitors: List[str] = []
    # Search keywords for sector news queries.
    keywords: List[str] = []
    # How many days back to look for news items.
    news_lookback_days: int = 1
|
||||||
|
|
||||||
|
|
||||||
|
class ClientConfig(BaseModel):
    """Full per-client configuration parsed from one YAML file.

    New client = new YAML file; this model is the only contract the rest of
    the pipeline sees.
    """

    # Unique key for this client; required — files without it are skipped.
    client_id: str
    client_name: str = ""

    # Where the client's data lives and how to reach its database.
    data_source: DataSourceConfig = DataSourceConfig()
    database: DatabaseConfig = DatabaseConfig()

    # Markets the client operates in.
    countries: List[str] = []
    # Thresholds consumed by downstream agents (fractions; TODO confirm ranges).
    risk_threshold: float = 0.15
    hallucination_threshold: float = 0.75

    # Daily digest delivery time ("HH:MM") and its timezone name.
    digest_schedule: str = "08:00"
    timezone: str = "UTC"

    # Delivery channels, e.g. {"slack_webhook": "<url>"}.
    channels: Dict[str, str] = {}

    # Markdown files shaping the client's persona and scheduled heartbeat.
    soul_file: str = ""
    heartbeat_file: str = ""

    # Maps raw source column names → canonical names (see ETL normaliser).
    column_mapping: Dict[str, str] = {}

    # Market-intelligence (Scout agent) settings.
    scout: ScoutConfig = ScoutConfig()

    # Runtime: workspace/team ID → client_id mapping for ProtocolAdapter
    slack_workspace_ids: List[str] = []
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Environment variable interpolation
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Matches ${VAR_NAME} placeholders inside YAML string values.
_ENV_PATTERN = re.compile(r"\$\{(\w+)\}")


def _interpolate_env(value: str) -> str:
    """Replace ${ENV_VAR} placeholders with actual environment variable values.

    Unset variables are left as the literal ``${VAR}`` text, so a missing
    secret stays visible instead of silently becoming "". Non-string values
    pass through unchanged.
    """
    def _replace(match):
        var_name = match.group(1)
        # Fall back to the original placeholder when the variable is unset.
        return os.environ.get(var_name, match.group(0))

    if isinstance(value, str):
        return _ENV_PATTERN.sub(_replace, value)
    return value


def _interpolate_dict(d: dict) -> dict:
    """Recursively interpolate environment variables in a dictionary.

    Recurses into nested dicts and into list items — including dicts inside
    lists (a YAML "list of mappings"), which the previous version skipped.
    All other value types pass through untouched.
    """
    result = {}
    for key, value in d.items():
        if isinstance(value, dict):
            result[key] = _interpolate_dict(value)
        elif isinstance(value, list):
            # _interpolate_env() already passes non-strings through unchanged.
            result[key] = [
                _interpolate_dict(v) if isinstance(v, dict) else _interpolate_env(v)
                for v in value
            ]
        elif isinstance(value, str):
            result[key] = _interpolate_env(value)
        else:
            result[key] = value
    return result
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Loader
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def load_client_configs(config_dir: Optional[str] = None) -> Dict[str, ClientConfig]:
    """
    Load all client YAML files from the config directory.

    Accepts both .yaml and .yml extensions (the old code silently ignored
    .yml files) and loads in sorted path order so results are deterministic
    across platforms. A file that fails to parse is logged and skipped; it
    never aborts loading of the other clients.

    Args:
        config_dir: Directory to scan; defaults to settings.clients_config_dir.

    Returns:
        Dict mapping client_id → ClientConfig
    """
    if config_dir is None:
        config_dir = get_settings().clients_config_dir

    configs: Dict[str, ClientConfig] = {}
    # Both common YAML extensions; sorted for a deterministic load order.
    yaml_paths = sorted(
        glob.glob(os.path.join(config_dir, "*.yaml"))
        + glob.glob(os.path.join(config_dir, "*.yml"))
    )

    for yaml_path in yaml_paths:
        try:
            with open(yaml_path, "r") as f:
                raw = yaml.safe_load(f)

            # client_id is mandatory — it keys every downstream lookup.
            if not raw or "client_id" not in raw:
                logger.warning(f"Skipping {yaml_path}: missing client_id")
                continue

            # Interpolate ${ENV_VAR} placeholders before validation.
            interpolated = _interpolate_dict(raw)

            # Handle 'schema' → 'schema_name' mapping for Pydantic
            # ('schema' is a reserved attribute on BaseModel).
            if "database" in interpolated and "schema" in interpolated["database"]:
                interpolated["database"]["schema_name"] = interpolated["database"].pop("schema")

            config = ClientConfig(**interpolated)
            configs[config.client_id] = config
            logger.info(f"Loaded client config: {config.client_id} from {yaml_path}")

        except Exception as e:
            logger.error(f"Error loading {yaml_path}: {e}")

    if not configs:
        logger.warning(f"No client configs found in {config_dir}")

    return configs
|
||||||
|
|
||||||
|
|
||||||
|
def get_client_config(client_id: str, configs: Optional[Dict[str, ClientConfig]] = None) -> Optional[ClientConfig]:
    """Look up one client's config by ID, loading all configs when none are given."""
    lookup = load_client_configs() if configs is None else configs
    return lookup.get(client_id)
|
||||||
@@ -0,0 +1,36 @@
|
|||||||
|
client_id: acme_corp
|
||||||
|
client_name: ACME Corporation
|
||||||
|
|
||||||
|
data_source:
|
||||||
|
type: "csv"
|
||||||
|
path: "data/processed/acme_merged.csv"
|
||||||
|
|
||||||
|
database:
|
||||||
|
url: "${DATABASE_URL}"
|
||||||
|
schema: "acme"
|
||||||
|
|
||||||
|
countries: ["US", "Canada", "MENA"]
|
||||||
|
risk_threshold: 0.15
|
||||||
|
hallucination_threshold: 0.75
|
||||||
|
|
||||||
|
digest_schedule: "08:00"
|
||||||
|
timezone: "Asia/Kolkata"
|
||||||
|
|
||||||
|
channels:
|
||||||
|
slack_webhook: "${ACME_SLACK_WEBHOOK}"
|
||||||
|
|
||||||
|
soul_file: "soul/acme_soul.md"
|
||||||
|
heartbeat_file: "heartbeat/acme_heartbeat.md"
|
||||||
|
|
||||||
|
column_mapping:
|
||||||
|
Order Date: date
|
||||||
|
Country: country
|
||||||
|
City: branch
|
||||||
|
Sales: revenue
|
||||||
|
Profit: profit
|
||||||
|
|
||||||
|
scout:
|
||||||
|
sector: "global retail"
|
||||||
|
competitors: ["IKEA", "Amazon", "Walmart", "Staples"]
|
||||||
|
keywords: ["retail supply chain", "furniture market trends", "office supplies demand", "global retail ecommerce"]
|
||||||
|
news_lookback_days: 1
|
||||||
@@ -0,0 +1,76 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — LLM Client Factory
|
||||||
|
|
||||||
|
Provides a unified LLM client that works with both NVIDIA NIM and Groq.
|
||||||
|
Both are OpenAI-compatible APIs, so we use the OpenAI client with different
|
||||||
|
base URLs and API keys.
|
||||||
|
|
||||||
|
Auto-detects provider from settings:
|
||||||
|
- NVIDIA NIM: base_url="https://integrate.api.nvidia.com/v1"
|
||||||
|
- Groq: base_url="https://api.groq.com/openai/v1"
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
from config.settings import get_settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Provider configs: OpenAI-compatible endpoints and their default models.
_PROVIDERS = {
    "nvidia": {
        "base_url": "https://integrate.api.nvidia.com/v1",
        "default_model": "meta/llama-3.3-70b-instruct",
    },
    "groq": {
        "base_url": "https://api.groq.com/openai/v1",
        "default_model": "llama-3.3-70b-versatile",
    },
}


def get_llm_client() -> OpenAI:
    """
    Construct an OpenAI-compatible client for the active LLM provider.

    NVIDIA NIM and Groq both speak the OpenAI wire protocol, so only the
    base URL and API key differ between them.

    Raises:
        ValueError: for an unknown provider or a missing API key.
    """
    settings = get_settings()
    provider = settings.active_llm_provider

    if provider not in _PROVIDERS:
        raise ValueError(f"Unknown LLM provider: {provider}")

    # Settings exposes one key per provider: nvidia_api_key / groq_api_key.
    api_key = getattr(settings, f"{provider}_api_key")
    if not api_key:
        raise ValueError(
            f"No API key configured for LLM provider '{provider}'. "
            f"Set {'NVIDIA_API_KEY' if provider == 'nvidia' else 'GROQ_API_KEY'} in .env"
        )

    provider_cfg = _PROVIDERS[provider]
    client = OpenAI(
        api_key=api_key,
        base_url=provider_cfg["base_url"],
    )

    logger.info(f"LLM client: {provider} ({provider_cfg['base_url']})")
    return client
|
||||||
|
|
||||||
|
|
||||||
|
def get_model_name() -> str:
    """Return the model name to use for the active provider.

    If the configured model is the *other* provider's default (meaning the
    user never chose one explicitly), substitute the active provider's own
    default; an explicitly chosen custom model is always respected.
    """
    settings = get_settings()
    provider = settings.active_llm_provider
    model = settings.llm_model

    own_default = _PROVIDERS.get(provider, {}).get("default_model")
    foreign_defaults = {
        cfg["default_model"]
        for name, cfg in _PROVIDERS.items()
        if name != provider
    }
    if own_default and model in foreign_defaults:
        model = own_default

    return model
|
||||||
@@ -0,0 +1,72 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — Application Settings
|
||||||
|
|
||||||
|
Loads environment variables via pydantic-settings.
|
||||||
|
All secrets read from .env file — nothing is hardcoded.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from functools import lru_cache
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pydantic_settings import BaseSettings
|
||||||
|
|
||||||
|
|
||||||
|
class Settings(BaseSettings):
    """Application settings loaded from environment variables.

    All secrets come from .env (see model_config); nothing is hardcoded.
    Field names map case-insensitively to env var names (e.g. GROQ_API_KEY).
    """

    # --- Database ---
    # Default matches the docker-compose postgres service; override in .env.
    database_url: str = "postgresql://user:pass@localhost:5432/clawrity"

    # --- LLM Providers ---
    # Only one key is needed; active_llm_provider picks whichever is set.
    groq_api_key: str = ""
    nvidia_api_key: str = ""

    # --- Slack (Socket Mode) ---
    # Bot Token (xoxb-...) — OAuth & Permissions → Install to Workspace
    slack_bot_token: str = ""
    # App-Level Token (xapp-...) — Socket Mode → Generate Token
    slack_app_token: str = ""
    # Signing Secret — Basic Information → App Credentials
    slack_signing_secret: str = ""

    # --- Tavily Web Search ---
    tavily_api_key: str = ""

    # --- Slack Webhook for digest delivery ---
    acme_slack_webhook: str = ""

    # --- Paths (relative to the repo root) ---
    data_raw_dir: str = "data/raw"
    data_processed_dir: str = "data/processed"
    logs_dir: str = "logs"
    clients_config_dir: str = "config/clients"

    # --- Model Defaults ---
    llm_model: str = "meta/llama-3.3-70b-instruct"
    llm_provider: str = ""  # auto-detected: "nvidia" or "groq"
    embedding_model: str = "all-MiniLM-L6-v2"
    embedding_dim: int = 384

    @property
    def active_llm_provider(self) -> str:
        """Auto-detect which LLM provider to use based on available keys.

        Precedence: explicit llm_provider setting, then an NVIDIA key,
        then a Groq key, then "nvidia" as the final fallback.
        """
        if self.llm_provider:
            return self.llm_provider
        if self.nvidia_api_key:
            return "nvidia"
        if self.groq_api_key:
            return "groq"
        return "nvidia"  # default

    # pydantic-settings v2 configuration: read .env, match vars case-insensitively.
    model_config = {
        "env_file": ".env",
        "env_file_encoding": "utf-8",
        "case_sensitive": False,
    }
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache()
def get_settings() -> Settings:
    """Singleton settings instance. Cached after first call.

    lru_cache gives every caller the same Settings object, so the .env file
    is read only once per process.
    """
    return Settings()
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — Base Data Connector
|
||||||
|
|
||||||
|
Abstract interface for data connectors.
|
||||||
|
All connectors implement load() → pd.DataFrame.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
class BaseConnector(ABC):
    """Contract for data-source connectors.

    Every concrete connector turns some storage format into a pandas
    DataFrame via load() and can sanity-check the result via validate().
    """

    @abstractmethod
    def load(self, path: str, **kwargs) -> pd.DataFrame:
        """Read the source at *path* into a DataFrame.

        Args:
            path: Location of the data source.
            **kwargs: Connector-specific options.

        Returns:
            The loaded data as a pandas DataFrame.
        """

    @abstractmethod
    def validate(self, df: pd.DataFrame, required_columns: list) -> bool:
        """Check that *df* contains every column in *required_columns*.

        Args:
            df: DataFrame to check.
            required_columns: Column names that must be present.

        Returns:
            True when all required columns are found.
        """
|
||||||
@@ -0,0 +1,88 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — CSV/Excel Data Connector
|
||||||
|
|
||||||
|
Auto-detects file format based on extension:
|
||||||
|
.csv → pandas read_csv
|
||||||
|
.xlsx / .xls → pandas read_excel (via openpyxl)
|
||||||
|
|
||||||
|
Supports both formats since Kaggle datasets vary by download version.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from connectors.base_connector import BaseConnector
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CSVConnector(BaseConnector):
    """Connector for CSV and Excel files with auto-detection."""

    def load(self, path: str, **kwargs) -> pd.DataFrame:
        """
        Load data from a CSV or Excel file, auto-detected by extension.

        Args:
            path: Path to the file (.csv, .xlsx, .xls)
            **kwargs: Passed through to the pandas read function.
                Useful kwargs: sheet_name, encoding, sep

        Returns:
            pandas DataFrame

        Raises:
            FileNotFoundError: if the file does not exist.
            ValueError: for an unsupported file extension.
        """
        file_path = Path(path)

        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {path}")

        ext = file_path.suffix.lower()

        if ext == ".csv":
            logger.info(f"Loading CSV: {path}")
            # latin-1 is a forgiving default for Kaggle exports, but an
            # explicit encoding kwarg must win. (Passing encoding= used to
            # raise "got multiple values for keyword argument 'encoding'".)
            kwargs.setdefault("encoding", "latin-1")
            df = pd.read_csv(path, **kwargs)
        elif ext in (".xlsx", ".xls"):
            logger.info(f"Loading Excel ({ext}): {path}")
            # Default to first sheet unless specified
            sheet_name = kwargs.pop("sheet_name", 0)
            df = pd.read_excel(path, sheet_name=sheet_name, engine="openpyxl", **kwargs)
        else:
            raise ValueError(
                f"Unsupported file format: {ext}. "
                f"Supported: .csv, .xlsx, .xls"
            )

        logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns from {file_path.name}")
        return df

    def validate(self, df: pd.DataFrame, required_columns: list) -> bool:
        """
        Validate that the DataFrame has all required columns.
        Matching is case-insensitive and ignores surrounding whitespace.

        Args:
            df: DataFrame to validate
            required_columns: List of column names that must be present

        Returns:
            True if all required columns found
        """
        df_cols_lower = {col.lower().strip() for col in df.columns}
        missing = [
            col for col in required_columns
            if col.lower().strip() not in df_cols_lower
        ]

        if missing:
            logger.error(
                f"Missing required columns: {missing}. "
                f"Available: {list(df.columns)}"
            )
            return False

        return True
|
||||||
@@ -0,0 +1,38 @@
|
|||||||
|
services:
|
||||||
|
clawrity-api:
|
||||||
|
build: .
|
||||||
|
ports:
|
||||||
|
- "8000:8000"
|
||||||
|
environment:
|
||||||
|
- DATABASE_URL=postgresql://user:pass@postgres:5432/clawrity
|
||||||
|
- GROQ_API_KEY=${GROQ_API_KEY}
|
||||||
|
- SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN}
|
||||||
|
- SLACK_APP_TOKEN=${SLACK_APP_TOKEN}
|
||||||
|
- SLACK_SIGNING_SECRET=${SLACK_SIGNING_SECRET}
|
||||||
|
- TAVILY_API_KEY=${TAVILY_API_KEY}
|
||||||
|
- ACME_SLACK_WEBHOOK=${ACME_SLACK_WEBHOOK}
|
||||||
|
depends_on:
|
||||||
|
postgres:
|
||||||
|
condition: service_healthy
|
||||||
|
volumes:
|
||||||
|
- ./data:/app/data
|
||||||
|
- ./logs:/app/logs
|
||||||
|
|
||||||
|
postgres:
|
||||||
|
image: ankane/pgvector
|
||||||
|
environment:
|
||||||
|
POSTGRES_DB: clawrity
|
||||||
|
POSTGRES_USER: user
|
||||||
|
POSTGRES_PASSWORD: pass
|
||||||
|
volumes:
|
||||||
|
- pg_data:/var/lib/postgresql/data
|
||||||
|
ports:
|
||||||
|
- "5432:5432"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "pg_isready -U user -d clawrity"]
|
||||||
|
interval: 5s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 5
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
pg_data:
|
||||||
@@ -0,0 +1,82 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — ETL Normaliser
|
||||||
|
|
||||||
|
Applies column mappings from client YAML, normalises data types,
|
||||||
|
cleans strings, and handles nulls.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def normalise_dataframe(
    df: pd.DataFrame,
    column_mapping: Dict[str, str],
    date_column: str = "date",
) -> pd.DataFrame:
    """Normalise a DataFrame using the client's column mapping.

    Renames columns (exact stripped match first, then case-insensitive),
    parses dates and drops rows whose date cannot be parsed, tidies
    categorical string columns, coerces metric columns to numeric, and
    removes duplicate rows.
    """
    df = df.copy()
    original_len = len(df)

    # Step 1: build source → canonical rename map (case-insensitive fallback).
    stripped_to_actual = {col.strip(): col for col in df.columns}
    rename_map = {}
    for source, target in column_mapping.items():
        if source in stripped_to_actual:
            rename_map[stripped_to_actual[source]] = target
            continue
        lowered = source.lower()
        matched = next(
            (actual for stripped, actual in stripped_to_actual.items()
             if stripped.lower() == lowered),
            None,
        )
        if matched is not None:
            rename_map[matched] = target
    if rename_map:
        df = df.rename(columns=rename_map)
        logger.info(f"Renamed columns: {rename_map}")

    # Step 2: parse dates; unparseable rows are discarded.
    if date_column in df.columns:
        df[date_column] = pd.to_datetime(df[date_column], errors="coerce")
        df = df.dropna(subset=[date_column])
        df[date_column] = df[date_column].dt.date

    # Step 3: tidy categorical string columns (title-case, null-out blanks).
    for col in ("country", "branch", "channel"):
        if col in df.columns:
            cleaned = df[col].astype(str).str.strip().str.title()
            df[col] = cleaned.replace({"Nan": None, "None": None, "": None})

    # Step 4: coerce metric columns to numeric (bad values become NaN).
    for col in ("spend", "revenue", "profit", "leads", "conversions"):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Step 5: drop exact duplicate rows.
    df = df.drop_duplicates()
    dropped = original_len - len(df)
    if dropped > 0:
        logger.info(f"Removed {dropped} duplicate rows")

    logger.info(f"Normalisation complete: {len(df)} rows")
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def remove_outliers(df: pd.DataFrame, columns: list, n_std: float = 3.0) -> pd.DataFrame:
    """Drop rows whose value in any listed column lies more than *n_std*
    standard deviations from that column's mean.

    Columns that are absent or non-numeric are skipped. Filtering is applied
    column by column, so later columns see statistics of the already-filtered
    frame. A zero standard deviation disables filtering for that column.
    """
    df = df.copy()
    before = len(df)

    for col in columns:
        if col not in df.columns:
            continue
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        mean = df[col].mean()
        std = df[col].std()
        if std > 0:
            within_bounds = (df[col] - mean).abs() <= n_std * std
            df = df[within_bounds]

    removed = before - len(df)
    if removed > 0:
        logger.info(f"Removed {removed} outlier rows (>{n_std} std devs)")
    return df
|
||||||
@@ -0,0 +1,173 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — Prophet Forecasting Engine
|
||||||
|
|
||||||
|
Trains Prophet models on branch-level monthly revenue time series.
|
||||||
|
Forecasts 6 months ahead. Caches results in PostgreSQL forecasts table.
|
||||||
|
|
||||||
|
Limitations (be explicit):
|
||||||
|
- Predicts revenue TRENDS only
|
||||||
|
- Does NOT claim ROI-per-dollar forecasting (spend→revenue is approximate)
|
||||||
|
- Requires minimum 2 years of data per branch
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from skills.postgres_connector import get_connector
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MIN_MONTHS = 24 # Minimum 2 years of data
|
||||||
|
FORECAST_MONTHS = 6
|
||||||
|
|
||||||
|
|
||||||
|
class ProphetEngine:
    """Time series forecasting using Facebook Prophet.

    Trains one model per (branch, country) pair on monthly revenue and
    caches a FORECAST_MONTHS-month forecast in the PostgreSQL ``forecasts``
    table. Forecasts revenue trends only — not per-dollar ROI.
    """

    def train_and_forecast(self, client_id: str) -> List[Dict]:
        """
        Train Prophet models for each branch and cache forecasts.

        Branches with fewer than MIN_MONTHS months of history are skipped,
        and a failure on one branch is logged without aborting the others.

        Args:
            client_id: Client to forecast for

        Returns:
            List of forecast result dicts (one per branch)
        """
        # Imported lazily: prophet is heavy and only needed when training.
        from prophet import Prophet

        db = get_connector()

        # Aggregate raw spend_data into a monthly revenue series per branch.
        sql = """
            SELECT branch, country,
                   DATE_TRUNC('month', date) AS month,
                   SUM(revenue) AS monthly_revenue
            FROM spend_data
            WHERE client_id = %s
            GROUP BY branch, country, DATE_TRUNC('month', date)
            ORDER BY branch, month
        """
        df = db.execute_query(sql, (client_id,))

        if df.empty:
            logger.warning(f"No data for forecasting: {client_id}")
            return []

        results = []
        branches = df.groupby(["branch", "country"])

        for (branch, country), group in branches:
            group = group.sort_values("month").reset_index(drop=True)

            # Enforce the minimum-history requirement (2 years of months).
            if len(group) < MIN_MONTHS:
                logger.info(
                    f"Skipping {branch} ({country}): only {len(group)} months "
                    f"(need {MIN_MONTHS})"
                )
                continue

            try:
                # Prepare Prophet format: ds (date), y (value)
                prophet_df = pd.DataFrame({
                    "ds": pd.to_datetime(group["month"]),
                    "y": group["monthly_revenue"].astype(float),
                })

                # Monthly data: only yearly seasonality is meaningful.
                model = Prophet(
                    yearly_seasonality=True,
                    weekly_seasonality=False,
                    daily_seasonality=False,
                )
                model.fit(prophet_df)

                # Extend FORECAST_MONTHS month-starts ("MS") past the history.
                future = model.make_future_dataframe(
                    periods=FORECAST_MONTHS, freq="MS"
                )
                forecast = model.predict(future)

                # predict() returns history + future; keep only the future rows.
                forecast_only = forecast.tail(FORECAST_MONTHS)

                # JSON-serialisable payload; yhat_lower/yhat_upper are
                # Prophet's uncertainty-interval bounds.
                forecast_data = {
                    "branch": branch,
                    "country": country,
                    "horizon_months": FORECAST_MONTHS,
                    "dates": forecast_only["ds"].dt.strftime("%Y-%m-%d").tolist(),
                    "forecast_revenue": forecast_only["yhat"].round(2).tolist(),
                    "lower_bound": forecast_only["yhat_lower"].round(2).tolist(),
                    "upper_bound": forecast_only["yhat_upper"].round(2).tolist(),
                    "computed_at": datetime.utcnow().isoformat(),
                }

                # Cache in PostgreSQL
                self._cache_forecast(client_id, forecast_data)
                results.append(forecast_data)

                logger.info(
                    f"Forecast generated for {branch} ({country}): "
                    f"{FORECAST_MONTHS} months ahead"
                )

            except Exception as e:
                logger.error(f"Prophet failed for {branch} ({country}): {e}")

        logger.info(f"Forecasting complete: {len(results)} branches forecast")
        return results

    def get_cached_forecast(
        self,
        client_id: str,
        branch: str,
    ) -> Optional[Dict]:
        """Get the most recent cached forecast for a branch.

        Returns None when nothing has been cached yet. forecast_data may
        arrive from the driver as JSON text or an already-parsed dict;
        both are handled.
        """
        db = get_connector()

        sql = """
            SELECT forecast_data, computed_at
            FROM forecasts
            WHERE client_id = %s AND branch = %s
            ORDER BY computed_at DESC
            LIMIT 1
        """
        rows = db.execute_raw(sql, (client_id, branch))

        if not rows:
            return None

        row = rows[0]
        data = row["forecast_data"]
        # Some drivers return JSONB as a dict, others as a string.
        if isinstance(data, str):
            data = json.loads(data)

        # Surface the row's timestamp (authoritative) over the payload's.
        data["computed_at"] = str(row["computed_at"])
        return data

    def _cache_forecast(self, client_id: str, forecast_data: Dict):
        """Store forecast in PostgreSQL, replacing any earlier forecast for
        the same (client, branch, country)."""
        db = get_connector()

        # Delete old forecast for this branch
        db.execute_write(
            "DELETE FROM forecasts WHERE client_id = %s AND branch = %s AND country = %s",
            (client_id, forecast_data["branch"], forecast_data["country"]),
        )

        # Insert new
        db.execute_write(
            """INSERT INTO forecasts (client_id, branch, country, horizon_months, forecast_data)
               VALUES (%s, %s, %s, %s, %s)""",
            (
                client_id,
                forecast_data["branch"],
                forecast_data["country"],
                forecast_data["horizon_months"],
                json.dumps(forecast_data),
            ),
        )
|
||||||
@@ -0,0 +1,18 @@
|
|||||||
|
# HEARTBEAT — ACME Corporation
|
||||||
|
|
||||||
|
## Schedule
|
||||||
|
- trigger: daily
|
||||||
|
- time: "08:00"
|
||||||
|
- timezone: "Asia/Kolkata"
|
||||||
|
|
||||||
|
## Digest Tasks
|
||||||
|
1. Pull last 7 days spend + revenue per branch
|
||||||
|
2. Identify bottom 3 performing branches by revenue
|
||||||
|
3. Generate newsletter-style summary via Gen Agent → QA Agent
|
||||||
|
4. Run Scout Agent for competitor + sector news
|
||||||
|
5. Append Market Intelligence section to digest
|
||||||
|
6. Push complete digest to Slack channel
|
||||||
|
|
||||||
|
## Retry
|
||||||
|
- on_failure: retry after 15 minutes
|
||||||
|
- max_retries: 3
|
||||||
@@ -0,0 +1,124 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — HEARTBEAT Loader
|
||||||
|
|
||||||
|
Parses HEARTBEAT.md files to extract schedule, digest tasks, and retry config.
|
||||||
|
HEARTBEAT.md drives autonomous daily digest generation per client.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Dict, Any
|
||||||
|
|
||||||
|
from config.client_loader import ClientConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class HeartbeatConfig:
    """Parsed heartbeat configuration: schedule, retry policy, and tasks."""

    def __init__(self):
        # Defaults apply when HEARTBEAT.md is absent or fails to parse.
        self.trigger: str = "daily"
        self.time: str = "08:00"
        self.timezone: str = "UTC"
        self.retry_delay_minutes: int = 15
        self.max_retries: int = 3
        self.tasks: list = []
        self.raw_content: str = ""

    @property
    def hour(self) -> int:
        """Hour component of the scheduled "HH:MM" time string."""
        pieces = self.time.split(":")
        return int(pieces[0])

    @property
    def minute(self) -> int:
        """Minute component of the scheduled "HH:MM" time string."""
        pieces = self.time.split(":")
        return int(pieces[1])
|
||||||
|
|
||||||
|
|
||||||
|
def load_heartbeat(client_config: ClientConfig) -> HeartbeatConfig:
    """
    Load and parse the HEARTBEAT.md file for a client.

    Args:
        client_config: The client's configuration containing heartbeat_file path.

    Returns:
        Parsed HeartbeatConfig with schedule, tasks, and retry settings.
        Falls back to the client YAML digest schedule when the file is
        missing or cannot be parsed.
    """
    config = HeartbeatConfig()
    # The client YAML timezone is the baseline; HEARTBEAT.md may override it.
    config.timezone = client_config.timezone

    heartbeat_path = Path(client_config.heartbeat_file)
    if not heartbeat_path.exists():
        logger.warning(
            f"HEARTBEAT file not found at {heartbeat_path} for client "
            f"{client_config.client_id}. Using defaults from client YAML."
        )
        config.time = client_config.digest_schedule
        return config

    try:
        raw = heartbeat_path.read_text(encoding="utf-8")
        config.raw_content = raw
        _parse_heartbeat(raw, config)
        logger.info(
            f"Loaded HEARTBEAT for {client_config.client_id}: "
            f"{config.trigger} at {config.time} {config.timezone}"
        )
    except Exception as e:
        logger.error(f"Error parsing HEARTBEAT file {heartbeat_path}: {e}")
        config.time = client_config.digest_schedule

    return config
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_heartbeat(content: str, config: HeartbeatConfig) -> None:
|
||||||
|
"""Parse markdown content and extract structured config."""
|
||||||
|
lines = content.split("\n")
|
||||||
|
|
||||||
|
current_section = None
|
||||||
|
task_lines = []
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
stripped = line.strip()
|
||||||
|
|
||||||
|
# Detect section headers
|
||||||
|
if stripped.startswith("## "):
|
||||||
|
current_section = stripped[3:].strip().lower()
|
||||||
|
continue
|
||||||
|
|
||||||
|
if current_section == "schedule":
|
||||||
|
# Parse key-value pairs like "- trigger: daily"
|
||||||
|
match = re.match(r"-\s*(\w+):\s*\"?([^\"]+)\"?", stripped)
|
||||||
|
if match:
|
||||||
|
key, value = match.group(1).strip(), match.group(2).strip()
|
||||||
|
if key == "trigger":
|
||||||
|
config.trigger = value
|
||||||
|
elif key == "time":
|
||||||
|
config.time = value
|
||||||
|
elif key == "timezone":
|
||||||
|
config.timezone = value
|
||||||
|
|
||||||
|
elif current_section == "digest tasks":
|
||||||
|
# Parse numbered list items
|
||||||
|
match = re.match(r"\d+\.\s+(.*)", stripped)
|
||||||
|
if match:
|
||||||
|
config.tasks.append(match.group(1).strip())
|
||||||
|
|
||||||
|
elif current_section == "retry":
|
||||||
|
# Parse retry config
|
||||||
|
match = re.match(r"-\s*(\w+):\s*(.+)", stripped)
|
||||||
|
if match:
|
||||||
|
key, value = match.group(1).strip(), match.group(2).strip()
|
||||||
|
if "retry" in key and "after" in value:
|
||||||
|
# Extract minutes from "retry after 15 minutes"
|
||||||
|
mins = re.search(r"(\d+)", value)
|
||||||
|
if mins:
|
||||||
|
config.retry_delay_minutes = int(mins.group(1))
|
||||||
|
elif key == "max_retries":
|
||||||
|
config.max_retries = int(value)
|
||||||
@@ -0,0 +1,295 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — HEARTBEAT Scheduler
|
||||||
|
|
||||||
|
APScheduler AsyncIOScheduler fires digest jobs per client at configured times.
|
||||||
|
Schedule: ETL at 02:00 → RAG re-index at 03:00 → Digest + Scout at configured time.
|
||||||
|
Retry: on failure, retry after N minutes, max retries from HEARTBEAT.md.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||||
|
from apscheduler.triggers.cron import CronTrigger
|
||||||
|
|
||||||
|
from agents.orchestrator import Orchestrator
|
||||||
|
from channels.protocol_adapter import NormalisedMessage
|
||||||
|
from config.client_loader import ClientConfig
|
||||||
|
from config.settings import get_settings
|
||||||
|
from heartbeat.heartbeat_loader import load_heartbeat
|
||||||
|
from skills.postgres_connector import get_connector
|
||||||
|
from soul.soul_loader import load_soul
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_digest(
    client_config: ClientConfig,
    orchestrator: Orchestrator,
    retry_count: int = 0,
) -> Optional[str]:
    """
    Run the daily digest for a client.

    Steps:
        1. Query bottom 3 branches by revenue (last 7 days)
        2. Gen Agent → QA Agent pipeline for digest
        3. Scout Agent for competitor/sector news
        4. Push to Slack webhook
        5. Log success/failure to JSONL

    Args:
        client_config: Per-client settings (client_id, soul, Slack webhook, ...).
        orchestrator: Shared orchestrator; its optional retriever supplies RAG context.
        retry_count: Zero-based attempt counter, incremented on recursive retries.

    Returns:
        Full digest text if successful, None on failure
    """
    # NOTE(review): imported lazily rather than at module top — presumably to
    # avoid a circular import at load time; confirm before hoisting.
    from agents.gen_agent import GenAgent
    from agents.qa_agent import QAAgent

    client_id = client_config.client_id
    logger.info(f"[{client_id}] Running daily digest (attempt {retry_count + 1})")

    db = get_connector()

    try:
        # Step 1: Get bottom 3 branches by revenue with ROI.
        # NULLIF guards the ROI division when a branch has zero spend.
        bottom_sql = """
            SELECT branch, country,
                   SUM(revenue) as total_revenue,
                   SUM(spend) as total_spend,
                   SUM(leads) as total_leads,
                   ROUND((SUM(revenue)/NULLIF(SUM(spend),0))::numeric, 2) as roi
            FROM spend_data
            WHERE client_id = %s
              AND date >= CURRENT_DATE - INTERVAL '7 days'
            GROUP BY branch, country
            ORDER BY total_revenue ASC
            LIMIT 3
        """
        data = db.execute_query(bottom_sql, (client_id,))

        # Step 2: Generate digest via Gen Agent with specific prompt
        soul_content = load_soul(client_config)
        gen_agent = GenAgent()
        qa_agent = QAAgent()

        # Retrieve RAG chunks for digest context. Best-effort: a retrieval
        # failure degrades the digest rather than aborting it.
        rag_chunks = None
        if orchestrator.retriever:
            try:
                rag_chunks = orchestrator.retriever.retrieve(
                    query="weekly performance bottom performers budget recommendations",
                    client_id=client_id,
                )
            except Exception as e:
                logger.warning(f"RAG retrieval for digest failed: {e}")

        # Generate digest with explicit prompt
        digest = gen_agent.generate(
            question="Generate morning business digest. Highlight bottom 3 branches. Suggest where to focus budget. Newsletter style.",
            soul_content=soul_content,
            data_context=data,
            rag_chunks=rag_chunks,
        )

        # Step 2b: QA pass on digest (more lenient threshold for digest)
        qa_result = qa_agent.evaluate(
            response=digest,
            data_context=data,
            threshold=0.6,  # More lenient for digest
        )

        if not qa_result["passed"]:
            logger.warning(
                f"[{client_id}] Digest QA failed (score={qa_result['score']:.2f}), "
                f"retrying with strict instruction"
            )
            # Retry digest generation with strict instruction. Note: the
            # second attempt is used as-is — it is not re-scored by QA.
            digest = gen_agent.generate(
                question="Generate morning business digest. Highlight bottom 3 branches. Suggest where to focus budget. Newsletter style.",
                soul_content=soul_content,
                data_context=data,
                rag_chunks=rag_chunks,
                retry_issues=qa_result["issues"],
                retry_count=1,
                strict_data_instruction=(
                    "CRITICAL: Only mention branches and figures that appear in the "
                    "Data Context. Do not reference any other branches or historical data."
                ),
            )

        # Step 3: Scout Agent for competitor/sector news (optional section;
        # failures are logged and the digest ships without it)
        scout_section = None
        try:
            from agents.scout_agent import ScoutAgent
            scout = ScoutAgent()
            scout_section = await scout.gather_intelligence(client_config)
        except Exception as e:
            logger.warning(f"Scout Agent failed: {e}")

        # Step 4: Assemble full digest
        full_digest = f"📊 **Clawrity Daily Digest — {client_config.client_name}**\n"
        full_digest += f"*{datetime.now().strftime('%B %d, %Y')}*\n\n"
        full_digest += digest

        if scout_section:
            full_digest += f"\n\n---\n\n{scout_section}"

        # Step 5: Push to Slack webhook
        webhook_url = client_config.channels.get("slack_webhook", "")
        if webhook_url:
            await _push_to_slack(webhook_url, full_digest)
        else:
            logger.warning(f"[{client_id}] No Slack webhook configured")

        # Step 6: Log success to JSONL
        _log_digest_event(client_id, "success", {
            "qa_score": qa_result["score"],
            "qa_passed": qa_result["passed"],
            "scout_included": scout_section is not None,
            "digest_length": len(full_digest),
        })

        logger.info(f"[{client_id}] Digest completed successfully")
        return full_digest

    except Exception as e:
        logger.error(f"[{client_id}] Digest failed: {e}", exc_info=True)
        _log_digest_event(client_id, "failure", {"error": str(e), "attempt": retry_count + 1})

        # Retry policy comes from HEARTBEAT.md (delay + max attempts).
        heartbeat = load_heartbeat(client_config)

        if retry_count < heartbeat.max_retries:
            delay_minutes = heartbeat.retry_delay_minutes
            logger.info(
                f"[{client_id}] Scheduling digest retry in {delay_minutes} minutes "
                f"(attempt {retry_count + 2}/{heartbeat.max_retries + 1})"
            )
            # NOTE(review): this in-place sleep keeps the coroutine alive for
            # the whole delay instead of rescheduling a job — confirm that is
            # acceptable for the scheduler's job slot.
            await asyncio.sleep(delay_minutes * 60)
            return await run_digest(client_config, orchestrator, retry_count + 1)
        else:
            logger.error(f"[{client_id}] Digest failed after {heartbeat.max_retries + 1} attempts")
            # Post failure notification to Slack
            webhook_url = client_config.channels.get("slack_webhook", "")
            if webhook_url:
                await _push_to_slack(
                    webhook_url,
                    "Clawrity digest unavailable. Backend may be offline."
                )
            return None
|
||||||
|
|
||||||
|
|
||||||
|
async def _push_to_slack(webhook_url: str, message: str):
    """POST *message* to a Slack incoming webhook, logging the outcome."""
    try:
        async with httpx.AsyncClient() as http:
            resp = await http.post(
                webhook_url,
                json={"text": message},
                timeout=30,
            )
        if resp.status_code == 200:
            logger.info("Digest pushed to Slack successfully")
        else:
            logger.error(f"Slack webhook returned {resp.status_code}: {resp.text}")
    except Exception as e:
        logger.error(f"Failed to push digest to Slack: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def _log_digest_event(client_id: str, status: str, details: dict):
    """Append a digest lifecycle event to the client's JSONL monitoring log.

    Args:
        client_id: Client the event belongs to (also names the log file).
        status: "success" or "failure".
        details: Extra fields merged into the JSON entry (e.g. qa_score, error).

    Logging failures are swallowed (logged only) so monitoring can never
    break the digest pipeline itself.
    """
    # Local import: the module-level datetime import only brings in the class.
    from datetime import timezone

    settings = get_settings()
    logs_dir = settings.logs_dir
    os.makedirs(logs_dir, exist_ok=True)
    log_path = os.path.join(logs_dir, f"{client_id}_digest.jsonl")

    entry = {
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and
        # produced a naive datetime.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "client_id": client_id,
        "event": "digest",
        "status": status,
        **details,
    }

    try:
        with open(log_path, "a") as f:
            f.write(json.dumps(entry) + "\n")
    except Exception as e:
        logger.error(f"Failed to log digest event: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def start_scheduler(
    client_configs: Dict[str, ClientConfig],
    orchestrator: Orchestrator,
) -> AsyncIOScheduler:
    """
    Start the APScheduler with digest jobs for all clients.

    Schedule per client:
        - Digest at configured time (from HEARTBEAT.md)
        - ETL sync at 02:00 (placeholder)
        - RAG re-index at 03:00 (placeholder)
    """
    scheduler = AsyncIOScheduler()

    for client_id, cfg in client_configs.items():
        hb = load_heartbeat(cfg)
        tz = hb.timezone

        # One digest job per client, fired at the HEARTBEAT-configured time.
        scheduler.add_job(
            run_digest,
            CronTrigger(hour=hb.hour, minute=hb.minute, timezone=tz),
            args=[cfg, orchestrator],
            id=f"digest_{client_id}",
            name=f"Daily Digest — {cfg.client_name}",
            replace_existing=True,
        )
        logger.info(
            f"Scheduled digest for {client_id}: "
            f"{hb.time} {tz}"
        )

        # Nightly maintenance jobs (placeholders for now): ETL at 02:00,
        # RAG re-index at 03:00, both in the client's timezone.
        nightly_jobs = (
            (_etl_sync_placeholder, 2, f"etl_{client_id}", f"ETL Sync — {cfg.client_name}"),
            (_rag_reindex_placeholder, 3, f"rag_reindex_{client_id}", f"RAG Re-index — {cfg.client_name}"),
        )
        for job_fn, job_hour, job_id, job_label in nightly_jobs:
            scheduler.add_job(
                job_fn,
                CronTrigger(hour=job_hour, minute=0, timezone=tz),
                args=[client_id],
                id=job_id,
                name=job_label,
                replace_existing=True,
            )

    scheduler.start()
    return scheduler
|
||||||
|
|
||||||
|
|
||||||
|
async def _etl_sync_placeholder(client_id: str):
    """Placeholder for the nightly ETL data sync job.

    Currently only logs the trigger; the actual extract/load logic is TBD.
    """
    logger.info(f"[{client_id}] ETL sync triggered (placeholder)")
|
||||||
|
|
||||||
|
|
||||||
|
async def _rag_reindex_placeholder(client_id: str):
    """Placeholder for nightly RAG re-indexing.

    Delegates to the offline pipeline script when importable; any failure
    (import or run) is logged and swallowed so the scheduler keeps running.
    """
    logger.info(f"[{client_id}] RAG re-index triggered (placeholder)")
    try:
        from scripts.run_rag_pipeline import run_pipeline
    except Exception as e:
        logger.warning(f"RAG re-index failed: {e}")
        return
    try:
        run_pipeline(client_id)
    except Exception as e:
        logger.warning(f"RAG re-index failed: {e}")
|
||||||
@@ -0,0 +1,345 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — FastAPI Application
|
||||||
|
|
||||||
|
Main entry point. Initializes database, loads client configs,
|
||||||
|
starts Slack bot, and exposes REST endpoints.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from agents.orchestrator import Orchestrator
|
||||||
|
from channels.protocol_adapter import ProtocolAdapter, NormalisedMessage
|
||||||
|
from channels.slack_handler import SlackHandler
|
||||||
|
from config.client_loader import ClientConfig, load_client_configs
|
||||||
|
from config.settings import get_settings
|
||||||
|
from skills.postgres_connector import get_connector
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s [%(levelname)s] %(name)s — %(message)s",
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Global state
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
client_configs: Dict[str, ClientConfig] = {}
|
||||||
|
orchestrator: Optional[Orchestrator] = None
|
||||||
|
protocol_adapter: Optional[ProtocolAdapter] = None
|
||||||
|
slack_handler: Optional[SlackHandler] = None
|
||||||
|
scheduler = None # Set by heartbeat.scheduler
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Lifespan
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup and shutdown logic.

    Startup order matters: DB schema → client configs → orchestrator →
    optional RAG retriever → protocol adapter → Slack bot → scheduler.
    Shutdown reverses it: stop Slack, stop scheduler, close DB.
    """
    global client_configs, orchestrator, protocol_adapter, slack_handler, scheduler

    logger.info("=== Clawrity starting up ===")

    # 1. Init database schema
    db = get_connector()
    db.init_schema()
    logger.info("Database schema ready")

    # 2. Load client configs
    client_configs = load_client_configs()
    logger.info(f"Loaded {len(client_configs)} client(s): {list(client_configs.keys())}")

    # 3. Init orchestrator
    orchestrator = Orchestrator()

    # 4. Try to attach RAG retriever — optional: failure is logged at INFO
    # and the app runs without RAG.
    try:
        from rag.retriever import Retriever
        retriever = Retriever()
        orchestrator.set_retriever(retriever)
        logger.info("RAG retriever attached to orchestrator")
    except Exception as e:
        logger.info(f"RAG retriever not available (Phase 2): {e}")

    # 5. Init protocol adapter
    protocol_adapter = ProtocolAdapter(client_configs)

    # 6. Start Slack bot
    slack_handler = SlackHandler(protocol_adapter, client_configs, orchestrator)
    slack_handler.start()

    # 7. Start scheduler — a failure here is logged and startup continues,
    # so the REST API stays available without scheduled digests.
    try:
        from heartbeat.scheduler import start_scheduler
        scheduler = start_scheduler(client_configs, orchestrator)
        logger.info("HEARTBEAT scheduler started")
    except Exception as e:
        logger.warning(f"Scheduler not started: {e}")

    logger.info("=== Clawrity ready ===")

    yield  # App runs here

    # Shutdown
    logger.info("=== Clawrity shutting down ===")
    if slack_handler:
        slack_handler.stop()
    if scheduler:
        scheduler.shutdown(wait=False)
    db.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# FastAPI App
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# FastAPI application; startup/shutdown wiring lives in `lifespan` above.
app = FastAPI(
    title="Clawrity",
    description="Multi-channel AI business intelligence agent",
    version="1.0.0",
    lifespan=lifespan,
)

# NOTE(review): wide-open CORS (all origins + credentials + all methods) —
# acceptable for a prototype, but tighten allow_origins before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Request/Response Models
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
class ChatRequest(BaseModel):
    """Inbound /chat payload: which client is asking, and the message text."""

    client_id: str
    message: str
|
||||||
|
|
||||||
|
|
||||||
|
class ChatResponse(BaseModel):
    """Outbound /chat payload: answer text plus QA and telemetry fields."""

    response: str
    qa_score: float
    qa_passed: bool
    retries: int
    sql: Optional[str] = None
    data_rows: int = 0
    rag_chunks_used: int = 0
    elapsed_seconds: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
|
class CompareRequest(BaseModel):
    """Inbound /compare payload: client and message to run with/without RAG."""

    client_id: str
    message: str
|
||||||
|
|
||||||
|
|
||||||
|
class CompareResponse(BaseModel):
    """Outbound /compare payload: the same query answered without and with RAG."""

    without_rag: ChatResponse
    with_rag: ChatResponse
|
||||||
|
|
||||||
|
|
||||||
|
class ScoutRequest(BaseModel):
    """Inbound /scout payload: client plus a free-text search query."""

    client_id: str
    query: str
|
||||||
|
|
||||||
|
|
||||||
|
class ClientRequest(BaseModel):
    """Minimal payload for endpoints that only need a client identifier."""

    client_id: str
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Endpoints
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@app.post("/chat", response_model=ChatResponse)
|
||||||
|
async def chat(request: ChatRequest):
|
||||||
|
"""Send a message and get an AI response."""
|
||||||
|
if request.client_id not in client_configs:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Client not found: {request.client_id}")
|
||||||
|
|
||||||
|
config = client_configs[request.client_id]
|
||||||
|
message = protocol_adapter.normalise_api(request.client_id, request.message)
|
||||||
|
|
||||||
|
result = await orchestrator.process(message, config)
|
||||||
|
return ChatResponse(**result)
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/compare", response_model=CompareResponse)
|
||||||
|
async def compare(request: CompareRequest):
|
||||||
|
"""Side-by-side comparison: with RAG vs without RAG."""
|
||||||
|
if request.client_id not in client_configs:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Client not found: {request.client_id}")
|
||||||
|
|
||||||
|
config = client_configs[request.client_id]
|
||||||
|
message = protocol_adapter.normalise_api(request.client_id, request.message)
|
||||||
|
|
||||||
|
# Without RAG
|
||||||
|
saved_retriever = orchestrator.retriever
|
||||||
|
orchestrator.retriever = None
|
||||||
|
result_no_rag = await orchestrator.process(message, config)
|
||||||
|
orchestrator.retriever = saved_retriever
|
||||||
|
|
||||||
|
# With RAG
|
||||||
|
result_with_rag = await orchestrator.process(message, config)
|
||||||
|
|
||||||
|
return CompareResponse(
|
||||||
|
without_rag=ChatResponse(**result_no_rag),
|
||||||
|
with_rag=ChatResponse(**result_with_rag),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/scout")
|
||||||
|
async def scout(request: ScoutRequest):
|
||||||
|
"""Run a targeted scout search for competitor/market intelligence."""
|
||||||
|
if request.client_id not in client_configs:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Client not found: {request.client_id}")
|
||||||
|
|
||||||
|
config = client_configs[request.client_id]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from agents.scout_agent import ScoutAgent
|
||||||
|
scout_agent = ScoutAgent()
|
||||||
|
result = await scout_agent.search_query(config, request.query)
|
||||||
|
|
||||||
|
if result is None:
|
||||||
|
return {"response": "No relevant competitor or market news found for this query.", "has_results": False}
|
||||||
|
|
||||||
|
return {"response": result, "has_results": True}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Scout endpoint failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/scout/digest")
|
||||||
|
async def scout_digest(request: ClientRequest):
|
||||||
|
"""Run full scout agent digest for a client."""
|
||||||
|
if request.client_id not in client_configs:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Client not found: {request.client_id}")
|
||||||
|
|
||||||
|
config = client_configs[request.client_id]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from agents.scout_agent import ScoutAgent
|
||||||
|
scout_agent = ScoutAgent()
|
||||||
|
result = await scout_agent.gather_intelligence(config)
|
||||||
|
|
||||||
|
if result is None:
|
||||||
|
return {"response": "No relevant market intelligence found.", "has_results": False}
|
||||||
|
|
||||||
|
return {"response": result, "has_results": True}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Scout digest failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/digest")
|
||||||
|
async def trigger_digest(request: ClientRequest):
|
||||||
|
"""Manually trigger the daily digest pipeline (same as scheduled job)."""
|
||||||
|
if request.client_id not in client_configs:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Client not found: {request.client_id}")
|
||||||
|
|
||||||
|
config = client_configs[request.client_id]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from heartbeat.scheduler import run_digest
|
||||||
|
digest_text = await run_digest(config, orchestrator)
|
||||||
|
|
||||||
|
if digest_text is None:
|
||||||
|
raise HTTPException(status_code=500, detail="Digest generation failed after all retries")
|
||||||
|
|
||||||
|
return {"response": digest_text, "status": "success"}
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Manual digest trigger failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/admin/stats/{client_id}")
|
||||||
|
async def admin_stats(client_id: str):
|
||||||
|
"""RAG monitoring stats for a client."""
|
||||||
|
if client_id not in client_configs:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Client not found: {client_id}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
from rag.monitoring import get_stats
|
||||||
|
return get_stats(client_id)
|
||||||
|
except Exception as e:
|
||||||
|
return {"error": str(e), "message": "Monitoring not yet configured"}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/forecast/run/{client_id}")
|
||||||
|
async def run_forecast(client_id: str):
|
||||||
|
"""Trigger Prophet forecasting for a client."""
|
||||||
|
if client_id not in client_configs:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Client not found: {client_id}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
from forecasting.prophet_engine import ProphetEngine
|
||||||
|
engine = ProphetEngine()
|
||||||
|
results = engine.train_and_forecast(client_id)
|
||||||
|
return {"status": "success", "branches_forecast": len(results)}
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/forecast/{client_id}/{branch}")
|
||||||
|
async def get_forecast(client_id: str, branch: str):
|
||||||
|
"""Get cached forecast for a branch."""
|
||||||
|
if client_id not in client_configs:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Client not found: {client_id}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
from forecasting.prophet_engine import ProphetEngine
|
||||||
|
engine = ProphetEngine()
|
||||||
|
forecast = engine.get_cached_forecast(client_id, branch)
|
||||||
|
if not forecast:
|
||||||
|
raise HTTPException(status_code=404, detail=f"No forecast found for {branch}")
|
||||||
|
return forecast
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
|
||||||
|
async def health():
|
||||||
|
"""System health check."""
|
||||||
|
db = get_connector()
|
||||||
|
db_connected = False
|
||||||
|
try:
|
||||||
|
db.execute_raw("SELECT 1")
|
||||||
|
db_connected = True
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
scheduled_jobs = []
|
||||||
|
if scheduler and hasattr(scheduler, 'get_jobs'):
|
||||||
|
try:
|
||||||
|
scheduled_jobs = [
|
||||||
|
{"id": job.id, "name": job.name, "next_run": str(job.next_run_time)}
|
||||||
|
for job in scheduler.get_jobs()
|
||||||
|
]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "healthy" if db_connected else "degraded",
|
||||||
|
"database": "connected" if db_connected else "disconnected",
|
||||||
|
"clients": list(client_configs.keys()),
|
||||||
|
"scheduler_running": scheduler is not None and scheduler.running if scheduler else False,
|
||||||
|
"scheduled_jobs": scheduled_jobs,
|
||||||
|
"slack_active": slack_handler is not None and slack_handler._thread is not None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/slack/events")
|
||||||
|
async def slack_events():
|
||||||
|
"""Slack webhook endpoint (HTTP mode fallback). Socket Mode is primary."""
|
||||||
|
return {"message": "Slack events are handled via Socket Mode. This endpoint is a fallback."}
|
||||||
+287
@@ -0,0 +1,287 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — RAG Chunker
|
||||||
|
|
||||||
|
Aggregation-based semantic chunking — NOT fixed-size, NOT sliding window.
|
||||||
|
Source is structured tabular data. We aggregate rows into business-meaningful
|
||||||
|
units and write natural language narratives.
|
||||||
|
|
||||||
|
Three chunk types:
|
||||||
|
1. branch_weekly — GROUP BY branch, country, week
|
||||||
|
2. channel_monthly — GROUP BY channel, country, month
|
||||||
|
3. trend_qoq — GROUP BY branch, country, quarter (QoQ delta COMPUTED)
|
||||||
|
|
||||||
|
Plus Faker-generated narrative summaries reflecting real patterns.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from faker import Faker
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
fake = Faker()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Chunk:
    """A single RAG chunk: a narrative text plus identifying metadata."""
    id: str          # deterministic hash-derived identifier
    client_id: str   # owning client
    chunk_type: str  # aggregation kind (e.g. branch_weekly)
    text: str        # natural-language narrative handed to the embedder
    metadata: Dict   # structured fields for filtering (branch, week, roi, ...)

    def to_dict(self) -> Dict:
        """Serialise to a plain dict (e.g. for JSON storage)."""
        keys = ("id", "client_id", "chunk_type", "text", "metadata")
        return {k: getattr(self, k) for k in keys}
|
||||||
|
|
||||||
|
|
||||||
|
def generate_chunks(df: pd.DataFrame, client_id: str) -> List[Chunk]:
    """Generate all chunk types from preprocessed data."""
    frame = df.copy()
    frame["date"] = pd.to_datetime(frame["date"])

    # Run every chunk builder in order and concatenate their output.
    builders = (_branch_weekly, _channel_monthly, _trend_qoq, _faker_narratives)
    chunks: List[Chunk] = []
    for build in builders:
        chunks.extend(build(frame, client_id))

    logger.info(f"Generated {len(chunks)} total chunks for {client_id}")
    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def _chunk_id(client_id: str, chunk_type: str, *parts) -> str:
|
||||||
|
"""Generate a deterministic chunk ID."""
|
||||||
|
raw = f"{client_id}:{chunk_type}:" + ":".join(str(p) for p in parts)
|
||||||
|
return hashlib.md5(raw.encode()).hexdigest()[:16]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Chunk Type 1: Branch Weekly
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _branch_weekly(df: pd.DataFrame, client_id: str) -> List[Chunk]:
    """GROUP BY branch, country, week. One chunk per branch per week."""
    out: List[Chunk] = []
    frame = df.copy()
    # ISO week number; month/year kept for human-readable chunk text.
    frame["week"] = frame["date"].dt.isocalendar().week.astype(int)
    frame["month"] = frame["date"].dt.month_name()
    frame["year"] = frame["date"].dt.year

    weekly = frame.groupby(["branch", "country", "year", "week", "month"]).agg(
        spend=("spend", "sum"),
        revenue=("revenue", "sum"),
        leads=("leads", "sum"),
        conversions=("conversions", "sum"),
    ).reset_index()

    for _, rec in weekly.iterrows():
        week_spend = rec["spend"]
        week_revenue = rec["revenue"]
        # Guard divide-by-zero for weeks with no spend / no leads.
        roi = round(week_revenue / week_spend, 2) if week_spend > 0 else 0
        conv_rate = (
            round(rec["conversions"] / rec["leads"] * 100, 1) if rec["leads"] > 0 else 0
        )

        summary = (
            f"{rec['branch']} ({rec['country']}) in week {rec['week']} of "
            f"{rec['month']} {rec['year']}: spent ${week_spend:,.0f}, earned "
            f"${week_revenue:,.0f}, ROI {roi}x, {rec['leads']} leads, "
            f"{conv_rate}% conversion rate."
        )

        out.append(Chunk(
            id=_chunk_id(client_id, "branch_weekly", rec["branch"], rec["year"], rec["week"]),
            client_id=client_id,
            chunk_type="branch_weekly",
            text=summary,
            metadata={
                "branch": rec["branch"],
                "country": rec["country"],
                "week": int(rec["week"]),
                "month": rec["month"],
                "year": int(rec["year"]),
                "roi": roi,
            },
        ))

    logger.info(f"Generated {len(out)} branch_weekly chunks")
    return out
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Chunk Type 2: Channel Monthly
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _channel_monthly(df: pd.DataFrame, client_id: str) -> List[Chunk]:
    """GROUP BY channel, country, month, quarter."""
    results: List[Chunk] = []
    frame = df.copy()
    frame["month"] = frame["date"].dt.month_name()
    frame["quarter"] = "Q" + frame["date"].dt.quarter.astype(str)
    frame["year"] = frame["date"].dt.year

    monthly = frame.groupby(["channel", "country", "year", "month", "quarter"]).agg(
        spend=("spend", "sum"),
        revenue=("revenue", "sum"),
        leads=("leads", "sum"),
        conversions=("conversions", "sum"),
    ).reset_index()

    for _, rec in monthly.iterrows():
        month_spend = rec["spend"]
        month_revenue = rec["revenue"]
        # Guard divide-by-zero for channels with no spend that month.
        roi = round(month_revenue / month_spend, 2) if month_spend > 0 else 0

        summary = (
            f"{rec['channel']} in {rec['country']} during {rec['month']} "
            f"({rec['quarter']}) {rec['year']}: ${month_spend:,.0f} spent, "
            f"${month_revenue:,.0f} revenue, ROI {roi}x."
        )

        results.append(Chunk(
            id=_chunk_id(client_id, "channel_monthly", rec["channel"], rec["country"], rec["year"], rec["month"]),
            client_id=client_id,
            chunk_type="channel_monthly",
            text=summary,
            metadata={
                "channel": rec["channel"],
                "country": rec["country"],
                "month": rec["month"],
                "quarter": rec["quarter"],
                "year": int(rec["year"]),
                "roi": roi,
            },
        ))

    logger.info(f"Generated {len(results)} channel_monthly chunks")
    return results
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Chunk Type 3: QoQ Trend (Most Important)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _trend_qoq(df: pd.DataFrame, client_id: str) -> List[Chunk]:
    """GROUP BY branch, country, quarter. Compute quarter-over-quarter delta.

    Fix: a zero delta — including the fallback used when the previous
    quarter had no revenue — was previously worded as "declined 0%";
    it is now reported as "was flat".
    """
    chunks = []
    df = df.copy()
    df["quarter"] = df["date"].dt.to_period("Q").astype(str)

    grouped = df.groupby(["branch", "country", "quarter"]).agg(
        spend=("spend", "sum"),
        revenue=("revenue", "sum"),
    ).reset_index()

    # Sort so consecutive rows within a (branch, country) are adjacent quarters.
    grouped = grouped.sort_values(["branch", "country", "quarter"])

    for (branch, country), group in grouped.groupby(["branch", "country"]):
        group = group.sort_values("quarter").reset_index(drop=True)

        # Compare each quarter to the one immediately before it.
        for i in range(1, len(group)):
            prev = group.iloc[i - 1]
            curr = group.iloc[i]

            prev_rev = prev["revenue"]
            curr_rev = curr["revenue"]

            # Percentage change; undefined when the base quarter had no revenue.
            if prev_rev > 0:
                delta = round((curr_rev - prev_rev) / prev_rev * 100, 1)
            else:
                delta = 0

            if delta > 0:
                movement = f"grew {abs(delta)}%"
            elif delta < 0:
                movement = f"declined {abs(delta)}%"
            else:
                movement = "was flat"

            text = (
                f"{branch} ({country}) revenue {movement} "
                f"in {curr['quarter']} vs {prev['quarter']}. "
                f"Total spend: ${curr['spend']:,.0f}, revenue: ${curr_rev:,.0f}."
            )

            chunks.append(Chunk(
                id=_chunk_id(client_id, "trend_qoq", branch, country, curr["quarter"]),
                client_id=client_id,
                chunk_type="trend_qoq",
                text=text,
                metadata={
                    "branch": branch,
                    "country": country,
                    "quarter": curr["quarter"],
                    "prev_quarter": prev["quarter"],
                    "delta_pct": delta,
                },
            ))

    logger.info(f"Generated {len(chunks)} trend_qoq chunks")
    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Faker Narrative Chunks
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _faker_narratives(df: pd.DataFrame, client_id: str) -> List[Chunk]:
    """Generate plausible narrative chunks reflecting real data patterns."""
    narratives: List[Chunk] = []
    frame = df.copy()
    frame["quarter"] = frame["date"].dt.to_period("Q").astype(str)

    # Quarterly rollup per branch/country feeds the narrative templates.
    quarterly = frame.groupby(["branch", "country", "quarter"]).agg(
        revenue=("revenue", "sum"),
        spend=("spend", "sum"),
        leads=("leads", "sum"),
    ).reset_index()

    templates = [
        "{branch} branch demonstrated strong {quarter} performance driven by {channel} efficiency, outperforming regional averages.",
        "In {quarter}, {branch} ({country}) showed {trend} momentum with revenue reaching ${revenue:,.0f}, primarily through {channel} campaigns.",
        "{branch} branch in {country} maintained steady growth in {quarter}, with lead generation up and conversion rates holding above {conv_rate:.1f}%.",
        "Cost efficiency at {branch} ({country}) improved in {quarter}, with spend-to-revenue ratio tightening to {ratio:.2f}x.",
    ]

    # Fallback channel list when the data has no channel column values.
    channels = df["channel"].dropna().unique().tolist() or ["Paid Search", "Social Media", "Email"]

    for _, rec in quarterly.iterrows():
        roi = rec["revenue"] / rec["spend"] if rec["spend"] > 0 else 0
        # NOTE: RNG draws are kept in this exact order so seeded runs reproduce.
        conv_rate = np.random.uniform(5, 20)
        trend = "positive" if roi > 1.5 else "moderate" if roi > 1 else "challenging"
        channel = np.random.choice(channels)
        template = np.random.choice(templates)

        narratives.append(Chunk(
            id=_chunk_id(client_id, "narrative", rec["branch"], rec["country"], rec["quarter"]),
            client_id=client_id,
            chunk_type="narrative",
            text=template.format(
                branch=rec["branch"],
                country=rec["country"],
                quarter=rec["quarter"],
                channel=channel,
                revenue=rec["revenue"],
                trend=trend,
                conv_rate=conv_rate,
                ratio=1 / roi if roi > 0 else 0,
            ),
            metadata={
                "branch": rec["branch"],
                "country": rec["country"],
                "quarter": rec["quarter"],
                "source": "generated_narrative",
            },
        ))

    logger.info(f"Generated {len(narratives)} narrative chunks")
    return narratives
|
||||||
@@ -0,0 +1,123 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — RAG Evaluator
|
||||||
|
|
||||||
|
Lightweight Groq-based evaluation (no OpenAI, no full RAGAs).
|
||||||
|
Four metrics: faithfulness, answer_relevancy, context_precision, context_recall.
|
||||||
|
Single Groq call with structured JSON output.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
from groq import Groq
|
||||||
|
|
||||||
|
from config.settings import get_settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
EVAL_PROMPT = """Evaluate this RAG-augmented response on four criteria.
|
||||||
|
|
||||||
|
## User Query
|
||||||
|
{query}
|
||||||
|
|
||||||
|
## Retrieved Context Chunks
|
||||||
|
{chunks}
|
||||||
|
|
||||||
|
## Generated Response
|
||||||
|
{response}
|
||||||
|
|
||||||
|
## Evaluation Criteria (score each 0.0 to 1.0)
|
||||||
|
|
||||||
|
1. **Faithfulness**: Does the response ONLY contain information from the retrieved chunks? No hallucination?
|
||||||
|
2. **Answer Relevancy**: Does the response directly address the user's question?
|
||||||
|
3. **Context Precision**: Were the retrieved chunks actually relevant to the question?
|
||||||
|
4. **Context Recall**: Did the retrieval capture enough context to answer the question fully?
|
||||||
|
|
||||||
|
Return ONLY a JSON object:
|
||||||
|
{{
|
||||||
|
"faithfulness": <float>,
|
||||||
|
"answer_relevancy": <float>,
|
||||||
|
"context_precision": <float>,
|
||||||
|
"context_recall": <float>,
|
||||||
|
"overall": <float (average of all four)>,
|
||||||
|
"notes": "<brief explanation>"
|
||||||
|
}}"""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvalResult:
    """Scores (0.0–1.0) for one evaluated RAG interaction."""
    faithfulness: float = 0.0
    answer_relevancy: float = 0.0
    context_precision: float = 0.0
    context_recall: float = 0.0
    overall: float = 0.0
    notes: str = ""


class RAGEvaluator:
    """Evaluates RAG pipeline quality using Groq LLM."""

    def __init__(self):
        cfg = get_settings()
        self.client = Groq(api_key=cfg.groq_api_key)
        self.model = cfg.llm_model

    def evaluate(
        self,
        query: str,
        chunks: List[Dict],
        response: str,
    ) -> EvalResult:
        """Evaluate a RAG response.

        Returns a zeroed EvalResult (with an explanatory note) if the
        Groq call itself fails.
        """
        if chunks:
            numbered = []
            for idx, chunk in enumerate(chunks, start=1):
                numbered.append(
                    f"{idx}. {chunk.get('text', '')} (similarity: {chunk.get('similarity', 0):.2f})"
                )
            chunks_text = "\n".join(numbered)
        else:
            chunks_text = "No chunks retrieved."

        prompt = EVAL_PROMPT.format(
            query=query,
            chunks=chunks_text,
            response=response,
        )

        try:
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a RAG evaluation expert. Return only valid JSON."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,
                max_tokens=512,
            )
            return self._parse(completion.choices[0].message.content.strip())
        except Exception as e:
            logger.error(f"RAG evaluation failed: {e}")
            return EvalResult(notes=f"Evaluation error: {str(e)}")

    def _parse(self, raw: str) -> EvalResult:
        """Parse the model's JSON reply, tolerating ``` code fences."""
        try:
            cleaned = raw.strip()
            if cleaned.startswith("```"):
                # Drop the opening fence line (e.g. ```json).
                cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else cleaned[3:]
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]

            data = json.loads(cleaned.strip())
            score_keys = (
                "faithfulness", "answer_relevancy",
                "context_precision", "context_recall", "overall",
            )
            scores = {key: float(data.get(key, 0)) for key in score_keys}
            return EvalResult(notes=data.get("notes", ""), **scores)
        except Exception as e:
            logger.warning(f"Could not parse evaluation: {e}")
            return EvalResult(notes="Parse error")
|
||||||
@@ -0,0 +1,105 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — RAG Monitoring
|
||||||
|
|
||||||
|
Logs every interaction to JSONL and provides aggregated stats.
|
||||||
|
Exposes data for /admin/stats/{client_id} endpoint.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
from config.settings import get_settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _log_path(client_id: str) -> str:
    """Get the JSONL log file path for a client, creating the log dir if needed."""
    directory = get_settings().logs_dir
    os.makedirs(directory, exist_ok=True)
    filename = f"{client_id}_interactions.jsonl"
    return os.path.join(directory, filename)
|
||||||
|
|
||||||
|
|
||||||
|
def log_interaction(
    client_id: str,
    query: str,
    num_chunks: int,
    chunk_types_used: list,
    qa_score: float,
    qa_passed: bool,
    retries: int,
    response_length: int,
    elapsed_seconds: float = 0.0,
):
    """Append one interaction record to the client's JSONL log.

    Failures are logged and swallowed: monitoring must never break the
    request path.
    """
    record = {"timestamp": datetime.utcnow().isoformat()}
    record.update(
        client_id=client_id,
        query=query,
        num_chunks=num_chunks,
        chunk_types_used=chunk_types_used,
        qa_score=qa_score,
        qa_passed=qa_passed,
        retries=retries,
        response_length=response_length,
        elapsed_seconds=elapsed_seconds,
    )

    try:
        with open(_log_path(client_id), "a") as fh:
            fh.write(json.dumps(record) + "\n")
    except Exception as e:
        logger.error(f"Failed to log interaction: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def get_stats(client_id: str) -> Dict:
    """
    Get aggregated monitoring stats for a client.

    Returns:
        Dict with: total_queries, pass_rate, avg_qa_score, avg_retries,
        queries_needing_retry
    """
    path = _log_path(client_id)
    if not os.path.exists(path):
        # No interactions logged yet — return an all-zero summary.
        return {
            "client_id": client_id,
            "total_queries": 0,
            "pass_rate": 0.0,
            "avg_qa_score": 0.0,
            "avg_retries": 0.0,
            "queries_needing_retry": 0,
        }

    try:
        with open(path, "r") as fh:
            entries = [json.loads(line) for line in fh if line.strip()]
    except Exception as e:
        logger.error(f"Error reading log file: {e}")
        return {"error": str(e)}

    if not entries:
        return {"client_id": client_id, "total_queries": 0}

    total = len(entries)
    passed = sum(1 for entry in entries if entry.get("qa_passed", False))
    score_sum = sum(entry.get("qa_score", 0) for entry in entries)
    retry_counts = [entry.get("retries", 0) for entry in entries]
    needed_retry = sum(1 for r in retry_counts if r > 0)

    return {
        "client_id": client_id,
        "total_queries": total,
        "pass_rate": round(passed / total * 100, 1) if total > 0 else 0,
        "avg_qa_score": round(score_sum / total, 3) if total > 0 else 0,
        "avg_retries": round(sum(retry_counts) / total, 2) if total > 0 else 0,
        "queries_needing_retry": needed_retry,
    }
|
||||||
@@ -0,0 +1,72 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — RAG Preprocessor
|
||||||
|
|
||||||
|
Fetches data from PostgreSQL, cleans it for RAG chunking:
|
||||||
|
- Removes nulls, outliers > 3 std devs, duplicates
|
||||||
|
- Normalises string columns
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from etl.normaliser import remove_outliers
|
||||||
|
from skills.postgres_connector import get_connector
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_for_rag(
    client_id: str,
    days: int = 365,
) -> pd.DataFrame:
    """
    Fetch and preprocess data for RAG chunking.

    Removes the dead, unused `sql` string the original defined alongside
    `safe_sql` — only the interval-interpolated statement was ever executed.

    Args:
        client_id: Client to fetch data for
        days: Number of days of data to fetch (default 365)

    Returns:
        Clean DataFrame ready for chunking
    """
    db = get_connector()

    # Postgres INTERVAL can't take a bound parameter directly; `days` is
    # cast to int before interpolation, so this stays injection-safe.
    safe_sql = f"""
        SELECT date, country, branch, channel, spend, revenue, leads, conversions
        FROM spend_data
        WHERE client_id = %s AND date >= CURRENT_DATE - INTERVAL '{int(days)} days'
        ORDER BY date
    """
    df = db.execute_query(safe_sql, (client_id,))
    logger.info(f"Fetched {len(df)} rows for RAG preprocessing")

    if df.empty:
        logger.warning(f"No data found for client {client_id}")
        return df

    # Rows missing any critical field can't be chunked meaningfully.
    critical_cols = ["date", "branch", "country", "revenue"]
    df = df.dropna(subset=[c for c in critical_cols if c in df.columns])

    # Remove outliers on numeric columns.
    df = remove_outliers(df, ["spend", "revenue", "leads", "conversions"])

    # Normalise string columns so grouping keys are consistent.
    for col in ["country", "branch", "channel"]:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip().str.title()

    # Remove duplicates
    df = df.drop_duplicates()

    logger.info(f"Preprocessed: {len(df)} rows ready for chunking")
    return df
|
||||||
@@ -0,0 +1,95 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — RAG Retriever
|
||||||
|
|
||||||
|
Detects query intent → selects chunk_type → searches pgvector.
|
||||||
|
Intent detection based on keywords:
|
||||||
|
- "should/recommend/allocate/shift" → trend_qoq
|
||||||
|
- "channel/paid/email/social" → channel_monthly
|
||||||
|
- everything else → branch_weekly
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
|
from rag.vector_store import search
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Intent → chunk_type mapping based on keywords
|
||||||
|
# Intent → chunk_type mapping based on keywords
INTENT_PATTERNS = {
    "trend_qoq": [
        "should", "recommend", "allocate", "shift", "increase", "decrease",
        "budget", "realloc", "invest", "optimize", "growth", "trend",
        "quarter", "qoq", "forecast", "predict",
    ],
    "channel_monthly": [
        "channel", "paid", "email", "social", "search", "display",
        "organic", "referral", "campaign", "marketing", "roi",
        "spend", "advertising",
    ],
}


class Retriever:
    """RAG retriever with intent-based chunk type filtering."""

    def retrieve(
        self,
        query: str,
        client_id: str,
        top_k: int = 5,
        chunk_type_override: Optional[str] = None,
    ) -> List[Dict]:
        """
        Retrieve relevant chunks based on query intent.

        Args:
            query: User's natural language query
            client_id: Client to search within
            top_k: Number of chunks to retrieve
            chunk_type_override: Force a specific chunk type

        Returns:
            List of dicts with text, metadata, similarity
        """
        chunk_type = chunk_type_override or self._detect_intent(query)
        logger.info(f"Detected intent → chunk_type: {chunk_type}")

        hits = search(
            query=query,
            client_id=client_id,
            chunk_type=chunk_type,
            top_k=top_k,
        )

        if not hits:
            # The detected type may simply have no indexed chunks yet;
            # retry once without the type filter.
            logger.info(f"No results for {chunk_type}, falling back to all types")
            hits = search(
                query=query,
                client_id=client_id,
                chunk_type=None,
                top_k=top_k,
            )

        return hits

    def _detect_intent(self, query: str) -> str:
        """Detect query intent from keyword hits; default to branch_weekly."""
        lowered = query.lower()

        best_type = "branch_weekly"
        best_score = 0
        for chunk_type, keywords in INTENT_PATTERNS.items():
            hits = sum(kw in lowered for kw in keywords)
            # Strict > keeps the first-listed type on ties, matching dict order.
            if hits > best_score:
                best_type, best_score = chunk_type, hits

        return best_type
|
||||||
@@ -0,0 +1,135 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — RAG Vector Store
|
||||||
|
|
||||||
|
Embeds chunks using sentence-transformers all-MiniLM-L6-v2 (CPU, 384 dims).
|
||||||
|
Stores and searches via pgvector in PostgreSQL.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from rag.chunker import Chunk
|
||||||
|
from skills.postgres_connector import get_connector
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_model = None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_embedding_model():
    """Lazy-load the embedding model (CPU only, ~90MB)."""
    global _model
    if _model is not None:
        return _model
    # Import here so the heavy dependency loads only on first use.
    from sentence_transformers import SentenceTransformer
    _model = SentenceTransformer("all-MiniLM-L6-v2")
    logger.info("Loaded embedding model: all-MiniLM-L6-v2 (384 dims)")
    return _model
|
||||||
|
|
||||||
|
|
||||||
|
def embed_texts(texts: List[str], batch_size: int = 100) -> np.ndarray:
    """
    Embed a list of texts using MiniLM.

    Args:
        texts: List of text strings to embed
        batch_size: Batch size for encoding (default 100)

    Returns:
        numpy array of shape (len(texts), 384)
    """
    encoder = _get_embedding_model()
    vectors = encoder.encode(
        texts,
        batch_size=batch_size,
        # A progress bar only pays off on larger jobs.
        show_progress_bar=len(texts) > 100,
        normalize_embeddings=True,
    )
    logger.info(f"Embedded {len(texts)} texts → shape {vectors.shape}")
    return vectors
|
||||||
|
|
||||||
|
|
||||||
|
def embed_query(query: str) -> np.ndarray:
    """Embed a single query string (normalized, same model as chunks)."""
    encoder = _get_embedding_model()
    return encoder.encode(query, normalize_embeddings=True)
|
||||||
|
|
||||||
|
|
||||||
|
def store_chunks(chunks: List[Chunk], embeddings: np.ndarray):
    """
    Upsert chunks + embeddings into pgvector.
    Uses ON CONFLICT DO UPDATE for safe nightly re-indexing.
    """
    # De-duplicate by chunk ID, keeping the first occurrence of each.
    deduped = {}
    for chunk, emb in zip(chunks, embeddings):
        if chunk.id not in deduped:
            deduped[chunk.id] = (chunk, emb)

    db = get_connector()

    rows = [
        {
            "id": chunk.id,
            "client_id": chunk.client_id,
            "chunk_type": chunk.chunk_type,
            "text": chunk.text,
            "metadata": chunk.metadata,
            "embedding": emb.tolist(),
        }
        for chunk, emb in deduped.values()
    ]

    # Upsert in batches to bound statement size.
    batch_size = 100
    for start in range(0, len(rows), batch_size):
        db.upsert_embeddings(rows[start:start + batch_size])

    logger.info(f"Stored {len(rows)} chunks in pgvector")

    # Best-effort: IVFFlat index creation fails until enough rows exist.
    try:
        db.create_vector_index()
    except Exception:
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def search(
    query: str,
    client_id: str,
    chunk_type: Optional[str] = None,
    top_k: int = 5,
) -> List[dict]:
    """
    Search pgvector for similar chunks.

    Args:
        query: Natural language query
        client_id: Client to search within
        chunk_type: Optional filter (branch_weekly, channel_monthly, trend_qoq)
        top_k: Number of results

    Returns:
        List of dicts with text, metadata, similarity
    """
    vector = embed_query(query)
    db = get_connector()

    matches = db.search_embeddings(
        query_embedding=vector,
        client_id=client_id,
        chunk_type=chunk_type,
        top_k=top_k,
    )

    logger.info(
        f"Vector search: query='{query[:50]}...', "
        f"chunk_type={chunk_type}, results={len(matches)}"
    )
    return matches
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
# === Core Framework ===
|
||||||
|
fastapi>=0.115.0
|
||||||
|
uvicorn[standard]>=0.30.0
|
||||||
|
python-dotenv>=1.0.0
|
||||||
|
|
||||||
|
# === LLM ===
|
||||||
|
groq>=0.11.0
|
||||||
|
|
||||||
|
# === Embeddings (CPU only — all-MiniLM-L6-v2, 384 dims, ~90MB) ===
|
||||||
|
sentence-transformers>=3.0.0
|
||||||
|
|
||||||
|
# === Database — PostgreSQL + pgvector ===
|
||||||
|
psycopg2-binary>=2.9.9
|
||||||
|
pgvector>=0.3.0
|
||||||
|
asyncpg>=0.29.0
|
||||||
|
|
||||||
|
# === Channel — Slack (Socket Mode) ===
|
||||||
|
slack-bolt>=1.20.0
|
||||||
|
|
||||||
|
# === Scheduler ===
|
||||||
|
apscheduler>=3.10.0
|
||||||
|
|
||||||
|
# === Web Search (Scout Agent) ===
|
||||||
|
tavily-python>=0.5.0
|
||||||
|
duckduckgo-search>=6.0.0
|
||||||
|
|
||||||
|
# === Forecasting ===
|
||||||
|
prophet>=1.1.5
|
||||||
|
|
||||||
|
# === Data Processing ===
|
||||||
|
pandas>=2.2.0
|
||||||
|
numpy>=1.26.0
|
||||||
|
openpyxl>=3.1.0
|
||||||
|
faker>=28.0.0
|
||||||
|
|
||||||
|
# === Config ===
|
||||||
|
pydantic>=2.9.0
|
||||||
|
pydantic-settings>=2.5.0
|
||||||
|
pyyaml>=6.0.2
|
||||||
|
|
||||||
|
# === HTTP Client ===
|
||||||
|
httpx>=0.27.0
|
||||||
@@ -0,0 +1,67 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — RAG Pipeline Script
|
||||||
|
|
||||||
|
CLI to run the full RAG pipeline: preprocess → chunk → embed → store in pgvector.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/run_rag_pipeline.py --client_id acme_corp
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from rag.preprocessor import preprocess_for_rag
|
||||||
|
from rag.chunker import generate_chunks
|
||||||
|
from rag.vector_store import embed_texts, store_chunks
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def run_pipeline(client_id: str, days: int = 365):
    """Run the full RAG pipeline for a client: preprocess → chunk → embed → store."""
    logger.info(f"=== RAG Pipeline: {client_id} ===")

    logger.info("Step 1/4: Preprocessing data...")
    df = preprocess_for_rag(client_id, days=days)
    if df.empty:
        logger.error("No data to process. Run seed_demo_data.py first.")
        return

    logger.info("Step 2/4: Generating chunks...")
    chunks = generate_chunks(df, client_id)
    logger.info(f"Generated {len(chunks)} chunks")
    if not chunks:
        logger.error("No chunks generated.")
        return

    logger.info("Step 3/4: Embedding chunks (CPU, batch_size=100)...")
    embeddings = embed_texts([c.text for c in chunks], batch_size=100)

    logger.info("Step 4/4: Upserting into pgvector...")
    store_chunks(chunks, embeddings)

    logger.info(f"=== RAG Pipeline complete: {len(chunks)} chunks indexed ===")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse CLI arguments and run the pipeline."""
    parser = argparse.ArgumentParser(description="Run RAG pipeline")
    parser.add_argument("--client_id", required=True, help="Client ID")
    parser.add_argument("--days", type=int, default=365, help="Days of data to process")
    ns = parser.parse_args()
    run_pipeline(ns.client_id, ns.days)


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,214 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — Demo Data Seeder
|
||||||
|
|
||||||
|
Merges Global Superstore + Marketing Campaign datasets with Faker gap-filling.
|
||||||
|
Inserts into PostgreSQL spend_data table.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/seed_demo_data.py --client_id acme_corp \
|
||||||
|
--superstore data/raw/Global_Superstore2.csv \
|
||||||
|
--marketing data/raw/marketing_campaign_dataset.csv
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from faker import Faker
|
||||||
|
|
||||||
|
# Add project root to path
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from connectors.csv_connector import CSVConnector
|
||||||
|
from etl.normaliser import normalise_dataframe
|
||||||
|
from skills.postgres_connector import PostgresConnector
|
||||||
|
|
||||||
|
# Root logger configuration for the seeding script.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)

# Deterministic randomness: seed Faker and both RNGs so repeated runs
# produce identical synthetic data.
fake = Faker()
Faker.seed(42)
random.seed(42)
np.random.seed(42)

# Marketing channels to assign
# (fallback channel names used when the marketing dataset provides none).
CHANNELS = ["Paid Search", "Social Media", "Email", "Display", "Organic", "Referral"]

# Column mapping for Global Superstore
# Maps raw Global Superstore headers to the normalised spend_data schema.
SUPERSTORE_MAPPING = {
    "Order Date": "date",
    "Country": "country",
    "City": "branch",
    "Sales": "revenue",
    "Profit": "profit",
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_superstore(path: str) -> pd.DataFrame:
    """Load and normalize the Global Superstore dataset."""
    loader = CSVConnector()
    frame = loader.load(path)
    logger.info(f"Superstore columns: {list(frame.columns)}")

    # Rename raw headers to the normalised schema.
    frame = normalise_dataframe(frame, SUPERSTORE_MAPPING)

    # Restrict to the schema columns that actually survived the mapping.
    wanted = ["date", "country", "branch", "revenue", "profit"]
    frame = frame[[c for c in wanted if c in frame.columns]].copy()

    logger.info(f"Superstore: {len(frame)} rows after normalisation")
    return frame
|
||||||
|
|
||||||
|
|
||||||
|
def load_marketing(path: str) -> pd.DataFrame:
    """Load the Marketing Campaign Performance dataset."""
    loader = CSVConnector()
    frame = loader.load(path)
    logger.info(f"Marketing columns: {list(frame.columns)}")

    # Substring rules for standardising column names; first match wins,
    # in the same precedence order as before.
    rules = [
        (("channel",), "channel"),
        (("spend", "budget"), "spend"),
        (("click",), "leads"),
        (("conversion",), "conversions"),
        (("roi",), "roi_raw"),
        (("impression",), "impressions"),
    ]
    renames = {}
    for original in frame.columns:
        lowered = original.lower().strip()
        for needles, target in rules:
            if any(n in lowered for n in needles):
                renames[original] = target
                break

    frame = frame.rename(columns=renames)
    logger.info(f"Marketing: {len(frame)} rows, mapped columns: {list(frame.columns)}")
    return frame
|
||||||
|
|
||||||
|
|
||||||
|
def merge_datasets(superstore: pd.DataFrame, marketing: pd.DataFrame) -> pd.DataFrame:
    """
    Merge superstore (base) with marketing channel metrics.

    Each superstore row receives a channel plus synthetic
    spend/leads/conversions derived from the marketing data's averages.
    """
    merged = superstore.copy()

    # Channel universe: real channels from the marketing data when present,
    # otherwise the static CHANNELS fallback.
    channel_list = CHANNELS
    if "channel" in marketing.columns:
        observed = marketing["channel"].dropna().unique().tolist()
        if observed:
            channel_list = observed

    # Deterministic round-robin assignment keyed on row position.
    merged["channel"] = [channel_list[i % len(channel_list)] for i in range(len(merged))]

    # Per-channel averages from the marketing data, used to scale the
    # synthetic noise below.
    channel_stats = {}
    if "spend" in marketing.columns and "channel" in marketing.columns:
        for ch in channel_list:
            subset = marketing[marketing["channel"] == ch]
            channel_stats[ch] = {
                "avg_spend": subset["spend"].mean() if "spend" in subset.columns and len(subset) > 0 else 500,
                "avg_leads": subset["leads"].mean() if "leads" in subset.columns and len(subset) > 0 else 50,
                "avg_conv": subset["conversions"].mean() if "conversions" in subset.columns and len(subset) > 0 else 5,
            }

    # Fill spend/leads/conversions row by row with seeded random variation.
    spend_col, lead_col, conv_col = [], [], []
    for _, record in merged.iterrows():
        stats = channel_stats.get(record["channel"], {"avg_spend": 500, "avg_leads": 50, "avg_conv": 5})

        rev = record.get("revenue", 1000)
        # Spend is a noisy fraction of revenue; the Gaussian noise scales
        # with the channel's average spend.
        spend = max(10, rev * random.uniform(0.3, 0.6) + random.gauss(0, stats["avg_spend"] * 0.1))
        leads = max(1, int(spend / random.uniform(15, 40)))
        conversions = max(0, int(leads * random.uniform(0.05, 0.20)))

        spend_col.append(round(spend, 2))
        lead_col.append(leads)
        conv_col.append(conversions)

    merged["spend"] = spend_col
    merged["leads"] = lead_col
    merged["conversions"] = conv_col

    # The spend_data schema has no profit column.
    if "profit" in merged.columns:
        merged = merged.drop(columns=["profit"])

    logger.info(f"Merged dataset: {len(merged)} rows, columns: {list(merged.columns)}")
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def seed_to_postgres(df: pd.DataFrame, client_id: str):
    """Insert merged data into PostgreSQL spend_data table."""
    connector = PostgresConnector()
    connector.init_schema()

    # Re-seeding is destructive per client: wipe this client's rows first.
    connector.execute_write(
        "DELETE FROM spend_data WHERE client_id = %s", (client_id,)
    )
    logger.info(f"Cleared existing data for client: {client_id}")

    # Tag every row with the owning client.
    df["client_id"] = client_id

    # Batch insert via execute_values (VALUES %s placeholder).
    insert_sql = """
        INSERT INTO spend_data (date, country, branch, channel, spend, revenue, leads, conversions, client_id)
        VALUES %s
    """
    rows = [
        (
            row["date"], row["country"], row["branch"], row["channel"],
            row["spend"], row["revenue"], row["leads"], row["conversions"],
            row["client_id"]
        )
        for _, row in df.iterrows()
    ]

    connector.execute_batch(insert_sql, rows, page_size=2000)

    total = connector.get_table_count("spend_data", client_id)
    logger.info(f"Seeded {total} rows into spend_data for client: {client_id}")

    # Keep a CSV copy of exactly what was inserted.
    os.makedirs("data/processed", exist_ok=True)
    output_path = f"data/processed/{client_id}_merged.csv"
    df.to_csv(output_path, index=False)
    logger.info(f"Saved processed data to {output_path}")

    connector.close()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point for the demo-data seeder."""
    parser = argparse.ArgumentParser(description="Seed demo data into PostgreSQL")
    parser.add_argument("--client_id", default="acme_corp", help="Client ID")
    parser.add_argument("--superstore", required=True, help="Path to Global Superstore CSV/XLSX")
    parser.add_argument("--marketing", required=True, help="Path to Marketing Campaign CSV")
    args = parser.parse_args()

    logger.info(f"=== Seeding data for client: {args.client_id} ===")

    # Load both sources, merge, then persist.
    base = load_superstore(args.superstore)
    campaigns = load_marketing(args.marketing)
    seed_to_postgres(merge_datasets(base, campaigns), args.client_id)

    logger.info("=== Seeding complete ===")


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,140 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — NL-to-SQL Engine
|
||||||
|
|
||||||
|
Converts natural language questions into valid PostgreSQL SELECT queries.
|
||||||
|
Uses LLM at temperature 0.1 for deterministic SQL generation.
|
||||||
|
Safety: Only SELECT queries allowed. INSERT/UPDATE/DELETE/DROP rejected.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from config.llm_client import get_llm_client, get_model_name
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Dangerous SQL patterns — reject anything that isn't a SELECT
|
||||||
|
# Dangerous SQL patterns — reject anything that isn't a SELECT.
# Word-boundary, case-insensitive match so e.g. "DROPPED" is not flagged.
UNSAFE_PATTERNS = re.compile(
    r"\b(INSERT|UPDATE|DELETE|DROP|ALTER|TRUNCATE|CREATE|GRANT|REVOKE|EXEC)\b",
    re.IGNORECASE
)

# System prompt template for the LLM. The placeholders ({countries},
# {branches}, {channels}, {date_min}, {date_max}, {client_id}, {n}) are
# filled by NLToSQL.generate_sql via str.format.
SYSTEM_PROMPT = """You are a PostgreSQL SQL generator. Generate ONLY a valid SELECT query.
Return ONLY the raw SQL — no markdown, no explanation, no code fences.

Table: spend_data
Columns:
- id: SERIAL PRIMARY KEY
- date: DATE
- country: VARCHAR(100)
- branch: VARCHAR(100)
- channel: VARCHAR(100)
- spend: FLOAT
- revenue: FLOAT
- leads: INT
- conversions: INT
- client_id: VARCHAR(100)

Available countries: {countries}
Available branches (sample): {branches}
Available channels: {channels}
Date range: {date_min} to {date_max}

RULES:
1. ALWAYS include WHERE client_id = '{client_id}' in your queries
2. Use standard PostgreSQL syntax
3. For date ranges, use DATE type comparisons
4. For "last N days", use: date >= CURRENT_DATE - INTERVAL '{n} days'
5. For "last month", use: date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month')
6. Return meaningful aggregations with GROUP BY when appropriate
7. Use aliases for computed columns (e.g., SUM(revenue) AS total_revenue)
8. LIMIT results to 50 rows maximum unless the user asks for all
9. For "bottom N" use ASC ordering, for "top N" use DESC ordering
"""
|
||||||
|
|
||||||
|
|
||||||
|
class NLToSQL:
    """Natural language to SQL converter using LLM."""

    def __init__(self):
        self.client = get_llm_client()
        self.model = get_model_name()

    def generate_sql(
        self,
        question: str,
        client_id: str,
        schema_metadata: dict,
    ) -> Optional[str]:
        """
        Convert a natural language question to a PostgreSQL SELECT query.

        Args:
            question: User's natural language question
            client_id: Client ID for filtering
            schema_metadata: Dict with countries, branches, channels, date_min, date_max

        Returns:
            Valid SQL SELECT string, or None on failure
        """
        # Interpolate live schema context into the prompt template.
        prompt_context = {
            "countries": ", ".join(schema_metadata.get("countries", [])[:20]),
            "branches": ", ".join(schema_metadata.get("branches", [])[:20]),
            "channels": ", ".join(schema_metadata.get("channels", [])),
            "date_min": schema_metadata.get("date_min", "unknown"),
            "date_max": schema_metadata.get("date_max", "unknown"),
            "client_id": client_id,
            "n": "7",  # Default for interval template
        }
        system = SYSTEM_PROMPT.format(**prompt_context)

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": question},
                ],
                temperature=0.1,
                max_tokens=1024,
            )

            candidate = self._clean_sql(response.choices[0].message.content.strip())

            if self._validate_sql(candidate):
                logger.info(f"Generated SQL: {candidate}")
                return candidate

            logger.warning(f"Generated SQL failed validation: {candidate}")
            return None

        except Exception as e:
            logger.error(f"NL-to-SQL generation failed: {e}")
            return None

    def _clean_sql(self, raw: str) -> str:
        """Extract SQL from LLM response, stripping markdown code fences."""
        # Drop any ``` / ```sql fences anywhere in the response.
        without_fences = re.sub(r"```(?:sql)?\s*", "", raw)
        without_fences = re.sub(r"```\s*$", "", without_fences)
        # Normalise to exactly one trailing semicolon.
        return without_fences.strip().rstrip(";") + ";"

    def _validate_sql(self, sql: str) -> bool:
        """Validate that the SQL is a safe SELECT query."""
        if not sql or len(sql) < 10:
            return False

        # Only read queries are allowed.
        if not sql.strip().upper().startswith("SELECT"):
            logger.warning("SQL does not start with SELECT")
            return False

        # Reject any write/DDL keywords anywhere in the statement.
        if UNSAFE_PATTERNS.search(sql):
            logger.warning("SQL contains unsafe operations")
            return False

        return True
|
||||||
@@ -0,0 +1,384 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — PostgreSQL + pgvector Connector
|
||||||
|
|
||||||
|
Connection pool management, schema initialization, and query execution.
|
||||||
|
Single database handles both structured queries (NL-to-SQL) and vector search (pgvector).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import psycopg2
|
||||||
|
import psycopg2.extras
|
||||||
|
from pgvector.psycopg2 import register_vector
|
||||||
|
|
||||||
|
from config.settings import get_settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Schema DDL
# ---------------------------------------------------------------------------

# Idempotent DDL (IF NOT EXISTS throughout) — safe to run on every startup.
# embedding vector(384) matches the sentence-transformer dimensionality used
# elsewhere in the pipeline — TODO confirm against the embedding model.
INIT_SCHEMA_SQL = """
-- Enable pgvector extension
CREATE EXTENSION IF NOT EXISTS vector;

-- Structured business data (replaces BigQuery)
CREATE TABLE IF NOT EXISTS spend_data (
    id SERIAL PRIMARY KEY,
    date DATE,
    country VARCHAR(100),
    branch VARCHAR(100),
    channel VARCHAR(100),
    spend FLOAT,
    revenue FLOAT,
    leads INT,
    conversions INT,
    client_id VARCHAR(100)
);

-- Vector embeddings (replaces ChromaDB)
CREATE TABLE IF NOT EXISTS embeddings (
    id VARCHAR(200) PRIMARY KEY,
    client_id VARCHAR(100),
    chunk_type VARCHAR(50),
    text TEXT,
    metadata JSONB,
    embedding vector(384)
);

-- Forecast cache
CREATE TABLE IF NOT EXISTS forecasts (
    id SERIAL PRIMARY KEY,
    client_id VARCHAR(100),
    branch VARCHAR(100),
    country VARCHAR(100),
    horizon_months INT,
    forecast_data JSONB,
    computed_at TIMESTAMP DEFAULT NOW()
);

-- Indexes
CREATE INDEX IF NOT EXISTS idx_spend_data_client
    ON spend_data (client_id);
CREATE INDEX IF NOT EXISTS idx_spend_data_date
    ON spend_data (client_id, date);
CREATE INDEX IF NOT EXISTS idx_embeddings_client_type
    ON embeddings (client_id, chunk_type);
CREATE INDEX IF NOT EXISTS idx_forecasts_client
    ON forecasts (client_id, branch, country);
"""

# IVFFlat index requires rows to exist — created separately after data load
IVFFLAT_INDEX_SQL = """
CREATE INDEX IF NOT EXISTS idx_embeddings_cosine
ON embeddings USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);
"""
|
||||||
|
|
||||||
|
|
||||||
|
class PostgresConnector:
    """PostgreSQL + pgvector connection manager.

    Holds one lazily-created psycopg2 connection per instance, with helpers
    for schema setup, structured queries, batch writes, and cosine-similarity
    search against the ``embeddings`` table.
    """

    def __init__(self, database_url: Optional[str] = None):
        # Fall back to the configured DATABASE_URL when none is given.
        self.database_url = database_url or get_settings().database_url
        self._conn: Optional[psycopg2.extensions.connection] = None

    def _get_connection(self) -> psycopg2.extensions.connection:
        """Get or create a database connection with retry logic.

        Retries up to 3 times with exponential backoff (1s, then 2s).
        Raises:
            ConnectionError: chained to the last psycopg2 error when every
                attempt fails.
        """
        if self._conn is None or self._conn.closed:
            max_retries = 3
            last_error: Optional[Exception] = None
            for attempt in range(max_retries):
                try:
                    self._conn = psycopg2.connect(self.database_url)
                    # Teach psycopg2 to adapt numpy arrays <-> pgvector values.
                    register_vector(self._conn)
                    logger.info("Connected to PostgreSQL with pgvector support")
                    return self._conn
                except psycopg2.OperationalError as e:
                    last_error = e
                    logger.warning(
                        f"DB connection attempt {attempt + 1}/{max_retries} failed: {e}."
                    )
                    # Back off only when another attempt remains (previously
                    # the code also slept after the final failure).
                    if attempt < max_retries - 1:
                        wait = 2**attempt
                        logger.warning(f"Retrying in {wait}s...")
                        time.sleep(wait)
            # Preserve the root cause for callers/debuggers.
            raise ConnectionError(
                f"Failed to connect to PostgreSQL after {max_retries} attempts"
            ) from last_error
        return self._conn

    def close(self):
        """Close the database connection (safe to call repeatedly)."""
        if self._conn and not self._conn.closed:
            self._conn.close()
            logger.info("PostgreSQL connection closed")

    def init_schema(self):
        """Create tables, indexes, and the pgvector extension if missing.

        Idempotent: the DDL uses IF NOT EXISTS throughout. Raises on failure
        after rolling back.
        """
        conn = self._get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(INIT_SCHEMA_SQL)
                conn.commit()
            logger.info("Database schema initialized successfully")
        except Exception as e:
            conn.rollback()
            logger.error(f"Schema initialization failed: {e}")
            raise

    def create_vector_index(self):
        """Create IVFFlat index — call AFTER data has been loaded into embeddings.

        IVFFlat derives its lists from existing rows, so this is deliberately
        best-effort: failures are logged, not raised.
        """
        conn = self._get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(IVFFLAT_INDEX_SQL)
                conn.commit()
            logger.info("IVFFlat vector index created")
        except Exception as e:
            conn.rollback()
            logger.warning(f"Could not create IVFFlat index (may need more rows): {e}")

    # ------------------------------------------------------------------
    # Query execution
    # ------------------------------------------------------------------

    def execute_query(self, sql: str, params: Optional[tuple] = None) -> pd.DataFrame:
        """
        Execute a SELECT query and return results as a DataFrame.

        Args:
            sql: SQL query string (must be SELECT only)
            params: Query parameters for parameterised queries

        Returns:
            pandas DataFrame with query results
        """
        conn = self._get_connection()
        try:
            df = pd.read_sql_query(sql, conn, params=params)
            # End the implicit read transaction so the connection does not
            # sit "idle in transaction".
            conn.rollback()
            logger.debug(f"Query returned {len(df)} rows")
            return df
        except Exception as e:
            logger.error(f"Query execution failed: {e}")
            conn.rollback()
            raise

    def execute_raw(self, sql: str, params: Optional[tuple] = None) -> List[Dict]:
        """Execute a query and return raw dictionaries.

        Statements that return rows are read-only from this method's point of
        view (rolled back); statements with no result set are committed, so
        this can also run DDL/DML.
        """
        conn = self._get_connection()
        try:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(sql, params)
                if cur.description:
                    results = [dict(row) for row in cur.fetchall()]
                    conn.rollback()
                    return results
            conn.commit()
            return []
        except Exception as e:
            conn.rollback()
            logger.error(f"Raw query execution failed: {e}")
            raise

    def execute_write(self, sql: str, params: Optional[tuple] = None):
        """Execute an INSERT/UPDATE/DELETE statement and commit it."""
        conn = self._get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(sql, params)
            conn.commit()
        except Exception as e:
            conn.rollback()
            logger.error(f"Write execution failed: {e}")
            raise

    def execute_batch(self, sql: str, data: List[tuple], page_size: int = 1000):
        """Execute a batch INSERT using execute_values for performance.

        Args:
            sql: INSERT statement containing a single ``VALUES %s`` placeholder.
            data: Rows as tuples matching the column list in *sql*.
            page_size: Rows per execute_values page.
        """
        conn = self._get_connection()
        try:
            with conn.cursor() as cur:
                psycopg2.extras.execute_values(cur, sql, data, page_size=page_size)
            conn.commit()
            logger.info(f"Batch insert: {len(data)} rows")
        except Exception as e:
            conn.rollback()
            logger.error(f"Batch execution failed: {e}")
            raise

    # ------------------------------------------------------------------
    # pgvector operations
    # ------------------------------------------------------------------

    def upsert_embeddings(self, embeddings_data: List[Dict[str, Any]]):
        """
        Upsert embedding records into the embeddings table.

        Existing ids get their text/metadata/embedding overwritten.

        Args:
            embeddings_data: List of dicts with keys:
                id, client_id, chunk_type, text, metadata, embedding
        """
        conn = self._get_connection()
        sql = """
            INSERT INTO embeddings (id, client_id, chunk_type, text, metadata, embedding)
            VALUES %s
            ON CONFLICT (id) DO UPDATE SET
                text = EXCLUDED.text,
                metadata = EXCLUDED.metadata,
                embedding = EXCLUDED.embedding
        """
        data = [
            (
                d["id"],
                d["client_id"],
                d["chunk_type"],
                d["text"],
                psycopg2.extras.Json(d["metadata"]),  # dict -> JSONB
                np.array(d["embedding"]),  # list -> vector via register_vector
            )
            for d in embeddings_data
        ]
        try:
            with conn.cursor() as cur:
                psycopg2.extras.execute_values(cur, sql, data, page_size=100)
            conn.commit()
            logger.info(f"Upserted {len(data)} embeddings")
        except Exception as e:
            conn.rollback()
            logger.error(f"Embedding upsert failed: {e}")
            raise

    def search_embeddings(
        self,
        query_embedding: np.ndarray,
        client_id: str,
        chunk_type: Optional[str] = None,
        top_k: int = 5,
    ) -> List[Dict]:
        """
        Search for similar embeddings using pgvector cosine similarity.

        Args:
            query_embedding: Query vector (384 dims)
            client_id: Filter by client
            chunk_type: Optional filter by chunk type
            top_k: Number of results to return

        Returns:
            List of dicts with text, metadata, and similarity score
        """
        conn = self._get_connection()
        query_vec = np.array(query_embedding)

        # <=> is pgvector's cosine distance; 1 - distance = similarity.
        if chunk_type:
            sql = """
                SELECT text, metadata, 1 - (embedding <=> %s) AS similarity
                FROM embeddings
                WHERE client_id = %s AND chunk_type = %s
                ORDER BY embedding <=> %s
                LIMIT %s
            """
            params = (query_vec, client_id, chunk_type, query_vec, top_k)
        else:
            sql = """
                SELECT text, metadata, 1 - (embedding <=> %s) AS similarity
                FROM embeddings
                WHERE client_id = %s
                ORDER BY embedding <=> %s
                LIMIT %s
            """
            params = (query_vec, client_id, query_vec, top_k)

        try:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(sql, params)
                results = [dict(row) for row in cur.fetchall()]
                logger.debug(f"Vector search returned {len(results)} results")
                return results
        except Exception as e:
            logger.error(f"Vector search failed: {e}")
            raise

    # ------------------------------------------------------------------
    # Utility
    # ------------------------------------------------------------------

    def get_table_count(self, table: str, client_id: Optional[str] = None) -> int:
        """Get row count for a table, optionally filtered by client_id.

        Returns 0 on any query failure.

        NOTE(review): ``table`` is interpolated into the SQL — only pass
        trusted, internal table names, never user input.
        """
        conn = self._get_connection()
        try:
            with conn.cursor() as cur:
                if client_id:
                    cur.execute(
                        f"SELECT COUNT(*) FROM {table} WHERE client_id = %s",
                        (client_id,),
                    )
                else:
                    cur.execute(f"SELECT COUNT(*) FROM {table}")
                return cur.fetchone()[0]
        except Exception as e:
            logger.error(f"Count query failed: {e}")
            return 0

    def get_spend_data_schema(self, client_id: str) -> Dict:
        """Get metadata about available data for a client — used by NL-to-SQL.

        Returns:
            Dict with countries, branches, channels (lists) and
            date_min/date_max (ISO strings or None). On failure returns the
            same shape with empty lists and None dates.
        """
        conn = self._get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(
                    "SELECT DISTINCT country FROM spend_data WHERE client_id = %s ORDER BY country",
                    (client_id,),
                )
                countries = [row[0] for row in cur.fetchall()]

                cur.execute(
                    "SELECT DISTINCT branch FROM spend_data WHERE client_id = %s ORDER BY branch",
                    (client_id,),
                )
                branches = [row[0] for row in cur.fetchall()]

                cur.execute(
                    "SELECT DISTINCT channel FROM spend_data WHERE client_id = %s ORDER BY channel",
                    (client_id,),
                )
                channels = [row[0] for row in cur.fetchall()]

                cur.execute(
                    "SELECT MIN(date), MAX(date) FROM spend_data WHERE client_id = %s",
                    (client_id,),
                )
                date_range = cur.fetchone()

                return {
                    "countries": countries,
                    "branches": branches,
                    "channels": channels,
                    "date_min": str(date_range[0]) if date_range[0] else None,
                    "date_max": str(date_range[1]) if date_range[1] else None,
                }
        except Exception as e:
            logger.error(f"Schema metadata query failed: {e}")
            return {
                "countries": [],
                "branches": [],
                "channels": [],
                "date_min": None,
                "date_max": None,
            }
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Module-level singleton
# ---------------------------------------------------------------------------

# Lazily-created shared connector instance (one per process).
_connector: Optional[PostgresConnector] = None


def get_connector() -> PostgresConnector:
    """Get the shared PostgresConnector singleton."""
    # NOTE(review): not thread-safe — two concurrent first calls could each
    # build a connector; acceptable if first use happens on a single thread.
    global _connector
    if _connector is None:
        _connector = PostgresConnector()
    return _connector
|
||||||
@@ -0,0 +1,139 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — Web Search Skill
|
||||||
|
|
||||||
|
Primary: Tavily API (clean, summarised results built for LLM agents)
|
||||||
|
Fallback: duckduckgo-search (no API key, no rate limits, free)
|
||||||
|
|
||||||
|
Auto-fallback: if Tavily errors or quota exceeded, silently switch to DuckDuckGo.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
|
from config.settings import get_settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def web_search(
    query: str,
    max_results: int = 5,
    lookback_days: int = 1,
) -> List[Dict]:
    """
    Search the web using Tavily (primary) or DuckDuckGo (fallback).

    Args:
        query: Search query string
        max_results: Maximum number of results
        lookback_days: Only keep results from the last N days

    Returns:
        List of dicts with: title, url, content, date
    """
    hits = _tavily_search(query, max_results)

    # Fall back when the primary returns nothing (error, quota, no key).
    if not hits:
        logger.info("Tavily returned no results, falling back to DuckDuckGo")
        hits = _ddg_search(query, max_results)

    # Drop stale items when a recency window was requested.
    return _filter_recent(hits, lookback_days) if lookback_days > 0 else hits
|
||||||
|
|
||||||
|
|
||||||
|
def _tavily_search(query: str, max_results: int = 5) -> List[Dict]:
    """Search using Tavily API."""
    settings = get_settings()

    if not settings.tavily_api_key:
        logger.info("Tavily API key not configured, skipping")
        return []

    try:
        from tavily import TavilyClient

        client = TavilyClient(api_key=settings.tavily_api_key)
        response = client.search(
            query=query,
            search_depth="advanced",
            max_results=max_results,
        )

        # Normalise Tavily's payload to the shared result shape.
        hits = [
            {
                "title": item.get("title", ""),
                "url": item.get("url", ""),
                "content": item.get("content", ""),
                "date": item.get("published_date", ""),
                "source": "tavily",
            }
            for item in response.get("results", [])
        ]

        logger.info(f"Tavily returned {len(hits)} results for: {query[:50]}")
        return hits

    except Exception as e:
        logger.warning(f"Tavily search failed: {e}")
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def _ddg_search(query: str, max_results: int = 5) -> List[Dict]:
    """Search using DuckDuckGo (fallback — no API key needed)."""
    try:
        from duckduckgo_search import DDGS

        # Normalise DDG's payload to the shared result shape; DDG gives no
        # publication date.
        with DDGS() as ddgs:
            hits = [
                {
                    "title": r.get("title", ""),
                    "url": r.get("href", ""),
                    "content": r.get("body", ""),
                    "date": "",
                    "source": "duckduckgo",
                }
                for r in ddgs.text(query, max_results=max_results)
            ]

        logger.info(f"DuckDuckGo returned {len(hits)} results for: {query[:50]}")
        return hits

    except Exception as e:
        logger.warning(f"DuckDuckGo search failed: {e}")
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_recent(results: List[Dict], lookback_days: int) -> List[Dict]:
|
||||||
|
"""Filter results to only include items from the last N days."""
|
||||||
|
if not results:
|
||||||
|
return results
|
||||||
|
|
||||||
|
cutoff = datetime.utcnow() - timedelta(days=lookback_days)
|
||||||
|
filtered = []
|
||||||
|
|
||||||
|
for r in results:
|
||||||
|
date_str = r.get("date", "")
|
||||||
|
if not date_str:
|
||||||
|
# No date info — include it (benefit of the doubt)
|
||||||
|
filtered.append(r)
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Try common date formats
|
||||||
|
for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", "%B %d, %Y"):
|
||||||
|
try:
|
||||||
|
dt = datetime.strptime(date_str[:19], fmt)
|
||||||
|
if dt >= cutoff:
|
||||||
|
filtered.append(r)
|
||||||
|
break
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Can't parse date, include it
|
||||||
|
filtered.append(r)
|
||||||
|
except Exception:
|
||||||
|
filtered.append(r)
|
||||||
|
|
||||||
|
return filtered
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
# SOUL — ACME Corporation
|
||||||
|
|
||||||
|
## Identity
|
||||||
|
You are Clawrity, ACME's business intelligence assistant.
|
||||||
|
Speak professionally but conversationally.
|
||||||
|
Always ground answers in data. Never speculate.
|
||||||
|
|
||||||
|
## Business Context
|
||||||
|
- Operates in: US, Canada, MENA
|
||||||
|
- Primary metric: Revenue per lead
|
||||||
|
- Risk tolerance: Conservative (max 15% budget reallocation per suggestion)
|
||||||
|
|
||||||
|
## Rules
|
||||||
|
- If data unavailable, say "I don't have that data right now"
|
||||||
|
- Always surface bottom 3 branches in daily digests
|
||||||
|
- Budget suggestions must cite specific historical data points
|
||||||
|
- Never compare to competitors by name unless from Scout Agent
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
"""
|
||||||
|
Clawrity — SOUL Loader
|
||||||
|
|
||||||
|
Reads the SOUL.md file for a client and returns raw text for prompt injection.
|
||||||
|
SOUL.md defines the AI's personality, business context, and rules per client.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from config.client_loader import ClientConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def load_soul(client_config: ClientConfig) -> str:
    """
    Load the SOUL.md content for a client.

    Args:
        client_config: The client's configuration containing soul_file path.

    Returns:
        Raw markdown text of the SOUL file, or a default prompt if the file
        is missing or unreadable.
    """
    soul_path = Path(client_config.soul_file)

    # Missing file is an expected condition — warn and fall back.
    if not soul_path.exists():
        logger.warning(
            f"SOUL file not found at {soul_path} for client {client_config.client_id}. "
            f"Using default personality."
        )
        return _default_soul(client_config)

    try:
        content = soul_path.read_text(encoding="utf-8")
    except Exception as e:
        # Unreadable file (permissions, encoding, ...) also falls back.
        logger.error(f"Error reading SOUL file {soul_path}: {e}")
        return _default_soul(client_config)

    logger.info(f"Loaded SOUL for {client_config.client_id} from {soul_path}")
    return content
|
||||||
|
|
||||||
|
|
||||||
|
def _default_soul(client_config: ClientConfig) -> str:
    """Generate a minimal default SOUL if the file is missing."""
    name = client_config.client_name
    # Assembled line by line; the result is byte-identical to the
    # triple-quoted template this replaces.
    return (
        f"# SOUL — {name}\n"
        "\n"
        "## Identity\n"
        f"You are Clawrity, {name}'s business intelligence assistant.\n"
        "Speak professionally. Always ground answers in data. Never speculate.\n"
        "\n"
        "## Rules\n"
        '- If data unavailable, say "I don\'t have that data right now"\n'
        "- Always cite specific data points in your responses\n"
    )
|
||||||
Reference in New Issue
Block a user