prototype

This commit is contained in:
2026-05-04 22:00:38 +05:30
commit 711d691870
48 changed files with 5093 additions and 0 deletions
View File
+18
View File
@@ -0,0 +1,18 @@
# HEARTBEAT — ACME Corporation
## Schedule
- trigger: daily
- time: "08:00"
- timezone: "Asia/Kolkata"
## Digest Tasks
1. Pull last 7 days spend + revenue per branch
2. Identify bottom 3 performing branches by revenue
3. Generate newsletter-style summary via Gen Agent → QA Agent
4. Run Scout Agent for competitor + sector news
5. Append Market Intelligence section to digest
6. Push complete digest to Slack channel
## Retry
- on_failure: retry after 15 minutes
- max_retries: 3
+124
View File
@@ -0,0 +1,124 @@
"""
Clawrity — HEARTBEAT Loader
Parses HEARTBEAT.md files to extract schedule, digest tasks, and retry config.
HEARTBEAT.md drives autonomous daily digest generation per client.
"""
import re
import logging
from pathlib import Path
from typing import Optional, Dict, Any
from config.client_loader import ClientConfig
logger = logging.getLogger(__name__)
class HeartbeatConfig:
    """Parsed heartbeat configuration with safe defaults."""

    def __init__(self):
        # Schedule defaults; overridden by values parsed from HEARTBEAT.md.
        self.trigger: str = "daily"
        self.time: str = "08:00"
        self.timezone: str = "UTC"
        # Retry policy defaults.
        self.retry_delay_minutes: int = 15
        self.max_retries: int = 3
        # Ordered digest task descriptions from the "Digest Tasks" section.
        self.tasks: list = []
        # Original markdown text, kept for inspection/debugging.
        self.raw_content: str = ""

    @property
    def hour(self) -> int:
        """Hour component of the "HH:MM" time string."""
        parts = self.time.split(":")
        return int(parts[0])

    @property
    def minute(self) -> int:
        """Minute component of the "HH:MM" time string."""
        parts = self.time.split(":")
        return int(parts[1])
def load_heartbeat(client_config: ClientConfig) -> HeartbeatConfig:
    """
    Load and parse the HEARTBEAT.md file for a client.

    Falls back to defaults from the client YAML (timezone and digest
    schedule) when the file is missing or cannot be parsed.

    Args:
        client_config: The client's configuration containing heartbeat_file path.

    Returns:
        Parsed HeartbeatConfig with schedule, tasks, and retry settings.
    """
    heartbeat = HeartbeatConfig()
    # Use client YAML timezone as fallback
    heartbeat.timezone = client_config.timezone
    heartbeat_path = Path(client_config.heartbeat_file)

    if not heartbeat_path.exists():
        logger.warning(
            f"HEARTBEAT file not found at {heartbeat_path} for client "
            f"{client_config.client_id}. Using defaults from client YAML."
        )
        heartbeat.time = client_config.digest_schedule
        return heartbeat

    try:
        text = heartbeat_path.read_text(encoding="utf-8")
        heartbeat.raw_content = text
        _parse_heartbeat(text, heartbeat)
        logger.info(
            f"Loaded HEARTBEAT for {client_config.client_id}: "
            f"{heartbeat.trigger} at {heartbeat.time} {heartbeat.timezone}"
        )
    except Exception as exc:
        # Any parse failure degrades to the YAML-configured schedule.
        logger.error(f"Error parsing HEARTBEAT file {heartbeat_path}: {exc}")
        heartbeat.time = client_config.digest_schedule
    return heartbeat
def _parse_heartbeat(content: str, config: HeartbeatConfig) -> None:
"""Parse markdown content and extract structured config."""
lines = content.split("\n")
current_section = None
task_lines = []
for line in lines:
stripped = line.strip()
# Detect section headers
if stripped.startswith("## "):
current_section = stripped[3:].strip().lower()
continue
if current_section == "schedule":
# Parse key-value pairs like "- trigger: daily"
match = re.match(r"-\s*(\w+):\s*\"?([^\"]+)\"?", stripped)
if match:
key, value = match.group(1).strip(), match.group(2).strip()
if key == "trigger":
config.trigger = value
elif key == "time":
config.time = value
elif key == "timezone":
config.timezone = value
elif current_section == "digest tasks":
# Parse numbered list items
match = re.match(r"\d+\.\s+(.*)", stripped)
if match:
config.tasks.append(match.group(1).strip())
elif current_section == "retry":
# Parse retry config
match = re.match(r"-\s*(\w+):\s*(.+)", stripped)
if match:
key, value = match.group(1).strip(), match.group(2).strip()
if "retry" in key and "after" in value:
# Extract minutes from "retry after 15 minutes"
mins = re.search(r"(\d+)", value)
if mins:
config.retry_delay_minutes = int(mins.group(1))
elif key == "max_retries":
config.max_retries = int(value)
+295
View File
@@ -0,0 +1,295 @@
"""
Clawrity — HEARTBEAT Scheduler
APScheduler AsyncIOScheduler fires digest jobs per client at configured times.
Schedule: ETL at 02:00 → RAG re-index at 03:00 → Digest + Scout at configured time.
Retry: on failure, retry after N minutes, max retries from HEARTBEAT.md.
"""
import asyncio
import json
import logging
import os
from datetime import datetime
from typing import Dict, Optional
import httpx
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from agents.orchestrator import Orchestrator
from channels.protocol_adapter import NormalisedMessage
from config.client_loader import ClientConfig
from config.settings import get_settings
from heartbeat.heartbeat_loader import load_heartbeat
from skills.postgres_connector import get_connector
from soul.soul_loader import load_soul
logger = logging.getLogger(__name__)
async def run_digest(
    client_config: ClientConfig,
    orchestrator: Orchestrator,
    retry_count: int = 0,
) -> Optional[str]:
    """
    Run the daily digest for a client.

    Steps:
        1. Query bottom 3 branches by revenue (last 7 days)
        2. Gen Agent → QA Agent pipeline for digest
        3. Scout Agent for competitor/sector news
        4. Push to Slack webhook
        5. Log success/failure to JSONL

    Args:
        client_config: Per-client configuration (client_id, channels, heartbeat file).
        orchestrator: Shared orchestrator; its ``retriever`` (when set) supplies
            RAG chunks used as extra digest context.
        retry_count: Zero-based attempt number; incremented on recursive retry.

    Returns:
        Full digest text if successful, None on failure
    """
    # Imported inside the function rather than at module level — presumably to
    # avoid a circular import with the agents package; confirm before hoisting.
    from agents.gen_agent import GenAgent
    from agents.qa_agent import QAAgent

    client_id = client_config.client_id
    logger.info(f"[{client_id}] Running daily digest (attempt {retry_count + 1})")
    db = get_connector()
    try:
        # Step 1: Get bottom 3 branches by revenue with ROI.
        # NULLIF guards the ROI division against zero-spend branches.
        bottom_sql = """
            SELECT branch, country,
                   SUM(revenue) as total_revenue,
                   SUM(spend) as total_spend,
                   SUM(leads) as total_leads,
                   ROUND((SUM(revenue)/NULLIF(SUM(spend),0))::numeric, 2) as roi
            FROM spend_data
            WHERE client_id = %s
              AND date >= CURRENT_DATE - INTERVAL '7 days'
            GROUP BY branch, country
            ORDER BY total_revenue ASC
            LIMIT 3
        """
        data = db.execute_query(bottom_sql, (client_id,))

        # Step 2: Generate digest via Gen Agent with specific prompt
        soul_content = load_soul(client_config)
        gen_agent = GenAgent()
        qa_agent = QAAgent()

        # Retrieve RAG chunks for digest context. Retrieval failure is
        # non-fatal: the digest is generated without the extra context.
        rag_chunks = None
        if orchestrator.retriever:
            try:
                rag_chunks = orchestrator.retriever.retrieve(
                    query="weekly performance bottom performers budget recommendations",
                    client_id=client_id,
                )
            except Exception as e:
                logger.warning(f"RAG retrieval for digest failed: {e}")

        # Generate digest with explicit prompt
        digest = gen_agent.generate(
            question="Generate morning business digest. Highlight bottom 3 branches. Suggest where to focus budget. Newsletter style.",
            soul_content=soul_content,
            data_context=data,
            rag_chunks=rag_chunks,
        )

        # Step 2b: QA pass on digest (more lenient threshold for digest)
        qa_result = qa_agent.evaluate(
            response=digest,
            data_context=data,
            threshold=0.6,  # More lenient for digest
        )
        if not qa_result["passed"]:
            logger.warning(
                f"[{client_id}] Digest QA failed (score={qa_result['score']:.2f}), "
                f"retrying with strict instruction"
            )
            # Retry digest generation with strict instruction.
            # NOTE(review): the regenerated digest is NOT re-evaluated, so the
            # qa_score/qa_passed logged in Step 6 reflect the FIRST (failed)
            # QA pass, not the digest that actually ships.
            digest = gen_agent.generate(
                question="Generate morning business digest. Highlight bottom 3 branches. Suggest where to focus budget. Newsletter style.",
                soul_content=soul_content,
                data_context=data,
                rag_chunks=rag_chunks,
                retry_issues=qa_result["issues"],
                retry_count=1,
                strict_data_instruction=(
                    "CRITICAL: Only mention branches and figures that appear in the "
                    "Data Context. Do not reference any other branches or historical data."
                ),
            )

        # Step 3: Scout Agent for competitor/sector news. Best-effort — the
        # digest still ships without the Market Intelligence section.
        scout_section = None
        try:
            from agents.scout_agent import ScoutAgent
            scout = ScoutAgent()
            scout_section = await scout.gather_intelligence(client_config)
        except Exception as e:
            logger.warning(f"Scout Agent failed: {e}")

        # Step 4: Assemble full digest
        full_digest = f"📊 **Clawrity Daily Digest — {client_config.client_name}**\n"
        full_digest += f"*{datetime.now().strftime('%B %d, %Y')}*\n\n"
        full_digest += digest
        if scout_section:
            full_digest += f"\n\n---\n\n{scout_section}"

        # Step 5: Push to Slack webhook
        webhook_url = client_config.channels.get("slack_webhook", "")
        if webhook_url:
            await _push_to_slack(webhook_url, full_digest)
        else:
            logger.warning(f"[{client_id}] No Slack webhook configured")

        # Step 6: Log success to JSONL
        _log_digest_event(client_id, "success", {
            "qa_score": qa_result["score"],
            "qa_passed": qa_result["passed"],
            "scout_included": scout_section is not None,
            "digest_length": len(full_digest),
        })
        logger.info(f"[{client_id}] Digest completed successfully")
        return full_digest
    except Exception as e:
        logger.error(f"[{client_id}] Digest failed: {e}", exc_info=True)
        _log_digest_event(client_id, "failure", {"error": str(e), "attempt": retry_count + 1})
        # Retry policy (delay + max attempts) comes from HEARTBEAT.md.
        heartbeat = load_heartbeat(client_config)
        if retry_count < heartbeat.max_retries:
            delay_minutes = heartbeat.retry_delay_minutes
            logger.info(
                f"[{client_id}] Scheduling digest retry in {delay_minutes} minutes "
                f"(attempt {retry_count + 2}/{heartbeat.max_retries + 1})"
            )
            # NOTE(review): the retry sleeps in-task and recurses, so this
            # coroutine stays alive for the entire delay window.
            await asyncio.sleep(delay_minutes * 60)
            return await run_digest(client_config, orchestrator, retry_count + 1)
        else:
            logger.error(f"[{client_id}] Digest failed after {heartbeat.max_retries + 1} attempts")
            # Post failure notification to Slack
            webhook_url = client_config.channels.get("slack_webhook", "")
            if webhook_url:
                await _push_to_slack(
                    webhook_url,
                    "Clawrity digest unavailable. Backend may be offline."
                )
            return None
async def _push_to_slack(webhook_url: str, message: str):
    """Push a message to a Slack incoming webhook."""
    payload = {"text": message}
    try:
        async with httpx.AsyncClient() as http_client:
            resp = await http_client.post(
                webhook_url,
                json=payload,
                timeout=30,
            )
            # Slack incoming webhooks answer 200 on success.
            if resp.status_code == 200:
                logger.info("Digest pushed to Slack successfully")
            else:
                logger.error(f"Slack webhook returned {resp.status_code}: {resp.text}")
    except Exception as e:
        # Delivery failure is logged, never raised to the caller.
        logger.error(f"Failed to push digest to Slack: {e}")
def _log_digest_event(client_id: str, status: str, details: dict):
    """Append a digest lifecycle event to the client's JSONL monitoring file.

    Args:
        client_id: Client identifier; selects the per-client log file.
        status: Event outcome label, e.g. "success" or "failure".
        details: Extra key/value pairs merged into the JSON entry.
    """
    # Local import: the module only does `from datetime import datetime`.
    from datetime import timezone

    settings = get_settings()
    logs_dir = settings.logs_dir
    os.makedirs(logs_dir, exist_ok=True)
    log_path = os.path.join(logs_dir, f"{client_id}_digest.jsonl")
    entry = {
        # FIX: datetime.utcnow() is naive and deprecated (Python 3.12+);
        # emit an aware UTC timestamp so the ISO string carries its offset.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "client_id": client_id,
        "event": "digest",
        "status": status,
        **details,
    }
    try:
        # Append-only JSONL; explicit UTF-8 so non-ASCII survives any locale.
        with open(log_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry) + "\n")
    except Exception as e:
        # A logging failure must never break the digest run itself.
        logger.error(f"Failed to log digest event: {e}")
def start_scheduler(
    client_configs: Dict[str, ClientConfig],
    orchestrator: Orchestrator,
) -> AsyncIOScheduler:
    """
    Start the APScheduler with digest jobs for all clients.

    Schedule per client:
        - Digest at configured time (from HEARTBEAT.md)
        - ETL sync at 02:00 (placeholder)
        - RAG re-index at 03:00 (placeholder)
    """
    scheduler = AsyncIOScheduler()

    for client_id, client_cfg in client_configs.items():
        heartbeat = load_heartbeat(client_cfg)
        tz = heartbeat.timezone

        # Daily digest fired at the HEARTBEAT-configured local time.
        scheduler.add_job(
            run_digest,
            CronTrigger(
                hour=heartbeat.hour,
                minute=heartbeat.minute,
                timezone=tz,
            ),
            args=[client_cfg, orchestrator],
            id=f"digest_{client_id}",
            name=f"Daily Digest — {client_cfg.client_name}",
            replace_existing=True,
        )
        logger.info(
            f"Scheduled digest for {client_id}: "
            f"{heartbeat.time} {heartbeat.timezone}"
        )

        # Nightly ETL sync at 02:00 (placeholder job).
        scheduler.add_job(
            _etl_sync_placeholder,
            CronTrigger(hour=2, minute=0, timezone=tz),
            args=[client_id],
            id=f"etl_{client_id}",
            name=f"ETL Sync — {client_cfg.client_name}",
            replace_existing=True,
        )

        # Nightly RAG re-index at 03:00 (placeholder job).
        scheduler.add_job(
            _rag_reindex_placeholder,
            CronTrigger(hour=3, minute=0, timezone=tz),
            args=[client_id],
            id=f"rag_reindex_{client_id}",
            name=f"RAG Re-index — {client_cfg.client_name}",
            replace_existing=True,
        )

    scheduler.start()
    return scheduler
async def _etl_sync_placeholder(client_id: str) -> None:
    """Stub job for the nightly ETL data sync; no real sync implemented yet."""
    logger.info(f"[{client_id}] ETL sync triggered (placeholder)")
async def _rag_reindex_placeholder(client_id: str) -> None:
    """Stub job for nightly RAG re-indexing; delegates to the pipeline script."""
    logger.info(f"[{client_id}] RAG re-index triggered (placeholder)")
    try:
        # Import lazily so a missing pipeline script degrades to a warning.
        from scripts.run_rag_pipeline import run_pipeline
        run_pipeline(client_id)
    except Exception as exc:
        logger.warning(f"RAG re-index failed: {exc}")