Fix response redundancy and add retry-based backend communication

2026-05-05 17:58:58 +05:30
parent 711d691870
commit ba61963d6f
12 changed files with 880 additions and 287 deletions
+117 -18
@@ -12,7 +12,7 @@ from typing import List, Optional, Dict
import pandas as pd
-from config.llm_client import get_llm_client, get_model_name
+from config.llm_client import get_llm_client, get_model_name, chat_with_retry
logger = logging.getLogger(__name__)
@@ -35,6 +35,7 @@ class GenAgent:
retry_count: int = 0,
strict_data_instruction: Optional[str] = None,
supplementary_context: Optional[pd.DataFrame] = None,
+sql: Optional[str] = None,
) -> str:
"""
Generate a data-grounded response.
@@ -46,6 +47,7 @@ class GenAgent:
rag_chunks: Retrieved chunks with similarity scores (Phase 2)
retry_issues: QA Agent issues from previous attempt
retry_count: Current retry number (0-2)
+sql: The SQL query that produced the data context
Returns:
Markdown-formatted response string
@@ -53,12 +55,19 @@ class GenAgent:
temperature = max(0.1, self.base_temperature - (retry_count * 0.2))
prompt = self._build_prompt(
-question, soul_content, data_context, rag_chunks, retry_issues,
-strict_data_instruction, supplementary_context,
+question,
+soul_content,
+data_context,
+rag_chunks,
+retry_issues,
+strict_data_instruction,
+supplementary_context,
+sql,
)
try:
-response = self.client.chat.completions.create(
+response = chat_with_retry(
+self.client,
model=self.model,
messages=[
{"role": "system", "content": soul_content},
@@ -108,7 +117,8 @@ class GenAgent:
Use bullet points, bold key numbers, and keep it concise."""
try:
-response = self.client.chat.completions.create(
+response = chat_with_retry(
+self.client,
model=self.model,
messages=[
{"role": "system", "content": soul_content},
@@ -131,18 +141,28 @@ Use bullet points, bold key numbers, and keep it concise."""
retry_issues: Optional[List[str]],
strict_data_instruction: Optional[str] = None,
supplementary_context: Optional[pd.DataFrame] = None,
+sql: Optional[str] = None,
) -> str:
"""Build the augmented prompt for response generation."""
parts = []
-# Strict data instruction (on retry — prevents hallucination)
+# Strict data instruction — prevents hallucination
if strict_data_instruction:
parts.append(f"## ⚠️ STRICT REQUIREMENT\n{strict_data_instruction}\n")
-# Data context
+# SQL query that produced the data (so the model knows what filters were applied)
+if sql:
+parts.append(f"## SQL Query Used\n```sql\n{sql}\n```\n")
+# Data context with computed summaries
if data_context is not None and len(data_context) > 0:
parts.append("## Data Context (query results for the user's question)")
parts.append(data_context.to_markdown(index=False))
+# Compute summary statistics to help the LLM cite precise numbers
+summary = self._compute_summary(data_context)
+if summary:
+parts.append(f"\n### Computed Summary\n{summary}")
else:
parts.append("## Data Context\nNo query results available.")
@@ -150,22 +170,31 @@ Use bullet points, bold key numbers, and keep it concise."""
if supplementary_context is not None and len(supplementary_context) > 0:
parts.append("\n## Benchmark Data (top-performing branches for comparison)")
parts.append(supplementary_context.to_markdown(index=False))
+bench_summary = self._compute_summary(supplementary_context)
+if bench_summary:
+parts.append(f"\n### Benchmark Summary\n{bench_summary}")
parts.append(
"\nUse this benchmark data to compare the queried branch's performance "
"against top performers. Identify which channels and strategies work "
"best, and recommend specific, actionable improvements based on what "
"top-performing branches are doing differently."
"\n### How to use benchmark data\n"
"Compare the queried branch's metrics against these top performers:\n"
"- If the queried branch's ROI is lower than benchmarks, recommend shifting budget to higher-ROI channels\n"
"- If a channel underperforms vs benchmarks, suggest reducing spend or optimizing it\n"
"- Cite SPECIFIC numbers: 'Your Email ROI is 2.29 vs the top performer's 2.50'\n"
"- Be concrete: 'Shift $X from Facebook to Email based on the ROI difference'"
)
# RAG chunks (Phase 2)
if rag_chunks:
-parts.append("\n## Historical Business Context (retrieved from intelligence layer)")
+parts.append(
+"\n## Historical Business Context (retrieved from intelligence layer)"
+)
if strict_data_instruction:
-parts.append("⚠️ ONLY use historical context that is about branches/entities in the Data Context above. IGNORE any historical context about other branches.")
+parts.append(
+"⚠️ ONLY use historical context that is about branches/entities in the Data Context above. IGNORE any historical context about other branches."
+)
for i, chunk in enumerate(rag_chunks, 1):
sim = chunk.get("similarity", 0)
parts.append(f"{i}. {chunk['text']} (relevance: {sim:.2f})")
parts.append("\nBase suggestions on historical context. Cite specific data points.")
# Retry instructions
if retry_issues:
@@ -173,12 +202,82 @@ Use bullet points, bold key numbers, and keep it concise."""
parts.append("Your previous response had these problems. Fix them:")
for issue in retry_issues:
parts.append(f"- {issue}")
-parts.append("Be more precise. Only state facts supported by the data above.")
-parts.append("Do NOT introduce any new branches, cities, or figures that are not in the Data Context.")
+parts.append(
+"Be more precise. Only state facts supported by the data above."
+)
+parts.append(
+"Do NOT introduce any new branches, cities, or figures that are not in the Data Context."
+)
# User question
parts.append(f"\n## User Question\n{question}")
-parts.append("\nProvide a professional, data-grounded response. Cite specific numbers from the data.")
+# Response quality instructions
+parts.append(
+"\n## Response Quality Rules\n"
+"1. ALWAYS cite specific numbers from the Data Context (e.g., '$29,941 revenue', 'ROI of 2.29')\n"
+"2. When comparing channels or branches, use EXACT figures from the data — never round unless using ~\n"
+"3. For recommendations, reference specific metrics: 'Email has ROI 2.29 vs Facebook's 2.06 — consider reallocating budget'\n"
+"4. Structure your answer with clear sections: Data Summary → Analysis → Recommendations\n"
+"5. Do NOT give generic advice — every recommendation must tie to a specific data point\n"
+"6. Do NOT mention branches, cities, or figures that are not in the Data Context above\n"
+"7. Keep the response concise but data-dense — prefer bullet points over paragraphs"
+)
return "\n".join(parts)
+def _compute_summary(self, df: pd.DataFrame) -> str:
+"""Compute summary statistics from a DataFrame to help the LLM cite precise numbers."""
+if df is None or len(df) == 0:
+return ""
+lines = []
+numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
+# Total row
+totals = {}
+for col in numeric_cols:
+total = df[col].sum()
+if total != 0:
+totals[col] = total
+if totals:
+total_parts = []
+for col, val in totals.items():
+if val >= 1_000_000:
+total_parts.append(f"Total {col}: ${val / 1_000_000:.2f}M")
+elif val >= 1_000:
+total_parts.append(f"Total {col}: ${val:,.2f}")
+else:
+total_parts.append(f"Total {col}: {val:,.0f}")
+lines.append(" | ".join(total_parts))
+# ROI if revenue and spend columns exist
+rev_col = next((c for c in numeric_cols if "revenue" in c.lower()), None)
+spend_col = next((c for c in numeric_cols if "spend" in c.lower()), None)
+if rev_col and spend_col:
+total_rev = df[rev_col].sum()
+total_spend = df[spend_col].sum()
+if total_spend > 0:
+lines.append(f"Overall ROI: {total_rev / total_spend:.2f}")
+# Per-row highlights (top/bottom)
+if rev_col and len(df) > 1:
+idx_max = df[rev_col].idxmax()
+idx_min = df[rev_col].idxmin()
+label_col = None
+for candidate in ["branch", "channel", "country", "name"]:
+if candidate in df.columns:
+label_col = candidate
+break
+if label_col:
+top = df.loc[idx_max]
+bot = df.loc[idx_min]
+lines.append(
+f"Highest {rev_col}: {top[label_col]} (${top[rev_col]:,.2f})"
+)
+lines.append(
+f"Lowest {rev_col}: {bot[label_col]} (${bot[rev_col]:,.2f})"
+)
+return "\n".join(lines) if lines else ""
+26 -19
@@ -118,7 +118,9 @@ class Orchestrator:
qa_threshold = client_config.hallucination_threshold
if supplementary_context is not None and len(supplementary_context) > 0:
qa_threshold = min(qa_threshold, 0.5)
logger.info(f"Using relaxed QA threshold ({qa_threshold}) for enriched context")
logger.info(
f"Using relaxed QA threshold ({qa_threshold}) for enriched context"
)
best_response = None
best_score = 0.0
@@ -128,23 +130,23 @@ class Orchestrator:
for attempt in range(MAX_RETRIES + 1):
retry_issues = qa_result["issues"] if attempt > 0 else None
-# On retry, add explicit data-only instruction to prevent hallucination
-strict_data_instruction = None
-if attempt > 0:
-if supplementary_context is not None and len(supplementary_context) > 0:
-strict_data_instruction = (
-"CRITICAL: Only use data from the Data Context and Benchmark Data "
-"sections provided. Do NOT invent figures or branch names that are "
-"not present in either of those sections. You MAY reference benchmark "
-"branches for comparison and recommendations."
-)
-else:
-strict_data_instruction = (
-"CRITICAL: Do NOT mention any branches, figures, or historical data "
-"that are not in the SQL query result provided. Stick strictly to the "
-"data. If historical context from RAG is about different branches than "
-"what the query returned, IGNORE that context entirely."
-)
+# Always provide strict data grounding instruction to prevent
+# the Gen Agent from hallucinating branch/figure data from RAG
+# chunks that don't match the actual SQL query results.
+if supplementary_context is not None and len(supplementary_context) > 0:
+strict_data_instruction = (
+"CRITICAL: Only use data from the Data Context and Benchmark Data "
+"sections provided. Do NOT invent figures or branch names that are "
+"not present in either of those sections. You MAY reference benchmark "
+"branches for comparison and recommendations."
+)
+else:
+strict_data_instruction = (
+"CRITICAL: Do NOT mention any branches, figures, or historical data "
+"that are not in the SQL query result provided. Stick strictly to the "
+"data. If historical context from RAG is about different branches than "
+"what the query returned, IGNORE that context entirely."
+)
response = self.gen_agent.generate(
question=message.text,
@@ -155,6 +157,7 @@ class Orchestrator:
retry_count=attempt,
strict_data_instruction=strict_data_instruction,
supplementary_context=supplementary_context,
+sql=sql,
)
qa_result = self.qa_agent.evaluate(
@@ -163,6 +166,7 @@ class Orchestrator:
threshold=qa_threshold,
supplementary_context=supplementary_context,
user_question=message.text,
+sql=sql,
)
# Track best response (prefer longer, richer responses over "no data" stubs)
@@ -256,7 +260,9 @@ class Orchestrator:
top_performers = db.execute_query(enrichment_sql, (client_id,))
if top_performers is not None and len(top_performers) > 0:
logger.info(f"Enrichment: fetched {len(top_performers)} top performer rows")
logger.info(
f"Enrichment: fetched {len(top_performers)} top performer rows"
)
return top_performers
except Exception as e:
@@ -273,6 +279,7 @@ class Orchestrator:
"""Log interaction for monitoring."""
try:
from rag.monitoring import log_interaction
log_interaction(
client_id=client_config.client_id,
query=message.text,
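
The loop above initializes best_response / best_score, and a comment notes that it prefers longer, richer responses over "no data" stubs, but the tracking itself sits outside the displayed hunks. A plausible sketch of that bookkeeping, with the length tie-breaker as an assumption:

```python
from typing import Optional, Tuple


def track_best(
    response: str,
    score: float,
    best: Tuple[Optional[str], float],
) -> Tuple[str, float]:
    """Keep the highest-scoring response seen so far; on ties, prefer the
    longer answer so a data-rich response beats a "no data" stub.

    Hypothetical helper: the real orchestrator likely does this inline in
    the retry loop, right after qa_agent.evaluate() returns.
    """
    best_response, best_score = best
    if (
        best_response is None
        or score > best_score
        or (score == best_score and len(response) > len(best_response))
    ):
        return response, score
    return best_response, best_score
```

In the loop this would run once per attempt, with an early break as soon as qa_result["passed"] is true.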
+121 -7
@@ -9,11 +9,12 @@ Threshold from client YAML hallucination_threshold (default 0.75).
import json
import logging
+import re
from typing import Optional, List, Dict
import pandas as pd
-from config.llm_client import get_llm_client, get_model_name
+from config.llm_client import get_llm_client, get_model_name, chat_with_retry
logger = logging.getLogger(__name__)
@@ -32,7 +33,9 @@ Your job: verify that the response ONLY contains claims supported by the provide
### 1. Branch Name Validation (CRITICAL)
- Extract ALL branch/city names mentioned in the response
- Compare against the branch names in the Data Context above
-- If ANY branch name appears in the response but NOT in the Data Context, this is a HALLUCINATION
+- Branch/entity names listed under "Valid Entities from User Question" are VALID even if not listed in query results
+- Branch/entity names listed under "Branches/entities filtered in SQL WHERE clause" are VALID even if not in result rows (e.g., if SQL has WHERE branch = 'X', then 'X' is valid context)
+- If ANY branch name appears in the response but NOT in the Data Context, the valid-entities list, or the SQL WHERE clause filters, this is a HALLUCINATION
- Deduct 0.3 from score for EACH unrelated branch mentioned
### 2. Numerical Accuracy (CRITICAL)
@@ -83,6 +86,7 @@ class QAAgent:
threshold: float = 0.75,
supplementary_context: Optional[pd.DataFrame] = None,
user_question: str = "",
+sql: Optional[str] = None,
) -> Dict:
"""
Evaluate a response for faithfulness.
@@ -93,6 +97,7 @@ class QAAgent:
threshold: Minimum score to pass (from client YAML)
supplementary_context: Benchmark data (top performers) that is also valid ground truth
user_question: The user's original question (entities mentioned here are valid context)
+sql: The SQL query that produced the data context (branch/entity filters are valid context)
Returns:
Dict with score (float), passed (bool), issues (list[str])
@@ -103,6 +108,20 @@ class QAAgent:
else:
data_str = "No structured data available."
+# Include the SQL query so QA understands what filters were applied
+# (e.g., branch names in WHERE clause are valid context even if not in result rows)
+if sql:
+data_str += (
+f"\n\n### SQL Query (defines the data scope)\n```sql\n{sql}\n```"
+)
+# Extract branch/entity filters from SQL WHERE clause
+where_branches = self._extract_where_entities(sql)
+if where_branches:
+data_str += (
+f"\nBranches/entities filtered in SQL WHERE clause (VALID context): "
+f"{', '.join(sorted(where_branches))}"
+)
# Include supplementary (benchmark) context as valid ground truth
if supplementary_context is not None and len(supplementary_context) > 0:
data_str += "\n\n### Benchmark Data (also valid ground truth)\n"
@@ -110,7 +129,16 @@ class QAAgent:
# Include user question so QA knows which entities are valid context
if user_question:
data_str += f"\n\n### User Question Context\nThe user asked: \"{user_question}\"\nBranch/entity names mentioned in the user's question are valid to reference in the response."
entities = self._extract_entities(user_question)
if entities:
entity_list = ", ".join(sorted(entities))
else:
entity_list = "(none)"
data_str += (
"\n\n### User Question Context\n"
f'The user asked: "{user_question}"\n'
f"Valid Entities from User Question: {entity_list}"
)
prompt = EVAL_PROMPT.format(
data_context=data_str,
@@ -119,10 +147,14 @@ class QAAgent:
)
try:
-result = self.client.chat.completions.create(
+result = chat_with_retry(
+self.client,
model=self.model,
messages=[
{"role": "system", "content": "You are a strict QA evaluator. Return only valid JSON. Pay special attention to branch names and figures that appear in the response but NOT in the data context — these are hallucinations."},
{
"role": "system",
"content": "You are a strict QA evaluator. Return only valid JSON. Pay special attention to branch names and figures that appear in the response but NOT in the data context — these are hallucinations.",
},
{"role": "user", "content": prompt},
],
temperature=0.1,
@@ -140,7 +172,11 @@ class QAAgent:
except Exception as e:
logger.error(f"QA evaluation failed: {e}")
# On failure, pass with warning
return {"score": 0.5, "passed": True, "issues": [f"QA evaluation error: {str(e)}"]}
return {
"score": 0.5,
"passed": True,
"issues": [f"QA evaluation error: {str(e)}"],
}
def _parse_response(self, raw: str, threshold: float) -> Dict:
"""Parse JSON response from QA LLM call."""
@@ -162,4 +198,82 @@ class QAAgent:
}
except (json.JSONDecodeError, ValueError) as e:
logger.warning(f"Could not parse QA response: {e}. Raw: {raw[:200]}")
return {"score": 0.5, "passed": True, "issues": ["QA response parsing failed"]}
return {
"score": 0.5,
"passed": True,
"issues": ["QA response parsing failed"],
}
def _extract_where_entities(self, sql: str) -> List[str]:
"""Extract branch/city entity names from SQL WHERE clause filters."""
if not sql:
return []
entities = set()
# Match patterns like: branch = 'Seattle', city = 'Toronto'
for match in re.finditer(
r"(?:branch|city|country)\s*=\s*'([^']+)'",
sql,
re.IGNORECASE,
):
val = match.group(1).strip()
if val and len(val) > 1:
entities.add(val)
# Also handle IN ('val1', 'val2') patterns
for match in re.finditer(
r"(?:branch|city|country)\s+IN\s*\(([^)]+)\)",
sql,
re.IGNORECASE,
):
for val in re.findall(r"'([^']+)'", match.group(1)):
if val and len(val) > 1:
entities.add(val)
return list(entities)
def _extract_entities(self, text: str) -> List[str]:
"""Extract likely branch/city entities from a user question."""
if not text:
return []
lowered = text.lower()
patterns = [
r"\bbranch\s+([a-z][a-z\s\-']{1,60})",
r"\bin\s+([a-z][a-z\s\-']{1,60})",
r"\bfor\s+the\s+([a-z][a-z\s\-']{1,60})\s+branch",
]
stops = {
"the",
"a",
"an",
"my",
"our",
"this",
"that",
"these",
"those",
"branch",
"branches",
"revenue",
"sales",
"roi",
"profit",
"performance",
}
entities = set()
for pattern in patterns:
for match in re.findall(pattern, lowered):
candidate = match.strip(" .,!?:;\"'")
candidate = " ".join(candidate.split())
if not candidate:
continue
if candidate in stops:
continue
if any(word in stops for word in candidate.split()):
candidate = " ".join(w for w in candidate.split() if w not in stops)
candidate = candidate.strip()
if len(candidate) < 2:
continue
entities.add(candidate.title())
return list(entities)
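
To make the new extraction helpers concrete, here is a small standalone driver exercising the same regexes; the sample SQL and question are invented for illustration:

```python
import re

sql = (
    "SELECT channel, revenue, spend FROM campaigns "
    "WHERE branch = 'Seattle' AND city IN ('Toronto', 'Vancouver')"
)

# Equality filters, as matched by _extract_where_entities
print(re.findall(r"(?:branch|city|country)\s*=\s*'([^']+)'", sql, re.IGNORECASE))
# -> ['Seattle']

# IN ('val1', 'val2') lists
for m in re.finditer(r"(?:branch|city|country)\s+IN\s*\(([^)]+)\)", sql, re.IGNORECASE):
    print(re.findall(r"'([^']+)'", m.group(1)))
# -> ['Toronto', 'Vancouver']

# _extract_entities("What is the revenue for the Seattle branch?") lower-cases
# the text, matches the r"\bfor\s+the\s+(...)\s+branch" pattern, filters stop
# words, and title-cases the survivor, yielding ['Seattle'].
```

Together these feed the evaluator's "VALID context" lists, so a response mentioning Seattle is not flagged as a hallucination even when the result rows contain only channel-level aggregates.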
+29 -13
@@ -12,7 +12,7 @@ import logging
from datetime import datetime
from typing import Optional
-from config.llm_client import get_llm_client, get_model_name
+from config.llm_client import get_llm_client, get_model_name, chat_with_retry
from config.client_loader import ClientConfig
from config.settings import get_settings
from skills.web_search import web_search
@@ -99,8 +99,7 @@ class ScoutAgent:
# Format results for LLM
results_text = "\n\n".join(
f"**{r['title']}** ({r['url']})\n{r['content']}"
for r in all_results
f"**{r['title']}** ({r['url']})\n{r['content']}" for r in all_results
)
# Summarize with Groq
@@ -113,10 +112,14 @@ class ScoutAgent:
)
try:
-response = self.client.chat.completions.create(
+response = chat_with_retry(
+self.client,
model=self.model,
messages=[
{"role": "system", "content": "You are a business intelligence scout."},
{
"role": "system",
"content": "You are a business intelligence scout.",
},
{"role": "user", "content": prompt},
],
temperature=0.3,
@@ -126,11 +129,15 @@ class ScoutAgent:
result = response.choices[0].message.content.strip()
if result == "NO_RELEVANT_NEWS":
logger.info(f"[{client_config.client_id}] Scout: no relevant news found")
logger.info(
f"[{client_config.client_id}] Scout: no relevant news found"
)
return None
section = f"## 🔭 Market Intelligence\n\n{result}"
logger.info(f"[{client_config.client_id}] Scout: generated intelligence section")
logger.info(
f"[{client_config.client_id}] Scout: generated intelligence section"
)
return section
except Exception as e:
@@ -157,12 +164,18 @@ class ScoutAgent:
scout_config = client_config.scout
# Search with the user's query directly
-results = web_search(query, max_results=5, lookback_days=scout_config.news_lookback_days)
+results = web_search(
+query, max_results=5, lookback_days=scout_config.news_lookback_days
+)
# Also search with competitor names if they appear in the query
for competitor in scout_config.competitors:
if competitor.lower() in query.lower():
-extra = web_search(f"{competitor} latest news", max_results=3, lookback_days=scout_config.news_lookback_days)
+extra = web_search(
+f"{competitor} latest news",
+max_results=3,
+lookback_days=scout_config.news_lookback_days,
+)
results.extend(extra)
if not results:
@@ -179,8 +192,7 @@ class ScoutAgent:
# Format results for LLM
results_text = "\n\n".join(
f"**{r['title']}** ({r['url']})\n{r['content']}"
for r in unique_results
f"**{r['title']}** ({r['url']})\n{r['content']}" for r in unique_results
)
prompt = QUERY_PROMPT.format(
@@ -192,10 +204,14 @@ class ScoutAgent:
)
try:
-response = self.client.chat.completions.create(
+response = chat_with_retry(
+self.client,
model=self.model,
messages=[
{"role": "system", "content": "You are a business intelligence scout."},
{
"role": "system",
"content": "You are a business intelligence scout.",
},
{"role": "user", "content": prompt},
],
temperature=0.3,
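
One loose end in this file: unique_results is used in the hunk above but built outside it. A minimal sketch of the URL-based dedup presumably happening there (keying on r["url"] is an assumption):

```python
# Hypothetical reconstruction; the actual dedup code is outside the hunks shown.
results = [
    {"title": "Acme Q3 earnings", "url": "https://example.com/a", "content": "..."},
    {"title": "Acme Q3 earnings (repost)", "url": "https://example.com/a", "content": "..."},
    {"title": "Competitor launch", "url": "https://example.com/b", "content": "..."},
]

seen = set()
unique_results = []
for r in results:
    if r["url"] not in seen:  # keep the first hit per URL
        seen.add(r["url"])
        unique_results.append(r)
```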