prototype

2026-05-04 22:00:38 +05:30
commit 711d691870
48 changed files with 5093 additions and 0 deletions
+158
@@ -0,0 +1,158 @@
"""
Clawrity — Client Configuration Loader
Scans config/clients/ for YAML files and parses each into a ClientConfig model.
Supports ${ENV_VAR} interpolation in YAML values.
New client = new YAML file. Zero code changes.
"""
import os
import re
import glob
import logging
from typing import Dict, List, Optional
from pathlib import Path
import yaml
from pydantic import BaseModel
from config.settings import get_settings
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Pydantic models for client YAML structure
# ---------------------------------------------------------------------------
class DataSourceConfig(BaseModel):
    type: str = "csv"
    path: str = ""


class DatabaseConfig(BaseModel):
    url: str = ""
    schema_name: str = ""  # 'schema' is a Pydantic reserved attr


class ScoutConfig(BaseModel):
    sector: str = ""
    competitors: List[str] = []
    keywords: List[str] = []
    news_lookback_days: int = 1


class ClientConfig(BaseModel):
    client_id: str
    client_name: str = ""
    data_source: DataSourceConfig = DataSourceConfig()
    database: DatabaseConfig = DatabaseConfig()
    countries: List[str] = []
    risk_threshold: float = 0.15
    hallucination_threshold: float = 0.75
    digest_schedule: str = "08:00"
    timezone: str = "UTC"
    channels: Dict[str, str] = {}
    soul_file: str = ""
    heartbeat_file: str = ""
    column_mapping: Dict[str, str] = {}
    scout: ScoutConfig = ScoutConfig()
    # Runtime: Slack workspace/team IDs that the ProtocolAdapter maps back to this client_id
    slack_workspace_ids: List[str] = []
# ---------------------------------------------------------------------------
# Environment variable interpolation
# ---------------------------------------------------------------------------
_ENV_PATTERN = re.compile(r"\$\{(\w+)\}")
def _interpolate_env(value: str) -> str:
    """Replace ${ENV_VAR} placeholders with actual environment variable values."""
    def _replace(match):
        var_name = match.group(1)
        return os.environ.get(var_name, match.group(0))

    if isinstance(value, str):
        return _ENV_PATTERN.sub(_replace, value)
    return value


def _interpolate_dict(d: dict) -> dict:
    """Recursively interpolate environment variables in a dictionary."""
    result = {}
    for key, value in d.items():
        if isinstance(value, dict):
            result[key] = _interpolate_dict(value)
        elif isinstance(value, list):
            result[key] = [
                _interpolate_env(v) if isinstance(v, str) else v
                for v in value
            ]
        elif isinstance(value, str):
            result[key] = _interpolate_env(value)
        else:
            result[key] = value
    return result
# ---------------------------------------------------------------------------
# Loader
# ---------------------------------------------------------------------------
def load_client_configs(config_dir: Optional[str] = None) -> Dict[str, ClientConfig]:
    """
    Load all client YAML files from the config directory.

    Returns:
        Dict mapping client_id → ClientConfig
    """
    if config_dir is None:
        config_dir = get_settings().clients_config_dir

    configs: Dict[str, ClientConfig] = {}
    yaml_pattern = os.path.join(config_dir, "*.yaml")

    for yaml_path in glob.glob(yaml_pattern):
        try:
            with open(yaml_path, "r") as f:
                raw = yaml.safe_load(f)

            if not raw or "client_id" not in raw:
                logger.warning(f"Skipping {yaml_path}: missing client_id")
                continue

            # Interpolate environment variables
            interpolated = _interpolate_dict(raw)

            # Handle 'schema' → 'schema_name' mapping for Pydantic
            if "database" in interpolated and "schema" in interpolated["database"]:
                interpolated["database"]["schema_name"] = interpolated["database"].pop("schema")

            config = ClientConfig(**interpolated)
            configs[config.client_id] = config
            logger.info(f"Loaded client config: {config.client_id} from {yaml_path}")
        except Exception as e:
            logger.error(f"Error loading {yaml_path}: {e}")

    if not configs:
        logger.warning(f"No client configs found in {config_dir}")

    return configs


def get_client_config(client_id: str, configs: Optional[Dict[str, ClientConfig]] = None) -> Optional[ClientConfig]:
    """Get a specific client config by ID."""
    if configs is None:
        configs = load_client_configs()
    return configs.get(client_id)
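A minimal usage sketch for the loader above; the import path is assumed (file names are not shown in this view) and the lookup targets the example client file below:

# Hypothetical usage sketch; module path assumed, not part of this commit
from config.client_loader import load_client_configs, get_client_config

configs = load_client_configs()                 # parses every config/clients/*.yaml
acme = get_client_config("acme_corp", configs)  # ClientConfig or None
if acme:
    print(acme.database.url)       # ${DATABASE_URL} has already been interpolated
    print(acme.scout.competitors)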
+36
@@ -0,0 +1,36 @@
client_id: acme_corp
client_name: ACME Corporation
data_source:
  type: "csv"
  path: "data/processed/acme_merged.csv"
database:
  url: "${DATABASE_URL}"
  schema: "acme"
countries: ["US", "Canada", "MENA"]
risk_threshold: 0.15
hallucination_threshold: 0.75
digest_schedule: "08:00"
timezone: "Asia/Kolkata"
channels:
  slack_webhook: "${ACME_SLACK_WEBHOOK}"
soul_file: "soul/acme_soul.md"
heartbeat_file: "heartbeat/acme_heartbeat.md"
column_mapping:
  Order Date: date
  Country: country
  City: branch
  Sales: revenue
  Profit: profit
scout:
  sector: "global retail"
  competitors: ["IKEA", "Amazon", "Walmart", "Staples"]
  keywords: ["retail supply chain", "furniture market trends", "office supplies demand", "global retail ecommerce"]
  news_lookback_days: 1
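Per the loader's docstring, onboarding another client only requires another YAML file in config/clients/; a hypothetical minimal example (client name, schema, and webhook variable are all illustrative), with omitted fields falling back to the ClientConfig defaults:

client_id: globex_inc
client_name: Globex Inc.
database:
  url: "${DATABASE_URL}"
  schema: "globex"
countries: ["US"]
channels:
  slack_webhook: "${GLOBEX_SLACK_WEBHOOK}"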
+76
@@ -0,0 +1,76 @@
"""
Clawrity — LLM Client Factory
Provides a unified LLM client that works with both NVIDIA NIM and Groq.
Both are OpenAI-compatible APIs, so we use the OpenAI client with different
base URLs and API keys.
Auto-detects provider from settings:
- NVIDIA NIM: base_url="https://integrate.api.nvidia.com/v1"
- Groq: base_url="https://api.groq.com/openai/v1"
"""
import logging
from functools import lru_cache
from openai import OpenAI
from config.settings import get_settings
logger = logging.getLogger(__name__)
# Provider configs
_PROVIDERS = {
    "nvidia": {
        "base_url": "https://integrate.api.nvidia.com/v1",
        "default_model": "meta/llama-3.3-70b-instruct",
    },
    "groq": {
        "base_url": "https://api.groq.com/openai/v1",
        "default_model": "llama-3.3-70b-versatile",
    },
}
def get_llm_client() -> OpenAI:
    """Get the configured LLM client (NVIDIA NIM or Groq)."""
    settings = get_settings()
    provider = settings.active_llm_provider

    if provider == "nvidia":
        api_key = settings.nvidia_api_key
    elif provider == "groq":
        api_key = settings.groq_api_key
    else:
        raise ValueError(f"Unknown LLM provider: {provider}")

    if not api_key:
        raise ValueError(
            f"No API key configured for LLM provider '{provider}'. "
            f"Set {'NVIDIA_API_KEY' if provider == 'nvidia' else 'GROQ_API_KEY'} in .env"
        )

    config = _PROVIDERS[provider]
    client = OpenAI(
        api_key=api_key,
        base_url=config["base_url"],
    )
    logger.info(f"LLM client: {provider} ({config['base_url']})")
    return client
def get_model_name() -> str:
    """Get the model name for the active provider."""
    settings = get_settings()
    provider = settings.active_llm_provider

    # Use the model from settings, but if it is the other provider's default,
    # swap it for the active provider's default model.
    model = settings.llm_model
    if model == "meta/llama-3.3-70b-instruct" and provider == "groq":
        model = _PROVIDERS["groq"]["default_model"]
    elif model == "llama-3.3-70b-versatile" and provider == "nvidia":
        model = _PROVIDERS["nvidia"]["default_model"]
    return model
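A minimal call-site sketch for the factory above (import path assumed, prompt illustrative); since both providers expose OpenAI-compatible APIs, the same call works for either:

# Hypothetical usage sketch; module path assumed, not part of this commit
from config.llm_client import get_llm_client, get_model_name

client = get_llm_client()
response = client.chat.completions.create(
    model=get_model_name(),
    messages=[{"role": "user", "content": "Summarize yesterday's sales anomalies in two sentences."}],
)
print(response.choices[0].message.content)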
+72
@@ -0,0 +1,72 @@
"""
Clawrity — Application Settings
Loads environment variables via pydantic-settings.
All secrets read from .env file — nothing is hardcoded.
"""
import os
from functools import lru_cache
from typing import Optional
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Application settings loaded from environment variables."""

    # --- Database ---
    database_url: str = "postgresql://user:pass@localhost:5432/clawrity"

    # --- LLM Providers ---
    groq_api_key: str = ""
    nvidia_api_key: str = ""

    # --- Slack (Socket Mode) ---
    # Bot Token (xoxb-...) — OAuth & Permissions → Install to Workspace
    slack_bot_token: str = ""
    # App-Level Token (xapp-...) — Socket Mode → Generate Token
    slack_app_token: str = ""
    # Signing Secret — Basic Information → App Credentials
    slack_signing_secret: str = ""

    # --- Tavily Web Search ---
    tavily_api_key: str = ""

    # --- Slack Webhook for digest delivery ---
    acme_slack_webhook: str = ""

    # --- Paths ---
    data_raw_dir: str = "data/raw"
    data_processed_dir: str = "data/processed"
    logs_dir: str = "logs"
    clients_config_dir: str = "config/clients"

    # --- Model Defaults ---
    llm_model: str = "meta/llama-3.3-70b-instruct"
    llm_provider: str = ""  # auto-detected: "nvidia" or "groq"
    embedding_model: str = "all-MiniLM-L6-v2"
    embedding_dim: int = 384

    @property
    def active_llm_provider(self) -> str:
        """Auto-detect which LLM provider to use based on available keys."""
        if self.llm_provider:
            return self.llm_provider
        if self.nvidia_api_key:
            return "nvidia"
        if self.groq_api_key:
            return "groq"
        return "nvidia"  # default

    model_config = {
        "env_file": ".env",
        "env_file_encoding": "utf-8",
        "case_sensitive": False,
    }
@lru_cache()
def get_settings() -> Settings:
    """Singleton settings instance. Cached after first call."""
    return Settings()
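Field names map case-insensitively to environment variables and are read from .env, so a file along these lines would populate the settings above (all values are placeholders):

DATABASE_URL=postgresql://user:pass@localhost:5432/clawrity
NVIDIA_API_KEY=nvapi-...
GROQ_API_KEY=gsk_...
SLACK_BOT_TOKEN=xoxb-...
SLACK_APP_TOKEN=xapp-...
SLACK_SIGNING_SECRET=...
TAVILY_API_KEY=tvly-...
ACME_SLACK_WEBHOOK=https://hooks.slack.com/services/...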