prototype

2026-05-04 22:00:38 +05:30
commit 711d691870
48 changed files with 5093 additions and 0 deletions
+158
@@ -0,0 +1,158 @@
"""
Clawrity — Client Configuration Loader
Scans config/clients/ for YAML files and parses each into a ClientConfig model.
Supports ${ENV_VAR} interpolation in YAML values.
New client = new YAML file. Zero code changes.
"""
import os
import re
import glob
import logging
from typing import Dict, List, Optional
from pathlib import Path
import yaml
from pydantic import BaseModel
from config.settings import get_settings
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Pydantic models for client YAML structure
# ---------------------------------------------------------------------------
class DataSourceConfig(BaseModel):
    type: str = "csv"
    path: str = ""


class DatabaseConfig(BaseModel):
    url: str = ""
    schema_name: str = ""  # 'schema' is a Pydantic reserved attr


class ScoutConfig(BaseModel):
    sector: str = ""
    competitors: List[str] = []
    keywords: List[str] = []
    news_lookback_days: int = 1


class ClientConfig(BaseModel):
    client_id: str
    client_name: str = ""
    data_source: DataSourceConfig = DataSourceConfig()
    database: DatabaseConfig = DatabaseConfig()
    countries: List[str] = []
    risk_threshold: float = 0.15
    hallucination_threshold: float = 0.75
    digest_schedule: str = "08:00"
    timezone: str = "UTC"
    channels: Dict[str, str] = {}
    soul_file: str = ""
    heartbeat_file: str = ""
    column_mapping: Dict[str, str] = {}
    scout: ScoutConfig = ScoutConfig()
    # Runtime: Slack workspace/team IDs that the ProtocolAdapter maps back to this client_id
    slack_workspace_ids: List[str] = []
# ---------------------------------------------------------------------------
# Environment variable interpolation
# ---------------------------------------------------------------------------
_ENV_PATTERN = re.compile(r"\$\{(\w+)\}")
def _interpolate_env(value: str) -> str:
    """Replace ${ENV_VAR} placeholders with actual environment variable values."""
    def _replace(match):
        var_name = match.group(1)
        return os.environ.get(var_name, match.group(0))

    if isinstance(value, str):
        return _ENV_PATTERN.sub(_replace, value)
    return value


def _interpolate_dict(d: dict) -> dict:
    """Recursively interpolate environment variables in a dictionary."""
    result = {}
    for key, value in d.items():
        if isinstance(value, dict):
            result[key] = _interpolate_dict(value)
        elif isinstance(value, list):
            result[key] = [
                _interpolate_env(v) if isinstance(v, str) else v
                for v in value
            ]
        elif isinstance(value, str):
            result[key] = _interpolate_env(value)
        else:
            result[key] = value
    return result
# ---------------------------------------------------------------------------
# Loader
# ---------------------------------------------------------------------------
def load_client_configs(config_dir: Optional[str] = None) -> Dict[str, ClientConfig]:
    """
    Load all client YAML files from the config directory.

    Returns:
        Dict mapping client_id → ClientConfig
    """
    if config_dir is None:
        config_dir = get_settings().clients_config_dir

    configs: Dict[str, ClientConfig] = {}
    yaml_pattern = os.path.join(config_dir, "*.yaml")

    for yaml_path in glob.glob(yaml_pattern):
        try:
            with open(yaml_path, "r") as f:
                raw = yaml.safe_load(f)

            if not raw or "client_id" not in raw:
                logger.warning(f"Skipping {yaml_path}: missing client_id")
                continue

            # Interpolate environment variables
            interpolated = _interpolate_dict(raw)

            # Handle 'schema' → 'schema_name' mapping for Pydantic
            if "database" in interpolated and "schema" in interpolated["database"]:
                interpolated["database"]["schema_name"] = interpolated["database"].pop("schema")

            config = ClientConfig(**interpolated)
            configs[config.client_id] = config
            logger.info(f"Loaded client config: {config.client_id} from {yaml_path}")
        except Exception as e:
            logger.error(f"Error loading {yaml_path}: {e}")

    if not configs:
        logger.warning(f"No client configs found in {config_dir}")

    return configs


def get_client_config(client_id: str, configs: Optional[Dict[str, ClientConfig]] = None) -> Optional[ClientConfig]:
    """Get a specific client config by ID."""
    if configs is None:
        configs = load_client_configs()
    return configs.get(client_id)
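A minimal usage sketch for the loader above; the import path is assumed (file names are not shown in this view) and the lookup targets the example client file below:

# Hypothetical usage sketch; module path assumed, not part of this commit
from config.client_loader import load_client_configs, get_client_config

configs = load_client_configs()                 # parses every config/clients/*.yaml
acme = get_client_config("acme_corp", configs)  # ClientConfig or None
if acme:
    print(acme.database.url)       # ${DATABASE_URL} has already been interpolated
    print(acme.scout.competitors)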
+36
@@ -0,0 +1,36 @@
client_id: acme_corp
client_name: ACME Corporation
data_source:
  type: "csv"
  path: "data/processed/acme_merged.csv"
database:
  url: "${DATABASE_URL}"
  schema: "acme"
countries: ["US", "Canada", "MENA"]
risk_threshold: 0.15
hallucination_threshold: 0.75
digest_schedule: "08:00"
timezone: "Asia/Kolkata"
channels:
  slack_webhook: "${ACME_SLACK_WEBHOOK}"
soul_file: "soul/acme_soul.md"
heartbeat_file: "heartbeat/acme_heartbeat.md"
column_mapping:
  Order Date: date
  Country: country
  City: branch
  Sales: revenue
  Profit: profit
scout:
  sector: "global retail"
  competitors: ["IKEA", "Amazon", "Walmart", "Staples"]
  keywords: ["retail supply chain", "furniture market trends", "office supplies demand", "global retail ecommerce"]
  news_lookback_days: 1
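Per the loader's docstring, onboarding another client only requires another YAML file in config/clients/; a hypothetical minimal example (client name, schema, and webhook variable are all illustrative), with omitted fields falling back to the ClientConfig defaults:

client_id: globex_inc
client_name: Globex Inc.
database:
  url: "${DATABASE_URL}"
  schema: "globex"
countries: ["US"]
channels:
  slack_webhook: "${GLOBEX_SLACK_WEBHOOK}"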
+76
@@ -0,0 +1,76 @@
"""
Clawrity — LLM Client Factory
Provides a unified LLM client that works with both NVIDIA NIM and Groq.
Both are OpenAI-compatible APIs, so we use the OpenAI client with different
base URLs and API keys.
Auto-detects provider from settings:
- NVIDIA NIM: base_url="https://integrate.api.nvidia.com/v1"
- Groq: base_url="https://api.groq.com/openai/v1"
"""
import logging
from functools import lru_cache
from openai import OpenAI
from config.settings import get_settings
logger = logging.getLogger(__name__)
# Provider configs
_PROVIDERS = {
    "nvidia": {
        "base_url": "https://integrate.api.nvidia.com/v1",
        "default_model": "meta/llama-3.3-70b-instruct",
    },
    "groq": {
        "base_url": "https://api.groq.com/openai/v1",
        "default_model": "llama-3.3-70b-versatile",
    },
}
def get_llm_client() -> OpenAI:
    """Get the configured LLM client (NVIDIA NIM or Groq)."""
    settings = get_settings()
    provider = settings.active_llm_provider

    if provider == "nvidia":
        api_key = settings.nvidia_api_key
    elif provider == "groq":
        api_key = settings.groq_api_key
    else:
        raise ValueError(f"Unknown LLM provider: {provider}")

    if not api_key:
        raise ValueError(
            f"No API key configured for LLM provider '{provider}'. "
            f"Set {'NVIDIA_API_KEY' if provider == 'nvidia' else 'GROQ_API_KEY'} in .env"
        )

    config = _PROVIDERS[provider]
    client = OpenAI(
        api_key=api_key,
        base_url=config["base_url"],
    )
    logger.info(f"LLM client: {provider} ({config['base_url']})")
    return client
def get_model_name() -> str:
    """Get the model name for the active provider."""
    settings = get_settings()
    provider = settings.active_llm_provider

    # Use the model from settings, but if it is the other provider's default,
    # swap it for the active provider's default model.
    model = settings.llm_model
    if model == "meta/llama-3.3-70b-instruct" and provider == "groq":
        model = _PROVIDERS["groq"]["default_model"]
    elif model == "llama-3.3-70b-versatile" and provider == "nvidia":
        model = _PROVIDERS["nvidia"]["default_model"]
    return model
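A minimal call-site sketch for the factory above (import path assumed, prompt illustrative); since both providers expose OpenAI-compatible APIs, the same call works for either:

# Hypothetical usage sketch; module path assumed, not part of this commit
from config.llm_client import get_llm_client, get_model_name

client = get_llm_client()
response = client.chat.completions.create(
    model=get_model_name(),
    messages=[{"role": "user", "content": "Summarize yesterday's sales anomalies in two sentences."}],
)
print(response.choices[0].message.content)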
+72
@@ -0,0 +1,72 @@
"""
Clawrity — Application Settings
Loads environment variables via pydantic-settings.
All secrets read from .env file — nothing is hardcoded.
"""
import os
from functools import lru_cache
from typing import Optional
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Application settings loaded from environment variables."""

    # --- Database ---
    database_url: str = "postgresql://user:pass@localhost:5432/clawrity"

    # --- LLM Providers ---
    groq_api_key: str = ""
    nvidia_api_key: str = ""

    # --- Slack (Socket Mode) ---
    # Bot Token (xoxb-...) — OAuth & Permissions → Install to Workspace
    slack_bot_token: str = ""
    # App-Level Token (xapp-...) — Socket Mode → Generate Token
    slack_app_token: str = ""
    # Signing Secret — Basic Information → App Credentials
    slack_signing_secret: str = ""

    # --- Tavily Web Search ---
    tavily_api_key: str = ""

    # --- Slack Webhook for digest delivery ---
    acme_slack_webhook: str = ""

    # --- Paths ---
    data_raw_dir: str = "data/raw"
    data_processed_dir: str = "data/processed"
    logs_dir: str = "logs"
    clients_config_dir: str = "config/clients"

    # --- Model Defaults ---
    llm_model: str = "meta/llama-3.3-70b-instruct"
    llm_provider: str = ""  # auto-detected: "nvidia" or "groq"
    embedding_model: str = "all-MiniLM-L6-v2"
    embedding_dim: int = 384

    @property
    def active_llm_provider(self) -> str:
        """Auto-detect which LLM provider to use based on available keys."""
        if self.llm_provider:
            return self.llm_provider
        if self.nvidia_api_key:
            return "nvidia"
        if self.groq_api_key:
            return "groq"
        return "nvidia"  # default

    model_config = {
        "env_file": ".env",
        "env_file_encoding": "utf-8",
        "case_sensitive": False,
    }
@lru_cache()
def get_settings() -> Settings:
    """Singleton settings instance. Cached after first call."""
    return Settings()
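Field names map case-insensitively to environment variables and are read from .env, so a file along these lines would populate the settings above (all values are placeholders):

DATABASE_URL=postgresql://user:pass@localhost:5432/clawrity
NVIDIA_API_KEY=nvapi-...
GROQ_API_KEY=gsk_...
SLACK_BOT_TOKEN=xoxb-...
SLACK_APP_TOKEN=xapp-...
SLACK_SIGNING_SECRET=...
TAVILY_API_KEY=tvly-...
ACME_SLACK_WEBHOOK=https://hooks.slack.com/services/...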