mirror of https://github.com/Manoj-HV30/clawrity.git
synced 2026-05-16 19:35:21 +00:00
prototype
@@ -0,0 +1,140 @@
"""
Clawrity — NL-to-SQL Engine

Converts natural language questions into valid PostgreSQL SELECT queries.
Uses an LLM at temperature 0.1 for near-deterministic SQL generation.
Safety: only SELECT queries are allowed; INSERT/UPDATE/DELETE/DROP are rejected.
"""

import re
import logging
from typing import Optional

from config.llm_client import get_llm_client, get_model_name

logger = logging.getLogger(__name__)

# Dangerous SQL patterns — reject anything that isn't a SELECT
UNSAFE_PATTERNS = re.compile(
    r"\b(INSERT|UPDATE|DELETE|DROP|ALTER|TRUNCATE|CREATE|GRANT|REVOKE|EXEC)\b",
    re.IGNORECASE
)
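# Illustrative only (not in the original source): what the guard catches.
#   UNSAFE_PATTERNS.search("SELECT * FROM spend_data")           -> None
#   UNSAFE_PATTERNS.search("DROP TABLE spend_data")              -> <match>
#   UNSAFE_PATTERNS.search("SELECT 1; DELETE FROM spend_data")   -> <match>
# The \b word boundaries mean column names such as "created_at" or
# "updated_by" do not trip the filter.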
SYSTEM_PROMPT = """You are a PostgreSQL SQL generator. Generate ONLY a valid SELECT query.
Return ONLY the raw SQL — no markdown, no explanation, no code fences.

Table: spend_data
Columns:
- id: SERIAL PRIMARY KEY
- date: DATE
- country: VARCHAR(100)
- branch: VARCHAR(100)
- channel: VARCHAR(100)
- spend: FLOAT
- revenue: FLOAT
- leads: INT
- conversions: INT
- client_id: VARCHAR(100)

Available countries: {countries}
Available branches (sample): {branches}
Available channels: {channels}
Date range: {date_min} to {date_max}

RULES:
1. ALWAYS include WHERE client_id = '{client_id}' in your queries
2. Use standard PostgreSQL syntax
3. For date ranges, use DATE type comparisons
4. For "last N days", use: date >= CURRENT_DATE - INTERVAL 'N days'
5. For "last month", use: date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month')
6. Return meaningful aggregations with GROUP BY when appropriate
7. Use aliases for computed columns (e.g., SUM(revenue) AS total_revenue)
8. LIMIT results to 50 rows maximum unless the user asks for all
9. For "bottom N" use ASC ordering, for "top N" use DESC ordering
"""
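# A hypothetical example (not from the original source) of the contract this
# prompt encodes. For the question "top 5 branches by revenue last month" and
# client_id "acme", a well-behaved model should return something like:
#
#   SELECT branch, SUM(revenue) AS total_revenue
#   FROM spend_data
#   WHERE client_id = 'acme'
#     AND date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month')
#   GROUP BY branch
#   ORDER BY total_revenue DESC
#   LIMIT 5;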
class NLToSQL:
    """Natural language to SQL converter using an LLM."""

    def __init__(self):
        self.client = get_llm_client()
        self.model = get_model_name()

    def generate_sql(
        self,
        question: str,
        client_id: str,
        schema_metadata: dict,
    ) -> Optional[str]:
        """
        Convert a natural language question to a PostgreSQL SELECT query.

        Args:
            question: User's natural language question
            client_id: Client ID for filtering
            schema_metadata: Dict with countries, branches, channels, date_min, date_max

        Returns:
            Valid SQL SELECT string, or None on failure
        """
        # Build the system prompt with schema context
        system = SYSTEM_PROMPT.format(
            countries=", ".join(schema_metadata.get("countries", [])[:20]),
            branches=", ".join(schema_metadata.get("branches", [])[:20]),
            channels=", ".join(schema_metadata.get("channels", [])),
            date_min=schema_metadata.get("date_min", "unknown"),
            date_max=schema_metadata.get("date_max", "unknown"),
            client_id=client_id,
        )

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": question},
                ],
                temperature=0.1,
                max_tokens=1024,
            )

            raw_sql = response.choices[0].message.content.strip()
            sql = self._clean_sql(raw_sql)

            if not self._validate_sql(sql):
                logger.warning(f"Generated SQL failed validation: {sql}")
                return None

            logger.info(f"Generated SQL: {sql}")
            return sql

        except Exception as e:
            logger.error(f"NL-to-SQL generation failed: {e}")
            return None

    def _clean_sql(self, raw: str) -> str:
        """Extract SQL from the LLM response, stripping markdown code fences."""
        # Remove markdown code fences (with or without a language tag)
        cleaned = re.sub(r"```(?:sql)?\s*", "", raw)
        cleaned = re.sub(r"```\s*$", "", cleaned)
        # Normalise to exactly one trailing semicolon
        cleaned = cleaned.strip().rstrip(";") + ";"
        return cleaned

    def _validate_sql(self, sql: str) -> bool:
        """Validate that the SQL is a safe SELECT query."""
        if not sql or len(sql) < 10:
            return False

        # Must start with SELECT
        if not sql.strip().upper().startswith("SELECT"):
            logger.warning("SQL does not start with SELECT")
            return False

        # Must not contain dangerous operations
        if UNSAFE_PATTERNS.search(sql):
            logger.warning("SQL contains unsafe operations")
            return False

        return True
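# --- Usage sketch (illustrative, not part of the original module) ----------
# Assumes config.llm_client returns an OpenAI-compatible chat client and that
# schema metadata comes from PostgresConnector.get_spend_data_schema().

def _demo_generate_sql() -> None:
    """Minimal sketch of the NL-to-SQL flow under the assumptions above."""
    engine = NLToSQL()
    schema_metadata = {
        "countries": ["UK", "Germany"],
        "branches": ["London", "Berlin"],
        "channels": ["google_ads", "meta_ads"],
        "date_min": "2024-01-01",
        "date_max": "2024-12-31",
    }
    sql = engine.generate_sql(
        question="What was total spend by channel in the last 30 days?",
        client_id="acme",
        schema_metadata=schema_metadata,
    )
    print(sql)  # e.g. "SELECT channel, SUM(spend) AS total_spend ... LIMIT 50;"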
@@ -0,0 +1,384 @@
"""
Clawrity — PostgreSQL + pgvector Connector

Connection management, schema initialization, and query execution.
A single database handles both structured queries (NL-to-SQL) and vector search (pgvector).
"""

import logging
import time
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extras
from pgvector.psycopg2 import register_vector

from config.settings import get_settings

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Schema DDL
# ---------------------------------------------------------------------------

INIT_SCHEMA_SQL = """
-- Enable pgvector extension
CREATE EXTENSION IF NOT EXISTS vector;

-- Structured business data (replaces BigQuery)
CREATE TABLE IF NOT EXISTS spend_data (
    id SERIAL PRIMARY KEY,
    date DATE,
    country VARCHAR(100),
    branch VARCHAR(100),
    channel VARCHAR(100),
    spend FLOAT,
    revenue FLOAT,
    leads INT,
    conversions INT,
    client_id VARCHAR(100)
);

-- Vector embeddings (replaces ChromaDB)
CREATE TABLE IF NOT EXISTS embeddings (
    id VARCHAR(200) PRIMARY KEY,
    client_id VARCHAR(100),
    chunk_type VARCHAR(50),
    text TEXT,
    metadata JSONB,
    embedding vector(384)
);

-- Forecast cache
CREATE TABLE IF NOT EXISTS forecasts (
    id SERIAL PRIMARY KEY,
    client_id VARCHAR(100),
    branch VARCHAR(100),
    country VARCHAR(100),
    horizon_months INT,
    forecast_data JSONB,
    computed_at TIMESTAMP DEFAULT NOW()
);

-- Indexes
CREATE INDEX IF NOT EXISTS idx_spend_data_client
    ON spend_data (client_id);
CREATE INDEX IF NOT EXISTS idx_spend_data_date
    ON spend_data (client_id, date);
CREATE INDEX IF NOT EXISTS idx_embeddings_client_type
    ON embeddings (client_id, chunk_type);
CREATE INDEX IF NOT EXISTS idx_forecasts_client
    ON forecasts (client_id, branch, country);
"""

# IVFFlat index needs existing rows to build useful clusters — created
# separately, after data load
IVFFLAT_INDEX_SQL = """
CREATE INDEX IF NOT EXISTS idx_embeddings_cosine
    ON embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);
"""
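# Note (editorial, not in the original source): with IVFFlat, recall at query
# time is governed by how many of the 100 lists are probed. pgvector defaults
# to probing 1 list; raising it per session trades speed for recall, e.g.:
#
#   SET ivfflat.probes = 10;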
class PostgresConnector:
    """PostgreSQL + pgvector connection manager."""

    def __init__(self, database_url: Optional[str] = None):
        self.database_url = database_url or get_settings().database_url
        self._conn: Optional[psycopg2.extensions.connection] = None

    def _get_connection(self) -> psycopg2.extensions.connection:
        """Get or create a database connection with retry logic."""
        if self._conn is None or self._conn.closed:
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    self._conn = psycopg2.connect(self.database_url)
                    register_vector(self._conn)
                    logger.info("Connected to PostgreSQL with pgvector support")
                    return self._conn
                except psycopg2.OperationalError as e:
                    wait = 2**attempt
                    logger.warning(
                        f"DB connection attempt {attempt + 1}/{max_retries} failed: {e}. "
                        f"Retrying in {wait}s..."
                    )
                    time.sleep(wait)
            raise ConnectionError(
                f"Failed to connect to PostgreSQL after {max_retries} attempts"
            )
        return self._conn

    def close(self):
        """Close the database connection."""
        if self._conn and not self._conn.closed:
            self._conn.close()
            logger.info("PostgreSQL connection closed")

    def init_schema(self):
        """Create tables and extensions if they don't exist."""
        conn = self._get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(INIT_SCHEMA_SQL)
                conn.commit()
            logger.info("Database schema initialized successfully")
        except Exception as e:
            conn.rollback()
            logger.error(f"Schema initialization failed: {e}")
            raise

    def create_vector_index(self):
        """Create the IVFFlat index — call AFTER data has been loaded into embeddings."""
        conn = self._get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(IVFFLAT_INDEX_SQL)
                conn.commit()
            logger.info("IVFFlat vector index created")
        except Exception as e:
            conn.rollback()
            logger.warning(f"Could not create IVFFlat index (may need more rows): {e}")

    # ------------------------------------------------------------------
    # Query execution
    # ------------------------------------------------------------------

    def execute_query(self, sql: str, params: Optional[tuple] = None) -> pd.DataFrame:
        """
        Execute a SELECT query and return the results as a DataFrame.

        Args:
            sql: SQL query string (must be SELECT only)
            params: Query parameters for parameterised queries

        Returns:
            pandas DataFrame with query results
        """
        conn = self._get_connection()
        try:
            df = pd.read_sql_query(sql, conn, params=params)
            # End the implicit read-only transaction
            conn.rollback()
            logger.debug(f"Query returned {len(df)} rows")
            return df
        except Exception as e:
            logger.error(f"Query execution failed: {e}")
            conn.rollback()
            raise

    def execute_raw(self, sql: str, params: Optional[tuple] = None) -> List[Dict]:
        """Execute a query and return raw dictionaries."""
        conn = self._get_connection()
        try:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(sql, params)
                if cur.description:
                    # Row-returning statement: fetch, then end the transaction
                    results = [dict(row) for row in cur.fetchall()]
                    conn.rollback()
                    return results
                conn.commit()
                return []
        except Exception as e:
            conn.rollback()
            logger.error(f"Raw query execution failed: {e}")
            raise

    def execute_write(self, sql: str, params: Optional[tuple] = None):
        """Execute an INSERT/UPDATE/DELETE statement."""
        conn = self._get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(sql, params)
                conn.commit()
        except Exception as e:
            conn.rollback()
            logger.error(f"Write execution failed: {e}")
            raise

    def execute_batch(self, sql: str, data: List[tuple], page_size: int = 1000):
        """Execute a batch INSERT using execute_values for performance."""
        conn = self._get_connection()
        try:
            with conn.cursor() as cur:
                psycopg2.extras.execute_values(cur, sql, data, page_size=page_size)
                conn.commit()
            logger.info(f"Batch insert: {len(data)} rows")
        except Exception as e:
            conn.rollback()
            logger.error(f"Batch execution failed: {e}")
            raise
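    # A minimal usage sketch (illustrative, not part of the original module).
    # execute_values expects a single "VALUES %s" placeholder, which it expands
    # into a multi-row VALUES clause:
    #
    #   connector.execute_batch(
    #       """
    #       INSERT INTO spend_data
    #           (date, country, branch, channel, spend, revenue,
    #            leads, conversions, client_id)
    #       VALUES %s
    #       """,
    #       [("2024-01-01", "UK", "London", "google_ads", 120.0, 540.0, 14, 3, "acme"),
    #        ("2024-01-01", "UK", "Leeds", "meta_ads", 80.0, 210.0, 9, 1, "acme")],
    #   )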
    # ------------------------------------------------------------------
    # pgvector operations
    # ------------------------------------------------------------------

    def upsert_embeddings(self, embeddings_data: List[Dict[str, Any]]):
        """
        Upsert embedding records into the embeddings table.

        Args:
            embeddings_data: List of dicts with keys:
                id, client_id, chunk_type, text, metadata, embedding
        """
        conn = self._get_connection()
        sql = """
            INSERT INTO embeddings (id, client_id, chunk_type, text, metadata, embedding)
            VALUES %s
            ON CONFLICT (id) DO UPDATE SET
                text = EXCLUDED.text,
                metadata = EXCLUDED.metadata,
                embedding = EXCLUDED.embedding
        """
        data = [
            (
                d["id"],
                d["client_id"],
                d["chunk_type"],
                d["text"],
                psycopg2.extras.Json(d["metadata"]),
                np.array(d["embedding"]),  # register_vector adapts numpy arrays
            )
            for d in embeddings_data
        ]
        try:
            with conn.cursor() as cur:
                psycopg2.extras.execute_values(cur, sql, data, page_size=100)
                conn.commit()
            logger.info(f"Upserted {len(data)} embeddings")
        except Exception as e:
            conn.rollback()
            logger.error(f"Embedding upsert failed: {e}")
            raise
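    # Shape of one record for upsert_embeddings (illustrative; the 384-dim
    # vector matches e.g. sentence-transformers' all-MiniLM-L6-v2):
    #
    #   {
    #       "id": "acme:note:42",
    #       "client_id": "acme",
    #       "chunk_type": "note",
    #       "text": "Q3 spend shifted from meta_ads to google_ads in the UK.",
    #       "metadata": {"source": "crm", "author": "analyst"},
    #       "embedding": [0.012, -0.034, ...],  # 384 floats
    #   }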
    def search_embeddings(
        self,
        query_embedding: np.ndarray,
        client_id: str,
        chunk_type: Optional[str] = None,
        top_k: int = 5,
    ) -> List[Dict]:
        """
        Search for similar embeddings using pgvector cosine similarity.

        Args:
            query_embedding: Query vector (384 dims)
            client_id: Filter by client
            chunk_type: Optional filter by chunk type
            top_k: Number of results to return

        Returns:
            List of dicts with text, metadata, and similarity score
        """
        conn = self._get_connection()
        query_vec = np.array(query_embedding)

        if chunk_type:
            sql = """
                SELECT text, metadata, 1 - (embedding <=> %s) AS similarity
                FROM embeddings
                WHERE client_id = %s AND chunk_type = %s
                ORDER BY embedding <=> %s
                LIMIT %s
            """
            params = (query_vec, client_id, chunk_type, query_vec, top_k)
        else:
            sql = """
                SELECT text, metadata, 1 - (embedding <=> %s) AS similarity
                FROM embeddings
                WHERE client_id = %s
                ORDER BY embedding <=> %s
                LIMIT %s
            """
            params = (query_vec, client_id, query_vec, top_k)

        try:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(sql, params)
                results = [dict(row) for row in cur.fetchall()]
                logger.debug(f"Vector search returned {len(results)} results")
                return results
        except Exception as e:
            logger.error(f"Vector search failed: {e}")
            raise
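    # Usage sketch (illustrative, not part of the original module). Assumes a
    # sentence-transformers model that emits 384-dim vectors:
    #
    #   from sentence_transformers import SentenceTransformer
    #   model = SentenceTransformer("all-MiniLM-L6-v2")
    #   query_vec = model.encode("budget shift towards paid search")
    #   hits = connector.search_embeddings(query_vec, client_id="acme", top_k=3)
    #   for hit in hits:
    #       print(round(hit["similarity"], 3), hit["text"][:60])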
    # ------------------------------------------------------------------
    # Utility
    # ------------------------------------------------------------------

    def get_table_count(self, table: str, client_id: Optional[str] = None) -> int:
        """Get the row count for a table, optionally filtered by client_id."""
        conn = self._get_connection()
        try:
            with conn.cursor() as cur:
                # NOTE: `table` is interpolated directly, so it must come from
                # trusted internal callers, never from user input
                if client_id:
                    cur.execute(
                        f"SELECT COUNT(*) FROM {table} WHERE client_id = %s",
                        (client_id,),
                    )
                else:
                    cur.execute(f"SELECT COUNT(*) FROM {table}")
                return cur.fetchone()[0]
        except Exception as e:
            logger.error(f"Count query failed: {e}")
            return 0

    def get_spend_data_schema(self, client_id: str) -> Dict:
        """Get metadata about available data for a client — used by NL-to-SQL."""
        conn = self._get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(
                    "SELECT DISTINCT country FROM spend_data WHERE client_id = %s ORDER BY country",
                    (client_id,),
                )
                countries = [row[0] for row in cur.fetchall()]

                cur.execute(
                    "SELECT DISTINCT branch FROM spend_data WHERE client_id = %s ORDER BY branch",
                    (client_id,),
                )
                branches = [row[0] for row in cur.fetchall()]

                cur.execute(
                    "SELECT DISTINCT channel FROM spend_data WHERE client_id = %s ORDER BY channel",
                    (client_id,),
                )
                channels = [row[0] for row in cur.fetchall()]

                cur.execute(
                    "SELECT MIN(date), MAX(date) FROM spend_data WHERE client_id = %s",
                    (client_id,),
                )
                date_range = cur.fetchone()

                return {
                    "countries": countries,
                    "branches": branches,
                    "channels": channels,
                    "date_min": str(date_range[0]) if date_range[0] else None,
                    "date_max": str(date_range[1]) if date_range[1] else None,
                }
        except Exception as e:
            logger.error(f"Schema metadata query failed: {e}")
            return {
                "countries": [],
                "branches": [],
                "channels": [],
                "date_min": None,
                "date_max": None,
            }


# ---------------------------------------------------------------------------
# Module-level singleton
# ---------------------------------------------------------------------------

_connector: Optional[PostgresConnector] = None


def get_connector() -> PostgresConnector:
    """Get the shared PostgresConnector singleton."""
    global _connector
    if _connector is None:
        _connector = PostgresConnector()
    return _connector
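# --- Bootstrap sketch (illustrative, not part of the original module) ------
# The expected call order under the assumptions in this file: create tables
# first, load data, and only then build the IVFFlat index so it has rows to
# cluster.

def _demo_bootstrap() -> None:
    """Minimal end-to-end setup sketch."""
    connector = get_connector()
    connector.init_schema()          # tables, pgvector extension, btree indexes
    # ... load spend_data / embeddings here, e.g. via execute_batch /
    # upsert_embeddings ...
    connector.create_vector_index()  # IVFFlat index, once data exists
    print(connector.get_table_count("spend_data"))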
@@ -0,0 +1,139 @@
"""
Clawrity — Web Search Skill

Primary: Tavily API (clean, summarised results built for LLM agents)
Fallback: duckduckgo-search (free, no API key required)

Auto-fallback: if Tavily is unconfigured, errors, or returns nothing,
switch to DuckDuckGo automatically.
"""

import logging
from datetime import datetime, timedelta
from typing import Dict, List

from config.settings import get_settings

logger = logging.getLogger(__name__)


def web_search(
    query: str,
    max_results: int = 5,
    lookback_days: int = 1,
) -> List[Dict]:
    """
    Search the web using Tavily (primary) or DuckDuckGo (fallback).

    Args:
        query: Search query string
        max_results: Maximum number of results
        lookback_days: Only keep results from the last N days (0 disables the filter)

    Returns:
        List of dicts with: title, url, content, date, source
    """
    results = _tavily_search(query, max_results)

    if not results:
        logger.info("Tavily returned no results, falling back to DuckDuckGo")
        results = _ddg_search(query, max_results)

    # Filter by recency
    if lookback_days > 0:
        results = _filter_recent(results, lookback_days)

    return results
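# Usage sketch (illustrative, not part of the original module):
#
#   results = web_search("UK digital ad spend trends", max_results=3, lookback_days=7)
#   for r in results:
#       print(r["source"], "|", r["title"], "|", r["url"])
#
# Each result dict carries: title, url, content, date, source.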
def _tavily_search(query: str, max_results: int = 5) -> List[Dict]:
    """Search using the Tavily API."""
    settings = get_settings()

    if not settings.tavily_api_key:
        logger.info("Tavily API key not configured, skipping")
        return []

    try:
        from tavily import TavilyClient

        client = TavilyClient(api_key=settings.tavily_api_key)
        response = client.search(
            query=query,
            search_depth="advanced",
            max_results=max_results,
        )

        results = []
        for item in response.get("results", []):
            results.append({
                "title": item.get("title", ""),
                "url": item.get("url", ""),
                "content": item.get("content", ""),
                "date": item.get("published_date", ""),
                "source": "tavily",
            })

        logger.info(f"Tavily returned {len(results)} results for: {query[:50]}")
        return results

    except Exception as e:
        logger.warning(f"Tavily search failed: {e}")
        return []


def _ddg_search(query: str, max_results: int = 5) -> List[Dict]:
    """Search using DuckDuckGo (fallback — no API key needed)."""
    try:
        from duckduckgo_search import DDGS

        results = []
        with DDGS() as ddgs:
            for r in ddgs.text(query, max_results=max_results):
                results.append({
                    "title": r.get("title", ""),
                    "url": r.get("href", ""),
                    "content": r.get("body", ""),
                    "date": "",  # DuckDuckGo text results carry no date
                    "source": "duckduckgo",
                })

        logger.info(f"DuckDuckGo returned {len(results)} results for: {query[:50]}")
        return results

    except Exception as e:
        logger.warning(f"DuckDuckGo search failed: {e}")
        return []


def _filter_recent(results: List[Dict], lookback_days: int) -> List[Dict]:
    """Filter results to only include items from the last N days."""
    if not results:
        return results

    cutoff = datetime.utcnow() - timedelta(days=lookback_days)
    filtered = []

    for r in results:
        date_str = r.get("date", "")
        if not date_str:
            # No date info — include it (benefit of the doubt)
            filtered.append(r)
            continue

        try:
            # Try common date formats; [:19] trims ISO strings to the
            # "%Y-%m-%dT%H:%M:%S" length (e.g. drops a trailing "Z")
            for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", "%B %d, %Y"):
                try:
                    dt = datetime.strptime(date_str[:19], fmt)
                    if dt >= cutoff:
                        filtered.append(r)
                    break  # parsed successfully; decision made
                except ValueError:
                    continue
            else:
                # No format matched — can't parse the date, include it
                filtered.append(r)
        except Exception:
            filtered.append(r)

    return filtered
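# Illustrative parses (not in the original source), showing which format in
# _filter_recent handles which shape of date string:
#
#   "2024-06-01T09:30:00Z"[:19] -> "%Y-%m-%dT%H:%M:%S"  (Tavily-style ISO)
#   "2024-06-01"                -> "%Y-%m-%d"
#   "June 1, 2024"              -> "%B %d, %Y"
#
# Anything unparseable is kept, matching the "benefit of the doubt" policy.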