mirror of
https://github.com/Manoj-HV30/clawrity.git
synced 2026-05-16 19:35:21 +00:00
140 lines
4.0 KiB
Python
140 lines
4.0 KiB
Python
"""
|
|
Clawrity — Web Search Skill
|
|
|
|
Primary: Tavily API (clean, summarised results built for LLM agents)
|
|
Fallback: duckduckgo-search (no API key required, free; DuckDuckGo may still throttle heavy use)
|
|
|
|
Auto-fallback: if Tavily errors or quota exceeded, silently switch to DuckDuckGo.
|
|
"""
|
|
|
|
import logging
|
|
from datetime import datetime, timedelta
|
|
from typing import List, Dict, Optional
|
|
|
|
from config.settings import get_settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def web_search(
    query: str,
    max_results: int = 5,
    lookback_days: int = 1,
) -> List[Dict]:
    """
    Run a web search, preferring Tavily and falling back to DuckDuckGo.

    Tavily is queried first. DuckDuckGo is only consulted when Tavily
    yields nothing (no API key, an error, or an empty result set).

    Args:
        query: Search query string
        max_results: Maximum number of results requested from the backend
        lookback_days: Only keep results from the last N days; a value
            of 0 or less disables the recency filter

    Returns:
        List of dicts with: title, url, content, date, source
    """
    hits = _tavily_search(query, max_results)

    if not hits:
        logger.info("Tavily returned no results, falling back to DuckDuckGo")
        hits = _ddg_search(query, max_results)

    # Recency filtering is optional; skip it entirely for non-positive windows.
    if lookback_days > 0:
        return _filter_recent(hits, lookback_days)

    return hits
|
|
|
|
|
|
def _tavily_search(query: str, max_results: int = 5) -> List[Dict]:
    """
    Query the Tavily API and normalise its results.

    Returns an empty list when no API key is configured or when anything
    goes wrong while importing or calling the client, so the caller can
    fall back to another backend.
    """
    cfg = get_settings()

    if not cfg.tavily_api_key:
        logger.info("Tavily API key not configured, skipping")
        return []

    try:
        # Imported lazily so the module still loads without the tavily package.
        from tavily import TavilyClient

        tavily = TavilyClient(api_key=cfg.tavily_api_key)
        payload = tavily.search(
            query=query,
            search_depth="advanced",
            max_results=max_results,
        )

        # Map each Tavily item onto the common result shape.
        hits = [
            {
                "title": entry.get("title", ""),
                "url": entry.get("url", ""),
                "content": entry.get("content", ""),
                "date": entry.get("published_date", ""),
                "source": "tavily",
            }
            for entry in payload.get("results", [])
        ]

        logger.info(f"Tavily returned {len(hits)} results for: {query[:50]}")
        return hits

    except Exception as exc:
        # Best effort: any failure here just means "no Tavily results".
        logger.warning(f"Tavily search failed: {exc}")
        return []
|
|
|
|
|
|
def _ddg_search(query: str, max_results: int = 5) -> List[Dict]:
    """
    Query DuckDuckGo (the fallback backend, no API key needed).

    Results are mapped onto the common result shape; the "date" field is
    always empty for this backend. Any failure while importing or calling
    the client is logged and turned into an empty list.
    """
    try:
        # Imported lazily so the module still loads without the package.
        from duckduckgo_search import DDGS

        hits: List[Dict] = []
        with DDGS() as client:
            hits.extend(
                {
                    "title": entry.get("title", ""),
                    "url": entry.get("href", ""),
                    "content": entry.get("body", ""),
                    "date": "",
                    "source": "duckduckgo",
                }
                for entry in client.text(query, max_results=max_results)
            )

        logger.info(f"DuckDuckGo returned {len(hits)} results for: {query[:50]}")
        return hits

    except Exception as exc:
        # Best effort: a failing fallback simply yields no results.
        logger.warning(f"DuckDuckGo search failed: {exc}")
        return []
|
|
|
|
|
|
def _filter_recent(results: List[Dict], lookback_days: int) -> List[Dict]:
    """
    Filter results to only include items from the last N days.

    A result is kept when:
      * it has no usable date (missing, empty, not a string, or in a
        format that cannot be parsed) -- benefit of the doubt;
      * its timestamp is at or after ``now - lookback_days``;
      * it carries only a calendar date and that date is on or after the
        cutoff's calendar date. Date-only values have no time of day, so
        comparing them as midnight would wrongly drop items published
        later on the cutoff day.

    Timestamps with an explicit UTC offset (or a trailing "Z") are compared
    as the instant they denote; timestamps without one are treated as UTC.

    Args:
        results: Result dicts; the "date" key is inspected
        lookback_days: Size of the recency window in days

    Returns:
        The input list itself when it is empty, otherwise a new list with
        the surviving results in their original order
    """
    if not results:
        return results

    # Local import: the module header only pulls in datetime and timedelta.
    from datetime import timezone

    cutoff = datetime.now(timezone.utc) - timedelta(days=lookback_days)
    filtered = []

    for r in results:
        date_str = r.get("date", "")
        if not date_str or not isinstance(date_str, str):
            # No usable date info -- include it (benefit of the doubt)
            filtered.append(r)
            continue

        parsed = _parse_result_date(date_str)
        if parsed is None:
            # Can't parse date, include it
            filtered.append(r)
            continue

        moment, has_time = parsed
        if has_time:
            is_recent = moment >= cutoff
        else:
            # Day granularity only: compare calendar dates.
            is_recent = moment.date() >= cutoff.date()

        if is_recent:
            filtered.append(r)

    return filtered


def _parse_result_date(date_str: str) -> Optional[tuple]:
    """
    Parse a result date string.

    Returns ``(moment, has_time)`` where ``moment`` is a timezone-aware
    datetime and ``has_time`` tells whether the string carried a time of
    day, or None when no supported format matches.
    """
    from datetime import timezone

    text = date_str.strip()

    # Try ISO 8601 first so an explicit offset is honoured. A trailing "Z"
    # is rewritten because older fromisoformat() versions reject it.
    iso_text = text[:-1] + "+00:00" if text[-1:] in ("Z", "z") else text
    try:
        moment = datetime.fromisoformat(iso_text)
    except ValueError:
        moment = None

    if moment is not None:
        # Every date-only ISO form is at most 10 characters long
        # ("YYYY-MM-DD"); anything longer includes a time component.
        has_time = len(text) > 10
    else:
        # Fall back to the fixed formats; the [:19] slice drops fractional
        # seconds and other trailing text after "YYYY-MM-DDTHH:MM:SS".
        known_formats = (
            ("%Y-%m-%dT%H:%M:%S", True),
            ("%Y-%m-%d", False),
            ("%B %d, %Y", False),
        )
        for fmt, has_time in known_formats:
            try:
                moment = datetime.strptime(text[:19], fmt)
                break
            except ValueError:
                continue
        else:
            return None

    if moment.tzinfo is None:
        # No offset given: treat the value as UTC.
        moment = moment.replace(tzinfo=timezone.utc)

    return moment, has_time
|