""" Clawrity — RAG Vector Store Embeds chunks using sentence-transformers all-MiniLM-L6-v2 (CPU, 384 dims). Stores and searches via pgvector in PostgreSQL. """ import logging from typing import List, Optional import numpy as np from rag.chunker import Chunk from skills.postgres_connector import get_connector logger = logging.getLogger(__name__) _model = None def _get_embedding_model(): """Lazy-load the embedding model (CPU only, ~90MB).""" global _model if _model is None: from sentence_transformers import SentenceTransformer _model = SentenceTransformer("all-MiniLM-L6-v2") logger.info("Loaded embedding model: all-MiniLM-L6-v2 (384 dims)") return _model def embed_texts(texts: List[str], batch_size: int = 100) -> np.ndarray: """ Embed a list of texts using MiniLM. Args: texts: List of text strings to embed batch_size: Batch size for encoding (default 100) Returns: numpy array of shape (len(texts), 384) """ model = _get_embedding_model() embeddings = model.encode( texts, batch_size=batch_size, show_progress_bar=len(texts) > 100, normalize_embeddings=True, ) logger.info(f"Embedded {len(texts)} texts → shape {embeddings.shape}") return embeddings def embed_query(query: str) -> np.ndarray: """Embed a single query string.""" model = _get_embedding_model() return model.encode(query, normalize_embeddings=True) def store_chunks(chunks: List[Chunk], embeddings: np.ndarray): """ Upsert chunks + embeddings into pgvector. Uses ON CONFLICT DO UPDATE for safe nightly re-indexing. """ seen = set() unique_chunks = [] unique_embeddings = [] for chunk, emb in zip(chunks, embeddings): if chunk.id not in seen: seen.add(chunk.id) unique_chunks.append(chunk) unique_embeddings.append(emb) chunks = unique_chunks embeddings = unique_embeddings db = get_connector() data = [] for chunk, embedding in zip(chunks, embeddings): data.append({ "id": chunk.id, "client_id": chunk.client_id, "chunk_type": chunk.chunk_type, "text": chunk.text, "metadata": chunk.metadata, "embedding": embedding.tolist(), }) # Batch upsert batch_size = 100 for i in range(0, len(data), batch_size): batch = data[i:i + batch_size] db.upsert_embeddings(batch) logger.info(f"Stored {len(data)} chunks in pgvector") # Try to create IVFFlat index (needs enough rows) try: db.create_vector_index() except Exception: pass def search( query: str, client_id: str, chunk_type: Optional[str] = None, top_k: int = 5, ) -> List[dict]: """ Search pgvector for similar chunks. Args: query: Natural language query client_id: Client to search within chunk_type: Optional filter (branch_weekly, channel_monthly, trend_qoq) top_k: Number of results Returns: List of dicts with text, metadata, similarity """ query_embedding = embed_query(query) db = get_connector() results = db.search_embeddings( query_embedding=query_embedding, client_id=client_id, chunk_type=chunk_type, top_k=top_k, ) logger.info( f"Vector search: query='{query[:50]}...', " f"chunk_type={chunk_type}, results={len(results)}" ) return results