clawrity/scripts/run_rag_pipeline.py

"""
Clawrity — RAG Pipeline Script

CLI to run the full RAG pipeline: preprocess → chunk → embed → store in pgvector.

Usage:
    python scripts/run_rag_pipeline.py --client_id acme_corp
"""

import argparse
import logging
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from rag.preprocessor import preprocess_for_rag
from rag.chunker import generate_chunks
from rag.vector_store import embed_texts, store_chunks

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)


def run_pipeline(client_id: str, days: int = 365):
    """Run the full RAG pipeline for a client."""
    logger.info(f"=== RAG Pipeline: {client_id} ===")

    # Step 1: Preprocess
    logger.info("Step 1/4: Preprocessing data...")
    df = preprocess_for_rag(client_id, days=days)
    if df.empty:
        logger.error("No data to process. Run seed_demo_data.py first.")
        return

    # Step 2: Generate chunks
    logger.info("Step 2/4: Generating chunks...")
    chunks = generate_chunks(df, client_id)
    logger.info(f"Generated {len(chunks)} chunks")

    if not chunks:
        logger.error("No chunks generated.")
        return

    # Step 3: Embed
    logger.info("Step 3/4: Embedding chunks (CPU, batch_size=100)...")
    texts = [c.text for c in chunks]
    embeddings = embed_texts(texts, batch_size=100)

    # Step 4: Store in pgvector
    logger.info("Step 4/4: Upserting into pgvector...")
    store_chunks(chunks, embeddings)

    logger.info(f"=== RAG Pipeline complete: {len(chunks)} chunks indexed ===")


def main():
    parser = argparse.ArgumentParser(description="Run RAG pipeline")
    parser.add_argument("--client_id", required=True, help="Client ID")
    parser.add_argument("--days", type=int, default=365, help="Days of data to process")
    args = parser.parse_args()

    run_pipeline(args.client_id, args.days)


if __name__ == "__main__":
    main()