mirror of https://github.com/Manoj-HV30/clawrity.git
synced 2026-05-16 19:35:21 +00:00
89 lines · 2.6 KiB · Python
"""
|
|
Clawrity — CSV/Excel Data Connector
|
|
|
|
Auto-detects file format based on extension:
|
|
.csv → pandas read_csv
|
|
.xlsx / .xls → pandas read_excel (via openpyxl)
|
|
|
|
Supports both formats since Kaggle datasets vary by download version.
|
|
"""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
from connectors.base_connector import BaseConnector
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class CSVConnector(BaseConnector):
    """Connector for CSV and Excel files with auto-detection."""

    def load(self, path: str, **kwargs) -> pd.DataFrame:
        """
        Load data from a CSV or Excel file.

        Auto-detects format based on file extension.

        Args:
            path: Path to the file (.csv, .xlsx, .xls)
            **kwargs: Passed through to pandas read function.
                Useful kwargs: sheet_name, encoding, sep

        Returns:
            pandas DataFrame

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the extension is not one of .csv/.xlsx/.xls.
        """
        file_path = Path(path)

        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {path}")

        ext = file_path.suffix.lower()

        if ext == ".csv":
            logger.info(f"Loading CSV: {path}")
            # Default to latin-1 (tolerant of mixed Kaggle exports), but
            # let callers override it. Previously encoding='latin-1' was
            # passed positionally alongside **kwargs, so a caller-supplied
            # encoding raised "got multiple values for keyword argument".
            kwargs.setdefault("encoding", "latin-1")
            df = pd.read_csv(path, **kwargs)
        elif ext in (".xlsx", ".xls"):
            logger.info(f"Loading Excel ({ext}): {path}")
            # Default to first sheet unless specified
            sheet_name = kwargs.pop("sheet_name", 0)
            # openpyxl can only read the modern .xlsx format; forcing it
            # for legacy .xls made those files unreadable. Let pandas pick
            # the appropriate engine (e.g. xlrd) for .xls, and allow an
            # explicit engine= kwarg to override either way.
            if ext == ".xlsx":
                kwargs.setdefault("engine", "openpyxl")
            df = pd.read_excel(path, sheet_name=sheet_name, **kwargs)
        else:
            raise ValueError(
                f"Unsupported file format: {ext}. "
                f"Supported: .csv, .xlsx, .xls"
            )

        logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns from {file_path.name}")
        return df

    def validate(self, df: pd.DataFrame, required_columns: list) -> bool:
        """
        Validate that the DataFrame has all required columns.

        Uses case-insensitive (and surrounding-whitespace-insensitive)
        matching.

        Args:
            df: DataFrame to validate
            required_columns: List of column names that must be present

        Returns:
            True if all required columns found
        """
        # Normalize the frame's columns once so each lookup is O(1).
        df_cols_lower = {col.lower().strip() for col in df.columns}

        missing = [
            col for col in required_columns
            if col.lower().strip() not in df_cols_lower
        ]

        if missing:
            logger.error(
                f"Missing required columns: {missing}. "
                f"Available: {list(df.columns)}"
            )
            return False

        return True
|