Files
clawrity/connectors/csv_connector.py
T
2026-05-04 22:00:38 +05:30

89 lines
2.6 KiB
Python

"""
Clawrity — CSV/Excel Data Connector
Auto-detects file format based on extension:
.csv → pandas read_csv
.xlsx / .xls → pandas read_excel (via openpyxl)
Supports both formats since Kaggle datasets vary by download version.
"""
import logging
from pathlib import Path
import pandas as pd
from connectors.base_connector import BaseConnector
logger = logging.getLogger(__name__)
class CSVConnector(BaseConnector):
    """Connector for CSV and Excel files with auto-detection."""

    def load(self, path: str, **kwargs) -> pd.DataFrame:
        """
        Load data from a CSV or Excel file.

        The format is chosen from the file extension (case-insensitive).

        Args:
            path: Path to the file (.csv, .xlsx, .xls)
            **kwargs: Passed through to the pandas read function.
                Useful kwargs: sheet_name, encoding, sep.
                Caller-supplied values always win over the defaults
                applied here (``encoding`` for CSV; ``sheet_name`` and
                ``engine`` for Excel).

        Returns:
            pandas DataFrame

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            ValueError: If the extension is not .csv, .xlsx or .xls.
        """
        file_path = Path(path)
        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {path}")

        ext = file_path.suffix.lower()
        if ext == ".csv":
            logger.info(f"Loading CSV: {path}")
            # latin-1 is only a default: passing it positionally alongside
            # **kwargs would raise TypeError when the caller sets `encoding`.
            kwargs.setdefault("encoding", "latin-1")
            df = pd.read_csv(path, **kwargs)
        elif ext in (".xlsx", ".xls"):
            logger.info(f"Loading Excel ({ext}): {path}")
            # Default to first sheet unless specified
            kwargs.setdefault("sheet_name", 0)
            if ext == ".xlsx":
                # openpyxl only understands the newer .xlsx format; for
                # legacy .xls we let pandas select a suitable engine.
                kwargs.setdefault("engine", "openpyxl")
            df = pd.read_excel(path, **kwargs)
        else:
            raise ValueError(
                f"Unsupported file format: {ext}. "
                f"Supported: .csv, .xlsx, .xls"
            )

        logger.info(
            f"Loaded {len(df)} rows, {len(df.columns)} columns from {file_path.name}"
        )
        return df

    def validate(self, df: pd.DataFrame, required_columns: list) -> bool:
        """
        Validate that the DataFrame has all required columns.

        Matching is case-insensitive and ignores surrounding whitespace.
        Column labels are converted with ``str()`` first, so non-string
        labels (e.g. integer headers) do not raise.

        Args:
            df: DataFrame to validate
            required_columns: List of column names that must be present

        Returns:
            True if all required columns found; otherwise False, after
            logging the missing and available columns at error level.
        """
        available = {str(col).lower().strip() for col in df.columns}
        missing = [
            col
            for col in required_columns
            if str(col).lower().strip() not in available
        ]
        if missing:
            logger.error(
                f"Missing required columns: {missing}. "
                f"Available: {list(df.columns)}"
            )
            return False
        return True