mirror of https://github.com/Manoj-HV30/clawrity.git
synced 2026-05-16 19:35:21 +00:00
89 lines · 2.6 KiB · Python
"""
|
|
Clawrity — CSV/Excel Data Connector
|
|
|
|
Auto-detects file format based on extension:
|
|
.csv → pandas read_csv
|
|
.xlsx / .xls → pandas read_excel (via openpyxl)
|
|
|
|
Supports both formats since Kaggle datasets vary by download version.
|
|
"""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
from connectors.base_connector import BaseConnector
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class CSVConnector(BaseConnector):
    """Connector for CSV and Excel files with auto-detection."""

    def load(self, path: str, **kwargs) -> pd.DataFrame:
        """
        Load data from a CSV or Excel file.

        Auto-detects format based on file extension.

        Args:
            path: Path to the file (.csv, .xlsx, .xls)
            **kwargs: Passed through to pandas read function.
                Useful kwargs: sheet_name, encoding, sep

        Returns:
            pandas DataFrame

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the extension is not one of .csv/.xlsx/.xls.
        """
        file_path = Path(path)

        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {path}")

        ext = file_path.suffix.lower()

        if ext == ".csv":
            logger.info(f"Loading CSV: {path}")
            # Default to latin-1 (tolerant of mixed Kaggle exports), but
            # let callers override it. Previously encoding='latin-1' was
            # passed positionally alongside **kwargs, so a caller-supplied
            # encoding raised "got multiple values for keyword argument".
            kwargs.setdefault("encoding", "latin-1")
            df = pd.read_csv(path, **kwargs)
        elif ext in (".xlsx", ".xls"):
            logger.info(f"Loading Excel ({ext}): {path}")
            # Default to first sheet unless specified
            sheet_name = kwargs.pop("sheet_name", 0)
            # openpyxl can only read the modern .xlsx format; forcing it
            # for legacy .xls made those files unreadable. Let pandas pick
            # the appropriate engine (e.g. xlrd) for .xls, and allow an
            # explicit engine= kwarg to override either way.
            if ext == ".xlsx":
                kwargs.setdefault("engine", "openpyxl")
            df = pd.read_excel(path, sheet_name=sheet_name, **kwargs)
        else:
            raise ValueError(
                f"Unsupported file format: {ext}. "
                f"Supported: .csv, .xlsx, .xls"
            )

        logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns from {file_path.name}")
        return df

    def validate(self, df: pd.DataFrame, required_columns: list) -> bool:
        """
        Validate that the DataFrame has all required columns.

        Uses case-insensitive (and surrounding-whitespace-insensitive)
        matching.

        Args:
            df: DataFrame to validate
            required_columns: List of column names that must be present

        Returns:
            True if all required columns found
        """
        # Normalize the frame's columns once so each lookup is O(1).
        df_cols_lower = {col.lower().strip() for col in df.columns}

        missing = [
            col for col in required_columns
            if col.lower().strip() not in df_cols_lower
        ]

        if missing:
            logger.error(
                f"Missing required columns: {missing}. "
                f"Available: {list(df.columns)}"
            )
            return False

        return True
|