mirror of
https://github.com/Manoj-HV30/clawrity.git
synced 2026-05-16 19:35:21 +00:00
prototype
This commit is contained in:
@@ -0,0 +1,88 @@
|
||||
"""
|
||||
Clawrity — CSV/Excel Data Connector
|
||||
|
||||
Auto-detects file format based on extension:
|
||||
.csv → pandas read_csv
|
||||
.xlsx / .xls → pandas read_excel (via openpyxl)
|
||||
|
||||
Supports both formats since Kaggle datasets vary by download version.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from connectors.base_connector import BaseConnector
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CSVConnector(BaseConnector):
    """Connector for CSV and Excel files with auto-detection by extension."""

    def load(self, path: str, **kwargs) -> pd.DataFrame:
        """
        Load data from a CSV or Excel file.

        The pandas reader is chosen from the file extension
        (compared case-insensitively).

        Args:
            path: Path to the file (.csv, .xlsx, .xls)
            **kwargs: Passed through to the pandas read function.
                Useful kwargs: sheet_name, encoding, sep.
                Defaults applied only when the caller does not supply
                them: encoding='latin-1' for CSV; sheet_name=0 for Excel;
                engine='openpyxl' for .xlsx.

        Returns:
            pandas DataFrame

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            ValueError: If the extension is not .csv, .xlsx or .xls.
        """
        file_path = Path(path)

        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {path}")

        ext = file_path.suffix.lower()

        if ext == ".csv":
            logger.info(f"Loading CSV: {path}")
            # Use setdefault instead of a hard-coded keyword so that a
            # caller-supplied ``encoding`` overrides the default rather than
            # raising "got multiple values for keyword argument".
            kwargs.setdefault("encoding", "latin-1")
            df = pd.read_csv(path, **kwargs)
        elif ext in (".xlsx", ".xls"):
            logger.info(f"Loading Excel ({ext}): {path}")
            # Default to first sheet unless specified
            kwargs.setdefault("sheet_name", 0)
            # openpyxl only reads the newer OOXML formats; for legacy .xls
            # leave the engine unset so pandas selects a suitable reader.
            if ext == ".xlsx":
                kwargs.setdefault("engine", "openpyxl")
            df = pd.read_excel(path, **kwargs)
        else:
            raise ValueError(
                f"Unsupported file format: {ext}. "
                f"Supported: .csv, .xlsx, .xls"
            )

        logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns from {file_path.name}")
        return df

    def validate(self, df: pd.DataFrame, required_columns: list) -> bool:
        """
        Validate that the DataFrame has all required columns.

        Matching is case-insensitive and ignores leading/trailing
        whitespace. Column labels are converted with ``str()`` before
        comparison so non-string labels do not raise.

        Args:
            df: DataFrame to validate
            required_columns: List of column names that must be present

        Returns:
            True if all required columns found, False otherwise (the
            missing names are logged at ERROR level).
        """
        available = {str(col).lower().strip() for col in df.columns}
        missing = [
            col
            for col in required_columns
            if str(col).lower().strip() not in available
        ]

        if missing:
            logger.error(
                f"Missing required columns: {missing}. "
                f"Available: {list(df.columns)}"
            )
            return False

        return True
|
||||
Reference in New Issue
Block a user