prototype

2026-05-16 19:35:21 +00:00 · 2026-05-04 22:00:38 +05:30
commit 711d691870
48 changed files with 5093 additions and 0 deletions
@@ -0,0 +1,88 @@
+"""
+Clawrity — CSV/Excel Data Connector
+
+Auto-detects file format based on extension:
+  .csv → pandas read_csv
+  .xlsx / .xls → pandas read_excel (via openpyxl)
+
+Supports both formats since Kaggle datasets vary by download version.
+"""
+
+import logging
+from pathlib import Path
+
+import pandas as pd
+
+from connectors.base_connector import BaseConnector
+
+logger = logging.getLogger(__name__)
+
+
+class CSVConnector(BaseConnector):
+    """Connector for CSV and Excel files with auto-detection."""
+
+    def load(self, path: str, **kwargs) -> pd.DataFrame:
+        """
+        Load data from a CSV or Excel file.
+        Auto-detects format based on file extension.
+
+        Args:
+            path: Path to the file (.csv, .xlsx, .xls)
+            **kwargs: Passed through to pandas read function.
+                      Useful kwargs: sheet_name, encoding, sep
+
+        Returns:
+            pandas DataFrame
+        """
+        file_path = Path(path)
+
+        if not file_path.exists():
+            raise FileNotFoundError(f"Data file not found: {path}")
+
+        ext = file_path.suffix.lower()
+
+        if ext == ".csv":
+            logger.info(f"Loading CSV: {path}")
+            df = pd.read_csv(path, encoding='latin-1', **kwargs)
+        elif ext in (".xlsx", ".xls"):
+            logger.info(f"Loading Excel ({ext}): {path}")
+            # Default to first sheet unless specified
+            sheet_name = kwargs.pop("sheet_name", 0)
+            df = pd.read_excel(path, sheet_name=sheet_name, engine="openpyxl", **kwargs)
+
+        else:
+            raise ValueError(
+                f"Unsupported file format: {ext}. "
+                f"Supported: .csv, .xlsx, .xls"
+            )
+
+        logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns from {file_path.name}")
+        return df
+
+    def validate(self, df: pd.DataFrame, required_columns: list) -> bool:
+        """
+        Validate that the DataFrame has all required columns.
+        Uses case-insensitive matching.
+
+        Args:
+            df: DataFrame to validate
+            required_columns: List of column names that must be present
+
+        Returns:
+            True if all required columns found
+        """
+        df_cols_lower = {col.lower().strip() for col in df.columns}
+        missing = []
+
+        for col in required_columns:
+            if col.lower().strip() not in df_cols_lower:
+                missing.append(col)
+
+        if missing:
+            logger.error(
+                f"Missing required columns: {missing}. "
+                f"Available: {list(df.columns)}"
+            )
+            return False
+
+        return True