Skip to content

Simple case detector

gridgulp.detectors.simple_case_detector

Simple case detector for identifying single-table sheets.

This module provides fast detection of sheets that contain only a single table, allowing the system to avoid expensive vision processing for simple cases.

SimpleTableResult dataclass

SimpleTableResult(is_simple_table: bool, table_range: str | None = None, confidence: float = 0.0, has_headers: bool = True, reason: str = '')

Result of simple table detection.

SimpleCaseDetector

SimpleCaseDetector(max_empty_threshold: int = 3)

Detects sheets containing a single, continuous table starting at or near A1.

Initialize the simple case detector.

Parameters:

  • max_empty_threshold (int, default: 3 ) –

    Maximum consecutive empty rows/cols allowed within table

Source code in src/gridgulp/detectors/simple_case_detector.py
def __init__(self, max_empty_threshold: int = 3):
    """Set up the detector with its tolerance for empty gaps.

    Args:
        max_empty_threshold: Upper bound on consecutive empty rows/columns
            tolerated inside a table.
    """
    # The threshold is stored as given; no validation happens here.
    self.max_empty_threshold = max_empty_threshold
    # Share the module-level logger instead of building one per instance.
    self.logger = logger

detect_simple_table

detect_simple_table(sheet_data: SheetData) -> SimpleTableResult

Detect if sheet contains a simple single table case.

A simple table case is defined as:

  1. Data starts at or near A1 (within first 3 rows/cols)
  2. Continuous data region with no empty rows/columns
  3. Rectangular shape
  4. Optional headers in first row

Parameters:

  • sheet_data (SheetData) –

    Sheet data to analyze

Returns:

  • SimpleTableResult –

    SimpleTableResult with detection outcome

Source code in src/gridgulp/detectors/simple_case_detector.py
def detect_simple_table(self, sheet_data: "SheetData") -> SimpleTableResult:
    """Decide whether the sheet holds exactly one simple table.

    The sheet qualifies when all of the following hold:
    1. The data bounds begin at or near A1 (row and column index <= 2).
    2. No empty rows or columns are reported inside the data bounds.
    3. At least half of the cells inside the bounds are filled.

    The first row is additionally inspected for headers; that outcome is
    reported in the result and passed to the confidence calculation, but it
    does not affect whether the sheet qualifies.

    Args:
        sheet_data: Sheet data to analyze

    Returns:
        SimpleTableResult describing the outcome. On success it carries the
        A1-style range, the confidence and the header flag; on failure only
        the reason is filled in.
    """

    def rejected(why: str) -> SimpleTableResult:
        # Every negative outcome has the same shape: flag off, reason set.
        return SimpleTableResult(is_simple_table=False, reason=why)

    # An empty sheet can never be a simple table.
    if not sheet_data.has_data():
        return rejected("Sheet has no data")

    # Bounding box of everything on the sheet.
    top, bottom, left, right = self._find_data_bounds(sheet_data)

    # Tolerate a small offset from the origin, but nothing larger.
    if top > 2 or left > 2:
        return rejected(
            f"Data doesn't start near A1 (starts at row {top + 1}, col {get_column_letter(left)})"
        )

    # Continuity: gather gaps along both axes, then report rows first.
    bounds = (top, bottom, left, right)
    blank_rows = self._find_empty_rows(sheet_data, *bounds)
    blank_cols = self._find_empty_columns(sheet_data, *bounds)

    if blank_rows:
        return rejected(f"Found {len(blank_rows)} empty rows within data region")

    if blank_cols:
        return rejected(f"Found {len(blank_cols)} empty columns within data region")

    # Density: share of filled cells inside the bounding box.
    row_count = bottom - top + 1
    col_count = right - left + 1
    cell_total = row_count * col_count
    cell_filled = self._count_filled_cells(sheet_data, *bounds)
    if cell_total > 0:
        density = cell_filled / cell_total
    else:
        density = 0

    # Require at least 50% of the cells to carry data.
    if density < 0.5:
        return rejected(f"Low data density: {density:.1%}")

    # Does the first row look like a header row?
    header_row_found = self._detect_headers(sheet_data, top, left, right)

    # Fold position, density, headers and row count into one score.
    score = self._calculate_confidence(top, left, density, header_row_found, row_count)

    # A1-style range; row numbers are shifted to be 1-based.
    range_text = (
        f"{get_column_letter(left)}{top + 1}:{get_column_letter(right)}{bottom + 1}"
    )

    return SimpleTableResult(
        is_simple_table=True,
        table_range=range_text,
        confidence=score,
        has_headers=header_row_found,
        reason="Detected simple continuous table",
    )

convert_to_table_info

convert_to_table_info(result: SimpleTableResult, sheet_name: str, sheet_data: Optional[SheetData] = None) -> TableInfo | None

Convert simple table result to TableInfo.

Parameters:

  • result (SimpleTableResult) –

    SimpleTableResult from detection

  • sheet_name (str) –

    Name of the sheet

  • sheet_data (Optional[SheetData], default: None ) –

    Sheet data for header extraction (optional)

Returns:

  • TableInfo | None

    TableInfo if simple table detected, None otherwise

Source code in src/gridgulp/detectors/simple_case_detector.py
def convert_to_table_info(
    self, result: SimpleTableResult, sheet_name: str, sheet_data: Optional["SheetData"] = None
) -> TableInfo | None:
    """Turn a positive detection result into a TableInfo.

    Args:
        result: SimpleTableResult from detection
        sheet_name: Name of the sheet; used to build the suggested table name
        sheet_data: Sheet data for header extraction (optional)

    Returns:
        TableInfo when the result marks a simple table and its range has the
        "start:end" form; None otherwise.
    """
    # Nothing to convert for a negative result or a missing range.
    if not (result.is_simple_table and result.table_range):
        return None

    range_text = result.table_range
    # Only "start:end" ranges are handled; anything else yields None.
    if ":" not in range_text:
        return None

    from ..utils.excel_utils import cell_to_indices

    # Split the range into its two corner cells and resolve their indices.
    first_cell, last_cell = range_text.split(":")
    start_row, start_col = cell_to_indices(first_cell)
    end_row, end_col = cell_to_indices(last_cell)

    bounds = TableRange(
        start_row=start_row,
        start_col=start_col,
        end_row=end_row,
        end_col=end_col,
    )

    # Headers are extracted only when detection reported them and sheet data
    # was supplied.
    if result.has_headers and sheet_data:
        header_values = self._extract_headers(sheet_data, bounds)
    else:
        header_values = None

    return TableInfo(
        id=f"simple_{start_row}_{start_col}",
        range=bounds,
        suggested_name=f"{sheet_name}_table",
        confidence=result.confidence,
        detection_method="simple_case",
        headers=header_values,
        data_preview=None,  # Not populated here
    )