# knowledge-pipeline/src/knowledge_pipeline/chunker/base_chunker.py

from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, TypedDict


class Document(TypedDict):
    """A dictionary representing a processed document chunk.

    This is the element type of the lists returned by the chunker
    ``process_*`` methods in this module.
    """

    # Presumably the text of the chunk itself -- confirm against concrete chunkers.
    page_content: str
    # Free-form dictionary; no key schema is defined in this module.
    metadata: dict


class BaseChunker(ABC):
    """Abstract base class for chunker implementations.

    Subclasses must implement :meth:`process_text`. The ``process_path`` and
    ``process_bytes`` helpers only obtain a ``str`` from their input and then
    delegate to ``process_text``.
    """

    # Declared here without a value; this base class never assigns or reads
    # it, so concrete chunkers are expected to set it themselves.
    max_chunk_size: int

    @abstractmethod
    def process_text(self, text: str) -> List["Document"]:
        """
        Processes a string of text into a list of Document chunks.

        Args:
            text: The input string to process.

        Returns:
            A list of Document objects.
        """
        ...

    def process_path(self, path: Path) -> List["Document"]:
        """
        Reads a file from a Path object and processes its content.

        It attempts to read the file with UTF-8 encoding and falls back to
        latin-1 if a UnicodeDecodeError occurs.

        Args:
            path: The Path object pointing to the file.

        Returns:
            A list of Document objects from the file's content.

        Raises:
            OSError: If the file cannot be opened or read; errors from
                ``Path.read_text`` are not caught here.
        """
        try:
            text = path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # latin-1 assigns a character to every byte value, so this
            # second read cannot fail with a decoding error.
            text = path.read_text(encoding="latin-1")
        return self.process_text(text)

    def process_bytes(self, b: bytes) -> List["Document"]:
        """
        Decodes a byte string and processes its content.

        It first attempts to decode the bytes as UTF-8. If that fails,
        it falls back to latin-1, mirroring ``process_path``.

        Args:
            b: The input byte string.

        Returns:
            A list of Document objects from the byte string's content.
        """
        try:
            text = b.decode("utf-8")
        except UnicodeDecodeError:
            # Fallback for input that is not UTF-8 encoded. This must be
            # latin-1 rather than another UTF-8 variant: "utf-8-sig" would
            # raise the same UnicodeDecodeError on these bytes, whereas
            # latin-1 accepts every byte sequence.
            text = b.decode("latin-1")
        return self.process_text(text)