from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, TypedDict


class Document(TypedDict):
    """A dictionary representing a processed document chunk."""

    # Text content of the chunk.
    page_content: str
    # Metadata associated with the chunk; its schema is not defined here.
    metadata: dict


class BaseChunker(ABC):
    """Abstract base class for chunker implementations.

    Subclasses implement ``process_text``; the ``process_path`` and
    ``process_bytes`` helpers decode their input to ``str`` and delegate
    to it.
    """

    # Declared for subclasses; this base class never reads or assigns it.
    max_chunk_size: int

    @abstractmethod
    def process_text(self, text: str) -> List[Document]:
        """
        Processes a string of text into a list of Document chunks.

        Args:
            text: The input string to process.

        Returns:
            A list of Document objects.
        """
        ...

    def process_path(self, path: Path) -> List[Document]:
        """
        Reads a file from a Path object and processes its content.

        It attempts to read the file with UTF-8 encoding and falls back to
        latin-1 if a UnicodeDecodeError occurs.

        Args:
            path: The Path object pointing to the file.

        Returns:
            A list of Document objects from the file's content.
        """
        try:
            text = path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # latin-1 maps every byte to a code point, so this cannot fail
            # with a decode error.
            text = path.read_text(encoding="latin-1")
        return self.process_text(text)

    def process_bytes(self, b: bytes) -> List[Document]:
        """
        Decodes a byte string and processes its content.

        It first attempts to decode the bytes as UTF-8. If that fails, it
        falls back to latin-1, mirroring ``process_path``.

        Args:
            b: The input byte string.

        Returns:
            A list of Document objects from the byte string's content.
        """
        try:
            text = b.decode("utf-8")
        except UnicodeDecodeError:
            # Fallback for data that is not UTF-8 encoded. This must be
            # latin-1: "utf-8-sig" is still UTF-8 and would raise the same
            # UnicodeDecodeError again, so the fallback would never succeed.
            text = b.decode("latin-1")
        return self.process_text(text)