69 lines
1.8 KiB
Python
69 lines
1.8 KiB
Python
from abc import ABC, abstractmethod
|
|
from pathlib import Path
|
|
from typing import List, TypedDict
|
|
|
|
|
|
class Document(TypedDict):
    """One chunk of processed text together with its metadata.

    Instances are plain dictionaries with exactly the two keys declared below.
    """

    # The text carried by this chunk.
    page_content: str
    # Free-form mapping attached to the chunk; no key schema is declared here.
    metadata: dict
|
|
|
|
|
|
class BaseChunker(ABC):
    """Abstract base class for chunker implementations.

    Subclasses must implement ``process_text``. The ``process_path`` and
    ``process_bytes`` helpers only turn their input into a ``str`` and then
    delegate to ``process_text``.
    """

    # Declared for subclasses; this base class neither assigns nor reads it.
    max_chunk_size: int

    @abstractmethod
    def process_text(self, text: str) -> List[Document]:
        """
        Processes a string of text into a list of Document chunks.

        Args:
            text: The input string to process.

        Returns:
            A list of Document objects.
        """
        ...

    def process_path(self, path: Path) -> List[Document]:
        """
        Reads a file from a Path object and processes its content.

        It attempts to read the file with UTF-8 encoding and falls back to
        latin-1 if a UnicodeDecodeError occurs.

        Args:
            path: The Path object pointing to the file.

        Returns:
            A list of Document objects from the file's content.
        """
        try:
            text = path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # latin-1 assigns a character to every byte value, so this second
            # read cannot raise a decode error.
            text = path.read_text(encoding="latin-1")
        return self.process_text(text)

    def process_bytes(self, b: bytes) -> List[Document]:
        """
        Decodes a byte string and processes its content.

        It first attempts to decode the bytes as UTF-8. If that fails,
        it falls back to latin-1, mirroring ``process_path``.

        Args:
            b: The input byte string.

        Returns:
            A list of Document objects from the byte string's content.
        """
        try:
            text = b.decode("utf-8")
        except UnicodeDecodeError:
            # Fallback for input that is not valid UTF-8. "utf-8-sig" would
            # fail on the same bytes (it is UTF-8 plus optional BOM handling),
            # whereas latin-1 maps every byte to a character and never raises.
            text = b.decode("latin-1")
        return self.process_text(text)
|