code-shredding

2026-02-24 03:50:34 +00:00
parent 98a1b5939e
commit 9ccfbb44c2
71 changed files with 1026 additions and 2417 deletions
--- a/src/knowledge_pipeline/chunker/base_chunker.py
+++ b/src/knowledge_pipeline/chunker/base_chunker.py
@@ -0,0 +1,68 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, TypedDict
+
+
+class Document(TypedDict):
+    """Dict shape for a single chunk returned by a chunker."""
+
+    page_content: str  # the chunk's text
+    metadata: dict  # metadata attached to the chunk
+
+
+class BaseChunker(ABC):
+    """Abstract base class for chunker implementations."""
+
+    max_chunk_size: int  # annotation only; no value is assigned here
+
+    @abstractmethod
+    def process_text(self, text: str) -> List[Document]:
+        """
+        Processes a string of text into a list of Document chunks.
+
+        Args:
+            text: The input string to process.
+
+        Returns:
+            A list of Document objects.
+        """
+        ...
+
+    def process_path(self, path: Path) -> List[Document]:
+        """
+        Reads a file from a Path object and processes its content.
+
+        The file is read as UTF-8; if that raises UnicodeDecodeError it is
+        read again as latin-1.
+
+        Args:
+            path: The Path object pointing to the file.
+
+        Returns:
+            A list of Document objects from the file's content.
+        """
+        try:
+            text = path.read_text(encoding="utf-8")
+        except UnicodeDecodeError:
+            text = path.read_text(encoding="latin-1")
+        return self.process_text(text)
+
+    def process_bytes(self, b: bytes) -> List[Document]:
+        """
+        Decodes a byte string and processes its content.
+
+        The bytes are decoded as UTF-8; if that raises UnicodeDecodeError
+        they are decoded as latin-1, mirroring process_path.
+
+        Args:
+            b: The input byte string.
+
+        Returns:
+            A list of Document objects from the byte string's content.
+        """
+        try:
+            text = b.decode("utf-8")
+        except UnicodeDecodeError:
+            # latin-1 accepts every byte; "utf-8-sig" would just fail again.
+            text = b.decode("latin-1")
+        return self.process_text(text)