code-shredding

This commit is contained in:
2026-02-24 03:50:34 +00:00
parent 98a1b5939e
commit 9ccfbb44c2
71 changed files with 1026 additions and 2417 deletions

View File

@@ -0,0 +1,68 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, TypedDict
class Document(TypedDict):
"""A dictionary representing a processed document chunk."""
page_content: str
metadata: dict
class BaseChunker(ABC):
"""Abstract base class for chunker implementations."""
max_chunk_size: int
@abstractmethod
def process_text(self, text: str) -> List[Document]:
"""
Processes a string of text into a list of Document chunks.
Args:
text: The input string to process.
Returns:
A list of Document objects.
"""
...
def process_path(self, path: Path) -> List[Document]:
"""
Reads a file from a Path object and processes its content.
It attempts to read the file with UTF-8 encoding and falls back to
latin-1 if a UnicodeDecodeError occurs.
Args:
path: The Path object pointing to the file.
Returns:
A list of Document objects from the file's content.
"""
try:
text = path.read_text(encoding="utf-8")
except UnicodeDecodeError:
text = path.read_text(encoding="latin-1")
return self.process_text(text)
def process_bytes(self, b: bytes) -> List[Document]:
"""
Decodes a byte string and processes its content.
It first attempts to decode the bytes as UTF-8. If that fails,
it falls back to latin-1.
Args:
b: The input byte string.
Returns:
A list of Document objects from the byte string's content.
"""
try:
text = b.decode("utf-8")
except UnicodeDecodeError:
# Fallback for files that are not UTF-8 encoded.
text = b.decode("utf-8-sig")
return self.process_text(text)