code-shredding
This commit is contained in:
68
src/knowledge_pipeline/chunker/base_chunker.py
Normal file
68
src/knowledge_pipeline/chunker/base_chunker.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import List, TypedDict
|
||||
|
||||
|
||||
class Document(TypedDict):
    """Typed dictionary describing a single processed document chunk.

    Keys:
        page_content: The chunk's text.
        metadata: A dictionary attached to the chunk; its schema is not
            defined by this type.
    """

    # Text carried by this chunk.
    page_content: str
    # Free-form dictionary accompanying the chunk.
    metadata: dict
|
||||
|
||||
|
||||
class BaseChunker(ABC):
    """Abstract base class for chunker implementations.

    Subclasses implement ``process_text``. ``process_path`` and
    ``process_bytes`` turn their input into a string and delegate to it.
    """

    # Annotation only: no value is assigned in this class.
    max_chunk_size: int

    @abstractmethod
    def process_text(self, text: str) -> List[Document]:
        """
        Processes a string of text into a list of Document chunks.

        Args:
            text: The input string to process.

        Returns:
            A list of Document objects.
        """
        ...

    def process_path(self, path: Path) -> List[Document]:
        """
        Reads a file from a Path object and processes its content.

        It attempts to read the file with UTF-8 encoding and falls back to
        latin-1 if a UnicodeDecodeError occurs.

        Args:
            path: The Path object pointing to the file.

        Returns:
            A list of Document objects from the file's content.
        """
        try:
            text = path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # The file is re-read with latin-1, which accepts any byte value.
            text = path.read_text(encoding="latin-1")
        return self.process_text(text)

    def process_bytes(self, b: bytes) -> List[Document]:
        """
        Decodes a byte string and processes its content.

        It first attempts to decode the bytes as UTF-8. If that fails,
        it falls back to latin-1.

        Args:
            b: The input byte string.

        Returns:
            A list of Document objects from the byte string's content.
        """
        try:
            text = b.decode("utf-8")
        except UnicodeDecodeError:
            # Fallback for input that is not UTF-8 encoded. latin-1 maps
            # every byte value to a character, so this decode cannot fail.
            # "utf-8-sig" must not be used here: it is still UTF-8 and would
            # raise the same UnicodeDecodeError again.
            text = b.decode("latin-1")
        return self.process_text(text)
|
||||
Reference in New Issue
Block a user