code-shredding
This commit is contained in:
80
src/knowledge_pipeline/chunker/recursive_chunker.py
Normal file
80
src/knowledge_pipeline/chunker/recursive_chunker.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Annotated, List
|
||||
|
||||
import chonkie
|
||||
import typer
|
||||
|
||||
from .base_chunker import BaseChunker, Document
|
||||
|
||||
|
||||
class RecursiveChunker(BaseChunker):
    """Chunker backed by chonkie's recursive splitting strategy."""

    def __init__(self) -> None:
        """Set up the underlying chonkie recursive chunker."""
        self.processor = chonkie.RecursiveChunker()

    def process_text(self, text: str) -> List[Document]:
        """
        Split a string of text into a list of Document chunks.

        Args:
            text: The input string to process.

        Returns:
            A list of Document objects, one per chunk produced by the
            underlying processor, each tagged with its chunk_index.
        """
        return [
            {
                "page_content": chunk.text,
                "metadata": {"chunk_index": index},
            }
            for index, chunk in enumerate(self.processor(text))
        ]
|
||||
|
||||
|
||||
app = typer.Typer()


@app.command()
def main(
    input_file_path: Annotated[
        str, typer.Argument(help="Path to the input text file.")
    ],
    output_dir: Annotated[
        str, typer.Argument(help="Directory to save the output file.")
    ],
):
    """
    Processes a text file using RecursiveChunker and saves the output to a JSONL file.
    """
    print(f"Starting to process {input_file_path}...")

    # 1. Instantiate chunker and process the file using the inherited method
    chunker = RecursiveChunker()
    documents = chunker.process_path(Path(input_file_path))

    print(f"Successfully created {len(documents)} chunks.")

    # 2. Prepare and save the output.
    # Path.mkdir(parents=True, exist_ok=True) avoids the check-then-create
    # race the original os.path.exists()/os.makedirs() pair was exposed to;
    # the existence check is kept only to preserve the informational print.
    out_dir = Path(output_dir)
    if not out_dir.is_dir():
        out_dir.mkdir(parents=True, exist_ok=True)
        print(f"Created output directory: {output_dir}")

    output_file_path = out_dir / "chunked_documents.jsonl"

    # The source filename is loop-invariant, so compute it once.
    source_file = Path(input_file_path).name

    with open(output_file_path, "w", encoding="utf-8") as f:
        for doc in documents:
            # Add source file info to metadata before writing
            doc["metadata"]["source_file"] = source_file
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")

    print(f"Successfully saved {len(documents)} chunks to {output_file_path}")


if __name__ == "__main__":
    app()
|
||||
Reference in New Issue
Block a user