Files
knowledge-pipeline/src/knowledge_pipeline/chunker/recursive_chunker.py
2026-02-24 06:49:31 +00:00

81 lines
2.2 KiB
Python

import json
import os
from pathlib import Path
from typing import Annotated, List
import chonkie
import typer
from .base_chunker import BaseChunker, Document
class RecursiveChunker(BaseChunker):
    """Chunker that delegates text splitting to chonkie's ``RecursiveChunker``."""

    def __init__(self) -> None:
        """Create the underlying chonkie processor (constructed with no arguments)."""
        self.processor = chonkie.RecursiveChunker()

    def process_text(self, text: str) -> List[Document]:
        """
        Split ``text`` into chunks and wrap each chunk as a Document.

        Args:
            text: The raw string to be chunked.

        Returns:
            One Document per chunk, in the order yielded by the processor.
            Each holds the chunk text under "page_content" and the chunk's
            zero-based position under metadata["chunk_index"].
        """
        pieces = self.processor(text)
        return [
            {
                "page_content": piece.text,
                "metadata": {"chunk_index": position},
            }
            for position, piece in enumerate(pieces)
        ]
# Typer application object; `main` is registered on it via `@app.command()`.
app = typer.Typer()
@app.command()
def main(
    input_file_path: Annotated[
        str, typer.Argument(help="Path to the input text file.")
    ],
    output_dir: Annotated[
        str, typer.Argument(help="Directory to save the output file.")
    ],
) -> None:
    """
    Processes a text file using RecursiveChunker and saves the output to a JSONL file.

    Args:
        input_file_path: Path of the text file to chunk.
        output_dir: Directory that receives ``chunked_documents.jsonl``; it is
            created (including missing parents) when it does not exist yet.

    Raises:
        OSError: If the output directory cannot be created or the output file
            cannot be written (e.g. ``output_dir`` names an existing file).
    """
    print(f"Starting to process {input_file_path}...")

    # 1. Instantiate chunker and process the file using the inherited method
    chunker = RecursiveChunker()
    documents = chunker.process_path(Path(input_file_path))
    print(f"Successfully created {len(documents)} chunks.")

    # 2. Prepare and save the output
    # Remember whether the directory was missing only to decide if the
    # "created" message applies; the creation itself uses exist_ok=True so a
    # directory that appears between the check and the call is not an error.
    directory_was_missing = not os.path.exists(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    if directory_was_missing:
        print(f"Created output directory: {output_dir}")

    output_file_path = os.path.join(output_dir, "chunked_documents.jsonl")
    # The source file name is the same for every chunk, so compute it once.
    source_file = os.path.basename(input_file_path)

    with open(output_file_path, "w", encoding="utf-8") as f:
        for doc in documents:
            # Add source file info to metadata before writing
            doc["metadata"]["source_file"] = source_file
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")

    print(f"Successfully saved {len(documents)} chunks to {output_file_path}")
# Run the Typer CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    app()