import json
import os
from pathlib import Path
from typing import Annotated, List

import chonkie
import typer

from .base_chunker import BaseChunker, Document


class RecursiveChunker(BaseChunker):
    """A chunker that uses the chonkie RecursiveChunker."""

    def __init__(self) -> None:
        """Initializes the RecursiveChunker."""
        # Underlying chonkie splitter; called directly on raw text.
        self.processor = chonkie.RecursiveChunker()

    def process_text(self, text: str) -> List[Document]:
        """
        Processes a string of text into a list of Document chunks.

        Args:
            text: The input string to process.

        Returns:
            A list of Document objects, one per chunk, each carrying its
            positional index in ``metadata["chunk_index"]``.
        """
        chunks = self.processor(text)
        # Build one Document per chunk; the index records original order.
        return [
            {
                "page_content": chunk.text,
                "metadata": {"chunk_index": i},
            }
            for i, chunk in enumerate(chunks)
        ]


app = typer.Typer()


@app.command()
def main(
    input_file_path: Annotated[
        str, typer.Argument(help="Path to the input text file.")
    ],
    output_dir: Annotated[
        str, typer.Argument(help="Directory to save the output file.")
    ],
):
    """
    Processes a text file using RecursiveChunker and saves the output
    to a JSONL file.
    """
    print(f"Starting to process {input_file_path}...")

    # 1. Instantiate chunker and process the file using the inherited method
    chunker = RecursiveChunker()
    documents = chunker.process_path(Path(input_file_path))
    print(f"Successfully created {len(documents)} chunks.")

    # 2. Prepare and save the output.
    # exist_ok=True avoids the check-then-create race of the previous
    # os.path.exists() guard and is idempotent across concurrent runs.
    out_dir = Path(output_dir)
    if not out_dir.exists():
        print(f"Created output directory: {output_dir}")
    out_dir.mkdir(parents=True, exist_ok=True)

    output_file_path = out_dir / "chunked_documents.jsonl"
    with open(output_file_path, "w", encoding="utf-8") as f:
        for doc in documents:
            # Add source file info to metadata before writing
            doc["metadata"]["source_file"] = os.path.basename(input_file_path)
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")

    print(f"Successfully saved {len(documents)} chunks to {output_file_path}")


if __name__ == "__main__":
    app()