# Original listing: 81 lines, 2.2 KiB, Python
import json
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Annotated, List
|
|
|
|
import chonkie
|
|
import typer
|
|
|
|
from .base_chunker import BaseChunker, Document
|
|
|
|
|
|
class RecursiveChunker(BaseChunker):
    """Chunker that delegates text splitting to chonkie's RecursiveChunker."""

    def __init__(self) -> None:
        """Create the underlying chonkie chunker with no arguments."""
        self.processor = chonkie.RecursiveChunker()

    def process_text(self, text: str) -> List[Document]:
        """
        Split ``text`` into chunks and wrap each chunk as a Document.

        Args:
            text: The input string to process.

        Returns:
            One Document per chunk, in the order the chunker produced them.
            Each holds the chunk text under "page_content" and its position
            under "metadata" -> "chunk_index".
        """
        # The processor is called directly on the text; every resulting chunk
        # contributes its `.text` and its enumeration index.
        return [
            {
                "page_content": piece.text,
                "metadata": {"chunk_index": position},
            }
            for position, piece in enumerate(self.processor(text))
        ]
|
|
|
|
|
|
# Typer application object; the `main` command below is registered on it
# through the @app.command() decorator.
app = typer.Typer()
|
|
|
|
|
|
@app.command()
def main(
    input_file_path: Annotated[
        str, typer.Argument(help="Path to the input text file.")
    ],
    output_dir: Annotated[
        str, typer.Argument(help="Directory to save the output file.")
    ],
) -> None:
    """
    Processes a text file using RecursiveChunker and saves the output to a JSONL file.
    """
    # NOTE: Typer shows the docstring above as the command's help text, so it
    # is kept short and user-facing; implementation notes live in comments.
    print(f"Starting to process {input_file_path}...")

    # 1. Instantiate chunker and process the file. `process_path` is not
    #    defined on RecursiveChunker in this file, so it is presumably
    #    inherited from BaseChunker.
    chunker = RecursiveChunker()
    documents = chunker.process_path(Path(input_file_path))

    print(f"Successfully created {len(documents)} chunks.")

    # 2. Prepare and save the output.
    # Record whether the directory was already there only to decide on the
    # message; the creation itself uses exist_ok=True so that a directory
    # appearing between the check and the call does not raise FileExistsError.
    already_present = os.path.exists(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    if not already_present:
        print(f"Created output directory: {output_dir}")

    output_file_path = os.path.join(output_dir, "chunked_documents.jsonl")

    # The source file name is the same for every chunk, so compute it once.
    source_file = os.path.basename(input_file_path)

    with open(output_file_path, "w", encoding="utf-8") as f:
        for doc in documents:
            # Add source file info to metadata before writing
            doc["metadata"]["source_file"] = source_file
            # One JSON object per line (JSONL); ensure_ascii=False keeps
            # non-ASCII text readable in the output file.
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")

    print(f"Successfully saved {len(documents)} chunks to {output_file_path}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the Typer application only when this file is executed as a script.
    app()
|