code-shredding
This commit is contained in:
80
src/knowledge_pipeline/chunker/recursive_chunker.py
Normal file
80
src/knowledge_pipeline/chunker/recursive_chunker.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Annotated, List
|
||||
|
||||
import chonkie
|
||||
import typer
|
||||
|
||||
from .base_chunker import BaseChunker, Document
|
||||
|
||||
|
||||
class RecursiveChunker(BaseChunker):
    """Chunker backed by chonkie's recursive splitting strategy."""

    def __init__(self) -> None:
        """Set up the underlying chonkie recursive chunker."""
        self.processor = chonkie.RecursiveChunker()

    def process_text(self, text: str) -> List[Document]:
        """
        Split a string of text into a list of Document chunks.

        Args:
            text: The input string to process.

        Returns:
            A list of Document objects, one per chunk produced by the
            underlying processor, each tagged with its chunk_index.
        """
        return [
            {
                "page_content": chunk.text,
                "metadata": {"chunk_index": index},
            }
            for index, chunk in enumerate(self.processor(text))
        ]
|
||||
|
||||
|
||||
app = typer.Typer()


@app.command()
def main(
    input_file_path: Annotated[
        str, typer.Argument(help="Path to the input text file.")
    ],
    output_dir: Annotated[
        str, typer.Argument(help="Directory to save the output file.")
    ],
):
    """
    Processes a text file using RecursiveChunker and saves the output to a JSONL file.
    """
    print(f"Starting to process {input_file_path}...")

    # 1. Instantiate chunker and process the file using the inherited method
    chunker = RecursiveChunker()
    documents = chunker.process_path(Path(input_file_path))

    print(f"Successfully created {len(documents)} chunks.")

    # 2. Prepare and save the output.
    # Path.mkdir(parents=True, exist_ok=True) avoids the check-then-create
    # race the original os.path.exists()/os.makedirs() pair was exposed to;
    # the existence check is kept only to preserve the informational print.
    out_dir = Path(output_dir)
    if not out_dir.is_dir():
        out_dir.mkdir(parents=True, exist_ok=True)
        print(f"Created output directory: {output_dir}")

    output_file_path = out_dir / "chunked_documents.jsonl"

    # The source filename is loop-invariant, so compute it once.
    source_file = Path(input_file_path).name

    with open(output_file_path, "w", encoding="utf-8") as f:
        for doc in documents:
            # Add source file info to metadata before writing
            doc["metadata"]["source_file"] = source_file
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")

    print(f"Successfully saved {len(documents)} chunks to {output_file_path}")


if __name__ == "__main__":
    app()
|
||||
Reference in New Issue
Block a user