code-shredding
This commit is contained in:
2
src/knowledge_pipeline/__init__.py
Normal file
2
src/knowledge_pipeline/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
def main() -> None:
    """Package entry point: print the project greeting."""
    print("Hello from rag-eval!")
|
||||
2
src/knowledge_pipeline/chunker/__init__.py
Normal file
2
src/knowledge_pipeline/chunker/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
def hello() -> str:
    """Return the chunker package greeting string."""
    return "Hello from chunker!"
|
||||
68
src/knowledge_pipeline/chunker/base_chunker.py
Normal file
68
src/knowledge_pipeline/chunker/base_chunker.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import List, TypedDict
|
||||
|
||||
|
||||
class Document(TypedDict):
    """A dictionary representing a processed document chunk."""

    # Raw text content of the chunk.
    page_content: str
    # Arbitrary chunk metadata (e.g. chunk_index, page, source file).
    metadata: dict
|
||||
|
||||
|
||||
class BaseChunker(ABC):
    """Abstract base class for chunker implementations.

    Subclasses implement :meth:`process_text`; the path/bytes helpers decode
    input to ``str`` (UTF-8 first, latin-1 fallback) and delegate to it.
    """

    # Maximum size of a produced chunk; concrete subclasses are expected to set it.
    max_chunk_size: int

    @abstractmethod
    def process_text(self, text: str) -> List[Document]:
        """
        Processes a string of text into a list of Document chunks.

        Args:
            text: The input string to process.

        Returns:
            A list of Document objects.
        """
        ...

    def process_path(self, path: Path) -> List[Document]:
        """
        Reads a file from a Path object and processes its content.

        It attempts to read the file with UTF-8 encoding and falls back to
        latin-1 if a UnicodeDecodeError occurs.

        Args:
            path: The Path object pointing to the file.

        Returns:
            A list of Document objects from the file's content.
        """
        try:
            text = path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            text = path.read_text(encoding="latin-1")
        return self.process_text(text)

    def process_bytes(self, b: bytes) -> List[Document]:
        """
        Decodes a byte string and processes its content.

        It first attempts to decode the bytes as UTF-8. If that fails,
        it falls back to latin-1.

        Args:
            b: The input byte string.

        Returns:
            A list of Document objects from the byte string's content.
        """
        try:
            text = b.decode("utf-8")
        except UnicodeDecodeError:
            # Fallback for files that are not UTF-8 encoded.
            # BUG FIX: the previous fallback used "utf-8-sig", which is still
            # UTF-8 (it only strips a BOM) and would raise UnicodeDecodeError
            # again for any non-UTF-8 input. latin-1 decodes every byte and
            # matches both the docstring and process_path's behavior.
            text = b.decode("latin-1")
        return self.process_text(text)
|
||||
98
src/knowledge_pipeline/chunker/contextual_chunker.py
Normal file
98
src/knowledge_pipeline/chunker/contextual_chunker.py
Normal file
@@ -0,0 +1,98 @@
|
||||
from .base_chunker import BaseChunker, Document
|
||||
|
||||
|
||||
class ContextualChunker(BaseChunker):
    """Chunker that prefixes each chunk with an LLM-written document summary.

    Long texts are split into roughly even pieces (preferring paragraph and
    sentence boundaries), and each piece is prepended with a one-paragraph
    Spanish summary of the whole document produced by the configured model.
    """

    def __init__(
        self,
        model: str = "google-vertex:gemini-2.0-flash",
        max_chunk_size: int = 800,
    ):
        self.max_chunk_size = max_chunk_size
        self.model = model

    def _split_text(self, text: str) -> list[str]:
        """Split *text* into evenly sized pieces of at most max_chunk_size
        characters, preferring paragraph and sentence boundaries as cut points."""
        import math

        total = len(text)
        pieces_needed = math.ceil(total / self.max_chunk_size)
        if pieces_needed == 1:
            return [text]

        target_len = math.ceil(total / pieces_needed)

        pieces: list[str] = []
        start = 0
        while start < total:
            window_end = min(start + target_len, total)

            if window_end >= total:
                # Final piece: take everything that remains.
                cut = window_end
            else:
                # Prefer a paragraph break, then a sentence break, then a hard cut.
                para = text.rfind("\n\n", start, window_end)
                if para != -1:
                    cut = para + 2
                else:
                    sentence = text.rfind(". ", start, window_end)
                    cut = sentence + 1 if sentence != -1 else window_end

            pieces.append(text[start:cut])
            start = cut

        return pieces

    def process_text(self, text: str) -> list[Document]:
        """
        Processes a string of text into a list of context-aware Document chunks.
        """
        if len(text) <= self.max_chunk_size:
            # Short enough to stand on its own: no contextual summary needed.
            return [{"page_content": text, "metadata": {}}]

        processed: list[Document] = []
        for index, piece in enumerate(self._split_text(text)):
            prompt = f"""
Documento Original:
---
{text}
---

Fragmento Actual:
---
{piece}
---

Tarea:
Genera un resumen conciso del "Documento Original" que proporcione el contexto necesario para entender el "Fragmento Actual". El resumen debe ser un solo párrafo en español.
"""

            from pydantic_ai import ModelRequest
            from pydantic_ai.direct import model_request_sync

            reply = model_request_sync(
                self.model,
                [ModelRequest.user_text_prompt(prompt)],
            )
            # Take the first text part of the model response as the summary.
            summary = next(p.content for p in reply.parts if p.part_kind == "text")
            enriched = (
                f"> **Contexto del documento original:**\n> {summary}\n\n---\n\n"
                + piece
            )

            processed.append(
                {
                    "page_content": enriched,
                    "metadata": {"chunk_index": index},
                }
            )

        return processed
|
||||
576
src/knowledge_pipeline/chunker/llm_chunker.py
Normal file
576
src/knowledge_pipeline/chunker/llm_chunker.py
Normal file
@@ -0,0 +1,576 @@
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
from typing import Annotated, List
|
||||
|
||||
import tiktoken
|
||||
import typer
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain_core.documents import Document as LangchainDocument
|
||||
from llm.vertex_ai import VertexAILLM
|
||||
from pdf2image import convert_from_path
|
||||
from pypdf import PdfReader
|
||||
from rag_eval.config import Settings
|
||||
|
||||
from .base_chunker import BaseChunker, Document
|
||||
|
||||
|
||||
class TokenManager:
    """Counts tokens with tiktoken and truncates text to a token budget."""

    def __init__(self, model_name: str = "gpt-3.5-turbo"):
        # Unknown model names fall back to the generic cl100k_base encoding.
        try:
            self.encoding = tiktoken.encoding_for_model(model_name)
        except KeyError:
            self.encoding = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens in *text*."""
        return len(self.encoding.encode(text))

    def truncate_to_tokens(
        self, text: str, max_tokens: int, preserve_sentences: bool = True
    ) -> str:
        """Cut *text* down to at most *max_tokens* tokens.

        When preserve_sentences is True and a period falls within the last
        30% of the truncated text, the cut is moved back to that period so
        the result ends on a sentence boundary.
        """
        token_ids = self.encoding.encode(text)
        if len(token_ids) <= max_tokens:
            return text

        clipped = self.encoding.decode(token_ids[:max_tokens])

        if preserve_sentences:
            period_at = clipped.rfind(".")
            if period_at > len(clipped) * 0.7:
                return clipped[: period_at + 1]

        return clipped
|
||||
|
||||
|
||||
class OptimizedChunkProcessor:
    """Uses an LLM to merge and enhance text chunks.

    Keeps every produced chunk within a hard token budget (``max_tokens``)
    while aiming for ``target_tokens``. When a Vertex AI client is provided,
    merge decisions and chunk optimization are delegated to the model;
    otherwise cheap heuristics / truncation are used. Both LLM-backed
    operations are memoized in unbounded in-memory dicts.
    """

    def __init__(
        self,
        model: str,
        max_tokens: int = 1000,
        target_tokens: int = 800,
        chunks_per_batch: int = 5,
        gemini_client: VertexAILLM | None = None,
        model_name: str = "gpt-3.5-turbo",
        custom_instructions: str = "",
    ):
        # model: LLM identifier passed to client.generate();
        # model_name: tiktoken model used only for token counting.
        self.model = model
        self.client = gemini_client
        self.chunks_per_batch = chunks_per_batch
        self.max_tokens = max_tokens
        self.target_tokens = target_tokens
        self.token_manager = TokenManager(model_name)
        self.custom_instructions = custom_instructions
        # Caches keyed by _get_cache_key(); never evicted for the lifetime
        # of this processor instance.
        self._merge_cache = {}
        self._enhance_cache = {}

    def _get_cache_key(self, text: str) -> str:
        # 16-hex-char MD5 prefix over text + custom instructions, so changing
        # the instructions invalidates prior cached results.
        combined = text + self.custom_instructions
        return hashlib.md5(combined.encode()).hexdigest()[:16]

    def should_merge_chunks(self, chunk1: str, chunk2: str) -> bool:
        """Decide whether two consecutive chunks should be joined.

        Returns False immediately when the combined text exceeds max_tokens.
        With a client, asks the LLM for a SI/NO answer; without one, merges
        only when chunk1 does not end a sentence and the combined size fits
        within target_tokens. Any error is treated as "do not merge".
        """
        cache_key = f"{self._get_cache_key(chunk1)}_{self._get_cache_key(chunk2)}"
        if cache_key in self._merge_cache:
            return self._merge_cache[cache_key]

        try:
            combined_text = f"{chunk1}\n\n{chunk2}"
            combined_tokens = self.token_manager.count_tokens(combined_text)

            # Hard budget check before spending an LLM call.
            if combined_tokens > self.max_tokens:
                self._merge_cache[cache_key] = False
                return False

            if self.client:
                base_prompt = f"""Analiza estos dos fragmentos de texto y determina si deben unirse.

LÍMITES ESTRICTOS:
- Tokens combinados: {combined_tokens}/{self.max_tokens}
- Solo unir si hay continuidad semántica clara

Criterios de unión:
1. El primer fragmento termina abruptamente
2. El segundo fragmento continúa la misma idea/concepto
3. La unión mejora la coherencia del contenido
4. Exceder {self.max_tokens} tokens, SOLAMENTE si es necesario para mantener el contexto"""

                # Only the first 500 characters of each chunk are shown to the model.
                base_prompt += f"""

Responde SOLO 'SI' o 'NO'.

Fragmento 1 ({self.token_manager.count_tokens(chunk1)} tokens):
{chunk1[:500]}...

Fragmento 2 ({self.token_manager.count_tokens(chunk2)} tokens):
{chunk2[:500]}..."""

                response = self.client.generate(self.model, base_prompt).text
                result = response.strip().upper() == "SI"
                self._merge_cache[cache_key] = result
                return result

            # Heuristic fallback when no LLM client is configured.
            result = (
                not chunk1.rstrip().endswith((".", "!", "?"))
                and combined_tokens <= self.target_tokens
            )
            self._merge_cache[cache_key] = result
            return result

        except Exception as e:
            # Best-effort: a failed analysis must never break the pipeline.
            print(f"Error analizando chunks para merge: {e}")
            self._merge_cache[cache_key] = False
            return False

    def enhance_chunk(self, chunk_text: str) -> str:
        """Optimize a chunk with the LLM while enforcing the max_tokens budget.

        Without a client (or when the chunk is already at/over budget) the
        text is only truncated as needed. LLM output that overflows the
        budget is truncated too. Errors fall back to truncation.
        """
        cache_key = self._get_cache_key(chunk_text)
        if cache_key in self._enhance_cache:
            return self._enhance_cache[cache_key]

        current_tokens = self.token_manager.count_tokens(chunk_text)

        try:
            if self.client and current_tokens < self.max_tokens:
                base_prompt = f"""Optimiza este texto siguiendo estas reglas ESTRICTAS:

LÍMITES DE TOKENS:
- Actual: {current_tokens} tokens
- Máximo permitido: {self.max_tokens} tokens
- Objetivo: {self.target_tokens} tokens

REGLAS FUNDAMENTALES:
NO exceder {self.max_tokens} tokens bajo ninguna circunstancia
Mantener TODA la información esencial y metadatos
NO cambiar términos técnicos o palabras clave
Asegurar oraciones completas y coherentes
Optimizar claridad y estructura sin añadir contenido
SOLO devuelve el texto no agregues conclusiones NUNCA

Si el texto está cerca del límite, NO expandir. Solo mejorar estructura."""

                if self.custom_instructions.strip():
                    base_prompt += (
                        f"\n\nINSTRUCCIONES ADICIONALES:\n{self.custom_instructions}"
                    )

                base_prompt += f"\n\nTexto a optimizar:\n{chunk_text}"

                response = self.client.generate(self.model, base_prompt).text
                enhanced_text = response.strip()

                # The model may ignore the budget; enforce it here.
                enhanced_tokens = self.token_manager.count_tokens(enhanced_text)
                if enhanced_tokens > self.max_tokens:
                    print(
                        f"Advertencia: Texto optimizado excede límite ({enhanced_tokens} > {self.max_tokens})"
                    )
                    enhanced_text = self.token_manager.truncate_to_tokens(
                        enhanced_text, self.max_tokens
                    )

                self._enhance_cache[cache_key] = enhanced_text
                return enhanced_text
            else:
                # No client, or chunk already at/over budget: truncate only.
                if current_tokens > self.max_tokens:
                    truncated = self.token_manager.truncate_to_tokens(
                        chunk_text, self.max_tokens
                    )
                    self._enhance_cache[cache_key] = truncated
                    return truncated

                self._enhance_cache[cache_key] = chunk_text
                return chunk_text

        except Exception as e:
            # On failure, return the (budget-enforced) original text.
            print(f"Error procesando chunk: {e}")
            if current_tokens > self.max_tokens:
                truncated = self.token_manager.truncate_to_tokens(
                    chunk_text, self.max_tokens
                )
                self._enhance_cache[cache_key] = truncated
                return truncated

            self._enhance_cache[cache_key] = chunk_text
            return chunk_text

    def process_chunks_batch(
        self, chunks: List[LangchainDocument], merge_related: bool = False
    ) -> List[LangchainDocument]:
        """Optionally merge consecutive related chunks, then enhance each one.

        The metadata of a merged group comes from its FIRST chunk; a
        "final_tokens" entry is added to every result. Progress is printed,
        with a short pause every chunks_per_batch processed chunks.
        """
        processed_chunks = []
        total_chunks = len(chunks)

        print(f"Procesando {total_chunks} chunks en lotes de {self.chunks_per_batch}")
        if self.custom_instructions:
            print(
                f"Con instrucciones personalizadas: {self.custom_instructions[:100]}..."
            )

        i = 0
        while i < len(chunks):
            batch_start = time.time()
            current_chunk = chunks[i]
            merged_content = current_chunk.page_content
            original_tokens = self.token_manager.count_tokens(merged_content)

            if merge_related and i < len(chunks) - 1:
                # Greedily absorb following chunks while the LLM/heuristic agrees.
                merge_count = 0
                while i + merge_count < len(chunks) - 1 and self.should_merge_chunks(
                    merged_content, chunks[i + merge_count + 1].page_content
                ):
                    merge_count += 1
                    merged_content += "\n\n" + chunks[i + merge_count].page_content
                    print(f" Uniendo chunk {i + 1} con chunk {i + merge_count + 1}")

                # Skip past the chunks that were absorbed.
                i += merge_count

            print(f"\nProcesando chunk {i + 1}/{total_chunks}")
            print(f" Tokens originales: {original_tokens}")

            enhanced_content = self.enhance_chunk(merged_content)
            final_tokens = self.token_manager.count_tokens(enhanced_content)

            processed_chunks.append(
                LangchainDocument(
                    page_content=enhanced_content,
                    metadata={
                        **current_chunk.metadata,
                        "final_tokens": final_tokens,
                    },
                )
            )

            print(f" Tokens finales: {final_tokens}")
            print(f" Tiempo de procesamiento: {time.time() - batch_start:.2f}s")

            i += 1

            # Brief pause between batches to avoid hammering the backend.
            if i % self.chunks_per_batch == 0 and i < len(chunks):
                print(f"\nCompletados {i}/{total_chunks} chunks")
                time.sleep(0.1)

        return processed_chunks
|
||||
|
||||
|
||||
class LLMChunker(BaseChunker):
    """Implements a chunker that uses an LLM to optimize PDF and text content.

    PDFs are split per page with a token-aware RecursiveCharacterTextSplitter,
    pages containing special formats (tables/ASCII diagrams) are optionally
    rendered to PNG images, and every chunk is merged/enhanced through an
    OptimizedChunkProcessor.
    """

    def __init__(
        self,
        output_dir: str,
        model: str,
        max_tokens: int = 1000,
        target_tokens: int = 800,
        gemini_client: VertexAILLM | None = None,
        custom_instructions: str = "",
        extract_images: bool = True,
        max_workers: int = 4,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        merge_related: bool = True,
    ):
        """Store configuration and build the shared chunk processor.

        Args:
            output_dir: Directory where page images and outputs are written.
            model: LLM identifier for format detection and optimization.
            max_tokens: Hard per-chunk token budget.
            target_tokens: Preferred per-chunk token size.
            gemini_client: Optional Vertex AI client; None disables LLM calls.
            custom_instructions: Extra instructions for chunk optimization.
            extract_images: Render special-format pages as PNGs when True.
            max_workers: Thread pool size for detection/extraction.
            chunk_size: Base splitter chunk size (measured in tokens).
            chunk_overlap: Splitter overlap (measured in tokens).
            merge_related: Whether to merge semantically related chunks.
        """
        self.output_dir = output_dir
        self.model = model
        self.client = gemini_client
        self.max_workers = max_workers
        self.token_manager = TokenManager()
        self.custom_instructions = custom_instructions
        self.extract_images = extract_images
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.merge_related = merge_related
        # Memoizes special-format detection per chunk-content hash.
        self._format_cache = {}

        self.chunk_processor = OptimizedChunkProcessor(
            model=self.model,
            max_tokens=max_tokens,
            target_tokens=target_tokens,
            gemini_client=gemini_client,
            custom_instructions=custom_instructions,
        )

    def process_text(self, text: str) -> List[Document]:
        """Processes raw text using the LLM optimizer.

        Splits the text with a token-aware splitter, runs the merge/enhance
        pipeline, and converts the results to Document dicts.
        """
        print("\n=== Iniciando procesamiento de texto ===")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=self.token_manager.count_tokens,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        # Create dummy LangchainDocuments for compatibility with process_chunks_batch
        langchain_docs = text_splitter.create_documents([text])

        processed_docs = self.chunk_processor.process_chunks_batch(
            langchain_docs, self.merge_related
        )

        # Convert from LangchainDocument to our Document TypedDict
        final_documents: List[Document] = [
            {"page_content": doc.page_content, "metadata": doc.metadata}
            for doc in processed_docs
        ]
        print(
            f"\n=== Procesamiento de texto completado: {len(final_documents)} chunks creados ==="
        )
        return final_documents

    def process_path(self, path: Path) -> List[Document]:
        """Processes a PDF file, extracts text and images, and optimizes chunks.

        Steps: split the PDF per page, detect pages with special formats,
        render those pages as images (optional), merge/enhance all chunks,
        and attach image references to chunks whose page was rendered.
        """
        overall_start = time.time()
        print(f"\n=== Iniciando procesamiento optimizado de PDF: {path.name} ===")
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        print("\n1. Creando chunks del PDF...")
        chunks = self._create_optimized_chunks(
            str(path), self.chunk_size, self.chunk_overlap
        )
        print(f" Total chunks creados: {len(chunks)}")

        pages_to_extract = set()
        if self.extract_images:
            print("\n2. Detectando formatos especiales...")
            format_results = self.detect_special_format_batch(chunks)
            for i, has_special_format in format_results.items():
                if has_special_format:
                    # Pages are numbered from 1, so a plain truthiness check
                    # only filters out chunks without page metadata.
                    page_number = chunks[i].metadata.get("page")
                    if page_number:
                        pages_to_extract.add(page_number)
            print(f" Páginas con formato especial: {sorted(pages_to_extract)}")

        if self.extract_images and pages_to_extract:
            print(f"\n3. Extrayendo {len(pages_to_extract)} páginas como imágenes...")
            self._extract_pages_parallel(str(path), self.output_dir, pages_to_extract)

        print("\n4. Procesando y optimizando chunks...")
        processed_chunks = self.chunk_processor.process_chunks_batch(
            chunks, self.merge_related
        )

        if self.extract_images:
            final_chunks = self._add_image_references(
                processed_chunks, pages_to_extract, str(path), self.output_dir
            )
        else:
            final_chunks = processed_chunks

        total_time = time.time() - overall_start
        print(f"\n=== Procesamiento completado en {total_time:.2f}s ===")

        # Convert from LangchainDocument to our Document TypedDict
        final_documents: List[Document] = [
            {"page_content": doc.page_content, "metadata": doc.metadata}
            for doc in final_chunks
        ]
        return final_documents

    def detect_special_format_batch(
        self, chunks: List[LangchainDocument]
    ) -> dict[int, bool]:
        """Return {chunk_index: has_special_format} for all chunks.

        Cached results are reused; uncached chunks are analyzed concurrently
        when an LLM client is available, sequentially otherwise.
        """
        results = {}
        chunks_to_process = []
        for i, chunk in enumerate(chunks):
            cache_key = hashlib.md5(chunk.page_content.encode()).hexdigest()[:16]
            if cache_key in self._format_cache:
                results[i] = self._format_cache[cache_key]
            else:
                chunks_to_process.append((i, chunk, cache_key))

        if not chunks_to_process:
            return results

        if self.client and len(chunks_to_process) > 1:
            with ThreadPoolExecutor(
                max_workers=min(self.max_workers, len(chunks_to_process))
            ) as executor:
                futures = {
                    executor.submit(self._detect_single_format, chunk): (i, cache_key)
                    for i, chunk, cache_key in chunks_to_process
                }
                for future in futures:
                    i, cache_key = futures[future]
                    try:
                        result = future.result()
                        results[i] = result
                        self._format_cache[cache_key] = result
                    except Exception as e:
                        # A failed detection defaults to "no special format".
                        print(f"Error procesando chunk {i}: {e}")
                        results[i] = False
        else:
            for i, chunk, cache_key in chunks_to_process:
                result = self._detect_single_format(chunk)
                results[i] = result
                self._format_cache[cache_key] = result
        return results

    def _detect_single_format(self, chunk: LangchainDocument) -> bool:
        """Detect tables/diagrams in one chunk (LLM when available, else heuristics)."""
        if not self.client:
            # Heuristic: box-drawing/table characters or heavy column whitespace.
            content = chunk.page_content
            table_indicators = ["│", "├", "┼", "┤", "┬", "┴", "|", "+", "-"]
            has_table_chars = any(char in content for char in table_indicators)
            has_multiple_columns = content.count("\t") > 10 or content.count("  ") > 20
            return has_table_chars or has_multiple_columns
        try:
            # Only the first 1000 characters are shown to the model.
            prompt = f"""¿Contiene este texto tablas estructuradas, diagramas ASCII, o elementos que requieren formato especial?

Responde SOLO 'SI' o 'NO'.

Texto:
{chunk.page_content[:1000]}"""
            response = self.client.generate(self.model, prompt).text
            return response.strip().upper() == "SI"
        except Exception as e:
            print(f"Error detectando formato: {e}")
            return False

    def _create_optimized_chunks(
        self, pdf_path: str, chunk_size: int, chunk_overlap: int
    ) -> List[LangchainDocument]:
        """Split each PDF page into token-sized chunks with page metadata."""
        pdf = PdfReader(pdf_path)
        chunks = []
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=self.token_manager.count_tokens,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if text.strip():
                page_chunks = text_splitter.create_documents(
                    [text],
                    metadatas=[
                        {
                            "page": page_num,
                            "file_name": os.path.basename(pdf_path),
                        }
                    ],
                )
                chunks.extend(page_chunks)
        return chunks

    def _extract_pages_parallel(self, pdf_path: str, output_dir: str, pages: set):
        """Render the given PDF pages to PNG files concurrently.

        Callers must pass a non-empty *pages* set (ThreadPoolExecutor rejects
        max_workers=0).
        """

        def extract_single_page(page_number):
            try:
                pdf_filename = os.path.basename(pdf_path)
                # BUG FIX: the file name must match the one probed by
                # _add_image_references ("page_{n}_{file}.png"). The previous
                # "{n}_{file}.png" name meant the existence check there always
                # failed, so image references were never attached.
                image_path = os.path.join(
                    output_dir, f"page_{page_number}_{pdf_filename}.png"
                )
                images = convert_from_path(
                    pdf_path,
                    first_page=page_number,
                    last_page=page_number,
                    dpi=150,
                    thread_count=1,
                    grayscale=False,
                )
                if images:
                    images[0].save(image_path, "PNG", optimize=True)
            except Exception as e:
                print(f" Error extrayendo página {page_number}: {e}")

        with ThreadPoolExecutor(
            max_workers=min(self.max_workers, len(pages))
        ) as executor:
            futures = [executor.submit(extract_single_page, page) for page in pages]
            for future in futures:
                future.result()  # Wait for completion (errors are caught inside the worker)

    def _add_image_references(
        self,
        chunks: List[LangchainDocument],
        pages_to_extract: set,
        pdf_path: str,
        output_dir: str,
    ) -> List[LangchainDocument]:
        """Prepend an image reference (and metadata) to chunks whose page was rendered."""
        pdf_filename = os.path.basename(pdf_path)
        for chunk in chunks:
            page_number = chunk.metadata.get("page")
            if page_number in pages_to_extract:
                image_path = os.path.join(
                    output_dir, f"page_{page_number}_{pdf_filename}.png"
                )
                if os.path.exists(image_path):
                    image_reference = (
                        f"\n[IMAGEN DISPONIBLE - Página {page_number}: {image_path}]\n"
                    )
                    chunk.page_content = image_reference + chunk.page_content
                    chunk.metadata["has_image"] = True
                    chunk.metadata["image_path"] = image_path
        return chunks
|
||||
|
||||
|
||||
app = typer.Typer()


@app.command()
def main(
    pdf_path: Annotated[str, typer.Argument(help="Ruta al archivo PDF")],
    output_dir: Annotated[
        str, typer.Argument(help="Directorio de salida para imágenes y chunks")
    ],
    model: Annotated[
        str, typer.Option(help="Modelo a usar para el procesamiento")
    ] = "gemini-2.0-flash",
    max_tokens: Annotated[
        int, typer.Option(help="Límite máximo de tokens por chunk")
    ] = 950,
    target_tokens: Annotated[
        int, typer.Option(help="Tokens objetivo para optimización")
    ] = 800,
    chunk_size: Annotated[int, typer.Option(help="Tamaño base de chunks")] = 1000,
    chunk_overlap: Annotated[int, typer.Option(help="Solapamiento entre chunks")] = 200,
    merge_related: Annotated[
        bool, typer.Option(help="Si unir chunks relacionados")
    ] = True,
    custom_instructions: Annotated[
        str, typer.Option(help="Instrucciones adicionales para optimización")
    ] = "",
    extract_images: Annotated[
        bool,
        typer.Option(help="Si True, extrae páginas con formato especial como imágenes"),
    ] = True,
):
    """
    CLI entry point: process a PDF with full token-budget control.
    """
    # Settings here comes from rag_eval.config (not this package's config module).
    settings = Settings()
    llm = VertexAILLM(
        project=settings.project_id,
        location=settings.location,
    )

    chunker = LLMChunker(
        output_dir=output_dir,
        model=model,
        max_tokens=max_tokens,
        target_tokens=target_tokens,
        gemini_client=llm,
        custom_instructions=custom_instructions,
        extract_images=extract_images,
        max_workers=4,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        merge_related=merge_related,
    )

    documents = chunker.process_path(Path(pdf_path))
    print(f"Processed {len(documents)} documents.")

    # Persist the chunks as one JSON object per line (JSONL).
    output_file_path = os.path.join(output_dir, "chunked_documents.jsonl")
    with open(output_file_path, "w", encoding="utf-8") as f:
        for doc in documents:
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")

    print(f"Saved {len(documents)} documents to {output_file_path}")


if __name__ == "__main__":
    app()
|
||||
80
src/knowledge_pipeline/chunker/recursive_chunker.py
Normal file
80
src/knowledge_pipeline/chunker/recursive_chunker.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Annotated, List
|
||||
|
||||
import chonkie
|
||||
import typer
|
||||
|
||||
from .base_chunker import BaseChunker, Document
|
||||
|
||||
|
||||
class RecursiveChunker(BaseChunker):
    """A chunker that delegates splitting to chonkie's RecursiveChunker."""

    def __init__(self) -> None:
        """Create the underlying chonkie chunker."""
        self.processor = chonkie.RecursiveChunker()

    def process_text(self, text: str) -> List[Document]:
        """
        Processes a string of text into a list of Document chunks.

        Args:
            text: The input string to process.

        Returns:
            A list of Document objects, one per chonkie chunk, each carrying
            its positional index in the metadata.
        """
        raw_chunks = self.processor(text)
        return [
            {"page_content": piece.text, "metadata": {"chunk_index": idx}}
            for idx, piece in enumerate(raw_chunks)
        ]
|
||||
|
||||
|
||||
app = typer.Typer()


@app.command()
def main(
    input_file_path: Annotated[
        str, typer.Argument(help="Path to the input text file.")
    ],
    output_dir: Annotated[
        str, typer.Argument(help="Directory to save the output file.")
    ],
):
    """
    Processes a text file using RecursiveChunker and saves the output to a JSONL file.
    """
    print(f"Starting to process {input_file_path}...")

    # Chunk the file via the inherited path-based entry point.
    documents = RecursiveChunker().process_path(Path(input_file_path))
    print(f"Successfully created {len(documents)} chunks.")

    # Make sure the destination directory exists before writing.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    output_file_path = os.path.join(output_dir, "chunked_documents.jsonl")
    source_name = os.path.basename(input_file_path)

    # One JSON object per line, tagged with the originating file name.
    with open(output_file_path, "w", encoding="utf-8") as f:
        for doc in documents:
            doc["metadata"]["source_file"] = source_name
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")

    print(f"Successfully saved {len(documents)} chunks to {output_file_path}")


if __name__ == "__main__":
    app()
|
||||
20
src/knowledge_pipeline/cli.py
Normal file
20
src/knowledge_pipeline/cli.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import logging
|
||||
|
||||
import typer
|
||||
|
||||
from .config import Settings
|
||||
from .pipeline import run_pipeline
|
||||
|
||||
app = typer.Typer()


@app.command()
def run_ingestion():
    """CLI entry point: load settings, set the log level, and run the pipeline."""
    settings = Settings.model_validate({})
    pipeline_logger = logging.getLogger("knowledge_pipeline")
    pipeline_logger.setLevel(getattr(logging, settings.log_level))
    run_pipeline(settings)


if __name__ == "__main__":
    app()
|
||||
101
src/knowledge_pipeline/config.py
Normal file
101
src/knowledge_pipeline/config.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import os
|
||||
from functools import cached_property
|
||||
|
||||
from google.cloud.aiplatform.matching_engine.matching_engine_index_config import (
|
||||
DistanceMeasureType,
|
||||
)
|
||||
from pydantic_settings import (
|
||||
BaseSettings,
|
||||
PydanticBaseSettingsSource,
|
||||
SettingsConfigDict,
|
||||
YamlConfigSettingsSource,
|
||||
)
|
||||
|
||||
CONFIG_FILE_PATH = os.getenv("CONFIG_YAML", "config.yaml")
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Pipeline configuration loaded from environment variables and YAML.

    Environment variables take precedence over the YAML file (see
    settings_customise_sources). Constructing an instance also initializes
    the Vertex AI SDK as a side effect (model_post_init).
    """

    # GCP project/region, used for aiplatform.init and the clients below.
    project_id: str
    location: str
    log_level: str = "INFO"

    # Vertex AI embedding model name; combined with the "google-vertex:"
    # prefix in the `embedder` property.
    agent_embedding_model: str

    index_name: str
    index_dimensions: int
    index_machine_type: str = "e2-standard-16"
    # GCS locations for source documents and generated index artifacts.
    index_origin: str
    index_destination: str
    # Maximum chunk size, forwarded to ContextualChunker.
    index_chunk_limit: int
    index_distance_measure_type: DistanceMeasureType = (
        DistanceMeasureType.DOT_PRODUCT_DISTANCE
    )
    index_approximate_neighbors_count: int = 150
    index_leaf_node_embedding_count: int = 1000
    index_leaf_nodes_to_search_percent: int = 10
    index_public_endpoint_enabled: bool = True

    model_config = SettingsConfigDict(yaml_file=CONFIG_FILE_PATH)

    def model_post_init(self, _):
        # Side effect: configure the global aiplatform client as soon as
        # settings exist. Import deferred to keep module import cheap.
        from google.cloud import aiplatform

        aiplatform.init(project=self.project_id, location=self.location)

    @property
    def index_deployment(self) -> str:
        # Deployed-index IDs may not contain hyphens.
        return self.index_name.replace("-", "_") + "_deployed"

    @property
    def index_data(self) -> str:
        # NOTE(review): plain concatenation — assumes index_destination ends
        # with a path separator (e.g. "gs://bucket/prefix/"); confirm config.
        return self.index_destination + self.index_name

    @property
    def index_contents_dir(self) -> str:
        # Subdirectory for raw chunk contents.
        return f"{self.index_data}/contents"

    @property
    def index_vectors_dir(self) -> str:
        # Subdirectory for embedding vectors.
        return f"{self.index_data}/vectors"

    @property
    def index_vectors_jsonl_path(self) -> str:
        # NOTE(review): property name says "jsonl" but the file is
        # "vectors.json" — confirm which extension consumers expect.
        return f"{self.index_vectors_dir}/vectors.json"

    @cached_property
    def gcs_client(self):
        # Lazily constructed, cached GCS client.
        from google.cloud import storage

        return storage.Client()

    @cached_property
    def converter(self):
        # Lazily constructed document-to-markdown converter.
        from markitdown import MarkItDown

        return MarkItDown(enable_plugins=False)

    @cached_property
    def embedder(self):
        # Lazily constructed embedder bound to the configured Vertex model.
        from pydantic_ai import Embedder

        return Embedder(f"google-vertex:{self.agent_embedding_model}")

    @cached_property
    def chunker(self):
        # Lazily constructed chunker sized by the index chunk limit.
        from .chunker.contextual_chunker import ContextualChunker

        return ContextualChunker(max_chunk_size=self.index_chunk_limit)

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        # Only env vars and the YAML file are consulted (env wins);
        # init kwargs, .env files, and file secrets are ignored.
        return (
            env_settings,
            YamlConfigSettingsSource(settings_cls),
        )
|
||||
209
src/knowledge_pipeline/pipeline.py
Normal file
209
src/knowledge_pipeline/pipeline.py
Normal file
@@ -0,0 +1,209 @@
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
import unicodedata
|
||||
from collections.abc import Sequence
|
||||
from pathlib import Path
|
||||
|
||||
from google.cloud import aiplatform
|
||||
from google.cloud.aiplatform.matching_engine.matching_engine_index_config import (
|
||||
DistanceMeasureType,
|
||||
)
|
||||
from google.cloud.storage import Client as StorageClient
|
||||
from markitdown import MarkItDown
|
||||
from pydantic_ai import Embedder
|
||||
|
||||
from .chunker.base_chunker import BaseChunker, Document
|
||||
from .config import Settings
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _parse_gcs_uri(uri: str) -> tuple[str, str]:
|
||||
"""Parse a 'gs://bucket/path' URI into (bucket_name, object_path)."""
|
||||
bucket, _, path = uri.removeprefix("gs://").partition("/")
|
||||
return bucket, path
|
||||
|
||||
|
||||
def normalize_string(s: str) -> str:
    """Normalizes a string to be a valid filename.

    Decomposes accented characters and drops their combining marks, lowercases,
    collapses whitespace runs into underscores, and strips anything outside
    [a-z0-9_.-].
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_only = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    slug = re.sub(r"\s+", "_", ascii_only.lower())
    return re.sub(r"[^a-z0-9_.-]", "", slug)
|
||||
|
||||
|
||||
def gather_pdfs(index_origin: str, gcs_client: StorageClient) -> list[str]:
    """Lists all PDF file URIs in a GCS directory."""
    bucket_name, prefix = _parse_gcs_uri(index_origin)
    uris: list[str] = []
    for blob in gcs_client.bucket(bucket_name).list_blobs(prefix=prefix):
        # Keep only PDFs; other object types under the prefix are ignored.
        if blob.name.endswith(".pdf"):
            uris.append(f"gs://{bucket_name}/{blob.name}")
    log.info("Found %d PDF files in %s", len(uris), index_origin)
    return uris
|
||||
|
||||
|
||||
def split_into_chunks(text: str, file_id: str, chunker: BaseChunker) -> list[Document]:
    """Splits text into chunks, or returns a single chunk if small enough.

    A text that already fits within the chunker's limit becomes one Document
    whose metadata id is ``file_id``; otherwise the chunker's output is keyed
    as ``{file_id}_{i}`` in order.
    """
    if len(text) > chunker.max_chunk_size:
        pieces = chunker.process_text(text)
        for position, piece in enumerate(pieces):
            piece["metadata"]["id"] = f"{file_id}_{position}"
        return pieces
    return [{"page_content": text, "metadata": {"id": file_id}}]
|
||||
|
||||
|
||||
def upload_to_gcs(
    chunks: list[Document],
    vectors: list[dict],
    index_contents_dir: str,
    index_vectors_jsonl_path: str,
    gcs_client: StorageClient,
) -> None:
    """Uploads chunk contents and vectors to GCS.

    Each chunk's markdown is written to ``<index_contents_dir>/<id>.md`` and
    all vector records are written as one newline-delimited JSON object at
    ``index_vectors_jsonl_path``.

    Args:
        chunks: Processed document chunks; each must carry metadata["id"].
        vectors: Vector records aligned with the chunks.
        index_contents_dir: GCS URI of the directory for chunk contents.
        index_vectors_jsonl_path: GCS URI of the vectors NDJSON object.
        gcs_client: Authenticated Cloud Storage client.
    """
    contents_bucket_name, contents_prefix = _parse_gcs_uri(index_contents_dir)
    contents_bucket = gcs_client.bucket(contents_bucket_name)
    for chunk in chunks:
        chunk_id = chunk["metadata"]["id"]
        contents_bucket.blob(f"{contents_prefix}/{chunk_id}.md").upload_from_string(
            chunk["page_content"], content_type="text/markdown; charset=utf-8"
        )

    # One JSON record per line. Building with per-record trailing newlines
    # (instead of join(...) + "\n") means an empty vector list yields an empty
    # object rather than a single blank line, which is not a valid record.
    vectors_jsonl = "".join(f"{json.dumps(v)}\n" for v in vectors)
    vectors_bucket_name, vectors_obj_path = _parse_gcs_uri(index_vectors_jsonl_path)
    gcs_client.bucket(vectors_bucket_name).blob(vectors_obj_path).upload_from_string(
        vectors_jsonl, content_type="application/x-ndjson; charset=utf-8"
    )
    log.info("Uploaded %d chunks and %d vectors to GCS", len(chunks), len(vectors))
|
||||
|
||||
|
||||
def build_vectors(
    chunks: list[Document],
    embeddings: Sequence[Sequence[float]],
    source_folder: str,
) -> list[dict]:
    """Builds vector records from chunks and their embeddings.

    The record's "source" restrict namespace carries the first path component
    of ``source_folder`` (empty string when no folder is given).
    """
    if source_folder:
        source = Path(source_folder).parts[0]
    else:
        source = ""

    records: list[dict] = []
    for chunk, embedding in zip(chunks, embeddings):
        records.append(
            {
                "id": chunk["metadata"]["id"],
                "embedding": list(embedding),
                "restricts": [{"namespace": "source", "allow": [source]}],
            }
        )
    return records
|
||||
|
||||
|
||||
def create_vector_index(
    index_name: str,
    index_vectors_dir: str,
    index_dimensions: int,
    index_distance_measure_type: DistanceMeasureType,
    index_deployment: str,
    index_machine_type: str,
    approximate_neighbors_count: int,
    leaf_node_embedding_count: int,
    leaf_nodes_to_search_percent: int,
    public_endpoint_enabled: bool,
):
    """Creates and deploys a Vertex AI Vector Search Index.

    Builds a tree-AH index from the vector files under ``index_vectors_dir``,
    then creates a fresh endpoint and deploys the index onto it. Both SDK
    calls block until the corresponding long-running operations finish.
    """
    log.info("Creating index '%s'...", index_name)
    new_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
        display_name=index_name,
        contents_delta_uri=index_vectors_dir,
        dimensions=index_dimensions,
        distance_measure_type=index_distance_measure_type,
        approximate_neighbors_count=approximate_neighbors_count,
        leaf_node_embedding_count=leaf_node_embedding_count,
        leaf_nodes_to_search_percent=leaf_nodes_to_search_percent,
    )
    log.info("Index '%s' created successfully.", index_name)

    log.info("Deploying index to a new endpoint...")
    index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
        display_name=f"{index_name}-endpoint",
        public_endpoint_enabled=public_endpoint_enabled,
    )
    index_endpoint.deploy_index(
        index=new_index,
        deployed_index_id=index_deployment,
        machine_type=index_machine_type,
    )
    log.info("Index deployed: %s", index_endpoint.display_name)
|
||||
|
||||
|
||||
def process_file(
    file_uri: str,
    temp_dir: Path,
    gcs_client: StorageClient,
    converter: MarkItDown,
    embedder: Embedder,
    chunker: BaseChunker,
) -> tuple[list[Document], list[dict]]:
    """Downloads a PDF from GCS, converts to markdown, chunks, and embeds.

    Returns:
        A (chunks, vectors) pair for the given file. The locally downloaded
        copy is always removed before returning.
    """
    bucket_name, object_path = _parse_gcs_uri(file_uri)
    download_target = temp_dir / Path(file_uri).name
    gcs_client.bucket(bucket_name).blob(object_path).download_to_filename(
        download_target
    )

    try:
        # Filename stem (normalized) becomes the chunk-id prefix; the parent
        # folder inside the bucket identifies the source for index restricts.
        file_id = normalize_string(Path(file_uri).stem)
        source_folder = Path(object_path).parent.as_posix()
        markdown_text = converter.convert(download_target).text_content

        chunks = split_into_chunks(markdown_text, file_id, chunker)
        chunk_texts = [chunk["page_content"] for chunk in chunks]
        embeddings = embedder.embed_documents_sync(chunk_texts).embeddings

        return chunks, build_vectors(chunks, embeddings, source_folder)
    finally:
        # Clean up the temp copy even when conversion/embedding fails.
        if download_target.exists():
            download_target.unlink()
|
||||
|
||||
|
||||
def run_pipeline(settings: Settings):
    """Runs the full ingestion pipeline: gather → process → aggregate → index.

    Every PDF under ``settings.index_origin`` is converted, chunked, and
    embedded; the aggregated chunks/vectors are uploaded to GCS; finally a
    Vector Search index is created and deployed from the uploaded vectors.
    """
    pdf_uris = gather_pdfs(settings.index_origin, settings.gcs_client)

    collected_chunks: list[Document] = []
    collected_vectors: list[dict] = []

    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_path = Path(scratch_dir)
        for uri in pdf_uris:
            log.info("Processing file: %s", uri)
            file_chunks, file_vectors = process_file(
                uri,
                scratch_path,
                settings.gcs_client,
                settings.converter,
                settings.embedder,
                settings.chunker,
            )
            collected_chunks.extend(file_chunks)
            collected_vectors.extend(file_vectors)

    upload_to_gcs(
        collected_chunks,
        collected_vectors,
        settings.index_contents_dir,
        settings.index_vectors_jsonl_path,
        settings.gcs_client,
    )

    create_vector_index(
        settings.index_name,
        settings.index_vectors_dir,
        settings.index_dimensions,
        settings.index_distance_measure_type,
        settings.index_deployment,
        settings.index_machine_type,
        settings.index_approximate_neighbors_count,
        settings.index_leaf_node_embedding_count,
        settings.index_leaf_nodes_to_search_percent,
        settings.index_public_endpoint_enabled,
    )
|
||||
Reference in New Issue
Block a user