ic

2025-10-13 18:16:25 +00:00
parent 739f087cef
commit 325f1ef439
415 changed files with 46870 additions and 0 deletions
--- a/notebooks/chunk_with_llm/README.md
+++ b/notebooks/chunk_with_llm/README.md
--- a/notebooks/chunk_with_llm/mise.toml
+++ b/notebooks/chunk_with_llm/mise.toml
@@ -0,0 +1,9 @@
+[tools]
+azure-functions-core-tools = "latest"
+
+[tasks.edit]
+description = "Run 'chunk_with_llm' notebook in editable mode."
+run = "uv run marimo edit notebook.py"
+
+[tasks.worker]
+run = "uv run taskiq worker broker:broker"
--- a/notebooks/chunk_with_llm/notebook.py
+++ b/notebooks/chunk_with_llm/notebook.py
@@ -0,0 +1,707 @@
+import marimo
+
+__generated_with = "0.13.15"
+app = marimo.App(width="medium")
+
+with app.setup:
+    import hashlib
+    import json
+    import logging
+    import textwrap
+    import time
+    from pathlib import Path
+
+    from langchain.text_splitter import RecursiveCharacterTextSplitter
+    from langchain_core.documents import Document
+    from pdf2image import convert_from_path
+    from pypdf import PdfReader
+    from qdrant_client.models import Distance, PointStruct, VectorParams
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    import numpy as np
+
+    from banortegpt.embedding.azure_ada import Ada
+    from banortegpt.generation.vertex_ai_gemini import Gemini
+    from banortegpt.vector.qdrant import Qdrant
+
+    logger = logging.getLogger(__name__)
+
+    def load_prompt(prompt_file: str) -> str:
+        prompt_dir = Path("prompts/")
+        return (prompt_dir / prompt_file).read_text()
+
+    class TempFile:
+        temp_dir = Path("temp_dir/")
+
+        def __init__(self, name: str, contents: bytes):
+            self.name = name
+            self.contents = contents
+
+        def __enter__(self):
+            self.file = self.temp_dir / self.name
+            self.file.write_bytes(self.contents)
+            return self.file
+
+        def __exit__(self, exc_type, exc_val, exc_tb):
+            self.file.unlink()
+
+    def id_from_json(json_data: dict) -> int:
+        json_str = json.dumps(json_data, sort_keys=True)
+        hash_obj = hashlib.sha256(json_str.encode("utf-8"))
+        return abs(int.from_bytes(hash_obj.digest(), byteorder="big"))
+
+
+@app.class_definition(hide_code=True)
+class PDFPageExtractor:
+    detect_special_format_prompt = load_prompt("detect_special_format_prompt.md")
+
+    def __init__(self, gemini_client: Gemini):
+        self.client = gemini_client
+        self._cache = {}  # Cache para resultados de detección
+
+    def detect_special_format(self, chunk: Document) -> bool:
+        """
+        Detecta si un chunk contiene tablas o formatos especiales.
+        Usa caché para evitar llamadas API repetidas.
+        """
+        # Usar un hash simple del contenido como clave de caché
+        cache_key = hash(chunk.page_content)
+        if cache_key in self._cache:
+            return self._cache[cache_key]
+
+        start_time = time.time()
+        try:
+            prompt = self.detect_special_format_prompt.format(chunk.page_content)
+
+            response = self.client.generate(prompt).text
+            result = response.strip().upper() == "SI"
+            self._cache[cache_key] = result
+            logger.info(f"Tiempo de análisis de chunk: {time.time() - start_time:.2f}s")
+            return result
+        except Exception as e:
+            logger.error(f"Error detectando formato especial: {e}")
+            return False
+
+    def _create_chunks_from_pdf(
+        self, pdf_path: Path, chunk_size: int = 1000, chunk_overlap: int = 200
+    ) -> list[Document]:
+        """
+        Crea chunks a partir de un PDF manteniendo la información de la página original.
+        """
+        start_time = time.time()
+        logger.info(f"Iniciando lectura del PDF: {pdf_path}")
+
+        pdf = PdfReader(pdf_path)
+        total_pages = len(pdf.pages)
+        logger.info(f"Total de páginas en el PDF: {total_pages}")
+
+        chunks = []
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            length_function=len,
+            separators=["\n\n", "\n", " ", ""],
+        )
+
+        for page_num in range(total_pages):
+            page_start = time.time()
+            logger.info(f"Procesando página {page_num + 1}/{total_pages}...")
+
+            page = pdf.pages[page_num]
+            text = page.extract_text()
+
+            if text.strip():
+                page_chunks = text_splitter.create_documents(
+                    [text],
+                    metadatas=[{"page": page_num + 1, "file_name": pdf_path.name}],
+                )
+                chunks.extend(page_chunks)
+                logger.info(
+                    f"  - Chunks creados para página {page_num + 1}: {len(page_chunks)}"
+                )
+            else:
+                logger.info(f"  - Página {page_num + 1} está vacía o no contiene texto")
+
+            logger.info(
+                f"  - Tiempo de procesamiento página {page_num + 1}: {time.time() - page_start:.2f}s"
+            )
+
+        logger.info(
+            f"Tiempo total de procesamiento PDF: {time.time() - start_time:.2f}s"
+        )
+        logger.info(f"Total de chunks creados: {len(chunks)}")
+
+        return chunks
+
+    def process_pdf(
+        self,
+        pdf_path: Path,
+        output_dir: Path,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+    ) -> list[Document]:
+        """
+        Procesa un PDF completo, detectando formatos especiales y extrayendo páginas.
+        """
+        overall_start = time.time()
+        logger.info("\n=== Iniciando procesamiento de PDF ===")
+
+        if not output_dir.exists():
+            output_dir.mkdir()
+            logger.info(f"Directorio de salida creado: {output_dir}")
+
+        # Crear chunks del PDF
+        logger.info("\n1. Creando chunks del PDF...")
+        chunks_start = time.time()
+        chunks = self._create_chunks_from_pdf(pdf_path, chunk_size, chunk_overlap)
+        logger.info(f"Chunks creados en {time.time() - chunks_start:.2f}s")
+
+        processed_chunks = []
+        pages_to_extract = set()
+
+        # Identificar páginas con formatos especiales
+        logger.info("\n2. Analizando chunks para detectar formatos especiales...")
+        analysis_start = time.time()
+        for i, chunk in enumerate(chunks, 1):
+            logger.info(f"\nAnalizando chunk {i}/{len(chunks)}")
+            if self.detect_special_format(chunk):
+                page_number = chunk.metadata.get("page")
+                if page_number not in pages_to_extract:
+                    pages_to_extract.add(page_number)
+                    logger.info(
+                        f"  - Formato especial detectado en página {page_number}"
+                    )
+
+        logger.info(f"Análisis completado en {time.time() - analysis_start:.2f}s")
+        logger.info(f"Páginas a extraer: {sorted(pages_to_extract)}")
+
+        # Extraer páginas con formatos especiales
+        if pages_to_extract:
+            logger.info("\n3. Extrayendo páginas como imágenes...")
+            extraction_start = time.time()
+
+            for page_number in sorted(pages_to_extract):
+                page_start = time.time()
+                logger.info(f"\nProcesando página {page_number}...")
+                pdf_filename = pdf_path.name
+                image_path = output_dir / f"{page_number}_{pdf_filename}.png"
+
+                try:
+                    images = convert_from_path(
+                        pdf_path,
+                        first_page=page_number,
+                        last_page=page_number,
+                        dpi=150,
+                        thread_count=4,
+                        grayscale=False,
+                    )
+
+                    if images:
+                        images[0].save(image_path, "PNG", optimize=True)
+                        logger.info(f"  - Imagen guardada: {image_path}")
+                        logger.info(
+                            f"  - Tiempo de extracción: {time.time() - page_start:.2f}s"
+                        )
+                except Exception as e:
+                    logger.error(f"  - Error extrayendo página {page_number}: {e}")
+
+            logger.info(
+                f"Extracción de imágenes completada en {time.time() - extraction_start:.2f}s"
+            )
+
+        # Procesar chunks y agregar referencias a imágenes
+        logger.info("\n4. Procesando chunks finales...")
+        for chunk in chunks:
+            page_number = chunk.metadata.get("page")
+            if page_number in pages_to_extract:
+                pdf_filename = pdf_path.name
+                image_path = output_dir / f"{page_number}_{pdf_filename}.png"
+                if image_path.exists():
+                    image_reference = f"\n[Ver página {page_number} completa en imagen: {image_path}]\n"
+                    chunk.page_content = image_reference + chunk.page_content
+            processed_chunks.append(chunk)
+
+        total_time = time.time() - overall_start
+        logger.info(f"\n=== Procesamiento completado en {total_time:.2f}s ===")
+        logger.info(f"Total de chunks procesados: {len(processed_chunks)}")
+        logger.info(f"Total de páginas extraídas como imagen: {len(pages_to_extract)}")
+
+        return processed_chunks
+
+
+@app.class_definition(hide_code=True)
+class ChunkProcessor:
+    should_merge_prompt = load_prompt("should_merge_prompt.md")
+    enhance_chunk_prompt = load_prompt("enhance_chunk_prompt.md")
+    MAX_TOKENS = 750  # límite máximo de tokens
+
+    def __init__(self, gemini_client: Gemini, chunks_per_page: int = 5):
+        self.client = gemini_client
+        self.chunks_per_page = chunks_per_page
+
+    def should_merge_chunks(self, chunk1: str, chunk2: str) -> bool:
+        """
+        Determina si dos chunks deberían unirse basado en su contenido y longitud.
+        """
+        try:
+            combined_length = len(chunk1) + len(chunk2)
+            if combined_length > 3375:
+                return False
+
+            prompt = self.should_merge_prompt.format(chunk1, chunk2)
+
+            response = self.client.generate(prompt).text
+            return response.strip().upper() == "SI"
+        except Exception as e:
+            logger.error(f"Error analizando chunks: {e}")
+            return False
+
+    def enhance_chunk(self, chunk_text: str) -> str:
+        """Mejora un chunk individual manteniendo el límite de tokens."""
+        try:
+            prompt = self.enhance_chunk_prompt.format(chunk_text)
+
+            response = self.client.generate(prompt).text
+            enhanced_text = response.strip()
+
+            if len(enhanced_text) > 3375:
+                logger.warning(
+                    "Advertencia: Texto optimizado excede el límite de tokens"
+                )
+                truncated = enhanced_text[:3375].rsplit(".", 1)[0] + "."
+                return truncated
+
+            return enhanced_text
+        except Exception as e:
+            logger.error(f"Error procesando chunk: {e}")
+            return chunk_text
+
+    def process_chunks(
+        self, chunks: list[Document], merge_related: bool = False
+    ) -> list[Document]:
+        """
+        Procesa y opcionalmente une chunks relacionados.
+
+        Args:
+            chunks: Lista de chunks a procesar
+            merge_related: Si es True, intenta unir chunks relacionados
+
+        Returns:
+            List[Document]: Lista de chunks procesados
+        """
+        processed_chunks = []
+        i = 0
+
+        while i < len(chunks):
+            current_chunk = chunks[i]
+            merged_content = current_chunk.page_content
+
+            if merge_related and i < len(chunks) - 1:
+                while i < len(chunks) - 1 and self.should_merge_chunks(
+                    merged_content, chunks[i + 1].page_content
+                ):
+                    logger.info(f"\nUniendo chunks {i + 1} y {i + 2}...")
+                    merged_content += "\n\n" + chunks[i + 1].page_content
+                    i += 1
+
+            logger.info(f"\nProcesando chunk {i + 1}:")
+            logger.info(textwrap.fill(merged_content, width=80))
+            logger.info("\nMejorando contenido")
+
+            enhanced_content = self.enhance_chunk(merged_content)
+
+            processed_chunks.append(
+                Document(page_content=enhanced_content, metadata=current_chunk.metadata)
+            )
+
+            logger.info("\nContenido mejorado")
+            logger.info(textwrap.fill(enhanced_content, width=80))
+            logger.info("-" * 80)
+
+            i += 1
+
+            if i % self.chunks_per_page == 0 and i < len(chunks):
+                continue_processing = "s"  # input("\n¿Continuar con la siguiente página? (s/n): ").lower()
+                if continue_processing != "s":
+                    break
+
+        return processed_chunks
+
+
+@app.class_definition(hide_code=True)
+class Pipeline:
+    def __init__(self, *, ada: Ada, qdrant: Qdrant, gemini: Gemini):
+        self.ada = ada
+        self.qdrant = qdrant
+        self.gemini = gemini
+        self.extractor = PDFPageExtractor(gemini_client=gemini)
+        self.processor = ChunkProcessor(gemini_client=gemini)
+
+    def run(self, name: str, contents: bytes):
+        with TempFile(name=name, contents=contents) as pdf:
+            chunks = self.extractor.process_pdf(pdf, Path("output_images"))
+        merged_enhanced_chunks = self.processor.process_chunks(
+            chunks, merge_related=True
+        )
+        points = self._build_points_from_chunks(merged_enhanced_chunks)
+        return points
+
+    def _build_points_from_chunks(self, chunks):
+        points = [
+            PointStruct(
+                id=id_from_json(document.metadata),
+                payload={
+                    "page_content": document.page_content,
+                    "metadata": document.metadata,
+                },
+                vector={self.ada.model: self.ada.embed(input=document.page_content)},
+            )
+            for document in chunks
+        ]
+        return points
+
+    def upload_points(self, points: list[PointStruct]):
+        self.qdrant.create_collection_if_not_exists(
+            vector_config={
+                self.ada.model: VectorParams(size=3072, distance=Distance.COSINE)
+            }
+        )
+
+        self.qdrant.upload_to_collection(points=points)
+
+    @classmethod
+    def from_vault(
+        cls, vault: str, *, collection: str, embedding_model: str, gemini_model: str
+    ):
+        return cls(
+            ada=Ada.from_vault(vault, model=embedding_model),
+            qdrant=Qdrant.from_vault(vault, collection=collection),
+            gemini=Gemini.from_vault(vault, model=gemini_model),
+        )
+
+
+@app.class_definition(hide_code=True)
+class ChunkDistGraph:
+    def __init__(
+        self,
+        points: list[dict],
+        campo_texto: str = "page_content",
+        titulo: str = "Distribución de Chunks por Longitud",
+    ) -> None:
+        self.points = points
+        self.campo_texto = campo_texto
+        self.title = titulo
+
+    def show(self):
+        longitudes = self._obtener_longitudes()
+        plot = self._visualizar_distribucion_chunks(longitudes)
+        return plot.gcf()
+
+    def _obtener_longitudes(self) -> list[int]:
+        """
+        Obtiene la longitud de todos los chunks de texto en una lista de puntos.
+        """
+        longitudes = []
+
+        for point in self.points:
+            texto = point.payload[self.campo_texto]
+            longitudes.append(len(str(texto)))
+
+        return longitudes
+
+    def _visualizar_distribucion_chunks(self, longitudes: list[int]):
+        """
+        Crea una visualización de la distribución de chunks según su longitud.
+        """
+        plt.figure(figsize=(15, 6))
+
+        n_bins = int(np.log2(len(longitudes)) + 1)
+
+        n, bins, patches = plt.hist(
+            longitudes, bins=n_bins, color="skyblue", edgecolor="black", alpha=0.7
+        )
+
+        from scipy.stats import gaussian_kde
+
+        density = gaussian_kde(longitudes)
+        xs = np.linspace(min(longitudes), max(longitudes), 200)
+        plt.plot(
+            xs,
+            density(xs) * len(longitudes) * (bins[1] - bins[0]),
+            color="red",
+            linewidth=2,
+            label="Tendencia",
+        )
+
+        # Personalizar el gráfico
+        plt.title(self.title, fontsize=14, pad=20)
+        plt.xlabel("Cantidad de Caracteres", fontsize=12)
+        plt.ylabel("Cantidad de Chunks", fontsize=12)
+
+        media = np.mean(longitudes)
+        mediana = np.median(longitudes)
+        desv_std = np.std(longitudes)
+
+        stats_text = (
+            f"Estadísticas:\n"
+            f"• Media: {media:.1f} caracteres\n"
+            f"• Mediana: {mediana:.1f} caracteres\n"
+            f"• Desv. Estándar: {desv_std:.1f}\n"
+            f"• Total de chunks: {len(longitudes)}"
+        )
+
+        plt.text(
+            1.02,
+            0.95,
+            stats_text,
+            transform=plt.gca().transAxes,
+            bbox=dict(facecolor="white", alpha=0.8),
+            verticalalignment="top",
+        )
+
+        plt.tight_layout()
+
+        return plt
+
+
+@app.class_definition(hide_code=True)
+class ChunkDistGraph2:
+    def __init__(
+        self,
+        points: list[dict],
+        campo_texto: str = "page_content",
+        titulo: str = "Distribución de longitud de chunks",
+    ) -> None:
+        self.points = points
+        self.campo_texto = campo_texto
+        self.titulo = titulo
+
+    def show(self):
+        chunks_info = self._obtener_longitudes_chunks()
+
+        longitudes = [length for length, _, _, _ in chunks_info]
+
+        chunks_extremos = self._encontrar_chunks_extremos(chunks_info)
+
+        print("\nInformación de la colección:")
+        print(f"Número total de chunks: {len(longitudes)}")
+        print(f"Número de longitudes únicas: {len(set(longitudes))}")
+        if longitudes:
+            print(f"Rango de longitudes: {min(longitudes)} a {max(longitudes)}")
+
+        fig = self._visualizar_distribucion(longitudes, chunks_extremos)
+
+        return fig.gcf()
+
+    def _obtener_longitudes_chunks(self) -> list[int]:
+        """
+        Obtiene la longitud de todos los chunks de texto en una colección de Qdrant.
+        """
+        chunks_info = []
+        for point in self.points:  # Fixed: was using 'points' instead of 'self.points'
+            texto = point.payload[self.campo_texto]
+
+            chunks_info.append(
+                (
+                    len(str(texto)),
+                    str(texto)[:100],
+                    str(point.id),
+                    point.payload.get("metadata", {}).get("page", "N/A"),
+                )
+            )
+
+        return chunks_info
+
+    def _encontrar_chunks_extremos(
+        self, chunks_info: list[tuple[int, str, str, str]]
+    ) -> dict:
+        """
+        Encuentra los chunks más largo y más corto.
+        """
+        if not chunks_info:
+            return {}
+
+        chunk_mas_corto = min(chunks_info, key=lambda x: x[0])
+        chunk_mas_largo = max(chunks_info, key=lambda x: x[0])
+
+        return {
+            "mas_corto": {
+                "longitud": chunk_mas_corto[0],
+                "preview": chunk_mas_corto[1] + "..."
+                if len(chunk_mas_corto[1]) == 100
+                else chunk_mas_corto[1],
+                "id": chunk_mas_corto[2],
+                "page": chunk_mas_corto[3],
+            },
+            "mas_largo": {
+                "longitud": chunk_mas_largo[0],
+                "preview": chunk_mas_largo[1] + "..."
+                if len(chunk_mas_largo[1]) == 100
+                else chunk_mas_largo[1],
+                "id": chunk_mas_largo[2],
+                "page": chunk_mas_largo[3],
+            },
+        }
+
+    def _visualizar_distribucion(self, longitudes: list[int], chunks_extremos: dict):
+        """
+        Crea una visualización suavizada de la distribución de longitudes.
+        """
+        if not longitudes:
+            raise ValueError("No hay datos para visualizar")
+
+        longitudes = [float(x) for x in longitudes]
+
+        plt.figure(figsize=(15, 6))
+
+        n_bins = max(10, min(50, len(set(longitudes)) // 2))
+
+        if len(longitudes) < 2:
+            plt.text(
+                0.5,
+                0.5,
+                "Datos insuficientes para visualización",
+                ha="center",
+                va="center",
+            )
+            return plt.gcf()
+
+        counts, bins, _ = plt.hist(
+            longitudes,
+            bins=n_bins,
+            density=True,
+            alpha=0.6,
+            color="skyblue",
+            edgecolor="black",
+        )
+
+        bin_centers = (bins[:-1] + bins[1:]) / 2
+        window_size = 5
+        if len(counts) > window_size:
+            smoothed = np.convolve(
+                counts, np.ones(window_size) / window_size, mode="valid"
+            )
+            smoothed_x = bin_centers[window_size - 1 :]
+            plt.plot(smoothed_x, smoothed, color="blue", linewidth=2, alpha=0.8)
+
+        plt.title(self.titulo, fontsize=14, pad=5)  # Reduced pad from 20 to 5
+        plt.xlabel("Longitud del chunk (caracteres)", fontsize=12)
+        plt.ylabel("Densidad", fontsize=12)
+
+        media = np.mean(longitudes)
+        mediana = np.median(longitudes)
+        desv_std = np.std(longitudes)
+
+        info_text = (
+            f"Estadísticas:\n"
+            f"• Media: {media:.1f} caracteres\n"
+            f"• Mediana: {mediana:.1f} caracteres\n"
+            f"• Desv. Estándar: {desv_std:.1f}\n\n"
+            f"Chunks Extremos:\n\n"
+            f"• Más corto: {chunks_extremos['mas_corto']['longitud']} caracteres\n"
+            f"  ID para buscar en dashboard: \n"
+            f"  {chunks_extremos['mas_corto']['id']}\n"
+            f"  Página: {chunks_extremos['mas_corto'].get('page', 'N/A')}\n"
+            f"  Preview: {chunks_extremos['mas_corto']['preview']}\n\n"
+            f"• Más largo: {chunks_extremos['mas_largo']['longitud']} caracteres\n"
+            f"  ID para buscar en dashboard: \n"
+            f"  {chunks_extremos['mas_largo']['id']}\n"
+            f"  Página: {chunks_extremos['mas_largo'].get('page', 'N/A')}\n"
+            f"  Preview: {chunks_extremos['mas_largo']['preview']}"
+        )
+
+        plt.figtext(
+            1.02,
+            0.5,
+            info_text,
+            fontsize=10,
+            bbox=dict(facecolor="white", alpha=0.8, edgecolor="none"),
+            wrap=True,
+        )
+
+        # Remove whitespace at the top by adjusting subplots
+        plt.subplots_adjust(top=0.92, bottom=0.1, left=0.08, right=0.75)
+
+        return plt
+
+
+@app.cell
+def _():
+    # Import marimo for the UI cells and show INFO-level pipeline logs.
+    import marimo as mo
+
+    logger.setLevel(logging.INFO)
+    return (mo,)
+
+
+@app.cell
+def _():
+    # Build the pipeline; its clients are created from the "banortegpt" vault.
+    pipeline = Pipeline.from_vault(
+        "banortegpt",
+        collection="MayaNormativa",
+        embedding_model="text-embedding-3-large",
+        gemini_model="gemini-1.5-flash",
+    )
+    return (pipeline,)
+
+
+@app.cell
+def _(mo):
+    # Upload form accepting one or more PDF files.
+    uploads = mo.ui.file(filetypes=[".pdf"], kind="area", multiple=True).form()
+    uploads
+    return (uploads,)
+
+
+@app.cell
+def _(mo, pipeline, uploads):
+    # Halt this cell while the upload form has no value.
+    mo.stop(uploads.value is None)
+
+    # Run the pipeline on every uploaded PDF and flatten the points of all
+    # files into a single list.
+    points = [
+        point
+        for upload in mo.status.progress_bar(uploads.value, remove_on_exit=True)
+        for point in pipeline.run(upload.name, upload.contents)
+    ]
+    return (points,)
+
+
+@app.cell
+def _(points):
+    # Show the chunk-length histogram for the generated points.
+    ChunkDistGraph(points).show()
+    return
+
+
+@app.cell
+def _():
+    # Alternative plot, currently disabled:
+    # ChunkDistGraph2(points).show()
+    return
+
+
+@app.cell
+def _(points):
+    # Show the point payloads as a table for inspection.
+    import polars as pl
+
+    pl.from_records([p.payload for p in points])
+    return
+
+
+@app.cell
+def _(mo):
+    # Button that triggers the upload cell.
+    upload_button = mo.ui.run_button(label="Upload to Qdrant", kind="success")
+    upload_button
+    return (upload_button,)
+
+
+@app.cell
+def _(mo, pipeline, points, upload_button):
+    # Upload only when the button's value is not False.
+    mo.stop(upload_button.value is False)
+
+    pipeline.upload_points(points)
+    return
+
+
+@app.cell
+def _():
+    # Empty placeholder cell.
+    return
+
+
+if __name__ == "__main__":
+    app.run()
--- a/notebooks/chunk_with_llm/prompts/detect_special_format_prompt.md
+++ b/notebooks/chunk_with_llm/prompts/detect_special_format_prompt.md
@@ -0,0 +1,26 @@
+Analiza este fragmento de texto y determina si contiene alguno de estos elementos:
+
+1. Tablas estructuradas:
+  - Columnas claramente definidas
+  - Múltiples filas de datos
+  - Formato tabular que requiere mantener el espaciado
+
+2. Elementos visuales o especiales:
+  - Diagramas o figuras en ASCII art
+  - Representaciones gráficas en texto
+  - Fórmulas o ecuaciones con formato especial
+  - Firmas o sellos digitales
+  - Elementos que requieren alineación específica
+
+NO consideres como elementos especiales:
+- Listas simples de elementos
+- Texto con sangrías o indentación normal
+- Párrafos con formato estándar
+- Referencias o citas regulares
+- Texto normal con espaciado simple
+
+Responde SOLO con 'SI' si detectas CLARAMENTE alguno de los elementos listados arriba,
+o 'NO' para texto normal sin elementos especiales.
+
+Texto a analizar:
+{}
--- a/notebooks/chunk_with_llm/prompts/enhance_chunk_prompt.md
+++ b/notebooks/chunk_with_llm/prompts/enhance_chunk_prompt.md
@@ -0,0 +1,12 @@
+Optimiza este texto manteniendo estas reglas ESTRICTAS:
+
+1. NO DEBE exceder 750 tokens (aprox. 3375 caracteres en español)
+2. Mantener TODA la información importante y metadatos
+3. NO cambiar palabras clave o términos técnicos
+4. Asegurar que cada oración sea completa y coherente
+5. Si el texto excede el límite, priorizar mantener oraciones completas
+
+OBJETIVO: Texto coherente y completo dentro del límite de tokens.
+
+Texto a optimizar:
+{}
--- a/notebooks/chunk_with_llm/prompts/should_merge_prompt.md
+++ b/notebooks/chunk_with_llm/prompts/should_merge_prompt.md
@@ -0,0 +1,18 @@
+Analiza estos dos fragmentos de texto y determina si deben unirse.
+
+IMPORTANTE: La longitud combinada NO debe exceder ~750 tokens (3375 caracteres).
+
+Criterios ESTRICTOS de unión:
+1. El primer fragmento termina a mitad de una oración/palabra
+2. El segundo fragmento es la continuación directa del primero
+3. La unión resultante debe ser coherente y no exceder 750 tokens
+
+Responde ÚNICAMENTE con:
+- 'SI': si cumple TODOS los criterios y la unión es NECESARIA
+- 'NO': en cualquier otro caso
+
+Texto 1:
+{}
+
+Texto 2:
+{}
--- a/notebooks/chunk_with_llm/pyproject.toml
+++ b/notebooks/chunk_with_llm/pyproject.toml
@@ -0,0 +1,31 @@
+[project]
+name = "chunk-with-llm"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "azure-ada",
+    "qdrant",
+    "vertex-ai-gemini",
+    "langchain>=0.3.25",
+    "langchain-experimental>=0.3.4",
+    "langchain-openai>=0.3.16",
+    "marimo>=0.13.10",
+    "openai>=1.72.0",
+    "pdf2image>=1.17.0",
+    "pypdf>=5.5.0",
+    "python-dotenv>=1.0.1",
+    "qdrant-client>=1.12.2",
+    "matplotlib>=3.10.3",
+    "seaborn>=0.13.2",
+    "scipy>=1.15.3",
+]
+
+[tool.uv.sources]
+azure-ada = { workspace = true }
+qdrant = { workspace = true }
+vertex-ai-gemini = { workspace = true }
+
+[dependency-groups]
+dev = ["polars>=1.29.0"]
--- a/notebooks/chunk_with_llm/uv.lock
+++ b/notebooks/chunk_with_llm/uv.lock
--- a/notebooks/search-evaluator/README.md
+++ b/notebooks/search-evaluator/README.md
--- a/notebooks/search-evaluator/main.py
+++ b/notebooks/search-evaluator/main.py
@@ -0,0 +1,189 @@
+import marimo
+
+__generated_with = "0.13.15"
+app = marimo.App(width="medium")
+
+with app.setup:
+    # Names defined in the setup block are used directly by the cells below.
+    import marimo as mo
+
+    from banortegpt.embedding.azure_ada import Ada
+    from banortegpt.vector.qdrant import Qdrant
+
+    # Both clients are created from the "banortegpt" vault.
+    ada = Ada.from_vault("banortegpt")
+    qdrant = Qdrant.from_vault("banortegpt")
+
+    # Collection names offered in the settings form.
+    collections = qdrant.list_collections()
+
+
+@app.cell
+def _():
+    import os
+
+    settings = (
+        mo.md(
+            """
+        Content Field: {campo_texto}\n
+        Embedding Model: {embedding_model}\n
+        Collection: {collection}\n
+        Score Threshold: {threshold}\n
+        Synthetic Questions: {synthetic_questions}
+        """
+        )
+        .batch(
+            campo_texto=mo.ui.text(value="page_content"),
+            embedding_model=mo.ui.text(value="text-embedding-3-large"),
+            collection=mo.ui.dropdown(collections, searchable=True),
+            threshold=mo.ui.number(value=0.5, step=0.1),
+            synthetic_questions=mo.ui.file(filetypes=[".json"]),
+        )
+        .form(bordered=True)
+    )
+
+    settings
+    return (settings,)
+
+
+@app.cell
+def _(settings):
+    import json
+
+    mo.stop(not settings.value)
+
+    stg = settings.value
+
+    EMBEDDING_MODEL = stg["embedding_model"]
+    COLLECTION = stg["collection"]
+    THRESHOLD = stg["threshold"]
+    QUESTIONS = json.loads(stg["synthetic_questions"][0].contents)
+
+    ada.model = EMBEDDING_MODEL
+    return COLLECTION, QUESTIONS, THRESHOLD
+
+
+@app.cell
+def _(COLLECTION, THRESHOLD):
+    import ranx
+
+    def create_qrels(questions):
+        qrels_dict = {}
+
+        for q in questions:
+            question = q["pregunta"]
+            source_ids = q["ids"]
+
+            qrels_dict[question] = {}
+            for id in source_ids:
+                qrels_dict[question][id] = 1
+
+        return ranx.Qrels(qrels_dict)
+
+    def create_run(questions):
+        run_dict = {}
+
+        for q in questions:
+            question = q["pregunta"]
+
+            embedding = ada.embed(question)
+
+            query_response = qdrant.client.query_points(
+                collection_name=COLLECTION,
+                query=embedding,
+                limit=100,
+                score_threshold=THRESHOLD,
+            )
+
+            run_dict[question] = {}
+            for point in query_response.points:
+                run_dict[question][point.id] = point.score
+
+        return ranx.Run(run_dict)
+
+    return create_qrels, create_run, ranx
+
+
+@app.cell
+def _(create_qrels, create_run, ranx):
+    def create_evals(questions, ks):
+        qrels = create_qrels(questions)
+        run = create_run(questions)
+
+        return [
+            ranx.evaluate(qrels, run, [f"precision@{k}", f"recall@{k}", f"ndcg@{k}"])
+            for k in ks
+        ]
+
+    return (create_evals,)
+
+
+@app.cell
+def _():
+    import matplotlib.pyplot as plt
+
+    def plot_retrieval_metrics(results):
+        # Extract k values and metrics
+        k_values = [int(list(result.keys())[0].split("@")[1]) for result in results]
+
+        # Prepare data for plotting
+        precision_values = [
+            list(result.values())[0]
+            for result in results
+            if "precision" in list(result.keys())[0]
+        ]
+        recall_values = [
+            list(result.values())[1]
+            for result in results
+            if "recall" in list(result.keys())[1]
+        ]
+        ndcg_values = [
+            list(result.values())[2]
+            for result in results
+            if "ndcg" in list(result.keys())[2]
+        ]
+
+        # Create a figure with three subplots
+        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
+
+        # Precision Plot
+        ax1.plot(k_values, precision_values, marker="o", linestyle="-", color="blue")
+        ax1.set_title("Precision @ K")
+        ax1.set_xlabel("Number of Retrieved Documents (K)")
+        ax1.set_ylabel("Precision")
+        ax1.set_xticks(k_values)
+
+        # Recall Plot
+        ax2.plot(k_values, recall_values, marker="o", linestyle="-", color="green")
+        ax2.set_title("Recall @ K")
+        ax2.set_xlabel("Number of Retrieved Documents (K)")
+        ax2.set_ylabel("Recall")
+        ax2.set_xticks(k_values)
+
+        # NDCG Plot
+        ax3.plot(k_values, ndcg_values, marker="o", linestyle="-", color="red")
+        ax3.set_title("NDCG @ K")
+        ax3.set_xlabel("Number of Retrieved Documents (K)")
+        ax3.set_ylabel("NDCG")
+        ax3.set_xticks(k_values)
+
+        # Add value labels
+        for ax, values in zip(
+            [ax1, ax2, ax3], [precision_values, recall_values, ndcg_values]
+        ):
+            for i, v in enumerate(values):
+                ax.text(k_values[i], v, f"{v:.2f}", ha="center", va="bottom")
+
+        plt.tight_layout()
+        return plt.gca()
+
+    return (plot_retrieval_metrics,)
+
+
+@app.cell
+def _(QUESTIONS, create_evals, plot_retrieval_metrics):
+    # Evaluate at K = 1, 3, 5, 10 and 20, then plot the metrics.
+    results = create_evals(QUESTIONS, [1, 3, 5, 10, 20])
+
+    plot_retrieval_metrics(results)
+    return
+
+
+if __name__ == "__main__":
+    app.run()
--- a/notebooks/search-evaluator/pyproject.toml
+++ b/notebooks/search-evaluator/pyproject.toml
@@ -0,0 +1,17 @@
+[project]
+name = "search-evaluator"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "azure-ada",
+    "marimo>=0.13.15",
+    "matplotlib>=3.10.3",
+    "qdrant",
+    "ranx>=0.3.20",
+]
+
+[tool.uv.sources]
+azure-ada = { workspace = true }
+qdrant = { workspace = true }
--- a/notebooks/synthetic-question-generator/README.md
+++ b/notebooks/synthetic-question-generator/README.md
--- a/notebooks/synthetic-question-generator/main.py
+++ b/notebooks/synthetic-question-generator/main.py
@@ -0,0 +1,226 @@
+import marimo
+
+__generated_with = "0.13.15"
+app = marimo.App(width="medium")
+
+# Setup block: imports, service clients and prompt constants that the cells
+# and functions below reference directly.
+with app.setup:
+    import marimo as mo
+
+    import random
+    import json
+    import os
+
+    from banortegpt.generation.vertex_ai_gemini import Gemini
+    from banortegpt.vector.qdrant import Qdrant
+
+    # Both clients are built from Vault using the VAULT_TOKEN environment
+    # variable (os.getenv yields None when it is unset).
+    gemini = Gemini.from_vault("banortegpt", token=os.getenv("VAULT_TOKEN"))
+    qdrant = Qdrant.from_vault("banortegpt", token=os.getenv("VAULT_TOKEN"))
+
+    # Collection names offered in the settings form dropdown.
+    collection_list = qdrant.list_collections()
+
+    # Question type -> definition text injected into the prompt as {qtype_def}.
+    # NOTE(review): the "Multi-hop Reasoning" text contains the typo
+    # "adocument"; it is sent to the LLM verbatim - consider fixing.
+    question_type_map = {
+        "Factual": "Questions targeting specific details within a reference (e.g., a company’s profit in a report, a verdict in a legal case, or symptoms in a medical record) to test RAG’s retrieval accuracy.",
+        "Summarization": "Questions that require comprehensive answers, covering all relevant information, to mainly evaluate the recall rate of RAG retrieval.",
+        "Multi-hop Reasoning": "Questions involve logical relationships among events and details within adocument, forming a reasoning chain to assess RAG’s logical reasoning ability.",
+        "Unanswerable": "Questions arise from potential information loss during the schema-to-article generation, where no corresponding information fragment exists, or the information is insufficient for an answer.",
+    }
+    # Options for the question-type dropdown.
+    question_types = list(question_type_map.keys())
+
+    # Wrapper applied to each retrieved chunk; placeholders: {id}, {content}.
+    FORMAT_TEMPLATE = """
+    <document>
+      <id>
+      {id}
+      </id>
+      <content>
+      {content}
+      </content>
+    </document>
+    """
+
+    # Spanish-language generation prompt; placeholders: {qtype}, {qtype_def},
+    # {context}.
+    # NOTE(review): the first sentence spells "sínteticas" (misplaced accent);
+    # the same sentence later uses the correct "sintéticas".
+    PROMPT_TEMPLATE = """
+    Eres un experto en generación de preguntas sínteticas. Tu tarea es crear preguntas sintéticas en español basadas en documentos de referencia proporcionados.
+
+    ## INSTRUCCIONES:
+
+    ### Requisitos obligatorios:
+    1. **Idioma**: La pregunta DEBE estar completamente en español
+    2. **Basada en documentos**: La pregunta DEBE poder responderse ÚNICAMENTE con la información contenida en los documentos proporcionados
+    3. **Tipo de pregunta**: Sigue estrictamente la definición del tipo de pregunta especificado
+    4. **Identificación de fuentes**: Incluye el ID de fuente de todos los documentos necesarios para responder la pregunta
+    5. **Respuesta ideal**: Incluye la respuesta perfecta basada en los documentos necesarios para responder la pregunta
+
+    ### Tipo de pregunta solicitado:
+    **Tipo**: {qtype}
+    **Definición**: {qtype_def}
+
+    ### Documentos de referencia:
+    {context}
+
+    Por favor, genera una pregunta siguiendo estas instrucciones.
+    """.strip()
+
+    # Schema passed to gemini.generate as response_schema: the question
+    # ("pregunta"), its answer ("respuesta") and the source document ids ("ids").
+    response_schema = {
+        "type": "object",
+        "properties": {
+            "pregunta": {
+                "type": "string",
+            },
+            "respuesta": {
+                "type": "string",
+            },
+            "ids": {"type": "array", "items": {"type": "string"}},
+        },
+        "required": ["pregunta", "respuesta", "ids"],
+    }
+
+
+@app.cell
+def _():
+    # Static usage guide (written in Spanish) rendered as this cell's markdown
+    # output; it walks through the settings form defined in the next cell.
+    mo.md(
+        r"""
+    # Generador de Preguntas Sintéticas
+
+    ## Guía de Uso
+
+    1. **Selecciona una colección de vectores** y especifica el campo que contiene el texto del vector
+    2. **Elige un modelo LLM** para la generación de preguntas sintéticas
+       - Modelo por defecto: `gemini-2.0-flash`
+    3. **Selecciona el tipo** y cantidad de chunks por pregunta
+    4. **Define la cantidad** de preguntas sintéticas que deseas crear
+    5. **Ejecuta la generación** y revisa los resultados
+    """
+    )
+    return
+
+
+@app.cell
+def _():
+    # Settings form: each {placeholder} in the markdown template is bound to
+    # the UI element of the same name passed to .batch(); .form() wraps the
+    # batch in a bordered form.
+    # NOTE(review): the two dropdowns are created without a default value, so
+    # "collection" and "qtype" are presumably None until the user picks one -
+    # confirm against how the consuming cell handles that.
+    settings = (
+        mo.md(
+            """
+        Collection: {collection} Key: {content_key}\n
+        LLM: {model}\n
+        Question type: {qtype} Chunks: {chunks}\n
+        Target amount: {amount}
+        """
+        )
+        .batch(
+            model=mo.ui.text(value="gemini-2.0-flash"),
+            collection=mo.ui.dropdown(collection_list, searchable=True),
+            content_key=mo.ui.text(value="page_content"),
+            amount=mo.ui.number(value=10, step=10),
+            chunks=mo.ui.number(value=3, step=1),
+            qtype=mo.ui.dropdown(question_types),
+        )
+        .form(bordered=True)
+    )
+
+    # Last expression of the cell: display the form.
+    settings
+    return (settings,)
+
+
+@app.cell
+def _(settings):
+    mo.stop(not settings.value)
+
+    CONTENT_KEY: str = settings.value["content_key"]
+    QUESTION_TYPE: str = settings.value["qtype"]
+    CHUNKS: int = settings.value["chunks"]
+    TYPE_DEFINITION: str = question_type_map[QUESTION_TYPE]
+    AMOUNT: int = settings.value["amount"]
+
+    gemini.set_model(settings.value["model"])
+    qdrant.collection = settings.value["collection"]
+    return AMOUNT, CHUNKS, CONTENT_KEY, QUESTION_TYPE, TYPE_DEFINITION
+
+
+@app.function
+def get_point_ids():
+    limit = qdrant.client.get_collection(qdrant.collection).points_count
+
+    query_response = qdrant.client.query_points(qdrant.collection, limit=limit)
+
+    return [point.id for point in query_response.points]
+
+
+@app.cell
+def _(CHUNKS: int, CONTENT_KEY: str):
+    def select_random_points(points: list):
+        selected_points = []
+        max = len(points) - 1
+
+        for _ in range(CHUNKS):
+            idx = random.randint(0, max)
+
+            selected_points.append(points[idx])
+
+        query_response = qdrant.client.retrieve(
+            qdrant.collection,
+            ids=selected_points,
+        )
+
+        data = [(point.id, point.payload[CONTENT_KEY]) for point in query_response]
+
+        return data
+
+    return (select_random_points,)
+
+
+@app.function
+def format_points_into_context(points):
+    templates = [FORMAT_TEMPLATE.format(id=p[0], content=p[1]) for p in points]
+
+    return "\n".join(templates)
+
+
+@app.function
+def generate_synthetic_questions(prompt):
+    response = gemini.generate(prompt, response_schema=response_schema)
+    return response
+
+
+@app.cell
+def _(QUESTION_TYPE: str, TYPE_DEFINITION: str, select_random_points):
+    def generate_questions(amount: int):
+        results = []
+
+        for _ in mo.status.progress_bar(range(amount), remove_on_exit=True):
+            point_ids = get_point_ids()
+
+            selected_points = select_random_points(point_ids)
+
+            context = format_points_into_context(selected_points)
+
+            prompt = PROMPT_TEMPLATE.format(
+                context=context, qtype=QUESTION_TYPE, qtype_def=TYPE_DEFINITION
+            )
+
+            questions = generate_synthetic_questions(prompt)
+
+            result = json.loads(questions.text)
+
+            result["type"] = QUESTION_TYPE
+
+            results.append(result)
+
+        return results
+
+    return (generate_questions,)
+
+
+@app.cell
+def _(AMOUNT: int, generate_questions):
+    # Run the generation for the amount chosen in the settings form.
+    results = generate_questions(AMOUNT)
+    return (results,)
+
+
+@app.cell
+def _(results):
+    import polars as pl
+
+    # Last expression of the cell: show the generated questions as a table.
+    pl.from_records(results)
+    return
+
+
+# Run the marimo app when this file is executed directly as a script.
+if __name__ == "__main__":
+    app.run()
--- a/notebooks/synthetic-question-generator/pyproject.toml
+++ b/notebooks/synthetic-question-generator/pyproject.toml
@@ -0,0 +1,20 @@
+[project]
+name = "synthetic-question-generator"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "hvac>=2.3.0",
+    "marimo>=0.13.15",
+    "polars>=1.30.0",
+    "qdrant",
+    "vertex-ai-gemini",
+]
+
+[tool.uv.sources]
+qdrant = { workspace = true }
+vertex-ai-gemini = { workspace = true }
+
+[dependency-groups]
+dev = []
--- a/notebooks/vector-db-migrator/README.md
+++ b/notebooks/vector-db-migrator/README.md
--- a/notebooks/vector-db-migrator/main.py
+++ b/notebooks/vector-db-migrator/main.py
@@ -0,0 +1,6 @@
+def main():
+    print("Hello from vector-db-migrator!")
+
+
+if __name__ == "__main__":
+    main()
--- a/notebooks/vector-db-migrator/pyproject.toml
+++ b/notebooks/vector-db-migrator/pyproject.toml
@@ -0,0 +1,7 @@
+[project]
+name = "vector-db-migrator"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = []
--- a/notebooks/vector-db-migrator/qdrant_to_cosmbosdb_mongo.py
+++ b/notebooks/vector-db-migrator/qdrant_to_cosmbosdb_mongo.py
@@ -0,0 +1,127 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "marimo",
+#     "numpy==2.1.0",
+#     "pymongo==4.11",
+#     "qdrant-client==1.11.0",
+#     "scikit-learn==1.6.1",
+#     "umap-learn==0.5.7",
+# ]
+# ///
+
+import marimo
+
+__generated_with = "0.11.0"
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    # Client libraries for the source (Qdrant) and target (MongoDB) stores;
+    # returned so other cells can take them as parameters.
+    from qdrant_client import QdrantClient, models
+    from pymongo import MongoClient
+
+    return MongoClient, QdrantClient, models
+
+
+@app.cell
+def _(QdrantClient):
+    qdrant = QdrantClient(
+        api_key="g2nZn0AMxuBREAqfna1YlednbVO1D8wAG3KNrKbYghyrftgVTP0TIg",
+        location="https://82ba8a5d-26e6-41ff-a4f0-ac5e7554ef15.eastus-0.azure.cloud.qdrant.io:6333",
+    )
+    print(qdrant.get_collection("MayaOCP").points_count)
+    return (qdrant,)
+
+
+@app.cell
+def _(MongoClient):
+    mongo = MongoClient(
+        "mongodb+srv://banorte:innovacion2024.@mayacontigo-mongo.global.mongocluster.cosmos.azure.com/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"
+    )
+    print(mongo.admin.command("ping"))
+    return (mongo,)
+
+
+@app.cell
+def _(qdrant):
+    points = qdrant.scroll(
+        collection_name="MayaOCP", with_vectors=True, with_payload=True, limit=100000
+    )[0]
+    print(len(points))
+    return (points,)
+
+
+@app.cell
+def _(mongo):
+    # Target database and collection handles for the migration.
+    mongodb = mongo["MayaContigo"]
+    collection = mongodb["MayaOCP"]
+    return collection, mongodb
+
+
+@app.cell
+def _(points):
+    documents = [{"vector": p.vector[:2000], **p.payload} for p in points]
+    documents[:2]
+    return (documents,)
+
+
+@app.cell
+def _(collection, documents):
+    # Bulk-insert the converted documents into the target collection.
+    # NOTE(review): re-running this cell presumably inserts the same documents
+    # again - confirm whether the collection should be cleared first.
+    collection.insert_many(documents)
+    return
+
+
+@app.cell
+def _(mongodb):
+    # Create the vector search index on the "vector" field of "MayaOCP" via a
+    # raw createIndexes command: kind "vector-hnsw", similarity "COS", and
+    # 2000 dimensions (matching the truncation applied to stored vectors).
+    mongodb.command(
+        {
+            "createIndexes": "MayaOCP",
+            "indexes": [
+                {
+                    "name": "VectorSearchIndex",
+                    "key": {"vector": "cosmosSearch"},
+                    "cosmosSearchOptions": {
+                        "kind": "vector-hnsw",
+                        "similarity": "COS",
+                        "dimensions": 2000,
+                    },
+                }
+            ],
+        }
+    )
+    return
+
+
+@app.cell
+def _(points):
+    # Use the first migrated point's vector as a sample query.
+    query_vector = points[0].vector
+    # Last expression of the cell: display the vector.
+    query_vector
+    return (query_vector,)
+
+
+@app.cell
+def _(collection, query_vector):
+    # Smoke-test the new index: search the "vector" field for the k=5 results
+    # of the sample query, truncated to 2000 components like the stored vectors.
+    pipeline = [
+        {
+            "$search": {
+                "cosmosSearch": {
+                    "path": "vector",
+                    "vector": query_vector[:2000],
+                    "k": 5,
+                }
+            }
+        }
+    ]
+    # Print every document returned by the aggregation.
+    for r in collection.aggregate(pipeline):
+        print(r)
+    return pipeline, r
+
+
+@app.cell
+def _():
+    # Empty placeholder cell.
+    return
+
+
+# Run the marimo app when this file is executed directly as a script.
+if __name__ == "__main__":
+    app.run()