forked from innovacion/Mayacontigo
ic
This commit is contained in:
226
notebooks/synthetic-question-generator/main.py
Normal file
226
notebooks/synthetic-question-generator/main.py
Normal file
@@ -0,0 +1,226 @@
|
||||
import marimo

__generated_with = "0.13.15"
app = marimo.App(width="medium")

with app.setup:
    import marimo as mo

    import random
    import json
    import os

    from banortegpt.generation.vertex_ai_gemini import Gemini
    from banortegpt.vector.qdrant import Qdrant

    # Vault-backed clients; requires VAULT_TOKEN in the environment.
    # NOTE(review): os.getenv returns None when unset — presumably from_vault
    # raises in that case; confirm the failure mode is acceptable.
    gemini = Gemini.from_vault("banortegpt", token=os.getenv("VAULT_TOKEN"))
    qdrant = Qdrant.from_vault("banortegpt", token=os.getenv("VAULT_TOKEN"))

    # Available Qdrant collections, shown in the settings dropdown below.
    collection_list = qdrant.list_collections()

    # Question taxonomy; each definition is injected verbatim into the
    # generation prompt to steer the LLM.
    question_type_map = {
        "Factual": "Questions targeting specific details within a reference (e.g., a company’s profit in a report, a verdict in a legal case, or symptoms in a medical record) to test RAG’s retrieval accuracy.",
        "Summarization": "Questions that require comprehensive answers, covering all relevant information, to mainly evaluate the recall rate of RAG retrieval.",
        # Fixed typo: "adocument" -> "a document".
        "Multi-hop Reasoning": "Questions involve logical relationships among events and details within a document, forming a reasoning chain to assess RAG’s logical reasoning ability.",
        "Unanswerable": "Questions arise from potential information loss during the schema-to-article generation, where no corresponding information fragment exists, or the information is insufficient for an answer.",
    }
    question_types = list(question_type_map.keys())

    # Per-document wrapper used to assemble the retrieval context.
    FORMAT_TEMPLATE = """
<document>
<id>
{id}
</id>
<content>
{content}
</content>
</document>
"""

    # Main generation prompt (Spanish). Placeholders: qtype, qtype_def, context.
    # Fixed typo: "sínteticas" -> "sintéticas".
    PROMPT_TEMPLATE = """
Eres un experto en generación de preguntas sintéticas. Tu tarea es crear preguntas sintéticas en español basadas en documentos de referencia proporcionados.

## INSTRUCCIONES:

### Requisitos obligatorios:
1. **Idioma**: La pregunta DEBE estar completamente en español
2. **Basada en documentos**: La pregunta DEBE poder responderse ÚNICAMENTE con la información contenida en los documentos proporcionados
3. **Tipo de pregunta**: Sigue estrictamente la definición del tipo de pregunta especificado
4. **Identificación de fuentes**: Incluye el ID de fuente de todos los documentos necesarios para responder la pregunta
5. **Respuesta ideal**: Incluye la respuesta perfecta basada en los documentos necesarios para responder la pregunta

### Tipo de pregunta solicitado:
**Tipo**: {qtype}
**Definición**: {qtype_def}

### Documentos de referencia:
{context}

Por favor, genera una pregunta siguiendo estas instrucciones.
""".strip()

    # Structured-output schema enforced on the LLM response.
    response_schema = {
        "type": "object",
        "properties": {
            "pregunta": {
                "type": "string",
            },
            "respuesta": {
                "type": "string",
            },
            "ids": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["pregunta", "respuesta", "ids"],
    }
|
||||
|
||||
|
||||
@app.cell
def _():
    # Render the usage guide; the markdown object is the cell's output.
    _guide = mo.md(
        r"""
    # Generador de Preguntas Sintéticas

    ## Guía de Uso

    1. **Selecciona una colección de vectores** y especifica el campo que contiene el texto del vector
    2. **Elige un modelo LLM** para la generación de preguntas sintéticas
        - Modelo por defecto: `gemini-2.0-flash`
    3. **Selecciona el tipo** y cantidad de chunks por pregunta
    4. **Define la cantidad** de preguntas sintéticas que deseas crear
    5. **Ejecuta la generación** y revisa los resultados
    """
    )
    _guide
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Layout template for the settings form; placeholders are bound by .batch().
    _layout = mo.md(
        """
    Collection: {collection} Key: {content_key}\n
    LLM: {model}\n
    Question type: {qtype} Chunks: {chunks}\n
    Target amount: {amount}
    """
    )

    # One UI element per placeholder in the layout above.
    _fields = {
        "model": mo.ui.text(value="gemini-2.0-flash"),
        "collection": mo.ui.dropdown(collection_list, searchable=True),
        "content_key": mo.ui.text(value="page_content"),
        "amount": mo.ui.number(value=10, step=10),
        "chunks": mo.ui.number(value=3, step=1),
        "qtype": mo.ui.dropdown(question_types),
    }

    # Wrap everything in a submittable form so downstream cells only run
    # after the user confirms their choices.
    settings = _layout.batch(**_fields).form(bordered=True)

    settings
    return (settings,)
|
||||
|
||||
|
||||
@app.cell
def _(settings):
    # Halt downstream execution until the form has been submitted.
    mo.stop(not settings.value)

    # Read the submitted form once, then unpack into named constants.
    _cfg = settings.value

    CONTENT_KEY: str = _cfg["content_key"]
    QUESTION_TYPE: str = _cfg["qtype"]
    CHUNKS: int = _cfg["chunks"]
    AMOUNT: int = _cfg["amount"]
    TYPE_DEFINITION: str = question_type_map[QUESTION_TYPE]

    # Point the shared clients at the user's selections.
    gemini.set_model(_cfg["model"])
    qdrant.collection = _cfg["collection"]
    return AMOUNT, CHUNKS, CONTENT_KEY, QUESTION_TYPE, TYPE_DEFINITION
|
||||
|
||||
|
||||
@app.function
def get_point_ids():
    """Return the ids of every point in the currently selected collection.

    Uses the collection's point count as the query limit so a single
    query_points call covers the whole collection.
    """
    total = qdrant.client.get_collection(qdrant.collection).points_count
    response = qdrant.client.query_points(qdrant.collection, limit=total)
    return [p.id for p in response.points]
|
||||
|
||||
|
||||
@app.cell
def _(CHUNKS: int, CONTENT_KEY: str):
    def select_random_points(points: list):
        """Pick CHUNKS random point ids and fetch their payload text.

        Returns a list of (point_id, content) tuples, where content is the
        payload field named by CONTENT_KEY.
        """
        # random.choices samples WITH replacement, matching the original
        # randint-based loop (duplicates are possible). The `max` local that
        # shadowed the builtin is gone.
        # NOTE(review): if duplicates are selected, retrieve() may return
        # fewer than CHUNKS points — confirm whether that is acceptable.
        selected_ids = random.choices(points, k=CHUNKS)

        query_response = qdrant.client.retrieve(
            qdrant.collection,
            ids=selected_ids,
        )

        return [(point.id, point.payload[CONTENT_KEY]) for point in query_response]

    return (select_random_points,)
|
||||
|
||||
|
||||
@app.function
def format_points_into_context(points):
    """Render (id, content) pairs into the document-context string.

    Each pair is wrapped with FORMAT_TEMPLATE and the fragments are joined
    with newlines.
    """
    rendered = (FORMAT_TEMPLATE.format(id=pid, content=text) for pid, text in points)
    return "\n".join(rendered)
|
||||
|
||||
|
||||
@app.function
def generate_synthetic_questions(prompt):
    """Call Gemini with the prompt, constrained to the structured schema."""
    return gemini.generate(prompt, response_schema=response_schema)
|
||||
|
||||
|
||||
@app.cell
def _(QUESTION_TYPE: str, TYPE_DEFINITION: str, select_random_points):
    def generate_questions(amount: int):
        """Generate `amount` synthetic questions from random collection chunks.

        Returns a list of dicts with keys "pregunta", "respuesta", "ids"
        (from the LLM's structured output) plus "type" (the question type).
        """
        results = []

        # The id listing scans the whole collection and does not change
        # between iterations — fetch it once instead of once per question.
        point_ids = get_point_ids()

        for _ in mo.status.progress_bar(range(amount), remove_on_exit=True):
            selected_points = select_random_points(point_ids)

            context = format_points_into_context(selected_points)

            prompt = PROMPT_TEMPLATE.format(
                context=context, qtype=QUESTION_TYPE, qtype_def=TYPE_DEFINITION
            )

            questions = generate_synthetic_questions(prompt)

            # The schema guarantees a JSON object in the response text.
            result = json.loads(questions.text)
            result["type"] = QUESTION_TYPE

            results.append(result)

        return results

    return (generate_questions,)
|
||||
|
||||
|
||||
@app.cell
def _(AMOUNT: int, generate_questions):
    # Run the generation loop for the configured number of questions.
    results = generate_questions(AMOUNT)
    return (results,)
|
||||
|
||||
|
||||
@app.cell
def _(results):
    import polars as pl

    # Show the generated questions as a table; the DataFrame is the output.
    _table = pl.from_records(results)
    _table
    return
|
||||
|
||||
|
||||
# Allow running the notebook directly as a script.
if __name__ == "__main__":
    app.run()
|
||||
Reference in New Issue
Block a user