import marimo

__generated_with = "0.13.15"
app = marimo.App(width="medium")

with app.setup:
    # Shared imports, service clients and constants available to every cell
    # and every @app.function below.
    import marimo as mo
    import random
    import json
    import os
    from banortegpt.generation.vertex_ai_gemini import Gemini
    from banortegpt.vector.qdrant import Qdrant

    # Both clients are built from the same vault entry; the token is read
    # from the VAULT_TOKEN environment variable.
    gemini = Gemini.from_vault("banortegpt", token=os.getenv("VAULT_TOKEN"))
    qdrant = Qdrant.from_vault("banortegpt", token=os.getenv("VAULT_TOKEN"))

    collection_list = qdrant.list_collections()

    # Question type -> definition that is injected into the prompt.
    question_type_map = {
        "Factual": "Questions targeting specific details within a reference (e.g., a company’s profit in a report, a verdict in a legal case, or symptoms in a medical record) to test RAG’s retrieval accuracy.",
        "Summarization": "Questions that require comprehensive answers, covering all relevant information, to mainly evaluate the recall rate of RAG retrieval.",
        "Multi-hop Reasoning": "Questions involve logical relationships among events and details within adocument, forming a reasoning chain to assess RAG’s logical reasoning ability.",
        "Unanswerable": "Questions arise from potential information loss during the schema-to-article generation, where no corresponding information fragment exists, or the information is insufficient for an answer.",
    }
    question_types = list(question_type_map.keys())

    # Per-chunk rendering used to build the prompt context.
    # NOTE(review): the template has no labels or delimiters around the id
    # and the content -- confirm no markup was lost from this literal.
    FORMAT_TEMPLATE = """
{id}
{content}
"""

    PROMPT_TEMPLATE = """
Eres un experto en generación de preguntas sínteticas. Tu tarea es crear preguntas sintéticas en español basadas en documentos de referencia proporcionados.

## INSTRUCCIONES:

### Requisitos obligatorios:
1. **Idioma**: La pregunta DEBE estar completamente en español
2. **Basada en documentos**: La pregunta DEBE poder responderse ÚNICAMENTE con la información contenida en los documentos proporcionados
3. **Tipo de pregunta**: Sigue estrictamente la definición del tipo de pregunta especificado
4. **Identificación de fuentes**: Incluye el ID de fuente de todos los documentos necesarios para responder la pregunta
5. **Respuesta ideal**: Incluye la respuesta perfecta basada en los documentos necesarios para responder la pregunta

### Tipo de pregunta solicitado:
**Tipo**: {qtype}
**Definición**: {qtype_def}

### Documentos de referencia:
{context}

Por favor, genera una pregunta siguiendo estas instrucciones.
""".strip()

    # JSON schema passed to the model so the reply can be parsed with
    # json.loads: question, ideal answer and the ids of the source chunks.
    response_schema = {
        "type": "object",
        "properties": {
            "pregunta": {
                "type": "string",
            },
            "respuesta": {
                "type": "string",
            },
            "ids": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["pregunta", "respuesta", "ids"],
    }


@app.cell
def _():
    # Static usage guide rendered at the top of the notebook.
    mo.md(
        r"""
    # Generador de Preguntas Sintéticas

    ## Guía de Uso

    1. **Selecciona una colección de vectores** y especifica el campo que contiene el texto del vector
    2. **Elige un modelo LLM** para la generación de preguntas sintéticas
        - Modelo por defecto: `gemini-2.0-flash`
    3. **Selecciona el tipo** y cantidad de chunks por pregunta
    4. **Define la cantidad** de preguntas sintéticas que deseas crear
    5. **Ejecuta la generación** y revisa los resultados
    """
    )
    return


@app.cell
def _():
    # Settings form; its value stays falsy until the form is submitted.
    settings = (
        mo.md(
            """
            Collection: {collection}
            Key: {content_key}\n
            LLM: {model}\n
            Question type: {qtype}
            Chunks: {chunks}\n
            Target amount: {amount}
            """
        )
        .batch(
            model=mo.ui.text(value="gemini-2.0-flash"),
            collection=mo.ui.dropdown(collection_list, searchable=True),
            content_key=mo.ui.text(value="page_content"),
            amount=mo.ui.number(value=10, step=10),
            chunks=mo.ui.number(value=3, step=1),
            qtype=mo.ui.dropdown(question_types),
        )
        .form(bordered=True)
    )
    settings
    return (settings,)


@app.cell
def _(settings):
    # Halt this cell (and its dependents) until the form has been submitted.
    mo.stop(not settings.value)
    # Neither dropdown has a default, so a submitted form can still carry
    # None for them; stop with a message instead of failing on the
    # question_type_map lookup below.
    mo.stop(
        settings.value["collection"] is None or settings.value["qtype"] is None,
        mo.md("**Select a collection and a question type, then submit again.**"),
    )

    CONTENT_KEY: str = settings.value["content_key"]
    QUESTION_TYPE: str = settings.value["qtype"]
    CHUNKS: int = settings.value["chunks"]
    TYPE_DEFINITION: str = question_type_map[QUESTION_TYPE]
    AMOUNT: int = settings.value["amount"]

    # Point the shared clients at the chosen model and collection.
    gemini.set_model(settings.value["model"])
    qdrant.collection = settings.value["collection"]
    return AMOUNT, CHUNKS, CONTENT_KEY, QUESTION_TYPE, TYPE_DEFINITION


@app.function
def get_point_ids():
    """Return the ids of the points in the currently selected collection.

    The query limit is set to the collection's reported ``points_count``.
    Only ids are used by callers, so payloads are not requested.
    """
    limit = qdrant.client.get_collection(qdrant.collection).points_count
    query_response = qdrant.client.query_points(
        qdrant.collection, limit=limit, with_payload=False
    )
    return [point.id for point in query_response.points]


@app.cell
def _(CHUNKS: int, CONTENT_KEY: str):
    def select_random_points(points: list):
        """Pick up to ``CHUNKS`` distinct ids from *points* and fetch them.

        Args:
            points: Point ids to choose from.

        Returns:
            A list of ``(point_id, text)`` tuples, where the text is read
            from the ``CONTENT_KEY`` field of each retrieved payload.

        Raises:
            ValueError: If *points* is empty.
        """
        if not points:
            raise ValueError("The selected collection has no points to sample from.")

        # Sample without replacement so the same chunk is never chosen twice,
        # and never ask for more chunks than the collection holds.
        sample_size = max(0, min(CHUNKS, len(points)))
        chosen_ids = random.sample(points, k=sample_size)

        records = qdrant.client.retrieve(
            qdrant.collection,
            ids=chosen_ids,
        )
        return [(record.id, record.payload[CONTENT_KEY]) for record in records]

    return (select_random_points,)


@app.function
def format_points_into_context(points):
    """Render ``(id, content)`` pairs with FORMAT_TEMPLATE, newline-joined."""
    return "\n".join(
        FORMAT_TEMPLATE.format(id=point_id, content=content)
        for point_id, content in points
    )


@app.function
def generate_synthetic_questions(prompt):
    """Send *prompt* to the model, constrained to ``response_schema``."""
    return gemini.generate(prompt, response_schema=response_schema)


@app.cell
def _(QUESTION_TYPE: str, TYPE_DEFINITION: str, select_random_points):
    def generate_questions(amount: int):
        """Generate *amount* synthetic questions of the configured type.

        Each question is built from a fresh random selection of chunks.

        Returns:
            A list of dicts parsed from the model's JSON reply, each with an
            added ``"type"`` key holding the question type.
        """
        # The id list does not change between iterations; fetch it once
        # instead of re-reading the whole collection for every question.
        point_ids = get_point_ids()

        results = []
        for _ in mo.status.progress_bar(range(amount), remove_on_exit=True):
            selected_points = select_random_points(point_ids)
            prompt = PROMPT_TEMPLATE.format(
                context=format_points_into_context(selected_points),
                qtype=QUESTION_TYPE,
                qtype_def=TYPE_DEFINITION,
            )
            reply = generate_synthetic_questions(prompt)
            result = json.loads(reply.text)
            result["type"] = QUESTION_TYPE
            results.append(result)
        return results

    return (generate_questions,)


@app.cell
def _(AMOUNT: int, generate_questions):
    # Runs the generation as soon as the settings cell has produced values.
    results = generate_questions(AMOUNT)
    return (results,)


@app.cell
def _(results):
    import polars as pl

    # Display the generated questions as a table.
    pl.from_records(results)
    return


if __name__ == "__main__":
    app.run()