Files
Mayacontigo/notebooks/synthetic-question-generator/main.py
Rogelio 325f1ef439 ic
2025-10-13 18:16:25 +00:00

227 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import marimo

__generated_with = "0.13.15"

app = marimo.App(width="medium")

with app.setup:
    import marimo as mo
    import random
    import json
    import os

    from banortegpt.generation.vertex_ai_gemini import Gemini
    from banortegpt.vector.qdrant import Qdrant

    # Shared clients for the whole notebook, authenticated through Vault.
    # VAULT_TOKEN must be present in the environment or these calls fail.
    gemini = Gemini.from_vault("banortegpt", token=os.getenv("VAULT_TOKEN"))
    qdrant = Qdrant.from_vault("banortegpt", token=os.getenv("VAULT_TOKEN"))

    # Populates the collection dropdown in the settings form below.
    collection_list = qdrant.list_collections()

    # Question taxonomy: keys feed the UI dropdown, values are injected
    # verbatim into the generation prompt as the type definition.
    # NOTE(review): restored apostrophes/spaces that were dropped during
    # text extraction ("company's", "RAG's", "a document").
    question_type_map = {
        "Factual": "Questions targeting specific details within a reference (e.g., a company's profit in a report, a verdict in a legal case, or symptoms in a medical record) to test RAG's retrieval accuracy.",
        "Summarization": "Questions that require comprehensive answers, covering all relevant information, to mainly evaluate the recall rate of RAG retrieval.",
        "Multi-hop Reasoning": "Questions involve logical relationships among events and details within a document, forming a reasoning chain to assess RAG's logical reasoning ability.",
        "Unanswerable": "Questions arise from potential information loss during the schema-to-article generation, where no corresponding information fragment exists, or the information is insufficient for an answer.",
    }
    question_types = list(question_type_map.keys())

    # One <document> entry inside the prompt's reference-documents section.
    FORMAT_TEMPLATE = """
<document>
<id>
{id}
</id>
<content>
{content}
</content>
</document>
"""

    # Spanish prompt instructing the model to produce one grounded question
    # per call. (Fixed misspelling: "sínteticas" -> "sintéticas".)
    PROMPT_TEMPLATE = """
Eres un experto en generación de preguntas sintéticas. Tu tarea es crear preguntas sintéticas en español basadas en documentos de referencia proporcionados.
## INSTRUCCIONES:
### Requisitos obligatorios:
1. **Idioma**: La pregunta DEBE estar completamente en español
2. **Basada en documentos**: La pregunta DEBE poder responderse ÚNICAMENTE con la información contenida en los documentos proporcionados
3. **Tipo de pregunta**: Sigue estrictamente la definición del tipo de pregunta especificado
4. **Identificación de fuentes**: Incluye el ID de fuente de todos los documentos necesarios para responder la pregunta
5. **Respuesta ideal**: Incluye la respuesta perfecta basada en los documentos necesarios para responder la pregunta
### Tipo de pregunta solicitado:
**Tipo**: {qtype}
**Definición**: {qtype_def}
### Documentos de referencia:
{context}
Por favor, genera una pregunta siguiendo estas instrucciones.
""".strip()

    # Structured-output schema passed to Gemini: question, ideal answer,
    # and the source document ids needed to answer.
    response_schema = {
        "type": "object",
        "properties": {
            "pregunta": {
                "type": "string",
            },
            "respuesta": {
                "type": "string",
            },
            "ids": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["pregunta", "respuesta", "ids"],
    }
@app.cell
def _():
    # Static usage guide rendered at the top of the notebook; no outputs
    # are exported to other cells.
    mo.md(
        r"""
        # Generador de Preguntas Sintéticas
        ## Guía de Uso
        1. **Selecciona una colección de vectores** y especifica el campo que contiene el texto del vector
        2. **Elige un modelo LLM** para la generación de preguntas sintéticas
        - Modelo por defecto: `gemini-2.0-flash`
        3. **Selecciona el tipo** y cantidad de chunks por pregunta
        4. **Define la cantidad** de preguntas sintéticas que deseas crear
        5. **Ejecuta la generación** y revisa los resultados
        """
    )
    return
@app.cell
def _():
    # Settings form: one submit action captures every generation parameter
    # (collection, payload key, model, question type, chunk count, amount).
    layout = mo.md(
        """
        Collection: {collection} Key: {content_key}\n
        LLM: {model}\n
        Question type: {qtype} Chunks: {chunks}\n
        Target amount: {amount}
        """
    )
    widgets = dict(
        model=mo.ui.text(value="gemini-2.0-flash"),
        collection=mo.ui.dropdown(collection_list, searchable=True),
        content_key=mo.ui.text(value="page_content"),
        amount=mo.ui.number(value=10, step=10),
        chunks=mo.ui.number(value=3, step=1),
        qtype=mo.ui.dropdown(question_types),
    )
    settings = layout.batch(**widgets).form(bordered=True)
    settings
    return (settings,)
@app.cell
def _(settings):
    # Halt every downstream cell until the form has been submitted.
    mo.stop(not settings.value)

    chosen = settings.value
    CONTENT_KEY: str = chosen["content_key"]
    QUESTION_TYPE: str = chosen["qtype"]
    TYPE_DEFINITION: str = question_type_map[QUESTION_TYPE]
    CHUNKS: int = chosen["chunks"]
    AMOUNT: int = chosen["amount"]

    # Point the shared clients at the selected model and collection.
    gemini.set_model(chosen["model"])
    qdrant.collection = chosen["collection"]
    return AMOUNT, CHUNKS, CONTENT_KEY, QUESTION_TYPE, TYPE_DEFINITION
@app.function
def get_point_ids():
    """Return the id of every point stored in the active Qdrant collection."""
    # The collection's own point count is used as the query limit so a
    # single query_points call pages through everything.
    total = qdrant.client.get_collection(qdrant.collection).points_count
    response = qdrant.client.query_points(qdrant.collection, limit=total)
    return [p.id for p in response.points]
@app.cell
def _(CHUNKS: int, CONTENT_KEY: str):
    def select_random_points(points: list):
        """Pick CHUNKS point ids at random and fetch their payload text.

        Args:
            points: list of Qdrant point ids to sample from (must be non-empty).

        Returns:
            List of ``(point_id, content)`` tuples, where content is read from
            the payload field named by CONTENT_KEY.
        """
        # random.choices samples WITH replacement, matching the original
        # randint-index loop (duplicates are possible); it also removes the
        # manual index bookkeeping that shadowed the builtin `max`.
        selected_ids = random.choices(points, k=CHUNKS)
        retrieved = qdrant.client.retrieve(
            qdrant.collection,
            ids=selected_ids,
        )
        return [(point.id, point.payload[CONTENT_KEY]) for point in retrieved]

    return (select_random_points,)
@app.function
def format_points_into_context(points):
    """Render (id, content) pairs as the prompt's <document> context section."""
    return "\n".join(
        FORMAT_TEMPLATE.format(id=point_id, content=body)
        for point_id, body in points
    )
@app.function
def generate_synthetic_questions(prompt):
    """Call Gemini once; the reply is constrained by response_schema."""
    return gemini.generate(prompt, response_schema=response_schema)
@app.cell
def _(QUESTION_TYPE: str, TYPE_DEFINITION: str, select_random_points):
    def generate_questions(amount: int):
        """Generate `amount` synthetic questions from random document chunks.

        Each iteration samples CHUNKS random points, builds the prompt, and
        parses the model's JSON reply, tagging it with the question type.

        Returns:
            List of dicts with keys "pregunta", "respuesta", "ids", "type".
        """
        # Hoisted out of the loop: the id list is loop-invariant, and the
        # original code re-queried the entire collection on every iteration
        # (O(amount * points) round-trips for no benefit).
        point_ids = get_point_ids()
        results = []
        for _ in mo.status.progress_bar(range(amount), remove_on_exit=True):
            selected_points = select_random_points(point_ids)
            context = format_points_into_context(selected_points)
            prompt = PROMPT_TEMPLATE.format(
                context=context, qtype=QUESTION_TYPE, qtype_def=TYPE_DEFINITION
            )
            questions = generate_synthetic_questions(prompt)
            result = json.loads(questions.text)
            result["type"] = QUESTION_TYPE
            results.append(result)
        return results

    return (generate_questions,)
@app.cell
def _(AMOUNT: int, generate_questions):
    # Execute the full pipeline; reruns whenever the form is resubmitted.
    results = generate_questions(AMOUNT)
    return (results,)
@app.cell
def _(results):
    # Show the generated questions as a table (cell-local import keeps
    # polars out of the setup dependencies).
    import polars as pl

    pl.from_records(results)
    return
if __name__ == "__main__":
    # Allows running the notebook as a plain script: `python main.py`.
    app.run()