forked from innovacion/Mayacontigo
ic
This commit is contained in:
226
notebooks/synthetic-question-generator/main.py
Normal file
226
notebooks/synthetic-question-generator/main.py
Normal file
@@ -0,0 +1,226 @@
|
||||
import marimo

__generated_with = "0.13.15"
app = marimo.App(width="medium")

with app.setup:
    import marimo as mo

    import random
    import json
    import os

    from banortegpt.generation.vertex_ai_gemini import Gemini
    from banortegpt.vector.qdrant import Qdrant

    # Vault-backed clients; requires VAULT_TOKEN in the environment.
    # NOTE(review): os.getenv returns None when unset — presumably from_vault
    # raises in that case; confirm the failure mode is acceptable.
    gemini = Gemini.from_vault("banortegpt", token=os.getenv("VAULT_TOKEN"))
    qdrant = Qdrant.from_vault("banortegpt", token=os.getenv("VAULT_TOKEN"))

    # Available Qdrant collections, shown in the settings dropdown below.
    collection_list = qdrant.list_collections()

    # Question taxonomy; each definition is injected verbatim into the
    # generation prompt to steer the LLM.
    question_type_map = {
        "Factual": "Questions targeting specific details within a reference (e.g., a company’s profit in a report, a verdict in a legal case, or symptoms in a medical record) to test RAG’s retrieval accuracy.",
        "Summarization": "Questions that require comprehensive answers, covering all relevant information, to mainly evaluate the recall rate of RAG retrieval.",
        # Fixed typo: "adocument" -> "a document".
        "Multi-hop Reasoning": "Questions involve logical relationships among events and details within a document, forming a reasoning chain to assess RAG’s logical reasoning ability.",
        "Unanswerable": "Questions arise from potential information loss during the schema-to-article generation, where no corresponding information fragment exists, or the information is insufficient for an answer.",
    }
    question_types = list(question_type_map.keys())

    # Per-document wrapper used to assemble the retrieval context.
    FORMAT_TEMPLATE = """
<document>
<id>
{id}
</id>
<content>
{content}
</content>
</document>
"""

    # Main generation prompt (Spanish). Placeholders: qtype, qtype_def, context.
    # Fixed typo: "sínteticas" -> "sintéticas".
    PROMPT_TEMPLATE = """
Eres un experto en generación de preguntas sintéticas. Tu tarea es crear preguntas sintéticas en español basadas en documentos de referencia proporcionados.

## INSTRUCCIONES:

### Requisitos obligatorios:
1. **Idioma**: La pregunta DEBE estar completamente en español
2. **Basada en documentos**: La pregunta DEBE poder responderse ÚNICAMENTE con la información contenida en los documentos proporcionados
3. **Tipo de pregunta**: Sigue estrictamente la definición del tipo de pregunta especificado
4. **Identificación de fuentes**: Incluye el ID de fuente de todos los documentos necesarios para responder la pregunta
5. **Respuesta ideal**: Incluye la respuesta perfecta basada en los documentos necesarios para responder la pregunta

### Tipo de pregunta solicitado:
**Tipo**: {qtype}
**Definición**: {qtype_def}

### Documentos de referencia:
{context}

Por favor, genera una pregunta siguiendo estas instrucciones.
""".strip()

    # Structured-output schema enforced on the LLM response.
    response_schema = {
        "type": "object",
        "properties": {
            "pregunta": {
                "type": "string",
            },
            "respuesta": {
                "type": "string",
            },
            "ids": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["pregunta", "respuesta", "ids"],
    }
|
||||
|
||||
|
||||
@app.cell
def _():
    # Render the usage guide; the markdown object is the cell's output.
    _guide = mo.md(
        r"""
    # Generador de Preguntas Sintéticas

    ## Guía de Uso

    1. **Selecciona una colección de vectores** y especifica el campo que contiene el texto del vector
    2. **Elige un modelo LLM** para la generación de preguntas sintéticas
        - Modelo por defecto: `gemini-2.0-flash`
    3. **Selecciona el tipo** y cantidad de chunks por pregunta
    4. **Define la cantidad** de preguntas sintéticas que deseas crear
    5. **Ejecuta la generación** y revisa los resultados
    """
    )
    _guide
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Layout template for the settings form; placeholders are bound by .batch().
    _layout = mo.md(
        """
    Collection: {collection} Key: {content_key}\n
    LLM: {model}\n
    Question type: {qtype} Chunks: {chunks}\n
    Target amount: {amount}
    """
    )

    # One UI element per placeholder in the layout above.
    _fields = {
        "model": mo.ui.text(value="gemini-2.0-flash"),
        "collection": mo.ui.dropdown(collection_list, searchable=True),
        "content_key": mo.ui.text(value="page_content"),
        "amount": mo.ui.number(value=10, step=10),
        "chunks": mo.ui.number(value=3, step=1),
        "qtype": mo.ui.dropdown(question_types),
    }

    # Wrap everything in a submittable form so downstream cells only run
    # after the user confirms their choices.
    settings = _layout.batch(**_fields).form(bordered=True)

    settings
    return (settings,)
|
||||
|
||||
|
||||
@app.cell
def _(settings):
    # Halt downstream execution until the form has been submitted.
    mo.stop(not settings.value)

    # Read the submitted form once, then unpack into named constants.
    _cfg = settings.value

    CONTENT_KEY: str = _cfg["content_key"]
    QUESTION_TYPE: str = _cfg["qtype"]
    CHUNKS: int = _cfg["chunks"]
    AMOUNT: int = _cfg["amount"]
    TYPE_DEFINITION: str = question_type_map[QUESTION_TYPE]

    # Point the shared clients at the user's selections.
    gemini.set_model(_cfg["model"])
    qdrant.collection = _cfg["collection"]
    return AMOUNT, CHUNKS, CONTENT_KEY, QUESTION_TYPE, TYPE_DEFINITION
|
||||
|
||||
|
||||
@app.function
def get_point_ids():
    """Return the ids of every point in the currently selected collection.

    Uses the collection's point count as the query limit so a single
    query_points call covers the whole collection.
    """
    total = qdrant.client.get_collection(qdrant.collection).points_count
    response = qdrant.client.query_points(qdrant.collection, limit=total)
    return [p.id for p in response.points]
|
||||
|
||||
|
||||
@app.cell
def _(CHUNKS: int, CONTENT_KEY: str):
    def select_random_points(points: list):
        """Pick CHUNKS random point ids and fetch their payload text.

        Returns a list of (point_id, content) tuples, where content is the
        payload field named by CONTENT_KEY.
        """
        # random.choices samples WITH replacement, matching the original
        # randint-based loop (duplicates are possible). The `max` local that
        # shadowed the builtin is gone.
        # NOTE(review): if duplicates are selected, retrieve() may return
        # fewer than CHUNKS points — confirm whether that is acceptable.
        selected_ids = random.choices(points, k=CHUNKS)

        query_response = qdrant.client.retrieve(
            qdrant.collection,
            ids=selected_ids,
        )

        return [(point.id, point.payload[CONTENT_KEY]) for point in query_response]

    return (select_random_points,)
|
||||
|
||||
|
||||
@app.function
def format_points_into_context(points):
    """Render (id, content) pairs into the document-context string.

    Each pair is wrapped with FORMAT_TEMPLATE and the fragments are joined
    with newlines.
    """
    rendered = (FORMAT_TEMPLATE.format(id=pid, content=text) for pid, text in points)
    return "\n".join(rendered)
|
||||
|
||||
|
||||
@app.function
def generate_synthetic_questions(prompt):
    """Call Gemini with the prompt, constrained to the structured schema."""
    return gemini.generate(prompt, response_schema=response_schema)
|
||||
|
||||
|
||||
@app.cell
def _(QUESTION_TYPE: str, TYPE_DEFINITION: str, select_random_points):
    def generate_questions(amount: int):
        """Generate `amount` synthetic questions from random collection chunks.

        Returns a list of dicts with keys "pregunta", "respuesta", "ids"
        (from the LLM's structured output) plus "type" (the question type).
        """
        results = []

        # The id listing scans the whole collection and does not change
        # between iterations — fetch it once instead of once per question.
        point_ids = get_point_ids()

        for _ in mo.status.progress_bar(range(amount), remove_on_exit=True):
            selected_points = select_random_points(point_ids)

            context = format_points_into_context(selected_points)

            prompt = PROMPT_TEMPLATE.format(
                context=context, qtype=QUESTION_TYPE, qtype_def=TYPE_DEFINITION
            )

            questions = generate_synthetic_questions(prompt)

            # The schema guarantees a JSON object in the response text.
            result = json.loads(questions.text)
            result["type"] = QUESTION_TYPE

            results.append(result)

        return results

    return (generate_questions,)
|
||||
|
||||
|
||||
@app.cell
def _(AMOUNT: int, generate_questions):
    # Run the generation loop for the configured number of questions.
    results = generate_questions(AMOUNT)
    return (results,)
|
||||
|
||||
|
||||
@app.cell
def _(results):
    import polars as pl

    # Show the generated questions as a table; the DataFrame is the output.
    _table = pl.from_records(results)
    _table
    return
|
||||
|
||||
|
||||
# Allow running the notebook directly as a script.
if __name__ == "__main__":
    app.run()
|
||||
Reference in New Issue
Block a user