This commit is contained in:
Rogelio
2025-10-13 18:16:25 +00:00
parent 739f087cef
commit 325f1ef439
415 changed files with 46870 additions and 0 deletions

View File

@@ -0,0 +1,226 @@
import marimo
# Version string recorded in the notebook file (presumably the marimo release
# that last wrote it - confirm before editing by hand).
__generated_with = "0.13.15"
# Application object on which the cells and functions below are registered
# through the `app.setup`, `@app.cell` and `@app.function` hooks.
app = marimo.App(width="medium")
with app.setup:
    # Setup block: the imports, service clients and constants bound here are
    # used by the cells and functions below without being passed as parameters.
    import marimo as mo
    import random
    import json
    import os
    from banortegpt.generation.vertex_ai_gemini import Gemini
    from banortegpt.vector.qdrant import Qdrant

    # Both clients are created through `from_vault("banortegpt", ...)` and are
    # given the value of the VAULT_TOKEN environment variable as their token.
    gemini = Gemini.from_vault("banortegpt", token=os.getenv("VAULT_TOKEN"))
    qdrant = Qdrant.from_vault("banortegpt", token=os.getenv("VAULT_TOKEN"))

    # Options offered by the collection dropdown of the settings form.
    collection_list = qdrant.list_collections()

    # Question type -> definition; the definition is inserted into the prompt
    # through the `{qtype_def}` placeholder of PROMPT_TEMPLATE.
    question_type_map = {
        "Factual": "Questions targeting specific details within a reference (e.g., a company's profit in a report, a verdict in a legal case, or symptoms in a medical record) to test RAG's retrieval accuracy.",
        "Summarization": "Questions that require comprehensive answers, covering all relevant information, to mainly evaluate the recall rate of RAG retrieval.",
        "Multi-hop Reasoning": "Questions involve logical relationships among events and details within a document, forming a reasoning chain to assess RAG's logical reasoning ability.",
        "Unanswerable": "Questions arise from potential information loss during the schema-to-article generation, where no corresponding information fragment exists, or the information is insufficient for an answer.",
    }
    question_types = list(question_type_map.keys())

    # Wrapper applied to one retrieved chunk; filled with `id` and `content`.
    FORMAT_TEMPLATE = """
<document>
<id>
{id}
</id>
<content>
{content}
</content>
</document>
"""

    # Spanish instruction prompt; filled with `qtype`, `qtype_def` and `context`.
    PROMPT_TEMPLATE = """
Eres un experto en generación de preguntas sintéticas. Tu tarea es crear preguntas sintéticas en español basadas en documentos de referencia proporcionados.
## INSTRUCCIONES:
### Requisitos obligatorios:
1. **Idioma**: La pregunta DEBE estar completamente en español
2. **Basada en documentos**: La pregunta DEBE poder responderse ÚNICAMENTE con la información contenida en los documentos proporcionados
3. **Tipo de pregunta**: Sigue estrictamente la definición del tipo de pregunta especificado
4. **Identificación de fuentes**: Incluye el ID de fuente de todos los documentos necesarios para responder la pregunta
5. **Respuesta ideal**: Incluye la respuesta perfecta basada en los documentos necesarios para responder la pregunta
### Tipo de pregunta solicitado:
**Tipo**: {qtype}
**Definición**: {qtype_def}
### Documentos de referencia:
{context}
Por favor, genera una pregunta siguiendo estas instrucciones.
""".strip()

    # Schema passed to `gemini.generate`; the parsed reply is expected to hold
    # the question ("pregunta"), its answer ("respuesta") and the source ids.
    response_schema = {
        "type": "object",
        "properties": {
            "pregunta": {
                "type": "string",
            },
            "respuesta": {
                "type": "string",
            },
            "ids": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["pregunta", "respuesta", "ids"],
    }
@app.cell
def _():
    # Intro cell: builds the notebook title and the usage guide (written in
    # Spanish) as a markdown element.
    mo.md(
        r"""
# Generador de Preguntas Sintéticas
## Guía de Uso
1. **Selecciona una colección de vectores** y especifica el campo que contiene el texto del vector
2. **Elige un modelo LLM** para la generación de preguntas sintéticas
- Modelo por defecto: `gemini-2.0-flash`
3. **Selecciona el tipo** y cantidad de chunks por pregunta
4. **Define la cantidad** de preguntas sintéticas que deseas crear
5. **Ejecuta la generación** y revisa los resultados
"""
    )
    return
@app.cell
def _():
    # Markdown layout of the form; every `{placeholder}` is replaced by the
    # widget registered under the same name.
    _layout = mo.md(
        """
Collection: {collection} Key: {content_key}\n
LLM: {model}\n
Question type: {qtype} Chunks: {chunks}\n
Target amount: {amount}
"""
    )
    # Input widgets, keyed by placeholder name. The keys are also the keys of
    # `settings.value` once the form has been submitted.
    _widgets = {
        "model": mo.ui.text(value="gemini-2.0-flash"),
        "collection": mo.ui.dropdown(collection_list, searchable=True),
        "content_key": mo.ui.text(value="page_content"),
        "amount": mo.ui.number(value=10, step=10),
        "chunks": mo.ui.number(value=3, step=1),
        "qtype": mo.ui.dropdown(question_types),
    }
    settings = _layout.batch(**_widgets).form(bordered=True)
    settings
    return (settings,)
@app.cell
def _(settings):
    # Do nothing until the settings form has been submitted.
    mo.stop(not settings.value)

    _form = settings.value

    # Neither dropdown has a default, so an unselected one arrives as None.
    # Stop with a readable message instead of failing on the
    # `question_type_map` lookup below.
    mo.stop(
        _form["collection"] is None or _form["qtype"] is None,
        mo.md("**Select a collection and a question type, then submit the form again.**"),
    )

    # The number inputs have no lower bound, and an emptied field may come
    # through as None; both counts must be usable with `range()` downstream.
    _chunks = _form["chunks"]
    _amount = _form["amount"]
    mo.stop(
        _chunks is None or _amount is None or _chunks < 1 or _amount < 1,
        mo.md("**Chunks and target amount must both be at least 1.**"),
    )

    CONTENT_KEY: str = _form["content_key"]
    QUESTION_TYPE: str = _form["qtype"]
    CHUNKS: int = int(_chunks)
    TYPE_DEFINITION: str = question_type_map[QUESTION_TYPE]
    AMOUNT: int = int(_amount)

    # Point the shared clients at the chosen model and collection.
    gemini.set_model(_form["model"])
    qdrant.collection = _form["collection"]
    return AMOUNT, CHUNKS, CONTENT_KEY, QUESTION_TYPE, TYPE_DEFINITION
@app.function
def get_point_ids():
    """Return the ids of all points in the currently selected collection.

    The collection is read from ``qdrant.collection``. Only ids are needed,
    so payloads are not requested.

    NOTE(review): assumes ``qdrant.client`` is a ``qdrant_client.QdrantClient``
    (the calls used here match its API) - confirm against the wrapper.

    Returns:
        A list of point ids; empty when the collection reports no points.
    """
    total = qdrant.client.get_collection(qdrant.collection).points_count
    if not total:
        # Empty collection (or unknown count): skip the query rather than
        # sending a limit of 0 / None.
        return []
    query_response = qdrant.client.query_points(
        qdrant.collection,
        limit=total,
        with_payload=False,
    )
    return [point.id for point in query_response.points]
@app.cell
def _(CHUNKS: int, CONTENT_KEY: str):
    def select_random_points(points: list):
        """Pick random point ids and fetch their text from the collection.

        Ids are drawn without replacement, so a question is built from up to
        ``CHUNKS`` distinct chunks (fewer only if ``points`` is shorter).

        Args:
            points: Candidate point ids.

        Returns:
            A list of ``(point_id, text)`` tuples, where ``text`` is the
            payload value stored under ``CONTENT_KEY``.

        Raises:
            ValueError: If ``points`` is empty.
            KeyError: If a retrieved payload has no ``CONTENT_KEY`` entry.
        """
        if not points:
            raise ValueError("cannot select chunks from an empty list of point ids")
        chosen_ids = random.sample(points, k=min(CHUNKS, len(points)))
        records = qdrant.client.retrieve(
            qdrant.collection,
            ids=chosen_ids,
        )
        return [(record.id, record.payload[CONTENT_KEY]) for record in records]

    return (select_random_points,)
@app.function
def format_points_into_context(points):
    """Render each point with ``FORMAT_TEMPLATE`` and join the results.

    Args:
        points: Iterable of indexable entries; item 0 is used as the document
            id and item 1 as the document content.

    Returns:
        The rendered documents joined by a newline.
    """
    return "\n".join(
        FORMAT_TEMPLATE.format(id=entry[0], content=entry[1]) for entry in points
    )
@app.function
def generate_synthetic_questions(prompt):
    """Send ``prompt`` to the shared Gemini client and return its response.

    The request carries the module-level ``response_schema``.
    """
    return gemini.generate(prompt, response_schema=response_schema)
@app.cell
def _(QUESTION_TYPE: str, TYPE_DEFINITION: str, select_random_points):
    def generate_questions(amount: int):
        """Generate ``amount`` synthetic questions of the selected type.

        For each question a fresh random set of chunks is selected, formatted
        into the prompt and sent to the model; the JSON reply is parsed and
        tagged with the question type.

        Args:
            amount: Number of questions to generate.

        Returns:
            A list of dicts parsed from the model replies, each with an added
            ``"type"`` key holding ``QUESTION_TYPE``.

        Raises:
            json.JSONDecodeError: If a model reply is not valid JSON.
        """
        # The id list does not change between iterations, so fetch it once
        # instead of re-reading the whole collection for every question.
        point_ids = get_point_ids()

        results = []
        for _ in mo.status.progress_bar(range(amount), remove_on_exit=True):
            selected_points = select_random_points(point_ids)
            context = format_points_into_context(selected_points)
            prompt = PROMPT_TEMPLATE.format(
                context=context, qtype=QUESTION_TYPE, qtype_def=TYPE_DEFINITION
            )
            questions = generate_synthetic_questions(prompt)
            result = json.loads(questions.text)
            result["type"] = QUESTION_TYPE
            results.append(result)
        return results

    return (generate_questions,)
@app.cell
def _(AMOUNT: int, generate_questions):
    # Run the generation for the requested number of questions; `results` is
    # the input of the cell that builds the DataFrame.
    results = generate_questions(AMOUNT)
    return (results,)
@app.cell
def _(results):
    # polars is imported here, in the only cell that uses it.
    import polars as pl

    # Build a DataFrame from the generated question records.
    pl.from_records(results)
    return
# Run the notebook app when this file is executed as a script.
if __name__ == "__main__":
    app.run()