First commit

This commit is contained in:
Anibal Angulo
2026-02-18 19:57:43 +00:00
commit a53f8fcf62
115 changed files with 9957 additions and 0 deletions

28
apps/synth-gen/README.md Normal file
View File

@@ -0,0 +1,28 @@
# Synthetic Question Generator
This application generates a set of synthetic questions from documents stored in Google Cloud Storage (GCS) and saves them to a local CSV file. For each document, it generates one question for each predefined question type (Factual, Summarization, etc.).
The output CSV is structured for easy uploading to a BigQuery table with the following schema: `input` (STRING), `expected_output` (STRING), `source` (STRING), `type` (STRING).
## Usage
The script is run from the command line. You need to provide the path to the source documents within your GCS bucket and a path for the output CSV file.
### Command
```bash
uv run python -m synth_gen.main [OPTIONS] GCS_PATH
```
### Arguments
* `GCS_PATH`: (Required) The path to the directory in your GCS bucket where the source markdown files are located (e.g., `documents/markdown/`).
* `--output-csv, -o`: (Optional) The local file path where the generated questions will be saved in CSV format; if omitted, results are uploaded to BigQuery instead.
### Example
```bash
uv run python -m synth_gen.main documents/processed/ --output-csv synthetic_questions.csv
```
This command will fetch all documents from the `gs://<your-bucket-name>/documents/processed/` directory, generate questions for each, and save them to a file named `synthetic_questions.csv` in the current directory.

View File

@@ -0,0 +1,22 @@
[project]
name = "synth-gen"
version = "0.1.0"
description = "Synthetic question generator for building RAG evaluation datasets"
readme = "README.md"
authors = [
{ name = "Anibal Angulo", email = "a8065384@banorte.com" }
]
requires-python = ">=3.12"
dependencies = [
"llm",
]
[project.scripts]
synth-gen = "synth_gen.main:app"
[build-system]
requires = ["uv_build>=0.8.3,<0.9.0"]
build-backend = "uv_build"
[tool.uv.sources]
llm = { workspace = true }

View File

@@ -0,0 +1,2 @@
def main() -> None:
    """Placeholder entry point: print a greeting from the synth-gen package."""
    greeting = "Hello from synth-gen!"
    print(greeting)

View File

@@ -0,0 +1,349 @@
import datetime
import os
import random
from typing import Annotated, Any, Dict, List
import pandas as pd
import typer
from file_storage.google_cloud import GoogleCloudFileStorage
from llm.vertex_ai import VertexAILLM
from pydantic import BaseModel
from rich.console import Console
from rich.progress import track
from rag_eval.config import Settings
# --- Configuration ---
# Spanish prompt for single-question generation. Placeholders filled via
# str.format:
#   {qtype} / {qtype_def} — question-type label and its definition
#   {context}             — full text of the reference document(s)
# The template is model input at runtime; its Spanish text must not be edited
# or translated casually.
PROMPT_TEMPLATE = """
Eres un experto en generación de preguntas sintéticas. Tu tarea es crear preguntas sintéticas en español basadas en documentos de referencia proporcionados.
## INSTRUCCIONES:
### Requisitos obligatorios:
1. **Idioma**: La pregunta DEBE estar completamente en español
2. **Basada en documentos**: La pregunta DEBE poder responderse ÚNICAMENTE con la información contenida en los documentos proporcionados
3. **Tipo de pregunta**: Sigue estrictamente la definición del tipo de pregunta especificado
4. **Identificación de fuentes**: Incluye el ID de fuente de todos los documentos necesarios para responder la pregunta
5. **Salida esperada**: Incluye la respuesta perfecta basada en los documentos necesarios para responder la pregunta
### Tono de pregunta:
La pregunta debe ser similar a la que haría un usuario sin contexto sobre el sistema o la información disponible. Ingenuo y curioso.
### Tipo de pregunta solicitado:
**Tipo**: {qtype}
**Definición**: {qtype_def}
### Documentos de referencia:
{context}
Por favor, genera una pregunta siguiendo estas instrucciones.
""".strip()
# JSON-schema mirror of the ResponseSchema pydantic model below. It is not
# referenced anywhere in this module (structured_generation receives the
# pydantic model instead) — presumably kept for APIs that need a raw schema
# dict; TODO(review): confirm before removing.
RESPONSE_SCHEMA = {
    "type": "object",
    "properties": {
        "pregunta": {
            "type": "string",
        },
        "expected_output": {
            "type": "string",
        },
        "ids": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["pregunta", "expected_output", "ids"],
}
class ResponseSchema(BaseModel):
    # Structured LLM output for single-question generation. Field names are
    # part of the model contract (and mirror RESPONSE_SCHEMA); do not rename.
    pregunta: str  # the generated question, in Spanish
    expected_output: str  # ideal answer grounded in the reference documents
    ids: List[str]  # source-document ids needed to answer the question
class Turn(BaseModel):
    # One user/assistant exchange within a generated multi-turn conversation.
    pregunta: str  # user question, in Spanish
    expected_output: str  # ideal assistant answer grounded in the document
class MultiStepResponseSchema(BaseModel):
    # Structured LLM output for multi-turn generation: ordered list of turns.
    conversation: List[Turn]
# Spanish prompt for multi-turn conversation generation. Placeholders filled
# via str.format:
#   {num_turns} — exact number of question/answer turns requested
#   {context}   — full text of the reference document(s)
# The template is model input at runtime; its Spanish text must not be edited
# or translated casually.
MULTI_STEP_PROMPT_TEMPLATE = """
Eres un experto en la generación de conversaciones sintéticas. Tu tarea es crear una conversación en español con múltiples turnos basada en los documentos de referencia proporcionados.
## INSTRUCCIONES:
### Requisitos obligatorios:
1. **Idioma**: La conversación DEBE estar completamente en español.
2. **Basada en documentos**: Todas las respuestas DEBEN poder responderse ÚNICAMENTE con la información contenida en los documentos de referencia.
3. **Número de turnos**: La conversación debe tener exactamente {num_turns} turnos. Un turno consiste en una pregunta del usuario y una respuesta del asistente.
4. **Flujo conversacional**: Las preguntas deben seguir un orden lógico, como si un usuario estuviera explorando un tema paso a paso. La segunda pregunta debe ser una continuación de la primera, y así sucesivamente.
5. **Salida esperada**: Proporciona la respuesta perfecta para cada pregunta, basada en los documentos de referencia.
### Tono de las preguntas:
Las preguntas deben ser similares a las que haría un usuario sin contexto sobre el sistema o la información disponible. Deben ser ingenuas y curiosas.
### Documentos de referencia:
{context}
Por favor, genera una conversación de {num_turns} turnos siguiendo estas instrucciones.
""".strip()
# Question taxonomy: label -> definition text interpolated into the prompt
# as {qtype_def}. Fixed garbled apostrophes ("companys" -> "company's",
# "RAGs" -> "RAG's") and the fused "adocument" -> "a document" — artifacts of
# lost right single quotes / whitespace in the original paste.
QUESTION_TYPE_MAP = {
    "Factual": "Questions targeting specific details within a reference (e.g., a company's profit in a report, a verdict in a legal case, or symptoms in a medical record) to test RAG's retrieval accuracy.",
    "Summarization": "Questions that require comprehensive answers, covering all relevant information, to mainly evaluate the recall rate of RAG retrieval.",
    "Multi-hop Reasoning": "Questions involve logical relationships among events and details within a document, forming a reasoning chain to assess RAG's logical reasoning ability.",
    "Unanswerable": "Questions arise from potential information loss during the schema-to-article generation, where no corresponding information fragment exists, or the information is insufficient for an answer.",
}
# --- Core Logic ---
def generate_synthetic_question(
    llm: VertexAILLM, file_content: str, file_path: str, q_type: str, q_def: str, language_model: str
) -> ResponseSchema:
    """Generate a single synthetic question for one document via the LLM.

    Args:
        llm: Vertex AI client used for structured generation.
        file_content: Full text of the source document (prompt context).
        file_path: Storage path of the document. Not referenced by the
            prompt template; kept for interface symmetry with
            generate_synthetic_conversation.
        q_type: Question-type label (a key of QUESTION_TYPE_MAP).
        q_def: Definition text for that question type.
        language_model: Model identifier passed to Vertex AI.

    Returns:
        The parsed ResponseSchema (question, expected answer, source ids).
        (The previous ``Dict[str, Any]`` annotation was wrong: callers access
        ``.pregunta`` / ``.expected_output`` attributes on the result.)
    """
    # The template only defines {context}, {qtype} and {qtype_def}; the old
    # `id=file_path` kwarg was silently ignored by str.format and is dropped.
    prompt = PROMPT_TEMPLATE.format(
        context=file_content, qtype=q_type, qtype_def=q_def
    )
    return llm.structured_generation(
        model=language_model,
        prompt=prompt,
        response_model=ResponseSchema,
    )
def generate_synthetic_conversation(
    llm: VertexAILLM,
    file_content: str,
    file_path: str,
    num_turns: int,
    language_model: str,
) -> MultiStepResponseSchema:
    """Ask the LLM for a multi-turn synthetic conversation grounded in one document.

    ``file_path`` is accepted for symmetry with the single-question generator
    but is not referenced by the conversation prompt template.
    """
    filled_prompt = MULTI_STEP_PROMPT_TEMPLATE.format(
        context=file_content,
        num_turns=num_turns,
    )
    result = llm.structured_generation(
        model=language_model,
        prompt=filled_prompt,
        response_model=MultiStepResponseSchema,
    )
    return result
# Typer application object; `main` below is registered as its only command
# and pyproject exposes it as the `synth-gen` console script.
app = typer.Typer()
def generate(
    num_questions: int,
    output_csv: str | None = None,
    num_turns: int = 1,
) -> str:
    """
    Core logic for generating a specified number of synthetic questions.

    Workflow: sample up to ``num_questions`` files from the configured GCS
    index, generate one question (or, when ``num_turns`` > 1, one
    ``num_turns``-long conversation) per file with up to 3 retries each, then
    save all rows to ``output_csv`` when provided, otherwise append them to
    the configured BigQuery table.

    Returns:
        The run id (UTC timestamp string) on success, or "" when no index is
        configured, no files are found, or nothing was generated.

    Raises:
        typer.Exit: with code 1 when the BigQuery upload fails.
    """
    console = Console()
    settings = Settings()
    llm = VertexAILLM(project=settings.project_id, location=settings.location)
    storage = GoogleCloudFileStorage(bucket=settings.bucket)
    # Run id doubles as a grouping key for every row produced by this call.
    run_id = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d-%H%M%S")
    console.print(f"[bold yellow]Generated Run ID: {run_id}[/bold yellow]")
    all_rows = []
    if not settings.index:
        console.print("[yellow]Skipping as no index is configured.[/yellow]")
        return ""
    gcs_path = f"{settings.index.name}/contents/"
    console.print(f"[green]Fetching files from GCS path: {gcs_path}[/green]")
    try:
        all_files = storage.list_files(path=gcs_path)
        console.print(f"Found {len(all_files)} total files to process.")
    except Exception as e:
        # Listing failure is non-fatal for the caller: log and bail out.
        console.print(f"[bold red]Error listing files: {e}[/bold red]")
        return ""
    if not all_files:
        console.print("[yellow]No files found. Skipping.[/yellow]")
        return ""
    # One question/conversation per sampled file; never more than exist.
    files_to_process = random.sample(
        all_files, k=min(num_questions, len(all_files))
    )
    console.print(
        f"Randomly selected {len(files_to_process)} files to generate questions from."
    )
    for file_path in track(files_to_process, description="Generating questions..."):
        try:
            # utf-8-sig strips a leading BOM if the file came from Windows tools.
            file_content = storage.get_file_stream(file_path).read().decode("utf-8-sig")
            # NOTE(review): q_type/q_def are only consumed by the single-turn
            # branch below, yet they are sampled for multi-turn runs too.
            q_type, q_def = random.choice(list(QUESTION_TYPE_MAP.items()))
            if num_turns > 1:
                conversation_data = None
                for attempt in range(3): # Retry up to 3 times
                    conversation_data = generate_synthetic_conversation(
                        llm,
                        file_content,
                        file_path,
                        num_turns,
                        settings.agent.language_model,
                    )
                    # Accept only a conversation with exactly the requested
                    # number of turns.
                    if (
                        conversation_data
                        and conversation_data.conversation
                        and len(conversation_data.conversation) == num_turns
                    ):
                        break # Success
                    console.print(
                        f"[yellow]Failed to generate valid conversation for {os.path.basename(file_path)}. Retrying ({attempt + 1}/3)...[/yellow]"
                    )
                    conversation_data = None
                if not conversation_data:
                    console.print(
                        f"[bold red]Failed to generate valid conversation for {os.path.basename(file_path)} after 3 attempts. Skipping.[/bold red]"
                    )
                    continue
                # Random 5-digit id ties the turns of one conversation together.
                conversation_id = str(random.randint(10000, 99999))
                for i, turn in enumerate(conversation_data.conversation):
                    row = {
                        "input": turn.pregunta,
                        "expected_output": turn.expected_output,
                        # Source column is the file name without its extension.
                        "source": os.path.splitext(os.path.basename(file_path))[0],
                        "type": "Multi-turn",
                        "agent": settings.agent.name,
                        "run_id": run_id,
                        "conversation_id": conversation_id,
                        "turn": i + 1,  # 1-based turn index
                    }
                    all_rows.append(row)
            else: # Single turn generation
                generated_data = None
                for attempt in range(3): # Retry up to 3 times
                    generated_data = generate_synthetic_question(
                        llm,
                        file_content,
                        file_path,
                        q_type,
                        q_def,
                        settings.agent.language_model,
                    )
                    # Reject blank answers; whitespace-only counts as empty.
                    if (
                        generated_data
                        and generated_data.expected_output
                        and generated_data.expected_output.strip()
                    ):
                        break # Success, exit retry loop
                    console.print(
                        f"[yellow]Empty answer for {q_type} on {os.path.basename(file_path)}. Retrying ({attempt + 1}/3)...[/yellow]"
                    )
                    generated_data = None # Reset to indicate failure
                if not generated_data:
                    console.print(
                        f"[bold red]Failed to generate valid answer for {q_type} on {os.path.basename(file_path)} after 3 attempts. Skipping.[/bold red]"
                    )
                    continue
                row = {
                    "input": generated_data.pregunta,
                    "expected_output": generated_data.expected_output,
                    # Source column is the file name without its extension.
                    "source": os.path.splitext(os.path.basename(file_path))[0],
                    "type": q_type,
                    "agent": settings.agent.name,
                    "run_id": run_id,
                }
                all_rows.append(row)
        except Exception as e:
            # Best-effort: a failing file is logged and skipped, never fatal.
            console.print(f"[bold red]Error processing file {file_path}: {e}[/bold red]")
    if not all_rows:
        console.print("[bold yellow]No questions were generated.[/bold yellow]")
        return ""
    df = pd.DataFrame(all_rows)
    if output_csv:
        console.print(
            f"\n[bold green]Saving {len(df)} generated questions to {output_csv}...[/bold green]"
        )
        # utf-8-sig so Excel opens accented Spanish text correctly.
        df.to_csv(output_csv, index=False, encoding="utf-8-sig")
        console.print("[bold green]Synthetic question generation complete.[/bold green]")
    else:
        console.print(
            f"\n[bold green]Saving {len(df)} generated questions to BigQuery...[/bold green]"
        )
        project_id = settings.bigquery.project_id or settings.project_id
        dataset_id = settings.bigquery.dataset_id
        table_name = settings.bigquery.table_ids["synth_gen"]
        table_id = f"{project_id}.{dataset_id}.{table_name}"
        console.print(f"Saving to BigQuery table: [bold cyan]{table_id}[/bold cyan]")
        try:
            # Ensure new columns exist for all rows before upload
            if "conversation_id" not in df.columns:
                df["conversation_id"] = None
            if "turn" not in df.columns:
                df["turn"] = None
            # NOTE(review): DataFrame.to_gbq is deprecated in recent pandas in
            # favor of pandas_gbq.to_gbq — confirm against the pinned version.
            df.to_gbq(
                destination_table=f"{dataset_id}.{table_name}",
                project_id=project_id,
                if_exists="append",
            )
            console.print(
                f"Successfully saved {len(df)} rows to [bold green]{table_id}[/bold green]"
            )
        except Exception as e:
            console.print(
                f"[bold red]An error occurred while saving to BigQuery: {e}[/bold red]"
            )
            raise typer.Exit(code=1)
    console.print(f"[bold yellow]Finished run with ID: {run_id}[/bold yellow]")
    return run_id
@app.command()
def main(
    num_questions: Annotated[
        int,
        typer.Option(
            "--num-questions", "-n", help="Number of questions to generate."
        ),
    ] = 10,
    # Annotation fixed to `str | None` — the option defaults to None, which
    # the plain `str` annotation did not admit.
    output_csv: Annotated[
        str | None,
        typer.Option(
            "--output-csv", "-o", help="Optional: Path to save the output CSV file."
        ),
    ] = None,
    num_turns: Annotated[
        int,
        typer.Option(
            "--num-turns",
            "-t",
            help="Number of conversational turns to generate.",
        ),
    ] = 1,
):
    """
    Generates a specified number of synthetic questions and saves them to BigQuery (default) or a local CSV file.
    """
    # Thin CLI shim: all real work happens in generate().
    generate(
        num_questions=num_questions, output_csv=output_csv, num_turns=num_turns
    )
# Allow running this module directly (e.g. `python -m synth_gen.main`).
if __name__ == "__main__":
    app()