First commit

This commit is contained in:
Anibal Angulo
2026-02-18 19:57:43 +00:00
commit a53f8fcf62
115 changed files with 9957 additions and 0 deletions

View File

@@ -0,0 +1,30 @@
# Keypoint Evaluator
This application evaluates a RAG (Retrieval-Augmented Generation) system based on the keypoint methodology from the RAGEval paper.
## How to use
To run the evaluation, execute the following command from the root directory of the project:
```bash
keypoint-eval --input-file <PATH_TO_EVALUATION_FILE> --output-file <PATH_TO_OUTPUT_CSV>
```
### Arguments
* `--input-file`, `-i`: Path to a local CSV or SQLite file with the evaluation data. If omitted, data is loaded from BigQuery.
* `--output-file`, `-o`: Path to save the output CSV file. If omitted, results are saved to BigQuery. Other options: `--run-id` filters the evaluation data by a specific run identifier, and `--agent-name` (`-a`) selects a specific agent to run (use `dialogflow` for the Dialogflow agent).
The application will read the evaluation data from the specified file (or from BigQuery) and will save the evaluation results to the given CSV file or to BigQuery.
## Input File Structure
The input file can be a CSV, Excel, or JSON file.
The file must contain the following columns:
* `input`: The user's question.
* `expected_output`: The ground truth or expected answer.
* `category` (optional): The category of the question.
If the `input` column is not found, the application will look for columns containing "pregunta" or "question".

View File

@@ -0,0 +1,17 @@
[project]
name = "keypoint-eval"
version = "0.1.0"
description = "Keypoint-based RAG evaluation following the RAGEval methodology"
readme = "README.md"
authors = [
{ name = "Anibal Angulo", email = "a8065384@banorte.com" }
]
requires-python = ">=3.12"
dependencies = []
[project.scripts]
keypoint-eval = "keypoint_eval.cli:app"
[build-system]
requires = ["uv_build>=0.8.3,<0.9.0"]
build-backend = "uv_build"

View File

@@ -0,0 +1,2 @@
def main() -> None:
    """Print the package greeting to standard output."""
    greeting = "Hello from keypoint-eval!"
    print(greeting)

View File

@@ -0,0 +1,58 @@
import warnings
from typing import Annotated

import typer

from .main import run_keypoint_evaluation

# Silence noisy third-party deprecation warnings so they don't pollute CLI output.
warnings.filterwarnings("ignore")

app = typer.Typer(name="keypoint-eval")


@app.command()
def main(
    # FIX: input_file/output_file default to None, so the annotation must be
    # `str | None` (it was `str`), matching run_id/agent_name below.
    input_file: Annotated[
        str | None,
        typer.Option(
            "--input-file",
            "-i",
            help="Path to a local CSV or SQLite file for evaluation data. "
            "If not provided, data will be loaded from BigQuery.",
        ),
    ] = None,
    output_file: Annotated[
        str | None,
        typer.Option(
            "--output-file",
            "-o",
            help="Optional: Path to save the output CSV file. "
            "If not provided, results will be saved to BigQuery.",
        ),
    ] = None,
    run_id: Annotated[
        str | None,
        typer.Option(
            help="Optional: The specific run_id to filter the evaluation data by."
        ),
    ] = None,
    agent_name: Annotated[
        str | None,
        typer.Option(
            "-a",
            "--agent-name",
            help="Optional: The name of a specific agent to run. Use 'dialogflow' to run the Dialogflow agent.",
        ),
    ] = None,
) -> None:
    """CLI for running keypoint-based evaluation."""
    # Thin wrapper: all real work happens in run_keypoint_evaluation().
    run_keypoint_evaluation(
        input_file=input_file,
        output_file=output_file,
        run_id=run_id,
        agent_name=agent_name,
    )


if __name__ == "__main__":
    app()

View File

@@ -0,0 +1,330 @@
from datetime import datetime
from typing import Literal
from llm.vertex_ai import VertexAILLM
from pydantic import BaseModel, Field
from rich.console import Console
from rich.panel import Panel
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn
from rich.text import Text
from rag_eval.config import settings
class KeypointMetricPrompt(BaseModel):
    """Prompt definition for a keypoint-based metric."""

    name: str  # metric identifier
    description: str  # human-readable explanation of what the metric measures
    template: str  # prompt template text sent to the LLM
class KeyPointResponse(BaseModel):
    """Structured LLM output: keypoints extracted from a reference answer."""

    keypoints: list[str]
class KeyPointEval(BaseModel):
    """Classification of a single keypoint against a generated answer."""

    keypoint: str  # the keypoint being evaluated
    analysis: str  # the model's reasoning for the chosen category
    # "relevant": covered correctly; "irrelevant": missing or buried in fluff;
    # "incorrect": contradicted by the generated answer.
    category: Literal["relevant", "irrelevant", "incorrect"]
class KeyPointEvalList(BaseModel):
    """Collection of per-keypoint evaluations with per-category tallies."""

    evals: list[KeyPointEval]

    def _count(self, category: str) -> int:
        """Return how many evaluations fall into *category*."""
        matching = [item for item in self.evals if item.category == category]
        return len(matching)

    def count_relevant(self) -> int:
        """Number of keypoints classified as relevant."""
        return self._count("relevant")

    def count_irrelevant(self) -> int:
        """Number of keypoints classified as irrelevant."""
        return self._count("irrelevant")

    def count_incorrect(self) -> int:
        """Number of keypoints classified as incorrect."""
        return self._count("incorrect")

    def keypoint_details(self) -> list[dict]:
        """Serialize every evaluation into a plain dict."""
        details = []
        for item in self.evals:
            details.append(item.model_dump())
        return details
class ConcisenessScore(BaseModel):
    """Structured LLM output holding a single conciseness score."""

    score: float = Field(
        description="A score from 0.0 to 1.0 evaluating the conciseness of the answer."
    )
class KeypointRAGEvaluator:
    """
    RAG-system evaluator based on the keypoint methodology of the RAGEval paper.

    Focuses on 3 main metrics:
    - Completeness: how well the answer captures the key points of the ideal answer
    - Hallucination: identification of content that contradicts the key points
    - Irrelevance: proportion of key points that are neither covered nor contradicted
    """

    def __init__(self, console: Console, model: str = "gemini-2.0-flash"):
        # Accumulates the result dict of every evaluate_rag_pipeline() call.
        self.metrics_results = []
        self.console = console
        # Vertex AI client; project/location come from the shared settings object.
        self.llm = VertexAILLM(project=settings.project_id, location=settings.location)
        self.model = model

    def evaluate_conciseness(self, query: str, answer: str) -> float:
        """Evaluates the conciseness of a generated answer."""
        prompt = f"""Evaluate the conciseness of the following generated answer in response to the user's query.
The score should be a single float from 0.0 to 1.0, where 1.0 is perfectly concise and direct, and 0.0 is extremely verbose and full of conversational fluff.
Only consider the conciseness, not the correctness of the answer.
User Query: {query}
Generated Answer: {answer}
"""
        try:
            response = self.llm.structured_generation(
                model=self.model,
                prompt=prompt,
                response_model=ConcisenessScore,
                system_prompt="You are an expert evaluator focused on the conciseness and directness of answers. You output a single float score and nothing else.",
            )
            return response.score
        except Exception as e:
            self.console.print(
                f"[bold red]Error during conciseness evaluation: {str(e)}[/bold red]"
            )
            return 0.0  # Return a neutral score in case of error

    def extract_keypoints(self, question: str, ground_truth: str) -> list[str]:
        """
        Extracts key points (keypoints) from the reference answer.

        Args:
            question: User question
            ground_truth: Ideal or reference answer

        Returns:
            List of extracted key points

        Raises:
            Exception: re-raised after logging if the LLM call fails.
        """
        # NOTE(review): the original (Spanish) docstring also claimed that
        # standard Banorte-domain keypoints are appended, but this method only
        # returns what the LLM extracts — confirm intended behavior.
        prompt = f"""En esta tarea, se te dará una pregunta y una respuesta ideal. Basado en la respuesta ideal,
necesitas resumir los puntos clave necesarios para responder la pregunta.
<ejemplo>
<pregunta>
Cómo puedo sacar un adelanto de nómina?
</pregunta>
<respuesta>
¡Hola! 👋 Sacar un Adelanto de Nómina con Banorte es muy fácil y
puede ayudarte con liquidez al instante. Aquí te explico cómo
funciona:
Es un monto de hasta $10,000 MXN que puedes usar para lo que
necesites, sin intereses y con una comisión fija del 7%. Lo puedes
contratar directamente desde la aplicación móvil de Banorte. Los
pagos se ajustan a la frecuencia de tu nómina y se cargan
automáticamente a tu cuenta.
Los principales requisitos son:
* Recibir tu nómina en Banorte y no tener otro adelanto vigente.
* Tener un ingreso neto mensual mayor a $2,000 MXN.
* Tener entre 18 y 74 años con 11 meses.
* Contar con un buen historial en Buró de Crédito.
¡Espero que esta información te sea muy útil! 😊
</respuesta>
<puntos clave>
[
"Recibir tu nómina en Banorte",
"No tener otro adelanto vigente",
"Tener entre 18 y 74 años con 11 meses",
"Contar con buen historial en Buró de Crédito",
]
</puntos clave>
</ejemplo>
<real>
<pregunta>
{question}
</pregunta>
<respuesta>
{ground_truth}
</respuesta>
</real>
"""
        try:
            response = self.llm.structured_generation(
                model=self.model,
                prompt=prompt,
                response_model=KeyPointResponse,
                system_prompt="Eres un asistente experto en extraer puntos clave informativos de respuestas.",
            )
            return response.keypoints
        except Exception as e:
            self.console.print(
                f"[bold red]Error al extraer keypoints: {str(e)}[/bold red]"
            )
            raise

    def evaluate_keypoints(
        self,
        generated_answer: str,
        keypoints: list[str],
    ) -> tuple[dict[str, float], list[dict]]:
        """
        Evaluates a generated answer against the extracted key points.

        Args:
            generated_answer: Answer generated by the RAG system
            keypoints: List of key points from the ideal answer

        Returns:
            Dictionary with the metric scores and a detailed list with the
            classification of each keypoint

        Raises:
            Exception: re-raised after logging if the LLM call fails.
        """
        # The prompt (Spanish, sent verbatim to the LLM) asks for one
        # Relevante / Irrelevante / Incorrecto verdict per keypoint.
        prompt = f"""En esta tarea, recibirás una respuesta real y múltiples puntos clave
extraídos de una respuesta ideal. Tu objetivo es evaluar la calidad y concisión de la respuesta generada.
Para cada punto clave, proporciona un breve análisis y concluye con una de las siguientes clasificaciones:
[[[ Relevante ]]] - La respuesta generada aborda el punto clave de manera precisa, correcta y directa. La información es fácil de encontrar y no está oculta por un exceso de texto innecesario o "fluff" conversacional (saludos, despedidas, jerga, etc.).
[[[ Irrelevante ]]] - La respuesta generada omite por completo el punto clave o no contiene ninguna información relacionada con él. También se considera Irrelevante si la información del punto clave está presente, pero tan oculta por el "fluff" que un usuario tendría dificultades para encontrarla.
[[[ Incorrecto ]]] - La respuesta generada contiene información relacionada con el punto clave pero es incorrecta, contradice el punto clave, o podría confundir o desinformar al usuario.
**Criterio de Evaluación:**
Sé estricto con el "fluff". Una respuesta ideal es tanto correcta como concisa. El exceso de texto conversacional que no aporta valor a la respuesta debe penalizarse. Si la información clave está presente pero la respuesta es innecesariamente larga y verbosa, considera rebajar su clasificación de Relevante a Irrelevante.
Respuesta Generada: {generated_answer}
Puntos Clave de la Respuesta ideal:
{"\n".join([f"{i + 1}. {kp}" for i, kp in enumerate(keypoints)])}
"""
        try:
            response = self.llm.structured_generation(
                model=self.model,
                prompt=prompt,
                response_model=KeyPointEvalList,
                system_prompt="Eres un evaluador experto de respuestas basadas en puntos clave, capaz de detectar si la información es relevante, irrelevante o incorrecta. Adoptas una postura favorable cuando evalúas la utilidad de las respuestas para los usuarios.",
            )
            relevant_count = response.count_relevant()
            irrelevant_count = response.count_irrelevant()
            incorrect_count = response.count_incorrect()
            total_keypoints = len(keypoints)
            # Each metric is the fraction of keypoints in that category;
            # guard against division by zero when no keypoints were extracted.
            completeness = (
                relevant_count / total_keypoints if total_keypoints > 0 else 0
            )
            hallucination = (
                incorrect_count / total_keypoints if total_keypoints > 0 else 0
            )
            irrelevance = (
                irrelevant_count / total_keypoints if total_keypoints > 0 else 0
            )
            keypoint_details = response.keypoint_details()
            metrics = {
                "completeness": completeness,
                "hallucination": hallucination,
                "irrelevance": irrelevance,
            }
            return metrics, keypoint_details
        except Exception as e:
            self.console.print(
                f"[bold red]Error al evaluar keypoints: {str(e)}[/bold red]"
            )
            raise

    def evaluate_rag_pipeline(
        self,
        query: str,
        response: str,
        ground_truth: str,
        retrieved_contexts: list[str],
        verbose: bool = True,
    ) -> dict:
        """
        Evaluates a RAG pipeline using the keypoint methodology.

        Args:
            query: User question
            response: Answer generated by the RAG system
            ground_truth: Ideal or reference answer
            retrieved_contexts: Contexts retrieved to generate the answer
                (stored in the result dict only; not used in the scoring)
            verbose: Whether to print evaluation details

        Returns:
            Dictionary with the evaluation results (also appended to
            ``self.metrics_results``)

        Raises:
            Exception: re-raised after logging if any evaluation step fails.
        """
        try:
            if verbose:
                self.console.print(
                    Panel(
                        Text(
                            f"Question: {query}\n\nAnswer: {response}", justify="left"
                        ),
                        title="[bold blue]Evaluating[/bold blue]",
                        border_style="blue",
                    )
                )
            # Two-step progress: keypoint extraction, then keypoint evaluation.
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                transient=True,
                console=self.console,
                disable=not verbose,
            ) as progress:
                task = progress.add_task("Evaluation", total=2)
                progress.update(task, description="Extracting keypoints...")
                keypoints = self.extract_keypoints(query, ground_truth)
                progress.advance(task)
                if verbose:
                    self.console.print(
                        f"\nSe han extraído {len(keypoints)} puntos clave:"
                    )
                    for i, kp in enumerate(keypoints):
                        self.console.print(f"{i + 1}. {kp}")
                progress.update(task, description="Evaluating keypoints...")
                metrics, keypoint_details = self.evaluate_keypoints(response, keypoints)
                progress.advance(task)
            results = {
                "query": query,
                "response": response,
                "ground_truth": ground_truth,
                "retrieved_contexts": retrieved_contexts,
                "completeness": metrics["completeness"],
                "hallucination": metrics["hallucination"],
                "irrelevance": metrics["irrelevance"],
                "keypoints": keypoints,
                "keypoint_details": keypoint_details,
                "timestamp": datetime.now(),
            }
            if verbose:
                self.console.print("\nResultados de la evaluación:")
                self.console.print(f"Completeness: {metrics['completeness']:.3f}")
                self.console.print(f"Hallucination: {metrics['hallucination']:.3f}")
                self.console.print(f"Irrelevance: {metrics['irrelevance']:.3f}")
                self.console.print("\nDetalles de la evaluación por punto clave:")
                for i, detail in enumerate(keypoint_details):
                    self.console.print(f"\nKeypoint {i + 1}: {detail['keypoint']}")
                    self.console.print(f"Categoría: {detail['category']}")
            self.metrics_results.append(results)
            return results
        except Exception as e:
            self.console.print(f"[bold red]Error en la evaluación: {str(e)}[/bold red]")
            raise

View File

@@ -0,0 +1,132 @@
import pathlib
import sqlite3
import pandas as pd
from google.cloud import bigquery
from rich.console import Console
from rag_eval.config import settings as config
def load_data_from_local_file(
    file_path: str, console: Console, run_id: str | None = None
) -> pd.DataFrame:
    """Loads evaluation data from a local CSV or SQLite file and returns a DataFrame.

    Args:
        file_path: Path to a ``.csv``, ``.db`` or ``.sqlite`` file. For SQLite
            files the table name is assumed to be the file stem.
        console: Rich console used for status/warning output.
        run_id: Optional run identifier; when given and a ``run_id`` column
            exists, rows are filtered to that run.

    Returns:
        DataFrame with at least ``input`` and ``expected_output`` columns,
        plus an added ``agent`` column taken from the configuration. Rows of
        type ``Unanswerable`` and rows with missing input/expected_output are
        dropped.

    Raises:
        Exception: If the file is missing, unreadable, of an unsupported type,
            or lacks the required columns.
    """
    console.print(f"Loading data from {file_path}...")
    path = pathlib.Path(file_path)
    if not path.exists():
        raise Exception(f"Error: File not found at {file_path}")
    if path.suffix == ".csv":
        try:
            df = pd.read_csv(path)
        except Exception as e:
            raise Exception(f"An error occurred while reading the CSV file: {e}") from e
    elif path.suffix in [".db", ".sqlite"]:
        try:
            # Assuming table name is the file stem
            table_name = path.stem
            con = sqlite3.connect(path)
            try:
                # Quote the identifier: file stems may contain characters that
                # would otherwise break (or inject into) the SQL statement.
                df = pd.read_sql(f'SELECT * FROM "{table_name}"', con)
            finally:
                # FIX: the original leaked the connection when read_sql raised.
                con.close()
        except Exception as e:
            raise Exception(f"An error occurred while reading the SQLite DB: {e}") from e
    else:
        raise Exception(
            f"Unsupported file type: {path.suffix}. Please use .csv or .db/.sqlite"
        )
    # Check for required columns
    if (
        "input" not in df.columns
        or "expected_output" not in df.columns
    ):
        raise Exception(
            "Error: The input file must contain 'input' and 'expected_output' columns."
        )
    df["agent"] = config.agent.name
    # (Removed leftover debug `print(f"{run_id=}")` that bypassed the console.)
    if run_id:
        if "run_id" in df.columns:
            df = df[df["run_id"] == run_id].copy()
            console.print(f"Filtered data for run_id: {run_id}")
            if df.empty:
                console.print(
                    f"[yellow]Warning: No data found for run_id '{run_id}' in {file_path}.[/yellow]"
                )
        else:
            console.print(
                f"[yellow]Warning: --run-id provided, but 'run_id' column not found in {file_path}. Using all data.[/yellow]"
            )
    # Filter out unanswerable questions if 'type' column exists
    if "type" in df.columns:
        df = df[df["type"] != "Unanswerable"].copy()
    df.dropna(subset=["input", "expected_output"], inplace=True)
    console.print(f"Loaded {len(df)} questions for evaluation from {file_path}.")
    return df
def load_data_from_bigquery(console: Console, run_id: str | None = None) -> pd.DataFrame:
    """Loads evaluation data from the BigQuery table and returns a DataFrame.

    Args:
        console: Rich console used for status/warning output.
        run_id: Optional run identifier; when given and the table has a
            ``run_id`` column, rows are filtered to that run.

    Returns:
        DataFrame with ``input``, ``expected_output`` (and ``category`` when
        present in the table), plus an added ``agent`` column. Rows of type
        ``Unanswerable`` and rows with missing input/expected_output are
        excluded.

    Raises:
        Exception: re-raised after logging when the table is missing or the
            query fails.
    """
    console.print("Loading data from BigQuery...")
    bq_project_id = config.bigquery.project_id or config.project_id
    client = bigquery.Client(project=bq_project_id)
    table_ref = f"{bq_project_id}.{config.bigquery.dataset_id}.{config.bigquery.table_ids['synth_gen']}"
    console.print(f"Querying table: {table_ref}")
    try:
        table = client.get_table(table_ref)
        all_columns = [schema.name for schema in table.schema]
        select_cols = ["input", "expected_output"]
        if "category" in all_columns:
            select_cols.append("category")
        query_parts = [f"SELECT {', '.join(select_cols)}", f"FROM `{table_ref}`"]
        # Build WHERE clauses
        where_clauses = []
        query_params = []
        if "type" in all_columns:
            where_clauses.append("type != 'Unanswerable'")
        if run_id:
            if "run_id" in all_columns:
                # FIX: bind run_id as a query parameter instead of
                # interpolating it into the SQL string (avoids quoting
                # bugs / SQL injection).
                where_clauses.append("run_id = @run_id")
                query_params.append(
                    bigquery.ScalarQueryParameter("run_id", "STRING", run_id)
                )
                console.print(f"Filtering data for run_id: {run_id}")
            else:
                console.print(
                    "[yellow]Warning: --run-id provided, but 'run_id' column not found in BigQuery table. Using all data.[/yellow]"
                )
        if where_clauses:
            query_parts.append("WHERE " + " AND ".join(where_clauses))
        query = "\n".join(query_parts)
        job_config = bigquery.QueryJobConfig(query_parameters=query_params)
        df = client.query(query, job_config=job_config).to_dataframe()
    except Exception as e:
        if "Not found" in str(e):
            console.print(f"[bold red]Error: Table {table_ref} not found.[/bold red]")
            console.print(
                "Please ensure the table exists and the configuration in 'config.yaml' is correct."
            )
            raise
        else:
            console.print(
                f"[bold red]An error occurred while querying BigQuery: {e}[/bold red]"
            )
            raise
    df.dropna(subset=["input", "expected_output"], inplace=True)
    df["agent"] = config.agent.name
    console.print(f"Loaded {len(df)} questions for evaluation.")
    if run_id and df.empty:
        console.print(
            f"[yellow]Warning: No data found for run_id '{run_id}' in BigQuery.[/yellow]"
        )
    return df

View File

@@ -0,0 +1,347 @@
import json
import uuid
from datetime import datetime

import pandas as pd
from dialogflow.main import DialogflowAgent as OriginalDialogflowAgent
from google.api_core import exceptions as google_exceptions
from rich.console import Console
from rich.panel import Panel
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn
from rich.table import Table

from rag_eval.agent import Agent
from rag_eval.config import settings as config

from . import loaders
from .evaluator import KeypointRAGEvaluator
class DialogflowEvalAgent:
    """Adapter for DialogflowAgent to be used in evaluation."""

    def __init__(self, session_id: str | None = None):
        # Wrap the production Dialogflow agent; a fresh UUID keeps each
        # evaluated conversation in its own Dialogflow session.
        self.agent = OriginalDialogflowAgent()
        self.session_id = session_id or str(uuid.uuid4())

    def call(self, query: str) -> str:
        """Calls the Dialogflow agent with the session ID and returns only the response text."""
        response = self.agent.call(query, session_id=self.session_id)
        # Falls back to "" when the agent response has no "response_text" key.
        return response.get("response_text", "")
def run_keypoint_evaluation(
    input_file: str | None = None,
    output_file: str | None = None,
    run_id: str | None = None,
    agent_name: str | None = None,
):
    """
    Runs keypoint-based evaluation for each agent found in the input data.
    Handles both single-turn and multi-turn conversational data.

    Args:
        input_file: Local CSV/SQLite path; when None, data is loaded from BigQuery.
        output_file: CSV path for results; when None, results are saved to BigQuery.
        run_id: Optional run identifier used to filter the input data; a fresh
            timestamped id is generated when omitted.
        agent_name: Specific agent to run ('dialogflow' or the configured agent
            name); defaults to the configured agent when omitted.

    Raises:
        ValueError: If agent_name does not match any known agent.
        Exception: Propagated from data loading or BigQuery upload failures.
    """
    console = Console()
    # --- Introduction Panel ---
    intro_panel = Panel(
        f"""
[bold]Input File:[/bold] [cyan]{input_file or 'BigQuery'}[/cyan]
[bold]Output File:[/bold] [cyan]{output_file or 'BigQuery'}[/cyan]
[bold]Run ID:[/bold] [cyan]{run_id or 'Not specified'}[/cyan]
[bold]Agent Name:[/bold] [cyan]{agent_name or 'All'}[/cyan]
""",
        title="[bold magenta]Keypoint Evaluation Run[/bold magenta]",
        expand=False,
        border_style="magenta",
    )
    console.print(intro_panel)
    try:
        if input_file:
            df = loaders.load_data_from_local_file(input_file, console, run_id=run_id)
        else:
            df = loaders.load_data_from_bigquery(console, run_id=run_id)
    except Exception as e:
        console.print(
            f"[bold red]An unexpected error occurred during data loading: {e}[/bold red]"
        )
        raise
    if run_id is None:
        # FIX: the original string lacked the f-prefix, so every auto-generated
        # run was literally tagged "run_{datetime.now()...}"; it also used
        # `datetime` without importing it in this module.
        run_id = f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    if df.empty:
        console.print("[bold red]No data loaded, exiting.[/bold red]")
        return
    # --- Set up agents to evaluate ---
    # Each entry: display name, agent class, and whether it needs the special
    # (session-based, single-query) calling convention.
    evaluables = []
    if agent_name:
        if agent_name == "dialogflow":
            evaluables.append(
                {"name": "dialogflow", "agent_class": DialogflowEvalAgent, "is_special": True}
            )
            console.print("[bold green]Agent 'dialogflow' selected for evaluation.[/bold green]")
        elif agent_name == config.agent.name:
            evaluables.append(
                {"name": config.agent.name, "agent_class": Agent, "is_special": False}
            )
        else:
            console.print(
                f"[bold red]Error: Agent '{agent_name}' not found in the configuration.[/bold red]"
            )
            raise ValueError(f"Agent '{agent_name}' not found in the configuration")
    else:
        evaluables.append(
            {"name": config.agent.name, "agent_class": Agent, "is_special": False}
        )
    all_agents_results = []
    total_skipped_questions = 0
    # --- Check for conversational data ---
    is_conversational = "conversation_id" in df.columns and "turn" in df.columns
    if is_conversational:
        df.sort_values(by=["conversation_id", "turn"], inplace=True)
        conversations = df.groupby("conversation_id")
        console.print(f"Found [bold cyan]{len(conversations)}[/bold cyan] conversations to evaluate.")
        progress_total = len(df)
    else:
        console.print(f"Found [bold cyan]{len(df)}[/bold cyan] single questions to evaluate.")
        conversations = [(None, df)]  # Treat all rows as one big group
        progress_total = len(df)
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        console=console,
    ) as progress:
        task = progress.add_task(
            "[green]Processing evaluations...[/green]",
            total=progress_total,
        )
        for conversation_id, conversation_df in conversations:
            if is_conversational:
                console.print(
                    Panel(
                        f"Evaluating conversation: [bold blue]{conversation_id}[/bold blue]",
                        expand=False,
                        border_style="blue",
                    )
                )
            for evaluable in evaluables:
                agent_name_for_results = evaluable["name"]
                # Initialize agent and history for each conversation
                if evaluable["is_special"]:
                    rag_agent = evaluable["agent_class"](session_id=str(uuid.uuid4()))
                else:
                    rag_agent = evaluable["agent_class"]()
                history = []
                evaluator = KeypointRAGEvaluator(console)
                for _, row in conversation_df.iterrows():
                    query = row["input"]
                    ground_truth = row["expected_output"]
                    progress.update(
                        task, description=f"Agent: {agent_name_for_results}, Conv: {conversation_id or 'N/A'}"
                    )
                    try:
                        # Step 1: Call agent to get the response
                        if is_conversational and not evaluable["is_special"]:
                            # For standard agent in conversational mode, manage history
                            history.append({"role": "user", "content": query})
                            response = rag_agent.call(history)
                            history.append({"role": "assistant", "content": response})
                        else:
                            # For special agents or single-turn mode
                            response = rag_agent.call(query)
                        # Step 2: Evaluate the response
                        eval_result = evaluator.evaluate_rag_pipeline(
                            query=query,
                            response=response,
                            ground_truth=ground_truth,
                            retrieved_contexts=[],
                            verbose=False,
                        )
                        # Step 3: Evaluate conciseness
                        conciseness_score = evaluator.evaluate_conciseness(query, response)
                        eval_result["conciseness"] = conciseness_score
                        eval_result["agent"] = agent_name_for_results
                        # Add conversational info if present
                        if is_conversational:
                            eval_result["conversation_id"] = conversation_id
                            eval_result["turn"] = row["turn"]
                        all_agents_results.append(eval_result)
                    except google_exceptions.FailedPrecondition as e:
                        # Oversized questions are skipped (and counted), any
                        # other precondition failure is a real error.
                        if "Token limit exceeded" in str(e):
                            total_skipped_questions += 1
                            console.print(
                                Panel(
                                    f"[bold]Query:[/bold]\n[white]{query}[/white]",
                                    title="[yellow]Skipping Question (Token Limit Exceeded)[/yellow]",
                                    expand=False,
                                    border_style="yellow",
                                )
                            )
                        else:
                            raise
                    finally:
                        progress.advance(task)
    if not all_agents_results:
        console.print("[bold red]No evaluation results were generated.[/bold red]")
        return
    final_df = pd.DataFrame(all_agents_results)
    # --- Summary Table ---
    summary_df = (
        final_df.groupby("agent")[["completeness", "hallucination", "irrelevance", "conciseness"]]
        .mean()
        .reset_index()
    )
    table = Table(
        title="[bold green]Keypoint Evaluation Summary[/bold green]",
        show_header=True,
        header_style="bold magenta",
    )
    table.add_column("Agent", justify="left", style="cyan", no_wrap=True)
    table.add_column("Completeness", justify="right", style="magenta")
    table.add_column("Hallucination", justify="right", style="green")
    table.add_column("Irrelevance", justify="right", style="yellow")
    table.add_column("Conciseness", justify="right", style="cyan")
    for _, row in summary_df.iterrows():
        table.add_row(
            row["agent"],
            f"{row['completeness']:.4f}",
            f"{row['hallucination']:.4f}",
            f"{row['irrelevance']:.4f}",
            f"{row['conciseness']:.4f}",
        )
    console.print(table)
    # --- Skipped Questions Summary ---
    if total_skipped_questions > 0:
        console.print(
            Panel(
                f"[bold yellow]Total questions skipped due to token limit: {total_skipped_questions}[/bold yellow]",
                title="[bold]Skipped Questions[/bold]",
                expand=False,
                border_style="yellow",
            )
        )
    if "timestamp" in final_df.columns:
        # Both CSV output and the BigQuery TIMESTAMP upload expect tz-naive values.
        final_df["timestamp"] = pd.to_datetime(final_df["timestamp"]).dt.tz_localize(
            None
        )
    if output_file:
        # Serialize list/record columns so they survive the CSV round-trip.
        for col in ["keypoints", "keypoint_details", "retrieved_contexts"]:
            if col in final_df.columns:
                final_df[col] = final_df[col].apply(json.dumps)
        output_panel = Panel(
            f"Saving results to CSV file: [bold cyan]{output_file}[/bold cyan]\n"
            f"Successfully saved {len(final_df)} rows to [bold green]{output_file}[/bold green]",
            title="[bold green]Output[/bold green]",
            expand=False,
            border_style="green",
        )
        console.print(output_panel)
        final_df.to_csv(output_file, index=False, encoding="utf-8-sig")
    else:
        project_id = config.bigquery.project_id or config.project_id
        dataset_id = config.bigquery.dataset_id
        table_name = config.bigquery.table_ids["keypoint_eval"]
        table_id = f"{project_id}.{dataset_id}.{table_name}"
        bq_schema = [
            {"name": "run_id", "type": "STRING"},
            {"name": "query", "type": "STRING"},
            {"name": "response", "type": "STRING"},
            {"name": "ground_truth", "type": "STRING"},
            {"name": "retrieved_contexts", "type": "STRING", "mode": "REPEATED"},
            {"name": "completeness", "type": "FLOAT"},
            {"name": "hallucination", "type": "FLOAT"},
            {"name": "irrelevance", "type": "FLOAT"},
            {"name": "conciseness", "type": "FLOAT"},
            {"name": "keypoints", "type": "STRING", "mode": "REPEATED"},
            {
                "name": "keypoint_details",
                "type": "RECORD",
                "mode": "REPEATED",
                "fields": [
                    {"name": "keypoint", "type": "STRING"},
                    {"name": "analysis", "type": "STRING"},
                    {"name": "category", "type": "STRING"},
                ],
            },
            {"name": "timestamp", "type": "TIMESTAMP"},
            {"name": "agent", "type": "STRING"},
            {"name": "error", "type": "STRING"},
            {"name": "conversation_id", "type": "STRING"},
            {"name": "turn", "type": "INTEGER"},
        ]
        final_df["run_id"] = run_id
        # Align the DataFrame to the BigQuery schema: add missing columns,
        # default the numeric metrics, and normalize repeated fields.
        bq_column_names = [col["name"] for col in bq_schema]
        for col_name in bq_column_names:
            if col_name not in final_df.columns:
                final_df[col_name] = None
        final_df["completeness"] = final_df["completeness"].fillna(0.0)
        final_df["hallucination"] = final_df["hallucination"].fillna(0.0)
        final_df["irrelevance"] = final_df["irrelevance"].fillna(0.0)
        final_df["error"] = final_df["error"].fillna("")
        for col_name in ["retrieved_contexts", "keypoints", "keypoint_details"]:
            if col_name in final_df.columns:
                # Ensure any non-list items (like NaN or None) become an empty list
                final_df[col_name] = [
                    item if isinstance(item, list) else [] for item in final_df[col_name]
                ]
        final_df_for_bq = final_df[bq_column_names].copy()
        output_panel = Panel(
            f"Saving results to BigQuery table: [bold cyan]{table_id}[/bold cyan]\n"
            f"Successfully saved {len(final_df_for_bq)} rows to [bold green]{table_id}[/bold green]",
            title="[bold green]Output[/bold green]",
            expand=False,
            border_style="green",
        )
        console.print(output_panel)
        try:
            final_df_for_bq.to_gbq(
                destination_table=f"{dataset_id}.{table_name}",
                project_id=project_id,
                if_exists="append",
                table_schema=bq_schema,
            )
        except Exception as e:
            console.print(
                f"[bold red]An error occurred while saving to BigQuery: {e}[/bold red]"
            )
            console.print("DataFrame schema used for upload:")
            console.print(final_df_for_bq.info())
            raise