Migrate keypoint eval to live queries

This commit is contained in:
Anibal Angulo
2026-02-23 18:27:37 +00:00
parent ddd13805c9
commit af62c650a1
4 changed files with 48 additions and 7 deletions

View File

@@ -9,6 +9,7 @@ dependencies = [
"google-cloud-bigquery>=3.40.1", "google-cloud-bigquery>=3.40.1",
"google-cloud-storage", "google-cloud-storage",
"google-genai>=1.64.0", "google-genai>=1.64.0",
"httpx",
"pandas", "pandas",
"pandas-gbq", "pandas-gbq",
"pydantic", "pydantic",

View File

@@ -89,8 +89,20 @@ def eval_keypoint(
help="Optional: The specific run_id to filter the evaluation data by." help="Optional: The specific run_id to filter the evaluation data by."
), ),
] = None, ] = None,
agent_url: Annotated[
str,
typer.Option(
"--agent-url",
help="Base URL of the agent server to query for responses.",
),
] = "http://localhost:8000",
): ):
"""Evaluates RAG responses using the keypoint methodology.""" """Evaluates RAG responses using the keypoint methodology."""
from va_evaluator.keypoint_metrics_evaluator import evaluate from va_evaluator.keypoint_metrics_evaluator import evaluate
evaluate(input_file=input_file, output_file=output_file, run_id=run_id) evaluate(
input_file=input_file,
output_file=output_file,
run_id=run_id,
agent_url=agent_url,
)

View File

@@ -4,6 +4,7 @@ import sqlite3
from datetime import datetime from datetime import datetime
from typing import Literal from typing import Literal
import httpx
import pandas as pd import pandas as pd
import typer import typer
from google import genai from google import genai
@@ -289,7 +290,7 @@ def load_data_from_local_file(
) )
raise ValueError(f"Unsupported file type: {path.suffix}") raise ValueError(f"Unsupported file type: {path.suffix}")
required_cols = {"input", "expected_output", "response"} required_cols = {"input", "expected_output"}
if not required_cols.issubset(df.columns): if not required_cols.issubset(df.columns):
missing = required_cols - set(df.columns) missing = required_cols - set(df.columns)
console.print( console.print(
@@ -313,7 +314,7 @@ def load_data_from_local_file(
if "type" in df.columns: if "type" in df.columns:
df = df[df["type"] != "Unanswerable"].copy() df = df[df["type"] != "Unanswerable"].copy()
df.dropna(subset=["input", "expected_output", "response"], inplace=True) df.dropna(subset=["input", "expected_output"], inplace=True)
console.print(f"Loaded {len(df)} rows for evaluation.") console.print(f"Loaded {len(df)} rows for evaluation.")
return df return df
@@ -338,7 +339,7 @@ def load_data_from_bigquery(
query += f" AND run_id = '{run_id}'" query += f" AND run_id = '{run_id}'"
df = client.query(query).to_dataframe() df = client.query(query).to_dataframe()
df.dropna(subset=["input", "expected_output", "response"], inplace=True) df.dropna(subset=["input", "expected_output"], inplace=True)
console.print(f"Loaded {len(df)} rows for evaluation.") console.print(f"Loaded {len(df)} rows for evaluation.")
if df.empty: if df.empty:
console.print("[bold yellow]Warning: No data found in BigQuery.[/bold yellow]") console.print("[bold yellow]Warning: No data found in BigQuery.[/bold yellow]")
@@ -348,10 +349,28 @@ def load_data_from_bigquery(
# --- Core Logic --- # --- Core Logic ---
def fetch_agent_response(
    http_client: httpx.Client,
    agent_url: str,
    query: str,
    phone_number: str = "eval-user",
    language_code: str = "es",
) -> str:
    """Send a query to the agent server and return the response text.

    Args:
        http_client: Reusable HTTP client (shares connection pool/timeout
            across calls when iterating an evaluation dataset).
        agent_url: Base URL of the agent server, without a trailing slash.
        query: The user question to send for evaluation.
        phone_number: Value sent as the requesting user identifier. Default
            preserves the previous hard-coded behavior.
        language_code: Language code included in the request payload. Default
            preserves the previous hard-coded behavior.

    Raises:
        httpx.HTTPStatusError: If the server responds with a 4xx/5xx status.
        KeyError: If the response JSON has no "response_text" field.
    """
    response = http_client.post(
        f"{agent_url}/api/v1/query",
        json={
            "phone_number": phone_number,
            "text": query,
            "type": "conversation",
            "language_code": language_code,
        },
    )
    # Surface HTTP errors explicitly rather than parsing an error body as a result.
    response.raise_for_status()
    return response.json()["response_text"]
def evaluate( def evaluate(
input_file: str | None = None, input_file: str | None = None,
output_file: str | None = None, output_file: str | None = None,
run_id: str | None = None, run_id: str | None = None,
agent_url: str = "http://localhost:8000",
): ):
"""Core logic for running keypoint-based evaluation.""" """Core logic for running keypoint-based evaluation."""
console = Console() console = Console()
@@ -388,7 +407,11 @@ def evaluate(
all_results = [] all_results = []
total_skipped = 0 total_skipped = 0
with Progress( console.print(
f"[bold blue]Fetching responses from agent at: {agent_url}[/bold blue]"
)
with httpx.Client(timeout=120.0) as http_client, Progress(
SpinnerColumn(), SpinnerColumn(),
TextColumn("[progress.description]{task.description}"), TextColumn("[progress.description]{task.description}"),
BarColumn(), BarColumn(),
@@ -402,9 +425,12 @@ def evaluate(
for _, row in df.iterrows(): for _, row in df.iterrows():
try: try:
agent_response = fetch_agent_response(
http_client, agent_url, row["input"]
)
result = evaluator.evaluate_response( result = evaluator.evaluate_response(
query=row["input"], query=row["input"],
response=row["response"], response=agent_response,
ground_truth=row["expected_output"], ground_truth=row["expected_output"],
) )
result["agent"] = settings.agent.name result["agent"] = settings.agent.name
@@ -430,7 +456,7 @@ def evaluate(
all_results.append( all_results.append(
{ {
"query": row["input"], "query": row["input"],
"response": row["response"], "response": "",
"ground_truth": row["expected_output"], "ground_truth": row["expected_output"],
"completeness": 0.0, "completeness": 0.0,
"hallucination": 0.0, "hallucination": 0.0,

2
uv.lock generated
View File

@@ -2621,6 +2621,7 @@ dependencies = [
{ name = "google-cloud-bigquery" }, { name = "google-cloud-bigquery" },
{ name = "google-cloud-storage" }, { name = "google-cloud-storage" },
{ name = "google-genai" }, { name = "google-genai" },
{ name = "httpx" },
{ name = "pandas" }, { name = "pandas" },
{ name = "pandas-gbq" }, { name = "pandas-gbq" },
{ name = "pydantic" }, { name = "pydantic" },
@@ -2642,6 +2643,7 @@ requires-dist = [
{ name = "google-cloud-bigquery", specifier = ">=3.40.1" }, { name = "google-cloud-bigquery", specifier = ">=3.40.1" },
{ name = "google-cloud-storage" }, { name = "google-cloud-storage" },
{ name = "google-genai", specifier = ">=1.64.0" }, { name = "google-genai", specifier = ">=1.64.0" },
{ name = "httpx" },
{ name = "pandas" }, { name = "pandas" },
{ name = "pandas-gbq" }, { name = "pandas-gbq" },
{ name = "pydantic" }, { name = "pydantic" },