Migrate keypoint eval to live queries

This commit is contained in:
Anibal Angulo
2026-02-23 18:27:37 +00:00
parent ddd13805c9
commit af62c650a1
4 changed files with 48 additions and 7 deletions

View File

@@ -9,6 +9,7 @@ dependencies = [
"google-cloud-bigquery>=3.40.1",
"google-cloud-storage",
"google-genai>=1.64.0",
"httpx",
"pandas",
"pandas-gbq",
"pydantic",

View File

@@ -89,8 +89,20 @@ def eval_keypoint(
help="Optional: The specific run_id to filter the evaluation data by."
),
] = None,
agent_url: Annotated[
str,
typer.Option(
"--agent-url",
help="Base URL of the agent server to query for responses.",
),
] = "http://localhost:8000",
):
"""Evaluates RAG responses using the keypoint methodology."""
from va_evaluator.keypoint_metrics_evaluator import evaluate
evaluate(input_file=input_file, output_file=output_file, run_id=run_id)
evaluate(
input_file=input_file,
output_file=output_file,
run_id=run_id,
agent_url=agent_url,
)

View File

@@ -4,6 +4,7 @@ import sqlite3
from datetime import datetime
from typing import Literal
import httpx
import pandas as pd
import typer
from google import genai
@@ -289,7 +290,7 @@ def load_data_from_local_file(
)
raise ValueError(f"Unsupported file type: {path.suffix}")
required_cols = {"input", "expected_output", "response"}
required_cols = {"input", "expected_output"}
if not required_cols.issubset(df.columns):
missing = required_cols - set(df.columns)
console.print(
@@ -313,7 +314,7 @@ def load_data_from_local_file(
if "type" in df.columns:
df = df[df["type"] != "Unanswerable"].copy()
df.dropna(subset=["input", "expected_output", "response"], inplace=True)
df.dropna(subset=["input", "expected_output"], inplace=True)
console.print(f"Loaded {len(df)} rows for evaluation.")
return df
@@ -338,7 +339,7 @@ def load_data_from_bigquery(
query += f" AND run_id = '{run_id}'"
df = client.query(query).to_dataframe()
df.dropna(subset=["input", "expected_output", "response"], inplace=True)
df.dropna(subset=["input", "expected_output"], inplace=True)
console.print(f"Loaded {len(df)} rows for evaluation.")
if df.empty:
console.print("[bold yellow]Warning: No data found in BigQuery.[/bold yellow]")
@@ -348,10 +349,28 @@ def load_data_from_bigquery(
# --- Core Logic ---
def fetch_agent_response(
    http_client: httpx.Client, agent_url: str, query: str
) -> str:
    """Send a query to the agent server and return the response text.

    Args:
        http_client: Client used to issue the POST request.
        agent_url: Base URL of the agent server. Trailing slashes are
            ignored, so "http://host:8000" and "http://host:8000/" hit the
            same endpoint.
        query: Text sent to the agent as the "text" field of the payload.

    Returns:
        The value of the "response_text" field of the JSON response body.

    Raises:
        KeyError: If the JSON response body has no "response_text" field.
        Exception: Whatever ``raise_for_status()`` raises for an
            unsuccessful HTTP status is propagated unchanged.
    """
    # Normalize the base URL so a trailing slash does not yield a
    # "//api/v1/query" path.
    endpoint = f"{agent_url.rstrip('/')}/api/v1/query"
    payload = {
        "phone_number": "eval-user",
        "text": query,
        "type": "conversation",
        "language_code": "es",
    }
    response = http_client.post(endpoint, json=payload)
    response.raise_for_status()
    return response.json()["response_text"]
def evaluate(
input_file: str | None = None,
output_file: str | None = None,
run_id: str | None = None,
agent_url: str = "http://localhost:8000",
):
"""Core logic for running keypoint-based evaluation."""
console = Console()
@@ -388,7 +407,11 @@ def evaluate(
all_results = []
total_skipped = 0
with Progress(
console.print(
f"[bold blue]Fetching responses from agent at: {agent_url}[/bold blue]"
)
with httpx.Client(timeout=120.0) as http_client, Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
@@ -402,9 +425,12 @@ def evaluate(
for _, row in df.iterrows():
try:
agent_response = fetch_agent_response(
http_client, agent_url, row["input"]
)
result = evaluator.evaluate_response(
query=row["input"],
response=row["response"],
response=agent_response,
ground_truth=row["expected_output"],
)
result["agent"] = settings.agent.name
@@ -430,7 +456,7 @@ def evaluate(
all_results.append(
{
"query": row["input"],
"response": row["response"],
"response": "",
"ground_truth": row["expected_output"],
"completeness": 0.0,
"hallucination": 0.0,

2
uv.lock generated
View File

@@ -2621,6 +2621,7 @@ dependencies = [
{ name = "google-cloud-bigquery" },
{ name = "google-cloud-storage" },
{ name = "google-genai" },
{ name = "httpx" },
{ name = "pandas" },
{ name = "pandas-gbq" },
{ name = "pydantic" },
@@ -2642,6 +2643,7 @@ requires-dist = [
{ name = "google-cloud-bigquery", specifier = ">=3.40.1" },
{ name = "google-cloud-storage" },
{ name = "google-genai", specifier = ">=1.64.0" },
{ name = "httpx" },
{ name = "pandas" },
{ name = "pandas-gbq" },
{ name = "pydantic" },