Migrate keypoint eval to live queries
This commit is contained in:
@@ -9,6 +9,7 @@ dependencies = [
|
||||
"google-cloud-bigquery>=3.40.1",
|
||||
"google-cloud-storage",
|
||||
"google-genai>=1.64.0",
|
||||
"httpx",
|
||||
"pandas",
|
||||
"pandas-gbq",
|
||||
"pydantic",
|
||||
|
||||
@@ -89,8 +89,20 @@ def eval_keypoint(
|
||||
help="Optional: The specific run_id to filter the evaluation data by."
|
||||
),
|
||||
] = None,
|
||||
agent_url: Annotated[
|
||||
str,
|
||||
typer.Option(
|
||||
"--agent-url",
|
||||
help="Base URL of the agent server to query for responses.",
|
||||
),
|
||||
] = "http://localhost:8000",
|
||||
):
|
||||
"""Evaluates RAG responses using the keypoint methodology."""
|
||||
from va_evaluator.keypoint_metrics_evaluator import evaluate
|
||||
|
||||
evaluate(input_file=input_file, output_file=output_file, run_id=run_id)
|
||||
evaluate(
|
||||
input_file=input_file,
|
||||
output_file=output_file,
|
||||
run_id=run_id,
|
||||
agent_url=agent_url,
|
||||
)
|
||||
|
||||
@@ -4,6 +4,7 @@ import sqlite3
|
||||
from datetime import datetime
|
||||
from typing import Literal
|
||||
|
||||
import httpx
|
||||
import pandas as pd
|
||||
import typer
|
||||
from google import genai
|
||||
@@ -289,7 +290,7 @@ def load_data_from_local_file(
|
||||
)
|
||||
raise ValueError(f"Unsupported file type: {path.suffix}")
|
||||
|
||||
required_cols = {"input", "expected_output", "response"}
|
||||
required_cols = {"input", "expected_output"}
|
||||
if not required_cols.issubset(df.columns):
|
||||
missing = required_cols - set(df.columns)
|
||||
console.print(
|
||||
@@ -313,7 +314,7 @@ def load_data_from_local_file(
|
||||
if "type" in df.columns:
|
||||
df = df[df["type"] != "Unanswerable"].copy()
|
||||
|
||||
df.dropna(subset=["input", "expected_output", "response"], inplace=True)
|
||||
df.dropna(subset=["input", "expected_output"], inplace=True)
|
||||
console.print(f"Loaded {len(df)} rows for evaluation.")
|
||||
return df
|
||||
|
||||
@@ -338,7 +339,7 @@ def load_data_from_bigquery(
|
||||
query += f" AND run_id = '{run_id}'"
|
||||
|
||||
df = client.query(query).to_dataframe()
|
||||
df.dropna(subset=["input", "expected_output", "response"], inplace=True)
|
||||
df.dropna(subset=["input", "expected_output"], inplace=True)
|
||||
console.print(f"Loaded {len(df)} rows for evaluation.")
|
||||
if df.empty:
|
||||
console.print("[bold yellow]Warning: No data found in BigQuery.[/bold yellow]")
|
||||
@@ -348,10 +349,28 @@ def load_data_from_bigquery(
|
||||
# --- Core Logic ---
|
||||
|
||||
|
||||
def fetch_agent_response(
    http_client: "httpx.Client", agent_url: str, query: str
) -> str:
    """Send a query to the agent server and return the response text.

    Args:
        http_client: Client used to issue the POST request.
        agent_url: Base URL of the agent server. Trailing slashes are
            tolerated and removed before the endpoint path is appended.
        query: Text sent as the ``text`` field of the request payload.

    Returns:
        The value of the ``response_text`` key in the JSON response body.

    Raises:
        KeyError: If the JSON body has no ``response_text`` key.
        Exception: Whatever ``raise_for_status()`` raises for an error
            status is propagated unchanged to the caller.
    """
    # Strip trailing slashes so a base URL such as "http://host/" does not
    # produce a double slash ("http://host//api/v1/query") in the request.
    endpoint = f"{agent_url.rstrip('/')}/api/v1/query"
    response = http_client.post(
        endpoint,
        json={
            "phone_number": "eval-user",
            "text": query,
            "type": "conversation",
            "language_code": "es",
        },
    )
    # Fail loudly on non-success statuses instead of parsing an error body.
    response.raise_for_status()
    return response.json()["response_text"]
|
||||
|
||||
|
||||
def evaluate(
|
||||
input_file: str | None = None,
|
||||
output_file: str | None = None,
|
||||
run_id: str | None = None,
|
||||
agent_url: str = "http://localhost:8000",
|
||||
):
|
||||
"""Core logic for running keypoint-based evaluation."""
|
||||
console = Console()
|
||||
@@ -388,7 +407,11 @@ def evaluate(
|
||||
all_results = []
|
||||
total_skipped = 0
|
||||
|
||||
with Progress(
|
||||
console.print(
|
||||
f"[bold blue]Fetching responses from agent at: {agent_url}[/bold blue]"
|
||||
)
|
||||
|
||||
with httpx.Client(timeout=120.0) as http_client, Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
@@ -402,9 +425,12 @@ def evaluate(
|
||||
|
||||
for _, row in df.iterrows():
|
||||
try:
|
||||
agent_response = fetch_agent_response(
|
||||
http_client, agent_url, row["input"]
|
||||
)
|
||||
result = evaluator.evaluate_response(
|
||||
query=row["input"],
|
||||
response=row["response"],
|
||||
response=agent_response,
|
||||
ground_truth=row["expected_output"],
|
||||
)
|
||||
result["agent"] = settings.agent.name
|
||||
@@ -430,7 +456,7 @@ def evaluate(
|
||||
all_results.append(
|
||||
{
|
||||
"query": row["input"],
|
||||
"response": row["response"],
|
||||
"response": "",
|
||||
"ground_truth": row["expected_output"],
|
||||
"completeness": 0.0,
|
||||
"hallucination": 0.0,
|
||||
|
||||
2
uv.lock
generated
2
uv.lock
generated
@@ -2621,6 +2621,7 @@ dependencies = [
|
||||
{ name = "google-cloud-bigquery" },
|
||||
{ name = "google-cloud-storage" },
|
||||
{ name = "google-genai" },
|
||||
{ name = "httpx" },
|
||||
{ name = "pandas" },
|
||||
{ name = "pandas-gbq" },
|
||||
{ name = "pydantic" },
|
||||
@@ -2642,6 +2643,7 @@ requires-dist = [
|
||||
{ name = "google-cloud-bigquery", specifier = ">=3.40.1" },
|
||||
{ name = "google-cloud-storage" },
|
||||
{ name = "google-genai", specifier = ">=1.64.0" },
|
||||
{ name = "httpx" },
|
||||
{ name = "pandas" },
|
||||
{ name = "pandas-gbq" },
|
||||
{ name = "pydantic" },
|
||||
|
||||
Reference in New Issue
Block a user