Migrate keypoint eval to live queries
This commit is contained in:
@@ -9,6 +9,7 @@ dependencies = [
|
|||||||
"google-cloud-bigquery>=3.40.1",
|
"google-cloud-bigquery>=3.40.1",
|
||||||
"google-cloud-storage",
|
"google-cloud-storage",
|
||||||
"google-genai>=1.64.0",
|
"google-genai>=1.64.0",
|
||||||
|
"httpx",
|
||||||
"pandas",
|
"pandas",
|
||||||
"pandas-gbq",
|
"pandas-gbq",
|
||||||
"pydantic",
|
"pydantic",
|
||||||
|
|||||||
@@ -89,8 +89,20 @@ def eval_keypoint(
|
|||||||
help="Optional: The specific run_id to filter the evaluation data by."
|
help="Optional: The specific run_id to filter the evaluation data by."
|
||||||
),
|
),
|
||||||
] = None,
|
] = None,
|
||||||
|
agent_url: Annotated[
|
||||||
|
str,
|
||||||
|
typer.Option(
|
||||||
|
"--agent-url",
|
||||||
|
help="Base URL of the agent server to query for responses.",
|
||||||
|
),
|
||||||
|
] = "http://localhost:8000",
|
||||||
):
|
):
|
||||||
"""Evaluates RAG responses using the keypoint methodology."""
|
"""Evaluates RAG responses using the keypoint methodology."""
|
||||||
from va_evaluator.keypoint_metrics_evaluator import evaluate
|
from va_evaluator.keypoint_metrics_evaluator import evaluate
|
||||||
|
|
||||||
evaluate(input_file=input_file, output_file=output_file, run_id=run_id)
|
evaluate(
|
||||||
|
input_file=input_file,
|
||||||
|
output_file=output_file,
|
||||||
|
run_id=run_id,
|
||||||
|
agent_url=agent_url,
|
||||||
|
)
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import sqlite3
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
|
import httpx
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import typer
|
import typer
|
||||||
from google import genai
|
from google import genai
|
||||||
@@ -289,7 +290,7 @@ def load_data_from_local_file(
|
|||||||
)
|
)
|
||||||
raise ValueError(f"Unsupported file type: {path.suffix}")
|
raise ValueError(f"Unsupported file type: {path.suffix}")
|
||||||
|
|
||||||
required_cols = {"input", "expected_output", "response"}
|
required_cols = {"input", "expected_output"}
|
||||||
if not required_cols.issubset(df.columns):
|
if not required_cols.issubset(df.columns):
|
||||||
missing = required_cols - set(df.columns)
|
missing = required_cols - set(df.columns)
|
||||||
console.print(
|
console.print(
|
||||||
@@ -313,7 +314,7 @@ def load_data_from_local_file(
|
|||||||
if "type" in df.columns:
|
if "type" in df.columns:
|
||||||
df = df[df["type"] != "Unanswerable"].copy()
|
df = df[df["type"] != "Unanswerable"].copy()
|
||||||
|
|
||||||
df.dropna(subset=["input", "expected_output", "response"], inplace=True)
|
df.dropna(subset=["input", "expected_output"], inplace=True)
|
||||||
console.print(f"Loaded {len(df)} rows for evaluation.")
|
console.print(f"Loaded {len(df)} rows for evaluation.")
|
||||||
return df
|
return df
|
||||||
|
|
||||||
@@ -338,7 +339,7 @@ def load_data_from_bigquery(
|
|||||||
query += f" AND run_id = '{run_id}'"
|
query += f" AND run_id = '{run_id}'"
|
||||||
|
|
||||||
df = client.query(query).to_dataframe()
|
df = client.query(query).to_dataframe()
|
||||||
df.dropna(subset=["input", "expected_output", "response"], inplace=True)
|
df.dropna(subset=["input", "expected_output"], inplace=True)
|
||||||
console.print(f"Loaded {len(df)} rows for evaluation.")
|
console.print(f"Loaded {len(df)} rows for evaluation.")
|
||||||
if df.empty:
|
if df.empty:
|
||||||
console.print("[bold yellow]Warning: No data found in BigQuery.[/bold yellow]")
|
console.print("[bold yellow]Warning: No data found in BigQuery.[/bold yellow]")
|
||||||
@@ -348,10 +349,28 @@ def load_data_from_bigquery(
|
|||||||
# --- Core Logic ---
|
# --- Core Logic ---
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_agent_response(
|
||||||
|
http_client: httpx.Client, agent_url: str, query: str
|
||||||
|
) -> str:
|
||||||
|
"""Sends a query to the agent server and returns the response text."""
|
||||||
|
response = http_client.post(
|
||||||
|
f"{agent_url}/api/v1/query",
|
||||||
|
json={
|
||||||
|
"phone_number": "eval-user",
|
||||||
|
"text": query,
|
||||||
|
"type": "conversation",
|
||||||
|
"language_code": "es",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()["response_text"]
|
||||||
|
|
||||||
|
|
||||||
def evaluate(
|
def evaluate(
|
||||||
input_file: str | None = None,
|
input_file: str | None = None,
|
||||||
output_file: str | None = None,
|
output_file: str | None = None,
|
||||||
run_id: str | None = None,
|
run_id: str | None = None,
|
||||||
|
agent_url: str = "http://localhost:8000",
|
||||||
):
|
):
|
||||||
"""Core logic for running keypoint-based evaluation."""
|
"""Core logic for running keypoint-based evaluation."""
|
||||||
console = Console()
|
console = Console()
|
||||||
@@ -388,7 +407,11 @@ def evaluate(
|
|||||||
all_results = []
|
all_results = []
|
||||||
total_skipped = 0
|
total_skipped = 0
|
||||||
|
|
||||||
with Progress(
|
console.print(
|
||||||
|
f"[bold blue]Fetching responses from agent at: {agent_url}[/bold blue]"
|
||||||
|
)
|
||||||
|
|
||||||
|
with httpx.Client(timeout=120.0) as http_client, Progress(
|
||||||
SpinnerColumn(),
|
SpinnerColumn(),
|
||||||
TextColumn("[progress.description]{task.description}"),
|
TextColumn("[progress.description]{task.description}"),
|
||||||
BarColumn(),
|
BarColumn(),
|
||||||
@@ -402,9 +425,12 @@ def evaluate(
|
|||||||
|
|
||||||
for _, row in df.iterrows():
|
for _, row in df.iterrows():
|
||||||
try:
|
try:
|
||||||
|
agent_response = fetch_agent_response(
|
||||||
|
http_client, agent_url, row["input"]
|
||||||
|
)
|
||||||
result = evaluator.evaluate_response(
|
result = evaluator.evaluate_response(
|
||||||
query=row["input"],
|
query=row["input"],
|
||||||
response=row["response"],
|
response=agent_response,
|
||||||
ground_truth=row["expected_output"],
|
ground_truth=row["expected_output"],
|
||||||
)
|
)
|
||||||
result["agent"] = settings.agent.name
|
result["agent"] = settings.agent.name
|
||||||
@@ -430,7 +456,7 @@ def evaluate(
|
|||||||
all_results.append(
|
all_results.append(
|
||||||
{
|
{
|
||||||
"query": row["input"],
|
"query": row["input"],
|
||||||
"response": row["response"],
|
"response": "",
|
||||||
"ground_truth": row["expected_output"],
|
"ground_truth": row["expected_output"],
|
||||||
"completeness": 0.0,
|
"completeness": 0.0,
|
||||||
"hallucination": 0.0,
|
"hallucination": 0.0,
|
||||||
|
|||||||
2
uv.lock
generated
2
uv.lock
generated
@@ -2621,6 +2621,7 @@ dependencies = [
|
|||||||
{ name = "google-cloud-bigquery" },
|
{ name = "google-cloud-bigquery" },
|
||||||
{ name = "google-cloud-storage" },
|
{ name = "google-cloud-storage" },
|
||||||
{ name = "google-genai" },
|
{ name = "google-genai" },
|
||||||
|
{ name = "httpx" },
|
||||||
{ name = "pandas" },
|
{ name = "pandas" },
|
||||||
{ name = "pandas-gbq" },
|
{ name = "pandas-gbq" },
|
||||||
{ name = "pydantic" },
|
{ name = "pydantic" },
|
||||||
@@ -2642,6 +2643,7 @@ requires-dist = [
|
|||||||
{ name = "google-cloud-bigquery", specifier = ">=3.40.1" },
|
{ name = "google-cloud-bigquery", specifier = ">=3.40.1" },
|
||||||
{ name = "google-cloud-storage" },
|
{ name = "google-cloud-storage" },
|
||||||
{ name = "google-genai", specifier = ">=1.64.0" },
|
{ name = "google-genai", specifier = ">=1.64.0" },
|
||||||
|
{ name = "httpx" },
|
||||||
{ name = "pandas" },
|
{ name = "pandas" },
|
||||||
{ name = "pandas-gbq" },
|
{ name = "pandas-gbq" },
|
||||||
{ name = "pydantic" },
|
{ name = "pydantic" },
|
||||||
|
|||||||
Reference in New Issue
Block a user