diff --git a/pyproject.toml b/pyproject.toml index 76c12c3..fb88d80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ dependencies = [ "google-cloud-bigquery>=3.40.1", "google-cloud-storage", "google-genai>=1.64.0", + "httpx", "pandas", "pandas-gbq", "pydantic", diff --git a/src/va_evaluator/cli.py b/src/va_evaluator/cli.py index 6b58b1c..68150ff 100644 --- a/src/va_evaluator/cli.py +++ b/src/va_evaluator/cli.py @@ -89,8 +89,20 @@ def eval_keypoint( help="Optional: The specific run_id to filter the evaluation data by." ), ] = None, + agent_url: Annotated[ + str, + typer.Option( + "--agent-url", + help="Base URL of the agent server to query for responses.", + ), + ] = "http://localhost:8000", ): """Evaluates RAG responses using the keypoint methodology.""" from va_evaluator.keypoint_metrics_evaluator import evaluate - evaluate(input_file=input_file, output_file=output_file, run_id=run_id) + evaluate( + input_file=input_file, + output_file=output_file, + run_id=run_id, + agent_url=agent_url, + ) diff --git a/src/va_evaluator/keypoint_metrics_evaluator.py b/src/va_evaluator/keypoint_metrics_evaluator.py index 8afc9a6..2b42757 100644 --- a/src/va_evaluator/keypoint_metrics_evaluator.py +++ b/src/va_evaluator/keypoint_metrics_evaluator.py @@ -4,6 +4,7 @@ import sqlite3 from datetime import datetime from typing import Literal +import httpx import pandas as pd import typer from google import genai @@ -289,7 +290,7 @@ def load_data_from_local_file( ) raise ValueError(f"Unsupported file type: {path.suffix}") - required_cols = {"input", "expected_output", "response"} + required_cols = {"input", "expected_output"} if not required_cols.issubset(df.columns): missing = required_cols - set(df.columns) console.print( @@ -313,7 +314,7 @@ def load_data_from_local_file( if "type" in df.columns: df = df[df["type"] != "Unanswerable"].copy() - df.dropna(subset=["input", "expected_output", "response"], inplace=True) + df.dropna(subset=["input", "expected_output"], inplace=True) console.print(f"Loaded {len(df)} rows for evaluation.") return df @@ -338,7 +339,7 @@ def load_data_from_bigquery( query += f" AND run_id = '{run_id}'" df = client.query(query).to_dataframe() - df.dropna(subset=["input", "expected_output", "response"], inplace=True) + df.dropna(subset=["input", "expected_output"], inplace=True) console.print(f"Loaded {len(df)} rows for evaluation.") if df.empty: console.print("[bold yellow]Warning: No data found in BigQuery.[/bold yellow]") @@ -348,10 +349,28 @@ def load_data_from_bigquery( # --- Core Logic --- +def fetch_agent_response( + http_client: httpx.Client, agent_url: str, query: str +) -> str: + """Sends a query to the agent server and returns the response text.""" + response = http_client.post( + f"{agent_url}/api/v1/query", + json={ + "phone_number": "eval-user", + "text": query, + "type": "conversation", + "language_code": "es", + }, + ) + response.raise_for_status() + return response.json()["response_text"] + + def evaluate( input_file: str | None = None, output_file: str | None = None, run_id: str | None = None, + agent_url: str = "http://localhost:8000", ): """Core logic for running keypoint-based evaluation.""" console = Console() @@ -388,7 +407,11 @@ def evaluate( all_results = [] total_skipped = 0 - with Progress( + console.print( + f"[bold blue]Fetching responses from agent at: {agent_url}[/bold blue]" + ) + + with httpx.Client(timeout=120.0) as http_client, Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), @@ -402,9 +425,12 @@ def evaluate( for _, row in df.iterrows(): try: + agent_response = fetch_agent_response( + http_client, agent_url, row["input"] + ) result = evaluator.evaluate_response( query=row["input"], - response=row["response"], + response=agent_response, ground_truth=row["expected_output"], ) result["agent"] = settings.agent.name @@ -430,7 +456,7 @@ def evaluate( all_results.append( { "query": row["input"], - "response": row["response"], + "response": "", "ground_truth": row["expected_output"], "completeness": 0.0, "hallucination": 0.0, diff --git a/uv.lock b/uv.lock index af7871d..1912110 100644 --- a/uv.lock +++ b/uv.lock @@ -2621,6 +2621,7 @@ dependencies = [ { name = "google-cloud-bigquery" }, { name = "google-cloud-storage" }, { name = "google-genai" }, + { name = "httpx" }, { name = "pandas" }, { name = "pandas-gbq" }, { name = "pydantic" }, @@ -2642,6 +2643,7 @@ requires-dist = [ { name = "google-cloud-bigquery", specifier = ">=3.40.1" }, { name = "google-cloud-storage" }, { name = "google-genai", specifier = ">=1.64.0" }, + { name = "httpx" }, { name = "pandas" }, { name = "pandas-gbq" }, { name = "pydantic" },