Migrate keypoint eval to live queries
This commit is contained in:
@@ -9,6 +9,7 @@ dependencies = [
|
|||||||
"google-cloud-bigquery>=3.40.1",
|
"google-cloud-bigquery>=3.40.1",
|
||||||
"google-cloud-storage",
|
"google-cloud-storage",
|
||||||
"google-genai>=1.64.0",
|
"google-genai>=1.64.0",
|
||||||
|
"httpx",
|
||||||
"pandas",
|
"pandas",
|
||||||
"pandas-gbq",
|
"pandas-gbq",
|
||||||
"pydantic",
|
"pydantic",
|
||||||
|
|||||||
@@ -89,8 +89,20 @@ def eval_keypoint(
|
|||||||
help="Optional: The specific run_id to filter the evaluation data by."
|
help="Optional: The specific run_id to filter the evaluation data by."
|
||||||
),
|
),
|
||||||
] = None,
|
] = None,
|
||||||
|
agent_url: Annotated[
|
||||||
|
str,
|
||||||
|
typer.Option(
|
||||||
|
"--agent-url",
|
||||||
|
help="Base URL of the agent server to query for responses.",
|
||||||
|
),
|
||||||
|
] = "http://localhost:8000",
|
||||||
):
|
):
|
||||||
"""Evaluates RAG responses using the keypoint methodology."""
|
"""Evaluates RAG responses using the keypoint methodology."""
|
||||||
from va_evaluator.keypoint_metrics_evaluator import evaluate
|
from va_evaluator.keypoint_metrics_evaluator import evaluate
|
||||||
|
|
||||||
evaluate(input_file=input_file, output_file=output_file, run_id=run_id)
|
evaluate(
|
||||||
|
input_file=input_file,
|
||||||
|
output_file=output_file,
|
||||||
|
run_id=run_id,
|
||||||
|
agent_url=agent_url,
|
||||||
|
)
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import sqlite3
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
|
import httpx
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import typer
|
import typer
|
||||||
from google import genai
|
from google import genai
|
||||||
@@ -289,7 +290,7 @@ def load_data_from_local_file(
|
|||||||
)
|
)
|
||||||
raise ValueError(f"Unsupported file type: {path.suffix}")
|
raise ValueError(f"Unsupported file type: {path.suffix}")
|
||||||
|
|
||||||
required_cols = {"input", "expected_output", "response"}
|
required_cols = {"input", "expected_output"}
|
||||||
if not required_cols.issubset(df.columns):
|
if not required_cols.issubset(df.columns):
|
||||||
missing = required_cols - set(df.columns)
|
missing = required_cols - set(df.columns)
|
||||||
console.print(
|
console.print(
|
||||||
@@ -313,7 +314,7 @@ def load_data_from_local_file(
|
|||||||
if "type" in df.columns:
|
if "type" in df.columns:
|
||||||
df = df[df["type"] != "Unanswerable"].copy()
|
df = df[df["type"] != "Unanswerable"].copy()
|
||||||
|
|
||||||
df.dropna(subset=["input", "expected_output", "response"], inplace=True)
|
df.dropna(subset=["input", "expected_output"], inplace=True)
|
||||||
console.print(f"Loaded {len(df)} rows for evaluation.")
|
console.print(f"Loaded {len(df)} rows for evaluation.")
|
||||||
return df
|
return df
|
||||||
|
|
||||||
@@ -338,7 +339,7 @@ def load_data_from_bigquery(
|
|||||||
query += f" AND run_id = '{run_id}'"
|
query += f" AND run_id = '{run_id}'"
|
||||||
|
|
||||||
df = client.query(query).to_dataframe()
|
df = client.query(query).to_dataframe()
|
||||||
df.dropna(subset=["input", "expected_output", "response"], inplace=True)
|
df.dropna(subset=["input", "expected_output"], inplace=True)
|
||||||
console.print(f"Loaded {len(df)} rows for evaluation.")
|
console.print(f"Loaded {len(df)} rows for evaluation.")
|
||||||
if df.empty:
|
if df.empty:
|
||||||
console.print("[bold yellow]Warning: No data found in BigQuery.[/bold yellow]")
|
console.print("[bold yellow]Warning: No data found in BigQuery.[/bold yellow]")
|
||||||
@@ -348,10 +349,28 @@ def load_data_from_bigquery(
|
|||||||
# --- Core Logic ---
|
# --- Core Logic ---
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_agent_response(
|
||||||
|
http_client: httpx.Client, agent_url: str, query: str
|
||||||
|
) -> str:
|
||||||
|
"""Sends a query to the agent server and returns the response text."""
|
||||||
|
response = http_client.post(
|
||||||
|
f"{agent_url}/api/v1/query",
|
||||||
|
json={
|
||||||
|
"phone_number": "eval-user",
|
||||||
|
"text": query,
|
||||||
|
"type": "conversation",
|
||||||
|
"language_code": "es",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()["response_text"]
|
||||||
|
|
||||||
|
|
||||||
def evaluate(
|
def evaluate(
|
||||||
input_file: str | None = None,
|
input_file: str | None = None,
|
||||||
output_file: str | None = None,
|
output_file: str | None = None,
|
||||||
run_id: str | None = None,
|
run_id: str | None = None,
|
||||||
|
agent_url: str = "http://localhost:8000",
|
||||||
):
|
):
|
||||||
"""Core logic for running keypoint-based evaluation."""
|
"""Core logic for running keypoint-based evaluation."""
|
||||||
console = Console()
|
console = Console()
|
||||||
@@ -388,7 +407,11 @@ def evaluate(
|
|||||||
all_results = []
|
all_results = []
|
||||||
total_skipped = 0
|
total_skipped = 0
|
||||||
|
|
||||||
with Progress(
|
console.print(
|
||||||
|
f"[bold blue]Fetching responses from agent at: {agent_url}[/bold blue]"
|
||||||
|
)
|
||||||
|
|
||||||
|
with httpx.Client(timeout=120.0) as http_client, Progress(
|
||||||
SpinnerColumn(),
|
SpinnerColumn(),
|
||||||
TextColumn("[progress.description]{task.description}"),
|
TextColumn("[progress.description]{task.description}"),
|
||||||
BarColumn(),
|
BarColumn(),
|
||||||
@@ -402,9 +425,12 @@ def evaluate(
|
|||||||
|
|
||||||
for _, row in df.iterrows():
|
for _, row in df.iterrows():
|
||||||
try:
|
try:
|
||||||
|
agent_response = fetch_agent_response(
|
||||||
|
http_client, agent_url, row["input"]
|
||||||
|
)
|
||||||
result = evaluator.evaluate_response(
|
result = evaluator.evaluate_response(
|
||||||
query=row["input"],
|
query=row["input"],
|
||||||
response=row["response"],
|
response=agent_response,
|
||||||
ground_truth=row["expected_output"],
|
ground_truth=row["expected_output"],
|
||||||
)
|
)
|
||||||
result["agent"] = settings.agent.name
|
result["agent"] = settings.agent.name
|
||||||
@@ -430,7 +456,7 @@ def evaluate(
|
|||||||
all_results.append(
|
all_results.append(
|
||||||
{
|
{
|
||||||
"query": row["input"],
|
"query": row["input"],
|
||||||
"response": row["response"],
|
"response": "",
|
||||||
"ground_truth": row["expected_output"],
|
"ground_truth": row["expected_output"],
|
||||||
"completeness": 0.0,
|
"completeness": 0.0,
|
||||||
"hallucination": 0.0,
|
"hallucination": 0.0,
|
||||||
|
|||||||
2
uv.lock
generated
2
uv.lock
generated
@@ -2621,6 +2621,7 @@ dependencies = [
|
|||||||
{ name = "google-cloud-bigquery" },
|
{ name = "google-cloud-bigquery" },
|
||||||
{ name = "google-cloud-storage" },
|
{ name = "google-cloud-storage" },
|
||||||
{ name = "google-genai" },
|
{ name = "google-genai" },
|
||||||
|
{ name = "httpx" },
|
||||||
{ name = "pandas" },
|
{ name = "pandas" },
|
||||||
{ name = "pandas-gbq" },
|
{ name = "pandas-gbq" },
|
||||||
{ name = "pydantic" },
|
{ name = "pydantic" },
|
||||||
@@ -2642,6 +2643,7 @@ requires-dist = [
|
|||||||
{ name = "google-cloud-bigquery", specifier = ">=3.40.1" },
|
{ name = "google-cloud-bigquery", specifier = ">=3.40.1" },
|
||||||
{ name = "google-cloud-storage" },
|
{ name = "google-cloud-storage" },
|
||||||
{ name = "google-genai", specifier = ">=1.64.0" },
|
{ name = "google-genai", specifier = ">=1.64.0" },
|
||||||
|
{ name = "httpx" },
|
||||||
{ name = "pandas" },
|
{ name = "pandas" },
|
||||||
{ name = "pandas-gbq" },
|
{ name = "pandas-gbq" },
|
||||||
{ name = "pydantic" },
|
{ name = "pydantic" },
|
||||||
|
|||||||
Reference in New Issue
Block a user