First commit

This commit is contained in:
Anibal Angulo
2026-02-18 19:57:43 +00:00
commit a53f8fcf62
115 changed files with 9957 additions and 0 deletions

View File

@@ -0,0 +1,132 @@
import pathlib
import sqlite3
import pandas as pd
from google.cloud import bigquery
from rich.console import Console
from rag_eval.config import settings as config
def load_data_from_local_file(
    file_path: str, console: Console, run_id: str | None = None
) -> pd.DataFrame:
    """Load evaluation data from a local CSV or SQLite file.

    Args:
        file_path: Path to a ``.csv``, ``.db``, or ``.sqlite`` file. For
            SQLite files the table name is assumed to be the file stem.
        console: Rich console used for progress and warning output.
        run_id: Optional run identifier; when given and a ``run_id`` column
            exists, rows are filtered to that run.

    Returns:
        DataFrame containing at least ``input`` and ``expected_output``
        columns plus an added ``agent`` column. Rows whose ``type`` is
        ``Unanswerable`` (when a ``type`` column exists) and rows missing
        ``input``/``expected_output`` are dropped.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (subclass of
            ``Exception``, so existing broad handlers still work).
        Exception: For unsupported suffixes, read failures, or missing
            required columns.
    """
    console.print(f"Loading data from {file_path}...")
    path = pathlib.Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"Error: File not found at {file_path}")
    if path.suffix == ".csv":
        try:
            df = pd.read_csv(path)
        except Exception as e:
            raise Exception(f"An error occurred while reading the CSV file: {e}") from e
    elif path.suffix in (".db", ".sqlite"):
        con = sqlite3.connect(path)
        try:
            # Assuming table name is the file stem; the identifier is
            # double-quoted so stems with spaces or SQL keywords still work.
            df = pd.read_sql(f'SELECT * FROM "{path.stem}"', con)
        except Exception as e:
            raise Exception(f"An error occurred while reading the SQLite DB: {e}") from e
        finally:
            # Close even when read_sql raises (previously the connection
            # leaked on failure).
            con.close()
    else:
        raise Exception(
            f"Unsupported file type: {path.suffix}. Please use .csv or .db/.sqlite"
        )
    # Check for required columns
    if "input" not in df.columns or "expected_output" not in df.columns:
        raise Exception(
            "Error: The input file must contain 'input' and 'expected_output' columns."
        )
    df["agent"] = config.agent.name
    # (Removed a leftover debug `print(f"{run_id=}")` that bypassed the
    # rich console.)
    if run_id:
        if "run_id" in df.columns:
            df = df[df["run_id"] == run_id].copy()
            console.print(f"Filtered data for run_id: {run_id}")
            if df.empty:
                console.print(
                    f"[yellow]Warning: No data found for run_id '{run_id}' in {file_path}.[/yellow]"
                )
        else:
            console.print(
                f"[yellow]Warning: --run-id provided, but 'run_id' column not found in {file_path}. Using all data.[/yellow]"
            )
    # Filter out unanswerable questions if 'type' column exists
    if "type" in df.columns:
        df = df[df["type"] != "Unanswerable"].copy()
    df.dropna(subset=["input", "expected_output"], inplace=True)
    console.print(f"Loaded {len(df)} questions for evaluation from {file_path}.")
    return df
def load_data_from_bigquery(console: Console, run_id: str | None = None) -> pd.DataFrame:
    """Load evaluation data from the configured BigQuery table.

    Args:
        console: Rich console used for progress and warning output.
        run_id: Optional run identifier; when given and the table has a
            ``run_id`` column, rows are filtered to that run via a bound
            query parameter.

    Returns:
        DataFrame with ``input`` and ``expected_output`` columns (plus
        ``category`` when present in the table) and an added ``agent``
        column; rows missing ``input``/``expected_output`` are dropped.

    Raises:
        Exception: Re-raises any BigQuery error after printing a
            human-readable message (a "Not found" table gets a dedicated
            hint about config.yaml).
    """
    console.print("Loading data from BigQuery...")
    bq_project_id = config.bigquery.project_id or config.project_id
    client = bigquery.Client(project=bq_project_id)
    table_ref = f"{bq_project_id}.{config.bigquery.dataset_id}.{config.bigquery.table_ids['synth_gen']}"
    console.print(f"Querying table: {table_ref}")
    try:
        # Inspect the schema first so the SELECT/WHERE only reference
        # columns that actually exist.
        table = client.get_table(table_ref)
        all_columns = [schema.name for schema in table.schema]
        select_cols = ["input", "expected_output"]
        if "category" in all_columns:
            select_cols.append("category")
        query_parts = [f"SELECT {', '.join(select_cols)}", f"FROM `{table_ref}`"]
        # Build WHERE clauses
        where_clauses = []
        query_params = []
        if "type" in all_columns:
            where_clauses.append("type != 'Unanswerable'")
        if run_id:
            if "run_id" in all_columns:
                # Bind run_id as a query parameter instead of interpolating
                # it into the SQL string (avoids injection/quoting bugs).
                where_clauses.append("run_id = @run_id")
                query_params.append(
                    bigquery.ScalarQueryParameter("run_id", "STRING", run_id)
                )
                console.print(f"Filtering data for run_id: {run_id}")
            else:
                console.print(
                    "[yellow]Warning: --run-id provided, but 'run_id' column not found in BigQuery table. Using all data.[/yellow]"
                )
        if where_clauses:
            query_parts.append("WHERE " + " AND ".join(where_clauses))
        query = "\n".join(query_parts)
        job_config = bigquery.QueryJobConfig(query_parameters=query_params)
        df = client.query(query, job_config=job_config).to_dataframe()
    except Exception as e:
        if "Not found" in str(e):
            console.print(f"[bold red]Error: Table {table_ref} not found.[/bold red]")
            console.print(
                "Please ensure the table exists and the configuration in 'config.yaml' is correct."
            )
            raise
        else:
            console.print(
                f"[bold red]An error occurred while querying BigQuery: {e}[/bold red]"
            )
            raise
    df.dropna(subset=["input", "expected_output"], inplace=True)
    df["agent"] = config.agent.name
    console.print(f"Loaded {len(df)} questions for evaluation.")
    if run_id and df.empty:
        console.print(
            f"[yellow]Warning: No data found for run_id '{run_id}' in BigQuery.[/yellow]"
        )
    return df