add analysis component

This commit is contained in:
Anibal Angulo
2025-11-09 10:24:58 -06:00
parent 77a11ef32e
commit 1ce4162e4a
11 changed files with 1155 additions and 8 deletions

View File

@@ -0,0 +1,112 @@
from __future__ import annotations
from typing import Any, Iterable, List
from app.agents.form_auditor.models import ExtractedIrsForm990PfDataSchema
from .agent import agent
from .metrics import SnapshotBundle, build_key_metrics, build_snapshots
from .models import AnalystReport, AnalystState
__all__ = ["build_performance_report"]
def _resolve_year(
entry: dict[str, Any], extraction: ExtractedIrsForm990PfDataSchema
) -> int:
candidates: Iterable[Any] = (
entry.get("calendar_year"),
entry.get("year"),
entry.get("tax_year"),
entry.get("return_year"),
entry.get("metadata", {}).get("return_year")
if isinstance(entry.get("metadata"), dict)
else None,
entry.get("metadata", {}).get("tax_year")
if isinstance(entry.get("metadata"), dict)
else None,
extraction.core_organization_metadata.calendar_year,
)
for candidate in candidates:
if candidate in (None, ""):
continue
try:
return int(candidate)
except (TypeError, ValueError):
continue
raise ValueError("Unable to determine filing year for one of the payload entries.")
async def build_performance_report(payloads: List[dict[str, Any]]) -> AnalystReport:
    """Build a multi-year performance report for one organization.

    Each payload is validated into an extraction, tagged with its filing
    year, and sorted chronologically. Deterministic snapshots and trend
    metrics are then computed and handed to the analyst agent as context.
    The identifying fields, analyzed years and key metrics of the agent's
    report are overwritten with the values computed here.

    Args:
        payloads: One dict per filing. An entry is either the extraction
            itself or a wrapper that holds it under the ``"extraction"`` key.

    Returns:
        The agent's report with ``organisation_name``, ``organisation_ein``,
        ``years_analyzed`` and ``key_metrics`` replaced by computed values.

    Raises:
        ValueError: If ``payloads`` is empty, a filing year cannot be
            resolved, or the entries carry different EINs.
        TypeError: If an entry is not a dict.
    """
    if not payloads:
        raise ValueError("At least one payload is required for performance analysis.")

    bundles: List[SnapshotBundle] = []
    organisation_name = ""
    organisation_ein = ""
    for entry in payloads:
        if not isinstance(entry, dict):
            raise TypeError("Each payload entry must be a dict.")
        extraction_payload = entry.get("extraction") if "extraction" in entry else entry
        extraction = ExtractedIrsForm990PfDataSchema.model_validate(extraction_payload)
        year = _resolve_year(entry, extraction)
        if not organisation_ein:
            organisation_ein = extraction.core_organization_metadata.ein
            organisation_name = extraction.core_organization_metadata.legal_name
        elif extraction.core_organization_metadata.ein != organisation_ein:
            raise ValueError(
                "All payload entries must belong to the same organization."
            )
        bundles.append(SnapshotBundle(year=year, extraction=extraction))

    # Growth figures are computed against the preceding snapshot, so the
    # series has to be chronological before snapshots are built.
    bundles.sort(key=lambda bundle: bundle.year)
    snapshots = build_snapshots(bundles)
    metrics = build_key_metrics(snapshots)

    def _metric_named(name: str):
        # Look metrics up by name so the notes do not depend on the order in
        # which build_key_metrics happens to emit them.
        return next((m for m in metrics if m.name == name), None)

    notes: List[str] = []
    revenue_metric = _metric_named("Total Revenue")
    expense_metric = _metric_named("Total Expenses")
    surplus_metric = _metric_named("Operating Surplus")
    if revenue_metric is not None and revenue_metric.cagr is not None:
        notes.append(f"Revenue CAGR: {revenue_metric.cagr:.2%}")
    if expense_metric is not None and expense_metric.cagr is not None:
        notes.append(f"Expense CAGR: {expense_metric.cagr:.2%}")
    if surplus_metric is not None:
        last_value = surplus_metric.points[-1].value if surplus_metric.points else 0
        notes.append(f"Latest operating surplus: {last_value:,.0f}")

    state = AnalystState(
        organisation_name=organisation_name,
        organisation_ein=organisation_ein,
        series=snapshots,
        key_metrics=metrics,
        notes=notes,
    )
    prompt = (
        "Analyze the provided multi-year financial context. Quantify notable trends, "
        "call out risks or strengths, and supply actionable recommendations. "
        "Capture both positive momentum and areas requiring attention."
    )
    result = await agent.run(prompt, deps=state)
    report = result.output

    # Overwrite the fields that are known deterministically so the returned
    # report never carries model-generated identifiers or metrics.
    years = [snapshot.year for snapshot in snapshots]
    return report.model_copy(
        update={
            "organisation_name": organisation_name,
            "organisation_ein": organisation_ein,
            "years_analyzed": years,
            "key_metrics": metrics,
        }
    )

View File

@@ -0,0 +1,33 @@
from __future__ import annotations
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.azure import AzureProvider
from app.core.config import settings
from .models import AnalystReport, AnalystState
# Azure OpenAI connection details come from the application settings.
provider = AzureProvider(
    azure_endpoint=settings.AZURE_OPENAI_ENDPOINT,
    api_version=settings.AZURE_OPENAI_API_VERSION,
    api_key=settings.AZURE_OPENAI_API_KEY,
)

model = OpenAIChatModel(model_name="gpt-4o", provider=provider)

# Instructions for the analyst agent, kept separate from the wiring below so
# the prompt text can be read on its own.
_SYSTEM_PROMPT = (
    "You are a nonprofit financial analyst. You receive multi-year Form 990 extractions "
    "summarized into deterministic metrics (series, ratios, surplus, CAGR). Use the context "
    "to highlight performance trends, governance implications, and forward-looking risks. "
    "Focus on numeric trends: revenue growth, expense discipline, surplus stability, "
    "program-vs-admin mix, and fundraising efficiency. Provide concise bullet insights, "
    "clear recommendations tied to the data, and a balanced outlook (strengths vs watch items). "
    "Only cite facts available in the provided series—do not invent figures."
)

# The agent takes an AnalystState as dependencies and produces an AnalystReport.
agent = Agent(
    model=model,
    name="MultiYearAnalyst",
    deps_type=AnalystState,
    output_type=AnalystReport,
    system_prompt=_SYSTEM_PROMPT,
)

View File

@@ -0,0 +1,197 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Iterable, List, Sequence, Tuple
from app.agents.form_auditor.models import ExtractedIrsForm990PfDataSchema
from .models import TrendDirection, TrendMetric, TrendMetricPoint, YearlySnapshot
@dataclass
class SnapshotBundle:
    """Pair one extraction with the filing year it belongs to."""

    # Filing year used to order bundles and to label the resulting snapshot.
    year: int
    # Extraction whose revenue and expense breakdowns feed the snapshot.
    extraction: ExtractedIrsForm990PfDataSchema
def _safe_ratio(numerator: float, denominator: float) -> float | None:
if denominator in (0, None):
return None
try:
return numerator / denominator
except ZeroDivisionError:
return None
def _growth(current: float, previous: float | None) -> float | None:
if previous in (None, 0):
return None
try:
return (current - previous) / previous
except ZeroDivisionError:
return None
def _direction_from_points(values: Sequence[float | None]) -> TrendDirection:
    """Classify a series as stable, volatile, improving or declining.

    Missing values are ignored. With fewer than two remaining observations
    the series is stable. Otherwise the net change from first to last
    observation is compared with a tolerance (2% of the starting magnitude,
    or 0.01 when the start is zero); within tolerance means stable. For
    series longer than two points, the number of reversals between
    consecutive moves is counted and the series is volatile when that count
    reaches half the number of observations (integer division). Anything
    else is improving or declining according to the sign of the net change.
    """
    observed = [value for value in values if value is not None]
    if len(observed) < 2:
        return TrendDirection.STABLE

    first, last = observed[0], observed[-1]
    net_change = last - first
    threshold = 0.01 if not first else abs(first) * 0.02
    if abs(net_change) <= threshold:
        return TrendDirection.STABLE

    if len(observed) > 2:
        # Consecutive moves with opposite signs multiply to a negative number.
        moves = [later - earlier for earlier, later in zip(observed, observed[1:])]
        reversals = sum(
            1 for move, next_move in zip(moves, moves[1:]) if move * next_move < 0
        )
        if reversals >= len(observed) // 2:
            return TrendDirection.VOLATILE

    if net_change > 0:
        return TrendDirection.IMPROVING
    return TrendDirection.DECLINING
def _cagr(start: float | None, end: float | None, periods: int) -> float | None:
if start is None or end is None or start <= 0 or end <= 0 or periods <= 0:
return None
return (end / start) ** (1 / periods) - 1
def build_snapshots(bundles: Sequence[SnapshotBundle]) -> List[YearlySnapshot]:
    """Turn extraction bundles into one financial snapshot per bundle.

    Bundles are processed in the order given. Revenue and expense growth are
    measured against the bundle that came immediately before, so the first
    snapshot has no growth figures. Expense ratios are relative to total
    expenses and the net margin is the surplus relative to total revenue.
    """
    series: List[YearlySnapshot] = []
    prior_revenue = None
    prior_expenses = None

    for bundle in bundles:
        revenue = bundle.extraction.revenue_breakdown.total_revenue
        expense_part = bundle.extraction.expenses_breakdown
        expenses = expense_part.total_expenses
        program = expense_part.program_services_expenses
        admin = expense_part.management_general_expenses
        fundraising = expense_part.fundraising_expenses

        snapshot = YearlySnapshot(
            year=bundle.year,
            total_revenue=revenue,
            total_expenses=expenses,
            revenue_growth=_growth(revenue, prior_revenue),
            expense_growth=_growth(expenses, prior_expenses),
            surplus=revenue - expenses,
            program_ratio=_safe_ratio(program, expenses),
            admin_ratio=_safe_ratio(admin, expenses),
            fundraising_ratio=_safe_ratio(fundraising, expenses),
            net_margin=_safe_ratio(revenue - expenses, revenue),
        )
        series.append(snapshot)

        # Remember this bundle's totals as the baseline for the next one.
        prior_revenue, prior_expenses = revenue, expenses

    return series
def _metric_from_series(
    name: str,
    unit: str,
    description: str,
    values: Iterable[Tuple[int, float | None]],
) -> TrendMetric:
    """Build a trend metric from ``(year, value)`` pairs.

    Missing values are stored as ``0.0``. Each point after the first gets
    its growth relative to the preceding point, and the overall direction
    and compound annual growth rate are derived from the full series.

    Args:
        name: Display name of the metric.
        unit: Unit label for the values.
        description: Short explanation of what the metric measures.
        values: ``(year, value)`` pairs, expected in chronological order.

    Returns:
        The populated metric. ``cagr`` is ``None`` when it cannot be computed
        (fewer than two points, a non-positive endpoint, or a first and last
        year that are not strictly increasing).
    """
    points = [
        TrendMetricPoint(year=year, value=value or 0.0, growth=None)
        for year, value in values
    ]
    for previous_point, point in zip(points, points[1:]):
        point.growth = _growth(point.value, previous_point.value)

    direction = _direction_from_points([point.value for point in points])

    cagr = None
    if len(points) >= 2:
        # Annualise over the calendar span rather than the number of
        # observations; otherwise gaps between filing years overstate the
        # per-year rate.
        elapsed_years = points[-1].year - points[0].year
        cagr = _cagr(points[0].value, points[-1].value, elapsed_years)

    return TrendMetric(
        name=name,
        unit=unit,
        description=description,
        points=points,
        cagr=cagr,
        direction=direction,
    )
def build_key_metrics(snapshots: Sequence[YearlySnapshot]) -> List[TrendMetric]:
    """Derive the headline trend metrics from yearly snapshots.

    Returns an empty list when there are no snapshots. Otherwise returns, in
    order: total revenue, total expenses, operating surplus, and the program,
    administrative and fundraising expense ratios. Ratio metrics and the
    surplus metric carry a fixed explanatory note.
    """
    if not snapshots:
        return []

    # (name, unit, description, snapshot attribute, replace None with 0.0)
    specs = [
        ("Total Revenue", "USD", "Reported total revenue in Part I.",
         "total_revenue", False),
        ("Total Expenses", "USD", "Reported total expenses in Part I.",
         "total_expenses", False),
        ("Operating Surplus", "USD",
         "Difference between total revenue and total expenses.",
         "surplus", False),
        ("Program Service Ratio", "Ratio",
         "Program service expenses divided by total expenses.",
         "program_ratio", True),
        ("Administrative Ratio", "Ratio",
         "Management & general expenses divided by total expenses.",
         "admin_ratio", True),
        ("Fundraising Ratio", "Ratio",
         "Fundraising expenses divided by total expenses.",
         "fundraising_ratio", True),
    ]

    metrics: List[TrendMetric] = []
    for name, unit, description, attribute, zero_fill in specs:
        series = []
        for snap in snapshots:
            value = getattr(snap, attribute)
            if zero_fill and value is None:
                value = 0.0
            series.append((snap.year, value))

        metric = _metric_from_series(name, unit, description, series)
        if metric.name.endswith("Ratio"):
            metric.notes = "Higher values indicate greater spending share."
        elif metric.name == "Operating Surplus":
            metric.notes = "Positive surplus implies revenues exceeded expenses."
        metrics.append(metric)

    return metrics

View File

@@ -0,0 +1,74 @@
from __future__ import annotations
from enum import Enum
from typing import List
from pydantic import BaseModel, Field
class TrendDirection(str, Enum):
    # Qualitative label for how a metric moved over the analysed period.
    # Members are strings, so they compare equal to their plain-text values.
    IMPROVING = "Improving"
    DECLINING = "Declining"
    STABLE = "Stable"
    VOLATILE = "Volatile"
class TrendMetricPoint(BaseModel):
    # A single observation of a metric for one year.
    year: int
    value: float
    # Left unset when there is no earlier point to compare against.
    growth: float | None = Field(
        None, description="Year-over-year growth expressed as a decimal."
    )
class TrendMetric(BaseModel):
    # A named metric tracked across years, with its derived trend summary.
    name: str
    unit: str
    description: str
    points: List[TrendMetricPoint]
    cagr: float | None = Field(
        None,
        description="Compound annual growth rate across the analyzed period.",
    )
    direction: TrendDirection = Field(
        TrendDirection.STABLE, description="Overall direction of the metric."
    )
    # Optional free-text guidance on how to read the metric.
    notes: str | None = None
class TrendInsight(BaseModel):
    # One qualitative finding in the analyst report.
    category: str
    direction: TrendDirection
    summary: str
    # Constrained to the closed interval [0.0, 1.0].
    confidence: float = Field(0.7, ge=0.0, le=1.0)
class AnalystReport(BaseModel):
    # Structured output of the analyst agent.
    # Identity of the organization the report is about.
    organisation_name: str
    organisation_ein: str
    # Quantitative context: which years were covered and the computed metrics.
    years_analyzed: List[int] = Field(default_factory=list)
    key_metrics: List[TrendMetric] = Field(default_factory=list)
    # Narrative part of the report.
    insights: List[TrendInsight] = Field(default_factory=list)
    recommendations: List[str] = Field(default_factory=list)
    outlook: str = "Pending analysis"
class YearlySnapshot(BaseModel):
    # Financial summary of one filing year.
    year: int
    total_revenue: float
    total_expenses: float
    # Change versus the preceding snapshot, as a decimal; None when unavailable.
    revenue_growth: float | None = None
    expense_growth: float | None = None
    # Total revenue minus total expenses.
    surplus: float | None = None
    # Expense categories as a share of total expenses; None when undefined.
    program_ratio: float | None = None
    admin_ratio: float | None = None
    fundraising_ratio: float | None = None
    # Surplus as a share of total revenue; None when undefined.
    net_margin: float | None = None
class AnalystState(BaseModel):
    # Context object passed to the analyst agent as its dependencies.
    organisation_name: str
    organisation_ein: str
    # Yearly snapshots and the trend metrics derived from them.
    series: List[YearlySnapshot]
    key_metrics: List[TrendMetric]
    # Short pre-computed highlights to accompany the series.
    notes: List[str] = Field(default_factory=list)

View File

@@ -106,6 +106,11 @@ class CoreOrganizationMetadata(BaseModel):
incorporation_state: str = Field(
..., description="State of incorporation.", title="Incorporation State"
)
calendar_year: str | None = Field(
default=None,
description="Calendar year covered by the return (if different from fiscal year).",
title="Calendar Year",
)
class RevenueBreakdown(BaseModel):
@@ -579,6 +584,7 @@ def _transform_flat_payload(data: dict[str, Any]) -> dict[str, Any]:
"organization_type": get_str("organization_type"),
"year_of_formation": get_str("year_of_formation"),
"incorporation_state": get_str("incorporation_state"),
"calendar_year": get_str("calendar_year"),
},
"revenue_breakdown": {
"total_revenue": get_value("total_revenue"),

View File

@@ -1,4 +1,5 @@
import json
import logging
from dataclasses import dataclass
from typing import Annotated, Any
@@ -10,7 +11,7 @@ from pydantic_ai.ui.vercel_ai import VercelAIAdapter
from starlette.requests import Request
from starlette.responses import Response
from app.agents import form_auditor, web_search
from app.agents import analyst, form_auditor, web_search
from app.core.config import settings
from app.services.extracted_data_service import get_extracted_data_service
@@ -24,27 +25,43 @@ model = OpenAIChatModel(model_name="gpt-4o", provider=provider)
@dataclass
class Deps:
    """Dependencies made available to the agent's tools for one request."""

    # One extracted-data payload per document. Declared once: a second
    # annotation for the same name would silently override the first.
    extracted_data: list[dict[str, Any]]
# Top-level chat agent; tools receive a Deps instance through the run context.
agent = Agent(
    model=model,
    deps_type=Deps,
)

# All routes in this module are mounted under /api/v1/agent.
router = APIRouter(
    prefix="/api/v1/agent",
    tags=["Agent"],
)

logger = logging.getLogger(__name__)
@agent.tool
async def build_audit_report(ctx: RunContext[Deps]):
    """Calls the audit subagent to get a full audit report of the organization"""
    # NOTE: the docstring above is left untouched on purpose; the agent
    # framework exposes it to the model as the tool description.
    documents = ctx.deps.extracted_data
    if not documents:
        # Fail with a clear message instead of an IndexError on documents[0].
        raise ValueError("No extracted data available for audit.")

    # The audit subagent works on a single extraction; use the first one.
    result = await form_auditor.build_audit_report(documents[0])
    return result.model_dump()
@agent.tool
async def build_analysis_report(ctx: RunContext[Deps]):
    """Calls the analyst subagent to get a full report of the organization's performance across years"""
    # NOTE: the docstring above is left untouched on purpose; the agent
    # framework exposes it to the model as the tool description.
    payloads = ctx.deps.extracted_data
    if not payloads:
        raise ValueError("No extracted data available for analysis.")

    single_year = len(payloads) == 1
    if single_year:
        # Not an error: the report is still produced, just with little trend data.
        logger.info(
            "build_analysis_report called with single-year data; report will still be generated but trends may be limited."
        )

    report = await analyst.build_performance_report(payloads)
    return report.model_dump()
@agent.tool_plain
async def search_web_information(query: str, max_results: int = 5):
"""Search the web for up-to-date information using Tavily. Use this when you need current information, news, research, or facts not in your knowledge base."""
@@ -61,6 +78,8 @@ async def chat(request: Request, tema: Annotated[str, Header()]) -> Response:
extracted_data = [doc.get_extracted_data() for doc in data]
deps = Deps(extracted_data=extracted_data[0])
logger.info(f"Extracted data amount: {len(extracted_data)}")
deps = Deps(extracted_data=extracted_data)
return await VercelAIAdapter.dispatch_request(request, agent=agent, deps=deps)

View File

@@ -12,6 +12,15 @@
"max_value": null,
"pattern": "^\\d{2}-\\d{7}$"
},
{
"name": "calendar_year",
"type": "integer",
"description": "Calendar year for which the data is reported",
"required": true,
"min_value": null,
"max_value": null,
"pattern": null
},
{
"name": "legal_name",
"type": "string",
@@ -764,4 +773,4 @@
"updated_at": "2025-11-07T23:45:00.000000",
"tema": "IRS_FORM_990PF",
"is_global": true
}
}