add analysis component

2025-11-09 10:24:58 -06:00
parent 77a11ef32e
commit 1ce4162e4a
11 changed files with 1155 additions and 8 deletions
--- a/backend/app/agents/analyst/init.py
+++ b/backend/app/agents/analyst/init.py
@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+from typing import Any, Iterable, List
+
+from app.agents.form_auditor.models import ExtractedIrsForm990PfDataSchema
+
+from .agent import agent
+from .metrics import SnapshotBundle, build_key_metrics, build_snapshots
+from .models import AnalystReport, AnalystState
+
+__all__ = ["build_performance_report"]
+
+
+def _resolve_year(
+    entry: dict[str, Any], extraction: ExtractedIrsForm990PfDataSchema
+) -> int:
+    """Return the filing year for one payload entry as an ``int``.
+
+    Candidates are tried in priority order: the entry's top-level
+    ``calendar_year``, ``year``, ``tax_year`` and ``return_year`` keys, then
+    ``return_year`` and ``tax_year`` under a dict-valued ``metadata`` key, and
+    finally ``extraction.core_organization_metadata.calendar_year``. ``None``
+    and empty-string candidates are skipped, as are values ``int()`` rejects.
+
+    Raises:
+        ValueError: If no candidate converts to an integer.
+    """
+    raw_metadata = entry.get("metadata")
+    # A non-dict ``metadata`` value contributes no usable candidates.
+    metadata: dict[str, Any] = raw_metadata if isinstance(raw_metadata, dict) else {}
+
+    top_level = [
+        entry.get(key) for key in ("calendar_year", "year", "tax_year", "return_year")
+    ]
+    nested = [metadata.get(key) for key in ("return_year", "tax_year")]
+    fallback = extraction.core_organization_metadata.calendar_year
+
+    for raw_year in (*top_level, *nested, fallback):
+        if raw_year in (None, ""):
+            continue
+        try:
+            return int(raw_year)
+        except (TypeError, ValueError):
+            continue
+    raise ValueError("Unable to determine filing year for one of the payload entries.")
+
+
+def _summary_notes(metrics: Iterable[Any]) -> List[str]:
+    """Build short headline notes from the key metrics, looked up by name."""
+    by_name = {metric.name: metric for metric in metrics}
+    notes: List[str] = []
+
+    revenue = by_name.get("Total Revenue")
+    if revenue is not None and revenue.cagr is not None:
+        notes.append(f"Revenue CAGR: {revenue.cagr:.2%}")
+
+    expenses = by_name.get("Total Expenses")
+    if expenses is not None and expenses.cagr is not None:
+        notes.append(f"Expense CAGR: {expenses.cagr:.2%}")
+
+    surplus = by_name.get("Operating Surplus")
+    # Only report a latest surplus when there is an actual data point; a
+    # placeholder figure would hand the agent a number that was never reported.
+    if surplus is not None and surplus.points:
+        notes.append(f"Latest operating surplus: {surplus.points[-1].value:,.0f}")
+
+    return notes
+
+
+async def build_performance_report(payloads: List[dict[str, Any]]) -> AnalystReport:
+    """Build a multi-year performance report for a single organisation.
+
+    Each payload is validated into an ``ExtractedIrsForm990PfDataSchema`` (from
+    its ``"extraction"`` key when present, otherwise from the entry itself),
+    tagged with its filing year and sorted chronologically. Snapshots and key
+    metrics are computed deterministically and passed to the analyst agent as
+    dependencies. The agent's report is returned with the organisation
+    identity, analysed years and key metrics overwritten by the computed
+    values.
+
+    Args:
+        payloads: One dict per filing year, all for the same EIN.
+
+    Returns:
+        The agent's ``AnalystReport`` with the computed fields applied.
+
+    Raises:
+        ValueError: If ``payloads`` is empty, a filing year cannot be resolved,
+            or the entries do not all share one EIN.
+        TypeError: If an entry is not a dict.
+
+    Errors raised by ``model_validate`` for malformed payloads propagate
+    unchanged.
+    """
+    if not payloads:
+        raise ValueError("At least one payload is required for performance analysis.")
+
+    bundles: List[SnapshotBundle] = []
+
+    organisation_name = ""
+    organisation_ein = ""
+
+    for entry in payloads:
+        if not isinstance(entry, dict):
+            raise TypeError("Each payload entry must be a dict.")
+
+        extraction_payload = entry.get("extraction", entry)
+        extraction = ExtractedIrsForm990PfDataSchema.model_validate(extraction_payload)
+        year = _resolve_year(entry, extraction)
+
+        organisation = extraction.core_organization_metadata
+        if not organisation_ein:
+            # The first entry with a non-empty EIN fixes the organisation identity.
+            organisation_ein = organisation.ein
+            organisation_name = organisation.legal_name
+        elif organisation.ein != organisation_ein:
+            raise ValueError("All payload entries must belong to the same organization.")
+
+        bundles.append(SnapshotBundle(year=year, extraction=extraction))
+
+    # Growth figures compare each year with the one before it, so order matters.
+    bundles.sort(key=lambda bundle: bundle.year)
+    snapshots = build_snapshots(bundles)
+    metrics = build_key_metrics(snapshots)
+
+    state = AnalystState(
+        organisation_name=organisation_name,
+        organisation_ein=organisation_ein,
+        series=snapshots,
+        key_metrics=metrics,
+        notes=_summary_notes(metrics),
+    )
+
+    prompt = (
+        "Analyze the provided multi-year financial context. Quantify notable trends, "
+        "call out risks or strengths, and supply actionable recommendations. "
+        "Capture both positive momentum and areas requiring attention."
+    )
+    result = await agent.run(prompt, deps=state)
+    report = result.output
+
+    # Overwrite identity and numeric fields with the computed values so the
+    # returned report does not depend on what the model echoed back.
+    return report.model_copy(
+        update={
+            "organisation_name": organisation_name,
+            "organisation_ein": organisation_ein,
+            "years_analyzed": [snapshot.year for snapshot in snapshots],
+            "key_metrics": metrics,
+        }
+    )
--- a/backend/app/agents/analyst/agent.py
+++ b/backend/app/agents/analyst/agent.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+from pydantic_ai import Agent
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.azure import AzureProvider
+
+from app.core.config import settings
+
+from .models import AnalystReport, AnalystState
+
+# Azure OpenAI connection; endpoint, API version and key all come from the
+# application settings object.
+provider = AzureProvider(
+    azure_endpoint=settings.AZURE_OPENAI_ENDPOINT,
+    api_version=settings.AZURE_OPENAI_API_VERSION,
+    api_key=settings.AZURE_OPENAI_API_KEY,
+)
+
+# Chat model used by the analyst agent, served through the provider above.
+model = OpenAIChatModel(model_name="gpt-4o", provider=provider)
+
+# Analyst agent: takes an AnalystState as dependencies and is declared to
+# produce an AnalystReport. Built at import time, so importing this module
+# reads the settings values used above.
+agent = Agent(
+    model=model,
+    name="MultiYearAnalyst",
+    deps_type=AnalystState,
+    output_type=AnalystReport,
+    system_prompt=(
+        "You are a nonprofit financial analyst. You receive multi-year Form 990 extractions "
+        "summarized into deterministic metrics (series, ratios, surplus, CAGR). Use the context "
+        "to highlight performance trends, governance implications, and forward-looking risks. "
+        "Focus on numeric trends: revenue growth, expense discipline, surplus stability, "
+        "program-vs-admin mix, and fundraising efficiency. Provide concise bullet insights, "
+        "clear recommendations tied to the data, and a balanced outlook (strengths vs watch items). "
+        "Only cite facts available in the provided series—do not invent figures."
+    ),
+)
--- a/backend/app/agents/analyst/metrics.py
+++ b/backend/app/agents/analyst/metrics.py
@@ -0,0 +1,197 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable, List, Sequence, Tuple
+
+from app.agents.form_auditor.models import ExtractedIrsForm990PfDataSchema
+
+from .models import TrendDirection, TrendMetric, TrendMetricPoint, YearlySnapshot
+
+
+@dataclass
+class SnapshotBundle:
+    """Pairs one validated Form 990-PF extraction with its filing year."""
+
+    # Filing year the extraction belongs to.
+    year: int
+    # Validated extraction that the yearly figures are read from.
+    extraction: ExtractedIrsForm990PfDataSchema
+
+
+def _safe_ratio(numerator: float | None, denominator: float | None) -> float | None:
+    """Return ``numerator / denominator``, or ``None`` when it is undefined.
+
+    The ratio is undefined when either operand is ``None`` or the denominator
+    is zero. A missing numerator yields ``None`` instead of raising
+    ``TypeError``.
+    """
+    if numerator is None or denominator is None or denominator == 0:
+        return None
+    return numerator / denominator
+
+
+def _growth(current: float | None, previous: float | None) -> float | None:
+    """Return the relative change from ``previous`` to ``current`` as a decimal.
+
+    Returns ``None`` when either value is missing or ``previous`` is zero,
+    since the change is undefined in those cases.
+
+    The change is divided by ``abs(previous)`` so the sign of the result always
+    follows the direction of movement. With a signed base, a deficit shrinking
+    from -100 to -50 would be reported as -50% instead of +50%.
+    """
+    if current is None or previous is None or previous == 0:
+        return None
+    return (current - previous) / abs(previous)
+
+
+def _direction_from_points(values: Sequence[float | None]) -> TrendDirection:
+    """Classify a series as stable, volatile, improving or declining.
+
+    ``None`` entries are ignored. Fewer than two remaining values, or a net
+    first-to-last change within tolerance, is ``STABLE``. Otherwise a series
+    with at least ``len // 2`` reversals between consecutive steps is
+    ``VOLATILE``; anything else is ``IMPROVING`` for a net rise and
+    ``DECLINING`` for a net fall.
+    """
+    observed = [value for value in values if value is not None]
+    if len(observed) < 2:
+        return TrendDirection.STABLE
+
+    first, last = observed[0], observed[-1]
+    net_change = last - first
+    # Tolerance is 2% of the starting magnitude, or 0.01 when the start is zero.
+    band = abs(first) * 0.02 if first else 0.01
+    if abs(net_change) <= band:
+        return TrendDirection.STABLE
+
+    if len(observed) > 2:
+        steps = [later - earlier for earlier, later in zip(observed, observed[1:])]
+        # A reversal is two consecutive steps with opposite signs.
+        reversals = sum(
+            1 for before, after in zip(steps, steps[1:]) if before * after < 0
+        )
+        if reversals >= len(observed) // 2:
+            return TrendDirection.VOLATILE
+
+    if net_change > 0:
+        return TrendDirection.IMPROVING
+    return TrendDirection.DECLINING
+
+
+def _cagr(start: float | None, end: float | None, periods: int) -> float | None:
+    """Return the compound annual growth rate from ``start`` to ``end``.
+
+    Returns ``None`` when either value is missing or non-positive, or when
+    ``periods`` is not positive.
+    """
+    if start is None or end is None:
+        return None
+    if start <= 0 or end <= 0:
+        return None
+    if periods <= 0:
+        return None
+    annual_factor = (end / start) ** (1 / periods)
+    return annual_factor - 1
+
+
+def build_snapshots(bundles: Sequence[SnapshotBundle]) -> List[YearlySnapshot]:
+    """Turn bundles into per-year snapshots of totals, growth and ratios.
+
+    Growth for each bundle is measured against the bundle before it in
+    ``bundles``, so the caller is expected to pass them in chronological order.
+    The first snapshot has no growth figures.
+    """
+    results: List[YearlySnapshot] = []
+    prior_revenue = None
+    prior_expenses = None
+
+    for item in bundles:
+        revenue = item.extraction.revenue_breakdown.total_revenue
+        expenses_breakdown = item.extraction.expenses_breakdown
+        expenses = expenses_breakdown.total_expenses
+        net = revenue - expenses
+
+        snapshot = YearlySnapshot(
+            year=item.year,
+            total_revenue=revenue,
+            total_expenses=expenses,
+            revenue_growth=_growth(revenue, prior_revenue),
+            expense_growth=_growth(expenses, prior_expenses),
+            surplus=net,
+            # Expense-category shares are taken relative to total expenses.
+            program_ratio=_safe_ratio(
+                expenses_breakdown.program_services_expenses, expenses
+            ),
+            admin_ratio=_safe_ratio(
+                expenses_breakdown.management_general_expenses, expenses
+            ),
+            fundraising_ratio=_safe_ratio(
+                expenses_breakdown.fundraising_expenses, expenses
+            ),
+            net_margin=_safe_ratio(net, revenue),
+        )
+        results.append(snapshot)
+
+        prior_revenue, prior_expenses = revenue, expenses
+
+    return results
+
+
+def _metric_from_series(
+    name: str,
+    unit: str,
+    description: str,
+    values: Iterable[Tuple[int, float | None]],
+) -> TrendMetric:
+    """Assemble a ``TrendMetric`` from ``(year, value)`` pairs.
+
+    Args:
+        name: Display name of the metric.
+        unit: Unit label stored on the metric.
+        description: Human-readable description stored on the metric.
+        values: ``(year, value)`` pairs in chronological order; a value of
+            ``None`` marks a year with no figure.
+
+    Returns:
+        A metric with one point per input pair plus direction and CAGR.
+
+    Missing values are stored as ``0.0`` because ``TrendMetricPoint.value`` is
+    a required float. They are kept out of the derived statistics: the point
+    itself gets no growth figure, direction is computed from the raw series,
+    and CAGR uses the raw first and last values. CAGR periods are the number
+    of years spanned, not the number of points, so gaps between filing years
+    do not inflate the rate.
+    """
+    series = list(values)  # materialise once; ``values`` may be a one-shot iterable
+
+    points: List[TrendMetricPoint] = []
+    previous: float | None = None
+    for year, value in series:
+        points.append(
+            TrendMetricPoint(
+                year=year,
+                value=value or 0.0,
+                # No growth for a missing observation; ``_growth`` itself
+                # returns None when the previous value is missing or zero.
+                growth=_growth(value, previous) if value is not None else None,
+            )
+        )
+        previous = value
+
+    direction = _direction_from_points([value for _, value in series])
+
+    cagr = None
+    if len(series) >= 2:
+        (first_year, first_value), (last_year, last_value) = series[0], series[-1]
+        # ``_cagr`` returns None for missing or non-positive endpoints and for
+        # a non-positive year span.
+        cagr = _cagr(first_value, last_value, last_year - first_year)
+
+    return TrendMetric(
+        name=name,
+        unit=unit,
+        description=description,
+        points=points,
+        cagr=cagr,
+        direction=direction,
+    )
+
+
+def build_key_metrics(snapshots: Sequence[YearlySnapshot]) -> List[TrendMetric]:
+    """Build the fixed set of trend metrics from yearly snapshots.
+
+    Returns an empty list when there are no snapshots. Otherwise returns, in
+    this order: total revenue, total expenses, operating surplus, and the
+    program, administrative and fundraising expense ratios.
+
+    Snapshot values are passed to ``_metric_from_series`` unchanged, including
+    ``None`` for an undefined ratio, so the handling of missing data lives in
+    that one helper rather than being repeated here.
+    """
+    if not snapshots:
+        return []
+
+    # (name, unit, description, YearlySnapshot attribute holding the value)
+    specs = (
+        (
+            "Total Revenue",
+            "USD",
+            "Reported total revenue in Part I.",
+            "total_revenue",
+        ),
+        (
+            "Total Expenses",
+            "USD",
+            "Reported total expenses in Part I.",
+            "total_expenses",
+        ),
+        (
+            "Operating Surplus",
+            "USD",
+            "Difference between total revenue and total expenses.",
+            "surplus",
+        ),
+        (
+            "Program Service Ratio",
+            "Ratio",
+            "Program service expenses divided by total expenses.",
+            "program_ratio",
+        ),
+        (
+            "Administrative Ratio",
+            "Ratio",
+            "Management & general expenses divided by total expenses.",
+            "admin_ratio",
+        ),
+        (
+            "Fundraising Ratio",
+            "Ratio",
+            "Fundraising expenses divided by total expenses.",
+            "fundraising_ratio",
+        ),
+    )
+
+    metrics = [
+        _metric_from_series(
+            name,
+            unit,
+            description,
+            [(snap.year, getattr(snap, attribute)) for snap in snapshots],
+        )
+        for name, unit, description, attribute in specs
+    ]
+
+    for metric in metrics:
+        if metric.name.endswith("Ratio"):
+            metric.notes = "Higher values indicate greater spending share."
+        elif metric.name == "Operating Surplus":
+            metric.notes = "Positive surplus implies revenues exceeded expenses."
+
+    return metrics
--- a/backend/app/agents/analyst/models.py
+++ b/backend/app/agents/analyst/models.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+from enum import Enum
+from typing import List
+
+from pydantic import BaseModel, Field
+
+
+class TrendDirection(str, Enum):
+    """String-valued labels for the overall direction of a metric series."""
+
+    IMPROVING = "Improving"
+    DECLINING = "Declining"
+    STABLE = "Stable"
+    VOLATILE = "Volatile"
+
+
+class TrendMetricPoint(BaseModel):
+    """One yearly observation of a metric, with optional year-over-year growth."""
+
+    # Year the observation belongs to.
+    year: int
+    # Metric value for that year; required, so it can never be None.
+    value: float
+    growth: float | None = Field(
+        default=None, description="Year-over-year growth expressed as a decimal."
+    )
+
+
+class TrendMetric(BaseModel):
+    """A named metric series with its points and summary statistics."""
+
+    # Display name, e.g. "Total Revenue".
+    name: str
+    # Unit label for the values, e.g. "USD" or "Ratio".
+    unit: str
+    # Human-readable explanation of what the metric measures.
+    description: str
+    # One point per analysed year.
+    points: List[TrendMetricPoint]
+    cagr: float | None = Field(
+        default=None,
+        description="Compound annual growth rate across the analyzed period.",
+    )
+    direction: TrendDirection = Field(
+        default=TrendDirection.STABLE, description="Overall direction of the metric."
+    )
+    # Optional free-text guidance on how to read the metric.
+    notes: str | None = None
+
+
+class TrendInsight(BaseModel):
+    """A single qualitative finding carried in an analyst report."""
+
+    # Free-form grouping label for the finding.
+    category: str
+    # Direction the finding attributes to its subject.
+    direction: TrendDirection
+    # Text of the finding.
+    summary: str
+    # Confidence in the finding, constrained to [0.0, 1.0]; defaults to 0.7.
+    confidence: float = Field(default=0.7, ge=0.0, le=1.0)
+
+
+class AnalystReport(BaseModel):
+    """Multi-year performance report for one organisation."""
+
+    # Identity of the organisation the report covers.
+    organisation_name: str
+    organisation_ein: str
+    # Years included in the analysis.
+    years_analyzed: List[int] = Field(default_factory=list)
+    # Metric series backing the narrative sections.
+    key_metrics: List[TrendMetric] = Field(default_factory=list)
+    # Qualitative findings and follow-up actions.
+    insights: List[TrendInsight] = Field(default_factory=list)
+    recommendations: List[str] = Field(default_factory=list)
+    # Overall outlook text; placeholder value until an analysis fills it in.
+    outlook: str = "Pending analysis"
+
+
+class YearlySnapshot(BaseModel):
+    """Financial figures and derived ratios for a single year."""
+
+    year: int
+    total_revenue: float
+    total_expenses: float
+    # Growth versus the preceding snapshot; None when it cannot be computed.
+    revenue_growth: float | None = None
+    expense_growth: float | None = None
+    # Total revenue minus total expenses.
+    surplus: float | None = None
+    # Expense-category shares of total expenses; None when undefined.
+    program_ratio: float | None = None
+    admin_ratio: float | None = None
+    fundraising_ratio: float | None = None
+    # Surplus as a share of total revenue; None when undefined.
+    net_margin: float | None = None
+
+
+class AnalystState(BaseModel):
+    """Context supplied to the analyst agent for one organisation."""
+
+    organisation_name: str
+    organisation_ein: str
+    # Per-year snapshots.
+    series: List[YearlySnapshot]
+    # Metric series derived from the snapshots.
+    key_metrics: List[TrendMetric]
+    # Short pre-computed headline notes.
+    notes: List[str] = Field(default_factory=list)
--- a/backend/app/agents/form_auditor/models.py
+++ b/backend/app/agents/form_auditor/models.py
@@ -106,6 +106,11 @@ class CoreOrganizationMetadata(BaseModel):
    incorporation_state: str = Field(
        ..., description="State of incorporation.", title="Incorporation State"
    )
+    calendar_year: str | None = Field(
+        default=None,
+        description="Calendar year covered by the return (if different from fiscal year).",
+        title="Calendar Year",
+    )


 class RevenueBreakdown(BaseModel):
@@ -579,6 +584,7 @@ def _transform_flat_payload(data: dict[str, Any]) -> dict[str, Any]:
            "organization_type": get_str("organization_type"),
            "year_of_formation": get_str("year_of_formation"),
            "incorporation_state": get_str("incorporation_state"),
+            "calendar_year": get_str("calendar_year"),
        },
        "revenue_breakdown": {
            "total_revenue": get_value("total_revenue"),
--- a/backend/app/routers/agent.py
+++ b/backend/app/routers/agent.py
@@ -1,4 +1,5 @@
 import json
+import logging
 from dataclasses import dataclass
 from typing import Annotated, Any

@@ -10,7 +11,7 @@ from pydantic_ai.ui.vercel_ai import VercelAIAdapter
 from starlette.requests import Request
 from starlette.responses import Response

-from app.agents import form_auditor, web_search
+from app.agents import analyst, form_auditor, web_search
 from app.core.config import settings
 from app.services.extracted_data_service import get_extracted_data_service

@@ -24,27 +25,43 @@ model = OpenAIChatModel(model_name="gpt-4o", provider=provider)

@dataclass
 class Deps:
-    extracted_data: dict[str, Any]
+    extracted_data: list[dict[str, Any]]


 agent = Agent(model=model, deps_type=Deps)

 router = APIRouter(prefix="/api/v1/agent", tags=["Agent"])

+logger = logging.getLogger(__name__)
+

@agent.tool
 async def build_audit_report(ctx: RunContext[Deps]):
    """Calls the audit subagent to get a full audit report of the organization"""
-    data = ctx.deps.extracted_data
-
-    with open("data/audit_report.json", "w") as f:
-        json.dump(data, f)
+    data = ctx.deps.extracted_data[0]

    result = await form_auditor.build_audit_report(data)

    return result.model_dump()


+@agent.tool
+async def build_analysis_report(ctx: RunContext[Deps]):
+    """Calls the analyst subagent to get a full report of the organization's performance across years"""
+    # NOTE: the docstring above is kept verbatim; it may be surfaced to the
+    # model as this tool's description.
+    payloads = ctx.deps.extracted_data
+    if not payloads:
+        raise ValueError("No extracted data available for analysis.")
+
+    # A single year is still analysed; it just cannot show a trend.
+    single_year = len(payloads) == 1
+    if single_year:
+        logger.info(
+            "build_analysis_report called with single-year data; report will still be generated but trends may be limited."
+        )
+
+    report = await analyst.build_performance_report(payloads)
+    return report.model_dump()
+
+
@agent.tool_plain
 async def search_web_information(query: str, max_results: int = 5):
    """Search the web for up-to-date information using Tavily. Use this when you need current information, news, research, or facts not in your knowledge base."""
@@ -61,6 +78,8 @@ async def chat(request: Request, tema: Annotated[str, Header()]) -> Response:

    extracted_data = [doc.get_extracted_data() for doc in data]

-    deps = Deps(extracted_data=extracted_data[0])
+    logger.info(f"Extracted data amount: {len(extracted_data)}")
+
+    deps = Deps(extracted_data=extracted_data)

    return await VercelAIAdapter.dispatch_request(request, agent=agent, deps=deps)
--- a/backend/data/schemas/schema_103b7090a542.json
+++ b/backend/data/schemas/schema_103b7090a542.json
@@ -12,6 +12,15 @@
      "max_value": null,
      "pattern": "^\\d{2}-\\d{7}$"
    },
+    {
+      "name": "calendar_year",
+      "type": "integer",
+      "description": "Calendar year for which the data is reported",
+      "required": true,
+      "min_value": null,
+      "max_value": null,
+      "pattern": null
+    },
    {
      "name": "legal_name",
      "type": "string",
@@ -764,4 +773,4 @@
  "updated_at": "2025-11-07T23:45:00.000000",
  "tema": "IRS_FORM_990PF",
  "is_global": true
-}
+}