wip chat
@@ -6,6 +6,7 @@ from pydantic_ai.ui.vercel_ai import VercelAIAdapter
from starlette.requests import Request
from starlette.responses import Response

from app.agents import form_auditor
from app.core.config import settings

provider = AzureProvider(
@@ -19,6 +20,347 @@ agent = Agent(model=model)
router = APIRouter(prefix="/api/v1/agent", tags=["Agent"])


@agent.tool_plain()
async def build_audit_report():
    """Calls the audit subagent to get a full audit report of the organization"""
    data = {
        "extraction": {
            "core_organization_metadata": {
                "ein": "84-2674654",
                "legal_name": "07 IN HEAVEN MEMORIAL SCHOLARSHIP",
                "phone_number": "(262) 215-0300",
                "website_url": "",
                "return_type": "990-PF",
                "amended_return": "No",
                "group_exemption_number": "",
                "subsection_code": "501(c)(3)",
                "ruling_date": "",
                "accounting_method": "Cash",
                "organization_type": "corporation",
                "year_of_formation": "",
                "incorporation_state": "WI",
            },
            "revenue_breakdown": {
                "total_revenue": 5227,
                "contributions_gifts_grants": 5227,
                "program_service_revenue": 0,
                "membership_dues": 0,
                "investment_income": 0,
                "gains_losses_sales_assets": 0,
                "rental_income": 0,
                "related_organizations_revenue": 0,
                "gaming_revenue": 0,
                "other_revenue": 0,
                "government_grants": 0,
                "foreign_contributions": 0,
            },
            "expenses_breakdown": {
                "total_expenses": 2104,
                "program_services_expenses": 0,
                "management_general_expenses": 0,
                "fundraising_expenses": 2104,
                "grants_us_organizations": 0,
                "grants_us_individuals": 0,
                "grants_foreign_organizations": 0,
                "grants_foreign_individuals": 0,
                "compensation_officers": 0,
                "compensation_other_staff": 0,
                "payroll_taxes_benefits": 0,
                "professional_fees": 0,
                "office_occupancy_costs": 0,
                "information_technology_costs": 0,
                "travel_conference_expenses": 0,
                "depreciation_amortization": 0,
                "insurance": 0,
            },
            "balance_sheet": {},
            "officers_directors_trustees_key_employees": [
                {
                    "name": "REBECCA TERPSTRA",
                    "title_position": "PRESIDENT",
                    "average_hours_per_week": 0.1,
                    "related_party_transactions": "",
                    "former_officer": "",
                    "governance_role": "",
                },
                {
                    "name": "ROBERT GUZMAN",
                    "title_position": "VICE PRESDEINT",
                    "average_hours_per_week": 0.1,
                    "related_party_transactions": "",
                    "former_officer": "",
                    "governance_role": "",
                },
                {
                    "name": "ANDREA VALENTI",
                    "title_position": "TREASURER",
                    "average_hours_per_week": 0.1,
                    "related_party_transactions": "",
                    "former_officer": "",
                    "governance_role": "",
                },
                {
                    "name": "BETHANY WALSH",
                    "title_position": "SECRETARY",
                    "average_hours_per_week": 0.1,
                    "related_party_transactions": "",
                    "former_officer": "",
                    "governance_role": "",
                },
            ],
            "governance_management_disclosure": {
                "governing_body_size": 4,
                "independent_members": 4,
                "financial_statements_reviewed": "",
                "form_990_provided_to_governing_body": "",
                "conflict_of_interest_policy": "",
                "whistleblower_policy": "",
                "document_retention_policy": "",
                "ceo_compensation_review_process": "",
                "public_disclosure_practices": "Yes",
            },
            "program_service_accomplishments": [],
            "fundraising_grantmaking": {
                "total_fundraising_event_revenue": 0,
                "total_fundraising_event_expenses": 2104,
                "professional_fundraiser_fees": 0,
            },
            "functional_operational_data": {
                "number_of_employees": 0,
                "number_of_volunteers": 0,
                "occupancy_costs": 0,
                "fundraising_method_descriptions": "",
                "joint_ventures_disregarded_entities": "",
            },
            "compensation_details": {
                "base_compensation": 0,
                "bonus": 0,
                "incentive": 0,
                "other": 0,
                "non_fixed_compensation": "",
                "first_class_travel": "",
                "housing_allowance": "",
                "expense_account_usage": "",
                "supplemental_retirement": "",
            },
            "political_lobbying_activities": {
                "lobbying_expenditures_direct": 0,
                "lobbying_expenditures_grassroots": 0,
                "election_501h_status": "",
                "political_campaign_expenditures": 0,
                "related_organizations_affiliates": "",
            },
            "investments_endowment": {
                "investment_types": "",
                "donor_restricted_endowment_values": 0,
                "net_appreciation_depreciation": 0,
                "related_organization_transactions": "",
                "loans_to_from_related_parties": "",
            },
            "tax_compliance_penalties": {
                "penalties_excise_taxes_reported": "No",
                "unrelated_business_income_disclosure": "No",
                "foreign_bank_account_reporting": "No",
                "schedule_o_narrative_explanations": "",
            },
        },
        "extraction_metadata": {
            "core_organization_metadata": {
                "ein": {"value": "84-2674654", "references": ["0-7"]},
                "legal_name": {
                    "value": "07 IN HEAVEN MEMORIAL SCHOLARSHIP",
                    "references": ["0-6"],
                },
                "phone_number": {"value": "(262) 215-0300", "references": ["0-a"]},
                "website_url": {"value": "", "references": []},
                "return_type": {
                    "value": "990-PF",
                    "references": ["4ade8ed0-bce7-4bd5-bd8d-190e3e4be95b"],
                },
                "amended_return": {
                    "value": "No",
                    "references": ["4ac9edc4-e9bb-430f-b4c4-a42bf4c04b28"],
                },
                "group_exemption_number": {"value": "", "references": []},
                "subsection_code": {
                    "value": "501(c)(3)",
                    "references": ["4ac9edc4-e9bb-430f-b4c4-a42bf4c04b28"],
                },
                "ruling_date": {"value": "", "references": []},
                "accounting_method": {"value": "Cash", "references": ["0-d"]},
                "organization_type": {
                    "value": "corporation",
                    "references": ["4ac9edc4-e9bb-430f-b4c4-a42bf4c04b28"],
                },
                "year_of_formation": {"value": "", "references": []},
                "incorporation_state": {
                    "value": "WI",
                    "references": ["4ac9edc4-e9bb-430f-b4c4-a42bf4c04b28"],
                },
            },
            "revenue_breakdown": {
                "total_revenue": {"value": 5227, "references": ["0-1z"]},
                "contributions_gifts_grants": {"value": 5227, "references": ["0-m"]},
                "program_service_revenue": {"value": 0, "references": []},
                "membership_dues": {"value": 0, "references": []},
                "investment_income": {"value": 0, "references": []},
                "gains_losses_sales_assets": {"value": 0, "references": []},
                "rental_income": {"value": 0, "references": []},
                "related_organizations_revenue": {"value": 0, "references": []},
                "gaming_revenue": {"value": 0, "references": []},
                "other_revenue": {"value": 0, "references": []},
                "government_grants": {"value": 0, "references": []},
                "foreign_contributions": {"value": 0, "references": []},
            },
            "expenses_breakdown": {
                "total_expenses": {"value": 2104, "references": ["0-2S"]},
                "program_services_expenses": {"value": 0, "references": []},
                "management_general_expenses": {"value": 0, "references": []},
                "fundraising_expenses": {"value": 2104, "references": ["13-d"]},
                "grants_us_organizations": {"value": 0, "references": []},
                "grants_us_individuals": {"value": 0, "references": []},
                "grants_foreign_organizations": {"value": 0, "references": []},
                "grants_foreign_individuals": {"value": 0, "references": []},
                "compensation_officers": {
                    "value": 0,
                    "references": ["5-1q", "5-1w", "5-1C", "5-1I"],
                },
                "compensation_other_staff": {"value": 0, "references": []},
                "payroll_taxes_benefits": {"value": 0, "references": []},
                "professional_fees": {"value": 0, "references": []},
                "office_occupancy_costs": {"value": 0, "references": []},
                "information_technology_costs": {"value": 0, "references": []},
                "travel_conference_expenses": {"value": 0, "references": []},
                "depreciation_amortization": {"value": 0, "references": []},
                "insurance": {"value": 0, "references": []},
            },
            "balance_sheet": {},
            "officers_directors_trustees_key_employees": [
                {
                    "name": {"value": "REBECCA TERPSTRA", "references": ["5-1o"]},
                    "title_position": {"value": "PRESIDENT", "references": ["5-1p"]},
                    "average_hours_per_week": {"value": 0.1, "references": ["5-1p"]},
                    "related_party_transactions": {"value": "", "references": []},
                    "former_officer": {"value": "", "references": []},
                    "governance_role": {"value": "", "references": []},
                },
                {
                    "name": {"value": "ROBERT GUZMAN", "references": ["5-1u"]},
                    "title_position": {
                        "value": "VICE PRESDEINT",
                        "references": ["5-1v"],
                    },
                    "average_hours_per_week": {"value": 0.1, "references": ["5-1v"]},
                    "related_party_transactions": {"value": "", "references": []},
                    "former_officer": {"value": "", "references": []},
                    "governance_role": {"value": "", "references": []},
                },
                {
                    "name": {"value": "ANDREA VALENTI", "references": ["5-1A"]},
                    "title_position": {"value": "TREASURER", "references": ["5-1B"]},
                    "average_hours_per_week": {"value": 0.1, "references": ["5-1B"]},
                    "related_party_transactions": {"value": "", "references": []},
                    "former_officer": {"value": "", "references": []},
                    "governance_role": {"value": "", "references": []},
                },
                {
                    "name": {"value": "BETHANY WALSH", "references": ["5-1G"]},
                    "title_position": {"value": "SECRETARY", "references": ["5-1H"]},
                    "average_hours_per_week": {"value": 0.1, "references": ["5-1H"]},
                    "related_party_transactions": {"value": "", "references": []},
                    "former_officer": {"value": "", "references": []},
                    "governance_role": {"value": "", "references": []},
                },
            ],
            "governance_management_disclosure": {
                "governing_body_size": {
                    "value": 4,
                    "references": ["5-1o", "5-1u", "5-1A", "5-1G"],
                },
                "independent_members": {
                    "value": 4,
                    "references": ["5-1o", "5-1u", "5-1A", "5-1G"],
                },
                "financial_statements_reviewed": {"value": "", "references": []},
                "form_990_provided_to_governing_body": {"value": "", "references": []},
                "conflict_of_interest_policy": {"value": "", "references": []},
                "whistleblower_policy": {"value": "", "references": []},
                "document_retention_policy": {"value": "", "references": []},
                "ceo_compensation_review_process": {"value": "", "references": []},
                "public_disclosure_practices": {"value": "Yes", "references": ["4-g"]},
            },
            "program_service_accomplishments": [],
            "fundraising_grantmaking": {
                "total_fundraising_event_revenue": {"value": 0, "references": []},
                "total_fundraising_event_expenses": {
                    "value": 2104,
                    "references": ["13-d"],
                },
                "professional_fundraiser_fees": {"value": 0, "references": []},
            },
            "functional_operational_data": {
                "number_of_employees": {"value": 0, "references": []},
                "number_of_volunteers": {"value": 0, "references": []},
                "occupancy_costs": {"value": 0, "references": []},
                "fundraising_method_descriptions": {"value": "", "references": []},
                "joint_ventures_disregarded_entities": {"value": "", "references": []},
            },
            "compensation_details": {
                "base_compensation": {"value": 0, "references": ["5-1q", "5-1w"]},
                "bonus": {"value": 0, "references": []},
                "incentive": {"value": 0, "references": []},
                "other": {"value": 0, "references": []},
                "non_fixed_compensation": {"value": "", "references": []},
                "first_class_travel": {"value": "", "references": []},
                "housing_allowance": {"value": "", "references": []},
                "expense_account_usage": {"value": "", "references": []},
                "supplemental_retirement": {"value": "", "references": []},
            },
            "political_lobbying_activities": {
                "lobbying_expenditures_direct": {"value": 0, "references": []},
                "lobbying_expenditures_grassroots": {"value": 0, "references": []},
                "election_501h_status": {"value": "", "references": []},
                "political_campaign_expenditures": {"value": 0, "references": []},
                "related_organizations_affiliates": {"value": "", "references": []},
            },
            "investments_endowment": {
                "investment_types": {"value": "", "references": []},
                "donor_restricted_endowment_values": {"value": 0, "references": []},
                "net_appreciation_depreciation": {"value": 0, "references": []},
                "related_organization_transactions": {"value": "", "references": []},
                "loans_to_from_related_parties": {"value": "", "references": []},
            },
            "tax_compliance_penalties": {
                "penalties_excise_taxes_reported": {
                    "value": "No",
                    "references": ["3-I"],
                },
                "unrelated_business_income_disclosure": {
                    "value": "No",
                    "references": ["3-Y"],
                },
                "foreign_bank_account_reporting": {
                    "value": "No",
                    "references": ["4-H"],
                },
                "schedule_o_narrative_explanations": {"value": "", "references": []},
            },
        },
        "metadata": {
            "filename": "markdown.md",
            "org_id": None,
            "duration_ms": 16656,
            "credit_usage": 27.2,
            "job_id": "nnmr8lcxtykk5ll5wodjtrnn6",
            "version": "extract-20250930",
        },
    }

    result = await form_auditor.build_audit_report(data)

    return result.model_dump_json()


@router.post("/chat")
async def chat(request: Request) -> Response:
    return await VercelAIAdapter.dispatch_request(request, agent=agent)

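For context, a minimal client sketch for exercising the new /api/v1/agent/chat route is shown below. It is not part of this commit: the payload field names follow the general Vercel AI UI message shape, and the local port is a placeholder; check what VercelAIAdapter and your frontend actually exchange before relying on it.

# Hypothetical client call against the new /api/v1/agent/chat route.
# Payload shape and host are assumptions, not taken from this diff.
import asyncio

import httpx


async def main() -> None:
    payload = {
        "messages": [
            {"role": "user", "content": "Run a full audit report for this filing."}
        ]
    }
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        async with client.stream("POST", "/api/v1/agent/chat", json=payload) as resp:
            async for line in resp.aiter_lines():
                print(line)  # streamed UI events emitted by VercelAIAdapter


asyncio.run(main())
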
@@ -2,17 +2,18 @@
Router para procesamiento de PDFs con LandingAI.
Soporta dos modos: rápido (solo parse) y extracción (parse + extract con schema).
"""

import logging
import time
from typing import List, Literal, Optional

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, Field
from typing import Optional, List, Literal

from langchain_core.documents import Document
from pydantic import BaseModel, Field

from ..services.landingai_service import get_landingai_service
from ..services.chunking_service import get_chunking_service
from ..repositories.schema_repository import get_schema_repository
from ..services.chunking_service import get_chunking_service
from ..services.landingai_service import get_landingai_service
from ..utils.chunking.token_manager import TokenManager

logger = logging.getLogger(__name__)
@@ -22,6 +23,7 @@ router = APIRouter(prefix="/api/v1/chunking-landingai", tags=["chunking-landinga

class ProcessLandingAIRequest(BaseModel):
    """Request para procesar PDF con LandingAI"""

    file_name: str = Field(..., description="Nombre del archivo PDF")
    tema: str = Field(..., description="Tema/carpeta del archivo")
    collection_name: str = Field(..., description="Colección de Qdrant")
@@ -29,34 +31,33 @@ class ProcessLandingAIRequest(BaseModel):
    # Modo de procesamiento
    mode: Literal["quick", "extract"] = Field(
        default="quick",
        description="Modo: 'quick' (solo parse) o 'extract' (parse + datos estructurados)"
        description="Modo: 'quick' (solo parse) o 'extract' (parse + datos estructurados)",
    )

    # Schema (obligatorio si mode='extract')
    schema_id: Optional[str] = Field(
        None,
        description="ID del schema a usar (requerido si mode='extract')"
        None, description="ID del schema a usar (requerido si mode='extract')"
    )

    # Configuración de chunks
    include_chunk_types: List[str] = Field(
        default=["text", "table"],
        description="Tipos de chunks a incluir: text, table, figure, etc."
        description="Tipos de chunks a incluir: text, table, figure, etc.",
    )
    max_tokens_per_chunk: int = Field(
        default=1500,
        ge=500,
        le=3000,
        description="Tokens máximos por chunk (flexible para tablas/figuras)"
        description="Tokens máximos por chunk (flexible para tablas/figuras)",
    )
    merge_small_chunks: bool = Field(
        default=True,
        description="Unir chunks pequeños de la misma página y tipo"
        default=True, description="Unir chunks pequeños de la misma página y tipo"
    )


class ProcessLandingAIResponse(BaseModel):
    """Response del procesamiento con LandingAI"""

    success: bool
    mode: str
    processing_time_seconds: float
@@ -97,9 +98,9 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
    start_time = time.time()

    try:
        logger.info(f"\n{'='*60}")
        logger.info(f"INICIANDO PROCESAMIENTO CON LANDINGAI")
        logger.info(f"{'='*60}")
        logger.info(f"\n{'=' * 60}")
        logger.info("INICIANDO PROCESAMIENTO CON LANDINGAI")
        logger.info(f"{'=' * 60}")
        logger.info(f"Archivo: {request.file_name}")
        logger.info(f"Tema: {request.tema}")
        logger.info(f"Modo: {request.mode}")
@@ -111,7 +112,7 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
        if not request.schema_id:
            raise HTTPException(
                status_code=400,
                detail="schema_id es requerido cuando mode='extract'"
                detail="schema_id es requerido cuando mode='extract'",
            )

        schema_repo = get_schema_repository()
@@ -119,8 +120,7 @@ async def process_with_landingai(request: ProcessLandingAIRequest):

        if not custom_schema:
            raise HTTPException(
                status_code=404,
                detail=f"Schema no encontrado: {request.schema_id}"
                status_code=404, detail=f"Schema no encontrado: {request.schema_id}"
            )

        logger.info(f"Schema seleccionado: {custom_schema.schema_name}")
@@ -131,14 +131,12 @@ async def process_with_landingai(request: ProcessLandingAIRequest):

        try:
            pdf_bytes = await chunking_service.download_pdf_from_blob(
                request.file_name,
                request.tema
                request.file_name, request.tema
            )
        except Exception as e:
            logger.error(f"Error descargando PDF: {e}")
            raise HTTPException(
                status_code=404,
                detail=f"No se pudo descargar el PDF: {str(e)}"
                status_code=404, detail=f"No se pudo descargar el PDF: {str(e)}"
            )

        # 3. Procesar con LandingAI
@@ -150,13 +148,12 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
                pdf_bytes=pdf_bytes,
                file_name=request.file_name,
                custom_schema=custom_schema,
                include_chunk_types=request.include_chunk_types
                include_chunk_types=request.include_chunk_types,
            )
        except Exception as e:
            logger.error(f"Error en LandingAI: {e}")
            raise HTTPException(
                status_code=500,
                detail=f"Error procesando con LandingAI: {str(e)}"
                status_code=500, detail=f"Error procesando con LandingAI: {str(e)}"
            )

        documents = result["chunks"]
@@ -164,7 +161,7 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
        if not documents:
            raise HTTPException(
                status_code=400,
                detail="No se generaron chunks después del procesamiento"
                detail="No se generaron chunks después del procesamiento",
            )

        # 4. Aplicar control flexible de tokens
@@ -172,7 +169,7 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
        documents = _apply_flexible_token_control(
            documents,
            max_tokens=request.max_tokens_per_chunk,
            merge_small=request.merge_small_chunks
            merge_small=request.merge_small_chunks,
        )

        # 5. Generar embeddings
@@ -180,13 +177,16 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
        texts = [doc.page_content for doc in documents]

        try:
            embeddings = await chunking_service.embedding_service.generate_embeddings_batch(texts)
            embeddings = (
                await chunking_service.embedding_service.generate_embeddings_batch(
                    texts
                )
            )
            logger.info(f"Embeddings generados: {len(embeddings)} vectores")
        except Exception as e:
            logger.error(f"Error generando embeddings: {e}")
            raise HTTPException(
                status_code=500,
                detail=f"Error generando embeddings: {str(e)}"
                status_code=500, detail=f"Error generando embeddings: {str(e)}"
            )

        # 6. Preparar chunks para Qdrant con IDs determinísticos
@@ -198,38 +198,38 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
            chunk_id = chunking_service._generate_deterministic_id(
                file_name=request.file_name,
                page=doc.metadata.get("page", 1),
                chunk_index=doc.metadata.get("chunk_id", str(idx))
                chunk_index=doc.metadata.get("chunk_id", str(idx)),
            )

            qdrant_chunks.append({
                "id": chunk_id,
                "vector": embedding,
                "payload": {
                    "page_content": doc.page_content,
                    "metadata": doc.metadata  # Metadata rica de LandingAI
            qdrant_chunks.append(
                {
                    "id": chunk_id,
                    "vector": embedding,
                    "payload": {
                        "page_content": doc.page_content,
                        "metadata": doc.metadata,  # Metadata rica de LandingAI
                    },
                }
                })
            )

        # 7. Subir a Qdrant
        try:
            upload_result = await chunking_service.vector_db.add_chunks(
                request.collection_name,
                qdrant_chunks
                request.collection_name, qdrant_chunks
            )
            logger.info(f"Subida completada: {upload_result['chunks_added']} chunks")
        except Exception as e:
            logger.error(f"Error subiendo a Qdrant: {e}")
            raise HTTPException(
                status_code=500,
                detail=f"Error subiendo a Qdrant: {str(e)}"
                status_code=500, detail=f"Error subiendo a Qdrant: {str(e)}"
            )

        # Tiempo total
        processing_time = time.time() - start_time

        logger.info(f"\n{'='*60}")
        logger.info(f"\n{'=' * 60}")
        logger.info(f"PROCESAMIENTO COMPLETADO")
        logger.info(f"{'='*60}")
        logger.info(f"{'=' * 60}")
        logger.info(f"Tiempo: {processing_time:.2f}s")
        logger.info(f"Chunks procesados: {len(documents)}")
        logger.info(f"Chunks subidos: {upload_result['chunks_added']}")
@@ -245,23 +245,18 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
            schema_used=custom_schema.schema_id if custom_schema else None,
            extracted_data=result.get("extracted_data"),
            parse_metadata=result["parse_metadata"],
            message=f"PDF procesado exitosamente en modo {request.mode}"
            message=f"PDF procesado exitosamente en modo {request.mode}",
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error inesperado en procesamiento: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Error inesperado: {str(e)}"
        )
        raise HTTPException(status_code=500, detail=f"Error inesperado: {str(e)}")


def _apply_flexible_token_control(
    documents: List[Document],
    max_tokens: int,
    merge_small: bool
    documents: List[Document], max_tokens: int, merge_small: bool
) -> List[Document]:
    """
    Aplica control flexible de tokens (Opción C del diseño).
@@ -306,14 +301,10 @@ def _apply_flexible_token_control(

        else:
            # Intentar merge si es pequeño
            if (
                merge_small and
                tokens < max_tokens * 0.5 and
                i < len(documents) - 1
            ):
            if merge_small and tokens < max_tokens * 0.5 and i < len(documents) - 1:
                next_doc = documents[i + 1]
                if _can_merge(doc, next_doc, max_tokens, token_manager):
                    logger.debug(f"Merging chunks {i} y {i+1}")
                    logger.debug(f"Merging chunks {i} y {i + 1}")
                    doc = _merge_documents(doc, next_doc)
                    i += 1  # Skip next

@@ -326,9 +317,7 @@ def _apply_flexible_token_control(


def _split_large_chunk(
    doc: Document,
    max_tokens: int,
    token_manager: TokenManager
    doc: Document, max_tokens: int, token_manager: TokenManager
) -> List[Document]:
    """Divide un chunk grande en sub-chunks"""
    content = doc.page_content
@@ -343,8 +332,7 @@ def _split_large_chunk(
            # Guardar chunk actual
            sub_content = " ".join(current_chunk)
            sub_doc = Document(
                page_content=sub_content,
                metadata={**doc.metadata, "is_split": True}
                page_content=sub_content, metadata={**doc.metadata, "is_split": True}
            )
            sub_chunks.append(sub_doc)
            current_chunk = [word]
@@ -357,8 +345,7 @@ def _split_large_chunk(
    if current_chunk:
        sub_content = " ".join(current_chunk)
        sub_doc = Document(
            page_content=sub_content,
            metadata={**doc.metadata, "is_split": True}
            page_content=sub_content, metadata={**doc.metadata, "is_split": True}
        )
        sub_chunks.append(sub_doc)

@@ -366,10 +353,7 @@ def _split_large_chunk(


def _can_merge(
    doc1: Document,
    doc2: Document,
    max_tokens: int,
    token_manager: TokenManager
    doc1: Document, doc2: Document, max_tokens: int, token_manager: TokenManager
) -> bool:
    """Verifica si dos docs se pueden mergear"""
    # Misma página
@@ -391,6 +375,5 @@ def _merge_documents(doc1: Document, doc2: Document) -> Document:
    """Mergea dos documentos"""
    merged_content = f"{doc1.page_content}\n\n{doc2.page_content}"
    return Document(
        page_content=merged_content,
        metadata={**doc1.metadata, "is_merged": True}
        page_content=merged_content, metadata={**doc1.metadata, "is_merged": True}
    )

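The Qdrant upsert above relies on chunking_service._generate_deterministic_id, which this diff does not show. A sketch of one way such an ID can be derived (UUIDv5 over file name, page and chunk index) follows; the real helper may use a different scheme, so treat this as an illustration only.

# Sketch only: a deterministic chunk ID so re-processing the same file
# overwrites existing points in Qdrant instead of duplicating them.
# The actual _generate_deterministic_id implementation is not part of this diff.
import uuid


def generate_deterministic_id(file_name: str, page: int, chunk_index: str) -> str:
    key = f"{file_name}:{page}:{chunk_index}"
    return str(uuid.uuid5(uuid.NAMESPACE_URL, key))


print(generate_deterministic_id("report.pdf", 1, "0"))
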
@@ -1,10 +1,12 @@
import logging
from typing import Optional

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from ..models.dataroom import DataRoom
from ..models.vector_models import CollectionCreateRequest
from ..services.azure_service import azure_service
from ..services.vector_service import vector_service

logger = logging.getLogger(__name__)
@@ -16,9 +18,136 @@ class DataroomCreate(BaseModel):
    storage: str = ""


class DataroomInfo(BaseModel):
    name: str
    collection: str
    storage: str
    file_count: int
    total_size_bytes: int
    total_size_mb: float
    collection_exists: bool
    vector_count: Optional[int]
    collection_info: Optional[dict]
    file_types: dict
    recent_files: list


router = APIRouter(prefix="/dataroom", tags=["Dataroom"])


@router.get("/{dataroom_name}/info")
async def dataroom_info(dataroom_name: str) -> DataroomInfo:
    """
    Obtener información detallada de un dataroom específico
    """
    try:
        # Find the dataroom in Redis
        datarooms = DataRoom.find().all()
        dataroom = None
        for room in datarooms:
            if room.name == dataroom_name:
                dataroom = room
                break

        if not dataroom:
            raise HTTPException(
                status_code=404, detail=f"Dataroom '{dataroom_name}' not found"
            )

        # Get file information from Azure Storage
        try:
            files_data = await azure_service.list_files(dataroom_name)
        except Exception as e:
            logger.warning(f"Could not fetch files for dataroom '{dataroom_name}': {e}")
            files_data = []

        # Calculate file metrics
        file_count = len(files_data)
        total_size_bytes = sum(file_data.get("size", 0) for file_data in files_data)
        total_size_mb = (
            round(total_size_bytes / (1024 * 1024), 2) if total_size_bytes > 0 else 0.0
        )

        # Analyze file types
        file_types = {}
        recent_files = []

        for file_data in files_data:
            # Count file types by extension
            filename = file_data.get("name", "")
            if "." in filename:
                ext = filename.split(".")[-1].lower()
                file_types[ext] = file_types.get(ext, 0) + 1

            # Collect recent files (up to 5)
            if len(recent_files) < 5:
                recent_files.append(
                    {
                        "name": filename,
                        "size_mb": round(file_data.get("size", 0) / (1024 * 1024), 2),
                        "last_modified": file_data.get("last_modified"),
                    }
                )

        # Sort recent files by last modified (newest first)
        recent_files.sort(key=lambda x: x.get("last_modified", ""), reverse=True)

        # Get vector collection information
        collection_exists = False
        vector_count = None
        collection_info = None

        try:
            collection_exists_response = await vector_service.check_collection_exists(
                dataroom_name
            )
            collection_exists = collection_exists_response.exists

            if collection_exists:
                collection_info_response = await vector_service.get_collection_info(
                    dataroom_name
                )
                if collection_info_response:
                    collection_info = {
                        "vectors_count": collection_info_response.vectors_count,
                        "indexed_vectors_count": collection_info_response.indexed_vectors_count,
                        "points_count": collection_info_response.points_count,
                        "segments_count": collection_info_response.segments_count,
                        "status": collection_info_response.status,
                    }
                    vector_count = collection_info_response.vectors_count
        except Exception as e:
            logger.warning(
                f"Could not fetch collection info for '{dataroom_name}': {e}"
            )

        logger.info(
            f"Retrieved info for dataroom '{dataroom_name}': {file_count} files, {total_size_mb}MB"
        )

        return DataroomInfo(
            name=dataroom.name,
            collection=dataroom.collection,
            storage=dataroom.storage,
            file_count=file_count,
            total_size_bytes=total_size_bytes,
            total_size_mb=total_size_mb,
            collection_exists=collection_exists,
            vector_count=vector_count,
            collection_info=collection_info,
            file_types=file_types,
            recent_files=recent_files,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting dataroom info for '{dataroom_name}': {e}")
        raise HTTPException(
            status_code=500, detail=f"Error getting dataroom info: {str(e)}"
        )
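
A quick way to try the new info endpoint once the router is mounted; the host, mount path, and dataroom name below are placeholders, assuming the router is included at the application root.

# Placeholder host and dataroom name; adjust to your deployment.
import httpx

resp = httpx.get("http://localhost:8000/dataroom/contracts/info", timeout=30.0)
resp.raise_for_status()
info = resp.json()
print(info["file_count"], info["total_size_mb"], info["collection_exists"])
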


@router.get("/")
async def list_datarooms():
    """