Anibal Angulo
2025-11-07 09:41:18 -06:00
parent cafe0bf5f3
commit af9b5fed01
21 changed files with 3065 additions and 266 deletions

View File

@@ -6,6 +6,7 @@ from pydantic_ai.ui.vercel_ai import VercelAIAdapter
from starlette.requests import Request
from starlette.responses import Response
from app.agents import form_auditor
from app.core.config import settings
provider = AzureProvider(
@@ -19,6 +20,347 @@ agent = Agent(model=model)
router = APIRouter(prefix="/api/v1/agent", tags=["Agent"])
@agent.tool_plain()
async def build_audit_report():
"""Calls the audit subagent to get a full audit report of the organization"""
data = {
"extraction": {
"core_organization_metadata": {
"ein": "84-2674654",
"legal_name": "07 IN HEAVEN MEMORIAL SCHOLARSHIP",
"phone_number": "(262) 215-0300",
"website_url": "",
"return_type": "990-PF",
"amended_return": "No",
"group_exemption_number": "",
"subsection_code": "501(c)(3)",
"ruling_date": "",
"accounting_method": "Cash",
"organization_type": "corporation",
"year_of_formation": "",
"incorporation_state": "WI",
},
"revenue_breakdown": {
"total_revenue": 5227,
"contributions_gifts_grants": 5227,
"program_service_revenue": 0,
"membership_dues": 0,
"investment_income": 0,
"gains_losses_sales_assets": 0,
"rental_income": 0,
"related_organizations_revenue": 0,
"gaming_revenue": 0,
"other_revenue": 0,
"government_grants": 0,
"foreign_contributions": 0,
},
"expenses_breakdown": {
"total_expenses": 2104,
"program_services_expenses": 0,
"management_general_expenses": 0,
"fundraising_expenses": 2104,
"grants_us_organizations": 0,
"grants_us_individuals": 0,
"grants_foreign_organizations": 0,
"grants_foreign_individuals": 0,
"compensation_officers": 0,
"compensation_other_staff": 0,
"payroll_taxes_benefits": 0,
"professional_fees": 0,
"office_occupancy_costs": 0,
"information_technology_costs": 0,
"travel_conference_expenses": 0,
"depreciation_amortization": 0,
"insurance": 0,
},
"balance_sheet": {},
"officers_directors_trustees_key_employees": [
{
"name": "REBECCA TERPSTRA",
"title_position": "PRESIDENT",
"average_hours_per_week": 0.1,
"related_party_transactions": "",
"former_officer": "",
"governance_role": "",
},
{
"name": "ROBERT GUZMAN",
"title_position": "VICE PRESDEINT",
"average_hours_per_week": 0.1,
"related_party_transactions": "",
"former_officer": "",
"governance_role": "",
},
{
"name": "ANDREA VALENTI",
"title_position": "TREASURER",
"average_hours_per_week": 0.1,
"related_party_transactions": "",
"former_officer": "",
"governance_role": "",
},
{
"name": "BETHANY WALSH",
"title_position": "SECRETARY",
"average_hours_per_week": 0.1,
"related_party_transactions": "",
"former_officer": "",
"governance_role": "",
},
],
"governance_management_disclosure": {
"governing_body_size": 4,
"independent_members": 4,
"financial_statements_reviewed": "",
"form_990_provided_to_governing_body": "",
"conflict_of_interest_policy": "",
"whistleblower_policy": "",
"document_retention_policy": "",
"ceo_compensation_review_process": "",
"public_disclosure_practices": "Yes",
},
"program_service_accomplishments": [],
"fundraising_grantmaking": {
"total_fundraising_event_revenue": 0,
"total_fundraising_event_expenses": 2104,
"professional_fundraiser_fees": 0,
},
"functional_operational_data": {
"number_of_employees": 0,
"number_of_volunteers": 0,
"occupancy_costs": 0,
"fundraising_method_descriptions": "",
"joint_ventures_disregarded_entities": "",
},
"compensation_details": {
"base_compensation": 0,
"bonus": 0,
"incentive": 0,
"other": 0,
"non_fixed_compensation": "",
"first_class_travel": "",
"housing_allowance": "",
"expense_account_usage": "",
"supplemental_retirement": "",
},
"political_lobbying_activities": {
"lobbying_expenditures_direct": 0,
"lobbying_expenditures_grassroots": 0,
"election_501h_status": "",
"political_campaign_expenditures": 0,
"related_organizations_affiliates": "",
},
"investments_endowment": {
"investment_types": "",
"donor_restricted_endowment_values": 0,
"net_appreciation_depreciation": 0,
"related_organization_transactions": "",
"loans_to_from_related_parties": "",
},
"tax_compliance_penalties": {
"penalties_excise_taxes_reported": "No",
"unrelated_business_income_disclosure": "No",
"foreign_bank_account_reporting": "No",
"schedule_o_narrative_explanations": "",
},
},
"extraction_metadata": {
"core_organization_metadata": {
"ein": {"value": "84-2674654", "references": ["0-7"]},
"legal_name": {
"value": "07 IN HEAVEN MEMORIAL SCHOLARSHIP",
"references": ["0-6"],
},
"phone_number": {"value": "(262) 215-0300", "references": ["0-a"]},
"website_url": {"value": "", "references": []},
"return_type": {
"value": "990-PF",
"references": ["4ade8ed0-bce7-4bd5-bd8d-190e3e4be95b"],
},
"amended_return": {
"value": "No",
"references": ["4ac9edc4-e9bb-430f-b4c4-a42bf4c04b28"],
},
"group_exemption_number": {"value": "", "references": []},
"subsection_code": {
"value": "501(c)(3)",
"references": ["4ac9edc4-e9bb-430f-b4c4-a42bf4c04b28"],
},
"ruling_date": {"value": "", "references": []},
"accounting_method": {"value": "Cash", "references": ["0-d"]},
"organization_type": {
"value": "corporation",
"references": ["4ac9edc4-e9bb-430f-b4c4-a42bf4c04b28"],
},
"year_of_formation": {"value": "", "references": []},
"incorporation_state": {
"value": "WI",
"references": ["4ac9edc4-e9bb-430f-b4c4-a42bf4c04b28"],
},
},
"revenue_breakdown": {
"total_revenue": {"value": 5227, "references": ["0-1z"]},
"contributions_gifts_grants": {"value": 5227, "references": ["0-m"]},
"program_service_revenue": {"value": 0, "references": []},
"membership_dues": {"value": 0, "references": []},
"investment_income": {"value": 0, "references": []},
"gains_losses_sales_assets": {"value": 0, "references": []},
"rental_income": {"value": 0, "references": []},
"related_organizations_revenue": {"value": 0, "references": []},
"gaming_revenue": {"value": 0, "references": []},
"other_revenue": {"value": 0, "references": []},
"government_grants": {"value": 0, "references": []},
"foreign_contributions": {"value": 0, "references": []},
},
"expenses_breakdown": {
"total_expenses": {"value": 2104, "references": ["0-2S"]},
"program_services_expenses": {"value": 0, "references": []},
"management_general_expenses": {"value": 0, "references": []},
"fundraising_expenses": {"value": 2104, "references": ["13-d"]},
"grants_us_organizations": {"value": 0, "references": []},
"grants_us_individuals": {"value": 0, "references": []},
"grants_foreign_organizations": {"value": 0, "references": []},
"grants_foreign_individuals": {"value": 0, "references": []},
"compensation_officers": {
"value": 0,
"references": ["5-1q", "5-1w", "5-1C", "5-1I"],
},
"compensation_other_staff": {"value": 0, "references": []},
"payroll_taxes_benefits": {"value": 0, "references": []},
"professional_fees": {"value": 0, "references": []},
"office_occupancy_costs": {"value": 0, "references": []},
"information_technology_costs": {"value": 0, "references": []},
"travel_conference_expenses": {"value": 0, "references": []},
"depreciation_amortization": {"value": 0, "references": []},
"insurance": {"value": 0, "references": []},
},
"balance_sheet": {},
"officers_directors_trustees_key_employees": [
{
"name": {"value": "REBECCA TERPSTRA", "references": ["5-1o"]},
"title_position": {"value": "PRESIDENT", "references": ["5-1p"]},
"average_hours_per_week": {"value": 0.1, "references": ["5-1p"]},
"related_party_transactions": {"value": "", "references": []},
"former_officer": {"value": "", "references": []},
"governance_role": {"value": "", "references": []},
},
{
"name": {"value": "ROBERT GUZMAN", "references": ["5-1u"]},
"title_position": {
"value": "VICE PRESDEINT",
"references": ["5-1v"],
},
"average_hours_per_week": {"value": 0.1, "references": ["5-1v"]},
"related_party_transactions": {"value": "", "references": []},
"former_officer": {"value": "", "references": []},
"governance_role": {"value": "", "references": []},
},
{
"name": {"value": "ANDREA VALENTI", "references": ["5-1A"]},
"title_position": {"value": "TREASURER", "references": ["5-1B"]},
"average_hours_per_week": {"value": 0.1, "references": ["5-1B"]},
"related_party_transactions": {"value": "", "references": []},
"former_officer": {"value": "", "references": []},
"governance_role": {"value": "", "references": []},
},
{
"name": {"value": "BETHANY WALSH", "references": ["5-1G"]},
"title_position": {"value": "SECRETARY", "references": ["5-1H"]},
"average_hours_per_week": {"value": 0.1, "references": ["5-1H"]},
"related_party_transactions": {"value": "", "references": []},
"former_officer": {"value": "", "references": []},
"governance_role": {"value": "", "references": []},
},
],
"governance_management_disclosure": {
"governing_body_size": {
"value": 4,
"references": ["5-1o", "5-1u", "5-1A", "5-1G"],
},
"independent_members": {
"value": 4,
"references": ["5-1o", "5-1u", "5-1A", "5-1G"],
},
"financial_statements_reviewed": {"value": "", "references": []},
"form_990_provided_to_governing_body": {"value": "", "references": []},
"conflict_of_interest_policy": {"value": "", "references": []},
"whistleblower_policy": {"value": "", "references": []},
"document_retention_policy": {"value": "", "references": []},
"ceo_compensation_review_process": {"value": "", "references": []},
"public_disclosure_practices": {"value": "Yes", "references": ["4-g"]},
},
"program_service_accomplishments": [],
"fundraising_grantmaking": {
"total_fundraising_event_revenue": {"value": 0, "references": []},
"total_fundraising_event_expenses": {
"value": 2104,
"references": ["13-d"],
},
"professional_fundraiser_fees": {"value": 0, "references": []},
},
"functional_operational_data": {
"number_of_employees": {"value": 0, "references": []},
"number_of_volunteers": {"value": 0, "references": []},
"occupancy_costs": {"value": 0, "references": []},
"fundraising_method_descriptions": {"value": "", "references": []},
"joint_ventures_disregarded_entities": {"value": "", "references": []},
},
"compensation_details": {
"base_compensation": {"value": 0, "references": ["5-1q", "5-1w"]},
"bonus": {"value": 0, "references": []},
"incentive": {"value": 0, "references": []},
"other": {"value": 0, "references": []},
"non_fixed_compensation": {"value": "", "references": []},
"first_class_travel": {"value": "", "references": []},
"housing_allowance": {"value": "", "references": []},
"expense_account_usage": {"value": "", "references": []},
"supplemental_retirement": {"value": "", "references": []},
},
"political_lobbying_activities": {
"lobbying_expenditures_direct": {"value": 0, "references": []},
"lobbying_expenditures_grassroots": {"value": 0, "references": []},
"election_501h_status": {"value": "", "references": []},
"political_campaign_expenditures": {"value": 0, "references": []},
"related_organizations_affiliates": {"value": "", "references": []},
},
"investments_endowment": {
"investment_types": {"value": "", "references": []},
"donor_restricted_endowment_values": {"value": 0, "references": []},
"net_appreciation_depreciation": {"value": 0, "references": []},
"related_organization_transactions": {"value": "", "references": []},
"loans_to_from_related_parties": {"value": "", "references": []},
},
"tax_compliance_penalties": {
"penalties_excise_taxes_reported": {
"value": "No",
"references": ["3-I"],
},
"unrelated_business_income_disclosure": {
"value": "No",
"references": ["3-Y"],
},
"foreign_bank_account_reporting": {
"value": "No",
"references": ["4-H"],
},
"schedule_o_narrative_explanations": {"value": "", "references": []},
},
},
"metadata": {
"filename": "markdown.md",
"org_id": None,
"duration_ms": 16656,
"credit_usage": 27.2,
"job_id": "nnmr8lcxtykk5ll5wodjtrnn6",
"version": "extract-20250930",
},
}
result = await form_auditor.build_audit_report(data)
return result.model_dump_json()
@router.post("/chat")
async def chat(request: Request) -> Response:
return await VercelAIAdapter.dispatch_request(request, agent=agent)
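The new build_audit_report tool hands a hardcoded 990-PF extraction payload to the form_auditor subagent, and the /api/v1/agent/chat route exposes the agent through pydantic-ai's Vercel AI adapter. A minimal sketch for exercising the tool wiring outside FastAPI, assuming the module above is importable as app.api.agent (a hypothetical path) and that the Azure credentials in settings resolve:

import asyncio

from app.api.agent import agent  # hypothetical import path for the router module above

async def main() -> None:
    # A prompt like this should lead the agent to call the build_audit_report tool.
    result = await agent.run("Audit this organization and summarize any red flags.")
    print(result.output)  # older pydantic-ai releases expose this as result.data

if __name__ == "__main__":
    asyncio.run(main())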

View File

@@ -2,17 +2,18 @@
Router para procesamiento de PDFs con LandingAI.
Soporta dos modos: rápido (solo parse) y extracción (parse + extract con schema).
"""
import logging
import time
+from typing import List, Literal, Optional
from fastapi import APIRouter, HTTPException
-from pydantic import BaseModel, Field
-from typing import Optional, List, Literal
from langchain_core.documents import Document
+from pydantic import BaseModel, Field
-from ..services.landingai_service import get_landingai_service
-from ..services.chunking_service import get_chunking_service
from ..repositories.schema_repository import get_schema_repository
+from ..services.chunking_service import get_chunking_service
+from ..services.landingai_service import get_landingai_service
from ..utils.chunking.token_manager import TokenManager
logger = logging.getLogger(__name__)
@@ -22,6 +23,7 @@ router = APIRouter(prefix="/api/v1/chunking-landingai", tags=["chunking-landinga
class ProcessLandingAIRequest(BaseModel):
"""Request para procesar PDF con LandingAI"""
file_name: str = Field(..., description="Nombre del archivo PDF")
tema: str = Field(..., description="Tema/carpeta del archivo")
collection_name: str = Field(..., description="Colección de Qdrant")
@@ -29,34 +31,33 @@ class ProcessLandingAIRequest(BaseModel):
# Modo de procesamiento
mode: Literal["quick", "extract"] = Field(
default="quick",
description="Modo: 'quick' (solo parse) o 'extract' (parse + datos estructurados)"
description="Modo: 'quick' (solo parse) o 'extract' (parse + datos estructurados)",
)
# Schema (obligatorio si mode='extract')
schema_id: Optional[str] = Field(
-None,
-description="ID del schema a usar (requerido si mode='extract')"
+None, description="ID del schema a usar (requerido si mode='extract')"
)
# Configuración de chunks
include_chunk_types: List[str] = Field(
default=["text", "table"],
description="Tipos de chunks a incluir: text, table, figure, etc."
description="Tipos de chunks a incluir: text, table, figure, etc.",
)
max_tokens_per_chunk: int = Field(
default=1500,
ge=500,
le=3000,
description="Tokens máximos por chunk (flexible para tablas/figuras)"
description="Tokens máximos por chunk (flexible para tablas/figuras)",
)
merge_small_chunks: bool = Field(
-default=True,
-description="Unir chunks pequeños de la misma página y tipo"
+default=True, description="Unir chunks pequeños de la misma página y tipo"
)
class ProcessLandingAIResponse(BaseModel):
"""Response del procesamiento con LandingAI"""
success: bool
mode: str
processing_time_seconds: float
@@ -97,9 +98,9 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
start_time = time.time()
try:
logger.info(f"\n{'='*60}")
logger.info(f"INICIANDO PROCESAMIENTO CON LANDINGAI")
logger.info(f"{'='*60}")
logger.info(f"\n{'=' * 60}")
logger.info("INICIANDO PROCESAMIENTO CON LANDINGAI")
logger.info(f"{'=' * 60}")
logger.info(f"Archivo: {request.file_name}")
logger.info(f"Tema: {request.tema}")
logger.info(f"Modo: {request.mode}")
@@ -111,7 +112,7 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
if not request.schema_id:
raise HTTPException(
status_code=400,
detail="schema_id es requerido cuando mode='extract'"
detail="schema_id es requerido cuando mode='extract'",
)
schema_repo = get_schema_repository()
@@ -119,8 +120,7 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
if not custom_schema:
raise HTTPException(
-status_code=404,
-detail=f"Schema no encontrado: {request.schema_id}"
+status_code=404, detail=f"Schema no encontrado: {request.schema_id}"
)
logger.info(f"Schema seleccionado: {custom_schema.schema_name}")
@@ -131,14 +131,12 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
try:
pdf_bytes = await chunking_service.download_pdf_from_blob(
-request.file_name,
-request.tema
+request.file_name, request.tema
)
except Exception as e:
logger.error(f"Error descargando PDF: {e}")
raise HTTPException(
-status_code=404,
-detail=f"No se pudo descargar el PDF: {str(e)}"
+status_code=404, detail=f"No se pudo descargar el PDF: {str(e)}"
)
# 3. Procesar con LandingAI
@@ -150,13 +148,12 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
pdf_bytes=pdf_bytes,
file_name=request.file_name,
custom_schema=custom_schema,
-include_chunk_types=request.include_chunk_types
+include_chunk_types=request.include_chunk_types,
)
except Exception as e:
logger.error(f"Error en LandingAI: {e}")
raise HTTPException(
-status_code=500,
-detail=f"Error procesando con LandingAI: {str(e)}"
+status_code=500, detail=f"Error procesando con LandingAI: {str(e)}"
)
documents = result["chunks"]
@@ -164,7 +161,7 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
if not documents:
raise HTTPException(
status_code=400,
detail="No se generaron chunks después del procesamiento"
detail="No se generaron chunks después del procesamiento",
)
# 4. Aplicar control flexible de tokens
@@ -172,7 +169,7 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
documents = _apply_flexible_token_control(
documents,
max_tokens=request.max_tokens_per_chunk,
-merge_small=request.merge_small_chunks
+merge_small=request.merge_small_chunks,
)
# 5. Generar embeddings
@@ -180,13 +177,16 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
texts = [doc.page_content for doc in documents]
try:
-embeddings = await chunking_service.embedding_service.generate_embeddings_batch(texts)
+embeddings = (
+await chunking_service.embedding_service.generate_embeddings_batch(
+texts
+)
+)
logger.info(f"Embeddings generados: {len(embeddings)} vectores")
except Exception as e:
logger.error(f"Error generando embeddings: {e}")
raise HTTPException(
-status_code=500,
-detail=f"Error generando embeddings: {str(e)}"
+status_code=500, detail=f"Error generando embeddings: {str(e)}"
)
# 6. Preparar chunks para Qdrant con IDs determinísticos
@@ -198,38 +198,38 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
chunk_id = chunking_service._generate_deterministic_id(
file_name=request.file_name,
page=doc.metadata.get("page", 1),
-chunk_index=doc.metadata.get("chunk_id", str(idx))
+chunk_index=doc.metadata.get("chunk_id", str(idx)),
)
-qdrant_chunks.append({
-"id": chunk_id,
-"vector": embedding,
-"payload": {
-"page_content": doc.page_content,
-"metadata": doc.metadata # Metadata rica de LandingAI
+qdrant_chunks.append(
+{
+"id": chunk_id,
+"vector": embedding,
+"payload": {
+"page_content": doc.page_content,
+"metadata": doc.metadata, # Metadata rica de LandingAI
+},
+}
-})
+)
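# Note: _generate_deterministic_id is not shown in this diff. One common way to build
# such an ID (an assumption about the intent, not the service's actual code) is a
# UUIDv5 over the file/page/chunk coordinates, so re-processing the same PDF upserts
# the same Qdrant point IDs instead of creating duplicates:
#
#     import uuid
#
#     def deterministic_chunk_id(file_name: str, page: int, chunk_index: str) -> str:
#         return str(uuid.uuid5(uuid.NAMESPACE_URL, f"{file_name}:{page}:{chunk_index}"))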
# 7. Subir a Qdrant
try:
upload_result = await chunking_service.vector_db.add_chunks(
-request.collection_name,
-qdrant_chunks
+request.collection_name, qdrant_chunks
)
logger.info(f"Subida completada: {upload_result['chunks_added']} chunks")
except Exception as e:
logger.error(f"Error subiendo a Qdrant: {e}")
raise HTTPException(
-status_code=500,
-detail=f"Error subiendo a Qdrant: {str(e)}"
+status_code=500, detail=f"Error subiendo a Qdrant: {str(e)}"
)
# Tiempo total
processing_time = time.time() - start_time
logger.info(f"\n{'='*60}")
logger.info(f"\n{'=' * 60}")
logger.info(f"PROCESAMIENTO COMPLETADO")
logger.info(f"{'='*60}")
logger.info(f"{'=' * 60}")
logger.info(f"Tiempo: {processing_time:.2f}s")
logger.info(f"Chunks procesados: {len(documents)}")
logger.info(f"Chunks subidos: {upload_result['chunks_added']}")
@@ -245,23 +245,18 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
schema_used=custom_schema.schema_id if custom_schema else None,
extracted_data=result.get("extracted_data"),
parse_metadata=result["parse_metadata"],
message=f"PDF procesado exitosamente en modo {request.mode}"
message=f"PDF procesado exitosamente en modo {request.mode}",
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error inesperado en procesamiento: {e}")
-raise HTTPException(
-status_code=500,
-detail=f"Error inesperado: {str(e)}"
-)
+raise HTTPException(status_code=500, detail=f"Error inesperado: {str(e)}")
def _apply_flexible_token_control(
-documents: List[Document],
-max_tokens: int,
-merge_small: bool
+documents: List[Document], max_tokens: int, merge_small: bool
) -> List[Document]:
"""
Aplica control flexible de tokens (Opción C del diseño).
@@ -306,14 +301,10 @@ def _apply_flexible_token_control(
else:
# Intentar merge si es pequeño
-if (
-merge_small and
-tokens < max_tokens * 0.5 and
-i < len(documents) - 1
-):
+if merge_small and tokens < max_tokens * 0.5 and i < len(documents) - 1:
next_doc = documents[i + 1]
if _can_merge(doc, next_doc, max_tokens, token_manager):
logger.debug(f"Merging chunks {i} y {i+1}")
logger.debug(f"Merging chunks {i} y {i + 1}")
doc = _merge_documents(doc, next_doc)
i += 1 # Skip next
@@ -326,9 +317,7 @@ def _apply_flexible_token_control(
def _split_large_chunk(
-doc: Document,
-max_tokens: int,
-token_manager: TokenManager
+doc: Document, max_tokens: int, token_manager: TokenManager
) -> List[Document]:
"""Divide un chunk grande en sub-chunks"""
content = doc.page_content
@@ -343,8 +332,7 @@ def _split_large_chunk(
# Guardar chunk actual
sub_content = " ".join(current_chunk)
sub_doc = Document(
-page_content=sub_content,
-metadata={**doc.metadata, "is_split": True}
+page_content=sub_content, metadata={**doc.metadata, "is_split": True}
)
sub_chunks.append(sub_doc)
current_chunk = [word]
@@ -357,8 +345,7 @@ def _split_large_chunk(
if current_chunk:
sub_content = " ".join(current_chunk)
sub_doc = Document(
-page_content=sub_content,
-metadata={**doc.metadata, "is_split": True}
+page_content=sub_content, metadata={**doc.metadata, "is_split": True}
)
sub_chunks.append(sub_doc)
@@ -366,10 +353,7 @@ def _split_large_chunk(
def _can_merge(
-doc1: Document,
-doc2: Document,
-max_tokens: int,
-token_manager: TokenManager
+doc1: Document, doc2: Document, max_tokens: int, token_manager: TokenManager
) -> bool:
"""Verifica si dos docs se pueden mergear"""
# Misma página
@@ -391,6 +375,5 @@ def _merge_documents(doc1: Document, doc2: Document) -> Document:
"""Mergea dos documentos"""
merged_content = f"{doc1.page_content}\n\n{doc2.page_content}"
return Document(
-page_content=merged_content,
-metadata={**doc1.metadata, "is_merged": True}
+page_content=merged_content, metadata={**doc1.metadata, "is_merged": True}
)
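Taken together, the handler downloads the PDF from blob storage, parses (and optionally extracts) with LandingAI, applies the flexible token control above, embeds the chunks, and upserts them into Qdrant. A minimal client sketch for the extract mode, assuming the handler is registered at POST /api/v1/chunking-landingai/process (the route decorator falls outside these hunks, so the path suffix is a guess) and the API runs locally; all payload values are illustrative:

import asyncio

import httpx

payload = {
    "file_name": "990pf_2023.pdf",            # illustrative blob name
    "tema": "fundaciones",                     # folder/topic inside blob storage
    "collection_name": "audit_chunks",         # target Qdrant collection
    "mode": "extract",                         # "quick" would skip schema extraction
    "schema_id": "schema_990pf_v1",            # required only when mode="extract"
    "include_chunk_types": ["text", "table"],
    "max_tokens_per_chunk": 1500,
    "merge_small_chunks": True,
}

async def main() -> None:
    async with httpx.AsyncClient(base_url="http://localhost:8000", timeout=300.0) as client:
        resp = await client.post("/api/v1/chunking-landingai/process", json=payload)
        resp.raise_for_status()
        body = resp.json()
        print(body["mode"], body["processing_time_seconds"])

if __name__ == "__main__":
    asyncio.run(main())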

View File

@@ -1,10 +1,12 @@
import logging
from typing import Optional
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from ..models.dataroom import DataRoom
from ..models.vector_models import CollectionCreateRequest
from ..services.azure_service import azure_service
from ..services.vector_service import vector_service
logger = logging.getLogger(__name__)
@@ -16,9 +18,136 @@ class DataroomCreate(BaseModel):
storage: str = ""
class DataroomInfo(BaseModel):
name: str
collection: str
storage: str
file_count: int
total_size_bytes: int
total_size_mb: float
collection_exists: bool
vector_count: Optional[int]
collection_info: Optional[dict]
file_types: dict
recent_files: list
router = APIRouter(prefix="/dataroom", tags=["Dataroom"])
@router.get("/{dataroom_name}/info")
async def dataroom_info(dataroom_name: str) -> DataroomInfo:
"""
Obtener información detallada de un dataroom específico
"""
try:
# Find the dataroom in Redis
datarooms = DataRoom.find().all()
dataroom = None
for room in datarooms:
if room.name == dataroom_name:
dataroom = room
break
if not dataroom:
raise HTTPException(
status_code=404, detail=f"Dataroom '{dataroom_name}' not found"
)
# Get file information from Azure Storage
try:
files_data = await azure_service.list_files(dataroom_name)
except Exception as e:
logger.warning(f"Could not fetch files for dataroom '{dataroom_name}': {e}")
files_data = []
# Calculate file metrics
file_count = len(files_data)
total_size_bytes = sum(file_data.get("size", 0) for file_data in files_data)
total_size_mb = (
round(total_size_bytes / (1024 * 1024), 2) if total_size_bytes > 0 else 0.0
)
# Analyze file types
file_types = {}
recent_files = []
for file_data in files_data:
# Count file types by extension
filename = file_data.get("name", "")
if "." in filename:
ext = filename.split(".")[-1].lower()
file_types[ext] = file_types.get(ext, 0) + 1
# Collect recent files (up to 5)
if len(recent_files) < 5:
recent_files.append(
{
"name": filename,
"size_mb": round(file_data.get("size", 0) / (1024 * 1024), 2),
"last_modified": file_data.get("last_modified"),
}
)
# Sort recent files by last modified (newest first)
recent_files.sort(key=lambda x: x.get("last_modified", ""), reverse=True)
# Get vector collection information
collection_exists = False
vector_count = None
collection_info = None
try:
collection_exists_response = await vector_service.check_collection_exists(
dataroom_name
)
collection_exists = collection_exists_response.exists
if collection_exists:
collection_info_response = await vector_service.get_collection_info(
dataroom_name
)
if collection_info_response:
collection_info = {
"vectors_count": collection_info_response.vectors_count,
"indexed_vectors_count": collection_info_response.indexed_vectors_count,
"points_count": collection_info_response.points_count,
"segments_count": collection_info_response.segments_count,
"status": collection_info_response.status,
}
vector_count = collection_info_response.vectors_count
except Exception as e:
logger.warning(
f"Could not fetch collection info for '{dataroom_name}': {e}"
)
logger.info(
f"Retrieved info for dataroom '{dataroom_name}': {file_count} files, {total_size_mb}MB"
)
return DataroomInfo(
name=dataroom.name,
collection=dataroom.collection,
storage=dataroom.storage,
file_count=file_count,
total_size_bytes=total_size_bytes,
total_size_mb=total_size_mb,
collection_exists=collection_exists,
vector_count=vector_count,
collection_info=collection_info,
file_types=file_types,
recent_files=recent_files,
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting dataroom info for '{dataroom_name}': {e}")
raise HTTPException(
status_code=500, detail=f"Error getting dataroom info: {str(e)}"
)
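The info endpoint aggregates Azure blob listings and Qdrant collection stats into a single DataroomInfo payload. A quick way to exercise it is FastAPI's TestClient; a minimal sketch, assuming the router is mounted without an extra prefix on the main application at app.main:app (a hypothetical path) and that the Redis, Azure Storage, and Qdrant backends it queries are reachable:

from fastapi.testclient import TestClient

from app.main import app  # hypothetical import path for the FastAPI app

client = TestClient(app)
resp = client.get("/dataroom/acme-deal/info")  # "acme-deal" is an illustrative dataroom name
if resp.status_code == 200:
    info = resp.json()
    print(info["file_count"], info["total_size_mb"], info["collection_exists"])
else:
    print("dataroom not found or backend unavailable:", resp.status_code)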
@router.get("/")
async def list_datarooms():
"""