diff --git a/backend/RATE_LIMITING.md b/backend/RATE_LIMITING.md
new file mode 100644
index 0000000..358321e
--- /dev/null
+++ b/backend/RATE_LIMITING.md
@@ -0,0 +1,105 @@
+# Rate Limiting Configuration for Azure OpenAI
+
+This document explains how to configure rate limiting to avoid `429 RateLimitReached` errors from Azure OpenAI.
+
+## Environment Variables
+
+Add these variables to your `.env` file:
+
+```bash
+# Rate limiting for embeddings
+EMBEDDING_BATCH_SIZE=16
+EMBEDDING_DELAY_BETWEEN_BATCHES=1.0
+EMBEDDING_MAX_RETRIES=5
+```
+
+## Configuration by Azure OpenAI Tier
+
+### **S0 Tier (Free/Basic)**
+- **Limits**: ~1-3 requests/minute, ~1,000 tokens/minute
+- **Recommended configuration**:
+  ```bash
+  EMBEDDING_BATCH_SIZE=16
+  EMBEDDING_DELAY_BETWEEN_BATCHES=1.0
+  EMBEDDING_MAX_RETRIES=5
+  ```
+
+### **Standard Tier**
+- **Limits**: ~10-20 requests/second, ~100,000 tokens/minute
+- **Recommended configuration**:
+  ```bash
+  EMBEDDING_BATCH_SIZE=50
+  EMBEDDING_DELAY_BETWEEN_BATCHES=0.5
+  EMBEDDING_MAX_RETRIES=3
+  ```
+
+### **Premium Tier**
+- **Limits**: ~100+ requests/second, ~500,000+ tokens/minute
+- **Recommended configuration**:
+  ```bash
+  EMBEDDING_BATCH_SIZE=100
+  EMBEDDING_DELAY_BETWEEN_BATCHES=0.1
+  EMBEDDING_MAX_RETRIES=3
+  ```
+
+## How the Rate Limiting Works
+
+### 1. **Batching**
+Texts are split into batches of `EMBEDDING_BATCH_SIZE`. Smaller batches reduce the chance of exceeding the rate limit.
+
+### 2. **Delays Between Batches**
+After processing each batch, the system waits `EMBEDDING_DELAY_BETWEEN_BATCHES` seconds before processing the next one.
+
+### 3. **Retry with Exponential Backoff**
+If a 429 (rate limit) error occurs:
+- **Retry 1**: wait 2 seconds
+- **Retry 2**: wait 4 seconds
+- **Retry 3**: wait 8 seconds
+- **Retry 4**: wait 16 seconds
+- **Retry 5**: wait 32 seconds
+
+After `EMBEDDING_MAX_RETRIES` retries, the process fails. (A reference sketch of this schedule appears at the end of this document.)
+
+## Monitoring the Logs
+
+When you process documents, you will see logs like:
+
+```
+📊 Processing batch 1/10 (16 texts)...
+✓ Batch 1/10 completed successfully
+📊 Processing batch 2/10 (16 texts)...
+⚠️ Rate limit hit on batch 2/10. Retry 1/5 in 2s...
+✓ Batch 2/10 completed successfully
+...
+✅ Embeddings generated successfully: 150 vectors of 3072D
+```
+
+## Estimating Processing Time
+
+To estimate how long processing will take:
+
+```
+Estimated time = (total_chunks / EMBEDDING_BATCH_SIZE) * EMBEDDING_DELAY_BETWEEN_BATCHES
+```
+
+Note that this is a lower bound: it only counts the delays between batches, not the API call latency or any retries.
+
+**Examples**:
+- 100 chunks with the S0 config: `(100/16) * 1.0 = ~6.25 seconds` (excluding retries and API latency)
+- 1,000 chunks with the S0 config: `(1000/16) * 1.0 = ~62.5 seconds` (excluding retries and API latency)
+
+## Dynamic Tuning
+
+If you are seeing many 429 errors:
+1. **Decrease** `EMBEDDING_BATCH_SIZE` (e.g., from 16 to 8)
+2. **Increase** `EMBEDDING_DELAY_BETWEEN_BATCHES` (e.g., from 1.0 to 2.0)
+3. **Increase** `EMBEDDING_MAX_RETRIES` (e.g., from 5 to 10)
+
+If processing is very slow and you are NOT getting 429 errors:
+1. **Increase** `EMBEDDING_BATCH_SIZE` (e.g., from 16 to 32)
+2. **Decrease** `EMBEDDING_DELAY_BETWEEN_BATCHES` (e.g., from 1.0 to 0.5)
+
+## Upgrading Your Azure OpenAI Tier
+
+To raise your limits, visit:
+https://aka.ms/oai/quotaincrease
+
+After upgrading, adjust the environment variables to match your new tier.
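+
+## Reference Sketch of the Schedule
+
+The snippet below is an illustrative sketch only, not the production service: it mirrors the batching, delay, and backoff schedule described above. `fake_embed` is a hypothetical stand-in for the Azure OpenAI call, and `RuntimeError` stands in for `openai.RateLimitError`.
+
+```python
+import asyncio
+
+BATCH_SIZE = 16    # EMBEDDING_BATCH_SIZE
+DELAY = 1.0        # EMBEDDING_DELAY_BETWEEN_BATCHES
+MAX_RETRIES = 5    # EMBEDDING_MAX_RETRIES
+
+
+def estimated_seconds(total_chunks: int) -> float:
+    """Lower bound: inter-batch delays only (no API latency, no retries)."""
+    return (total_chunks / BATCH_SIZE) * DELAY
+
+
+async def embed_with_backoff(texts, embed_fn):
+    """Batch `texts`, retrying each batch with exponential backoff."""
+    vectors = []
+    for i in range(0, len(texts), BATCH_SIZE):
+        batch = texts[i:i + BATCH_SIZE]
+        retries = 0
+        while True:
+            try:
+                vectors.extend(await embed_fn(batch))
+                break
+            except RuntimeError:  # stand-in for openai.RateLimitError
+                retries += 1
+                if retries > MAX_RETRIES:
+                    raise
+                await asyncio.sleep(2 ** retries)  # 2s, 4s, 8s, 16s, 32s
+        if i + BATCH_SIZE < len(texts):
+            await asyncio.sleep(DELAY)  # pause between batches
+    return vectors
+
+
+if __name__ == "__main__":
+    async def fake_embed(batch):  # hypothetical stand-in for the API call
+        return [[0.0] * 3072 for _ in batch]
+
+    texts = [f"chunk {i}" for i in range(40)]
+    vecs = asyncio.run(embed_with_backoff(texts, fake_embed))
+    print(f"{len(vecs)} vectors; >= {estimated_seconds(len(texts))}s of delays")
+```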
diff --git a/backend/app/core/config.py b/backend/app/core/config.py
index bf700a7..a4cb29e 100644
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -41,6 +41,13 @@ class Settings(BaseSettings):
     AZURE_OPENAI_EMBEDDING_MODEL: str = "text-embedding-3-large"
     AZURE_OPENAI_EMBEDDING_DEPLOYMENT: str = "text-embedding-3-large"
 
+    # Rate limiting for embeddings (tune to your Azure OpenAI tier)
+    # S0 tier: batch_size=16, delay=1.0 is safe
+    # Higher tiers: batch_size=100, delay=0.1
+    EMBEDDING_BATCH_SIZE: int = 16
+    EMBEDDING_DELAY_BETWEEN_BATCHES: float = 1.0
+    EMBEDDING_MAX_RETRIES: int = 5
+
     # Google Cloud / Vertex AI configuration
     GOOGLE_APPLICATION_CREDENTIALS: str
     GOOGLE_CLOUD_PROJECT: str
diff --git a/backend/app/main.py b/backend/app/main.py
index f675635..7fe9d9e 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -11,6 +11,7 @@ from .routers.agent import router as agent_router
 from .routers.chunking import router as chunking_router
 from .routers.chunking_landingai import router as chunking_landingai_router
 from .routers.dataroom import router as dataroom_router
+from .routers.extracted_data import router as extracted_data_router
 from .routers.files import router as files_router
 from .routers.schemas import router as schemas_router
 from .routers.vectors import router as vectors_router
@@ -123,6 +124,9 @@ app.include_router(schemas_router)
 # Chunking LandingAI router (new)
 app.include_router(chunking_landingai_router)
 
+# Extracted data router (new)
+app.include_router(extracted_data_router)
+
 app.include_router(dataroom_router, prefix="/api/v1")
 app.include_router(agent_router)
diff --git a/backend/app/models/extracted_data.py b/backend/app/models/extracted_data.py
new file mode 100644
index 0000000..fe7d734
--- /dev/null
+++ b/backend/app/models/extracted_data.py
@@ -0,0 +1,68 @@
+"""
+Redis-OM model for storing data extracted from documents.
+Enables fast lookups of structured data without requiring a vector search.
+"""
+from typing import Dict, Any
+from redis_om import HashModel, Field, Migrator
+import json
+
+
+class ExtractedDocument(HashModel):
+    """
+    Model for persisting data extracted from documents in Redis.
+
+    Usage:
+    1. A PDF is processed with a schema and data is extracted
+    2. The chunks go to Qdrant (for RAG)
+    3. The extracted data goes to Redis (for structured lookups)
+
+    Advantages:
+    - Fast lookups by file_name, tema, collection_name
+    - Direct access to the extracted data without a vector search
+    - Supports filters and aggregations
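+
+    Example (illustrative values):
+        doc = ExtractedDocument(
+            file_name="report.pdf",
+            tema="IRS_FORM_990PF",
+            collection_name="main",
+            extracted_data_json="",
+            extraction_timestamp="2025-01-01T00:00:00",
+        )
+        doc.set_extracted_data({"ein": "12-3456789"})
+        doc.save()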
+    """
+
+    # Identifiers
+    file_name: str = Field(index=True)
+    tema: str = Field(index=True)
+    collection_name: str = Field(index=True)
+
+    # Extracted data (serialized JSON)
+    # Redis-OM HashModel does not support Dict directly, so we store a str and serialize
+    extracted_data_json: str
+
+    # Metadata
+    extraction_timestamp: str  # ISO format
+
+    class Meta:
+        database = None  # Configured at runtime
+        global_key_prefix = "extracted_doc"
+        model_key_prefix = "doc"
+
+    def set_extracted_data(self, data: Dict[str, Any]) -> None:
+        """Helper to serialize the extracted data to JSON"""
+        self.extracted_data_json = json.dumps(data, ensure_ascii=False, indent=2)
+
+    def get_extracted_data(self) -> Dict[str, Any]:
+        """Helper to deserialize the extracted data from JSON"""
+        return json.loads(self.extracted_data_json)
+
+    @classmethod
+    def find_by_file(cls, file_name: str):
+        """Finds all extracted documents for a file"""
+        return cls.find(cls.file_name == file_name).all()
+
+    @classmethod
+    def find_by_tema(cls, tema: str):
+        """Finds all extracted documents for a tema"""
+        return cls.find(cls.tema == tema).all()
+
+    @classmethod
+    def find_by_collection(cls, collection_name: str):
+        """Finds all documents in a collection"""
+        return cls.find(cls.collection_name == collection_name).all()
+
+
+# Run the migration to create the indexes in Redis
+Migrator().run()
diff --git a/backend/app/models/schema_models.py b/backend/app/models/schema_models.py
index c18e50c..af45ba6 100644
--- a/backend/app/models/schema_models.py
+++ b/backend/app/models/schema_models.py
@@ -58,7 +58,7 @@ class CustomSchema(BaseModel):
     schema_id: Optional[str] = Field(None, description="Unique schema ID (generated automatically if not provided)")
     schema_name: str = Field(..., description="Descriptive name of the schema", min_length=1, max_length=100)
     description: str = Field(..., description="Description of what this schema extracts", min_length=1, max_length=500)
-    fields: List[SchemaField] = Field(..., description="List of fields to extract", min_items=1, max_items=50)
+    fields: List[SchemaField] = Field(..., description="List of fields to extract", min_items=1, max_items=200)
 
     # Metadata
     created_at: Optional[str] = Field(None, description="ISO creation timestamp")
diff --git a/backend/app/routers/chunking_landingai.py b/backend/app/routers/chunking_landingai.py
index 5f1d5d6..1d3aab3 100644
--- a/backend/app/routers/chunking_landingai.py
+++ b/backend/app/routers/chunking_landingai.py
@@ -14,6 +14,7 @@ from pydantic import BaseModel, Field
 from ..repositories.schema_repository import get_schema_repository
 from ..services.chunking_service import get_chunking_service
 from ..services.landingai_service import get_landingai_service
+from ..services.extracted_data_service import get_extracted_data_service
 from ..utils.chunking.token_manager import TokenManager
 
 logger = logging.getLogger(__name__)
@@ -105,11 +106,12 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
     logger.info(f"Tema: {request.tema}")
     logger.info(f"Mode: {request.mode}")
     logger.info(f"Collection: {request.collection_name}")
+    logger.info(f"Schema ID received: '{request.schema_id}' (type: {type(request.schema_id).__name__})")
 
     # 1. Validate the schema when mode is "extract"
     custom_schema = None
     if request.mode == "extract":
-        if not request.schema_id:
+        if not request.schema_id or request.schema_id.strip() == "":
             raise HTTPException(
                 status_code=400,
                 detail="schema_id is required when mode='extract'",
@@ -224,6 +226,22 @@ async def process_with_landingai(request: ProcessLandingAIRequest):
                 status_code=500,
                 detail=f"Error uploading to Qdrant: {str(e)}"
             )
 
+    # 8. Persist the extracted data in Redis (when extracted_data exists)
+    if result.get("extracted_data") and result["extracted_data"].get("extraction"):
+        try:
+            logger.info("\n[6/6] Saving extracted data to Redis...")
+            extracted_data_service = get_extracted_data_service()
+
+            await extracted_data_service.save_extracted_data(
+                file_name=request.file_name,
+                tema=request.tema,
+                collection_name=request.collection_name,
+                extracted_data=result["extracted_data"]["extraction"]
+            )
+        except Exception as e:
+            # Don't fail the request if Redis fails; just log it
+            logger.warning(f"⚠️ Could not save data to Redis (non-critical): {e}")
+
     # Total time
     processing_time = time.time() - start_time
diff --git a/backend/app/routers/extracted_data.py b/backend/app/routers/extracted_data.py
new file mode 100644
index 0000000..31935d3
--- /dev/null
+++ b/backend/app/routers/extracted_data.py
@@ -0,0 +1,141 @@
+"""
+Router for querying extracted data stored in Redis.
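+
+Illustrative requests (path values are placeholders):
+    GET /api/v1/extracted-data/by-file/{file_name}
+    GET /api/v1/extracted-data/by-tema/{tema}
+    GET /api/v1/extracted-data/by-collection/{collection_name}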
+"""
+import logging
+from typing import List
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+from ..services.extracted_data_service import get_extracted_data_service
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/api/v1/extracted-data", tags=["extracted-data"])
+
+
+class ExtractedDataResponse(BaseModel):
+    """Response with a document's extracted data"""
+    pk: str
+    file_name: str
+    tema: str
+    collection_name: str
+    extracted_data: dict
+    extraction_timestamp: str
+
+
+class ExtractedDataListResponse(BaseModel):
+    """Response with a list of extracted documents"""
+    total: int
+    documents: List[ExtractedDataResponse]
+
+
+@router.get("/by-file/{file_name}", response_model=ExtractedDataListResponse)
+async def get_by_file(file_name: str):
+    """
+    Gets all the extracted data for a specific file.
+
+    Args:
+        file_name: Name of the file
+
+    Returns:
+        List of documents with extracted data
+    """
+    try:
+        service = get_extracted_data_service()
+        docs = await service.get_by_file(file_name)
+
+        documents = [
+            ExtractedDataResponse(
+                pk=doc.pk,
+                file_name=doc.file_name,
+                tema=doc.tema,
+                collection_name=doc.collection_name,
+                extracted_data=doc.get_extracted_data(),
+                extraction_timestamp=doc.extraction_timestamp
+            )
+            for doc in docs
+        ]
+
+        return ExtractedDataListResponse(
+            total=len(documents),
+            documents=documents
+        )
+
+    except Exception as e:
+        logger.error(f"Error fetching extracted data by file: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/by-tema/{tema}", response_model=ExtractedDataListResponse)
+async def get_by_tema(tema: str):
+    """
+    Gets all the extracted data for a specific tema.
+
+    Args:
+        tema: Name of the tema
+
+    Returns:
+        List of documents with extracted data
+    """
+    try:
+        service = get_extracted_data_service()
+        docs = await service.get_by_tema(tema)
+
+        documents = [
+            ExtractedDataResponse(
+                pk=doc.pk,
+                file_name=doc.file_name,
+                tema=doc.tema,
+                collection_name=doc.collection_name,
+                extracted_data=doc.get_extracted_data(),
+                extraction_timestamp=doc.extraction_timestamp
+            )
+            for doc in docs
+        ]
+
+        return ExtractedDataListResponse(
+            total=len(documents),
+            documents=documents
+        )
+
+    except Exception as e:
+        logger.error(f"Error fetching extracted data by tema: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/by-collection/{collection_name}", response_model=ExtractedDataListResponse)
+async def get_by_collection(collection_name: str):
+    """
+    Gets all the extracted data for a specific collection.
+
+    Args:
+        collection_name: Name of the collection
+
+    Returns:
+        List of documents with extracted data
+    """
+    try:
+        service = get_extracted_data_service()
+        docs = await service.get_by_collection(collection_name)
+
+        documents = [
+            ExtractedDataResponse(
+                pk=doc.pk,
+                file_name=doc.file_name,
+                tema=doc.tema,
+                collection_name=doc.collection_name,
+                extracted_data=doc.get_extracted_data(),
+                extraction_timestamp=doc.extraction_timestamp
+            )
+            for doc in docs
+        ]
+
+        return ExtractedDataListResponse(
+            total=len(documents),
+            documents=documents
+        )
+
+    except Exception as e:
+        logger.error(f"Error fetching extracted data by collection: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
diff --git a/backend/app/services/chunking_service.py b/backend/app/services/chunking_service.py
index 1c54836..f2bd599 100644
--- a/backend/app/services/chunking_service.py
+++ b/backend/app/services/chunking_service.py
@@ -66,6 +66,8 @@ class ChunkingService:
         """
         Downloads a PDF from Azure Blob Storage.
 
+        NOTE: All blobs are stored in lowercase in Azure.
+
         Args:
            file_name: Name of the file
            tema: Tema/folder of the file
@@ -77,8 +79,9 @@
             Exception: If there is an error downloading the file
         """
         try:
-            blob_path = f"{tema}/{file_name}"
-            logger.info(f"Downloading PDF: {blob_path}")
+            # Lowercase the path, since all blobs are stored in lowercase
+            blob_path = f"{tema.lower()}/{file_name.lower()}"
+            logger.info(f"Downloading PDF: {blob_path} (original tema: {tema}, original file: {file_name})")
 
             blob_client = self.blob_service.get_blob_client(
                 container=self.container_name,
diff --git a/backend/app/services/embedding_service.py b/backend/app/services/embedding_service.py
index 7581584..eb02727 100644
--- a/backend/app/services/embedding_service.py
+++ b/backend/app/services/embedding_service.py
@@ -1,10 +1,12 @@
 """
 Embeddings service using Azure OpenAI.
 Generates embeddings for text chunks using text-embedding-3-large (3072 dimensions).
+Includes rate-limit handling with exponential retry and delays between batches.
 """
+import asyncio
 import logging
 from typing import List
-from openai import AzureOpenAI
+from openai import AzureOpenAI, RateLimitError
 
 from ..core.config import settings
 
 logger = logging.getLogger(__name__)
@@ -63,46 +65,89 @@ class EmbeddingService:
     async def generate_embeddings_batch(
         self,
         texts: List[str],
-        batch_size: int = 100
+        batch_size: int | None = None,
+        delay_between_batches: float | None = None,
+        max_retries: int | None = None
     ) -> List[List[float]]:
         """
-        Generates embeddings for multiple texts in batches.
+        Generates embeddings for multiple texts in batches, with rate-limit handling.
 
         Args:
             texts: List of texts to generate embeddings for
-            batch_size: Batch size for processing (default: 100)
+            batch_size: Batch size (None = use the value from settings)
+            delay_between_batches: Seconds to wait between batches (None = use settings)
+            max_retries: Maximum number of retries (None = use settings)
 
         Returns:
             List of embedding vectors
 
         Raises:
-            Exception: If there is an error generating the embeddings
+            Exception: If embedding generation still fails after all retries
         """
+        # Fall back to the settings values when not provided; compare against
+        # None explicitly so that an intentional 0 / 0.0 is respected
+        batch_size = batch_size if batch_size is not None else settings.EMBEDDING_BATCH_SIZE
+        delay_between_batches = (
+            delay_between_batches
+            if delay_between_batches is not None
+            else settings.EMBEDDING_DELAY_BETWEEN_BATCHES
+        )
+        max_retries = max_retries if max_retries is not None else settings.EMBEDDING_MAX_RETRIES
+
         try:
             embeddings = []
+            total_batches = (len(texts) - 1) // batch_size + 1
+
+            logger.info(f"Starting embedding generation: {len(texts)} texts in {total_batches} batches")
+            logger.info(f"Config: batch_size={batch_size}, delay={delay_between_batches}s, max_retries={max_retries}")
 
             for i in range(0, len(texts), batch_size):
                 batch = texts[i:i + batch_size]
-                logger.info(f"Processing batch {i//batch_size + 1}/{(len(texts)-1)//batch_size + 1}")
+                batch_num = i // batch_size + 1
 
-                response = self.client.embeddings.create(
-                    input=batch,
-                    model=self.model
-                )
+                logger.info(f"📊 Processing batch {batch_num}/{total_batches} ({len(batch)} texts)...")
 
-                batch_embeddings = [item.embedding for item in response.data]
-
-                # Validate dimensions
-                for idx, emb in enumerate(batch_embeddings):
-                    if len(emb) != self.embedding_dimension:
-                        raise ValueError(
-                            f"Wrong dimension at index {i + idx}: "
-                            f"expected {self.embedding_dimension}, got {len(emb)}"
-                        )
-
-                embeddings.extend(batch_embeddings)
-
-            logger.info(f"Generated {len(embeddings)} embeddings successfully")
+                # Retry with exponential backoff
+                retry_count = 0
+                while retry_count <= max_retries:
+                    try:
+                        response = self.client.embeddings.create(
+                            input=batch,
+                            model=self.model
+                        )
+
+                        batch_embeddings = [item.embedding for item in response.data]
+
+                        # Validate dimensions
+                        for idx, emb in enumerate(batch_embeddings):
+                            if len(emb) != self.embedding_dimension:
+                                raise ValueError(
+                                    f"Wrong dimension at index {i + idx}: "
+                                    f"expected {self.embedding_dimension}, got {len(emb)}"
+                                )
+
+                        embeddings.extend(batch_embeddings)
+                        logger.info(f"✓ Batch {batch_num}/{total_batches} completed successfully")
+                        break  # Success; exit the retry loop
+
+                    except RateLimitError:
+                        retry_count += 1
+                        if retry_count > max_retries:
+                            logger.error(f"❌ Rate limit still exceeded after {max_retries} retries")
+                            raise
+
+                        # Exponential backoff: 2^retry_count seconds
+                        wait_time = 2 ** retry_count
+                        logger.warning(
+                            f"⚠️ Rate limit hit on batch {batch_num}/{total_batches}. "
+                            f"Retry {retry_count}/{max_retries} in {wait_time}s..."
+                        )
+                        await asyncio.sleep(wait_time)
+
+                    except Exception as e:
+                        logger.error(f"❌ Error in batch {batch_num}/{total_batches}: {e}")
+                        raise
+
+                # Delay between batches to respect the rate limit (skipped after the last batch)
+                if i + batch_size < len(texts):
+                    await asyncio.sleep(delay_between_batches)
+
+            logger.info(f"✅ Embeddings generated successfully: {len(embeddings)} vectors of {self.embedding_dimension}D")
 
             return embeddings
 
         except Exception as e:
diff --git a/backend/app/services/extracted_data_service.py b/backend/app/services/extracted_data_service.py
new file mode 100644
index 0000000..e19605c
--- /dev/null
+++ b/backend/app/services/extracted_data_service.py
@@ -0,0 +1,131 @@
+"""
+Service for storing and retrieving extracted document data in Redis.
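+
+Illustrative usage (hypothetical values):
+    service = get_extracted_data_service()
+    await service.save_extracted_data(
+        file_name="report.pdf",
+        tema="IRS_FORM_990PF",
+        collection_name="main",
+        extracted_data={"ein": "12-3456789"},
+    )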
+"""
+import logging
+from datetime import datetime
+from typing import Dict, Any, List, Optional
+
+from ..models.extracted_data import ExtractedDocument
+
+logger = logging.getLogger(__name__)
+
+
+class ExtractedDataService:
+    """Service for saving and retrieving data extracted from documents"""
+
+    async def save_extracted_data(
+        self,
+        file_name: str,
+        tema: str,
+        collection_name: str,
+        extracted_data: Dict[str, Any]
+    ) -> ExtractedDocument:
+        """
+        Saves the data extracted from a document to Redis.
+
+        Args:
+            file_name: Name of the file
+            tema: Tema of the document
+            collection_name: Qdrant collection
+            extracted_data: Extracted data (dict)
+
+        Returns:
+            The saved ExtractedDocument
+        """
+        try:
+            # Create the model instance
+            doc = ExtractedDocument(
+                file_name=file_name,
+                tema=tema,
+                collection_name=collection_name,
+                extracted_data_json="",  # Set below
+                extraction_timestamp=datetime.utcnow().isoformat()
+            )
+
+            # Serialize the extracted data
+            doc.set_extracted_data(extracted_data)
+
+            # Save to Redis
+            doc.save()
+
+            logger.info(
+                f"💾 Extracted data saved to Redis: {file_name} "
+                f"({len(extracted_data)} fields)"
+            )
+
+            return doc
+
+        except Exception as e:
+            logger.error(f"Error saving extracted data to Redis: {e}")
+            raise
+
+    async def get_by_file(self, file_name: str) -> List[ExtractedDocument]:
+        """
+        Gets all the extracted documents for a file.
+
+        Args:
+            file_name: Name of the file
+
+        Returns:
+            List of ExtractedDocument
+        """
+        try:
+            docs = ExtractedDocument.find_by_file(file_name)
+            logger.info(f"Found {len(docs)} extracted documents for {file_name}")
+            return docs
+        except Exception as e:
+            logger.error(f"Error searching documents by file: {e}")
+            return []
+
+    async def get_by_tema(self, tema: str) -> List[ExtractedDocument]:
+        """
+        Gets all the extracted documents for a tema.
+
+        Args:
+            tema: Tema to search for
+
+        Returns:
+            List of ExtractedDocument
+        """
+        try:
+            docs = ExtractedDocument.find_by_tema(tema)
+            logger.info(f"Found {len(docs)} extracted documents for tema {tema}")
+            return docs
+        except Exception as e:
+            logger.error(f"Error searching documents by tema: {e}")
+            return []
+
+    async def get_by_collection(self, collection_name: str) -> List[ExtractedDocument]:
+        """
+        Gets all the documents in a collection.
+
+        Args:
+            collection_name: Name of the collection
+
+        Returns:
+            List of ExtractedDocument
+        """
+        try:
+            docs = ExtractedDocument.find_by_collection(collection_name)
+            logger.info(f"Found {len(docs)} documents in collection {collection_name}")
+            return docs
+        except Exception as e:
+            logger.error(f"Error searching documents by collection: {e}")
+            return []
+
+
+# Global singleton instance
+_extracted_data_service: Optional[ExtractedDataService] = None
+
+
+def get_extracted_data_service() -> ExtractedDataService:
+    """
+    Returns the singleton instance of the service.
+
+    Returns:
+        ExtractedDataService instance
+    """
+    global _extracted_data_service
+    if _extracted_data_service is None:
+        _extracted_data_service = ExtractedDataService()
+    return _extracted_data_service
diff --git a/backend/data/schemas/schema_103b7090a542.json b/backend/data/schemas/schema_103b7090a542.json
new file mode 100644
index 0000000..c7de5ba
--- /dev/null
+++ b/backend/data/schemas/schema_103b7090a542.json
@@ -0,0 +1,767 @@
+{
+  "schema_id": "schema_103b7090a542",
+  "schema_name": "Form 990-PF Data Extraction",
+  "description": "Comprehensive data extraction schema for IRS Form 990-PF (Private Foundation) including financial, governance, and operational information",
+  "fields": [
+    {
+      "name": "ein",
+      "type": "string",
+      "description": "Federal Employer Identification Number of the organization",
+      "required": true,
+      "min_value": null,
+      "max_value": null,
+      "pattern": "^\\d{2}-\\d{7}$"
+    },
+    {
+      "name": "legal_name",
+      "type": "string",
+      "description": "Official registered name of the organization",
+      "required": true,
+      "min_value": null,
+      "max_value": null,
+      "pattern": null
+    },
+    {
+      "name": "phone_number",
+      "type": "string",
+      "description": "Primary contact phone number",
+      "required": true,
+      "min_value": null,
+      "max_value": null,
+      "pattern": "^\\([0-9]{3}\\) [0-9]{3}-[0-9]{4}$"
+    },
+    {
+      "name": "website_url",
+      "type": "string",
+      "description": "Organization's website address",
+      "required": true,
+      "min_value": null,
+      "max_value": null,
+      "pattern": null
+    },
+    {
+      "name": "return_type",
+      "type": "string",
+      "description": "Type of IRS return filed (990-PF for private foundations)",
+      "required": true,
+      "min_value": null,
+      "max_value": null,
+      "pattern": null
+    },
+    {
+      "name": "amended_return",
+      "type": "string",
+      "description": "Indicates if this is an amended return (Yes/No)",
+      "required": true,
+      "min_value": null,
+      "max_value": null,
+      "pattern": null
+    },
+    {
+      "name": "group_exemption_number",
+      "type": "string",
+      "description": "IRS group exemption number, if applicable",
+      "required": true,
+      "min_value": null,
+      "max_value": null,
+      "pattern": null
+    },
+    {
+      "name": "subsection_code",
+      "type": "string",
+      "description": "IRS subsection code (typically 501(c)(3) for foundations)",
+      "required": true,
+      "min_value": null,
+      "max_value": null,
+      "pattern": null
+    },
+    {
+      "name": "ruling_date",
+      "type": "string",
+      "description": "Date of IRS ruling or determination letter",
+      "required": true,
+      "min_value": null,
+      "max_value": null,
+      "pattern": null
+    },
+    {
+      "name": "accounting_method",
+      "type": "string",
+      "description": "Accounting method used (Cash, Accrual, or Other)",
+      "required": true,
+      "min_value": null,
+      "max_value": null,
+      "pattern": null
+    },
+    {
+      "name": "organization_type",
+      "type": "string",
+      "description": "Legal structure (corporation, trust, association, etc.)",
+      "required": true,
+      "min_value": null,
+      "max_value": null,
"pattern": null + }, + { + "name": "year_of_formation", + "type": "string", + "description": "Year the organization was established", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "incorporation_state", + "type": "string", + "description": "State where the organization was incorporated", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "total_revenue", + "type": "float", + "description": "Sum of all revenue sources for the year", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "contributions_gifts_grants", + "type": "float", + "description": "Revenue from donations, contributions, and grants", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "program_service_revenue", + "type": "float", + "description": "Revenue generated from program services", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "membership_dues", + "type": "float", + "description": "Revenue from membership dues and assessments", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "investment_income", + "type": "float", + "description": "Income from interest, dividends, and other investments", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "gains_losses_sales_assets", + "type": "float", + "description": "Net gains or losses from sale of investments and assets", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "rental_income", + "type": "float", + "description": "Income from rental of real estate or equipment", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "related_organizations_revenue", + "type": "float", + "description": "Revenue received from related organizations", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "gaming_revenue", + "type": "float", + "description": "Revenue from gaming and gambling activities", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "other_revenue", + "type": "float", + "description": "All other revenue not categorized elsewhere", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "government_grants", + "type": "float", + "description": "Revenue from federal, state, and local government grants", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "foreign_contributions", + "type": "float", + "description": "Revenue from foreign sources and contributors", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "total_expenses", + "type": "float", + "description": "Sum of all organizational expenses for the year", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "program_services_expenses", + "type": "float", + "description": "Direct expenses for charitable program activities", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "management_general_expenses", + "type": "float", + "description": "Administrative and general operating expenses", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": 
"fundraising_expenses", + "type": "float", + "description": "Expenses related to fundraising activities", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "grants_us_organizations", + "type": "float", + "description": "Grants and assistance provided to domestic organizations", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "grants_us_individuals", + "type": "float", + "description": "Grants and assistance provided to domestic individuals", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "grants_foreign_organizations", + "type": "float", + "description": "Grants and assistance provided to foreign organizations", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "grants_foreign_individuals", + "type": "float", + "description": "Grants and assistance provided to foreign individuals", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "compensation_officers", + "type": "float", + "description": "Total compensation paid to officers and key employees", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "compensation_other_staff", + "type": "float", + "description": "Compensation paid to other employees", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "payroll_taxes_benefits", + "type": "float", + "description": "Payroll taxes, pension plans, and employee benefits", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "professional_fees", + "type": "float", + "description": "Legal, accounting, and other professional service fees", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "office_occupancy_costs", + "type": "float", + "description": "Rent, utilities, and facility-related expenses", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "information_technology_costs", + "type": "float", + "description": "IT equipment, software, and technology expenses", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "travel_conference_expenses", + "type": "float", + "description": "Travel, conferences, conventions, and meetings", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "depreciation_amortization", + "type": "float", + "description": "Depreciation of equipment and amortization of intangibles", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "insurance", + "type": "float", + "description": "Insurance premiums and related costs", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "officers_list", + "type": "array_string", + "description": "JSON array of officers, directors, trustees, and key employees with their details", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "governing_body_size", + "type": "integer", + "description": "Total number of voting members on the governing body", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "independent_members", + "type": "integer", + "description": "Number of independent voting members", + "required": true, + "min_value": 0, 
+ "max_value": null, + "pattern": null + }, + { + "name": "financial_statements_reviewed", + "type": "string", + "description": "Whether financial statements were reviewed or audited", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "form_990_provided_to_governing_body", + "type": "string", + "description": "Whether Form 990 was provided to governing body before filing", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "conflict_of_interest_policy", + "type": "string", + "description": "Whether organization has a conflict of interest policy", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "whistleblower_policy", + "type": "string", + "description": "Whether organization has a whistleblower policy", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "document_retention_policy", + "type": "string", + "description": "Whether organization has a document retention and destruction policy", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "ceo_compensation_review_process", + "type": "string", + "description": "Process used to determine compensation of organization's top management", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "public_disclosure_practices", + "type": "string", + "description": "How organization makes its governing documents and annual returns available to the public", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "program_accomplishments_list", + "type": "array_string", + "description": "JSON array of program service accomplishments with descriptions and financial details", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "total_fundraising_event_revenue", + "type": "float", + "description": "Total revenue from all fundraising events", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "total_fundraising_event_expenses", + "type": "float", + "description": "Total direct expenses for all fundraising events", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "professional_fundraiser_fees", + "type": "float", + "description": "Fees paid to professional fundraising services", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "number_of_employees", + "type": "integer", + "description": "Total number of employees during the year", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "number_of_volunteers", + "type": "integer", + "description": "Estimate of volunteers who provided services", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "occupancy_costs", + "type": "float", + "description": "Total costs for office space and facilities", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "fundraising_method_descriptions", + "type": "string", + "description": "Description of methods used for fundraising", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "joint_ventures_disregarded_entities", + "type": "string", + "description": "Information about joint ventures and 
disregarded entities", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "base_compensation", + "type": "float", + "description": "Base salary or wages paid to key personnel", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "bonus", + "type": "float", + "description": "Bonus and incentive compensation paid", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "incentive", + "type": "float", + "description": "Other incentive compensation", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "other_compensation", + "type": "float", + "description": "Other forms of compensation", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "non_fixed_compensation", + "type": "string", + "description": "Whether compensation arrangement is non-fixed", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "first_class_travel", + "type": "string", + "description": "Whether first-class or charter travel was provided", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "housing_allowance", + "type": "string", + "description": "Whether housing allowance or residence was provided", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "expense_account_usage", + "type": "string", + "description": "Whether payments for business use of personal residence were made", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "supplemental_retirement", + "type": "string", + "description": "Whether supplemental nonqualified retirement plan was provided", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "lobbying_expenditures_direct", + "type": "float", + "description": "Amount spent on direct lobbying activities", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "lobbying_expenditures_grassroots", + "type": "float", + "description": "Amount spent on grassroots lobbying activities", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "election_501h_status", + "type": "string", + "description": "Whether the organization made a Section 501(h) election", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "political_campaign_expenditures", + "type": "float", + "description": "Amount spent on political campaign activities", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "related_organizations_affiliates", + "type": "string", + "description": "Information about related organizations involved in political activities", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "investment_types", + "type": "string", + "description": "Description of types of investments held", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "donor_restricted_endowment_values", + "type": "float", + "description": "Value of permanently restricted endowment funds", + "required": true, + "min_value": 0, + "max_value": null, + "pattern": null + }, + { + "name": "net_appreciation_depreciation", + "type": "float", + 
"description": "Net appreciation or depreciation in fair value of investments", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "related_organization_transactions", + "type": "string", + "description": "Information about transactions with related organizations", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "loans_to_from_related_parties", + "type": "string", + "description": "Information about loans to or from related parties", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "penalties_excise_taxes_reported", + "type": "string", + "description": "Whether the organization reported any penalties or excise taxes", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "unrelated_business_income_disclosure", + "type": "string", + "description": "Whether the organization had unrelated business income", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "foreign_bank_account_reporting", + "type": "string", + "description": "Whether the organization had foreign bank accounts or assets", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "schedule_o_narrative_explanations", + "type": "string", + "description": "Additional narrative explanations from Schedule O", + "required": true, + "min_value": null, + "max_value": null, + "pattern": null + } + ], + "created_at": "2025-11-07T23:45:00.000000", + "updated_at": "2025-11-07T23:45:00.000000", + "tema": "IRS_FORM_990PF", + "is_global": true +} \ No newline at end of file diff --git a/backend/data/schemas/schema_103b7090a545.json b/backend/data/schemas/schema_103b7090a545.json new file mode 100644 index 0000000..3c0ac4e --- /dev/null +++ b/backend/data/schemas/schema_103b7090a545.json @@ -0,0 +1,74 @@ +{ + "schema_id": "schema_103b7090a545", + "schema_name": "Testing", + "description": "Informacion de las facturas de taxes (Prueba)", + "fields": [ + { + "name": "employed_id", + "type": "string", + "description": "id number from employed", + "required": false, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "ej_numero", + "type": "integer", + "description": "ejemplo", + "required": false, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "ej_decimal", + "type": "float", + "description": "ejemplo", + "required": false, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "ej_booleano", + "type": "boolean", + "description": "ejemplo", + "required": false, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "ej_list", + "type": "array_string", + "description": "ejemplo", + "required": false, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "ej_listnum", + "type": "array_integer", + "description": "ejemplo", + "required": false, + "min_value": null, + "max_value": null, + "pattern": null + }, + { + "name": "fecha", + "type": "date", + "description": "ejemplo", + "required": false, + "min_value": null, + "max_value": null, + "pattern": null + } + ], + "created_at": "2025-11-07T17:49:18.193078", + "updated_at": "2025-11-07T22:19:53.434529", + "tema": "ULTA", + "is_global": true +} \ No newline at end of file diff --git a/backend/data/schemas/schema_990.json b/backend/data/schemas/schema_990.json new file mode 100644 index 
0000000..e69de29 diff --git a/docker-compose.yml b/docker-compose.yml index 84a4c08..2f0df99 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,6 +20,7 @@ services: volumes: - ./backend/app:/app/app - ./backend/.secrets:/app/.secrets + - ./backend/data:/app/data env_file: - .env networks: @@ -27,7 +28,6 @@ services: db: image: redis/redis-stack:latest - command: redis-server --appendonly yes ports: - 6379:6379 - 8001:8001 diff --git a/frontend/src/components/FilesTab.tsx b/frontend/src/components/FilesTab.tsx index 8a2e9fd..6467b75 100644 --- a/frontend/src/components/FilesTab.tsx +++ b/frontend/src/components/FilesTab.tsx @@ -218,10 +218,10 @@ export function FilesTab({ tema: chunkingFileTema, collection_name: chunkingCollectionName, mode: config.mode, - schema_id: config.schemaId, - include_chunk_types: config.includeChunkTypes, - max_tokens_per_chunk: config.maxTokensPerChunk, - merge_small_chunks: config.mergeSmallChunks, + schema_id: config.schema_id, + include_chunk_types: config.include_chunk_types, + max_tokens_per_chunk: config.max_tokens_per_chunk, + merge_small_chunks: config.merge_small_chunks, }; await api.processWithLandingAI(processConfig);