Landing AI integrado

2025-11-06 13:29:43 +00:00
parent 7c6e8c4858
commit c03d0e27c4
32 changed files with 3908 additions and 728 deletions
--- a/backend/app/pycache/init.cpython-312.pyc
+++ b/backend/app/pycache/init.cpython-312.pyc
--- a/backend/app/pycache/main.cpython-312.pyc
+++ b/backend/app/pycache/main.cpython-312.pyc
--- a/backend/app/core/pycache/init.cpython-312.pyc
+++ b/backend/app/core/pycache/init.cpython-312.pyc
--- a/backend/app/core/pycache/config.cpython-312.pyc
+++ b/backend/app/core/pycache/config.cpython-312.pyc
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -45,6 +45,13 @@ class Settings(BaseSettings):
    GOOGLE_CLOUD_LOCATION: str = "us-central1"
    GEMINI_MODEL: str = "gemini-2.0-flash"

+    # LandingAI configuración
+    LANDINGAI_API_KEY: str
+    LANDINGAI_ENVIRONMENT: str = "production"  # "production" o "eu"
+
+    # Schemas storage
+    SCHEMAS_DIR: str = "./data/schemas"
+
    @validator("AZURE_STORAGE_CONNECTION_STRING")
    def validate_azure_connection_string(cls, v):
        """Validar que el connection string de Azure esté presente"""
@@ -94,6 +101,13 @@ class Settings(BaseSettings):
            raise ValueError("GOOGLE_CLOUD_PROJECT es requerido")
        return v

+    @validator("LANDINGAI_API_KEY")
+    def validate_landingai_api_key(cls, v):
+        """Ensure the LandingAI API key is present (rejects empty/falsy values)."""
+        if v:
+            return v
+        raise ValueError("LANDINGAI_API_KEY es requerido")
+
    class Config:
        env_file = ".env"
        case_sensitive = True
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -8,6 +8,8 @@ import logging
 from .routers.files import router as files_router
 from .routers.vectors import router as vectors_router
 from .routers.chunking import router as chunking_router
+from .routers.schemas import router as schemas_router
+from .routers.chunking_landingai import router as chunking_landingai_router
 from .core.config import settings
 # from routers.ai import router as ai_router  #  futuro con Azure OpenAI

@@ -112,6 +114,12 @@ app.include_router(
    tags=["chunking"]
 )

+# Schemas router (nuevo)
+app.include_router(schemas_router)
+
+# Chunking LandingAI router (nuevo)
+app.include_router(chunking_landingai_router)
+
 # Router para IA
 # app.include_router(
 #     ai_router,
--- a/backend/app/models/pycache/init.cpython-312.pyc
+++ b/backend/app/models/pycache/init.cpython-312.pyc
--- a/backend/app/models/pycache/file_models.cpython-312.pyc
+++ b/backend/app/models/pycache/file_models.cpython-312.pyc
--- a/backend/app/models/schema_models.py
+++ b/backend/app/models/schema_models.py
@@ -0,0 +1,96 @@
+"""
+Modelos Pydantic para schemas personalizables.
+Permite definir schemas dinámicos desde el frontend para extracción de datos.
+"""
+from pydantic import BaseModel, Field, field_validator
+from typing import List, Optional
+from enum import Enum
+from datetime import datetime
+
+
+class FieldType(str, Enum):
+    """Field types supported for extraction.
+
+    Members also subclass ``str``, so each one compares equal to its string value.
+    """
+    # Scalar types
+    STRING = "string"
+    INTEGER = "integer"
+    FLOAT = "float"
+    BOOLEAN = "boolean"
+    # Array types (element type given by the suffix)
+    ARRAY_STRING = "array_string"
+    ARRAY_INTEGER = "array_integer"
+    ARRAY_FLOAT = "array_float"
+    # Date value; the expected date format is not defined in this module
+    DATE = "date"
+
+
+class SchemaField(BaseModel):
+    """Definition of a single field inside an extraction schema.
+
+    Besides the basic ``name``/``type``/``description`` triple, a field may
+    carry optional constraints: a numeric range (``min_value``/``max_value``)
+    and a regex ``pattern`` for strings.
+    """
+    name: str = Field(..., description="Nombre del campo (snake_case)", min_length=1)
+    type: FieldType = Field(..., description="Tipo de dato del campo")
+    description: str = Field(..., description="Descripción clara para el LLM sobre qué extraer", min_length=1)
+    required: bool = Field(default=False, description="¿Es obligatorio extraer este campo?")
+
+    # Optional constraints
+    min_value: Optional[float] = Field(None, description="Valor mínimo (para integer/float)")
+    max_value: Optional[float] = Field(None, description="Valor máximo (para integer/float)")
+    pattern: Optional[str] = Field(None, description="Patrón regex para validar strings")
+
+    @field_validator('name')
+    @classmethod
+    def validate_field_name(cls, v: str) -> str:
+        """Check that the name is alphanumeric snake_case and normalise it to lowercase.
+
+        Raises:
+            ValueError: If the name contains characters other than letters,
+                digits and underscores, or starts with a digit.
+        """
+        if not v.replace('_', '').isalnum():
+            raise ValueError("El nombre del campo debe ser snake_case alfanumérico")
+        if v[0].isdigit():
+            raise ValueError("El nombre del campo no puede empezar con número")
+        return v.lower()
+
+    @field_validator('min_value', 'max_value')
+    @classmethod
+    def validate_numeric_constraints(cls, v: Optional[float], info) -> Optional[float]:
+        """Check that min/max are only used on numeric types and form a valid range.
+
+        Raises:
+            ValueError: If a bound is set on a non-numeric field, or if
+                ``max_value`` is lower than ``min_value``.
+        """
+        if v is None:
+            return v
+
+        field_type = info.data.get('type')
+        if field_type not in [FieldType.INTEGER, FieldType.FLOAT, FieldType.ARRAY_INTEGER, FieldType.ARRAY_FLOAT]:
+            raise ValueError(f"min_value/max_value solo aplican a campos numéricos, no a {field_type}")
+
+        # min_value is declared before max_value, so it is already present in
+        # info.data (when valid) by the time max_value is validated.
+        if info.field_name == 'max_value':
+            lower_bound = info.data.get('min_value')
+            if lower_bound is not None and v < lower_bound:
+                raise ValueError("max_value no puede ser menor que min_value")
+        return v
+
+
+class CustomSchema(BaseModel):
+    """User-defined schema describing which data to extract.
+
+    Holds between 1 and 50 uniquely named fields plus optional metadata
+    (timestamps, associated topic, global flag).
+    """
+    schema_id: Optional[str] = Field(None, description="ID único del schema (generado automáticamente si no se provee)")
+    schema_name: str = Field(..., description="Nombre descriptivo del schema", min_length=1, max_length=100)
+    description: str = Field(..., description="Descripción de qué extrae este schema", min_length=1, max_length=500)
+    # min_length/max_length are the Pydantic v2 names for the list-size
+    # constraints (min_items/max_items are deprecated v1 aliases).
+    fields: List[SchemaField] = Field(..., description="Lista de campos a extraer", min_length=1, max_length=50)
+
+    # Metadata
+    created_at: Optional[str] = Field(None, description="Timestamp de creación ISO")
+    updated_at: Optional[str] = Field(None, description="Timestamp de última actualización ISO")
+    tema: Optional[str] = Field(None, description="Tema asociado (si es específico de un tema)")
+    is_global: bool = Field(default=False, description="¿Disponible para todos los temas?")
+
+    @field_validator('fields')
+    @classmethod
+    def validate_unique_field_names(cls, v: List[SchemaField]) -> List[SchemaField]:
+        """Reject schemas that contain duplicated field names.
+
+        Raises:
+            ValueError: If two or more fields share the same name.
+        """
+        field_names = [field.name for field in v]
+        if len(field_names) != len(set(field_names)):
+            raise ValueError("Los nombres de campos deben ser únicos en el schema")
+        return v
+
+    @field_validator('schema_name')
+    @classmethod
+    def validate_schema_name(cls, v: str) -> str:
+        """Strip surrounding whitespace from the schema name and reject blank names.
+
+        The ``min_length=1`` constraint runs before this validator, so a
+        whitespace-only name would otherwise be stored as an empty string.
+
+        Raises:
+            ValueError: If the name is empty after stripping.
+        """
+        cleaned = v.strip()
+        if not cleaned:
+            raise ValueError("El nombre del schema no puede estar vacío")
+        return cleaned
+
+
+class SchemaListResponse(BaseModel):
+    """Response body for schema listing endpoints."""
+    # Schemas included in the response
+    schemas: List[CustomSchema]
+    # Count reported with the list (presumably len(schemas); set by the caller — confirm)
+    total: int
+
+
+class SchemaValidationResponse(BaseModel):
+    """Response body for schema validation."""
+    # Whether the schema passed validation
+    valid: bool
+    # Human-readable summary of the validation result
+    message: str
+    # Optional JSON schema payload (presumably the one built from the custom schema — confirm against the router)
+    json_schema: Optional[dict] = None
+    # Optional list of validation error messages
+    errors: Optional[List[str]] = None
--- a/backend/app/repositories/init.py
+++ b/backend/app/repositories/init.py
@@ -0,0 +1,4 @@
+"""
+Repositories for data persistence.
+Implementa patrón Repository para abstraer la capa de datos.
+"""
--- a/backend/app/repositories/schema_repository.py
+++ b/backend/app/repositories/schema_repository.py
@@ -0,0 +1,243 @@
+"""
+Schema Repository - Patrón Repository
+Abstrae la persistencia de schemas, actualmente usando archivos JSON.
+Fácil migrar a base de datos después.
+"""
+import logging
+import json
+from pathlib import Path
+from typing import List, Optional
+from datetime import datetime
+
+from ..models.schema_models import CustomSchema
+
+logger = logging.getLogger(__name__)
+
+
+class SchemaRepository:
+    """
+    Repository para gestión de schemas.
+    Implementa patrón Repository para abstraer almacenamiento.
+
+    Actualmente usa archivos JSON en disco.
+    Para migrar a DB: solo cambiar esta clase, resto del código no cambia.
+    """
+
+    def __init__(self, schemas_dir: Path):
+        """
+        Set up the repository and make sure its storage directory exists.
+
+        Args:
+            schemas_dir: Directory where schema files are stored; it is
+                created (including parents) when missing.
+        """
+        storage_dir = Path(schemas_dir)
+        self.schemas_dir = storage_dir
+        storage_dir.mkdir(parents=True, exist_ok=True)
+        logger.info(f"SchemaRepository inicializado en: {storage_dir}")
+
+    def save(self, schema: CustomSchema) -> CustomSchema:
+        """
+        Create or overwrite the JSON file of a schema.
+
+        The schema object is updated in place: ``created_at`` is filled in on
+        the first save and ``updated_at`` is refreshed on every save.
+
+        Args:
+            schema: Schema to persist; it must already have a ``schema_id``.
+
+        Returns:
+            The same schema instance with updated timestamps.
+
+        Raises:
+            IOError: If the schema has no ``schema_id`` or the file cannot be
+                written. The original exception is chained as the cause.
+        """
+        try:
+            # Fail with an explicit message instead of the opaque AttributeError
+            # the path helper would raise for a missing ID.
+            if not schema.schema_id:
+                raise ValueError("schema_id es requerido para guardar el schema")
+
+            # Refresh timestamps (naive UTC, ISO 8601)
+            now = datetime.utcnow().isoformat()
+            if not schema.created_at:
+                schema.created_at = now
+            schema.updated_at = now
+
+            # Write the file
+            file_path = self._get_file_path(schema.schema_id)
+            with open(file_path, 'w', encoding='utf-8') as f:
+                json.dump(schema.model_dump(), f, indent=2, ensure_ascii=False)
+
+            logger.info(f"Schema guardado: {schema.schema_id} - {schema.schema_name}")
+            return schema
+
+        except Exception as e:
+            logger.error(f"Error guardando schema {schema.schema_id}: {e}")
+            raise IOError(f"No se pudo guardar el schema: {str(e)}") from e
+
+    def get_by_id(self, schema_id: str) -> Optional[CustomSchema]:
+        """
+        Load a schema by its ID.
+
+        Args:
+            schema_id: ID of the schema
+
+        Returns:
+            The schema if its file exists and loads correctly, otherwise None
+
+        Raises:
+            ValueError: If the file does not contain valid JSON
+        """
+        try:
+            file_path = self._get_file_path(schema_id)
+            if not file_path.exists():
+                logger.debug(f"Schema no encontrado: {schema_id}")
+                return None
+
+            with open(file_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+            schema = CustomSchema(**data)
+            logger.debug(f"Schema cargado: {schema_id}")
+            return schema
+
+        except json.JSONDecodeError as e:
+            logger.error(f"Archivo JSON corrupto para schema {schema_id}: {e}")
+            raise ValueError(f"Schema corrupto: {schema_id}")
+        except Exception as e:
+            # NOTE(review): every other failure (read errors, data that does not
+            # fit CustomSchema) is logged and reported as None, so callers cannot
+            # tell it apart from "not found" — confirm this is intended.
+            logger.error(f"Error cargando schema {schema_id}: {e}")
+            return None
+
+    def list_all(self) -> List[CustomSchema]:
+        """
+        Load every schema stored in the repository directory.
+
+        Files that cannot be read or parsed are skipped with a warning.
+
+        Returns:
+            Schemas sorted by creation timestamp, newest first; an empty list
+            if the listing itself fails.
+        """
+        loaded = []
+
+        try:
+            for json_file in self.schemas_dir.glob("*.json"):
+                try:
+                    with open(json_file, 'r', encoding='utf-8') as handle:
+                        raw = json.load(handle)
+                    loaded.append(CustomSchema(**raw))
+                except Exception as e:
+                    logger.warning(f"Error cargando schema desde {json_file}: {e}")
+
+            # Newest first; schemas without a timestamp sort as the empty string
+            ordered = sorted(loaded, key=lambda item: item.created_at or "", reverse=True)
+
+            logger.info(f"Listados {len(ordered)} schemas")
+            return ordered
+
+        except Exception as e:
+            logger.error(f"Error listando schemas: {e}")
+            return []
+
+    def list_by_tema(self, tema: str) -> List[CustomSchema]:
+        """
+        List the schemas that apply to a given topic.
+
+        A schema applies when it belongs to the topic or is flagged as global.
+
+        Args:
+            tema: Topic name
+
+        Returns:
+            Schemas applicable to the topic, in the order given by ``list_all``
+        """
+        def applies(candidate: CustomSchema) -> bool:
+            return candidate.is_global or candidate.tema == tema
+
+        matching = [candidate for candidate in self.list_all() if applies(candidate)]
+
+        logger.info(f"Encontrados {len(matching)} schemas para tema '{tema}'")
+        return matching
+
+    def delete(self, schema_id: str) -> bool:
+        """
+        Delete the file of a schema.
+
+        Args:
+            schema_id: ID of the schema to delete
+
+        Returns:
+            True if the file was removed, False if it did not exist
+
+        Raises:
+            IOError: If the file exists but cannot be removed. The original
+                exception is chained as the cause.
+        """
+        try:
+            # EAFP: unlink directly so a file removed between a check and the
+            # delete is reported as "not found" instead of as an I/O failure.
+            self._get_file_path(schema_id).unlink()
+        except FileNotFoundError:
+            logger.warning(f"Intento de eliminar schema inexistente: {schema_id}")
+            return False
+        except Exception as e:
+            logger.error(f"Error eliminando schema {schema_id}: {e}")
+            raise IOError(f"No se pudo eliminar el schema: {str(e)}") from e
+
+        logger.info(f"Schema eliminado: {schema_id}")
+        return True
+
+    def exists(self, schema_id: str) -> bool:
+        """
+        Tell whether a schema file is present for the given ID.
+
+        Args:
+            schema_id: ID of the schema
+
+        Returns:
+            True if the file exists, False otherwise
+        """
+        return self._get_file_path(schema_id).exists()
+
+    def count(self) -> int:
+        """
+        Count the ``*.json`` files in the repository directory.
+
+        Returns:
+            Number of schema files (files are counted without being parsed)
+        """
+        return sum(1 for _ in self.schemas_dir.glob("*.json"))
+
+    def _get_file_path(self, schema_id: str) -> Path:
+        """
+        Build the path of the JSON file that stores a schema.
+
+        Args:
+            schema_id: ID of the schema
+
+        Returns:
+            Path of the file inside ``schemas_dir``
+        """
+        # Neutralise every character that could make the name escape
+        # schemas_dir: path separators, ':' (on Windows a drive-qualified
+        # segment replaces the base path when joined) and NUL.
+        unsafe_chars = str.maketrans({"/": "_", "\\": "_", ":": "_", "\x00": "_"})
+        safe_id = schema_id.translate(unsafe_chars)
+        return self.schemas_dir / f"{safe_id}.json"
+
+
+# Module-level singleton instance, created lazily by get_schema_repository()
+_schema_repository: Optional[SchemaRepository] = None
+
+
+def get_schema_repository() -> SchemaRepository:
+    """
+    Return the shared SchemaRepository, creating it on first use.
+
+    The storage directory comes from ``settings.SCHEMAS_DIR`` and falls back
+    to ``./data/schemas`` when that setting is missing or empty.
+
+    Returns:
+        The single SchemaRepository instance
+
+    Raises:
+        RuntimeError: If the settings cannot be loaded or the repository
+            cannot be created
+    """
+    global _schema_repository
+
+    if _schema_repository is not None:
+        return _schema_repository
+
+    try:
+        # Imported lazily, at first use rather than at module import time
+        from ..core.config import settings
+
+        configured_dir = getattr(settings, 'SCHEMAS_DIR', None)
+        if not configured_dir:
+            configured_dir = "./data/schemas"
+        _schema_repository = SchemaRepository(Path(configured_dir))
+
+        logger.info("SchemaRepository singleton inicializado")
+
+    except Exception as e:
+        logger.error(f"Error inicializando SchemaRepository: {e}")
+        raise RuntimeError(f"No se pudo inicializar SchemaRepository: {str(e)}")
+
+    return _schema_repository
--- a/backend/app/routers/pycache/init.cpython-312.pyc
+++ b/backend/app/routers/pycache/init.cpython-312.pyc
--- a/backend/app/routers/pycache/files.cpython-312.pyc
+++ b/backend/app/routers/pycache/files.cpython-312.pyc
--- a/backend/app/routers/chunking_landingai.py
+++ b/backend/app/routers/chunking_landingai.py
@@ -0,0 +1,396 @@
+"""
+Router para procesamiento de PDFs con LandingAI.
+Soporta dos modos: rápido (solo parse) y extracción (parse + extract con schema).
+"""
+import logging
+import time
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel, Field
+from typing import Optional, List, Literal
+
+from langchain_core.documents import Document
+
+from ..services.landingai_service import get_landingai_service
+from ..services.chunking_service import get_chunking_service
+from ..repositories.schema_repository import get_schema_repository
+from ..utils.chunking.token_manager import TokenManager
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/api/v1/chunking-landingai", tags=["chunking-landingai"])
+
+
+class ProcessLandingAIRequest(BaseModel):
+    """Request body for processing a PDF with LandingAI."""
+    file_name: str = Field(..., description="Nombre del archivo PDF")
+    tema: str = Field(..., description="Tema/carpeta del archivo")
+    collection_name: str = Field(..., description="Colección de Qdrant")
+
+    # Processing mode
+    mode: Literal["quick", "extract"] = Field(
+        default="quick",
+        description="Modo: 'quick' (solo parse) o 'extract' (parse + datos estructurados)"
+    )
+
+    # Schema (required when mode='extract'; this model does not enforce it,
+    # the /process endpoint checks it)
+    schema_id: Optional[str] = Field(
+        None,
+        description="ID del schema a usar (requerido si mode='extract')"
+    )
+
+    # Chunk configuration
+    include_chunk_types: List[str] = Field(
+        default=["text", "table"],
+        description="Tipos de chunks a incluir: text, table, figure, etc."
+    )
+    # Suggested per-chunk token limit, constrained to the range 500-3000
+    max_tokens_per_chunk: int = Field(
+        default=1500,
+        ge=500,
+        le=3000,
+        description="Tokens máximos por chunk (flexible para tablas/figuras)"
+    )
+    merge_small_chunks: bool = Field(
+        default=True,
+        description="Unir chunks pequeños de la misma página y tipo"
+    )
+
+
+class ProcessLandingAIResponse(BaseModel):
+    """Response body of the LandingAI processing endpoint."""
+    success: bool
+    # Processing mode that was used ("quick" or "extract")
+    mode: str
+    # Wall-clock duration of the request, in seconds
+    processing_time_seconds: float
+    collection_name: str
+    file_name: str
+    # Number of chunks after token control
+    total_chunks: int
+    # Number of chunks reported as added by the vector store
+    chunks_added: int
+    # ID of the schema used, when one was applied
+    schema_used: Optional[str] = None
+    # Structured data returned by the extraction step, if any
+    extracted_data: Optional[dict] = None
+    # Parse metadata returned by the LandingAI service
+    parse_metadata: dict
+    message: str
+
+
+@router.post("/process", response_model=ProcessLandingAIResponse)
+async def process_with_landingai(request: ProcessLandingAIRequest):
+    """
+    Process a PDF with LandingAI and upload the resulting chunks to Qdrant.
+
+    Flow:
+    1. Validate and load the schema (only when mode='extract')
+    2. Download the PDF from Azure Blob
+    3. Process it with the LandingAI service (the schema is passed along when present)
+    4. Apply token control to the chunks (split oversized ones, merge small ones)
+    5. Generate embeddings through the chunking service's embedding service
+    6. Upload to Qdrant, keeping each chunk's metadata in the payload
+
+    Args:
+        request: Processing configuration
+
+    Returns:
+        Processing result with statistics
+
+    Raises:
+        HTTPException 400: If mode='extract' and no schema_id is given, or if
+            no chunks were produced
+        HTTPException 404: If the schema does not exist or the PDF download fails
+        HTTPException 500: If LandingAI, embedding generation, the Qdrant
+            upload or any unexpected step fails
+    """
+    start_time = time.time()
+
+    try:
+        logger.info(f"\n{'='*60}")
+        logger.info(f"INICIANDO PROCESAMIENTO CON LANDINGAI")
+        logger.info(f"{'='*60}")
+        logger.info(f"Archivo: {request.file_name}")
+        logger.info(f"Tema: {request.tema}")
+        logger.info(f"Modo: {request.mode}")
+        logger.info(f"Colección: {request.collection_name}")
+
+        # 1. Validate the schema when running in extract mode
+        custom_schema = None
+        if request.mode == "extract":
+            if not request.schema_id:
+                raise HTTPException(
+                    status_code=400,
+                    detail="schema_id es requerido cuando mode='extract'"
+                )
+
+            schema_repo = get_schema_repository()
+            custom_schema = schema_repo.get_by_id(request.schema_id)
+
+            if not custom_schema:
+                raise HTTPException(
+                    status_code=404,
+                    detail=f"Schema no encontrado: {request.schema_id}"
+                )
+
+            logger.info(f"Schema seleccionado: {custom_schema.schema_name}")
+
+        # 2. Download the PDF from Azure Blob
+        # (the "[n/5]" labels in the log messages number the pipeline stages
+        # and do not match the numbering of these comments)
+        logger.info("\n[1/5] Descargando PDF desde Azure Blob...")
+        chunking_service = get_chunking_service()
+
+        try:
+            pdf_bytes = await chunking_service.download_pdf_from_blob(
+                request.file_name,
+                request.tema
+            )
+        except Exception as e:
+            # Any download failure is reported as 404, whatever its cause
+            logger.error(f"Error descargando PDF: {e}")
+            raise HTTPException(
+                status_code=404,
+                detail=f"No se pudo descargar el PDF: {str(e)}"
+            )
+
+        # 3. Process with LandingAI
+        logger.info("\n[2/5] Procesando con LandingAI...")
+        landingai_service = get_landingai_service()
+
+        try:
+            # NOTE(review): this call is not awaited; if process_pdf is a
+            # blocking call it stalls the event loop while it runs — confirm.
+            result = landingai_service.process_pdf(
+                pdf_bytes=pdf_bytes,
+                file_name=request.file_name,
+                custom_schema=custom_schema,
+                include_chunk_types=request.include_chunk_types
+            )
+        except Exception as e:
+            logger.error(f"Error en LandingAI: {e}")
+            raise HTTPException(
+                status_code=500,
+                detail=f"Error procesando con LandingAI: {str(e)}"
+            )
+
+        documents = result["chunks"]
+
+        if not documents:
+            raise HTTPException(
+                status_code=400,
+                detail="No se generaron chunks después del procesamiento"
+            )
+
+        # 4. Apply flexible token control
+        logger.info("\n[3/5] Aplicando control de tokens...")
+        documents = _apply_flexible_token_control(
+            documents,
+            max_tokens=request.max_tokens_per_chunk,
+            merge_small=request.merge_small_chunks
+        )
+
+        # 5. Generate embeddings
+        logger.info(f"\n[4/5] Generando embeddings para {len(documents)} chunks...")
+        texts = [doc.page_content for doc in documents]
+
+        try:
+            embeddings = await chunking_service.embedding_service.generate_embeddings_batch(texts)
+            logger.info(f"Embeddings generados: {len(embeddings)} vectores")
+        except Exception as e:
+            logger.error(f"Error generando embeddings: {e}")
+            raise HTTPException(
+                status_code=500,
+                detail=f"Error generando embeddings: {str(e)}"
+            )
+
+        # 6. Prepare the chunks for Qdrant with deterministic IDs
+        logger.info("\n[5/5] Subiendo a Qdrant...")
+        qdrant_chunks = []
+
+        # NOTE(review): zip() stops at the shorter sequence, so chunks would be
+        # dropped silently if fewer embeddings than documents came back.
+        for idx, (doc, embedding) in enumerate(zip(documents, embeddings)):
+            # Deterministic ID
+            # NOTE(review): the ID uses metadata["chunk_id"] when present, so two
+            # chunks sharing file, page and chunk_id would presumably get the
+            # same ID — confirm chunk_id stays unique after splitting/merging.
+            chunk_id = chunking_service._generate_deterministic_id(
+                file_name=request.file_name,
+                page=doc.metadata.get("page", 1),
+                chunk_index=doc.metadata.get("chunk_id", str(idx))
+            )
+
+            qdrant_chunks.append({
+                "id": chunk_id,
+                "vector": embedding,
+                "payload": {
+                    "page_content": doc.page_content,
+                    "metadata": doc.metadata  # Rich metadata from LandingAI
+                }
+            })
+
+        # 7. Upload to Qdrant
+        try:
+            upload_result = await chunking_service.vector_db.add_chunks(
+                request.collection_name,
+                qdrant_chunks
+            )
+            logger.info(f"Subida completada: {upload_result['chunks_added']} chunks")
+        except Exception as e:
+            logger.error(f"Error subiendo a Qdrant: {e}")
+            raise HTTPException(
+                status_code=500,
+                detail=f"Error subiendo a Qdrant: {str(e)}"
+            )
+
+        # Total elapsed time
+        processing_time = time.time() - start_time
+
+        logger.info(f"\n{'='*60}")
+        logger.info(f"PROCESAMIENTO COMPLETADO")
+        logger.info(f"{'='*60}")
+        logger.info(f"Tiempo: {processing_time:.2f}s")
+        logger.info(f"Chunks procesados: {len(documents)}")
+        logger.info(f"Chunks subidos: {upload_result['chunks_added']}")
+
+        return ProcessLandingAIResponse(
+            success=True,
+            mode=request.mode,
+            processing_time_seconds=round(processing_time, 2),
+            collection_name=request.collection_name,
+            file_name=request.file_name,
+            total_chunks=len(documents),
+            chunks_added=upload_result["chunks_added"],
+            schema_used=custom_schema.schema_id if custom_schema else None,
+            extracted_data=result.get("extracted_data"),
+            parse_metadata=result["parse_metadata"],
+            message=f"PDF procesado exitosamente en modo {request.mode}"
+        )
+
+    except HTTPException:
+        # Re-raise deliberate HTTP errors untouched
+        raise
+    except Exception as e:
+        # Anything else becomes a generic 500
+        logger.error(f"Error inesperado en procesamiento: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Error inesperado: {str(e)}"
+        )
+
+
+def _apply_flexible_token_control(
+    documents: List[Document],
+    max_tokens: int,
+    merge_small: bool
+) -> List[Document]:
+    """
+    Apply a flexible token budget to a list of chunks.
+
+    - Tables and figures get a limit 50% above ``max_tokens``
+    - A chunk more than 20% over its limit is split into sub-chunks
+    - A chunk under half of ``max_tokens`` may be merged with the next one
+      when both share page and type and the result fits in ``max_tokens``
+
+    Args:
+        documents: Documents to process
+        max_tokens: Suggested token limit per chunk
+        merge_small: When True, small chunks are merged with their successor
+
+    Returns:
+        The processed list of Documents
+    """
+    counter = TokenManager()
+    result = []
+    total = len(documents)
+    pos = 0
+
+    logger.info(f"Control de tokens: max={max_tokens}, merge={merge_small}")
+
+    while pos < total:
+        current = documents[pos]
+        n_tokens = counter.count_tokens(current.page_content)
+        kind = current.metadata.get("chunk_type", "text")
+
+        # Structured content is allowed 50% more than plain text
+        limit = int(max_tokens * 1.5) if kind in ("table", "figure") else max_tokens
+
+        # Beyond the 20% tolerance the chunk is split
+        if n_tokens > limit * 1.2:
+            logger.warning(
+                f"Chunk muy grande ({n_tokens} tokens), dividiendo... "
+                f"(tipo: {kind})"
+            )
+            result.extend(_split_large_chunk(current, max_tokens, counter))
+            pos += 1
+            continue
+
+        # Small chunks may absorb the chunk that follows them
+        is_small = n_tokens < max_tokens * 0.5
+        has_successor = pos < total - 1
+        if merge_small and is_small and has_successor:
+            successor = documents[pos + 1]
+            if _can_merge(current, successor, max_tokens, counter):
+                logger.debug(f"Merging chunks {pos} y {pos+1}")
+                current = _merge_documents(current, successor)
+                pos += 1  # the successor has been consumed
+
+        result.append(current)
+        pos += 1
+
+    logger.info(f"Tokens aplicados: {len(documents)} → {len(result)} chunks")
+    return result
+
+
+def _split_large_chunk(
+    doc: Document,
+    max_tokens: int,
+    token_manager: TokenManager
+) -> List[Document]:
+    """
+    Split an oversized chunk into word-aligned sub-chunks.
+
+    Words are accumulated until adding the next one would exceed
+    ``max_tokens`` (per-word token counts are summed). Each sub-chunk copies
+    the parent's metadata and adds ``is_split`` and ``split_index``. When the
+    parent has a ``chunk_id``, each sub-chunk gets its own derived
+    ``chunk_id`` and keeps the original under ``parent_chunk_id``, so IDs
+    derived from ``chunk_id`` do not collide between parts.
+
+    Args:
+        doc: Document to split
+        max_tokens: Token budget per sub-chunk
+        token_manager: Token counter
+
+    Returns:
+        Sub-chunks in reading order (empty if the document has no words)
+    """
+    parent_id = doc.metadata.get("chunk_id")
+    sub_chunks: List[Document] = []
+
+    def _flush(words: List[str]) -> None:
+        # Build one sub-chunk from the accumulated words
+        part = len(sub_chunks)
+        metadata = {**doc.metadata, "is_split": True, "split_index": part}
+        if parent_id is not None:
+            metadata["parent_chunk_id"] = parent_id
+            metadata["chunk_id"] = f"{parent_id}_{part}"
+        sub_chunks.append(Document(page_content=" ".join(words), metadata=metadata))
+
+    current_chunk: List[str] = []
+    current_tokens = 0
+
+    # NOTE: splitting on whitespace and re-joining with single spaces does not
+    # preserve the original line breaks of the content.
+    for word in doc.page_content.split():
+        word_tokens = token_manager.count_tokens(word)
+        if current_chunk and current_tokens + word_tokens > max_tokens:
+            _flush(current_chunk)
+            current_chunk = [word]
+            current_tokens = word_tokens
+        else:
+            current_chunk.append(word)
+            current_tokens += word_tokens
+
+    # Last chunk
+    if current_chunk:
+        _flush(current_chunk)
+
+    return sub_chunks
+
+
+def _can_merge(
+    doc1: Document,
+    doc2: Document,
+    max_tokens: int,
+    token_manager: TokenManager
+) -> bool:
+    """Tell whether two documents share page and type and fit together in ``max_tokens``."""
+    first_meta, second_meta = doc1.metadata, doc2.metadata
+
+    same_page = first_meta.get("page") == second_meta.get("page")
+    same_type = first_meta.get("chunk_type") == second_meta.get("chunk_type")
+    if not (same_page and same_type):
+        return False
+
+    # The token count is taken on the text exactly as it would be merged
+    joined = f"{doc1.page_content}\n\n{doc2.page_content}"
+    return token_manager.count_tokens(joined) <= max_tokens
+
+
+def _merge_documents(doc1: Document, doc2: Document) -> Document:
+    """Join two documents with a blank line, keeping the first one's metadata plus ``is_merged``."""
+    combined_metadata = dict(doc1.metadata)
+    combined_metadata["is_merged"] = True
+    combined_text = f"{doc1.page_content}\n\n{doc2.page_content}"
+    return Document(page_content=combined_text, metadata=combined_metadata)
--- a/backend/app/routers/schemas.py
+++ b/backend/app/routers/schemas.py
@@ -0,0 +1,288 @@
+"""
+Router para gestión de schemas personalizables.
+Endpoints CRUD para crear, leer, actualizar y eliminar schemas.
+"""
+import logging
+import uuid
+from fastapi import APIRouter, HTTPException, Query
+from typing import List, Optional
+
+from ..models.schema_models import (
+    CustomSchema,
+    SchemaListResponse,
+    SchemaValidationResponse
+)
+from ..repositories.schema_repository import get_schema_repository
+from ..services.schema_builder_service import SchemaBuilderService
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/api/v1/schemas", tags=["schemas"])
+
+
+@router.post("/", response_model=CustomSchema, status_code=201)
+async def create_schema(schema: CustomSchema):
+    """
+    Create a new custom schema.
+
+    Args:
+        schema: Schema definition; an ID is generated when ``schema_id`` is empty
+
+    Returns:
+        The schema as returned by the repository's ``save``
+
+    Raises:
+        HTTPException 400: If the schema builder reports the schema as invalid
+        HTTPException 409: If the repository already has a schema with that ID
+        HTTPException 500: On any other unexpected error
+    """
+    try:
+        # Generate an ID when the client did not send one
+        if not schema.schema_id:
+            schema.schema_id = f"schema_{uuid.uuid4().hex[:12]}"
+
+        # Refuse to overwrite an existing schema
+        repo = get_schema_repository()
+        if repo.exists(schema.schema_id):
+            raise HTTPException(
+                status_code=409,
+                detail=f"Ya existe un schema con ID: {schema.schema_id}"
+            )
+
+        # Make sure the schema can actually be built before persisting it
+        builder = SchemaBuilderService()
+        validation = builder.validate_schema(schema)
+
+        if not validation["valid"]:
+            raise HTTPException(
+                status_code=400,
+                detail={
+                    "message": "Schema inválido",
+                    "errors": validation["errors"]
+                }
+            )
+
+        # Persist
+        saved_schema = repo.save(schema)
+
+        logger.info(f"Schema creado: {saved_schema.schema_id} - {saved_schema.schema_name}")
+        return saved_schema
+
+    except HTTPException:
+        # Deliberate HTTP errors raised above pass through untouched
+        raise
+    except Exception as e:
+        # logger.exception records the traceback; chaining keeps the root cause
+        logger.exception(f"Error creando schema: {e}")
+        raise HTTPException(status_code=500, detail=f"Error interno: {str(e)}") from e
+
+
+@router.get("/", response_model=List[CustomSchema])
+async def list_schemas(
+    tema: Optional[str] = Query(None, description="Filtrar por tema (incluye globales)")
+):
+    """
+    List all schemas, or only those returned by the repository for a tema.
+
+    Args:
+        tema: Tema name to filter by (optional); when falsy, all schemas are listed
+
+    Returns:
+        List of schemas
+
+    Raises:
+        HTTPException 500: If the repository call fails
+    """
+    try:
+        repo = get_schema_repository()
+
+        if tema:
+            return repo.list_by_tema(tema)
+        return repo.list_all()
+
+    except Exception as e:
+        # logger.exception records the traceback; chaining keeps the root cause
+        logger.exception(f"Error listando schemas: {e}")
+        raise HTTPException(status_code=500, detail=f"Error interno: {str(e)}") from e
+
+
+@router.get("/{schema_id}", response_model=CustomSchema)
+async def get_schema(schema_id: str):
+    """
+    Fetch a schema by its ID.
+
+    Args:
+        schema_id: ID of the schema
+
+    Returns:
+        The requested schema
+
+    Raises:
+        HTTPException 404: If the repository returns nothing for that ID
+        HTTPException 500: On any other unexpected error
+    """
+    try:
+        repo = get_schema_repository()
+        schema = repo.get_by_id(schema_id)
+
+        if not schema:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Schema no encontrado: {schema_id}"
+            )
+
+        return schema
+
+    except HTTPException:
+        # The 404 above passes through untouched
+        raise
+    except Exception as e:
+        # logger.exception records the traceback; chaining keeps the root cause
+        logger.exception(f"Error obteniendo schema {schema_id}: {e}")
+        raise HTTPException(status_code=500, detail=f"Error interno: {str(e)}") from e
+
+
+@router.put("/{schema_id}", response_model=CustomSchema)
+async def update_schema(schema_id: str, schema: CustomSchema):
+    """
+    Replace an existing schema.
+
+    Args:
+        schema_id: ID of the schema to update (overrides any ID in the body)
+        schema: New schema definition
+
+    Returns:
+        The schema as returned by the repository's ``save``
+
+    Raises:
+        HTTPException 404: If no schema exists with that ID
+        HTTPException 400: If the new definition is reported as invalid
+        HTTPException 500: On any other unexpected error
+    """
+    try:
+        repo = get_schema_repository()
+
+        # The schema must already exist
+        existing = repo.get_by_id(schema_id)
+        if not existing:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Schema no encontrado: {schema_id}"
+            )
+
+        # The path ID wins over the body, and the creation date is preserved
+        schema.schema_id = schema_id
+        schema.created_at = existing.created_at
+
+        # Validate the new definition before persisting it
+        builder = SchemaBuilderService()
+        validation = builder.validate_schema(schema)
+
+        if not validation["valid"]:
+            raise HTTPException(
+                status_code=400,
+                detail={
+                    "message": "Schema inválido",
+                    "errors": validation["errors"]
+                }
+            )
+
+        # Persist
+        updated_schema = repo.save(schema)
+
+        logger.info(f"Schema actualizado: {schema_id}")
+        return updated_schema
+
+    except HTTPException:
+        # Deliberate HTTP errors raised above pass through untouched
+        raise
+    except Exception as e:
+        # logger.exception records the traceback; chaining keeps the root cause
+        logger.exception(f"Error actualizando schema {schema_id}: {e}")
+        raise HTTPException(status_code=500, detail=f"Error interno: {str(e)}") from e
+
+
+@router.delete("/{schema_id}")
+async def delete_schema(schema_id: str):
+    """
+    Delete a schema.
+
+    Args:
+        schema_id: ID of the schema to delete
+
+    Returns:
+        Dict with ``success`` and a confirmation ``message``
+
+    Raises:
+        HTTPException 404: If the repository's ``delete`` returns a falsy value
+        HTTPException 500: On any other unexpected error
+    """
+    try:
+        repo = get_schema_repository()
+
+        if not repo.delete(schema_id):
+            raise HTTPException(
+                status_code=404,
+                detail=f"Schema no encontrado: {schema_id}"
+            )
+
+        logger.info(f"Schema eliminado: {schema_id}")
+        return {
+            "success": True,
+            "message": f"Schema {schema_id} eliminado exitosamente"
+        }
+
+    except HTTPException:
+        # The 404 above passes through untouched
+        raise
+    except Exception as e:
+        # logger.exception records the traceback; chaining keeps the root cause
+        logger.exception(f"Error eliminando schema {schema_id}: {e}")
+        raise HTTPException(status_code=500, detail=f"Error interno: {str(e)}") from e
+
+
+@router.post("/validate", response_model=SchemaValidationResponse)
+async def validate_schema(schema: CustomSchema):
+    """
+    Validate a schema without saving it.
+
+    Args:
+        schema: Schema to validate
+
+    Returns:
+        SchemaValidationResponse built from the builder's validation result.
+        If building that response fails, a response with ``valid=False`` and
+        the error text is returned instead of raising.
+    """
+    try:
+        builder = SchemaBuilderService()
+        validation = builder.validate_schema(schema)
+
+        return SchemaValidationResponse(**validation)
+
+    except Exception as e:
+        # Failures are reported in the response body; the traceback is still
+        # logged so the underlying problem can be diagnosed
+        logger.exception(f"Error validando schema: {e}")
+        return SchemaValidationResponse(
+            valid=False,
+            message="Error en validación",
+            errors=[str(e)]
+        )
+
+
+@router.get("/stats/count")
+async def get_schemas_count():
+    """
+    Return schema counts.
+
+    Returns:
+        Dict with ``total`` (all schemas), ``global`` (schemas with
+        ``is_global`` set) and ``by_tema`` (count per tema for non-global
+        schemas that have a tema). Schemas that are neither global nor
+        assigned to a tema only contribute to ``total``.
+
+    Raises:
+        HTTPException 500: If the repository call or the counting fails
+    """
+    try:
+        repo = get_schema_repository()
+        all_schemas = repo.list_all()
+
+        # Tally per tema; global schemas are counted separately
+        tema_counts = {}
+        global_count = 0
+
+        for schema in all_schemas:
+            if schema.is_global:
+                global_count += 1
+            elif schema.tema:
+                tema_counts[schema.tema] = tema_counts.get(schema.tema, 0) + 1
+
+        return {
+            "total": len(all_schemas),
+            "global": global_count,
+            "by_tema": tema_counts
+        }
+
+    except Exception as e:
+        # logger.exception records the traceback; chaining keeps the root cause
+        logger.exception(f"Error obteniendo estadísticas: {e}")
+        raise HTTPException(status_code=500, detail=f"Error interno: {str(e)}") from e
--- a/backend/app/services/pycache/init.cpython-312.pyc
+++ b/backend/app/services/pycache/init.cpython-312.pyc
--- a/backend/app/services/pycache/azure_service.cpython-312.pyc
+++ b/backend/app/services/pycache/azure_service.cpython-312.pyc
--- a/backend/app/services/pycache/file_service.cpython-312.pyc
+++ b/backend/app/services/pycache/file_service.cpython-312.pyc
--- a/backend/app/services/landingai_service.py
+++ b/backend/app/services/landingai_service.py
@@ -0,0 +1,353 @@
+"""
+LandingAI Service - Servicio independiente
+Maneja toda la interacción con LandingAI ADE API.
+Usa parse() para extracción de chunks y extract() para datos estructurados.
+"""
+import logging
+import tempfile
+from pathlib import Path
+from typing import Dict, Any, List, Optional
+
+from langchain_core.documents import Document
+
+from ..models.schema_models import CustomSchema
+from ..services.schema_builder_service import SchemaBuilderService
+
+logger = logging.getLogger(__name__)
+
+
+class LandingAIService:
+    """
+    Service for processing PDFs with LandingAI.
+
+    Flow:
+    1. Parse PDF → obtain structured chunks + markdown
+    2. Extract (optional) → extract data according to a custom schema
+    3. Process chunks → filter by chunk type and attach metadata
+    4. Return Documents → LangChain Documents for downstream consumers
+    """
+
+    def __init__(self, api_key: str, environment: str = "production"):
+        """
+        Initialize the LandingAI service.
+
+        Creates the SDK client (480 s timeout, 2 retries) and a
+        SchemaBuilderService instance.
+
+        Args:
+            api_key: LandingAI API key
+            environment: "production" or "eu"
+
+        Raises:
+            ImportError: If landingai-ade is not installed
+        """
+        try:
+            from landingai_ade import LandingAIADE
+
+            self.client = LandingAIADE(
+                apikey=api_key,
+                environment=environment,
+                timeout=480.0,  # 8 minutes for large PDFs
+                max_retries=2
+            )
+
+            self.schema_builder = SchemaBuilderService()
+
+            logger.info(f"LandingAIService inicializado (environment: {environment})")
+
+        # NOTE: the try block also covers client construction, so an
+        # ImportError raised there is reported with the same message.
+        except ImportError:
+            logger.error("landingai-ade no está instalado")
+            raise ImportError(
+                "Se requiere landingai-ade. Instalar con: pip install landingai-ade"
+            )
+
+    def process_pdf(
+        self,
+        pdf_bytes: bytes,
+        file_name: str,
+        custom_schema: Optional[CustomSchema] = None,
+        include_chunk_types: Optional[List[str]] = None,
+        model: str = "dpt-2-latest"
+    ) -> Dict[str, Any]:
+        """
+        Process a PDF with LandingAI (parse only, or parse plus extraction).
+
+        Args:
+            pdf_bytes: PDF content as bytes
+            file_name: File name (used for logging and chunk metadata)
+            custom_schema: Custom schema for extract (None = parse only)
+            include_chunk_types: Chunk types to keep, e.g. ["text", "table", "figure"];
+                None or empty keeps every chunk
+            model: LandingAI model to use
+
+        Returns:
+            Dict with:
+            - chunks: List[Document] built from the parsed chunks
+            - parse_metadata: Parse metadata (page count, duration, version)
+            - extracted_data: Extracted data, or None when no schema was given
+              or the extraction failed
+            - file_name: File name
+
+        Raises:
+            Exception: Errors from the parse step propagate; extraction errors
+                are swallowed by _extract_data
+        """
+        logger.info(f"=== Procesando PDF con LandingAI: {file_name} ===")
+        logger.info(f"  Modo: {'Extracción' if custom_schema else 'Rápido'}")
+        logger.info(f"  Tipos incluidos: {include_chunk_types or 'todos'}")
+
+        # 1. Parse the PDF
+        parse_result = self._parse_pdf(pdf_bytes, file_name, model)
+
+        # 2. Extract (only when a schema was supplied)
+        extracted_data = None
+        if custom_schema:
+            logger.info(f"  Extrayendo datos con schema: {custom_schema.schema_name}")
+            extracted_data = self._extract_data(
+                parse_result["markdown"],
+                custom_schema
+            )
+
+        # 3. Convert chunks to Documents
+        documents = self._process_chunks(
+            parse_result,
+            extracted_data,
+            file_name,
+            include_chunk_types
+        )
+
+        logger.info(f"=== Procesamiento completado: {len(documents)} chunks ===")
+
+        return {
+            "chunks": documents,
+            "parse_metadata": parse_result["metadata"],
+            "extracted_data": extracted_data,
+            "file_name": file_name
+        }
+
+    def _parse_pdf(
+        self,
+        pdf_bytes: bytes,
+        file_name: str,
+        model: str
+    ) -> Dict[str, Any]:
+        """
+        Parse a PDF with LandingAI.
+
+        Args:
+            pdf_bytes: PDF content
+            file_name: File name (not used inside this method)
+            model: LandingAI model
+
+        Returns:
+            Dict with chunks, markdown, grounding and metadata
+        """
+        logger.info(f"  Parseando PDF con modelo {model}...")
+
+        # The client is given a Path, so the bytes go to a temporary file first
+        # (delete=False: the file must outlive the `with` and is removed below)
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+            tmp.write(pdf_bytes)
+            tmp_path = Path(tmp.name)
+
+        try:
+            # Parse with LandingAI
+            response = self.client.parse(document=tmp_path, model=model)
+
+            # Flatten the response into plain dicts
+            chunks_data = []
+            for chunk in response.chunks:
+                # Look up the grounding info for this chunk
+                # NOTE(review): this uses attribute access; if response.grounding
+                # is a mapping keyed by chunk id, hasattr() would never match and
+                # every chunk would fall back to page 1 with no bbox — confirm
+                # against the SDK's response type.
+                grounding_info = {}
+                if hasattr(response, 'grounding') and hasattr(response.grounding, chunk.id):
+                    ground = getattr(response.grounding, chunk.id)
+                    grounding_info = {
+                        "bbox": ground.bbox if hasattr(ground, 'bbox') else None,
+                        "page": ground.page if hasattr(ground, 'page') else 1
+                    }
+
+                # Page defaults to 1 when no grounding was found
+                page_num = grounding_info.get("page", 1) if grounding_info else 1
+
+                chunks_data.append({
+                    "id": chunk.id,
+                    "content": chunk.markdown,
+                    "type": chunk.type,
+                    "grounding": grounding_info,
+                    "page": page_num
+                })
+
+            # Collect metadata (missing attributes become None)
+            metadata_dict = {}
+            if hasattr(response, 'metadata'):
+                metadata_dict = {
+                    "page_count": getattr(response.metadata, 'page_count', None),
+                    "duration_ms": getattr(response.metadata, 'duration_ms', None),
+                    "version": getattr(response.metadata, 'version', None)
+                }
+
+            logger.info(
+                f"  Parse completado: {len(chunks_data)} chunks, "
+                f"{metadata_dict.get('page_count', 'N/A')} páginas"
+            )
+
+            return {
+                "chunks": chunks_data,
+                "markdown": response.markdown,
+                "grounding": response.grounding,
+                "metadata": metadata_dict
+            }
+
+        finally:
+            # Remove the temporary file
+            tmp_path.unlink(missing_ok=True)
+
+    def _extract_data(
+        self,
+        markdown: str,
+        custom_schema: CustomSchema
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Extract structured data from the markdown using a custom schema.
+
+        Args:
+            markdown: Full markdown of the document
+            custom_schema: Custom schema
+
+        Returns:
+            Dict with extraction, extraction_metadata and schema_used,
+            or None if any step raises (the error is logged, not propagated)
+        """
+        try:
+            # 1. Build the Pydantic schema
+            pydantic_schema = self.schema_builder.build_pydantic_schema(custom_schema)
+
+            # 2. Convert it to a JSON schema
+            json_schema = self.schema_builder.to_json_schema(pydantic_schema)
+
+            # 3. Write the markdown to a temporary file
+            with tempfile.NamedTemporaryFile(
+                mode='w',
+                suffix=".md",
+                delete=False,
+                encoding='utf-8'
+            ) as tmp:
+                tmp.write(markdown)
+                tmp_path = Path(tmp.name)
+
+            try:
+                # 4. Extract with LandingAI
+                response = self.client.extract(
+                    schema=json_schema,
+                    markdown=tmp_path
+                )
+
+                logger.info(f"  Extracción completada: {len(response.extraction)} campos")
+
+                return {
+                    "extraction": response.extraction,
+                    "extraction_metadata": response.extraction_metadata,
+                    "schema_used": custom_schema.schema_id
+                }
+
+            finally:
+                tmp_path.unlink(missing_ok=True)
+
+        # NOTE(review): every failure is reduced to None here, so callers cannot
+        # tell a failed extraction from "no schema supplied" — confirm intended.
+        except Exception as e:
+            logger.error(f"Error en extract: {e}")
+            return None
+
+    def _process_chunks(
+        self,
+        parse_result: Dict[str, Any],
+        extracted_data: Optional[Dict[str, Any]],
+        file_name: str,
+        include_chunk_types: Optional[List[str]]
+    ) -> List[Document]:
+        """
+        Convert LandingAI chunks into LangChain Documents with metadata.
+
+        Args:
+            parse_result: Result of _parse_pdf
+            extracted_data: Extracted data (optional)
+            file_name: File name
+            include_chunk_types: Types to keep; None or empty keeps all
+
+        Returns:
+            List of Documents, one per kept chunk
+        """
+        documents = []
+        filtered_count = 0
+
+        for chunk in parse_result["chunks"]:
+            # Skip chunks whose type was not requested
+            if include_chunk_types and chunk["type"] not in include_chunk_types:
+                filtered_count += 1
+                continue
+
+            # Build the per-chunk metadata
+            metadata = {
+                "file_name": file_name,
+                "page": chunk["page"],
+                "chunk_id": chunk["id"],
+                "chunk_type": chunk["type"],
+                "bbox": chunk["grounding"].get("bbox"),
+
+                # Document-level metadata (repeated on every chunk)
+                "document_metadata": {
+                    "page_count": parse_result["metadata"].get("page_count"),
+                    "processing_duration_ms": parse_result["metadata"].get("duration_ms"),
+                    "landingai_version": parse_result["metadata"].get("version"),
+                }
+            }
+
+            # Attach the extracted data, if any, to every chunk
+            if extracted_data:
+                metadata["extracted_data"] = extracted_data["extraction"]
+                metadata["extraction_metadata"] = extracted_data["extraction_metadata"]
+                metadata["schema_used"] = extracted_data["schema_used"]
+
+            # Create the Document
+            doc = Document(
+                page_content=chunk["content"],
+                metadata=metadata
+            )
+            documents.append(doc)
+
+        if filtered_count > 0:
+            logger.info(f"  Filtrados {filtered_count} chunks por tipo")
+
+        logger.info(f"  Generados {len(documents)} documents")
+        return documents
+
+
+# Module-level cache for the lazily created service instance
+_landingai_service: Optional[LandingAIService] = None
+
+
+def get_landingai_service() -> LandingAIService:
+    """
+    Return the shared LandingAIService, creating it on first use.
+
+    The instance is built from ``settings.LANDINGAI_API_KEY`` and
+    ``settings.LANDINGAI_ENVIRONMENT`` (defaulting to "production" when the
+    setting is absent) and cached in a module-level variable.
+
+    Returns:
+        The cached LandingAIService instance
+
+    Raises:
+        RuntimeError: If the settings cannot be loaded, the API key is empty,
+            or the service cannot be constructed; the original exception is
+            attached as ``__cause__``
+    """
+    global _landingai_service
+
+    if _landingai_service is None:
+        try:
+            # Imported lazily, as in the original, so settings are only
+            # loaded when the service is first requested
+            from ..core.config import settings
+
+            api_key = settings.LANDINGAI_API_KEY
+            if not api_key:
+                raise ValueError("LANDINGAI_API_KEY no está configurada")
+
+            environment = getattr(settings, 'LANDINGAI_ENVIRONMENT', 'production')
+
+            _landingai_service = LandingAIService(
+                api_key=api_key,
+                environment=environment
+            )
+
+            logger.info("LandingAIService singleton inicializado")
+
+        except Exception as e:
+            # Keep the traceback in the log and the cause on the new exception
+            logger.exception(f"Error inicializando LandingAIService: {e}")
+            raise RuntimeError(f"No se pudo inicializar LandingAIService: {str(e)}") from e
+
+    return _landingai_service
--- a/backend/app/services/schema_builder_service.py
+++ b/backend/app/services/schema_builder_service.py
@@ -0,0 +1,215 @@
+"""
+Schema Builder Service - Patrón Builder
+Construye schemas Pydantic dinámicamente desde definiciones JSON del frontend.
+"""
+import logging
+from typing import Dict, Any, Type, get_origin, get_args
+from pydantic import BaseModel, Field, create_model
+from pydantic.fields import FieldInfo
+
+from ..models.schema_models import CustomSchema, FieldType, SchemaField
+
+logger = logging.getLogger(__name__)
+
+
+class SchemaBuilderService:
+    """
+    Builds Pydantic models dynamically from CustomSchema definitions.
+
+    All methods are static; the class only groups the related helpers.
+    """
+
+    @staticmethod
+    def build_pydantic_schema(custom_schema: CustomSchema) -> Type[BaseModel]:
+        """
+        Convert a CustomSchema into a dynamically created Pydantic class.
+
+        Each schema field becomes a model field whose type comes from
+        ``_map_field_type`` and whose constraints come from
+        ``_build_field_info``. The class name is the schema name with spaces
+        and hyphens removed, prefixed with "Schema" when it does not start
+        with a letter.
+
+        Args:
+            custom_schema: User-defined schema
+
+        Returns:
+            Dynamically generated Pydantic class
+
+        Raises:
+            ValueError: If a field cannot be built, the schema name has no
+                usable characters, or the model cannot be created. The
+                original exception is attached as ``__cause__``.
+        """
+        logger.info(f"Construyendo Pydantic schema: {custom_schema.schema_name}")
+
+        field_definitions = {}
+
+        for field in custom_schema.fields:
+            try:
+                # 1. Map to a Python type
+                python_type = SchemaBuilderService._map_field_type(field.type)
+
+                # 2. Build the FieldInfo with its validations
+                field_info = SchemaBuilderService._build_field_info(field)
+
+                # 3. Register the definition
+                field_definitions[field.name] = (python_type, field_info)
+
+                # A missing description must not reject the field just because
+                # the debug preview tried to slice None
+                description_preview = (field.description or "")[:50]
+                logger.debug(f"  Campo '{field.name}': {python_type} - {description_preview}...")
+
+            except Exception as e:
+                logger.error(f"Error construyendo campo '{field.name}': {e}")
+                raise ValueError(f"Campo inválido '{field.name}': {str(e)}") from e
+
+        # 4. Create the dynamic class
+        try:
+            # Class name without spaces or hyphens
+            class_name = custom_schema.schema_name.replace(" ", "").replace("-", "")
+            if not class_name:
+                # Previously surfaced as "string index out of range"
+                raise ValueError("schema_name no contiene caracteres válidos")
+            if not class_name[0].isalpha():
+                class_name = "Schema" + class_name
+
+            DynamicSchema = create_model(
+                class_name,
+                **field_definitions
+            )
+
+            logger.info(f"Schema Pydantic creado exitosamente: {class_name} con {len(field_definitions)} campos")
+            return DynamicSchema
+
+        except Exception as e:
+            logger.error(f"Error creando modelo Pydantic: {e}")
+            raise ValueError(f"No se pudo crear el schema: {str(e)}") from e
+
+    @staticmethod
+    def _map_field_type(field_type: FieldType) -> Type:
+        """
+        Map a FieldType to the Python type used in the generated model.
+
+        Args:
+            field_type: Field type from the schema
+
+        Returns:
+            Corresponding Python type
+
+        Raises:
+            ValueError: If the field type has no mapping
+        """
+        from typing import List
+
+        type_mapping = {
+            FieldType.STRING: str,
+            FieldType.INTEGER: int,
+            FieldType.FLOAT: float,
+            FieldType.BOOLEAN: bool,
+            FieldType.ARRAY_STRING: List[str],
+            FieldType.ARRAY_INTEGER: List[int],
+            FieldType.ARRAY_FLOAT: List[float],
+            FieldType.DATE: str,  # Dates are carried as ISO 8601 strings
+        }
+
+        if field_type not in type_mapping:
+            raise ValueError(f"Tipo de campo no soportado: {field_type}")
+
+        return type_mapping[field_type]
+
+    @staticmethod
+    def _build_field_info(field: SchemaField) -> FieldInfo:
+        """
+        Build the Pydantic FieldInfo for a schema field.
+
+        Required fields get ``...`` as default, optional ones get None.
+        ``min_value`` / ``max_value`` become ``ge`` / ``le`` and ``pattern`` is
+        passed through; the constraints are applied regardless of the field's
+        declared type.
+
+        Args:
+            field: Field definition
+
+        Returns:
+            Configured FieldInfo
+        """
+        # Base configuration
+        field_kwargs = {
+            "description": field.description,
+        }
+
+        # Ellipsis marks the field as required
+        field_kwargs["default"] = ... if field.required else None
+
+        # Numeric bounds (inclusive)
+        if field.min_value is not None:
+            field_kwargs["ge"] = field.min_value
+
+        if field.max_value is not None:
+            field_kwargs["le"] = field.max_value
+
+        # String pattern
+        if field.pattern:
+            field_kwargs["pattern"] = field.pattern
+
+        return Field(**field_kwargs)
+
+    @staticmethod
+    def to_json_schema(pydantic_schema: Type[BaseModel]) -> Dict[str, Any]:
+        """
+        Convert a Pydantic class to a JSON Schema using the LandingAI helper.
+
+        Args:
+            pydantic_schema: Pydantic class
+
+        Returns:
+            JSON Schema dict produced by ``pydantic_to_json_schema``
+
+        Raises:
+            ImportError: If landingai-ade cannot be imported; the original
+                import error is attached as ``__cause__``
+        """
+        try:
+            from landingai_ade.lib import pydantic_to_json_schema
+
+            json_schema = pydantic_to_json_schema(pydantic_schema)
+            logger.info("Schema convertido a JSON schema exitosamente")
+            return json_schema
+
+        except ImportError as e:
+            logger.error("landingai-ade no está instalado")
+            raise ImportError(
+                "Se requiere landingai-ade para convertir a JSON schema. "
+                "Instalar con: pip install landingai-ade"
+            ) from e
+
+    @staticmethod
+    def validate_schema(custom_schema: CustomSchema) -> Dict[str, Any]:
+        """
+        Check that a schema can be built and converted to JSON Schema.
+
+        Never raises: every failure is reported in the returned dict.
+
+        Args:
+            custom_schema: Schema to validate
+
+        Returns:
+            Dict with the validation result:
+            {
+                "valid": bool,
+                "message": str,
+                "json_schema": dict (when valid, otherwise None),
+                "errors": List[str] (when invalid, otherwise None)
+            }
+        """
+        errors = []
+
+        try:
+            # Try to build the Pydantic schema
+            pydantic_schema = SchemaBuilderService.build_pydantic_schema(custom_schema)
+
+            # Try to convert it to a JSON schema
+            json_schema = SchemaBuilderService.to_json_schema(pydantic_schema)
+
+            return {
+                "valid": True,
+                "message": f"Schema '{custom_schema.schema_name}' es válido",
+                "json_schema": json_schema,
+                "errors": None
+            }
+
+        except ValueError as e:
+            errors.append(f"Error de validación: {str(e)}")
+        except ImportError as e:
+            errors.append(f"Error de dependencias: {str(e)}")
+        except Exception as e:
+            errors.append(f"Error inesperado: {str(e)}")
+
+        return {
+            "valid": False,
+            "message": f"Schema '{custom_schema.schema_name}' es inválido",
+            "json_schema": None,
+            "errors": errors
+        }
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -25,6 +25,8 @@ dependencies = [
    # WebSockets
    "websockets>=14.1",
    "langchain-text-splitters>=1.0.0",
+    # LandingAI Document AI
+    "landingai-ade>=0.2.1",
 ]
 [project.scripts]
 dev = "uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload"
--- a/backend/uv.lock
+++ b/backend/uv.lock
@@ -67,6 +67,7 @@ dependencies = [
    { name = "azure-storage-blob" },
    { name = "fastapi" },
    { name = "google-cloud-aiplatform" },
+    { name = "landingai-ade" },
    { name = "langchain" },
    { name = "langchain-core" },
    { name = "langchain-text-splitters" },
@@ -88,6 +89,7 @@ requires-dist = [
    { name = "azure-storage-blob", specifier = ">=12.26.0" },
    { name = "fastapi", specifier = ">=0.116.1" },
    { name = "google-cloud-aiplatform", specifier = ">=1.77.0" },
+    { name = "landingai-ade", specifier = ">=0.2.1" },
    { name = "langchain", specifier = ">=0.3.12" },
    { name = "langchain-core", specifier = ">=0.3.24" },
    { name = "langchain-text-splitters", specifier = ">=1.0.0" },
@@ -810,6 +812,23 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/71/92/5e77f98553e9e75130c78900d000368476aed74276eb8ae8796f65f00918/jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942", size = 7595, upload-time = "2024-06-10T19:24:40.698Z" },
 ]

+[[package]]
+name = "landingai-ade"
+version = "0.20.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "anyio" },
+    { name = "distro" },
+    { name = "httpx" },
+    { name = "pydantic" },
+    { name = "sniffio" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/46/05/8e94b262625545683db8b0206929691715a0905c8227b776866885be7fb6/landingai_ade-0.20.3.tar.gz", hash = "sha256:8fd95fabdf3f72b5a1c0d0b3b485631e41e1c25c6e5636de10fbec83078772d2", size = 107576, upload-time = "2025-11-05T02:03:46.808Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/49/d0/6daeb909475d2b49847ebb27d1093aa586154a1d910e9b14addb1b356007/landingai_ade-0.20.3-py3-none-any.whl", hash = "sha256:599a7ec1b31ca3923160c20037cef5f3afbf84485ce98b8ead028ac540d50bbd", size = 86222, upload-time = "2025-11-05T02:03:45.825Z" },
+]
+
 [[package]]
 name = "langchain"
 version = "1.0.3"