Landing AI integrado

2025-11-06 13:29:43 +00:00
parent 7c6e8c4858
commit c03d0e27c4
32 changed files with 3908 additions and 728 deletions
--- a/backend/app/services/__pycache__/__init__.cpython-312.pyc
+++ b/backend/app/services/__pycache__/__init__.cpython-312.pyc
--- a/backend/app/services/__pycache__/azure_service.cpython-312.pyc
+++ b/backend/app/services/__pycache__/azure_service.cpython-312.pyc
--- a/backend/app/services/__pycache__/file_service.cpython-312.pyc
+++ b/backend/app/services/__pycache__/file_service.cpython-312.pyc
--- a/backend/app/services/landingai_service.py
+++ b/backend/app/services/landingai_service.py
@@ -0,0 +1,353 @@
+"""
+LandingAI Service - Servicio independiente
+Maneja toda la interacción con LandingAI ADE API.
+Usa parse() para extracción de chunks y extract() para datos estructurados.
+"""
+import logging
+import tempfile
+from pathlib import Path
+from typing import Dict, Any, List, Optional
+
+from langchain_core.documents import Document
+
+from ..models.schema_models import CustomSchema
+from ..services.schema_builder_service import SchemaBuilderService
+
+# Module-level logger, named after this module via __name__.
+logger = logging.getLogger(__name__)
+
+
+class LandingAIService:
+    """
+    Service for processing PDFs with LandingAI.
+
+    Flow:
+    1. Parse PDF -> structured chunks + full-document markdown
+    2. Extract (optional) -> structured data according to a custom schema
+    3. Process chunks -> filter by chunk type and attach metadata
+    4. Return Documents -> LangChain Documents handed back to the caller
+    """
+
+    def __init__(self, api_key: str, environment: str = "production"):
+        """
+        Initialize the LandingAI service.
+
+        Args:
+            api_key: LandingAI API key
+            environment: "production" or "eu"
+
+        Raises:
+            ImportError: If landingai-ade is not installed
+        """
+        # Only the import itself is guarded: an ImportError raised while
+        # building the client or the schema builder must not be misreported
+        # as "landingai-ade is not installed".
+        try:
+            from landingai_ade import LandingAIADE
+        except ImportError as exc:
+            logger.error("landingai-ade no está instalado")
+            raise ImportError(
+                "Se requiere landingai-ade. Instalar con: pip install landingai-ade"
+            ) from exc
+
+        self.client = LandingAIADE(
+            apikey=api_key,
+            environment=environment,
+            timeout=480.0,  # 8 minutes, to leave room for large PDFs
+            max_retries=2,
+        )
+        self.schema_builder = SchemaBuilderService()
+
+        logger.info(f"LandingAIService inicializado (environment: {environment})")
+
+    def process_pdf(
+        self,
+        pdf_bytes: bytes,
+        file_name: str,
+        custom_schema: Optional[CustomSchema] = None,
+        include_chunk_types: Optional[List[str]] = None,
+        model: str = "dpt-2-latest"
+    ) -> Dict[str, Any]:
+        """
+        Process a PDF with LandingAI (parse only, or parse + extraction).
+
+        Args:
+            pdf_bytes: PDF content as bytes
+            file_name: Name of the file (used for logging and metadata)
+            custom_schema: Custom schema for extract (None = parse only)
+            include_chunk_types: Chunk types to keep, e.g. ["text", "table", "figure"];
+                None or an empty list keeps every chunk
+            model: LandingAI model name passed to parse
+
+        Returns:
+            Dict with:
+            - chunks: List[Document] built from the parsed chunks
+            - parse_metadata: Parse metadata (page_count, duration_ms, version)
+            - extracted_data: Extracted data, or None when no schema was
+              given or the extraction failed
+            - file_name: Name of the file
+
+        Raises:
+            Exception: Whatever the parse step raises. Extraction errors are
+                logged and reported as extracted_data=None instead.
+        """
+        logger.info(f"=== Procesando PDF con LandingAI: {file_name} ===")
+        logger.info(f"  Modo: {'Extracción' if custom_schema else 'Rápido'}")
+        logger.info(f"  Tipos incluidos: {include_chunk_types or 'todos'}")
+
+        # 1. Parse the PDF
+        parse_result = self._parse_pdf(pdf_bytes, file_name, model)
+
+        # 2. Extract structured data (only when a schema was supplied)
+        extracted_data = None
+        if custom_schema:
+            logger.info(f"  Extrayendo datos con schema: {custom_schema.schema_name}")
+            extracted_data = self._extract_data(
+                parse_result["markdown"],
+                custom_schema
+            )
+
+        # 3. Turn the parsed chunks into Documents
+        documents = self._process_chunks(
+            parse_result,
+            extracted_data,
+            file_name,
+            include_chunk_types
+        )
+
+        logger.info(f"=== Procesamiento completado: {len(documents)} chunks ===")
+
+        return {
+            "chunks": documents,
+            "parse_metadata": parse_result["metadata"],
+            "extracted_data": extracted_data,
+            "file_name": file_name
+        }
+
+    @staticmethod
+    def _chunk_grounding(response: Any, chunk: Any) -> Dict[str, Any]:
+        """
+        Look up the grounding entry of a chunk in a parse response.
+
+        Args:
+            response: Parse response; its `grounding` attribute (if any) is
+                searched by chunk id
+            chunk: Chunk whose `id` is used as the lookup key
+
+        Returns:
+            {"bbox": ..., "page": ...} when an entry is found, {} otherwise
+        """
+        missing = object()
+        grounding = getattr(response, "grounding", None)
+
+        if isinstance(grounding, dict):
+            # A dict keyed by chunk id is invisible to hasattr()/getattr(),
+            # so it needs a key lookup.
+            # NOTE(review): the SDK appears to return grounding as a dict —
+            # confirm against the installed landingai-ade version.
+            ground = grounding.get(chunk.id, missing)
+        else:
+            ground = getattr(grounding, chunk.id, missing)
+
+        if ground is missing:
+            return {}
+
+        # NOTE(review): the SDK's grounding objects may expose the box under
+        # a different attribute name (e.g. `box`) and may number pages from
+        # 0 — confirm before relying on bbox/page downstream.
+        return {
+            "bbox": getattr(ground, "bbox", None),
+            "page": getattr(ground, "page", 1),
+        }
+
+    def _parse_pdf(
+        self,
+        pdf_bytes: bytes,
+        file_name: str,
+        model: str
+    ) -> Dict[str, Any]:
+        """
+        Parse a PDF with LandingAI.
+
+        Args:
+            pdf_bytes: PDF content
+            file_name: Name of the file (not used by the parse call itself)
+            model: LandingAI model name
+
+        Returns:
+            Dict with chunks, markdown, grounding and metadata
+        """
+        logger.info(f"  Parseando PDF con modelo {model}...")
+
+        # The client is given a filesystem path, so the bytes are spilled to
+        # a temporary file that this method removes itself (delete=False).
+        tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
+        tmp_path = Path(tmp.name)
+
+        try:
+            # Written inside the try so the file is removed even if the
+            # write fails part-way.
+            with tmp:
+                tmp.write(pdf_bytes)
+
+            response = self.client.parse(document=tmp_path, model=model)
+
+            # Flatten each chunk into a plain dict
+            chunks_data = []
+            for chunk in response.chunks:
+                grounding_info = self._chunk_grounding(response, chunk)
+                chunks_data.append({
+                    "id": chunk.id,
+                    "content": chunk.markdown,
+                    "type": chunk.type,
+                    "grounding": grounding_info,
+                    # Page defaults to 1 when no grounding entry was found
+                    "page": grounding_info.get("page", 1)
+                })
+
+            # Collect document-level metadata when the response carries it
+            metadata_dict = {}
+            if hasattr(response, 'metadata'):
+                metadata_dict = {
+                    "page_count": getattr(response.metadata, 'page_count', None),
+                    "duration_ms": getattr(response.metadata, 'duration_ms', None),
+                    "version": getattr(response.metadata, 'version', None)
+                }
+
+            logger.info(
+                f"  Parse completado: {len(chunks_data)} chunks, "
+                f"{metadata_dict.get('page_count', 'N/A')} páginas"
+            )
+
+            return {
+                "chunks": chunks_data,
+                "markdown": response.markdown,
+                "grounding": response.grounding,
+                "metadata": metadata_dict
+            }
+
+        finally:
+            # Always remove the temporary file
+            tmp_path.unlink(missing_ok=True)
+
+    def _extract_data(
+        self,
+        markdown: str,
+        custom_schema: CustomSchema
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Extract structured data from the markdown using a custom schema.
+
+        Any failure (schema construction, temp file, API call) is logged
+        with its traceback and reported as None rather than raised.
+
+        Args:
+            markdown: Full markdown of the document
+            custom_schema: Custom schema
+
+        Returns:
+            Dict with extraction, extraction_metadata and schema_used,
+            or None if anything failed
+        """
+        try:
+            # 1. Build the Pydantic schema
+            pydantic_schema = self.schema_builder.build_pydantic_schema(custom_schema)
+
+            # 2. Convert it to a JSON schema
+            json_schema = self.schema_builder.to_json_schema(pydantic_schema)
+
+            # 3. Spill the markdown to a temporary file removed below
+            tmp = tempfile.NamedTemporaryFile(
+                mode='w',
+                suffix=".md",
+                delete=False,
+                encoding='utf-8'
+            )
+            tmp_path = Path(tmp.name)
+
+            try:
+                # Written inside the try so a failed write does not leak the file
+                with tmp:
+                    tmp.write(markdown)
+
+                # 4. Extract with LandingAI
+                response = self.client.extract(
+                    schema=json_schema,
+                    markdown=tmp_path
+                )
+
+                logger.info(f"  Extracción completada: {len(response.extraction)} campos")
+
+                return {
+                    "extraction": response.extraction,
+                    "extraction_metadata": response.extraction_metadata,
+                    "schema_used": custom_schema.schema_id
+                }
+
+            finally:
+                tmp_path.unlink(missing_ok=True)
+
+        except Exception as e:
+            # Deliberate best-effort: the caller continues without extracted
+            # data. logger.exception keeps the traceback for diagnosis.
+            logger.exception(f"Error en extract: {e}")
+            return None
+
+    def _process_chunks(
+        self,
+        parse_result: Dict[str, Any],
+        extracted_data: Optional[Dict[str, Any]],
+        file_name: str,
+        include_chunk_types: Optional[List[str]]
+    ) -> List[Document]:
+        """
+        Convert parsed chunks into LangChain Documents with metadata.
+
+        Args:
+            parse_result: Result of the parse step
+            extracted_data: Extracted data (optional); when present it is
+                copied into the metadata of every Document
+            file_name: Name of the file
+            include_chunk_types: Chunk types to keep; None or empty keeps all
+
+        Returns:
+            List of Documents, one per kept chunk
+        """
+        documents = []
+        filtered_count = 0
+
+        for chunk in parse_result["chunks"]:
+            # Skip chunks whose type was not requested
+            if include_chunk_types and chunk["type"] not in include_chunk_types:
+                filtered_count += 1
+                continue
+
+            metadata = {
+                "file_name": file_name,
+                "page": chunk["page"],
+                "chunk_id": chunk["id"],
+                "chunk_type": chunk["type"],
+                "bbox": chunk["grounding"].get("bbox"),
+
+                # Document-level metadata, repeated on every chunk
+                "document_metadata": {
+                    "page_count": parse_result["metadata"].get("page_count"),
+                    "processing_duration_ms": parse_result["metadata"].get("duration_ms"),
+                    "landingai_version": parse_result["metadata"].get("version"),
+                }
+            }
+
+            # Attach the extraction result when there is one
+            if extracted_data:
+                metadata["extracted_data"] = extracted_data["extraction"]
+                metadata["extraction_metadata"] = extracted_data["extraction_metadata"]
+                metadata["schema_used"] = extracted_data["schema_used"]
+
+            documents.append(
+                Document(
+                    page_content=chunk["content"],
+                    metadata=metadata
+                )
+            )
+
+        if filtered_count > 0:
+            logger.info(f"  Filtrados {filtered_count} chunks por tipo")
+
+        logger.info(f"  Generados {len(documents)} documents")
+        return documents
+
+
+# Singleton factory: module-level instance, created on first use
+_landingai_service: Optional[LandingAIService] = None
+
+
+def get_landingai_service() -> LandingAIService:
+    """
+    Factory returning the module-level LandingAIService instance.
+
+    The instance is created on the first call from `settings` and reused
+    on later calls.
+
+    Returns:
+        The shared LandingAIService instance
+
+    Raises:
+        RuntimeError: If the service cannot be initialized (settings cannot
+            be imported, LANDINGAI_API_KEY is empty, or the constructor
+            fails). The original error is chained as the cause.
+    """
+    global _landingai_service
+
+    if _landingai_service is not None:
+        return _landingai_service
+
+    try:
+        from ..core.config import settings
+
+        api_key = settings.LANDINGAI_API_KEY
+        if not api_key:
+            raise ValueError("LANDINGAI_API_KEY no está configurada")
+
+        # Falls back to 'production' when the setting is not defined
+        environment = getattr(settings, 'LANDINGAI_ENVIRONMENT', 'production')
+
+        _landingai_service = LandingAIService(
+            api_key=api_key,
+            environment=environment
+        )
+
+        logger.info("LandingAIService singleton inicializado")
+
+    except Exception as e:
+        logger.error(f"Error inicializando LandingAIService: {e}")
+        # Chain the cause so the underlying failure stays in the traceback
+        raise RuntimeError(f"No se pudo inicializar LandingAIService: {str(e)}") from e
+
+    return _landingai_service
--- a/backend/app/services/schema_builder_service.py
+++ b/backend/app/services/schema_builder_service.py
@@ -0,0 +1,215 @@
+"""
+Schema Builder Service - Patrón Builder
+Construye schemas Pydantic dinámicamente desde definiciones JSON del frontend.
+"""
+import logging
+from typing import Dict, Any, Type, get_origin, get_args
+from pydantic import BaseModel, Field, create_model
+from pydantic.fields import FieldInfo
+
+from ..models.schema_models import CustomSchema, FieldType, SchemaField
+
+# Module-level logger, named after this module via __name__.
+logger = logging.getLogger(__name__)
+
+
+class SchemaBuilderService:
+    """
+    Service that builds Pydantic schemas dynamically from CustomSchema
+    definitions. Every operation is exposed as a static method.
+    """
+
+    @staticmethod
+    def build_pydantic_schema(custom_schema: CustomSchema) -> Type[BaseModel]:
+        """
+        Convert a CustomSchema into a dynamically created Pydantic class.
+
+        Args:
+            custom_schema: User-defined schema
+
+        Returns:
+            Dynamically generated Pydantic class
+
+        Raises:
+            ValueError: If a field cannot be built or the model cannot be
+                created; the original error is chained as the cause
+        """
+        logger.info(f"Construyendo Pydantic schema: {custom_schema.schema_name}")
+
+        field_definitions = {}
+
+        for field in custom_schema.fields:
+            try:
+                # 1. Map to a Python type
+                python_type = SchemaBuilderService._map_field_type(field.type)
+
+                # 2. Build the FieldInfo with its validations
+                field_info = SchemaBuilderService._build_field_info(field)
+
+                # 3. Register the definition
+                field_definitions[field.name] = (python_type, field_info)
+
+                # A missing description must not make the debug line reject
+                # an otherwise valid field.
+                description_preview = (field.description or "")[:50]
+                logger.debug(f"  Campo '{field.name}': {python_type} - {description_preview}...")
+
+            except Exception as e:
+                logger.error(f"Error construyendo campo '{field.name}': {e}")
+                raise ValueError(f"Campo inválido '{field.name}': {str(e)}") from e
+
+        # 4. Create the dynamic class
+        try:
+            # Class name without spaces or hyphens
+            class_name = custom_schema.schema_name.replace(" ", "").replace("-", "")
+            # Prefix when the name is empty or does not start with a letter;
+            # the emptiness check avoids an IndexError on class_name[0].
+            if not class_name or not class_name[0].isalpha():
+                class_name = "Schema" + class_name
+
+            DynamicSchema = create_model(
+                class_name,
+                **field_definitions
+            )
+
+            logger.info(f"Schema Pydantic creado exitosamente: {class_name} con {len(field_definitions)} campos")
+            return DynamicSchema
+
+        except Exception as e:
+            logger.error(f"Error creando modelo Pydantic: {e}")
+            raise ValueError(f"No se pudo crear el schema: {str(e)}") from e
+
+    @staticmethod
+    def _map_field_type(field_type: FieldType) -> Type:
+        """
+        Map a FieldType to the corresponding Python type.
+
+        Args:
+            field_type: Field type from the schema
+
+        Returns:
+            Matching Python type
+
+        Raises:
+            ValueError: If the field type has no mapping
+        """
+        from typing import List
+
+        type_mapping = {
+            FieldType.STRING: str,
+            FieldType.INTEGER: int,
+            FieldType.FLOAT: float,
+            FieldType.BOOLEAN: bool,
+            FieldType.ARRAY_STRING: List[str],
+            FieldType.ARRAY_INTEGER: List[int],
+            FieldType.ARRAY_FLOAT: List[float],
+            FieldType.DATE: str,  # Dates are carried as ISO 8601 strings
+        }
+
+        if field_type not in type_mapping:
+            raise ValueError(f"Tipo de campo no soportado: {field_type}")
+
+        return type_mapping[field_type]
+
+    @staticmethod
+    def _build_field_info(field: SchemaField) -> FieldInfo:
+        """
+        Build the FieldInfo for a field, including its validations.
+
+        Args:
+            field: Field definition
+
+        Returns:
+            Configured FieldInfo
+        """
+        # NOTE(review): optional fields default to None while keeping the
+        # bare type chosen by _map_field_type (not Optional[...]) — confirm
+        # this is the intended schema shape.
+        field_kwargs = {
+            "description": field.description,
+            # Ellipsis marks the field as required
+            "default": ... if field.required else None,
+        }
+
+        # Numeric bounds (inclusive)
+        if field.min_value is not None:
+            field_kwargs["ge"] = field.min_value  # greater or equal
+
+        if field.max_value is not None:
+            field_kwargs["le"] = field.max_value  # less or equal
+
+        # String validation
+        if field.pattern:
+            field_kwargs["pattern"] = field.pattern
+
+        return Field(**field_kwargs)
+
+    @staticmethod
+    def to_json_schema(pydantic_schema: Type[BaseModel]) -> Dict[str, Any]:
+        """
+        Convert a Pydantic schema to a JSON schema for LandingAI.
+
+        Args:
+            pydantic_schema: Pydantic class
+
+        Returns:
+            Whatever landingai_ade.lib.pydantic_to_json_schema returns for
+            the class
+
+        Raises:
+            ImportError: If landingai-ade is not installed
+        """
+        # Only the import is guarded, so a failure inside the conversion is
+        # not misreported as a missing package.
+        try:
+            from landingai_ade.lib import pydantic_to_json_schema
+        except ImportError as exc:
+            logger.error("landingai-ade no está instalado")
+            raise ImportError(
+                "Se requiere landingai-ade para convertir a JSON schema. "
+                "Instalar con: pip install landingai-ade"
+            ) from exc
+
+        json_schema = pydantic_to_json_schema(pydantic_schema)
+        logger.info("Schema convertido a JSON schema exitosamente")
+        return json_schema
+
+    @staticmethod
+    def validate_schema(custom_schema: CustomSchema) -> Dict[str, Any]:
+        """
+        Check that a schema can be built and converted without errors.
+
+        Args:
+            custom_schema: Schema to validate
+
+        Returns:
+            Dict with the validation result:
+            {
+                "valid": bool,
+                "message": str,
+                "json_schema": converted schema (None if invalid),
+                "errors": List[str] (None if valid)
+            }
+        """
+        errors = []
+
+        try:
+            # Try to build the Pydantic schema
+            pydantic_schema = SchemaBuilderService.build_pydantic_schema(custom_schema)
+
+            # Try to convert it to a JSON schema
+            json_schema = SchemaBuilderService.to_json_schema(pydantic_schema)
+
+            return {
+                "valid": True,
+                "message": f"Schema '{custom_schema.schema_name}' es válido",
+                "json_schema": json_schema,
+                "errors": None
+            }
+
+        except ValueError as e:
+            errors.append(f"Error de validación: {str(e)}")
+        except ImportError as e:
+            errors.append(f"Error de dependencias: {str(e)}")
+        except Exception as e:
+            errors.append(f"Error inesperado: {str(e)}")
+
+        return {
+            "valid": False,
+            "message": f"Schema '{custom_schema.schema_name}' es inválido",
+            "json_schema": None,
+            "errors": errors
+        }