First version of complete CRUD chunking

Sebastian
2025-11-05 19:18:11 +00:00
parent df2c184814
commit 7c6e8c4858
36 changed files with 6242 additions and 5 deletions

.gitignore

@@ -350,6 +350,8 @@ docker-compose.override.yml
 .env.*.local
 config.json
 secrets.json
+.secrets/
+**/.secrets/
 # === BACKUP FILES ===
 *.bak

app/core/config.py

@@ -26,14 +26,74 @@ class Settings(BaseSettings):
     AZURE_STORAGE_CONNECTION_STRING: str
     AZURE_STORAGE_ACCOUNT_NAME: str = ""
     AZURE_CONTAINER_NAME: str = "files"
+
+    # Qdrant vector DB configuration
+    QDRANT_URL: str
+    QDRANT_API_KEY: str
+    VECTOR_DB_TYPE: str = "qdrant"  # Future: support other backends
+
+    # Azure OpenAI configuration
+    AZURE_OPENAI_ENDPOINT: str
+    AZURE_OPENAI_API_KEY: str
+    AZURE_OPENAI_API_VERSION: str = "2024-02-01"
+    AZURE_OPENAI_EMBEDDING_MODEL: str = "text-embedding-3-large"
+    AZURE_OPENAI_EMBEDDING_DEPLOYMENT: str = "text-embedding-3-large"
+
+    # Google Cloud / Vertex AI configuration
+    GOOGLE_APPLICATION_CREDENTIALS: str
+    GOOGLE_CLOUD_PROJECT: str
+    GOOGLE_CLOUD_LOCATION: str = "us-central1"
+    GEMINI_MODEL: str = "gemini-2.0-flash"

     @validator("AZURE_STORAGE_CONNECTION_STRING")
     def validate_azure_connection_string(cls, v):
         """Validate that the Azure connection string is present"""
         if not v:
             raise ValueError("AZURE_STORAGE_CONNECTION_STRING is required")
         return v
+
+    @validator("QDRANT_URL")
+    def validate_qdrant_url(cls, v):
+        """Validate that the Qdrant URL is present"""
+        if not v:
+            raise ValueError("QDRANT_URL is required")
+        return v
+
+    @validator("QDRANT_API_KEY")
+    def validate_qdrant_api_key(cls, v):
+        """Validate that the Qdrant API key is present"""
+        if not v:
+            raise ValueError("QDRANT_API_KEY is required")
+        return v
+
+    @validator("AZURE_OPENAI_ENDPOINT")
+    def validate_azure_openai_endpoint(cls, v):
+        """Validate that the Azure OpenAI endpoint is present"""
+        if not v:
+            raise ValueError("AZURE_OPENAI_ENDPOINT is required")
+        return v
+
+    @validator("AZURE_OPENAI_API_KEY")
+    def validate_azure_openai_api_key(cls, v):
+        """Validate that the Azure OpenAI API key is present"""
+        if not v:
+            raise ValueError("AZURE_OPENAI_API_KEY is required")
+        return v
+
+    @validator("GOOGLE_APPLICATION_CREDENTIALS")
+    def validate_google_credentials(cls, v):
+        """Validate that the Google credentials path is present"""
+        if not v:
+            raise ValueError("GOOGLE_APPLICATION_CREDENTIALS is required")
+        return v
+
+    @validator("GOOGLE_CLOUD_PROJECT")
+    def validate_google_project(cls, v):
+        """Validate that the Google Cloud project is present"""
+        if not v:
+            raise ValueError("GOOGLE_CLOUD_PROJECT is required")
+        return v

     class Config:
         env_file = ".env"
         case_sensitive = True
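
For reference (not part of this commit), a minimal .env sketch covering the new required settings; every value below is an illustrative placeholder, not a real credential:

AZURE_STORAGE_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=example;AccountKey=changeme
QDRANT_URL=https://qdrant.example.com:6333
QDRANT_API_KEY=changeme
AZURE_OPENAI_ENDPOINT=https://example.openai.azure.com/
AZURE_OPENAI_API_KEY=changeme
GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
GOOGLE_CLOUD_PROJECT=example-project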

app/main.py

@@ -6,6 +6,8 @@ import logging
 # Import routers
 from .routers.files import router as files_router
+from .routers.vectors import router as vectors_router
+from .routers.chunking import router as chunking_router
 from .core.config import settings
 # from routers.ai import router as ai_router  # future: with Azure OpenAI

@@ -98,6 +100,18 @@ app.include_router(
     tags=["files"]
 )
+
+app.include_router(
+    vectors_router,
+    prefix="/api/v1",
+    tags=["vectors"]
+)
+
+app.include_router(
+    chunking_router,
+    prefix="/api/v1",
+    tags=["chunking"]
+)

 # AI router
 # app.include_router(
 #     ai_router,

@@ -110,6 +124,7 @@ app.include_router(
 async def startup_event():
     logger.info("Starting File Manager API...")
     logger.info(f"Connecting to Azure Storage Account: {settings.AZURE_STORAGE_ACCOUNT_NAME}")
+    logger.info(f"Connecting to Qdrant: {settings.QDRANT_URL}")
     # Azure connection checks

app/models/chunking_models.py

@@ -0,0 +1,128 @@
"""
Modelos Pydantic para las operaciones de chunking.
"""
from pydantic import BaseModel, Field, validator
from typing import List, Dict, Any, Optional
# Request Models
class ChunkingPreviewRequest(BaseModel):
"""Request para generar preview de chunks"""
file_name: str = Field(..., description="Nombre del archivo PDF")
tema: str = Field(..., description="Tema/carpeta del archivo")
max_tokens: int = Field(default=950, ge=100, le=2000, description="Límite máximo de tokens por chunk")
target_tokens: int = Field(default=800, ge=100, le=2000, description="Tokens objetivo")
chunk_size: int = Field(default=1000, ge=100, le=3000, description="Tamaño del chunk")
chunk_overlap: int = Field(default=200, ge=0, le=1000, description="Solapamiento entre chunks")
use_llm: bool = Field(default=True, description="Usar LLM (Gemini) para procesamiento inteligente")
custom_instructions: str = Field(default="", description="Instrucciones personalizadas (solo si use_llm=True)")
@validator("custom_instructions")
def validate_custom_instructions(cls, v, values):
"""Valida que custom_instructions solo se use con LLM habilitado"""
if v and not values.get("use_llm", True):
raise ValueError("custom_instructions solo puede usarse cuando use_llm=True")
return v
@validator("target_tokens")
def validate_target_tokens(cls, v, values):
"""Valida que target_tokens sea menor que max_tokens"""
if "max_tokens" in values and v >= values["max_tokens"]:
raise ValueError("target_tokens debe ser menor que max_tokens")
return v
class ChunkingProcessRequest(BaseModel):
"""Request para procesar PDF completo"""
file_name: str = Field(..., description="Nombre del archivo PDF")
tema: str = Field(..., description="Tema/carpeta del archivo")
collection_name: str = Field(..., description="Nombre de la colección en Qdrant")
max_tokens: int = Field(default=950, ge=100, le=2000, description="Límite máximo de tokens por chunk")
target_tokens: int = Field(default=800, ge=100, le=2000, description="Tokens objetivo")
chunk_size: int = Field(default=1000, ge=100, le=3000, description="Tamaño del chunk")
chunk_overlap: int = Field(default=200, ge=0, le=1000, description="Solapamiento entre chunks")
use_llm: bool = Field(default=True, description="Usar LLM (Gemini) para procesamiento inteligente")
custom_instructions: str = Field(default="", description="Instrucciones personalizadas (solo si use_llm=True)")
@validator("custom_instructions")
def validate_custom_instructions(cls, v, values):
"""Valida que custom_instructions solo se use con LLM habilitado"""
if v and not values.get("use_llm", True):
raise ValueError("custom_instructions solo puede usarse cuando use_llm=True")
return v
@validator("target_tokens")
def validate_target_tokens(cls, v, values):
"""Valida que target_tokens sea menor que max_tokens"""
if "max_tokens" in values and v >= values["max_tokens"]:
raise ValueError("target_tokens debe ser menor que max_tokens")
return v
# Response Models
class ChunkPreview(BaseModel):
"""Modelo para un chunk de preview"""
index: int = Field(..., description="Índice del chunk")
text: str = Field(..., description="Contenido del chunk")
page: int = Field(..., description="Número de página")
file_name: str = Field(..., description="Nombre del archivo")
tokens: int = Field(..., description="Número aproximado de tokens")
class ChunkingPreviewResponse(BaseModel):
"""Response para preview de chunks"""
success: bool = Field(default=True, description="Indica si la operación fue exitosa")
file_name: str = Field(..., description="Nombre del archivo procesado")
tema: str = Field(..., description="Tema del archivo")
chunks: List[ChunkPreview] = Field(..., description="Lista de chunks de preview (hasta 3)")
message: str = Field(default="Preview generado exitosamente", description="Mensaje descriptivo")
@validator("chunks")
def validate_chunk_count(cls, v):
"""Valida que haya al menos 1 chunk y máximo 3 chunks en el preview"""
if len(v) < 1:
raise ValueError("El preview debe contener al menos 1 chunk")
if len(v) > 3:
raise ValueError("El preview no puede contener más de 3 chunks")
return v
class ChunkingProcessResponse(BaseModel):
"""Response para procesamiento completo"""
success: bool = Field(..., description="Indica si la operación fue exitosa")
collection_name: str = Field(..., description="Nombre de la colección")
file_name: str = Field(..., description="Nombre del archivo procesado")
total_chunks: int = Field(..., description="Total de chunks generados")
chunks_added: int = Field(..., description="Chunks agregados a Qdrant")
message: str = Field(..., description="Mensaje descriptivo")
# Profile Models
class ChunkingProfile(BaseModel):
"""Perfil de configuración predefinido para chunking"""
id: str = Field(..., description="ID del perfil")
name: str = Field(..., description="Nombre del perfil")
description: str = Field(..., description="Descripción del perfil")
max_tokens: int = Field(..., description="Límite máximo de tokens")
target_tokens: int = Field(..., description="Tokens objetivo")
chunk_size: int = Field(..., description="Tamaño del chunk")
chunk_overlap: int = Field(..., description="Solapamiento")
use_llm: bool = Field(..., description="Si usa LLM")
class ChunkingProfilesResponse(BaseModel):
"""Response con perfiles disponibles"""
profiles: List[ChunkingProfile] = Field(..., description="Lista de perfiles disponibles")
# Progress Models (para WebSockets)
class ChunkingProgress(BaseModel):
"""Modelo para reportar progreso de chunking"""
status: str = Field(..., description="Estado actual: downloading, chunking, embedding, uploading, completed, error")
progress: int = Field(..., ge=0, le=100, description="Progreso en porcentaje")
message: Optional[str] = Field(None, description="Mensaje adicional")
error: Optional[str] = Field(None, description="Mensaje de error si status=error")
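
For illustration (not part of this commit), how the cross-field validators above behave, with the models above in scope and illustrative values:

from pydantic import ValidationError

ChunkingPreviewRequest(file_name="report.pdf", tema="finance")  # OK: defaults keep target < max

try:
    ChunkingPreviewRequest(file_name="report.pdf", tema="finance",
                           max_tokens=900, target_tokens=950)
except ValidationError as e:
    print(e)  # target_tokens must be lower than max_tokens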

app/models/vector_models.py

@@ -0,0 +1,184 @@
"""
Modelos Pydantic para operaciones con bases de datos vectoriales.
Este módulo define todos los modelos de datos para requests y responses
relacionados con la gestión de colecciones y chunks en bases de datos vectoriales.
"""
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field, validator
# ============================================================================
# Modelos para Colecciones
# ============================================================================
class CollectionExistsRequest(BaseModel):
"""Request para verificar si una colección existe."""
collection_name: str = Field(..., description="Nombre de la colección a verificar")
class CollectionExistsResponse(BaseModel):
"""Response de verificación de existencia de colección."""
exists: bool = Field(..., description="True si la colección existe")
collection_name: str = Field(..., description="Nombre de la colección")
class CollectionCreateRequest(BaseModel):
"""Request para crear una nueva colección."""
collection_name: str = Field(..., description="Nombre de la colección a crear")
vector_size: int = Field(default=3072, description="Dimensión de los vectores")
distance: str = Field(default="Cosine", description="Métrica de distancia")
@validator("distance")
def validate_distance(cls, v):
"""Valida que la métrica de distancia sea válida."""
allowed = ["Cosine", "Euclid", "Dot"]
if v not in allowed:
raise ValueError(f"Métrica de distancia debe ser una de: {allowed}")
return v
@validator("vector_size")
def validate_vector_size(cls, v):
"""Valida que el tamaño del vector sea positivo."""
if v <= 0:
raise ValueError("El tamaño del vector debe ser mayor a 0")
return v
class CollectionCreateResponse(BaseModel):
"""Response de creación de colección."""
success: bool = Field(..., description="True si se creó exitosamente")
collection_name: str = Field(..., description="Nombre de la colección creada")
message: str = Field(..., description="Mensaje descriptivo")
class CollectionDeleteResponse(BaseModel):
"""Response de eliminación de colección."""
success: bool = Field(..., description="True si se eliminó exitosamente")
collection_name: str = Field(..., description="Nombre de la colección eliminada")
message: str = Field(..., description="Mensaje descriptivo")
class CollectionInfoResponse(BaseModel):
"""Response con información de una colección."""
name: str = Field(..., description="Nombre de la colección")
vectors_count: int = Field(..., description="Número total de vectores")
vectors_config: Dict[str, Any] = Field(..., description="Configuración de vectores")
status: str = Field(..., description="Estado de la colección")
# ============================================================================
# Modelos para Archivos en Colecciones
# ============================================================================
class FileExistsInCollectionRequest(BaseModel):
"""Request para verificar si un archivo existe en una colección."""
collection_name: str = Field(..., description="Nombre de la colección")
file_name: str = Field(..., description="Nombre del archivo a verificar")
class FileExistsInCollectionResponse(BaseModel):
"""Response de verificación de existencia de archivo."""
exists: bool = Field(..., description="True si el archivo existe")
collection_name: str = Field(..., description="Nombre de la colección")
file_name: str = Field(..., description="Nombre del archivo")
chunk_count: Optional[int] = Field(None, description="Número de chunks del archivo si existe")
# ============================================================================
# Modelos para Chunks
# ============================================================================
class ChunkMetadata(BaseModel):
"""Metadata de un chunk."""
file_name: str = Field(..., description="Nombre del archivo")
page: int = Field(..., description="Número de página")
text: Optional[str] = Field(None, description="Texto del chunk")
# Se pueden agregar más campos según necesidad
class ChunkData(BaseModel):
"""Datos completos de un chunk."""
id: str = Field(..., description="ID único del chunk")
payload: ChunkMetadata = Field(..., description="Metadata del chunk")
vector: Optional[List[float]] = Field(None, description="Vector de embeddings")
class GetChunksByFileRequest(BaseModel):
"""Request para obtener chunks de un archivo."""
collection_name: str = Field(..., description="Nombre de la colección")
file_name: str = Field(..., description="Nombre del archivo")
limit: Optional[int] = Field(None, description="Límite de chunks a retornar")
@validator("limit")
def validate_limit(cls, v):
"""Valida que el límite sea positivo si está presente."""
if v is not None and v <= 0:
raise ValueError("El límite debe ser mayor a 0")
return v
class GetChunksByFileResponse(BaseModel):
"""Response con los chunks de un archivo."""
collection_name: str = Field(..., description="Nombre de la colección")
file_name: str = Field(..., description="Nombre del archivo")
chunks: List[Dict[str, Any]] = Field(..., description="Lista de chunks")
total_chunks: int = Field(..., description="Número total de chunks")
class DeleteFileFromCollectionRequest(BaseModel):
"""Request para eliminar un archivo de una colección."""
collection_name: str = Field(..., description="Nombre de la colección")
file_name: str = Field(..., description="Nombre del archivo a eliminar")
class DeleteFileFromCollectionResponse(BaseModel):
"""Response de eliminación de archivo."""
success: bool = Field(..., description="True si se eliminó exitosamente")
collection_name: str = Field(..., description="Nombre de la colección")
file_name: str = Field(..., description="Nombre del archivo eliminado")
chunks_deleted: int = Field(..., description="Número de chunks eliminados")
message: str = Field(..., description="Mensaje descriptivo")
class AddChunksRequest(BaseModel):
"""Request para agregar chunks a una colección."""
collection_name: str = Field(..., description="Nombre de la colección")
chunks: List[Dict[str, Any]] = Field(..., description="Lista de chunks a agregar")
@validator("chunks")
def validate_chunks(cls, v):
"""Valida que la lista de chunks no esté vacía."""
if not v:
raise ValueError("La lista de chunks no puede estar vacía")
return v
class AddChunksResponse(BaseModel):
"""Response de agregado de chunks."""
success: bool = Field(..., description="True si se agregaron exitosamente")
collection_name: str = Field(..., description="Nombre de la colección")
chunks_added: int = Field(..., description="Número de chunks agregados")
message: str = Field(..., description="Mensaje descriptivo")
# ============================================================================
# Modelos para Health Check
# ============================================================================
class VectorDBHealthResponse(BaseModel):
"""Response del health check de la base de datos vectorial."""
status: str = Field(..., description="Estado de la conexión")
db_type: str = Field(..., description="Tipo de base de datos vectorial")
message: str = Field(..., description="Mensaje descriptivo")
# ============================================================================
# Modelos para Errores
# ============================================================================
class VectorDBErrorResponse(BaseModel):
"""Response genérico de error."""
error: str = Field(..., description="Descripción del error")
detail: Optional[str] = Field(None, description="Detalle adicional del error")
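
For illustration (not part of this commit), the distance-metric check in action, with the models above in scope:

from pydantic import ValidationError

CollectionCreateRequest(collection_name="docs")  # OK: defaults to a 3072-dim Cosine collection

try:
    CollectionCreateRequest(collection_name="docs", distance="Manhattan")
except ValidationError as e:
    print(e)  # Distance metric must be one of: ['Cosine', 'Euclid', 'Dot']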

app/routers/chunking.py

@@ -0,0 +1,183 @@
"""
Router para operaciones de chunking de PDFs.
Endpoints para generar preview y procesar PDFs completos.
"""
import logging
from fastapi import APIRouter, HTTPException, status
from typing import List
from ..models.chunking_models import (
ChunkingPreviewRequest,
ChunkingPreviewResponse,
ChunkingProcessRequest,
ChunkingProcessResponse,
ChunkingProfilesResponse,
ChunkingProfile,
ChunkPreview
)
from ..services.chunking_service import get_chunking_service
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/chunking", tags=["chunking"])
# Perfiles predefinidos
CHUNKING_PROFILES = [
ChunkingProfile(
id="balanced",
name="Balanceado",
description="Configuración equilibrada entre velocidad y calidad",
max_tokens=950,
target_tokens=800,
chunk_size=1000,
chunk_overlap=200,
use_llm=True
),
ChunkingProfile(
id="detailed",
name="Detallado",
description="Chunks más grandes para mantener más contexto",
max_tokens=1500,
target_tokens=1200,
chunk_size=1500,
chunk_overlap=300,
use_llm=True
),
ChunkingProfile(
id="compact",
name="Compacto",
description="Chunks más pequeños para búsquedas precisas",
max_tokens=600,
target_tokens=500,
chunk_size=700,
chunk_overlap=150,
use_llm=True
),
ChunkingProfile(
id="fast",
name="Rápido",
description="Sin LLM, solo procesamiento básico",
max_tokens=950,
target_tokens=800,
chunk_size=1000,
chunk_overlap=200,
use_llm=False
),
]
@router.get("/profiles", response_model=ChunkingProfilesResponse)
async def get_chunking_profiles():
"""
Obtiene los perfiles de configuración predefinidos para chunking.
Returns:
Lista de perfiles disponibles
"""
return ChunkingProfilesResponse(profiles=CHUNKING_PROFILES)
@router.post("/preview", response_model=ChunkingPreviewResponse)
async def generate_preview(request: ChunkingPreviewRequest):
"""
Genera preview de chunks para un PDF (hasta 3 chunks).
Args:
request: Configuración de chunking y ubicación del archivo
Returns:
Preview con chunks de ejemplo (máximo 3, mínimo 1)
Raises:
HTTPException: Si hay error generando el preview
"""
try:
logger.info(f"Generando preview para {request.file_name} (tema: {request.tema})")
chunking_service = get_chunking_service()
chunks = await chunking_service.process_pdf_preview(
file_name=request.file_name,
tema=request.tema,
max_tokens=request.max_tokens,
target_tokens=request.target_tokens,
chunk_size=request.chunk_size,
chunk_overlap=request.chunk_overlap,
use_llm=request.use_llm,
custom_instructions=request.custom_instructions
)
# Convertir a modelos Pydantic
chunk_previews = [
ChunkPreview(
index=chunk["index"],
text=chunk["text"],
page=chunk["page"],
file_name=chunk["file_name"],
tokens=chunk["tokens"]
)
for chunk in chunks
]
return ChunkingPreviewResponse(
success=True,
file_name=request.file_name,
tema=request.tema,
chunks=chunk_previews,
message="Preview generado exitosamente"
)
except Exception as e:
logger.error(f"Error generando preview: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error generando preview: {str(e)}"
)
@router.post("/process", response_model=ChunkingProcessResponse)
async def process_pdf_full(request: ChunkingProcessRequest):
"""
Procesa un PDF completo y lo sube a Qdrant.
Este endpoint:
1. Descarga el PDF desde Azure Blob
2. Lo procesa en chunks con control de tokens
3. Genera embeddings con Azure OpenAI
4. Sube los chunks a Qdrant con IDs determinísticos
Args:
request: Configuración de chunking y destino
Returns:
Resultado del procesamiento con estadísticas
Raises:
HTTPException: Si hay error procesando el PDF
"""
try:
logger.info(f"Procesando PDF completo: {request.file_name} (tema: {request.tema})")
chunking_service = get_chunking_service()
result = await chunking_service.process_pdf_full(
file_name=request.file_name,
tema=request.tema,
collection_name=request.collection_name,
max_tokens=request.max_tokens,
target_tokens=request.target_tokens,
chunk_size=request.chunk_size,
chunk_overlap=request.chunk_overlap,
use_llm=request.use_llm,
custom_instructions=request.custom_instructions
)
return ChunkingProcessResponse(**result)
except Exception as e:
logger.error(f"Error procesando PDF: {e}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error procesando PDF: {str(e)}"
)
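
For illustration (not part of this commit), calling the preview endpoint; this assumes a local server on port 8000 with the router mounted under /api/v1 as in main.py, and an illustrative file that already exists in Azure Blob:

import requests

resp = requests.post(
    "http://localhost:8000/api/v1/chunking/preview",
    json={
        "file_name": "report.pdf",  # assumed to exist under the "finance" topic
        "tema": "finance",
        "use_llm": False,           # skip Gemini for a fast structural preview
    },
)
resp.raise_for_status()
for chunk in resp.json()["chunks"]:
    print(chunk["index"], chunk["page"], chunk["tokens"])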

app/routers/vectors.py

@@ -0,0 +1,277 @@
"""
Router para endpoints de operaciones con bases de datos vectoriales.
Este módulo define todos los endpoints de la API relacionados con
la gestión de colecciones y chunks en bases de datos vectoriales.
"""
import logging
from fastapi import APIRouter, HTTPException, status, Query
from typing import Optional
from app.services.vector_service import vector_service
from app.models.vector_models import (
CollectionExistsRequest,
CollectionExistsResponse,
CollectionCreateRequest,
CollectionCreateResponse,
CollectionDeleteResponse,
CollectionInfoResponse,
FileExistsInCollectionRequest,
FileExistsInCollectionResponse,
GetChunksByFileRequest,
GetChunksByFileResponse,
DeleteFileFromCollectionRequest,
DeleteFileFromCollectionResponse,
AddChunksRequest,
AddChunksResponse,
VectorDBHealthResponse,
VectorDBErrorResponse
)
logger = logging.getLogger(__name__)
router = APIRouter(
prefix="/vectors",
tags=["Vectors"],
responses={
500: {"model": VectorDBErrorResponse, "description": "Error interno del servidor"}
}
)
# ============================================================================
# Endpoints de Health Check
# ============================================================================
@router.get(
"/health",
response_model=VectorDBHealthResponse,
summary="Verificar estado de la base de datos vectorial",
description="Verifica que la conexión con la base de datos vectorial esté funcionando correctamente"
)
async def health_check():
"""Health check de la base de datos vectorial."""
try:
return await vector_service.health_check()
except Exception as e:
logger.error(f"Error en health check: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error al verificar estado de la base de datos: {str(e)}"
)
# ============================================================================
# Endpoints de Colecciones
# ============================================================================
@router.post(
"/collections/exists",
response_model=CollectionExistsResponse,
summary="Verificar si una colección existe",
description="Verifica si existe una colección con el nombre especificado"
)
async def check_collection_exists(request: CollectionExistsRequest):
"""Verifica si una colección existe."""
try:
return await vector_service.check_collection_exists(request.collection_name)
except Exception as e:
logger.error(f"Error al verificar colección: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error al verificar colección: {str(e)}"
)
@router.post(
"/collections/create",
response_model=CollectionCreateResponse,
status_code=status.HTTP_201_CREATED,
summary="Crear una nueva colección",
description="Crea una nueva colección en la base de datos vectorial"
)
async def create_collection(request: CollectionCreateRequest):
"""Crea una nueva colección."""
try:
return await vector_service.create_collection(request)
except ValueError as e:
logger.warning(f"Error de validación al crear colección: {e}")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=str(e)
)
except Exception as e:
logger.error(f"Error al crear colección: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error al crear colección: {str(e)}"
)
@router.delete(
"/collections/{collection_name}",
response_model=CollectionDeleteResponse,
summary="Eliminar una colección",
description="Elimina completamente una colección y todos sus datos"
)
async def delete_collection(collection_name: str):
"""Elimina una colección completa."""
try:
return await vector_service.delete_collection(collection_name)
except ValueError as e:
logger.warning(f"Error de validación al eliminar colección: {e}")
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=str(e)
)
except Exception as e:
logger.error(f"Error al eliminar colección: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error al eliminar colección: {str(e)}"
)
@router.get(
"/collections/{collection_name}/info",
response_model=CollectionInfoResponse,
summary="Obtener información de una colección",
description="Obtiene información detallada sobre una colección"
)
async def get_collection_info(collection_name: str):
"""Obtiene información de una colección."""
try:
info = await vector_service.get_collection_info(collection_name)
if info is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Colección '{collection_name}' no encontrada"
)
return info
except HTTPException:
raise
except Exception as e:
logger.error(f"Error al obtener info de colección: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error al obtener información de colección: {str(e)}"
)
# ============================================================================
# Endpoints de Archivos en Colecciones
# ============================================================================
@router.post(
"/files/exists",
response_model=FileExistsInCollectionResponse,
summary="Verificar si un archivo existe en una colección",
description="Verifica si un archivo específico existe en una colección"
)
async def check_file_exists_in_collection(request: FileExistsInCollectionRequest):
"""Verifica si un archivo existe en una colección."""
try:
return await vector_service.check_file_exists_in_collection(
request.collection_name,
request.file_name
)
except Exception as e:
logger.error(f"Error al verificar archivo en colección: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error al verificar archivo: {str(e)}"
)
@router.get(
"/collections/{collection_name}/files/{file_name}/chunks",
response_model=GetChunksByFileResponse,
summary="Obtener chunks de un archivo",
description="Obtiene todos los chunks de un archivo específico en una colección"
)
async def get_chunks_by_file(
collection_name: str,
file_name: str,
limit: Optional[int] = Query(None, description="Límite de chunks a retornar")
):
"""Obtiene todos los chunks de un archivo."""
try:
return await vector_service.get_chunks_by_file(
collection_name,
file_name,
limit
)
except ValueError as e:
logger.warning(f"Error de validación al obtener chunks: {e}")
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=str(e)
)
except Exception as e:
logger.error(f"Error al obtener chunks: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error al obtener chunks: {str(e)}"
)
@router.delete(
"/collections/{collection_name}/files/{file_name}",
response_model=DeleteFileFromCollectionResponse,
summary="Eliminar un archivo de una colección",
description="Elimina todos los chunks de un archivo de una colección"
)
async def delete_file_from_collection(collection_name: str, file_name: str):
"""Elimina todos los chunks de un archivo."""
try:
return await vector_service.delete_file_from_collection(
collection_name,
file_name
)
except ValueError as e:
logger.warning(f"Error de validación al eliminar archivo: {e}")
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=str(e)
)
except Exception as e:
logger.error(f"Error al eliminar archivo: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error al eliminar archivo: {str(e)}"
)
# ============================================================================
# Endpoints de Chunks
# ============================================================================
@router.post(
"/chunks/add",
response_model=AddChunksResponse,
status_code=status.HTTP_201_CREATED,
summary="Agregar chunks a una colección",
description="Agrega múltiples chunks a una colección existente"
)
async def add_chunks(request: AddChunksRequest):
"""Agrega chunks a una colección."""
try:
return await vector_service.add_chunks(
request.collection_name,
request.chunks
)
except ValueError as e:
logger.warning(f"Error de validación al agregar chunks: {e}")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=str(e)
)
except Exception as e:
logger.error(f"Error al agregar chunks: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error al agregar chunks: {str(e)}"
)
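
For illustration (not part of this commit), exercising the health check and collection creation against the same assumed local deployment:

import requests

BASE = "http://localhost:8000/api/v1/vectors"

print(requests.get(f"{BASE}/health").json())

resp = requests.post(
    f"{BASE}/collections/create",
    json={"collection_name": "docs", "vector_size": 3072, "distance": "Cosine"},
)
print(resp.status_code)  # 201 on success; 400 if the collection already exists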

app/services/chunking_service.py

@@ -0,0 +1,316 @@
"""
Servicio de chunking que orquesta todo el proceso:
- Descarga PDF desde Azure Blob
- Procesa con pipeline de chunking
- Genera embeddings con Azure OpenAI
- Sube a Qdrant con IDs determinísticos
"""
import logging
import uuid
from typing import List, Dict, Any, Optional
from io import BytesIO
from azure.storage.blob import BlobServiceClient
from langchain_core.documents import Document
from ..core.config import settings
from ..utils.chunking import (
process_pdf_with_token_control,
get_gemini_client,
GeminiClient
)
from ..services.embedding_service import get_embedding_service
from ..vector_db.factory import get_vector_db
logger = logging.getLogger(__name__)
class ChunkingService:
"""Servicio para procesar PDFs y subirlos a Qdrant"""
def __init__(self):
"""Inicializa el servicio con conexiones a Azure Blob y clientes"""
self.blob_service = BlobServiceClient.from_connection_string(
settings.AZURE_STORAGE_CONNECTION_STRING
)
self.container_name = settings.AZURE_CONTAINER_NAME
self.embedding_service = get_embedding_service()
self.vector_db = get_vector_db()
def _generate_deterministic_id(
self,
file_name: str,
page: int,
chunk_index: int
) -> str:
"""
Genera un ID determinístico para un chunk usando UUID v5.
Args:
file_name: Nombre del archivo
page: Número de página
chunk_index: Índice del chunk dentro de la página
Returns:
ID en formato UUID válido para Qdrant
"""
id_string = f"{file_name}_{page}_{chunk_index}"
# Usar UUID v5 con namespace DNS para generar UUID determinístico
return str(uuid.uuid5(uuid.NAMESPACE_DNS, id_string))
async def download_pdf_from_blob(
self,
file_name: str,
tema: str
) -> bytes:
"""
Descarga un PDF desde Azure Blob Storage.
Args:
file_name: Nombre del archivo
tema: Tema/carpeta del archivo
Returns:
Contenido del PDF en bytes
Raises:
Exception: Si hay error descargando el archivo
"""
try:
blob_path = f"{tema}/{file_name}"
logger.info(f"Descargando PDF: {blob_path}")
blob_client = self.blob_service.get_blob_client(
container=self.container_name,
blob=blob_path
)
pdf_bytes = blob_client.download_blob().readall()
logger.info(f"PDF descargado: {len(pdf_bytes)} bytes")
return pdf_bytes
except Exception as e:
logger.error(f"Error descargando PDF: {e}")
raise
async def process_pdf_preview(
self,
file_name: str,
tema: str,
max_tokens: int = 950,
target_tokens: int = 800,
chunk_size: int = 1000,
chunk_overlap: int = 200,
use_llm: bool = True,
custom_instructions: str = ""
) -> List[Dict[str, Any]]:
"""
Procesa un PDF y genera exactamente 3 chunks de preview.
Args:
file_name: Nombre del archivo PDF
tema: Tema/carpeta del archivo
max_tokens: Límite máximo de tokens por chunk
target_tokens: Tokens objetivo
chunk_size: Tamaño del chunk
chunk_overlap: Solapamiento
use_llm: Si True, usa Gemini para procesamiento inteligente
custom_instructions: Instrucciones personalizadas (solo si use_llm=True)
Returns:
Lista con exactamente 3 chunks de preview con metadata
"""
try:
logger.info(f"Generando preview para {file_name} (tema: {tema})")
# Descargar PDF
pdf_bytes = await self.download_pdf_from_blob(file_name, tema)
# Configurar cliente Gemini si está habilitado
gemini_client = get_gemini_client() if use_llm else None
# Si LLM está deshabilitado, ignorar custom_instructions
instructions = custom_instructions if use_llm else ""
# Procesar PDF
chunks = process_pdf_with_token_control(
pdf_bytes=pdf_bytes,
file_name=file_name,
max_tokens=max_tokens,
target_tokens=target_tokens,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
merge_related=True,
gemini_client=gemini_client,
custom_instructions=instructions,
extract_images=False # Deshabilitado según requerimientos
)
# Tomar los primeros chunks para preview (máximo 3, mínimo 1)
preview_chunks = chunks[:min(3, len(chunks))] if chunks else []
# Formatear para respuesta
result = []
for idx, chunk in enumerate(preview_chunks):
result.append({
"index": idx,
"text": chunk.page_content,
"page": chunk.metadata.get("page", 0),
"file_name": chunk.metadata.get("file_name", file_name),
"tokens": len(chunk.page_content.split()) # Aproximación
})
logger.info(f"Preview generado: {len(result)} chunks")
return result
except Exception as e:
logger.error(f"Error generando preview: {e}")
raise
async def process_pdf_full(
self,
file_name: str,
tema: str,
collection_name: str,
max_tokens: int = 950,
target_tokens: int = 800,
chunk_size: int = 1000,
chunk_overlap: int = 200,
use_llm: bool = True,
custom_instructions: str = "",
progress_callback: Optional[callable] = None
) -> Dict[str, Any]:
"""
Procesa un PDF completo y lo sube a Qdrant.
Args:
file_name: Nombre del archivo PDF
tema: Tema/carpeta del archivo
collection_name: Nombre de la colección en Qdrant
max_tokens: Límite máximo de tokens por chunk
target_tokens: Tokens objetivo
chunk_size: Tamaño del chunk
chunk_overlap: Solapamiento
use_llm: Si True, usa Gemini para procesamiento inteligente
custom_instructions: Instrucciones personalizadas (solo si use_llm=True)
progress_callback: Callback para reportar progreso
Returns:
Diccionario con resultados del procesamiento
"""
try:
logger.info(f"Procesando PDF completo: {file_name} (tema: {tema})")
if progress_callback:
await progress_callback({"status": "downloading", "progress": 0})
# 1. Descargar PDF
pdf_bytes = await self.download_pdf_from_blob(file_name, tema)
if progress_callback:
await progress_callback({"status": "chunking", "progress": 20})
# 2. Configurar cliente Gemini
gemini_client = get_gemini_client() if use_llm else None
instructions = custom_instructions if use_llm else ""
# 3. Procesar PDF
chunks = process_pdf_with_token_control(
pdf_bytes=pdf_bytes,
file_name=file_name,
max_tokens=max_tokens,
target_tokens=target_tokens,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
merge_related=True,
gemini_client=gemini_client,
custom_instructions=instructions,
extract_images=False
)
if progress_callback:
await progress_callback({"status": "embedding", "progress": 50})
# 4. Generar embeddings
texts = [chunk.page_content for chunk in chunks]
logger.info(f"Generando embeddings para {len(texts)} chunks")
embeddings = await self.embedding_service.generate_embeddings_batch(texts)
logger.info(f"Embeddings generados: {len(embeddings)} vectores de dimensión {len(embeddings[0]) if embeddings else 0}")
if progress_callback:
await progress_callback({"status": "uploading", "progress": 80})
# 5. Preparar chunks para Qdrant con IDs determinísticos
qdrant_chunks = []
page_chunk_count = {} # Contador de chunks por página
logger.info(f"Preparando {len(chunks)} chunks con {len(embeddings)} embeddings para subir")
for chunk, embedding in zip(chunks, embeddings):
page = chunk.metadata.get("page", 0)
# Incrementar contador para esta página
if page not in page_chunk_count:
page_chunk_count[page] = 0
chunk_index = page_chunk_count[page]
page_chunk_count[page] += 1
# Generar ID determinístico
chunk_id = self._generate_deterministic_id(
file_name=file_name,
page=page,
chunk_index=chunk_index
)
qdrant_chunks.append({
"id": chunk_id,
"vector": embedding,
"payload": {
"page_content": chunk.page_content,
"metadata": {
"page": page,
"file_name": file_name
}
}
})
# 6. Subir a Qdrant
logger.info(f"Subiendo {len(qdrant_chunks)} chunks a Qdrant colección '{collection_name}'")
result = await self.vector_db.add_chunks(collection_name, qdrant_chunks)
logger.info(f"Resultado de upsert: {result}")
if progress_callback:
await progress_callback({"status": "completed", "progress": 100})
logger.info(f"Procesamiento completo: {result['chunks_added']} chunks subidos")
return {
"success": True,
"collection_name": collection_name,
"file_name": file_name,
"total_chunks": len(chunks),
"chunks_added": result['chunks_added'],
"message": "PDF procesado y subido exitosamente"
}
except Exception as e:
logger.error(f"Error procesando PDF completo: {e}")
if progress_callback:
await progress_callback({"status": "error", "progress": 0, "error": str(e)})
raise
# Instancia global singleton
_chunking_service: ChunkingService | None = None
def get_chunking_service() -> ChunkingService:
"""
Obtiene la instancia singleton del servicio de chunking.
Returns:
Instancia de ChunkingService
"""
global _chunking_service
if _chunking_service is None:
_chunking_service = ChunkingService()
return _chunking_service
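
The deterministic IDs are what make re-processing idempotent: the same (file_name, page, chunk_index) triple always maps to the same Qdrant point ID, so re-running a file upserts over the old points instead of duplicating them. A standalone sketch (not part of this commit; the file name is illustrative):

import uuid

a = uuid.uuid5(uuid.NAMESPACE_DNS, "report.pdf_3_0")
b = uuid.uuid5(uuid.NAMESPACE_DNS, "report.pdf_3_0")
assert a == b  # stable across runs and machines
print(a)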

app/services/embedding_service.py

@@ -0,0 +1,127 @@
"""
Servicio de embeddings usando Azure OpenAI.
Genera embeddings para chunks de texto usando text-embedding-3-large (3072 dimensiones).
"""
import logging
from typing import List
from openai import AzureOpenAI
from ..core.config import settings
logger = logging.getLogger(__name__)
class EmbeddingService:
"""Servicio para generar embeddings usando Azure OpenAI"""
def __init__(self):
"""Inicializa el cliente de Azure OpenAI"""
try:
self.client = AzureOpenAI(
api_key=settings.AZURE_OPENAI_API_KEY,
api_version=settings.AZURE_OPENAI_API_VERSION,
azure_endpoint=settings.AZURE_OPENAI_ENDPOINT
)
self.model = settings.AZURE_OPENAI_EMBEDDING_DEPLOYMENT
self.embedding_dimension = 3072
logger.info(f"EmbeddingService inicializado con modelo {self.model}")
except Exception as e:
logger.error(f"Error inicializando EmbeddingService: {e}")
raise
async def generate_embedding(self, text: str) -> List[float]:
"""
Genera un embedding para un texto individual.
Args:
text: Texto para generar embedding
Returns:
Vector de embedding (3072 dimensiones)
Raises:
Exception: Si hay error al generar el embedding
"""
try:
response = self.client.embeddings.create(
input=[text],
model=self.model
)
embedding = response.data[0].embedding
if len(embedding) != self.embedding_dimension:
raise ValueError(
f"Dimensión incorrecta: esperada {self.embedding_dimension}, "
f"obtenida {len(embedding)}"
)
return embedding
except Exception as e:
logger.error(f"Error generando embedding: {e}")
raise
async def generate_embeddings_batch(
self,
texts: List[str],
batch_size: int = 100
) -> List[List[float]]:
"""
Genera embeddings para múltiples textos en lotes.
Args:
texts: Lista de textos para generar embeddings
batch_size: Tamaño del lote para procesamiento (default: 100)
Returns:
Lista de vectores de embeddings
Raises:
Exception: Si hay error al generar los embeddings
"""
try:
embeddings = []
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
logger.info(f"Procesando lote {i//batch_size + 1}/{(len(texts)-1)//batch_size + 1}")
response = self.client.embeddings.create(
input=batch,
model=self.model
)
batch_embeddings = [item.embedding for item in response.data]
# Validar dimensiones
for idx, emb in enumerate(batch_embeddings):
if len(emb) != self.embedding_dimension:
raise ValueError(
f"Dimensión incorrecta en índice {i + idx}: "
f"esperada {self.embedding_dimension}, obtenida {len(emb)}"
)
embeddings.extend(batch_embeddings)
logger.info(f"Generados {len(embeddings)} embeddings exitosamente")
return embeddings
except Exception as e:
logger.error(f"Error generando embeddings en lote: {e}")
raise
# Instancia global singleton
_embedding_service: EmbeddingService | None = None
def get_embedding_service() -> EmbeddingService:
"""
Obtiene la instancia singleton del servicio de embeddings.
Returns:
Instancia de EmbeddingService
"""
global _embedding_service
if _embedding_service is None:
_embedding_service = EmbeddingService()
return _embedding_service
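
For illustration (not part of this commit), a usage sketch; it assumes valid Azure OpenAI settings in the environment and that the module is importable as app.services.embedding_service:

import asyncio
from app.services.embedding_service import get_embedding_service

async def main():
    service = get_embedding_service()
    vectors = await service.generate_embeddings_batch(["first chunk", "second chunk"])
    print(len(vectors), len(vectors[0]))  # 2 3072

asyncio.run(main())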

app/services/vector_service.py

@@ -0,0 +1,442 @@
"""
Servicio de lógica de negocio para operaciones con bases de datos vectoriales.
Este módulo contiene toda la lógica de negocio relacionada con la gestión
de colecciones y chunks en bases de datos vectoriales.
"""
import logging
from typing import List, Dict, Any, Optional
from app.vector_db import get_vector_db
from app.models.vector_models import (
CollectionCreateRequest,
CollectionCreateResponse,
CollectionDeleteResponse,
CollectionExistsResponse,
CollectionInfoResponse,
FileExistsInCollectionResponse,
GetChunksByFileResponse,
DeleteFileFromCollectionResponse,
AddChunksResponse,
VectorDBHealthResponse,
VectorDBErrorResponse
)
logger = logging.getLogger(__name__)
class VectorService:
"""
Servicio para gestionar operaciones con bases de datos vectoriales.
Este servicio actúa como una capa intermedia entre los routers y
la implementación de la base de datos vectorial.
"""
def __init__(self):
"""Inicializa el servicio con la instancia de la base de datos vectorial."""
self.vector_db = get_vector_db()
async def check_collection_exists(self, collection_name: str) -> CollectionExistsResponse:
"""
Verifica si una colección existe.
Args:
collection_name: Nombre de la colección
Returns:
CollectionExistsResponse: Response con el resultado
"""
try:
exists = await self.vector_db.collection_exists(collection_name)
logger.info(f"Verificación de colección '{collection_name}': {exists}")
return CollectionExistsResponse(
exists=exists,
collection_name=collection_name
)
except Exception as e:
logger.error(f"Error al verificar colección '{collection_name}': {e}")
raise
async def create_collection(
self,
request: CollectionCreateRequest
) -> CollectionCreateResponse:
"""
Crea una nueva colección.
Args:
request: Request con los datos de la colección
Returns:
CollectionCreateResponse: Response con el resultado
Raises:
ValueError: Si la colección ya existe
"""
try:
# Verificar si ya existe
exists = await self.vector_db.collection_exists(request.collection_name)
if exists:
logger.warning(f"Intento de crear colección existente: '{request.collection_name}'")
raise ValueError(f"La colección '{request.collection_name}' ya existe")
# Crear la colección
success = await self.vector_db.create_collection(
collection_name=request.collection_name,
vector_size=request.vector_size,
distance=request.distance
)
if success:
logger.info(f"Colección '{request.collection_name}' creada exitosamente")
return CollectionCreateResponse(
success=True,
collection_name=request.collection_name,
message=f"Colección '{request.collection_name}' creada exitosamente"
)
else:
logger.error(f"Fallo al crear colección '{request.collection_name}'")
raise Exception(f"No se pudo crear la colección '{request.collection_name}'")
except ValueError:
raise
except Exception as e:
logger.error(f"Error al crear colección '{request.collection_name}': {e}")
raise
async def delete_collection(self, collection_name: str) -> CollectionDeleteResponse:
"""
Elimina una colección completa.
Args:
collection_name: Nombre de la colección
Returns:
CollectionDeleteResponse: Response con el resultado
Raises:
ValueError: Si la colección no existe
"""
try:
# Verificar que existe
exists = await self.vector_db.collection_exists(collection_name)
if not exists:
logger.warning(f"Intento de eliminar colección inexistente: '{collection_name}'")
raise ValueError(f"La colección '{collection_name}' no existe")
# Eliminar la colección
success = await self.vector_db.delete_collection(collection_name)
if success:
logger.info(f"Colección '{collection_name}' eliminada exitosamente")
return CollectionDeleteResponse(
success=True,
collection_name=collection_name,
message=f"Colección '{collection_name}' eliminada exitosamente"
)
else:
logger.error(f"Fallo al eliminar colección '{collection_name}'")
raise Exception(f"No se pudo eliminar la colección '{collection_name}'")
except ValueError:
raise
except Exception as e:
logger.error(f"Error al eliminar colección '{collection_name}': {e}")
raise
async def get_collection_info(self, collection_name: str) -> Optional[CollectionInfoResponse]:
"""
Obtiene información de una colección.
Args:
collection_name: Nombre de la colección
Returns:
Optional[CollectionInfoResponse]: Información de la colección o None
"""
try:
info = await self.vector_db.get_collection_info(collection_name)
if info is None:
logger.warning(f"Colección '{collection_name}' no encontrada")
return None
return CollectionInfoResponse(**info)
except Exception as e:
logger.error(f"Error al obtener info de colección '{collection_name}': {e}")
raise
async def check_file_exists_in_collection(
self,
collection_name: str,
file_name: str
) -> FileExistsInCollectionResponse:
"""
Verifica si un archivo existe en una colección.
Args:
collection_name: Nombre de la colección
file_name: Nombre del archivo
Returns:
FileExistsInCollectionResponse: Response con el resultado
"""
try:
# Primero verificar que la colección existe
collection_exists = await self.vector_db.collection_exists(collection_name)
if not collection_exists:
logger.warning(f"Colección '{collection_name}' no existe")
return FileExistsInCollectionResponse(
exists=False,
collection_name=collection_name,
file_name=file_name,
chunk_count=0
)
# Verificar si el archivo existe
file_exists = await self.vector_db.file_exists_in_collection(
collection_name,
file_name
)
chunk_count = None
if file_exists:
chunk_count = await self.vector_db.count_chunks_in_file(
collection_name,
file_name
)
logger.info(
f"Archivo '{file_name}' en colección '{collection_name}': "
f"existe={file_exists}, chunks={chunk_count}"
)
return FileExistsInCollectionResponse(
exists=file_exists,
collection_name=collection_name,
file_name=file_name,
chunk_count=chunk_count
)
except Exception as e:
logger.error(
f"Error al verificar archivo '{file_name}' "
f"en colección '{collection_name}': {e}"
)
raise
async def get_chunks_by_file(
self,
collection_name: str,
file_name: str,
limit: Optional[int] = None
) -> GetChunksByFileResponse:
"""
Obtiene todos los chunks de un archivo.
Args:
collection_name: Nombre de la colección
file_name: Nombre del archivo
limit: Límite opcional de chunks
Returns:
GetChunksByFileResponse: Response con los chunks
Raises:
ValueError: Si la colección no existe
"""
try:
# Verificar que la colección existe
exists = await self.vector_db.collection_exists(collection_name)
if not exists:
logger.warning(f"Colección '{collection_name}' no existe")
raise ValueError(f"La colección '{collection_name}' no existe")
# Obtener chunks
chunks = await self.vector_db.get_chunks_by_file(
collection_name,
file_name,
limit
)
logger.info(
f"Obtenidos {len(chunks)} chunks del archivo '{file_name}' "
f"de la colección '{collection_name}'"
)
return GetChunksByFileResponse(
collection_name=collection_name,
file_name=file_name,
chunks=chunks,
total_chunks=len(chunks)
)
except ValueError:
raise
except Exception as e:
logger.error(
f"Error al obtener chunks del archivo '{file_name}' "
f"de la colección '{collection_name}': {e}"
)
raise
async def delete_file_from_collection(
self,
collection_name: str,
file_name: str
) -> DeleteFileFromCollectionResponse:
"""
Elimina todos los chunks de un archivo de una colección.
Args:
collection_name: Nombre de la colección
file_name: Nombre del archivo
Returns:
DeleteFileFromCollectionResponse: Response con el resultado
Raises:
ValueError: Si la colección no existe o el archivo no está en la colección
"""
try:
# Verificar que la colección existe
collection_exists = await self.vector_db.collection_exists(collection_name)
if not collection_exists:
logger.warning(f"Colección '{collection_name}' no existe")
raise ValueError(f"La colección '{collection_name}' no existe")
# Verificar que el archivo existe en la colección
file_exists = await self.vector_db.file_exists_in_collection(
collection_name,
file_name
)
if not file_exists:
logger.warning(
f"Archivo '{file_name}' no existe en colección '{collection_name}'"
)
raise ValueError(
f"El archivo '{file_name}' no existe en la colección '{collection_name}'"
)
# Eliminar el archivo
chunks_deleted = await self.vector_db.delete_file_from_collection(
collection_name,
file_name
)
logger.info(
f"Eliminados {chunks_deleted} chunks del archivo '{file_name}' "
f"de la colección '{collection_name}'"
)
return DeleteFileFromCollectionResponse(
success=True,
collection_name=collection_name,
file_name=file_name,
chunks_deleted=chunks_deleted,
message=f"Archivo '{file_name}' eliminado exitosamente ({chunks_deleted} chunks)"
)
except ValueError:
raise
except Exception as e:
logger.error(
f"Error al eliminar archivo '{file_name}' "
f"de la colección '{collection_name}': {e}"
)
raise
async def add_chunks(
self,
collection_name: str,
chunks: List[Dict[str, Any]]
) -> AddChunksResponse:
"""
Agrega chunks a una colección.
Args:
collection_name: Nombre de la colección
chunks: Lista de chunks a agregar
Returns:
AddChunksResponse: Response con el resultado
Raises:
ValueError: Si la colección no existe
"""
try:
# Verificar que la colección existe
exists = await self.vector_db.collection_exists(collection_name)
if not exists:
logger.warning(f"Colección '{collection_name}' no existe")
raise ValueError(f"La colección '{collection_name}' no existe")
# Agregar chunks
success = await self.vector_db.add_chunks(collection_name, chunks)
if success:
logger.info(
f"Agregados {len(chunks)} chunks a la colección '{collection_name}'"
)
return AddChunksResponse(
success=True,
collection_name=collection_name,
chunks_added=len(chunks),
message=f"Se agregaron {len(chunks)} chunks exitosamente"
)
else:
logger.error(f"Fallo al agregar chunks a '{collection_name}'")
raise Exception(f"No se pudieron agregar los chunks a '{collection_name}'")
except ValueError:
raise
except Exception as e:
logger.error(f"Error al agregar chunks a '{collection_name}': {e}")
raise
async def health_check(self) -> VectorDBHealthResponse:
"""
Verifica el estado de la conexión con la base de datos vectorial.
Returns:
VectorDBHealthResponse: Response con el estado
"""
try:
is_healthy = await self.vector_db.health_check()
if is_healthy:
return VectorDBHealthResponse(
status="healthy",
db_type="qdrant",
message="Conexión exitosa con la base de datos vectorial"
)
else:
return VectorDBHealthResponse(
status="unhealthy",
db_type="qdrant",
message="No se pudo conectar con la base de datos vectorial"
)
except Exception as e:
logger.error(f"Error en health check: {e}")
return VectorDBHealthResponse(
status="error",
db_type="qdrant",
message=f"Error al verificar conexión: {str(e)}"
)
# Instancia global del servicio
vector_service = VectorService()

app/utils/chunking/__init__.py

@@ -0,0 +1,18 @@
"""
Utilidades de chunking para procesamiento de PDFs.
Refactorización modular del pipeline de chunking_token.py
"""
from .gemini_client import GeminiClient, get_gemini_client
from .token_manager import TokenManager
from .chunk_processor import OptimizedChunkProcessor
from .pdf_extractor import OptimizedPDFExtractor
from .pipeline import process_pdf_with_token_control
__all__ = [
"GeminiClient",
"get_gemini_client",
"TokenManager",
"OptimizedChunkProcessor",
"OptimizedPDFExtractor",
"process_pdf_with_token_control",
]

app/utils/chunking/chunk_processor.py

@@ -0,0 +1,258 @@
"""
Procesador optimizado de chunks con soporte para LLM (Gemini).
Permite merge inteligente y mejora de chunks usando IA.
"""
import logging
import time
import hashlib
from typing import List, Optional
from langchain_core.documents import Document
from .token_manager import TokenManager
from .gemini_client import GeminiClient
logger = logging.getLogger(__name__)
class OptimizedChunkProcessor:
"""Procesador de chunks con optimización mediante LLM"""
def __init__(
self,
max_tokens: int = 1000,
target_tokens: int = 800,
chunks_per_batch: int = 5,
gemini_client: Optional[GeminiClient] = None,
model_name: str = "gpt-3.5-turbo",
custom_instructions: str = ""
):
"""
Inicializa el procesador de chunks.
Args:
max_tokens: Límite máximo de tokens por chunk
target_tokens: Tokens objetivo para chunks optimizados
chunks_per_batch: Chunks a procesar por lote
gemini_client: Cliente de Gemini para procesamiento (opcional)
model_name: Modelo para cálculo de tokens
custom_instructions: Instrucciones adicionales para el prompt de optimización
"""
self.client = gemini_client
self.chunks_per_batch = chunks_per_batch
self.max_tokens = max_tokens
self.target_tokens = target_tokens
self.token_manager = TokenManager(model_name)
self.custom_instructions = custom_instructions
# Caché para evitar reprocesamiento
self._merge_cache = {}
self._enhance_cache = {}
def _get_cache_key(self, text: str) -> str:
"""Genera una clave de caché para el texto"""
combined = text + self.custom_instructions
return hashlib.md5(combined.encode()).hexdigest()[:16]
def should_merge_chunks(self, chunk1: str, chunk2: str) -> bool:
"""
Determina si dos chunks deben unirse basándose en continuidad semántica.
Args:
chunk1: Primer chunk
chunk2: Segundo chunk
Returns:
True si los chunks deben unirse
"""
cache_key = f"{self._get_cache_key(chunk1)}_{self._get_cache_key(chunk2)}"
if cache_key in self._merge_cache:
return self._merge_cache[cache_key]
try:
combined_text = f"{chunk1}\n\n{chunk2}"
combined_tokens = self.token_manager.count_tokens(combined_text)
if combined_tokens > self.max_tokens:
self._merge_cache[cache_key] = False
return False
if self.client:
base_prompt = f"""Analiza estos dos fragmentos de texto y determina si deben unirse.
LÍMITES ESTRICTOS:
- Tokens combinados: {combined_tokens}/{self.max_tokens}
- Solo unir si hay continuidad semántica clara
Criterios de unión:
1. El primer fragmento termina abruptamente
2. El segundo fragmento continúa la misma idea/concepto
3. La unión mejora la coherencia del contenido
4. Exceder {self.max_tokens} tokens, SOLAMENTE si es necesario para mantener el contexto
Responde SOLO 'SI' o 'NO'.
Fragmento 1 ({self.token_manager.count_tokens(chunk1)} tokens):
{chunk1[:500]}...
Fragmento 2 ({self.token_manager.count_tokens(chunk2)} tokens):
{chunk2[:500]}..."""
response = self.client.generate_content(base_prompt)
result = response.strip().upper() == 'SI'
self._merge_cache[cache_key] = result
return result
# Heurística simple si no hay cliente LLM
result = (
chunk1.rstrip().endswith(('.', '!', '?')) == False and
combined_tokens <= self.target_tokens
)
self._merge_cache[cache_key] = result
return result
except Exception as e:
logger.error(f"Error analizando chunks para merge: {e}")
self._merge_cache[cache_key] = False
return False
def enhance_chunk(self, chunk_text: str) -> str:
"""
Mejora un chunk usando LLM o truncamiento.
Args:
chunk_text: Texto del chunk a mejorar
Returns:
Texto del chunk mejorado
"""
cache_key = self._get_cache_key(chunk_text)
if cache_key in self._enhance_cache:
return self._enhance_cache[cache_key]
current_tokens = self.token_manager.count_tokens(chunk_text)
try:
if self.client and current_tokens < self.max_tokens:
base_prompt = f"""Optimiza este texto siguiendo estas reglas ESTRICTAS:
LÍMITES DE TOKENS:
- Actual: {current_tokens} tokens
- Máximo permitido: {self.max_tokens} tokens
- Objetivo: {self.target_tokens} tokens
REGLAS FUNDAMENTALES:
NO exceder {self.max_tokens} tokens bajo ninguna circunstancia
Mantener TODA la información esencial y metadatos
NO cambiar términos técnicos o palabras clave
Asegurar oraciones completas y coherentes
Optimizar claridad y estructura sin añadir contenido
SOLO devuelve el texto no agregues conclusiones NUNCA
Si el texto está cerca del límite, NO expandir. Solo mejorar estructura."""
if self.custom_instructions.strip():
base_prompt += f"\n\nINSTRUCCIONES ADICIONALES:\n{self.custom_instructions}"
base_prompt += f"\n\nTexto a optimizar:\n{chunk_text}"
response = self.client.generate_content(base_prompt)
enhanced_text = response.strip()
enhanced_tokens = self.token_manager.count_tokens(enhanced_text)
if enhanced_tokens > self.max_tokens:
logger.warning(
f"Texto optimizado excede límite ({enhanced_tokens} > {self.max_tokens}), truncando"
)
enhanced_text = self.token_manager.truncate_to_tokens(enhanced_text, self.max_tokens)
self._enhance_cache[cache_key] = enhanced_text
return enhanced_text
else:
# Sin LLM o ya en límite, solo truncar si es necesario
if current_tokens > self.max_tokens:
truncated = self.token_manager.truncate_to_tokens(chunk_text, self.max_tokens)
self._enhance_cache[cache_key] = truncated
return truncated
self._enhance_cache[cache_key] = chunk_text
return chunk_text
except Exception as e:
logger.error(f"Error procesando chunk: {e}")
if current_tokens > self.max_tokens:
truncated = self.token_manager.truncate_to_tokens(chunk_text, self.max_tokens)
self._enhance_cache[cache_key] = truncated
return truncated
self._enhance_cache[cache_key] = chunk_text
return chunk_text
def process_chunks_batch(
self,
chunks: List[Document],
merge_related: bool = False
) -> List[Document]:
"""
Procesa un lote de chunks, aplicando merge y mejoras.
Args:
chunks: Lista de documentos a procesar
merge_related: Si True, intenta unir chunks relacionados
Returns:
Lista de documentos procesados
"""
processed_chunks = []
total_chunks = len(chunks)
logger.info(f"Procesando {total_chunks} chunks en lotes de {self.chunks_per_batch}")
if self.custom_instructions:
logger.info(f"Con instrucciones personalizadas: {self.custom_instructions[:100]}...")
i = 0
while i < len(chunks):
batch_start = time.time()
current_chunk = chunks[i]
merged_content = current_chunk.page_content
# Intentar merge si está habilitado
if merge_related and i < len(chunks) - 1:
merge_count = 0
while (
i + merge_count < len(chunks) - 1 and
self.should_merge_chunks(
merged_content,
chunks[i + merge_count + 1].page_content
)
):
merge_count += 1
merged_content += "\n\n" + chunks[i + merge_count].page_content
logger.info(f" Uniendo chunk {i + 1} con chunk {i + merge_count + 1}")
i += merge_count
logger.info(f"\nProcesando chunk {i + 1}/{total_chunks}")
logger.info(f" Tokens originales: {original_tokens}")
# Mejorar chunk
enhanced_content = self.enhance_chunk(merged_content)
final_tokens = self.token_manager.count_tokens(enhanced_content)
processed_chunks.append(Document(
page_content=enhanced_content,
metadata={
**current_chunk.metadata,
}
))
logger.info(f" Tokens finales: {final_tokens}")
logger.info(f" Tiempo de procesamiento: {time.time() - batch_start:.2f}s")
i += 1
if i % self.chunks_per_batch == 0 and i < len(chunks):
logger.info(f"\nCompletados {i}/{total_chunks} chunks")
time.sleep(0.1)
return processed_chunks
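For reference, a minimal usage sketch of the processor without an LLM client, so merging falls back to the sentence-boundary heuristic above and enhancement only truncates; the sample texts and limits are illustrative only:

# Hedged usage sketch (illustrative; not part of the commit).
from langchain_core.documents import Document

processor = OptimizedChunkProcessor(
    max_tokens=950,
    target_tokens=800,
    gemini_client=None,  # no LLM: merge via heuristic, enhance only truncates
)
docs = [
    Document(page_content="Primera parte de una idea que continúa", metadata={"page": 1}),
    Document(page_content="y termina aquí.", metadata={"page": 1}),
]
result = processor.process_chunks_batch(docs, merge_related=True)
for d in result:
    print(processor.token_manager.count_tokens(d.page_content), d.metadata)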

View File

@@ -0,0 +1,91 @@
"""
Cliente para interactuar con Gemini (Google Vertex AI).
Usado para procesamiento inteligente de chunks con LLM.
"""
import logging
import os
import google.oauth2.service_account as sa
import vertexai.generative_models as gm
import vertexai
from ...core.config import settings
logger = logging.getLogger(__name__)
class GeminiClient:
"""Cliente para generar contenido usando Gemini via Vertex AI"""
def __init__(
self,
account_file: str | None = None,
project: str | None = None,
model: str | None = None
) -> None:
"""
Inicializa el cliente de Gemini.
Args:
account_file: Ruta al archivo de credenciales de servicio (default: desde settings)
project: ID del proyecto de Google Cloud (default: desde settings)
model: Modelo de Gemini a usar (default: desde settings)
"""
# Usar configuración de settings si no se proporciona
account_file = account_file or settings.GOOGLE_APPLICATION_CREDENTIALS
project = project or settings.GOOGLE_CLOUD_PROJECT
model = model or settings.GEMINI_MODEL
try:
# Cargar credenciales desde archivo
credentials = sa.Credentials.from_service_account_file(account_file)
# Inicializar Vertex AI
vertexai.init(
project=project,
credentials=credentials,
location=settings.GOOGLE_CLOUD_LOCATION
)
# Inicializar modelo
self.model = gm.GenerativeModel(model)
logger.info(f"GeminiClient inicializado con modelo {model}")
except Exception as e:
logger.error(f"Error inicializando GeminiClient: {e}")
raise
def generate_content(self, prompt: str) -> str:
"""
Genera contenido usando Gemini.
Args:
prompt: Prompt para el modelo
Returns:
Texto generado por el modelo
Raises:
Exception: Si hay error en la generación
"""
try:
response = self.model.generate_content(prompt)
return response.text
except Exception as e:
logger.error(f"Error en Gemini: {e}")
return ""
# Instancia global singleton
_gemini_client: GeminiClient | None = None
def get_gemini_client() -> GeminiClient:
"""
Obtiene la instancia singleton del cliente de Gemini.
Returns:
Instancia de GeminiClient
"""
global _gemini_client
if _gemini_client is None:
_gemini_client = GeminiClient()
return _gemini_client
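A minimal sketch of how the singleton is meant to be consumed; it assumes valid GOOGLE_APPLICATION_CREDENTIALS and GOOGLE_CLOUD_PROJECT values in settings, since GeminiClient() reads them from there:

# Hedged usage sketch (assumes valid Google Cloud credentials in settings).
client = get_gemini_client()  # lazily builds the singleton on first call
answer = client.generate_content("Responde SOLO 'SI' o 'NO': ¿2 + 2 = 4?")
if not answer:
    # generate_content swallows errors and returns "" (see above),
    # so callers must treat the empty string as a failure signal.
    print("Gemini devolvió vacío: tratar como fallo")
else:
    print(answer)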

View File

@@ -0,0 +1,299 @@
"""
Extractor optimizado de PDFs con soporte para BytesIO y procesamiento paralelo.
Adaptado para trabajar con Azure Blob Storage sin archivos temporales.
"""
import logging
import os
import time
import hashlib
from typing import List, Optional, Dict, BinaryIO
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
from langchain_core.documents import Document
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pdf2image import convert_from_bytes
from .token_manager import TokenManager
from .chunk_processor import OptimizedChunkProcessor
from .gemini_client import GeminiClient
logger = logging.getLogger(__name__)
class OptimizedPDFExtractor:
"""Extractor optimizado de PDFs con soporte para BytesIO"""
def __init__(
self,
max_tokens: int = 1000,
target_tokens: int = 800,
gemini_client: Optional[GeminiClient] = None,
custom_instructions: str = "",
extract_images: bool = False, # Por defecto deshabilitado según requerimientos
max_workers: int = 4
):
"""
Inicializa el extractor de PDFs.
Args:
max_tokens: Límite máximo de tokens por chunk
target_tokens: Tokens objetivo para chunks
gemini_client: Cliente de Gemini (opcional)
custom_instructions: Instrucciones adicionales para optimización
extract_images: Si True, extrae páginas con formato especial como imágenes
max_workers: Número máximo de workers para procesamiento paralelo
"""
self.client = gemini_client
self.max_workers = max_workers
self.token_manager = TokenManager()
self.custom_instructions = custom_instructions
self.extract_images = extract_images
self._format_cache = {}
self.chunk_processor = OptimizedChunkProcessor(
max_tokens=max_tokens,
target_tokens=target_tokens,
gemini_client=gemini_client,
custom_instructions=custom_instructions
)
def detect_special_format_batch(self, chunks: List[Document]) -> Dict[int, bool]:
"""
Detecta chunks con formatos especiales (tablas, diagramas, etc.) en lote.
Args:
chunks: Lista de chunks a analizar
Returns:
Diccionario con índices de chunks y si tienen formato especial
"""
results = {}
chunks_to_process = []
for i, chunk in enumerate(chunks):
cache_key = hashlib.md5(chunk.page_content.encode()).hexdigest()[:16]
if cache_key in self._format_cache:
results[i] = self._format_cache[cache_key]
else:
chunks_to_process.append((i, chunk, cache_key))
if not chunks_to_process:
return results
logger.info(f"Analizando {len(chunks_to_process)} chunks para formatos especiales...")
if self.client and len(chunks_to_process) > 1:
with ThreadPoolExecutor(max_workers=min(self.max_workers, len(chunks_to_process))) as executor:
futures = {
executor.submit(self._detect_single_format, chunk): (i, cache_key)
for i, chunk, cache_key in chunks_to_process
}
for future in futures:
i, cache_key = futures[future]
try:
result = future.result()
results[i] = result
self._format_cache[cache_key] = result
except Exception as e:
logger.error(f"Error procesando chunk {i}: {e}")
results[i] = False
self._format_cache[cache_key] = False
else:
for i, chunk, cache_key in chunks_to_process:
result = self._detect_single_format(chunk)
results[i] = result
self._format_cache[cache_key] = result
return results
def _detect_single_format(self, chunk: Document) -> bool:
"""Detecta formato especial en un chunk individual."""
if not self.client:
content = chunk.page_content
# Caracteres típicos de tablas/diagramas (box-drawing, reconstruidos) y ASCII
table_indicators = ['│', '─', '┌', '┐', '└', '┘', '├', '┤', '|', '+']
has_table_chars = any(char in content for char in table_indicators)
# Tabulaciones o dobles espacios frecuentes sugieren texto en columnas
has_multiple_columns = content.count('\t') > 10 or content.count('  ') > 20
return has_table_chars or has_multiple_columns
try:
prompt = f"""¿Contiene este texto tablas estructuradas, diagramas ASCII, o elementos que requieren formato especial?
Responde SOLO 'SI' o 'NO'.
Texto:
{chunk.page_content[:1000]}"""
response = self.client.generate_content(prompt)
return response.strip().upper() == 'SI'
except Exception as e:
logger.error(f"Error detectando formato: {e}")
return False
def process_pdf_from_bytes(
self,
pdf_bytes: bytes,
file_name: str,
chunk_size: int = 1000,
chunk_overlap: int = 200,
merge_related: bool = True
) -> List[Document]:
"""
Procesa un PDF desde bytes (BytesIO).
Args:
pdf_bytes: Contenido del PDF en bytes
file_name: Nombre del archivo PDF
chunk_size: Tamaño del chunk
chunk_overlap: Solapamiento entre chunks
merge_related: Si True, intenta unir chunks relacionados
Returns:
Lista de documentos procesados
"""
overall_start = time.time()
logger.info(f"\n=== Iniciando procesamiento optimizado de PDF: {file_name} ===")
logger.info(f"Configuración:")
logger.info(f" - Tokens máximos por chunk: {self.chunk_processor.max_tokens}")
logger.info(f" - Tokens objetivo: {self.chunk_processor.target_tokens}")
logger.info(f" - Chunk size: {chunk_size}")
logger.info(f" - Chunk overlap: {chunk_overlap}")
logger.info(f" - Merge relacionados: {merge_related}")
logger.info(f" - Extraer imágenes: {'' if self.extract_images else ''}")
if self.custom_instructions:
logger.info(f" - Instrucciones personalizadas: {self.custom_instructions[:100]}...")
logger.info(f"\n1. Creando chunks del PDF...")
chunks = self._create_optimized_chunks_from_bytes(
pdf_bytes,
file_name,
chunk_size,
chunk_overlap
)
logger.info(f" Total chunks creados: {len(chunks)}")
# Nota: La extracción de imágenes desde bytes no se implementa por ahora
# ya que extract_images está deshabilitado por defecto según requerimientos
if self.extract_images:
logger.warning("Extracción de imágenes desde bytes no implementada aún")
logger.info(f"\n2. Procesando y optimizando chunks...")
processed_chunks = self.chunk_processor.process_chunks_batch(chunks, merge_related)
total_time = time.time() - overall_start
if processed_chunks:
avg_tokens = sum(
self.token_manager.count_tokens(chunk.page_content)
for chunk in processed_chunks
) / len(processed_chunks)
else:
avg_tokens = 0
logger.info(f"\n=== Procesamiento completado ===")
logger.info(f" Tiempo total: {total_time:.2f}s")
logger.info(f" Chunks procesados: {len(processed_chunks)}")
logger.info(f" Tokens promedio por chunk: {avg_tokens:.1f}")
if self.custom_instructions:
logger.info(f" Custom instructions aplicadas: ✅")
return processed_chunks
def _create_optimized_chunks_from_bytes(
self,
pdf_bytes: bytes,
file_name: str,
chunk_size: int,
chunk_overlap: int
) -> List[Document]:
"""
Crea chunks optimizados desde bytes del PDF.
Args:
pdf_bytes: Contenido del PDF en bytes
file_name: Nombre del archivo
chunk_size: Tamaño del chunk
chunk_overlap: Solapamiento entre chunks
Returns:
Lista de documentos con chunks
"""
logger.info(f" Leyendo PDF desde bytes: {file_name}")
# Crear BytesIO para pypdf
pdf_buffer = BytesIO(pdf_bytes)
pdf = PdfReader(pdf_buffer)
chunks = []
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
length_function=self.token_manager.count_tokens,
separators=["\n\n", "\n", ". ", " ", ""]
)
# Extraer todo el texto concatenado con tracking de páginas
full_text = ""
page_boundaries = [] # Lista de (char_position, page_num)
for page_num, page in enumerate(pdf.pages, 1):
text = page.extract_text()
if text.strip():
page_start = len(full_text)
full_text += text
# Agregar separador entre páginas (excepto después de la última)
if page_num < len(pdf.pages):
full_text += "\n\n"
page_end = len(full_text)
page_boundaries.append((page_start, page_end, page_num))
if not full_text.strip():
return []
# Dividir el texto completo (esto permite overlap entre páginas)
text_chunks = text_splitter.split_text(full_text)
logger.info(f" Total de chunks generados por splitter: {len(text_chunks)}")
if len(text_chunks) >= 2:
# Verificar overlap entre primer y segundo chunk
chunk0_end = text_chunks[0][-100:] if len(text_chunks[0]) > 100 else text_chunks[0]
chunk1_start = text_chunks[1][:100] if len(text_chunks[1]) > 100 else text_chunks[1]
logger.info(f" Chunk 0 termina con: ...{chunk0_end}")
logger.info(f" Chunk 1 empieza con: {chunk1_start}...")
# Asignar página a cada chunk basándonos en su posición en el texto original
chunks = []
current_search_pos = 0
for chunk_text in text_chunks:
# Buscar donde aparece este chunk en el texto completo
chunk_pos = full_text.find(chunk_text, current_search_pos)
if chunk_pos == -1:
# Si no lo encontramos, usar la última posición conocida
chunk_pos = current_search_pos
# Determinar la página basándonos en la posición del inicio del chunk
chunk_page = 1
for start, end, page_num in page_boundaries:
if chunk_pos >= start and chunk_pos < end:
chunk_page = page_num
break
elif chunk_pos >= end:
# El chunk está después de esta página, continuar buscando
chunk_page = page_num # Guardar la última página vista
chunks.append(Document(
page_content=chunk_text,
metadata={
"page": chunk_page,
"file_name": file_name,
}
))
# Actualizar posición de búsqueda para el siguiente chunk
current_search_pos = chunk_pos + len(chunk_text)
return chunks
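The page attribution above relies on str.find over the concatenated text; a small self-contained sketch of the same idea, with toy boundaries rather than a real extractor:

# Hedged sketch of the page-attribution idea used in
# _create_optimized_chunks_from_bytes (toy data, same algorithm).
full_text = "Pagina uno.\n\nPagina dos con mas texto."
page_boundaries = [(0, 13, 1), (13, len(full_text), 2)]  # (start, end, page)

def page_for(chunk_text: str, search_from: int = 0) -> int:
    pos = full_text.find(chunk_text, search_from)
    if pos == -1:
        pos = search_from  # fall back to the last known position
    page = 1
    for start, end, num in page_boundaries:
        if start <= pos < end:
            return num
        if pos >= end:
            page = num  # remember the last page seen
    return page

assert page_for("Pagina dos") == 2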

View File

@@ -0,0 +1,65 @@
"""
Pipeline principal para procesar PDFs con control de tokens.
Función de alto nivel que orquesta el proceso completo de chunking.
"""
import logging
from typing import List, Optional
from langchain_core.documents import Document
from .pdf_extractor import OptimizedPDFExtractor
from .gemini_client import GeminiClient
logger = logging.getLogger(__name__)
def process_pdf_with_token_control(
pdf_bytes: bytes,
file_name: str,
max_tokens: int = 950,
target_tokens: int = 800,
chunk_size: int = 1000,
chunk_overlap: int = 200,
merge_related: bool = True,
gemini_client: Optional[GeminiClient] = None,
custom_instructions: str = "",
extract_images: bool = False
) -> List[Document]:
"""
Función principal para procesar PDFs con control completo de tokens.
Args:
pdf_bytes: Contenido del PDF en bytes
file_name: Nombre del archivo PDF
max_tokens: Límite máximo de tokens por chunk
target_tokens: Tokens objetivo para optimización
chunk_size: Tamaño base de chunks
chunk_overlap: Solapamiento entre chunks
merge_related: Si unir chunks relacionados
gemini_client: Cliente de Gemini (opcional, para LLM processing)
custom_instructions: Instrucciones adicionales para optimización
extract_images: Si True, extrae páginas con formato especial como imágenes
Returns:
Lista de documentos procesados con metadata simple (page, file_name)
"""
logger.info(f"Iniciando pipeline de chunking para {file_name}")
extractor = OptimizedPDFExtractor(
max_tokens=max_tokens,
target_tokens=target_tokens,
gemini_client=gemini_client,
custom_instructions=custom_instructions,
extract_images=extract_images,
max_workers=4
)
chunks = extractor.process_pdf_from_bytes(
pdf_bytes=pdf_bytes,
file_name=file_name,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
merge_related=merge_related
)
logger.info(f"Pipeline completado: {len(chunks)} chunks generados")
return chunks
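An end-to-end sketch of the pipeline starting from a local file; the path is hypothetical and the LLM is disabled for simplicity:

# Hedged end-to-end sketch (hypothetical path; LLM disabled).
with open("docs/ejemplo.pdf", "rb") as f:
    pdf_bytes = f.read()

docs = process_pdf_with_token_control(
    pdf_bytes=pdf_bytes,
    file_name="ejemplo.pdf",
    max_tokens=950,
    target_tokens=800,
    gemini_client=None,  # or get_gemini_client() to enable LLM optimization
)
for d in docs[:3]:
    print(d.metadata["page"], len(d.page_content))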

View File

@@ -0,0 +1,72 @@
"""
Gestor de tokens para contar y truncar texto basado en modelos de tokenización.
"""
import logging
import tiktoken
logger = logging.getLogger(__name__)
class TokenManager:
"""Gestor para contar y truncar tokens usando tiktoken"""
def __init__(self, model_name: str = "gpt-3.5-turbo"):
"""
Inicializa el gestor de tokens.
Args:
model_name: Nombre del modelo para la codificación de tokens
"""
try:
self.encoding = tiktoken.encoding_for_model(model_name)
except KeyError:
logger.warning(
f"Modelo {model_name} no encontrado, usando codificación por defecto cl100k_base"
)
self.encoding = tiktoken.get_encoding("cl100k_base")
def count_tokens(self, text: str) -> int:
"""
Cuenta el número de tokens en un texto.
Args:
text: Texto a analizar
Returns:
Número de tokens
"""
return len(self.encoding.encode(text))
def truncate_to_tokens(
self,
text: str,
max_tokens: int,
preserve_sentences: bool = True
) -> str:
"""
Trunca texto a un número máximo de tokens.
Args:
text: Texto a truncar
max_tokens: Número máximo de tokens
preserve_sentences: Si True, intenta mantener oraciones completas
Returns:
Texto truncado
"""
tokens = self.encoding.encode(text)
if len(tokens) <= max_tokens:
return text
truncated_tokens = tokens[:max_tokens]
truncated_text = self.encoding.decode(truncated_tokens)
if preserve_sentences:
# Intentar cortar en el último punto
last_period = truncated_text.rfind('.')
# Solo cortar si el punto está en el último 30% del texto
if last_period > len(truncated_text) * 0.7:
return truncated_text[:last_period + 1]
return truncated_text
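A short sketch of both operations; the exact counts depend on the tokenizer, so the printed numbers are illustrative:

# Hedged usage sketch; exact token counts depend on the encoding.
tm = TokenManager()  # falls back to cl100k_base if the model is unknown
text = "Primera oración completa. Segunda oración que podría cortarse."
print(tm.count_tokens(text))

short = tm.truncate_to_tokens(text, max_tokens=8, preserve_sentences=True)
# With preserve_sentences=True the cut lands on the last '.' only if that
# period sits in the final 30% of the truncated text; otherwise a hard cut.
print(short)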

View File

@@ -0,0 +1,12 @@
"""
Vector Database Module
Este módulo proporciona una abstracción para trabajar con bases de datos vectoriales.
Utiliza el patrón Repository para permitir cambiar fácilmente entre diferentes
implementaciones (Qdrant, Pinecone, Weaviate, etc.).
"""
from .base import VectorDBBase
from .factory import get_vector_db
__all__ = ["VectorDBBase", "get_vector_db"]

View File

@@ -0,0 +1,189 @@
"""
Clase abstracta base para operaciones con bases de datos vectoriales.
Este módulo define la interfaz que todas las implementaciones de bases de datos
vectoriales deben seguir, permitiendo cambiar fácilmente entre diferentes proveedores.
"""
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional
class VectorDBBase(ABC):
"""
Clase abstracta que define las operaciones básicas para una base de datos vectorial.
Esta interfaz permite implementar el patrón Repository/Strategy para
abstraer la lógica de acceso a datos vectoriales.
"""
@abstractmethod
async def collection_exists(self, collection_name: str) -> bool:
"""
Verifica si existe una colección con el nombre especificado.
Args:
collection_name: Nombre de la colección a verificar
Returns:
bool: True si la colección existe, False en caso contrario
"""
pass
@abstractmethod
async def create_collection(
self,
collection_name: str,
vector_size: int = 3072,
distance: str = "Cosine"
) -> bool:
"""
Crea una nueva colección en la base de datos vectorial.
Args:
collection_name: Nombre de la colección a crear
vector_size: Dimensión de los vectores (por defecto 3072)
distance: Métrica de distancia ("Cosine", "Euclid", "Dot")
Returns:
bool: True si se creó exitosamente, False en caso contrario
"""
pass
@abstractmethod
async def delete_collection(self, collection_name: str) -> bool:
"""
Elimina una colección completa.
Args:
collection_name: Nombre de la colección a eliminar
Returns:
bool: True si se eliminó exitosamente, False en caso contrario
"""
pass
@abstractmethod
async def file_exists_in_collection(
self,
collection_name: str,
file_name: str
) -> bool:
"""
Verifica si un archivo ya existe en una colección.
Args:
collection_name: Nombre de la colección
file_name: Nombre del archivo a buscar
Returns:
bool: True si el archivo existe, False en caso contrario
"""
pass
@abstractmethod
async def get_chunks_by_file(
self,
collection_name: str,
file_name: str,
limit: Optional[int] = None
) -> List[Dict[str, Any]]:
"""
Obtiene todos los chunks de un archivo específico.
Args:
collection_name: Nombre de la colección
file_name: Nombre del archivo
limit: Límite opcional de resultados
Returns:
List[Dict]: Lista de chunks con su metadata
"""
pass
@abstractmethod
async def delete_file_from_collection(
self,
collection_name: str,
file_name: str
) -> int:
"""
Elimina todos los chunks de un archivo de una colección.
Args:
collection_name: Nombre de la colección
file_name: Nombre del archivo a eliminar
Returns:
int: Número de chunks eliminados
"""
pass
@abstractmethod
async def add_chunks(
self,
collection_name: str,
chunks: List[Dict[str, Any]]
) -> Dict[str, Any]:
"""
Agrega múltiples chunks a una colección.
Args:
collection_name: Nombre de la colección
chunks: Lista de chunks con estructura:
{
"id": str,
"vector": List[float],
"payload": {
"text": str,
"file_name": str,
"page": int,
...otros campos opcionales
}
}
Returns:
Dict con 'success' (bool) y 'chunks_added' (int)
"""
pass
@abstractmethod
async def get_collection_info(self, collection_name: str) -> Optional[Dict[str, Any]]:
"""
Obtiene información sobre una colección.
Args:
collection_name: Nombre de la colección
Returns:
Optional[Dict]: Información de la colección o None si no existe
"""
pass
@abstractmethod
async def count_chunks_in_file(
self,
collection_name: str,
file_name: str
) -> int:
"""
Cuenta el número de chunks de un archivo.
Args:
collection_name: Nombre de la colección
file_name: Nombre del archivo
Returns:
int: Número de chunks del archivo
"""
pass
@abstractmethod
async def health_check(self) -> bool:
"""
Verifica que la conexión con la base de datos vectorial esté funcionando.
Returns:
bool: True si la conexión es exitosa, False en caso contrario
"""
pass
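Because consumers depend only on this interface, utilities can stay provider-agnostic; a minimal sketch of a helper written against the abstraction (the helper itself is a suggestion, not part of the commit):

# Hedged sketch: a helper coded against VectorDBBase, not a concrete client.
async def ensure_collection(db: VectorDBBase, name: str, vector_size: int = 3072) -> None:
    """Crea la colección solo si todavía no existe."""
    if not await db.collection_exists(name):
        created = await db.create_collection(name, vector_size=vector_size)
        if not created:
            raise RuntimeError(f"No se pudo crear la colección '{name}'")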

View File

@@ -0,0 +1,76 @@
"""
Factory para crear instancias de bases de datos vectoriales.
Este módulo implementa el patrón Factory para crear la instancia correcta
de base de datos vectorial según la configuración.
"""
import logging
from typing import Optional
from app.core.config import settings
from .base import VectorDBBase
from .qdrant_client import QdrantVectorDB
logger = logging.getLogger(__name__)
# Instancia global singleton
_vector_db_instance: Optional[VectorDBBase] = None
def get_vector_db() -> VectorDBBase:
"""
Factory function que retorna la instancia de base de datos vectorial configurada.
Utiliza un patrón Singleton para mantener una sola instancia durante
el ciclo de vida de la aplicación.
Returns:
VectorDBBase: Instancia de la base de datos vectorial configurada
Raises:
ValueError: Si el tipo de base de datos no está soportado
"""
global _vector_db_instance
# Si ya existe una instancia, retornarla
if _vector_db_instance is not None:
return _vector_db_instance
# Crear nueva instancia según configuración
db_type = settings.VECTOR_DB_TYPE.lower()
if db_type == "qdrant":
logger.info(f"Inicializando Qdrant con URL: {settings.QDRANT_URL}")
_vector_db_instance = QdrantVectorDB(
url=settings.QDRANT_URL,
api_key=settings.QDRANT_API_KEY
)
# Aquí se pueden agregar otros proveedores en el futuro
# elif db_type == "pinecone":
# _vector_db_instance = PineconeVectorDB(...)
# elif db_type == "weaviate":
# _vector_db_instance = WeaviateVectorDB(...)
else:
raise ValueError(
f"Tipo de base de datos vectorial no soportado: {db_type}. "
f"Tipos soportados: qdrant"
)
logger.info(f"Base de datos vectorial '{db_type}' inicializada exitosamente")
return _vector_db_instance
def reset_vector_db() -> None:
"""
Resetea la instancia global de la base de datos vectorial.
NOTA: Esta función solo descarta la referencia al cliente en memoria;
NO cierra conexiones explícitamente ni elimina/modifica datos en Qdrant.
Útil principalmente para testing.
"""
global _vector_db_instance
_vector_db_instance = None
logger.info("Instancia de base de datos vectorial reseteada")

View File

@@ -0,0 +1,410 @@
"""
Implementación de Qdrant para la interfaz VectorDBBase.
Este módulo proporciona la implementación concreta de todas las operaciones
vectoriales utilizando Qdrant como base de datos.
"""
import logging
from typing import List, Dict, Any, Optional
from qdrant_client import QdrantClient
from qdrant_client.models import (
Distance,
VectorParams,
PointStruct,
Filter,
FieldCondition,
MatchValue
)
from qdrant_client.http.exceptions import UnexpectedResponse
from .base import VectorDBBase
logger = logging.getLogger(__name__)
class QdrantVectorDB(VectorDBBase):
"""
Implementación de VectorDBBase usando Qdrant como proveedor.
Atributos:
client: Cliente de Qdrant
url: URL del servidor Qdrant
api_key: API key para autenticación
"""
def __init__(self, url: str, api_key: str):
"""
Inicializa el cliente de Qdrant.
Args:
url: URL del servidor Qdrant
api_key: API key para autenticación
"""
self.url = url
self.api_key = api_key
self.client = QdrantClient(
url=url,
api_key=api_key,
timeout=30
)
logger.info(f"QdrantVectorDB inicializado con URL: {url}")
async def collection_exists(self, collection_name: str) -> bool:
"""
Verifica si existe una colección en Qdrant.
Args:
collection_name: Nombre de la colección
Returns:
bool: True si existe, False en caso contrario
"""
try:
collections = self.client.get_collections().collections
return any(col.name == collection_name for col in collections)
except Exception as e:
logger.error(f"Error al verificar colección '{collection_name}': {e}")
return False
async def create_collection(
self,
collection_name: str,
vector_size: int = 3072,
distance: str = "Cosine"
) -> bool:
"""
Crea una nueva colección en Qdrant.
Args:
collection_name: Nombre de la colección
vector_size: Dimensión de los vectores (default: 3072)
distance: Métrica de distancia
Returns:
bool: True si se creó exitosamente
"""
try:
# Mapear string a enum de Qdrant
distance_map = {
"Cosine": Distance.COSINE,
"Euclid": Distance.EUCLID,
"Dot": Distance.DOT
}
distance_metric = distance_map.get(distance, Distance.COSINE)
self.client.create_collection(
collection_name=collection_name,
vectors_config=VectorParams(
size=vector_size,
distance=distance_metric
)
)
logger.info(f"Colección '{collection_name}' creada exitosamente")
return True
except Exception as e:
logger.error(f"Error al crear colección '{collection_name}': {e}")
return False
async def delete_collection(self, collection_name: str) -> bool:
"""
Elimina una colección completa de Qdrant.
Args:
collection_name: Nombre de la colección
Returns:
bool: True si se eliminó exitosamente
"""
try:
self.client.delete_collection(collection_name=collection_name)
logger.info(f"Colección '{collection_name}' eliminada exitosamente")
return True
except Exception as e:
logger.error(f"Error al eliminar colección '{collection_name}': {e}")
return False
async def file_exists_in_collection(
self,
collection_name: str,
file_name: str
) -> bool:
"""
Verifica si un archivo existe en una colección.
Args:
collection_name: Nombre de la colección
file_name: Nombre del archivo
Returns:
bool: True si el archivo existe
"""
try:
# Buscar un solo punto con el file_name en metadata
result = self.client.scroll(
collection_name=collection_name,
scroll_filter=Filter(
must=[
FieldCondition(
key="metadata.file_name",
match=MatchValue(value=file_name)
)
]
),
limit=1
)
return len(result[0]) > 0
except Exception as e:
logger.error(f"Error al verificar archivo '{file_name}' en colección '{collection_name}': {e}")
return False
async def get_chunks_by_file(
self,
collection_name: str,
file_name: str,
limit: Optional[int] = None
) -> List[Dict[str, Any]]:
"""
Obtiene todos los chunks de un archivo.
Args:
collection_name: Nombre de la colección
file_name: Nombre del archivo
limit: Límite opcional de resultados
Returns:
List[Dict]: Lista de chunks con metadata
"""
try:
chunks = []
offset = None
while True:
result = self.client.scroll(
collection_name=collection_name,
scroll_filter=Filter(
must=[
FieldCondition(
key="metadata.file_name",
match=MatchValue(value=file_name)
)
]
),
limit=min(limit, 100) if limit else 100,
offset=offset
)
points, next_offset = result
for point in points:
chunks.append({
"id": str(point.id),
"payload": point.payload,
"vector": point.vector if hasattr(point, 'vector') else None
})
# Si hay límite y lo alcanzamos, recortar el exceso y salir
if limit and len(chunks) >= limit:
    chunks = chunks[:limit]
    break
# Si no hay más resultados, salimos
if next_offset is None:
break
offset = next_offset
logger.info(f"Obtenidos {len(chunks)} chunks del archivo '{file_name}'")
return chunks
except Exception as e:
logger.error(f"Error al obtener chunks del archivo '{file_name}': {e}")
return []
async def delete_file_from_collection(
self,
collection_name: str,
file_name: str
) -> int:
"""
Elimina todos los chunks de un archivo.
Args:
collection_name: Nombre de la colección
file_name: Nombre del archivo
Returns:
int: Número de chunks eliminados
"""
try:
# Primero obtener todos los IDs del archivo
chunks = await self.get_chunks_by_file(collection_name, file_name)
if not chunks:
logger.info(f"No se encontraron chunks para el archivo '{file_name}'")
return 0
# Extraer los IDs
point_ids = [chunk["id"] for chunk in chunks]
# Eliminar por IDs
self.client.delete(
collection_name=collection_name,
points_selector=point_ids
)
logger.info(f"Eliminados {len(point_ids)} chunks del archivo '{file_name}'")
return len(point_ids)
except Exception as e:
logger.error(f"Error al eliminar archivo '{file_name}': {e}")
return 0
async def add_chunks(
self,
collection_name: str,
chunks: List[Dict[str, Any]]
) -> Dict[str, Any]:
"""
Agrega múltiples chunks a una colección.
Args:
collection_name: Nombre de la colección
chunks: Lista de chunks con estructura:
{
"id": str,
"vector": List[float],
"payload": {
"page_content": str,
"metadata": {
"file_name": str,
"page": int
}
}
}
Returns:
Dict con 'success' (bool) y 'chunks_added' (int)
"""
try:
points = []
for chunk in chunks:
point = PointStruct(
id=chunk["id"],
vector=chunk["vector"],
payload=chunk["payload"]
)
points.append(point)
self.client.upsert(
collection_name=collection_name,
points=points
)
logger.info(f"Agregados {len(points)} chunks a la colección '{collection_name}'")
return {
"success": True,
"chunks_added": len(points)
}
except Exception as e:
logger.error(f"Error al agregar chunks a '{collection_name}': {e}")
return {
"success": False,
"chunks_added": 0
}
async def get_collection_info(self, collection_name: str) -> Optional[Dict[str, Any]]:
"""
Obtiene información sobre una colección.
Args:
collection_name: Nombre de la colección
Returns:
Optional[Dict]: Información de la colección o None
"""
try:
collection_info = self.client.get_collection(collection_name=collection_name)
return {
"name": collection_name,
"vectors_count": collection_info.points_count,
"vectors_config": {
"size": collection_info.config.params.vectors.size,
"distance": collection_info.config.params.vectors.distance.name
},
"status": collection_info.status.name
}
except UnexpectedResponse as e:
if e.status_code == 404:
logger.warning(f"Colección '{collection_name}' no encontrada")
return None
logger.error(f"Error al obtener info de colección '{collection_name}': {e}")
return None
except Exception as e:
logger.error(f"Error inesperado al obtener info de colección '{collection_name}': {e}")
return None
async def count_chunks_in_file(
self,
collection_name: str,
file_name: str
) -> int:
"""
Cuenta el número de chunks de un archivo.
Args:
collection_name: Nombre de la colección
file_name: Nombre del archivo
Returns:
int: Número de chunks
"""
try:
# Usar la API de count de Qdrant con el mismo filtro que el resto de
# operaciones: la metadata vive bajo "metadata.file_name"
result = self.client.count(
collection_name=collection_name,
count_filter=Filter(
must=[
FieldCondition(
key="metadata.file_name",
match=MatchValue(value=file_name)
)
]
),
exact=True
)
count = result.count
logger.info(f"Archivo '{file_name}' tiene {count} chunks")
return count
except Exception as e:
logger.error(f"Error al contar chunks del archivo '{file_name}': {e}")
return 0
async def health_check(self) -> bool:
"""
Verifica la conexión con Qdrant.
Returns:
bool: True si la conexión es exitosa
"""
try:
self.client.get_collections()
logger.info("Health check de Qdrant exitoso")
return True
except Exception as e:
logger.error(f"Health check de Qdrant falló: {e}")
return False
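Putting the pieces together; the URL, API key and zero vector below are placeholders (real payloads carry 3072-dimension embeddings):

# Hedged end-to-end sketch with placeholder credentials and a dummy vector.
import asyncio
import uuid

async def demo() -> None:
    db = QdrantVectorDB(url="http://localhost:6333", api_key="changeme")
    if not await db.collection_exists("tema_demo"):
        await db.create_collection("tema_demo", vector_size=3072)
    result = await db.add_chunks("tema_demo", [{
        "id": str(uuid.uuid4()),
        "vector": [0.0] * 3072,  # placeholder; use real embeddings in practice
        "payload": {
            "page_content": "texto de ejemplo",
            "metadata": {"file_name": "ejemplo.pdf", "page": 1},
        },
    }])
    print(result, await db.count_chunks_in_file("tema_demo", "ejemplo.pdf"))

asyncio.run(demo())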

View File

@@ -12,6 +12,19 @@ dependencies = [
"python-multipart>=0.0.20", "python-multipart>=0.0.20",
"qdrant-client>=1.15.1", "qdrant-client>=1.15.1",
"uvicorn[standard]>=0.35.0", "uvicorn[standard]>=0.35.0",
# Chunking & PDF processing
"pypdf>=5.1.0",
"pdf2image>=1.17.0",
"pillow>=11.0.0",
# LLM & Embeddings
"openai>=1.59.6",
"google-cloud-aiplatform>=1.77.0",
"langchain>=0.3.12",
"langchain-core>=0.3.24",
"tiktoken>=0.8.0",
# WebSockets
"websockets>=14.1",
"langchain-text-splitters>=1.0.0",
] ]
[project.scripts] [project.scripts]
dev = "uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload" dev = "uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload"

1170
backend/uv.lock generated

File diff suppressed because it is too large

View File

@@ -11,7 +11,10 @@
"@radix-ui/react-checkbox": "^1.3.3", "@radix-ui/react-checkbox": "^1.3.3",
"@radix-ui/react-dialog": "^1.1.15", "@radix-ui/react-dialog": "^1.1.15",
"@radix-ui/react-label": "^2.1.7", "@radix-ui/react-label": "^2.1.7",
"@radix-ui/react-select": "^2.2.6",
"@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-slot": "^1.2.3",
"@radix-ui/react-switch": "^1.2.6",
"@radix-ui/react-tabs": "^1.1.13",
"class-variance-authority": "^0.7.1", "class-variance-authority": "^0.7.1",
"clsx": "^2.1.1", "clsx": "^2.1.1",
"lucide-react": "^0.543.0", "lucide-react": "^0.543.0",
@@ -878,6 +881,44 @@
"node": "^18.18.0 || ^20.9.0 || >=21.1.0" "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
} }
}, },
"node_modules/@floating-ui/core": {
"version": "1.7.3",
"resolved": "https://registry.npmjs.org/@floating-ui/core/-/core-1.7.3.tgz",
"integrity": "sha512-sGnvb5dmrJaKEZ+LDIpguvdX3bDlEllmv4/ClQ9awcmCZrlx5jQyyMWFM5kBI+EyNOCDDiKk8il0zeuX3Zlg/w==",
"license": "MIT",
"dependencies": {
"@floating-ui/utils": "^0.2.10"
}
},
"node_modules/@floating-ui/dom": {
"version": "1.7.4",
"resolved": "https://registry.npmjs.org/@floating-ui/dom/-/dom-1.7.4.tgz",
"integrity": "sha512-OOchDgh4F2CchOX94cRVqhvy7b3AFb+/rQXyswmzmGakRfkMgoWVjfnLWkRirfLEfuD4ysVW16eXzwt3jHIzKA==",
"license": "MIT",
"dependencies": {
"@floating-ui/core": "^1.7.3",
"@floating-ui/utils": "^0.2.10"
}
},
"node_modules/@floating-ui/react-dom": {
"version": "2.1.6",
"resolved": "https://registry.npmjs.org/@floating-ui/react-dom/-/react-dom-2.1.6.tgz",
"integrity": "sha512-4JX6rEatQEvlmgU80wZyq9RT96HZJa88q8hp0pBd+LrczeDI4o6uA2M+uvxngVHo4Ihr8uibXxH6+70zhAFrVw==",
"license": "MIT",
"dependencies": {
"@floating-ui/dom": "^1.7.4"
},
"peerDependencies": {
"react": ">=16.8.0",
"react-dom": ">=16.8.0"
}
},
"node_modules/@floating-ui/utils": {
"version": "0.2.10",
"resolved": "https://registry.npmjs.org/@floating-ui/utils/-/utils-0.2.10.tgz",
"integrity": "sha512-aGTxbpbg8/b5JfU1HXSrbH3wXZuLPJcNEcZQFMxLs3oSzgtVu6nFPkbbGGUvBcUjKV2YyB9Wxxabo+HEH9tcRQ==",
"license": "MIT"
},
"node_modules/@humanfs/core": { "node_modules/@humanfs/core": {
"version": "0.19.1", "version": "0.19.1",
"resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz", "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz",
@@ -1204,12 +1245,41 @@
"node": ">= 8" "node": ">= 8"
} }
}, },
"node_modules/@radix-ui/number": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/@radix-ui/number/-/number-1.1.1.tgz",
"integrity": "sha512-MkKCwxlXTgz6CFoJx3pCwn07GKp36+aZyu/u2Ln2VrA5DcdyCZkASEDBTd8x5whTQQL5CiYf4prXKLcgQdv29g==",
"license": "MIT"
},
"node_modules/@radix-ui/primitive": { "node_modules/@radix-ui/primitive": {
"version": "1.1.3", "version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==", "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/@radix-ui/react-arrow": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
"integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-primitive": "2.1.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-checkbox": { "node_modules/@radix-ui/react-checkbox": {
"version": "1.3.3", "version": "1.3.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-checkbox/-/react-checkbox-1.3.3.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-checkbox/-/react-checkbox-1.3.3.tgz",
@@ -1240,6 +1310,32 @@
} }
} }
}, },
"node_modules/@radix-ui/react-collection": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.7.tgz",
"integrity": "sha512-Fh9rGN0MoI4ZFUNyfFVNU4y9LUz93u9/0K+yLgA2bwRojxM8JU1DyvvMBabnZPBgMWREAJvU2jjVzq+LrFUglw==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-compose-refs": { "node_modules/@radix-ui/react-compose-refs": {
"version": "1.1.2", "version": "1.1.2",
"resolved": "https://registry.npmjs.org/@radix-ui/react-compose-refs/-/react-compose-refs-1.1.2.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-compose-refs/-/react-compose-refs-1.1.2.tgz",
@@ -1305,6 +1401,21 @@
} }
} }
}, },
"node_modules/@radix-ui/react-direction": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-direction/-/react-direction-1.1.1.tgz",
"integrity": "sha512-1UEWRX6jnOA2y4H5WczZ44gOOjTEmlqv1uNW4GAJEO5+bauCBhv8snY65Iw5/VOS/ghKN9gr2KjnLKxrsvoMVw==",
"license": "MIT",
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-dismissable-layer": { "node_modules/@radix-ui/react-dismissable-layer": {
"version": "1.1.11", "version": "1.1.11",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
@@ -1413,6 +1524,38 @@
} }
} }
}, },
"node_modules/@radix-ui/react-popper": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz",
"integrity": "sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==",
"license": "MIT",
"dependencies": {
"@floating-ui/react-dom": "^2.0.0",
"@radix-ui/react-arrow": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-layout-effect": "1.1.1",
"@radix-ui/react-use-rect": "1.1.1",
"@radix-ui/react-use-size": "1.1.1",
"@radix-ui/rect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-portal": { "node_modules/@radix-ui/react-portal": {
"version": "1.1.9", "version": "1.1.9",
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
@@ -1484,6 +1627,80 @@
} }
} }
}, },
"node_modules/@radix-ui/react-roving-focus": {
"version": "1.1.11",
"resolved": "https://registry.npmjs.org/@radix-ui/react-roving-focus/-/react-roving-focus-1.1.11.tgz",
"integrity": "sha512-7A6S9jSgm/S+7MdtNDSb+IU859vQqJ/QAtcYQcfFC6W8RS4IxIZDldLR0xqCFZ6DCyrQLjLPsxtTNch5jVA4lA==",
"license": "MIT",
"dependencies": {
"@radix-ui/primitive": "1.1.3",
"@radix-ui/react-collection": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-direction": "1.1.1",
"@radix-ui/react-id": "1.1.1",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-controllable-state": "1.2.2"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-select": {
"version": "2.2.6",
"resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.6.tgz",
"integrity": "sha512-I30RydO+bnn2PQztvo25tswPH+wFBjehVGtmagkU78yMdwTwVf12wnAOF+AeP8S2N8xD+5UPbGhkUfPyvT+mwQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/number": "1.1.1",
"@radix-ui/primitive": "1.1.3",
"@radix-ui/react-collection": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-direction": "1.1.1",
"@radix-ui/react-dismissable-layer": "1.1.11",
"@radix-ui/react-focus-guards": "1.1.3",
"@radix-ui/react-focus-scope": "1.1.7",
"@radix-ui/react-id": "1.1.1",
"@radix-ui/react-popper": "1.2.8",
"@radix-ui/react-portal": "1.1.9",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-slot": "1.2.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-controllable-state": "1.2.2",
"@radix-ui/react-use-layout-effect": "1.1.1",
"@radix-ui/react-use-previous": "1.1.1",
"@radix-ui/react-visually-hidden": "1.2.3",
"aria-hidden": "^1.2.4",
"react-remove-scroll": "^2.6.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-slot": { "node_modules/@radix-ui/react-slot": {
"version": "1.2.3", "version": "1.2.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.3.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.3.tgz",
@@ -1502,6 +1719,65 @@
} }
} }
}, },
"node_modules/@radix-ui/react-switch": {
"version": "1.2.6",
"resolved": "https://registry.npmjs.org/@radix-ui/react-switch/-/react-switch-1.2.6.tgz",
"integrity": "sha512-bByzr1+ep1zk4VubeEVViV592vu2lHE2BZY5OnzehZqOOgogN80+mNtCqPkhn2gklJqOpxWgPoYTSnhBCqpOXQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/primitive": "1.1.3",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-controllable-state": "1.2.2",
"@radix-ui/react-use-previous": "1.1.1",
"@radix-ui/react-use-size": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-tabs": {
"version": "1.1.13",
"resolved": "https://registry.npmjs.org/@radix-ui/react-tabs/-/react-tabs-1.1.13.tgz",
"integrity": "sha512-7xdcatg7/U+7+Udyoj2zodtI9H/IIopqo+YOIcZOq1nJwXWBZ9p8xiu5llXlekDbZkca79a/fozEYQXIA4sW6A==",
"license": "MIT",
"dependencies": {
"@radix-ui/primitive": "1.1.3",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-direction": "1.1.1",
"@radix-ui/react-id": "1.1.1",
"@radix-ui/react-presence": "1.1.5",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-roving-focus": "1.1.11",
"@radix-ui/react-use-controllable-state": "1.2.2"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-use-callback-ref": { "node_modules/@radix-ui/react-use-callback-ref": {
"version": "1.1.1", "version": "1.1.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-callback-ref/-/react-use-callback-ref-1.1.1.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-use-callback-ref/-/react-use-callback-ref-1.1.1.tgz",
@@ -1602,6 +1878,24 @@
} }
} }
}, },
"node_modules/@radix-ui/react-use-rect": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-rect/-/react-use-rect-1.1.1.tgz",
"integrity": "sha512-QTYuDesS0VtuHNNvMh+CjlKJ4LJickCMUAqjlE3+j8w+RlRpwyX3apEQKGFzbZGdo7XNG1tXa+bQqIE7HIXT2w==",
"license": "MIT",
"dependencies": {
"@radix-ui/rect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-use-size": { "node_modules/@radix-ui/react-use-size": {
"version": "1.1.1", "version": "1.1.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-size/-/react-use-size-1.1.1.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-use-size/-/react-use-size-1.1.1.tgz",
@@ -1620,6 +1914,35 @@
} }
} }
}, },
"node_modules/@radix-ui/react-visually-hidden": {
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-visually-hidden/-/react-visually-hidden-1.2.3.tgz",
"integrity": "sha512-pzJq12tEaaIhqjbzpCuv/OypJY/BPavOofm+dbab+MHLajy277+1lLm6JFcGgF5eskJ6mquGirhXY2GD/8u8Ug==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-primitive": "2.1.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/rect": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/@radix-ui/rect/-/rect-1.1.1.tgz",
"integrity": "sha512-HPwpGIzkl28mWyZqG52jiqDJ12waP11Pa1lGoiyUkIEuMLBP0oeK/C89esbXrxsky5we7dfd8U58nm0SgAWpVw==",
"license": "MIT"
},
"node_modules/@rolldown/pluginutils": { "node_modules/@rolldown/pluginutils": {
"version": "1.0.0-beta.34", "version": "1.0.0-beta.34",
"resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.34.tgz", "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.34.tgz",

View File

@@ -13,7 +13,10 @@
"@radix-ui/react-checkbox": "^1.3.3", "@radix-ui/react-checkbox": "^1.3.3",
"@radix-ui/react-dialog": "^1.1.15", "@radix-ui/react-dialog": "^1.1.15",
"@radix-ui/react-label": "^2.1.7", "@radix-ui/react-label": "^2.1.7",
"@radix-ui/react-select": "^2.2.6",
"@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-slot": "^1.2.3",
"@radix-ui/react-switch": "^1.2.6",
"@radix-ui/react-tabs": "^1.1.13",
"class-variance-authority": "^0.7.1", "class-variance-authority": "^0.7.1",
"clsx": "^2.1.1", "clsx": "^2.1.1",
"lucide-react": "^0.543.0", "lucide-react": "^0.543.0",

View File

@@ -0,0 +1,228 @@
import { useState, useEffect } from 'react'
import { api } from '../services/api'
import {
Dialog,
DialogContent,
DialogDescription,
DialogFooter,
DialogHeader,
DialogTitle,
} from './ui/dialog'
import { Button } from './ui/button'
import { AlertCircle, Loader2, FileText, CheckCircle2, XCircle } from 'lucide-react'
import type { ChunkingConfig } from './ChunkingConfigModal'
interface ChunkPreviewPanelProps {
isOpen: boolean
onClose: () => void
config: ChunkingConfig | null
onAccept: (config: ChunkingConfig) => void
onCancel: () => void
}
interface PreviewChunk {
index: number
text: string
page: number
file_name: string
tokens: number
}
export function ChunkPreviewPanel({
isOpen,
onClose,
config,
onAccept,
onCancel,
}: ChunkPreviewPanelProps) {
const [chunks, setChunks] = useState<PreviewChunk[]>([])
const [loading, setLoading] = useState(false)
const [processing, setProcessing] = useState(false)
const [error, setError] = useState<string | null>(null)
const [success, setSuccess] = useState(false)
// Auto-cargar preview cuando se abre el modal
useEffect(() => {
if (isOpen && config && chunks.length === 0) {
loadPreview()
}
}, [isOpen, config])
const loadPreview = async () => {
if (!config) return
setLoading(true)
setError(null)
setSuccess(false)
try {
const result = await api.generateChunkPreview(config)
setChunks(result.chunks)
} catch (err) {
console.error('Error loading preview:', err)
setError(err instanceof Error ? err.message : 'Error generando preview')
} finally {
setLoading(false)
}
}
const handleAccept = async () => {
if (!config) return
setProcessing(true)
setError(null)
try {
await onAccept(config)
setSuccess(true)
// Cerrar después de 2 segundos
setTimeout(() => {
handleClose()
}, 2000)
} catch (err) {
console.error('Error processing:', err)
setError(err instanceof Error ? err.message : 'Error procesando PDF')
} finally {
setProcessing(false)
}
}
const handleCancel = () => {
onCancel()
handleClose()
}
const handleClose = () => {
setChunks([])
setError(null)
setSuccess(false)
onClose()
}
if (!config) return null
return (
<Dialog open={isOpen} onOpenChange={handleClose}>
<DialogContent className="max-w-4xl max-h-[85vh] flex flex-col">
<DialogHeader>
<DialogTitle className="flex items-center gap-2">
<FileText className="w-5 h-5" />
Preview de Chunks
</DialogTitle>
<DialogDescription>
Vista previa de chunks para <strong>{config.file_name}</strong>
</DialogDescription>
</DialogHeader>
{/* Contenido */}
<div className="flex-1 overflow-y-auto">
{loading ? (
<div className="flex items-center justify-center py-12">
<Loader2 className="w-8 h-8 animate-spin text-gray-400" />
<span className="ml-2 text-gray-500">Generando preview...</span>
</div>
) : error ? (
<div className="flex items-center gap-2 text-sm text-red-600 bg-red-50 p-4 rounded">
<AlertCircle className="w-5 h-5" />
<span>{error}</span>
</div>
) : success ? (
<div className="flex flex-col items-center justify-center py-12 text-center">
<CheckCircle2 className="w-16 h-16 text-green-500 mb-4" />
<h3 className="text-lg font-semibold text-green-700">
Procesamiento Completado
</h3>
<p className="text-sm text-gray-600 mt-2">
El PDF ha sido procesado y subido a Qdrant exitosamente
</p>
</div>
) : chunks.length === 0 ? (
<div className="text-center py-12 text-gray-500">
<FileText className="w-12 h-12 mx-auto mb-2 text-gray-300" />
<p>No hay chunks para mostrar</p>
</div>
) : (
<div className="space-y-4">
{/* Información de configuración */}
<div className="bg-blue-50 p-3 rounded">
<p className="text-sm text-blue-800">
<strong>Configuración:</strong> Max {config.max_tokens} tokens, Target{' '}
{config.target_tokens} tokens
{config.use_llm && ' | LLM Habilitado'}
</p>
</div>
{/* Lista de chunks */}
{chunks.map((chunk) => (
<div key={chunk.index} className="border rounded-lg p-4 space-y-2">
{/* Header del chunk */}
<div className="flex items-start justify-between">
<div className="flex items-center gap-2">
<span className="text-sm font-semibold text-gray-700">
Chunk #{chunk.index + 1}
</span>
<span className="text-xs text-gray-500 bg-gray-100 px-2 py-1 rounded">
Página {chunk.page}
</span>
<span className="text-xs text-blue-600 bg-blue-100 px-2 py-1 rounded">
~{chunk.tokens} tokens
</span>
</div>
</div>
{/* Texto del chunk */}
<div className="bg-gray-50 p-3 rounded text-sm">
<p className="text-gray-700 whitespace-pre-wrap leading-relaxed">
{chunk.text}
</p>
</div>
{/* Indicador de longitud */}
<div className="text-xs text-gray-500">
Longitud: {chunk.text.length} caracteres
</div>
</div>
))}
{/* Información adicional */}
<div className="bg-yellow-50 border border-yellow-200 p-3 rounded">
<p className="text-sm text-yellow-800">
<strong>Nota:</strong> Estos son chunks de ejemplo (hasta 3). El documento
completo generará más chunks según su tamaño.
</p>
</div>
</div>
)}
</div>
{/* Footer con acciones */}
<DialogFooter className="flex justify-between items-center pt-4 border-t">
<Button
variant="outline"
onClick={handleCancel}
disabled={processing || success}
className="text-red-600 hover:text-red-700 hover:bg-red-50"
>
<XCircle className="w-4 h-4 mr-2" />
Cancelar
</Button>
<Button onClick={handleAccept} disabled={processing || loading || chunks.length === 0 || success}>
{processing ? (
<>
<Loader2 className="w-4 h-4 mr-2 animate-spin" />
Procesando...
</>
) : (
<>
<CheckCircle2 className="w-4 h-4 mr-2" />
Aceptar y Procesar
</>
)}
</Button>
</DialogFooter>
</DialogContent>
</Dialog>
)
}

View File

@@ -0,0 +1,205 @@
import { useEffect, useState } from 'react'
import { api } from '../services/api'
import {
Dialog,
DialogContent,
DialogDescription,
DialogHeader,
DialogTitle,
} from './ui/dialog'
import { Button } from './ui/button'
import { AlertCircle, Loader2, FileText, Trash2 } from 'lucide-react'
interface ChunkViewerModalProps {
isOpen: boolean
onClose: () => void
fileName: string
tema: string
}
interface Chunk {
id: string
payload: {
page_content: string
metadata: {
file_name: string
page: number
[key: string]: any
}
[key: string]: any
}
vector?: number[]
}
export function ChunkViewerModal({ isOpen, onClose, fileName, tema }: ChunkViewerModalProps) {
const [chunks, setChunks] = useState<Chunk[]>([])
const [loading, setLoading] = useState(false)
const [error, setError] = useState<string | null>(null)
const [deleting, setDeleting] = useState(false)
useEffect(() => {
if (isOpen && fileName && tema) {
loadChunks()
}
}, [isOpen, fileName, tema])
const loadChunks = async () => {
setLoading(true)
setError(null)
try {
const result = await api.getChunksByFile(tema, fileName)
setChunks(result.chunks)
} catch (err) {
console.error('Error loading chunks:', err)
setError(err instanceof Error ? err.message : 'Error al cargar chunks')
} finally {
setLoading(false)
}
}
const handleDeleteFile = async () => {
if (!confirm(`¿Estás seguro de eliminar todos los chunks del archivo "${fileName}" de la colección "${tema}"?`)) {
return
}
setDeleting(true)
setError(null)
try {
await api.deleteFileFromCollection(tema, fileName)
alert('Archivo eliminado de la colección exitosamente')
onClose()
} catch (err) {
console.error('Error deleting file from collection:', err)
setError(err instanceof Error ? err.message : 'Error al eliminar archivo')
} finally {
setDeleting(false)
}
}
const handleClose = () => {
setChunks([])
setError(null)
onClose()
}
return (
<Dialog open={isOpen} onOpenChange={handleClose}>
<DialogContent className="max-w-4xl max-h-[80vh] flex flex-col">
<DialogHeader>
<DialogTitle className="flex items-center gap-2">
<FileText className="w-5 h-5" />
Chunks de "{fileName}"
</DialogTitle>
<DialogDescription>
Colección: <strong>{tema}</strong>
</DialogDescription>
</DialogHeader>
{/* Contenido */}
<div className="flex-1 overflow-y-auto">
{loading ? (
<div className="flex items-center justify-center py-8">
<Loader2 className="w-8 h-8 animate-spin text-gray-400" />
<span className="ml-2 text-gray-500">Cargando chunks...</span>
</div>
) : error ? (
<div className="flex items-center gap-2 text-sm text-red-600 bg-red-50 p-4 rounded">
<AlertCircle className="w-5 h-5" />
<span>{error}</span>
</div>
) : chunks.length === 0 ? (
<div className="text-center py-8 text-gray-500">
<FileText className="w-12 h-12 mx-auto mb-2 text-gray-300" />
<p>No se encontraron chunks para este archivo.</p>
<p className="text-sm mt-1">El archivo aún no ha sido procesado o no existe en la colección.</p>
</div>
) : (
<div className="space-y-4">
{/* Estadísticas */}
<div className="bg-blue-50 p-3 rounded">
<p className="text-sm text-blue-800">
<strong>Total de chunks:</strong> {chunks.length}
</p>
</div>
{/* Lista de chunks */}
{chunks.map((chunk, index) => (
<div key={chunk.id} className="border rounded-lg p-4 space-y-2">
{/* Header del chunk */}
<div className="flex items-start justify-between">
<div className="flex items-center gap-2">
<span className="text-sm font-semibold text-gray-700">
Chunk #{index + 1}
</span>
<span className="text-xs text-gray-500">
Página {chunk.payload.metadata.page}
</span>
</div>
<span className="text-xs text-gray-400 font-mono">
ID: {chunk.id.substring(0, 8)}...
</span>
</div>
{/* Chunk text */}
{chunk.payload.page_content && (
<div className="bg-gray-50 p-3 rounded text-sm">
<p className="text-gray-700 whitespace-pre-wrap">
{chunk.payload.page_content}
</p>
<div className="mt-2 text-xs text-gray-500">
<strong>Caracteres:</strong> {chunk.payload.page_content.length}
</div>
</div>
)}
{/* Metadata */}
<div className="text-xs text-gray-500">
<strong>Metadata:</strong>
<pre className="mt-1 bg-gray-100 p-2 rounded overflow-x-auto">
{JSON.stringify(chunk.payload.metadata, null, 2)}
</pre>
</div>
{/* Vector info (optional) */}
{chunk.vector && (
<div className="text-xs text-gray-400">
Vector dimension: {chunk.vector.length}
</div>
)}
</div>
))}
</div>
)}
</div>
{/* Footer with actions */}
<div className="flex justify-between items-center pt-4 border-t">
<Button
variant="outline"
onClick={handleDeleteFile}
disabled={deleting || chunks.length === 0}
className="text-red-600 hover:text-red-700 hover:bg-red-50"
>
{deleting ? (
<>
<Loader2 className="w-4 h-4 mr-2 animate-spin" />
Eliminando...
</>
) : (
<>
<Trash2 className="w-4 h-4 mr-2" />
Eliminar de colección
</>
)}
</Button>
<Button onClick={handleClose}>
Cerrar
</Button>
</div>
</DialogContent>
</Dialog>
)
}


@@ -0,0 +1,307 @@
import { useEffect, useState } from 'react'
import { api } from '../services/api'
import {
Dialog,
DialogContent,
DialogDescription,
DialogFooter,
DialogHeader,
DialogTitle,
} from './ui/dialog'
import { Button } from './ui/button'
import { Label } from './ui/label'
import { Input } from './ui/input'
import { Textarea } from './ui/textarea'
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from './ui/select'
import { Switch } from './ui/switch'
import { AlertCircle, Loader2, Settings, Sparkles } from 'lucide-react'
import { Tabs, TabsContent, TabsList, TabsTrigger } from './ui/tabs'
interface ChunkingConfigModalProps {
isOpen: boolean
onClose: () => void
fileName: string
tema: string
collectionName: string
onPreview: (config: ChunkingConfig) => void
}
export interface ChunkingConfig {
file_name: string
tema: string
collection_name: string
max_tokens: number
target_tokens: number
chunk_size: number
chunk_overlap: number
use_llm: boolean
custom_instructions: string
}
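// A hypothetical ChunkingConfig for reference; the numeric values mirror the
// defaults used below, while file_name/tema/collection_name are illustrative:
const exampleChunkingConfig: ChunkingConfig = {
  file_name: 'manual.pdf',
  tema: 'soporte',
  collection_name: 'soporte',
  max_tokens: 950,
  target_tokens: 800,
  chunk_size: 1000,
  chunk_overlap: 200,
  use_llm: true,
  custom_instructions: '',
}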
interface ChunkingProfile {
id: string
name: string
description: string
max_tokens: number
target_tokens: number
chunk_size: number
chunk_overlap: number
use_llm: boolean
}
export function ChunkingConfigModal({
isOpen,
onClose,
fileName,
tema,
collectionName,
onPreview,
}: ChunkingConfigModalProps) {
const [profiles, setProfiles] = useState<ChunkingProfile[]>([])
const [selectedProfile, setSelectedProfile] = useState<string>('balanced')
const [loading, setLoading] = useState(false)
const [error, setError] = useState<string | null>(null)
// Custom configuration
const [maxTokens, setMaxTokens] = useState(950)
const [targetTokens, setTargetTokens] = useState(800)
const [chunkSize, setChunkSize] = useState(1000)
const [chunkOverlap, setChunkOverlap] = useState(200)
const [useLLM, setUseLLM] = useState(true)
const [customInstructions, setCustomInstructions] = useState('')
useEffect(() => {
if (isOpen) {
loadProfiles()
}
}, [isOpen])
const loadProfiles = async () => {
setLoading(true)
setError(null)
try {
const result = await api.getChunkingProfiles()
setProfiles(result.profiles)
} catch (err) {
console.error('Error loading profiles:', err)
setError(err instanceof Error ? err.message : 'Error cargando perfiles')
} finally {
setLoading(false)
}
}
const handleProfileChange = (profileId: string) => {
setSelectedProfile(profileId)
const profile = profiles.find((p) => p.id === profileId)
if (profile) {
setMaxTokens(profile.max_tokens)
setTargetTokens(profile.target_tokens)
setChunkSize(profile.chunk_size)
setChunkOverlap(profile.chunk_overlap)
setUseLLM(profile.use_llm)
}
}
const handlePreview = () => {
const config: ChunkingConfig = {
file_name: fileName,
tema: tema,
collection_name: collectionName,
max_tokens: maxTokens,
target_tokens: targetTokens,
chunk_size: chunkSize,
chunk_overlap: chunkOverlap,
use_llm: useLLM,
custom_instructions: useLLM ? customInstructions : '',
}
onPreview(config)
}
const handleClose = () => {
setError(null)
onClose()
}
return (
<Dialog open={isOpen} onOpenChange={handleClose}>
<DialogContent className="max-w-2xl max-h-[90vh] flex flex-col">
<DialogHeader>
<DialogTitle className="flex items-center gap-2">
<Settings className="w-5 h-5" />
Configurar Chunking
</DialogTitle>
<DialogDescription>
Configura cómo se procesará el archivo <strong>{fileName}</strong>
</DialogDescription>
</DialogHeader>
{loading ? (
<div className="flex items-center justify-center py-8">
<Loader2 className="w-8 h-8 animate-spin text-gray-400" />
<span className="ml-2 text-gray-500">Cargando perfiles...</span>
</div>
) : error ? (
<div className="flex items-center gap-2 text-sm text-red-600 bg-red-50 p-4 rounded">
<AlertCircle className="w-5 h-5" />
<span>{error}</span>
</div>
) : (
<Tabs defaultValue="profiles" className="flex-1">
<TabsList className="grid w-full grid-cols-2">
<TabsTrigger value="profiles">Perfiles</TabsTrigger>
<TabsTrigger value="custom">Personalizado</TabsTrigger>
</TabsList>
{/* Profiles tab */}
<TabsContent value="profiles" className="space-y-4">
<div className="space-y-2">
<Label>Perfil de Configuración</Label>
<Select value={selectedProfile} onValueChange={handleProfileChange}>
<SelectTrigger>
<SelectValue placeholder="Selecciona un perfil" />
</SelectTrigger>
<SelectContent>
{profiles.map((profile) => (
<SelectItem key={profile.id} value={profile.id}>
<div className="flex flex-col">
<span className="font-medium">{profile.name}</span>
<span className="text-xs text-gray-500">{profile.description}</span>
</div>
</SelectItem>
))}
</SelectContent>
</Select>
</div>
{/* Show details of the selected profile */}
{selectedProfile && (
<div className="bg-gray-50 p-4 rounded-lg space-y-2 text-sm">
<div className="grid grid-cols-2 gap-2">
<div>
<span className="font-medium">Max Tokens:</span> {maxTokens}
</div>
<div>
<span className="font-medium">Target Tokens:</span> {targetTokens}
</div>
<div>
<span className="font-medium">Chunk Size:</span> {chunkSize}
</div>
<div>
<span className="font-medium">Overlap:</span> {chunkOverlap}
</div>
<div className="col-span-2">
<span className="font-medium">LLM:</span>{' '}
{useLLM ? '✅ Habilitado' : '❌ Deshabilitado'}
</div>
</div>
</div>
)}
</TabsContent>
{/* Custom tab */}
<TabsContent value="custom" className="space-y-4 overflow-y-auto max-h-[50vh]">
<div className="grid grid-cols-2 gap-4">
<div className="space-y-2">
<Label htmlFor="maxTokens">Max Tokens</Label>
<Input
id="maxTokens"
type="number"
min={100}
max={2000}
value={maxTokens}
onChange={(e) => setMaxTokens(Number(e.target.value))}
/>
</div>
<div className="space-y-2">
<Label htmlFor="targetTokens">Target Tokens</Label>
<Input
id="targetTokens"
type="number"
min={100}
max={2000}
value={targetTokens}
onChange={(e) => setTargetTokens(Number(e.target.value))}
/>
</div>
<div className="space-y-2">
<Label htmlFor="chunkSize">Chunk Size</Label>
<Input
id="chunkSize"
type="number"
min={100}
max={3000}
value={chunkSize}
onChange={(e) => setChunkSize(Number(e.target.value))}
/>
</div>
<div className="space-y-2">
<Label htmlFor="chunkOverlap">Chunk Overlap</Label>
<Input
id="chunkOverlap"
type="number"
min={0}
max={1000}
value={chunkOverlap}
onChange={(e) => setChunkOverlap(Number(e.target.value))}
/>
</div>
</div>
{/* Toggle LLM */}
<div className="flex items-center justify-between p-4 bg-blue-50 rounded-lg">
<div className="flex items-center gap-2">
<Sparkles className="w-5 h-5 text-blue-600" />
<div>
<Label htmlFor="useLLM" className="font-medium cursor-pointer">
Usar LLM (Gemini)
</Label>
<p className="text-xs text-gray-600">
Procesamiento inteligente con IA
</p>
</div>
</div>
<Switch
id="useLLM"
checked={useLLM}
onCheckedChange={setUseLLM}
/>
</div>
{/* Custom instructions (only when LLM is enabled) */}
{useLLM && (
<div className="space-y-2">
<Label htmlFor="customInstructions">
Instrucciones Personalizadas (Opcional)
</Label>
<Textarea
id="customInstructions"
placeholder="Ej: Mantén todos los términos técnicos en inglés..."
value={customInstructions}
onChange={(e) => setCustomInstructions(e.target.value)}
rows={3}
/>
<p className="text-xs text-gray-500">
Instrucciones adicionales para guiar el procesamiento con IA
</p>
</div>
)}
</TabsContent>
</Tabs>
)}
<DialogFooter className="flex justify-between items-center pt-4 border-t">
<Button variant="outline" onClick={handleClose}>
Cancelar
</Button>
<Button onClick={handlePreview} disabled={loading}>
Generar Preview
</Button>
</DialogFooter>
</DialogContent>
</Dialog>
)
}

View File

@@ -0,0 +1,164 @@
import { useEffect, useState } from 'react'
import { api } from '../services/api'
import {
Dialog,
DialogContent,
DialogDescription,
DialogFooter,
DialogHeader,
DialogTitle,
} from './ui/dialog'
import { Button } from './ui/button'
import { AlertCircle, CheckCircle2, Loader2 } from 'lucide-react'
interface CollectionVerifierProps {
tema: string | null
onVerified?: (exists: boolean) => void
}
export function CollectionVerifier({ tema, onVerified }: CollectionVerifierProps) {
const [isChecking, setIsChecking] = useState(false)
const [collectionExists, setCollectionExists] = useState<boolean | null>(null)
const [showCreateDialog, setShowCreateDialog] = useState(false)
const [isCreating, setIsCreating] = useState(false)
const [error, setError] = useState<string | null>(null)
useEffect(() => {
if (tema) {
checkCollection()
} else {
setCollectionExists(null)
}
}, [tema])
const checkCollection = async () => {
if (!tema) return
setIsChecking(true)
setError(null)
try {
const result = await api.checkCollectionExists(tema)
setCollectionExists(result.exists)
// If it does not exist, show the confirmation dialog
if (!result.exists) {
setShowCreateDialog(true)
}
onVerified?.(result.exists)
} catch (err) {
console.error('Error checking collection:', err)
setError(err instanceof Error ? err.message : 'Error al verificar colección')
setCollectionExists(null)
} finally {
setIsChecking(false)
}
}
const handleCreateCollection = async () => {
if (!tema) return
setIsCreating(true)
setError(null)
try {
const result = await api.createCollection(tema)
if (result.success) {
setCollectionExists(true)
setShowCreateDialog(false)
onVerified?.(true)
}
} catch (err) {
console.error('Error creating collection:', err)
setError(err instanceof Error ? err.message : 'Error al crear colección')
} finally {
setIsCreating(false)
}
}
const handleCancelCreate = () => {
setShowCreateDialog(false)
// Optionally, we could notify that the collection was not created
onVerified?.(false)
}
// Render nothing when no tema is selected
if (!tema) {
return null
}
return (
<>
{/* Collection status indicator */}
{isChecking ? (
<div className="flex items-center gap-2 text-sm text-gray-500 mb-4">
<Loader2 className="w-4 h-4 animate-spin" />
<span>Verificando colección...</span>
</div>
) : collectionExists === true ? (
<div className="flex items-center gap-2 text-sm text-green-600 mb-4">
<CheckCircle2 className="w-4 h-4" />
<span>Colección "{tema}" disponible en Qdrant</span>
</div>
) : collectionExists === false ? (
<div className="flex items-center gap-2 text-sm text-yellow-600 mb-4">
<AlertCircle className="w-4 h-4" />
<span>Colección "{tema}" no existe en Qdrant</span>
</div>
) : error ? (
<div className="flex items-center gap-2 text-sm text-red-600 mb-4">
<AlertCircle className="w-4 h-4" />
<span>{error}</span>
</div>
) : null}
{/* Confirmation dialog for creating the collection */}
<Dialog open={showCreateDialog} onOpenChange={setShowCreateDialog}>
<DialogContent>
<DialogHeader>
<DialogTitle>Crear colección en Qdrant</DialogTitle>
<DialogDescription>
La colección "<strong>{tema}</strong>" no existe en la base de datos vectorial.
<br />
<br />
¿Deseas crear esta colección ahora? Esto permitirá almacenar y buscar chunks de
documentos para este tema.
</DialogDescription>
</DialogHeader>
{error && (
<div className="flex items-center gap-2 text-sm text-red-600 bg-red-50 p-3 rounded">
<AlertCircle className="w-4 h-4" />
<span>{error}</span>
</div>
)}
<DialogFooter>
<Button
variant="outline"
onClick={handleCancelCreate}
disabled={isCreating}
>
Cancelar
</Button>
<Button
onClick={handleCreateCollection}
disabled={isCreating}
>
{isCreating ? (
<>
<Loader2 className="w-4 h-4 mr-2 animate-spin" />
Creando...
</>
) : (
'Crear colección'
)}
</Button>
</DialogFooter>
</DialogContent>
</Dialog>
</>
)
}
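As a usage sketch, the verifier is meant to sit under whatever control selects the tema; the parent component below is hypothetical and only illustrates the wiring:

// Hypothetical parent: re-checks the collection whenever the tema changes
function TemaSection({ selectedTema }: { selectedTema: string | null }) {
  return (
    <CollectionVerifier
      tema={selectedTema}
      onVerified={(exists) => console.log(`Collection ready: ${exists}`)}
    />
  )
}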


@@ -15,6 +15,10 @@ import { Checkbox } from '@/components/ui/checkbox'
import { FileUpload } from './FileUpload'
import { DeleteConfirmDialog } from './DeleteConfirmDialog'
import { PDFPreviewModal } from './PDFPreviewModal'
import { CollectionVerifier } from './CollectionVerifier'
import { ChunkViewerModal } from './ChunkViewerModal'
import { ChunkingConfigModal, type ChunkingConfig } from './ChunkingConfigModal'
import { ChunkPreviewPanel } from './ChunkPreviewPanel'
import {
Upload,
Download,
@@ -22,7 +26,8 @@ import {
Search,
FileText,
Eye,
MessageSquare,
Scissors
} from 'lucide-react'
export function Dashboard() {
@@ -52,6 +57,20 @@ export function Dashboard() {
const [previewFileTema, setPreviewFileTema] = useState<string | undefined>(undefined)
const [loadingPreview, setLoadingPreview] = useState(false)
// State for the chunk viewer modal
const [chunkViewerOpen, setChunkViewerOpen] = useState(false)
const [chunkFileName, setChunkFileName] = useState('')
const [chunkFileTema, setChunkFileTema] = useState('')
// State for chunking
const [chunkingConfigOpen, setChunkingConfigOpen] = useState(false)
const [chunkingFileName, setChunkingFileName] = useState('')
const [chunkingFileTema, setChunkingFileTema] = useState('')
const [chunkingCollectionName, setChunkingCollectionName] = useState('')
const [chunkPreviewOpen, setChunkPreviewOpen] = useState(false)
const [chunkingConfig, setChunkingConfig] = useState<ChunkingConfig | null>(null)
useEffect(() => {
loadFiles()
}, [selectedTema])
@@ -173,6 +192,54 @@ export function Dashboard() {
}
}
// Open the chunk viewer modal
const handleViewChunks = (filename: string, tema: string) => {
if (!tema) {
alert('No hay tema seleccionado. Por favor selecciona un tema primero.')
return
}
setChunkFileName(filename)
setChunkFileTema(tema)
setChunkViewerOpen(true)
}
// Chunking handlers
const handleStartChunking = (filename: string, tema: string) => {
if (!tema) {
alert('No hay tema seleccionado. Por favor selecciona un tema primero.')
return
}
setChunkingFileName(filename)
setChunkingFileTema(tema)
setChunkingCollectionName(tema) // Use the tema as the collection name
setChunkingConfigOpen(true)
}
const handlePreviewChunking = (config: ChunkingConfig) => {
setChunkingConfig(config)
setChunkingConfigOpen(false)
setChunkPreviewOpen(true)
}
const handleAcceptChunking = async (config: ChunkingConfig) => {
try {
const result = await api.processChunkingFull(config)
alert(`Procesamiento completado: ${result.chunks_added} chunks agregados a ${result.collection_name}`)
// Reload the file list to refresh state
loadFiles()
} catch (error) {
console.error('Error processing PDF:', error)
throw error
}
}
const handleCancelChunking = () => {
setChunkPreviewOpen(false)
setChunkingConfig(null)
// Optionally return to the configuration modal
// setChunkingConfigOpen(true)
}
const filteredFiles = files.filter(file =>
file.name.toLowerCase().includes(searchTerm.toLowerCase())
)
@@ -350,10 +417,19 @@ export function Dashboard() {
>
<Download className="w-4 h-4" />
</Button>
<Button
variant="ghost"
size="sm"
title="Procesar con chunking"
onClick={() => handleStartChunking(file.name, file.tema)}
>
<Scissors className="w-4 h-4" />
</Button>
<Button
variant="ghost"
size="sm"
title="Ver chunks"
onClick={() => handleViewChunks(file.name, file.tema)}
>
<Eye className="w-4 h-4" />
</Button>
@@ -406,6 +482,41 @@ export function Dashboard() {
fileName={previewFileName}
onDownload={handleDownloadFromPreview}
/>
{/* Collection verifier: checks/creates the collection when a tema is selected */}
<CollectionVerifier
tema={selectedTema}
onVerified={(exists) => {
console.log(`Collection ${selectedTema} exists: ${exists}`)
}}
/>
{/* Chunk Viewer Modal */}
<ChunkViewerModal
isOpen={chunkViewerOpen}
onClose={() => setChunkViewerOpen(false)}
fileName={chunkFileName}
tema={chunkFileTema}
/>
{/* Chunking configuration modal */}
<ChunkingConfigModal
isOpen={chunkingConfigOpen}
onClose={() => setChunkingConfigOpen(false)}
fileName={chunkingFileName}
tema={chunkingFileTema}
collectionName={chunkingCollectionName}
onPreview={handlePreviewChunking}
/>
{/* Chunk preview panel */}
<ChunkPreviewPanel
isOpen={chunkPreviewOpen}
onClose={() => setChunkPreviewOpen(false)}
config={chunkingConfig}
onAccept={handleAcceptChunking}
onCancel={handleCancelChunking}
/>
</div>
)
}


@@ -0,0 +1,159 @@
"use client"
import * as React from "react"
import * as SelectPrimitive from "@radix-ui/react-select"
import { Check, ChevronDown, ChevronUp } from "lucide-react"
import { cn } from "@/lib/utils"
const Select = SelectPrimitive.Root
const SelectGroup = SelectPrimitive.Group
const SelectValue = SelectPrimitive.Value
const SelectTrigger = React.forwardRef<
React.ElementRef<typeof SelectPrimitive.Trigger>,
React.ComponentPropsWithoutRef<typeof SelectPrimitive.Trigger>
>(({ className, children, ...props }, ref) => (
<SelectPrimitive.Trigger
ref={ref}
className={cn(
"flex h-9 w-full items-center justify-between whitespace-nowrap rounded-md border border-input bg-transparent px-3 py-2 text-sm shadow-sm ring-offset-background data-[placeholder]:text-muted-foreground focus:outline-none focus:ring-1 focus:ring-ring disabled:cursor-not-allowed disabled:opacity-50 [&>span]:line-clamp-1",
className
)}
{...props}
>
{children}
<SelectPrimitive.Icon asChild>
<ChevronDown className="h-4 w-4 opacity-50" />
</SelectPrimitive.Icon>
</SelectPrimitive.Trigger>
))
SelectTrigger.displayName = SelectPrimitive.Trigger.displayName
const SelectScrollUpButton = React.forwardRef<
React.ElementRef<typeof SelectPrimitive.ScrollUpButton>,
React.ComponentPropsWithoutRef<typeof SelectPrimitive.ScrollUpButton>
>(({ className, ...props }, ref) => (
<SelectPrimitive.ScrollUpButton
ref={ref}
className={cn(
"flex cursor-default items-center justify-center py-1",
className
)}
{...props}
>
<ChevronUp className="h-4 w-4" />
</SelectPrimitive.ScrollUpButton>
))
SelectScrollUpButton.displayName = SelectPrimitive.ScrollUpButton.displayName
const SelectScrollDownButton = React.forwardRef<
React.ElementRef<typeof SelectPrimitive.ScrollDownButton>,
React.ComponentPropsWithoutRef<typeof SelectPrimitive.ScrollDownButton>
>(({ className, ...props }, ref) => (
<SelectPrimitive.ScrollDownButton
ref={ref}
className={cn(
"flex cursor-default items-center justify-center py-1",
className
)}
{...props}
>
<ChevronDown className="h-4 w-4" />
</SelectPrimitive.ScrollDownButton>
))
SelectScrollDownButton.displayName =
SelectPrimitive.ScrollDownButton.displayName
const SelectContent = React.forwardRef<
React.ElementRef<typeof SelectPrimitive.Content>,
React.ComponentPropsWithoutRef<typeof SelectPrimitive.Content>
>(({ className, children, position = "popper", ...props }, ref) => (
<SelectPrimitive.Portal>
<SelectPrimitive.Content
ref={ref}
className={cn(
"relative z-50 max-h-[--radix-select-content-available-height] min-w-[8rem] overflow-y-auto overflow-x-hidden rounded-md border bg-popover text-popover-foreground shadow-md data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 origin-[--radix-select-content-transform-origin]",
position === "popper" &&
"data-[side=bottom]:translate-y-1 data-[side=left]:-translate-x-1 data-[side=right]:translate-x-1 data-[side=top]:-translate-y-1",
className
)}
position={position}
{...props}
>
<SelectScrollUpButton />
<SelectPrimitive.Viewport
className={cn(
"p-1",
position === "popper" &&
"h-[var(--radix-select-trigger-height)] w-full min-w-[var(--radix-select-trigger-width)]"
)}
>
{children}
</SelectPrimitive.Viewport>
<SelectScrollDownButton />
</SelectPrimitive.Content>
</SelectPrimitive.Portal>
))
SelectContent.displayName = SelectPrimitive.Content.displayName
const SelectLabel = React.forwardRef<
React.ElementRef<typeof SelectPrimitive.Label>,
React.ComponentPropsWithoutRef<typeof SelectPrimitive.Label>
>(({ className, ...props }, ref) => (
<SelectPrimitive.Label
ref={ref}
className={cn("px-2 py-1.5 text-sm font-semibold", className)}
{...props}
/>
))
SelectLabel.displayName = SelectPrimitive.Label.displayName
const SelectItem = React.forwardRef<
React.ElementRef<typeof SelectPrimitive.Item>,
React.ComponentPropsWithoutRef<typeof SelectPrimitive.Item>
>(({ className, children, ...props }, ref) => (
<SelectPrimitive.Item
ref={ref}
className={cn(
"relative flex w-full cursor-default select-none items-center rounded-sm py-1.5 pl-2 pr-8 text-sm outline-none focus:bg-accent focus:text-accent-foreground data-[disabled]:pointer-events-none data-[disabled]:opacity-50",
className
)}
{...props}
>
<span className="absolute right-2 flex h-3.5 w-3.5 items-center justify-center">
<SelectPrimitive.ItemIndicator>
<Check className="h-4 w-4" />
</SelectPrimitive.ItemIndicator>
</span>
<SelectPrimitive.ItemText>{children}</SelectPrimitive.ItemText>
</SelectPrimitive.Item>
))
SelectItem.displayName = SelectPrimitive.Item.displayName
const SelectSeparator = React.forwardRef<
React.ElementRef<typeof SelectPrimitive.Separator>,
React.ComponentPropsWithoutRef<typeof SelectPrimitive.Separator>
>(({ className, ...props }, ref) => (
<SelectPrimitive.Separator
ref={ref}
className={cn("-mx-1 my-1 h-px bg-muted", className)}
{...props}
/>
))
SelectSeparator.displayName = SelectPrimitive.Separator.displayName
export {
Select,
SelectGroup,
SelectValue,
SelectTrigger,
SelectContent,
SelectLabel,
SelectItem,
SelectSeparator,
SelectScrollUpButton,
SelectScrollDownButton,
}


@@ -0,0 +1,27 @@
import * as React from "react"
import * as SwitchPrimitives from "@radix-ui/react-switch"
import { cn } from "@/lib/utils"
const Switch = React.forwardRef<
React.ElementRef<typeof SwitchPrimitives.Root>,
React.ComponentPropsWithoutRef<typeof SwitchPrimitives.Root>
>(({ className, ...props }, ref) => (
<SwitchPrimitives.Root
className={cn(
"peer inline-flex h-5 w-9 shrink-0 cursor-pointer items-center rounded-full border-2 border-transparent shadow-sm transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 focus-visible:ring-offset-background disabled:cursor-not-allowed disabled:opacity-50 data-[state=checked]:bg-primary data-[state=unchecked]:bg-input",
className
)}
{...props}
ref={ref}
>
<SwitchPrimitives.Thumb
className={cn(
"pointer-events-none block h-4 w-4 rounded-full bg-background shadow-lg ring-0 transition-transform data-[state=checked]:translate-x-4 data-[state=unchecked]:translate-x-0"
)}
/>
</SwitchPrimitives.Root>
))
Switch.displayName = SwitchPrimitives.Root.displayName
export { Switch }


@@ -0,0 +1,53 @@
import * as React from "react"
import * as TabsPrimitive from "@radix-ui/react-tabs"
import { cn } from "@/lib/utils"
const Tabs = TabsPrimitive.Root
const TabsList = React.forwardRef<
React.ElementRef<typeof TabsPrimitive.List>,
React.ComponentPropsWithoutRef<typeof TabsPrimitive.List>
>(({ className, ...props }, ref) => (
<TabsPrimitive.List
ref={ref}
className={cn(
"inline-flex h-9 items-center justify-center rounded-lg bg-muted p-1 text-muted-foreground",
className
)}
{...props}
/>
))
TabsList.displayName = TabsPrimitive.List.displayName
const TabsTrigger = React.forwardRef<
React.ElementRef<typeof TabsPrimitive.Trigger>,
React.ComponentPropsWithoutRef<typeof TabsPrimitive.Trigger>
>(({ className, ...props }, ref) => (
<TabsPrimitive.Trigger
ref={ref}
className={cn(
"inline-flex items-center justify-center whitespace-nowrap rounded-md px-3 py-1 text-sm font-medium ring-offset-background transition-all focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 data-[state=active]:bg-background data-[state=active]:text-foreground data-[state=active]:shadow",
className
)}
{...props}
/>
))
TabsTrigger.displayName = TabsPrimitive.Trigger.displayName
const TabsContent = React.forwardRef<
React.ElementRef<typeof TabsPrimitive.Content>,
React.ComponentPropsWithoutRef<typeof TabsPrimitive.Content>
>(({ className, ...props }, ref) => (
<TabsPrimitive.Content
ref={ref}
className={cn(
"mt-2 ring-offset-background focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2",
className
)}
{...props}
/>
))
TabsContent.displayName = TabsPrimitive.Content.displayName
export { Tabs, TabsList, TabsTrigger, TabsContent }


@@ -0,0 +1,22 @@
import * as React from "react"
import { cn } from "@/lib/utils"
const Textarea = React.forwardRef<
HTMLTextAreaElement,
React.ComponentProps<"textarea">
>(({ className, ...props }, ref) => {
return (
<textarea
className={cn(
"flex min-h-[60px] w-full rounded-md border border-input bg-transparent px-3 py-2 text-base shadow-sm placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:cursor-not-allowed disabled:opacity-50 md:text-sm",
className
)}
ref={ref}
{...props}
/>
)
})
Textarea.displayName = "Textarea"
export { Textarea }


@@ -188,4 +188,236 @@ export const api = {
return data.url
},
// ============================================================================
// Vector Database / Qdrant Operations
// ============================================================================
// Health check for the vector database
vectorHealthCheck: async (): Promise<{ status: string; db_type: string; message: string }> => {
const response = await fetch(`${API_BASE_URL}/vectors/health`)
if (!response.ok) throw new Error('Error checking vector DB health')
return response.json()
},
// Check whether a collection exists
checkCollectionExists: async (collectionName: string): Promise<{ exists: boolean; collection_name: string }> => {
const response = await fetch(`${API_BASE_URL}/vectors/collections/exists`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ collection_name: collectionName }),
})
if (!response.ok) throw new Error('Error checking collection')
return response.json()
},
// Create a new collection
createCollection: async (
collectionName: string,
vectorSize: number = 3072,
distance: string = 'Cosine'
): Promise<{ success: boolean; collection_name: string; message: string }> => {
const response = await fetch(`${API_BASE_URL}/vectors/collections/create`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
collection_name: collectionName,
vector_size: vectorSize,
distance: distance,
}),
})
if (!response.ok) {
const error = await response.json()
throw new Error(error.detail || 'Error creating collection')
}
return response.json()
},
// Delete a collection
deleteCollection: async (collectionName: string): Promise<{ success: boolean; collection_name: string; message: string }> => {
const response = await fetch(`${API_BASE_URL}/vectors/collections/${encodeURIComponent(collectionName)}`, {
method: 'DELETE',
})
if (!response.ok) throw new Error('Error deleting collection')
return response.json()
},
// Get information about a collection
getCollectionInfo: async (collectionName: string): Promise<{
name: string
vectors_count: number
vectors_config: { size: number; distance: string }
status: string
}> => {
const response = await fetch(`${API_BASE_URL}/vectors/collections/${encodeURIComponent(collectionName)}/info`)
if (!response.ok) throw new Error('Error getting collection info')
return response.json()
},
// Check whether a file exists in a collection
checkFileExistsInCollection: async (
collectionName: string,
fileName: string
): Promise<{ exists: boolean; collection_name: string; file_name: string; chunk_count?: number }> => {
const response = await fetch(`${API_BASE_URL}/vectors/files/exists`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
collection_name: collectionName,
file_name: fileName,
}),
})
if (!response.ok) throw new Error('Error checking file in collection')
return response.json()
},
// Get the chunks of a file
getChunksByFile: async (
collectionName: string,
fileName: string,
limit?: number
): Promise<{
collection_name: string
file_name: string
chunks: Array<{ id: string; payload: any; vector?: number[] }>
total_chunks: number
}> => {
const url = limit
? `${API_BASE_URL}/vectors/collections/${encodeURIComponent(collectionName)}/files/${encodeURIComponent(fileName)}/chunks?limit=${limit}`
: `${API_BASE_URL}/vectors/collections/${encodeURIComponent(collectionName)}/files/${encodeURIComponent(fileName)}/chunks`
const response = await fetch(url)
if (!response.ok) throw new Error('Error getting chunks')
return response.json()
},
// Delete a file from a collection
deleteFileFromCollection: async (
collectionName: string,
fileName: string
): Promise<{ success: boolean; collection_name: string; file_name: string; chunks_deleted: number; message: string }> => {
const response = await fetch(
`${API_BASE_URL}/vectors/collections/${encodeURIComponent(collectionName)}/files/${encodeURIComponent(fileName)}`,
{ method: 'DELETE' }
)
if (!response.ok) throw new Error('Error deleting file from collection')
return response.json()
},
// Add chunks to a collection
addChunks: async (
collectionName: string,
chunks: Array<{ id: string; vector: number[]; payload: any }>
): Promise<{ success: boolean; collection_name: string; chunks_added: number; message: string }> => {
const response = await fetch(`${API_BASE_URL}/vectors/chunks/add`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
collection_name: collectionName,
chunks: chunks,
}),
})
if (!response.ok) throw new Error('Error adding chunks')
return response.json()
},
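// For reference, a hypothetical chunk as expected by addChunks above; the
// payload mirrors the shape read back by getChunksByFile:
//   {
//     id: 'a-uuid-string',
//     vector: [0.012, -0.034 /* one float per embedding dimension */],
//     payload: { page_content: '...', metadata: { file_name: 'doc.pdf', page: 1 } },
//   }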
// ============================================================================
// Chunking Operations
// ============================================================================
// Get the predefined chunking profiles
getChunkingProfiles: async (): Promise<{
profiles: Array<{
id: string
name: string
description: string
max_tokens: number
target_tokens: number
chunk_size: number
chunk_overlap: number
use_llm: boolean
}>
}> => {
const response = await fetch(`${API_BASE_URL}/chunking/profiles`)
if (!response.ok) throw new Error('Error fetching chunking profiles')
return response.json()
},
// Generate a chunk preview (up to 3 chunks)
generateChunkPreview: async (config: {
file_name: string
tema: string
max_tokens?: number
target_tokens?: number
chunk_size?: number
chunk_overlap?: number
use_llm?: boolean
custom_instructions?: string
}): Promise<{
success: boolean
file_name: string
tema: string
chunks: Array<{
index: number
text: string
page: number
file_name: string
tokens: number
}>
message: string
}> => {
const response = await fetch(`${API_BASE_URL}/chunking/preview`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify(config),
})
if (!response.ok) {
const error = await response.json()
throw new Error(error.detail || 'Error generating preview')
}
return response.json()
},
// Process the full PDF
processChunkingFull: async (config: {
file_name: string
tema: string
collection_name: string
max_tokens?: number
target_tokens?: number
chunk_size?: number
chunk_overlap?: number
use_llm?: boolean
custom_instructions?: string
}): Promise<{
success: boolean
collection_name: string
file_name: string
total_chunks: number
chunks_added: number
message: string
}> => {
const response = await fetch(`${API_BASE_URL}/chunking/process`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify(config),
})
if (!response.ok) {
const error = await response.json()
throw new Error(error.detail || 'Error processing PDF')
}
return response.json()
},
}
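Taken together, a minimal end-to-end sketch of the chunking flow with this client might look like the following; chunkAndIndex is a hypothetical helper and assumes the file already exists in storage under the given tema:

async function chunkAndIndex(fileName: string, tema: string) {
  // Make sure the target Qdrant collection exists before indexing
  const { exists } = await api.checkCollectionExists(tema)
  if (!exists) {
    await api.createCollection(tema) // defaults: 3072-dim vectors, Cosine distance
  }
  // Inspect a few chunks before committing to a full run
  const preview = await api.generateChunkPreview({ file_name: fileName, tema })
  console.log(`Preview returned ${preview.chunks.length} chunks`)
  // Chunk, embed, and store the whole PDF
  const result = await api.processChunkingFull({
    file_name: fileName,
    tema,
    collection_name: tema,
  })
  console.log(result.message)
}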