Primera versión del chunkeo completo (CRUD)
This commit is contained in:
183
backend/app/routers/chunking.py
Normal file
183
backend/app/routers/chunking.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""
|
||||
Router para operaciones de chunking de PDFs.
|
||||
Endpoints para generar preview y procesar PDFs completos.
|
||||
"""
|
||||
import logging
|
||||
from fastapi import APIRouter, HTTPException, status
|
||||
from typing import List
|
||||
|
||||
from ..models.chunking_models import (
|
||||
ChunkingPreviewRequest,
|
||||
ChunkingPreviewResponse,
|
||||
ChunkingProcessRequest,
|
||||
ChunkingProcessResponse,
|
||||
ChunkingProfilesResponse,
|
||||
ChunkingProfile,
|
||||
ChunkPreview
|
||||
)
|
||||
from ..services.chunking_service import get_chunking_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/chunking", tags=["chunking"])
|
||||
|
||||
|
||||
# Predefined chunking configuration profiles served by GET /chunking/profiles.
# max_tokens / target_tokens bound the LLM-assisted splitting; chunk_size /
# chunk_overlap are the character-based splitter settings; use_llm toggles
# LLM refinement entirely.
CHUNKING_PROFILES = [
    # Default trade-off between speed and quality
    ChunkingProfile(
        id="balanced",
        name="Balanceado",
        description="Configuración equilibrada entre velocidad y calidad",
        max_tokens=950,
        target_tokens=800,
        chunk_size=1000,
        chunk_overlap=200,
        use_llm=True
    ),
    # Larger chunks to preserve more surrounding context per chunk
    ChunkingProfile(
        id="detailed",
        name="Detallado",
        description="Chunks más grandes para mantener más contexto",
        max_tokens=1500,
        target_tokens=1200,
        chunk_size=1500,
        chunk_overlap=300,
        use_llm=True
    ),
    # Smaller chunks for more precise retrieval hits
    ChunkingProfile(
        id="compact",
        name="Compacto",
        description="Chunks más pequeños para búsquedas precisas",
        max_tokens=600,
        target_tokens=500,
        chunk_size=700,
        chunk_overlap=150,
        use_llm=True
    ),
    # Same sizing as "balanced" but skips the LLM pass for speed
    ChunkingProfile(
        id="fast",
        name="Rápido",
        description="Sin LLM, solo procesamiento básico",
        max_tokens=950,
        target_tokens=800,
        chunk_size=1000,
        chunk_overlap=200,
        use_llm=False
    ),
]
|
||||
|
||||
|
||||
@router.get("/profiles", response_model=ChunkingProfilesResponse)
async def get_chunking_profiles():
    """
    Return the predefined chunking configuration profiles.

    Returns:
        The list of available profiles wrapped in a response model.
    """
    available = list(CHUNKING_PROFILES)
    return ChunkingProfilesResponse(profiles=available)
|
||||
|
||||
|
||||
@router.post("/preview", response_model=ChunkingPreviewResponse)
async def generate_preview(request: ChunkingPreviewRequest):
    """
    Generate a chunk preview for a PDF (up to 3 chunks).

    Args:
        request: Chunking configuration and file location.

    Returns:
        Preview with sample chunks (maximum 3, minimum 1).

    Raises:
        HTTPException: Propagated from the service, or 500 on any other error.
    """
    try:
        logger.info(f"Generando preview para {request.file_name} (tema: {request.tema})")

        chunking_service = get_chunking_service()

        chunks = await chunking_service.process_pdf_preview(
            file_name=request.file_name,
            tema=request.tema,
            max_tokens=request.max_tokens,
            target_tokens=request.target_tokens,
            chunk_size=request.chunk_size,
            chunk_overlap=request.chunk_overlap,
            use_llm=request.use_llm,
            custom_instructions=request.custom_instructions
        )

        # Convert the raw chunk dicts into Pydantic models for the response
        chunk_previews = [
            ChunkPreview(
                index=chunk["index"],
                text=chunk["text"],
                page=chunk["page"],
                file_name=chunk["file_name"],
                tokens=chunk["tokens"]
            )
            for chunk in chunks
        ]

        return ChunkingPreviewResponse(
            success=True,
            file_name=request.file_name,
            tema=request.tema,
            chunks=chunk_previews,
            message="Preview generado exitosamente"
        )

    except HTTPException:
        # Let deliberate HTTP errors through unchanged instead of masking
        # them as a generic 500 (same convention as vectors.py).
        raise
    except Exception as e:
        # exc_info=True for the full traceback, consistent with process_pdf_full
        logger.error(f"Error generando preview: {e}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error generando preview: {str(e)}"
        )
|
||||
|
||||
|
||||
@router.post("/process", response_model=ChunkingProcessResponse)
async def process_pdf_full(request: ChunkingProcessRequest):
    """
    Process a complete PDF and upload it to Qdrant.

    This endpoint:
    1. Downloads the PDF from Azure Blob
    2. Splits it into chunks with token control
    3. Generates embeddings with Azure OpenAI
    4. Uploads the chunks to Qdrant with deterministic IDs

    Args:
        request: Chunking configuration and destination.

    Returns:
        Processing result with statistics.

    Raises:
        HTTPException: Propagated from the service, or 500 on any other error.
    """
    try:
        logger.info(f"Procesando PDF completo: {request.file_name} (tema: {request.tema})")

        chunking_service = get_chunking_service()

        result = await chunking_service.process_pdf_full(
            file_name=request.file_name,
            tema=request.tema,
            collection_name=request.collection_name,
            max_tokens=request.max_tokens,
            target_tokens=request.target_tokens,
            chunk_size=request.chunk_size,
            chunk_overlap=request.chunk_overlap,
            use_llm=request.use_llm,
            custom_instructions=request.custom_instructions
        )

        return ChunkingProcessResponse(**result)

    except HTTPException:
        # Let deliberate HTTP errors through unchanged instead of masking
        # them as a generic 500 (same convention as vectors.py).
        raise
    except Exception as e:
        logger.error(f"Error procesando PDF: {e}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error procesando PDF: {str(e)}"
        )
|
||||
277
backend/app/routers/vectors.py
Normal file
277
backend/app/routers/vectors.py
Normal file
@@ -0,0 +1,277 @@
|
||||
"""
|
||||
Router para endpoints de operaciones con bases de datos vectoriales.
|
||||
|
||||
Este módulo define todos los endpoints de la API relacionados con
|
||||
la gestión de colecciones y chunks en bases de datos vectoriales.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from fastapi import APIRouter, HTTPException, status, Query
|
||||
from typing import Optional
|
||||
|
||||
from app.services.vector_service import vector_service
|
||||
from app.models.vector_models import (
|
||||
CollectionExistsRequest,
|
||||
CollectionExistsResponse,
|
||||
CollectionCreateRequest,
|
||||
CollectionCreateResponse,
|
||||
CollectionDeleteResponse,
|
||||
CollectionInfoResponse,
|
||||
FileExistsInCollectionRequest,
|
||||
FileExistsInCollectionResponse,
|
||||
GetChunksByFileRequest,
|
||||
GetChunksByFileResponse,
|
||||
DeleteFileFromCollectionRequest,
|
||||
DeleteFileFromCollectionResponse,
|
||||
AddChunksRequest,
|
||||
AddChunksResponse,
|
||||
VectorDBHealthResponse,
|
||||
VectorDBErrorResponse
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(
|
||||
prefix="/vectors",
|
||||
tags=["Vectors"],
|
||||
responses={
|
||||
500: {"model": VectorDBErrorResponse, "description": "Error interno del servidor"}
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Endpoints de Health Check
|
||||
# ============================================================================
|
||||
|
||||
@router.get(
    "/health",
    response_model=VectorDBHealthResponse,
    summary="Verificar estado de la base de datos vectorial",
    description="Verifica que la conexión con la base de datos vectorial esté funcionando correctamente"
)
async def health_check():
    """Report whether the vector database connection is working."""
    try:
        result = await vector_service.health_check()
    except Exception as e:
        logger.error(f"Error en health check: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error al verificar estado de la base de datos: {str(e)}"
        )
    return result
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Endpoints de Colecciones
|
||||
# ============================================================================
|
||||
|
||||
@router.post(
    "/collections/exists",
    response_model=CollectionExistsResponse,
    summary="Verificar si una colección existe",
    description="Verifica si existe una colección con el nombre especificado"
)
async def check_collection_exists(request: CollectionExistsRequest):
    """Check whether a collection with the given name exists."""
    try:
        result = await vector_service.check_collection_exists(request.collection_name)
    except Exception as e:
        logger.error(f"Error al verificar colección: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error al verificar colección: {str(e)}"
        )
    return result
|
||||
|
||||
|
||||
@router.post(
    "/collections/create",
    response_model=CollectionCreateResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Crear una nueva colección",
    description="Crea una nueva colección en la base de datos vectorial"
)
async def create_collection(request: CollectionCreateRequest):
    """Create a new collection; 400 on validation errors, 500 otherwise."""
    try:
        created = await vector_service.create_collection(request)
    except ValueError as e:
        # Service-level validation failure maps to a client error
        logger.warning(f"Error de validación al crear colección: {e}")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(e)
        )
    except Exception as e:
        logger.error(f"Error al crear colección: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error al crear colección: {str(e)}"
        )
    return created
|
||||
|
||||
|
||||
@router.delete(
    "/collections/{collection_name}",
    response_model=CollectionDeleteResponse,
    summary="Eliminar una colección",
    description="Elimina completamente una colección y todos sus datos"
)
async def delete_collection(collection_name: str):
    """Delete an entire collection; 404 if it does not exist."""
    try:
        deleted = await vector_service.delete_collection(collection_name)
    except ValueError as e:
        # Unknown collection is reported by the service as ValueError
        logger.warning(f"Error de validación al eliminar colección: {e}")
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(e)
        )
    except Exception as e:
        logger.error(f"Error al eliminar colección: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error al eliminar colección: {str(e)}"
        )
    return deleted
|
||||
|
||||
|
||||
@router.get(
    "/collections/{collection_name}/info",
    response_model=CollectionInfoResponse,
    summary="Obtener información de una colección",
    description="Obtiene información detallada sobre una colección"
)
async def get_collection_info(collection_name: str):
    """Return detailed information about a collection; 404 when missing."""
    try:
        info = await vector_service.get_collection_info(collection_name)
    except Exception as e:
        logger.error(f"Error al obtener info de colección: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error al obtener información de colección: {str(e)}"
        )

    # The service signals "not found" with None rather than an exception;
    # raising outside the try keeps the 404 from being re-wrapped as a 500.
    if info is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Colección '{collection_name}' no encontrada"
        )

    return info
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Endpoints de Archivos en Colecciones
|
||||
# ============================================================================
|
||||
|
||||
@router.post(
    "/files/exists",
    response_model=FileExistsInCollectionResponse,
    summary="Verificar si un archivo existe en una colección",
    description="Verifica si un archivo específico existe en una colección"
)
async def check_file_exists_in_collection(request: FileExistsInCollectionRequest):
    """Check whether a specific file is present in a collection."""
    try:
        result = await vector_service.check_file_exists_in_collection(
            request.collection_name,
            request.file_name
        )
    except Exception as e:
        logger.error(f"Error al verificar archivo en colección: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error al verificar archivo: {str(e)}"
        )
    return result
|
||||
|
||||
|
||||
@router.get(
    "/collections/{collection_name}/files/{file_name}/chunks",
    response_model=GetChunksByFileResponse,
    summary="Obtener chunks de un archivo",
    description="Obtiene todos los chunks de un archivo específico en una colección"
)
async def get_chunks_by_file(
    collection_name: str,
    file_name: str,
    limit: Optional[int] = Query(None, description="Límite de chunks a retornar")
):
    """Fetch every chunk of a file in a collection, optionally capped at *limit*."""
    try:
        chunks = await vector_service.get_chunks_by_file(
            collection_name,
            file_name,
            limit
        )
    except ValueError as e:
        # Missing collection/file surfaces as ValueError from the service
        logger.warning(f"Error de validación al obtener chunks: {e}")
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(e)
        )
    except Exception as e:
        logger.error(f"Error al obtener chunks: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error al obtener chunks: {str(e)}"
        )
    return chunks
|
||||
|
||||
|
||||
@router.delete(
    "/collections/{collection_name}/files/{file_name}",
    response_model=DeleteFileFromCollectionResponse,
    summary="Eliminar un archivo de una colección",
    description="Elimina todos los chunks de un archivo de una colección"
)
async def delete_file_from_collection(collection_name: str, file_name: str):
    """Remove every chunk belonging to a file from a collection."""
    try:
        result = await vector_service.delete_file_from_collection(
            collection_name,
            file_name
        )
    except ValueError as e:
        # Missing collection/file surfaces as ValueError from the service
        logger.warning(f"Error de validación al eliminar archivo: {e}")
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(e)
        )
    except Exception as e:
        logger.error(f"Error al eliminar archivo: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error al eliminar archivo: {str(e)}"
        )
    return result
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Endpoints de Chunks
|
||||
# ============================================================================
|
||||
|
||||
@router.post(
    "/chunks/add",
    response_model=AddChunksResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Agregar chunks a una colección",
    description="Agrega múltiples chunks a una colección existente"
)
async def add_chunks(request: AddChunksRequest):
    """Insert a batch of chunks into an existing collection."""
    try:
        result = await vector_service.add_chunks(
            request.collection_name,
            request.chunks
        )
    except ValueError as e:
        # Service-level validation failure maps to a client error
        logger.warning(f"Error de validación al agregar chunks: {e}")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(e)
        )
    except Exception as e:
        logger.error(f"Error al agregar chunks: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error al agregar chunks: {str(e)}"
        )
    return result
|
||||
Reference in New Issue
Block a user