184 lines
5.2 KiB
Python
184 lines
5.2 KiB
Python
"""
|
|
Router para operaciones de chunking de PDFs.
|
|
Endpoints para generar preview y procesar PDFs completos.
|
|
"""
|
|
import logging
|
|
from fastapi import APIRouter, HTTPException, status
|
|
from typing import List
|
|
|
|
from ..models.chunking_models import (
|
|
ChunkingPreviewRequest,
|
|
ChunkingPreviewResponse,
|
|
ChunkingProcessRequest,
|
|
ChunkingProcessResponse,
|
|
ChunkingProfilesResponse,
|
|
ChunkingProfile,
|
|
ChunkPreview
|
|
)
|
|
from ..services.chunking_service import get_chunking_service
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
router = APIRouter(prefix="/chunking", tags=["chunking"])
|
|
|
|
|
|
# Perfiles predefinidos
|
|
CHUNKING_PROFILES = [
|
|
ChunkingProfile(
|
|
id="balanced",
|
|
name="Balanceado",
|
|
description="Configuración equilibrada entre velocidad y calidad",
|
|
max_tokens=950,
|
|
target_tokens=800,
|
|
chunk_size=1000,
|
|
chunk_overlap=200,
|
|
use_llm=True
|
|
),
|
|
ChunkingProfile(
|
|
id="detailed",
|
|
name="Detallado",
|
|
description="Chunks más grandes para mantener más contexto",
|
|
max_tokens=1500,
|
|
target_tokens=1200,
|
|
chunk_size=1500,
|
|
chunk_overlap=300,
|
|
use_llm=True
|
|
),
|
|
ChunkingProfile(
|
|
id="compact",
|
|
name="Compacto",
|
|
description="Chunks más pequeños para búsquedas precisas",
|
|
max_tokens=600,
|
|
target_tokens=500,
|
|
chunk_size=700,
|
|
chunk_overlap=150,
|
|
use_llm=True
|
|
),
|
|
ChunkingProfile(
|
|
id="fast",
|
|
name="Rápido",
|
|
description="Sin LLM, solo procesamiento básico",
|
|
max_tokens=950,
|
|
target_tokens=800,
|
|
chunk_size=1000,
|
|
chunk_overlap=200,
|
|
use_llm=False
|
|
),
|
|
]
|
|
|
|
|
|
@router.get("/profiles", response_model=ChunkingProfilesResponse)
|
|
async def get_chunking_profiles():
|
|
"""
|
|
Obtiene los perfiles de configuración predefinidos para chunking.
|
|
|
|
Returns:
|
|
Lista de perfiles disponibles
|
|
"""
|
|
return ChunkingProfilesResponse(profiles=CHUNKING_PROFILES)
|
|
|
|
|
|
@router.post("/preview", response_model=ChunkingPreviewResponse)
|
|
async def generate_preview(request: ChunkingPreviewRequest):
|
|
"""
|
|
Genera preview de chunks para un PDF (hasta 3 chunks).
|
|
|
|
Args:
|
|
request: Configuración de chunking y ubicación del archivo
|
|
|
|
Returns:
|
|
Preview con chunks de ejemplo (máximo 3, mínimo 1)
|
|
|
|
Raises:
|
|
HTTPException: Si hay error generando el preview
|
|
"""
|
|
try:
|
|
logger.info(f"Generando preview para {request.file_name} (tema: {request.tema})")
|
|
|
|
chunking_service = get_chunking_service()
|
|
|
|
chunks = await chunking_service.process_pdf_preview(
|
|
file_name=request.file_name,
|
|
tema=request.tema,
|
|
max_tokens=request.max_tokens,
|
|
target_tokens=request.target_tokens,
|
|
chunk_size=request.chunk_size,
|
|
chunk_overlap=request.chunk_overlap,
|
|
use_llm=request.use_llm,
|
|
custom_instructions=request.custom_instructions
|
|
)
|
|
|
|
# Convertir a modelos Pydantic
|
|
chunk_previews = [
|
|
ChunkPreview(
|
|
index=chunk["index"],
|
|
text=chunk["text"],
|
|
page=chunk["page"],
|
|
file_name=chunk["file_name"],
|
|
tokens=chunk["tokens"]
|
|
)
|
|
for chunk in chunks
|
|
]
|
|
|
|
return ChunkingPreviewResponse(
|
|
success=True,
|
|
file_name=request.file_name,
|
|
tema=request.tema,
|
|
chunks=chunk_previews,
|
|
message="Preview generado exitosamente"
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error generando preview: {e}")
|
|
raise HTTPException(
|
|
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
detail=f"Error generando preview: {str(e)}"
|
|
)
|
|
|
|
|
|
@router.post("/process", response_model=ChunkingProcessResponse)
|
|
async def process_pdf_full(request: ChunkingProcessRequest):
|
|
"""
|
|
Procesa un PDF completo y lo sube a Qdrant.
|
|
|
|
Este endpoint:
|
|
1. Descarga el PDF desde Azure Blob
|
|
2. Lo procesa en chunks con control de tokens
|
|
3. Genera embeddings con Azure OpenAI
|
|
4. Sube los chunks a Qdrant con IDs determinísticos
|
|
|
|
Args:
|
|
request: Configuración de chunking y destino
|
|
|
|
Returns:
|
|
Resultado del procesamiento con estadísticas
|
|
|
|
Raises:
|
|
HTTPException: Si hay error procesando el PDF
|
|
"""
|
|
try:
|
|
logger.info(f"Procesando PDF completo: {request.file_name} (tema: {request.tema})")
|
|
|
|
chunking_service = get_chunking_service()
|
|
|
|
result = await chunking_service.process_pdf_full(
|
|
file_name=request.file_name,
|
|
tema=request.tema,
|
|
collection_name=request.collection_name,
|
|
max_tokens=request.max_tokens,
|
|
target_tokens=request.target_tokens,
|
|
chunk_size=request.chunk_size,
|
|
chunk_overlap=request.chunk_overlap,
|
|
use_llm=request.use_llm,
|
|
custom_instructions=request.custom_instructions
|
|
)
|
|
|
|
return ChunkingProcessResponse(**result)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error procesando PDF: {e}", exc_info=True)
|
|
raise HTTPException(
|
|
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
detail=f"Error procesando PDF: {str(e)}"
|
|
)
|