First version of complete chunking CRUD
backend/app/routers/chunking.py (new file, 183 lines)
@@ -0,0 +1,183 @@
"""
Router for PDF chunking operations.

Endpoints to generate previews and to process full PDFs.
"""
import logging
from fastapi import APIRouter, HTTPException, status

from ..models.chunking_models import (
    ChunkingPreviewRequest,
    ChunkingPreviewResponse,
    ChunkingProcessRequest,
    ChunkingProcessResponse,
    ChunkingProfilesResponse,
    ChunkingProfile,
    ChunkPreview
)
from ..services.chunking_service import get_chunking_service

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/chunking", tags=["chunking"])

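For context, a router like this still has to be mounted on the application object. A minimal sketch, assuming the project's FastAPI app lives in backend/app/main.py (that file is not part of this commit, and the /api prefix is an assumption):

    # Hypothetical wiring in backend/app/main.py (not in this diff)
    from fastapi import FastAPI

    from .routers import chunking

    app = FastAPI()
    app.include_router(chunking.router, prefix="/api")  # serves /api/chunking/*
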
# Predefined profiles
CHUNKING_PROFILES = [
    ChunkingProfile(
        id="balanced",
        name="Balanced",
        description="Balanced trade-off between speed and quality",
        max_tokens=950,
        target_tokens=800,
        chunk_size=1000,
        chunk_overlap=200,
        use_llm=True
    ),
    ChunkingProfile(
        id="detailed",
        name="Detailed",
        description="Larger chunks to preserve more context",
        max_tokens=1500,
        target_tokens=1200,
        chunk_size=1500,
        chunk_overlap=300,
        use_llm=True
    ),
    ChunkingProfile(
        id="compact",
        name="Compact",
        description="Smaller chunks for precise searches",
        max_tokens=600,
        target_tokens=500,
        chunk_size=700,
        chunk_overlap=150,
        use_llm=True
    ),
    ChunkingProfile(
        id="fast",
        name="Fast",
        description="No LLM, basic processing only",
        max_tokens=950,
        target_tokens=800,
        chunk_size=1000,
        chunk_overlap=200,
        use_llm=False
    ),
]

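The ChunkingProfile model itself lives in ..models.chunking_models, which this commit does not include. A plausible Pydantic sketch, inferred purely from the fields used above:

    # Hypothetical shape of backend/app/models/chunking_models.py (not in this diff)
    from pydantic import BaseModel

    class ChunkingProfile(BaseModel):
        id: str
        name: str
        description: str
        max_tokens: int      # likely a hard cap on tokens per chunk
        target_tokens: int   # likely the size the splitter aims for
        chunk_size: int      # likely the character window for the base splitter
        chunk_overlap: int   # likely characters shared between consecutive chunks
        use_llm: bool        # whether an LLM pass refines chunk boundaries
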
@router.get("/profiles", response_model=ChunkingProfilesResponse)
async def get_chunking_profiles():
    """
    Returns the predefined chunking configuration profiles.

    Returns:
        List of available profiles
    """
    return ChunkingProfilesResponse(profiles=CHUNKING_PROFILES)

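Assuming the router is mounted without an extra prefix and the API runs on localhost:8000 (both assumptions), this endpoint can be exercised like so:

    # Example client call; base URL and mount point are assumptions.
    import httpx

    resp = httpx.get("http://localhost:8000/chunking/profiles")
    resp.raise_for_status()
    for profile in resp.json()["profiles"]:
        print(profile["id"], profile["max_tokens"], profile["use_llm"])
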
@router.post("/preview", response_model=ChunkingPreviewResponse)
async def generate_preview(request: ChunkingPreviewRequest):
    """
    Generates a chunk preview for a PDF (up to 3 chunks).

    Args:
        request: Chunking configuration and file location

    Returns:
        Preview with sample chunks (at most 3, at least 1)

    Raises:
        HTTPException: If preview generation fails
    """
    try:
        logger.info(f"Generating preview for {request.file_name} (tema: {request.tema})")

        chunking_service = get_chunking_service()

        chunks = await chunking_service.process_pdf_preview(
            file_name=request.file_name,
            tema=request.tema,
            max_tokens=request.max_tokens,
            target_tokens=request.target_tokens,
            chunk_size=request.chunk_size,
            chunk_overlap=request.chunk_overlap,
            use_llm=request.use_llm,
            custom_instructions=request.custom_instructions
        )

        # Convert to Pydantic models
        chunk_previews = [
            ChunkPreview(
                index=chunk["index"],
                text=chunk["text"],
                page=chunk["page"],
                file_name=chunk["file_name"],
                tokens=chunk["tokens"]
            )
            for chunk in chunks
        ]

        return ChunkingPreviewResponse(
            success=True,
            file_name=request.file_name,
            tema=request.tema,
            chunks=chunk_previews,
            message="Preview generated successfully"
        )

    except Exception as e:
        logger.error(f"Error generating preview: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error generating preview: {str(e)}"
        )

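A request body for this endpoint, with field names read off the handler above (the authoritative schema is ChunkingPreviewRequest, which is not in this diff, and the example values are made up):

    # Example preview call; payload values are illustrative only.
    import httpx

    payload = {
        "file_name": "manual.pdf",   # blob name in Azure storage (assumed)
        "tema": "normativa",         # topic used to group chunks (assumed value)
        "max_tokens": 950,
        "target_tokens": 800,
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "use_llm": True,
        "custom_instructions": None,
    }
    resp = httpx.post("http://localhost:8000/chunking/preview", json=payload, timeout=120.0)
    print(resp.json()["chunks"][0]["tokens"])
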
@router.post("/process", response_model=ChunkingProcessResponse)
async def process_pdf_full(request: ChunkingProcessRequest):
    """
    Processes a full PDF and uploads it to Qdrant.

    This endpoint:
    1. Downloads the PDF from Azure Blob
    2. Splits it into chunks with token control
    3. Generates embeddings with Azure OpenAI
    4. Uploads the chunks to Qdrant with deterministic IDs

    Args:
        request: Chunking configuration and destination

    Returns:
        Processing result with statistics

    Raises:
        HTTPException: If processing the PDF fails
    """
    try:
        logger.info(f"Processing full PDF: {request.file_name} (tema: {request.tema})")

        chunking_service = get_chunking_service()

        result = await chunking_service.process_pdf_full(
            file_name=request.file_name,
            tema=request.tema,
            collection_name=request.collection_name,
            max_tokens=request.max_tokens,
            target_tokens=request.target_tokens,
            chunk_size=request.chunk_size,
            chunk_overlap=request.chunk_overlap,
            use_llm=request.use_llm,
            custom_instructions=request.custom_instructions
        )

        return ChunkingProcessResponse(**result)

    except Exception as e:
        logger.error(f"Error processing PDF: {e}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error processing PDF: {str(e)}"
        )
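The deterministic IDs mentioned in step 4 are what make re-processing idempotent: uploading the same file twice overwrites the existing Qdrant points instead of duplicating them. The actual derivation lives in chunking_service.process_pdf_full (not in this diff); one common way to get such IDs, shown strictly as an assumption:

    # Hypothetical sketch of deterministic point IDs; not the service's real code.
    import uuid

    def chunk_point_id(file_name: str, tema: str, index: int) -> str:
        """The same (tema, file, chunk index) always yields the same UUID,
        so re-uploading a PDF upserts points rather than duplicating them."""
        return str(uuid.uuid5(uuid.NAMESPACE_URL, f"{tema}/{file_name}#{index}"))

    print(chunk_point_id("manual.pdf", "normativa", 0))  # stable across runs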