code-shredding
This commit is contained in:
2
src/knowledge_pipeline/__init__.py
Normal file
2
src/knowledge_pipeline/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
def main() -> None:
    """Package entry point: print the project greeting."""
    print("Hello from rag-eval!")
|
||||
2
src/knowledge_pipeline/chunker/__init__.py
Normal file
2
src/knowledge_pipeline/chunker/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
def hello() -> str:
    """Return the chunker package greeting string."""
    return "Hello from chunker!"
|
||||
68
src/knowledge_pipeline/chunker/base_chunker.py
Normal file
68
src/knowledge_pipeline/chunker/base_chunker.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import List, TypedDict
|
||||
|
||||
|
||||
class Document(TypedDict):
    """A dictionary representing a processed document chunk."""

    # Raw text content of the chunk.
    page_content: str
    # Arbitrary chunk metadata (e.g. chunk_index, page, source file).
    metadata: dict
|
||||
|
||||
|
||||
class BaseChunker(ABC):
    """Abstract base class for chunker implementations.

    Subclasses implement :meth:`process_text`; the path/bytes helpers decode
    input to ``str`` (UTF-8 first, latin-1 fallback) and delegate to it.
    """

    # Maximum size of a produced chunk; concrete subclasses are expected to set it.
    max_chunk_size: int

    @abstractmethod
    def process_text(self, text: str) -> List[Document]:
        """
        Processes a string of text into a list of Document chunks.

        Args:
            text: The input string to process.

        Returns:
            A list of Document objects.
        """
        ...

    def process_path(self, path: Path) -> List[Document]:
        """
        Reads a file from a Path object and processes its content.

        It attempts to read the file with UTF-8 encoding and falls back to
        latin-1 if a UnicodeDecodeError occurs.

        Args:
            path: The Path object pointing to the file.

        Returns:
            A list of Document objects from the file's content.
        """
        try:
            text = path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            text = path.read_text(encoding="latin-1")
        return self.process_text(text)

    def process_bytes(self, b: bytes) -> List[Document]:
        """
        Decodes a byte string and processes its content.

        It first attempts to decode the bytes as UTF-8. If that fails,
        it falls back to latin-1.

        Args:
            b: The input byte string.

        Returns:
            A list of Document objects from the byte string's content.
        """
        try:
            text = b.decode("utf-8")
        except UnicodeDecodeError:
            # Fallback for files that are not UTF-8 encoded.
            # BUG FIX: the previous fallback used "utf-8-sig", which is still
            # UTF-8 (it only strips a BOM) and would raise UnicodeDecodeError
            # again for any non-UTF-8 input. latin-1 decodes every byte and
            # matches both the docstring and process_path's behavior.
            text = b.decode("latin-1")
        return self.process_text(text)
|
||||
98
src/knowledge_pipeline/chunker/contextual_chunker.py
Normal file
98
src/knowledge_pipeline/chunker/contextual_chunker.py
Normal file
@@ -0,0 +1,98 @@
|
||||
from .base_chunker import BaseChunker, Document
|
||||
|
||||
|
||||
class ContextualChunker(BaseChunker):
    """Chunker that prefixes each chunk with an LLM-written document summary.

    Long texts are split into roughly even pieces (preferring paragraph and
    sentence boundaries), and each piece is prepended with a one-paragraph
    Spanish summary of the whole document produced by the configured model.
    """

    def __init__(
        self,
        model: str = "google-vertex:gemini-2.0-flash",
        max_chunk_size: int = 800,
    ):
        self.max_chunk_size = max_chunk_size
        self.model = model

    def _split_text(self, text: str) -> list[str]:
        """Split *text* into evenly sized pieces of at most max_chunk_size
        characters, preferring paragraph and sentence boundaries as cut points."""
        import math

        total = len(text)
        pieces_needed = math.ceil(total / self.max_chunk_size)
        if pieces_needed == 1:
            return [text]

        target_len = math.ceil(total / pieces_needed)

        pieces: list[str] = []
        start = 0
        while start < total:
            window_end = min(start + target_len, total)

            if window_end >= total:
                # Final piece: take everything that remains.
                cut = window_end
            else:
                # Prefer a paragraph break, then a sentence break, then a hard cut.
                para = text.rfind("\n\n", start, window_end)
                if para != -1:
                    cut = para + 2
                else:
                    sentence = text.rfind(". ", start, window_end)
                    cut = sentence + 1 if sentence != -1 else window_end

            pieces.append(text[start:cut])
            start = cut

        return pieces

    def process_text(self, text: str) -> list[Document]:
        """
        Processes a string of text into a list of context-aware Document chunks.
        """
        if len(text) <= self.max_chunk_size:
            # Short enough to stand on its own: no contextual summary needed.
            return [{"page_content": text, "metadata": {}}]

        processed: list[Document] = []
        for index, piece in enumerate(self._split_text(text)):
            prompt = f"""
Documento Original:
---
{text}
---

Fragmento Actual:
---
{piece}
---

Tarea:
Genera un resumen conciso del "Documento Original" que proporcione el contexto necesario para entender el "Fragmento Actual". El resumen debe ser un solo párrafo en español.
"""

            from pydantic_ai import ModelRequest
            from pydantic_ai.direct import model_request_sync

            reply = model_request_sync(
                self.model,
                [ModelRequest.user_text_prompt(prompt)],
            )
            # Take the first text part of the model response as the summary.
            summary = next(p.content for p in reply.parts if p.part_kind == "text")
            enriched = (
                f"> **Contexto del documento original:**\n> {summary}\n\n---\n\n"
                + piece
            )

            processed.append(
                {
                    "page_content": enriched,
                    "metadata": {"chunk_index": index},
                }
            )

        return processed
|
||||
576
src/knowledge_pipeline/chunker/llm_chunker.py
Normal file
576
src/knowledge_pipeline/chunker/llm_chunker.py
Normal file
@@ -0,0 +1,576 @@
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
from typing import Annotated, List
|
||||
|
||||
import tiktoken
|
||||
import typer
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain_core.documents import Document as LangchainDocument
|
||||
from llm.vertex_ai import VertexAILLM
|
||||
from pdf2image import convert_from_path
|
||||
from pypdf import PdfReader
|
||||
from rag_eval.config import Settings
|
||||
|
||||
from .base_chunker import BaseChunker, Document
|
||||
|
||||
|
||||
class TokenManager:
    """Counts tokens with tiktoken and truncates text to a token budget."""

    def __init__(self, model_name: str = "gpt-3.5-turbo"):
        # Unknown model names fall back to the generic cl100k_base encoding.
        try:
            self.encoding = tiktoken.encoding_for_model(model_name)
        except KeyError:
            self.encoding = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens in *text*."""
        return len(self.encoding.encode(text))

    def truncate_to_tokens(
        self, text: str, max_tokens: int, preserve_sentences: bool = True
    ) -> str:
        """Cut *text* down to at most *max_tokens* tokens.

        When preserve_sentences is True and a period falls within the last
        30% of the truncated text, the cut is moved back to that period so
        the result ends on a sentence boundary.
        """
        token_ids = self.encoding.encode(text)
        if len(token_ids) <= max_tokens:
            return text

        clipped = self.encoding.decode(token_ids[:max_tokens])

        if preserve_sentences:
            period_at = clipped.rfind(".")
            if period_at > len(clipped) * 0.7:
                return clipped[: period_at + 1]

        return clipped
|
||||
|
||||
|
||||
class OptimizedChunkProcessor:
    """Uses an LLM to merge and enhance text chunks.

    Keeps every produced chunk within a hard token budget (``max_tokens``)
    while aiming for ``target_tokens``. When a Vertex AI client is provided,
    merge decisions and chunk optimization are delegated to the model;
    otherwise cheap heuristics / truncation are used. Both LLM-backed
    operations are memoized in unbounded in-memory dicts.
    """

    def __init__(
        self,
        model: str,
        max_tokens: int = 1000,
        target_tokens: int = 800,
        chunks_per_batch: int = 5,
        gemini_client: VertexAILLM | None = None,
        model_name: str = "gpt-3.5-turbo",
        custom_instructions: str = "",
    ):
        # model: LLM identifier passed to client.generate();
        # model_name: tiktoken model used only for token counting.
        self.model = model
        self.client = gemini_client
        self.chunks_per_batch = chunks_per_batch
        self.max_tokens = max_tokens
        self.target_tokens = target_tokens
        self.token_manager = TokenManager(model_name)
        self.custom_instructions = custom_instructions
        # Caches keyed by _get_cache_key(); never evicted for the lifetime
        # of this processor instance.
        self._merge_cache = {}
        self._enhance_cache = {}

    def _get_cache_key(self, text: str) -> str:
        # 16-hex-char MD5 prefix over text + custom instructions, so changing
        # the instructions invalidates prior cached results.
        combined = text + self.custom_instructions
        return hashlib.md5(combined.encode()).hexdigest()[:16]

    def should_merge_chunks(self, chunk1: str, chunk2: str) -> bool:
        """Decide whether two consecutive chunks should be joined.

        Returns False immediately when the combined text exceeds max_tokens.
        With a client, asks the LLM for a SI/NO answer; without one, merges
        only when chunk1 does not end a sentence and the combined size fits
        within target_tokens. Any error is treated as "do not merge".
        """
        cache_key = f"{self._get_cache_key(chunk1)}_{self._get_cache_key(chunk2)}"
        if cache_key in self._merge_cache:
            return self._merge_cache[cache_key]

        try:
            combined_text = f"{chunk1}\n\n{chunk2}"
            combined_tokens = self.token_manager.count_tokens(combined_text)

            # Hard budget check before spending an LLM call.
            if combined_tokens > self.max_tokens:
                self._merge_cache[cache_key] = False
                return False

            if self.client:
                base_prompt = f"""Analiza estos dos fragmentos de texto y determina si deben unirse.

LÍMITES ESTRICTOS:
- Tokens combinados: {combined_tokens}/{self.max_tokens}
- Solo unir si hay continuidad semántica clara

Criterios de unión:
1. El primer fragmento termina abruptamente
2. El segundo fragmento continúa la misma idea/concepto
3. La unión mejora la coherencia del contenido
4. Exceder {self.max_tokens} tokens, SOLAMENTE si es necesario para mantener el contexto"""

                # Only the first 500 characters of each chunk are shown to the model.
                base_prompt += f"""

Responde SOLO 'SI' o 'NO'.

Fragmento 1 ({self.token_manager.count_tokens(chunk1)} tokens):
{chunk1[:500]}...

Fragmento 2 ({self.token_manager.count_tokens(chunk2)} tokens):
{chunk2[:500]}..."""

                response = self.client.generate(self.model, base_prompt).text
                result = response.strip().upper() == "SI"
                self._merge_cache[cache_key] = result
                return result

            # Heuristic fallback when no LLM client is configured.
            result = (
                not chunk1.rstrip().endswith((".", "!", "?"))
                and combined_tokens <= self.target_tokens
            )
            self._merge_cache[cache_key] = result
            return result

        except Exception as e:
            # Best-effort: a failed analysis must never break the pipeline.
            print(f"Error analizando chunks para merge: {e}")
            self._merge_cache[cache_key] = False
            return False

    def enhance_chunk(self, chunk_text: str) -> str:
        """Optimize a chunk with the LLM while enforcing the max_tokens budget.

        Without a client (or when the chunk is already at/over budget) the
        text is only truncated as needed. LLM output that overflows the
        budget is truncated too. Errors fall back to truncation.
        """
        cache_key = self._get_cache_key(chunk_text)
        if cache_key in self._enhance_cache:
            return self._enhance_cache[cache_key]

        current_tokens = self.token_manager.count_tokens(chunk_text)

        try:
            if self.client and current_tokens < self.max_tokens:
                base_prompt = f"""Optimiza este texto siguiendo estas reglas ESTRICTAS:

LÍMITES DE TOKENS:
- Actual: {current_tokens} tokens
- Máximo permitido: {self.max_tokens} tokens
- Objetivo: {self.target_tokens} tokens

REGLAS FUNDAMENTALES:
NO exceder {self.max_tokens} tokens bajo ninguna circunstancia
Mantener TODA la información esencial y metadatos
NO cambiar términos técnicos o palabras clave
Asegurar oraciones completas y coherentes
Optimizar claridad y estructura sin añadir contenido
SOLO devuelve el texto no agregues conclusiones NUNCA

Si el texto está cerca del límite, NO expandir. Solo mejorar estructura."""

                if self.custom_instructions.strip():
                    base_prompt += (
                        f"\n\nINSTRUCCIONES ADICIONALES:\n{self.custom_instructions}"
                    )

                base_prompt += f"\n\nTexto a optimizar:\n{chunk_text}"

                response = self.client.generate(self.model, base_prompt).text
                enhanced_text = response.strip()

                # The model may ignore the budget; enforce it here.
                enhanced_tokens = self.token_manager.count_tokens(enhanced_text)
                if enhanced_tokens > self.max_tokens:
                    print(
                        f"Advertencia: Texto optimizado excede límite ({enhanced_tokens} > {self.max_tokens})"
                    )
                    enhanced_text = self.token_manager.truncate_to_tokens(
                        enhanced_text, self.max_tokens
                    )

                self._enhance_cache[cache_key] = enhanced_text
                return enhanced_text
            else:
                # No client, or chunk already at/over budget: truncate only.
                if current_tokens > self.max_tokens:
                    truncated = self.token_manager.truncate_to_tokens(
                        chunk_text, self.max_tokens
                    )
                    self._enhance_cache[cache_key] = truncated
                    return truncated

                self._enhance_cache[cache_key] = chunk_text
                return chunk_text

        except Exception as e:
            # On failure, return the (budget-enforced) original text.
            print(f"Error procesando chunk: {e}")
            if current_tokens > self.max_tokens:
                truncated = self.token_manager.truncate_to_tokens(
                    chunk_text, self.max_tokens
                )
                self._enhance_cache[cache_key] = truncated
                return truncated

            self._enhance_cache[cache_key] = chunk_text
            return chunk_text

    def process_chunks_batch(
        self, chunks: List[LangchainDocument], merge_related: bool = False
    ) -> List[LangchainDocument]:
        """Optionally merge consecutive related chunks, then enhance each one.

        The metadata of a merged group comes from its FIRST chunk; a
        "final_tokens" entry is added to every result. Progress is printed,
        with a short pause every chunks_per_batch processed chunks.
        """
        processed_chunks = []
        total_chunks = len(chunks)

        print(f"Procesando {total_chunks} chunks en lotes de {self.chunks_per_batch}")
        if self.custom_instructions:
            print(
                f"Con instrucciones personalizadas: {self.custom_instructions[:100]}..."
            )

        i = 0
        while i < len(chunks):
            batch_start = time.time()
            current_chunk = chunks[i]
            merged_content = current_chunk.page_content
            original_tokens = self.token_manager.count_tokens(merged_content)

            if merge_related and i < len(chunks) - 1:
                # Greedily absorb following chunks while the LLM/heuristic agrees.
                merge_count = 0
                while i + merge_count < len(chunks) - 1 and self.should_merge_chunks(
                    merged_content, chunks[i + merge_count + 1].page_content
                ):
                    merge_count += 1
                    merged_content += "\n\n" + chunks[i + merge_count].page_content
                    print(f" Uniendo chunk {i + 1} con chunk {i + merge_count + 1}")

                # Skip past the chunks that were absorbed.
                i += merge_count

            print(f"\nProcesando chunk {i + 1}/{total_chunks}")
            print(f" Tokens originales: {original_tokens}")

            enhanced_content = self.enhance_chunk(merged_content)
            final_tokens = self.token_manager.count_tokens(enhanced_content)

            processed_chunks.append(
                LangchainDocument(
                    page_content=enhanced_content,
                    metadata={
                        **current_chunk.metadata,
                        "final_tokens": final_tokens,
                    },
                )
            )

            print(f" Tokens finales: {final_tokens}")
            print(f" Tiempo de procesamiento: {time.time() - batch_start:.2f}s")

            i += 1

            # Brief pause between batches to avoid hammering the backend.
            if i % self.chunks_per_batch == 0 and i < len(chunks):
                print(f"\nCompletados {i}/{total_chunks} chunks")
                time.sleep(0.1)

        return processed_chunks
|
||||
|
||||
|
||||
class LLMChunker(BaseChunker):
    """Implements a chunker that uses an LLM to optimize PDF and text content.

    PDFs are split per page with a token-aware RecursiveCharacterTextSplitter,
    pages containing special formats (tables/ASCII diagrams) are optionally
    rendered to PNG images, and every chunk is merged/enhanced through an
    OptimizedChunkProcessor.
    """

    def __init__(
        self,
        output_dir: str,
        model: str,
        max_tokens: int = 1000,
        target_tokens: int = 800,
        gemini_client: VertexAILLM | None = None,
        custom_instructions: str = "",
        extract_images: bool = True,
        max_workers: int = 4,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        merge_related: bool = True,
    ):
        """Store configuration and build the shared chunk processor.

        Args:
            output_dir: Directory where page images and outputs are written.
            model: LLM identifier for format detection and optimization.
            max_tokens: Hard per-chunk token budget.
            target_tokens: Preferred per-chunk token size.
            gemini_client: Optional Vertex AI client; None disables LLM calls.
            custom_instructions: Extra instructions for chunk optimization.
            extract_images: Render special-format pages as PNGs when True.
            max_workers: Thread pool size for detection/extraction.
            chunk_size: Base splitter chunk size (measured in tokens).
            chunk_overlap: Splitter overlap (measured in tokens).
            merge_related: Whether to merge semantically related chunks.
        """
        self.output_dir = output_dir
        self.model = model
        self.client = gemini_client
        self.max_workers = max_workers
        self.token_manager = TokenManager()
        self.custom_instructions = custom_instructions
        self.extract_images = extract_images
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.merge_related = merge_related
        # Memoizes special-format detection per chunk-content hash.
        self._format_cache = {}

        self.chunk_processor = OptimizedChunkProcessor(
            model=self.model,
            max_tokens=max_tokens,
            target_tokens=target_tokens,
            gemini_client=gemini_client,
            custom_instructions=custom_instructions,
        )

    def process_text(self, text: str) -> List[Document]:
        """Processes raw text using the LLM optimizer.

        Splits the text with a token-aware splitter, runs the merge/enhance
        pipeline, and converts the results to Document dicts.
        """
        print("\n=== Iniciando procesamiento de texto ===")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=self.token_manager.count_tokens,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        # Create dummy LangchainDocuments for compatibility with process_chunks_batch
        langchain_docs = text_splitter.create_documents([text])

        processed_docs = self.chunk_processor.process_chunks_batch(
            langchain_docs, self.merge_related
        )

        # Convert from LangchainDocument to our Document TypedDict
        final_documents: List[Document] = [
            {"page_content": doc.page_content, "metadata": doc.metadata}
            for doc in processed_docs
        ]
        print(
            f"\n=== Procesamiento de texto completado: {len(final_documents)} chunks creados ==="
        )
        return final_documents

    def process_path(self, path: Path) -> List[Document]:
        """Processes a PDF file, extracts text and images, and optimizes chunks.

        Steps: split the PDF per page, detect pages with special formats,
        render those pages as images (optional), merge/enhance all chunks,
        and attach image references to chunks whose page was rendered.
        """
        overall_start = time.time()
        print(f"\n=== Iniciando procesamiento optimizado de PDF: {path.name} ===")
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        print("\n1. Creando chunks del PDF...")
        chunks = self._create_optimized_chunks(
            str(path), self.chunk_size, self.chunk_overlap
        )
        print(f" Total chunks creados: {len(chunks)}")

        pages_to_extract = set()
        if self.extract_images:
            print("\n2. Detectando formatos especiales...")
            format_results = self.detect_special_format_batch(chunks)
            for i, has_special_format in format_results.items():
                if has_special_format:
                    # Pages are numbered from 1, so a plain truthiness check
                    # only filters out chunks without page metadata.
                    page_number = chunks[i].metadata.get("page")
                    if page_number:
                        pages_to_extract.add(page_number)
            print(f" Páginas con formato especial: {sorted(pages_to_extract)}")

        if self.extract_images and pages_to_extract:
            print(f"\n3. Extrayendo {len(pages_to_extract)} páginas como imágenes...")
            self._extract_pages_parallel(str(path), self.output_dir, pages_to_extract)

        print("\n4. Procesando y optimizando chunks...")
        processed_chunks = self.chunk_processor.process_chunks_batch(
            chunks, self.merge_related
        )

        if self.extract_images:
            final_chunks = self._add_image_references(
                processed_chunks, pages_to_extract, str(path), self.output_dir
            )
        else:
            final_chunks = processed_chunks

        total_time = time.time() - overall_start
        print(f"\n=== Procesamiento completado en {total_time:.2f}s ===")

        # Convert from LangchainDocument to our Document TypedDict
        final_documents: List[Document] = [
            {"page_content": doc.page_content, "metadata": doc.metadata}
            for doc in final_chunks
        ]
        return final_documents

    def detect_special_format_batch(
        self, chunks: List[LangchainDocument]
    ) -> dict[int, bool]:
        """Return {chunk_index: has_special_format} for all chunks.

        Cached results are reused; uncached chunks are analyzed concurrently
        when an LLM client is available, sequentially otherwise.
        """
        results = {}
        chunks_to_process = []
        for i, chunk in enumerate(chunks):
            cache_key = hashlib.md5(chunk.page_content.encode()).hexdigest()[:16]
            if cache_key in self._format_cache:
                results[i] = self._format_cache[cache_key]
            else:
                chunks_to_process.append((i, chunk, cache_key))

        if not chunks_to_process:
            return results

        if self.client and len(chunks_to_process) > 1:
            with ThreadPoolExecutor(
                max_workers=min(self.max_workers, len(chunks_to_process))
            ) as executor:
                futures = {
                    executor.submit(self._detect_single_format, chunk): (i, cache_key)
                    for i, chunk, cache_key in chunks_to_process
                }
                for future in futures:
                    i, cache_key = futures[future]
                    try:
                        result = future.result()
                        results[i] = result
                        self._format_cache[cache_key] = result
                    except Exception as e:
                        # A failed detection defaults to "no special format".
                        print(f"Error procesando chunk {i}: {e}")
                        results[i] = False
        else:
            for i, chunk, cache_key in chunks_to_process:
                result = self._detect_single_format(chunk)
                results[i] = result
                self._format_cache[cache_key] = result
        return results

    def _detect_single_format(self, chunk: LangchainDocument) -> bool:
        """Detect tables/diagrams in one chunk (LLM when available, else heuristics)."""
        if not self.client:
            # Heuristic: box-drawing/table characters or heavy column whitespace.
            content = chunk.page_content
            table_indicators = ["│", "├", "┼", "┤", "┬", "┴", "|", "+", "-"]
            has_table_chars = any(char in content for char in table_indicators)
            has_multiple_columns = content.count("\t") > 10 or content.count("  ") > 20
            return has_table_chars or has_multiple_columns
        try:
            # Only the first 1000 characters are shown to the model.
            prompt = f"""¿Contiene este texto tablas estructuradas, diagramas ASCII, o elementos que requieren formato especial?

Responde SOLO 'SI' o 'NO'.

Texto:
{chunk.page_content[:1000]}"""
            response = self.client.generate(self.model, prompt).text
            return response.strip().upper() == "SI"
        except Exception as e:
            print(f"Error detectando formato: {e}")
            return False

    def _create_optimized_chunks(
        self, pdf_path: str, chunk_size: int, chunk_overlap: int
    ) -> List[LangchainDocument]:
        """Split each PDF page into token-sized chunks with page metadata."""
        pdf = PdfReader(pdf_path)
        chunks = []
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=self.token_manager.count_tokens,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if text.strip():
                page_chunks = text_splitter.create_documents(
                    [text],
                    metadatas=[
                        {
                            "page": page_num,
                            "file_name": os.path.basename(pdf_path),
                        }
                    ],
                )
                chunks.extend(page_chunks)
        return chunks

    def _extract_pages_parallel(self, pdf_path: str, output_dir: str, pages: set):
        """Render the given PDF pages to PNG files concurrently.

        Callers must pass a non-empty *pages* set (ThreadPoolExecutor rejects
        max_workers=0).
        """

        def extract_single_page(page_number):
            try:
                pdf_filename = os.path.basename(pdf_path)
                # BUG FIX: the file name must match the one probed by
                # _add_image_references ("page_{n}_{file}.png"). The previous
                # "{n}_{file}.png" name meant the existence check there always
                # failed, so image references were never attached.
                image_path = os.path.join(
                    output_dir, f"page_{page_number}_{pdf_filename}.png"
                )
                images = convert_from_path(
                    pdf_path,
                    first_page=page_number,
                    last_page=page_number,
                    dpi=150,
                    thread_count=1,
                    grayscale=False,
                )
                if images:
                    images[0].save(image_path, "PNG", optimize=True)
            except Exception as e:
                print(f" Error extrayendo página {page_number}: {e}")

        with ThreadPoolExecutor(
            max_workers=min(self.max_workers, len(pages))
        ) as executor:
            futures = [executor.submit(extract_single_page, page) for page in pages]
            for future in futures:
                future.result()  # Wait for completion (errors are caught inside the worker)

    def _add_image_references(
        self,
        chunks: List[LangchainDocument],
        pages_to_extract: set,
        pdf_path: str,
        output_dir: str,
    ) -> List[LangchainDocument]:
        """Prepend an image reference (and metadata) to chunks whose page was rendered."""
        pdf_filename = os.path.basename(pdf_path)
        for chunk in chunks:
            page_number = chunk.metadata.get("page")
            if page_number in pages_to_extract:
                image_path = os.path.join(
                    output_dir, f"page_{page_number}_{pdf_filename}.png"
                )
                if os.path.exists(image_path):
                    image_reference = (
                        f"\n[IMAGEN DISPONIBLE - Página {page_number}: {image_path}]\n"
                    )
                    chunk.page_content = image_reference + chunk.page_content
                    chunk.metadata["has_image"] = True
                    chunk.metadata["image_path"] = image_path
        return chunks
|
||||
|
||||
|
||||
app = typer.Typer()


@app.command()
def main(
    pdf_path: Annotated[str, typer.Argument(help="Ruta al archivo PDF")],
    output_dir: Annotated[
        str, typer.Argument(help="Directorio de salida para imágenes y chunks")
    ],
    model: Annotated[
        str, typer.Option(help="Modelo a usar para el procesamiento")
    ] = "gemini-2.0-flash",
    max_tokens: Annotated[
        int, typer.Option(help="Límite máximo de tokens por chunk")
    ] = 950,
    target_tokens: Annotated[
        int, typer.Option(help="Tokens objetivo para optimización")
    ] = 800,
    chunk_size: Annotated[int, typer.Option(help="Tamaño base de chunks")] = 1000,
    chunk_overlap: Annotated[int, typer.Option(help="Solapamiento entre chunks")] = 200,
    merge_related: Annotated[
        bool, typer.Option(help="Si unir chunks relacionados")
    ] = True,
    custom_instructions: Annotated[
        str, typer.Option(help="Instrucciones adicionales para optimización")
    ] = "",
    extract_images: Annotated[
        bool,
        typer.Option(help="Si True, extrae páginas con formato especial como imágenes"),
    ] = True,
):
    """
    CLI entry point: process a PDF with full token-budget control.
    """
    # Settings here comes from rag_eval.config (not this package's config module).
    settings = Settings()
    llm = VertexAILLM(
        project=settings.project_id,
        location=settings.location,
    )

    chunker = LLMChunker(
        output_dir=output_dir,
        model=model,
        max_tokens=max_tokens,
        target_tokens=target_tokens,
        gemini_client=llm,
        custom_instructions=custom_instructions,
        extract_images=extract_images,
        max_workers=4,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        merge_related=merge_related,
    )

    documents = chunker.process_path(Path(pdf_path))
    print(f"Processed {len(documents)} documents.")

    # Persist the chunks as one JSON object per line (JSONL).
    output_file_path = os.path.join(output_dir, "chunked_documents.jsonl")
    with open(output_file_path, "w", encoding="utf-8") as f:
        for doc in documents:
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")

    print(f"Saved {len(documents)} documents to {output_file_path}")


if __name__ == "__main__":
    app()
|
||||
80
src/knowledge_pipeline/chunker/recursive_chunker.py
Normal file
80
src/knowledge_pipeline/chunker/recursive_chunker.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Annotated, List
|
||||
|
||||
import chonkie
|
||||
import typer
|
||||
|
||||
from .base_chunker import BaseChunker, Document
|
||||
|
||||
|
||||
class RecursiveChunker(BaseChunker):
    """A chunker that delegates splitting to chonkie's RecursiveChunker."""

    def __init__(self) -> None:
        """Create the underlying chonkie chunker."""
        self.processor = chonkie.RecursiveChunker()

    def process_text(self, text: str) -> List[Document]:
        """
        Processes a string of text into a list of Document chunks.

        Args:
            text: The input string to process.

        Returns:
            A list of Document objects, one per chonkie chunk, each carrying
            its positional index in the metadata.
        """
        raw_chunks = self.processor(text)
        return [
            {"page_content": piece.text, "metadata": {"chunk_index": idx}}
            for idx, piece in enumerate(raw_chunks)
        ]
|
||||
|
||||
|
||||
app = typer.Typer()


@app.command()
def main(
    input_file_path: Annotated[
        str, typer.Argument(help="Path to the input text file.")
    ],
    output_dir: Annotated[
        str, typer.Argument(help="Directory to save the output file.")
    ],
):
    """
    Processes a text file using RecursiveChunker and saves the output to a JSONL file.
    """
    print(f"Starting to process {input_file_path}...")

    # Chunk the file via the inherited path-based entry point.
    documents = RecursiveChunker().process_path(Path(input_file_path))
    print(f"Successfully created {len(documents)} chunks.")

    # Make sure the destination directory exists before writing.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    output_file_path = os.path.join(output_dir, "chunked_documents.jsonl")
    source_name = os.path.basename(input_file_path)

    # One JSON object per line, tagged with the originating file name.
    with open(output_file_path, "w", encoding="utf-8") as f:
        for doc in documents:
            doc["metadata"]["source_file"] = source_name
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")

    print(f"Successfully saved {len(documents)} chunks to {output_file_path}")


if __name__ == "__main__":
    app()
|
||||
20
src/knowledge_pipeline/cli.py
Normal file
20
src/knowledge_pipeline/cli.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import logging
|
||||
|
||||
import typer
|
||||
|
||||
from .config import Settings
|
||||
from .pipeline import run_pipeline
|
||||
|
||||
app = typer.Typer()


@app.command()
def run_ingestion():
    """CLI entry point: load settings, set the log level, and run the pipeline."""
    settings = Settings.model_validate({})
    pipeline_logger = logging.getLogger("knowledge_pipeline")
    pipeline_logger.setLevel(getattr(logging, settings.log_level))
    run_pipeline(settings)


if __name__ == "__main__":
    app()
|
||||
101
src/knowledge_pipeline/config.py
Normal file
101
src/knowledge_pipeline/config.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import os
|
||||
from functools import cached_property
|
||||
|
||||
from google.cloud.aiplatform.matching_engine.matching_engine_index_config import (
|
||||
DistanceMeasureType,
|
||||
)
|
||||
from pydantic_settings import (
|
||||
BaseSettings,
|
||||
PydanticBaseSettingsSource,
|
||||
SettingsConfigDict,
|
||||
YamlConfigSettingsSource,
|
||||
)
|
||||
|
||||
CONFIG_FILE_PATH = os.getenv("CONFIG_YAML", "config.yaml")
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Pipeline configuration loaded from environment variables and YAML.

    Environment variables take precedence over the YAML file (see
    settings_customise_sources). Constructing an instance also initializes
    the Vertex AI SDK as a side effect (model_post_init).
    """

    # GCP project/region, used for aiplatform.init and the clients below.
    project_id: str
    location: str
    log_level: str = "INFO"

    # Vertex AI embedding model name; combined with the "google-vertex:"
    # prefix in the `embedder` property.
    agent_embedding_model: str

    index_name: str
    index_dimensions: int
    index_machine_type: str = "e2-standard-16"
    # GCS locations for source documents and generated index artifacts.
    index_origin: str
    index_destination: str
    # Maximum chunk size, forwarded to ContextualChunker.
    index_chunk_limit: int
    index_distance_measure_type: DistanceMeasureType = (
        DistanceMeasureType.DOT_PRODUCT_DISTANCE
    )
    index_approximate_neighbors_count: int = 150
    index_leaf_node_embedding_count: int = 1000
    index_leaf_nodes_to_search_percent: int = 10
    index_public_endpoint_enabled: bool = True

    model_config = SettingsConfigDict(yaml_file=CONFIG_FILE_PATH)

    def model_post_init(self, _):
        # Side effect: configure the global aiplatform client as soon as
        # settings exist. Import deferred to keep module import cheap.
        from google.cloud import aiplatform

        aiplatform.init(project=self.project_id, location=self.location)

    @property
    def index_deployment(self) -> str:
        # Deployed-index IDs may not contain hyphens.
        return self.index_name.replace("-", "_") + "_deployed"

    @property
    def index_data(self) -> str:
        # NOTE(review): plain concatenation — assumes index_destination ends
        # with a path separator (e.g. "gs://bucket/prefix/"); confirm config.
        return self.index_destination + self.index_name

    @property
    def index_contents_dir(self) -> str:
        # Subdirectory for raw chunk contents.
        return f"{self.index_data}/contents"

    @property
    def index_vectors_dir(self) -> str:
        # Subdirectory for embedding vectors.
        return f"{self.index_data}/vectors"

    @property
    def index_vectors_jsonl_path(self) -> str:
        # NOTE(review): property name says "jsonl" but the file is
        # "vectors.json" — confirm which extension consumers expect.
        return f"{self.index_vectors_dir}/vectors.json"

    @cached_property
    def gcs_client(self):
        # Lazily constructed, cached GCS client.
        from google.cloud import storage

        return storage.Client()

    @cached_property
    def converter(self):
        # Lazily constructed document-to-markdown converter.
        from markitdown import MarkItDown

        return MarkItDown(enable_plugins=False)

    @cached_property
    def embedder(self):
        # Lazily constructed embedder bound to the configured Vertex model.
        from pydantic_ai import Embedder

        return Embedder(f"google-vertex:{self.agent_embedding_model}")

    @cached_property
    def chunker(self):
        # Lazily constructed chunker sized by the index chunk limit.
        from .chunker.contextual_chunker import ContextualChunker

        return ContextualChunker(max_chunk_size=self.index_chunk_limit)

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        # Only env vars and the YAML file are consulted (env wins);
        # init kwargs, .env files, and file secrets are ignored.
        return (
            env_settings,
            YamlConfigSettingsSource(settings_cls),
        )
|
||||
209
src/knowledge_pipeline/pipeline.py
Normal file
209
src/knowledge_pipeline/pipeline.py
Normal file
@@ -0,0 +1,209 @@
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
import unicodedata
|
||||
from collections.abc import Sequence
|
||||
from pathlib import Path
|
||||
|
||||
from google.cloud import aiplatform
|
||||
from google.cloud.aiplatform.matching_engine.matching_engine_index_config import (
|
||||
DistanceMeasureType,
|
||||
)
|
||||
from google.cloud.storage import Client as StorageClient
|
||||
from markitdown import MarkItDown
|
||||
from pydantic_ai import Embedder
|
||||
|
||||
from .chunker.base_chunker import BaseChunker, Document
|
||||
from .config import Settings
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _parse_gcs_uri(uri: str) -> tuple[str, str]:
|
||||
"""Parse a 'gs://bucket/path' URI into (bucket_name, object_path)."""
|
||||
bucket, _, path = uri.removeprefix("gs://").partition("/")
|
||||
return bucket, path
|
||||
|
||||
|
||||
def normalize_string(s: str) -> str:
    """Normalizes a string to be a valid filename.

    Decomposes accented characters and drops their combining marks, lowercases,
    collapses whitespace runs into underscores, and strips anything outside
    [a-z0-9_.-].
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_only = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    slug = re.sub(r"\s+", "_", ascii_only.lower())
    return re.sub(r"[^a-z0-9_.-]", "", slug)
|
||||
|
||||
|
||||
def gather_pdfs(index_origin: str, gcs_client: StorageClient) -> list[str]:
    """Lists all PDF file URIs in a GCS directory."""
    bucket_name, prefix = _parse_gcs_uri(index_origin)
    uris: list[str] = []
    for blob in gcs_client.bucket(bucket_name).list_blobs(prefix=prefix):
        # Keep only PDFs; other object types under the prefix are ignored.
        if blob.name.endswith(".pdf"):
            uris.append(f"gs://{bucket_name}/{blob.name}")
    log.info("Found %d PDF files in %s", len(uris), index_origin)
    return uris
|
||||
|
||||
|
||||
def split_into_chunks(text: str, file_id: str, chunker: BaseChunker) -> list[Document]:
    """Splits text into chunks, or returns a single chunk if small enough.

    A text that already fits within the chunker's limit becomes one Document
    whose metadata id is ``file_id``; otherwise the chunker's output is keyed
    as ``{file_id}_{i}`` in order.
    """
    if len(text) > chunker.max_chunk_size:
        pieces = chunker.process_text(text)
        for position, piece in enumerate(pieces):
            piece["metadata"]["id"] = f"{file_id}_{position}"
        return pieces
    return [{"page_content": text, "metadata": {"id": file_id}}]
|
||||
|
||||
|
||||
def upload_to_gcs(
    chunks: list[Document],
    vectors: list[dict],
    index_contents_dir: str,
    index_vectors_jsonl_path: str,
    gcs_client: StorageClient,
) -> None:
    """Uploads chunk contents and vectors to GCS.

    Each chunk's markdown is written to ``<index_contents_dir>/<id>.md`` and
    all vector records are written as one newline-delimited JSON object at
    ``index_vectors_jsonl_path``.

    Args:
        chunks: Processed document chunks; each must carry metadata["id"].
        vectors: Vector records aligned with the chunks.
        index_contents_dir: GCS URI of the directory for chunk contents.
        index_vectors_jsonl_path: GCS URI of the vectors NDJSON object.
        gcs_client: Authenticated Cloud Storage client.
    """
    contents_bucket_name, contents_prefix = _parse_gcs_uri(index_contents_dir)
    contents_bucket = gcs_client.bucket(contents_bucket_name)
    for chunk in chunks:
        chunk_id = chunk["metadata"]["id"]
        contents_bucket.blob(f"{contents_prefix}/{chunk_id}.md").upload_from_string(
            chunk["page_content"], content_type="text/markdown; charset=utf-8"
        )

    # One JSON record per line. Building with per-record trailing newlines
    # (instead of join(...) + "\n") means an empty vector list yields an empty
    # object rather than a single blank line, which is not a valid record.
    vectors_jsonl = "".join(f"{json.dumps(v)}\n" for v in vectors)
    vectors_bucket_name, vectors_obj_path = _parse_gcs_uri(index_vectors_jsonl_path)
    gcs_client.bucket(vectors_bucket_name).blob(vectors_obj_path).upload_from_string(
        vectors_jsonl, content_type="application/x-ndjson; charset=utf-8"
    )
    log.info("Uploaded %d chunks and %d vectors to GCS", len(chunks), len(vectors))
|
||||
|
||||
|
||||
def build_vectors(
    chunks: list[Document],
    embeddings: Sequence[Sequence[float]],
    source_folder: str,
) -> list[dict]:
    """Builds vector records from chunks and their embeddings.

    The record's "source" restrict namespace carries the first path component
    of ``source_folder`` (empty string when no folder is given).
    """
    if source_folder:
        source = Path(source_folder).parts[0]
    else:
        source = ""

    records: list[dict] = []
    for chunk, embedding in zip(chunks, embeddings):
        records.append(
            {
                "id": chunk["metadata"]["id"],
                "embedding": list(embedding),
                "restricts": [{"namespace": "source", "allow": [source]}],
            }
        )
    return records
|
||||
|
||||
|
||||
def create_vector_index(
    index_name: str,
    index_vectors_dir: str,
    index_dimensions: int,
    index_distance_measure_type: DistanceMeasureType,
    index_deployment: str,
    index_machine_type: str,
    approximate_neighbors_count: int,
    leaf_node_embedding_count: int,
    leaf_nodes_to_search_percent: int,
    public_endpoint_enabled: bool,
):
    """Creates and deploys a Vertex AI Vector Search Index.

    Builds a tree-AH index from the vector files under ``index_vectors_dir``,
    then creates a fresh endpoint and deploys the index onto it. Both SDK
    calls block until the corresponding long-running operations finish.
    """
    log.info("Creating index '%s'...", index_name)
    new_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
        display_name=index_name,
        contents_delta_uri=index_vectors_dir,
        dimensions=index_dimensions,
        distance_measure_type=index_distance_measure_type,
        approximate_neighbors_count=approximate_neighbors_count,
        leaf_node_embedding_count=leaf_node_embedding_count,
        leaf_nodes_to_search_percent=leaf_nodes_to_search_percent,
    )
    log.info("Index '%s' created successfully.", index_name)

    log.info("Deploying index to a new endpoint...")
    index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
        display_name=f"{index_name}-endpoint",
        public_endpoint_enabled=public_endpoint_enabled,
    )
    index_endpoint.deploy_index(
        index=new_index,
        deployed_index_id=index_deployment,
        machine_type=index_machine_type,
    )
    log.info("Index deployed: %s", index_endpoint.display_name)
|
||||
|
||||
|
||||
def process_file(
    file_uri: str,
    temp_dir: Path,
    gcs_client: StorageClient,
    converter: MarkItDown,
    embedder: Embedder,
    chunker: BaseChunker,
) -> tuple[list[Document], list[dict]]:
    """Downloads a PDF from GCS, converts to markdown, chunks, and embeds.

    Returns:
        A (chunks, vectors) pair for the given file. The locally downloaded
        copy is always removed before returning.
    """
    bucket_name, object_path = _parse_gcs_uri(file_uri)
    download_target = temp_dir / Path(file_uri).name
    gcs_client.bucket(bucket_name).blob(object_path).download_to_filename(
        download_target
    )

    try:
        # Filename stem (normalized) becomes the chunk-id prefix; the parent
        # folder inside the bucket identifies the source for index restricts.
        file_id = normalize_string(Path(file_uri).stem)
        source_folder = Path(object_path).parent.as_posix()
        markdown_text = converter.convert(download_target).text_content

        chunks = split_into_chunks(markdown_text, file_id, chunker)
        chunk_texts = [chunk["page_content"] for chunk in chunks]
        embeddings = embedder.embed_documents_sync(chunk_texts).embeddings

        return chunks, build_vectors(chunks, embeddings, source_folder)
    finally:
        # Clean up the temp copy even when conversion/embedding fails.
        if download_target.exists():
            download_target.unlink()
|
||||
|
||||
|
||||
def run_pipeline(settings: Settings):
    """Runs the full ingestion pipeline: gather → process → aggregate → index.

    Every PDF under ``settings.index_origin`` is converted, chunked, and
    embedded; the aggregated chunks/vectors are uploaded to GCS; finally a
    Vector Search index is created and deployed from the uploaded vectors.
    """
    pdf_uris = gather_pdfs(settings.index_origin, settings.gcs_client)

    collected_chunks: list[Document] = []
    collected_vectors: list[dict] = []

    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_path = Path(scratch_dir)
        for uri in pdf_uris:
            log.info("Processing file: %s", uri)
            file_chunks, file_vectors = process_file(
                uri,
                scratch_path,
                settings.gcs_client,
                settings.converter,
                settings.embedder,
                settings.chunker,
            )
            collected_chunks.extend(file_chunks)
            collected_vectors.extend(file_vectors)

    upload_to_gcs(
        collected_chunks,
        collected_vectors,
        settings.index_contents_dir,
        settings.index_vectors_jsonl_path,
        settings.gcs_client,
    )

    create_vector_index(
        settings.index_name,
        settings.index_vectors_dir,
        settings.index_dimensions,
        settings.index_distance_measure_type,
        settings.index_deployment,
        settings.index_machine_type,
        settings.index_approximate_neighbors_count,
        settings.index_leaf_node_embedding_count,
        settings.index_leaf_nodes_to_search_percent,
        settings.index_public_endpoint_enabled,
    )
|
||||
Reference in New Issue
Block a user