First commit

This commit is contained in:
Anibal Angulo
2026-02-18 19:57:43 +00:00
commit a53f8fcf62
115 changed files with 9957 additions and 0 deletions

View File

@@ -0,0 +1,225 @@
"""
This script defines a Kubeflow Pipeline (KFP) for ingesting and processing documents.
The pipeline is designed to run on Vertex AI Pipelines and consists of the following steps:
1. **Gather Files**: Scans a GCS directory for PDF files to process.
2. **Process Files (in parallel)**: For each PDF file found, this step:
a. Converts the PDF to Markdown text.
b. Chunks the text if it's too long.
c. Generates a vector embedding for each chunk using a Vertex AI embedding model.
d. Saves the markdown content and the vector embedding to separate GCS output paths.
"""
import json
import logging
import os
import tempfile
from pathlib import Path
from rag_eval.config import settings
def build_gcs_path(base_path: str, suffix: str) -> str:
    """Return *base_path* with *suffix* appended verbatim (no separator added)."""
    return base_path + suffix
def gather_files(
    input_dir: str,
) -> list:
    """Return the ``gs://`` URIs of every ``.pdf`` object under a GCS prefix.

    ``input_dir`` is expected to look like ``gs://bucket/some/prefix``.
    """
    from google.cloud import storage

    logging.getLogger().setLevel(logging.INFO)
    bucket_name, prefix = input_dir.replace("gs://", "").split("/", 1)
    client = storage.Client()
    pdf_files = []
    for blob in client.bucket(bucket_name).list_blobs(prefix=prefix):
        if blob.name.endswith(".pdf"):
            pdf_files.append(f"gs://{bucket_name}/{blob.name}")
    logging.info(f"Found {len(pdf_files)} PDF files in {input_dir}")
    return pdf_files
def process_file(
    file_path: str,
    model_name: str,
    contents_output_dir: str,
    vectors_output_file: Path,
    chunk_limit: int,
) -> None:
    """
    Processes a single PDF file: converts to markdown, chunks, and generates embeddings.
    The vector embeddings are written to a local JSONL file.

    Args:
        file_path: Full ``gs://bucket/path`` URI of the source PDF.
        model_name: Vertex AI embedding model name passed to VertexAIEmbedder.
        contents_output_dir: ``gs://`` directory prefix where markdown output
            (whole file or per-chunk) is uploaded.
        vectors_output_file: Local path (over)written as JSONL, one
            ``{"id": ..., "embedding": ...}`` object per line.
        chunk_limit: Character threshold — markdown longer than this is split
            via ContextualChunker before embedding; also its max_chunk_size.

    Raises:
        Exception: Any processing/upload failure is logged and re-raised;
            the downloaded temp file is removed either way.
    """
    # Imports are inside the function as KFP serializes this function
    from pathlib import Path
    from chunker.contextual_chunker import ContextualChunker
    from document_converter.markdown import MarkdownConverter
    from embedder.vertex_ai import VertexAIEmbedder
    from google.cloud import storage
    from llm.vertex_ai import VertexAILLM
    from utils.normalize_filenames import normalize_string
    logging.getLogger().setLevel(logging.INFO)
    # Initialize converters and embedders
    converter = MarkdownConverter()
    embedder = VertexAIEmbedder(model_name=model_name, project=settings.project_id, location=settings.location)
    llm = VertexAILLM(project=settings.project_id, location=settings.location)
    chunker = ContextualChunker(llm_client=llm, max_chunk_size=chunk_limit)
    gcs_client = storage.Client()
    # Normalized file stem used as the stable document id in blob names
    # and in the vector JSONL records.
    file_id = normalize_string(Path(file_path).stem)
    # NOTE(review): fixed /tmp location keyed only on the basename — two files
    # with the same name from different prefixes would collide; confirm inputs
    # have unique basenames.
    local_path = Path(f"/tmp/{Path(file_path).name}")
    # The JSONL handle stays open across both branches below so chunked and
    # unchunked outputs share one write path.
    with open(vectors_output_file, "w", encoding="utf-8") as f:
        try:
            # Download file from GCS
            bucket_name, blob_name = file_path.replace("gs://", "").split("/", 1)
            bucket = gcs_client.bucket(bucket_name)
            blob = bucket.blob(blob_name)
            blob.download_to_filename(local_path)
            logging.info(f"Processing file: {file_path}")
            # Process the downloaded file
            markdown_content = converter.process_file(local_path)
            def upload_to_gcs(bucket_name: str, blob_name: str, data: str) -> None:
                # Small closure over gcs_client: uploads one markdown blob.
                bucket = gcs_client.bucket(bucket_name)
                blob = bucket.blob(blob_name)
                blob.upload_from_string(data, content_type="text/markdown; charset=utf-8")
            # Determine output bucket and paths for markdown
            contents_bucket_name, contents_prefix = contents_output_dir.replace(
                "gs://", ""
            ).split("/", 1)
            if len(markdown_content) > chunk_limit:
                # Oversized document: split, then embed/upload each chunk
                # under an "<file_id>_<i>" id.
                chunks = chunker.process_text(markdown_content)
                for i, chunk in enumerate(chunks):
                    chunk_id = f"{file_id}_{i}"
                    embedding = embedder.generate_embedding(chunk["page_content"])
                    # Upload markdown chunk
                    md_blob_name = f"{contents_prefix}/{chunk_id}.md"
                    upload_to_gcs(
                        contents_bucket_name, md_blob_name, chunk["page_content"]
                    )
                    # Write vector to local JSONL file
                    json_line = json.dumps({"id": chunk_id, "embedding": embedding})
                    f.write(json_line + '\n')
            else:
                # Document fits in one chunk: single embedding keyed by file_id.
                embedding = embedder.generate_embedding(markdown_content)
                # Upload markdown
                md_blob_name = f"{contents_prefix}/{file_id}.md"
                upload_to_gcs(contents_bucket_name, md_blob_name, markdown_content)
                # Write vector to local JSONL file
                json_line = json.dumps({"id": file_id, "embedding": embedding})
                f.write(json_line + '\n')
        except Exception as e:
            logging.error(f"Failed to process file {file_path}: {e}", exc_info=True)
            raise
        finally:
            # Clean up the downloaded file
            if os.path.exists(local_path):
                os.remove(local_path)
def aggregate_vectors(
    vector_artifacts: list,  # This will be a list of paths to the artifact files
    output_gcs_path: str,
) -> None:
    """
    Aggregates multiple JSONL artifact files into a single JSONL file in GCS.

    Args:
        vector_artifacts: Local paths of JSONL files (one JSON object per
            line) produced by upstream process steps.
        output_gcs_path: Destination ``gs://bucket/path`` for the merged file.

    Raises:
        OSError: If an artifact file cannot be read.
        google.cloud.exceptions.GoogleCloudError: If the upload fails.
    """
    from google.cloud import storage
    logging.getLogger().setLevel(logging.INFO)
    # Merge everything into one local temp file first so GCS gets a single
    # upload. delete=False because we upload after the handle is closed.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, encoding="utf-8"
    ) as temp_agg_file:
        logging.info(f"Aggregating vectors into temporary file: {temp_agg_file.name}")
        for artifact_path in vector_artifacts:
            with open(artifact_path, "r", encoding="utf-8") as f:
                # Each line is a complete JSON object and already ends in '\n'.
                temp_agg_file.writelines(f)
        temp_file_path = temp_agg_file.name
    try:
        logging.info("Uploading aggregated file to GCS...")
        gcs_client = storage.Client()
        bucket_name, blob_name = output_gcs_path.replace("gs://", "").split("/", 1)
        bucket = gcs_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.upload_from_filename(temp_file_path, content_type="application/json; charset=utf-8")
        logging.info(f"Successfully uploaded aggregated vectors to {output_gcs_path}")
    finally:
        # Clean up the temporary file even when the upload raises — it was
        # created with delete=False, so nothing else removes it. `os` comes
        # from the module-level import; the previous local import was redundant.
        os.remove(temp_file_path)
def create_vector_index(
    vectors_dir: str,
):
    """Create a Vertex AI Vector Search index from a GCS vectors directory and deploy it.

    Reads project/location/bucket and the index parameters from the shared
    settings; logs and re-raises any failure.
    """
    from vector_search.vertex_ai import GoogleCloudVectorSearch
    from rag_eval.config import settings as config

    logging.getLogger().setLevel(logging.INFO)
    try:
        index_config = config.index
        logging.info(
            f"Initializing Vertex AI client for project '{config.project_id}' in '{config.location}'..."
        )
        searcher = GoogleCloudVectorSearch(
            project_id=config.project_id,
            location=config.location,
            bucket=config.bucket,
            index_name=index_config.name,
        )

        # Build the index from the JSONL vectors stored under vectors_dir.
        logging.info(f"Starting creation of index '{index_config.name}'...")
        searcher.create_index(
            name=index_config.name,
            content_path=vectors_dir,
            dimensions=index_config.dimensions,
        )
        logging.info(f"Index '{index_config.name}' created successfully.")

        # Deploy onto a fresh endpoint and report where it landed.
        logging.info("Deploying index to a new endpoint...")
        searcher.deploy_index(
            index_name=index_config.name, machine_type=index_config.machine_type
        )
        logging.info("Index deployed successfully!")
        endpoint = searcher.index_endpoint
        logging.info(f"Endpoint name: {endpoint.display_name}")
        logging.info(
            f"Endpoint resource name: {endpoint.resource_name}"
        )
    except Exception as e:
        logging.error(f"An error occurred during index creation or deployment: {e}", exc_info=True)
        raise