First commit

2026-02-22 15:25:27 +00:00
commit 35d5a65b17
70 changed files with 4298 additions and 0 deletions

apps/index-gen/README.md Normal file
View File

apps/index-gen/pyproject.toml Normal file
View File

@@ -0,0 +1,34 @@
[project]
name = "index-gen"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
authors = [
    { name = "Anibal Angulo", email = "a8065384@banorte.com" }
]
requires-python = ">=3.12"
dependencies = [
    "chunker",
    "document-converter",
    "embedder",
    "file-storage",
    "llm",
    "utils",
    "vector-search",
]

[project.scripts]
index-gen = "index_gen.cli:app"

[build-system]
requires = ["uv_build>=0.8.12,<0.9.0"]
build-backend = "uv_build"
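
# All first-party dependencies above are uv workspace members, so they resolve
# to the packages in this repository rather than to PyPI.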
[tool.uv.sources]
file-storage = { workspace = true }
vector-search = { workspace = true }
utils = { workspace = true }
embedder = { workspace = true }
chunker = { workspace = true }
document-converter = { workspace = true }
llm = { workspace = true }

apps/index-gen/src/index_gen/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
def main() -> None:
print("Hello from index-gen!")

apps/index-gen/src/index_gen/cli.py Normal file
View File

@@ -0,0 +1,68 @@
import logging
import tempfile
from pathlib import Path

import typer

from index_gen.main import (
    aggregate_vectors,
    build_gcs_path,
    create_vector_index,
    gather_files,
    process_file,
)
from rag_eval.config import settings

app = typer.Typer()


@app.command()
def run_ingestion():
    """Main function for the CLI script."""
    logging.basicConfig(level=logging.INFO)

    agent_config = settings.agent
    index_config = settings.index
    if not agent_config or not index_config:
        raise ValueError("Agent or index configuration not found in config.yaml")

    # Gather files
    files = gather_files(index_config.origin)

    # Build output paths
    contents_output_dir = build_gcs_path(index_config.data, "/contents")
    vectors_output_dir = build_gcs_path(index_config.data, "/vectors")
    aggregated_vectors_gcs_path = build_gcs_path(
        index_config.data, "/vectors/vectors.json"
    )
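    # Note: vectors.json sits under the /vectors prefix, so the directory handed
    # to create_vector_index below already contains the aggregated file.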

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_dir_path = Path(temp_dir)
        vector_artifact_paths = []

        # Process files and create local artifacts
        for i, file in enumerate(files):
            artifact_path = temp_dir_path / f"vectors_{i}.jsonl"
            vector_artifact_paths.append(artifact_path)
            process_file(
                file,
                agent_config.embedding_model,
                contents_output_dir,
                artifact_path,  # Pass the local path
                index_config.chunk_limit,
            )

        # Aggregate the local artifacts into one file in GCS
        aggregate_vectors(
            vector_artifacts=vector_artifact_paths,
            output_gcs_path=aggregated_vectors_gcs_path,
        )

        # Create vector index
        create_vector_index(vectors_output_dir)


if __name__ == "__main__":
    app()
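
Since `[project.scripts]` maps `index-gen` to `index_gen.cli:app` and the Typer app defines a single command, running `uv run index-gen` from the workspace should invoke `run_ingestion` directly (assuming `rag_eval` is importable and a populated config.yaml exists; see the hedged sketch at the end of this commit).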

apps/index-gen/src/index_gen/main.py Normal file
View File

@@ -0,0 +1,238 @@
"""
This script defines a Kubeflow Pipeline (KFP) for ingesting and processing documents.
The pipeline is designed to run on Vertex AI Pipelines and consists of the following steps:
1. **Gather Files**: Scans a GCS directory for PDF files to process.
2. **Process Files (in parallel)**: For each PDF file found, this step:
a. Converts the PDF to Markdown text.
b. Chunks the text if it's too long.
c. Generates a vector embedding for each chunk using a Vertex AI embedding model.
d. Saves the markdown content and the vector embedding to separate GCS output paths.
"""
import json
import logging
import os
import tempfile
from pathlib import Path

from rag_eval.config import settings


def build_gcs_path(base_path: str, suffix: str) -> str:
    """Builds a GCS path by appending a suffix."""
    return f"{base_path}{suffix}"


def gather_files(
    input_dir: str,
) -> list:
    """Gathers all PDF file paths from a GCS directory."""
    from google.cloud import storage

    logging.getLogger().setLevel(logging.INFO)

    gcs_client = storage.Client()
    bucket_name, prefix = input_dir.replace("gs://", "").split("/", 1)
    bucket = gcs_client.bucket(bucket_name)
    blob_list = bucket.list_blobs(prefix=prefix)
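    # list_blobs paginates transparently; the PDF filter happens client-side.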
    pdf_files = [
        f"gs://{bucket_name}/{blob.name}"
        for blob in blob_list
        if blob.name.endswith(".pdf")
    ]
    logging.info(f"Found {len(pdf_files)} PDF files in {input_dir}")
    return pdf_files


def process_file(
    file_path: str,
    model_name: str,
    contents_output_dir: str,
    vectors_output_file: Path,
    chunk_limit: int,
):
    """
    Processes a single PDF file: converts to markdown, chunks, and generates embeddings.

    The vector embeddings are written to a local JSONL file.
    """
    # Imports are inside the function as KFP serializes this function
    from pathlib import Path

    from chunker.contextual_chunker import ContextualChunker
    from document_converter.markdown import MarkdownConverter
    from embedder.vertex_ai import VertexAIEmbedder
    from google.cloud import storage
    from llm.vertex_ai import VertexAILLM
    from utils.normalize_filenames import normalize_string

    logging.getLogger().setLevel(logging.INFO)

    # Initialize converters and embedders
    converter = MarkdownConverter()
    embedder = VertexAIEmbedder(
        model_name=model_name, project=settings.project_id, location=settings.location
    )
    llm = VertexAILLM(project=settings.project_id, location=settings.location)
    chunker = ContextualChunker(llm_client=llm, max_chunk_size=chunk_limit)
    gcs_client = storage.Client()

    file_id = normalize_string(Path(file_path).stem)
    local_path = Path(f"/tmp/{Path(file_path).name}")
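    # The converter works on local files, so the blob is downloaded to /tmp
    # first and removed again in the finally block.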
    with open(vectors_output_file, "w", encoding="utf-8") as f:
        try:
            # Download file from GCS
            bucket_name, blob_name = file_path.replace("gs://", "").split("/", 1)
            bucket = gcs_client.bucket(bucket_name)
            blob = bucket.blob(blob_name)
            blob.download_to_filename(local_path)
            logging.info(f"Processing file: {file_path}")

            # Process the downloaded file
            markdown_content = converter.process_file(local_path)

            def upload_to_gcs(bucket_name, blob_name, data):
                bucket = gcs_client.bucket(bucket_name)
                blob = bucket.blob(blob_name)
                blob.upload_from_string(
                    data, content_type="text/markdown; charset=utf-8"
                )

            # Determine output bucket and paths for markdown
            contents_bucket_name, contents_prefix = contents_output_dir.replace(
                "gs://", ""
            ).split("/", 1)

            # Extract source folder from file path
            source_folder = Path(blob_name).parent.as_posix() if blob_name else ""
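            # Chunk only when the converted markdown exceeds the limit;
            # short documents are embedded whole.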
            if len(markdown_content) > chunk_limit:
                chunks = chunker.process_text(markdown_content)
                for i, chunk in enumerate(chunks):
                    chunk_id = f"{file_id}_{i}"
                    embedding = embedder.generate_embedding(chunk["page_content"])

                    # Upload markdown chunk
                    md_blob_name = f"{contents_prefix}/{chunk_id}.md"
                    upload_to_gcs(
                        contents_bucket_name, md_blob_name, chunk["page_content"]
                    )

                    # Write vector to local JSONL file with source folder
                    vector_data = {
                        "id": chunk_id,
                        "embedding": embedding,
                        "source_folder": source_folder,
                    }
                    json_line = json.dumps(vector_data)
                    f.write(json_line + "\n")
            else:
                embedding = embedder.generate_embedding(markdown_content)

                # Upload markdown
                md_blob_name = f"{contents_prefix}/{file_id}.md"
                upload_to_gcs(contents_bucket_name, md_blob_name, markdown_content)

                # Write vector to local JSONL file with source folder
                vector_data = {
                    "id": file_id,
                    "embedding": embedding,
                    "source_folder": source_folder,
                }
                json_line = json.dumps(vector_data)
                f.write(json_line + "\n")
        except Exception as e:
            logging.error(f"Failed to process file {file_path}: {e}", exc_info=True)
            raise
        finally:
            # Clean up the downloaded file
            if os.path.exists(local_path):
                os.remove(local_path)


def aggregate_vectors(
    vector_artifacts: list,  # This will be a list of paths to the artifact files
    output_gcs_path: str,
):
    """
    Aggregates multiple JSONL artifact files into a single JSONL file in GCS.
    """
    from google.cloud import storage

    logging.getLogger().setLevel(logging.INFO)

    # Create a temporary file to aggregate all vector data
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, encoding="utf-8"
    ) as temp_agg_file:
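        # delete=False keeps the file on disk after the with-block exits so it
        # can be uploaded and then removed explicitly below.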
logging.info(f"Aggregating vectors into temporary file: {temp_agg_file.name}")
for artifact_path in vector_artifacts:
with open(artifact_path, "r", encoding="utf-8") as f:
# Each line is a complete JSON object
for line in f:
temp_agg_file.write(line) # line already includes newline
temp_file_path = temp_agg_file.name
logging.info("Uploading aggregated file to GCS...")
gcs_client = storage.Client()
bucket_name, blob_name = output_gcs_path.replace("gs://", "").split("/", 1)
bucket = gcs_client.bucket(bucket_name)
blob = bucket.blob(blob_name)
blob.upload_from_filename(temp_file_path, content_type="application/json; charset=utf-8")
logging.info(f"Successfully uploaded aggregated vectors to {output_gcs_path}")
# Clean up the temporary file
import os
os.remove(temp_file_path)
def create_vector_index(
vectors_dir: str,
):
"""Creates and deploys a Vertex AI Vector Search Index."""
from vector_search.vertex_ai import GoogleCloudVectorSearch
from rag_eval.config import settings as config
logging.getLogger().setLevel(logging.INFO)
try:
index_config = config.index
logging.info(
f"Initializing Vertex AI client for project '{config.project_id}' in '{config.location}'..."
)
vector_search = GoogleCloudVectorSearch(
project_id=config.project_id,
location=config.location,
bucket=config.bucket,
index_name=index_config.name,
)
logging.info(f"Starting creation of index '{index_config.name}'...")
vector_search.create_index(
name=index_config.name,
content_path=vectors_dir,
dimensions=index_config.dimensions,
)
logging.info(f"Index '{index_config.name}' created successfully.")
logging.info("Deploying index to a new endpoint...")
vector_search.deploy_index(
index_name=index_config.name, machine_type=index_config.machine_type
)
logging.info("Index deployed successfully!")
logging.info(f"Endpoint name: {vector_search.index_endpoint.display_name}")
logging.info(
f"Endpoint resource name: {vector_search.index_endpoint.resource_name}"
)
except Exception as e:
logging.error(f"An error occurred during index creation or deployment: {e}", exc_info=True)
raise
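
For reference, a config.yaml shape that would satisfy every settings field this commit reads. This is a hedged sketch inferred from the attribute accesses above (settings.project_id, settings.location, settings.bucket, settings.agent.embedding_model, and the settings.index fields); all keys, nesting, and example values are assumptions, not a file in this commit:

# Hypothetical config.yaml, reconstructed from how settings is used above
project_id: my-gcp-project            # settings.project_id
location: us-central1                 # settings.location
bucket: my-artifacts-bucket           # settings.bucket
agent:
  embedding_model: text-embedding-005 # passed to VertexAIEmbedder
index:
  name: index-gen-demo                # display name for the Vector Search index
  origin: gs://my-bucket/raw-pdfs     # scanned for *.pdf by gather_files
  data: gs://my-bucket/index-data     # "/contents" and "/vectors" are appended
  chunk_limit: 8000                   # max characters before chunking kicks in
  dimensions: 768                     # embedding dimensionality of the model
  machine_type: e2-standard-16        # serving machine type for deploy_index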