First commit
This commit is contained in:
0
apps/index-gen/README.md
Normal file
0
apps/index-gen/README.md
Normal file
34
apps/index-gen/pyproject.toml
Normal file
34
apps/index-gen/pyproject.toml
Normal file
@@ -0,0 +1,34 @@
|
||||
# Package manifest for the index-gen application (PEP 621 metadata, built with uv).
[project]
name = "index-gen"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
authors = [
    { name = "Anibal Angulo", email = "a8065384@banorte.com" }
]
requires-python = ">=3.12"
# Every dependency below is a sibling workspace package — see [tool.uv.sources].
dependencies = [
    "chunker",
    "document-converter",
    "embedder",
    "file-storage",
    "llm",
    "utils",
    "vector-search",
]

# Console entry point: running `index-gen` invokes the Typer app in cli.py.
[project.scripts]
index-gen = "index_gen.cli:app"

[build-system]
requires = ["uv_build>=0.8.12,<0.9.0"]
build-backend = "uv_build"

# Resolve each dependency from the local uv workspace rather than an index.
[tool.uv.sources]
file-storage = { workspace = true }
vector-search = { workspace = true }
utils = { workspace = true }
embedder = { workspace = true }
chunker = { workspace = true }
document-converter = { workspace = true }
llm = { workspace = true }
2
apps/index-gen/src/index_gen/__init__.py
Normal file
2
apps/index-gen/src/index_gen/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
def main() -> None:
    """Print a short greeting identifying the index-gen package."""
    greeting = "Hello from index-gen!"
    print(greeting)
68
apps/index-gen/src/index_gen/cli.py
Normal file
68
apps/index-gen/src/index_gen/cli.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import typer
|
||||
|
||||
from index_gen.main import (
|
||||
aggregate_vectors,
|
||||
build_gcs_path,
|
||||
create_vector_index,
|
||||
gather_files,
|
||||
process_file,
|
||||
)
|
||||
from rag_eval.config import settings
|
||||
|
||||
# Typer application object; exposed as the `index-gen` console script
# via [project.scripts] in pyproject.toml.
app = typer.Typer()
@app.command()
def run_ingestion():
    """Run the full document-ingestion pipeline from the command line.

    Loads the agent and index sections from the application settings,
    converts every source PDF into markdown plus vector embeddings,
    aggregates the per-file vector artifacts into a single GCS object,
    and finally creates the vector index.
    """
    logging.basicConfig(level=logging.INFO)

    agent_config = settings.agent
    index_config = settings.index
    if not agent_config or not index_config:
        raise ValueError("Agent or index configuration not found in config.yaml")

    # Discover the source documents to ingest.
    source_files = gather_files(index_config.origin)

    # Derived GCS locations for markdown contents and vector artifacts.
    contents_output_dir = build_gcs_path(index_config.data, "/contents")
    vectors_output_dir = build_gcs_path(index_config.data, "/vectors")
    aggregated_vectors_gcs_path = build_gcs_path(
        index_config.data, "/vectors/vectors.json"
    )

    with tempfile.TemporaryDirectory() as scratch_dir:
        workdir = Path(scratch_dir)

        # One local JSONL artifact per source file.
        artifact_paths = [
            workdir / f"vectors_{idx}.jsonl" for idx in range(len(source_files))
        ]

        for source, artifact in zip(source_files, artifact_paths):
            process_file(
                source,
                agent_config.embedding_model,
                contents_output_dir,
                artifact,  # local JSONL destination for this file's vectors
                index_config.chunk_limit,
            )

        # Merge every local artifact into a single JSONL object in GCS.
        aggregate_vectors(
            vector_artifacts=artifact_paths,
            output_gcs_path=aggregated_vectors_gcs_path,
        )

    # Build and deploy the vector index over the uploaded vectors.
    create_vector_index(vectors_output_dir)
|
||||
|
||||
# Allow running the CLI directly (e.g. `python cli.py`) in addition to the
# `index-gen` console-script entry point.
if __name__ == "__main__":
    app()
225
apps/index-gen/src/index_gen/main.py
Normal file
225
apps/index-gen/src/index_gen/main.py
Normal file
@@ -0,0 +1,225 @@
|
||||
"""
|
||||
This script defines a Kubeflow Pipeline (KFP) for ingesting and processing documents.
|
||||
|
||||
The pipeline is designed to run on Vertex AI Pipelines and consists of the following steps:
|
||||
1. **Gather Files**: Scans a GCS directory for PDF files to process.
|
||||
2. **Process Files (in parallel)**: For each PDF file found, this step:
|
||||
a. Converts the PDF to Markdown text.
|
||||
b. Chunks the text if it's too long.
|
||||
c. Generates a vector embedding for each chunk using a Vertex AI embedding model.
|
||||
d. Saves the markdown content and the vector embedding to separate GCS output paths.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from rag_eval.config import settings
|
||||
|
||||
|
||||
def build_gcs_path(base_path: str, suffix: str) -> str:
    """Return *base_path* with *suffix* appended verbatim (no separator is inserted)."""
    return base_path + suffix
|
||||
|
||||
def gather_files(
    input_dir: str,
) -> list:
    """Return the ``gs://`` URIs of every PDF found under *input_dir*.

    *input_dir* must be a ``gs://bucket/prefix`` URI; the bucket is listed
    with that prefix and only blobs whose name ends in ``.pdf`` are kept.
    """
    from google.cloud import storage

    logging.getLogger().setLevel(logging.INFO)

    client = storage.Client()
    bucket_name, prefix = input_dir.replace("gs://", "").split("/", 1)
    blobs = client.bucket(bucket_name).list_blobs(prefix=prefix)

    # Keep only PDF objects, rebuilt as full gs:// URIs.
    pdf_files = []
    for blob in blobs:
        if blob.name.endswith(".pdf"):
            pdf_files.append(f"gs://{bucket_name}/{blob.name}")

    logging.info(f"Found {len(pdf_files)} PDF files in {input_dir}")
    return pdf_files
|
||||
|
||||
def process_file(
    file_path: str,
    model_name: str,
    contents_output_dir: str,
    vectors_output_file: Path,
    chunk_limit: int,
):
    """
    Processes a single PDF file: converts to markdown, chunks, and generates embeddings.

    The markdown content is uploaded to GCS under ``contents_output_dir``;
    one embedding per chunk (or a single embedding when the document fits
    within ``chunk_limit``) is written as a JSON line to ``vectors_output_file``.

    Args:
        file_path: ``gs://`` URI of the source PDF.
        model_name: identifier of the Vertex AI embedding model.
        contents_output_dir: ``gs://`` directory receiving the markdown output.
        vectors_output_file: local JSONL file receiving ``{"id", "embedding"}`` lines.
        chunk_limit: character threshold above which the markdown is chunked.

    Raises:
        Exception: any failure is logged with traceback and re-raised.
    """
    # Imports kept inside the function (historical KFP-serialization layout —
    # NOTE(review): likely safe to hoist now that this is a plain function; confirm).
    from pathlib import Path

    from chunker.contextual_chunker import ContextualChunker
    from document_converter.markdown import MarkdownConverter
    from embedder.vertex_ai import VertexAIEmbedder
    from google.cloud import storage
    from llm.vertex_ai import VertexAILLM
    from utils.normalize_filenames import normalize_string

    logging.getLogger().setLevel(logging.INFO)

    # Initialize converters and embedders
    converter = MarkdownConverter()
    embedder = VertexAIEmbedder(model_name=model_name, project=settings.project_id, location=settings.location)
    llm = VertexAILLM(project=settings.project_id, location=settings.location)
    chunker = ContextualChunker(llm_client=llm, max_chunk_size=chunk_limit)
    gcs_client = storage.Client()

    # Stable document id derived from the normalized file stem.
    file_id = normalize_string(Path(file_path).stem)
    # NOTE(review): fixed /tmp path — concurrent runs processing same-named
    # files would collide; consider tempfile instead.
    local_path = Path(f"/tmp/{Path(file_path).name}")

    # Opened in "w" mode: each call rewrites this file's artifact from scratch.
    with open(vectors_output_file, "w", encoding="utf-8") as f:
        try:
            # Download file from GCS
            bucket_name, blob_name = file_path.replace("gs://", "").split("/", 1)
            bucket = gcs_client.bucket(bucket_name)
            blob = bucket.blob(blob_name)
            blob.download_to_filename(local_path)
            logging.info(f"Processing file: {file_path}")

            # Convert the downloaded PDF to markdown text.
            markdown_content = converter.process_file(local_path)

            def upload_to_gcs(bucket_name, blob_name, data):
                # Helper: upload `data` as a UTF-8 markdown blob.
                bucket = gcs_client.bucket(bucket_name)
                blob = bucket.blob(blob_name)
                blob.upload_from_string(data, content_type="text/markdown; charset=utf-8")

            # Determine output bucket and paths for markdown
            contents_bucket_name, contents_prefix = contents_output_dir.replace(
                "gs://", ""
            ).split("/", 1)

            if len(markdown_content) > chunk_limit:
                # Document exceeds the single-embedding budget: chunk it first.
                chunks = chunker.process_text(markdown_content)
                for i, chunk in enumerate(chunks):
                    chunk_id = f"{file_id}_{i}"
                    embedding = embedder.generate_embedding(chunk["page_content"])

                    # Upload markdown chunk
                    md_blob_name = f"{contents_prefix}/{chunk_id}.md"
                    upload_to_gcs(
                        contents_bucket_name, md_blob_name, chunk["page_content"]
                    )

                    # Write vector to local JSONL file
                    json_line = json.dumps({"id": chunk_id, "embedding": embedding})
                    f.write(json_line + '\n')
            else:
                # Small document: one embedding for the whole markdown.
                embedding = embedder.generate_embedding(markdown_content)

                # Upload markdown
                md_blob_name = f"{contents_prefix}/{file_id}.md"
                upload_to_gcs(contents_bucket_name, md_blob_name, markdown_content)

                # Write vector to local JSONL file
                json_line = json.dumps({"id": file_id, "embedding": embedding})
                f.write(json_line + '\n')

        except Exception as e:
            logging.error(f"Failed to process file {file_path}: {e}", exc_info=True)
            raise

        finally:
            # Clean up the downloaded file
            if os.path.exists(local_path):
                os.remove(local_path)
|
||||
|
||||
|
||||
|
||||
|
||||
def aggregate_vectors(
    vector_artifacts: list,  # paths to the per-file JSONL artifact files
    output_gcs_path: str,
):
    """
    Aggregates multiple JSONL artifact files into a single JSONL file in GCS.

    Args:
        vector_artifacts: local paths of JSONL files, one JSON object per line.
        output_gcs_path: ``gs://bucket/path`` destination for the merged file.

    Raises:
        Exception: any read/upload failure propagates; the local temporary
            file is removed either way.
    """
    from google.cloud import storage

    logging.getLogger().setLevel(logging.INFO)

    # Concatenate every artifact into one temporary file before uploading.
    # delete=False because the file is reopened by name for the upload.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, encoding="utf-8"
    ) as temp_agg_file:
        logging.info(f"Aggregating vectors into temporary file: {temp_agg_file.name}")
        for artifact_path in vector_artifacts:
            with open(artifact_path, "r", encoding="utf-8") as f:
                # Each line is a complete JSON object
                for line in f:
                    temp_agg_file.write(line)  # line already includes newline

        temp_file_path = temp_agg_file.name

    try:
        logging.info("Uploading aggregated file to GCS...")
        gcs_client = storage.Client()
        bucket_name, blob_name = output_gcs_path.replace("gs://", "").split("/", 1)
        bucket = gcs_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.upload_from_filename(temp_file_path, content_type="application/json; charset=utf-8")

        logging.info(f"Successfully uploaded aggregated vectors to {output_gcs_path}")
    finally:
        # Always remove the temporary file — previously it leaked whenever
        # the upload raised, because removal only ran on the success path.
        # `os` comes from the module-level import; the redundant local
        # `import os` was dropped.
        os.remove(temp_file_path)
|
||||
|
||||
|
||||
def create_vector_index(
    vectors_dir: str,
):
    """Creates and deploys a Vertex AI Vector Search Index.

    Args:
        vectors_dir: ``gs://`` directory containing the aggregated vector
            file(s) the index is built from.

    Raises:
        Exception: re-raised after logging if creation or deployment fails.
    """
    from vector_search.vertex_ai import GoogleCloudVectorSearch

    from rag_eval.config import settings as config

    logging.getLogger().setLevel(logging.INFO)

    try:
        index_config = config.index

        logging.info(
            f"Initializing Vertex AI client for project '{config.project_id}' in '{config.location}'..."
        )
        vector_search = GoogleCloudVectorSearch(
            project_id=config.project_id,
            location=config.location,
            bucket=config.bucket,
            index_name=index_config.name,
        )

        # Build the index from the vectors stored under `vectors_dir`.
        logging.info(f"Starting creation of index '{index_config.name}'...")
        vector_search.create_index(
            name=index_config.name,
            content_path=vectors_dir,
            dimensions=index_config.dimensions,
        )
        logging.info(f"Index '{index_config.name}' created successfully.")

        # Deployment makes the index queryable behind an endpoint.
        logging.info("Deploying index to a new endpoint...")
        vector_search.deploy_index(
            index_name=index_config.name, machine_type=index_config.machine_type
        )
        logging.info("Index deployed successfully!")
        logging.info(f"Endpoint name: {vector_search.index_endpoint.display_name}")
        logging.info(
            f"Endpoint resource name: {vector_search.index_endpoint.resource_name}"
        )
    except Exception as e:
        logging.error(f"An error occurred during index creation or deployment: {e}", exc_info=True)
        raise
Reference in New Issue
Block a user