First commit

2026-02-22 15:25:27 +00:00
commit 35d5a65b17
70 changed files with 4298 additions and 0 deletions

apps/index-gen/README.md Normal file
View File

apps/index-gen/pyproject.toml Normal file
View File

@@ -0,0 +1,34 @@
[project]
name = "index-gen"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
authors = [
    { name = "Anibal Angulo", email = "a8065384@banorte.com" }
]
requires-python = ">=3.12"
dependencies = [
    "chunker",
    "document-converter",
    "embedder",
    "file-storage",
    "llm",
    "utils",
    "vector-search",
]

[project.scripts]
index-gen = "index_gen.cli:app"

[build-system]
requires = ["uv_build>=0.8.12,<0.9.0"]
build-backend = "uv_build"
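
# All first-party dependencies above are uv workspace members, so they resolve
# to the packages in this repository rather than to PyPI.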
[tool.uv.sources]
file-storage = { workspace = true }
vector-search = { workspace = true }
utils = { workspace = true }
embedder = { workspace = true }
chunker = { workspace = true }
document-converter = { workspace = true }
llm = { workspace = true }

apps/index-gen/src/index_gen/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
def main() -> None:
print("Hello from index-gen!")

apps/index-gen/src/index_gen/cli.py Normal file
View File

@@ -0,0 +1,68 @@
import logging
import tempfile
from pathlib import Path

import typer

from index_gen.main import (
    aggregate_vectors,
    build_gcs_path,
    create_vector_index,
    gather_files,
    process_file,
)
from rag_eval.config import settings

app = typer.Typer()


@app.command()
def run_ingestion():
    """Main function for the CLI script."""
    logging.basicConfig(level=logging.INFO)

    agent_config = settings.agent
    index_config = settings.index
    if not agent_config or not index_config:
        raise ValueError("Agent or index configuration not found in config.yaml")

    # Gather files
    files = gather_files(index_config.origin)

    # Build output paths
    contents_output_dir = build_gcs_path(index_config.data, "/contents")
    vectors_output_dir = build_gcs_path(index_config.data, "/vectors")
    aggregated_vectors_gcs_path = build_gcs_path(
        index_config.data, "/vectors/vectors.json"
    )
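    # Note: vectors.json sits under the /vectors prefix, so the directory handed
    # to create_vector_index below already contains the aggregated file.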

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_dir_path = Path(temp_dir)
        vector_artifact_paths = []

        # Process files and create local artifacts
        for i, file in enumerate(files):
            artifact_path = temp_dir_path / f"vectors_{i}.jsonl"
            vector_artifact_paths.append(artifact_path)
            process_file(
                file,
                agent_config.embedding_model,
                contents_output_dir,
                artifact_path,  # Pass the local path
                index_config.chunk_limit,
            )

        # Aggregate the local artifacts into one file in GCS
        aggregate_vectors(
            vector_artifacts=vector_artifact_paths,
            output_gcs_path=aggregated_vectors_gcs_path,
        )

        # Create vector index
        create_vector_index(vectors_output_dir)


if __name__ == "__main__":
    app()
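
Since `[project.scripts]` maps `index-gen` to `index_gen.cli:app` and the Typer app defines a single command, running `uv run index-gen` from the workspace should invoke `run_ingestion` directly (assuming `rag_eval` is importable and a populated config.yaml exists; see the hedged sketch at the end of this commit).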

apps/index-gen/src/index_gen/main.py Normal file
View File

@@ -0,0 +1,238 @@
"""
This script defines a Kubeflow Pipeline (KFP) for ingesting and processing documents.
The pipeline is designed to run on Vertex AI Pipelines and consists of the following steps:
1. **Gather Files**: Scans a GCS directory for PDF files to process.
2. **Process Files (in parallel)**: For each PDF file found, this step:
a. Converts the PDF to Markdown text.
b. Chunks the text if it's too long.
c. Generates a vector embedding for each chunk using a Vertex AI embedding model.
d. Saves the markdown content and the vector embedding to separate GCS output paths.
"""
import json
import logging
import os
import tempfile
from pathlib import Path

from rag_eval.config import settings


def build_gcs_path(base_path: str, suffix: str) -> str:
    """Builds a GCS path by appending a suffix."""
    return f"{base_path}{suffix}"


def gather_files(
    input_dir: str,
) -> list:
    """Gathers all PDF file paths from a GCS directory."""
    from google.cloud import storage

    logging.getLogger().setLevel(logging.INFO)

    gcs_client = storage.Client()
    bucket_name, prefix = input_dir.replace("gs://", "").split("/", 1)
    bucket = gcs_client.bucket(bucket_name)
    blob_list = bucket.list_blobs(prefix=prefix)
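    # list_blobs paginates transparently; the PDF filter happens client-side.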
    pdf_files = [
        f"gs://{bucket_name}/{blob.name}"
        for blob in blob_list
        if blob.name.endswith(".pdf")
    ]
    logging.info(f"Found {len(pdf_files)} PDF files in {input_dir}")
    return pdf_files


def process_file(
    file_path: str,
    model_name: str,
    contents_output_dir: str,
    vectors_output_file: Path,
    chunk_limit: int,
):
    """
    Processes a single PDF file: converts to markdown, chunks, and generates embeddings.

    The vector embeddings are written to a local JSONL file.
    """
    # Imports are inside the function as KFP serializes this function
    from pathlib import Path

    from chunker.contextual_chunker import ContextualChunker
    from document_converter.markdown import MarkdownConverter
    from embedder.vertex_ai import VertexAIEmbedder
    from google.cloud import storage
    from llm.vertex_ai import VertexAILLM
    from utils.normalize_filenames import normalize_string

    logging.getLogger().setLevel(logging.INFO)

    # Initialize converters and embedders
    converter = MarkdownConverter()
    embedder = VertexAIEmbedder(
        model_name=model_name, project=settings.project_id, location=settings.location
    )
    llm = VertexAILLM(project=settings.project_id, location=settings.location)
    chunker = ContextualChunker(llm_client=llm, max_chunk_size=chunk_limit)
    gcs_client = storage.Client()

    file_id = normalize_string(Path(file_path).stem)
    local_path = Path(f"/tmp/{Path(file_path).name}")
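    # The converter works on local files, so the blob is downloaded to /tmp
    # first and removed again in the finally block.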
    with open(vectors_output_file, "w", encoding="utf-8") as f:
        try:
            # Download file from GCS
            bucket_name, blob_name = file_path.replace("gs://", "").split("/", 1)
            bucket = gcs_client.bucket(bucket_name)
            blob = bucket.blob(blob_name)
            blob.download_to_filename(local_path)
            logging.info(f"Processing file: {file_path}")

            # Process the downloaded file
            markdown_content = converter.process_file(local_path)

            def upload_to_gcs(bucket_name, blob_name, data):
                bucket = gcs_client.bucket(bucket_name)
                blob = bucket.blob(blob_name)
                blob.upload_from_string(
                    data, content_type="text/markdown; charset=utf-8"
                )

            # Determine output bucket and paths for markdown
            contents_bucket_name, contents_prefix = contents_output_dir.replace(
                "gs://", ""
            ).split("/", 1)

            # Extract source folder from file path
            source_folder = Path(blob_name).parent.as_posix() if blob_name else ""
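            # Chunk only when the converted markdown exceeds the limit;
            # short documents are embedded whole.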
            if len(markdown_content) > chunk_limit:
                chunks = chunker.process_text(markdown_content)
                for i, chunk in enumerate(chunks):
                    chunk_id = f"{file_id}_{i}"
                    embedding = embedder.generate_embedding(chunk["page_content"])

                    # Upload markdown chunk
                    md_blob_name = f"{contents_prefix}/{chunk_id}.md"
                    upload_to_gcs(
                        contents_bucket_name, md_blob_name, chunk["page_content"]
                    )

                    # Write vector to local JSONL file with source folder
                    vector_data = {
                        "id": chunk_id,
                        "embedding": embedding,
                        "source_folder": source_folder,
                    }
                    json_line = json.dumps(vector_data)
                    f.write(json_line + "\n")
            else:
                embedding = embedder.generate_embedding(markdown_content)

                # Upload markdown
                md_blob_name = f"{contents_prefix}/{file_id}.md"
                upload_to_gcs(contents_bucket_name, md_blob_name, markdown_content)

                # Write vector to local JSONL file with source folder
                vector_data = {
                    "id": file_id,
                    "embedding": embedding,
                    "source_folder": source_folder,
                }
                json_line = json.dumps(vector_data)
                f.write(json_line + "\n")
        except Exception as e:
            logging.error(f"Failed to process file {file_path}: {e}", exc_info=True)
            raise
        finally:
            # Clean up the downloaded file
            if os.path.exists(local_path):
                os.remove(local_path)


def aggregate_vectors(
    vector_artifacts: list,  # This will be a list of paths to the artifact files
    output_gcs_path: str,
):
    """
    Aggregates multiple JSONL artifact files into a single JSONL file in GCS.
    """
    from google.cloud import storage

    logging.getLogger().setLevel(logging.INFO)

    # Create a temporary file to aggregate all vector data
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, encoding="utf-8"
    ) as temp_agg_file:
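        # delete=False keeps the file on disk after the with-block exits so it
        # can be uploaded and then removed explicitly below.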
logging.info(f"Aggregating vectors into temporary file: {temp_agg_file.name}")
for artifact_path in vector_artifacts:
with open(artifact_path, "r", encoding="utf-8") as f:
# Each line is a complete JSON object
for line in f:
temp_agg_file.write(line) # line already includes newline
temp_file_path = temp_agg_file.name
logging.info("Uploading aggregated file to GCS...")
gcs_client = storage.Client()
bucket_name, blob_name = output_gcs_path.replace("gs://", "").split("/", 1)
bucket = gcs_client.bucket(bucket_name)
blob = bucket.blob(blob_name)
blob.upload_from_filename(temp_file_path, content_type="application/json; charset=utf-8")
logging.info(f"Successfully uploaded aggregated vectors to {output_gcs_path}")
# Clean up the temporary file
import os
os.remove(temp_file_path)
def create_vector_index(
vectors_dir: str,
):
"""Creates and deploys a Vertex AI Vector Search Index."""
from vector_search.vertex_ai import GoogleCloudVectorSearch
from rag_eval.config import settings as config
logging.getLogger().setLevel(logging.INFO)
try:
index_config = config.index
logging.info(
f"Initializing Vertex AI client for project '{config.project_id}' in '{config.location}'..."
)
vector_search = GoogleCloudVectorSearch(
project_id=config.project_id,
location=config.location,
bucket=config.bucket,
index_name=index_config.name,
)
logging.info(f"Starting creation of index '{index_config.name}'...")
vector_search.create_index(
name=index_config.name,
content_path=vectors_dir,
dimensions=index_config.dimensions,
)
logging.info(f"Index '{index_config.name}' created successfully.")
logging.info("Deploying index to a new endpoint...")
vector_search.deploy_index(
index_name=index_config.name, machine_type=index_config.machine_type
)
logging.info("Index deployed successfully!")
logging.info(f"Endpoint name: {vector_search.index_endpoint.display_name}")
logging.info(
f"Endpoint resource name: {vector_search.index_endpoint.resource_name}"
)
except Exception as e:
logging.error(f"An error occurred during index creation or deployment: {e}", exc_info=True)
raise
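
For reference, a config.yaml shape that would satisfy every settings field this commit reads. This is a hedged sketch inferred from the attribute accesses above (settings.project_id, settings.location, settings.bucket, settings.agent.embedding_model, and the settings.index fields); all keys, nesting, and example values are assumptions, not a file in this commit:

# Hypothetical config.yaml, reconstructed from how settings is used above
project_id: my-gcp-project            # settings.project_id
location: us-central1                 # settings.location
bucket: my-artifacts-bucket           # settings.bucket
agent:
  embedding_model: text-embedding-005 # passed to VertexAIEmbedder
index:
  name: index-gen-demo                # display name for the Vector Search index
  origin: gs://my-bucket/raw-pdfs     # scanned for *.pdf by gather_files
  data: gs://my-bucket/index-data     # "/contents" and "/vectors" are appended
  chunk_limit: 8000                   # max characters before chunking kicks in
  dimensions: 768                     # embedding dimensionality of the model
  machine_type: e2-standard-16        # serving machine type for deploy_index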