First commit

This commit is contained in:
Anibal Angulo
2026-02-18 19:57:43 +00:00
commit a53f8fcf62
115 changed files with 9957 additions and 0 deletions

View File

@@ -0,0 +1,225 @@
"""
This script defines a Kubeflow Pipeline (KFP) for ingesting and processing documents.
The pipeline is designed to run on Vertex AI Pipelines and consists of the following steps:
1. **Gather Files**: Scans a GCS directory for PDF files to process.
2. **Process Files (in parallel)**: For each PDF file found, this step:
a. Converts the PDF to Markdown text.
b. Chunks the text if it's too long.
c. Generates a vector embedding for each chunk using a Vertex AI embedding model.
d. Saves the markdown content and the vector embedding to separate GCS output paths.
"""
import json
import logging
import os
import tempfile
from pathlib import Path
from rag_eval.config import settings
def build_gcs_path(base_path: str, suffix: str) -> str:
    """Return *base_path* with *suffix* appended verbatim (no separator added)."""
    return base_path + suffix
def gather_files(
    input_dir: str,
) -> list:
    """Return the ``gs://`` URIs of every ``.pdf`` object under a GCS prefix.

    ``input_dir`` is expected to look like ``gs://bucket/some/prefix``.
    """
    from google.cloud import storage

    logging.getLogger().setLevel(logging.INFO)
    bucket_name, prefix = input_dir.replace("gs://", "").split("/", 1)
    client = storage.Client()
    pdf_files = []
    for blob in client.bucket(bucket_name).list_blobs(prefix=prefix):
        if blob.name.endswith(".pdf"):
            pdf_files.append(f"gs://{bucket_name}/{blob.name}")
    logging.info(f"Found {len(pdf_files)} PDF files in {input_dir}")
    return pdf_files
def process_file(
    file_path: str,
    model_name: str,
    contents_output_dir: str,
    vectors_output_file: Path,
    chunk_limit: int,
) -> None:
    """
    Processes a single PDF file: converts to markdown, chunks, and generates embeddings.
    The vector embeddings are written to a local JSONL file.

    Args:
        file_path: Full ``gs://bucket/path`` URI of the source PDF.
        model_name: Vertex AI embedding model name passed to VertexAIEmbedder.
        contents_output_dir: ``gs://`` directory prefix where markdown output
            (whole file or per-chunk) is uploaded.
        vectors_output_file: Local path (over)written as JSONL, one
            ``{"id": ..., "embedding": ...}`` object per line.
        chunk_limit: Character threshold — markdown longer than this is split
            via ContextualChunker before embedding; also its max_chunk_size.

    Raises:
        Exception: Any processing/upload failure is logged and re-raised;
            the downloaded temp file is removed either way.
    """
    # Imports are inside the function as KFP serializes this function
    from pathlib import Path
    from chunker.contextual_chunker import ContextualChunker
    from document_converter.markdown import MarkdownConverter
    from embedder.vertex_ai import VertexAIEmbedder
    from google.cloud import storage
    from llm.vertex_ai import VertexAILLM
    from utils.normalize_filenames import normalize_string
    logging.getLogger().setLevel(logging.INFO)
    # Initialize converters and embedders
    converter = MarkdownConverter()
    embedder = VertexAIEmbedder(model_name=model_name, project=settings.project_id, location=settings.location)
    llm = VertexAILLM(project=settings.project_id, location=settings.location)
    chunker = ContextualChunker(llm_client=llm, max_chunk_size=chunk_limit)
    gcs_client = storage.Client()
    # Normalized file stem used as the stable document id in blob names
    # and in the vector JSONL records.
    file_id = normalize_string(Path(file_path).stem)
    # NOTE(review): fixed /tmp location keyed only on the basename — two files
    # with the same name from different prefixes would collide; confirm inputs
    # have unique basenames.
    local_path = Path(f"/tmp/{Path(file_path).name}")
    # The JSONL handle stays open across both branches below so chunked and
    # unchunked outputs share one write path.
    with open(vectors_output_file, "w", encoding="utf-8") as f:
        try:
            # Download file from GCS
            bucket_name, blob_name = file_path.replace("gs://", "").split("/", 1)
            bucket = gcs_client.bucket(bucket_name)
            blob = bucket.blob(blob_name)
            blob.download_to_filename(local_path)
            logging.info(f"Processing file: {file_path}")
            # Process the downloaded file
            markdown_content = converter.process_file(local_path)
            def upload_to_gcs(bucket_name: str, blob_name: str, data: str) -> None:
                # Small closure over gcs_client: uploads one markdown blob.
                bucket = gcs_client.bucket(bucket_name)
                blob = bucket.blob(blob_name)
                blob.upload_from_string(data, content_type="text/markdown; charset=utf-8")
            # Determine output bucket and paths for markdown
            contents_bucket_name, contents_prefix = contents_output_dir.replace(
                "gs://", ""
            ).split("/", 1)
            if len(markdown_content) > chunk_limit:
                # Oversized document: split, then embed/upload each chunk
                # under an "<file_id>_<i>" id.
                chunks = chunker.process_text(markdown_content)
                for i, chunk in enumerate(chunks):
                    chunk_id = f"{file_id}_{i}"
                    embedding = embedder.generate_embedding(chunk["page_content"])
                    # Upload markdown chunk
                    md_blob_name = f"{contents_prefix}/{chunk_id}.md"
                    upload_to_gcs(
                        contents_bucket_name, md_blob_name, chunk["page_content"]
                    )
                    # Write vector to local JSONL file
                    json_line = json.dumps({"id": chunk_id, "embedding": embedding})
                    f.write(json_line + '\n')
            else:
                # Document fits in one chunk: single embedding keyed by file_id.
                embedding = embedder.generate_embedding(markdown_content)
                # Upload markdown
                md_blob_name = f"{contents_prefix}/{file_id}.md"
                upload_to_gcs(contents_bucket_name, md_blob_name, markdown_content)
                # Write vector to local JSONL file
                json_line = json.dumps({"id": file_id, "embedding": embedding})
                f.write(json_line + '\n')
        except Exception as e:
            logging.error(f"Failed to process file {file_path}: {e}", exc_info=True)
            raise
        finally:
            # Clean up the downloaded file
            if os.path.exists(local_path):
                os.remove(local_path)
def aggregate_vectors(
    vector_artifacts: list,  # This will be a list of paths to the artifact files
    output_gcs_path: str,
) -> None:
    """
    Aggregates multiple JSONL artifact files into a single JSONL file in GCS.

    Args:
        vector_artifacts: Local paths of JSONL files (one JSON object per
            line) produced by upstream process steps.
        output_gcs_path: Destination ``gs://bucket/path`` for the merged file.

    Raises:
        OSError: If an artifact file cannot be read.
        google.cloud.exceptions.GoogleCloudError: If the upload fails.
    """
    from google.cloud import storage
    logging.getLogger().setLevel(logging.INFO)
    # Merge everything into one local temp file first so GCS gets a single
    # upload. delete=False because we upload after the handle is closed.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, encoding="utf-8"
    ) as temp_agg_file:
        logging.info(f"Aggregating vectors into temporary file: {temp_agg_file.name}")
        for artifact_path in vector_artifacts:
            with open(artifact_path, "r", encoding="utf-8") as f:
                # Each line is a complete JSON object and already ends in '\n'.
                temp_agg_file.writelines(f)
        temp_file_path = temp_agg_file.name
    try:
        logging.info("Uploading aggregated file to GCS...")
        gcs_client = storage.Client()
        bucket_name, blob_name = output_gcs_path.replace("gs://", "").split("/", 1)
        bucket = gcs_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.upload_from_filename(temp_file_path, content_type="application/json; charset=utf-8")
        logging.info(f"Successfully uploaded aggregated vectors to {output_gcs_path}")
    finally:
        # Clean up the temporary file even when the upload raises — it was
        # created with delete=False, so nothing else removes it. `os` comes
        # from the module-level import; the previous local import was redundant.
        os.remove(temp_file_path)
def create_vector_index(
    vectors_dir: str,
):
    """Create a Vertex AI Vector Search index from a GCS vectors directory and deploy it.

    Reads project/location/bucket and the index parameters from the shared
    settings; logs and re-raises any failure.
    """
    from vector_search.vertex_ai import GoogleCloudVectorSearch
    from rag_eval.config import settings as config

    logging.getLogger().setLevel(logging.INFO)
    try:
        index_config = config.index
        logging.info(
            f"Initializing Vertex AI client for project '{config.project_id}' in '{config.location}'..."
        )
        searcher = GoogleCloudVectorSearch(
            project_id=config.project_id,
            location=config.location,
            bucket=config.bucket,
            index_name=index_config.name,
        )

        # Build the index from the JSONL vectors stored under vectors_dir.
        logging.info(f"Starting creation of index '{index_config.name}'...")
        searcher.create_index(
            name=index_config.name,
            content_path=vectors_dir,
            dimensions=index_config.dimensions,
        )
        logging.info(f"Index '{index_config.name}' created successfully.")

        # Deploy onto a fresh endpoint and report where it landed.
        logging.info("Deploying index to a new endpoint...")
        searcher.deploy_index(
            index_name=index_config.name, machine_type=index_config.machine_type
        )
        logging.info("Index deployed successfully!")
        endpoint = searcher.index_endpoint
        logging.info(f"Endpoint name: {endpoint.display_name}")
        logging.info(
            f"Endpoint resource name: {endpoint.resource_name}"
        )
    except Exception as e:
        logging.error(f"An error occurred during index creation or deployment: {e}", exc_info=True)
        raise