First commit

This commit is contained in:
Anibal Angulo
2026-02-18 19:57:43 +00:00
commit a53f8fcf62
115 changed files with 9957 additions and 0 deletions

View File

@@ -0,0 +1 @@
3.12

View File

View File

@@ -0,0 +1,29 @@
[project]
name = "vector-search"
version = "0.1.0"
description = "CLI and client library for creating, deploying, and querying Vertex AI Vector Search indexes"
readme = "README.md"
authors = [
{ name = "Anibal Angulo", email = "a8065384@banorte.com" }
]
requires-python = ">=3.12"
dependencies = [
"embedder",
"file-storage",
"google-cloud-aiplatform>=1.106.0",
"aiohttp>=3.10.11,<4",
"gcloud-aio-auth>=5.3.0",
"google-auth==2.29.0",
"typer>=0.16.1",
]
[project.scripts]
vector-search = "vector_search.cli:app"
[build-system]
requires = ["uv_build>=0.8.3,<0.9.0"]
build-backend = "uv_build"
[tool.uv.sources]
file-storage = { workspace = true }
embedder = { workspace = true }

View File

@@ -0,0 +1,2 @@
def hello() -> str:
    """Return the package's greeting string."""
    greeting = "Hello from vector-search!"
    return greeting

View File

@@ -0,0 +1,62 @@
from abc import ABC, abstractmethod
from typing import List, TypedDict
# One matched item returned by a vector search query.
# Functional TypedDict form; runtime-equivalent to the class-based declaration.
SearchResult = TypedDict(
    "SearchResult",
    {
        "id": str,  # datapoint identifier
        "distance": float,  # distance/similarity score reported by the index
        "content": str,  # document text associated with the datapoint
    },
)
class BaseVectorSearch(ABC):
    """
    Abstract base class for a vector search provider.

    This class defines the standard interface for creating a vector search index
    and running queries against it.
    """

    @abstractmethod
    def create_index(self, name: str, content_path: str, **kwargs) -> None:
        """
        Creates a new vector search index and populates it with the provided content.

        Args:
            name: The desired name for the new index.
            content_path: Path to the data that will be used to populate the
                index, expected to be a JSON file containing a list of objects,
                each with an 'id', 'name', and 'embedding' key.
                NOTE(review): described here as a local file system path, but
                the GoogleCloudVectorSearch implementation is invoked with a
                GCS URI (gs://...) — confirm which contract is intended.
            **kwargs: Additional provider-specific arguments for index creation.
        """
        ...

    @abstractmethod
    def update_index(self, index_name: str, content_path: str, **kwargs) -> None:
        """
        Updates an existing vector search index with new content.

        Args:
            index_name: The name of the index to update.
            content_path: Path to the data that will be used to populate the
                index (same format caveat as in :meth:`create_index`).
            **kwargs: Additional provider-specific arguments for index update.
        """
        ...

    @abstractmethod
    def run_query(
        self, index: str, query: List[float], limit: int
    ) -> List[SearchResult]:
        """
        Runs a similarity search query against the index.

        Args:
            index: Identifier of the index (or deployed index) to query.
                NOTE(review): the concrete GoogleCloudVectorSearch implementation
                names this parameter ``deployed_index_id``, so keyword calls
                written against this base signature will not work on it —
                consider aligning the parameter names.
            query: The embedding vector to use for the search query.
            limit: The maximum number of nearest neighbors to return.

        Returns:
            A list of dictionaries, where each dictionary represents a matched item
            and contains at least the item's 'id' and the search 'distance'.
        """
        ...

View File

@@ -0,0 +1,10 @@
from typer import Typer
from .create import app as create_callback
from .delete import app as delete_callback
from .query import app as query_callback
# Root Typer application with each sub-command mounted under its CLI name.
app = Typer()

_SUBCOMMANDS = (
    (create_callback, "create"),
    (delete_callback, "delete"),
    (query_callback, "query"),
)
for _sub_app, _command_name in _SUBCOMMANDS:
    app.add_typer(_sub_app, name=_command_name)

View File

@@ -0,0 +1,91 @@
"""Create and deploy a Vertex AI Vector Search index."""
from typing import Annotated
import typer
from rich.console import Console
from rag_eval.config import settings as config
from vector_search.vertex_ai import GoogleCloudVectorSearch
app = typer.Typer()


@app.callback(invoke_without_command=True)
def create(
    path: Annotated[
        str,
        typer.Option(
            "--path",
            "-p",
            help="The GCS URI (gs://...) to the directory containing your embedding JSON file(s).",
        ),
    ],
    agent_name: Annotated[
        str,
        typer.Option(
            "--agent",
            "-a",
            help="The name of the agent to create the index for.",
        ),
    ],
):
    """Create and deploy a Vertex AI Vector Search index for a specific agent.

    Looks up the agent's index configuration in settings, creates the index
    from the embeddings at ``path``, and deploys it to a new public endpoint.
    Exits with code 1 on any failure.
    """
    console = Console()
    try:
        console.print(
            f"[bold green]Looking up configuration for agent '{agent_name}'...[/bold green]"
        )
        agent_config = config.agents.get(agent_name)
        if not agent_config:
            console.print(
                f"[bold red]Agent '{agent_name}' not found in settings.[/bold red]"
            )
            raise typer.Exit(code=1)
        if not agent_config.index:
            console.print(
                f"[bold red]Index configuration not found for agent '{agent_name}'.[/bold red]"
            )
            raise typer.Exit(code=1)
        index_config = agent_config.index
        console.print(
            f"[bold green]Initializing Vertex AI client for project '{config.project_id}' in '{config.location}'...[/bold green]"
        )
        vector_search = GoogleCloudVectorSearch(
            project_id=config.project_id,
            location=config.location,
            bucket=config.bucket,
            index_name=index_config.name,
        )
        # The --path help advertises a full GCS URI, but the code previously
        # always prefixed the configured bucket, mangling gs:// input into
        # "gs://<bucket>/gs://...". Accept both a full URI and a bucket-relative path.
        content_path = (
            path if path.startswith("gs://") else f"gs://{config.bucket}/{path}"
        )
        console.print(
            f"[bold green]Starting creation of index '{index_config.name}'...[/bold green]"
        )
        console.print("This may take a while.")
        vector_search.create_index(
            name=index_config.name,
            content_path=content_path,
            dimensions=index_config.dimensions,
        )
        console.print(
            f"[bold green]Index '{index_config.name}' created successfully.[/bold green]"
        )
        console.print("[bold green]Deploying index to a new endpoint...[/bold green]")
        console.print("This will also take some time.")
        vector_search.deploy_index(
            index_name=index_config.name, machine_type=index_config.machine_type
        )
        console.print("[bold green]Index deployed successfully![/bold green]")
        console.print(f"Endpoint name: {vector_search.index_endpoint.display_name}")
        console.print(
            f"Endpoint resource name: {vector_search.index_endpoint.resource_name}"
        )
    except typer.Exit:
        # typer.Exit is an Exception subclass (via click/RuntimeError); without
        # this re-raise the explicit exits above were swallowed by the generic
        # handler below and misreported as "An error occurred".
        raise
    except Exception as e:
        console.print(f"[bold red]An error occurred: {e}[/bold red]")
        raise typer.Exit(code=1)

View File

@@ -0,0 +1,38 @@
"""Delete a vector index or endpoint."""
import typer
from rich.console import Console
from rag_eval.config import settings as config
from vector_search.vertex_ai import GoogleCloudVectorSearch
app = typer.Typer()


@app.callback(invoke_without_command=True)
def delete(
    id: str = typer.Argument(..., help="The ID of the index or endpoint to delete."),
    endpoint: bool = typer.Option(
        False, "--endpoint", help="Delete an endpoint instead of an index."
    ),
):
    """Delete a vector index or endpoint.

    Deletes the Vertex AI index named by ``id``, or — with ``--endpoint`` —
    undeploys and deletes the endpoint instead. Exits with code 1 on failure.
    """
    console = Console()
    try:
        # Client construction runs aiplatform.init and can itself fail (bad
        # credentials/project); it was previously outside the try, leaking a
        # raw traceback instead of the friendly error the other commands show.
        vector_search = GoogleCloudVectorSearch(
            project_id=config.project_id, location=config.location, bucket=config.bucket
        )
        if endpoint:
            console.print(f"[bold red]Deleting endpoint {id}...[/bold red]")
            vector_search.delete_index_endpoint(id)
            console.print(
                f"[bold green]Endpoint {id} deleted successfully.[/bold green]"
            )
        else:
            console.print(f"[bold red]Deleting index {id}...[/bold red]")
            vector_search.delete_index(id)
            console.print(f"[bold green]Index {id} deleted successfully.[/bold green]")
    except Exception as e:
        console.print(f"[bold red]An error occurred: {e}[/bold red]")
        raise typer.Exit(code=1)

View File

@@ -0,0 +1,91 @@
"""Generate embeddings for documents and save them to a JSON file."""
import json
from pathlib import Path
import typer
from embedder.vertex_ai import VertexAIEmbedder
from file_storage.google_cloud import GoogleCloudFileStorage
from rich.console import Console
from rich.progress import Progress
from rag_eval.config import Settings
app = typer.Typer()


@app.callback(invoke_without_command=True)
def generate(
    path: str = typer.Argument(..., help="The path to the markdown files."),
    output_file: str = typer.Option(
        ...,
        "--output-file",
        "-o",
        help="The local path to save the output JSON file.",
    ),
    batch_size: int = typer.Option(
        10,
        "--batch-size",
        "-b",
        help="The batch size for processing files.",
    ),
    jsonl: bool = typer.Option(
        False,
        "--jsonl",
        help="Output in JSONL format instead of JSON.",
    ),
):
    """Generate embeddings for documents and save them to a JSON file.

    Reads files from Google Cloud Storage under ``path``, embeds them in
    batches of ``batch_size``, and writes ``{"id", "embedding"}`` records to
    ``output_file`` — a JSON array by default, one record per line with
    ``--jsonl``. Exits with code 1 if embedding generation fails.
    """
    config = Settings()
    console = Console()
    console.print("[bold green]Starting vector generation...[/bold green]")
    # Defined before the try so the write phase below can never hit a NameError.
    results = []
    try:
        storage = GoogleCloudFileStorage(bucket=config.bucket)
        embedder = VertexAIEmbedder(model_name=config.embedding_model)
        remote_files = storage.list_files(path=path)
        with Progress(console=console) as progress:
            task = progress.add_task(
                "[cyan]Generating embeddings...", total=len(remote_files)
            )
            for start in range(0, len(remote_files), batch_size):
                batch_files = remote_files[start : start + batch_size]
                batch_contents = []
                for remote_file in batch_files:
                    file_stream = storage.get_file_stream(remote_file)
                    # utf-8-sig strips a BOM if present; replace (rather than
                    # raise) so one malformed file cannot abort the whole run.
                    batch_contents.append(
                        file_stream.read().decode("utf-8-sig", errors="replace")
                    )
                batch_embeddings = embedder.generate_embeddings_batch(batch_contents)
                for remote_file, embedding in zip(batch_files, batch_embeddings):
                    results.append({"id": remote_file, "embedding": embedding})
                    # Advance once per file so the bar tracks the file total.
                    progress.update(task, advance=1)
    except Exception as e:
        console.print(
            f"[bold red]An error occurred during vector generation: {e}[/bold red]"
        )
        raise typer.Exit(code=1)
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding so output is identical regardless of platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        if jsonl:
            for record in results:
                f.write(json.dumps(record) + "\n")
        else:
            json.dump(results, f, indent=2)
    console.print(
        f"[bold green]Embedding generation complete. {len(results)} vectors saved to '{output_path.resolve()}'[/bold green]"
    )

View File

@@ -0,0 +1,55 @@
"""Query the vector search index."""
import typer
from embedder.vertex_ai import VertexAIEmbedder
from rich.console import Console
from rich.table import Table
from typer import Argument, Option
from rag_eval.config import settings as config
from vector_search.vertex_ai import GoogleCloudVectorSearch
app = typer.Typer()


@app.callback(invoke_without_command=True)
def query(
    query: str = Argument(..., help="The text query to search for."),
    limit: int = Option(5, "--limit", "-l", help="The number of results to return."),
):
    """Queries the vector search index."""
    console = Console()
    try:
        console.print("[bold green]Initializing clients...[/bold green]")
        text_embedder = VertexAIEmbedder(model_name=config.embedding_model)
        search_client = GoogleCloudVectorSearch(
            project_id=config.project_id, location=config.location, bucket=config.bucket
        )

        console.print("[bold green]Loading index endpoint...[/bold green]")
        search_client.load_index_endpoint(config.index.endpoint)

        console.print("[bold green]Generating embedding for query...[/bold green]")
        embedding = text_embedder.generate_embedding(query)

        console.print("[bold green]Running search query...[/bold green]")
        matches = search_client.run_query(
            deployed_index_id=config.index.deployment,
            query=embedding,
            limit=limit,
        )

        # Render the matches as a three-column table.
        results_table = Table(title="Search Results")
        for header, colour in (
            ("ID", "cyan"),
            ("Distance", "magenta"),
            ("Content", "green"),
        ):
            results_table.add_column(header, justify="left", style=colour)
        for match in matches:
            results_table.add_row(match["id"], str(match["distance"]), match["content"])
        console.print(results_table)
    except Exception as e:
        console.print(f"[bold red]An error occurred: {e}[/bold red]")
        raise typer.Exit(code=1)

View File

@@ -0,0 +1,255 @@
import asyncio
from typing import List
from uuid import uuid4
import aiohttp
import google.auth
import google.auth.transport.requests
from file_storage.google_cloud import GoogleCloudFileStorage
from gcloud.aio.auth import Token
from google.cloud import aiplatform
from .base import BaseVectorSearch, SearchResult
class GoogleCloudVectorSearch(BaseVectorSearch):
    """
    A vector search provider that uses Google Cloud's Vertex AI Vector Search.

    Synchronous operations use the google-cloud-aiplatform SDK;
    :meth:`async_run_query` calls the endpoint's public REST API directly with
    a pooled aiohttp session. Call :meth:`close` when finished with async
    queries to release that session.
    """

    def __init__(
        self, project_id: str, location: str, bucket: str, index_name: str | None = None
    ):
        """
        Initializes the GoogleCloudVectorSearch client.

        Args:
            project_id: The Google Cloud project ID.
            location: The Google Cloud location (e.g., 'us-central1').
            bucket: The GCS bucket to use for file storage.
            index_name: The name of the index; required only by the query
                methods, which use it to locate result content in GCS.
        """
        aiplatform.init(project=project_id, location=location)
        self.project_id = project_id
        self.location = location
        self.storage = GoogleCloudFileStorage(bucket=bucket)
        self.index_name = index_name
        # Set by create_index/update_index; deploy_index requires it. Was
        # previously never initialized, so deploy_index raised AttributeError.
        self.index = None
        self._credentials = None
        self._aio_session: aiohttp.ClientSession | None = None
        self._async_token: Token | None = None

    def _get_auth_headers(self) -> dict:
        """Return sync REST headers, lazily creating/refreshing ADC credentials."""
        if self._credentials is None:
            self._credentials, _ = google.auth.default(
                scopes=["https://www.googleapis.com/auth/cloud-platform"]
            )
        if not self._credentials.token or self._credentials.expired:
            self._credentials.refresh(google.auth.transport.requests.Request())
        return {
            "Authorization": f"Bearer {self._credentials.token}",
            "Content-Type": "application/json",
        }

    async def _async_get_auth_headers(self) -> dict:
        """Return async REST headers, lazily creating the token helper."""
        if self._async_token is None:
            self._async_token = Token(
                session=self._get_aio_session(),
                scopes=["https://www.googleapis.com/auth/cloud-platform"],
            )
        access_token = await self._async_token.get()
        return {
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "application/json",
        }

    def _get_aio_session(self) -> aiohttp.ClientSession:
        """Return the shared aiohttp session, (re)creating it if missing or closed."""
        if self._aio_session is None or self._aio_session.closed:
            connector = aiohttp.TCPConnector(limit=300, limit_per_host=50)
            timeout = aiohttp.ClientTimeout(total=60)
            self._aio_session = aiohttp.ClientSession(
                timeout=timeout, connector=connector
            )
        return self._aio_session

    async def close(self) -> None:
        """
        Close the shared aiohttp session used by async queries.

        The session was previously never closed, leaking connections and
        producing 'Unclosed client session' warnings at interpreter shutdown.
        Safe to call multiple times; a later async query recreates the session.
        """
        self._async_token = None
        if self._aio_session is not None and not self._aio_session.closed:
            await self._aio_session.close()
        self._aio_session = None

    def _content_path(self, datapoint_id: str) -> str:
        """
        Return the GCS object path holding a datapoint's markdown content.

        Raises:
            ValueError: if the client was constructed without ``index_name``
                (previously this surfaced as an opaque TypeError on None).
        """
        if self.index_name is None:
            raise ValueError(
                "index_name is required to resolve result content paths; "
                "pass index_name= when constructing GoogleCloudVectorSearch."
            )
        return f"{self.index_name}/contents/{datapoint_id}.md"

    def create_index(
        self,
        name: str,
        content_path: str,
        dimensions: int,
        approximate_neighbors_count: int = 150,
        distance_measure_type: str = "DOT_PRODUCT_DISTANCE",
        **kwargs,
    ) -> None:
        """
        Creates a new Vertex AI Vector Search index.

        The created index is stored on ``self.index`` for a subsequent
        :meth:`deploy_index` call.

        Args:
            name: The display name for the new index.
            content_path: The GCS URI to the JSON file containing the embeddings.
            dimensions: The number of dimensions in the embedding vectors.
            approximate_neighbors_count: The number of neighbors to find for each vector.
            distance_measure_type: The distance measure to use (e.g., 'DOT_PRODUCT_DISTANCE').
            **kwargs: Accepted for interface compatibility; currently unused.
        """
        index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
            display_name=name,
            contents_delta_uri=content_path,
            dimensions=dimensions,
            approximate_neighbors_count=approximate_neighbors_count,
            distance_measure_type=distance_measure_type,
            leaf_node_embedding_count=1000,
            leaf_nodes_to_search_percent=10,
        )
        self.index = index

    def update_index(self, index_name: str, content_path: str, **kwargs) -> None:
        """
        Updates an existing Vertex AI Vector Search index.

        The updated index is stored on ``self.index`` for a subsequent
        :meth:`deploy_index` call.

        Args:
            index_name: The resource name of the index to update.
            content_path: The GCS URI to the JSON file containing the new embeddings.
            **kwargs: Accepted for interface compatibility; currently unused.
        """
        index = aiplatform.MatchingEngineIndex(index_name=index_name)
        index.update_embeddings(
            contents_delta_uri=content_path,
        )
        self.index = index

    def deploy_index(
        self, index_name: str, machine_type: str = "e2-standard-2"
    ) -> None:
        """
        Deploys a Vertex AI Vector Search index to a new public endpoint.

        Args:
            index_name: Display name used for the endpoint and deployed-index ID.
            machine_type: The type of machine to use for the endpoint.

        Raises:
            ValueError: if no index has been created/updated on this instance
                (previously an uninitialized-attribute AttributeError).
        """
        if self.index is None:
            raise ValueError(
                "No index to deploy; call create_index or update_index first."
            )
        index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
            display_name=f"{index_name}-endpoint",
            public_endpoint_enabled=True,
        )
        index_endpoint.deploy_index(
            index=self.index,
            # Deployed-index IDs must be identifier-like; suffix with a UUID so
            # redeployments of the same index never collide.
            deployed_index_id=f"{index_name.replace('-', '_')}_deployed_{uuid4().hex}",
            machine_type=machine_type,
        )
        self.index_endpoint = index_endpoint

    def load_index_endpoint(self, endpoint_name: str) -> None:
        """
        Loads an existing Vertex AI Vector Search index endpoint.

        Args:
            endpoint_name: The resource name of the index endpoint.

        Raises:
            ValueError: if the endpoint has no public domain (async REST
                queries require public access).
        """
        self.index_endpoint = aiplatform.MatchingEngineIndexEndpoint(endpoint_name)
        if not self.index_endpoint.public_endpoint_domain_name:
            raise ValueError(
                "The index endpoint does not have a public endpoint. "
                "Please ensure that the endpoint is configured for public access."
            )

    def run_query(
        self, deployed_index_id: str, query: List[float], limit: int
    ) -> List[SearchResult]:
        """
        Runs a similarity search query against the deployed index.

        NOTE(review): the base class names the first parameter ``index``;
        keyword callers written against the base signature will not match.

        Args:
            deployed_index_id: The ID of the deployed index.
            query: The embedding vector to use for the search query.
            limit: The maximum number of nearest neighbors to return.

        Returns:
            A list of dictionaries representing the matched items, with the
            content of each match loaded from GCS.
        """
        response = self.index_endpoint.find_neighbors(
            deployed_index_id=deployed_index_id, queries=[query], num_neighbors=limit
        )
        results = []
        # find_neighbors returns one neighbor list per query; we sent one query.
        for neighbor in response[0]:
            file_path = self._content_path(neighbor.id)
            content = self.storage.get_file_stream(file_path).read().decode("utf-8")
            results.append(
                {"id": neighbor.id, "distance": neighbor.distance, "content": content}
            )
        return results

    async def async_run_query(
        self, deployed_index_id: str, query: List[float], limit: int
    ) -> List[SearchResult]:
        """
        Runs a non-blocking similarity search query against the deployed index
        using the REST API directly with an async HTTP client.

        Args:
            deployed_index_id: The ID of the deployed index.
            query: The embedding vector to use for the search query.
            limit: The maximum number of nearest neighbors to return.

        Returns:
            A list of dictionaries representing the matched items, with the
            content of each match fetched from GCS concurrently.
        """
        domain = self.index_endpoint.public_endpoint_domain_name
        endpoint_id = self.index_endpoint.name.split("/")[-1]
        url = (
            f"https://{domain}/v1/projects/{self.project_id}"
            f"/locations/{self.location}"
            f"/indexEndpoints/{endpoint_id}:findNeighbors"
        )
        payload = {
            "deployed_index_id": deployed_index_id,
            "queries": [
                {
                    "datapoint": {"feature_vector": query},
                    "neighbor_count": limit,
                }
            ],
        }
        headers = await self._async_get_auth_headers()
        session = self._get_aio_session()
        async with session.post(url, json=payload, headers=headers) as response:
            response.raise_for_status()
            data = await response.json()
        neighbors = data.get("nearestNeighbors", [{}])[0].get("neighbors", [])
        # Fetch all content blobs concurrently rather than one at a time.
        content_tasks = []
        for neighbor in neighbors:
            datapoint_id = neighbor["datapoint"]["datapointId"]
            file_path = self._content_path(datapoint_id)
            content_tasks.append(self.storage.async_get_file_stream(file_path))
        file_streams = await asyncio.gather(*content_tasks)
        results: List[SearchResult] = []
        for neighbor, stream in zip(neighbors, file_streams):
            results.append(
                {
                    "id": neighbor["datapoint"]["datapointId"],
                    "distance": neighbor["distance"],
                    "content": stream.read().decode("utf-8"),
                }
            )
        return results

    def delete_index(self, index_name: str) -> None:
        """
        Deletes a Vertex AI Vector Search index.

        Args:
            index_name: The resource name of the index.
        """
        index = aiplatform.MatchingEngineIndex(index_name)
        index.delete()

    def delete_index_endpoint(self, index_endpoint_name: str) -> None:
        """
        Deletes a Vertex AI Vector Search index endpoint.

        All deployed indexes are undeployed first, then the endpoint itself is
        force-deleted.

        Args:
            index_endpoint_name: The resource name of the index endpoint.
        """
        index_endpoint = aiplatform.MatchingEngineIndexEndpoint(index_endpoint_name)
        index_endpoint.undeploy_all()
        index_endpoint.delete(force=True)