# Source: knowledge-search-mcp/src/knowledge_search_mcp/clients/storage.py
# Retrieved 2026-03-03 18:34:57 +00:00 (145 lines, 4.8 KiB, Python).
# NOTE(review): file-listing metadata converted to a comment so the module parses.
# ruff: noqa: INP001
"""Google Cloud Storage client with caching."""
import asyncio
import io
from typing import BinaryIO
import aiohttp
from gcloud.aio.storage import Storage
from ..logging import log_structured_entry
from ..utils.cache import LRUCache
from .base import BaseGoogleCloudClient
HTTP_TOO_MANY_REQUESTS = 429
HTTP_SERVER_ERROR = 500
class GoogleCloudFileStorage(BaseGoogleCloudClient):
    """Cache-aware helper for downloading files from Google Cloud Storage.

    Downloaded blob contents are kept in an in-memory LRU cache so repeated
    reads of the same object do not hit GCS again.
    """

    def __init__(self, bucket: str, cache_size: int = 100) -> None:
        """Initialize the storage helper with an LRU cache.

        Args:
            bucket: Name of the GCS bucket to download from.
            cache_size: Maximum number of file contents kept in the cache.
        """
        super().__init__()
        self.bucket_name = bucket
        # Created lazily on first use; see _get_aio_storage().
        self._aio_storage: Storage | None = None
        self._cache = LRUCache(max_size=cache_size)

    def _get_aio_storage(self) -> Storage:
        """Return the async Storage client, creating and memoizing it on first use."""
        if self._aio_storage is None:
            self._aio_storage = Storage(
                session=self._get_aio_session(),
            )
        return self._aio_storage

    @staticmethod
    def _named_stream(file_name: str, content: bytes) -> BinaryIO:
        """Wrap raw bytes in a BytesIO stream carrying the blob name."""
        stream = io.BytesIO(content)
        stream.name = file_name
        return stream

    async def async_get_file_stream(
        self,
        file_name: str,
        max_retries: int = 3,
    ) -> BinaryIO:
        """Get a file asynchronously with retry on transient errors.

        Transient failures (timeouts, HTTP 429, HTTP 5xx) are retried up to
        ``max_retries`` times with exponential backoff (0.5s, 1s, 2s, ...);
        any other HTTP error is raised immediately.

        Args:
            file_name: The blob name to retrieve.
            max_retries: Maximum number of retry attempts.

        Returns:
            A BytesIO stream with the file contents.

        Raises:
            TimeoutError: If all retry attempts fail.
            aiohttp.ClientResponseError: On a non-retryable HTTP error
                (4xx other than 429).
        """
        cached_content = self._cache.get(file_name)
        if cached_content is not None:
            log_structured_entry(
                "File retrieved from cache",
                "INFO",
                {"file": file_name, "bucket": self.bucket_name}
            )
            return self._named_stream(file_name, cached_content)

        log_structured_entry(
            "Starting file download from GCS",
            "INFO",
            {"file": file_name, "bucket": self.bucket_name}
        )
        storage_client = self._get_aio_storage()
        last_exception: Exception | None = None
        for attempt in range(max_retries):
            try:
                # Keep the try body minimal: only the awaited download should
                # raise the transient errors we retry on.  Caching and logging
                # failures must not trigger a spurious re-download.
                content = await storage_client.download(
                    self.bucket_name,
                    file_name,
                )
            except TimeoutError as exc:
                last_exception = exc
                log_structured_entry(
                    f"Timeout downloading gs://{self.bucket_name}/{file_name} (attempt {attempt + 1}/{max_retries})",
                    "WARNING",
                    {"error": str(exc)}
                )
            except aiohttp.ClientResponseError as exc:
                last_exception = exc
                if (
                    exc.status == HTTP_TOO_MANY_REQUESTS
                    or exc.status >= HTTP_SERVER_ERROR
                ):
                    log_structured_entry(
                        f"HTTP {exc.status} downloading gs://{self.bucket_name}/{file_name} (attempt {attempt + 1}/{max_retries})",
                        "WARNING",
                        {"status": exc.status, "message": str(exc)}
                    )
                else:
                    # Other 4xx errors will not succeed on retry; surface now.
                    log_structured_entry(
                        f"Non-retryable HTTP error downloading gs://{self.bucket_name}/{file_name}",
                        "ERROR",
                        {"status": exc.status, "message": str(exc)}
                    )
                    raise
            else:
                # Success path: cache and return outside the try block.
                self._cache.put(file_name, content)
                log_structured_entry(
                    "File downloaded successfully",
                    "INFO",
                    {
                        "file": file_name,
                        "bucket": self.bucket_name,
                        "size_bytes": len(content),
                        "attempt": attempt + 1
                    }
                )
                return self._named_stream(file_name, content)
            if attempt < max_retries - 1:
                # Exponential backoff before the next attempt.
                delay = 0.5 * (2**attempt)
                log_structured_entry(
                    "Retrying file download",
                    "INFO",
                    {"file": file_name, "delay_seconds": delay}
                )
                await asyncio.sleep(delay)
        msg = (
            f"Failed to download gs://{self.bucket_name}/{file_name} "
            f"after {max_retries} attempts"
        )
        log_structured_entry(
            "File download failed after all retries",
            "ERROR",
            {
                "file": file_name,
                "bucket": self.bucket_name,
                "max_retries": max_retries,
                "last_error": str(last_exception)
            }
        )
        raise TimeoutError(msg) from last_exception