# ruff: noqa: INP001
"""Google Cloud Storage client with caching."""

import asyncio
import io
from typing import BinaryIO

import aiohttp
from gcloud.aio.storage import Storage

from ..logging import log_structured_entry
from ..utils.cache import LRUCache
from .base import BaseGoogleCloudClient

# HTTP status codes treated as transient (retryable) download failures.
HTTP_TOO_MANY_REQUESTS = 429
HTTP_SERVER_ERROR = 500


class GoogleCloudFileStorage(BaseGoogleCloudClient):
    """Cache-aware helper for downloading files from Google Cloud Storage."""

    def __init__(self, bucket: str, cache_size: int = 100) -> None:
        """Initialize the storage helper with LRU cache.

        Args:
            bucket: Name of the GCS bucket to read from.
            cache_size: Maximum number of blobs kept in the in-memory cache.
        """
        super().__init__()
        self.bucket_name = bucket
        self._aio_storage: Storage | None = None
        self._cache = LRUCache(max_size=cache_size)

    def _get_aio_storage(self) -> Storage:
        """Lazily create (and then reuse) the async Storage client."""
        if self._aio_storage is None:
            self._aio_storage = Storage(
                session=self._get_aio_session(),
            )
        return self._aio_storage

    async def async_get_file_stream(
        self,
        file_name: str,
        max_retries: int = 3,
    ) -> BinaryIO:
        """Get a file asynchronously with retry on transient errors.

        Serves from the LRU cache when possible; otherwise downloads the blob
        with exponential backoff on timeouts, HTTP 429, and HTTP 5xx.

        Args:
            file_name: The blob name to retrieve.
            max_retries: Maximum number of retry attempts.

        Returns:
            A BytesIO stream with the file contents.

        Raises:
            TimeoutError: If all retry attempts fail.
            aiohttp.ClientResponseError: On a non-retryable HTTP error
                (4xx other than 429).
        """
        # Serve from cache when possible to avoid a network round trip.
        cached_content = self._cache.get(file_name)
        if cached_content is not None:
            log_structured_entry(
                "File retrieved from cache",
                "INFO",
                {"file": file_name, "bucket": self.bucket_name},
            )
            return self._wrap_stream(cached_content, file_name)

        log_structured_entry(
            "Starting file download from GCS",
            "INFO",
            {"file": file_name, "bucket": self.bucket_name},
        )
        storage_client = self._get_aio_storage()
        last_exception: Exception | None = None

        for attempt in range(max_retries):
            try:
                # Keep the try body minimal: only the network call is
                # retryable.  Success-path work (cache write, logging) lives
                # in the else clause so its failures are not swallowed by the
                # retry handlers and mislabeled as download timeouts.
                content = await storage_client.download(
                    self.bucket_name,
                    file_name,
                )
            # asyncio.TimeoutError is a distinct class before Python 3.11
            # (it only became an alias of builtin TimeoutError in 3.11), so
            # both must be caught for aiohttp/asyncio timeouts to be retried.
            except (TimeoutError, asyncio.TimeoutError) as exc:
                last_exception = exc
                log_structured_entry(
                    f"Timeout downloading gs://{self.bucket_name}/{file_name} (attempt {attempt + 1}/{max_retries})",
                    "WARNING",
                    {"error": str(exc)},
                )
            except aiohttp.ClientResponseError as exc:
                last_exception = exc
                if (
                    exc.status == HTTP_TOO_MANY_REQUESTS
                    or exc.status >= HTTP_SERVER_ERROR
                ):
                    log_structured_entry(
                        f"HTTP {exc.status} downloading gs://{self.bucket_name}/{file_name} (attempt {attempt + 1}/{max_retries})",
                        "WARNING",
                        {"status": exc.status, "message": str(exc)},
                    )
                else:
                    # Other 4xx errors will not succeed on retry; surface
                    # them to the caller immediately.
                    log_structured_entry(
                        f"Non-retryable HTTP error downloading gs://{self.bucket_name}/{file_name}",
                        "ERROR",
                        {"status": exc.status, "message": str(exc)},
                    )
                    raise
            else:
                self._cache.put(file_name, content)
                log_structured_entry(
                    "File downloaded successfully",
                    "INFO",
                    {
                        "file": file_name,
                        "bucket": self.bucket_name,
                        "size_bytes": len(content),
                        "attempt": attempt + 1,
                    },
                )
                return self._wrap_stream(content, file_name)

            # Exponential backoff: 0.5s, 1s, 2s, ... before the next attempt.
            if attempt < max_retries - 1:
                delay = 0.5 * (2**attempt)
                log_structured_entry(
                    "Retrying file download",
                    "INFO",
                    {"file": file_name, "delay_seconds": delay},
                )
                await asyncio.sleep(delay)

        msg = (
            f"Failed to download gs://{self.bucket_name}/{file_name} "
            f"after {max_retries} attempts"
        )
        log_structured_entry(
            "File download failed after all retries",
            "ERROR",
            {
                "file": file_name,
                "bucket": self.bucket_name,
                "max_retries": max_retries,
                "last_error": str(last_exception),
            },
        )
        raise TimeoutError(msg) from last_exception

    @staticmethod
    def _wrap_stream(content: bytes, file_name: str) -> BinaryIO:
        """Wrap raw bytes in a BytesIO stream carrying the blob name."""
        file_stream = io.BytesIO(content)
        file_stream.name = file_name
        return file_stream