Split out main module
This commit is contained in:
144
src/knowledge_search_mcp/clients/storage.py
Normal file
144
src/knowledge_search_mcp/clients/storage.py
Normal file
@@ -0,0 +1,144 @@
|
||||
# ruff: noqa: INP001
|
||||
"""Google Cloud Storage client with caching."""
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
from typing import BinaryIO
|
||||
|
||||
import aiohttp
|
||||
from gcloud.aio.storage import Storage
|
||||
|
||||
from ..logging import log_structured_entry
|
||||
from ..utils.cache import LRUCache
|
||||
from .base import BaseGoogleCloudClient
|
||||
|
||||
# HTTP status codes treated as transient by the retry loop below:
# 429 retries (rate limiting), and any status >= 500 retries (server error).
HTTP_TOO_MANY_REQUESTS = 429
HTTP_SERVER_ERROR = 500
||||
class GoogleCloudFileStorage(BaseGoogleCloudClient):
    """Cache-aware helper for downloading files from Google Cloud Storage.

    Downloads go through an in-memory LRU cache keyed by blob name, so
    repeated requests for the same file skip the network entirely.
    Transient failures (timeouts, HTTP 429, HTTP 5xx) are retried with
    exponential backoff; other HTTP errors propagate to the caller.
    """

    def __init__(self, bucket: str, cache_size: int = 100) -> None:
        """Initialize the storage helper with LRU cache.

        Args:
            bucket: Name of the GCS bucket that files are downloaded from.
            cache_size: Maximum number of files held in the LRU cache.

        """
        super().__init__()
        self.bucket_name = bucket
        # Built lazily in _get_aio_storage() rather than here, so the
        # underlying aiohttp session is created on first use.
        self._aio_storage: Storage | None = None
        # Maps blob name -> raw file bytes (whole-file contents are cached).
        self._cache = LRUCache(max_size=cache_size)

    def _get_aio_storage(self) -> Storage:
        # Lazily create the async Storage client once and reuse it for
        # every subsequent download.
        if self._aio_storage is None:
            self._aio_storage = Storage(
                session=self._get_aio_session(),
            )
        return self._aio_storage

    async def async_get_file_stream(
        self,
        file_name: str,
        max_retries: int = 3,
    ) -> BinaryIO:
        """Get a file asynchronously with retry on transient errors.

        Checks the LRU cache first; on a miss, downloads the blob from GCS,
        caching the bytes on success. Timeouts, HTTP 429, and HTTP 5xx
        responses are retried up to ``max_retries`` times with exponential
        backoff (0.5s, 1s, 2s, ...); any other HTTP error is re-raised
        immediately.

        Args:
            file_name: The blob name to retrieve.
            max_retries: Maximum number of retry attempts.

        Returns:
            A BytesIO stream with the file contents. Its ``name`` attribute
            is set to ``file_name``.

        Raises:
            TimeoutError: If all retry attempts fail. Raised for any
                exhausted transient error (not only timeouts), chained from
                the last underlying exception.
            aiohttp.ClientResponseError: For non-retryable HTTP errors
                (status < 500 and not 429).

        """
        # Fast path: serve from the in-memory cache, no network round-trip.
        cached_content = self._cache.get(file_name)
        if cached_content is not None:
            log_structured_entry(
                "File retrieved from cache",
                "INFO",
                {"file": file_name, "bucket": self.bucket_name}
            )
            file_stream = io.BytesIO(cached_content)
            # Give the stream a .name like a real file object would have.
            file_stream.name = file_name
            return file_stream

        log_structured_entry(
            "Starting file download from GCS",
            "INFO",
            {"file": file_name, "bucket": self.bucket_name}
        )

        storage_client = self._get_aio_storage()
        # Remembered so the terminal TimeoutError can chain the real cause.
        last_exception: Exception | None = None

        for attempt in range(max_retries):
            try:
                content = await storage_client.download(
                    self.bucket_name,
                    file_name,
                )
                # Cache before returning so the next call hits the fast path.
                self._cache.put(file_name, content)
                file_stream = io.BytesIO(content)
                file_stream.name = file_name
                log_structured_entry(
                    "File downloaded successfully",
                    "INFO",
                    {
                        "file": file_name,
                        "bucket": self.bucket_name,
                        "size_bytes": len(content),
                        "attempt": attempt + 1
                    }
                )
            # NOTE(review): on Python < 3.11, asyncio.TimeoutError is not the
            # builtin TimeoutError, so this clause would miss aiohttp/asyncio
            # timeouts there — confirm the project targets 3.11+.
            except TimeoutError as exc:
                last_exception = exc
                log_structured_entry(
                    f"Timeout downloading gs://{self.bucket_name}/{file_name} (attempt {attempt + 1}/{max_retries})",
                    "WARNING",
                    {"error": str(exc)}
                )
            except aiohttp.ClientResponseError as exc:
                last_exception = exc
                # Only rate limiting (429) and server errors (5xx) are
                # considered transient; anything else re-raises immediately.
                if (
                    exc.status == HTTP_TOO_MANY_REQUESTS
                    or exc.status >= HTTP_SERVER_ERROR
                ):
                    log_structured_entry(
                        f"HTTP {exc.status} downloading gs://{self.bucket_name}/{file_name} (attempt {attempt + 1}/{max_retries})",
                        "WARNING",
                        {"status": exc.status, "message": str(exc)}
                    )
                else:
                    log_structured_entry(
                        f"Non-retryable HTTP error downloading gs://{self.bucket_name}/{file_name}",
                        "ERROR",
                        {"status": exc.status, "message": str(exc)}
                    )
                    raise
            else:
                # Success path: no exception was raised in the try body.
                return file_stream

            # A transient failure occurred; back off exponentially before the
            # next attempt (skip the sleep when this was the final attempt).
            if attempt < max_retries - 1:
                delay = 0.5 * (2**attempt)
                log_structured_entry(
                    "Retrying file download",
                    "INFO",
                    {"file": file_name, "delay_seconds": delay}
                )
                await asyncio.sleep(delay)

        # All attempts exhausted on transient errors.
        msg = (
            f"Failed to download gs://{self.bucket_name}/{file_name} "
            f"after {max_retries} attempts"
        )
        log_structured_entry(
            "File download failed after all retries",
            "ERROR",
            {
                "file": file_name,
                "bucket": self.bucket_name,
                "max_retries": max_retries,
                "last_error": str(last_exception)
            }
        )
        raise TimeoutError(msg) from last_exception
|
||||
Reference in New Issue
Block a user