First commit

This commit is contained in:
2026-02-22 15:25:27 +00:00
commit 35d5a65b17
70 changed files with 4298 additions and 0 deletions

View File

@@ -0,0 +1 @@
3.12

View File

View File

@@ -0,0 +1,22 @@
[project]
name = "file-storage"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
authors = [
{ name = "Anibal Angulo", email = "a8065384@banorte.com" }
]
requires-python = ">=3.12"
dependencies = [
"gcloud-aio-storage>=9.6.1",
"google-cloud-storage>=2.19.0",
"aiohttp>=3.10.11,<4",
"typer>=0.12.3",
]
[project.scripts]
file-storage = "file_storage.cli:app"
[build-system]
requires = ["uv_build>=0.8.3,<0.9.0"]
build-backend = "uv_build"

View File

@@ -0,0 +1,2 @@
def hello() -> str:
    """Return the package's greeting string."""
    greeting = "Hello from file-storage!"
    return greeting

View File

@@ -0,0 +1,48 @@
from abc import ABC, abstractmethod
from typing import BinaryIO, List, Optional
class BaseFileStorage(ABC):
    """
    Abstract base class for a remote file processor.

    This class defines the interface for uploading, listing, downloading,
    and deleting files from a remote source.
    """

    @abstractmethod
    def upload_file(
        self,
        file_path: str,
        destination_blob_name: str,
        content_type: Optional[str] = None,
    ) -> None:
        """
        Uploads a file to the remote source.

        Args:
            file_path: The local path to the file to upload.
            destination_blob_name: The name of the file in the remote source.
            content_type: The content type of the file.
        """
        ...

    @abstractmethod
    def list_files(self, path: Optional[str] = None) -> List[str]:
        """
        Lists files from a remote location.

        Args:
            path: The path to a specific file or directory in the remote bucket.
                If None, it recursively lists all files in the bucket.

        Returns:
            A list of file paths.
        """
        ...

    @abstractmethod
    def get_file_stream(self, file_name: str) -> BinaryIO:
        """
        Gets a file from the remote source and returns it as a file-like object.
        """
        ...

    def delete_files(self, path: str) -> None:
        """
        Deletes all files at the given location in the remote source.

        Declared here (non-abstract, so existing subclasses keep working)
        because callers invoke ``delete_files`` through this interface;
        concrete backends should override it.

        Args:
            path: The path (or prefix) of the file(s) to delete.

        Raises:
            NotImplementedError: If the backend does not support deletion.
        """
        raise NotImplementedError(
            f"{type(self).__name__} does not implement delete_files"
        )

View File

@@ -0,0 +1,89 @@
import os
from typing import Annotated
import rich
import typer
from rag_eval.config import settings
from .google_cloud import GoogleCloudFileStorage
app = typer.Typer()
def get_storage_client() -> GoogleCloudFileStorage:
    """Build a storage client for the bucket named in the settings."""
    # NOTE(review): `settings` is imported from `rag_eval.config`, not this
    # package — confirm that import is intentional.
    bucket_name = settings.bucket
    return GoogleCloudFileStorage(bucket=bucket_name)
@app.command("upload")
def upload(
    file_path: str,
    destination_blob_name: str,
    # Annotation fixed: the default is None, so the type must admit None.
    content_type: Annotated[str | None, typer.Option()] = None,
):
    """
    Uploads a file or directory to the remote source.

    Args:
        file_path: Local file or directory to upload.
        destination_blob_name: Blob name (or prefix, for directories)
            in the remote bucket.
        content_type: Optional content type applied to every uploaded file.
    """
    storage_client = get_storage_client()
    if os.path.isdir(file_path):
        for root, _, files in os.walk(file_path):
            for file in files:
                local_file_path = os.path.join(root, file)
                # preserve the directory structure and use forward slashes for blob name
                dest_blob_name = os.path.join(
                    destination_blob_name, os.path.relpath(local_file_path, file_path)
                ).replace(os.sep, "/")
                storage_client.upload_file(
                    local_file_path, dest_blob_name, content_type
                )
                rich.print(
                    f"[green]File {local_file_path} uploaded to {dest_blob_name}.[/green]"
                )
        rich.print(
            f"[bold green]Directory {file_path} uploaded to {destination_blob_name}.[/bold green]"
        )
    else:
        storage_client.upload_file(file_path, destination_blob_name, content_type)
        rich.print(
            f"[green]File {file_path} uploaded to {destination_blob_name}.[/green]"
        )
@app.command("list")
def list_items(
    # Annotation fixed: the default is None, so the type must admit None.
    path: Annotated[str | None, typer.Option()] = None,
):
    """
    Lists all files at the given location inside the remote bucket.

    Args:
        path: Prefix/path to list. If None, recursively lists every file
            in the remote bucket.
    """
    storage_client = get_storage_client()
    files = storage_client.list_files(path)
    for file in files:
        rich.print(f"[blue]{file}[/blue]")
@app.command("download")
def download(file_name: str, destination_path: str):
    """
    Downloads a file from the remote source and writes it to a local path.

    Args:
        file_name: Name of the blob in the remote bucket.
        destination_path: Local path the file contents are written to.
    """
    storage_client = get_storage_client()
    file_stream = storage_client.get_file_stream(file_name)
    # NOTE(review): the whole object is buffered in memory before writing —
    # fine for small files, confirm for very large ones.
    with open(destination_path, "wb") as f:
        f.write(file_stream.read())
    rich.print(f"[green]File {file_name} downloaded to {destination_path}[/green]")
@app.command("delete")
def delete(path: str):
    """
    Deletes all files at the given location inside the remote bucket.
    If path is a single file, it will delete only that file.
    If path is a directory, it will delete all files in that directory.
    """
    client = get_storage_client()
    client.delete_files(path)
    rich.print(f"[bold red]Files at {path} deleted.[/bold red]")
# Allow running this module directly as a script.
if __name__ == "__main__":
    app()

View File

@@ -0,0 +1,138 @@
import asyncio
import io
import logging
from typing import BinaryIO, List, Optional
import aiohttp
from gcloud.aio.storage import Storage
from google.cloud import storage
from .base import BaseFileStorage
logger = logging.getLogger(__name__)
class GoogleCloudFileStorage(BaseFileStorage):
    """
    Google Cloud Storage implementation of ``BaseFileStorage``.

    Keeps a process-local byte cache of downloaded blobs, and lazily
    creates an aiohttp session plus a gcloud-aio ``Storage`` client for
    asynchronous downloads with retry.
    """

    def __init__(self, bucket: str) -> None:
        # Name of the GCS bucket all operations target.
        self.bucket_name = bucket
        # Synchronous google-cloud-storage client and bucket handle.
        self.storage_client = storage.Client()
        self.bucket_client = self.storage_client.bucket(self.bucket_name)
        # Async session/client, created lazily (see _get_aio_session).
        self._aio_session: aiohttp.ClientSession | None = None
        self._aio_storage: Storage | None = None
        # In-memory cache: blob name -> downloaded bytes.
        # NOTE(review): unbounded; entries are only evicted on upload or
        # delete of the same blob — confirm memory use is acceptable.
        self._cache: dict[str, bytes] = {}

    def upload_file(
        self,
        file_path: str,
        destination_blob_name: str,
        content_type: Optional[str] = None,
    ) -> None:
        """
        Uploads a file to the remote source.

        Args:
            file_path: The local path to the file to upload.
            destination_blob_name: The name of the file in the remote source.
            content_type: The content type of the file.
        """
        blob = self.bucket_client.blob(destination_blob_name)
        # if_generation_match=0 makes the upload a create-only operation:
        # it fails with a precondition error if the object already exists.
        blob.upload_from_filename(
            file_path,
            content_type=content_type,
            if_generation_match=0,
        )
        # Drop any stale cached copy of this blob.
        self._cache.pop(destination_blob_name, None)

    def list_files(self, path: Optional[str] = None) -> List[str]:
        """
        Obtain a list of all files at the given location inside the remote bucket
        If path is none, recursively shows all files in the remote bucket.
        """
        # `path` is used as an object-name prefix, so "dir" also matches
        # "dir2/..." — not only "dir/...".
        blobs = self.storage_client.list_blobs(self.bucket_name, prefix=path)
        return [blob.name for blob in blobs]

    def get_file_stream(self, file_name: str) -> BinaryIO:
        """
        Gets a file from the remote source and returns it as a file-like object.
        """
        # Download once, then serve every caller an independent BytesIO
        # over the cached bytes.
        if file_name not in self._cache:
            blob = self.bucket_client.blob(file_name)
            self._cache[file_name] = blob.download_as_bytes()
        file_stream = io.BytesIO(self._cache[file_name])
        file_stream.name = file_name
        return file_stream

    def _get_aio_session(self) -> aiohttp.ClientSession:
        # Lazily (re)create the shared aiohttp session when missing or
        # closed. NOTE(review): the session is never explicitly closed —
        # acceptable for a short-lived CLI, confirm for long-lived use.
        if self._aio_session is None or self._aio_session.closed:
            connector = aiohttp.TCPConnector(limit=300, limit_per_host=50)
            timeout = aiohttp.ClientTimeout(total=60)
            self._aio_session = aiohttp.ClientSession(
                timeout=timeout, connector=connector
            )
        return self._aio_session

    def _get_aio_storage(self) -> Storage:
        # Lazily create the gcloud-aio Storage client on the shared session.
        if self._aio_storage is None:
            self._aio_storage = Storage(session=self._get_aio_session())
        return self._aio_storage

    async def async_get_file_stream(
        self, file_name: str, max_retries: int = 3
    ) -> BinaryIO:
        """
        Gets a file from the remote source asynchronously and returns it as a file-like object.
        Retries on transient errors (429, 5xx, timeouts) with exponential backoff.
        """
        # Serve from the cache when the blob was downloaded before.
        if file_name in self._cache:
            file_stream = io.BytesIO(self._cache[file_name])
            file_stream.name = file_name
            return file_stream
        storage_client = self._get_aio_storage()
        last_exception: Exception | None = None
        for attempt in range(max_retries):
            try:
                self._cache[file_name] = await storage_client.download(
                    self.bucket_name, file_name
                )
                file_stream = io.BytesIO(self._cache[file_name])
                file_stream.name = file_name
                return file_stream
            except asyncio.TimeoutError as exc:
                last_exception = exc
                logger.warning(
                    "Timeout downloading gs://%s/%s (attempt %d/%d)",
                    self.bucket_name, file_name, attempt + 1, max_retries,
                )
            except aiohttp.ClientResponseError as exc:
                last_exception = exc
                # Only rate-limit (429) and server (5xx) errors are
                # retried; any other HTTP error is re-raised immediately.
                if exc.status == 429 or exc.status >= 500:
                    logger.warning(
                        "HTTP %d downloading gs://%s/%s (attempt %d/%d)",
                        exc.status, self.bucket_name, file_name,
                        attempt + 1, max_retries,
                    )
                else:
                    raise
            if attempt < max_retries - 1:
                # Exponential backoff: 0.5s, 1s, 2s, ...
                delay = 0.5 * (2 ** attempt)
                await asyncio.sleep(delay)
        # NOTE(review): a TimeoutError is raised even when the last
        # failure was an HTTP error; the chained `from last_exception`
        # preserves the real cause.
        raise TimeoutError(
            f"Failed to download gs://{self.bucket_name}/{file_name} "
            f"after {max_retries} attempts"
        ) from last_exception

    def delete_files(self, path: str) -> None:
        """
        Deletes all files at the given location inside the remote bucket.
        If path is a single file, it will delete only that file.
        If path is a directory, it will delete all files in that directory.
        """
        # Prefix-based deletion: deleting "foo" also deletes "foobar" —
        # pass a trailing slash for directories if that distinction matters.
        blobs = self.storage_client.list_blobs(self.bucket_name, prefix=path)
        for blob in blobs:
            blob.delete()
            # Keep the local byte cache consistent with the bucket.
            self._cache.pop(blob.name, None)