code-shredding

This commit is contained in:
2026-02-24 03:50:34 +00:00
parent 98a1b5939e
commit e8f407d313
71 changed files with 1025 additions and 2417 deletions

View File

@@ -260,30 +260,18 @@ embeddings = embedder.generate_embeddings_batch(texts, batch_size=10)
### **Opción 4: Almacenar en GCS**
```python
-from file_storage.google_cloud import GoogleCloudFileStorage
+import gcsfs
-storage = GoogleCloudFileStorage(bucket="mi-bucket")
+fs = gcsfs.GCSFileSystem()
# Subir archivo
-storage.upload_file(
-    file_path="local_file.md",
-    destination_blob_name="chunks/documento_0.md",
-    content_type="text/markdown"
-)
+fs.put("local_file.md", "mi-bucket/chunks/documento_0.md")
# Listar archivos
-files = storage.list_files(path="chunks/")
+files = fs.ls("mi-bucket/chunks/")
# Descargar archivo
-file_stream = storage.get_file_stream("chunks/documento_0.md")
-content = file_stream.read().decode("utf-8")
-```
-**CLI:**
-```bash
-file-storage upload local_file.md chunks/documento_0.md
-file-storage list chunks/
-file-storage download chunks/documento_0.md
+content = fs.cat_file("mi-bucket/chunks/documento_0.md").decode("utf-8")
```
---
@@ -340,10 +328,10 @@ vector-search delete mi-indice
## 🔄 Flujo Completo de Ejemplo
```python
+import gcsfs
from pathlib import Path
from chunker.contextual_chunker import ContextualChunker
from embedder.vertex_ai import VertexAIEmbedder
-from file_storage.google_cloud import GoogleCloudFileStorage
from llm.vertex_ai import VertexAILLM
# 1. Setup
@@ -354,7 +342,7 @@ embedder = VertexAIEmbedder(
project="mi-proyecto",
location="us-central1"
)
-storage = GoogleCloudFileStorage(bucket="mi-bucket")
+fs = gcsfs.GCSFileSystem()
# 2. Chunking
documents = chunker.process_path(Path("documento.pdf"))
@@ -368,10 +356,7 @@ for i, doc in enumerate(documents):
embedding = embedder.generate_embedding(doc["page_content"])
# Guardar contenido en GCS
-storage.upload_file(
-    file_path=f"temp_{chunk_id}.md",
-    destination_blob_name=f"contents/{chunk_id}.md"
-)
+fs.put(f"temp_{chunk_id}.md", f"mi-bucket/contents/{chunk_id}.md")
# Guardar vector (escribir a JSONL localmente, luego subir)
print(f"Chunk {chunk_id}: {len(embedding)} dimensiones")