code-shredding
This commit is contained in:
31
README.md
31
README.md
@@ -260,30 +260,18 @@ embeddings = embedder.generate_embeddings_batch(texts, batch_size=10)
|
||||
### **Opción 4: Almacenar en GCS**
|
||||
|
||||
```python
|
||||
-from file_storage.google_cloud import GoogleCloudFileStorage
+import gcsfs
|
||||
|
||||
-storage = GoogleCloudFileStorage(bucket="mi-bucket")
+fs = gcsfs.GCSFileSystem()
|
||||
|
||||
# Subir archivo
|
||||
-storage.upload_file(
-    file_path="local_file.md",
-    destination_blob_name="chunks/documento_0.md",
-    content_type="text/markdown"
-)
+fs.put("local_file.md", "mi-bucket/chunks/documento_0.md")
|
||||
|
||||
# Listar archivos
|
||||
-files = storage.list_files(path="chunks/")
+files = fs.ls("mi-bucket/chunks/")
|
||||
|
||||
# Descargar archivo
|
||||
-file_stream = storage.get_file_stream("chunks/documento_0.md")
-content = file_stream.read().decode("utf-8")
-```
-
-**CLI:**
-```bash
-file-storage upload local_file.md chunks/documento_0.md
-file-storage list chunks/
-file-storage download chunks/documento_0.md
+content = fs.cat_file("mi-bucket/chunks/documento_0.md").decode("utf-8")
|
||||
```
|
||||
|
||||
---
|
||||
@@ -340,10 +328,10 @@ vector-search delete mi-indice
|
||||
## 🔄 Flujo Completo de Ejemplo
|
||||
|
||||
```python
|
||||
+import gcsfs
|
||||
from pathlib import Path
|
||||
from chunker.contextual_chunker import ContextualChunker
|
||||
from embedder.vertex_ai import VertexAIEmbedder
|
||||
-from file_storage.google_cloud import GoogleCloudFileStorage
|
||||
from llm.vertex_ai import VertexAILLM
|
||||
|
||||
# 1. Setup
|
||||
@@ -354,7 +342,7 @@ embedder = VertexAIEmbedder(
|
||||
project="mi-proyecto",
|
||||
location="us-central1"
|
||||
)
|
||||
-storage = GoogleCloudFileStorage(bucket="mi-bucket")
+fs = gcsfs.GCSFileSystem()
|
||||
|
||||
# 2. Chunking
|
||||
documents = chunker.process_path(Path("documento.pdf"))
|
||||
@@ -368,10 +356,7 @@ for i, doc in enumerate(documents):
|
||||
embedding = embedder.generate_embedding(doc["page_content"])
|
||||
|
||||
# Guardar contenido en GCS
|
||||
-    storage.upload_file(
-        file_path=f"temp_{chunk_id}.md",
-        destination_blob_name=f"contents/{chunk_id}.md"
-    )
+    fs.put(f"temp_{chunk_id}.md", f"mi-bucket/contents/{chunk_id}.md")
|
||||
|
||||
# Guardar vector (escribir a JSONL localmente, luego subir)
|
||||
print(f"Chunk {chunk_id}: {len(embedding)} dimensiones")
|
||||
|
||||
Reference in New Issue
Block a user