# forked from innovacion/Mayacontigo

import time
import uuid
from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from langfuse import Langfuse
from pydantic import BaseModel

from api import services
from api.agent import Agent
from api.config import config

# Configure Langfuse
langfuse = Langfuse(
    public_key="pk-lf-49cb04b3-0c7d-475b-8105-ad8b8749ecdd",
    secret_key="sk-lf-e02fa322-c709-4d80-bef2-9cb279846a0c",
    host="https://ailogger.azurewebsites.net",
)
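
# A common alternative (sketch): keep the credentials out of source control
# and let the SDK pick them up from the standard LANGFUSE_PUBLIC_KEY,
# LANGFUSE_SECRET_KEY, and LANGFUSE_HOST environment variables:
#
#   langfuse = Langfuse()  # falls back to the LANGFUSE_* environment variables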


@asynccontextmanager
async def lifespan(_: FastAPI):
    # Open the MongoDB connection before the app starts serving requests.
    await config.init_mongo_db()
    yield


app = FastAPI(lifespan=lifespan)
agent = Agent()
@app.post("/api/v1/conversation")
|
|
async def create_conversation():
|
|
conversation_id = uuid.uuid4()
|
|
await services.create_conversation(conversation_id, agent.system_prompt)
|
|
return {"conversation_id": conversation_id}
|
|
|
|
|
|


class Message(BaseModel):
    conversation_id: uuid.UUID
    prompt: str
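
# Illustrative request body (the UUID is a placeholder):
#   {"conversation_id": "123e4567-e89b-12d3-a456-426614174000", "prompt": "Hello"}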
@app.post("/api/v1/message")
|
|
async def send(message: Message):
|
|
# Crear trace principal
|
|
trace = langfuse.trace(
|
|
name="chat_message",
|
|
session_id=str(message.conversation_id),
|
|
input={
|
|
"prompt": message.prompt,
|
|
"conversation_id": str(message.conversation_id),
|
|
},
|
|
)
|
|
|
|

    def b64_sse(func):
        # Wrap the chunk generator: forward every chunk to the client as a
        # server-sent event ("data: <json>\n\n") while collecting the text
        # chunks so the full response can be logged to Langfuse afterwards.
        async def wrapper(*args, **kwargs):
            response_parts = []
            start_time = time.time()

            async for chunk in func(*args, **kwargs):
                if chunk.type == "text" and chunk.content:
                    response_parts.append(str(chunk.content))

                content = chunk.model_dump_json()
                yield f"data: {content}\n\n"

            end_time = time.time()
            latency_ms = round((end_time - start_time) * 1000)
            full_response = "".join(response_parts)

            # Rough token estimate: ~1.3 tokens per whitespace-separated word.
            input_tokens = len(message.prompt.split()) * 1.3
            output_tokens = len(full_response.split()) * 1.3
            total_tokens = int(input_tokens + output_tokens)

            # Estimated cost at fixed per-1K-token rates (USD).
            cost_per_1k_input = 0.03
            cost_per_1k_output = 0.06
            total_cost = (input_tokens / 1000 * cost_per_1k_input) + (
                output_tokens / 1000 * cost_per_1k_output
            )

            # Attach the final output and the estimated usage to the trace.
            trace.update(
                output={"response": full_response},
                usage={
                    "input": int(input_tokens),
                    "output": int(output_tokens),
                    "total": total_tokens,
                    "unit": "TOKENS",
                },
            )

            # Record latency and estimated cost as scores on the trace.
            langfuse.score(
                trace_id=trace.id,
                name="latency",
                value=latency_ms,
                comment=f"Response time: {latency_ms}ms",
            )

            langfuse.score(
                trace_id=trace.id,
                name="cost",
                value=round(total_cost, 4),
                comment=f"Estimated cost: ${round(total_cost, 4)}",
            )

        return wrapper

    # Decorate the service stream and return it as an SSE response.
    sse_stream = b64_sse(services.stream)
    generator = sse_stream(agent, message.prompt, message.conversation_id)
    return StreamingResponse(generator, media_type="text/event-stream")
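

# Illustrative client for the streaming endpoint (a sketch, not part of the
# service; httpx and the localhost URL are assumptions):
#
#   import asyncio
#   import httpx
#
#   async def main():
#       payload = {"conversation_id": "<uuid>", "prompt": "Hello"}
#       async with httpx.AsyncClient(timeout=None) as client:
#           async with client.stream(
#               "POST", "http://localhost:8000/api/v1/message", json=payload
#           ) as response:
#               async for line in response.aiter_lines():
#                   if line.startswith("data: "):
#                       print(line[len("data: "):])
#
#   asyncio.run(main())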
@app.get("/")
|
|
async def health():
|
|
return {"status": "ok"}
|
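
# To run locally (assuming this module lives at api/main.py):
#   uvicorn api.main:app --reload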