# forked from innovacion/Mayacontigo

import time
import uuid
from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from langfuse import Langfuse
from pydantic import BaseModel

from api import services
from api.agent import Agent
from api.config import config

# Configure Langfuse
langfuse = Langfuse(
    public_key="pk-lf-49cb04b3-0c7d-475b-8105-ad8b8749ecdd",
    secret_key="sk-lf-e02fa322-c709-4d80-bef2-9cb279846a0c",
    host="https://ailogger.azurewebsites.net",
)
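
# A common alternative (sketch): keep the credentials out of source control
# and let the SDK pick them up from the standard LANGFUSE_PUBLIC_KEY,
# LANGFUSE_SECRET_KEY, and LANGFUSE_HOST environment variables:
#
#   langfuse = Langfuse()  # falls back to the LANGFUSE_* environment variables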


@asynccontextmanager
async def lifespan(_: FastAPI):
    # Open the MongoDB connection before the app starts serving requests.
    await config.init_mongo_db()
    yield


app = FastAPI(lifespan=lifespan)
agent = Agent()
@app.post("/api/v1/conversation")
|
|
async def create_conversation():
|
|
conversation_id = uuid.uuid4()
|
|
await services.create_conversation(conversation_id, agent.system_prompt)
|
|
return {"conversation_id": conversation_id}
|
|
|
|
|
|


class Message(BaseModel):
    conversation_id: uuid.UUID
    prompt: str
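
# Illustrative request body (the UUID is a placeholder):
#   {"conversation_id": "123e4567-e89b-12d3-a456-426614174000", "prompt": "Hello"}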
@app.post("/api/v1/message")
|
|
async def send(message: Message):
|
|
# Crear trace principal
|
|
trace = langfuse.trace(
|
|
name="chat_message",
|
|
session_id=str(message.conversation_id),
|
|
input={
|
|
"prompt": message.prompt,
|
|
"conversation_id": str(message.conversation_id),
|
|
},
|
|
)
|
|
|
|

    def b64_sse(func):
        # Wrap the chunk generator: forward every chunk to the client as a
        # server-sent event ("data: <json>\n\n") while collecting the text
        # chunks so the full response can be logged to Langfuse afterwards.
        async def wrapper(*args, **kwargs):
            response_parts = []
            start_time = time.time()

            async for chunk in func(*args, **kwargs):
                if chunk.type == "text" and chunk.content:
                    response_parts.append(str(chunk.content))

                content = chunk.model_dump_json()
                yield f"data: {content}\n\n"

            end_time = time.time()
            latency_ms = round((end_time - start_time) * 1000)
            full_response = "".join(response_parts)

            # Rough token estimate: ~1.3 tokens per whitespace-separated word.
            input_tokens = len(message.prompt.split()) * 1.3
            output_tokens = len(full_response.split()) * 1.3
            total_tokens = int(input_tokens + output_tokens)

            # Estimated cost at fixed per-1K-token rates (USD).
            cost_per_1k_input = 0.03
            cost_per_1k_output = 0.06
            total_cost = (input_tokens / 1000 * cost_per_1k_input) + (
                output_tokens / 1000 * cost_per_1k_output
            )

            # Attach the final output and the estimated usage to the trace.
            trace.update(
                output={"response": full_response},
                usage={
                    "input": int(input_tokens),
                    "output": int(output_tokens),
                    "total": total_tokens,
                    "unit": "TOKENS",
                },
            )

            # Record latency and estimated cost as scores on the trace.
            langfuse.score(
                trace_id=trace.id,
                name="latency",
                value=latency_ms,
                comment=f"Response time: {latency_ms}ms",
            )

            langfuse.score(
                trace_id=trace.id,
                name="cost",
                value=round(total_cost, 4),
                comment=f"Estimated cost: ${round(total_cost, 4)}",
            )

        return wrapper

    # Decorate the service stream and return it as an SSE response.
    sse_stream = b64_sse(services.stream)
    generator = sse_stream(agent, message.prompt, message.conversation_id)
    return StreamingResponse(generator, media_type="text/event-stream")
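

# Illustrative client for the streaming endpoint (a sketch, not part of the
# service; httpx and the localhost URL are assumptions):
#
#   import asyncio
#   import httpx
#
#   async def main():
#       payload = {"conversation_id": "<uuid>", "prompt": "Hello"}
#       async with httpx.AsyncClient(timeout=None) as client:
#           async with client.stream(
#               "POST", "http://localhost:8000/api/v1/message", json=payload
#           ) as response:
#               async for line in response.aiter_lines():
#                   if line.startswith("data: "):
#                       print(line[len("data: "):])
#
#   asyncio.run(main())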
@app.get("/")
|
|
async def health():
|
|
return {"status": "ok"}
|
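
# To run locally (assuming this module lives at api/main.py):
#   uvicorn api.main:app --reload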