latticelm/scripts/chat.py

#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "rich>=13.7.0",
#     "httpx>=0.27.0",
# ]
# ///

"""
Terminal chat interface for go-llm-gateway.

Usage:
    python chat.py
    python chat.py --url http://localhost:8080
    python chat.py --model gemini-2.0-flash-exp
    python chat.py --token $(gcloud auth print-identity-token)
"""

import argparse
import json
import sys
from typing import Optional

import httpx
from rich.console import Console
from rich.live import Live
from rich.markdown import Markdown
from rich.panel import Panel
from rich.prompt import Prompt
from rich.table import Table


class ChatClient:
    def __init__(self, base_url: str, token: Optional[str] = None):
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.messages = []
        self.console = Console()

    def _headers(self) -> dict:
        headers = {"Content-Type": "application/json"}
        if self.token:
            headers["Authorization"] = f"Bearer {self.token}"
        return headers

    def chat(self, user_message: str, model: str, stream: bool = True):
        """Send a chat message and get response."""
        # Add user message to history
        self.messages.append({
            "role": "user",
            "content": [{"type": "input_text", "text": user_message}]
        })

        payload = {
            "model": model,
            "input": self.messages,
            "stream": stream
        }

        if stream:
            return self._stream_response(payload, model)
        else:
            return self._sync_response(payload, model)

    def _sync_response(self, payload: dict, model: str) -> str:
        """Non-streaming response."""
        with self.console.status(f"[bold blue]Thinking ({model})..."):
            resp = httpx.post(
                f"{self.base_url}/v1/responses",
                json=payload,
                headers=self._headers(),
                timeout=60.0
            )
            resp.raise_for_status()

        data = resp.json()
        assistant_text = ""

        for msg in data.get("output", []):
            for block in msg.get("content", []):
                if block.get("type") == "output_text":
                    assistant_text += block.get("text", "")

        # Add to history
        self.messages.append({
            "role": "assistant",
            "content": [{"type": "output_text", "text": assistant_text}]
        })

        return assistant_text

    def _stream_response(self, payload: dict, model: str) -> str:
        """Streaming response with live rendering."""
        assistant_text = ""

        with httpx.stream(
            "POST",
            f"{self.base_url}/v1/responses",
            json=payload,
            headers=self._headers(),
            timeout=60.0
        ) as resp:
            resp.raise_for_status()

            with Live(console=self.console, refresh_per_second=10) as live:
                for line in resp.iter_lines():
                    if not line.startswith("data: "):
                        continue

                    data_str = line[6:]  # Remove "data: " prefix

                    try:
                        chunk = json.loads(data_str)
                    except json.JSONDecodeError:
                        continue

                    if chunk.get("done"):
                        break

                    delta = chunk.get("delta", {})
                    for block in delta.get("content", []):
                        if block.get("type") == "output_text":
                            assistant_text += block.get("text", "")

                    # Render markdown in real-time
                    live.update(Markdown(assistant_text))

        # Add to history
        self.messages.append({
            "role": "assistant",
            "content": [{"type": "output_text", "text": assistant_text}]
        })

        return assistant_text

    def clear_history(self):
        """Clear conversation history."""
        self.messages = []


def print_models_table(base_url: str, headers: dict):
    """Fetch and print available models from the gateway."""
    console = Console()
    try:
        resp = httpx.get(f"{base_url}/v1/models", headers=headers, timeout=10)
        resp.raise_for_status()
        data = resp.json().get("data", [])
    except Exception as e:
        console.print(f"[red]Failed to fetch models: {e}[/red]")
        return

    table = Table(title="Available Models", show_header=True, header_style="bold magenta")
    table.add_column("Provider", style="cyan")
    table.add_column("Model ID", style="green")

    for model in data:
        table.add_row(model.get("provider", ""), model.get("id", ""))

    console.print(table)


def main():
    parser = argparse.ArgumentParser(description="Chat with go-llm-gateway")
    parser.add_argument("--url", default="http://localhost:8080", help="Gateway URL")
    parser.add_argument("--model", default="gemini-2.0-flash-exp", help="Model to use")
    parser.add_argument("--token", help="Auth token (Bearer)")
    parser.add_argument("--no-stream", action="store_true", help="Disable streaming")
    args = parser.parse_args()

    console = Console()
    client = ChatClient(args.url, args.token)
    current_model = args.model
    stream_enabled = not args.no_stream

    # Welcome banner
    console.print(Panel.fit(
        "[bold cyan]go-llm-gateway Chat Interface[/bold cyan]\n"
        f"Connected to: [green]{args.url}[/green]\n"
        f"Model: [yellow]{current_model}[/yellow]\n"
        f"Streaming: [{'green' if stream_enabled else 'red'}]{stream_enabled}[/]\n\n"
        "Commands:\n"
        "  [bold]/model <name>[/bold] - Switch model\n"
        "  [bold]/models[/bold] - List available models\n"
        "  [bold]/stream[/bold] - Toggle streaming\n"
        "  [bold]/clear[/bold] - Clear conversation\n"
        "  [bold]/quit[/bold] or [bold]/exit[/bold] - Exit\n"
        "  [bold]/help[/bold] - Show this help",
        title="Welcome",
        border_style="cyan"
    ))

    # Model aliases
    model_aliases = {
        "gpt4": "gpt-4o",
        "gpt4-mini": "gpt-4o-mini",
        "o1": "o1",
        "claude": "claude-3-5-sonnet-20241022",
        "haiku": "claude-3-5-haiku-20241022",
        "gemini": "gemini-2.0-flash-exp",
        "gemini-pro": "gemini-1.5-pro"
    }

    while True:
        try:
            user_input = Prompt.ask("\n[bold blue]You[/bold blue]")

            if not user_input.strip():
                continue

            # Handle commands
            if user_input.startswith("/"):
                cmd_parts = user_input.split(maxsplit=1)
                cmd = cmd_parts[0].lower()

                if cmd in ["/quit", "/exit"]:
                    console.print("[yellow]Goodbye! 👋[/yellow]")
                    break

                elif cmd == "/help":
                    console.print(Panel(
                        "[bold]Commands:[/bold]\n"
                        "  /model <name> - Switch model\n"
                        "  /models - List available models\n"
                        "  /stream - Toggle streaming\n"
                        "  /clear - Clear conversation\n"
                        "  /quit - Exit",
                        title="Help",
                        border_style="cyan"
                    ))

                elif cmd == "/models":
                    print_models_table(args.url, client._headers())

                elif cmd == "/model":
                    if len(cmd_parts) < 2:
                        console.print("[red]Usage: /model <model-name>[/red]")
                        continue

                    new_model = cmd_parts[1]
                    # Check if it's an alias
                    new_model = model_aliases.get(new_model, new_model)
                    current_model = new_model
                    console.print(f"[green]Switched to model: {current_model}[/green]")

                elif cmd == "/stream":
                    stream_enabled = not stream_enabled
                    console.print(f"[green]Streaming {'enabled' if stream_enabled else 'disabled'}[/green]")

                elif cmd == "/clear":
                    client.clear_history()
                    console.print("[green]Conversation history cleared[/green]")

                else:
                    console.print(f"[red]Unknown command: {cmd}[/red]")

                continue

            # Send message to LLM
            try:
                console.print(f"\n[bold green]Assistant ({current_model})[/bold green]")
                response = client.chat(user_input, current_model, stream=stream_enabled)

                if not stream_enabled:
                    # For non-streaming, render markdown
                    console.print(Markdown(response))

            except httpx.HTTPStatusError as e:
                console.print(f"[bold red]Error {e.response.status_code}:[/bold red] {e.response.text}")
            except Exception as e:
                console.print(f"[bold red]Error:[/bold red] {e}")

        except KeyboardInterrupt:
            console.print("\n[yellow]Use /quit to exit[/yellow]")
        except EOFError:
            break


if __name__ == "__main__":
    main()