#!/usr/bin/env python3
"""
TheOldLLM Proxy - v4.1.0
"""

import json
import uuid
import time
import os
import logging
import traceback
from typing import Optional, List, Dict, AsyncGenerator
from pathlib import Path
from datetime import datetime
from contextlib import asynccontextmanager

import httpx
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse, JSONResponse, Response, HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
import uvicorn

# === Version ===
VERSION = "4.1.0"

# === Logging ===
# Root logger configuration: INFO level, time-only timestamps.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)

# === Configuration (Environment Variables) ===
# Every value below is read once, when this module is imported.
UPSTREAM_ORIGIN = os.getenv("UPSTREAM_ORIGIN", "https://theoldllm.vercel.app")
# Provider endpoints default to paths under UPSTREAM_ORIGIN but can be
# overridden independently.
PROXY_API_P9 = os.getenv("PROXY_API_P9", f"{UPSTREAM_ORIGIN}/api/proxy?provider=p9")
PROXY_API_P8 = os.getenv("PROXY_API_P8", f"{UPSTREAM_ORIGIN}/api/proxy?provider=p8")

DEFAULT_CHAT_MODEL = os.getenv("DEFAULT_CHAT_MODEL", "gpt-4o")
# int()/float() raise ValueError at import time if the variable is malformed.
PORT = int(os.getenv("PORT", "8001"))
REQUEST_TIMEOUT = float(os.getenv("REQUEST_TIMEOUT", "300.0"))

# --- DEBUG OPTION ---
# Only the (case-insensitive) string "true" enables debug mode.
DEBUG_MODE = os.getenv("DEBUG_MODE", "false").lower() == "true"
DEBUG_DIR = Path(os.getenv("DEBUG_DIR", "debug_logs"))

# --- THINKING OUTPUT MODE ---
# Options:
#   "tagged"      - Wrap thinking in <think></think> tags [DEFAULT]
#   "merged"      - Stream everything as content (no separation)
#   "hidden"      - Only show final answer, hide thinking
#   "xml"         - Wrap in <reasoning></reasoning> tags (Claude style)
#   "markdown"    - Use markdown blockquote for thinking
#   "separated"   - Clear separator line between thinking and answer
#   "labeled"     - Add 💭/💬 labels to distinguish sections
#   "details"     - HTML <details> tag (collapsible)
#   "brackets"    - Use【】brackets for thinking
#   "italics"     - Italicize thinking content
# The value is trimmed and lower-cased so that e.g. "Tagged" or " tagged "
# selects the intended mode instead of silently matching none of them.
THINKING_OUTPUT_MODE = os.getenv("THINKING_OUTPUT_MODE", "tagged").strip().lower()

# Custom separator (used in "separated" mode)
THINKING_SEPARATOR = os.getenv("THINKING_SEPARATOR", "\n\n---\n\n")

# Custom thinking prefix/suffix, used by modes that define no marker of
# their own (empty string means "no marker").
THINKING_START = os.getenv("THINKING_START", "")
THINKING_END = os.getenv("THINKING_END", "")

# === Models ===
# Each entry maps a public model id to the upstream model name ("llmVersion").
# Optional keys:
#   type: 0 = P9 (default when the key is absent)
#   type: 2 = P8 (Kimi, Mistral Nemotron, Qwen)
#   thinking: True = Model uses thinking/reasoning (needs special handling)

ALL_MODELS = [
    # === GPT-5 Series ===
    {"id": "gpt-5.2", "name": "GPT-5.2", "llmVersion": "gpt-5.2"},
    {"id": "gpt-5.2-chat-latest", "name": "GPT-5.2 Chat Latest", "llmVersion": "gpt-5.2-chat-latest"},
    {"id": "gpt-5.1", "name": "GPT-5.1", "llmVersion": "gpt-5.1"},
    {"id": "gpt-5.1-chat-latest", "name": "GPT-5.1 Chat Latest", "llmVersion": "gpt-5.1-chat-latest"},
    {"id": "gpt-5", "name": "GPT-5", "llmVersion": "gpt-5"},
    {"id": "gpt-5-mini", "name": "GPT-5 Mini", "llmVersion": "gpt-5-mini"},
    {"id": "gpt-5-nano", "name": "GPT-5 Nano", "llmVersion": "gpt-5-nano"},
    {"id": "gpt-5-chat", "name": "GPT-5 Chat", "llmVersion": "gpt-5-chat"},
    {"id": "gpt-5-chat-latest", "name": "GPT-5 Chat Latest", "llmVersion": "gpt-5-chat-latest"},

    # === O-Series ===
    {"id": "o4-mini", "name": "O4 Mini", "llmVersion": "o4-mini"},
    {"id": "o3", "name": "O3", "llmVersion": "o3"},
    {"id": "o3-mini", "name": "O3 Mini", "llmVersion": "o3-mini"},
    {"id": "o1", "name": "O1", "llmVersion": "o1"},

    # === GPT-4 Series ===
    {"id": "gpt-4.1", "name": "GPT-4.1", "llmVersion": "gpt-4.1"},
    {"id": "gpt-4.1-mini", "name": "GPT-4.1 Mini", "llmVersion": "gpt-4.1-mini"},
    {"id": "gpt-4.1-nano", "name": "GPT-4.1 Nano", "llmVersion": "gpt-4.1-nano"},
    {"id": "gpt-4o", "name": "GPT-4o", "llmVersion": "gpt-4o"},
    {"id": "gpt-4o-mini", "name": "GPT-4o Mini", "llmVersion": "gpt-4o-mini"},
    {"id": "gpt-4o-search-preview", "name": "GPT-4o Search Preview", "llmVersion": "gpt-4o-search-preview"},
    {"id": "gpt-4o-search-mini-preview", "name": "GPT-4o Search Mini Preview", "llmVersion": "gpt-4o-search-mini-preview"},
    {"id": "gpt-4-turbo", "name": "GPT-4 Turbo", "llmVersion": "gpt-4-turbo"},
    {"id": "gpt-4-turbo-preview", "name": "GPT-4 Turbo Preview", "llmVersion": "gpt-4-turbo-preview"},
    {"id": "gpt-4", "name": "GPT-4", "llmVersion": "gpt-4"},
    {"id": "chatgpt-4o-latest", "name": "ChatGPT 4o Latest", "llmVersion": "chatgpt-4o-latest"},

    # === GPT-3.5 Series ===
    {"id": "gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "llmVersion": "gpt-3.5-turbo"},

    # === GPT OSS ===
    {"id": "gpt-oss-120b", "name": "GPT OSS 120b", "llmVersion": "gpt-oss-120b"},
    {"id": "gpt-oss-20b", "name": "GPT OSS 20b", "llmVersion": "gpt-oss-20b"},

    # === Claude Opus ===
    {"id": "claude-opus-4.5", "name": "Claude Opus 4.5", "llmVersion": "claude-opus-4.5"},
    {"id": "claude-opus-4.1", "name": "Claude Opus 4.1", "llmVersion": "claude-opus-4.1"},
    {"id": "claude-opus-4", "name": "Claude Opus 4", "llmVersion": "claude-opus-4"},

    # === Claude Sonnet ===
    {"id": "claude-sonnet-4.5", "name": "Claude Sonnet 4.5", "llmVersion": "claude-sonnet-4.5"},
    {"id": "claude-sonnet-4", "name": "Claude Sonnet 4", "llmVersion": "claude-sonnet-4"},
    {"id": "claude-3.7-sonnet", "name": "Claude 3.7 Sonnet", "llmVersion": "claude-3.7-sonnet"},
    {"id": "claude-3.5-sonnet", "name": "Claude 3.5 Sonnet", "llmVersion": "claude-3.5-sonnet"},

    # === Claude Haiku ===
    {"id": "claude-haiku-4.5", "name": "Claude Haiku 4.5", "llmVersion": "claude-haiku-4.5"},
    {"id": "claude-3.5-haiku", "name": "Claude 3.5 Haiku", "llmVersion": "claude-3.5-haiku"},

    # === Claude 3 ===
    {"id": "claude-3-opus", "name": "Claude 3 Opus", "llmVersion": "claude-3-opus"},
    {"id": "claude-3-haiku", "name": "Claude 3 Haiku", "llmVersion": "claude-3-haiku"},

    # === Gemini ===
    {"id": "gemini-3-pro-preview", "name": "Gemini 3 Pro (Preview)", "llmVersion": "gemini-3-pro-preview"},
    {"id": "gemini-3-flash-preview", "name": "Gemini 3 Flash (Preview)", "llmVersion": "gemini-3-flash-preview"},
    {"id": "gemini-2.5-pro", "name": "Gemini 2.5 Pro", "llmVersion": "gemini-2.5-pro", "thinking": True},
    {"id": "gemini-2.5-flash", "name": "Gemini 2.5 Flash", "llmVersion": "gemini-2.5-flash", "thinking": True},
    {"id": "gemini-2.5-flash-lite", "name": "Gemini 2.5 Flash Lite", "llmVersion": "gemini-2.5-flash-lite"},
    {"id": "gemini-2.0-flash", "name": "Gemini 2.0 Flash", "llmVersion": "gemini-2.0-flash"},
    {"id": "gemini-2.0-flash-001", "name": "Gemini 2.0 Flash 001", "llmVersion": "gemini-2.0-flash-001"},
    {"id": "gemini-2.0-flash-lite", "name": "Gemini 2.0 Flash Lite", "llmVersion": "gemini-2.0-flash-lite"},
    {"id": "gemini-2.0-flash-lite-001", "name": "Gemini 2.0 Flash Lite 001", "llmVersion": "gemini-2.0-flash-lite-001"},

    # === DeepSeek (only the R1 family is flagged as thinking) ===
    {"id": "deepseek-v3.1-terminus", "name": "DeepSeek V3.1 Terminus", "llmVersion": "deepseek-v3.1-terminus"},
    {"id": "deepseek-v3.2", "name": "DeepSeek V3.2", "llmVersion": "deepseek-v3.2"},
    {"id": "deepseek-v3.1", "name": "DeepSeek V3.1", "llmVersion": "deepseek-v3.1"},
    # Display name used to read "DeepSeek V3.1", colliding with the entry above.
    {"id": "deepseek-v3", "name": "DeepSeek V3", "llmVersion": "deepseek-v3"},
    {"id": "deepseek-r1", "name": "DeepSeek R1", "llmVersion": "deepseek-r1", "thinking": True},
    {"id": "deepseek-r1-0528", "name": "DeepSeek R1 0528", "llmVersion": "deepseek-r1-0528", "thinking": True},
    {"id": "deepseek-prover-v2", "name": "DeepSeek Prover V2", "llmVersion": "deepseek-prover-v2"},
    {"id": "deepseek-r1t2-chimera:free", "name": "DeepSeek R1T2 Chimera Free", "llmVersion": "deepseek-r1t2-chimera:free", "thinking": True},

    # === GLM (Thinking Models) ===
    {"id": "glm-4.5", "name": "GLM 4.5", "llmVersion": "glm-4.5", "thinking": True},
    {"id": "glm-4.6", "name": "GLM 4.6", "llmVersion": "glm-4.6", "thinking": True},
    {"id": "glm-4.7", "name": "GLM 4.7", "llmVersion": "glm-4.7", "thinking": True},
    {"id": "glm-4.5-air", "name": "GLM 4.5 Air", "llmVersion": "glm-4.5-air", "thinking": True},
    {"id": "glm-4.5-air:free", "name": "GLM 4.5 Air Free", "llmVersion": "glm-4.5-air:free", "thinking": True},

    # === Grok ===
    {"id": "grok-4.1-fast", "name": "Grok 4.1 Fast", "llmVersion": "grok-4.1-fast"},
    {"id": "grok-4", "name": "Grok 4", "llmVersion": "grok-4"},
    {"id": "grok-3", "name": "Grok 3", "llmVersion": "grok-3"},
    {"id": "grok-code-fast-1", "name": "Grok Code Fast 1", "llmVersion": "grok-code-fast-1"},

    # === Kimi (P8/P9) ===
    {"id": "kimi-k2-thinking", "name": "Kimi K2 Thinking", "llmVersion": "kimi-k2-thinking", "type": 2, "thinking": True},
    {"id": "kimi-k2-instruct", "name": "Kimi K2 Instruct", "llmVersion": "kimi-k2-instruct", "type": 2},
    {"id": "kimi-k2", "name": "Kimi K2", "llmVersion": "kimi-k2"},
    {"id": "kimi-k2-0905", "name": "Kimi K2 0905", "llmVersion": "kimi-k2-0905"},

    # === Mistral (P8/P9) ===
    {"id": "mistral-nemotron", "name": "Mistral Nemotron", "llmVersion": "mistral-nemotron", "type": 2},
    {"id": "mistral-nemo", "name": "Mistral Nemo", "llmVersion": "mistral-nemo"},
    {"id": "devstral-2512:free", "name": "Devstral 2512 Free", "llmVersion": "devstral-2512:free"},

    # === Qwen (P8/P9 - Some Thinking) ===
    {"id": "qwen3-235b-a22b-07-25", "name": "Qwen 3 Next 235B A22B 07 25", "llmVersion": "qwen3-235b-a22b-07-25"},
    {"id": "qwen3-32b", "name": "Qwen 3 32B", "llmVersion": "qwen3-32b", "thinking": True},
    {"id": "qwen3-next-80b-a3b-thinking", "name": "Qwen 3 80B Next A3B Thinking", "llmVersion": "qwen3-next-80b-a3b-thinking", "thinking": True},
    # NOTE(review): the only entry whose llmVersion differs from its id — presumably the name the P8 upstream expects; confirm.
    {"id": "qwen3-next-80b-a3b-instruct", "name": "Qwen 3 Next 80B Instruct", "llmVersion": "qwen/qwen3-next-80b-a3b-instruct", "type": 2},

    # === MiniMax ===
    {"id": "minimax-m2", "name": "MiniMax M2", "llmVersion": "minimax-m2", "thinking": True},

    # === Meta Llama ===
    {"id": "llama-3.3-70b-instruct", "name": "Llama 3.3 70b Instruct", "llmVersion": "llama-3.3-70b-instruct"},
]

# Build lookup dict for faster access (model id -> catalogue entry)
MODEL_LOOKUP: Dict[str, Dict] = {m["id"]: m for m in ALL_MODELS}


# === Shared HTTP Client ===
# Module-level handle to the httpx.AsyncClient. It is None until the
# `lifespan` handler assigns a client at startup; the same handler closes it
# again on shutdown.
http_client: Optional[httpx.AsyncClient] = None


# === Thinking Output Formatters ===
class ThinkingFormatter:
    """Format thinking/reasoning text and answer text according to a mode.

    One instance tracks the state of a single streamed response: whether the
    opening thinking marker has been emitted and whether the first answer
    chunk has been seen. Call :meth:`reset` (or create a new instance) before
    reusing it for another stream. :meth:`format_complete` is stateless and
    serves non-streaming responses.

    Modes without a dedicated marker (``merged``, ``separated`` and any
    unknown value) fall back to the module-level ``THINKING_START`` /
    ``THINKING_END`` strings, which are empty unless configured.
    """

    # Human-readable description and sample output for each supported mode.
    MODE_INFO = {
        "tagged": {
            "description": "Wrap thinking in <think></think> tags [DEFAULT]",
            "example": "<think>thinking...</think>\n\nanswer"
        },
        "merged": {
            "description": "Stream everything as content (no visual separation)",
            "example": "thinking... answer"
        },
        "hidden": {
            "description": "Only show final answer, hide all thinking",
            "example": "answer"
        },
        "xml": {
            "description": "Wrap in <reasoning></reasoning> tags (Claude style)",
            "example": "<reasoning>thinking...</reasoning>\n\nanswer"
        },
        "markdown": {
            "description": "Use markdown blockquote for thinking",
            "example": "> thinking...\n\nanswer"
        },
        "separated": {
            "description": "Clear separator line between thinking and answer",
            "example": "thinking...\n\n---\n\nanswer"
        },
        "labeled": {
            "description": "Add emoji labels to distinguish sections",
            "example": "💭 **Thinking:**\nthinking...\n\n💬 **Answer:**\nanswer"
        },
        "details": {
            "description": "HTML <details> tag (collapsible in some clients)",
            "example": "<details><summary>Thinking</summary>thinking...</details>\n\nanswer"
        },
        "brackets": {
            "description": "Use 【】brackets for thinking",
            "example": "【thinking...】\n\nanswer"
        },
        "italics": {
            "description": "Italicize thinking content",
            "example": "*thinking...*\n\nanswer"
        }
    }

    # Text emitted before the first thinking chunk, per mode.
    _START_MARKERS = {
        "tagged": "<think>\n",
        "xml": "<reasoning>\n",
        "markdown": "> ",
        "labeled": "💭 **Thinking:**\n",
        "details": "<details><summary>💭 Thinking Process</summary>\n\n",
        "brackets": "【",
        "italics": "*",
    }

    # Text emitted after the last thinking chunk, per mode.
    _END_MARKERS = {
        "tagged": "\n</think>",
        "xml": "\n</reasoning>",
        "markdown": "",
        "labeled": "",
        "details": "\n\n</details>",
        "brackets": "】",
        "italics": "*",
    }

    def __init__(self, mode: str = "tagged"):
        """Create a formatter for ``mode`` with fresh stream state."""
        self.mode = mode
        self.reset()

    def reset(self):
        """Reset state for new stream."""
        self.thinking_started = False
        self.thinking_ended = False
        self.first_thinking_chunk = True
        self.first_content_chunk = True

    def _marker(self, table: Dict[str, str]) -> Optional[str]:
        """Return this mode's entry in ``table``, or None if it has none."""
        # The mode may not be a string (it can be supplied by callers rather
        # than the environment); anything unhashable must count as "unknown"
        # instead of raising inside dict.get().
        if isinstance(self.mode, str):
            return table.get(self.mode)
        return None

    def format_thinking_start(self) -> str:
        """Return prefix for thinking section."""
        marker = self._marker(self._START_MARKERS)
        if marker is not None:
            return marker
        return THINKING_START or ""

    def format_thinking_end(self) -> str:
        """Return suffix for thinking section."""
        marker = self._marker(self._END_MARKERS)
        if marker is not None:
            return marker
        return THINKING_END or ""

    def format_transition(self) -> str:
        """Return separator between thinking and answer."""
        mode = self.mode
        if mode in ("hidden", "merged"):
            return ""
        if mode == "separated":
            return THINKING_SEPARATOR
        if mode == "labeled":
            return "\n\n💬 **Answer:**\n"
        return "\n\n"

    def format_thinking_chunk(self, text: str) -> str:
        """Format a chunk of thinking content.

        Returns an empty string in ``hidden`` mode. The first chunk of a
        stream is prefixed with the mode's opening marker.
        """
        if self.mode == "hidden":
            return ""

        prefix = ""
        if self.first_thinking_chunk:
            prefix = self.format_thinking_start()
            self.first_thinking_chunk = False
            self.thinking_started = True

        if self.mode == "markdown":
            # Keep every continuation line inside the blockquote.
            text = text.replace('\n', '\n> ')

        return prefix + text

    def format_content_chunk(self, text: str, has_thinking: bool) -> str:
        """Format a chunk of answer content.

        On the first answer chunk of a stream that actually emitted thinking
        text, the thinking section is closed and the transition is inserted.
        When nothing was emitted as thinking (no reasoning arrived, or the
        mode is ``hidden``) the answer is returned untouched, matching what
        :meth:`format_complete` does for an empty ``thinking`` argument.
        """
        if not (has_thinking and self.first_content_chunk):
            return text

        self.first_content_chunk = False
        if not self.thinking_started:
            return text

        lead = ""
        if not self.thinking_ended:
            lead = self.format_thinking_end()
            self.thinking_ended = True
        return lead + self.format_transition() + text

    def format_complete(self, thinking: str, content: str) -> str:
        """Format complete thinking + content (non-streaming).

        Returns ``content`` unchanged in ``hidden`` mode or when ``thinking``
        is empty; otherwise the wrapped thinking text followed, if ``content``
        is non-empty, by the transition and the answer.
        """
        if self.mode == "hidden" or not thinking:
            return content

        if self.mode == "merged":
            return thinking + "\n\n" + content if content else thinking

        body = thinking.replace('\n', '\n> ') if self.mode == "markdown" else thinking
        result = self.format_thinking_start() + body + self.format_thinking_end()

        if content:
            result += self.format_transition() + content

        return result


# === Helper Functions ===
def get_headers(auth_header: Optional[str] = None) -> Dict[str, str]:
    """Generate browser-like headers. Passes through auth if provided.

    Args:
        auth_header: Value to send as the ``authorization`` header. The
            header is omitted when this is None or empty.

    Returns:
        A new dict of request headers. ``Host``, ``referer`` and ``origin``
        all follow ``UPSTREAM_ORIGIN`` so they stay consistent when the
        origin is overridden.
    """
    from urllib.parse import urlsplit

    # Derive Host from the configured origin instead of hard-coding it; fall
    # back to the historical default if the origin has no parseable netloc
    # (for example when it was configured without a scheme).
    upstream_host = urlsplit(UPSTREAM_ORIGIN).netloc or "theoldllm.vercel.app"

    headers = {
        "Host": upstream_host,
        "connection": "keep-alive",
        "pragma": "no-cache",
        "cache-control": "no-cache",
        "sec-ch-ua": '"Chromium";v="137", "Not/A)Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "dnt": "1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
        "accept": "*/*",
        "content-type": "application/json",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "referer": f"{UPSTREAM_ORIGIN}/",
        "origin": UPSTREAM_ORIGIN,
        "accept-language": "en-US,en;q=0.9",
        "priority": "u=1, i",
    }

    if auth_header:
        headers["authorization"] = auth_header

    return headers


def get_model_config(model_id: str) -> Dict:
    """Resolve a public model id to its routing configuration.

    Returns a dict with ``version`` (upstream model name), ``type``
    (provider selector, 0 when unspecified) and ``thinking`` (bool flag).
    Ids missing from ``MODEL_LOOKUP`` are passed through unchanged as a
    non-thinking type-0 model.
    """
    entry = MODEL_LOOKUP.get(model_id)
    if entry:
        return {
            "version": entry.get("llmVersion", model_id),
            "type": entry.get("type", 0),
            "thinking": entry.get("thinking", False),
        }
    # Unknown model: forward the requested id as-is.
    return {"version": model_id, "type": 0, "thinking": False}


def validate_messages(messages: List) -> tuple:
    """Validate message format. Returns (is_valid, error_message).

    Every message must be a dict with a ``role`` drawn from
    system/user/assistant/tool/function. ``content`` is required too, except
    on an assistant message that carries ``tool_calls`` or ``function_call``:
    the OpenAI chat schema lets such messages omit ``content``.

    Returns:
        ``(True, None)`` when the list is acceptable, otherwise
        ``(False, reason)`` describing the first problem found.
    """
    if not isinstance(messages, list):
        return False, "messages must be a list"

    if not messages:
        return False, "messages cannot be empty"

    allowed_roles = ("system", "user", "assistant", "tool", "function")

    for i, msg in enumerate(messages):
        if not isinstance(msg, dict):
            return False, f"message at index {i} must be an object"

        if "role" not in msg:
            return False, f"message at index {i} missing 'role'"

        role = msg.get("role")

        # An assistant turn that only requests a tool/function call
        # legitimately has no text content.
        is_call_only = role == "assistant" and bool(
            msg.get("tool_calls") or msg.get("function_call")
        )
        if "content" not in msg and not is_call_only:
            return False, f"message at index {i} missing 'content'"

        if role not in allowed_roles:
            return False, f"message at index {i} has invalid role: {role}"

    return True, None


def format_messages(messages: List[Dict]) -> List[Dict]:
    """Build the message list sent upstream.

    Each output message always carries ``role`` and ``content`` (an empty
    string when the input has no ``content`` key) and copies ``name``,
    ``tool_calls``, ``tool_call_id`` and ``function_call`` only when the
    input message defines them. All other keys are dropped.
    """
    passthrough_keys = ("name", "tool_calls", "tool_call_id", "function_call")

    def _convert(source: Dict) -> Dict:
        converted = {
            "role": source.get("role"),
            "content": source.get("content", ""),
        }
        converted.update((key, source[key]) for key in passthrough_keys if key in source)
        return converted

    return [_convert(message) for message in messages]


def extract_content_from_chunk(data: Dict, is_thinking_model: bool = False) -> tuple:
    """
    Pull answer text and reasoning text out of one streaming chunk.

    Lookup order for the answer text: ``choices[0].delta.content``, then
    ``choices[0].message.content``, then ``obj.content``, then the top-level
    ``content``. Reasoning is only looked for when ``is_thinking_model`` is
    true: first under the delta keys ``reasoning`` / ``reasoning_content`` /
    ``thinking`` / ``thought``, then by joining the ``text`` fields of
    ``delta.reasoning_details``, and finally (only if the delta had no
    content) under the message keys.

    Returns (content, reasoning); content is "" and reasoning is None when
    nothing was found.
    """
    if not isinstance(data, dict):
        return "", None

    text = ""
    thoughts = None

    choice_list = data.get("choices", [])
    head = choice_list[0] if choice_list and isinstance(choice_list, list) else None

    if isinstance(head, dict):
        delta = head.get("delta", {})
        if isinstance(delta, dict):
            text = delta.get("content") or ""

            if is_thinking_model:
                delta_keys = ("reasoning", "reasoning_content", "thinking", "thought")
                thoughts = next((delta[key] for key in delta_keys if delta.get(key)), None)

                if not thoughts:
                    details = delta.get("reasoning_details", [])
                    if details and isinstance(details, list):
                        fragments = [
                            item.get("text", "") for item in details if isinstance(item, dict)
                        ]
                        fragments = [fragment for fragment in fragments if fragment]
                        if fragments:
                            thoughts = "".join(fragments)

        # Some chunks carry a full `message` instead of a `delta`.
        message = head.get("message", {})
        if isinstance(message, dict) and not text:
            text = message.get("content") or ""
            if is_thinking_model and not thoughts:
                message_keys = ("reasoning", "reasoning_content", "thinking")
                thoughts = next((message[key] for key in message_keys if message.get(key)), None)

    if not text:
        wrapped = data.get("obj", {})
        if isinstance(wrapped, dict):
            text = wrapped.get("content") or ""

    if not text:
        text = data.get("content") or ""

    return text, thoughts


def extract_content_from_response(data: Dict, is_thinking_model: bool = False) -> tuple:
    """Extract content from a non-streaming response.

    Answer text is taken from ``choices[0].message.content``, then the
    top-level ``content``, then ``obj.content``. Reasoning is only looked for
    when ``is_thinking_model`` is true, using the same sources as the
    streaming extractor: the ``reasoning`` / ``reasoning_content`` /
    ``thinking`` / ``thought`` keys, then the joined ``text`` fields of
    ``reasoning_details``.

    Returns:
        ``(content, reasoning)``; content is "" and reasoning is None when
        nothing was found.
    """
    content = ""
    reasoning = None

    if not isinstance(data, dict):
        return content, reasoning

    choices = data.get("choices", [])
    if choices and isinstance(choices, list):
        choice = choices[0]
        if isinstance(choice, dict):
            message = choice.get("message", {})
            if isinstance(message, dict):
                content = message.get("content") or ""

                if is_thinking_model:
                    reasoning = (
                        message.get("reasoning") or
                        message.get("reasoning_content") or
                        message.get("thinking") or
                        message.get("thought") or
                        None
                    )

                    # Structured reasoning blocks are used only when no plain
                    # reasoning string is present, mirroring the chunk path.
                    if not reasoning:
                        reasoning_details = message.get("reasoning_details", [])
                        if reasoning_details and isinstance(reasoning_details, list):
                            texts = [
                                detail.get("text", "")
                                for detail in reasoning_details
                                if isinstance(detail, dict)
                            ]
                            texts = [text for text in texts if text]
                            if texts:
                                reasoning = "".join(texts)

    if not content:
        content = data.get("content") or ""

    if not content:
        obj = data.get("obj", {})
        if isinstance(obj, dict):
            content = obj.get("content") or ""

    return content, reasoning


def save_debug_json(payload: dict, mode: str, extra_info: Optional[str] = None):
    """Write an outgoing request payload to DEBUG_DIR as pretty-printed JSON.

    Does nothing unless DEBUG_MODE is on. The file name is built from a
    microsecond timestamp, ``mode``, the payload's ``model`` value (with "/"
    and ":" replaced by "_") and the optional ``extra_info`` suffix. Any
    failure is logged and swallowed so debugging never breaks a request.
    """
    if DEBUG_MODE:
        try:
            DEBUG_DIR.mkdir(parents=True, exist_ok=True)
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            safe_model = payload.get("model", "unknown").replace("/", "_").replace(":", "_")
            suffix = "" if not extra_info else f"_{extra_info}"
            target = DEBUG_DIR / f"{stamp}_{mode}_{safe_model}{suffix}.json"

            with target.open("w", encoding="utf-8") as handle:
                json.dump(payload, handle, indent=2, ensure_ascii=False)

            logger.info(f"🐛 Debug payload saved: {target}")
        except Exception as exc:
            # Best effort only: report the problem and carry on.
            logger.error(f"❌ Failed to save debug log: {exc}")


def save_debug_response(data, mode: str, model: str):
    """Write upstream response data to DEBUG_DIR for later inspection.

    Does nothing unless DEBUG_MODE is on. Strings are written verbatim; any
    other value is dumped as pretty-printed JSON. The file name combines a
    microsecond timestamp, ``mode`` and ``model`` (with "/" and ":" replaced
    by "_"). Failures are logged and swallowed.
    """
    if DEBUG_MODE:
        try:
            DEBUG_DIR.mkdir(parents=True, exist_ok=True)
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            safe_model = model.replace("/", "_").replace(":", "_")
            target = DEBUG_DIR / f"{stamp}_{mode}_response_{safe_model}.json"

            with target.open("w", encoding="utf-8") as handle:
                if not isinstance(data, str):
                    json.dump(data, handle, indent=2, ensure_ascii=False)
                else:
                    handle.write(data)

            logger.info(f"🐛 Debug response saved: {target}")
        except Exception as exc:
            # Best effort only: report the problem and carry on.
            logger.error(f"❌ Failed to save debug response: {exc}")


# === Lifespan ===
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler.

    Startup: logs a configuration banner and creates the shared
    ``http_client``. Shutdown: closes that client. The close runs in a
    ``finally`` block so the connection pool is released even when an
    exception (including cancellation) is thrown into the context manager
    while the application is running.
    """
    global http_client

    thinking_total = sum(1 for m in ALL_MODELS if m.get('thinking'))

    logger.info("=" * 50)
    logger.info(f"  TheOldLLM Proxy v{VERSION}")
    logger.info(f"  Port: {PORT}")
    logger.info(f"  API: http://localhost:{PORT}/v1")
    logger.info(f"  Debug Mode: {'Enabled' if DEBUG_MODE else 'Disabled'}")
    logger.info(f"  Thinking Mode: {THINKING_OUTPUT_MODE}")
    logger.info(f"  Models: {len(ALL_MODELS)} total, {thinking_total} thinking")
    logger.info("=" * 50)

    # REQUEST_TIMEOUT applies to read/write/pool; connecting is capped at 30s.
    # NOTE: http2=True needs httpx's optional HTTP/2 dependency to be installed.
    http_client = httpx.AsyncClient(
        timeout=httpx.Timeout(REQUEST_TIMEOUT, connect=30.0),
        limits=httpx.Limits(max_keepalive_connections=20, max_connections=100),
        http2=True
    )

    try:
        yield
    finally:
        logger.info("Shutting down...")
        if http_client:
            await http_client.aclose()


# === FastAPI App ===
# The application object; startup/shutdown work is delegated to `lifespan`.
app = FastAPI(
    title="TheOldLLM Proxy",
    version=VERSION,
    lifespan=lifespan
)

# CORS is left fully open: any origin, method and header is accepted.
# NOTE(review): wildcard origins are combined with allow_credentials=True;
# the CORS spec does not allow a literal "*" origin on credentialed requests,
# so confirm the middleware produces the intended headers for browser
# clients that send credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/favicon.ico", response_model=None)
async def favicon():
    """Answer favicon requests with an empty 204 (No Content) response."""
    no_content = Response(status_code=204)
    return no_content


@app.options("/{path:path}", response_model=None)
async def options_handler(path: str):
    """Reply 200 with permissive CORS headers to OPTIONS on any path.

    ``path`` is captured by the catch-all route but not otherwise used.
    """
    cors_headers = {
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
        "Access-Control-Allow-Headers": "*",
    }
    return Response(status_code=200, headers=cors_headers)


# === Dashboard ===
@app.get("/", response_model=None)
async def dashboard():
    """Render the HTML status page.

    The page shows model counts per provider type, the debug flag, the API
    base URL, one card per thinking output mode (the active one highlighted),
    example environment variables and the ids of all thinking models.
    """
    p9_count = sum(1 for m in ALL_MODELS if m.get("type", 0) == 0)
    p8_count = sum(1 for m in ALL_MODELS if m.get("type", 0) == 2)
    thinking_count = sum(1 for m in ALL_MODELS if m.get("thinking", False))
    debug_status = "Disabled" if not DEBUG_MODE else "Enabled"
    thinking_ids = ', '.join(m['id'] for m in ALL_MODELS if m.get('thinking'))

    # One card per mode; the configured mode gets a tick and the
    # "selected" CSS class.
    cards = []
    for mode, info in ThinkingFormatter.MODE_INFO.items():
        is_active = mode == THINKING_OUTPUT_MODE
        tick = "✓ " if is_active else ""
        css_state = "selected" if is_active else ""
        # Examples contain literal tags, so angle brackets are escaped.
        example_html = info["example"].replace("<", "&lt;").replace(">", "&gt;")
        cards.append(f"""
        <div class="mode-option {css_state}">
            <div class="mode-name">{tick}{mode}</div>
            <div class="mode-desc">{info['description']}</div>
            <pre class="mode-example">{example_html}</pre>
        </div>
        """)
    mode_options = "".join(cards)

    page = f"""
<!DOCTYPE html>
<html>
<head>
    <title>TheOldLLM Proxy - Port {PORT}</title>
    <meta charset="utf-8">
    <link rel="icon" href="data:,">
    <style>
        * {{ box-sizing: border-box; }}
        body {{ 
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            max-width: 1000px; margin: 0 auto; padding: 20px;
            background: #0d1117; color: #c9d1d9;
        }}
        h1 {{ color: #58a6ff; margin-bottom: 5px; }}
        h3 {{ color: #8b949e; margin-top: 0; }}
        .subtitle {{ color: #8b949e; margin-bottom: 30px; }}
        .card {{
            background: #161b22; border: 1px solid #30363d;
            border-radius: 8px; padding: 20px; margin: 15px 0;
        }}
        .stats {{ display: flex; gap: 20px; margin-top: 15px; flex-wrap: wrap; }}
        .stat {{ 
            background: #0d1117; padding: 10px 15px; 
            border-radius: 6px; border: 1px solid #30363d; 
            min-width: 120px; 
        }}
        .stat-value {{ font-size: 24px; font-weight: bold; color: #58a6ff; }}
        .stat-label {{ font-size: 12px; color: #8b949e; }}
        pre {{
            background: #0d1117; border: 1px solid #30363d;
            padding: 15px; border-radius: 6px; overflow-x: auto;
            font-size: 12px; color: #7ee787; margin: 5px 0;
        }}
        .thinking {{ color: #f0883e; }}
        .mode-option {{
            background: #0d1117; border: 1px solid #30363d;
            border-radius: 6px; padding: 12px; margin: 8px 0;
        }}
        .mode-option.selected {{
            border-color: #58a6ff;
            background: #1c2128;
        }}
        .mode-name {{ 
            font-weight: bold; color: #58a6ff; 
            font-family: monospace; font-size: 14px;
        }}
        .mode-desc {{ color: #8b949e; font-size: 13px; margin: 5px 0; }}
        .mode-example {{ 
            font-size: 11px; color: #7ee787; 
            margin: 8px 0 0 0; padding: 8px;
        }}
        .grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 10px; }}
    </style>
</head>
<body>
    <h1>🔐 TheOldLLM Proxy</h1>
    <p class="subtitle">OpenAI-compatible proxy with thinking model support - v{VERSION}</p>
    
    <div class="card">
        <h3>📊 System Status</h3>
        <div class="stats">
            <div class="stat">
                <div class="stat-value">{p9_count}</div>
                <div class="stat-label">P9 Models</div>
            </div>
            <div class="stat">
                <div class="stat-value">{p8_count}</div>
                <div class="stat-label">P8 Models</div>
            </div>
            <div class="stat">
                <div class="stat-value thinking">{thinking_count}</div>
                <div class="stat-label">Thinking Models</div>
            </div>
            <div class="stat">
                <div class="stat-value">{debug_status}</div>
                <div class="stat-label">Debug Mode</div>
            </div>
        </div>
    </div>
    
    <div class="card">
        <h3>🔌 API Endpoint</h3>
        <pre>http://localhost:{PORT}/v1</pre>
    </div>
    
    <div class="card">
        <h3>🧠 Thinking Output Modes</h3>
        <p style="color: #8b949e; font-size: 14px; margin-bottom: 15px;">
            Current mode: <strong style="color: #58a6ff;">{THINKING_OUTPUT_MODE}</strong>
            — Set via <code>THINKING_OUTPUT_MODE</code> environment variable
        </p>
        <div class="grid">
            {mode_options}
        </div>
    </div>
    
    <div class="card">
        <h3>⚙️ Example Environment Variables</h3>
        <pre>
# Thinking output format (default: tagged)
THINKING_OUTPUT_MODE=tagged  # tagged|merged|hidden|xml|markdown|separated|labeled|details|brackets|italics

# Custom separator (for "separated" mode)
THINKING_SEPARATOR="\\n\\n---\\n\\n"

# Custom start/end markers
THINKING_START=""
THINKING_END=""

# Other settings
DEBUG_MODE=false
PORT=8001
REQUEST_TIMEOUT=300
        </pre>
    </div>
    
    <div class="card">
        <h3>💭 Thinking Models</h3>
        <pre style="color: #f0883e;">{thinking_ids}</pre>
    </div>
</body>
</html>
    """
    return HTMLResponse(page)


# === OpenAI Endpoints ===
@app.get("/v1/models", response_model=None)
async def list_models():
    """Return the model catalogue in OpenAI ``/v1/models`` list format.

    Entries of ALL_MODELS that are not dicts or lack an ``id`` are skipped.
    ``created`` is the current time, and thinking models additionally get a
    ``capabilities`` object.
    """
    def _describe(entry: Dict) -> Dict:
        record = {
            "id": entry["id"],
            "object": "model",
            "created": int(time.time()),
            "owned_by": "theoldllm-proxy",
            "name": entry.get("name", entry["id"]),
        }
        if entry.get("thinking"):
            record["capabilities"] = {"thinking": True}
        return record

    catalogue = [
        _describe(entry)
        for entry in ALL_MODELS
        if isinstance(entry, dict) and "id" in entry
    ]

    return JSONResponse(
        content={"object": "list", "data": catalogue},
        headers={"Access-Control-Allow-Origin": "*"},
    )


@app.post("/v1/chat/completions", response_model=None)
async def chat_completions(request: Request):
    """Handle an OpenAI-style chat completion request.

    Parses the JSON body, validates ``messages``, resolves the requested model
    via ``get_model_config`` and forwards the call to the P8 or P9 upstream,
    either as an SSE stream (``"stream": true``) or as a single JSON response.

    Body fields read: ``model`` (defaults to ``DEFAULT_CHAT_MODEL``),
    ``stream`` (defaults to ``False``), ``messages`` (defaults to ``[]``) and
    the optional ``thinking_mode`` override (defaults to
    ``THINKING_OUTPUT_MODE``).

    Returns:
        A ``StreamingResponse`` for streaming requests, otherwise whatever
        ``proxy_non_streaming_request`` returns. A 400 response of type
        ``invalid_request_error`` is returned when the body cannot be parsed
        as JSON, is not a JSON object, or fails message validation.
    """

    def bad_request(message: str) -> JSONResponse:
        # Single place for the 400 error envelope used by every rejection below.
        return JSONResponse(
            content={"error": {"message": message, "type": "invalid_request_error"}},
            status_code=400, headers={"Access-Control-Allow-Origin": "*"}
        )

    try:
        body = await request.json()
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError (undecodable bytes) are
        # both ValueError subclasses; either means the body is not usable JSON.
        return bad_request(f"Invalid JSON: {e}")

    # Valid JSON that is not an object (list, string, number, null) has no
    # .get(); reject it here instead of failing later with an AttributeError.
    if not isinstance(body, dict):
        return bad_request("Request body must be a JSON object")

    user_model = body.get("model", DEFAULT_CHAT_MODEL)
    is_stream = body.get("stream", False)
    messages = body.get("messages", [])

    is_valid, error_msg = validate_messages(messages)
    if not is_valid:
        return bad_request(error_msg)

    config = get_model_config(user_model)
    actual_model = config["version"]
    model_type = config["type"]
    is_thinking = config["thinking"]

    # Model type 2 is routed to the P8 upstream; everything else goes to P9.
    target_url = PROXY_API_P8 if model_type == 2 else PROXY_API_P9
    provider_name = "P8" if model_type == 2 else "P9"

    # Allow per-request override of thinking mode
    thinking_mode = body.get("thinking_mode", THINKING_OUTPUT_MODE)

    logger.info(f"📨 Request: model={user_model}, provider={provider_name}, stream={is_stream}, thinking={is_thinking}, mode={thinking_mode}")

    auth_header = request.headers.get("Authorization")

    if is_stream:
        return StreamingResponse(
            proxy_streaming_request(
                auth_header, target_url, actual_model, messages,
                user_model, is_thinking, thinking_mode
            ),
            media_type="text/event-stream",
            headers={
                "Access-Control-Allow-Origin": "*",
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            }
        )

    return await proxy_non_streaming_request(
        auth_header, target_url, actual_model, messages,
        user_model, is_thinking, thinking_mode
    )


# === Streaming Handler ===
async def proxy_streaming_request(
    auth_header: Optional[str],
    target_url: str,
    model: str,
    messages: List[Dict],
    user_model: str,
    is_thinking: bool = False,
    thinking_mode: str = "tagged"
) -> AsyncGenerator[str, None]:
    """Relay a streaming chat completion from the upstream proxy as SSE events.

    Posts the formatted messages to ``target_url`` with ``"stream": True``,
    splits the upstream body into lines, and re-emits every parsed piece as a
    ``chat.completion.chunk`` event. Reasoning text is forwarded only when
    ``is_thinking`` is true, and is shaped by a ``ThinkingFormatter`` built
    from ``thinking_mode``.

    Args:
        auth_header: Incoming ``Authorization`` header value, passed to
            ``get_headers``.
        target_url: Upstream endpoint to POST to.
        model: Model identifier sent upstream.
        messages: Chat messages; run through ``format_messages`` before sending.
        user_model: Model name echoed back in every emitted chunk.
        is_thinking: Whether reasoning output should be extracted and emitted.
        thinking_mode: Formatting mode handed to ``ThinkingFormatter``.

    Yields:
        SSE-framed strings. A successful stream ends with a chunk whose
        ``finish_reason`` is ``"stop"`` followed by the ``[DONE]`` sentinel.
        Failures are reported in-band as an ``[Error: ...]`` content chunk
        followed by ``[DONE]``.
    """
    global http_client

    chunk_id = f"chatcmpl-{uuid.uuid4()}"
    created = int(time.time())
    formatter = ThinkingFormatter(thinking_mode)
    had_thinking = False
    chunk_count = 0
    reasoning_chunks = 0

    def make_chunk(content: str = "", finish_reason: Optional[str] = None) -> str:
        """Build one SSE event; an empty ``content`` produces an empty delta."""
        delta = {}
        if content:
            delta["content"] = content

        chunk = {
            "id": chunk_id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": user_model,
            "choices": [{
                "index": 0,
                "delta": delta,
                "finish_reason": finish_reason
            }]
        }
        return f"data: {json.dumps(chunk)}\n\n"

    def make_error(msg: str) -> str:
        """Build an in-band error chunk immediately followed by ``[DONE]``."""
        return make_chunk(f"[Error: {msg}]") + "data: [DONE]\n\n"

    def handle_line(raw_line: str):
        """Yield the SSE events produced by a single upstream line.

        Blank lines, ``[DONE]``/bracket-prefixed lines and lines that are not
        valid JSON produce nothing.
        """
        nonlocal had_thinking, chunk_count, reasoning_chunks

        line = raw_line.strip()
        if not line:
            return

        if line.startswith("data: "):
            line = line[6:]

        if line == "[DONE]" or line.startswith("["):
            return

        try:
            data = json.loads(line)
            content, reasoning = extract_content_from_chunk(data, is_thinking)

            # Process reasoning chunks
            if reasoning and is_thinking:
                had_thinking = True
                reasoning_chunks += 1
                formatted_text = formatter.format_thinking_chunk(reasoning)
                if formatted_text:
                    yield make_chunk(formatted_text)

            # Process content chunks
            if content:
                chunk_count += 1
                formatted_text = formatter.format_content_chunk(content, had_thinking)
                if formatted_text:
                    yield make_chunk(formatted_text)

        except json.JSONDecodeError:
            # Unparseable upstream lines are skipped on purpose.
            return

    if not http_client:
        yield make_error("HTTP client not initialized")
        return

    headers = get_headers(auth_header)

    try:
        provider_name = "P8" if "p8" in target_url else "P9"
        logger.info(f"[{provider_name}] POST to {target_url}...")

        formatted_messages = format_messages(messages)

        payload = {
            "model": model,
            "messages": formatted_messages,
            "stream": True
        }

        save_debug_json(payload, "stream", f"thinking_{is_thinking}")

        raw_response_log = []

        async with http_client.stream(
            "POST",
            target_url,
            headers=headers,
            json=payload,
        ) as response:

            if response.status_code == 403:
                logger.error("   ❌ 403 Forbidden")
                yield make_error("Forbidden.")
                return

            if response.status_code != 200:
                # aread() returns bytes; decode so the log shows text rather
                # than a bytes repr.
                error_body = await response.aread()
                error_text = error_body.decode("utf-8", errors="replace")
                logger.error(f"   ❌ Proxy failed: {response.status_code} - {error_text[:200]}")
                yield make_error(f"Proxy failed: {response.status_code}")
                return

            buffer = ""

            async for raw_chunk in response.aiter_text():
                buffer += raw_chunk

                if DEBUG_MODE:
                    raw_response_log.append(raw_chunk)

                # Network chunks do not align with line boundaries, so only
                # complete lines are consumed; the remainder stays buffered.
                while "\n" in buffer:
                    line, buffer = buffer.split("\n", 1)
                    for event in handle_line(line):
                        yield event

            # The upstream may close without a trailing newline; flush whatever
            # is still buffered so the final event is not silently dropped.
            if buffer:
                for event in handle_line(buffer):
                    yield event

        if DEBUG_MODE and raw_response_log:
            save_debug_response("".join(raw_response_log), "stream", model)

        yield make_chunk("", "stop")
        yield "data: [DONE]\n\n"

        logger.info(f"   ✅ Stream complete ({chunk_count} content, {reasoning_chunks} reasoning)")

    except httpx.TimeoutException:
        logger.error("   ❌ Timeout")
        yield make_error("Request timeout")
    except httpx.ConnectError as e:
        logger.error(f"   ❌ Connection error: {e}")
        yield make_error(f"Connection failed: {e}")
    except Exception as e:
        logger.error(f"   ❌ Error: {e}")
        logger.error(traceback.format_exc())
        yield make_error(str(e))


# === Non-Streaming Handler ===
async def proxy_non_streaming_request(
    auth_header: Optional[str],
    target_url: str,
    model: str,
    messages: List[Dict],
    user_model: str,
    is_thinking: bool = False,
    thinking_mode: str = "tagged"
) -> JSONResponse:
    """Forward a non-streaming chat completion and wrap the upstream answer.

    Sends the formatted messages to ``target_url`` with ``"stream": False``,
    extracts content and reasoning from the reply, merges them with a
    ``ThinkingFormatter`` built from ``thinking_mode``, and returns a
    ``chat.completion`` JSON body whose ``model`` is ``user_model``.

    Returns:
        A ``JSONResponse``. Status codes on failure: 403 when the upstream
        answers 403, 500 for any other non-200 upstream status or unexpected
        exception, 504 on timeout, 502 on connection failure, and 500 when the
        shared HTTP client is not initialised.
    """
    global http_client

    def failure(message: str, error_type: str, status: int) -> JSONResponse:
        # Every error path shares the same envelope and CORS header.
        return JSONResponse(
            content={"error": {"message": message, "type": error_type}},
            status_code=status, headers={"Access-Control-Allow-Origin": "*"}
        )

    if not http_client:
        return failure("HTTP client not initialized", "internal_error", 500)

    try:
        request_headers = get_headers(auth_header)
        provider_name = "P8" if "p8" in target_url else "P9"

        logger.info(f"[{provider_name}] POST to {target_url}...")

        upstream_payload = {
            "model": model,
            "messages": format_messages(messages),
            "stream": False
        }

        save_debug_json(upstream_payload, "non_stream", f"thinking_{is_thinking}")

        response = await http_client.post(
            target_url,
            headers=request_headers,
            json=upstream_payload
        )

        status = response.status_code
        if status == 403:
            return failure("403 Forbidden", "authentication_error", 403)

        if status != 200:
            # Only the first 500 characters of the upstream body are echoed back.
            snippet = response.text[:500]
            return failure(f"Proxy failed: {response.status_code} - {snippet}", "api_error", 500)

        data = response.json()

        if DEBUG_MODE:
            save_debug_response(data, "non_stream", model)

        answer, thoughts = extract_content_from_response(data, is_thinking)

        # Merge reasoning and answer into one string for the selected mode.
        final_content = ThinkingFormatter(thinking_mode).format_complete(thoughts or "", answer)

        logger.info(f"   ✅ Response: {len(final_content)} chars")

        completion = {
            "id": f"chatcmpl-{uuid.uuid4()}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": user_model,
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": final_content},
                "finish_reason": "stop"
            }],
            # Token accounting is not available; zeros are reported.
            "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
        }
        return JSONResponse(content=completion, headers={"Access-Control-Allow-Origin": "*"})

    except httpx.TimeoutException:
        return failure("Request timeout", "timeout_error", 504)
    except httpx.ConnectError as e:
        return failure(f"Connection failed: {e}", "connection_error", 502)
    except Exception as e:
        logger.error(f"❌ Error: {e}")
        logger.error(traceback.format_exc())
        return failure(str(e), "internal_error", 500)


# === Health Check ===
@app.get("/health", response_model=None)
async def health_check():
    """Report service status, version, model counts and the configured thinking mode."""
    thinking_total = sum(1 for entry in ALL_MODELS if entry.get("thinking"))
    report = {
        "status": "healthy",
        "version": VERSION,
        "models_count": len(ALL_MODELS),
        "thinking_models_count": thinking_total,
        "thinking_output_mode": THINKING_OUTPUT_MODE,
    }
    return JSONResponse(content=report)


# === Mode Info Endpoint ===
@app.get("/v1/thinking-modes", response_model=None)
async def list_thinking_modes():
    """Expose the configured thinking output mode together with ``ThinkingFormatter.MODE_INFO``."""
    modes = {
        "current_mode": THINKING_OUTPUT_MODE,
        "available_modes": ThinkingFormatter.MODE_INFO,
    }
    return JSONResponse(content=modes, headers={"Access-Control-Allow-Origin": "*"})


# Script entry point: when run directly, serve the app with uvicorn,
# listening on all interfaces (0.0.0.0) at the configured PORT.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=PORT)