import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

MODEL_ID = "GhostScientist/qwen25-coder-1.5b-codealpaca-sft"
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

# Load tokenizer at startup (CPU)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Global model variable - will be loaded on first GPU call
model = None


def load_model():
    """Load the base model and merge the LoRA adapter (once, then cached)."""
    global model
    if model is None:
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        model = PeftModel.from_pretrained(base_model, MODEL_ID)
        model = model.merge_and_unload()
    return model


@spaces.GPU(duration=120)
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
    """Generate a response using the fine-tuned Qwen coder model."""
    # Load model on GPU (cached after the first call)
    model = load_model()

    messages = [{"role": "system", "content": system_message}]
    for item in history:
        if isinstance(item, dict) and "role" in item and "content" in item:
            # Messages-format history (Gradio ChatInterface with type="messages")
            messages.append({"role": item["role"], "content": item["content"]})
        elif isinstance(item, (list, tuple)) and len(item) == 2:
            # Tuples-format history: (user_msg, assistant_msg) pairs
            user_msg, assistant_msg = item
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Apply chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens (skip the prompt)
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return response


SYSTEM_PROMPT = """You are an expert coding assistant. You help users write, debug, explain, and improve code.
You provide clear, concise, and accurate responses with well-formatted code examples when appropriate.
Always explain your reasoning and suggest best practices."""

EXAMPLES = [
    ["Write a Python function to check if a number is prime"],
    ["Explain the difference between a list and a tuple in Python"],
    ["How do I reverse a string in JavaScript?"],
    ["Write a SQL query to find duplicate records in a table"],
    ["Debug this code: def add(a, b): return a - b"],
]

demo = gr.ChatInterface(
    fn=generate_response,
    title="Qwen 2.5 Coder Assistant",
    description="""A fine-tuned Qwen 2.5 Coder 1.5B model for code assistance.
Ask me to write code, explain concepts, debug issues, or help with any programming task!

**Model:** [GhostScientist/qwen25-coder-1.5b-codealpaca-sft](https://huggingface.co/GhostScientist/qwen25-coder-1.5b-codealpaca-sft)
""",
    additional_inputs=[
        gr.Textbox(
            value=SYSTEM_PROMPT,
            label="System Prompt",
            lines=3,
        ),
        gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max Tokens",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="Temperature",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
        ),
    ],
    examples=EXAMPLES,
)

if __name__ == "__main__":
    demo.launch()
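
# Usage note (an assumption about deployment, not part of the original file):
# this script is structured as a Hugging Face ZeroGPU Space app (`import spaces`
# plus the `@spaces.GPU` decorator), so on Spaces it starts automatically. For a
# local run on a machine with a GPU, something like the following should work,
# where `app.py` is the assumed filename:
#
#   pip install gradio spaces torch transformers peft accelerate
#   python app.py
#
# `accelerate` is needed because the model is loaded with `device_map="auto"`.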