Spaces:

PraneshJs
/

FullGpt2Vizualizer

Running

App Files Files Community

PraneshJs committed on Nov 17

Commit

aa71186

verified ·

1 Parent(s): 612ed45

Update app.py

Browse files

Files changed (1) hide show

app.py +393 -399

app.py CHANGED Viewed

@@ -1,503 +1,497 @@
-# app.py — Full LLM Visualizer (Option A) for Hugging Face Spaces (Gradio)
-# Advanced features: attention, PCA, token animation, residual norms, activation patching, neuron explorer.
-# Recommended models: "distilgpt2", "gpt2". Use GPU Space for larger models.
 import gradio as gr
 import torch
 import numpy as np
 import plotly.express as px
 import plotly.graph_objects as go
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from sklearn.decomposition import PCA
 import pandas as pd
-import time
 import html
-# ---------------- Config ----------------
 DEFAULT_MODEL = "distilgpt2"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-_MODEL_CACHE = {}
-# ---------------- Utilities ----------------
 def load_model(model_name):
-    if model_name in _MODEL_CACHE:
-        return _MODEL_CACHE[model_name]
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(
         model_name, output_attentions=True, output_hidden_states=True
-    )
-    model.to(DEVICE)
     model.eval()
-    _MODEL_CACHE[model_name] = (model, tokenizer)
     return model, tokenizer
 def softmax(x):
     e = np.exp(x - np.max(x))
     return e / e.sum(axis=-1, keepdims=True)
-def tokens_display(tokens):
     return "  ".join([f"[{html.escape(t)}]" for t in tokens])
-def compute_pca_points(hidden_layer):
     try:
-        p = PCA(n_components=2).fit_transform(hidden_layer)
-        return p
-    except Exception:
         seq = hidden_layer.shape[0]
-        d0 = hidden_layer[:, 0] if hidden_layer.shape[1] > 0 else np.zeros(seq)
-        d1 = hidden_layer[:, 1] if hidden_layer.shape[1] > 1 else np.zeros(seq)
-        return np.vstack([d0, d1]).T
-def make_attention_figure(attn_matrix, tokens, title=None):
-    fig = px.imshow(attn_matrix, x=tokens, y=tokens,
-                    labels={"x":"Key token", "y":"Query token", "color":"Attention"},
-                    title=title or "Attention")
-    fig.update_layout(height=420, margin=dict(l=60, r=20, t=40, b=40))
     return fig
-def make_pca_figure(points, tokens, highlight_idx=None, title=None):
-    fig = px.scatter(x=points[:,0], y=points[:,1], text=tokens, title=title or "PCA (2D)")
     fig.update_traces(textposition="top center", marker=dict(size=10))
-    if highlight_idx is not None:
         fig.add_trace(go.Scatter(
-            x=[points[highlight_idx,0]], y=[points[highlight_idx,1]],
-            mode="markers+text", text=[tokens[highlight_idx]],
-            marker=dict(size=18, color="red"), name="selected token"
         ))
-    fig.update_layout(height=420, margin=dict(l=40, r=40, t=40, b=40))
     return fig
-def make_probs_figure(top_tokens, top_scores, title=None):
-    fig = go.Figure(data=[go.Bar(x=top_tokens, y=top_scores)])
-    fig.update_layout(title=title or "Next-token top predictions", yaxis_title="Probability", height=360, margin=dict(l=40,r=20,t=40,b=40))
     return fig
-# ---------------- Core analysis ----------------
-def analyze_text(text, model_name, explain_simple):
-    """
-    Run forward pass and return internals.
-    Returns dict with tokens, attentions (list per layer), hidden states (list per layer), logits, PCA points, figures.
-    """
-    if not text or len(text.strip()) == 0:
-        return {"error": "Please enter some text."}
     try:
         model, tokenizer = load_model(model_name)
     except Exception as e:
-        return {"error": f"Failed to load model '{model_name}': {e}"}
     try:
-        inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(DEVICE)
     except Exception as e:
-        return {"error": f"Tokenization error: {e}"}
-    with torch.no_grad():
-        try:
-            outputs = model(**inputs)
-        except Exception as e:
-            return {"error": f"Model forward error: {e}"}
-    # Extract tokens & internals
-    try:
-        input_ids = inputs["input_ids"][0].cpu().numpy().tolist()
-        tokens = tokenizer.convert_ids_to_tokens(input_ids)
-    except Exception:
-        return {"error": "Failed to extract tokens."}
-    attentions = [a[0].cpu().numpy() for a in outputs.attentions] if outputs.attentions is not None else None
-    hidden = [h[0].cpu().numpy() for h in outputs.hidden_states] if outputs.hidden_states is not None else None
-    logits = outputs.logits[0].cpu().numpy()  # shape (seq_len, vocab)
     # PCA per layer
-    pca_layers = []
-    if hidden is not None:
-        for layer_h in hidden:
-            pca_layers.append(compute_pca_points(layer_h))
-    else:
-        pca_layers = None
-    # Next-token topk
-    last_logits = logits[-1]
-    probs = softmax(last_logits)
-    topk = 25
-    idx = np.argsort(probs)[-topk:][::-1]
-    top_tokens = [tokenizer.decode([int(i)]) for i in idx]
     top_scores = probs[idx].tolist()
-    default_layer = (len(attentions) - 1) if attentions is not None else (len(pca_layers) - 1 if pca_layers else 0)
     default_head = 0
-    fig_attn = make_attention_figure(attentions[default_layer][default_head], tokens, title=f"Layer {default_layer} Head {default_head}") if attentions is not None else None
-    fig_pca = make_pca_figure(pca_layers[default_layer], tokens, highlight_idx=None, title=f"PCA (layer {default_layer})") if pca_layers is not None else None
-    fig_probs = make_probs_figure(top_tokens, top_scores, title="Next-token top predictions")
-    explanation = (
-        "Simple: the model splits text into pieces, looks which pieces are important, and guesses the next word."
-        if explain_simple else
-        "Technical: tokens, attention matrices per head/layer, hidden states projected to 2D, and top-k next-token probabilities."
-    )
-    # neuron explorer (top neurons by mean absolute activation in last layer)
     neuron_info = []
     try:
-        last_hidden = hidden[-1]  # (seq, dim)
-        mean_act = np.abs(last_hidden).mean(axis=0)
         top_neurons = np.argsort(mean_act)[-24:][::-1]
-        for n in top_neurons[:24]:
-            vals = last_hidden[:, n]
-            top_token_idx = np.argsort(np.abs(vals))[-6:][::-1]
-            token_hits = [(tokens[i], float(vals[i])) for i in top_token_idx]
-            neuron_info.append({"neuron": int(n), "top_tokens": token_hits})
-    except Exception:
         neuron_info = []
-    # residual norms (best-effort)
-    residuals = None
-    try:
-        if hasattr(model, "transformer") and hasattr(model.transformer, "h"):
-            blocks = model.transformer.h
-            wte = getattr(model.transformer, "wte", None)
-            # compute norms by approximating per-layer attn & mlp outputs using block forward if possible
-            attn_norms = []
-            mlp_norms = []
-            # Start from embeddings
-            cur = wte(inputs["input_ids"]) if wte is not None else None
-            if cur is not None:
-                cur = cur.to(DEVICE)
-                # We'll run each block and measure norms of attention & mlp outputs if callable
-                for block in blocks:
-                    try:
-                        ln1 = block.ln_1(cur)
-                        attn_out = block.attn(ln1)[0]
-                        cur = cur + attn_out
-                        ln2 = block.ln_2(cur)
-                        mlp_out = block.mlp(ln2)
-                        cur = cur + mlp_out
-                        attn_norms.append(float(torch.norm(attn_out).cpu().numpy()))
-                        mlp_norms.append(float(torch.norm(mlp_out).cpu().numpy()))
-                    except Exception:
-                        # fallback: run full block and compute residual diff
-                        prev = cur.clone()
-                        try:
-                            cur = block(prev)[0]
-                            total = cur - prev
-                            attn_norms.append(0.0)
-                            mlp_norms.append(float(torch.norm(total).cpu().numpy()))
-                        except Exception:
-                            attn_norms.append(0.0)
-                            mlp_norms.append(0.0)
-                residuals = {"attn_norms": attn_norms, "mlp_norms": mlp_norms}
-    except Exception:
-        residuals = None
-    result = {
         "tokens": tokens,
         "attentions": attentions,
         "hidden": hidden,
         "logits": logits,
-        "pca_layers": pca_layers,
-        "fig_attn": fig_attn,
-        "fig_pca": fig_pca,
-        "fig_probs": fig_probs,
         "default_layer": default_layer,
         "default_head": default_head,
-        "token_display": tokens_display(tokens),
-        "explanation": explanation,
         "neuron_info": neuron_info,
         "residuals": residuals,
-        "model_name": model_name,
-        "input_ids": input_ids
     }
-    return result
-# ---------------- Activation patching ----------------
-def activation_patch_and_run(text_tokens, model_name, patch_layer, patch_pos, patch_from_pos, patch_scale=1.0):
     """
-    Activation patching for GPT-2 style models: copy vector at patch_from_pos to patch_pos at patch_layer.
-    Returns top-k next-token predictions after patching.
     """
     try:
         model, tokenizer = load_model(model_name)
-    except Exception:
-        return {"error": "Model load failed for patching."}
-    # Prepare inputs again as string
-    text = " ".join(text_tokens)
-    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(DEVICE)
-    # Check block availability
-    if not (hasattr(model, "transformer") and hasattr(model.transformer, "h")):
-        return {"error": "Model not compatible with activation patching (requires GPT-2 style blocks)."}
     blocks = model.transformer.h
     wte = model.transformer.wte
     ln_f = model.transformer.ln_f if hasattr(model.transformer, "ln_f") else None
     lm_head = model.lm_head
-    # collect hidden precomputed
     with torch.no_grad():
-        x = wte(inputs["input_ids"]).to(DEVICE)  # (1, seq, dim)
-        hidden_per_layer = [x.detach().cpu().numpy()[0]]  # embedding considered layer -1
-        for block in blocks:
-            # standard GPT-2 block flow
-            ln1 = block.ln_1(x)
-            attn_out = block.attn(ln1)[0]
-            x = x + attn_out
-            ln2 = block.ln_2(x)
-            mlp_out = block.mlp(ln2)
-            x = x + mlp_out
-            hidden_per_layer.append(x.detach().cpu().numpy()[0])
-    seq_len = hidden_per_layer[0].shape[0]
-    if patch_pos < 0 or patch_pos >= seq_len or patch_from_pos < 0 or patch_from_pos >= seq_len:
-        return {"error": "Patch positions out of range."}
-    # vector to copy
-    vec = torch.tensor(hidden_per_layer[patch_layer][patch_from_pos], dtype=torch.float32).to(DEVICE) * float(patch_scale)
     # re-run with patch
     with torch.no_grad():
         x = wte(inputs["input_ids"]).to(DEVICE)
-        for i, block in enumerate(blocks):
-            ln1 = block.ln_1(x)
-            attn_out = block.attn(ln1)[0]
-            x = x + attn_out
-            ln2 = block.ln_2(x)
-            mlp_out = block.mlp(ln2)
-            x = x + mlp_out
-            if i == patch_layer:
-                # set vector at position
-                x[0, patch_pos, :] = vec
-        final = ln_f(x) if ln_f is not None else x
-        logits = lm_head(final)
-        logits = logits[0, -1, :].cpu().numpy()
         probs = softmax(logits)
-        topk = 25
-        idx = np.argsort(probs)[-topk:][::-1]
-        top_tokens = [tokenizer.decode([int(i)]) for i in idx]
-        top_scores = probs[idx].tolist()
-    return {"patched_top_tokens": top_tokens, "patched_top_scores": top_scores}
-# ---------------- UI wiring ----------------
-def run_analysis(text, model_name, explain_simple):
-    res = analyze_text(text, model_name, explain_simple)
-    if "error" in res:
-        # return dict keyed by components
-        return {
-            token_display: gr.update(value=""),
-            explanation_md: gr.update(value=res["error"]),
-            model_info: gr.update(value=f"Model: {model_name}"),
-            attn_plot: gr.update(value=None),
-            pca_plot: gr.update(value=None),
-            probs_plot: gr.update(value=None),
-            layer_slider: gr.update(maximum=0, value=0),
-            head_slider: gr.update(maximum=0, value=0),
-            token_step: gr.update(maximum=0, value=0),
-            state: res,
-            residual_plot: gr.update(value=None),
-            neuron_table: gr.update(value=[]),
-            patch_layer_input: gr.update(maximum=0, value=0),
-            patch_pos_input: gr.update(maximum=0, value=0),
-            patch_from_pos_input: gr.update(maximum=0, value=0),
-        }
-    tokens = res["tokens"]
-    num_layers = len(res["attentions"]) if res["attentions"] is not None else (len(res["pca_layers"]) - 1 if res["pca_layers"] else 0)
-    num_heads = res["attentions"][0].shape[0] if res["attentions"] is not None else 1
-    max_token_idx = len(tokens) - 1
-    token_display_text = f"**Tokens:** {res['token_display']}"
-    explanation_text = res["explanation"]
-    model_info_text = f"Model: {res['model_name']}  •  layers: {num_layers}  •  heads: {num_heads}  •  tokens: {len(tokens)}"
-    layer_update = gr.update(maximum=max(0, num_layers - 1), value=res["default_layer"])
-    head_update = gr.update(maximum=max(0, num_heads - 1), value=res["default_head"])
-    token_step_update = gr.update(maximum=max_token_idx, value=0)
-    patch_layer_update = gr.update(maximum=max(0, num_layers - 1), value=0)
-    patch_pos_update = gr.update(maximum=max(0, max_token_idx), value=0)
-    patch_from_pos_update = gr.update(maximum=max(0, max_token_idx), value=0)
-    # neuron table initial (show first neuron's top tokens if available)
-    neuron_table_data = []
-    if res.get("neuron_info"):
-        first = res["neuron_info"][0]
-        neuron_table_data = [[t, round(v, 6)] for t, v in first["top_tokens"]]
-    # residual figure
-    residual_fig = None
-    if res.get("residuals"):
-        df = pd.DataFrame({"layer": list(range(len(res["residuals"]["attn_norms"]))),
-                           "attn": res["residuals"]["attn_norms"],
-                           "mlp": res["residuals"]["mlp_norms"]})
-        residual_fig = go.Figure()
-        residual_fig.add_trace(go.Bar(x=df["layer"], y=df["attn"], name="Attention norm"))
-        residual_fig.add_trace(go.Bar(x=df["layer"], y=df["mlp"], name="MLP norm"))
-        residual_fig.update_layout(barmode="group", title="Residual contributions (layerwise norms)", height=360)
-    return {
-        token_display: gr.update(value=token_display_text),
-        explanation_md: gr.update(value=explanation_text),
-        model_info: gr.update(value=model_info_text),
-        attn_plot: gr.update(value=res["fig_attn"]),
-        pca_plot: gr.update(value=res["fig_pca"]),
-        probs_plot: gr.update(value=res["fig_probs"]),
-        layer_slider: layer_update,
-        head_slider: head_update,
-        token_step: token_step_update,
-        state: res,
-        residual_plot: gr.update(value=residual_fig),
-        neuron_table: gr.update(value=neuron_table_data),
-        patch_layer_input: patch_layer_update,
-        patch_pos_input: patch_pos_update,
-        patch_from_pos_input: patch_from_pos_update,
-    }
-def update_visuals(state_obj, layer, head, token_idx):
-    # update attention, pca, and attention-row for selected token using cached state
-    if not state_obj:
-        return {attn_plot: gr.update(value=None), pca_plot: gr.update(value=None), step_attn_plot: gr.update(value=None)}
-    res = state_obj
-    tokens = res["tokens"]
-    # bounds
-    if res["attentions"] is not None:
-        max_layer = len(res["attentions"]) - 1
-        layer = int(min(max(0, layer), max_layer))
-        max_head = res["attentions"][0].shape[0] - 1
-        head = int(min(max(0, head), max_head))
-    else:
-        layer = int(min(max(0, layer), len(res["pca_layers"]) - 1 if res["pca_layers"] else 0))
-        head = 0
-    token_idx = int(min(max(0, token_idx), len(tokens) - 1))
-    attn_fig = None
-    if res["attentions"] is not None:
-        attn_fig = make_attention_figure(res["attentions"][layer][head], tokens, title=f"Layer {layer} Head {head}")
-    pca_fig = None
-    if res["pca_layers"] is not None:
-        pca_pts = res["pca_layers"][layer]
-        pca_fig = make_pca_figure(pca_pts, tokens, highlight_idx=token_idx, title=f"PCA (layer {layer})")
-    step_attn_fig = None
-    if res["attentions"] is not None:
-        row = res["attentions"][layer][head][token_idx]
-        step_attn_fig = go.Figure(data=[go.Bar(x=tokens, y=row)])
-        step_attn_fig.update_layout(title=f"Token {token_idx} attends to (layer {layer}, head {head})", height=300, margin=dict(l=40,r=20,t=30,b=40))
-    return {attn_plot: gr.update(value=attn_fig),
-            pca_plot: gr.update(value=pca_fig),
-            step_attn_plot: gr.update(value=step_attn_fig)}
-def run_patch(state_obj, patch_layer, patch_pos, patch_from_pos, patch_scale, model_name):
-    if not state_obj:
-        return gr.update(value=None)
-    tokens = state_obj["tokens"]
-    res = activation_patch_and_run(tokens, model_name, int(patch_layer), int(patch_pos), int(patch_from_pos), float(patch_scale))
-    if "error" in res:
-        return gr.update(value=None)
-    fig = go.Figure(data=[go.Bar(x=res["patched_top_tokens"], y=res["patched_top_scores"])])
-    fig.update_layout(title=f"Patched predictions (layer {patch_layer}, pos {patch_pos} <- pos {patch_from_pos}, scale {patch_scale})", height=420)
-    return gr.update(value=fig)
-def find_neurons(state_obj):
-    if not state_obj:
-        return gr.update(value=[])
-    info = state_obj.get("neuron_info", [])
-    rows = []
-    for e in info[:24]:
-        for t, v in e["top_tokens"]:
-            rows.append([t, round(v,6)])
-    # dedupe
-    df = pd.DataFrame(rows, columns=["token","activation"]).drop_duplicates().head(24).values.tolist()
-    return gr.update(value=df)
-def inspect_neuron(state_obj, neuron_idx):
-    if not state_obj:
-        return gr.update(value=[])
-    try:
-        neuron_idx = int(neuron_idx)
-    except Exception:
-        return gr.update(value=[])
-    last_hidden = state_obj["hidden"][-1]
-    vals = last_hidden[:, neuron_idx]
-    tokens = state_obj["tokens"]
-    df = sorted([(tokens[i], float(vals[i])) for i in range(len(tokens))], key=lambda x: -abs(x[1]))[:12]
-    return gr.update(value=[[t, round(v,6)] for t,v in df])
-# ---------------- Gradio UI ----------------
-with gr.Blocks(title="LLM Visualizer — Full (Option A)", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("<h1 style='font-size:30px'>🧠 LLM Visualizer — Full (Advanced)</h1>")
-    gr.Markdown("Advanced GPT-2 style visualizer. Use `distilgpt2` or `gpt2` for full features. Keep input short (<80 tokens) on CPU Spaces.")
     with gr.Row():
         with gr.Column(scale=3):
-            model_input = gr.Textbox(label="Model (Hugging Face name)", value=DEFAULT_MODEL)
-            text_input = gr.Textbox(label="Input text", value="Hello world, this is a test.", lines=3)
-            explain_simple = gr.Checkbox(label="Explain simply (kid/elder mode)", value=True)
-            run_btn = gr.Button("Run analysis", variant="primary")
-            gr.Markdown("**Presets:**")
-            with gr.Row():
-                gr.Button("Greeting").click(lambda: "Hello! How are you today?", None, text_input)
-                gr.Button("Story start").click(lambda: "Once upon a time, there was a small robot...", None, text_input)
-                gr.Button("Question").click(lambda: "Why is the sky blue?", None, text_input)
-            # Guided hints
-            gr.Markdown("**Hints:** Use small inputs. Slide Layer/Head to explore. Use Token slider to animate token flow.")
         with gr.Column(scale=2):
-            token_display = gr.Markdown("Tokens will appear here.")
-            explanation_md = gr.Markdown("Explanation will appear here.")
-            model_info = gr.Markdown("Model info: —")
     with gr.Row():
         with gr.Column():
-            layer_slider = gr.Slider(label="Layer", minimum=0, maximum=0, step=1, value=0)
-            head_slider = gr.Slider(label="Head", minimum=0, maximum=0, step=1, value=0)
-            token_step = gr.Slider(label="Token index (step through tokens)", minimum=0, maximum=0, step=1, value=0)
-            attn_plot = gr.Plot(label="Attention heatmap")
         with gr.Column():
-            pca_plot = gr.Plot(label="PCA hidden states (2D)")
-            step_attn_plot = gr.Plot(label="Attention row for selected token")
-            probs_plot = gr.Plot(label="Next-token top predictions")
-            residual_plot = gr.Plot(label="Residual decomposition (attention vs mlp)")
     with gr.Row():
-        with gr.Column(scale=2):
-            gr.Markdown("### Neuron / Circuit explorer")
-            neuron_find_btn = gr.Button("Find example neurons (auto)")
-            neuron_dropdown = gr.Number(label="Neuron index to inspect (enter integer)", value=0)
-            neuron_table = gr.Dataframe(headers=["token", "activation"], interactive=False)
-        with gr.Column(scale=3):
-            gr.Markdown("### Activation patching (copy vector at layer)")
-            patch_layer_input = gr.Slider(label="Patch Layer (0 = first block)", minimum=0, maximum=0, step=1, value=0)
-            patch_pos_input = gr.Slider(label="Patch position (token index)", minimum=0, maximum=0, step=1, value=0)
-            patch_from_pos_input = gr.Slider(label="Copy from position (token index)", minimum=0, maximum=0, step=1, value=0)
-            patch_scale_input = gr.Number(label="Patch scale (multiplier)", value=1.0)
-            patch_btn = gr.Button("Run Activation Patch & Show Top Predictions", variant="primary")
-            patch_output = gr.Plot(label="Patched next-token predictions")
     state = gr.State()
-    # Events wiring
-    run_btn.click(fn=run_analysis,
-                  inputs=[text_input, model_input, explain_simple],
-                  outputs=[token_display, explanation_md, model_info,
-                           attn_plot, pca_plot, probs_plot,
-                           layer_slider, head_slider, token_step,
-                           state, residual_plot, neuron_table,
-                           patch_layer_input, patch_pos_input, patch_from_pos_input])
-    layer_slider.change(fn=update_visuals, inputs=[state, layer_slider, head_slider, token_step],
-                        outputs=[attn_plot, pca_plot, step_attn_plot])
-    head_slider.change(fn=update_visuals, inputs=[state, layer_slider, head_slider, token_step],
-                        outputs=[attn_plot, pca_plot, step_attn_plot])
-    token_step.change(fn=update_visuals, inputs=[state, layer_slider, head_slider, token_step],
-                        outputs=[attn_plot, pca_plot, step_attn_plot])
-    neuron_find_btn.click(fn=find_neurons, inputs=[state], outputs=[neuron_table])
-    neuron_dropdown.change(fn=inspect_neuron, inputs=[state, neuron_dropdown], outputs=[neuron_table])
-    patch_btn.click(fn=run_patch, inputs=[state, patch_layer_input, patch_pos_input, patch_from_pos_input, patch_scale_input, model_input],
-                    outputs=[patch_output])
 demo.launch()

+# FULL LLM VISUALIZER — OPTION A (ADVANCED)
+# stable + patched + safe for HuggingFace Spaces (CPU or GPU)
+# recommended models: distilgpt2, gpt2
+# author: ChatGPT
 import gradio as gr
 import torch
 import numpy as np
 import plotly.express as px
 import plotly.graph_objects as go
 import pandas as pd
+from sklearn.decomposition import PCA
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import html
 DEFAULT_MODEL = "distilgpt2"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+MODEL_CACHE = {}
+# ---------------- CORE UTILS ----------------
 def load_model(model_name):
     """Return the ``(model, tokenizer)`` pair for ``model_name``, loading it once.

     The first call for a name builds the tokenizer and a causal LM configured
     to emit attentions and hidden states, moves the model to ``DEVICE``,
     switches it to eval mode and stores the pair in ``MODEL_CACHE``. Later
     calls return the stored pair.
     """
     cached = MODEL_CACHE.get(model_name)
     if cached is not None:
         return cached

     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         output_attentions=True,
         output_hidden_states=True,
     )
     model = model.to(DEVICE)
     model.eval()

     pair = (model, tokenizer)
     MODEL_CACHE[model_name] = pair
     return pair
 def softmax(x):
     """Return the softmax of ``x`` along its last axis.

     ``x`` may be a 1-D score vector or a batch whose last axis holds the
     scores. The maximum is subtracted per row rather than over the whole
     array, so a row lying far below the global maximum no longer underflows
     to all zeros and produces NaN.
     """
     x = np.asarray(x)
     shifted = x - np.max(x, axis=-1, keepdims=True)
     e = np.exp(shifted)
     return e / e.sum(axis=-1, keepdims=True)
def safe_tokens(tokens):
    """Render tokens as HTML-escaped ``[token]`` chips separated by two spaces."""
    chips = []
    for tok in tokens:
        chips.append("[" + html.escape(tok) + "]")
    return "  ".join(chips)
def compute_pca(hidden_layer):
    """Project one layer's hidden states onto two dimensions for plotting.

    Args:
        hidden_layer: 2-D array indexed as ``[token, feature]``.

    Returns:
        Array of shape ``(tokens, 2)``: the PCA projection when it can be
        fitted, otherwise the first two raw feature columns (zero-filled
        when a column does not exist).
    """
    try:
        return PCA(n_components=2).fit_transform(hidden_layer)
    except Exception:
        # Best-effort fallback, e.g. a single token leaves PCA with too few
        # samples for two components. ``Exception`` (not a bare ``except``)
        # so KeyboardInterrupt/SystemExit still propagate.
        seq = hidden_layer.shape[0]
        dim0 = hidden_layer[:, 0] if hidden_layer.shape[1] > 0 else np.zeros(seq)
        dim1 = hidden_layer[:, 1] if hidden_layer.shape[1] > 1 else np.zeros(seq)
        return np.vstack([dim0, dim1]).T
def fig_attention(matrix, tokens, title):
    """Build a heatmap of one attention matrix with the tokens on both axes."""
    axis_labels = {"x": "Key token", "y": "Query token", "color": "Attention"}
    heatmap = px.imshow(matrix, x=tokens, y=tokens, title=title, labels=axis_labels)
    heatmap.update_layout(height=420)
    return heatmap
def fig_pca(points, tokens, highlight=None, title="PCA"):
    """Scatter-plot 2-D token coordinates, optionally emphasising one token.

    ``points`` is indexed as ``[token, 0 or 1]``; when ``highlight`` is given,
    that token is drawn again as a larger red marker.
    """
    xs, ys = points[:, 0], points[:, 1]
    scatter = px.scatter(x=xs, y=ys, text=tokens, title=title)
    scatter.update_traces(textposition="top center", marker=dict(size=10))

    if highlight is not None:
        selected = go.Scatter(
            x=[points[highlight, 0]],
            y=[points[highlight, 1]],
            mode="markers+text",
            text=[tokens[highlight]],
            marker=dict(size=18, color="red"),
        )
        scatter.add_trace(selected)

    scatter.update_layout(height=420)
    return scatter
def fig_probs(tokens, scores):
    """Build a bar chart of candidate next tokens against their probabilities."""
    bars = go.Bar(x=tokens, y=scores)
    chart = go.Figure()
    chart.add_trace(bars)
    chart.update_layout(title="Next-token probabilities", height=380)
    return chart
+# ---------------- ANALYSIS CORE ----------------
def analyze_text(text, model_name, simple):
    """Run one forward pass over ``text`` and collect the data the UI plots.

    Args:
        text: Prompt to analyse; ``None`` or blank input is rejected.
        model_name: Model identifier handed to ``load_model``.
        simple: Truthy to attach the simple-mode explanation text.

    Returns:
        ``{"error": message}`` when the input is blank or loading,
        tokenization or the forward pass fails; otherwise a dict with keys
        ``tokens``, ``attentions``, ``hidden``, ``pca``, ``logits``,
        ``top_tokens``, ``top_scores``, ``default_layer``, ``default_head``,
        ``neuron_info``, ``residuals``, ``token_display`` and ``explanation``.
    """
    # ``not text`` first: a None value would otherwise raise AttributeError
    # on ``.strip()`` instead of producing the friendly error.
    if not text or not text.strip():
        return {"error": "Please enter text."}

    try:
        model, tokenizer = load_model(model_name)
    except Exception as e:
        return {"error": f"Failed to load model: {e}"}

    try:
        inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(DEVICE)
    except Exception as e:
        return {"error": f"Tokenization error: {e}"}

    try:
        with torch.no_grad():
            out = model(**inputs)
    except Exception as e:
        return {"error": f"Model error: {e}"}

    input_ids = inputs["input_ids"][0].cpu().numpy().tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Drop the batch axis and move everything to NumPy for plotting.
    attentions = [a[0].cpu().numpy() for a in out.attentions]
    hidden = [h[0].cpu().numpy() for h in out.hidden_states]
    logits = out.logits[0].cpu().numpy()

    # PCA per layer
    pca_layers = [compute_pca(h) for h in hidden]

    # Top-20 next-token candidates from the last position, most probable first.
    probs = softmax(logits[-1])
    idx = np.argsort(probs)[-20:][::-1]
    # int(i): hand plain Python ints to the tokenizer, as activation_patch does.
    top_tokens = [tokenizer.decode([int(i)]) for i in idx]
    top_scores = probs[idx].tolist()

    default_layer = len(attentions) - 1
    default_head = 0

    # Neuron explorer: the 24 last-layer features with the largest mean
    # absolute activation, each with its 5 most strongly activating tokens.
    # Best effort -- any failure just leaves the list empty.
    try:
        last_h = hidden[-1]
        mean_act = np.abs(last_h).mean(axis=0)
        top_neurons = np.argsort(mean_act)[-24:][::-1]
        neuron_info = []
        for n in top_neurons:
            vals = last_h[:, n]
            top_ix = np.argsort(np.abs(vals))[-5:][::-1]
            neuron_info.append({
                "neuron": int(n),
                "top_tokens": [(tokens[i], float(vals[i])) for i in top_ix],
            })
    except Exception:
        neuron_info = []

    # residual decomposition (safe)
    residuals = compute_residuals_safe(model, inputs)

    return {
        "tokens": tokens,
        "attentions": attentions,
        "hidden": hidden,
        "pca": pca_layers,
        "logits": logits,
        "top_tokens": top_tokens,
        "top_scores": top_scores,
        "default_layer": default_layer,
        "default_head": default_head,
        "neuron_info": neuron_info,
        "residuals": residuals,
        "token_display": safe_tokens(tokens),
        "explanation": explain(simple),
    }
def explain(s):
    """Return the Markdown explainer: simple wording if ``s`` is truthy, else technical."""
    if not s:
        technical = (
            "🔬 **Technical mode:**\n"
            "Showing tokens, attention (query→key), PCA projections, logits, "
            "neuron activations, and layerwise residual contributions.\n"
        )
        return technical

    simple_text = (
        "🧒 **Simple mode:**\n"
        "- The model cuts text into small pieces (tokens).\n"
        "- It looks at which tokens matter (attention).\n"
        "- It builds an internal map (PCA) of meanings.\n"
        "- Then it guesses the next token.\n"
    )
    return simple_text
+# ---------------- RESIDUAL DECOMPOSITION SAFE ----------------
def compute_residuals_safe(model, inputs):
    """Measure per-block attention and MLP output norms for a GPT-2-style model.

    Replays the embedded input through every block in ``model.transformer.h``
    (``ln_1`` -> ``attn`` -> residual add -> ``ln_2`` -> ``mlp`` -> residual
    add) and records the norm of what each sub-layer adds to the stream.

    Args:
        model: Model exposing ``transformer.h`` and ``transformer.wte``.
        inputs: Mapping holding an ``"input_ids"`` tensor that is already on
            the model's device.

    Returns:
        ``{"attn": [...], "mlp": [...]}`` with one float per block, or
        ``None`` when the model has no GPT-2-style blocks or the replay
        fails. A block whose replay raises contributes ``0.0`` to both lists.
    """
    transformer = getattr(model, "transformer", None)
    if transformer is None or not hasattr(transformer, "h"):
        return None

    attn_norms = []
    mlp_norms = []
    try:
        # Inference only: without no_grad every block would record an
        # autograd graph that is never used.
        with torch.no_grad():
            x = transformer.wte(inputs["input_ids"])
            # GPT-2 adds learned positional embeddings to the token
            # embeddings before the first block; include them when present
            # so the replay matches the model's real residual stream.
            wpe = getattr(transformer, "wpe", None)
            if wpe is not None:
                x = x + wpe(torch.arange(x.shape[1], device=x.device))

            for block in transformer.h:
                try:
                    attn_out = block.attn(block.ln_1(x))[0]
                    x = x + attn_out
                    mlp_out = block.mlp(block.ln_2(x))
                    x = x + mlp_out
                    norms = (torch.norm(attn_out).item(), torch.norm(mlp_out).item())
                except Exception:
                    # Keep one entry per block so layer indices stay aligned.
                    norms = (0.0, 0.0)
                attn_norms.append(norms[0])
                mlp_norms.append(norms[1])
    except Exception:
        return None

    return {"attn": attn_norms, "mlp": mlp_norms}
+# ---------------- ACTIVATION PATCHING (SAFE VERSION) ----------------
def activation_patch(tokens, model_name, layer, pos, from_pos, scale=1.0):
    """Patch one residual-stream vector and return the new next-token predictions.

    After block ``layer`` has run, the hidden vector at ``from_pos``
    (multiplied by ``scale``) overwrites the vector at ``pos``; the remaining
    blocks, the final layer norm (if any) and the LM head then run on the
    patched stream. Only GPT-2-style models (``transformer.h`` blocks with
    ``ln_1``/``attn``/``ln_2``/``mlp``) are supported.

    Args:
        tokens: Token strings as returned by the tokenizer's
            ``convert_ids_to_tokens`` for the analysed text.
        model_name: Model identifier handed to ``load_model``.
        layer: Zero-based index of the block whose output is patched.
        pos: Token position to overwrite.
        from_pos: Token position to copy from.
        scale: Multiplier applied to the copied vector.

    Returns:
        ``{"tokens": [...], "scores": [...]}`` holding the 20 most probable
        next tokens, most probable first, or ``{"error": message}``.
    """
    try:
        model, tokenizer = load_model(model_name)
    except Exception:
        return {"error": "Model load error."}

    if not hasattr(model, "transformer") or not hasattr(model.transformer, "h"):
        return {"error": "Model not compatible with patching."}

    transformer = model.transformer
    blocks = transformer.h
    ln_f = getattr(transformer, "ln_f", None)

    # Validate before doing any work. ``layer == len(blocks)`` used to slip
    # through and silently patch nothing; negative indices used to wrap.
    if not 0 <= layer < len(blocks):
        return {"error": "Layer out of range."}
    seq_len = len(tokens)
    if not (0 <= pos < seq_len and 0 <= from_pos < seq_len):
        return {"error": "Position out of range."}

    try:
        # Map the token strings straight back to ids. Joining them with
        # spaces and re-tokenizing does not round-trip (the strings carry
        # the tokenizer's own space markers), which changed both the ids
        # and the sequence length.
        ids = tokenizer.convert_tokens_to_ids(list(tokens))
        input_ids = torch.tensor([ids], dtype=torch.long, device=DEVICE)

        with torch.no_grad():
            x = transformer.wte(input_ids)
            # GPT-2 adds learned positional embeddings before the first
            # block; include them when the model has them.
            wpe = getattr(transformer, "wpe", None)
            if wpe is not None:
                x = x + wpe(torch.arange(seq_len, device=x.device))

            for i, block in enumerate(blocks):
                x = x + block.attn(block.ln_1(x))[0]
                x = x + block.mlp(block.ln_2(x))
                if i == layer:
                    # Source and destination both come from this block's
                    # output. The earlier two-pass version read the source
                    # from the block's *input* (off by one), mixing layers.
                    x[0, pos, :] = x[0, from_pos, :] * float(scale)

            final = ln_f(x) if ln_f is not None else x
            logits = model.lm_head(final)[0, -1, :].cpu().numpy()
    except Exception as e:
        return {"error": f"Patching failed: {e}"}

    probs = softmax(logits)
    idx = np.argsort(probs)[-20:][::-1]
    return {
        "tokens": [tokenizer.decode([int(i)]) for i in idx],
        "scores": probs[idx].tolist(),
    }
+# ---------------- GRADIO UI ----------------
def _index_slider(label):
    """Create a step-1 slider whose minimum, maximum and value all start at 0."""
    return gr.Slider(minimum=0, maximum=0, value=0, step=1, label=label)


def _preset_text(text):
    """Return a zero-argument callback that yields ``text``."""
    return lambda: text


with gr.Blocks(title="LLM Visualizer — Full", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Full LLM Visualizer (Advanced)")
    gr.Markdown("Fully stable build with attention, PCA, neuron explorer, residuals, activation-patching")

    # Panel 1 — inputs on the left, textual results on the right
    with gr.Row():
        with gr.Column(scale=3):
            model_name = gr.Textbox(label="Model", value=DEFAULT_MODEL)
            input_text = gr.Textbox(label="Input", value="Hello world", lines=3)
            simple = gr.Checkbox(label="Explain simply", value=True)
            run_btn = gr.Button("Run", variant="primary")
            gr.Markdown("Presets:")
            with gr.Row():
                # Each preset button writes its sample prompt into the input box.
                for preset_label, preset_prompt in (
                    ("Greeting", "Hello! How are you?"),
                    ("Story", "Once upon a time there was a robot."),
                    ("Question", "Why is the sky blue?"),
                ):
                    gr.Button(preset_label).click(
                        _preset_text(preset_prompt), None, input_text
                    )
        with gr.Column(scale=2):
            token_display = gr.Markdown()
            explanation_md = gr.Markdown()
            model_info = gr.Markdown()

    # Panel 2 — layer/head/token selectors and the plots they drive
    with gr.Row():
        with gr.Column():
            layer_slider = _index_slider("Layer")
            head_slider = _index_slider("Head")
            token_step = _index_slider("Token index")
            attn_plot = gr.Plot()
        with gr.Column():
            pca_plot = gr.Plot()
            step_attn_plot = gr.Plot()
            probs_plot = gr.Plot()

    # Panel 3 — Residuals
    residual_plot = gr.Plot()

    # Panel 4 — Neuron explorer
    with gr.Row():
        neuron_find_btn = gr.Button("Find neurons")
        neuron_idx = gr.Number(label="Neuron index", value=0)
        neuron_table = gr.Dataframe(headers=["token", "activation"], interactive=False)

    # Panel 5 — Activation Patching
    with gr.Row():
        patch_layer = _index_slider("Patch layer")
        patch_pos = _index_slider("Target token position")
        patch_from = _index_slider("Copy from position")
        patch_scale = gr.Number(label="Scale", value=1.0)
        patch_btn = gr.Button("Run patch")
        patch_output = gr.Plot()

    # Holds the latest analysis result so the other handlers can reuse it.
    state = gr.State()
+    # ---- RUN ANALYSIS ----
+    def run_app(text, model, simp):
+        res = analyze_text(text, model, simp)
+        if "error" in res:
+            return {
+                token_display: gr.update(value=""),
+                explanation_md: gr.update(value=res["error"]),
+                model_info: gr.update(value=f"Model: {model}"),
+                attn_plot: gr.update(value=None),
+                pca_plot: gr.update(value=None),
+                probs_plot: gr.update(value=None),
+                layer_slider: gr.update(maximum=0, value=0),
+                head_slider: gr.update(maximum=0, value=0),
+                token_step: gr.update(maximum=0, value=0),
+                residual_plot: gr.update(value=None),
+                neuron_table: gr.update(value=[]),
+                patch_layer: gr.update(maximum=0),
+                patch_pos: gr.update(maximum=0),
+                patch_from: gr.update(maximum=0),
+                state: res
+            }
+        tokens = res["tokens"]
+        L = len(res["attentions"])
+        H = res["attentions"][0].shape[0]
+        T = len(tokens) - 1
+        residual_fig = None
+        if res["residuals"]:
+            attn_vals = res["residuals"]["attn"]
+            ml_vals = res["residuals"]["mlp"]
+            Lmin = min(len(attn_vals), len(ml_vals))
+            df = pd.DataFrame({
+                "layer": list(range(Lmin)),
+                "attention": attn_vals[:Lmin],
+                "mlp": ml_vals[:Lmin]
+            })
+            fig = go.Figure()
+            fig.add_trace(go.Bar(x=df["layer"], y=df["attention"], name="Attention norm"))
+            fig.add_trace(go.Bar(x=df["layer"], y=df["mlp"], name="MLP norm"))
+            fig.update_layout(barmode="group", height=360)
+            residual_fig = fig
+        return {
+            token_display: gr.update(value=f"**Tokens:** {res['token_display']}"),
+            explanation_md: gr.update(value=res["explanation"]),
+            model_info: gr.update(value=f"Model: {model} • layers: {L} • heads: {H} • tokens: {len(tokens)}"),
+            attn_plot: gr.update(value=res["fig_attn"] if res.get("fig_attn") else None),
+            pca_plot: gr.update(value=res["fig_pca"] if res.get("fig_pca") else None),
+            probs_plot: gr.update(value=fig_probs(res["top_tokens"], res["top_scores"])),
+            layer_slider: gr.update(maximum=L-1, value=res["default_layer"]),
+            head_slider: gr.update(maximum=H-1, value=res["default_head"]),
+            token_step: gr.update(maximum=T, value=0),
+            residual_plot: gr.update(value=residual_fig),
+            neuron_table: gr.update(value=[[t, round(v,4)] for t,v in res["neuron_info"][0]["top_tokens"]] if res["neuron_info"] else []),
+            patch_layer: gr.update(maximum=L-1, value=0),
+            patch_pos: gr.update(maximum=T, value=0),
+            patch_from: gr.update(maximum=T, value=0),
+            state: res
+        }
+    run_btn.click(
+        run_app,
+        inputs=[input_text, model_name, simple],
+        outputs=[
+            token_display, explanation_md, model_info,
+            attn_plot, pca_plot, probs_plot,
+            layer_slider, head_slider, token_step,
+            residual_plot, neuron_table,
+            patch_layer, patch_pos, patch_from,
+            state
+        ]
+    )
+    # ---- SLIDER UPDATES ----
+    def update_view(res, layer, head, tok):
+        if not res or "error" in res:
+            return {
+                attn_plot: gr.update(value=None),
+                pca_plot: gr.update(value=None),
+                step_attn_plot: gr.update(value=None),
+            }
+        tokens = res["tokens"]
+        layer = min(max(0, layer), len(res["attentions"]) - 1)
+        head = min(max(0, head), res["attentions"][0].shape[0] - 1)
+        tok = min(max(0, tok), len(tokens) - 1)
+        att = fig_attention(res["attentions"][layer][head], tokens, f"Layer {layer} Head {head}")
+        pts = res["pca"][layer]
+        pca_fig = fig_pca(pts, tokens, highlight=tok, title=f"PCA Layer {layer}")
+        row = res["attentions"][layer][head][tok]
+        step_fig = go.Figure([go.Bar(x=tokens, y=row)])
+        step_fig.update_layout(title=f"Token {tok} attends to")
+        return {
+            attn_plot: gr.update(value=att),
+            pca_plot: gr.update(value=pca_fig),
+            step_attn_plot: gr.update(value=step_fig)
+        }
+    layer_slider.change(update_view, [state, layer_slider, head_slider, token_step],
+                        [attn_plot, pca_plot, step_attn_plot])
+    head_slider.change(update_view, [state, layer_slider, head_slider, token_step],
+                        [attn_plot, pca_plot, step_attn_plot])
+    token_step.change(update_view, [state, layer_slider, head_slider, token_step],
+                      [attn_plot, pca_plot, step_attn_plot])
+    # ---- NEURON EXPLORER ----
+    def neuron_auto(res):
+        if not res or "neuron_info" not in res:
+            return gr.update(value=[])
+        rows = []
+        for item in res["neuron_info"]:
+            for t, v in item["top_tokens"]:
+                rows.append([t, round(v,4)])
+        df = pd.DataFrame(rows, columns=["token","activation"]).drop_duplicates().head(24)
+        return gr.update(value=df.values.tolist())
+    neuron_find_btn.click(neuron_auto, [state], [neuron_table])
+    def neuron_manual(res, idx):
+        if not res or "hidden" not in res:
+            return gr.update(value=[])
+        try:
+            idx = int(idx)
+        except:
+            return gr.update(value=[])
+        last = res["hidden"][-1]
+        if idx >= last.shape[1]:
+            return gr.update(value=[])
+        vals = last[:, idx]
+        tokens = res["tokens"]
+        pairs = sorted([(tokens[i], float(vals[i])) for i in range(len(tokens))],
+                       key=lambda x: -abs(x[1]))[:12]
+        return gr.update(value=[[t, round(v,4)] for t,v in pairs])
+    neuron_idx.change(neuron_manual, [state, neuron_idx], [neuron_table])
+    # ---- ACTIVATION PATCHING ----
+    def patch_run(res, L, P, FP, S, model):
+        if not res or "tokens" not in res:
+            return gr.update(value=None)
+        out = activation_patch(res["tokens"], model, int(L), int(P), int(FP), float(S))
+        if "error" in out:
+            return gr.update(value=None)
+        fig = fig_probs(out["tokens"], out["scores"])
+        return gr.update(value=fig)
+    patch_btn.click(patch_run,
+                    [state, patch_layer, patch_pos, patch_from, patch_scale, model_name],
+                    [patch_output])
# Launch the Gradio app built above; this runs whenever the module is executed.
demo.launch()