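"""Embedding Model Tokenizer Detective.

A Gradio app that runs the same input text through the tokenizers of several
embedding models and highlights how each one handles out-of-vocabulary
characters: hard UNK replacements (data loss), byte-fallback fragmentation,
and ordinary subword splitting.
"""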
import gradio as gr
from transformers import AutoTokenizer
import collections
# Map of display names to HF model IDs
MODEL_MAP = {
    "Nomic Embed v1.5": "nomic-ai/nomic-embed-text-v1.5",
    "MixedBread XSmall v1": "mixedbread-ai/mxbai-embed-xsmall-v1",
    "Google EmbeddingGemma 300m": "google/embeddinggemma-300m",
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "BGE-M3": "BAAI/bge-m3",
    "BERT Base (Baseline WordPiece)": "bert-base-uncased",
    "RoBERTa Base (Byte-Level BPE)": "roberta-base",
    "E5 Mistral 7B (Llama Tokenizer)": "intfloat/e5-mistral-7b-instruct",
}
# Global cache for tokenizers
tokenizer_cache = {}
def get_tokenizer(model_name):
    """Lazily load and cache tokenizers."""
    model_id = MODEL_MAP[model_name]
    if model_id not in tokenizer_cache:
        print(f"Loading tokenizer: {model_id}...")
        try:
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        except Exception as e:
            return None, f"Error loading tokenizer: {str(e)}"
    return tokenizer_cache[model_id], None
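# NOTE: helper for rendering GPT-2/RoBERTa byte-mapped characters as hex markers.
# It is not currently called by analyze_tokenization, which does its own display cleanup.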
def format_byte_token(text):
    """
    Attempts to identify whether a token is a RoBERTa/GPT-2 style byte mapping
    and, if so, renders it as a hex marker for clarity. GPT-2's byte-to-unicode
    table maps unprintable bytes to code points at U+0100 and above, which is
    what the check below looks for.
    """
    # Heuristic: a single character outside the Latin-1 range is likely a
    # byte-mapping artifact rather than ordinary text.
    if len(text) == 1 and ord(text) > 255:
        return f"<{hex(ord(text))}>"
    return text
def analyze_tokenization(text, model_name=next(iter(MODEL_MAP))):
    tokenizer, error = get_tokenizer(model_name)
    if error:
        return [], error
    try:
        # Tokenize with character offsets (requires a fast tokenizer)
        encoding = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
    except Exception as e:
        return [], f"Tokenization failed: {str(e)}"
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
    ids = encoding["input_ids"]
    offsets = encoding["offset_mapping"]

    # Map each character index to the list of tokens that cover it
    char_coverage = collections.defaultdict(list)
    for i, (start, end) in enumerate(offsets):
        for char_idx in range(start, end):
            char_coverage[char_idx].append(i)

    output_spans = []
    for i, (token, token_id) in enumerate(zip(tokens, ids)):
        label = None
        display_text = token

        # --- Visual cleanup for RoBERTa/GPT-2 byte-level BPE markers ---
        # 'Ġ' represents a leading space
        display_text = display_text.replace('Ġ', ' ')
        # 'Ċ' represents a newline
        display_text = display_text.replace('Ċ', '\n')
        # 'ĉ' represents a tab
        display_text = display_text.replace('ĉ', '\t')

        # Check 1: explicit UNK (the "hard failure")
        if token_id == tokenizer.unk_token_id:
            label = "UNK (Data Loss)"

        # Check 2: byte fallback / fragmentation
        start, end = offsets[i]
        is_fragment = False
        # If a single input character is covered by more than one token,
        # it was split into byte pieces.
        if (end - start) == 1:
            tokens_covering_this_char = char_coverage[start]
            if len(tokens_covering_this_char) > 1:
                is_fragment = True
        # Llama/Mistral style byte-fallback tokens look like <0xE2>
        if token.startswith("<0x") and token.endswith(">"):
            is_fragment = True

        if is_fragment and label is None:
            label = "Byte/Fragment"
            # If the visible text is a single non-ASCII character (e.g. a RoBERTa
            # byte-mapped char such as 'â'), wrap it in angle brackets so it reads
            # as an artifact rather than random noise.
            if len(display_text) == 1 and ord(display_text) > 127:
                display_text = f"<{display_text}>"

        # Check 3: subwords (blue)
        if label is None:
            # WordPiece continuation marker '##'
            if token.startswith("##"):
                label = "Subword"
            # SentencePiece ('▁') and byte-level BPE ('Ġ') mark word starts;
            # anything else that continues the previous token is a subword.
            elif i > 0 and not token.startswith("Ġ") and not token.startswith("▁") and not token.startswith(" "):
                # Heuristic: the previous token ended exactly where this one starts
                prev_end = offsets[i - 1][1]
                if start == prev_end:
                    label = "Subword"

        output_spans.append((display_text, label))

    return output_spans, f"Total Tokens: {len(tokens)}"
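# Illustrative usage (not executed here): calling
#   spans, stats = analyze_tokenization("25 °C", "BERT Base (Baseline WordPiece)")
# returns a list of (display_text, label) pairs plus a "Total Tokens: N" summary,
# which matches what gr.HighlightedText and gr.Label below expect.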
# Scientific text example
scientific_text = "Acidity (pKa)2.97 (25 °C)[5] 13.82 (20 °C)[3] UV-vis (λmax)210 nm (χ)−72.23·10−6 cm3/mol"
with gr.Blocks(title="Embedding Model Tokenizer Detective") as demo:
    gr.Markdown(
        """
        # 🕵️‍♀️ Embedding Model Tokenizer Detective
        Different embedding models handle unknown characters (OOV) differently.
        * **Red (UNK):** The model **deleted** information. It saw a symbol it didn't know and replaced it with a generic placeholder.
        * **Orange (Byte/Fragment):** The model **struggled** and split a single character (like a Greek letter or math symbol) into multiple raw bytes.
        * **Blue:** Standard subword splitting.
        """
    )
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                lines=5,
                placeholder="Enter scientific or multilingual text here...",
                value=scientific_text
            )
            model_selector = gr.Dropdown(
                label="Select Embedding Model / Tokenizer",
                choices=list(MODEL_MAP.keys()),
                value="Nomic Embed v1.5"
            )
            analyze_btn = gr.Button("Diagnose Tokenization", variant="primary")
        with gr.Column():
            output_display = gr.HighlightedText(
                label="Tokenized Analysis",
                combine_adjacent=False,
                show_legend=True,
                color_map={"UNK (Data Loss)": "red", "Byte/Fragment": "orange", "Subword": "blue"}
            )
            stats_output = gr.Label(label="Statistics")

    analyze_btn.click(
        fn=analyze_tokenization,
        inputs=[input_text, model_selector],
        outputs=[output_display, stats_output]
    )
    gr.Examples(
        examples=[
            ["The quick brown fox jumps over the lazy dog."],
            [scientific_text],
            ["susceptibility (Ⅹ) = −72.23·10−6 cm3/mol"],
            ["汉字漢字カタカナひらがな"],
            ["⅕ of a pizza is 2 slices."],
            ["😊 😂 🥺"],
        ],
        inputs=[input_text],
        outputs=[output_display, stats_output],
        fn=analyze_tokenization,
        run_on_click=True
    )
if __name__ == "__main__":
    demo.launch()