import collections

import gradio as gr
from transformers import AutoTokenizer

# Map of display names to HF model IDs
MODEL_MAP = {
    "Nomic Embed v1.5": "nomic-ai/nomic-embed-text-v1.5",
    "MixedBread XSmall v1": "mixedbread-ai/mxbai-embed-xsmall-v1",
    "Google EmbeddingGemma 300m": "google/embeddinggemma-300m",
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "BGE-M3": "BAAI/bge-m3",
    "BERT Base (Baseline WordPiece)": "bert-base-uncased",
    "RoBERTa Base (Byte-Level BPE)": "roberta-base",
    "E5 Mistral 7B (Llama Tokenizer)": "intfloat/e5-mistral-7b-instruct",
}

# Global cache for tokenizers: each HF model ID is loaded at most once.
tokenizer_cache = {}


def get_tokenizer(model_name):
    """Lazily load and cache the tokenizer for a display name from MODEL_MAP.

    Returns a ``(tokenizer, error)`` pair; exactly one of the two is ``None``.
    Failed loads are NOT cached, so a transient error can be retried.
    """
    model_id = MODEL_MAP[model_name]
    if model_id not in tokenizer_cache:
        print(f"Loading tokenizer: {model_id}...")
        try:
            # NOTE(review): trust_remote_code=True executes code shipped in the
            # model repo. Acceptable here only because MODEL_MAP is a fixed
            # allow-list of known repos — do not pass user-supplied IDs.
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                model_id, trust_remote_code=True
            )
        except Exception as e:
            return None, f"Error loading tokenizer: {str(e)}"
    return tokenizer_cache[model_id], None


def format_byte_token(text):
    """
    Attempts to identify if a token is a RoBERTa/GPT-2 style byte mapping
    (e.g., 'â' representing 0xE2) and converts it to <0xXX> for clarity.
    """
    # If the text is just one char and looks "weird" (extended unicode),
    # it might be a byte mapping.
    if len(text) == 1 and ord(text) > 256:
        # This is a heuristic: RoBERTa maps bytes to specific unicode ranges.
        # It's safer to just label it as a byte artifact if it matches our
        # fragmentation logic.
        return f"<{hex(ord(text))}>"
    return text


def analyze_tokenization(text, model_name=next(iter(MODEL_MAP))):
    """Tokenize *text* with the selected model and classify each token.

    Labels (for gr.HighlightedText):
      * "UNK (Data Loss)" — the tokenizer emitted its unknown token.
      * "Byte/Fragment"  — a single input character was split into several
        tokens (byte fallback), or the token is a Llama-style <0xXX> byte.
      * "Subword"        — ordinary subword continuation.
      * None             — a plain, whole-word token.

    Returns ``(spans, status)``: a list of ``(display_text, label)`` tuples
    and a status/summary string (error message on failure).
    """
    tokenizer, error = get_tokenizer(model_name)
    if error:
        return [], error

    try:
        # Tokenize with offsets so tokens can be mapped back to input chars.
        encoding = tokenizer(
            text, add_special_tokens=False, return_offsets_mapping=True
        )
    except Exception as e:
        return [], f"Tokenization failed: {str(e)}"

    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
    ids = encoding["input_ids"]
    offsets = encoding["offset_mapping"]

    # Map character indices to the list of tokens that cover them.
    char_coverage = collections.defaultdict(list)
    for i, (start, end) in enumerate(offsets):
        for char_idx in range(start, end):
            char_coverage[char_idx].append(i)

    output_spans = []
    for i, (token, token_id) in enumerate(zip(tokens, ids)):
        label = None
        display_text = token

        # --- Visual Cleanup for RoBERTa/GPT-2 ---
        # Replace the special 'Ġ' (G with dot) which represents a space
        display_text = display_text.replace('Ġ', ' ')
        # Replace 'Ċ' (C with dot) which represents a newline
        display_text = display_text.replace('Ċ', '\n')
        # Replace 'ĉ' which represents a tab/control
        display_text = display_text.replace('ĉ', '\t')

        # Check 1: Explicit UNK (The "Hard Failure")
        if token_id == tokenizer.unk_token_id:
            label = "UNK (Data Loss)"

        # Check 2: Byte Fallback / Fragmentation
        start, end = offsets[i]
        is_fragment = False
        # If a single character in the input generated multiple tokens,
        # it's a fragmentation/byte-split.
        if (end - start) == 1:
            tokens_covering_this_char = char_coverage[start]
            if len(tokens_covering_this_char) > 1:
                is_fragment = True
        # Check for Llama/Mistral style byte tokens (<0xE2>)
        if token.startswith("<0x") and token.endswith(">"):
            is_fragment = True

        if is_fragment and label is None:
            label = "Byte/Fragment"
            # If it's a RoBERTa weird char (like â), try to show it as hex
            # to make it look less like random noise.
            if len(display_text) == 1 and ord(display_text) > 127:
                # It's likely a mapped byte. We don't have the reverse map
                # easily accessible, but we can mark it clearly.
                display_text = f"<{display_text}>"

        # Check 3: Subwords (Blue)
        if label is None:
            # WordPiece '##'
            if token.startswith("##"):
                label = "Subword"
            # SentencePiece/RoBERTa often treats non-leading-space tokens
            # as subwords.
            elif i > 0 and not token.startswith("Ġ") and not token.startswith(" "):
                # Heuristic: if the previous token ended at the same spot
                # this one starts, it continues the same word.
                prev_end = offsets[i - 1][1]
                if start == prev_end:
                    label = "Subword"

        output_spans.append((display_text, label))

    return output_spans, f"Total Tokens: {len(tokens)}"


# Scientific text example
scientific_text = "Acidity (pKa)2.97 (25 °C)[5] 13.82 (20 °C)[3] UV-vis (λmax)210 nm (χ)−72.23·10−6 cm3/mol"

with gr.Blocks(title="Embedding Model Tokenizer Detective") as demo:
    gr.Markdown(
        """
        # 🕵️‍♀️ Embedding Model Tokenizer Detective
        Different embedding models handle unknown characters (OOV) differently.
        * **Red (UNK):** The model **deleted** information. It saw a symbol it didn't know and replaced it with a generic placeholder.
        * **Orange (Byte/Fragment):** The model **struggled** and split a single character (like a Greek letter or math symbol) into multiple raw bytes.
        * **Blue:** Standard subword splitting.
        """
    )

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                lines=5,
                placeholder="Enter scientific or multilingual text here...",
                value=scientific_text,
            )
            model_selector = gr.Dropdown(
                label="Select Embedding Model / Tokenizer",
                choices=list(MODEL_MAP.keys()),
                value="Nomic Embed v1.5",
            )
            analyze_btn = gr.Button("Diagnose Tokenization", variant="primary")
        with gr.Column():
            output_display = gr.HighlightedText(
                label="Tokenized Analysis",
                combine_adjacent=False,
                show_legend=True,
                color_map={
                    "UNK (Data Loss)": "red",
                    "Byte/Fragment": "orange",
                    "Subword": "blue",
                },
            )
            stats_output = gr.Label(label="Statistics")

    analyze_btn.click(
        fn=analyze_tokenization,
        inputs=[input_text, model_selector],
        outputs=[output_display, stats_output],
    )

    gr.Examples(
        examples=[
            ["The quick brown fox jumps over the lazy dog."],
            [scientific_text],
            ["susceptibility (Ⅹ) = −72.23·10−6 cm3/mol"],
            ["汉字漢字カタカナひらがな"],
            ["⅕ of a pizza is 2 slices."],
            ["😊 😂 🥺"],
        ],
        inputs=[input_text],
        # outputs must be provided when fn + run_on_click are set, otherwise
        # clicking an example cannot render the analysis.
        outputs=[output_display, stats_output],
        fn=analyze_tokenization,
        run_on_click=True,
    )

if __name__ == "__main__":
    demo.launch()