# token_detective/app.py
import gradio as gr
from transformers import AutoTokenizer
import collections
# Map of display names to HF model IDs
MODEL_MAP = {
"Nomic Embed v1.5": "nomic-ai/nomic-embed-text-v1.5",
"MixedBread XSmall v1": "mixedbread-ai/mxbai-embed-xsmall-v1",
"Google EmbeddingGemma 300m": "google/embeddinggemma-300m",
"all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
"BGE-M3": "BAAI/bge-m3",
"BERT Base (Baseline WordPiece)": "bert-base-uncased",
"RoBERTa Base (Byte-Level BPE)": "roberta-base",
"E5 Mistral 7B (Llama Tokenizer)": "intfloat/e5-mistral-7b-instruct",
}
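# The list deliberately mixes tokenizer families: WordPiece (BERT, MiniLM),
# byte-level BPE (RoBERTa), and SentencePiece (BGE-M3, EmbeddingGemma,
# E5-Mistral), so their out-of-vocabulary behavior can be compared directly.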
# Global cache for tokenizers
tokenizer_cache = {}
def get_tokenizer(model_name):
"""Lazy load tokenizers."""
model_id = MODEL_MAP[model_name]
if model_id not in tokenizer_cache:
print(f"Loading tokenizer: {model_id}...")
try:
tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
except Exception as e:
return None, f"Error loading tokenizer: {str(e)}"
return tokenizer_cache[model_id], None
def format_byte_token(text):
    """
    Render a suspected RoBERTa/GPT-2 byte-mapping character as hex
    (e.g., 'â', which stands for the raw byte 0xE2, becomes '<0xE2>').
    Characters whose byte value we can't recover are just bracketed so
    they read as artifacts rather than random noise.
    """
    if len(text) == 1 and 127 < ord(text) < 256:
        # Heuristic: GPT-2's byte-to-unicode table maps bytes 0xA1-0xFF
        # (except 0xAD) to themselves, so in this range the code point
        # *is* the raw byte value.
        return f"<0x{ord(text):02X}>"
    return f"<{text}>"
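# Worked example: 'é' is UTF-8 bytes 0xC3 0xA9; a byte-level BPE that splits it
# surfaces the mapped characters 'Ã' (0xC3) and '©' (0xA9), which render here
# as '<0xC3>' and '<0xA9>'.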
def analyze_tokenization(text, model_name=next(iter(MODEL_MAP))):
tokenizer, error = get_tokenizer(model_name)
if error:
return [], error
try:
        # Tokenize with character offsets (requires a fast tokenizer)
encoding = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
except Exception as e:
return [], f"Tokenization failed: {str(e)}"
tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
ids = encoding["input_ids"]
offsets = encoding["offset_mapping"]
# Map character indices to the list of tokens that cover them
char_coverage = collections.defaultdict(list)
for i, (start, end) in enumerate(offsets):
for char_idx in range(start, end):
char_coverage[char_idx].append(i)
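    # Example: if the single character at index 7 was split into two byte
    # tokens 3 and 4, both carry the offset span (7, 8), so
    # char_coverage[7] == [3, 4] - the signal the fragment check below uses.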
output_spans = []
for i, (token, token_id) in enumerate(zip(tokens, ids)):
label = None
display_text = token
        # --- Visual cleanup of word-boundary markers ---
        # GPT-2/RoBERTa byte-level BPE: 'Ġ' encodes a space, 'Ċ' a newline,
        # 'ĉ' a tab (all from the byte-to-unicode table).
        display_text = display_text.replace('Ġ', ' ')
        display_text = display_text.replace('Ċ', '\n')
        display_text = display_text.replace('ĉ', '\t')
        # SentencePiece (BGE-M3, EmbeddingGemma, E5-Mistral): '▁' (U+2581)
        # marks the start of a word.
        display_text = display_text.replace('▁', ' ')
# Check 1: Explicit UNK (The "Hard Failure")
if token_id == tokenizer.unk_token_id:
label = "UNK (Data Loss)"
# Check 2: Byte Fallback / Fragmentation
start, end = offsets[i]
is_fragment = False
# If a single character in the input generated multiple tokens, it's a fragmentation/byte-split
if (end - start) == 1:
tokens_covering_this_char = char_coverage[start]
if len(tokens_covering_this_char) > 1:
is_fragment = True
# Check for Llama/Mistral style byte tokens (<0xE2>)
if token.startswith("<0x") and token.endswith(">"):
is_fragment = True
if is_fragment and label is None:
label = "Byte/Fragment"
            # If it's a RoBERTa-style mapped byte (like 'â'), show it as hex
            # so it reads as a byte artifact rather than random noise.
            if len(display_text) == 1 and ord(display_text) > 127:
                display_text = format_byte_token(display_text)
# Check 3: Subwords (Blue)
if label is None:
# WordPiece '##'
if token.startswith("##"):
label = "Subword"
            # Byte-level BPE ('Ġ') and SentencePiece ('▁') mark word starts;
            # a token without such a marker that begins exactly where the
            # previous token ended is a word continuation.
            elif i > 0 and not token.startswith(("Ġ", "▁", " ")):
                prev_end = offsets[i - 1][1]
                if start == prev_end:
                    label = "Subword"
output_spans.append((display_text, label))
return output_spans, f"Total Tokens: {len(tokens)}"
# Scientific text example
scientific_text = "Acidity (pKa)2.97 (25 °C)[5] 13.82 (20 °C)[3] UV-vis (λmax)210 nm (χ)−72.23·10−6 cm3/mol"
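# The snippet above is deliberately messy: degree signs, Greek letters (λ, χ),
# middle dots, and flattened superscripts are exactly the characters that push
# embedding tokenizers into UNKs or byte fallback.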
with gr.Blocks(title="Embedding Model Tokenizer Detective") as demo:
gr.Markdown(
"""
# 🕵️‍♀️ Embedding Model Tokenizer Detective
    Different embedding models handle unknown (out-of-vocabulary) characters very differently.
* **Red (UNK):** The model **deleted** information. It saw a symbol it didn't know and replaced it with a generic placeholder.
* **Orange (Byte/Fragment):** The model **struggled** and split a single character (like a Greek letter or math symbol) into multiple raw bytes.
* **Blue:** Standard subword splitting.
"""
)
with gr.Row():
with gr.Column():
input_text = gr.Textbox(
label="Input Text",
lines=5,
placeholder="Enter scientific or multilingual text here...",
value=scientific_text
)
model_selector = gr.Dropdown(
label="Select Embedding Model / Tokenizer",
choices=list(MODEL_MAP.keys()),
value="Nomic Embed v1.5"
)
analyze_btn = gr.Button("Diagnose Tokenization", variant="primary")
with gr.Column():
output_display = gr.HighlightedText(
label="Tokenized Analysis",
combine_adjacent=False,
show_legend=True,
color_map={"UNK (Data Loss)": "red", "Byte/Fragment": "orange", "Subword": "blue"}
)
stats_output = gr.Label(label="Statistics")
analyze_btn.click(
fn=analyze_tokenization,
inputs=[input_text, model_selector],
outputs=[output_display, stats_output]
)
gr.Examples(
examples=[
["The quick brown fox jumps over the lazy dog."],
[scientific_text],
["susceptibility (Ⅹ) = −72.23·10−6 cm3/mol"],
["汉字漢字カタカナひらがな"],
["⅕ of a pizza is 2 slices."],
["😊 😂 🥺"],
],
inputs=[input_text],
        outputs=[output_display, stats_output],
fn=analyze_tokenization,
run_on_click=True
)
if __name__ == "__main__":
demo.launch()