import gradio as gr
from transformers import AutoTokenizer
import collections

# Map of display names to HF model IDs
MODEL_MAP = {
    "Nomic Embed v1.5": "nomic-ai/nomic-embed-text-v1.5",
    "MixedBread XSmall v1": "mixedbread-ai/mxbai-embed-xsmall-v1",
    "Google EmbeddingGemma 300m": "google/embeddinggemma-300m",
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "BGE-M3": "BAAI/bge-m3",
    "BERT Base (Baseline WordPiece)": "bert-base-uncased",
    "RoBERTa Base (Byte-Level BPE)": "roberta-base",
    "E5 Mistral 7B (Llama Tokenizer)": "intfloat/e5-mistral-7b-instruct",
}
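# The list deliberately mixes tokenizer families: WordPiece (BERT, MiniLM),
# byte-level BPE (RoBERTa), and SentencePiece with byte fallback (the Llama
# tokenizer used by E5 Mistral), so their out-of-vocabulary behaviour can be
# compared side by side.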

# Global cache for tokenizers
tokenizer_cache = {}

def get_tokenizer(model_name):
    """Lazy load tokenizers."""
    model_id = MODEL_MAP[model_name]
    if model_id not in tokenizer_cache:
        print(f"Loading tokenizer: {model_id}...")
        try:
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        except Exception as e:
            return None, f"Error loading tokenizer: {str(e)}"
    return tokenizer_cache[model_id], None
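# Note: trust_remote_code=True runs any custom tokenizer code shipped in the
# model repo (some repos on the list require it); only enable it for sources
# you trust.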

def format_byte_token(text):
    """
    Heuristic for spotting RoBERTa/GPT-2 style byte mappings. Byte-level BPE
    maps unprintable bytes to codepoints at U+0100 and above (e.g., a tab
    becomes 'ĉ'), so a lone character in that range is almost certainly a
    byte artifact; it gets wrapped as <0x...> with its mapped codepoint.
    Printable latin-1 bytes (e.g., 'â' for 0xE2) map to themselves and
    cannot be told apart from real text without the reverse byte map.
    """
    if len(text) == 1 and ord(text) > 256:
        return f"<{hex(ord(text))}>"
    return text
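# Sanity check of the heuristic above (it only fires on the mapped codepoints):
#   format_byte_token('ĉ')  ->  '<0x109>'   (placeholder for the tab byte)
#   format_byte_token('â')  ->  'â'         (printable latin-1; left alone)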

def analyze_tokenization(text, model_name=next(iter(MODEL_MAP))):
    tokenizer, error = get_tokenizer(model_name)
    if error:
        return [], error

    try:
        # Tokenize with offsets
        encoding = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
    except Exception as e:
        return [], f"Tokenization failed: {str(e)}"

    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
    ids = encoding["input_ids"]
    offsets = encoding["offset_mapping"]

    # Map character indices to the list of tokens that cover them
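    # (A multi-byte character split by byte fallback yields several tokens
    # that all map back to one source character: 'λ' (U+03BB) is two UTF-8
    # bytes, 0xCE 0xBB, so a Llama-style tokenizer emits <0xCE><0xBB> and
    # both tokens typically share the same one-character offset span.)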
    char_coverage = collections.defaultdict(list)
    for i, (start, end) in enumerate(offsets):
        for char_idx in range(start, end):
            char_coverage[char_idx].append(i)

    output_spans = []
    
    for i, (token, token_id) in enumerate(zip(tokens, ids)):
        label = None
        display_text = token

        # --- Visual cleanup for byte-level BPE (RoBERTa/GPT-2) ---
        # These tokenizers render whitespace bytes as printable placeholders:
        # 'Ġ' (U+0120) marks a leading space, 'Ċ' (U+010A) a newline,
        # and 'ĉ' (U+0109) a tab.
        display_text = display_text.replace('Ġ', ' ')
        display_text = display_text.replace('Ċ', '\n')
        display_text = display_text.replace('ĉ', '\t')
        
        # Check 1: Explicit UNK (The "Hard Failure")
        if token_id == tokenizer.unk_token_id:
            label = "UNK (Data Loss)"
        
        # Check 2: Byte Fallback / Fragmentation
        start, end = offsets[i]
        is_fragment = False
        
        # If a single character in the input generated multiple tokens, it's a fragmentation/byte-split
        if (end - start) == 1:
            tokens_covering_this_char = char_coverage[start]
            if len(tokens_covering_this_char) > 1:
                is_fragment = True
        
        # Check for Llama/Mistral style byte tokens (<0xE2>)
        if token.startswith("<0x") and token.endswith(">"):
            is_fragment = True

        if is_fragment and label is None:
            label = "Byte/Fragment"
            # For byte-level BPE artifacts (like 'â' standing in for 0xE2),
            # wrap the character in angle brackets so it reads as a byte
            # marker rather than noise. Recovering the original byte value
            # would require the tokenizer's reverse byte map.
            if len(display_text) == 1 and ord(display_text) > 127:
                display_text = f"<{display_text}>"

        # Check 3: Subwords (Blue)
        if label is None:
            # WordPiece marks continuations explicitly with '##'
            if token.startswith("##"):
                label = "Subword"
            # SentencePiece ('▁') and byte-level BPE ('Ġ') mark word starts
            # instead, so a token without a start marker that begins exactly
            # where the previous token ended is a subword continuation.
            elif i > 0 and not token.startswith(("Ġ", "▁", " ")):
                prev_end = offsets[i - 1][1]
                if start == prev_end:
                    label = "Subword"

        output_spans.append((display_text, label))

    return output_spans, f"Total Tokens: {len(tokens)}"
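# analyze_tokenization returns a list of (display_text, label) pairs for
# gr.HighlightedText, where label is "UNK (Data Loss)", "Byte/Fragment",
# "Subword", or None for plain tokens, plus a stats string for gr.Label.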

# Scientific text example
scientific_text = "Acidity (pKa)2.97 (25 °C)[5] 13.82 (20 °C)[3] UV-vis (λmax)210 nm (χ)−72.23·10−6 cm3/mol"
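# Infobox-style chemistry text: degree signs, Greek letters (λ, χ), Unicode
# minus signs (−), and middle dots (·) are exactly the characters that expose
# UNK and byte-fallback behaviour.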

with gr.Blocks(title="Embedding Model Tokenizer Detective") as demo:
    gr.Markdown(
        """
        # 🕵️‍♀️ Embedding Model Tokenizer Detective
        
        Different embedding models handle out-of-vocabulary (OOV) characters differently. 
        
        * **Red (UNK):** The model **deleted** information. It saw a symbol it didn't know and replaced it with a generic placeholder.
        * **Orange (Byte/Fragment):** The model **struggled** and split a single character (like a Greek letter or math symbol) into multiple raw bytes.
        * **Blue:** Standard subword splitting.
        """
    )
    
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text", 
                lines=5, 
                placeholder="Enter scientific or multilingual text here...",
                value=scientific_text
            )
            model_selector = gr.Dropdown(
                label="Select Embedding Model / Tokenizer",
                choices=list(MODEL_MAP.keys()),
                value="Nomic Embed v1.5"
            )
            analyze_btn = gr.Button("Diagnose Tokenization", variant="primary")
            
        with gr.Column():
            output_display = gr.HighlightedText(
                label="Tokenized Analysis",
                combine_adjacent=False,
                show_legend=True,
                color_map={"UNK (Data Loss)": "red", "Byte/Fragment": "orange", "Subword": "blue"}
            )
            stats_output = gr.Label(label="Statistics")

    analyze_btn.click(
        fn=analyze_tokenization,
        inputs=[input_text, model_selector],
        outputs=[output_display, stats_output]
    )

    gr.Examples(
        examples=[
            ["The quick brown fox jumps over the lazy dog."],
            [scientific_text],
            ["susceptibility (Ⅹ) = −72.23·10−6 cm3/mol"],
            ["汉字漢字カタカナひらがな"],
            ["⅕ of a pizza is 2 slices."],
            ["😊 😂 🥺"],
        ],
        inputs=[input_text],
        outputs=[output_display, stats_output],
        fn=analyze_tokenization,
        run_on_click=True
    )

if __name__ == "__main__":
    demo.launch()