davanstrien (HF Staff) committed
Commit d925140 · verified · 1 Parent(s): 2528d39

Upload app.py with huggingface_hub
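A commit message in this form is what `huggingface_hub` writes by default when a file is pushed programmatically. A minimal sketch of such an upload, assuming a placeholder Space id (`your-username/your-space` is illustrative, not the repo behind this commit):

```python
# Minimal sketch: push app.py to a Space with huggingface_hub.
# The repo_id is a placeholder; the default commit message for
# upload_file is "Upload <path_in_repo> with huggingface_hub".
from huggingface_hub import HfApi

api = HfApi()  # authenticates via your cached login or HF_TOKEN
api.upload_file(
    path_or_fileobj="app.py",            # local file to upload
    path_in_repo="app.py",               # destination path inside the repo
    repo_id="your-username/your-space",  # placeholder Space id
    repo_type="space",
)
```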

Files changed (1)
  1. app.py +131 -52
app.py CHANGED
@@ -14,16 +14,22 @@ def _():
  def _(mo):
  mo.md(
  """
- # DOAB Metadata Extraction: VLM vs Text

- **Can Vision-Language Models extract metadata from book covers better than text extraction?**

- This dashboard compares VLM (vision) and text-based approaches for extracting metadata from academic book covers in the [DOAB dataset](https://huggingface.co/datasets/biglam/doab-metadata-extraction).

- - **Title Extraction**: Extract just the book title (simpler task)
- - **Full Metadata**: Extract title, subtitle, publisher, year, ISBN (harder task)

- 📊 **Evaluation logs**: [davanstrien/doab-title-extraction-evals](https://huggingface.co/datasets/davanstrien/doab-title-extraction-evals)
+ # VLM vs Text: Extracting Metadata from Book Covers

+ **The Task**: Libraries and archives have millions of digitized book covers where metadata is incomplete or missing. Can we use AI to automatically extract titles and other metadata?

+ **The Question**: Should we use Vision-Language Models (VLMs) that "see" the cover image, or extract text first and send it to a standard LLM?

+ **The Answer**: VLMs win decisively for this task.

+ ---
+
+ This evaluation uses the [DOAB (Directory of Open Access Books)](https://huggingface.co/datasets/biglam/doab-metadata-extraction) dataset of academic book covers. We compare two approaches:
+
+ | Approach | How it works |
+ |----------|-------------|
+ | **VLM** | Send the cover image directly to a Vision-Language Model |
+ | **Text** | Extract text from image first (OCR), then send to an LLM |
  """
  )
  return
@@ -60,19 +66,38 @@ def _(evals_df, mo):
  # Convert score to percentage
  df_raw["accuracy"] = df_raw["score_headline_value"] * 100

- # Parameter sizes (manual mapping)
- param_sizes = {
- "hf-inference-providers/Qwen/Qwen3-VL-8B-Instruct": 8,
- "hf-inference-providers/Qwen/Qwen3-VL-30B-A3B-Thinking": 30,
- "hf-inference-providers/zai-org/GLM-4.6V-Flash": 9,
- "hf-inference-providers/openai/gpt-oss-20b": 20,
- "hf-inference-providers/Qwen/Qwen3-4B-Instruct-2507": 4,
- "hf-inference-providers/allenai/Olmo-3-7B-Instruct": 7,
+ # Parameter sizes and URLs (manual mapping)
+ model_info = {
+ "hf-inference-providers/Qwen/Qwen3-VL-8B-Instruct": {
+ "params": 8,
+ "url": "https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct"
+ },
+ "hf-inference-providers/Qwen/Qwen3-VL-30B-A3B-Thinking": {
+ "params": 30,
+ "url": "https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking"
+ },
+ "hf-inference-providers/zai-org/GLM-4.6V-Flash": {
+ "params": 9,
+ "url": "https://huggingface.co/zai-org/GLM-4.6V-Flash"
+ },
+ "hf-inference-providers/openai/gpt-oss-20b": {
+ "params": 20,
+ "url": "https://huggingface.co/openai/gpt-oss-20b"
+ },
+ "hf-inference-providers/Qwen/Qwen3-4B-Instruct-2507": {
+ "params": 4,
+ "url": "https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507"
+ },
+ "hf-inference-providers/allenai/Olmo-3-7B-Instruct": {
+ "params": 7,
+ "url": "https://huggingface.co/allenai/Olmo-3-7B-Instruct"
+ },
  }
- df_raw["param_size_b"] = df_raw["model"].map(param_sizes)
+ df_raw["param_size_b"] = df_raw["model"].apply(lambda x: model_info.get(x, {}).get("params"))
+ df_raw["model_url"] = df_raw["model"].apply(lambda x: model_info.get(x, {}).get("url", ""))

  df_raw
- return df_raw, get_task_category, param_sizes
+ return df_raw, get_task_category, model_info


  @app.cell
@@ -81,7 +106,7 @@ def _(df_raw, mo):
  task_selector = mo.ui.dropdown(
  options=["Title Extraction", "Full Metadata"],
  value="Title Extraction",
- label="Task",
+ label="Select task",
  )
  return (task_selector,)

@@ -102,7 +127,7 @@ def _(df_raw, mo, task_selector):
  task_selector,
  mo.md(
  f"""
- ## Key Results: {task_selector.value}
+ ## Results: {task_selector.value}

  | Approach | Average Accuracy |
  |----------|-----------------|
@@ -127,7 +152,6 @@ def _(mo):
  @app.cell
  def _(alt, df, mo):
  # Interactive scatter plot: model size vs accuracy
- # Labels removed - hover for model details
  chart = alt.Chart(df).mark_circle(size=200, opacity=0.8).encode(
  x=alt.X("param_size_b:Q", title="Parameters (Billions)", scale=alt.Scale(zero=False)),
  y=alt.Y("accuracy:Q", title="Accuracy (%)", scale=alt.Scale(domain=[50, 105])),
@@ -178,63 +202,118 @@ def _(approach_filter, df, mo):
  else:
  filtered_df = df[df["approach"] == approach_filter.value]

- # Create leaderboard
- leaderboard = (
- filtered_df[["model_short", "approach", "param_size_b", "accuracy"]]
- .sort_values("accuracy", ascending=False)
- .reset_index(drop=True)
- )
- leaderboard.columns = ["Model", "Approach", "Params (B)", "Accuracy (%)"]
- leaderboard["Accuracy (%)"] = leaderboard["Accuracy (%)"].round(1)
+ # Create leaderboard with clickable model links
+ leaderboard_data = []
+ for _, row in filtered_df.sort_values("accuracy", ascending=False).iterrows():
+ model_link = f"[{row['model_short']}]({row['model_url']})" if row['model_url'] else row['model_short']
+ leaderboard_data.append({
+ "Model": model_link,
+ "Approach": row["approach"],
+ "Params (B)": row["param_size_b"],
+ "Accuracy (%)": round(row["accuracy"], 1),
+ })
+
+ leaderboard_md = "| Model | Approach | Params (B) | Accuracy (%) |\n|-------|----------|------------|-------------|\n"
+ for row in leaderboard_data:
+ leaderboard_md += f"| {row['Model']} | {row['Approach']} | {row['Params (B)']} | {row['Accuracy (%)']} |\n"

  mo.vstack([
  approach_filter,
- mo.ui.table(leaderboard, selection=None),
+ mo.md(leaderboard_md),
  ])
- return filtered_df, leaderboard
+ return filtered_df, leaderboard_data, leaderboard_md
+
+
+ @app.cell
+ def _(mo):
+ mo.md(
+ """
+ ## Why VLMs Win
+
+ Book covers are **visually structured** documents:
+
+ - **Spatial layout**: Titles appear in specific locations (usually top/center)
+ - **Typography**: Larger text = more important (likely the title)
+ - **Visual hierarchy**: Authors, publishers, and other info have distinct styling
+
+ When you extract text first (OCR), you **flatten this structure** into a linear sequence. The model loses the visual cues that make it obvious what's a title vs. a subtitle vs. author name.
+
+ **Interesting finding**: Qwen3-VL-8B achieves 94% even when used as a text-only model, suggesting it has strong general text understanding - but it still does better (98%) when given the actual images.
+ """
+ )
+ return


  @app.cell
  def _(mo):
  mo.md(
  """
- ## About This Evaluation

- **Task**: Extract metadata from academic book cover images

- **Dataset**: [DOAB Metadata Extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction) - 50 samples

- **Evaluation Framework**: [Inspect AI](https://inspect.aisi.org.uk/)

- **Scoring**:
- - *Title Extraction*: Custom flexible matching (case-insensitive, handles subtitles)
+ ## The Dataset
+
+ We use the [DOAB Metadata Extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction) dataset - academic book covers from the Directory of Open Access Books.
+
+ Each sample has:
+ - Cover image (rendered from PDF)
+ - Pre-extracted page text
+ - Ground truth metadata (title, subtitle, publisher, year, ISBN)
+ """
+ )
+ return
+
+
+ @app.cell
+ def _(mo):
+ # Dataset viewer iframe
+ mo.Html(
+ """
+ <iframe
+ src="https://huggingface.co/datasets/biglam/doab-metadata-extraction/embed/viewer/default/train"
+ frameborder="0"
+ width="100%"
+ height="400px"
+ ></iframe>
+ """
+ )
+ return
+
+
+ @app.cell
+ def _(mo):
+ mo.md(
+ """
+ ## Methodology

+ **Evaluation Framework**: [Inspect AI](https://inspect.aisi.org.uk/) - an open-source framework for evaluating language models

+ **Sample Size**: 50 books (randomly sampled with fixed seed for reproducibility)
+
+ **Scoring Methods**:
+ - *Title Extraction*: Custom flexible matching scorer
+ - Case-insensitive comparison
+ - Accepts if ground truth is substring of prediction (handles subtitles)
+ - More robust than exact match for this task
  - *Full Metadata*: LLM-as-judge with partial credit

- ### Models Evaluated

- **VLM (Vision-Language Models)**:
- - Qwen3-VL-8B-Instruct (8B params)
- - Qwen3-VL-30B-A3B-Thinking (30B params)
- - GLM-4.6V-Flash (9B params)

- **Text Extraction** (OCR → LLM):
- - gpt-oss-20b (20B params)
- - Qwen3-4B-Instruct-2507 (4B params)
- - Olmo-3-7B-Instruct (7B params)
- - Qwen3-VL-8B-Instruct as text-only LLM (8B params)

- ### Why VLMs Win

- Book covers are **visually structured**:
- - Titles appear in specific locations (usually top/center)
- - Typography indicates importance (larger = more likely title)
- - Layout provides context that pure text loses

- Text extraction flattens this structure, losing valuable spatial information.

+ - Correct (1.0): Title + year + at least one other field
+ - Partial (0.5): Some fields correct
+ - Incorrect (0.0): Mostly wrong

+ **Models via**: [HuggingFace Inference Providers](https://huggingface.co/docs/inference-providers)

+ ---

+ ## Replicate This

+ The evaluation logs are stored on HuggingFace and can be loaded directly:

+ ```python
+ from inspect_ai.analysis import evals_df

+ df = evals_df("hf://datasets/davanstrien/doab-title-extraction-evals")
+ ```

  ---

- *Built with [Marimo](https://marimo.io) | Evaluation framework: [Inspect AI](https://inspect.aisi.org.uk/)*
+ *Built with [Marimo](https://marimo.io) | Evaluation framework: [Inspect AI](https://inspect.aisi.org.uk/) | Dataset: [biglam/doab-metadata-extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction)*
  """
  )
  return