Upload app.py with huggingface_hub
app.py CHANGED
@@ -30,6 +30,12 @@ def _(mo):
         |----------|-------------|
         | **VLM** | Send the cover image directly to a Vision-Language Model |
         | **Text** | Extract text from image first (OCR), then send to an LLM |
+
+        ---
+
+        ## Evaluation Results
+
+        Select a task below to see how different models performed:
         """
     )
     return
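The VLM and Text rows in the cell above are the whole difference between the two pipelines: what gets sent to the model. A minimal sketch of that contrast, assuming huggingface_hub's InferenceClient for inference and a placeholder run_ocr() helper; the actual evaluation harness is not part of this diff.

from huggingface_hub import InferenceClient

client = InferenceClient()
cover_url = "https://example.org/cover.jpg"  # placeholder cover image URL
prompt = "Extract the book title from this cover."


def run_ocr(url: str) -> str:
    # Placeholder for the OCR step of the Text approach; a real run would
    # call an OCR engine or OCR-capable model here.
    return "text extracted from the cover image"


# VLM approach: the cover image itself goes to a vision-language model.
vlm_answer = client.chat_completion(
    model="Qwen/Qwen3-VL-8B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": cover_url}},
            {"type": "text", "text": prompt},
        ],
    }],
)

# Text approach: OCR first, then only the extracted text goes to a text-only LLM.
text_answer = client.chat_completion(
    model="meta-llama/Llama-3.1-8B-Instruct",  # any text-only instruct model here
    messages=[{"role": "user", "content": f"{prompt}\n\nOCR text:\n{run_ocr(cover_url)}"}],
)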
@@ -46,7 +52,6 @@ def _():
 @app.cell
 def _(evals_df, mo):
     # Load evaluation results with persistent caching
-    # First run downloads ~180MB, subsequent runs load from disk cache
     with mo.persistent_cache(name="doab_evals"):
         df_raw = evals_df("hf://datasets/davanstrien/doab-title-extraction-evals", quiet=True)
 
@@ -66,7 +71,7 @@ def _(evals_df, mo):
     # Convert score to percentage
     df_raw["accuracy"] = df_raw["score_headline_value"] * 100
 
-    # Parameter sizes and URLs
+    # Parameter sizes and URLs
     model_info = {
         "hf-inference-providers/Qwen/Qwen3-VL-8B-Instruct": {
             "params": 8,
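Later cells rely on param_size_b, model_url, and model_short columns, but the part of this cell that derives them is not visible in the hunk. A rough sketch of how a mapping like model_info could be joined onto the results; the "model" column name and the "url" field are assumptions, not confirmed by the diff.

import pandas as pd

# Stand-in for the dataframe loaded above.
df_raw = pd.DataFrame({"model": ["hf-inference-providers/Qwen/Qwen3-VL-8B-Instruct"]})

# Sketch only: field names other than "params" are assumptions.
model_info = {
    "hf-inference-providers/Qwen/Qwen3-VL-8B-Instruct": {
        "params": 8,
        "url": "https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct",
    },
    # ... one entry per evaluated model
}

df_raw["param_size_b"] = df_raw["model"].map(lambda m: model_info.get(m, {}).get("params"))
df_raw["model_url"] = df_raw["model"].map(lambda m: model_info.get(m, {}).get("url", ""))
df_raw["model_short"] = df_raw["model"].str.split("/").str[-1]  # e.g. "Qwen3-VL-8B-Instruct"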
@@ -101,33 +106,22 @@ def _(evals_df, mo):
 
 
 @app.cell
-def _(df_raw, mo):
-
-
-
-        value="Title Extraction",
-        label="Select task",
-    )
-    return (task_selector,)
-
-
-@app.cell
-def _(df_raw, mo, task_selector):
-    # Filter by selected task
-    df = df_raw[df_raw["task_category"] == task_selector.value].copy()
+def _(alt, df_raw, mo):
+    def make_task_content(task_name):
+        """Generate the complete results view for a task."""
+        df = df_raw[df_raw["task_category"] == task_name].copy()
 
-
-
-
-
+        # Calculate summary stats
+        vlm_avg = df[df["approach"] == "VLM"]["accuracy"].mean()
+        text_avg = df[df["approach"] == "Text"]["accuracy"].mean()
+        diff = vlm_avg - text_avg
 
-
+        task_desc = "book titles" if task_name == "Title Extraction" else "full metadata (title, subtitle, publisher, year, ISBN)"
 
-
-
-    mo.md(
+        # Results summary
+        results_md = mo.md(
         f"""
-
+        ### Summary
 
         | Approach | Average Accuracy |
         |----------|-----------------|
@@ -136,98 +130,60 @@ def _(df_raw, mo, task_selector):
 
         **VLM advantage: +{diff:.0f} percentage points**
 
-        VLMs {'significantly ' if diff > 15 else ''}outperform text extraction for extracting {task_desc}
+        VLMs {'significantly ' if diff > 15 else ''}outperform text extraction for extracting {task_desc}.
         """
     )
-    ])
-    return df, diff, task_desc, text_avg, vlm_avg
-
-
-@app.cell
-def _(mo):
-    mo.md("## Model Size vs Accuracy")
-    return
-
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
-    mo.vstack([
-        mo.as_html(chart),
-        mo.md("*Hover over points to see model details*"),
-    ])
-    return (chart,)
-
-
-@app.cell
-def _(mo):
-    mo.md("## Model Leaderboard")
-    return
+        # Scatter plot
+        chart = alt.Chart(df).mark_circle(size=200, opacity=0.8).encode(
+            x=alt.X("param_size_b:Q", title="Parameters (Billions)", scale=alt.Scale(zero=False)),
+            y=alt.Y("accuracy:Q", title="Accuracy (%)", scale=alt.Scale(domain=[50, 105])),
+            color=alt.Color("approach:N", title="Approach", scale=alt.Scale(domain=["VLM", "Text"], range=["#1f77b4", "#ff7f0e"])),
+            tooltip=[
+                alt.Tooltip("model_short:N", title="Model"),
+                alt.Tooltip("approach:N", title="Approach"),
+                alt.Tooltip("param_size_b:Q", title="Params (B)"),
+                alt.Tooltip("accuracy:Q", title="Accuracy", format=".1f"),
+            ],
+        ).properties(
+            width=500,
+            height=300,
+            title="Model Size vs Accuracy"
+        ).configure_axis(
+            labelFontSize=12,
+            titleFontSize=14,
+        )
 
+        # Leaderboard
+        leaderboard_md = "### Model Leaderboard\n\n| Model | Approach | Params (B) | Accuracy (%) |\n|-------|----------|------------|-------------|\n"
+        for _, row in df.sort_values("accuracy", ascending=False).iterrows():
+            model_link = f"[{row['model_short']}]({row['model_url']})" if row['model_url'] else row['model_short']
+            leaderboard_md += f"| {model_link} | {row['approach']} | {row['param_size_b']} | {row['accuracy']:.1f} |\n"
 
-
-
-
-
-
-
-    )
-    return (approach_filter,)
+        return mo.vstack([
+            results_md,
+            mo.md("### Model Size vs Accuracy"),
+            mo.as_html(chart),
+            mo.md("*Hover over points to see model details*"),
+            mo.md(leaderboard_md),
+        ])
 
+    # Create tabs
+    tabs = mo.ui.tabs({
+        "π Title Extraction": make_task_content("Title Extraction"),
+        "π Full Metadata": make_task_content("Full Metadata"),
+    })
 
-
-    # Filter data based on selection
-    if approach_filter.value == "All":
-        filtered_df = df
-    else:
-        filtered_df = df[df["approach"] == approach_filter.value]
-
-    # Create leaderboard with clickable model links
-    leaderboard_data = []
-    for _, row in filtered_df.sort_values("accuracy", ascending=False).iterrows():
-        model_link = f"[{row['model_short']}]({row['model_url']})" if row['model_url'] else row['model_short']
-        leaderboard_data.append({
-            "Model": model_link,
-            "Approach": row["approach"],
-            "Params (B)": row["param_size_b"],
-            "Accuracy (%)": round(row["accuracy"], 1),
-        })
-
-    leaderboard_md = "| Model | Approach | Params (B) | Accuracy (%) |\n|-------|----------|------------|-------------|\n"
-    for row in leaderboard_data:
-        leaderboard_md += f"| {row['Model']} | {row['Approach']} | {row['Params (B)']} | {row['Accuracy (%)']} |\n"
-
-    mo.vstack([
-        approach_filter,
-        mo.md(leaderboard_md),
-    ])
-    return filtered_df, leaderboard_data, leaderboard_md
+    tabs
+    return make_task_content, tabs
 
 
 @app.cell
 def _(mo):
     mo.md(
         """
+        ---
+
         ## Why VLMs Win
 
         Book covers are **visually structured** documents:
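The refactor above folds the old task selector, chart cell, and leaderboard cell into one function and hands its output to mo.ui.tabs, which accepts a plain dict mapping tab labels to displayable content. A stripped-down sketch of that pattern in a marimo cell:

import marimo as mo

# Each tab maps a label to an already-built element; both views are computed
# up front, and switching tabs only changes which one is displayed.
tabs = mo.ui.tabs(
    {
        "Title Extraction": mo.md("Summary, chart, and leaderboard for title extraction"),
        "Full Metadata": mo.md("Summary, chart, and leaderboard for full metadata"),
    }
)
tabs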
@@ -263,7 +219,6 @@ def _(mo):
 
 @app.cell
 def _(mo):
-    # Dataset viewer iframe
     mo.Html(
         """
         <iframe