import marimo

__generated_with = "0.10.9"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    return (mo,)


@app.cell
def _(mo):
    mo.md(
        """
        # VLM vs Text: Extracting Metadata from Book Covers

        **The Task**: Libraries and archives have millions of digitized book covers whose metadata is incomplete or missing. Can we use AI to automatically extract titles and other metadata?

        **The Question**: Should we use Vision-Language Models (VLMs) that "see" the cover image, or extract the text first and send it to a standard LLM?

        **The Answer**: VLMs win decisively for this task.

        ---

        This evaluation uses the [DOAB (Directory of Open Access Books)](https://huggingface.co/datasets/biglam/doab-metadata-extraction) dataset of academic book covers. We compare two approaches:

        | Approach | How it works |
        |----------|--------------|
        | **VLM** | Send the cover image directly to a Vision-Language Model |
        | **Text** | Extract the text from the image first (OCR), then send it to an LLM |

        ---

        ## Evaluation Results

        Select a task below to see how different models performed:
        """
    )
    return


@app.cell
def _():
    import pandas as pd
    import altair as alt
    from inspect_ai.analysis import evals_df
    return alt, evals_df, pd


@app.cell
def _(evals_df, mo):
    # Load evaluation results with persistent caching
    with mo.persistent_cache(name="doab_evals"):
        df_raw = evals_df("hf://datasets/davanstrien/doab-title-extraction-evals", quiet=True)

    # Add metadata columns
    df_raw["approach"] = df_raw["task_name"].apply(lambda x: "VLM" if "vlm" in x else "Text")
    df_raw["model_short"] = df_raw["model"].apply(lambda x: x.split("/")[-1])

    # Determine task category
    def get_task_category(task_name):
        if "llm_judge" in task_name:
            return "Full Metadata"
        else:
            return "Title Extraction"

    df_raw["task_category"] = df_raw["task_name"].apply(get_task_category)

    # Convert score to percentage
    df_raw["accuracy"] = df_raw["score_headline_value"] * 100

    # Parameter sizes and URLs
    model_info = {
        "hf-inference-providers/Qwen/Qwen3-VL-8B-Instruct": {
            "params": 8,
            "url": "https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct",
        },
        "hf-inference-providers/Qwen/Qwen3-VL-30B-A3B-Thinking": {
            "params": 30,
            "url": "https://huggingface.co/Qwen/Qwen3-VL-30B-A3B",
        },
        "hf-inference-providers/zai-org/GLM-4.6V-Flash": {
            "params": 9,
            "url": "https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking",
        },
        "hf-inference-providers/openai/gpt-oss-20b": {
            "params": 20,
            "url": "https://huggingface.co/openai/gpt-oss-20b",
        },
        "hf-inference-providers/Qwen/Qwen3-4B-Instruct-2507": {
            "params": 4,
            "url": "https://huggingface.co/Qwen/Qwen3-4B",
        },
        "hf-inference-providers/allenai/Olmo-3-7B-Instruct": {
            "params": 7,
            "url": "https://huggingface.co/allenai/OLMo-2-0325-32B-Instruct",
        },
    }

    df_raw["param_size_b"] = df_raw["model"].apply(lambda x: model_info.get(x, {}).get("params"))
    df_raw["model_url"] = df_raw["model"].apply(lambda x: model_info.get(x, {}).get("url", ""))

    df_raw
    return df_raw, get_task_category, model_info


@app.cell
def _(alt, df_raw, mo):
    def make_task_content(task_name):
        """Generate the complete results view for a task."""
        df = df_raw[df_raw["task_category"] == task_name].copy()

        # Calculate summary stats
        vlm_avg = df[df["approach"] == "VLM"]["accuracy"].mean()
        text_avg = df[df["approach"] == "Text"]["accuracy"].mean()
        diff = vlm_avg - text_avg

        task_desc = (
            "book titles"
            if task_name == "Title Extraction"
            else "full metadata (title, subtitle, publisher, year, ISBN)"
        )

        # Results summary
        results_md = mo.md(
            f"""
            ### Summary

            | Approach | Average Accuracy |
            |----------|------------------|
            | **VLM (Vision)** | **{vlm_avg:.0f}%** |
            | Text Extraction | {text_avg:.0f}% |

            **VLM advantage: +{diff:.0f} percentage points**

            VLMs {'significantly ' if diff > 15 else ''}outperform text extraction for {task_desc}.
            """
        )

        # Scatter plot
        chart = alt.Chart(df).mark_circle(size=200, opacity=0.8).encode(
            x=alt.X("param_size_b:Q", title="Parameters (Billions)", scale=alt.Scale(zero=False)),
            y=alt.Y("accuracy:Q", title="Accuracy (%)", scale=alt.Scale(domain=[50, 105])),
            color=alt.Color(
                "approach:N",
                title="Approach",
                scale=alt.Scale(domain=["VLM", "Text"], range=["#1f77b4", "#ff7f0e"]),
            ),
            tooltip=[
                alt.Tooltip("model_short:N", title="Model"),
                alt.Tooltip("approach:N", title="Approach"),
                alt.Tooltip("param_size_b:Q", title="Params (B)"),
                alt.Tooltip("accuracy:Q", title="Accuracy", format=".1f"),
            ],
        ).properties(
            width=500,
            height=300,
            title="Model Size vs Accuracy",
        ).configure_axis(
            labelFontSize=12,
            titleFontSize=14,
        )

        # Leaderboard
        leaderboard_md = (
            "### Model Leaderboard\n\n"
            "| Model | Approach | Params (B) | Accuracy (%) |\n"
            "|-------|----------|------------|--------------|\n"
        )
        for _, row in df.sort_values("accuracy", ascending=False).iterrows():
            model_link = (
                f"[{row['model_short']}]({row['model_url']})" if row["model_url"] else row["model_short"]
            )
            leaderboard_md += f"| {model_link} | {row['approach']} | {row['param_size_b']} | {row['accuracy']:.1f} |\n"

        return mo.vstack([
            results_md,
            mo.md("### Model Size vs Accuracy"),
            mo.as_html(chart),
            mo.md("*Hover over points to see model details*"),
            mo.md(leaderboard_md),
        ])

    # Create tabs
    tabs = mo.ui.tabs({
        "📄 Title Extraction": make_task_content("Title Extraction"),
        "📚 Full Metadata": make_task_content("Full Metadata"),
    })
    tabs
    return make_task_content, tabs


@app.cell
def _(mo):
    mo.md(
        """
        ---

        ## Why VLMs Win

        Book covers are **visually structured** documents:

        - **Spatial layout**: Titles appear in specific locations (usually top/center)
        - **Typography**: Larger text = more important (likely the title)
        - **Visual hierarchy**: Authors, publishers, and other info have distinct styling

        When you extract the text first (OCR), you **flatten this structure** into a linear sequence. The model loses the visual cues that make it obvious what is a title vs. a subtitle vs. an author name.

        **Interesting finding**: Qwen3-VL-8B achieves 94% even when used as a text-only model, suggesting it has strong general text understanding - but it still does better (98%) when given the actual images.
        """
    )
    return


@app.cell
def _(mo):
    mo.md(
        """
        ## The Dataset

        We use the [DOAB Metadata Extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction) dataset: academic book covers from the Directory of Open Access Books.

        Each sample has:

        - Cover image (rendered from PDF)
        - Pre-extracted page text
        - Ground truth metadata (title, subtitle, publisher, year, ISBN)
        """
    )
    return
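

@app.cell
def _(mo):
    mo.md(
        r"""
        The two approaches differ only in what each sample feeds to the model: the VLM variant sends the cover image itself, while the text variant sends the pre-extracted page text. The snippet below is a minimal sketch of how such a pair of [Inspect AI](https://inspect.aisi.org.uk/) tasks could be wired up - it is illustrative rather than the exact task code behind these results, and the `record[...]` field names and the prompt are assumptions about the dataset schema.

        ```python
        from inspect_ai import Task, task
        from inspect_ai.dataset import Sample
        from inspect_ai.model import ChatMessageUser, ContentImage, ContentText
        from inspect_ai.solver import generate

        PROMPT = "Extract the title of this book. Respond with the title only."

        def vlm_sample(record):
            # VLM variant: the model sees the cover image directly.
            return Sample(
                input=[
                    ChatMessageUser(content=[
                        ContentText(text=PROMPT),
                        ContentImage(image=record["cover_image"]),  # assumed field name
                    ])
                ],
                target=record["title"],  # assumed field name
            )

        def text_sample(record):
            # Text variant: the model only sees the pre-extracted page text (no image).
            return Sample(
                input=PROMPT + "\n\nCover text:\n" + record["page_text"],  # assumed field name
                target=record["title"],
            )

        @task
        def title_extraction_vlm(records):
            # scorer omitted here; see the matching sketch in the Methodology section below
            return Task(dataset=[vlm_sample(r) for r in records], solver=generate())

        @task
        def title_extraction_text(records):
            return Task(dataset=[text_sample(r) for r in records], solver=generate())
        ```
        """
    )
    return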


@app.cell
def _(mo):
    mo.md(
        """
        ## Methodology

        **Evaluation Framework**: [Inspect AI](https://inspect.aisi.org.uk/) - an open-source framework for evaluating language models

        **Sample Size**: 50 books (randomly sampled with a fixed seed for reproducibility)

        **Scoring Methods**:

        - *Title Extraction*: Custom flexible matching scorer
            - Case-insensitive comparison
            - Accepts if the ground truth is a substring of the prediction (handles subtitles)
            - More robust than exact match for this task
        - *Full Metadata*: LLM-as-judge with partial credit
            - Correct (1.0): Title + year + at least one other field
            - Partial (0.5): Some fields correct
            - Incorrect (0.0): Mostly wrong

        **Models via**: [HuggingFace Inference Providers](https://huggingface.co/docs/inference-providers)
        """
    )
    return
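

@app.cell
def _(mo):
    mo.md(
        """
        For reference, the flexible title matching described above can be expressed as a small Inspect AI scorer. This is a minimal sketch of the matching logic (lowercase both strings, accept substring matches), not necessarily the exact scorer used to produce these logs.

        ```python
        from inspect_ai.scorer import CORRECT, INCORRECT, Score, Target, accuracy, scorer
        from inspect_ai.solver import TaskState

        @scorer(metrics=[accuracy()])
        def flexible_title_match():
            async def score(state: TaskState, target: Target) -> Score:
                prediction = state.output.completion.strip().lower()
                truth = target.text.strip().lower()
                # Case-insensitive comparison; accept if the ground-truth title
                # appears anywhere in the prediction (handles appended subtitles).
                return Score(
                    value=CORRECT if truth in prediction else INCORRECT,
                    answer=state.output.completion,
                )
            return score
        ```
        """
    )
    return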


@app.cell
def _(mo):
    mo.md(
        """
        ---

        ## Replicate This

        The evaluation logs are stored on HuggingFace and can be loaded directly:

        ```python
        from inspect_ai.analysis import evals_df

        df = evals_df("hf://datasets/davanstrien/doab-title-extraction-evals")
        ```

        ---

        *Built with [Marimo](https://marimo.io) | Evaluation framework: [Inspect AI](https://inspect.aisi.org.uk/) | Dataset: [biglam/doab-metadata-extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction)*
        """
    )
    return


if __name__ == "__main__":
    app.run()