Commit 763c57d
Parent(s): afeeac0

Add pending queue for datasets where viewer isn't ready
- ViewerNotReadyError raised when dataset preview unavailable
- Pending queue (pending.json) tracks datasets waiting for viewer
- Background retry task with delays: 1min, 2min, 5min
- New 'Pending' tab in UI to view queue and manual retry
- Webhook spawns async retry task when status is 'pending'
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <[email protected]>
- .beads/issues.jsonl +1 -0
- app.py +157 -1
- description_generator.py +20 -1
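
Taken together, the flow is: the webhook fires, `process_dataset` returns a `"pending"` status when the viewer isn't ready, the dataset is queued, and a background task retries with increasing delays. A minimal runnable sketch of that loop (illustrative only: the delays are scaled down from the real `RETRY_DELAYS = [60, 120, 300]`, and `process` is a hypothetical stand-in for `process_dataset`):

```python
# Minimal sketch of the pending/retry flow described above. Delays are
# shortened so the example finishes quickly; the commit uses
# RETRY_DELAYS = [60, 120, 300]. `process` is a hypothetical stand-in.
import asyncio

RETRY_DELAYS = [0.1, 0.2, 0.5]

async def process(dataset_id: str, attempt: int) -> str:
    # Pretend the viewer becomes ready on the final attempt.
    return "pr_created" if attempt == len(RETRY_DELAYS) else "pending"

async def retry_pending(dataset_id: str) -> None:
    for i, delay in enumerate(RETRY_DELAYS):
        await asyncio.sleep(delay)                 # back off before each try
        status = await process(dataset_id, i + 1)
        if status != "pending":                    # success or definitive result
            print(f"{dataset_id}: {status} on retry {i + 1}")
            return
    print(f"{dataset_id}: retries exhausted")

asyncio.run(retry_pending("davanstrien/example"))  # hypothetical dataset ID
```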
.beads/issues.jsonl
CHANGED
@@ -1,2 +1,3 @@
+{"id":"dataset-card-drafter-a69","title":"Add pending queue for datasets with viewer not ready","description":"","status":"in_progress","priority":1,"issue_type":"feature","created_at":"2025-12-15T18:00:44.18695Z","updated_at":"2025-12-15T18:00:50.025261Z"}
 {"id":"dataset-card-drafter-ebu","title":"Add PR deduplication logic","description":"Multiple PRs being opened for same dataset. Need to check for existing open PRs before creating new ones.","status":"closed","priority":1,"issue_type":"bug","created_at":"2025-12-15T17:43:02.474669Z","updated_at":"2025-12-15T17:48:03.770007Z","closed_at":"2025-12-15T17:48:03.770007Z","close_reason":"Added has_existing_pr() check using get_repo_discussions + improved PR description"}
 {"id":"dataset-card-drafter-wbd","title":"MVP implementation: WebhooksServer + DatasetCard + InferenceClient","description":"","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-15T17:24:36.365733Z","updated_at":"2025-12-15T17:28:21.127763Z","closed_at":"2025-12-15T17:28:21.127763Z","close_reason":"MVP implemented with WebhooksServer, DatasetCard, and InferenceClient"}
app.py
CHANGED
@@ -3,7 +3,9 @@
 Watches davanstrien/* datasets and opens PRs with auto-generated descriptions.
 """
 
+import asyncio
 import json
+import logging
 import os
 from datetime import datetime
 from pathlib import Path
@@ -16,7 +18,11 @@ from huggingface_hub import (
     get_repo_discussions,
 )
 
-from description_generator import generate_description
+from description_generator import ViewerNotReadyError, generate_description
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 # Space URL for attribution
 SPACE_URL = "https://huggingface.co/spaces/librarian-bots/dataset-card-drafter"
@@ -31,6 +37,10 @@ MIN_DESCRIPTION_LENGTH = 100 # Chars below which we generate
 DATA_DIR = Path("/data") if Path("/data").exists() else Path("./local_data")
 DATA_DIR.mkdir(exist_ok=True)
 PROCESSED_FILE = DATA_DIR / "processed.json"
+PENDING_FILE = DATA_DIR / "pending.json"
+
+# Retry configuration
+RETRY_DELAYS = [60, 120, 300]  # Seconds to wait between retries (1min, 2min, 5min)
 
 
 def load_processed() -> dict:
@@ -45,6 +55,39 @@ def save_processed(data: dict) -> None:
     PROCESSED_FILE.write_text(json.dumps(data, indent=2))
 
 
+def load_pending() -> dict:
+    """Load pending datasets from persistence."""
+    if PENDING_FILE.exists():
+        return json.loads(PENDING_FILE.read_text())
+    return {}
+
+
+def save_pending(data: dict) -> None:
+    """Save pending datasets to persistence."""
+    PENDING_FILE.write_text(json.dumps(data, indent=2))
+
+
+def add_to_pending(dataset_id: str, reason: str) -> None:
+    """Add a dataset to the pending queue."""
+    pending = load_pending()
+    pending[dataset_id] = {
+        "added": datetime.now().isoformat(),
+        "reason": reason,
+        "retries": 0,
+    }
+    save_pending(pending)
+    logger.info(f"Added {dataset_id} to pending queue: {reason}")
+
+
+def remove_from_pending(dataset_id: str) -> None:
+    """Remove a dataset from the pending queue."""
+    pending = load_pending()
+    if dataset_id in pending:
+        del pending[dataset_id]
+        save_pending(pending)
+        logger.info(f"Removed {dataset_id} from pending queue")
+
+
 def is_watched_repo(repo_name: str) -> bool:
     """Check if a repo is in our watched list."""
     return any(repo_name.startswith(prefix) for prefix in WATCHED_PREFIXES)
@@ -104,6 +147,7 @@ async def process_dataset(dataset_id: str, inference_token: str, pr_token: str)
     """
     # Check for existing open PR first
     if has_existing_pr(dataset_id):
+        remove_from_pending(dataset_id)  # Clean up if it was pending
         return {"status": "skipped", "reason": "open PR already exists"}
 
     # Load current card
@@ -114,11 +158,14 @@
 
     # Check if description needed
     if not should_generate(card):
+        remove_from_pending(dataset_id)  # Clean up if it was pending
        return {"status": "skipped", "reason": "description exists"}
 
     # Generate description using inference token
     try:
         description = generate_description(dataset_id, inference_token)
+    except ViewerNotReadyError as e:
+        return {"status": "pending", "reason": str(e)}
     except Exception as e:
         return {"status": "error", "reason": f"generation failed: {e}"}
 
@@ -141,9 +188,63 @@
     except Exception as e:
         return {"status": "error", "reason": f"PR creation failed: {e}"}
 
+    # Success - remove from pending if it was there
+    remove_from_pending(dataset_id)
+
     return {"status": "pr_created", "pr_url": pr_url, "description": description}
 
 
+async def retry_pending_dataset(dataset_id: str) -> None:
+    """Background task to retry a pending dataset after delays."""
+    inference_token = os.getenv("HF_TOKEN")
+    pr_token = os.getenv("LIBRARIAN_BOT_TOKEN")
+
+    if not inference_token or not pr_token:
+        logger.error("Missing tokens for retry")
+        return
+
+    for i, delay in enumerate(RETRY_DELAYS):
+        logger.info(f"Waiting {delay}s before retry {i + 1} for {dataset_id}")
+        await asyncio.sleep(delay)
+
+        # Update retry count
+        pending = load_pending()
+        if dataset_id not in pending:
+            logger.info(f"{dataset_id} no longer pending, stopping retries")
+            return
+        pending[dataset_id]["retries"] = i + 1
+        save_pending(pending)
+
+        # Try processing
+        result = await process_dataset(dataset_id, inference_token, pr_token)
+
+        if result["status"] == "pr_created":
+            logger.info(f"Successfully processed {dataset_id} on retry {i + 1}")
+            # Log to processed
+            processed = load_processed()
+            processed[dataset_id] = {
+                "pr_url": result.get("pr_url"),
+                "timestamp": datetime.now().isoformat(),
+                "status": "pr_created",
+                "trigger": "retry",
+                "retry_attempt": i + 1,
+            }
+            save_processed(processed)
+            return
+        elif result["status"] != "pending":
+            # Got a definitive answer (error or skipped), stop retrying
+            logger.info(f"Stopping retries for {dataset_id}: {result}")
+            remove_from_pending(dataset_id)
+            return
+
+    # Exhausted retries
+    logger.warning(f"Exhausted retries for {dataset_id}")
+    pending = load_pending()
+    if dataset_id in pending:
+        pending[dataset_id]["exhausted"] = True
+        save_pending(pending)
+
+
 # Gradio UI
 with gr.Blocks(title="Dataset Card Drafter") as demo:
     gr.Markdown("# Dataset Card Drafter MVP")
@@ -157,6 +258,53 @@ with gr.Blocks(title="Dataset Card Drafter") as demo:
         refresh_btn = gr.Button("Refresh")
         refresh_btn.click(fn=load_processed, outputs=status_display)
 
+    with gr.Tab("Pending"):
+        gr.Markdown(
+            "Datasets waiting for the viewer to be ready.\n\n"
+            "Background retries happen at 1min, 2min, 5min intervals."
+        )
+        pending_display = gr.JSON(label="Pending Datasets", value=load_pending)
+        pending_refresh_btn = gr.Button("Refresh")
+        pending_refresh_btn.click(fn=load_pending, outputs=pending_display)
+
+        # Manual retry button
+        retry_input = gr.Textbox(
+            label="Dataset ID to retry",
+            placeholder="davanstrien/dataset-name",
+        )
+        retry_btn = gr.Button("Retry Now")
+        retry_output = gr.JSON(label="Result")
+
+        async def manual_retry(dataset_id: str):
+            if not dataset_id:
+                return {"status": "error", "reason": "no dataset ID provided"}
+
+            inference_token = os.getenv("HF_TOKEN")
+            pr_token = os.getenv("LIBRARIAN_BOT_TOKEN")
+
+            if not inference_token or not pr_token:
+                return {"status": "error", "reason": "tokens not configured"}
+
+            result = await process_dataset(dataset_id, inference_token, pr_token)
+
+            if result.get("status") == "pr_created":
+                processed = load_processed()
+                processed[dataset_id] = {
+                    "pr_url": result.get("pr_url"),
+                    "timestamp": datetime.now().isoformat(),
+                    "status": "pr_created",
+                    "trigger": "manual_retry",
+                }
+                save_processed(processed)
+
+            return result
+
+        retry_btn.click(
+            fn=manual_retry,
+            inputs=retry_input,
+            outputs=retry_output,
+        )
+
     with gr.Tab("Manual Test"):
         gr.Markdown(
             "Test description generation without opening a PR.\n\n"
@@ -266,6 +414,14 @@ async def handle_dataset_webhook(payload: WebhookPayload) -> dict:
     # Process the dataset
     result = await process_dataset(dataset_id, inference_token, pr_token)
 
+    # Handle pending status - queue for retry
+    if result.get("status") == "pending":
+        add_to_pending(dataset_id, result.get("reason", "viewer not ready"))
+        # Spawn background retry task (non-blocking)
+        asyncio.create_task(retry_pending_dataset(dataset_id))
+        logger.info(f"Queued {dataset_id} for background retry")
+        return result
+
     # Save to processed log
     processed = load_processed()
     processed[dataset_id] = {
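
The queue itself is just a JSON file keyed by dataset ID. Here is a self-contained condensation of the round-trip from the diff above, pointed at a temporary directory instead of the Space's `/data` volume (`davanstrien/example` is a made-up ID):

```python
# Self-contained sketch of the pending.json round-trip from the diff above,
# written against a temporary directory rather than the Space's /data volume.
import json
from datetime import datetime
from pathlib import Path
from tempfile import mkdtemp

PENDING_FILE = Path(mkdtemp()) / "pending.json"

def load_pending() -> dict:
    """Load pending datasets, or an empty dict if nothing is queued yet."""
    return json.loads(PENDING_FILE.read_text()) if PENDING_FILE.exists() else {}

def add_to_pending(dataset_id: str, reason: str) -> None:
    """Record a dataset as waiting, with a timestamp and a retry counter."""
    pending = load_pending()
    pending[dataset_id] = {
        "added": datetime.now().isoformat(),
        "reason": reason,
        "retries": 0,
    }
    PENDING_FILE.write_text(json.dumps(pending, indent=2))

add_to_pending("davanstrien/example", "viewer not ready")  # hypothetical ID
print(load_pending())
# {'davanstrien/example': {'added': '...', 'reason': 'viewer not ready', 'retries': 0}}
```

One design note: the webhook handler calls `asyncio.create_task(...)` without storing the returned task; the asyncio documentation recommends keeping a reference so an in-flight task isn't garbage-collected before it finishes.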
description_generator.py
CHANGED
@@ -9,8 +9,18 @@ from huggingface_hub import InferenceClient
 DEFAULT_MODEL = "zai-org/GLM-4.6V:zai-org"
 
 
+class ViewerNotReadyError(Exception):
+    """Raised when the Datasets Viewer hasn't processed a dataset yet."""
+
+    pass
+
+
 def gather_dataset_info(dataset: str, hf_token: str | None = None) -> dict:
-    """Gather all dataset information upfront from Datasets Viewer API."""
+    """Gather all dataset information upfront from Datasets Viewer API.
+
+    Raises:
+        ViewerNotReadyError: If the dataset preview is not available yet.
+    """
     client = DatasetsServerClient(token=hf_token)
 
     info = {"dataset": dataset}
@@ -25,6 +35,15 @@ def gather_dataset_info(dataset: str, hf_token: str | None = None) -> dict:
             "filter": validity.filter,
             "statistics": validity.statistics,
         }
+
+        # Check if preview is ready - we need it to get sample rows
+        if not validity.preview:
+            raise ViewerNotReadyError(
+                f"Dataset viewer not ready for '{dataset}'. "
+                "The dataset may be new or still processing."
+            )
+    except ViewerNotReadyError:
+        raise  # Re-raise our custom exception
 except Exception as e:
     info["validity_error"] = str(e)
     return info  # Can't continue without validity