"""Dataset Card Drafter - MVP Space.

Watches davanstrien/* datasets and opens PRs with auto-generated descriptions.
"""

import asyncio
import json
import logging
import os
from datetime import datetime
from pathlib import Path

import gradio as gr
from huggingface_hub import (
    DatasetCard,
    WebhookPayload,
    WebhooksServer,
    get_repo_discussions,
)

from description_generator import ViewerNotReadyError, generate_description

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Space URL for attribution
SPACE_URL = "https://huggingface.co/spaces/librarian-bots/dataset-card-drafter"
BOT_NAME = "librarian-bot"
PR_TITLE = "Add dataset description"

# Configuration
WATCHED_PREFIXES = ["davanstrien/"]  # Repos to watch
MIN_DESCRIPTION_LENGTH = 100  # Chars below which we generate

# Persistence directory (/data is the Space's persistent storage when enabled;
# fall back to a local directory for development)
DATA_DIR = Path("/data") if Path("/data").exists() else Path("./local_data")
DATA_DIR.mkdir(exist_ok=True)
PROCESSED_FILE = DATA_DIR / "processed.json"
PENDING_FILE = DATA_DIR / "pending.json"
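
# Record shapes written by the save/load helpers below, for reference:
#   processed.json: {"<dataset_id>": {"pr_url", "timestamp", "status", "trigger", ...}}
#   pending.json:   {"<dataset_id>": {"added", "reason", "retries", ...}}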

# Retry configuration
RETRY_DELAYS = [60, 120, 300]  # Seconds to wait between retries (1min, 2min, 5min)

# Strong references to in-flight retry tasks; asyncio.create_task only keeps a
# weak reference, so an unreferenced task can be garbage-collected mid-run
_background_tasks: set = set()


def load_processed() -> dict:
    """Load processed datasets from persistence."""
    if PROCESSED_FILE.exists():
        return json.loads(PROCESSED_FILE.read_text())
    return {}


def save_processed(data: dict) -> None:
    """Save processed datasets to persistence."""
    PROCESSED_FILE.write_text(json.dumps(data, indent=2))


def load_pending() -> dict:
    """Load pending datasets from persistence."""
    if PENDING_FILE.exists():
        return json.loads(PENDING_FILE.read_text())
    return {}


def save_pending(data: dict) -> None:
    """Save pending datasets to persistence."""
    PENDING_FILE.write_text(json.dumps(data, indent=2))


def add_to_pending(dataset_id: str, reason: str) -> None:
    """Add a dataset to the pending queue."""
    pending = load_pending()
    pending[dataset_id] = {
        "added": datetime.now().isoformat(),
        "reason": reason,
        "retries": 0,
    }
    save_pending(pending)
    logger.info(f"Added {dataset_id} to pending queue: {reason}")


def remove_from_pending(dataset_id: str) -> None:
    """Remove a dataset from the pending queue."""
    pending = load_pending()
    if dataset_id in pending:
        del pending[dataset_id]
        save_pending(pending)
        logger.info(f"Removed {dataset_id} from pending queue")


def is_watched_repo(repo_name: str) -> bool:
    """Check if a repo is in our watched list."""
    return any(repo_name.startswith(prefix) for prefix in WATCHED_PREFIXES)


def should_generate(card: DatasetCard) -> bool:
    """Check if a dataset card needs a description."""
    if not card.text:
        return True
    return len(card.text.strip()) < MIN_DESCRIPTION_LENGTH


def has_existing_pr(dataset_id: str) -> bool:
    """Check if there's already an open PR from librarian-bot for this dataset."""
    try:
        discussions = list(get_repo_discussions(dataset_id, repo_type="dataset"))
        for discussion in discussions:
            if not discussion.is_pull_request:
                continue
            # Check if it's from librarian-bot and matches our PR title
            if discussion.author == BOT_NAME and discussion.title == PR_TITLE:
                # Check if still open
                if discussion.status == "open":
                    return True
        return False
    except Exception:
        # If the check itself fails, assume there is no open PR and proceed;
        # worst case a duplicate PR is opened and can be closed manually
        return False


def build_pr_description(description: str) -> str:
    """Build the PR description with attribution and the generated content."""
    return f"""## Auto-generated Dataset Description

This PR was automatically generated by the [Dataset Card Drafter]({SPACE_URL}) Space.

### Suggested Description

{description}

---

*If you find this description helpful, please merge the PR. If you'd like to edit it first, you can modify the README.md in this branch before merging.*

*Generated by [{BOT_NAME}]({SPACE_URL})*"""


async def process_dataset(dataset_id: str, inference_token: str, pr_token: str) -> dict:
    """Process a single dataset: check, generate, and open PR.

    Args:
        dataset_id: The dataset to process
        inference_token: Token for inference API calls (e.g., davanstrien's token)
        pr_token: Token for opening PRs (librarian-bot's token)

    Returns a status dict with results.
    """
    # Check for existing open PR first
    if has_existing_pr(dataset_id):
        remove_from_pending(dataset_id)  # Clean up if it was pending
        return {"status": "skipped", "reason": "open PR already exists"}

    # Load current card (or create new if none exists)
    try:
        card = DatasetCard.load(dataset_id)
    except Exception:
        # No README exists - create a new card
        card = DatasetCard("")
        logger.info(f"No existing card for {dataset_id}, creating new one")

    # Check if description needed
    if not should_generate(card):
        remove_from_pending(dataset_id)  # Clean up if it was pending
        return {"status": "skipped", "reason": "description exists"}

    # Generate description using inference token
    try:
        description = generate_description(dataset_id, inference_token)
    except ViewerNotReadyError as e:
        return {"status": "pending", "reason": str(e)}
    except Exception as e:
        return {"status": "error", "reason": f"generation failed: {e}"}

    if not description:
        return {"status": "error", "reason": "empty description generated"}

    # Update card and push as PR using librarian-bot token
    card.text = description

    try:
        commit_info = card.push_to_hub(
            repo_id=dataset_id,
            repo_type="dataset",
            commit_message=PR_TITLE,
            commit_description=build_pr_description(description),
            create_pr=True,
            token=pr_token,
        )
        pr_url = getattr(commit_info, "pr_url", str(commit_info))
    except Exception as e:
        return {"status": "error", "reason": f"PR creation failed: {e}"}

    # Success - remove from pending if it was there
    remove_from_pending(dataset_id)

    return {"status": "pr_created", "pr_url": pr_url, "description": description}


async def retry_pending_dataset(dataset_id: str) -> None:
    """Background task to retry a pending dataset after delays."""
    inference_token = os.getenv("HF_TOKEN")
    pr_token = os.getenv("LIBRARIAN_BOT_TOKEN")

    if not inference_token or not pr_token:
        logger.error("Missing tokens for retry")
        return

    for i, delay in enumerate(RETRY_DELAYS):
        logger.info(f"Waiting {delay}s before retry {i + 1} for {dataset_id}")
        await asyncio.sleep(delay)

        # Update retry count
        pending = load_pending()
        if dataset_id not in pending:
            logger.info(f"{dataset_id} no longer pending, stopping retries")
            return
        pending[dataset_id]["retries"] = i + 1
        save_pending(pending)

        # Try processing
        result = await process_dataset(dataset_id, inference_token, pr_token)

        if result["status"] == "pr_created":
            logger.info(f"Successfully processed {dataset_id} on retry {i + 1}")
            # Log to processed
            processed = load_processed()
            processed[dataset_id] = {
                "pr_url": result.get("pr_url"),
                "timestamp": datetime.now().isoformat(),
                "status": "pr_created",
                "trigger": "retry",
                "retry_attempt": i + 1,
            }
            save_processed(processed)
            return
        elif result["status"] != "pending":
            # Got a definitive answer (error or skipped), stop retrying
            logger.info(f"Stopping retries for {dataset_id}: {result}")
            remove_from_pending(dataset_id)
            return

    # Exhausted retries
    logger.warning(f"Exhausted retries for {dataset_id}")
    pending = load_pending()
    if dataset_id in pending:
        pending[dataset_id]["exhausted"] = True
        save_pending(pending)


# Gradio UI
with gr.Blocks(title="Dataset Card Drafter") as demo:
    gr.Markdown("# Dataset Card Drafter MVP")
    gr.Markdown(
        f"Watching datasets matching: `{'`, `'.join(WATCHED_PREFIXES)}`\n\n"
        f"Triggers when description < {MIN_DESCRIPTION_LENGTH} characters."
    )

    with gr.Tab("Status"):
        status_display = gr.JSON(label="Processed Datasets", value=load_processed)
        refresh_btn = gr.Button("Refresh")
        refresh_btn.click(fn=load_processed, outputs=status_display)

    with gr.Tab("Pending"):
        gr.Markdown(
            "Datasets waiting for the viewer to be ready.\n\n"
            "Background retries happen at 1min, 2min, 5min intervals."
        )
        pending_display = gr.JSON(label="Pending Datasets", value=load_pending)
        pending_refresh_btn = gr.Button("Refresh")
        pending_refresh_btn.click(fn=load_pending, outputs=pending_display)

        # Manual retry button
        retry_input = gr.Textbox(
            label="Dataset ID to retry",
            placeholder="davanstrien/dataset-name",
        )
        retry_btn = gr.Button("Retry Now")
        retry_output = gr.JSON(label="Result")

        async def manual_retry(dataset_id: str):
            if not dataset_id:
                return {"status": "error", "reason": "no dataset ID provided"}

            inference_token = os.getenv("HF_TOKEN")
            pr_token = os.getenv("LIBRARIAN_BOT_TOKEN")

            if not inference_token or not pr_token:
                return {"status": "error", "reason": "tokens not configured"}

            result = await process_dataset(dataset_id, inference_token, pr_token)

            if result.get("status") == "pr_created":
                processed = load_processed()
                processed[dataset_id] = {
                    "pr_url": result.get("pr_url"),
                    "timestamp": datetime.now().isoformat(),
                    "status": "pr_created",
                    "trigger": "manual_retry",
                }
                save_processed(processed)

            return result

        retry_btn.click(
            fn=manual_retry,
            inputs=retry_input,
            outputs=retry_output,
        )

    with gr.Tab("Manual Test"):
        gr.Markdown(
            "Test description generation without opening a PR.\n\n"
            "**Note:** This requires `HF_TOKEN` to be set."
        )
        test_input = gr.Textbox(
            label="Dataset ID",
            placeholder="davanstrien/test-dataset",
        )
        test_btn = gr.Button("Generate Description (Preview)")
        test_output = gr.Textbox(label="Generated Description", lines=5)
        test_status = gr.JSON(label="Status")

        def test_generate(dataset_id: str):
            if not dataset_id:
                return "", {"status": "error", "reason": "no dataset ID provided"}

            hf_token = os.getenv("HF_TOKEN")
            if not hf_token:
                return "", {"status": "error", "reason": "HF_TOKEN not set"}

            try:
                description = generate_description(dataset_id, hf_token)
                return description, {"status": "success", "length": len(description)}
            except Exception as e:
                return "", {"status": "error", "reason": str(e)}

        test_btn.click(
            fn=test_generate,
            inputs=test_input,
            outputs=[test_output, test_status],
        )

    with gr.Tab("Trigger PR"):
        gr.Markdown(
            "Manually trigger description generation and PR creation.\n\n"
            "**Warning:** This will open a real PR!\n\n"
            "Requires `HF_TOKEN` (for inference) and `LIBRARIAN_BOT_TOKEN` (for PRs)."
        )
        trigger_input = gr.Textbox(
            label="Dataset ID",
            placeholder="davanstrien/test-dataset",
        )
        trigger_btn = gr.Button("Generate & Open PR", variant="primary")
        trigger_output = gr.JSON(label="Result")

        async def trigger_pr(dataset_id: str):
            if not dataset_id:
                return {"status": "error", "reason": "no dataset ID provided"}

            inference_token = os.getenv("HF_TOKEN")
            pr_token = os.getenv("LIBRARIAN_BOT_TOKEN")

            if not inference_token:
                return {"status": "error", "reason": "HF_TOKEN not set"}
            if not pr_token:
                return {"status": "error", "reason": "LIBRARIAN_BOT_TOKEN not set"}

            result = await process_dataset(dataset_id, inference_token, pr_token)

            # Save to processed log
            if result.get("status") == "pr_created":
                processed = load_processed()
                processed[dataset_id] = {
                    "pr_url": result.get("pr_url"),
                    "timestamp": datetime.now().isoformat(),
                    "status": "pr_created",
                    "trigger": "manual",
                }
                save_processed(processed)

            return result

        trigger_btn.click(
            fn=trigger_pr,
            inputs=trigger_input,
            outputs=trigger_output,
        )


# WebhooksServer with automatic secret verification
app = WebhooksServer(ui=demo, webhook_secret=os.getenv("WEBHOOK_SECRET"))
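# Handlers registered via add_webhook are served under the /webhooks/ prefix,
# so the endpoint below is reachable at <space-url>/webhooks/dataset_update.
# Register a Hub webhook for the watched repos using the same secret.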


@app.add_webhook("/dataset_update")
async def handle_dataset_webhook(payload: WebhookPayload) -> dict:
    """Handle dataset creation/update webhooks."""
    # Filter for datasets only
    if payload.repo.type != "dataset":
        return {"status": "skipped", "reason": "not a dataset"}

    # Filter for watched repos
    if not is_watched_repo(payload.repo.name):
        return {"status": "skipped", "reason": "not in watched list"}

    dataset_id = payload.repo.name

    # Get tokens
    inference_token = os.getenv("HF_TOKEN")
    pr_token = os.getenv("LIBRARIAN_BOT_TOKEN")

    if not inference_token:
        return {"status": "error", "reason": "HF_TOKEN not configured"}
    if not pr_token:
        return {"status": "error", "reason": "LIBRARIAN_BOT_TOKEN not configured"}

    # Process the dataset
    result = await process_dataset(dataset_id, inference_token, pr_token)

    # Handle pending status - queue for retry
    if result.get("status") == "pending":
        add_to_pending(dataset_id, result.get("reason", "viewer not ready"))
        # Spawn background retry task (non-blocking), keeping a reference so
        # it isn't garbage-collected before it finishes
        task = asyncio.create_task(retry_pending_dataset(dataset_id))
        _background_tasks.add(task)
        task.add_done_callback(_background_tasks.discard)
        logger.info(f"Queued {dataset_id} for background retry")
        return result

    # Save to processed log
    processed = load_processed()
    processed[dataset_id] = {
        "pr_url": result.get("pr_url"),
        "timestamp": datetime.now().isoformat(),
        "status": result.get("status"),
        "reason": result.get("reason"),
        "trigger": "webhook",
        "event": payload.event.action if payload.event else None,
    }
    save_processed(processed)

    return result


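# Required Space secrets: HF_TOKEN (inference), LIBRARIAN_BOT_TOKEN (PR
# creation), and WEBHOOK_SECRET (shared secret checked on incoming webhooks).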
if __name__ == "__main__":
    app.launch(ssr_mode=False)