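"""Pelican-on-a-bicycle SVG benchmark.

Asks each model behind the Hugging Face Inference Router to draw a pelican
riding a bicycle as SVG, at several temperatures, extracts the SVG from each
response, and writes benchmark_results.json and benchmark_stats.json.
"""
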
import os
import json
import time
import re
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

# Hugging Face API token; requests will fail with HTTP 401 if it is unset
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Load the model list
with open("models.json", "r") as f:
    models_data = json.load(f)

# Extract model IDs
model_ids = [model["id"] for model in models_data["data"]]

# Limit to the first 20 models to keep the run small
model_ids = model_ids[:20]
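# Note: models.json is assumed to have the shape of the router's /v1/models
# listing, i.e. {"data": [{"id": "<model-id>", ...}, ...]}.
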

def extract_svg(text):
    """Extract SVG content from model response"""
    # First, check for code blocks with different markers
    code_block_patterns = [
        r"```svg\s*(.*?)\s*```",
        r"```xml\s*(.*?)\s*```",
        r"```html\s*(.*?)\s*```",
        r"```\s*(.*?)\s*```",
    ]
    for pattern in code_block_patterns:
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if match:
            content = match.group(1)
            # Extract SVG from the code block content
            if "<svg" in content:
                svg_match = re.search(
                    r"<svg[^>]*>.*?</svg>", content, re.DOTALL | re.IGNORECASE
                )
                if svg_match:
                    return svg_match.group(0)

    # If no code blocks, look for SVG directly in the text
    # Handle cases where SVG might be in thinking tags or other wrappers
    svg_pattern = r"<svg[^>]*>.*?</svg>"
    svg_match = re.search(svg_pattern, text, re.DOTALL | re.IGNORECASE)
    if svg_match:
        return svg_match.group(0)

    return None
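
# Example behavior (hypothetical response text, for illustration only):
#   extract_svg("```svg\n<svg width='10'><rect/></svg>\n```")
#     -> "<svg width='10'><rect/></svg>"
#   extract_svg("no SVG here") -> None
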

def test_model_with_temperature(model_id, temperature):
    """Test a single model with a specific temperature"""
    print(f"Testing {model_id} with temperature {temperature}...")

    result = {
        "model_id": model_id,
        "temperature": temperature,
        "timestamp": datetime.now().isoformat(),
        "success": False,
        "response_time": None,
        "svg_content": None,
        "error": None,
        "raw_response": None,
    }

    prompt = """Create a pelican riding a bicycle using SVG. Return only the SVG code without any explanation or markdown formatting. The SVG should be a complete, valid SVG document starting with <svg> and ending with </svg>."""

    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json",
    }

    data = {
        "model": model_id,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 2000,
        "temperature": temperature,
    }
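
    # The payload above follows the OpenAI-compatible chat completions schema
    # that the Hugging Face router exposes at /v1/chat/completions.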
    try:
        start_time = time.time()
        response = requests.post(
            "https://router.huggingface.co/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=60,
        )
        response_time = time.time() - start_time
        result["response_time"] = response_time

        if response.status_code == 200:
            response_data = response.json()
            if response_data.get("choices") and response_data["choices"][0].get(
                "message"
            ):
                response_text = response_data["choices"][0]["message"]["content"]
                result["raw_response"] = response_text

                # Extract SVG
                svg_content = extract_svg(response_text)
                if svg_content:
                    result["svg_content"] = svg_content
                    result["success"] = True
                else:
                    result["error"] = "No valid SVG found in response"
            else:
                result["error"] = "Empty response from model"
        else:
            result["error"] = f"HTTP {response.status_code}: {response.text}"
    except Exception as e:
        result["error"] = str(e)
        print(f"Error testing {model_id} with temperature {temperature}: {e}")

    return result
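
# Note: test_model_with_temperature only mutates its own local `result` dict,
# so calls can safely run concurrently on the thread pool in main().
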

def main():
    temperatures = [0, 0.5, 1.0]
    print(f"Testing {len(model_ids)} models with {len(temperatures)} temperature settings...")

    results = []

    # Create test tasks for each model and temperature combination
    test_tasks = []
    for model_id in model_ids:
        for temp in temperatures:
            test_tasks.append((model_id, temp))

    # Use ThreadPoolExecutor for concurrent requests
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_task = {
            executor.submit(test_model_with_temperature, model_id, temp): (model_id, temp)
            for model_id, temp in test_tasks
        }
        for future in as_completed(future_to_task):
            model_id, temp = future_to_task[future]
            try:
                result = future.result()
                results.append(result)
                print(
                    f"Completed {model_id} (temp={temp}): {'Success' if result['success'] else 'Failed'}"
                )
            except Exception as e:
                print(f"Exception for {model_id} (temp={temp}): {e}")
                results.append({
                    "model_id": model_id,
                    "temperature": temp,
                    "success": False,
                    "error": str(e),
                })

    # Save results
    with open("benchmark_results.json", "w") as f:
        json.dump(results, f, indent=2)

    # Generate statistics
    total_tests = len(results)
    successful_tests = sum(1 for r in results if r.get("success", False))

    # Group by model to count unique models with at least one success
    models_with_success = {}
    for r in results:
        if r.get("success", False):
            models_with_success[r["model_id"]] = True

    stats = {
        "total_models": len(model_ids),
        "temperatures_tested": temperatures,
        "total_tests": total_tests,
        "successful_tests": successful_tests,
        "failed_tests": total_tests - successful_tests,
        "models_with_at_least_one_success": len(models_with_success),
        "average_response_time": (
            sum(r.get("response_time", 0) for r in results if r.get("response_time"))
            / len([r for r in results if r.get("response_time")])
            if any(r.get("response_time") for r in results)
            else 0
        ),
        "successful_model_ids": list(models_with_success.keys()),
    }

    with open("benchmark_stats.json", "w") as f:
        json.dump(stats, f, indent=2)
| print("\nBenchmark complete!") | |
| print(f"Total models tested: {stats['total_models']}") | |
| print(f"Temperature settings: {stats['temperatures_tested']}") | |
| print(f"Total tests: {stats['total_tests']}") | |
| print(f"Successful tests: {stats['successful_tests']}") | |
| print(f"Failed tests: {stats['failed_tests']}") | |
| print(f"Models with at least one success: {stats['models_with_at_least_one_success']}") | |
| print(f"Average response time: {stats['average_response_time']:.2f}s") | |
| if __name__ == "__main__": | |
| main() | |
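
# Example invocation (the script filename here is illustrative):
#   HF_TOKEN=hf_... python benchmark.py
# Expects models.json next to the script; writes benchmark_results.json and
# benchmark_stats.json to the working directory.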