# ai-deadlines/.github/scripts/update_conferences_new.py
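"""Update conference data from the ccfddl/ccf-deadlines repository.

Fetches the AI conference YAML files via the GitHub API, converts them to this
project's schema, merges them into the per-conference files under
src/data/conferences/, and regenerates src/utils/conferenceLoader.ts.
"""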
import yaml
import requests
import os
import re
from datetime import datetime
from typing import Dict, List, Any
def fetch_conference_files() -> List[Dict[str, Any]]:
"""Fetch all conference YAML files from ccfddl repository."""
# First get the directory listing from GitHub API
api_url = "https://api.github.com/repos/ccfddl/ccf-deadlines/contents/conference/AI"
    response = requests.get(api_url, timeout=30)
    # Surface HTTP errors (e.g. rate limiting) instead of parsing an error body
    response.raise_for_status()
    files = response.json()
conferences = []
for file in files:
if file["name"].endswith(".yml"):
            yaml_content = requests.get(file["download_url"], timeout=30).text
conf_data = yaml.safe_load(yaml_content)
# The data is a list with a single item
if isinstance(conf_data, list) and len(conf_data) > 0:
conferences.append(conf_data[0])
return conferences
def parse_date_range(date_str: str, year: str) -> tuple[str, str]:
"""Parse various date formats and return start and end dates."""
# Remove the year if it appears at the end of the string
date_str = date_str.replace(f", {year}", "")
# Handle various date formats
try:
# Split into start and end dates
if " - " in date_str:
start, end = date_str.split(" - ")
elif "-" in date_str:
start, end = date_str.split("-")
else:
# For single date format like "May 19, 2025"
start = end = date_str
# Clean up month abbreviations
month_map = {
"Sept": "September", # Handle Sept before Sep
"Jan": "January",
"Feb": "February",
"Mar": "March",
"Apr": "April",
"Jun": "June",
"Jul": "July",
"Aug": "August",
"Sep": "September",
"Oct": "October",
"Nov": "November",
"Dec": "December",
}
        # Create a set of all month names (full and abbreviated). "May" has no
        # abbreviation, so add it explicitly; otherwise ranges such as
        # "April 29-May 4" would be treated as if the end had no month.
        all_months = set(month_map.keys()) | set(month_map.values()) | {"May"}
        # Handle cases like "May 19-23" where the end is just a day number
        has_month = any(month in end for month in all_months)
        if not has_month:
            # End is just a day number, use start's month
            start_parts = start.split()
            if len(start_parts) >= 1:
                end = f"{start_parts[0]} {end.strip()}"
        # Expand month abbreviations to full names, matching on word boundaries
        # so that e.g. "Sep" is not replaced inside "September" or "Sept"
        # (a plain str.replace would produce "Septembertember").
        for abbr, full in month_map.items():
            start = re.sub(rf"\b{abbr}\b", full, start)
            end = re.sub(rf"\b{abbr}\b", full, end)
# Clean up any extra spaces
start = " ".join(start.split())
end = " ".join(end.split())
# Parse start date
start_date = datetime.strptime(f"{start}, {year}", "%B %d, %Y")
# Parse end date
end_date = datetime.strptime(f"{end}, {year}", "%B %d, %Y")
return start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")
except Exception as e:
raise ValueError(f"Could not parse date: {date_str} ({e})")
def transform_conference_data(
conferences: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Transform ccfddl format to our format."""
transformed = []
current_year = datetime.now().year
for conf in conferences:
# Get the most recent or upcoming conference instance
recent_conf = None
if "confs" in conf:
for instance in conf["confs"]:
if instance["year"] >= current_year:
recent_conf = instance
break
if not recent_conf:
continue
# Transform to our format
transformed_conf = {
"title": conf.get("title", ""),
"year": recent_conf["year"],
"id": recent_conf["id"],
"full_name": conf.get("description", ""),
"link": recent_conf.get("link", ""),
"deadline": recent_conf.get("timeline", [{}])[0].get("deadline", ""),
"timezone": recent_conf.get("timezone", ""),
"date": recent_conf.get("date", ""),
"tags": [], # We'll need to maintain a mapping for tags
}
# Handle city and country fields instead of place
place = recent_conf.get("place", "")
if place:
# Try to parse the place into city and country if it contains a comma
if "," in place:
city, country = place.split(",", 1)
transformed_conf["city"] = city.strip()
transformed_conf["country"] = country.strip()
else:
# If we can't parse, just set the country
transformed_conf["country"] = place.strip()
# Add optional fields
        # "or [{}]" guards against an explicitly empty timeline list
        timeline = (recent_conf.get("timeline") or [{}])[0]
if "abstract_deadline" in timeline:
transformed_conf["abstract_deadline"] = timeline["abstract_deadline"]
# Parse date range for start/end
try:
if transformed_conf["date"]:
start_date, end_date = parse_date_range(
transformed_conf["date"], str(transformed_conf["year"])
)
transformed_conf["start"] = start_date
transformed_conf["end"] = end_date
except Exception as e:
print(f"Warning: Could not parse date for {transformed_conf['title']}: {e}")
# Add rankings as separate field
if "rank" in conf:
rankings = []
for rank_type, rank_value in conf["rank"].items():
rankings.append(f"{rank_type.upper()}: {rank_value}")
if rankings:
transformed_conf["rankings"] = ", ".join(rankings)
transformed.append(transformed_conf)
return transformed
def load_all_current_conferences() -> Dict[str, List[Dict[str, Any]]]:
"""Load all current conferences from individual files."""
conferences_dir = "src/data/conferences"
conference_groups = {}
if not os.path.exists(conferences_dir):
return {}
for filename in os.listdir(conferences_dir):
if filename.endswith(".yml"):
filepath = os.path.join(conferences_dir, filename)
with open(filepath, "r") as f:
conferences = yaml.safe_load(f)
if conferences:
# Extract conference title from the first entry
title = conferences[0]["title"]
conference_groups[title] = conferences
return conference_groups
def create_filename_from_title(title: str) -> str:
"""Create a filename-safe version of the conference title."""
filename = re.sub(r"[^a-zA-Z0-9\s&()-]", "", title.lower())
filename = re.sub(r"\s+", "_", filename)
filename = filename.replace("&", "and")
filename = filename.strip("_")
return filename
def update_conference_loader():
"""Update the conference loader file with all current conferences."""
conferences_dir = "src/data/conferences"
loader_path = "src/utils/conferenceLoader.ts"
# Get all conference files
conference_files = []
if os.path.exists(conferences_dir):
for filename in sorted(os.listdir(conferences_dir)):
if filename.endswith(".yml"):
conference_files.append(filename)
# Generate import statements
imports = []
variable_names = []
for filename in conference_files:
# Create variable name from filename
var_name = filename.replace(".yml", "").replace("-", "_") + "Data"
variable_names.append(var_name)
imports.append(f"import {var_name} from '@/data/conferences/{filename}';")
# Generate the loader file content
loader_content = f"""import {{ Conference }} from '@/types/conference';
// Import all conference YAML files
{chr(10).join(imports)}
// Combine all conference data into a single array
const allConferencesData: Conference[] = [
{chr(10).join(f" ...{var_name}," for var_name in variable_names)}
];
export default allConferencesData;"""
# Write the loader file
with open(loader_path, "w") as f:
f.write(loader_content)
print(f"Updated conference loader with {len(conference_files)} conference files")
def main():
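    """Fetch ccfddl conference data and sync it into the local YAML files and the TS loader."""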
try:
# Load current conferences from individual files
current_conference_groups = load_all_current_conferences()
# Fetch and transform new data
new_conferences = fetch_conference_files()
if not new_conferences:
print("Warning: No conferences fetched from ccfddl")
return
transformed_conferences = transform_conference_data(new_conferences)
if not transformed_conferences:
print("Warning: No conferences transformed")
return
# Create conferences directory if it doesn't exist
conferences_dir = "src/data/conferences"
os.makedirs(conferences_dir, exist_ok=True)
# Group new conferences by title
new_conference_groups = {}
for conf in transformed_conferences:
title = conf["title"]
if title not in new_conference_groups:
new_conference_groups[title] = []
new_conference_groups[title].append(conf)
# Update each conference group
updated_count = 0
for title, new_confs in new_conference_groups.items():
filename = create_filename_from_title(title) + ".yml"
filepath = os.path.join(conferences_dir, filename)
# Get current conferences for this title
current_confs = current_conference_groups.get(title, [])
current_conf_dict = {conf["id"]: conf for conf in current_confs}
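            # Merge strategy: freshly fetched ccfddl fields win, except for the
            # manually curated fields listed below, which are carried over from
            # the existing entry when present.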
# Update or add new conferences
for new_conf in new_confs:
if new_conf["id"] in current_conf_dict:
# Update existing conference while preserving fields
curr_conf = current_conf_dict[new_conf["id"]]
# Preserve existing fields
preserved_fields = [
"tags",
"venue",
"hindex",
"submission_deadline",
"timezone_submission",
"rebuttal_period_start",
"rebuttal_period_end",
"final_decision_date",
"review_release_date",
"commitment_deadline",
"start",
"end",
"note",
"city",
"country",
"deadlines",
]
for field in preserved_fields:
if field in curr_conf:
new_conf[field] = curr_conf[field]
# Preserve existing rankings if available
if "rankings" in curr_conf:
new_conf["rankings"] = curr_conf["rankings"]
current_conf_dict[new_conf["id"]] = new_conf
else:
# Add new conference
current_conf_dict[new_conf["id"]] = new_conf
# Convert back to list and sort by year
all_confs = list(current_conf_dict.values())
all_confs.sort(key=lambda x: x.get("year", 9999))
# Write to individual file
with open(filepath, "w") as f:
yaml.dump(
all_confs,
f,
default_flow_style=False,
sort_keys=False,
allow_unicode=True,
)
updated_count += 1
print(f"Updated {filename} with {len(all_confs)} entries")
# Update the conference loader
update_conference_loader()
print(f"Successfully updated {updated_count} conference files")
except Exception as e:
print(f"Error: {e}")
raise
if __name__ == "__main__":
main()