Spaces:

Spestly
/

VAML

Sleeping

App Files Files Community

VAML / app.py

Spestly

Rename main.py to app.py

3c02583 verified 3 months ago

raw

history blame contribute delete

32.8 kB

	"""
	ATAR Prediction System with ML Ensemble
	All-in-one Gradio app with training, inference, and HF Model Repo integration
	Optimized for ZeroGPU (no persistent storage needed)

	Author: Victor Academy
	"""

	import gradio as gr
	import numpy as np
	import pandas as pd
	import json
	import os
	from typing import List, Dict, Any, Tuple
	import warnings
	warnings.filterwarnings('ignore')

	# ZeroGPU support for Hugging Face Spaces
	# ZeroGPU support is optional: remember whether the `spaces` package could be imported
	try:
	    import spaces
	except ImportError:
	    # Package not importable -> run everything without the ZeroGPU decorator
	    ZEROGPU_AVAILABLE = False
	    print("ℹ️ Running without ZeroGPU (local mode)")
	else:
	    ZEROGPU_AVAILABLE = True
	    print("✅ ZeroGPU support enabled")

	# ML Libraries
	try:
	    from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
	    from sklearn.linear_model import Ridge
	    from sklearn.preprocessing import StandardScaler
	    from sklearn.model_selection import train_test_split
	    import joblib
	except ImportError:
	    # Best-effort bootstrap: install the missing packages, then retry the imports.
	    # pip is run through the current interpreter so the packages land in the
	    # environment this process is actually using.
	    import subprocess
	    import sys

	    print("⚠️ Installing scikit-learn...")
	    subprocess.run(
	        [sys.executable, "-m", "pip", "install", "scikit-learn", "joblib"],
	        check=False,  # a failed install surfaces as ImportError on the retry below
	    )
	    from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
	    from sklearn.linear_model import Ridge
	    from sklearn.preprocessing import StandardScaler
	    # Fixed: this retry used to import from the non-existent `sklearn.model_test_split`
	    from sklearn.model_selection import train_test_split
	    import joblib

	# Hugging Face Hub for model storage
	try:
	    from huggingface_hub import HfApi, login, hf_hub_download
	except ImportError:
	    # Best-effort bootstrap: install through the current interpreter's pip so the
	    # package ends up in the environment this process is running in.
	    import subprocess
	    import sys

	    print("⚠️ Installing huggingface_hub...")
	    subprocess.run(
	        [sys.executable, "-m", "pip", "install", "huggingface_hub"],
	        check=False,  # a failed install surfaces as ImportError on the retry below
	    )
	    from huggingface_hub import HfApi, login, hf_hub_download

	# ============================================
	# CONFIGURATION
	# ============================================

	# Hugging Face model repo that stores the trained artefacts
	HF_MODEL_REPO = "Spestly/VAML-ATAR"
	# Length of the feature vector fed to the ensemble
	FEATURE_COUNT = 18
	# Semantic version label (major.minor.patch)
	MODEL_VERSION = "v1.0.0"

	# Write-access token, needed for training/upload.
	# Create one at https://huggingface.co/settings/tokens and expose it to the Space via
	# Settings → Variables and secrets → HF_TOKEN = hf_xxxxx
	HF_TOKEN = os.environ.get('HF_TOKEN')

	if HF_TOKEN:
	    print("✅ HF_TOKEN found")
	else:
	    print("⚠️ Warning: HF_TOKEN not set! Training will fail.")
	    print(" Set HF_TOKEN environment variable in Space settings.")

	# Subject scaling data (2024 HSC data), keyed by subject name.
	# Per-subject fields:
	#   scaling_factor - compared against 1.05 ("high-scaling") and 0.98 ("low-scaling")
	#                    by the feature/insight/recommendation code in this file
	#   mean           - shown as "Mean Mark" in the Scaling Reference tab
	#   std            - not read by the code visible in this file
	#   difficulty     - label shown in the Scaling Reference tab
	# Subjects missing from this table are treated as scaling_factor 1.0 by the lookups.
	SUBJECT_SCALING_DATA = {
	'Mathematics Extension 2': {'scaling_factor': 1.1943, 'mean': 71.2, 'std': 12.5, 'difficulty': 'very_hard'},
	'Mathematics Extension 1': {'scaling_factor': 1.1547, 'mean': 69.8, 'std': 13.1, 'difficulty': 'hard'},
	'Mathematics Advanced': {'scaling_factor': 1.0821, 'mean': 72.5, 'std': 11.8, 'difficulty': 'medium'},
	'Physics': {'scaling_factor': 1.1037, 'mean': 70.3, 'std': 12.2, 'difficulty': 'hard'},
	'Chemistry': {'scaling_factor': 1.0956, 'mean': 71.1, 'std': 11.9, 'difficulty': 'hard'},
	'Biology': {'scaling_factor': 1.0234, 'mean': 73.8, 'std': 10.5, 'difficulty': 'medium'},
	'English Advanced': {'scaling_factor': 1.0000, 'mean': 75.2, 'std': 9.8, 'difficulty': 'medium'},
	'English Standard': {'scaling_factor': 0.9234, 'mean': 68.5, 'std': 11.2, 'difficulty': 'easy'},
	'Economics': {'scaling_factor': 1.0645, 'mean': 72.8, 'std': 11.3, 'difficulty': 'medium'},
	'Business Studies': {'scaling_factor': 0.9856, 'mean': 71.2, 'std': 10.8, 'difficulty': 'medium'},
	'Legal Studies': {'scaling_factor': 0.9923, 'mean': 72.5, 'std': 10.2, 'difficulty': 'medium'},
	'Modern History': {'scaling_factor': 1.0112, 'mean': 73.1, 'std': 10.6, 'difficulty': 'medium'},
	'Ancient History': {'scaling_factor': 1.0089, 'mean': 72.9, 'std': 10.4, 'difficulty': 'medium'},
	'PDHPE': {'scaling_factor': 0.9639, 'mean': 70.8, 'std': 11.5, 'difficulty': 'easy'},
	'Software Design & Development': {'scaling_factor': 1.0423, 'mean': 71.6, 'std': 12.1, 'difficulty': 'medium'},
	'Visual Arts': {'scaling_factor': 0.9734, 'mean': 76.2, 'std': 8.9, 'difficulty': 'easy'},
	'Music 2': {'scaling_factor': 1.0567, 'mean': 77.5, 'std': 9.2, 'difficulty': 'medium'},
	'Geography': {'scaling_factor': 0.9912, 'mean': 72.3, 'std': 10.7, 'difficulty': 'medium'},
	'Industrial Technology': {'scaling_factor': 0.9523, 'mean': 69.7, 'std': 11.8, 'difficulty': 'easy'},
	}

	# ============================================
	# FEATURE ENGINEERING
	# ============================================

	def extract_features(subjects: List[Dict]) -> np.ndarray:
	    """
	    Build the 18-value feature vector used by the ensemble from subject data.

	    Only the ten subjects with the highest ``raw_mark`` are considered.

	    Feature layout:
	    - [0:10] marks / 100, best first, zero-padded when fewer than ten subjects
	    - [10]   mean of the non-zero marks / 100
	    - [11]   standard deviation of the non-zero marks / 20, capped at 1
	    - [12]   number of subjects whose scaling factor is > 1.05, divided by 10
	    - [13]   mean trend score (improving=1.0, stable=0.5, declining=0.0)
	    - [14]   mean ``assessment_count`` / 10, capped at 1 (0 when there are no subjects)
	    - [15]   best mark / 90, capped at 1
	    - [16]   tenth-best mark / 90, capped at 1 (0 when that slot is padding)
	    - [17]   1.0 when the best-marked subject with 'English' in its name scored >= 80

	    Args:
	        subjects: Subject dicts; the keys read are 'subject_name', 'raw_mark',
	            'trend' and 'assessment_count', each with a default when absent.

	    Returns:
	        A float32 array of length 18.
	    """
	    # Keep only the ten best subjects, highest raw mark first
	    top_subjects = sorted(subjects, key=lambda s: s.get('raw_mark', 0), reverse=True)[:10]

	    # Marks, padded with zeros up to ten entries, then normalised to 0-1
	    marks = [s.get('raw_mark', 0) for s in top_subjects]
	    marks.extend([0] * (10 - len(marks)))
	    marks_normalized = [m / 100.0 for m in marks]

	    # Summary statistics ignore the zero padding (and any zero marks)
	    valid_marks = [m for m in marks if m > 0]
	    avg_mark = np.mean(valid_marks) if valid_marks else 0
	    std_dev = np.std(valid_marks) if len(valid_marks) > 1 else 0

	    # Count high-scaling subjects (factor > 1.05); unknown subjects count as 1.0
	    high_scaling_count = sum(
	        1 for s in top_subjects
	        if SUBJECT_SCALING_DATA.get(s.get('subject_name', ''), {}).get('scaling_factor', 1.0) > 1.05
	    )

	    # Trend score (0-1); unknown trend labels are treated as 'stable'
	    trend_map = {'improving': 1.0, 'stable': 0.5, 'declining': 0.0}
	    trends = [trend_map.get(s.get('trend', 'stable'), 0.5) for s in top_subjects]
	    trend_score = np.mean(trends) if trends else 0.5

	    # Assessment count score (normalised). Guard the empty case: np.mean([]) is NaN.
	    assessment_counts = [s.get('assessment_count', 1) for s in top_subjects]
	    assessment_score = min(np.mean(assessment_counts) / 10.0, 1.0) if assessment_counts else 0.0

	    # Quality metrics, capped at 1.0 so they match the range produced by the
	    # synthetic training data (which applies the same min(..., 1) cap)
	    top_mark_quality = min(marks[0] / 90.0, 1.0) if marks[0] > 0 else 0
	    bottom_mark_quality = min(marks[-1] / 90.0, 1.0) if marks[-1] > 0 else 0

	    # English quality flag: best-marked English subject at 80 or above
	    english_subjects = [s for s in top_subjects if 'English' in s.get('subject_name', '')]
	    has_good_english = 1.0 if english_subjects and english_subjects[0].get('raw_mark', 0) >= 80 else 0.0

	    features = marks_normalized + [
	        avg_mark / 100.0,
	        min(std_dev / 20.0, 1.0),
	        high_scaling_count / 10.0,
	        trend_score,
	        assessment_score,
	        top_mark_quality,
	        bottom_mark_quality,
	        has_good_english,
	    ]

	    return np.array(features, dtype=np.float32)

	# ============================================
	# DATA GENERATION (for training)
	# ============================================

	def generate_synthetic_data(n_samples: int = 10000) -> Tuple[np.ndarray, np.ndarray]:
	    """
	    Generate synthetic ATAR training data from a simplified aggregate formula.

	    Each sample draws ten marks from N(73, 10) clipped to [40, 100], derives the
	    same 18 features that `extract_features` produces, and computes a target ATAR
	    from the aggregate (out of 500) plus small adjustments and noise.

	    Args:
	        n_samples: Number of samples to generate (cast to int, so slider floats work).

	    Returns:
	        (X, y): features of shape (n_samples, 18) and targets of shape (n_samples,),
	        with targets clipped to [30, 99.95]. The output is deterministic (seed 42).
	    """
	    n_samples = int(n_samples)

	    # Private generator: same stream as np.random.seed(42) followed by the global
	    # functions, but without resetting the process-wide NumPy random state.
	    rng = np.random.RandomState(42)

	    X = []
	    y = []

	    for _ in range(n_samples):
	        # Ten subject marks, sorted best first
	        subject_marks = rng.normal(73, 10, 10)
	        subject_marks = np.clip(subject_marks, 40, 100)
	        subject_marks = np.sort(subject_marks)[::-1]

	        # Derived features
	        avg_mark = np.mean(subject_marks)
	        std_dev = np.std(subject_marks)
	        high_scaling_count = rng.randint(0, 6)
	        trend_score = rng.uniform(0, 1)
	        assessment_count = rng.uniform(0, 1)
	        top_mark_quality = min(subject_marks[0] / 90, 1)
	        bottom_mark_quality = min(subject_marks[-1] / 90, 1)
	        # Synthetic stand-in: the flag is derived from the top mark here
	        has_good_english = 1 if subject_marks[0] >= 80 else 0

	        # Aggregate out of 500: each mark (out of 100) contributes up to 50.
	        # Fixed: the previous `m * 2 / 50` capped the aggregate at 40/500, which
	        # pushed every base ATAR below 12 and clipped every target to exactly 30.
	        aggregate = float(np.sum(subject_marks)) / 2.0

	        # Base ATAR calculation
	        base_atar = 99.95 * (aggregate / 500) ** 0.85

	        # Adjustments
	        atar = base_atar + (high_scaling_count - 2.5) * 0.5
	        atar += (trend_score - 0.5) * 2
	        atar += rng.normal(0, 0.5)  # noise
	        atar = np.clip(atar, 30, 99.95)

	        # Features (normalised)
	        features = list(subject_marks / 100) + [
	            avg_mark / 100,
	            min(std_dev / 20, 1),
	            high_scaling_count / 10,
	            trend_score,
	            assessment_count,
	            top_mark_quality,
	            bottom_mark_quality,
	            has_good_english,
	        ]

	        X.append(features)
	        y.append(atar)

	    return np.array(X), np.array(y)

	# ============================================
	# MODEL TRAINING
	# ============================================

	class ATARMLEnsemble:
	    """
	    ML Ensemble for ATAR prediction.

	    Combines Gradient Boosting, Random Forest and Ridge Regression: every model
	    is fitted on standardised features and the prediction is the weighted sum of
	    the three model outputs.
	    """

	    def __init__(self):
	        """Create the untrained scaler, the three regressors and their weights."""
	        self.scaler = StandardScaler()
	        self.models = {
	            'gb': GradientBoostingRegressor(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42),
	            'rf': RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42),
	            'ridge': Ridge(alpha=1.0, random_state=42)
	        }
	        self.weights = {'gb': 0.5, 'rf': 0.3, 'ridge': 0.2}  # Ensemble weights
	        self.is_trained = False

	    def train(self, X, y, X_test=None, y_test=None):
	        """
	        Fit the scaler and all models, then record MAE metrics.

	        Sets `is_trained`, `training_samples`, `train_mae` and `test_mae`
	        (`test_mae` is None unless both X_test and y_test are given).
	        """
	        print(f"🚀 Training on {len(X)} samples...")

	        # Scale features
	        X_scaled = self.scaler.fit_transform(X)

	        # Train each model
	        for name, model in self.models.items():
	            print(f"Training {name}...")
	            model.fit(X_scaled, y)

	        # Must be set before predict() is used for the metrics below
	        self.is_trained = True
	        self.training_samples = len(X)

	        # Store metrics for versioning
	        train_pred = self.predict(X)
	        self.train_mae = np.mean(np.abs(train_pred - y))

	        if X_test is not None and y_test is not None:
	            test_pred = self.predict(X_test)
	            self.test_mae = np.mean(np.abs(test_pred - y_test))
	        else:
	            self.test_mae = None

	        print("✅ Ensemble training complete!")

	    def predict(self, X):
	        """
	        Predict with the weighted ensemble.

	        Raises:
	            ValueError: if the ensemble has been neither trained nor loaded.
	        """
	        if not self.is_trained:
	            raise ValueError("Model not trained! Train first or load from HF.")

	        X_scaled = self.scaler.transform(X)

	        # Get predictions from each model
	        predictions = {name: model.predict(X_scaled) for name, model in self.models.items()}

	        # Weighted average
	        return sum(predictions[name] * self.weights[name] for name in self.models)

	    def save_local(self, path='models'):
	        """Write scaler.pkl, one <name>.pkl per model and weights.pkl into `path`."""
	        os.makedirs(path, exist_ok=True)
	        joblib.dump(self.scaler, os.path.join(path, 'scaler.pkl'))
	        for name, model in self.models.items():
	            joblib.dump(model, os.path.join(path, f'{name}.pkl'))
	        joblib.dump(self.weights, os.path.join(path, 'weights.pkl'))
	        print(f"✅ Models saved to {path}/")

	    def load_local(self, path='models'):
	        """
	        Load scaler, models and weights from `path` and mark the ensemble trained.

	        All files are read before any attribute is replaced, so a missing or
	        corrupt file leaves the current ensemble state untouched.
	        """
	        # SECURITY NOTE: joblib.load unpickles its input, which can execute arbitrary
	        # code. Only point this at files from a repo/location you trust.
	        scaler = joblib.load(os.path.join(path, 'scaler.pkl'))
	        models = {name: joblib.load(os.path.join(path, f'{name}.pkl')) for name in self.models}
	        weights = joblib.load(os.path.join(path, 'weights.pkl'))

	        # Commit only after every artefact loaded successfully
	        self.scaler = scaler
	        self.models.update(models)
	        self.weights = weights
	        self.is_trained = True
	        print(f"✅ Models loaded from {path}/")

	# Global model instance: the single ensemble shared by the training, upload,
	# download and prediction functions in this module
	ensemble = ATARMLEnsemble()

	# ============================================
	# HUGGING FACE INTEGRATION
	# ============================================

	def upload_to_hf(version: str = None, repo_name: str = HF_MODEL_REPO):
	    """
	    Upload the locally saved models (models/*.pkl) to the HF Model Repo with versioning.

	    Versioning strategy:
	    - models/{version}/ → Specific version (e.g., models/v1.0.0/)
	    - models/latest/ → Always a copy of the newest version
	    - metadata.json → Tracks all versions and metrics

	    Args:
	        version: Version label; a timestamp label is generated when None or blank.
	        repo_name: Target HF model repo id.

	    Returns:
	        A human-readable status string. Failures are reported as a string starting
	        with "❌" rather than raised.
	    """
	    try:
	        import tempfile
	        from datetime import datetime
	        from huggingface_hub.utils import EntryNotFoundError

	        # Check if HF_TOKEN is set
	        if not HF_TOKEN:
	            return "❌ HF_TOKEN not set! Please set it as environment variable in Space settings."

	        # Login to HF
	        login(token=HF_TOKEN)
	        api = HfApi()

	        # Use provided version or generate from timestamp (blank counts as missing,
	        # otherwise files would land under "models//")
	        if not version or not version.strip():
	            version = datetime.now().strftime("v%Y%m%d_%H%M%S")
	        version = version.strip()

	        # Create repo if it doesn't exist. Stays best-effort, but the real reason
	        # for a failure is now shown instead of being reported as "already exists".
	        try:
	            api.create_repo(repo_id=repo_name, repo_type="model", private=False, exist_ok=True)
	            print(f"✅ Repo ready: {repo_name}")
	        except Exception as create_err:
	            print(f"ℹ️ Could not create repo {repo_name} ({create_err}); assuming it already exists")

	        files = ['scaler.pkl', 'gb.pkl', 'rf.pkl', 'ridge.pkl', 'weights.pkl']

	        print(f"📤 Uploading version: {version}")

	        # Upload to the specific version folder first, then refresh 'latest'
	        for folder in (version, 'latest'):
	            for file in files:
	                api.upload_file(
	                    path_or_fileobj=f'models/{file}',
	                    path_in_repo=f'models/{folder}/{file}',
	                    repo_id=repo_name,
	                    repo_type="model"
	                )

	        fresh_metadata = {
	            "versions": [],
	            "latest_version": None,
	            "model_type": "ML Ensemble (Gradient Boosting + Random Forest + Ridge)",
	            "feature_count": FEATURE_COUNT
	        }

	        # Download existing metadata if it exists. Only "file not found" (or an
	        # unreadable file) starts a fresh history; any other error (e.g. network)
	        # aborts, so existing version history is never overwritten by accident.
	        try:
	            with tempfile.TemporaryDirectory() as temp_dir:
	                metadata_path = hf_hub_download(
	                    repo_id=repo_name,
	                    filename="metadata.json",
	                    repo_type="model",
	                    cache_dir=temp_dir
	                )
	                with open(metadata_path, 'r') as f:
	                    metadata = json.load(f)
	        except EntryNotFoundError:
	            metadata = fresh_metadata
	        except json.JSONDecodeError:
	            print("⚠️ Existing metadata.json is not valid JSON; starting a new version history")
	            metadata = fresh_metadata

	        # Add new version to metadata
	        new_version_info = {
	            "version": version,
	            "timestamp": datetime.now().isoformat(),
	            "training_samples": getattr(ensemble, 'training_samples', "unknown"),
	            "train_mae": getattr(ensemble, 'train_mae', None),
	            "test_mae": getattr(ensemble, 'test_mae', None),
	            "model_files": files
	        }

	        metadata.setdefault("versions", []).append(new_version_info)
	        metadata["latest_version"] = version
	        metadata["total_versions"] = len(metadata["versions"])

	        # Save updated metadata locally
	        os.makedirs('models', exist_ok=True)
	        with open('models/metadata.json', 'w') as f:
	            json.dump(metadata, f, indent=2)

	        # Upload metadata
	        api.upload_file(
	            path_or_fileobj='models/metadata.json',
	            path_in_repo='metadata.json',
	            repo_id=repo_name,
	            repo_type="model"
	        )

	        return f"""✅ Models uploaded successfully!

	📦 Version: {version}
	🔗 Repo: https://huggingface.co/{repo_name}
	📊 Total versions: {len(metadata['versions'])}

	Access:
	- Latest: models/latest/
	- This version: models/{version}/
	- All versions: See metadata.json
	"""
	    except Exception as e:
	        return f"❌ Upload failed: {str(e)}"

	def download_from_hf(version: str = "latest", repo_name: str = HF_MODEL_REPO, token: str = None):
	    """
	    Download models from the HF Model Repo and load them into the global ensemble.

	    Args:
	        version: Version to load ('latest', 'v1.0.0', etc.); blank means 'latest'.
	        repo_name: HF model repo name.
	        token: HF token (optional - only needed for private repos). Falls back to
	            the HF_TOKEN environment value.

	    Returns:
	        A human-readable status string. Failures are reported as a string starting
	        with "❌" rather than raised.
	    """
	    import shutil

	    try:
	        # A cleared textbox should behave like the documented default
	        version = (version or "").strip() or "latest"

	        os.makedirs('models', exist_ok=True)

	        # Use provided token, or environment variable, or None (for public repos)
	        auth_token = token or HF_TOKEN

	        # Determine path based on version
	        path_prefix = f"models/{version}/"

	        files = ['scaler.pkl', 'gb.pkl', 'rf.pkl', 'ridge.pkl', 'weights.pkl']

	        print(f"📥 Downloading version: {version}")
	        if auth_token:
	            print("🔒 Using authentication (private repo)")
	        else:
	            print("🌐 No token - assuming public repo")

	        for file in files:
	            local_path = hf_hub_download(
	                repo_id=repo_name,
	                filename=f"{path_prefix}{file}",
	                repo_type="model",
	                cache_dir='models',
	                token=auth_token
	            )
	            # Copy out of the hub cache layout into the flat models/ directory
	            shutil.copy(local_path, f'models/{file}')

	        # Load into ensemble.
	        # NOTE: this unpickles the downloaded files - only use repos you trust.
	        ensemble.load_local('models')

	        # Try to get version info from metadata (purely informational, best-effort)
	        try:
	            metadata_path = hf_hub_download(
	                repo_id=repo_name,
	                filename="metadata.json",
	                repo_type="model",
	                cache_dir='models',
	                token=auth_token
	            )
	            with open(metadata_path, 'r') as f:
	                metadata = json.load(f)

	            # 'latest' is a folder alias, not a recorded version: resolve it through
	            # metadata["latest_version"] so the details below are not always unknown
	            wanted = metadata.get("latest_version") if version == "latest" else version
	            version_info = next(
	                (v for v in metadata.get("versions", []) if v.get("version") == wanted),
	                None
	            ) or {}

	            info_str = f"""✅ Models loaded successfully!

	📦 Version: {version}
	📅 Trained: {version_info.get('timestamp', 'Unknown')}
	📊 Train MAE: {version_info.get('train_mae', 'N/A')} ATAR points
	📊 Test MAE: {version_info.get('test_mae', 'N/A')} ATAR points
	🔗 Repo: https://huggingface.co/{repo_name}
	"""
	            return info_str
	        except Exception:
	            # Models are already loaded; missing/unreadable metadata is not an error
	            return f"✅ Models loaded from https://huggingface.co/{repo_name} ({version})"

	    except Exception as e:
	        return f"❌ Download failed: {str(e)}\nTrain the model first or check version name!"

	# ============================================
	# PREDICTION LOGIC
	# ============================================

	def predict_atar(subjects: List[Dict]) -> Dict[str, Any]:
	    """
	    Predict ATAR using the ML ensemble.

	    Auto-loads the 'latest' model from HF if the ensemble is not loaded yet.

	    Args:
	        subjects: Subject dicts (see `extract_features` for the keys that are read).

	    Returns:
	        On success: 'predicted_atar' (clipped to [30, 99.95], rounded to 2 dp),
	        'confidence', 'insights' and 'recommendations'.
	        On failure: 'error' plus zeroed 'predicted_atar' and 'confidence'.
	    """
	    # Nothing to score: an all-zero feature vector would give a meaningless result
	    if not subjects:
	        return {
	            'error': 'No subjects provided. Please supply at least one subject.',
	            'predicted_atar': 0,
	            'confidence': 0
	        }

	    # Auto-load model if not trained
	    if not ensemble.is_trained:
	        result = download_from_hf()
	        if "❌" in result or not ensemble.is_trained:
	            return {
	                'error': 'Model not trained or available. Please train first!',
	                'predicted_atar': 0,
	                'confidence': 0
	            }

	    # Extract features as a single-row matrix
	    features = extract_features(subjects)
	    X = features.reshape(1, -1)

	    # Predict; cast to a built-in float so the result is always JSON-serialisable
	    predicted_atar = float(np.clip(ensemble.predict(X)[0], 30, 99.95))

	    # Calculate confidence (based on data quality)
	    confidence = calculate_confidence(subjects)

	    # Generate insights
	    insights = generate_insights(subjects, predicted_atar)
	    recommendations = generate_recommendations(subjects, predicted_atar)

	    return {
	        'predicted_atar': round(predicted_atar, 2),
	        'confidence': round(confidence, 2),
	        'insights': insights,
	        'recommendations': recommendations
	    }

	def calculate_confidence(subjects: List[Dict]) -> float:
	    """
	    Score how much the supplied data can be trusted, from 0.0 to 1.0.

	    Weighted blend of three data-quality factors:
	    - 40%: assessments recorded, relative to 5 per subject (capped at 1)
	    - 30%: number of subjects, relative to 10 (capped at 1)
	    - 30%: share of subjects that carry a 'trend' key

	    Returns 0.0 when no subjects are given.
	    """
	    if not subjects:
	        return 0.0

	    n_subjects = len(subjects)

	    total_assessments = sum(entry.get('assessment_count', 0) for entry in subjects)
	    completeness = min(total_assessments / (n_subjects * 5), 1.0)

	    breadth = min(n_subjects / 10, 1.0)

	    with_trend = sum(1 for entry in subjects if 'trend' in entry)
	    trend_coverage = with_trend / n_subjects

	    return 0.4 * completeness + 0.3 * breadth + 0.3 * trend_coverage

	def generate_insights(subjects: List[Dict], predicted_atar: float) -> List[str]:
	    """
	    Produce short insight messages for a prediction.

	    The first message depends on the ATAR band (>= 95, >= 85, >= 75, otherwise).
	    A second message is added when at least three subjects have a scaling
	    factor above 1.05.
	    """
	    # Bands are checked from the highest threshold down; first match wins
	    bands = (
	        (95, "🎯 Excellent performance! You're on track for elite universities."),
	        (85, "📈 Strong performance! Many competitive courses within reach."),
	        (75, "✅ Solid foundation! Focus on improvement areas for better outcomes."),
	    )
	    headline = "💪 Room for growth! Strategic improvement can boost your ATAR significantly."
	    for threshold, message in bands:
	        if predicted_atar >= threshold:
	            headline = message
	            break

	    insights = [headline]

	    # Subject mix analysis
	    boosted = 0
	    for entry in subjects:
	        factor = SUBJECT_SCALING_DATA.get(entry.get('subject_name', ''), {}).get('scaling_factor', 1.0)
	        if factor > 1.05:
	            boosted += 1
	    if boosted >= 3:
	        insights.append(f"⭐ Your {boosted} high-scaling subjects will boost your ATAR!")

	    return insights

	def generate_recommendations(subjects: List[Dict], predicted_atar: float) -> List[str]:
	    """
	    Suggest where to improve.

	    Adds one tip for the subject with the lowest raw mark and, if any subject has
	    a scaling factor below 0.98, one tip about the first such subject.
	    `predicted_atar` is accepted but not used.
	    """
	    tips = []

	    # Weakest subject: first entry with the lowest raw mark
	    if subjects:
	        weakest = min(subjects, key=lambda entry: entry.get('raw_mark', 0))
	        weakest_name = weakest.get('subject_name', 'weakest subject')
	        tips.append(f"🎯 Focus on {weakest_name} - raising this by 5 marks could add ~1 ATAR point")

	    # First low-scaling subject, if any
	    first_low = None
	    for entry in subjects:
	        factor = SUBJECT_SCALING_DATA.get(entry.get('subject_name', ''), {}).get('scaling_factor', 1.0)
	        if factor < 0.98:
	            first_low = entry
	            break
	    if first_low is not None:
	        tips.append(f"⚖️ Consider if {first_low.get('subject_name')} is in your best 10 units")

	    return tips

	# ============================================
	# GRADIO INTERFACE
	# ============================================

	@spaces.GPU(duration=120) if ZEROGPU_AVAILABLE else lambda x: x
	def train_model_interface(n_samples: int, version: str = None):
	    """
	    Train the ensemble on synthetic data, save it locally and upload it to HF.

	    Generator used as a Gradio handler: every yielded string replaces the
	    progress text. Errors are yielded as a "❌ Training failed" message.

	    Args:
	        n_samples: Number of synthetic samples (cast to int; sliders may send floats).
	        version: Version label; a timestamp label is generated when empty.
	    """
	    # NOTE(review): if spaces.GPU executes this handler in a separate worker process,
	    # the in-memory `ensemble` trained here may not be visible to other handlers -
	    # confirm; prediction can still auto-load the uploaded model.
	    try:
	        n_samples = int(n_samples)

	        # Generate data
	        yield "📊 Generating synthetic training data..."
	        X, y = generate_synthetic_data(n_samples)

	        # Split
	        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

	        # Train
	        yield "🚀 Training ML ensemble (Gradient Boosting + Random Forest + Ridge)..."
	        ensemble.train(X_train, y_train, X_test, y_test)

	        # train() already computed both MAEs on exactly these splits; reuse them
	        # instead of running every model over the data a second time
	        train_mae = ensemble.train_mae
	        test_mae = ensemble.test_mae

	        summary = (
	            f"✅ Training complete!\n\n📊 Results:\n- Train MAE: {train_mae:.2f} ATAR points\n"
	            f"- Test MAE: {test_mae:.2f} ATAR points\n- Training samples: {len(X_train):,}"
	        )

	        yield f"{summary}\n\n💾 Saving models locally..."

	        # Save locally
	        ensemble.save_local('models')

	        # Upload to HF with versioning
	        yield f"✅ Models saved!\n\n☁️ Uploading to Hugging Face with versioning..."

	        # Auto-generate version if not provided
	        if not version or version.strip() == "":
	            from datetime import datetime
	            version = datetime.now().strftime("v%Y%m%d_%H%M%S")

	        result = upload_to_hf(version=version)
	        yield f"{summary}\n\n{result}\n\n🎉 Model ready for inference!"

	    except Exception as e:
	        yield f"❌ Training failed: {str(e)}"

	@spaces.GPU(duration=5) if ZEROGPU_AVAILABLE else lambda x: x
	def predict_interface(subjects_json: str):
	    """
	    Predict ATAR from a JSON string and return the result as a JSON string.

	    Args:
	        subjects_json: JSON array of subject objects.

	    Returns:
	        Pretty-printed JSON of the prediction, or {"error": ...} when the input is
	        not valid JSON, has the wrong shape, or prediction fails.
	    """
	    try:
	        subjects = json.loads(subjects_json)

	        # Reject wrong shapes up front with a readable message instead of the
	        # AttributeError the feature code would raise on non-dict entries
	        if not isinstance(subjects, list) or not all(isinstance(s, dict) for s in subjects):
	            return json.dumps({'error': 'Input must be a JSON array of subject objects'})

	        result = predict_atar(subjects)
	        return json.dumps(result, indent=2)
	    except Exception as e:
	        return json.dumps({'error': str(e)})

	# ============================================
	# GRADIO APP
	# ============================================

	# Builds the whole Gradio UI: four tabs (training, JSON API, simple calculator,
	# scaling reference) wired to the handler functions defined in this module.
	with gr.Blocks(title="ATAR Prediction ML Ensemble", theme=gr.themes.Soft()) as app:
	    gr.Markdown("""
	# 🎓 ATAR Prediction System (ML Ensemble)
	Powered by Gradient Boosting + Random Forest + Ridge Regression

	### Features:
	- 🚀 Train on ZeroGPU with automatic HF Model Repo upload
	- 🔮 Predict ATAR from subject marks (auto-loads model from HF)
	- ☁️ No persistent storage needed - models live in HF Model Repo
	""")

	    with gr.Tabs():
	        # Tab 1: Training
	        with gr.Tab("🏋️ Train Model"):
	            gr.Markdown("### Train ML Ensemble & Upload to Hugging Face")

	            with gr.Row():
	                n_samples_input = gr.Slider(1000, 50000, value=10000, step=1000, label="Training Samples")
	                version_input = gr.Textbox(
	                    label="Version (optional - auto-generated if empty)",
	                    placeholder="v1.0.0 or leave empty for timestamp",
	                    value=""
	                )

	            train_btn = gr.Button("🚀 Train & Upload to HF", variant="primary", size="lg")
	            train_output = gr.Textbox(label="Training Progress", lines=12)

	            train_btn.click(
	                fn=train_model_interface,
	                inputs=[n_samples_input, version_input],
	                outputs=train_output
	            )

	            # The repo name is interpolated from HF_MODEL_REPO so the instructions
	            # always name the repo the upload code actually targets
	            gr.Markdown(f"""
	Instructions:
	1. Set `HF_TOKEN` environment variable in Space settings (write access)
	- Go to Space Settings → Variables and secrets
	- Add secret: `HF_TOKEN` = your token from https://huggingface.co/settings/tokens
	2. (Optional) Specify version like `v1.0.0`, `v1.1.0`, etc. or leave empty for auto timestamp
	3. Click "Train & Upload to HF"
	4. Model will be uploaded to `{HF_MODEL_REPO}`
	5. Each training creates a new version - no overwrites!

	Versioning:
	- `models/latest/` - Always the newest model
	- `models/v1.0.0/` - Specific version you can roll back to
	- `metadata.json` - Tracks all versions with metrics

	ZeroGPU:
	- Training uses GPU for 120 seconds (free tier)
	- Inference uses GPU for 5 seconds per request
	- All model storage handled via HF Model Repo
	""")

	        # Tab 2: JSON API
	        with gr.Tab("🔌 JSON API"):
	            gr.Markdown("### Predict ATAR (JSON API)")

	            with gr.Row():
	                load_version_input = gr.Textbox(
	                    label="Model Version to Load (optional)",
	                    placeholder="latest (default), v1.0.0, v20241007_143022, etc.",
	                    value="latest"
	                )
	                load_btn = gr.Button("📥 Load Model", variant="secondary")

	            load_status = gr.Textbox(label="Load Status", lines=3)

	            def load_model_interface(version):
	                """Load the requested model version and return the status text."""
	                return download_from_hf(version=version)

	            load_btn.click(
	                fn=load_model_interface,
	                inputs=load_version_input,
	                outputs=load_status
	            )

	            gr.Markdown("---")

	            subjects_input = gr.Code(
	                label="Input: Subjects JSON",
	                language="json",
	                value=json.dumps([
	                    {"subject_name": "Mathematics Extension 2", "raw_mark": 88.5, "trend": "improving", "assessment_count": 4},
	                    {"subject_name": "Physics", "raw_mark": 85.0, "trend": "stable", "assessment_count": 5},
	                    {"subject_name": "Chemistry", "raw_mark": 84.0, "trend": "stable", "assessment_count": 5},
	                    {"subject_name": "English Advanced", "raw_mark": 82.0, "trend": "improving", "assessment_count": 4},
	                    {"subject_name": "Software Design & Development", "raw_mark": 86.0, "trend": "improving", "assessment_count": 3}
	                ], indent=2)
	            )

	            predict_btn = gr.Button("🔮 Predict ATAR", variant="primary")
	            prediction_output = gr.Code(label="Output: Prediction JSON", language="json")

	            predict_btn.click(
	                fn=predict_interface,
	                inputs=subjects_input,
	                outputs=prediction_output
	            )

	            gr.Markdown("""
	Note:
	- Model auto-loads `latest` version on first API call if not manually loaded
	- Manually load a specific version to test different models
	- All versions are preserved in HF Model Repo
	- Public repos: No token needed for downloads
	- Private repos: Set `HF_TOKEN` environment variable in Space settings
	""")

	        # Tab 3: Simple Calculator
	        with gr.Tab("📝 Simple Calculator"):
	            gr.Markdown("### Quick ATAR Estimate")

	            with gr.Row():
	                with gr.Column():
	                    subj1 = gr.Dropdown(choices=list(SUBJECT_SCALING_DATA.keys()), label="Subject 1")
	                    mark1 = gr.Slider(0, 100, 85, label="Mark")
	                with gr.Column():
	                    subj2 = gr.Dropdown(choices=list(SUBJECT_SCALING_DATA.keys()), label="Subject 2")
	                    mark2 = gr.Slider(0, 100, 85, label="Mark")

	            with gr.Row():
	                with gr.Column():
	                    subj3 = gr.Dropdown(choices=list(SUBJECT_SCALING_DATA.keys()), label="Subject 3")
	                    mark3 = gr.Slider(0, 100, 85, label="Mark")
	                with gr.Column():
	                    subj4 = gr.Dropdown(choices=list(SUBJECT_SCALING_DATA.keys()), label="Subject 4")
	                    mark4 = gr.Slider(0, 100, 85, label="Mark")

	            calc_btn = gr.Button("Calculate ATAR", variant="primary")
	            calc_output = gr.Textbox(label="Result", lines=8)

	            def simple_calc(s1, m1, s2, m2, s3, m3, s4, m4):
	                """Predict from up to four (subject, mark) pairs; unselected subjects are skipped."""
	                subjects = []
	                for s, m in [(s1, m1), (s2, m2), (s3, m3), (s4, m4)]:
	                    if s:
	                        # The calculator has no trend/assessment inputs, so fixed defaults are used
	                        subjects.append({"subject_name": s, "raw_mark": m, "trend": "stable", "assessment_count": 3})

	                if not subjects:
	                    return "⚠️ Please select at least one subject"

	                result = predict_atar(subjects)

	                if 'error' in result:
	                    return f"❌ {result['error']}"

	                output = f"🎯 Predicted ATAR: {result['predicted_atar']}\n"
	                output += f"📊 Confidence: {result['confidence']*100:.0f}%\n\n"
	                output += "💡 Insights:\n" + "\n".join(result['insights'])
	                return output

	            calc_btn.click(
	                fn=simple_calc,
	                inputs=[subj1, mark1, subj2, mark2, subj3, mark3, subj4, mark4],
	                outputs=calc_output
	            )

	        # Tab 4: Scaling Reference
	        with gr.Tab("📊 Scaling Reference"):
	            gr.Markdown("### 2024 HSC Subject Scaling Data")

	            # One row per subject, highest scaling factor first
	            scaling_df = pd.DataFrame([
	                {
	                    'Subject': name,
	                    'Scaling Factor': f"{data['scaling_factor']:.4f}",
	                    'Mean Mark': data['mean'],
	                    'Difficulty': data['difficulty']
	                }
	                for name, data in sorted(SUBJECT_SCALING_DATA.items(),
	                                         key=lambda x: x[1]['scaling_factor'],
	                                         reverse=True)
	            ])

	            gr.Dataframe(scaling_df, label="Subject Scaling Factors (sorted by scaling)")

	# ============================================
	# LAUNCH
	# ============================================

	if __name__ == "__main__":
	    # Listen on all interfaces on port 7860 and request a public share link
	    launch_options = {
	        "share": True,
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	    }
	    app.launch(**launch_options)