Spaces:

MCP-1st-Birthday
/

legacy_code_modernizer

Running

App Files Files Community

legacy_code_modernizer / src /agents /classifier.py

naazimsnh02

Initial deployment: Autonomous AI agent for code modernization

ec4aa90 20 days ago

raw

history blame contribute delete

4.11 kB

	"""Code classification using AI."""

	import json
	from typing import Dict, List
	import os
	from dotenv import load_dotenv

	from src.config import AIManager, GeminiSchemas

	# Load variables from a local .env file into the process environment
	# (python-dotenv) at import time, before any classifier is instantiated.
	load_dotenv()


	class CodeClassifier:
	    """Classifies code files into modernization categories using Gemini.

	    The classifier sends file paths (not file contents) to the shared
	    ``AIManager`` in batches and maps every requested path to one of
	    ``VALID_CATEGORIES``. Any failure or unusable answer degrades to
	    ``DEFAULT_CATEGORY`` instead of raising.
	    """

	    # Closed set of categories the model is allowed to return, in the order
	    # they are reported by get_statistics().
	    VALID_CATEGORIES = ("modernize_high", "modernize_low", "skip")
	    # Fallback used for invalid, missing, or failed classifications.
	    DEFAULT_CATEGORY = "skip"

	    def __init__(self):
	        """Initialize the classifier with AI client."""
	        # Use centralized AI manager
	        self.ai_manager = AIManager()

	    def classify_files(self, file_list: List[str], batch_size: int = 25) -> Dict[str, str]:
	        """
	        Classify files using Gemini with few-shot prompting.

	        Args:
	            file_list: List of file paths to classify
	            batch_size: Number of files to process per API call (must be >= 1)

	        Returns:
	            Dictionary mapping filenames to categories

	        Raises:
	            ValueError: If ``batch_size`` is smaller than 1.
	        """
	        # A zero step makes range() raise and a negative step silently yields
	        # no batches at all; reject both up front with one clear error.
	        if batch_size < 1:
	            raise ValueError("batch_size must be a positive integer")

	        all_results: Dict[str, str] = {}

	        # Process in batches to avoid token limits
	        for start in range(0, len(file_list), batch_size):
	            batch = file_list[start:start + batch_size]
	            all_results.update(self._classify_batch(batch))

	        return all_results

	    @staticmethod
	    def _build_prompt(file_list: List[str]) -> str:
	        """Return the few-shot classification prompt for one batch of paths."""
	        return f"""You are a code modernization expert. Classify these files into categories.

	CATEGORIES:
	- modernize_high: Legacy patterns that need immediate update (Python 2, deprecated libs, security issues)
	- modernize_low: Minor improvements needed (add type hints, optimize imports)
	- skip: Already modern or non-code files

	FEW-SHOT EXAMPLES:
	1. utils/db.py (uses MySQLdb, string interpolation) → modernize_high
	2. config.py (hardcoded credentials) → modernize_high
	3. models/user.py (missing type hints) → modernize_low
	4. src/api/UserController.java (uses deprecated Vector, no generics) → modernize_high
	5. frontend/app.js (uses jQuery 1.x, inline event handlers) → modernize_high
	6. legacy_php/login.php (mysql_connect, no prepared statements) → modernize_high
	7. README.md → skip
	8. tests/test_api.py (uses unittest, modern Python 3) → skip
	9. package.json → skip
	10. .gitignore → skip

	FILES TO CLASSIFY:
	{json.dumps(file_list, indent=2)}

	Return JSON object with filename as key and category as value.
	Example: {{"file1.py": "modernize_high", "file2.js": "skip"}}
	"""

	    @staticmethod
	    def _normalize_results(file_list: List[str], result: Dict[str, str]) -> Dict[str, str]:
	        """Map every requested file to a valid category.

	        Files the model omitted, or answered with an unknown category, get
	        ``DEFAULT_CATEGORY``; keys the model returned that were never
	        requested are discarded.
	        """
	        normalized: Dict[str, str] = {}
	        for filename in file_list:
	            category = result.get(filename, CodeClassifier.DEFAULT_CATEGORY)
	            # Tuple membership compares by equality, so an unhashable value
	            # (e.g. a list) is simply treated as invalid instead of raising.
	            if category not in CodeClassifier.VALID_CATEGORIES:
	                category = CodeClassifier.DEFAULT_CATEGORY
	            normalized[filename] = category
	        return normalized

	    def _classify_batch(self, file_list: List[str]) -> Dict[str, str]:
	        """Classify a batch of files.

	        Args:
	            file_list: File paths to send in a single API call

	        Returns:
	            Dictionary with exactly one entry per path in ``file_list``.
	            On any error every path is mapped to ``DEFAULT_CATEGORY``.
	        """
	        # Nothing to classify: avoid a pointless API call.
	        if not file_list:
	            return {}

	        prompt = self._build_prompt(file_list)

	        try:
	            # Use JSON schema for guaranteed structure
	            schema = GeminiSchemas.file_classification()

	            response_text = self.ai_manager.generate_content(
	                prompt=prompt,
	                temperature=AIManager.TEMPERATURE_PRECISE,
	                max_tokens=AIManager.MAX_OUTPUT_TOKENS_MEDIUM,
	                response_format="json",
	                response_schema=schema,
	            )

	            result = json.loads(response_text)
	            if not isinstance(result, dict):
	                raise TypeError(
	                    f"expected a JSON object, got {type(result).__name__}"
	                )
	        except Exception as e:
	            # Deliberate best-effort boundary: a failed batch must not abort
	            # the whole run, so report it and fall back to the default.
	            print(f"Error classifying batch: {e}")
	            return {f: self.DEFAULT_CATEGORY for f in file_list}

	        return self._normalize_results(file_list, result)

	    def get_statistics(self, classifications: Dict[str, str]) -> Dict[str, int]:
	        """
	        Get statistics about classifications.

	        Args:
	            classifications: Dictionary of file classifications

	        Returns:
	            Dictionary with counts per category plus a ``"total"`` entry
	            holding the number of classified files. Values outside
	            ``VALID_CATEGORIES`` count towards the total only.
	        """
	        stats: Dict[str, int] = dict.fromkeys(self.VALID_CATEGORIES, 0)

	        for category in classifications.values():
	            # Check against the category set, not the stats keys, so a stray
	            # "total" value can never inflate the total counter.
	            if category in self.VALID_CATEGORIES:
	                stats[category] += 1

	        stats["total"] = len(classifications)
	        return stats