# naazimsnh02's picture
# Initial deployment: Autonomous AI agent for code modernization
# ec4aa90
"""Code classification using AI."""
import json
from typing import Dict, List
import os
from dotenv import load_dotenv
from src.config import AIManager, GeminiSchemas
# Load variables from a .env file into the process environment at import time,
# i.e. before any CodeClassifier (and the AIManager it creates) is constructed.
# NOTE(review): presumably AIManager reads its API credentials from the
# environment -- confirm in src.config.
load_dotenv()
class CodeClassifier:
    """Classifies code files into modernization categories using Gemini.

    Every file is assigned exactly one of ``CATEGORIES``. Classification is
    best-effort: any failure while talking to the model, or any malformed
    entry in its answer, degrades to ``DEFAULT_CATEGORY`` instead of raising.
    """

    # Closed set of categories the model is allowed to return, in the order
    # they are reported by get_statistics().
    CATEGORIES = ("modernize_high", "modernize_low", "skip")

    # Category used when the model fails, omits a file, or returns an
    # unknown/malformed value.
    DEFAULT_CATEGORY = "skip"

    # Static part of the prompt: task description, category definitions and
    # few-shot examples. Kept as a joined tuple so the text does not depend on
    # source indentation.
    _PROMPT_INTRO = "\n".join((
        "You are a code modernization expert. Classify these files into categories.",
        "CATEGORIES:",
        "- modernize_high: Legacy patterns that need immediate update (Python 2, deprecated libs, security issues)",
        "- modernize_low: Minor improvements needed (add type hints, optimize imports)",
        "- skip: Already modern or non-code files",
        "FEW-SHOT EXAMPLES:",
        "1. utils/db.py (uses MySQLdb, string interpolation) → modernize_high",
        "2. config.py (hardcoded credentials) → modernize_high",
        "3. models/user.py (missing type hints) → modernize_low",
        "4. src/api/UserController.java (uses deprecated Vector, no generics) → modernize_high",
        "5. frontend/app.js (uses jQuery 1.x, inline event handlers) → modernize_high",
        "6. legacy_php/login.php (mysql_connect, no prepared statements) → modernize_high",
        "7. README.md → skip",
        "8. tests/test_api.py (uses unittest, modern Python 3) → skip",
        "9. package.json → skip",
        "10. .gitignore → skip",
    ))

    def __init__(self):
        """Initialize the classifier with AI client."""
        # Use centralized AI manager
        self.ai_manager = AIManager()

    def classify_files(self, file_list: List[str], batch_size: int = 25) -> Dict[str, str]:
        """
        Classify files using Gemini with few-shot prompting.

        Args:
            file_list: List of file paths to classify
            batch_size: Number of files to process per API call (must be >= 1)

        Returns:
            Dictionary mapping filenames to categories

        Raises:
            ValueError: If batch_size is smaller than 1.
        """
        # range() only rejects a step of 0; a negative step would silently
        # produce no batches and return an empty result, so reject both here.
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        all_results: Dict[str, str] = {}
        # Process in batches to avoid token limits
        for start in range(0, len(file_list), batch_size):
            batch = file_list[start:start + batch_size]
            all_results.update(self._classify_batch(batch))
        return all_results

    def _build_prompt(self, file_list: List[str]) -> str:
        """Return the full few-shot prompt for one batch of file paths."""
        return (
            f"{self._PROMPT_INTRO}\n"
            "FILES TO CLASSIFY:\n"
            f"{json.dumps(file_list, indent=2)}\n"
            "Return JSON object with filename as key and category as value.\n"
            'Example: {"file1.py": "modernize_high", "file2.js": "skip"}\n'
        )

    def _classify_batch(self, file_list: List[str]) -> Dict[str, str]:
        """
        Classify a batch of files.

        Args:
            file_list: File paths to send to the model in a single request

        Returns:
            Dictionary with a category for every path in file_list (plus any
            additional keys the model returned). On any error the whole batch
            is classified as DEFAULT_CATEGORY.
        """
        if not file_list:
            # Nothing to classify; avoid a pointless API call.
            return {}

        prompt = self._build_prompt(file_list)
        try:
            # Use JSON schema for guaranteed structure
            schema = GeminiSchemas.file_classification()
            response_text = self.ai_manager.generate_content(
                prompt=prompt,
                temperature=AIManager.TEMPERATURE_PRECISE,
                max_tokens=AIManager.MAX_OUTPUT_TOKENS_MEDIUM,
                response_format="json",
                response_schema=schema,
            )
            parsed = json.loads(response_text)
            if not isinstance(parsed, dict):
                raise TypeError(
                    f"expected a JSON object, got {type(parsed).__name__}"
                )
        except Exception as e:
            # Deliberately broad: this is a best-effort boundary around an
            # external service, and callers expect a result for every file.
            print(f"Error classifying batch: {e}")
            # Return default classifications on error
            return {f: self.DEFAULT_CATEGORY for f in file_list}

        return self._normalize_results(parsed, file_list)

    @classmethod
    def _normalize_results(cls, raw: dict, file_list: List[str]) -> Dict[str, str]:
        """
        Validate a decoded model answer against the known categories.

        Args:
            raw: Decoded JSON object returned by the model
            file_list: File paths that were sent in the request

        Returns:
            Dictionary where every value is one of CATEGORIES; unknown or
            non-string values become DEFAULT_CATEGORY, and requested files the
            model left out are added with DEFAULT_CATEGORY.
        """
        normalized: Dict[str, str] = {}
        for filename, category in raw.items():
            # isinstance guard: the model may return nested lists/objects.
            is_valid = isinstance(category, str) and category in cls.CATEGORIES
            normalized[filename] = category if is_valid else cls.DEFAULT_CATEGORY

        # Guarantee an entry for every requested file.
        for filename in file_list:
            normalized.setdefault(filename, cls.DEFAULT_CATEGORY)
        return normalized

    def get_statistics(self, classifications: Dict[str, str]) -> Dict[str, int]:
        """
        Get statistics about classifications.

        Args:
            classifications: Dictionary of file classifications

        Returns:
            Dictionary with counts per category and a "total" entry holding
            the number of classified files. Values outside CATEGORIES count
            towards "total" only.
        """
        stats = {category: 0 for category in self.CATEGORIES}
        stats["total"] = len(classifications)

        for category in classifications.values():
            # Check against CATEGORIES, not the stats dict: the latter also
            # contains "total", which must never be bumped by a file's label.
            if isinstance(category, str) and category in self.CATEGORIES:
                stats[category] += 1
        return stats