|
|
"""Code classification using AI.""" |
|
|
|
|
|
import json |
|
|
from typing import Dict, List |
|
|
import os |
|
|
from dotenv import load_dotenv |
|
|
|
|
|
from src.config import AIManager, GeminiSchemas |
|
|
|
|
|
load_dotenv() |
|
|
|
|
|
|
|
|
class CodeClassifier:
    """Classifies code files into modernization categories using Gemini.

    File paths are sent to the model in batches. Every requested file always
    receives exactly one category; whenever the model's answer for a file is
    missing or unusable the file falls back to ``DEFAULT_CATEGORY``.
    """

    # Categories the model is allowed to assign, in reporting order.
    VALID_CATEGORIES = ("modernize_high", "modernize_low", "skip")

    # Category assigned when the model's answer is missing or invalid.
    DEFAULT_CATEGORY = "skip"

    def __init__(self):
        """Initialize the classifier with AI client."""
        self.ai_manager = AIManager()

    def classify_files(self, file_list: List[str], batch_size: int = 25) -> Dict[str, str]:
        """
        Classify files using Gemini with few-shot prompting.

        Args:
            file_list: List of file paths to classify
            batch_size: Number of files to process per API call (must be >= 1)

        Returns:
            Dictionary mapping filenames to categories

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # A zero step makes range() raise an opaque error and a negative step
        # silently yields no batches at all, so reject both up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        all_results: Dict[str, str] = {}
        for start in range(0, len(file_list), batch_size):
            batch = file_list[start:start + batch_size]
            all_results.update(self._classify_batch(batch))
        return all_results

    def _classify_batch(self, file_list: List[str]) -> Dict[str, str]:
        """Classify a batch of files.

        Args:
            file_list: File paths to send to the model in a single request.

        Returns:
            Dictionary with exactly one entry per path in ``file_list``. If the
            request or the parsing of its response fails, every path maps to
            "skip".
        """
        if not file_list:
            # Nothing to classify; avoid a pointless API call.
            return {}

        prompt = self._build_prompt(file_list)

        try:
            schema = GeminiSchemas.file_classification()
            response_text = self.ai_manager.generate_content(
                prompt=prompt,
                temperature=AIManager.TEMPERATURE_PRECISE,
                max_tokens=AIManager.MAX_OUTPUT_TOKENS_MEDIUM,
                response_format="json",
                response_schema=schema,
            )
            return self._normalize_results(file_list, json.loads(response_text))
        except Exception as e:
            # Deliberate best effort: a failed batch must not abort the whole
            # run, so report the problem and leave these files untouched.
            print(f"Error classifying batch: {e}")
            return {f: "skip" for f in file_list}

    @staticmethod
    def _build_prompt(file_list: List[str]) -> str:
        """Build the few-shot classification prompt for one batch of paths."""
        return f"""You are a code modernization expert. Classify these files into categories.

CATEGORIES:
- modernize_high: Legacy patterns that need immediate update (Python 2, deprecated libs, security issues)
- modernize_low: Minor improvements needed (add type hints, optimize imports)
- skip: Already modern or non-code files

FEW-SHOT EXAMPLES:
1. utils/db.py (uses MySQLdb, string interpolation) -> modernize_high
2. config.py (hardcoded credentials) -> modernize_high
3. models/user.py (missing type hints) -> modernize_low
4. src/api/UserController.java (uses deprecated Vector, no generics) -> modernize_high
5. frontend/app.js (uses jQuery 1.x, inline event handlers) -> modernize_high
6. legacy_php/login.php (mysql_connect, no prepared statements) -> modernize_high
7. README.md -> skip
8. tests/test_api.py (uses unittest, modern Python 3) -> skip
9. package.json -> skip
10. .gitignore -> skip

FILES TO CLASSIFY:
{json.dumps(file_list, indent=2)}

Return JSON object with filename as key and category as value.
Example: {{"file1.py": "modernize_high", "file2.js": "skip"}}
"""

    @staticmethod
    def _normalize_results(file_list: List[str], raw: object) -> Dict[str, str]:
        """Turn a decoded model response into one valid category per file.

        Args:
            file_list: Paths that were sent to the model.
            raw: Decoded JSON response; expected to be an object keyed by path.

        Returns:
            Dictionary containing every path from ``file_list`` and nothing
            else. Paths the model omitted, and paths whose category is not one
            of ``VALID_CATEGORIES``, map to ``DEFAULT_CATEGORY``. Keys the
            model invented are dropped.

        Raises:
            ValueError: If ``raw`` is not a JSON object (``dict``).
        """
        if not isinstance(raw, dict):
            raise ValueError(
                f"expected a JSON object mapping filenames to categories, "
                f"got {type(raw).__name__}"
            )

        normalized: Dict[str, str] = {}
        for filename in file_list:
            category = raw.get(filename)
            # The isinstance check also guards against unhashable values such
            # as lists, which would otherwise invalidate the whole batch.
            if not isinstance(category, str) or category not in CodeClassifier.VALID_CATEGORIES:
                category = CodeClassifier.DEFAULT_CATEGORY
            normalized[filename] = category
        return normalized

    def get_statistics(self, classifications: Dict[str, str]) -> Dict[str, int]:
        """
        Get statistics about classifications.

        Args:
            classifications: Dictionary of file classifications

        Returns:
            Dictionary with counts per category plus a "total" entry holding
            the number of classified files. Values that are not a known
            category are included in "total" only.
        """
        stats = {category: 0 for category in CodeClassifier.VALID_CATEGORIES}

        for category in classifications.values():
            # Check against the category list rather than the stats dict so a
            # value of "total" can never inflate the total counter.
            if isinstance(category, str) and category in stats:
                stats[category] += 1

        # Added last so it is never mistaken for a category while counting.
        stats["total"] = len(classifications)
        return stats
|
|
|