File size: 4,108 Bytes
ec4aa90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""Code classification using AI."""

import json
from typing import Dict, List
import os
from dotenv import load_dotenv

from src.config import AIManager, GeminiSchemas

# Runs at import time, before any CodeClassifier is constructed. Presumably this
# loads settings (e.g. API credentials read by src.config) from a local .env
# file into the process environment -- NOTE(review): confirm against src.config.
load_dotenv()


class CodeClassifier:
    """Classifies code files into modernization categories using Gemini.

    File paths are sent to the model in batches together with a few-shot
    prompt. Every requested file ends up with exactly one of ``CATEGORIES``;
    anything the model omits or answers with an unknown label falls back to
    ``DEFAULT_CATEGORY``.
    """

    # Labels the model may assign, in the order get_statistics() reports them.
    CATEGORIES = ("modernize_high", "modernize_low", "skip")
    # Used when the model fails, omits a file, or returns an unknown label.
    DEFAULT_CATEGORY = "skip"

    def __init__(self):
        """Initialize the classifier with AI client."""
        # Use centralized AI manager
        self.ai_manager = AIManager()

    def classify_files(self, file_list: List[str], batch_size: int = 25) -> Dict[str, str]:
        """
        Classify files using Gemini with few-shot prompting.

        Args:
            file_list: List of file paths to classify
            batch_size: Number of files to process per API call (must be >= 1)

        Returns:
            Dictionary mapping each requested filename to its category

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # A negative step would make range() empty and silently classify
        # nothing; reject it the same way a zero step is rejected.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        all_results: Dict[str, str] = {}

        # Process in batches to avoid token limits
        for start in range(0, len(file_list), batch_size):
            batch = file_list[start:start + batch_size]
            all_results.update(self._classify_batch(batch))

        return all_results

    @staticmethod
    def _build_prompt(file_list: List[str]) -> str:
        """Build the few-shot classification prompt for one batch of files."""
        return f"""You are a code modernization expert. Classify these files into categories.

CATEGORIES:
- modernize_high: Legacy patterns that need immediate update (Python 2, deprecated libs, security issues)
- modernize_low: Minor improvements needed (add type hints, optimize imports)
- skip: Already modern or non-code files

FEW-SHOT EXAMPLES:
1. utils/db.py (uses MySQLdb, string interpolation) → modernize_high
2. config.py (hardcoded credentials) → modernize_high
3. models/user.py (missing type hints) → modernize_low
4. src/api/UserController.java (uses deprecated Vector, no generics) → modernize_high
5. frontend/app.js (uses jQuery 1.x, inline event handlers) → modernize_high
6. legacy_php/login.php (mysql_connect, no prepared statements) → modernize_high
7. README.md → skip
8. tests/test_api.py (uses unittest, modern Python 3) → skip
9. package.json → skip
10. .gitignore → skip

FILES TO CLASSIFY:
{json.dumps(file_list, indent=2)}

Return JSON object with filename as key and category as value.
Example: {{"file1.py": "modernize_high", "file2.js": "skip"}}
"""

    def _normalize_result(self, result, file_list: List[str]) -> Dict[str, str]:
        """Map every requested file to a valid category based on the model reply.

        Keys the model invented are dropped, files it omitted are filled in,
        and any value that is not one of ``CATEGORIES`` is replaced by
        ``DEFAULT_CATEGORY``.
        """
        if not isinstance(result, dict):
            # Valid JSON, but not the object the prompt asked for.
            print(f"Error classifying batch: expected a JSON object, got {type(result).__name__}")
            result = {}

        normalized: Dict[str, str] = {}
        for filename in file_list:
            category = result.get(filename)
            # The isinstance check also keeps unhashable JSON values (lists,
            # objects) away from the membership test.
            if not isinstance(category, str) or category not in self.CATEGORIES:
                category = self.DEFAULT_CATEGORY
            normalized[filename] = category
        return normalized

    def _classify_batch(self, file_list: List[str]) -> Dict[str, str]:
        """Classify a batch of files.

        Args:
            file_list: File paths to send in a single request

        Returns:
            Dictionary with one entry per path in ``file_list``. If the request
            or the JSON parsing fails, every file is mapped to
            ``DEFAULT_CATEGORY`` and the error is printed.
        """
        if not file_list:
            # Nothing to ask the model about.
            return {}

        prompt = self._build_prompt(file_list)

        try:
            # Use JSON schema for guaranteed structure
            schema = GeminiSchemas.file_classification()

            response_text = self.ai_manager.generate_content(
                prompt=prompt,
                temperature=AIManager.TEMPERATURE_PRECISE,
                max_tokens=AIManager.MAX_OUTPUT_TOKENS_MEDIUM,
                response_format="json",
                response_schema=schema
            )

            result = json.loads(response_text)
        except Exception as e:
            # Deliberate best-effort boundary: a failed batch must not abort
            # the whole run, so fall back to the default classification.
            print(f"Error classifying batch: {e}")
            return {f: self.DEFAULT_CATEGORY for f in file_list}

        return self._normalize_result(result, file_list)

    def get_statistics(self, classifications: Dict[str, str]) -> Dict[str, int]:
        """
        Get statistics about classifications.

        Args:
            classifications: Dictionary of file classifications

        Returns:
            Dictionary with counts per category plus a ``"total"`` entry equal
            to the number of classified files. Unknown categories count
            towards ``"total"`` only.
        """
        stats = {category: 0 for category in self.CATEGORIES}

        # "total" is added only after counting so that a stray category value
        # of "total" cannot be mistaken for a real category and inflate it.
        for category in classifications.values():
            if category in stats:
                stats[category] += 1

        stats["total"] = len(classifications)
        return stats