kasimali committed
Commit 8e263ff · verified · 1 Parent(s): ad18a48

Upload folder using huggingface_hub
Files changed (3):
  1. README.md +3 -6
  2. app.py +622 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,10 +1,7 @@
  ---
- title: Xls R1b
- emoji: ⚡
- colorFrom: pink
- colorTo: gray
+ title: XLS-R1B
+ emoji: 🚀
  sdk: static
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # XLS-R1B
app.py ADDED
@@ -0,0 +1,622 @@
+ # XLS-R1B
+
+ # ============================================================================
+ # CELL 1: SETUP AND INSTALLATION
+ # ============================================================================
+ import os
+ import warnings
+ warnings.filterwarnings('ignore')
+
+ print("🚀 MMS Language Identification Test (Final Verified Version)")
+ print("=" * 60)
+
+ # Mount Google Drive
+ from google.colab import drive
+
+ # Install and update necessary packages
+ print("📦 Installing and updating packages...")
+
+ print("✅ Setup complete! Please restart the runtime now to apply updates.")
+
+
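Cell 1 above imports google.colab.drive and announces a package install, but the mount and install commands themselves are not in the committed file. A minimal Colab sketch of those two steps follows; the package list is an assumption inferred from the imports used in later cells, not part of the commit.

# Sketch only, not part of the committed app.py.
from google.colab import drive
drive.mount('/content/drive')   # exposes /content/drive/MyDrive/... used by the paths below

# In a Colab cell the installs would typically be notebook magics, e.g.:
# !pip install -q -U transformers librosa scikit-learn pandas xlsxwriter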
+ # ============================================================================
+ # CELL 2: MODEL LOADING (Final Verified Version)
+ # ============================================================================
+ import torch
+ import librosa
+ import pandas as pd
+ import numpy as np
+ from datetime import datetime
+ from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
+ from sklearn.metrics import accuracy_score, classification_report
+
+ # --- Your Folder and Language Mappings ---
+ CUSTOM_FOLDER_MAPPING = {
+     'as': 'asm', 'bn': 'ben', 'br': 'brx', 'doi': 'dgo', 'en': 'eng',
+     'gu': 'guj', 'hi': 'hin', 'kn': 'kan', 'kok': 'kok', 'ks': 'kas',
+     'mai': 'mai', 'ml': 'mal', 'mni': 'mni', 'mr': 'mar', 'ne': 'nep',
+     'or': 'ory', 'pa': 'pan', 'sa': 'san', 'sat': 'sat', 'sd': 'snd',
+     'ta': 'tam', 'te': 'tel', 'ur': 'urd'
+ }
+ ISO_TO_FULL_NAME = {
+     'asm': 'Assamese', 'ben': 'Bengali', 'brx': 'Bodo', 'dgo': 'Dogri', 'eng': 'English',
+     'guj': 'Gujarati', 'hin': 'Hindi', 'kan': 'Kannada', 'kok': 'Konkani', 'kas': 'Kashmiri',
+     'mai': 'Maithili', 'mal': 'Malayalam', 'mni': 'Manipuri', 'mar': 'Marathi', 'nep': 'Nepali',
+     'ory': 'Odia', 'pan': 'Punjabi', 'san': 'Sanskrit', 'sat': 'Santali', 'snd': 'Sindhi',
+     'tam': 'Tamil', 'tel': 'Telugu', 'urd': 'Urdu'
+ }
+
+ # --- Update Your Paths ---
+ AUDIO_FOLDER = "/content/drive/MyDrive/Audio_files"  # <-- Update this
+ RESULTS_FOLDER = "/content/drive/MyDrive/mms_lid_results"
+ os.makedirs(RESULTS_FOLDER, exist_ok=True)
+
+ # --- Load Components Separately (The Fix) ---
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"🔧 Device: {device}")
+
+ MODEL_NAME = "facebook/mms-lid-256"
+
+ # 1. Load the feature extractor ONLY
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
+
+ # 2. Load the model for classification
+ model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(device)
+ model.eval()
+
+ print(f"✅ MMS LID model and feature extractor loaded successfully: {MODEL_NAME}")
+
+
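A quick sanity check after loading the checkpoint is to inspect its label map, which predict_language_mms below relies on; a small illustrative snippet, not part of the committed file:

# Illustrative check: the checkpoint exposes its language labels via id2label.
print(len(model.config.id2label))                   # 256 labels for facebook/mms-lid-256
print(sorted(model.config.id2label.values())[:10])  # a few of the language codes it can emit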
+ # ============================================================================
+ # CELL 3: AUDIO PROCESSING AND PREDICTION
+ # ============================================================================
+ def load_audio_raw(file_path):
+     try:
+         audio, sr = librosa.load(file_path, sr=16000, mono=True)
+         duration = len(audio) / 16000
+         return audio, duration
+     except Exception as e:
+         print(f"Error loading {file_path}: {e}")
+         return None, 0
+
+ def predict_language_mms(audio_array):
+     try:
+         # Use the feature_extractor directly
+         inputs = feature_extractor(audio_array, sampling_rate=16000, return_tensors="pt")
+         inputs = {k: v.to(device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             outputs = model(**inputs)
+
+         logits = outputs.logits
+         pred_idx = torch.argmax(logits, dim=-1).item()
+         pred_lang_code = model.config.id2label[pred_idx]
+
+         probabilities = torch.softmax(logits, dim=-1)[0]
+         confidence = probabilities[pred_idx].item()
+
+         return pred_lang_code, confidence
+
+     except Exception as e:
+         return "error", 0.0
+
+ def find_audio_files(base_path):
+     audio_files = []
+     for root, _, files in os.walk(base_path):
+         folder_code = os.path.basename(root).lower()
+         if folder_code in CUSTOM_FOLDER_MAPPING:
+             ground_truth_iso = CUSTOM_FOLDER_MAPPING[folder_code]
+             for file in files:
+                 if file.lower().endswith(('.wav', '.mp3', '.m4a', '.flac', '.ogg')):
+                     audio_files.append({
+                         "file_path": os.path.join(root, file),
+                         "filename": file,
+                         "ground_truth": ground_truth_iso
+                     })
+     return audio_files
+
+ print("✅ Functions are ready!")
+
+
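Before the full sweep in Cell 4, the two helpers can be spot-checked on a single clip; the path below is hypothetical and only shows the intended call pattern:

# Spot check on one file; "hi/example.wav" is a hypothetical path under AUDIO_FOLDER.
sample_path = os.path.join(AUDIO_FOLDER, "hi", "example.wav")
audio, duration = load_audio_raw(sample_path)
if audio is not None:
    code, conf = predict_language_mms(audio)
    print(f"{duration:.1f}s -> {ISO_TO_FULL_NAME.get(code, code)} ({conf:.3f})")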
+ # ============================================================================
+ # CELL 4: PROCESS ALL FILES AND GENERATE REPORT
+ # ============================================================================
+ def run_full_analysis():
+     print("🚀 Processing FULL dataset with MMS LID Model...")
+
+     audio_files = find_audio_files(AUDIO_FOLDER)
+     if not audio_files:
+         print("❌ No audio files found. Please check your AUDIO_FOLDER path.")
+         return
+
+     total_files = len(audio_files)
+     results = []
+
+     print(f"🔄 Processing {total_files} files...")
+     print("-" * 50)
+
+     for i, file_info in enumerate(audio_files):
+         if (i + 1) % 50 == 0:
+             print(f"Progress: {i+1}/{total_files} ({(i+1)/total_files*100:.1f}%)")
+
+         audio, duration = load_audio_raw(str(file_info['file_path']))
+         if audio is None:
+             result = {**file_info, "predicted_language": "load_error", "confidence": 0.0, "duration": 0.0, "is_short_file": False}
+         else:
+             pred_lang_code, confidence = predict_language_mms(audio)
+             is_short = duration < 3.0
+             result = {**file_info, "predicted_language": pred_lang_code, "confidence": confidence, "duration": duration, "is_short_file": is_short}
+
+             if is_short and pred_lang_code != "error":
+                 print(f"⚠️ SHORT ({duration:.1f}s): {file_info['filename']} -> {ISO_TO_FULL_NAME.get(pred_lang_code, pred_lang_code)} ({confidence:.3f})")
+
+         results.append(result)
+
+     results_df = pd.DataFrame(results)
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     csv_path = f"{RESULTS_FOLDER}/mms_lid_results_{timestamp}.csv"
+     results_df.to_csv(csv_path, index=False)
+     print(f"\n✅ Processing complete! Results saved to: {csv_path}")
+
+     # --- Detailed Analysis ---
+     print("\n" + "=" * 60)
+     print("📊 MMS LID MODEL - DETAILED ANALYSIS")
+     print("=" * 60)
+
+     valid_data = results_df[(results_df['predicted_language'] != 'error') & (results_df['predicted_language'] != 'load_error')]
+
+     if len(valid_data) > 0:
+         overall_accuracy = accuracy_score(valid_data['ground_truth'], valid_data['predicted_language'])
+         print(f"\n🎯 OVERALL MODEL ACCURACY: {overall_accuracy:.2%}")
+
+         print(f"\n📋 LANGUAGE-WISE ACCURACY:")
+         report_true = [ISO_TO_FULL_NAME.get(code, code) for code in valid_data['ground_truth']]
+         report_pred = [ISO_TO_FULL_NAME.get(code, code) for code in valid_data['predicted_language']]
+         print(classification_report(report_true, report_pred, zero_division=0))
+
+     short_files = results_df[results_df.get('is_short_file', False) == True]
+     valid_short = short_files[(short_files['predicted_language'] != 'error') & (short_files['predicted_language'] != 'load_error')]
+
+     print(f"\n⚠️ SHORT FILES ANALYSIS (<3 seconds):")
+     print(f"Total short files: {len(short_files)}")
+     if len(valid_short) > 0:
+         avg_conf = valid_short['confidence'].mean()
+         print(f"Average confidence for short files: {avg_conf:.3f}")
+
+     print("\n" + "=" * 60)
+     print("🏁 ANALYSIS COMPLETE")
+
+     # Return the results so the following cells can reuse them as 'full_results_df'
+     return results_df
+
+ # Run the full analysis
+ full_results_df = run_full_analysis()
+
+
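For the cells below that read this CSV back, it helps to note which columns run_full_analysis writes; a quick way to confirm them after a run:

# Each row of mms_lid_results_<timestamp>.csv carries:
#   file_path, filename, ground_truth, predicted_language, confidence, duration, is_short_file
if full_results_df is not None:
    print(list(full_results_df.columns))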
+ # ============================================================================
+ # CELL 5: GENERATE FILTERED EXCEL REPORT
+ # ============================================================================
+ import pandas as pd
+ from sklearn.metrics import accuracy_score
+
+ # Install the package needed to write Excel files
+
+ def generate_filtered_excel_report(df, folder_path):
+     """
+     Generates an Excel report with overall and per-language accuracy,
+     excluding files shorter than 3 seconds from the accuracy calculation.
+     """
+     if df is None or df.empty:
+         print("❌ No results DataFrame found. Please run the analysis in Cell 4 first.")
+         return
+
+     print("📊 Generating filtered accuracy report...")
+
+     # --- 1. Filter the DataFrame ---
+     # Exclude errors and files shorter than 3 seconds
+     accuracy_df = df[
+         (df['duration'] >= 3) &
+         (df['predicted_language'] != 'error') &
+         (df['predicted_language'] != 'load_error')
+     ].copy()
+
+     print(f"Total files in accuracy calculation (>= 3s): {len(accuracy_df)} out of {len(df)}")
+
+     # --- 2. Calculate Overall Accuracy ---
+     if not accuracy_df.empty:
+         overall_accuracy = accuracy_score(accuracy_df['ground_truth'], accuracy_df['predicted_language'])
+         summary_df = pd.DataFrame([{'Overall Accuracy (>= 3s)': f"{overall_accuracy:.2%}"}])
+     else:
+         summary_df = pd.DataFrame([{'Overall Accuracy (>= 3s)': "N/A"}])
+
+     # --- 3. Calculate Per-Language Accuracy ---
+     per_language_stats = []
+     if not accuracy_df.empty:
+         # Use full names for the report
+         accuracy_df['ground_truth_name'] = accuracy_df['ground_truth'].map(ISO_TO_FULL_NAME)
+         accuracy_df['predicted_language_name'] = accuracy_df['predicted_language'].map(ISO_TO_FULL_NAME)
+
+         for lang_code, lang_name in sorted(ISO_TO_FULL_NAME.items()):
+             lang_df = accuracy_df[accuracy_df['ground_truth'] == lang_code]
+             if not lang_df.empty:
+                 lang_accuracy = accuracy_score(lang_df['ground_truth'], lang_df['predicted_language'])
+                 per_language_stats.append({
+                     'Language': lang_name,
+                     'Accuracy': f"{lang_accuracy:.2%}",
+                     'File Count (>= 3s)': len(lang_df)
+                 })
+
+     per_language_df = pd.DataFrame(per_language_stats)
+
+     # --- 4. Save to Excel ---
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     report_path = os.path.join(folder_path, f"filtered_accuracy_report_{timestamp}.xlsx")
+
+     with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
+         summary_df.to_excel(writer, sheet_name='Summary', index=False)
+         per_language_df.to_excel(writer, sheet_name='Per_Language_Accuracy', index=False)
+         df.to_excel(writer, sheet_name='All_Results', index=False)
+         accuracy_df.to_excel(writer, sheet_name='Filtered_Results (for accuracy)', index=False)
+
+         # Auto-adjust column widths for readability (computed from the in-memory
+         # DataFrames, since the file cannot be read back before the writer is closed)
+         sheet_frames = {
+             'Summary': summary_df,
+             'Per_Language_Accuracy': per_language_df,
+             'All_Results': df,
+             'Filtered_Results (for accuracy)': accuracy_df,
+         }
+         for sheet_name, sheet_df in sheet_frames.items():
+             worksheet = writer.sheets[sheet_name]
+             for idx, col in enumerate(sheet_df.columns):
+                 values_len = sheet_df[col].astype(str).map(len).max() if not sheet_df.empty else 0
+                 max_len = int(max(values_len, len(str(col)))) + 2
+                 worksheet.set_column(idx, idx, max_len)
+
+     print(f"\n✅ Filtered Excel report saved successfully to: {report_path}")
+
+ # Run the function to generate the report
+ # This assumes 'full_results_df' was created in the previous cell
+ if 'full_results_df' in locals():
+     generate_filtered_excel_report(full_results_df, RESULTS_FOLDER)
+ else:
+     print("❌ 'full_results_df' not found. Please run the previous cell to process the dataset first.")
+
+
+
+
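If the in-memory DataFrame from Cell 4 is gone (for example after a runtime restart), the same report can be produced from a saved CSV, as the next cell does; a small sketch using the timestamped file referenced below:

# Sketch: rebuild the input from a previously saved CSV before generating the report.
saved_csv = os.path.join(RESULTS_FOLDER, "mms_lid_results_20250925_072344.csv")
full_results_df = pd.read_csv(saved_csv)
generate_filtered_excel_report(full_results_df, RESULTS_FOLDER)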
+ # ============================================================================
+ # CELL 5: LOAD EXISTING RESULTS AND EXTRACT FEATURES
+ # ============================================================================
+ import pandas as pd
+ import numpy as np
+ import librosa
+ import os
+
+ # --- 1. Load Your Existing CSV File ---
+ # ⚠️ PASTE THE FULL PATH to your CSV file here
+ csv_path = "/content/drive/MyDrive/mms_lid_results/mms_lid_results_20250925_072344.csv"
+
+ try:
+     full_results_df = pd.read_csv(csv_path)
+     print(f"✅ Successfully loaded {len(full_results_df)} records from {csv_path}")
+ except FileNotFoundError:
+     print(f"❌ ERROR: File not found at '{csv_path}'. Please check the path and try again.")
+     # Stop execution if the file is not found
+     raise
+
+ # --- 2. In-Depth Feature Extraction ---
+ print("\n🚀 Starting in-depth feature extraction...")
+
+ def extract_audio_features(row):
+     """Calculates SNR proxy and silence ratio for a given audio file."""
+     try:
+         audio, sr = librosa.load(row['file_path'], sr=16000, mono=True)
+
+         # Calculate RMS energy for silence detection
+         rms = librosa.feature.rms(y=audio, frame_length=2048, hop_length=512)[0]
+
+         # Silence Ratio: Percentage of frames below 20% of max energy
+         silence_threshold = 0.2 * np.max(rms) if rms.size > 0 else 0
+         silence_ratio = np.mean(rms < silence_threshold) if rms.size > 0 else 1.0
+
+         # SNR Proxy: Ratio of energy in loud parts vs. quiet parts
+         loud_rms = np.mean(rms[rms >= silence_threshold]) if np.any(rms >= silence_threshold) else 0
+         quiet_rms = np.mean(rms[rms < silence_threshold]) if np.any(rms < silence_threshold) else 0
+         snr_proxy = 20 * np.log10(loud_rms / (quiet_rms + 1e-7) + 1e-7) if quiet_rms > 0 else 50.0
+
+         return pd.Series([snr_proxy, silence_ratio])
+
+     except Exception as e:
+         return pd.Series([np.nan, np.nan])
+
+ # Apply the feature extraction to each row
+ print("Calculating SNR and silence ratios for all files... (This may take a few minutes)")
+ features_df = full_results_df.apply(extract_audio_features, axis=1)
+ features_df.columns = ['snr_proxy', 'silence_ratio']
+
+ # Combine the new features with your existing results
+ analysis_df = pd.concat([full_results_df, features_df], axis=1)
+
+ print("✅ Feature extraction complete!")
+
+
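To make the two quality metrics concrete, here is a tiny worked example on made-up RMS values, mirroring the thresholding used in extract_audio_features:

# Toy numbers only: five RMS frames, threshold = 20% of the maximum.
rms_demo = np.array([0.02, 0.03, 0.5, 0.6, 0.55])
thr = 0.2 * rms_demo.max()                                      # 0.12
silence_ratio_demo = np.mean(rms_demo < thr)                    # 2 of 5 frames -> 0.4
snr_demo = 20 * np.log10(rms_demo[rms_demo >= thr].mean()
                         / (rms_demo[rms_demo < thr].mean() + 1e-7) + 1e-7)
print(silence_ratio_demo, round(snr_demo, 1))                   # 0.4 and roughly 26.8 dB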
+ # ============================================================================
+ # CELL 6: COMPREHENSIVE ANALYSIS AND EXCEL REPORT
+ # ============================================================================
+ import pandas as pd
+ from sklearn.metrics import accuracy_score, confusion_matrix
+
+ # Install xlsxwriter if not already installed
+
+ def generate_comprehensive_report(df, folder_path):
+     """
+     Generates a comprehensive Excel report with multiple analysis sheets.
+     """
+     if df is None or df.empty:
+         print("❌ 'analysis_df' with features not found. Please run the feature extraction cell first.")
+         return
+
+     print("📊 Generating comprehensive analysis report...")
+
+     # --- Create a new Excel writer ---
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     report_path = os.path.join(folder_path, f"comprehensive_analysis_report_{timestamp}.xlsx")
+     writer = pd.ExcelWriter(report_path, engine='xlsxwriter')
+
+     # --- Sheet 1: All Results with Features ---
+     df.to_excel(writer, sheet_name='Results_with_Features', index=False)
+
+     # Filter for valid predictions for all subsequent analyses
+     valid_df = df[
+         (df['predicted_language'] != 'error') &
+         (df['predicted_language'] != 'load_error')
+     ].copy()
+
+     # --- Sheet 2 & 3: Calibration Analysis ---
+     n_bins = 10
+     bins = np.linspace(0, 1, n_bins + 1)
+     valid_df['confidence_bin'] = pd.cut(valid_df['confidence'], bins=bins, include_lowest=True, right=True)
+
+     # Ensure all bins are present for groupby
+     valid_df['confidence_bin'] = valid_df['confidence_bin'].astype(str)
+
+     calib_data = valid_df.groupby('confidence_bin').apply(lambda x: pd.Series({
+         'bin_accuracy': accuracy_score(x['ground_truth'], x['predicted_language']),
+         'avg_confidence': x['confidence'].mean(),
+         'sample_count': len(x)
+     })).reset_index()
+
+     overall_ece = np.sum(np.abs(calib_data['bin_accuracy'] - calib_data['avg_confidence']) * (calib_data['sample_count'] / len(valid_df)))
+
+     calibration_overview_df = pd.DataFrame([{'Expected Calibration Error (ECE)': f"{overall_ece:.4f}"}])
+     calibration_overview_df.to_excel(writer, sheet_name='Calibration_Overview', index=False)
+     calib_data.to_excel(writer, sheet_name='Calibration_Bins', index=False)
+
+     # --- Sheets 4, 5, 6: Accuracy vs. Features ---
+     def get_accuracy_slice(dataframe, column, bins):
+         dataframe[f'{column}_bin'] = pd.cut(dataframe[column], bins=bins, include_lowest=True)
+         return dataframe.groupby(f'{column}_bin', observed=False).apply(lambda x: accuracy_score(x['ground_truth'], x['predicted_language']) if not x.empty else 0).reset_index(name='accuracy')
+
+     acc_vs_duration = get_accuracy_slice(valid_df.copy(), 'duration', bins=[0, 1, 2, 3, 5, 10, np.inf])
+     acc_vs_snr = get_accuracy_slice(valid_df.copy(), 'snr_proxy', bins=[-np.inf, 0, 10, 20, 30, 40, np.inf])
+     acc_vs_silence = get_accuracy_slice(valid_df.copy(), 'silence_ratio', bins=[-0.01, 0.1, 0.3, 0.5, 0.7, 1.0])
+
+     acc_vs_duration.to_excel(writer, sheet_name='Acc_vs_Duration', index=False)
+     acc_vs_snr.to_excel(writer, sheet_name='Acc_vs_SNR', index=False)
+     acc_vs_silence.to_excel(writer, sheet_name='Acc_vs_Silence', index=False)
+
+     # --- Sheet 7 & 8: Confusion Matrix and Asymmetry ---
+     labels = sorted(list(set(valid_df['ground_truth'].unique()) | set(valid_df['predicted_language'].unique())))
+     cm = confusion_matrix(valid_df['ground_truth'], valid_df['predicted_language'], labels=labels)
+     cm_df = pd.DataFrame(cm, index=[ISO_TO_FULL_NAME.get(l, l) for l in labels], columns=[ISO_TO_FULL_NAME.get(l, l) for l in labels])
+
+     confusion_asymmetry_df = cm_df.subtract(cm_df.T)
+
+     cm_df.to_excel(writer, sheet_name='Confusion_Matrix')
+     confusion_asymmetry_df.to_excel(writer, sheet_name='Confusion_Asymmetry')
+
+     # --- Sheet 9 & 10: Hard Cases Analysis ---
+     hard_misclassifications = valid_df[
+         (valid_df['ground_truth'] != valid_df['predicted_language']) &
+         (valid_df['confidence'] > 0.8)
+     ].sort_values('confidence', ascending=False)
+
+     ambiguous_correct = valid_df[
+         (valid_df['ground_truth'] == valid_df['predicted_language']) &
+         (valid_df['confidence'] < 0.5)
+     ].sort_values('confidence', ascending=True)
+
+     hard_misclassifications.to_excel(writer, sheet_name='Hard_Misclassifications', index=False)
+     ambiguous_correct.to_excel(writer, sheet_name='Ambiguous_Correct', index=False)
+
+     # --- Save the Excel file ---
+     writer.close()
+     print(f"\n✅ Comprehensive analysis report saved successfully to: {report_path}")
+
+
+ # Run the function to generate the final report
+ if 'analysis_df' in locals():
+     generate_comprehensive_report(analysis_df, RESULTS_FOLDER)
+ else:
+     print("❌ 'analysis_df' not found. Please run the feature extraction in the previous cell first.")
+
+
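For reference, the Expected Calibration Error written to the Calibration_Overview sheet is the sample-weighted gap between each confidence bin's accuracy and its mean confidence; a toy computation with invented numbers:

# Two invented bins: (sample_count, bin_accuracy, avg_confidence)
demo_bins = [(60, 0.90, 0.95), (40, 0.50, 0.65)]
N = sum(n for n, _, _ in demo_bins)
ece_demo = sum(n / N * abs(acc - conf) for n, acc, conf in demo_bins)
print(ece_demo)  # 0.6*0.05 + 0.4*0.15 = 0.09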
+ # ============================================================================
+ # CELL 6: COMPREHENSIVE ANALYSIS AND EXCEL REPORT (UNIFIED)
+ # ============================================================================
+ import pandas as pd
+ from sklearn.metrics import accuracy_score, confusion_matrix
+
+ # Install xlsxwriter if not already installed
+
+ def generate_comprehensive_report(df, folder_path):
+     """
+     Generates a comprehensive Excel report with multiple analysis sheets.
+     """
+     if df is None or df.empty:
+         print("❌ The 'analysis_df' DataFrame is empty. Please check the previous cell.")
+         return
+
+     print("📊 Generating comprehensive analysis report...")
+
+     # --- Create a new Excel writer ---
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     report_path = os.path.join(folder_path, f"comprehensive_analysis_report_{timestamp}.xlsx")
+
+     with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
+         # --- Sheet 1: All Results with Features ---
+         df.to_excel(writer, sheet_name='Results_with_Features', index=False)
+
+         # Filter for valid predictions for all subsequent analyses
+         valid_df = df[
+             (df['predicted_language'] != 'error') &
+             (df['predicted_language'] != 'load_error')
+         ].copy()
+
+         # --- Sheet 2 & 3: Calibration Analysis ---
+         n_bins = 10
+         bins = np.linspace(0, 1, n_bins + 1)
+         valid_df['confidence_bin'] = pd.cut(valid_df['confidence'], bins=bins, include_lowest=True, right=True)
+         valid_df['confidence_bin'] = valid_df['confidence_bin'].astype(str)
+
+         calib_data = valid_df.groupby('confidence_bin', observed=False).apply(lambda x: pd.Series({
+             'bin_accuracy': accuracy_score(x['ground_truth'], x['predicted_language']) if not x.empty else 0,
+             'avg_confidence': x['confidence'].mean() if not x.empty else 0,
+             'sample_count': len(x)
+         })).reset_index()
+
+         overall_ece = np.sum(np.abs(calib_data['bin_accuracy'] - calib_data['avg_confidence']) * (calib_data['sample_count'] / len(valid_df)))
+
+         calibration_overview_df = pd.DataFrame([{'Expected Calibration Error (ECE)': f"{overall_ece:.4f}"}])
+         calibration_overview_df.to_excel(writer, sheet_name='Calibration_Overview', index=False)
+         calib_data.to_excel(writer, sheet_name='Calibration_Bins', index=False)
+
+         # --- Sheets 4, 5, 6: Accuracy vs. Features ---
+         def get_accuracy_slice(dataframe, column, bins):
+             dataframe[f'{column}_bin'] = pd.cut(dataframe[column], bins=bins, include_lowest=True)
+             return dataframe.groupby(f'{column}_bin', observed=False).apply(lambda x: accuracy_score(x['ground_truth'], x['predicted_language']) if not x.empty else 0).reset_index(name='accuracy')
+
+         acc_vs_duration = get_accuracy_slice(valid_df.copy(), 'duration', bins=[0, 1, 2, 3, 5, 10, np.inf])
+         acc_vs_snr = get_accuracy_slice(valid_df.copy(), 'snr_proxy', bins=[-np.inf, 0, 10, 20, 30, 40, np.inf])
+         acc_vs_silence = get_accuracy_slice(valid_df.copy(), 'silence_ratio', bins=[-0.01, 0.1, 0.3, 0.5, 0.7, 1.0])
+
+         acc_vs_duration.to_excel(writer, sheet_name='Acc_vs_Duration', index=False)
+         acc_vs_snr.to_excel(writer, sheet_name='Acc_vs_SNR', index=False)
+         acc_vs_silence.to_excel(writer, sheet_name='Acc_vs_Silence', index=False)
+
+         # --- Sheet 7 & 8: Confusion Matrix and Asymmetry ---
+         labels = sorted(list(set(valid_df['ground_truth'].unique()) | set(valid_df['predicted_language'].unique())))
+         cm = confusion_matrix(valid_df['ground_truth'], valid_df['predicted_language'], labels=labels)
+         cm_df = pd.DataFrame(cm, index=[ISO_TO_FULL_NAME.get(l, l) for l in labels], columns=[ISO_TO_FULL_NAME.get(l, l) for l in labels])
+
+         confusion_asymmetry_df = cm_df.subtract(cm_df.T)
+
+         cm_df.to_excel(writer, sheet_name='Confusion_Matrix')
+         confusion_asymmetry_df.to_excel(writer, sheet_name='Confusion_Asymmetry')
+
+         # --- Sheet 9 & 10: Hard Cases Analysis ---
+         hard_misclassifications = valid_df[
+             (valid_df['ground_truth'] != valid_df['predicted_language']) &
+             (valid_df['confidence'] > 0.8)
+         ].sort_values('confidence', ascending=False)
+
+         ambiguous_correct = valid_df[
+             (valid_df['ground_truth'] == valid_df['predicted_language']) &
+             (valid_df['confidence'] < 0.5)
+         ].sort_values('confidence', ascending=True)
+
+         hard_misclassifications.to_excel(writer, sheet_name='Hard_Misclassifications', index=False)
+         ambiguous_correct.to_excel(writer, sheet_name='Ambiguous_Correct', index=False)
+
+     print(f"\n✅ Comprehensive analysis report saved successfully to: {report_path}")
+
+ # Run the function to generate the final report
+ # This will now work because 'analysis_df' was created in the cell right above
+ if 'analysis_df' in locals():
+     generate_comprehensive_report(analysis_df, RESULTS_FOLDER)
+ else:
+     print("❌ 'analysis_df' not found. Please re-run the previous cell to load and process the data.")
+
+
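The Confusion_Asymmetry sheet is easiest to read with a toy example; a positive entry means the row language is mistaken for the column language more often than the reverse:

# Made-up 2x2 counts purely for illustration.
cm_demo = pd.DataFrame([[90, 12], [3, 85]], index=['Hindi', 'Urdu'], columns=['Hindi', 'Urdu'])
print(cm_demo.subtract(cm_demo.T))
# The (Hindi, Urdu) cell is 12 - 3 = +9: Hindi clips are predicted as Urdu
# far more often than Urdu clips are predicted as Hindi.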
+ # ============================================================================
+ # FINAL ANALYSIS CELL: NORMALIZATION AND DUAL ACCURACY REPORTS
+ # ============================================================================
+ import pandas as pd
+ import numpy as np
+ from sklearn.metrics import accuracy_score, classification_report
+ import os
+
+ # Install xlsxwriter for Excel reporting
+
+ # --- 1. Load Your Existing CSV File ---
+ # ⚠️ PASTE THE FULL PATH to your most recent CSV file here
+ csv_path = "/content/drive/MyDrive/mms_lid_results/mms_lid_results_20250925_072344.csv"
+
+ try:
+     results_df = pd.read_csv(csv_path)
+     print(f"✅ Successfully loaded {len(results_df)} records from {csv_path}")
+ except FileNotFoundError:
+     print(f"❌ ERROR: File not found at '{csv_path}'. Please check the path and try again.")
+     raise
+
+ # --- 2. Define the Comprehensive Normalization Mapping ---
+ # This dictionary will standardize all known language code variations.
+ NORMALIZATION_MAPPING = {
+     # MMS model's 3-letter codes (prediction) to your 2-letter folder names (ground truth)
+     'asm': 'as', 'ben': 'bn', 'brx': 'br', 'dgo': 'doi', 'eng': 'en',
+     'guj': 'gu', 'hin': 'hi', 'kan': 'kn', 'kok': 'kok', 'kas': 'ks',
+     'mai': 'mai', 'mal': 'ml', 'mni': 'mni', 'mar': 'mr', 'nep': 'ne',
+     'ory': 'or', 'pan': 'pa', 'san': 'sa', 'sat': 'sat', 'snd': 'sd',
+     'tam': 'ta', 'tel': 'te', 'urd': 'ur',
+     # Crucial fix for Nepali
+     'npi': 'ne'
+ }
+
+ # --- 3. Apply Normalization ---
+ print("\nApplying comprehensive normalization to language codes...")
+ results_df['normalized_prediction'] = results_df['predicted_language'].map(NORMALIZATION_MAPPING)
+ # Fill any unmapped predictions with a placeholder to mark them as incorrect
+ results_df['normalized_prediction'] = results_df['normalized_prediction'].fillna('unknown')
+
+ # --- 4. Define the Analysis Function ---
+ def generate_accuracy_report(df, report_title):
+     """Calculates and returns overall and per-language accuracy DataFrames."""
+     print(f"\n--- Generating Report: {report_title} ---")
+
+     # Filter for valid predictions (where normalization resulted in a known language)
+     valid_df = df[df['normalized_prediction'] != 'unknown'].copy()
+     print(f"Calculating accuracy on {len(valid_df)} valid predictions.")
+
+     if valid_df.empty:
+         print("No valid data to report on.")
+         return pd.DataFrame([{'Overall Accuracy': 'N/A'}]), pd.DataFrame()
+
+     # Calculate Overall Accuracy
+     overall_accuracy = accuracy_score(valid_df['ground_truth'], valid_df['normalized_prediction'])
+     summary_df = pd.DataFrame([{'Overall Accuracy': f"{overall_accuracy:.2%}"}])
+     print(f"Overall Accuracy: {overall_accuracy:.2%}")
+
+     # Calculate Per-Language Accuracy
+     report_dict = classification_report(valid_df['ground_truth'], valid_df['normalized_prediction'], output_dict=True, zero_division=0)
+     per_language_df = pd.DataFrame(report_dict).transpose().reset_index().rename(columns={'index': 'Language'})
+
+     # Keep only the rows for actual languages, not the summary rows
+     per_language_df = per_language_df[per_language_df['Language'].isin(valid_df['ground_truth'].unique())]
+
+     return summary_df, per_language_df
+
+ # --- 5. Generate Both Reports ---
+ # Report 1: Including ALL files
+ all_files_summary_df, all_files_per_lang_df = generate_accuracy_report(results_df, "All Audio Files")
+
+ # Report 2: Excluding files < 3 seconds
+ df_filtered = results_df[results_df['duration'] >= 3].copy()
+ filtered_summary_df, filtered_per_lang_df = generate_accuracy_report(df_filtered, "Audio Files >= 3 Seconds")
+
+ # --- 6. Save Everything to a Single Excel File ---
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ report_path = os.path.join(os.path.dirname(csv_path), f"final_corrected_analysis_{timestamp}.xlsx")
+
+ print(f"\n💾 Saving final corrected analysis to: {report_path}")
+
+ with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
+     all_files_summary_df.to_excel(writer, sheet_name='Overall_Accuracy_ALL_Files', index=False)
+     all_files_per_lang_df.to_excel(writer, sheet_name='Per_Lang_Accuracy_ALL_Files', index=False)
+     filtered_summary_df.to_excel(writer, sheet_name='Overall_Accuracy_>=3_Sec', index=False)
+     filtered_per_lang_df.to_excel(writer, sheet_name='Per_Lang_Accuracy_>=3_Sec', index=False)
+     results_df.to_excel(writer, sheet_name='Raw_Normalized_Results', index=False)
+
+ print("✅ Analysis complete. All reports saved.")
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ numpy
+ pandas
+ torch
+ transformers
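Note that app.py above also imports librosa and scikit-learn and writes its Excel reports through the xlsxwriter engine, so a fuller dependency list for this Space would plausibly be (an assumption based on those imports, not part of the commit):

numpy
pandas
torch
transformers
librosa
scikit-learn
xlsxwriter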