kasimali committed
Commit 8e263ff · verified · 1 Parent(s): ad18a48

Upload folder using huggingface_hub
Files changed (3):
  1. README.md +3 -6
  2. app.py +622 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,10 +1,7 @@
  ---
- title: Xls R1b
- emoji: ⚡
- colorFrom: pink
- colorTo: gray
+ title: XLS-R1B
+ emoji: 🚀
  sdk: static
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # XLS-R1B
app.py ADDED
@@ -0,0 +1,622 @@
+ # XLS-R1B
+
+ # ============================================================================
+ # CELL 1: SETUP AND INSTALLATION
+ # ============================================================================
+ import os
+ import warnings
+ warnings.filterwarnings('ignore')
+
+ print("🚀 MMS Language Identification Test (Final Verified Version)")
+ print("=" * 60)
+
+ # Mount Google Drive
+ from google.colab import drive
+
+ # Install and update necessary packages
+ print("📦 Installing and updating packages...")
+
+ print("✅ Setup complete! Please restart the runtime now to apply updates.")
+
+
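Cell 1 above imports google.colab.drive and announces a package install, but the mount and install commands themselves are not in the committed file. A minimal Colab sketch of those two steps follows; the package list is an assumption inferred from the imports used in later cells, not part of the commit.

# Sketch only, not part of the committed app.py.
from google.colab import drive
drive.mount('/content/drive')   # exposes /content/drive/MyDrive/... used by the paths below

# In a Colab cell the installs would typically be notebook magics, e.g.:
# !pip install -q -U transformers librosa scikit-learn pandas xlsxwriter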
+ # ============================================================================
+ # CELL 2: MODEL LOADING (Final Verified Version)
+ # ============================================================================
+ import torch
+ import librosa
+ import pandas as pd
+ import numpy as np
+ from datetime import datetime
+ from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
+ from sklearn.metrics import accuracy_score, classification_report
+
+ # --- Your Folder and Language Mappings ---
+ CUSTOM_FOLDER_MAPPING = {
+     'as': 'asm', 'bn': 'ben', 'br': 'brx', 'doi': 'dgo', 'en': 'eng',
+     'gu': 'guj', 'hi': 'hin', 'kn': 'kan', 'kok': 'kok', 'ks': 'kas',
+     'mai': 'mai', 'ml': 'mal', 'mni': 'mni', 'mr': 'mar', 'ne': 'nep',
+     'or': 'ory', 'pa': 'pan', 'sa': 'san', 'sat': 'sat', 'sd': 'snd',
+     'ta': 'tam', 'te': 'tel', 'ur': 'urd'
+ }
+ ISO_TO_FULL_NAME = {
+     'asm': 'Assamese', 'ben': 'Bengali', 'brx': 'Bodo', 'dgo': 'Dogri', 'eng': 'English',
+     'guj': 'Gujarati', 'hin': 'Hindi', 'kan': 'Kannada', 'kok': 'Konkani', 'kas': 'Kashmiri',
+     'mai': 'Maithili', 'mal': 'Malayalam', 'mni': 'Manipuri', 'mar': 'Marathi', 'nep': 'Nepali',
+     'ory': 'Odia', 'pan': 'Punjabi', 'san': 'Sanskrit', 'sat': 'Santali', 'snd': 'Sindhi',
+     'tam': 'Tamil', 'tel': 'Telugu', 'urd': 'Urdu'
+ }
+
+ # --- Update Your Paths ---
+ AUDIO_FOLDER = "/content/drive/MyDrive/Audio_files"  # <-- Update this
+ RESULTS_FOLDER = "/content/drive/MyDrive/mms_lid_results"
+ os.makedirs(RESULTS_FOLDER, exist_ok=True)
+
+ # --- Load Components Separately (The Fix) ---
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"🔧 Device: {device}")
+
+ MODEL_NAME = "facebook/mms-lid-256"
+
+ # 1. Load the feature extractor ONLY
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
+
+ # 2. Load the model for classification
+ model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(device)
+ model.eval()
+
+ print(f"✅ MMS LID model and feature extractor loaded successfully: {MODEL_NAME}")
+
+
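A quick sanity check after loading the checkpoint is to inspect its label map, which predict_language_mms below relies on; a small illustrative snippet, not part of the committed file:

# Illustrative check: the checkpoint exposes its language labels via id2label.
print(len(model.config.id2label))                   # 256 labels for facebook/mms-lid-256
print(sorted(model.config.id2label.values())[:10])  # a few of the language codes it can emit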
+ # ============================================================================
+ # CELL 3: AUDIO PROCESSING AND PREDICTION
+ # ============================================================================
+ def load_audio_raw(file_path):
+     try:
+         audio, sr = librosa.load(file_path, sr=16000, mono=True)
+         duration = len(audio) / 16000
+         return audio, duration
+     except Exception as e:
+         print(f"Error loading {file_path}: {e}")
+         return None, 0
+
+ def predict_language_mms(audio_array):
+     try:
+         # Use the feature_extractor directly
+         inputs = feature_extractor(audio_array, sampling_rate=16000, return_tensors="pt")
+         inputs = {k: v.to(device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             outputs = model(**inputs)
+
+         logits = outputs.logits
+         pred_idx = torch.argmax(logits, dim=-1).item()
+         pred_lang_code = model.config.id2label[pred_idx]
+
+         probabilities = torch.softmax(logits, dim=-1)[0]
+         confidence = probabilities[pred_idx].item()
+
+         return pred_lang_code, confidence
+
+     except Exception as e:
+         return "error", 0.0
+
+ def find_audio_files(base_path):
+     audio_files = []
+     for root, _, files in os.walk(base_path):
+         folder_code = os.path.basename(root).lower()
+         if folder_code in CUSTOM_FOLDER_MAPPING:
+             ground_truth_iso = CUSTOM_FOLDER_MAPPING[folder_code]
+             for file in files:
+                 if file.lower().endswith(('.wav', '.mp3', '.m4a', '.flac', '.ogg')):
+                     audio_files.append({
+                         "file_path": os.path.join(root, file),
+                         "filename": file,
+                         "ground_truth": ground_truth_iso
+                     })
+     return audio_files
+
+ print("✅ Functions are ready!")
+
+
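Before the full sweep in Cell 4, the two helpers can be spot-checked on a single clip; the path below is hypothetical and only shows the intended call pattern:

# Spot check on one file; "hi/example.wav" is a hypothetical path under AUDIO_FOLDER.
sample_path = os.path.join(AUDIO_FOLDER, "hi", "example.wav")
audio, duration = load_audio_raw(sample_path)
if audio is not None:
    code, conf = predict_language_mms(audio)
    print(f"{duration:.1f}s -> {ISO_TO_FULL_NAME.get(code, code)} ({conf:.3f})")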
+ # ============================================================================
+ # CELL 4: PROCESS ALL FILES AND GENERATE REPORT
+ # ============================================================================
+ def run_full_analysis():
+     print("🚀 Processing FULL dataset with MMS LID Model...")
+
+     audio_files = find_audio_files(AUDIO_FOLDER)
+     if not audio_files:
+         print("❌ No audio files found. Please check your AUDIO_FOLDER path.")
+         return
+
+     total_files = len(audio_files)
+     results = []
+
+     print(f"🔄 Processing {total_files} files...")
+     print("-" * 50)
+
+     for i, file_info in enumerate(audio_files):
+         if (i + 1) % 50 == 0:
+             print(f"Progress: {i+1}/{total_files} ({(i+1)/total_files*100:.1f}%)")
+
+         audio, duration = load_audio_raw(str(file_info['file_path']))
+         if audio is None:
+             result = {**file_info, "predicted_language": "load_error", "confidence": 0.0, "duration": 0.0, "is_short_file": False}
+         else:
+             pred_lang_code, confidence = predict_language_mms(audio)
+             is_short = duration < 3.0
+             result = {**file_info, "predicted_language": pred_lang_code, "confidence": confidence, "duration": duration, "is_short_file": is_short}
+
+             if is_short and pred_lang_code != "error":
+                 print(f"⚠️ SHORT ({duration:.1f}s): {file_info['filename']} -> {ISO_TO_FULL_NAME.get(pred_lang_code, pred_lang_code)} ({confidence:.3f})")
+
+         results.append(result)
+
+     results_df = pd.DataFrame(results)
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     csv_path = f"{RESULTS_FOLDER}/mms_lid_results_{timestamp}.csv"
+     results_df.to_csv(csv_path, index=False)
+     print(f"\n✅ Processing complete! Results saved to: {csv_path}")
+
+     # --- Detailed Analysis ---
+     print("\n" + "=" * 60)
+     print("📊 MMS LID MODEL - DETAILED ANALYSIS")
+     print("=" * 60)
+
+     valid_data = results_df[(results_df['predicted_language'] != 'error') & (results_df['predicted_language'] != 'load_error')]
+
+     if len(valid_data) > 0:
+         overall_accuracy = accuracy_score(valid_data['ground_truth'], valid_data['predicted_language'])
+         print(f"\n🎯 OVERALL MODEL ACCURACY: {overall_accuracy:.2%}")
+
+         print(f"\n📋 LANGUAGE-WISE ACCURACY:")
+         report_true = [ISO_TO_FULL_NAME.get(code, code) for code in valid_data['ground_truth']]
+         report_pred = [ISO_TO_FULL_NAME.get(code, code) for code in valid_data['predicted_language']]
+         print(classification_report(report_true, report_pred, zero_division=0))
+
+     short_files = results_df[results_df.get('is_short_file', False) == True]
+     valid_short = short_files[(short_files['predicted_language'] != 'error') & (short_files['predicted_language'] != 'load_error')]
+
+     print(f"\n⚠️ SHORT FILES ANALYSIS (<3 seconds):")
+     print(f"Total short files: {len(short_files)}")
+     if len(valid_short) > 0:
+         avg_conf = valid_short['confidence'].mean()
+         print(f"Average confidence for short files: {avg_conf:.3f}")
+
+     print("\n" + "=" * 60)
+     print("🏁 ANALYSIS COMPLETE")
+
+     # Return the results so the following cells can reuse them as 'full_results_df'
+     return results_df
+
+ # Run the full analysis
+ full_results_df = run_full_analysis()
+
+
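For the cells below that read this CSV back, it helps to note which columns run_full_analysis writes; a quick way to confirm them after a run:

# Each row of mms_lid_results_<timestamp>.csv carries:
#   file_path, filename, ground_truth, predicted_language, confidence, duration, is_short_file
if full_results_df is not None:
    print(list(full_results_df.columns))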
+ # ============================================================================
+ # CELL 5: GENERATE FILTERED EXCEL REPORT
+ # ============================================================================
+ import pandas as pd
+ from sklearn.metrics import accuracy_score
+
+ # Install the package needed to write Excel files
+
+ def generate_filtered_excel_report(df, folder_path):
+     """
+     Generates an Excel report with overall and per-language accuracy,
+     excluding files shorter than 3 seconds from the accuracy calculation.
+     """
+     if df is None or df.empty:
+         print("❌ No results DataFrame found. Please run the analysis in Cell 4 first.")
+         return
+
+     print("📊 Generating filtered accuracy report...")
+
+     # --- 1. Filter the DataFrame ---
+     # Exclude errors and files shorter than 3 seconds
+     accuracy_df = df[
+         (df['duration'] >= 3) &
+         (df['predicted_language'] != 'error') &
+         (df['predicted_language'] != 'load_error')
+     ].copy()
+
+     print(f"Total files in accuracy calculation (>= 3s): {len(accuracy_df)} out of {len(df)}")
+
+     # --- 2. Calculate Overall Accuracy ---
+     if not accuracy_df.empty:
+         overall_accuracy = accuracy_score(accuracy_df['ground_truth'], accuracy_df['predicted_language'])
+         summary_df = pd.DataFrame([{'Overall Accuracy (>= 3s)': f"{overall_accuracy:.2%}"}])
+     else:
+         summary_df = pd.DataFrame([{'Overall Accuracy (>= 3s)': "N/A"}])
+
+     # --- 3. Calculate Per-Language Accuracy ---
+     per_language_stats = []
+     if not accuracy_df.empty:
+         # Use full names for the report
+         accuracy_df['ground_truth_name'] = accuracy_df['ground_truth'].map(ISO_TO_FULL_NAME)
+         accuracy_df['predicted_language_name'] = accuracy_df['predicted_language'].map(ISO_TO_FULL_NAME)
+
+         for lang_code, lang_name in sorted(ISO_TO_FULL_NAME.items()):
+             lang_df = accuracy_df[accuracy_df['ground_truth'] == lang_code]
+             if not lang_df.empty:
+                 lang_accuracy = accuracy_score(lang_df['ground_truth'], lang_df['predicted_language'])
+                 per_language_stats.append({
+                     'Language': lang_name,
+                     'Accuracy': f"{lang_accuracy:.2%}",
+                     'File Count (>= 3s)': len(lang_df)
+                 })
+
+     per_language_df = pd.DataFrame(per_language_stats)
+
+     # --- 4. Save to Excel ---
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     report_path = os.path.join(folder_path, f"filtered_accuracy_report_{timestamp}.xlsx")
+
+     with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
+         summary_df.to_excel(writer, sheet_name='Summary', index=False)
+         per_language_df.to_excel(writer, sheet_name='Per_Language_Accuracy', index=False)
+         df.to_excel(writer, sheet_name='All_Results', index=False)
+         accuracy_df.to_excel(writer, sheet_name='Filtered_Results (for accuracy)', index=False)
+
+         # Auto-adjust column widths for readability (computed from the in-memory
+         # DataFrames, since the file cannot be read back before the writer is closed)
+         sheet_frames = {
+             'Summary': summary_df,
+             'Per_Language_Accuracy': per_language_df,
+             'All_Results': df,
+             'Filtered_Results (for accuracy)': accuracy_df,
+         }
+         for sheet_name, sheet_df in sheet_frames.items():
+             worksheet = writer.sheets[sheet_name]
+             for idx, col in enumerate(sheet_df.columns):
+                 values_len = sheet_df[col].astype(str).map(len).max() if not sheet_df.empty else 0
+                 max_len = int(max(values_len, len(str(col)))) + 2
+                 worksheet.set_column(idx, idx, max_len)
+
+     print(f"\n✅ Filtered Excel report saved successfully to: {report_path}")
+
+ # Run the function to generate the report
+ # This assumes 'full_results_df' was created in the previous cell
+ if 'full_results_df' in locals():
+     generate_filtered_excel_report(full_results_df, RESULTS_FOLDER)
+ else:
+     print("❌ 'full_results_df' not found. Please run the previous cell to process the dataset first.")
+
+
+
+
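If the in-memory DataFrame from Cell 4 is gone (for example after a runtime restart), the same report can be produced from a saved CSV, as the next cell does; a small sketch using the timestamped file referenced below:

# Sketch: rebuild the input from a previously saved CSV before generating the report.
saved_csv = os.path.join(RESULTS_FOLDER, "mms_lid_results_20250925_072344.csv")
full_results_df = pd.read_csv(saved_csv)
generate_filtered_excel_report(full_results_df, RESULTS_FOLDER)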
+ # ============================================================================
+ # CELL 5: LOAD EXISTING RESULTS AND EXTRACT FEATURES
+ # ============================================================================
+ import pandas as pd
+ import numpy as np
+ import librosa
+ import os
+
+ # --- 1. Load Your Existing CSV File ---
+ # ⚠️ PASTE THE FULL PATH to your CSV file here
+ csv_path = "/content/drive/MyDrive/mms_lid_results/mms_lid_results_20250925_072344.csv"
+
+ try:
+     full_results_df = pd.read_csv(csv_path)
+     print(f"✅ Successfully loaded {len(full_results_df)} records from {csv_path}")
+ except FileNotFoundError:
+     print(f"❌ ERROR: File not found at '{csv_path}'. Please check the path and try again.")
+     # Stop execution if the file is not found
+     raise
+
+ # --- 2. In-Depth Feature Extraction ---
+ print("\n🚀 Starting in-depth feature extraction...")
+
+ def extract_audio_features(row):
+     """Calculates SNR proxy and silence ratio for a given audio file."""
+     try:
+         audio, sr = librosa.load(row['file_path'], sr=16000, mono=True)
+
+         # Calculate RMS energy for silence detection
+         rms = librosa.feature.rms(y=audio, frame_length=2048, hop_length=512)[0]
+
+         # Silence Ratio: Percentage of frames below 20% of max energy
+         silence_threshold = 0.2 * np.max(rms) if rms.size > 0 else 0
+         silence_ratio = np.mean(rms < silence_threshold) if rms.size > 0 else 1.0
+
+         # SNR Proxy: Ratio of energy in loud parts vs. quiet parts
+         loud_rms = np.mean(rms[rms >= silence_threshold]) if np.any(rms >= silence_threshold) else 0
+         quiet_rms = np.mean(rms[rms < silence_threshold]) if np.any(rms < silence_threshold) else 0
+         snr_proxy = 20 * np.log10(loud_rms / (quiet_rms + 1e-7) + 1e-7) if quiet_rms > 0 else 50.0
+
+         return pd.Series([snr_proxy, silence_ratio])
+
+     except Exception as e:
+         return pd.Series([np.nan, np.nan])
+
+ # Apply the feature extraction to each row
+ print("Calculating SNR and silence ratios for all files... (This may take a few minutes)")
+ features_df = full_results_df.apply(extract_audio_features, axis=1)
+ features_df.columns = ['snr_proxy', 'silence_ratio']
+
+ # Combine the new features with your existing results
+ analysis_df = pd.concat([full_results_df, features_df], axis=1)
+
+ print("✅ Feature extraction complete!")
+
+
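To make the two quality metrics concrete, here is a tiny worked example on made-up RMS values, mirroring the thresholding used in extract_audio_features:

# Toy numbers only: five RMS frames, threshold = 20% of the maximum.
rms_demo = np.array([0.02, 0.03, 0.5, 0.6, 0.55])
thr = 0.2 * rms_demo.max()                                      # 0.12
silence_ratio_demo = np.mean(rms_demo < thr)                    # 2 of 5 frames -> 0.4
snr_demo = 20 * np.log10(rms_demo[rms_demo >= thr].mean()
                         / (rms_demo[rms_demo < thr].mean() + 1e-7) + 1e-7)
print(silence_ratio_demo, round(snr_demo, 1))                   # 0.4 and roughly 26.8 dB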
+ # ============================================================================
+ # CELL 6: COMPREHENSIVE ANALYSIS AND EXCEL REPORT
+ # ============================================================================
+ import pandas as pd
+ from sklearn.metrics import accuracy_score, confusion_matrix
+
+ # Install xlsxwriter if not already installed
+
+ def generate_comprehensive_report(df, folder_path):
+     """
+     Generates a comprehensive Excel report with multiple analysis sheets.
+     """
+     if df is None or df.empty:
+         print("❌ 'analysis_df' with features not found. Please run the feature extraction cell first.")
+         return
+
+     print("📊 Generating comprehensive analysis report...")
+
+     # --- Create a new Excel writer ---
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     report_path = os.path.join(folder_path, f"comprehensive_analysis_report_{timestamp}.xlsx")
+     writer = pd.ExcelWriter(report_path, engine='xlsxwriter')
+
+     # --- Sheet 1: All Results with Features ---
+     df.to_excel(writer, sheet_name='Results_with_Features', index=False)
+
+     # Filter for valid predictions for all subsequent analyses
+     valid_df = df[
+         (df['predicted_language'] != 'error') &
+         (df['predicted_language'] != 'load_error')
+     ].copy()
+
+     # --- Sheet 2 & 3: Calibration Analysis ---
+     n_bins = 10
+     bins = np.linspace(0, 1, n_bins + 1)
+     valid_df['confidence_bin'] = pd.cut(valid_df['confidence'], bins=bins, include_lowest=True, right=True)
+
+     # Ensure all bins are present for groupby
+     valid_df['confidence_bin'] = valid_df['confidence_bin'].astype(str)
+
+     calib_data = valid_df.groupby('confidence_bin').apply(lambda x: pd.Series({
+         'bin_accuracy': accuracy_score(x['ground_truth'], x['predicted_language']),
+         'avg_confidence': x['confidence'].mean(),
+         'sample_count': len(x)
+     })).reset_index()
+
+     overall_ece = np.sum(np.abs(calib_data['bin_accuracy'] - calib_data['avg_confidence']) * (calib_data['sample_count'] / len(valid_df)))
+
+     calibration_overview_df = pd.DataFrame([{'Expected Calibration Error (ECE)': f"{overall_ece:.4f}"}])
+     calibration_overview_df.to_excel(writer, sheet_name='Calibration_Overview', index=False)
+     calib_data.to_excel(writer, sheet_name='Calibration_Bins', index=False)
+
+     # --- Sheets 4, 5, 6: Accuracy vs. Features ---
+     def get_accuracy_slice(dataframe, column, bins):
+         dataframe[f'{column}_bin'] = pd.cut(dataframe[column], bins=bins, include_lowest=True)
+         return dataframe.groupby(f'{column}_bin', observed=False).apply(lambda x: accuracy_score(x['ground_truth'], x['predicted_language']) if not x.empty else 0).reset_index(name='accuracy')
+
+     acc_vs_duration = get_accuracy_slice(valid_df.copy(), 'duration', bins=[0, 1, 2, 3, 5, 10, np.inf])
+     acc_vs_snr = get_accuracy_slice(valid_df.copy(), 'snr_proxy', bins=[-np.inf, 0, 10, 20, 30, 40, np.inf])
+     acc_vs_silence = get_accuracy_slice(valid_df.copy(), 'silence_ratio', bins=[-0.01, 0.1, 0.3, 0.5, 0.7, 1.0])
+
+     acc_vs_duration.to_excel(writer, sheet_name='Acc_vs_Duration', index=False)
+     acc_vs_snr.to_excel(writer, sheet_name='Acc_vs_SNR', index=False)
+     acc_vs_silence.to_excel(writer, sheet_name='Acc_vs_Silence', index=False)
+
+     # --- Sheet 7 & 8: Confusion Matrix and Asymmetry ---
+     labels = sorted(list(set(valid_df['ground_truth'].unique()) | set(valid_df['predicted_language'].unique())))
+     cm = confusion_matrix(valid_df['ground_truth'], valid_df['predicted_language'], labels=labels)
+     cm_df = pd.DataFrame(cm, index=[ISO_TO_FULL_NAME.get(l, l) for l in labels], columns=[ISO_TO_FULL_NAME.get(l, l) for l in labels])
+
+     confusion_asymmetry_df = cm_df.subtract(cm_df.T)
+
+     cm_df.to_excel(writer, sheet_name='Confusion_Matrix')
+     confusion_asymmetry_df.to_excel(writer, sheet_name='Confusion_Asymmetry')
+
+     # --- Sheet 9 & 10: Hard Cases Analysis ---
+     hard_misclassifications = valid_df[
+         (valid_df['ground_truth'] != valid_df['predicted_language']) &
+         (valid_df['confidence'] > 0.8)
+     ].sort_values('confidence', ascending=False)
+
+     ambiguous_correct = valid_df[
+         (valid_df['ground_truth'] == valid_df['predicted_language']) &
+         (valid_df['confidence'] < 0.5)
+     ].sort_values('confidence', ascending=True)
+
+     hard_misclassifications.to_excel(writer, sheet_name='Hard_Misclassifications', index=False)
+     ambiguous_correct.to_excel(writer, sheet_name='Ambiguous_Correct', index=False)
+
+     # --- Save the Excel file ---
+     writer.close()
+     print(f"\n✅ Comprehensive analysis report saved successfully to: {report_path}")
+
+
+ # Run the function to generate the final report
+ if 'analysis_df' in locals():
+     generate_comprehensive_report(analysis_df, RESULTS_FOLDER)
+ else:
+     print("❌ 'analysis_df' not found. Please run the feature extraction in the previous cell first.")
+
+
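For reference, the Expected Calibration Error written to the Calibration_Overview sheet is the sample-weighted gap between each confidence bin's accuracy and its mean confidence; a toy computation with invented numbers:

# Two invented bins: (sample_count, bin_accuracy, avg_confidence)
demo_bins = [(60, 0.90, 0.95), (40, 0.50, 0.65)]
N = sum(n for n, _, _ in demo_bins)
ece_demo = sum(n / N * abs(acc - conf) for n, acc, conf in demo_bins)
print(ece_demo)  # 0.6*0.05 + 0.4*0.15 = 0.09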
+ # ============================================================================
+ # CELL 6: COMPREHENSIVE ANALYSIS AND EXCEL REPORT (UNIFIED)
+ # ============================================================================
+ import pandas as pd
+ from sklearn.metrics import accuracy_score, confusion_matrix
+
+ # Install xlsxwriter if not already installed
+
+ def generate_comprehensive_report(df, folder_path):
+     """
+     Generates a comprehensive Excel report with multiple analysis sheets.
+     """
+     if df is None or df.empty:
+         print("❌ The 'analysis_df' DataFrame is empty. Please check the previous cell.")
+         return
+
+     print("📊 Generating comprehensive analysis report...")
+
+     # --- Create a new Excel writer ---
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     report_path = os.path.join(folder_path, f"comprehensive_analysis_report_{timestamp}.xlsx")
+
+     with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
+         # --- Sheet 1: All Results with Features ---
+         df.to_excel(writer, sheet_name='Results_with_Features', index=False)
+
+         # Filter for valid predictions for all subsequent analyses
+         valid_df = df[
+             (df['predicted_language'] != 'error') &
+             (df['predicted_language'] != 'load_error')
+         ].copy()
+
+         # --- Sheet 2 & 3: Calibration Analysis ---
+         n_bins = 10
+         bins = np.linspace(0, 1, n_bins + 1)
+         valid_df['confidence_bin'] = pd.cut(valid_df['confidence'], bins=bins, include_lowest=True, right=True)
+         valid_df['confidence_bin'] = valid_df['confidence_bin'].astype(str)
+
+         calib_data = valid_df.groupby('confidence_bin', observed=False).apply(lambda x: pd.Series({
+             'bin_accuracy': accuracy_score(x['ground_truth'], x['predicted_language']) if not x.empty else 0,
+             'avg_confidence': x['confidence'].mean() if not x.empty else 0,
+             'sample_count': len(x)
+         })).reset_index()
+
+         overall_ece = np.sum(np.abs(calib_data['bin_accuracy'] - calib_data['avg_confidence']) * (calib_data['sample_count'] / len(valid_df)))
+
+         calibration_overview_df = pd.DataFrame([{'Expected Calibration Error (ECE)': f"{overall_ece:.4f}"}])
+         calibration_overview_df.to_excel(writer, sheet_name='Calibration_Overview', index=False)
+         calib_data.to_excel(writer, sheet_name='Calibration_Bins', index=False)
+
+         # --- Sheets 4, 5, 6: Accuracy vs. Features ---
+         def get_accuracy_slice(dataframe, column, bins):
+             dataframe[f'{column}_bin'] = pd.cut(dataframe[column], bins=bins, include_lowest=True)
+             return dataframe.groupby(f'{column}_bin', observed=False).apply(lambda x: accuracy_score(x['ground_truth'], x['predicted_language']) if not x.empty else 0).reset_index(name='accuracy')
+
+         acc_vs_duration = get_accuracy_slice(valid_df.copy(), 'duration', bins=[0, 1, 2, 3, 5, 10, np.inf])
+         acc_vs_snr = get_accuracy_slice(valid_df.copy(), 'snr_proxy', bins=[-np.inf, 0, 10, 20, 30, 40, np.inf])
+         acc_vs_silence = get_accuracy_slice(valid_df.copy(), 'silence_ratio', bins=[-0.01, 0.1, 0.3, 0.5, 0.7, 1.0])
+
+         acc_vs_duration.to_excel(writer, sheet_name='Acc_vs_Duration', index=False)
+         acc_vs_snr.to_excel(writer, sheet_name='Acc_vs_SNR', index=False)
+         acc_vs_silence.to_excel(writer, sheet_name='Acc_vs_Silence', index=False)
+
+         # --- Sheet 7 & 8: Confusion Matrix and Asymmetry ---
+         labels = sorted(list(set(valid_df['ground_truth'].unique()) | set(valid_df['predicted_language'].unique())))
+         cm = confusion_matrix(valid_df['ground_truth'], valid_df['predicted_language'], labels=labels)
+         cm_df = pd.DataFrame(cm, index=[ISO_TO_FULL_NAME.get(l, l) for l in labels], columns=[ISO_TO_FULL_NAME.get(l, l) for l in labels])
+
+         confusion_asymmetry_df = cm_df.subtract(cm_df.T)
+
+         cm_df.to_excel(writer, sheet_name='Confusion_Matrix')
+         confusion_asymmetry_df.to_excel(writer, sheet_name='Confusion_Asymmetry')
+
+         # --- Sheet 9 & 10: Hard Cases Analysis ---
+         hard_misclassifications = valid_df[
+             (valid_df['ground_truth'] != valid_df['predicted_language']) &
+             (valid_df['confidence'] > 0.8)
+         ].sort_values('confidence', ascending=False)
+
+         ambiguous_correct = valid_df[
+             (valid_df['ground_truth'] == valid_df['predicted_language']) &
+             (valid_df['confidence'] < 0.5)
+         ].sort_values('confidence', ascending=True)
+
+         hard_misclassifications.to_excel(writer, sheet_name='Hard_Misclassifications', index=False)
+         ambiguous_correct.to_excel(writer, sheet_name='Ambiguous_Correct', index=False)
+
+     print(f"\n✅ Comprehensive analysis report saved successfully to: {report_path}")
+
+ # Run the function to generate the final report
+ # This will now work because 'analysis_df' was created in the cell right above
+ if 'analysis_df' in locals():
+     generate_comprehensive_report(analysis_df, RESULTS_FOLDER)
+ else:
+     print("❌ 'analysis_df' not found. Please re-run the previous cell to load and process the data.")
+
+
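The Confusion_Asymmetry sheet is easiest to read with a toy example; a positive entry means the row language is mistaken for the column language more often than the reverse:

# Made-up 2x2 counts purely for illustration.
cm_demo = pd.DataFrame([[90, 12], [3, 85]], index=['Hindi', 'Urdu'], columns=['Hindi', 'Urdu'])
print(cm_demo.subtract(cm_demo.T))
# The (Hindi, Urdu) cell is 12 - 3 = +9: Hindi clips are predicted as Urdu
# far more often than Urdu clips are predicted as Hindi.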
+ # ============================================================================
+ # FINAL ANALYSIS CELL: NORMALIZATION AND DUAL ACCURACY REPORTS
+ # ============================================================================
+ import pandas as pd
+ import numpy as np
+ from sklearn.metrics import accuracy_score, classification_report
+ import os
+
+ # Install xlsxwriter for Excel reporting
+
+ # --- 1. Load Your Existing CSV File ---
+ # ⚠️ PASTE THE FULL PATH to your most recent CSV file here
+ csv_path = "/content/drive/MyDrive/mms_lid_results/mms_lid_results_20250925_072344.csv"
+
+ try:
+     results_df = pd.read_csv(csv_path)
+     print(f"✅ Successfully loaded {len(results_df)} records from {csv_path}")
+ except FileNotFoundError:
+     print(f"❌ ERROR: File not found at '{csv_path}'. Please check the path and try again.")
+     raise
+
+ # --- 2. Define the Comprehensive Normalization Mapping ---
+ # This dictionary will standardize all known language code variations.
+ NORMALIZATION_MAPPING = {
+     # MMS model's 3-letter codes (prediction) to your 2-letter folder names (ground truth)
+     'asm': 'as', 'ben': 'bn', 'brx': 'br', 'dgo': 'doi', 'eng': 'en',
+     'guj': 'gu', 'hin': 'hi', 'kan': 'kn', 'kok': 'kok', 'kas': 'ks',
+     'mai': 'mai', 'mal': 'ml', 'mni': 'mni', 'mar': 'mr', 'nep': 'ne',
+     'ory': 'or', 'pan': 'pa', 'san': 'sa', 'sat': 'sat', 'snd': 'sd',
+     'tam': 'ta', 'tel': 'te', 'urd': 'ur',
+     # Crucial fix for Nepali
+     'npi': 'ne'
+ }
+
+ # --- 3. Apply Normalization ---
+ print("\nApplying comprehensive normalization to language codes...")
+ results_df['normalized_prediction'] = results_df['predicted_language'].map(NORMALIZATION_MAPPING)
+ # Fill any unmapped predictions with a placeholder to mark them as incorrect
+ results_df['normalized_prediction'] = results_df['normalized_prediction'].fillna('unknown')
+
+ # --- 4. Define the Analysis Function ---
+ def generate_accuracy_report(df, report_title):
+     """Calculates and returns overall and per-language accuracy DataFrames."""
+     print(f"\n--- Generating Report: {report_title} ---")
+
+     # Filter for valid predictions (where normalization resulted in a known language)
+     valid_df = df[df['normalized_prediction'] != 'unknown'].copy()
+     print(f"Calculating accuracy on {len(valid_df)} valid predictions.")
+
+     if valid_df.empty:
+         print("No valid data to report on.")
+         return pd.DataFrame([{'Overall Accuracy': 'N/A'}]), pd.DataFrame()
+
+     # Calculate Overall Accuracy
+     overall_accuracy = accuracy_score(valid_df['ground_truth'], valid_df['normalized_prediction'])
+     summary_df = pd.DataFrame([{'Overall Accuracy': f"{overall_accuracy:.2%}"}])
+     print(f"Overall Accuracy: {overall_accuracy:.2%}")
+
+     # Calculate Per-Language Accuracy
+     report_dict = classification_report(valid_df['ground_truth'], valid_df['normalized_prediction'], output_dict=True, zero_division=0)
+     per_language_df = pd.DataFrame(report_dict).transpose().reset_index().rename(columns={'index': 'Language'})
+
+     # Keep only the rows for actual languages, not the summary rows
+     per_language_df = per_language_df[per_language_df['Language'].isin(valid_df['ground_truth'].unique())]
+
+     return summary_df, per_language_df
+
+ # --- 5. Generate Both Reports ---
+ # Report 1: Including ALL files
+ all_files_summary_df, all_files_per_lang_df = generate_accuracy_report(results_df, "All Audio Files")
+
+ # Report 2: Excluding files < 3 seconds
+ df_filtered = results_df[results_df['duration'] >= 3].copy()
+ filtered_summary_df, filtered_per_lang_df = generate_accuracy_report(df_filtered, "Audio Files >= 3 Seconds")
+
+ # --- 6. Save Everything to a Single Excel File ---
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ report_path = os.path.join(os.path.dirname(csv_path), f"final_corrected_analysis_{timestamp}.xlsx")
+
+ print(f"\n💾 Saving final corrected analysis to: {report_path}")
+
+ with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
+     all_files_summary_df.to_excel(writer, sheet_name='Overall_Accuracy_ALL_Files', index=False)
+     all_files_per_lang_df.to_excel(writer, sheet_name='Per_Lang_Accuracy_ALL_Files', index=False)
+     filtered_summary_df.to_excel(writer, sheet_name='Overall_Accuracy_>=3_Sec', index=False)
+     filtered_per_lang_df.to_excel(writer, sheet_name='Per_Lang_Accuracy_>=3_Sec', index=False)
+     results_df.to_excel(writer, sheet_name='Raw_Normalized_Results', index=False)
+
+ print("✅ Analysis complete. All reports saved.")
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ numpy
+ pandas
+ torch
+ transformers
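Note that app.py above also imports librosa and scikit-learn and writes its Excel reports through the xlsxwriter engine, so a fuller dependency list for this Space would plausibly be (an assumption based on those imports, not part of the commit):

numpy
pandas
torch
transformers
librosa
scikit-learn
xlsxwriter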