Spaces:

Mo2294
/

MoTTS

Running

App Files Files Community

Mo2294 committed 15 days ago

Commit

0800432

verified ·

1 Parent(s): 9d5ea3d

Update app.py

Browse files

Files changed (1) hide show

app.py +188 -135

app.py CHANGED Viewed

@@ -1,7 +1,20 @@
 #!/usr/bin/env python3
 """
 HuggingFace Spaces app.py for IndexTTS2 with Auto-Processing and Combined Audio
 """
 import os
 import sys
 import subprocess
@@ -36,9 +49,16 @@ auto_process_thread = None
 current_status = "Ready"
 tts_model = None
 # Constants
-MAX_COMBINED_DURATION = 30 * 60  # 30 minutes in seconds
-PAUSE_DURATION = 3.0  # 3 seconds pause between audios
 def sanitize_filename(text: str, max_len: int = 120) -> str:
@@ -129,116 +149,131 @@ def parse_audio_duration_from_log(log_line: str):
     return None
-def create_combined_audios(audio_files_info):
     """
-    Create combined audio file(s) with 3-second pauses,
-    without changing pitch, samplerate or bitdepth.
-    audio_files_info: List[(file_path, duration_in_seconds)]
     """
-    # 1) Samplerate der ersten Datei korrekt auslesen (z.B. 22050 Hz von BigVGAN)
-    first_file = audio_files_info[0][0]
     _, sr = sf.read(first_file, dtype="int16")
-    # 3 Sekunden Stille in ORIGINAL-SAMPLERATE erzeugen
-    silence_3s = np.zeros(int(sr * PAUSE_DURATION), dtype=np.int16)
-    combined_files = []
     current_files = []
     current_duration = 0.0
-    combined_index = 1
-    for file_path, duration in audio_files_info:
-        # "Was wäre die Länge, wenn wir diese Datei hinzufügen?"
-        new_length = current_duration
-        if current_files:
-            new_length += PAUSE_DURATION
-        new_length += duration
-        # Wenn zu lang → speichern & neue Combined beginnen
-        if new_length > MAX_COMBINED_DURATION and current_files:
-            combined_name = (
-                "temp_combined.wav"
-                if combined_index == 1 and len(audio_files_info) <= 30
-                else f"temp_combined_{combined_index:03d}.wav"
-            )
-            audio_out = []
-            # 1.5 Sekunden Intro-Stille vor der ersten Audio
-            silence_intro = np.zeros(int(sr * 1.5), dtype=np.int16)
-            audio_out.append(silence_intro)
-            for i, fp in enumerate(current_files):
-                data, _ = sf.read(fp, dtype="int16")
-                audio_out.append(data)
-                # Zwischen Affirmationen 3 Sekunden Pause
-                if i < len(current_files) - 1:
-                    audio_out.append(silence_3s)
-            final_audio = np.concatenate(audio_out)
-            sf.write(combined_name, final_audio, sr, subtype="PCM_16")
-            combined_files.append((combined_name, current_duration))
-            print(
-                f"Created combined file {combined_index}: "
-                f"{int(current_duration // 60)}:{int(current_duration % 60):02d}"
-            )
-            combined_index += 1
-            # Neue Combined-Gruppe beginnen mit aktueller Datei
-            current_files = [file_path]
-            current_duration = duration
-        else:
-            current_files.append(file_path)
-            if len(current_files) == 1:
-                current_duration = duration
-            else:
-                current_duration += PAUSE_DURATION + duration
-    # Letzte Combined-Datei speichern
-    if current_files:
-        combined_name = (
-            "temp_combined.wav"
-            if combined_index == 1 and len(audio_files_info) <= 30
-            else f"temp_combined_{combined_index:03d}.wav"
         )
-        audio_out = []
-        # 1.5 Sekunden Intro-Stille vor der ersten Audio
-        silence_intro = np.zeros(int(sr * 1.5), dtype=np.int16)
-        audio_out.append(silence_intro)
-        for i, fp in enumerate(current_files):
-            data, _ = sf.read(fp, dtype="int16")
-            audio_out.append(data)
-            # Zwischen Affirmationen 3 Sekunden Pause
-            if i < len(current_files) - 1:
-                audio_out.append(silence_3s)
-        final_audio = np.concatenate(audio_out)
-        sf.write(combined_name, final_audio, sr, subtype="PCM_16")
-        combined_files.append((combined_name, current_duration))
-        print(
-            f"Created combined file {combined_index}: "
-            f"{int(current_duration // 60)}:{int(current_duration % 60):02d}"
-        )
-    return combined_files
 def auto_process_dataset():
     """
-    Auto-process TXT files from Monarchtaba22/rawAffirmation
-    Generate audio for each sentence (split by .-) and upload to output dataset
-    Create combined audio(s) with 3s pauses, max 30 min each
-    Move processed TXT files to /done folder
     """
     global auto_process_running, current_status, tts_model
@@ -253,6 +288,8 @@ def auto_process_dataset():
             return
         api = HfApi(token=token)
         input_dataset_id = "Mo2294/rawAffirmation"
         output_dataset_id = "Mo2294/outputAffirmation"
@@ -271,7 +308,6 @@ def auto_process_dataset():
             repo_files = list_repo_files(
                 repo_id=input_dataset_id, repo_type="dataset", token=token
             )
-            # Filter for TXT files not in /done folder
             txt_files = [
                 f
                 for f in repo_files
@@ -296,6 +332,9 @@ def auto_process_dataset():
             txt_name = Path(txt_file).stem
             current_status = f"Processing: {txt_name}"
             try:
                 # Download TXT file
                 txt_path = hf_hub_download(
@@ -309,17 +348,17 @@ def auto_process_dataset():
                 with open(txt_path, "r", encoding="utf-8") as f:
                     content = f.read()
-                # IMPROVED SPLITTING - preserve the actual text
                 raw_sentences = content.split(".-")
                 sentences = []
                 for s in raw_sentences:
                     cleaned = s.strip()
                     if cleaned:
                         # Remove only trailing punctuation if it's a single dash or dot
                         if cleaned.endswith("-") or cleaned.endswith("."):
                             cleaned = cleaned[:-1].rstrip()
-                        sentences.append(cleaned)
                 if not sentences:
                     current_status = f"No sentences found in {txt_name}"
@@ -328,11 +367,7 @@ def auto_process_dataset():
                 current_status = f"Found {len(sentences)} sentences in {txt_name}"
                 print(f"Processing sentences from {txt_name}:")
-                temp_files = []
-                audio_files_info = []  # still used for durations/logging, not for combining
-                commit_operations = []
-                # Track used filenames to avoid duplicates within same TXT
                 used_names = set()
                 # Process each sentence
@@ -351,7 +386,6 @@ def auto_process_dataset():
                         # Filename should be the affirmation text (before adding punctuation)
                         base_name = sanitize_filename(sentence)
                         if base_name in used_names:
-                            # avoid overwriting if identical sentence appears multiple times
                             suffix = 2
                             while f"{base_name}_{suffix}" in used_names:
                                 suffix += 1
@@ -365,7 +399,7 @@ def auto_process_dataset():
                         print(f"  Sentence {idx+1}: '{tts_sentence}'")
-                        # Generate audio using IndexTTS2
                         output_filename = f"temp_{base_name}.wav"
                         # Capture stdout to get audio duration
@@ -381,7 +415,6 @@ def auto_process_dataset():
                                 verbose=True,
                             )
-                        # Parse duration from output
                         output_log = buf.getvalue()
                         duration = None
                         for line in output_log.split("\n"):
@@ -396,10 +429,9 @@ def auto_process_dataset():
                         print(f"    Generated audio: {duration:.2f} seconds")
-                        audio_files_info.append((output_filename, duration))
                         temp_files.append(output_filename)
-                        # Upload path: use affirmation name, no numbering
                         output_path = f"Affirmations/{txt_name}/{base_name}.wav"
                         commit_operations.append(
                             CommitOperationAdd(
@@ -408,26 +440,53 @@ def auto_process_dataset():
                             )
                         )
                     except Exception as e:
                         current_status = f"Error generating audio for sentence {idx+1}: {e}"
                         print(f"Generation error: {e}")
                         continue
-                # ✅ NO MORE COMBINED AUDIO CREATION HERE
-                # (combined generation removed/disabled as requested)
-                # Upload all generated files
-                if commit_operations and auto_process_running:
-                    total_individual = len(commit_operations)
-                    current_status = f"Uploading {total_individual} audio files for {txt_name}..."
                     try:
                         api.create_commit(
                             repo_id=output_dataset_id,
                             repo_type="dataset",
                             operations=commit_operations,
-                            commit_message=f"Add {total_individual} audio files for {txt_name}",
                             token=token,
                         )
                         current_status = f"Successfully uploaded files for {txt_name}"
@@ -454,7 +513,11 @@ def auto_process_dataset():
                             token=token,
                         )
-                        current_status = f"✅ Completed {txt_name}: {total_individual} audio files"
                     except Exception as e:
                         current_status = f"Upload/Move error for {txt_name}: {e}"
@@ -473,6 +536,15 @@ def auto_process_dataset():
             except Exception as e:
                 current_status = f"Error processing {txt_name}: {e}"
                 print(f"Error: {e}")
                 continue
         if auto_process_running:
@@ -559,9 +631,7 @@ def manual_generate(text, reference_audio, emotion_audio, emo_alpha, use_emo_tex
 # Create Gradio interface
 with gr.Blocks(title="IndexTTS2 with Auto-Processing") as demo:
     gr.Markdown("# 🎤 IndexTTS2 Voice Synthesis")
-    gr.Markdown(
-        "State-of-the-art TTS with auto-processing and combined audio generation"
-    )
     # Manual tab
     with gr.Tab("Manual Processing"):
@@ -592,25 +662,15 @@ with gr.Blocks(title="IndexTTS2 with Auto-Processing") as demo:
                         step=0.1,
                         label="Emotion strength",
                     )
-                    use_emo_text = gr.Checkbox(
-                        label="Use text-based emotion", value=False
-                    )
             with gr.Column():
-                generate_btn = gr.Button(
-                    "🎙️ Generate", variant="primary", size="lg"
-                )
                 output_audio = gr.Audio(label="Generated audio", type="numpy")
         generate_btn.click(
             manual_generate,
-            inputs=[
-                text_input,
-                reference_audio,
-                emotion_audio,
-                emo_alpha,
-                use_emo_text,
-            ],
             outputs=output_audio,
         )
@@ -628,8 +688,9 @@ with gr.Blocks(title="IndexTTS2 with Auto-Processing") as demo:
                 - 🎙️ Voice: `Mo.wav`
                 - ✂️ Delimiter: `.-`
                 - 📝 Structure: `/Affirmations/[name]/`
-                - ⏰ Combined: Max 30 min chunks
-                - ⏸️ Pauses: 3 seconds between audios
                 """
                 )
@@ -642,23 +703,15 @@ with gr.Blocks(title="IndexTTS2 with Auto-Processing") as demo:
                 )
                 with gr.Row():
-                    start_btn = gr.Button(
-                        "▶️ Start Processing", variant="primary", scale=2
-                    )
                     stop_btn = gr.Button("⏹️ Stop", variant="stop", scale=1)
                     refresh_btn = gr.Button("🔄 Refresh", scale=1)
-                message_display = gr.Textbox(
-                    label="Message", interactive=False, visible=False
-                )
         # Event handlers
-        start_btn.click(
-            start_auto_process, outputs=[message_display, status_display]
-        )
-        stop_btn.click(
-            stop_auto_process, outputs=[message_display, status_display]
-        )
         refresh_btn.click(get_status, outputs=status_display)
     # Footer

 #!/usr/bin/env python3
 """
 HuggingFace Spaces app.py for IndexTTS2 with Auto-Processing and Combined Audio
+- Auto-process TXT files from dataset
+- Generate per-affirmation WAVs (named by sanitized affirmation text)
+- Create combined WAV chunks with:
+    - 1.5s pause between affirmations
+    - max duration 29:50 (1790s) per combined
+- Upload combined WAV + matching TXT (same basename) per combined:
+    combined_001.wav + combined_001.txt
+    where TXT contains the original affirmations in the exact audio order:
+        i am worthy.-
+        I am blessed.-
+        ...
+- Move processed TXT files to done/
 """
 import os
 import sys
 import subprocess
 current_status = "Ready"
 tts_model = None
+# ---------------------------------------------------------------------
 # Constants
+# ---------------------------------------------------------------------
+# Combined constraints (as requested)
+PAUSE_DURATION = 1.5  # 1.5 seconds between each affirmation in combined audio
+MAX_COMBINED_DURATION = 29 * 60 + 50  # 29:50 = 1790 seconds max per combined
+# (kept; not used in combined anymore as per new constants)
+MAX_COMBINED_DURATION_OLD = 30 * 60
+PAUSE_DURATION_OLD = 3.0
 def sanitize_filename(text: str, max_len: int = 120) -> str:
     return None
def _plan_combined_chunks(audio_items):
    """Group audio items, in order, into chunks that respect the duration cap.

    A chunk's duration is the sum of its item durations plus one
    ``PAUSE_DURATION`` gap between each pair of neighbouring items. A new
    chunk is started whenever appending the next item would push the running
    chunk past ``MAX_COMBINED_DURATION``. A single item that is longer than
    the cap on its own still gets a chunk to itself (it is never dropped).

    Returns:
        list[tuple[list[dict], float]]: ``(items, duration_seconds)`` pairs.
    """
    chunks = []
    members = []
    running = 0.0

    for item in audio_items:
        duration = float(item["duration"])

        # Length of the current chunk if this item were appended to it.
        projected = running
        if members:
            projected += PAUSE_DURATION
        projected += duration

        # Too long: close the current chunk and start a fresh one.
        if members and projected > MAX_COMBINED_DURATION:
            chunks.append((members, running))
            members, running = [], 0.0

        if members:
            running += PAUSE_DURATION + duration
        else:
            running = duration
        members.append(item)

    if members:
        chunks.append((members, running))
    return chunks


def create_combined_audios(audio_items):
    """
    Create combined audio file(s) with 1.5-second pauses between affirmations,
    max duration 29:50 per combined, without changing pitch/samplerate/bitdepth.

    audio_items: List[dict] where each item is:
      {
        "file_path": <temp wav path>,
        "duration": <seconds float>,
        "text": <original affirmation text (no trailing .-)>,
      }

    Returns:
      List[dict]:
        {
          "wav_path": <combined wav filename>,
          "txt_path": <combined txt filename>,
          "duration": <duration seconds float>,
          "texts": <list[str]>,
        }

    Side effects:
      Writes ``combined_XXX.wav`` and ``combined_XXX.txt`` (XXX = 001, 002, ...)
      into the current working directory and prints one progress line per
      combined file.

    Notes:
      - The output sample rate is the one of the first decoded input file.
        Inputs with a different rate are still appended unchanged (no
        resampling), but a warning is printed because they will play back at
        the wrong speed.
      - The pause is generated with the same channel layout as the clip that
        follows it, so multi-channel inputs can be concatenated as well.
      - No intro silence is added; pauses only appear between affirmations.
    """
    if not audio_items:
        return []

    combined_results = []
    # Learned from the first file we decode; avoids decoding it twice.
    sample_rate = None

    for combined_index, (items, chunk_duration) in enumerate(
        _plan_combined_chunks(audio_items), start=1
    ):
        segments = []
        for position, item in enumerate(items):
            data, file_rate = sf.read(item["file_path"], dtype="int16")

            if sample_rate is None:
                sample_rate = file_rate
            elif file_rate != sample_rate:
                print(
                    f"Warning: {item['file_path']} has sample rate {file_rate} Hz, "
                    f"expected {sample_rate} Hz; it will play at the wrong speed "
                    f"in the combined audio"
                )

            if position:
                # Silence before every clip except the first one. data.shape[1:]
                # is () for mono and (channels,) for multi-channel audio.
                pause_shape = (int(sample_rate * PAUSE_DURATION),) + data.shape[1:]
                segments.append(np.zeros(pause_shape, dtype=np.int16))
            segments.append(data)

        final_audio = np.concatenate(segments)

        wav_name = f"combined_{combined_index:03d}.wav"
        txt_name = f"combined_{combined_index:03d}.txt"
        texts = [item["text"] for item in items]

        sf.write(wav_name, final_audio, sample_rate, subtype="PCM_16")

        # TXT must contain original affirmation text in exact order, each ending with .-
        with open(txt_name, "w", encoding="utf-8") as f:
            f.writelines(f"{t}.-\n" for t in texts)

        combined_results.append(
            {
                "wav_path": wav_name,
                "txt_path": txt_name,
                "duration": chunk_duration,
                "texts": texts,
            }
        )
        print(
            f"Created combined file {combined_index}: "
            f"{int(chunk_duration // 60)}:{int(chunk_duration % 60):02d}"
        )

    return combined_results
 def auto_process_dataset():
     """
+    Auto-process TXT files from dataset:
+      - input:  Mo2294/rawAffirmation
+      - output: Mo2294/outputAffirmation
+    For each TXT:
+      - Split affirmations by ".-"
+      - Generate individual WAV named by sanitized affirmation text
+      - Create combined WAV chunks (max 29:50) with 1.5s pauses
+      - For each combined WAV, create matching combined_XXX.txt with original affirmations in order (each line ends ".-")
+      - Upload all WAVs + combined WAVs + combined TXTs
+      - Move processed TXT to done/
     """
     global auto_process_running, current_status, tts_model
             return
         api = HfApi(token=token)
+        # TODO: replace with your Monarchtaba22 ids if different
         input_dataset_id = "Mo2294/rawAffirmation"
         output_dataset_id = "Mo2294/outputAffirmation"
             repo_files = list_repo_files(
                 repo_id=input_dataset_id, repo_type="dataset", token=token
             )
             txt_files = [
                 f
                 for f in repo_files
             txt_name = Path(txt_file).stem
             current_status = f"Processing: {txt_name}"
+            temp_files = []
+            commit_operations = []
             try:
                 # Download TXT file
                 txt_path = hf_hub_download(
                 with open(txt_path, "r", encoding="utf-8") as f:
                     content = f.read()
+                # Split by ".-" and preserve original text (minus trailing '.'/'-')
                 raw_sentences = content.split(".-")
                 sentences = []
                 for s in raw_sentences:
                     cleaned = s.strip()
                     if cleaned:
                         # Remove only trailing punctuation if it's a single dash or dot
                         if cleaned.endswith("-") or cleaned.endswith("."):
                             cleaned = cleaned[:-1].rstrip()
+                        if cleaned:
+                            sentences.append(cleaned)
                 if not sentences:
                     current_status = f"No sentences found in {txt_name}"
                 current_status = f"Found {len(sentences)} sentences in {txt_name}"
                 print(f"Processing sentences from {txt_name}:")
+                audio_items = []  # used for combined creation: includes text + duration + temp wav path
                 used_names = set()
                 # Process each sentence
                         # Filename should be the affirmation text (before adding punctuation)
                         base_name = sanitize_filename(sentence)
                         if base_name in used_names:
                             suffix = 2
                             while f"{base_name}_{suffix}" in used_names:
                                 suffix += 1
                         print(f"  Sentence {idx+1}: '{tts_sentence}'")
+                        # Generate audio
                         output_filename = f"temp_{base_name}.wav"
                         # Capture stdout to get audio duration
                                 verbose=True,
                             )
                         output_log = buf.getvalue()
                         duration = None
                         for line in output_log.split("\n"):
                         print(f"    Generated audio: {duration:.2f} seconds")
                         temp_files.append(output_filename)
+                        # Upload individual WAV (named by affirmation text)
                         output_path = f"Affirmations/{txt_name}/{base_name}.wav"
                         commit_operations.append(
                             CommitOperationAdd(
                             )
                         )
+                        # For combined creation we must preserve original text + order
+                        audio_items.append(
+                            {
+                                "file_path": output_filename,
+                                "duration": duration,
+                                "text": sentence,  # ORIGINAL text (case preserved) for combined TXT
+                            }
+                        )
                     except Exception as e:
                         current_status = f"Error generating audio for sentence {idx+1}: {e}"
                         print(f"Generation error: {e}")
                         continue
+                # Create combined WAV(s) + TXT(s)
+                if audio_items and auto_process_running:
+                    current_status = f"Creating combined audio(s) for {txt_name}..."
+                    combined_results = create_combined_audios(audio_items)
+                    for c in combined_results:
+                        # upload combined wav
+                        commit_operations.append(
+                            CommitOperationAdd(
+                                path_in_repo=f"Affirmations/{txt_name}/{Path(c['wav_path']).name}",
+                                path_or_fileobj=c["wav_path"],
+                            )
+                        )
+                        # upload combined txt (same basename)
+                        commit_operations.append(
+                            CommitOperationAdd(
+                                path_in_repo=f"Affirmations/{txt_name}/{Path(c['txt_path']).name}",
+                                path_or_fileobj=c["txt_path"],
+                            )
+                        )
+                        temp_files.append(c["wav_path"])
+                        temp_files.append(c["txt_path"])
+                # Upload all generated files
+                if commit_operations and auto_process_running:
+                    current_status = f"Uploading files for {txt_name}..."
                     try:
                         api.create_commit(
                             repo_id=output_dataset_id,
                             repo_type="dataset",
                             operations=commit_operations,
+                            commit_message=f"Add affirmations + combined for {txt_name}",
                             token=token,
                         )
                         current_status = f"Successfully uploaded files for {txt_name}"
                             token=token,
                         )
+                        current_status = (
+                            f"✅ Completed {txt_name}: "
+                            f"{len(audio_items)} individual + "
+                            f"{sum(1 for _ in [op for op in commit_operations if isinstance(op, CommitOperationAdd)]) - len(audio_items)} combined assets"
+                        )
                     except Exception as e:
                         current_status = f"Upload/Move error for {txt_name}: {e}"
             except Exception as e:
                 current_status = f"Error processing {txt_name}: {e}"
                 print(f"Error: {e}")
+                # Cleanup on failure too
+                for temp_file in temp_files:
+                    try:
+                        if os.path.exists(temp_file):
+                            os.remove(temp_file)
+                    except Exception:
+                        pass
                 continue
         if auto_process_running:
 # Create Gradio interface
 with gr.Blocks(title="IndexTTS2 with Auto-Processing") as demo:
     gr.Markdown("# 🎤 IndexTTS2 Voice Synthesis")
+    gr.Markdown("State-of-the-art TTS with auto-processing and combined audio generation")
     # Manual tab
     with gr.Tab("Manual Processing"):
                         step=0.1,
                         label="Emotion strength",
                     )
+                    use_emo_text = gr.Checkbox(label="Use text-based emotion", value=False)
             with gr.Column():
+                generate_btn = gr.Button("🎙️ Generate", variant="primary", size="lg")
                 output_audio = gr.Audio(label="Generated audio", type="numpy")
         generate_btn.click(
             manual_generate,
+            inputs=[text_input, reference_audio, emotion_audio, emo_alpha, use_emo_text],
             outputs=output_audio,
         )
                 - 🎙️ Voice: `Mo.wav`
                 - ✂️ Delimiter: `.-`
                 - 📝 Structure: `/Affirmations/[name]/`
+                - ⏰ Combined: Max 29:50 chunks
+                - ⏸️ Pauses: 1.5 seconds between audios
+                - 🧾 TXT: one `combined_XXX.txt` per combined wav
                 """
                 )
                 )
                 with gr.Row():
+                    start_btn = gr.Button("▶️ Start Processing", variant="primary", scale=2)
                     stop_btn = gr.Button("⏹️ Stop", variant="stop", scale=1)
                     refresh_btn = gr.Button("🔄 Refresh", scale=1)
+                message_display = gr.Textbox(label="Message", interactive=False, visible=False)
         # Event handlers
+        start_btn.click(start_auto_process, outputs=[message_display, status_display])
+        stop_btn.click(stop_auto_process, outputs=[message_display, status_display])
         refresh_btn.click(get_status, outputs=status_display)
     # Footer