Spaces:

mnhatdaous
/

learnable-speech

Sleeping

App Files Files Community

mnhatdaous committed on Sep 9

Commit

421543d

1 Parent(s): 1c43d7b

Add Hugging Face Space configuration with Docker support

Browse files

Files changed (5) hide show

.dockerignore +45 -0
Dockerfile +30 -0
README_HF.md +53 -0
app.py +127 -0
requirements-hf.txt +14 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,45 @@

+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+env/
+venv/
+.venv/
+pip-log.txt
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+.git/
+.mypy_cache/
+.pytest_cache/
+.hypothesis/
+.DS_Store
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# Large model files (download separately)
+*.pt
+*.pth
+*.bin
+*.safetensors
+*.ckpt
+# Dataset files
+*.wav
+*.mp3
+*.flac
+*.parquet
+# Logs and temporary files
+logs/
+wandb/
+tmp/
+temp/

Dockerfile ADDED Viewed

	@@ -0,0 +1,30 @@

+FROM python:3.10-slim
+WORKDIR /app
+# Install system dependencies; --no-install-recommends skips optional extras
+# so the image stays small
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    ffmpeg \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first for better caching
+COPY requirements-hf.txt ./requirements.txt
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Hugging Face Docker Spaces run the container as UID 1000; create a matching
+# user that owns /app so the app can write there without permission errors
+RUN useradd -m -u 1000 user && chown user /app
+# Copy the entire project
+COPY --chown=user . .
+# Set environment variables
+ENV HOME=/home/user
+ENV PYTHONPATH=/app
+ENV GRADIO_SERVER_NAME=0.0.0.0
+ENV GRADIO_SERVER_PORT=7860
+# Expose the port
+EXPOSE 7860
+# Drop root privileges before starting the app
+USER user
+# Run the application
+CMD ["python", "app.py"]

README_HF.md ADDED Viewed

	@@ -0,0 +1,53 @@

+---
+title: Learnable Speech
+emoji: 🎤
+colorFrom: blue
+colorTo: purple
+sdk: docker
+pinned: false
+license: apache-2.0
+app_port: 7860
+---
+# Learnable-Speech: High-Quality 24kHz Speech Synthesis
+An unofficial implementation based on improvements to CosyVoice, with a learnable encoder and DAC-VAE.
+## Demo
+This Space provides a demo interface for the Learnable-Speech model. Currently, it shows a placeholder implementation. To use the actual trained model, you would need to:
+1. Train the model using the provided training pipeline
+2. Upload the trained checkpoints
+3. Replace the placeholder inference code with actual model loading and inference
+## Features
+- **24kHz Audio Support**: High-quality audio generation at 24kHz sampling rate
+- **Flow matching AE**: Flow matching training for autoencoders
+- **Immiscible assignment**: Supports adding immiscible noise during training
+- **Contrastive Flow matching**: Supports contrastive training
+## Architecture
+### Stage 1: Audio to Discrete Tokens
+Converts raw audio into discrete representations using the FSQ (S3Tokenizer) framework.
+### Stage 2: Discrete Tokens to Continuous Latent Space
+Maps discrete tokens to a continuous latent space using a Variational Autoencoder (VAE).
+## Links
+- [GitHub Repository](https://github.com/primepake/learnable-speech)
+- [Technical Paper](https://arxiv.org/pdf/2505.07916)
+- [CosyVoice2](https://github.com/FunAudioLLM/CosyVoice)
+## Usage
+1. Enter text in the text box
+2. Select a speaker ID (0-10)
+3. Click "Generate Speech" to synthesize audio
+**Note**: This is currently a placeholder demo. The actual model requires training first.

app.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import gradio as gr
+import numpy as np
+def synthesize_speech(text, speaker_id=0):
+    """
+    Generate a placeholder waveform standing in for synthesized speech.
+
+    No model is involved: the output is a decaying two-harmonic tone with a
+    little Gaussian noise, so results differ slightly between calls.
+    Replace this with actual model inference when you have trained models.
+
+    Args:
+        text: Text to "synthesize". Only its length is used, to pick the
+            clip duration (0.08 s per character, never less than 1 s).
+        speaker_id: Number that raises the base pitch by 50 Hz per step
+            above 440 Hz.
+
+    Returns:
+        None when ``text`` is None, empty or whitespace-only; otherwise a
+        ``(sample_rate, audio)`` tuple where ``sample_rate`` is 24000 and
+        ``audio`` is a 1-D float32 NumPy array.
+    """
+    # Callers may pass None instead of ""; treat both as "nothing to say"
+    # rather than failing on None.strip()
+    if text is None or not text.strip():
+        return None
+    # This is a placeholder - replace with actual model inference
+    sample_rate = 24000
+    duration = max(1.0, len(text) * 0.08)  # rough estimate
+    samples = int(sample_rate * duration)
+    # Build the time axis from sample indices so consecutive samples are
+    # exactly 1/sample_rate apart (an endpoint-inclusive linspace is not)
+    t = np.arange(samples) / sample_rate
+    frequency = 440 + (speaker_id * 50)  # vary frequency by speaker
+    # Fundamental plus its octave, each with an exponential decay, plus noise
+    audio = (
+        0.3 * np.sin(2 * np.pi * frequency * t) * np.exp(-t/(duration*0.8)) +
+        0.1 * np.sin(2 * np.pi * frequency * 2 * t) * np.exp(-t/duration) +
+        0.05 * np.random.randn(samples)  # add some noise
+    )
+    # Apply a 100 ms linear fade in/out to avoid clicks at the clip edges;
+    # the clip is always at least 1 s, so the two fades never overlap
+    fade_samples = int(0.1 * sample_rate)
+    audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
+    audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)
+    return (sample_rate, audio.astype(np.float32))
+def create_demo():
+    # Build the Gradio Blocks UI for the demo and return it without launching.
+    # Components are laid out in the order they are created inside the nested
+    # context managers, so statement order here determines the page layout.
+    with gr.Blocks(title="Learnable-Speech Demo", theme=gr.themes.Soft()) as demo:
+        # Page header
+        gr.Markdown(
+            """
+            # 🎤 Learnable-Speech: High-Quality 24kHz Speech Synthesis
+            An unofficial implementation based on improvements of CosyVoice with learnable encoder and DAC-VAE.
+            > **Note**: This is a demo interface. To use the actual model, you need to train it first using the provided training pipeline.
+            """
+        )
+        # Two columns: inputs in the first, generated audio in the second
+        with gr.Row():
+            with gr.Column():
+                text_input = gr.Textbox(
+                    label="Text to synthesize",
+                    placeholder="Enter text here...",
+                    lines=3,
+                    value="Hello, this is a demo of Learnable-Speech synthesis."
+                )
+                with gr.Row():
+                    # Integer speaker IDs 0-10, passed to synthesize_speech as speaker_id
+                    speaker_slider = gr.Slider(
+                        minimum=0,
+                        maximum=10,
+                        value=0,
+                        step=1,
+                        label="Speaker ID"
+                    )
+                generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+            with gr.Column():
+                # type="numpy" matches the (sample_rate, ndarray) tuple that
+                # synthesize_speech returns
+                audio_output = gr.Audio(
+                    label="Generated Speech",
+                    type="numpy"
+                )
+        # Static project description, collapsed by default (open=False)
+        with gr.Accordion("📋 Project Information", open=False):
+            gr.Markdown(
+                """
+                ### Key Features
+                - **24kHz Audio Support**: High-quality audio generation at 24kHz sampling rate
+                - **Flow matching AE**: Flow matching training for autoencoders
+                - **Immiscible assignment**: Support immiscible adding noise while training
+                - **Contrastive Flow matching**: Support Contrastive training
+                ### Architecture
+                **Stage 1**: Audio to Discrete Tokens - Converts raw audio into discrete representations using FSQ (S3Tokenizer)
+                **Stage 2**: Discrete Tokens to Continuous Latent Space - Maps discrete tokens to continuous latent space using VAE
+                ### Training Pipeline
+                1. Extract discrete tokens using trained FSQ S3Tokenizer
+                2. Generate continuous latent representations using trained DAC-VAE
+                3. Train Stage 1: BPE tokens → Discrete FSQ
+                4. Train Stage 2: Discrete FSQ → DAC-VAE Continuous latent space
+                ### Links
+                - [GitHub Repository](https://github.com/primepake/learnable-speech)
+                - [Technical Paper](https://arxiv.org/pdf/2505.07916)
+                """
+            )
+        # Example inputs: each row is [text, speaker_id], matching `inputs` below.
+        # cache_examples=False means outputs are not pre-computed for these rows
+        # (NOTE(review): presumably clicking a row only fills the inputs - confirm
+        # against the pinned Gradio version).
+        gr.Examples(
+            examples=[
+                ["Hello everyone! I am here to tell you that Learnable-Speech is amazing!", 0],
+                ["The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle.", 1],
+                ["We propose Learnable-Speech, a new approach to neural text-to-speech synthesis.", 2],
+                ["This implementation uses flow matching for high-quality 24kHz audio generation.", 3],
+            ],
+            inputs=[text_input, speaker_slider],
+            outputs=audio_output,
+            fn=synthesize_speech,
+            cache_examples=False,
+        )
+        # Clicking the button runs synthesize_speech(text, speaker_id) and sends
+        # its result to the audio player
+        generate_btn.click(
+            fn=synthesize_speech,
+            inputs=[text_input, speaker_slider],
+            outputs=audio_output
+        )
+    return demo
+if __name__ == "__main__":
+    import os
+
+    demo = create_demo()
+    # Honor GRADIO_SERVER_NAME / GRADIO_SERVER_PORT when they are set in the
+    # environment; otherwise fall back to the previous hard-coded values
+    # (all interfaces, port 7860).
+    demo.launch(
+        server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
+        server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
+        share=False
+    )

requirements-hf.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+gradio==4.44.0
+torch==2.1.0
+torchaudio==2.1.0
+numpy==1.24.3
+soundfile==0.12.1
+librosa==0.10.1
+transformers==4.36.0
+omegaconf==2.3.0
+hydra-core==1.3.2
+# Optional: Add these if you need the full training pipeline
+# deepspeed==0.12.6
+# tensorboard==2.14.0
+# matplotlib==3.7.2