File size: 5,649 Bytes
d2b29c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import gradio as gr
import numpy as np
import librosa
import soundfile as sf
import pandas as pd
def generate_audio(clip_length=4.0, fade_in_duration=0.5, fade_out_duration=0.5, volume_factor=0.3):
    """Mix narration.wav with faded snippets of baa.wav and murmur.wav.

    The baa clip is mixed in at 0:05 and the murmur clip at 0:15, each
    volume-matched to the narration, attenuated by ``volume_factor`` and
    shaped with linear fade-in/fade-out envelopes.

    Args:
        clip_length: Seconds of each sound effect to use.
        fade_in_duration: Fade-in length as a factor (0-1) of the clip length.
        fade_out_duration: Fade-out length as a factor (0-1) of the clip length.
        volume_factor: Attenuation applied to the volume-matched effects.

    Returns:
        ``(sample_rate, samples)`` tuple suitable for ``gr.Audio``.
    """
    # Load audio files; resample the effects to the narration's rate.
    narration, sr = librosa.load('narration.wav', sr=None)
    baa, _ = librosa.load('baa.wav', sr=sr)
    murmur, _ = librosa.load('murmur.wav', sr=sr)

    def _rms(signal):
        # Root-mean-square amplitude; 0.0 for an empty signal.
        return float(np.sqrt(np.mean(signal ** 2))) if len(signal) else 0.0

    narration_rms = _rms(narration)

    def _match_volume(signal):
        # Scale the effect so its RMS matches the narration's.
        # Guard against division by zero for a silent/empty file.
        signal_rms = _rms(signal)
        return signal * (narration_rms / signal_rms) if signal_rms > 0 else signal

    def _faded_clip(source):
        # First clip_length seconds, attenuated, with linear fades.
        # fade_*_duration are factors (0-1) of the clip length.
        clip = source[:int(clip_length * sr)] * volume_factor
        n_in = int(fade_in_duration * len(clip))
        n_out = int(fade_out_duration * len(clip))
        if n_in > 0:
            clip[:n_in] *= np.linspace(0, 1, n_in)
        if n_out > 0:
            clip[-n_out:] *= np.linspace(1, 0, n_out)
        return clip

    # Narration is the baseline track of the mix.
    combined = np.zeros(len(narration))
    combined += narration

    def _mix_at(clip, offset_seconds):
        # Add the clip into the mix at the offset, truncating any part
        # that runs past the end of the narration (previously such clips
        # were silently dropped in their entirety).
        start = int(offset_seconds * sr)
        if start >= len(combined):
            return
        end = min(start + len(clip), len(combined))
        combined[start:end] += clip[:end - start]

    _mix_at(_faded_clip(_match_volume(baa)), 5)
    _mix_at(_faded_clip(_match_volume(murmur)), 15)

    # Peak-normalize only if mixing pushed the signal past full scale.
    peak = np.max(np.abs(combined)) if len(combined) else 0.0
    if peak > 1.0:
        combined = combined / peak
    return (sr, combined)
def visualize_sfx(sound_effect_clip_length, fade_in_duration, fade_out_duration, sound_effect_volume_factor):
    """Return a DataFrame tracing the sound-effect volume envelope.

    Args:
        sound_effect_clip_length: Clip length in seconds.
        fade_in_duration: Fade-in length as a factor (0-1) of the clip length.
        fade_out_duration: Fade-out length as a factor (0-1) of the clip length.
        sound_effect_volume_factor: Peak volume of the effect.

    Returns:
        ``pd.DataFrame`` with "time" (seconds) and "volume" columns for
        ``gr.LinePlot``.
    """
    # Convert fade factors into seconds.
    fade_in_seconds = fade_in_duration * sound_effect_clip_length
    fade_out_seconds = fade_out_duration * sound_effect_clip_length
    # High-resolution time axis for a smooth line (10 ms steps).
    time_resolution = 0.01
    times = np.arange(0, sound_effect_clip_length + time_resolution, time_resolution)
    # Build the envelope vectorized. Both fades are applied
    # multiplicatively, matching how generate_audio() applies them to the
    # samples: the old per-point loop gave fade-in precedence where the
    # fade regions overlap, so the plot could disagree with the audio.
    # np.clip also prevents a tiny negative volume when float error makes
    # the last time point overshoot the clip length.
    envelope = np.ones_like(times)
    if fade_in_seconds > 0:
        envelope *= np.clip(times / fade_in_seconds, 0.0, 1.0)
    if fade_out_seconds > 0:
        envelope *= np.clip((sound_effect_clip_length - times) / fade_out_seconds, 0.0, 1.0)
    return pd.DataFrame({
        "time": times,
        "volume": sound_effect_volume_factor * envelope
    })
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # Envelope controls for the sound-effect clips.
            clip_length_slider = gr.Slider(minimum=0.5, maximum=5, value=4.0, step=0.1, label="Sound Effect Clip Length (seconds)")
            fade_in_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.05, label="Fade In Duration Factor", info="0.0 = no fade in, 1.0 = fade in over entire clip")
            fade_out_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.05, label="Fade Out Duration Factor", info="0.0 = no fade out, 1.0 = fade out over entire clip")
            volume_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.15, step=0.05, label="Sound Effect Volume Factor", info="0.1 is 10% of the narration volume, 1.0 is 100% of the original volume")
            envelope_plot = gr.LinePlot(label="Sound Effect Volume Envelope", x="time", y="volume", y_lim=[0, 1])
            render_button = gr.Button("Generate Audio")
        with gr.Column():
            mixed_audio = gr.Audio()

    # All four sliders drive both the live envelope preview and the mix.
    envelope_controls = [clip_length_slider, fade_in_slider, fade_out_slider, volume_slider]

    # Redraw the envelope on page load and whenever a slider moves.
    gr.on(
        [demo.load] + [control.change for control in envelope_controls],
        fn=visualize_sfx,
        inputs=envelope_controls,
        outputs=envelope_plot
    )
    render_button.click(generate_audio, inputs=envelope_controls, outputs=mixed_audio)
if __name__ == "__main__":
    demo.launch()