Benaah committed on
Commit
2455231
·
verified ·
1 Parent(s): 1d4ca79

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Dockerfile +55 -0
  2. README.md +44 -12
  3. app.py +198 -0
  4. requirements.txt +12 -0
Dockerfile ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AmaniQuery VibeVoice - Hugging Face Spaces Dockerfile
2
+ # Dedicated TTS service using Microsoft VibeVoice model
3
+
4
+ FROM python:3.11-slim AS base
5
+
6
+ ARG CACHEBUST=2025-12-16
7
+
8
+ ENV PYTHONDONTWRITEBYTECODE=1 \
9
+ PYTHONUNBUFFERED=1 \
10
+ PYTHONPATH=/app \
11
+ PORT=7860 \
12
+ HF_HOME=/app/models
13
+
14
+ # Install system dependencies for audio processing
15
+ RUN apt-get update && \
16
+ apt-get install -y --no-install-recommends \
17
+ curl \
18
+ build-essential \
19
+ ffmpeg \
20
+ libsndfile1 \
21
+ && rm -rf /var/lib/apt/lists/*
22
+
23
+ WORKDIR /app
24
+
25
+ # Copy requirements for VibeVoice
26
+ COPY hf-spaces/vibevoice/requirements.txt ./
27
+ RUN pip install --no-cache-dir --upgrade pip && \
28
+ pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cpu && \
29
+ pip install --no-cache-dir -r requirements.txt
30
+
31
+ # Pre-download VibeVoice model
32
+ RUN python -c "from huggingface_hub import snapshot_download; snapshot_download('microsoft/VibeVoice-Realtime-0.5B')"
33
+
34
+ # Copy VibeVoice library
35
+ COPY VibeVoice/vibevoice/ ./vibevoice/
36
+ COPY hf-spaces/vibevoice/app.py ./
37
+
38
+ # Create necessary directories and an unprivileged "app" user that owns /app
39
+ RUN mkdir -p /app/models /app/cache && \
40
+ useradd --create-home --shell /bin/bash app && \
41
+ chown -R app:app /app
42
+
43
+ USER app
44
+
45
+ EXPOSE ${PORT}
46
+
47
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=120s --retries=3 \
48
+ CMD curl -f http://localhost:${PORT}/health || exit 1
49
+
50
+ # Default voice configuration
51
+ ENV VIBEVOICE_DEVICE=cpu \
52
+ VIBEVOICE_VOICE=Wayne \
53
+ VIBEVOICE_CFG_SCALE=1.5
54
+
55
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
README.md CHANGED
@@ -1,12 +1,44 @@
1
- ---
2
- title: Vibevoice
3
- emoji: 📉
4
- colorFrom: indigo
5
- colorTo: red
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- short_description: VibeVoice Agent
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: AmaniQuery VibeVoice
3
+ emoji: 🎤
4
+ colorFrom: purple
5
+ colorTo: pink
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ license: mit
10
+ ---
11
+
12
+ # AmaniQuery VibeVoice
13
+
14
+ Text-to-Speech service for AmaniQuery using the Microsoft VibeVoice model.
15
+
16
+ ## Features
17
+
18
+ - 🎤 High-quality text-to-speech synthesis
19
+ - 💬 Conversational voice responses
20
+ - 🔊 Multiple voice presets (Wayne, Angela, etc.)
21
+ - ⚡ Real-time streaming audio
22
+
23
+ ## API Endpoints
24
+
25
+ - `GET /health` - Health check
26
+ - `POST /api/v1/voice/speak` - Convert text to speech
27
+ - `POST /api/v1/voice/chat` - Conversational voice synthesis
28
+ - `GET /api/v1/voice/voices` - List available voice presets
29
+
30
+ ## Request Format
31
+
32
+ ```json
33
+ {
34
+ "text": "Hello, I am your legal research assistant.",
35
+ "voice": "Wayne",
36
+ "cfg_scale": 1.5
37
+ }
38
+ ```
39
+
40
+ ## Environment Variables
41
+
42
+ Required secrets in HF Space settings:
43
+ - `JWT_SECRET` - Shared JWT secret (HS256) for cross-service auth; if it is unset, the API accepts unauthenticated requests
44
+ - `BACKEND_SPACE_URL` - URL to main AmaniQuery backend
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AmaniQuery VibeVoice Service - FastAPI wrapper for TTS
3
+ Standalone HuggingFace Space for voice synthesis
4
+ """
5
+
6
import asyncio
import io
import logging
import os
import wave
from typing import Optional

from fastapi import FastAPI, HTTPException, Header, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel, Field
15
+
16
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI app
app = FastAPI(
    title="AmaniQuery VibeVoice",
    description="Text-to-Speech service for AmaniQuery using Microsoft VibeVoice",
    version="1.0.0",
)

# CORS configuration.
# Allowed origins come from the comma-separated CORS_ALLOW_ORIGINS variable and
# default to "*" (any origin), which matches the previous behavior.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # Credentials (cookies / browser-managed HTTP auth) must not be combined
    # with wildcard origins. This service authenticates with an explicit
    # "Authorization: Bearer ..." header, which does not need credentialed CORS.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global model instance (lazy loaded by get_tts_model)
_tts_model = None
_processor = None
39
+
40
+
41
def _default_cfg_scale() -> float:
    """Return the default guidance scale taken from VIBEVOICE_CFG_SCALE.

    Falls back to 1.5 when the variable is unset, not a number, or outside
    the 1.0-3.0 range accepted by SpeakRequest.
    """
    try:
        value = float(os.getenv("VIBEVOICE_CFG_SCALE", "1.5"))
    except ValueError:
        return 1.5
    # The chained comparison is also False for NaN, so NaN falls back as well.
    if not 1.0 <= value <= 3.0:
        return 1.5
    return value


# Service-wide defaults; the Dockerfile sets both variables.
_DEFAULT_VOICE = os.getenv("VIBEVOICE_VOICE", "Wayne")
_DEFAULT_CFG_SCALE = _default_cfg_scale()


class SpeakRequest(BaseModel):
    """Request body for the text-to-speech endpoints.

    text: the text to synthesize, 1 to 5000 characters (empty text is rejected
        during validation instead of being passed to the model).
    voice: voice preset name; defaults to VIBEVOICE_VOICE or "Wayne".
    cfg_scale: classifier-free guidance scale between 1.0 and 3.0; defaults to
        VIBEVOICE_CFG_SCALE or 1.5.
    """
    text: str = Field(
        ...,
        description="Text to convert to speech",
        min_length=1,
        max_length=5000,
    )
    voice: str = Field(default=_DEFAULT_VOICE, description="Voice preset to use")
    cfg_scale: float = Field(
        default=_DEFAULT_CFG_SCALE,
        ge=1.0,
        le=3.0,
        description="Classifier-free guidance scale",
    )
46
+
47
+
48
class VoiceInfo(BaseModel):
    """Describes one selectable voice preset."""

    # Preset identifier
    name: str
    # Human-readable summary of the voice
    description: str
52
+
53
+
54
# Voice presets advertised by the service, as (name, description) pairs
VOICE_PRESETS = [
    VoiceInfo(name=preset_name, description=preset_description)
    for preset_name, preset_description in (
        ("Wayne", "Male, American English, Calm"),
        ("Angela", "Female, American English, Professional"),
        ("Aria", "Female, American English, Warm"),
        ("Davis", "Male, American English, Confident"),
    )
]
61
+
62
+
63
def get_tts_model():
    """Return the shared VibeVoice model and processor, loading them on first use.

    The target device is read from VIBEVOICE_DEVICE (default "cpu"); the value
    "auto" selects CUDA when torch reports it as available, otherwise CPU.

    Returns:
        A ``(model, processor)`` tuple cached in module globals.

    Raises:
        Exception: whatever the import or construction step raised. Nothing is
            cached in that case, so the next call retries a full load.
    """
    global _tts_model, _processor

    if _tts_model is not None and _processor is not None:
        return _tts_model, _processor

    logger.info("Loading VibeVoice model...")
    try:
        # Heavy imports are deferred so the app can start before the model is needed.
        import torch
        from vibevoice.modular import (
            VibeVoiceStreamingForConditionalGenerationInference,
            VibeVoiceStreamingConfig,
        )
        from vibevoice.processor import VibeVoiceStreamingProcessor

        device = os.getenv("VIBEVOICE_DEVICE", "cpu")
        if device == "auto":
            device = "cuda" if torch.cuda.is_available() else "cpu"

        # NOTE(review): constructor arguments are kept exactly as written;
        # confirm them against the bundled vibevoice package.
        config = VibeVoiceStreamingConfig(
            model_path="microsoft/VibeVoice-Realtime-0.5B",
            device=device,
        )
        model = VibeVoiceStreamingForConditionalGenerationInference(config)
        processor = VibeVoiceStreamingProcessor()
    except Exception as e:
        # logger.exception keeps the traceback, which the plain error log dropped.
        logger.exception(f"Failed to load VibeVoice model: {e}")
        raise

    # Publish both objects together so that a failure while building the
    # processor can never leave a cached model paired with a None processor.
    _tts_model, _processor = model, processor
    logger.info(f"VibeVoice model loaded on {device}")

    return _tts_model, _processor
95
+
96
+
97
def validate_jwt(authorization: Optional[str] = None) -> bool:
    """Validate the bearer JWT used for cross-service auth.

    Authentication is optional: when JWT_SECRET is unset or empty, every
    request is accepted.

    Args:
        authorization: Raw ``Authorization`` header value, expected in the
            form ``"Bearer <token>"``.

    Returns:
        True when no secret is configured or the token verifies with HS256
        against JWT_SECRET; False for a missing or malformed header, an empty
        token, a token that fails verification, or a missing PyJWT package.
    """
    jwt_secret = os.getenv("JWT_SECRET")
    if not jwt_secret:
        # No JWT configured, allow all requests
        return True

    bearer_prefix = "Bearer "
    if not authorization or not authorization.startswith(bearer_prefix):
        return False

    # Strip only the leading scheme; str.replace would also remove any later
    # "Bearer " occurrence inside the header value.
    token = authorization[len(bearer_prefix):].strip()
    if not token:
        return False

    try:
        import jwt
    except ImportError:
        # A missing dependency is a deployment problem; fail closed but make it visible.
        logger.error("JWT_SECRET is set but PyJWT is not installed; rejecting request")
        return False

    try:
        jwt.decode(token, jwt_secret, algorithms=["HS256"])
    except Exception:
        # Any decode or verification failure means the caller is not authenticated.
        return False
    return True
114
+
115
+
116
@app.get("/health")
async def health_check():
    """Liveness probe: always reports the service as healthy."""
    payload = dict(status="healthy", service="vibevoice")
    return payload
120
+
121
+
122
@app.get("/api/v1/voice/voices")
async def list_voices():
    """List available voice presets.

    Returns:
        ``{"voices": [...]}`` with one ``{"name", "description"}`` object per
        entry in VOICE_PRESETS.
    """
    # model_dump() is the pydantic v2 API; .dict() is deprecated there, and
    # requirements.txt pins pydantic>=2.0.0.
    return {"voices": [voice.model_dump() for voice in VOICE_PRESETS]}
126
+
127
+
128
# Only one synthesis runs at a time on the shared model. Waiting requests
# suspend on this lock instead of blocking the event loop.
_generation_lock = asyncio.Lock()


def _encode_wav(audio_data, sample_rate: int = 24000) -> bytes:
    """Encode mono audio samples as a 16-bit PCM WAV file and return its bytes.

    Floating-point samples are treated as normalized to [-1.0, 1.0], clipped
    to that range and scaled to int16; other dtypes are cast to int16 as-is.
    """
    import numpy as np

    # assumes model.generate returns a NumPy-compatible array of mono samples
    # -- TODO confirm against the vibevoice package
    samples = np.asarray(audio_data).reshape(-1)
    if np.issubdtype(samples.dtype, np.floating):
        # Writing raw float bytes under a 16-bit header would produce noise.
        samples = np.clip(np.nan_to_num(samples), -1.0, 1.0) * 32767.0
    pcm = samples.astype("<i2")  # WAV PCM is little-endian

    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm.tobytes())
    return buffer.getvalue()


@app.post("/api/v1/voice/speak")
async def speak(
    request: SpeakRequest,
    authorization: Optional[str] = Header(None),
):
    """Convert text to speech and return it as a WAV attachment.

    Args:
        request: Text, voice preset and guidance scale for the synthesis.
        authorization: Optional ``Authorization`` header; checked by
            validate_jwt, which accepts every request when JWT_SECRET is unset.

    Returns:
        A StreamingResponse carrying ``audio/wav`` data (mono, 16-bit, 24 kHz).

    Raises:
        HTTPException: 401 when authentication fails, 500 when model loading
            or synthesis fails.
    """
    if not validate_jwt(authorization):
        raise HTTPException(status_code=401, detail="Invalid or missing authentication")

    try:
        logger.info(f"Generating speech for: {request.text[:50]}...")

        async with _generation_lock:
            # Model loading and synthesis are blocking, so both run in a worker
            # thread; the event loop stays free to answer /health and other
            # routes. The lock also prevents two concurrent first requests
            # from loading the model twice.
            model, _ = await asyncio.to_thread(get_tts_model)
            audio_data = await asyncio.to_thread(
                model.generate,
                text=request.text,
                voice=request.voice,
                cfg_scale=request.cfg_scale,
            )

        wav_bytes = _encode_wav(audio_data)

        return StreamingResponse(
            io.BytesIO(wav_bytes),
            media_type="audio/wav",
            headers={"Content-Disposition": "attachment; filename=speech.wav"},
        )

    except Exception as e:
        logger.exception(f"TTS generation failed: {e}")
        raise HTTPException(
            status_code=500, detail=f"Speech generation failed: {str(e)}"
        ) from e
170
+
171
+
172
@app.post("/api/v1/voice/chat")
async def voice_chat(
    request: SpeakRequest,
    authorization: Optional[str] = Header(None),
):
    """Conversational voice endpoint; currently delegates straight to speak()."""
    response = await speak(request, authorization)
    return response
179
+
180
+
181
@app.get("/")
async def root():
    """Root endpoint with service info.

    Returns:
        The service name, version and a map of the public endpoint paths.
    """
    return {
        "service": "AmaniQuery VibeVoice",
        "version": "1.0.0",
        "endpoints": {
            "health": "/health",
            "speak": "/api/v1/voice/speak",
            # The chat route exists but was missing from this listing.
            "chat": "/api/v1/voice/chat",
            "voices": "/api/v1/voice/voices",
        },
    }
193
+
194
+
195
if __name__ == "__main__":
    # Direct execution: serve the app on all interfaces, using PORT
    # (default 7860) as the listening port.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VibeVoice HuggingFace Space Requirements
2
+ fastapi>=0.104.0
3
+ uvicorn>=0.24.0
4
+ python-multipart>=0.0.6
5
+ pydantic>=2.0.0
6
+ transformers>=4.36.0
7
+ accelerate>=0.25.0
8
+ soundfile>=0.12.1
9
+ scipy>=1.11.0
10
+ numpy>=1.24.0
11
+ huggingface-hub>=0.20.0
12
+ PyJWT>=2.8.0