Spaces:

AmaniQuery
/

amaniquery-vibevoice

Running

App Files Files Community

Benaah committed on about 16 hours ago

Commit

2455231

verified ·

1 Parent(s): 1d4ca79

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

Dockerfile +55 -0
README.md +44 -12
app.py +198 -0
requirements.txt +12 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,55 @@

+# AmaniQuery VibeVoice - Hugging Face Spaces Dockerfile
+# Dedicated TTS service using Microsoft VibeVoice model
+FROM python:3.11-slim AS base
+# Changing this value invalidates the build cache for the RUN steps below.
+ARG CACHEBUST=2025-12-16
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONPATH=/app \
+    PORT=7860 \
+    HF_HOME=/app/models
+# Install system dependencies for audio processing (curl is used by HEALTHCHECK)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    curl \
+    ffmpeg \
+    libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Copy requirements for VibeVoice
+COPY hf-spaces/vibevoice/requirements.txt ./
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cpu && \
+    pip install --no-cache-dir -r requirements.txt
+# Create the runtime user and writable directories while /app is still small,
+# so the ownership change does not re-copy the model weights into a new layer.
+RUN useradd --create-home --shell /bin/bash --uid 1000 app && \
+    mkdir -p /app/models /app/cache && \
+    chown -R app:app /app
+USER app
+# Pre-download VibeVoice model into HF_HOME as the runtime user
+RUN python -c "from huggingface_hub import snapshot_download; snapshot_download('microsoft/VibeVoice-Realtime-0.5B')"
+# Copy VibeVoice library and the service entry point, owned by the runtime user
+COPY --chown=app:app VibeVoice/vibevoice/ ./vibevoice/
+COPY --chown=app:app hf-spaces/vibevoice/app.py ./
+EXPOSE ${PORT}
+HEALTHCHECK --interval=30s --timeout=30s --start-period=120s --retries=3 \
+    CMD curl -f http://localhost:${PORT}/health || exit 1
+# Default voice configuration
+ENV VIBEVOICE_DEVICE=cpu \
+    VIBEVOICE_VOICE=Wayne \
+    VIBEVOICE_CFG_SCALE=1.5
+# Start through a shell so the listening port follows PORT, matching EXPOSE and
+# the HEALTHCHECK above; `exec` replaces the shell so uvicorn receives signals.
+CMD ["sh", "-c", "exec uvicorn app:app --host 0.0.0.0 --port \"${PORT:-7860}\" --workers 1"]

README.md CHANGED Viewed

@@ -1,12 +1,44 @@
----
-title: Vibevoice
-emoji: 📉
-colorFrom: indigo
-colorTo: red
-sdk: docker
-pinned: false
-license: mit
-short_description: VibeVoice Agent
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: AmaniQuery VibeVoice
+emoji: 🎤
+colorFrom: purple
+colorTo: pink
+sdk: docker
+app_port: 7860
+pinned: false
+license: mit
+---
+# AmaniQuery VibeVoice
+Text-to-Speech service for AmaniQuery using Microsoft VibeVoice model.
+## Features
+- 🎤 High-quality text-to-speech synthesis
+- 💬 Conversational voice responses
+- 🔊 Multiple voice presets (Wayne, Angela, etc.)
+- ⚡ Real-time streaming audio
+## API Endpoints
+- `GET /health` - Health check
+- `POST /api/v1/voice/speak` - Convert text to speech
+- `POST /api/v1/voice/chat` - Conversational voice synthesis
+- `GET /api/v1/voice/voices` - List available voice presets
+## Request Format
+```json
+{
+  "text": "Hello, I am your legal research assistant.",
+  "voice": "Wayne",
+  "cfg_scale": 1.5
+}
+```
+## Environment Variables
+Secrets to configure in HF Space settings (when `JWT_SECRET` is unset, requests are accepted without authentication):
+- `JWT_SECRET` - Shared JWT secret for cross-service auth
+- `BACKEND_SPACE_URL` - URL to main AmaniQuery backend

app.py ADDED Viewed

	@@ -0,0 +1,198 @@

+"""
+AmaniQuery VibeVoice Service - FastAPI wrapper for TTS
+Standalone HuggingFace Space for voice synthesis
+"""
+import os
+import io
+import wave
+import logging
+from typing import Optional
+from fastapi import FastAPI, HTTPException, Header, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse, JSONResponse
+from pydantic import BaseModel, Field
+# Configure logging: INFO level via basicConfig; `logger` is this module's named logger
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# FastAPI application object; the route handlers below register themselves on it
+app = FastAPI(
+    title="AmaniQuery VibeVoice",
+    description="Text-to-Speech service for AmaniQuery using Microsoft VibeVoice",
+    version="1.0.0",
+)
+# CORS configuration. NOTE(review): wildcard origins combined with allow_credentials=True is very permissive - confirm this is intended
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, restrict to specific origins
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Global model and processor instances; both stay None until get_tts_model() loads them
+_tts_model = None
+_processor = None
+class SpeakRequest(BaseModel):
+    """Request model for text-to-speech"""
+    text: str = Field(..., description="Text to convert to speech", max_length=5000)
+    voice: str = Field(default="Wayne", description="Voice preset to use")
+    cfg_scale: float = Field(default=1.5, ge=1.0, le=3.0, description="Classifier-free guidance scale")
+class VoiceInfo(BaseModel):
+    """Voice preset information"""
+    name: str
+    description: str
+# Available voice presets
+VOICE_PRESETS = [
+    VoiceInfo(name="Wayne", description="Male, American English, Calm"),
+    VoiceInfo(name="Angela", description="Female, American English, Professional"),
+    VoiceInfo(name="Aria", description="Female, American English, Warm"),
+    VoiceInfo(name="Davis", description="Male, American English, Confident"),
+]
+def get_tts_model():
+    """Lazy load the TTS model"""
+    global _tts_model, _processor
+    if _tts_model is None:
+        logger.info("Loading VibeVoice model...")
+        try:
+            import torch
+            from vibevoice.modular import (
+                VibeVoiceStreamingForConditionalGenerationInference,
+                VibeVoiceStreamingConfig,
+            )
+            from vibevoice.processor import VibeVoiceStreamingProcessor
+            device = os.getenv("VIBEVOICE_DEVICE", "cpu")
+            if device == "auto":
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+            config = VibeVoiceStreamingConfig(
+                model_path="microsoft/VibeVoice-Realtime-0.5B",
+                device=device,
+            )
+            _tts_model = VibeVoiceStreamingForConditionalGenerationInference(config)
+            _processor = VibeVoiceStreamingProcessor()
+            logger.info(f"VibeVoice model loaded on {device}")
+        except Exception as e:
+            logger.error(f"Failed to load VibeVoice model: {e}")
+            raise
+    return _tts_model, _processor
+def validate_jwt(authorization: Optional[str] = None) -> bool:
+    """Validate JWT token for cross-service auth (optional)"""
+    jwt_secret = os.getenv("JWT_SECRET")
+    if not jwt_secret:
+        # No JWT configured, allow all requests
+        return True
+    if not authorization or not authorization.startswith("Bearer "):
+        return False
+    try:
+        import jwt
+        token = authorization.replace("Bearer ", "")
+        jwt.decode(token, jwt_secret, algorithms=["HS256"])
+        return True
+    except Exception:
+        return False
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {"status": "healthy", "service": "vibevoice"}
+@app.get("/api/v1/voice/voices")
+async def list_voices():
+    """List available voice presets"""
+    return {"voices": [v.dict() for v in VOICE_PRESETS]}
+@app.post("/api/v1/voice/speak")
+async def speak(
+    request: SpeakRequest,
+    authorization: Optional[str] = Header(None),
+):
+    """Convert text to speech"""
+    # Optional JWT validation
+    if os.getenv("JWT_SECRET") and not validate_jwt(authorization):
+        raise HTTPException(status_code=401, detail="Invalid or missing authentication")
+    try:
+        model, processor = get_tts_model()
+        # Generate audio
+        logger.info(f"Generating speech for: {request.text[:50]}...")
+        # Process text and generate audio
+        audio_data = model.generate(
+            text=request.text,
+            voice=request.voice,
+            cfg_scale=request.cfg_scale,
+        )
+        # Convert to WAV format
+        audio_buffer = io.BytesIO()
+        with wave.open(audio_buffer, 'wb') as wav_file:
+            wav_file.setnchannels(1)
+            wav_file.setsampwidth(2)  # 16-bit
+            wav_file.setframerate(24000)  # Sample rate
+            wav_file.writeframes(audio_data.tobytes())
+        audio_buffer.seek(0)
+        return StreamingResponse(
+            audio_buffer,
+            media_type="audio/wav",
+            headers={"Content-Disposition": "attachment; filename=speech.wav"}
+        )
+    except Exception as e:
+        logger.error(f"TTS generation failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Speech generation failed: {str(e)}")
+@app.post("/api/v1/voice/chat")
+async def voice_chat(
+    request: SpeakRequest,
+    authorization: Optional[str] = Header(None),
+):
+    """Generate conversational voice response (same as speak for now)"""
+    return await speak(request, authorization)
+@app.get("/")
+async def root():
+    """Root endpoint with service info"""
+    return {
+        "service": "AmaniQuery VibeVoice",
+        "version": "1.0.0",
+        "endpoints": {
+            "health": "/health",
+            "speak": "/api/v1/voice/speak",
+            "voices": "/api/v1/voice/voices",
+        }
+    }
+if __name__ == "__main__":
+    import uvicorn
+    port = int(os.getenv("PORT", 7860))
+    uvicorn.run(app, host="0.0.0.0", port=port)

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+# VibeVoice HuggingFace Space Requirements
+fastapi>=0.104.0
+uvicorn>=0.24.0
+python-multipart>=0.0.6
+pydantic>=2.0.0
+transformers>=4.36.0
+accelerate>=0.25.0
+soundfile>=0.12.1
+scipy>=1.11.0
+numpy>=1.24.0
+huggingface-hub>=0.20.0
+PyJWT>=2.8.0