rosassebastian2003 committed
Commit f2527c6
1 Parent(s): 8641305
Files changed (1)
  1. handler.py +41 -24
handler.py CHANGED
@@ -8,92 +8,107 @@ import os
 import tempfile
 import numpy as np
 
+# Model name (used as a fallback when 'path' is not provided)
+MODEL_NAME = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
+
 class EndpointHandler():
     def __init__(self, path=""):
+
+        # 1. Critical settings for loading the MoE model and enabling speech output
         model_kwargs = {
-            "device_map": "auto",
+            "device_map": "auto",  # distributes the weights across available GPUs
             "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else None,
-            "enable_audio_output": True
+            "enable_audio_output": True  # essential for loading the Talker (speech generator) component
         }
 
+        # 2. Load the generic text-generation pipeline (the wrapper for multimodal LLMs)
         self.pipeline = pipeline(
             task="text-generation",
-            model=path,
-            **model_kwargs
+            model=path or MODEL_NAME,
+            **model_kwargs  # inject the Qwen3-specific parameters
         )
 
+        # 3. System prompt required by Qwen3-Omni to generate natural-sounding speech
         self.system_prompt = (
            "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
            "capable of perceiving auditory and visual inputs, as well as generating text and speech."
         )
 
-        self.sampling_rate = self.pipeline.model.config.sampling_rate
+        # 4. Model sampling rate (needed for audio serialization in __call__)
+        self.sampling_rate = getattr(self.pipeline.model.config, 'sampling_rate', 24000)
 
     def _handle_audio_input(self, data: Dict[str, Any]) -> str:
+        """Decode the Base64 audio input and save it temporarily as a WAV file."""
         audio_data_base64 = data.get("audio_data")
         if not audio_data_base64:
             return None
 
         temp_file_path = None
         try:
             audio_bytes = base64.b64decode(audio_data_base64)
-
+            # Save to a temporary file so the pipeline can process it
             temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
             temp_file.write(audio_bytes)
             temp_file.close()
             temp_file_path = temp_file.name
-
             return temp_file_path
         except Exception as e:
-
             if temp_file_path and os.path.exists(temp_file_path):
                 os.remove(temp_file_path)
             raise ValueError(f"Error decoding and saving the Base64 audio: {e}")
 
     def _handle_audio_output(self, generated_audio: torch.Tensor, sampling_rate: int) -> str:
+        """Convert the output audio tensor to a WAV buffer and encode it as Base64."""
         audio_array = generated_audio.cpu().numpy().squeeze()
         if audio_array.dtype != np.float32:
             audio_array = audio_array.astype(np.float32)
 
-        encoded_audio = None
         with io.BytesIO() as buffer:
+            # Write the array as WAV
             wavfile.write(buffer, rate=sampling_rate, data=audio_array)
             buffer.seek(0)
-
-            encoded_audio = base64.b64encode(buffer.read()).decode('utf-8')
-
-        return encoded_audio
+            # Encode as Base64 for the JSON response
+            encoded_audio = base64.b64encode(buffer.read()).decode('utf-8')
+            return encoded_audio
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         prompt = data.get("inputs")
         if not prompt:
             raise ValueError("The 'inputs' field (text prompt) is required.")
 
         generation_kwargs = data.get("parameters", {})
         audio_file_path = None
 
         try:
+            # 1. Audio I/O handling (Base64 -> temporary file)
             audio_file_path = self._handle_audio_input(data)
 
+            # 2. The pipeline expects a list of multimodal inputs (text or audio)
             inputs_list = [prompt]
             if audio_file_path:
                 inputs_list.append(audio_file_path)
 
+            # 3. Generation configuration
             generation_kwargs.update({
-                "system_prompt": self.system_prompt,
-                "return_audio": True,
+                "system_prompt": self.system_prompt,  # required for speech quality
+                "return_audio": True,  # ask the pipeline to return the audio tensor
                 "max_new_tokens": generation_kwargs.get("max_new_tokens", 512),
             })
 
+            # 4. Run the pipeline
             raw_output = self.pipeline(inputs_list, **generation_kwargs)
 
+            # The pipeline returns a list of dicts; take the first result
             response = raw_output[0]
 
             final_response = {
                 "generated_text": response.get("generated_text"),
                 "audio_output": None
             }
 
+            # 5. Post-processing (tensor -> Base64 WAV)
             if "audio_array" in response:
                 encoded_audio = self._handle_audio_output(response["audio_array"], self.sampling_rate)
                 final_response["audio_output"] = encoded_audio
@@ -101,8 +116,10 @@
             return [final_response]
 
         except Exception as e:
+            # Error handling
             return [{"error": str(e)}]
 
         finally:
+            # 6. Temporary-file cleanup (critical housekeeping)
             if audio_file_path and os.path.exists(audio_file_path):
                 os.remove(audio_file_path)
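
For reference, a minimal client-side sketch of the request/response contract this handler implements. The payload keys ("inputs", "audio_data", "parameters") and response keys ("generated_text", "audio_output", "error") come straight from handler.py above; the endpoint URL, token, and file names are placeholders you must replace with your own deployment's values.

import base64
import json
import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder token

# Optional audio input: Base64-encode a local WAV file.
with open("question.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "inputs": "Please answer the question in the attached audio.",
    "audio_data": audio_b64,                # omit this key for text-only requests
    "parameters": {"max_new_tokens": 256},  # merged into generation_kwargs by the handler
}

resp = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"},
    data=json.dumps(payload),
)
result = resp.json()[0]  # the handler returns a single-element list

if "error" in result:
    raise RuntimeError(result["error"])
print(result["generated_text"])

# audio_output, when present, is the Base64 WAV produced by _handle_audio_output.
if result.get("audio_output"):
    with open("answer.wav", "wb") as f:
        f.write(base64.b64decode(result["audio_output"]))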
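
The same request dict can also be fed to the handler class directly for a local smoke test, assuming the repository is checked out and there is enough GPU memory to load the Qwen3-Omni weights; the prompt here is illustrative.

from handler import EndpointHandler

# An empty path falls back to MODEL_NAME inside __init__.
handler = EndpointHandler(path="")

output = handler({
    "inputs": "Introduce yourself in one sentence.",
    "parameters": {"max_new_tokens": 64},
})[0]

print(output.get("generated_text") or output.get("error"))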