from typing import Any, Dict, List

import soundfile as sf
from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor

from qwen_omni_utils import process_mm_info


class EndpointHandler:
    def __init__(self, path="./"):
        # Load the model and processor from the local repository path.
        self.model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
            path,
            dtype="auto",
            device_map="auto",
        )
        self.processor = Qwen3OmniMoeProcessor.from_pretrained(path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        messages = data.get("messages", [])
        use_audio_in_video = data.get("use_audio_in_video", True)
        speaker = data.get("speaker", "Ethan")

        # Render the conversation through the model's chat template and
        # extract any audio/image/video inputs referenced in the messages.
        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
        inputs = self.processor(
            text=text,
            audio=audios,
            images=images,
            videos=videos,
            return_tensors="pt",
            padding=True,
            use_audio_in_video=use_audio_in_video,
        )
        inputs = inputs.to(self.model.device).to(self.model.dtype)

        # Generate text token ids and, when the model produces speech, audio.
        text_ids, audio = self.model.generate(
            **inputs,
            speaker=speaker,
            thinker_return_dict_in_generate=True,
            use_audio_in_video=use_audio_in_video,
        )

        # Decode only the newly generated tokens, slicing off the prompt.
        text_output = self.processor.batch_decode(
            text_ids.sequences[:, inputs["input_ids"].shape[1]:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        result = {"generated_text": text_output}
        if audio is not None:
            # Write the generated speech to a WAV file and return its path.
            sf.write(
                "output.wav",
                audio.reshape(-1).detach().cpu().numpy(),
                samplerate=24000,
            )
            result["audio_path"] = "output.wav"
        return [result]
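

# --- Usage sketch (assumptions: the script runs locally next to the model
# weights, and the "messages" payload below follows the Qwen-Omni chat format).
# This is an illustrative smoke test, not part of the Inference Endpoints
# request contract.
if __name__ == "__main__":
    handler = EndpointHandler(path="./")
    payload = {
        "messages": [
            {
                "role": "user",
                "content": [{"type": "text", "text": "Introduce yourself briefly."}],
            }
        ],
        "use_audio_in_video": False,
        "speaker": "Ethan",
    }
    # Expected: a list with one dict holding "generated_text" and, if the model
    # produced speech, an "audio_path" pointing at output.wav.
    print(handler(payload))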