# FRIDA-Qwen3 / handler.py
from typing import Dict, List, Any
import soundfile as sf
from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info
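
# `qwen_omni_utils` is Qwen's companion "qwen-omni-utils" pip package; its
# `process_mm_info` helper loads the audio/image/video referenced in the chat
# messages (local paths or URLs) into arrays the processor can consume.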


class EndpointHandler:
    """Custom Inference Endpoints handler: instantiated once with the local
    model repository path, then called once per deserialized JSON request."""

    def __init__(self, path: str = "./"):
        # `dtype="auto"` keeps the checkpoint's dtype; `device_map="auto"`
        # spreads the large MoE model across the available accelerators.
        self.model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
            path,
            dtype="auto",
            device_map="auto",
        )
        self.processor = Qwen3OmniMoeProcessor.from_pretrained(path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Request payload: a chat-style "messages" list plus optional flags.
        messages = data.get("messages", [])
        use_audio_in_video = data.get("use_audio_in_video", True)
        speaker = data.get("speaker", "Ethan")

        # Render the conversation with the model's chat template, leaving the
        # generation prompt open for the assistant turn.
        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # Load any multimodal content referenced in the messages.
        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
        inputs = self.processor(
            text=text,
            audio=audios,
            images=images,
            videos=videos,
            return_tensors="pt",
            padding=True,
            use_audio_in_video=use_audio_in_video,
        )
        # Match the model's device and dtype before generation.
        inputs = inputs.to(self.model.device).to(self.model.dtype)
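
        # `generate` on the Omni model returns both the thinker's text output
        # and the talker's audio waveform (None when no speech is produced);
        # `thinker_return_dict_in_generate=True` makes the text side a dict
        # whose `.sequences` holds the generated token ids.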
        text_ids, audio = self.model.generate(
            **inputs,
            speaker=speaker,
            thinker_return_dict_in_generate=True,
            use_audio_in_video=use_audio_in_video,
        )
        # Decode only the newly generated tokens, skipping the prompt prefix.
        text_output = self.processor.batch_decode(
            text_ids.sequences[:, inputs["input_ids"].shape[1]:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        result = {"generated_text": text_output}
        if audio is not None:
            # Write the 24 kHz talker waveform to a WAV file and return its
            # path alongside the text.
            sf.write("output.wav", audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
            result["audio_path"] = "output.wav"
        return [result]
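

# --- Illustrative local smoke test (not part of the Inference Endpoints
# contract). A minimal sketch, assuming the checkpoint lives in the current
# directory; the request below is a hypothetical example that mirrors the
# payload shape consumed by __call__ above.
if __name__ == "__main__":
    handler = EndpointHandler(path="./")
    request = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Introduce yourself in one sentence."},
                ],
            }
        ],
        "use_audio_in_video": False,
        "speaker": "Ethan",
    }
    outputs = handler(request)
    print(outputs[0]["generated_text"])
    print(outputs[0].get("audio_path"))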