Feat: handler.py added

Browse files

Files changed (4) hide show

.DS_Store +0 -0
handler.py +53 -0
requirements.txt +6 -0
test.py +26 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

handler.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from typing import Dict, List, Any
+import soundfile as sf
+from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
+from qwen_omni_utils import process_mm_info
+class EndpointHandler():
+    def __init__(self, path="./"):
+        self.model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
+            path,
+            dtype="auto",
+            device_map="auto",
+        )
+        self.processor = Qwen3OmniMoeProcessor.from_pretrained(path)
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        messages = data.get("messages", [])
+        use_audio_in_video = data.get("use_audio_in_video", True)
+        speaker = data.get("speaker", "Ethan")
+        text = self.processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
+        inputs = self.processor(
+            text=text,
+            audio=audios,
+            images=images,
+            videos=videos,
+            return_tensors="pt",
+            padding=True,
+            use_audio_in_video=use_audio_in_video
+        )
+        inputs = inputs.to(self.model.device).to(self.model.dtype)
+        text_ids, audio = self.model.generate(
+            **inputs,
+            speaker=speaker,
+            thinker_return_dict_in_generate=True,
+            use_audio_in_video=use_audio_in_video
+        )
+        text_output = self.processor.batch_decode(
+            text_ids.sequences[:, inputs["input_ids"].shape[1]:],
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )
+        result = {"generated_text": text_output}
+        if audio is not None:
+            # Guarda el audio en un archivo temporal y retorna la ruta
+            sf.write("output.wav", audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
+            result["audio_path"] = "output.wav"
+        return [result]

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+soundfile
+transformers
+torch
+qwen-omni-utils
+torchvision
+accelerate

test.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from handler import EndpointHandler
+# Inicializa el handler
+my_handler = EndpointHandler(path=".")
+# Prepara un payload con audio y texto
+payload = {
+    "messages": [
+        {
+            "role": "user",
+            "content": [
+                {"type": "audio", "audio": "audio.wav"},
+                {"type": "text", "text": "Hola, ¿cómo estás?"},
+            ]
+        }
+    ],
+    "speaker": "Ethan",  # Puedes usar "Chelsie" o "Aiden"
+    "use_audio_in_video": True
+}
+# Prueba el handler
+result = my_handler(payload)
+# Muestra resultados
+print(result)
+# Si hay audio generado, el resultado incluirá "audio_path": "output.wav"