Sebastian Rosas Maciel committed on
Commit a2b7cfd · 1 Parent(s): 5852e55

Feat: handler.py added

Files changed (4)
  1. .DS_Store +0 -0
  2. handler.py +53 -0
  3. requirements.txt +6 -0
  4. test.py +26 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
handler.py ADDED
@@ -0,0 +1,53 @@
+ from typing import Dict, List, Any
+ import soundfile as sf
+ from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
+ from qwen_omni_utils import process_mm_info
+
+ class EndpointHandler():
+     def __init__(self, path="./"):
+         # Load the model and processor from the repository root
+         self.model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
+             path,
+             dtype="auto",
+             device_map="auto",
+         )
+         self.processor = Qwen3OmniMoeProcessor.from_pretrained(path)
+
+     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+         messages = data.get("messages", [])
+         use_audio_in_video = data.get("use_audio_in_video", True)
+         speaker = data.get("speaker", "Ethan")
+
+         # Render the chat template and collect any audio/image/video inputs
+         text = self.processor.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True,
+         )
+         audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
+         inputs = self.processor(
+             text=text,
+             audio=audios,
+             images=images,
+             videos=videos,
+             return_tensors="pt",
+             padding=True,
+             use_audio_in_video=use_audio_in_video
+         )
+         inputs = inputs.to(self.model.device).to(self.model.dtype)
+
+         # Generate text ids and, when the talker is active, a speech waveform
+         text_ids, audio = self.model.generate(
+             **inputs,
+             speaker=speaker,
+             thinker_return_dict_in_generate=True,
+             use_audio_in_video=use_audio_in_video
+         )
+         # Decode only the newly generated tokens, skipping the prompt
+         text_output = self.processor.batch_decode(
+             text_ids.sequences[:, inputs["input_ids"].shape[1]:],
+             skip_special_tokens=True,
+             clean_up_tokenization_spaces=False
+         )
+         result = {"generated_text": text_output}
+         if audio is not None:
+             # Save the audio to a temporary file and return its path
+             sf.write("output.wav", audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
+             result["audio_path"] = "output.wav"
+         return [result]
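Since EndpointHandler follows the Hugging Face Inference Endpoints custom-handler contract, a deployed copy can be exercised over plain HTTP. A minimal sketch, assuming a hypothetical endpoint URL and token (neither is part of this commit) and the `requests` client library; note that the returned "audio_path" points to a file on the server, not on the client:

import requests

# Hypothetical deployment values; replace with your own endpoint URL and token.
API_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"
HEADERS = {"Authorization": "Bearer <HF_TOKEN>", "Content-Type": "application/json"}

# The JSON body maps directly to the `data` dict that __call__ receives.
payload = {
    "messages": [
        {"role": "user", "content": [{"type": "text", "text": "Hello, how are you?"}]}
    ],
    "speaker": "Ethan",
    "use_audio_in_video": True,
}

response = requests.post(API_URL, headers=HEADERS, json=payload)
print(response.json())  # e.g. [{"generated_text": [...], "audio_path": "output.wav"}]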
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ soundfile
+ transformers
+ torch
+ qwen-omni-utils
+ torchvision
+ accelerate
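handler.py imports Qwen3OmniMoe* classes that only exist in recent transformers releases, and requirements.txt pins no versions. A quick import check, using only the classes handler.py already imports, catches an outdated install before any model download starts:

import transformers

print("transformers", transformers.__version__)
# These imports raise ImportError on versions that predate Qwen3-Omni support.
from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
print("Qwen3-Omni classes available")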
test.py ADDED
@@ -0,0 +1,26 @@
+ from handler import EndpointHandler
+
+ # Initialize the handler
+ my_handler = EndpointHandler(path=".")
+
+ # Build a payload with audio and text
+ payload = {
+     "messages": [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "audio", "audio": "audio.wav"},
+                 {"type": "text", "text": "Hello, how are you?"},
+             ]
+         }
+     ],
+     "speaker": "Ethan",  # "Chelsie" or "Aiden" also work
+     "use_audio_in_video": True
+ }
+
+ # Run the handler
+ result = my_handler(payload)
+
+ # Show the results
+ print(result)
+ # If speech was generated, the result will include "audio_path": "output.wav"
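When speech is generated, the handler writes output.wav (24 kHz, per the sf.write call in handler.py) to the working directory. A quick sanity check with soundfile, which requirements.txt already installs:

import soundfile as sf

# Read back the file written by the handler and report its duration
data, samplerate = sf.read("output.wav")
print(f"{len(data) / samplerate:.2f} s of audio at {samplerate} Hz")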