Spaces:
Running
Running
| from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, logging | |
| from qwen_vl_utils import process_vision_info | |
| import torch | |
| from PIL import Image | |
| import gc | |
| import warnings | |
| ### Utility functions ### | |
def clear_gpu_cache():
    """Run garbage collection, then release PyTorch's cached CUDA memory.

    Garbage collection runs first so that objects which are only freed by
    the collector (e.g. tensors caught in reference cycles) are released
    before the CUDA caching allocator is asked to give back its unused
    blocks. On machines without CUDA only the garbage collection runs.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
def get_device_and_dtype():
    """Return the (device name, torch dtype) pair to run the model with.

    Yields ``('cuda', torch.float16)`` when CUDA is available and
    ``('cpu', torch.float32)`` otherwise.
    """
    # Guard clause: without CUDA fall back to full precision on the CPU.
    if not torch.cuda.is_available():
        return 'cpu', torch.float32
    return 'cuda', torch.float16
| ### Model-related functions ### | |
def initialize_model(
    device: str,
    torch_dtype: torch.dtype,
    model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
):
    """Load and return the Qwen2.5-VL model placed on ``device``.

    Args:
        device (str): Value forwarded as ``device_map`` (e.g. 'cuda' or 'cpu').
        torch_dtype (torch.dtype): Dtype forwarded to ``from_pretrained``.
        model_id (str, optional): Checkpoint name or path to load. Defaults to
            the checkpoint this script was written for; any other value must
            be loadable by ``Qwen2_5_VLForConditionalGeneration``.

    Returns:
        The model instance returned by ``from_pretrained``.
    """
    # All Python warnings raised while loading are deliberately silenced;
    # the filter is restored when the context manager exits.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_id,
            device_map=device,
            torch_dtype=torch_dtype,
        )
def initialize_processor(model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct-AWQ"):
    """Load and return the processor for the given checkpoint.

    Args:
        model_id (str, optional): Checkpoint name or path whose processor is
            loaded. Defaults to the checkpoint this script was written for.

    Returns:
        The processor instance returned by ``AutoProcessor.from_pretrained``.
    """
    return AutoProcessor.from_pretrained(model_id)
def prepare_model_inputs(processor, messages, device: str):
    """Build the processor output for ``messages`` and move it to ``device``.

    The chat template is rendered to plain text (not tokenized) with the
    generation prompt appended, the image/video inputs are extracted from the
    same messages, and both are passed to the processor as a batch of one.
    """
    chat_text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    images, videos = process_vision_info(messages)

    # Single-element batch: the rendered prompt is wrapped in a list.
    batch = processor(
        text=[chat_text],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    )
    return batch.to(device)
def generate_description(
    model,
    inputs,
    *,
    max_new_tokens: int = 512,
    do_sample: bool = True,
    temperature: float = 0.7,
    top_p: float = 0.9,
):
    """Generate description using model

    Args:
        model: Object exposing ``generate(**kwargs)``.
        inputs: Mapping of prepared model inputs; unpacked into ``generate``.
        max_new_tokens (int, optional): Forwarded to ``generate``. Default 512.
        do_sample (bool, optional): Forwarded to ``generate``. Default True.
        temperature (float, optional): Forwarded to ``generate``. Default 0.7.
        top_p (float, optional): Forwarded to ``generate``. Default 0.9.

    Returns:
        Whatever ``model.generate`` returns.
    """
    # The generation settings are keyword-only so callers can tune them
    # without changing the positional (model, inputs) contract.
    return model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
    )
def process_model_output(processor, inputs, generated_ids):
    """Decode the newly generated part of the first sequence into text.

    For every (prompt, output) pair the leading ``len(prompt)`` ids are cut
    from the output, the remainders are batch-decoded with special tokens
    skipped, and the first decoded entry is returned.
    """
    # Assumes each generated sequence starts with its prompt ids, so that
    # slicing by prompt length leaves only the completion — confirm for
    # other generation setups.
    completions = []
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
        prompt_length = len(prompt_ids)
        completions.append(full_ids[prompt_length:])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]
def create_prompt_message(image_path, timestamp, custom_prompt=None):
    """
    Create standardized message prompt for model with optional custom prompt

    Args:
        image_path (str): Path to the image file
        timestamp (str): Video timestamp in HH:MM:SS.SSS format
        custom_prompt (str, optional): Custom prompt text; a ``{timestamp}``
            placeholder is replaced by ``timestamp``. If None or empty, the
            default prompt is used.

    Returns:
        list: Formatted message for model input
    """
    if custom_prompt:
        try:
            prompt_text = custom_prompt.format(timestamp=timestamp)
        except (KeyError, IndexError, ValueError):
            # The prompt contains braces that are not valid format fields
            # (e.g. a literal JSON example). Substitute only the documented
            # placeholder instead of failing on the whole prompt.
            prompt_text = custom_prompt.replace("{timestamp}", str(timestamp))
    else:
        prompt_text = (
            f"Video timestamp: {timestamp}. "
            "Describe in detail what is happening in this frame."
        )

    return [{
        "role": "user",
        "content": [
            {"type": "image", "image": f"file://{image_path}"},
            {"type": "text", "text": prompt_text},
        ],
    }]
if __name__ == "__main__":
    # Demo run: describe a single extracted video frame and print the result.
    # NOTE(review): the frame file name suggests 00:01:50.110 while the
    # timestamp passed to the prompt is 00:01:16.076 — confirm which is meant.
    frame_path = "/workspaces/Video_Analyser/app_srv/temp/a28af289-377d-468d-b0eb-ed0f7dcd2ab3/frames/frame_00_01_50_110.jpg"
    frame_timestamp = "00:01:16.076"
    prompt_template = "Timestamp {timestamp}. Analyze this frame focusing on objects in the foreground."

    # Load the model and its processor for the detected device/dtype.
    run_device, run_dtype = get_device_and_dtype()
    vl_model = initialize_model(run_device, run_dtype)
    vl_processor = initialize_processor()

    # Build the chat message, run generation and decode the answer.
    chat_messages = create_prompt_message(frame_path, frame_timestamp, prompt_template)
    model_inputs = prepare_model_inputs(vl_processor, chat_messages, run_device)
    output_ids = generate_description(vl_model, model_inputs)
    description = process_model_output(vl_processor, model_inputs, output_ids)
    print(description)