"""Product-image pipeline: detect, crop, remove background, enhance, describe."""

import base64
import json
import math
import os
import tempfile
import traceback
from io import BytesIO

import cv2
import google.generativeai as genai
import numpy as np
import torch
from dotenv import load_dotenv
from gradio_client import Client, handle_file
from PIL import Image, ImageEnhance, ImageFilter
from rembg import remove
from transformers import OwlViTForObjectDetection, OwlViTProcessor

import image_enhancement_option3_helper

load_dotenv()

# Checkpoint used for zero-shot object detection.
_OWLVIT_CHECKPOINT = "google/owlvit-base-patch32"

# Minimum score passed to the detector's post-processing step.
_DETECTION_THRESHOLD = 0.2

# Text queries handed to the detector, grouped by product category.
_DETECTION_PROMPTS = (
    # Clothing
    "clothing", "topwear", "bottomwear", "outerwear", "apparel",
    "sportswear", "uniform", "underwear", "dress", "outfit",
    # Footwear
    "footwear", "shoes", "boots", "sneakers",
    # Accessories
    "accessory", "bag", "backpack", "handbag", "wallet", "belt",
    "hat", "cap", "scarf", "glasses", "watch", "jewelry",
    # Electronics
    "electronics", "device", "gadget", "smartphone", "laptop",
    "tablet", "headphones", "smartwatch",
    # Cosmetics / personal care
    "cosmetics", "beauty product", "skincare", "makeup", "perfume",
    "hair product",
    # Baby and child
    "baby product", "baby clothes", "toy", "stroller", "pacifier",
    # Home and living
    "home item", "furniture", "appliance", "decor", "kitchenware",
    "bedding", "cleaning tool",
    # Sports and outdoor
    "sports gear", "fitness equipment", "gym accessory", "camping gear",
    "bicycle equipment",
)

# Label keywords (matched as substrings) for which several detections are
# merged into one crop: items that come in pairs/sets, and clothing.
_MERGE_KEYWORDS = (
    'shoes', 'boots', 'sneakers', 'footwear', 'glasses', 'earrings',
    'gloves', 'socks', 'jewelry', 'watch', 'bracelet',
    'clothing', 'topwear', 'bottomwear', 'dress', 'outfit', 'apparel',
)

# Detections are also merged when there are at most this many boxes.
_MAX_BOXES_TO_MERGE = 3


class process_image:
    """Hold one product image and the artefacts derived from it.

    The visible methods are meant to be called in sequence: ``process`` loads
    the image, ``detect_object`` crops it, ``remove_background`` strips the
    background, the ``enhance_image_option*`` methods produce candidates,
    ``choose_image`` selects one and ``generate_description`` describes it.
    """

    # Detector objects shared by all instances; loaded on first use so the
    # checkpoint is not re-read on every detect_object() call.
    _processor = None
    _model = None

    def __init__(self):
        self.image_path = None
        self.raw_image = None
        self.detected_objects = []
        self.cropped_image = None
        self.no_background_image = None
        self.enhanced_image_1 = None
        self.enhanced_image_2 = None
        self.enhanced_image_3 = None
        self.chosen_image = None
        self.description = ""

    @classmethod
    def _load_detector(cls):
        """Return the cached ``(processor, model)`` pair, loading it once."""
        if cls._processor is None or cls._model is None:
            cls._processor = OwlViTProcessor.from_pretrained(_OWLVIT_CHECKPOINT)
            cls._model = OwlViTForObjectDetection.from_pretrained(_OWLVIT_CHECKPOINT)
        return cls._processor, cls._model

    def _crop_to_box(self, box):
        """Crop ``self.raw_image`` to *box* after clamping it to the image.

        *box* is ``(xmin, ymin, xmax, ymax)``. Minimums are floored and
        maximums are ceiled so the crop never cuts into the box. If the
        clamped box is empty, the uncropped image is returned instead.
        """
        width, height = self.raw_image.size
        left = max(0, math.floor(box[0]))
        top = max(0, math.floor(box[1]))
        right = min(width, math.ceil(box[2]))
        bottom = min(height, math.ceil(box[3]))
        if right <= left or bottom <= top:
            return self.raw_image
        return self.raw_image.crop((left, top, right, bottom))

    def detect_object(self):
        """Detect products in ``self.raw_image`` and set ``self.cropped_image``.

        ``self.detected_objects`` receives the detected label indices. The
        crop is the single detected box, the union of all boxes (when the
        labels match ``_MERGE_KEYWORDS`` or there are few boxes), or the whole
        image when nothing is detected or the detections are too varied.
        """
        processor, model = self._load_detector()
        texts = [list(_DETECTION_PROMPTS)]

        inputs = processor(text=texts, images=self.raw_image, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)

        # PIL's size is (width, height); it is reversed for the target size.
        target_sizes = torch.tensor([self.raw_image.size[::-1]])
        results = processor.post_process_grounded_object_detection(
            outputs=outputs,
            target_sizes=target_sizes,
            threshold=_DETECTION_THRESHOLD,
        )[0]
        self.detected_objects = results["labels"].tolist()

        # Post-processing has already applied the score threshold, so every
        # returned box is kept (a second, lower score cut-off would be dead).
        valid_boxes = [box.tolist() for box in results["boxes"]]
        detected_labels = [texts[0][int(label_id)] for label_id in results["labels"]]

        if not valid_boxes:
            self.cropped_image = self.raw_image
            return

        if len(valid_boxes) == 1:
            self.cropped_image = self._crop_to_box(valid_boxes[0])
            print(f"Single object detected: {detected_labels[0]}")
            return

        mergeable = any(
            keyword in label.lower()
            for label in detected_labels
            for keyword in _MERGE_KEYWORDS
        )
        if mergeable or len(valid_boxes) <= _MAX_BOXES_TO_MERGE:
            # Crop to the smallest rectangle that contains every box.
            union_box = (
                min(box[0] for box in valid_boxes),
                min(box[1] for box in valid_boxes),
                max(box[2] for box in valid_boxes),
                max(box[3] for box in valid_boxes),
            )
            self.cropped_image = self._crop_to_box(union_box)
        else:
            # Too many different objects: keep the whole image.
            self.cropped_image = self.raw_image

    def remove_background(self):
        """Run ``rembg.remove`` on the cropped image (or the raw image)."""
        if self.cropped_image is None:
            print("No cropped image available. Using entire image.")
            self.cropped_image = self.raw_image
        self.no_background_image = remove(self.cropped_image)

    def enhance_image_option1(self):
        """Enhance locally with Pillow/OpenCV filters and upscale by 1.5x.

        Returns:
            The enhanced image, also stored in ``self.enhanced_image_1``.
        """
        sharpened = self.no_background_image.filter(
            ImageFilter.UnsharpMask(radius=1, percent=120, threshold=1)
        )
        contrast_enhanced = ImageEnhance.Contrast(sharpened).enhance(1.1)  # 10% more contrast
        brightness_enhanced = ImageEnhance.Brightness(contrast_enhanced).enhance(1.02)  # 2% brighter
        color_enhanced = ImageEnhance.Color(brightness_enhanced).enhance(1.05)  # 5% more vibrant

        # NOTE(review): if the input is RGBA, this RGB2BGR conversion
        # presumably yields a 3-channel array, i.e. transparency is lost in
        # this option - confirm that is intended.
        img_bgr = cv2.cvtColor(np.array(color_enhanced), cv2.COLOR_RGB2BGR)
        denoised = cv2.bilateralFilter(img_bgr, 3, 10, 10)
        img_rgb = cv2.cvtColor(denoised, cv2.COLOR_BGR2RGB)
        enhanced = Image.fromarray(img_rgb)

        scale = 1.5
        new_size = (int(enhanced.size[0] * scale), int(enhanced.size[1] * scale))
        self.enhanced_image_1 = enhanced.resize(new_size, Image.Resampling.LANCZOS)
        return self.enhanced_image_1

    def enhance_image_option2(self):
        """Enhance through the ``finegrain/finegrain-image-enhancer`` endpoint.

        The background-free image is written to a unique temporary PNG for
        upload; that file is removed afterwards even if the call fails.

        Returns:
            The enhanced image, also stored in ``self.enhanced_image_2``.
        """
        client = Client("finegrain/finegrain-image-enhancer")

        # A unique temp file avoids clashes between concurrent runs and does
        # not require the script directory to be writable.
        handle, temp_image_path = tempfile.mkstemp(suffix=".png")
        os.close(handle)
        try:
            self.no_background_image.save(temp_image_path)
            result = client.predict(
                input_image=handle_file(temp_image_path),
                prompt="",
                negative_prompt="",
                seed=0,
                upscale_factor=2.6,
                controlnet_scale=0.5,
                controlnet_decay=0.6,
                condition_scale=5,
                tile_width=200,
                tile_height=200,
                denoise_strength=0,
                num_inference_steps=23,
                solver="DPMSolver",
                api_name="/process",
            )
        finally:
            try:
                os.remove(temp_image_path)
            except OSError:
                pass  # best-effort cleanup; never mask the real error

        # result[1] is a local file path, not a URL.
        self.enhanced_image_2 = Image.open(result[1])
        return self.enhanced_image_2

    def enhance_image_option3(self):
        """Enhance with ``image_enhancement_option3_helper``.

        Returns:
            The enhanced image, also stored in ``self.enhanced_image_3``
            (returned for consistency with options 1 and 2).
        """
        enhancer = image_enhancement_option3_helper.image_enhancement_option3_helper(model=None)
        self.enhanced_image_3 = enhancer.ai_enhanced_image_processing(self.no_background_image)
        return self.enhanced_image_3

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Remove a leading ```json / ``` fence and a trailing ``` fence."""
        text = text.strip()
        if text.startswith("```json"):
            text = text[7:]
        if text.startswith("```"):
            text = text[3:]
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    def generate_description_from_image(self, image_b64: str, tone: str = "professional", lang: str = "en") -> str:
        """Ask the Gemini model for a product listing describing the image.

        Args:
            image_b64: Base64-encoded image data, sent as ``image/jpeg``.
            tone: Tone requested in the prompt.
            lang: Language requested in the prompt.

        Returns:
            The JSON text of the listing when the reply parses as JSON;
            otherwise a string starting with ``"Invalid JSON response: "`` or
            ``"Error generating description: "``.
        """
        API_KEY = os.getenv("SECRET_API_KEY")
        genai.configure(api_key=API_KEY)
        model = genai.GenerativeModel("gemini-2.0-flash-exp")

        prompt = (
            f"Analyze this product image and generate an SEO-optimized e-commerce product listing in {lang}. "
            f"Tone: {tone}. Respond ONLY with valid JSON (no markdown formatting) containing these exact keys: "
            f"'title', 'description', 'features', 'tags'. "
            f"The 'features' and 'tags' must be arrays of strings. "
            f"Do not include any other text or formatting."
        )

        # Broad catch is deliberate: this method reports failures as text.
        try:
            response = model.generate_content(
                [
                    {"inline_data": {"mime_type": "image/jpeg", "data": image_b64}},
                    prompt,
                ]
            )
            text = self._strip_code_fences(response.text)
        except Exception as err:
            return "Error generating description: " + str(err)

        # Parse only to validate; the caller receives the raw JSON text.
        try:
            json.loads(text)
        except json.JSONDecodeError:
            return "Invalid JSON response: " + text
        print("Successfully parsed JSON response")
        return text

    def choose_image(self, number: int):
        """Select enhanced image 1, 2 or 3 as ``self.chosen_image``.

        Raises:
            ValueError: If *number* is not 1, 2 or 3.
        """
        if number == 1:
            self.chosen_image = self.enhanced_image_1
        elif number == 2:
            self.chosen_image = self.enhanced_image_2
        elif number == 3:
            self.chosen_image = self.enhanced_image_3
        else:
            raise ValueError("Invalid image number. Choose 1, 2, or 3.")

    def generate_description(self):
        """Describe ``self.chosen_image`` and store the text in ``self.description``.

        The image is flattened to RGB (alpha composited on white), encoded as
        JPEG/base64 and passed to ``generate_description_from_image``.

        Returns:
            The description, or an ``"Error ..."`` message on failure.
        """
        print("Starting description generation...")

        if self.chosen_image is None:
            print("Error: No image chosen for description generation")
            self.description = "Error: No image selected for description generation"
            return self.description

        try:
            print("Converting image to base64...")
            buffer = BytesIO()

            # JPEG has no alpha channel, so RGBA is composited onto white.
            image_to_save = self.chosen_image
            if image_to_save.mode == 'RGBA':
                background = Image.new('RGB', image_to_save.size, (255, 255, 255))
                background.paste(image_to_save, mask=image_to_save.split()[-1])  # alpha channel as mask
                image_to_save = background
            elif image_to_save.mode != 'RGB':
                image_to_save = image_to_save.convert('RGB')

            image_to_save.save(buffer, format='JPEG', quality=95)
            img_b64 = base64.b64encode(buffer.getvalue()).decode()
            print(f"Image converted to base64, size: {len(img_b64)} characters")

            tone = "professional"
            lang = "en"
            self.description = self.generate_description_from_image(img_b64, tone, lang)

            # NOTE(review): truncating here can cut a JSON document mid-way,
            # leaving text that no longer parses - confirm this is acceptable.
            if len(self.description) > 15000:
                self.description = self.description[:15000] + "..."
            return self.description
        except Exception as e:
            print(f"Error in generate_description: {str(e)}")
            traceback.print_exc()
            self.description = f"Error generating description: {str(e)}"
            return self.description

    def process(self, image_path):
        """Load the image at *image_path* into ``self.raw_image`` as RGB.

        A relative path is resolved against this script's directory.
        """
        if os.path.isabs(image_path):
            self.image_path = image_path
        else:
            script_dir = os.path.dirname(os.path.abspath(__file__))
            self.image_path = os.path.join(script_dir, image_path)

        # convert() returns an independent image, so the file can be closed.
        with Image.open(self.image_path) as source:
            self.raw_image = source.convert("RGB")

    def get_enhanced_images(self):
        """Return the three enhanced images as a tuple (entries may be None)."""
        return self.enhanced_image_1, self.enhanced_image_2, self.enhanced_image_3

    def get_description(self):
        """Return the most recently generated description."""
        return self.description