import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
import cv2

# Load the pre-trained CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model.eval()

def apply_gradcam(image, text):
    image = image.convert("RGB")
    inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)

    # Grad-CAM needs the activations and gradients of a spatial layer. CLIP's
    # ViT has no conv feature maps, so one common adaptation is to hook the
    # last vision encoder layer and treat its patch tokens as spatial positions.
    activations, gradients = [], []

    def forward_hook(module, inp, out):
        hidden = out[0]                        # [1, num_tokens, hidden_dim]
        activations.append(hidden)
        hidden.register_hook(lambda grad: gradients.append(grad))

    handle = model.vision_model.encoder.layers[-1].register_forward_hook(forward_hook)
    outputs = model(**inputs)
    handle.remove()

    # The image-text similarity is the scalar we backpropagate from
    similarity = torch.nn.functional.cosine_similarity(
        outputs.image_embeds, outputs.text_embeds
    )
    model.zero_grad()
    similarity.backward()

    # Drop the CLS token; the remaining tokens form the patch grid
    # (7x7 for ViT-B/32 at 224x224 input)
    acts = activations[0][:, 1:, :]            # [1, 49, 768]
    grads = gradients[0][:, 1:, :]             # [1, 49, 768]

    # Grad-CAM weighting: average each channel's gradient over all patches,
    # weight the activations, then sum over channels for one score per patch
    weights = grads.mean(dim=1, keepdim=True)      # [1, 1, 768]
    cam = (weights * acts).sum(dim=-1).squeeze(0)  # [49]

    grid = int(cam.numel() ** 0.5)
    heatmap = cam.reshape(grid, grid).detach().cpu().numpy()
    heatmap = np.maximum(heatmap, 0)
    heatmap /= heatmap.max() + 1e-8

    # Upsample to the original image size and colorize
    heatmap = cv2.resize(heatmap, (image.size[0], image.size[1]))
    heatmap = np.uint8(255 * heatmap)
    heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)  # OpenCV colormaps are BGR

    superimposed_img = cv2.addWeighted(np.array(image), 0.6, heatmap, 0.4, 0)
    return superimposed_img

def highlight_image(image, text):
    highlighted_image = apply_gradcam(image, text)
    return Image.fromarray(highlighted_image)

# Define the Gradio interface
iface = gr.Interface(
    fn=highlight_image,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Text Description")],
    outputs=gr.Image(type="pil"),
    title="Image Text Highlight",
    description="Upload an image and provide a text description to highlight the relevant part of the image.",
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()
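
# Optional quick check without the web UI: a minimal sketch assuming a local
# file "example.jpg" exists (the path and prompt below are placeholders):
#
#   img = Image.open("example.jpg")
#   result = highlight_image(img, "a dog playing in the grass")
#   result.save("highlighted.jpg")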