#!/usr/bin/env python
# coding: utf-8

# # LLaVA-Med 1.5 OpenVINO demo

import json
import os
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import torch
from PIL import Image
from transformers import logging

from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.conversation import conv_templates
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.model.builder import load_pretrained_model

logging.set_verbosity_error()

image_folder = "data/qa50_images"

# ## Load Model and Data

model_path = "llava-med-imf16-llmint4"
# model_path = "llava-med-imint8-llmint4"
model_name = get_model_name_from_path(model_path)

tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path=model_path,
    model_base=None,
    model_name=model_name,
    device="gpu",
    openvino=True,
    image_device="npu",
)
print("loaded models")

# questions = []
# with open("data/eval/llava_med_eval_qa50_qa.jsonl") as f:
#     for line in f:
#         questions.append(json.loads(line))

# ## Functions


def prepare_inputs_image(question, image):
    """Build the input token ids and preprocessed image tensor for one image/question pair."""
    conv_mode = "vicuna_v1"  # default conversation template

    # Make sure the prompt contains exactly one image token, placed at the start.
    qs = question.replace(DEFAULT_IMAGE_TOKEN, "").strip()
    qs = DEFAULT_IMAGE_TOKEN + "\n" + qs  # model.config.mm_use_im_start_end is False

    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0)
    image_tensor = process_images([image], image_processor, model.config)[0]
    return input_ids, image_tensor


def run_inference_image(image, question):
    """Generate an answer for a PIL image and a question string."""
    input_ids, image_tensor = prepare_inputs_image(question, image)

    ov_output_ids = model.generate(
        input_ids,
        images=image_tensor.unsqueeze(0).half(),
        do_sample=False,
        # no_repeat_ngram_size=3,
        max_new_tokens=1024,
        use_cache=True,
    )
    # Keep only the newly generated tokens (drop the echoed prompt).
    input_length = input_ids.shape[-1]
    ov_output_ids = ov_output_ids[:, input_length:]
    answer = tokenizer.batch_decode(ov_output_ids, skip_special_tokens=True)[0].strip()
    return answer


# Suggested indices are indices where the model output is similar to the source model output.
# It may still be incorrect!
# int8 image model: 2, 13, 14, 16, 17 (shorter) and 4, 5, 8 (longer)
# f32 image model : 0, 2, 7, 9, 13, 14, 15, 16, 17, 18, 19 (shorter) and 3, 5, 6, 8 (longer)

if __name__ == "__main__":
    import sys

    image_file = sys.argv[1]
    question = sys.argv[2] if len(sys.argv) > 2 else input("Question:\n")
    answer = run_inference_image(Image.open(image_file), question)
    print(f"Answer: {answer}")
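
# ## Batch evaluation sketch
#
# A minimal sketch of looping run_inference_image over the qa50 question file that is
# loaded (commented out) near the top of this script, e.g. for the suggested indices
# listed above. Illustrative only: the record keys "image" and "text" are assumptions
# based on LLaVA-style question files and may need adjusting to the actual JSONL schema.


def run_qa50_subset(indices, questions_file="data/eval/llava_med_eval_qa50_qa.jsonl"):
    """Run inference for a subset of question indices and return {index: answer}."""
    questions = []
    with open(questions_file) as f:
        for line in f:
            questions.append(json.loads(line))

    answers = {}
    for idx in indices:
        record = questions[idx]
        # Assumed keys: "image" (file name under image_folder) and "text" (question string).
        image = Image.open(os.path.join(image_folder, record["image"]))
        answers[idx] = run_inference_image(image, record["text"])
        print(f"[{idx}] {answers[idx]}")
    return answers


# Example usage (index set taken from the notes above; adjust to your quantization variant):
# run_qa50_subset([2, 13, 14, 16, 17])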