import spaces
import random
from threading import Thread
import gradio as gr
import torch # Need this for torch.no_grad()
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import (
AutoProcessor,
Qwen2_5_VLForConditionalGeneration,
TextIteratorStreamer,
)
from trl import ModelConfig
# run with:
# CUDA_VISIBLE_DEVICES=0 uv run gradio demo/demo.py
def get_eval_dataset():
full_dataset = load_dataset("sunildkumar/message-decoding-words-and-sequences")["train"]
full_dataset = full_dataset.shuffle(seed=42)
# split the dataset with the same seed as used in the training script
splits = full_dataset.train_test_split(test_size=0.1, seed=42)
test_dataset = splits["test"]
return test_dataset
def load_model_and_tokenizer():
model_config = ModelConfig(
model_name_or_path="Groundlight/message-decoding-r1",
torch_dtype="bfloat16",
use_peft=False,
)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
pretrained_model_name_or_path=model_config.model_name_or_path,
torch_dtype=model_config.torch_dtype,
use_cache=False,
        device_map="auto",  # let Accelerate place the model on the available device(s)
)
# put model in eval mode
model.eval()
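    # Left padding is the standard choice for decoder-only generation: prompts are padded on the left
    # so newly generated tokens continue directly from the end of each prompt.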
processor = AutoProcessor.from_pretrained(
model_config.model_name_or_path, padding_side="left"
)
return model, processor
# Load shared resources (dataset, model, processor) once at startup
def load_resources():
global eval_dataset, model, processor
eval_dataset = get_eval_dataset()
model, processor = load_model_and_tokenizer()
def show_random_example():
# Get a random example
random_idx = random.randint(0, len(eval_dataset) - 1)
example = eval_dataset[random_idx]
# Return image for display, mapping for state, and image for state
return example["image"], example["mapping"], example["image"]
def prepare_model_input(image, mapping, processor, submitted_word):
"""
Prepare the input for the model using the mapping, processor, and submitted word.
Args:
image: The decoder image to use
mapping (dict): The mapping data from the dataset
processor: The model's processor/tokenizer
submitted_word (str): The word submitted by the user
Returns:
dict: The processed inputs ready for the model
"""
decoded_message = submitted_word.lower()
print(f"Decoded message: {decoded_message}")
# reverse the decoder to encode the word
encoder = {v: k for k, v in mapping.items()}
print(f"Encoder: {encoder}")
# leaving the space as is
coded_message = [encoder[c] if c in encoder else c for c in decoded_message]
print(f"Coded message: {coded_message}")
# add spaces between each character to prevent tokenization issues
coded_message = " ".join(coded_message)
instruction = (
f'Use the decoder in the image to decode this coded message: "{coded_message}". '
"The decoded message will be one or more words. Underscore characters "
'("_") in the coded message should be mapped to a space (" ") when decoding.'
)
ending = (
"Show your work in <think> </think> tags and return the answer in <answer> </answer> tags. "
"While thinking, you must include a section with the decoded characters using <chars></chars> tags. "
"The <chars> section should include the decoded characters in the order they are decoded. It should include the "
"underscore character wherever there is a space in the decoded message. For example, if the coded message is "
"a b c _ d e f, the <chars> section might be <chars> c a t _ d o g </chars>. Once you are done thinking, "
"provide your answer in the <answer> section, e.g. <answer> cat dog </answer>."
)
instruction = f"{instruction} {ending}"
print(f"Instruction: {instruction}")
r1_messages = [
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are a helpful assistant. You first think about the reasoning process in the mind and then provide the user with the answer.",
}
],
},
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": instruction},
],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "Let me solve this step by step.\n<think>"}
],
},
]
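    # Render the conversation to a prompt string without tokenizing; continue_final_message=True keeps
    # the pre-filled assistant text open so the model continues it rather than starting a new turn.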
texts = processor.apply_chat_template(
r1_messages, continue_final_message=True, tokenize=False
)
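    # process_vision_info extracts the PIL image(s) referenced in the messages; wrapping the result in a
    # list gives a batch of one example for the processor.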
image_input, _ = process_vision_info(r1_messages)
image_input = [image_input]
batch = processor(
text=texts,
images=image_input,
padding=True,
return_tensors="pt",
)
return batch
def encode_word(word, mapping):
"""
Encode a word using the given mapping.
"""
if not word or not mapping:
return ""
word = word.lower()
# reverse the decoder to encode the word
encoder = {v: k for k, v in mapping.items()}
# leaving the space as is
coded_message = [encoder[c] if c in encoder else c for c in word]
return " ".join(coded_message)
def validate_and_submit(word, mapping):
    # Only allow letters (and spaces between words)
if not word.replace(" ", "").isalpha():
return (
gr.update(), # word input
gr.update(), # submit button
gr.update(interactive=False), # run button - disable but keep visible
gr.update(visible=False) # encoded word display
)
word = word.lower()
encoded_word = encode_word(word, mapping)
# Only enable run button if we have a valid encoded word
has_valid_encoded_word = bool(encoded_word.strip())
# Return updates for input, submit button, run button, and encoded word display
return (
gr.update(value=word, interactive=False, label="Submitted Word"),
gr.update(interactive=False), # Disable submit button
gr.update(interactive=has_valid_encoded_word), # Enable run button only if valid, but always visible
gr.update(value=f"Encoded word: {encoded_word}", visible=has_valid_encoded_word) # Show encoded word
)
def prepare_for_inference():
"""Setup function that runs before streaming starts"""
return (
gr.update(value="", visible=True), # Clear and show output
gr.update(interactive=False), # Disable run button
gr.update(visible=True), # Show loading indicator
)
@spaces.GPU  # request a ZeroGPU device for the duration of each inference call
def run_inference(word, image, mapping):
"""Main inference function, now focused just on generation"""
if not word or not image or not mapping:
raise gr.Error("Please submit a word and load a decoder first")
# Prepare model input
model_inputs = prepare_model_input(image, mapping, processor, word)
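    # Move the prepared tensors to the GPU; under ZeroGPU the CUDA device is only available inside a
    # @spaces.GPU-decorated call.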
model_inputs = {k: v.to("cuda") for k, v in model_inputs.items()}
# Initialize streamer
    streamer = TextIteratorStreamer(
        tokenizer=processor,
        skip_special_tokens=True,
    )
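    # The streamer receives tokens from generate() (run in a background thread below) and yields
    # decoded text incrementally so the UI can update as the model writes.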
# Set up generation parameters
generation_kwargs = dict(
**model_inputs,
max_new_tokens=512,
do_sample=True,
temperature=1.0,
streamer=streamer,
)
# Start generation in a separate thread with torch.no_grad()
def generate_with_no_grad():
with torch.no_grad():
model.generate(**generation_kwargs)
thread = Thread(target=generate_with_no_grad)
thread.start()
# Stream the output
generated_text = ""
for new_text in streamer:
generated_text += new_text
yield generated_text
thread.join()
return generated_text
# Create the Gradio interface
with gr.Blocks() as demo:
# Load resources when the app starts
load_resources()
gr.Markdown("# Message Decoding Demo")
current_mapping = gr.State()
current_image = gr.State()
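    # Session state: the active decoder mapping and image are passed to later callbacks but not displayed directly.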
with gr.Row():
# Image display component
image_output = gr.Image(label="Decoder")
# Button to load new random example
next_button = gr.Button("Generate Random Decoder")
next_button.click(
fn=show_random_example, outputs=[image_output, current_mapping, current_image]
)
# Text input for the word
word_input = gr.Textbox(
label="Enter a single word",
placeholder="Enter word here...",
max_lines=1,
show_copy_button=False,
)
# Add encoded word display
encoded_word_display = gr.Textbox(
label="Encoded Word",
interactive=False,
visible=False,
max_lines=1,
show_copy_button=True,
)
# Group submit and run buttons vertically
with gr.Column(): # Use Column instead of Row for vertical layout
submit_button = gr.Button("Submit Word")
run_button = gr.Button("Run Model", interactive=False) # Initialize as visible but disabled
# Output area for model response
model_output = gr.Textbox(
label="Model Output",
interactive=False,
visible=False,
max_lines=10,
container=True,
show_copy_button=True,
)
# Add loading indicator
with gr.Row():
loading_indicator = gr.HTML(visible=False)
# Validate word on submit and update interface
submit_button.click(
fn=validate_and_submit,
inputs=[word_input, current_mapping],
outputs=[word_input, submit_button, run_button, encoded_word_display],
)
# Run inference when run button is clicked
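    # The .then() chain runs in order: prepare the UI, stream the model's response, then reset the controls.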
run_button.click(
fn=prepare_for_inference,
outputs=[model_output, run_button, loading_indicator],
).then(
fn=run_inference,
inputs=[word_input, current_image, current_mapping],
outputs=model_output,
api_name=False,
).then(
# Reset interface after generation
lambda: (
gr.update(interactive=False), # Disable run button but keep visible
gr.update(visible=False), # Hide loading indicator
gr.update(interactive=True, label="Enter a single word"), # Re-enable word input
gr.update(interactive=True), # Re-enable submit button
gr.update(visible=False), # Hide encoded word display
),
None,
[run_button, loading_indicator, word_input, submit_button, encoded_word_display],
)
def main():
    demo.launch()
if __name__ == "__main__":
main()