import io

import streamlit as st
import torch  # not used directly, but the transformers pipelines need the PyTorch backend installed
from gtts import gTTS
from transformers import pipeline

st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image📷 to a Short Audio Story🔊 for Children👶")

uploaded_file = st.file_uploader("Select an Image After the Models are Loaded...")


# Preload both models once; @st.cache_resource keeps them in memory across reruns
@st.cache_resource
def load_models():
    return {
        "img_model": pipeline("image-to-text", "cnmoro/tiny-image-captioning"),
        "story_model": pipeline("text-generation", "Qwen/Qwen2.5-0.5B-Instruct"),
    }


models = load_models()


# img2text: image file -> caption
def img2text(image_path):
    # The image-to-text pipeline returns a list of dicts, e.g.
    # [{"generated_text": "a dog running in the grass"}]
    return models["img_model"](image_path)[0]["generated_text"]


# text2story: caption -> short story
def text2story(text):
    prompt = f"Generate a brief 100-word story about: {text}"
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # With chat-style input, the pipeline returns the whole conversation
    # (system, user, assistant); the assistant's reply is the last message.
    response = models["story_model"](
        messages,
        max_new_tokens=150,  # ~100 words needs more than 100 tokens; avoids mid-sentence cutoff
        do_sample=True,
        temperature=0.7,
    )[0]["generated_text"]
    return response[-1]["content"]


# text2audio: story -> speech
def text2audio(story_text):
    # Write the MP3 produced by gTTS into an in-memory buffer instead of a file
    audio_io = io.BytesIO()
    tts = gTTS(text=story_text, lang="en", slow=False)
    tts.write_to_fp(audio_io)
    audio_io.seek(0)
    return {
        "audio": audio_io,
        # st.audio reads the sample rate from the MP3 header itself, so this
        # field is only kept for a consistent return structure
        "sampling_rate": None,
    }


# Initialize session state so results survive Streamlit reruns
if "processed_data" not in st.session_state:
    st.session_state.processed_data = {
        "scenario": None,
        "story": None,
        "audio": None,
    }

if uploaded_file is not None:
    # Save the upload to disk so the captioning pipeline can read it by path
    bytes_data = uploaded_file.getvalue()
    with open(uploaded_file.name, "wb") as file:
        file.write(bytes_data)

    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Only run the three stages when a new file arrives
    if st.session_state.get("current_file") != uploaded_file.name:
        st.session_state.current_file = uploaded_file.name

        # Stage 1: Image to Text
        with st.spinner("Processing image..."):
            st.session_state.processed_data["scenario"] = img2text(uploaded_file.name)

        # Stage 2: Text to Story
        with st.spinner("Generating story..."):
            st.session_state.processed_data["story"] = text2story(
                st.session_state.processed_data["scenario"]
            )

        # Stage 3: Story to Audio
        with st.spinner("Creating audio..."):
            st.session_state.processed_data["audio"] = text2audio(
                st.session_state.processed_data["story"]
            )

    # Display results on every rerun, not just when the file is new
    st.write("Caption:", st.session_state.processed_data["scenario"])
    st.write("Story:", st.session_state.processed_data["story"])

# Keep the audio button OUTSIDE the file-processing block so clicking it
# (which triggers a rerun) does not re-run the pipeline
if st.button("Play Audio of the Story Generated"):
    if st.session_state.processed_data.get("audio"):
        audio_data = st.session_state.processed_data["audio"]
        # Pass raw bytes and declare the format; gTTS outputs MP3
        st.audio(audio_data["audio"].getvalue(), format="audio/mp3")
    else:
        st.warning("Please generate a story first!")
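
# Usage sketch (assumes this script is saved as app.py; the filename is illustrative):
#   pip install streamlit transformers torch gtts
#   streamlit run app.py
# On first launch, the two Hugging Face models are downloaded and cached,
# so the initial load can take a while before the uploader becomes useful.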