"""Streamlit app: caption an uploaded image, then read the caption aloud.

The script runs top to bottom on every Streamlit rerun. The two Hugging Face
pipelines are created through ``st.cache_resource`` loaders so they are built
once per server process instead of on every rerun.
"""

import streamlit as st
from transformers import pipeline
from PIL import Image


@st.cache_resource
def _load_image_to_text():
    """Create the image-captioning pipeline (cached across reruns)."""
    return pipeline(
        "image-to-text", model="nlpconnect/vit-gpt2-image-captioning"
    )


@st.cache_resource
def _load_text_to_speech():
    """Create the text-to-speech pipeline (cached across reruns)."""
    return pipeline("text-to-speech", model="facebook/mms-tts-eng")


# Load pipelines
image_to_text = _load_image_to_text()
text_to_speech = _load_text_to_speech()

st.title("Image-to-Text and Text-to-Speech App")

# Image uploader
uploaded_image = st.file_uploader(
    "Upload an image", type=["jpg", "png", "jpeg"]
)

if uploaded_image:
    image = Image.open(uploaded_image)
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # Convert image to text: take the 'generated_text' of the first result.
    text_output = image_to_text(image)[0]['generated_text']
    st.write("### Extracted Text:")
    st.write(text_output)

    # Convert text to speech. The result is read via its 'audio' and
    # 'sampling_rate' keys; the sampling rate is forwarded to the player.
    speech_output = text_to_speech(text_output)
    st.write("### Listen to Speech Output:")
    st.audio(
        speech_output['audio'],
        format="audio/wav",
        start_time=0,
        sample_rate=speech_output['sampling_rate'],
    )