Spaces:

ajibs75
/

image_to_text_app

Sleeping

ajibs75 committed on Jun 29, 2024

Commit

3fc93cc

verified ·

1 Parent(s): cd72a9f

Create app.py

Files changed (1) hide show

app.py ADDED Viewed

+# Use a pipeline as a high-level helper
+import torch
+from transformers import pipeline
+from scipy.io import wavfile
+from PIL import Image
+import gradio as gr
+device = "cuda" if torch.cuda.is_available() else "cpu"
+image_pipe  = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large",device=device)
+narator     = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs",device=device)
+def generate_audio(text):
+    # generate the audio from the text
+    audio_text = narator(text)
+    # save the audio to a WAV file
+    wavfile.write(filename="audio.wav",
+                  rate=audio_text['sampling_rate'],
+                  data=audio_text['audio'][0])
+    return "audio.wav"
+def caption_my_image(image_path):
+    image = image_pipe(image_path)
+    caption_text = image[0]['generated_text']
+    return generate_audio(caption_text)
+demo = gr.Interface(fn=caption_my_image,
+                    inputs=[gr.Image(label="Image",type="pil")],
+                    outputs=[gr.Audio(label="Image Caption")],
+                    title="@SmartChoiceLearningHubs HF project 1 :Image to Text to Speech",
+                    description="This app generates a caption for an image and converts the caption to speech.")
+demo.launch()