Norahsal commited on
Commit
793a82d
·
verified ·
1 Parent(s): 4b8848f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -0
app.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # to create neural network
2
+ import torch
3
+
4
+ # for interface
5
+ import gradio as gr
6
+
7
+ # to open images
8
+ from PIL import Image
9
+
10
+ # used for audio
11
+ import scipy.io.wavfile as wavfile
12
+
13
+ # Use a pipeline as a high-level helper
14
+ from transformers import pipeline
15
+
16
+
17
# Select GPU when available; fall back to CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Text-to-speech pipeline used to narrate the generated captions.
# Fix: pass `device` here too — the original omitted it, so the TTS model
# silently ran on CPU even when CUDA was selected for the captioner.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs",
                    device=device)

# Image-captioning pipeline (BLIP large) with pretrained weights.
caption_image = pipeline("image-to-text",
                         model="Salesforce/blip-image-captioning-large",
                         device=device)
25
# Define the function to generate audio from text
def generate_audio(text):
    """Synthesize *text* with the module-level ``narrator`` TTS pipeline.

    Returns the filesystem path of a WAV file containing the narration,
    suitable for a ``gr.Audio`` output.

    Fix: the original always wrote to a fixed ``"output.wav"`` in the
    current directory, so two concurrent Gradio requests would clobber
    each other's audio. Each call now writes to its own temporary file.
    """
    import tempfile  # local import keeps the fix self-contained

    # Run TTS; the pipeline result exposes "audio" and "sampling_rate" keys
    # (the original code reads exactly these two).
    narrated_text = narrator(text)

    # Reserve a unique WAV path for this request; delete=False so the file
    # survives for Gradio to read after we close the handle.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name

    # Save the audio; take the first row of the audio array, matching the
    # original's "[0]" indexing (presumably shape (1, n_samples) — confirm).
    wavfile.write(out_path, rate=narrated_text["sampling_rate"],
                  data=narrated_text["audio"][0])

    # Return the path to the saved output WAV file
    return out_path  # return audio
36
+
37
def caption_my_image(pil_image):
    """Caption *pil_image* and narrate the caption aloud.

    Returns a (caption_text, audio_file_path) pair matching the two
    Gradio outputs (Textbox, Audio).
    """
    # The captioner returns a list of result dicts; the first entry
    # carries the generated caption text.
    caption_results = caption_image(images=pil_image)
    semantics = caption_results[0]['generated_text']

    # Turn the caption into narrated audio, then hand back both outputs.
    audio = generate_audio(semantics)
    return semantics, audio
42
+
43
+
44
# Assemble the Gradio UI: one image input, caption text + narration outputs.
image_input = gr.Image(label="Select Image", type="pil")
caption_output = gr.Textbox(label="Image Caption")
audio_output = gr.Audio(label="Image Caption Audio")

demo = gr.Interface(
    fn=caption_my_image,
    inputs=[image_input],
    outputs=[caption_output, audio_output],
    title="IMAGE CAPTIONING WITH AUDIO OUTPUT",
    description="THIS APPLICATION WILL BE USED TO CAPTION IMAGES WITH THE HELP OF AI",
)
demo.launch()