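# Gradio demo chaining three Hugging Face pipelines:
# image captioning -> text generation -> text-to-speech.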
import gradio as gr
import numpy as np
from transformers import pipeline

# Pipelines: image captioning, text generation, and speech synthesis
# caption = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
caption = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
generate = pipeline("text-generation", model="openai-community/gpt2")
tts = pipeline(task="text-to-speech", model="facebook/mms-tts-eng")
def run_caption(img):
    # Caption the image, capping the length of the generated caption
    res = caption(img, max_new_tokens=128)
    return res[0]["generated_text"]


def run_generate(txt):
    # Continue the input text with GPT-2
    res = generate(txt, max_length=50)
    return res[0]["generated_text"]


def run_tts(txt):
    # Synthesize speech and convert the float waveform to 16-bit PCM for gr.Audio
    res = tts(txt)
    audio = (res["audio"].reshape(-1) * 2 ** 15).astype(np.int16)
    return res["sampling_rate"], audio


def run_caption_tts(img):
    # Caption the image, then speak the caption
    return run_tts(run_caption(img))


def run_caption_generate_tts(img):
    # Caption the image, extend the caption with GPT-2, then speak the result
    return run_tts(run_generate(run_caption(img)))
with gr.Blocks() as demo:
    # Each Interface renders as its own section inside the Blocks layout
    gr.Interface(
        run_caption,
        inputs=gr.Image(type="pil"),
        outputs="text",
    )
    gr.Interface(
        run_generate,
        inputs="text",
        outputs="text",
    )
    gr.Interface(
        run_tts,
        inputs=gr.Textbox(),
        outputs="audio",
    )
    gr.Interface(
        run_caption_tts,
        inputs=gr.Image(type="pil"),
        outputs="audio",
    )
    gr.Interface(
        run_caption_generate_tts,
        inputs=gr.Image(type="pil"),
        outputs="audio",
    )
if __name__ == "__main__":
    demo.launch()