"""Gradio demo: choose a cocktail, then see and hear its recipe and photo.

At import time the script loads an image-captioning pipeline and a
text-to-speech pipeline, fetches the cocktail names returned by a
TheCocktailDB search for "margarita", builds the UI and launches it.
"""
import tempfile
from io import BytesIO

import gradio as gr
import requests
import soundfile as sf
import torch
from datasets import load_dataset
from PIL import Image
from transformers import pipeline

# Seconds to wait on any HTTP call; without a timeout a stalled server would
# block the request handler (or app start-up) indefinitely.
REQUEST_TIMEOUT = 15
SEARCH_ENDPOINT = "https://www.thecocktaildb.com/api/json/v1/1/search.php"

image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")

# Speaker x-vector passed to the TTS pipeline on every synthesis call.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Populate the radio choices once at start-up. Any failure is reported on
# stdout and leaves ``lista`` empty instead of preventing the UI from starting.
url = "https://www.thecocktaildb.com/api/json/v1/1/search.php?s=margarita"
lista = []
response = None
try:
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
except requests.RequestException as exc:
    print(f"Error: {exc}")
else:
    if response.status_code == 200:
        datos = response.json()
        # Treat a missing or null "drinks" field as "no results".
        drinks = datos.get("drinks") or []
        lista.extend(drink["strDrink"] for drink in drinks)
    else:
        print(f"Error: {response.status_code}")


def _synthesise_to_wav(text):
    """Synthesise *text* to speech and return the path of the WAV file.

    Returns ``None`` when *text* is empty or blank, so the audio output is
    simply left empty instead of asking the model to speak nothing.
    """
    if not text or not text.strip():
        return None
    speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
    # A unique file per call: a fixed name would be overwritten by any
    # overlapping request. The handle is closed before writing so the path
    # can be reopened by soundfile on every platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        wav_path = handle.name
    sf.write(wav_path, speech["audio"], samplerate=speech["sampling_rate"])
    return wav_path


def change_textbox(choice):
    """Build all outputs for the cocktail selected in the radio group.

    Args:
        choice: Cocktail name selected by the user; may be ``None`` or empty
            when nothing is selected.

    Returns:
        A 6-tuple ``(name, image, instructions, instructions_audio_path,
        image_description, description_audio_path)`` matching the order of
        the ``outputs`` wired to ``radio.change``. All outputs are blank when
        *choice* is empty.

    Raises:
        gr.Error: If the cocktail or its image cannot be fetched, or if no
            drink in the search results matches *choice* exactly
            (case-insensitively).
    """
    if not choice:
        return "", None, "", None, "", None

    try:
        # ``params`` lets requests escape the name (spaces, "&", "#", ...).
        cocktail = requests.get(
            SEARCH_ENDPOINT, params={"s": choice}, timeout=REQUEST_TIMEOUT
        )
        cocktail.raise_for_status()
        data = cocktail.json()
    except (requests.RequestException, ValueError) as exc:
        raise gr.Error(f"Could not fetch the cocktail: {exc}") from exc

    wanted = choice.lower()
    match = next(
        (
            drink
            for drink in (data.get("drinks") or [])
            if (drink.get("strDrink") or "").lower() == wanted
        ),
        None,
    )
    if match is None:
        # Previously this path fell through and crashed on unbound locals.
        raise gr.Error(f"No cocktail named '{choice}' was found.")

    name = match["strDrink"]
    instructions = match.get("strInstructions") or ""
    image_url = match.get("strDrinkThumb")
    if not image_url:
        raise gr.Error(f"No image is available for '{name}'.")

    try:
        img_response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
        img_response.raise_for_status()
        image = Image.open(BytesIO(img_response.content)).convert("RGB")
    except (requests.RequestException, OSError) as exc:
        raise gr.Error(f"Could not load the cocktail image: {exc}") from exc

    result = image_to_text(image)
    descripcion = result[0]["generated_text"]

    audio_path = _synthesise_to_wav(instructions)
    audio_path2 = _synthesise_to_wav(descripcion)

    # The instructions are returned as a plain string; the Textbox output
    # component already exists in the layout below.
    return name, image, instructions, audio_path, descripcion, audio_path2


with gr.Blocks() as demo:
    gr.HTML(""" """)
    gr.Markdown(
        """

Cocktails Descriptions

"""
    )
    radio = gr.Radio(lista, label="Choose your cocktail:")
    text = gr.Textbox(lines=2, interactive=False, show_copy_button=True, label="Cocktail Name")
    imagen = gr.Image(label="Cocktail Image")
    text2 = gr.Textbox(lines=2, interactive=False, show_copy_button=True, label="Instructions")
    audio = gr.Audio(label="Cocktail Instructions Audio")
    text3 = gr.Textbox(lines=2, interactive=False, show_copy_button=True, label="Image description")
    audio2 = gr.Audio(label="Audio image description")
    radio.change(
        fn=change_textbox,
        inputs=radio,
        outputs=[text, imagen, text2, audio, text3, audio2],
    )

demo.launch()