alvarochamorro3 committed
Commit da9eae4 · verified · 1 Parent(s): 9b04ab5

Update app.py

Files changed (1):
  app.py +24 -5
app.py CHANGED
@@ -1,11 +1,30 @@
 import gradio as gr
+from transformers import pipeline
+from datasets import load_dataset
+import soundfile as sf
+import torch
+import requests
 
+# Image-to-text model
+image_to_text_pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
+API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
+headers = {}  # Replace with your actual key
 
-def greet(name):
-    return "Hello " + name + "!"
 
+def image_to_text(image_path):
+    with open(image_path, "rb") as f:
+        data = f.read()
+    response = requests.post(API_URL, headers=headers, data=data)
+    response_json = response.json()
+    generated_text = response_json[0]['generated_text']
+    return generated_text
 
-demo = gr.Interface(fn=greet, inputs="textbox", outputs="textbox")
+# Text-to-audio model
+text_to_audio_pipe = pipeline("text-to-speech", model="microsoft/speecht5_tts")
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
-if __name__ == "__main__":
-    demo.launch()
+def text_to_audio(text):
+    speech = text_to_audio_pipe(text, forward_params={"speaker_embeddings": speaker_embedding})
+    sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
+    return "speech.wav"