Spaces:

calebaryee321
/

Whisper2Image

Build error

App Files Files Community

calebaryee321 committed on Nov 30, 2022

Commit

3c53ea4

1 Parent(s): 89f557e

Create app.py

Browse files

Files changed (1) hide show

app.py +54 -0

app.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import functools
+import time
+
+import gradio as gr
+import sounddevice as sd
+import soundfile as sf
+import torch
+import whisper
+from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
+@functools.lru_cache(maxsize=1)
+def _load_whisper_model(name="base"):
+    """Load the Whisper checkpoint *name* once and reuse it on later calls."""
+    return whisper.load_model(name)
+
+
+def SpeechToText(audio):
+    """Transcribe a recorded audio file to text with Whisper.
+
+    Args:
+        audio: Path to the recorded audio file, or ``None``/empty when
+            nothing was recorded.
+
+    Returns:
+        The decoded text, or ``""`` when no audio was supplied.
+    """
+    # Identity/emptiness check instead of ``== None``; an empty path is
+    # treated the same as a missing recording.
+    if not audio:
+        return ""
+    # The checkpoint is cached, so it is not reloaded for every request.
+    model = _load_whisper_model()
+    # Load the waveform and pad/trim it to the length Whisper expects.
+    samples = whisper.pad_or_trim(whisper.load_audio(audio))
+    # Make the log-Mel spectrogram and move it to the same device as the model.
+    mel = whisper.log_mel_spectrogram(samples).to(model.device)
+    # No explicit language is set, so whisper.decode detects it on its own;
+    # the separate detect_language pass, whose result was never used, is gone.
+    options = whisper.DecodingOptions(fp16=False)
+    return whisper.decode(model, mel, options).text
+@functools.lru_cache(maxsize=1)
+def _load_sd_pipeline(model_id):
+    """Build the Stable Diffusion pipeline for *model_id* once, on the GPU."""
+    # Use the Euler scheduler here instead of the checkpoint's default.
+    scheduler = EulerDiscreteScheduler.from_pretrained(
+        model_id, subfolder="scheduler"
+    )
+    pipe = StableDiffusionPipeline.from_pretrained(
+        model_id,
+        scheduler=scheduler,
+        revision="fp16",
+        torch_dtype=torch.float16,
+    )
+    return pipe.to("cuda")
+
+
+def img_Generation(
+    text,
+    num_inference_steps=150,
+    model_id="stabilityai/stable-diffusion-2",
+):
+    """Generate an image from a text prompt with Stable Diffusion.
+
+    The pipeline is moved to ``"cuda"``, so a CUDA device is required.
+
+    Args:
+        text: Prompt passed to the pipeline.
+        num_inference_steps: Number of denoising steps (default 150).
+        model_id: Model repository to load the scheduler and pipeline from.
+
+    Returns:
+        The first generated image; it is also saved to ``img_1.png``.
+    """
+    print(text)
+    # The pipeline is cached, so weights are not reloaded and re-uploaded
+    # to the GPU for every prompt.
+    pipe = _load_sd_pipeline(model_id)
+    image = pipe(text, num_inference_steps=num_inference_steps).images[0]
+    image.save("img_1.png")
+    return image
+def transcribe(audio):
+    """Turn a spoken prompt into an image.
+
+    Args:
+        audio: Path to the recorded audio file, or ``None`` when nothing
+            was recorded.
+
+    Returns:
+        The generated image, or ``None`` when there is no text to draw.
+    """
+    text = SpeechToText(audio)
+    # SpeechToText returns "" when nothing was recorded; skip the expensive
+    # diffusion run rather than generating from an empty prompt.
+    if not text:
+        return None
+    return img_Generation(text)
+# Build the microphone-to-image UI and start serving it.
+gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(source="microphone", type="filepath"),
+    outputs="image",
+    title="Whisper2IMG",
+    description="A Speech to Image Generation App Using OpenAI's Whisper",
+).launch(share=True)  # boolean flag; the original passed the string "True"