AVE2

Runtime error

File size: 3,318 Bytes

1d6f96a
 
 
d0263ad
3fdea53
37e33c3
3c19287
b3ece58
d97a50e
1d6f96a
b3ece58
670552f
 
 
 
 
 
 
 
f2b7d46
 
 
d97a50e
fc15cbc
f2b7d46
e2ed0ec
d97a50e
ac02da5
d97a50e
f2b7d46
 
b3ece58
e2ed0ec
670552f
 
 
 
 
 
 
e2ed0ec
b3ece58
e2ed0ec
 
b3ece58
e2ed0ec
670552f
 
 
 
f2b7d46
670552f
 
 
 
 
 
 
e2ed0ec
366a4b5
33519ac
d97a50e
4e7f2d5
f2b7d46
 
 
 
d97a50e
f2b7d46
 
 
 
 
 
 
 
 
836ccde
8f0b9db
2fb792e
d97a50e
 
 
2fb792e
591dae5
8425dd7
2fb792e
d0cc8ca
f679e15
d97a50e
 
8ebdf87
2fb792e
47ef36a
 
dc1b3f3
 
 
 
 
9524e11
8f0b9db
d97a50e

import gradio as gr
import numpy as np
import librosa
import requests
from io import BytesIO
from PIL import Image
import os
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel

# Load the emotion prediction model
def load_emotion_model(model_path):
    try:
        model = load_model(model_path)
        return model
    except Exception as e:
        print("Error loading emotion prediction model:", e)
        return None

model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)

# Initialize WhisperModel
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")

# Function to transcribe audio
def transcribe(wav_filepath):
    segments, _ = model2.transcribe(wav_filepath, beam_size=5)
    return "".join([segment.text for segment in segments])

# Function to extract MFCC features from audio
def extract_mfcc(wav_file_name):
    try:
        y, sr = librosa.load(wav_file_name)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
        return mfccs
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return None

# Emotions dictionary
emotions = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad', 5: 'angry', 6: 'fearful', 7: 'disgust', 8: 'surprised'}

# Function to predict emotion from audio
def predict_emotion_from_audio(wav_filepath):
    try:
        test_point = extract_mfcc(wav_filepath)
        if test_point is not None:
            test_point = np.reshape(test_point, newshape=(1, 40, 1))
            predictions = model.predict(test_point)
            predicted_emotion_label = np.argmax(predictions[0]) + 1
            return emotions[predicted_emotion_label]
        else:
            return "Error: Unable to extract features"
    except Exception as e:
        print("Error predicting emotion:", e)
        return None

api_key = os.getenv("DeepAI_api_key")

# Function to generate an image using DeepAI Text to Image API
def generate_image(api_key, text):
    url = "https://api.deepai.org/api/text2img"
    headers = {'api-key': api_key}
    response = requests.post(
        url,
        data={'text': text},
        headers=headers
    )
    response_data = response.json()
    if 'output_url' in response_data:
        image_url = response_data['output_url']
        image_response = requests.get(image_url)
        image = Image.open(BytesIO(image_response.content))
        return image
    else:
        return None

        
# Function to get predictions
def get_predictions(audio_input):
    emotion_prediction = predict_emotion_from_audio(audio_input)
    transcribed_text = transcribe(audio_input)
    texto_imagen = emotion_prediction + transcribed_text
    image = generate_image(api_key, texto_imagen)
    return emotion_prediction, transcribed_text, image

# Create the Gradio interface
interface = gr.Interface(
    fn=get_predictions,
    inputs=gr.Audio(label="Input Audio", type="filepath"),
    outputs=[
        gr.Label("Acoustic Prediction", label="Acoustic Prediction"),
        gr.Label("Transcribed Text", label="Transcribed Text"),
        gr.Image(type='pil', label="Generated Image")
    ],
    title="Affective Virtual Environments",
    description="Create an AVE using your voice."
)


interface.launch()