# AuraSense / app.py
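# Assumed runtime dependencies (the original does not pin or list them):
#   pip install torch gradio openai-whisper transformers gTTS numpy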
import torch
import gradio as gr
import whisper
import numpy as np
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
AutoModelForCausalLM
)
from datetime import datetime
from gtts import gTTS
import tempfile
class EnhancedCaregiverSystem:
def __init__(self):
# Device configuration
self.device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {self.device}")
# Initialize Whisper for speech recognition
self.whisper_model = whisper.load_model("base").to(self.device)
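        # "base" is one of the smaller Whisper checkpoints; "small" or "medium"
        # give better transcription quality at the cost of speed and memory.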
# Initialize emotion detection
self.emotion_model_name = "bhadresh-savani/bert-base-uncased-emotion"
self.emotion_tokenizer = AutoTokenizer.from_pretrained(self.emotion_model_name)
self.emotion_model = AutoModelForSequenceClassification.from_pretrained(
self.emotion_model_name
).to(self.device)
# Initialize conversation model
self.conv_model_name = "microsoft/DialoGPT-medium"
self.conv_tokenizer = AutoTokenizer.from_pretrained(self.conv_model_name)
self.conv_model = AutoModelForCausalLM.from_pretrained(
self.conv_model_name
).to(self.device)
# Initialize chat history
self.chat_history = []
    def transcribe_audio(self, audio):
        """Convert speech to text using Whisper"""
        try:
            # Gradio's numpy audio input arrives as a (sample_rate, data) tuple
            sample_rate, audio = audio if isinstance(audio, tuple) else (16000, audio)
            audio = np.asarray(audio, dtype=np.float32)
            if audio.ndim > 1:
                audio = audio.mean(axis=1)  # Convert stereo to mono
            if np.abs(audio).max() > 1.0:
                audio /= 32768.0  # Normalise 16-bit PCM to [-1, 1]
            if sample_rate != 16000:  # Whisper expects 16 kHz input
                idx = np.linspace(0, len(audio), int(len(audio) * 16000 / sample_rate), endpoint=False)
                audio = np.interp(idx, np.arange(len(audio)), audio).astype(np.float32)
            result = self.whisper_model.transcribe(audio)
            return result["text"].strip()
        except Exception as e:
            print(f"Error in transcription: {e}")
            return "Could not transcribe audio. Please try again."
def detect_emotion(self, text):
"""Detect emotion from text"""
try:
inputs = self.emotion_tokenizer(
text,
return_tensors="pt",
truncation=True,
max_length=512
).to(self.device)
with torch.no_grad():
outputs = self.emotion_model(**inputs)
emotion_id = outputs.logits.argmax().item()
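            # The index order below mirrors the id2label mapping published with
            # this model (the six labels of the "emotion" dataset).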
emotion_map = {
0: "sadness",
1: "joy",
2: "love",
3: "anger",
4: "fear",
5: "surprise"
}
return emotion_map.get(emotion_id, "neutral")
except Exception as e:
print(f"Error in emotion detection: {e}")
return "neutral"
def generate_response(self, text, emotion):
"""Generate contextual response based on input and emotion"""
try:
emotion_context = f"[Emotion: {emotion}] "
full_prompt = emotion_context + text + self.conv_tokenizer.eos_token
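            # DialoGPT was not trained with emotion tags, so this prefix is only a
            # soft hint to steer the reply, not a control code the model recognises.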
            # Tokenise with an explicit attention mask so generate() does not warn
            # about the pad token doubling as the eos token
            inputs = self.conv_tokenizer(
                full_prompt,
                return_tensors="pt"
            ).to(self.device)
            with torch.no_grad():
                outputs = self.conv_model.generate(
                    inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_length=1000,
                    pad_token_id=self.conv_tokenizer.eos_token_id,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True,
                    no_repeat_ngram_size=3
                )
            response = self.conv_tokenizer.decode(
                outputs[:, inputs["input_ids"].shape[-1]:][0],
                skip_special_tokens=True
            )
if emotion in ["sadness", "anger", "fear"]:
response += f"\n\nI notice you might be feeling {emotion}. I'm here to support you."
elif emotion == "joy":
response += "\n\nIt's wonderful to hear you're feeling positive!"
return response
except Exception as e:
print(f"Error in response generation: {e}")
return "I apologize, but I'm having trouble generating a response. Could you rephrase that?"
def text_to_speech(self, text):
"""Convert response to speech using gTTS"""
try:
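            # gTTS calls Google's online TTS service, so this step needs network
            # access; delete=False keeps the mp3 on disk for Gradio to serve.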
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as fp:
tts = gTTS(text=text, lang='en')
tts.save(fp.name)
return fp.name
except Exception as e:
print(f"Error in text-to-speech: {e}")
return None
def process_input(self, input_type, input_data):
"""Process either text or audio input"""
if input_type == "audio":
text = self.transcribe_audio(input_data)
else:
text = input_data.strip()
if not text:
return {
"text": "Please provide some input.",
"emotion": "neutral",
"response": "I couldn't understand that. Could you try again?",
"audio_response": None
}
emotion = self.detect_emotion(text)
response = self.generate_response(text, emotion)
audio_file = self.text_to_speech(response)
timestamp = datetime.now().strftime("%H:%M:%S")
self.chat_history.append({
"timestamp": timestamp,
"user_input": text,
"emotion": emotion,
"response": response
})
return {
"text": text,
"emotion": emotion,
"response": response,
"audio_response": audio_file
}
# Initialize the system
caregiver = EnhancedCaregiverSystem()
# Define Gradio interface functions
def process_text(message):
result = caregiver.process_input("text", message)
return result["text"], result["emotion"], result["response"], result["audio_response"]
def process_audio(audio):
result = caregiver.process_input("audio", audio)
return result["text"], result["emotion"], result["response"], result["audio_response"]
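# A minimal smoke-test sketch (kept as a comment so the Space only runs the UI;
# the example message is made up):
#
#   result = caregiver.process_input("text", "I had a rough day at work")
#   print(result["emotion"], "->", result["response"])
#
# process_input returns the keys "text", "emotion", "response" and "audio_response".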
# Create Gradio interface
with gr.Blocks() as iface:
gr.Markdown("# AI Caregiver System")
with gr.Tab("Text Input"):
text_input = gr.Textbox(label="Your message")
text_button = gr.Button("Send Message")
        text_output = [
            gr.Textbox(label="Transcription"),
            gr.Textbox(label="Emotion"),
            gr.Textbox(label="Response"),
            gr.Audio(label="Audio Response")
        ]
text_button.click(process_text, inputs=text_input, outputs=text_output)
with gr.Tab("Voice Input"):
        # gr.Audio no longer takes a `source` argument; recent Gradio versions use a
        # `sources` list instead, so we rely on the component's default input mode here.
        audio_input = gr.Audio(type="numpy", label="Your voice message")
audio_button = gr.Button("Process Voice")
        audio_output = [
            gr.Textbox(label="Transcription"),
            gr.Textbox(label="Emotion"),
            gr.Textbox(label="Response"),
            gr.Audio(label="Audio Response")
        ]
audio_button.click(process_audio, inputs=audio_input, outputs=audio_output)
iface.launch(share=True)