import torch
import gradio as gr
import whisper
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModelForCausalLM
)
from datetime import datetime
from gtts import gTTS
import tempfile
import os


class EnhancedCaregiverSystem:
    def __init__(self):
        # Device configuration
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Initialize Whisper for speech recognition
        self.whisper_model = whisper.load_model("base", device=self.device)

        # Initialize emotion detection
        self.emotion_model_name = "bhadresh-savani/bert-base-uncased-emotion"
        self.emotion_tokenizer = AutoTokenizer.from_pretrained(self.emotion_model_name)
        self.emotion_model = AutoModelForSequenceClassification.from_pretrained(
            self.emotion_model_name
        ).to(self.device)

        # Initialize conversation model
        self.conv_model_name = "microsoft/DialoGPT-medium"
        self.conv_tokenizer = AutoTokenizer.from_pretrained(self.conv_model_name)
        self.conv_model = AutoModelForCausalLM.from_pretrained(
            self.conv_model_name
        ).to(self.device)

        # Initialize chat history
        self.chat_history = []

    def transcribe_audio(self, audio):
        """Convert speech to text using Whisper."""
        try:
            # Gradio's Audio component (type="numpy") passes a
            # (sample_rate, data) tuple, not a bare array.
            sample_rate, audio = audio
            if audio.ndim > 1:
                audio = np.mean(audio, axis=1)  # Downmix stereo to mono
            # Whisper expects float32 samples in [-1, 1] at 16 kHz;
            # Gradio delivers 16-bit integer PCM.
            audio = audio.astype(np.float32) / 32768.0
            if sample_rate != 16000:
                # Crude linear resample; adequate for speech input
                n = int(len(audio) * 16000 / sample_rate)
                audio = np.interp(np.linspace(0, len(audio), n, endpoint=False),
                                  np.arange(len(audio)), audio).astype(np.float32)
            result = self.whisper_model.transcribe(audio)
            return result["text"].strip()
        except Exception as e:
            print(f"Error in transcription: {e}")
            return "Could not transcribe audio. Please try again."

    def detect_emotion(self, text):
        """Detect emotion from text."""
        try:
            inputs = self.emotion_tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(self.device)
            with torch.no_grad():
                outputs = self.emotion_model(**inputs)
            emotion_id = outputs.logits.argmax().item()
            # Label order follows the model's id2label config
            emotion_map = {
                0: "sadness",
                1: "joy",
                2: "love",
                3: "anger",
                4: "fear",
                5: "surprise"
            }
            return emotion_map.get(emotion_id, "neutral")
        except Exception as e:
            print(f"Error in emotion detection: {e}")
            return "neutral"

    def generate_response(self, text, emotion):
        """Generate a contextual response based on input and emotion."""
        try:
            emotion_context = f"[Emotion: {emotion}] "
            full_prompt = emotion_context + text + self.conv_tokenizer.eos_token
            inputs = self.conv_tokenizer.encode(
                full_prompt,
                return_tensors="pt"
            ).to(self.device)
            with torch.no_grad():
                outputs = self.conv_model.generate(
                    inputs,
                    max_length=1000,
                    pad_token_id=self.conv_tokenizer.eos_token_id,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True,
                    no_repeat_ngram_size=3
                )
            # Decode only the newly generated tokens, not the prompt
            response = self.conv_tokenizer.decode(
                outputs[:, inputs.shape[-1]:][0],
                skip_special_tokens=True
            )
            if emotion in ["sadness", "anger", "fear"]:
                response += f"\n\nI notice you might be feeling {emotion}. I'm here to support you."
            elif emotion == "joy":
                response += "\n\nIt's wonderful to hear you're feeling positive!"
            return response
        except Exception as e:
            print(f"Error in response generation: {e}")
            return "I apologize, but I'm having trouble generating a response. Could you rephrase that?"

    def text_to_speech(self, text):
        """Convert the response to speech using gTTS."""
        try:
            # delete=False keeps the file on disk so Gradio can serve it
            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as fp:
                tts = gTTS(text=text, lang='en')
                tts.save(fp.name)
                return fp.name
        except Exception as e:
            print(f"Error in text-to-speech: {e}")
            return None
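
    def cleanup_audio_files(self, paths):
        """Remove generated speech files once they're no longer needed.

        A minimal housekeeping sketch (an addition, not part of the original
        design): text_to_speech writes with delete=False, so .mp3 files
        accumulate in the temp directory until removed explicitly.
        """
        for path in paths:
            try:
                os.remove(path)
            except OSError as e:
                print(f"Could not remove {path}: {e}")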

    def process_input(self, input_type, input_data):
        """Process either text or audio input."""
        if input_type == "audio":
            text = self.transcribe_audio(input_data)
        else:
            text = input_data.strip()

        if not text:
            return {
                "text": "Please provide some input.",
                "emotion": "neutral",
                "response": "I couldn't understand that. Could you try again?",
                "audio_response": None
            }

        emotion = self.detect_emotion(text)
        response = self.generate_response(text, emotion)
        audio_file = self.text_to_speech(response)

        self.chat_history.append({
            "timestamp": datetime.now().strftime("%H:%M:%S"),
            "user_input": text,
            "emotion": emotion,
            "response": response
        })

        return {
            "text": text,
            "emotion": emotion,
            "response": response,
            "audio_response": audio_file
        }


# Initialize the system
caregiver = EnhancedCaregiverSystem()


# Define Gradio interface functions
def process_text(message):
    result = caregiver.process_input("text", message)
    return result["text"], result["emotion"], result["response"], result["audio_response"]


def process_audio(audio):
    result = caregiver.process_input("audio", audio)
    return result["text"], result["emotion"], result["response"], result["audio_response"]
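

# Optional smoke test for the text path (a sketch, assuming the models above
# loaded successfully): flip the flag to exercise the pipeline without the UI.
RUN_SMOKE_TEST = False
if RUN_SMOKE_TEST:
    print(process_text("I had a rough day at work."))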


# Create Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# AI Caregiver System")

    with gr.Tab("Text Input"):
        text_input = gr.Textbox(label="Your message")
        text_button = gr.Button("Send Message")
        text_output = [
            gr.Textbox(label="Transcription"),
            gr.Textbox(label="Emotion"),
            gr.Textbox(label="Response"),
            gr.Audio(label="Audio Response")
        ]
        text_button.click(process_text, inputs=text_input, outputs=text_output)

    with gr.Tab("Voice Input"):
        # Gradio 4 replaced the old `source` argument with `sources`;
        # allow both microphone recording and file upload
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="numpy",
            label="Your voice message"
        )
        audio_button = gr.Button("Process Voice")
        audio_output = [
            gr.Textbox(label="Transcription"),
            gr.Textbox(label="Emotion"),
            gr.Textbox(label="Response"),
            gr.Audio(label="Audio Response")
        ]
        audio_button.click(process_audio, inputs=audio_input, outputs=audio_output)

iface.launch(share=True)