import os
import tempfile
from datetime import datetime

import torch
import gradio as gr
import whisper
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModelForCausalLM
)
from gtts import gTTS


class EnhancedCaregiverSystem:
    def __init__(self):
        # Device configuration
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Initialize Whisper for speech recognition
        self.whisper_model = whisper.load_model("base").to(self.device)

        # Initialize emotion detection
        self.emotion_model_name = "bhadresh-savani/bert-base-uncased-emotion"
        self.emotion_tokenizer = AutoTokenizer.from_pretrained(self.emotion_model_name)
        self.emotion_model = AutoModelForSequenceClassification.from_pretrained(
            self.emotion_model_name
        ).to(self.device)

        # Initialize conversation model
        self.conv_model_name = "microsoft/DialoGPT-medium"
        self.conv_tokenizer = AutoTokenizer.from_pretrained(self.conv_model_name)
        self.conv_model = AutoModelForCausalLM.from_pretrained(
            self.conv_model_name
        ).to(self.device)

        # Initialize chat history
        self.chat_history = []

    def transcribe_audio(self, audio):
        """Convert speech to text using Whisper."""
        try:
            # Gradio's numpy Audio component passes a (sample_rate, data) tuple
            sample_rate, audio = audio
            if audio.ndim > 1:
                audio = np.mean(audio, axis=1)  # Convert stereo to mono if necessary

            # Whisper expects float32 samples in [-1, 1] at 16 kHz
            if np.issubdtype(audio.dtype, np.integer):
                audio = audio.astype(np.float32) / 32768.0
            else:
                audio = audio.astype(np.float32)
            if sample_rate != whisper.audio.SAMPLE_RATE:
                # Crude linear resample to Whisper's rate; adequate for a demo
                target_len = int(len(audio) * whisper.audio.SAMPLE_RATE / sample_rate)
                audio = np.interp(
                    np.linspace(0, len(audio), num=target_len, endpoint=False),
                    np.arange(len(audio)),
                    audio
                ).astype(np.float32)

            result = self.whisper_model.transcribe(audio)
            return result["text"].strip()
        except Exception as e:
            print(f"Error in transcription: {e}")
            return "Could not transcribe audio. Please try again."

    def detect_emotion(self, text):
        """Detect emotion from text."""
        try:
            inputs = self.emotion_tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(self.device)

            with torch.no_grad():
                outputs = self.emotion_model(**inputs)

            emotion_id = outputs.logits.argmax().item()
            emotion_map = {
                0: "sadness",
                1: "joy",
                2: "love",
                3: "anger",
                4: "fear",
                5: "surprise"
            }
            return emotion_map.get(emotion_id, "neutral")
        except Exception as e:
            print(f"Error in emotion detection: {e}")
            return "neutral"

    def generate_response(self, text, emotion):
        """Generate contextual response based on input and emotion."""
        try:
            emotion_context = f"[Emotion: {emotion}] "
            full_prompt = emotion_context + text + self.conv_tokenizer.eos_token

            inputs = self.conv_tokenizer.encode(
                full_prompt,
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self.conv_model.generate(
                    inputs,
                    max_length=1000,
                    pad_token_id=self.conv_tokenizer.eos_token_id,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True,
                    no_repeat_ngram_size=3
                )

            response = self.conv_tokenizer.decode(
                outputs[:, inputs.shape[-1]:][0],
                skip_special_tokens=True
            )

            if emotion in ["sadness", "anger", "fear"]:
                response += f"\n\nI notice you might be feeling {emotion}. I'm here to support you."
            elif emotion == "joy":
                response += "\n\nIt's wonderful to hear you're feeling positive!"

            return response
        except Exception as e:
            print(f"Error in response generation: {e}")
            return "I apologize, but I'm having trouble generating a response. Could you rephrase that?"
    def text_to_speech(self, text):
        """Convert response to speech using gTTS."""
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as fp:
                tts = gTTS(text=text, lang='en')
                tts.save(fp.name)
                return fp.name
        except Exception as e:
            print(f"Error in text-to-speech: {e}")
            return None

    def process_input(self, input_type, input_data):
        """Process either text or audio input."""
        if input_type == "audio":
            text = self.transcribe_audio(input_data)
        else:
            text = (input_data or "").strip()

        if not text:
            return {
                "text": "Please provide some input.",
                "emotion": "neutral",
                "response": "I couldn't understand that. Could you try again?",
                "audio_response": None
            }

        emotion = self.detect_emotion(text)
        response = self.generate_response(text, emotion)
        audio_file = self.text_to_speech(response)

        timestamp = datetime.now().strftime("%H:%M:%S")
        self.chat_history.append({
            "timestamp": timestamp,
            "user_input": text,
            "emotion": emotion,
            "response": response
        })

        return {
            "text": text,
            "emotion": emotion,
            "response": response,
            "audio_response": audio_file
        }


# Initialize the system
caregiver = EnhancedCaregiverSystem()


# Define Gradio interface functions
def process_text(message):
    result = caregiver.process_input("text", message)
    return result["text"], result["emotion"], result["response"], result["audio_response"]


def process_audio(audio):
    result = caregiver.process_input("audio", audio)
    return result["text"], result["emotion"], result["response"], result["audio_response"]


# Create Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# AI Caregiver System")

    with gr.Tab("Text Input"):
        text_input = gr.Textbox(label="Your message")
        text_button = gr.Button("Send Message")
        text_output = [
            gr.Textbox(label="Transcription"),
            gr.Textbox(label="Emotion"),
            gr.Textbox(label="Response"),
            gr.Audio(label="Audio Response")
        ]
        text_button.click(process_text, inputs=text_input, outputs=text_output)

    with gr.Tab("Voice Input"):
        # Newer Gradio versions dropped the `source` argument; by default the
        # component accepts uploaded or microphone-recorded audio
        audio_input = gr.Audio(type="numpy", label="Your voice message")
        audio_button = gr.Button("Process Voice")
        audio_output = [
            gr.Textbox(label="Transcription"),
            gr.Textbox(label="Emotion"),
            gr.Textbox(label="Response"),
            gr.Audio(label="Audio Response")
        ]
        audio_button.click(process_audio, inputs=audio_input, outputs=audio_output)

iface.launch(share=True)
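
# --- Optional: exercising the pipeline without the Gradio UI ------------------
# A minimal sketch, not part of the app above: iface.launch() blocks, so run
# lines like the following in a separate session (after constructing
# EnhancedCaregiverSystem) to test the text -> emotion -> response -> speech
# pipeline directly. The sample message is purely illustrative.
#
#   caregiver = EnhancedCaregiverSystem()
#   result = caregiver.process_input("text", "I've been feeling anxious about my health lately.")
#   print(result["emotion"])         # one of: sadness, joy, love, anger, fear, surprise, neutral
#   print(result["response"])        # DialoGPT reply plus the emotion-aware suffix
#   print(result["audio_response"])  # path to the temporary MP3 produced by gTTS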