Spaces:

invincible-jha
/

MentalHealthVocalBiomarkers

Sleeping

App Files Files Community

invincible-jha committed on Nov 18, 2024

Commit

f7af1db

verified ·

1 Parent(s): 1cd7ce8

Upload app.py

Browse files

Files changed (1) hide show

app.py +150 -143

app.py CHANGED Viewed

@@ -4,123 +4,73 @@ from transformers import WhisperProcessor, WhisperForConditionalGeneration, Auto
 import librosa
 import numpy as np
 import plotly.graph_objects as go
-class ModelManager:
-    def __init__(self):
-        self.device = torch.device("cpu")
-        self.models = {}
-        self.tokenizers = {}
-        self.processors = {}
-        self.load_models()
-    def load_models(self):
-        try:
-            print("Loading Whisper model...")
-            self.processors['whisper'] = WhisperProcessor.from_pretrained(
-                "openai/whisper-base"  # Removed device_map parameter
-            )
-            self.models['whisper'] = WhisperForConditionalGeneration.from_pretrained(
-                "openai/whisper-base"  # Removed device_map parameter
-            ).to(self.device)
-            print("Loading emotion model...")
-            self.tokenizers['emotion'] = AutoTokenizer.from_pretrained(
-                "j-hartmann/emotion-english-distilroberta-base"
-            )
-            self.models['emotion'] = AutoModelForSequenceClassification.from_pretrained(
-                "j-hartmann/emotion-english-distilroberta-base"  # Removed device_map parameter
-            ).to(self.device)
-            print("Models loaded successfully")
-        except Exception as e:
-            print(f"Error loading models: {str(e)}")
-            raise
-class AudioProcessor:
-    def __init__(self):
-        self.sample_rate = 16000
-        self.n_mfcc = 13
-    def process_audio(self, audio_path):
-        try:
-            waveform, sr = librosa.load(audio_path, sr=self.sample_rate)
-            return waveform, self._extract_features(waveform)
-        except Exception as e:
-            print(f"Error processing audio: {str(e)}")
-            raise
-    def _extract_features(self, waveform):
-        try:
-            return {
-                'mfcc': librosa.feature.mfcc(y=waveform, sr=self.sample_rate, n_mfcc=self.n_mfcc),
-                'energy': librosa.feature.rms(y=waveform)[0]
-            }
-        except Exception as e:
-            print(f"Error extracting features: {str(e)}")
-            raise
-class Analyzer:
-    def __init__(self):
-        print("Initializing Analyzer...")
-        try:
-            self.model_manager = ModelManager()
-            self.audio_processor = AudioProcessor()
-            print("Analyzer initialization complete")
-        except Exception as e:
-            print(f"Error initializing Analyzer: {str(e)}")
-            raise
-    def analyze(self, audio_path):
-        try:
-            print(f"Processing audio file: {audio_path}")
-            waveform, features = self.audio_processor.process_audio(audio_path)
-            print("Transcribing audio...")
-            inputs = self.model_manager.processors['whisper'](
-                waveform,
-                return_tensors="pt"
-            ).input_features.to(self.model_manager.device)
-            with torch.no_grad():
-                predicted_ids = self.model_manager.models['whisper'].generate(inputs)
-            transcription = self.model_manager.processors['whisper'].batch_decode(
-                predicted_ids,
-                skip_special_tokens=True
-            )[0]
-            print("Analyzing emotions...")
-            inputs = self.model_manager.tokenizers['emotion'](
-                transcription,
-                return_tensors="pt",
-                padding=True,
-                truncation=True,
-                max_length=512
-            )
-            inputs = {k: v.to(self.model_manager.device) for k, v in inputs.items()}
-            with torch.no_grad():
-                outputs = self.model_manager.models['emotion'](**inputs)
-            emotions = torch.nn.functional.softmax(outputs.logits, dim=-1)
-            emotion_labels = ['anger', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
-            emotion_scores = {
-                label: float(score)
-                for label, score in zip(emotion_labels, emotions[0].cpu())
-            }
-            return {
-                'transcription': transcription,
-                'emotions': emotion_scores
-            }
-        except Exception as e:
-            print(f"Error in analysis: {str(e)}")
-            raise
 def create_emotion_plot(emotions):
     try:
         fig = go.Figure(data=[
             go.Bar(
-                x=list(emotions.keys()),
                 y=list(emotions.values()),
                 marker_color='rgb(55, 83, 109)'
             )
@@ -140,48 +90,105 @@ def create_emotion_plot(emotions):
         print(f"Error creating plot: {str(e)}")
         return "Error creating visualization"
-def process_audio(audio_file):
     try:
-        if audio_file is None:
             return "No audio file provided", "Please provide an audio file"
-        print(f"Processing audio file: {audio_file}")
-        results = analyzer.analyze(audio_file)
-        return (
-            results['transcription'],
-            create_emotion_plot(results['emotions'])
         )
     except Exception as e:
-        error_msg = f"Error processing audio: {str(e)}"
         print(error_msg)
         return error_msg, "Error in analysis"
 if __name__ == "__main__":
-    print("Initializing application...")
-    try:
-        analyzer = Analyzer()
-        print("Creating Gradio interface...")
-        interface = gr.Interface(
-            fn=process_audio,
-            inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
-            outputs=[
-                gr.Textbox(label="Transcription"),
-                gr.HTML(label="Emotion Analysis")
-            ],
-            title="Vocal Biomarker Analysis",
-            description="Analyze voice for emotional indicators",
-            examples=[],
-            cache_examples=False
-        )
-        print("Launching application...")
-        interface.launch(
-            server_name="0.0.0.0",
-            server_port=7860,
-            share=False
-        )
-    except Exception as e:
-        print(f"Fatal error during application startup: {str(e)}")
-        raise

 import librosa
 import numpy as np
 import plotly.graph_objects as go
+import warnings
+import os
+warnings.filterwarnings('ignore')
+# Module-level model handles. All four start as None and are assigned by
+# load_models() through its `global` statement; analyze_audio() reads them.
+processor = None  # set from WhisperProcessor.from_pretrained(...)
+whisper_model = None  # set from WhisperForConditionalGeneration.from_pretrained(...)
+emotion_tokenizer = None  # set from AutoTokenizer.from_pretrained(...)
+emotion_model = None  # set from AutoModelForSequenceClassification.from_pretrained(...)
+def load_models():
+    """Load the Whisper and emotion models into the module-level globals.
+
+    Assigns ``processor``, ``whisper_model``, ``emotion_tokenizer`` and
+    ``emotion_model``, then moves both models to the CPU.
+
+    Returns:
+        True when every step succeeded; False if any step raised (the error
+        is printed rather than re-raised).
+    """
+    global processor, whisper_model, emotion_tokenizer, emotion_model
+    whisper_checkpoint = "openai/whisper-tiny"
+    emotion_checkpoint = "j-hartmann/emotion-english-distilroberta-base"
+    try:
+        print("Loading Whisper model...")
+        processor = WhisperProcessor.from_pretrained(whisper_checkpoint)
+        whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_checkpoint)
+        print("Loading emotion model...")
+        emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_checkpoint)
+        emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_checkpoint)
+        # Move models to CPU explicitly
+        for loaded_model in (whisper_model, emotion_model):
+            loaded_model.to("cpu")
+        print("Models loaded successfully!")
+        return True
+    except Exception as e:
+        print(f"Error loading models: {str(e)}")
+        return False
+def process_audio(audio_input):
+    """Return the input audio as a mono 16 kHz float waveform.
+
+    Args:
+        audio_input: Either a path to an audio file, or a
+            ``(sample_rate, samples)`` tuple such as Gradio produces for
+            ``type="numpy"`` audio components.
+
+    Returns:
+        1-D numpy array of samples at 16000 Hz.
+
+    Raises:
+        FileNotFoundError: If a path is given and nothing exists there.
+        Exception: Any other loading/resampling error is printed and re-raised.
+    """
+    target_sr = 16000
+    try:
+        print(f"Audio input received: {type(audio_input)}")
+        # Handle tuple input from Gradio
+        if isinstance(audio_input, tuple):
+            print(f"Audio input is tuple: {audio_input[0]}, {audio_input[1]}")
+            # The tuple carries raw samples, not a file path, so convert it
+            # in memory instead of handing the sample rate to os.path.exists.
+            source_sr, samples = audio_input
+            samples = np.asarray(samples)
+            if np.issubdtype(samples.dtype, np.integer):
+                # Scale integer PCM into the [-1, 1] float range.
+                full_scale = float(np.iinfo(samples.dtype).max)
+                samples = samples.astype(np.float32) / full_scale
+            else:
+                samples = samples.astype(np.float32)
+            if samples.ndim > 1:
+                # Down-mix (n_samples, n_channels) audio to mono.
+                samples = samples.mean(axis=1)
+            if source_sr != target_sr:
+                samples = librosa.resample(samples, orig_sr=source_sr, target_sr=target_sr)
+            waveform, sr = samples, target_sr
+        else:
+            audio_path = audio_input
+            print(f"Processing audio from path: {audio_path}")
+            # Verify file exists
+            if not os.path.exists(audio_path):
+                raise FileNotFoundError(f"Audio file not found at {audio_path}")
+            # Load and resample audio
+            print("Loading audio file with librosa...")
+            waveform, sr = librosa.load(audio_path, sr=target_sr)
+        print(f"Audio loaded successfully. Shape: {waveform.shape}, SR: {sr}")
+        return waveform
+    except Exception as e:
+        print(f"Error processing audio: {str(e)}")
+        raise
 def create_emotion_plot(emotions):
+    """Create plotly visualization for emotion scores"""
     try:
         fig = go.Figure(data=[
             go.Bar(
+                x=list(emotions.keys()),
                 y=list(emotions.values()),
                 marker_color='rgb(55, 83, 109)'
             )
         print(f"Error creating plot: {str(e)}")
         return "Error creating visualization"
+def analyze_audio(audio_input):
+    """Transcribe a recording and score the emotions in its transcript.
+
+    Args:
+        audio_input: Value delivered by the Gradio audio component (passed
+            straight to process_audio), or None when nothing was provided.
+
+    Returns:
+        A ``(transcription, visualization)`` pair. When something goes wrong
+        the exception is caught and both items are human-readable messages
+        instead.
+    """
     try:
+        if audio_input is None:
+            print("No audio input provided")
             return "No audio file provided", "Please provide an audio file"
+        print(f"Received audio input: {audio_input}")
+        # Process audio
+        waveform = process_audio(audio_input)
+        if waveform is None or len(waveform) == 0:
+            return "Error: Invalid audio file", "Please provide a valid audio file"
+        # Transcribe audio
+        print("Transcribing audio...")
+        inputs = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
+        with torch.no_grad():
+            predicted_ids = whisper_model.generate(inputs)
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        print(f"Transcription completed: {transcription}")
+        if not transcription or transcription.isspace():
+            return "No speech detected in audio", "Unable to analyze emotions without speech"
+        # Analyze emotions
+        print("Analyzing emotions...")
+        inputs = emotion_tokenizer(
+            transcription,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=512
         )
+        with torch.no_grad():
+            outputs = emotion_model(**inputs)
+        emotions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+        # Read class names from the model config rather than a hand-written
+        # list: a list that is shorter than, or ordered differently from, the
+        # classifier's output silently attaches scores to the wrong emotions.
+        id2label = emotion_model.config.id2label
+        emotion_scores = {
+            id2label[index]: float(score)
+            for index, score in enumerate(emotions[0].cpu().numpy())
+        }
+        print(f"Emotion analysis completed: {emotion_scores}")
+        # Create visualization
+        emotion_viz = create_emotion_plot(emotion_scores)
+        return transcription, emotion_viz
+    except FileNotFoundError as e:
+        error_msg = f"Audio file not found: {str(e)}"
+        print(error_msg)
+        return error_msg, "Please provide a valid audio file"
     except Exception as e:
+        error_msg = f"Error analyzing audio: {str(e)}"
         print(error_msg)
         return error_msg, "Error in analysis"
+# Load models at startup; the interface is useless without them, so abort
+# the import if loading fails.
+print("Initializing application...")
+if not load_models():
+    raise RuntimeError("Failed to load required models")
+# Create Gradio interface
+demo = gr.Interface(
+    fn=analyze_audio,
+    inputs=gr.Audio(
+        # `sources` (a list) is the keyword the previous revision of this app
+        # used; listing both entries also matches the description below,
+        # which offers upload as well as microphone recording.
+        sources=["microphone", "upload"],
+        type="filepath",
+        label="Audio Input"
+    ),
+    outputs=[
+        gr.Textbox(label="Transcription"),
+        gr.HTML(label="Emotion Analysis")
+    ],
+    title="Vocal Emotion Analysis",
+    description="""
+    This app analyzes voice recordings to:
+    1. Transcribe speech to text
+    2. Detect emotions in the speech
+    Upload an audio file or record directly through your microphone.
+    """,
+    article="""
+    Models used:
+    - Speech recognition: Whisper (tiny)
+    - Emotion detection: DistilRoBERTa
+    Note: Processing may take a few moments depending on the length of the audio.
+    """,
+    examples=None,
+    cache_examples=False
+)
 if __name__ == "__main__":
+    demo.launch(debug=True)