File size: 2,181 Bytes
79b7942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import librosa
import numpy as np
from pydub import AudioSegment


# Feature extraction for Intonation
def evaluate_intonation(wav_file):
    """Score the intonation of a recording on a 0-10 scale.

    Three sub-scores are computed from the audio and averaged:

    * ``sentence_stress`` -- mean RMS energy of the signal, multiplied by 10.
    * ``intonation_patterns`` -- coefficient of variation (std / mean) of the
      pitch candidates whose magnitude exceeds the median magnitude,
      multiplied by 10.
    * ``rhythm`` -- estimated tempo relative to a 120 BPM reference,
      multiplied by 10.

    Each sub-score is clamped to ``[0, 10]`` before averaging.

    Args:
        wav_file: Audio source passed straight to ``librosa.load`` (called
            with its default settings).

    Returns:
        dict with float values under the keys ``"sentence_stress"``,
        ``"intonation_patterns"``, ``"rhythm"`` and ``"intonation_score"``
        (the arithmetic mean of the three sub-scores). If the loaded signal
        contains no samples, every value is ``0.0``.
    """
    reference_tempo_bpm = 120.0  # Original author's assumed average speaking tempo.
    scale = 10.0  # Multiplier that maps each raw feature onto the score range.

    y, sr = librosa.load(wav_file)

    # Nothing to analyse: return neutral scores instead of letting NaNs or
    # feature-extraction errors propagate from an empty signal.
    if y.size == 0:
        return {
            "sentence_stress": 0.0,
            "intonation_patterns": 0.0,
            "rhythm": 0.0,
            "intonation_score": 0.0,
        }

    # 1. Sentence stress: frame-wise RMS energy, used here as a proxy for
    # perceived loudness.
    rms_energy = librosa.feature.rms(y=y)[0]
    avg_energy = float(np.mean(rms_energy)) * scale

    # 2. Intonation patterns: relative spread of the pitch candidates.
    # Only bins whose magnitude is above the median are kept.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    pitch_values = pitches[magnitudes > np.median(magnitudes)]
    mean_pitch = float(np.mean(pitch_values)) if pitch_values.size else 0.0
    if mean_pitch > 0:
        pitch_variation = float(np.std(pitch_values)) / mean_pitch * scale
    else:
        # No usable pitch candidates (or a zero mean): avoid dividing by zero.
        pitch_variation = 0.0

    # 3. Rhythm: tempo estimate in BPM relative to the reference tempo.
    # Depending on the librosa version, ``tempo`` is either a scalar or a
    # one-element array; take the first element explicitly rather than
    # relying on NumPy's deprecated array -> float conversion.
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tempo_bpm = float(np.ravel(tempo)[0])
    rhythm_raw = (tempo_bpm / reference_tempo_bpm) * scale

    # Normalise every sub-score into [0, 10] before averaging.
    sentence_stress_score = _clamp_score(avg_energy)
    intonation_patterns_score = _clamp_score(pitch_variation)
    rhythm_score = _clamp_score(rhythm_raw)

    # Final score: (Sentence Stress + Intonation Patterns + Rhythm) / 3
    intonation_score = (
        sentence_stress_score + intonation_patterns_score + rhythm_score
    ) / 3

    return {
        "sentence_stress": sentence_stress_score,
        "intonation_patterns": intonation_patterns_score,
        "rhythm": rhythm_score,
        "intonation_score": intonation_score,
    }


def _clamp_score(value, low=0.0, high=10.0):
    """Return *value* limited to ``[low, high]``; NaN is mapped to *low*."""
    value = float(value)
    # NaN compares false against everything, so a plain min/max clamp would
    # hand it back unchanged and poison the averaged score.
    if np.isnan(value):
        return low
    return min(max(value, low), high)