File size: 3,026 Bytes
79b7942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import csv
import json
import os
import time

import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
from pydub import AudioSegment

from app.config import settings


def text_details(response_json):
    """Extract mispronounced words and the display text from a recognition result.

    Only the first entry of the ``"NBest"`` list is inspected.

    Args:
        response_json: Parsed JSON (a dict) of a recognition result. It is
            expected to carry an ``"NBest"`` list whose entries have a
            ``"Words"`` list and a ``"Display"`` string.

    Returns:
        A dict with two keys:

        * ``"mispronunced_words"``: one dict per word whose
          ``PronunciationAssessment.ErrorType`` equals ``"Mispronunciation"``,
          holding ``"word"``, ``"offset"`` and ``"position_in_text"`` (the
          word's index inside the ``"Words"`` list).
        * ``"display_text"``: the ``"Display"`` value of the top result, or
          ``""`` when it is absent.

        When ``"NBest"`` is missing or empty, the list is empty and the
        display text is ``""``.
    """
    candidates = response_json.get("NBest") or []
    if not candidates:
        # Nothing was recognized; report an empty result instead of raising.
        return {"mispronunced_words": [], "display_text": ""}

    nbest = candidates[0]  # Only the top result is used
    word_list = nbest.get("Words") or []

    mispronunced_words = []
    # enumerate() yields the true position; list.index() would return the first
    # *equal* entry, which is wrong whenever an identical entry appears twice.
    for position, word_info in enumerate(word_list):
        assessment = word_info.get("PronunciationAssessment") or {}
        if assessment.get("ErrorType") != "Mispronunciation":
            continue
        mispronunced_words.append(
            {
                "word": word_info["Word"],
                "offset": word_info["Offset"],
                "position_in_text": position,
            }
        )

    return {
        "mispronunced_words": mispronunced_words,
        "display_text": nbest.get("Display", ""),
    }


def pronunciation_assessment(file_path, language):
    """Run a pronunciation assessment on an audio file.

    A file whose extension is ``.mp3`` (any letter case) is first converted
    to a ``.wav`` file written next to it; that wav file is the one that is
    recognized, and it is not removed afterwards.

    Args:
        file_path: Path of the audio file to assess.
        language: Value assigned to the speech config's
            ``speech_recognition_language``.

    Returns:
        A dict with the keys ``"File"`` (the path that was actually
        recognized), ``"pronunciation_score"``, ``"accuracy_score"``,
        ``"fluency_score"``, ``"completeness_score"``,
        ``"mispronunced_words"`` and ``"display_text"`` when the result reason
        is ``RecognizedSpeech``; ``None`` for any other reason.
    """
    # Split off the real extension so only the suffix is rewritten; a plain
    # str.replace() would also alter ".mp3" occurring inside directory names.
    root, extension = os.path.splitext(str(file_path))
    if extension.lower() == ".mp3":
        mp3_path = file_path
        file_path = root + ".wav"
        AudioSegment.from_mp3(mp3_path).export(file_path, format="wav")

    # Initialize speech config
    speech_config = speechsdk.SpeechConfig(
        subscription=settings.AZURE_AI_SUBSCRIPTION_KEY, region=settings.AZURE_AI_REGION
    )
    speech_config.speech_recognition_language = language

    # Create pronunciation assessment config
    pronunciation_config = speechsdk.PronunciationAssessmentConfig(
        grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
        granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
        enable_miscue=True,
    )

    # Initialize audio config from file
    audio_config = speechsdk.audio.AudioConfig(filename=file_path)

    # Initialize speech recognizer with the audio file
    recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    # Apply pronunciation assessment configuration
    pronunciation_config.apply_to(recognizer)

    # Perform recognition and assessment
    result = recognizer.recognize_once()

    if result.reason != speechsdk.ResultReason.RecognizedSpeech:
        # Nothing to report; the result JSON is deliberately not parsed here.
        return None

    # Parse the detailed JSON only once recognition is known to have succeeded.
    audio_text_details = text_details(json.loads(result.json))

    # Extract pronunciation assessment results
    pronunciation_result = speechsdk.PronunciationAssessmentResult(result)

    # Build result dictionary
    return {
        "File": file_path,
        "pronunciation_score": pronunciation_result.pronunciation_score,
        "accuracy_score": pronunciation_result.accuracy_score,
        "fluency_score": pronunciation_result.fluency_score,
        "completeness_score": pronunciation_result.completeness_score,
        "mispronunced_words": audio_text_details.get("mispronunced_words", []),
        "display_text": audio_text_details.get("display_text", ""),
    }