import csv
import json
import logging
import os
import time

import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
from pydub import AudioSegment

from app.config import settings

logger = logging.getLogger(__name__)


def text_details(response_json):
    """Extract the display text and mispronounced words from a recognition payload.

    Only the first entry of ``NBest`` (the top-ranked hypothesis) is inspected.

    Args:
        response_json: Parsed JSON of a recognition result. Expected to hold an
            ``NBest`` list whose entries carry ``Display`` and ``Words``.

    Returns:
        A dict with two keys:

        * ``"mispronunced_words"``: one dict per word whose
          ``PronunciationAssessment.ErrorType`` equals ``"Mispronunciation"``,
          with keys ``"word"``, ``"offset"`` and ``"position_in_text"``
          (the word's index inside ``Words``).
        * ``"display_text"``: the ``Display`` string of the top hypothesis.

        When ``NBest`` is missing or empty the result is
        ``{"mispronunced_words": [], "display_text": ""}``.
    """
    hypotheses = response_json.get("NBest") or []
    if not hypotheses:
        # Nothing was recognised; return the empty shape instead of raising.
        return {"mispronunced_words": [], "display_text": ""}

    top_hypothesis = hypotheses[0]
    word_list = top_hypothesis.get("Words") or []

    # enumerate() gives each word its true position; list.index() would return
    # the first *equal* entry and costs a linear scan per word.
    mispronunced_words = [
        {
            "word": word_info["Word"],
            "offset": word_info["Offset"],
            "position_in_text": position,
        }
        for position, word_info in enumerate(word_list)
        if word_info.get("PronunciationAssessment", {}).get("ErrorType") == "Mispronunciation"
    ]

    return {
        "mispronunced_words": mispronunced_words,
        "display_text": top_hypothesis.get("Display", ""),
    }


def pronunciation_assessment(file_path, language):
    """Run a pronunciation assessment on an audio file with the Azure Speech SDK.

    An ``.mp3`` input is first converted to a ``.wav`` file written next to it
    (same name, ``.wav`` extension); that file is left on disk and is the path
    reported in the result.

    Args:
        file_path: Path to the audio file, as a ``str`` or path-like object.
        language: Recognition language passed to the speech config
            (e.g. ``"en-US"``).

    Returns:
        A dict with the keys ``"File"``, ``"pronunciation_score"``,
        ``"accuracy_score"``, ``"fluency_score"``, ``"completeness_score"``,
        ``"mispronunced_words"`` and ``"display_text"`` when speech was
        recognised; ``None`` otherwise (a warning is logged with the reason).
    """
    # Work on a plain string: the SDK takes a filename string, and calling
    # ``.replace`` on a ``pathlib.Path`` would mean "rename", not "substitute".
    file_path = str(file_path)

    stem, extension = os.path.splitext(file_path)
    if extension.lower() == ".mp3":
        # Swap only the extension; a blanket str.replace would also rewrite
        # ".mp3" occurring in directory names.
        wav_path = f"{stem}.wav"
        # export() hands back the opened output file; close it so the handle
        # is not leaked and the recognizer can open the file itself.
        AudioSegment.from_mp3(file_path).export(wav_path, format="wav").close()
        file_path = wav_path

    # Initialize speech config
    speech_config = speechsdk.SpeechConfig(
        subscription=settings.AZURE_AI_SUBSCRIPTION_KEY,
        region=settings.AZURE_AI_REGION,
    )
    speech_config.speech_recognition_language = language

    # Create pronunciation assessment config
    pronunciation_config = speechsdk.PronunciationAssessmentConfig(
        grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
        granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
        enable_miscue=True,
    )

    # Initialize the recognizer on the audio file and attach the assessment config
    audio_config = speechsdk.audio.AudioConfig(filename=file_path)
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=audio_config
    )
    pronunciation_config.apply_to(recognizer)

    # Perform recognition and assessment
    result = recognizer.recognize_once()

    # Check the outcome before touching the JSON payload: results that are not
    # RecognizedSpeech may not carry the NBest data the parsing relies on.
    if result.reason != speechsdk.ResultReason.RecognizedSpeech:
        logger.warning(
            "No speech recognized in %s (reason: %s)", file_path, result.reason
        )
        return None

    pronunciation_result = speechsdk.PronunciationAssessmentResult(result)
    audio_text_details = text_details(json.loads(result.json))

    return {
        "File": file_path,
        "pronunciation_score": pronunciation_result.pronunciation_score,
        "accuracy_score": pronunciation_result.accuracy_score,
        "fluency_score": pronunciation_result.fluency_score,
        "completeness_score": pronunciation_result.completeness_score,
        "mispronunced_words": audio_text_details.get("mispronunced_words", []),
        "display_text": audio_text_details.get("display_text", ""),
    }