Spaces:

jaykishan-b
/

speech-analysis

Runtime error

App Files Files Community

speech-analysis / app /utils /ai_speech.py

jaykishan-b

init

79b7942 7 months ago

raw

history blame

3.03 kB

	import csv
	import json
	import os
	import time

	import azure.cognitiveservices.speech as speechsdk
	from dotenv import load_dotenv
	from pydub import AudioSegment

	from app.config import settings


	def text_details(response_json):
	    """Extract mispronounced words and the display text from a recognition payload.

	    Only the first entry of the payload's ``"NBest"`` list (the top
	    candidate) is inspected.

	    Args:
	        response_json: Parsed recognition result (a dict). Missing or empty
	            ``"NBest"``, ``"Words"``, ``"PronunciationAssessment"`` or
	            ``"Display"`` entries are tolerated.

	    Returns:
	        A dict with two keys:

	        * ``"mispronunced_words"`` (spelling kept because callers read this
	          key): one dict per word whose ``ErrorType`` is
	          ``"Mispronunciation"``, holding ``"word"``, ``"offset"`` and
	          ``"position_in_text"`` (the word's index in the ``"Words"`` list).
	        * ``"display_text"``: the top candidate's ``"Display"`` string, or
	          ``""`` when it is unavailable.
	    """
	    candidates = response_json.get("NBest") or []
	    if not candidates:
	        # Nothing was recognized; return the same shape with empty values
	        # instead of failing on candidates[0].
	        return {"mispronunced_words": [], "display_text": ""}

	    nbest = candidates[0]  # Assumes you use the top result
	    word_list = nbest.get("Words") or []

	    # enumerate() yields the true position of each entry. list.index() would
	    # return the first *equal* dict, which is wrong for repeated identical
	    # entries and costs a linear scan per word.
	    mispronunced_words = [
	        {
	            "word": word_info["Word"],
	            "offset": word_info["Offset"],
	            "position_in_text": position,
	        }
	        for position, word_info in enumerate(word_list)
	        if (word_info.get("PronunciationAssessment") or {}).get("ErrorType")
	        == "Mispronunciation"
	    ]

	    return {
	        "mispronunced_words": mispronunced_words,
	        "display_text": nbest.get("Display", ""),
	    }


	def pronunciation_assessment(file_path, language):
	    """Run an Azure pronunciation assessment on a single audio file.

	    A file whose extension is ``.mp3`` (any letter case) is first converted
	    with pydub to a ``.wav`` file written at the same path with the extension
	    swapped; that WAV file is what gets recognized and reported.

	    Args:
	        file_path: Path of the audio file to assess.
	        language: Value assigned to the recognizer's
	            ``speech_recognition_language``.

	    Returns:
	        A dict with the keys ``"File"``, ``"pronunciation_score"``,
	        ``"accuracy_score"``, ``"fluency_score"``, ``"completeness_score"``,
	        ``"mispronunced_words"`` and ``"display_text"``.

	    Raises:
	        RuntimeError: If the recognizer does not report
	            ``ResultReason.RecognizedSpeech`` (for example no match, or a
	            canceled request).
	    """
	    # Decide on the real extension only. A suffix/substring check would also
	    # match names such as "clipmp3" (and then overwrite the source file on
	    # export) or rewrite ".mp3" occurring inside a directory name.
	    stem, extension = os.path.splitext(str(file_path))
	    if extension.lower() == ".mp3":
	        mp3_path = file_path
	        file_path = stem + ".wav"
	        sound = AudioSegment.from_mp3(mp3_path)
	        sound.export(file_path, format="wav")

	    # Initialize speech config
	    speech_config = speechsdk.SpeechConfig(
	        subscription=settings.AZURE_AI_SUBSCRIPTION_KEY,
	        region=settings.AZURE_AI_REGION,
	    )
	    speech_config.speech_recognition_language = language

	    # Create pronunciation assessment config
	    pronunciation_config = speechsdk.PronunciationAssessmentConfig(
	        grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
	        granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
	        enable_miscue=True,
	    )

	    # Initialize audio config from file
	    audio_config = speechsdk.audio.AudioConfig(filename=file_path)

	    # Initialize speech recognizer with the audio file
	    recognizer = speechsdk.SpeechRecognizer(
	        speech_config=speech_config, audio_config=audio_config
	    )

	    # Apply pronunciation assessment configuration
	    pronunciation_config.apply_to(recognizer)

	    # Perform recognition and assessment
	    result = recognizer.recognize_once()

	    # Check the outcome before touching the payload: a failed recognition has
	    # no assessment data, and previously surfaced as an unrelated
	    # IndexError/UnboundLocalError instead of a meaningful error.
	    if result.reason != speechsdk.ResultReason.RecognizedSpeech:
	        message = f"Speech recognition failed for {file_path!r}: {result.reason}"
	        if result.reason == speechsdk.ResultReason.Canceled:
	            message += f" ({result.cancellation_details.error_details})"
	        raise RuntimeError(message)

	    audio_text_details = text_details(json.loads(result.json))

	    # Extract pronunciation assessment results
	    pronunciation_result = speechsdk.PronunciationAssessmentResult(result)

	    # Build result dictionary
	    return {
	        "File": file_path,
	        "pronunciation_score": pronunciation_result.pronunciation_score,
	        "accuracy_score": pronunciation_result.accuracy_score,
	        "fluency_score": pronunciation_result.fluency_score,
	        "completeness_score": pronunciation_result.completeness_score,
	        "mispronunced_words": audio_text_details.get("mispronunced_words", []),
	        "display_text": audio_text_details.get("display_text", ""),
	    }