geethareddy committed on
Commit
9a63e27
·
verified ·
1 Parent(s): a1e88be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -38
app.py CHANGED
@@ -2,8 +2,7 @@ import torch
2
  from flask import Flask, render_template, request, jsonify
3
  import os
4
  import re
5
- import ffmpeg
6
- from transformers import pipeline # ✅ Using correct Whisper ASR pipeline
7
  from gtts import gTTS
8
  from pydub import AudioSegment
9
  from pydub.silence import detect_nonsilent
@@ -11,11 +10,11 @@ from waitress import serve
11
 
12
  app = Flask(__name__)
13
 
14
- # ✅ Load Whisper ASR Model correctly
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
16
  asr_model = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=0 if device == "cuda" else -1)
17
 
18
- # Function to generate audio prompts
19
  def generate_audio_prompt(text, filename):
20
  tts = gTTS(text=text, lang="en")
21
  tts.save(os.path.join("static", filename))
@@ -31,44 +30,26 @@ prompts = {
31
  for key, text in prompts.items():
32
  generate_audio_prompt(text, f"{key}.mp3")
33
 
34
- # Symbol mapping for proper recognition
35
- SYMBOL_MAPPING = {
36
- "at the rate": "@",
37
- "at": "@",
38
- "dot": ".",
39
- "underscore": "_",
40
- "hash": "#",
41
- "plus": "+",
42
- "dash": "-",
43
- "comma": ",",
44
- "space": " "
45
- }
46
-
47
- # Function to convert audio to WAV format
48
  def convert_to_wav(input_path, output_path):
49
  try:
50
  audio = AudioSegment.from_file(input_path)
 
51
  audio.export(output_path, format="wav")
52
  except Exception as e:
53
  raise Exception(f"Audio conversion failed: {str(e)}")
54
 
55
- # Function to clean transcribed text
56
- def clean_transcription(text):
57
- text = text.lower().strip()
58
- ignore_phrases = ["my name is", "this is", "i am", "it's", "name"]
59
- for phrase in ignore_phrases:
60
- text = text.replace(phrase, "").strip()
61
-
62
- for word, symbol in SYMBOL_MAPPING.items():
63
- text = text.replace(word, symbol)
64
-
65
- return text.capitalize()
66
-
67
- # Function to check if audio contains actual speech
68
  def is_silent_audio(audio_path):
69
  audio = AudioSegment.from_wav(audio_path)
70
  nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)
71
- return len(nonsilent_parts) == 0 # Returns True if silence detected
 
 
 
 
 
 
72
 
73
  @app.route("/")
74
  def index():
@@ -85,21 +66,21 @@ def transcribe():
85
  audio_file.save(input_audio_path)
86
 
87
  try:
88
- # Convert to WAV
89
  convert_to_wav(input_audio_path, output_audio_path)
90
 
91
- # Check for silence
92
  if is_silent_audio(output_audio_path):
93
  return jsonify({"error": "No speech detected. Please try again."}), 400
94
-
95
- # ✅ Use Whisper ASR model for transcription
96
  result = asr_model(output_audio_path, generate_kwargs={"language": "en"})
97
  transcribed_text = clean_transcription(result["text"])
98
-
99
  return jsonify({"text": transcribed_text})
100
  except Exception as e:
101
  return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
102
 
103
- # Start Waitress Production Server
104
  if __name__ == "__main__":
105
  serve(app, host="0.0.0.0", port=7860)
 
2
  from flask import Flask, render_template, request, jsonify
3
  import os
4
  import re
5
+ from transformers import pipeline
 
6
  from gtts import gTTS
7
  from pydub import AudioSegment
8
  from pydub.silence import detect_nonsilent
 
10
 
11
  app = Flask(__name__)
12
 
13
+ # ✅ Load Whisper ASR Model Correctly with Language Specification
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
  asr_model = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=0 if device == "cuda" else -1)
16
 
17
+ # Function to generate voice prompts
18
  def generate_audio_prompt(text, filename):
19
  tts = gTTS(text=text, lang="en")
20
  tts.save(os.path.join("static", filename))
 
30
  for key, text in prompts.items():
31
  generate_audio_prompt(text, f"{key}.mp3")
32
 
33
+ # ✅ Ensure Proper Audio Format (16kHz, Mono)
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def convert_to_wav(input_path, output_path):
35
  try:
36
  audio = AudioSegment.from_file(input_path)
37
+ audio = audio.set_frame_rate(16000).set_channels(1) # ✅ Convert to 16kHz, mono
38
  audio.export(output_path, format="wav")
39
  except Exception as e:
40
  raise Exception(f"Audio conversion failed: {str(e)}")
41
 
42
+ # ✅ Check for Silence
 
 
 
 
 
 
 
 
 
 
 
 
43
  def is_silent_audio(audio_path):
44
  audio = AudioSegment.from_wav(audio_path)
45
  nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)
46
+ return len(nonsilent_parts) == 0
47
+
48
+ # ✅ Clean Transcription Text
49
+ def clean_transcription(text):
50
+ text = text.strip()
51
+ text = re.sub(r"[-.]", "", text) # ✅ Remove unwanted characters
52
+ return text.capitalize()
53
 
54
  @app.route("/")
55
  def index():
 
66
  audio_file.save(input_audio_path)
67
 
68
  try:
69
+ # ✅ Convert audio to proper format
70
  convert_to_wav(input_audio_path, output_audio_path)
71
 
72
+ # ✅ Check for silent audio
73
  if is_silent_audio(output_audio_path):
74
  return jsonify({"error": "No speech detected. Please try again."}), 400
75
+
76
+ # ✅ Transcribe Using Whisper ASR
77
  result = asr_model(output_audio_path, generate_kwargs={"language": "en"})
78
  transcribed_text = clean_transcription(result["text"])
79
+
80
  return jsonify({"text": transcribed_text})
81
  except Exception as e:
82
  return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
83
 
84
+ # ✅ Start Production Server
85
  if __name__ == "__main__":
86
  serve(app, host="0.0.0.0", port=7860)