Afrinetwork7 committed on
Commit
0eaed7a
1 Parent(s): 1388ad6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -5
app.py CHANGED
@@ -8,7 +8,6 @@ import base64
8
  import logging
9
  import torch
10
  import librosa
11
- from transformers import Wav2Vec2ForCTC, AutoProcessor
12
  from pathlib import Path
13
  import magic # For MIME type detection
14
  from pydub import AudioSegment
@@ -17,7 +16,7 @@ from pydub import AudioSegment
17
  from asr import transcribe, ASR_LANGUAGES
18
  from tts import synthesize, TTS_LANGUAGES
19
  from lid import identify
20
- from asr import ASR_SAMPLING_RATE, transcribe
21
 
22
  # Configure logging
23
  logging.basicConfig(level=logging.INFO)
@@ -78,11 +77,27 @@ async def transcribe_audio(request: AudioRequest):
78
  @app.post("/synthesize")
79
  async def synthesize_speech(request: TTSRequest):
80
  try:
81
- audio, filtered_text = synthesize(request.text, request.language, request.speed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  # Convert numpy array to bytes
83
  buffer = io.BytesIO()
84
- sf.write(buffer, audio, 22050, format='wav')
85
  buffer.seek(0)
 
86
  return FileResponse(
87
  buffer,
88
  media_type="audio/wav",
@@ -117,4 +132,4 @@ async def get_tts_languages():
117
  return JSONResponse(content=TTS_LANGUAGES)
118
  except Exception as e:
119
  logger.error(f"Error in get_tts_languages: {str(e)}")
120
- raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
 
8
  import logging
9
  import torch
10
  import librosa
 
11
  from pathlib import Path
12
  import magic # For MIME type detection
13
  from pydub import AudioSegment
 
16
  from asr import transcribe, ASR_LANGUAGES
17
  from tts import synthesize, TTS_LANGUAGES
18
  from lid import identify
19
+ from asr import ASR_SAMPLING_RATE
20
 
21
  # Configure logging
22
  logging.basicConfig(level=logging.INFO)
 
77
  @app.post("/synthesize")
78
  async def synthesize_speech(request: TTSRequest):
79
  try:
80
+ logger.info(f"Synthesizing speech for text: {request.text}, language: {request.language}, speed: {request.speed}")
81
+ result, filtered_text = synthesize(request.text, request.language, request.speed)
82
+ logger.info(f"Synthesis complete. Filtered text: {filtered_text}")
83
+
84
+ sample_rate, audio = result
85
+ logger.info(f"Sample rate: {sample_rate}, Audio shape: {audio.shape}, Audio dtype: {audio.dtype}")
86
+
87
+ # Ensure audio is a numpy array with the correct dtype
88
+ audio = np.array(audio, dtype=np.float32)
89
+
90
+ # Normalize audio to [-1, 1] range
91
+ audio = audio / np.max(np.abs(audio))
92
+
93
+ # Convert to int16 for WAV file
94
+ audio = (audio * 32767).astype(np.int16)
95
+
96
  # Convert numpy array to bytes
97
  buffer = io.BytesIO()
98
+ sf.write(buffer, audio, sample_rate, format='wav')
99
  buffer.seek(0)
100
+
101
  return FileResponse(
102
  buffer,
103
  media_type="audio/wav",
 
132
  return JSONResponse(content=TTS_LANGUAGES)
133
  except Exception as e:
134
  logger.error(f"Error in get_tts_languages: {str(e)}")
135
+ raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")