gorkemgoknar committed
Commit e548e55
1 Parent(s): 245a4de

Update app.py

Files changed (1):
1. app.py +20 -8
app.py CHANGED
```diff
@@ -19,6 +19,7 @@ import torch
 import nltk # we'll use this to split into sentences
 nltk.download("punkt")
 
+import noisereduce as nr
 import subprocess
 import langid
 import uuid
```
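The new `noisereduce` import feeds the filtering pass added further down. For context, the neighbouring nltk/punkt dependency is what splits the LLM reply into sentences before per-sentence TTS; a minimal sketch of that split (the actual call site is outside this diff):

```python
import nltk  # the punkt model powers sent_tokenize

nltk.download("punkt")

text = "Hello there. How are you today? I am fine!"
print(nltk.sent_tokenize(text))
# ['Hello there.', 'How are you today?', 'I am fine!']
```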
```diff
@@ -549,7 +550,7 @@ def generate_speech(history,chatbot_role,llm_model):
 
 
 # will generate speech audio file per sentence
-def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=True):
+def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=False):
     language = "autodetect"
 
     wav_bytestream = b""
```
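The one functional change here is the `return_as_byte` default flipping from `True` to `False`, so callers that omit the flag now get a `.wav` file under `/tmp` (see the last code hunk) rather than a raw bytestream. The `language = "autodetect"` context line ties into the `langid` import from the first hunk; a minimal sketch of what such autodetection typically looks like with langid (the detection code itself is outside this diff):

```python
import langid

# classify returns a (language, score) pair, e.g. ('fr', ...)
lang, score = langid.classify("Bonjour, comment allez-vous ?")
print(lang)  # fr
```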
```diff
@@ -583,7 +584,7 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
     # sentence = sentence[:-1] + " " + sentence[-1]
 
     # regex does the job well
-    sentence = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)", r"\1 \2", sentence)
+    sentence = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)", r"\1 \2\2", sentence)
 
     print("Sentence for speech:", sentence)
 
```
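The replacement gains a second `\2`, so terminal punctuation is now doubled as well as space-separated, presumably to make XTTS pause more distinctly at sentence ends. A quick before/after demo of the substitution:

```python
import re

s = "It works. Really? Great!"
print(re.sub(r"([^\x00-\x7F]|\w)(\.|\。|\?|\!)", r"\1 \2", s))    # old: It works . Really ? Great !
print(re.sub(r"([^\x00-\x7F]|\w)(\.|\。|\?|\!)", r"\1 \2\2", s))  # new: It works .. Really ?? Great !!
```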
 
```diff
@@ -618,23 +619,34 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
     # XTTS is actually using streaming response but we are playing audio by sentence
     # If you want direct XTTS voice streaming (send each chunk to voice) you may set DIRECT_STREAM=1 environment variable
     if audio_stream is not None:
-        wav_chunks = wave_header_chunk()
         frame_length = 0
         for chunk in audio_stream:
             try:
                 wav_bytestream += chunk
-                wav_chunks += chunk
                 frame_length += len(chunk)
             except:
                 # hack to continue on playing. sometimes last chunk is empty, will be fixed on next TTS
                 continue
 
+    # Filter output for better voice
+    filter_output = True
+    if filter_output:
+        data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16, count=len(wav_bytestream)//2, offset=0)
+        float_data = data_s16 * 0.5**15
+        reduced_noise = nr.reduce_noise(y=float_data, sr=24000, prop_decrease=0.8, n_fft=1024)
+        wav_bytestream = (reduced_noise * 32767).astype(np.int16)
+        wav_bytestream = wav_bytestream.tobytes()
+
     if audio_stream is not None:
         if not return_as_byte:
             audio_unique_filename = "/tmp/" + str(uuid.uuid4()) + ".wav"
-            with open(audio_unique_filename, "wb") as f:
-                f.write(wav_chunks)
-                # Will write filename to context variable
+            with wave.open(audio_unique_filename, "w") as f:
+                f.setnchannels(1)
+                # 2 bytes per sample.
+                f.setsampwidth(2)
+                f.setframerate(24000)
+                f.writeframes(wav_bytestream)
+
             return (history, gr.Audio.update(value=audio_unique_filename, autoplay=True))
         else:
             return (history, gr.Audio.update(value=wav_bytestream, autoplay=True))
```
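Two additions replace the removed `wav_chunks`/`wave_header_chunk()` bookkeeping: a noisereduce spectral-gating pass over the accumulated PCM, and a `wave.open` writer that emits a proper RIFF header. A self-contained sketch of the round trip, assuming 16-bit mono PCM at XTTS's 24 kHz output rate (the synthetic buffer is illustration only):

```python
import wave

import numpy as np
import noisereduce as nr

SR = 24000  # XTTS output sample rate

# Stand-in for the accumulated XTTS chunks: one second of int16 noise.
wav_bytestream = (np.random.randn(SR) * 1000).astype(np.int16).tobytes()

# int16 bytes -> float samples in [-1, 1), as the new code does
data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16)
float_data = data_s16 * 0.5**15

# spectral gating; prop_decrease=0.8 removes 80% of the estimated noise
reduced = nr.reduce_noise(y=float_data, sr=SR, prop_decrease=0.8, n_fft=1024)

# back to int16 bytes for playback or file output
pcm = (reduced * 32767).astype(np.int16).tobytes()

# wave.open writes a valid WAV header, replacing the hand-built one
with wave.open("/tmp/example.wav", "wb") as f:
    f.setnchannels(1)   # mono
    f.setsampwidth(2)   # 2 bytes per sample (int16)
    f.setframerate(SR)
    f.writeframes(pcm)
```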
```diff
@@ -776,7 +788,7 @@ This Space demonstrates how to speak to a chatbot, based solely on open accessib
 It relies on following models :
 Speech to Text : [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
 LLM Mistral : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model, GGUF Q5_K_M quantized version used locally via llama_cpp[huggingface_hub](TheBloke/Mistral-7B-Instruct-v0.1-GGUF).
-LLM Zephyr : [Zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF).
+LLM Zephyr : [Zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF).
 Text to Speech : [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 
 Note:
```
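Only the Zephyr links change in this hunk (alpha to beta, for both the model card and TheBloke's GGUF repo). As for the Whisper Space mentioned above, it is reached through a gradio client; a hedged sketch, assuming the Space's `/predict` endpoint takes an audio file path plus a task string (the exact signature is not confirmed by this diff; check the Space's "Use via API" page):

```python
from gradio_client import Client

whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
# assumed argument order: (audio file path, task)
text = whisper_client.predict(
    "recording.wav",
    "transcribe",
    api_name="/predict",
)
print(text)
```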
 