gorkemgoknar committed · Commit e548e55 · Parent(s): 245a4de

Update app.py

app.py CHANGED
@@ -19,6 +19,7 @@ import torch
 import nltk  # we'll use this to split into sentences
 nltk.download("punkt")
 
+import noisereduce as nr
 import subprocess
 import langid
 import uuid
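The punkt download above powers the app's sentence splitting: the LLM reply is tokenized into sentences so each one can be synthesized separately. A minimal sketch of that step (standalone, not part of this commit):

import nltk

nltk.download("punkt")  # one-time download of the punkt sentence tokenizer

text = "Hello there! How are you today? I am a chatbot."
print(nltk.sent_tokenize(text))
# ['Hello there!', 'How are you today?', 'I am a chatbot.']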
@@ -549,7 +550,7 @@ def generate_speech(history,chatbot_role,llm_model):
 
 
 # will generate speech audio file per sentence
-def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=
+def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=False):
     language = "autodetect"
 
     wav_bytestream = b""
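For context, return_as_byte controls whether the synthesized audio is handed to Gradio as raw bytes or as a temp-file path; this commit makes the file path the default. A hypothetical reduction of that branch (the helper name and simplified logic are illustrative, not from the commit):

import uuid

def audio_payload(wav_bytestream: bytes, return_as_byte: bool = False):
    # True: hand Gradio the raw WAV bytes directly.
    if return_as_byte:
        return wav_bytestream
    # False (the new default): write a unique temp file and return its path.
    audio_unique_filename = "/tmp/" + str(uuid.uuid4()) + ".wav"
    with open(audio_unique_filename, "wb") as f:
        f.write(wav_bytestream)
    return audio_unique_filename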
@@ -583,7 +584,7 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
     # sentence = sentence[:-1] + " " + sentence[-1]
 
     # regex does the job well
-    sentence= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)",r"\1 \2",sentence)
+    sentence= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)",r"\1 \2\2",sentence)
 
     print("Sentence for speech:", sentence)
 
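The only change here is the extra \2 in the replacement: the regex already padded sentence-final punctuation with a space, and it now also doubles the mark, which in practice nudges XTTS toward a longer pause. A standalone repro (not part of the commit):

import re

sentence = "Hello world. 你好。 Ready?"
print(re.sub(r"([^\x00-\x7F]|\w)(\.|\。|\?|\!)", r"\1 \2\2", sentence))
# Hello world .. 你好 。。 Ready ??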
@@ -618,23 +619,34 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
     # XTTS is actually using streaming response but we are playing audio by sentence
     # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
     if audio_stream is not None:
-        wav_chunks = wave_header_chunk()
         frame_length = 0
         for chunk in audio_stream:
             try:
                 wav_bytestream += chunk
-                wav_chunks += chunk
                 frame_length += len(chunk)
             except:
                 # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
                 continue
 
+    # Filter output for better voice
+    filter_output=True
+    if filter_output:
+        data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16, count=len(wav_bytestream)//2, offset=0)
+        float_data = data_s16 * 0.5**15
+        reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024)
+        wav_bytestream = (reduced_noise * 32767).astype(np.int16)
+        wav_bytestream = wav_bytestream.tobytes()
+
     if audio_stream is not None:
         if not return_as_byte:
             audio_unique_filename = "/tmp/"+ str(uuid.uuid4())+".wav"
-            with open(audio_unique_filename, "
-                f.
-
+            with wave.open(audio_unique_filename, "w") as f:
+                f.setnchannels(1)
+                # 2 bytes per sample.
+                f.setsampwidth(2)
+                f.setframerate(24000)
+                f.writeframes(wav_bytestream)
+
             return (history , gr.Audio.update(value=audio_unique_filename, autoplay=True))
         else:
             return (history , gr.Audio.update(value=wav_bytestream, autoplay=True))
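The added filter block scales the accumulated int16 PCM to floats in [-1, 1), applies noisereduce's spectral gating, rescales to int16, and then writes a 24 kHz mono WAV via the wave module, which also gives the file a proper RIFF header (the old path prepended one via the now-removed wave_header_chunk()). A self-contained sketch of the same round trip, with synthetic audio standing in for the XTTS stream:

import wave

import numpy as np
import noisereduce as nr

SR = 24000  # XTTS output sample rate

# Stand-in for the accumulated XTTS byte stream: one second of quiet noise.
wav_bytestream = (np.random.randn(SR) * 500).astype(np.int16).tobytes()

data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16)
float_data = data_s16 * 0.5 ** 15                 # int16 -> float in [-1, 1)
reduced = nr.reduce_noise(y=float_data, sr=SR, prop_decrease=0.8, n_fft=1024)
pcm = (reduced * 32767).astype(np.int16)          # float -> int16

with wave.open("/tmp/denoised.wav", "w") as f:
    f.setnchannels(1)   # mono
    f.setsampwidth(2)   # 2 bytes per sample (int16)
    f.setframerate(SR)
    f.writeframes(pcm.tobytes())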
@@ -776,7 +788,7 @@ This Space demonstrates how to speak to a chatbot, based solely on open accessib
 It relies on following models :
 Speech to Text : [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
 LLM Mistral : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model, GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface_hub](TheBloke/Mistral-7B-Instruct-v0.1-GGUF).
-LLM Zephyr : [Zephyr-7b-
+LLM Zephyr : [Zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF).
 Text to Speech : [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 
 Note:
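As the description says, the ASR step runs remotely over a gradio client rather than locally. A hypothetical call shape (the endpoint name and arguments are assumptions; inspect the Space's API for the real signature):

from gradio_client import Client

client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
print(client.view_api())  # lists the real endpoint names and parameters

# "/predict" and the single-filepath argument are assumptions for illustration.
transcript = client.predict("recording.wav", api_name="/predict")
print(transcript)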