Spaces:

amu-cai
/

amu-bigos-audio-recorder

Sleeping

App Files Files Community

mj-new committed on Jun 11, 2023

Commit

0147fc2

1 Parent(s): 0587641

working audio file saving

Browse files

Files changed (3) hide show

.gitignore +2 -0
Temp.mp3 +0 -0
app.py +159 -34

.gitignore CHANGED Viewed

	@@ -1 +1,3 @@
1	.python-version

 .python-version
+data_local
+run.sh

Temp.mp3 ADDED Viewed

Binary file (39.6 kB). View file

app.py CHANGED Viewed

@@ -2,6 +2,21 @@ import gradio as gr
 import whisper
 import numpy as np
 import openai
 def greet(name):
     return "Hello " + name + "!!"
@@ -12,23 +27,118 @@ with open('app.css','r') as f:
 markdown="""
 # Polish ASR BIGOS workspace
 """
 def whisper_model_change(radio_whisper_model):
     whisper_model = whisper.load_model(radio_whisper_model)
     return(whisper_model)
-def prompt_gpt(input_text):
     messages = [
-    {"role": "system", "content": "You are a helpful assistant."}]
     if input_text:
         messages.append(
             {"role": "user", "content": input_text},
         )
         chat_completion = openai.ChatCompletion.create(
-            model="gpt-3.5-turbo", messages=messages
         )
     reply = chat_completion.choices[0].message.content
     return reply
 def process_pipeline(audio):
@@ -58,9 +168,9 @@ def init_whisper_model(whisper_model_type):
     whisper_model = whisper.load_model(whisper_model_type)
     return whisper_model
-def synthesize_speech(text):
-    audioobj = gTTS(text = out_result,
-                    lang = lang,
                     slow = False)
     audioobj.save("Temp.mp3")
@@ -71,8 +181,11 @@ with block:
     #state variables
     language = gr.State("en")
     whisper_model_type = gr.State("base")
     whisper_model = gr.State()
     # state handling functions
     def change_language(choice):
@@ -96,36 +209,48 @@ with block:
         return [whisper_model_type, whisper_model]
     gr.Markdown(markdown)
     with gr.Tabs():
-        with gr.TabItem('Voicebot playground'):
             with gr.Box():
-                gr.HTML("<p class=\"apikey\">API Key:</p>")
-                # API key textbox (password-style)
-                api_key = gr.Textbox(label="", elem_id="pw")
-            radio_lang = gr.Radio(["Polish", "English"], label="Language", info="If none selected, English is used")
-            #radio_asr_type = gr.Radio(["Local", "Cloud"], label="Select ASR type", info="Cloud models are faster and more accurate, but costs money")
-            #radio_cloud_asr = gr.Radio(["Whisper", "Google", "Azure"], label="Select Cloud ASR provider", info="You need to provide API keys for specific service")
-            radio_whisper_model = gr.Radio(["tiny", "base", "small", "medium", "large"], label="Whisper ASR model (local)", info="Larger models are better, but slower. Default - base")
-            mic_recording = gr.Audio(source="microphone", type="filepath", label='Record your voice')
-            out_asr = gr.Textbox(placeholder="ASR output",
-                               lines=5,
-                               max_lines=10,
-                               show_label=False)
-            out_gpt = gr.Textbox(placeholder="ChatGPT output",
-                               lines=10,
-                               max_lines=25,
-                               show_label=False)
-            button_transcribe = gr.Button("Transcribe")
-            button_prompt_gpt = gr.Button("Prompt ChatGPT")
-            button_transcribe.click(transcribe, inputs=[mic_recording,language, whisper_model,whisper_model_type], outputs=out_asr)
-            button_prompt_gpt.click(prompt_gpt, inputs=out_asr, outputs=out_gpt)
-            radio_lang.change(fn=change_language, inputs=radio_lang, outputs=language)
-            radio_whisper_model.change(fn=change_whisper_model, inputs=radio_whisper_model, outputs=[whisper_model_type, whisper_model])
 block.launch()

 import whisper
 import numpy as np
 import openai
+import os
+from gtts import gTTS
+import json
+import hashlib
+import random
+import string
+import uuid
+from datetime import date,datetime
+from huggingface_hub import Repository, upload_file
+import shutil
# Write token for the Hugging Face Hub, read from the environment
# (None when the variable is not set).
HF_TOKEN_WRITE = os.environ.get("HF_TOKEN_WRITE")
# Report only whether the token is configured. The secret itself must never
# be echoed, because anything printed here may end up in captured logs.
print("HF_TOKEN_WRITE set:", HF_TOKEN_WRITE is not None)
# Calendar date captured once at import time, formatted as YYYYMMDD.
today = date.today()
today_ymd = today.strftime("%Y%m%d")
 def greet(name):
     return "Hello " + name + "!!"
 markdown="""
 # Polish ASR BIGOS workspace
 """
+# TODO move to config
+WORKING_DATASET_REPO_URL = "https://huggingface.co/datasets/goodmike31/working-db"
+REPO_NAME = "goodmike31/working-db"
+REPOSITORY_DIR = "data"
+LOCAL_DIR = "data_local"
+os.makedirs(LOCAL_DIR,exist_ok=True)
def dump_json(thing, file):
    """Serialize *thing* as JSON into the file at path *file*.

    The file is created or truncated and written as UTF-8. Non-ASCII
    characters (e.g. Polish diacritics in transcripts) are stored verbatim
    rather than as ``\\uXXXX`` escapes, so the file stays human-readable;
    any JSON parser reads both forms identically.

    Args:
        thing: A JSON-serializable object.
        file: Path of the output file.
    """
    # Plain "w" is enough: the handle is only ever written to.
    with open(file, 'w', encoding="utf8") as f:
        json.dump(thing, f, ensure_ascii=False)
def get_unique_name(length=32):
    """Return a random name made of ASCII letters and digits.

    Args:
        length: Number of characters to generate. Defaults to 32, the
            length this function has always produced.

    Returns:
        A string of *length* characters drawn from ``string.ascii_letters``
        and ``string.digits``. Uniqueness is only probabilistic, and the
        ``random`` module is not suitable for security-sensitive tokens.
    """
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choices(alphabet, k=length))
def save_recording_and_meta(project_name, recording, transcript, language):
    """Save a recording and its metadata locally, then upload both to the Hub.

    The audio file at *recording* is copied (not converted) to
    ``LOCAL_DIR/<project>/<YYYYMMDD>/audio/<uuid>.wav`` and a JSON metadata
    file is written next to it under ``meta/``. Both files are then uploaded
    to the dataset repo ``REPO_NAME`` under ``REPOSITORY_DIR`` using the same
    relative layout.

    Args:
        project_name: Sub-directory name used to group saved data.
        recording: Path of the recorded audio file to store.
        transcript: Transcript text for the recording; surrounding
            whitespace is stripped and ``None`` is treated as empty.
        language: Language name; its lower-cased form is stored as
            ``language_id``.

    Raises:
        ValueError: If *recording* is empty or ``None`` (nothing to save).
    """
    # Fail early with a clear message instead of a TypeError from shutil
    # after directories have already been created.
    if not recording:
        raise ValueError("No recording to save - record audio first.")

    # TODO save user data (name, age, gender) in the next version
    speaker_metadata = {"gender": "test", "age": "test", "accent": "test"}

    # TODO get ISO 639-1 codes
    lang_id = language.lower()
    transcript = (transcript or "").strip()

    # Derive the date from the current time rather than the import-time
    # module constant, so a long-running app does not keep filing data
    # under the day it was started.
    now = datetime.now()
    day_ymd = now.strftime("%Y%m%d")
    timestamp_str = now.strftime("%d/%m/%Y %H:%M:%S")

    save_root_dir = os.path.join(LOCAL_DIR, project_name, day_ymd)
    save_dir_audio = os.path.join(save_root_dir, "audio")
    save_dir_meta = os.path.join(save_root_dir, "meta")
    os.makedirs(save_dir_audio, exist_ok=True)
    os.makedirs(save_dir_meta, exist_ok=True)

    # Write audio to file; one UUID names both the audio and its metadata.
    uuid_name = str(uuid.uuid4())
    audio_fn = uuid_name + ".wav"
    audio_output_fp = os.path.join(save_dir_audio, audio_fn)
    print (f"Saving {recording} as {audio_output_fp}")
    shutil.copy2(recording, audio_output_fp)

    # Write the metadata file
    meta_fn = uuid_name + 'metadata.jsonl'
    json_file_path = os.path.join(save_dir_meta, meta_fn)
    metadata = {
        'id': uuid_name,
        'audio_file': audio_fn,
        'language_name': language,
        'language_id': lang_id,
        'transcript': transcript,
        'age': speaker_metadata['age'],
        'gender': speaker_metadata['gender'],
        'accent': speaker_metadata['accent'],
        "date": day_ymd,
        "timestamp": timestamp_str,
    }
    dump_json(metadata, json_file_path)

    # Upload the audio and the metadata with the hub's upload_file.
    # Repo paths are joined with "/" explicitly: os.path.join would produce
    # backslashes on Windows, which are not path separators in a repo.
    uploads = (
        ("audio", audio_output_fp, audio_fn),
        ("meta", json_file_path, meta_fn),
    )
    for subdir, local_path, file_name in uploads:
        path_in_repo = "/".join(
            (REPOSITORY_DIR, project_name, day_ymd, subdir, file_name)
        )
        upload_file(
            path_or_fileobj=local_path,
            path_in_repo=path_in_repo,
            repo_id=REPO_NAME,
            repo_type='dataset',
            token=HF_TOKEN_WRITE,
        )

    print(f"Recording {audio_fn} and meta file {meta_fn} successfully saved to repo!")
    return
 def whisper_model_change(radio_whisper_model):
     whisper_model = whisper.load_model(radio_whisper_model)
     return(whisper_model)
def prompt_gpt(input_text, api_key, temperature):
    """Send *input_text* to the ChatGPT API and return the reply text.

    Args:
        input_text: The user message. When empty or ``None`` no request is
            made and an empty string is returned.
        api_key: OpenAI API key used for the request.
        temperature: Sampling temperature passed to the chat completion.

    Returns:
        The content of the first choice of the chat completion, or ``""``
        when there was no input to send.
    """
    # Without input there is nothing to ask. Previously this path skipped
    # the request and then failed on the never-assigned completion.
    if not input_text:
        return ""

    #TODO add option to specify instruction (role, template_prompt, template_answer)
    openai.api_key = api_key
    #TODO add specific message for specific role
    system_role_message = "You are a helpful assistant"
    messages = [
        {"role": "system", "content": system_role_message},
        {"role": "user", "content": input_text},
    ]
    chat_completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=temperature,
    )
    reply = chat_completion.choices[0].message.content
    #TODO save chat completion for future reuse
    return reply
 def process_pipeline(audio):
     whisper_model = whisper.load_model(whisper_model_type)
     return whisper_model
+def synthesize_speech(text, language):
+    audioobj = gTTS(text = text,
+                    lang = language,
                     slow = False)
     audioobj.save("Temp.mp3")
     #state variables
     language = gr.State("en")
+    temperature = gr.State(0)
     whisper_model_type = gr.State("base")
     whisper_model = gr.State()
+    api_key = gr.State()
+    project_name = gr.State("voicebot") # TODO add list of projects to organize saved data
     # state handling functions
     def change_language(choice):
         return [whisper_model_type, whisper_model]
     gr.Markdown(markdown)
     with gr.Tabs():
+        with gr.Row():
+            with gr.TabItem('Voicebot playground'):
+                with gr.Accordion(label="Settings"):
+                    gr.HTML("<p class=\"apikey\">Open AI API Key:</p>")
+                    # API key textbox (password-style)
+                    api_key = gr.Textbox(label="", elem_id="pw")
+                    slider_temp = gr.Slider(minimum=0, maximum= 2, step=0.2, label="ChatGPT temperature")
+                    radio_lang = gr.Radio(["Polish", "English"], label="Language", info="If none selected, English is used")
+                    #radio_asr_type = gr.Radio(["Local", "Cloud"], label="Select ASR type", info="Cloud models are faster and more accurate, but costs money")
+                    #radio_cloud_asr = gr.Radio(["Whisper", "Google", "Azure"], label="Select Cloud ASR provider", info="You need to provide API keys for specific service")
+                    radio_whisper_model = gr.Radio(["tiny", "base", "small", "medium", "large"], label="Whisper ASR model (local)", info="Larger models are more accurate, but slower. Default - base")
             with gr.Box():
+                with gr.Row():
+                    mic_recording = gr.Audio(source="microphone", type="filepath", label='Record your voice')
+                    button_transcribe = gr.Button("Transcribe speech")
+                    button_save_audio_and_trans = gr.Button("Save recording and meta")
+                out_asr = gr.Textbox(placeholder="ASR output",
+                                lines=2,
+                                max_lines=5,
+                                show_label=False)
+                button_prompt_gpt = gr.Button("Prompt ChatGPT")
+                out_gpt = gr.Textbox(placeholder="ChatGPT output",
+                                lines=4,
+                                max_lines=10,
+                                show_label=False)
+                button_synth_speech = gr.Button("Synthesize speech")
+                synth_recording = gr.Audio()
+                # Events actions
+                button_save_audio_and_trans.click(save_recording_and_meta, inputs=[project_name, mic_recording, out_asr, language], outputs=[])
+                button_transcribe.click(transcribe, inputs=[mic_recording, language, whisper_model,whisper_model_type], outputs=out_asr)
+                button_prompt_gpt.click(prompt_gpt, inputs=[out_asr, api_key, slider_temp], outputs=out_gpt)
+                button_synth_speech.click(synthesize_speech, inputs=[out_gpt, language], outputs=synth_recording)
+                radio_lang.change(fn=change_language, inputs=radio_lang, outputs=language)
+                radio_whisper_model.change(fn=change_whisper_model, inputs=radio_whisper_model, outputs=[whisper_model_type, whisper_model])
 block.launch()