"""Gradio demo with two tabs: CTC forced alignment and Icelandic vowel quantity.

The first tab sends an uploaded recording and its transcript to
``ctcalign.langsalign`` and shows the returned text.  The second tab shows two
side-by-side plots produced by ``vowel_length.runan``.
"""

import gradio as gr

import vowel_length
import ctcalign


# Input files handed to vowel_length.setup(); paths are relative to the
# working directory the app is started from.
meta_tsv = ['data/set1.tsv', 'data/set2.tsv']
ph_key = 'data/key_all.tsv'
align_output = 'data/align_csv.pickle'

# Runs once at import time.  ``kws`` supplies the choices of the "Word"
# dropdowns; ``vck``, ``dat`` and ``csvs`` are passed through to
# vowel_length.runan() unchanged on every plot request.
dat, vck, kws, csvs = vowel_length.setup(meta_tsv, ph_key, align_output)

# Earlier direct calls, kept as usage examples:
# runan(w,'l1','w2v2',vck,dat,sources)
# runan(w,'l1','mfa',vck,dat,sources)
# NOTE(review): these examples use 'l1' / 'w2v2' / 'mfa', while the UI below
# sends "L1" / "L2" / "All" and "MFA" / "CTC" -- presumably runan() maps
# these itself; confirm against vowel_length.


# Markdown texts are kept at column 0 so that each "#" heading sits on its own
# line regardless of how the component treats leading indentation.
ALIGNMENT_INTRO_MD = """
# Forced alignment with CTC decoding

Choose a language to upload a sentence with corresponding text.
Generate word and letter time-alignments from the language's wav2vec-2.0 model, with output in MFA (Montreal Forced Aligner)-compatible format.
It is best to upload short recordings of a sentence or so; recordings over a couple minutes require excessive memory to align, and should be divided into shorter pieces.
Use only lower case letters with no punctuation.
"""

CONTACT_MD = """
Contact caitlinr@ru.is with feedback, problems, and to request changes.
"""

VOWEL_INTRO_MD = """
# Long and short Icelandic vowels

Choose a word, speaker group, and aligner type.
Available speaker groups are native speakers, second-language speakers, or all.
Aligner options are Montreal Forced Aligner (MFA) and CTC decoding with Wav2vec-2.0.
If the graph shows "Error" this means there is no data for the selected word, speaker group, and alignment type.
The general expectation is that syllables with long stressed vowels followed by short consonants have a higher vowel:consonant duration ratio, while syllables with short stressed vowels followed by long consonants have a lower vowel:consonant ratio.
However, a great many other factors affect the relative duration in any one recorded token.
See Pind 1999, 'Speech segment durations and quantity in Icelandic' (J. Acoustical Society of America, 106(2)) for a review of the acoustics of Icelandic vowel duration.
All phoneme durations are measured automatically with no human correction.
The purpose of this demo is to evaluate the role of such tools in large-scale phonetic research.
Therefore, no measurements shown in this demo should be taken as conclusive without some independent verification.
"""

SPEAKER_GROUPS = ["L1", "L2", "All"]
ALIGNERS = ["MFA", "CTC"]


def manager(word, group, aligner, side):
    """Build the vowel-quantity figure for one plot column.

    Args:
        word: Word chosen in the "Word" dropdown.
        group: Speaker group chosen in the dropdown ("L1", "L2" or "All").
        aligner: Aligner chosen in the dropdown ("MFA" or "CTC").
        side: Value of the button that triggered the update (its label,
            e.g. "Update Plot 1").  Currently only printed.

    Returns:
        Whatever ``vowel_length.runan`` returns; it is shown in a ``gr.Plot``.
    """
    fig = vowel_length.runan(word, group, aligner, vck, dat, csvs)
    #TODO add colour by plot-side
    print(side)
    return fig


def aligning(transcript, audio, language):
    """Run forced alignment for one recording and return the text to display.

    Args:
        transcript: Text typed into the transcript box.
        audio: Path of the uploaded recording.  Assumed to be ``None`` when
            nothing has been uploaded (``gr.Audio`` with ``type="filepath"``).
        language: Language chosen in the radio menu.

    Returns:
        The output of ``ctcalign.langsalign``, or a short instruction for the
        user when the audio or the transcript is missing.
    """
    # Fail with a readable message instead of forwarding missing input to the
    # aligner.
    if not audio:
        return "Please upload or record an audio file before running the alignment."
    if not transcript or not transcript.strip():
        return "Please enter a transcript before running the alignment."

    formatted_output = ctcalign.langsalign(audio, transcript, language)
    return formatted_output


def _vowel_plot_column(default_word, plot_number):
    """Add one column of word/group/aligner menus, a button and a plot.

    Must be called inside an open ``gr.Row()`` context so the column is placed
    next to its sibling.
    """
    with gr.Column():
        word_menu = gr.Dropdown(kws, label="Word", value=default_word)
        group_menu = gr.Dropdown(SPEAKER_GROUPS, label="Speaker group", value="L1")
        aligner_menu = gr.Dropdown(ALIGNERS, label="Aligner", value="CTC")
        update_btn = gr.Button(value=f"Update Plot {plot_number}")
        plot = gr.Plot()
        # The button is also passed as an input so that manager() receives its
        # label as ``side``.
        update_btn.click(
            manager,
            [word_menu, group_menu, aligner_menu, update_btn],
            plot,
        )


bl = gr.Blocks()
with bl:
    with gr.Tabs():
        with gr.TabItem("CTC alignment"):
            gr.Markdown(ALIGNMENT_INTRO_MD)
            gr.Markdown(CONTACT_MD)

            with gr.Row():
                with gr.Column():
                    transcript_boxx = gr.Textbox(
                        label="Transcript",
                        placeholder="Type or paste the transcript here. Capitalisation and punctuation, if any, will be ignored.",
                    )
                    audio_file = gr.Audio(type="filepath")
                    alangmenu = gr.Radio(
                        ["Icelandic", "Faroese", "Norwegian"],
                        value="Icelandic",
                        label="Language",
                    )
                    al_btn = gr.Button(value="Run forced alignment")

                with gr.Column():
                    output_box = gr.Textbox(label="Forced alignment output")

            al_btn.click(
                aligning,
                [transcript_boxx, audio_file, alangmenu],
                output_box,
            )

        with gr.TabItem("Vowel quantity"):
            gr.Markdown(VOWEL_INTRO_MD)

            with gr.Row():
                _vowel_plot_column("hala", 1)
                _vowel_plot_column("halla", 2)


if __name__ == "__main__":
    bl.launch()