# Install runtime dependencies at startup (on Hugging Face Spaces these would
# normally be listed in requirements.txt instead).
import os

os.system("pip install transformers")
os.system("pip install https://github.com/kpu/kenlm/archive/master.zip")
os.system("pip install pyctcdecode")
os.system("pip install gradio")
os.system("pip install librosa")
os.system("pip install torch")
import gradio as gr
import librosa
import torch
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    Wav2Vec2ProcessorWithLM,
)
# Read the Hugging Face access token from the environment (e.g. an HF_TOKEN
# Space secret) instead of hard-coding it in the source.
auth_token = os.environ.get("HF_TOKEN")

repo_name = "aiface/vietnamese_s2t"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Processor bundling the feature extractor, tokenizer, and an n-gram language
# model decoder (pyctcdecode/kenlm), plus the acoustic CTC model.
processor = Wav2Vec2ProcessorWithLM.from_pretrained(repo_name, token=auth_token)
model = Wav2Vec2ForCTC.from_pretrained(repo_name, token=auth_token).to(device)

# Also loaded standalone, although the processor above already bundles both.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(repo_name, token=auth_token)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(repo_name, token=auth_token)
def process_audio_file(file):
    # Load and resample the audio to the 16 kHz mono input expected by Wav2Vec2.
    data, sr = librosa.load(file, sr=16000)
    return data
def transcribe(file_mic, file_upload):
    warn_output = ""
    if (file_mic is not None) and (file_upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
        file = file_mic
    elif (file_mic is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"
    elif file_mic is not None:
        file = file_mic
    else:
        file = file_upload

    input_values = process_audio_file(file)
    input_dict = processor(input_values, sampling_rate=16_000, return_tensors="pt", padding=True)

    # Run the acoustic model without tracking gradients.
    with torch.no_grad():
        logits = model(input_dict.input_values.to(device)).logits

    # Beam-search decode the CTC logits with the bundled language model.
    pred_str = processor.batch_decode(logits.cpu().numpy()).text
    return warn_output + str(pred_str[0])
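
# For comparison only: a plain greedy CTC decode that skips the language model.
# `greedy_transcribe` is a hypothetical helper, not part of the original app;
# it is a minimal sketch reusing the already-loaded processor and model.
def greedy_transcribe(file):
    audio = process_audio_file(file)
    inputs = processor(audio, sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to(device)).logits
    # Pick the most likely token at each frame, then let the CTC tokenizer
    # collapse repeats and drop padding.
    pred_ids = torch.argmax(logits, dim=-1)[0]
    return processor.tokenizer.decode(pred_ids)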
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath", label="Microphone"),
        gr.Audio(sources=["upload"], type="filepath", label="Audio file"),
    ],
    outputs="text",
    title="Speech to text MMS With Language Model",
    description="A simple speech-to-text demo",
)

iface.launch(share=True)
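
# Usage sketch outside Gradio (assumption: "sample.wav" is a local speech
# recording; it is not shipped with the original Space):
# print(transcribe(None, "sample.wav"))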