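"""Sachi ASR demo Space.

Records or uploads audio, always denoises it (and optionally enhances it) with
resemble-enhance, transcribes it with the SaChi-ASR model, and appends each
example to a Hugging Face dataset.
"""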
import os
import logging
import tempfile
from datetime import datetime
import spaces
import gradio as gr
import soundfile as sf
import torch
import torchaudio
from datasets import Dataset, DatasetDict, concatenate_datasets, Audio, load_dataset, DownloadConfig
from transformers import pipeline
from huggingface_hub import HfApi, login
from resemble_enhance.enhancer.inference import denoise, enhance
# Configure logging
logging.basicConfig(
format="%(asctime)s — %(levelname)s — %(message)s",
level=logging.INFO
)
logger = logging.getLogger(__name__)
# Constants
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
logger.error("Hugging Face token not found. Please set HF_TOKEN environment variable.")
raise SystemExit
CURRENT_DATASET = "sawadogosalif/Sachi_demo_dataset"
SAMPLE_RATE = 16_000
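# Note: the transformers ASR pipeline resamples file inputs itself; SAMPLE_RATE is kept for reference.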
ASR_MODEL = "sawadogosalif/SaChi-ASR"
# Authenticate with Hugging Face
login(token=HF_TOKEN)
api = HfApi(token=HF_TOKEN)
def get_or_create_dataset(dataset_name: str) -> Dataset:
"""
Load the dataset if it exists, otherwise create a new empty one.
"""
try:
ds = load_dataset(
dataset_name,
split="train",
download_config=DownloadConfig(token=HF_TOKEN)
)
logger.info(f"Loaded dataset '{dataset_name}' with {len(ds)} examples.")
    except Exception as exc:
        logger.warning(f"Dataset '{dataset_name}' not found or failed to load ({exc}). Creating a new one.")
ds = Dataset.from_dict({
"audio": [],
"text": [],
"language": [],
"datetime": [],
})
DatasetDict({"train": ds}).push_to_hub(dataset_name, token=HF_TOKEN)
logger.info(f"Created empty dataset '{dataset_name}'.")
return ds
def save_dataset(dataset: Dataset, dataset_name: str) -> None:
"""
Push the updated dataset back to Hugging Face hub.
"""
ds_dict = DatasetDict({"train": dataset})
ds_dict.push_to_hub(dataset_name, token=HF_TOKEN)
logger.info(f"Pushed updated dataset to '{dataset_name}' ({len(dataset)} records).")
class Transcriber:
def __init__(self, asr_model: str):
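        # The pipeline task (automatic-speech-recognition) is inferred from the
        # model's metadata on the Hub.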
self.pipeline = pipeline(model=asr_model)
def transcribe(self, audio_path: str) -> str:
result = self.pipeline(audio_path)
return result.get("text", "")
# Initialize components
current_dataset = get_or_create_dataset(CURRENT_DATASET)
asr_client = Transcriber(ASR_MODEL)
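# On ZeroGPU Spaces, @spaces.GPU borrows a GPU for each call, here for up to 15 seconds.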
@spaces.GPU(duration=15)
def transcribe_and_update(audio_filepath: str, history: str, apply_enhance: bool) -> tuple:
"""
Denoise every input, optionally enhance, then transcribe and push to HF dataset.
"""
if not audio_filepath:
return "No audio detected. Please record or upload audio.", history
try:
# Load and preprocess
        audio_data, sr = torchaudio.load(audio_filepath)
# Always denoise
try:
            device = "cuda" if torch.cuda.is_available() else "cpu"
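            # Downmix to mono: resemble-enhance's denoise/enhance expect a 1-D waveform.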
audio_data = audio_data.mean(dim=0)
denoised_data, sr = denoise(audio_data, sr, device)
logger.info("Audio denoised successfully.")
except Exception as e:
logger.warning(f"Denoise failed, using raw audio: {e}")
denoised_data = audio_data
# Optionally enhance
if apply_enhance:
try:
enhanced_data, sr = enhance(denoised_data, sr, device)
final_audio = enhanced_data
logger.info("Audio enhanced successfully.")
except Exception as e:
logger.warning(f"Enhancement failed, using denoised audio: {e}")
final_audio = denoised_data
else:
final_audio = denoised_data
# Save processed audio to temp file
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpf:
            sf.write(tmpf.name, final_audio.detach().cpu().numpy(), sr)
local_path = tmpf.name
# Transcription
transcription = asr_client.transcribe(local_path)
logger.info(f"Transcription: {transcription}")
# Prepare new record
new_record = {
"audio": [local_path],
"text": [transcription],
"language": ["moore"],
"datetime": [datetime.utcnow().isoformat()]
}
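        # cast_column(Audio()) turns the file path into an Audio feature, so
        # push_to_hub uploads the audio bytes rather than a bare path string.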
new_ds = Dataset.from_dict(new_record).cast_column("audio", Audio())
# Update in-memory dataset
global current_dataset
if len(current_dataset) == 0:
current_dataset = new_ds
else:
current_dataset = concatenate_datasets([current_dataset, new_ds])
# Push to hub
save_dataset(current_dataset, CURRENT_DATASET)
# Update conversation history
history = history + f"\nUser: [audio]\nAssistant: {transcription}"
return transcription, history
except Exception as exc:
logger.error(f"Error during transcription pipeline: {exc}")
return f"Error: {exc}", history
def build_interface():
with gr.Blocks() as demo:
gr.Markdown("# 🗣️ ASR Moore Live 🧠")
gr.Markdown("Speech Recognition interface for Moore language. Records or uploads audio, always denoises, and optionally enhances before ASR.")
with gr.Row():
audio_input = gr.Audio(type="filepath", label="Record or upload audio", sources=["microphone", "upload"])
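        # Hidden session state that accumulates the transcript history across calls.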
state_box = gr.State(value="")
enhance_checkbox = gr.Checkbox(label="Apply Enhancement", value=False)
output_text = gr.Textbox(label="Transcription")
submit_btn = gr.Button("Transcribe and Save")
submit_btn.click(fn=transcribe_and_update,
inputs=[audio_input, state_box, enhance_checkbox],
outputs=[output_text, state_box])
demo.launch(debug=True)
if __name__ == "__main__":
build_interface()