nvidia
/

stt_hy_fastconformer_hybrid_large_pc

Automatic Speech Recognition

speech-recognition

hf-asr-leaderboard

Model card · Files and versions · Community

stt_hy_fastconformer_hybrid_large_pc / README.md

alcray's picture

Add stt_hy_fastconformer_hybrid_large_pc model

d53fe3b about 1 month ago

|

3.77 kB

	# License that applies to the model.
	license: cc-by-4.0

	# Datasets listed for this card.
	datasets:
	  - mozilla-foundation/common_voice_17_0
	  - google/fleurs

	# Language code(s) listed for this card.
	language:
	  - hy

	pipeline_tag: automatic-speech-recognition

	library_name: NeMo

	# Metric names listed for this card.
	metrics:
	  - WER
	  - CER

	# Free-form tags attached to the card.
	tags:
	  - speech-recognition
	  - ASR
	  - Armenian
	  - Conformer
	  - Transducer
	  - CTC
	  - NeMo
	  - hf-asr-leaderboard
	  - speech
	  - audio

	# Reported evaluation results. Each `results` entry groups one task and one
	# dataset with the metrics measured on it; nesting is required so that the
	# repeated `name`/`type` keys belong to distinct mappings.
	model-index:
	  - name: stt_hy_fastconformer_hybrid_large_pc
	    results:
	      # Common Voice 17.0, test split.
	      - task:
	          name: Automatic Speech Recognition
	          type: automatic-speech-recognition
	        dataset:
	          name: MCV17
	          type: mozilla-foundation/common_voice_17_0
	          split: test
	          args:
	            language: hy
	        metrics:
	          - name: Test WER
	            type: wer
	            value: 9.90
	      # FLEURS, test split.
	      - task:
	          name: Automatic Speech Recognition
	          type: automatic-speech-recognition
	        dataset:
	          name: FLEURS
	          type: google/fleurs
	          split: test
	          args:
	            language: hy
	        metrics:
	          - name: Test WER
	            type: wer
	            value: 12.32

	# Summary of the model: name, description, license, architecture, tokenizer.
	model-details:
	  name: NVIDIA FastConformer-Hybrid Large (hy)
	  # Literal block scalar: the indicator is a bare `|` and the text must be
	  # indented deeper than the `description` key.
	  description: |
	    This model transcribes speech in the Armenian language with capitalization and punctuation marks support. It is a "large" version of the FastConformer Transducer-CTC model with 115M parameters, trained on Transducer (default) and CTC losses.
	  license: cc-by-4.0
	  architecture: FastConformer-Hybrid
	  tokenizer:
	    type: SentencePiece
	    vocab_size: 1024

	# Expected model input.
	inputs:
	  type: audio
	  format: wav
	  properties:
	    - 16000 Hz Mono-channel Audio
	    - Pre-Processing Not Needed

	# Model output.
	outputs:
	  type: text
	  format: string
	  properties:
	    - Armenian text with punctuation and capitalization
	    - May need inverse text normalization
	    - Does not handle special characters

	# Known limitations stated for the model.
	limitations:
	  - Non-streaming model
	  - Accuracy depends on input audio characteristics
	  - Not recommended for word-for-word transcription
	  - Limited domain-specific vocabulary

	# How to load and run the model.
	usage:
	  framework: NeMo
	  pre-trained-model: nvidia/stt_hy_fastconformer_hybrid_large_pc
	  # One list item per line of the Python example, in execution order.
	  code:
	    - import nemo.collections.asr as nemo_asr
	    - asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(model_name="nvidia/stt_hy_fastconformer_hybrid_large_pc")
	    - asr_model.transcribe(['your_audio_file.wav'])

	# Training setup.
	training:
	  epochs: 200
	  dataset:
	    # Equals the sum of the per-source hours listed below.
	    total_hours: 296.19
	    sources:
	      - Mozilla Common Voice 17.0 (48h)
	      - Google Fleurs (12h)
	      - ArmenianGrqaserAudioBooks (21.96h)
	      - Proprietary Corpus 1 (69.23h)
	      - Proprietary Corpus 2 (145h)

	# Evaluation datasets and the metrics reported on them.
	evaluation:
	  datasets:
	    - Mozilla Common Voice 17.0
	    - Google Fleurs
	    - Proprietary Corpus 1
	  metrics:
	    # Each item is a single-key mapping of label to score.
	    WER:
	      - MCV Test WER: 9.90
	      - FLEURS Test WER: 12.32
	    CER: Not provided

	# Deployment targets: hardware families, runtime and operating system.
	deployment:
	  hardware:
	    - NVIDIA Ampere
	    - NVIDIA Blackwell
	    - NVIDIA Jetson
	    - NVIDIA Hopper
	    - NVIDIA Lovelace
	    - NVIDIA Pascal
	    - NVIDIA Turing
	    - NVIDIA Volta
	  runtime: NeMo 2.0.0
	  os: Linux

	# Ethical considerations, grouped into sub-sections.
	# NOTE(review): the flattened source does not show whether explainability,
	# privacy and safety are siblings of trustworthy-ai or nested inside it;
	# they are placed as siblings here - confirm against the original file.
	ethical-considerations:
	  trustworthy-ai:
	    considerations: Ensure model meets requirements for relevant industries and addresses misuse.
	  explainability:
	    application: Automatic Speech Recognition
	    performance:
	      - WER
	      - CER
	      - Real-Time Factor
	    risks:
	      - Accuracy may vary with input characteristics.
	  privacy:
	    compliance: Reviewed for privacy laws
	    personal-data: No identifiable personal data
	  safety:
	    use-cases: Not applicable for life-critical applications.
	    noise-sensitivity: Sensitive to noise and input variations.