# Licensing, training-data references, language, and task for this model.
license: cc-by-4.0

# Dataset ids; the same two ids appear as `dataset.type` under model-index.
datasets:
  - mozilla-foundation/common_voice_17_0
  - google/fleurs

# Language code of the transcribed speech (Armenian, per tags/description).
language:
  - hy

pipeline_tag: automatic-speech-recognition

# Lowercase library identifier; the display name "NeMo" is kept in `tags`.
library_name: nemo

# Metric identifiers, lowercase to agree with `type: wer` under model-index.
metrics:
  - wer
  - cer

# Free-form discovery tags covering task, language, architecture, and toolkit.
tags:
  - speech-recognition
  - ASR
  - Armenian
  - Conformer
  - Transducer
  - CTC
  - NeMo
  - hf-asr-leaderboard
  - speech
  - audio

# Benchmark results per test set; the WER values here are repeated under
# `evaluation.metrics.WER` and should be kept in sync with it.
model-index:
  - name: stt_hy_fastconformer_hybrid_large_pc
    results:
      # Mozilla Common Voice 17.0, test split.
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: MCV17
          type: mozilla-foundation/common_voice_17_0
          split: test
          args:
            language: hy
        metrics:
          - name: Test WER
            type: wer
            value: 9.90
      # Google FLEURS, test split.
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          split: test
          args:
            language: hy
        metrics:
          - name: Test WER
            type: wer
            value: 12.32

# Human-readable summary: name, description, license, architecture, tokenizer.
model-details:
  name: NVIDIA FastConformer-Hybrid Large (hy)
  # Folded scalar: the wrapped lines are joined with single spaces on load,
  # yielding one line of text followed by a single trailing newline.
  description: >
    This model transcribes speech in the Armenian language with capitalization
    and punctuation marks support. It is a "large" version of the FastConformer
    Transducer-CTC model with 115M parameters, trained on Transducer (default)
    and CTC losses.
  license: cc-by-4.0
  architecture: FastConformer-Hybrid
  tokenizer:
    type: SentencePiece
    vocab_size: 1024

# Expected input: audio supplied in WAV format.
inputs:
  type: audio
  format: wav
  properties:
    - 16000 Hz Mono-channel Audio
    - Pre-Processing Not Needed

# Produced output: a text string.
outputs:
  type: text
  format: string
  properties:
    - Armenian text with punctuation and capitalization
    - May need inverse text normalization
    - Does not handle special characters

# Known constraints to consider before adopting the model.
limitations:
  - Non-streaming model
  - Accuracy depends on input audio characteristics
  - Not recommended for word-for-word transcription
  - Limited domain-specific vocabulary

# Minimal usage snippet; each `code` entry is one line of Python, in order.
usage:
  framework: NeMo
  pre-trained-model: nvidia/stt_hy_fastconformer_hybrid_large_pc
  code:
    # Single-quoted so the embedded double quotes stay literal.
    - 'import nemo.collections.asr as nemo_asr'
    - 'asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(model_name="nvidia/stt_hy_fastconformer_hybrid_large_pc")'
    # Double-quoted because the snippet itself contains single quotes.
    - "asr_model.transcribe(['your_audio_file.wav'])"

# Training schedule and data mix.
training:
  epochs: 200
  dataset:
    # Equals the sum of the per-source hours listed under `sources`
    # (48 + 12 + 21.96 + 69.23 + 145); update both together.
    total_hours: 296.19
    sources:
      - Mozilla Common Voice 17.0 (48h)
      - Google Fleurs (12h)
      - ArmenianGrqaserAudioBooks (21.96h)
      - Proprietary Corpus 1 (69.23h)
      - Proprietary Corpus 2 (145h)

# Evaluation sets and the scores reported for them.
evaluation:
  datasets:
    - Mozilla Common Voice 17.0
    - Google Fleurs
    - Proprietary Corpus 1
  metrics:
    # Same figures as the model-index entries for the two public test sets.
    WER:
      - MCV Test WER: 9.90
      - FLEURS Test WER: 12.32
    # No CER figure is reported, although CER is listed as a metric elsewhere
    # in this file.
    CER: Not provided

# Deployment targets: GPU families, runtime version, and operating system.
deployment:
  # Order is preserved as authored; it is not alphabetical or chronological.
  hardware:
    - NVIDIA Ampere
    - NVIDIA Blackwell
    - NVIDIA Jetson
    - NVIDIA Hopper
    - NVIDIA Lovelace
    - NVIDIA Pascal
    - NVIDIA Turing
    - NVIDIA Volta
  runtime: NeMo 2.0.0
  os: Linux

# Responsible-use notes grouped by theme: trustworthiness, explainability,
# privacy, and safety.
ethical-considerations:
  trustworthy-ai:
    considerations: Ensure model meets requirements for relevant industries and addresses misuse.
  explainability:
    application: Automatic Speech Recognition
    # Measures named as relevant to assessing performance; no values are
    # given in this section.
    performance:
      - WER
      - CER
      - Real-Time Factor
    risks:
      - Accuracy may vary with input characteristics.
  privacy:
    compliance: Reviewed for privacy laws
    personal-data: No identifiable personal data
  safety:
    use-cases: Not applicable for life-critical applications.
    noise-sensitivity: Sensitive to noise and input variations.