# File size: 3,767 Bytes

# 8acc2e3
# d53fe3b

# License the model is released under.
license: cc-by-4.0
# Dataset identifiers associated with this model.
datasets:
  - mozilla-foundation/common_voice_17_0
  - google/fleurs
# Language code(s) of the model.
language:
  - hy



# Task this model performs.
pipeline_tag: automatic-speech-recognition
# Library identifier in lower case (`nemo`), the form used as the library key
# on the Hugging Face Hub; the mixed-case spelling `NeMo` may not be
# recognized as a known library.
library_name: nemo



# Metric identifiers, written in lower case to match the `type: wer` spelling
# already used under `model-index` in this file.
metrics:
  - wer
  - cer



# Free-form tags attached to the model.
tags:
  - speech-recognition
  - ASR
  - Armenian
  - Conformer
  - Transducer
  - CTC
  - NeMo
  - hf-asr-leaderboard
  - speech
  - audio



# Reported results: one entry per test set, each giving a word error rate
# (type `wer`) on the `test` split with language `hy`.
model-index:
  - name: stt_hy_fastconformer_hybrid_large_pc
    results:
      # Result on mozilla-foundation/common_voice_17_0.
      - task:
          type: automatic-speech-recognition
          name: Automatic Speech Recognition
        dataset:
          type: mozilla-foundation/common_voice_17_0
          name: MCV17
          split: test
          args:
            language: hy
        metrics:
          - type: wer
            name: Test WER
            value: 9.90
      # Result on google/fleurs.
      - task:
          type: automatic-speech-recognition
          name: Automatic Speech Recognition
        dataset:
          type: google/fleurs
          name: FLEURS
          split: test
          args:
            language: hy
        metrics:
          - type: wer
            name: Test WER
            value: 12.32


# Human-readable summary of the model.
model-details:
  name: NVIDIA FastConformer-Hybrid Large (hy)
  # Folded scalar: the wrapped lines are joined with single spaces, so the
  # value is one line of text followed by a single newline.
  description: >
    This model transcribes speech in the Armenian language with capitalization
    and punctuation marks support. It is a "large" version of the
    FastConformer Transducer-CTC model with 115M parameters, trained on
    Transducer (default) and CTC losses.
  license: cc-by-4.0
  architecture: FastConformer-Hybrid
  # Tokenizer family and vocabulary size.
  tokenizer:
    type: SentencePiece
    vocab_size: 1024


# What the model takes as input.
inputs:
  type: audio
  format: wav
  # Free-text notes about the expected input.
  properties:
    - '16000 Hz Mono-channel Audio'
    - 'Pre-Processing Not Needed'

# What the model produces.
outputs:
  type: text
  format: string
  # Free-text notes about the produced text.
  properties:
    - 'Armenian text with punctuation and capitalization'
    - 'May need inverse text normalization'
    - 'Does not handle special characters'

# Known limitations, as free-text statements.
limitations:
  - 'Non-streaming model'
  - 'Accuracy depends on input audio characteristics'
  - 'Not recommended for word-for-word transcription'
  - 'Limited domain-specific vocabulary'

# How to load and run the model.
usage:
  framework: NeMo
  pre-trained-model: nvidia/stt_hy_fastconformer_hybrid_large_pc
  # One list item per line of code. Items are quoted so the embedded brackets
  # and quote characters are unmistakably part of the string values.
  code:
    - 'import nemo.collections.asr as nemo_asr'
    - 'asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(model_name="nvidia/stt_hy_fastconformer_hybrid_large_pc")'
    - "asr_model.transcribe(['your_audio_file.wav'])"

# Training schedule and data composition.
training:
  epochs: 200
  dataset:
    # Equals the sum of the per-source hours listed under `sources`.
    total_hours: 296.19
    # Each entry is "<corpus name> (<hours>h)".
    sources:
      - Mozilla Common Voice 17.0 (48h)
      - Google Fleurs (12h)
      - ArmenianGrqaserAudioBooks (21.96h)
      - Proprietary Corpus 1 (69.23h)
      - Proprietary Corpus 2 (145h)


# Evaluation sets and the scores reported on them.
evaluation:
  datasets:
    - Mozilla Common Voice 17.0
    - Google Fleurs
    - Proprietary Corpus 1
  metrics:
    # One single-key mapping per test set.
    # NOTE(review): no WER is listed for "Proprietary Corpus 1" even though it
    # appears under `datasets` — confirm whether that is intentional.
    WER:
      - MCV Test WER: 9.90
      - FLEURS Test WER: 12.32
    CER: Not provided


# Deployment targets and software environment.
deployment:
  # NVIDIA hardware families listed for this model.
  hardware:
    - NVIDIA Ampere
    - NVIDIA Blackwell
    - NVIDIA Jetson
    - NVIDIA Hopper
    - NVIDIA Lovelace
    - NVIDIA Pascal
    - NVIDIA Turing
    - NVIDIA Volta
  # Quoted because it embeds a version number; the value is still a string.
  runtime: 'NeMo 2.0.0'
  os: Linux

# Ethics-related statements, grouped into four sub-sections.
ethical-considerations:
  trustworthy-ai:
    considerations: Ensure model meets requirements for relevant industries and addresses misuse.
  explainability:
    application: Automatic Speech Recognition
    # Measures named for judging performance.
    performance:
      - WER
      - CER
      - Real-Time Factor
    risks:
      - Accuracy may vary with input characteristics.
  privacy:
    compliance: Reviewed for privacy laws
    personal-data: No identifiable personal data
  safety:
    use-cases: Not applicable for life-critical applications.
    noise-sensitivity: Sensitive to noise and input variations.