File size: 3,767 Bytes
8acc2e3
d53fe3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# License, datasets and language declared for this model.
license: cc-by-4.0

datasets:
  - mozilla-foundation/common_voice_17_0
  - google/fleurs

# Language code; the tags list names the language as Armenian.
language:
  - hy



# Task this model is listed under.
pipeline_tag: automatic-speech-recognition

# Library identifier, written in lowercase: the Hub keys libraries by
# lowercase id, so the mixed-case spelling `NeMo` may not be matched.
library_name: nemo

# Metric identifiers, lowercase to agree with `type: wer` used in the
# model-index results of this same file.
metrics:
  - wer
  - cer



# Free-form tags attached to the model card.
tags:
  - speech-recognition
  - ASR
  - Armenian
  - Conformer
  - Transducer
  - CTC
  - NeMo
  - hf-asr-leaderboard
  - speech
  - audio



# Reported results: one entry per (task, dataset) pair, each with its WER.
model-index:
  - name: stt_hy_fastconformer_hybrid_large_pc
    results:
      # Result 1: Common Voice 17.0, test split, language hy.
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: MCV17
          type: mozilla-foundation/common_voice_17_0
          split: test
          args:
            language: hy
        metrics:
          - name: Test WER
            type: wer
            value: 9.90

      # Result 2: FLEURS, test split, language hy.
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          split: test
          args:
            language: hy
        metrics:
          - name: Test WER
            type: wer
            value: 12.32


# Descriptive summary of the model: name, description, license,
# architecture and tokenizer settings.
model-details:
  name: NVIDIA FastConformer-Hybrid Large (hy)
  # Folded scalar: the wrapped lines are joined with single spaces, so the
  # parsed value is one line of text followed by a single newline.
  description: >
    This model transcribes speech in the Armenian language with
    capitalization and punctuation marks support. It is a "large" version
    of the FastConformer Transducer-CTC model with 115M parameters, trained
    on Transducer (default) and CTC losses.

  license: cc-by-4.0
  architecture: FastConformer-Hybrid
  tokenizer:
    type: SentencePiece
    vocab_size: 1024


# What the model consumes: audio in wav format, with the listed properties.
inputs:
  type: audio
  format: wav
  properties:
    - 16000 Hz Mono-channel Audio
    - Pre-Processing Not Needed

# What the model produces: text as a string, with the listed caveats.
outputs:
  type: text
  format: string
  properties:
    - Armenian text with punctuation and capitalization
    - May need inverse text normalization
    - Does not handle special characters

# Known limitations stated for this model.
limitations:
  - Non-streaming model
  - Accuracy depends on input audio characteristics
  - Not recommended for word-for-word transcription
  - Limited domain-specific vocabulary

# How to load and run the model.
usage:
  framework: NeMo
  pre-trained-model: nvidia/stt_hy_fastconformer_hybrid_large_pc
  # Each list item is one line of Python, kept as a plain string. The items
  # are quoted because they contain quotes and brackets; the parsed strings
  # are unchanged.
  code:
    - 'import nemo.collections.asr as nemo_asr'
    - 'asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(model_name="nvidia/stt_hy_fastconformer_hybrid_large_pc")'
    - "asr_model.transcribe(['your_audio_file.wav'])"

# Training setup: epoch count and the audio corpora used.
training:
  epochs: 200
  dataset:
    # The per-source hours listed below add up to this total.
    total_hours: 296.19
    sources:
      - Mozilla Common Voice 17.0 (48h)
      - Google Fleurs (12h)
      - ArmenianGrqaserAudioBooks (21.96h)
      - Proprietary Corpus 1 (69.23h)
      - Proprietary Corpus 2 (145h)


# Evaluation summary: datasets used and the scores reported.
evaluation:
  datasets:
    - Mozilla Common Voice 17.0
    - Google Fleurs
    - Proprietary Corpus 1
  metrics:
    # Each WER item is a single-key mapping of label to score.
    WER:
      - MCV Test WER: 9.90
      - FLEURS Test WER: 12.32
    # No CER figure is given in this file.
    CER: Not provided


# Deployment targets: hardware families, runtime and operating system.
deployment:
  # Order is kept exactly as in the original list.
  hardware:
    - NVIDIA Ampere
    - NVIDIA Blackwell
    - NVIDIA Jetson
    - NVIDIA Hopper
    - NVIDIA Lovelace
    - NVIDIA Pascal
    - NVIDIA Turing
    - NVIDIA Volta
  runtime: NeMo 2.0.0
  os: Linux

# Ethical-considerations statements, grouped by topic.
ethical-considerations:
  trustworthy-ai:
    considerations: Ensure model meets requirements for relevant industries and addresses misuse.

  # Application area, performance measures and stated risks.
  explainability:
    application: Automatic Speech Recognition
    performance:
      - WER
      - CER
      - Real-Time Factor
    risks:
      - Accuracy may vary with input characteristics.

  # Privacy statements as declared by the authors.
  privacy:
    compliance: Reviewed for privacy laws
    personal-data: No identifiable personal data

  # Safety notes on intended use and noise sensitivity.
  safety:
    use-cases: Not applicable for life-critical applications.
    noise-sensitivity: Sensitive to noise and input variations.