nguyenvulebinh committed on
Commit
bd55937
·
verified ·
1 Parent(s): 4da351c

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +47 -94
README.md CHANGED
@@ -34,37 +34,46 @@ from transformers import Speech2TextTokenizer
34
  import torch
35
 
36
  if __name__ == "__main__":
37
- # Load pretrained english model
38
- model = AV2TextForConditionalGeneration.from_pretrained('nguyenvulebinh/AV-HuBERT')
39
- tokenizer = Speech2TextTokenizer.from_pretrained('nguyenvulebinh/AV-HuBERT')
40
-
41
- # cuda
 
 
 
 
 
 
42
  model = model.cuda().eval()
43
 
44
- # Load normalized input data
 
 
 
 
 
 
 
 
45
  sample = load_feature(
46
- './example/lip_movement.mp4',
47
- "./example/noisy_audio.wav"
48
  )
49
 
50
- # cuda
51
  audio_feats = sample['audio_source'].cuda()
52
  video_feats = sample['video_source'].cuda()
53
  attention_mask = torch.BoolTensor(audio_feats.size(0), audio_feats.size(-1)).fill_(False).cuda()
54
 
55
- # Generate output sequence using HF interface
56
  output = model.generate(
57
  audio_feats,
58
  attention_mask=attention_mask,
59
  video=video_feats,
 
60
  )
61
 
62
- # decode output sequence
63
  print(tokenizer.batch_decode(output, skip_special_tokens=True))
64
-
65
- # check output
66
- assert output.detach().cpu().numpy().tolist() == [[ 2, 16, 130, 516, 8, 339, 541, 808, 210, 195, 541, 79, 130, 317, 269, 4, 2]]
67
- print("Example run successfully")
68
  ```
69
 
70
  ### Data preprocessing scripts
@@ -81,111 +90,55 @@ cp raw_video.mp4 ./example/
81
  python src/dataset/video_to_audio_lips.py
82
  ```
83
 
84
- ### Pretrained model
85
 
86
  <table align="center">
87
  <tr>
88
- <th>Task</th>
89
  <th>Languages</th>
90
  <th>Huggingface</th>
91
  </tr>
 
 
 
 
92
  <tr>
93
- <td rowspan="10">AVSR</td>
94
- <th>ar</th>
95
- <th><a href="todo">TODO</a></th>
96
- </tr>
97
- <tr>
98
- <th>de</th>
99
- <th><a href="todo">TODO</a></th>
100
- </tr>
101
- <tr>
102
- <th>el</th>
103
- <th><a href="todo">TODO</a></th>
104
- </tr>
105
- <tr>
106
- <th>en</th>
107
- <th><a href="nguyenvulebinh/AV-HuBERT">English Chekpoint</a></th>
108
- </tr>
109
- <tr>
110
- <th>es</th>
111
- <th><a href="todo">TODO</a></th>
112
- </tr>
113
- <tr>
114
- <th>fr</th>
115
- <th><a href="todo">TODO</a></th>
116
- </tr>
117
- <tr>
118
- <th>it</th>
119
- <th><a href="todo">TODO</a></th>
120
  </tr>
121
  <tr>
122
- <th>pt</th>
123
- <th><a href="todo">TODO</a></th>
124
  </tr>
125
  <tr>
126
- <th>ru</th>
127
- <th><a href="todo">TODO</a></th>
128
  </tr>
129
  <tr>
130
- <th>ar,de,el,es,fr,it,pt,ru</th>
131
- <th><a href="todo">TODO</a></th>
132
  </tr>
133
  <tr>
134
- <td rowspan="13">AVST</td>
135
- <th>en-el</th>
136
- <th><a href="todo">TODO</a></th>
137
- </tr>
138
- <tr>
139
- <th>en-es</th>
140
- <th><a href="todo">TODO</a></th>
141
  </tr>
142
  <tr>
143
- <th>en-fr</th>
144
- <th><a href="todo">TODO</a></th>
145
  </tr>
146
  <tr>
147
- <th>en-it</th>
148
- <th><a href="todo">TODO</a></th>
149
  </tr>
150
  <tr>
151
- <th>en-pt</th>
152
- <th><a href="todo">TODO</a></th>
153
  </tr>
154
  <tr>
155
- <th>en-ru</th>
156
- <th><a href="todo">TODO</a></th>
157
- </tr>
158
- <tr>
159
- <th>el-en</th>
160
- <th><a href="todo">TODO</a></th>
161
- </tr>
162
- <tr>
163
- <th>es-en</th>
164
- <th><a href="todo">TODO</a></th>
165
- </tr>
166
- <tr>
167
- <th>fr-en</th>
168
- <th><a href="todo">TODO</a></th>
169
- </tr>
170
- <tr>
171
- <th>it-en</th>
172
- <th><a href="todo">TODO</a></th>
173
- </tr>
174
- <tr>
175
- <th>pt-en</th>
176
- <th><a href="todo">TODO</a></th>
177
- </tr>
178
- <tr>
179
- <th>ru-en</th>
180
- <th><a href="todo">TODO</a></th>
181
- </tr>
182
- <tr>
183
- <th>{el,es,fr,it,pt,ru}-en</th>
184
- <th><a href="todo">TODO</a></th>
185
  </tr>
186
  </table>
187
 
188
-
189
  ## Acknowledgments
190
 
191
  **AV-HuBERT**: A significant portion of the codebase in this repository has been adapted from the original AV-HuBERT implementation.
 
34
  import torch
35
 
36
  if __name__ == "__main__":
37
+ # Choose language to run example
38
+ AVAILABEL_LANGUAGES = ["ar", "de", "el", "en", "es", "fr", "it", "pt", "ru", "multilingual"]
39
+ language = "ru"
40
+ assert language in AVAILABEL_LANGUAGES, f"Language {language} is not available, please choose one of {AVAILABEL_LANGUAGES}"
41
+
42
+
43
+ # Load model and tokenizer
44
+ model_name_or_path = f"nguyenvulebinh/AV-HuBERT-MuAViC-{language}"
45
+ model = AV2TextForConditionalGeneration.from_pretrained(model_name_or_path, cache_dir='./model-bin')
46
+ tokenizer = Speech2TextTokenizer.from_pretrained(model_name_or_path, cache_dir='./model-bin')
47
+
48
  model = model.cuda().eval()
49
 
50
+ # Load example video and audio
51
+ video_example = f"./example/video_processed/{language}_lip_movement.mp4"
52
+ audio_example = f"./example/video_processed/{language}_audio.wav"
53
+ if not os.path.exists(video_example) or not os.path.exists(audio_example):
54
+ print(f"WARNING: Example video and audio for {language} is not available english will be used instead")
55
+ video_example = f"./example/video_processed/en_lip_movement.mp4"
56
+ audio_example = f"./example/video_processed/en_audio.wav"
57
+
58
+ # Load and process example
59
  sample = load_feature(
60
+ video_example,
61
+ audio_example
62
  )
63
 
 
64
  audio_feats = sample['audio_source'].cuda()
65
  video_feats = sample['video_source'].cuda()
66
  attention_mask = torch.BoolTensor(audio_feats.size(0), audio_feats.size(-1)).fill_(False).cuda()
67
 
68
+ # Generate text
69
  output = model.generate(
70
  audio_feats,
71
  attention_mask=attention_mask,
72
  video=video_feats,
73
+ max_length=1024,
74
  )
75
 
 
76
  print(tokenizer.batch_decode(output, skip_special_tokens=True))
 
 
 
 
77
  ```
78
 
79
  ### Data preprocessing scripts
 
90
  python src/dataset/video_to_audio_lips.py
91
  ```
92
 
93
+ ### Pretrained AVSR model
94
 
95
  <table align="center">
96
  <tr>
 
97
  <th>Languages</th>
98
  <th>Huggingface</th>
99
  </tr>
100
+ <tr>
101
+ <th>Arabic</th>
102
+ <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-ar">Checkpoint-AR</a></th>
103
+ </tr>
104
  <tr>
105
+ <th>German</th>
106
+ <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-de">Checkpoint-DE</a></th>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  </tr>
108
  <tr>
109
+ <th>Greek</th>
110
+ <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-el">Checkpoint-EL</a></th>
111
  </tr>
112
  <tr>
113
+ <th>English</th>
114
+ <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-en">Checkpoint-EN</a></th>
115
  </tr>
116
  <tr>
117
+ <th>Spanish</th>
118
+ <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-es">Checkpoint-ES</a></th>
119
  </tr>
120
  <tr>
121
+ <th>French</th>
122
+ <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-fr">Checkpoint-FR</a></th>
 
 
 
 
 
123
  </tr>
124
  <tr>
125
+ <th>Italian</th>
126
+ <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-it">Checkpoint-IT</a></th>
127
  </tr>
128
  <tr>
129
+ <th>Portuguese</th>
130
+ <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-pt">Checkpoint-PT</a></th>
131
  </tr>
132
  <tr>
133
+ <th>Russian</th>
134
+ <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-ru">Checkpoint-RU</a></th>
135
  </tr>
136
  <tr>
137
+ <th>Multilingual</th>
138
+ <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-multilingual">Checkpoint-ar_de_el_es_fr_it_pt_ru</a></th>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  </tr>
140
  </table>
141
 
 
142
  ## Acknowledgments
143
 
144
  **AV-HuBERT**: A significant portion of the codebase in this repository has been adapted from the original AV-HuBERT implementation.