nguyenvulebinh
/

AV-HuBERT

@@ -34,37 +34,46 @@ from transformers import Speech2TextTokenizer
 import torch
 if __name__ == "__main__":
-    # Load pretrained english model
-    model = AV2TextForConditionalGeneration.from_pretrained('nguyenvulebinh/AV-HuBERT')
-    tokenizer = Speech2TextTokenizer.from_pretrained('nguyenvulebinh/AV-HuBERT')
-    # cuda
     model = model.cuda().eval()
-    # Load normalized input data
     sample = load_feature(
-        './example/lip_movement.mp4',
-        "./example/noisy_audio.wav"
     )
-    # cuda
     audio_feats = sample['audio_source'].cuda()
     video_feats = sample['video_source'].cuda()
     attention_mask = torch.BoolTensor(audio_feats.size(0), audio_feats.size(-1)).fill_(False).cuda()
-    # Generate output sequence using HF interface
     output = model.generate(
         audio_feats,
         attention_mask=attention_mask,
         video=video_feats,
     )
-    # decode output sequence
     print(tokenizer.batch_decode(output, skip_special_tokens=True))
-    # check output
-    assert output.detach().cpu().numpy().tolist() == [[  2,  16, 130, 516,   8, 339, 541, 808, 210, 195, 541,  79, 130, 317, 269,   4,   2]]
-    print("Example run successfully")
 ```
 ### Data preprocessing scripts
@@ -81,111 +90,55 @@ cp raw_video.mp4 ./example/
 python src/dataset/video_to_audio_lips.py
 ```
-### Pretrained model
 <table align="center">
     <tr>
-        <th>Task</th>
         <th>Languages</th>
         <th>Huggingface</th>
     </tr>
     <tr>
-        <td rowspan="10">AVSR</td>
-        <th>ar</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-    <tr>
-        <th>de</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-    <tr>
-        <th>el</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-    <tr>
-        <th>en</th>
-        <th><a href="nguyenvulebinh/AV-HuBERT">English Checkpoint</a></th>
-    </tr>
-    <tr>
-        <th>es</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-    <tr>
-        <th>fr</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-    <tr>
-        <th>it</th>
-        <th><a href="todo">TODO</a></th>
     </tr>
     <tr>
-        <th>pt</th>
-        <th><a href="todo">TODO</a></th>
     </tr>
     <tr>
-        <th>ru</th>
-        <th><a href="todo">TODO</a></th>
     </tr>
     <tr>
-        <th>ar,de,el,es,fr,it,pt,ru</th>
-        <th><a href="todo">TODO</a></th>
     </tr>
     <tr>
-        <td rowspan="13">AVST</td>
-        <th>en-el</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-     <tr>
-        <th>en-es</th>
-        <th><a href="todo">TODO</a></th>
     </tr>
     <tr>
-        <th>en-fr</th>
-        <th><a href="todo">TODO</a></th>
     </tr>
     <tr>
-        <th>en-it</th>
-        <th><a href="todo">TODO</a></th>
     </tr>
     <tr>
-        <th>en-pt</th>
-        <th><a href="todo">TODO</a></th>
     </tr>
     <tr>
-        <th>en-ru</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-    <tr>
-        <th>el-en</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-    <tr>
-        <th>es-en</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-    <tr>
-        <th>fr-en</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-    <tr>
-        <th>it-en</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-    <tr>
-        <th>pt-en</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-    <tr>
-        <th>ru-en</th>
-        <th><a href="todo">TODO</a></th>
-    </tr>
-    <tr>
-        <th>{el,es,fr,it,pt,ru}-en</th>
-        <th><a href="todo">TODO</a></th>
     </tr>
 </table>
 ## Acknowledgments
 **AV-HuBERT**: A significant portion of the codebase in this repository has been adapted from the original AV-HuBERT implementation.

 import torch
 if __name__ == "__main__":
+    # Choose language to run example
+    AVAILABLE_LANGUAGES = ["ar", "de", "el", "en", "es", "fr", "it", "pt", "ru", "multilingual"]
+    language = "ru"
+    assert language in AVAILABLE_LANGUAGES, f"Language {language} is not available, please choose one of {AVAILABLE_LANGUAGES}"
+    # Load model and tokenizer
+    model_name_or_path = f"nguyenvulebinh/AV-HuBERT-MuAViC-{language}"
+    model = AV2TextForConditionalGeneration.from_pretrained(model_name_or_path, cache_dir='./model-bin')
+    tokenizer = Speech2TextTokenizer.from_pretrained(model_name_or_path, cache_dir='./model-bin')
     model = model.cuda().eval()
+    # Load example video and audio
+    video_example = f"./example/video_processed/{language}_lip_movement.mp4"
+    audio_example = f"./example/video_processed/{language}_audio.wav"
+    if not os.path.exists(video_example) or not os.path.exists(audio_example):
+        print(f"WARNING: Example video and audio for {language} are not available; English will be used instead")
+        video_example = f"./example/video_processed/en_lip_movement.mp4"
+        audio_example = f"./example/video_processed/en_audio.wav"
+    # Load and process example
     sample = load_feature(
+        video_example,
+        audio_example
     )
     audio_feats = sample['audio_source'].cuda()
     video_feats = sample['video_source'].cuda()
     attention_mask = torch.BoolTensor(audio_feats.size(0), audio_feats.size(-1)).fill_(False).cuda()
+    # Generate text
     output = model.generate(
         audio_feats,
         attention_mask=attention_mask,
         video=video_feats,
+        max_length=1024,
     )
     print(tokenizer.batch_decode(output, skip_special_tokens=True))
 ```
 ### Data preprocessing scripts
 python src/dataset/video_to_audio_lips.py
 ```
+### Pretrained AVSR model
 <table align="center">
     <tr>
         <th>Languages</th>
         <th>Huggingface</th>
     </tr>
+    <tr>
+        <th>Arabic</th>
+        <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-ar">Checkpoint-AR</a></th>
+    </tr>
     <tr>
+        <th>German</th>
+        <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-de">Checkpoint-DE</a></th>
     </tr>
     <tr>
+        <th>Greek</th>
+        <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-el">Checkpoint-EL</a></th>
     </tr>
     <tr>
+        <th>English</th>
+        <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-en">Checkpoint-EN</a></th>
     </tr>
     <tr>
+        <th>Spanish</th>
+        <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-es">Checkpoint-ES</a></th>
     </tr>
     <tr>
+        <th>French</th>
+        <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-fr">Checkpoint-FR</a></th>
     </tr>
     <tr>
+        <th>Italian</th>
+        <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-it">Checkpoint-IT</a></th>
     </tr>
     <tr>
+        <th>Portuguese</th>
+        <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-pt">Checkpoint-PT</a></th>
     </tr>
     <tr>
+        <th>Russian</th>
+        <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-ru">Checkpoint-RU</a></th>
     </tr>
     <tr>
+        <th>Multilingual</th>
+        <th><a href="https://huggingface.co/nguyenvulebinh/AV-HuBERT-MuAViC-multilingual">Checkpoint-ar_de_el_es_fr_it_pt_ru</a></th>
     </tr>
 </table>
 ## Acknowledgments
 **AV-HuBERT**: A significant portion of the codebase in this repository has been adapted from the original AV-HuBERT implementation.