Spaces:

vumichien
/

Lip_movement_reading

Runtime error

App Files Files Community

vumichien committed on Jan 17, 2023

Commit

8702c0e

1 Parent(s): 223da19

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -7

app.py CHANGED Viewed

@@ -42,6 +42,7 @@ from fairseq import checkpoint_utils, options, tasks, utils
 from fairseq.dataclass.configs import GenerationConfig
 from huggingface_hub import hf_hub_download
 import gradio as gr
 # os.chdir('/home/user/app/av_hubert/avhubert')
@@ -131,18 +132,44 @@ def predict(process_video):
 # ---- Gradio Layout -----
 video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
-video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
-text_output = gr.Textbox()
 demo = gr.Blocks()
 demo.encrypt = False
 with demo:
-    examples = gr.Examples(examples=
-                [ ["demo1.mp4", "roi1.mp4"],
-                  ["demo2.mp4", "roi2.mp4"],
-                  ["demo3.mp4", "roi3.mp4"],],
-              label="Examples", inputs=[video_in, video_out])
     with gr.Row():
         video_in.render()
         video_out.render()
     with gr.Row():

 from fairseq.dataclass.configs import GenerationConfig
 from huggingface_hub import hf_hub_download
 import gradio as gr
+from pytube import YouTube
 # os.chdir('/home/user/app/av_hubert/avhubert')
 # ---- Gradio Layout -----
+youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
 video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
+video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
 demo = gr.Blocks()
 demo.encrypt = False
+text_output = gr.Textbox()
 with demo:
+    gr.Markdown('''
+            <div>
+            <h1 style='text-align: center'>Speech Recognition from Visual Lip Movement by Audio-Visual Hidden Unit BERT Model (Avhubert)</h1>
+            This space uses Avhubert models from <a href='https://github.com/facebookresearch' target='_blank'><b>Meta Research</b></a> to recoginze the speech from Lip Movement 🤗
+            </div>
+        ''')
     with gr.Row():
+            gr.Markdown('''
+            ### Reading Lip movement with youtube link using Avhubert
+            ##### Step 1a. Download video from youtube (Note: the length of video should be less than 10 seconds if not it will be cut and the face should be stable for better result)
+            ##### Step 1b. You also can upload video directly
+            ##### Step 2. Generating landmarks surrounding mouth area
+            ##### Step 3. Reading lip movement.
+            ''')
+    with gr.Row():
+        gr.Markdown('''
+            ### You can test by following examples:
+            ''')
+    examples = gr.Examples(examples=
+            [ "https://www.youtube.com/watch?v=ZXVDnuepW2s",
+              "https://www.youtube.com/watch?v=X8_glJn1B8o",
+              "https://www.youtube.com/watch?v=80yqL2KzBVw"],
+          label="Examples", inputs=[youtube_url_in])
+    with gr.Column():
+          youtube_url_in.render()
+          download_youtube_btn = gr.Button("Download Youtube video")
+          download_youtube_btn.click(get_youtube, [youtube_url_in], [
+              video_in])
+          print(video_in)
+    with gr.Row():
         video_in.render()
         video_out.render()
     with gr.Row():