Suprath committed on
Commit
9ef675b
·
verified ·
1 Parent(s): a76083a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -8
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import sys
 
3
 
4
  os.system('git clone https://github.com/facebookresearch/av_hubert.git')
5
  os.chdir('/home/user/app/av_hubert')
@@ -16,7 +17,6 @@ os.system('pip install gradio==3.12')
16
  os.system('pip install numpy==1.23.3')
17
 
18
 
19
- # sys.path.append('/home/user/app/av_hubert')
20
  sys.path.append('/home/user/app/av_hubert/avhubert')
21
 
22
  print(sys.path)
@@ -25,7 +25,6 @@ print(sys.argv, type(sys.argv))
25
  sys.argv.append('dummy')
26
 
27
 
28
-
29
  import dlib, cv2, os
30
  import numpy as np
31
  import skvideo
@@ -44,8 +43,6 @@ from huggingface_hub import hf_hub_download
44
  import gradio as gr
45
  from pytube import YouTube
46
 
47
- # os.chdir('/home/user/app/av_hubert/avhubert')
48
-
49
  user_dir = "/home/user/app/av_hubert/avhubert"
50
  utils.import_user_module(Namespace(user_dir=user_dir))
51
  data_dir = "/home/user/app/video"
@@ -135,13 +132,26 @@ def predict(process_video):
135
  ref = decode_fn(sample['target'][0].int().cpu())
136
  hypo = hypos[0][0]['tokens'].int().cpu()
137
  hypo = decode_fn(hypo)
138
- return hypo
 
 
 
 
 
 
 
 
 
 
 
139
 
140
 
141
  # ---- Gradio Layout -----
142
  youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
143
  video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
144
  video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
 
 
145
  demo = gr.Blocks()
146
  demo.encrypt = False
147
  text_output = gr.Textbox()
@@ -150,7 +160,7 @@ with demo:
150
  gr.Markdown('''
151
  <div>
152
  <h1 style='text-align: center'>Speech Recognition from Visual Lip Movement by Audio-Visual Hidden Unit BERT Model (AV-HuBERT)</h1>
153
- This space uses AV-HuBERT models from <a href='https://github.com/facebookresearch' target='_blank'><b>Meta Research</b></a> to recoginze the speech from Lip Movement
154
  <figure>
155
  <img src="https://huggingface.co/vumichien/AV-HuBERT/resolve/main/lipreading.gif" alt="Audio-Visual Speech Recognition">
156
  <figcaption> Speech Recognition from visual lip movement
@@ -190,7 +200,7 @@ with demo:
190
  video_out])
191
  predict_btn = gr.Button("Predict")
192
  predict_btn.click(predict, [video_out], [
193
- text_output])
194
  with gr.Row():
195
  # video_lip = gr.Video(label="Audio Visual Video", mirror_webcam=False)
196
  text_output.render()
@@ -198,4 +208,3 @@ with demo:
198
 
199
 
200
  demo.launch(debug=True)
201
-
 
1
  import os
2
  import sys
3
+ import xml.etree.ElementTree as ET
4
 
5
  os.system('git clone https://github.com/facebookresearch/av_hubert.git')
6
  os.chdir('/home/user/app/av_hubert')
 
17
  os.system('pip install numpy==1.23.3')
18
 
19
 
 
20
  sys.path.append('/home/user/app/av_hubert/avhubert')
21
 
22
  print(sys.path)
 
25
  sys.argv.append('dummy')
26
 
27
 
 
28
  import dlib, cv2, os
29
  import numpy as np
30
  import skvideo
 
43
  import gradio as gr
44
  from pytube import YouTube
45
 
 
 
46
  user_dir = "/home/user/app/av_hubert/avhubert"
47
  utils.import_user_module(Namespace(user_dir=user_dir))
48
  data_dir = "/home/user/app/video"
 
132
  ref = decode_fn(sample['target'][0].int().cpu())
133
  hypo = hypos[0][0]['tokens'].int().cpu()
134
  hypo = decode_fn(hypo)
135
+
136
+ # Create XML file
137
+ root = ET.Element("transcript")
138
+ for i, word in enumerate(hypo.split()):
139
+ word_element = ET.SubElement(root, "word")
140
+ word_element.set("timecode", str(i))
141
+ word_element.text = word
142
+
143
+ xml_tree = ET.ElementTree(root)
144
+ xml_tree.write("transcript.xml")
145
+
146
+ return hypo, "transcript.xml"
147
 
148
 
149
  # ---- Gradio Layout -----
150
  youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
151
  video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
152
  video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
153
+ xml_output = gr.File(label="Download XML", download=True)
154
+
155
  demo = gr.Blocks()
156
  demo.encrypt = False
157
  text_output = gr.Textbox()
 
160
  gr.Markdown('''
161
  <div>
162
  <h1 style='text-align: center'>Speech Recognition from Visual Lip Movement by Audio-Visual Hidden Unit BERT Model (AV-HuBERT)</h1>
163
+ This space uses AV-HuBERT models from <a href='https://github.com/facebookresearch' target='_blank'><b>Meta Research</b></a> to recognize the speech from Lip Movement
164
  <figure>
165
  <img src="https://huggingface.co/vumichien/AV-HuBERT/resolve/main/lipreading.gif" alt="Audio-Visual Speech Recognition">
166
  <figcaption> Speech Recognition from visual lip movement
 
200
  video_out])
201
  predict_btn = gr.Button("Predict")
202
  predict_btn.click(predict, [video_out], [
203
+ text_output, xml_output])
204
  with gr.Row():
205
  # video_lip = gr.Video(label="Audio Visual Video", mirror_webcam=False)
206
  text_output.render()
 
208
 
209
 
210
  demo.launch(debug=True)