Spaces:
Running
Running
update gradio
Browse files
- app.py +26 -18
- requirements.txt +1 -1
app.py
CHANGED
@@ -63,7 +63,7 @@ async def speech_to_text(video_file_path):
|
|
63 |
Using https://huggingface.co/tasks/automatic-speech-recognition pipeline
|
64 |
"""
|
65 |
global total_inferences_since_reboot
|
66 |
-
if(video_file_path == None):
|
67 |
raise ValueError("Error no video input")
|
68 |
|
69 |
video_path = Path(video_file_path)
|
@@ -84,6 +84,7 @@ async def speech_to_text(video_file_path):
|
|
84 |
print(f'Transcribing from API attempt {tries}')
|
85 |
try:
|
86 |
inference_reponse = await query_api(audio_memory)
|
|
|
87 |
transcription = inference_reponse["text"].lower()
|
88 |
timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
|
89 |
for chunk in inference_reponse['chunks']]
|
@@ -92,7 +93,8 @@ async def speech_to_text(video_file_path):
|
|
92 |
print("\n\ntotal_inferences_since_reboot: ",
|
93 |
total_inferences_since_reboot, "\n\n")
|
94 |
return (transcription, transcription, timestamps)
|
95 |
-
except:
|
|
|
96 |
if 'error' in inference_reponse and 'estimated_time' in inference_reponse:
|
97 |
wait_time = inference_reponse['estimated_time']
|
98 |
print("Waiting for model to load....", wait_time)
|
@@ -134,7 +136,7 @@ async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
|
|
134 |
|
135 |
video_path = Path(video_in)
|
136 |
video_file_name = video_path.stem
|
137 |
-
if(video_in == None or text_in == None or transcription == None):
|
138 |
raise ValueError("Inputs undefined")
|
139 |
|
140 |
d = Differ()
|
@@ -150,7 +152,7 @@ async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
|
|
150 |
# grouping character timestamps so there are fewer cuts
|
151 |
idx = 0
|
152 |
grouped = {}
|
153 |
-
for(a, b) in zip(filtered, timestamps):
|
154 |
if a[0] != '-':
|
155 |
if idx in grouped:
|
156 |
grouped[idx].append(b)
|
@@ -203,7 +205,15 @@ async def query_api(audio_bytes: bytes):
|
|
203 |
}).encode("utf-8")
|
204 |
async with aiohttp.ClientSession() as session:
|
205 |
async with session.post(API_URL, headers=headers, data=payload) as response:
|
206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
|
208 |
|
209 |
def ping(name):
|
@@ -222,28 +232,26 @@ video_in = gr.Video(label="Video file")
|
|
222 |
text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
|
223 |
video_out = gr.Video(label="Video Out")
|
224 |
diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True)
|
225 |
-
examples = gr.
|
226 |
-
components=[video_in], samples=VIDEOS, type="index")
|
227 |
|
228 |
-
|
229 |
#cut_btn, #reset_btn { align-self:stretch; }
|
230 |
#\\31 3 { max-width: 540px; }
|
231 |
.output-markdown {max-width: 65ch !important;}
|
232 |
-
|
233 |
-
|
234 |
-
with demo:
|
235 |
transcription_var = gr.Variable()
|
236 |
timestamps_var = gr.Variable()
|
237 |
with gr.Row():
|
238 |
with gr.Column():
|
239 |
-
gr.Markdown(
|
240 |
# Edit Video By Editing Text
|
241 |
This project is a quick proof of concept of a simple video editor where the edits
|
242 |
are made by editing the audio transcription.
|
243 |
Using the [Huggingface Automatic Speech Recognition Pipeline](https://huggingface.co/tasks/automatic-speech-recognition)
|
244 |
with a fine tuned [Wav2Vec2 model using Connectionist Temporal Classification (CTC)](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self)
|
245 |
you can predict not only the text transcription but also the [character or word base timestamps](https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__.return_timestamps)
|
246 |
-
|
247 |
|
248 |
with gr.Row():
|
249 |
|
@@ -269,9 +277,9 @@ with demo:
|
|
269 |
text_in, transcription_var, timestamps_var])
|
270 |
|
271 |
with gr.Row():
|
272 |
-
gr.Markdown(
|
273 |
### Now edit as text
|
274 |
-
After running the video transcription, you can make cuts to the text below (only cuts, not additions!)
|
275 |
|
276 |
with gr.Row():
|
277 |
with gr.Column():
|
@@ -290,13 +298,13 @@ with demo:
|
|
290 |
video_out.render()
|
291 |
diff_out.render()
|
292 |
with gr.Row():
|
293 |
-
gr.Markdown(
|
294 |
#### Video Credits
|
295 |
|
296 |
1. [Cooking](https://vimeo.com/573792389)
|
297 |
1. [Shia LaBeouf "Just Do It"](https://www.youtube.com/watch?v=n2lTxIk_Dr0)
|
298 |
1. [Mark Zuckerberg & Yuval Noah Harari in Conversation](https://www.youtube.com/watch?v=Boj9eD0Wug8)
|
299 |
-
|
300 |
-
|
301 |
if __name__ == "__main__":
|
302 |
demo.launch(debug=True)
|
|
|
63 |
Using https://huggingface.co/tasks/automatic-speech-recognition pipeline
|
64 |
"""
|
65 |
global total_inferences_since_reboot
|
66 |
+
if (video_file_path == None):
|
67 |
raise ValueError("Error no video input")
|
68 |
|
69 |
video_path = Path(video_file_path)
|
|
|
84 |
print(f'Transcribing from API attempt {tries}')
|
85 |
try:
|
86 |
inference_reponse = await query_api(audio_memory)
|
87 |
+
print(inference_reponse)
|
88 |
transcription = inference_reponse["text"].lower()
|
89 |
timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
|
90 |
for chunk in inference_reponse['chunks']]
|
|
|
93 |
print("\n\ntotal_inferences_since_reboot: ",
|
94 |
total_inferences_since_reboot, "\n\n")
|
95 |
return (transcription, transcription, timestamps)
|
96 |
+
except Exception as e:
|
97 |
+
print(e)
|
98 |
if 'error' in inference_reponse and 'estimated_time' in inference_reponse:
|
99 |
wait_time = inference_reponse['estimated_time']
|
100 |
print("Waiting for model to load....", wait_time)
|
|
|
136 |
|
137 |
video_path = Path(video_in)
|
138 |
video_file_name = video_path.stem
|
139 |
+
if (video_in == None or text_in == None or transcription == None):
|
140 |
raise ValueError("Inputs undefined")
|
141 |
|
142 |
d = Differ()
|
|
|
152 |
# grouping character timestamps so there are fewer cuts
|
153 |
idx = 0
|
154 |
grouped = {}
|
155 |
+
for (a, b) in zip(filtered, timestamps):
|
156 |
if a[0] != '-':
|
157 |
if idx in grouped:
|
158 |
grouped[idx].append(b)
|
|
|
205 |
}).encode("utf-8")
|
206 |
async with aiohttp.ClientSession() as session:
|
207 |
async with session.post(API_URL, headers=headers, data=payload) as response:
|
208 |
+
print("API Response: ", response.status)
|
209 |
+
if response.headers['Content-Type'] == 'application/json':
|
210 |
+
return await response.json()
|
211 |
+
elif response.headers['Content-Type'] == 'application/octet-stream':
|
212 |
+
return await response.read()
|
213 |
+
elif response.headers['Content-Type'] == 'text/plain':
|
214 |
+
return await response.text()
|
215 |
+
else:
|
216 |
+
raise RuntimeError("Error Fetching API")
|
217 |
|
218 |
|
219 |
def ping(name):
|
|
|
232 |
text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
|
233 |
video_out = gr.Video(label="Video Out")
|
234 |
diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True)
|
235 |
+
examples = gr.Dataset(components=[video_in], samples=VIDEOS, type="index")
|
|
|
236 |
|
237 |
+
css = """
|
238 |
#cut_btn, #reset_btn { align-self:stretch; }
|
239 |
#\\31 3 { max-width: 540px; }
|
240 |
.output-markdown {max-width: 65ch !important;}
|
241 |
+
"""
|
242 |
+
with gr.Blocks(css=css) as demo:
|
|
|
243 |
transcription_var = gr.Variable()
|
244 |
timestamps_var = gr.Variable()
|
245 |
with gr.Row():
|
246 |
with gr.Column():
|
247 |
+
gr.Markdown("""
|
248 |
# Edit Video By Editing Text
|
249 |
This project is a quick proof of concept of a simple video editor where the edits
|
250 |
are made by editing the audio transcription.
|
251 |
Using the [Huggingface Automatic Speech Recognition Pipeline](https://huggingface.co/tasks/automatic-speech-recognition)
|
252 |
with a fine tuned [Wav2Vec2 model using Connectionist Temporal Classification (CTC)](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self)
|
253 |
you can predict not only the text transcription but also the [character or word base timestamps](https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__.return_timestamps)
|
254 |
+
""")
|
255 |
|
256 |
with gr.Row():
|
257 |
|
|
|
277 |
text_in, transcription_var, timestamps_var])
|
278 |
|
279 |
with gr.Row():
|
280 |
+
gr.Markdown("""
|
281 |
### Now edit as text
|
282 |
+
After running the video transcription, you can make cuts to the text below (only cuts, not additions!)""")
|
283 |
|
284 |
with gr.Row():
|
285 |
with gr.Column():
|
|
|
298 |
video_out.render()
|
299 |
diff_out.render()
|
300 |
with gr.Row():
|
301 |
+
gr.Markdown("""
|
302 |
#### Video Credits
|
303 |
|
304 |
1. [Cooking](https://vimeo.com/573792389)
|
305 |
1. [Shia LaBeouf "Just Do It"](https://www.youtube.com/watch?v=n2lTxIk_Dr0)
|
306 |
1. [Mark Zuckerberg & Yuval Noah Harari in Conversation](https://www.youtube.com/watch?v=Boj9eD0Wug8)
|
307 |
+
""")
|
308 |
+
demo.queue()
|
309 |
if __name__ == "__main__":
|
310 |
demo.launch(debug=True)
|
requirements.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
torch
|
2 |
transformers
|
3 |
-
gradio==3.
|
4 |
datasets
|
5 |
librosa
|
6 |
ffmpeg-python
|
|
|
1 |
torch
|
2 |
transformers
|
3 |
+
gradio==3.35.2
|
4 |
datasets
|
5 |
librosa
|
6 |
ffmpeg-python
|