Update app.py
app.py
CHANGED
@@ -1,3 +1,4 @@
+# Base Framework
 import torch
 # For data transformation
 from torchvision import transforms
@@ -5,7 +6,6 @@ from torchvision import transforms
 import transformers
 from transformers import VivitImageProcessor, VivitConfig, VivitModel
 from transformers import set_seed
-
 # For Data Loaders
 import datasets
 from torch.utils.data import Dataset, DataLoader
@@ -25,6 +25,8 @@ import numpy as np
 from torch.nn import Linear, Softmax
 import gradio as gr
 import cv2
+import io
+import tempfile
 # Mediapipe Library
 import mediapipe as mp
 from mediapipe.tasks import python
@@ -236,23 +238,38 @@ def prod_function(model_pretrained, prod_ds):
     return prod_pred

 # Function to get landmarked video
-def
+def save_video_to_mp4(video_tensor, fps=20):
+    # Convert pytorch tensor to numpy ndarray
     video_numpy = video_tensor.permute(0, 2, 3, 1).cpu().numpy()
     # Normalize values to [0, 255] if necessary
     if video_numpy.max() <= 1.0:
         video_numpy = (video_numpy * 255).astype(np.uint8)

-
-
-
-
+    ## Create a temporary file to save the video
+    #temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+    #output_path = temp_file.name
+    # Create an in-memory byte buffer to store the video
+    byte_buffer = io.BytesIO()
+
+    # Get video dimensions
+    height, width, channels = video_numpy[0].shape
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for .mp4
+
+    # Create VideoWriter object
+    #out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+    out = cv2.VideoWriter(byte_buffer, fourcc, fps, (width, height), isColor=True)

+    # Write the frames to the output file
     for frame in video_numpy:
-
+        # Convert RGB back to BGR for OpenCV
+        frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+        out.write(frame_bgr)
+
     out.release()
+    # Return the byte buffer's content (the video as bytes)
+    byte_buffer.seek(0)
+    return byte_buffer.read()  #output_path

-    return output_path
-
 # Function to list available videos dynamically
 def list_videos():
     if os.path.exists(data_path):
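Note on the in-memory approach in this hunk: OpenCV's cv2.VideoWriter takes a filename string as its first argument and writes to that path, so handing it an io.BytesIO buffer is unlikely to yield playable video bytes. The commented-out tempfile lines in the same hunk point at the file-based alternative; below is a minimal sketch of that variant, assuming the same (frames, channels, height, width) RGB tensor layout used above (the function name here is illustrative, not from the repo).

import tempfile
import cv2
import numpy as np

def save_video_to_mp4_file(video_tensor, fps=20):
    # (T, C, H, W) tensor -> (T, H, W, C) uint8 frames
    video_numpy = video_tensor.permute(0, 2, 3, 1).cpu().numpy()
    if video_numpy.max() <= 1.0:
        video_numpy = (video_numpy * 255).astype(np.uint8)

    # VideoWriter needs a real path, so write to a temporary .mp4 file
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
    output_path = temp_file.name
    temp_file.close()

    height, width, _ = video_numpy[0].shape
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    for frame in video_numpy:
        # OpenCV expects BGR channel order
        out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    out.release()

    return output_path

Returning a path also matches how gr.Video is usually fed its output; if raw bytes are needed instead, the temporary file can be read back after out.release().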
@@ -262,12 +279,13 @@ def list_videos():
 # Function to return the selected video path
 def play_video(selected_video):
     return os.path.join(data_path, selected_video) if selected_video else None
+# Get Landmarked video

 # Main Function for tab - Gesture recognition
 def translate_sign_language(gesture):
     # Create Dataset
     prod_ds = dataset_prod_obj.create_dataset(gesture)
-    prod_video_path =
+    prod_video_path = save_video_to_mp4(prod_ds)
     #prod_video = np.random.randint(0, 255, (32, 225, 225, 3), dtype=np.uint8)

     # Run ML Model
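One thing to double-check in this hunk: save_video_to_mp4 expects a (frames, channels, height, width) video tensor, while prod_ds is whatever dataset_prod_obj.create_dataset returns. If that is a dataset of dicts rather than a raw tensor, the frame tensor may need to be pulled out first. A hypothetical sketch (the "pixel_values" key is an assumption, not taken from this repo):

# Hypothetical: extract the frame tensor from the first dataset item
sample = prod_ds[0]
video_tensor = sample["pixel_values"]  # assumed key; depends on create_dataset
prod_video_path = save_video_to_mp4(video_tensor)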
@@ -291,6 +309,7 @@ with gr.Blocks() as demo:

     # Gesture recognition Tab
     with gr.Tab("Gesture recognition"):
+        landmarked_video = gr.State([])
         with gr.Row(height=350, variant="panel"): # equal_height=False, show_progress=True
             with gr.Column(scale=1, variant="panel"):
                 # Add webcam input for sign language video capture
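For context on the gr.State([]) added here: a State component holds a per-session value that persists between events, and it is updated by listing it among a handler's outputs. A minimal sketch of that usual pattern, with hypothetical handler and component names (not taken from this app):

import gradio as gr

with gr.Blocks() as demo:
    landmarked_video = gr.State([])          # per-session storage, starts empty
    video_input = gr.Video(label="Gesture")
    remember_btn = gr.Button("Remember")

    def remember_landmarks(new_clip, stored):
        # The value returned for the State slot replaces the stored list
        return stored + [new_clip]

    remember_btn.click(remember_landmarks,
                       inputs=[video_input, landmarked_video],
                       outputs=[landmarked_video])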
@@ -298,6 +317,7 @@ with gr.Blocks() as demo:
             with gr.Column(scale=1, variant="panel"):
                 # Display the landmarked video
                 video_output = gr.Video(interactive=False, autoplay=True, streaming=False, label="Landmarked Gesture")
+
         with gr.Row(variant="panel"): # equal_height=False, show_progress=True
             with gr.Column(scale=1, variant="panel"):
                 # Submit the Video
@@ -307,6 +327,7 @@ with gr.Blocks() as demo:
                 text_output = gr.Textbox(label="Translation in English")
         # Set up the interface
         video_button.click(translate_sign_language, inputs=video_input, outputs=[text_output, video_output])
+        landmarked_video.change(translate_sign_language, inputs=landmarked_video, outputs=[text_output, video_output])

     # Indian Sign Language gesture reference tab
     with gr.Tab("Indian Sign Language gesture reference"):
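Both event hooks above route through translate_sign_language with outputs=[text_output, video_output]. In Gradio, a handler wired to two output components is expected to return a 2-tuple that is mapped onto them in order, so the function should end with something like the sketch below (local names are illustrative only):

# Illustrative only: a handler wired to outputs=[text_output, video_output]
# returns a 2-tuple, mapped onto the components in order.
def translate_sign_language(gesture):
    translated_text = "..."      # whatever the model predicts
    landmarked_path = "out.mp4"  # whatever save_video_to_mp4 produces
    return translated_text, landmarked_path

Also worth noting, hedged to the hunks shown here: landmarked_video starts as an empty gr.State and no handler in this diff writes to it, so the .change listener may not fire unless other code updates that State.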