Update app.py
app.py
CHANGED
@@ -1,3 +1,4 @@
+# Base Framework
 import torch
 # For data transformation
 from torchvision import transforms
@@ -5,7 +6,6 @@ from torchvision import transforms
 import transformers
 from transformers import VivitImageProcessor, VivitConfig, VivitModel
 from transformers import set_seed
-
 # For Data Loaders
 import datasets
 from torch.utils.data import Dataset, DataLoader
@@ -25,6 +25,8 @@ import numpy as np
 from torch.nn import Linear, Softmax
 import gradio as gr
 import cv2
+import io
+import tempfile
 # Mediapipe Library
 import mediapipe as mp
 from mediapipe.tasks import python
@@ -236,23 +238,38 @@ def prod_function(model_pretrained, prod_ds):
     return prod_pred

 # Function to get landmarked video
-def
+def save_video_to_mp4(video_tensor, fps=20):
+    # Convert pytorch tensor to numpy ndarray
     video_numpy = video_tensor.permute(0, 2, 3, 1).cpu().numpy()
     # Normalize values to [0, 255] if necessary
     if video_numpy.max() <= 1.0:
         video_numpy = (video_numpy * 255).astype(np.uint8)

-
-
-
-
+    ## Create a temporary file to save the video
+    #temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+    #output_path = temp_file.name
+    # Create an in-memory byte buffer to store the video
+    byte_buffer = io.BytesIO()
+
+    # Get video dimensions
+    height, width, channels = video_numpy[0].shape
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for .mp4
+
+    # Create VideoWriter object
+    #out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+    out = cv2.VideoWriter(byte_buffer, fourcc, fps, (width, height), isColor=True)

+    # Write the frames to the output file
     for frame in video_numpy:
-
+        # Convert RGB back to BGR for OpenCV
+        frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+        out.write(frame_bgr)
+
     out.release()
+    # Return the byte buffer's content (the video as bytes)
+    byte_buffer.seek(0)
+    return byte_buffer.read()  #output_path

-    return output_path
-
 # Function to list available videos dynamically
 def list_videos():
     if os.path.exists(data_path):
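Note on the in-memory approach in this hunk: OpenCV's cv2.VideoWriter takes a filename string as its first argument and writes to that path, so handing it an io.BytesIO buffer is unlikely to yield playable video bytes. The commented-out tempfile lines in the same hunk point at the file-based alternative; below is a minimal sketch of that variant, assuming the same (frames, channels, height, width) RGB tensor layout used above (the function name here is illustrative, not from the repo).

import tempfile
import cv2
import numpy as np

def save_video_to_mp4_file(video_tensor, fps=20):
    # (T, C, H, W) tensor -> (T, H, W, C) uint8 frames
    video_numpy = video_tensor.permute(0, 2, 3, 1).cpu().numpy()
    if video_numpy.max() <= 1.0:
        video_numpy = (video_numpy * 255).astype(np.uint8)

    # VideoWriter needs a real path, so write to a temporary .mp4 file
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
    output_path = temp_file.name
    temp_file.close()

    height, width, _ = video_numpy[0].shape
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    for frame in video_numpy:
        # OpenCV expects BGR channel order
        out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    out.release()

    return output_path

Returning a path also matches how gr.Video is usually fed its output; if raw bytes are needed instead, the temporary file can be read back after out.release().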
@@ -262,12 +279,13 @@ def list_videos():
 # Function to return the selected video path
 def play_video(selected_video):
     return os.path.join(data_path, selected_video) if selected_video else None
+# Get Landmarked video

 # Main Function for tab - Gesture recognition
 def translate_sign_language(gesture):
     # Create Dataset
     prod_ds = dataset_prod_obj.create_dataset(gesture)
-    prod_video_path =
+    prod_video_path = save_video_to_mp4(prod_ds)
     #prod_video = np.random.randint(0, 255, (32, 225, 225, 3), dtype=np.uint8)

     # Run ML Model
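One thing to double-check in this hunk: save_video_to_mp4 expects a (frames, channels, height, width) video tensor, while prod_ds is whatever dataset_prod_obj.create_dataset returns. If that is a dataset of dicts rather than a raw tensor, the frame tensor may need to be pulled out first. A hypothetical sketch (the "pixel_values" key is an assumption, not taken from this repo):

# Hypothetical: extract the frame tensor from the first dataset item
sample = prod_ds[0]
video_tensor = sample["pixel_values"]  # assumed key; depends on create_dataset
prod_video_path = save_video_to_mp4(video_tensor)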
@@ -291,6 +309,7 @@ with gr.Blocks() as demo:

     # Gesture recognition Tab
     with gr.Tab("Gesture recognition"):
+        landmarked_video = gr.State([])
         with gr.Row(height=350, variant="panel"): # equal_height=False, show_progress=True
             with gr.Column(scale=1, variant="panel"):
                 # Add webcam input for sign language video capture
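For context on the gr.State([]) added here: a State component holds a per-session value that persists between events, and it is updated by listing it among a handler's outputs. A minimal sketch of that usual pattern, with hypothetical handler and component names (not taken from this app):

import gradio as gr

with gr.Blocks() as demo:
    landmarked_video = gr.State([])          # per-session storage, starts empty
    video_input = gr.Video(label="Gesture")
    remember_btn = gr.Button("Remember")

    def remember_landmarks(new_clip, stored):
        # The value returned for the State slot replaces the stored list
        return stored + [new_clip]

    remember_btn.click(remember_landmarks,
                       inputs=[video_input, landmarked_video],
                       outputs=[landmarked_video])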
@@ -298,6 +317,7 @@ with gr.Blocks() as demo:
             with gr.Column(scale=1, variant="panel"):
                 # Display the landmarked video
                 video_output = gr.Video(interactive=False, autoplay=True, streaming=False, label="Landmarked Gesture")
+
         with gr.Row(variant="panel"): # equal_height=False, show_progress=True
             with gr.Column(scale=1, variant="panel"):
                 # Submit the Video
@@ -307,6 +327,7 @@ with gr.Blocks() as demo:
                 text_output = gr.Textbox(label="Translation in English")
         # Set up the interface
         video_button.click(translate_sign_language, inputs=video_input, outputs=[text_output, video_output])
+        landmarked_video.change(translate_sign_language, inputs=landmarked_video, outputs=[text_output, video_output])

     # Indian Sign Language gesture reference tab
     with gr.Tab("Indian Sign Language gesture reference"):
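Both event hooks above route through translate_sign_language with outputs=[text_output, video_output]. In Gradio, a handler wired to two output components is expected to return a 2-tuple that is mapped onto them in order, so the function should end with something like the sketch below (local names are illustrative only):

# Illustrative only: a handler wired to outputs=[text_output, video_output]
# returns a 2-tuple, mapped onto the components in order.
def translate_sign_language(gesture):
    translated_text = "..."      # whatever the model predicts
    landmarked_path = "out.mp4"  # whatever save_video_to_mp4 produces
    return translated_text, landmarked_path

Also worth noting, hedged to the hunks shown here: landmarked_video starts as an empty gr.State and no handler in this diff writes to it, so the .change listener may not fire unless other code updates that State.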