Spaces:

ManishThota
/

Super-Rapid-Annotator

Running on Zero

App Files Files Community

ManishThota committed on Jul 30

Commit

9178374

•

1 Parent(s): df2ba9f

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -10

app.py CHANGED Viewed

@@ -10,13 +10,15 @@ import json
 import csv
 import io
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.float16
 )
-model_name = 'llava-hf/LLaVA-NeXT-Video-7B-DPO-hf'
 processor = LlavaNextVideoProcessor.from_pretrained(model_name)
 model = LlavaNextVideoForConditionalGeneration.from_pretrained(
     model_name,
@@ -25,7 +27,6 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained(
 )
-@spaces.GPU
 def read_video_pyav(container, indices):
     '''
     Decode the video with PyAV decoder.
@@ -63,18 +64,23 @@ def process_video(video_file, question):
     prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
     input = processor([prompt], videos=[video_clip], padding=True, return_tensors="pt").to(model.device)
     generate_kwargs = {"max_new_tokens": 100, "do_sample": True, "top_p": 0.9}
-    output = model.generate(**input, **generate_kwargs)
     generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
     return generated_text.split("ASSISTANT: ", 1)[-1].strip()
 @spaces.GPU
 def analyze_videos(video_files, selected_questions):
     """Analyzes videos, saves results to CSV, and returns CSV data and JSON."""
     all_results = {}
     questions = {
-        "hands_free": "Examine the subject’s right and left hands in the video to check if they are holding anything like a microphone, book, paper(White color), object, or any electronic device, try segmentations and decide if the hands are free or not.",
-        "standing/sitting": "Evaluate the subject’s body posture and movement within the video. Are they standing upright with both feet planted firmly on the ground? If so, they are standing. If they seem to be seated, they are seated.",
         "interaction_with_background": "Assess the surroundings behind the subject in the video. Do they seem to interact with any visible screens, such as laptops, TVs, or digital billboards? If yes, then they are interacting with a screen. If not, they are not interacting with a screen.",
-        "indoors/outdoors": "Consider the broader environmental context shown in the video’s background. Are there signs of an open-air space, like greenery, structures, or people passing by? If so, it’s an outdoor setting. If the setting looks confined with furniture, walls, or home decorations, it’s an indoor environment."
     }
     for video_file in video_files:
@@ -84,7 +90,7 @@ def analyze_videos(video_files, selected_questions):
             answer = process_video(video_file, questions[question_key])
             all_results[video_name][question_key] = "true" if "yes" in answer.lower() else "false"
-        del answer
         gc.collect()
         torch.cuda.empty_cache()
@@ -102,7 +108,6 @@ def analyze_videos(video_files, selected_questions):
     json_output = json.dumps(all_results, indent=4)
     return json_output, csv_content
 def download_csv(csv_content):
     """Creates a downloadable CSV file."""
     return gr.File.update(
@@ -114,10 +119,10 @@ def download_csv(csv_content):
 with gr.Blocks() as iface:
     with gr.Row():
         file_input = gr.File(label="Upload Videos", file_count="multiple")
-        question_input = gr.CheckboxGroup(["hands_free", "standing/sitting", "interaction_with_background", "indoors/outdoors"],
                                         label="Select Questions to Apply")
-    process_button = gr.Button("Process Videos")  # Process button below checkboxes
     with gr.Row():
         json_output = gr.JSON(label="Analysis Results (JSON)")

 import csv
 import io
+# Model Configuration
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.float16
 )
+model_name = 'llava-hf/LLaVA-NeXT-Video-7B-DPO-hf'
+# Load Model and Processor
 processor = LlavaNextVideoProcessor.from_pretrained(model_name)
 model = LlavaNextVideoForConditionalGeneration.from_pretrained(
     model_name,
 )
 def read_video_pyav(container, indices):
     '''
     Decode the video with PyAV decoder.
     prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
     input = processor([prompt], videos=[video_clip], padding=True, return_tensors="pt").to(model.device)
     generate_kwargs = {"max_new_tokens": 100, "do_sample": True, "top_p": 0.9}
+    # Disable gradient calculation during inference
+    with torch.no_grad():
+        output = model.generate(**input, **generate_kwargs)
     generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
     return generated_text.split("ASSISTANT: ", 1)[-1].strip()
 @spaces.GPU
 def analyze_videos(video_files, selected_questions):
     """Analyzes videos, saves results to CSV, and returns CSV data and JSON."""
     all_results = {}
     questions = {
+        "hands_free": "Is the subject's hand in the video free or not?",
+        "standing": "Is the subject in the video sitting or standing?",
         "interaction_with_background": "Assess the surroundings behind the subject in the video. Do they seem to interact with any visible screens, such as laptops, TVs, or digital billboards? If yes, then they are interacting with a screen. If not, they are not interacting with a screen.",
+        "indoors": "Consider the broader environmental context shown in the video’s background. Are there signs of an open-air space, like greenery, structures, or people passing by? If so, it’s an outdoor setting. If the setting looks confined with furniture, walls, or home decorations, it’s an indoor environment."
     }
     for video_file in video_files:
             answer = process_video(video_file, questions[question_key])
             all_results[video_name][question_key] = "true" if "yes" in answer.lower() else "false"
+        # Clear cache and collect garbage after each video
         gc.collect()
         torch.cuda.empty_cache()
     json_output = json.dumps(all_results, indent=4)
     return json_output, csv_content
 def download_csv(csv_content):
     """Creates a downloadable CSV file."""
     return gr.File.update(
 with gr.Blocks() as iface:
     with gr.Row():
         file_input = gr.File(label="Upload Videos", file_count="multiple")
+        question_input = gr.CheckboxGroup(["hands_free", "standing", "interaction_with_background", "indoors"],
                                         label="Select Questions to Apply")
+    process_button = gr.Button("Process Videos")
     with gr.Row():
         json_output = gr.JSON(label="Analysis Results (JSON)")