ManishThota committed on
Commit
b9601bc
·
verified ·
1 Parent(s): 8cbe409

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -8
app.py CHANGED
@@ -66,7 +66,7 @@ def process_video(video_file, question_parts):
66
  input = processor([prompt], videos=[video_clip], padding=True, return_tensors="pt").to(model.device)
67
 
68
  # Generate output
69
- generate_kwargs = {"max_new_tokens": 500, "do_sample": False, "top_p": 0.9}
70
  output = model.generate(**input, **generate_kwargs)
71
  generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
72
 
@@ -83,20 +83,21 @@ def process_videos(video_files, question):
83
 
84
  # Define Gradio interface for multiple videos
85
  def gradio_interface(videos, indoors_outdoors, standing_sitting, hands_free, interacting_screen):
86
-
87
- question = "Is the subject in the video"
88
  if indoors_outdoors:
89
- question += "present indoors or outdoors? "
90
  if standing_sitting:
91
- question += "standing or sitting? "
92
  if hands_free:
93
- question += "hands free or not? "
94
  if interacting_screen:
95
- question += "interacting with any screen in the background?"
96
-
 
97
  answers = process_videos(videos, question)
98
  return answers
99
 
 
100
  iface = gr.Interface(
101
  fn=gradio_interface,
102
  inputs=[
@@ -111,5 +112,6 @@ iface = gr.Interface(
111
  description="Upload multiple videos and select questions to get answers."
112
  )
113
 
 
114
  if __name__ == "__main__":
115
  iface.launch(debug=True)
 
66
  input = processor([prompt], videos=[video_clip], padding=True, return_tensors="pt").to(model.device)
67
 
68
  # Generate output
69
+ generate_kwargs = {"max_new_tokens": 3000, "do_sample": False, "top_p": 0.9}
70
  output = model.generate(**input, **generate_kwargs)
71
  generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
72
 
 
83
 
84
  # Define Gradio interface for multiple videos
85
  def gradio_interface(videos, indoors_outdoors, standing_sitting, hands_free, interacting_screen):
86
+ question = "For each question, analyze the given video carefully and base your answers on the observations made."
 
87
  if indoors_outdoors:
88
+ question += "Consider the broader environmental context shown in the video’s background. Are there signs of an open-air space, like greenery, structures, or people passing by? If so, it’s an outdoor setting. If the setting looks confined with furniture, walls, or home decorations, it’s an indoor environment."
89
  if standing_sitting:
90
+ question += "Evaluate the subject’s body posture and movement within the video. Are they standing upright with both feet planted firmly on the ground? If so, they are standing."
91
  if hands_free:
92
+ question += "Examine the subject’s right and left hands in the video to check if they are holding anything like a microphone, book, paper(White color), object, or any electronic device, try segmentations and decide if the hands are free or not."
93
  if interacting_screen:
94
+ question += "Assess the surroundings behind the subject in the video. Do they seem to interact with any visible screens, such as laptops, TVs, or digital billboards? If yes, then they are interacting with a screen. If not, they are not interacting with a screen."
95
+ question_prefix = "By taking these factors into account when watching the video, please answer the questions accurately."
96
+ question = question + question_prefix
97
  answers = process_videos(videos, question)
98
  return answers
99
 
100
+
101
  iface = gr.Interface(
102
  fn=gradio_interface,
103
  inputs=[
 
112
  description="Upload multiple videos and select questions to get answers."
113
  )
114
 
115
+
116
  if __name__ == "__main__":
117
  iface.launch(debug=True)