RaushanTurganbay HF staff committed on
Commit
04d3cd3
·
verified ·
1 Parent(s): 3614a58

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -10
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from transformers import LlavaProcessor, LlavaForConditionalGeneration, TextIteratorStreamer
3
  from threading import Thread
4
  import re
5
  import time
@@ -8,17 +8,13 @@ import torch
8
  import cv2
9
  import spaces
10
 
11
- model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
12
 
13
  processor = LlavaProcessor.from_pretrained(model_id)
14
 
15
  model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
16
  model.to("cuda")
17
 
18
-
19
- def replace_video_with_images(text, frames):
20
- return text.replace("<video>", "<image>" * frames)
21
-
22
  def sample_frames(video_file, num_frames):
23
  video = cv2.VideoCapture(video_file)
24
  total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
@@ -63,8 +59,7 @@ def bot_streaming(message, history):
63
  if image[0].endswith(video_extensions):
64
 
65
  image = sample_frames(image[0], 12)
66
- image_tokens = "<image>" * 13
67
- prompt = f"<|im_start|>user {image_tokens}\n{message.text}<|im_end|><|im_start|>assistant"
68
  elif image[0].endswith(image_extensions):
69
  image = Image.open(image[0]).convert("RGB")
70
  prompt = f"<|im_start|>user <image>\n{message.text}<|im_end|><|im_start|>assistant"
@@ -109,7 +104,7 @@ def bot_streaming(message, history):
109
  yield generated_text_without_prompt
110
 
111
 
112
- demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA Interleave", examples=[
113
  {"text": "The input contains two videos, are the cats in this video and this video doing the same thing?", "files":["./cats_1.mp4", "./cats_2.mp4"]},
114
  {"text": "There are two images in the input. What is the relationship between this image and this image?", "files":["./bee.jpg", "./depth-bee.png"]},
115
  {"text": "What are these cats doing?", "files":["./cats.mp4"]},
@@ -117,6 +112,6 @@ demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA Interleave", examples=[
117
  {"text": "What is on the flower?", "files":["./bee.jpg"]},
118
  {"text": "How to make this pastry?", "files":["./baklava.png"]}],
119
  textbox=gr.MultimodalTextbox(file_count="multiple"),
120
- description="Try [LLaVA Interleave](https://huggingface.co/docs/transformers/main/en/model_doc/llava) in this demo (more specifically, the [Qwen-1.5-0.5B variant](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf)). Upload an image or a video, and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error. ",
121
  stop_btn="Stop Generation", multimodal=True)
122
  demo.launch(debug=True)
 
1
  import gradio as gr
2
+ from transformers import LlavaOnevisionProcessor, LlavaOnevisionForConditionalGeneration, TextIteratorStreamer
3
  from threading import Thread
4
  import re
5
  import time
 
8
  import cv2
9
  import spaces
10
 
11
+ model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
12
 
13
  processor = LlavaProcessor.from_pretrained(model_id)
14
 
15
  model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
16
  model.to("cuda")
17
 
 
 
 
 
18
  def sample_frames(video_file, num_frames):
19
  video = cv2.VideoCapture(video_file)
20
  total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
 
59
  if image[0].endswith(video_extensions):
60
 
61
  image = sample_frames(image[0], 12)
62
+ prompt = f"<|im_start|>user <video>\n{message.text}<|im_end|><|im_start|>assistant"
 
63
  elif image[0].endswith(image_extensions):
64
  image = Image.open(image[0]).convert("RGB")
65
  prompt = f"<|im_start|>user <image>\n{message.text}<|im_end|><|im_start|>assistant"
 
104
  yield generated_text_without_prompt
105
 
106
 
107
+ demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA Onevision", examples=[
108
  {"text": "The input contains two videos, are the cats in this video and this video doing the same thing?", "files":["./cats_1.mp4", "./cats_2.mp4"]},
109
  {"text": "There are two images in the input. What is the relationship between this image and this image?", "files":["./bee.jpg", "./depth-bee.png"]},
110
  {"text": "What are these cats doing?", "files":["./cats.mp4"]},
 
112
  {"text": "What is on the flower?", "files":["./bee.jpg"]},
113
  {"text": "How to make this pastry?", "files":["./baklava.png"]}],
114
  textbox=gr.MultimodalTextbox(file_count="multiple"),
115
+ description="Try [LLaVA Onevision](https://huggingface.co/docs/transformers/main/en/model_doc/llava_onevision) in this demo (more specifically, the [Qwen-2-0.5B-Instruct variant](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)). Upload an image or a video, and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error. ",
116
  stop_btn="Stop Generation", multimodal=True)
117
  demo.launch(debug=True)