This PR upgrades the space to Llama 3.2V 11B Cot

#30
Files changed (3) hide show
  1. README.md +4 -4
  2. app.py +81 -89
  3. requirements.txt +1 -3
README.md CHANGED
@@ -1,14 +1,14 @@
1
  ---
2
- title: Llava Llama-3 8B
3
  emoji: 🔥
4
  colorFrom: yellow
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.42.0
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
11
- short_description: Meta Llama3 8b with Llava Multimodal capabilities
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Llama 3.2V 11B Cot
3
  emoji: 🔥
4
  colorFrom: yellow
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.0.1
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
11
+ short_description: Meta Llama 3.2V 11B Cot Multimodal
12
  ---
13
 
14
+ An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py CHANGED
@@ -1,105 +1,97 @@
1
- import time
 
 
 
2
  from threading import Thread
3
-
4
  import gradio as gr
5
- import torch
6
- from PIL import Image
7
- from transformers import AutoProcessor, LlavaForConditionalGeneration
8
- from transformers import TextIteratorStreamer
9
-
10
  import spaces
11
-
12
-
13
- PLACEHOLDER = """
14
- <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
15
- <img src="https://cdn-uploads.huggingface.co/production/uploads/64ccdc322e592905f922a06e/DDIW0kbWmdOQWwy4XMhwX.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
16
- <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">LLaVA-Llama-3-8B</h1>
17
- <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Llava-Llama-3-8b is a LLaVA model fine-tuned from Meta-Llama-3-8B-Instruct and CLIP-ViT-Large-patch14-336 with ShareGPT4V-PT and InternVL-SFT by XTuner</p>
18
- </div>
19
- """
20
-
21
-
22
- model_id = "xtuner/llava-llama-3-8b-v1_1-transformers"
23
-
24
- processor = AutoProcessor.from_pretrained(model_id)
25
-
26
- model = LlavaForConditionalGeneration.from_pretrained(
27
- model_id,
28
- torch_dtype=torch.float16,
29
- low_cpu_mem_usage=True,
30
- )
31
-
32
- model.to("cuda:0")
33
- model.generation_config.eos_token_id = 128009
34
 
35
 
36
  @spaces.GPU
37
- def bot_streaming(message, history):
38
- print(message)
39
- if message["files"]:
40
- # message["files"][-1] is a Dict or just a string
41
- if type(message["files"][-1]) == dict:
42
- image = message["files"][-1]["path"]
43
- else:
44
- image = message["files"][-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  else:
46
- # if there's no image uploaded for this turn, look for images in the past turns
47
- # kept inside tuples, take the last one
48
- for hist in history:
49
- if type(hist[0]) == tuple:
50
- image = hist[0][0]
51
- try:
52
- if image is None:
53
- # Handle the case where image is None
54
- gr.Error("You need to upload an image for LLaVA to work.")
55
- except NameError:
56
- # Handle the case where 'image' is not defined at all
57
- gr.Error("You need to upload an image for LLaVA to work.")
58
 
59
- prompt = f"<|start_header_id|>user<|end_header_id|>\n\n<image>\n{message['text']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
60
- # print(f"prompt: {prompt}")
61
- image = Image.open(image)
62
- inputs = processor(prompt, image, return_tensors='pt').to(0, torch.float16)
63
 
64
- streamer = TextIteratorStreamer(processor, **{"skip_special_tokens": False, "skip_prompt": True})
65
- generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024, do_sample=False)
66
 
 
 
 
 
 
 
 
 
 
 
67
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
68
  thread.start()
69
-
70
- text_prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{message['text']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
71
- # print(f"text_prompt: {text_prompt}")
72
-
73
  buffer = ""
74
- time.sleep(0.5)
75
  for new_text in streamer:
76
- # find <|eot_id|> and remove it from the new_text
77
- if "<|eot_id|>" in new_text:
78
- new_text = new_text.split("<|eot_id|>")[0]
79
  buffer += new_text
80
-
81
- # generated_text_without_prompt = buffer[len(text_prompt):]
82
  generated_text_without_prompt = buffer
83
- # print(generated_text_without_prompt)
84
- time.sleep(0.06)
85
- # print(f"new_text: {generated_text_without_prompt}")
86
- yield generated_text_without_prompt
87
-
88
-
89
- chatbot=gr.Chatbot(placeholder=PLACEHOLDER,scale=1)
90
- chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...", show_label=False)
91
- with gr.Blocks(fill_height=True, ) as demo:
92
- gr.ChatInterface(
93
- fn=bot_streaming,
94
- title="LLaVA Llama-3-8B",
95
- examples=[{"text": "What is on the flower?", "files": ["./bee.jpg"]},
96
- {"text": "How to make this pastry?", "files": ["./baklava.png"]}],
97
- description="Try [LLaVA Llama-3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
98
- stop_btn="Stop Generation",
99
- multimodal=True,
100
- textbox=chat_input,
101
- chatbot=chatbot,
102
- )
103
-
104
- demo.queue(api_open=False)
105
- demo.launch(show_api=False, share=False)
 
 
 
 
 
1
+ from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
2
+ from PIL import Image
3
+ import requests
4
+ import torch
5
  from threading import Thread
 
6
  import gradio as gr
7
+ from gradio import FileData
8
+ import time
 
 
 
9
  import spaces
10
+ import re
11
# Hub checkpoint that provides both the model weights and the processor.
ckpt = "Xkev/Llama-3.2V-11B-cot"

# Load the weights in bfloat16 first, then move the model to the GPU.
model = MllamaForConditionalGeneration.from_pretrained(
    ckpt,
    torch_dtype=torch.bfloat16,
)
model = model.to("cuda")

# The processor is later used for the chat template, for building model
# inputs, and as the tokenizer handed to the text streamer.
processor = AutoProcessor.from_pretrained(ckpt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
 
17
  @spaces.GPU
18
def bot_streaming(message, history, max_new_tokens=250):
    """Stream the model's reply for the current multimodal chat turn.

    Args:
        message: Mapping with a ``"text"`` string and a ``"files"`` list.
            Each file is either a path string (example inputs) or a mapping
            with a ``"path"`` key (regular uploads).
        history: Earlier turns as ``[user, assistant]`` pairs. An uploaded
            image appears as an entry whose first element is a tuple whose
            first item is the file path; the entry after it carries the text
            sent with the image and the assistant reply.
        max_new_tokens: Upper bound on the number of generated tokens.

    Yields:
        The reply accumulated so far, with ``<TAG>`` / ``</TAG>`` markers
        rewritten as "(Here begins the TAG stage)" / "(Here ends the TAG
        stage)".
    """
    txt = message["text"]
    messages, images = _history_to_messages(history)

    # Add the current message. Only a single attachment is forwarded to the
    # model; any other file count is sent as a text-only turn.
    files = message["files"]
    if len(files) == 1:
        attachment = files[0]
        # Examples pass a bare path string, regular input passes a mapping.
        path = attachment if isinstance(attachment, str) else attachment["path"]
        images.append(Image.open(path).convert("RGB"))
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
    else:
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

    texts = processor.apply_chat_template(messages, add_generation_prompt=True)

    if images:
        inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
    else:
        inputs = processor(text=texts, return_tensors="pt").to("cuda")

    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

    # Generation runs in a worker thread so this generator can drain the
    # streamer and yield partial output while tokens are still being produced.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        time.sleep(0.01)
        # Re-scan the whole buffer: a marker may arrive split across chunks
        # and only become matchable once the next chunk is appended.
        buffer = _mark_stage_tags(buffer + new_text)
        yield buffer


# Compiled once at import time instead of on every streamed chunk.
_STAGE_OPEN_TAG = re.compile(r"<(\w+)>")
_STAGE_CLOSE_TAG = re.compile(r"</(\w+)>")


def _mark_stage_tags(text):
    """Replace ``<TAG>`` / ``</TAG>`` markers with their readable stage notes."""
    text = _STAGE_OPEN_TAG.sub(r"(Here begins the \1 stage)", text)
    return _STAGE_CLOSE_TAG.sub(r"(Here ends the \1 stage)", text)


def _history_to_messages(history):
    """Convert chat history into chat-template messages plus their images.

    Returns a ``(messages, images)`` pair. Every image placeholder in
    ``messages`` has exactly one matching entry in ``images``, in order.
    """
    messages = []
    images = []
    pending = []  # images seen since the last text turn, not yet attached

    for entry in history:
        user = entry[0]

        if isinstance(user, tuple):
            # Image entry: hold it until the turn that carries its text.
            pending.append(Image.open(user[0]).convert("RGB"))
            continue

        if not pending and not isinstance(user, str):
            # Neither text nor the companion of an image entry: nothing to add.
            continue

        # A turn that follows an image entry may carry no text at all.
        text = user if isinstance(user, str) else ""
        content = [{"type": "text", "text": text}]
        content.extend({"type": "image"} for _ in pending)
        images.extend(pending)
        pending = []

        messages.append({"role": "user", "content": content})
        messages.append({"role": "assistant", "content": [{"type": "text", "text": entry[1]}]})

    # Images still pending here never got a text/reply turn. They are dropped
    # so the number of placeholders and images stays consistent.
    return messages, images
77
+
78
+
79
# Multimodal chat UI around bot_streaming. The slider value is passed to the
# handler as its max_new_tokens argument, and each example supplies a value
# for it as its second item.
demo = gr.ChatInterface(
    fn=bot_streaming,
    title="LLaVA-CoT",
    textbox=gr.MultimodalTextbox(),
    additional_inputs=[
        gr.Slider(
            minimum=512,
            maximum=1024,
            value=512,
            step=1,
            label="Maximum number of new tokens to generate",
        )
    ],
    examples=[
        [{"text": "What is on the flower?", "files": ["./bee.jpg"]}, 512],
        [{"text": "How to make this pastry?", "files": ["./baklava.png"]}, 512],
    ],
    cache_examples=False,
    # The note below quotes the wording bot_streaming substitutes for the
    # model's markers ("stage"), so both must stay in sync.
    description="Upload an image, and start chatting about it. To learn more about LLaVA-CoT, visit [our GitHub page](https://github.com/PKU-YuanGroup/LLaVA-CoT). Note: Since Gradio currently does not support displaying the special markings in the output, we have replaced it with the expression (Here begins the X stage).",
    stop_btn="Stop Generation",
    fill_height=True,
    multimodal=True,
)

demo.launch(debug=True)
requirements.txt CHANGED
@@ -1,5 +1,3 @@
1
  torch
2
- git+https://github.com/huggingface/transformers.git
3
  spaces
4
- pillow
5
- accelerate
 
1
  torch
 
2
  spaces
3
+ git+https://github.com/huggingface/transformers.git