howard-hou committed
Commit 7758cb9
1 parent: d9a5ffa

Update app.py

Files changed (1)
  1. app.py +27 -21
app.py CHANGED
@@ -1,10 +1,17 @@
 import gradio as gr
 import os, gc
 from datetime import datetime
+from transformers import CLIPImageProcessor
 from huggingface_hub import hf_hub_download
+from typing import List, Dict
+from dataclasses import dataclass
+DEFAULT_IMAGE_TOKEN = "<image>"
+
 
 ctx_limit = 3500
+num_image_embeddings = 4096
 title = "rwkv1b5-vitl336p14-577token_mix665k_rwkv"
+vision_tower_name = 'openai/clip-vit-large-patch14-336'
 
 os.environ["RWKV_JIT_ON"] = '1'
 os.environ["RWKV_CUDA_ON"] = '0' # if '1' then use CUDA kernel for seq mode (much faster)
@@ -17,32 +24,22 @@ pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
 
 ##########################################################################
 from modeling import VisualEncoder, EmbeddingMixer, VisualEncoderConfig
-emb_mixer = EmbeddingMixer(model.w["emb.weight"], num_image_embeddings=4096)
+emb_mixer = EmbeddingMixer(model.w["emb.weight"],
+                           num_image_embeddings=num_image_embeddings)
 config = VisualEncoderConfig(n_embd=model.args.n_embd,
-                             vision_tower_name='openai/clip-vit-large-patch14-336',
+                             vision_tower_name=vision_tower_name,
                              grid_size=-1)
 visual_encoder = VisualEncoder(config)
+image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
 ##########################################################################
-def generate_prompt(instruction, input=""):
+def generate_prompt(instruction):
     instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
     input = input.strip().replace('\r\n','\n').replace('\n\n','\n')
-    if input:
-        return f"""Instruction: {instruction}
-
-Input: {input}
-
-Response:"""
-    else:
-        return f"""User: hi
-
-Assistant: Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.
+    return f"\n{instruction}\n\nAssistant:"
 
-User: {instruction}
-
-Assistant:"""
-
-def evaluate(
+def generate(
     ctx,
+    image_ids,
     token_count=200,
     temperature=1.0,
     top_p=0.7,
@@ -61,7 +58,11 @@ def evaluate(
     occurrence = {}
     state = None
     for i in range(int(token_count)):
-        out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
+        if i == 0:
+            input_ids = (image_ids + pipeline.encode(ctx))[-ctx_limit:]
+        else:
+            input_ids = [token]
+        out, state = model.forward(input_ids, state)
         for n in occurrence:
             out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
 
@@ -101,8 +102,13 @@ examples = [
     ]
 ]
 def test(image, question):
-    print(image, question)
-    return question
+    image = image_processor(images=image.convert('RGB'), return_tensors='pt')['pixel_values']
+    image_features = visual_encoder.encode_images(image.unsqueeze(0))
+    image_ids = [i for i in range(emb_mixer.image_start_index, emb_mixer.image_start_index + len(image_features))]
+    input_text = generate_prompt(question)
+    for output in generate(input_text, image_ids):
+        yield output
+
 demo = gr.Interface(fn=test,
                     inputs=[gr.Image(type='pil'), "text"],
                     outputs="text",
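One leftover worth flagging: the new `generate_prompt(instruction)` keeps the line `input = input.strip().replace('\r\n','\n').replace('\n\n','\n')` from the old two-argument version. With the `input` parameter removed, that name now resolves to Python's builtin `input` function, which has no `.strip()` method, so the first call will raise `AttributeError`. A minimal fix (not part of this commit) is simply to drop the stale line:

```python
def generate_prompt(instruction):
    # normalize line endings and collapse doubled newlines, as in the commit
    instruction = instruction.strip().replace('\r\n', '\n').replace('\n\n', '\n')
    return f"\n{instruction}\n\nAssistant:"
```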
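`modeling.py` is not part of this commit, so the exact `EmbeddingMixer` semantics are not visible here. The constructor call with `model.w["emb.weight"]` and `num_image_embeddings`, together with the `image_start_index` attribute used in `test`, suggest that a block of token IDs just past the text vocabulary is reserved for visual features, which the IDs prepended in `generate` then address. The class below is a hypothetical reconstruction of that idea, not the actual `modeling.py` code:

```python
import torch

class EmbeddingMixerSketch:
    """Hypothetical: a frozen text-embedding table plus a writable block of
    image slots addressed by token IDs >= image_start_index."""

    def __init__(self, emb_weight: torch.Tensor, num_image_embeddings: int):
        self.text_emb = emb_weight                    # (vocab_size, n_embd)
        self.image_start_index = emb_weight.shape[0]  # first reserved ID
        self.image_emb = torch.zeros(num_image_embeddings, emb_weight.shape[1])

    def set_image_features(self, feats: torch.Tensor) -> list:
        """Store (num_tokens, n_embd) visual features; return their token IDs."""
        self.image_emb[: feats.shape[0]] = feats
        return list(range(self.image_start_index,
                          self.image_start_index + feats.shape[0]))

    def embed(self, input_ids: list) -> torch.Tensor:
        # reserved IDs pull from the image buffer, everything else from text
        rows = [self.image_emb[i - self.image_start_index]
                if i >= self.image_start_index else self.text_emb[i]
                for i in input_ids]
        return torch.stack(rows)
```

If the real class works like this, `test` would also need to hand `image_features` to the mixer before generation; as committed, the features are computed but never attached to the reserved IDs, so this commit reads as an intermediate step unless that wiring happens inside `modeling.py`.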
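The decoding loop, unchanged by this commit apart from the new `input_ids` handling, applies the usual presence/frequency penalty to repeated tokens before the next token is sampled. A self-contained sketch of just that step, with illustrative penalty values (the real `alpha_presence` and `alpha_frequency` live in the elided part of app.py):

```python
import numpy as np

def apply_repetition_penalty(out, occurrence,
                             alpha_presence=0.3, alpha_frequency=0.3):
    """Lower the logit of each previously generated token by a flat presence
    penalty plus a frequency penalty scaled by how often it has appeared."""
    for tok, count in occurrence.items():
        out[tok] -= alpha_presence + count * alpha_frequency
    return out

# usage inside a decode loop: record each sampled token, then penalize
occurrence = {}
logits = np.random.randn(65536).astype(np.float32)  # dummy logits
token = int(logits.argmax())                        # stand-in for sampling
occurrence[token] = occurrence.get(token, 0) + 1
logits = apply_repetition_penalty(logits, occurrence)
```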
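Two smaller observations. The new `typing`, `dataclasses`, and `DEFAULT_IMAGE_TOKEN` additions are not referenced anywhere else in this diff, so they look like groundwork for a follow-up change. And because `test` now yields partial strings instead of returning one, `gr.Interface` treats it as a generator and streams output to the textbox; recent Gradio versions support this pattern directly, as in this minimal standalone example:

```python
import time
import gradio as gr

def stream_echo(text):
    # yield successively longer prefixes; Gradio re-renders on each yield
    out = ""
    for ch in text:
        out += ch
        time.sleep(0.05)
        yield out

gr.Interface(fn=stream_echo, inputs="text", outputs="text").launch()
```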