howard-hou committed on
Commit c42ed6d
1 Parent(s): a14af9e

Update app.py

Files changed (1)
  1. app.py +1 -127
app.py CHANGED
@@ -1,129 +1,3 @@
- import gradio as gr
- import os, gc, copy, torch
- from datetime import datetime
- from huggingface_hub import hf_hub_download
- from transformers import CLIPVisionModel
- import torch.nn as nn
- import torch.nn.functional as F
-
- ctx_limit = 3500
- title = "rwkv1b5-vitl336p14-577token_mix665k_rwkv"
-
- os.environ["RWKV_JIT_ON"] = '1'
- os.environ["RWKV_CUDA_ON"] = '0' # if '1' then use CUDA kernel for seq mode (much faster)
-
- from rwkv.model import RWKV
- model_path = hf_hub_download(repo_id="howard-hou/visualrwkv-5", filename=f"{title}.pth")
- model = RWKV(model=model_path, strategy='cpu fp32')
- from rwkv.utils import PIPELINE, PIPELINE_ARGS
- pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
-
-
- class VisualRWKV(nn.Module):
-     def __init__(self, args):
-         super().__init__()
-         self.args = args
-         self.vit = CLIPVisionModel.from_pretrained(args.vision_tower_name)
-         self.proj = nn.Linear(self.vit.config.hidden_size, args.n_embd, bias=False)
-
-     def encode_images(self, images):
-         B, N, C, H, W = images.shape
-         images = images.view(B*N, C, H, W)
-         image_features = self.vit(images).last_hidden_state
-         L, D = image_features.shape[1], image_features.shape[2]
-         # rearrange [B*N, L, D] -> [B, N, L, D]
-         image_features = image_features.view(B, N, L, D)[:, 0, :, :]
-         image_features = self.grid_pooling(image_features)
-         return self.proj(image_features)
-
-     def grid_pooling(self, image_features):
-         if self.args.grid_size == -1: # no grid pooling
-             return image_features
-         if self.args.grid_size == 0: # take cls token
-             return image_features[:, 0:1, :]
-         if self.args.grid_size == 1: # global avg pooling
-             return image_features.mean(dim=1, keepdim=True)
-         cls_features = image_features[:, 0:1, :]
-         image_features = image_features[:, 1:, :] # drop cls token
-         B, L, D = image_features.shape
-         H_or_W = int(L**0.5)
-         image_features = image_features.view(B, H_or_W, H_or_W, D)
-         grid_stride = H_or_W // self.args.grid_size
-         image_features = F.avg_pool2d(image_features.permute(0, 3, 1, 2),
-                                       padding=0,
-                                       kernel_size=grid_stride,
-                                       stride=grid_stride)
-         image_features = image_features.permute(0, 2, 3, 1).view(B, -1, D)
-         return torch.cat((cls_features, image_features), dim=1)
-
-
- ##########################################################################
-
-
- def generate_prompt(instruction, input=""):
-     instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
-     input = input.strip().replace('\r\n','\n').replace('\n\n','\n')
-     if input:
-         return f"""Instruction: {instruction}
-
- Input: {input}
-
- Response:"""
-     else:
-         return f"""User: hi
-
- Assistant: Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.
-
- User: {instruction}
-
- Assistant:"""
-
- def evaluate(
-     ctx,
-     token_count=200,
-     temperature=1.0,
-     top_p=0.7,
-     presencePenalty = 0.1,
-     countPenalty = 0.1,
- ):
-     args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p),
-                          alpha_frequency = countPenalty,
-                          alpha_presence = presencePenalty,
-                          token_ban = [], # ban the generation of some tokens
-                          token_stop = [0]) # stop generation whenever you see any token here
-     ctx = ctx.strip()
-     all_tokens = []
-     out_last = 0
-     out_str = ''
-     occurrence = {}
-     state = None
-     for i in range(int(token_count)):
-         out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
-         for n in occurrence:
-             out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
-
-         token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
-         if token in args.token_stop:
-             break
-         all_tokens += [token]
-         for xxx in occurrence:
-             occurrence[xxx] *= 0.996
-         if token not in occurrence:
-             occurrence[token] = 1
-         else:
-             occurrence[token] += 1
-
-         tmp = pipeline.decode(all_tokens[out_last:])
-         if '\ufffd' not in tmp:
-             out_str += tmp
-             yield out_str.strip()
-             out_last = i + 1
-
-     del out
-     del state
-     gc.collect()
-     yield out_str.strip()
-
  import gradio as gr
  import os, gc
  from datetime import datetime
@@ -142,7 +16,7 @@ from rwkv.utils import PIPELINE, PIPELINE_ARGS
  pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
 
  ##########################################################################
- from model import VisualEncoder, EmbeddingMixer, VisualEncoderConfig
+ from .model import VisualEncoder, EmbeddingMixer, VisualEncoderConfig
  emb_mixer = EmbeddingMixer(model.w["emb.weight"], num_image_embeddings=4096)
  config = VisualEncoderConfig(n_embd=model.args.n_embd,
                               vision_tower_name='openai/clip-vit-large-patch14-336',
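A note on the removed grid_pooling: CLIP ViT-L/14 at 336px emits one CLS token plus a 24x24 grid of 576 patch tokens (the 577 in the checkpoint name), and for grid_size > 1 the method average-pools that grid down to grid_size x grid_size before re-attaching the CLS token. A minimal sketch of the same operation, assuming grid_size = 8 (the real value comes from the model args):

import torch
import torch.nn.functional as F

B, L, D = 1, 577, 1024   # batch, CLS + 24*24 patch tokens, ViT-L hidden size
grid_size = 8            # assumed here; -1/0/1 are special-cased in the original

feats = torch.randn(B, L, D)
cls_tok, patches = feats[:, :1, :], feats[:, 1:, :]
side = int(patches.shape[1] ** 0.5)                           # 24
stride = side // grid_size                                    # 3
grid = patches.reshape(B, side, side, D).permute(0, 3, 1, 2)  # [B, D, 24, 24]
pooled = F.avg_pool2d(grid, kernel_size=stride, stride=stride)
pooled = pooled.permute(0, 2, 3, 1).reshape(B, -1, D)         # [B, 64, D]
out = torch.cat((cls_tok, pooled), dim=1)                     # [B, 65, D]: 65 visual tokens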
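For reference, the removed generate_prompt wraps a bare instruction in a fixed dialogue template; for example, generate_prompt("Describe the image.") renders as:

User: hi

Assistant: Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.

User: Describe the image.

Assistant: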
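The removed evaluate loop also implements decayed repetition penalties: each sampled token pays a presence penalty plus a frequency penalty proportional to its running count, and all counts decay geometrically every step so recent repeats are punished hardest. A standalone sketch of just that bookkeeping, reusing the deleted loop's constants (0.1 / 0.1 / 0.996):

alpha_presence, alpha_frequency, decay = 0.1, 0.1, 0.996
occurrence = {}  # token id -> decayed count

def penalize(logits):
    # Push down logits of tokens generated before; repeat offenders pay more.
    for tok, count in occurrence.items():
        logits[tok] -= alpha_presence + count * alpha_frequency
    return logits

def record(token):
    # Decay all existing counts first, then bump the token just sampled,
    # matching the order of operations in the deleted loop.
    for tok in occurrence:
        occurrence[tok] *= decay
    occurrence[token] = occurrence.get(token, 0) + 1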
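Net effect of the commit: the stale duplicated block at the top of app.py (the old inline imports, VisualRWKV class, generate_prompt, and evaluate) is deleted, and the one surviving change is the import becoming relative:

- from model import VisualEncoder, EmbeddingMixer, VisualEncoderConfig
+ from .model import VisualEncoder, EmbeddingMixer, VisualEncoderConfig

The relative form resolves model.py against app.py's own package rather than searching sys.path for a top-level model module; note that relative imports only work when app.py is itself loaded as part of a package.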