import gradio as gr
import os, gc
from datetime import datetime
from transformers import CLIPImageProcessor
from huggingface_hub import hf_hub_download
from typing import List, Dict
from dataclasses import dataclass
DEFAULT_IMAGE_TOKEN = "<image>"


ctx_limit = 3500
num_image_embeddings = 4096
title = "rwkv1b5-vitl336p14-577token_mix665k_rwkv"
vision_tower_name = 'openai/clip-vit-large-patch14-336'

os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '0' # if '1' then use CUDA kernel for seq mode (much faster)

from rwkv.model import RWKV
model_path = hf_hub_download(repo_id="howard-hou/visualrwkv-5", filename=f"{title}.pth")
model = RWKV(model=model_path, strategy='cpu fp32')
from rwkv.utils import PIPELINE, PIPELINE_ARGS
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
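# PIPELINE wraps the model with the RWKV world tokenizer (rwkv_vocab_v20230424) and
# provides the encode / decode / sample_logits helpers used in the generation loop below.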

##########################################################################
from modeling import VisualEncoder, EmbeddingMixer, VisualEncoderConfig
emb_mixer = EmbeddingMixer(model.w["emb.weight"], 
                           num_image_embeddings=num_image_embeddings)
config = VisualEncoderConfig(n_embd=model.args.n_embd, 
                             vision_tower_name=vision_tower_name, 
                             grid_size=-1)
visual_encoder = VisualEncoder(config)
image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
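# VisualEncoder, EmbeddingMixer and VisualEncoderConfig come from the project-local
# modeling.py. As used below (an inference from this file, not documented behaviour),
# EmbeddingMixer reserves num_image_embeddings extra rows on top of the text embedding
# table and fills them with the encoded CLIP features for the current image, so the
# image can be fed to RWKV as ordinary token ids.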
##########################################################################
def generate_prompt(instruction):
    instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
    return f"\n{instruction}\n\nAssistant:"
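# e.g. generate_prompt("What is unusual about this image?")
#      -> "\nWhat is unusual about this image?\n\nAssistant:"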

def generate(
    ctx,
    image_ids,
    token_count=200,
    temperature=1.0,
    top_p=0.7,
    presencePenalty = 0.1,
    countPenalty = 0.1,
):
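    # Sampling loop: on the first step, feed the image embedding ids followed by the
    # encoded text prompt (truncated to ctx_limit); afterwards feed back only the last
    # sampled token together with the recurrent state. Presence/frequency penalties are
    # subtracted from the logits and decayed each step; decoded text is yielded only
    # once it contains no partial UTF-8 sequence ('\ufffd').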
    args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p),
                         alpha_frequency = countPenalty,
                         alpha_presence = presencePenalty,
                         token_ban = [],   # ban the generation of some tokens
                         token_stop = [0]) # stop generation whenever you see any token here
    ctx = ctx.strip()
    all_tokens = []
    out_last = 0
    out_str = ''
    occurrence = {}
    state = None
    for i in range(int(token_count)):
        if i == 0:
            input_ids = (image_ids + pipeline.encode(ctx))[-ctx_limit:]
        else:
            input_ids = [token]
        out, state = model.forward(input_ids, state)
        for n in occurrence:
            out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)

        token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
        if token in args.token_stop:
            break
        all_tokens += [token]
        # decay accumulated repetition penalties, then count the newly sampled token
        for xxx in occurrence:
            occurrence[xxx] *= 0.996
        if token not in occurrence:
            occurrence[token] = 1
        else:
            occurrence[token] += 1
        
        tmp = pipeline.decode(all_tokens[out_last:])
        if '\ufffd' not in tmp:
            out_str += tmp
            yield out_str.strip()
            out_last = i + 1

    del out
    del state
    gc.collect()
    yield out_str.strip()


##########################################################################
cur_dir = os.path.dirname(os.path.abspath(__file__))
examples = [
    [
        f"{cur_dir}/examples_extreme_ironing.jpg",
        "What is unusual about this image?",
    ],
    [
        f"{cur_dir}/examples_waterview.jpg",
        "What are the things I should be cautious about when I visit here?",
    ]
]
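# Per-request flow (tensor shapes are an assumption based on how modeling.py is used here):
# 1. CLIPImageProcessor converts the PIL image into a pixel tensor.
# 2. visual_encoder.encode_images projects it into RWKV-sized image embeddings.
# 3. emb_mixer splices those embeddings into the model's embedding table, so the image
#    is addressed through the ids in image_ids when prepended to the text prompt.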
def test(image, question):
    image = image_processor(images=image.convert('RGB'), return_tensors='pt')['pixel_values']
    image_features = visual_encoder.encode_images(image.unsqueeze(0))
    emb_mixer.set_image_embeddings(image_features)
    model.w["emb.weight"] = emb_mixer.get_input_embeddings()
    image_ids = [i for i in range(emb_mixer.image_start_index, emb_mixer.image_start_index + len(image_features))]
    input_text = generate_prompt(question)
    for output in generate(input_text, image_ids):
        yield output
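# test() mutates the shared model.w["emb.weight"] in place, so requests must not run
# concurrently; the queue below is limited to a single worker for that reason (an
# inference from the code, not documented behaviour).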

demo = gr.Interface(fn=test,
                    inputs=[gr.Image(type='pil'), "text"],
                    outputs="text",
                    examples=examples,
                    title=title,
                    description="VisualRWKV-v5.0")

demo = demo.queue(concurrency_count=1, max_size=10)
demo.launch(share=False)