File size: 9,032 Bytes
c5c0f28
 
 
 
 
 
 
2367c90
 
 
 
 
c5c0f28
 
2367c90
 
 
 
 
c5c0f28
 
 
 
 
2367c90
c5c0f28
 
 
 
 
 
 
 
 
 
 
 
 
2367c90
 
 
 
 
c5c0f28
 
 
2367c90
c5c0f28
 
 
 
 
 
 
 
2367c90
c5c0f28
2367c90
 
 
c5c0f28
2367c90
c5c0f28
 
 
 
 
 
 
 
 
 
2367c90
c5c0f28
 
 
 
2367c90
c5c0f28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2367c90
c5c0f28
2367c90
c5c0f28
2367c90
 
c5c0f28
 
 
2367c90
 
c5c0f28
 
2367c90
c5c0f28
 
2367c90
c5c0f28
 
2367c90
 
 
c5c0f28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2367c90
 
 
 
c5c0f28
 
 
2367c90
 
 
 
c5c0f28
 
 
 
 
 
 
 
 
 
2367c90
c5c0f28
2367c90
c5c0f28
 
 
 
 
 
 
 
 
 
 
 
2367c90
c5c0f28
 
2367c90
 
 
 
 
 
 
 
c5c0f28
 
2367c90
c5c0f28
 
 
 
 
 
 
 
 
 
2367c90
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import copy
import os
import re
import secrets
import tempfile
from argparse import ArgumentParser
from http import HTTPStatus
from pathlib import Path

import dashscope
import gradio as gr
import numpy as np
import oss2
import requests
from dashscope import MultiModalConversation

# Install/upgrade the runtime packages with pip every time the module is loaded.
# NOTE(review): this executes after the module's import statements have already
# run, so it cannot rescue a missing package on first start — confirm it is
# still wanted here.
os.system('pip install dashscope modelscope oss2 -U')

# Required configuration, read from the environment. Indexing os.environ
# directly means a missing variable raises KeyError at import time.
API_KEY = os.environ['API_KEY']
ENDPOINT = os.environ['ENDPOINT']
AK_ID = os.environ['AK_ID']
AK = os.environ['AK']
BUCKET_NAME = os.environ['BUCKET_NAME']

# Shared clients: DashScope is configured globally with the API key, and a single
# OSS bucket handle (built from the AK_ID/AK credentials) is reused by the demo.
dashscope.api_key = API_KEY
endpoint = ENDPOINT
auth = oss2.Auth(AK_ID, AK)
bucket = oss2.Bucket(auth, endpoint, BUCKET_NAME)

# Default value for the --revision command-line option.
REVISION = 'v1.0.4'
# Non-greedy match of a <box>...</box> span, including across newlines.
BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
# NOTE(review): the string literal below ends at the second double quote, so the
# value is just "!?。" and the rest of the line is parsed as a comment. It looks
# like a non-ASCII quote character was lost at some point — confirm the intended
# character set before relying on this constant.
PUNCTUATION = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."

def _get_args():
    """Parse the demo's command-line options from ``sys.argv``.

    Returns:
        argparse.Namespace: carries ``revision``, ``cpu_only``, ``share``,
        ``inbrowser``, ``server_port`` and ``server_name``.
    """
    # (flag, add_argument keyword arguments), kept in the order they should
    # appear in the generated --help output.
    option_table = (
        ("--revision", {"type": str, "default": REVISION}),
        ("--cpu-only", {"action": "store_true", "help": "Run demo with CPU only"}),
        ("--share", {"action": "store_true", "default": False, "help": "Create a publicly shareable link for the interface."}),
        ("--inbrowser", {"action": "store_true", "default": False, "help": "Automatically launch the interface in a new tab on the default browser."}),
        ("--server-port", {"type": int, "default": 7860, "help": "Demo server port."}),
        ("--server-name", {"type": str, "default": "127.0.0.1", "help": "Demo server name."}),
    )
    cli = ArgumentParser()
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()

def _parse_text(text):
    lines = text.split("\n")
    lines = [line for line in lines if line]
    count = 0
    for i, line in enumerate(lines):
        if "```" in line:
            count += 1
            items = line.split("`")
            if count % 2 == 1:
                lines[i] = f'<pre><code class="language-{items[-1]}">'
            else:
                lines[i] = "<br></code></pre>"
        else:
            if i > 0 and count % 2 == 1:
                line = re.sub(r'[<>]', lambda x: f'&{x.group(0)};', line)
                line = line.replace(" ", "&nbsp;")
                lines[i] = "<br>" + line
    return "".join(lines)

def _remove_image_special(text):
    text = text.replace('<ref>', '').replace('</ref>', '')
    return re.sub(r'<box>.*?(</box>|$)', '', text)

def is_video_file(filename):
    """Return True when *filename* ends with a known video extension.

    The comparison is case-insensitive and looks only at the end of the name.
    """
    lowered = filename.lower()
    # str.endswith accepts a tuple and returns True if any suffix matches.
    return lowered.endswith(('.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.mpeg'))

def _launch_demo(args):
    """Build the Gradio chat interface and start serving it.

    Args:
        args: Parsed command-line namespace. Only ``args.share`` is read here;
            the other launch options are currently commented out below.
    """
    # Where downloaded result images are written: GRADIO_TEMP_DIR when set,
    # otherwise a "gradio" folder under the system temp directory.
    uploaded_file_dir = os.environ.get("GRADIO_TEMP_DIR") or str(Path(tempfile.gettempdir()) / "gradio")

    def predict(_chatbot, task_history):
        """Stream the model's answer for the latest turn into the chatbot.

        Args:
            _chatbot: Display history as a list of (user, bot) pairs; the last
                entry is the turn being answered.
            task_history: Raw history as a list of (query, response) pairs; the
                last entry is updated in place with the final response.

        Yields:
            The chatbot history after each streamed chunk, and once more after
            the final response has been recorded.

        Raises:
            requests.HTTPError: If a streamed response reports a non-OK status,
                or if downloading the result image fails.
        """
        chat_query = _chatbot[-1][0]
        query = task_history[-1][0]
        if not chat_query:
            # Empty submission: drop the placeholder turn from both histories.
            # This function is a generator, so the trimmed history has to be
            # yielded — a returned value would never reach the UI.
            _chatbot.pop()
            task_history.pop()
            yield _chatbot
            return
        print("User: " + _parse_text(query))
        history_cp = copy.deepcopy(task_history)
        full_response = ""
        messages = []
        content = []
        # Uploaded files accumulate in `content` until the next text turn, which
        # closes the user message and is followed by its assistant reply.
        for q, a in history_cp:
            if isinstance(q, (tuple, list)):
                if is_video_file(q[0]):
                    # Videos are uploaded to the bucket and passed as a signed URL.
                    # The object key is the name of the file's parent directory
                    # (presumably Gradio's per-upload temp folder — confirm).
                    key = q[0].split(os.sep)[-2]
                    bucket.put_object_from_file(key, q[0])
                    url = bucket.sign_url('GET', key, 3600)
                    content.append({'video': url})
                else:
                    content.append({'image': f'file://{q[0]}'})
            else:
                content.append({'text': q})
                messages.append({'role': 'user', 'content': content})
                messages.append({'role': 'assistant', 'content': [{'text': a}]})
                content = []
        # The last assistant entry is the still-unanswered placeholder.
        messages.pop()
        responses = MultiModalConversation.call(model='qwen-vl-max-0809', messages=messages, stream=True)
        response_content = None
        for response in responses:
            if response.status_code != HTTPStatus.OK:
                raise requests.HTTPError(f'response.code: {response.code}\nresponse.message: {response.message}')
            response_content = response.output.choices[0].message.content
            response_text = [ele.get('text', '') for ele in response_content]
            response_text = ''.join(response_text)
            _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(response_text))
            yield _chatbot
        if not response_content:
            # The stream produced no content; there is nothing to finalise.
            yield _chatbot
            return
        if len(response_content) > 1:
            # Multi-part reply: the last element carries a 'result_image' URL,
            # which is downloaded and shown as an extra bot message.
            result_image = response_content[-1]['result_image']
            resp = requests.get(result_image, timeout=60)
            resp.raise_for_status()
            os.makedirs(uploaded_file_dir, exist_ok=True)
            filename = os.path.join(uploaded_file_dir, f"tmp{secrets.token_hex(20)}.jpg")
            with open(filename, 'wb') as f:
                f.write(resp.content)
            response_text = ''.join(r.get('box', '') for r in response_content[:-1])
            _chatbot.append((None, (filename,)))
        else:
            response_text = response_content[0]['text']
            _chatbot[-1] = (_parse_text(chat_query), response_text)
        full_response = _parse_text(response_text)
        task_history[-1] = (query, full_response)
        print("Qwen2-VL-Chat: " + _parse_text(full_response))
        yield _chatbot

    def regenerate(_chatbot, task_history):
        """Discard the last answer and stream a fresh one for the same query.

        Yields the chatbot history unchanged when there is nothing to retry
        (no history, or the last turn has no response yet).
        """
        # Generator function: early exits must yield the unchanged history.
        if not task_history:
            yield _chatbot
            return
        item = task_history[-1]
        if item[1] is None:
            yield _chatbot
            return
        task_history[-1] = (item[0], None)
        chatbot_item = _chatbot.pop(-1)
        if chatbot_item[0] is None:
            # The popped entry was a bot-only message (e.g. a result image);
            # clear the answer on the turn before it.
            _chatbot[-1] = (_chatbot[-1][0], None)
        else:
            _chatbot.append((chatbot_item[0], None))
        _chatbot_gen = predict(_chatbot, task_history)
        for _chatbot in _chatbot_gen:
            yield _chatbot

    def add_text(history, task_history, text):
        """Append a text turn to both histories.

        The display history gets the HTML-rendered text, the task history the
        raw text. The third return value is an empty string.
        """
        task_text = text
        history = history or []
        task_history = task_history or []
        history.append((_parse_text(text), None))
        task_history.append((task_text, None))
        return history, task_history, ""

    def add_file(history, task_history, file):
        """Append an uploaded file, as a one-element tuple of its path, to both histories."""
        history = history or []
        task_history = task_history or []
        history.append(((file.name,), None))
        task_history.append(((file.name,), None))
        return history, task_history

    def reset_user_input():
        """Clear the input textbox."""
        return gr.update(value="")

    def reset_state(task_history):
        """Empty the task history in place and return an empty chatbot history."""
        task_history.clear()
        return []

    with gr.Blocks() as demo:
        gr.Markdown("""<p align="center"><img src="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png" style="height: 80px"/></p>""")
        gr.Markdown("""<center><font size=8>Qwen2-VL-72B</center>""")
        gr.Markdown("""<center><font size=3>This WebUI is based on Qwen2-VL-72B, developed by Alibaba Cloud.</center>""")
        gr.Markdown("""<center><font size=3>本WebUI基于Qwen2-VL-72B。</center>""")

        chatbot = gr.Chatbot(label='Qwen2-VL-72B', elem_classes="control-height", height=500)
        query = gr.Textbox(lines=2, label='Input')
        task_history = gr.State([])

        with gr.Row():
            addfile_btn = gr.UploadButton("📁 Upload (上传文件)", file_types=["image", "video"])
            submit_btn = gr.Button("🚀 Submit (发送)")
            regen_btn = gr.Button("🤔️ Regenerate (重试)")
            empty_bin = gr.Button("🧹 Clear History (清除历史)")

        # Submit: record the turn first, then stream the model's answer.
        submit_btn.click(add_text, [chatbot, task_history, query], [chatbot, task_history], concurrency_limit=40).then(
            predict, [chatbot, task_history], [chatbot], show_progress=True
        )
        # A second handler on the same click clears the textbox.
        submit_btn.click(reset_user_input, [], [query], concurrency_limit=40)
        empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True, concurrency_limit=40)
        regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True, concurrency_limit=40)
        addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True, concurrency_limit=40)

        gr.Markdown("""<font size=2>Note: This demo is governed by the original license of Qwen2-VL. We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, including hate speech, violence, pornography, deception, etc. (注:本演示受Qwen2-VL的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。)""")

    demo.queue(api_open=False, default_concurrency_limit=40).launch(
        share=args.share,
        max_threads=40,
        # Uncomment if needed:
        # inbrowser=args.inbrowser,
        # server_port=args.server_port,
        # server_name=args.server_name,
    )

def main():
    """Script entry point: parse the command line and launch the demo."""
    _launch_demo(_get_args())

if __name__ == '__main__':
    main()