import gradio as gr
import time
from transformers import AutoTokenizer

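# Use Mistral's tokenizer so the simulated stream matches real token boundaries.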
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

starter_text = """# Abstract
Within thirty years, we will have the technological means to create superhuman intelligence. Shortly after,
the human era will be ended.
Is such progress avoidable? If not to be avoided, can events be guided so that we may survive? These questions
are investigated. Some possible answers (and some further dangers) are presented.
"""


def calculate_wait_seconds(tokens_per_second):
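    # Per-token delay (in seconds) that produces the requested throughput.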
    return 1 / tokens_per_second


def get_tokens(prompt):
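    # Mistral's SentencePiece tokenizer marks word boundaries with '▁' and
    # encodes newlines as the byte-fallback token '<0x0A>'; map both back to
    # plain text so the echoed output reads naturally.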
    tokens = tokenizer.tokenize(prompt)
    return [x.replace('▁', ' ').replace('<0x0A>', '\n') for x in tokens]


def echo(message, history, prompt, tokens_per_second, time_to_first_token, stream):
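    """Ignore the user message and echo `prompt` token by token at the configured speed."""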
    wait_seconds = calculate_wait_seconds(tokens_per_second)

    response = prompt
    tokens = get_tokens(response)

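    # Simulate time to first token (the slider value is in milliseconds).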
    if time_to_first_token:
        time.sleep(time_to_first_token / 1000)
    partial_message = ""
    for new_token in tokens:
        time.sleep(wait_seconds)
        if '<' in new_token:
            # Gradio chat chokes on HTML-like elements
            continue
        partial_message += str(new_token)
        if stream:
            yield partial_message

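    # Non-streaming mode: deliver the whole response at once after the same total delay.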
    if not stream:
        yield partial_message


with gr.Blocks(title='Tokens per Second Simulator') as demo:
    gr.Markdown('# ⏱️ Tokens per Second Simulator')
    gr.Markdown('Compare the feel of different response speeds for a chatbot')
    gr.Markdown('Reading speeds vary, but for English text 5-10 tokens per second is a typical reading pace')
    gr.Markdown(
        'References for further research:\n'
        '- https://www.perplexity.ai/search/How-many-tokens-1d7VyXCDQuWf3pJnK4.0iw?s=c\n'
        '- https://www.databricks.com/blog/llm-inference-performance-engineering-best-practices\n'
        '- https://news.ycombinator.com/item?id=35978864\n'
        '- https://www.reddit.com/r/LocalLLaMA/comments/162pgx9/what_do_yall_consider_acceptable_tokens_per/')

    prompt = gr.Textbox(starter_text, label="Prompt to Echo")
    tps_slider = gr.Slider(1, 50, value=8, label='Tokens per second (TPS)')
    ttft_slider = gr.Slider(0, 5000, value=0,
                            label='Time to first token (TTFT) in milliseconds')
    stream_checkbox = gr.Checkbox(label='Stream Response', value=True)

    gr.ChatInterface(echo, additional_inputs=[prompt, tps_slider, ttft_slider, stream_checkbox],
                     description='Submit any text to echo the prompt above at the selected speed.')

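# Queuing lets Gradio run the generator handler and stream partial responses.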
demo.queue().launch()