Azure99 committed
Commit 7054442
1 Parent(s): f8566ce

Update app.py

Files changed (1): app.py (+3, -17)
app.py CHANGED
@@ -1,17 +1,14 @@
-import time
+from threading import Thread
 
 import gradio as gr
 import spaces
-from threading import Thread
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
 MAX_INPUT_LIMIT = 3584
-
 MODEL_NAME = "Azure99/blossom-v5.1-9b"
 
 model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
-
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
 GENERATE_CONFIG = dict(
@@ -22,7 +19,6 @@ GENERATE_CONFIG = dict(
     repetition_penalty=1.05
 )
 
-
 def get_input_ids(inst, history):
     prefix = ("A chat between a human and an artificial intelligence bot. "
               "The bot gives helpful, detailed, and polite answers to the human's questions.")
@@ -46,27 +42,17 @@ def chat(inst, history):
     with torch.no_grad():
         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         input_ids = get_input_ids(inst, history)
-        print(len(input_ids))
         if len(input_ids) > MAX_INPUT_LIMIT:
             yield "The input is too long, please clear the history."
             return
         generation_kwargs = dict(input_ids=torch.tensor([input_ids]).to(model.device), do_sample=True,
                                  streamer=streamer, **GENERATE_CONFIG)
         Thread(target=model.generate, kwargs=generation_kwargs).start()
-
-        # stop watch
-        start = time.time()
+
         outputs = ""
         for new_text in streamer:
             outputs += new_text
             yield outputs
-        total_time = time.time() - start
-        output_token_len = len(tokenizer.encode(outputs, add_special_tokens=False))
-        speed = output_token_len / total_time
-        print("----------")
-        print(history)
-        print([inst, outputs])
-        print(f"Speed: {speed:.2f} tokens/s")
 
 
 gr.ChatInterface(chat,
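
What this commit leaves in place is the streaming pattern at the core of app.py: model.generate blocks until decoding finishes, so it runs on a background Thread while the Gradio callback iterates the TextIteratorStreamer and yields partial text. A minimal self-contained sketch of that pattern, assuming a tiny placeholder model (sshleifer/tiny-gpt2) rather than the Space's blossom-v5.1-9b:

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder model for illustration only; the Space loads Azure99/blossom-v5.1-9b.
MODEL_NAME = "sshleifer/tiny-gpt2"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
input_ids = tokenizer("Hello", return_tensors="pt").input_ids

# generate() blocks until decoding finishes, so it runs on a worker
# thread while the main thread consumes decoded chunks from the streamer.
Thread(target=model.generate,
       kwargs=dict(input_ids=input_ids, max_new_tokens=32, streamer=streamer)).start()

outputs = ""
for new_text in streamer:  # iteration stops when generation completes
    outputs += new_text
print(outputs)
```

Accumulating into outputs and yielding the running string, as chat() does, is what gr.ChatInterface expects from a streaming callback: each yield replaces the displayed message rather than appending to it.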
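
The deleted stopwatch computed throughput as output tokens over wall-clock time and printed it after each reply. If that diagnostic is still wanted without print calls living inside chat(), one option is a generator wrapper; timed_stream below is a hypothetical helper sketched from the removed lines, not code from this repo:

```python
import time


def timed_stream(stream, tokenizer):
    # Hypothetical wrapper reproducing the removed stopwatch: re-yields
    # the accumulated text chunks, then reports decoding speed.
    start = time.time()
    outputs = ""
    for outputs in stream:  # chat() yields the full accumulated text each time
        yield outputs
    total_time = time.time() - start
    output_token_len = len(tokenizer.encode(outputs, add_special_tokens=False))
    print(f"Speed: {output_token_len / total_time:.2f} tokens/s")
```

Wiring it in would mean passing lambda inst, history: timed_stream(chat(inst, history), tokenizer) to gr.ChatInterface instead of chat itself.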