Spaces:
Running
Running
File size: 2,998 Bytes
8b8b0b2 bb35b84 8b8b0b2 d982401 bb35b84 8b8b0b2 bb35b84 a13b06e bb35b84 d982401 bb35b84 8b8b0b2 23ac168 8b8b0b2 23ac168 8b8b0b2 6f5234b a13b06e bca6bf8 a13b06e bb35b84 5b4a85e bb35b84 8b8b0b2 bb35b84 8b8b0b2 23ac168 8b8b0b2 1300829 8d83783 2a2899d 1300829 8b8b0b2 2a2899d 25721d0 23ac168 bb35b84 45ea516 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, pipeline
from threading import Thread
# Hugging Face Hub identifier of Microsoft's phi-2 checkpoint
checkpoint = "microsoft/phi-2"

# Download and load the tokenizer and the causal-LM weights (float32, on CPU).
# NOTE(review): trust_remote_code=True permits code shipped with the
# checkpoint repository to run locally - confirm it is still required.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    trust_remote_code=True,
    device_map="cpu",
    torch_dtype=torch.float32,
)

# Text-generation pipeline built from the objects loaded above; the EOS token
# id is supplied both as the end-of-sequence id and as the padding id.
phi2 = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="cpu",
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)
# Notice shown to the user when the conversation no longer fits in the
# model's context window together with the requested number of new tokens.
_CONTEXT_OVERFLOW_NOTICE = "Input exceeded context size, please clear the chat history and retry!"


def _build_prompt(message, chat_history):
    """Render the instruction, the earlier turns and the new message as one prompt string."""
    instruction = "You are a helpful assistant to 'User'. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'."
    parts = [f"Instruction: {instruction}\n"]
    for sent, received in chat_history:
        parts.append(f"User: {sent}\n")
        parts.append(f"Assistant: {received}\n")
    parts.append(f"User: {message}\n")
    parts.append("Output:")
    return "".join(parts)


def _clean_response(generated_text):
    """Reduce the text generated so far to the assistant's own single reply."""
    response = generated_text.strip()
    # Drop anything from the point where the model starts writing a 'User' turn.
    if "User:" in response:
        response = response.split("User:")[0].strip()
    # If the model labelled its reply, keep only the text after the first label.
    if "Assistant:" in response:
        response = response.split("Assistant:")[1].strip()
    return response


# Function that accepts a prompt and generates text using the phi2 pipeline
def generate(message, chat_history, max_new_tokens):
    """Stream the assistant's reply to ``message`` using the phi2 pipeline.

    Args:
        message: The user's new message.
        chat_history: Iterable of ``(sent, received)`` pairs holding the
            earlier user messages and assistant replies.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Coerced to ``int`` in case the caller supplies a float.

    Yields:
        The cleaned reply accumulated so far, once per chunk emitted by the
        streamer. If the prompt plus ``max_new_tokens`` does not fit in the
        tokenizer's ``model_max_length``, a single fixed notice is yielded
        instead and the model is not invoked.
    """
    max_new_tokens = int(max_new_tokens)
    final_prompt = _build_prompt(message, chat_history)

    # Emit the overflow notice directly rather than asking the model to repeat
    # it: a model call is slow, may not reproduce the text verbatim, and cuts
    # the notice short when max_new_tokens is smaller than the notice itself.
    if len(tokenizer.tokenize(final_prompt)) >= tokenizer.model_max_length - max_new_tokens:
        yield _CONTEXT_OVERFLOW_NOTICE
        return

    # The pipeline runs in a worker thread and pushes decoded text into the
    # streamer, which this generator consumes as it arrives.
    streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=300.0)
    thread = Thread(
        target=phi2,
        kwargs={"text_inputs": final_prompt, "max_new_tokens": max_new_tokens, "streamer": streamer},
    )
    thread.start()

    generated_text = ""
    for word in streamer:
        generated_text += word
        yield _clean_response(generated_text)

    # The stream has ended, so the worker is finishing up; reap it.
    thread.join()
# Gradio UI: introductory text, a slider for the token budget, and the chat
# widget that forwards each message (plus the slider value) to `generate`.
with gr.Blocks() as demo:
    gr.Markdown(
        value="""
# Phi-2 Chatbot Demo
This chatbot was created using Microsoft's 2.7 billion parameter [phi-2](https://huggingface.co/microsoft/phi-2) Transformer model.
In order to reduce the response time on this hardware, `max_new_tokens` has been set to `21` in the text generation pipeline. With this default configuration, it takes approximately `60 seconds` for the response to start being generated, and streamed one word at a time. Use the slider below to increase or decrease the length of the generated text.
"""
    )
    tokens_slider = gr.Slider(
        minimum=8,
        maximum=128,
        value=21,
        label="Maximum new tokens",
        info="A larger `max_new_tokens` parameter value gives you longer text responses but at the cost of a slower response time.",
    )
    chatbot = gr.ChatInterface(
        fn=generate,
        examples=[["Who is Leonhard Euler?"]],
        stop_btn=None,
        additional_inputs=[tokens_slider],
    )

# Enable request queuing, then start the app.
demo.queue()
demo.launch()