import os
import sys

import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, pipeline

# Define the model repository
# REPO_NAME = 'schuler/experimental-JP47D20'
REPO_NAME = 'schuler/experimental-JP47D21-KPhi-3-micro-4k-instruct'

# How to cache?
@spaces.GPU()
def load_model(repo_name):
    # Load the tokenizer, the generation defaults and the model weights from the Hub.
    tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
    generator_conf = GenerationConfig.from_pretrained(repo_name)
    model = AutoModelForCausalLM.from_pretrained(
        repo_name,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="eager",
    )
    model.to('cuda')
    return tokenizer, generator_conf, model

tokenizer, generator_conf, model = load_model(REPO_NAME)
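
# A minimal sketch of one possible answer to the "How to cache?" note above:
# memoize load_model() so that repeated calls in the same process reuse the
# already-loaded tokenizer/model instead of reloading from the Hub. The helper
# name load_model_cached and the use of functools.lru_cache are illustrative
# assumptions; the app itself does not call this function.
import functools

@functools.lru_cache(maxsize=1)
def load_model_cached(repo_name):
    # lru_cache stores the returned (tokenizer, generator_conf, model) tuple,
    # so later calls with the same repo_name return the cached objects.
    return load_model(repo_name)
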
global_error = ''
try:
    # Build a text-generation pipeline around the loaded model and tokenizer.
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
except Exception as e:
    global_error = f"Failed to load model: {str(e)}"
@spaces.GPU()
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    result = 'none'
    try:
        # Build the conversation prompt
        prompt = ''
        messages = []
        if len(system_message) > 0:
            prompt = f"<|assistant|>{system_message}<|end|>\n"
        for user_text, assistant_text in history:
            if user_text:
                messages.append({"role": "user", "content": user_text})
            if assistant_text:
                messages.append({"role": "assistant", "content": assistant_text})
        messages.append({"role": "user", "content": message})
        for hmessage in messages:
            role = "<|assistant|>" if hmessage['role'] == 'assistant' else "<|user|>"
            prompt += f"{role}{hmessage['content']}<|end|>"
        # prompt += f"<|user|>{message}<|end|><|assistant|>"
        prompt += "<|assistant|>"
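        # Illustrative example (not executed): for system message "You are helpful.",
        # one prior exchange and the new user message "How are you?", the prompt
        # built above would look like:
        #   <|assistant|>You are helpful.<|end|>
        #   <|user|>Hello<|end|><|assistant|>Hi!<|end|><|user|>How are you?<|end|><|assistant|>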
# """
# Generate the response
response_output = generator(
prompt,
generation_config=generator_conf,
max_new_tokens=max_tokens,
do_sample=True,
top_p=top_p,
repetition_penalty=1.2,
temperature=temperature
)
generated_text = response_output[0]['generated_text']
# Extract the assistant's response
result = generated_text[len(prompt):].strip()
# """
# result = prompt +':'+result
except Exception as error:
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
result = str(error) +':'+ exc_type +':'+ fname +':'+ exc_tb.tb_lineno
yield result
"""
for message in client.chat_completion(
messages,
max_tokens=max_tokens,
stream=True,
temperature=temperature,
top_p=top_p,
):
token = message.choices[0].delta.content
response += token
yield response
"""
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value=global_error, label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=64, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.25,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
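
# gr.ChatInterface passes the additional_inputs values to respond() as extra
# positional arguments after message and history, in the order listed above:
# system_message, max_tokens, temperature, top_p. The "System message" box is
# pre-filled with global_error, which surfaces any pipeline-creation failure
# directly in the UI.
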
if __name__ == "__main__":
    demo.launch()