# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,2,3"
import torch
import gradio as gr
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_path = "CubeAI/Zhuji-Internet-Literature-Intelligent-Writing-Model-V1.0"
tokenizer = AutoTokenizer.from_pretrained(model_path, encode_special_tokens=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    attn_implementation="flash_attention_2",  # requires the flash-attn package
    device_map="auto",
)
model = torch.compile(model)
model = model.eval()

DESCRIPTION = '''

Long-form novel synopsis test for our in-house model

This Space showcases the capabilities of our in-house model in the long-form fiction domain. The model is specially optimized for long-form novel generation and understanding tasks, and comes in two sizes: a base edition and an advanced edition.

📚 If you are interested in applying the model to long-form novel writing and analysis, feel free to start with the base edition for an initial exploration.

🚀 For users seeking more advanced features and deeper analysis, the advanced edition offers stronger generation capabilities and more fine-grained text understanding.

''' LICENSE = """

---
Built with NovelGen
"""

PLACEHOLDER = """

AI-powered writing

AI-assisted writing

""" css = """ h1 { text-align: center; display: block; } #duplicate-button { margin: auto; color: white; background: #1565c0; border-radius: 100vh; } """ tokenizer.chat_template = """{% for message in messages %} {% if message['role'] == 'user' %} {{'<|user|>'+ message['content'].strip() + '<|observation|>'+ '<|assistant|>'}} {% elif message['role'] == 'system' %} {{ '<|system|>' + message['content'].strip() + '<|observation|>'}} {% elif message['role'] == 'assistant' %} {{ message['content'] + '<|observation|>'}} {% endif %} {% endfor %}""".replace("\n", "").replace(" ", "") def chat_zhuji( message: str, history: list, temperature: float, max_new_tokens: int ) -> str: """ Generate a streaming response using the llama3-8b model. Args: message (str): The input message. history (list): The conversation history used by ChatInterface. temperature (float): The temperature for generating the response. max_new_tokens (int): The maximum number of new tokens to generate. Returns: str: The generated response. """ conversation = [] #<|system|><|observation|><|user|> for user, assistant in history: conversation.extend([{"role": "system","content": "",},{"role": "user", "content": user}, {"role": "<|assistant|>", "content": assistant}]) conversation.append({"role": "user", "content": message}) input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device) streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True) generate_kwargs = dict( input_ids= input_ids, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, penalty_alpha= 0.65, top_p= 0.90, top_k= 35, use_cache= True, eos_token_id= tokenizer.encode("<|observation|>",add_special_tokens= False), temperature=temperature, ) # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash. if temperature == 0: generate_kwargs['do_sample'] = False t = Thread(target=model.generate, kwargs=generate_kwargs) t.start() outputs = [] for text in streamer: outputs.append(text) yield "".join(outputs) # Gradio block chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface') text_box= gr.Textbox(show_copy_button= True) with gr.Blocks(fill_height=True, css=css) as demo: #gr.Markdown(DESCRIPTION) gr.ChatInterface( fn=chat_zhuji, chatbot=chatbot, textbox= text_box, fill_height=True, additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False), additional_inputs=[ gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature", render=False), gr.Slider(minimum=2048, maximum=8192*2, step=1, value=8192*2, label="Max new tokens", render=False ), ], examples=[ ['请给一个古代美女的外貌来一段描写'], ['请生成4个东方神功的招式名称'], ['生成一段官军和倭寇打斗的场面描写'], ['生成一个都市大女主的角色档案'], ], cache_examples=False, ) gr.Markdown(LICENSE) if __name__ == "__main__": demo.launch( #server_name='0.0.0.0', #server_port=config.webui_config.port, #inbrowser=True, share=True )