alfredplpl committed
Commit 4fffa9e · verified · 1 Parent(s): 44c960c

Update app.py

Files changed (1):
  1. app.py +23 -9
app.py CHANGED
@@ -45,7 +45,7 @@ h1 {
 
 # Load the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("llm-jp/llm-jp-13b-instruct-full-ac_001_16x-dolly-ichikara_004_001_single-oasst-oasst2-v2.0")
-model = AutoModelForCausalLM.from_pretrained("llm-jp/llm-jp-13b-instruct-full-ac_001_16x-dolly-ichikara_004_001_single-oasst-oasst2-v2.0", device_map="auto", torch_dtype=torch.bfloat16)
+model = AutoModelForCausalLM.from_pretrained("llm-jp/llm-jp-13b-instruct-full-ac_001_16x-dolly-ichikara_004_001_single-oasst-oasst2-v2.0", device_map="cuda", torch_dtype=torch.bfloat16)
 model=model.eval()
 
 @spaces.GPU()
@@ -70,16 +70,30 @@ def chat_llm_jp_v2(message: str,
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
     conversation.append({"role": "user", "content": message})
 
-    tokenized_input = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=True, return_tensors="pt").to(model.device)
-    output = model.generate(
-        tokenized_input,
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
-        top_p=0.95,
         temperature=temperature,
-        repetition_penalty=1.05,
-    )[0]
-    return tokenizer.decode(output)
+        top_p=0.95,
+        repetition_penalty=1.1,
+        eos_token_id=terminators,
+    )
+    # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash.
+    if temperature == 0:
+        generate_kwargs['do_sample'] = False
+
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        print(outputs)
+        yield "".join(outputs)
 
 
 # Gradio block
@@ -97,7 +111,7 @@ with gr.Blocks(fill_height=True, css=css) as demo:
     additional_inputs=[
         gr.Slider(minimum=0.1,
                   maximum=1,
-                  step=0.1,
+                  step=0.0,
                   value=0.7,
                   label="Temperature",
                   render=False),
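
A note on the streaming path this commit introduces: the new code references input_ids and terminators, and no import changes are visible in these hunks, so Thread, TextIteratorStreamer, and those two names must be defined elsewhere in app.py. The sketch below is a minimal, self-contained version of the same pattern, with assumed definitions: input_ids built with apply_chat_template (as in the removed tokenized_input line) and terminators set to the tokenizer's EOS token id. It illustrates the technique, not the exact file contents.

from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MODEL_ID = "llm-jp/llm-jp-13b-instruct-full-ac_001_16x-dolly-ichikara_004_001_single-oasst-oasst2-v2.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="cuda", torch_dtype=torch.bfloat16
).eval()


def stream_chat(message: str, temperature: float = 0.7, max_new_tokens: int = 512):
    conversation = [{"role": "user", "content": message}]

    # Assumed: the prompt is chat-templated exactly as the removed line did it.
    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=True, return_tensors="pt"
    ).to(model.device)

    # Assumed: generation stops at the model's end-of-sequence token.
    terminators = [tokenizer.eos_token_id]

    # skip_prompt=True drops the echoed prompt; skip_special_tokens drops EOS markers.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )

    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=0.95,
        repetition_penalty=1.1,
        eos_token_id=terminators,
    )
    # Sampling with temperature 0 raises an error in transformers, so fall back to greedy.
    if temperature == 0:
        generate_kwargs["do_sample"] = False

    # generate() blocks until done, so it runs in a worker thread while this
    # generator consumes the streamer and yields the partial response so far.
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)

Yielding the growing "".join(outputs) string on every chunk is what lets gr.ChatInterface render the reply incrementally; each yield replaces the message shown in the UI rather than appending to it.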