gaochangkuan
committed on
Commit
•
5bebd03
1
Parent(s):
f27de23
Update app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,9 @@ import gradio as gr
|
|
4 |
from transformers import GemmaTokenizer, AutoModelForCausalLM
|
5 |
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextIteratorStreamer
|
6 |
from threading import Thread
|
|
|
7 |
|
|
|
8 |
# Set an environment variable
|
9 |
token = os.getenv('HUGGINGFACE_TOKEN')
|
10 |
|
@@ -15,13 +17,13 @@ model= AutoModelForCausalLM.from_pretrained(
|
|
15 |
torch_dtype= torch.bfloat16,
|
16 |
low_cpu_mem_usage= True,
|
17 |
token=token,
|
18 |
-
|
19 |
-
|
20 |
)
|
21 |
|
22 |
|
23 |
model = torch.compile(model)
|
24 |
-
model.to("cuda")
|
25 |
model = model.eval()
|
26 |
|
27 |
|
@@ -93,7 +95,7 @@ def chat_zhuji(
|
|
93 |
conversation.extend([{"role": "system","content": "",},{"role": "user", "content": user}, {"role": "<|assistant|>", "content": assistant}])
|
94 |
conversation.append({"role": "user", "content": message})
|
95 |
|
96 |
-
input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(
|
97 |
|
98 |
streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
|
99 |
|
|
|
4 |
from transformers import GemmaTokenizer, AutoModelForCausalLM
|
5 |
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextIteratorStreamer
|
6 |
from threading import Thread
|
7 |
+
import subprocess
|
8 |
|
9 |
+
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
|
10 |
# Set an environment variable
|
11 |
token = os.getenv('HUGGINGFACE_TOKEN')
|
12 |
|
|
|
17 |
torch_dtype= torch.bfloat16,
|
18 |
low_cpu_mem_usage= True,
|
19 |
token=token,
|
20 |
+
attn_implementation="flash_attention_2",
|
21 |
+
device_map= "auto"
|
22 |
)
|
23 |
|
24 |
|
25 |
model = torch.compile(model)
|
26 |
+
#model.to("cuda")
|
27 |
model = model.eval()
|
28 |
|
29 |
|
|
|
95 |
conversation.extend([{"role": "system","content": "",},{"role": "user", "content": user}, {"role": "<|assistant|>", "content": assistant}])
|
96 |
conversation.append({"role": "user", "content": message})
|
97 |
|
98 |
+
input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
|
99 |
|
100 |
streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
|
101 |
|