CreitinGameplays committed (verified)
Commit d6af013 · Parent(s): 335ddf3

Update app.py

Files changed (1):
  1. app.py +28 -20
app.py CHANGED
@@ -3,7 +3,6 @@ from threading import Thread
 from typing import Iterator
 
 import gradio as gr
-import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
@@ -15,19 +14,34 @@ DESCRIPTION = """\
 # ConvAI 9b v2 Chat
 """
 
-if not torch.cuda.is_available():
-    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
-
-
-if torch.cuda.is_available():
-    model_id = "CreitinGameplays/ConvAI-9b-v2"
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
+# Load model with appropriate device configuration
+def load_model():
+    model_id = "CreitinGameplays/dumbbot"
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # If using CPU, load in 32-bit to avoid potential issues with 16-bit operations
+    if device == "cpu":
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.float32,
+            low_cpu_mem_usage=True
+        )
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.float16,
+            device_map="auto"
+        )
+
     tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
     tokenizer.use_default_system_prompt = False
+
+    return model, tokenizer, device
+
+model, tokenizer, device = load_model()
 
-system_prompt_text = "You are a helpful, respectful and honest AI assistant named ChatGPT. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information."
+system_prompt_text = "You are Ricardinho."
 
-@spaces.GPU(duration=90)
 def generate(
     message: str,
     chat_history: list[tuple[str, str]],
@@ -49,11 +63,11 @@ def generate(
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(model.device)
+    input_ids = input_ids.to(device)
 
-    streamer = TextIteratorStreamer(tokenizer, timeout=5.0, skip_prompt=True, skip_special_tokens=True)
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        {"input_ids": input_ids},
+        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
@@ -71,7 +85,6 @@ def generate(
         outputs.append(text)
         yield "".join(outputs)
 
-
 chat_interface = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -122,13 +135,8 @@ chat_interface = gr.ChatInterface(
     ],
 )
 
-with gr.Blocks(css="style.css") as demo:
+with gr.Blocks() as demo:
     gr.Markdown(DESCRIPTION)
-    gr.DuplicateButton(
-        value="Duplicate Space for private use",
-        elem_id="duplicate-button",
-        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
-    )
     chat_interface.render()
 
 if __name__ == "__main__":
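
The two streamer tweaks above (timeout 5.0 → 10.0, and passing input_ids as a keyword argument instead of a positional dict) feed the usual Transformers streaming pattern that the rest of generate() follows: model.generate() blocks until decoding finishes, so it runs on a worker thread while the caller drains the streamer and yields partial text. A minimal sketch of that pattern, using a tiny placeholder model rather than this Space's model:

# Minimal sketch of the threaded streaming pattern; "sshleifer/tiny-gpt2"
# is a placeholder stand-in, not the model this commit uses.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "sshleifer/tiny-gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

input_ids = tokenizer("Hello", return_tensors="pt").input_ids

# timeout bounds how long each iteration below waits for the next chunk;
# raising it from 5 s to 10 s gives slow CPU decoding more headroom.
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs on a background thread while the main
# thread consumes decoded text as it arrives.
thread = Thread(target=model.generate, kwargs=dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=32,
    do_sample=True,
))
thread.start()

outputs = []
for text in streamer:
    outputs.append(text)
    print("".join(outputs))  # the app yields this string to Gradio instead
thread.join()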
 
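A note on the new CPU branch: float16 kernel coverage on CPU is patchy in PyTorch, so loading in float32 avoids op errors and numerical surprises, and low_cpu_mem_usage=True loads weights incrementally instead of materializing a second full copy in RAM. A hypothetical variant (not part of this commit) would be torch.bfloat16, which halves memory like float16 but keeps float32's exponent range and runs well on recent CPUs:

# Hypothetical alternative to the commit's CPU branch; a sketch only,
# not what this commit ships.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "CreitinGameplays/dumbbot",   # the model ID this commit switches to
    torch_dtype=torch.bfloat16,   # half the memory of float32, CPU-friendly
    low_cpu_mem_usage=True,       # stream weights in to cap peak RAM
)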