hosseinhimself committed
Commit 7d2d3dd · verified · 1 parent: f892dbc

Update app.py

Files changed (1): app.py (+31 −16)
app.py CHANGED

@@ -4,7 +4,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 
 model_name = "hosseinhimself/ISANG-v1.0-8B"
 
-# Disable GPU globally
+# Ensure CUDA is not used
 torch.set_default_device("cpu")
 
 # Load tokenizer globally
@@ -12,12 +12,12 @@ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
 def load_model():
     try:
-        # Load the model with optimizations for CPU
+        # Load the model without `bitsandbytes` or CUDA
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            torch_dtype=torch.float32,  # Ensure compatibility with CPU
+            torch_dtype=torch.float32,  # Use standard float32 for CPU
             trust_remote_code=True,
-            low_cpu_mem_usage=True  # Reduce memory usage during loading
+            low_cpu_mem_usage=True  # Optimize for CPU
         )
         model.to("cpu")  # Explicitly load the model on CPU
         print("Model loaded successfully on CPU.")
@@ -26,7 +26,7 @@ def load_model():
         print(f"Error loading model: {e}")
         raise
 
-def chat(prompt, history):
+def stream_chat(prompt, history):
     model = load_model()
     # Add system prompt
     system_prompt = "You are ISANG, a multilingual large language model made by ISANG AI. You only respond in Persian, Korean, or English. If a user uses one of these languages, reply in the same language."
@@ -37,26 +37,41 @@ def chat(prompt, history):
         context += f"User: {user_message}\nBot: {bot_message}\n"
     context += f"User: {prompt}\nBot:"
 
-    # Generate a response
+    # Generate a response incrementally
     inputs = tokenizer(context, return_tensors="pt", truncation=True, max_length=512)
-    with torch.no_grad():
-        outputs = model.generate(**inputs, max_new_tokens=200, temperature=0.7)
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    output_ids = model.generate(
+        **inputs,
+        max_new_tokens=200,
+        temperature=0.7,
+        do_sample=True,
+        return_dict_in_generate=True,
+        output_scores=False
+    )
 
-    # Extract the latest response
-    response = response[len(context):].strip()
-    history.append((prompt, response))
-    return history, response
+    response_ids = output_ids.sequences[0]
+    decoded_text = tokenizer.decode(response_ids, skip_special_tokens=True)
 
-gradio_app = gr.ChatInterface(
-    fn=chat,
+    # Stream response word by word
+    response = decoded_text[len(context):].strip()
+    words = response.split()
+    history.append((prompt, ""))  # Add the prompt to history with an empty response initially
+    for i, word in enumerate(words):
+        # Append the next word to the history
+        history[-1] = (prompt, " ".join(words[: i + 1]))
+        yield history, " ".join(words[: i + 1])  # Stream the current response
+
+gradio_app = gr.Interface(
+    fn=stream_chat,
+    inputs=[gr.Textbox(lines=2, placeholder="Enter your message here..."), "state"],
+    outputs=["state", "text"],
     title="ISANG Chatbot",
     description="This is a chatbot powered by the ISANG model. Enter your messages to chat with it!",
     examples=[
         ["سلام، چطوری؟"],
         ["برام یه داستان تعریف کن"],
         ["نظرت درباره هوش مصنوعی چیه؟"]
-    ]
+    ],
+    live=True  # Enable live streaming for Gradio
 )
 
 if __name__ == "__main__":
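
Note that the new `stream_chat` still runs `model.generate` to completion and only afterwards yields the finished reply word by word, so the streaming is simulated. For comparison, here is a minimal sketch of token-level streaming using transformers' `TextIteratorStreamer`; it reuses `tokenizer` and `load_model()` from app.py, while `build_context` is a hypothetical helper standing in for the system-prompt and history-formatting loop above, and the generation arguments mirror the commit's:

```python
from threading import Thread
from transformers import TextIteratorStreamer

def stream_chat_tokens(prompt, history):
    # Sketch only: assumes app.py's module-level tokenizer and load_model();
    # build_context is a hypothetical helper, not part of the commit.
    model = load_model()
    context = build_context(prompt, history)
    inputs = tokenizer(context, return_tensors="pt", truncation=True, max_length=512)

    # The streamer decodes tokens as generate() produces them; generation
    # runs in a background thread so the stream can be consumed here.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    thread = Thread(
        target=model.generate,
        kwargs=dict(**inputs, max_new_tokens=200, temperature=0.7,
                    do_sample=True, streamer=streamer),
    )
    thread.start()

    history.append((prompt, ""))
    partial = ""
    for new_text in streamer:
        partial += new_text
        history[-1] = (prompt, partial)
        yield history, partial  # same (state, text) shape as stream_chat
    thread.join()
```

With this pattern each decoded chunk reaches the UI as soon as it is generated, and the word-splitting loop in `stream_chat` becomes unnecessary.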