hosseinhimself committed
Commit 7d2d3dd · verified · 1 parent: f892dbc

Update app.py

Files changed (1): app.py (+31 −16)
app.py CHANGED

@@ -4,7 +4,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 
 model_name = "hosseinhimself/ISANG-v1.0-8B"
 
-# Disable GPU globally
+# Ensure CUDA is not used
 torch.set_default_device("cpu")
 
 # Load tokenizer globally
@@ -12,12 +12,12 @@ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
 def load_model():
     try:
-        # Load the model with optimizations for CPU
+        # Load the model without `bitsandbytes` or CUDA
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            torch_dtype=torch.float32,  # Ensure compatibility with CPU
+            torch_dtype=torch.float32,  # Use standard float32 for CPU
             trust_remote_code=True,
-            low_cpu_mem_usage=True  # Reduce memory usage during loading
+            low_cpu_mem_usage=True  # Optimize for CPU
         )
         model.to("cpu")  # Explicitly load the model on CPU
         print("Model loaded successfully on CPU.")
@@ -26,7 +26,7 @@ def load_model():
         print(f"Error loading model: {e}")
         raise
 
-def chat(prompt, history):
+def stream_chat(prompt, history):
     model = load_model()
     # Add system prompt
     system_prompt = "You are ISANG, a multilingual large language model made by ISANG AI. You only respond in Persian, Korean, or English. If a user uses one of these languages, reply in the same language."
@@ -37,26 +37,41 @@ def chat(prompt, history):
         context += f"User: {user_message}\nBot: {bot_message}\n"
     context += f"User: {prompt}\nBot:"
 
-    # Generate a response
+    # Generate a response incrementally
     inputs = tokenizer(context, return_tensors="pt", truncation=True, max_length=512)
-    with torch.no_grad():
-        outputs = model.generate(**inputs, max_new_tokens=200, temperature=0.7)
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    output_ids = model.generate(
+        **inputs,
+        max_new_tokens=200,
+        temperature=0.7,
+        do_sample=True,
+        return_dict_in_generate=True,
+        output_scores=False
+    )
 
-    # Extract the latest response
-    response = response[len(context):].strip()
-    history.append((prompt, response))
-    return history, response
+    response_ids = output_ids.sequences[0]
+    decoded_text = tokenizer.decode(response_ids, skip_special_tokens=True)
 
-gradio_app = gr.ChatInterface(
-    fn=chat,
+    # Stream response word by word
+    response = decoded_text[len(context):].strip()
+    words = response.split()
+    history.append((prompt, ""))  # Add the prompt to history with an empty response initially
+    for i, word in enumerate(words):
+        # Append the next word to the history
+        history[-1] = (prompt, " ".join(words[: i + 1]))
+        yield history, " ".join(words[: i + 1])  # Stream the current response
+
+gradio_app = gr.Interface(
+    fn=stream_chat,
+    inputs=[gr.Textbox(lines=2, placeholder="Enter your message here..."), "state"],
+    outputs=["state", "text"],
     title="ISANG Chatbot",
     description="This is a chatbot powered by the ISANG model. Enter your messages to chat with it!",
     examples=[
         ["سلام، چطوری؟"],
         ["برام یه داستان تعریف کن"],
         ["نظرت درباره هوش مصنوعی چیه؟"]
-    ]
+    ],
+    live=True  # Enable live streaming for Gradio
 )
 
 if __name__ == "__main__":
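
Note that the new `stream_chat` still runs `model.generate` to completion and only afterwards yields the finished reply word by word, so the streaming is simulated. For comparison, here is a minimal sketch of token-level streaming using transformers' `TextIteratorStreamer`; it reuses `tokenizer` and `load_model()` from app.py, while `build_context` is a hypothetical helper standing in for the system-prompt and history-formatting loop above, and the generation arguments mirror the commit's:

```python
from threading import Thread
from transformers import TextIteratorStreamer

def stream_chat_tokens(prompt, history):
    # Sketch only: assumes app.py's module-level tokenizer and load_model();
    # build_context is a hypothetical helper, not part of the commit.
    model = load_model()
    context = build_context(prompt, history)
    inputs = tokenizer(context, return_tensors="pt", truncation=True, max_length=512)

    # The streamer decodes tokens as generate() produces them; generation
    # runs in a background thread so the stream can be consumed here.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    thread = Thread(
        target=model.generate,
        kwargs=dict(**inputs, max_new_tokens=200, temperature=0.7,
                    do_sample=True, streamer=streamer),
    )
    thread.start()

    history.append((prompt, ""))
    partial = ""
    for new_text in streamer:
        partial += new_text
        history[-1] = (prompt, partial)
        yield history, partial  # same (state, text) shape as stream_chat
    thread.join()
```

With this pattern each decoded chunk reaches the UI as soon as it is generated, and the word-splitting loop in `stream_chat` becomes unnecessary.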