MohamedRashad committed
Commit c52807a · verified · 1 Parent(s): 3662bb4

Update app.py

Files changed (1):
  1. app.py +6 -14
app.py CHANGED
@@ -10,8 +10,8 @@ import spaces
 
 # Load model directly
 device = "cuda" if torch.cuda.is_available() else "cpu"
-tokenizer = AutoTokenizer.from_pretrained("Navid-AI/Mulhem-1-Mini", token=os.getenv("HF_TOKEN"))
-model = AutoModelForCausalLM.from_pretrained("Navid-AI/Mulhem-1-Mini", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", token=os.getenv("HF_TOKEN")).to(device)
+tokenizer = AutoTokenizer.from_pretrained("Navid-AI/Yehia-R1-3B", token=os.getenv("HF_TOKEN"))
+model = AutoModelForCausalLM.from_pretrained("Navid-AI/Yehia-R1-3B", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", token=os.getenv("HF_TOKEN")).to(device)
 streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
 @spaces.GPU
@@ -23,7 +23,6 @@ def respond(
     max_tokens,
     temperature,
     repetition_penalty,
-    top_p,
 ):
     messages = [{"role": "system", "content": system_message}]
 
@@ -36,7 +35,7 @@
     messages.append({"role": "user", "content": message})
     print(messages)
     inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True, enable_reasoning=enable_reasoning, return_dict=True).to(device)
-    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p, repetition_penalty=repetition_penalty)
+    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_tokens, temperature=temperature, repetition_penalty=repetition_penalty, eos_token_id=tokenizer.convert_tokens_to_ids("<|im_end|>"))
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
 
     thread.start()
@@ -51,16 +50,9 @@ demo = gr.ChatInterface(
     additional_inputs=[
         gr.Checkbox(label="Enable reasoning", value=False),
         gr.Textbox(value="أنت مُلهم. ذكاء اصطناعي تم إنشاؤه من شركة نفيد لإلهام وتحفيز المستخدمين على التعلّم، النمو، وتحقيق أهدافهم.", label="System message"),
-        gr.Slider(minimum=1, maximum=8192, value=2048, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=2.0, value=1.25, step=0.05, label="Repetition penalty"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
+        gr.Slider(minimum=1, maximum=8192, value=4096, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.6, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=2.0, value=1.1, step=0.05, label="Repetition penalty"),
     ],
 )
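Note: the Arabic system message in the diff translates to "You are Mulhem. An AI created by the company Navid to inspire and motivate users to learn, grow, and achieve their goals."

The changed lines all sit inside the standard transformers threaded-streaming pattern: model.generate blocks until it finishes, so the app runs it on a worker thread and reads decoded chunks from a TextIteratorStreamer. A minimal, self-contained sketch of that pattern follows; the model id matches the diff, but the prompt and generation values are illustrative placeholders, not the Space's exact code.

# Minimal sketch of the threaded streaming pattern used in app.py.
# The prompt and generation values below are placeholders.
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "Navid-AI/Yehia-R1-3B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# skip_prompt drops the echoed input; skip_special_tokens hides markers
# such as <|im_end|> from the streamed text.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    return_tensors="pt",
    add_generation_prompt=True,
    return_dict=True,
)

# generate() blocks until completion, so it runs on a worker thread while
# the caller iterates the streamer; stopping on <|im_end|> mirrors the
# eos_token_id change in this commit.
generation_kwargs = dict(
    inputs,
    streamer=streamer,
    max_new_tokens=64,
    eos_token_id=tokenizer.convert_tokens_to_ids("<|im_end|>"),
)
Thread(target=model.generate, kwargs=generation_kwargs).start()

response = ""
for chunk in streamer:  # yields decoded text as tokens arrive
    response += chunk
    print(chunk, end="", flush=True)

In the Space, respond presumably accumulates these chunks and yields the growing string, which is the streaming contract gr.ChatInterface expects.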
 
 
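One subtlety the diff preserves: enable_reasoning is not a built-in apply_chat_template parameter. Extra keyword arguments to apply_chat_template are forwarded to the model's Jinja chat template as variables, so a template can branch on the flag. A hedged sketch, reusing the tokenizer from the snippet above and assuming a chat template that reads the variable:

# Hedged sketch: extra kwargs become Jinja variables in the chat template,
# which could contain a branch such as:
#   {% if enable_reasoning %}<think>{% endif %}
prompt_text = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is 2 + 2?"}],
    add_generation_prompt=True,
    tokenize=False,          # return the rendered prompt string for inspection
    enable_reasoning=True,   # forwarded to the template renderer
)
print(prompt_text)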