jjsprockel committed
Commit 9db7d28 · verified · 1 Parent(s): 23f9342

Update app.py

Files changed (1): app.py (+18 -105)
app.py CHANGED
@@ -3,12 +3,8 @@ import gradio as gr
import torch
import bitsandbytes
from unsloth import FastLanguageModel
- from transformers import TextStreamer, StoppingCriteriaList, StoppingCriteria
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- #client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
+ from transformers import TextStreamer, StoppingCriteriaList, StoppingCriteria, TextIteratorStreamer
+ from threading import Thread


model, tokenizer = FastLanguageModel.from_pretrained(
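The two imports added in this hunk, TextIteratorStreamer and Thread, are the standard pair for token-by-token streaming with transformers: generate() runs in a background thread while the streamer yields decoded text back to the Gradio callback. A minimal sketch of that pattern, assuming the model and tokenizer loaded above (illustrative only, not the exact code in app.py; stream_reply is a hypothetical name):

from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(prompt):
    # Tokenize the prompt and move it to the GPU the model lives on.
    model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    # skip_prompt=True keeps the echoed prompt out of the streamed output.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=2048)
    # generate() blocks, so run it in a background thread and read from the streamer here.
    Thread(target=model.generate, kwargs=generate_kwargs).start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial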
@@ -20,90 +16,6 @@ model, tokenizer = FastLanguageModel.from_pretrained(

FastLanguageModel.for_inference(model)

- '''
- alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-
- ### Instruction:
- {}
-
- ### Input:
- {}
-
- ### Response:
- {}"""
-
- def texto_Patol(input):
-     FastLanguageModel.for_inference(model)
-     inputs = tokenizer(
-     [
-         alpaca_prompt.format(
-             input, # instruction
-             "", # input
-             "", # output - leave this blank for generation!
-         )
-     ], return_tensors = "pt")
-     #.to("cuda")
-
-     outputs = model.generate(**inputs, max_new_tokens = 2048, use_cache = True)
-     return tokenizer.batch_decode(outputs)
- '''
-
- """
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
- """
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
-
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
- """
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = [29, 0]
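StopOnTokens, visible as trailing context here, subclasses StoppingCriteria, and StoppingCriteriaList is imported so the criterion can be handed to generate(). The body of __call__ is truncated in this hunk, so the check below is an assumed reconstruction of the usual pattern, not the repository's exact code:

class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = [29, 0]              # token ids treated as end-of-turn markers
        for stop_id in stop_ids:        # assumed body; truncated in the hunk above
            if input_ids[0][-1] == stop_id:
                return True
        return False

# The criterion would then be attached to generation roughly like this:
stopping_criteria = StoppingCriteriaList([StopOnTokens()])
# model.generate(**model_inputs, stopping_criteria=stopping_criteria, max_new_tokens=2048)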
@@ -116,13 +28,27 @@ def predict(message, history):
    history_transformer_format = history + [[message, ""]]
    stop = StopOnTokens()

+     '''
+     inputs = tokenizer(
+     [
+         alpaca_prompt.format(
+             message, # instruction
+             "", # input
+             "", # output - leave this blank for generation!
+         )
+     ], return_tensors = "pt").to("cuda")
+
+     text_streamer = TextStreamer(tokenizer)
+     _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 2048)
+     '''
+
    messages = "".join(["".join(["\n<human>:"+item[0], "\n<bot>:"+item[1]])
                for item in history_transformer_format])

    model_inputs = tokenizer([messages], return_tensors="pt").to("cuda")
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
-         model_inputs=inputs,
+         model_inputs,
        streamer=streamer,
        max_new_tokens=2048,
        #do_sample=True,
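The single-line change in this hunk switches generate_kwargs from the keyword form model_inputs=inputs (a name that does not match the model_inputs tensors built two lines earlier) to passing model_inputs positionally. Because the tokenizer returns a BatchEncoding, which behaves as a mapping, dict() unpacks it, so generate() later receives input_ids and attention_mask as real keyword arguments. A small illustration, using a hypothetical prompt string:

# Illustration: dict() unpacks the BatchEncoding mapping, so the prompt tensors
# and the generation settings end up in a single kwargs dict.
model_inputs = tokenizer(["\n<human>:hola\n<bot>:"], return_tensors="pt")
generate_kwargs = dict(model_inputs, max_new_tokens=8)
# Typically prints ['attention_mask', 'input_ids', 'max_new_tokens'].
print(sorted(generate_kwargs.keys()))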
@@ -143,17 +69,4 @@ def predict(message, history):

gr.ChatInterface(predict).launch(debug=True)

- """
- demo = gr.Interface(fn=texto_Patol,
-                     inputs=[gr.Textbox(label="Ingresa una pregunta acerca de Patología", lines=2)],
-                     outputs=[gr.HighlightedText(label="Respuesta")],
-                     title="Chat de Patología en Español",
-                     description="Utiliza la primera versión del modelo ajustado Patologia_lora_model1 haciendo preguntas de patología",
-                     allow_flagging="never",
-                     #Here we introduce a new tag, examples, easy to use examples for your application
-                     examples=["¿Cuál es el mecanismo de acción del crizotinib?", "Cuál es la mutación accionable más frecuente en cáncer de mama"])
- demo.launch(share=True, debug=True)
- """
-
- #if __name__ == "__main__":
- #    demo.launch()
+ gr.close_all()
 