seanpedrickcase committed on
Commit
f301d67
·
1 Parent(s): 8aa3ebb

Upgraded Gradio and other packages to their latest versions. Replaced Ctransformers with llama-cpp-python.

Browse files
Files changed (6) hide show
  1. Dockerfile +3 -2
  2. README.md +1 -1
  3. app.py +105 -20
  4. chatfuncs/chatfuncs.py +209 -45
  5. chatfuncs/ingest.py +1 -1
  6. requirements.txt +8 -5
Dockerfile CHANGED
@@ -13,13 +13,14 @@ USER user
13
  # Set home to the user's home directory
14
  ENV HOME=/home/user \
15
  PATH=/home/user/.local/bin:$PATH \
16
- PYTHONPATH=$HOME/app \
17
  PYTHONUNBUFFERED=1 \
18
  GRADIO_ALLOW_FLAGGING=never \
19
  GRADIO_NUM_PORTS=1 \
20
  GRADIO_SERVER_NAME=0.0.0.0 \
21
  GRADIO_THEME=huggingface \
22
- SYSTEM=spaces
 
23
 
24
  # Set the working directory to the user's home directory
25
  WORKDIR $HOME/app
 
13
  # Set home to the user's home directory
14
  ENV HOME=/home/user \
15
  PATH=/home/user/.local/bin:$PATH \
16
+ PYTHONPATH=$HOME/app \
17
  PYTHONUNBUFFERED=1 \
18
  GRADIO_ALLOW_FLAGGING=never \
19
  GRADIO_NUM_PORTS=1 \
20
  GRADIO_SERVER_NAME=0.0.0.0 \
21
  GRADIO_THEME=huggingface \
22
+ SYSTEM=spaces \
23
+ LLAMA_CUBLAS=1
24
 
25
  # Set the working directory to the user's home directory
26
  WORKDIR $HOME/app
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🌍
4
  colorFrom: yellow
5
  colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 3.50.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
4
  colorFrom: yellow
5
  colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 4.31.5
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
app.py CHANGED
@@ -11,6 +11,12 @@ import pandas as pd
11
  from transformers import AutoTokenizer
12
  from ctransformers import AutoModelForCausalLM
13
 
 
 
 
 
 
 
14
  PandasDataFrame = Type[pd.DataFrame]
15
 
16
  # Disable cuda devices if necessary
@@ -38,7 +44,7 @@ def get_faiss_store(faiss_vstore_folder,embeddings):
38
  with zipfile.ZipFile(faiss_vstore_folder + '/' + faiss_vstore_folder + '.zip', 'r') as zip_ref:
39
  zip_ref.extractall(faiss_vstore_folder)
40
 
41
- faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings)
42
  os.remove(faiss_vstore_folder + "/index.faiss")
43
  os.remove(faiss_vstore_folder + "/index.pkl")
44
 
@@ -53,6 +59,78 @@ import chatfuncs.chatfuncs as chatf
53
  chatf.embeddings = load_embeddings(embeddings_name)
54
  chatf.vectorstore = get_faiss_store(faiss_vstore_folder="faiss_embedding",embeddings=globals()["embeddings"])
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
57
  print("Loading model")
58
 
@@ -67,26 +145,35 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
67
  if model_type == "Mistral Open Orca (larger, slow)":
68
  if torch_device == "cuda":
69
  gpu_config.update_gpu(gpu_layers)
 
70
  else:
71
  gpu_config.update_gpu(gpu_layers)
72
  cpu_config.update_gpu(gpu_layers)
73
 
74
- print("Loading with", cpu_config.gpu_layers, "model layers sent to GPU.")
75
 
76
  print(vars(gpu_config))
77
  print(vars(cpu_config))
78
 
79
  try:
80
- #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
81
- #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
82
- model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
83
- #model = AutoModelForCausalLM.from_pretrained('TheBloke/MistralLite-7B-GGUF', model_type='mistral', model_file='mistrallite.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
 
 
 
84
 
85
- except:
86
- #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(cpu_config)) #**asdict(CtransRunConfig_gpu())
87
- #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
88
- model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
89
- #model = AutoModelForCausalLM.from_pretrained('TheBloke/MistralLite-7B-GGUF', model_type='mistral', model_file='mistrallite.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
 
 
 
 
 
90
 
91
  tokenizer = []
92
 
@@ -100,14 +187,14 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
100
 
101
  if torch_device == "cuda":
102
  if "flan" in model_name:
103
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
104
  else:
105
- model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
106
  else:
107
  if "flan" in model_name:
108
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
109
  else:
110
- model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
111
 
112
  tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
113
 
@@ -179,7 +266,7 @@ with block:
179
  #chat_height = 500
180
  chatbot = gr.Chatbot(avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1) # , height=chat_height
181
  with gr.Accordion("Open this tab to see the source paragraphs used to generate the answer", open = False):
182
- sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here", scale = 1) # , height=chat_height
183
 
184
  with gr.Row():
185
  message = gr.Textbox(
@@ -233,7 +320,7 @@ with block:
233
 
234
 
235
  gr.HTML(
236
- "<center>This app is based on the models Flan Alpaca and Mistral Open Orca. It powered by Gradio, Transformers, Ctransformers, and Langchain.</a></center>"
237
  )
238
 
239
  examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
@@ -289,6 +376,4 @@ with block:
289
  # Thumbs up or thumbs down voting function
290
  chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state], None)
291
 
292
- block.queue(concurrency_count=1).launch(debug=True)
293
- # -
294
-
 
11
  from transformers import AutoTokenizer
12
  from ctransformers import AutoModelForCausalLM
13
 
14
+ import torch
15
+
16
+ import llama_cpp
17
+ from llama_cpp import Llama
18
+ from huggingface_hub import hf_hub_download
19
+
20
  PandasDataFrame = Type[pd.DataFrame]
21
 
22
  # Disable cuda devices if necessary
 
44
  with zipfile.ZipFile(faiss_vstore_folder + '/' + faiss_vstore_folder + '.zip', 'r') as zip_ref:
45
  zip_ref.extractall(faiss_vstore_folder)
46
 
47
+ faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings, allow_dangerous_deserialization=True)
48
  os.remove(faiss_vstore_folder + "/index.faiss")
49
  os.remove(faiss_vstore_folder + "/index.pkl")
50
 
 
59
  chatf.embeddings = load_embeddings(embeddings_name)
60
  chatf.vectorstore = get_faiss_store(faiss_vstore_folder="faiss_embedding",embeddings=globals()["embeddings"])
61
 
62
+ # def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
63
+ # print("Loading model")
64
+
65
+ # # Default values inside the function
66
+ # if gpu_config is None:
67
+ # gpu_config = chatf.gpu_config
68
+ # if cpu_config is None:
69
+ # cpu_config = chatf.cpu_config
70
+ # if torch_device is None:
71
+ # torch_device = chatf.torch_device
72
+
73
+ # if model_type == "Mistral Open Orca (larger, slow)":
74
+ # if torch_device == "cuda":
75
+ # gpu_config.update_gpu(gpu_layers)
76
+ # else:
77
+ # gpu_config.update_gpu(gpu_layers)
78
+ # cpu_config.update_gpu(gpu_layers)
79
+
80
+ # print("Loading with", cpu_config.gpu_layers, "model layers sent to GPU.")
81
+
82
+ # print(vars(gpu_config))
83
+ # print(vars(cpu_config))
84
+
85
+ # try:
86
+ # #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
87
+ # #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
88
+ # model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
89
+ # #model = AutoModelForCausalLM.from_pretrained('TheBloke/MistralLite-7B-GGUF', model_type='mistral', model_file='mistrallite.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
90
+
91
+ # except:
92
+ # #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(cpu_config)) #**asdict(CtransRunConfig_gpu())
93
+ # #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
94
+ # model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
95
+ # #model = AutoModelForCausalLM.from_pretrained('TheBloke/MistralLite-7B-GGUF', model_type='mistral', model_file='mistrallite.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
96
+
97
+ # tokenizer = []
98
+
99
+ # if model_type == "Flan Alpaca (small, fast)":
100
+ # # Huggingface chat model
101
+ # hf_checkpoint = 'declare-lab/flan-alpaca-large'#'declare-lab/flan-alpaca-base' # # #
102
+
103
+ # def create_hf_model(model_name):
104
+
105
+ # from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
106
+
107
+ # if torch_device == "cuda":
108
+ # if "flan" in model_name:
109
+ # model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
110
+ # else:
111
+ # model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
112
+ # else:
113
+ # if "flan" in model_name:
114
+ # model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
115
+ # else:
116
+ # model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
117
+
118
+ # tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
119
+
120
+ # return model, tokenizer, model_type
121
+
122
+ # model, tokenizer, model_type = create_hf_model(model_name = hf_checkpoint)
123
+
124
+ # chatf.model = model
125
+ # chatf.tokenizer = tokenizer
126
+ # chatf.model_type = model_type
127
+
128
+ # load_confirmation = "Finished loading model: " + model_type
129
+
130
+ # print(load_confirmation)
131
+ # return model_type, load_confirmation, model_type
132
+
133
+
134
  def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
135
  print("Loading model")
136
 
 
145
  if model_type == "Mistral Open Orca (larger, slow)":
146
  if torch_device == "cuda":
147
  gpu_config.update_gpu(gpu_layers)
148
+ print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU.")
149
  else:
150
  gpu_config.update_gpu(gpu_layers)
151
  cpu_config.update_gpu(gpu_layers)
152
 
153
+ print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU.")
154
 
155
  print(vars(gpu_config))
156
  print(vars(cpu_config))
157
 
158
  try:
159
+ model = Llama(
160
+ model_path=hf_hub_download(
161
+ repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
162
+ filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
163
+ ),
164
+ **vars(gpu_config) # change n_gpu_layers if you have more or less VRAM
165
+ )
166
 
167
+ except Exception as e:
168
+ print("GPU load failed")
169
+ print(e)
170
+ model = Llama(
171
+ model_path=hf_hub_download(
172
+ repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
173
+ filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
174
+ ),
175
+ **vars(cpu_config)
176
+ )
177
 
178
  tokenizer = []
179
 
 
187
 
188
  if torch_device == "cuda":
189
  if "flan" in model_name:
190
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
191
  else:
192
+ model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
193
  else:
194
  if "flan" in model_name:
195
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
196
  else:
197
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)
198
 
199
  tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
200
 
 
266
  #chat_height = 500
267
  chatbot = gr.Chatbot(avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1) # , height=chat_height
268
  with gr.Accordion("Open this tab to see the source paragraphs used to generate the answer", open = False):
269
+ sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here") # , height=chat_height
270
 
271
  with gr.Row():
272
  message = gr.Textbox(
 
320
 
321
 
322
  gr.HTML(
323
+ "<center>This app is based on the models Flan Alpaca and Mistral Open Orca. It powered by Gradio, Transformers, and Llama.cpp.</a></center>"
324
  )
325
 
326
  examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
 
376
  # Thumbs up or thumbs down voting function
377
  chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state], None)
378
 
379
+ block.queue().launch(debug=True)
 
 
chatfuncs/chatfuncs.py CHANGED
@@ -38,6 +38,11 @@ from gensim.corpora import Dictionary
38
  from gensim.models import TfidfModel, OkapiBM25Model
39
  from gensim.similarities import SparseMatrixSimilarity
40
 
 
 
 
 
 
41
  import gradio as gr
42
 
43
  torch.cuda.empty_cache()
@@ -70,7 +75,7 @@ kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniL
70
  # Currently set gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
71
  if torch.cuda.is_available():
72
  torch_device = "cuda"
73
- gpu_layers = 0
74
  else:
75
  torch_device = "cpu"
76
  gpu_layers = 0
@@ -96,67 +101,129 @@ context_length:int = 2048
96
  sample = True
97
 
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  class CtransInitConfig_gpu:
100
- def __init__(self, temperature=temperature,
101
- top_k=top_k,
102
- top_p=top_p,
103
- repetition_penalty=repetition_penalty,
104
  last_n_tokens=last_n_tokens,
105
- max_new_tokens=max_new_tokens,
106
  seed=seed,
107
- reset=reset,
108
- stream=stream,
109
- threads=threads,
110
- batch_size=batch_size,
111
- context_length=context_length,
112
- gpu_layers=gpu_layers):
113
- self.temperature = temperature
114
- self.top_k = top_k
115
- self.top_p = top_p
116
- self.repetition_penalty = repetition_penalty# repetition_penalty
117
  self.last_n_tokens = last_n_tokens
118
- self.max_new_tokens = max_new_tokens
119
  self.seed = seed
120
- self.reset = reset
121
- self.stream = stream
122
- self.threads = threads
123
- self.batch_size = batch_size
124
- self.context_length = context_length
125
- self.gpu_layers = gpu_layers
126
  # self.stop: list[str] = field(default_factory=lambda: [stop_string])
127
 
128
  def update_gpu(self, new_value):
129
- self.gpu_layers = new_value
130
 
131
  class CtransInitConfig_cpu(CtransInitConfig_gpu):
132
  def __init__(self):
133
  super().__init__()
134
- self.gpu_layers = 0
135
 
136
  gpu_config = CtransInitConfig_gpu()
137
  cpu_config = CtransInitConfig_cpu()
138
 
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  class CtransGenGenerationConfig:
141
  def __init__(self, temperature=temperature,
142
  top_k=top_k,
143
  top_p=top_p,
144
- repetition_penalty=repetition_penalty,
145
- last_n_tokens=last_n_tokens,
146
  seed=seed,
147
- threads=threads,
148
- batch_size=batch_size,
149
- reset=True
 
 
150
  ):
151
  self.temperature = temperature
152
  self.top_k = top_k
153
  self.top_p = top_p
154
- self.repetition_penalty = repetition_penalty# repetition_penalty
155
- self.last_n_tokens = last_n_tokens
156
  self.seed = seed
157
- self.threads = threads
158
- self.batch_size = batch_size
159
- self.reset = reset
 
 
160
 
161
  def update_temp(self, new_value):
162
  self.temperature = new_value
@@ -352,6 +419,94 @@ def create_full_prompt(user_input, history, extracted_memory, vectorstore, embed
352
  return history, docs_content_string, instruction_prompt_out
353
 
354
  # Chat functions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  def produce_streaming_answer_chatbot(history, full_prompt, model_type,
356
  temperature=temperature,
357
  max_new_tokens=max_new_tokens,
@@ -412,7 +567,9 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
412
  print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
413
 
414
  elif model_type == "Mistral Open Orca (larger, slow)":
415
- tokens = model.tokenize(full_prompt)
 
 
416
 
417
  gen_config = CtransGenGenerationConfig()
418
  gen_config.update_temp(temperature)
@@ -424,13 +581,19 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
424
  NUM_TOKENS=0
425
  print('-'*4+'Start Generation'+'-'*4)
426
 
 
 
 
427
  history[-1][1] = ""
428
- for new_text in model.generate(tokens, **vars(gen_config)): #CtransGen_generate(prompt=full_prompt)#, config=CtransGenGenerationConfig()): # #top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty,
429
- if new_text == None: new_text = ""
430
- history[-1][1] += model.detokenize(new_text) #new_text
431
- NUM_TOKENS+=1
432
- yield history
433
-
 
 
 
434
  time_generate = time.time() - start
435
  print('\n')
436
  print('-'*4+'End Generation'+'-'*4)
@@ -439,6 +602,7 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
439
  print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
440
  print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
441
 
 
442
  # Chat helper functions
443
 
444
  def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_model=""):#keyword_model): # new_question_keywords,
@@ -614,7 +778,7 @@ def hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val, out_p
614
  # 3rd level check on retrieved docs with SVM retriever
615
 
616
  svm_retriever = SVMRetriever.from_texts(content_keep, embeddings, k = k_val)
617
- svm_result = svm_retriever.get_relevant_documents(new_question_kworded)
618
 
619
 
620
  svm_rank=[]
@@ -994,10 +1158,10 @@ def restore_interactivity():
994
  return gr.update(interactive=True)
995
 
996
  def update_message(dropdown_value):
997
- return gr.Textbox.update(value=dropdown_value)
998
 
999
  def hide_block():
1000
- return gr.Radio.update(visible=False)
1001
 
1002
  # Vote function
1003
 
 
38
  from gensim.models import TfidfModel, OkapiBM25Model
39
  from gensim.similarities import SparseMatrixSimilarity
40
 
41
+ import copy
42
+ import llama_cpp
43
+ from llama_cpp import Llama
44
+ from huggingface_hub import hf_hub_download
45
+
46
  import gradio as gr
47
 
48
  torch.cuda.empty_cache()
 
75
  # Offload up to 100 layers to the GPU when cuda is available; otherwise keep every layer on the CPU (gpu_layers = 0)
76
  if torch.cuda.is_available():
77
  torch_device = "cuda"
78
+ gpu_layers = 100
79
  else:
80
  torch_device = "cpu"
81
  gpu_layers = 0
 
101
  sample = True
102
 
103
 
104
+ # class CtransInitConfig_gpu:
105
+ # def __init__(self, temperature=temperature,
106
+ # top_k=top_k,
107
+ # top_p=top_p,
108
+ # repetition_penalty=repetition_penalty,
109
+ # last_n_tokens=last_n_tokens,
110
+ # max_new_tokens=max_new_tokens,
111
+ # seed=seed,
112
+ # reset=reset,
113
+ # stream=stream,
114
+ # threads=threads,
115
+ # batch_size=batch_size,
116
+ # context_length=context_length,
117
+ # gpu_layers=gpu_layers):
118
+ # self.temperature = temperature
119
+ # self.top_k = top_k
120
+ # self.top_p = top_p
121
+ # self.repetition_penalty = repetition_penalty# repetition_penalty
122
+ # self.last_n_tokens = last_n_tokens
123
+ # self.max_new_tokens = max_new_tokens
124
+ # self.seed = seed
125
+ # self.reset = reset
126
+ # self.stream = stream
127
+ # self.threads = threads
128
+ # self.batch_size = batch_size
129
+ # self.context_length = context_length
130
+ # self.gpu_layers = gpu_layers
131
+ # # self.stop: list[str] = field(default_factory=lambda: [stop_string])
132
+
133
+ # def update_gpu(self, new_value):
134
+ # self.gpu_layers = new_value
135
+
136
+ # class CtransInitConfig_cpu(CtransInitConfig_gpu):
137
+ # def __init__(self):
138
+ # super().__init__()
139
+ # self.gpu_layers = 0
140
+
141
  class CtransInitConfig_gpu:
142
+ def __init__(self, #temperature=temperature,
143
+ #top_k=top_k,
144
+ #top_p=top_p,
145
+ #repetition_penalty=repetition_penalty,
146
  last_n_tokens=last_n_tokens,
147
+ #max_new_tokens=max_new_tokens,
148
  seed=seed,
149
+ #reset=reset,
150
+ #stream=stream,
151
+ n_threads=threads,
152
+ n_batch=batch_size,
153
+ n_ctx=4096,
154
+ n_gpu_layers=gpu_layers):
155
+ #self.temperature = temperature
156
+ #self.top_k = top_k
157
+ #self.top_p = top_p
158
+ #self.repetition_penalty = repetition_penalty# repetition_penalty
159
  self.last_n_tokens = last_n_tokens
160
+ #self.max_new_tokens = max_new_tokens
161
  self.seed = seed
162
+ #self.reset = reset
163
+ #self.stream = stream
164
+ self.n_threads = n_threads
165
+ self.n_batch = n_batch
166
+ self.n_ctx = n_ctx
167
+ self.n_gpu_layers = n_gpu_layers
168
  # self.stop: list[str] = field(default_factory=lambda: [stop_string])
169
 
170
  def update_gpu(self, new_value):
171
+ self.n_gpu_layers = new_value
172
 
173
  class CtransInitConfig_cpu(CtransInitConfig_gpu):
174
  def __init__(self):
175
  super().__init__()
176
+ self.n_gpu_layers = 0
177
 
178
  gpu_config = CtransInitConfig_gpu()
179
  cpu_config = CtransInitConfig_cpu()
180
 
181
 
182
+ # class CtransGenGenerationConfig:
183
+ # def __init__(self, temperature=temperature,
184
+ # top_k=top_k,
185
+ # top_p=top_p,
186
+ # repetition_penalty=repetition_penalty,
187
+ # last_n_tokens=last_n_tokens,
188
+ # seed=seed,
189
+ # threads=threads,
190
+ # batch_size=batch_size,
191
+ # reset=True
192
+ # ):
193
+ # self.temperature = temperature
194
+ # self.top_k = top_k
195
+ # self.top_p = top_p
196
+ # self.repetition_penalty = repetition_penalty# repetition_penalty
197
+ # self.last_n_tokens = last_n_tokens
198
+ # self.seed = seed
199
+ # self.threads = threads
200
+ # self.batch_size = batch_size
201
+ # self.reset = reset
202
+
203
  class CtransGenGenerationConfig:
204
  def __init__(self, temperature=temperature,
205
  top_k=top_k,
206
  top_p=top_p,
207
+ repeat_penalty=repetition_penalty,
208
+ #last_n_tokens=last_n_tokens,
209
  seed=seed,
210
+ stream=stream,
211
+ max_tokens=max_new_tokens
212
+ #threads=threads,
213
+ #batch_size=batch_size,
214
+ #reset=True
215
  ):
216
  self.temperature = temperature
217
  self.top_k = top_k
218
  self.top_p = top_p
219
+ self.repeat_penalty = repeat_penalty
220
+ #self.last_n_tokens = last_n_tokens
221
  self.seed = seed
222
+ self.max_tokens=max_tokens
223
+ self.stream = stream
224
+ #self.threads = threads
225
+ #self.batch_size = batch_size
226
+ #self.reset = reset
227
 
228
  def update_temp(self, new_value):
229
  self.temperature = new_value
 
419
  return history, docs_content_string, instruction_prompt_out
420
 
421
  # Chat functions
422
+ # def produce_streaming_answer_chatbot(history, full_prompt, model_type,
423
+ # temperature=temperature,
424
+ # max_new_tokens=max_new_tokens,
425
+ # sample=sample,
426
+ # repetition_penalty=repetition_penalty,
427
+ # top_p=top_p,
428
+ # top_k=top_k
429
+ # ):
430
+ # #print("Model type is: ", model_type)
431
+
432
+ # #if not full_prompt.strip():
433
+ # # if history is None:
434
+ # # history = []
435
+
436
+ # # return history
437
+
438
+ # if model_type == "Flan Alpaca (small, fast)":
439
+ # # Get the model and tokenizer, and tokenize the user text.
440
+ # model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device) # return_attention_mask=False was added
441
+
442
+ # # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
443
+ # # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
444
+ # streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
445
+ # generate_kwargs = dict(
446
+ # model_inputs,
447
+ # streamer=streamer,
448
+ # max_new_tokens=max_new_tokens,
449
+ # do_sample=sample,
450
+ # repetition_penalty=repetition_penalty,
451
+ # top_p=top_p,
452
+ # temperature=temperature,
453
+ # top_k=top_k
454
+ # )
455
+
456
+ # print(generate_kwargs)
457
+
458
+ # t = Thread(target=model.generate, kwargs=generate_kwargs)
459
+ # t.start()
460
+
461
+ # # Pull the generated text from the streamer, and update the model output.
462
+ # start = time.time()
463
+ # NUM_TOKENS=0
464
+ # print('-'*4+'Start Generation'+'-'*4)
465
+
466
+ # history[-1][1] = ""
467
+ # for new_text in streamer:
468
+ # if new_text == None: new_text = ""
469
+ # history[-1][1] += new_text
470
+ # NUM_TOKENS+=1
471
+ # yield history
472
+
473
+ # time_generate = time.time() - start
474
+ # print('\n')
475
+ # print('-'*4+'End Generation'+'-'*4)
476
+ # print(f'Num of generated tokens: {NUM_TOKENS}')
477
+ # print(f'Time for complete generation: {time_generate}s')
478
+ # print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
479
+ # print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
480
+
481
+ # elif model_type == "Mistral Open Orca (larger, slow)":
482
+ # tokens = model.tokenize(full_prompt)
483
+
484
+ # gen_config = CtransGenGenerationConfig()
485
+ # gen_config.update_temp(temperature)
486
+
487
+ # print(vars(gen_config))
488
+
489
+ # # Pull the generated text from the streamer, and update the model output.
490
+ # start = time.time()
491
+ # NUM_TOKENS=0
492
+ # print('-'*4+'Start Generation'+'-'*4)
493
+
494
+ # history[-1][1] = ""
495
+ # for new_text in model.generate(tokens, **vars(gen_config)): #CtransGen_generate(prompt=full_prompt)#, config=CtransGenGenerationConfig()): # #top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty,
496
+ # if new_text == None: new_text = ""
497
+ # history[-1][1] += model.detokenize(new_text) #new_text
498
+ # NUM_TOKENS+=1
499
+ # yield history
500
+
501
+ # time_generate = time.time() - start
502
+ # print('\n')
503
+ # print('-'*4+'End Generation'+'-'*4)
504
+ # print(f'Num of generated tokens: {NUM_TOKENS}')
505
+ # print(f'Time for complete generation: {time_generate}s')
506
+ # print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
507
+ # print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
508
+
509
+
510
  def produce_streaming_answer_chatbot(history, full_prompt, model_type,
511
  temperature=temperature,
512
  max_new_tokens=max_new_tokens,
 
567
  print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
568
 
569
  elif model_type == "Mistral Open Orca (larger, slow)":
570
+ #tokens = model.tokenize(full_prompt)
571
+
572
+ temp = ""
573
 
574
  gen_config = CtransGenGenerationConfig()
575
  gen_config.update_temp(temperature)
 
581
  NUM_TOKENS=0
582
  print('-'*4+'Start Generation'+'-'*4)
583
 
584
+ output = model(
585
+ full_prompt, **vars(gen_config))
586
+
587
  history[-1][1] = ""
588
+ for out in output:
589
+
590
+ if "choices" in out and len(out["choices"]) > 0 and "text" in out["choices"][0]:
591
+ history[-1][1] += out["choices"][0]["text"]
592
+ NUM_TOKENS+=1
593
+ yield history
594
+ else:
595
+ print(f"Unexpected output structure: {out}")
596
+
597
  time_generate = time.time() - start
598
  print('\n')
599
  print('-'*4+'End Generation'+'-'*4)
 
602
  print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
603
  print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
604
 
605
+
606
  # Chat helper functions
607
 
608
  def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_model=""):#keyword_model): # new_question_keywords,
 
778
  # 3rd level check on retrieved docs with SVM retriever
779
 
780
  svm_retriever = SVMRetriever.from_texts(content_keep, embeddings, k = k_val)
781
+ svm_result = svm_retriever.invoke(new_question_kworded)
782
 
783
 
784
  svm_rank=[]
 
1158
  return gr.update(interactive=True)
1159
 
1160
  def update_message(dropdown_value):
1161
+ return gr.Textbox(value=dropdown_value)
1162
 
1163
  def hide_block():
1164
+ return gr.Radio(visible=False)
1165
 
1166
  # Vote function
1167
 
chatfuncs/ingest.py CHANGED
@@ -21,7 +21,7 @@ from pypdf import PdfReader
21
  PandasDataFrame = Type[pd.DataFrame]
22
 
23
  split_strat = ["\n\n", "\n", ". ", "! ", "? "]
24
- chunk_size = 500
25
  chunk_overlap = 0
26
  start_index = True
27
 
 
21
  PandasDataFrame = Type[pd.DataFrame]
22
 
23
  split_strat = ["\n\n", "\n", ". ", "! ", "? "]
24
+ chunk_size = 300
25
  chunk_overlap = 0
26
  start_index = True
27
 
requirements.txt CHANGED
@@ -3,15 +3,18 @@ langchain-community
3
  beautifulsoup4
4
  pandas
5
  transformers
6
- --extra-index-url https://download.pytorch.org/whl/cu118
7
- torch
 
8
  sentence_transformers
9
  faiss-cpu
10
  pypdf
11
  python-docx
12
- ctransformers[cuda]
13
  keybert
14
  span_marker
15
  gensim
16
- gradio==3.50.0
17
- gradio_client
 
 
 
3
  beautifulsoup4
4
  pandas
5
  transformers
6
+ llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
7
+ #torch \
8
+ #--extra-index-url https://download.pytorch.org/whl/cu121
9
  sentence_transformers
10
  faiss-cpu
11
  pypdf
12
  python-docx
13
+ #ctransformers[cuda]
14
  keybert
15
  span_marker
16
  gensim
17
+ gradio==4.31.5
18
+ gradio_client
19
+ nltk
20
+ scipy<1.13