Sean-Case committed on
Commit
275393f
·
1 Parent(s): d213c15

Changed embedding model, added reference to chat model on front page

Browse files
Generation speed GPU test.txt DELETED
@@ -1,51 +0,0 @@
1
- With 5 gpu layers, batch size 8
2
-
3
- Num of generated tokens: 113
4
- Time for complete generation: 115.42684650421143s
5
- Tokens per secound: 0.9789750255013432
6
- Time per token: 1021.4765177363843ms
7
-
8
- With 5 gpu layers, batch size 512
9
-
10
- Num of generated tokens: 102
11
- Time for complete generation: 40.369266986846924s
12
- Tokens per secound: 2.5266745624396285
13
- Time per token: 395.77712732202866ms
14
-
15
- With 6 gpu layers -
16
-
17
- Num of generated tokens: 113
18
- Time for complete generation: 46.37785983085632s
19
- Tokens per secound: 2.4365074285902764
20
- Time per token: 410.42353832616215ms
21
-
22
- With 6 gpu layers, batch size 1024 -
23
- Five pillars Q:
24
- Num of generated tokens: 102
25
- Time for complete generation: 41.85241961479187s
26
- Tokens per secound: 2.4371350793766346
27
- Time per token: 410.31783936070457ms
28
-
29
- With 8 threads
30
- Num of generated tokens: 102
31
- Time for complete generation: 40.64410996437073s
32
- Tokens per secound: 2.5095887224351774
33
- Time per token: 398.4716663173601ms
34
-
35
- Vision statement Q:
36
- Num of generated tokens: 84
37
- Time for complete generation: 35.57932233810425s
38
- Tokens per secound: 2.360921863597128
39
- Time per token: 423.5633611679077ms
40
-
41
- Commitments Q:
42
- Num of generated tokens: 50
43
- Time for complete generation: 23.73319172859192s
44
- Tokens per secound: 2.106754142965266
45
- Time per token: 474.6638345718384ms
46
-
47
- Outcomes Q
48
- Num of generated tokens: 167
49
- Time for complete generation: 52.302518367767334s
50
- Tokens per secound: 3.1929628861412094
51
- Time per token: 313.1887327411217ms
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -28,7 +28,7 @@ import chatfuncs.ingest as ing
28
 
29
  ## Load preset embeddings, vectorstore, and model
30
 
31
- embeddings_name = "thenlper/gte-base"
32
 
33
  def load_embeddings(embeddings_name = "thenlper/gte-base"):
34
 
@@ -79,7 +79,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
79
  if torch_device is None:
80
  torch_device = chatf.torch_device
81
 
82
- if model_type == "Orca Mini":
83
 
84
  gpu_config.update_gpu(gpu_layers)
85
  cpu_config.update_gpu(gpu_layers)
@@ -103,7 +103,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
103
 
104
  tokenizer = []
105
 
106
- if model_type == "Flan Alpaca":
107
  # Huggingface chat model
108
  hf_checkpoint = 'declare-lab/flan-alpaca-large'
109
 
@@ -135,14 +135,14 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
135
  load_confirmation = "Finished loading model: " + model_type
136
 
137
  print(load_confirmation)
138
- return model_type, load_confirmation
139
 
140
  # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
141
- model_type = "Orca Mini"
142
 
143
  load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
144
 
145
- model_type = "Flan Alpaca"
146
  load_model(model_type, 0, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
147
 
148
  def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
@@ -181,16 +181,19 @@ with block:
181
 
182
  gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
183
 
184
- gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Orca Mini), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
185
 
186
- current_source = gr.Textbox(label="Current data source that is loaded into the app", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf")
 
 
187
 
188
  with gr.Tab("Chatbot"):
189
 
190
  with gr.Row():
191
  chat_height = 500
192
  chatbot = gr.Chatbot(height=chat_height, avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1)
193
- sources = gr.HTML(value = "Source paragraphs where I looked for answers will appear here", height=chat_height, scale = 2)
 
194
 
195
  with gr.Row():
196
  message = gr.Textbox(
@@ -228,7 +231,7 @@ with block:
228
  ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
229
 
230
  with gr.Tab("Advanced features"):
231
- model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca", choices = ["Flan Alpaca", "Orca Mini"])
232
  with gr.Row():
233
  gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU (WARNING: please don't modify unless you have a GPU).", value=0, minimum=0, maximum=6, step = 1, visible=False)
234
  change_model_button = gr.Button(value="Load model", scale=0)
@@ -241,7 +244,7 @@ with block:
241
  examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
242
 
243
  change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
244
- then(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text]).\
245
  then(lambda: chatf.restore_interactivity(), None, [message], queue=False).\
246
  then(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic]).\
247
  then(lambda: None, None, chatbot, queue=False)
 
28
 
29
  ## Load preset embeddings, vectorstore, and model
30
 
31
+ embeddings_name = "BAAI/bge-base-en-v1.5"
32
 
33
  def load_embeddings(embeddings_name = "thenlper/gte-base"):
34
 
 
79
  if torch_device is None:
80
  torch_device = chatf.torch_device
81
 
82
+ if model_type == "Orca Mini (larger, slow)":
83
 
84
  gpu_config.update_gpu(gpu_layers)
85
  cpu_config.update_gpu(gpu_layers)
 
103
 
104
  tokenizer = []
105
 
106
+ if model_type == "Flan Alpaca (small, fast)":
107
  # Huggingface chat model
108
  hf_checkpoint = 'declare-lab/flan-alpaca-large'
109
 
 
135
  load_confirmation = "Finished loading model: " + model_type
136
 
137
  print(load_confirmation)
138
+ return model_type, load_confirmation, model_type
139
 
140
  # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
141
+ model_type = "Orca Mini (larger, slow)"
142
 
143
  load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
144
 
145
+ model_type = "Flan Alpaca (small, fast)"
146
  load_model(model_type, 0, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
147
 
148
  def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
 
181
 
182
  gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
183
 
184
+ gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Orca Mini (larger, slow)), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
185
 
186
+ with gr.Row():
187
+ current_source = gr.Textbox(label="Current data source(s)", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf", scale = 10)
188
+ current_model = gr.Textbox(label="Current model", value=model_type, scale = 3)
189
 
190
  with gr.Tab("Chatbot"):
191
 
192
  with gr.Row():
193
  chat_height = 500
194
  chatbot = gr.Chatbot(height=chat_height, avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1)
195
+ #sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here", height=chat_height, scale = 2)
196
+ sources = gr.Markdown(value = "Source paragraphs with the most relevant text will appear here", height=chat_height, scale = 2)
197
 
198
  with gr.Row():
199
  message = gr.Textbox(
 
231
  ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
232
 
233
  with gr.Tab("Advanced features"):
234
+ model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca (small, fast)", choices = ["Flan Alpaca (small, fast)", "Orca Mini (larger, slow)"])
235
  with gr.Row():
236
  gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU (WARNING: please don't modify unless you have a GPU).", value=0, minimum=0, maximum=6, step = 1, visible=False)
237
  change_model_button = gr.Button(value="Load model", scale=0)
 
244
  examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
245
 
246
  change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
247
+ then(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text, current_model]).\
248
  then(lambda: chatf.restore_interactivity(), None, [message], queue=False).\
249
  then(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic]).\
250
  then(lambda: None, None, chatbot, queue=False)
chatfuncs/chatfuncs.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
7
  import numpy as np
8
 
9
  # Model packages
10
- import torch
11
  from threading import Thread
12
  from transformers import pipeline, TextIteratorStreamer
13
 
@@ -21,16 +21,16 @@ from langchain.retrievers import SVMRetriever
21
  from langchain.text_splitter import RecursiveCharacterTextSplitter
22
  from langchain.docstore.document import Document
23
 
24
- # For keyword extraction
25
- import nltk
26
- nltk.download('wordnet')
27
  from nltk.corpus import stopwords
28
  from nltk.tokenize import RegexpTokenizer
29
  from nltk.stem import WordNetLemmatizer
30
- import keybert
31
 
32
  # For Name Entity Recognition model
33
- from span_marker import SpanMarkerModel
34
 
35
  # For BM25 retrieval
36
  from gensim.corpora import Dictionary
@@ -60,7 +60,7 @@ hlt_strat = [" ", ". ", "! ", "? ", ": ", "\n\n", "\n", ", "]
60
  hlt_overlap = 4
61
 
62
  ## Initialise NER model ##
63
- ner_model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-multinerd")
64
 
65
  ## Initialise keyword model ##
66
  # Used to pull out keywords from chat history to add to user queries behind the scenes
@@ -78,7 +78,7 @@ print("Running on device:", torch_device)
78
  threads = 8 #torch.get_num_threads()
79
  print("CPU threads:", threads)
80
 
81
- # Flan Alpaca Model parameters
82
  temperature: float = 0.1
83
  top_k: int = 3
84
  top_p: float = 1
@@ -202,7 +202,7 @@ def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
202
 
203
  # Prompt functions
204
 
205
- def base_prompt_templates(model_type = "Flan Alpaca"):
206
 
207
  #EXAMPLE_PROMPT = PromptTemplate(
208
  # template="\nCONTENT:\n\n{page_content}\n\nSOURCE: {source}\n\n",
@@ -313,9 +313,9 @@ QUESTION: {question}
313
  ### RESPONSE:
314
  """
315
 
316
- if model_type == "Flan Alpaca":
317
  INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_alpaca, input_variables=['question', 'summaries'])
318
- elif model_type == "Orca Mini":
319
  INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_wizard_orca, input_variables=['question', 'summaries'])
320
 
321
  return INSTRUCTION_PROMPT, CONTENT_PROMPT
@@ -359,6 +359,9 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
359
 
360
  def create_full_prompt(user_input, history, extracted_memory, vectorstore, embeddings, model_type):
361
 
 
 
 
362
  #if chain_agent is None:
363
  # history.append((user_input, "Please click the button to submit the Huggingface API key before using the chatbot (top right)"))
364
  # return history, history, "", ""
@@ -385,7 +388,13 @@ def create_full_prompt(user_input, history, extracted_memory, vectorstore, embed
385
  def produce_streaming_answer_chatbot(history, full_prompt, model_type):
386
  #print("Model type is: ", model_type)
387
 
388
- if model_type == "Flan Alpaca":
 
 
 
 
 
 
389
  # Get the model and tokenizer, and tokenize the user text.
390
  model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device) # return_attention_mask=False was added
391
 
@@ -425,7 +434,7 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type):
425
  print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
426
  print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
427
 
428
- elif model_type == "Orca Mini":
429
  tokens = model.tokenize(full_prompt)
430
 
431
  gen_config = CtransGenGenerationConfig()
@@ -460,7 +469,7 @@ def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_
460
 
461
  if chat_history_str:
462
  # Keyword extraction is now done in the add_inputs_to_history function
463
- extracted_memory = extracted_memory#remove_q_stopwords(str(chat_history_first_q) + " " + str(chat_history_first_ans))
464
 
465
 
466
  new_question_kworded = str(extracted_memory) + ". " + question #+ " " + new_question_keywords
@@ -966,7 +975,7 @@ def keybert_keywords(text, n, kw_model):
966
  tokens_lemma = apply_lemmatize(text)
967
  lemmatised_text = ' '.join(tokens_lemma)
968
 
969
- keywords_text = keybert.KeyBERT(model=kw_model).extract_keywords(lemmatised_text, stop_words='english', top_n=n,
970
  keyphrase_ngram_range=(1, 1))
971
  keywords_list = [item[0] for item in keywords_text]
972
 
 
7
  import numpy as np
8
 
9
  # Model packages
10
+ import torch.cuda
11
  from threading import Thread
12
  from transformers import pipeline, TextIteratorStreamer
13
 
 
21
  from langchain.text_splitter import RecursiveCharacterTextSplitter
22
  from langchain.docstore.document import Document
23
 
24
+ # For keyword extraction (not currently used)
25
+ #import nltk
26
+ #nltk.download('wordnet')
27
  from nltk.corpus import stopwords
28
  from nltk.tokenize import RegexpTokenizer
29
  from nltk.stem import WordNetLemmatizer
30
+ from keybert import KeyBERT
31
 
32
  # For Name Entity Recognition model
33
+ #from span_marker import SpanMarkerModel # Not currently used
34
 
35
  # For BM25 retrieval
36
  from gensim.corpora import Dictionary
 
60
  hlt_overlap = 4
61
 
62
  ## Initialise NER model ##
63
+ ner_model = []#SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-multinerd") # Not currently used
64
 
65
  ## Initialise keyword model ##
66
  # Used to pull out keywords from chat history to add to user queries behind the scenes
 
78
  threads = 8 #torch.get_num_threads()
79
  print("CPU threads:", threads)
80
 
81
+ # Flan Alpaca (small, fast) Model parameters
82
  temperature: float = 0.1
83
  top_k: int = 3
84
  top_p: float = 1
 
202
 
203
  # Prompt functions
204
 
205
+ def base_prompt_templates(model_type = "Flan Alpaca (small, fast)"):
206
 
207
  #EXAMPLE_PROMPT = PromptTemplate(
208
  # template="\nCONTENT:\n\n{page_content}\n\nSOURCE: {source}\n\n",
 
313
  ### RESPONSE:
314
  """
315
 
316
+ if model_type == "Flan Alpaca (small, fast)":
317
  INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_alpaca, input_variables=['question', 'summaries'])
318
+ elif model_type == "Orca Mini (larger, slow)":
319
  INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_wizard_orca, input_variables=['question', 'summaries'])
320
 
321
  return INSTRUCTION_PROMPT, CONTENT_PROMPT
 
359
 
360
  def create_full_prompt(user_input, history, extracted_memory, vectorstore, embeddings, model_type):
361
 
362
+ if not user_input.strip():
363
+ return history, "", ""
364
+
365
  #if chain_agent is None:
366
  # history.append((user_input, "Please click the button to submit the Huggingface API key before using the chatbot (top right)"))
367
  # return history, history, "", ""
 
388
  def produce_streaming_answer_chatbot(history, full_prompt, model_type):
389
  #print("Model type is: ", model_type)
390
 
391
+ #if not full_prompt.strip():
392
+ # if history is None:
393
+ # history = []
394
+
395
+ # return history
396
+
397
+ if model_type == "Flan Alpaca (small, fast)":
398
  # Get the model and tokenizer, and tokenize the user text.
399
  model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device) # return_attention_mask=False was added
400
 
 
434
  print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
435
  print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
436
 
437
+ elif model_type == "Orca Mini (larger, slow)":
438
  tokens = model.tokenize(full_prompt)
439
 
440
  gen_config = CtransGenGenerationConfig()
 
469
 
470
  if chat_history_str:
471
  # Keyword extraction is now done in the add_inputs_to_history function
472
+ #remove_q_stopwords(str(chat_history_first_q) + " " + str(chat_history_first_ans))
473
 
474
 
475
  new_question_kworded = str(extracted_memory) + ". " + question #+ " " + new_question_keywords
 
975
  tokens_lemma = apply_lemmatize(text)
976
  lemmatised_text = ' '.join(tokens_lemma)
977
 
978
+ keywords_text = KeyBERT(model=kw_model).extract_keywords(lemmatised_text, stop_words='english', top_n=n,
979
  keyphrase_ngram_range=(1, 1))
980
  keywords_list = [item[0] for item in keywords_text]
981
 
chatfuncs/ingest_borough_plan.py CHANGED
@@ -1,16 +1,14 @@
1
  import ingest as ing
2
- import pandas as pd
3
 
4
-
5
- borough_plan_text = ing.parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
6
  print("Borough plan text created")
7
 
8
- #print(borough_plan_text)
9
 
10
  borough_plan_docs = ing.text_to_docs(borough_plan_text)
11
  print("Borough plan docs created")
12
 
13
- embedding_model = "thenlper/gte-base"
14
 
15
  embeddings = ing.load_embeddings(model_name = embedding_model)
16
  ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
 
1
  import ingest as ing
 
2
 
3
+ borough_plan_text, file_names = ing.parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
 
4
  print("Borough plan text created")
5
 
6
+ print(borough_plan_text)
7
 
8
  borough_plan_docs = ing.text_to_docs(borough_plan_text)
9
  print("Borough plan docs created")
10
 
11
+ embedding_model = "BAAI/bge-base-en-v1.5"
12
 
13
  embeddings = ing.load_embeddings(model_name = embedding_model)
14
  ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
faiss_embedding/faiss_embedding.zip CHANGED
Binary files a/faiss_embedding/faiss_embedding.zip and b/faiss_embedding/faiss_embedding.zip differ