seanpedrickcase
committed on
Commit
·
f301d67
1
Parent(s):
8aa3ebb
Upgraded Gradio and other packages to their latest versions. Replaced ctransformers with llama-cpp-python.
Browse files
- Dockerfile +3 -2
- README.md +1 -1
- app.py +105 -20
- chatfuncs/chatfuncs.py +209 -45
- chatfuncs/ingest.py +1 -1
- requirements.txt +8 -5
Dockerfile
CHANGED
@@ -13,13 +13,14 @@ USER user
|
|
13 |
# Set home to the user's home directory
|
14 |
ENV HOME=/home/user \
|
15 |
PATH=/home/user/.local/bin:$PATH \
|
16 |
-
|
17 |
PYTHONUNBUFFERED=1 \
|
18 |
GRADIO_ALLOW_FLAGGING=never \
|
19 |
GRADIO_NUM_PORTS=1 \
|
20 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
21 |
GRADIO_THEME=huggingface \
|
22 |
-
SYSTEM=spaces
|
|
|
23 |
|
24 |
# Set the working directory to the user's home directory
|
25 |
WORKDIR $HOME/app
|
|
|
13 |
# Set home to the user's home directory
|
14 |
ENV HOME=/home/user \
|
15 |
PATH=/home/user/.local/bin:$PATH \
|
16 |
+
PYTHONPATH=$HOME/app \
|
17 |
PYTHONUNBUFFERED=1 \
|
18 |
GRADIO_ALLOW_FLAGGING=never \
|
19 |
GRADIO_NUM_PORTS=1 \
|
20 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
21 |
GRADIO_THEME=huggingface \
|
22 |
+
SYSTEM=spaces \
|
23 |
+
LLAMA_CUBLAS=1
|
24 |
|
25 |
# Set the working directory to the user's home directory
|
26 |
WORKDIR $HOME/app
|
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🌍
|
|
4 |
colorFrom: yellow
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
|
|
4 |
colorFrom: yellow
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.31.5
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
app.py
CHANGED
@@ -11,6 +11,12 @@ import pandas as pd
|
|
11 |
from transformers import AutoTokenizer
|
12 |
from ctransformers import AutoModelForCausalLM
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
PandasDataFrame = Type[pd.DataFrame]
|
15 |
|
16 |
# Disable cuda devices if necessary
|
@@ -38,7 +44,7 @@ def get_faiss_store(faiss_vstore_folder,embeddings):
|
|
38 |
with zipfile.ZipFile(faiss_vstore_folder + '/' + faiss_vstore_folder + '.zip', 'r') as zip_ref:
|
39 |
zip_ref.extractall(faiss_vstore_folder)
|
40 |
|
41 |
-
faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings)
|
42 |
os.remove(faiss_vstore_folder + "/index.faiss")
|
43 |
os.remove(faiss_vstore_folder + "/index.pkl")
|
44 |
|
@@ -53,6 +59,78 @@ import chatfuncs.chatfuncs as chatf
|
|
53 |
chatf.embeddings = load_embeddings(embeddings_name)
|
54 |
chatf.vectorstore = get_faiss_store(faiss_vstore_folder="faiss_embedding",embeddings=globals()["embeddings"])
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
|
57 |
print("Loading model")
|
58 |
|
@@ -67,26 +145,35 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
|
|
67 |
if model_type == "Mistral Open Orca (larger, slow)":
|
68 |
if torch_device == "cuda":
|
69 |
gpu_config.update_gpu(gpu_layers)
|
|
|
70 |
else:
|
71 |
gpu_config.update_gpu(gpu_layers)
|
72 |
cpu_config.update_gpu(gpu_layers)
|
73 |
|
74 |
-
|
75 |
|
76 |
print(vars(gpu_config))
|
77 |
print(vars(cpu_config))
|
78 |
|
79 |
try:
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
84 |
|
85 |
-
except:
|
86 |
-
|
87 |
-
|
88 |
-
model =
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
tokenizer = []
|
92 |
|
@@ -100,14 +187,14 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
|
|
100 |
|
101 |
if torch_device == "cuda":
|
102 |
if "flan" in model_name:
|
103 |
-
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
|
104 |
else:
|
105 |
-
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
|
106 |
else:
|
107 |
if "flan" in model_name:
|
108 |
-
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
|
109 |
else:
|
110 |
-
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
|
111 |
|
112 |
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
|
113 |
|
@@ -179,7 +266,7 @@ with block:
|
|
179 |
#chat_height = 500
|
180 |
chatbot = gr.Chatbot(avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1) # , height=chat_height
|
181 |
with gr.Accordion("Open this tab to see the source paragraphs used to generate the answer", open = False):
|
182 |
-
sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here"
|
183 |
|
184 |
with gr.Row():
|
185 |
message = gr.Textbox(
|
@@ -233,7 +320,7 @@ with block:
|
|
233 |
|
234 |
|
235 |
gr.HTML(
|
236 |
-
"<center>This app is based on the models Flan Alpaca and Mistral Open Orca. It powered by Gradio, Transformers,
|
237 |
)
|
238 |
|
239 |
examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
|
@@ -289,6 +376,4 @@ with block:
|
|
289 |
# Thumbs up or thumbs down voting function
|
290 |
chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state], None)
|
291 |
|
292 |
-
block.queue(
|
293 |
-
# -
|
294 |
-
|
|
|
11 |
from transformers import AutoTokenizer
|
12 |
from ctransformers import AutoModelForCausalLM
|
13 |
|
14 |
+
import torch
|
15 |
+
|
16 |
+
import llama_cpp
|
17 |
+
from llama_cpp import Llama
|
18 |
+
from huggingface_hub import hf_hub_download
|
19 |
+
|
20 |
PandasDataFrame = Type[pd.DataFrame]
|
21 |
|
22 |
# Disable cuda devices if necessary
|
|
|
44 |
with zipfile.ZipFile(faiss_vstore_folder + '/' + faiss_vstore_folder + '.zip', 'r') as zip_ref:
|
45 |
zip_ref.extractall(faiss_vstore_folder)
|
46 |
|
47 |
+
faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings, allow_dangerous_deserialization=True)
|
48 |
os.remove(faiss_vstore_folder + "/index.faiss")
|
49 |
os.remove(faiss_vstore_folder + "/index.pkl")
|
50 |
|
|
|
59 |
chatf.embeddings = load_embeddings(embeddings_name)
|
60 |
chatf.vectorstore = get_faiss_store(faiss_vstore_folder="faiss_embedding",embeddings=globals()["embeddings"])
|
61 |
|
62 |
+
# def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
|
63 |
+
# print("Loading model")
|
64 |
+
|
65 |
+
# # Default values inside the function
|
66 |
+
# if gpu_config is None:
|
67 |
+
# gpu_config = chatf.gpu_config
|
68 |
+
# if cpu_config is None:
|
69 |
+
# cpu_config = chatf.cpu_config
|
70 |
+
# if torch_device is None:
|
71 |
+
# torch_device = chatf.torch_device
|
72 |
+
|
73 |
+
# if model_type == "Mistral Open Orca (larger, slow)":
|
74 |
+
# if torch_device == "cuda":
|
75 |
+
# gpu_config.update_gpu(gpu_layers)
|
76 |
+
# else:
|
77 |
+
# gpu_config.update_gpu(gpu_layers)
|
78 |
+
# cpu_config.update_gpu(gpu_layers)
|
79 |
+
|
80 |
+
# print("Loading with", cpu_config.gpu_layers, "model layers sent to GPU.")
|
81 |
+
|
82 |
+
# print(vars(gpu_config))
|
83 |
+
# print(vars(cpu_config))
|
84 |
+
|
85 |
+
# try:
|
86 |
+
# #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
|
87 |
+
# #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
|
88 |
+
# model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
|
89 |
+
# #model = AutoModelForCausalLM.from_pretrained('TheBloke/MistralLite-7B-GGUF', model_type='mistral', model_file='mistrallite.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
|
90 |
+
|
91 |
+
# except:
|
92 |
+
# #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(cpu_config)) #**asdict(CtransRunConfig_gpu())
|
93 |
+
# #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
|
94 |
+
# model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
|
95 |
+
# #model = AutoModelForCausalLM.from_pretrained('TheBloke/MistralLite-7B-GGUF', model_type='mistral', model_file='mistrallite.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
|
96 |
+
|
97 |
+
# tokenizer = []
|
98 |
+
|
99 |
+
# if model_type == "Flan Alpaca (small, fast)":
|
100 |
+
# # Huggingface chat model
|
101 |
+
# hf_checkpoint = 'declare-lab/flan-alpaca-large'#'declare-lab/flan-alpaca-base' # # #
|
102 |
+
|
103 |
+
# def create_hf_model(model_name):
|
104 |
+
|
105 |
+
# from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
|
106 |
+
|
107 |
+
# if torch_device == "cuda":
|
108 |
+
# if "flan" in model_name:
|
109 |
+
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
|
110 |
+
# else:
|
111 |
+
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
|
112 |
+
# else:
|
113 |
+
# if "flan" in model_name:
|
114 |
+
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
|
115 |
+
# else:
|
116 |
+
# model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
|
117 |
+
|
118 |
+
# tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
|
119 |
+
|
120 |
+
# return model, tokenizer, model_type
|
121 |
+
|
122 |
+
# model, tokenizer, model_type = create_hf_model(model_name = hf_checkpoint)
|
123 |
+
|
124 |
+
# chatf.model = model
|
125 |
+
# chatf.tokenizer = tokenizer
|
126 |
+
# chatf.model_type = model_type
|
127 |
+
|
128 |
+
# load_confirmation = "Finished loading model: " + model_type
|
129 |
+
|
130 |
+
# print(load_confirmation)
|
131 |
+
# return model_type, load_confirmation, model_type
|
132 |
+
|
133 |
+
|
134 |
def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
|
135 |
print("Loading model")
|
136 |
|
|
|
145 |
if model_type == "Mistral Open Orca (larger, slow)":
|
146 |
if torch_device == "cuda":
|
147 |
gpu_config.update_gpu(gpu_layers)
|
148 |
+
print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU.")
|
149 |
else:
|
150 |
gpu_config.update_gpu(gpu_layers)
|
151 |
cpu_config.update_gpu(gpu_layers)
|
152 |
|
153 |
+
print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU.")
|
154 |
|
155 |
print(vars(gpu_config))
|
156 |
print(vars(cpu_config))
|
157 |
|
158 |
try:
|
159 |
+
model = Llama(
|
160 |
+
model_path=hf_hub_download(
|
161 |
+
repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
|
162 |
+
filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
|
163 |
+
),
|
164 |
+
**vars(gpu_config) # change n_gpu_layers if you have more or less VRAM
|
165 |
+
)
|
166 |
|
167 |
+
except Exception as e:
|
168 |
+
print("GPU load failed")
|
169 |
+
print(e)
|
170 |
+
model = Llama(
|
171 |
+
model_path=hf_hub_download(
|
172 |
+
repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
|
173 |
+
filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
|
174 |
+
),
|
175 |
+
**vars(cpu_config)
|
176 |
+
)
|
177 |
|
178 |
tokenizer = []
|
179 |
|
|
|
187 |
|
188 |
if torch_device == "cuda":
|
189 |
if "flan" in model_name:
|
190 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
|
191 |
else:
|
192 |
+
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
|
193 |
else:
|
194 |
if "flan" in model_name:
|
195 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
|
196 |
else:
|
197 |
+
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)
|
198 |
|
199 |
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
|
200 |
|
|
|
266 |
#chat_height = 500
|
267 |
chatbot = gr.Chatbot(avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1) # , height=chat_height
|
268 |
with gr.Accordion("Open this tab to see the source paragraphs used to generate the answer", open = False):
|
269 |
+
sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here") # , height=chat_height
|
270 |
|
271 |
with gr.Row():
|
272 |
message = gr.Textbox(
|
|
|
320 |
|
321 |
|
322 |
gr.HTML(
|
323 |
+
"<center>This app is based on the models Flan Alpaca and Mistral Open Orca. It powered by Gradio, Transformers, and Llama.cpp.</a></center>"
|
324 |
)
|
325 |
|
326 |
examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
|
|
|
376 |
# Thumbs up or thumbs down voting function
|
377 |
chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state], None)
|
378 |
|
379 |
+
block.queue().launch(debug=True)
|
|
|
|
chatfuncs/chatfuncs.py
CHANGED
@@ -38,6 +38,11 @@ from gensim.corpora import Dictionary
|
|
38 |
from gensim.models import TfidfModel, OkapiBM25Model
|
39 |
from gensim.similarities import SparseMatrixSimilarity
|
40 |
|
|
|
|
|
|
|
|
|
|
|
41 |
import gradio as gr
|
42 |
|
43 |
torch.cuda.empty_cache()
|
@@ -70,7 +75,7 @@ kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniL
|
|
70 |
# Currently set gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
|
71 |
if torch.cuda.is_available():
|
72 |
torch_device = "cuda"
|
73 |
-
gpu_layers =
|
74 |
else:
|
75 |
torch_device = "cpu"
|
76 |
gpu_layers = 0
|
@@ -96,67 +101,129 @@ context_length:int = 2048
|
|
96 |
sample = True
|
97 |
|
98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
class CtransInitConfig_gpu:
|
100 |
-
def __init__(self, temperature=temperature,
|
101 |
-
top_k=top_k,
|
102 |
-
top_p=top_p,
|
103 |
-
repetition_penalty=repetition_penalty,
|
104 |
last_n_tokens=last_n_tokens,
|
105 |
-
max_new_tokens=max_new_tokens,
|
106 |
seed=seed,
|
107 |
-
reset=reset,
|
108 |
-
stream=stream,
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
self.temperature = temperature
|
114 |
-
self.top_k = top_k
|
115 |
-
self.top_p = top_p
|
116 |
-
self.repetition_penalty = repetition_penalty# repetition_penalty
|
117 |
self.last_n_tokens = last_n_tokens
|
118 |
-
self.max_new_tokens = max_new_tokens
|
119 |
self.seed = seed
|
120 |
-
self.reset = reset
|
121 |
-
self.stream = stream
|
122 |
-
self.
|
123 |
-
self.
|
124 |
-
self.
|
125 |
-
self.
|
126 |
# self.stop: list[str] = field(default_factory=lambda: [stop_string])
|
127 |
|
128 |
def update_gpu(self, new_value):
|
129 |
-
self.
|
130 |
|
131 |
class CtransInitConfig_cpu(CtransInitConfig_gpu):
|
132 |
def __init__(self):
|
133 |
super().__init__()
|
134 |
-
self.
|
135 |
|
136 |
gpu_config = CtransInitConfig_gpu()
|
137 |
cpu_config = CtransInitConfig_cpu()
|
138 |
|
139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
class CtransGenGenerationConfig:
|
141 |
def __init__(self, temperature=temperature,
|
142 |
top_k=top_k,
|
143 |
top_p=top_p,
|
144 |
-
|
145 |
-
last_n_tokens=last_n_tokens,
|
146 |
seed=seed,
|
147 |
-
|
148 |
-
|
149 |
-
|
|
|
|
|
150 |
):
|
151 |
self.temperature = temperature
|
152 |
self.top_k = top_k
|
153 |
self.top_p = top_p
|
154 |
-
self.
|
155 |
-
self.last_n_tokens = last_n_tokens
|
156 |
self.seed = seed
|
157 |
-
self.
|
158 |
-
self.
|
159 |
-
self.
|
|
|
|
|
160 |
|
161 |
def update_temp(self, new_value):
|
162 |
self.temperature = new_value
|
@@ -352,6 +419,94 @@ def create_full_prompt(user_input, history, extracted_memory, vectorstore, embed
|
|
352 |
return history, docs_content_string, instruction_prompt_out
|
353 |
|
354 |
# Chat functions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
355 |
def produce_streaming_answer_chatbot(history, full_prompt, model_type,
|
356 |
temperature=temperature,
|
357 |
max_new_tokens=max_new_tokens,
|
@@ -412,7 +567,9 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
|
|
412 |
print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
|
413 |
|
414 |
elif model_type == "Mistral Open Orca (larger, slow)":
|
415 |
-
tokens = model.tokenize(full_prompt)
|
|
|
|
|
416 |
|
417 |
gen_config = CtransGenGenerationConfig()
|
418 |
gen_config.update_temp(temperature)
|
@@ -424,13 +581,19 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
|
|
424 |
NUM_TOKENS=0
|
425 |
print('-'*4+'Start Generation'+'-'*4)
|
426 |
|
|
|
|
|
|
|
427 |
history[-1][1] = ""
|
428 |
-
for
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
|
|
|
|
|
|
434 |
time_generate = time.time() - start
|
435 |
print('\n')
|
436 |
print('-'*4+'End Generation'+'-'*4)
|
@@ -439,6 +602,7 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
|
|
439 |
print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
|
440 |
print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
|
441 |
|
|
|
442 |
# Chat helper functions
|
443 |
|
444 |
def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_model=""):#keyword_model): # new_question_keywords,
|
@@ -614,7 +778,7 @@ def hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val, out_p
|
|
614 |
# 3rd level check on retrieved docs with SVM retriever
|
615 |
|
616 |
svm_retriever = SVMRetriever.from_texts(content_keep, embeddings, k = k_val)
|
617 |
-
svm_result = svm_retriever.
|
618 |
|
619 |
|
620 |
svm_rank=[]
|
@@ -994,10 +1158,10 @@ def restore_interactivity():
|
|
994 |
return gr.update(interactive=True)
|
995 |
|
996 |
def update_message(dropdown_value):
|
997 |
-
return gr.Textbox
|
998 |
|
999 |
def hide_block():
|
1000 |
-
return gr.Radio
|
1001 |
|
1002 |
# Vote function
|
1003 |
|
|
|
38 |
from gensim.models import TfidfModel, OkapiBM25Model
|
39 |
from gensim.similarities import SparseMatrixSimilarity
|
40 |
|
41 |
+
import copy
|
42 |
+
import llama_cpp
|
43 |
+
from llama_cpp import Llama
|
44 |
+
from huggingface_hub import hf_hub_download
|
45 |
+
|
46 |
import gradio as gr
|
47 |
|
48 |
torch.cuda.empty_cache()
|
|
|
75 |
# Request up to 100 model layers on the GPU when CUDA is available; otherwise keep every layer on the CPU (gpu_layers = 0)
|
76 |
if torch.cuda.is_available():
|
77 |
torch_device = "cuda"
|
78 |
+
gpu_layers = 100
|
79 |
else:
|
80 |
torch_device = "cpu"
|
81 |
gpu_layers = 0
|
|
|
101 |
sample = True
|
102 |
|
103 |
|
104 |
+
# class CtransInitConfig_gpu:
|
105 |
+
# def __init__(self, temperature=temperature,
|
106 |
+
# top_k=top_k,
|
107 |
+
# top_p=top_p,
|
108 |
+
# repetition_penalty=repetition_penalty,
|
109 |
+
# last_n_tokens=last_n_tokens,
|
110 |
+
# max_new_tokens=max_new_tokens,
|
111 |
+
# seed=seed,
|
112 |
+
# reset=reset,
|
113 |
+
# stream=stream,
|
114 |
+
# threads=threads,
|
115 |
+
# batch_size=batch_size,
|
116 |
+
# context_length=context_length,
|
117 |
+
# gpu_layers=gpu_layers):
|
118 |
+
# self.temperature = temperature
|
119 |
+
# self.top_k = top_k
|
120 |
+
# self.top_p = top_p
|
121 |
+
# self.repetition_penalty = repetition_penalty# repetition_penalty
|
122 |
+
# self.last_n_tokens = last_n_tokens
|
123 |
+
# self.max_new_tokens = max_new_tokens
|
124 |
+
# self.seed = seed
|
125 |
+
# self.reset = reset
|
126 |
+
# self.stream = stream
|
127 |
+
# self.threads = threads
|
128 |
+
# self.batch_size = batch_size
|
129 |
+
# self.context_length = context_length
|
130 |
+
# self.gpu_layers = gpu_layers
|
131 |
+
# # self.stop: list[str] = field(default_factory=lambda: [stop_string])
|
132 |
+
|
133 |
+
# def update_gpu(self, new_value):
|
134 |
+
# self.gpu_layers = new_value
|
135 |
+
|
136 |
+
# class CtransInitConfig_cpu(CtransInitConfig_gpu):
|
137 |
+
# def __init__(self):
|
138 |
+
# super().__init__()
|
139 |
+
# self.gpu_layers = 0
|
140 |
+
|
141 |
class CtransInitConfig_gpu:
|
142 |
+
def __init__(self, #temperature=temperature,
|
143 |
+
#top_k=top_k,
|
144 |
+
#top_p=top_p,
|
145 |
+
#repetition_penalty=repetition_penalty,
|
146 |
last_n_tokens=last_n_tokens,
|
147 |
+
#max_new_tokens=max_new_tokens,
|
148 |
seed=seed,
|
149 |
+
#reset=reset,
|
150 |
+
#stream=stream,
|
151 |
+
n_threads=threads,
|
152 |
+
n_batch=batch_size,
|
153 |
+
n_ctx=4096,
|
154 |
+
n_gpu_layers=gpu_layers):
|
155 |
+
#self.temperature = temperature
|
156 |
+
#self.top_k = top_k
|
157 |
+
#self.top_p = top_p
|
158 |
+
#self.repetition_penalty = repetition_penalty# repetition_penalty
|
159 |
self.last_n_tokens = last_n_tokens
|
160 |
+
#self.max_new_tokens = max_new_tokens
|
161 |
self.seed = seed
|
162 |
+
#self.reset = reset
|
163 |
+
#self.stream = stream
|
164 |
+
self.n_threads = n_threads
|
165 |
+
self.n_batch = n_batch
|
166 |
+
self.n_ctx = n_ctx
|
167 |
+
self.n_gpu_layers = n_gpu_layers
|
168 |
# self.stop: list[str] = field(default_factory=lambda: [stop_string])
|
169 |
|
170 |
def update_gpu(self, new_value):
|
171 |
+
self.n_gpu_layers = new_value
|
172 |
|
173 |
class CtransInitConfig_cpu(CtransInitConfig_gpu):
|
174 |
def __init__(self):
|
175 |
super().__init__()
|
176 |
+
self.n_gpu_layers = 0
|
177 |
|
178 |
gpu_config = CtransInitConfig_gpu()
|
179 |
cpu_config = CtransInitConfig_cpu()
|
180 |
|
181 |
|
182 |
+
# class CtransGenGenerationConfig:
|
183 |
+
# def __init__(self, temperature=temperature,
|
184 |
+
# top_k=top_k,
|
185 |
+
# top_p=top_p,
|
186 |
+
# repetition_penalty=repetition_penalty,
|
187 |
+
# last_n_tokens=last_n_tokens,
|
188 |
+
# seed=seed,
|
189 |
+
# threads=threads,
|
190 |
+
# batch_size=batch_size,
|
191 |
+
# reset=True
|
192 |
+
# ):
|
193 |
+
# self.temperature = temperature
|
194 |
+
# self.top_k = top_k
|
195 |
+
# self.top_p = top_p
|
196 |
+
# self.repetition_penalty = repetition_penalty# repetition_penalty
|
197 |
+
# self.last_n_tokens = last_n_tokens
|
198 |
+
# self.seed = seed
|
199 |
+
# self.threads = threads
|
200 |
+
# self.batch_size = batch_size
|
201 |
+
# self.reset = reset
|
202 |
+
|
203 |
class CtransGenGenerationConfig:
|
204 |
def __init__(self, temperature=temperature,
|
205 |
top_k=top_k,
|
206 |
top_p=top_p,
|
207 |
+
repeat_penalty=repetition_penalty,
|
208 |
+
#last_n_tokens=last_n_tokens,
|
209 |
seed=seed,
|
210 |
+
stream=stream,
|
211 |
+
max_tokens=max_new_tokens
|
212 |
+
#threads=threads,
|
213 |
+
#batch_size=batch_size,
|
214 |
+
#reset=True
|
215 |
):
|
216 |
self.temperature = temperature
|
217 |
self.top_k = top_k
|
218 |
self.top_p = top_p
|
219 |
+
self.repeat_penalty = repeat_penalty
|
220 |
+
#self.last_n_tokens = last_n_tokens
|
221 |
self.seed = seed
|
222 |
+
self.max_tokens=max_tokens
|
223 |
+
self.stream = stream
|
224 |
+
#self.threads = threads
|
225 |
+
#self.batch_size = batch_size
|
226 |
+
#self.reset = reset
|
227 |
|
228 |
def update_temp(self, new_value):
|
229 |
self.temperature = new_value
|
|
|
419 |
return history, docs_content_string, instruction_prompt_out
|
420 |
|
421 |
# Chat functions
|
422 |
+
# def produce_streaming_answer_chatbot(history, full_prompt, model_type,
|
423 |
+
# temperature=temperature,
|
424 |
+
# max_new_tokens=max_new_tokens,
|
425 |
+
# sample=sample,
|
426 |
+
# repetition_penalty=repetition_penalty,
|
427 |
+
# top_p=top_p,
|
428 |
+
# top_k=top_k
|
429 |
+
# ):
|
430 |
+
# #print("Model type is: ", model_type)
|
431 |
+
|
432 |
+
# #if not full_prompt.strip():
|
433 |
+
# # if history is None:
|
434 |
+
# # history = []
|
435 |
+
|
436 |
+
# # return history
|
437 |
+
|
438 |
+
# if model_type == "Flan Alpaca (small, fast)":
|
439 |
+
# # Get the model and tokenizer, and tokenize the user text.
|
440 |
+
# model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device) # return_attention_mask=False was added
|
441 |
+
|
442 |
+
# # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
|
443 |
+
# # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
|
444 |
+
# streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
|
445 |
+
# generate_kwargs = dict(
|
446 |
+
# model_inputs,
|
447 |
+
# streamer=streamer,
|
448 |
+
# max_new_tokens=max_new_tokens,
|
449 |
+
# do_sample=sample,
|
450 |
+
# repetition_penalty=repetition_penalty,
|
451 |
+
# top_p=top_p,
|
452 |
+
# temperature=temperature,
|
453 |
+
# top_k=top_k
|
454 |
+
# )
|
455 |
+
|
456 |
+
# print(generate_kwargs)
|
457 |
+
|
458 |
+
# t = Thread(target=model.generate, kwargs=generate_kwargs)
|
459 |
+
# t.start()
|
460 |
+
|
461 |
+
# # Pull the generated text from the streamer, and update the model output.
|
462 |
+
# start = time.time()
|
463 |
+
# NUM_TOKENS=0
|
464 |
+
# print('-'*4+'Start Generation'+'-'*4)
|
465 |
+
|
466 |
+
# history[-1][1] = ""
|
467 |
+
# for new_text in streamer:
|
468 |
+
# if new_text == None: new_text = ""
|
469 |
+
# history[-1][1] += new_text
|
470 |
+
# NUM_TOKENS+=1
|
471 |
+
# yield history
|
472 |
+
|
473 |
+
# time_generate = time.time() - start
|
474 |
+
# print('\n')
|
475 |
+
# print('-'*4+'End Generation'+'-'*4)
|
476 |
+
# print(f'Num of generated tokens: {NUM_TOKENS}')
|
477 |
+
# print(f'Time for complete generation: {time_generate}s')
|
478 |
+
# print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
|
479 |
+
# print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
|
480 |
+
|
481 |
+
# elif model_type == "Mistral Open Orca (larger, slow)":
|
482 |
+
# tokens = model.tokenize(full_prompt)
|
483 |
+
|
484 |
+
# gen_config = CtransGenGenerationConfig()
|
485 |
+
# gen_config.update_temp(temperature)
|
486 |
+
|
487 |
+
# print(vars(gen_config))
|
488 |
+
|
489 |
+
# # Pull the generated text from the streamer, and update the model output.
|
490 |
+
# start = time.time()
|
491 |
+
# NUM_TOKENS=0
|
492 |
+
# print('-'*4+'Start Generation'+'-'*4)
|
493 |
+
|
494 |
+
# history[-1][1] = ""
|
495 |
+
# for new_text in model.generate(tokens, **vars(gen_config)): #CtransGen_generate(prompt=full_prompt)#, config=CtransGenGenerationConfig()): # #top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty,
|
496 |
+
# if new_text == None: new_text = ""
|
497 |
+
# history[-1][1] += model.detokenize(new_text) #new_text
|
498 |
+
# NUM_TOKENS+=1
|
499 |
+
# yield history
|
500 |
+
|
501 |
+
# time_generate = time.time() - start
|
502 |
+
# print('\n')
|
503 |
+
# print('-'*4+'End Generation'+'-'*4)
|
504 |
+
# print(f'Num of generated tokens: {NUM_TOKENS}')
|
505 |
+
# print(f'Time for complete generation: {time_generate}s')
|
506 |
+
# print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
|
507 |
+
# print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
|
508 |
+
|
509 |
+
|
510 |
def produce_streaming_answer_chatbot(history, full_prompt, model_type,
|
511 |
temperature=temperature,
|
512 |
max_new_tokens=max_new_tokens,
|
|
|
567 |
print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
|
568 |
|
569 |
elif model_type == "Mistral Open Orca (larger, slow)":
|
570 |
+
#tokens = model.tokenize(full_prompt)
|
571 |
+
|
572 |
+
temp = ""
|
573 |
|
574 |
gen_config = CtransGenGenerationConfig()
|
575 |
gen_config.update_temp(temperature)
|
|
|
581 |
NUM_TOKENS=0
|
582 |
print('-'*4+'Start Generation'+'-'*4)
|
583 |
|
584 |
+
output = model(
|
585 |
+
full_prompt, **vars(gen_config))
|
586 |
+
|
587 |
history[-1][1] = ""
|
588 |
+
for out in output:
|
589 |
+
|
590 |
+
if "choices" in out and len(out["choices"]) > 0 and "text" in out["choices"][0]:
|
591 |
+
history[-1][1] += out["choices"][0]["text"]
|
592 |
+
NUM_TOKENS+=1
|
593 |
+
yield history
|
594 |
+
else:
|
595 |
+
print(f"Unexpected output structure: {out}")
|
596 |
+
|
597 |
time_generate = time.time() - start
|
598 |
print('\n')
|
599 |
print('-'*4+'End Generation'+'-'*4)
|
|
|
602 |
print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
|
603 |
print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
|
604 |
|
605 |
+
|
606 |
# Chat helper functions
|
607 |
|
608 |
def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_model=""):#keyword_model): # new_question_keywords,
|
|
|
778 |
# 3rd level check on retrieved docs with SVM retriever
|
779 |
|
780 |
svm_retriever = SVMRetriever.from_texts(content_keep, embeddings, k = k_val)
|
781 |
+
svm_result = svm_retriever.invoke(new_question_kworded)
|
782 |
|
783 |
|
784 |
svm_rank=[]
|
|
|
1158 |
return gr.update(interactive=True)
|
1159 |
|
1160 |
def update_message(dropdown_value):
|
1161 |
+
return gr.Textbox(value=dropdown_value)
|
1162 |
|
1163 |
def hide_block():
|
1164 |
+
return gr.Radio(visible=False)
|
1165 |
|
1166 |
# Vote function
|
1167 |
|
chatfuncs/ingest.py
CHANGED
@@ -21,7 +21,7 @@ from pypdf import PdfReader
|
|
21 |
PandasDataFrame = Type[pd.DataFrame]
|
22 |
|
23 |
split_strat = ["\n\n", "\n", ". ", "! ", "? "]
|
24 |
-
chunk_size =
|
25 |
chunk_overlap = 0
|
26 |
start_index = True
|
27 |
|
|
|
21 |
PandasDataFrame = Type[pd.DataFrame]
|
22 |
|
23 |
split_strat = ["\n\n", "\n", ". ", "! ", "? "]
|
24 |
+
chunk_size = 300
|
25 |
chunk_overlap = 0
|
26 |
start_index = True
|
27 |
|
requirements.txt
CHANGED
@@ -3,15 +3,18 @@ langchain-community
|
|
3 |
beautifulsoup4
|
4 |
pandas
|
5 |
transformers
|
6 |
-
--extra-index-url https://
|
7 |
-
torch
|
|
|
8 |
sentence_transformers
|
9 |
faiss-cpu
|
10 |
pypdf
|
11 |
python-docx
|
12 |
-
ctransformers[cuda]
|
13 |
keybert
|
14 |
span_marker
|
15 |
gensim
|
16 |
-
gradio==
|
17 |
-
gradio_client
|
|
|
|
|
|
3 |
beautifulsoup4
|
4 |
pandas
|
5 |
transformers
|
6 |
+
llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
7 |
+
#torch \
|
8 |
+
#--extra-index-url https://download.pytorch.org/whl/cu121
|
9 |
sentence_transformers
|
10 |
faiss-cpu
|
11 |
pypdf
|
12 |
python-docx
|
13 |
+
#ctransformers[cuda]
|
14 |
keybert
|
15 |
span_marker
|
16 |
gensim
|
17 |
+
gradio==4.31.5
|
18 |
+
gradio_client
|
19 |
+
nltk
|
20 |
+
scipy<1.13
|