Farid Karimli committed on
Commit
1052297
·
2 Parent(s): c26167a ccfbb8c

Merge branch 'dev_branch' into text_extraction

Browse files
.github/workflows/push_to_hf_space_prototype.yml CHANGED
@@ -1,20 +1,21 @@
1
  name: Push Prototype to HuggingFace
2
 
3
  on:
4
- pull_request:
5
- branches:
6
- - dev_branch
7
-
 
8
 
9
  jobs:
10
- build:
11
  runs-on: ubuntu-latest
12
  steps:
13
- - name: Deploy Prototype to HuggingFace
14
- uses: nateraw/[email protected]
15
- with:
16
- github_repo_id: DL4DS/dl4ds_tutor
17
- huggingface_repo_id: dl4ds/tutor_dev
18
- repo_type: space
19
- space_sdk: static
20
- hf_token: ${{ secrets.HF_TOKEN }}
 
1
  name: Push Prototype to HuggingFace
2
 
3
  on:
4
+ push:
5
+ branches: [dev_branch]
6
+
7
+ # run this workflow manually from the Actions tab
8
+ workflow_dispatch:
9
 
10
  jobs:
11
+ sync-to-hub:
12
  runs-on: ubuntu-latest
13
  steps:
14
+ - uses: actions/checkout@v4
15
+ with:
16
+ fetch-depth: 0
17
+ lfs: true
18
+ - name: Deploy Prototype to HuggingFace
19
+ env:
20
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
21
+ run: git push https://trgardos:$HF_TOKEN@huggingface.co/spaces/dl4ds/tutor_dev dev_branch:main
.vscode/launch.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "name": "Python Debugger: Chainlit run main.py",
9
+ "type": "debugpy",
10
+ "request": "launch",
11
+ "program": "${workspaceFolder}/.venv/bin/chainlit",
12
+ "console": "integratedTerminal",
13
+ "args": ["run", "main.py"],
14
+ "cwd": "${workspaceFolder}/code",
15
+ "justMyCode": true
16
+ },
17
+ { "name":"Python Debugger: Module store_manager",
18
+ "type":"debugpy",
19
+ "request":"launch",
20
+ "module":"modules.vectorstore.store_manager",
21
+ "env": {"PYTHONPATH": "${workspaceFolder}/code"},
22
+ "cwd": "${workspaceFolder}/code",
23
+ "justMyCode": true
24
+ },
25
+ {
26
+ "name": "Python Debugger: Module data_loader",
27
+ "type": "debugpy",
28
+ "request": "launch",
29
+ "module": "modules.dataloader.data_loader",
30
+ "env": {"PYTHONPATH": "${workspaceFolder}/code"},
31
+ "cwd": "${workspaceFolder}/code",
32
+ "justMyCode": true
33
+ }
34
+ ]
35
+ }
.vscode/tasks.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // See https://go.microsoft.com/fwlink/?LinkId=733558
3
+ // for the documentation about the tasks.json format
4
+ "version": "2.0.0",
5
+ "tasks": [
6
+ {
7
+ "label": "echo",
8
+ "type": "shell",
9
+ "command": "echo ${workspaceFolder}; ls ${workspaceFolder}/code",
10
+ "problemMatcher": []
11
+ }
12
+ ]
13
+ }
code/.chainlit/translations/en-US.json DELETED
@@ -1,229 +0,0 @@
1
- {
2
- "components": {
3
- "atoms": {
4
- "buttons": {
5
- "userButton": {
6
- "menu": {
7
- "settings": "Settings",
8
- "settingsKey": "S",
9
- "APIKeys": "API Keys",
10
- "logout": "Logout"
11
- }
12
- }
13
- }
14
- },
15
- "molecules": {
16
- "newChatButton": {
17
- "newChat": "New Chat"
18
- },
19
- "tasklist": {
20
- "TaskList": {
21
- "title": "\ud83d\uddd2\ufe0f Task List",
22
- "loading": "Loading...",
23
- "error": "An error occurred"
24
- }
25
- },
26
- "attachments": {
27
- "cancelUpload": "Cancel upload",
28
- "removeAttachment": "Remove attachment"
29
- },
30
- "newChatDialog": {
31
- "createNewChat": "Create new chat?",
32
- "clearChat": "This will clear the current messages and start a new chat.",
33
- "cancel": "Cancel",
34
- "confirm": "Confirm"
35
- },
36
- "settingsModal": {
37
- "settings": "Settings",
38
- "expandMessages": "Expand Messages",
39
- "hideChainOfThought": "Hide Chain of Thought",
40
- "darkMode": "Dark Mode"
41
- },
42
- "detailsButton": {
43
- "using": "Using",
44
- "used": "Used"
45
- },
46
- "auth": {
47
- "authLogin": {
48
- "title": "Login to access the app.",
49
- "form": {
50
- "email": "Email address",
51
- "password": "Password",
52
- "noAccount": "Don't have an account?",
53
- "alreadyHaveAccount": "Already have an account?",
54
- "signup": "Sign Up",
55
- "signin": "Sign In",
56
- "or": "OR",
57
- "continue": "Continue",
58
- "forgotPassword": "Forgot password?",
59
- "passwordMustContain": "Your password must contain:",
60
- "emailRequired": "email is a required field",
61
- "passwordRequired": "password is a required field"
62
- },
63
- "error": {
64
- "default": "Unable to sign in.",
65
- "signin": "Try signing in with a different account.",
66
- "oauthsignin": "Try signing in with a different account.",
67
- "redirect_uri_mismatch": "The redirect URI is not matching the oauth app configuration.",
68
- "oauthcallbackerror": "Try signing in with a different account.",
69
- "oauthcreateaccount": "Try signing in with a different account.",
70
- "emailcreateaccount": "Try signing in with a different account.",
71
- "callback": "Try signing in with a different account.",
72
- "oauthaccountnotlinked": "To confirm your identity, sign in with the same account you used originally.",
73
- "emailsignin": "The e-mail could not be sent.",
74
- "emailverify": "Please verify your email, a new email has been sent.",
75
- "credentialssignin": "Sign in failed. Check the details you provided are correct.",
76
- "sessionrequired": "Please sign in to access this page."
77
- }
78
- },
79
- "authVerifyEmail": {
80
- "almostThere": "You're almost there! We've sent an email to ",
81
- "verifyEmailLink": "Please click on the link in that email to complete your signup.",
82
- "didNotReceive": "Can't find the email?",
83
- "resendEmail": "Resend email",
84
- "goBack": "Go Back",
85
- "emailSent": "Email sent successfully.",
86
- "verifyEmail": "Verify your email address"
87
- },
88
- "providerButton": {
89
- "continue": "Continue with {{provider}}",
90
- "signup": "Sign up with {{provider}}"
91
- },
92
- "authResetPassword": {
93
- "newPasswordRequired": "New password is a required field",
94
- "passwordsMustMatch": "Passwords must match",
95
- "confirmPasswordRequired": "Confirm password is a required field",
96
- "newPassword": "New password",
97
- "confirmPassword": "Confirm password",
98
- "resetPassword": "Reset Password"
99
- },
100
- "authForgotPassword": {
101
- "email": "Email address",
102
- "emailRequired": "email is a required field",
103
- "emailSent": "Please check the email address {{email}} for instructions to reset your password.",
104
- "enterEmail": "Enter your email address and we will send you instructions to reset your password.",
105
- "resendEmail": "Resend email",
106
- "continue": "Continue",
107
- "goBack": "Go Back"
108
- }
109
- }
110
- },
111
- "organisms": {
112
- "chat": {
113
- "history": {
114
- "index": {
115
- "showHistory": "Show history",
116
- "lastInputs": "Last Inputs",
117
- "noInputs": "Such empty...",
118
- "loading": "Loading..."
119
- }
120
- },
121
- "inputBox": {
122
- "input": {
123
- "placeholder": "Type your message here..."
124
- },
125
- "speechButton": {
126
- "start": "Start recording",
127
- "stop": "Stop recording"
128
- },
129
- "SubmitButton": {
130
- "sendMessage": "Send message",
131
- "stopTask": "Stop Task"
132
- },
133
- "UploadButton": {
134
- "attachFiles": "Attach files"
135
- },
136
- "waterMark": {
137
- "text": "Built with"
138
- }
139
- },
140
- "Messages": {
141
- "index": {
142
- "running": "Running",
143
- "executedSuccessfully": "executed successfully",
144
- "failed": "failed",
145
- "feedbackUpdated": "Feedback updated",
146
- "updating": "Updating"
147
- }
148
- },
149
- "dropScreen": {
150
- "dropYourFilesHere": "Drop your files here"
151
- },
152
- "index": {
153
- "failedToUpload": "Failed to upload",
154
- "cancelledUploadOf": "Cancelled upload of",
155
- "couldNotReachServer": "Could not reach the server",
156
- "continuingChat": "Continuing previous chat"
157
- },
158
- "settings": {
159
- "settingsPanel": "Settings panel",
160
- "reset": "Reset",
161
- "cancel": "Cancel",
162
- "confirm": "Confirm"
163
- }
164
- },
165
- "threadHistory": {
166
- "sidebar": {
167
- "filters": {
168
- "FeedbackSelect": {
169
- "feedbackAll": "Feedback: All",
170
- "feedbackPositive": "Feedback: Positive",
171
- "feedbackNegative": "Feedback: Negative"
172
- },
173
- "SearchBar": {
174
- "search": "Search"
175
- }
176
- },
177
- "DeleteThreadButton": {
178
- "confirmMessage": "This will delete the thread as well as it's messages and elements.",
179
- "cancel": "Cancel",
180
- "confirm": "Confirm",
181
- "deletingChat": "Deleting chat",
182
- "chatDeleted": "Chat deleted"
183
- },
184
- "index": {
185
- "pastChats": "Past Chats"
186
- },
187
- "ThreadList": {
188
- "empty": "Empty...",
189
- "today": "Today",
190
- "yesterday": "Yesterday",
191
- "previous7days": "Previous 7 days",
192
- "previous30days": "Previous 30 days"
193
- },
194
- "TriggerButton": {
195
- "closeSidebar": "Close sidebar",
196
- "openSidebar": "Open sidebar"
197
- }
198
- },
199
- "Thread": {
200
- "backToChat": "Go back to chat",
201
- "chatCreatedOn": "This chat was created on"
202
- }
203
- },
204
- "header": {
205
- "chat": "Chat",
206
- "readme": "Readme"
207
- }
208
- }
209
- },
210
- "hooks": {
211
- "useLLMProviders": {
212
- "failedToFetchProviders": "Failed to fetch providers:"
213
- }
214
- },
215
- "pages": {
216
- "Design": {},
217
- "Env": {
218
- "savedSuccessfully": "Saved successfully",
219
- "requiredApiKeys": "Required API Keys",
220
- "requiredApiKeysInfo": "To use this app, the following API keys are required. The keys are stored on your device's local storage."
221
- },
222
- "Page": {
223
- "notPartOfProject": "You are not part of this project."
224
- },
225
- "ResumeButton": {
226
- "resumeChat": "Resume Chat"
227
- }
228
- }
229
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/main.py CHANGED
@@ -17,6 +17,7 @@ from modules.chat.helpers import (
17
  get_sources,
18
  get_history_chat_resume,
19
  get_history_setup_llm,
 
20
  )
21
  import copy
22
  from typing import Optional
@@ -55,7 +56,7 @@ class Chatbot:
55
  """
56
  self.config = config
57
 
58
- def _load_config(self):
59
  """
60
  Load the configuration from a YAML file.
61
  """
@@ -277,7 +278,7 @@ class Chatbot:
277
  rename_dict = {"Chatbot": "AI Tutor"}
278
  return rename_dict.get(orig_author, orig_author)
279
 
280
- async def start(self):
281
  """
282
  Start the chatbot, initialize settings widgets,
283
  and display and load previous conversation if chat logging is enabled.
@@ -285,6 +286,12 @@ class Chatbot:
285
 
286
  start_time = time.time()
287
 
 
 
 
 
 
 
288
  await self.make_llm_settings_widgets(self.config)
289
  user = cl.user_session.get("user")
290
  self.user = {
@@ -370,25 +377,6 @@ class Chatbot:
370
 
371
  answer = res.get("answer", res.get("result"))
372
 
373
- if cl_data._data_layer is not None:
374
- with cl_data._data_layer.client.step(
375
- type="run",
376
- name="step_info",
377
- thread_id=cl.context.session.thread_id,
378
- # tags=self.tags,
379
- ) as step:
380
-
381
- step.input = {"question": user_query_dict["input"]}
382
-
383
- step.output = {
384
- "chat_history": res.get("chat_history"),
385
- "context": res.get("context"),
386
- "answer": answer,
387
- "rephrase_prompt": res.get("rephrase_prompt"),
388
- "qa_prompt": res.get("qa_prompt"),
389
- }
390
- step.metadata = self.config
391
-
392
  answer_with_sources, source_elements, sources_dict = get_sources(
393
  res, answer, stream=stream, view_sources=view_sources
394
  )
@@ -425,14 +413,21 @@ class Chatbot:
425
  elements=source_elements,
426
  author=LLM,
427
  actions=actions,
 
428
  ).send()
429
 
430
  async def on_chat_resume(self, thread: ThreadDict):
 
431
  steps = thread["steps"]
432
- k = self.config["llm_params"]["memory_window"]
 
 
433
  conversation_list = get_history_chat_resume(steps, k, SYSTEM, LLM)
 
 
 
434
  cl.user_session.set("memory", conversation_list)
435
- await self.start()
436
 
437
  @cl.oauth_callback
438
  def auth_callback(
 
17
  get_sources,
18
  get_history_chat_resume,
19
  get_history_setup_llm,
20
+ get_last_config,
21
  )
22
  import copy
23
  from typing import Optional
 
56
  """
57
  self.config = config
58
 
59
+ async def _load_config(self):
60
  """
61
  Load the configuration from a YAML file.
62
  """
 
278
  rename_dict = {"Chatbot": "AI Tutor"}
279
  return rename_dict.get(orig_author, orig_author)
280
 
281
+ async def start(self, config=None):
282
  """
283
  Start the chatbot, initialize settings widgets,
284
  and display and load previous conversation if chat logging is enabled.
 
286
 
287
  start_time = time.time()
288
 
289
+ self.config = (
290
+ await self._load_config() if config is None else config
291
+ ) # Reload the configuration on chat resume
292
+
293
+ await self.make_llm_settings_widgets(self.config) # Reload the settings widgets
294
+
295
  await self.make_llm_settings_widgets(self.config)
296
  user = cl.user_session.get("user")
297
  self.user = {
 
377
 
378
  answer = res.get("answer", res.get("result"))
379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  answer_with_sources, source_elements, sources_dict = get_sources(
381
  res, answer, stream=stream, view_sources=view_sources
382
  )
 
413
  elements=source_elements,
414
  author=LLM,
415
  actions=actions,
416
+ metadata=self.config,
417
  ).send()
418
 
419
  async def on_chat_resume(self, thread: ThreadDict):
420
+ thread_config = None
421
  steps = thread["steps"]
422
+ k = self.config["llm_params"][
423
+ "memory_window"
424
+ ] # on resume, always use the default memory window
425
  conversation_list = get_history_chat_resume(steps, k, SYSTEM, LLM)
426
+ thread_config = get_last_config(
427
+ steps
428
+ ) # TODO: Returns None for now - which causes config to be reloaded with default values
429
  cl.user_session.set("memory", conversation_list)
430
+ await self.start(config=thread_config)
431
 
432
  @cl.oauth_callback
433
  def auth_callback(
code/modules/chat/chat_model_loader.py CHANGED
@@ -5,6 +5,8 @@ from langchain_community.llms import LlamaCpp
5
  import torch
6
  import transformers
7
  import os
 
 
8
  from langchain.callbacks.manager import CallbackManager
9
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
10
  from modules.config.constants import LLAMA_PATH
@@ -15,6 +17,14 @@ class ChatModelLoader:
15
  self.config = config
16
  self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
17
 
 
 
 
 
 
 
 
 
18
  def load_chat_model(self):
19
  if self.config["llm_params"]["llm_loader"] in [
20
  "gpt-3.5-turbo-1106",
@@ -24,6 +34,9 @@ class ChatModelLoader:
24
  llm = ChatOpenAI(model_name=self.config["llm_params"]["llm_loader"])
25
  elif self.config["llm_params"]["llm_loader"] == "local_llm":
26
  n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
 
 
 
27
  llm = LlamaCpp(
28
  model_path=LLAMA_PATH,
29
  n_batch=n_batch,
 
5
  import torch
6
  import transformers
7
  import os
8
+ from pathlib import Path
9
+ from huggingface_hub import hf_hub_download
10
  from langchain.callbacks.manager import CallbackManager
11
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
12
  from modules.config.constants import LLAMA_PATH
 
17
  self.config = config
18
  self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
19
 
20
+ def _verify_model_cache(self, model_cache_path):
21
+ hf_hub_download(
22
+ repo_id=self.config["llm_params"]["local_llm_params"]["repo_id"],
23
+ filename=self.config["llm_params"]["local_llm_params"]["filename"],
24
+ cache_dir=model_cache_path,
25
+ )
26
+ return str(list(Path(model_cache_path).glob("*/snapshots/*/*.gguf"))[0])
27
+
28
  def load_chat_model(self):
29
  if self.config["llm_params"]["llm_loader"] in [
30
  "gpt-3.5-turbo-1106",
 
34
  llm = ChatOpenAI(model_name=self.config["llm_params"]["llm_loader"])
35
  elif self.config["llm_params"]["llm_loader"] == "local_llm":
36
  n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
37
+ model_path = self._verify_model_cache(
38
+ self.config["llm_params"]["local_llm_params"]["model"]
39
+ )
40
  llm = LlamaCpp(
41
  model_path=LLAMA_PATH,
42
  n_batch=n_batch,
code/modules/chat/helpers.py CHANGED
@@ -162,3 +162,8 @@ def get_history_setup_llm(memory_list):
162
  raise ValueError("Invalid message type")
163
 
164
  return conversation_list
 
 
 
 
 
 
162
  raise ValueError("Invalid message type")
163
 
164
  return conversation_list
165
+
166
+
167
+ def get_last_config(steps):
168
+ # TODO: Implement this function
169
+ return None
code/modules/config/config.yml CHANGED
@@ -35,6 +35,9 @@ llm_params:
35
  temperature: 0.7 # float
36
  local_llm_params:
37
  temperature: 0.7 # float
 
 
 
38
  stream: False # bool
39
  pdf_reader: 'gpt' # str [llama, pymupdf, gpt]
40
 
@@ -54,4 +57,4 @@ splitter_options:
54
  chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
55
  front_chunks_to_remove : null # int or None
56
  last_chunks_to_remove : null # int or None
57
- delimiters_to_remove : ['\t', '\n', ' ', ' '] # list of strings
 
35
  temperature: 0.7 # float
36
  local_llm_params:
37
  temperature: 0.7 # float
38
+ repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
39
+ filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
40
+ pdf_reader: 'pymupdf' # str [llama, pymupdf, gpt]
41
  stream: False # bool
42
  pdf_reader: 'gpt' # str [llama, pymupdf, gpt]
43
 
 
57
  chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
58
  front_chunks_to_remove : null # int or None
59
  last_chunks_to_remove : null # int or None
60
+ delimiters_to_remove : ['\t', '\n', ' ', ' '] # list of strings
code/modules/config/constants.py CHANGED
@@ -18,6 +18,6 @@ opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me question
18
 
19
  # Model Paths
20
 
21
- LLAMA_PATH = "../storage/models/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf"
22
 
23
  RETRIEVER_HF_PATHS = {"RAGatouille": "XThomasBU/Colbert_Index"}
 
18
 
19
  # Model Paths
20
 
21
+ LLAMA_PATH = "../storage/models/tinyllama"
22
 
23
  RETRIEVER_HF_PATHS = {"RAGatouille": "XThomasBU/Colbert_Index"}
code/modules/dataloader/data_loader.py CHANGED
@@ -98,7 +98,6 @@ class FileReader:
98
  self.web_reader = HTMLReader()
99
  self.logger.info(f"Initialized FileReader with {kind} PDF reader and HTML reader")
100
 
101
-
102
  def extract_text_from_pdf(self, pdf_path):
103
  text = ""
104
  with open(pdf_path, "rb") as file:
@@ -315,6 +314,7 @@ class ChunkProcessor:
315
  return
316
 
317
  try:
 
318
  if file_path in self.document_data:
319
  self.logger.warning(f"File {file_name} already processed")
320
  documents = [Document(page_content=content) for content in self.document_data[file_path].values()]
 
98
  self.web_reader = HTMLReader()
99
  self.logger.info(f"Initialized FileReader with {kind} PDF reader and HTML reader")
100
 
 
101
  def extract_text_from_pdf(self, pdf_path):
102
  text = ""
103
  with open(pdf_path, "rb") as file:
 
314
  return
315
 
316
  try:
317
+
318
  if file_path in self.document_data:
319
  self.logger.warning(f"File {file_name} already processed")
320
  documents = [Document(page_content=content) for content in self.document_data[file_path].values()]