XThomasBU committed on
Commit
1e2550f
1 Parent(s): 9edf34d

setup and format instructions added

Browse files
.flake8 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [flake8]
2
+ max-line-length = 88
3
+ extend-ignore = E203, E266, E501, W503
.github/workflows/code_quality_check.yml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Code Quality and Security Checks
2
+
3
+ on:
4
+ push:
5
+ branches: [ main, dev_branch ]
6
+ pull_request:
7
+ branches: [ main, dev_branch ]
8
+
9
+ jobs:
10
+ code-quality:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+
15
+ - name: Set up Python
16
+ uses: actions/setup-python@v4
17
+ with:
18
+ python-version: '3.11'
19
+
20
+ - name: Install dependencies
21
+ run: |
22
+ python -m pip install --upgrade pip
23
+ pip install flake8 black bandit
24
+
25
+ - name: Run Black
26
+ run: black --check .
27
+
28
+ - name: Run Flake8
29
+ run: flake8 .
30
+
31
+ - name: Run Bandit
32
+ run: |
33
+ bandit -r .
.gitignore CHANGED
@@ -165,6 +165,7 @@ cython_debug/
165
  .ragatouille/*
166
  */__pycache__/*
167
  .chainlit/translations/
 
168
  storage/logs/*
169
  vectorstores/*
170
 
 
165
  .ragatouille/*
166
  */__pycache__/*
167
  .chainlit/translations/
168
+ code/.chainlit/translations/
169
  storage/logs/*
170
  vectorstores/*
171
 
README.md CHANGED
@@ -6,6 +6,8 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
6
 
7
  ## Running Locally
8
 
 
 
9
  1. **Clone the Repository**
10
  ```bash
11
  git clone https://github.com/DL4DS/dl4ds_tutor
@@ -27,7 +29,6 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
27
  python -m modules.vectorstore.store_manager
28
  ```
29
  - Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
30
- - Alternatively, you can set `["vectorstore"]["embedd_files"]` to `True` in the `code/modules/config/config.yaml` file, which will embed files from the storage directory every time you run the below chainlit command.
31
 
32
  5. **Run the Chainlit App**
33
  ```bash
@@ -81,4 +82,6 @@ docker run -it --rm -p 8000:8000 dev
81
 
82
  ## Contributing
83
 
84
- Please create an issue if you have any suggestions or improvements, and start working on it by creating a branch and by making a pull request to the main branch.
 
 
 
6
 
7
  ## Running Locally
8
 
9
+ Please view `docs/setup.md` for more information on setting up the project.
10
+
11
  1. **Clone the Repository**
12
  ```bash
13
  git clone https://github.com/DL4DS/dl4ds_tutor
 
29
  python -m modules.vectorstore.store_manager
30
  ```
31
  - Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
 
32
 
33
  5. **Run the Chainlit App**
34
  ```bash
 
82
 
83
  ## Contributing
84
 
85
+ Please create an issue if you have any suggestions or improvements. To start working on one, create a branch and open a pull request against the main branch.
86
+
87
+ Please view `docs/contribute.md` for more information on contributing.
code/.chainlit/config.toml CHANGED
@@ -49,6 +49,8 @@ auto_tag_thread = true
49
  # Sample rate of the audio
50
  sample_rate = 44100
51
 
 
 
52
  [UI]
53
  # Name of the assistant.
54
  name = "AI Tutor"
@@ -115,4 +117,4 @@ custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/
115
  #secondary = "#BDBDBD"
116
 
117
  [meta]
118
- generated_by = "1.1.304"
 
49
  # Sample rate of the audio
50
  sample_rate = 44100
51
 
52
+ edit_message = true
53
+
54
  [UI]
55
  # Name of the assistant.
56
  name = "AI Tutor"
 
117
  #secondary = "#BDBDBD"
118
 
119
  [meta]
120
+ generated_by = "1.1.306"
code/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .modules import *
 
 
code/chainlit.md CHANGED
@@ -1,10 +1,14 @@
1
- # Welcome to DL4DS Tutor! 🚀🤖
2
 
3
- Hi there, this is an LLM chatbot designed to help answer questions on the course content, built using Langchain and Chainlit.
4
- This is still very much a Work in Progress.
5
-
6
- ### --- Please wait while the Tutor loads... ---
7
 
8
  ## Useful Links 🔗
9
 
10
- - **Documentation:** [Chainlit Documentation](https://docs.chainlit.io) 📚
 
 
 
 
 
 
 
 
1
+ # Welcome to Chainlit! 🚀🤖
2
 
3
+ Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
 
 
 
4
 
5
  ## Useful Links 🔗
6
 
7
+ - **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
8
+ - **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
9
+
10
+ We can't wait to see what you create with Chainlit! Happy coding! 💻😊
11
+
12
+ ## Welcome screen
13
+
14
+ To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
code/main.py CHANGED
@@ -1,7 +1,6 @@
1
  import chainlit.data as cl_data
2
  import asyncio
3
  from modules.config.constants import (
4
- LLAMA_PATH,
5
  LITERAL_API_KEY_LOGGING,
6
  LITERAL_API_URL,
7
  )
@@ -9,7 +8,6 @@ from modules.chat_processor.literal_ai import CustomLiteralDataLayer
9
 
10
  import json
11
  import yaml
12
- import os
13
  from typing import Any, Dict, no_type_check
14
  import chainlit as cl
15
  from modules.chat.llm_tutor import LLMTutor
@@ -73,7 +71,14 @@ class Chatbot:
73
  start_time = time.time()
74
 
75
  llm_settings = cl.user_session.get("llm_settings", {})
76
- chat_profile, retriever_method, memory_window, llm_style, generate_follow_up, chunking_mode = (
 
 
 
 
 
 
 
77
  llm_settings.get("chat_model"),
78
  llm_settings.get("retriever_method"),
79
  llm_settings.get("memory_window"),
@@ -113,8 +118,6 @@ class Chatbot:
113
  ),
114
  )
115
 
116
- tags = [chat_profile, self.config["vectorstore"]["db_option"]]
117
-
118
  cl.user_session.set("chain", self.chain)
119
  cl.user_session.set("llm_tutor", self.llm_tutor)
120
 
@@ -180,7 +183,7 @@ class Chatbot:
180
  cl.input_widget.Select(
181
  id="chunking_mode",
182
  label="Chunking mode",
183
- values=['fixed', 'semantic'],
184
  initial_index=1,
185
  ),
186
  cl.input_widget.Switch(
@@ -241,7 +244,8 @@ class Chatbot:
241
  ) # see if the thread has any steps
242
  if thread.steps or len(thread.steps) > 0:
243
  return None
244
- except:
 
245
  return [
246
  cl.Starter(
247
  label="recording on CNNs?",
@@ -294,10 +298,18 @@ class Chatbot:
294
 
295
  await self.make_llm_settings_widgets(self.config)
296
  user = cl.user_session.get("user")
297
- self.user = {
298
- "user_id": user.identifier,
299
- "session_id": cl.context.session.thread_id,
300
- }
 
 
 
 
 
 
 
 
301
 
302
  memory = cl.user_session.get("memory", [])
303
 
@@ -355,7 +367,7 @@ class Chatbot:
355
  llm_settings = cl.user_session.get("llm_settings", {})
356
  view_sources = llm_settings.get("view_sources", False)
357
  stream = llm_settings.get("stream_response", False)
358
- steam = False # Fix streaming
359
  user_query_dict = {"input": message.content}
360
  # Define the base configuration
361
  chain_config = {
 
1
  import chainlit.data as cl_data
2
  import asyncio
3
  from modules.config.constants import (
 
4
  LITERAL_API_KEY_LOGGING,
5
  LITERAL_API_URL,
6
  )
 
8
 
9
  import json
10
  import yaml
 
11
  from typing import Any, Dict, no_type_check
12
  import chainlit as cl
13
  from modules.chat.llm_tutor import LLMTutor
 
71
  start_time = time.time()
72
 
73
  llm_settings = cl.user_session.get("llm_settings", {})
74
+ (
75
+ chat_profile,
76
+ retriever_method,
77
+ memory_window,
78
+ llm_style,
79
+ generate_follow_up,
80
+ chunking_mode,
81
+ ) = (
82
  llm_settings.get("chat_model"),
83
  llm_settings.get("retriever_method"),
84
  llm_settings.get("memory_window"),
 
118
  ),
119
  )
120
 
 
 
121
  cl.user_session.set("chain", self.chain)
122
  cl.user_session.set("llm_tutor", self.llm_tutor)
123
 
 
183
  cl.input_widget.Select(
184
  id="chunking_mode",
185
  label="Chunking mode",
186
+ values=["fixed", "semantic"],
187
  initial_index=1,
188
  ),
189
  cl.input_widget.Switch(
 
244
  ) # see if the thread has any steps
245
  if thread.steps or len(thread.steps) > 0:
246
  return None
247
+ except Exception as e:
248
+ print(e)
249
  return [
250
  cl.Starter(
251
  label="recording on CNNs?",
 
298
 
299
  await self.make_llm_settings_widgets(self.config)
300
  user = cl.user_session.get("user")
301
+
302
+ try:
303
+ self.user = {
304
+ "user_id": user.identifier,
305
+ "session_id": cl.context.session.thread_id,
306
+ }
307
+ except Exception as e:
308
+ print(e)
309
+ self.user = {
310
+ "user_id": "guest",
311
+ "session_id": cl.context.session.thread_id,
312
+ }
313
 
314
  memory = cl.user_session.get("memory", [])
315
 
 
367
  llm_settings = cl.user_session.get("llm_settings", {})
368
  view_sources = llm_settings.get("view_sources", False)
369
  stream = llm_settings.get("stream_response", False)
370
+ stream = False # Fix streaming
371
  user_query_dict = {"input": message.content}
372
  # Define the base configuration
373
  chain_config = {
code/modules/chat/chat_model_loader.py CHANGED
@@ -1,15 +1,8 @@
1
  from langchain_openai import ChatOpenAI
2
- from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
3
- from transformers import AutoTokenizer, TextStreamer
4
  from langchain_community.llms import LlamaCpp
5
- import torch
6
- import transformers
7
  import os
8
  from pathlib import Path
9
  from huggingface_hub import hf_hub_download
10
- from langchain.callbacks.manager import CallbackManager
11
- from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
12
- from modules.config.constants import LLAMA_PATH
13
 
14
 
15
  class ChatModelLoader:
@@ -38,7 +31,7 @@ class ChatModelLoader:
38
  self.config["llm_params"]["local_llm_params"]["model"]
39
  )
40
  llm = LlamaCpp(
41
- model_path=LLAMA_PATH,
42
  n_batch=n_batch,
43
  n_ctx=2048,
44
  f16_kv=True,
 
1
  from langchain_openai import ChatOpenAI
 
 
2
  from langchain_community.llms import LlamaCpp
 
 
3
  import os
4
  from pathlib import Path
5
  from huggingface_hub import hf_hub_download
 
 
 
6
 
7
 
8
  class ChatModelLoader:
 
31
  self.config["llm_params"]["local_llm_params"]["model"]
32
  )
33
  llm = LlamaCpp(
34
+ model_path=model_path,
35
  n_batch=n_batch,
36
  n_ctx=2048,
37
  f16_kv=True,
code/modules/chat/langchain/__init__.py ADDED
File without changes
code/modules/chat/langchain/langchain_rag.py CHANGED
@@ -1,17 +1,22 @@
1
  from langchain_core.prompts import ChatPromptTemplate
2
 
3
- from modules.chat.langchain.utils import *
4
- from langchain.memory import ChatMessageHistory
5
  from modules.chat.base import BaseRAG
6
  from langchain_core.prompts import PromptTemplate
7
- from langchain.memory import (
8
- ConversationBufferWindowMemory,
9
- ConversationSummaryBufferMemory,
 
 
 
 
 
 
 
 
10
  )
11
 
12
- import chainlit as cl
13
- from langchain_community.chat_models import ChatOpenAI
14
-
15
 
16
  class Langchain_RAG_V1(BaseRAG):
17
 
 
1
  from langchain_core.prompts import ChatPromptTemplate
2
 
3
+ # from modules.chat.langchain.utils import
4
+ from langchain_community.chat_message_histories import ChatMessageHistory
5
  from modules.chat.base import BaseRAG
6
  from langchain_core.prompts import PromptTemplate
7
+ from langchain.memory import ConversationBufferWindowMemory
8
+ from langchain_core.runnables.utils import ConfigurableFieldSpec
9
+ from .utils import (
10
+ CustomConversationalRetrievalChain,
11
+ create_history_aware_retriever,
12
+ create_stuff_documents_chain,
13
+ create_retrieval_chain,
14
+ return_questions,
15
+ CustomRunnableWithHistory,
16
+ BaseChatMessageHistory,
17
+ InMemoryHistory,
18
  )
19
 
 
 
 
20
 
21
  class Langchain_RAG_V1(BaseRAG):
22
 
code/modules/chat/langchain/utils.py CHANGED
@@ -1,53 +1,29 @@
1
  from typing import Any, Dict, List, Union, Tuple, Optional
2
- from langchain_core.messages import (
3
- BaseMessage,
4
- AIMessage,
5
- FunctionMessage,
6
- HumanMessage,
7
- )
8
-
9
  from langchain_core.prompts.base import BasePromptTemplate, format_document
10
- from langchain_core.prompts.chat import MessagesPlaceholder
11
  from langchain_core.output_parsers import StrOutputParser
12
  from langchain_core.output_parsers.base import BaseOutputParser
13
  from langchain_core.retrievers import BaseRetriever, RetrieverOutput
14
  from langchain_core.language_models import LanguageModelLike
15
  from langchain_core.runnables import Runnable, RunnableBranch, RunnablePassthrough
16
  from langchain_core.runnables.history import RunnableWithMessageHistory
17
- from langchain_core.runnables.utils import ConfigurableFieldSpec
18
  from langchain_core.chat_history import BaseChatMessageHistory
19
  from langchain_core.pydantic_v1 import BaseModel, Field
20
  from langchain.chains.combine_documents.base import (
21
  DEFAULT_DOCUMENT_PROMPT,
22
  DEFAULT_DOCUMENT_SEPARATOR,
23
  DOCUMENTS_KEY,
24
- BaseCombineDocumentsChain,
25
  _validate_prompt,
26
  )
27
- from langchain.chains.llm import LLMChain
28
- from langchain_core.callbacks import Callbacks
29
- from langchain_core.documents import Document
30
-
31
-
32
- CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage]
33
-
34
  from langchain_core.runnables.config import RunnableConfig
35
- from langchain_core.messages import BaseMessage
36
-
37
-
38
- from langchain_core.output_parsers import StrOutputParser
39
  from langchain_core.prompts import ChatPromptTemplate
40
  from langchain_community.chat_models import ChatOpenAI
41
-
42
- from langchain.chains import RetrievalQA, ConversationalRetrievalChain
43
- from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
44
-
45
- from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
46
  from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
47
  import inspect
48
- from langchain.chains.conversational_retrieval.base import _get_chat_history
49
  from langchain_core.messages import BaseMessage
50
 
 
 
51
 
52
  class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
53
 
 
1
  from typing import Any, Dict, List, Union, Tuple, Optional
 
 
 
 
 
 
 
2
  from langchain_core.prompts.base import BasePromptTemplate, format_document
 
3
  from langchain_core.output_parsers import StrOutputParser
4
  from langchain_core.output_parsers.base import BaseOutputParser
5
  from langchain_core.retrievers import BaseRetriever, RetrieverOutput
6
  from langchain_core.language_models import LanguageModelLike
7
  from langchain_core.runnables import Runnable, RunnableBranch, RunnablePassthrough
8
  from langchain_core.runnables.history import RunnableWithMessageHistory
 
9
  from langchain_core.chat_history import BaseChatMessageHistory
10
  from langchain_core.pydantic_v1 import BaseModel, Field
11
  from langchain.chains.combine_documents.base import (
12
  DEFAULT_DOCUMENT_PROMPT,
13
  DEFAULT_DOCUMENT_SEPARATOR,
14
  DOCUMENTS_KEY,
 
15
  _validate_prompt,
16
  )
 
 
 
 
 
 
 
17
  from langchain_core.runnables.config import RunnableConfig
 
 
 
 
18
  from langchain_core.prompts import ChatPromptTemplate
19
  from langchain_community.chat_models import ChatOpenAI
20
+ from langchain.chains import ConversationalRetrievalChain
 
 
 
 
21
  from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
22
  import inspect
 
23
  from langchain_core.messages import BaseMessage
24
 
25
+ CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage]
26
+
27
 
28
  class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
29
 
code/modules/chat/llm_tutor.py CHANGED
@@ -3,7 +3,6 @@ from modules.chat.chat_model_loader import ChatModelLoader
3
  from modules.vectorstore.store_manager import VectorStoreManager
4
  from modules.retriever.retriever import Retriever
5
  from modules.chat.langchain.langchain_rag import (
6
- Langchain_RAG_V1,
7
  Langchain_RAG_V2,
8
  QuestionGenerator,
9
  )
@@ -28,9 +27,11 @@ class LLMTutor:
28
  self.rephrase_prompt = get_prompt(
29
  config, "rephrase"
30
  ) # Initialize rephrase_prompt
31
- if self.config["vectorstore"]["embedd_files"]:
32
- self.vector_db.create_database()
33
- self.vector_db.save_database()
 
 
34
 
35
  def update_llm(self, old_config, new_config):
36
  """
@@ -48,9 +49,11 @@ class LLMTutor:
48
  self.vector_db = VectorStoreManager(
49
  self.config, logger=self.logger
50
  ).load_database() # Reinitialize VectorStoreManager if vectorstore changes
51
- if self.config["vectorstore"]["embedd_files"]:
52
- self.vector_db.create_database()
53
- self.vector_db.save_database()
 
 
54
 
55
  if "llm_params.llm_style" in changes:
56
  self.qa_prompt = get_prompt(
 
3
  from modules.vectorstore.store_manager import VectorStoreManager
4
  from modules.retriever.retriever import Retriever
5
  from modules.chat.langchain.langchain_rag import (
 
6
  Langchain_RAG_V2,
7
  QuestionGenerator,
8
  )
 
27
  self.rephrase_prompt = get_prompt(
28
  config, "rephrase"
29
  ) # Initialize rephrase_prompt
30
+
31
+ # TODO: Removed this functionality for now, don't know if we need it
32
+ # if self.config["vectorstore"]["embedd_files"]:
33
+ # self.vector_db.create_database()
34
+ # self.vector_db.save_database()
35
 
36
  def update_llm(self, old_config, new_config):
37
  """
 
49
  self.vector_db = VectorStoreManager(
50
  self.config, logger=self.logger
51
  ).load_database() # Reinitialize VectorStoreManager if vectorstore changes
52
+
53
+ # TODO: Removed this functionality for now, don't know if we need it
54
+ # if self.config["vectorstore"]["embedd_files"]:
55
+ # self.vector_db.create_database()
56
+ # self.vector_db.save_database()
57
 
58
  if "llm_params.llm_style" in changes:
59
  self.qa_prompt = get_prompt(
code/modules/chat_processor/literal_ai.py CHANGED
@@ -1,44 +1,7 @@
1
- from chainlit.data import ChainlitDataLayer, queue_until_user_message
2
 
3
 
4
  # update custom methods here (Ref: https://github.com/Chainlit/chainlit/blob/4b533cd53173bcc24abe4341a7108f0070d60099/backend/chainlit/data/__init__.py)
5
  class CustomLiteralDataLayer(ChainlitDataLayer):
6
  def __init__(self, **kwargs):
7
  super().__init__(**kwargs)
8
-
9
- @queue_until_user_message()
10
- async def create_step(self, step_dict: "StepDict"):
11
- metadata = dict(
12
- step_dict.get("metadata", {}),
13
- **{
14
- "waitForAnswer": step_dict.get("waitForAnswer"),
15
- "language": step_dict.get("language"),
16
- "showInput": step_dict.get("showInput"),
17
- },
18
- )
19
-
20
- step: LiteralStepDict = {
21
- "createdAt": step_dict.get("createdAt"),
22
- "startTime": step_dict.get("start"),
23
- "endTime": step_dict.get("end"),
24
- "generation": step_dict.get("generation"),
25
- "id": step_dict.get("id"),
26
- "parentId": step_dict.get("parentId"),
27
- "name": step_dict.get("name"),
28
- "threadId": step_dict.get("threadId"),
29
- "type": step_dict.get("type"),
30
- "tags": step_dict.get("tags"),
31
- "metadata": metadata,
32
- }
33
- if step_dict.get("input"):
34
- step["input"] = {"content": step_dict.get("input")}
35
- if step_dict.get("output"):
36
- step["output"] = {"content": step_dict.get("output")}
37
- if step_dict.get("isError"):
38
- step["error"] = step_dict.get("output")
39
-
40
- # print("\n\n\n")
41
- # print("Step: ", step)
42
- # print("\n\n\n")
43
-
44
- await self.client.api.send_steps([step])
 
1
+ from chainlit.data import ChainlitDataLayer
2
 
3
 
4
  # update custom methods here (Ref: https://github.com/Chainlit/chainlit/blob/4b533cd53173bcc24abe4341a7108f0070d60099/backend/chainlit/data/__init__.py)
5
  class CustomLiteralDataLayer(ChainlitDataLayer):
6
  def __init__(self, **kwargs):
7
  super().__init__(**kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/modules/config/config.yml CHANGED
@@ -4,7 +4,7 @@ device: 'cpu' # str [cuda, cpu]
4
 
5
  vectorstore:
6
  load_from_HF: True # bool
7
- embedd_files: False # bool
8
  data_path: '../storage/data' # str
9
  url_file_path: '../storage/data/urls.txt' # str
10
  expand_urls: True # bool
@@ -37,7 +37,6 @@ llm_params:
37
  temperature: 0.7 # float
38
  repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
39
  filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
40
- pdf_reader: 'pymupdf' # str [llama, pymupdf, gpt]
41
  stream: False # bool
42
  pdf_reader: 'gpt' # str [llama, pymupdf, gpt]
43
 
 
4
 
5
  vectorstore:
6
  load_from_HF: True # bool
7
+ reparse_files: True # bool
8
  data_path: '../storage/data' # str
9
  url_file_path: '../storage/data/urls.txt' # str
10
  expand_urls: True # bool
 
37
  temperature: 0.7 # float
38
  repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
39
  filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
 
40
  stream: False # bool
41
  pdf_reader: 'gpt' # str [llama, pymupdf, gpt]
42
 
code/modules/config/constants.py CHANGED
@@ -14,10 +14,11 @@ LITERAL_API_URL = os.getenv("LITERAL_API_URL")
14
  OAUTH_GOOGLE_CLIENT_ID = os.getenv("OAUTH_GOOGLE_CLIENT_ID")
15
  OAUTH_GOOGLE_CLIENT_SECRET = os.getenv("OAUTH_GOOGLE_CLIENT_SECRET")
16
 
17
- opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me questions about the course logistics, course content, about the final project, or anything else!"
 
 
 
18
 
19
  # Model Paths
20
 
21
  LLAMA_PATH = "../storage/models/tinyllama"
22
-
23
- RETRIEVER_HF_PATHS = {"RAGatouille": "XThomasBU/Colbert_Index"}
 
14
  OAUTH_GOOGLE_CLIENT_ID = os.getenv("OAUTH_GOOGLE_CLIENT_ID")
15
  OAUTH_GOOGLE_CLIENT_SECRET = os.getenv("OAUTH_GOOGLE_CLIENT_SECRET")
16
 
17
+ opening_message = "Hey, What Can I Help You With?\n\nYou can me ask me questions about the course logistics, course content, about the final project, or anything else!"
18
+ chat_end_message = (
19
+ "I hope I was able to help you. If you have any more questions, feel free to ask!"
20
+ )
21
 
22
  # Model Paths
23
 
24
  LLAMA_PATH = "../storage/models/tinyllama"
 
 
code/modules/config/user_config.yml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ retriever:
2
+ retriever_hf_paths:
3
+ RAGatouille: "XThomasBU/Colbert_Index"
code/modules/dataloader/data_loader.py CHANGED
@@ -3,40 +3,25 @@ import re
3
  import requests
4
  import pysrt
5
  from langchain_community.document_loaders import (
6
- PyMuPDFLoader,
7
  Docx2txtLoader,
8
  YoutubeLoader,
9
- WebBaseLoader,
10
  TextLoader,
11
  )
12
- from langchain_community.document_loaders import UnstructuredMarkdownLoader
13
- from llama_parse import LlamaParse
14
  from langchain.schema import Document
15
  import logging
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
17
  from langchain_experimental.text_splitter import SemanticChunker
18
  from langchain_openai.embeddings import OpenAIEmbeddings
19
- from ragatouille import RAGPretrainedModel
20
- from langchain.chains import LLMChain
21
- from langchain_community.llms import OpenAI
22
- from langchain import PromptTemplate
23
  import json
24
  from concurrent.futures import ThreadPoolExecutor
25
  from urllib.parse import urljoin
26
  import html2text
27
  import bs4
28
- import tempfile
29
  import PyPDF2
30
  from modules.dataloader.pdf_readers.base import PDFReader
31
  from modules.dataloader.pdf_readers.llama import LlamaParser
32
  from modules.dataloader.pdf_readers.gpt import GPTParser
33
-
34
- try:
35
- from modules.dataloader.helpers import get_metadata, download_pdf_from_url
36
- from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
37
- except:
38
- from dataloader.helpers import get_metadata, download_pdf_from_url
39
- from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
40
 
41
  logger = logging.getLogger(__name__)
42
  BASE_DIR = os.getcwd()
@@ -65,11 +50,13 @@ class HTMLReader:
65
  href = href.replace("http", "https")
66
 
67
  absolute_url = urljoin(base_url, href)
68
- link['href'] = absolute_url
69
 
70
  resp = requests.head(absolute_url)
71
  if resp.status_code != 200:
72
- logger.warning(f"Link {absolute_url} is broken. Status code: {resp.status_code}")
 
 
73
 
74
  return str(soup)
75
 
@@ -85,6 +72,7 @@ class HTMLReader:
85
  else:
86
  return None
87
 
 
88
  class FileReader:
89
  def __init__(self, logger, kind):
90
  self.logger = logger
@@ -96,7 +84,9 @@ class FileReader:
96
  else:
97
  self.pdf_reader = PDFReader()
98
  self.web_reader = HTMLReader()
99
- self.logger.info(f"Initialized FileReader with {kind} PDF reader and HTML reader")
 
 
100
 
101
  def extract_text_from_pdf(self, pdf_path):
102
  text = ""
@@ -154,17 +144,20 @@ class ChunkProcessor:
154
  self.document_metadata = {}
155
  self.document_chunks_full = []
156
 
157
- if not config['vectorstore']['embedd_files']:
 
158
  self.load_document_data()
159
 
160
  if config["splitter_options"]["use_splitter"]:
161
  if config["splitter_options"]["chunking_mode"] == "fixed":
162
  if config["splitter_options"]["split_by_token"]:
163
- self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
164
- chunk_size=config["splitter_options"]["chunk_size"],
165
- chunk_overlap=config["splitter_options"]["chunk_overlap"],
166
- separators=config["splitter_options"]["chunk_separators"],
167
- disallowed_special=(),
 
 
168
  )
169
  else:
170
  self.splitter = RecursiveCharacterTextSplitter(
@@ -175,8 +168,7 @@ class ChunkProcessor:
175
  )
176
  else:
177
  self.splitter = SemanticChunker(
178
- OpenAIEmbeddings(),
179
- breakpoint_threshold_type="percentile"
180
  )
181
 
182
  else:
@@ -203,7 +195,10 @@ class ChunkProcessor:
203
  ):
204
  # TODO: Clear up this pipeline of re-adding metadata
205
  documents = [Document(page_content=documents, source=source, page=page)]
206
- if file_type == "pdf" and self.config["splitter_options"]["chunking_mode"] == "fixed":
 
 
 
207
  document_chunks = documents
208
  else:
209
  document_chunks = self.splitter.split_documents(documents)
@@ -229,6 +224,20 @@ class ChunkProcessor:
229
  "https://dl4ds.github.io/sp2024/lectures/",
230
  "https://dl4ds.github.io/sp2024/schedule/",
231
  ) # For any additional metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  with ThreadPoolExecutor() as executor:
233
  executor.map(
234
  self.process_file,
@@ -298,6 +307,7 @@ class ChunkProcessor:
298
  self.document_metadata[file_path] = file_metadata
299
 
300
  def process_file(self, file_path, file_index, file_reader, addl_metadata):
 
301
  file_name = os.path.basename(file_path)
302
 
303
  file_type = file_name.split(".")[-1]
@@ -314,10 +324,13 @@ class ChunkProcessor:
314
  return
315
 
316
  try:
317
-
318
  if file_path in self.document_data:
319
  self.logger.warning(f"File {file_name} already processed")
320
- documents = [Document(page_content=content) for content in self.document_data[file_path].values()]
 
 
 
321
  else:
322
  documents = read_methods[file_type](file_path)
323
 
@@ -370,22 +383,31 @@ class ChunkProcessor:
370
  json.dump(self.document_metadata, json_file, indent=4)
371
 
372
  def load_document_data(self):
373
- with open(
374
- f"{self.config['log_chunk_dir']}/docs/doc_content.json", "r"
375
- ) as json_file:
376
- self.document_data = json.load(json_file)
377
- with open(
378
- f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "r"
379
- ) as json_file:
380
- self.document_metadata = json.load(json_file)
381
- self.logger.info(
382
- f"Loaded document content from {self.config['log_chunk_dir']}/docs/doc_content.json. Total documents: {len(self.document_data)}"
383
- )
 
 
 
 
 
 
 
384
 
385
 
386
  class DataLoader:
387
  def __init__(self, config, logger=None):
388
- self.file_reader = FileReader(logger=logger, kind=config["llm_params"]["pdf_reader"])
 
 
389
  self.chunk_processor = ChunkProcessor(config, logger=logger)
390
 
391
  def get_chunks(self, uploaded_files, weblinks):
@@ -403,19 +425,22 @@ if __name__ == "__main__":
403
  with open("../code/modules/config/config.yml", "r") as f:
404
  config = yaml.safe_load(f)
405
 
406
- STORAGE_DIR = os.path.join(BASE_DIR, config['vectorstore']["data_path"])
407
  uploaded_files = [
408
- os.path.join(STORAGE_DIR, file) for file in os.listdir(STORAGE_DIR) if file != "urls.txt"
 
 
409
  ]
410
 
411
  data_loader = DataLoader(config, logger=logger)
412
  document_chunks, document_names, documents, document_metadata = (
413
  data_loader.get_chunks(
414
- ["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
 
 
415
  [],
416
  )
417
  )
418
 
419
  print(document_names[:5])
420
  print(len(document_chunks))
421
-
 
3
  import requests
4
  import pysrt
5
  from langchain_community.document_loaders import (
 
6
  Docx2txtLoader,
7
  YoutubeLoader,
 
8
  TextLoader,
9
  )
 
 
10
  from langchain.schema import Document
11
  import logging
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
13
  from langchain_experimental.text_splitter import SemanticChunker
14
  from langchain_openai.embeddings import OpenAIEmbeddings
 
 
 
 
15
  import json
16
  from concurrent.futures import ThreadPoolExecutor
17
  from urllib.parse import urljoin
18
  import html2text
19
  import bs4
 
20
  import PyPDF2
21
  from modules.dataloader.pdf_readers.base import PDFReader
22
  from modules.dataloader.pdf_readers.llama import LlamaParser
23
  from modules.dataloader.pdf_readers.gpt import GPTParser
24
+ from modules.dataloader.helpers import get_metadata
 
 
 
 
 
 
25
 
26
  logger = logging.getLogger(__name__)
27
  BASE_DIR = os.getcwd()
 
50
  href = href.replace("http", "https")
51
 
52
  absolute_url = urljoin(base_url, href)
53
+ link["href"] = absolute_url
54
 
55
  resp = requests.head(absolute_url)
56
  if resp.status_code != 200:
57
+ logger.warning(
58
+ f"Link {absolute_url} is broken. Status code: {resp.status_code}"
59
+ )
60
 
61
  return str(soup)
62
 
 
72
  else:
73
  return None
74
 
75
+
76
  class FileReader:
77
  def __init__(self, logger, kind):
78
  self.logger = logger
 
84
  else:
85
  self.pdf_reader = PDFReader()
86
  self.web_reader = HTMLReader()
87
+ self.logger.info(
88
+ f"Initialized FileReader with {kind} PDF reader and HTML reader"
89
+ )
90
 
91
  def extract_text_from_pdf(self, pdf_path):
92
  text = ""
 
144
  self.document_metadata = {}
145
  self.document_chunks_full = []
146
 
147
+ # TODO: Fix when reparse_files is False
148
+ if not config["vectorstore"]["reparse_files"]:
149
  self.load_document_data()
150
 
151
  if config["splitter_options"]["use_splitter"]:
152
  if config["splitter_options"]["chunking_mode"] == "fixed":
153
  if config["splitter_options"]["split_by_token"]:
154
+ self.splitter = (
155
+ RecursiveCharacterTextSplitter.from_tiktoken_encoder(
156
+ chunk_size=config["splitter_options"]["chunk_size"],
157
+ chunk_overlap=config["splitter_options"]["chunk_overlap"],
158
+ separators=config["splitter_options"]["chunk_separators"],
159
+ disallowed_special=(),
160
+ )
161
  )
162
  else:
163
  self.splitter = RecursiveCharacterTextSplitter(
 
168
  )
169
  else:
170
  self.splitter = SemanticChunker(
171
+ OpenAIEmbeddings(), breakpoint_threshold_type="percentile"
 
172
  )
173
 
174
  else:
 
195
  ):
196
  # TODO: Clear up this pipeline of re-adding metadata
197
  documents = [Document(page_content=documents, source=source, page=page)]
198
+ if (
199
+ file_type == "pdf"
200
+ and self.config["splitter_options"]["chunking_mode"] == "fixed"
201
+ ):
202
  document_chunks = documents
203
  else:
204
  document_chunks = self.splitter.split_documents(documents)
 
224
  "https://dl4ds.github.io/sp2024/lectures/",
225
  "https://dl4ds.github.io/sp2024/schedule/",
226
  ) # For any additional metadata
227
+
228
+ # remove already processed files if reparse_files is False
229
+ if not self.config["vectorstore"]["reparse_files"]:
230
+ total_documents = len(uploaded_files) + len(weblinks)
231
+ uploaded_files = [
232
+ file_path
233
+ for file_path in uploaded_files
234
+ if file_path not in self.document_data
235
+ ]
236
+ weblinks = [link for link in weblinks if link not in self.document_data]
237
+ print(
238
+ f"Total documents to process: {total_documents}, Documents already processed: {total_documents - len(uploaded_files) - len(weblinks)}"
239
+ )
240
+
241
  with ThreadPoolExecutor() as executor:
242
  executor.map(
243
  self.process_file,
 
307
  self.document_metadata[file_path] = file_metadata
308
 
309
  def process_file(self, file_path, file_index, file_reader, addl_metadata):
310
+ print(f"Processing file {file_index + 1} : {file_path}")
311
  file_name = os.path.basename(file_path)
312
 
313
  file_type = file_name.split(".")[-1]
 
324
  return
325
 
326
  try:
327
+
328
  if file_path in self.document_data:
329
  self.logger.warning(f"File {file_name} already processed")
330
+ documents = [
331
+ Document(page_content=content)
332
+ for content in self.document_data[file_path].values()
333
+ ]
334
  else:
335
  documents = read_methods[file_type](file_path)
336
 
 
383
  json.dump(self.document_metadata, json_file, indent=4)
384
 
385
  def load_document_data(self):
386
+ try:
387
+ with open(
388
+ f"{self.config['log_chunk_dir']}/docs/doc_content.json", "r"
389
+ ) as json_file:
390
+ self.document_data = json.load(json_file)
391
+ with open(
392
+ f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "r"
393
+ ) as json_file:
394
+ self.document_metadata = json.load(json_file)
395
+ self.logger.info(
396
+ f"Loaded document content from {self.config['log_chunk_dir']}/docs/doc_content.json. Total documents: {len(self.document_data)}"
397
+ )
398
+ except FileNotFoundError:
399
+ self.logger.warning(
400
+ f"Document content not found in {self.config['log_chunk_dir']}/docs/doc_content.json"
401
+ )
402
+ self.document_data = {}
403
+ self.document_metadata = {}
404
 
405
 
406
  class DataLoader:
407
  def __init__(self, config, logger=None):
408
+ self.file_reader = FileReader(
409
+ logger=logger, kind=config["llm_params"]["pdf_reader"]
410
+ )
411
  self.chunk_processor = ChunkProcessor(config, logger=logger)
412
 
413
  def get_chunks(self, uploaded_files, weblinks):
 
425
  with open("../code/modules/config/config.yml", "r") as f:
426
  config = yaml.safe_load(f)
427
 
428
+ STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
429
  uploaded_files = [
430
+ os.path.join(STORAGE_DIR, file)
431
+ for file in os.listdir(STORAGE_DIR)
432
+ if file != "urls.txt"
433
  ]
434
 
435
  data_loader = DataLoader(config, logger=logger)
436
  document_chunks, document_names, documents, document_metadata = (
437
  data_loader.get_chunks(
438
+ [
439
+ "https://dl4ds.github.io/fa2024/static_files/discussion_slides/00_discussion.pdf"
440
+ ],
441
  [],
442
  )
443
  )
444
 
445
  print(document_names[:5])
446
  print(len(document_chunks))
 
code/modules/dataloader/helpers.py CHANGED
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
3
  from urllib.parse import urlparse
4
  import tempfile
5
 
 
6
  def get_urls_from_file(file_path: str):
7
  """
8
  Function to get urls from a file
 
3
  from urllib.parse import urlparse
4
  import tempfile
5
 
6
+
7
  def get_urls_from_file(file_path: str):
8
  """
9
  Function to get urls from a file
code/modules/dataloader/pdf_readers/gpt.py CHANGED
@@ -19,9 +19,9 @@ class GPTParser:
19
  self.api_key = os.getenv("OPENAI_API_KEY")
20
  self.prompt = """
21
  The provided documents are images of PDFs of lecture slides of deep learning material.
22
- They contain LaTeX equations, images, and text.
23
  The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
24
- The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
25
  For images, give a description and if you can, a source. Separate each page with '---'.
26
  Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
27
  """
@@ -31,36 +31,44 @@ class GPTParser:
31
 
32
  encoded_images = [self.encode_image(image) for image in images]
33
 
34
- chunks = [encoded_images[i:i + 5] for i in range(0, len(encoded_images), 5)]
35
 
36
  headers = {
37
  "Content-Type": "application/json",
38
- "Authorization": f"Bearer {self.api_key}"
39
  }
40
 
41
  output = ""
42
  for chunk_num, chunk in enumerate(chunks):
43
- content = [{"type": "image_url", "image_url": {
44
- "url": f"data:image/jpeg;base64,{image}"}} for image in chunk]
 
 
 
 
 
45
 
46
  content.insert(0, {"type": "text", "text": self.prompt})
47
 
48
  payload = {
49
  "model": "gpt-4o-mini",
50
- "messages": [
51
- {
52
- "role": "user",
53
- "content": content
54
- }
55
- ],
56
  }
57
 
58
  response = requests.post(
59
- "https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
 
 
 
60
 
61
  resp = response.json()
62
 
63
- chunk_output = resp['choices'][0]['message']['content'].replace("```", "").replace("markdown", "").replace("````", "")
 
 
 
 
 
64
 
65
  output += chunk_output + "\n---\n"
66
 
@@ -68,14 +76,12 @@ class GPTParser:
68
  output = [doc for doc in output if doc.strip() != ""]
69
 
70
  documents = [
71
- Document(
72
- page_content=page,
73
- metadata={"source": pdf_path, "page": i}
74
- ) for i, page in enumerate(output)
75
  ]
76
  return documents
77
 
78
  def encode_image(self, image):
79
  buffered = BytesIO()
80
  image.save(buffered, format="JPEG")
81
- return base64.b64encode(buffered.getvalue()).decode('utf-8')
 
19
  self.api_key = os.getenv("OPENAI_API_KEY")
20
  self.prompt = """
21
  The provided documents are images of PDFs of lecture slides of deep learning material.
22
+ They contain LaTeX equations, images, and text.
23
  The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
24
+ The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
25
  For images, give a description and if you can, a source. Separate each page with '---'.
26
  Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
27
  """
 
31
 
32
  encoded_images = [self.encode_image(image) for image in images]
33
 
34
+ chunks = [encoded_images[i : i + 5] for i in range(0, len(encoded_images), 5)]
35
 
36
  headers = {
37
  "Content-Type": "application/json",
38
+ "Authorization": f"Bearer {self.api_key}",
39
  }
40
 
41
  output = ""
42
  for chunk_num, chunk in enumerate(chunks):
43
+ content = [
44
+ {
45
+ "type": "image_url",
46
+ "image_url": {"url": f"data:image/jpeg;base64,{image}"},
47
+ }
48
+ for image in chunk
49
+ ]
50
 
51
  content.insert(0, {"type": "text", "text": self.prompt})
52
 
53
  payload = {
54
  "model": "gpt-4o-mini",
55
+ "messages": [{"role": "user", "content": content}],
 
 
 
 
 
56
  }
57
 
58
  response = requests.post(
59
+ "https://api.openai.com/v1/chat/completions",
60
+ headers=headers,
61
+ json=payload,
62
+ )
63
 
64
  resp = response.json()
65
 
66
+ chunk_output = (
67
+ resp["choices"][0]["message"]["content"]
68
+ .replace("```", "")
69
+ .replace("markdown", "")
70
+ .replace("````", "")
71
+ )
72
 
73
  output += chunk_output + "\n---\n"
74
 
 
76
  output = [doc for doc in output if doc.strip() != ""]
77
 
78
  documents = [
79
+ Document(page_content=page, metadata={"source": pdf_path, "page": i})
80
+ for i, page in enumerate(output)
 
 
81
  ]
82
  return documents
83
 
84
  def encode_image(self, image):
85
  buffered = BytesIO()
86
  image.save(buffered, format="JPEG")
87
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
code/modules/dataloader/pdf_readers/llama.py CHANGED
@@ -6,15 +6,14 @@ from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
6
  from modules.dataloader.helpers import download_pdf_from_url
7
 
8
 
9
-
10
  class LlamaParser:
11
  def __init__(self):
12
  self.GPT_API_KEY = OPENAI_API_KEY
13
  self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
14
  self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
15
  self.headers = {
16
- 'Accept': 'application/json',
17
- 'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
18
  }
19
  self.parser = LlamaParse(
20
  api_key=LLAMA_CLOUD_API_KEY,
@@ -23,7 +22,7 @@ class LlamaParser:
23
  language="en",
24
  gpt4o_mode=False,
25
  # gpt4o_api_key=OPENAI_API_KEY,
26
- parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source."
27
  )
28
 
29
  def parse(self, pdf_path):
@@ -38,10 +37,8 @@ class LlamaParser:
38
  pages = [page.strip() for page in pages]
39
 
40
  documents = [
41
- Document(
42
- page_content=page,
43
- metadata={"source": pdf_path, "page": i}
44
- ) for i, page in enumerate(pages)
45
  ]
46
 
47
  return documents
@@ -53,20 +50,26 @@ class LlamaParser:
53
  }
54
 
55
  files = [
56
- ('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
 
 
 
57
  ]
58
 
59
  response = requests.request(
60
- "POST", self.parse_url, headers=self.headers, data=payload, files=files)
 
61
 
62
- return response.json()['id'], response.json()['status']
63
 
64
  async def get_result(self, job_id):
65
- url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
 
 
66
 
67
  response = requests.request("GET", url, headers=self.headers, data={})
68
 
69
- return response.json()['markdown']
70
 
71
  async def _parse(self, pdf_path):
72
  job_id, status = self.make_request(pdf_path)
@@ -78,15 +81,9 @@ class LlamaParser:
78
 
79
  result = await self.get_result(job_id)
80
 
81
- documents = [
82
- Document(
83
- page_content=result,
84
- metadata={"source": pdf_path}
85
- )
86
- ]
87
 
88
  return documents
89
 
90
- async def _parse(self, pdf_path):
91
- return await self._parse(pdf_path)
92
-
 
6
  from modules.dataloader.helpers import download_pdf_from_url
7
 
8
 
 
9
  class LlamaParser:
10
  def __init__(self):
11
  self.GPT_API_KEY = OPENAI_API_KEY
12
  self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
13
  self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
14
  self.headers = {
15
+ "Accept": "application/json",
16
+ "Authorization": f"Bearer {LLAMA_CLOUD_API_KEY}",
17
  }
18
  self.parser = LlamaParse(
19
  api_key=LLAMA_CLOUD_API_KEY,
 
22
  language="en",
23
  gpt4o_mode=False,
24
  # gpt4o_api_key=OPENAI_API_KEY,
25
+ parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source.",
26
  )
27
 
28
  def parse(self, pdf_path):
 
37
  pages = [page.strip() for page in pages]
38
 
39
  documents = [
40
+ Document(page_content=page, metadata={"source": pdf_path, "page": i})
41
+ for i, page in enumerate(pages)
 
 
42
  ]
43
 
44
  return documents
 
50
  }
51
 
52
  files = [
53
+ (
54
+ "file",
55
+ ("file", requests.get(pdf_url).content, "application/octet-stream"),
56
+ )
57
  ]
58
 
59
  response = requests.request(
60
+ "POST", self.parse_url, headers=self.headers, data=payload, files=files
61
+ )
62
 
63
+ return response.json()["id"], response.json()["status"]
64
 
65
  async def get_result(self, job_id):
66
+ url = (
67
+ f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
68
+ )
69
 
70
  response = requests.request("GET", url, headers=self.headers, data={})
71
 
72
+ return response.json()["markdown"]
73
 
74
  async def _parse(self, pdf_path):
75
  job_id, status = self.make_request(pdf_path)
 
81
 
82
  result = await self.get_result(job_id)
83
 
84
+ documents = [Document(page_content=result, metadata={"source": pdf_path})]
 
 
 
 
 
85
 
86
  return documents
87
 
88
+ # async def _parse(self, pdf_path):
89
+ # return await self._parse(pdf_path)
 
code/modules/dataloader/webpage_crawler.py CHANGED
@@ -3,7 +3,8 @@ from aiohttp import ClientSession
3
  import asyncio
4
  import requests
5
  from bs4 import BeautifulSoup
6
- from urllib.parse import urlparse, urljoin, urldefrag
 
7
 
8
  class WebpageCrawler:
9
  def __init__(self):
 
3
  import asyncio
4
  import requests
5
  from bs4 import BeautifulSoup
6
+ from urllib.parse import urljoin, urldefrag
7
+
8
 
9
  class WebpageCrawler:
10
  def __init__(self):
code/modules/vectorstore/colbert.py CHANGED
@@ -1,9 +1,9 @@
1
  from ragatouille import RAGPretrainedModel
2
  from modules.vectorstore.base import VectorStoreBase
3
  from langchain_core.retrievers import BaseRetriever
4
- from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun, Callbacks
5
  from langchain_core.documents import Document
6
- from typing import Any, List, Optional, Sequence
7
  import os
8
  import json
9
 
@@ -85,6 +85,7 @@ class ColbertVectorStore(VectorStoreBase):
85
  document_ids=document_names,
86
  document_metadatas=document_metadata,
87
  )
 
88
  self.colbert.set_document_count(len(document_names))
89
 
90
  def load_database(self):
 
1
  from ragatouille import RAGPretrainedModel
2
  from modules.vectorstore.base import VectorStoreBase
3
  from langchain_core.retrievers import BaseRetriever
4
+ from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
5
  from langchain_core.documents import Document
6
+ from typing import Any, List
7
  import os
8
  import json
9
 
 
85
  document_ids=document_names,
86
  document_metadatas=document_metadata,
87
  )
88
+ print(f"Index created at {index_path}")
89
  self.colbert.set_document_count(len(document_names))
90
 
91
  def load_database(self):
code/modules/vectorstore/embedding_model_loader.py CHANGED
@@ -1,9 +1,6 @@
1
  from langchain_community.embeddings import OpenAIEmbeddings
2
  from langchain_community.embeddings import HuggingFaceEmbeddings
3
- from langchain_community.embeddings import LlamaCppEmbeddings
4
-
5
- from modules.config.constants import *
6
- import os
7
 
8
 
9
  class EmbeddingModelLoader:
@@ -28,8 +25,5 @@ class EmbeddingModelLoader:
28
  "trust_remote_code": True,
29
  },
30
  )
31
- # embedding_model = LlamaCppEmbeddings(
32
- # model_path=os.path.abspath("storage/llama-7b.ggmlv3.q4_0.bin")
33
- # )
34
 
35
  return embedding_model
 
1
  from langchain_community.embeddings import OpenAIEmbeddings
2
  from langchain_community.embeddings import HuggingFaceEmbeddings
3
+ from modules.config.constants import OPENAI_API_KEY, HUGGINGFACE_TOKEN
 
 
 
4
 
5
 
6
  class EmbeddingModelLoader:
 
25
  "trust_remote_code": True,
26
  },
27
  )
 
 
 
28
 
29
  return embedding_model
code/modules/vectorstore/faiss.py CHANGED
@@ -14,10 +14,15 @@ class FaissVectorStore(VectorStoreBase):
14
  def __init__(self, config):
15
  self.config = config
16
  self._init_vector_db()
17
- self.local_path = os.path.join(self.config["vectorstore"]["db_path"],
18
- "db_" + self.config["vectorstore"]["db_option"]
19
- + "_" + self.config["vectorstore"]["model"]
20
- + "_" + config["splitter_options"]["chunking_mode"])
 
 
 
 
 
21
 
22
  def _init_vector_db(self):
23
  self.faiss = FAISS(
@@ -28,9 +33,7 @@ class FaissVectorStore(VectorStoreBase):
28
  self.vectorstore = self.faiss.from_documents(
29
  documents=document_chunks, embedding=embedding_model
30
  )
31
- self.vectorstore.save_local(
32
- self.local_path
33
- )
34
 
35
  def load_database(self, embedding_model):
36
  self.vectorstore = self.faiss.load_local(
 
14
  def __init__(self, config):
15
  self.config = config
16
  self._init_vector_db()
17
+ self.local_path = os.path.join(
18
+ self.config["vectorstore"]["db_path"],
19
+ "db_"
20
+ + self.config["vectorstore"]["db_option"]
21
+ + "_"
22
+ + self.config["vectorstore"]["model"]
23
+ + "_"
24
+ + config["splitter_options"]["chunking_mode"],
25
+ )
26
 
27
  def _init_vector_db(self):
28
  self.faiss = FAISS(
 
33
  self.vectorstore = self.faiss.from_documents(
34
  documents=document_chunks, embedding=embedding_model
35
  )
36
+ self.vectorstore.save_local(self.local_path)
 
 
37
 
38
  def load_database(self, embedding_model):
39
  self.vectorstore = self.faiss.load_local(
code/modules/vectorstore/raptor.py CHANGED
@@ -317,13 +317,10 @@ class RAPTORVectoreStore(VectorStoreBase):
317
  print(f"--Generated {len(all_clusters)} clusters--")
318
 
319
  # Summarization
320
- template = """Here is content from the course DS598: Deep Learning for Data Science.
321
-
322
  The content may be form webapge about the course, or lecture content, or any other relevant information.
323
  If the content is in bullet points (from pdf lectre slides), you can summarize the bullet points.
324
-
325
  Give a detailed summary of the content below.
326
-
327
  Documentation:
328
  {context}
329
  """
 
317
  print(f"--Generated {len(all_clusters)} clusters--")
318
 
319
  # Summarization
320
+ template = """Here is content from the course DS598: Deep Learning for Data Science.
 
321
  The content may be form webapge about the course, or lecture content, or any other relevant information.
322
  If the content is in bullet points (from pdf lectre slides), you can summarize the bullet points.
 
323
  Give a detailed summary of the content below.
 
324
  Documentation:
325
  {context}
326
  """
code/modules/vectorstore/store_manager.py CHANGED
@@ -1,9 +1,7 @@
1
  from modules.vectorstore.vectorstore import VectorStore
2
- from modules.vectorstore.helpers import *
3
  from modules.dataloader.webpage_crawler import WebpageCrawler
4
  from modules.dataloader.data_loader import DataLoader
5
- from modules.dataloader.helpers import *
6
- from modules.config.constants import RETRIEVER_HF_PATHS
7
  from modules.vectorstore.embedding_model_loader import EmbeddingModelLoader
8
  import logging
9
  import os
@@ -170,13 +168,21 @@ if __name__ == "__main__":
170
 
171
  with open("modules/config/config.yml", "r") as f:
172
  config = yaml.safe_load(f)
 
 
173
  print(config)
 
174
  print(f"Trying to create database with config: {config}")
175
  vector_db = VectorStoreManager(config)
176
  if config["vectorstore"]["load_from_HF"]:
177
- if config["vectorstore"]["db_option"] in RETRIEVER_HF_PATHS:
 
 
 
178
  vector_db.load_from_HF(
179
- HF_PATH=RETRIEVER_HF_PATHS[config["vectorstore"]["db_option"]]
 
 
180
  )
181
  else:
182
  # print(f"HF_PATH not available for {config['vectorstore']['db_option']}")
@@ -189,7 +195,7 @@ if __name__ == "__main__":
189
  vector_db.create_database()
190
  print("Created database")
191
 
192
- print(f"Trying to load the database")
193
  vector_db = VectorStoreManager(config)
194
  vector_db.load_database()
195
  print("Loaded database")
 
1
  from modules.vectorstore.vectorstore import VectorStore
2
+ from modules.dataloader.helpers import get_urls_from_file
3
  from modules.dataloader.webpage_crawler import WebpageCrawler
4
  from modules.dataloader.data_loader import DataLoader
 
 
5
  from modules.vectorstore.embedding_model_loader import EmbeddingModelLoader
6
  import logging
7
  import os
 
168
 
169
  with open("modules/config/config.yml", "r") as f:
170
  config = yaml.safe_load(f)
171
+ with open("modules/config/user_config.yml", "r") as f:
172
+ user_config = yaml.safe_load(f)
173
  print(config)
174
+ print(user_config)
175
  print(f"Trying to create database with config: {config}")
176
  vector_db = VectorStoreManager(config)
177
  if config["vectorstore"]["load_from_HF"]:
178
+ if (
179
+ config["vectorstore"]["db_option"]
180
+ in user_config["retriever"]["retriever_hf_paths"]
181
+ ):
182
  vector_db.load_from_HF(
183
+ HF_PATH=user_config["retriever"]["retriever_hf_paths"][
184
+ config["vectorstore"]["db_option"]
185
+ ]
186
  )
187
  else:
188
  # print(f"HF_PATH not available for {config['vectorstore']['db_option']}")
 
195
  vector_db.create_database()
196
  print("Created database")
197
 
198
+ print("Trying to load the database")
199
  vector_db = VectorStoreManager(config)
200
  vector_db.load_database()
201
  print("Loaded database")
docs/README.md DELETED
@@ -1,51 +0,0 @@
1
- # Documentation
2
-
3
- ## File Structure:
4
- - `docs/` - Documentation files
5
- - `code/` - Code files
6
- - `storage/` - Storage files
7
- - `vectorstores/` - Vector Databases
8
- - `.env` - Environment Variables
9
- - `Dockerfile` - Dockerfile for Hugging Face
10
- - `.chainlit` - Chainlit Configuration
11
- - `chainlit.md` - Chainlit README
12
- - `README.md` - Repository README
13
- - `.gitignore` - Gitignore file
14
- - `requirements.txt` - Python Requirements
15
- - `.gitattributes` - Gitattributes file
16
-
17
- ## Code Structure
18
-
19
- - `code/main.py` - Main Chainlit App
20
- - `code/config.yaml` - Configuration File to set Embedding related, Vector Database related, and Chat Model related parameters.
21
- - `code/modules/vector_db.py` - Vector Database Creation
22
- - `code/modules/chat_model_loader.py` - Chat Model Loader (Creates the Chat Model)
23
- - `code/modules/constants.py` - Constants (Loads the Environment Variables, Prompts, Model Paths, etc.)
24
- - `code/modules/data_loader.py` - Loads and Chunks the Data
25
- - `code/modules/embedding_model.py` - Creates the Embedding Model to Embed the Data
26
- - `code/modules/llm_tutor.py` - Creates the RAG LLM Tutor
27
- - The Function `qa_bot()` loads the vector database and the chat model, and sets the prompt to pass to the chat model.
28
- - `code/modules/helpers.py` - Helper Functions
29
-
30
- ## Storage and Vectorstores
31
-
32
- - `storage/data/` - Data Storage (Put your pdf files under this directory, and urls in the urls.txt file)
33
- - `storage/models/` - Model Storage (Put your local LLMs under this directory)
34
-
35
- - `vectorstores/` - Vector Databases (Stores the Vector Databases generated from `code/modules/vector_db.py`)
36
-
37
-
38
- ## Useful Configurations
39
- set these in `code/config.yaml`:
40
- * ``["embedding_options"]["embedd_files"]`` - If set to True, embeds the files from the storage directory everytime you run the chainlit command. If set to False, uses the stored vector database.
41
- * ``["embedding_options"]["expand_urls"]`` - If set to True, gets and reads the data from all the links under the url provided. If set to False, only reads the data in the url provided.
42
- * ``["embedding_options"]["search_top_k"]`` - Number of sources that the retriever returns
43
- * ``["llm_params]["use_history"]`` - Whether to use history in the prompt or not
44
- * ``["llm_params]["memory_window"]`` - Number of interactions to keep a track of in the history
45
-
46
-
47
- ## LlamaCpp
48
- * https://python.langchain.com/docs/integrations/llms/llamacpp
49
-
50
- ## Hugging Face Models
51
- * Download the ``.gguf`` files for your Local LLM from Hugging Face (Example: https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/contribute.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 💡 **Please ensure formatting, linting, and security checks pass before submitting a pull request**
2
+
3
+ ## Code Formatting
4
+
5
+ The codebase is formatted using [black](https://github.com/psf/black).
6
+
7
+ To format the codebase, run the following command:
8
+
9
+ ```bash
10
+ black .
11
+ ```
12
+
13
+ Please ensure that the code is formatted before submitting a pull request.
14
+
15
+ ## Linting
16
+
17
+ The codebase is linted using [flake8](https://flake8.pycqa.org/en/latest/).
18
+
19
+ To view the linting errors, run the following command:
20
+
21
+ ```bash
22
+ flake8 .
23
+ ```
24
+
25
+ ## Security and Vulnerabilities
26
+
27
+ The codebase is scanned for security vulnerabilities using [bandit](https://github.com/PyCQA/bandit).
28
+
29
+ To scan the codebase for security vulnerabilities, run the following command:
30
+
31
+ ```bash
32
+ bandit -r .
33
+ ```
docs/setup.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Initial Setup
2
+
3
+ ## Python Environment
4
+
5
+ Python Version: 3.11
6
+
7
+ Create a virtual environment and install the required packages:
8
+
9
+ ```bash
10
+ conda create -n ai_tutor python=3.11
11
+ conda activate ai_tutor
12
+ pip install -r requirements.txt
13
+ ```
14
+
15
+ ## Google OAuth 2.0 Client ID and Secret
16
+
17
+ To set up the Google OAuth 2.0 Client ID and Secret, follow these steps:
18
+
19
+ 1. Go to the [Google Cloud Console](https://console.cloud.google.com/apis/credentials).
20
+ 2. Create a new project or select an existing one.
21
+ 3. Navigate to the "Credentials" page.
22
+ 4. Click on "Create Credentials" and select "OAuth 2.0 Client ID".
23
+ 5. Configure the OAuth consent screen if you haven't already.
24
+ 6. Choose "Web application" as the application type.
25
+ 7. Enter a name for your client ID and configure the redirect URIs as needed.
26
+ 8. Click "Create" and copy the generated `Client ID` and `Client Secret`.
27
+
28
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
29
+
30
+ ```bash
31
+ OAUTH_GOOGLE_CLIENT_ID=<your_client_id>
32
+ OAUTH_GOOGLE_CLIENT_SECRET=<your_client_secret>
33
+ ```
34
+
35
+ ## Literal AI API Key
36
+
37
+ To obtain the Literal AI API key:
38
+
39
+ 1. Sign up or log in to [Literal AI](https://cloud.getliteral.ai/).
40
+ 2. Navigate to the API Keys section under your account settings.
41
+ 3. Create a new API key if necessary and copy it.
42
+
43
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
44
+
45
+ ```bash
46
+ LITERAL_API_KEY=<your_api_key>
47
+ ```
48
+
49
+ ## LlamaCloud API Key
50
+
51
+ To obtain the LlamaCloud API Key:
52
+
53
+ 1. Go to [LlamaCloud](https://cloud.llamaindex.ai/).
54
+ 2. Sign up or log in to your account.
55
+ 3. Navigate to the API section and generate a new API key if necessary.
56
+
57
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
58
+
59
+ ```bash
60
+ LLAMA_CLOUD_API_KEY=<your_api_key>
61
+ ```
62
+
63
+ ## Hugging Face Access Token
64
+
65
+ To obtain your Hugging Face access token:
66
+
67
+ 1. Go to [Hugging Face settings](https://huggingface.co/settings/tokens).
68
+ 2. Log in or create an account.
69
+ 3. Generate a new token or use an existing one.
70
+
71
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
72
+
73
+ ```bash
74
+ HUGGINGFACE_TOKEN=<your-huggingface-token>
75
+ ```
76
+
77
+ # Configuration
78
+
79
+ The configuration file `code/modules/config/config.yml` contains the parameters that control the behaviour of your app.
80
+ The configuration file `code/modules/config/user_config.yml` contains user-defined parameters.
pyproject.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [tool.black]
2
+ line-length = 88
requirements.txt CHANGED
@@ -22,4 +22,7 @@ umap-learn
22
  llama-cpp-python
23
  pymupdf
24
  websockets
25
- langchain-openai
 
 
 
 
22
  llama-cpp-python
23
  pymupdf
24
  websockets
25
+ langchain-openai
26
+ black
27
+ flake8
28
+ bandit