aimakerspace/openai_utils/embedding.py CHANGED
@@ -48,6 +48,18 @@ class EmbeddingModel:
 
         return embedding.data[0].embedding
 
+    def embed_documents(self, list_of_text: List[str]) -> List[List[float]]:
+        """
+        Embed a list of documents (text strings) using the OpenAI embeddings model.
+
+        Args:
+            list_of_text (List[str]): A list of text strings to be embedded.
+
+        Returns:
+            List[List[float]]: A list of embedding vectors.
+        """
+        return self.get_embeddings(list_of_text)
+
 
 if __name__ == "__main__":
     embedding_model = EmbeddingModel()
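The new `embed_documents` method simply delegates to the existing `get_embeddings`, giving `EmbeddingModel` the duck-typed interface that LangChain's `SemanticChunker` (used in `semantic_chunking.py` below) calls into. A minimal usage sketch, assuming `OPENAI_API_KEY` is set in the environment; the sample strings are illustrative:

from aimakerspace.openai_utils.embedding import EmbeddingModel

embedding_model = EmbeddingModel()

# One embedding vector (List[float]) comes back per input string.
vectors = embedding_model.embed_documents(
    ["The cat sat on the mat.", "Semantic chunking groups related sentences."]
)
print(len(vectors))     # 2 -- one vector per document
print(len(vectors[0]))  # dimensionality depends on the configured embeddings model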
aimakerspace/pdfloader.py ADDED
@@ -0,0 +1,60 @@
+from langchain_community.document_loaders import PyPDFLoader
+
+
+class PDFLoader:
+    def __init__(self, file_path: str):
+        """
+        Initialize the PDFLoader class with the path to the PDF file.
+
+        Args:
+            file_path (str): The path to the PDF file to be loaded.
+        """
+        self.file_path = file_path
+        self.loader = PyPDFLoader(self.file_path)
+        self.pages = None
+
+    def load_and_split(self):
+        """
+        Load and split the PDF file into pages.
+
+        Returns:
+            list: A list of pages after loading and splitting the PDF file.
+        """
+        self.pages = self.loader.load_and_split()
+        return self.pages
+
+    def get_page(self, page_number: int):
+        """
+        Get a specific page from the loaded PDF.
+
+        Args:
+            page_number (int): The page number to retrieve (1-indexed).
+
+        Returns:
+            Document: The content of the specified page.
+        """
+        if self.pages is None:
+            raise ValueError(
+                "The PDF has not been loaded yet. Call load_and_split() first."
+            )
+
+        if page_number < 1 or page_number > len(self.pages):
+            raise ValueError(
+                f"Page number out of range. Please choose a value between 1 and {len(self.pages)}."
+            )
+
+        return self.pages[page_number - 1]
+
+    def get_total_pages(self):
+        """
+        Get the total number of pages in the PDF.
+
+        Returns:
+            int: The total number of pages.
+        """
+        if self.pages is None:
+            raise ValueError(
+                "The PDF has not been loaded yet. Call load_and_split() first."
+            )
+
+        return len(self.pages)
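A short usage sketch of the new loader; `sample.pdf` is a placeholder path, and pages must be loaded via `load_and_split()` before the accessor methods will work:

from aimakerspace.pdfloader import PDFLoader

loader = PDFLoader("sample.pdf")       # placeholder path
pages = loader.load_and_split()        # list of LangChain Document objects, one per page

print(loader.get_total_pages())        # total page count
first_page = loader.get_page(1)        # page access is 1-indexed
print(first_page.page_content[:200])   # preview the first page's text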
aimakerspace/semantic_chunking.py ADDED
@@ -0,0 +1,60 @@
+from langchain_experimental.text_splitter import SemanticChunker
+from typing import List
+
+
+class SemanticChunking:
+    def __init__(self, embedding_model, breakpoint_threshold_type="percentile"):
+        """
+        Initialize the SemanticChunking class with an embedding model and a breakpoint threshold type.
+
+        Args:
+            embedding_model: An instance of the EmbeddingModel class used to generate embeddings.
+            breakpoint_threshold_type (str): The type of breakpoint threshold to use for chunking.
+                Options include 'percentile', 'standard_deviation', and 'interquartile'.
+        """
+        self.text_splitter = SemanticChunker(
+            embedding_model, breakpoint_threshold_type=breakpoint_threshold_type
+        )
+
+    def split_text(self, text: str) -> List:
+        """
+        Split the provided text into semantic chunks.
+
+        Args:
+            text (str): The text to be split into chunks.
+
+        Returns:
+            list: A list of documents (chunks) obtained from the text.
+        """
+        docs = self.text_splitter.create_documents([text])
+        return docs
+
+    def get_chunk(self, docs: List, chunk_index: int) -> str:
+        """
+        Get a specific chunk from the list of documents.
+
+        Args:
+            docs (list): The list of documents (chunks).
+            chunk_index (int): The index of the chunk to retrieve (0-indexed).
+
+        Returns:
+            str: The content of the specified chunk.
+        """
+        if chunk_index < 0 or chunk_index >= len(docs):
+            raise ValueError(
+                f"Chunk index out of range. Please choose a value between 0 and {len(docs) - 1}."
+            )
+
+        return docs[chunk_index].page_content
+
+    def get_total_chunks(self, docs: List) -> int:
+        """
+        Get the total number of chunks in the list of documents.
+
+        Args:
+            docs (list): The list of documents (chunks).
+
+        Returns:
+            int: The total number of chunks.
+        """
+        return len(docs)
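A minimal sketch of the chunker in isolation. It assumes the `EmbeddingModel` from this PR (whose new `embed_documents` method is what `SemanticChunker` calls internally) and an illustrative two-topic text:

from aimakerspace.openai_utils.embedding import EmbeddingModel
from aimakerspace.semantic_chunking import SemanticChunking

chunker = SemanticChunking(EmbeddingModel(), breakpoint_threshold_type="percentile")

text = (
    "Dogs are loyal companions. They love long walks. "
    "Meanwhile, the stock market closed lower today."
)
docs = chunker.split_text(text)        # list of Document chunks

print(chunker.get_total_chunks(docs))  # number of semantic chunks found
print(chunker.get_chunk(docs, 0))      # text of the first chunk (0-indexed)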
app.py CHANGED
@@ -7,9 +7,11 @@ from aimakerspace.openai_utils.prompts import (
     SystemRolePrompt,
     AssistantRolePrompt,
 )
+from aimakerspace.pdfloader import PDFLoader
 from aimakerspace.openai_utils.embedding import EmbeddingModel
 from aimakerspace.vectordatabase import VectorDatabase
 from aimakerspace.openai_utils.chatmodel import ChatOpenAI
+from aimakerspace.semantic_chunking import SemanticChunking
 import chainlit as cl
 
 system_template = """\
@@ -25,6 +27,7 @@ Question:
 """
 user_role_prompt = UserRolePrompt(user_prompt_template)
 
+
 class RetrievalAugmentedQAPipeline:
     def __init__(self, llm: ChatOpenAI(), vector_db_retriever: VectorDatabase) -> None:
         self.llm = llm
@@ -39,29 +42,57 @@ class RetrievalAugmentedQAPipeline:
 
         formatted_system_prompt = system_role_prompt.create_message()
 
-        formatted_user_prompt = user_role_prompt.create_message(question=user_query, context=context_prompt)
+        formatted_user_prompt = user_role_prompt.create_message(
+            question=user_query, context=context_prompt
+        )
 
         async def generate_response():
-            async for chunk in self.llm.astream([formatted_system_prompt, formatted_user_prompt]):
+            async for chunk in self.llm.astream(
+                [formatted_system_prompt, formatted_user_prompt]
+            ):
                 yield chunk
 
         return {"response": generate_response(), "context": context_list}
 
+
 text_splitter = CharacterTextSplitter()
+embedding_model = EmbeddingModel()
+chunker = SemanticChunking(embedding_model, breakpoint_threshold_type="percentile")
 
 
 def process_text_file(file: AskFileResponse):
     import tempfile
 
-    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
+    file_extension = os.path.splitext(file.name)[1].lower()
+    if file_extension == ".txt":
+        suffix = ".txt"
+    elif file_extension == ".pdf":
+        suffix = ".pdf"
+    else:
+        raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", delete=False, suffix=suffix
+    ) as temp_file:
         temp_file_path = temp_file.name
 
     with open(temp_file_path, "wb") as f:
         f.write(file.content)
 
-    text_loader = TextFileLoader(temp_file_path)
-    documents = text_loader.load_documents()
-    texts = text_splitter.split_texts(documents)
+    if suffix == ".txt":
+        file_loader = TextFileLoader(temp_file_path)
+        documents = file_loader.load_documents()
+    elif suffix == ".pdf":
+        file_loader = PDFLoader(temp_file_path)
+        documents = file_loader.load_and_split()
+    else:
+        raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")
+    split_pages = []
+    for doc in documents:
+        # TextFileLoader yields raw strings; PDFLoader yields Document objects.
+        text = doc if isinstance(doc, str) else doc.page_content
+        split_pages += chunker.split_text(text)
+    texts = [chunk.page_content for chunk in split_pages]
     return texts
 
 
@@ -72,9 +103,9 @@ async def on_chat_start():
     # Wait for the user to upload a file
    while files == None:
         files = await cl.AskFileMessage(
-            content="Please upload a Text File file to begin!",
-            accept=["text/plain"],
-            max_size_mb=10,
+            content="Please upload a Text or PDF file to begin!",
+            accept=["text/plain", "application/pdf"],
+            max_size_mb=2,
             timeout=180,
         ).send()
 
@@ -93,15 +124,14 @@ async def on_chat_start():
     # Create a dict vector store
     vector_db = VectorDatabase()
     vector_db = await vector_db.abuild_from_list(texts)
-
+
     chat_openai = ChatOpenAI()
 
     # Create a chain
     retrieval_augmented_qa_pipeline = RetrievalAugmentedQAPipeline(
-        vector_db_retriever=vector_db,
-        llm=chat_openai
+        vector_db_retriever=vector_db, llm=chat_openai
     )
-
+
     # Let the user know that the system is ready
     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
     await msg.update()
@@ -119,4 +149,4 @@ async def main(message):
     async for stream_resp in result["response"]:
         await msg.stream_token(stream_resp)
 
-    await msg.send()
+    await msg.send()
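Taken together, a minimal end-to-end sketch of the new ingestion path outside Chainlit, mirroring the PDF branch of `process_text_file` (the file path is a placeholder):

from aimakerspace.pdfloader import PDFLoader
from aimakerspace.openai_utils.embedding import EmbeddingModel
from aimakerspace.semantic_chunking import SemanticChunking

embedding_model = EmbeddingModel()
chunker = SemanticChunking(embedding_model, breakpoint_threshold_type="percentile")

# Load a PDF into per-page Documents, then re-split each page into semantic chunks.
pages = PDFLoader("sample.pdf").load_and_split()  # placeholder path
split_pages = []
for page in pages:
    split_pages += chunker.split_text(page.page_content)

# Plain strings, ready for VectorDatabase.abuild_from_list(texts).
texts = [chunk.page_content for chunk in split_pages]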
requirements.txt CHANGED
@@ -1,3 +1,8 @@
 numpy
 chainlit==0.7.700
-openai
+openai
+langchain
+langchain-community
+langchain-experimental
+langchain-openai
+langchain-core