aimakerspace/openai_utils/embedding.py CHANGED
@@ -48,6 +48,18 @@ class EmbeddingModel:
 
         return embedding.data[0].embedding
 
+    def embed_documents(self, list_of_text: List[str]) -> List[List[float]]:
+        """
+        Embed a list of documents (text strings) using the OpenAI embeddings model.
+
+        Args:
+            list_of_text (List[str]): A list of text strings to be embedded.
+
+        Returns:
+            List[List[float]]: A list of embedding vectors.
+        """
+        return self.get_embeddings(list_of_text)
+
 
 if __name__ == "__main__":
     embedding_model = EmbeddingModel()
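The new `embed_documents` method simply delegates to the existing `get_embeddings`, giving `EmbeddingModel` the duck-typed interface that LangChain's `SemanticChunker` (used in `semantic_chunking.py` below) calls into. A minimal usage sketch, assuming `OPENAI_API_KEY` is set in the environment; the sample strings are illustrative:

from aimakerspace.openai_utils.embedding import EmbeddingModel

embedding_model = EmbeddingModel()

# One embedding vector (List[float]) comes back per input string.
vectors = embedding_model.embed_documents(
    ["The cat sat on the mat.", "Semantic chunking groups related sentences."]
)
print(len(vectors))     # 2 -- one vector per document
print(len(vectors[0]))  # dimensionality depends on the configured embeddings model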
aimakerspace/pdfloader.py ADDED
@@ -0,0 +1,60 @@
+from langchain_community.document_loaders import PyPDFLoader
+
+
+class PDFLoader:
+    def __init__(self, file_path: str):
+        """
+        Initialize the PDFLoader class with the path to the PDF file.
+
+        Args:
+            file_path (str): The path to the PDF file to be loaded.
+        """
+        self.file_path = file_path
+        self.loader = PyPDFLoader(self.file_path)
+        self.pages = None
+
+    def load_and_split(self):
+        """
+        Load and split the PDF file into pages.
+
+        Returns:
+            list: A list of pages after loading and splitting the PDF file.
+        """
+        self.pages = self.loader.load_and_split()
+        return self.pages
+
+    def get_page(self, page_number: int):
+        """
+        Get a specific page from the loaded PDF.
+
+        Args:
+            page_number (int): The page number to retrieve (1-indexed).
+
+        Returns:
+            Document: The content of the specified page.
+        """
+        if self.pages is None:
+            raise ValueError(
+                "The PDF has not been loaded yet. Call load_and_split() first."
+            )
+
+        if page_number < 1 or page_number > len(self.pages):
+            raise ValueError(
+                f"Page number out of range. Please choose a value between 1 and {len(self.pages)}."
+            )
+
+        return self.pages[page_number - 1]
+
+    def get_total_pages(self):
+        """
+        Get the total number of pages in the PDF.
+
+        Returns:
+            int: The total number of pages.
+        """
+        if self.pages is None:
+            raise ValueError(
+                "The PDF has not been loaded yet. Call load_and_split() first."
+            )
+
+        return len(self.pages)
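A short usage sketch of the new loader; `sample.pdf` is a placeholder path, and pages must be loaded via `load_and_split()` before the accessor methods will work:

from aimakerspace.pdfloader import PDFLoader

loader = PDFLoader("sample.pdf")       # placeholder path
pages = loader.load_and_split()        # list of LangChain Document objects, one per page

print(loader.get_total_pages())        # total page count
first_page = loader.get_page(1)        # page access is 1-indexed
print(first_page.page_content[:200])   # preview the first page's text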
aimakerspace/semantic_chunking.py ADDED
@@ -0,0 +1,60 @@
+from langchain_experimental.text_splitter import SemanticChunker
+from typing import List
+
+
+class SemanticChunking:
+    def __init__(self, embedding_model, breakpoint_threshold_type="percentile"):
+        """
+        Initialize the SemanticChunking class with an embedding model and a breakpoint threshold type.
+
+        Args:
+            embedding_model: An instance of the EmbeddingModel class used to generate embeddings.
+            breakpoint_threshold_type (str): The type of breakpoint threshold to use for chunking.
+                Options include 'percentile', 'standard_deviation', and 'interquartile'.
+        """
+        self.text_splitter = SemanticChunker(
+            embedding_model, breakpoint_threshold_type=breakpoint_threshold_type
+        )
+
+    def split_text(self, text: str) -> List:
+        """
+        Split the provided text into semantic chunks.
+
+        Args:
+            text (str): The text to be split into chunks.
+
+        Returns:
+            list: A list of documents (chunks) obtained from the text.
+        """
+        docs = self.text_splitter.create_documents([text])
+        return docs
+
+    def get_chunk(self, docs: List, chunk_index: int) -> str:
+        """
+        Get a specific chunk from the list of documents.
+
+        Args:
+            docs (list): The list of documents (chunks).
+            chunk_index (int): The index of the chunk to retrieve (0-indexed).
+
+        Returns:
+            str: The content of the specified chunk.
+        """
+        if chunk_index < 0 or chunk_index >= len(docs):
+            raise ValueError(
+                f"Chunk index out of range. Please choose a value between 0 and {len(docs) - 1}."
+            )
+
+        return docs[chunk_index].page_content
+
+    def get_total_chunks(self, docs: List) -> int:
+        """
+        Get the total number of chunks in the list of documents.
+
+        Args:
+            docs (list): The list of documents (chunks).
+
+        Returns:
+            int: The total number of chunks.
+        """
+        return len(docs)
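A minimal sketch of the chunker in isolation. It assumes the `EmbeddingModel` from this PR (whose new `embed_documents` method is what `SemanticChunker` calls internally) and an illustrative two-topic text:

from aimakerspace.openai_utils.embedding import EmbeddingModel
from aimakerspace.semantic_chunking import SemanticChunking

chunker = SemanticChunking(EmbeddingModel(), breakpoint_threshold_type="percentile")

text = (
    "Dogs are loyal companions. They love long walks. "
    "Meanwhile, the stock market closed lower today."
)
docs = chunker.split_text(text)        # list of Document chunks

print(chunker.get_total_chunks(docs))  # number of semantic chunks found
print(chunker.get_chunk(docs, 0))      # text of the first chunk (0-indexed)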
app.py CHANGED
@@ -7,9 +7,11 @@ from aimakerspace.openai_utils.prompts import (
     SystemRolePrompt,
     AssistantRolePrompt,
 )
+from aimakerspace.pdfloader import PDFLoader
 from aimakerspace.openai_utils.embedding import EmbeddingModel
 from aimakerspace.vectordatabase import VectorDatabase
 from aimakerspace.openai_utils.chatmodel import ChatOpenAI
+from aimakerspace.semantic_chunking import SemanticChunking
 import chainlit as cl
 
 system_template = """\
@@ -25,6 +27,7 @@ Question:
 """
 user_role_prompt = UserRolePrompt(user_prompt_template)
 
+
 class RetrievalAugmentedQAPipeline:
     def __init__(self, llm: ChatOpenAI(), vector_db_retriever: VectorDatabase) -> None:
         self.llm = llm
@@ -39,29 +42,57 @@ class RetrievalAugmentedQAPipeline:
 
         formatted_system_prompt = system_role_prompt.create_message()
 
-        formatted_user_prompt = user_role_prompt.create_message(question=user_query, context=context_prompt)
+        formatted_user_prompt = user_role_prompt.create_message(
+            question=user_query, context=context_prompt
+        )
 
         async def generate_response():
-            async for chunk in self.llm.astream([formatted_system_prompt, formatted_user_prompt]):
+            async for chunk in self.llm.astream(
+                [formatted_system_prompt, formatted_user_prompt]
+            ):
                 yield chunk
 
         return {"response": generate_response(), "context": context_list}
 
+
 text_splitter = CharacterTextSplitter()
+embedding_model = EmbeddingModel()
+chunker = SemanticChunking(embedding_model, breakpoint_threshold_type="percentile")
 
 
 def process_text_file(file: AskFileResponse):
     import tempfile
 
-    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
+    file_extension = os.path.splitext(file.name)[1].lower()
+    if file_extension == ".txt":
+        suffix = ".txt"
+    elif file_extension == ".pdf":
+        suffix = ".pdf"
+    else:
+        raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", delete=False, suffix=suffix
+    ) as temp_file:
         temp_file_path = temp_file.name
 
     with open(temp_file_path, "wb") as f:
         f.write(file.content)
 
-    text_loader = TextFileLoader(temp_file_path)
-    documents = text_loader.load_documents()
-    texts = text_splitter.split_texts(documents)
+    if suffix == ".txt":
+        file_loader = TextFileLoader(temp_file_path)
+        documents = file_loader.load_documents()
+    elif suffix == ".pdf":
+        file_loader = PDFLoader(temp_file_path)
+        documents = file_loader.load_and_split()
+    else:
+        raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")
+    split_pages = []
+    for doc in documents:
+        # TextFileLoader yields raw strings; PDFLoader yields Document objects.
+        text = doc if isinstance(doc, str) else doc.page_content
+        split_pages += chunker.split_text(text)
+    texts = [chunk.page_content for chunk in split_pages]
     return texts
 
 
@@ -72,9 +103,9 @@ async def on_chat_start():
     # Wait for the user to upload a file
    while files == None:
         files = await cl.AskFileMessage(
-            content="Please upload a Text File file to begin!",
-            accept=["text/plain"],
-            max_size_mb=10,
+            content="Please upload a Text or PDF file to begin!",
+            accept=["text/plain", "application/pdf"],
+            max_size_mb=2,
             timeout=180,
         ).send()
 
@@ -93,15 +124,14 @@ async def on_chat_start():
     # Create a dict vector store
     vector_db = VectorDatabase()
     vector_db = await vector_db.abuild_from_list(texts)
-
+
     chat_openai = ChatOpenAI()
 
     # Create a chain
     retrieval_augmented_qa_pipeline = RetrievalAugmentedQAPipeline(
-        vector_db_retriever=vector_db,
-        llm=chat_openai
+        vector_db_retriever=vector_db, llm=chat_openai
     )
-
+
     # Let the user know that the system is ready
     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
     await msg.update()
@@ -119,4 +149,4 @@ async def main(message):
     async for stream_resp in result["response"]:
         await msg.stream_token(stream_resp)
 
-    await msg.send()
+    await msg.send()
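Taken together, a minimal end-to-end sketch of the new ingestion path outside Chainlit, mirroring the PDF branch of `process_text_file` (the file path is a placeholder):

from aimakerspace.pdfloader import PDFLoader
from aimakerspace.openai_utils.embedding import EmbeddingModel
from aimakerspace.semantic_chunking import SemanticChunking

embedding_model = EmbeddingModel()
chunker = SemanticChunking(embedding_model, breakpoint_threshold_type="percentile")

# Load a PDF into per-page Documents, then re-split each page into semantic chunks.
pages = PDFLoader("sample.pdf").load_and_split()  # placeholder path
split_pages = []
for page in pages:
    split_pages += chunker.split_text(page.page_content)

# Plain strings, ready for VectorDatabase.abuild_from_list(texts).
texts = [chunk.page_content for chunk in split_pages]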
requirements.txt CHANGED
@@ -1,3 +1,8 @@
 numpy
 chainlit==0.7.700
-openai
+openai
+langchain
+langchain-community
+langchain-experimental
+langchain-openai
+langchain-core