fracapuano committed
Commit 02556c2
1 Parent(s): acbe90b

fix: major code restructuring

Files changed (1)
  qa/utils.py +58 -29
qa/utils.py CHANGED
@@ -7,7 +7,7 @@ from langchain.llms import OpenAI
 from langchain.docstore.document import Document
 from langchain.vectorstores import FAISS, VectorStore
 import docx2txt
-from typing import List, Dict, Any, Union, Text, Tuple
+from typing import List, Dict, Any, Union, Text, Tuple, Iterable
 import re
 from io import BytesIO
 import streamlit as st
@@ -15,12 +15,38 @@ from .prompts import STUFF_PROMPT
 from pypdf import PdfReader
 from openai.error import AuthenticationError
 
+class PDFFile:
+    """A PDF file class for typing purposes."""
+    @classmethod
+    def is_pdf(file:Any) -> bool:
+        return file.name.endswith(".pdf")
+
+class DocxFile:
+    """A Docx file class for typing purposes."""
+    @classmethod
+    def is_docx(file:Any) -> bool:
+        return file.name.endswith(".docx")
+
+class TxtFile:
+    """A Txt file class for typing purposes."""
+    @classmethod
+    def is_txt(file:Any) -> bool:
+        return file.name.endswith(".txt")
+
+class CodeFile:
+    """A scripting-file class for typing purposes."""
+    @classmethod
+    def is_code(file:Any) -> bool:
+        return file.name.split(".")[1] in [".py", ".json", ".html", ".css", ".md"]
+
+
 class HashDocument(Document):
     """A document that uses the page content as the hash."""
     def __hash__(self):
         content = self.page_content + "".join(self.metadata[k] for k in self.metadata.keys())
         return hash(content)
 
+
 @st.cache_data
 def parse_docx(file: BytesIO) -> str:
     text = docx2txt.process(file)
@@ -43,7 +69,6 @@ def parse_pdf(file: BytesIO) -> List[str]:
         text = re.sub(r"\n\s*\n", "\n\n", text)
 
         output.append(text)
-
     return output
 
 
@@ -54,6 +79,19 @@ def parse_txt(file: BytesIO) -> str:
     text = re.sub(r"\n\s*\n", "\n\n", text)
     return text
 
+@st.cache_data
+def get_text_splitter(
+    chunk_size:int=500,
+    chunk_overlap:int=50,
+    separators:Iterable[Text]= ["\n\n", "\n", ".", "!", "?", ",", " ", ""])->RecursiveCharacterTextSplitter:
+    """Returns a text splitter instance with the given parameters. Cached for performance."""
+    # text splitter to split the text into chunks
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, # a limited chunk size ensures smaller chunks and more precise answers
+        separators=separators, # a list of separators to split the text on
+        chunk_overlap=chunk_overlap, # minimal overlap to capture sematic overlap across chunks
+    )
+    return text_splitter
 
 @st.cache_data
 def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
@@ -61,10 +99,13 @@ def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
     Converts a string or frozenset of strings to a list of Documents
     with metadata.
     """
-    if isinstance(text, str):
-        # Take a single string as one page
-        text = tuple([text])
-    elif isinstance(text, tuple):
+    # sanity check on the input provided
+    if not isinstance(text, (str, tuple)):
+        raise ValueError("Text must be either a string or a list of strings. Got: {type(text)}")
+    elif isinstance(text, str):
+        # Take a single string as one page - make it a tuple so that is hashable
+        text = (text, )
+    if isinstance(text, tuple):
         # map each page into a document instance
         page_docs = [HashDocument(page_content=page) for page in text]
         # Add page numbers as metadata
@@ -72,52 +113,40 @@ def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
             doc.metadata["page"] = i + 1
         # Split pages into chunks
         doc_chunks = []
-        # text splitter to split the text into chunks
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=800,
-            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
-            chunk_overlap=20, # minimal overlap to capture sematic overlap across chunks
-        )
-
+        # Get the text splitter
+        text_splitter = get_text_splitter()
+
         for doc in page_docs:
+            # this splits the page into chunks
            chunks = text_splitter.split_text(doc.page_content)
            for i, chunk in enumerate(chunks):
                # Create a new document for each individual chunk
                doc = HashDocument(
                    page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
                )
-                # Add sources a metadata
+                # Add sources to metadata for retrieval later on
                doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
                doc_chunks.append(doc)
 
        return doc_chunks
 
-    else:
-        raise ValueError("Text must be either a string or a list of strings. Got: {type(text)}")
-
 
 @st.cache_data
 def embed_docs(_docs: Tuple[Document]) -> VectorStore:
     """Embeds a list of Documents and returns a FAISS index"""
-    docs = _docs
-    if not st.session_state.get("OPENAI_API_KEY"):
-        raise AuthenticationError(
-            "Enter your OpenAI API key in the sidebar. You can get a key at https://platform.openai.com/account/api-keys."
-        )
-    else:
-        # Embed the chunks
-        embeddings = OpenAIEmbeddings(openai_api_key=st.session_state.get("OPENAI_API_KEY"))
-        index = FAISS.from_documents(list(docs), embeddings)
+    # Embed the chunks
+    embeddings = OpenAIEmbeddings(openai_api_key=st.session_state.get("OPENAI_API_KEY"))
+    index = FAISS.from_documents(list(_docs), embeddings)
 
-        return index
+    return index
 
 @st.cache_data
-def search_docs(_index: VectorStore, query: str) -> List[Document]:
+def search_docs(_index: VectorStore, query: str, k:int=5) -> List[Document]:
     """Searches a FAISS index for similar chunks to the query
     and returns a list of Documents."""
 
     # Search for similar chunks
-    docs = _index.similarity_search(query, k=5)
+    docs = _index.similarity_search(query, k=k)
     return docs
 
 
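
For orientation, a minimal, hypothetical sketch of how the restructured helpers could be chained from a Streamlit app. Only the function names (parse_pdf, parse_docx, parse_txt, text_to_docs, embed_docs, search_docs) and the OPENAI_API_KEY session-state key come from the diff above; the app wiring, widget labels, import path, and the plain extension checks standing in for the new PDFFile/DocxFile/TxtFile helpers are assumptions, not part of this commit.

# Hypothetical usage sketch, not part of commit 02556c2.
import streamlit as st
from qa.utils import (parse_pdf, parse_docx, parse_txt,
                      text_to_docs, embed_docs, search_docs)

# Assumption: the key is collected in the sidebar, since embed_docs reads
# st.session_state.get("OPENAI_API_KEY") when building the embeddings.
st.session_state["OPENAI_API_KEY"] = st.sidebar.text_input("OpenAI API key", type="password")

uploaded = st.file_uploader("Upload a document", type=["pdf", "docx", "txt"])
query = st.text_input("Ask a question about the document")

if uploaded and query:
    # Dispatch on the file extension, mirroring the new PDFFile/DocxFile/TxtFile checks.
    if uploaded.name.endswith(".pdf"):
        pages = tuple(parse_pdf(uploaded))   # parse_pdf returns one string per page
    elif uploaded.name.endswith(".docx"):
        pages = parse_docx(uploaded)
    else:
        pages = parse_txt(uploaded)

    docs = text_to_docs(pages)       # chunked HashDocument instances with page/chunk metadata
    index = embed_docs(tuple(docs))  # FAISS index over OpenAI embeddings
    for doc in search_docs(index, query, k=5):
        st.write(doc.metadata["source"], doc.page_content)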