lfoppiano committed on
Commit
60c4caf
·
1 Parent(s): cbdc1a4

include title, authors and year in the data store

Browse files
document_qa/document_qa_engine.py CHANGED
@@ -1,4 +1,5 @@
1
  import copy
 
2
  import os
3
  from pathlib import Path
4
  from typing import Union, Any
@@ -173,8 +174,10 @@ class DocumentQAEngine:
173
  relevant_documents = multi_query_retriever.get_relevant_documents(query)
174
  return relevant_documents
175
 
176
- def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
177
- """Extract text from documents using Grobid, if chunk_size is < 0 it keep each paragraph separately"""
 
 
178
  if verbose:
179
  print("File", pdf_file_path)
180
  filename = Path(pdf_file_path).stem
@@ -189,6 +192,7 @@ class DocumentQAEngine:
189
  texts = []
190
  metadatas = []
191
  ids = []
 
192
  if chunk_size < 0:
193
  for passage in structure['passages']:
194
  biblio_copy = copy.copy(biblio)
@@ -212,10 +216,25 @@ class DocumentQAEngine:
212
  metadatas = [biblio for _ in range(len(texts))]
213
  ids = [id for id, t in enumerate(texts)]
214
 
 
 
 
 
 
 
 
 
 
 
215
  return texts, metadatas, ids
216
 
217
- def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1):
218
- texts, metadata, ids = self.get_text_from_document(pdf_path, chunk_size=chunk_size, perc_overlap=perc_overlap)
 
 
 
 
 
219
  if doc_id:
220
  hash = doc_id
221
  else:
@@ -233,7 +252,7 @@ class DocumentQAEngine:
233
 
234
  return hash
235
 
236
- def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1):
237
  input_files = []
238
  for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
239
  for file_ in files:
@@ -250,9 +269,12 @@ class DocumentQAEngine:
250
  if os.path.exists(data_path):
251
  print(data_path, "exists. Skipping it ")
252
  continue
253
-
254
- texts, metadata, ids = self.get_text_from_document(input_file, chunk_size=chunk_size,
255
- perc_overlap=perc_overlap)
 
 
 
256
  filename = metadata[0]['filename']
257
 
258
  vector_db_document = Chroma.from_texts(texts,
 
1
  import copy
2
+ import json
3
  import os
4
  from pathlib import Path
5
  from typing import Union, Any
 
174
  relevant_documents = multi_query_retriever.get_relevant_documents(query)
175
  return relevant_documents
176
 
177
+ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, include=(), verbose=False):
178
+ """
179
+ Extract text from documents using Grobid, if chunk_size is < 0 it keeps each paragraph separately
180
+ """
181
  if verbose:
182
  print("File", pdf_file_path)
183
  filename = Path(pdf_file_path).stem
 
192
  texts = []
193
  metadatas = []
194
  ids = []
195
+
196
  if chunk_size < 0:
197
  for passage in structure['passages']:
198
  biblio_copy = copy.copy(biblio)
 
216
  metadatas = [biblio for _ in range(len(texts))]
217
  ids = [id for id, t in enumerate(texts)]
218
 
219
+ if "biblio" in include:
220
+ biblio_metadata = copy.copy(biblio)
221
+ biblio_metadata['type'] = "biblio"
222
+ biblio_metadata['section'] = "header"
223
+ for key in ['title', 'authors', 'year']:
224
+ if key in biblio_metadata:
225
+ texts.append("{}: {}".format(key, biblio_metadata[key]))
226
+ metadatas.append(biblio_metadata)
227
+ ids.append(key)
228
+
229
  return texts, metadatas, ids
230
 
231
+ def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1, include_biblio=False):
232
+ include = ["biblio"] if include_biblio else []
233
+ texts, metadata, ids = self.get_text_from_document(
234
+ pdf_path,
235
+ chunk_size=chunk_size,
236
+ perc_overlap=perc_overlap,
237
+ include=include)
238
  if doc_id:
239
  hash = doc_id
240
  else:
 
252
 
253
  return hash
254
 
255
+ def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1, include_biblio=False):
256
  input_files = []
257
  for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
258
  for file_ in files:
 
269
  if os.path.exists(data_path):
270
  print(data_path, "exists. Skipping it ")
271
  continue
272
+ include = ["biblio"] if include_biblio else []
273
+ texts, metadata, ids = self.get_text_from_document(
274
+ input_file,
275
+ chunk_size=chunk_size,
276
+ perc_overlap=perc_overlap,
277
+ include=include)
278
  filename = metadata[0]['filename']
279
 
280
  vector_db_document = Chroma.from_texts(texts,
streamlit_app.py CHANGED
@@ -283,7 +283,8 @@ if uploaded_file and not st.session_state.loaded_embeddings:
283
  # hash = get_file_hash(tmp_file.name)[:10]
284
  st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
285
  chunk_size=chunk_size,
286
- perc_overlap=0.1)
 
287
  st.session_state['loaded_embeddings'] = True
288
  st.session_state.messages = []
289
 
 
283
  # hash = get_file_hash(tmp_file.name)[:10]
284
  st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
285
  chunk_size=chunk_size,
286
+ perc_overlap=0.1,
287
+ include_biblio=True)
288
  st.session_state['loaded_embeddings'] = True
289
  st.session_state.messages = []
290