kpal002 committed on
Commit
6ce5fee
·
verified ·
1 Parent(s): 8460e38

Update RAG_utils.py

Browse files
Files changed (1) hide show
  1. RAG_utils.py +30 -2
RAG_utils.py CHANGED
@@ -11,6 +11,7 @@ import pandas as pd
11
  import numpy as np
12
  import evaluate
13
  import qdrant_client
 
14
  from pydantic import BaseModel, Field
15
  from typing import Any, List, Tuple, Set, Dict, Optional, Union
16
  from sklearn.metrics.pairwise import cosine_similarity
@@ -385,6 +386,32 @@ class PDFProcessor_Unstructured:
385
  return (current_chunk.endswith(",") or
386
  (current_chunk[-1].islower() and next_chunk[0].islower()))
387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
  def process_pdf(self) -> Tuple[List[str], List[str]]:
389
  """
390
  Processes the PDF by extracting, categorizing, and merging elements.
@@ -430,9 +457,10 @@ class PDFProcessor_Unstructured:
430
 
431
  try:
432
  logging.debug(f"Processing PDF at {self.file_path}")
433
- results = self.process_pdf() # Assuming this is a defined method
 
434
  logging.info("PDF processing completed successfully.")
435
- return results
436
  except Exception as e:
437
  logging.error(f"Error processing PDF file: {e}", exc_info=True)
438
  raise
 
11
  import numpy as np
12
  import evaluate
13
  import qdrant_client
14
+ from pypdf import PdfReader
15
  from pydantic import BaseModel, Field
16
  from typing import Any, List, Tuple, Set, Dict, Optional, Union
17
  from sklearn.metrics.pairwise import cosine_similarity
 
386
  return (current_chunk.endswith(",") or
387
  (current_chunk[-1].islower() and next_chunk[0].islower()))
388
 
389
def extract_title_from_pdf(self, uploaded_file) -> str:
    """
    Extract the title from a PDF file's metadata.

    Reads the document information dictionary with pypdf's ``PdfReader``
    and returns its title entry. When the PDF carries no metadata, or the
    metadata has no (non-empty) title, a fixed placeholder string is
    returned instead.

    Parameters:
        uploaded_file: A path to the PDF file, or a file object opened in
            binary mode, from which to extract the title.

    Returns:
        str: The title stored in the PDF metadata, or 'Title not found'
        when no title is available.
    """
    # Use the PdfReader name that the module imports from pypdf; the legacy
    # PyPDF2 name PdfFileReader is not imported and would raise NameError.
    pdf_reader = PdfReader(uploaded_file)

    # `metadata` replaces the legacy getDocumentInfo() accessor. It can be
    # None when the PDF has no document information dictionary.
    meta = pdf_reader.metadata

    # Fall back to the placeholder for missing metadata or an empty title.
    if meta and meta.title:
        return meta.title
    return 'Title not found'
414
+
415
  def process_pdf(self) -> Tuple[List[str], List[str]]:
416
  """
417
  Processes the PDF by extracting, categorizing, and merging elements.
 
457
 
458
  try:
459
  logging.debug(f"Processing PDF at {self.file_path}")
460
+ results = self.process_pdf()
461
+ title = extract_title_from_pdf(self.file_path)
462
  logging.info("PDF processing completed successfully.")
463
+ return results, title
464
  except Exception as e:
465
  logging.error(f"Error processing PDF file: {e}", exc_info=True)
466
  raise