Spaces:
Runtime error
Runtime error
Update RAG_utils.py
Browse files- RAG_utils.py +30 -2
RAG_utils.py
CHANGED
@@ -11,6 +11,7 @@ import pandas as pd
|
|
11 |
import numpy as np
|
12 |
import evaluate
|
13 |
import qdrant_client
|
|
|
14 |
from pydantic import BaseModel, Field
|
15 |
from typing import Any, List, Tuple, Set, Dict, Optional, Union
|
16 |
from sklearn.metrics.pairwise import cosine_similarity
|
@@ -385,6 +386,32 @@ class PDFProcessor_Unstructured:
|
|
385 |
return (current_chunk.endswith(",") or
|
386 |
(current_chunk[-1].islower() and next_chunk[0].islower()))
|
387 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
388 |
def process_pdf(self) -> Tuple[List[str], List[str]]:
|
389 |
"""
|
390 |
Processes the PDF by extracting, categorizing, and merging elements.
|
@@ -430,9 +457,10 @@ class PDFProcessor_Unstructured:
|
|
430 |
|
431 |
try:
|
432 |
logging.debug(f"Processing PDF at {self.file_path}")
|
433 |
-
results = self.process_pdf()
|
|
|
434 |
logging.info("PDF processing completed successfully.")
|
435 |
-
return results
|
436 |
except Exception as e:
|
437 |
logging.error(f"Error processing PDF file: {e}", exc_info=True)
|
438 |
raise
|
|
|
11 |
import numpy as np
|
12 |
import evaluate
|
13 |
import qdrant_client
|
14 |
+
from pypdf import PdfReader
|
15 |
from pydantic import BaseModel, Field
|
16 |
from typing import Any, List, Tuple, Set, Dict, Optional, Union
|
17 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
386 |
return (current_chunk.endswith(",") or
|
387 |
(current_chunk[-1].islower() and next_chunk[0].islower()))
|
388 |
|
389 |
+
def extract_title_from_pdf(self, uploaded_file):
    """
    Extracts the title from a PDF file's metadata.

    This function reads the metadata of a PDF file using pypdf's PdfReader
    and attempts to extract the title. If the title is present in the
    metadata, it is returned. Otherwise, a default message indicating that
    the title was not found is returned.

    Parameters:
    uploaded_file (file or str): A file object or a path to the PDF file
        from which to extract the title. A file object must be opened in
        binary mode.

    Returns:
    str: The title of the PDF file as a string. If no title is found,
        returns 'Title not found'.
    """
    # Use the PdfReader class that this module imports from pypdf; the
    # legacy PdfFileReader name is not imported here and raised NameError.
    pdf_reader = PdfReader(uploaded_file)

    # `metadata` is the supported accessor for the document information
    # dictionary; it is None when the PDF carries no such dictionary.
    meta = pdf_reader.metadata

    # Fall back to the placeholder when metadata or the title is missing/empty.
    if meta is not None and meta.title:
        return meta.title
    return 'Title not found'
|
414 |
+
|
415 |
def process_pdf(self) -> Tuple[List[str], List[str]]:
|
416 |
"""
|
417 |
Processes the PDF by extracting, categorizing, and merging elements.
|
|
|
457 |
|
458 |
try:
|
459 |
logging.debug(f"Processing PDF at {self.file_path}")
|
460 |
+
results = self.process_pdf()
|
461 |
+
title = extract_title_from_pdf(self.file_path)
|
462 |
logging.info("PDF processing completed successfully.")
|
463 |
+
return results, title
|
464 |
except Exception as e:
|
465 |
logging.error(f"Error processing PDF file: {e}", exc_info=True)
|
466 |
raise
|