Update pdf_utils.py
Browse files- pdf_utils.py +28 -41
pdf_utils.py
CHANGED
@@ -1,10 +1,12 @@
|
|
|
|
1 |
import re
|
2 |
import pdfplumber
|
|
|
|
|
3 |
from typing import List, Optional
|
4 |
-
import textract
|
5 |
from docx import Document
|
6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
7 |
-
import
|
8 |
import logging
|
9 |
import warnings
|
10 |
|
@@ -19,53 +21,47 @@ def clean_text(text: str) -> str:
|
|
19 |
text = re.sub(r'\n\s*\n', '\n\n', text) # Remove multiple newlines
|
20 |
return text.strip()
|
21 |
|
22 |
-
def extract_text_from_pdf(
|
23 |
"""
|
24 |
-
Extract text from PDF using pdfplumber.
|
25 |
"""
|
26 |
-
extracted_text = []
|
27 |
try:
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
page_text = page.extract_text()
|
32 |
-
if page_text:
|
33 |
-
extracted_text.append(page_text)
|
34 |
-
else:
|
35 |
-
logger.warning(f"No text extracted from page {page_num}")
|
36 |
-
except Exception as e:
|
37 |
-
logger.error(f"Error extracting text from page {page_num}: {e}")
|
38 |
-
continue
|
39 |
-
|
40 |
-
if not extracted_text:
|
41 |
-
logger.warning("No text was extracted from any page of the PDF")
|
42 |
-
return None
|
43 |
-
|
44 |
-
return clean_text('\n'.join(extracted_text))
|
45 |
except Exception as e:
|
46 |
-
logger.error(f"
|
47 |
return None
|
48 |
|
49 |
-
def extract_text_from_docx(
|
50 |
"""
|
51 |
Extract text from DOCX with enhanced error handling.
|
52 |
"""
|
53 |
try:
|
54 |
-
doc = Document(
|
55 |
text = '\n'.join(para.text for para in doc.paragraphs if para.text.strip())
|
56 |
return clean_text(text) if text else None
|
57 |
except Exception as e:
|
58 |
-
logger.error(f"Failed to process DOCX {
|
59 |
return None
|
60 |
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
def extract_text_from_file(uploaded_file) -> Optional[str]:
|
64 |
"""
|
65 |
-
Extract text from various file types
|
66 |
-
If file is uploaded as file-like object, save it temporarily.
|
67 |
"""
|
68 |
-
if isinstance(uploaded_file, str): #
|
69 |
file_path = uploaded_file
|
70 |
else: # Handle file-like objects (e.g., uploaded files)
|
71 |
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
@@ -81,7 +77,7 @@ def extract_text_from_file(uploaded_file) -> Optional[str]:
|
|
81 |
|
82 |
try:
|
83 |
if file_extension == ".pdf":
|
84 |
-
text = extract_text_from_pdf(file_path)
|
85 |
elif file_extension == ".docx":
|
86 |
text = extract_text_from_docx(file_path)
|
87 |
elif file_extension == ".txt":
|
@@ -92,7 +88,7 @@ def extract_text_from_file(uploaded_file) -> Optional[str]:
|
|
92 |
with open(file_path, "r", encoding="latin-1") as file:
|
93 |
text = clean_text(file.read())
|
94 |
else:
|
95 |
-
text =
|
96 |
|
97 |
if not text:
|
98 |
logger.warning(f"No text content extracted from {file_path}")
|
@@ -104,7 +100,6 @@ def extract_text_from_file(uploaded_file) -> Optional[str]:
|
|
104 |
logger.error(f"Error extracting text from {file_path}: {e}")
|
105 |
return None
|
106 |
|
107 |
-
|
108 |
def split_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
|
109 |
"""
|
110 |
Split text into chunks with improved handling and validation.
|
@@ -131,11 +126,3 @@ def split_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> L
|
|
131 |
logger.error(f"Error splitting text: {e}")
|
132 |
return []
|
133 |
|
134 |
-
# Example usage
|
135 |
-
if __name__ == "__main__":
|
136 |
-
sample_file = "/Users/jessicawin/Downloads/github-recovery-codes.txt"
|
137 |
-
if os.path.exists(sample_file):
|
138 |
-
file_text = extract_text_from_file(sample_file)
|
139 |
-
if file_text:
|
140 |
-
chunks = split_text(file_text)
|
141 |
-
print(f"Successfully processed file into {len(chunks)} chunks")
|
|
|
1 |
+
import os
|
2 |
import re
|
3 |
import pdfplumber
|
4 |
+
import fitz # PyMuPDF
|
5 |
+
from tika import parser
|
6 |
from typing import List, Optional
|
|
|
7 |
from docx import Document
|
8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
+
import tempfile
|
10 |
import logging
|
11 |
import warnings
|
12 |
|
|
|
21 |
text = re.sub(r'\n\s*\n', '\n\n', text) # Remove multiple newlines
|
22 |
return text.strip()
|
23 |
|
24 |
+
def extract_text_from_pdf(file_path: str) -> Optional[str]:
    """
    Extract text from PDF using PyMuPDF (faster than pdfplumber).

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        Cleaned text of all pages joined by newlines, or None when the
        document yields no text or cannot be opened.
    """
    try:
        # Use the document as a context manager so the underlying file
        # handle is closed even if page extraction raises; the original
        # never called doc.close() and leaked the handle on every call.
        with fitz.open(file_path) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        return clean_text(text) if text else None
    except Exception as e:
        logger.error(f"Error extracting text from {file_path} using PyMuPDF: {e}")
        return None
|
35 |
|
36 |
+
def extract_text_from_docx(file_path: str) -> Optional[str]:
    """
    Extract text from DOCX with enhanced error handling.

    Args:
        file_path: Path to the .docx file.

    Returns:
        Cleaned paragraph text joined by newlines, or None when the file
        has no non-blank paragraphs or cannot be parsed.
    """
    try:
        document = Document(file_path)
        # Keep only paragraphs that contain something other than whitespace.
        non_blank = [paragraph.text for paragraph in document.paragraphs
                     if paragraph.text.strip()]
        text = '\n'.join(non_blank)
        if not text:
            return None
        return clean_text(text)
    except Exception as e:
        logger.error(f"Failed to process DOCX {file_path}: {e}")
        return None
|
47 |
|
48 |
+
def extract_text_from_other_files(file_path: str) -> Optional[str]:
    """
    Extract text using Apache Tika for other file formats.

    Args:
        file_path: Path to the file to parse.

    Returns:
        Cleaned document text, or None when Tika extracts nothing or the
        parse fails.
    """
    try:
        parsed = parser.from_file(file_path)
        # Tika reports empty/unparseable documents as {"content": None};
        # `.get("content", "")` only covers a *missing* key, so the original
        # could call .strip() on None and mis-log an AttributeError as a
        # Tika failure. `or ""` normalizes None to an empty string.
        text = (parsed.get("content") or "").strip()
        return clean_text(text) if text else None
    except Exception as e:
        logger.error(f"Error extracting text from {file_path} using Tika: {e}")
        return None
|
59 |
|
60 |
def extract_text_from_file(uploaded_file) -> Optional[str]:
|
61 |
"""
|
62 |
+
Extract text from various file types.
|
|
|
63 |
"""
|
64 |
+
if isinstance(uploaded_file, str): # Handle direct file paths
|
65 |
file_path = uploaded_file
|
66 |
else: # Handle file-like objects (e.g., uploaded files)
|
67 |
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
|
|
77 |
|
78 |
try:
|
79 |
if file_extension == ".pdf":
|
80 |
+
text = extract_text_from_pdf(file_path) # Use PyMuPDF
|
81 |
elif file_extension == ".docx":
|
82 |
text = extract_text_from_docx(file_path)
|
83 |
elif file_extension == ".txt":
|
|
|
88 |
with open(file_path, "r", encoding="latin-1") as file:
|
89 |
text = clean_text(file.read())
|
90 |
else:
|
91 |
+
text = extract_text_from_other_files(file_path) # Use Apache Tika
|
92 |
|
93 |
if not text:
|
94 |
logger.warning(f"No text content extracted from {file_path}")
|
|
|
100 |
logger.error(f"Error extracting text from {file_path}: {e}")
|
101 |
return None
|
102 |
|
|
|
103 |
def split_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
|
104 |
"""
|
105 |
Split text into chunks with improved handling and validation.
|
|
|
126 |
logger.error(f"Error splitting text: {e}")
|
127 |
return []
|
128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|