# rag / pdf_utils.py
# jessica45's picture
# Update pdf_utils.py
# 8a3f76a verified
import os
import re
import pdfplumber
import fitz # PyMuPDF
from tika import parser
from typing import List, Optional
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tempfile
import logging
import warnings
# Configure logging.
# NOTE: basicConfig runs at import time and sets the root logger level to INFO.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every helper in this file for errors, warnings and info.
logger = logging.getLogger(__name__)
def clean_text(text: str) -> str:
    """Clean extracted text by removing extra whitespace and invalid characters.

    Runs of horizontal whitespace are collapsed to a single space, non-printable
    characters are dropped, and runs of blank lines are reduced to one paragraph
    break. Line breaks themselves are preserved so that downstream splitting
    can still use line and paragraph boundaries.

    Args:
        text: Raw text as produced by an extractor.

    Returns:
        The cleaned text with leading/trailing whitespace removed, or an empty
        string when ``text`` is empty.
    """
    if not text:
        return ""

    # Strip control/format characters first, but keep every whitespace
    # character so that adjacent words are not glued together and the
    # whitespace left around a removed character is still collapsed below.
    text = ''.join(char for char in text if char.isprintable() or char.isspace())

    # Normalise Windows / old-Mac line endings before touching newlines.
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # Collapse whitespace *except* newlines. Using r'\s+' here would swallow
    # every newline and make the newline handling below unreachable.
    text = re.sub(r'[^\S\n]+', ' ', text)

    # Remove the single spaces left hanging around line breaks.
    text = re.sub(r' ?\n ?', '\n', text)

    # Three or more consecutive newlines become one paragraph break.
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()
def extract_text_from_pdf(file_path: str) -> Optional[str]:
    """
    Extract text from PDF using PyMuPDF (faster than pdfplumber).

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The cleaned text of all pages joined by newlines, or ``None`` when the
        document yields no text or any error occurs (the error is logged).
    """
    try:
        # Open the document as a context manager so the underlying file
        # handle is released even if reading a page raises.
        with fitz.open(file_path) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        if not text:
            return None
        # A document containing only whitespace cleans down to "", which is
        # reported as "no text" (None) like the empty case above.
        return clean_text(text) or None
    except Exception as e:
        logger.error(f"Error extracting text from {file_path} using PyMuPDF: {e}")
        return None
def extract_text_from_docx(file_path: str) -> Optional[str]:
    """
    Read the paragraphs of a DOCX file and return them as cleaned text.

    Paragraphs whose text is blank are skipped; the remaining ones are joined
    with newlines and cleaned. Returns ``None`` when nothing is left or when
    the document cannot be processed (the failure is logged).
    """
    try:
        document = Document(file_path)
        kept_paragraphs = [
            paragraph.text
            for paragraph in document.paragraphs
            if paragraph.text.strip()
        ]
        joined = '\n'.join(kept_paragraphs)
        if not joined:
            return None
        return clean_text(joined)
    except Exception as e:
        logger.error(f"Failed to process DOCX {file_path}: {e}")
        return None
def extract_text_from_other_files(file_path: str) -> Optional[str]:
    """
    Extract text using Apache Tika for other file formats.

    Args:
        file_path: Path of the file to hand to Tika.

    Returns:
        The cleaned text, or ``None`` when Tika yields no content or any error
        occurs (the error is logged).
    """
    try:
        parsed = parser.from_file(file_path)
        # The "content" key can be present with a None value, in which case
        # dict.get's default is not used; fall back to "" explicitly so that
        # "no content" is not reported as an extraction error.
        text = (parsed.get("content") or "").strip()
        return clean_text(text) if text else None
    except Exception as e:
        logger.error(f"Error extracting text from {file_path} using Tika: {e}")
        return None
def _extract_text_by_extension(file_path: str, file_extension: str) -> Optional[str]:
    """Dispatch ``file_path`` to the extractor matching ``file_extension``."""
    if file_extension == ".pdf":
        return extract_text_from_pdf(file_path)  # Use PyMuPDF
    if file_extension == ".docx":
        return extract_text_from_docx(file_path)
    if file_extension == ".txt":
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                return clean_text(file.read())
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so this never fails.
            with open(file_path, "r", encoding="latin-1") as file:
                return clean_text(file.read())
    return extract_text_from_other_files(file_path)  # Use Apache Tika


def extract_text_from_file(uploaded_file) -> Optional[str]:
    """
    Extract text from various file types.

    Args:
        uploaded_file: Either a filesystem path (``str`` or ``os.PathLike``)
            or a file-like object exposing ``read()``. File-like objects are
            written to a temporary file that is removed before returning.

    Returns:
        The extracted, cleaned text, or ``None`` when the file does not exist,
        contains no text, or extraction fails (the reason is logged).
    """
    temp_path: Optional[str] = None
    try:
        if isinstance(uploaded_file, (str, os.PathLike)):  # Handle direct file paths
            file_path = os.fspath(uploaded_file)
        else:  # Handle file-like objects (e.g., uploaded files)
            # Carry the original extension over to the temporary file;
            # without it the extension check below would always see "" and
            # every upload would be routed to the Tika fallback.
            source_name = getattr(uploaded_file, "name", None)
            suffix = os.path.splitext(source_name)[1] if isinstance(source_name, str) else ""
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                temp_path = temp_file.name
                temp_file.write(uploaded_file.read())  # Save file contents temporarily
            file_path = temp_path

        if not os.path.exists(file_path):
            logger.error(f"File not found: {file_path}")
            return None

        file_extension = os.path.splitext(file_path)[1].lower()
        try:
            text = _extract_text_by_extension(file_path, file_extension)
        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {e}")
            return None

        if not text:
            logger.warning(f"No text content extracted from {file_path}")
            return None
        return text
    finally:
        # The temporary copy was created with delete=False, so it must be
        # removed explicitly or it accumulates on disk.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError as e:
                logger.warning(f"Could not remove temporary file {temp_path}: {e}")
def split_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
    """
    Break text into overlapping chunks with RecursiveCharacterTextSplitter.

    Returns an empty list when ``text`` is empty or when splitting raises;
    both situations are logged.
    """
    if text:
        try:
            splitter_options = {
                "chunk_size": chunk_size,
                "chunk_overlap": chunk_overlap,
                "length_function": len,
                "is_separator_regex": False,
            }
            chunks = RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
            logger.info(f"Split text into {len(chunks)} chunks")
            return chunks
        except Exception as e:
            logger.error(f"Error splitting text: {e}")
            return []

    logger.warning("Empty text provided for splitting")
    return []