|
import os |
|
import re |
|
import fitz |
|
import logging |
|
from PIL import Image |
|
from pdf2image import convert_from_path |
|
import platform |
|
import pytesseract |
|
import docx |
|
from odf.opendocument import load as load_odt |
|
from odf.text import P |
|
|
|
|
|
# Conventional Tesseract install location for the current operating system.
_TESSERACT_CANDIDATE = (
    r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    if platform.system() == "Windows"
    else r'/usr/bin/tesseract'
)

# Only override pytesseract's command when the binary is really there.
# Otherwise keep pytesseract's default command, which is resolved through
# PATH, so installs in other locations (e.g. Homebrew) keep working.
if os.path.isfile(_TESSERACT_CANDIDATE):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_CANDIDATE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_text_from_pdf(file_path):
    """Extract text and unique hyperlink URIs from a PDF.

    Pages that have a text layer are read directly. A page whose text layer
    is empty (or whitespace only) is rasterised at 300 dpi and passed through
    Tesseract OCR instead.

    Args:
        file_path: Path of the PDF file.

    Returns:
        A ``(text, hyperlinks)`` tuple: the concatenated page text in page
        order, and the distinct link URIs in first-seen order. On any failure
        the error is logged and ``("", [])`` is returned.
    """
    chunks = []
    hyperlinks = []
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(file_path) as doc:
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)
                page_text = page.get_text("text")

                if page_text.strip():
                    chunks.append(page_text)
                else:
                    # OCR only this page (pdf2image page numbers are 1-based).
                    # Rendering the whole file here would append the OCR text
                    # of every page once per blank page.
                    images = convert_from_path(
                        file_path,
                        dpi=300,
                        first_page=page_num + 1,
                        last_page=page_num + 1,
                    )
                    chunks.extend(
                        pytesseract.image_to_string(image) for image in images
                    )

                for link in page.get_links():
                    if link.get("uri"):
                        hyperlinks.append(link["uri"])
    except Exception as e:
        logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
        return "", []

    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return "".join(chunks), list(dict.fromkeys(hyperlinks))
|
|
|
|
|
def extract_text_from_docx(file_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        file_path: Path of the DOCX file.

    Returns:
        The text of every paragraph joined with newlines, or ``""`` when the
        document cannot be read (the error is logged).
    """
    try:
        paragraphs = docx.Document(file_path).paragraphs
        lines = [paragraph.text for paragraph in paragraphs]
        return "\n".join(lines)
    except Exception as err:
        logging.error(f"Error extracting text from DOCX: {err}")
        return ""
|
|
|
|
|
def extract_text_from_rsf(file_path):
    """Read an RSF file as UTF-8 text.

    Args:
        file_path: Path of the file to read.

    Returns:
        The full file content, or ``""`` when the file cannot be opened or
        decoded (the error is logged).
    """
    content = ""
    try:
        with open(file_path, mode="r", encoding="utf-8") as handle:
            content = handle.read()
    except Exception as err:
        logging.error(f"Error extracting text from RSF: {err}")
    return content
|
|
|
|
|
def extract_text_from_odt(file_path):
    """Return the paragraph text of an ODT file, one paragraph per line.

    Args:
        file_path: Path of the ODT file.

    Returns:
        The text of every non-empty paragraph joined with newlines, or ``""``
        when the document cannot be read (the error is logged).
    """
    try:
        # Local import: odfpy is already a dependency of this module, and
        # keeping the import here leaves the file's import block untouched.
        from odf import teletype

        odt_doc = load_odt(file_path)
        paragraphs = odt_doc.getElementsByType(P)
        # extractText walks the whole paragraph subtree. Reading only
        # firstChild.data dropped everything after the first node and failed
        # when that node was an element (e.g. a span) rather than plain text.
        # Paragraphs without children are still skipped, as before.
        return "\n".join(
            teletype.extractText(paragraph)
            for paragraph in paragraphs
            if paragraph.firstChild
        )
    except Exception as e:
        logging.error(f"Error extracting text from ODT: {e}")
        return ""
|
|
|
|
|
def extract_text_from_image(file_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        file_path: Path of the image file.

    Returns:
        The OCR output, or ``""`` when the image cannot be opened or
        processed (the error is logged).
    """
    try:
        # The context manager releases the underlying file handle once OCR
        # is finished, including when OCR raises.
        with Image.open(file_path) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:
        logging.error(f"Error extracting text from image: {e}")
        return ""
|
|
|
|
|
# Compiled once at import time; both patterns are reused on every call.
_WHITESPACE_RE = re.compile(r'\s+')
_PHONE_RE = re.compile(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)')


def preprocess_text(text):
    """Normalise whitespace and isolate 10-digit phone-like numbers.

    Every run of whitespace (including newlines) becomes a single space, and
    each ``ddd[-. ]ddd[-. ]dddd`` number is separated from adjacent text by a
    space.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text, with no leading or trailing whitespace and no
        consecutive spaces.
    """
    # Collapse first so numbers split across newlines/tabs match the pattern.
    text = _WHITESPACE_RE.sub(' ', text)
    text = _PHONE_RE.sub(r' \1 ', text)
    # Padding a number that already had neighbouring spaces creates double
    # spaces, so collapse once more before trimming.
    text = _WHITESPACE_RE.sub(' ', text)
    return text.strip()
|
|
|
|
|
def extract_text_based_on_format(file_path):
    """Extract text from a file, picking the extractor by file extension.

    Supported extensions (case-insensitive): ``.pdf``, ``.docx``, ``.rsf``,
    ``.odt``, ``.png``, ``.jpg`` and ``.jpeg``.

    Args:
        file_path: Path of the file to process.

    Returns:
        A ``(text, hyperlinks)`` tuple. ``hyperlinks`` is only populated for
        PDFs; every other format yields an empty list.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        body, links = extract_text_from_pdf(file_path)
        return body, links
    if suffix == '.docx':
        return extract_text_from_docx(file_path), []
    if suffix == '.rsf':
        return extract_text_from_rsf(file_path), []
    if suffix == '.odt':
        return extract_text_from_odt(file_path), []
    if suffix in ('.png', '.jpg', '.jpeg'):
        return extract_text_from_image(file_path), []

    raise ValueError("Unsupported file format")
|
|