WebashalarForML committed on
Commit
0f64637
1 Parent(s): df05d1c

Update utils/fileTotext.py

Browse files
Files changed (1) hide show
  1. utils/fileTotext.py +127 -127
utils/fileTotext.py CHANGED
@@ -1,127 +1,127 @@
1
- import os
2
- import re
3
- import fitz
4
- import logging
5
- from PIL import Image
6
- from pdf2image import convert_from_path
7
- import platform
8
- import pytesseract
9
- import docx
10
- from odf.opendocument import load as load_odt
11
- from odf.text import P
12
-
13
- # Path to tesseract executable (ensure it points to tesseract.exe)
14
- if platform.system() == "Windows":
15
- pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
16
- else:
17
- # For Hugging Face Spaces or other Linux environments
18
- pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
19
-
20
- # # Set up logging
21
- # logging.basicConfig(
22
- # level=logging.DEBUG,
23
- # format='%(asctime)s - %(levelname)s - %(message)s',
24
- # handlers=[logging.StreamHandler()]
25
- # )
26
-
27
- # # Path to Tesseract executable
28
- # tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract')
29
- # pytesseract.pytesseract.tesseract_cmd = tesseract_path
30
-
31
- # Function to extract text from PDF using PyMuPDF
32
- def extract_text_from_pdf(file_path):
33
- text = ""
34
- hyperlinks = []
35
- try:
36
- doc = fitz.open(file_path)
37
- for page_num in range(doc.page_count):
38
- page = doc.load_page(page_num)
39
- page_text = page.get_text("text")
40
-
41
- if not page_text.strip():
42
- images = convert_from_path(file_path, dpi=300)
43
- for image in images:
44
- text += pytesseract.image_to_string(image)
45
- else:
46
- text += page_text
47
-
48
- links = page.get_links()
49
- for link in links:
50
- if link.get("uri"):
51
- hyperlinks.append(link["uri"])
52
- except Exception as e:
53
- logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
54
- return "", []
55
-
56
- return text, list(set(hyperlinks))
57
-
58
- # Function to extract text from DOCX
59
- def extract_text_from_docx(file_path):
60
- try:
61
- doc = docx.Document(file_path)
62
- text = "\n".join([para.text for para in doc.paragraphs])
63
- return text
64
- except Exception as e:
65
- logging.error(f"Error extracting text from DOCX: {e}")
66
- return ""
67
-
68
- # Function to extract text from RSF (assuming text-based format)
69
- def extract_text_from_rsf(file_path):
70
- try:
71
- with open(file_path, "r", encoding="utf-8") as file:
72
- return file.read()
73
- except Exception as e:
74
- logging.error(f"Error extracting text from RSF: {e}")
75
- return ""
76
-
77
- # Function to extract text from ODT
78
- def extract_text_from_odt(file_path):
79
- try:
80
- odt_doc = load_odt(file_path)
81
- text_elements = odt_doc.getElementsByType(P)
82
- text = "\n".join([te.firstChild.data for te in text_elements if te.firstChild])
83
- return text
84
- except Exception as e:
85
- logging.error(f"Error extracting text from ODT: {e}")
86
- return ""
87
-
88
- # Function to extract text from images using Tesseract
89
- def extract_text_from_image(file_path):
90
- try:
91
- img = Image.open(file_path)
92
- text = pytesseract.image_to_string(img)
93
-
94
- return text
95
- except Exception as e:
96
- logging.error(f"Error extracting text from image: {e}")
97
- return ""
98
-
99
- # Function to clean and preprocess the extracted text
100
- def preprocess_text(text):
101
- text = re.sub(r'\s+', ' ', text)
102
- text = re.sub(r'\n', ' ', text)
103
- text = re.sub(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)', r' \1 ', text)
104
- return text.strip()
105
-
106
- # Function to automatically detect file format and extract text
107
- def extract_text_based_on_format(file_path):
108
- file_ext = os.path.splitext(file_path)[1].lower()
109
-
110
- if file_ext == '.pdf':
111
- text, hyperlinks = extract_text_from_pdf(file_path)
112
- elif file_ext == '.docx':
113
- text = extract_text_from_docx(file_path)
114
- hyperlinks = []
115
- elif file_ext == '.rsf':
116
- text = extract_text_from_rsf(file_path)
117
- hyperlinks = []
118
- elif file_ext == '.odt':
119
- text = extract_text_from_odt(file_path)
120
- hyperlinks = []
121
- elif file_ext in ['.png', '.jpg', '.jpeg']:
122
- text = extract_text_from_image(file_path)
123
- hyperlinks = []
124
- else:
125
- raise ValueError("Unsupported file format")
126
-
127
- return text, hyperlinks
 
1
+ import os
2
+ import re
3
+ import fitz
4
+ import logging
5
+ from PIL import Image
6
+ from pdf2image import convert_from_path
7
+ import platform
8
+ import pytesseract
9
+ import docx
10
+ from odf.opendocument import load as load_odt
11
+ from odf.text import P
12
+
13
+ # Path to tesseract executable (ensure it points to tesseract.exe)
14
+ #if platform.system() == "Windows":
15
+ # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
16
+ #else:
17
+ # For Hugging Face Spaces or other Linux environments
18
+ pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
19
+
20
+ # # Set up logging
21
+ # logging.basicConfig(
22
+ # level=logging.DEBUG,
23
+ # format='%(asctime)s - %(levelname)s - %(message)s',
24
+ # handlers=[logging.StreamHandler()]
25
+ # )
26
+
27
+ # # Path to Tesseract executable
28
+ # tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract')
29
+ # pytesseract.pytesseract.tesseract_cmd = tesseract_path
30
+
31
+ # Function to extract text from PDF using PyMuPDF
32
+ def extract_text_from_pdf(file_path):
33
+ text = ""
34
+ hyperlinks = []
35
+ try:
36
+ doc = fitz.open(file_path)
37
+ for page_num in range(doc.page_count):
38
+ page = doc.load_page(page_num)
39
+ page_text = page.get_text("text")
40
+
41
+ if not page_text.strip():
42
+ images = convert_from_path(file_path, dpi=300)
43
+ for image in images:
44
+ text += pytesseract.image_to_string(image)
45
+ else:
46
+ text += page_text
47
+
48
+ links = page.get_links()
49
+ for link in links:
50
+ if link.get("uri"):
51
+ hyperlinks.append(link["uri"])
52
+ except Exception as e:
53
+ logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
54
+ return "", []
55
+
56
+ return text, list(set(hyperlinks))
57
+
58
+ # Function to extract text from DOCX
59
+ def extract_text_from_docx(file_path):
60
+ try:
61
+ doc = docx.Document(file_path)
62
+ text = "\n".join([para.text for para in doc.paragraphs])
63
+ return text
64
+ except Exception as e:
65
+ logging.error(f"Error extracting text from DOCX: {e}")
66
+ return ""
67
+
68
+ # Function to extract text from RSF (assuming text-based format)
69
+ def extract_text_from_rsf(file_path):
70
+ try:
71
+ with open(file_path, "r", encoding="utf-8") as file:
72
+ return file.read()
73
+ except Exception as e:
74
+ logging.error(f"Error extracting text from RSF: {e}")
75
+ return ""
76
+
77
+ # Function to extract text from ODT
78
+ def extract_text_from_odt(file_path):
79
+ try:
80
+ odt_doc = load_odt(file_path)
81
+ text_elements = odt_doc.getElementsByType(P)
82
+ text = "\n".join([te.firstChild.data for te in text_elements if te.firstChild])
83
+ return text
84
+ except Exception as e:
85
+ logging.error(f"Error extracting text from ODT: {e}")
86
+ return ""
87
+
88
+ # Function to extract text from images using Tesseract
89
+ def extract_text_from_image(file_path):
90
+ try:
91
+ img = Image.open(file_path)
92
+ text = pytesseract.image_to_string(img)
93
+
94
+ return text
95
+ except Exception as e:
96
+ logging.error(f"Error extracting text from image: {e}")
97
+ return ""
98
+
99
+ # Function to clean and preprocess the extracted text
100
+ def preprocess_text(text):
101
+ text = re.sub(r'\s+', ' ', text)
102
+ text = re.sub(r'\n', ' ', text)
103
+ text = re.sub(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)', r' \1 ', text)
104
+ return text.strip()
105
+
106
+ # Function to automatically detect file format and extract text
107
+ def extract_text_based_on_format(file_path):
108
+ file_ext = os.path.splitext(file_path)[1].lower()
109
+
110
+ if file_ext == '.pdf':
111
+ text, hyperlinks = extract_text_from_pdf(file_path)
112
+ elif file_ext == '.docx':
113
+ text = extract_text_from_docx(file_path)
114
+ hyperlinks = []
115
+ elif file_ext == '.rsf':
116
+ text = extract_text_from_rsf(file_path)
117
+ hyperlinks = []
118
+ elif file_ext == '.odt':
119
+ text = extract_text_from_odt(file_path)
120
+ hyperlinks = []
121
+ elif file_ext in ['.png', '.jpg', '.jpeg']:
122
+ text = extract_text_from_image(file_path)
123
+ hyperlinks = []
124
+ else:
125
+ raise ValueError("Unsupported file format")
126
+
127
+ return text, hyperlinks