not-lain committed on
Commit c577758
1 Parent(s): f17ebdc

add background removal & restructure code into multiple files

Files changed (4)
  1. app.py +33 -394
  2. background_removal.py +29 -0
  3. base_utils.py +413 -0
  4. requirements.txt +8 -1
app.py CHANGED
@@ -1,391 +1,17 @@
1
- from pptx import Presentation
2
  import gradio as gr
3
- from pdf2image import convert_from_path
4
- import pdfplumber
5
- from docx import Document
6
- import subprocess
7
- import os
8
- from typing import Optional, List
9
- import string
10
- import random
11
- import re
12
- import requests
13
- from bs4 import BeautifulSoup
14
- import logging
15
- import time
16
- from urllib.parse import urlparse
17
18
 
19
- class URLTextExtractor:
20
- """
21
- A comprehensive utility for extracting text content from web pages with advanced features.
22
-
23
- Features:
24
- - Rotating User-Agents to mimic different browsers
25
- - Robust error handling and retry mechanism
26
- - Section preservation for maintaining document structure
27
- - Configurable extraction options
28
- - Logging support
29
-
30
- Attributes:
31
- USER_AGENTS (list): A comprehensive list of user agent strings to rotate through.
32
- logger (logging.Logger): Logger for tracking extraction attempts and errors.
33
-
34
- Example:
35
- >>> extractor = URLTextExtractor()
36
- >>> text = extractor.extract_text_from_url('https://example.com')
37
- >>> print(text)
38
- """
39
-
40
- # Expanded list of user agents including mobile and less common browsers
41
- USER_AGENTS = [
42
- # Desktop Browsers
43
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
44
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15',
45
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0',
46
-
47
- # Mobile Browsers
48
- 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1',
49
- 'Mozilla/5.0 (Linux; Android 10; SM-G970F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Mobile Safari/537.36',
50
- ]
51
-
52
- def __init__(self, logger=None):
53
- """
54
- Initialize the URLTextExtractor.
55
-
56
- Args:
57
- logger (logging.Logger, optional): Custom logger.
58
- If not provided, creates a default logger.
59
- """
60
- self.logger = logger or self._create_default_logger()
61
-
62
- def _create_default_logger(self):
63
- """
64
- Create a default logger for tracking extraction process.
65
-
66
- Returns:
67
- logging.Logger: Configured logger instance
68
- """
69
- logger = logging.getLogger(__name__)
70
- logger.setLevel(logging.INFO)
71
- handler = logging.StreamHandler()
72
- formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
73
- handler.setFormatter(formatter)
74
- logger.addHandler(handler)
75
- return logger
76
-
77
- def _process_element_text(self, element):
78
- """
79
- Process text within an element, handling anchor tags specially.
80
-
81
- Args:
82
- element (bs4.element.Tag): BeautifulSoup element to process
83
-
84
- Returns:
85
- str: Processed text with proper spacing
86
- """
87
- # Replace anchor tags with spaced text
88
- for a_tag in element.find_all('a'):
89
- # Add spaces around the anchor text
90
- a_tag.replace_with(f' {a_tag.get_text(strip=True)} ')
91
-
92
- # Get text with separator
93
- return element.get_text(separator=' ', strip=True)
94
-
95
- def extract_text_from_url(self, url, max_retries=3, preserve_sections=True,
96
- min_section_length=30, allowed_tags=None):
97
- """
98
- Extract text content from a given URL with advanced configuration.
99
-
100
- Args:
101
- url (str): The URL of the webpage to extract text from.
102
- max_retries (int, optional): Maximum number of retry attempts. Defaults to 3.
103
- preserve_sections (bool, optional): Whether to preserve section separations. Defaults to True.
104
- min_section_length (int, optional): Minimum length of text sections to include. Defaults to 30.
105
- allowed_tags (list, optional): Specific HTML tags to extract text from.
106
- If None, uses a default set of content-rich tags.
107
-
108
- Returns:
109
- str: Extracted text content from the webpage
110
-
111
- Raises:
112
- ValueError: If URL cannot be fetched after maximum retries
113
- requests.RequestException: For network-related errors
114
-
115
- Examples:
116
- >>> extractor = URLTextExtractor()
117
- >>> text = extractor.extract_text_from_url('https://example.com')
118
- >>> text = extractor.extract_text_from_url('https://example.com', preserve_sections=False)
119
- """
120
- # Default allowed tags if not specified
121
- if allowed_tags is None:
122
- allowed_tags = ['p', 'div', 'article', 'section', 'main',
123
- 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
124
-
125
- # Validate URL
126
- try:
127
- parsed_url = urlparse(url)
128
- if not all([parsed_url.scheme, parsed_url.netloc]):
129
- # raise ValueError("Invalid URL format")
130
- return None
131
- except Exception as e:
132
- self.logger.error(f"URL parsing error: {e}")
133
- raise
134
-
135
- for attempt in range(max_retries):
136
- try:
137
- # Randomly select a user agent
138
- headers = {
139
- 'User-Agent': random.choice(self.USER_AGENTS),
140
- 'Accept-Language': 'en-US,en;q=0.9',
141
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
142
- }
143
-
144
- # Send a GET request to the URL
145
- response = requests.get(
146
- url,
147
- headers=headers,
148
- timeout=10,
149
- allow_redirects=True
150
- )
151
-
152
- # Raise an exception for bad status codes
153
- response.raise_for_status()
154
-
155
- # Log successful fetch
156
- self.logger.info(f"Successfully fetched URL: {url}")
157
-
158
- # Parse the HTML content
159
- soup = BeautifulSoup(response.text, 'html.parser')
160
-
161
- # Remove unwanted elements
162
- for script in soup(["script", "style", "head", "header", "footer", "nav"]):
163
- script.decompose()
164
-
165
- # Extract text with section preservation
166
- if preserve_sections:
167
- # Extract text from specified tags
168
- sections = []
169
- for tag in allowed_tags:
170
- for element in soup.find_all(tag):
171
- # Process element text, handling anchor tags
172
- section_text = self._process_element_text(element)
173
-
174
- # Only add sections meeting minimum length
175
- if len(section_text) >= min_section_length:
176
- sections.append(section_text)
177
-
178
- # Join sections with newline
179
- text = '\n'.join(sections)
180
- else:
181
- # If not preserving sections, use modified text extraction
182
- text = ' '.join(self._process_element_text(element)
183
- for tag in allowed_tags
184
- for element in soup.find_all(tag))
185
-
186
- # Remove excessive whitespace and empty lines
187
- text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())
188
-
189
- return text
190
-
191
- except (requests.RequestException, ValueError) as e:
192
- # Log error details
193
- self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
194
-
195
- # If it's the last retry, raise the error
196
- if attempt == max_retries - 1:
197
- self.logger.error(f"Failed to fetch URL after {max_retries} attempts")
198
- raise ValueError(f"Error fetching URL after {max_retries} attempts: {e}")
199
-
200
- # Exponential backoff
201
- wait_time = 2 ** attempt
202
- self.logger.info(f"Waiting {wait_time} seconds before retry")
203
- time.sleep(wait_time)
204
-
205
- # Fallback (though this should never be reached due to the raise in the loop)
206
- return None
207
-
208
- def extract_text_from_pptx(file_path):
209
- prs = Presentation(file_path)
210
- text_content = []
211
-
212
- for slide in prs.slides:
213
- slide_text = []
214
- for shape in slide.shapes:
215
- if hasattr(shape, "text"):
216
- slide_text.append(shape.text)
217
- text_content.append("\n".join(slide_text))
218
-
219
- return "\n\n".join(text_content)
220
-
221
-
222
- def extract_text_from_ppt(file_path):
223
- try:
224
- print("file_path = ",file_path)
225
- # Convert PPT to PPTX using unoconv
226
- pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
227
- subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
228
-
229
- # Extract text from PPTX
230
- presentation = Presentation(pptx_file_path)
231
- text_content = []
232
-
233
- for slide in presentation.slides:
234
- slide_text = []
235
- for shape in slide.shapes:
236
- if hasattr(shape, "text"):
237
- slide_text.append(shape.text)
238
- text_content.append("\n".join(slide_text))
239
-
240
- # Remove the converted PPTX file
241
- os.remove(pptx_file_path)
242
-
243
- out = "\n\n".join(text_content)
244
- return out
245
- except Exception as e:
246
- print(f"Error extracting text from PPT file: {e}")
247
- return "Error extracting text from PPT file"
248
-
249
-
250
- # def extract_text_from_ppt_or_pptx(file_path):
251
- # if file_path.endswith(".pptx"):
252
- # return extract_text_from_pptx(file_path)
253
- # elif file_path.endswith(".ppt"):
254
- # return extract_text_from_ppt(file_path)
255
- # else:
256
- # return "Unsupported file type. Please provide a .ppt or .pptx file."
257
-
258
-
259
- def convert_pdf_to_image(file):
260
- images = convert_from_path(file)
261
- return images
262
-
263
-
264
- def extract_text_from_pdf(file):
265
- text = ""
266
- with pdfplumber.open(file) as pdf:
267
- for page in pdf.pages:
268
- text += page.extract_text() + "\n"
269
- return text
270
-
271
-
272
- def extract_text_from_docx(file_path):
273
- text = ""
274
- doc = Document(file_path.name)
275
- for paragraph in doc.paragraphs:
276
- text += paragraph.text + "\n"
277
- return text
278
-
279
-
280
- def convert_doc_to_text(file_path):
281
- try:
282
- subprocess.run(
283
- ["unoconv", "--format", "txt", file_path],
284
- capture_output=True,
285
- text=True,
286
- check=True,
287
- )
288
- txt_file_path = file_path.replace(".doc", ".txt")
289
- with open(txt_file_path, "r") as f:
290
- text = f.read()
291
- text = text.lstrip("\ufeff")
292
- os.remove(txt_file_path)
293
- return text
294
- except subprocess.CalledProcessError as e:
295
- print(f"Error converting {file_path} to text: {e}")
296
- return ""
297
-
298
-
299
-
300
- # function that generates a random string
301
- def generate_random_string(length=23):
302
- characters = string.ascii_letters + string.digits # Includes letters and digits
303
- random_string = "".join(random.choice(characters) for _ in range(length))
304
- return random_string
305
-
306
-
307
- # function that adds the necessary json fields
308
- def handle_json_output(json_list: list):
309
- n = len(json_list)
310
- for i in range(n):
311
- # not last element
312
- random_string1 = generate_random_string()
313
- random_string2 = generate_random_string()
314
- element = json_list[i]
315
- front = element["frontText"]
316
- back = element["backText"]
317
- element["frontHTML"] = (
318
- f'<div id="element-richtextarea-{random_string1}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
319
- f"<p>{front}</p></div>"
320
- )
321
- element["backHTML"] = (
322
- f'<div id="element-richtextarea-{random_string2}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
323
- f"<p>{back}</p></div>"
324
- )
325
- element["termType"] = "basic"
326
- cloze_matches = re.findall(r"_{2,}", front)
327
- # match only the first one, if there is multiple don't do anything
328
- if (cloze_matches != []) & (len(cloze_matches) <= 2):
329
- # It's a cloze type card
330
- element["termType"] = "cloze"
331
-
332
- # inject the back in a span format into the front
333
- def replace_cloze(match):
334
- return f'</p><p><span class="closure">{back}</span></p><p>'
335
-
336
- front_html = re.sub(r"_{2,}", replace_cloze, front)
337
- element["frontHTML"] = (
338
- f'<div id="element-richtextarea-{random_string1}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
339
- f"<p>{front_html}</p></div>"
340
- )
341
-
342
- def replace_underscores(match):
343
- return f" {back} "
344
-
345
- element["frontText"] = re.sub(r"_{2,}", replace_underscores, front)
346
- element["backText"] = ""
347
-
348
- element["backHTML"] = (
349
- f'<div id="element-richtextarea-{random_string2}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
350
- f"<p><br></p></div>"
351
- )
352
-
353
- return json_list
354
-
355
-
356
- def sanitize_list_of_lists(text: str) -> Optional[List[List]]:
357
- left = text.find("[")
358
- right = text.rfind("]")
359
- text = text[left : right + 1]
360
- try:
361
- # Safely evaluate the string to a Python object
362
- list_of_lists = eval(text)
363
- if isinstance(list_of_lists, list): # Ensure it's a list
364
- out = []
365
- try:
366
- # parse list of lists
367
- for front, back in list_of_lists:
368
- out.append({"frontText": front, "backText": back})
369
- return handle_json_output(out)
370
- # errors
371
- except Exception as e:
372
- print(e)
373
- # return anything that was already parsed
374
- if out != []:
375
- return handle_json_output(out)
376
- # original schema is not respected
377
- else:
378
- return None
379
- else:
380
- print("The evaluated object is not a list.")
381
- return None
382
- except Exception as e:
383
- print(f"Error parsing the list of lists: {e}")
384
- return None
385
-
386
- extractor = URLTextExtractor()
387
- def parse_url(url):
388
- return extractor.extract_text_from_url(url)
389
 
  pdf_to_img = gr.Interface(
      convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
@@ -398,16 +24,10 @@ pdf_to_text = gr.Interface(
  )

  doc_to_text = gr.Interface(
-     convert_doc_to_text,
-     gr.File(),
-     gr.Textbox(),
-     api_name="doc_to_text"
  )
  docx_to_text = gr.Interface(
-     extract_text_from_docx,
-     gr.File(),
-     gr.Textbox(),
-     api_name="docx_to_text"
  )

  ppt_to_text = gr.Interface(
@@ -448,8 +68,26 @@ url_parser = gr.Interface(
      outputs=["text"],
      api_name="url_to_text",
  )
  demo = gr.TabbedInterface(
-     [pdf_to_img, pdf_to_text, doc_to_text, docx_to_text , ppt_to_text, pptx_to_text, url_parser, str_to_json],
      [
          "PDF to Image",
          "Extract PDF Text",
@@ -459,6 +97,7 @@ demo = gr.TabbedInterface(
          "Extract PPTX Text",
          "Extract text from URL",
          "Extract Json",
      ],
  )

  1    import gradio as gr
  2
  3  + from base_utils import (
  4  +     convert_pdf_to_image,
  5  +     extract_text_from_pdf,
  6  +     convert_doc_to_text,
  7  +     extract_text_from_docx,
  8  +     extract_text_from_ppt,
  9  +     extract_text_from_pptx,
 10  +     sanitize_list_of_lists,
 11  +     parse_url,
 12  + )
 13
 14  + from background_removal import remove_bg
 15
 16    pdf_to_img = gr.Interface(
 17        convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
 24    )
 25
 26    doc_to_text = gr.Interface(
 27  +     convert_doc_to_text, gr.File(), gr.Textbox(), api_name="doc_to_text"
 28    )
 29    docx_to_text = gr.Interface(
 30  +     extract_text_from_docx, gr.File(), gr.Textbox(), api_name="docx_to_text"
 31    )
 32
 33    ppt_to_text = gr.Interface(
 68        outputs=["text"],
 69        api_name="url_to_text",
 70    )
 71  +
 72  + rmbg = gr.Interface(
 73  +     remove_bg,
 74  +     inputs=["image"],
 75  +     outputs=["image"],
 76  +     api_name="rmbg",
 77  + )
 78  +
 79    demo = gr.TabbedInterface(
 80  +     [
 81  +         pdf_to_img,
 82  +         pdf_to_text,
 83  +         doc_to_text,
 84  +         docx_to_text,
 85  +         ppt_to_text,
 86  +         pptx_to_text,
 87  +         url_parser,
 88  +         str_to_json,
 89  +         rmbg,
 90  +     ],
 91        [
 92            "PDF to Image",
 93            "Extract PDF Text",
 97            "Extract PPTX Text",
 98            "Extract text from URL",
 99            "Extract Json",
100  +         "Remove Background",
101        ],
102    )
103
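The named api_name endpoints above are callable programmatically once the Space is running. Below is a minimal client-side sketch, assuming a recent gradio_client that ships handle_file and using a placeholder Space id (replace it with the real deployment):

from gradio_client import Client, handle_file  # older gradio_client versions used file() instead of handle_file()

client = Client("not-lain/pdf-to-text")  # hypothetical Space id, substitute the actual one

# File-based endpoints take an uploaded file reference
text = client.predict(handle_file("sample.pdf"), api_name="/pdf_to_text")

# The new background-removal endpoint returns the processed image (a file path on the client side)
cutout = client.predict(handle_file("photo.jpg"), api_name="/rmbg")
print(text[:200], cutout)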
background_removal.py ADDED
@@ -0,0 +1,29 @@
+ import spaces
+ from loadimg import load_img
+ import torch
+ from torchvision import transforms
+ # Load BiRefNet with weights
+ from transformers import AutoModelForImageSegmentation
+ birefnet = AutoModelForImageSegmentation.from_pretrained('ZhengPeng7/BiRefNet', trust_remote_code=True)
+
+ @spaces.GPU
+ def remove_bg(imagepath):
+     # Data settings
+     image_size = (1024, 1024)
+     transform_image = transforms.Compose([
+         transforms.Resize(image_size),
+         transforms.ToTensor(),
+         transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+     ])
+
+     image = load_img(imagepath).convert("RGB")
+     input_images = transform_image(image).unsqueeze(0).to('cuda')
+
+     # Prediction
+     with torch.no_grad():
+         preds = birefnet(input_images)[-1].sigmoid().cpu()
+     pred = preds[0].squeeze()
+     pred_pil = transforms.ToPILImage()(pred)
+     mask = pred_pil.resize(image.size)
+     image.putalpha(mask)
+     return image
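As a rough local sketch of the same BiRefNet pipeline without the ZeroGPU decorator: device selection is the only intended change from the committed function, and the input path is a placeholder.

import torch
from PIL import Image
from torchvision import transforms
from transformers import AutoModelForImageSegmentation

device = "cuda" if torch.cuda.is_available() else "cpu"
birefnet = AutoModelForImageSegmentation.from_pretrained(
    "ZhengPeng7/BiRefNet", trust_remote_code=True
).to(device).eval()

transform_image = transforms.Compose([
    transforms.Resize((1024, 1024)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

image = Image.open("photo.jpg").convert("RGB")  # placeholder input
with torch.no_grad():
    # as in the committed code, the last output of the model is used as the mask
    preds = birefnet(transform_image(image).unsqueeze(0).to(device))[-1].sigmoid().cpu()
mask = transforms.ToPILImage()(preds[0].squeeze()).resize(image.size)
image.putalpha(mask)  # predicted foreground probability becomes the alpha channel
image.save("photo_no_bg.png")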
base_utils.py ADDED
@@ -0,0 +1,413 @@
1
+ from pptx import Presentation
2
+ from pdf2image import convert_from_path
3
+ import pdfplumber
4
+ from docx import Document
5
+ import subprocess
6
+ import os
7
+ from typing import Optional, List
8
+ import string
9
+ import random
10
+ import re
11
+ import requests
12
+ from bs4 import BeautifulSoup
13
+ import logging
14
+ import time
15
+ from urllib.parse import urlparse
16
+
17
+
18
+ class URLTextExtractor:
19
+ """
20
+ A comprehensive utility for extracting text content from web pages with advanced features.
21
+
22
+ Features:
23
+ - Rotating User-Agents to mimic different browsers
24
+ - Robust error handling and retry mechanism
25
+ - Section preservation for maintaining document structure
26
+ - Configurable extraction options
27
+ - Logging support
28
+
29
+ Attributes:
30
+ USER_AGENTS (list): A comprehensive list of user agent strings to rotate through.
31
+ logger (logging.Logger): Logger for tracking extraction attempts and errors.
32
+
33
+ Example:
34
+ >>> extractor = URLTextExtractor()
35
+ >>> text = extractor.extract_text_from_url('https://example.com')
36
+ >>> print(text)
37
+ """
38
+
39
+ # Expanded list of user agents including mobile and less common browsers
40
+ USER_AGENTS = [
41
+ # Desktop Browsers
42
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
43
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
44
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0",
45
+ # Mobile Browsers
46
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1",
47
+ "Mozilla/5.0 (Linux; Android 10; SM-G970F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Mobile Safari/537.36",
48
+ ]
49
+
50
+ def __init__(self, logger=None):
51
+ """
52
+ Initialize the URLTextExtractor.
53
+
54
+ Args:
55
+ logger (logging.Logger, optional): Custom logger.
56
+ If not provided, creates a default logger.
57
+ """
58
+ self.logger = logger or self._create_default_logger()
59
+
60
+ def _create_default_logger(self):
61
+ """
62
+ Create a default logger for tracking extraction process.
63
+
64
+ Returns:
65
+ logging.Logger: Configured logger instance
66
+ """
67
+ logger = logging.getLogger(__name__)
68
+ logger.setLevel(logging.INFO)
69
+ handler = logging.StreamHandler()
70
+ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
71
+ handler.setFormatter(formatter)
72
+ logger.addHandler(handler)
73
+ return logger
74
+
75
+ def _process_element_text(self, element):
76
+ """
77
+ Process text within an element, handling anchor tags specially.
78
+
79
+ Args:
80
+ element (bs4.element.Tag): BeautifulSoup element to process
81
+
82
+ Returns:
83
+ str: Processed text with proper spacing
84
+ """
85
+ # Replace anchor tags with spaced text
86
+ for a_tag in element.find_all("a"):
87
+ # Add spaces around the anchor text
88
+ a_tag.replace_with(f" {a_tag.get_text(strip=True)} ")
89
+
90
+ # Get text with separator
91
+ return element.get_text(separator=" ", strip=True)
92
+
93
+ def extract_text_from_url(
94
+ self,
95
+ url,
96
+ max_retries=3,
97
+ preserve_sections=True,
98
+ min_section_length=30,
99
+ allowed_tags=None,
100
+ ):
101
+ """
102
+ Extract text content from a given URL with advanced configuration.
103
+
104
+ Args:
105
+ url (str): The URL of the webpage to extract text from.
106
+ max_retries (int, optional): Maximum number of retry attempts. Defaults to 3.
107
+ preserve_sections (bool, optional): Whether to preserve section separations. Defaults to True.
108
+ min_section_length (int, optional): Minimum length of text sections to include. Defaults to 30.
109
+ allowed_tags (list, optional): Specific HTML tags to extract text from.
110
+ If None, uses a default set of content-rich tags.
111
+
112
+ Returns:
113
+ str: Extracted text content from the webpage
114
+
115
+ Raises:
116
+ ValueError: If URL cannot be fetched after maximum retries
117
+ requests.RequestException: For network-related errors
118
+
119
+ Examples:
120
+ >>> extractor = URLTextExtractor()
121
+ >>> text = extractor.extract_text_from_url('https://example.com')
122
+ >>> text = extractor.extract_text_from_url('https://example.com', preserve_sections=False)
123
+ """
124
+ # Default allowed tags if not specified
125
+ if allowed_tags is None:
126
+ allowed_tags = [
127
+ "p",
128
+ "div",
129
+ "article",
130
+ "section",
131
+ "main",
132
+ "h1",
133
+ "h2",
134
+ "h3",
135
+ "h4",
136
+ "h5",
137
+ "h6",
138
+ ]
139
+
140
+ # Validate URL
141
+ try:
142
+ parsed_url = urlparse(url)
143
+ if not all([parsed_url.scheme, parsed_url.netloc]):
144
+ # raise ValueError("Invalid URL format")
145
+ return None
146
+ except Exception as e:
147
+ self.logger.error(f"URL parsing error: {e}")
148
+ raise
149
+
150
+ for attempt in range(max_retries):
151
+ try:
152
+ # Randomly select a user agent
153
+ headers = {
154
+ "User-Agent": random.choice(self.USER_AGENTS),
155
+ "Accept-Language": "en-US,en;q=0.9",
156
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
157
+ }
158
+
159
+ # Send a GET request to the URL
160
+ response = requests.get(
161
+ url, headers=headers, timeout=10, allow_redirects=True
162
+ )
163
+
164
+ # Raise an exception for bad status codes
165
+ response.raise_for_status()
166
+
167
+ # Log successful fetch
168
+ self.logger.info(f"Successfully fetched URL: {url}")
169
+
170
+ # Parse the HTML content
171
+ soup = BeautifulSoup(response.text, "html.parser")
172
+
173
+ # Remove unwanted elements
174
+ for script in soup(
175
+ ["script", "style", "head", "header", "footer", "nav"]
176
+ ):
177
+ script.decompose()
178
+
179
+ # Extract text with section preservation
180
+ if preserve_sections:
181
+ # Extract text from specified tags
182
+ sections = []
183
+ for tag in allowed_tags:
184
+ for element in soup.find_all(tag):
185
+ # Process element text, handling anchor tags
186
+ section_text = self._process_element_text(element)
187
+
188
+ # Only add sections meeting minimum length
189
+ if len(section_text) >= min_section_length:
190
+ sections.append(section_text)
191
+
192
+ # Join sections with newline
193
+ text = "\n".join(sections)
194
+ else:
195
+ # If not preserving sections, use modified text extraction
196
+ text = " ".join(
197
+ self._process_element_text(element)
198
+ for tag in allowed_tags
199
+ for element in soup.find_all(tag)
200
+ )
201
+
202
+ # Remove excessive whitespace and empty lines
203
+ text = "\n".join(
204
+ line.strip() for line in text.split("\n") if line.strip()
205
+ )
206
+
207
+ return text
208
+
209
+ except (requests.RequestException, ValueError) as e:
210
+ # Log error details
211
+ self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
212
+
213
+ # If it's the last retry, raise the error
214
+ if attempt == max_retries - 1:
215
+ self.logger.error(
216
+ f"Failed to fetch URL after {max_retries} attempts"
217
+ )
218
+ raise ValueError(
219
+ f"Error fetching URL after {max_retries} attempts: {e}"
220
+ )
221
+
222
+ # Exponential backoff
223
+ wait_time = 2**attempt
224
+ self.logger.info(f"Waiting {wait_time} seconds before retry")
225
+ time.sleep(wait_time)
226
+
227
+ # Fallback (though this should never be reached due to the raise in the loop)
228
+ return None
229
+
230
+
231
+ def extract_text_from_pptx(file_path):
232
+ prs = Presentation(file_path)
233
+ text_content = []
234
+
235
+ for slide in prs.slides:
236
+ slide_text = []
237
+ for shape in slide.shapes:
238
+ if hasattr(shape, "text"):
239
+ slide_text.append(shape.text)
240
+ text_content.append("\n".join(slide_text))
241
+
242
+ return "\n\n".join(text_content)
243
+
244
+
245
+ def extract_text_from_ppt(file_path):
246
+ try:
247
+ print("file_path = ", file_path)
248
+ # Convert PPT to PPTX using unoconv
249
+ pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
250
+ subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
251
+
252
+ # Extract text from PPTX
253
+ presentation = Presentation(pptx_file_path)
254
+ text_content = []
255
+
256
+ for slide in presentation.slides:
257
+ slide_text = []
258
+ for shape in slide.shapes:
259
+ if hasattr(shape, "text"):
260
+ slide_text.append(shape.text)
261
+ text_content.append("\n".join(slide_text))
262
+
263
+ # Remove the converted PPTX file
264
+ os.remove(pptx_file_path)
265
+
266
+ out = "\n\n".join(text_content)
267
+ return out
268
+ except Exception as e:
269
+ print(f"Error extracting text from PPT file: {e}")
270
+ return "Error extracting text from PPT file"
271
+
272
+
273
+ # def extract_text_from_ppt_or_pptx(file_path):
274
+ # if file_path.endswith(".pptx"):
275
+ # return extract_text_from_pptx(file_path)
276
+ # elif file_path.endswith(".ppt"):
277
+ # return extract_text_from_ppt(file_path)
278
+ # else:
279
+ # return "Unsupported file type. Please provide a .ppt or .pptx file."
280
+
281
+
282
+ def convert_pdf_to_image(file):
283
+ images = convert_from_path(file)
284
+ return images
285
+
286
+
287
+ def extract_text_from_pdf(file):
288
+ text = ""
289
+ with pdfplumber.open(file) as pdf:
290
+ for page in pdf.pages:
291
+ text += page.extract_text() + "\n"
292
+ return text
293
+
294
+
295
+ def extract_text_from_docx(file_path):
296
+ text = ""
297
+ doc = Document(file_path.name)
298
+ for paragraph in doc.paragraphs:
299
+ text += paragraph.text + "\n"
300
+ return text
301
+
302
+
303
+ def convert_doc_to_text(file_path):
304
+ try:
305
+ subprocess.run(
306
+ ["unoconv", "--format", "txt", file_path],
307
+ capture_output=True,
308
+ text=True,
309
+ check=True,
310
+ )
311
+ txt_file_path = file_path.replace(".doc", ".txt")
312
+ with open(txt_file_path, "r") as f:
313
+ text = f.read()
314
+ text = text.lstrip("\ufeff")
315
+ os.remove(txt_file_path)
316
+ return text
317
+ except subprocess.CalledProcessError as e:
318
+ print(f"Error converting {file_path} to text: {e}")
319
+ return ""
320
+
321
+
322
+ # function that generates a random string
323
+ def generate_random_string(length=23):
324
+ characters = string.ascii_letters + string.digits # Includes letters and digits
325
+ random_string = "".join(random.choice(characters) for _ in range(length))
326
+ return random_string
327
+
328
+
329
+ # function that adds the necessary json fields
330
+ def handle_json_output(json_list: list):
331
+ n = len(json_list)
332
+ for i in range(n):
333
+ # not last element
334
+ random_string1 = generate_random_string()
335
+ random_string2 = generate_random_string()
336
+ element = json_list[i]
337
+ front = element["frontText"]
338
+ back = element["backText"]
339
+ element["frontHTML"] = (
340
+ f'<div id="element-richtextarea-{random_string1}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
341
+ f"<p>{front}</p></div>"
342
+ )
343
+ element["backHTML"] = (
344
+ f'<div id="element-richtextarea-{random_string2}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
345
+ f"<p>{back}</p></div>"
346
+ )
347
+ element["termType"] = "basic"
348
+ cloze_matches = re.findall(r"_{2,}", front)
349
+ # match only the first one, if there is multiple don't do anything
350
+ if (cloze_matches != []) & (len(cloze_matches) <= 2):
351
+ # It's a cloze type card
352
+ element["termType"] = "cloze"
353
+
354
+ # inject the back in a span format into the front
355
+ def replace_cloze(match):
356
+ return f'</p><p><span class="closure">{back}</span></p><p>'
357
+
358
+ front_html = re.sub(r"_{2,}", replace_cloze, front)
359
+ element["frontHTML"] = (
360
+ f'<div id="element-richtextarea-{random_string1}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
361
+ f"<p>{front_html}</p></div>"
362
+ )
363
+
364
+ def replace_underscores(match):
365
+ return f" {back} "
366
+
367
+ element["frontText"] = re.sub(r"_{2,}", replace_underscores, front)
368
+ element["backText"] = ""
369
+
370
+ element["backHTML"] = (
371
+ f'<div id="element-richtextarea-{random_string2}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
372
+ f"<p><br></p></div>"
373
+ )
374
+
375
+ return json_list
376
+
377
+
378
+ def sanitize_list_of_lists(text: str) -> Optional[List[List]]:
379
+ left = text.find("[")
380
+ right = text.rfind("]")
381
+ text = text[left : right + 1]
382
+ try:
383
+ # Safely evaluate the string to a Python object
384
+ list_of_lists = eval(text)
385
+ if isinstance(list_of_lists, list): # Ensure it's a list
386
+ out = []
387
+ try:
388
+ # parse list of lists
389
+ for front, back in list_of_lists:
390
+ out.append({"frontText": front, "backText": back})
391
+ return handle_json_output(out)
392
+ # errors
393
+ except Exception as e:
394
+ print(e)
395
+ # return anything that was already parsed
396
+ if out != []:
397
+ return handle_json_output(out)
398
+ # original schema is not respected
399
+ else:
400
+ return None
401
+ else:
402
+ print("The evaluated object is not a list.")
403
+ return None
404
+ except Exception as e:
405
+ print(f"Error parsing the list of lists: {e}")
406
+ return None
407
+
408
+
409
+ extractor = URLTextExtractor()
410
+
411
+
412
+ def parse_url(url):
413
+ return extractor.extract_text_from_url(url)
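For context, a small sketch of the input shape sanitize_list_of_lists expects and what it returns (values are illustrative only):

from base_utils import sanitize_list_of_lists

# Everything outside the outermost [...] is trimmed before eval(), so prose-wrapped model output is tolerated.
raw = 'Here are the cards: [["Capital of France?", "Paris"], ["H2O is commonly called ____", "water"]]'

cards = sanitize_list_of_lists(raw)
# Each pair becomes a dict with frontText/backText plus generated frontHTML/backHTML,
# and termType is "cloze" when the front contains a ____ blank, otherwise "basic".
print(cards[0]["termType"], cards[1]["termType"])  # basic cloze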
requirements.txt CHANGED
@@ -5,4 +5,11 @@ pdfplumber
  python-docx
  gradio
  python-pptx
- beautifulsoup4
+ numpy<2
+ torch>=2
+ spaces
+ transformers
+ loadimg
+ torchvision
+ pillow
+ scikit-image
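A quick post-install sanity check for the newly pinned stack, assuming a plain pip environment (the assertions only restate the pins above):

import numpy, torch, torchvision, transformers

assert numpy.__version__.split(".")[0] == "1", "requirements.txt pins numpy<2"
assert int(torch.__version__.split(".")[0]) >= 2, "requirements.txt pins torch>=2"
print("numpy", numpy.__version__, "| torch", torch.__version__,
      "| torchvision", torchvision.__version__, "| transformers", transformers.__version__)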