not-lain committed on
Commit c577758
1 Parent(s): f17ebdc

add background removal & restructure code into multiple files

Files changed (4)
  1. app.py +33 -394
  2. background_removal.py +29 -0
  3. base_utils.py +413 -0
  4. requirements.txt +8 -1
app.py CHANGED
@@ -1,391 +1,17 @@
1
- from pptx import Presentation
2
  import gradio as gr
3
- from pdf2image import convert_from_path
4
- import pdfplumber
5
- from docx import Document
6
- import subprocess
7
- import os
8
- from typing import Optional, List
9
- import string
10
- import random
11
- import re
12
- import requests
13
- from bs4 import BeautifulSoup
14
- import logging
15
- import time
16
- from urllib.parse import urlparse
17
18
 
19
- class URLTextExtractor:
20
- """
21
- A comprehensive utility for extracting text content from web pages with advanced features.
22
-
23
- Features:
24
- - Rotating User-Agents to mimic different browsers
25
- - Robust error handling and retry mechanism
26
- - Section preservation for maintaining document structure
27
- - Configurable extraction options
28
- - Logging support
29
-
30
- Attributes:
31
- USER_AGENTS (list): A comprehensive list of user agent strings to rotate through.
32
- logger (logging.Logger): Logger for tracking extraction attempts and errors.
33
-
34
- Example:
35
- >>> extractor = URLTextExtractor()
36
- >>> text = extractor.extract_text_from_url('https://example.com')
37
- >>> print(text)
38
- """
39
-
40
- # Expanded list of user agents including mobile and less common browsers
41
- USER_AGENTS = [
42
- # Desktop Browsers
43
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
44
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15',
45
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0',
46
-
47
- # Mobile Browsers
48
- 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1',
49
- 'Mozilla/5.0 (Linux; Android 10; SM-G970F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Mobile Safari/537.36',
50
- ]
51
-
52
- def __init__(self, logger=None):
53
- """
54
- Initialize the URLTextExtractor.
55
-
56
- Args:
57
- logger (logging.Logger, optional): Custom logger.
58
- If not provided, creates a default logger.
59
- """
60
- self.logger = logger or self._create_default_logger()
61
-
62
- def _create_default_logger(self):
63
- """
64
- Create a default logger for tracking extraction process.
65
-
66
- Returns:
67
- logging.Logger: Configured logger instance
68
- """
69
- logger = logging.getLogger(__name__)
70
- logger.setLevel(logging.INFO)
71
- handler = logging.StreamHandler()
72
- formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
73
- handler.setFormatter(formatter)
74
- logger.addHandler(handler)
75
- return logger
76
-
77
- def _process_element_text(self, element):
78
- """
79
- Process text within an element, handling anchor tags specially.
80
-
81
- Args:
82
- element (bs4.element.Tag): BeautifulSoup element to process
83
-
84
- Returns:
85
- str: Processed text with proper spacing
86
- """
87
- # Replace anchor tags with spaced text
88
- for a_tag in element.find_all('a'):
89
- # Add spaces around the anchor text
90
- a_tag.replace_with(f' {a_tag.get_text(strip=True)} ')
91
-
92
- # Get text with separator
93
- return element.get_text(separator=' ', strip=True)
94
-
95
- def extract_text_from_url(self, url, max_retries=3, preserve_sections=True,
96
- min_section_length=30, allowed_tags=None):
97
- """
98
- Extract text content from a given URL with advanced configuration.
99
-
100
- Args:
101
- url (str): The URL of the webpage to extract text from.
102
- max_retries (int, optional): Maximum number of retry attempts. Defaults to 3.
103
- preserve_sections (bool, optional): Whether to preserve section separations. Defaults to True.
104
- min_section_length (int, optional): Minimum length of text sections to include. Defaults to 30.
105
- allowed_tags (list, optional): Specific HTML tags to extract text from.
106
- If None, uses a default set of content-rich tags.
107
-
108
- Returns:
109
- str: Extracted text content from the webpage
110
-
111
- Raises:
112
- ValueError: If URL cannot be fetched after maximum retries
113
- requests.RequestException: For network-related errors
114
-
115
- Examples:
116
- >>> extractor = URLTextExtractor()
117
- >>> text = extractor.extract_text_from_url('https://example.com')
118
- >>> text = extractor.extract_text_from_url('https://example.com', preserve_sections=False)
119
- """
120
- # Default allowed tags if not specified
121
- if allowed_tags is None:
122
- allowed_tags = ['p', 'div', 'article', 'section', 'main',
123
- 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
124
-
125
- # Validate URL
126
- try:
127
- parsed_url = urlparse(url)
128
- if not all([parsed_url.scheme, parsed_url.netloc]):
129
- # raise ValueError("Invalid URL format")
130
- return None
131
- except Exception as e:
132
- self.logger.error(f"URL parsing error: {e}")
133
- raise
134
-
135
- for attempt in range(max_retries):
136
- try:
137
- # Randomly select a user agent
138
- headers = {
139
- 'User-Agent': random.choice(self.USER_AGENTS),
140
- 'Accept-Language': 'en-US,en;q=0.9',
141
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
142
- }
143
-
144
- # Send a GET request to the URL
145
- response = requests.get(
146
- url,
147
- headers=headers,
148
- timeout=10,
149
- allow_redirects=True
150
- )
151
-
152
- # Raise an exception for bad status codes
153
- response.raise_for_status()
154
-
155
- # Log successful fetch
156
- self.logger.info(f"Successfully fetched URL: {url}")
157
-
158
- # Parse the HTML content
159
- soup = BeautifulSoup(response.text, 'html.parser')
160
-
161
- # Remove unwanted elements
162
- for script in soup(["script", "style", "head", "header", "footer", "nav"]):
163
- script.decompose()
164
-
165
- # Extract text with section preservation
166
- if preserve_sections:
167
- # Extract text from specified tags
168
- sections = []
169
- for tag in allowed_tags:
170
- for element in soup.find_all(tag):
171
- # Process element text, handling anchor tags
172
- section_text = self._process_element_text(element)
173
-
174
- # Only add sections meeting minimum length
175
- if len(section_text) >= min_section_length:
176
- sections.append(section_text)
177
-
178
- # Join sections with newline
179
- text = '\n'.join(sections)
180
- else:
181
- # If not preserving sections, use modified text extraction
182
- text = ' '.join(self._process_element_text(element)
183
- for tag in allowed_tags
184
- for element in soup.find_all(tag))
185
-
186
- # Remove excessive whitespace and empty lines
187
- text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())
188
-
189
- return text
190
-
191
- except (requests.RequestException, ValueError) as e:
192
- # Log error details
193
- self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
194
-
195
- # If it's the last retry, raise the error
196
- if attempt == max_retries - 1:
197
- self.logger.error(f"Failed to fetch URL after {max_retries} attempts")
198
- raise ValueError(f"Error fetching URL after {max_retries} attempts: {e}")
199
-
200
- # Exponential backoff
201
- wait_time = 2 ** attempt
202
- self.logger.info(f"Waiting {wait_time} seconds before retry")
203
- time.sleep(wait_time)
204
-
205
- # Fallback (though this should never be reached due to the raise in the loop)
206
- return None
207
-
208
- def extract_text_from_pptx(file_path):
209
- prs = Presentation(file_path)
210
- text_content = []
211
-
212
- for slide in prs.slides:
213
- slide_text = []
214
- for shape in slide.shapes:
215
- if hasattr(shape, "text"):
216
- slide_text.append(shape.text)
217
- text_content.append("\n".join(slide_text))
218
-
219
- return "\n\n".join(text_content)
220
-
221
-
222
- def extract_text_from_ppt(file_path):
223
- try:
224
- print("file_path = ",file_path)
225
- # Convert PPT to PPTX using unoconv
226
- pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
227
- subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
228
-
229
- # Extract text from PPTX
230
- presentation = Presentation(pptx_file_path)
231
- text_content = []
232
-
233
- for slide in presentation.slides:
234
- slide_text = []
235
- for shape in slide.shapes:
236
- if hasattr(shape, "text"):
237
- slide_text.append(shape.text)
238
- text_content.append("\n".join(slide_text))
239
-
240
- # Remove the converted PPTX file
241
- os.remove(pptx_file_path)
242
-
243
- out = "\n\n".join(text_content)
244
- return out
245
- except Exception as e:
246
- print(f"Error extracting text from PPT file: {e}")
247
- return "Error extracting text from PPT file"
248
-
249
-
250
- # def extract_text_from_ppt_or_pptx(file_path):
251
- # if file_path.endswith(".pptx"):
252
- # return extract_text_from_pptx(file_path)
253
- # elif file_path.endswith(".ppt"):
254
- # return extract_text_from_ppt(file_path)
255
- # else:
256
- # return "Unsupported file type. Please provide a .ppt or .pptx file."
257
-
258
-
259
- def convert_pdf_to_image(file):
260
- images = convert_from_path(file)
261
- return images
262
-
263
-
264
- def extract_text_from_pdf(file):
265
- text = ""
266
- with pdfplumber.open(file) as pdf:
267
- for page in pdf.pages:
268
- text += page.extract_text() + "\n"
269
- return text
270
-
271
-
272
- def extract_text_from_docx(file_path):
273
- text = ""
274
- doc = Document(file_path.name)
275
- for paragraph in doc.paragraphs:
276
- text += paragraph.text + "\n"
277
- return text
278
-
279
-
280
- def convert_doc_to_text(file_path):
281
- try:
282
- subprocess.run(
283
- ["unoconv", "--format", "txt", file_path],
284
- capture_output=True,
285
- text=True,
286
- check=True,
287
- )
288
- txt_file_path = file_path.replace(".doc", ".txt")
289
- with open(txt_file_path, "r") as f:
290
- text = f.read()
291
- text = text.lstrip("\ufeff")
292
- os.remove(txt_file_path)
293
- return text
294
- except subprocess.CalledProcessError as e:
295
- print(f"Error converting {file_path} to text: {e}")
296
- return ""
297
-
298
-
299
-
300
- # function that generates a random string
301
- def generate_random_string(length=23):
302
- characters = string.ascii_letters + string.digits # Includes letters and digits
303
- random_string = "".join(random.choice(characters) for _ in range(length))
304
- return random_string
305
-
306
-
307
- # function that adds the necessary json fields
308
- def handle_json_output(json_list: list):
309
- n = len(json_list)
310
- for i in range(n):
311
- # not last element
312
- random_string1 = generate_random_string()
313
- random_string2 = generate_random_string()
314
- element = json_list[i]
315
- front = element["frontText"]
316
- back = element["backText"]
317
- element["frontHTML"] = (
318
- f'<div id="element-richtextarea-{random_string1}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
319
- f"<p>{front}</p></div>"
320
- )
321
- element["backHTML"] = (
322
- f'<div id="element-richtextarea-{random_string2}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
323
- f"<p>{back}</p></div>"
324
- )
325
- element["termType"] = "basic"
326
- cloze_matches = re.findall(r"_{2,}", front)
327
- # match only the first one, if there is multiple don't do anything
328
- if (cloze_matches != []) & (len(cloze_matches) <= 2):
329
- # It's a cloze type card
330
- element["termType"] = "cloze"
331
-
332
- # inject the back in a span format into the front
333
- def replace_cloze(match):
334
- return f'</p><p><span class="closure">{back}</span></p><p>'
335
-
336
- front_html = re.sub(r"_{2,}", replace_cloze, front)
337
- element["frontHTML"] = (
338
- f'<div id="element-richtextarea-{random_string1}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
339
- f"<p>{front_html}</p></div>"
340
- )
341
-
342
- def replace_underscores(match):
343
- return f" {back} "
344
-
345
- element["frontText"] = re.sub(r"_{2,}", replace_underscores, front)
346
- element["backText"] = ""
347
-
348
- element["backHTML"] = (
349
- f'<div id="element-richtextarea-{random_string2}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
350
- f"<p><br></p></div>"
351
- )
352
-
353
- return json_list
354
-
355
-
356
- def sanitize_list_of_lists(text: str) -> Optional[List[List]]:
357
- left = text.find("[")
358
- right = text.rfind("]")
359
- text = text[left : right + 1]
360
- try:
361
- # Safely evaluate the string to a Python object
362
- list_of_lists = eval(text)
363
- if isinstance(list_of_lists, list): # Ensure it's a list
364
- out = []
365
- try:
366
- # parse list of lists
367
- for front, back in list_of_lists:
368
- out.append({"frontText": front, "backText": back})
369
- return handle_json_output(out)
370
- # errors
371
- except Exception as e:
372
- print(e)
373
- # return anything that was already parsed
374
- if out != []:
375
- return handle_json_output(out)
376
- # original schema is not respected
377
- else:
378
- return None
379
- else:
380
- print("The evaluated object is not a list.")
381
- return None
382
- except Exception as e:
383
- print(f"Error parsing the list of lists: {e}")
384
- return None
385
-
386
- extractor = URLTextExtractor()
387
- def parse_url(url):
388
- return extractor.extract_text_from_url(url)
389
 
  pdf_to_img = gr.Interface(
      convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
@@ -398,16 +24,10 @@ pdf_to_text = gr.Interface(
  )

  doc_to_text = gr.Interface(
-     convert_doc_to_text,
-     gr.File(),
-     gr.Textbox(),
-     api_name="doc_to_text"
  )
  docx_to_text = gr.Interface(
-     extract_text_from_docx,
-     gr.File(),
-     gr.Textbox(),
-     api_name="docx_to_text"
  )

  ppt_to_text = gr.Interface(
@@ -448,8 +68,26 @@ url_parser = gr.Interface(
      outputs=["text"],
      api_name="url_to_text",
  )
  demo = gr.TabbedInterface(
-     [pdf_to_img, pdf_to_text, doc_to_text, docx_to_text , ppt_to_text, pptx_to_text, url_parser, str_to_json],
      [
          "PDF to Image",
          "Extract PDF Text",
@@ -459,6 +97,7 @@ demo = gr.TabbedInterface(
          "Extract PPTX Text",
          "Extract text from URL",
          "Extract Json",
      ],
  )

  1    import gradio as gr
  2
  3  + from base_utils import (
  4  +     convert_pdf_to_image,
  5  +     extract_text_from_pdf,
  6  +     convert_doc_to_text,
  7  +     extract_text_from_docx,
  8  +     extract_text_from_ppt,
  9  +     extract_text_from_pptx,
 10  +     sanitize_list_of_lists,
 11  +     parse_url,
 12  + )
 13
 14  + from background_removal import remove_bg
 15
 16    pdf_to_img = gr.Interface(
 17        convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
 24    )
 25
 26    doc_to_text = gr.Interface(
 27  +     convert_doc_to_text, gr.File(), gr.Textbox(), api_name="doc_to_text"
 28    )
 29    docx_to_text = gr.Interface(
 30  +     extract_text_from_docx, gr.File(), gr.Textbox(), api_name="docx_to_text"
 31    )
 32
 33    ppt_to_text = gr.Interface(
 68        outputs=["text"],
 69        api_name="url_to_text",
 70    )
 71  +
 72  + rmbg = gr.Interface(
 73  +     remove_bg,
 74  +     inputs=["image"],
 75  +     outputs=["image"],
 76  +     api_name="rmbg",
 77  + )
 78  +
 79    demo = gr.TabbedInterface(
 80  +     [
 81  +         pdf_to_img,
 82  +         pdf_to_text,
 83  +         doc_to_text,
 84  +         docx_to_text,
 85  +         ppt_to_text,
 86  +         pptx_to_text,
 87  +         url_parser,
 88  +         str_to_json,
 89  +         rmbg,
 90  +     ],
 91        [
 92            "PDF to Image",
 93            "Extract PDF Text",
 97            "Extract PPTX Text",
 98            "Extract text from URL",
 99            "Extract Json",
100  +         "Remove Background",
101        ],
102    )
103
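The named api_name endpoints above are callable programmatically once the Space is running. Below is a minimal client-side sketch, assuming a recent gradio_client that ships handle_file and using a placeholder Space id (replace it with the real deployment):

from gradio_client import Client, handle_file  # older gradio_client versions used file() instead of handle_file()

client = Client("not-lain/pdf-to-text")  # hypothetical Space id, substitute the actual one

# File-based endpoints take an uploaded file reference
text = client.predict(handle_file("sample.pdf"), api_name="/pdf_to_text")

# The new background-removal endpoint returns the processed image (a file path on the client side)
cutout = client.predict(handle_file("photo.jpg"), api_name="/rmbg")
print(text[:200], cutout)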
background_removal.py ADDED
@@ -0,0 +1,29 @@
+ import spaces
+ from loadimg import load_img
+ import torch
+ from torchvision import transforms
+ # Load BiRefNet with weights
+ from transformers import AutoModelForImageSegmentation
+ birefnet = AutoModelForImageSegmentation.from_pretrained('ZhengPeng7/BiRefNet', trust_remote_code=True)
+
+ @spaces.GPU
+ def remove_bg(imagepath):
+     # Data settings
+     image_size = (1024, 1024)
+     transform_image = transforms.Compose([
+         transforms.Resize(image_size),
+         transforms.ToTensor(),
+         transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+     ])
+
+     image = load_img(imagepath).convert("RGB")
+     input_images = transform_image(image).unsqueeze(0).to('cuda')
+
+     # Prediction
+     with torch.no_grad():
+         preds = birefnet(input_images)[-1].sigmoid().cpu()
+     pred = preds[0].squeeze()
+     pred_pil = transforms.ToPILImage()(pred)
+     mask = pred_pil.resize(image.size)
+     image.putalpha(mask)
+     return image
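As a rough local sketch of the same BiRefNet pipeline without the ZeroGPU decorator: device selection is the only intended change from the committed function, and the input path is a placeholder.

import torch
from PIL import Image
from torchvision import transforms
from transformers import AutoModelForImageSegmentation

device = "cuda" if torch.cuda.is_available() else "cpu"
birefnet = AutoModelForImageSegmentation.from_pretrained(
    "ZhengPeng7/BiRefNet", trust_remote_code=True
).to(device).eval()

transform_image = transforms.Compose([
    transforms.Resize((1024, 1024)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

image = Image.open("photo.jpg").convert("RGB")  # placeholder input
with torch.no_grad():
    # as in the committed code, the last output of the model is used as the mask
    preds = birefnet(transform_image(image).unsqueeze(0).to(device))[-1].sigmoid().cpu()
mask = transforms.ToPILImage()(preds[0].squeeze()).resize(image.size)
image.putalpha(mask)  # predicted foreground probability becomes the alpha channel
image.save("photo_no_bg.png")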
base_utils.py ADDED
@@ -0,0 +1,413 @@
1
+ from pptx import Presentation
2
+ from pdf2image import convert_from_path
3
+ import pdfplumber
4
+ from docx import Document
5
+ import subprocess
6
+ import os
7
+ from typing import Optional, List
8
+ import string
9
+ import random
10
+ import re
11
+ import requests
12
+ from bs4 import BeautifulSoup
13
+ import logging
14
+ import time
15
+ from urllib.parse import urlparse
16
+
17
+
18
+ class URLTextExtractor:
19
+ """
20
+ A comprehensive utility for extracting text content from web pages with advanced features.
21
+
22
+ Features:
23
+ - Rotating User-Agents to mimic different browsers
24
+ - Robust error handling and retry mechanism
25
+ - Section preservation for maintaining document structure
26
+ - Configurable extraction options
27
+ - Logging support
28
+
29
+ Attributes:
30
+ USER_AGENTS (list): A comprehensive list of user agent strings to rotate through.
31
+ logger (logging.Logger): Logger for tracking extraction attempts and errors.
32
+
33
+ Example:
34
+ >>> extractor = URLTextExtractor()
35
+ >>> text = extractor.extract_text_from_url('https://example.com')
36
+ >>> print(text)
37
+ """
38
+
39
+ # Expanded list of user agents including mobile and less common browsers
40
+ USER_AGENTS = [
41
+ # Desktop Browsers
42
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
43
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
44
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0",
45
+ # Mobile Browsers
46
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1",
47
+ "Mozilla/5.0 (Linux; Android 10; SM-G970F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Mobile Safari/537.36",
48
+ ]
49
+
50
+ def __init__(self, logger=None):
51
+ """
52
+ Initialize the URLTextExtractor.
53
+
54
+ Args:
55
+ logger (logging.Logger, optional): Custom logger.
56
+ If not provided, creates a default logger.
57
+ """
58
+ self.logger = logger or self._create_default_logger()
59
+
60
+ def _create_default_logger(self):
61
+ """
62
+ Create a default logger for tracking extraction process.
63
+
64
+ Returns:
65
+ logging.Logger: Configured logger instance
66
+ """
67
+ logger = logging.getLogger(__name__)
68
+ logger.setLevel(logging.INFO)
69
+ handler = logging.StreamHandler()
70
+ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
71
+ handler.setFormatter(formatter)
72
+ logger.addHandler(handler)
73
+ return logger
74
+
75
+ def _process_element_text(self, element):
76
+ """
77
+ Process text within an element, handling anchor tags specially.
78
+
79
+ Args:
80
+ element (bs4.element.Tag): BeautifulSoup element to process
81
+
82
+ Returns:
83
+ str: Processed text with proper spacing
84
+ """
85
+ # Replace anchor tags with spaced text
86
+ for a_tag in element.find_all("a"):
87
+ # Add spaces around the anchor text
88
+ a_tag.replace_with(f" {a_tag.get_text(strip=True)} ")
89
+
90
+ # Get text with separator
91
+ return element.get_text(separator=" ", strip=True)
92
+
93
+ def extract_text_from_url(
94
+ self,
95
+ url,
96
+ max_retries=3,
97
+ preserve_sections=True,
98
+ min_section_length=30,
99
+ allowed_tags=None,
100
+ ):
101
+ """
102
+ Extract text content from a given URL with advanced configuration.
103
+
104
+ Args:
105
+ url (str): The URL of the webpage to extract text from.
106
+ max_retries (int, optional): Maximum number of retry attempts. Defaults to 3.
107
+ preserve_sections (bool, optional): Whether to preserve section separations. Defaults to True.
108
+ min_section_length (int, optional): Minimum length of text sections to include. Defaults to 30.
109
+ allowed_tags (list, optional): Specific HTML tags to extract text from.
110
+ If None, uses a default set of content-rich tags.
111
+
112
+ Returns:
113
+ str: Extracted text content from the webpage
114
+
115
+ Raises:
116
+ ValueError: If URL cannot be fetched after maximum retries
117
+ requests.RequestException: For network-related errors
118
+
119
+ Examples:
120
+ >>> extractor = URLTextExtractor()
121
+ >>> text = extractor.extract_text_from_url('https://example.com')
122
+ >>> text = extractor.extract_text_from_url('https://example.com', preserve_sections=False)
123
+ """
124
+ # Default allowed tags if not specified
125
+ if allowed_tags is None:
126
+ allowed_tags = [
127
+ "p",
128
+ "div",
129
+ "article",
130
+ "section",
131
+ "main",
132
+ "h1",
133
+ "h2",
134
+ "h3",
135
+ "h4",
136
+ "h5",
137
+ "h6",
138
+ ]
139
+
140
+ # Validate URL
141
+ try:
142
+ parsed_url = urlparse(url)
143
+ if not all([parsed_url.scheme, parsed_url.netloc]):
144
+ # raise ValueError("Invalid URL format")
145
+ return None
146
+ except Exception as e:
147
+ self.logger.error(f"URL parsing error: {e}")
148
+ raise
149
+
150
+ for attempt in range(max_retries):
151
+ try:
152
+ # Randomly select a user agent
153
+ headers = {
154
+ "User-Agent": random.choice(self.USER_AGENTS),
155
+ "Accept-Language": "en-US,en;q=0.9",
156
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
157
+ }
158
+
159
+ # Send a GET request to the URL
160
+ response = requests.get(
161
+ url, headers=headers, timeout=10, allow_redirects=True
162
+ )
163
+
164
+ # Raise an exception for bad status codes
165
+ response.raise_for_status()
166
+
167
+ # Log successful fetch
168
+ self.logger.info(f"Successfully fetched URL: {url}")
169
+
170
+ # Parse the HTML content
171
+ soup = BeautifulSoup(response.text, "html.parser")
172
+
173
+ # Remove unwanted elements
174
+ for script in soup(
175
+ ["script", "style", "head", "header", "footer", "nav"]
176
+ ):
177
+ script.decompose()
178
+
179
+ # Extract text with section preservation
180
+ if preserve_sections:
181
+ # Extract text from specified tags
182
+ sections = []
183
+ for tag in allowed_tags:
184
+ for element in soup.find_all(tag):
185
+ # Process element text, handling anchor tags
186
+ section_text = self._process_element_text(element)
187
+
188
+ # Only add sections meeting minimum length
189
+ if len(section_text) >= min_section_length:
190
+ sections.append(section_text)
191
+
192
+ # Join sections with newline
193
+ text = "\n".join(sections)
194
+ else:
195
+ # If not preserving sections, use modified text extraction
196
+ text = " ".join(
197
+ self._process_element_text(element)
198
+ for tag in allowed_tags
199
+ for element in soup.find_all(tag)
200
+ )
201
+
202
+ # Remove excessive whitespace and empty lines
203
+ text = "\n".join(
204
+ line.strip() for line in text.split("\n") if line.strip()
205
+ )
206
+
207
+ return text
208
+
209
+ except (requests.RequestException, ValueError) as e:
210
+ # Log error details
211
+ self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
212
+
213
+ # If it's the last retry, raise the error
214
+ if attempt == max_retries - 1:
215
+ self.logger.error(
216
+ f"Failed to fetch URL after {max_retries} attempts"
217
+ )
218
+ raise ValueError(
219
+ f"Error fetching URL after {max_retries} attempts: {e}"
220
+ )
221
+
222
+ # Exponential backoff
223
+ wait_time = 2**attempt
224
+ self.logger.info(f"Waiting {wait_time} seconds before retry")
225
+ time.sleep(wait_time)
226
+
227
+ # Fallback (though this should never be reached due to the raise in the loop)
228
+ return None
229
+
230
+
231
+ def extract_text_from_pptx(file_path):
232
+ prs = Presentation(file_path)
233
+ text_content = []
234
+
235
+ for slide in prs.slides:
236
+ slide_text = []
237
+ for shape in slide.shapes:
238
+ if hasattr(shape, "text"):
239
+ slide_text.append(shape.text)
240
+ text_content.append("\n".join(slide_text))
241
+
242
+ return "\n\n".join(text_content)
243
+
244
+
245
+ def extract_text_from_ppt(file_path):
246
+ try:
247
+ print("file_path = ", file_path)
248
+ # Convert PPT to PPTX using unoconv
249
+ pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
250
+ subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
251
+
252
+ # Extract text from PPTX
253
+ presentation = Presentation(pptx_file_path)
254
+ text_content = []
255
+
256
+ for slide in presentation.slides:
257
+ slide_text = []
258
+ for shape in slide.shapes:
259
+ if hasattr(shape, "text"):
260
+ slide_text.append(shape.text)
261
+ text_content.append("\n".join(slide_text))
262
+
263
+ # Remove the converted PPTX file
264
+ os.remove(pptx_file_path)
265
+
266
+ out = "\n\n".join(text_content)
267
+ return out
268
+ except Exception as e:
269
+ print(f"Error extracting text from PPT file: {e}")
270
+ return "Error extracting text from PPT file"
271
+
272
+
273
+ # def extract_text_from_ppt_or_pptx(file_path):
274
+ # if file_path.endswith(".pptx"):
275
+ # return extract_text_from_pptx(file_path)
276
+ # elif file_path.endswith(".ppt"):
277
+ # return extract_text_from_ppt(file_path)
278
+ # else:
279
+ # return "Unsupported file type. Please provide a .ppt or .pptx file."
280
+
281
+
282
+ def convert_pdf_to_image(file):
283
+ images = convert_from_path(file)
284
+ return images
285
+
286
+
287
+ def extract_text_from_pdf(file):
288
+ text = ""
289
+ with pdfplumber.open(file) as pdf:
290
+ for page in pdf.pages:
291
+ text += page.extract_text() + "\n"
292
+ return text
293
+
294
+
295
+ def extract_text_from_docx(file_path):
296
+ text = ""
297
+ doc = Document(file_path.name)
298
+ for paragraph in doc.paragraphs:
299
+ text += paragraph.text + "\n"
300
+ return text
301
+
302
+
303
+ def convert_doc_to_text(file_path):
304
+ try:
305
+ subprocess.run(
306
+ ["unoconv", "--format", "txt", file_path],
307
+ capture_output=True,
308
+ text=True,
309
+ check=True,
310
+ )
311
+ txt_file_path = file_path.replace(".doc", ".txt")
312
+ with open(txt_file_path, "r") as f:
313
+ text = f.read()
314
+ text = text.lstrip("\ufeff")
315
+ os.remove(txt_file_path)
316
+ return text
317
+ except subprocess.CalledProcessError as e:
318
+ print(f"Error converting {file_path} to text: {e}")
319
+ return ""
320
+
321
+
322
+ # function that generates a random string
323
+ def generate_random_string(length=23):
324
+ characters = string.ascii_letters + string.digits # Includes letters and digits
325
+ random_string = "".join(random.choice(characters) for _ in range(length))
326
+ return random_string
327
+
328
+
329
+ # function that adds the necessary json fields
330
+ def handle_json_output(json_list: list):
331
+ n = len(json_list)
332
+ for i in range(n):
333
+ # not last element
334
+ random_string1 = generate_random_string()
335
+ random_string2 = generate_random_string()
336
+ element = json_list[i]
337
+ front = element["frontText"]
338
+ back = element["backText"]
339
+ element["frontHTML"] = (
340
+ f'<div id="element-richtextarea-{random_string1}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
341
+ f"<p>{front}</p></div>"
342
+ )
343
+ element["backHTML"] = (
344
+ f'<div id="element-richtextarea-{random_string2}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
345
+ f"<p>{back}</p></div>"
346
+ )
347
+ element["termType"] = "basic"
348
+ cloze_matches = re.findall(r"_{2,}", front)
349
+ # match only the first one, if there is multiple don't do anything
350
+ if (cloze_matches != []) & (len(cloze_matches) <= 2):
351
+ # It's a cloze type card
352
+ element["termType"] = "cloze"
353
+
354
+ # inject the back in a span format into the front
355
+ def replace_cloze(match):
356
+ return f'</p><p><span class="closure">{back}</span></p><p>'
357
+
358
+ front_html = re.sub(r"_{2,}", replace_cloze, front)
359
+ element["frontHTML"] = (
360
+ f'<div id="element-richtextarea-{random_string1}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
361
+ f"<p>{front_html}</p></div>"
362
+ )
363
+
364
+ def replace_underscores(match):
365
+ return f" {back} "
366
+
367
+ element["frontText"] = re.sub(r"_{2,}", replace_underscores, front)
368
+ element["backText"] = ""
369
+
370
+ element["backHTML"] = (
371
+ f'<div id="element-richtextarea-{random_string2}" style="position:absolute;left:100px;top:50px;width:800px;height:300px;text-align:center;display:flex;align-items:center;font-size:40px;">'
372
+ f"<p><br></p></div>"
373
+ )
374
+
375
+ return json_list
376
+
377
+
378
+ def sanitize_list_of_lists(text: str) -> Optional[List[List]]:
379
+ left = text.find("[")
380
+ right = text.rfind("]")
381
+ text = text[left : right + 1]
382
+ try:
383
+ # Safely evaluate the string to a Python object
384
+ list_of_lists = eval(text)
385
+ if isinstance(list_of_lists, list): # Ensure it's a list
386
+ out = []
387
+ try:
388
+ # parse list of lists
389
+ for front, back in list_of_lists:
390
+ out.append({"frontText": front, "backText": back})
391
+ return handle_json_output(out)
392
+ # errors
393
+ except Exception as e:
394
+ print(e)
395
+ # return anything that was already parsed
396
+ if out != []:
397
+ return handle_json_output(out)
398
+ # original schema is not respected
399
+ else:
400
+ return None
401
+ else:
402
+ print("The evaluated object is not a list.")
403
+ return None
404
+ except Exception as e:
405
+ print(f"Error parsing the list of lists: {e}")
406
+ return None
407
+
408
+
409
+ extractor = URLTextExtractor()
410
+
411
+
412
+ def parse_url(url):
413
+ return extractor.extract_text_from_url(url)
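For context, a small sketch of the input shape sanitize_list_of_lists expects and what it returns (values are illustrative only):

from base_utils import sanitize_list_of_lists

# Everything outside the outermost [...] is trimmed before eval(), so prose-wrapped model output is tolerated.
raw = 'Here are the cards: [["Capital of France?", "Paris"], ["H2O is commonly called ____", "water"]]'

cards = sanitize_list_of_lists(raw)
# Each pair becomes a dict with frontText/backText plus generated frontHTML/backHTML,
# and termType is "cloze" when the front contains a ____ blank, otherwise "basic".
print(cards[0]["termType"], cards[1]["termType"])  # basic cloze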
requirements.txt CHANGED
@@ -5,4 +5,11 @@ pdfplumber
  python-docx
  gradio
  python-pptx
- beautifulsoup4
+ numpy<2
+ torch>=2
+ spaces
+ transformers
+ loadimg
+ torchvision
+ pillow
+ scikit-image
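A quick post-install sanity check for the newly pinned stack, assuming a plain pip environment (the assertions only restate the pins above):

import numpy, torch, torchvision, transformers

assert numpy.__version__.split(".")[0] == "1", "requirements.txt pins numpy<2"
assert int(torch.__version__.split(".")[0]) >= 2, "requirements.txt pins torch>=2"
print("numpy", numpy.__version__, "| torch", torch.__version__,
      "| torchvision", torchvision.__version__, "| transformers", transformers.__version__)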