import os
import subprocess

import gradio as gr
import pdfplumber
from docx import Document
from pdf2image import convert_from_path
from pptx import Presentation


def _resolve_path(file):
    """Return a filesystem path for *file* when one can be determined.

    Accepts a plain path (``str`` / ``os.PathLike``) or an upload object
    exposing a string ``name`` attribute (e.g. a temporary-file wrapper).
    Anything else is returned unchanged so libraries that accept file-like
    objects can still consume it directly.
    """
    # Check path types first: ``pathlib.Path`` also has a ``.name`` attribute,
    # but it holds only the final component, not the full path.
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    name = getattr(file, "name", None)
    return name if isinstance(name, str) else file


def extract_text_from_pptx(file_path):
    """Extract the text of every slide in a PPTX presentation.

    Args:
        file_path: Path to the presentation, or an upload object with a
            ``name`` attribute pointing at it.

    Returns:
        The text of all shapes exposing a ``text`` attribute, joined by
        newlines within a slide and by blank lines between slides.
    """
    prs = Presentation(_resolve_path(file_path))
    slides_text = [
        "\n".join(shape.text for shape in slide.shapes if hasattr(shape, "text"))
        for slide in prs.slides
    ]
    return "\n\n".join(slides_text)


def convert_pdf_to_image(file):
    """Render a PDF into images via ``pdf2image.convert_from_path``.

    Args:
        file: Path to the PDF, or an upload object with a ``name`` attribute.

    Returns:
        Whatever ``convert_from_path`` returns for the resolved path.
    """
    return convert_from_path(_resolve_path(file))


def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Path to the PDF, or an upload object with a ``name`` attribute.

    Returns:
        The text of each page, each followed by a newline. Pages for which
        no text is extracted contribute an empty line.
    """
    with pdfplumber.open(_resolve_path(file)) as pdf:
        # ``extract_text()`` may yield ``None`` for a page without text;
        # fall back to "" so the concatenation cannot raise ``TypeError``.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)


def extract_text_from_docx(file):
    """Extract the paragraph text of a DOCX document.

    Args:
        file: Path to the document, or an upload object with a ``name``
            attribute.

    Returns:
        Every paragraph's text, each followed by a newline.
    """
    doc = Document(_resolve_path(file))
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)


def convert_doc_to_text(doc_path):
    """Convert a legacy ``.doc`` file to text using the ``unoconv`` CLI.

    ``unoconv`` is invoked with ``--format txt``; the resulting ``.txt``
    file is expected next to the input, is read, and is then deleted.

    Args:
        doc_path: Path to the ``.doc`` file.

    Returns:
        The converted text with any leading BOM removed, or ``""`` when
        ``unoconv`` exits with a non-zero status.
    """
    # Swap only the final extension. ``str.replace`` would also rewrite any
    # ".doc" appearing earlier in the path and would miss ".DOC".
    txt_file_path = os.path.splitext(doc_path)[0] + ".txt"
    try:
        subprocess.run(
            ["unoconv", "--format", "txt", doc_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error converting {doc_path} to text: {e}")
        return ""

    try:
        with open(txt_file_path, "r", encoding="utf-8") as f:
            text = f.read()
    finally:
        # Clean up the intermediate file even when reading it fails.
        if os.path.exists(txt_file_path):
            os.remove(txt_file_path)
    return text.lstrip("\ufeff")


def extract_text_from_doc_or_docx(file):
    """Extract text from a ``.doc`` or ``.docx`` upload.

    Args:
        file: Path to the document, or an upload object with a ``name``
            attribute.

    Returns:
        The extracted text, or an explanatory message when the file
        extension is neither ``.doc`` nor ``.docx`` (case-insensitive).
    """
    path = _resolve_path(file)
    suffix = path.lower() if isinstance(path, str) else ""
    if suffix.endswith(".docx"):
        return extract_text_from_docx(path)
    if suffix.endswith(".doc"):
        return convert_doc_to_text(path)
    return "Unsupported file type. Please upload a .doc or .docx file."

# One Gradio interface per conversion; ``api_name`` sets the API endpoint name.
pdf_to_img = gr.Interface(
    convert_pdf_to_image,
    gr.File(),
    gr.Gallery(),
    api_name="pdf_to_img",
)

pdf_to_text = gr.Interface(
    extract_text_from_pdf,
    gr.File(),
    gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

doc_or_docx_to_text = gr.Interface(
    extract_text_from_doc_or_docx,
    gr.File(),
    gr.Textbox(placeholder="Extracted text from DOC or DOCX will appear here"),
    api_name="doc_or_docx_to_text",
)

pptx_to_text = gr.Interface(
    extract_text_from_pptx,
    gr.File(),
    gr.Textbox(placeholder="Extracted text from PPTX will appear here"),
    api_name="pptx_to_text",
)

# Group the four interfaces into tabs; titles are matched by position.
demo = gr.TabbedInterface(
    [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_to_text],
    ["PDF to Image", "Extract PDF Text", "Extract DOC/DOCX Text", "Extract PPTX Text"],
)

# Bind to all interfaces. The address must be "0.0.0.0" with no trailing dot;
# "0.0.0.0." is not a valid bind address.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)