Spaces:

sadickam
/

PDF-text-extra

Running

App Files Files Community

sadickam committed on Oct 16, 2024

Commit

d9a4942

verified ·

1 Parent(s): 9f42776

Update app.py

Browse files

Files changed (1) hide show

app.py +134 -42

app.py CHANGED Viewed

@@ -1,58 +1,150 @@
 import gradio as gr
 import pandas as pd
 from langchain_community.document_loaders import UnstructuredFileLoader
-def extract_text_with_langchain_pdf(pdf_file):
-    """Extract text from a PDF page by page using LangChain's UnstructuredFileLoader."""
-    loader = UnstructuredFileLoader(pdf_file)  # Use the file path directly
-    documents = loader.load()
-    # Initialize an empty list to collect all extracted paragraphs
-    extracted_data = []
-    # Extract content for each page, split into paragraphs, and collect metadata
-    doc_name = pdf_file.split("/")[-1]  # Get the document name
-    for doc in documents:
-        page_num = doc.metadata.get("page_number", "Unknown")  # Get the page number if available
-        paragraphs = doc.page_content.split("\n\n")  # Split content by paragraphs
-        for paragraph in paragraphs:
-            if paragraph.strip():  # Skip empty paragraphs
-                extracted_data.append({
-                    "Document": doc_name,
-                    "Page": page_num,
-                    "Paragraph": paragraph.strip()
-                })
-    # Convert the extracted data to a DataFrame
-    df = pd.DataFrame(extracted_data)
-    return df
-def save_df_to_csv(df, output_filename="extracted_content.csv"):
-    """Save the DataFrame to a CSV file."""
-    df.to_csv(output_filename, index=False)
-    return output_filename
 with gr.Blocks() as demo:
     with gr.Row():
-        gr.Markdown("# PDF Text Extractor with Metadata and CSV Export")
     with gr.Row():
-        pdf_file = gr.File(label="Upload PDF", type="filepath")
     with gr.Row():
-        extract_button = gr.Button("Extract and Download CSV")
     with gr.Row():
-        download_button = gr.File(label="Download Extracted CSV")
-    def on_extract(pdf_file):
-        """Callback function to extract text, store in a DataFrame, and return a downloadable CSV."""
-        df = extract_text_with_langchain_pdf(pdf_file)
-        csv_path = save_df_to_csv(df)
-        return csv_path
-    extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])
 # Launch the Gradio app
 demo.queue().launch()

 import gradio as gr
 import pandas as pd
+import io
 from langchain_community.document_loaders import UnstructuredFileLoader
+def extract_text_with_langchain_pdf(pdf_file_path):
+    """
+    Extract text from a PDF with LangChain's UnstructuredFileLoader and split it into paragraphs.
+    Args:
+        pdf_file_path (str): The file path to the uploaded PDF.
+    Returns:
+        tuple: A DataFrame with "Document", "Page" and "Paragraph" columns (one row per
+        non-empty paragraph), and the text of all loaded documents joined with newlines.
+    Raises:
+        RuntimeError: If loading or processing fails; the original exception is chained as the cause.
+    """
+    import os  # function-scope import keeps this block self-contained
+    try:
+        loader = UnstructuredFileLoader(pdf_file_path)
+        documents = loader.load()
+        extracted_data = []
+        # basename honours the platform's path separators, unlike split("/")
+        doc_name = os.path.basename(pdf_file_path)
+        # Concatenate all page contents into a single string
+        pdf_pages_content = '\n'.join(doc.page_content for doc in documents)
+        # NOTE(review): with the loader's default mode the file may come back as a single
+        # document without "page_number" metadata, so "Page" would always be "Unknown" --
+        # confirm whether a per-page loader mode is intended here.
+        for doc in documents:
+            page_num = doc.metadata.get("page_number", "Unknown")
+            # Paragraphs are taken to be blocks separated by a blank line
+            for paragraph in doc.page_content.split("\n\n"):
+                clean_para = paragraph.strip()
+                if clean_para:
+                    extracted_data.append({
+                        "Document": doc_name,
+                        "Page": page_num,
+                        "Paragraph": clean_para
+                    })
+        # Explicit columns keep the CSV header stable even when no text was extracted
+        df = pd.DataFrame(extracted_data, columns=["Document", "Page", "Paragraph"])
+        return df, pdf_pages_content
+    except Exception as e:
+        raise RuntimeError(f"Error during PDF extraction: {e}") from e
+def df_to_csv_bytes(df):
+    """
+    Convert DataFrame to CSV in bytes.
+    Args:
+        df (pd.DataFrame): The DataFrame to convert.
+    Returns:
+        bytes: UTF-8 encoded CSV data, without the index column.
+    Raises:
+        RuntimeError: If the conversion fails; the original exception is chained as the cause.
+    """
+    try:
+        # With no target given, to_csv returns the CSV text, so no manual buffer is needed
+        return df.to_csv(index=False).encode('utf-8')
+    except Exception as e:
+        raise RuntimeError(f"Error during CSV conversion: {e}") from e
+def text_to_txt_bytes(text):
+    """
+    Convert text to TXT in bytes.
+    Args:
+        text (str): The text to convert.
+    Returns:
+        bytes: The text encoded as UTF-8.
+    Raises:
+        RuntimeError: If encoding fails (e.g. `text` is not a string); the original
+        exception is chained as the cause.
+    """
+    try:
+        return text.encode('utf-8')
+    except Exception as e:
+        raise RuntimeError(f"Error during TXT conversion: {e}") from e
+def on_extract(pdf_file):
+    """
+    Callback function to extract text from a PDF and produce downloadable CSV and TXT files.
+    Args:
+        pdf_file: The uploaded file as handed over by Gradio -- either a path string or an
+            object exposing the path as `.name`; None when nothing was uploaded.
+    Returns:
+        tuple: Four values, one per wired output: CSV file path, TXT file path, and the
+        status message twice. On failure the two file slots are left unchanged via
+        gr.update() and the status carries the error message.
+    """
+    import os
+    import tempfile
+    if pdf_file is None:
+        return gr.update(), gr.update(), "No file uploaded.", "No file uploaded."
+    try:
+        # Accept both a plain path and a file-like object carrying the path in `.name`
+        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
+        # Extract text and create DataFrame
+        df, full_text = extract_text_with_langchain_pdf(pdf_path)
+        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
+        # File outputs need paths on disk, not raw bytes, so write both exports
+        # into a fresh temporary directory
+        out_dir = tempfile.mkdtemp(prefix="pdf_extract_")
+        csv_path = os.path.join(out_dir, f"{base_name}_extracted.csv")
+        with open(csv_path, "wb") as csv_file:
+            csv_file.write(df_to_csv_bytes(df))
+        txt_path = os.path.join(out_dir, f"{base_name}_full_text.txt")
+        with open(txt_path, "wb") as txt_file:
+            txt_file.write(text_to_txt_bytes(full_text))
+        status = f"Extraction complete: {len(df)} paragraph(s) found."
+        # Order matches the click wiring: CSV file, TXT file, status, status
+        return csv_path, txt_path, status, status
+    except Exception as e:
+        return gr.update(), gr.update(), f"Extraction failed: {e}", f"Extraction failed: {e}"
 with gr.Blocks() as demo:
+    gr.Markdown("# 📄 PDF Text Extractor with Metadata and Multiple Exports")
     with gr.Row():
+        # "filepath" hands the callback the path of the uploaded temp file
+        pdf_input = gr.File(
+            label="Upload PDF",
+            file_types=[".pdf"],
+            type="filepath",
+            interactive=True
+        )
     with gr.Row():
+        extract_button = gr.Button("Extract and Download")
     with gr.Row():
+        # Output-only File components act as the download links for the exports
+        csv_download = gr.File(
+            label="Download Extracted CSV",
+            interactive=False
+        )
+        txt_download = gr.File(
+            label="Download Full Text",
+            interactive=False
+        )
     with gr.Row():
+        error_output = gr.Textbox(
+            label="Status",
+            interactive=False,
+            lines=2
+        )
+    # on_extract returns four values; the status message is delivered in the last two slots
+    extract_button.click(
+        fn=on_extract,
+        inputs=pdf_input,
+        outputs=[csv_download, txt_download, error_output, error_output]
+    )
+    gr.Markdown("""
+    ---
+    Developed with Gradio and LangChain.
+    """)
 # Launch the Gradio app
 demo.queue().launch()