import gradio as gr
import pandas as pd
import io
import tempfile
import os
from langchain_community.document_loaders import PyPDFLoader
import nltk
from nltk.tokenize import sent_tokenize
# Download the 'punkt_tab' tokenizer data required by sent_tokenize in recent
# NLTK releases; this is a no-op if the data is already present
nltk.download('punkt_tab')

# Create a temporary directory for storing the generated CSV files
temp_dir = tempfile.TemporaryDirectory()


def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): The starting page number for extraction (1-based index).
        end_page (int, optional): The ending page number for extraction (1-based index).

    Returns:
        tuple:
            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.
    """
    try:
        # Initialize the loader; load() returns one Document per PDF page.
        # (load_and_split() would re-chunk pages with a text splitter and
        # break the one-document-per-page assumption used below.)
        loader = PyPDFLoader(pdf_file_path)
        documents = loader.load()
        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # Extract document name

        # Validate and adjust page range
        if start_page is not None and end_page is not None:
            # Convert to integers to avoid slicing issues
            start_page = int(start_page)
            end_page = int(end_page)

            # Adjust to valid range
            if start_page < 1:
                start_page = 1
            if end_page > total_pages:
                end_page = total_pages
            if start_page > end_page:
                start_page, end_page = end_page, start_page  # Swap if out of order

            # Select the subset of documents based on user input
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages
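
        # Worked trace of the clamping above, assuming total_pages == 10,
        # start_page == 12, end_page == 3: the swap yields 3..12, and the
        # slice documents[2:12] silently stops at page 10, so pages 3-10
        # are extracted and enumerate(..., start=3) numbers them correctly.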

        # Initialize lists to store data
        page_data = []
        sentence_data = []

        for idx, doc in enumerate(selected_docs, start=start_page):
            page_num = idx
            text = doc.page_content.strip()

            # Append page-wise data
            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": text
            })

            # Sentence tokenization
            sentences = sent_tokenize(text)
            for sentence in sentences:
                sentence = sentence.strip()
                if sentence:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        # Create DataFrames
        page_df = pd.DataFrame(page_data)
        sentence_df = pd.DataFrame(sentence_data)

        return page_df, sentence_df
    except Exception as e:
        raise RuntimeError(f"Error during PDF extraction: {e}")
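
# A minimal usage sketch for the extractor above (the file name is illustrative):
#
#   page_df, sentence_df = extract_text_with_py_pdf_loader("report.pdf", start_page=1, end_page=3)
#   list(page_df.columns)      # -> ['Document', 'Page', 'Text']
#   list(sentence_df.columns)  # -> ['Document', 'Page', 'Sentence']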


def df_to_csv_bytes(df):
    """
    Convert a DataFrame to CSV data as bytes.

    Args:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        bytes: CSV data encoded as UTF-8.
    """
    try:
        buffer = io.StringIO()
        df.to_csv(buffer, index=False)
        csv_data = buffer.getvalue().encode('utf-8')
        buffer.close()
        return csv_data
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}")
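
# Quick sanity check for the helper above (a minimal sketch):
#
#   df_to_csv_bytes(pd.DataFrame({"A": [1]}))  # -> b'A\n1\n'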


def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
    """
    Callback to extract text from the PDF and return paths to the generated CSV files.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        extraction_mode (str): "All Pages" or "Range of Pages".
        start_page (int): Starting page number for extraction (gr.Number with precision=0 yields an int).
        end_page (int): Ending page number for extraction.

    Returns:
        tuple:
            - page_csv_path (str): Path to the page-wise CSV file.
            - sentence_csv_path (str): Path to the sentence-wise CSV file.
            - status_message (str): Status of the extraction process.
    """
    if not pdf_file_path:
        return None, None, "No file uploaded."

    try:
        # Determine page range based on extraction_mode
        if extraction_mode == "All Pages":
            selected_start = None
            selected_end = None
        else:
            selected_start = start_page
            selected_end = end_page

        # Extract text and create DataFrames
        page_df, sentence_df = extract_text_with_py_pdf_loader(
            pdf_file_path,
            start_page=selected_start,
            end_page=selected_end
        )

        # Convert DataFrames to CSV bytes
        page_csv_bytes = df_to_csv_bytes(page_df)
        sentence_csv_bytes = df_to_csv_bytes(sentence_df)

        # Define CSV filenames
        base_name = os.path.splitext(os.path.basename(pdf_file_path))[0]
        page_csv_filename = f"{base_name}_pages.csv"
        sentence_csv_filename = f"{base_name}_sentences.csv"

        # Define full paths within the temporary directory
        page_csv_path = os.path.join(temp_dir.name, page_csv_filename)
        sentence_csv_path = os.path.join(temp_dir.name, sentence_csv_filename)

        # Write CSV bytes to temporary files
        with open(page_csv_path, 'wb') as page_csv_file:
            page_csv_file.write(page_csv_bytes)
        with open(sentence_csv_path, 'wb') as sentence_csv_file:
            sentence_csv_file.write(sentence_csv_bytes)

        # Return the paths to the temporary CSV files and a success message
        return page_csv_path, sentence_csv_path, "Extraction successful!"
    except Exception as e:
        return None, None, f"Extraction failed: {e}"
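
# A sketch of the callback contract (the input path is illustrative):
#
#   on_extract("/tmp/sample.pdf", "All Pages", 1, 1)
#   # -> ("<temp_dir>/sample_pages.csv", "<temp_dir>/sample_sentences.csv", "Extraction successful!")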


with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Text Extractor with Multiple Exports")

    with gr.Row():
        pdf_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
            type="filepath",  # Pass the uploaded file's path, not its bytes
            interactive=True
        )

    with gr.Row():
        extraction_mode = gr.Radio(
            label="Extraction Mode",
            choices=["All Pages", "Range of Pages"],
            value="All Pages",
            interactive=True
        )

    with gr.Row():
        start_page = gr.Number(
            label="Start Page",
            value=1,
            precision=0,
            interactive=True,
            visible=False  # Initially hidden
        )
        end_page = gr.Number(
            label="End Page",
            value=1,
            precision=0,
            interactive=True,
            visible=False  # Initially hidden
        )

    # Toggle visibility of start_page and end_page based on extraction_mode
    extraction_mode.change(
        fn=lambda mode: (
            gr.update(visible=(mode == "Range of Pages")),
            gr.update(visible=(mode == "Range of Pages"))
        ),
        inputs=[extraction_mode],
        outputs=[start_page, end_page]
    )

    with gr.Row():
        extract_button = gr.Button("Extract and Download")

    with gr.Row():
        page_csv_download = gr.File(
            label="Download Page-wise CSV",
            interactive=False
        )
        sentence_csv_download = gr.File(
            label="Download Sentence-wise CSV",
            interactive=False
        )

    with gr.Row():
        status_output = gr.Textbox(
            label="Status",
            interactive=False,
            lines=2
        )

    extract_button.click(
        fn=on_extract,
        inputs=[pdf_input, extraction_mode, start_page, end_page],
        outputs=[page_csv_download, sentence_csv_download, status_output]
    )

    gr.Markdown("""
    ---
    Developed with ❤️ using Gradio and LangChain.
    """)

# Launch the Gradio app
demo.queue().launch()
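
# To run this app locally (assuming this file is saved as app.py; PyPDFLoader
# needs the pypdf package under the hood):
#
#   pip install gradio pandas langchain-community pypdf nltk
#   python app.py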