sadickam committed on
Commit
d9a4942
·
verified ·
1 Parent(s): 9f42776

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -42
app.py CHANGED
@@ -1,58 +1,150 @@
1
  import gradio as gr
2
  import pandas as pd
 
3
  from langchain_community.document_loaders import UnstructuredFileLoader
4
 
5
- def extract_text_with_langchain_pdf(pdf_file):
6
- """Extract text from a PDF page by page using LangChain's UnstructuredFileLoader."""
7
- loader = UnstructuredFileLoader(pdf_file) # Use the file path directly
8
- documents = loader.load()
 
 
 
 
 
 
 
 
 
9
 
10
- # Initialize an empty list to collect all extracted paragraphs
11
- extracted_data = []
12
 
13
- # Extract content for each page, split into paragraphs, and collect metadata
14
- doc_name = pdf_file.split("/")[-1] # Get the document name
15
- for doc in documents:
16
- page_num = doc.metadata.get("page_number", "Unknown") # Get the page number if available
17
- paragraphs = doc.page_content.split("\n\n") # Split content by paragraphs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- for paragraph in paragraphs:
20
- if paragraph.strip(): # Skip empty paragraphs
21
- extracted_data.append({
22
- "Document": doc_name,
23
- "Page": page_num,
24
- "Paragraph": paragraph.strip()
25
- })
26
-
27
- # Convert the extracted data to a DataFrame
28
- df = pd.DataFrame(extracted_data)
29
- return df
30
-
31
- def save_df_to_csv(df, output_filename="extracted_content.csv"):
32
- """Save the DataFrame to a CSV file."""
33
- df.to_csv(output_filename, index=False)
34
- return output_filename
35
 
36
  with gr.Blocks() as demo:
 
 
37
  with gr.Row():
38
- gr.Markdown("# PDF Text Extractor with Metadata and CSV Export")
39
-
 
 
 
 
 
40
  with gr.Row():
41
- pdf_file = gr.File(label="Upload PDF", type="filepath")
42
-
43
  with gr.Row():
44
- extract_button = gr.Button("Extract and Download CSV")
45
-
 
 
 
 
 
46
  with gr.Row():
47
- download_button = gr.File(label="Download Extracted CSV")
48
-
49
- def on_extract(pdf_file):
50
- """Callback function to extract text, store in a DataFrame, and return a downloadable CSV."""
51
- df = extract_text_with_langchain_pdf(pdf_file)
52
- csv_path = save_df_to_csv(df)
53
- return csv_path
54
-
55
- extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])
 
 
 
 
 
 
 
56
 
57
  # Launch the Gradio app
58
  demo.queue().launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import io
4
  from langchain_community.document_loaders import UnstructuredFileLoader
5
 
6
def extract_text_with_langchain_pdf(pdf_file_path):
    """
    Extract text from a PDF using LangChain's UnstructuredFileLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.

    Returns:
        tuple: A DataFrame with one row per non-empty paragraph (columns
        "Document", "Page", "Paragraph") and a single string holding the
        content of every loaded document joined with newlines.

    Raises:
        RuntimeError: If loading or processing fails. The original
            exception is attached as ``__cause__``.
    """
    import os

    try:
        # NOTE(review): with the loader's default settings the PDF may come
        # back as a single Document without "page_number" metadata, in which
        # case every row gets "Unknown" -- confirm whether mode="elements"
        # was intended.
        loader = UnstructuredFileLoader(pdf_file_path)
        documents = loader.load()

        # basename() handles the platform's path separator, unlike split("/")
        doc_name = os.path.basename(pdf_file_path)

        # Concatenate all document contents into a single string
        pdf_pages_content = "\n".join(doc.page_content for doc in documents)

        extracted_data = []
        for doc in documents:
            page_num = doc.metadata.get("page_number", "Unknown")
            # A blank line separates paragraphs
            for paragraph in doc.page_content.split("\n\n"):
                clean_para = paragraph.strip()
                if clean_para:
                    extracted_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Paragraph": clean_para,
                    })

        # Explicit columns keep the CSV header stable even when nothing
        # could be extracted from the file.
        df = pd.DataFrame(
            extracted_data, columns=["Document", "Page", "Paragraph"]
        )
        return df, pdf_pages_content

    except Exception as e:
        raise RuntimeError(f"Error during PDF extraction: {e}") from e
44
+
45
def df_to_csv_bytes(df):
    """
    Convert a DataFrame to UTF-8 encoded CSV bytes (index column omitted).

    Args:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        bytes: CSV data in bytes.

    Raises:
        RuntimeError: If the conversion fails. The original exception is
            attached as ``__cause__``.
    """
    try:
        # The context manager closes the buffer even if to_csv() raises
        with io.StringIO() as buffer:
            df.to_csv(buffer, index=False)
            return buffer.getvalue().encode("utf-8")
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}") from e
63
+
64
def text_to_txt_bytes(text):
    """
    Encode text as UTF-8 bytes suitable for a .txt download.

    Args:
        text (str): The text to convert.

    Returns:
        bytes: TXT data in bytes.

    Raises:
        RuntimeError: If encoding fails (for example when ``text`` is not
            a string). The original exception is attached as ``__cause__``.
    """
    try:
        return text.encode("utf-8")
    except Exception as e:
        raise RuntimeError(f"Error during TXT conversion: {e}") from e
79
+
80
def on_extract(pdf_file):
    """
    Callback: extract text from the uploaded PDF and prepare both downloads.

    Args:
        pdf_file: Value delivered by the upload component. Either a file
            path string or an object exposing the path as ``.name``;
            ``None`` when nothing has been uploaded.

    Returns:
        tuple: ``(csv_path, txt_path, status_message)``, one value per
        output component wired in ``extract_button.click``. On failure the
        two file outputs are left unchanged via ``gr.update()`` and the
        status message describes the problem.
    """
    import os
    import tempfile

    if pdf_file is None:
        return gr.update(), gr.update(), "No file uploaded."

    # Accept both a plain path and a wrapper object carrying the path
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    try:
        # Extract text and create DataFrame
        df, full_text = extract_text_with_langchain_pdf(pdf_path)

        # File components are fed file paths, so both exports are written
        # into a fresh temporary directory (avoids clashes between requests).
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        out_dir = tempfile.mkdtemp(prefix="pdf_extract_")
        csv_path = os.path.join(out_dir, f"{stem}_extracted.csv")
        txt_path = os.path.join(out_dir, f"{stem}_full_text.txt")

        with open(csv_path, "wb") as csv_file:
            csv_file.write(df_to_csv_bytes(df))
        with open(txt_path, "wb") as txt_file:
            txt_file.write(text_to_txt_bytes(full_text))

        return csv_path, txt_path, f"Extracted {len(df)} paragraph(s)."
    except Exception as e:
        # UI boundary: report the failure in the status box instead of crashing
        return gr.update(), gr.update(), f"Extraction failed: {e}"


with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Text Extractor with Metadata and Multiple Exports")

    with gr.Row():
        # "filepath" hands the callback a path string, as the previous
        # version of this app did.
        pdf_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
            type="filepath",
            interactive=True
        )

    with gr.Row():
        extract_button = gr.Button("Extract and Download")

    with gr.Row():
        # gr.File used as an output renders a download link for the path
        # returned by the callback.
        csv_download = gr.File(
            label="Download Extracted CSV"
        )
        txt_download = gr.File(
            label="Download Full Text"
        )

    with gr.Row():
        error_output = gr.Textbox(
            label="Status",
            interactive=False,
            lines=2
        )

    # One output per value returned by on_extract, in the same order
    extract_button.click(
        fn=on_extract,
        inputs=pdf_input,
        outputs=[csv_download, txt_download, error_output]
    )

    gr.Markdown("""
    ---
    Developed Gradio and LangChain.
    """)

# Launch the Gradio app
demo.queue().launch()