Cachoups committed on
Commit
22db5e3
·
verified ·
1 Parent(s): d2deb56

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -125
app.py CHANGED
@@ -3,7 +3,11 @@ import gradio as gr
3
  from transformers import pipeline
4
  import spacy
5
  import lib.read_pdf
6
-
 
 
 
 
7
  # Initialize spaCy model
8
  nlp = spacy.load('en_core_web_sm')
9
  nlp.add_pipe('sentencizer')
@@ -70,134 +74,171 @@ def get_pdf_files(folder):
70
 
71
  def show(name):
72
  return f"{name}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  stored_paragraphs_1 = []
75
  stored_paragraphs_2 = []
76
 
77
  with gr.Blocks() as demo:
78
- gr.Markdown("## Financial Report Paragraph Selection and Analysis")
79
-
80
- with gr.Row():
81
- # Upload PDFs
82
- with gr.Column():
83
- pdf1 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 1")
84
- pdf2 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 2")
85
-
86
- with gr.Column():
87
- b1 = gr.Button("Extract and Display Paragraphs")
88
- paragraph_1_dropdown = gr.Dropdown(label="Select Paragraph from PDF 1")
89
- paragraph_2_dropdown = gr.Dropdown(label="Select Paragraph from PDF 2")
90
-
91
- def update_paragraphs(pdf1, pdf2):
92
- global stored_paragraphs_1, stored_paragraphs_2
93
- stored_paragraphs_1, stored_paragraphs_2 = extract_and_summarize(pdf1, pdf2)
94
- updated_dropdown_1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
95
- updated_dropdown_2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
96
- return gr.update(choices=updated_dropdown_1), gr.update(choices=updated_dropdown_2)
97
-
98
- b1.click(fn=update_paragraphs, inputs=[pdf1, pdf2], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])
99
-
100
- with gr.Row():
101
- # Process the selected paragraph from PDF 1
102
- with gr.Column():
103
- gr.Markdown("### PDF 1 Analysis")
104
- def process_paragraph_1_sum(paragraph):
105
- try:
106
- paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
107
- selected_paragraph = stored_paragraphs_1[paragraph_index]
108
- summary = summarize_text(selected_paragraph)
109
- return summary
110
- except (IndexError, ValueError):
111
- return "Error"
112
- def process_paragraph_1_sent(paragraph):
113
- try:
114
- paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
115
- selected_paragraph = stored_paragraphs_1[paragraph_index]
116
- sentiment = text_to_sentiment(selected_paragraph)
117
-
118
- return sentiment
119
- except (IndexError, ValueError):
120
- return "Error"
121
- def process_paragraph_1_sent_tone(paragraph):
122
- try:
123
- paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
124
- selected_paragraph = stored_paragraphs_1[paragraph_index]
125
- fin_spans = fin_ext(selected_paragraph)
126
- return fin_spans
127
- except (IndexError, ValueError):
128
- return []
129
- def process_paragraph_1_sent_tone_bis(paragraph):
130
- try:
131
- paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
132
- selected_paragraph = stored_paragraphs_1[paragraph_index]
133
- fin_spans = fin_ext_bis(selected_paragraph)
134
- return fin_spans
135
- except (IndexError, ValueError):
136
- return []
137
- selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
138
- selected_paragraph_1.change(show, paragraph_1_dropdown, selected_paragraph_1)
139
- summarize_btn1 = gr.Button("Summarize Text from PDF 1")
140
- summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=2)
141
- summarize_btn1.click(fn=lambda p: process_paragraph_1_sum(p), inputs=paragraph_1_dropdown, outputs=summary_textbox_1)
142
- sentiment_btn1 = gr.Button("Classify Financial Tone from PDF 1")
143
- sentiment_textbox_1 = gr.Textbox(label="Classification for PDF 1", lines=1)
144
- sentiment_btn1.click(fn=lambda p: process_paragraph_1_sent(p), inputs=paragraph_1_dropdown, outputs=sentiment_textbox_1)
145
- analyze_btn1 = gr.Button("Analyze Financial Tone on each sentence with yiyanghkust/finbert-tone")
146
- fin_spans_1 = gr.HighlightedText(label="Financial Tone Analysis for PDF 1")
147
- analyze_btn1.click(fn=lambda p: process_paragraph_1_sent_tone(p), inputs=paragraph_1_dropdown, outputs=fin_spans_1)
148
- analyze_btn1_ = gr.Button("Analyze Financial Tone on each sentence with ProsusAI/finbert")
149
- fin_spans_1_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 1 bis")
150
- analyze_btn1_.click(fn=lambda p: process_paragraph_1_sent_tone_bis(p), inputs=paragraph_1_dropdown, outputs=fin_spans_1_)
151
-
152
- # Process the selected paragraph from PDF 2
153
- with gr.Column():
154
- gr.Markdown("### PDF 2 Analysis")
155
- def process_paragraph_2_sum(paragraph):
156
- try:
157
- paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
158
- selected_paragraph = stored_paragraphs_2[paragraph_index]
159
- summary = summarize_text(selected_paragraph)
160
- return summary
161
- except (IndexError, ValueError):
162
- return "Error"
163
- def process_paragraph_2_sent(paragraph):
164
- try:
165
- paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
166
- selected_paragraph = stored_paragraphs_2[paragraph_index]
167
- sentiment = text_to_sentiment(selected_paragraph)
168
-
169
- return sentiment
170
- except (IndexError, ValueError):
171
- return "Error"
172
- def process_paragraph_2_sent_tone(paragraph):
173
- try:
174
- paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
175
- selected_paragraph = stored_paragraphs_2[paragraph_index]
176
- fin_spans = fin_ext(selected_paragraph)
177
- return fin_spans
178
- except (IndexError, ValueError):
179
- return []
180
- def process_paragraph_2_sent_tone_bis(paragraph):
181
- try:
182
- paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
183
- selected_paragraph = stored_paragraphs_2[paragraph_index]
184
- fin_spans = fin_ext_bis(selected_paragraph)
185
- return fin_spans
186
- except (IndexError, ValueError):
187
- return []
188
- selected_paragraph_2 = gr.Textbox(label="Selected Paragraph 2 Content", lines=4)
189
- selected_paragraph_2.change(show, paragraph_2_dropdown, selected_paragraph_2)
190
- summarize_btn2 = gr.Button("Summarize Text from PDF 2")
191
- summary_textbox_2 = gr.Textbox(label="Summary for PDF 2", lines=2)
192
- summarize_btn2.click(fn=lambda p: process_paragraph_2_sum(p), inputs=paragraph_2_dropdown, outputs=summary_textbox_2)
193
- sentiment_btn2 = gr.Button("Classify Financial Tone from PDF 2")
194
- sentiment_textbox_2 = gr.Textbox(label="Classification for PDF 2", lines=1)
195
- sentiment_btn2.click(fn=lambda p: process_paragraph_2_sent(p), inputs=paragraph_2_dropdown, outputs=sentiment_textbox_2)
196
- analyze_btn2 = gr.Button("Analyze Financial Tone on each sentence with yiyanghkust/finbert-tone")
197
- fin_spans_2 = gr.HighlightedText(label="Financial Tone Analysis for PDF 2")
198
- analyze_btn2.click(fn=lambda p: process_paragraph_2_sent_tone(p), inputs=paragraph_2_dropdown, outputs=fin_spans_2)
199
- analyze_btn2_ = gr.Button("Analyze Financial Tone on each sentence with ProsusAI/finbert")
200
- fin_spans_2_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 2 bis")
201
- analyze_btn2_.click(fn=lambda p: process_paragraph_2_sent_tone_bis(p), inputs=paragraph_2_dropdown, outputs=fin_spans_2_)
202
 
203
  demo.launch()
 
3
  from transformers import pipeline
4
  import spacy
5
  import lib.read_pdf
6
+ import pandas as pd
7
+ import re
8
+ import matplotlib.pyplot as plt
9
+ import matplotlib.patches as patches
10
+ import io
11
  # Initialize spaCy model
12
  nlp = spacy.load('en_core_web_sm')
13
  nlp.add_pipe('sentencizer')
 
74
 
75
  def show(name):
76
  return f"{name}"
77
+
78
def get_excel_files(folder):
    """List the entries of *folder* whose names end with ``.xlsx``.

    Names are returned bare (without the folder prefix) and in the order
    ``os.listdir`` yields them.
    """
    workbooks = []
    for entry in os.listdir(folder):
        if entry.endswith('.xlsx'):
            workbooks.append(entry)
    return workbooks
80
+
81
def get_sheet_names(file):
    """Return the sheet names of the workbook *file* located in ``DATA_FOLDER``.

    The workbook is opened through a context manager so the underlying file
    handle is released as soon as the names have been read (the previous
    version left the ``ExcelFile`` open).
    """
    with pd.ExcelFile(os.path.join(DATA_FOLDER, file)) as xls:
        return xls.sheet_names
84
+
85
def _extract_report_year(file_name):
    """Return the first four-digit number found in *file_name* as an int."""
    match = re.search(r'(\d{4})', file_name)
    if match is None:
        raise ValueError(f"No four-digit year found in file name: {file_name!r}")
    return int(match.group(1))


def _load_scenario_table(file_name, sheet_name, year):
    """Read one sheet from the ``data`` folder and relabel its columns relative to *year*."""
    df = pd.read_excel(os.path.join("data", file_name), sheet_name=sheet_name, index_col=0)

    # Skip the first four rows (presumably title/header rows of the sheet --
    # TODO confirm against the workbook layout) and renumber what is left.
    df = df.iloc[4:].reset_index(drop=True)

    scenario_years = (year, year + 1, year + 2)
    new_columns = (
        ['Country', 'Code', f'Historical {year - 1}']
        + [f'Baseline {y}' for y in scenario_years]
        + [f'Adverse {y}' for y in scenario_years]
        + ['Adverse Cumulative', 'Adverse Minimum', f'Level Deviation {year + 2}']
    )

    # Refuse to relabel a sheet whose width does not match the expected layout.
    if len(df.columns) != len(new_columns):
        raise ValueError(f"Expected {len(new_columns)} columns, but found {len(df.columns)} columns in the data.")
    df.columns = new_columns
    return df


def process_and_compare(file1, sheet1, file2, sheet2):
    """Plot, per country, how 'Adverse Cumulative' differs between two workbooks.

    Args:
        file1: Name of the first workbook inside the ``data`` folder; must
            contain a four-digit year.
        sheet1: Sheet to read from ``file1`` (also used in the plot title).
        file2: Name of the second workbook; must contain a four-digit year.
        sheet2: Sheet to read from ``file2``.

    Returns:
        The rendered bar chart as PNG-encoded ``bytes``.
        NOTE(review): the result is fed to ``gr.Image``; confirm that the
        Gradio version in use accepts raw PNG bytes.

    Raises:
        ValueError: If a file name has no four-digit year or a sheet does not
            have the expected number of columns.
    """
    year1 = _extract_report_year(file1)
    year2 = _extract_report_year(file2)
    df1 = _load_scenario_table(file1, sheet1, year1)
    df2 = _load_scenario_table(file2, sheet2, year2)

    # Merge only the columns that are plotted, with suffixes tied to the
    # frame (not the year).  The earlier version merged (df2, df1) while
    # passing suffixes (year1, year2), so each frame received the other's
    # label and the plotted sign was the opposite of what the title states;
    # year-based suffixes also collided when both files shared a year.
    merged_df = pd.merge(
        df1[['Country', 'Adverse Cumulative']],
        df2[['Country', 'Adverse Cumulative']],
        on='Country',
        suffixes=('_file1', '_file2'),
    )
    merged_df['Country'] = merged_df['Country'].astype(str)

    # Coerce before subtracting so non-numeric cells become NaN instead of
    # breaking the arithmetic.
    first = pd.to_numeric(merged_df['Adverse Cumulative_file1'], errors='coerce')
    second = pd.to_numeric(merged_df['Adverse Cumulative_file2'], errors='coerce')
    merged_df['Difference adverse cumulative growth'] = second - first

    # Cycle through the palette so bars and legend entries stay aligned even
    # when there are more countries than palette colours.
    palette = plt.get_cmap('tab20').colors
    bar_colors = [palette[i % len(palette)] for i in range(len(merged_df))]

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        ax.bar(
            merged_df['Country'],
            merged_df['Difference adverse cumulative growth'],
            color=bar_colors,
        )

        handles = [
            patches.Patch(color=color, label=country)
            for color, country in zip(bar_colors, merged_df['Country'])
        ]
        ax.legend(handles=handles, title='Countries', bbox_to_anchor=(1.05, 1), loc='upper left')

        ax.set_title(f'Histogram of Difference between Adverse cumulative growth of {year2} and {year1} for {sheet1}')
        ax.set_xlabel('Country')
        ax.set_ylabel('Difference')
        ax.tick_params(axis='x', labelrotation=90)

        # Render into memory and hand back the encoded image.
        with io.BytesIO() as buf:
            fig.savefig(buf, format='png', bbox_inches='tight')
            return buf.getvalue()
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each button click would leak one figure.
        plt.close(fig)
156
+
157
+
158
  stored_paragraphs_1 = []
159
  stored_paragraphs_2 = []
160
 
161
  with gr.Blocks() as demo:
162
+ with gr.Tab("Financial Report Text Analysis"):
163
+ gr.Markdown("## Financial Report Paragraph Selection and Analysis on adverse macro-economy scenario")
164
+
165
+ with gr.Row():
166
+ # Upload PDFs
167
+ with gr.Column():
168
+ pdf1 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 1")
169
+ pdf2 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 2")
170
+
171
+ with gr.Column():
172
+ b1 = gr.Button("Extract and Display Paragraphs")
173
+ paragraph_1_dropdown = gr.Dropdown(label="Select Paragraph from PDF 1")
174
+ paragraph_2_dropdown = gr.Dropdown(label="Select Paragraph from PDF 2")
175
+
176
def update_paragraphs(pdf1, pdf2):
    """Extract paragraphs from both PDFs and refresh the two paragraph dropdowns.

    The extracted paragraphs are stored in the module-level
    ``stored_paragraphs_1`` / ``stored_paragraphs_2`` lists; each dropdown
    choice shows the paragraph number followed by its first 100 characters.
    """
    global stored_paragraphs_1, stored_paragraphs_2
    stored_paragraphs_1, stored_paragraphs_2 = extract_and_summarize(pdf1, pdf2)

    def _preview_labels(paragraphs):
        labels = []
        for i, p in enumerate(paragraphs):
            labels.append(f"Paragraph {i+1}: {p[:100]}...")
        return labels

    first_choices = _preview_labels(stored_paragraphs_1)
    second_choices = _preview_labels(stored_paragraphs_2)
    return gr.update(choices=first_choices), gr.update(choices=second_choices)
182
+
183
+ b1.click(fn=update_paragraphs, inputs=[pdf1, pdf2], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])
184
+
185
+ with gr.Row():
186
+ # Process the selected paragraph from PDF 1
187
+ with gr.Column():
188
+ gr.Markdown("### PDF 1 Analysis")
189
+ selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
190
+ selected_paragraph_1.change(show, paragraph_1_dropdown, selected_paragraph_1)
191
+ summarize_btn1 = gr.Button("Summarize Text from PDF 1")
192
+ summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=2)
193
+ summarize_btn1.click(fn=lambda p: process_paragraph_1_sum(p), inputs=paragraph_1_dropdown, outputs=summary_textbox_1)
194
+ sentiment_btn1 = gr.Button("Classify Financial Tone from PDF 1")
195
+ sentiment_textbox_1 = gr.Textbox(label="Classification for PDF 1", lines=1)
196
+ sentiment_btn1.click(fn=lambda p: process_paragraph_1_sent(p), inputs=paragraph_1_dropdown, outputs=sentiment_textbox_1)
197
+ analyze_btn1 = gr.Button("Analyze Financial Tone on each sentence with yiyanghkust/finbert-tone")
198
+ fin_spans_1 = gr.HighlightedText(label="Financial Tone Analysis for PDF 1")
199
+ analyze_btn1.click(fn=lambda p: process_paragraph_1_sent_tone(p), inputs=paragraph_1_dropdown, outputs=fin_spans_1)
200
+ analyze_btn1_ = gr.Button("Analyze Financial Tone on each sentence with ProsusAI/finbert")
201
+ fin_spans_1_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 1 bis")
202
+ analyze_btn1_.click(fn=lambda p: process_paragraph_1_sent_tone_bis(p), inputs=paragraph_1_dropdown, outputs=fin_spans_1_)
203
+
204
+ # Process the selected paragraph from PDF 2
205
+ with gr.Column():
206
+ gr.Markdown("### PDF 2 Analysis")
207
+ selected_paragraph_2 = gr.Textbox(label="Selected Paragraph 2 Content", lines=4)
208
+ selected_paragraph_2.change(show, paragraph_2_dropdown, selected_paragraph_2)
209
+ summarize_btn2 = gr.Button("Summarize Text from PDF 2")
210
+ summary_textbox_2 = gr.Textbox(label="Summary for PDF 2", lines=2)
211
+ summarize_btn2.click(fn=lambda p: process_paragraph_2_sum(p), inputs=paragraph_2_dropdown, outputs=summary_textbox_2)
212
+ sentiment_btn2 = gr.Button("Classify Financial Tone from PDF 2")
213
+ sentiment_textbox_2 = gr.Textbox(label="Classification for PDF 2", lines=1)
214
+ sentiment_btn2.click(fn=lambda p: process_paragraph_2_sent(p), inputs=paragraph_2_dropdown, outputs=sentiment_textbox_2)
215
+ analyze_btn2 = gr.Button("Analyze Financial Tone on each sentence with yiyanghkust/finbert-tone")
216
+ fin_spans_2 = gr.HighlightedText(label="Financial Tone Analysis for PDF 2")
217
+ analyze_btn2.click(fn=lambda p: process_paragraph_2_sent_tone(p), inputs=paragraph_2_dropdown, outputs=fin_spans_2)
218
+ analyze_btn2_ = gr.Button("Analyze Financial Tone on each sentence with ProsusAI/finbert")
219
+ fin_spans_2_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 2 bis")
220
+ analyze_btn2_.click(fn=lambda p: process_paragraph_2_sent_tone_bis(p), inputs=paragraph_2_dropdown, outputs=fin_spans_2_)
221
+
222
+ with gr.Tab("Financial Report Table Analysis"):
223
+ # New tab content goes here
224
+ gr.Markdown("## Excel Data Comparison")
225
+
226
+ with gr.Row():
227
+ with gr.Column():
228
+ file1 = gr.Dropdown(choices=get_excel_files(DATA_FOLDER), label="Select Excel File 1")
229
+ file2 = gr.Dropdown(choices=get_excel_files(DATA_FOLDER), label="Select Excel File 2")
230
+ sheet = gr.Dropdown(choices=[], label="Select Sheet for File 1 and 2")
231
+
232
+ with gr.Column():
233
+ result = gr.Image(label="Comparison pLot")
234
+
235
def update_sheets(file):
    """Refresh the sheet dropdown's choices for the selected workbook.

    Returns a ``gr.update`` carrying the new choices, matching how
    ``update_paragraphs`` refreshes its dropdowns; returning the bare list
    would be interpreted as the dropdown's *value* rather than its choices.
    An empty selection clears the choices instead of failing on path join.
    """
    if not file:
        return gr.update(choices=[])
    return gr.update(choices=get_sheet_names(file))
237
+
238
+ file1.change(fn=update_sheets, inputs=file1, outputs=sheet)
239
+ file2.change(fn=update_sheets, inputs=file2, outputs=sheet)
240
+
241
+ b1 = gr.Button("Compare Data")
242
+ b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
 
244
  demo.launch()