import os
import gradio as gr
from gradio_calendar import Calendar
from transformers import pipeline
import spacy
import lib.read_pdf
import lib.comparison
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import io
import shutil
# Initialize spaCy model
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('sentencizer')
# Gradio interface setup
PDF_FOLDER = "data"
def split_in_sentences(text):
    doc = nlp(text)
    return [str(sent).strip() for sent in doc.sents]
def make_spans(text, results):
    results_list = [res['label'] for res in results]
    facts_spans = list(zip(split_in_sentences(text), results_list))
    return facts_spans
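# A minimal sketch of the span format consumed by gr.HighlightedText below
# (hypothetical sentences and labels, shaped like FinBERT pipeline output):
#   >>> make_spans("Profits rose. Outlook is uncertain.",
#   ...            [{'label': 'Positive'}, {'label': 'Neutral'}])
#   [('Profits rose.', 'Positive'), ('Outlook is uncertain.', 'Neutral')]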
# Initialize pipelines
summarizer = pipeline("summarization", model="human-centered-summarization/financial-summarization-pegasus")
fin_model = pipeline("sentiment-analysis", model='yiyanghkust/finbert-tone', tokenizer='yiyanghkust/finbert-tone')
fin_model_bis = pipeline("sentiment-analysis", model='ProsusAI/finbert', tokenizer='ProsusAI/finbert')
table_to_text = pipeline('text2text-generation', model='google/flan-t5-xl')
def summarize_text(text):
    resp = summarizer(text)
    return resp[0]['summary_text']
def text_to_sentiment(text, all_score=False, label=True):
    if label:
        return fin_model(text, return_all_scores=all_score)[0]["label"]
    else:
        return fin_model(text, return_all_scores=all_score)
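# Sketch of the two return shapes (hypothetical label and scores): label=True yields
# only the top label, while all_score=True with label=False yields one score dict per
# class, wrapped in an outer list, which the process_paragraph_*_sent helpers unpack:
#   >>> text_to_sentiment("Revenue grew strongly.")
#   'Positive'
#   >>> text_to_sentiment("Revenue grew strongly.", all_score=True, label=False)
#   [[{'label': 'Positive', 'score': 0.98}, {'label': 'Neutral', 'score': 0.01}, ...]]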
def fin_ext(text):
    results = fin_model(split_in_sentences(text))
    return make_spans(text, results)
def fin_ext_bis(text):
    results = fin_model_bis(split_in_sentences(text))
    return make_spans(text, results)
def upload_file_and_update_dropdown(files):
    for file in files:
        if file is not None:
            # Save the file to the upload directory
            file_path = os.path.join(PDF_FOLDER, os.path.basename(file))
            shutil.copyfile(file.name, file_path)
    # Get the updated list of files
    files_list = os.listdir(PDF_FOLDER)
    return gr.update(choices=files_list), gr.update(choices=files_list)
def extract_and_paragraph(pdf1, pdf2, paragraph):
    if not pdf1 or not pdf2:
        return [], []
    pdf1_path = os.path.join(PDF_FOLDER, pdf1)
    pdf2_path = os.path.join(PDF_FOLDER, pdf2)
    # Extract and format paragraphs
    paragraphs_1 = lib.read_pdf.extract_and_format_paragraphs(pdf1_path)
    paragraphs_2 = lib.read_pdf.extract_and_format_paragraphs(pdf2_path)
    start_keyword = ["Main risks to", "Developments in Financial Markets"]
    end_keywords = ["4. Appendix", "Annex:", "4. Annex", "Detailed tables", "ACKNOWLEDGEMENTS", "STATISTICAL ANNEX", "PROSPECTS BY MEMBER STATES", "At the conclusion of the discussion"]
    start_index1, end_index1 = lib.read_pdf.find_text_range(paragraphs_1, start_keyword, end_keywords)
    start_index2, end_index2 = lib.read_pdf.find_text_range(paragraphs_2, start_keyword, end_keywords)
    paragraphs_1 = lib.read_pdf.extract_relevant_text(paragraphs_1, start_index1, end_index1)
    paragraphs_2 = lib.read_pdf.extract_relevant_text(paragraphs_2, start_index2, end_index2)
    if paragraph:
        paragraphs_1 = lib.read_pdf.split_text_into_paragraphs(paragraphs_1, 200)
        paragraphs_2 = lib.read_pdf.split_text_into_paragraphs(paragraphs_2, 200)
    return paragraphs_1, paragraphs_2
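# Usage sketch (hypothetical file names): each PDF is trimmed to the section between
# the start and end keywords above; with paragraph=True the result is additionally
# split into smaller chunks (the 200 passed to lib.read_pdf.split_text_into_paragraphs)
# for display in the dropdowns:
#   paras_1, paras_2 = extract_and_paragraph("eba_2023.pdf", "eba_2024.pdf", True)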
# Filter
def filter_paragraphs(keyword):
    global stored_paragraphs_1, stored_paragraphs_2
    global filter_paragraphs_1, filter_paragraphs_2
    if not keyword:
        # No keyword entered: restore the original, unfiltered lists
        paragraph1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
        paragraph2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
        filter_paragraphs_1 = stored_paragraphs_1
        filter_paragraphs_2 = stored_paragraphs_2
        return gr.update(choices=paragraph1, value=None), gr.update(choices=paragraph2, value=None)
    # Filter paragraphs that contain the keyword (case-insensitive)
    filter_paragraphs_1 = [p for p in stored_paragraphs_1 if keyword.lower() in p.lower()]
    filter_paragraphs_2 = [p for p in stored_paragraphs_2 if keyword.lower() in p.lower()]
    filtered1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(filter_paragraphs_1)]
    filtered2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(filter_paragraphs_2)]
    # Update the dropdowns with the filtered results
    return gr.update(choices=filtered1, value=None), gr.update(choices=filtered2, value=None)
def clear_paragraphs():
    global stored_paragraphs_1, stored_paragraphs_2
    global filter_paragraphs_1, filter_paragraphs_2
    # Reset the working lists so later index lookups match the restored dropdowns
    filter_paragraphs_1 = stored_paragraphs_1
    filter_paragraphs_2 = stored_paragraphs_2
    paragraph1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
    paragraph2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
    return gr.update(choices=paragraph1, value=None), gr.update(choices=paragraph2, value=None)
def filtered_close_paragraph(p, keyword, pdf):
    if not keyword:
        if pdf == "1":
            return lib.comparison.compare_selected_paragraph(p, stored_paragraphs_1)
        else:
            return lib.comparison.compare_selected_paragraph(p, stored_paragraphs_2)
    if pdf == "1":
        return lib.comparison.compare_selected_paragraph(p, filter_paragraphs_1)
    else:
        return lib.comparison.compare_selected_paragraph(p, filter_paragraphs_2)
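# All process_paragraph_* handlers below recover the paragraph index by parsing the
# dropdown label built in filter_paragraphs/update_paragraphs; e.g. (hypothetical entry):
#   >>> int("Paragraph 12: Main risks to...".split(':')[0].replace('Paragraph ', '')) - 1
#   11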
def process_paragraph_1_sum(paragraph):
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        selected_paragraph = filter_paragraphs_1[paragraph_index]
        summary = summarize_text(selected_paragraph)
        return summary
    except (IndexError, ValueError, AttributeError):
        return "Error"
def process_paragraph_1_sent(paragraph):
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        selected_paragraph = filter_paragraphs_1[paragraph_index]
        results = text_to_sentiment(selected_paragraph, True, False)
        if isinstance(results, list) and isinstance(results[0], list):
            # Unpack the list of dictionaries to get all labels with their scores
            output = {result['label']: result['score'] for result in results[0]}
            print(output)
        else:
            output = {"Error": "Unexpected output format"}
        return output
    except (IndexError, ValueError, AttributeError):
        return {"Error": "Unexpected output format"}
def process_paragraph_1_sent_tone(paragraph):
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        selected_paragraph = filter_paragraphs_1[paragraph_index]
        fin_spans = fin_ext(selected_paragraph)
        return fin_spans
    except (IndexError, ValueError, AttributeError):
        return []
def process_paragraph_1_sent_tone_bis(paragraph):
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        selected_paragraph = filter_paragraphs_1[paragraph_index]
        fin_spans = fin_ext_bis(selected_paragraph)
        return fin_spans
    except (IndexError, ValueError, AttributeError):
        return []
def process_paragraph_2_sum(paragraph):
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        selected_paragraph = filter_paragraphs_2[paragraph_index]
        summary = summarize_text(selected_paragraph)
        return summary
    except (IndexError, ValueError, AttributeError):
        return "Error"
def process_paragraph_2_sent(paragraph):
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        selected_paragraph = filter_paragraphs_2[paragraph_index]
        results = text_to_sentiment(selected_paragraph, True, False)
        if isinstance(results, list) and isinstance(results[0], list):
            # Unpack the list of dictionaries to get all labels with their scores
            output = {result['label']: result['score'] for result in results[0]}
        else:
            output = {"Error": "Unexpected output format"}
        return output
    except (IndexError, ValueError, AttributeError):
        return {"Error": "Unexpected output format"}
def process_paragraph_2_sent_tone(paragraph):
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        selected_paragraph = filter_paragraphs_2[paragraph_index]
        fin_spans = fin_ext(selected_paragraph)
        return fin_spans
    except (IndexError, ValueError, AttributeError):
        return []
def process_paragraph_2_sent_tone_bis(paragraph):
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        selected_paragraph = filter_paragraphs_2[paragraph_index]
        fin_spans = fin_ext_bis(selected_paragraph)
        return fin_spans
    except (IndexError, ValueError, AttributeError):
        return []
def get_pdf_files(folder):
    return [f for f in os.listdir(folder) if f.endswith('.pdf')]
def show1(paragraph):
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        selected_paragraph = filter_paragraphs_1[paragraph_index]
        return selected_paragraph
    except (IndexError, ValueError, AttributeError):
        return "Error"
def show2(paragraph):
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        selected_paragraph = filter_paragraphs_2[paragraph_index]
        return selected_paragraph
    except (IndexError, ValueError, AttributeError):
        return "Error"
def get_excel_files(folder):
    return [f for f in os.listdir(folder) if f.endswith('.xlsx')]
def get_sheet_names(file):
    xls = pd.ExcelFile(os.path.join(PDF_FOLDER, file))
    return gr.update(choices=xls.sheet_names)
def process_and_compare(file1, sheet1, file2, sheet2):
    def process_file(file_path, sheet_name):
        # Extract the year from the file name
        year = int(re.search(r'(\d{4})', file_path).group(1))
        # Load the Excel file
        df = pd.read_excel(os.path.join(PDF_FOLDER, file_path), sheet_name=sheet_name, index_col=0)
        # Define expected columns based on the extracted year
        historical_col = f'Historical {year - 1}'
        baseline_cols = [f'Baseline {year}', f'Baseline {year + 1}', f'Baseline {year + 2}']
        adverse_cols = [f'Adverse {year}', f'Adverse {year + 1}', f'Adverse {year + 2}']
        level_deviation_col = f'Level Deviation {year + 2}'
        # Drop the header rows and reset the index
        df = df.iloc[4:].reset_index(drop=True)
        # Define the new column names
        new_columns = ['Country', 'Code', historical_col] + baseline_cols + adverse_cols + ['Adverse Cumulative', 'Adverse Minimum', level_deviation_col]
        # Ensure the number of columns matches
        if len(df.columns) == len(new_columns):
            df.columns = new_columns
        else:
            raise ValueError(f"Expected {len(new_columns)} columns, but found {len(df.columns)} columns in the data.")
        columns = ['Country', f'Adverse {year}', f'Adverse {year+1}', f'Adverse {year+2}', 'Adverse Cumulative']
        return df, df[columns]
    # Process both files
    global stored_df1, stored_df2
    df1, stored_df1 = process_file(file1, sheet1)
    df2, stored_df2 = process_file(file2, sheet2)
    year1 = int(re.search(r'(\d{4})', file1).group(1))
    year2 = int(re.search(r'(\d{4})', file2).group(1))
    # Merge dataframes on 'Country'; df2 is the left frame, so its columns take the
    # year2 suffix and df1's take the year1 suffix
    merged_df = pd.merge(df2, df1, on='Country', suffixes=(f'_{year2}', f'_{year1}'))
    merged_df['Difference adverse cumulative growth'] = merged_df[f'Adverse Cumulative_{year2}'] - merged_df[f'Adverse Cumulative_{year1}']
    # Ensure data types are correct
    merged_df['Country'] = merged_df['Country'].astype(str)
    merged_df['Difference adverse cumulative growth'] = pd.to_numeric(merged_df['Difference adverse cumulative growth'], errors='coerce')
    # Create a bar chart with one color per country
    fig, ax = plt.subplots(figsize=(12, 8))
    colors = plt.get_cmap('tab20').colors  # Use a colormap with multiple colors
    num_countries = len(merged_df['Country'])
    bars = ax.bar(merged_df['Country'], merged_df['Difference adverse cumulative growth'], color=colors[:num_countries])
    # Add a legend
    handles = [patches.Patch(color=color, label=country) for color, country in zip(colors[:num_countries], merged_df['Country'])]
    ax.legend(handles=handles, title='Countries', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.set_title(f'Difference in adverse cumulative growth between {year2} and {year1} for {sheet1}')
    ax.set_xlabel('Country')
    ax.set_ylabel('Difference')
    plt.xticks(rotation=90)
    # Save the plot to a file (create the output directory if needed)
    os.makedirs('output', exist_ok=True)
    file_path = 'output/plot.png'
    plt.savefig(file_path, format='png', bbox_inches='tight')
    plt.close()
    filtered_countries1 = [country for country in stored_df1.Country.values.tolist() if (len(str(country)) < 20 and str(country) != "nan")]
    filtered_countries2 = [country for country in stored_df2.Country.values.tolist() if (len(str(country)) < 20 and str(country) != "nan")]
    return file_path, gr.update(choices=filtered_countries1), gr.update(choices=filtered_countries2)
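# Note on the expected sheet layout, as implied by the renaming above: four header rows
# are skipped, then 12 columns are assumed (Country, Code, one historical year, three
# baseline years, three adverse years, Adverse Cumulative, Adverse Minimum, and a
# level-deviation column); files whose names carry a different 4-digit year shift the
# generated column labels accordingly.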
def find_sentences_with_keywords(text, keywords):
    # Split text into sentences using a regular expression that matches sentence-ending punctuation
    sentences = re.split(r'(?<=[.!?])\s+', text)
    matched_sentences = set()  # Use a set to store unique sentences
    # For each keyword, find sentences that contain the keyword as a whole word
    for keyword in keywords:
        keyword_pattern = re.compile(rf'\b{re.escape(keyword)}\b', re.IGNORECASE)  # Word boundaries
        for sentence in sentences:
            if keyword_pattern.search(sentence):
                matched_sentences.add(sentence)  # Add to the set to ensure uniqueness
    return list(matched_sentences)  # Convert the set back to a list for consistent output
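# A small worked example (the word boundaries keep 'GDP' from matching 'GDPR'; output
# order may vary because matches are collected in a set):
#   >>> find_sentences_with_keywords("GDP fell sharply. GDPR rules changed. GDP later recovered.", ["GDP"])
#   ['GDP fell sharply.', 'GDP later recovered.']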
# Main function to process both PDFs based on the Excel file names and the sheet name
def process_pdfs_and_analyze_sentiment(file1, file2, sheet):
    # Extract text from both PDFs based on the file name
    pdf_file1 = file1.replace(".xlsx", ".pdf")
    pdf_file2 = file2.replace(".xlsx", ".pdf")
    text1, text2 = extract_and_paragraph(pdf_file1, pdf_file2, False)
    # Use the sheet name as the keyword to find relevant sentences
    keywords = {
        'GDP': ['GDP'],
        'HICP': ['HICP'],
        'RRE prices': ['RRE', 'residential'],
        'CRE prices': ['CRE', 'commercial'],
        'Unemployment': ['unemployment']
    }
    selected_keywords = keywords.get(sheet, [])
    # Find sentences containing the keywords
    sentences1 = find_sentences_with_keywords(text1, selected_keywords)
    sentences2 = find_sentences_with_keywords(text2, selected_keywords)
    # Concatenate all sentences for each PDF
    text_pdf1 = "\n".join(sentences1)
    text_pdf2 = "\n".join(sentences2)
    # Perform sentiment analysis on the extracted sentences for each PDF
    result_pdf1 = fin_ext(text_pdf1)
    result_pdf2 = fin_ext(text_pdf2)
    return result_pdf1, result_pdf2
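# The Excel and PDF reports are assumed to share a base name (hypothetical example:
# "eba_2023.xlsx" pairs with "eba_2023.pdf"), which is why a plain extension swap is
# enough to locate the matching PDF.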
#def change_choices(df):
#    return gr.update(choices=df.Country.values.tolist())
def generate_text(df, country, theme):
    # Filter the dataframe based on the country
    #for column in df.columns:
    #    if column != 'Country':
    #        df[column] = df[column].apply(lambda x: f"{x:.6f}%")
    #row = df[df['Country'] == country].iloc[0]
    def format_row_for_prompt(row):
        # Create a formatted string with colons and percentages
        formatted_row = []
        for col, value in row.items():
            if col != 'Country':  # Exclude 'Country' or format differently if needed
                if isinstance(value, (int, float)):  # Add a percentage sign for numeric values
                    value_str = f"{value:.6f}%"
                else:
                    value_str = str(value)
                formatted_row.append(f"{col}: {value_str}")
            else:
                formatted_row.append(f"{col}: {value}")
        return "\n".join(formatted_row)
    # Convert the row to a string format for the prompt
    year = int(re.search(r'(\d{4})', df.columns[1]).group(1))
    df.columns = ['Country', f'{year}', f'{year+1}', f'{year+2}', 'Total']
    row = df[df['Country'] == country].iloc[0]
    row_str = format_row_for_prompt(row)
    #row_str = row.to_string(index=True)
    print(row_str)
    simple_prompt = f"""
Here is the data for {theme} in {country}:
{row_str}
Summarize the adverse growth for {theme} in {country}. Highlight any increase or decrease compared to previous years and include the cumulative result.
"""
    prompt = f"""
Here is an example of how to describe adverse growth data for a given country:
Country: Australia
Adverse 1990: -0.43%
Adverse 1991: -1.99%
Adverse 1992: -1.20%
Adverse Cumulative: -3.57%
Topic: GDP
Description:
In the adverse scenario, the GDP growth in Australia was -0.43% in 1990. It decreased further to -1.99% in 1991, showing worsening conditions. There was a slight improvement to -1.20% in 1992. The total cumulative adverse growth is -3.57%.
Now, using the following data for {theme} in {country}, describe the adverse growth:
{row_str}
Topic: {theme}
Describe, using a similar pattern to the example, the changes for the provided country and years. Highlight how the values change year by year and whether they increased or decreased. Do not mention any other countries or years, and describe exactly what is in the table. Keep the description simple and direct.
"""
    prompt = f"""
Example:
Country: Australia
1990: -0.43%
1991: -1.99%
1992: -1.20%
Total: -3.57%
Answer:
In the adverse scenario, the growth in Australia was -0.43% in 1990. It worsened to -1.99% in 1991 and slightly improved to -1.20% in 1992. The total cumulative adverse growth was -3.57% from 1990 to 1992.
Now, using the following data in {country}, describe how the adverse growth changed each year, whether it increased or decreased, worsened or improved:
{row_str}
Answer:
"""
    prompt1 = f"""
Given the following adverse growth data for {theme} in {country}:
{row_str}
Topic: {theme}
Describe the yearly changes in adverse growth, highlighting whether the values increased or decreased, and provide the cumulative growth. Follow this example:
Example:
Country: Australia
1990: -0.43%
1991: -1.99%
1992: -1.20%
Cumulative: -3.57%
Topic: GDP
Description:
In Australia, GDP growth was -0.43% in 1990. It worsened to -1.99% in 1991 and improved to -1.20% in 1992. The total cumulative adverse growth was -3.57%.
Now, describe the data for {country}
"""
    print(year)
    # Generate the descriptive text using the model
    #result = table_to_text(prompt, max_length=240, temperature=0.7, top_p=0.3, do_sample=False)[0]['generated_text']
    result = table_to_text(prompt, max_length=240)[0]['generated_text']
    return result
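# For reference, format_row_for_prompt renders a row as "key: value" lines, with
# numeric values formatted as percentages (hypothetical values):
#   Country: Austria
#   2023: -1.200000%
#   2024: -0.800000%
#   2025: 0.300000%
#   Total: -1.700000%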
# Global variable
stored_paragraphs_1 = []
stored_paragraphs_2 = []
filter_paragraphs_1 = []
filter_paragraphs_2 = []
stored_df1 = []
stored_df2 = []
current_theme = {"dark": False}
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
# Define custom colors for the labels; the two FinBERT variants emit different label
# casings, hence the two maps
color_map = {
    "Positive": "green",
    "Neutral": "blue",
    "Negative": "red"
}
color_map1 = {
    "positive": "green",  # ProsusAI/finbert uses lowercase labels
    "neutral": "blue",
    "negative": "red"
}
with gr.Blocks(theme='gradio/soft', js=js_func) as demo:
    with gr.Tab("Methodology"):
        gr.Markdown("""
        ## Macro-economy Adverse Scenario Comparison from EBA Reports
        This application lets the user compare two reports, either from their text content or from their tables. It is divided into two tabs.
        **First Tab: Text Comparisons**
        - It handles EBA and Federal Open Market Committee report files. Do not rename the federal files.
        - Select two PDFs. Each PDF's text content will be extracted into paragraphs.
        - You can enter a keyword to filter paragraphs.
        - Select a paragraph from one PDF, and find the most similar paragraph from the other PDF using a similarity measure.
        - For a selected paragraph, compute a summary using the **FinPEGASUS model**.
        - For a selected paragraph, compute the sentiment of the whole paragraph and, for each sentence, classify it into three classes (Positive, Negative, Neutral) using two different fine-tuned **FinBERT models**:
            - [ProsusAI/finbert](https://huggingface.co/ProsusAI/finbert)
            - [yiyanghkust/finbert-tone](https://huggingface.co/yiyanghkust/finbert-tone)
        **Second Tab: Table Comparisons**
        - Select two Excel files and a sheet name.
        - For the two selected tables, compute the difference in the cumulative adverse growth rate over their respective three-year horizons for the selected sheet name (topic).
        - For the selected topic (sheet name), find related sentences in the associated PDF text that mention the topic, and classify them by sentiment.
        - For a selected country and topic, describe the adverse growth rate trend over three years using the [**google/flan-t5-xl**](https://huggingface.co/google/flan-t5-xl) model.
        """)
with gr.Tab("Financial Report Text Analysis"):
gr.Markdown("## Financial Report Paragraph Selection and Analysis on Adverse Macro-Economy Scenario")
with gr.Row():
# Upload PDFs
with gr.Column():
gr.Markdown("### Step 1: Upload PDF Files")
upload_button = gr.File(label="Upload files", file_types=[".pdf"], file_count="multiple")
pdf1 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 1")
pdf2 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 2")
upload_button.upload(upload_file_and_update_dropdown, upload_button, [pdf1, pdf2])
with gr.Column():
gr.Markdown("### Step 2: Extract and Display Paragraphs")
b1 = gr.Button("Extract and Display Paragraphs")
paragraph_1_dropdown = gr.Dropdown(label="Select Paragraph from PDF 1")
paragraph_2_dropdown = gr.Dropdown(label="Select Paragraph from PDF 2")
keyword_input = gr.Textbox(label="Enter keyword to search")
# Button to trigger the filtering
with gr.Row():
search_button = gr.Button("Search")
clear_button = gr.Button("Clear")
search_button.click(filter_paragraphs, inputs=keyword_input, outputs=[paragraph_1_dropdown, paragraph_2_dropdown])
clear_button.click(clear_paragraphs, inputs=[], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])
        # Extract paragraphs from the PDFs and populate the dropdowns
        def update_paragraphs(pdf1, pdf2):
            global stored_paragraphs_1, stored_paragraphs_2
            global filter_paragraphs_1, filter_paragraphs_2
            stored_paragraphs_1, stored_paragraphs_2 = extract_and_paragraph(pdf1, pdf2, True)
            filter_paragraphs_1, filter_paragraphs_2 = stored_paragraphs_1, stored_paragraphs_2
            updated_dropdown_1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
            updated_dropdown_2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
            return gr.update(choices=updated_dropdown_1), gr.update(choices=updated_dropdown_2)
        b1.click(fn=update_paragraphs, inputs=[pdf1, pdf2], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])
        gr.Markdown("---")
        with gr.Row():
            # PDF 1 analysis section
            with gr.Column():
                gr.Markdown("### PDF 1 Analysis")
                selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
                paragraph_1_dropdown.select(fn=show1, inputs=paragraph_1_dropdown, outputs=selected_paragraph_1)
                close_paragraph_1 = gr.Textbox(label="Closest Paragraph from PDF 2 to selected Paragraph PDF 1", lines=4)
                paragraph_1_dropdown.select(
                    fn=lambda p, keyword: filtered_close_paragraph(p, keyword, "2"),  # Compare against the PDF 2 paragraphs
                    inputs=[paragraph_1_dropdown, keyword_input],
                    outputs=close_paragraph_1
                )
                with gr.Group():
                    summarize_btn1 = gr.Button("Summarize Text from PDF 1")
                    summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=2)
                    # Summarize the selected paragraph from PDF 1
                    summarize_btn1.click(fn=process_paragraph_1_sum, inputs=paragraph_1_dropdown, outputs=summary_textbox_1)
                    sentiment_btn1 = gr.Button("Classify Financial Tone for paragraph from PDF 1")
                    sentiment_textbox_1 = gr.Label(label="Classification from PDF 1")
                    # Classify the financial tone of the selected paragraph from PDF 1
                    sentiment_btn1.click(fn=process_paragraph_1_sent, inputs=paragraph_1_dropdown, outputs=sentiment_textbox_1)
                with gr.Accordion("Analyze Financial Tone on each sentence"):
                    analyze_btn1 = gr.Button("With FinBERT-tone")
                    fin_spans_1 = gr.HighlightedText(label="Financial Tone Analysis for PDF 1", color_map=color_map, show_legend=True)
                    # Analyze the financial tone of each sentence using FinBERT-tone
                    analyze_btn1.click(fn=process_paragraph_1_sent_tone, inputs=paragraph_1_dropdown, outputs=fin_spans_1)
                    analyze_btn1_ = gr.Button("With ProsusAI/finbert")
                    fin_spans_1_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 1 (Bis)", color_map=color_map1, show_legend=True)
                    # Analyze the financial tone using ProsusAI/finbert
                    analyze_btn1_.click(fn=process_paragraph_1_sent_tone_bis, inputs=paragraph_1_dropdown, outputs=fin_spans_1_)
            # PDF 2 analysis section
            with gr.Column():
                gr.Markdown("### PDF 2 Analysis")
                selected_paragraph_2 = gr.Textbox(label="Selected Paragraph 2 Content", lines=4)
                paragraph_2_dropdown.select(fn=show2, inputs=paragraph_2_dropdown, outputs=selected_paragraph_2)
                close_paragraph_2 = gr.Textbox(label="Closest Paragraph from PDF 1 to selected Paragraph PDF 2", lines=4)
                paragraph_2_dropdown.select(
                    fn=lambda p, keyword: filtered_close_paragraph(p, keyword, "1"),  # Compare against the PDF 1 paragraphs
                    inputs=[paragraph_2_dropdown, keyword_input],
                    outputs=close_paragraph_2
                )
                with gr.Group():
                    summarize_btn2 = gr.Button("Summarize Text from PDF 2")
                    summary_textbox_2 = gr.Textbox(label="Summary for PDF 2", lines=2)
                    # Summarize the selected paragraph from PDF 2
                    summarize_btn2.click(fn=process_paragraph_2_sum, inputs=paragraph_2_dropdown, outputs=summary_textbox_2)
                    sentiment_btn2 = gr.Button("Classify Financial Tone for paragraph from PDF 2")
                    sentiment_textbox_2 = gr.Label(label="Classification from PDF 2")
                    # Classify the financial tone of the selected paragraph from PDF 2
                    sentiment_btn2.click(fn=process_paragraph_2_sent, inputs=paragraph_2_dropdown, outputs=sentiment_textbox_2)
                with gr.Accordion("Analyze Financial Tone on each sentence"):
                    analyze_btn2 = gr.Button("With FinBERT-tone")
                    fin_spans_2 = gr.HighlightedText(label="Financial Tone Analysis for PDF 2", color_map=color_map, show_legend=True)
                    # Analyze the financial tone of each sentence using FinBERT-tone for PDF 2
                    analyze_btn2.click(fn=process_paragraph_2_sent_tone, inputs=paragraph_2_dropdown, outputs=fin_spans_2)
                    analyze_btn2_ = gr.Button("With ProsusAI/finbert")
                    fin_spans_2_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 2 (Bis)", color_map=color_map1, show_legend=True)
                    # Analyze the financial tone using ProsusAI/finbert for PDF 2
                    analyze_btn2_.click(fn=process_paragraph_2_sent_tone_bis, inputs=paragraph_2_dropdown, outputs=fin_spans_2_)
with gr.Tab("Financial Report Table Analysis"):
# New tab content goes here
gr.Markdown("## Excel Data Comparison")
with gr.Row():
with gr.Column():
file1 = gr.Dropdown(choices=get_excel_files(PDF_FOLDER), label="Select Excel File 1")
file2 = gr.Dropdown(choices=get_excel_files(PDF_FOLDER), label="Select Excel File 2")
sheet = gr.Dropdown(choices=["GDP", "HICP", "RRE prices", "Unemployment", "CRE prices"], label="Select Sheet for File 1 and 2")
with gr.Column():
result = gr.Image(label="Comparison Plot")
#result = gr.BarPlot()
def update_sheets(file):
return get_sheet_names(file)
b1 = gr.Button("Compare Data")
b2 = gr.Button("Extract text information from PDFs")
with gr.Row():
with gr.Column():
sentiment_results_pdf1 = gr.HighlightedText(label="Sentiment Analysis - PDF 1",color_map=color_map, show_legend=True)
with gr.Column():
sentiment_results_pdf2 = gr.HighlightedText(label="Sentiment Analysis - PDF 2",color_map=color_map, show_legend=True)
with gr.Accordion("Adverse growth trends"):
with gr.Row():
with gr.Column():
country_1_dropdown = gr.Dropdown(label="Select Country from Excel File 1")
summarize_btn1_country = gr.Button("Summary for the selected country")
text_result_df1 = gr.Textbox(label="Sentence for excel file 1", lines=2)
summarize_btn1_country.click(fn=lambda country, theme: generate_text(stored_df1, country, theme),
inputs=[country_1_dropdown, sheet],
outputs=text_result_df1)
with gr.Column():
country_2_dropdown = gr.Dropdown(label="Select Country from Excel File 2")
summarize_btn2_country = gr.Button("Summary for the selected country")
text_result_df2 = gr.Textbox(label="Sentence for excel file 2", lines=2)
summarize_btn2_country.click(fn=lambda country, theme: generate_text(stored_df2, country, theme),
inputs=[country_2_dropdown, sheet],
outputs=text_result_df2)
# Button to extract text from PDFs and perform sentiment analysis
b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=[result ,country_1_dropdown, country_2_dropdown])
b2.click(fn=process_pdfs_and_analyze_sentiment, inputs=[file1, file2, sheet], outputs=[sentiment_results_pdf1, sentiment_results_pdf2])
with gr.Tab("Fed data analysis"):
gr.Markdown("## Sentiment Analysis Overview")
# Display DataFrame
df = pd.read_csv("data/2008_2024_minutes.csv", header = 0)
df['Total_paragraphs']=df['Total_paragraphs']-df['Neutral']
df['Positive_ratio'] = df['Positive'] / df['Total_paragraphs']*100
df['Negative_ratio'] = df['Negative'] / df['Total_paragraphs']*100
df['Date'] = pd.to_datetime(df['Date'])
start_date = df['Date'].min()
end_date = df['Date'].max()
#start = Calendar(value ="2008-01-01", type="string", label="Start")
#end = Calendar(value="2025-01-01",type="string", label="End")
#apply_btn = gr.Button("Apply", scale=0)
#reset_btn = gr.Button("Reset", scale=0)
# data_table = gr.DataFrame(value=df[['Date', 'Positive_ratio', 'Negative_ratio', 'Total_paragraphs']], label="Sentiment Data", height=500)
# Pivot the DataFrame
#melted_df = df.melt(id_vars='Date', value_vars=['Positive_ratio', 'Negative_ratio'],
# var_name='Ratio_Type', value_name='Rate')
# Line plot for the ratios
line_plot = gr.LinePlot(
df,
x='Date',
y='Positive_ratio',
title="Positive Rate Over Time",
y_lim=[0, 100], # Limit y-axis to 0-1 since it's a ratio
#color = 'Ratio_Type'
)
#apply_btn.click(lambda start,end: gr.LinePlot(x_lim=[start, end]), [start, end], line_plot)
#reset_btn.click(lambda : gr.LinePlot(x_lim=[start_date, end_date]), [], line_plot)
demo.launch() |