File size: 35,020 Bytes
08b59ae
 
a54501c
cbcd9d2
08b59ae
 
37af58a
22db5e3
 
 
 
 
87561e3
6313df2
08b59ae
 
 
12cfba7
 
08b59ae
 
 
 
 
 
 
 
 
 
 
 
bf1f436
07370e4
08b59ae
 
 
 
 
7708f6b
ef0e117
 
 
7b7ff1a
08b59ae
 
 
bf1f436
 
 
08b59ae
24621a7
bf9c349
1e43982
 
 
 
 
 
 
12cfba7
9c2be66
08b59ae
 
 
 
 
 
 
 
 
 
1c10bf3
d774f5b
08b59ae
 
 
 
 
9c2be66
10f2423
 
9c2be66
08b59ae
 
ba7023b
 
 
e2a28f9
ba7023b
09e491a
 
b4cc8be
 
09e491a
e2a28f9
 
 
 
ba7023b
 
 
 
 
 
24bd8b1
 
 
9936dad
d8e9358
 
 
 
 
 
 
 
 
 
e1ef465
 
 
b4cc8be
e1ef465
 
 
 
 
 
 
b4cc8be
ef0e117
9e963c0
 
 
4166ff1
9e963c0
 
 
e1ef465
99d35a2
e1ef465
 
 
b4cc8be
e1ef465
 
 
 
 
 
 
b4cc8be
e1ef465
 
 
 
 
 
 
b4cc8be
e1ef465
 
 
 
 
 
 
b4cc8be
ef0e117
9e963c0
 
 
 
 
 
e1ef465
bebfc38
e1ef465
 
 
b4cc8be
e1ef465
 
 
 
 
 
 
b4cc8be
e1ef465
 
 
 
08b59ae
 
 
aa4c32a
626cc85
 
b4cc8be
aa4c32a
 
 
 
 
 
b4cc8be
d9b2ec2
626cc85
 
22db5e3
 
 
 
 
862aea5
6d27c94
22db5e3
9995b35
22db5e3
 
 
 
 
 
3d407d8
22db5e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9995b35
 
22db5e3
 
9995b35
 
 
22db5e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50d3706
22db5e3
 
 
 
 
8ccce28
1413861
8ccce28
 
8fd6111
 
0a5d324
22db5e3
9c2be66
 
 
 
524e417
9c2be66
 
 
 
 
 
 
524e417
9c2be66
524e417
9c2be66
 
 
e5c62b5
 
8ed5868
 
 
e5c62b5
 
 
 
 
 
 
9c2be66
e5c62b5
9c2be66
e5c62b5
 
 
 
 
8ed5868
 
e5c62b5
 
a070863
 
e5c62b5
 
6a2554d
 
9995b35
 
 
1807b97
 
 
1e70f78
 
 
 
 
 
 
 
 
 
 
a9d6164
 
1e70f78
9995b35
4dd4ffb
f64bab8
1e70f78
 
 
a37f3d4
b90a585
9481c31
9995b35
7d68525
9481c31
9995b35
4dea577
d9ebe36
4dea577
a1835a8
922cb11
 
 
1e6310d
ae1af52
 
5affe9a
922cb11
9c67e28
6006ca3
4dea577
ae1af52
93fbdd1
6006ca3
4dea577
1bfee56
efa46f3
ae8ac16
1bfee56
6cfe921
 
 
f64bab8
1bfee56
03106c5
 
1bfee56
6cfe921
1bfee56
ec643ee
 
1bfee56
efa46f3
385d234
 
 
161ab32
385d234
 
6a4d0f0
385d234
6a4d0f0
385d234
 
 
 
fae8526
6a4d0f0
 
385d234
6a4d0f0
ec643ee
6a4d0f0
6cfe921
9995b35
03106c5
 
9995b35
 
08b59ae
 
d8e9358
 
9995b35
 
 
035782a
 
edb7081
 
 
 
 
 
 
 
 
 
 
255a8fe
 
 
1722df0
 
 
255a8fe
 
a175142
 
 
 
 
 
7d78b9a
edb7081
3373145
019952c
 
 
 
 
 
550e68d
019952c
550e68d
019952c
 
 
 
 
 
 
 
 
 
 
161ab32
019952c
787788a
4810f30
22db5e3
 
d5ac762
22db5e3
4810f30
5f377cc
22db5e3
 
12cfba7
22db5e3
4810f30
d5ac762
22db5e3
 
ba7023b
 
 
 
 
ef11d76
29ad3cb
e2a28f9
4810f30
22db5e3
d5ac762
b4cc8be
9c2be66
b4cc8be
d5ac762
 
 
4810f30
d5ac762
137833b
22db5e3
137833b
22db5e3
d72a722
 
 
aa4c32a
5cbe64a
5ab1c16
d8e9358
 
5ab1c16
 
f203192
 
 
 
 
 
 
6dbee67
 
f203192
 
 
 
c14f6ed
 
1722df0
c14f6ed
 
07df4f0
c14f6ed
 
a175142
c14f6ed
 
 
d72a722
 
22db5e3
d72a722
 
 
aa4c32a
5cbe64a
5ab1c16
d8e9358
 
5ab1c16
 
f203192
 
 
 
 
 
 
 
 
 
6dbee67
 
f203192
 
 
c14f6ed
 
1722df0
c14f6ed
 
 
 
 
a175142
c14f6ed
 
 
137833b
22db5e3
 
d5ac762
 
22db5e3
 
 
3d407d8
 
d5ac762
 
9c2be66
f4bb53f
 
d5ac762
 
 
 
 
f324e34
d5ac762
 
 
a070863
f324e34
d5ac762
a070863
f324e34
d5dafb7
 
f324e34
 
 
 
 
 
 
 
 
 
 
 
 
 
d5ac762
b2ec946
d5ac762
87bb9b5
 
 
c473846
fd8e3d8
1d6377a
 
ea79eb7
540c0a1
 
8921c06
 
 
 
52a0e3f
af47f62
ea79eb7
 
87bb9b5
 
ea79eb7
87bb9b5
ea79eb7
 
7d9c0a4
ea79eb7
87bb9b5
0e89bd7
 
d5ac762
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
import os
import gradio as gr
from gradio_calendar import Calendar
from transformers import pipeline
import spacy
import lib.read_pdf
import lib.comparison
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import io
import shutil

# Load the small English spaCy pipeline and add the 'sentencizer' component;
# split_in_sentences() iterates over the resulting doc.sents.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('sentencizer')
# Folder that uploaded files are copied into and that PDF/Excel files are
# read from (see the os.path.join(PDF_FOLDER, ...) calls below).
PDF_FOLDER = "data"
def split_in_sentences(text):
    """Return the sentences spaCy finds in *text*, each stripped of surrounding whitespace."""
    sentences = []
    for sentence in nlp(text).sents:
        sentences.append(str(sentence).strip())
    return sentences

def make_spans(text, results):
    """Pair every sentence of *text* with the ``'label'`` of the matching entry in *results*.

    Returns a list of ``(sentence, label)`` tuples; pairing is positional and
    stops at the shorter of the two sequences.
    """
    labels = [entry['label'] for entry in results]
    sentences = split_in_sentences(text)
    return list(zip(sentences, labels))

# Initialize pipelines (each call loads the named Hugging Face model at import time)
# Summarization model used by summarize_text().
summarizer = pipeline("summarization", model="human-centered-summarization/financial-summarization-pegasus")
# Sentiment model used by text_to_sentiment() and fin_ext().
fin_model = pipeline("sentiment-analysis", model='yiyanghkust/finbert-tone', tokenizer='yiyanghkust/finbert-tone')
# Second sentiment model used by fin_ext_bis().
fin_model_bis = pipeline("sentiment-analysis", model='ProsusAI/finbert', tokenizer='ProsusAI/finbert')
# Text-to-text model used by generate_text() to describe a table row.
table_to_text = pipeline('text2text-generation', model='google/flan-t5-xl')

def summarize_text(text):
    """Run the summarization pipeline on *text* and return the first summary string."""
    first_result = summarizer(text)[0]
    return first_result['summary_text']

def text_to_sentiment(text, all_score=False, label=True):
    """Classify *text* with the ``fin_model`` sentiment pipeline.

    Args:
        text: Text handed to the pipeline.
        all_score: Forwarded to the pipeline as ``return_all_scores``.
        label: When true, return only the ``"label"`` of the first result;
            otherwise return the raw pipeline output.
    """
    prediction = fin_model(text, return_all_scores=all_score)
    return prediction[0]["label"] if label else prediction
def fin_ext(text):
    """Label each sentence of *text* with ``fin_model`` and return ``(sentence, label)`` pairs."""
    sentences = split_in_sentences(text)
    predictions = fin_model(sentences)
    return make_spans(text, predictions)
def fin_ext_bis(text):
    """Label each sentence of *text* with ``fin_model_bis`` and return ``(sentence, label)`` pairs."""
    sentences = split_in_sentences(text)
    predictions = fin_model_bis(sentences)
    return make_spans(text, predictions)

def upload_file_and_update_dropdown(files):
    """Copy uploaded files into ``PDF_FOLDER`` and refresh two dropdowns.

    Args:
        files: The upload component's value. Accepts ``None`` (nothing
            uploaded), a single item, or a list of items. Each item may be a
            path string or an object exposing the path as ``.name``.

    Returns:
        Two ``gr.update`` objects whose ``choices`` are the current contents
        of ``PDF_FOLDER``.
    """
    # The original assumed the folder already existed; create it on demand so
    # the first upload on a fresh checkout does not fail.
    os.makedirs(PDF_FOLDER, exist_ok=True)

    if files is None:
        files = []
    elif not isinstance(files, (list, tuple)):
        # A single-file upload is not wrapped in a list.
        files = [files]

    for file in files:
        if file is None:
            continue
        # Resolve the path once: temp-file objects expose it as ``.name``,
        # plain strings are the path themselves.
        source_path = getattr(file, "name", file)
        target_path = os.path.join(PDF_FOLDER, os.path.basename(source_path))
        shutil.copyfile(source_path, target_path)

    # Get the updated list of files
    files_list = os.listdir(PDF_FOLDER)
    return gr.update(choices=files_list), gr.update(choices=files_list)

def extract_and_paragraph(pdf1, pdf2, paragraph):
    """Extract the relevant text of two PDFs stored in ``PDF_FOLDER``.

    Args:
        pdf1: File name of the first PDF.
        pdf2: File name of the second PDF.
        paragraph: When truthy, the extracted text of each PDF is further
            split with ``split_text_into_paragraphs(..., 200)``.

    Returns:
        A pair ``(text_1, text_2)``; ``([], [])`` when either name is missing.
    """
    if not (pdf1 and pdf2):
        return [], []

    # Section markers delimiting the part of each report that is kept.
    start_keyword = ["Main risks to", "Developments in Financial Markets"]
    end_keywords = ["4. Appendix", "Annex:", "4. Annex", "Detailed tables", "ACKNOWLEDGEMENTS", "STATISTICAL ANNEX", "PROSPECTS BY MEMBER STATES", "At the conclusion of the discussion"]

    paths = [os.path.join(PDF_FOLDER, name) for name in (pdf1, pdf2)]

    # Each stage is applied to the first document, then to the second.
    documents = [lib.read_pdf.extract_and_format_paragraphs(path) for path in paths]
    ranges = [
        lib.read_pdf.find_text_range(document, start_keyword, end_keywords)
        for document in documents
    ]
    documents = [
        lib.read_pdf.extract_relevant_text(document, start_index, end_index)
        for document, (start_index, end_index) in zip(documents, ranges)
    ]
    if paragraph:
        documents = [
            lib.read_pdf.split_text_into_paragraphs(document, 200)
            for document in documents
        ]

    first, second = documents
    return first, second

# Filter
def filter_paragraphs(keyword):
    """Restrict both paragraph lists to those containing *keyword*.

    Updates the module-level ``filter_paragraphs_1`` / ``filter_paragraphs_2``
    and returns two ``gr.update`` objects carrying the matching dropdown
    choices with the selection reset to ``None``. An empty keyword restores
    the full lists. Matching is case-insensitive.
    """
    global stored_paragraphs_1, stored_paragraphs_2
    global filter_paragraphs_1, filter_paragraphs_2

    def as_choices(paragraphs):
        # Dropdown label: 1-based position plus the first 100 characters.
        return [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(paragraphs)]

    if not keyword:
        # No keyword entered: show the original lists.
        choices_1 = as_choices(stored_paragraphs_1)
        choices_2 = as_choices(stored_paragraphs_2)
        filter_paragraphs_1 = stored_paragraphs_1
        filter_paragraphs_2 = stored_paragraphs_2
    else:
        needle = keyword.lower()
        filter_paragraphs_1 = [p for p in stored_paragraphs_1 if needle in p.lower()]
        filter_paragraphs_2 = [p for p in stored_paragraphs_2 if needle in p.lower()]
        choices_1 = as_choices(filter_paragraphs_1)
        choices_2 = as_choices(filter_paragraphs_2)

    return gr.update(choices=choices_1, value=None), gr.update(choices=choices_2, value=None)
    
def clear_paragraphs():
    """Reset both paragraph dropdowns to the full, unfiltered lists.

    Also resets ``filter_paragraphs_1`` / ``filter_paragraphs_2`` to the stored
    lists. The dropdown labels are numbered by position, and the selection
    handlers resolve that number against the ``filter_paragraphs_*`` lists, so
    leaving a stale filter in place would map a label to the wrong paragraph.

    Returns:
        Two ``gr.update`` objects with the full choices and no selection.
    """
    global stored_paragraphs_1, stored_paragraphs_2
    global filter_paragraphs_1, filter_paragraphs_2
    filter_paragraphs_1 = stored_paragraphs_1
    filter_paragraphs_2 = stored_paragraphs_2
    paragraph1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
    paragraph2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
    return gr.update(choices=paragraph1, value=None), gr.update(choices=paragraph2, value=None)
    
def filtered_close_paragraph(p, keyword, pdf):
    """Compare paragraph *p* against one of the module-level paragraph lists.

    The list is chosen by *pdf* (``"1"`` selects the ``*_1`` list, anything
    else the ``*_2`` list) and by *keyword* (truthy selects the filtered list,
    falsy the stored one). The comparison itself is delegated to
    ``lib.comparison.compare_selected_paragraph``.
    """
    use_first = pdf == "1"
    if keyword:
        candidates = filter_paragraphs_1 if use_first else filter_paragraphs_2
    else:
        candidates = stored_paragraphs_1 if use_first else stored_paragraphs_2
    return lib.comparison.compare_selected_paragraph(p, candidates)
def process_paragraph_1_sum(paragraph):
    """Summarize the paragraph of the first document named by a dropdown label.

    Args:
        paragraph: Label of the form ``"Paragraph <n>: <preview>..."``, or
            ``None`` / empty when nothing is selected.

    Returns:
        The summary from ``summarize_text``, or ``"Error"`` when the label is
        missing, cannot be parsed, or does not refer to an entry of
        ``filter_paragraphs_1``.
    """
    if not paragraph:
        # No selection: previously None.split() raised an uncaught AttributeError.
        return "Error"
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        if paragraph_index < 0:
            # Labels are 1-based; do not let "Paragraph 0" wrap to the last item.
            raise IndexError(paragraph_index)
        return summarize_text(filter_paragraphs_1[paragraph_index])
    except (IndexError, ValueError):
        return "Error"
def process_paragraph_1_sent(paragraph):
    """Return sentiment scores for the paragraph of the first document named by a label.

    Args:
        paragraph: Label of the form ``"Paragraph <n>: <preview>..."``, or
            ``None`` / empty when nothing is selected.

    Returns:
        A ``{label: score}`` dict built from the first entry of the
        ``text_to_sentiment`` output, or
        ``{"Error": "Unexpected output format"}`` when the label is missing or
        invalid, or the model output is not a list of lists.
    """
    error_output = {"Error": "Unexpected output format"}
    if not paragraph:
        # No selection: previously None.split() raised an uncaught AttributeError.
        return error_output
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        if paragraph_index < 0:
            # Labels are 1-based; do not let "Paragraph 0" wrap to the last item.
            raise IndexError(paragraph_index)
        selected_paragraph = filter_paragraphs_1[paragraph_index]
        results = text_to_sentiment(selected_paragraph, True, False)
        if isinstance(results, list) and isinstance(results[0], list):
            # We unpack the list of dictionaries to get all labels
            return {result['label']: result['score'] for result in results[0]}
        return error_output
    except (IndexError, ValueError):
        return error_output
def process_paragraph_1_sent_tone(paragraph):
    """Return per-sentence ``fin_ext`` labels for a paragraph of the first document.

    Args:
        paragraph: Label of the form ``"Paragraph <n>: <preview>..."``, or
            ``None`` / empty when nothing is selected.

    Returns:
        The ``(sentence, label)`` list from ``fin_ext``, or ``[]`` when the
        label is missing, cannot be parsed, or does not refer to an entry of
        ``filter_paragraphs_1``.
    """
    if not paragraph:
        # No selection: previously None.split() raised an uncaught AttributeError.
        return []
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        if paragraph_index < 0:
            # Labels are 1-based; do not let "Paragraph 0" wrap to the last item.
            raise IndexError(paragraph_index)
        return fin_ext(filter_paragraphs_1[paragraph_index])
    except (IndexError, ValueError):
        return []
def process_paragraph_1_sent_tone_bis(paragraph):
    """Return per-sentence ``fin_ext_bis`` labels for a paragraph of the first document.

    Args:
        paragraph: Label of the form ``"Paragraph <n>: <preview>..."``, or
            ``None`` / empty when nothing is selected.

    Returns:
        The ``(sentence, label)`` list from ``fin_ext_bis``, or ``[]`` when the
        label is missing, cannot be parsed, or does not refer to an entry of
        ``filter_paragraphs_1``.
    """
    if not paragraph:
        # No selection: previously None.split() raised an uncaught AttributeError.
        return []
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        if paragraph_index < 0:
            # Labels are 1-based; do not let "Paragraph 0" wrap to the last item.
            raise IndexError(paragraph_index)
        return fin_ext_bis(filter_paragraphs_1[paragraph_index])
    except (IndexError, ValueError):
        return []
def process_paragraph_2_sum(paragraph):
    """Summarize the paragraph of the second document named by a dropdown label.

    Args:
        paragraph: Label of the form ``"Paragraph <n>: <preview>..."``, or
            ``None`` / empty when nothing is selected.

    Returns:
        The summary from ``summarize_text``, or ``"Error"`` when the label is
        missing, cannot be parsed, or does not refer to an entry of
        ``filter_paragraphs_2``.
    """
    if not paragraph:
        # No selection: previously None.split() raised an uncaught AttributeError.
        return "Error"
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        if paragraph_index < 0:
            # Labels are 1-based; do not let "Paragraph 0" wrap to the last item.
            raise IndexError(paragraph_index)
        return summarize_text(filter_paragraphs_2[paragraph_index])
    except (IndexError, ValueError):
        return "Error"
def process_paragraph_2_sent(paragraph):
    """Return sentiment scores for the paragraph of the second document named by a label.

    Args:
        paragraph: Label of the form ``"Paragraph <n>: <preview>..."``, or
            ``None`` / empty when nothing is selected.

    Returns:
        A ``{label: score}`` dict built from the first entry of the
        ``text_to_sentiment`` output, or
        ``{"Error": "Unexpected output format"}`` when the label is missing or
        invalid, or the model output is not a list of lists.
    """
    error_output = {"Error": "Unexpected output format"}
    if not paragraph:
        # No selection: previously None.split() raised an uncaught AttributeError.
        return error_output
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        if paragraph_index < 0:
            # Labels are 1-based; do not let "Paragraph 0" wrap to the last item.
            raise IndexError(paragraph_index)
        selected_paragraph = filter_paragraphs_2[paragraph_index]
        results = text_to_sentiment(selected_paragraph, True, False)
        if isinstance(results, list) and isinstance(results[0], list):
            # We unpack the list of dictionaries to get all labels
            return {result['label']: result['score'] for result in results[0]}
        return error_output
    except (IndexError, ValueError):
        return error_output
def process_paragraph_2_sent_tone(paragraph):
    """Return per-sentence ``fin_ext`` labels for a paragraph of the second document.

    Args:
        paragraph: Label of the form ``"Paragraph <n>: <preview>..."``, or
            ``None`` / empty when nothing is selected.

    Returns:
        The ``(sentence, label)`` list from ``fin_ext``, or ``[]`` when the
        label is missing, cannot be parsed, or does not refer to an entry of
        ``filter_paragraphs_2``.
    """
    if not paragraph:
        # No selection: previously None.split() raised an uncaught AttributeError.
        return []
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        if paragraph_index < 0:
            # Labels are 1-based; do not let "Paragraph 0" wrap to the last item.
            raise IndexError(paragraph_index)
        return fin_ext(filter_paragraphs_2[paragraph_index])
    except (IndexError, ValueError):
        return []
def process_paragraph_2_sent_tone_bis(paragraph):
    """Return per-sentence ``fin_ext_bis`` labels for a paragraph of the second document.

    Args:
        paragraph: Label of the form ``"Paragraph <n>: <preview>..."``, or
            ``None`` / empty when nothing is selected.

    Returns:
        The ``(sentence, label)`` list from ``fin_ext_bis``, or ``[]`` when the
        label is missing, cannot be parsed, or does not refer to an entry of
        ``filter_paragraphs_2``.
    """
    if not paragraph:
        # No selection: previously None.split() raised an uncaught AttributeError.
        return []
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        if paragraph_index < 0:
            # Labels are 1-based; do not let "Paragraph 0" wrap to the last item.
            raise IndexError(paragraph_index)
        return fin_ext_bis(filter_paragraphs_2[paragraph_index])
    except (IndexError, ValueError):
        return []
def get_pdf_files(folder):
    """Return the names in *folder* that end with ``.pdf`` (case-sensitive)."""
    pdf_names = []
    for entry in os.listdir(folder):
        if entry.endswith('.pdf'):
            pdf_names.append(entry)
    return pdf_names

def show1(paragraph):
    """Return the full text of the first-document paragraph named by a dropdown label.

    Args:
        paragraph: Label of the form ``"Paragraph <n>: <preview>..."``, or
            ``None`` / empty when nothing is selected.

    Returns:
        The matching entry of ``filter_paragraphs_1``, or ``"Error"`` when the
        label is missing, cannot be parsed, or is out of range.
    """
    if not paragraph:
        # No selection: previously None.split() raised an uncaught AttributeError.
        return "Error"
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        if paragraph_index < 0:
            # Labels are 1-based; do not let "Paragraph 0" wrap to the last item.
            raise IndexError(paragraph_index)
        return filter_paragraphs_1[paragraph_index]
    except (IndexError, ValueError):
        return "Error"
def show2(paragraph):
    """Return the full text of the second-document paragraph named by a dropdown label.

    Args:
        paragraph: Label of the form ``"Paragraph <n>: <preview>..."``, or
            ``None`` / empty when nothing is selected.

    Returns:
        The matching entry of ``filter_paragraphs_2``, or ``"Error"`` when the
        label is missing, cannot be parsed, or is out of range.
    """
    if not paragraph:
        # No selection: previously None.split() raised an uncaught AttributeError.
        return "Error"
    try:
        paragraph_index = int(paragraph.split(':')[0].replace('Paragraph ', '')) - 1
        if paragraph_index < 0:
            # Labels are 1-based; do not let "Paragraph 0" wrap to the last item.
            raise IndexError(paragraph_index)
        return filter_paragraphs_2[paragraph_index]
    except (IndexError, ValueError):
        return "Error"

def get_excel_files(folder):
    """Return the names in *folder* that end with ``.xlsx`` (case-sensitive)."""
    excel_names = []
    for entry in os.listdir(folder):
        if entry.endswith('.xlsx'):
            excel_names.append(entry)
    return excel_names

def get_sheet_names(file):
    """Return a dropdown update listing the sheet names of an Excel file.

    Args:
        file: Name of a workbook inside ``PDF_FOLDER``; ``None`` or empty when
            nothing is selected.

    Returns:
        ``gr.update(choices=...)`` with the workbook's sheet names, or with an
        empty list when no file is selected.
    """
    if not file:
        # Nothing selected: os.path.join would raise on None.
        return gr.update(choices=[])
    # The context manager closes the workbook handle, which was previously leaked.
    with pd.ExcelFile(os.path.join(PDF_FOLDER, file)) as xls:
        sheet_names = list(xls.sheet_names)
    return gr.update(choices=sheet_names)

    
def process_and_compare(file1, sheet1, file2, sheet2):
    """Plot the per-country difference in cumulative adverse growth between two workbooks.

    Each workbook is read from ``PDF_FOLDER``. Its file name must contain a
    four-digit year, which determines the expected column names. The plotted
    value is ``Adverse Cumulative`` of *file2* minus that of *file1*.

    Side effects:
        Sets the module-level ``stored_df1`` / ``stored_df2`` to the
        ``Country`` + adverse columns of each workbook and writes the chart to
        ``output/plot.png``.

    Args:
        file1: File name of the first workbook.
        sheet1: Sheet to read from *file1*; also used in the chart title.
        file2: File name of the second workbook.
        sheet2: Sheet to read from *file2*.

    Returns:
        ``(plot_path, update_1, update_2)`` where the updates carry the
        selectable country names of each workbook.

    Raises:
        ValueError: If a file name contains no four-digit year, or a sheet
            does not have the expected number of columns.
    """
    def extract_year(file_name):
        # The year in the file name drives the expected column names.
        match = re.search(r'(\d{4})', file_name)
        if match is None:
            raise ValueError(f"No four-digit year found in file name {file_name!r}.")
        return int(match.group(1))

    def process_file(file_path, sheet_name, year):
        # Load the Excel file
        df = pd.read_excel(os.path.join(PDF_FOLDER, file_path), sheet_name=sheet_name, index_col=0)

        # Define expected columns based on extracted year
        historical_col = f'Historical {year - 1}'
        baseline_cols = [f'Baseline {year}', f'Baseline {year + 1}', f'Baseline {year + 2}']
        adverse_cols = [f'Adverse {year}', f'Adverse {year + 1}', f'Adverse {year + 2}']
        level_deviation_col = f'Level Deviation {year + 2}'

        # Drop the first four rows and reset index
        df = df.iloc[4:].reset_index(drop=True)

        # Define the new column names
        new_columns = ['Country', 'Code', historical_col] + baseline_cols + adverse_cols + ['Adverse Cumulative', 'Adverse Minimum', level_deviation_col]

        # Ensure the number of columns matches
        if len(df.columns) != len(new_columns):
            raise ValueError(f"Expected {len(new_columns)} columns, but found {len(df.columns)} columns in the data.")
        df.columns = new_columns

        columns = ['Country'] + adverse_cols + ['Adverse Cumulative']
        return df, df[columns]

    def selectable_countries(frame):
        # Keep short, non-missing names only for the dropdown.
        return [
            country for country in frame.Country.values.tolist()
            if len(str(country)) < 20 and str(country) != "nan"
        ]

    year1 = extract_year(file1)
    year2 = extract_year(file2)

    # Process both files
    global stored_df1, stored_df2
    df1, stored_df1 = process_file(file1, sheet1, year1)
    df2, stored_df2 = process_file(file2, sheet2, year2)

    # Merge dataframes on 'Country'. The suffixes follow the operand order
    # (left = file2, right = file1). The original attached the year-1 suffix to
    # file2's columns, which flipped the sign of the difference. Fixed suffixes
    # also avoid a collision when both files share the same year.
    merged_df = pd.merge(df2, df1, on='Country', suffixes=('_second', '_first'))
    merged_df['Difference adverse cumulative growth'] = (
        merged_df['Adverse Cumulative_second'] - merged_df['Adverse Cumulative_first']
    )
    # Ensure data types are correct
    merged_df['Country'] = merged_df['Country'].astype(str)
    merged_df['Difference adverse cumulative growth'] = pd.to_numeric(merged_df['Difference adverse cumulative growth'], errors='coerce')

    # Create histogram plot with color coding
    fig, ax = plt.subplots(figsize=(12, 8))
    palette = plt.get_cmap('tab20').colors  # Use a colormap with multiple colors
    num_countries = len(merged_df['Country'])
    # Cycle the palette explicitly so bars and legend entries stay paired when
    # there are more countries than colours.
    bar_colors = [palette[i % len(palette)] for i in range(num_countries)]

    ax.bar(merged_df['Country'], merged_df['Difference adverse cumulative growth'], color=bar_colors)

    # Add a legend
    handles = [patches.Patch(color=color, label=country) for color, country in zip(bar_colors, merged_df['Country'])]
    ax.legend(handles=handles, title='Countries', bbox_to_anchor=(1.05, 1), loc='upper left')

    ax.set_title(f'Histogram of Difference between Adverse cumulative growth of {year2} and {year1} for {sheet1}')
    ax.set_xlabel('Country')
    ax.set_ylabel('Difference')
    plt.xticks(rotation=90)

    # Save plot to a file, creating the output directory if needed
    file_path = 'output/plot.png'
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    fig.savefig(file_path, format='png', bbox_inches='tight')
    plt.close(fig)

    return (
        file_path,
        gr.update(choices=selectable_countries(stored_df1)),
        gr.update(choices=selectable_countries(stored_df2)),
    )

def find_sentences_with_keywords(text, keywords):
    """Return the unique sentences of *text* that contain any keyword as a whole word.

    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace.
    Matching is case-insensitive. Results keep their order of first
    appearance in *text*, so the output is deterministic (the previous
    ``list(set(...))`` order varied between runs).

    Args:
        text: Text to search; an empty or missing value yields ``[]``.
        keywords: Iterable of keyword strings; an empty one yields ``[]``.

    Returns:
        A list of matching sentences without duplicates.
    """
    if not text or not keywords:
        return []

    # Split text into sentences using regular expression to match sentence-ending punctuation
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Compile each whole-word pattern once instead of once per keyword pass.
    patterns = [
        re.compile(rf'\b{re.escape(keyword)}\b', re.IGNORECASE)
        for keyword in keywords
    ]

    matched_sentences = []
    seen = set()
    for sentence in sentences:
        if sentence in seen:
            continue
        if any(pattern.search(sentence) for pattern in patterns):
            seen.add(sentence)
            matched_sentences.append(sentence)
    return matched_sentences


# Main function to process both PDFs based on the Excel file names and the sheet name
def process_pdfs_and_analyze_sentiment(file1, file2, sheet):
    """Run sentence-level sentiment on the topic-related sentences of two PDFs.

    The PDF names are derived from the Excel file names by replacing
    ``.xlsx`` with ``.pdf``. *sheet* selects the keywords used to pick
    sentences; an unknown sheet selects none.

    Returns:
        A pair with the ``fin_ext`` output for each PDF.
    """
    # Keywords searched for each sheet name (topic).
    keywords_by_sheet = {
        'GDP': ['GDP'],
        'HICP': ['HICP'],
        'RRE prices': ['RRE', 'residential'],
        'CRE prices': ['CRE', 'commercial'],
        'Unemployment': ['unemployment'],
    }

    # Extract the text of both PDFs without splitting it into paragraphs.
    pdf_names = [name.replace(".xlsx", ".pdf") for name in (file1, file2)]
    text1, text2 = extract_and_paragraph(pdf_names[0], pdf_names[1], False)

    selected_keywords = keywords_by_sheet.get(sheet, [])

    # Find the matching sentences of each PDF, then score them.
    matched = [
        find_sentences_with_keywords(text, selected_keywords)
        for text in (text1, text2)
    ]
    joined = ["\n".join(sentences) for sentences in matched]
    result_pdf1, result_pdf2 = [fin_ext(block) for block in joined]

    return result_pdf1, result_pdf2
#def change_choices(df):
#    return gr.update(choices=df.Country.values.tolist())
    
def generate_text(df, country, theme):
    """Generate a short description of one country's adverse-growth row.

    The caller's DataFrame is no longer modified: columns are renamed on a
    copy (the original renamed the columns of the passed-in frame in place).

    Args:
        df: DataFrame with five columns, the first being ``Country`` and the
            second carrying a four-digit year in its name. The columns are
            relabelled ``Country, <year>, <year+1>, <year+2>, Total``.
        country: Value of the ``Country`` column selecting the row.
        theme: Topic name. Kept for interface compatibility; the prompt in
            use does not reference it.

    Returns:
        The ``generated_text`` of the first ``table_to_text`` result.

    Raises:
        IndexError: If no row matches *country*.
    """
    def format_row_for_prompt(row):
        # One "column: value" line per column; numeric values get a percent sign.
        formatted_row = []
        for col, value in row.items():
            if col != 'Country' and isinstance(value, (int, float)):
                formatted_row.append(f"{col}: {value:.6f}%")
            else:
                formatted_row.append(f"{col}: {value}")
        return "\n".join(formatted_row)

    year = int(re.search(r'(\d{4})', df.columns[1]).group(1))
    # Work on a copy so the caller's frame keeps its column names.
    table = df.copy()
    table.columns = ['Country', f'{year}', f'{year+1}', f'{year+2}', 'Total']
    row = table[table['Country'] == country].iloc[0]
    row_str = format_row_for_prompt(row)

    # NOTE(review): the prompt contains typos ("Anwser", "provibe"); it is kept
    # verbatim because changing the text changes what the model receives.
    prompt = f"""
    Example:
    
    Country: Australia
    1990: -0.43%
    1991: -1.99%
    1992: -1.20%
    Total: -3.57%
    
    Anwser:
    In the adverse scenario, the growth in Australia was -0.43% in 1990. It worsened to -1.99% in 1991 and slightly improved to -1.20% in 1992. The total cumulative adverse growth was -3.57% from 1990 to 1992.
    
    Now, using the following data in {country}, describe and provibe how the adverse growth changed each year, whether it increased or decreased, worsened or improved:
    {row_str}

    Answer:
    """
    # Generate the descriptive text using the model
    result = table_to_text(prompt, max_length=240)[0]['generated_text']
    return result
# Global variables shared between the callbacks above.
# Paragraph lists of the two selected PDFs; read by filter_paragraphs(),
# clear_paragraphs() and filtered_close_paragraph().
stored_paragraphs_1 = []
stored_paragraphs_2 = []
# Currently displayed (possibly keyword-filtered) paragraphs; set by
# filter_paragraphs() and indexed by the process_paragraph_* / show* functions.
filter_paragraphs_1 = []
filter_paragraphs_2 = []
# Replaced by process_and_compare() with the Country + adverse columns of each
# workbook; they start out as empty lists.
stored_df1 = []
stored_df2 = []

# NOTE(review): not referenced in the visible part of this file — confirm it is still used.
current_theme = {"dark": False}


# JavaScript passed to gr.Blocks(js=...): reloads the page with the
# `__theme=dark` query parameter unless it is already set.
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

# Define custom colors for the labels
# (capitalized label names; presumably matches one sentiment model's output — verify)
color_map = {
    "Positive": "green",  # Green for positive
    "Neutral": "blue",   # Blue for neutral
    "Negative": "red"   # Red for negative
}

# Same colors keyed by lowercase label names
# (presumably for the other sentiment model — verify against its usage).
color_map1 = {
    "positive": "green",  # Green for positive
    "neutral": "blue",   # Blue for neutral
    "negative": "red"   # Red for negative
}

    
# Build the whole UI. `js_func` is passed as page-level JavaScript; every
# component created inside this context is attached to `demo`.
with gr.Blocks(theme='gradio/soft',js=js_func) as demo:
    # Static help page describing what each analysis tab does.
    with gr.Tab("Methodology"):
        gr.Markdown("""
        ## Macro-economy Adverse Scenario Comparison from EBA Reports

        This application allows the user to compare two reports from text contents or from tables, and to plot the positive-sentiment rate of the Fed data over time. It's divided into three tabs.

        **First Tab: Text Comparisons**
        - It handles EBA and Federal Open Market Committee report files. Don't modify the Federal file names.
        - Select two PDFs. Each PDF's text content will be extracted into paragraphs.
        - You can choose a keyword to filter paragraphs.
        - Select a paragraph from one PDF, and find the most similar paragraph from the other PDF using a specific method.
        - For a selected paragraph, compute summarization using the **FinPEGASUS model**.
        - For a selected paragraph, compute sentiment analysis of the paragraph, and for each sentence, classify into three classes (Positive, Negative, Neutral) using two different fine-tuned **FinBERT models**:
          - [ProsusAI/finbert](https://huggingface.co/ProsusAI/finbert)
          - [yiyanghkust/finbert-tone](https://huggingface.co/yiyanghkust/finbert-tone)

        **Second Tab: Table Comparisons**

        - Select two Excel files and a sheet name.
        - For the two selected tables, compute the difference of the cumulative adverse growth rate over their respective three years for the selected sheet name (topic).
        - For the selected topic (sheet name), find related sentences in the associated PDF text that mention the topic, and classify them by sentiment.
        - For a selected country and topic, describe the adverse growth rate trend over three years using the [**google/flan-t5-xl**](https://huggingface.co/google/flan-t5-xl).

        **Third Tab: Fed Data Analysis**

        - Plot the rate of positive paragraphs over time from `data/2008_2024_minutes.csv`; neutral paragraphs are excluded from the total before the rate is computed.
        """)
    # Text-analysis tab: pick two PDFs, extract their paragraphs, optionally
    # filter them by keyword. Components appear in the order they are created.
    with gr.Tab("Financial Report Text Analysis"):
        gr.Markdown("## Financial Report Paragraph Selection and Analysis on Adverse Macro-Economy Scenario")

        with gr.Row():
            # Left column: upload new PDFs and choose the two files to compare
            with gr.Column():
                gr.Markdown("### Step 1: Upload PDF Files")
                upload_button = gr.File(label="Upload files", file_types=[".pdf"], file_count="multiple")
                # Initial choices are whatever get_pdf_files(PDF_FOLDER) returns when the UI is built
                pdf1 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 1")
                pdf2 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 2")
                # After an upload, the handler's return values are written to both PDF dropdowns
                upload_button.upload(upload_file_and_update_dropdown, upload_button, [pdf1, pdf2])
            # Right column: paragraph extraction and keyword filtering
            with gr.Column():
                gr.Markdown("### Step 2: Extract and Display Paragraphs")
                # NOTE: the name `b1` is rebound to a different button in the table-analysis tab
                b1 = gr.Button("Extract and Display Paragraphs")
                paragraph_1_dropdown = gr.Dropdown(label="Select Paragraph from PDF 1")
                paragraph_2_dropdown = gr.Dropdown(label="Select Paragraph from PDF 2")
                
                keyword_input = gr.Textbox(label="Enter keyword to search")
                # Search / Clear buttons side by side
                with gr.Row():
                    search_button = gr.Button("Search")
                    clear_button = gr.Button("Clear")
                # Search feeds the keyword to filter_paragraphs; Clear takes no input.
                # Both handlers write to the two paragraph dropdowns.
                search_button.click(filter_paragraphs, inputs=keyword_input, outputs=[paragraph_1_dropdown, paragraph_2_dropdown])
                clear_button.click(clear_paragraphs, inputs=[], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])
                # Callback for the "Extract and Display Paragraphs" button
                def update_paragraphs(pdf1, pdf2):
                    """Extract paragraphs from both PDFs and refresh the two dropdowns.

                    Rebinds the module-level ``stored_paragraphs_*`` lists to the
                    result of ``extract_and_paragraph(pdf1, pdf2, True)`` and points
                    ``filter_paragraphs_*`` at the same lists.

                    Returns:
                        Two ``gr.update`` objects carrying the new dropdown choices,
                        one label per paragraph (1-based index plus the first 100
                        characters of the paragraph).
                    """
                    global stored_paragraphs_1, stored_paragraphs_2
                    global filter_paragraphs_1, filter_paragraphs_2

                    def _labels(paragraphs):
                        # One dropdown entry per paragraph, numbered from 1
                        return [
                            f"Paragraph {number}: {text[:100]}..."
                            for number, text in enumerate(paragraphs, start=1)
                        ]

                    extracted = extract_and_paragraph(pdf1, pdf2, True)
                    stored_paragraphs_1, stored_paragraphs_2 = extracted
                    # A fresh extraction discards any keyword filter
                    filter_paragraphs_1 = stored_paragraphs_1
                    filter_paragraphs_2 = stored_paragraphs_2
                    return (
                        gr.update(choices=_labels(stored_paragraphs_1)),
                        gr.update(choices=_labels(stored_paragraphs_2)),
                    )

                b1.click(fn=update_paragraphs, inputs=[pdf1, pdf2], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])
        # Horizontal rule between the selection controls and the analysis panels
        gr.Markdown("---")
        with gr.Row():
            # PDF 1 analysis column
            with gr.Column():
                gr.Markdown("### PDF 1 Analysis")
                
                selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
                # First select listener: show1 receives the dropdown value and fills the textbox
                paragraph_1_dropdown.select(fn=show1, inputs = paragraph_1_dropdown, outputs=selected_paragraph_1)
                close_paragraph_1 = gr.Textbox(label="Closest Paragraph from PDF 2 to selected Paragraph PDF 1", lines=4)
                # Second select listener on the same dropdown: look up the closest paragraph
                paragraph_1_dropdown.select(
                    fn=lambda p, keyword: filtered_close_paragraph(p, keyword, "2"),  # "2": presumably search among PDF 2's paragraphs -- confirm in filtered_close_paragraph
                    inputs=[paragraph_1_dropdown, keyword_input],
                    outputs=close_paragraph_1
                )
                with gr.Group():
                    summarize_btn1 = gr.Button("Summarize Text from PDF 1")
                    summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=2)
            
                    # Summarize the selected paragraph from PDF 1 (input is the dropdown value)
                    summarize_btn1.click(fn=lambda p: process_paragraph_1_sum(p), inputs=paragraph_1_dropdown, outputs=summary_textbox_1)
            
                    sentiment_btn1 = gr.Button("Classify Financial Tone for paragraph from PDF 1")
                    sentiment_textbox_1 = gr.Label(label="Classification from PDF 1")
            
                    # Classify the financial tone of the selected paragraph from PDF 1
                    sentiment_btn1.click(fn=lambda p: process_paragraph_1_sent(p), inputs=paragraph_1_dropdown, outputs=sentiment_textbox_1)
            
                    # Per-sentence tone, one button per model
                    with gr.Accordion("Analyze Financial Tone on each sentence"):
                        analyze_btn1 = gr.Button("With FinBERT-tone")
                        # color_map uses capitalised label keys
                        fin_spans_1 = gr.HighlightedText(label="Financial Tone Analysis for PDF 1",color_map=color_map, show_legend=True)
                        
                        # Analyze financial tone on each sentence using FinBERT-tone
                        analyze_btn1.click(fn=lambda p: process_paragraph_1_sent_tone(p), inputs=paragraph_1_dropdown, outputs=fin_spans_1)
                
                        analyze_btn1_ = gr.Button("With ProsusAI/finbert")
                        # color_map1 uses lower-case label keys
                        fin_spans_1_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 1 (Bis)",color_map=color_map1, show_legend=True)
                
                        # Analyze financial tone using ProsusAI/finbert
                        analyze_btn1_.click(fn=lambda p: process_paragraph_1_sent_tone_bis(p), inputs=paragraph_1_dropdown, outputs=fin_spans_1_)
        
            # PDF 2 analysis column (same layout and wiring as the PDF 1 column)
            with gr.Column():
                gr.Markdown("### PDF 2 Analysis")

                selected_paragraph_2 = gr.Textbox(label="Selected Paragraph 2 Content", lines=4)
                # First select listener: show2 receives the dropdown value and fills the textbox.
                # This is the only listener that writes to the textbox. A `.change` listener
                # on the textbox targeting itself would re-run show2 after every update.
                paragraph_2_dropdown.select(fn=show2, inputs = paragraph_2_dropdown, outputs=selected_paragraph_2)
                close_paragraph_2 = gr.Textbox(label="Closest Paragraph from PDF 1 to selected Paragraph PDF 2", lines=4)
                # Second select listener on the same dropdown: look up the closest paragraph
                paragraph_2_dropdown.select(
                    fn=lambda p, keyword: filtered_close_paragraph(p, keyword, "1"),  # "1": presumably search among PDF 1's paragraphs -- confirm in filtered_close_paragraph
                    inputs=[paragraph_2_dropdown, keyword_input],
                    outputs=close_paragraph_2
                )
                with gr.Group():
                    summarize_btn2 = gr.Button("Summarize Text from PDF 2")
                    summary_textbox_2 = gr.Textbox(label="Summary for PDF 2", lines=2)

                    # Summarize the selected paragraph from PDF 2 (input is the dropdown value)
                    summarize_btn2.click(fn=lambda p: process_paragraph_2_sum(p), inputs=paragraph_2_dropdown, outputs=summary_textbox_2)

                    sentiment_btn2 = gr.Button("Classify Financial Tone for paragraph from PDF 2")
                    sentiment_textbox_2 = gr.Label(label="Classification from PDF 2")

                    # Classify the financial tone of the selected paragraph from PDF 2
                    sentiment_btn2.click(fn=lambda p: process_paragraph_2_sent(p), inputs=paragraph_2_dropdown, outputs=sentiment_textbox_2)
                    # Per-sentence tone, one button per model
                    with gr.Accordion("Analyze Financial Tone on each sentence"):
                        analyze_btn2 = gr.Button("With FinBERT-tone")
                        # color_map uses capitalised label keys
                        fin_spans_2 = gr.HighlightedText(label="Financial Tone Analysis for PDF 2",color_map=color_map, show_legend=True)

                        # Analyze financial tone on each sentence using FinBERT-tone for PDF 2
                        analyze_btn2.click(fn=lambda p: process_paragraph_2_sent_tone(p), inputs=paragraph_2_dropdown, outputs=fin_spans_2)

                        analyze_btn2_ = gr.Button("With ProsusAI/finbert")
                        # color_map1 uses lower-case label keys
                        fin_spans_2_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 2 (Bis)",color_map=color_map1, show_legend=True)

                        # Analyze financial tone using ProsusAI/finbert for PDF 2
                        analyze_btn2_.click(fn=lambda p: process_paragraph_2_sent_tone_bis(p), inputs=paragraph_2_dropdown, outputs=fin_spans_2_)


    # Table-analysis tab: compare one sheet of two Excel files, show sentiment
    # of related PDF sentences, and generate a per-country text summary.
    with gr.Tab("Financial Report Table Analysis"):
        gr.Markdown("## Excel Data Comparison")

        with gr.Row():
            with gr.Column():
                # Initial choices are whatever get_excel_files(PDF_FOLDER) returns when the UI is built
                file1 = gr.Dropdown(choices=get_excel_files(PDF_FOLDER), label="Select Excel File 1")
                file2 = gr.Dropdown(choices=get_excel_files(PDF_FOLDER), label="Select Excel File 2")
                # Sheet names are a fixed list; one selection applies to both files
                sheet = gr.Dropdown(choices=["GDP", "HICP", "RRE prices", "Unemployment", "CRE prices"], label="Select Sheet for File 1 and 2")
            
            with gr.Column():
                result = gr.Image(label="Comparison Plot")
                #result = gr.BarPlot()
        def update_sheets(file):
            """Return the sheet names of *file* via get_sheet_names.

            NOTE(review): not attached to any event in this section of the file.
            """
            return get_sheet_names(file)
        
        
        # NOTE: `b1` was already used for the extract button in the text tab; its
        # click handler was registered before this rebinding.
        b1 = gr.Button("Compare Data")
        b2 = gr.Button("Extract text information from PDFs")
        
        # Side-by-side sentiment output, one panel per PDF
        with gr.Row():
            with gr.Column():
                sentiment_results_pdf1 = gr.HighlightedText(label="Sentiment Analysis - PDF 1",color_map=color_map, show_legend=True)

            with gr.Column():
                sentiment_results_pdf2 = gr.HighlightedText(label="Sentiment Analysis - PDF 2",color_map=color_map, show_legend=True)
               
        with gr.Accordion("Adverse growth trends"):
            with gr.Row():
                with gr.Column():
                    country_1_dropdown = gr.Dropdown(label="Select Country from Excel File 1")
                    summarize_btn1_country = gr.Button("Summary for the selected country")
                    text_result_df1 = gr.Textbox(label="Sentence for excel file 1", lines=2)
                    # The lambda looks up the module-level `stored_df1` at click time,
                    # so it uses whatever that name holds then, not the initial empty list.
                    summarize_btn1_country.click(fn=lambda country, theme: generate_text(stored_df1, country, theme),
                                 inputs=[country_1_dropdown, sheet],
                                 outputs=text_result_df1)
                with gr.Column():
                    country_2_dropdown = gr.Dropdown(label="Select Country from Excel File 2")
                    summarize_btn2_country = gr.Button("Summary for the selected country")
                    text_result_df2 = gr.Textbox(label="Sentence for excel file 2", lines=2)
                    # Same late lookup, for `stored_df2`
                    summarize_btn2_country.click(fn=lambda country, theme: generate_text(stored_df2, country, theme),
                                 inputs=[country_2_dropdown, sheet],
                                 outputs=text_result_df2)
        # "Compare Data": the same sheet selection is passed for both files; the
        # outputs are the plot image and the two country dropdowns.
        b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=[result ,country_1_dropdown, country_2_dropdown])
        # "Extract text information from PDFs": fills the two sentiment panels
        b2.click(fn=process_pdfs_and_analyze_sentiment, inputs=[file1, file2, sheet], outputs=[sentiment_results_pdf1, sentiment_results_pdf2])
    # Fed tab: plot the positive-sentiment percentage over time from a CSV
    # that is read once, while the UI is being built.
    with gr.Tab("Fed data analysis"):
        gr.Markdown("## Sentiment Analysis Overview")
        # Load the per-date paragraph counts
        df = pd.read_csv("data/2008_2024_minutes.csv", header = 0)
        # From here on 'Total_paragraphs' holds the count excluding neutral paragraphs
        df['Total_paragraphs']=df['Total_paragraphs']-df['Neutral']
        # Shares of the non-neutral paragraphs, expressed as percentages
        df['Positive_ratio'] = df['Positive'] / df['Total_paragraphs']*100
        df['Negative_ratio'] = df['Negative'] / df['Total_paragraphs']*100
        df['Date'] = pd.to_datetime(df['Date']) 
        # Date bounds; in this section only the disabled reset handler below refers to them
        start_date = df['Date'].min()
        end_date = df['Date'].max()
        # Disabled date-range filter controls:
        #start = Calendar(value ="2008-01-01", type="string", label="Start")
        #end = Calendar(value="2025-01-01",type="string", label="End")
        #apply_btn = gr.Button("Apply", scale=0)
        #reset_btn = gr.Button("Reset", scale=0)
        # data_table = gr.DataFrame(value=df[['Date', 'Positive_ratio', 'Negative_ratio', 'Total_paragraphs']], label="Sentiment Data", height=500)
        # Disabled long-format reshape for plotting both ratios as coloured series:
        #melted_df = df.melt(id_vars='Date', value_vars=['Positive_ratio', 'Negative_ratio'],
        #            var_name='Ratio_Type', value_name='Rate')
        # Line plot of the positive percentage only
        line_plot = gr.LinePlot(
            df,
            x='Date',
            y='Positive_ratio',
            title="Positive Rate Over Time",
            y_lim=[0, 100],   # values are percentages, so the axis spans 0-100
            #color = 'Ratio_Type'
        )
        # Disabled handlers for the date-range filter above:
        #apply_btn.click(lambda start,end: gr.LinePlot(x_lim=[start, end]), [start, end], line_plot)
        #reset_btn.click(lambda : gr.LinePlot(x_lim=[start_date, end_date]), [], line_plot)
# Start the Gradio server for the assembled UI
demo.launch()