Cachoups committed on
Commit
08b59ae
·
verified ·
1 Parent(s): 67606bf

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -0
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from transformers import pipeline
4
+ import spacy
5
+ import lib.read_pdf
6
# Shared spaCy pipeline: the small English model is loaded once at import time
# and reused by every call to split_in_sentences().
nlp = spacy.load('en_core_web_sm')
# Append the rule-based 'sentencizer' component to the loaded pipeline.
nlp.add_pipe('sentencizer')
9
+
10
def split_in_sentences(text):
    """Split *text* into sentences with the module-level spaCy pipeline.

    Args:
        text: The text to segment.

    Returns:
        A list with one string per sentence span reported by ``nlp``, each
        stripped of leading and trailing whitespace.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(str(span).strip())
    return sentences
13
+
14
def make_spans(text, results):
    """Pair each sentence of *text* with the label of the matching result.

    Args:
        text: The text whose sentences are to be labelled.
        results: An iterable of mappings, each with a ``'label'`` key, in the
            same order as the sentences of *text*.

    Returns:
        A list of ``(sentence, label)`` tuples; ``zip`` stops at the shorter
        of the two sequences.
    """
    labels = []
    for prediction in results:
        labels.append(prediction['label'])
    sentences = split_in_sentences(text)
    return list(zip(sentences, labels))
18
+
19
# Hugging Face pipelines, created once at import time and shared by the
# summarization and tone-classification helpers in this module.
summarizer = pipeline(
    "summarization",
    model="human-centered-summarization/financial-summarization-pegasus",
)
fin_model = pipeline(
    "sentiment-analysis",
    model='yiyanghkust/finbert-tone',
    tokenizer='yiyanghkust/finbert-tone',
)
22
+
23
def summarize_text(text):
    """Return a summary of *text* produced by the module-level ``summarizer``.

    Args:
        text: The passage to summarize.

    Returns:
        The ``summary_text`` of the first pipeline result, or an empty string
        when *text* is an empty or whitespace-only string (the model is not
        called in that case).
    """
    if isinstance(text, str) and not text.strip():
        return ""
    # Paragraphs taken from a report can be longer than the model's maximum
    # input length; truncation=True asks the pipeline to cut the input to
    # that length instead of failing on it.
    resp = summarizer(text, truncation=True)
    return resp[0]['summary_text']
26
+
27
def text_to_sentiment(text):
    """Classify the financial tone of *text* with the module-level ``fin_model``.

    Args:
        text: The passage to classify.

    Returns:
        The ``"label"`` of the first pipeline result.
    """
    # Whole paragraphs can exceed the encoder's input limit, so let the
    # tokenizer truncate them rather than fail.
    # NOTE(review): 512 is the usual BERT position limit; confirm it matches
    # this checkpoint's configuration.
    sentiment = fin_model(text, truncation=True, max_length=512)[0]["label"]
    return sentiment
30
+
31
def fin_ext(text):
    """Label every sentence of *text* with its financial tone.

    Args:
        text: The passage to analyse.

    Returns:
        A list of ``(sentence, label)`` tuples, one per sentence, in order.
        Returns an empty list when no sentence is found.
    """
    # Split once and reuse the result: the sentences sent to the classifier
    # are then exactly the ones paired with its labels, and the spaCy
    # pipeline is not run a second time on the same text.
    sentences = split_in_sentences(text)
    if not sentences:
        return []
    # NOTE(review): 512 is the usual BERT position limit; confirm it matches
    # this checkpoint's configuration.
    results = fin_model(sentences, truncation=True, max_length=512)
    return [(sentence, res['label']) for sentence, res in zip(sentences, results)]
34
+
35
def extract_and_summarize(pdf1, pdf2):
    """Extract a paragraph list from each of two PDFs stored in ``PDF_FOLDER``.

    Both files go through the same ``lib.read_pdf`` stages: paragraph
    extraction, ``find_text_range`` with the start keyword "Main risks to"
    and a list of end keywords, ``extract_relevant_text`` for the returned
    index pair, and ``split_text_into_paragraphs``. Despite its name, this
    function performs no summarization itself.

    Args:
        pdf1: File name of the first PDF, relative to ``PDF_FOLDER``.
        pdf2: File name of the second PDF, relative to ``PDF_FOLDER``.

    Returns:
        A tuple ``(paragraphs_1, paragraphs_2)`` holding the results of
        ``split_text_into_paragraphs`` for each file, or ``([], [])`` when
        either argument is empty or ``None``.
    """
    if not (pdf1 and pdf2):
        return [], []

    section_start = "Main risks to"
    section_ends = ["4. Appendix", "Annex:", "4. Annex", "Detailed tables", "ACKNOWLEDGEMENTS", "STATISTICAL ANNEX", "PROSPECTS BY MEMBER STATES"]

    # Each stage handles PDF 1 and then PDF 2 before the next stage starts.
    paths = [os.path.join(PDF_FOLDER, name) for name in (pdf1, pdf2)]
    documents = [lib.read_pdf.extract_and_format_paragraphs(path) for path in paths]

    bounds = []
    for document in documents:
        first, last = lib.read_pdf.find_text_range(document, section_start, section_ends)
        bounds.append((first, last))

    documents = [
        lib.read_pdf.extract_relevant_text(document, first, last)
        for document, (first, last) in zip(documents, bounds)
    ]
    documents = [
        lib.read_pdf.split_text_into_paragraphs(document, 0)
        for document in documents
    ]

    paragraphs_1, paragraphs_2 = documents
    return paragraphs_1, paragraphs_2
59
+
60
# Gradio interface setup
# Folder that is listed for selectable PDFs and against which the chosen
# file names are resolved.
PDF_FOLDER = "data"
62
+
63
def get_pdf_files(folder):
    """Return the names of the PDF files directly inside *folder*.

    Args:
        folder: Path of the directory to list.

    Returns:
        A sorted list of entry names whose extension is ``.pdf`` in any
        letter case. Returns an empty list when *folder* does not exist, so
        that building the UI does not crash on a missing data directory.
    """
    try:
        entries = os.listdir(folder)
    except FileNotFoundError:
        return []
    # os.listdir() returns entries in arbitrary order; sort so the dropdown
    # choices are stable between runs.
    return sorted(name for name in entries if name.lower().endswith('.pdf'))
65
+
66
# Paragraphs extracted by the most recent "Extract and Display Paragraphs" click.
# NOTE(review): this is module-level state, so it is shared by every session
# talking to the same process; confirm that is acceptable for the deployment.
stored_paragraphs_1 = []
stored_paragraphs_2 = []


def _paragraph_choices(paragraphs):
    """Build the dropdown labels ("Paragraph N: <first 100 chars>...") for *paragraphs*."""
    return [f"Paragraph {number}: {p[:100]}..." for number, p in enumerate(paragraphs, start=1)]


def _lookup_paragraph(selection, paragraphs):
    """Return the stored paragraph named by a dropdown label.

    The 1-based number between ``"Paragraph "`` and the first colon of
    *selection* indexes *paragraphs*.

    Raises:
        ValueError: if *selection* is empty/``None`` or holds no number.
        IndexError: if the number is outside ``1..len(paragraphs)``.
    """
    if not selection:
        # Nothing selected yet: the dropdown passes None to the handler.
        raise ValueError("no paragraph selected")
    index = int(selection.split(':')[0].replace('Paragraph ', '')) - 1
    if index < 0:
        # A negative index would silently wrap around to the end of the list.
        raise IndexError("paragraph number must be >= 1")
    return paragraphs[index]


def _analyze_selection(selection, paragraphs, analyze, fallback):
    """Run *analyze* on the selected paragraph; return *fallback* on IndexError/ValueError."""
    try:
        return analyze(_lookup_paragraph(selection, paragraphs))
    except (IndexError, ValueError):
        return fallback


def _process_paragraph(selection, paragraphs):
    """Return ``(paragraph, summary, sentiment, spans)`` for the selected paragraph.

    On an invalid selection (or an IndexError/ValueError raised while
    analysing) returns ``("Invalid selection", "Error", "Error", [])``.
    """
    try:
        selected_paragraph = _lookup_paragraph(selection, paragraphs)
        summary = summarize_text(selected_paragraph)
        sentiment = text_to_sentiment(selected_paragraph)
        fin_spans = fin_ext(selected_paragraph)
        return selected_paragraph, summary, sentiment, fin_spans
    except (IndexError, ValueError):
        return "Invalid selection", "Error", "Error", []


def update_paragraphs(pdf1, pdf2):
    """Extract paragraphs from both PDFs and refresh the two paragraph dropdowns.

    Stores the extracted paragraphs in the module-level lists and returns one
    dropdown update per PDF. The selection is reset so that a label chosen
    for a previous pair of PDFs cannot index into the new paragraph lists.
    """
    global stored_paragraphs_1, stored_paragraphs_2
    stored_paragraphs_1, stored_paragraphs_2 = extract_and_summarize(pdf1, pdf2)
    # gr.update() is used because the per-component Dropdown.update()
    # classmethod is not available in Gradio 4 and later.
    updated_dropdown_1 = gr.update(choices=_paragraph_choices(stored_paragraphs_1), value=None, label="Select Paragraph from PDF 1")
    updated_dropdown_2 = gr.update(choices=_paragraph_choices(stored_paragraphs_2), value=None, label="Select Paragraph from PDF 2")
    return updated_dropdown_1, updated_dropdown_2


def process_paragraph_1(paragraph):
    """Return ``(paragraph, summary, sentiment, spans)`` for a selection from PDF 1."""
    return _process_paragraph(paragraph, stored_paragraphs_1)


def process_paragraph_2(paragraph):
    """Return ``(paragraph, summary, sentiment, spans)`` for a selection from PDF 2."""
    return _process_paragraph(paragraph, stored_paragraphs_2)


# NOTE(review): the nesting of the layout below was reconstructed from a view
# that had lost its indentation; confirm it against the running app.
with gr.Blocks() as demo:
    gr.Markdown("## Financial Report Paragraph Selection and Analysis")

    with gr.Row():
        # Choose the two PDFs
        with gr.Column():
            pdf1 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 1")
            pdf2 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 2")

        with gr.Column():
            b1 = gr.Button("Extract and Display Paragraphs")
            paragraph_1_dropdown = gr.Dropdown(label="Select Paragraph from PDF 1")
            paragraph_2_dropdown = gr.Dropdown(label="Select Paragraph from PDF 2")

            b1.click(fn=update_paragraphs, inputs=[pdf1, pdf2], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])

    with gr.Row():
        # Process the selected paragraph from PDF 1
        with gr.Column():
            selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content")
            summarize_btn1 = gr.Button("Summarize Text from PDF 1")
            sentiment_btn1 = gr.Button("Classify Financial Tone from PDF 1")
            fin_spans_1 = gr.HighlightedText(label="Financial Tone Analysis for PDF 1")

            # Each handler runs only the model it needs. The stored list is
            # read at call time because update_paragraphs() rebinds it.
            summarize_btn1.click(fn=lambda p: _analyze_selection(p, stored_paragraphs_1, summarize_text, "Error"), inputs=paragraph_1_dropdown, outputs=selected_paragraph_1)
            sentiment_btn1.click(fn=lambda p: _analyze_selection(p, stored_paragraphs_1, text_to_sentiment, "Error"), inputs=paragraph_1_dropdown, outputs=selected_paragraph_1)
            b5 = gr.Button("Analyze Financial Tone and FLS")
            b5.click(fn=lambda p: _analyze_selection(p, stored_paragraphs_1, fin_ext, []), inputs=paragraph_1_dropdown, outputs=fin_spans_1)

    with gr.Row():
        # Process the selected paragraph from PDF 2
        with gr.Column():
            selected_paragraph_2 = gr.Textbox(label="Selected Paragraph 2 Content")
            summarize_btn2 = gr.Button("Summarize Text from PDF 2")
            sentiment_btn2 = gr.Button("Classify Financial Tone from PDF 2")
            fin_spans_2 = gr.HighlightedText(label="Financial Tone Analysis for PDF 2")

            summarize_btn2.click(fn=lambda p: _analyze_selection(p, stored_paragraphs_2, summarize_text, "Error"), inputs=paragraph_2_dropdown, outputs=selected_paragraph_2)
            sentiment_btn2.click(fn=lambda p: _analyze_selection(p, stored_paragraphs_2, text_to_sentiment, "Error"), inputs=paragraph_2_dropdown, outputs=selected_paragraph_2)
            b6 = gr.Button("Analyze Financial Tone and FLS")
            b6.click(fn=lambda p: _analyze_selection(p, stored_paragraphs_2, fin_ext, []), inputs=paragraph_2_dropdown, outputs=fin_spans_2)

demo.launch()