msr2903 committed on
Commit
d58052d
1 Parent(s): dcd9b4f

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +22 -0
  2. config.py +48 -0
  3. pages.py +46 -0
  4. section_extract.py +209 -0
app.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pages import home, cover, underwriter, income_statement, balance_sheet, cash_flow
3
+
4
# Sidebar layout: each key is a group heading (the empty string means no
# heading) and each value is the ordered list of pages shown under it.
pages = {
    "": [st.Page(home, title="Home")],
    "IPO Info:": [
        st.Page(cover, title="Cover"),
        st.Page(underwriter, title="Underwriter"),
    ],
    "Financial:": [
        st.Page(income_statement, title="Income Statement"),
        st.Page(balance_sheet, title="Balance Sheet"),
        st.Page(cash_flow, title="Cash Flow"),
    ],
}

# Build the navigation from the whole grouped dictionary, then run the page
# object it returns.
pg = st.navigation(pages)
pg.run()
config.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Keyword sets used to locate the page where a section begins, keyed by
# section name. Each inner list is one set of patterns that are searched
# case-insensitively in the text of a page:
#   * find_section treats a page as a match when ALL patterns of one set
#     are found on it;
#   * find_underwriter treats a page as a match when ANY pattern of a set
#     is found on it, trying the sets in the order listed here.
keywords_dict = {
    "underwriter": [
        ["keterangan tentang penjaminan emisi efek"],
        ["susunan dan jumlah porsi penjaminan"],
    ],
    "balance_sheet": [
        ["laporan posisi keuangan", "cash and cash equivalent", "catatan/"],
        ["laporan posisi keuangan", "cash", "total assets", "catatan/"],
        ["laporan posisi keuangan", "piutang", "jumlah aset", "catatan"],
        ["laporan posisi keuangan", "piutang", "total aset", "catatan"],
        ["consolidated statement", "piutang", "total aset", "catatan/"],
        ["piutang", "total aset", "notes"],
        ["piutang", "jumlah aset", "notes"],
    ],
    "cash_flow": [
        ["laporan arus kas", "arus kas dari", "aktivitas operasi", "catatan/"],
        ["laporan arus kas", "arus kas dari", "catatan/"],
        ["laporan arus kas", "arus kas dari", "catatan"],
        ["arus kas dari", "aktivitas operasi", "catatan"],
    ],
    "income_statement": [
        ["laporan laba rugi", "penjualan", "pokok penjualan", "catatan/"],
        ["laporan laba rugi", "revenues", "beban pokok", "catatan/"],
        ["laporan laba rugi", "revenue", "beban pokok", "catatan/"],
        ["laporan laba rugi", "penjualan", "beban pokok", "catatan"],
        ["laporan laba rugi", "pendapatan", "beban pokok", "catatan"],
        ["laporan laba rugi", "income", "catatan/"],
        ["laporan laba rugi", "pendapatan", "catatan/"],
        ["laporan laba rugi", "pendapatan usaha", "catatan"],
        ["laporan laba rugi", "pendapatan", "catatan"],
        ["penjualan", "beban pokok", "catatan"],
    ],
}
35
+
36
# Keyword sets that mark the last page of a section, keyed by section name.
# find_section stops extracting at the first page on which every pattern of
# any one set is found.
stop_keywords = {
    "balance_sheet": [
        ["laba per saham"],
        ["jumlah ekuitas"],
        ["total ekuitas"],
    ],
    "cash_flow": [
        ["kas dan setara kas"],
        ["kas dan bank"],
        ["kas dan setara"],
    ],
    "income_statement": [
        ["per saham"],
        ["total comprehensive"],
        ["laba komprehensif"],
        ["laba bersih per"],
    ],
}
42
+
43
# Keyword sets that disqualify a page, keyed by section name. find_section
# consults these only for the page on which a stop-keyword set matched: when
# every pattern of any one set below is also found there, that page is left
# out of the extracted section.
anti_keywords = {
    "balance_sheet": [],
    "cash_flow": [],
    "income_statement": [
        ["laporan perubahan ekuitas"],
        ["laporan arus kas"],
    ],
}
pages.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from section_extract import find_cover, find_underwriter, find_section
3
+ from config import keywords_dict, stop_keywords, anti_keywords
4
+
5
def home():
    """
    Render the Home page and remember the uploaded prospectus for the other pages.

    The uploaded file is stored in ``st.session_state["uploaded_file"]``, which is
    where the other pages look it up. The stored file is replaced only when the
    uploader actually holds a file: a freshly rendered uploader (for example after
    visiting another page and coming back) reports no file, and writing that value
    back would silently discard the PDF the user already uploaded. As a consequence,
    clearing the uploader does not forget the stored file; uploading a different PDF
    replaces it.

    Returns:
        None
    """
    st.title("Prospectus Lens")
    st.write("Welcome to the Prospectus Lens! Upload the PDF of the prospectus below!")
    uploaded_file = st.file_uploader("Upload your Prospectus File", accept_multiple_files=False, type=["pdf"])

    if uploaded_file is not None:
        st.session_state["uploaded_file"] = uploaded_file
        return

    # The uploader is empty: keep whatever was uploaded earlier in this session
    # and make that visible, since the widget itself no longer shows it.
    previous_file = st.session_state.get("uploaded_file")
    if previous_file is not None:
        st.info(f"Using previously uploaded file: {previous_file.name}")
10
+
11
def cover():
    """Page callback: show the cover page of the PDF stored in the session, if any."""
    session_pdf = st.session_state.get("uploaded_file")
    find_cover(uploaded_file=session_pdf)
13
+
14
def underwriter():
    """Page callback: search the PDF stored in the session for the "underwriter" keywords."""
    session_pdf = st.session_state.get("uploaded_file")
    find_underwriter(
        uploaded_file=session_pdf,
        section_name="underwriter",
        keywords_dict=keywords_dict,
    )
20
+
21
def income_statement():
    """Page callback: extract the "income_statement" section from the PDF stored in the session."""
    session_pdf = st.session_state.get("uploaded_file")
    # The boolean returned by find_section is not used by this page.
    find_section(
        uploaded_file=session_pdf,
        section_name="income_statement",
        keywords_dict=keywords_dict,
        stop_keywords=stop_keywords,
        anti_keywords=anti_keywords,
    )
29
+
30
def balance_sheet():
    """Page callback: extract the "balance_sheet" section from the PDF stored in the session."""
    session_pdf = st.session_state.get("uploaded_file")
    # The boolean returned by find_section is not used by this page.
    find_section(
        uploaded_file=session_pdf,
        section_name="balance_sheet",
        keywords_dict=keywords_dict,
        stop_keywords=stop_keywords,
        anti_keywords=anti_keywords,
    )
38
+
39
def cash_flow():
    """Page callback: extract the "cash_flow" section from the PDF stored in the session."""
    session_pdf = st.session_state.get("uploaded_file")
    # The boolean returned by find_section is not used by this page.
    find_section(
        uploaded_file=session_pdf,
        section_name="cash_flow",
        keywords_dict=keywords_dict,
        stop_keywords=stop_keywords,
        anti_keywords=anti_keywords,
    )
section_extract.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from PyPDF2 import PdfReader, PdfWriter
4
+ from streamlit_pdf_viewer import pdf_viewer
5
+ import streamlit as st
6
+
7
def find_cover(uploaded_file):
    """
    Extracts and displays the first page of a PDF.

    The single-page PDF is built in memory and handed to the viewer as bytes, so
    nothing is written to the working directory (a fixed-name temporary file there
    would never be cleaned up and would be shared by every concurrent session).

    Parameters:
        uploaded_file: The uploaded PDF file.

    Returns:
        None
    """
    from io import BytesIO

    section_title = "Cover"
    st.title(section_title)

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return

    try:
        # Read the PDF and copy its first page into a new one-page document.
        pdf_reader = PdfReader(uploaded_file)
        pdf_writer = PdfWriter()
        pdf_writer.add_page(pdf_reader.pages[0])

        # Serialize to memory and display the bytes directly.
        buffer = BytesIO()
        pdf_writer.write(buffer)
        pdf_viewer(buffer.getvalue())
    except Exception as e:
        # UI boundary: report any reader/writer/viewer failure instead of crashing the page.
        st.error(f"An error occurred while processing the PDF: {e}")
40
+
41
+
42
def find_underwriter(uploaded_file, section_name, keywords_dict):
    """
    Searches a PDF for the first page matching a section's keywords and displays it.

    The first third of the document is skipped. Keyword sets are tried in the order
    they are configured; for each set the remaining pages are scanned front to back
    and the first page on which ANY keyword of that set is found (case-insensitive
    regular-expression search) is displayed, after which the function returns.

    Page text is extracted at most once per page even when several keyword sets
    have to be tried, and the matched page is rendered from memory rather than
    from a temporary file on disk.

    Parameters:
        uploaded_file: The uploaded PDF file.
        section_name: The name of the section (e.g., "underwriter").
        keywords_dict: Dictionary containing keyword sets for different sections.

    Returns:
        None
    """
    from io import BytesIO

    st.title(section_name.title())

    keyword_sets = keywords_dict.get(section_name, [])
    if not keyword_sets:
        st.error(f"No keywords defined for section: {section_name}")
        return

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        start_page = total_pages // 3  # Skip the first 1/3 of the PDF

        # (1-based page number, page) pairs for the part of the document searched.
        candidates = list(enumerate(pdf_reader.pages[start_page:], start=start_page + 1))
        page_texts = {}  # page number -> extracted text, filled lazily

        for keyword_set in keyword_sets:
            for page_num, page in candidates:
                if page_num not in page_texts:
                    page_texts[page_num] = page.extract_text() or ""
                text = page_texts[page_num]

                # A page matches when any keyword of the current set is found on it.
                if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
                    pdf_writer = PdfWriter()
                    pdf_writer.add_page(page)
                    buffer = BytesIO()
                    pdf_writer.write(buffer)

                    st.write(f"Keyword found on page {page_num}")
                    pdf_viewer(buffer.getvalue())
                    return  # Exit after finding the first match

        st.warning(f"No pages contain the specified keywords for {section_name}.")
    except Exception as e:
        # UI boundary: report any reader/writer/viewer failure instead of crashing the page.
        st.error(f"An error occurred while processing the PDF: {e}")
93
+
94
+
95
def _matches_any_keyword_set(text, keyword_sets):
    """Return True if every keyword of at least one set is found in text (case-insensitive regex search)."""
    return any(
        all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set)
        for keyword_set in keyword_sets
    )


def _show_extracted_pages(selected_pages, section_name):
    """Display the selected pages as one in-memory PDF, or warn when there are none."""
    from io import BytesIO

    if not selected_pages:
        st.warning(f"No pages matched the criteria for {section_name}.")
        return

    pdf_writer = PdfWriter()
    for page in selected_pages:
        pdf_writer.add_page(page)

    buffer = BytesIO()
    pdf_writer.write(buffer)
    pdf_viewer(buffer.getvalue())


def find_section(uploaded_file, section_name, keywords_dict, stop_keywords, anti_keywords):
    """
    Extracts and displays sections of a PDF based on keyword matches.

    Only the second half of the document is scanned. Extraction starts at the first
    page on which all keywords of any start set are found, and every following page
    is collected until a page matches a stop set. The stop page is included in the
    result unless it also matches an anti-keyword set, in which case it is left out.
    The collected pages are displayed as a single PDF built in memory.

    Pages are gathered in a plain list and the exclusion decision is made before a
    page is added, because a page cannot be removed from the PDF writer through its
    ``pages`` sequence once it has been added.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).
        section_name: The name of the section to search for (e.g., "income_statement").
        keywords_dict: A dictionary containing keyword sets for different sections.
        stop_keywords: A dictionary of keywords to indicate where extraction should stop.
        anti_keywords: A dictionary of keywords to exclude specific pages from the results.

    Returns:
        bool: True if the end of the document was reached without matching stop keywords;
        False if extraction was stopped by stop keywords, an error occurred, or no file
        was uploaded.
    """
    st.title(section_name.replace("_", " ").title())

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        # Stop processing since no file is uploaded
        return False

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)

        # Step 1: Start from the second half of the PDF
        start_page = total_pages // 2
        pages = pdf_reader.pages[start_page:]

        section_keywords = keywords_dict.get(section_name, [])
        section_stop_keywords = stop_keywords.get(section_name, [])
        section_anti_keywords = anti_keywords.get(section_name, [])

        selected_pages = []         # Pages that make up the extracted section
        extraction_started = False  # Becomes True once a start keyword set matched

        for page_num, page in enumerate(pages, start=start_page + 1):
            text = page.extract_text() or ""

            if not extraction_started:
                # Step 2: Look for the page where the section begins
                if not _matches_any_keyword_set(text, section_keywords):
                    continue
                st.write(f"Keywords matched on page {page_num}. Starting extraction.")
                extraction_started = True
                stop_message = f"Stop keywords matched on starting page {page_num}. Stopping extraction."
            else:
                stop_message = f"Stopping extraction at page {page_num}. Stop keywords matched."

            # Step 3: A page without stop keywords is simply part of the section
            if not _matches_any_keyword_set(text, section_stop_keywords):
                selected_pages.append(page)
                continue

            # Step 4: Stop keywords matched, so this is the last candidate page
            st.warning(stop_message)

            # Step 5: Keep the stop page only if it carries no anti-keywords
            if _matches_any_keyword_set(text, section_anti_keywords):
                st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
            else:
                selected_pages.append(page)

            _show_extracted_pages(selected_pages, section_name)
            # Stop extraction and signal to stop all processing
            return False

        # The document ended without hitting stop keywords: show what was collected
        _show_extracted_pages(selected_pages, section_name)
        # Indicate that processing can continue
        return True

    except Exception as e:
        # UI boundary: report any reader/writer/viewer failure instead of crashing the page.
        st.error(f"An error occurred while processing the PDF: {e}")
        # Stop processing due to an error
        return False