Spaces:
Running
Running
Upload 4 files
Browse files- app.py +22 -0
- config.py +48 -0
- pages.py +46 -0
- section_extract.py +209 -0
app.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st

from pages import home, cover, underwriter, income_statement, balance_sheet, cash_flow

# Sidebar navigation layout: an unlabelled group for Home,
# followed by two labelled groups of related pages.
pages = {
    "": [
        st.Page(home, title="Home"),
    ],
    "IPO Info:": [
        st.Page(cover, title="Cover"),
        st.Page(underwriter, title="Underwriter"),
    ],
    "Financial:": [
        st.Page(income_statement, title="Income Statement"),
        st.Page(balance_sheet, title="Balance Sheet"),
        st.Page(cash_flow, title="Cash Flow"),
    ],
}

# Build the navigation widget from the grouped pages and run the selected one.
st.navigation(pages).run()
|
config.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Keyword sets that locate the page where each prospectus section starts.
# Structure: section name -> list of alternative sets; a page matches a
# section when it contains every phrase of at least one set. Sets are
# ordered by priority (most specific first).
keywords_dict = {
    "underwriter": [
        ["keterangan tentang penjaminan emisi efek"],
        ["susunan dan jumlah porsi penjaminan"],
    ],
    "balance_sheet": [
        ["laporan posisi keuangan", "cash and cash equivalent", "catatan/"],
        ["laporan posisi keuangan", "cash", "total assets", "catatan/"],
        ["laporan posisi keuangan", "piutang", "jumlah aset", "catatan"],
        ["laporan posisi keuangan", "piutang", "total aset", "catatan"],
        ["consolidated statement", "piutang", "total aset", "catatan/"],
        ["piutang", "total aset", "notes"],
        ["piutang", "jumlah aset", "notes"],
    ],
    "cash_flow": [
        ["laporan arus kas", "arus kas dari", "aktivitas operasi", "catatan/"],
        ["laporan arus kas", "arus kas dari", "catatan/"],
        ["laporan arus kas", "arus kas dari", "catatan"],
        ["arus kas dari", "aktivitas operasi", "catatan"],
    ],
    "income_statement": [
        ["laporan laba rugi", "penjualan", "pokok penjualan", "catatan/"],
        ["laporan laba rugi", "revenues", "beban pokok", "catatan/"],
        ["laporan laba rugi", "revenue", "beban pokok", "catatan/"],
        ["laporan laba rugi", "penjualan", "beban pokok", "catatan"],
        ["laporan laba rugi", "pendapatan", "beban pokok", "catatan"],
        ["laporan laba rugi", "income", "catatan/"],
        ["laporan laba rugi", "pendapatan", "catatan/"],
        ["laporan laba rugi", "pendapatan usaha", "catatan"],
        ["laporan laba rugi", "pendapatan", "catatan"],
        ["penjualan", "beban pokok", "catatan"],
    ],
}

# Once extraction has started, a page matching any of these sets is the
# section's last page: extraction stops there.
stop_keywords = {
    "balance_sheet": [["laba per saham"], ["jumlah ekuitas"], ["total ekuitas"]],
    "cash_flow": [["kas dan setara kas"], ["kas dan bank"], ["kas dan setara"]],
    "income_statement": [["per saham"], ["total comprehensive"], ["laba komprehensif"], ["laba bersih per"]],
}

# Pages matching any of these sets are dropped from the extracted results.
anti_keywords = {
    "balance_sheet": [],
    "cash_flow": [],
    "income_statement": [["laporan perubahan ekuitas"], ["laporan arus kas"]],
}
|
pages.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from section_extract import find_cover, find_underwriter, find_section
|
3 |
+
from config import keywords_dict, stop_keywords, anti_keywords
|
4 |
+
|
5 |
+
def home():
    """Landing page: show a short intro and stash the uploaded PDF in session state."""
    st.title("Prospectus Lens")
    st.write("Welcome to the Prospectus Lens! Upload the PDF of the prospectus below!")
    pdf_file = st.file_uploader(
        "Upload your Prospectus File",
        accept_multiple_files=False,
        type=["pdf"],
    )
    # Store the file so the other pages can read it back from session state.
    st.session_state["uploaded_file"] = pdf_file
+
|
11 |
+
def cover():
    """Render the cover (first page) of the PDF uploaded on the Home page."""
    find_cover(uploaded_file=st.session_state.get("uploaded_file"))
+
|
14 |
+
def underwriter():
    """Locate and render the underwriter section of the uploaded PDF."""
    pdf_file = st.session_state.get("uploaded_file")
    find_underwriter(
        uploaded_file=pdf_file,
        section_name="underwriter",
        keywords_dict=keywords_dict,
    )
20 |
+
|
21 |
+
def income_statement():
    """Locate and render the income-statement pages of the uploaded PDF."""
    find_section(
        uploaded_file=st.session_state.get("uploaded_file"),
        section_name="income_statement",
        keywords_dict=keywords_dict,
        stop_keywords=stop_keywords,
        anti_keywords=anti_keywords,
    )
29 |
+
|
30 |
+
def balance_sheet():
    """Locate and render the balance-sheet pages of the uploaded PDF."""
    pdf_file = st.session_state.get("uploaded_file")
    find_section(
        uploaded_file=pdf_file,
        section_name="balance_sheet",
        keywords_dict=keywords_dict,
        stop_keywords=stop_keywords,
        anti_keywords=anti_keywords,
    )
38 |
+
|
39 |
+
def cash_flow():
    """Locate and render the cash-flow pages of the uploaded PDF."""
    pdf_file = st.session_state.get("uploaded_file")
    find_section(
        uploaded_file=pdf_file,
        section_name="cash_flow",
        keywords_dict=keywords_dict,
        stop_keywords=stop_keywords,
        anti_keywords=anti_keywords,
    )
|
section_extract.py
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
from PyPDF2 import PdfReader, PdfWriter
|
4 |
+
from streamlit_pdf_viewer import pdf_viewer
|
5 |
+
import streamlit as st
|
6 |
+
|
7 |
+
def find_cover(uploaded_file):
    """
    Extract the first page of the uploaded prospectus PDF and render it.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit UploadedFile), or None
            if nothing has been uploaded yet.

    Returns:
        None
    """
    section_title = "Cover"
    st.title(section_title)

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return

    try:
        # Copy only the first page into a fresh single-page document.
        pdf_reader = PdfReader(uploaded_file)
        pdf_writer = PdfWriter()
        pdf_writer.add_page(pdf_reader.pages[0])

        # pdf_viewer takes a file path, so write the page to a scratch file.
        # (The original wrapped this in a single-argument os.path.join, which
        # is a no-op.)
        # NOTE(review): the scratch file is left behind in the working
        # directory; consider tempfile.NamedTemporaryFile for cleanup.
        temp_first_page_path = f"temp_{section_title.lower()}.pdf"
        with open(temp_first_page_path, "wb") as f:
            pdf_writer.write(f)

        pdf_viewer(temp_first_page_path)
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
40 |
+
|
41 |
+
|
42 |
+
def find_underwriter(uploaded_file, section_name, keywords_dict):
    """
    Find and display the first PDF page matching the section's keywords.

    Only the last 2/3 of the document is scanned to improve performance;
    keyword sets are tried in priority order, and only the first matching
    page is shown.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit UploadedFile), or None.
        section_name: Key into keywords_dict (e.g. "underwriter").
        keywords_dict: Mapping of section name -> list of keyword sets.

    Returns:
        None
    """
    st.title(section_name.title())

    keyword_sets = keywords_dict.get(section_name, [])
    if not keyword_sets:
        st.error(f"No keywords defined for section: {section_name}")
        return

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        start_page = total_pages // 3  # Skip the first 1/3 of the PDF
        pages = pdf_reader.pages[start_page:]

        # Extract each page's text exactly once and reuse it for every
        # keyword set; extract_text() is the expensive call here, and the
        # original re-ran it for every (set, page) pair. The "or ''" guards
        # against extract_text() returning None on image-only pages.
        texts = [page.extract_text() or "" for page in pages]

        # Keyword sets are ordered by priority: the first set that matches
        # anywhere wins, even if a later set matches an earlier page.
        for keyword_set in keyword_sets:
            for offset, (page, text) in enumerate(zip(pages, texts)):
                page_num = start_page + 1 + offset
                # A page matches when every phrase of the set occurs on it
                # (all(), consistent with find_section; keywords are literal
                # phrases, hence re.escape).
                if all(re.search(re.escape(kw), text, re.IGNORECASE) for kw in keyword_set):
                    pdf_writer = PdfWriter()
                    pdf_writer.add_page(page)

                    # Use the same 1-based page number in the filename as in
                    # the message (the original filename was off by one).
                    temp_page_path = f"temp_{section_name.lower()}_page_{page_num}.pdf"
                    with open(temp_page_path, "wb") as f:
                        pdf_writer.write(f)

                    st.write(f"Keyword found on page {page_num}")
                    pdf_viewer(temp_page_path)
                    return  # Exit after finding the first match

        st.warning(f"No pages contain the specified keywords for {section_name}.")
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
93 |
+
|
94 |
+
|
95 |
+
def _matches_any_set(text, keyword_sets):
    """Return True if every phrase of at least one keyword set occurs in text.

    Keywords are literal phrases (hence re.escape); matching is case-insensitive.
    """
    return any(
        all(re.search(re.escape(kw), text, re.IGNORECASE) for kw in keyword_set)
        for keyword_set in keyword_sets
    )


def _show_extracted(pdf_writer, section_name):
    """Write the collected pages to a scratch PDF and render them, or warn if empty."""
    if len(pdf_writer.pages) > 0:
        temp_section_path = f"temp_{section_name}_section.pdf"
        with open(temp_section_path, "wb") as f:
            pdf_writer.write(f)
        pdf_viewer(temp_section_path)
    else:
        st.warning(f"No pages matched the criteria for {section_name}.")


def find_section(uploaded_file, section_name, keywords_dict, stop_keywords, anti_keywords):
    """
    Extract and display a contiguous run of PDF pages for one section.

    Scans the second half of the document for a page matching the section's
    start keywords, then collects pages until a page matching the stop
    keywords is found. A stop page that also matches the anti-keywords is
    dropped from the output.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).
        section_name: The section to search for (e.g. "income_statement").
        keywords_dict: Mapping of section name -> start keyword sets.
        stop_keywords: Mapping of section name -> stop keyword sets.
        anti_keywords: Mapping of section name -> exclusion keyword sets.

    Returns:
        bool: True if processing completed without interruptions; False if
        extraction stopped early, an error occurred, or no file was uploaded.
    """
    st.title(section_name.replace("_", " ").title())

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        # Stop processing since no file is uploaded
        return False

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)

        # Financial statements sit near the back, so skip the first half.
        start_page = total_pages // 2
        pages = pdf_reader.pages[start_page:]

        section_keywords = keywords_dict.get(section_name, [])
        section_stop_keywords = stop_keywords.get(section_name, [])
        section_anti_keywords = anti_keywords.get(section_name, [])

        pdf_writer = PdfWriter()  # Accumulates the extracted pages
        extraction_started = False

        for page_num, page in enumerate(pages, start=start_page + 1):
            # extract_text() can return None on image-only pages.
            text = page.extract_text() or ""

            if not extraction_started:
                # Look for the section's starting page.
                if not _matches_any_set(text, section_keywords):
                    continue
                st.write(f"Keywords matched on page {page_num}. Starting extraction.")
                pdf_writer.add_page(page)

                if _matches_any_set(text, section_stop_keywords):
                    # The section starts and ends on the same page.
                    st.warning(f"Stop keywords matched on starting page {page_num}. Stopping extraction.")
                    if _matches_any_set(text, section_anti_keywords):
                        st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
                        # NOTE(review): relies on PdfWriter.pages supporting
                        # pop(); verify against the installed PyPDF2 version.
                        pdf_writer.pages.pop()
                    _show_extracted(pdf_writer, section_name)
                    return False
                extraction_started = True
            else:
                # Extraction in progress: collect every page until a stop page.
                pdf_writer.add_page(page)

                if _matches_any_set(text, section_stop_keywords):
                    st.warning(f"Stopping extraction at page {page_num}. Stop keywords matched.")
                    # Anti-keywords are only consulted on the stop page,
                    # mirroring the start-page handling above.
                    if _matches_any_set(text, section_anti_keywords):
                        st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
                        pdf_writer.pages.pop()  # Remove the last added page
                    _show_extracted(pdf_writer, section_name)
                    return False

        # Reached the end of the document without hitting a stop page.
        _show_extracted(pdf_writer, section_name)
        return True

    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
        # Stop processing due to an error
        return False
|