Spaces:
Running
Running
Upload 4 files
Browse files- app.py +22 -0
- config.py +48 -0
- pages.py +46 -0
- section_extract.py +209 -0
app.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st

from pages import home, cover, underwriter, income_statement, balance_sheet, cash_flow

# Sidebar navigation layout: an unlabelled group for Home,
# followed by two labelled groups of related pages.
pages = {
    "": [
        st.Page(home, title="Home"),
    ],
    "IPO Info:": [
        st.Page(cover, title="Cover"),
        st.Page(underwriter, title="Underwriter"),
    ],
    "Financial:": [
        st.Page(income_statement, title="Income Statement"),
        st.Page(balance_sheet, title="Balance Sheet"),
        st.Page(cash_flow, title="Cash Flow"),
    ],
}

# Build the navigation widget from the grouped pages and run the selected one.
st.navigation(pages).run()
|
config.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Keyword sets that locate the page where each prospectus section starts.
# Structure: section name -> list of alternative sets; a page matches a
# section when it contains every phrase of at least one set. Sets are
# ordered by priority (most specific first).
keywords_dict = {
    "underwriter": [
        ["keterangan tentang penjaminan emisi efek"],
        ["susunan dan jumlah porsi penjaminan"],
    ],
    "balance_sheet": [
        ["laporan posisi keuangan", "cash and cash equivalent", "catatan/"],
        ["laporan posisi keuangan", "cash", "total assets", "catatan/"],
        ["laporan posisi keuangan", "piutang", "jumlah aset", "catatan"],
        ["laporan posisi keuangan", "piutang", "total aset", "catatan"],
        ["consolidated statement", "piutang", "total aset", "catatan/"],
        ["piutang", "total aset", "notes"],
        ["piutang", "jumlah aset", "notes"],
    ],
    "cash_flow": [
        ["laporan arus kas", "arus kas dari", "aktivitas operasi", "catatan/"],
        ["laporan arus kas", "arus kas dari", "catatan/"],
        ["laporan arus kas", "arus kas dari", "catatan"],
        ["arus kas dari", "aktivitas operasi", "catatan"],
    ],
    "income_statement": [
        ["laporan laba rugi", "penjualan", "pokok penjualan", "catatan/"],
        ["laporan laba rugi", "revenues", "beban pokok", "catatan/"],
        ["laporan laba rugi", "revenue", "beban pokok", "catatan/"],
        ["laporan laba rugi", "penjualan", "beban pokok", "catatan"],
        ["laporan laba rugi", "pendapatan", "beban pokok", "catatan"],
        ["laporan laba rugi", "income", "catatan/"],
        ["laporan laba rugi", "pendapatan", "catatan/"],
        ["laporan laba rugi", "pendapatan usaha", "catatan"],
        ["laporan laba rugi", "pendapatan", "catatan"],
        ["penjualan", "beban pokok", "catatan"],
    ],
}

# Once extraction has started, a page matching any of these sets is the
# section's last page: extraction stops there.
stop_keywords = {
    "balance_sheet": [["laba per saham"], ["jumlah ekuitas"], ["total ekuitas"]],
    "cash_flow": [["kas dan setara kas"], ["kas dan bank"], ["kas dan setara"]],
    "income_statement": [["per saham"], ["total comprehensive"], ["laba komprehensif"], ["laba bersih per"]],
}

# Pages matching any of these sets are dropped from the extracted results.
anti_keywords = {
    "balance_sheet": [],
    "cash_flow": [],
    "income_statement": [["laporan perubahan ekuitas"], ["laporan arus kas"]],
}
|
pages.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from section_extract import find_cover, find_underwriter, find_section
|
3 |
+
from config import keywords_dict, stop_keywords, anti_keywords
|
4 |
+
|
5 |
+
def home():
    """Landing page: show a short intro and stash the uploaded PDF in session state."""
    st.title("Prospectus Lens")
    st.write("Welcome to the Prospectus Lens! Upload the PDF of the prospectus below!")
    pdf_file = st.file_uploader(
        "Upload your Prospectus File",
        accept_multiple_files=False,
        type=["pdf"],
    )
    # Store the file so the other pages can read it back from session state.
    st.session_state["uploaded_file"] = pdf_file
+
|
11 |
+
def cover():
    """Render the cover (first page) of the PDF uploaded on the Home page."""
    find_cover(uploaded_file=st.session_state.get("uploaded_file"))
+
|
14 |
+
def underwriter():
    """Locate and render the underwriter section of the uploaded PDF."""
    pdf_file = st.session_state.get("uploaded_file")
    find_underwriter(
        uploaded_file=pdf_file,
        section_name="underwriter",
        keywords_dict=keywords_dict,
    )
20 |
+
|
21 |
+
def income_statement():
    """Locate and render the income-statement pages of the uploaded PDF."""
    find_section(
        uploaded_file=st.session_state.get("uploaded_file"),
        section_name="income_statement",
        keywords_dict=keywords_dict,
        stop_keywords=stop_keywords,
        anti_keywords=anti_keywords,
    )
29 |
+
|
30 |
+
def balance_sheet():
    """Locate and render the balance-sheet pages of the uploaded PDF."""
    pdf_file = st.session_state.get("uploaded_file")
    find_section(
        uploaded_file=pdf_file,
        section_name="balance_sheet",
        keywords_dict=keywords_dict,
        stop_keywords=stop_keywords,
        anti_keywords=anti_keywords,
    )
38 |
+
|
39 |
+
def cash_flow():
    """Locate and render the cash-flow pages of the uploaded PDF."""
    pdf_file = st.session_state.get("uploaded_file")
    find_section(
        uploaded_file=pdf_file,
        section_name="cash_flow",
        keywords_dict=keywords_dict,
        stop_keywords=stop_keywords,
        anti_keywords=anti_keywords,
    )
|
section_extract.py
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
from PyPDF2 import PdfReader, PdfWriter
|
4 |
+
from streamlit_pdf_viewer import pdf_viewer
|
5 |
+
import streamlit as st
|
6 |
+
|
7 |
+
def find_cover(uploaded_file):
    """
    Extract the first page of the uploaded prospectus PDF and render it.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit UploadedFile), or None
            if nothing has been uploaded yet.

    Returns:
        None
    """
    section_title = "Cover"
    st.title(section_title)

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return

    try:
        # Copy only the first page into a fresh single-page document.
        pdf_reader = PdfReader(uploaded_file)
        pdf_writer = PdfWriter()
        pdf_writer.add_page(pdf_reader.pages[0])

        # pdf_viewer takes a file path, so write the page to a scratch file.
        # (The original wrapped this in a single-argument os.path.join, which
        # is a no-op.)
        # NOTE(review): the scratch file is left behind in the working
        # directory; consider tempfile.NamedTemporaryFile for cleanup.
        temp_first_page_path = f"temp_{section_title.lower()}.pdf"
        with open(temp_first_page_path, "wb") as f:
            pdf_writer.write(f)

        pdf_viewer(temp_first_page_path)
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
40 |
+
|
41 |
+
|
42 |
+
def find_underwriter(uploaded_file, section_name, keywords_dict):
    """
    Find and display the first PDF page matching the section's keywords.

    Only the last 2/3 of the document is scanned to improve performance;
    keyword sets are tried in priority order, and only the first matching
    page is shown.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit UploadedFile), or None.
        section_name: Key into keywords_dict (e.g. "underwriter").
        keywords_dict: Mapping of section name -> list of keyword sets.

    Returns:
        None
    """
    st.title(section_name.title())

    keyword_sets = keywords_dict.get(section_name, [])
    if not keyword_sets:
        st.error(f"No keywords defined for section: {section_name}")
        return

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        start_page = total_pages // 3  # Skip the first 1/3 of the PDF
        pages = pdf_reader.pages[start_page:]

        # Extract each page's text exactly once and reuse it for every
        # keyword set; extract_text() is the expensive call here, and the
        # original re-ran it for every (set, page) pair. The "or ''" guards
        # against extract_text() returning None on image-only pages.
        texts = [page.extract_text() or "" for page in pages]

        # Keyword sets are ordered by priority: the first set that matches
        # anywhere wins, even if a later set matches an earlier page.
        for keyword_set in keyword_sets:
            for offset, (page, text) in enumerate(zip(pages, texts)):
                page_num = start_page + 1 + offset
                # A page matches when every phrase of the set occurs on it
                # (all(), consistent with find_section; keywords are literal
                # phrases, hence re.escape).
                if all(re.search(re.escape(kw), text, re.IGNORECASE) for kw in keyword_set):
                    pdf_writer = PdfWriter()
                    pdf_writer.add_page(page)

                    # Use the same 1-based page number in the filename as in
                    # the message (the original filename was off by one).
                    temp_page_path = f"temp_{section_name.lower()}_page_{page_num}.pdf"
                    with open(temp_page_path, "wb") as f:
                        pdf_writer.write(f)

                    st.write(f"Keyword found on page {page_num}")
                    pdf_viewer(temp_page_path)
                    return  # Exit after finding the first match

        st.warning(f"No pages contain the specified keywords for {section_name}.")
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
93 |
+
|
94 |
+
|
95 |
+
def _matches_any_set(text, keyword_sets):
    """Return True if every phrase of at least one keyword set occurs in text.

    Keywords are literal phrases (hence re.escape); matching is case-insensitive.
    """
    return any(
        all(re.search(re.escape(kw), text, re.IGNORECASE) for kw in keyword_set)
        for keyword_set in keyword_sets
    )


def _show_extracted(pdf_writer, section_name):
    """Write the collected pages to a scratch PDF and render them, or warn if empty."""
    if len(pdf_writer.pages) > 0:
        temp_section_path = f"temp_{section_name}_section.pdf"
        with open(temp_section_path, "wb") as f:
            pdf_writer.write(f)
        pdf_viewer(temp_section_path)
    else:
        st.warning(f"No pages matched the criteria for {section_name}.")


def find_section(uploaded_file, section_name, keywords_dict, stop_keywords, anti_keywords):
    """
    Extract and display a contiguous run of PDF pages for one section.

    Scans the second half of the document for a page matching the section's
    start keywords, then collects pages until a page matching the stop
    keywords is found. A stop page that also matches the anti-keywords is
    dropped from the output.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).
        section_name: The section to search for (e.g. "income_statement").
        keywords_dict: Mapping of section name -> start keyword sets.
        stop_keywords: Mapping of section name -> stop keyword sets.
        anti_keywords: Mapping of section name -> exclusion keyword sets.

    Returns:
        bool: True if processing completed without interruptions; False if
        extraction stopped early, an error occurred, or no file was uploaded.
    """
    st.title(section_name.replace("_", " ").title())

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        # Stop processing since no file is uploaded
        return False

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)

        # Financial statements sit near the back, so skip the first half.
        start_page = total_pages // 2
        pages = pdf_reader.pages[start_page:]

        section_keywords = keywords_dict.get(section_name, [])
        section_stop_keywords = stop_keywords.get(section_name, [])
        section_anti_keywords = anti_keywords.get(section_name, [])

        pdf_writer = PdfWriter()  # Accumulates the extracted pages
        extraction_started = False

        for page_num, page in enumerate(pages, start=start_page + 1):
            # extract_text() can return None on image-only pages.
            text = page.extract_text() or ""

            if not extraction_started:
                # Look for the section's starting page.
                if not _matches_any_set(text, section_keywords):
                    continue
                st.write(f"Keywords matched on page {page_num}. Starting extraction.")
                pdf_writer.add_page(page)

                if _matches_any_set(text, section_stop_keywords):
                    # The section starts and ends on the same page.
                    st.warning(f"Stop keywords matched on starting page {page_num}. Stopping extraction.")
                    if _matches_any_set(text, section_anti_keywords):
                        st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
                        # NOTE(review): relies on PdfWriter.pages supporting
                        # pop(); verify against the installed PyPDF2 version.
                        pdf_writer.pages.pop()
                    _show_extracted(pdf_writer, section_name)
                    return False
                extraction_started = True
            else:
                # Extraction in progress: collect every page until a stop page.
                pdf_writer.add_page(page)

                if _matches_any_set(text, section_stop_keywords):
                    st.warning(f"Stopping extraction at page {page_num}. Stop keywords matched.")
                    # Anti-keywords are only consulted on the stop page,
                    # mirroring the start-page handling above.
                    if _matches_any_set(text, section_anti_keywords):
                        st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
                        pdf_writer.pages.pop()  # Remove the last added page
                    _show_extracted(pdf_writer, section_name)
                    return False

        # Reached the end of the document without hitting a stop page.
        _show_extracted(pdf_writer, section_name)
        return True

    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
        # Stop processing due to an error
        return False
|