# Hosting-page residue (not Python source) — commented out so the module parses:
# Spaces:
# Sleeping
# Sleeping
import re
from io import BytesIO

import pandas as pd
import pdfplumber
import requests
import streamlit as st
# --- Streamlit UI ----------------------------------------------------------
st.title("🔍 Find Information From Noticeboard")
st.write("Provide at least one user input (Name or Form Number).")

# Google Drive share links for the searchable PDFs, keyed by display name.
google_drive_files = {
    "CCN Updated Attendance": "https://drive.google.com/file/d/17OqE5oSlXHlPlIzu7TLhnQXRWsE6TxQ1/view?usp=sharing",
}

# User inputs: either field alone is enough to run a search.
name = st.text_input("Enter Name:", placeholder="Enter the full name to search for (e.g., John Doe).")
form_no = st.text_input("Enter Form Number:", placeholder="Enter the form number as it appears in the document.")
# Function to download PDFs from Google Drive links | |
def download_pdf_from_drive(link): | |
try: | |
file_id = link.split("/")[-2] # Extract the file ID from the link | |
download_url = f"https://drive.google.com/uc?id={file_id}&export=download" | |
response = requests.get(download_url, stream=True) | |
if response.status_code == 200: | |
return BytesIO(response.content) | |
else: | |
st.warning(f"β Failed to download file: {link} (Status code: {response.status_code})") | |
return None | |
except Exception as e: | |
st.warning(f"β Error: {e}") | |
return None | |
# Function to extract column headers dynamically | |
def detect_column_headers(pdf_file):
    """Scan the first three pages of *pdf_file* for a plausible header row.

    Prefers the first row of an extracted table; otherwise falls back to
    splitting text lines on runs of whitespace or tabs. Returns a list of
    column names, or None when nothing header-like is found.
    """
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages[:3]:
            grid = page.extract_table()
            if grid and len(grid) > 1 and grid[0] is not None:
                # First table row is taken to be the header.
                return grid[0]
            page_text = page.extract_text()
            if not page_text:
                continue
            for raw_line in page_text.split("\n"):
                cells = re.split(r'\s{2,}|\t', raw_line.strip())
                if len(cells) > 2:
                    return cells
    return None
# Function to extract relevant rows dynamically | |
def extract_relevant_rows(pdf_file, search_terms, file_name, detected_headers): | |
relevant_rows = [] | |
with pdfplumber.open(pdf_file) as pdf: | |
for page_num, page in enumerate(pdf.pages, start=1): | |
table = page.extract_table() | |
if table and len(table) > 1: | |
for row in table[1:]: | |
if row and any(term.lower() in " ".join(map(str, row)).lower() for term in search_terms): | |
row_dict = {"File Name": file_name, "Page": page_num} | |
row_dict.update({detected_headers[i]: row[i] if i < len(row) else None for i in range(len(detected_headers))}) | |
relevant_rows.append(row_dict) | |
else: | |
text = page.extract_text() | |
if text: | |
lines = text.split("\n") | |
for line in lines: | |
for term in search_terms: | |
if term.lower() in line.lower(): | |
columns = re.split(r'\s{2,}|\t', line.strip()) | |
row_dict = {"File Name": file_name, "Page": page_num} | |
row_dict.update({detected_headers[i]: columns[i] if i < len(columns) else None for i in range(len(detected_headers))}) | |
relevant_rows.append(row_dict) | |
return relevant_rows | |
# Process the request when the button is clicked | |
# --- Search handler --------------------------------------------------------
if st.button("🔍 Find Information"):
    if not (name or form_no):
        st.error("❌ Please provide at least one input (Name or Form Number).")
    else:
        search_terms = [term for term in [name, form_no] if term]
        all_relevant_rows = []
        for file_name, link in google_drive_files.items():
            pdf_file = download_pdf_from_drive(link)
            if not pdf_file:
                continue
            detected_headers = detect_column_headers(pdf_file)
            if not detected_headers:
                st.warning(f"❌ Unable to detect headers in {file_name}.")
                continue
            # detect_column_headers consumed the in-memory stream; rewind it
            # before the second pdfplumber pass, or extraction reads from EOF.
            pdf_file.seek(0)
            all_relevant_rows.extend(
                extract_relevant_rows(pdf_file, search_terms, file_name, detected_headers)
            )
        if all_relevant_rows:
            st.success("✅ Relevant information found:")
            # fillna keeps the table readable where cells were missing.
            df = pd.DataFrame(all_relevant_rows).fillna("-")
            st.dataframe(df)
        else:
            st.warning("❌ No matching information found in the provided files.")
# --- Footer ----------------------------------------------------------------
st.markdown("---")
_FOOTER_HTML = (
    "<p style='text-align: center; font-size: 14px;'>Designed by: "
    "<b>Engr. Makhdoom Muhammad Naeem Hashmi</b></p>"
)
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)