import os

import streamlit as st
from PyPDF2 import PdfReader
from transformers import pipeline


# Initialize the Hugging Face model pipeline
@st.cache_resource  # Cache the loaded pipeline across reruns (replaces the deprecated st.cache/hash_funcs pattern)
def load_model():
    return pipeline("text-classification", model="fajjos/pdf_model")


# Extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:  # Ensure text is not None
                text += page_text
    except Exception as e:
        st.error(f"Error reading {pdf_path}: {e}")
    return text


# Search for the keyword in PDF files
def search_keyword_in_pdfs(folder_path, keyword, model):
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith(".pdf")]
    matched_files = []
    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        text = extract_text_from_pdf(pdf_path)
        if text and keyword.lower() in text.lower():  # Case-insensitive search
            # Use the Hugging Face model for additional validation or relevance
            try:
                result = model(text, truncation=True)  # Truncate long documents to the model's max input length
                if any(keyword.lower() in res["label"].lower() for res in result):
                    matched_files.append(pdf_file)
            except Exception as e:
                st.error(f"Error processing {pdf_file} with the model: {e}")
    return matched_files


# Streamlit App UI
st.title("PDF Keyword Search")

# User Inputs
folder_path = st.text_input("Enter the folder path:")
keyword = st.text_input("Enter the keyword to search:")

# Button to perform the search
if st.button("Search PDFs"):
    if os.path.isdir(folder_path):
        if keyword:
            st.info("Searching... Please wait.")
            model = load_model()  # Load the cached model
            matched_files = search_keyword_in_pdfs(folder_path, keyword, model)
            if matched_files:
                st.success(f"Found the keyword '{keyword}' in the following PDF(s):")
                for file in matched_files:
                    st.write(f"- {file}")
            else:
                st.warning(f"No PDFs found with the keyword '{keyword}'.")
        else:
            st.error("Please enter a keyword.")
    else:
        st.error("Invalid folder path. Please enter a valid path.")
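
# Usage note (assumption: the script is saved as app.py and streamlit,
# transformers, torch, and PyPDF2 are installed in the environment):
#     streamlit run app.py
# The app then prompts for a folder path and a keyword in the browser UI.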