File size: 2,406 Bytes
72c8ec0 c4accb1 72c8ec0 c4accb1 a491234 c4accb1 72c8ec0 c4accb1 72c8ec0 c4accb1 a491234 c4accb1 72c8ec0 c4accb1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import os
from transformers import pipeline
import streamlit as st
from PyPDF2 import PdfReader
# Initialize the Hugging Face model pipeline
@st.cache_resource  # keep one shared pipeline object across Streamlit reruns
def load_model():
    """Build the text-classification pipeline for ``fajjos/pdf_model``.

    The function is wrapped in ``st.cache_resource`` (Streamlit >= 1.18),
    which stores the returned object as-is, without hashing or copying it.
    The pipeline is therefore constructed on the first call only and the
    same object is handed back on later script reruns.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        "text-classification" task.
    """
    return pipeline("text-classification", model="fajjos/pdf_model")
# Extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at *pdf_path*.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped. Page texts are joined with no separator.

    Errors are not raised to the caller: any exception while opening or
    reading the file is reported with ``st.error`` and the text collected
    up to that point (possibly ``""``) is returned.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        The extracted text as a single string.
    """
    page_texts = []
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            # Call extract_text() once per page and reuse the result instead
            # of extracting the same page a second time after the check.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    except Exception as e:
        st.error(f"Error reading {pdf_path}: {e}")
    # Joined outside the try so pages read before a failure are still returned.
    return "".join(page_texts)
# Search for the keyword in PDF files
def search_keyword_in_pdfs(folder_path, keyword, model):
    """Return the PDF file names in *folder_path* that match *keyword*.

    A file matches when both conditions hold:

    1. its extracted text contains *keyword* (case-insensitive), and
    2. at least one result returned by *model* for that text has a
       ``"label"`` containing *keyword* (case-insensitive).

    Only the top level of *folder_path* is scanned. Files are selected by a
    case-insensitive ``.pdf`` suffix and processed in sorted name order so
    the result is deterministic.

    Args:
        folder_path: Directory to scan; passed to ``os.listdir``.
        keyword: Text to look for.
        model: Callable invoked as ``model(text, truncation=True)`` and
            expected to return an iterable of dicts with a ``"label"`` key,
            e.g. a transformers text-classification pipeline.

    Returns:
        A list of matching file names (not full paths).

    Raises:
        OSError: If *folder_path* cannot be listed. Errors from the model
            are reported with ``st.error`` and the file is skipped.
    """
    # Lower-case the keyword once instead of on every comparison.
    needle = keyword.lower()
    pdf_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")
    )
    matched_files = []
    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        text = extract_text_from_pdf(pdf_path)
        if not text or needle not in text.lower():
            continue
        # Use the Hugging Face model for additional validation or relevance
        try:
            # truncation=True lets the tokenizer cut documents that exceed
            # the model's maximum input length instead of raising on them.
            result = model(text, truncation=True)
            if any(needle in res["label"].lower() for res in result):
                matched_files.append(pdf_file)
        except Exception as e:
            st.error(f"Error processing {pdf_file} with the model: {e}")
    return matched_files
# Streamlit App UI: page title
st.title("PDF Keyword Search")

# User Inputs: folder to scan and keyword to look for
folder_path = st.text_input("Enter the folder path:")
keyword = st.text_input("Enter the keyword to search:")

# Run the search only when the button is clicked. The folder is validated
# first, then the keyword; the search runs only if both are acceptable.
if st.button("Search PDFs"):
    if not os.path.isdir(folder_path):
        st.error("Invalid folder path. Please enter a valid path.")
    elif not keyword:
        st.error("Please enter a keyword.")
    else:
        st.info("Searching... Please wait.")
        model = load_model()  # Load the model
        matched_files = search_keyword_in_pdfs(folder_path, keyword, model)
        if not matched_files:
            st.warning(f"No PDFs found with the keyword '{keyword}'.")
        else:
            st.success(f"Found the keyword '{keyword}' in the following PDF(s):")
            # One line per matching file name
            for file in matched_files:
                st.write(f"- {file}")
|