Spaces:

fajjos
/

pdf_model

Sleeping

File size: 2,406 Bytes

72c8ec0
c4accb1
72c8ec0
 
 
c4accb1
 
 
 
a491234
c4accb1
 
72c8ec0
c4accb1
 
 
 
 
 
 
72c8ec0
 
c4accb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a491234
c4accb1
72c8ec0
c4accb1

import os
from transformers import pipeline
import streamlit as st
from PyPDF2 import PdfReader

# Initialize the Hugging Face model pipeline
def _model_cache():
    """Return the Streamlit decorator used to cache the loaded model.

    ``st.cache_resource`` is used when the installed Streamlit exposes it.
    Otherwise the legacy ``st.cache`` is used with
    ``allow_output_mutation=True`` so Streamlit does not try to hash the
    returned model object.
    """
    cache_resource = getattr(st, "cache_resource", None)
    if cache_resource is not None:
        return cache_resource
    return st.cache(allow_output_mutation=True)


# NOTE: ``hash_funcs`` maps *types* to hash functions; ``pipeline`` is the
# factory function, not the type of the object it returns, so keying on it
# does not exempt the model from hashing. The decorator above avoids hashing
# the model altogether.
@_model_cache()
def load_model():
    """Build the ``fajjos/pdf_model`` text-classification pipeline.

    Returns:
        The object returned by ``transformers.pipeline``; the caching
        decorator keeps it from being rebuilt on every call.
    """
    return pipeline("text-classification", model="fajjos/pdf_model")

# Extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at *pdf_path*.

    Pages whose ``extract_text()`` result is empty or ``None`` are skipped.
    Page texts are joined without a separator.

    If reading or extraction raises, the error is shown with ``st.error`` and
    whatever text was collected before the failure is returned (possibly the
    empty string), so one unreadable file does not abort a folder scan.
    """
    page_texts = []
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            # Extract once per page: calling extract_text() both for the
            # check and for the value parses every page twice.
            page_text = page.extract_text()
            if page_text:  # Skip None / empty results
                page_texts.append(page_text)
    except Exception as e:
        # Deliberate best-effort: report the problem and keep partial text.
        st.error(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)

# Search for the keyword in PDF files
def search_keyword_in_pdfs(folder_path, keyword, model):
    """Return the PDF file names in *folder_path* that match *keyword*.

    A file is a match when both conditions hold:

    1. its extracted text contains *keyword* (case-insensitive), and
    2. at least one result returned by ``model(text)`` has a ``"label"``
       that contains *keyword* (case-insensitive).

    Args:
        folder_path: Directory to scan (not recursive).
        keyword: Text to look for.
        model: Callable taking the document text and returning an iterable
            of mappings that each have a ``"label"`` string.

    Returns:
        Matching file names (not full paths), sorted by name.

    Model failures for a file are reported with ``st.error`` and that file is
    skipped.
    """
    needle = keyword.lower()  # Lower-case once instead of per file/label

    # Match the extension case-insensitively so "REPORT.PDF" is not missed,
    # and sort because os.listdir() returns entries in arbitrary order.
    pdf_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")
    )
    matched_files = []

    for pdf_file in pdf_files:
        text = extract_text_from_pdf(os.path.join(folder_path, pdf_file))

        if not text or needle not in text.lower():
            continue

        # Use the Hugging Face model for additional validation or relevance.
        # NOTE(review): the whole document text is passed in a single call;
        # if `model` has an input-length limit, long PDFs may fail here and
        # end up in the error branch -- confirm whether truncation is needed.
        try:
            result = model(text)
            if any(needle in res["label"].lower() for res in result):
                matched_files.append(pdf_file)
        except Exception as e:
            st.error(f"Error processing {pdf_file} with the model: {e}")
    return matched_files

# Streamlit App UI
st.title("PDF Keyword Search")

# Collect the folder to scan and the keyword to look for
folder_path = st.text_input("Enter the folder path:")
keyword = st.text_input("Enter the keyword to search:")

# Run the search only when the button is pressed. Inputs are validated in
# the same order as before: folder first, then keyword.
if st.button("Search PDFs"):
    if not os.path.isdir(folder_path):
        st.error("Invalid folder path. Please enter a valid path.")
    elif not keyword:
        st.error("Please enter a keyword.")
    else:
        st.info("Searching... Please wait.")
        model = load_model()  # Load the model
        matched_files = search_keyword_in_pdfs(folder_path, keyword, model)

        if not matched_files:
            st.warning(f"No PDFs found with the keyword '{keyword}'.")
        else:
            st.success(f"Found the keyword '{keyword}' in the following PDF(s):")
            # One write per file so each name is rendered as its own bullet
            for file in matched_files:
                st.write(f"- {file}")