Shankarm08 committed on
Commit
bc28c5c
·
verified ·
1 Parent(s): 33015fa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -31
app.py CHANGED
@@ -3,22 +3,13 @@ import torch
3
  from transformers import BertTokenizer, BertModel
4
  import pdfplumber
5
 
6
- # Load the pre-trained BERT model and tokenizer outside the function for efficiency
7
  model_name = "bert-base-uncased"
8
  tokenizer = BertTokenizer.from_pretrained(model_name)
9
  model = BertModel.from_pretrained(model_name)
10
 
11
- # Define a function to extract text from a PDF
12
- def extract_text_from_pdf(pdf_file):
13
- with pdfplumber.open(pdf_file) as pdf:
14
- text = ""
15
- for page in pdf.pages:
16
- text += page.extract_text()
17
- return text
18
-
19
- # Define a function to classify the extracted text
20
- def classify_text(text):
21
- # Preprocess the input text
22
  inputs = tokenizer.encode_plus(
23
  text,
24
  add_special_tokens=True,
@@ -26,29 +17,44 @@ def classify_text(text):
26
  return_attention_mask=True,
27
  return_tensors='pt'
28
  )
29
-
30
- # Use the pre-trained BERT model to extract features from the input text
31
  outputs = model(**inputs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- # Extract the features
34
- features = outputs.last_hidden_state[:, 0, :]
35
 
36
- return features.tolist()
 
 
 
37
 
38
- # Streamlit app setup
39
- st.title("PDF Text Classification")
40
- st.write("Upload a PDF file to classify its text using BERT")
41
 
42
- # File uploader for PDFs
43
- pdf_file = st.file_uploader("Choose a PDF file", type="pdf")
 
 
 
 
44
 
45
- if pdf_file is not None:
46
- # Extract text from the uploaded PDF
47
- extracted_text = extract_text_from_pdf(pdf_file)
48
- st.write("Extracted Text:")
49
- st.write(extracted_text)
50
 
51
- # Classify the extracted text
52
- if st.button("Classify"):
53
- features = classify_text(extracted_text)
54
- st.json({"features": features}) # Display the features in JSON format
 
3
  from transformers import BertTokenizer, BertModel
4
  import pdfplumber
5
 
6
+ # Load the pre-trained BERT model and tokenizer once
7
  model_name = "bert-base-uncased"
8
  tokenizer = BertTokenizer.from_pretrained(model_name)
9
  model = BertModel.from_pretrained(model_name)
10
 
11
+ # Function to get BERT embeddings
12
+ def get_embeddings(text):
 
 
 
 
 
 
 
 
 
13
  inputs = tokenizer.encode_plus(
14
  text,
15
  add_special_tokens=True,
 
17
  return_attention_mask=True,
18
  return_tensors='pt'
19
  )
 
 
20
  outputs = model(**inputs)
21
+ return outputs.last_hidden_state[:, 0, :].detach().numpy()
22
+
23
# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, one page per newline-terminated chunk.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts; the app passes the
            object returned by ``st.file_uploader``.

    Returns:
        str: The extracted text of each page followed by ``"\\n"``, in page
        order. Pages that yield no text contribute only the newline.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() can yield None instead of a string for pages with no
        # extractable text (e.g. scanned images on older pdfplumber releases);
        # `or ""` keeps the concatenation from raising TypeError.
        return "".join(
            (page.extract_text() or "") + "\n"  # newline separates pages
            for page in pdf.pages
        )
30
+
31
# Text and embedding of the currently loaded PDF; they stay empty / None
# until a file has been uploaded and processed.
pdf_text = ""
pdf_embeddings = None

# Streamlit app
st.title("PDF Chatbot using BERT")

# PDF file upload
pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if pdf_file is not None:
    # Streamlit re-executes this script on every widget interaction, so the
    # parsed text and its embedding are kept in session state and only
    # recomputed when a different file (by name and size) is uploaded.
    pdf_key = (pdf_file.name, pdf_file.size)
    if st.session_state.get("pdf_key") != pdf_key:
        st.session_state["pdf_text"] = extract_text_from_pdf(pdf_file)
        st.session_state["pdf_embeddings"] = get_embeddings(
            st.session_state["pdf_text"]
        )
        # Record the key last so a failure above is retried on the next run.
        st.session_state["pdf_key"] = pdf_key
    pdf_text = st.session_state["pdf_text"]
    pdf_embeddings = st.session_state["pdf_embeddings"]
    st.success("PDF loaded successfully!")

# User input for chatbot
user_input = st.text_input("Ask a question about the PDF:")

if st.button("Get Response"):
    if pdf_text == "":
        st.warning("Please upload a PDF file first.")
    elif not user_input.strip():
        # Nothing to answer; avoid embedding an empty question.
        st.warning("Please enter a question.")
    else:
        # Get embeddings for user input. Not used yet: this is the input for
        # the similarity matching against pdf_embeddings that is still to be
        # implemented.
        user_embeddings = get_embeddings(user_input)

        # For demonstration, simply return the PDF text.
        # Implement similarity matching logic here as needed.
        st.write("### Response:")
        st.write(pdf_text)  # For simplicity, returning all text
 
60