itsanurag committed on
Commit
9890aee
1 Parent(s): 1245952

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -0
app.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ from transformers import DPRQuestionEncoderTokenizer, DPRQuestionEncoder
3
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
4
+ import json
5
+ import faiss
6
+ import numpy as np
7
+ import streamlit as st
8
+
9
+ # Function to extract text from PDF
10
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        One string made of the ``"text"`` extraction of each page, with no
        separator inserted between pages.
    """
    # The context manager closes the document even when a page fails to
    # extract; previously the handle was never closed.
    with fitz.open(pdf_path) as document:
        # join() builds the result in one pass instead of growing a string
        # with ``+=`` once per page.
        return "".join(
            document.load_page(page_num).get_text("text")
            for page_num in range(document.page_count)
        )
17
+
18
+ # Function to chunk text into smaller segments
19
def chunk_text(text, chunk_size=1000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece (default 1000).

    Returns:
        A list of substrings that, joined together, reproduce *text*. The last
        piece may be shorter than *chunk_size*; an empty *text* gives ``[]``.

    Raises:
        ValueError: If *chunk_size* is zero or negative. A negative step would
            otherwise silently produce no chunks at all, discarding the text.
    """
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
21
+
22
+ # Initialize models
23
_RETRIEVER_NAME = 'facebook/dpr-question_encoder-single-nq-base'
_GENERATOR_NAME = 't5-base'


@st.cache_resource
def _load_models():
    """Load the retriever and generator models and their tokenizers once.

    Streamlit re-executes this script on every user interaction; caching the
    loader keeps the four objects alive across reruns instead of calling
    ``from_pretrained`` again each time.

    Returns:
        A tuple ``(retriever_tokenizer, retriever, generator_tokenizer,
        generator)``.
    """
    return (
        DPRQuestionEncoderTokenizer.from_pretrained(_RETRIEVER_NAME),
        DPRQuestionEncoder.from_pretrained(_RETRIEVER_NAME),
        T5Tokenizer.from_pretrained(_GENERATOR_NAME),
        T5ForConditionalGeneration.from_pretrained(_GENERATOR_NAME),
    )


# Module-level names are kept so the functions below can keep using them.
retriever_tokenizer, retriever, generator_tokenizer, generator = _load_models()
27
+
28
+ # Index chunks using FAISS
29
def index_chunks(chunks):
    """Embed each chunk with the module-level retriever and index the vectors.

    Args:
        chunks: Sequence of text strings to embed.

    Returns:
        A tuple ``(index, chunk_embeddings)``: a ``faiss.IndexFlatL2`` holding
        one vector per chunk (in the same order as *chunks*) and the float32
        array of those vectors. For empty *chunks* the index is empty and the
        array has shape ``(0, 768)``.
    """
    # Local import: this file does not import torch at module level.
    import torch

    # Used only when there is nothing to embed, so the dimension cannot be
    # read from the data.
    fallback_dim = 768

    if not chunks:
        # np.vstack([]) raises, so return an empty index instead of crashing.
        empty = np.empty((0, fallback_dim), dtype=np.float32)
        return faiss.IndexFlatL2(fallback_dim), empty

    # NOTE(review): chunks are embedded with the same `retriever` object that
    # encodes queries; confirm that is the intended encoder for passages.
    vectors = []
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        for chunk in chunks:
            # truncation=True clips chunks longer than the tokenizer's limit.
            inputs = retriever_tokenizer(
                chunk, return_tensors='pt', padding=True, truncation=True
            )
            pooled = retriever(**inputs).pooler_output
            vectors.append(pooled.detach().cpu().numpy())

    # FAISS expects a contiguous float32 matrix.
    chunk_embeddings = np.ascontiguousarray(np.vstack(vectors), dtype=np.float32)

    # Size the index from the actual embeddings rather than assuming 768.
    index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
    index.add(chunk_embeddings)
    return index, chunk_embeddings
39
+
40
+ # Function to get answer to a query
41
def get_answer(query, chunks, index, chunk_embeddings, max_length=50):
    """Answer *query* from the chunk nearest to it in the FAISS index.

    The query is embedded with the module-level retriever, the single nearest
    chunk is looked up in *index*, and the module-level generator produces an
    answer from a prompt containing both the question and that chunk.

    Args:
        query: The user's question.
        chunks: The text chunks, in the same order they were added to *index*.
        index: FAISS index searched for the nearest chunk.
        chunk_embeddings: Unused here; kept so existing callers keep working.
        max_length: ``max_length`` passed to ``generator.generate``.

    Returns:
        The decoded answer string, or ``""`` when there is nothing to retrieve
        (no chunks, an empty index, or no valid neighbour returned).
    """
    # Local import: this file does not import torch at module level.
    import torch

    if not chunks or index.ntotal == 0:
        return ""

    with torch.no_grad():  # inference only; skip autograd bookkeeping
        # Encode query using retriever
        query_inputs = retriever_tokenizer(
            query, return_tensors='pt', truncation=True
        )
        question_embedding = (
            retriever(**query_inputs).pooler_output.detach().cpu().numpy()
        )

        # Search for the most relevant chunk
        _, indices = index.search(question_embedding, 1)
        best = int(indices[0][0])
        # FAISS reports a missing neighbour as -1, which would otherwise
        # silently select the last chunk.
        if best < 0 or best >= len(chunks):
            return ""
        retrieved_chunk = chunks[best]

        # The generator previously saw only the chunk, so its output could
        # not depend on the question. Give it both, in the
        # "question: ... context: ..." layout.
        prompt = f"question: {query} context: {retrieved_chunk}"
        input_ids = generator_tokenizer(
            prompt, return_tensors='pt', truncation=True, max_length=512
        ).input_ids
        output_ids = generator.generate(input_ids, max_length=max_length)

    return generator_tokenizer.decode(output_ids[0], skip_special_tokens=True)
56
+
57
+ # Load and process PDF
58
@st.cache_resource
def _build_knowledge_base(pdf_path):
    """Extract, chunk and index the PDF once and reuse it across reruns.

    Streamlit re-executes this script on every interaction; without caching,
    the PDF would be re-read and every chunk re-embedded for each question.

    Args:
        pdf_path: Path of the PDF to load.

    Returns:
        A tuple ``(pdf_text, chunks, index, chunk_embeddings)``.
    """
    text = extract_text_from_pdf(pdf_path)
    pieces = chunk_text(text)
    faiss_index, embeddings = index_chunks(pieces)
    return text, pieces, faiss_index, embeddings


pdf_text, chunks, index, chunk_embeddings = _build_knowledge_base(
    'policy-booklet-0923.pdf'
)

# Streamlit front-end
st.title("RAG-Powered PDF Chatbot")

user_query = st.text_input("Enter your question:")
if user_query:
    answer = get_answer(user_query, chunks, index, chunk_embeddings, max_length=100)
    st.write("Answer:", answer)