Pontonkid commited on
Commit
54202cd
·
verified ·
1 Parent(s): 07f084c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -0
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import pickle

import faiss
import fitz  # PyMuPDF for PDF text extraction
import numpy as np
import streamlit as st
from dotenv import load_dotenv
from groq import Groq
from sentence_transformers import SentenceTransformer

# Load environment variables from a local .env file (e.g. GROQ_API_KEY).
load_dotenv()

# Initialize the Groq client.
# SECURITY: the key must come from the environment, never be hard-coded.
# The original committed a literal API key — any key published in a repo
# is compromised and must be revoked/rotated immediately.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Sentence-transformer model used to embed both document chunks and queries.
# all-MiniLM-L6-v2 emits 384-dimensional vectors (must match the FAISS index).
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Path where the pickled FAISS index is persisted between runs.
INDEX_FILE = "faiss_index.pkl"
22
+
23
def load_faiss_index():
    """Return the persisted FAISS index if one exists, else a fresh L2 index.

    The index is pickled at INDEX_FILE; the dimension 384 matches the
    output size of the all-MiniLM-L6-v2 embedding model.
    """
    if not os.path.exists(INDEX_FILE):
        return faiss.IndexFlatL2(384)
    with open(INDEX_FILE, "rb") as f:
        return pickle.load(f)
28
+
29
# Module-level state shared by the indexing and query helpers below.
index = load_faiss_index()
# Text chunks in insertion order; position i corresponds to FAISS row id i.
# NOTE(review): this list is NOT persisted alongside the pickled index, so
# after a restart the loaded index holds vectors with no backing text —
# query_faiss would then return nothing; confirm whether this is intended.
documents = []
31
+
32
def extract_text_from_pdf(pdf_file):
    """Extract and return the full text of a PDF as one newline-joined string.

    Args:
        pdf_file: Path to a PDF file on disk.

    Returns:
        The concatenated text of every page, pages separated by newlines.
    """
    # Context manager ensures the document handle is closed even if a page
    # raises mid-extraction (the original leaked the open handle).
    with fitz.open(pdf_file) as doc:
        return "\n".join(page.get_text() for page in doc)
35
+
36
def chunk_text(text, chunk_size=500, overlap=100):
    """Split text into fixed-size character chunks with overlap.

    Consecutive chunks share `overlap` characters so content cut at a
    chunk boundary still appears intact in at least one chunk.

    Args:
        text: Source string (may be empty).
        chunk_size: Maximum characters per chunk; must be positive.
        overlap: Characters shared between consecutive chunks; must be
            smaller than chunk_size.

    Returns:
        List of chunk strings; an empty list for empty input.

    Raises:
        ValueError: If chunk_size <= 0 or overlap >= chunk_size. (The
            original silently returned [] when overlap > chunk_size made
            the range step negative — silent data loss — and raised an
            opaque range() error when they were equal.)
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        raise ValueError("chunk_size must be positive and overlap < chunk_size")
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]
38
+
39
def add_to_faiss(text_chunks):
    """Embed text chunks, add them to the FAISS index, and persist the index.

    Args:
        text_chunks: List of strings to embed and index.

    Side effects:
        - Appends vectors to the module-level `index`.
        - Extends the module-level `documents` list (row i of the index
          corresponds to documents[i]).
        - Overwrites INDEX_FILE with the pickled index.

    NOTE(review): only the index is pickled — `documents` is not saved, so
    row ids cannot be mapped back to text after a restart; confirm whether
    the chunks should be persisted alongside the index.
    """
    global index, documents
    embeddings = embedding_model.encode(text_chunks)
    # FAISS expects float32 vectors.
    index.add(np.array(embeddings, dtype=np.float32))
    documents.extend(text_chunks)
    with open(INDEX_FILE, "wb") as f:
        pickle.dump(index, f)
46
+
47
def query_faiss(query, top_k=3):
    """Return up to `top_k` stored text chunks closest to `query`.

    The query is embedded with the same sentence-transformer as the
    documents, searched against the module-level FAISS index (L2 distance),
    and the matching row ids are mapped back to the `documents` list.
    """
    embedded = embedding_model.encode([query])
    vectors = np.array(embedded, dtype=np.float32)
    _, ids = index.search(vectors, top_k)
    results = []
    for row_id in ids[0]:
        if row_id < len(documents):
            results.append(documents[row_id])
    return results
51
+
52
def query_groq(prompt):
    """Send a single-turn chat prompt to the Groq API and return the reply.

    Any API failure is reported as a string instead of being raised, so the
    Streamlit UI can render the error inline.
    """
    messages = [{"role": "user", "content": prompt}]
    try:
        completion = client.chat.completions.create(
            messages=messages,
            model="llama-3.3-70b-versatile"
        )
    except Exception as e:
        return f"⚠️ Error: {str(e)}"
    return completion.choices[0].message.content
61
+
62
# --- Streamlit UI -----------------------------------------------------------
# Page-level configuration must be the first st.* call in the script.
st.set_page_config(page_title="RAG-based PDF Chatbot", page_icon="📄", layout="wide")

st.title("📄 RAG-based PDF Chatbot")
st.markdown("Talk to your PDFs using AI-powered search!")

# Sidebar: PDF upload widget.
with st.sidebar:
    st.subheader("📤 Upload a PDF")
    uploaded_file = st.file_uploader("Drag & drop or browse", type="pdf")

# Ingest pipeline: save upload to disk, extract text, chunk, embed, index.
if uploaded_file:
    with st.spinner("Processing your PDF..."):
        # Persist the upload to a fixed local path so PyMuPDF can open it.
        # NOTE(review): a fixed filename means concurrent users overwrite
        # each other's upload — confirm single-user deployment.
        with open("uploaded.pdf", "wb") as f:
            f.write(uploaded_file.getbuffer())

        text = extract_text_from_pdf("uploaded.pdf")
        text_chunks = chunk_text(text)
        add_to_faiss(text_chunks)

    st.sidebar.success("✅ PDF uploaded and indexed!")

    # Collapsed preview of the first 1000 extracted characters.
    with st.expander("📃 Extracted Text Preview", expanded=False):
        st.text(text[:1000] + "...")

st.markdown("---")
st.subheader("🔍 Ask something about the document")
query = st.text_input("Type your question below:")

# Retrieval-augmented answering: fetch similar chunks, then ask the LLM
# with the retrieved context prepended to the user's question.
if query:
    retrieved_texts = query_faiss(query)

    if retrieved_texts:
        context = "\n".join(retrieved_texts)

        with st.expander("📖 Retrieved Context", expanded=False):
            st.text(context[:1000] + "...")

        response = query_groq(f"Context:\n{context}\n\nUser Query:\n{query}")

        st.subheader("💬 AI Response")
        st.markdown(f"**{response}**")
    else:
        st.warning("⚠️ No relevant context found in the document!")