flutterbasit committed on
Commit
854bd7a
·
verified ·
1 Parent(s): 8c59a3f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -44
app.py CHANGED
@@ -1,19 +1,22 @@
1
- # Import necessary libraries
2
  import os
3
  import fitz # For PDF extraction
4
  from sentence_transformers import SentenceTransformer
5
  import faiss
6
  import numpy as np
7
- from groq import Groq
8
  import streamlit as st
9
 
10
  # Function to extract text from a PDF
11
  def extract_text_from_pdf(file):
12
- doc = fitz.open(stream=file.read(), filetype="pdf")
13
- text = ""
14
- for page in doc:
15
- text += page.get_text()
16
- return text
 
 
 
 
17
 
18
  # Function to chunk the text
19
  def chunk_text(text, chunk_size=500):
@@ -35,43 +38,52 @@ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
35
 
36
  # Function to generate embeddings
37
  def generate_embeddings(chunks):
38
- embeddings = embedding_model.encode(chunks)
39
- return embeddings
40
 
41
  # Function to store embeddings in FAISS
42
  def store_embeddings_in_faiss(embeddings):
43
- dimension = embeddings.shape[1]
44
- index = faiss.IndexFlatL2(dimension)
45
- index.add(embeddings)
46
- return index
 
 
 
 
47
 
48
  # Function to retrieve similar chunks
49
  def retrieve_similar_chunks(query, index, chunks, model):
50
- query_embedding = model.encode([query])[0]
51
- distances, indices = index.search(np.array([query_embedding]), k=5)
52
- return [chunks[i] for i in indices[0]]
53
-
54
# Groq API setup
# The key is read from the environment (optionally populated from a local
# .env file); it must never be written into this file, not even in a comment.
# NOTE(review): any key that was previously pasted here should be treated as
# compromised and rotated.
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise ValueError("The GROQ_API_KEY environment variable is not set.")
groq_client = Groq(api_key=groq_api_key)
65
 
66
  def query_llm(prompt, model="llama3-8b-8192"):
67
- response = groq_client.chat.completions.create(
68
- messages=[
69
- {"role": "system", "content": "You are a helpful assistant."},
70
- {"role": "user", "content": prompt},
71
- ],
72
- model=model,
73
- )
74
- return response.choices[0].message.content
 
 
 
 
75
 
76
  # Streamlit application
77
  def main():
@@ -80,21 +92,25 @@ def main():
80
  # File upload
81
  uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
82
  if uploaded_file:
83
- # Step 1: Extract text from PDF
84
  pdf_text = extract_text_from_pdf(uploaded_file)
 
 
 
85
  st.write("PDF Text Extracted:")
86
- st.write(pdf_text[:500]) # Show a preview of the text
87
 
88
- # Step 2: Chunk the text
89
  chunks = chunk_text(pdf_text)
90
- st.write(f"Text has been split into {len(chunks)} chunks.")
91
-
92
- # Step 3: Generate embeddings and store in FAISS
93
  embeddings = np.array(generate_embeddings(chunks))
94
  index = store_embeddings_in_faiss(embeddings)
95
- st.write("Embeddings generated and stored in vector database.")
96
-
97
- # Step 4: User query
 
98
  query = st.text_input("Enter your query:")
99
  if query:
100
  similar_chunks = retrieve_similar_chunks(query, index, chunks, embedding_model)
@@ -102,8 +118,8 @@ def main():
102
  for i, chunk in enumerate(similar_chunks, start=1):
103
  st.write(f"Chunk {i}: {chunk}")
104
 
105
- # Step 5: Query the LLM using Groq API
106
- combined_context = " ".join(similar_chunks[:3]) # Combine top 3 chunks
107
  llm_prompt = f"Context: {combined_context}\n\nQuery: {query}"
108
  llm_response = query_llm(llm_prompt)
109
  st.write("LLM Response:")
 
 
1
  import os
2
  import fitz # For PDF extraction
3
  from sentence_transformers import SentenceTransformer
4
  import faiss
5
  import numpy as np
6
+ from dotenv import load_dotenv
7
  import streamlit as st
8
 
9
  # Function to extract text from a PDF
10
  def extract_text_from_pdf(file):
11
+ try:
12
+ doc = fitz.open(stream=file.read(), filetype="pdf")
13
+ text = ""
14
+ for page in doc:
15
+ text += page.get_text()
16
+ return text
17
+ except Exception as e:
18
+ st.error(f"Error extracting text: {e}")
19
+ return ""
20
 
21
  # Function to chunk the text
22
  def chunk_text(text, chunk_size=500):
 
38
 
39
  # Function to generate embeddings
40
  def generate_embeddings(chunks):
41
+ return embedding_model.encode(chunks)
 
42
 
43
  # Function to store embeddings in FAISS
44
  def store_embeddings_in_faiss(embeddings):
45
+ try:
46
+ dimension = embeddings.shape[1]
47
+ index = faiss.IndexFlatL2(dimension)
48
+ index.add(embeddings)
49
+ return index
50
+ except Exception as e:
51
+ st.error(f"Error with FAISS: {e}")
52
+ return None
53
 
54
  # Function to retrieve similar chunks
55
  def retrieve_similar_chunks(query, index, chunks, model):
56
+ try:
57
+ query_embedding = model.encode([query])[0]
58
+ distances, indices = index.search(np.array([query_embedding]), k=5)
59
+ return [chunks[i] for i in indices[0]]
60
+ except Exception as e:
61
+ st.error(f"Error retrieving similar chunks: {e}")
62
+ return []
 
 
63
 
64
# Load environment variables (including any defined in a local .env file).
load_dotenv()
# Look the key up by the NAME of the variable. Passing the secret itself to
# os.getenv() both leaks it in source and always yields None.
# NOTE(review): a key that was committed to this file should be treated as
# compromised and rotated.
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    st.error("The GROQ_API_KEY environment variable is not set.")
    # st.stop() halts this Streamlit script run after the message is shown.
    st.stop()

# Initialize Groq client
# Imported here because the file's import section does not import Groq.
from groq import Groq

groq_client = Groq(api_key=groq_api_key)
73
 
74
  def query_llm(prompt, model="llama3-8b-8192"):
75
+ try:
76
+ response = groq_client.chat.completions.create(
77
+ messages=[
78
+ {"role": "system", "content": "You are a helpful assistant."},
79
+ {"role": "user", "content": prompt},
80
+ ],
81
+ model=model,
82
+ )
83
+ return response.choices[0].message.content
84
+ except Exception as e:
85
+ st.error(f"Error querying LLM: {e}")
86
+ return "Error in LLM response."
87
 
88
  # Streamlit application
89
  def main():
 
92
  # File upload
93
  uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
94
  if uploaded_file:
95
+ # Extract text
96
  pdf_text = extract_text_from_pdf(uploaded_file)
97
+ if not pdf_text:
98
+ return
99
+
100
  st.write("PDF Text Extracted:")
101
+ st.write(pdf_text[:500]) # Show a preview
102
 
103
+ # Chunk the text
104
  chunks = chunk_text(pdf_text)
105
+ st.write(f"Text split into {len(chunks)} chunks.")
106
+
107
+ # Generate embeddings
108
  embeddings = np.array(generate_embeddings(chunks))
109
  index = store_embeddings_in_faiss(embeddings)
110
+ if index is None:
111
+ return
112
+
113
+ # Query handling
114
  query = st.text_input("Enter your query:")
115
  if query:
116
  similar_chunks = retrieve_similar_chunks(query, index, chunks, embedding_model)
 
118
  for i, chunk in enumerate(similar_chunks, start=1):
119
  st.write(f"Chunk {i}: {chunk}")
120
 
121
+ # Query the LLM
122
+ combined_context = " ".join(similar_chunks[:3])
123
  llm_prompt = f"Context: {combined_context}\n\nQuery: {query}"
124
  llm_response = query_llm(llm_prompt)
125
  st.write("LLM Response:")