Talha812 committed
Commit 4d4a229 · verified · 1 Parent(s): e05da15

Update app.py

Files changed (1)
  1. app.py +214 -36
app.py CHANGED
@@ -1,25 +1,189 @@
  import streamlit as st
  import torch
  from transformers import GPTNeoXForCausalLM, AutoTokenizer
  from sentence_transformers import SentenceTransformer
  import faiss
- import fitz # PyMuPDF
  from langchain_text_splitters import RecursiveCharacterTextSplitter

- # 1. Set page config FIRST
  st.set_page_config(page_title="📚 Smart Book Analyst", layout="wide")

  # Configuration
  MODEL_NAME = "ibm-granite/granite-3.1-1b-a400m-instruct"
  EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
- CHUNK_SIZE = 512
- CHUNK_OVERLAP = 50

  @st.cache_resource
  def load_models():
      try:
-         # Load Granite model
          tokenizer = AutoTokenizer.from_pretrained(
              MODEL_NAME,
              trust_remote_code=True
@@ -27,13 +191,15 @@ def load_models():

          model = GPTNeoXForCausalLM.from_pretrained(
              MODEL_NAME,
-             device_map="auto" if DEVICE == "cuda" else None,
              torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
-             trust_remote_code=True
          ).eval()

-         # Load sentence transformer for embeddings
          embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)

          return tokenizer, model, embedder

@@ -43,7 +209,6 @@ def load_models():

  tokenizer, model, embedder = load_models()

- # Text processing
  def process_text(text):
      splitter = RecursiveCharacterTextSplitter(
          chunk_size=CHUNK_SIZE,
@@ -52,70 +217,79 @@ def process_text(text):
      )
      return splitter.split_text(text)

- # PDF extraction
  def extract_pdf_text(uploaded_file):
      try:
          doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
-         return "\n".join([page.get_text() for page in doc])
      except Exception as e:
          st.error(f"PDF extraction error: {str(e)}")
          return ""

- # Summarization function
  def generate_summary(text):
-     chunks = process_text(text)[:10]
      summaries = []

-     for chunk in chunks:
          prompt = f"""<|user|>
- Summarize this text section focusing on key themes, characters, and plot points:
- {chunk[:2000]}
  <|assistant|>
  """

          inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
-         outputs = model.generate(**inputs, max_new_tokens=300, temperature=0.3)
          summaries.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

      combined = "\n".join(summaries)
      final_prompt = f"""<|user|>
- Combine these section summaries into a coherent book summary:
  {combined}
  <|assistant|>
- The comprehensive summary is:"""

      inputs = tokenizer(final_prompt, return_tensors="pt").to(DEVICE)
-     outputs = model.generate(**inputs, max_new_tokens=500, temperature=0.5)
-     return tokenizer.decode(outputs[0], skip_special_tokens=True).split(":")[-1].strip()

- # FAISS index creation
  def build_faiss_index(texts):
-     embeddings = embedder.encode(texts, show_progress_bar=False)
      dimension = embeddings.shape[1]
      index = faiss.IndexFlatIP(dimension)
      faiss.normalize_L2(embeddings)
      index.add(embeddings)
      return index

- # Answer generation
  def generate_answer(query, context):
      prompt = f"""<|user|>
- Using this context: {context}
- Answer the question precisely and truthfully. If unsure, say "I don't know".
- Question: {query}
- <|assistant|>
- """

      inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(DEVICE)
      outputs = model.generate(
          **inputs,
-         max_new_tokens=300,
-         temperature=0.4,
-         top_p=0.9,
-         repetition_penalty=1.2,
          do_sample=True
      )
-     return tokenizer.decode(outputs[0], skip_special_tokens=True).split("<|assistant|>")[-1].strip()

  # Streamlit UI
  st.title("📚 AI-Powered Book Analysis System")
@@ -130,6 +304,10 @@ if uploaded_file:
              else:
                  text = uploaded_file.read().decode()

              chunks = process_text(text)
              st.session_state.docs = chunks
              st.session_state.index = build_faiss_index(chunks)
@@ -148,14 +326,14 @@ if 'index' in st.session_state and st.session_state.index:
              try:
                  query_embed = embedder.encode([query])
                  faiss.normalize_L2(query_embed)
-                 distances, indices = st.session_state.index.search(query_embed, k=3)

                  context = "\n".join([st.session_state.docs[i] for i in indices[0]])
                  answer = generate_answer(query, context)

                  st.subheader("Answer")
                  st.markdown(f"```\n{answer}\n```")
-                 st.caption("Retrieved context confidence: {:.2f}".format(distances[0][0]))

              except Exception as e:
                  st.error(f"Query failed: {str(e)}")
+ # import streamlit as st
+ # import torch
+ # from transformers import GPTNeoXForCausalLM, AutoTokenizer
+ # from sentence_transformers import SentenceTransformer
+ # import faiss
+ # import fitz # PyMuPDF
+ # from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+ # # 1. Set page config FIRST
+ # st.set_page_config(page_title="📚 Smart Book Analyst", layout="wide")
+
+ # # Configuration
+ # MODEL_NAME = "ibm-granite/granite-3.1-1b-a400m-instruct"
+ # EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
+ # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ # CHUNK_SIZE = 512
+ # CHUNK_OVERLAP = 50
+
+ # @st.cache_resource
+ # def load_models():
+ # try:
+ # # Load Granite model
+ # tokenizer = AutoTokenizer.from_pretrained(
+ # MODEL_NAME,
+ # trust_remote_code=True
+ # )
+
+ # model = GPTNeoXForCausalLM.from_pretrained(
+ # MODEL_NAME,
+ # device_map="auto" if DEVICE == "cuda" else None,
+ # torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+ # trust_remote_code=True
+ # ).eval()
+
+ # # Load sentence transformer for embeddings
+ # embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
+
+ # return tokenizer, model, embedder
+
+ # except Exception as e:
+ # st.error(f"Model loading failed: {str(e)}")
+ # st.stop()
+
+ # tokenizer, model, embedder = load_models()
+
+ # # Text processing
+ # def process_text(text):
+ # splitter = RecursiveCharacterTextSplitter(
+ # chunk_size=CHUNK_SIZE,
+ # chunk_overlap=CHUNK_OVERLAP,
+ # length_function=len
+ # )
+ # return splitter.split_text(text)
+
+ # # PDF extraction
+ # def extract_pdf_text(uploaded_file):
+ # try:
+ # doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
+ # return "\n".join([page.get_text() for page in doc])
+ # except Exception as e:
+ # st.error(f"PDF extraction error: {str(e)}")
+ # return ""
+
+ # # Summarization function
+ # def generate_summary(text):
+ # chunks = process_text(text)[:10]
+ # summaries = []
+
+ # for chunk in chunks:
+ # prompt = f"""<|user|>
+ # Summarize this text section focusing on key themes, characters, and plot points:
+ # {chunk[:2000]}
+ # <|assistant|>
+ # """
+
+ # inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
+ # outputs = model.generate(**inputs, max_new_tokens=300, temperature=0.3)
+ # summaries.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
+
+ # combined = "\n".join(summaries)
+ # final_prompt = f"""<|user|>
+ # Combine these section summaries into a coherent book summary:
+ # {combined}
+ # <|assistant|>
+ # The comprehensive summary is:"""
+
+ # inputs = tokenizer(final_prompt, return_tensors="pt").to(DEVICE)
+ # outputs = model.generate(**inputs, max_new_tokens=500, temperature=0.5)
+ # return tokenizer.decode(outputs[0], skip_special_tokens=True).split(":")[-1].strip()
+
+ # # FAISS index creation
+ # def build_faiss_index(texts):
+ # embeddings = embedder.encode(texts, show_progress_bar=False)
+ # dimension = embeddings.shape[1]
+ # index = faiss.IndexFlatIP(dimension)
+ # faiss.normalize_L2(embeddings)
+ # index.add(embeddings)
+ # return index
+
+ # # Answer generation
+ # def generate_answer(query, context):
+ # prompt = f"""<|user|>
+ # Using this context: {context}
+ # Answer the question precisely and truthfully. If unsure, say "I don't know".
+ # Question: {query}
+ # <|assistant|>
+ # """
+
+ # inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(DEVICE)
+ # outputs = model.generate(
+ # **inputs,
+ # max_new_tokens=300,
+ # temperature=0.4,
+ # top_p=0.9,
+ # repetition_penalty=1.2,
+ # do_sample=True
+ # )
+ # return tokenizer.decode(outputs[0], skip_special_tokens=True).split("<|assistant|>")[-1].strip()
+
+ # # Streamlit UI
+ # st.title("📚 AI-Powered Book Analysis System")
+
+ # uploaded_file = st.file_uploader("Upload book (PDF or TXT)", type=["pdf", "txt"])
+
+ # if uploaded_file:
+ # with st.spinner("📖 Analyzing book content..."):
+ # try:
+ # if uploaded_file.type == "application/pdf":
+ # text = extract_pdf_text(uploaded_file)
+ # else:
+ # text = uploaded_file.read().decode()
+
+ # chunks = process_text(text)
+ # st.session_state.docs = chunks
+ # st.session_state.index = build_faiss_index(chunks)
+
+ # with st.expander("📝 Book Summary", expanded=True):
+ # summary = generate_summary(text)
+ # st.write(summary)
+
+ # except Exception as e:
+ # st.error(f"Processing failed: {str(e)}")
+
+ # if 'index' in st.session_state and st.session_state.index:
+ # query = st.text_input("Ask about the book:")
+ # if query:
+ # with st.spinner("🔍 Searching for answers..."):
+ # try:
+ # query_embed = embedder.encode([query])
+ # faiss.normalize_L2(query_embed)
+ # distances, indices = st.session_state.index.search(query_embed, k=3)
+
+ # context = "\n".join([st.session_state.docs[i] for i in indices[0]])
+ # answer = generate_answer(query, context)
+
+ # st.subheader("Answer")
+ # st.markdown(f"```\n{answer}\n```")
+ # st.caption("Retrieved context confidence: {:.2f}".format(distances[0][0]))
+
+ # except Exception as e:
+ # st.error(f"Query failed: {str(e)}")
+
+
  import streamlit as st
  import torch
  from transformers import GPTNeoXForCausalLM, AutoTokenizer
  from sentence_transformers import SentenceTransformer
  import faiss
+ import fitz
  from langchain_text_splitters import RecursiveCharacterTextSplitter

+ # Set page config FIRST
  st.set_page_config(page_title="📚 Smart Book Analyst", layout="wide")

  # Configuration
  MODEL_NAME = "ibm-granite/granite-3.1-1b-a400m-instruct"
  EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ CHUNK_SIZE = 1024 # Increased chunk size for better performance
+ CHUNK_OVERLAP = 100
+ MAX_SUMMARY_CHUNKS = 5 # Reduced from 10 to 5 for faster processing

  @st.cache_resource
  def load_models():
      try:
+         # Load model with optimized settings
          tokenizer = AutoTokenizer.from_pretrained(
              MODEL_NAME,
              trust_remote_code=True

          model = GPTNeoXForCausalLM.from_pretrained(
              MODEL_NAME,
+             device_map="auto",
              torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+             trust_remote_code=True,
+             low_cpu_mem_usage=True
          ).eval()

+         # Load sentence-transformer embedder
          embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
+         embedder.max_seq_length = 256 # Cap input sequence length for faster embedding

          return tokenizer, model, embedder


  tokenizer, model, embedder = load_models()

  def process_text(text):
      splitter = RecursiveCharacterTextSplitter(
          chunk_size=CHUNK_SIZE,

      )
      return splitter.split_text(text)

  def extract_pdf_text(uploaded_file):
      try:
          doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
+         return "\n".join(page.get_text() for page in doc)
      except Exception as e:
          st.error(f"PDF extraction error: {str(e)}")
          return ""

  def generate_summary(text):
+     chunks = process_text(text)[:MAX_SUMMARY_CHUNKS]
+     if not chunks:
+         return "No meaningful content found."
+
+     progress_bar = st.progress(0)
      summaries = []

+     for i, chunk in enumerate(chunks):
+         progress_bar.progress((i+1)/len(chunks), text=f"Processing chunk {i+1}/{len(chunks)}...")
          prompt = f"""<|user|>
+ Summarize key points in 2 sentences:
+ {chunk[:1500]}
  <|assistant|>
  """

          inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=150,
+             temperature=0.2,
+             do_sample=False # Disable sampling for faster generation
+         )
          summaries.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

      combined = "\n".join(summaries)
      final_prompt = f"""<|user|>
+ Combine these into a concise summary (3-5 paragraphs):
  {combined}
  <|assistant|>
+ Summary:"""

      inputs = tokenizer(final_prompt, return_tensors="pt").to(DEVICE)
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=300,
+         temperature=0.3,
+         do_sample=False
+     )
+     return tokenizer.decode(outputs[0], skip_special_tokens=True).split("Summary:")[-1].strip()

  def build_faiss_index(texts):
+     embeddings = embedder.encode(texts, show_progress_bar=False, batch_size=32)
      dimension = embeddings.shape[1]
      index = faiss.IndexFlatIP(dimension)
      faiss.normalize_L2(embeddings)
      index.add(embeddings)
      return index

  def generate_answer(query, context):
      prompt = f"""<|user|>
+ Context: {context[:2000]}
+ Q: {query}
+ A:"""

      inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(DEVICE)
      outputs = model.generate(
          **inputs,
+         max_new_tokens=200,
+         temperature=0.3,
+         top_p=0.85,
+         repetition_penalty=1.1,
          do_sample=True
      )
+     return tokenizer.decode(outputs[0], skip_special_tokens=True).split("A:")[-1].strip()

  # Streamlit UI
  st.title("📚 AI-Powered Book Analysis System")

              else:
                  text = uploaded_file.read().decode()

+             if not text.strip():
+                 st.error("Uploaded file appears to be empty")
+                 st.stop()
+
              chunks = process_text(text)
              st.session_state.docs = chunks
              st.session_state.index = build_faiss_index(chunks)

              try:
                  query_embed = embedder.encode([query])
                  faiss.normalize_L2(query_embed)
+                 distances, indices = st.session_state.index.search(query_embed, k=2)

                  context = "\n".join([st.session_state.docs[i] for i in indices[0]])
                  answer = generate_answer(query, context)

                  st.subheader("Answer")
                  st.markdown(f"```\n{answer}\n```")
+                 st.caption(f"Confidence: {distances[0][0]:.2f}")

              except Exception as e:
                  st.error(f"Query failed: {str(e)}")