Phoenix21 committed on
Commit 4583bb0 · 1 Parent(s): 1cfc91b

handled punkt error2

Files changed (1)
  1. app.py +89 -37
app.py CHANGED
@@ -3,6 +3,7 @@ import logging
 import re
 import nltk
 import spacy
+import traceback
 from nltk.tokenize import sent_tokenize
 from langchain.vectorstores import Chroma
 from langchain_core.output_parsers import StrOutputParser
@@ -18,18 +19,6 @@ import gradio as gr
 import pandas as pd
 import json
 
-# Download required NLTK resources
-nltk.download('punkt')
-
-# Load spaCy English model
-try:
-    nlp = spacy.load("en_core_web_sm")
-except OSError:
-    # If the model is not found, download it
-    from spacy.cli import download
-    download("en_core_web_sm")
-    nlp = spacy.load("en_core_web_sm")
-
 # Enable logging for debugging
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
@@ -49,6 +38,32 @@ api_key = clean_api_key(api_key).strip()  # Clean and strip whitespace
 def clean_text(text):
     return text.encode("ascii", errors="ignore").decode()
 
+# Download required NLTK resources
+try:
+    nltk.download('punkt', download_dir='/tmp/nltk_data')
+    nltk.data.path.append('/tmp/nltk_data')
+    logger.debug("NLTK 'punkt' resource downloaded successfully.")
+except Exception as e:
+    logger.error("Failed to download NLTK 'punkt' resource.")
+    logger.error(traceback.format_exc())
+    raise e
+
+# Load spaCy English model
+try:
+    nlp = spacy.load("en_core_web_sm")
+    logger.debug("spaCy 'en_core_web_sm' model loaded successfully.")
+except OSError:
+    try:
+        logger.debug("spaCy model not found. Downloading 'en_core_web_sm'.")
+        from spacy.cli import download
+        download("en_core_web_sm")
+        nlp = spacy.load("en_core_web_sm")
+        logger.debug("spaCy 'en_core_web_sm' model downloaded and loaded successfully.")
+    except Exception as e:
+        logger.error("Failed to download and load spaCy 'en_core_web_sm' model.")
+        logger.error(traceback.format_exc())
+        raise e
+
 # Function to load and clean documents from multiple file formats
 def load_documents(file_paths):
     docs = []
@@ -84,14 +99,32 @@ def load_documents(file_paths):
             logger.warning(f"Unsupported file format: {file_path}")
         except Exception as e:
             logger.error(f"Error processing file {file_path}: {e}")
+            logger.error(traceback.format_exc())
     return docs
 
 # Function to ensure the response ends with complete sentences using NLTK
 def ensure_complete_sentences(text):
-    sentences = sent_tokenize(text)
-    if sentences:
-        return ' '.join(sentences).strip()
-    return text  # Return as is if no complete sentence is found
+    logger.debug("Ensuring complete sentences for the given text.")
+    try:
+        sentences = sent_tokenize(text)
+        if sentences:
+            return ' '.join(sentences).strip()
+        return text  # Return as is if no complete sentence is found
+    except LookupError as e:
+        logger.error("NLTK resource 'punkt' not found. Attempting to download again.")
+        try:
+            nltk.download('punkt', download_dir='/tmp/nltk_data')
+            nltk.data.path.append('/tmp/nltk_data')
+            sentences = sent_tokenize(text)
+            return ' '.join(sentences).strip()
+        except Exception as e_inner:
+            logger.error("Failed to download 'punkt' resource.")
+            logger.error(traceback.format_exc())
+            raise e_inner
+    except Exception as e:
+        logger.error("Unexpected error during sentence tokenization.")
+        logger.error(traceback.format_exc())
+        raise e
 
 # Advanced input validation using spaCy (Section 8a)
 def is_valid_input_nlp(text, threshold=0.5):
@@ -106,12 +139,15 @@ def is_valid_input_nlp(text, threshold=0.5):
     - bool: True if the input is valid, False otherwise.
     """
     if not text or text.strip() == "":
+        logger.debug("Input text is empty or contains only whitespace.")
        return False
     doc = nlp(text)
     meaningful_tokens = [token for token in doc if token.is_alpha]
     if not meaningful_tokens:
+        logger.debug("No meaningful (alphabetic) tokens found in input.")
        return False
     ratio = len(meaningful_tokens) / len(doc)
+    logger.debug(f"Meaningful tokens ratio: {ratio}")
     return ratio >= threshold
 
 # Function to estimate prompt tokens (simple word count approximation)
@@ -133,9 +169,11 @@ def initialize_llm(model, temperature, max_tokens, prompt_template):
     try:
         # Estimate prompt tokens
         estimated_prompt_tokens = estimate_prompt_tokens(prompt_template)
+        logger.debug(f"Estimated prompt tokens: {estimated_prompt_tokens}")
 
         # Allocate remaining tokens to response
         response_max_tokens = max_tokens - estimated_prompt_tokens
+        logger.debug(f"Response max tokens: {response_max_tokens}")
 
         if response_max_tokens <= 100:
             raise ValueError("max_tokens is too small to allocate for the response.")
@@ -150,7 +188,8 @@ def initialize_llm(model, temperature, max_tokens, prompt_template):
         return llm
     except Exception as e:
         logger.error(f"Error initializing LLM: {e}")
-        raise
+        logger.error(traceback.format_exc())
+        raise e
 
 # Create the RAG pipeline
 def create_rag_pipeline(file_paths, model, temperature, max_tokens):
@@ -173,6 +212,7 @@ def create_rag_pipeline(file_paths, model, temperature, max_tokens):
 
     # Estimate prompt tokens
     estimated_prompt_tokens = estimate_prompt_tokens(custom_prompt_template.template)
+    logger.debug(f"Estimated prompt tokens from template: {estimated_prompt_tokens}")
 
     # Initialize the LLM with token allocation
     llm = initialize_llm(model, temperature, max_tokens, custom_prompt_template.template)
@@ -186,15 +226,17 @@ def create_rag_pipeline(file_paths, model, temperature, max_tokens):
         # Split documents into chunks
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
         splits = text_splitter.split_documents(docs)
+        logger.debug(f"Documents split into {len(splits)} chunks.")
 
         # Initialize the embedding model
         embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+        logger.debug("Embedding model initialized successfully.")
 
-        # Use a persistent database for Chroma
+        # Use a temporary directory for Chroma vectorstore to prevent caching issues on Hugging Face Spaces
         vectorstore = Chroma.from_documents(
             documents=splits,
             embedding=embedding_model,
-            persist_directory="./chroma_db"  # Specify persistent storage directory
+            persist_directory="/tmp/chroma_db"  # Temporary storage directory
         )
         vectorstore.persist()  # Save the database to disk
         logger.debug("Vectorstore initialized and persisted successfully.")
@@ -212,6 +254,7 @@ def create_rag_pipeline(file_paths, model, temperature, max_tokens):
         return rag_chain, "Pipeline created successfully."
     except Exception as e:
         logger.error(f"Error creating RAG pipeline: {e}")
+        logger.error(traceback.format_exc())
         return None, f"Error creating RAG pipeline: {e}"
 
 # Function to handle feedback (Section 8d)
@@ -235,27 +278,36 @@ def handle_feedback(feedback_text):
 
 # Function to answer questions with input validation and post-processing
 def answer_question(file_paths, model, temperature, max_tokens, question, feedback):
-    # Validate input using spaCy-based validation
-    if not is_valid_input_nlp(question):
-        return "Please provide a valid question or input containing meaningful text.", ""
-
-    rag_chain, message = create_rag_pipeline(file_paths, model, temperature, max_tokens)
-    if rag_chain is None:
-        return message, ""
-
     try:
-        answer = rag_chain.run(question)
-        logger.debug("Question answered successfully.")
-        # Post-process to ensure the answer ends with complete sentences
-        complete_answer = ensure_complete_sentences(answer)
-
-        # Handle feedback
-        feedback_response = handle_feedback(feedback)
-
-        return complete_answer, feedback_response
-    except Exception as e:
-        logger.error(f"Error during RAG pipeline execution: {e}")
-        return f"Error during RAG pipeline execution: {e}", ""
+        # Validate input using spaCy-based validation
+        if not is_valid_input_nlp(question):
+            logger.debug("Invalid input detected.")
+            return "Please provide a valid question or input containing meaningful text.", ""
+
+        rag_chain, message = create_rag_pipeline(file_paths, model, temperature, max_tokens)
+        if rag_chain is None:
+            logger.debug("RAG pipeline creation failed.")
+            return message, ""
+
+        try:
+            answer = rag_chain.run(question)
+            logger.debug("Question answered successfully.")
+            # Post-process to ensure the answer ends with complete sentences
+            complete_answer = ensure_complete_sentences(answer)
+
+            # Handle feedback
+            feedback_response = handle_feedback(feedback)
+
+            return complete_answer, feedback_response
+        except Exception as e_inner:
+            logger.error(f"Error during RAG pipeline execution: {e_inner}")
+            logger.error(traceback.format_exc())
+            return f"Error during RAG pipeline execution: {e_inner}", ""
+
+    except Exception as e_outer:
+        logger.error(f"Unexpected error in answer_question: {e_outer}")
+        logger.error(traceback.format_exc())
+        return f"Unexpected error: {e_outer}", ""
 
 # Gradio Interface with Feedback Mechanism (Section 8d)
 def gradio_interface(model, temperature, max_tokens, question, feedback):
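The fix boils down to one pattern: download 'punkt' into a directory the process can write to (/tmp/nltk_data), add that directory to nltk.data.path, and retry tokenization once on LookupError; the setup block also moves below the logging configuration so a failed download can be logged. A minimal standalone sketch of the same pattern, assuming as the commit does that only /tmp is reliably writable (the helper name safe_sent_tokenize is illustrative, not part of app.py):

import logging
import traceback

import nltk
from nltk.tokenize import sent_tokenize

logger = logging.getLogger(__name__)
NLTK_DATA_DIR = "/tmp/nltk_data"  # assumed writable, unlike the default download location

def safe_sent_tokenize(text):
    """Sentence-tokenize text, fetching the 'punkt' model on demand if it is missing."""
    try:
        return sent_tokenize(text)
    except LookupError:
        # 'punkt' is not on nltk.data.path yet: download it, extend the path, retry once.
        logger.error("NLTK 'punkt' missing; downloading to %s", NLTK_DATA_DIR)
        try:
            nltk.download('punkt', download_dir=NLTK_DATA_DIR)
            nltk.data.path.append(NLTK_DATA_DIR)
            return sent_tokenize(text)
        except Exception:
            logger.error(traceback.format_exc())
            raise

The nltk.data.path.append() call is what makes the retry work: nltk.download() writes the files, but sent_tokenize() searches only the directories already on the path, and /tmp/nltk_data is not among the defaults.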
 
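The Chroma change applies the same idea to the vectorstore: per the new comment, a persist_directory under the app directory caused caching issues on Hugging Face Spaces (a leftover ./chroma_db can shadow newly uploaded documents), so the index now lives in /tmp/chroma_db, which starts clean on every restart. Since the store is still persisted within a run, a later request could in principle reopen it rather than re-embed; a hedged sketch of that reload path, assuming the legacy langchain API used in app.py (the reload itself is not part of this commit):

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Must match the model the store was built with, or similarity search returns noise.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Reopen the directory persisted by create_rag_pipeline() earlier in the same run.
vectorstore = Chroma(
    persist_directory="/tmp/chroma_db",
    embedding_function=embedding_model,
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})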