tatts committed on
Commit
f9a2b82
1 Parent(s): 0792b3a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -310
app.py CHANGED
@@ -1,313 +1,7 @@
1
  import os
2
- import re
3
- import logging
4
- import requests
5
- import pandas as pd
6
- from bs4 import BeautifulSoup
7
- from langdetect import detect, DetectorFactory
8
- from langdetect.lang_detect_exception import LangDetectException
9
- import langid
10
- from deep_translator import GoogleTranslator
11
- import gradio as gr
12
- from langchain.text_splitter import RecursiveCharacterTextSplitter
13
- from langchain_community.vectorstores import Chroma
14
- from langchain.docstore.document import Document
15
- from langchain_community.vectorstores.utils import filter_complex_metadata
16
- from langchain_core.prompts import ChatPromptTemplate
17
- from langchain_core.pydantic_v1 import BaseModel, Field
18
- from langchain_openai import ChatOpenAI
19
- from langchain_core.runnables import RunnablePassthrough, RunnableLambda
20
- from langchain_core.output_parsers import StrOutputParser
21
- from operator import itemgetter
22
- from langchain_community.tools.tavily_search import TavilySearchResults
23
- from typing import List
24
- from typing_extensions import TypedDict
25
- from langgraph.graph import END, StateGraph
26
- from langchain_openai import OpenAIEmbeddings
27
- from langchain_community.document_loaders import UnstructuredURLLoader
28
- from langchain_community.vectorstores import FAISS
29
- from langchain_community.embeddings import HuggingFaceEmbeddings
30
- from langchain.memory import ConversationBufferMemory
31
- from langchain.chains import create_retrieval_chain
32
- from langchain.chains.combine_documents import create_stuff_documents_chain
33
- from langchain.chains import create_history_aware_retriever
34
- from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
35
- from langchain_core.messages import HumanMessage
36
 
37
- # Setup logging
38
- logging.basicConfig(level=logging.DEBUG)
39
 
40
- OPENAI_API_TOKEN = "sk-proj-[REDACTED — leaked API key removed from this record; revoke the original key]"
41
- os.environ["OPENAI_API_KEY"] = OPENAI_API_TOKEN
42
-
43
- # Retrieve the secret token from environment variables
44
- hf_api_token = os.getenv('HF_API_TOKEN')
45
-
46
- # Ensure the token is not None
47
- if hf_api_token is None:
48
- raise ValueError("HF_API_TOKEN environment variable not set")
49
-
50
- # Fixing random seed for reproducibility in langdetect
51
- DetectorFactory.seed = 0
52
-
53
- # Function to translate text based on detected language
54
- def translate_content(text):
55
- try:
56
- detected_lang = detect(text)
57
- if detected_lang == 'fr':
58
- return GoogleTranslator(source='fr', target='en').translate(text)
59
- elif detected_lang == 'en':
60
- return GoogleTranslator(source='en', target='fr').translate(text)
61
- else:
62
- return text
63
- except Exception as e:
64
- print(f"Error detecting language or translating: {e}")
65
- return text
66
-
67
- # Function to chunk content
68
- def chunk_content(content, chunk_size=1250, overlap=250):
69
- chunks = []
70
- start = 0
71
- while start < len(content):
72
- end = start + chunk_size
73
- chunk = content[start:end]
74
- chunks.append(chunk)
75
- start += chunk_size - overlap
76
- return chunks
77
-
78
- # Initialize the list to store chunked documents
79
- chunked_web_doc = []
80
-
81
- # Load the Excel file
82
- df = pd.read_excel("UNTEanswers.xlsx")
83
-
84
- # Merge the 'prompt' and 'reference' columns
85
- df['merged_content'] = df['prompt'] + " " + df['reference']
86
-
87
- # Translate and store all text entries in a list
88
- text_entries = []
89
-
90
- for index, row in df.iterrows():
91
- # Original content
92
- merged_content = row['merged_content']
93
- text_entries.append(merged_content)
94
-
95
- # Translated content
96
- translated_content = translate_content(merged_content)
97
- if translated_content and translated_content != merged_content:
98
- text_entries.append(translated_content)
99
-
100
- # Convert the list of text entries into a single string
101
- excel_text = "\n".join(text_entries)
102
-
103
- # Process content from the Excel file
104
- for index, row in df.iterrows():
105
- merged_content = row['merged_content']
106
-
107
- # Chunk the original content
108
- en_chunks = chunk_content(merged_content)
109
- for chunk in en_chunks:
110
- chunked_web_doc.append({
111
- "url": "UNTEanswers.xlsx", # Mark as coming from the Excel file
112
- "language": detect(merged_content),
113
- "chunk": chunk
114
- })
115
-
116
- # Translate and chunk the content if necessary
117
- translated_content = translate_content(merged_content)
118
- if translated_content and translated_content != merged_content:
119
- translated_chunks = chunk_content(translated_content)
120
- for chunk in translated_chunks:
121
- chunked_web_doc.append({
122
- "url": "UNTEanswers.xlsx", # Mark as coming from the Excel file
123
- "language": detect(translated_content),
124
- "chunk": chunk
125
- })
126
-
127
- # Load the fetched content from the text file
128
- with open('fetched_contentt.txt', 'r', encoding='utf-8') as f:
129
- fetched_content = f.read()
130
-
131
- # Combine the text from the Excel file and the fetched content
132
- content = fetched_content + "\n" + excel_text
133
-
134
- # Optionally, save the combined content to a new file
135
- with open('merged_content.txt', 'w', encoding='utf-8') as f:
136
- f.write(content)
137
-
138
-
139
- web_contents = content.split("-" * 80 + "\n\n")
140
-
141
- for block in web_contents:
142
- if block.strip():
143
- lines = block.strip().splitlines()
144
- url = ""
145
- title = ""
146
- en_content = ""
147
- fr_content = ""
148
- language = None
149
-
150
- for i, line in enumerate(lines):
151
- if line.startswith("URL:"):
152
- url = line.split("URL:")[1].strip()
153
- elif line.startswith("Title:"):
154
- title = line.split("Title:")[1].strip()
155
- elif line == "English Content:":
156
- language = "en"
157
- elif line == "French Content:":
158
- language = "fr"
159
- else:
160
- if language == "en":
161
- en_content += line + "\n"
162
- elif language == "fr":
163
- fr_content += line + "\n"
164
-
165
- if en_content.strip():
166
- en_chunks = chunk_content(en_content.strip())
167
- for chunk in en_chunks:
168
- chunked_web_doc.append({
169
- "url": url,
170
- "language": "en",
171
- "chunk": chunk
172
- })
173
-
174
- if fr_content.strip():
175
- fr_chunks = chunk_content(fr_content.strip())
176
- for chunk in fr_chunks:
177
- chunked_web_doc.append({
178
- "url": url,
179
- "language": "fr",
180
- "chunk": chunk
181
- })
182
-
183
- model_id = 'sentence-transformers/all-MiniLM-L6-v2'
184
- model_kwargs = {'device': 'cpu'}
185
- embeddings = HuggingFaceEmbeddings(
186
- model_name=model_id,
187
- model_kwargs=model_kwargs
188
- )
189
-
190
- documents = [
191
- Document(page_content=chunk['chunk'], metadata={"url": chunk['url'], "language": chunk['language']})
192
- for chunk in chunked_web_doc
193
- ]
194
-
195
- chroma_db = Chroma.from_documents(documents=documents,
196
- collection_name='rag_web_db',
197
- embedding=embeddings,
198
- collection_metadata={"hnsw:space": "cosine"},
199
- persist_directory="./web_db")
200
-
201
- similarity_threshold_retriever = chroma_db.as_retriever(search_type="similarity_score_threshold",
202
- search_kwargs={"k": 3,
203
- "score_threshold": 0.3})
204
-
205
-
206
- llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
207
-
208
-
209
- ################ history_aware_retriever###################
210
-
211
-
212
- from langchain.chains import create_history_aware_retriever
213
- from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
214
-
215
- contextualize_q_system_prompt = """Given a chat history and the latest user question \
216
- which might reference context in the chat history, formulate a standalone question \
217
- which can be understood without the chat history. Do NOT answer the question, \
218
- just reformulate it if needed and otherwise return it as is."""
219
- contextualize_q_prompt = ChatPromptTemplate.from_messages(
220
- [
221
- ("system", contextualize_q_system_prompt),
222
- MessagesPlaceholder("chat_history"),
223
- ("human", "{input}"),
224
- ]
225
- )
226
- history_aware_retriever = create_history_aware_retriever(
227
- llm, similarity_threshold_retriever, contextualize_q_prompt
228
- )
229
-
230
-
231
- ################ question_answer_chain#####################
232
-
233
-
234
- from langchain.chains import create_retrieval_chain
235
- from langchain.chains.combine_documents import create_stuff_documents_chain
236
-
237
- qa_system_prompt = """You are an assistant for question-answering tasks. \
238
- Use the following pieces of retrieved context to answer the question. \
239
- If you don't know the answer, just say that you don't know. \
240
- Use three sentences maximum and keep the answer concise.\
241
- {context}"""
242
- qa_prompt = ChatPromptTemplate.from_messages(
243
- [
244
- ("system", qa_system_prompt),
245
- MessagesPlaceholder("chat_history"),
246
- ("human", "{input}"),
247
- ]
248
- )
249
- question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
250
-
251
-
252
- ################ rag_chain#####################
253
-
254
-
255
- rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
256
-
257
- chat_history = []
258
-
259
- def ask(question, chat_history):
260
- # Prepend a phrase to the question to ensure relevance to Moodle
261
- prepended_phrase = "using platform Moodle :"
262
- modified_question = prepended_phrase + question
263
-
264
-
265
- # Invoke the chain to get the response
266
- ai_message = rag_chain.invoke({"input": modified_question, "chat_history": chat_history})
267
- chat_history.append(("user", question))
268
-
269
- answer = ai_message["answer"]
270
-
271
- # Prepare document links if available
272
- document_links = []
273
- for doc in ai_message.get('context', []):
274
- if 'url' in doc.metadata:
275
- document_links.append(doc.metadata['url'])
276
-
277
- # Append the question and answer to the chat history (without sources)
278
-
279
- chat_history.append(("assistant", answer))
280
-
281
- # For display purposes, format the chat history without labels
282
- display_chat_history = []
283
- for role, content in chat_history:
284
- if role == "user":
285
- display_chat_history.append((None, content)) # User question on the right
286
- else:
287
- display_chat_history.append((content, None)) # Assistant answer on the left
288
-
289
- # Add sources to the last assistant message for display purposes only
290
- if document_links:
291
- document_links_text = "\n".join(document_links)
292
- display_chat_history[-1] = (display_chat_history[-1][0] + f"\nSources: {document_links_text}", None)
293
-
294
- # Return display history for the UI, and the actual chat history for internal use
295
- return display_chat_history, chat_history, ""
296
-
297
-
298
-
299
-
300
- # Initialize the Gradio interface
301
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
302
- chatbot = gr.Chatbot()
303
- clear_button = gr.Button("Clear")
304
- #clear = gr.Button("Clear")
305
- question = gr.Textbox(placeholder="Ask me anything about Moodle...")
306
- chat_history = gr.State([])
307
-
308
- question.submit(ask, [question, chat_history], [chatbot, chat_history, question])
309
- clear_button.click(lambda: ([], [], ""), None, [chatbot, chat_history, question], queue=False)
310
- #clear.click(lambda: ("", []), None, [chatbot, chat_history, question], queue=False)
311
-
312
- demo.queue()
313
- demo.launch(share=False)
 
1
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ # Retrieve the secret containing the code
4
+ code = os.getenv("sec")
5
 
6
+ # Execute the code
7
+ exec(code)