UPDATE: functions
Files changed:
- app.py: +20 -99
- functions.py: +17 -10
app.py
CHANGED
@@ -13,8 +13,7 @@ from src.api.speech_api import speech_translator_router
 from functions import client as supabase
 from urllib.parse import urlparse
 import nltk
-import time
-import uuid
+
 
 nltk.download('punkt_tab')
 
@@ -236,67 +235,34 @@ async def newChatbot(chatbotName: str, username: str):
     return createTable(tablename=chatbotName)
 
 
-@app.post("/
+@app.post("/loadPDF")
 async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
     source = pdf.filename
     pdf = await pdf.read()
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
         temp_file.write(pdf)
         temp_file_path = temp_file.name
-    start = time.time()
     text = extractTextFromPdf(temp_file_path)
-    textExtraction = time.time()
     os.remove(temp_file_path)
-    username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
-    df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
-    currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
-    limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
-        "tokenLimit"]
-    newCount = currentCount + len(text)
-    if newCount < int(limit):
-        supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
-            "chatbotname", chatbotname).execute()
-        uploadStart = time.time()
-        output = addDocuments(text=text, source=source, vectorstore=vectorstore)
-        uploadEnd = time.time()
-        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
-        timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
-        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
-        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
-        newText = ("=" * 75 + "\n").join([timeTaken, uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
-        fileId = str(uuid.uuid4())
-        with open(f"{fileId}.txt", "w") as file:
-            file.write(newText)
-        with open(f"{fileId}.txt", "rb") as f:
-            supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
-                                                       file_options={"content-type": "text/plain"})
-        os.remove(f"{fileId}.txt")
-        output["supabaseFileName"] = f"{fileId}.txt"
-        return output
-    else:
-        return {
-            "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
-        }
+    return {
+        "output": text,
+        "source": source
+    }
 
 
-@app.post("/
+@app.post("/loadImagePDF")
 async def returnText(pdf: UploadFile = File(...)):
     source = pdf.filename
     pdf = await pdf.read()
-    start = time.time()
     text = getTextFromImagePDF(pdfBytes=pdf)
-    end = time.time()
-    timeTaken = f"{end - start}s"
     return {
-        "
-        "
-        "output": text
+        "output": text,
+        "source": source
     }
 
 
 @app.post("/addText")
-async def addText(vectorstore: str, text: str, source: str | None = None):
+async def addText(vectorstore: str, text: str, source: str = "Text"):
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
     currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
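Note: after this commit, /loadPDF and /loadImagePDF are extraction-only; quota accounting now happens solely in /addText, and the timing stats and Supabase log-file uploads are gone entirely. A minimal client sketch (the base URL and the $-separated vectorstore name are illustrative assumptions):

    import requests

    BASE_URL = "http://localhost:8000"  # assumed host, for illustration only

    # Upload a PDF; the endpoint now returns the extracted text keyed by page
    # number, plus its source filename, instead of indexing it directly.
    with open("manual.pdf", "rb") as f:
        resp = requests.post(
            f"{BASE_URL}/loadPDF",
            params={"vectorstore": "ConversAI$alice$supportbot"},  # hypothetical name
            files={"pdf": ("manual.pdf", f, "application/pdf")},
        )
    print(resp.json())  # {"output": {1: "...", 2: "..."}, "source": "manual.pdf"}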
@@ -306,22 +272,7 @@ async def addText(vectorstore: str, text: str, source: str | None = None):
     if newCount < int(limit):
         supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
             "chatbotname", chatbotname).execute()
-        uploadStart = time.time()
         output = addDocuments(text=text, source=source, vectorstore=vectorstore)
-        uploadEnd = time.time()
-        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
-        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
-        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
-        newText = ("=" * 75 + "\n").join([uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
-        fileId = str(uuid.uuid4())
-        with open(f"{fileId}.txt", "w") as file:
-            file.write(newText)
-        with open(f"{fileId}.txt", "rb") as f:
-            supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
-                                                       file_options={"content-type": "text/plain"})
-        os.remove(f"{fileId}.txt")
-        output["supabaseFileName"] = f"{fileId}.txt"
         return output
     else:
         return {
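/addText keeps the per-chatbot character quota: it still compares newCount against the user's tokenLimit before calling addDocuments, but no longer writes a log file to Supabase storage. A hedged sketch of indexing text through it (names assumed as above):

    import requests

    BASE_URL = "http://localhost:8000"  # assumed host, for illustration only

    resp = requests.post(
        f"{BASE_URL}/addText",
        params={
            "vectorstore": "ConversAI$alice$supportbot",  # hypothetical name
            "text": "Office hours are 9am-5pm, Monday through Friday.",
            "source": "hours-note",  # optional; defaults to "Text" after this change
        },
    )
    print(resp.json())  # addDocuments output, or the over-limit message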
@@ -354,44 +305,12 @@ async def addQAPairData(addQaPair: AddQAPair):
     }
 
 
-@app.post("/
+@app.post("/loadWebURLs")
 async def addWebsite(vectorstore: str, websiteUrls: list[str]):
-    start = time.time()
-    text = extractTextFromUrlList(urls=websiteUrls)
-    textExtraction = time.time()
-    username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
-    df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
-    currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
-    newCount = currentCount + len(text)
-    limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
-        "tokenLimit"]
-    if newCount < int(limit):
-        supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
-            "chatbotname", chatbotname).execute()
-        uploadStart = time.time()
-        output = addDocuments(text=text, source=urlparse(websiteUrls[0]).netloc, vectorstore=vectorstore)
-        uploadEnd = time.time()
-        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
-        timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
-        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
-        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
-        links = "LINKS:\n" + "\n".join(websiteUrls) + "\n"
-        newText = ("=" * 75 + "\n").join(
-            [timeTaken, uploadTime, wordCount, tokenCount, links, "TEXT: \n" + text + "\n"])
-        fileId = str(uuid.uuid4())
-        with open(f"{fileId}.txt", "w") as file:
-            file.write(newText)
-        with open(f"{fileId}.txt", "rb") as f:
-            supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
-                                                       file_options={"content-type": "text/plain"})
-        os.remove(f"{fileId}.txt")
-        output["supabaseFileName"] = f"{fileId}.txt"
-        return output
-    else:
-        return {
-            "output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
-        }
+    text = extractTextFromUrlList(urls=websiteUrls)
+    return {
+        "output": text
+    }
 
 
 @app.post("/answerQuery")
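/loadWebURLs is likewise extraction-only now, returning whatever extractTextFromUrlList produces (a URL-to-text mapping after this commit). Because websiteUrls is list-typed without an explicit Query() marker, FastAPI reads it from the JSON body:

    import requests

    BASE_URL = "http://localhost:8000"  # assumed host, for illustration only

    resp = requests.post(
        f"{BASE_URL}/loadWebURLs",
        params={"vectorstore": "ConversAI$alice$supportbot"},  # hypothetical name
        json=["https://example.com/docs", "https://example.com/faq"],  # body: list[str]
    )
    print(resp.json()["output"])  # {url: extracted text, ...}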
@@ -422,7 +341,8 @@ async def delete(username: str):
 @app.post("/getLinks")
 async def crawlUrl(baseUrl: str):
     return {
-        "urls": getLinks(url=baseUrl, timeout=30)
+        "urls": getLinks(url=baseUrl, timeout=30),
+        "source": urlparse(baseUrl).netloc
     }
 
 
@@ -436,9 +356,10 @@ async def getCount(vectorstore: str):
 
 
 @app.post("/getYoutubeTranscript")
-async def getYTTranscript(urls: str):
+async def getYTTranscript(urls: list[str]):
     return {
-        "output": getTranscript(urls=urls)
+        "output": getTranscript(urls=urls),
+        "source": "www.youtube.com"
     }
 
 
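getYTTranscript now takes list[str] (so the URLs travel in the JSON body) and returns a URL-keyed mapping plus a fixed source. One caveat: getTranscript iterates set(urls) but zips the results against the original urls list, so duplicated inputs can pair a URL with the wrong transcript. Illustrative call:

    import requests

    BASE_URL = "http://localhost:8000"  # assumed host, for illustration only

    resp = requests.post(
        f"{BASE_URL}/getYoutubeTranscript",
        json=["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],  # example URL
    )
    print(resp.json())  # {"output": {url: transcript}, "source": "www.youtube.com"}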
functions.py
CHANGED
@@ -56,7 +56,7 @@ INSTRUCTIONS:
 2. **Maintaining Focus**: Politely redirect any off-topic conversations back to relevant issues without breaking character.
 3. **Exclusive Reliance on Context Data**: Base all answers strictly on the provided context data. If the context doesn’t cover the query, use a fallback response. Always maintain a third-person perspective.
 4. **Restrictive Role Focus**: Do not engage in tasks or answer questions unrelated to your role or context data.
-Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words.
+Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words. Make sure the user is always happy and satisfied with the outputs you return.
 CONTEXT:
 =====================================
 {context}
@@ -139,14 +139,19 @@ def addDocuments(text: str, source: str, vectorstore: str):
 
 
 def format_docs(docs: str):
+    global sources
+    sources = []
     context = ""
     for doc in docs:
-
-
+        context += f"{doc.page_content}\n\n\n"
+        source = doc.metadata
+        source = source["source"]
+        sources.append(source)
     if context == "":
         context = "No context found"
     else:
         pass
+    sources = list(set(sources))
     return context
 
 
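format_docs now builds the context string from each document's page_content and records the unique metadata sources in a module-level `sources` global, which answerQuery returns alongside the answer. A minimal sketch of the resulting behavior, assuming LangChain-style Document objects (the retriever's usual type):

    from langchain_core.documents import Document

    docs = [
        Document(page_content="Refunds take 5-7 days.", metadata={"source": "faq.pdf"}),
        Document(page_content="Chat support is 24/7.", metadata={"source": "faq.pdf"}),
    ]
    context = format_docs(docs)
    # context -> both page_contents, each followed by blank lines
    # sources -> ["faq.pdf"]  (set()-deduplicated, so ordering is not guaranteed)

Since `sources` is process-wide mutable state, concurrent requests could overwrite each other's value; returning the list from format_docs would sidestep that.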
@@ -171,6 +176,7 @@ def trimMessages(chain_input):
 def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192") -> str:
     global prompt
     global client
+    global sources
     global vectorEmbeddings
     global sparseEmbeddings
     vectorStoreName = vectorstore
@@ -201,7 +207,8 @@ def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192")
         "output": chain.invoke(
             {"question": query},
             {"configurable": {"session_id": vectorStoreName}}
-        )
+        ),
+        "sources": sources
     }
 
 
@@ -271,13 +278,12 @@ def getTextFromImagePDF(pdfBytes):
         return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
     allImages = convert_from_bytes(pdfBytes)
     texts = [getText(image) for image in allImages]
-    return
+    return {x + 1: y for x, y in enumerate(texts)}
 
 
 def getTranscript(urls: str):
-    urls = urls.split(",")
     texts = []
-    for url in urls:
+    for url in set(urls):
         try:
             loader = YoutubeLoader.from_youtube_url(
                 url, add_video_info=False
@@ -287,10 +293,11 @@ def getTranscript(urls: str):
         except:
             doc = ""
         texts.append(doc)
-    return
+    return {x: y for x, y in zip(urls, texts)}
 
 
 def analyzeData(query, dataframe):
+    query += ". In case, you are to plot a chart, make sure the x-axis labels are 90 degree rotated"
     llm = ChatGroq(name="llama-3.1-8b-instant")
     df = SmartDataframe(dataframe, config={"llm": llm, "verbose": False})
     response = df.chat(query)
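analyzeData now appends a chart-formatting hint to every query before handing it to pandasai's SmartDataframe. A sketch, assuming Groq credentials are configured in the environment:

    import pandas as pd

    df = pd.DataFrame({"month": ["Jan", "Feb", "Mar"], "sales": [120, 90, 160]})
    # The appended suffix asks the LLM to rotate x-axis labels if it plots.
    print(analyzeData("Which month had the highest sales?", df))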
@@ -312,7 +319,7 @@ def extractTextFromPdf(pdf_path):
     with ThreadPoolExecutor() as executor:
         texts = list(executor.map(extractTextFromPage, pages))
     doc.close()
-    return
+    return {x + 1: y for x, y in enumerate(texts)}
 
 
 def extractTextFromUrl(url):
@@ -326,4 +333,4 @@ def extractTextFromUrl(url):
 def extractTextFromUrlList(urls):
     with ThreadPoolExecutor() as executor:
         texts = list(executor.map(extractTextFromUrl, urls))
-    return
+    return {x: y for x, y in zip(urls, texts)}
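The extraction helpers all gain concrete return values in place of the old bare returns: dicts keyed by 1-based page number for PDFs and by URL for the web and YouTube helpers. Illustrative shapes (inputs hypothetical):

    extractTextFromPdf("manual.pdf")          # {1: "page one text", 2: "page two text", ...}
    getTextFromImagePDF(pdf_bytes)            # {1: "OCR text of page one", ...}
    extractTextFromUrlList(["https://a.com"]) # {"https://a.com": "page text"}
    getTranscript(["https://youtu.be/x"])     # {"https://youtu.be/x": "transcript"}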