UPDATE: functions
Files changed:
- app.py: +20 -99
- functions.py: +17 -10
app.py
CHANGED
@@ -13,8 +13,7 @@ from src.api.speech_api import speech_translator_router
 from functions import client as supabase
 from urllib.parse import urlparse
 import nltk
-import time
-import uuid
+
 
 nltk.download('punkt_tab')
 
@@ -236,67 +235,34 @@ async def newChatbot(chatbotName: str, username: str):
     return createTable(tablename=chatbotName)
 
 
-@app.post("/
+@app.post("/loadPDF")
 async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
     source = pdf.filename
     pdf = await pdf.read()
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
         temp_file.write(pdf)
         temp_file_path = temp_file.name
-    start = time.time()
     text = extractTextFromPdf(temp_file_path)
-    textExtraction = time.time()
     os.remove(temp_file_path)
-    username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
-    df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
-    currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
-    limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
-        "tokenLimit"]
-    newCount = currentCount + len(text)
-    if newCount < int(limit):
-        supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
-            "chatbotname", chatbotname).execute()
-        uploadStart = time.time()
-        output = addDocuments(text=text, source=source, vectorstore=vectorstore)
-        uploadEnd = time.time()
-        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
-        timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
-        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
-        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
-        newText = ("=" * 75 + "\n").join([timeTaken, uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
-        fileId = str(uuid.uuid4())
-        with open(f"{fileId}.txt", "w") as file:
-            file.write(newText)
-        with open(f"{fileId}.txt", "rb") as f:
-            supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
-                                                       file_options={"content-type": "text/plain"})
-        os.remove(f"{fileId}.txt")
-        output["supabaseFileName"] = f"{fileId}.txt"
-        return output
-    else:
-        return {
-            "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
-        }
+    return {
+        "output": text,
+        "source": source
+    }
 
 
-@app.post("/
+@app.post("/loadImagePDF")
 async def returnText(pdf: UploadFile = File(...)):
     source = pdf.filename
     pdf = await pdf.read()
-    start = time.time()
     text = getTextFromImagePDF(pdfBytes=pdf)
-    end = time.time()
-    timeTaken = f"{end - start}s"
     return {
-        "
-        "
-        "output": text
+        "output": text,
+        "source": source
     }
 
 
 @app.post("/addText")
-async def addText(vectorstore: str, text: str, source: str | None = None):
+async def addText(vectorstore: str, text: str, source: str = "Text"):
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
     currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
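Note: after this commit, /loadPDF and /loadImagePDF are extraction-only; quota accounting now happens solely in /addText, and the timing stats and Supabase log-file uploads are gone entirely. A minimal client sketch (the base URL and the $-separated vectorstore name are illustrative assumptions):

    import requests

    BASE_URL = "http://localhost:8000"  # assumed host, for illustration only

    # Upload a PDF; the endpoint now returns the extracted text keyed by page
    # number, plus its source filename, instead of indexing it directly.
    with open("manual.pdf", "rb") as f:
        resp = requests.post(
            f"{BASE_URL}/loadPDF",
            params={"vectorstore": "ConversAI$alice$supportbot"},  # hypothetical name
            files={"pdf": ("manual.pdf", f, "application/pdf")},
        )
    print(resp.json())  # {"output": {1: "...", 2: "..."}, "source": "manual.pdf"}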
@@ -306,22 +272,7 @@ async def addText(vectorstore: str, text: str, source: str | None = None):
     if newCount < int(limit):
         supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
             "chatbotname", chatbotname).execute()
-        uploadStart = time.time()
         output = addDocuments(text=text, source=source, vectorstore=vectorstore)
-        uploadEnd = time.time()
-        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
-        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
-        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
-        newText = ("=" * 75 + "\n").join([uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
-        fileId = str(uuid.uuid4())
-        with open(f"{fileId}.txt", "w") as file:
-            file.write(newText)
-        with open(f"{fileId}.txt", "rb") as f:
-            supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
-                                                       file_options={"content-type": "text/plain"})
-        os.remove(f"{fileId}.txt")
-        output["supabaseFileName"] = f"{fileId}.txt"
         return output
     else:
         return {
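/addText keeps the per-chatbot character quota: it still compares newCount against the user's tokenLimit before calling addDocuments, but no longer writes a log file to Supabase storage. A hedged sketch of indexing text through it (names assumed as above):

    import requests

    BASE_URL = "http://localhost:8000"  # assumed host, for illustration only

    resp = requests.post(
        f"{BASE_URL}/addText",
        params={
            "vectorstore": "ConversAI$alice$supportbot",  # hypothetical name
            "text": "Office hours are 9am-5pm, Monday through Friday.",
            "source": "hours-note",  # optional; defaults to "Text" after this change
        },
    )
    print(resp.json())  # addDocuments output, or the over-limit message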
@@ -354,44 +305,12 @@ async def addQAPairData(addQaPair: AddQAPair):
     }
 
 
-@app.post("/
+@app.post("/loadWebURLs")
 async def addWebsite(vectorstore: str, websiteUrls: list[str]):
-    start = time.time()
-    text = extractTextFromUrlList(urls=websiteUrls)
-    textExtraction = time.time()
-    username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
-    df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
-    currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
-    newCount = currentCount + len(text)
-    limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
-        "tokenLimit"]
-    if newCount < int(limit):
-        supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
-            "chatbotname", chatbotname).execute()
-        uploadStart = time.time()
-        output = addDocuments(text=text, source=urlparse(websiteUrls[0]).netloc, vectorstore=vectorstore)
-        uploadEnd = time.time()
-        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
-        timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
-        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
-        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
-        links = "LINKS:\n" + "\n".join(websiteUrls) + "\n"
-        newText = ("=" * 75 + "\n").join(
-            [timeTaken, uploadTime, wordCount, tokenCount, links, "TEXT: \n" + text + "\n"])
-        fileId = str(uuid.uuid4())
-        with open(f"{fileId}.txt", "w") as file:
-            file.write(newText)
-        with open(f"{fileId}.txt", "rb") as f:
-            supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
-                                                       file_options={"content-type": "text/plain"})
-        os.remove(f"{fileId}.txt")
-        output["supabaseFileName"] = f"{fileId}.txt"
-        return output
-    else:
-        return {
-            "output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
-        }
+    text = extractTextFromUrlList(urls=websiteUrls)
+    return {
+        "output": text
+    }
 
 
 @app.post("/answerQuery")
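/loadWebURLs is likewise extraction-only now, returning whatever extractTextFromUrlList produces (a URL-to-text mapping after this commit). Because websiteUrls is list-typed without an explicit Query() marker, FastAPI reads it from the JSON body:

    import requests

    BASE_URL = "http://localhost:8000"  # assumed host, for illustration only

    resp = requests.post(
        f"{BASE_URL}/loadWebURLs",
        params={"vectorstore": "ConversAI$alice$supportbot"},  # hypothetical name
        json=["https://example.com/docs", "https://example.com/faq"],  # body: list[str]
    )
    print(resp.json()["output"])  # {url: extracted text, ...}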
@@ -422,7 +341,8 @@ async def delete(username: str):
 @app.post("/getLinks")
 async def crawlUrl(baseUrl: str):
     return {
-        "urls": getLinks(url=baseUrl, timeout=30)
+        "urls": getLinks(url=baseUrl, timeout=30),
+        "source": urlparse(baseUrl).netloc
     }
 
 
@@ -436,9 +356,10 @@ async def getCount(vectorstore: str):
 
 
 @app.post("/getYoutubeTranscript")
-async def getYTTranscript(urls: str):
+async def getYTTranscript(urls: list[str]):
     return {
-        "output": getTranscript(urls=urls)
+        "output": getTranscript(urls=urls),
+        "source": "www.youtube.com"
     }
 
 
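getYTTranscript now takes list[str] (so the URLs travel in the JSON body) and returns a URL-keyed mapping plus a fixed source. One caveat: getTranscript iterates set(urls) but zips the results against the original urls list, so duplicated inputs can pair a URL with the wrong transcript. Illustrative call:

    import requests

    BASE_URL = "http://localhost:8000"  # assumed host, for illustration only

    resp = requests.post(
        f"{BASE_URL}/getYoutubeTranscript",
        json=["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],  # example URL
    )
    print(resp.json())  # {"output": {url: transcript}, "source": "www.youtube.com"}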
functions.py
CHANGED
@@ -56,7 +56,7 @@ INSTRUCTIONS:
 2. **Maintaining Focus**: Politely redirect any off-topic conversations back to relevant issues without breaking character.
 3. **Exclusive Reliance on Context Data**: Base all answers strictly on the provided context data. If the context doesn’t cover the query, use a fallback response. Always maintain a third-person perspective.
 4. **Restrictive Role Focus**: Do not engage in tasks or answer questions unrelated to your role or context data.
-Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words.
+Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words. Make sure the user is always happy and satisfied with the outputs you return.
 CONTEXT:
 =====================================
 {context}
@@ -139,14 +139,19 @@ def addDocuments(text: str, source: str, vectorstore: str):
 
 
 def format_docs(docs: str):
+    global sources
+    sources = []
     context = ""
     for doc in docs:
-
-
+        context += f"{doc.page_content}\n\n\n"
+        source = doc.metadata
+        source = source["source"]
+        sources.append(source)
     if context == "":
         context = "No context found"
     else:
         pass
+    sources = list(set(sources))
     return context
 
 
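format_docs now builds the context string from each document's page_content and records the unique metadata sources in a module-level `sources` global, which answerQuery returns alongside the answer. A minimal sketch of the resulting behavior, assuming LangChain-style Document objects (the retriever's usual type):

    from langchain_core.documents import Document

    docs = [
        Document(page_content="Refunds take 5-7 days.", metadata={"source": "faq.pdf"}),
        Document(page_content="Chat support is 24/7.", metadata={"source": "faq.pdf"}),
    ]
    context = format_docs(docs)
    # context -> both page_contents, each followed by blank lines
    # sources -> ["faq.pdf"]  (set()-deduplicated, so ordering is not guaranteed)

Since `sources` is process-wide mutable state, concurrent requests could overwrite each other's value; returning the list from format_docs would sidestep that.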
@@ -171,6 +176,7 @@ def trimMessages(chain_input):
 def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192") -> str:
     global prompt
     global client
+    global sources
     global vectorEmbeddings
     global sparseEmbeddings
     vectorStoreName = vectorstore
@@ -201,7 +207,8 @@ def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192")
         "output": chain.invoke(
             {"question": query},
             {"configurable": {"session_id": vectorStoreName}}
-        )
+        ),
+        "sources": sources
     }
 
 
@@ -271,13 +278,12 @@ def getTextFromImagePDF(pdfBytes):
         return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
     allImages = convert_from_bytes(pdfBytes)
     texts = [getText(image) for image in allImages]
-    return
+    return {x + 1: y for x, y in enumerate(texts)}
 
 
 def getTranscript(urls: str):
-    urls = urls.split(",")
     texts = []
-    for url in urls:
+    for url in set(urls):
         try:
             loader = YoutubeLoader.from_youtube_url(
                 url, add_video_info=False
@@ -287,10 +293,11 @@ def getTranscript(urls: str):
         except:
             doc = ""
         texts.append(doc)
-    return
+    return {x: y for x, y in zip(urls, texts)}
 
 
 def analyzeData(query, dataframe):
+    query += ". In case, you are to plot a chart, make sure the x-axis labels are 90 degree rotated"
     llm = ChatGroq(name="llama-3.1-8b-instant")
     df = SmartDataframe(dataframe, config={"llm": llm, "verbose": False})
     response = df.chat(query)
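analyzeData now appends a chart-formatting hint to every query before handing it to pandasai's SmartDataframe. A sketch, assuming Groq credentials are configured in the environment:

    import pandas as pd

    df = pd.DataFrame({"month": ["Jan", "Feb", "Mar"], "sales": [120, 90, 160]})
    # The appended suffix asks the LLM to rotate x-axis labels if it plots.
    print(analyzeData("Which month had the highest sales?", df))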
@@ -312,7 +319,7 @@ def extractTextFromPdf(pdf_path):
     with ThreadPoolExecutor() as executor:
         texts = list(executor.map(extractTextFromPage, pages))
     doc.close()
-    return
+    return {x + 1: y for x, y in enumerate(texts)}
 
 
 def extractTextFromUrl(url):
@@ -326,4 +333,4 @@ def extractTextFromUrl(url):
 def extractTextFromUrlList(urls):
     with ThreadPoolExecutor() as executor:
         texts = list(executor.map(extractTextFromUrl, urls))
-    return
+    return {x: y for x, y in zip(urls, texts)}
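The extraction helpers all gain concrete return values in place of the old bare returns: dicts keyed by 1-based page number for PDFs and by URL for the web and YouTube helpers. Illustrative shapes (inputs hypothetical):

    extractTextFromPdf("manual.pdf")          # {1: "page one text", 2: "page two text", ...}
    getTextFromImagePDF(pdf_bytes)            # {1: "OCR text of page one", ...}
    extractTextFromUrlList(["https://a.com"]) # {"https://a.com": "page text"}
    getTranscript(["https://youtu.be/x"])     # {"https://youtu.be/x": "transcript"}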