Rauhan committed on
Commit
9f24b08
1 Parent(s): 6747b31

UPDATE: trainChatbot

Browse files
Files changed (2) hide show
  1. app.py +46 -33
  2. functions.py +12 -9
app.py CHANGED
@@ -271,7 +271,7 @@ async def loadPDF(vectorstore: str, pdf: UploadFile = File(...)):
271
  .insert({"username": username,
272
  "chatbotName": chatbotName,
273
  "dataSourceName": fileName,
274
- "sourceEndpoint": "\loadPDF",
275
  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
276
  .execute()
277
  )
@@ -299,7 +299,7 @@ async def loadImagePDF(vectorstore: str, pdf: UploadFile = File(...)):
299
  .insert({"username": username,
300
  "chatbotName": chatbotName,
301
  "dataSourceName": fileName,
302
- "sourceEndpoint": "\loadImagePDF",
303
  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
304
  .execute()
305
  )
@@ -330,7 +330,7 @@ async def loadText(addTextConfig: AddText):
330
  .insert({"username": username,
331
  "chatbotName": chatbotName,
332
  "dataSourceName": fileName,
333
- "sourceEndpoint": "\loadText",
334
  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
335
  .execute()
336
  )
@@ -339,28 +339,6 @@ async def loadText(addTextConfig: AddText):
339
  }
340
 
341
 
342
-
343
- @app.post("/addText")
344
- async def addText(addTextConfig: AddText):
345
- vectorstore, text, source = addTextConfig.vectorstore, addTextConfig.text, addTextConfig.source
346
- text = base64.b64decode(text.encode("utf-8")).decode("utf-8")
347
- username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
348
- df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
349
- currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
350
- newCount = currentCount + len(text)
351
- limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
352
- "tokenLimit"]
353
- if newCount < int(limit):
354
- supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
355
- "chatbotname", chatbotname).execute()
356
- output = addDocuments(text=text, source=source, vectorstore=vectorstore)
357
- return output
358
- else:
359
- return {
360
- "output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
361
- }
362
-
363
-
364
  class AddQAPair(BaseModel):
365
  vectorstore: str
366
  question: str
@@ -410,7 +388,7 @@ async def loadWebURLs(loadWebsite: LoadWebsite):
410
  .insert({"username": username,
411
  "chatbotName": chatbotName,
412
  "dataSourceName": fileName,
413
- "sourceEndpoint": "\loadWebURLs",
414
  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
415
  .execute()
416
  )
@@ -467,8 +445,8 @@ class YtTranscript(BaseModel):
467
  urls: list[str]
468
 
469
 
470
- @app.post("/getYoutubeTranscript")
471
- async def getYoutubeTranscript(ytTranscript: YtTranscript):
472
  vectorstore, urls = ytTranscript.vectorstore, ytTranscript.urls
473
  username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
474
  text = getTranscript(urls=urls)
@@ -484,7 +462,7 @@ async def getYoutubeTranscript(ytTranscript: YtTranscript):
484
  .insert({"username": username,
485
  "chatbotName": chatbotName,
486
  "dataSourceName": fileName,
487
- "sourceEndpoint": "\getYoutubeTranscript",
488
  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
489
  .execute()
490
  )
@@ -523,7 +501,42 @@ async def chatHistory(vectorstore: str):
523
  return response
524
 
525
 
526
- # @app.post("/trainChatbot")
527
- # async def chatHistory(vectorstore: str):
528
- # username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
529
- # return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  .insert({"username": username,
272
  "chatbotName": chatbotName,
273
  "dataSourceName": fileName,
274
+ "sourceEndpoint": "/loadPDF",
275
  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
276
  .execute()
277
  )
 
299
  .insert({"username": username,
300
  "chatbotName": chatbotName,
301
  "dataSourceName": fileName,
302
+ "sourceEndpoint": "/loadImagePDF",
303
  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
304
  .execute()
305
  )
 
330
  .insert({"username": username,
331
  "chatbotName": chatbotName,
332
  "dataSourceName": fileName,
333
+ "sourceEndpoint": "/loadText",
334
  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
335
  .execute()
336
  )
 
339
  }
340
 
341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  class AddQAPair(BaseModel):
343
  vectorstore: str
344
  question: str
 
388
  .insert({"username": username,
389
  "chatbotName": chatbotName,
390
  "dataSourceName": fileName,
391
+ "sourceEndpoint": "/loadWebURLs",
392
  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
393
  .execute()
394
  )
 
445
  urls: list[str]
446
 
447
 
448
+ @app.post("/loadYoutubeTranscript")
449
+ async def loadYoutubeTranscript(ytTranscript: YtTranscript):
450
  vectorstore, urls = ytTranscript.vectorstore, ytTranscript.urls
451
  username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
452
  text = getTranscript(urls=urls)
 
462
  .insert({"username": username,
463
  "chatbotName": chatbotName,
464
  "dataSourceName": fileName,
465
+ "sourceEndpoint": "/getYoutubeTranscript",
466
  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
467
  .execute()
468
  )
 
501
  return response
502
 
503
 
504
@app.post("/listChatbotSources")
async def listChatbotSources(vectorstore: str):
    """Return every data-source row recorded for one chatbot.

    `vectorstore` is split on "$"; the second and third fields are used as
    the username and chatbot name when filtering the
    "ConversAI_ChatbotDataSources" table.

    Raises:
        IndexError: if `vectorstore` has fewer than three "$"-separated fields.
    """
    parts = vectorstore.split("$")
    username = parts[1]
    chatbotName = parts[2]
    query = (
        supabase.table("ConversAI_ChatbotDataSources")
        .select("*")
        .eq("username", username)
        .eq("chatbotName", chatbotName)
    )
    return query.execute().data
509
+
510
+
511
+
512
def _parseSourceFile(raw: str) -> dict:
    """Parse a downloaded source payload without executing it."""
    import ast
    import json

    try:
        return json.loads(raw)
    except ValueError:
        # SECURITY: this payload used to be passed to eval(), which runs
        # arbitrary code fetched over the network. literal_eval accepts the
        # same plain dict/list/str literals but cannot execute anything.
        return ast.literal_eval(raw)


def _decodeSections(content: dict) -> str:
    """Base64-decode each value of `content` and join them with "."."""
    return ".".join(
        base64.b64decode(section.encode("utf-8")).decode("utf-8")
        for section in content.values()
    )


@app.post("/trainChatbot")
async def trainChatbot(vectorstore: str):
    """Rebuild a chatbot's vector store from all of its recorded data sources.

    `vectorstore` is split on "$"; the second and third fields are used as
    the username and chatbot name. Every matching row of
    "ConversAI_ChatbotDataSources" is downloaded from its "sourceContentURL",
    turned into a `(text, source)` tuple, and the list is handed to
    `addDocuments`.

    Returns:
        Whatever `addDocuments` returns.

    Raises:
        IndexError: if `vectorstore` has fewer than three "$"-separated fields.
        requests.HTTPError: if a source file cannot be downloaded.
    """
    parts = vectorstore.split("$")
    username, chatbotName = parts[1], parts[2]

    # One query for all sources of this chatbot. Previously the lookup
    # iterated an always-empty list, so nothing was ever fetched or trained.
    rows = (
        supabase.table("ConversAI_ChatbotDataSources")
        .select("*")
        .eq("username", username)
        .eq("chatbotName", chatbotName)
        .execute()
        .data
    )

    # Endpoints whose "output" is a mapping of base64-encoded sections.
    # "/getYoutubeTranscript" is accepted as well because the YouTube loader
    # records that value as its "sourceEndpoint" although its route is
    # "/loadYoutubeTranscript".
    encodedEndpoints = {
        "/loadPDF",
        "/loadImagePDF",
        "/loadWebURLs",
        "/loadYoutubeTranscript",
        "/getYoutubeTranscript",
    }
    plainEndpoints = {"/loadText"}

    texts = []
    for row in rows:
        endpoint = row["sourceEndpoint"]
        if endpoint not in encodedEndpoints and endpoint not in plainEndpoints:
            # Unknown source types are skipped without being downloaded.
            continue

        # NOTE(review): requests is blocking inside an async handler; the
        # timeout at least bounds how long the event loop can be stalled.
        response = requests.get(row["sourceContentURL"], timeout=30)
        response.raise_for_status()
        file = _parseSourceFile(response.content.decode("utf-8"))

        content = file["output"]
        if endpoint in plainEndpoints:
            text = content
        else:
            text = _decodeSections(content)
        texts.append((text.replace("\n", " "), file["source"]))

    return addDocuments(texts=texts, vectorstore=vectorstore)
functions.py CHANGED
@@ -113,6 +113,7 @@ def createTable(tablename: str):
113
  prefer_grpc=True,
114
  api_key=os.environ["QDRANT_API_KEY"],
115
  collection_name=tablename,
 
116
  retrieval_mode=RetrievalMode.HYBRID
117
  )
118
  return {
@@ -120,7 +121,7 @@ def createTable(tablename: str):
120
  }
121
 
122
 
123
- def addDocuments(text: str, source: str, vectorstore: str):
124
  global vectorEmbeddings
125
  global sparseEmbeddings
126
  splitter = RecursiveCharacterTextSplitter(
@@ -128,20 +129,22 @@ def addDocuments(text: str, source: str, vectorstore: str):
128
  chunk_overlap=250,
129
  add_start_index=True
130
  )
131
- text = text.replace("\n", " ")
132
- text = text.translate(str.maketrans('', '', string.punctuation.replace(".", "")))
133
- texts = [Document(page_content=text, metadata={"source": source})]
134
- texts = splitter.split_documents(texts)
135
- ids = [str(uuid4()) for _ in range(len(texts))]
136
- vectorstore = QdrantVectorStore.from_existing_collection(
 
137
  embedding=vectorEmbeddings,
138
  sparse_embedding=sparseEmbeddings,
139
- collection_name=vectorstore,
140
  url=os.environ["QDRANT_URL"],
 
141
  api_key=os.environ["QDRANT_API_KEY"],
 
 
142
  retrieval_mode=RetrievalMode.HYBRID
143
  )
144
- vectorstore.add_documents(documents=texts, ids=ids)
145
  return {
146
  "output": "SUCCESS"
147
  }
 
113
  prefer_grpc=True,
114
  api_key=os.environ["QDRANT_API_KEY"],
115
  collection_name=tablename,
116
+ force_recreate=True,
117
  retrieval_mode=RetrievalMode.HYBRID
118
  )
119
  return {
 
121
  }
122
 
123
 
124
+ def addDocuments(texts: list[tuple[str]], vectorstore: str):
125
  global vectorEmbeddings
126
  global sparseEmbeddings
127
  splitter = RecursiveCharacterTextSplitter(
 
129
  chunk_overlap=250,
130
  add_start_index=True
131
  )
132
+ sources = [textTuple[1] for textTuple in texts]
133
+ texts = [textTuple[0].replace("\n", " ") for textTuple in texts]
134
+ texts = [text.translate(str.maketrans('', '', string.punctuation.replace(".", ""))) for text in texts]
135
+ texts = [Document(page_content=text, metadata={"source": source}) for text, source in zip(texts, sources)]
136
+ documents = splitter.split_documents(texts)
137
+ vectorstore = QdrantVectorStore.from_documents(
138
+ documents=documents,
139
  embedding=vectorEmbeddings,
140
  sparse_embedding=sparseEmbeddings,
 
141
  url=os.environ["QDRANT_URL"],
142
+ prefer_grpc=True,
143
  api_key=os.environ["QDRANT_API_KEY"],
144
+ collection_name=vectorstore,
145
+ force_recreate=True,
146
  retrieval_mode=RetrievalMode.HYBRID
147
  )
 
148
  return {
149
  "output": "SUCCESS"
150
  }