Spaces:
Sleeping
Sleeping
UPDATE: trainChatbot
Browse files- app.py +46 -33
- functions.py +12 -9
app.py
CHANGED
@@ -271,7 +271,7 @@ async def loadPDF(vectorstore: str, pdf: UploadFile = File(...)):
|
|
271 |
.insert({"username": username,
|
272 |
"chatbotName": chatbotName,
|
273 |
"dataSourceName": fileName,
|
274 |
-
"sourceEndpoint": "
|
275 |
"sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
|
276 |
.execute()
|
277 |
)
|
@@ -299,7 +299,7 @@ async def loadImagePDF(vectorstore: str, pdf: UploadFile = File(...)):
|
|
299 |
.insert({"username": username,
|
300 |
"chatbotName": chatbotName,
|
301 |
"dataSourceName": fileName,
|
302 |
-
"sourceEndpoint": "
|
303 |
"sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
|
304 |
.execute()
|
305 |
)
|
@@ -330,7 +330,7 @@ async def loadText(addTextConfig: AddText):
|
|
330 |
.insert({"username": username,
|
331 |
"chatbotName": chatbotName,
|
332 |
"dataSourceName": fileName,
|
333 |
-
"sourceEndpoint": "
|
334 |
"sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
|
335 |
.execute()
|
336 |
)
|
@@ -339,28 +339,6 @@ async def loadText(addTextConfig: AddText):
|
|
339 |
}
|
340 |
|
341 |
|
342 |
-
|
343 |
-
@app.post("/addText")
|
344 |
-
async def addText(addTextConfig: AddText):
|
345 |
-
vectorstore, text, source = addTextConfig.vectorstore, addTextConfig.text, addTextConfig.source
|
346 |
-
text = base64.b64decode(text.encode("utf-8")).decode("utf-8")
|
347 |
-
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
348 |
-
df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
349 |
-
currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
350 |
-
newCount = currentCount + len(text)
|
351 |
-
limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
|
352 |
-
"tokenLimit"]
|
353 |
-
if newCount < int(limit):
|
354 |
-
supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
|
355 |
-
"chatbotname", chatbotname).execute()
|
356 |
-
output = addDocuments(text=text, source=source, vectorstore=vectorstore)
|
357 |
-
return output
|
358 |
-
else:
|
359 |
-
return {
|
360 |
-
"output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
361 |
-
}
|
362 |
-
|
363 |
-
|
364 |
class AddQAPair(BaseModel):
|
365 |
vectorstore: str
|
366 |
question: str
|
@@ -410,7 +388,7 @@ async def loadWebURLs(loadWebsite: LoadWebsite):
|
|
410 |
.insert({"username": username,
|
411 |
"chatbotName": chatbotName,
|
412 |
"dataSourceName": fileName,
|
413 |
-
"sourceEndpoint": "
|
414 |
"sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
|
415 |
.execute()
|
416 |
)
|
@@ -467,8 +445,8 @@ class YtTranscript(BaseModel):
|
|
467 |
urls: list[str]
|
468 |
|
469 |
|
470 |
-
@app.post("/
|
471 |
-
async def
|
472 |
vectorstore, urls = ytTranscript.vectorstore, ytTranscript.urls
|
473 |
username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
474 |
text = getTranscript(urls=urls)
|
@@ -484,7 +462,7 @@ async def getYoutubeTranscript(ytTranscript: YtTranscript):
|
|
484 |
.insert({"username": username,
|
485 |
"chatbotName": chatbotName,
|
486 |
"dataSourceName": fileName,
|
487 |
-
"sourceEndpoint": "
|
488 |
"sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
|
489 |
.execute()
|
490 |
)
|
@@ -523,7 +501,42 @@ async def chatHistory(vectorstore: str):
|
|
523 |
return response
|
524 |
|
525 |
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
271 |
.insert({"username": username,
|
272 |
"chatbotName": chatbotName,
|
273 |
"dataSourceName": fileName,
|
274 |
+
"sourceEndpoint": "/loadPDF",
|
275 |
"sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
|
276 |
.execute()
|
277 |
)
|
|
|
299 |
.insert({"username": username,
|
300 |
"chatbotName": chatbotName,
|
301 |
"dataSourceName": fileName,
|
302 |
+
"sourceEndpoint": "/loadImagePDF",
|
303 |
"sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
|
304 |
.execute()
|
305 |
)
|
|
|
330 |
.insert({"username": username,
|
331 |
"chatbotName": chatbotName,
|
332 |
"dataSourceName": fileName,
|
333 |
+
"sourceEndpoint": "/loadText",
|
334 |
"sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
|
335 |
.execute()
|
336 |
)
|
|
|
339 |
}
|
340 |
|
341 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
342 |
class AddQAPair(BaseModel):
|
343 |
vectorstore: str
|
344 |
question: str
|
|
|
388 |
.insert({"username": username,
|
389 |
"chatbotName": chatbotName,
|
390 |
"dataSourceName": fileName,
|
391 |
+
"sourceEndpoint": "/loadWebURLs",
|
392 |
"sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
|
393 |
.execute()
|
394 |
)
|
|
|
445 |
urls: list[str]
|
446 |
|
447 |
|
448 |
+
@app.post("/loadYoutubeTranscript")
|
449 |
+
async def loadYoutubeTranscript(ytTranscript: YtTranscript):
|
450 |
vectorstore, urls = ytTranscript.vectorstore, ytTranscript.urls
|
451 |
username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
452 |
text = getTranscript(urls=urls)
|
|
|
462 |
.insert({"username": username,
|
463 |
"chatbotName": chatbotName,
|
464 |
"dataSourceName": fileName,
|
465 |
+
"sourceEndpoint": "/getYoutubeTranscript",
|
466 |
"sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
|
467 |
.execute()
|
468 |
)
|
|
|
501 |
return response
|
502 |
|
503 |
|
504 |
+
@app.post("/listChatbotSources")
async def listChatbotSources(vectorstore: str):
    """Return every data-source row registered for one chatbot.

    ``vectorstore`` is split on ``$``; its second and third fields are used
    as the username and chatbot name to filter the
    ``ConversAI_ChatbotDataSources`` table.

    Raises:
        IndexError: if ``vectorstore`` has fewer than three ``$``-separated
            fields.
    """
    fields = vectorstore.split("$")
    owner = fields[1]
    botName = fields[2]
    query = (
        supabase.table("ConversAI_ChatbotDataSources")
        .select("*")
        .eq("username", owner)
        .eq("chatbotName", botName)
    )
    return query.execute().data
|
509 |
+
|
510 |
+
|
511 |
+
|
512 |
+
@app.post("/trainChatbot")
async def trainChatbot(vectorstore: str):
    """Rebuild a chatbot's vector store from its registered data sources.

    The username and chatbot name are taken from the second and third
    ``$``-separated fields of ``vectorstore``. Every matching row in
    ``ConversAI_ChatbotDataSources`` is fetched from its ``sourceContentURL``,
    decoded according to its ``sourceEndpoint``, and the resulting
    ``(text, source)`` pairs are handed to ``addDocuments``.

    Args:
        vectorstore: ``$``-separated identifier; also passed through to
            ``addDocuments`` unchanged.

    Returns:
        Whatever ``addDocuments`` returns.

    Raises:
        IndexError: if ``vectorstore`` has fewer than three fields.
        requests.HTTPError: if a stored payload cannot be downloaded.
        ValueError, SyntaxError: if a payload is neither JSON nor a Python
            literal.
    """
    import ast
    import json

    # Endpoints whose payload "output" maps keys to base64-encoded text.
    encodedEndpoints = {
        "/loadPDF",
        "/loadImagePDF",
        "/loadWebURLs",
        "/loadYoutubeTranscript",
        # The YouTube loader records its rows under this name, so accept it
        # too; otherwise transcripts would be silently skipped.
        "/getYoutubeTranscript",
    }
    plainTextEndpoint = "/loadText"

    username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
    # The list of sources to train on comes from the table, not from an
    # (always empty) local list.
    rows = (
        supabase.table("ConversAI_ChatbotDataSources")
        .select("*")
        .eq("username", username)
        .eq("chatbotName", chatbotName)
        .execute()
        .data
    )

    texts = []
    for row in rows:
        endpoint = row["sourceEndpoint"]
        if endpoint != plainTextEndpoint and endpoint not in encodedEndpoints:
            # Unknown source type: there is no decoding rule for it.
            continue

        response = requests.get(row["sourceContentURL"], timeout=30)
        response.raise_for_status()
        raw = response.content.decode("utf-8")
        # SECURITY: the payload arrives over HTTP, so it is parsed as data and
        # never executed with eval(). JSON is tried first; literal_eval covers
        # payloads that were stored as a Python dict repr.
        try:
            payload = json.loads(raw)
        except json.JSONDecodeError:
            payload = ast.literal_eval(raw)

        content = payload["output"]
        if endpoint == plainTextEndpoint:
            text = content
        else:
            text = ".".join(
                base64.b64decode(chunk.encode("utf-8")).decode("utf-8")
                for chunk in content.values()
            )
        # Keep each text paired with its own source in a single tuple so the
        # two can never drift out of alignment.
        texts.append((text.replace("\n", " "), payload["source"]))

    return addDocuments(texts=texts, vectorstore=vectorstore)
|
functions.py
CHANGED
@@ -113,6 +113,7 @@ def createTable(tablename: str):
|
|
113 |
prefer_grpc=True,
|
114 |
api_key=os.environ["QDRANT_API_KEY"],
|
115 |
collection_name=tablename,
|
|
|
116 |
retrieval_mode=RetrievalMode.HYBRID
|
117 |
)
|
118 |
return {
|
@@ -120,7 +121,7 @@ def createTable(tablename: str):
|
|
120 |
}
|
121 |
|
122 |
|
123 |
-
def addDocuments(
|
124 |
global vectorEmbeddings
|
125 |
global sparseEmbeddings
|
126 |
splitter = RecursiveCharacterTextSplitter(
|
@@ -128,20 +129,22 @@ def addDocuments(text: str, source: str, vectorstore: str):
|
|
128 |
chunk_overlap=250,
|
129 |
add_start_index=True
|
130 |
)
|
131 |
-
|
132 |
-
|
133 |
-
texts = [
|
134 |
-
texts =
|
135 |
-
|
136 |
-
vectorstore = QdrantVectorStore.
|
|
|
137 |
embedding=vectorEmbeddings,
|
138 |
sparse_embedding=sparseEmbeddings,
|
139 |
-
collection_name=vectorstore,
|
140 |
url=os.environ["QDRANT_URL"],
|
|
|
141 |
api_key=os.environ["QDRANT_API_KEY"],
|
|
|
|
|
142 |
retrieval_mode=RetrievalMode.HYBRID
|
143 |
)
|
144 |
-
vectorstore.add_documents(documents=texts, ids=ids)
|
145 |
return {
|
146 |
"output": "SUCCESS"
|
147 |
}
|
|
|
113 |
prefer_grpc=True,
|
114 |
api_key=os.environ["QDRANT_API_KEY"],
|
115 |
collection_name=tablename,
|
116 |
+
force_recreate=True,
|
117 |
retrieval_mode=RetrievalMode.HYBRID
|
118 |
)
|
119 |
return {
|
|
|
121 |
}
|
122 |
|
123 |
|
124 |
+
def addDocuments(texts: list[tuple[str]], vectorstore: str):
|
125 |
global vectorEmbeddings
|
126 |
global sparseEmbeddings
|
127 |
splitter = RecursiveCharacterTextSplitter(
|
|
|
129 |
chunk_overlap=250,
|
130 |
add_start_index=True
|
131 |
)
|
132 |
+
sources = [textTuple[1] for textTuple in texts]
|
133 |
+
texts = [textTuple[0].replace("\n", " ") for textTuple in texts]
|
134 |
+
texts = [text.translate(str.maketrans('', '', string.punctuation.replace(".", ""))) for text in texts]
|
135 |
+
texts = [Document(page_content=text, metadata={"source": source}) for text, source in zip(texts, sources)]
|
136 |
+
documents = splitter.split_documents(texts)
|
137 |
+
vectorstore = QdrantVectorStore.from_documents(
|
138 |
+
documents=documents,
|
139 |
embedding=vectorEmbeddings,
|
140 |
sparse_embedding=sparseEmbeddings,
|
|
|
141 |
url=os.environ["QDRANT_URL"],
|
142 |
+
prefer_grpc=True,
|
143 |
api_key=os.environ["QDRANT_API_KEY"],
|
144 |
+
collection_name=vectorstore,
|
145 |
+
force_recreate=True,
|
146 |
retrieval_mode=RetrievalMode.HYBRID
|
147 |
)
|
|
|
148 |
return {
|
149 |
"output": "SUCCESS"
|
150 |
}
|