File size: 15,340 Bytes
29dd018
c5522cd
4c37639
0dda2a1
 
4005ffd
940df6c
0dda2a1
7a52fdd
 
 
9a054bf
 
937bcc4
9a054bf
ac9adab
b352af8
0dda2a1
940df6c
0dda2a1
40d15f0
0dda2a1
88d2fdc
 
47650a0
c4a2d1f
c4684f9
40d15f0
0dda2a1
 
a1f9f6b
c4a2d1f
 
 
e52ad04
0dda2a1
 
064943c
0dda2a1
 
940df6c
4c37639
3fa5f95
 
0dda2a1
3fa5f95
29dd018
0dda2a1
e6cb545
 
0dda2a1
39cf044
0dda2a1
39cf044
 
 
 
d3176f4
e6cb545
0dda2a1
 
0f88fd2
e6cb545
9a054bf
0dda2a1
e6cb545
9a054bf
 
4f74893
0dda2a1
 
9a054bf
0dda2a1
7a52fdd
 
 
 
 
 
4f74893
7a52fdd
 
 
 
 
 
 
 
 
 
 
0dda2a1
074d6fc
 
c178b42
074d6fc
 
 
 
3fa5f95
320eff2
074d6fc
 
 
064943c
074d6fc
 
 
 
 
064943c
074d6fc
 
 
 
064943c
074d6fc
d2fce7d
3fa5f95
0dda2a1
 
 
30f4617
 
c01e475
3fa5f95
 
c01e475
 
 
 
f0d6550
9f24b08
f0d6550
c01e475
 
 
 
0dda2a1
80cfec3
 
 
 
3fa5f95
9f24b08
c01e475
 
6d8505d
 
 
3fa5f95
c01e475
9f24b08
 
 
 
 
 
 
3fa5f95
c01e475
 
9f24b08
c01e475
9f24b08
 
c01e475
 
 
 
 
0dda2a1
 
 
d3176f4
7a52fdd
d3176f4
0f88fd2
 
d3176f4
 
 
 
0dda2a1
 
3fa5f95
 
d3176f4
7a52fdd
0dda2a1
 
9a054bf
 
6e09a79
 
 
9a054bf
 
 
 
 
8144327
9a054bf
 
 
3fa5f95
9a054bf
 
 
 
572d835
3fa5f95
27f5d4b
d3176f4
7a52fdd
 
 
940df6c
 
d912ba1
27f5d4b
3fa5f95
940df6c
27f5d4b
 
940df6c
 
27f5d4b
074d6fc
217fc47
3fa5f95
074d6fc
 
3fa5f95
7a52fdd
3fa5f95
 
217fc47
 
9a054bf
3fa5f95
 
9a054bf
3fa5f95
572d835
7a52fdd
217fc47
d912ba1
7a52fdd
 
 
 
 
d3176f4
27f5d4b
0dda2a1
 
 
064943c
 
 
 
 
 
 
 
 
 
40d15f0
3fa5f95
40d15f0
064943c
 
 
 
8c0f543
3fa5f95
064943c
 
 
 
c4a2d1f
 
3fa5f95
 
c4684f9
17050fe
fce68f1
 
 
 
 
 
2f18daa
 
 
4a2e5ad
2f18daa
c4684f9
fce68f1
4a2e5ad
29dd018
c4684f9
 
fce68f1
3fa5f95
c4684f9
 
 
 
 
 
c4a2d1f
c4684f9
88d2fdc
 
 
 
4a38803
 
80cfec3
 
074d6fc
e6a7560
9245bf5
d3176f4
ac9adab
3fa5f95
d9c4277
2c6b8d9
d3176f4
2c6b8d9
 
3fa5f95
2c6b8d9
 
80cfec3
2c6b8d9
 
 
d3176f4
937bcc4
 
 
d3176f4
3fa5f95
 
937bcc4
a1f9f6b
 
ed7063b
a1f9f6b
 
320eff2
4c37639
 
 
9245bf5
4c37639
29dd018
4c37639
 
 
 
 
29dd018
d3176f4
4c37639
29dd018
4c37639
 
 
 
 
9245bf5
4c37639
29dd018
4c37639
 
 
a8a5b30
 
 
da75ad8
 
 
ed7063b
da75ad8
 
 
 
 
 
b9076ac
ed7063b
b9076ac
 
ed7063b
da75ad8
 
 
a8a5b30
 
 
 
 
 
 
 
80cfec3
ea8ad26
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
import pymupdf
import string
from concurrent.futures import ThreadPoolExecutor
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from langchain_qdrant import RetrievalMode
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.memory import ChatMessageHistory
from pandasai import SmartDataframe
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.document_loaders import YoutubeLoader
from langchain.docstore.document import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import FastEmbedSparse
from supabase.client import create_client
from qdrant_client import QdrantClient
from langchain_groq import ChatGroq
from pdf2image import convert_from_bytes
import numpy as np
import easyocr
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from supabase import create_client
from dotenv import load_dotenv
import os
import base64
import time
import requests


# Load credentials/endpoints from the local env file before any client is built.
load_dotenv("secrets.env")
# Supabase client used by the user/data-source/activity-log helpers below.
client = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
# Low-level Qdrant client used for collection listing and deletion.
qdrantClient = QdrantClient(url=os.environ["QDRANT_URL"], api_key=os.environ["QDRANT_API_KEY"])
# Dense embedding model configuration: run on CUDA and normalize the vectors.
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True}
vectorEmbeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
# English OCR reader (GPU enabled) with models stored under /app/EasyOCRModels.
reader = easyocr.Reader(['en'], gpu=True, model_storage_directory="/app/EasyOCRModels")
# Sparse (BM25) embeddings; paired with the dense embeddings for hybrid retrieval.
sparseEmbeddings = FastEmbedSparse(model="Qdrant/BM25", threads=20, parallel=0)
# Answering prompt template. It expects three variables: {context}, {question}
# and {chatHistory}. The text is sent to the LLM verbatim.
prompt = """
INSTRUCTIONS:
=====================================
### Role
**Primary Function**: You are an AI chatbot designed to provide accurate and efficient assistance to users based on provided context data. Your responses must be reliable, friendly, and directly address user inquiries or issues. Always clarify any unclear questions, and conclude responses positively.
### Constraints
1. **No Data Disclosure**: Never reveal access to training data or any context explicitly.
2. **Maintaining Focus**: Politely redirect any off-topic conversations back to relevant issues without breaking character.
3. **Exclusive Reliance on Context Data**: Base all answers strictly on the provided context data. If the context doesn’t cover the query, use a fallback response. Always maintain a third-person perspective.
4. **Restrictive Role Focus**: Do not engage in tasks or answer questions unrelated to your role or context data.
Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words. Make sure the user is always happy and satisfied with the outputs you return.
CONTEXT:
=====================================
{context}
======================================
QUESTION:
=====================================
{question}
CHAT HISTORY:
=====================================
{chatHistory}
NOTE: Generate responses directly without using phrases like "Response:" or "Answer:". NEVER mention the user about usage of any context to generate an answer.
"""
# Rebind the name from the raw string to the compiled chat prompt template.
prompt = ChatPromptTemplate.from_template(prompt)
# In-process store of chat histories, keyed by session id (see get_session_history).
chatHistoryStore = dict()

# Output schema for the follow-up question chain: three question strings.
# NOTE(review): documented with comments rather than a class docstring because a
# docstring may be picked up as the schema description and so alter the format
# instructions built from this model below - confirm before adding one.
class FollowUps(BaseModel):
    q1: str = Field(description="First Follow-up Question")
    q2: str = Field(description="Second Follow-up Question")
    q3: str = Field(description="Third Follow-up Question")

# Prompt for generating follow-up questions from the retrieved context.
# {format_instructions} is filled in once below; {context} is supplied per call.
followUpPrompt = """
You are an expert chatbot at framing follow up questions using some given text such that their answers can be found in the text itself and have been given the task of doing the same. Make sure that the questions are good quality and not too long in length. Frame appropriate and meaningful questions out of the given text and DO NOT mention the usage of any text in the questions. Also, if no the given text says NO CONTEXT FOUND, please return an empty string for each question asked.
\n{format_instructions}
\n{context}
"""
# Parser that turns the LLM reply into JSON shaped like FollowUps.
jsonParser = JsonOutputParser(pydantic_object=FollowUps)
# Rebind the name from the raw string to the prompt template, with the parser's
# format instructions pre-filled as a partial variable.
followUpPrompt = PromptTemplate(
    template=followUpPrompt,
    input_variables=["context"],
    partial_variables={"format_instructions": jsonParser.get_format_instructions()},
)



def createUser(user_id: str, username: str, email: str) -> dict:
    """Register a new user unless the username is already taken.

    Args:
        user_id: Identifier stored in the ``ConversAI_UserInfo`` row.
        username: Desired username; also written as ``user_id`` of the
            ``ConversAI_UserConfig`` row.
        email: E-mail address stored alongside the user.

    Returns:
        A dict with ``code`` and ``message``: 200 on success, 409 when the
        username exists or when either insert raises.
    """
    existingRows = client.table("ConversAI_UserInfo").select("*").execute().data
    takenUsernames = [row["username"] for row in existingRows]
    if username in takenUsernames:
        return {
            "code": 409,
            "message": "Username already exists"
        }

    try:
        client.table("ConversAI_UserInfo").insert(
            {"user_id": user_id, "username": username, "email": email}
        ).execute()
        client.table("ConversAI_UserConfig").insert({"user_id": username}).execute()
    except Exception:
        # NOTE(review): every insert failure is reported as a duplicate e-mail,
        # whatever the real cause - confirm this is intended.
        return {
            "code": 409,
            "message": "Email already exists",
        }

    return {
        "code": 200,
        "message": "User Setup Successful"
    }


def createTable(tablename: str):
    """Create a hybrid-retrieval Qdrant collection with no documents in it.

    Args:
        tablename: Name of the collection to create.

    Returns:
        ``{"output": "SUCCESS"}`` once the call has returned.
    """
    # Only the side effect matters here; the returned store object is not kept.
    QdrantVectorStore.from_documents(
        documents=[],
        embedding=vectorEmbeddings,
        sparse_embedding=sparseEmbeddings,
        url=os.environ["QDRANT_URL"],
        prefer_grpc=True,
        api_key=os.environ["QDRANT_API_KEY"],
        collection_name=tablename,
        force_recreate=True,
        retrieval_mode=RetrievalMode.HYBRID,
    )
    return {"output": "SUCCESS"}

def cleanText(text: str) -> str:
    """Flatten newlines to spaces and strip all ASCII punctuation except ``.``.

    Args:
        text: Raw text to normalise.

    Returns:
        The cleaned text.
    """
    strippedChars = string.punctuation.replace(".", "")
    # One translation table does both jobs: "\n" -> " " and punctuation -> removed.
    table = str.maketrans("\n", " ", strippedChars)
    return text.translate(table)

def addDocuments(texts: list[tuple[str, str]], vectorstore: str):
    """Clean, chunk and index ``(text, source)`` pairs into a Qdrant collection.

    Args:
        texts: Sequence of tuples whose first item is the raw text and whose
            second item is the source label stored in the chunk metadata.
        vectorstore: Name of the target Qdrant collection.

    Returns:
        ``{"output": "SUCCESS"}`` once indexing has returned.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=250,
        add_start_index=True
    )
    # Reuse the shared cleaner instead of repeating its newline/punctuation logic.
    rawDocuments = [
        Document(page_content=cleanText(text=entry[0]), metadata={"source": entry[1]})
        for entry in texts
    ]
    documents = splitter.split_documents(rawDocuments)
    # NOTE(review): force_recreate=True is passed on every call; presumably this
    # replaces whatever the collection already held - confirm that is intended
    # for a function named "addDocuments".
    QdrantVectorStore.from_documents(
        documents=documents,
        embedding=vectorEmbeddings,
        sparse_embedding=sparseEmbeddings,
        url=os.environ["QDRANT_URL"],
        prefer_grpc=True,
        api_key=os.environ["QDRANT_API_KEY"],
        collection_name=vectorstore,
        force_recreate=True,
        retrieval_mode=RetrievalMode.HYBRID
    )
    return {
        "output": "SUCCESS"
    }


def format_docs(docs: list) -> str:
    """Join retrieved documents into one context string and record their sources.

    Side effects: rebinds the module globals ``sources`` (unique source labels,
    in first-seen order) and ``tempContext`` (the returned context string).

    Args:
        docs: Iterable of document objects exposing ``page_content`` and a
            ``metadata`` mapping with a ``"source"`` key.

    Returns:
        Every ``page_content`` followed by three newlines, concatenated, or
        ``"No context found"`` when that concatenation is empty.

    Raises:
        KeyError: If a document's metadata has no ``"source"`` entry.
    """
    global sources
    global tempContext
    docs = list(docs)
    context = "".join(f"{doc.page_content}\n\n\n" for doc in docs)
    # dict.fromkeys de-duplicates while keeping a stable, first-seen order.
    sources = list(dict.fromkeys(doc.metadata["source"] for doc in docs))
    if context == "":
        context = "No context found"
    tempContext = context
    return context


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Args:
        session_id: Key into the module-level ``chatHistoryStore``.

    Returns:
        The history object stored under ``session_id``.
    """
    try:
        return chatHistoryStore[session_id]
    except KeyError:
        freshHistory = ChatMessageHistory()
        chatHistoryStore[session_id] = freshHistory
        return freshHistory


def trimMessages(chain_input):
    """Cut every stored chat history down to its most recent message.

    Args:
        chain_input: Ignored; present so the function can be used as a chain step.

    Returns:
        Always ``True``.
    """
    for history in chatHistoryStore.values():
        previousMessages = history.messages
        if len(previousMessages) > 1:
            history.clear()
            # Re-add only the tail of the list captured before the clear.
            for message in previousMessages[-1:]:
                history.add_message(message)
    return True


def answerQuery(query: str, vectorstore: str, llmModel: str = "llama-3.1-70b-versatile") -> dict:
    """Answer a question from a Qdrant collection and suggest follow-up questions.

    Args:
        query: The user's question.
        vectorstore: Name of an existing Qdrant collection; also used as the
            chat-history session id.
        llmModel: Groq model name used for the answering chain.

    Returns:
        A dict with ``output`` (the answer text), ``followUpQuestions`` (the
        parsed output of the follow-up chain) and ``sources`` (source labels
        recorded by ``format_docs``).
    """
    global prompt
    global client
    global sources
    global jsonParser
    global tempContext
    global followUpPrompt
    global vectorEmbeddings
    global sparseEmbeddings
    # Keep the collection name: the parameter is rebound to the store object below.
    vectorStoreName = vectorstore
    vectorstore = QdrantVectorStore.from_existing_collection(
        embedding=vectorEmbeddings,
        sparse_embedding=sparseEmbeddings,
        collection_name=vectorstore,
        url=os.environ["QDRANT_URL"],
        api_key=os.environ["QDRANT_API_KEY"],
        retrieval_mode=RetrievalMode.HYBRID
    )
    retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 4, "score_threshold": None})
    # Answering chain: retrieve on the question, format the hits (format_docs also
    # stores `sources` and `tempContext` as module globals), fill the prompt, call
    # the LLM and return plain text.
    baseChain = (
            {"context": RunnableLambda(lambda x: x["question"]) | retriever | RunnableLambda(format_docs),
             "question": RunnableLambda(lambda x: x["question"]),
             "chatHistory": RunnableLambda(lambda x: x["chatHistory"])}
            | prompt
            | ChatGroq(model_name=llmModel, temperature=0.75, max_tokens=512)
            | StrOutputParser()
    )
    # Wrap the chain with per-session history, looked up via get_session_history.
    messageChain = RunnableWithMessageHistory(
        baseChain,
        get_session_history,
        input_messages_key="question",
        history_messages_key="chatHistory"
    )
    # trimMessages runs first, so stored histories are trimmed before answering.
    chain = RunnablePassthrough.assign(messages_trimmed=trimMessages) | messageChain
    # Follow-up questions always use this fixed model at temperature 0.
    followUpChain = followUpPrompt | ChatGroq(model_name="llama-3.1-70b-versatile", temperature=0) | jsonParser
    output = chain.invoke(
            {"question": query},
            {"configurable": {"session_id": vectorStoreName}}
        )
    # NOTE(review): `tempContext` and `sources` are module globals written by
    # format_docs during the invoke above; overlapping calls could presumably
    # read each other's values - confirm how this is served.
    followUpQuestions = followUpChain.invoke({"context": tempContext})
    return {
        "output": output,
        "followUpQuestions": followUpQuestions,
        "sources": sources
    }


def deleteTable(tableName: str):
    """Delete a Qdrant collection by name.

    Args:
        tableName: Name of the collection to delete.

    Returns:
        ``{"output": "SUCCESS"}`` on success, otherwise ``{"error": <exception>}``.
    """
    try:
        qdrantClient.delete_collection(collection_name=tableName)
    except Exception as failure:
        return {"error": failure}
    return {"output": "SUCCESS"}


def listTables(username: str):
    """List the Qdrant collections that belong to ``username``.

    A collection is treated as owned when the second ``$``-separated field of
    its name equals ``username``. Names with no ``$`` are skipped rather than
    failing the whole listing.

    Args:
        username: Owner to filter by.

    Returns:
        ``{"output": [collection names]}`` on success, otherwise
        ``{"error": <exception>}``.
    """
    try:
        collections = qdrantClient.get_collections().collections
        ownedNames = []
        for collection in collections:
            fields = collection.name.split("$")
            # Guard the index: other collections may not follow the naming scheme.
            if len(fields) > 1 and fields[1] == username:
                ownedNames.append(collection.name)
        return {
            "output": ownedNames
        }
    except Exception as e:
        return {
            "error": e
        }


def getLinks(url: str, timeout=30):
    """Collect same-site links found on the pages that ``url`` links to.

    The start page is fetched first. Each link found there is then fetched in
    turn until ``timeout`` seconds have passed, and the links found on those
    pages are returned.

    Args:
        url: Page to start from.
        timeout: Overall time budget in seconds. It also caps each HTTP
            request, so one unresponsive server cannot stall the crawl.

    Returns:
        De-duplicated list of links, each with one trailing ``/`` removed.

    Raises:
        requests.RequestException: If the start page cannot be fetched.
    """
    start = time.time()

    def remainingBudget() -> float:
        # Never hand requests a zero or negative timeout; floor it at one second.
        return max(1.0, timeout - (time.time() - start))

    def getLinksFromPage(url: str) -> list:
        response = requests.get(url, timeout=remainingBudget())
        soup = BeautifulSoup(response.content, "lxml")
        pageHost = urlparse(url).netloc
        found = set()
        for anchor in soup.find_all("a"):
            if "href" not in anchor.attrs:
                continue
            href = anchor.attrs["href"]
            if urlparse(href).netloc == pageHost:
                candidate = href
            elif not href.startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                # Relative link: resolve it against the current page.
                candidate = urljoin(url + "/", href)
            else:
                continue
            # Links containing a fragment marker are dropped.
            if "#" not in candidate:
                found.add(candidate)
        return list(found)

    uniqueLinks = set()
    for link in getLinksFromPage(url):
        if time.time() - start > timeout:
            break
        try:
            uniqueLinks.update(getLinksFromPage(link))
        except requests.RequestException:
            # One unreachable or slow sub-page should not discard what was collected.
            continue
    return list({link[:-1] if link.endswith("/") else link for link in uniqueLinks})


def getTextFromImagePDF(pdfBytes):
    """OCR every page of a PDF supplied as bytes.

    Args:
        pdfBytes: Raw PDF content.

    Returns:
        Dict mapping 1-based page number to the cleaned OCR text of that page.
    """
    def getText(image):
        detections = reader.readtext(np.array(image), paragraph=True)
        # Index 1 of each detection is the recognised text.
        joined = "\n".join([detection[1] for detection in detections])
        return cleanText(text=joined)

    pageImages = convert_from_bytes(pdfBytes)
    pageTexts = {}
    for pageNumber, image in enumerate(pageImages, start=1):
        pageTexts[pageNumber] = getText(image)
    return pageTexts


def getTranscript(urls: list[str]):
    """Fetch and clean the transcript of each YouTube URL.

    Each distinct URL is processed once and the result is stored under that
    same URL, so a transcript can never be paired with a different URL.

    Args:
        urls: Iterable of YouTube video URLs; duplicates are processed once.

    Returns:
        Dict mapping each distinct URL (in first-seen order) to its cleaned
        transcript, or to ``""`` when loading that transcript failed.
    """
    transcripts = {}
    for url in urls:
        if url in transcripts:
            continue
        try:
            loader = YoutubeLoader.from_youtube_url(
                url, add_video_info=False
            )
            rawText = " ".join([x.page_content for x in loader.load()])
            transcripts[url] = cleanText(text=rawText)
        except Exception:
            # Best effort: a video without a retrievable transcript maps to "".
            transcripts[url] = ""
    return transcripts


def analyzeData(query, dataframe):
    """Answer a natural-language question about a dataframe via PandasAI.

    Args:
        query: The question or instruction for the dataframe.
        dataframe: The data to analyse, passed to ``SmartDataframe``.

    Returns:
        A ``data:image/png;base64,...`` URI when the response is a path to an
        existing file, otherwise the response unchanged.
    """
    query += ". In case, you are to plot a chart, make sure the x-axis labels are 90 degree rotated"
    # The model must be passed as `model_name` (as elsewhere in this file);
    # `name=` did not select the model.
    llm = ChatGroq(model_name="llama-3.1-8b-instant")
    df = SmartDataframe(dataframe, config={"llm": llm, "verbose": False})
    response = df.chat(query)
    # Only strings can be file paths; other response types are returned as-is.
    if isinstance(response, str) and os.path.isfile(response):
        with open(response, "rb") as file:
            b64string = base64.b64encode(file.read()).decode("utf-8", errors = "replace")
        return f"data:image/png;base64,{b64string}"
    return response


def extractTextFromPage(page):
    """Return the cleaned text of a single PDF page.

    Args:
        page: Page object exposing ``get_text()``.

    Returns:
        The page text after ``cleanText``.
    """
    rawText = page.get_text()
    return cleanText(text=rawText)


def extractTextFromPdf(pdf_path):
    """Extract cleaned text from every page of a PDF.

    Pages are read one after another: PyMuPDF documents its objects as unsafe
    to use from several threads. The document is closed even when extraction
    raises.

    Args:
        pdf_path: Path handed to ``pymupdf.open``.

    Returns:
        Dict mapping 1-based page number to the cleaned text of that page.
    """
    with pymupdf.open(pdf_path) as doc:
        texts = [extractTextFromPage(doc.load_page(i)) for i in range(len(doc))]
    return {pageNumber: text for pageNumber, text in enumerate(texts, start=1)}


def extractTextFromUrl(url, timeout=30):
    """Download a web page and return its cleaned visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot block the caller indefinitely.

    Returns:
        The page text, space-separated and passed through ``cleanText``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    return cleanText(text = soup.get_text(separator=' ', strip=True))


def extractTextFromUrlList(urls):
    """Fetch several URLs concurrently and map each one to its cleaned text.

    Args:
        urls: URLs to fetch with ``extractTextFromUrl``.

    Returns:
        Dict mapping each URL to its extracted text.
    """
    with ThreadPoolExecutor() as pool:
        pageTexts = list(pool.map(extractTextFromUrl, urls))
    return dict(zip(urls, pageTexts))


def encodeToBase64(dct: dict):
    """Base64-encode every string value of a dict, recursing into nested dicts.

    The dict is modified in place. Values that are neither strings nor dicts
    are left untouched. Subclasses of ``str`` and ``dict`` (for example
    ``OrderedDict``) are handled as well.

    Args:
        dct: Mapping whose string values should be encoded.

    Returns:
        The same ``dct`` object, after encoding.
    """
    for key, value in dct.items():
        if isinstance(value, str):
            encoded = base64.b64encode(value.encode("utf-8", errors = "replace"))
            dct[key] = encoded.decode("utf-8", errors = "replace")
        elif isinstance(value, dict):
            dct[key] = encodeToBase64(value)
    return dct


def decodeBase64(dct: dict):
    """Base64-decode the ``"output"`` entry of a dict, in place.

    Mirrors ``encodeToBase64``: a string is decoded directly, a dict has its
    string values decoded with nested dicts handled recursively, and any other
    value is left untouched.

    Args:
        dct: Mapping with an ``"output"`` key holding encoded data.

    Returns:
        The same ``dct`` object, after decoding.

    Raises:
        KeyError: If ``dct`` has no ``"output"`` key.
    """
    def _decodeValue(value):
        if isinstance(value, str):
            raw = base64.b64decode(value.encode("utf-8", errors = "replace"))
            return raw.decode("utf-8", errors = "replace")
        if isinstance(value, dict):
            for key in value:
                value[key] = _decodeValue(value[key])
        return value

    dct["output"] = _decodeValue(dct["output"])
    return dct


def createDataSourceName(sourceName):
    """Return a data-source name not yet present in ``ConversAI_ChatbotDataSources``.

    The existing names are fetched once. If ``sourceName`` is free it is
    returned unchanged; otherwise the first free ``<sourceName>-<n>`` with
    ``n = 1, 2, 3, ...`` is returned.

    Args:
        sourceName: Desired data-source name.

    Returns:
        ``sourceName`` itself or a numbered variant that is not yet in use.
    """
    rows = client.table("ConversAI_ChatbotDataSources").select("dataSourceName").execute().data
    existingNames = {row["dataSourceName"] for row in rows}
    if sourceName not in existingNames:
        return sourceName
    suffix = 1
    while f"{sourceName}-{suffix}" in existingNames:
        suffix += 1
    return f"{sourceName}-{suffix}"
    

def trackUsage(vectorstore: str, endpoint: str):
    """Record one endpoint call in the ``ConversAI_ActivityLog`` table.

    Args:
        vectorstore: Collection name; its second and third ``$``-separated
            fields are logged as username and chatbot name.
        endpoint: Name of the endpoint that was used.

    Raises:
        IndexError: If ``vectorstore`` has fewer than three ``$``-separated fields.
    """
    fields = vectorstore.split("$")
    username = fields[1]
    chatbotName = fields[2]
    logEntry = {"username": username, "chatbotName": chatbotName, "endpointUsed": endpoint}
    client.table("ConversAI_ActivityLog").insert(logEntry).execute()