from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_mistralai import ChatMistralAI
from uuid import uuid4

from pydantic import BaseModel, Field

import unicodedata

class AddToKnowledgeBase(BaseModel):
    '''Add information to the knowledge base when the user asks for it in their query.'''
    information: str = Field(..., title="The information to add to the knowledge base")

def detect_language(text: str) -> str:
    '''Detect the language of the given text; returns the language name in lowercase.'''
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    template = "Détecte la langue du texte suivant : {text}. Assure-toi que ta réponse contient seulement le nom de la langue détectée."
    prompt = PromptTemplate.from_template(template)
    chain = prompt | llm | StrOutputParser()
    response = chain.invoke({"text": text}).strip().lower()
    print(response)
    return response

def remove_non_standard_ascii(input_string: str) -> str:
    '''Strip accents and drop any character that is not an ASCII letter, digit,
    underscore, or basic punctuation. Underscores are kept so the ids built in
    get_vectorstore (spaces replaced by "_") survive the cleaning.'''
    normalized_string = unicodedata.normalize('NFKD', input_string)
    return ''.join(char for char in normalized_string
                   if 'a' <= char <= 'z' or 'A' <= char <= 'Z' or char.isdigit() or char in ' .,!?_')

def get_text_from_content_for_doc(content):
    '''Concatenate the "texte" field of every page in a document content mapping.'''
    text = ""
    for page in content:
        text += content[page]["texte"]
    return text

def get_text_from_content_for_audio(content):
    '''Return the transcription text of an audio content payload.'''
    return content["transcription"]


def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # the character length of each chunk
        chunk_overlap=100,  # the character overlap between consecutive chunks
        length_function=len  # measure length in characters (the built-in len())
    )
    chunks = text_splitter.split_text(text)
    return chunks
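# A minimal usage sketch for the splitter above (the sample text is
# illustrative only). With chunk_size=500 and chunk_overlap=100, consecutive
# chunks share up to 100 characters, which helps queries that land near a
# chunk boundary:
#
#   chunks = get_text_chunks("word " * 500)
#   print(len(chunks))      # several chunks, each at most 500 characters
#   print(chunks[0][-30:])  # the tail of chunk 0 should reappear at the start of chunk 1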

def get_vectorstore(text_chunks, filename, file_type, namespace, index, enterprise_name):
    '''Embed the text chunks and upsert them into the given Pinecone index/namespace.
    Returns {"filename_id": ...} on success, False on failure.'''
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=namespace)

        # Build a stable, ASCII-only id prefix from the file name (computed once,
        # outside the loop, so it is also defined when text_chunks is empty)
        file_name = filename.split(".")[0].replace(" ", "_").replace("-", "_").replace("/", "_").replace("\\", "_").strip()
        clean_filename = remove_non_standard_ascii(file_name)

        documents = []
        uuids = []

        for i, chunk in enumerate(text_chunks):
            document = Document(
                page_content=chunk,
                metadata={"filename": filename, "file_type": file_type, "filename_id": clean_filename, "entreprise_name": enterprise_name},
            )

            uuid = f"{clean_filename}_{i}"
            uuids.append(uuid)
            documents.append(document)

        vector_store.add_documents(documents=documents, ids=uuids)

        return {"filename_id": clean_filename}

    except Exception as e:
        print(e)
        return False
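# Usage sketch for get_vectorstore (hedged: "my-index", the namespace, and the
# enterprise name are placeholders, and live OPENAI_API_KEY / PINECONE_API_KEY
# credentials are assumed):
#
#   from pinecone import Pinecone
#   pc = Pinecone()                  # reads PINECONE_API_KEY from the environment
#   index = pc.Index("my-index")     # hypothetical index name
#   result = get_vectorstore(
#       text_chunks=get_text_chunks("some extracted text"),
#       filename="report.pdf", file_type="pdf",
#       namespace="enterprise-123", index=index, enterprise_name="Acme",
#   )
#   print(result)                    # {"filename_id": "report"} or False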
    

def add_to_knowledge_base(enterprise_id, information, index, enterprise_name, user_id=""):
    ''' Add a piece of information to the knowledge base
    Args:
        enterprise_id (str): the enterprise id, used as the Pinecone namespace
        information (str): the information to add
        index: the Pinecone index object
        enterprise_name (str): the enterprise name stored in the metadata
        user_id (str): the id of the user adding the information
    Returns the generated document id on success, False on failure.
    '''
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=enterprise_id)

        uuid = f"kb_{user_id}_{uuid4()}"

        document = Document(
            page_content=information,
            metadata={"filename": "knowledge_base", "file_type": "text", "filename_id": uuid, "entreprise_name": enterprise_name, "user_id": user_id},
        )

        vector_store.add_documents(documents=[document], ids=[uuid])
        return uuid

    except Exception as e:
        print(e)
        return False
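# Sketch: an item stored this way can later be retrieved per user, because
# user_id is written into the document metadata above (ids and names are
# placeholders):
#
#   item_id = add_to_knowledge_base("enterprise-123", "Notre couleur de marque est le bleu",
#                                   index, "Acme", user_id="u42")
#   # get_retreive_answer can then match it with a {"user_id": "u42"} filter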
   
def get_retreive_answer(enterprise_id, prompt, index, common_id, user_id=""):
    '''Retrieve context documents for a prompt from the enterprise namespace,
    the user's own stored items, and (optionally) a common namespace.
    Returns a list of documents, or False on failure.'''
    try:
        print("common_id ", common_id)

        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=enterprise_id)

        retriever = vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"k": 3, "score_threshold": 0.6},
        )

        enterprise_context = retriever.invoke(prompt)

        # Pinecone metadata filtering has to go through search_kwargs["filter"];
        # a filters= keyword passed to invoke() is not applied by the retriever.
        if user_id:
            user_retriever = vector_store.as_retriever(
                search_type="similarity_score_threshold",
                search_kwargs={"k": 3, "score_threshold": 0.6, "filter": {"user_id": user_id}},
            )
            user_memory = user_retriever.invoke(prompt)
        else:
            user_memory = []

        if enterprise_context:
            print("found enterprise context")
            for chunk in enterprise_context:
                print(chunk.metadata)
        else:
            print("no enterprise context")

        if common_id:
            vector_store_commun = PineconeVectorStore(index=index, embedding=embedding, namespace=common_id)

            retriever_commun = vector_store_commun.as_retriever(
                search_type="similarity_score_threshold",
                search_kwargs={"k": 5, "score_threshold": 0.1},
            )

            commun_context = retriever_commun.invoke(prompt)

            if commun_context:
                print("found commun context")
                for chunk in commun_context:
                    print(chunk.metadata)
            else:
                print("no commun context")

            response = user_memory + enterprise_context + commun_context

        else:
            # Reuse the results already fetched instead of querying again
            response = user_memory + enterprise_context

        return response

    except Exception as e:
        print(e)
        return False
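# Usage sketch (placeholder ids; assumes the same Pinecone index object as above):
#
#   docs = get_retreive_answer("enterprise-123", "Quels sont nos tarifs ?",
#                              index, common_id="shared-docs", user_id="u42")
#   if docs:
#       for d in docs:
#           print(d.metadata["filename"])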
    
def handle_calling_add_to_knowledge_base(query, enterprise_id="", index="", enterprise_name="", user_id="", llm=None):
    ''' Decide whether the user's query asks to add information to the
    knowledge base; if so, extract the information and store it.
    Returns {"item_id": ..., "item": ...} on success, False otherwise.
    '''
    template = """
        You are an AI assistant that processes user queries.

        Determine if the user wants to add something to the knowledge base.

        - If the user wants to add something, extract the valuable information, reformulate it, and output 'add' followed by the information.
        - If the user does not want to add something, output 'no action'.

        Ensure your response is only 'add <content>' or 'no action'.

        User Query: "{query}"

        Response:
        """.strip()

    prompt = PromptTemplate.from_template(template)

    if not llm:
        llm = ChatOpenAI(model="gpt-4o", temperature=0)

    # Note: the tool binding below is not used by the chain; the add / no-action
    # decision is made through the prompt above instead.
    llm_with_tool = llm.bind_tools([AddToKnowledgeBase])

    chain = prompt | llm | StrOutputParser()
    response = chain.invoke({"query": query}).strip().lower()

    if response.startswith("add"):
        item = response[len("add"):].strip()
        if item:
            item_id = add_to_knowledge_base(enterprise_id, item, index, enterprise_name, user_id)
            print("added to knowledge base")

            print(item)
            return {"item_id": item_id, "item": item}

    print(response)
    return False
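# Example of a query that should take the 'add' branch above (placeholder
# values; the model's exact reformulation may vary):
#
#   handle_calling_add_to_knowledge_base(
#       "Ajoute à la base de connaissance que notre nouveau slogan est 'Osez plus'",
#       enterprise_id="enterprise-123", index=index,
#       enterprise_name="Acme", user_id="u42",
#   )
#   # -> {"item_id": "kb_u42_<uuid>", "item": "notre nouveau slogan est 'osez plus'"}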


def generate_response_via_langchain(query: str, stream: bool = False, model: str = "gpt-4o", context: str = "", messages=None, style: str = "formel", tonality: str = "neutre", template: str = "", enterprise_name: str = "", enterprise_id: str = "", index: str = ""):
    # Avoid a mutable default argument for the conversation history
    if messages is None:
        messages = []

    # Define the prompt template
    if template == "":
        template = "En tant qu'IA experte en marketing, réponds avec un style {style} et une tonalité {tonality} dans ta communication pour l'entreprise {enterprise}, sachant le contexte suivant : {context}, et l'historique de la conversation, {messages}, {query}"

    # Initialize the LLM matching the requested model family
    if model.startswith("gpt"):
        llm = ChatOpenAI(model=model, temperature=0)
    elif model.startswith("mistral"):
        llm = ChatMistralAI(model=model, temperature=0)
    else:
        raise ValueError(f"Unsupported model: {model}")

    language = detect_language(query)
    template += f" Réponds en {language}."

    # Create an LLM chain with the prompt and the LLM
    prompt = PromptTemplate.from_template(template)

    print(f"model: {model}")
    print(f"marque: {enterprise_name}")
    print(f"language: {language}")

    llm_chain = prompt | llm | StrOutputParser()

    if stream:
        # Return an async generator that yields streamed response chunks
        return llm_chain.astream({"query": query, "context": context, "messages": messages, "style": style, "tonality": tonality, "enterprise": enterprise_name})

    # Invoke the LLM chain and return the result
    return llm_chain.invoke({"query": query, "context": context, "messages": messages, "style": style, "tonality": tonality, "enterprise": enterprise_name})
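# Consumption sketch: with stream=True the function returns an async generator
# (from llm_chain.astream), so it must be consumed with `async for`. The names
# and arguments below are placeholders:
#
#   import asyncio
#
#   async def demo():
#       gen = generate_response_via_langchain("Présente notre offre",
#                                             stream=True, enterprise_name="Acme")
#       async for token in gen:
#           print(token, end="", flush=True)
#
#   asyncio.run(demo())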



def setup_rag(file_type, content, filename, namespace, index, enterprise_name):
    '''Extract text from the content, split it into chunks, and index it in Pinecone.'''
    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    elif file_type == "audio":
        text = get_text_from_content_for_audio(content)
    else:
        raise ValueError(f"Unsupported file type: {file_type}")

    chunks = get_text_chunks(text)

    # get_vectorstore needs the full indexing context, not just the chunks
    vectorstore = get_vectorstore(chunks, filename, file_type, namespace, index, enterprise_name)

    return vectorstore
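# Usage sketch for setup_rag, with the pdf content shape expected by
# get_text_from_content_for_doc (a mapping of pages to {"texte": ...});
# the index and the names are placeholders:
#
#   content = {"1": {"texte": "Première page..."}, "2": {"texte": "Deuxième page..."}}
#   setup_rag("pdf", content, filename="guide.pdf", namespace="enterprise-123",
#             index=index, enterprise_name="Acme")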


def prompt_reformatting(prompt: str, context, query: str, style="formel", tonality="neutre", enterprise_name=""):
    '''Fill in the prompt template, replacing the raw context with the list of
    source document names (or a placeholder when no context was found).'''
    if context == "":
        print("no context found for prompt reformatting")
        return prompt.format(context="Pas de contexte pertinent", messages="", query=query, style=style, tonality=tonality, enterprise=enterprise_name)

    docs_names = []
    print("context found for prompt reformatting")
    for chunk in context:
        print(chunk.metadata)
        chunk_name = chunk.metadata["filename"]
        if chunk_name not in docs_names:
            docs_names.append(chunk_name)
    context = ", ".join(docs_names)

    prompt = prompt.format(context=context, messages="", query=query, style=style, tonality=tonality, enterprise=enterprise_name)
    return prompt
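

# End-to-end sketch, hedged: the index name, namespace, and enterprise name are
# placeholders, and OPENAI_API_KEY / PINECONE_API_KEY must be set for this to run.
if __name__ == "__main__":
    from pinecone import Pinecone

    pc = Pinecone()                  # reads PINECONE_API_KEY from the environment
    index = pc.Index("my-index")     # hypothetical index name

    query = "Quels documents décrivent notre offre ?"
    context = get_retreive_answer("enterprise-123", query, index, common_id="")

    template = ("En tant qu'IA experte en marketing, réponds avec un style {style} "
                "et une tonalité {tonality} pour l'entreprise {enterprise}, sachant "
                "le contexte suivant : {context}, l'historique {messages}, {query}")
    print(prompt_reformatting(template, context or "", query, enterprise_name="Acme"))

    answer = generate_response_via_langchain(query, context=str(context), enterprise_name="Acme")
    print(answer)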