File size: 5,481 Bytes
fe370a3
 
 
 
 
 
 
 
 
7495086
fe370a3
eeaf024
 
 
 
 
 
 
fe370a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55b4c5a
fe370a3
 
 
 
 
 
 
 
 
 
7495086
 
fe370a3
 
55b4c5a
fe370a3
7495086
eeaf024
fe370a3
 
 
 
 
7495086
fe370a3
 
eeaf024
fe370a3
 
6db239f
fe370a3
 
 
 
6db239f
fe370a3
 
 
 
6db239f
 
 
 
 
 
7686d42
6db239f
 
 
 
 
 
fe370a3
 
 
 
eeaf024
fe370a3
 
55b4c5a
f22ae3a
fe370a3
55b4c5a
 
0f1d158
 
fe370a3
 
 
fb4fd4c
fe370a3
 
 
 
 
 
9a4c626
fe370a3
 
9a4c626
fe370a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9c41670
 
 
 
 
 
798e445
9c41670
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from uuid import uuid4

import unicodedata

def remove_non_standard_ascii(input_string: str) -> str:
    """Strip a string down to ASCII letters, digits and the chars ' .,!?'.

    The input is NFKD-normalized first, so accented characters decompose
    into a base letter plus combining marks; the combining marks are then
    dropped by the whitelist below.
    """
    punctuation_whitelist = ' .,!?'
    decomposed = unicodedata.normalize('NFKD', input_string)
    kept = []
    for ch in decomposed:
        # Note: str.isdigit (as in the original) also accepts non-ASCII digits.
        if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or ch.isdigit() or ch in punctuation_whitelist:
            kept.append(ch)
    return ''.join(kept)




def get_text_from_content_for_doc(content):
    """Concatenate the "texte" field of every page of a document payload.

    `content` maps page identifiers to dicts each carrying a "texte"
    entry; pages are joined in the mapping's iteration order.
    """
    # str.join is linear; the original repeated `+=` on a str, which is
    # quadratic in the worst case for many pages.
    return "".join(content[page]["texte"] for page in content)

def get_text_from_content_for_audio(content):
    """Return the transcription text of an audio payload."""
    transcription = content["transcription"]
    return transcription


def get_text_chunks(text):
    """Split raw text into overlapping chunks suitable for embedding.

    Chunks are at most 500 characters long with a 100-character overlap,
    both measured with plain len().
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
    )
    return splitter.split_text(text)

def get_vectorstore(text_chunks, filename, file_type, namespace, index, enterprise_name):
    """Embed text chunks and upsert them into a Pinecone namespace.

    Each chunk becomes a Document whose metadata records the original
    filename, its type, a sanitized filename id and the enterprise name;
    ids are "<clean_filename>_<i>" so re-ingesting a file overwrites its
    previous vectors.

    Returns {"filename_id": <sanitized name>} on success, False on any
    error (the exception is printed, matching the module's style).
    """
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=namespace)

        # Drop the extension, then normalise separators to underscores.
        # (The .replace(".", "_") is kept for parity even though
        # split(".")[0] already removed everything after the first dot.)
        file_name = filename.split(".")[0].replace(" ", "_").replace("-", "_").replace(".", "_").replace("/", "_").replace("\\", "_").strip()

        # Hoisted out of the loop: the original recomputed this per chunk
        # and left it undefined (NameError -> returns False) whenever
        # text_chunks was empty.
        clean_filename = remove_non_standard_ascii(file_name)

        documents = []
        uuids = []
        for i, chunk in enumerate(text_chunks):
            documents.append(Document(
                page_content=chunk,
                metadata={"filename": filename, "file_type": file_type, "filename_id": clean_filename, "entreprise_name": enterprise_name},
            ))
            uuids.append(f"{clean_filename}_{i}")

        vector_store.add_documents(documents=documents, ids=uuids)

        return {"filename_id": clean_filename}

    except Exception as e:
        print(e)
        return False
    
def get_retreive_answer(enterprise_id, prompt, index, common_id):
    """Retrieve documents relevant to `prompt` from Pinecone.

    Always queries the enterprise namespace (top 3, score >= 0.6); when a
    truthy `common_id` is supplied, results from that shared namespace
    (top 3, score >= 0.1) are appended. Returns the document list, or
    False on any error (the exception is printed).
    """
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")

        def build_retriever(namespace, threshold):
            # One thresholded similarity retriever per namespace.
            store = PineconeVectorStore(index=index, embedding=embedding, namespace=namespace)
            return store.as_retriever(
                search_type="similarity_score_threshold",
                search_kwargs={"k": 3, "score_threshold": threshold},
            )

        docs = build_retriever(enterprise_id, 0.6).invoke(prompt)
        if common_id:
            docs = docs + build_retriever(common_id, 0.1).invoke(prompt)

        return docs

    except Exception as e:
        print(e)
        return False
    
    
def generate_response_via_langchain(query: str, stream: bool = False, model: str = "gpt-4o", context: str = "", messages=None, style: str = "formel", tonality: str = "neutre", template: str = ""):
    """Answer `query` with an OpenAI chat model through a LangChain chain.

    Parameters
    ----------
    query: the user question.
    stream: when True, return an async generator of response chunks
        (``astream``); otherwise return the full answer string.
    model: OpenAI chat model name.
    context: retrieved context injected into the prompt.
    messages: conversation history injected into the prompt (defaults to
        an empty list; the original used a mutable default argument).
    style, tonality: French style/tonality directives for the prompt.
    template: custom prompt template; when empty a default French
        marketing-expert template is used.
    """
    # Avoid the shared mutable default argument ([]) of the original.
    if messages is None:
        messages = []

    if template == "":
        # Default prompt (typo "communcation" -> "communication" fixed).
        template = "En tant qu'IA experte en marketing, réponds avec un style {style} et une tonalité {tonality} dans ta communication, sachant le context suivant: {context}, et l'historique de la conversation, {messages}, {query}"

    prompt = PromptTemplate.from_template(template)

    # temperature=0 for maximally deterministic completions.
    llm = ChatOpenAI(model=model, temperature=0)

    chain = prompt | llm | StrOutputParser()

    payload = {"query": query, "context": context, "messages": messages, "style": style, "tonality": tonality}
    if stream:
        # Caller iterates the chunks asynchronously.
        return chain.astream(payload)
    return chain.invoke(payload)



def setup_rag(file_type, content, filename="", namespace=None, index=None, enterprise_name=""):
    """Extract text from `content`, chunk it, and index it in Pinecone.

    `file_type` selects the extractor: "pdf" uses the per-page document
    payload, "audio" uses the transcription payload; anything else now
    raises ValueError (the original fell through with `text` unbound,
    raising an opaque NameError).

    The extra defaulted parameters are forwarded to get_vectorstore: the
    original called get_vectorstore(chunks) with five required arguments
    missing, which always raised TypeError.
    """
    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    elif file_type == "audio":
        text = get_text_from_content_for_audio(content)
    else:
        raise ValueError(f"Unsupported file_type: {file_type!r}")

    chunks = get_text_chunks(text)

    # get_vectorstore returns {"filename_id": ...} on success, False on error.
    return get_vectorstore(chunks, filename, file_type, namespace, index, enterprise_name)


def prompt_reformatting(prompt: str, context, query: str, style="formel", tonality="neutre"):
    """Fill the prompt template with a human-readable context summary.

    When `context` is the empty string, a French "no relevant context"
    placeholder is substituted; otherwise the distinct source filenames
    found in the retrieved chunks' metadata are joined (in first-seen
    order) into a comma-separated list.
    """
    if context == "":
        return prompt.format(context="Pas de contexte pertinent", messages="", query=query, style=style, tonality=tonality)

    seen_names = []
    for doc in context:
        print(doc.metadata)  # debug trace of each retrieved chunk's metadata
        name = doc.metadata["filename"]
        if name not in seen_names:
            seen_names.append(name)

    return prompt.format(context=", ".join(seen_names), messages="", query=query, style=style, tonality=tonality)