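"""RAG utilities for the marketing assistant: text extraction, chunking,
Pinecone vector-store indexing, retrieval, and LLM answer generation."""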
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_mistralai import ChatMistralAI
from uuid import uuid4
from pydantic import BaseModel, Field
from langchain_core.tools import tool
import unicodedata
class AddToKnowledgeBase(BaseModel):
    """Add information to the knowledge base if the user asks for it in their query."""
    information: str = Field(..., title="The information to add to the knowledge base")
def detect_language(text: str) -> str:
    """Ask a small LLM for the language of `text`; returns the lowercased language name."""
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    template = "détecte la langue du texte suivant: {text}. assure-toi que ta réponse contient seulement le nom de la langue détectée"
    prompt = PromptTemplate.from_template(template)
    chain = prompt | llm | StrOutputParser()
    response = chain.invoke({"text": text}).strip().lower()
    print(response)
    return response
def remove_non_standard_ascii(input_string: str) -> str:
    """Normalize to NFKD and keep only ASCII letters, digits, spaces and basic punctuation."""
    normalized_string = unicodedata.normalize('NFKD', input_string)
    return ''.join(char for char in normalized_string if 'a' <= char <= 'z' or 'A' <= char <= 'Z' or char.isdigit() or char in ' .,!?')
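# e.g. remove_non_standard_ascii("résumé 2024!") -> "resume 2024!"
# (NFKD decomposes the accented letters, and the combining marks are then dropped by the filter)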
def get_text_from_content_for_doc(content):
    """Concatenate the text of every page of a parsed document."""
    text = ""
    for page in content:
        text += content[page]["texte"]
    return text

def get_text_from_content_for_audio(content):
    """Return the transcription of a parsed audio file."""
    return content["transcription"]
def get_text_chunks(text):
    """Split a text into overlapping chunks suitable for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # the character length of each chunk
        chunk_overlap=100,  # the character overlap between consecutive chunks
        length_function=len  # measure length in characters (the built-in len())
    )
    chunks = text_splitter.split_text(text)
    return chunks
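# With chunk_size=500 and chunk_overlap=100, a 1,200-character text yields chunks covering
# roughly characters [0:500], [400:900] and [800:1200]; the splitter prefers natural
# boundaries (paragraphs, sentences), so the actual cut points vary.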
def get_vectorstore(text_chunks, filename, file_type, namespace, index, enterprise_name):
    """Embed the text chunks and upsert them into the Pinecone index under `namespace`.
    Returns {"filename_id": ...} on success, False on failure."""
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=namespace)
        file_name = filename.split(".")[0].replace(" ", "_").replace("-", "_").replace(".", "_").replace("/", "_").replace("\\", "_").strip()
        clean_filename = remove_non_standard_ascii(file_name)  # computed once; also used for the return value
        documents = []
        uuids = []
        for i, chunk in enumerate(text_chunks):
            document = Document(
                page_content=chunk,
                metadata={"filename": filename, "file_type": file_type, "filename_id": clean_filename, "entreprise_name": enterprise_name},
            )
            uuids.append(f"{clean_filename}_{i}")
            documents.append(document)
        vector_store.add_documents(documents=documents, ids=uuids)
        return {"filename_id": clean_filename}
    except Exception as e:
        print(e)
        return False
def add_to_knowledge_base(enterprise_id, information, index, enterprise_name, user_id=""):
    """Add a single piece of information to the knowledge base.
    Args:
        enterprise_id (str): the enterprise id, used as the Pinecone namespace
        information (str): the information to add
        index: the Pinecone index object
        enterprise_name (str): the enterprise name stored in the metadata
        user_id (str): the id of the user who requested the addition
    Returns the generated document id, or False on failure.
    """
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=enterprise_id)
        uuid = f"kb_{user_id}_{uuid4()}"
        document = Document(
            page_content=information,
            metadata={"filename": "knowledge_base", "file_type": "text", "filename_id": uuid, "entreprise_name": enterprise_name, "user_id": user_id},
        )
        vector_store.add_documents(documents=[document], ids=[uuid])
        return uuid
    except Exception as e:
        print(e)
        return False
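# Example (illustrative, hypothetical values):
#   add_to_knowledge_base("ent_42", "Le support est joignable de 9h à 18h.", index, "Acme", user_id="u_7")
#   -> returns an id of the form "kb_u_7_<uuid4>", or False if the upsert failed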
def get_retreive_answer(enterprise_id, prompt, index, common_id, user_id=""):
    """Retrieve the documents relevant to `prompt` from the enterprise namespace,
    plus the shared `common_id` namespace when one is provided."""
    try:
        print("common_id ", common_id)
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=enterprise_id)
        retriever = vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"k": 3, "score_threshold": 0.6},
        )
        enterprise_context = retriever.invoke(prompt)
        # Intended to restrict results to this user's own knowledge-base entries via a metadata filter
        user_memory = retriever.invoke(prompt, filters={"user_id": user_id})
        if enterprise_context:
            print("found enterprise context")
            for chunk in enterprise_context:
                print(chunk.metadata)
        else:
            print("no enterprise context")
        if common_id:
            vector_store_commun = PineconeVectorStore(index=index, embedding=embedding, namespace=common_id)
            retriever_commun = vector_store_commun.as_retriever(
                search_type="similarity_score_threshold",
                search_kwargs={"k": 5, "score_threshold": 0.1},
            )
            commun_context = retriever_commun.invoke(prompt)
            for chunk in commun_context:
                print(chunk.metadata)
            if commun_context:
                print("found commun context")
            else:
                print("no commun context")
            response = user_memory + enterprise_context + commun_context
        else:
            response = enterprise_context  # same retrieval as above, no need to invoke again
        return response
    except Exception as e:
        print(e)
        return False
def handle_calling_add_to_knowledge_base(query, enterprise_id="", index="", enterprise_name="", user_id="", llm=None):
    """Check whether the user's query asks to add information to the knowledge base;
    if it does, extract the information and store it via add_to_knowledge_base."""
    template = """
    You are an AI assistant that processes user queries.
    Determine if the user wants to add something to the knowledge base.
    - If the user wants to add something, extract the valuable information, reformulate it and output 'add' followed by the information.
    - If the user does not want to add something, output 'no action'.
    Ensure your response is only 'add <content>' or 'no action'.
    User Query: "{query}"
    Response:
    """.strip()
    prompt = PromptTemplate.from_template(template)
    if not llm:
        llm = ChatOpenAI(model="gpt-4o", temperature=0)
    # A tool binding is prepared here, but the plain-text 'add <content>' protocol below is what is actually used
    llm_with_tool = llm.bind_tools([AddToKnowledgeBase])
    chain = prompt | llm | StrOutputParser()
    response = chain.invoke({"query": query}).strip().lower()
    if response.startswith("add"):
        item = response[len("add"):].strip()
        if item:
            item_id = add_to_knowledge_base(enterprise_id, item, index, enterprise_name, user_id)
            print("added to knowledge base")
            print(item)
            return {"item_id": item_id, "item": item}
    print(response)
    return False
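# Example (illustrative): a query like "ajoute que notre numéro de support est le 01 23 45 67 89"
# should make the LLM answer "add <reformulated information>", which is then stored;
# any other query yields "no action" and the function returns False.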
def generate_response_via_langchain(query: str, stream: bool = False, model: str = "gpt-4o", context: str = "", messages=[], style: str = "formel", tonality: str = "neutre", template: str = "", enterprise_name: str = "", enterprise_id: str = "", index: str = ""):
    """Build the prompt, select the LLM backend from the model name, and return either
    the complete answer (stream=False) or an async stream of tokens (stream=True)."""
    # Define the prompt template
    if template == "":
        template = "En tant qu'IA experte en marketing, réponds avec un style {style} et une tonalité {tonality} dans ta communication pour l'entreprise {enterprise}, sachant le contexte suivant: {context}, et l'historique de la conversation, {messages}, {query}"
    # Initialize the LLM backend matching the requested model family
    if model.startswith("gpt"):
        llm = ChatOpenAI(model=model, temperature=0)
    elif model.startswith("mistral"):
        llm = ChatMistralAI(model=model, temperature=0)
    else:
        raise ValueError(f"unsupported model: {model}")
    language = detect_language(query)
    template += f" Réponds en {language}"
    # Create an LLM chain with the prompt and the LLM
    prompt = PromptTemplate.from_template(template)
    print(f"model: {model}")
    print(f"marque: {enterprise_name}")
    llm_chain = prompt | llm | StrOutputParser()
    print(f"language: {language}")
    if stream:
        # Return a generator that yields streamed responses
        return llm_chain.astream({"query": query, "context": context, "messages": messages, "style": style, "tonality": tonality, "enterprise": enterprise_name})
    # Invoke the LLM chain and return the result
    return llm_chain.invoke({"query": query, "context": context, "messages": messages, "style": style, "tonality": tonality, "enterprise": enterprise_name})
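# Usage note (illustrative): with stream=True the caller receives an async generator,
# to be consumed with `async for token in generate_response_via_langchain(q, stream=True): ...`;
# with stream=False the full answer string is returned directly.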
def setup_rag(file_type, content, filename, namespace, index, enterprise_name):
    """Extract the text from the parsed content, chunk it, and index it in Pinecone.
    The extra parameters are passed through to get_vectorstore, which requires them."""
    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    elif file_type == "audio":
        text = get_text_from_content_for_audio(content)
    else:
        raise ValueError(f"unsupported file type: {file_type}")
    chunks = get_text_chunks(text)
    vectorstore = get_vectorstore(chunks, filename, file_type, namespace, index, enterprise_name)
    return vectorstore
def prompt_reformatting(prompt: str, context, query: str, style="formel", tonality="neutre", enterprise_name=""):
    """Fill in the prompt template, replacing the raw context with the list of source document names."""
    if context == "":
        print("no context found for prompt reformatting")
        return prompt.format(context="Pas de contexte pertinent", messages="", query=query, style=style, tonality=tonality, enterprise=enterprise_name)
    print("context found for prompt reformatting")
    docs_names = []
    for chunk in context:
        print(chunk.metadata)
        chunk_name = chunk.metadata["filename"]
        if chunk_name not in docs_names:
            docs_names.append(chunk_name)
    context = ", ".join(docs_names)
    prompt = prompt.format(context=context, messages="", query=query, style=style, tonality=tonality, enterprise=enterprise_name)
    return prompt
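# Illustrative end-to-end flow (hypothetical objects and values, not part of this module):
#   from pinecone import Pinecone                       # assumed Pinecone v3+ client
#   index = Pinecone(api_key="...").Index("my-index")   # hypothetical index name
#   setup_rag("pdf", parsed_pdf, "brochure.pdf", "ent_42", index, "Acme")
#   docs = get_retreive_answer("ent_42", "Quels sont nos tarifs ?", index, common_id="shared")
#   answer = generate_response_via_langchain("Quels sont nos tarifs ?", context=str(docs), enterprise_name="Acme")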