from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
import unicodedata
def remove_non_standard_ascii(input_string: str) -> str:
    """Strip accents and keep only ASCII letters, digits, spaces and basic punctuation."""
    normalized_string = unicodedata.normalize('NFKD', input_string)
    return ''.join(
        char for char in normalized_string
        if 'a' <= char <= 'z' or 'A' <= char <= 'Z' or char.isdigit() or char in ' .,!?'
    )
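# Example (illustrative): remove_non_standard_ascii("Résumé – été 2024!") returns
# "Resume  ete 2024!" — NFKD decomposes accented letters into base letter plus
# combining mark, the filter drops the marks, and the dash is removed entirely.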
def get_text_from_content_for_doc(content):
    """Concatenate the text of every page of a parsed document (page dicts keyed by "texte")."""
    text = ""
    for page in content:
        text += content[page]["texte"]
    return text
def get_text_from_content_for_audio(content):
    """Return the transcription text of a parsed audio file."""
    return content["transcription"]
def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,       # character length of each chunk
        chunk_overlap=100,    # character overlap between consecutive chunks
        length_function=len,  # measure length in characters (the built-in len())
    )
    chunks = text_splitter.split_text(text)
    return chunks
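# Illustrative note: with chunk_size=500 and chunk_overlap=100, a 900-character
# text splits into roughly two chunks, the second starting with the last ~100
# characters of the first, so text that straddles a boundary appears in both.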
def get_vectorstore(text_chunks, filename, file_type, namespace, index, enterprise_name):
    """Embed the chunks and upsert them into a Pinecone namespace.

    Returns {"filename_id": ...} on success, False on failure.
    """
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=namespace)
        # Build a deterministic, ASCII-only identifier from the file name.
        file_name = filename.split(".")[0].replace(" ", "_").replace("-", "_").replace("/", "_").replace("\\", "_").strip()
        clean_filename = remove_non_standard_ascii(file_name)
        documents = []
        uuids = []
        for i, chunk in enumerate(text_chunks):
            document = Document(
                page_content=chunk,
                metadata={
                    "filename": filename,
                    "file_type": file_type,
                    "filename_id": clean_filename,
                    "entreprise_name": enterprise_name,
                },
            )
            # IDs are "<filename_id>_<chunk index>", so re-indexing a file overwrites its old vectors.
            uuids.append(f"{clean_filename}_{i}")
            documents.append(document)
        vector_store.add_documents(documents=documents, ids=uuids)
        return {"filename_id": clean_filename}
    except Exception as e:
        print(e)
        return False
def get_retreive_answer(enterprise_id, prompt, index, common_id):
    """Retrieve the chunks most similar to `prompt` from the enterprise namespace
    and, if `common_id` is given, from the shared namespace as well."""
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(index=index, embedding=embedding, namespace=enterprise_id)
        # Strict threshold for the enterprise's own documents.
        retriever = vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"k": 3, "score_threshold": 0.6},
        )
        if common_id:
            # Much more permissive threshold for the shared namespace.
            vector_store_commun = PineconeVectorStore(index=index, embedding=embedding, namespace=common_id)
            retriever_commun = vector_store_commun.as_retriever(
                search_type="similarity_score_threshold",
                search_kwargs={"k": 3, "score_threshold": 0.1},
            )
            response = retriever.invoke(prompt) + retriever_commun.invoke(prompt)
        else:
            response = retriever.invoke(prompt)
        return response
    except Exception as e:
        print(e)
        return False
def generate_response_via_langchain(
    query: str,
    stream: bool = False,
    model: str = "gpt-4o",
    context: str = "",
    messages=None,
    style: str = "formel",
    tonality: str = "neutre",
    template: str = "",
):
    # Avoid a mutable default argument for the conversation history.
    if messages is None:
        messages = []
    # Default prompt template (French): answer as a marketing expert with the
    # requested style and tonality, given the context and conversation history.
    if template == "":
        template = "En tant qu'IA experte en marketing, réponds avec un style {style} et une tonalité {tonality} dans ta communication, sachant le contexte suivant: {context}, et l'historique de la conversation, {messages}, {query}"
    prompt = PromptTemplate.from_template(template)
    # Initialize the OpenAI LLM with the specified model.
    llm = ChatOpenAI(model=model, temperature=0)
    # Chain: prompt -> LLM -> plain string output.
    llm_chain = prompt | llm | StrOutputParser()
    if stream:
        # Return an async generator that yields the response token by token.
        return llm_chain.astream({"query": query, "context": context, "messages": messages, "style": style, "tonality": tonality})
    # Invoke the chain synchronously and return the full answer.
    return llm_chain.invoke({"query": query, "context": context, "messages": messages, "style": style, "tonality": tonality})
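# Streaming usage sketch (illustrative; the query below is hypothetical): with
# stream=True the function returns an async generator, so it has to be consumed
# with `async for`, e.g.:
#
#   import asyncio
#
#   async def _demo():
#       async for token in generate_response_via_langchain("Présente nos services", stream=True):
#           print(token, end="", flush=True)
#
#   asyncio.run(_demo())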
def setup_rag(file_type, content, filename, index, namespace, enterprise_name):
    """Extract text from parsed content, chunk it, and index it in Pinecone."""
    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    elif file_type == "audio":
        text = get_text_from_content_for_audio(content)
    else:
        return False  # unsupported file type
    chunks = get_text_chunks(text)
    vectorstore = get_vectorstore(chunks, filename, file_type, namespace, index, enterprise_name)
    return vectorstore
def prompt_reformatting(prompt: str, context, query: str, style="formel", tonality="neutre"):
    """Fill the prompt template, replacing the raw context with the list of source file names."""
    if context == "":
        return prompt.format(context="Pas de contexte pertinent", messages="", query=query, style=style, tonality=tonality)
    # Deduplicate the source file names of the retrieved chunks.
    docs_names = []
    for chunk in context:
        chunk_name = chunk.metadata["filename"]
        if chunk_name not in docs_names:
            docs_names.append(chunk_name)
    context = ", ".join(docs_names)
    return prompt.format(context=context, messages="", query=query, style=style, tonality=tonality)
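# Minimal end-to-end sketch, assuming a Pinecone index created elsewhere and
# OPENAI_API_KEY / PINECONE_API_KEY set in the environment. All literal values
# below (index name, namespace, file name, content) are hypothetical placeholders.
if __name__ == "__main__":
    from pinecone import Pinecone

    pc = Pinecone()  # reads PINECONE_API_KEY from the environment
    index = pc.Index("my-index")  # hypothetical index name

    # Index a parsed "document": page keys mapping to {"texte": ...} dicts.
    content = {"page_1": {"texte": "Notre offre marketing couvre le SEO et les réseaux sociaux."}}
    setup_rag("pdf", content, "offre.pdf", index, "acme", "ACME")

    # Retrieve relevant chunks, then generate an answer grounded in them.
    docs = get_retreive_answer("acme", "Que couvre notre offre ?", index, common_id=None)
    context = " ".join(doc.page_content for doc in docs) if docs else ""
    print(generate_response_via_langchain("Que couvre notre offre ?", context=context))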