File size: 4,251 Bytes
fe370a3 eeaf024 fe370a3 eeaf024 fe370a3 eeaf024 fe370a3 eeaf024 fe370a3 9a4c626 fe370a3 9a4c626 fe370a3 9a4c626 fe370a3 9a4c626 fe370a3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
import logging
import unicodedata

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
def remove_non_standard_ascii(input_string: str) -> str:
    """Reduce a string to ASCII letters, ASCII digits, spaces and ``.,!?``.

    The input is NFKD-normalized first, so accented letters decompose into a
    base letter followed by combining marks; the marks are then dropped along
    with every other character outside the allowed set (including ``_`` and
    ``-``).

    Args:
        input_string: Text to sanitize.

    Returns:
        The sanitized string, possibly empty.
    """
    normalized_string = unicodedata.normalize('NFKD', input_string)
    # Use explicit ASCII ranges: ``str.isdigit`` is also true for non-ASCII
    # digits (e.g. Arabic-Indic ones), which would otherwise leak through.
    return ''.join(
        char
        for char in normalized_string
        if 'a' <= char <= 'z'
        or 'A' <= char <= 'Z'
        or '0' <= char <= '9'
        or char in ' .,!?'
    )
def get_text_from_content_for_doc(content):
    """Concatenate the ``"texte"`` field of every entry in ``content``.

    ``content`` is iterated directly and each yielded key is used to index
    back into it, so entries are joined in iteration order with no separator.

    Args:
        content: Mapping whose values each expose a ``"texte"`` string.

    Returns:
        The concatenated text, or ``""`` when ``content`` is empty.
    """
    return "".join(content[page_key]["texte"] for page_key in content)
def get_text_from_content_for_audio(content):
    """Return the value stored under the ``"transcription"`` key.

    Args:
        content: Mapping that holds a ``"transcription"`` entry.

    Returns:
        Whatever is stored under ``"transcription"``.

    Raises:
        KeyError: If ``content`` has no ``"transcription"`` key.
    """
    transcription = content["transcription"]
    return transcription
def get_text_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into overlapping chunks measured in characters.

    Args:
        text: The text to split. Empty or ``None`` input yields no chunks.
        chunk_size: Maximum character length of a chunk.
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        A list of chunk strings (empty when there is nothing to split).
    """
    # Nothing to split: skip building the splitter altogether.
    if not text:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # measure chunk length in characters
    )
    return text_splitter.split_text(text)
def get_vectorstore(text_chunks, filename, file_type, namespace, index):
    """Embed text chunks and upsert them into a Pinecone index.

    Each chunk becomes one ``Document`` whose id is
    ``"<sanitized file stem>_<chunk position>"``.

    Args:
        text_chunks: Iterable of chunk strings to store.
        filename: Source file name; only the part before the first ``"."``
            is used to build the ids.
        file_type: Kind of the source file, recorded in document metadata.
        namespace: Pinecone namespace to write to.
        index: Pinecone index handle passed to ``PineconeVectorStore``.

    Returns:
        ``True`` when the documents were added, ``False`` if anything failed
        (the failure is logged rather than raised).
    """
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(
            index=index, embedding=embedding, namespace=namespace
        )

        file_name = (
            filename.split(".")[0]
            .replace(" ", "_")
            .replace("-", "_")
            .replace(".", "_")
            .replace("/", "_")
            .replace("\\", "_")
            .strip()
        )
        # Loop-invariant, so computed once rather than per chunk.
        clean_filename = remove_non_standard_ascii(file_name)

        documents = []
        uuids = []
        for i, chunk in enumerate(text_chunks):
            # NOTE(review): the original Document(...) arguments were not
            # visible; page_content plus filename/file_type metadata is
            # assumed here -- confirm against downstream consumers.
            documents.append(
                Document(
                    page_content=chunk,
                    metadata={"filename": filename, "file_type": file_type},
                )
            )
            uuids.append(f"{clean_filename}_{i}")

        vector_store.add_documents(documents=documents, ids=uuids)
        return True
    except Exception:
        # Best-effort contract: callers get False, but the cause is logged
        # instead of being silently discarded.
        logging.getLogger(__name__).exception(
            "Failed to store chunks for %r in namespace %r", filename, namespace
        )
        return False
def get_retreive_answer(enterprise_id, prompt, index):
    """Retrieve the stored documents most relevant to ``prompt``.

    Args:
        enterprise_id: Used as the Pinecone namespace to search in.
        prompt: The query text handed to the retriever.
        index: Pinecone index handle passed to ``PineconeVectorStore``.

    Returns:
        The retriever's result for ``prompt`` (presumably a list of
        ``Document`` objects -- confirm against the LangChain version in
        use), or ``False`` if anything failed. Failures are logged.
    """
    try:
        embedding = OpenAIEmbeddings(model="text-embedding-3-large")
        vector_store = PineconeVectorStore(
            index=index, embedding=embedding, namespace=enterprise_id
        )
        retriever = vector_store.as_retriever(
            # ``score_threshold`` is only applied by this search type; with
            # the default "similarity" search it is not used as a filter.
            search_type="similarity_score_threshold",
            search_kwargs={"k": 3, "score_threshold": 0.6},
        )
        return retriever.invoke(prompt)
    except Exception:
        # Best-effort contract: callers get False, but the cause is logged
        # instead of being silently discarded.
        logging.getLogger(__name__).exception(
            "Retrieval failed for namespace %r", enterprise_id
        )
        return False
def generate_response_via_langchain(
    query: str,
    stream: bool = False,
    model: str = "gpt-4o-mini",
    context: str = "",
    messages=None,
    style: str = "formal",
    tonality: str = "neutral",
):
    """Answer ``query`` through a prompt | chat-model | string-parser chain.

    Args:
        query: The user request appended at the end of the prompt.
        stream: When true, return the chain's ``astream`` result instead of
            invoking it.
        model: Name of the chat model handed to ``ChatOpenAI``.
        context: Background text interpolated into the prompt.
        messages: Conversation history interpolated into the prompt;
            defaults to an empty list.
        style: Writing style requested in the prompt.
        tonality: Tone requested in the prompt.

    Returns:
        The result of ``llm_chain.invoke(...)`` when ``stream`` is false,
        otherwise the object returned by ``llm_chain.astream(...)``, which
        has to be consumed asynchronously (``async for``).
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if messages is None:
        messages = []

    # Define the prompt template
    template = "En tant qu'IA experte en marketing, réponds avec un style {style} et une tonalité {tonality} dans ta communcation, sachant le context suivant: {context}, et l'historique de la conversation: {messages}, {query}"
    prompt = PromptTemplate.from_template(template)

    # Initialize the OpenAI chat model with the specified model name
    llm = ChatOpenAI(model=model)

    # Chain: fill the prompt, call the model, parse the output to a string
    llm_chain = prompt | llm | StrOutputParser()

    # Both the streaming and non-streaming paths take the same inputs.
    chain_inputs = {
        "query": query,
        "context": context,
        "messages": messages,
        "style": style,
        "tonality": tonality,
    }

    if stream:
        # Yields response pieces as they arrive, asynchronously
        return llm_chain.astream(chain_inputs)

    # Invoke the chain and return the complete result
    return llm_chain.invoke(chain_inputs)
def setup_rag(file_type, content, filename=None, namespace=None, index=None):
    """Extract text from ``content``, chunk it, and store it in the index.

    Args:
        file_type: ``"pdf"`` or ``"audio"``; selects the text extractor.
        content: Parsed file content in the shape the matching extractor
            expects.
        filename: Source file name, forwarded to ``get_vectorstore``.
        namespace: Pinecone namespace, forwarded to ``get_vectorstore``.
        index: Pinecone index handle, forwarded to ``get_vectorstore``.

    Returns:
        The result of ``get_vectorstore`` for the produced chunks.

    Raises:
        ValueError: If ``file_type`` is not supported, or if ``filename`` or
            ``index`` is missing (``get_vectorstore`` cannot work without
            them).
    """
    if file_type not in ("pdf", "audio"):
        # Previously this fell through and failed on an unbound ``text``.
        raise ValueError(f"Unsupported file_type: {file_type!r}")
    if filename is None or index is None:
        # get_vectorstore needs both; fail early with a clear message.
        raise ValueError("setup_rag requires 'filename' and 'index'")

    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    else:
        text = get_text_from_content_for_audio(content)

    chunks = get_text_chunks(text)
    return get_vectorstore(chunks, filename, file_type, namespace, index)