|
|
|
"""chatbot_with_memory (1).ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1sIEqI5-wciuiYOdlEYwBkTPUIlvMEzkF |
|
""" |
|
|
|
!pip install chromadb==0.4.6 |
|
!pip install pydantic==1.10 |
|
!pip install sentence-transformers |
|
|
|
!pip install huggingface_hub |
|
|
|
!pip install transformers |
|
|
|
from langchain.document_loaders import TextLoader |
|
from langchain.text_splitter import CharacterTextSplitter |
|
from langchain.embeddings import HuggingFaceEmbeddings |
|
from langchain.vectorstores import FAISS |
|
from langchain.chains.question_answering import load_qa_chain |
|
from langchain import HuggingFaceHub
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredURLLoader
|
import os |
|
huggingfacehub_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN") |
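# The HuggingFaceHub wrappers below read this variable from the environment,
# so it must be set before they are instantiated, e.g. (placeholder value):
# os.environ["HUGGINGFACEHUB_API_TOKEN"] = "hf_..."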
|
|
|
|
|
!pip install pypdf
|
|
|
from langchain.document_loaders import PyPDFLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
|
|
|
|
|
loader = PyPDFLoader("/content/Document sans titre (5).pdf") |
|
|
|
pages = loader.load_and_split() |
|
|
|
|
|
|
|
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=128,
    chunk_overlap=64,
    # paragraphs, then lines, then sentence boundaries (a regex lookbehind,
    # honored only by splitter versions that treat separators as regexes),
    # then words
    separators=['\n\n', '\n', '(?<=\. )', ' ', '']
)
|
docs = text_splitter.split_documents(pages) |
|
|
|
from langchain.embeddings import HuggingFaceEmbeddings |
|
embeddings = HuggingFaceEmbeddings() |
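# With no arguments, HuggingFaceEmbeddings uses the
# sentence-transformers/all-mpnet-base-v2 model. A quick sanity check
# (illustrative only):
vector = embeddings.embed_query("phrase de test")
print(len(vector))  # 768 dimensions for the default model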
|
|
|
!pip install faiss-gpu  # use faiss-cpu on runtimes without a GPU
|
|
|
|
|
|
|
from langchain.vectorstores import FAISS |
|
db = FAISS.from_documents(docs, embeddings) |
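# The index lives in memory; it can also be persisted and reloaded between
# sessions (a minimal sketch, the "faiss_index" directory name is arbitrary):
db.save_local("faiss_index")
db = FAISS.load_local("faiss_index", embeddings)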
|
|
|
llm = HuggingFaceHub(repo_id="google/flan-t5-xxl",
                     model_kwargs={"temperature": 1, "max_length": 512, "max_new_tokens": 500})
|
chain = load_qa_chain(llm, chain_type="stuff") |
|
|
|
|
|
query = "quelles sont les villes les facultees de medcine ?" |
|
docs = db.similarity_search(query) |
|
chain.run(input_documents=docs, question=query) |
|
|
|
from langchain.chains import RetrievalQA |
|
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff",
                                 retriever=db.as_retriever(search_kwargs={"k": 3}))
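# To see which chunks ground each answer, the same chain can be built with
# return_source_documents=True (a sketch; the chain must then be called with
# a dict, since .run() only returns the answer string):
qa_with_sources = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True)
result = qa_with_sources({"query": "Quelles sont les facultés de médecine ?"})
print(result["result"])
print(result["source_documents"])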
|
|
|
query = "donner moi plus des information sur les facultees de medcine?" |
|
qa.run(query) |
|
|
|
query = "What is the meaning of Descriptive Data Analysis?" |
|
qa.run(query) |
|
|
|
repo_id = 'google/flan-t5-xxl' |
|
llm = HuggingFaceHub(huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"], |
|
repo_id=repo_id, |
|
model_kwargs={'temperature':0.5, 'max_length':256}) |
|
|
|
query1 = "Bonjour, je suis zaynab ,j'ai des questions a vous " |
|
query2 = "j'habite a marrakech. tu sait son pays?" |
|
query3 = "quel est mon prenom?" |
|
query4 = "ou j'habite" |
|
|
|
!pip install langchain --upgrade
|
|
|
from langchain import HuggingFaceHub |
|
from langchain.chains import ConversationChain |
|
|
|
"""### Conversation Buffer memory""" |
|
|
|
from langchain.chains.conversation.memory import ConversationBufferMemory |
|
|
|
memory = ConversationBufferMemory() |
|
conversation_buf = ConversationChain(
    llm=llm,
    memory=memory)
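# ConversationBufferMemory stores the full transcript verbatim: every
# .predict() call appends a Human/AI turn to memory.buffer. A standalone
# illustration:
demo_memory = ConversationBufferMemory()
demo_memory.save_context({"input": "bonjour"}, {"output": "Bonjour !"})
print(demo_memory.buffer)  # Human: bonjour\nAI: Bonjour !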
|
|
|
print("input: ",query1) |
|
conversation_buf.predict(input=query1) |
|
|
|
print("input: ",query2) |
|
conversation_buf.predict(input=query2) |
|
|
|
memory.load_memory_variables({}) |
|
|
|
print("input: ",query3) |
|
conversation_buf.predict(input=query3) |
|
|
|
print("input: ",query4) |
|
conversation_buf.predict(input=query4) |
|
|
|
print(memory.buffer) |
|
|
|
"""### Conversation Buffer Window Memory""" |
|
|
|
from langchain.memory import ConversationBufferWindowMemory |
|
|
|
memory2 = ConversationBufferWindowMemory(k=2) |
|
conversation_buf2 = ConversationChain(
    llm=llm,
    memory=memory2
)
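# With k=2, only the two most recent exchanges stay in the prompt; anything
# older is dropped, which is why questions about early turns can fail. A
# standalone illustration:
demo_window = ConversationBufferWindowMemory(k=1)
demo_window.save_context({"input": "premier"}, {"output": "un"})
demo_window.save_context({"input": "second"}, {"output": "deux"})
print(demo_window.load_memory_variables({}))  # only the "second" exchange survives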
|
|
|
print("input: ",query1) |
|
conversation_buf2.predict(input=query1) |
|
|
|
print("input: ",query2) |
|
conversation_buf2.predict(input=query2) |
|
|
|
print("input: ",query3) |
|
conversation_buf2.predict(input=query3) |
|
|
|
print(memory2.buffer) |
|
|
|
"""### Conversation Summary Memory""" |
|
|
|
from langchain.memory import ConversationSummaryBufferMemory |
|
|
|
memory3 = ConversationSummaryBufferMemory(llm=llm, max_token_limit=80) |
|
conversation_buf3 = ConversationChain(
    llm=llm,
    memory=memory3
)
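# ConversationSummaryBufferMemory keeps recent turns verbatim; once the
# transcript exceeds max_token_limit (80 tokens here), it uses the LLM itself
# to fold older turns into a running summary (moving_summary_buffer), trading
# fidelity for a bounded prompt size.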
|
|
|
print("input: ",query1) |
|
conversation_buf3.predict(input=query1) |
|
|
|
print("input: ",query2) |
|
conversation_buf3.predict(input=query2) |
|
|
|
print("input: ",query3) |
|
conversation_buf3.predict(input=query3) |
|
|
|
memory3.load_memory_variables({}) |
|
|
|
"""### Chat PDF with Memory |
|
|
|
Updated version of Pydantic package (dependency of chromadb) has changed leaving chromadb, incompatible: here are the possible solutions: [import error chromadb](https://github.com/langchain-ai/langchain/issues/1957) || Install specific versions of chromadb and pydantic while the bug is resolved |
|
|
|
 |
|
|
|
path = input("Enter PDF file path: ") |
|
loader = PyPDFLoader(path) |
|
pages = loader.load() |
|
|
|
|
|
len(pages) |
|
|
|
splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=10) |
|
docs = splitter.split_documents(pages) |
|
|
|
num_chunks = len(docs)
print("Number of chunks:", num_chunks)

for doc in docs:
    print(doc)
|
|
|
from langchain.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings()
doc_search = Chroma.from_documents(docs, embeddings)
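# By default this Chroma index is in-memory only; passing a persist_directory
# would write it to disk (a sketch, the directory name is arbitrary):
# doc_search = Chroma.from_documents(docs, embeddings, persist_directory="chroma_db")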
|
|
|
print(doc_search) |
|
|
|
query = "Quelle sont les Facultees existent ?" |
|
similar_docs = doc_search.similarity_search(query, k=3) |
|
|
|
print(similar_docs) |
|
|
|
query = "donner moi des information ecole nationale d'Industrie Minérale ?" |
|
similar_docs = doc_search.similarity_search(query, k=10) |
|
|
|
repo_id = 'google/flan-t5-xxl'
llm = HuggingFaceHub(huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
                     repo_id=repo_id,
                     model_kwargs={'temperature': 1, 'max_length': 512})
|
|
|
template = """ |
|
Use the following context (delimited by <ctx></ctx>) and the chat history (delimited by <hs></hs>) to answer the question: |
|
------ |
|
<ctx> |
|
{context} |
|
</ctx> |
|
------ |
|
<hs> |
|
{history} |
|
</hs> |
|
------ |
|
{question} |
|
Answer: |
|
""" |
|
prompt = PromptTemplate( |
|
input_variables=["history", "context", "question"], |
|
template=template, |
|
) |
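# Quick check that the template renders with all three variables
# (illustrative values only):
print(prompt.format(history="", context="<docs>", question="Question de test ?"))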
|
|
|
memory = ConversationBufferMemory(
    memory_key="history",
    input_key="question"
)
|
|
|
retrieval_chain = RetrievalQA.from_chain_type(llm,
                                              chain_type='stuff',
                                              retriever=doc_search.as_retriever(),
                                              chain_type_kwargs={
                                                  "prompt": prompt,
                                                  "memory": memory
                                              })
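# Note: memory_key="history" and input_key="question" above must match the
# {history} and {question} variables in the prompt template; RetrievalQA fills
# {context} from the retriever on each call.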
|
|
|
query = " donner moi les villes de ces facultees de medcine? " |
|
retrieval_chain.run(query) |
|
|
|
query = "donner moi des information sur Facultees de medcine ?" |
|
retrieval_chain.run(query) |
|
|
|
memory.load_memory_variables({}) |
|
|
|
|