# -*- coding: utf-8 -*-
"""chatbot_with_memory (1).ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1sIEqI5-wciuiYOdlEYwBkTPUIlvMEzkF
"""
!pip install chromadb==0.4.6
!pip install pydantic==1.10
!pip install sentence-transformers
!pip install huggingface_hub
!pip install transformers
from langchain.document_loaders import TextLoader  # for text files
from langchain.text_splitter import CharacterTextSplitter  # text splitter
from langchain.embeddings import HuggingFaceEmbeddings  # for using Hugging Face models
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain.document_loaders import UnstructuredPDFLoader  # load PDFs
from langchain.indexes import VectorstoreIndexCreator  # vectorized db index with chromadb
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredURLLoader  # load URLs into a document loader
import os
huggingfacehub_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
!pip install pypdf
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Load the PDF file from current working directory
loader = PyPDFLoader("/content/Document sans titre (5).pdf")
# Split the PDF into Pages
pages = loader.load_and_split()
# Define chunk size, overlap and separators
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=128,
    chunk_overlap=64,
    separators=['\n\n', '\n', '(?<=\. )', ' ', '']  # lookbehind splits after sentence-ending periods
)
docs = text_splitter.split_documents(pages)
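# Optional sanity check: confirm how many chunks the splitter produced.
print(f"Split {len(pages)} pages into {len(docs)} chunks")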
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()
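# With no arguments, HuggingFaceEmbeddings falls back to a default
# sentence-transformers model, downloaded on first use.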
!pip install faiss-gpu  # use faiss-cpu if no GPU is available
#Create the vectorized db
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
db = FAISS.from_documents(docs, embeddings)
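# Optional: persist the index so later runs can skip re-embedding. The
# "faiss_index" directory name here is just an example.
db.save_local("faiss_index")
# ...and reload it later with:
# db = FAISS.load_local("faiss_index", embeddings)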
llm = HuggingFaceHub(repo_id="google/flan-t5-xxl",
                     model_kwargs={"temperature": 1, "max_new_tokens": 500})
chain = load_qa_chain(llm, chain_type="stuff")
# Querying
query = "Which cities have faculties of medicine?"
docs = db.similarity_search(query)
print(chain.run(input_documents=docs, question=query))
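# Optional: FAISS can also return distance scores with each match, which
# helps gauge retrieval quality (lower L2 distance = closer match).
for doc, score in db.similarity_search_with_score(query, k=3):
    print(round(score, 3), doc.page_content[:80])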
from langchain.chains import RetrievalQA
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff",
                                 retriever=db.as_retriever(search_kwargs={"k": 3}))
query = "Give me more information about the faculties of medicine."
print(qa.run(query))
query = "What is the meaning of Descriptive Data Analysis?"
qa.run(query)#import csv
repo_id = 'google/flan-t5-xxl'  # an 11B-parameter instruction-tuned model: https://huggingface.co/google/flan-t5-xxl
llm = HuggingFaceHub(huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
                     repo_id=repo_id,
                     model_kwargs={'temperature': 0.5, 'max_length': 256})
query1 = "Bonjour, je suis zaynab ,j'ai des questions a vous "
query2 = "j'habite a marrakech. tu sait son pays?"
query3 = "quel est mon prenom?"
query4 = "ou j'habite"
!pip install langchain --upgrade
from langchain import HuggingFaceHub
from langchain.chains import ConversationChain
"""### Conversation Buffer memory"""
from langchain.chains.conversation.memory import ConversationBufferMemory
# Adjust the import path accordingly
memory = ConversationBufferMemory()
conversation_buf = ConversationChain(
    llm=llm,
    memory=memory
)
print("input: ",query1)
conversation_buf.predict(input=query1)
print("input: ",query2)
conversation_buf.predict(input=query2)
memory.load_memory_variables({})
print("input: ",query3)
conversation_buf.predict(input=query3)
print("input: ",query4)
conversation_buf.predict(input=query4)
print(memory.buffer)
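# ConversationBufferMemory stores the full transcript verbatim, so earlier
# facts (the name, the city) stay available to later turns, but the prompt
# grows with every exchange.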
"""### Conversation Buffer Window Memory"""
from langchain.memory import ConversationBufferWindowMemory
memory2 = ConversationBufferWindowMemory(k=2)
conversation_buf2 = ConversationChain(
llm=llm,
memory=memory2
)
print("input: ",query1)
conversation_buf2.predict(input=query1)
print("input: ",query2)
conversation_buf2.predict(input=query2)
print("input: ",query3)
conversation_buf2.predict(input=query3)
print(memory2.buffer)
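# ConversationBufferWindowMemory keeps only the last k exchanges: with k=2,
# the first turn is evicted once the third completes, which bounds the
# prompt size at the cost of forgetting older context.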
"""### Conversation Summary Memory"""
from langchain.memory import ConversationSummaryBufferMemory
memory3 = ConversationSummaryBufferMemory(llm=llm, max_token_limit=80)
conversation_buf3 = ConversationChain(
llm=llm,
memory=memory3
)
print("input: ",query1)
conversation_buf3.predict(input=query1)
print("input: ",query2)
conversation_buf3.predict(input=query2)
print("input: ",query3)
conversation_buf3.predict(input=query3)
memory3.load_memory_variables({})
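# ConversationSummaryBufferMemory keeps recent turns verbatim and has the
# LLM summarize anything past max_token_limit; the running summary can be
# inspected directly:
print(memory3.moving_summary_buffer)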
"""### Chat PDF with Memory
A recent update to the pydantic package (a dependency of chromadb) broke compatibility with chromadb. Possible solutions are discussed here: [import error chromadb](https://github.com/langchain-ai/langchain/issues/1957), or pin specific versions of chromadb and pydantic until the bug is resolved.
"""
!pip install pypdf
import langchain
import chromadb
import os
import getpass
from langchain.document_loaders import PyPDFLoader #document loader: https://python.langchain.com/docs/modules/data_connection/document_loaders
from langchain.text_splitter import RecursiveCharacterTextSplitter #document transformer: text splitter for chunking
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import PromptTemplate
from langchain.vectorstores import Chroma #vector store
from langchain import HuggingFaceHub #model hub
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
# Loading the API key
os.environ['HUGGING_FACE_HUB_API_KEY'] = getpass.getpass('Hugging face api key:')
path = input("Enter PDF file path: ")  # e.g. "C:/Users/Sourav/Downloads/pdf"
loader = PyPDFLoader(path)
pages = loader.load()
# Number of pages
print("Number of pages:", len(pages))
splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=10)
docs = splitter.split_documents(pages)
# The splitter produces document chunks, not tokens
num_chunks = len(docs)
print("Number of chunks:", num_chunks)
for doc in docs:
    print(doc)
embeddings = HuggingFaceEmbeddings()
doc_search = Chroma.from_documents(docs, embeddings)
print(doc_search)
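# Optional: pass a persist_directory so the Chroma index survives restarts
# ("chroma_db" is just an example path):
# doc_search = Chroma.from_documents(docs, embeddings, persist_directory="chroma_db")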
query = "Quelle sont les Facultees existent ?"
similar_docs = doc_search.similarity_search(query, k=3)
print(similar_docs)
query = "donner moi des information ecole nationale d'Industrie Minérale ?"
similar_docs = doc_search.similarity_search(query, k=10)
repo_id = 'google/flan-t5-xxl'  # an 11B-parameter instruction-tuned model: https://huggingface.co/google/flan-t5-xxl
llm = HuggingFaceHub(huggingfacehub_api_token=os.environ['HUGGING_FACE_HUB_API_KEY'],
                     repo_id=repo_id,
                     model_kwargs={'temperature': 1, 'max_length': 512})
template = """
Use the following context (delimited by <ctx></ctx>) and the chat history (delimited by <hs></hs>) to answer the question:
------
<ctx>
{context}
</ctx>
------
<hs>
{history}
</hs>
------
{question}
Answer:
"""
prompt = PromptTemplate(
input_variables=["history", "context", "question"],
template=template,
)
memory = ConversationBufferMemory(
memory_key="history",
input_key="question"
)
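# input_key="question" tells the memory which chain input carries the user's
# message, since the prompt also receives "context" filled in by the retriever.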
retrieval_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type='stuff',
    retriever=doc_search.as_retriever(),
    chain_type_kwargs={
        "prompt": prompt,
        "memory": memory
    }
)
query = " donner moi les villes de ces facultees de medcine? "
retrieval_chain.run(query)
query = "donner moi des information sur Facultees de medcine ?"
retrieval_chain.run(query)
print(memory.load_memory_variables({}))
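# A minimal interactive loop over the same chain (a sketch; assumes a
# terminal session, type "exit" to stop):
while True:
    user_query = input("Question (or 'exit'): ")
    if user_query.strip().lower() == "exit":
        break
    print(retrieval_chain.run(user_query))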