Clement Vachet committed on
Commit
58b5050
·
1 Parent(s): 577e81d

Use langchain-chroma and langchain-huggingface libraries

Browse files
Files changed (1) hide show
  1. app.py +18 -14
app.py CHANGED
@@ -3,13 +3,12 @@ import os
3
 
4
  from langchain_community.document_loaders import PyPDFLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain_community.vectorstores import Chroma
7
  from langchain.chains import ConversationalRetrievalChain
8
- from langchain_community.embeddings import HuggingFaceEmbeddings
9
- from langchain_community.llms import HuggingFacePipeline
10
  from langchain.chains import ConversationChain
11
  from langchain.memory import ConversationBufferMemory
12
- from langchain_community.llms import HuggingFaceEndpoint
13
 
14
  from pathlib import Path
15
  import chromadb
@@ -23,7 +22,6 @@ import accelerate
23
  import re
24
 
25
 
26
-
27
  # default_persist_directory = './chroma_HF/'
28
  list_llm = ["mistralai/Mistral-7B-Instruct-v0.2", "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.1", \
29
  "google/gemma-7b-it","google/gemma-2b-it", \
@@ -34,8 +32,11 @@ list_llm = ["mistralai/Mistral-7B-Instruct-v0.2", "mistralai/Mixtral-8x7B-Instru
34
  ]
35
  list_llm_simple = [os.path.basename(llm) for llm in list_llm]
36
 
 
37
  # Load PDF document and create doc splits
38
  def load_doc(list_file_path, chunk_size, chunk_overlap):
 
 
39
  loaders = [PyPDFLoader(x) for x in list_file_path]
40
  pages = []
41
  for loader in loaders:
@@ -49,7 +50,13 @@ def load_doc(list_file_path, chunk_size, chunk_overlap):
49
 
50
  # Create vector database
51
  def create_db(splits, collection_name):
52
- embedding = HuggingFaceEmbeddings()
 
 
 
 
 
 
53
  new_client = chromadb.EphemeralClient()
54
  vectordb = Chroma.from_documents(
55
  documents=splits,
@@ -61,23 +68,19 @@ def create_db(splits, collection_name):
61
  return vectordb
62
 
63
 
64
- # Load vector database
65
- def load_db():
66
- embedding = HuggingFaceEmbeddings()
67
- vectordb = Chroma(
68
- # persist_directory=default_persist_directory,
69
- embedding_function=embedding)
70
- return vectordb
71
-
72
 
73
  # Initialize langchain LLM chain
74
  def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
 
 
75
  progress(0.1, desc="Initializing HF tokenizer...")
76
  # HuggingFaceHub uses HF inference endpoints
77
  progress(0.5, desc="Initializing HF Hub...")
78
  # Use of trust_remote_code as model_kwargs
79
  # Warning: langchain issue
80
  # URL: https://github.com/langchain-ai/langchain/issues/6080
 
 
81
  if llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
82
  llm = HuggingFaceEndpoint(
83
  repo_id=llm_model,
@@ -132,6 +135,7 @@ def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db, pr
132
  max_new_tokens = max_tokens,
133
  top_k = top_k,
134
  )
 
135
 
136
  progress(0.75, desc="Defining buffer memory...")
137
  memory = ConversationBufferMemory(
 
3
 
4
  from langchain_community.document_loaders import PyPDFLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain_chroma import Chroma
7
  from langchain.chains import ConversationalRetrievalChain
8
+ from langchain_huggingface import HuggingFaceEmbeddings
 
9
  from langchain.chains import ConversationChain
10
  from langchain.memory import ConversationBufferMemory
11
+ from langchain_huggingface import HuggingFaceEndpoint
12
 
13
  from pathlib import Path
14
  import chromadb
 
22
  import re
23
 
24
 
 
25
  # default_persist_directory = './chroma_HF/'
26
  list_llm = ["mistralai/Mistral-7B-Instruct-v0.2", "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.1", \
27
  "google/gemma-7b-it","google/gemma-2b-it", \
 
32
  ]
33
  list_llm_simple = [os.path.basename(llm) for llm in list_llm]
34
 
35
+
36
  # Load PDF document and create doc splits
37
  def load_doc(list_file_path, chunk_size, chunk_overlap):
38
+ """Load PDF document and create doc splits"""
39
+
40
  loaders = [PyPDFLoader(x) for x in list_file_path]
41
  pages = []
42
  for loader in loaders:
 
50
 
51
  # Create vector database
52
  def create_db(splits, collection_name):
53
+ """Create embeddings and vector database"""
54
+
55
+ embedding = HuggingFaceEmbeddings(
56
+ model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
57
+ model_kwargs={'device': 'cpu'},
58
+ encode_kwargs={'normalize_embeddings': False}
59
+ )
60
  new_client = chromadb.EphemeralClient()
61
  vectordb = Chroma.from_documents(
62
  documents=splits,
 
68
  return vectordb
69
 
70
 
 
 
 
 
 
 
 
 
71
 
72
  # Initialize langchain LLM chain
73
  def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
74
+ """Initialize Langchain LLM chain"""
75
+
76
  progress(0.1, desc="Initializing HF tokenizer...")
77
  # HuggingFaceHub uses HF inference endpoints
78
  progress(0.5, desc="Initializing HF Hub...")
79
  # Use of trust_remote_code as model_kwargs
80
  # Warning: langchain issue
81
  # URL: https://github.com/langchain-ai/langchain/issues/6080
82
+
83
+ # WARNING - simplify LLM use
84
  if llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
85
  llm = HuggingFaceEndpoint(
86
  repo_id=llm_model,
 
135
  max_new_tokens = max_tokens,
136
  top_k = top_k,
137
  )
138
+
139
 
140
  progress(0.75, desc="Defining buffer memory...")
141
  memory = ConversationBufferMemory(