Adrian Cowham committed on
Commit
5d02356
·
1 Parent(s): cfc622d

changed embedding model to fine-tuned model

Browse files
Files changed (2) hide show
  1. src/app.py +2 -7
  2. src/core/embedding.py +14 -26
src/app.py CHANGED
@@ -18,7 +18,7 @@ from src.core.parsing import read_file
18
  VECTOR_STORE = "faiss"
19
  MODEL = "openai"
20
  EMBEDDING = "openai"
21
- MODEL = "gpt-3.5-turbo-16k"
22
  K = 5
23
  USE_VERBOSE = True
24
  API_KEY = os.environ["OPENAI_API_KEY"]
@@ -52,12 +52,7 @@ def getretriever():
52
  print(e)
53
 
54
  chunked_file = chunk_file(file, chunk_size=512, chunk_overlap=0)
55
- folder_index = embed_files(
56
- files=[chunked_file],
57
- embedding=EMBEDDING,
58
- vector_store=VECTOR_STORE,
59
- openai_api_key=API_KEY,
60
- )
61
  return folder_index.index.as_retriever(verbose=True, search_type="similarity", search_kwargs={"k": K})
62
 
63
  retriever = getretriever()
 
18
  VECTOR_STORE = "faiss"
19
  MODEL = "openai"
20
  EMBEDDING = "openai"
21
+ MODEL = "gpt-4"
22
  K = 5
23
  USE_VERBOSE = True
24
  API_KEY = os.environ["OPENAI_API_KEY"]
 
52
  print(e)
53
 
54
  chunked_file = chunk_file(file, chunk_size=512, chunk_overlap=0)
55
+ folder_index = embed_files(files=[chunked_file])
 
 
 
 
 
56
  return folder_index.index.as_retriever(verbose=True, search_type="similarity", search_kwargs={"k": K})
57
 
58
  retriever = getretriever()
src/core/embedding.py CHANGED
@@ -1,7 +1,7 @@
1
  from typing import List, Type
2
 
3
  from langchain.docstore.document import Document
4
- from langchain.embeddings import OpenAIEmbeddings
5
  from langchain.embeddings.base import Embeddings
6
  from langchain.vectorstores import VectorStore
7
  from langchain.vectorstores.faiss import FAISS
@@ -47,30 +47,18 @@ class FolderIndex:
47
  return cls(files=files, index=index)
48
 
49
 
50
- def embed_files(
51
- files: List[File], embedding: str, vector_store: str, **kwargs
52
- ) -> FolderIndex:
53
- """Embeds a collection of files and stores them in a FolderIndex."""
54
-
55
- supported_embeddings: dict[str, Type[Embeddings]] = {
56
- "openai": OpenAIEmbeddings,
57
- "debug": FakeEmbeddings,
58
- }
59
- supported_vector_stores: dict[str, Type[VectorStore]] = {
60
- "faiss": FAISS,
61
- "debug": FakeVectorStore,
62
- }
63
-
64
- if embedding in supported_embeddings:
65
- _embeddings = supported_embeddings[embedding](**kwargs)
66
- else:
67
- raise NotImplementedError(f"Embedding {embedding} not supported.")
68
-
69
- if vector_store in supported_vector_stores:
70
- _vector_store = supported_vector_stores[vector_store]
71
- else:
72
- raise NotImplementedError(f"Vector store {vector_store} not supported.")
73
-
74
  return FolderIndex.from_files(
75
- files=files, embeddings=_embeddings, vector_store=_vector_store
76
  )
 
1
  from typing import List, Type
2
 
3
  from langchain.docstore.document import Document
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
  from langchain.embeddings.base import Embeddings
6
  from langchain.vectorstores import VectorStore
7
  from langchain.vectorstores.faiss import FAISS
 
47
  return cls(files=files, index=index)
48
 
49
 
50
+ def embed_files(files: List[File]) -> FolderIndex:
51
+ model_name = "adriancowham/letstalk-embed-gte-small"
52
+ model_kwargs = {'device': 'cpu'}
53
+ encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
54
+ print("Loading model...")
55
+ model_norm = HuggingFaceEmbeddings(
56
+ model_name=model_name,
57
+ model_kwargs=model_kwargs,
58
+ encode_kwargs=encode_kwargs
59
+ )
60
+ print("Model loaded.")
61
+ embeddings = model_norm
 
 
 
 
 
 
 
 
 
 
 
 
62
  return FolderIndex.from_files(
63
+ files=files, embeddings=embeddings, vector_store=FAISS
64
  )