Spaces:
Runtime error
Runtime error
Adrian Cowham
committed on
Commit
·
5d02356
1
Parent(s):
cfc622d
changed embedding model to finetuned model
Browse files
- src/app.py +2 -7
- src/core/embedding.py +14 -26
src/app.py
CHANGED
@@ -18,7 +18,7 @@ from src.core.parsing import read_file
|
|
18 |
VECTOR_STORE = "faiss"
|
19 |
MODEL = "openai"
|
20 |
EMBEDDING = "openai"
|
21 |
-
MODEL = "gpt-
|
22 |
K = 5
|
23 |
USE_VERBOSE = True
|
24 |
API_KEY = os.environ["OPENAI_API_KEY"]
|
@@ -52,12 +52,7 @@ def getretriever():
|
|
52 |
print(e)
|
53 |
|
54 |
chunked_file = chunk_file(file, chunk_size=512, chunk_overlap=0)
|
55 |
-
folder_index = embed_files(
|
56 |
-
files=[chunked_file],
|
57 |
-
embedding=EMBEDDING,
|
58 |
-
vector_store=VECTOR_STORE,
|
59 |
-
openai_api_key=API_KEY,
|
60 |
-
)
|
61 |
return folder_index.index.as_retriever(verbose=True, search_type="similarity", search_kwargs={"k": K})
|
62 |
|
63 |
retriever = getretriever()
|
|
|
18 |
VECTOR_STORE = "faiss"
|
19 |
MODEL = "openai"
|
20 |
EMBEDDING = "openai"
|
21 |
+
MODEL = "gpt-4"
|
22 |
K = 5
|
23 |
USE_VERBOSE = True
|
24 |
API_KEY = os.environ["OPENAI_API_KEY"]
|
|
|
52 |
print(e)
|
53 |
|
54 |
chunked_file = chunk_file(file, chunk_size=512, chunk_overlap=0)
|
55 |
+
folder_index = embed_files(files=[chunked_file])
|
|
|
|
|
|
|
|
|
|
|
56 |
return folder_index.index.as_retriever(verbose=True, search_type="similarity", search_kwargs={"k": K})
|
57 |
|
58 |
retriever = getretriever()
|
src/core/embedding.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from typing import List, Type
|
2 |
|
3 |
from langchain.docstore.document import Document
|
4 |
-
from langchain.embeddings import
|
5 |
from langchain.embeddings.base import Embeddings
|
6 |
from langchain.vectorstores import VectorStore
|
7 |
from langchain.vectorstores.faiss import FAISS
|
@@ -47,30 +47,18 @@ class FolderIndex:
|
|
47 |
return cls(files=files, index=index)
|
48 |
|
49 |
|
50 |
-
def embed_files(
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
}
|
63 |
-
|
64 |
-
if embedding in supported_embeddings:
|
65 |
-
_embeddings = supported_embeddings[embedding](**kwargs)
|
66 |
-
else:
|
67 |
-
raise NotImplementedError(f"Embedding {embedding} not supported.")
|
68 |
-
|
69 |
-
if vector_store in supported_vector_stores:
|
70 |
-
_vector_store = supported_vector_stores[vector_store]
|
71 |
-
else:
|
72 |
-
raise NotImplementedError(f"Vector store {vector_store} not supported.")
|
73 |
-
|
74 |
return FolderIndex.from_files(
|
75 |
-
files=files, embeddings=
|
76 |
)
|
|
|
1 |
from typing import List, Type
|
2 |
|
3 |
from langchain.docstore.document import Document
|
4 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
5 |
from langchain.embeddings.base import Embeddings
|
6 |
from langchain.vectorstores import VectorStore
|
7 |
from langchain.vectorstores.faiss import FAISS
|
|
|
47 |
return cls(files=files, index=index)
|
48 |
|
49 |
|
50 |
+
def embed_files(files: List[File]) -> FolderIndex:
|
51 |
+
model_name = "adriancowham/letstalk-embed-gte-small"
|
52 |
+
model_kwargs = {'device': 'cpu'}
|
53 |
+
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
|
54 |
+
print("Loading model...")
|
55 |
+
model_norm = HuggingFaceEmbeddings(
|
56 |
+
model_name=model_name,
|
57 |
+
model_kwargs=model_kwargs,
|
58 |
+
encode_kwargs=encode_kwargs
|
59 |
+
)
|
60 |
+
print("Model loaded.")
|
61 |
+
embeddings = model_norm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
return FolderIndex.from_files(
|
63 |
+
files=files, embeddings=embeddings, vector_store=FAISS
|
64 |
)
|