Spaces:
Sleeping
Sleeping
import os | |
import sys | |
sys.path.append(sys.path[0].replace('scripts', '')) | |
import pandas as pd | |
import numpy as np | |
from config.data_paths import VECTORDB_PATH | |
from typing import Sequence, List, Tuple | |
import faiss | |
from sentence_transformers import SentenceTransformer | |
class Vectorizer:
    """Embed text prompts with a sentence-transformers model.

    Embeddings are L2-normalized so that an inner product between two of them
    equals their cosine similarity. Optionally the embeddings are persisted as
    a FAISS inner-product index under ``VECTORDB_PATH``.
    """

    def __init__(self, model_name: str) -> None:
        """
        Initialize the vectorizer with a pre-trained embedding model.

        Args:
            model_name: The name of the pre-trained embedding model
                (compatible with sentence-transformers).
        """
        self.model = SentenceTransformer(model_name)

    def transform(self, prompts: Sequence[str], build_index=False) -> np.ndarray:
        """
        Transform texts into L2-normalized numerical vectors.

        Args:
            prompts: The sequence of raw corpus prompts.
            build_index: When true, also make sure a FAISS index file
                (``prompts_index.faiss``) exists under ``VECTORDB_PATH``.
                An existing file is reused as-is; it is NOT checked against
                ``prompts``, so delete it manually when the corpus changes.

        Returns:
            Vectorized prompts as a numpy array with one row per prompt. The
            array is returned in both modes so the result always matches the
            declared return type.
        """
        embeddings = self.model.encode(prompts, show_progress_bar=True)

        # Normalize each row to unit length. A zero-norm row would otherwise
        # divide by zero and turn into NaNs; dividing by 1 keeps it all-zero.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        embeddings = embeddings / norms

        if build_index:
            index_path = os.path.join(VECTORDB_PATH, 'prompts_index.faiss')
            if os.path.isfile(index_path):
                print('Embeddings already stored in vector db')
            else:
                # write_index does not create missing parent directories.
                os.makedirs(VECTORDB_PATH, exist_ok=True)
                faiss.write_index(self._build_index(embeddings), index_path)

        return embeddings

    def _build_index(self, embeddings: np.ndarray) -> "faiss.IndexFlatIP":
        """
        Build and return a FAISS index for the given embeddings.

        Args:
            embeddings: A numpy array of prompt embeddings, one row per prompt.

        Returns:
            A flat inner-product FAISS index containing all rows. With
            unit-length rows the inner product is the cosine similarity.
        """
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)
        return index
def cosine_similarity(query_vector: np.ndarray, corpus_vectors: np.ndarray) -> np.ndarray:
    """
    Score every corpus vector against a single query vector.

    The score is a plain dot product, which equals the cosine similarity only
    when both inputs are already L2-normalized; no normalization is done here.

    Args:
        query_vector: Vectorized prompt query of shape (1, D).
        corpus_vectors: Vectorized prompt corpus of shape (N, D).

    Returns:
        A vector of shape (N,). For unit-length inputs the values lie in
        [-1, 1], where 1 is maximum similarity.
    """
    # (N, D) . (D, 1) -> (N, 1); collapse the trailing axis into a flat vector.
    scores = np.dot(corpus_vectors, query_vector.T)
    return scores.flatten()
class PromptSearchEngine:
    """Find the corpus prompts most similar to a query prompt.

    Works in one of two modes: a brute-force dot product against in-memory
    embeddings, or a lookup in a FAISS index read from ``VECTORDB_PATH``.
    """

    def __init__(self, corpus: str, model_name: str = 'all-MiniLM-L6-v2', use_index=False) -> None:
        """
        Initialize search engine by vectorizing prompt corpus.

        Args:
            corpus: Path to the parquet dataset with raw prompts; its
                ``prompt`` column is loaded into memory.
            model_name: The name of the pre-trained embedding model.
            use_index: When true, searches go through the FAISS index file
                ``prompts_index.faiss`` under ``VECTORDB_PATH`` instead of the
                in-memory embeddings.
        """
        self.use_index = use_index
        self.prompts = pd.read_parquet(corpus)['prompt'].to_list()
        self.vectorizer = Vectorizer(model_name)
        # With build_index=True the vectorizer also makes sure the index file
        # exists, so it can be read back right below.
        self.embeddings = self.vectorizer.transform(self.prompts, build_index=use_index)
        if use_index:
            # NOTE: a pre-existing index file is reused as-is and is assumed
            # to have been built from this same corpus, in the same order.
            self.index = faiss.read_index(os.path.join(VECTORDB_PATH, 'prompts_index.faiss'))

    def most_similar(self, query: str, n: int = 5) -> List[dict]:
        """
        Return top n most similar prompts from the corpus.

        The query is vectorized with the Vectorizer and then scored either by
        the FAISS index (``use_index=True``) or by ``cosine_similarity``
        against the in-memory corpus embeddings.

        Args:
            query: The raw query prompt input from the user.
            n: The number of similar prompts to return from the corpus.

        Returns:
            A list of at most n dicts ``{'prompt': str, 'score': float}``,
            ordered from most to least similar. Prompts are returned verbatim.
            An empty list is returned when n is not positive.
        """
        if n <= 0:
            return []

        query_vector = self.vectorizer.transform([query])

        if self.use_index:
            distances, indices = self.index.search(query_vector, n)
            # FAISS pads the result with index -1 when fewer than n vectors
            # are available; skip those instead of wrapping around to
            # self.prompts[-1].
            return [
                {'prompt': self.prompts[idx], 'score': float(score)}
                for score, idx in zip(distances[0], indices[0])
                if idx >= 0
            ]

        similarities = cosine_similarity(query_vector, self.embeddings)
        top_indices = np.argsort(-similarities)[:n]  # descending by score
        return [{'prompt': self.prompts[i], 'score': float(similarities[i])} for i in top_indices]