import json
import os

import faiss
import numpy as np
import pandas as pd
import torch

from src.pytorch_modules.datasets.schema_string_dataset import SchemaStringDataset
class UtilsSearch:
    def __init__(self, config):
        self.config = config
    @staticmethod
    def dataframe_to_index(df):
        """Build a FAISS index from the 'embeddings' column of a DataFrame."""
        embeddings = np.stack(df['embeddings'].to_numpy()).astype(np.float32)  # FAISS expects float32
        # Normalize the embeddings so that L2 distance is monotonic in cosine similarity
        norm_embeddings = np.ascontiguousarray(embeddings / np.linalg.norm(embeddings, axis=1)[:, None])
        # Build a flat (exact) L2 index over the normalized embeddings
        dimension = norm_embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(norm_embeddings)
        return index
    def retrieve(self, query, df, model, index, top_k=100, api=False):
        """
        Search the DataFrame for the given query and return a sorted DataFrame based on similarity.

        :param query: The search query string.
        :param df: The input DataFrame containing embeddings.
        :param model: The model (or API client) used to embed the query.
        :param index: The FAISS index to search.
        :param top_k: The number of top results to return.
        :param api: If True, embed the query via the remote embeddings API instead of a local model.
        :return: A new DataFrame sorted by similarity to the query, with a 'similarities' column.
        """
        # Prepend the retrieval instruction expected by the embedding model
        query = "Represent this sentence for searching relevant passages: " + query
        if not api:
            # Check if CUDA is available and set the device accordingly
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            model.to(device)
            # Compute the query embedding locally
            query_vector = model.encode(query, convert_to_tensor=True, device=device).cpu().numpy()
        else:
            res = model.embeddings(
                input=[query],
                model=self.config["sentence_transformer_name"],
                prompt=None,
            )
            query_vector = np.array(res.data[0].embedding).astype(np.float32)
        # Normalize the query vector so distances are comparable to the normalized index
        query_vector /= np.linalg.norm(query_vector)
        # Perform the search
        distances, indices = index.search(np.array([query_vector]), top_k)
        # Retrieve the rows from the DataFrame corresponding to the returned indices
        retrieved_df = df.iloc[indices[0]]
        # Attach the distances as a new column named 'similarities'
        # (with IndexFlatL2 these are squared L2 distances, so smaller means more similar)
        retrieved_df = retrieved_df.assign(similarities=distances[0])
        # Sort so the closest matches come first
        retrieved_df = retrieved_df.sort_values(by='similarities', ascending=True)
        # Reset the index so downstream consumers get a clean, serializable DataFrame
        retrieved_df = retrieved_df.reset_index(drop=True)
        return retrieved_df
    def rerank(self, query, df_top_100, cross_encoder, index):
        """Rerank the retrieved rows with a cross-encoder and return them ordered by relevance."""
        config = self.config
        df_copy = df_top_100.copy().reset_index(drop=True)
        # Convert the top retrieved records to a list of dictionaries for processing
        records = df_copy.to_dict(orient='records')[:100]
        # Build string representations of the records for the cross-encoder
        dataset_str = SchemaStringDataset(records, config)
        # Truncate each document to keep the cross-encoder inputs short
        documents = [batch["inputs"][:256] for batch in dataset_str]
        # Rank documents against the query and keep the ids of the top matches
        ids = [item["corpus_id"] for item in cross_encoder.rank(query, documents, top_k=10)]
        # Use the ids to filter and reorder the original DataFrame
        df_sorted_by_relevance = df_copy.loc[ids]
        return df_sorted_by_relevance
    def search(self, query, df, model, cross_encoder, index):
        """Retrieve candidates with the bi-encoder, then rerank them with the cross-encoder."""
        sorted_df = self.retrieve(query, df, model, index)
        return self.rerank(query, sorted_df, cross_encoder, index)
    @staticmethod
    def top_10_common_values(df, column_name):
        """
        Take a pandas DataFrame and a column name, and return the top 10 most
        common non-null values of that column as a list.
        """
        # Drop nulls, count occurrences, and return the 10 most common values as a list
        value_counts_list = df[column_name].dropna().value_counts().head(10).index.tolist()
        return value_counts_list
    @staticmethod
    def filter_dataframe(df, config, top_k_programmatic=100):
        """
        Filter a DataFrame based on scalar and discrete column configurations, with type handling and null filtering.

        Parameters:
        - df: pandas.DataFrame to filter.
        - config: Dictionary containing 'scalar_columns' and 'discrete_columns' configurations.
        - top_k_programmatic: Maximum number of rows to return when a 'similarities' column is present.

        Returns:
        - Filtered pandas.DataFrame.
        """
        scalar_columns = config.get('scalar_columns', [])
        discrete_columns = config.get('discrete_columns', [])
        # Combine all configured column names to check for nulls
        all_columns = [col["column_name"] for col in scalar_columns] + [col["column_name"] for col in discrete_columns]
        # Drop rows where any of the specified columns have null values
        df = df.dropna(subset=all_columns)
        # Filter based on scalar (numeric range) columns
        for col in scalar_columns:
            column_name = col["column_name"]
            # Ensure min_value and max_value are numeric
            min_value = float(col["min_value"])
            max_value = float(col["max_value"])
            # Convert the DataFrame column to a numeric type to avoid comparison issues
            df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
            df = df[df[column_name].between(min_value, max_value)]
        # Filter based on discrete (categorical) columns
        for col in discrete_columns:
            column_name = col["column_name"]
            default_values = col["default_values"]
            if len(default_values) > 0:
                df = df[df[column_name].isin(default_values)]
        if 'similarities' in df.columns:
            # 'similarities' holds L2 distances, so sort ascending and keep the closest matches
            df = df.sort_values(by='similarities', ascending=True).head(top_k_programmatic)
        return df
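
    # Illustrative only: a hypothetical `config` shape that filter_dataframe above would
    # accept (the actual keys and values come from this project's own configuration):
    #
    # config = {
    #     "scalar_columns": [
    #         {"column_name": "price", "min_value": 0, "max_value": 500},
    #     ],
    #     "discrete_columns": [
    #         {"column_name": "category", "default_values": ["laptops"]},
    #     ],
    # }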
    @staticmethod
    def drop_columns(df, config):
        """Drop the columns listed under 'columns_to_drop' in the config."""
        columns_to_drop = config.get('columns_to_drop', [])
        df_dropped = df.drop(columns_to_drop, axis=1)
        return df_dropped
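

# A minimal usage sketch (not part of the original module). It assumes a parquet file
# with an 'embeddings' column and a local sentence-transformers bi-encoder; the file
# path, model name, and empty config below are placeholders, not values defined
# elsewhere in this project.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    config = {}  # project-specific settings would go here
    utils = UtilsSearch(config)

    # Load a corpus whose rows already carry precomputed embeddings (hypothetical path)
    df = pd.read_parquet("corpus_with_embeddings.parquet")
    index = UtilsSearch.dataframe_to_index(df)

    # Hypothetical model choice; it should match the model used to embed the corpus
    model = SentenceTransformer("BAAI/bge-base-en-v1.5")
    results = utils.retrieve("affordable gaming laptop", df, model, index, top_k=10)
    print(results.head())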