|
import os |
|
import string |
|
from typing import Any, Dict, List, Tuple, Union |
|
|
|
import chromadb |
|
import numpy as np |
|
import openai |
|
import pandas as pd |
|
import requests |
|
import streamlit as st |
|
from datasets import load_dataset |
|
from langchain.document_loaders import TextLoader |
|
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings |
|
from langchain.text_splitter import CharacterTextSplitter |
|
from langchain.vectorstores import Chroma |
|
from scipy.spatial.distance import cosine |
|
|
|
# Configure the OpenAI client at import time; raises KeyError immediately
# if OPENAI_API_KEY is not set in the environment.
openai.api_key = os.environ["OPENAI_API_KEY"]
|
|
|
|
|
def merge_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Concatenate several DataFrames and keep only the QA-related columns.

    Parameters:
        dataframes (List[pd.DataFrame]): DataFrames to combine; each must
            contain 'context', 'questions', and 'answers' columns.

    Returns:
        pd.DataFrame: A single DataFrame with a fresh integer index,
        restricted to the columns 'context', 'questions', and 'answers'.
    """
    wanted_columns = ["context", "questions", "answers"]
    merged = pd.concat(dataframes, ignore_index=True)
    return merged[wanted_columns]
|
|
|
|
|
def call_chatgpt(prompt: str) -> str:
    """
    Generate a completion for *prompt* via OpenAI's legacy Completion API.

    Args:
        prompt: The text prompt forwarded to the model.

    Returns:
        The text generated by the model for the prompt.
    """
    # Fixed sampling configuration for the gpt-3.5-turbo-instruct model.
    completion = openai.Completion.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        temperature=0.5,
        max_tokens=500,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    # The API returns a list of choices; only the first is used here.
    return completion.choices[0]["text"]
|
|
|
|
|
def openai_text_embedding(prompt: str) -> List[float]:
    """
    Retrieve the embedding vector for a prompt using OpenAI's
    "text-embedding-ada-002" model.

    Parameters:
        prompt (str): The text input for which to generate an embedding.

    Returns:
        List[float]: The embedding vector for the input text. (The
        original annotation said ``str``, but the value extracted from
        ``data[0]["embedding"]`` is a list of floats — callers in this
        file pass it straight to ``np.asarray``.)
    """
    response = openai.Embedding.create(
        input=prompt, model="text-embedding-ada-002"
    )
    return response["data"][0]["embedding"]
|
|
|
|
|
def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
    """
    Compute Semantic Textual Similarity (STS) between two sentences.

    Each sentence is embedded with OpenAI's text-embedding model and the
    score is the cosine similarity of the two embedding vectors (one
    minus the cosine distance); a higher score means greater similarity.

    Parameters:
        sentence1 (str): The first sentence to compare.
        sentence2 (str): The second sentence to compare.

    Returns:
        float: The STS score for the pair of sentences.
    """
    vec_a = np.asarray(openai_text_embedding(sentence1))
    vec_b = np.asarray(openai_text_embedding(sentence2))
    # scipy's `cosine` is a distance, so convert it to a similarity.
    return 1 - cosine(vec_a, vec_b)
|
|
|
|
|
def add_dist_score_column(
    dataframe: pd.DataFrame,
    sentence: str,
) -> pd.DataFrame:
    """
    Score every question in *dataframe* against *sentence* and return the
    five closest matches.

    A 'stsopenai' column holding the STS score between *sentence* and each
    entry of the 'questions' column is added to *dataframe* (note: the
    input frame is modified in place), then rows are ranked by that score
    in descending order.

    Parameters:
        dataframe (pd.DataFrame): Frame containing a 'questions' column.
        sentence (str): Query sentence compared against each question.

    Returns:
        pd.DataFrame: The five rows with the highest 'stsopenai' scores,
        including the new score column.
    """
    scores = dataframe["questions"].apply(
        lambda question: calculate_sts_openai_score(str(question), sentence)
    )
    dataframe["stsopenai"] = scores
    ranked = dataframe.sort_values(by="stsopenai", ascending=False)
    # Keep only the five best-matching questions.
    return ranked.iloc[:5, :]
|
|
|
|
|
def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
    """
    Convert a QA DataFrame into an OpenAI chat-style message list.

    Each row yields two dicts with 'role'/'content' keys: a 'user'
    message holding the question, followed by an 'assistant' message
    holding the answer. (The original docstring claimed the dicts were
    keyed 'question'/'answer', which did not match the produced output.)

    Args:
        df: A pandas DataFrame with columns named 'questions' and 'answers'.

    Returns:
        A flat list of chat-message dicts, alternating user/assistant,
        in row order. Empty input produces an empty list.
    """
    messages: List[Dict[str, str]] = []
    # zip over the two columns directly: the row index from iterrows()
    # was never used, and column iteration avoids per-row Series creation.
    for question, answer in zip(df["questions"], df["answers"]):
        messages.append({"role": "user", "content": question})
        messages.append({"role": "assistant", "content": answer})
    return messages
|
|
|
|
|
def query(payload: Dict[str, Any]) -> Dict[str, Any]:
    """
    POST a JSON payload to the hosted inference endpoint and return the
    decoded JSON response.

    Args:
        payload (Dict[str, Any]): The JSON payload to be sent to the API.

    Returns:
        Dict[str, Any]: The JSON response received from the API.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    API_URL = "https://sks7h7h5qkhoxwxo.us-east-1.aws.endpoints.huggingface.cloud"
    headers = {"Accept": "application/json", "Content-Type": "application/json"}
    # A timeout keeps a stalled endpoint from hanging the caller forever;
    # the original call had none, which blocks indefinitely by default.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
    return response.json()
|
|
|
|
|
def llama2_7b_ysa(prompt: str) -> str:
    """
    Send *prompt* to the hosted llama2-7b endpoint and return its output.

    The request is built with a fixed ``max_new_tokens`` limit of 20 and
    dispatched through the module-level ``query`` helper. The endpoint is
    expected to reply with a list of dicts whose first element carries a
    'generated_text' key.

    Parameters:
        prompt (str): The text prompt to send to the model.

    Returns:
        str: The generated text extracted from the model response.
    """
    payload: Dict[str, Any] = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 20},
    }
    model_output = query(payload)
    # Response shape: [{"generated_text": "..."}]; take the first entry.
    generated: str = model_output[0]["generated_text"]
    return generated
|
|
|
|
|
def quantize_to_kbit(arr: Union[np.ndarray, Any], k: int = 16) -> np.ndarray:
    """Quantize an array to k discrete levels via min-max normalization.

    Values are linearly rescaled to [0, 1] and rounded onto the integer
    grid 0..k-1 (k=16 corresponds to 4-bit quantization).

    Args:
        arr (Union[np.ndarray, Any]): Input array (anything ``np.array``
            accepts, e.g. a list of numbers).
        k (int): Number of quantization levels. Defaults to 16.

    Returns:
        np.ndarray: Integer array with values in 0..k-1. A constant input
        maps to all zeros (the original code divided by zero here,
        producing NaNs).
    """
    if not isinstance(arr, np.ndarray):
        arr = np.array(arr)
    arr_min = arr.min()
    arr_max = arr.max()
    span = arr_max - arr_min
    if span == 0:
        # All values identical: min-max normalization is undefined, so
        # place everything in the lowest bucket instead of dividing by 0.
        return np.zeros(arr.shape, dtype=int)
    normalized_arr = (arr - arr_min) / span
    return np.round(normalized_arr * (k - 1)).astype(int)
|
|
|
|
|
def quantized_influence(arr1: np.ndarray, arr2: np.ndarray, k: int = 16, use_dagger: bool = False) -> Tuple[float, List[float]]:
    """
    Measure the influence of one array on another over their quantized forms.

    Both inputs are quantized to k levels. For each distinct level of the
    quantized *arr1*, the squared deviation of the group mean of *arr2*
    from its global mean is weighted by the squared group size; the mean
    of these weighted terms, divided by the standard deviation of the
    quantized *arr2*, is the influence measure.

    Args:
        arr1 (np.ndarray): Array whose quantized levels define the groups.
        arr2 (np.ndarray): Array whose grouped means drive the measure.
        k (int): Quantization level count, defaults to 16 (4-bit).
        use_dagger (bool): When True, the second return value maps each
            quantized arr1 entry to its group's mean of quantized arr2;
            otherwise it is simply the quantized arr1 as a list.

    Returns:
        Tuple[float, List[float]]: The influence measure and the list of
        (optionally transformed) quantized arr1 values.
    """
    x_q = quantize_to_kbit(arr1, k)
    y_q = quantize_to_kbit(arr2, k)

    levels = np.unique(x_q)
    y_mean_global = np.mean(y_q)

    # One weighted term per distinct level of the quantized arr1.
    weighted_terms = []
    for level in levels:
        group = y_q[x_q == level]
        weighted_terms.append(
            (np.mean(group) - y_mean_global) ** 2 * len(group) ** 2
        )
    qim = np.mean(weighted_terms) / np.std(y_q)

    if use_dagger:
        # Replace each quantized arr1 value with its group's local mean.
        level_to_mean = {level: np.mean(y_q[x_q == level]) for level in levels}
        daggered_values = [level_to_mean[value] for value in x_q]
    else:
        daggered_values = x_q.tolist()

    return qim, daggered_values
|
|