|
import os |
|
import string |
|
from typing import Any, Dict, List, Tuple, Union |
|
|
|
import chromadb |
|
import numpy as np |
|
import openai |
|
import pandas as pd |
|
import requests |
|
import streamlit as st |
|
from datasets import load_dataset |
|
from langchain.document_loaders import TextLoader |
|
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings |
|
from langchain.text_splitter import CharacterTextSplitter |
|
from langchain.vectorstores import Chroma |
|
from scipy.spatial.distance import cosine |
|
|
|
# Configure the OpenAI client from the environment at import time. A missing
# OPENAI_API_KEY raises KeyError here, before any function below can be called.
openai.api_key = os.environ["OPENAI_API_KEY"]
|
|
|
|
|
def merge_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Concatenate several DataFrames and keep only the Q&A columns.

    The frames are stacked row-wise with a fresh 0..n-1 index and the result
    is reduced to the columns 'context', 'questions' and 'answers', in that
    order. Every other column is dropped.

    Parameters:
        dataframes (List[pd.DataFrame]): The frames to merge. May be empty.

    Returns:
        pd.DataFrame: The concatenated frame restricted to the three columns.
            For an empty input list, an empty frame with those three columns.

    Raises:
        KeyError: If one of the three columns is absent from every input
            frame (a column present in only some frames is filled with NaN
            by the concatenation instead).
    """
    required_columns = ["context", "questions", "answers"]

    # pd.concat raises ValueError on an empty sequence; return a frame with
    # the expected schema instead so callers can treat "no data" uniformly.
    if not dataframes:
        return pd.DataFrame(columns=required_columns)

    combined = pd.concat(dataframes, ignore_index=True)
    return combined[required_columns]
|
|
|
|
|
def call_chatgpt(prompt: str) -> str:
    """
    Send a prompt to OpenAI's completion endpoint and return the generated text.

    Args:
        prompt: The text the model should complete.

    Returns:
        The text of the first completion choice in the API response.
    """
    # Fixed request settings for the "gpt-3.5-turbo-instruct" completion model.
    request_settings = {
        "model": "gpt-3.5-turbo-instruct",
        "prompt": prompt,
        "temperature": 0.5,
        "max_tokens": 500,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    completion = openai.Completion.create(**request_settings)

    # Only the first returned choice is used.
    return completion.choices[0]["text"]
|
|
|
|
|
def openai_text_embedding(prompt: str) -> List[float]:
    """
    Return the embedding vector for a text using OpenAI's embedding endpoint.

    The request is made with the "text-embedding-ada-002" model, and the
    embedding of the first (only) item in the response is returned.

    Parameters:
        prompt (str): The text to embed.

    Returns:
        List[float]: The embedding vector taken from
            ``response["data"][0]["embedding"]``. It is a numeric sequence
            (not a string); callers in this module pass it to ``np.asarray``.
    """
    response = openai.Embedding.create(
        input=prompt, model="text-embedding-ada-002"
    )

    # A single input string yields a single entry under "data".
    return response["data"][0]["embedding"]
|
|
|
|
|
def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
    """
    Compute the Semantic Textual Similarity (STS) of two sentences.

    Each sentence is embedded with `openai_text_embedding`, and the score is
    the cosine similarity of the two embedding vectors. Higher means more
    similar.

    Parameters:
        sentence1 (str): The first sentence to compare.
        sentence2 (str): The second sentence to compare.

    Returns:
        float: The cosine similarity between the two sentence embeddings.
    """
    # Embed sentence1 first, then sentence2, converting each to an array.
    vector_a, vector_b = (
        np.asarray(openai_text_embedding(text)) for text in (sentence1, sentence2)
    )

    # scipy's `cosine` is a distance; subtracting from 1 gives the similarity.
    return 1 - cosine(vector_a, vector_b)
|
|
|
|
|
def add_dist_score_column(
    dataframe: pd.DataFrame,
    sentence: str,
    top_k: int = 5,
) -> pd.DataFrame:
    """
    Score every question against a sentence and return the best matches.

    A 'stsopenai' column holding the cosine similarity between the embedding
    of each entry in the 'questions' column and the embedding of `sentence`
    is added to `dataframe`. The frame is then sorted by that column in
    descending order and the first `top_k` rows are returned.

    Note that the 'stsopenai' column is written onto the frame that was
    passed in, so the caller's DataFrame is modified in place.

    Parameters:
        dataframe (pd.DataFrame): A DataFrame containing a 'questions' column.
        sentence (str): The sentence to compare each question against.
        top_k (int): Number of highest-scoring rows to return. Defaults to 5;
            expected to be non-negative.

    Returns:
        pd.DataFrame: The `top_k` rows with the highest 'stsopenai' score,
            including all original columns plus 'stsopenai'.
    """
    # The query sentence is the same for every row, so embed it once up front
    # rather than issuing one extra embedding request per question.
    sentence_embedding = np.asarray(openai_text_embedding(sentence))

    def _similarity_to_sentence(question: Any) -> float:
        # Questions are coerced to str because the column may hold non-strings.
        question_embedding = np.asarray(openai_text_embedding(str(question)))
        # scipy's `cosine` is a distance; 1 - distance is the similarity.
        return 1 - cosine(question_embedding, sentence_embedding)

    dataframe["stsopenai"] = dataframe["questions"].apply(_similarity_to_sentence)

    ranked = dataframe.sort_values(by="stsopenai", ascending=False)
    return ranked.iloc[:top_k, :]
|
|
|
|
|
def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
    """
    Convert question/answer rows into a flat list of chat-style messages.

    Every row contributes two consecutive dictionaries, in row order:
    ``{"role": "user", "content": <question>}`` followed by
    ``{"role": "assistant", "content": <answer>}``. The result therefore has
    twice as many entries as `df` has rows.

    Args:
        df: A pandas DataFrame with columns named 'questions' and 'answers'.
            Other columns are ignored.

    Returns:
        A list of message dictionaries, each with the keys 'role' and
        'content'. Empty when `df` has no rows.

    Raises:
        KeyError: If `df` has rows but lacks a 'questions' or 'answers' column.
    """
    messages: List[Dict[str, str]] = []

    # Plain per-row dicts avoid the per-row Series that iterrows() builds.
    for record in df.to_dict("records"):
        messages.extend(
            (
                {"role": "user", "content": record["questions"]},
                {"role": "assistant", "content": record["answers"]},
            )
        )

    return messages
|
|
|
|
|
def query(payload: Dict[str, Any], timeout: float = 120.0) -> Any:
    """
    POST a JSON payload to the hosted inference endpoint and return its JSON reply.

    Args:
        payload (Dict[str, Any]): The JSON payload to be sent to the API.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout `requests` waits indefinitely, which would
            hang the caller if the endpoint stops responding.

    Returns:
        Any: The decoded JSON body of the response. `llama2_7b_ysa` in this
            module indexes it as a list of dictionaries; the HTTP status is
            not checked here, so an error body is returned as-is.

    Raises:
        requests.exceptions.Timeout: If the endpoint does not respond within
            `timeout` seconds.
    """
    API_URL = "https://sks7h7h5qkhoxwxo.us-east-1.aws.endpoints.huggingface.cloud"

    headers = {"Accept": "application/json", "Content-Type": "application/json"}

    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)

    return response.json()
|
|
|
|
|
def llama2_7b_ysa(prompt: str, max_new_tokens: int = 200) -> str:
    """
    Send a prompt to the hosted model via `query` and return the generated text.

    The request payload carries the prompt under "inputs" and a "parameters"
    dictionary containing "max_new_tokens". The response is expected to be a
    list whose first element is a dictionary with a 'generated_text' key.

    Parameters:
        - prompt (str): The text prompt to send to the model.
        - max_new_tokens (int): Value sent as the "max_new_tokens" request
          parameter. Defaults to 200.

    Returns:
        - str: The 'generated_text' value of the first element of the response.

    Note:
        - Relies on the `query` function defined in this module to perform
          the HTTP request.
        - If the response does not have the expected shape (for example an
          error object instead of a list), the indexing below raises
          KeyError or IndexError.
    """
    query_payload: Dict[str, Any] = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": max_new_tokens},
    }

    output = query(query_payload)

    # Only the first generation in the response is used.
    response: str = output[0]["generated_text"]
    return response
|
|
|
|
|
def quantize_to_4bit(arr: Union[np.ndarray, Any]) -> np.ndarray:
    """
    Quantize an array to 16 integer levels (0-15) by min-max scaling.

    The input is converted to a float array, shifted and scaled so that its
    minimum maps to 0 and its maximum to 1, multiplied by 15 and rounded to
    the nearest integer (NumPy rounds exact halves to the nearest even value).

    Parameters:
        arr (Union[np.ndarray, Any]): An array, or anything that can be
            converted to a numeric numpy ndarray.

    Returns:
        np.ndarray: An integer array of the same shape as the input with
            values in the range 0-15. An empty input yields an empty array,
            and an input whose values are all equal yields all zeros.

    Examples:
        >>> quantize_to_4bit([0, 128, 255])
        array([ 0,  8, 15])
    """
    # Work in float64 so that `values - lowest` cannot overflow for narrow
    # integer dtypes (e.g. int8 data spanning -128..127).
    values = np.asarray(arr, dtype=float)

    # min()/max() raise on empty arrays; there is nothing to quantize anyway.
    if values.size == 0:
        return np.zeros(values.shape, dtype=int)

    lowest = values.min()
    value_range = values.max() - lowest

    # A constant array has zero range; map everything to level 0 instead of
    # dividing by zero and casting NaN to int.
    if value_range == 0:
        return np.zeros(values.shape, dtype=int)

    normalized = (values - lowest) / value_range
    return np.round(normalized * 15).astype(int)
|
|
|
|
|
def quantized_influence(arr1: np.ndarray, arr2: np.ndarray) -> float:
    """
    Measure how strongly the quantized values of `arr1` group the values of `arr2`.

    Both arrays are quantized with `quantize_to_4bit`. For every distinct
    quantized value of `arr1`, the entries of the quantized `arr2` at the same
    positions form a group; the group's mean deviation from the global mean
    of the quantized `arr2` is squared and multiplied by the squared group
    size. The mean of these terms over all groups is divided by the standard
    deviation of the quantized `arr2`.

    Parameters:
        arr1 (np.ndarray): The first input array (defines the groups).
        arr2 (np.ndarray): The second input array; must have the same shape
            as `arr1` so the group masks can index it.

    Returns:
        float: The normalized influence value. 0.0 when the quantized `arr2`
            has zero standard deviation, since every group mean then equals
            the global mean and the ratio would otherwise be 0/0.

    Note:
        Relies on `quantize_to_4bit` from this module for the 4-bit
        representation.
    """
    arr1_4bit = quantize_to_4bit(arr1)
    arr2_4bit = quantize_to_4bit(arr2)

    spread = np.std(arr2_4bit)
    # No variation in arr2 means no measurable influence; avoid dividing by 0.
    if spread == 0:
        return 0.0

    y_bar_global = np.mean(arr2_4bit)

    weighted_local_averages = []
    for val in np.unique(arr1_4bit):
        # Select the arr2 entries whose arr1 counterpart quantized to `val`.
        group = arr2_4bit[arr1_4bit == val]
        weighted_local_averages.append(
            np.mean(group - y_bar_global) ** 2 * len(group) ** 2
        )

    return float(np.mean(weighted_local_averages) / spread)
|
|