Spaces:

ACRLab
/

FraleyLabAttachmentBot

Sleeping

App Files Files Community

AjithKSenthil committed on May 16, 2023

Commit

8b16ee5

1 Parent(s): c77f143

modified it to use our data now

Browse files

Files changed (3) hide show

ChatAttachmentAnalysis.py +5 -3
ChatAttachmentAnalysisWithXG.py +17 -8
ObtainDataEmbedding.py +11 -5

ChatAttachmentAnalysis.py CHANGED Viewed

@@ -6,16 +6,16 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, mean_absolute_error
 # Read your data file
-datafile_path = "data/chat_transcripts_with_embeddings.csv"
 df = pd.read_csv(datafile_path)
 # Convert embeddings to numpy arrays
-df["embedding"] = df.embedding.apply(eval).apply(np.array)
 # Split the data into features (X) and labels (y)
 X = list(df.embedding.values)
-y = df[['attachment', 'avoidance']] # Assuming your attachment scores are in these two columns
 # Split data into training and testing sets
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
@@ -32,3 +32,5 @@ mse = mean_squared_error(y_test, preds)
 mae = mean_absolute_error(y_test, preds)
 print(f"Chat transcript embeddings performance: mse={mse:.2f}, mae={mae:.2f}")

 from sklearn.metrics import mean_squared_error, mean_absolute_error
 # Read your data file
+datafile_path = "data/chat_transcripts_with_embeddings_and_scores.csv"
 df = pd.read_csv(datafile_path)
 # Convert embeddings to numpy arrays
+df['embedding'] = df['embedding'].apply(lambda x: [float(num) for num in x.strip('[]').split(',')])
 # Split the data into features (X) and labels (y)
 X = list(df.embedding.values)
+y = ['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']
 # Split data into training and testing sets
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 mae = mean_absolute_error(y_test, preds)
 print(f"Chat transcript embeddings performance: mse={mse:.2f}, mae={mae:.2f}")

ChatAttachmentAnalysisWithXG.py CHANGED Viewed

@@ -1,27 +1,36 @@
 import pandas as pd
 import numpy as np
-from sklearn.multioutput import MultiOutputRegressor
 import xgboost as xgb
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, mean_absolute_error
-datafile_path = "data/chat_transcripts_with_embeddings.csv"
 df = pd.read_csv(datafile_path)
-df["embedding"] = df.embedding.apply(eval).apply(np.array)
-X = np.array(df.embedding.tolist())
-y = df[["Attachment", "Avoidance"]]
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)
-multioutputregressor = MultiOutputRegressor(xg_reg).fit(X_train, y_train)
-preds = multioutputregressor.predict(X_test)
 mse = mean_squared_error(y_test, preds)
 mae = mean_absolute_error(y_test, preds)
 print(f"ada-002 embedding performance on chat transcripts: mse={mse:.2f}, mae={mae:.2f}")

 import pandas as pd
 import numpy as np
 import xgboost as xgb
+from sklearn.multioutput import MultiOutputRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, mean_absolute_error
+datafile_path = "data/chat_transcripts_with_embeddings_and_scores.csv"
 df = pd.read_csv(datafile_path)
+df['embedding'] = df['embedding'].apply(lambda x: [float(num) for num in x.strip('[]').split(',')])
+y_columns = ['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']
+y = df[y_columns].values
+X_train, X_test, y_train, y_test = train_test_split(list(df.embedding.values), y, test_size=0.2, random_state=42)
 xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)
+multioutput_reg = MultiOutputRegressor(xg_reg)
+multioutput_reg.fit(np.array(X_train).tolist(), y_train)
+preds = multioutput_reg.predict(np.array(X_test).tolist())
 mse = mean_squared_error(y_test, preds)
 mae = mean_absolute_error(y_test, preds)
 print(f"ada-002 embedding performance on chat transcripts: mse={mse:.2f}, mae={mae:.2f}")
+# The mean squared error (MSE) and mean absolute error (MAE) are both metrics for assessing the performance of our regression model.
+# MSE is calculated by taking the average of the squared differences between the predicted and actual values. It gives more weight to larger errors because they are squared in the calculation. This means that a model could have a relatively high MSE due to a few large errors, even if it made smaller errors on a majority of the instances.
+# MAE, on the other hand, is calculated by taking the average of the absolute differences between the predicted and actual values. This metric gives equal weight to all errors and is less sensitive to outliers than MSE.

ObtainDataEmbedding.py CHANGED Viewed

@@ -1,7 +1,13 @@
 # imports
 import pandas as pd
 import tiktoken
 from openai.embeddings_utils import get_embedding
 # embedding model parameters
 embedding_model = "text-embedding-ada-002"
@@ -9,24 +15,24 @@ embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-0
 max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
 # load & inspect dataset
-input_datapath = "data/chat_transcripts.csv"
 df = pd.read_csv(input_datapath, index_col=0)
-df = df[["ChatTranscript", "Attachment", "Avoidance"]]
 df = df.dropna()
 df.head(2)
 # Filter out chat transcripts that are too long to embed. A rough estimate of the maximum length is about 1638 words (8191 tokens / 5).
 encoding = tiktoken.get_encoding(embedding_encoding)
-df["n_tokens"] = df.ChatTranscript.apply(lambda x: len(encoding.encode(x)))
 df = df[df.n_tokens <= max_tokens]
 len(df)
 # Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage
 # This may take a few minutes
-df["embedding"] = df.ChatTranscript.apply(lambda x: get_embedding(x, engine=embedding_model))
-df.to_csv("data/chat_transcripts_with_embeddings.csv")
 # Please replace "data/chat_transcripts.csv" with the path to your actual data file. Also, replace 'ChatTranscript', 'Attachment', 'Avoidance' with the actual column names of your chat transcripts and attachment scores in your data file.

+# ObtainDataEmbedding.py
 # imports
+import openai
 import pandas as pd
 import tiktoken
 from openai.embeddings_utils import get_embedding
+import config
+# set your API key
+openai.api_key = "your openai api key"
 # embedding model parameters
 embedding_model = "text-embedding-ada-002"
 max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
 # load & inspect dataset
+input_datapath = "data/attachmentchatdata-formated.csv"
 df = pd.read_csv(input_datapath, index_col=0)
+df = df[["userid", "chathistory", "avoide", "avoida", "avoidb", "avoidc", "avoidd", "anxietye", "anxietya", "anxietyb", "anxietyc", "anxietyd"]]
 df = df.dropna()
 df.head(2)
 # Filter out chat transcripts that are too long to embed. A rough estimate of the maximum length is about 1638 words (8191 tokens / 5).
 encoding = tiktoken.get_encoding(embedding_encoding)
+df["n_tokens"] = df.chathistory.apply(lambda x: len(encoding.encode(x)))
 df = df[df.n_tokens <= max_tokens]
 len(df)
 # Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage
 # This may take a few minutes
+df["embedding"] = df.chathistory.apply(lambda x: get_embedding(x, engine=embedding_model))
+df.to_csv("data/chat_transcripts_with_embeddings_and_scores.csv")
 # Please replace "data/attachmentchatdata-formated.csv" with the path to your actual data file. Also, replace 'userid', 'chathistory', and the 'avoid*'/'anxiety*' score columns with the actual column names of your chat transcripts and attachment scores in your data file.