Spaces:
Sleeping
Sleeping
Commit
·
8b16ee5
1
Parent(s):
c77f143
modified it to use our data now
Browse files
- ChatAttachmentAnalysis.py +5 -3
- ChatAttachmentAnalysisWithXG.py +17 -8
- ObtainDataEmbedding.py +11 -5
ChatAttachmentAnalysis.py
CHANGED
@@ -6,16 +6,16 @@ from sklearn.model_selection import train_test_split
|
|
6 |
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
7 |
|
8 |
# Read your data file
|
9 |
-
datafile_path = "data/
|
10 |
|
11 |
df = pd.read_csv(datafile_path)
|
12 |
|
13 |
# Convert embeddings to numpy arrays
|
14 |
-
df[
|
15 |
|
16 |
# Split the data into features (X) and labels (y)
|
17 |
X = list(df.embedding.values)
|
18 |
-
y =
|
19 |
|
20 |
# Split data into training and testing sets
|
21 |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
@@ -32,3 +32,5 @@ mse = mean_squared_error(y_test, preds)
|
|
32 |
mae = mean_absolute_error(y_test, preds)
|
33 |
|
34 |
print(f"Chat transcript embeddings performance: mse={mse:.2f}, mae={mae:.2f}")
|
|
|
|
|
|
6 |
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
7 |
|
8 |
# Read your data file
|
9 |
+
datafile_path = "data/chat_transcripts_with_embeddings_and_scores.csv"
|
10 |
|
11 |
df = pd.read_csv(datafile_path)
|
12 |
|
13 |
# Convert the stringified embeddings into lists of floats
|
14 |
+
df['embedding'] = df['embedding'].apply(lambda x: [float(num) for num in x.strip('[]').split(',')])
|
15 |
|
16 |
# Split the data into features (X) and labels (y)
|
17 |
X = list(df.embedding.values)
|
18 |
+
y = ['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']
|
19 |
|
20 |
# Split data into training and testing sets
|
21 |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
|
|
32 |
mae = mean_absolute_error(y_test, preds)
|
33 |
|
34 |
print(f"Chat transcript embeddings performance: mse={mse:.2f}, mae={mae:.2f}")
|
35 |
+
|
36 |
+
|
ChatAttachmentAnalysisWithXG.py
CHANGED
@@ -1,27 +1,36 @@
|
|
1 |
import pandas as pd
|
2 |
import numpy as np
|
3 |
-
from sklearn.multioutput import MultiOutputRegressor
|
4 |
import xgboost as xgb
|
|
|
|
|
5 |
from sklearn.model_selection import train_test_split
|
6 |
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
7 |
|
8 |
-
datafile_path = "data/
|
9 |
|
10 |
df = pd.read_csv(datafile_path)
|
11 |
-
df[
|
12 |
|
13 |
-
|
14 |
-
y = df[
|
15 |
|
16 |
-
X_train, X_test, y_train, y_test = train_test_split(
|
17 |
|
18 |
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)
|
19 |
|
20 |
-
|
|
|
|
|
21 |
|
22 |
-
preds =
|
23 |
|
24 |
mse = mean_squared_error(y_test, preds)
|
25 |
mae = mean_absolute_error(y_test, preds)
|
26 |
|
27 |
print(f"ada-002 embedding performance on chat transcripts: mse={mse:.2f}, mae={mae:.2f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
import numpy as np
|
|
|
3 |
import xgboost as xgb
|
4 |
+
|
5 |
+
from sklearn.multioutput import MultiOutputRegressor
|
6 |
from sklearn.model_selection import train_test_split
|
7 |
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
8 |
|
9 |
+
datafile_path = "data/chat_transcripts_with_embeddings_and_scores.csv"
|
10 |
|
11 |
df = pd.read_csv(datafile_path)
|
12 |
+
df['embedding'] = df['embedding'].apply(lambda x: [float(num) for num in x.strip('[]').split(',')])
|
13 |
|
14 |
+
y_columns = ['avoide', 'avoida', 'avoidb', 'avoidc', 'avoidd', 'anxietye', 'anxietya', 'anxietyb', 'anxietyc', 'anxietyd']
|
15 |
+
y = df[y_columns].values
|
16 |
|
17 |
+
X_train, X_test, y_train, y_test = train_test_split(list(df.embedding.values), y, test_size=0.2, random_state=42)
|
18 |
|
19 |
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)
|
20 |
|
21 |
+
multioutput_reg = MultiOutputRegressor(xg_reg)
|
22 |
+
|
23 |
+
multioutput_reg.fit(np.array(X_train).tolist(), y_train)
|
24 |
|
25 |
+
preds = multioutput_reg.predict(np.array(X_test).tolist())
|
26 |
|
27 |
mse = mean_squared_error(y_test, preds)
|
28 |
mae = mean_absolute_error(y_test, preds)
|
29 |
|
30 |
print(f"ada-002 embedding performance on chat transcripts: mse={mse:.2f}, mae={mae:.2f}")
|
31 |
+
|
32 |
+
# The mean squared error (MSE) and mean absolute error (MAE) are both metrics for assessing the performance of our regression model.
|
33 |
+
|
34 |
+
# MSE is calculated by taking the average of the squared differences between the predicted and actual values. It gives more weight to larger errors because they are squared in the calculation. This means that a model could have a relatively high MSE due to a few large errors, even if it made smaller errors on a majority of the instances.
|
35 |
+
|
36 |
+
# MAE, on the other hand, is calculated by taking the average of the absolute differences between the predicted and actual values. This metric gives equal weight to all errors and is less sensitive to outliers than MSE.
|
ObtainDataEmbedding.py
CHANGED
@@ -1,7 +1,13 @@
|
|
|
|
1 |
# imports
|
|
|
2 |
import pandas as pd
|
|
|
3 |
import tiktoken
|
4 |
from openai.embeddings_utils import get_embedding
|
|
|
|
|
|
|
5 |
|
6 |
# embedding model parameters
|
7 |
embedding_model = "text-embedding-ada-002"
|
@@ -9,24 +15,24 @@ embedding_encoding = "cl100k_base" # this the encoding for text-embedding-ada-0
|
|
9 |
max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191
|
10 |
|
11 |
# load & inspect dataset
|
12 |
-
input_datapath = "data/
|
13 |
df = pd.read_csv(input_datapath, index_col=0)
|
14 |
-
df = df[["
|
15 |
df = df.dropna()
|
16 |
df.head(2)
|
17 |
|
18 |
# Filter out chat transcripts that are too long to embed. A rough estimate of the maximum length is about 1638 words (8191 tokens / 5).
|
19 |
encoding = tiktoken.get_encoding(embedding_encoding)
|
20 |
|
21 |
-
df["n_tokens"] = df.
|
22 |
df = df[df.n_tokens <= max_tokens]
|
23 |
len(df)
|
24 |
|
25 |
# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage
|
26 |
|
27 |
# This may take a few minutes
|
28 |
-
df["embedding"] = df.
|
29 |
-
df.to_csv("data/
|
30 |
|
31 |
|
32 |
# Please replace "data/chat_transcripts.csv" with the path to your actual data file. Also, replace 'ChatTranscript', 'Attachment', 'Avoidance' with the actual column names of your chat transcripts and attachment scores in your data file.
|
|
|
1 |
+
# ObtainDataEmbedding.py
|
2 |
# imports
|
3 |
+
import openai
|
4 |
import pandas as pd
|
5 |
+
|
6 |
import tiktoken
|
7 |
from openai.embeddings_utils import get_embedding
|
8 |
+
import config
|
9 |
+
# set your API key
|
10 |
+
openai.api_key = "your openai api key"
|
11 |
|
12 |
# embedding model parameters
|
13 |
embedding_model = "text-embedding-ada-002"
|
|
|
15 |
max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191
|
16 |
|
17 |
# load & inspect dataset
|
18 |
+
input_datapath = "data/attachmentchatdata-formated.csv"
|
19 |
df = pd.read_csv(input_datapath, index_col=0)
|
20 |
+
df = df[["userid", "chathistory", "avoide", "avoida", "avoidb", "avoidc", "avoidd", "anxietye", "anxietya", "anxietyb", "anxietyc", "anxietyd"]]
|
21 |
df = df.dropna()
|
22 |
df.head(2)
|
23 |
|
24 |
# Filter out chat transcripts that are too long to embed. A rough estimate of the maximum length is about 1638 words (8191 tokens / 5).
|
25 |
encoding = tiktoken.get_encoding(embedding_encoding)
|
26 |
|
27 |
+
df["n_tokens"] = df.chathistory.apply(lambda x: len(encoding.encode(x)))
|
28 |
df = df[df.n_tokens <= max_tokens]
|
29 |
len(df)
|
30 |
|
31 |
# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage
|
32 |
|
33 |
# This may take a few minutes
|
34 |
+
df["embedding"] = df.chathistory.apply(lambda x: get_embedding(x, engine=embedding_model))
|
35 |
+
df.to_csv("data/chat_transcripts_with_embeddings_and_scores.csv")
|
36 |
|
37 |
|
38 |
# Please replace "data/chat_transcripts.csv" with the path to your actual data file. Also, replace 'ChatTranscript', 'Attachment', 'Avoidance' with the actual column names of your chat transcripts and attachment scores in your data file.
|