File size: 1,705 Bytes
8b16ee5
4247f5a
8b16ee5
4247f5a
8b16ee5
4247f5a
 
8b16ee5
 
 
4247f5a
 
 
 
 
 
 
8b16ee5
4247f5a
8b16ee5
4247f5a
 
 
 
 
 
8b16ee5
4247f5a
 
 
 
 
 
8b16ee5
 
4247f5a
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# ObtainDataEmbedding.py
# imports
import os

import openai
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding

import config
# Set your API key. The OPENAI_API_KEY environment variable takes precedence so
# that a real secret never has to be written into this file; the placeholder
# literal is only used as a fallback when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your openai api key")

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

# load & inspect dataset
input_datapath = "data/attachmentchatdata-formated.csv"
df = pd.read_csv(input_datapath, index_col=0)
# Keep only the user id, the chat transcript and the avoidance/anxiety score columns.
df = df[
    [
        "userid",
        "chathistory",
        "avoide",
        "avoida",
        "avoidb",
        "avoidc",
        "avoidd",
        "anxietye",
        "anxietya",
        "anxietyb",
        "anxietyc",
        "anxietyd",
    ]
]
# Drop incomplete rows. The explicit copy makes `df` an independent frame, so
# adding columns below does not raise pandas' SettingWithCopyWarning.
df = df.dropna().copy()
# Printed explicitly: a bare `df.head(2)` expression shows nothing outside a notebook.
print(df.head(2))

# Filter out chat transcripts that are too long to embed. The length is measured
# in tokens with the same encoding the embedding model uses.
encoding = tiktoken.get_encoding(embedding_encoding)

df["n_tokens"] = df.chathistory.apply(lambda x: len(encoding.encode(x)))
# Copy again after the boolean filter so the "embedding" column is assigned on
# an independent frame rather than on a slice of the previous one.
df = df[df.n_tokens <= max_tokens].copy()
# Number of transcripts that remain after filtering.
print(len(df))

# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage

# This may take a few minutes: one embedding request is made per transcript.
df["embedding"] = df.chathistory.apply(lambda x: get_embedding(x, engine=embedding_model))
df.to_csv("data/chat_transcripts_with_embeddings_and_scores.csv")


# Please replace "data/attachmentchatdata-formated.csv" with the path to your actual data file. Also, replace the column names used above ("userid", "chathistory", and the "avoid*" / "anxiety*" score columns) with the actual column names of your chat transcripts and attachment scores in your data file.

# Also, remember to set the API key for OpenAI in your environment before running the get_embedding function.