import pandas as pd
from numpy import floor
# --- text processing / embedding imports ---
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
def conf_level(val):
    """Translate a probability value into a plain-English statement."""
    # Bands follow the ICD 203 analytic standards:
    # https://www.dni.gov/files/documents/ICD/ICD%20203%20Analytic%20Standards.pdf
    conf = "undefined"
    if val < 0.05:
        conf = "Extremely Low Probability"
    elif val < 0.20:
        conf = "Very Low Probability"
    elif val < 0.45:
        conf = "Low Probability"
    elif val < 0.55:
        conf = "Middling Probability"
    elif val < 0.80:
        conf = "High Probability"
    elif val < 0.95:
        conf = "Very High Probability"
    elif val >= 0.95:
        conf = "Extremely High Probability"
    return conf
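
# A minimal usage sketch (illustrative inputs only; the outputs follow
# directly from the probability bands above):
#
#   >>> conf_level(0.03)
#   'Extremely Low Probability'
#   >>> conf_level(0.50)
#   'Middling Probability'
#   >>> conf_level(0.97)
#   'Extremely High Probability'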
def subsample_df(df=None, size=10, sample_type="Random Sample"):
    """Subsample the dataframe according to the requested strategy."""
    size = int(size)
    if sample_type == "Random Sample":
        return df.sample(size)
    elif sample_type == "Highest Probabilities":
        # sort on a copy rather than in place, so the caller's frame is untouched
        return df.sort_values(by="probability", ascending=False).head(size)
    elif sample_type == "Lowest Probabilities":
        return df.sort_values(by="probability", ascending=True).head(size)
    else:
        # sample rows whose probabilities sit in the middle band
        tmp = df[(df["probability"] > 0.45) & (df["probability"] < 0.55)]
        samp = min(size, int(tmp.shape[0]))
        return tmp.sample(samp)
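
# Usage sketch, assuming a scored dataframe with the "probability" column
# the function expects (the frame here is hypothetical; any sample_type
# other than the three named ones falls through to the middle band):
#
#   >>> scored = pd.DataFrame({"probability": [0.10, 0.48, 0.52, 0.90]})
#   >>> subsample_df(scored, size=2, sample_type="Highest Probabilities")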
def down_samp(embedding):
    """Down-sample a dataframe for Altair visualization."""
    # total number of positive and negative sentiments per class
    total_size = embedding.groupby(['name', 'sentiment'], as_index=False).count()

    user_data = 0
    if 'Your Sentences' in total_size['name'].values:
        tmp = embedding.groupby(['name'], as_index=False).count()
        user_data = int(tmp.loc[tmp['name'] == "Your Sentences", 'source'].iloc[0])

    max_sample = total_size.groupby('name').max()['source']

    # down-sample to meet Altair's default 5,000-row limit,
    # but keep the proportional representation of groups
    ratio = 1 / (sum(max_sample) / (5000 - user_data))
    max_samp = max_sample.apply(lambda x: floor(x * ratio)).astype(int).to_dict()
    max_samp['Your Sentences'] = user_data

    # sample down each group in the dataframe
    embedding = embedding.groupby('name').apply(
        lambda x: x.sample(n=max_samp.get(x.name))
    ).reset_index(drop=True)

    # order the embedding so groups are drawn consistently
    return embedding.sort_values(['sort_order'], ascending=True)
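
# Usage sketch; `embedding_df` is a hypothetical frame carrying the columns
# used above ('name', 'sentiment', 'source', 'sort_order'):
#
#   >>> plot_df = down_samp(embedding_df)  # at most ~5,000 rows, Altair-safe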
def prep_embed_data(data, model):
    """Tag each document and infer its Doc2Vec embedding."""
    tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
                   for i, _d in enumerate(data)]
    embedding = [model.infer_vector(doc.words) for doc in tagged_data]
    return embedding
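
# Usage sketch; the model path is hypothetical:
#
#   >>> model = Doc2Vec.load("models/doc2vec.model")
#   >>> vectors = prep_embed_data(["A short sentence.", "Another one."], model)
#   >>> len(vectors)
#   2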
def prep_sentence_embedding(name, source, sentence, sentiment, sort_order,
                            embed_model, idx, type="single"):
    """Prepare a custom sentence (or list of sentences) to add to the embedding."""
    if type == "single":
        # get the vector embedding for one sentence
        tagged_data = TaggedDocument(words=word_tokenize(sentence.lower()), tags=['source'])
        vector = embed_model.infer_vector(tagged_data.words)
        tmp = {
            'source': source,
            'name': name,
            'sort_order': sort_order,
            'sentence': sentence,
            'sentiment': sentiment,
            'x': vector[0],
            'y': vector[1],
        }
        return pd.DataFrame(tmp, index=[idx])
    else:
        # go through each group and add its rows
        df = {"source": [],
              "name": [],
              "sentence": [],
              "sentiment": [],
              "x": [],
              "y": [],
              "sort_order": [],
              }
        slice_short = sentence
        slice_sentiment = sentiment
        vec_embedding = prep_embed_data(sentence, embed_model)
        df['source'] = df['source'] + [source] * len(slice_short)
        df['name'] = df['name'] + [name] * len(slice_short)
        # the sort order affects how it is drawn by Altair
        df['sort_order'] = df['sort_order'] + [sort_order] * len(slice_short)
        # add individual elements
        for i in range(len(slice_short)):
            df['sentence'].append(slice_short[i])
            df['sentiment'].append(slice_sentiment[i])
            df['x'].append(vec_embedding[i][0])
            df['y'].append(vec_embedding[i][1])
        return pd.DataFrame(df)
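
# Usage sketches for both modes; the model and inputs are hypothetical:
#
#   single sentence -> one-row dataframe at index `idx`
#   >>> row = prep_sentence_embedding("Your Sentences", "user", "I liked it.",
#   ...                               "positive", 0, model, idx=0)
#
#   list of sentences -> one row per sentence (any type other than "single")
#   >>> block = prep_sentence_embedding("Corpus", "reviews",
#   ...                                 ["Great film.", "Terrible plot."],
#   ...                                 ["positive", "negative"], 1,
#   ...                                 model, idx=None, type="group")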