import pandas as pd
from numpy import floor

# --- gensim ---
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


def conf_level(val):
    """Translate a probability value into a plain-English statement."""
    # https://www.dni.gov/files/documents/ICD/ICD%20203%20Analytic%20Standards.pdf
    conf = "undefined"
    if val < 0.05:
        conf = "Extremely Low Probability"
    elif 0.05 <= val < 0.20:
        conf = "Very Low Probability"
    elif 0.20 <= val < 0.45:
        conf = "Low Probability"
    elif 0.45 <= val < 0.55:
        conf = "Middling Probability"
    elif 0.55 <= val < 0.80:
        conf = "High Probability"
    elif 0.80 <= val < 0.95:
        conf = "Very High Probability"
    elif val >= 0.95:
        conf = "Extremely High Probability"
    return conf


def subsample_df(df=None, size=10, sample_type="Random Sample"):
    """Subsample the dataframe."""
    size = int(size)
    if sample_type == "Random Sample":
        return df.sample(size)
    elif sample_type == "Highest Probabilities":
        df = df.sort_values(by="probability", ascending=False)
        return df.head(size)
    elif sample_type == "Lowest Probabilities":
        df = df.sort_values(by="probability", ascending=True)
        return df.head(size)
    else:
        # sample probabilities in the middle
        tmp = df[(df["probability"] > 0.45) & (df["probability"] < 0.55)]
        samp = min([size, int(tmp.shape[0])])
        return tmp.sample(samp)


def down_samp(embedding):
    """Down-sample a data frame for Altair visualization."""
    # total number of positive and negative sentiments in the class
    total_size = embedding.groupby(["name", "sentiment"], as_index=False).count()

    user_data = 0
    if "Your Sentences" in str(total_size["name"]):
        tmp = embedding.groupby(["name"], as_index=False).count()
        val = int(tmp[tmp["name"] == "Your Sentences"]["source"])
        user_data = val

    max_sample = total_size.groupby("name").max()["source"]

    # down-sample to meet Altair's 5000-row limit
    # but keep the proportional representation of groups
    down_ratio = 1 / (sum(max_sample) / (5000 - user_data))
    max_samp = max_sample.apply(lambda x: floor(x * down_ratio)).astype(int).to_dict()
    max_samp["Your Sentences"] = user_data

    # sample down for each group in the data frame
    embedding = (
        embedding.groupby("name")
        .apply(lambda x: x.sample(n=max_samp.get(x.name)))
        .reset_index(drop=True)
    )

    # order the embedding
    return embedding.sort_values(["sort_order"], ascending=True)


def prep_embed_data(data, model):
    """Basic data tagging: tokenize each document and infer its vector."""
    tagged_data = [
        TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
        for i, _d in enumerate(data)
    ]
    embedding = [model.infer_vector(tagged_data[i].words) for i in range(len(tagged_data))]
    return embedding


def prep_sentence_embedding(
    name, source, sentence, sentiment, sort_order, embed_model, idx, type="single"
):
    """Prepare a custom sentence (or list of sentences) to add to the embedding."""
    if type == "single":
        # get the vector embedding for the single sentence
        tagged_data = TaggedDocument(words=word_tokenize(sentence.lower()), tags=["source"])
        vector = embed_model.infer_vector(tagged_data.words)
        tmp = {
            "source": source,
            "name": name,
            "sort_order": sort_order,
            "sentence": sentence,
            "sentiment": sentiment,
            "x": vector[0],
            "y": vector[1],
        }
        return pd.DataFrame(tmp, index=[idx])
    else:
        # go through each group and add its sentences
        df = {
            "source": [],
            "name": [],
            "sentence": [],
            "sentiment": [],
            "x": [],
            "y": [],
            "sort_order": [],
        }

        slice_short = sentence
        slice_sentiment = sentiment
        vec_embedding = prep_embed_data(sentence, embed_model)

        df["source"] = df["source"] + [source] * len(slice_short)
        df["name"] = df["name"] + [name] * len(slice_short)
        # the sort order affects how it is drawn by Altair
        df["sort_order"] = df["sort_order"] + [sort_order] * len(slice_short)

        # add individual elements
        for i in range(len(slice_short)):
            df["sentence"].append(slice_short[i])
            df["sentiment"].append(slice_sentiment[i])
            df["x"].append(vec_embedding[i][0])
            df["y"].append(vec_embedding[i][1])

        df = pd.DataFrame(df)
        return df
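

# --- usage sketch (illustrative only) ---------------------------------------
# A minimal example of how these helpers might be wired together. The model
# path ("models/doc2vec.model") and the example sentences are hypothetical
# placeholders, not part of this module; word_tokenize also assumes the NLTK
# "punkt" tokenizer data has been downloaded.
if __name__ == "__main__":
    embed_model = Doc2Vec.load("models/doc2vec.model")  # hypothetical model path

    sentences = ["the movie was wonderful", "the plot made no sense"]
    sentiments = ["positive", "negative"]

    # Build a small embedding frame for a batch of sentences (any type other
    # than "single" takes the batch branch of prep_sentence_embedding) ...
    demo = prep_sentence_embedding(
        name="Your Sentences",
        source="user",
        sentence=sentences,
        sentiment=sentiments,
        sort_order=0,
        embed_model=embed_model,
        idx=0,
        type="multi",
    )
    print(demo.head())

    # ... and report a plain-English confidence label for a probability.
    print(conf_level(0.87))  # "Very High Probability"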