import pandas as pd
from numpy import floor
# --- text processing / embedding imports ---
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
def conf_level(val):
    """Translate a probability value into a plain-English statement."""
    # Bands follow the ICD 203 analytic standards:
    # https://www.dni.gov/files/documents/ICD/ICD%20203%20Analytic%20Standards.pdf
    conf = "undefined"
    if val < 0.05:
        conf = "Extremely Low Probability"
    elif val < 0.20:
        conf = "Very Low Probability"
    elif val < 0.45:
        conf = "Low Probability"
    elif val < 0.55:
        conf = "Middling Probability"
    elif val < 0.80:
        conf = "High Probability"
    elif val < 0.95:
        conf = "Very High Probability"
    elif val >= 0.95:
        conf = "Extremely High Probability"
    return conf
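
# A minimal usage sketch (illustrative inputs only; the outputs follow
# directly from the probability bands above):
#
#   >>> conf_level(0.03)
#   'Extremely Low Probability'
#   >>> conf_level(0.50)
#   'Middling Probability'
#   >>> conf_level(0.97)
#   'Extremely High Probability'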
def subsample_df(df=None, size=10, sample_type="Random Sample"):
    """Subsample the dataframe according to the requested strategy."""
    size = int(size)
    if sample_type == "Random Sample":
        return df.sample(size)
    elif sample_type == "Highest Probabilities":
        # sort on a copy rather than in place, so the caller's frame is untouched
        return df.sort_values(by="probability", ascending=False).head(size)
    elif sample_type == "Lowest Probabilities":
        return df.sort_values(by="probability", ascending=True).head(size)
    else:
        # sample rows whose probabilities sit in the middle band
        tmp = df[(df["probability"] > 0.45) & (df["probability"] < 0.55)]
        samp = min(size, int(tmp.shape[0]))
        return tmp.sample(samp)
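
# Usage sketch, assuming a scored dataframe with the "probability" column
# the function expects (the frame here is hypothetical; any sample_type
# other than the three named ones falls through to the middle band):
#
#   >>> scored = pd.DataFrame({"probability": [0.10, 0.48, 0.52, 0.90]})
#   >>> subsample_df(scored, size=2, sample_type="Highest Probabilities")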
def down_samp(embedding):
    """Down-sample a dataframe for Altair visualization."""
    # total number of positive and negative sentiments per class
    total_size = embedding.groupby(['name', 'sentiment'], as_index=False).count()

    user_data = 0
    if 'Your Sentences' in total_size['name'].values:
        tmp = embedding.groupby(['name'], as_index=False).count()
        user_data = int(tmp.loc[tmp['name'] == "Your Sentences", 'source'].iloc[0])

    max_sample = total_size.groupby('name').max()['source']

    # down-sample to meet Altair's default 5,000-row limit,
    # but keep the proportional representation of groups
    ratio = 1 / (sum(max_sample) / (5000 - user_data))
    max_samp = max_sample.apply(lambda x: floor(x * ratio)).astype(int).to_dict()
    max_samp['Your Sentences'] = user_data

    # sample down each group in the dataframe
    embedding = embedding.groupby('name').apply(
        lambda x: x.sample(n=max_samp.get(x.name))
    ).reset_index(drop=True)

    # order the embedding so groups are drawn consistently
    return embedding.sort_values(['sort_order'], ascending=True)
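
# Usage sketch; `embedding_df` is a hypothetical frame carrying the columns
# used above ('name', 'sentiment', 'source', 'sort_order'):
#
#   >>> plot_df = down_samp(embedding_df)  # at most ~5,000 rows, Altair-safe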
def prep_embed_data(data, model):
    """Tag each document and infer its Doc2Vec embedding."""
    tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
                   for i, _d in enumerate(data)]
    embedding = [model.infer_vector(doc.words) for doc in tagged_data]
    return embedding
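
# Usage sketch; the model path is hypothetical:
#
#   >>> model = Doc2Vec.load("models/doc2vec.model")
#   >>> vectors = prep_embed_data(["A short sentence.", "Another one."], model)
#   >>> len(vectors)
#   2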
def prep_sentence_embedding(name, source, sentence, sentiment, sort_order,
                            embed_model, idx, type="single"):
    """Prepare a custom sentence (or list of sentences) to add to the embedding."""
    if type == "single":
        # get the vector embedding for one sentence
        tagged_data = TaggedDocument(words=word_tokenize(sentence.lower()), tags=['source'])
        vector = embed_model.infer_vector(tagged_data.words)
        tmp = {
            'source': source,
            'name': name,
            'sort_order': sort_order,
            'sentence': sentence,
            'sentiment': sentiment,
            'x': vector[0],
            'y': vector[1],
        }
        return pd.DataFrame(tmp, index=[idx])
    else:
        # go through each group and add its rows
        df = {"source": [],
              "name": [],
              "sentence": [],
              "sentiment": [],
              "x": [],
              "y": [],
              "sort_order": [],
              }
        slice_short = sentence
        slice_sentiment = sentiment
        vec_embedding = prep_embed_data(sentence, embed_model)
        df['source'] = df['source'] + [source] * len(slice_short)
        df['name'] = df['name'] + [name] * len(slice_short)
        # the sort order affects how it is drawn by Altair
        df['sort_order'] = df['sort_order'] + [sort_order] * len(slice_short)
        # add individual elements
        for i in range(len(slice_short)):
            df['sentence'].append(slice_short[i])
            df['sentiment'].append(slice_sentiment[i])
            df['x'].append(vec_embedding[i][0])
            df['y'].append(vec_embedding[i][1])
        return pd.DataFrame(df)
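
# Usage sketches for both modes; the model and inputs are hypothetical:
#
#   single sentence -> one-row dataframe at index `idx`
#   >>> row = prep_sentence_embedding("Your Sentences", "user", "I liked it.",
#   ...                               "positive", 0, model, idx=0)
#
#   list of sentences -> one row per sentence (any type other than "single")
#   >>> block = prep_sentence_embedding("Corpus", "reviews",
#   ...                                 ["Great film.", "Terrible plot."],
#   ...                                 ["positive", "negative"], 1,
#   ...                                 model, idx=None, type="group")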