Job_sentiment_classifier / annotator.py
sepehr's picture
Create annotator.py
0307f7f
raw
history blame
2.69 kB
import numpy as np
import transformers
from sklearn import metrics
import pandas as pd
import streamlit as st
def ignitor_load():
    """Read ``tinyignitorfile.json`` and return its contents as a DataFrame.

    The path is relative, so the file is resolved against the current
    working directory of the running process.
    """
    return pd.read_json('tinyignitorfile.json')
def appendor(thex):
    """Return the reference rows with *thex* placed in the row labelled 21.

    Loads the frame via ``ignitor_load()``, keeps the ``'text'`` and
    ``'index'`` columns for row labels up to and including 21 (``.loc``
    slicing is label-based and inclusive), and writes *thex* into the
    ``'text'`` cell of row label 21.

    Args:
        thex: The text to store in row 21.

    Returns:
        A new DataFrame, independent of the frame that was loaded.
    """
    reference = ignitor_load()
    # Take an explicit copy: assigning into the result of a .loc selection is
    # otherwise view/copy-ambiguous and can raise SettingWithCopyWarning.
    # NOTE(review): assumes the JSON yields integer row labels 0..21 - confirm.
    subset = reference.loc[:21, ['text', 'index']].copy()
    subset.loc[21, 'text'] = thex
    return subset
# Pretrained DistilBERT tokenizer and TensorFlow model, loaded once when the
# module is imported and shared by every call to allll().
tokenizerr = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
modell = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')
# Module-level list in which allll() stores the per-row text embeddings.
encod=[]
def _mean_token_embedding(text):
    """Return the mean DistilBERT hidden state for *text* as a 1-D array.

    The first and last positions of the encoded sequence are dropped before
    averaging (presumably the special tokens added by ``encode`` - confirm).
    """
    token_ids = np.array(tokenizerr.encode(text))[np.newaxis, :]
    hidden_states = np.array(modell(token_ids)[0][0][1:-1])
    return np.array(hidden_states.mean(0))


def _label_embeddings():
    """Return a dict mapping each label name to its keyword-list embedding."""
    labs = {}
    labs["SALARY"] = ['underpay','underpaid','overpay','overpaying','payments','wage','payroll','pay','paycheck']
    labs["COLLEAGUES"] = ['colleague','employee','staff' ,'coworker','co-worker','colleagues']
    labs["SUPERVISION"] = ['boss','supervisors','manager','supervisor']
    labs["TIMEDAY"] = ['monday','weekday','day','weekend']
    labs["TIMEDAYNOMONDAY"] = ['weekday','day','weekend']
    # NOTE(review): each keyword *list* is handed to encode() as-is rather than
    # as a sentence - confirm the tokenizer treats it the way that is intended.
    return {name: _mean_token_embedding(words) for name, words in labs.items()}


def allll(df):
    """Tag the last row of *df* with the best-matching label.

    Every ``'text'`` cell of *df* is embedded and compared (cosine similarity)
    with the embedding of each label's keyword list.  Per label, the
    similarities are normalised by their sum over all rows and rescaled so
    that their mean equals the weight stored in ``st.secrets["cyr1"]`` ..
    ``st.secrets["cyr5"]``.  The last row's highest score is then compared to
    ``st.secrets["threshhold"]``.

    Args:
        df: DataFrame with a ``'text'`` column whose row labels are
            ``0 .. len(df) - 1`` (rows are read with ``df.loc[i, 'text']``).

    Returns:
        The string ``'None'`` when no score of the last row exceeds the
        threshold; otherwise the integer index of the winning label in the
        order SALARY, COLLEAGUES, SUPERVISION, TIMEDAY, TIMEDAYNOMONDAY.

    Raises:
        ValueError: If *df* has no rows.
    """
    # Build the embeddings in a local list.  The previous implementation kept
    # appending to the module-level ``encod``, so rows from earlier calls
    # leaked into the normalisation below and changed the result.
    row_embeddings = [_mean_token_embedding(df.loc[i, 'text']) for i in range(len(df))]
    if not row_embeddings:
        raise ValueError("allll() requires a DataFrame with at least one row")
    # Mirror the current embeddings into the module-level list so code that
    # inspects ``encod`` still sees them, without unbounded growth.
    encod[:] = row_embeddings

    label_vectors = _label_embeddings()

    # sim[k, j, 0] is the similarity between label k and row j.
    sim = np.array([
        metrics.pairwise.cosine_similarity(row_embeddings, vector.reshape(1, -1))
        for vector in label_vectors.values()
    ])

    # One weight per label, read in label order: cyr1 .. cyr5.
    referirv = [float(st.secrets[f"cyr{k}"]) for k in range(1, len(label_vectors) + 1)]
    threshhold = float(st.secrets["threshhold"])

    # Normalise each label's scores by their sum over rows, then rescale so
    # that the per-label mean equals that label's configured weight.
    sim = sim / sim.sum(axis=1, keepdims=True)
    zarayeb = np.array(referirv) / sim.mean(axis=(1, 2))
    sim = sim * zarayeb[:, np.newaxis, np.newaxis]

    # Only the last row's tag is returned, so only that row is classified.
    last_row_scores = sim[:, -1, 0]
    if np.amax(last_row_scores) <= threshhold:
        return 'None'
    return np.argmax(last_row_scores)