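"""Gradio demo app: Latvian Twitter sentiment analysis with a fine-tuned BERT model.

Loads the trained classifier defined in model.py, preprocesses incoming tweets
(user mentions are replaced with a placeholder token), and serves a simple
positive/neutral/negative prediction UI.
"""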
import torch
from utils import label_full_decoder
import sys
import dataset
import engine
from model import BERTBaseUncased
import config
from transformers import pipeline, AutoTokenizer, AutoModel
import gradio as gr
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
device = config.device
model = BERTBaseUncased()
model.load_state_dict(
    torch.load(config.MODEL_PATH, map_location=torch.device(device)),
    strict=False,
)
model.to(device)
# T = tokenizer.TweetTokenizer(
# preserve_handles=True, preserve_hashes=True, preserve_case=False, preserve_url=False)
# text_processor = TextPreProcessor(
# # terms that will be normalized
# normalize=['url', 'email', 'percent', 'money', 'phone', 'user'],
# # terms that will be annotated
# annotate={},
# fix_html=True, # fix HTML tokens
# # corpus from which the word statistics are going to be used
# # for word segmentation
# segmenter="twitter",
# # corpus from which the word statistics are going to be used
# # for spell correction
# corrector="twitter",
# unpack_hashtags=False, # perform word segmentation on hashtags
# unpack_contractions=False, # Unpack contractions (can't -> can not)
# spell_correct_elong=False, # spell correction for elongated words
# # select a tokenizer. You can use SocialTokenizer, or pass your own
# # the tokenizer, should take as input a string and return a list of tokens
# tokenizer=SocialTokenizer(lowercase=True).tokenize,
# # list of dictionaries, for replacing tokens extracted from the text,
# # with other expressions. You can pass more than one dictionaries.
# dicts=[]
# )
social_tokenizer = SocialTokenizer(lowercase=True).tokenize
def preprocess(text):
    """Tokenize the tweet and replace user mentions with a placeholder token."""
    # tokens = T.tokenize(text)
    # tokens = text_processor.pre_process_docs(text)
    tokens = social_tokenizer(text)
    print(tokens, file=sys.stderr)
    ptokens = []
    for index, token in enumerate(tokens):
        if "@" in token:
            # collapse runs of consecutive mentions into a single placeholder
            if index > 0 and "@" in tokens[index - 1]:
                continue
            ptokens.append("mention_0")
        else:
            ptokens.append(token)
    print(ptokens, file=sys.stderr)
    return " ".join(ptokens)
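# Example (assuming ekphrasis keeps "@user" as a single token and lowercases the rest):
#   preprocess("@user Es mīlu Tevi!")  ->  "mention_0 es mīlu tevi !"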
def predict_sentiment(sentence):
    """Run the BERT classifier on a single sentence and return its sentiment label."""
    sentence = preprocess(sentence)
    test_dataset = dataset.BERTDataset(
        review=[sentence],
        target=[0]
    )
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )
    outputs, _ = engine.predict_fn(test_data_loader, model, device)
    print(outputs, file=sys.stderr)
    # decode the predicted class index into its text label (see utils.label_full_decoder)
    return label_full_decoder(outputs[0])
interface = gr.Interface(
    fn=predict_sentiment,
    inputs='text',
    outputs=['label'],
    title='Latvian Twitter Sentiment Analysis',
    examples=["Es mīlu Tevi", "Es ienīstu kafiju"],
    description='Get the positive/neutral/negative sentiment for the given input.'
)

interface.launch(inline=False)
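# Note: launch() serves a local URL; passing share=True would create a temporary public link.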