metadata
license: cc
datasets:
- vkovenko/cross_domain_uk_reviews
language:
- uk
metrics:
- f1
library_name: keras
pipeline_tag: text-classification
Deep lstm with attention module trained for rating estimation of Ukrainian reviews.
Code with example usage of the model can be found in the following repository: https://github.com/HikkaV/Ukrainian-Reviews-Estimation/.
Model uses custom layer and tokenizer is used in a custom class, thus the one can load the model and tokenizer using the following code:
import tensorflow as tf
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, Regex
import tokenizers
from tokenizers import Tokenizer, models, decoders, processors
from tokenizers import pre_tokenizers, trainers, Regex
import huggingface_hub
class Attention(tf.keras.layers.Layer):
def __init__(self,
units=128, **kwargs):
super(Attention,self).__init__(**kwargs)
self.units = units
def build(self, input_shape):
self.W1=self.add_weight(name='attention_weights_1', shape=(input_shape[-1], self.units),
initializer='glorot_uniform', trainable=True)
self.W2=self.add_weight(name='attention_weights_2', shape=(1, self.units),
initializer='glorot_uniform', trainable=True)
super(Attention, self).build(input_shape)
def call(self, x):
x = tf.transpose(x, perm=[0, 2, 1])
attention = tf.nn.softmax(tf.matmul(self.W2, tf.nn.tanh(tf.matmul(self.W1, x))))
weighted_context = tf.reduce_sum(x * attention, axis=-1)
return weighted_context, attention
def get_config(self):
config = super().get_config().copy()
config.update({
'units': self.units
})
return config
#download model
model = tf.keras.models.load_model(huggingface_hub.hf_hub_download('vkovenko/deep_lstm_attention_ukr_reviews_rating_estimation',
'deep_lstm_attention_w2v_huber.h5',
local_dir='model'),
compile=False,
custom_objects={'Attention':Attention})
class BPETokenizer:
def __init__(self, vocab, merges):
self.suffix = '</w>'
self.tokenizer = Tokenizer(models.BPE.from_file(vocab=vocab,
merges=merges, end_of_word_suffix=self.suffix))
self.tokenizer.pre_tokenizer = pre_tokenizers.Split(Regex(r"[\w'-]+|[^\w\s'-]+"),'removed', True)
self.id_to_token = self.tokenizer.id_to_token
self.encode_batch = self.tokenizer.encode_batch
self.token_to_id = self.tokenizer.token_to_id
self.encode = self.tokenizer.encode
def tokens_to_ids(self, tokens):
return list(map(self.token_to_id, tokens))
def ids_to_tokens(self, ids):
return list(map(self.id_to_token, ids))
def decode(self, tokens, return_indices=False):
decoded = []
merged_indices = []
i = 0
while i<len(tokens):
if tokens[i].endswith(self.suffix):
decoded.append(tokens[i])
merged_indices.append([i])
i+=1
else:
merged_token = ''
tmp_indc = []
while not tokens[i].endswith(self.suffix):
merged_token+=tokens[i]
tmp_indc.append(i)
i+=1
merged_token+=tokens[i]
tmp_indc.append(i)
decoded.append(merged_token)
merged_indices.append(tmp_indc)
i+=1
if return_indices:
return decoded, merged_indices
else:
return decoded
#download tokenizer
tokenizer = BPETokenizer(vocab=huggingface_hub.hf_hub_download('vkovenko/deep_lstm_attention_ukr_reviews_rating_estimation',
'tokenizer_30k.json',
local_dir='model'),
merges=huggingface_hub.hf_hub_download('vkovenko/deep_lstm_attention_ukr_reviews_rating_estimation',
'merges_tokenizer.txt',
local_dir='model')
)