# transformerYSDA / app.py
# Author: niknikita
# Commit: "Update app.py" (7895958)
# File size: 4.01 kB
import streamlit as st
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.cuda.amp import autocast, GradScaler
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, \
BigBirdPegasusForSequenceClassification, BigBirdTokenizer
from transformers import pipeline
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import streamlit as st
import pandas as pd
import json
import ast
from scipy import stats
import numpy as np
import time
import datetime
#
def get_top95(y_predict, convert_target, threshold=0.95):
    """Return the most probable labels whose cumulative probability passes ``threshold``.

    Classes are visited from the highest to the lowest probability; labels are
    collected until the running sum of probabilities becomes strictly greater
    than ``threshold``. The label that crosses the threshold is included.

    Args:
        y_predict: Class probabilities. Either a flat sequence/array or a
            single-row batch such as shape ``(1, n)``; it is flattened
            before ranking, so position ``i`` of the flattened data is class ``i``.
        convert_target: Mapping from the class index *as a string*
            (``"0"``, ``"1"``, ...) to the label to report.
        threshold: Cumulative probability at which to stop. Defaults to 0.95.

    Returns:
        A list of labels ordered by decreasing probability. Empty when
        ``y_predict`` is empty; contains every label when the threshold is
        never exceeded.

    Raises:
        KeyError: If a visited class index has no entry in ``convert_target``.
    """
    # Flatten so that both a 1-D probability vector and a (1, n) batch work;
    # comparing a whole row against the threshold would otherwise raise.
    probs = np.asarray(y_predict, dtype=float).ravel()
    # sorted() is stable, so ties keep their original class order.
    ranked = sorted(enumerate(probs), key=lambda pair: pair[1], reverse=True)

    labels = []
    cumulative = 0.0
    for class_idx, prob in ranked:
        cumulative += prob
        labels.append(convert_target[str(class_idx)])
        if cumulative > threshold:
            break
    return labels
#
# Load the serialized classifier once at start-up. According to the original
# note, this is a customized model: DistilBERT with a dropout and a dense layer
# on top that produces the final output.
from transformers import DistilBertModel, DistilBertTokenizer
# NOTE(security): torch.load unpickles the whole model object, which can execute
# arbitrary code. Only load checkpoints from a trusted source.
model = torch.load("pytorch_distilbert_news (3).bin", map_location=torch.device('cpu'))
# Inference only: switch to eval mode so dropout layers are disabled and
# predictions are deterministic, regardless of the mode the model was saved in.
model.eval()
# model.load_state_dict(checkpoint['model'])
# optimizer.load_state_dict(checkpoint['opt'])
# model.to("cpu")
# print(model)
# model = DistilBertForSequenceClassification.from_pretrained("model/distilbert-model1.pt", local_files_only=True)
# tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-pegasus-large-arxiv')
# model = BigBirdPegasusForSequenceClassification.from_pretrained('google/bigbird-pegasus-large-arxiv',
# num_labels=8,
# return_dict=False)
def get_predict(title, abstract):
    """Predict the labels for a document from its title and abstract.

    The two texts are tokenized as a sentence pair, passed through the
    module-level ``model``, and the resulting logits are turned into
    probabilities. The index-to-label mapping is read from ``sample.json``
    and the final selection is delegated to ``get_top95``.

    Args:
        title: Title text of the document.
        abstract: Abstract text of the document.

    Returns:
        Whatever ``get_top95`` returns for the predicted probabilities and
        the mapping loaded from ``sample.json``.
    """
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
    # Truncate to 512 tokens (the limit used by the original encoding settings);
    # longer inputs would exceed what the DistilBERT checkpoint accepts.
    inputs = tokenizer(
        title,
        abstract,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    # No gradients are needed for inference; skipping them saves memory and time.
    with torch.no_grad():
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
    logits = outputs[0]
    # Normalize over the class axis explicitly (implicit ``dim`` is deprecated),
    # then flatten so a single-example batch becomes a plain probability vector.
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    y_predict = probabilities.reshape(-1).cpu().numpy()

    file_path = "sample.json"
    with open(file_path, 'r', encoding="utf-8") as json_file:
        decode_target = json.load(json_file)
    return get_top95(y_predict, decode_target)
#
#
#
#
#
# get_predict('''physics physics physics physics physics
# physics physics physics physics''')
#
# Page header and banner image.
st.markdown("### Hello, world!")
banner_html = "<img width=200px src='https://rozetked.me/images/uploads/dwoilp3BVjlE.jpg'>"
st.markdown(banner_html, unsafe_allow_html=True)
# ^-- you can show the user text, images and a limited subset of html - just like in jupyter

# Two text areas; each variable holds the string currently entered in its field.
title = st.text_area("TEXT HERE", key=1)
abstract = st.text_area("TEXT HERE", key=2)

# Earlier experiment kept for reference (huggingface.transformers pipeline);
# it can be swapped for anything from fairseq to catboost:
# from transformers import pipeline
# pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")
# raw_predictions = pipe(text)

# Run the classifier on the current inputs and render the result.
predicted_labels = get_predict(title, abstract)
st.markdown(f"It's prediction: {predicted_labels}")