import streamlit as st
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.cuda.amp import autocast, GradScaler
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, \
    BigBirdPegasusForSequenceClassification, BigBirdTokenizer
from transformers import pipeline
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import streamlit as st
import pandas as pd
import json
import ast
from scipy import stats
import numpy as np
import time
import datetime


def get_top95(y_predict, convert_target, threshold=0.95):
    """Return the most probable labels whose cumulative probability exceeds *threshold*.

    Classes are visited in order of decreasing probability. Each visited
    class contributes its label, and the walk stops as soon as the running
    sum of probabilities becomes strictly greater than ``threshold``. If the
    sum never exceeds the threshold, every label is returned.

    Args:
        y_predict: Iterable of per-class probabilities; the position of each
            value is the class index.
        convert_target: Mapping from the class index rendered as a string
            (``"0"``, ``"1"``, ...) to the label to report.
        threshold: Cumulative probability at which to stop. Defaults to 0.95.

    Returns:
        List of labels, most probable first.

    Raises:
        KeyError: If ``convert_target`` has no entry for a visited class index.
    """
    # sorted() is stable, so classes with equal probability keep index order.
    ranked = sorted(enumerate(y_predict), key=lambda pair: pair[1], reverse=True)

    labels = []
    cumulative = 0
    for class_idx, prob in ranked:
        cumulative += prob
        labels.append(convert_target[str(class_idx)])
        if cumulative > threshold:
            break
    return labels


# Custom model: a dropout and a dense layer are added on top of DistilBERT
# to produce the final output of the model.
from transformers import DistilBertModel, DistilBertTokenizer


class DistillBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a small classification head.

    The head applied to the hidden state at sequence position 0 is
    ``Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, 8)``.

    NOTE(review): the checkpoint loaded below is used directly as a callable
    model, so it is presumably a pickled instance of this class. If so, the
    class name and the attribute names (``l1``, ``pre_classifier``,
    ``dropout``, ``classifier``) must keep matching the checkpoint - confirm
    before renaming anything here.
    """

    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 8)

    def forward(self, input_ids, attention_mask):
        """Return the raw (pre-softmax) class scores for a batch of token ids."""
        encoder_output = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = encoder_output[0]
        # Use the hidden state of the first token as the sequence summary.
        pooled = hidden_state[:, 0]
        pooled = self.pre_classifier(pooled)
        # Functional ReLU: no new sub-module, so pickled instances keep working.
        pooled = torch.nn.functional.relu(pooled)
        pooled = self.dropout(pooled)
        return self.classifier(pooled)


# NOTE(review): this freshly built model and its optimizer are never used for
# inference - `model` is rebound by torch.load() right below. They are kept
# only so the module-level names stay available; consider removing them.
model = DistillBERTClass()
LEARNING_RATE = 1e-05
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# SECURITY NOTE: torch.load() unpickles arbitrary Python objects; only load
# checkpoints from a trusted source. Recent PyTorch releases default to
# weights_only=True, which rejects whole-model pickles - verify against the
# installed version.
model = torch.load("pytorch_distilbert_news (3).bin", map_location=torch.device('cpu'))
# Inference only: switch dropout off so predictions are deterministic.
model.eval()


def get_predict(title, abstract):
    """Classify a (title, abstract) pair and return the most probable labels.

    The two texts are tokenized as a sentence pair, truncated to 512 tokens,
    scored by the module-level ``model`` and converted to probabilities with
    a softmax. Class indices are mapped to labels through ``sample.json``.

    Args:
        title: Title text.
        abstract: Abstract text.

    Returns:
        Whatever ``get_top95`` returns for the predicted probabilities and
        the mapping read from ``sample.json``.
    """
    # NOTE(review): the tokenizer is the *cased* checkpoint while the encoder
    # in DistillBERTClass is built from the *uncased* one; their vocabularies
    # differ. Left unchanged because the tokenizer used at training time is
    # not visible here - confirm and align.
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

    # Truncate so that long inputs cannot exceed the encoder's 512 positions.
    inputs = tokenizer(
        title,
        abstract,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )

    # Only one pair is scored, so take the first (and only) row of the batch.
    logits = outputs[0]
    y_predict = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()

    file_path = "sample.json"
    with open(file_path, 'r') as json_file:
        decode_target = json.load(json_file)
    return get_top95(y_predict, decode_target)


st.markdown("### Hello, world!")
st.markdown("", unsafe_allow_html=True)
# ^-- you can show the user text, images and a limited subset of HTML,
#     just like in Jupyter.

# Distinct labels and keys: two widgets of the same type with identical
# arguments and no key clash on their auto-generated widget ID.
title = st.text_area("Title", key="title")
abstract = st.text_area("Abstract", key="abstract")
# ^-- show a text field; the variable holds the string currently in it.

# The prediction code below can be replaced by anything, from fairseq to
# catboost.
# Skip the model call until the user has actually entered some text.
if title.strip() or abstract.strip():
    st.markdown(f"It's prediction: {get_predict(title, abstract)}")