Spaces:

thotranexe
/

toxicity

Runtime error

App Files Files Community

thotran committed on Apr 30, 2023

Commit

9993f32

1 Parent(s): 952a624

reduced wait time

Browse files

Files changed (4) hide show

.DS_Store +0 -0
app.py +53 -37
data/sub.csv +0 -0
requirements.txt +2 -2

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

app.py CHANGED Viewed

@@ -7,13 +7,15 @@ import torch.nn.functional as F
 from torch.utils.data import TensorDataset, DataLoader, Dataset
 from sklearn.metrics import roc_auc_score
 import re
-from tqdm import tqdm
 from typing import *
 import string
 from sklearn.model_selection import train_test_split
 from transformers import DistilBertTokenizer, AdamW
 from transformers import DistilBertModel, DistilBertConfig, DistilBertForSequenceClassification
 import streamlit as st
 st.write("Please be patient model training takes 20+ mins :P")
 #config constants
 SEED = 42
@@ -58,7 +60,7 @@ tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
 token_lens = []
-for txt in tqdm(data.comment_text):
   tokens = tokenizer.encode(txt, max_length=512)
   token_lens.append(len(tokens))
@@ -134,7 +136,7 @@ def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, opt
     """
     model.train()
-    for batch in tqdm(data_loader):
         input_ids = batch["input_ids"].to(device)
         attention_mask = batch["attention_mask"].to(device)
         targets = batch["targets"].float().to(device)
@@ -152,7 +154,7 @@ def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
     losses = []
     score = None
-    for idx, batch in enumerate(tqdm(data_loader)):
         input_ids = batch["input_ids"].to(device)
         attention_mask = batch["attention_mask"].to(device)
         targets = batch["targets"].float().to(device)
@@ -169,38 +171,52 @@ optimizer = AdamW(model.parameters(), lr=2e-5)
 best_val_loss = 9999.
 print('====START TRAINING====')
 #training here
-for epoch in tqdm(range(EPOCHS)):
-     print('-' * 10)
-     train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
-     _, tr_loss = evaluate_for_hf(model=model, data_loader=train_dataloader, device=device)
-     val_pred, val_loss = evaluate_for_hf(model=model, data_loader=val_dataloader, device=device)
-     y_pred_np = val_pred.numpy()
-     val_auc = roc_auc_score(df_val[labels].to_numpy(), y_pred_np)
-     if val_loss < best_val_loss:
-         best_val_loss = val_loss
          #torch.save(model.state_dict(), 'distill_bert.pt')
-     print(f'Epoch {epoch + 1}/{EPOCHS}', f'train loss: {tr_loss:.4},', f'val loss: {val_loss:.4},', f'val auc: {val_auc:.4}')
 # once model is saved and generated no need to re run :)
-#model = DistilBertForSequenceClassification(config)
-#model.load_state_dict(torch.load('./distill_bert.pt'))
-#model = model.to(device)
-#test model here
-test_pred, test_loss = evaluate_for_hf(model=model, data_loader=test_dataloader, device=device)
-print('====TEST RESULT====')
-print(f'Log loss: {test_loss:.5}')
-y_pred_np = test_pred.numpy()
-test_auc = roc_auc_score(df_test[labels].to_numpy(), y_pred_np)
-print(f'ROC AUC: {test_auc:.5}')
-test_src_id = test.iloc[:, 0]
-test.drop(columns='id', inplace=True)
-test_labels.drop(columns='id', inplace=True)
-test_src = pd.concat((test, test_labels), axis=1)
-test_src_dataloader = create_data_loader(df=test_src, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
-prediction, _ = evaluate_for_hf(model=model, data_loader=test_src_dataloader, device=device)
-prediction = torch.sigmoid(prediction).numpy()
-sub[labels] = prediction
-sub.insert(1,"tweet",data.comment_text,True)
-st.daatframe(sub)

 from torch.utils.data import TensorDataset, DataLoader, Dataset
 from sklearn.metrics import roc_auc_score
 import re
+from stqdm import stqdm
 from typing import *
 import string
 from sklearn.model_selection import train_test_split
 from transformers import DistilBertTokenizer, AdamW
 from transformers import DistilBertModel, DistilBertConfig, DistilBertForSequenceClassification
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import streamlit as st
 st.write("Please be patient model training takes 20+ mins :P")
 #config constants
 SEED = 42
 token_lens = []
+for txt in stqdm(data.comment_text,desc="tokenizing"):
   tokens = tokenizer.encode(txt, max_length=512)
   token_lens.append(len(tokens))
     """
     model.train()
+    for batch in stqdm(data_loader, desc="training"):
         input_ids = batch["input_ids"].to(device)
         attention_mask = batch["attention_mask"].to(device)
         targets = batch["targets"].float().to(device)
     losses = []
     score = None
+    for idx, batch in enumerate(stqdm(data_loader,desc="evaluating")):
         input_ids = batch["input_ids"].to(device)
         attention_mask = batch["attention_mask"].to(device)
         targets = batch["targets"].float().to(device)
 best_val_loss = 9999.
 print('====START TRAINING====')
 #training here
+#for epoch in stqdm(range(EPOCHS)):
+#     print('-' * 10)
+#     train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
+#     _, tr_loss = evaluate_for_hf(model=model, data_loader=train_dataloader, device=device)
+#     val_pred, val_loss = evaluate_for_hf(model=model, data_loader=val_dataloader, device=device)
+#     y_pred_np = val_pred.numpy()
+#     val_auc = roc_auc_score(df_val[labels].to_numpy(), y_pred_np)
+#     if val_loss < best_val_loss:
+#         best_val_loss = val_loss
          #torch.save(model.state_dict(), 'distill_bert.pt')
+#     print(f'Epoch {epoch + 1}/{EPOCHS}', f'train loss: {tr_loss:.4},', f'val loss: {val_loss:.4},', f'val auc: {val_auc:.4}')
 # once model is saved and generated no need to re run :)
+#PUSH MODEL TO HF
+#from huggingface_hub import notebook_login
+#notebook_login()
+#model.push_to_hub("tweetbert")
+#tokenizer.push_to_hub("tweetbert")
+#LOAD MODEL
+model=model = AutoModelForSequenceClassification.from_pretrained("thotranexe/tweetbert")
+model = model.to(device)
+#TEST MODEL
+#test_pred, test_loss = evaluate_for_hf(model=model, data_loader=test_dataloader, device=device)
+#print('====TEST RESULT====')
+#print(f'Log loss: {test_loss:.5}')
+#y_pred_np = test_pred.numpy()
+#test_auc = roc_auc_score(df_test[labels].to_numpy(), y_pred_np)
+#print(f'ROC AUC: {test_auc:.5}')
+#test_src_id = test.iloc[:, 0]
+#test.drop(columns='id', inplace=True)
+#test_labels.drop(columns='id', inplace=True)
+#test_src = pd.concat((test, test_labels), axis=1)
+#MAKE PREDICTIONS
+#test_src_dataloader = create_data_loader(df=test_src, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
+#prediction, _ = evaluate_for_hf(model=model, data_loader=test_src_dataloader, device=device)
+#prediction = torch.sigmoid(prediction).numpy()
+#SAVE RESULTS INTO SUBMISSION DATAFRAME
+#sub[labels] = prediction
+#sub.insert(1,"tweet",data.comment_text,True)
+#sub.to_csv("sub.csv", encoding='utf-8', index=False)
+#^commented above code, saved to csv to reduce wait/comput time
+sub=pd.read_csv('./data/sub.csv',engine='python',encoding='utf-8', error_bad_lines=False)
+sub.drop(index="id")
+st.dataframe(sub)

data/sub.csv ADDED Viewed

Binary file (71.3 MB). View file

requirements.txt CHANGED Viewed

@@ -2,7 +2,7 @@ numpy
 pandas
 streamlit
 torch
-tdqm
 scikit-learn
 transformers
-ipywidgets

 pandas
 streamlit
 torch
+stqdm
 scikit-learn
 transformers
+ipywidgets