annabellatian committed on
Commit
75890af
·
verified ·
1 Parent(s): 167bbc8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +14 -46
README.md CHANGED
@@ -1,42 +1,28 @@
1
  ## Evaluation Pipeline
2
- # Use the raw version of the text below to evaluate the model. Make sure to set the datapath.
3
-
4
- # -*- coding: utf-8 -*-
5
- """CIS 5190 Transformer Model
6
-
7
- Automatically generated by Colab.
8
-
9
- Original file is located at
10
- https://colab.research.google.com/drive/1Iy-nQcufaF7--hI9He7Dp9FsW1TomgrP
11
- """
12
 
13
  import pandas as pd
14
  from sklearn.model_selection import train_test_split
15
- from sklearn.metrics import accuracy_score, classification_report
16
  import torch
17
  from torch.utils.data import Dataset, DataLoader
18
  from transformers import BertTokenizer, BertForSequenceClassification, AdamW
19
- from transformers import get_scheduler
20
- from google.colab import drive
21
-
22
- drive.mount('/content/drive')
23
 
24
- # SET DATASET PATH HERE
25
- dataset_path = '/content/drive/My Drive/24 Fall/CIS 5190/CIS 5190 Final Project/test_data_random_subset.csv'
26
 
27
  news_df = pd.read_csv(dataset_path)
28
 
29
  X = news_df['title']
30
  y = news_df['labels']
31
 
32
- # y = y.apply(lambda x: 1 if x == 'FoxNews' else 0)
33
-
34
- # Split the data into training and testing sets (80% train, 20% test)
35
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)
36
 
37
- # Tokenize the text using a BERT tokenizer
38
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
39
 
 
40
  def tokenize_data(texts, tokenizer, max_len=128):
41
  return tokenizer(
42
  list(texts),
@@ -50,7 +36,7 @@ def tokenize_data(texts, tokenizer, max_len=128):
50
  train_encodings = tokenize_data(X_train, tokenizer)
51
  test_encodings = tokenize_data(X_test, tokenizer)
52
 
53
- # Create a custom dataset class
54
  class NewsDataset(Dataset):
55
  def __init__(self, encodings, labels):
56
  self.encodings = encodings
@@ -71,32 +57,16 @@ test_dataset = NewsDataset(test_encodings, y_test.tolist())
71
  train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
72
  test_loader = DataLoader(test_dataset, batch_size=16)
73
 
74
- # Define the model
75
  model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
 
76
 
77
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
78
  model.to(device)
79
 
80
  # Define optimizer and scheduler
81
- optimizer = AdamW(model.parameters(), lr=5e-5)
82
- num_training_steps = len(train_loader) * 4 # Assume 4 epochs
83
- lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
84
-
85
- # Train the model
86
- def train_model(model, train_loader, optimizer, scheduler, epochs=4):
87
- model.train()
88
- for epoch in range(epochs):
89
- epoch_loss = 0
90
- for batch in train_loader:
91
- batch = {k: v.to(device) for k, v in batch.items()}
92
- outputs = model(**batch)
93
- loss = outputs.loss
94
- loss.backward()
95
- optimizer.step()
96
- scheduler.step()
97
- optimizer.zero_grad()
98
- epoch_loss += loss.item()
99
- print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/len(train_loader):.4f}")
100
 
101
  # Evaluate the model
102
  def evaluate_model(model, test_loader):
@@ -112,10 +82,8 @@ def evaluate_model(model, test_loader):
112
  y_pred.extend(predictions.tolist())
113
  return y_true, y_pred
114
 
115
- train_model(model, train_loader, optimizer, lr_scheduler)
116
-
117
  y_true, y_pred = evaluate_model(model, test_loader)
118
 
119
- # 11. Print evaluation metrics
120
  print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
121
  print("Classification Report:\n", classification_report(y_true, y_pred))
 
1
  ## Evaluation Pipeline
2
+ # Use the raw version of the text below to evaluate the model. Make sure to set both the dataset path and the model path.
 
 
 
 
 
 
 
 
 
3
 
4
  import pandas as pd
5
  from sklearn.model_selection import train_test_split
6
+ from google.colab import drive
7
  import torch
8
  from torch.utils.data import Dataset, DataLoader
9
  from transformers import BertTokenizer, BertForSequenceClassification, AdamW
10
+ from sklearn.metrics import accuracy_score, classification_report
 
 
 
11
 
12
+ dataset_path = ""
13
+ model_path = ""
14
 
15
  news_df = pd.read_csv(dataset_path)
16
 
17
  X = news_df['title']
18
  y = news_df['labels']
19
 
20
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
21
+ X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2
 
 
22
 
 
23
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
24
 
25
+
26
  def tokenize_data(texts, tokenizer, max_len=128):
27
  return tokenizer(
28
  list(texts),
 
36
  train_encodings = tokenize_data(X_train, tokenizer)
37
  test_encodings = tokenize_data(X_test, tokenizer)
38
 
39
+ # Create a custom Dataset class
40
  class NewsDataset(Dataset):
41
  def __init__(self, encodings, labels):
42
  self.encodings = encodings
 
57
  train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
58
  test_loader = DataLoader(test_dataset, batch_size=16)
59
 
 
60
  model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
61
+ model.load_state_dict(torch.load(model_path))
62
 
63
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
64
  model.to(device)
65
 
66
  # Define optimizer and scheduler
67
+ # optimizer = AdamW(model.parameters(), lr=5e-5)
68
+ # num_training_steps = len(train_loader) * 4 # Assume 4 epochs
69
+ # lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  # Evaluate the model
72
  def evaluate_model(model, test_loader):
 
82
  y_pred.extend(predictions.tolist())
83
  return y_true, y_pred
84
 
 
 
85
  y_true, y_pred = evaluate_model(model, test_loader)
86
 
87
+ # Print evaluation metrics
88
  print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
89
  print("Classification Report:\n", classification_report(y_true, y_pred))