Spaces:

ACRLab
/

FraleyLabAttachmentBot

Sleeping

AjithKSenthil commited on Apr 5, 2023

Commit

379ec43

1 Parent(s): 2971e95

started coding the transcript analysis and survey prediction code

We are using BERT a pre trained transformer model and a regression layer to predict the survey responses as a numerical value for each question in the survey set.

Files changed (1) hide show

transcriptanalysis.py +103 -0

transcriptanalysis.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import torch
+from torch.utils.data import Dataset, DataLoader
+from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, r2_score
+import numpy as np
+# Load the dataset
+chat_transcripts = ["chat transcript 1", "chat transcript 2", ...]
+survey_responses = [3.5, 4.2, ...]  # Numerical survey responses
+# Split the data into training, validation, and testing sets
+train_texts, temp_texts, train_labels, temp_labels = train_test_split(chat_transcripts, survey_responses, test_size=0.3, random_state=42)
+val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)
+# Pre-process the data using BERT tokenizer
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+train_encodings = tokenizer(train_texts, truncation=True, padding=True)
+val_encodings = tokenizer(val_texts, truncation=True, padding=True)
+test_encodings = tokenizer(test_texts, truncation=True, padding=True)
+class SurveyDataset(Dataset):
+    def __init__(self, encodings, labels):
+        self.encodings = encodings
+        self.labels = labels
+    def __getitem__(self, idx):
+        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
+        return item
+    def __len__(self):
+        return len(self.labels)
+train_dataset = SurveyDataset(train_encodings, train_labels)
+val_dataset = SurveyDataset(val_encodings, val_labels)
+test_dataset = SurveyDataset(test_encodings, test_labels)
+# Fine-tune the BERT model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1).to(device)
+train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
+val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
+test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)
+optim = AdamW(model.parameters(), lr=2e-5)
+num_epochs = 3
+num_training_steps = num_epochs * len(train_loader)
+lr_scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=0, num_training_steps=num_training_steps)
+for epoch in range(num_epochs):
+    model.train()
+    for batch in train_loader:
+        optim.zero_grad()
+        input_ids = batch["input_ids"].to(device)
+        attention_mask = batch["attention_mask"].to(device)
+        labels = batch["labels"].unsqueeze(1).to(device)
+        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
+        loss = outputs.loss
+        loss.backward()
+        optim.step()
+        lr_scheduler.step()
+# Evaluate the model
+model.eval()
+preds = []
+with torch.no_grad():
+    for batch in test_loader:
+        input_ids = batch["input_ids"].to(device)
+        attention_mask = batch["attention_mask"].to(device)
+        outputs = model(input_ids, attention_mask=attention_mask)
+        logits = outputs.logits
+        preds.extend(logits.squeeze().tolist())
+mse = mean_squared_error(test_labels, preds)
+r2 = r2_score(test_labels, preds)
+print("Mean Squared Error:", mse)
+print("R-squared Score:", r2)
+def predict_survey_response(chat_transcript, model, tokenizer, device):
+    # Preprocess the chat transcript
+    encoding = tokenizer(chat_transcript, truncation=True, padding=True, return_tensors="pt")
+    # Move tensors to the device
+    input_ids = encoding["input_ids"].to(device)
+    attention_mask = encoding["attention_mask"].to(device)
+    # Predict the survey response using the fine-tuned model
+    model.eval()
+    with torch.no_grad():
+        outputs = model(input_ids, attention_mask=attention_mask)
+        logits = outputs.logits
+        predicted_response = logits.squeeze().item()
+    return predicted_response
+# Example usage
+new_chat_transcript = "A new chat transcript"
+predicted_response = predict_survey_response(new_chat_transcript, model, tokenizer, device)
+print("Predicted survey response:", predicted_response)