stanford-nlpxed
/

transcript-analysis

Model card Files Files and versions Community

ddemszky committed on May 16, 2023

Commit

91cb36e

1 Parent(s): 8b8145a

added custom handler

Browse files

Files changed (6) hide show

__pycache__/handler.cpython-39.pyc +0 -0
__pycache__/utils.cpython-39.pyc +0 -0
handler.py +243 -0
requirements.txt +6 -0
test.py +22 -0
utils.py +192 -0

__pycache__/handler.cpython-39.pyc ADDED Viewed

Binary file (8.82 kB). View file

__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (6.53 kB). View file

handler.py ADDED Viewed

	@@ -0,0 +1,243 @@

+from typing import Dict, List, Any
+from scipy.special import softmax
+import numpy as np
+import weakref
+from utils import clean_str, clean_str_nopunct
+import torch
+from transformers import BertTokenizer, BertForSequenceClassification
+from utils import MultiHeadModel, BertInputBuilder, get_num_words
+# Default checkpoint identifiers handed to `from_pretrained` by the model wrappers below
+# (presumably Hugging Face Hub repo ids -- confirm against the Hub).
+UPTAKE_MODEL='ddemszky/uptake-model'
+REASONING_MODEL ='ddemszky/student-reasoning'
+QUESTION_MODEL ='ddemszky/question-detection'
+class Utterance:
+    def __init__(self, speaker, text, uid=None,
+                 transcript=None, starttime=None, endtime=None, **kwargs):
+        self.speaker = speaker
+        self.text = text
+        self.uid = uid
+        self.starttime = starttime
+        self.endtime = endtime
+        self.transcript = weakref.ref(transcript) if transcript else None
+        self.props = kwargs
+        self.uptake = None
+        self.reasoning = None
+        self.question = None
+    def get_clean_text(self, remove_punct=False):
+        if remove_punct:
+            return clean_str_nopunct(self.text)
+        return clean_str(self.text)
+    def get_num_words(self):
+        return get_num_words(self.text)
+    def to_dict(self):
+        return {
+            'speaker': self.speaker,
+            'text': self.text,
+            'uid': self.uid,
+            'starttime': self.starttime,
+            'endtime': self.endtime,
+            'uptake': self.uptake,
+            'reasoning': self.reasoning,
+            'question':  self.question,
+            **self.props
+        }
+    def __repr__(self):
+        return f"Utterance(speaker='{self.speaker}'," \
+               f"text='{self.text}', uid={self.uid}," \
+               f"starttime={self.starttime}, endtime={self.endtime}, props={self.props})"
+class Transcript:
+    def __init__(self, **kwargs):
+        self.utterances = []
+        self.params = kwargs
+    def add_utterance(self, utterance):
+        utterance.transcript = weakref.ref(self)
+        self.utterances.append(utterance)
+    def get_idx(self, idx):
+        if idx >= len(self.utterances):
+            return None
+        return self.utterances[idx]
+    def get_uid(self, uid):
+        for utt in self.utterances:
+            if utt.uid == uid:
+                return utt
+        return None
+    def length(self):
+        return len(self.utterances)
+    def to_dict(self):
+        return {
+            'utterances': [utterance.to_dict() for utterance in self.utterances],
+            **self.params
+        }
+    def __repr__(self):
+        return f"Transcript(utterances={self.utterances}, custom_params={self.params})"
+class QuestionModel:
+    def __init__(self, device, tokenizer, input_builder, max_length=300, path=QUESTION_MODEL):
+        print("Loading models...")
+        self.device = device
+        self.tokenizer = tokenizer
+        self.input_builder = input_builder
+        self.max_length = max_length
+        self.model = MultiHeadModel.from_pretrained(path, head2size={"is_question": 2})
+        self.model.to(self.device)
+    def run_inference(self, transcript):
+        self.model.eval()
+        with torch.no_grad():
+            for i, utt in enumerate(transcript.utterances):
+                if "?" in utt.text:
+                    utt.question = 1
+                else:
+                    text = utt.get_clean_text(remove_punct=True)
+                    instance = self.input_builder.build_inputs([], text,
+                                                               max_length=self.max_length,
+                                                               input_str=True)
+                    output = self.get_prediction(instance)
+                    print(output)
+                    utt.question = np.argmax(output["is_question_logits"][0].tolist())
+    def get_prediction(self, instance):
+        instance["attention_mask"] = [[1] * len(instance["input_ids"])]
+        for key in ["input_ids", "token_type_ids", "attention_mask"]:
+            instance[key] = torch.tensor(instance[key]).unsqueeze(0)  # Batch size = 1
+            instance[key].to(self.device)
+        output = self.model(input_ids=instance["input_ids"],
+                            attention_mask=instance["attention_mask"],
+                            token_type_ids=instance["token_type_ids"],
+                            return_pooler_output=False)
+        return output
+class ReasoningModel:
+    def __init__(self, device, tokenizer, input_builder, max_length=128, path=REASONING_MODEL):
+        print("Loading models...")
+        self.device = device
+        self.tokenizer = tokenizer
+        self.input_builder = input_builder
+        self.max_length = max_length
+        self.model = BertForSequenceClassification.from_pretrained(path)
+        self.model.to(self.device)
+    def run_inference(self, transcript, min_num_words=8):
+        self.model.eval()
+        with torch.no_grad():
+            for i, utt in enumerate(transcript.utterances):
+                if utt.get_num_words() >= min_num_words:
+                    instance = self.input_builder.build_inputs([], utt.text,
+                                                               max_length=self.max_length,
+                                                               input_str=True)
+                    output = self.get_prediction(instance)
+                    utt.reasoning = np.argmax(output["logits"][0].tolist())
+    def get_prediction(self, instance):
+        instance["attention_mask"] = [[1] * len(instance["input_ids"])]
+        for key in ["input_ids", "token_type_ids", "attention_mask"]:
+            instance[key] = torch.tensor(instance[key]).unsqueeze(0)  # Batch size = 1
+            instance[key].to(self.device)
+        output = self.model(input_ids=instance["input_ids"],
+                            attention_mask=instance["attention_mask"],
+                            token_type_ids=instance["token_type_ids"])
+        return output
+class UptakeModel:
+    def __init__(self, device, tokenizer, input_builder, max_length=120, path=UPTAKE_MODEL):
+        print("Loading models...")
+        self.device = device
+        self.tokenizer = tokenizer
+        self.input_builder = input_builder
+        self.max_length = max_length
+        self.model = MultiHeadModel.from_pretrained(path, head2size={"nsp": 2})
+        self.model.to(self.device)
+    def run_inference(self, transcript, min_prev_words, uptake_speaker=None):
+        self.model.eval()
+        prev_num_words = 0
+        prev_utt = None
+        with torch.no_grad():
+            for i, utt in enumerate(transcript.utterances):
+                if ((uptake_speaker is None) or (utt.speaker == uptake_speaker)) and (prev_num_words >= min_prev_words):
+                    textA = prev_utt.get_clean_text(remove_punct=False)
+                    textB = utt.get_clean_text(remove_punct=False)
+                    instance = self.input_builder.build_inputs([textA], textB,
+                                                               max_length=self.max_length,
+                                                               input_str=True)
+                    output = self.get_prediction(instance)
+                    utt.uptake = int(softmax(output["nsp_logits"][0].tolist())[1] > .8)
+                prev_num_words = utt.get_num_words()
+                prev_utt = utt
+    def get_prediction(self, instance):
+        instance["attention_mask"] = [[1] * len(instance["input_ids"])]
+        for key in ["input_ids", "token_type_ids", "attention_mask"]:
+            instance[key] = torch.tensor(instance[key]).unsqueeze(0)  # Batch size = 1
+            instance[key].to(self.device)
+        output = self.model(input_ids=instance["input_ids"],
+                            attention_mask=instance["attention_mask"],
+                            token_type_ids=instance["token_type_ids"],
+                            return_pooler_output=False)
+        return output
+class EndpointHandler():
+    def __init__(self):
+        print("Loading models...")
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        self.input_builder = BertInputBuilder(tokenizer=self.tokenizer)
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+       data args:
+            inputs (:obj: `list`):
+            List of dicts, where each dict represents an utterance; each utterance object must have a `speaker`,
+            `text` and `uid`and can include list of custom properties
+            parameters (:obj: `dict`)
+       Return:
+            A :obj:`list` | `dict`: will be serialized and returned
+        """
+        # get inputs
+        utterances = data.pop("inputs", data)
+        params = data.pop("parameters", None)
+        print("EXAMPLES")
+        for utt in utterances[:3]:
+            print("speaker %s: %s" % (utt["speaker"], utt["text"]))
+        transcript = Transcript(filename=params.pop("filename", None))
+        for utt in utterances:
+            transcript.add_utterance(Utterance(**utt))
+        print("Running inference on %d examples..." % transcript.length())
+        # Uptake
+        uptake_model = UptakeModel(self.device, self.tokenizer, self.input_builder)
+        uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
+                                   uptake_speaker=params.pop("uptake_speaker", None))
+        # Reasoning
+        reasoning_model = ReasoningModel(self.device, self.tokenizer, self.input_builder)
+        reasoning_model.run_inference(transcript)
+        # Question
+        question_model = QuestionModel(self.device, self.tokenizer, self.input_builder)
+        question_model.run_inference(transcript)
+        return transcript.to_dict()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+clean-text==1.1.4
+num2words==0.5.10
+numpy==1.22.4
+scipy==1.7.3
+torch==1.10.2
+transformers==4.25.1

test.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from handler import EndpointHandler
+# init handler
+my_handler = EndpointHandler()
+# prepare sample payload
+example = {
+ "inputs": [
+   {"uid": "1", "speaker": "Alice", "text": "How much is the fish?" },
+   {"uid": "2", "speaker": "Bob", "text": "I do not know about the fish. Because you put a long side and it’s a long side. What do you think." },
+{"uid": "3", "speaker": "Alice", "text": "OK, thank you Bob." },
+ ],
+ "parameters": {
+   "uptake_min_num_words": 5,
+   "uptake_speaker": "Bob",
+    "filename": "sample.csv",
+ }
+}
+# test the handler
+print(my_handler(example))

utils.py ADDED Viewed

	@@ -0,0 +1,192 @@

+import torch
+from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
+from torch import nn
+from itertools import chain
+from torch.nn import MSELoss, CrossEntropyLoss
+from cleantext import clean
+from num2words import num2words
+import re
+import string
+# Characters treated as punctuation: everything in string.punctuation plus typographic quotes, dashes and the ellipsis.
+punct_chars = list((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}))
+# Sets are unordered; sorting makes the joined string (and the pattern below) the same on every run.
+punct_chars.sort()
+punctuation = ''.join(punct_chars)
+# Compiled character class matching any single punctuation character; used by get_num_words.
+replace = re.compile('[%s]' % re.escape(punctuation))
+def get_num_words(text):
+    if not isinstance(text, str):
+        print("%s is not a string" % text)
+    text = replace.sub(' ', text)
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    text = re.sub(r'\[.+\]', " ", text)
+    return len(text.split())
+def number_to_words(num):
+    try:
+        return num2words(re.sub(",", "", num))
+    except:
+        return num
+clean_str = lambda s: clean(s,
+                            fix_unicode=True,  # fix various unicode errors
+                            to_ascii=True,  # transliterate to closest ASCII representation
+                            lower=True,  # lowercase text
+                            no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
+                            no_urls=True,  # replace all URLs with a special token
+                            no_emails=True,  # replace all email addresses with a special token
+                            no_phone_numbers=True,  # replace all phone numbers with a special token
+                            no_numbers=True,  # replace all numbers with a special token
+                            no_digits=False,  # replace all digits with a special token
+                            no_currency_symbols=False,  # replace all currency symbols with a special token
+                            no_punct=False,  # fully remove punctuation
+                            replace_with_url="<URL>",
+                            replace_with_email="<EMAIL>",
+                            replace_with_phone_number="<PHONE>",
+                            replace_with_number=lambda m: number_to_words(m.group()),
+                            replace_with_digit="0",
+                            replace_with_currency_symbol="<CUR>",
+                            lang="en"
+                            )
+clean_str_nopunct = lambda s: clean(s,
+                            fix_unicode=True,  # fix various unicode errors
+                            to_ascii=True,  # transliterate to closest ASCII representation
+                            lower=True,  # lowercase text
+                            no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
+                            no_urls=True,  # replace all URLs with a special token
+                            no_emails=True,  # replace all email addresses with a special token
+                            no_phone_numbers=True,  # replace all phone numbers with a special token
+                            no_numbers=True,  # replace all numbers with a special token
+                            no_digits=False,  # replace all digits with a special token
+                            no_currency_symbols=False,  # replace all currency symbols with a special token
+                            no_punct=True,  # fully remove punctuation
+                            replace_with_url="<URL>",
+                            replace_with_email="<EMAIL>",
+                            replace_with_phone_number="<PHONE>",
+                            replace_with_number=lambda m: number_to_words(m.group()),
+                            replace_with_digit="0",
+                            replace_with_currency_symbol="<CUR>",
+                            lang="en"
+                            )
+class MultiHeadModel(BertPreTrainedModel):
+  """Pre-trained BERT model that uses our loss functions"""
+  def __init__(self, config, head2size):
+    super(MultiHeadModel, self).__init__(config, head2size)
+    config.num_labels = 1
+    self.bert = BertModel(config)
+    self.dropout = nn.Dropout(config.hidden_dropout_prob)
+    module_dict = {}
+    for head_name, num_labels in head2size.items():
+      module_dict[head_name] = nn.Linear(config.hidden_size, num_labels)
+    self.heads = nn.ModuleDict(module_dict)
+    self.init_weights()
+  def forward(self, input_ids, token_type_ids=None, attention_mask=None,
+              head2labels=None, return_pooler_output=False, head2mask=None,
+              nsp_loss_weights=None):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    # Get logits
+    output = self.bert(
+      input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
+      output_attentions=False, output_hidden_states=False, return_dict=True)
+    pooled_output = self.dropout(output["pooler_output"]).to(device)
+    head2logits = {}
+    return_dict = {}
+    for head_name, head in self.heads.items():
+      head2logits[head_name] = self.heads[head_name](pooled_output)
+      head2logits[head_name] = head2logits[head_name].float()
+      return_dict[head_name + "_logits"] = head2logits[head_name]
+    if head2labels is not None:
+      for head_name, labels in head2labels.items():
+        num_classes = head2logits[head_name].shape[1]
+        # Regression (e.g. for politeness)
+        if num_classes == 1:
+          # Only consider positive examples
+          if head2mask is not None and head_name in head2mask:
+            num_positives = head2labels[head2mask[head_name]].sum()  # use certain labels as mask
+            if num_positives == 0:
+              return_dict[head_name + "_loss"] = torch.tensor([0]).to(device)
+            else:
+              loss_fct = MSELoss(reduction='none')
+              loss = loss_fct(head2logits[head_name].view(-1), labels.float().view(-1))
+              return_dict[head_name + "_loss"] = loss.dot(head2labels[head2mask[head_name]].float().view(-1)) / num_positives
+          else:
+            loss_fct = MSELoss()
+            return_dict[head_name + "_loss"] = loss_fct(head2logits[head_name].view(-1), labels.float().view(-1))
+        else:
+          loss_fct = CrossEntropyLoss(weight=nsp_loss_weights.float())
+          return_dict[head_name + "_loss"] = loss_fct(head2logits[head_name], labels.view(-1))
+    if return_pooler_output:
+      return_dict["pooler_output"] = output["pooler_output"]
+    return return_dict
+class InputBuilder(object):
+  """Base class for building inputs from segments."""
+  def __init__(self, tokenizer):
+      self.tokenizer = tokenizer
+      self.mask = [tokenizer.mask_token_id]
+  def build_inputs(self, history, reply, max_length):
+      raise NotImplementedError
+  def mask_seq(self, sequence, seq_id):
+      sequence[seq_id] = self.mask
+      return sequence
+  @classmethod
+  def _combine_sequence(self, history, reply, max_length, flipped=False):
+      # Trim all inputs to max_length
+      history = [s[:max_length] for s in history]
+      reply = reply[:max_length]
+      if flipped:
+          return [reply] + history
+      return history + [reply]
+class BertInputBuilder(InputBuilder):
+  """Processor for BERT inputs"""
+  def __init__(self, tokenizer):
+      InputBuilder.__init__(self, tokenizer)
+      self.cls = [tokenizer.cls_token_id]
+      self.sep = [tokenizer.sep_token_id]
+      self.model_inputs = ["input_ids", "token_type_ids", "attention_mask"]
+      self.padded_inputs = ["input_ids", "token_type_ids"]
+      self.flipped = False
+  def build_inputs(self, history, reply, max_length, input_str=True):
+    """See base class."""
+    if input_str:
+        history = [self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(t)) for t in history]
+        reply = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(reply))
+    sequence = self._combine_sequence(history, reply, max_length, self.flipped)
+    sequence = [s + self.sep for s in sequence]
+    sequence[0] = self.cls + sequence[0]
+    instance = {}
+    instance["input_ids"] = list(chain(*sequence))
+    last_speaker = 0
+    other_speaker = 1
+    seq_length = len(sequence)
+    instance["token_type_ids"] = [last_speaker if ((seq_length - i) % 2 == 1) else other_speaker
+                                  for i, s in enumerate(sequence) for _ in s]
+    return instance