Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

README.md +160 -0
config.json +76 -0
pytorch_model.bin +3 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +13 -0
training_args.bin +3 -0
vocab.txt +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,160 @@

+---
+license: bigscience-openrail-m
+widget:
+- text: >-
+    the atm protein is a single high molecular weight protein predominantly confined to the nucleus of human fibroblasts but is present in both nuclear and microsomal fractions from human lymphoblast cells and peripheral blood lymphocytes atm protein levels and localization remain constant throughout all stages of the cell cycle truncated atm protein was not detected in lymphoblasts from ataxia telangiectasia patients homozygous for mutations leading to premature protein termination exposure of normal human cells to gamma irradiation and the radiomimetic drug neocarzinostatin had no effect on atm protein levels in contrast to a noted rise in p53 levels over the same time interval these findings are consistent with a role for the atm protein in ensuring the fidelity of dna repair and cell cycle regulation following genome damage
+datasets:
+- bigbio/drugprot
+- bigbio/ncbi_disease
+language:
+- en
+pipeline_tag: token-classification
+tags:
+- biology
+- medical
+---
+# DistilBERT base model for restoring punctuation of medical/biotech speech-to-text transcripts
+For example, the following lowercased, unpunctuated transcript:
+```
+the atm protein is a single high molecular weight protein predominantly confined to the nucleus of human
+fibroblasts but is present in both nuclear and microsomal fractions from human lymphoblast cells and peripheral
+blood lymphocytes atm protein levels and localization remain constant throughout all stages of the cell cycle
+truncated atm protein was not detected in lymphoblasts from ataxia telangiectasia patients homozygous
+for mutations leading to premature protein termination exposure of normal human cells to gamma irradiation and the
+radiomimetic drug neocarzinostatin had no effect on atm protein levels in contrast to a noted rise in p53 levels
+over the same time interval these findings are consistent with a role for the atm protein in ensuring the fidelity
+of dna repair and cell cycle regulation following genome damage
+```
+will be punctuated as follows:
+```
+The ATM protein is a single, high-molecular-weight protein predominantly confined to the nucleus of human
+fibroblasts, but is present in both nuclear and microsomal fractions from human lymphoblast cells and peripheral
+blood lymphocytes. ATM protein levels and localization remain constant throughout all stages of the cell cycle.
+Truncated ATM protein was not detected in lymphoblasts from ataxia-telangiectasia-patients homozygous
+for mutations leading to premature protein termination. Exposure of normal human cells to gamma-irradiation and the
+radiomimetic drug neocarzinostatin had no effect on ATM protein levels, in contrast to a noted rise in p53 levels
+over the same time interval. These findings are consistent with a role for the ATM protein in ensuring the fidelity
+of DNA repair and cell-cycle regulation following genome damage.
+```
+## How to use it in your code:
+```python
+import torch
+import numpy as np
+from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification
+# Model repository id passed to from_pretrained for both the tokenizer and the model.
+checkpoint = "unikei/distilbert-base-re-punctuate"
+tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
+model = DistilBertForTokenClassification.from_pretrained(checkpoint)
+# Length that process_segment pads every tokenized segment to (padding="max_length").
+encoder_max_length = 256
+#
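+# Split the word list into overlapping segments: segment i starts at word
+# length * i and holds up to length + overlap words
+# (150 + 50 = 200 words as called from punctuate below)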
+#
+def split_to_segments(wrds, length, overlap):
+    resp = []
+    i = 0
+    while True:
+        wrds_split = wrds[(length * i):((length * (i + 1)) + overlap)]
+        if not wrds_split:
+            break
+        resp_obj = {
+            "text": wrds_split,
+            "start_idx": length * i,
+            "end_idx": (length * (i + 1)) + overlap,
+        }
+        resp.append(resp_obj)
+        i += 1
+    return resp
+#
+# Punctuate wordpieces
+#
+def punctuate_wordpiece(wordpiece, label):
+    if label.startswith('UPPER'):
+        wordpiece = wordpiece.upper()
+    elif label.startswith('Upper'):
+        wordpiece = wordpiece[0].upper() + wordpiece[1:]
+    if label[-1] != '_' and label[-1] != wordpiece[-1]:
+        wordpiece += label[-1]
+    return wordpiece
+#
+# Punctuate text segments (200 words)
+#
+def punctuate_segment(wordpieces, word_ids, labels, start_word):
+    result = ''
+    for idx in range(0, len(wordpieces)):
+        if word_ids[idx] == None:
+            continue
+        if word_ids[idx] < start_word:
+            continue
+        wordpiece = punctuate_wordpiece(wordpieces[idx][2:] if wordpieces[idx].startswith('##') else wordpieces[idx],
+                            labels[idx])
+        if idx > 0 and len(result) > 0 and word_ids[idx] != word_ids[idx - 1] and result[-1] != '-':
+            result += ' '
+        result += wordpiece
+    return result
+#
+# Tokenize, predict, punctuate text segments (200 words)
+#
+def process_segment(words, tokenizer, model, start_word):
+    """Tokenize one segment, predict a label per wordpiece and punctuate it.
+
+    Args:
+        words: Segment dict; only its ``'text'`` entry (a list of words) is
+            read here.
+        tokenizer: Tokenizer called with ``is_split_into_words=True``; its
+            result must provide ``tokens()`` and ``word_ids()``.
+        model: Model whose output exposes ``.logits`` and whose
+            ``config.id2label`` maps predicted ids to label strings.
+        start_word: Passed through to ``punctuate_segment``.
+
+    Returns:
+        The text returned by ``punctuate_segment`` for this segment.
+    """
+    tokens = tokenizer(words['text'],
+                       padding="max_length",
+                       # Truncation is not requested here.
+                       # NOTE(review): a segment that tokenizes to more
+                       # wordpieces than the model accepts would presumably
+                       # fail -- confirm against the tokenizer/model limits.
+                       # truncation=True,
+                       max_length=encoder_max_length,
+                       is_split_into_words=True, return_tensors='pt')
+    # Inference only, so gradient tracking is switched off.
+    with torch.no_grad():
+        logits = model(**tokens).logits
+    logits = logits.cpu()
+    # Index of the highest-scoring label along the last axis.
+    predictions = np.argmax(logits, axis=-1)
+    wordpieces = tokens.tokens()
+    word_ids = tokens.word_ids()
+    id2label = model.config.id2label
+    # Only the first sequence of the batch is used.
+    labels = [[id2label[p.item()] for p in prediction] for prediction in predictions][0]
+    return punctuate_segment(wordpieces, word_ids, labels, start_word)
+#
+# Punctuate text of any length
+#
+def punctuate(text, tokenizer, model):
+    text = text.lower()
+    text = text.replace('\n', ' ')
+    words = text.split(' ')
+    overlap = 50
+    slices = split_to_segments(words, 150, 50)
+    result = ""
+    start_word = 0
+    for text in slices:
+        corrected = process_segment(text, tokenizer, model, start_word)
+        result += corrected + ' '
+        start_word = overlap
+    return result
+#
+# Example
+#
+text = "the atm protein is a single high molecular weight protein predominantly confined to the nucleus of human fibroblasts but is present in both nuclear and microsomal fractions from human lymphoblast cells and peripheral blood lymphocytes atm protein levels and localization remain constant throughout all stages of the cell cycle truncated atm protein was not detected in lymphoblasts from ataxia telangiectasia patients homozygous for mutations leading to premature protein termination exposure of normal human cells to gamma irradiation and the radiomimetic drug neocarzinostatin had no effect on atm protein levels in contrast to a noted rise in p53 levels over the same time interval these findings are consistent with a role for the atm protein in ensuring the fidelity of dna repair and cell cycle regulation following genome damage"
+# Runs the full pipeline with the tokenizer and model loaded at the top of this snippet.
+result = punctuate(text, tokenizer, model)
+print(result)
+"""
+Output:
+The ATM protein is a single, high-molecular-weight protein predominantly confined to the nucleus of human fibroblasts, but is present in both nuclear and microsomal fractions from human lymphoblast cells and peripheral blood lymphocytes. ATM protein levels and localization remain constant throughout all stages of the cell cycle. Truncated ATM protein was not detected in lymphoblasts from ataxia-telangiectasia-patients homozygous for mutations leading to premature protein termination. Exposure of normal human cells to gamma-irradiation and the radiomimetic drug neocarzinostatin had no effect on ATM protein levels, in contrast to a noted rise in p53 levels over the same time interval. These findings are consistent with a role for the ATM protein in ensuring the fidelity of DNA repair and cell-cycle regulation following genome damage.
+"""
+```

config.json ADDED Viewed

	@@ -0,0 +1,76 @@

+{
+  "_name_or_path": "unikei/distilbert-base-re-punctuate",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForTokenClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "id2label": {
+    "0": "UPPER_",
+    "1": "Upper_",
+    "2": "lower_",
+    "3": "UPPER.",
+    "4": "Upper.",
+    "5": "lower.",
+    "6": "UPPER,",
+    "7": "Upper,",
+    "8": "lower,",
+    "9": "UPPER!",
+    "10": "Upper!",
+    "11": "lower!",
+    "12": "UPPER?",
+    "13": "Upper?",
+    "14": "lower?",
+    "15": "UPPER:",
+    "16": "Upper:",
+    "17": "lower:",
+    "18": "UPPER;",
+    "19": "Upper;",
+    "20": "lower;",
+    "21": "UPPER-",
+    "22": "Upper-",
+    "23": "lower-"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "UPPER!": 9,
+    "UPPER,": 6,
+    "UPPER-": 21,
+    "UPPER.": 3,
+    "UPPER:": 15,
+    "UPPER;": 18,
+    "UPPER?": 12,
+    "UPPER_": 0,
+    "Upper!": 10,
+    "Upper,": 7,
+    "Upper-": 22,
+    "Upper.": 4,
+    "Upper:": 16,
+    "Upper;": 19,
+    "Upper?": 13,
+    "Upper_": 1,
+    "lower!": 11,
+    "lower,": 8,
+    "lower-": 23,
+    "lower.": 5,
+    "lower:": 17,
+    "lower;": 20,
+    "lower?": 14,
+    "lower_": 2
+  },
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.27.4",
+  "vocab_size": 30522
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c274c696e98066b4665ce7c3b10777ae80ab5b190317e3a4d74cf4977553977e
+size 265560165

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "DistilBertTokenizer",
+  "unk_token": "[UNK]"
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3821eb71ffec3cb3f38966b58ac10a102de3c75452f804546b254e6d68c33b39
+size 3515

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff