venkatchoudharyala committed on
Commit
959e5e4
·
verified ·
1 Parent(s): 26aa29c

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: bigscience-openrail-m
3
+ widget:
4
+ - text: >-
5
+ the atm protein is a single high molecular weight protein predominantly confined to the nucleus of human fibroblasts but is present in both nuclear and microsomal fractions from human lymphoblast cells and peripheral blood lymphocytes atm protein levels and localization remain constant throughout all stages of the cell cycle truncated atm protein was not detected in lymphoblasts from ataxia telangiectasia patients homozygous for mutations leading to premature protein termination exposure of normal human cells to gamma irradiation and the radiomimetic drug neocarzinostatin had no effect on atm protein levels in contrast to a noted rise in p53 levels over the same time interval these findings are consistent with a role for the atm protein in ensuring the fidelity of dna repair and cell cycle regulation following genome damage
6
+ datasets:
7
+ - bigbio/drugprot
8
+ - bigbio/ncbi_disease
9
+ language:
10
+ - en
11
+ pipeline_tag: token-classification
12
+ tags:
13
+ - biology
14
+ - medical
15
+ ---
16
+
17
+ # DistilBERT base model for restoring punctuation of medical/biotech speech-to-text transcripts
18
+ For example, the following unpunctuated transcript:
19
+ ```
20
+ the atm protein is a single high molecular weight protein predominantly confined to the nucleus of human
21
+ fibroblasts but is present in both nuclear and microsomal fractions from human lymphoblast cells and peripheral
22
+ blood lymphocytes atm protein levels and localization remain constant throughout all stages of the cell cycle
23
+ truncated atm protein was not detected in lymphoblasts from ataxia telangiectasia patients homozygous
24
+ for mutations leading to premature protein termination exposure of normal human cells to gamma irradiation and the
25
+ radiomimetic drug neocarzinostatin had no effect on atm protein levels in contrast to a noted rise in p53 levels
26
+ over the same time interval these findings are consistent with a role for the atm protein in ensuring the fidelity
27
+ of dna repair and cell cycle regulation following genome damage
28
+ ```
29
+ will be punctuated as follows:
30
+ ```
31
+ The ATM protein is a single, high-molecular-weight protein predominantly confined to the nucleus of human
32
+ fibroblasts, but is present in both nuclear and microsomal fractions from human lymphoblast cells and peripheral
33
+ blood lymphocytes. ATM protein levels and localization remain constant throughout all stages of the cell cycle.
34
+ Truncated ATM protein was not detected in lymphoblasts from ataxia-telangiectasia-patients homozygous
35
+ for mutations leading to premature protein termination. Exposure of normal human cells to gamma-irradiation and the
36
+ radiomimetic drug neocarzinostatin had no effect on ATM protein levels, in contrast to a noted rise in p53 levels
37
+ over the same time interval. These findings are consistent with a role for the ATM protein in ensuring the fidelity
38
+ of DNA repair and cell-cycle regulation following genome damage.
39
+ ```
40
+
41
+ ## How to use it in your code:
42
+ ```python
43
+ import torch
44
+ import numpy as np
45
+ from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification
46
+
47
+ checkpoint = "unikei/distilbert-base-re-punctuate"
48
+ tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
49
+ model = DistilBertForTokenClassification.from_pretrained(checkpoint)
50
+ encoder_max_length = 256
51
+
52
+ #
53
+ # Split text to segments of length 200, with overlap 50
54
+ #
55
def split_to_segments(wrds, length, overlap):
    """Split a word sequence into consecutive, overlapping windows.

    Window ``i`` starts at index ``length * i`` and holds up to
    ``length + overlap`` items, so neighbouring windows share ``overlap``
    items.

    Args:
        wrds: Sliceable sequence of words.
        length: Stride between window starts; must be positive.
        overlap: Number of extra items appended to each window beyond
            ``length``.

    Returns:
        A list of dicts with the keys ``"text"`` (the slice itself),
        ``"start_idx"`` and ``"end_idx"`` (the nominal slice bounds;
        ``end_idx`` may exceed ``len(wrds)`` for the trailing windows).

    Raises:
        ValueError: If ``length`` is not positive, because the window would
            never advance.
    """
    if length <= 0:
        raise ValueError("length must be a positive integer")

    resp = []
    for start in range(0, len(wrds), length):
        end = start + length + overlap
        chunk = wrds[start:end]
        if not chunk:
            # Only reachable for a non-positive window size (negative overlap).
            break
        resp.append({"text": chunk, "start_idx": start, "end_idx": end})
    return resp
72
+
73
+
74
+ #
75
+ # Punctuate wordpieces
76
+ #
77
def punctuate_wordpiece(wordpiece, label):
    """Apply a predicted casing/punctuation label to a single wordpiece.

    The label prefix selects the casing: ``UPPER`` upper-cases the whole
    piece, ``Upper`` upper-cases only its first character, and any other
    prefix leaves the casing untouched. The last character of the label is
    the punctuation mark to append, with ``_`` meaning "no punctuation".

    Args:
        wordpiece: The wordpiece text, without a ``##`` continuation prefix.
        label: Label string such as ``"Upper,"`` or ``"lower_"``.

    Returns:
        The re-cased wordpiece, with the punctuation mark appended unless
        the piece already ends with it. An empty wordpiece is returned
        unchanged.
    """
    if not wordpiece:
        # Nothing to re-case or punctuate; also avoids indexing an empty str.
        return wordpiece

    if label.startswith('UPPER'):
        wordpiece = wordpiece.upper()
    elif label.startswith('Upper'):
        wordpiece = wordpiece[0].upper() + wordpiece[1:]

    mark = label[-1:]  # slicing yields '' for an empty label, never raises
    if mark and mark != '_' and not wordpiece.endswith(mark):
        wordpiece += mark
    return wordpiece
85
+
86
+
87
+ #
88
+ # Punctuate text segments (200 words)
89
+ #
90
def punctuate_segment(wordpieces, word_ids, labels, start_word):
    """Rebuild punctuated text from the labelled wordpieces of one segment.

    Args:
        wordpieces: Token strings; continuation pieces carry a ``##`` prefix.
        word_ids: For each wordpiece, the index of the word it belongs to,
            or ``None`` (presumably special/padding tokens - these are
            skipped).
        labels: One casing/punctuation label per wordpiece.
        start_word: Wordpieces whose word index is below this value are
            skipped.

    Returns:
        The punctuated text. Pieces of the same word are concatenated;
        a space is inserted between different words unless the text so far
        ends with a hyphen.
    """
    result = ''
    for idx, piece in enumerate(wordpieces):
        word_id = word_ids[idx]
        if word_id is None or word_id < start_word:
            continue

        if piece.startswith('##'):
            piece = piece[2:]
        piece = punctuate_wordpiece(piece, labels[idx])

        starts_new_word = idx > 0 and word_id != word_ids[idx - 1]
        # A trailing hyphen joins compound words, so no space follows it.
        if result and starts_new_word and not result.endswith('-'):
            result += ' '
        result += piece
    return result
103
+
104
+
105
+ #
106
+ # Tokenize, predict, punctuate text segments (200 words)
107
+ #
108
def process_segment(words, tokenizer, model, start_word):
    """Tokenize one word segment, predict its labels and punctuate it.

    Args:
        words: Segment dict whose ``"text"`` entry is the list of words.
        tokenizer: Fast tokenizer called with ``is_split_into_words=True``.
        model: Token-classification model; ``model.config.id2label`` maps
            predicted class ids to casing/punctuation labels.
        start_word: Index of the first word of the segment to emit; earlier
            words are skipped.

    Returns:
        The punctuated text of the segment from ``start_word`` onwards.
    """
    # NOTE(review): truncation is not enabled, so a segment that tokenizes to
    # more than encoder_max_length wordpieces is passed through at full
    # length - presumably so that no words are dropped; confirm.
    tokens = tokenizer(words['text'],
                       padding="max_length",
                       max_length=encoder_max_length,
                       is_split_into_words=True, return_tensors='pt')
    # Inputs must live on the same device as the model weights.
    tokens = tokens.to(model.device)

    with torch.no_grad():
        logits = model(**tokens).logits
    # Only one sequence is in the batch, so take row 0 of the predictions.
    label_ids = logits.argmax(dim=-1)[0].tolist()

    id2label = model.config.id2label
    labels = [id2label[label_id] for label_id in label_ids]

    return punctuate_segment(tokens.tokens(), tokens.word_ids(), labels, start_word)
127
+
128
+
129
+ #
130
+ # Punctuate text of any length
131
+ #
132
def punctuate(text, tokenizer, model, segment_length=150, overlap=50):
    """Restore casing and punctuation for text of any length.

    The text is lower-cased, split on whitespace and processed in
    overlapping word windows; each window after the first skips its leading
    ``overlap`` words so that no word is emitted twice.

    Args:
        text: Unpunctuated input text.
        tokenizer: Tokenizer forwarded to ``process_segment``.
        model: Token-classification model forwarded to ``process_segment``.
        segment_length: Stride, in words, between window starts.
        overlap: Number of words shared by neighbouring windows.

    Returns:
        The punctuated text, with segments joined by single spaces and no
        trailing whitespace.
    """
    # split() without arguments collapses newlines, tabs and repeated spaces,
    # so no empty "words" reach the tokenizer.
    words = text.lower().split()
    slices = split_to_segments(words, segment_length, overlap)

    pieces = []
    start_word = 0
    for segment in slices:
        corrected = process_segment(segment, tokenizer, model, start_word)
        # A trailing segment that lies entirely inside the overlap yields ''.
        if corrected:
            pieces.append(corrected)
        # Every segment after the first begins with `overlap` words that the
        # previous segment has already emitted.
        start_word = overlap
    return ' '.join(pieces)
147
+
148
+ #
149
+ # Example
150
+ #
151
+ text = "the atm protein is a single high molecular weight protein predominantly confined to the nucleus of human fibroblasts but is present in both nuclear and microsomal fractions from human lymphoblast cells and peripheral blood lymphocytes atm protein levels and localization remain constant throughout all stages of the cell cycle truncated atm protein was not detected in lymphoblasts from ataxia telangiectasia patients homozygous for mutations leading to premature protein termination exposure of normal human cells to gamma irradiation and the radiomimetic drug neocarzinostatin had no effect on atm protein levels in contrast to a noted rise in p53 levels over the same time interval these findings are consistent with a role for the atm protein in ensuring the fidelity of dna repair and cell cycle regulation following genome damage"
152
+ result = punctuate(text, tokenizer, model)
153
+ print(result)
154
+
155
+
156
+ """
157
+ Output:
158
+ The ATM protein is a single, high-molecular-weight protein predominantly confined to the nucleus of human fibroblasts, but is present in both nuclear and microsomal fractions from human lymphoblast cells and peripheral blood lymphocytes. ATM protein levels and localization remain constant throughout all stages of the cell cycle. Truncated ATM protein was not detected in lymphoblasts from ataxia-telangiectasia-patients homozygous for mutations leading to premature protein termination. Exposure of normal human cells to gamma-irradiation and the radiomimetic drug neocarzinostatin had no effect on ATM protein levels, in contrast to a noted rise in p53 levels over the same time interval. These findings are consistent with a role for the ATM protein in ensuring the fidelity of DNA repair and cell-cycle regulation following genome damage.
159
+ """
160
+ ```
config.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "unikei/distilbert-base-re-punctuate",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForTokenClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "UPPER_",
13
+ "1": "Upper_",
14
+ "2": "lower_",
15
+ "3": "UPPER.",
16
+ "4": "Upper.",
17
+ "5": "lower.",
18
+ "6": "UPPER,",
19
+ "7": "Upper,",
20
+ "8": "lower,",
21
+ "9": "UPPER!",
22
+ "10": "Upper!",
23
+ "11": "lower!",
24
+ "12": "UPPER?",
25
+ "13": "Upper?",
26
+ "14": "lower?",
27
+ "15": "UPPER:",
28
+ "16": "Upper:",
29
+ "17": "lower:",
30
+ "18": "UPPER;",
31
+ "19": "Upper;",
32
+ "20": "lower;",
33
+ "21": "UPPER-",
34
+ "22": "Upper-",
35
+ "23": "lower-"
36
+ },
37
+ "initializer_range": 0.02,
38
+ "label2id": {
39
+ "UPPER!": 9,
40
+ "UPPER,": 6,
41
+ "UPPER-": 21,
42
+ "UPPER.": 3,
43
+ "UPPER:": 15,
44
+ "UPPER;": 18,
45
+ "UPPER?": 12,
46
+ "UPPER_": 0,
47
+ "Upper!": 10,
48
+ "Upper,": 7,
49
+ "Upper-": 22,
50
+ "Upper.": 4,
51
+ "Upper:": 16,
52
+ "Upper;": 19,
53
+ "Upper?": 13,
54
+ "Upper_": 1,
55
+ "lower!": 11,
56
+ "lower,": 8,
57
+ "lower-": 23,
58
+ "lower.": 5,
59
+ "lower:": 17,
60
+ "lower;": 20,
61
+ "lower?": 14,
62
+ "lower_": 2
63
+ },
64
+ "max_position_embeddings": 512,
65
+ "model_type": "distilbert",
66
+ "n_heads": 12,
67
+ "n_layers": 6,
68
+ "pad_token_id": 0,
69
+ "qa_dropout": 0.1,
70
+ "seq_classif_dropout": 0.2,
71
+ "sinusoidal_pos_embds": false,
72
+ "tie_weights_": true,
73
+ "torch_dtype": "float32",
74
+ "transformers_version": "4.27.4",
75
+ "vocab_size": 30522
76
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c274c696e98066b4665ce7c3b10777ae80ab5b190317e3a4d74cf4977553977e
3
+ size 265560165
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_lower_case": true,
4
+ "mask_token": "[MASK]",
5
+ "model_max_length": 512,
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "special_tokens_map_file": null,
9
+ "strip_accents": null,
10
+ "tokenize_chinese_chars": true,
11
+ "tokenizer_class": "DistilBertTokenizer",
12
+ "unk_token": "[UNK]"
13
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3821eb71ffec3cb3f38966b58ac10a102de3c75452f804546b254e6d68c33b39
3
+ size 3515
vocab.txt ADDED
The diff for this file is too large to render. See raw diff