Example Usage
import torch
import jieba
import numpy as np
from classifier import BertForMaskClassification
from transformers import AutoTokenizer, AutoConfig, BertForTokenClassification
# Punctuation labels, indexed by the class id predicted for a [MASK] slot.
# "O" means that no punctuation mark is inserted at that slot.
label_list = ["O", "COMMA", "PERIOD", "COLON"]

# Full-width Chinese punctuation mark emitted for each non-"O" label.
# (These literals were mis-encoded in the extracted page; restored as UTF-8.)
label2punct = {
    "COMMA": "，",
    "PERIOD": "。",
    "COLON": "：",
}

# Checkpoint name (or local directory) that supplies both the tokenizer and
# the model weights.
model_name_or_path = "pmp-h256"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = BertForMaskClassification.from_pretrained(model_name_or_path)

# Preferred compute device.
# NOTE(review): nothing in the visible snippet moves the model to this device,
# so inference runs wherever from_pretrained placed the model - confirm intent.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def punct(text):
    """Restore punctuation in unpunctuated Chinese text.

    The text is segmented into words with jieba, split into one token per
    character, and a ``[MASK]`` slot is appended after every word. The model
    classifies every position; for each ``[MASK]`` slot the predicted class id
    is looked up in ``label_list``. Slots labelled ``"O"`` are dropped, the
    others are replaced by the matching mark from ``label2punct``.

    Args:
        text: A string, or an iterable of string fragments that are
            concatenated before segmentation.

    Returns:
        The text rebuilt from the tokenizer's tokens with the predicted
        punctuation marks inserted. Empty input yields ``""``.
    """
    joined = "".join(text)
    if not joined:
        # Nothing to segment or classify; skip the model call entirely.
        return ""

    # One entry per character, plus a [MASK] slot after each jieba word where
    # a punctuation mark may be predicted.
    masked_tokens = []
    for word in jieba.lcut(joined):
        masked_tokens.extend(word)
        masked_tokens.append("[MASK]")

    encoded = tokenizer(
        masked_tokens, is_split_into_words=True, return_tensors="pt"
    )
    # Keep the inputs on the same device as the model so the call also works
    # when the caller has moved the model (e.g. to CUDA).
    encoded = encoded.to(next(model.parameters()).device)

    with torch.no_grad():
        logits = model(**encoded).logits

    # Single sequence in the batch: take row 0 of the per-token class ids.
    label_ids = logits.argmax(-1).tolist()[0]
    tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])

    pieces = []
    for token, label_id in zip(tokens, label_ids):
        if token in ("[CLS]", "[SEP]"):
            continue
        if token != "[MASK]":
            pieces.append(token)
            continue
        label = label_list[label_id]
        if label != "O":
            pieces.append(label2punct[label])
    return "".join(pieces)
# Unpunctuated sample sentence used to demonstrate punct().
# (The literal was mis-encoded in the extracted page; restored as UTF-8.)
text = '肝浊音界正常肝上界位于锁骨中线第五肋间移动浊音阴性肾区无叩痛'
print(punct(text))
# Result shown in the original example:
# 肝浊音界正常，肝上界位于锁骨中线第五肋间，移动浊音阴性，肾区无叩痛。
Acknowledgments
This work was supported in part by the Shenzhen Science and Technology Program (No. JCYJ20210324135809025).
Citations
Coming Soon
License
MIT
- Downloads last month
- 0
Inference Providers
NEW
This model isn't deployed by any Inference Provider.
🙋
Ask for provider support
Model tree for rickltt/pmp-h256
Unable to build the model tree: the base model loops back to the model itself. Learn more.