initial release

Browse files

Files changed (8) hide show

README.md +25 -0
config.json +0 -0
maker.sh +13 -0
pytorch_model.bin +3 -0
special_tokens_map.json +37 -0
tokenizer_config.json +57 -0
upos.py +41 -0
vocab.txt +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,25 @@

+---
+language:
+- "bo"
+tags:
+- "tibetan"
+- "token-classification"
+- "pos"
+base_model: KoichiYasuoka/bert-base-tibetan
+license: "apache-2.0"
+pipeline_tag: "token-classification"
+---
+# bert-base-tibetan-upos
+## Model Description
+This is a BERT model for POS-tagging, derived from [bert-base-tibetan](https://huggingface.co/KoichiYasuoka/bert-base-tibetan). Every word is tagged with its [UPOS](https://universaldependencies.org/u/pos/) (Universal Part-Of-Speech) label.
+## How to Use
+```py
+from transformers import pipeline
+nlp=pipeline("upos","KoichiYasuoka/bert-base-tibetan-upos",trust_remote_code=True,aggregation_strategy="simple")
+```

config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

maker.sh ADDED Viewed

	@@ -0,0 +1,13 @@

#! /bin/sh
# Build all.conllu from the tibetan-nlp corpora, then run esupar's trainer on it.

# Clone each corpus only when its directory is not already present.
for D in classical-tibetan-corpus old-tibetan-corpus modern-tibetan-corpus
do test -d "$D" || git clone --depth=1 "https://github.com/tibetan-nlp/$D"
done

# Concatenate every CoNLL-U file except the *-translated.conllu ones, and
# rewrite the first tab-delimited NOTAG field on each line to X.
# Expansions are quoted so paths are never word-split or re-globbed.
( for F in *-tibetan-corpus/conllu/*.conllu
  do case "$F" in
     *-translated.conllu) : ;;
     *) cat "$F" ;;
     esac
  done
) | sed 's/\tNOTAG\t/\tX\t/' > all.conllu

# Arguments are passed exactly as before: base model, target model name,
# 32, /tmp, and the training file (presumably batch size and a scratch
# directory -- confirm against the esupar.train documentation).
python3 -m esupar.train KoichiYasuoka/bert-base-tibetan KoichiYasuoka/bert-base-tibetan-upos 32 /tmp all.conllu
# The script always reports success, regardless of the trainer's exit status.
exit 0

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:084a4a08376e2e50060c5aa8792a0d6ebe903c2c5933c083628c3a25c868f700
+size 434730022

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+ "cls_token": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "mask_token": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "sep_token": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "4": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "do_basic_tokenize": true,
+ "do_lower_case": false,
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "never_split": null,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": false,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "BertTokenizer",
+ "unk_token": "[UNK]"
+}

upos.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from transformers import TokenClassificationPipeline
class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that decodes labels under B-/I- constraints.

    A label-transition mask built from ``model.config.label2id`` restricts
    which label may follow which; ``postprocess`` then picks, position by
    position, the best label that the mask permits.
    """

    def __init__(self, **kwargs):
        """Initialise the base pipeline and build ``self.transition``.

        ``self.transition[a, b]`` is ``0`` when label id ``b`` may directly
        follow label id ``a`` and ``NaN`` otherwise:

        * ``B-X`` may only be followed by ``I-X``;
        * ``I-X`` may be followed by itself or by any label not starting
          with ``I-``;
        * any other label may be followed by any label not starting with
          ``I-``.

        Raises:
            KeyError: if a ``B-X`` label has no matching ``I-X`` in
                ``label2id``.
        """
        import numpy
        super().__init__(**kwargs)
        label2id = self.model.config.label2id
        non_inside = [name for name in label2id if not name.startswith("I-")]
        size = len(label2id)
        self.transition = numpy.full((size, size), numpy.nan)
        for name, idx in label2id.items():
            if name.startswith("B-"):
                followers = ["I-" + name[2:]]
            elif name.startswith("I-"):
                followers = [name] + non_inside
            else:
                followers = non_inside
            for follower in followers:
                self.transition[idx, label2id[follower]] = 0

    def check_model_type(self, supported_models):
        """Intentionally do nothing, so no model type is rejected here."""
        pass

    def postprocess(self, model_outputs, **kwargs):
        """Turn model outputs into a list of labelled character spans.

        Args:
            model_outputs: mapping that provides ``"logits"``,
                ``"offset_mapping"`` and ``"sentence"``; if it has no
                ``"logits"`` key, its element ``0`` is processed instead.
            **kwargs: only ``aggregation_strategy`` is consulted; any value
                other than ``"none"`` (or its absence) enables merging.

        Returns:
            A list of dicts with ``start``, ``end``, ``score`` and ``text``,
            plus ``entity`` (no aggregation) or ``entity_group``
            (aggregation, with the ``B-``/``I-`` prefix removed).
        """
        import numpy
        if "logits" not in model_outputs:
            return self.postprocess(model_outputs[0], **kwargs)
        # NOTE: ``m`` is updated in place below, exactly as in the original.
        m = model_outputs["logits"][0].numpy()
        # Softmax of the raw logits, taken before they are modified, so that
        # reported scores are per-position probabilities.
        shifted = numpy.exp(m - numpy.max(m, axis=-1, keepdims=True))
        probs = shifted / shifted.sum(axis=-1, keepdims=True)
        # Backward pass: add to every label at position i-1 the best score
        # reachable at position i through a permitted transition (NaN
        # entries are ignored by nanmax).
        for i in range(m.shape[0] - 1, 0, -1):
            m[i - 1] += numpy.nanmax(m[i] + self.transition, axis=1)
        # Forward pass: the first position is constrained by row 0 of the
        # mask; each later one by the label chosen just before it.
        path = [numpy.nanargmax(m[0] + self.transition[0])]
        for i in range(1, m.shape[0]):
            path.append(numpy.nanargmax(m[i] + self.transition[path[-1]]))
        offsets = model_outputs["offset_mapping"][0].tolist()
        # Positions with an empty character span (start >= end) are dropped.
        w = [
            {
                "entity": self.model.config.id2label[j],
                "start": s,
                "end": e,
                "score": probs[i, j],
            }
            for i, ((s, e), j) in enumerate(zip(offsets, path))
            if s < e
        ]
        if kwargs.get("aggregation_strategy", "none") != "none":
            # Walk backwards so that popping element i never shifts an
            # element that is still to be visited.
            for i in range(len(w) - 1, -1, -1):
                t = w[i]
                p = t.pop("entity")
                if p.startswith("I-") and i > 0:
                    # Fold this continuation into the preceding span,
                    # keeping the lower of the two scores.
                    w[i - 1]["score"] = min(w[i - 1]["score"], t["score"])
                    w[i - 1]["end"] = w.pop(i)["end"]
                elif p.startswith(("B-", "I-")):
                    # A leading "I-" has no predecessor to merge into;
                    # treating it as a group start avoids w[-1] being
                    # overwritten (or an IndexError on a one-element list).
                    t["entity_group"] = p[2:]
                else:
                    t["entity_group"] = p
        # NOTE(review): indentation was lost in the rendered source; this
        # loop is placed at method level so every span gets "text" --
        # confirm against the raw file.
        for t in w:
            t["text"] = model_outputs["sentence"][t["start"]:t["end"]]
        return w

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff