first commit

Files changed (8) hide show

__init__.py ADDED Viewed

File without changes

__pycache__/const.cpython-310.pyc ADDED Viewed

Binary file (16.7 kB). View file

__pycache__/const.cpython-39.pyc ADDED Viewed

Binary file (16.2 kB). View file

__pycache__/latex2operatortree.cpython-310.pyc ADDED Viewed

Binary file (818 Bytes). View file

__pycache__/latex2operatortree.cpython-39.pyc ADDED Viewed

Binary file (825 Bytes). View file

requirements.txt ADDED Viewed

Binary file (1.15 kB). View file

test.py CHANGED Viewed

@@ -4,12 +4,26 @@ from const import *
 from pprint import pprint
 from latex2operatortree import *
 from transformers import AutoTokenizer, AutoModel
 # text = "Trong các hình vẽ sau $y=\dfrac{x+1}{-x+1}$, hình nào biểu diễn đồ thị của hàm số $y=x^3$, $y=x^5$?"
 # pattern = r'\$.*?\$'
 # equations = re.findall(pattern, text)
 # pprint(latex2tree(text))
-dir = 'code/'
-py_vncorenlp.download_model(save_dir=dir)
-model = py_vncorenlp.VnCoreNLP(save_dir='code/')

 from pprint import pprint
 from latex2operatortree import *
 from transformers import AutoTokenizer, AutoModel
+from const import LATEX_VOC
 # text = "Trong các hình vẽ sau $y=\dfrac{x+1}{-x+1}$, hình nào biểu diễn đồ thị của hàm số $y=x^3$, $y=x^5$?"
 # pattern = r'\$.*?\$'
 # equations = re.findall(pattern, text)
 # pprint(latex2tree(text))
+# dir = 'code/'
+# py_vncorenlp.download_model(save_dir=dir)
+# model = py_vncorenlp.VnCoreNLP(save_dir='code/')
+model_path = "huuminh365/CustomBERT"
+latex_token = LATEX_VOC
+# print(len(latex_token), latex_token[:5])
+tok = AutoTokenizer.from_pretrained(model_path)
+# model = AutoModel.from_pretrained(model_path)
+tok.add_tokens(latex_token)
+# tok.save_pretrained(f'tokenizer_{model_path}_with_latex')
+text = 'Tính diện tích hình phẳng giới hạn bởi đồ thị các hàm số $y =x^3$, $y=x^5$, $\dfrac{2}{x}=5$. Cho hàm số $y=f(x)$ liên tục trên $\mathbb{R}$, có đồ thị cắt trục $Ox$ tại các điểm có hoành độ'
+print(tok.tokenize(text))

tokenizer.py CHANGED Viewed

@@ -127,6 +127,7 @@ class PhobertTokenizer(PreTrainedTokenizer):
             merges = merges_handle.read().split("\n")[:-1]
         merges = [tuple(merge.split()[:-1]) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
     def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:

             merges = merges_handle.read().split("\n")[:-1]
         merges = [tuple(merge.split()[:-1]) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
     def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]: