Update tokenization_qwen.py
Browse files- tokenization_qwen.py +7 -1
tokenization_qwen.py
CHANGED
@@ -27,6 +27,12 @@ logger = logging.getLogger(__name__)
|
|
27 |
|
28 |
|
29 |
VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
|
32 |
ENDOFTEXT = "<|endoftext|>"
|
@@ -497,7 +503,7 @@ class VisImage:
|
|
497 |
class Visualizer:
|
498 |
def __init__(self, img_rgb, metadata=None, scale=1.0):
|
499 |
self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
|
500 |
-
self.font_path =
|
501 |
self.output = VisImage(self.img, scale=scale)
|
502 |
self.cpu_device = torch.device("cpu")
|
503 |
|
|
|
27 |
|
28 |
|
29 |
VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
|
30 |
+
FONT_PATH = try_to_load_from_cache("../", "SimSun.ttf")
|
31 |
+
if FONT_PATH is None:
|
32 |
+
if not os.path.exists("SimSun.ttf"):
|
33 |
+
ttf = requests.get("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/SimSun.ttf")
|
34 |
+
open("SimSun.ttf", "wb").write(ttf.content)
|
35 |
+
FONT_PATH = "SimSun.ttf"
|
36 |
|
37 |
PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
|
38 |
ENDOFTEXT = "<|endoftext|>"
|
|
|
503 |
class Visualizer:
|
504 |
def __init__(self, img_rgb, metadata=None, scale=1.0):
|
505 |
self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
|
506 |
+
self.font_path = FONT_PATH
|
507 |
self.output = VisImage(self.img, scale=scale)
|
508 |
self.cpu_device = torch.device("cpu")
|
509 |
|