xu-song committed on
Commit
2461705
1 Parent(s): bce41d0

fix PyO3PanicException

Browse files
Files changed (1) hide show
  1. tokenizer/tiktoken_patch.py +12 -6
tokenizer/tiktoken_patch.py CHANGED
@@ -1,7 +1,7 @@
1
-
2
  from tiktoken import Encoding
3
  from utils.log_util import logger
4
 
 
5
  def decode(self, tokens, errors="replace", skip_special_tokens=False):
6
  """
7
  默认的decode,可能会报错,详见 decode_test.py
@@ -17,9 +17,13 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
17
  "namereplace"
18
  """
19
  try:
 
20
  decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
21
- except Exception as e:
22
- logger.error(f"{e} -> return 'null'")
 
 
 
23
  decode_str = "null"
24
  return decode_str
25
 
@@ -30,11 +34,14 @@ def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
30
  """
31
  try:
32
  return self.decode_tokens_bytes(tokens)
33
- except Exception as e:
34
  # 什么要返回None?见zh_util.py
35
  # 16个空闲id, 100256 100261-100275
36
- logger.error(e)
37
  return [None for _ in tokens]
 
 
 
38
 
39
 
40
  def get_vocab(self, token_type="str"):
@@ -52,7 +59,6 @@ def get_vocab(self, token_type="str"):
52
  continue
53
  # token_str = token_byte.decode("utf-8")
54
  vocab[token_byte] = i
55
-
56
  except UnicodeDecodeError: # 773 UnicodeDecodeError
57
  unicode_decode_error_list.append((i, str(token_byte)))
58
  vocab[token_byte] = i
 
 
1
  from tiktoken import Encoding
2
  from utils.log_util import logger
3
 
4
+
5
  def decode(self, tokens, errors="replace", skip_special_tokens=False):
6
  """
7
  默认的decode,可能会报错,详见 decode_test.py
 
17
  "namereplace"
18
  """
19
  try:
20
+ print(tokens)
21
  decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
22
+ except Exception as e: # 捕捉不到 PyO3PanicException
23
+ logger.error(f"{e} for {tokens} -> return 'null'")
24
+ decode_str = "null"
25
+ except:
26
+ logger.error(f"unknow exception for {tokens} -> return 'null'")
27
  decode_str = "null"
28
  return decode_str
29
 
 
34
  """
35
  try:
36
  return self.decode_tokens_bytes(tokens)
37
+ except Exception as e: # 捕捉不到 PyO3PanicException
38
  # 什么要返回None?见zh_util.py
39
  # 16个空闲id, 100256 100261-100275
40
+ logger.error(f"{e} for {tokens} -> return None")
41
  return [None for _ in tokens]
42
+ except:
43
+ logger.error(f"unknow exception for {tokens} -> return None")
44
+ return
45
 
46
 
47
  def get_vocab(self, token_type="str"):
 
59
  continue
60
  # token_str = token_byte.decode("utf-8")
61
  vocab[token_byte] = i
 
62
  except UnicodeDecodeError: # 773 UnicodeDecodeError
63
  unicode_decode_error_list.append((i, str(token_byte)))
64
  vocab[token_byte] = i