update

- app.py +1 -0
- utils/log_util.py +2 -0
- vocab/gpt_35_turbo/__init__.py +10 -4
- vocab/gpt_4/__init__.py +9 -4
app.py
CHANGED
@@ -18,6 +18,7 @@
 - why does baichuan have 20,000+ single-character tokens?
 - OOV
 - feedback placement
+- gpt4, gpt3.5 overlap tokens are wrong.
 
 
 plots
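The overlap-tokens note above is plausibly explained by tiktoken itself: both model names resolve to the same cl100k_base encoding, so a per-model vocabulary comparison reports near-total overlap. A minimal standalone sketch, not part of this Space's code:

import tiktoken

# Both model names resolve to the same underlying BPE encoding.
enc_35 = tiktoken.encoding_for_model("gpt-3.5-turbo")
enc_4 = tiktoken.encoding_for_model("gpt-4")
print(enc_35.name, enc_4.name)          # cl100k_base cl100k_base
print(enc_35.n_vocab == enc_4.n_vocab)  # True: identical vocabularies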
utils/log_util.py
CHANGED
@@ -3,7 +3,9 @@ import logging
 
 logging.basicConfig(
     format='%(asctime)s - %(filename)s - %(levelname)s - %(process)d - %(thread)d - %(message)s',
+    level=logging.INFO,
     datefmt="%Y-%m-%d %H:%M:%S",
+
 )
 
 logger = logging.getLogger(__name__)
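The new level=logging.INFO is what makes the logger.info(...) calls added below actually print: without an explicit level, the root logger defaults to WARNING and drops INFO records. A quick hypothetical demonstration:

import logging

logging.basicConfig(format="%(asctime)s - %(message)s")
logging.getLogger("demo").info("dropped")      # default WARNING level suppresses INFO

# force=True (Python 3.8+) replaces the existing handler configuration
logging.basicConfig(level=logging.INFO, force=True)
logging.getLogger("demo").info("now emitted")  # visible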
vocab/gpt_35_turbo/__init__.py
CHANGED
@@ -2,6 +2,7 @@
 
 import tiktoken
 from tiktoken import Encoding
+from utils.log_util import logger
 
 tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
 tokenizer.vocab_size = tokenizer.n_vocab
@@ -22,16 +23,21 @@ def convert_ids_to_tokens(self, tokens):
 
 def get_vocab(self):
     """Returns vocab as a dict"""
     vocab = {}
+    key_error_list = []
+    unicode_decode_error_list = []
     for i in range(self.vocab_size):
         try:
             token_byte = self.convert_ids_to_tokens([i])[0]
             token_str = token_byte.decode("utf-8")
             vocab[token_str] = i
-        except KeyError:
-            pass
-        except UnicodeDecodeError:
-            pass
+        except KeyError:  # 100256 100261-100275
+            key_error_list.append(i)
+        except UnicodeDecodeError:  # very common
+            unicode_decode_error_list.append((i, str(token_byte)))
+
     # vocab.update(self.added_tokens_encoder)
+    logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
+    logger.info(f"gpt_35_turbo {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
     return vocab
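The two except branches cover distinct failure modes, reproducible with plain tiktoken. A standalone sketch (it assumes the patched convert_ids_to_tokens wraps Encoding.decode_single_token_bytes, which the diff does not show):

import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")  # cl100k_base
vocab, key_errors, decode_errors = {}, [], []
for i in range(enc.n_vocab):
    try:
        token_bytes = enc.decode_single_token_bytes(i)  # KeyError for unassigned ids
        vocab[token_bytes.decode("utf-8")] = i          # UnicodeDecodeError for partial UTF-8
    except KeyError:
        key_errors.append(i)     # the 100256, 100261-100275 gap
    except UnicodeDecodeError:
        decode_errors.append(i)  # very common: BPE tokens can split a character's bytes
print(len(vocab), len(key_errors), len(decode_errors))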
vocab/gpt_4/__init__.py
CHANGED
@@ -22,16 +22,21 @@ def convert_ids_to_tokens(self, tokens):
 
 def get_vocab(self):
     """Returns vocab as a dict"""
     vocab = {}
+    key_error_list = []
+    unicode_decode_error_list = []
     for i in range(self.vocab_size):
         try:
             token_byte = self.convert_ids_to_tokens([i])[0]
             token_str = token_byte.decode("utf-8")
             vocab[token_str] = i
-        except KeyError:
-            pass
-        except UnicodeDecodeError:
-            pass
+        except KeyError:  # 100256 100261-100275
+            key_error_list.append(i)
+        except UnicodeDecodeError:  # very common
+            unicode_decode_error_list.append((i, str(token_byte)))
+
     # vocab.update(self.added_tokens_encoder)
+    logger.info(f"gpt-4 {len(key_error_list)} KeyError: {key_error_list}")
+    logger.info(f"gpt-4 {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
     return vocab
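The ids in the KeyError comment fall in the gap around cl100k_base's special tokens, which is presumably why no byte sequence exists for them. A hedged check (assuming tiktoken's documented special-token ids):

import tiktoken

enc = tiktoken.encoding_for_model("gpt-4")  # also cl100k_base
print(enc.special_tokens_set)
# e.g. {'<|endoftext|>', '<|fim_prefix|>', '<|fim_middle|>', '<|fim_suffix|>', '<|endofprompt|>'}
print(enc.encode("<|endoftext|>", allowed_special="all"))  # [100257]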