alvin
committed on
Commit
·
34c19b7
1
Parent(s):
08d39dc
update <|endoftext|> tokenizer id from 50257 to 50256
Browse files
- .idea/.gitignore +0 -0
- .idea/gpt2-medium-indonesian.iml +12 -0
- .idea/inspectionProfiles/Project_Default.xml +6 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +4 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- .idea/workspace.xml +60 -0
- replace_token_script.py +3 -2
- tokenizer.json +0 -0
.idea/.gitignore
ADDED
File without changes
|
.idea/gpt2-medium-indonesian.iml
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$" />
|
5 |
+
<orderEntry type="inheritedJdk" />
|
6 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
7 |
+
</component>
|
8 |
+
<component name="PyDocumentationSettings">
|
9 |
+
<option name="format" value="PLAIN" />
|
10 |
+
<option name="myDocStringFormat" value="Plain" />
|
11 |
+
</component>
|
12 |
+
</module>
|
.idea/inspectionProfiles/Project_Default.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<profile version="1.0">
|
3 |
+
<option name="myName" value="Project Default" />
|
4 |
+
<inspection_tool class="PyInterpreterInspection" enabled="false" level="WARNING" enabled_by_default="false" />
|
5 |
+
</profile>
|
6 |
+
</component>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
.idea/misc.xml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
|
4 |
+
</project>
|
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/gpt2-medium-indonesian.iml" filepath="$PROJECT_DIR$/.idea/gpt2-medium-indonesian.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
5 |
+
</component>
|
6 |
+
</project>
|
.idea/workspace.xml
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ChangeListManager">
|
4 |
+
<list default="true" id="9048d30b-80ad-481e-a9f3-3c0d058dcd65" name="Changes" comment="">
|
5 |
+
<change beforePath="$PROJECT_DIR$/replace_token_script.py" beforeDir="false" afterPath="$PROJECT_DIR$/replace_token_script.py" afterDir="false" />
|
6 |
+
<change beforePath="$PROJECT_DIR$/tokenizer.json" beforeDir="false" afterPath="$PROJECT_DIR$/tokenizer.json" afterDir="false" />
|
7 |
+
</list>
|
8 |
+
<option name="SHOW_DIALOG" value="false" />
|
9 |
+
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
10 |
+
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
11 |
+
<option name="LAST_RESOLUTION" value="IGNORE" />
|
12 |
+
</component>
|
13 |
+
<component name="Git.Settings">
|
14 |
+
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
15 |
+
</component>
|
16 |
+
<component name="ProjectId" id="1wVKJnvEaHY26dnjKns17akLxD6" />
|
17 |
+
<component name="ProjectViewState">
|
18 |
+
<option name="hideEmptyMiddlePackages" value="true" />
|
19 |
+
<option name="showLibraryContents" value="true" />
|
20 |
+
</component>
|
21 |
+
<component name="PropertiesComponent">
|
22 |
+
<property name="RunOnceActivity.OpenProjectViewOnStart" value="true" />
|
23 |
+
<property name="RunOnceActivity.ShowReadmeOnStart" value="true" />
|
24 |
+
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
|
25 |
+
</component>
|
26 |
+
<component name="RunManager">
|
27 |
+
<configuration name="replace_token_script" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
|
28 |
+
<module name="gpt2-medium-indonesian" />
|
29 |
+
<option name="INTERPRETER_OPTIONS" value="" />
|
30 |
+
<option name="PARENT_ENVS" value="true" />
|
31 |
+
<envs>
|
32 |
+
<env name="PYTHONUNBUFFERED" value="1" />
|
33 |
+
</envs>
|
34 |
+
<option name="SDK_HOME" value="$USER_HOME$/alvin_research/task/Hardy/transformers/venv/bin/python" />
|
35 |
+
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
|
36 |
+
<option name="IS_MODULE_SDK" value="false" />
|
37 |
+
<option name="ADD_CONTENT_ROOTS" value="true" />
|
38 |
+
<option name="ADD_SOURCE_ROOTS" value="true" />
|
39 |
+
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/replace_token_script.py" />
|
40 |
+
<option name="PARAMETERS" value="" />
|
41 |
+
<option name="SHOW_COMMAND_LINE" value="false" />
|
42 |
+
<option name="EMULATE_TERMINAL" value="false" />
|
43 |
+
<option name="MODULE_MODE" value="false" />
|
44 |
+
<option name="REDIRECT_INPUT" value="false" />
|
45 |
+
<option name="INPUT_FILE" value="" />
|
46 |
+
<method v="2" />
|
47 |
+
</configuration>
|
48 |
+
</component>
|
49 |
+
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
|
50 |
+
<component name="TaskManager">
|
51 |
+
<task active="true" id="Default" summary="Default task">
|
52 |
+
<changelist id="9048d30b-80ad-481e-a9f3-3c0d058dcd65" name="Changes" comment="" />
|
53 |
+
<created>1628538247361</created>
|
54 |
+
<option name="number" value="Default" />
|
55 |
+
<option name="presentableId" value="Default" />
|
56 |
+
<updated>1628538247361</updated>
|
57 |
+
</task>
|
58 |
+
<servers />
|
59 |
+
</component>
|
60 |
+
</project>
|
replace_token_script.py
CHANGED
@@ -18,10 +18,11 @@ with open(model_config_path, "r") as f:
|
|
18 |
|
19 |
model_vocab_size = model_config['vocab_size']
|
20 |
tokenizer_vocab = tokenizer_data['model']['vocab']
|
|
|
21 |
mergeslength = len(tokenizer_data['model']['merges'])
|
22 |
|
23 |
-
#readjust added_tokens 'id' to model_vocab_size
|
24 |
-
tokenizer_data['added_tokens'][-1]['id'] = model_vocab_size
|
25 |
|
26 |
final_index = model_vocab_size - 1
|
27 |
eos = '<|endoftext|>'
|
|
|
18 |
|
19 |
model_vocab_size = model_config['vocab_size']
|
20 |
tokenizer_vocab = tokenizer_data['model']['vocab']
|
21 |
+
|
22 |
mergeslength = len(tokenizer_data['model']['merges'])
|
23 |
|
24 |
+
#readjust added_tokens 'id' to model_vocab_size - 1
|
25 |
+
tokenizer_data['added_tokens'][-1]['id'] = model_vocab_size - 1
|
26 |
|
27 |
final_index = model_vocab_size - 1
|
28 |
eos = '<|endoftext|>'
|
tokenizer.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|